Diffstat (limited to 'fs')
-rw-r--r-- fs/9p/Kconfig | 5
-rw-r--r-- fs/9p/Makefile | 1
-rw-r--r-- fs/9p/acl.c | 4
-rw-r--r-- fs/9p/v9fs.h | 42
-rw-r--r-- fs/9p/v9fs_vfs.h | 1
-rw-r--r-- fs/9p/vfs_dentry.c | 2
-rw-r--r-- fs/9p/vfs_inode.c | 876
-rw-r--r-- fs/9p/vfs_inode_dotl.c | 824
-rw-r--r-- fs/9p/vfs_super.c | 8
-rw-r--r-- fs/9p/xattr.c | 2
-rw-r--r-- fs/Kconfig | 19
-rw-r--r-- fs/adfs/dir.c | 1
-rw-r--r-- fs/adfs/super.c | 4
-rw-r--r-- fs/affs/affs.h | 1
-rw-r--r-- fs/affs/namei.c | 3
-rw-r--r-- fs/affs/super.c | 6
-rw-r--r-- fs/afs/cmservice.c | 12
-rw-r--r-- fs/afs/dir.c | 5
-rw-r--r-- fs/afs/inode.c | 3
-rw-r--r-- fs/afs/internal.h | 3
-rw-r--r-- fs/afs/main.c | 13
-rw-r--r-- fs/afs/mntpt.c | 63
-rw-r--r-- fs/afs/rxrpc.c | 2
-rw-r--r-- fs/afs/server.c | 13
-rw-r--r-- fs/afs/super.c | 1
-rw-r--r-- fs/afs/vlocation.c | 14
-rw-r--r-- fs/aio.c | 31
-rw-r--r-- fs/anon_inodes.c | 29
-rw-r--r-- fs/autofs4/autofs_i.h | 113
-rw-r--r-- fs/autofs4/dev-ioctl.c | 2
-rw-r--r-- fs/autofs4/expire.c | 55
-rw-r--r-- fs/autofs4/inode.c | 114
-rw-r--r-- fs/autofs4/root.c | 743
-rw-r--r-- fs/autofs4/symlink.c | 3
-rw-r--r-- fs/autofs4/waitq.c | 17
-rw-r--r-- fs/befs/endian.h | 16
-rw-r--r-- fs/befs/linuxvfs.c | 2
-rw-r--r-- fs/binfmt_elf.c | 23
-rw-r--r-- fs/bio-integrity.c | 7
-rw-r--r-- fs/block_dev.c | 768
-rw-r--r-- fs/btrfs/Kconfig | 2
-rw-r--r-- fs/btrfs/Makefile | 2
-rw-r--r-- fs/btrfs/acl.c | 4
-rw-r--r-- fs/btrfs/btrfs_inode.h | 2
-rw-r--r-- fs/btrfs/compression.c | 329
-rw-r--r-- fs/btrfs/compression.h | 72
-rw-r--r-- fs/btrfs/ctree.c | 8
-rw-r--r-- fs/btrfs/ctree.h | 49
-rw-r--r-- fs/btrfs/disk-io.c | 412
-rw-r--r-- fs/btrfs/disk-io.h | 1
-rw-r--r-- fs/btrfs/export.c | 12
-rw-r--r-- fs/btrfs/extent-tree.c | 90
-rw-r--r-- fs/btrfs/extent_io.c | 7
-rw-r--r-- fs/btrfs/extent_io.h | 17
-rw-r--r-- fs/btrfs/extent_map.c | 2
-rw-r--r-- fs/btrfs/extent_map.h | 3
-rw-r--r-- fs/btrfs/file.c | 126
-rw-r--r-- fs/btrfs/inode.c | 199
-rw-r--r-- fs/btrfs/ioctl.c | 220
-rw-r--r-- fs/btrfs/ioctl.h | 12
-rw-r--r-- fs/btrfs/lzo.c | 420
-rw-r--r-- fs/btrfs/ordered-data.c | 18
-rw-r--r-- fs/btrfs/ordered-data.h | 8
-rw-r--r-- fs/btrfs/super.c | 282
-rw-r--r-- fs/btrfs/transaction.c | 11
-rw-r--r-- fs/btrfs/transaction.h | 1
-rw-r--r-- fs/btrfs/volumes.c | 654
-rw-r--r-- fs/btrfs/volumes.h | 29
-rw-r--r-- fs/btrfs/xattr.c | 18
-rw-r--r-- fs/btrfs/zlib.c | 369
-rw-r--r-- fs/ceph/Makefile | 23
-rw-r--r-- fs/ceph/debugfs.c | 9
-rw-r--r-- fs/ceph/dir.c | 20
-rw-r--r-- fs/ceph/export.c | 2
-rw-r--r-- fs/ceph/inode.c | 4
-rw-r--r-- fs/ceph/mds_client.c | 56
-rw-r--r-- fs/ceph/mds_client.h | 2
-rw-r--r-- fs/ceph/super.c | 13
-rw-r--r-- fs/ceph/super.h | 2
-rw-r--r-- fs/char_dev.c | 15
-rw-r--r-- fs/cifs/cache.c | 16
-rw-r--r-- fs/cifs/cifs_debug.c | 32
-rw-r--r-- fs/cifs/cifs_dfs_ref.c | 120
-rw-r--r-- fs/cifs/cifs_fs_sb.h | 1
-rw-r--r-- fs/cifs/cifs_spnego.c | 10
-rw-r--r-- fs/cifs/cifs_unicode.c | 127
-rw-r--r-- fs/cifs/cifsacl.c | 13
-rw-r--r-- fs/cifs/cifsencrypt.c | 6
-rw-r--r-- fs/cifs/cifsfs.c | 67
-rw-r--r-- fs/cifs/cifsfs.h | 21
-rw-r--r-- fs/cifs/cifsglob.h | 73
-rw-r--r-- fs/cifs/cifspdu.h | 62
-rw-r--r-- fs/cifs/cifsproto.h | 9
-rw-r--r-- fs/cifs/cifssmb.c | 118
-rw-r--r-- fs/cifs/connect.c | 654
-rw-r--r-- fs/cifs/dir.c | 33
-rw-r--r-- fs/cifs/file.c | 522
-rw-r--r-- fs/cifs/inode.c | 30
-rw-r--r-- fs/cifs/link.c | 4
-rw-r--r-- fs/cifs/misc.c | 73
-rw-r--r-- fs/cifs/netmisc.c | 8
-rw-r--r-- fs/cifs/readdir.c | 6
-rw-r--r-- fs/cifs/sess.c | 150
-rw-r--r-- fs/cifs/transport.c | 436
-rw-r--r-- fs/coda/cache.c | 5
-rw-r--r-- fs/coda/cnode.c | 3
-rw-r--r-- fs/coda/coda_cache.h | 22
-rw-r--r-- fs/coda/coda_fs_i.h | 58
-rw-r--r-- fs/coda/coda_linux.c | 3
-rw-r--r-- fs/coda/coda_linux.h | 101
-rw-r--r-- fs/coda/dir.c | 9
-rw-r--r-- fs/coda/file.c | 3
-rw-r--r-- fs/coda/inode.c | 8
-rw-r--r-- fs/coda/pioctl.c | 4
-rw-r--r-- fs/coda/psdev.c | 4
-rw-r--r-- fs/coda/symlink.c | 4
-rw-r--r-- fs/coda/upcall.c | 5
-rw-r--r-- fs/compat.c | 10
-rw-r--r-- fs/configfs/Kconfig | 4
-rw-r--r-- fs/configfs/configfs_internal.h | 1
-rw-r--r-- fs/configfs/dir.c | 6
-rw-r--r-- fs/configfs/mount.c | 1
-rw-r--r-- fs/cramfs/inode.c | 110
-rw-r--r-- fs/dcache.c | 18
-rw-r--r-- fs/direct-io.c | 10
-rw-r--r-- fs/dlm/Kconfig | 3
-rw-r--r-- fs/dlm/lowcomms.c | 63
-rw-r--r-- fs/ecryptfs/crypto.c | 30
-rw-r--r-- fs/ecryptfs/ecryptfs_kernel.h | 2
-rw-r--r-- fs/ecryptfs/file.c | 28
-rw-r--r-- fs/ecryptfs/inode.c | 33
-rw-r--r-- fs/ecryptfs/keystore.c | 26
-rw-r--r-- fs/ecryptfs/main.c | 165
-rw-r--r-- fs/ecryptfs/mmap.c | 35
-rw-r--r-- fs/eventpoll.c | 20
-rw-r--r-- fs/ext2/dir.c | 19
-rw-r--r-- fs/ext2/namei.c | 2
-rw-r--r-- fs/ext2/super.c | 25
-rw-r--r-- fs/ext2/xattr.c | 10
-rw-r--r-- fs/ext3/balloc.c | 266
-rw-r--r-- fs/ext3/dir.c | 15
-rw-r--r-- fs/ext3/inode.c | 6
-rw-r--r-- fs/ext3/ioctl.c | 22
-rw-r--r-- fs/ext3/namei.c | 138
-rw-r--r-- fs/ext3/resize.c | 65
-rw-r--r-- fs/ext3/super.c | 101
-rw-r--r-- fs/ext3/xattr.c | 2
-rw-r--r-- fs/ext4/balloc.c | 3
-rw-r--r-- fs/ext4/dir.c | 56
-rw-r--r-- fs/ext4/ext4.h | 97
-rw-r--r-- fs/ext4/ext4_extents.h | 8
-rw-r--r-- fs/ext4/ext4_jbd2.h | 2
-rw-r--r-- fs/ext4/extents.c | 105
-rw-r--r-- fs/ext4/file.c | 24
-rw-r--r-- fs/ext4/fsync.c | 4
-rw-r--r-- fs/ext4/ialloc.c | 2
-rw-r--r-- fs/ext4/inode.c | 81
-rw-r--r-- fs/ext4/mballoc.c | 55
-rw-r--r-- fs/ext4/migrate.c | 2
-rw-r--r-- fs/ext4/namei.c | 69
-rw-r--r-- fs/ext4/page-io.c | 7
-rw-r--r-- fs/ext4/resize.c | 64
-rw-r--r-- fs/ext4/super.c | 325
-rw-r--r-- fs/ext4/xattr.c | 28
-rw-r--r-- fs/fat/fat.h | 3
-rw-r--r-- fs/fat/inode.c | 13
-rw-r--r-- fs/fat/namei_msdos.c | 27
-rw-r--r-- fs/fat/namei_vfat.c | 27
-rw-r--r-- fs/file_table.c | 2
-rw-r--r-- fs/fs-writeback.c | 105
-rw-r--r-- fs/fs_struct.c | 35
-rw-r--r-- fs/fscache/operation.c | 2
-rw-r--r-- fs/fuse/dev.c | 156
-rw-r--r-- fs/fuse/dir.c | 54
-rw-r--r-- fs/fuse/file.c | 66
-rw-r--r-- fs/fuse/fuse_i.h | 27
-rw-r--r-- fs/fuse/inode.c | 40
-rw-r--r-- fs/gfs2/export.c | 13
-rw-r--r-- fs/gfs2/file.c | 258
-rw-r--r-- fs/gfs2/incore.h | 1
-rw-r--r-- fs/gfs2/inode.c | 72
-rw-r--r-- fs/gfs2/inode.h | 1
-rw-r--r-- fs/gfs2/ops_fstype.c | 10
-rw-r--r-- fs/gfs2/ops_inode.c | 256
-rw-r--r-- fs/gfs2/super.c | 1
-rw-r--r-- fs/hfs/dir.c | 2
-rw-r--r-- fs/hfs/super.c | 3
-rw-r--r-- fs/hfsplus/dir.c | 1
-rw-r--r-- fs/hfsplus/super.c | 2
-rw-r--r-- fs/hostfs/hostfs_kern.c | 2
-rw-r--r-- fs/hpfs/dentry.c | 7
-rw-r--r-- fs/hpfs/dir.c | 1
-rw-r--r-- fs/hpfs/hpfs_fn.h | 2
-rw-r--r-- fs/hpfs/inode.c | 2
-rw-r--r-- fs/hpfs/super.c | 2
-rw-r--r-- fs/internal.h | 4
-rw-r--r-- fs/ioctl.c | 10
-rw-r--r-- fs/isofs/inode.c | 13
-rw-r--r-- fs/isofs/namei.c | 2
-rw-r--r-- fs/jbd/transaction.c | 2
-rw-r--r-- fs/jbd2/journal.c | 34
-rw-r--r-- fs/jbd2/recovery.c | 2
-rw-r--r-- fs/jbd2/transaction.c | 8
-rw-r--r-- fs/jffs2/build.c | 5
-rw-r--r-- fs/jffs2/jffs2_fs_sb.h | 2
-rw-r--r-- fs/jffs2/xattr.c | 12
-rw-r--r-- fs/jfs/jfs_logmgr.c | 17
-rw-r--r-- fs/jfs/namei.c | 10
-rw-r--r-- fs/jfs/super.c | 6
-rw-r--r-- fs/libfs.c | 4
-rw-r--r-- fs/lockd/Makefile | 6
-rw-r--r-- fs/lockd/clnt4xdr.c | 605
-rw-r--r-- fs/lockd/clntlock.c | 4
-rw-r--r-- fs/lockd/clntproc.c | 18
-rw-r--r-- fs/lockd/clntxdr.c | 627
-rw-r--r-- fs/lockd/host.c | 409
-rw-r--r-- fs/lockd/mon.c | 110
-rw-r--r-- fs/lockd/svc4proc.c | 20
-rw-r--r-- fs/lockd/svclock.c | 34
-rw-r--r-- fs/lockd/svcproc.c | 28
-rw-r--r-- fs/lockd/xdr.c | 287
-rw-r--r-- fs/lockd/xdr4.c | 255
-rw-r--r-- fs/locks.c | 8
-rw-r--r-- fs/logfs/dev_bdev.c | 7
-rw-r--r-- fs/mbcache.c | 12
-rw-r--r-- fs/minix/namei.c | 2
-rw-r--r-- fs/mpage.c | 49
-rw-r--r-- fs/namei.c | 432
-rw-r--r-- fs/namespace.c | 223
-rw-r--r-- fs/ncpfs/dir.c | 19
-rw-r--r-- fs/ncpfs/file.c | 3
-rw-r--r-- fs/ncpfs/inode.c | 6
-rw-r--r-- fs/ncpfs/ioctl.c | 4
-rw-r--r-- fs/ncpfs/mmap.c | 4
-rw-r--r-- fs/ncpfs/ncp_fs.h | 98
-rw-r--r-- fs/ncpfs/ncp_fs_i.h | 29
-rw-r--r-- fs/ncpfs/ncp_fs_sb.h | 176
-rw-r--r-- fs/ncpfs/ncplib_kernel.c | 2
-rw-r--r-- fs/ncpfs/ncplib_kernel.h | 2
-rw-r--r-- fs/ncpfs/ncpsign_kernel.c | 1
-rw-r--r-- fs/ncpfs/ncpsign_kernel.h | 2
-rw-r--r-- fs/ncpfs/sock.c | 2
-rw-r--r-- fs/ncpfs/symlink.c | 4
-rw-r--r-- fs/nfs/callback.c | 83
-rw-r--r-- fs/nfs/callback.h | 59
-rw-r--r-- fs/nfs/callback_proc.c | 326
-rw-r--r-- fs/nfs/callback_xdr.c | 143
-rw-r--r-- fs/nfs/client.c | 302
-rw-r--r-- fs/nfs/delegation.c | 362
-rw-r--r-- fs/nfs/delegation.h | 1
-rw-r--r-- fs/nfs/dir.c | 92
-rw-r--r-- fs/nfs/getroot.c | 6
-rw-r--r-- fs/nfs/idmap.c | 2
-rw-r--r-- fs/nfs/inode.c | 7
-rw-r--r-- fs/nfs/internal.h | 20
-rw-r--r-- fs/nfs/mount_clnt.c | 83
-rw-r--r-- fs/nfs/namespace.c | 77
-rw-r--r-- fs/nfs/nfs2xdr.c | 1294
-rw-r--r-- fs/nfs/nfs3xdr.c | 2817
-rw-r--r-- fs/nfs/nfs4_fs.h | 13
-rw-r--r-- fs/nfs/nfs4filelayout.c | 6
-rw-r--r-- fs/nfs/nfs4proc.c | 188
-rw-r--r-- fs/nfs/nfs4renewd.c | 11
-rw-r--r-- fs/nfs/nfs4state.c | 293
-rw-r--r-- fs/nfs/nfs4xdr.c | 1426
-rw-r--r-- fs/nfs/pagelist.c | 7
-rw-r--r-- fs/nfs/pnfs.c | 524
-rw-r--r-- fs/nfs/pnfs.h | 76
-rw-r--r-- fs/nfs/proc.c | 5
-rw-r--r-- fs/nfs/super.c | 19
-rw-r--r-- fs/nfs/unlink.c | 2
-rw-r--r-- fs/nfsd/acl.h | 59
-rw-r--r-- fs/nfsd/export.c | 4
-rw-r--r-- fs/nfsd/idmap.h | 62
-rw-r--r-- fs/nfsd/nfs3proc.c | 8
-rw-r--r-- fs/nfsd/nfs4acl.c | 2
-rw-r--r-- fs/nfsd/nfs4callback.c | 841
-rw-r--r-- fs/nfsd/nfs4idmap.c | 15
-rw-r--r-- fs/nfsd/nfs4proc.c | 59
-rw-r--r-- fs/nfsd/nfs4recover.c | 1
-rw-r--r-- fs/nfsd/nfs4state.c | 243
-rw-r--r-- fs/nfsd/nfs4xdr.c | 115
-rw-r--r-- fs/nfsd/nfsctl.c | 4
-rw-r--r-- fs/nfsd/nfsd.h | 1
-rw-r--r-- fs/nfsd/nfsproc.c | 6
-rw-r--r-- fs/nfsd/nfssvc.c | 2
-rw-r--r-- fs/nfsd/state.h | 16
-rw-r--r-- fs/nfsd/vfs.c | 95
-rw-r--r-- fs/nfsd/xdr4.h | 9
-rw-r--r-- fs/nilfs2/bmap.c | 47
-rw-r--r-- fs/nilfs2/btnode.c | 3
-rw-r--r-- fs/nilfs2/dir.c | 3
-rw-r--r-- fs/nilfs2/file.c | 1
-rw-r--r-- fs/nilfs2/ifile.c | 11
-rw-r--r-- fs/nilfs2/inode.c | 180
-rw-r--r-- fs/nilfs2/ioctl.c | 12
-rw-r--r-- fs/nilfs2/mdt.c | 32
-rw-r--r-- fs/nilfs2/namei.c | 1
-rw-r--r-- fs/nilfs2/nilfs.h | 13
-rw-r--r-- fs/nilfs2/page.c | 86
-rw-r--r-- fs/nilfs2/page.h | 3
-rw-r--r-- fs/nilfs2/recovery.c | 2
-rw-r--r-- fs/nilfs2/sb.h | 8
-rw-r--r-- fs/nilfs2/segment.c | 43
-rw-r--r-- fs/nilfs2/super.c | 38
-rw-r--r-- fs/nilfs2/the_nilfs.c | 6
-rw-r--r-- fs/nilfs2/the_nilfs.h | 3
-rw-r--r-- fs/notify/fanotify/Kconfig | 2
-rw-r--r-- fs/ntfs/Makefile | 2
-rw-r--r-- fs/ntfs/file.c | 35
-rw-r--r-- fs/ntfs/super.c | 6
-rw-r--r-- fs/ocfs2/Kconfig | 5
-rw-r--r-- fs/ocfs2/alloc.c | 77
-rw-r--r-- fs/ocfs2/alloc.h | 4
-rw-r--r-- fs/ocfs2/aops.c | 59
-rw-r--r-- fs/ocfs2/cluster/heartbeat.c | 248
-rw-r--r-- fs/ocfs2/cluster/netdebug.c | 286
-rw-r--r-- fs/ocfs2/cluster/tcp.c | 145
-rw-r--r-- fs/ocfs2/cluster/tcp_internal.h | 33
-rw-r--r-- fs/ocfs2/dlm/dlmast.c | 76
-rw-r--r-- fs/ocfs2/dlm/dlmcommon.h | 86
-rw-r--r-- fs/ocfs2/dlm/dlmdebug.c | 200
-rw-r--r-- fs/ocfs2/dlm/dlmdebug.h | 5
-rw-r--r-- fs/ocfs2/dlm/dlmdomain.c | 10
-rw-r--r-- fs/ocfs2/dlm/dlmlock.c | 3
-rw-r--r-- fs/ocfs2/dlm/dlmthread.c | 132
-rw-r--r-- fs/ocfs2/export.c | 6
-rw-r--r-- fs/ocfs2/file.c | 18
-rw-r--r-- fs/ocfs2/inode.c | 2
-rw-r--r-- fs/ocfs2/namei.c | 10
-rw-r--r-- fs/ocfs2/ocfs2.h | 5
-rw-r--r-- fs/ocfs2/suballoc.c | 2
-rw-r--r-- fs/ocfs2/super.c | 6
-rw-r--r-- fs/open.c | 11
-rw-r--r-- fs/partitions/check.c | 106
-rw-r--r-- fs/pipe.c | 16
-rw-r--r-- fs/proc/Kconfig | 6
-rw-r--r-- fs/proc/Makefile | 2
-rw-r--r-- fs/proc/array.c | 28
-rw-r--r-- fs/proc/base.c | 57
-rw-r--r-- fs/proc/consoles.c (renamed from fs/proc/proc_console.c) | 4
-rw-r--r-- fs/proc/devices.c | 4
-rw-r--r-- fs/proc/generic.c | 17
-rw-r--r-- fs/proc/inode.c | 7
-rw-r--r-- fs/proc/internal.h | 5
-rw-r--r-- fs/proc/kcore.c | 2
-rw-r--r-- fs/proc/meminfo.c | 14
-rw-r--r-- fs/proc/page.c | 16
-rw-r--r-- fs/proc/proc_tty.c | 26
-rw-r--r-- fs/proc/softirqs.c | 6
-rw-r--r-- fs/proc/stat.c | 2
-rw-r--r-- fs/proc/task_mmu.c | 12
-rw-r--r-- fs/proc/task_nommu.c | 7
-rw-r--r-- fs/quota/dquot.c | 36
-rw-r--r-- fs/quota/quota.c | 41
-rw-r--r-- fs/quota/quota_tree.c | 9
-rw-r--r-- fs/read_write.c | 27
-rw-r--r-- fs/reiserfs/journal.c | 21
-rw-r--r-- fs/reiserfs/prints.c | 4
-rw-r--r-- fs/reiserfs/super.c | 17
-rw-r--r-- fs/select.c | 2
-rw-r--r-- fs/splice.c | 43
-rw-r--r-- fs/squashfs/Kconfig | 18
-rw-r--r-- fs/squashfs/Makefile | 1
-rw-r--r-- fs/squashfs/block.c | 1
-rw-r--r-- fs/squashfs/cache.c | 1
-rw-r--r-- fs/squashfs/decompressor.c | 16
-rw-r--r-- fs/squashfs/decompressor.h | 9
-rw-r--r-- fs/squashfs/fragment.c | 1
-rw-r--r-- fs/squashfs/id.c | 1
-rw-r--r-- fs/squashfs/lzo_wrapper.c | 1
-rw-r--r-- fs/squashfs/squashfs.h | 8
-rw-r--r-- fs/squashfs/squashfs_fs.h | 1
-rw-r--r-- fs/squashfs/squashfs_fs_i.h | 6
-rw-r--r-- fs/squashfs/xattr_id.c | 1
-rw-r--r-- fs/squashfs/xz_wrapper.c | 153
-rw-r--r-- fs/squashfs/zlib_wrapper.c | 15
-rw-r--r-- fs/stat.c | 4
-rw-r--r-- fs/super.c | 21
-rw-r--r-- fs/sysfs/Kconfig | 2
-rw-r--r-- fs/sysfs/group.c | 10
-rw-r--r-- fs/sysfs/inode.c | 1
-rw-r--r-- fs/sysfs/sysfs.h | 1
-rw-r--r-- fs/sysv/namei.c | 1
-rw-r--r-- fs/sysv/super.c | 8
-rw-r--r-- fs/udf/Kconfig | 1
-rw-r--r-- fs/udf/balloc.c | 3
-rw-r--r-- fs/udf/dir.c | 5
-rw-r--r-- fs/udf/file.c | 11
-rw-r--r-- fs/udf/ialloc.c | 21
-rw-r--r-- fs/udf/inode.c | 51
-rw-r--r-- fs/udf/namei.c | 107
-rw-r--r-- fs/udf/partition.c | 27
-rw-r--r-- fs/udf/super.c | 67
-rw-r--r-- fs/udf/symlink.c | 12
-rw-r--r-- fs/udf/udf_i.h | 13
-rw-r--r-- fs/udf/udf_sb.h | 22
-rw-r--r-- fs/udf/udfdecl.h | 4
-rw-r--r-- fs/xfs/Makefile | 1
-rw-r--r-- fs/xfs/linux-2.6/sv.h | 59
-rw-r--r-- fs/xfs/linux-2.6/xfs_aops.c | 425
-rw-r--r-- fs/xfs/linux-2.6/xfs_aops.h | 16
-rw-r--r-- fs/xfs/linux-2.6/xfs_buf.c | 238
-rw-r--r-- fs/xfs/linux-2.6/xfs_buf.h | 29
-rw-r--r-- fs/xfs/linux-2.6/xfs_discard.c | 191
-rw-r--r-- fs/xfs/linux-2.6/xfs_discard.h | 8
-rw-r--r-- fs/xfs/linux-2.6/xfs_export.c | 12
-rw-r--r-- fs/xfs/linux-2.6/xfs_file.c | 587
-rw-r--r-- fs/xfs/linux-2.6/xfs_ioctl.c | 3
-rw-r--r-- fs/xfs/linux-2.6/xfs_iops.c | 54
-rw-r--r-- fs/xfs/linux-2.6/xfs_linux.h | 1
-rw-r--r-- fs/xfs/linux-2.6/xfs_super.c | 31
-rw-r--r-- fs/xfs/linux-2.6/xfs_sync.c | 103
-rw-r--r-- fs/xfs/linux-2.6/xfs_sysctl.c | 23
-rw-r--r-- fs/xfs/linux-2.6/xfs_trace.h | 92
-rw-r--r-- fs/xfs/quota/xfs_dquot.c | 1
-rw-r--r-- fs/xfs/support/debug.c | 112
-rw-r--r-- fs/xfs/support/debug.h | 25
-rw-r--r-- fs/xfs/xfs_ag.h | 2
-rw-r--r-- fs/xfs/xfs_alloc.c | 361
-rw-r--r-- fs/xfs/xfs_alloc.h | 25
-rw-r--r-- fs/xfs/xfs_attr_leaf.c | 4
-rw-r--r-- fs/xfs/xfs_btree.c | 9
-rw-r--r-- fs/xfs/xfs_buf_item.c | 179
-rw-r--r-- fs/xfs/xfs_buf_item.h | 11
-rw-r--r-- fs/xfs/xfs_error.c | 31
-rw-r--r-- fs/xfs/xfs_error.h | 18
-rw-r--r-- fs/xfs/xfs_extfree_item.c | 97
-rw-r--r-- fs/xfs/xfs_extfree_item.h | 11
-rw-r--r-- fs/xfs/xfs_fsops.c | 11
-rw-r--r-- fs/xfs/xfs_fsops.h | 2
-rw-r--r-- fs/xfs/xfs_iget.c | 79
-rw-r--r-- fs/xfs/xfs_inode.c | 54
-rw-r--r-- fs/xfs/xfs_inode.h | 15
-rw-r--r-- fs/xfs/xfs_inode_item.c | 90
-rw-r--r-- fs/xfs/xfs_iomap.c | 233
-rw-r--r-- fs/xfs/xfs_iomap.h | 27
-rw-r--r-- fs/xfs/xfs_log.c | 741
-rw-r--r-- fs/xfs/xfs_log_cil.c | 17
-rw-r--r-- fs/xfs/xfs_log_priv.h | 127
-rw-r--r-- fs/xfs/xfs_log_recover.c | 622
-rw-r--r-- fs/xfs/xfs_mount.c | 23
-rw-r--r-- fs/xfs/xfs_mount.h | 14
-rw-r--r-- fs/xfs/xfs_trans.c | 81
-rw-r--r-- fs/xfs/xfs_trans.h | 2
-rw-r--r-- fs/xfs/xfs_trans_ail.c | 232
-rw-r--r-- fs/xfs/xfs_trans_extfree.c | 8
-rw-r--r-- fs/xfs/xfs_trans_priv.h | 35
-rw-r--r-- fs/xfs/xfs_vnodeops.c | 61
449 files changed, 21562 insertions, 13761 deletions
diff --git a/fs/9p/Kconfig b/fs/9p/Kconfig
index 7e0511476797..814ac4e213a8 100644
--- a/fs/9p/Kconfig
+++ b/fs/9p/Kconfig
@@ -9,6 +9,8 @@ config 9P_FS
 
 	  If unsure, say N.
 
+if 9P_FS
+
 config 9P_FSCACHE
 	bool "Enable 9P client caching support (EXPERIMENTAL)"
 	depends on EXPERIMENTAL
@@ -20,7 +22,6 @@ config 9P_FSCACHE
 
 config 9P_FS_POSIX_ACL
 	bool "9P POSIX Access Control Lists"
-	depends on 9P_FS
 	select FS_POSIX_ACL
 	help
 	  POSIX Access Control Lists (ACLs) support permissions for users and
@@ -30,3 +31,5 @@ config 9P_FS_POSIX_ACL
 	  Linux website <http://acl.bestbits.at/>.
 
 	  If you don't know what Access Control Lists are, say N
+
+endif
diff --git a/fs/9p/Makefile b/fs/9p/Makefile
index f8ba37effd1b..ab8c12780634 100644
--- a/fs/9p/Makefile
+++ b/fs/9p/Makefile
@@ -3,6 +3,7 @@ obj-$(CONFIG_9P_FS) := 9p.o
 9p-objs := \
 	vfs_super.o \
 	vfs_inode.o \
+	vfs_inode_dotl.o \
 	vfs_addr.o \
 	vfs_file.o \
 	vfs_dir.o \
diff --git a/fs/9p/acl.c b/fs/9p/acl.c
index 6e58c4ca1e6e..02a2cf616318 100644
--- a/fs/9p/acl.c
+++ b/fs/9p/acl.c
@@ -28,7 +28,7 @@ static struct posix_acl *__v9fs_get_acl(struct p9_fid *fid, char *name)
 {
 	ssize_t size;
 	void *value = NULL;
-	struct posix_acl *acl = NULL;;
+	struct posix_acl *acl = NULL;
 
 	size = v9fs_fid_xattr_get(fid, name, NULL, 0);
 	if (size > 0) {
@@ -365,7 +365,7 @@ static int v9fs_xattr_set_acl(struct dentry *dentry, const char *name,
 	case ACL_TYPE_DEFAULT:
 		name = POSIX_ACL_XATTR_DEFAULT;
 		if (!S_ISDIR(inode->i_mode)) {
-			retval = -EINVAL;
+			retval = acl ? -EINVAL : 0;
 			goto err_out;
 		}
 		break;
diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h
index cb6396855e2d..c4b5d8864f0d 100644
--- a/fs/9p/v9fs.h
+++ b/fs/9p/v9fs.h
@@ -113,9 +113,27 @@ struct v9fs_session_info {
 
 struct p9_fid *v9fs_session_init(struct v9fs_session_info *, const char *,
 								char *);
-void v9fs_session_close(struct v9fs_session_info *v9ses);
-void v9fs_session_cancel(struct v9fs_session_info *v9ses);
-void v9fs_session_begin_cancel(struct v9fs_session_info *v9ses);
+extern void v9fs_session_close(struct v9fs_session_info *v9ses);
+extern void v9fs_session_cancel(struct v9fs_session_info *v9ses);
+extern void v9fs_session_begin_cancel(struct v9fs_session_info *v9ses);
+extern struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
+			struct nameidata *nameidata);
+extern int v9fs_vfs_unlink(struct inode *i, struct dentry *d);
+extern int v9fs_vfs_rmdir(struct inode *i, struct dentry *d);
+extern int v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
+			struct inode *new_dir, struct dentry *new_dentry);
+extern void v9fs_vfs_put_link(struct dentry *dentry, struct nameidata *nd,
+			void *p);
+extern struct inode *v9fs_inode(struct v9fs_session_info *v9ses,
+				struct p9_fid *fid,
+				struct super_block *sb);
+
+extern const struct inode_operations v9fs_dir_inode_operations_dotl;
+extern const struct inode_operations v9fs_file_inode_operations_dotl;
+extern const struct inode_operations v9fs_symlink_inode_operations_dotl;
+extern struct inode *v9fs_inode_dotl(struct v9fs_session_info *v9ses,
+					struct p9_fid *fid,
+					struct super_block *sb);
 
 /* other default globals */
 #define V9FS_PORT	564
@@ -138,3 +156,21 @@ static inline int v9fs_proto_dotl(struct v9fs_session_info *v9ses)
 {
 	return v9ses->flags & V9FS_PROTO_2000L;
 }
+
+/**
+ * v9fs_inode_from_fid - Helper routine to populate an inode by
+ * issuing a attribute request
+ * @v9ses: session information
+ * @fid: fid to issue attribute request for
+ * @sb: superblock on which to create inode
+ *
+ */
+static inline struct inode *
+v9fs_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid,
+		    struct super_block *sb)
+{
+	if (v9fs_proto_dotl(v9ses))
+		return v9fs_inode_dotl(v9ses, fid, sb);
+	else
+		return v9fs_inode(v9ses, fid, sb);
+}
diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h
index bab0eac873f4..b789f8e597ec 100644
--- a/fs/9p/v9fs_vfs.h
+++ b/fs/9p/v9fs_vfs.h
@@ -59,7 +59,6 @@ void v9fs_stat2inode_dotl(struct p9_stat_dotl *, struct inode *);
 int v9fs_dir_release(struct inode *inode, struct file *filp);
 int v9fs_file_open(struct inode *inode, struct file *file);
 void v9fs_inode2stat(struct inode *inode, struct p9_wstat *stat);
-void v9fs_dentry_release(struct dentry *);
 int v9fs_uflags2omode(int uflags, int extended);
 
 ssize_t v9fs_file_readn(struct file *, char *, char __user *, u32, u64);
diff --git a/fs/9p/vfs_dentry.c b/fs/9p/vfs_dentry.c
index 466d2a4fc5cb..233b7d4ffe5e 100644
--- a/fs/9p/vfs_dentry.c
+++ b/fs/9p/vfs_dentry.c
@@ -86,7 +86,7 @@ static int v9fs_cached_dentry_delete(const struct dentry *dentry)
  *
  */
 
-void v9fs_dentry_release(struct dentry *dentry)
+static void v9fs_dentry_release(struct dentry *dentry)
 {
 	struct v9fs_dentry *dent;
 	struct p9_fid *temp, *current_fid;
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 59782981b225..b76a40bdf4c2 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -49,15 +49,8 @@
 
 static const struct inode_operations v9fs_dir_inode_operations;
 static const struct inode_operations v9fs_dir_inode_operations_dotu;
-static const struct inode_operations v9fs_dir_inode_operations_dotl;
 static const struct inode_operations v9fs_file_inode_operations;
-static const struct inode_operations v9fs_file_inode_operations_dotl;
 static const struct inode_operations v9fs_symlink_inode_operations;
-static const struct inode_operations v9fs_symlink_inode_operations_dotl;
-
-static int
-v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode,
-		dev_t rdev);
 
 /**
  * unixmode2p9mode - convert unix mode bits to plan 9
@@ -251,41 +244,6 @@ void v9fs_destroy_inode(struct inode *inode)
 #endif
 
 /**
- * v9fs_get_fsgid_for_create - Helper function to get the gid for creating a
- * new file system object. This checks the S_ISGID to determine the owning
- * group of the new file system object.
- */
-
-static gid_t v9fs_get_fsgid_for_create(struct inode *dir_inode)
-{
-	BUG_ON(dir_inode == NULL);
-
-	if (dir_inode->i_mode & S_ISGID) {
-		/* set_gid bit is set.*/
-		return dir_inode->i_gid;
-	}
-	return current_fsgid();
-}
-
-/**
- * v9fs_dentry_from_dir_inode - helper function to get the dentry from
- * dir inode.
- *
- */
-
-static struct dentry *v9fs_dentry_from_dir_inode(struct inode *inode)
-{
-	struct dentry *dentry;
-
-	spin_lock(&inode->i_lock);
-	/* Directory should have only one entry. */
-	BUG_ON(S_ISDIR(inode->i_mode) && !list_is_singular(&inode->i_dentry));
-	dentry = list_entry(inode->i_dentry.next, struct dentry, d_alias);
-	spin_unlock(&inode->i_lock);
-	return dentry;
-}
-
-/**
  * v9fs_get_inode - helper function to setup an inode
  * @sb: superblock
  * @mode: mode to setup inode with
@@ -454,7 +412,7 @@ void v9fs_evict_inode(struct inode *inode)
 #endif
 }
 
-static struct inode *
+struct inode *
 v9fs_inode(struct v9fs_session_info *v9ses, struct p9_fid *fid,
 	struct super_block *sb)
 {
@@ -489,60 +447,6 @@ error:
 	return ERR_PTR(err);
 }
 
-static struct inode *
-v9fs_inode_dotl(struct v9fs_session_info *v9ses, struct p9_fid *fid,
-	struct super_block *sb)
-{
-	struct inode *ret = NULL;
-	int err;
-	struct p9_stat_dotl *st;
-
-	st = p9_client_getattr_dotl(fid, P9_STATS_BASIC);
-	if (IS_ERR(st))
-		return ERR_CAST(st);
-
-	ret = v9fs_get_inode(sb, st->st_mode);
-	if (IS_ERR(ret)) {
-		err = PTR_ERR(ret);
-		goto error;
-	}
-
-	v9fs_stat2inode_dotl(st, ret);
-	ret->i_ino = v9fs_qid2ino(&st->qid);
-#ifdef CONFIG_9P_FSCACHE
-	v9fs_vcookie_set_qid(ret, &st->qid);
-	v9fs_cache_inode_get_cookie(ret);
-#endif
-	err = v9fs_get_acl(ret, fid);
-	if (err) {
-		iput(ret);
-		goto error;
-	}
-	kfree(st);
-	return ret;
-error:
-	kfree(st);
-	return ERR_PTR(err);
-}
-
-/**
- * v9fs_inode_from_fid - Helper routine to populate an inode by
- * issuing a attribute request
- * @v9ses: session information
- * @fid: fid to issue attribute request for
- * @sb: superblock on which to create inode
- *
- */
-static inline struct inode *
-v9fs_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid,
-		    struct super_block *sb)
-{
-	if (v9fs_proto_dotl(v9ses))
-		return v9fs_inode_dotl(v9ses, fid, sb);
-	else
-		return v9fs_inode(v9ses, fid, sb);
-}
-
 /**
  * v9fs_remove - helper function to remove files and directories
  * @dir: directory inode that is being deleted
@@ -633,12 +537,6 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir,
 		P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", err);
 		goto error;
 	}
-
-	if (v9ses->cache)
-		d_set_d_op(dentry, &v9fs_cached_dentry_operations);
-	else
-		d_set_d_op(dentry, &v9fs_dentry_operations);
-
 	d_instantiate(dentry, inode);
 	err = v9fs_fid_add(dentry, fid);
 	if (err < 0)
@@ -657,144 +555,6 @@ error:
 }
 
 /**
- * v9fs_vfs_create_dotl - VFS hook to create files for 9P2000.L protocol.
- * @dir: directory inode that is being created
- * @dentry: dentry that is being deleted
- * @mode: create permissions
- * @nd: path information
- *
- */
-
-static int
-v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode,
-		struct nameidata *nd)
-{
-	int err = 0;
-	char *name = NULL;
-	gid_t gid;
-	int flags;
-	mode_t mode;
-	struct v9fs_session_info *v9ses;
-	struct p9_fid *fid = NULL;
-	struct p9_fid *dfid, *ofid;
-	struct file *filp;
-	struct p9_qid qid;
-	struct inode *inode;
-	struct posix_acl *pacl = NULL, *dacl = NULL;
-
-	v9ses = v9fs_inode2v9ses(dir);
-	if (nd && nd->flags & LOOKUP_OPEN)
-		flags = nd->intent.open.flags - 1;
-	else {
-		/*
-		 * create call without LOOKUP_OPEN is due
-		 * to mknod of regular files. So use mknod
-		 * operation.
-		 */
-		return v9fs_vfs_mknod_dotl(dir, dentry, omode, 0);
-	}
-
-	name = (char *) dentry->d_name.name;
-	P9_DPRINTK(P9_DEBUG_VFS, "v9fs_vfs_create_dotl: name:%s flags:0x%x "
-			"mode:0x%x\n", name, flags, omode);
-
-	dfid = v9fs_fid_lookup(dentry->d_parent);
-	if (IS_ERR(dfid)) {
-		err = PTR_ERR(dfid);
-		P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
-		return err;
-	}
-
-	/* clone a fid to use for creation */
-	ofid = p9_client_walk(dfid, 0, NULL, 1);
-	if (IS_ERR(ofid)) {
-		err = PTR_ERR(ofid);
-		P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err);
-		return err;
-	}
-
-	gid = v9fs_get_fsgid_for_create(dir);
-
-	mode = omode;
-	/* Update mode based on ACL value */
-	err = v9fs_acl_mode(dir, &mode, &dacl, &pacl);
-	if (err) {
-		P9_DPRINTK(P9_DEBUG_VFS,
-			   "Failed to get acl values in creat %d\n", err);
-		goto error;
-	}
-	err = p9_client_create_dotl(ofid, name, flags, mode, gid, &qid);
-	if (err < 0) {
-		P9_DPRINTK(P9_DEBUG_VFS,
-			   "p9_client_open_dotl failed in creat %d\n",
-			   err);
-		goto error;
-	}
-	/* instantiate inode and assign the unopened fid to the dentry */
-	if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE ||
-			(nd && nd->flags & LOOKUP_OPEN)) {
-		fid = p9_client_walk(dfid, 1, &name, 1);
-		if (IS_ERR(fid)) {
-			err = PTR_ERR(fid);
-			P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n",
-				err);
-			fid = NULL;
-			goto error;
-		}
-
-		inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb);
-		if (IS_ERR(inode)) {
-			err = PTR_ERR(inode);
-			P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n",
-				err);
-			goto error;
-		}
-		d_set_d_op(dentry, &v9fs_cached_dentry_operations);
-		d_instantiate(dentry, inode);
-		err = v9fs_fid_add(dentry, fid);
-		if (err < 0)
-			goto error;
-		/* The fid would get clunked via a dput */
-		fid = NULL;
-	} else {
-		/*
-		 * Not in cached mode. No need to populate
-		 * inode with stat. We need to get an inode
-		 * so that we can set the acl with dentry
-		 */
-		inode = v9fs_get_inode(dir->i_sb, mode);
-		if (IS_ERR(inode)) {
-			err = PTR_ERR(inode);
-			goto error;
-		}
-		d_set_d_op(dentry, &v9fs_dentry_operations);
-		d_instantiate(dentry, inode);
-	}
-	/* Now set the ACL based on the default value */
-	v9fs_set_create_acl(dentry, dacl, pacl);
-
-	/* if we are opening a file, assign the open fid to the file */
-	if (nd && nd->flags & LOOKUP_OPEN) {
-		filp = lookup_instantiate_filp(nd, dentry, generic_file_open);
-		if (IS_ERR(filp)) {
-			p9_client_clunk(ofid);
-			return PTR_ERR(filp);
-		}
-		filp->private_data = ofid;
-	} else
-		p9_client_clunk(ofid);
-
-	return 0;
-
-error:
-	if (ofid)
-		p9_client_clunk(ofid);
-	if (fid)
-		p9_client_clunk(fid);
-	return err;
-}
-
-/**
  * v9fs_vfs_create - VFS hook to create files
  * @dir: directory inode that is being created
  * @dentry: dentry that is being deleted
@@ -884,107 +644,6 @@ static int v9fs_vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	return err;
 }
 
-
-/**
- * v9fs_vfs_mkdir_dotl - VFS mkdir hook to create a directory
- * @dir: inode that is being unlinked
- * @dentry: dentry that is being unlinked
- * @mode: mode for new directory
- *
- */
-
-static int v9fs_vfs_mkdir_dotl(struct inode *dir,
-			struct dentry *dentry, int omode)
-{
-	int err;
-	struct v9fs_session_info *v9ses;
-	struct p9_fid *fid = NULL, *dfid = NULL;
-	gid_t gid;
-	char *name;
-	mode_t mode;
-	struct inode *inode;
-	struct p9_qid qid;
-	struct dentry *dir_dentry;
-	struct posix_acl *dacl = NULL, *pacl = NULL;
-
-	P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", dentry->d_name.name);
-	err = 0;
-	v9ses = v9fs_inode2v9ses(dir);
-
-	omode |= S_IFDIR;
-	if (dir->i_mode & S_ISGID)
-		omode |= S_ISGID;
-
-	dir_dentry = v9fs_dentry_from_dir_inode(dir);
-	dfid = v9fs_fid_lookup(dir_dentry);
-	if (IS_ERR(dfid)) {
-		err = PTR_ERR(dfid);
-		P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
-		dfid = NULL;
-		goto error;
-	}
-
-	gid = v9fs_get_fsgid_for_create(dir);
-	mode = omode;
-	/* Update mode based on ACL value */
-	err = v9fs_acl_mode(dir, &mode, &dacl, &pacl);
-	if (err) {
-		P9_DPRINTK(P9_DEBUG_VFS,
-			   "Failed to get acl values in mkdir %d\n", err);
-		goto error;
-	}
-	name = (char *) dentry->d_name.name;
-	err = p9_client_mkdir_dotl(dfid, name, mode, gid, &qid);
-	if (err < 0)
-		goto error;
-
-	/* instantiate inode and assign the unopened fid to the dentry */
-	if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
-		fid = p9_client_walk(dfid, 1, &name, 1);
-		if (IS_ERR(fid)) {
-			err = PTR_ERR(fid);
-			P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n",
-				err);
-			fid = NULL;
-			goto error;
-		}
-
-		inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb);
-		if (IS_ERR(inode)) {
-			err = PTR_ERR(inode);
-			P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n",
-				err);
-			goto error;
-		}
-		d_set_d_op(dentry, &v9fs_cached_dentry_operations);
-		d_instantiate(dentry, inode);
-		err = v9fs_fid_add(dentry, fid);
-		if (err < 0)
-			goto error;
-		fid = NULL;
-	} else {
-		/*
-		 * Not in cached mode. No need to populate
-		 * inode with stat. We need to get an inode
-		 * so that we can set the acl with dentry
-		 */
-		inode = v9fs_get_inode(dir->i_sb, mode);
-		if (IS_ERR(inode)) {
-			err = PTR_ERR(inode);
-			goto error;
-		}
-		d_set_d_op(dentry, &v9fs_dentry_operations);
-		d_instantiate(dentry, inode);
-	}
-	/* Now set the ACL based on the default value */
-	v9fs_set_create_acl(dentry, dacl, pacl);
-
-error:
-	if (fid)
-		p9_client_clunk(fid);
-	return err;
-}
-
 /**
  * v9fs_vfs_lookup - VFS lookup hook to "walk" to a new inode
  * @dir: inode that is being walked from
@@ -993,7 +652,7 @@ error:
  *
  */
 
-static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
+struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
 				      struct nameidata *nameidata)
 {
 	struct super_block *sb;
@@ -1040,11 +699,6 @@ static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
 		goto error_iput;
 
 inst_out:
-	if (v9ses->cache)
-		d_set_d_op(dentry, &v9fs_cached_dentry_operations);
-	else
-		d_set_d_op(dentry, &v9fs_dentry_operations);
-
 	d_add(dentry, inode);
 	return NULL;
 
@@ -1063,7 +717,7 @@ error:
  *
  */
 
-static int v9fs_vfs_unlink(struct inode *i, struct dentry *d)
+int v9fs_vfs_unlink(struct inode *i, struct dentry *d)
 {
 	return v9fs_remove(i, d, 0);
 }
@@ -1075,7 +729,7 @@ static int v9fs_vfs_unlink(struct inode *i, struct dentry *d)
  *
  */
 
-static int v9fs_vfs_rmdir(struct inode *i, struct dentry *d)
+int v9fs_vfs_rmdir(struct inode *i, struct dentry *d)
 {
 	return v9fs_remove(i, d, 1);
 }
@@ -1089,7 +743,7 @@ static int v9fs_vfs_rmdir(struct inode *i, struct dentry *d)
  *
  */
 
-static int
+int
 v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 		struct inode *new_dir, struct dentry *new_dentry)
 {
@@ -1196,42 +850,6 @@ v9fs_vfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
 	return 0;
 }
 
-static int
-v9fs_vfs_getattr_dotl(struct vfsmount *mnt, struct dentry *dentry,
-		 struct kstat *stat)
-{
-	int err;
-	struct v9fs_session_info *v9ses;
-	struct p9_fid *fid;
-	struct p9_stat_dotl *st;
-
-	P9_DPRINTK(P9_DEBUG_VFS, "dentry: %p\n", dentry);
-	err = -EPERM;
-	v9ses = v9fs_inode2v9ses(dentry->d_inode);
-	if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE)
-		return simple_getattr(mnt, dentry, stat);
-
-	fid = v9fs_fid_lookup(dentry);
-	if (IS_ERR(fid))
-		return PTR_ERR(fid);
-
-	/* Ask for all the fields in stat structure. Server will return
-	 * whatever it supports
-	 */
-
-	st = p9_client_getattr_dotl(fid, P9_STATS_ALL);
-	if (IS_ERR(st))
-		return PTR_ERR(st);
-
-	v9fs_stat2inode_dotl(st, dentry->d_inode);
-	generic_fillattr(dentry->d_inode, stat);
-	/* Change block size to what the server returned */
-	stat->blksize = st->st_blksize;
-
-	kfree(st);
-	return 0;
-}
-
 /**
  * v9fs_vfs_setattr - set file metadata
  * @dentry: file whose metadata to set
@@ -1291,64 +909,6 @@ static int v9fs_vfs_setattr(struct dentry *dentry, struct iattr *iattr)
 }
 
 /**
- * v9fs_vfs_setattr_dotl - set file metadata
- * @dentry: file whose metadata to set
- * @iattr: metadata assignment structure
- *
- */
-
-int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr)
-{
-	int retval;
-	struct v9fs_session_info *v9ses;
-	struct p9_fid *fid;
-	struct p9_iattr_dotl p9attr;
-
-	P9_DPRINTK(P9_DEBUG_VFS, "\n");
-
-	retval = inode_change_ok(dentry->d_inode, iattr);
-	if (retval)
-		return retval;
-
-	p9attr.valid = iattr->ia_valid;
-	p9attr.mode = iattr->ia_mode;
-	p9attr.uid = iattr->ia_uid;
-	p9attr.gid = iattr->ia_gid;
-	p9attr.size = iattr->ia_size;
-	p9attr.atime_sec = iattr->ia_atime.tv_sec;
-	p9attr.atime_nsec = iattr->ia_atime.tv_nsec;
-	p9attr.mtime_sec = iattr->ia_mtime.tv_sec;
-	p9attr.mtime_nsec = iattr->ia_mtime.tv_nsec;
-
-	retval = -EPERM;
-	v9ses = v9fs_inode2v9ses(dentry->d_inode);
-	fid = v9fs_fid_lookup(dentry);
-	if (IS_ERR(fid))
-		return PTR_ERR(fid);
-
-	retval = p9_client_setattr(fid, &p9attr);
-	if (retval < 0)
-		return retval;
-
-	if ((iattr->ia_valid & ATTR_SIZE) &&
-	    iattr->ia_size != i_size_read(dentry->d_inode)) {
-		retval = vmtruncate(dentry->d_inode, iattr->ia_size);
-		if (retval)
-			return retval;
-	}
-
-	setattr_copy(dentry->d_inode, iattr);
-	mark_inode_dirty(dentry->d_inode);
-	if (iattr->ia_valid & ATTR_MODE) {
-		/* We also want to update ACL when we update mode bits */
-		retval = v9fs_acl_chmod(dentry);
-		if (retval < 0)
-			return retval;
-	}
-	return 0;
-}
-
-/**
  * v9fs_stat2inode - populate an inode structure with mistat info
  * @stat: Plan 9 metadata (mistat) structure
  * @inode: inode to populate
@@ -1426,77 +986,6 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode,
 }
 
 /**
- * v9fs_stat2inode_dotl - populate an inode structure with stat info
- * @stat: stat structure
- * @inode: inode to populate
- * @sb: superblock of filesystem
- *
- */
-
-void
-v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode)
-{
-
-	if ((stat->st_result_mask & P9_STATS_BASIC) == P9_STATS_BASIC) {
-		inode->i_atime.tv_sec = stat->st_atime_sec;
-		inode->i_atime.tv_nsec = stat->st_atime_nsec;
-		inode->i_mtime.tv_sec = stat->st_mtime_sec;
-		inode->i_mtime.tv_nsec = stat->st_mtime_nsec;
-		inode->i_ctime.tv_sec = stat->st_ctime_sec;
-		inode->i_ctime.tv_nsec = stat->st_ctime_nsec;
-		inode->i_uid = stat->st_uid;
-		inode->i_gid = stat->st_gid;
-		inode->i_nlink = stat->st_nlink;
-		inode->i_mode = stat->st_mode;
-		inode->i_rdev = new_decode_dev(stat->st_rdev);
-
-		if ((S_ISBLK(inode->i_mode)) || (S_ISCHR(inode->i_mode)))
-			init_special_inode(inode, inode->i_mode, inode->i_rdev);
-
-		i_size_write(inode, stat->st_size);
-		inode->i_blocks = stat->st_blocks;
-	} else {
-		if (stat->st_result_mask & P9_STATS_ATIME) {
-			inode->i_atime.tv_sec = stat->st_atime_sec;
-			inode->i_atime.tv_nsec = stat->st_atime_nsec;
-		}
-		if (stat->st_result_mask & P9_STATS_MTIME) {
-			inode->i_mtime.tv_sec = stat->st_mtime_sec;
-			inode->i_mtime.tv_nsec = stat->st_mtime_nsec;
-		}
-		if (stat->st_result_mask & P9_STATS_CTIME) {
-			inode->i_ctime.tv_sec = stat->st_ctime_sec;
-			inode->i_ctime.tv_nsec = stat->st_ctime_nsec;
-		}
-		if (stat->st_result_mask & P9_STATS_UID)
-			inode->i_uid = stat->st_uid;
-		if (stat->st_result_mask & P9_STATS_GID)
-			inode->i_gid = stat->st_gid;
-		if (stat->st_result_mask & P9_STATS_NLINK)
-			inode->i_nlink = stat->st_nlink;
-		if (stat->st_result_mask & P9_STATS_MODE) {
-			inode->i_mode = stat->st_mode;
-			if ((S_ISBLK(inode->i_mode)) ||
-						(S_ISCHR(inode->i_mode)))
-				init_special_inode(inode, inode->i_mode,
-								inode->i_rdev);
-		}
-		if (stat->st_result_mask & P9_STATS_RDEV)
-			inode->i_rdev = new_decode_dev(stat->st_rdev);
-		if (stat->st_result_mask & P9_STATS_SIZE)
-			i_size_write(inode, stat->st_size);
-		if (stat->st_result_mask & P9_STATS_BLOCKS)
-			inode->i_blocks = stat->st_blocks;
-	}
-	if (stat->st_result_mask & P9_STATS_GEN)
-		inode->i_generation = stat->st_gen;
-
-	/* Currently we don't support P9_STATS_BTIME and P9_STATS_DATA_VERSION
-	 * because the inode structure does not have fields for them.
-	 */
-}
-
-/**
  * v9fs_qid2ino - convert qid into inode number
  * @qid: qid to hash
  *
@@ -1602,7 +1091,7 @@ static void *v9fs_vfs_follow_link(struct dentry *dentry, struct nameidata *nd)
  *
  */
 
-static void
+void
 v9fs_vfs_put_link(struct dentry *dentry, struct nameidata *nd, void *p)
 {
 	char *s = nd_get_link(nd);
@@ -1646,94 +1135,6 @@ static int v9fs_vfs_mkspecial(struct inode *dir, struct dentry *dentry,
 }
 
 /**
- * v9fs_vfs_symlink_dotl - helper function to create symlinks
- * @dir: directory inode containing symlink
- * @dentry: dentry for symlink
- * @symname: symlink data
- *
- * See Also: 9P2000.L RFC for more information
- *
- */
-
-static int
-v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry,
-		const char *symname)
-{
-	struct v9fs_session_info *v9ses;
-	struct p9_fid *dfid;
-	struct p9_fid *fid = NULL;
-	struct inode *inode;
-	struct p9_qid qid;
-	char *name;
-	int err;
-	gid_t gid;
-
-	name = (char *) dentry->d_name.name;
-	P9_DPRINTK(P9_DEBUG_VFS, "v9fs_vfs_symlink_dotl : %lu,%s,%s\n",
-			dir->i_ino, name, symname);
-	v9ses = v9fs_inode2v9ses(dir);
-
-	dfid = v9fs_fid_lookup(dentry->d_parent);
-	if (IS_ERR(dfid)) {
-		err = PTR_ERR(dfid);
-		P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
-		return err;
-	}
-
-	gid = v9fs_get_fsgid_for_create(dir);
-
-	/* Server doesn't alter fid on TSYMLINK. Hence no need to clone it. */
-	err = p9_client_symlink(dfid, name, (char *)symname, gid, &qid);
-
-	if (err < 0) {
-		P9_DPRINTK(P9_DEBUG_VFS, "p9_client_symlink failed %d\n", err);
-		goto error;
-	}
-
-	if (v9ses->cache) {
-		/* Now walk from the parent so we can get an unopened fid. */
-		fid = p9_client_walk(dfid, 1, &name, 1);
-		if (IS_ERR(fid)) {
-			err = PTR_ERR(fid);
-			P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n",
-				err);
-			fid = NULL;
-			goto error;
-		}
-
-		/* instantiate inode and assign the unopened fid to dentry */
-		inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb);
-		if (IS_ERR(inode)) {
-			err = PTR_ERR(inode);
-			P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n",
-				err);
-			goto error;
-		}
-		d_set_d_op(dentry, &v9fs_cached_dentry_operations);
-		d_instantiate(dentry, inode);
-		err = v9fs_fid_add(dentry, fid);
-		if (err < 0)
-			goto error;
-		fid = NULL;
-	} else {
-		/* Not in cached mode. No need to populate inode with stat */
-		inode = v9fs_get_inode(dir->i_sb, S_IFLNK);
-		if (IS_ERR(inode)) {
-			err = PTR_ERR(inode);
-			goto error;
-		}
-		d_set_d_op(dentry, &v9fs_dentry_operations);
-		d_instantiate(dentry, inode);
-	}
-
-error:
-	if (fid)
-		p9_client_clunk(fid);
-
-	return err;
-}
-
-/**
  * v9fs_vfs_symlink - helper function to create symlinks
  * @dir: directory inode containing symlink
  * @dentry: dentry for symlink
@@ -1792,77 +1193,6 @@ clunk_fid:
 }
 
 /**
- * v9fs_vfs_link_dotl - create a hardlink for dotl
- * @old_dentry: dentry for file to link to
- * @dir: inode destination for new link
- * @dentry: dentry for link
- *
- */
-
-static int
-v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir,
-		struct dentry *dentry)
-{
-	int err;
-	struct p9_fid *dfid, *oldfid;
-	char *name;
-	struct v9fs_session_info *v9ses;
-	struct dentry *dir_dentry;
-
-	P9_DPRINTK(P9_DEBUG_VFS, "dir ino: %lu, old_name: %s, new_name: %s\n",
-			dir->i_ino, old_dentry->d_name.name,
-			dentry->d_name.name);
-
-	v9ses = v9fs_inode2v9ses(dir);
-	dir_dentry = v9fs_dentry_from_dir_inode(dir);
-	dfid = v9fs_fid_lookup(dir_dentry);
-	if (IS_ERR(dfid))
-		return PTR_ERR(dfid);
-
-	oldfid = v9fs_fid_lookup(old_dentry);
-	if (IS_ERR(oldfid))
-		return PTR_ERR(oldfid);
-
-	name = (char *) dentry->d_name.name;
-
-	err = p9_client_link(dfid, oldfid, (char *)dentry->d_name.name);
-
-	if (err < 0) {
-		P9_DPRINTK(P9_DEBUG_VFS, "p9_client_link failed %d\n", err);
-		return err;
-	}
-
-	if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
-		/* Get the latest stat info from server. */
-		struct p9_fid *fid;
-		struct p9_stat_dotl *st;
-
-		fid = v9fs_fid_lookup(old_dentry);
-		if (IS_ERR(fid))
-			return PTR_ERR(fid);
-
-		st = p9_client_getattr_dotl(fid, P9_STATS_BASIC);
-		if (IS_ERR(st))
-			return PTR_ERR(st);
-
-		v9fs_stat2inode_dotl(st, old_dentry->d_inode);
-
-		kfree(st);
-	} else {
-		/* Caching disabled. No need to get upto date stat info.
-		 * This dentry will be released immediately. So, just hold the
-		 * inode
-		 */
-		ihold(old_dentry->d_inode);
-	}
-
-	d_set_d_op(dentry, old_dentry->d_op);
-	d_instantiate(dentry, old_dentry->d_inode);
-
-	return err;
-}
-
-/**
  * v9fs_vfs_mknod - create a special file
  * @dir: inode destination for new link
  * @dentry: dentry for file
@@ -1907,160 +1237,6 @@ v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev)
 	return retval;
 }
 
-/**
- * v9fs_vfs_mknod_dotl - create a special file
- * @dir: inode destination for new link
- * @dentry: dentry for file
- * @mode: mode for creation
- * @rdev: device associated with special file
- *
- */
-static int
-v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode,
-		dev_t rdev)
-{
-	int err;
-	char *name;
-	mode_t mode;
-	struct v9fs_session_info *v9ses;
-	struct p9_fid *fid = NULL, *dfid = NULL;
-	struct inode *inode;
-	gid_t gid;
-	struct p9_qid qid;
-	struct dentry *dir_dentry;
-	struct posix_acl *dacl = NULL, *pacl = NULL;
-
-	P9_DPRINTK(P9_DEBUG_VFS,
-		" %lu,%s mode: %x MAJOR: %u MINOR: %u\n", dir->i_ino,
-		dentry->d_name.name, omode, MAJOR(rdev), MINOR(rdev));
-
-	if (!new_valid_dev(rdev))
-		return -EINVAL;
-
-	v9ses = v9fs_inode2v9ses(dir);
-	dir_dentry = v9fs_dentry_from_dir_inode(dir);
-	dfid = v9fs_fid_lookup(dir_dentry);
-	if (IS_ERR(dfid)) {
-		err = PTR_ERR(dfid);
-		P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
-		dfid = NULL;
-		goto error;
-	}
-
-	gid = v9fs_get_fsgid_for_create(dir);
-	mode = omode;
-	/* Update mode based on ACL value */
-	err = v9fs_acl_mode(dir, &mode, &dacl, &pacl);
-	if (err) {
-		P9_DPRINTK(P9_DEBUG_VFS,
-			   "Failed to get acl values in mknod %d\n", err);
-		goto error;
-	}
-	name = (char *) dentry->d_name.name;
-
-	err = p9_client_mknod_dotl(dfid, name, mode, rdev, gid, &qid);
-	if (err < 0)
-		goto error;
-
-	/* instantiate inode and assign the unopened fid to the dentry */
-	if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
-		fid = p9_client_walk(dfid, 1, &name, 1);
-		if (IS_ERR(fid)) {
-			err = PTR_ERR(fid);
-			P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n",
-				err);
-			fid = NULL;
-			goto error;
-		}
-
-		inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb);
-		if (IS_ERR(inode)) {
-			err = PTR_ERR(inode);
-			P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n",
-				err);
-			goto error;
-		}
-		d_set_d_op(dentry, &v9fs_cached_dentry_operations);
-		d_instantiate(dentry, inode);
-		err = v9fs_fid_add(dentry, fid);
-		if (err < 0)
-			goto error;
-		fid = NULL;
-	} else {
-		/*
-		 * Not in cached mode. No need to populate inode with stat.
-		 * socket syscall returns a fd, so we need instantiate
-		 */
-		inode = v9fs_get_inode(dir->i_sb, mode);
-		if (IS_ERR(inode)) {
-			err = PTR_ERR(inode);
-			goto error;
-		}
-		d_set_d_op(dentry, &v9fs_dentry_operations);
-		d_instantiate(dentry, inode);
-	}
-	/* Now set the ACL based on the default value */
-	v9fs_set_create_acl(dentry, dacl, pacl);
-error:
-	if (fid)
-		p9_client_clunk(fid);
-	return err;
-}
-
-static int
-v9fs_vfs_readlink_dotl(struct dentry *dentry, char *buffer, int buflen)
-{
-	int retval;
-	struct p9_fid *fid;
-	char *target = NULL;
-
-	P9_DPRINTK(P9_DEBUG_VFS, " %s\n", dentry->d_name.name);
-	retval = -EPERM;
-	fid = v9fs_fid_lookup(dentry);
-	if (IS_ERR(fid))
-		return PTR_ERR(fid);
-
-	retval = p9_client_readlink(fid, &target);
-	if (retval < 0)
-		return retval;
-
-	strncpy(buffer, target, buflen);
-	P9_DPRINTK(P9_DEBUG_VFS, "%s -> %s\n", dentry->d_name.name, buffer);
-
-	retval = strnlen(buffer, buflen);
-	return retval;
-}
-
-/**
- * v9fs_vfs_follow_link_dotl - follow a symlink path
- * @dentry: dentry for symlink
- * @nd: nameidata
- *
- */
-
-static void *
-v9fs_vfs_follow_link_dotl(struct dentry *dentry, struct nameidata *nd)
-{
-	int len = 0;
-	char *link = __getname();
-
-	P9_DPRINTK(P9_DEBUG_VFS, "%s n", dentry->d_name.name);
-
-	if (!link)
-		link = ERR_PTR(-ENOMEM);
-	else {
-		len = v9fs_vfs_readlink_dotl(dentry, link, PATH_MAX);
-		if (len < 0) {
-			__putname(link);
-			link = ERR_PTR(len);
-		} else
-			link[min(len, PATH_MAX-1)] = 0;
-	}
-	nd_set_link(nd, link);
-
-	return NULL;
-}
-
 static const struct inode_operations v9fs_dir_inode_operations_dotu = {
 	.create = v9fs_vfs_create,
 	.lookup = v9fs_vfs_lookup,
@@ -2075,25 +1251,6 @@ static const struct inode_operations v9fs_dir_inode_operations_dotu = {
 	.setattr = v9fs_vfs_setattr,
 };
 
-static const struct inode_operations v9fs_dir_inode_operations_dotl = {
-	.create = v9fs_vfs_create_dotl,
-	.lookup = v9fs_vfs_lookup,
-	.link = v9fs_vfs_link_dotl,
-	.symlink = v9fs_vfs_symlink_dotl,
-	.unlink = v9fs_vfs_unlink,
-	.mkdir = v9fs_vfs_mkdir_dotl,
-	.rmdir = v9fs_vfs_rmdir,
-	.mknod = v9fs_vfs_mknod_dotl,
-	.rename = v9fs_vfs_rename,
-	.getattr = v9fs_vfs_getattr_dotl,
-	.setattr = v9fs_vfs_setattr_dotl,
-	.setxattr = generic_setxattr,
-	.getxattr = generic_getxattr,
-	.removexattr = generic_removexattr,
-	.listxattr = v9fs_listxattr,
-	.check_acl = v9fs_check_acl,
-};
-
 static const struct inode_operations v9fs_dir_inode_operations = {
 	.create = v9fs_vfs_create,
 	.lookup = v9fs_vfs_lookup,
@@ -2111,16 +1268,6 @@ static const struct inode_operations v9fs_file_inode_operations = {
 	.setattr = v9fs_vfs_setattr,
 };
 
-static const struct inode_operations v9fs_file_inode_operations_dotl = {
-	.getattr = v9fs_vfs_getattr_dotl,
-	.setattr = v9fs_vfs_setattr_dotl,
-	.setxattr = generic_setxattr,
-	.getxattr = generic_getxattr,
-	.removexattr = generic_removexattr,
-	.listxattr = v9fs_listxattr,
-	.check_acl = v9fs_check_acl,
-};
-
 static const struct inode_operations v9fs_symlink_inode_operations = {
 	.readlink = generic_readlink,
 	.follow_link = v9fs_vfs_follow_link,
@@ -2129,14 +1276,3 @@ static const struct inode_operations v9fs_symlink_inode_operations = {
 	.setattr = v9fs_vfs_setattr,
 };
 
-static const struct inode_operations v9fs_symlink_inode_operations_dotl = {
-	.readlink = v9fs_vfs_readlink_dotl,
-	.follow_link = v9fs_vfs_follow_link_dotl,
-	.put_link = v9fs_vfs_put_link,
-	.getattr = v9fs_vfs_getattr_dotl,
-	.setattr = v9fs_vfs_setattr_dotl,
-	.setxattr = generic_setxattr,
-	.getxattr = generic_getxattr,
-	.removexattr = generic_removexattr,
-	.listxattr = v9fs_listxattr,
-};
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
new file mode 100644
index 000000000000..fe3ffa9aace4
--- /dev/null
+++ b/fs/9p/vfs_inode_dotl.c
@@ -0,0 +1,824 @@
1/*
2 * linux/fs/9p/vfs_inode_dotl.c
3 *
4 * This file contains vfs inode ops for the 9P2000.L protocol.
5 *
6 * Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com>
7 * Copyright (C) 2002 by Ron Minnich <rminnich@lanl.gov>
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License version 2
11 * as published by the Free Software Foundation.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to:
20 * Free Software Foundation
21 * 51 Franklin Street, Fifth Floor
22 * Boston, MA 02111-1301 USA
23 *
24 */
25
26#include <linux/module.h>
27#include <linux/errno.h>
28#include <linux/fs.h>
29#include <linux/file.h>
30#include <linux/pagemap.h>
31#include <linux/stat.h>
32#include <linux/string.h>
33#include <linux/inet.h>
34#include <linux/namei.h>
35#include <linux/idr.h>
36#include <linux/sched.h>
37#include <linux/slab.h>
38#include <linux/xattr.h>
39#include <linux/posix_acl.h>
40#include <net/9p/9p.h>
41#include <net/9p/client.h>
42
43#include "v9fs.h"
44#include "v9fs_vfs.h"
45#include "fid.h"
46#include "cache.h"
47#include "xattr.h"
48#include "acl.h"
49
50static int
51v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode,
52 dev_t rdev);
53
54/**
55 * v9fs_get_fsgid_for_create - Helper function to get the gid for creating a
56 * new file system object. This checks the S_ISGID to determine the owning
57 * group of the new file system object.
58 */
59
60static gid_t v9fs_get_fsgid_for_create(struct inode *dir_inode)
61{
62 BUG_ON(dir_inode == NULL);
63
64 if (dir_inode->i_mode & S_ISGID) {
65 /* set_gid bit is set.*/
66 return dir_inode->i_gid;
67 }
68 return current_fsgid();
69}
70
71/**
72 * v9fs_dentry_from_dir_inode - helper function to get the dentry from
73 * dir inode.
74 *
75 */
76
77static struct dentry *v9fs_dentry_from_dir_inode(struct inode *inode)
78{
79 struct dentry *dentry;
80
81 spin_lock(&inode->i_lock);
82 /* Directory should have only one entry. */
83 BUG_ON(S_ISDIR(inode->i_mode) && !list_is_singular(&inode->i_dentry));
84 dentry = list_entry(inode->i_dentry.next, struct dentry, d_alias);
85 spin_unlock(&inode->i_lock);
86 return dentry;
87}
88
89struct inode *
90v9fs_inode_dotl(struct v9fs_session_info *v9ses, struct p9_fid *fid,
91 struct super_block *sb)
92{
93 struct inode *ret = NULL;
94 int err;
95 struct p9_stat_dotl *st;
96
97 st = p9_client_getattr_dotl(fid, P9_STATS_BASIC);
98 if (IS_ERR(st))
99 return ERR_CAST(st);
100
101 ret = v9fs_get_inode(sb, st->st_mode);
102 if (IS_ERR(ret)) {
103 err = PTR_ERR(ret);
104 goto error;
105 }
106
107 v9fs_stat2inode_dotl(st, ret);
108 ret->i_ino = v9fs_qid2ino(&st->qid);
109#ifdef CONFIG_9P_FSCACHE
110 v9fs_vcookie_set_qid(ret, &st->qid);
111 v9fs_cache_inode_get_cookie(ret);
112#endif
113 err = v9fs_get_acl(ret, fid);
114 if (err) {
115 iput(ret);
116 goto error;
117 }
118 kfree(st);
119 return ret;
120error:
121 kfree(st);
122 return ERR_PTR(err);
123}
124
125/**
126 * v9fs_vfs_create_dotl - VFS hook to create files for 9P2000.L protocol.
127 * @dir: directory inode that is being created
128 * @dentry: dentry that is being deleted
129 * @mode: create permissions
130 * @nd: path information
131 *
132 */
133
static int
v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode,
		struct nameidata *nd)
{
	int err = 0;
	char *name = NULL;
	gid_t gid;
	int flags;
	mode_t mode;
	struct v9fs_session_info *v9ses;
	struct p9_fid *fid = NULL;
	struct p9_fid *dfid, *ofid;
	struct file *filp;
	struct p9_qid qid;
	struct inode *inode;
	struct posix_acl *pacl = NULL, *dacl = NULL;

	v9ses = v9fs_inode2v9ses(dir);
	if (nd && nd->flags & LOOKUP_OPEN)
		flags = nd->intent.open.flags - 1;
	else {
		/*
		 * A create call without LOOKUP_OPEN comes from mknod on a
		 * regular file, so use the mknod operation instead.
		 */
		return v9fs_vfs_mknod_dotl(dir, dentry, omode, 0);
	}

	name = (char *) dentry->d_name.name;
	P9_DPRINTK(P9_DEBUG_VFS, "v9fs_vfs_create_dotl: name:%s flags:0x%x "
			"mode:0x%x\n", name, flags, omode);

	dfid = v9fs_fid_lookup(dentry->d_parent);
	if (IS_ERR(dfid)) {
		err = PTR_ERR(dfid);
		P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
		return err;
	}

	/* clone a fid to use for creation */
	ofid = p9_client_walk(dfid, 0, NULL, 1);
	if (IS_ERR(ofid)) {
		err = PTR_ERR(ofid);
		P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err);
		return err;
	}

	gid = v9fs_get_fsgid_for_create(dir);

	mode = omode;
	/* Update mode based on ACL value */
	err = v9fs_acl_mode(dir, &mode, &dacl, &pacl);
	if (err) {
		P9_DPRINTK(P9_DEBUG_VFS,
			"Failed to get acl values in creat %d\n", err);
		goto error;
	}
	err = p9_client_create_dotl(ofid, name, flags, mode, gid, &qid);
	if (err < 0) {
		P9_DPRINTK(P9_DEBUG_VFS,
			"p9_client_create_dotl failed in creat %d\n",
			err);
		goto error;
	}

	/* instantiate inode and assign the unopened fid to the dentry */
	fid = p9_client_walk(dfid, 1, &name, 1);
	if (IS_ERR(fid)) {
		err = PTR_ERR(fid);
		P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err);
		fid = NULL;
		goto error;
	}
	inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb);
	if (IS_ERR(inode)) {
		err = PTR_ERR(inode);
		P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", err);
		goto error;
	}
	d_instantiate(dentry, inode);
	err = v9fs_fid_add(dentry, fid);
	if (err < 0)
		goto error;

	/* Now set the ACL based on the default value */
	v9fs_set_create_acl(dentry, dacl, pacl);

	/* Since we are opening a file, assign the open fid to the file */
	filp = lookup_instantiate_filp(nd, dentry, generic_file_open);
	if (IS_ERR(filp)) {
		p9_client_clunk(ofid);
		return PTR_ERR(filp);
	}
	filp->private_data = ofid;
	return 0;

error:
	if (ofid)
		p9_client_clunk(ofid);
	if (fid)
		p9_client_clunk(fid);
	return err;
}

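/*
 * Editor's aside: from user space this handler is reached by open(2) with
 * O_CREAT on a mounted v9fs tree. A minimal sketch (the path is
 * illustrative):
 */
#if 0	/* user-space example, not part of this file */
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/mnt/9p/newfile", O_CREAT | O_EXCL | O_RDWR, 0644);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	close(fd);
	return 0;
}
#endif
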
/**
 * v9fs_vfs_mkdir_dotl - VFS mkdir hook to create a directory
 * @dir: inode of the parent directory
 * @dentry: dentry of the directory to be created
 * @omode: mode for the new directory
 *
 */

static int v9fs_vfs_mkdir_dotl(struct inode *dir,
			       struct dentry *dentry, int omode)
{
	int err;
	struct v9fs_session_info *v9ses;
	struct p9_fid *fid = NULL, *dfid = NULL;
	gid_t gid;
	char *name;
	mode_t mode;
	struct inode *inode;
	struct p9_qid qid;
	struct dentry *dir_dentry;
	struct posix_acl *dacl = NULL, *pacl = NULL;

	P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", dentry->d_name.name);
	err = 0;
	v9ses = v9fs_inode2v9ses(dir);

	omode |= S_IFDIR;
	if (dir->i_mode & S_ISGID)
		omode |= S_ISGID;

	dir_dentry = v9fs_dentry_from_dir_inode(dir);
	dfid = v9fs_fid_lookup(dir_dentry);
	if (IS_ERR(dfid)) {
		err = PTR_ERR(dfid);
		P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
		dfid = NULL;
		goto error;
	}

	gid = v9fs_get_fsgid_for_create(dir);
	mode = omode;
	/* Update mode based on ACL value */
	err = v9fs_acl_mode(dir, &mode, &dacl, &pacl);
	if (err) {
		P9_DPRINTK(P9_DEBUG_VFS,
			"Failed to get acl values in mkdir %d\n", err);
		goto error;
	}
	name = (char *) dentry->d_name.name;
	err = p9_client_mkdir_dotl(dfid, name, mode, gid, &qid);
	if (err < 0)
		goto error;

	/* instantiate inode and assign the unopened fid to the dentry */
	if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
		fid = p9_client_walk(dfid, 1, &name, 1);
		if (IS_ERR(fid)) {
			err = PTR_ERR(fid);
			P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n",
				err);
			fid = NULL;
			goto error;
		}

		inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb);
		if (IS_ERR(inode)) {
			err = PTR_ERR(inode);
			P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n",
				err);
			goto error;
		}
		d_instantiate(dentry, inode);
		err = v9fs_fid_add(dentry, fid);
		if (err < 0)
			goto error;
		fid = NULL;
	} else {
		/*
		 * Not in cached mode. There is no need to populate the
		 * inode with stat, but we still need an inode so that the
		 * default ACL can be set via the dentry.
		 */
		inode = v9fs_get_inode(dir->i_sb, mode);
		if (IS_ERR(inode)) {
			err = PTR_ERR(inode);
			goto error;
		}
		d_instantiate(dentry, inode);
	}
	/* Now set the ACL based on the default value */
	v9fs_set_create_acl(dentry, dacl, pacl);

error:
	if (fid)
		p9_client_clunk(fid);
	return err;
}

static int
v9fs_vfs_getattr_dotl(struct vfsmount *mnt, struct dentry *dentry,
		 struct kstat *stat)
{
	int err;
	struct v9fs_session_info *v9ses;
	struct p9_fid *fid;
	struct p9_stat_dotl *st;

	P9_DPRINTK(P9_DEBUG_VFS, "dentry: %p\n", dentry);
	err = -EPERM;
	v9ses = v9fs_inode2v9ses(dentry->d_inode);
	if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE)
		return simple_getattr(mnt, dentry, stat);

	fid = v9fs_fid_lookup(dentry);
	if (IS_ERR(fid))
		return PTR_ERR(fid);

	/*
	 * Ask for all the fields in the stat structure. The server will
	 * return whatever fields it supports.
	 */
	st = p9_client_getattr_dotl(fid, P9_STATS_ALL);
	if (IS_ERR(st))
		return PTR_ERR(st);

	v9fs_stat2inode_dotl(st, dentry->d_inode);
	generic_fillattr(dentry->d_inode, stat);
	/* Change block size to what the server returned */
	stat->blksize = st->st_blksize;

	kfree(st);
	return 0;
}

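/*
 * Editor's aside: on a non-cached mount, stat(2) funnels into
 * v9fs_vfs_getattr_dotl(), so st_blksize is whatever the server reported.
 * A minimal sketch (the path is illustrative):
 */
#if 0	/* user-space example, not part of this file */
#include <stdio.h>
#include <sys/stat.h>

int main(void)
{
	struct stat st;

	if (stat("/mnt/9p/file", &st)) {
		perror("stat");
		return 1;
	}
	printf("server block size: %ld\n", (long) st.st_blksize);
	return 0;
}
#endif
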
/**
 * v9fs_vfs_setattr_dotl - set file metadata
 * @dentry: file whose metadata to set
 * @iattr: metadata assignment structure
 *
 */

int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr)
{
	int retval;
	struct v9fs_session_info *v9ses;
	struct p9_fid *fid;
	struct p9_iattr_dotl p9attr;

	P9_DPRINTK(P9_DEBUG_VFS, "\n");

	retval = inode_change_ok(dentry->d_inode, iattr);
	if (retval)
		return retval;

	p9attr.valid = iattr->ia_valid;
	p9attr.mode = iattr->ia_mode;
	p9attr.uid = iattr->ia_uid;
	p9attr.gid = iattr->ia_gid;
	p9attr.size = iattr->ia_size;
	p9attr.atime_sec = iattr->ia_atime.tv_sec;
	p9attr.atime_nsec = iattr->ia_atime.tv_nsec;
	p9attr.mtime_sec = iattr->ia_mtime.tv_sec;
	p9attr.mtime_nsec = iattr->ia_mtime.tv_nsec;

	retval = -EPERM;
	v9ses = v9fs_inode2v9ses(dentry->d_inode);
	fid = v9fs_fid_lookup(dentry);
	if (IS_ERR(fid))
		return PTR_ERR(fid);

	retval = p9_client_setattr(fid, &p9attr);
	if (retval < 0)
		return retval;

	if ((iattr->ia_valid & ATTR_SIZE) &&
	    iattr->ia_size != i_size_read(dentry->d_inode)) {
		retval = vmtruncate(dentry->d_inode, iattr->ia_size);
		if (retval)
			return retval;
	}

	setattr_copy(dentry->d_inode, iattr);
	mark_inode_dirty(dentry->d_inode);
	if (iattr->ia_valid & ATTR_MODE) {
		/* We also want to update ACL when we update mode bits */
		retval = v9fs_acl_chmod(dentry);
		if (retval < 0)
			return retval;
	}
	return 0;
}

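/*
 * Editor's aside: each metadata syscall sets a different subset of iattr
 * fields, and hence a different p9attr.valid mask above. A minimal sketch
 * (the path is illustrative):
 */
#if 0	/* user-space example, not part of this file */
#include <stdio.h>
#include <sys/stat.h>
#include <unistd.h>

int main(void)
{
	if (chmod("/mnt/9p/file", 0600))	/* ATTR_MODE */
		perror("chmod");
	if (truncate("/mnt/9p/file", 4096))	/* ATTR_SIZE */
		perror("truncate");
	return 0;
}
#endif
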
/**
 * v9fs_stat2inode_dotl - populate an inode structure with stat info
 * @stat: stat structure
 * @inode: inode to populate
 *
 */

void
v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode)
{
	if ((stat->st_result_mask & P9_STATS_BASIC) == P9_STATS_BASIC) {
		inode->i_atime.tv_sec = stat->st_atime_sec;
		inode->i_atime.tv_nsec = stat->st_atime_nsec;
		inode->i_mtime.tv_sec = stat->st_mtime_sec;
		inode->i_mtime.tv_nsec = stat->st_mtime_nsec;
		inode->i_ctime.tv_sec = stat->st_ctime_sec;
		inode->i_ctime.tv_nsec = stat->st_ctime_nsec;
		inode->i_uid = stat->st_uid;
		inode->i_gid = stat->st_gid;
		inode->i_nlink = stat->st_nlink;
		inode->i_mode = stat->st_mode;
		inode->i_rdev = new_decode_dev(stat->st_rdev);

		if ((S_ISBLK(inode->i_mode)) || (S_ISCHR(inode->i_mode)))
			init_special_inode(inode, inode->i_mode, inode->i_rdev);

		i_size_write(inode, stat->st_size);
		inode->i_blocks = stat->st_blocks;
	} else {
		if (stat->st_result_mask & P9_STATS_ATIME) {
			inode->i_atime.tv_sec = stat->st_atime_sec;
			inode->i_atime.tv_nsec = stat->st_atime_nsec;
		}
		if (stat->st_result_mask & P9_STATS_MTIME) {
			inode->i_mtime.tv_sec = stat->st_mtime_sec;
			inode->i_mtime.tv_nsec = stat->st_mtime_nsec;
		}
		if (stat->st_result_mask & P9_STATS_CTIME) {
			inode->i_ctime.tv_sec = stat->st_ctime_sec;
			inode->i_ctime.tv_nsec = stat->st_ctime_nsec;
		}
		if (stat->st_result_mask & P9_STATS_UID)
			inode->i_uid = stat->st_uid;
		if (stat->st_result_mask & P9_STATS_GID)
			inode->i_gid = stat->st_gid;
		if (stat->st_result_mask & P9_STATS_NLINK)
			inode->i_nlink = stat->st_nlink;
		if (stat->st_result_mask & P9_STATS_MODE) {
			inode->i_mode = stat->st_mode;
			if ((S_ISBLK(inode->i_mode)) ||
						(S_ISCHR(inode->i_mode)))
				init_special_inode(inode, inode->i_mode,
							inode->i_rdev);
		}
		if (stat->st_result_mask & P9_STATS_RDEV)
			inode->i_rdev = new_decode_dev(stat->st_rdev);
		if (stat->st_result_mask & P9_STATS_SIZE)
			i_size_write(inode, stat->st_size);
		if (stat->st_result_mask & P9_STATS_BLOCKS)
			inode->i_blocks = stat->st_blocks;
	}
	if (stat->st_result_mask & P9_STATS_GEN)
		inode->i_generation = stat->st_gen;

	/*
	 * Currently we don't support P9_STATS_BTIME and P9_STATS_DATA_VERSION
	 * because the inode structure does not have fields for them.
	 */
}

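/*
 * Editor's aside: the function above is an instance of the "validity mask"
 * pattern: a fast path when every basic field is present, and per-field
 * checks otherwise. A self-contained toy version, for illustration only:
 */
#if 0	/* user-space example, not part of this file */
#include <stdio.h>

#define ST_MODE		(1 << 0)
#define ST_SIZE		(1 << 1)
#define ST_BASIC	(ST_MODE | ST_SIZE)

struct reply { unsigned mask; unsigned mode; long size; };
struct node { unsigned mode; long size; };

static void apply(const struct reply *r, struct node *n)
{
	if ((r->mask & ST_BASIC) == ST_BASIC) {
		/* fast path: every basic field is valid */
		n->mode = r->mode;
		n->size = r->size;
		return;
	}
	if (r->mask & ST_MODE)	/* otherwise, field by field */
		n->mode = r->mode;
	if (r->mask & ST_SIZE)
		n->size = r->size;
}

int main(void)
{
	struct reply r = { ST_SIZE, 0, 42 };
	struct node n = { 0644, 0 };

	apply(&r, &n);
	printf("mode %o size %ld\n", n.mode, n.size);
	return 0;
}
#endif
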
static int
v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry,
		const char *symname)
{
	struct v9fs_session_info *v9ses;
	struct p9_fid *dfid;
	struct p9_fid *fid = NULL;
	struct inode *inode;
	struct p9_qid qid;
	char *name;
	int err;
	gid_t gid;

	name = (char *) dentry->d_name.name;
	P9_DPRINTK(P9_DEBUG_VFS, "v9fs_vfs_symlink_dotl : %lu,%s,%s\n",
			dir->i_ino, name, symname);
	v9ses = v9fs_inode2v9ses(dir);

	dfid = v9fs_fid_lookup(dentry->d_parent);
	if (IS_ERR(dfid)) {
		err = PTR_ERR(dfid);
		P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
		return err;
	}

	gid = v9fs_get_fsgid_for_create(dir);

	/* Server doesn't alter fid on TSYMLINK. Hence no need to clone it. */
	err = p9_client_symlink(dfid, name, (char *)symname, gid, &qid);

	if (err < 0) {
		P9_DPRINTK(P9_DEBUG_VFS, "p9_client_symlink failed %d\n", err);
		goto error;
	}

	if (v9ses->cache) {
		/* Now walk from the parent so we can get an unopened fid. */
		fid = p9_client_walk(dfid, 1, &name, 1);
		if (IS_ERR(fid)) {
			err = PTR_ERR(fid);
			P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n",
					err);
			fid = NULL;
			goto error;
		}

		/* instantiate inode and assign the unopened fid to dentry */
		inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb);
		if (IS_ERR(inode)) {
			err = PTR_ERR(inode);
			P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n",
					err);
			goto error;
		}
		d_instantiate(dentry, inode);
		err = v9fs_fid_add(dentry, fid);
		if (err < 0)
			goto error;
		fid = NULL;
	} else {
		/* Not in cached mode. No need to populate inode with stat */
		inode = v9fs_get_inode(dir->i_sb, S_IFLNK);
		if (IS_ERR(inode)) {
			err = PTR_ERR(inode);
			goto error;
		}
		d_instantiate(dentry, inode);
	}

error:
	if (fid)
		p9_client_clunk(fid);

	return err;
}

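/*
 * Editor's aside: symlink(2) reaches the handler above, and readlink(2)
 * ends up in v9fs_vfs_follow_link_dotl() via generic_readlink. A minimal
 * round trip (paths are illustrative):
 */
#if 0	/* user-space example, not part of this file */
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char buf[4096];
	ssize_t n;

	if (symlink("target", "/mnt/9p/alink")) {
		perror("symlink");
		return 1;
	}
	n = readlink("/mnt/9p/alink", buf, sizeof(buf) - 1);
	if (n < 0) {
		perror("readlink");
		return 1;
	}
	buf[n] = '\0';
	printf("-> %s\n", buf);
	return 0;
}
#endif
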
/**
 * v9fs_vfs_link_dotl - create a hardlink for dotl
 * @old_dentry: dentry for file to link to
 * @dir: inode destination for new link
 * @dentry: dentry for link
 *
 */

static int
v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir,
		struct dentry *dentry)
{
	int err;
	struct p9_fid *dfid, *oldfid;
	char *name;
	struct v9fs_session_info *v9ses;
	struct dentry *dir_dentry;

	P9_DPRINTK(P9_DEBUG_VFS, "dir ino: %lu, old_name: %s, new_name: %s\n",
			dir->i_ino, old_dentry->d_name.name,
			dentry->d_name.name);

	v9ses = v9fs_inode2v9ses(dir);
	dir_dentry = v9fs_dentry_from_dir_inode(dir);
	dfid = v9fs_fid_lookup(dir_dentry);
	if (IS_ERR(dfid))
		return PTR_ERR(dfid);

	oldfid = v9fs_fid_lookup(old_dentry);
	if (IS_ERR(oldfid))
		return PTR_ERR(oldfid);

	name = (char *) dentry->d_name.name;

	err = p9_client_link(dfid, oldfid, name);

	if (err < 0) {
		P9_DPRINTK(P9_DEBUG_VFS, "p9_client_link failed %d\n", err);
		return err;
	}

	if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
		/* Get the latest stat info from server. */
		struct p9_fid *fid;
		struct p9_stat_dotl *st;

		fid = v9fs_fid_lookup(old_dentry);
		if (IS_ERR(fid))
			return PTR_ERR(fid);

		st = p9_client_getattr_dotl(fid, P9_STATS_BASIC);
		if (IS_ERR(st))
			return PTR_ERR(st);

		v9fs_stat2inode_dotl(st, old_dentry->d_inode);

		kfree(st);
	} else {
		/*
		 * Caching disabled. There is no need to get up-to-date stat
		 * info; this dentry will be released immediately, so just
		 * hold the inode.
		 */
		ihold(old_dentry->d_inode);
	}
	d_instantiate(dentry, old_dentry->d_inode);

	return err;
}

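/*
 * Editor's aside: link(2) reaches the handler above; on a caching mount
 * the extra getattr keeps st_nlink in sync with the server. A minimal
 * sketch (paths are illustrative):
 */
#if 0	/* user-space example, not part of this file */
#include <stdio.h>
#include <unistd.h>
#include <sys/stat.h>

int main(void)
{
	struct stat st;

	if (link("/mnt/9p/file", "/mnt/9p/hardlink")) {
		perror("link");
		return 1;
	}
	if (!stat("/mnt/9p/file", &st))
		printf("nlink now %ld\n", (long) st.st_nlink);
	return 0;
}
#endif
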
/**
 * v9fs_vfs_mknod_dotl - create a special file
 * @dir: inode of the parent directory
 * @dentry: dentry for file
 * @omode: mode for creation
 * @rdev: device associated with special file
 *
 */
static int
v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode,
		dev_t rdev)
{
	int err;
	char *name;
	mode_t mode;
	struct v9fs_session_info *v9ses;
	struct p9_fid *fid = NULL, *dfid = NULL;
	struct inode *inode;
	gid_t gid;
	struct p9_qid qid;
	struct dentry *dir_dentry;
	struct posix_acl *dacl = NULL, *pacl = NULL;

	P9_DPRINTK(P9_DEBUG_VFS,
		" %lu,%s mode: %x MAJOR: %u MINOR: %u\n", dir->i_ino,
		dentry->d_name.name, omode, MAJOR(rdev), MINOR(rdev));

	if (!new_valid_dev(rdev))
		return -EINVAL;

	v9ses = v9fs_inode2v9ses(dir);
	dir_dentry = v9fs_dentry_from_dir_inode(dir);
	dfid = v9fs_fid_lookup(dir_dentry);
	if (IS_ERR(dfid)) {
		err = PTR_ERR(dfid);
		P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
		dfid = NULL;
		goto error;
	}

	gid = v9fs_get_fsgid_for_create(dir);
	mode = omode;
	/* Update mode based on ACL value */
	err = v9fs_acl_mode(dir, &mode, &dacl, &pacl);
	if (err) {
		P9_DPRINTK(P9_DEBUG_VFS,
			"Failed to get acl values in mknod %d\n", err);
		goto error;
	}
	name = (char *) dentry->d_name.name;

	err = p9_client_mknod_dotl(dfid, name, mode, rdev, gid, &qid);
	if (err < 0)
		goto error;

	/* instantiate inode and assign the unopened fid to the dentry */
	if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
		fid = p9_client_walk(dfid, 1, &name, 1);
		if (IS_ERR(fid)) {
			err = PTR_ERR(fid);
			P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n",
				err);
			fid = NULL;
			goto error;
		}

		inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb);
		if (IS_ERR(inode)) {
			err = PTR_ERR(inode);
			P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n",
				err);
			goto error;
		}
		d_instantiate(dentry, inode);
		err = v9fs_fid_add(dentry, fid);
		if (err < 0)
			goto error;
		fid = NULL;
	} else {
		/*
		 * Not in cached mode. There is no need to populate the inode
		 * with stat, but the socket syscall expects an instantiated
		 * dentry, so instantiate one here.
		 */
		inode = v9fs_get_inode(dir->i_sb, mode);
		if (IS_ERR(inode)) {
			err = PTR_ERR(inode);
			goto error;
		}
		d_instantiate(dentry, inode);
	}
	/* Now set the ACL based on the default value */
	v9fs_set_create_acl(dentry, dacl, pacl);
error:
	if (fid)
		p9_client_clunk(fid);
	return err;
}

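/*
 * Editor's aside: mknod(2) reaches the handler above. A FIFO needs no
 * privileges, while device nodes (S_IFCHR/S_IFBLK) additionally require
 * CAP_MKNOD. A minimal sketch (the path is illustrative):
 */
#if 0	/* user-space example, not part of this file */
#include <stdio.h>
#include <sys/stat.h>

int main(void)
{
	if (mknod("/mnt/9p/afifo", S_IFIFO | 0644, 0)) {
		perror("mknod");
		return 1;
	}
	return 0;
}
#endif
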
/**
 * v9fs_vfs_follow_link_dotl - follow a symlink path
 * @dentry: dentry for symlink
 * @nd: nameidata
 *
 */

static void *
v9fs_vfs_follow_link_dotl(struct dentry *dentry, struct nameidata *nd)
{
	int retval;
	struct p9_fid *fid;
	char *link = __getname();
	char *target;

	P9_DPRINTK(P9_DEBUG_VFS, "%s\n", dentry->d_name.name);

	if (!link) {
		link = ERR_PTR(-ENOMEM);
		goto ndset;
	}
	fid = v9fs_fid_lookup(dentry);
	if (IS_ERR(fid)) {
		__putname(link);
		link = ERR_CAST(fid);
		goto ndset;
	}
	retval = p9_client_readlink(fid, &target);
	if (!retval) {
		/* bound the copy: __getname() buffers are PATH_MAX bytes */
		strlcpy(link, target, PATH_MAX);
		kfree(target);
		goto ndset;
	}
	__putname(link);
	link = ERR_PTR(retval);
ndset:
	nd_set_link(nd, link);
	return NULL;
}

const struct inode_operations v9fs_dir_inode_operations_dotl = {
	.create = v9fs_vfs_create_dotl,
	.lookup = v9fs_vfs_lookup,
	.link = v9fs_vfs_link_dotl,
	.symlink = v9fs_vfs_symlink_dotl,
	.unlink = v9fs_vfs_unlink,
	.mkdir = v9fs_vfs_mkdir_dotl,
	.rmdir = v9fs_vfs_rmdir,
	.mknod = v9fs_vfs_mknod_dotl,
	.rename = v9fs_vfs_rename,
	.getattr = v9fs_vfs_getattr_dotl,
	.setattr = v9fs_vfs_setattr_dotl,
	.setxattr = generic_setxattr,
	.getxattr = generic_getxattr,
	.removexattr = generic_removexattr,
	.listxattr = v9fs_listxattr,
	.check_acl = v9fs_check_acl,
};

const struct inode_operations v9fs_file_inode_operations_dotl = {
	.getattr = v9fs_vfs_getattr_dotl,
	.setattr = v9fs_vfs_setattr_dotl,
	.setxattr = generic_setxattr,
	.getxattr = generic_getxattr,
	.removexattr = generic_removexattr,
	.listxattr = v9fs_listxattr,
	.check_acl = v9fs_check_acl,
};

const struct inode_operations v9fs_symlink_inode_operations_dotl = {
	.readlink = generic_readlink,
	.follow_link = v9fs_vfs_follow_link_dotl,
	.put_link = v9fs_vfs_put_link,
	.getattr = v9fs_vfs_getattr_dotl,
	.setattr = v9fs_vfs_setattr_dotl,
	.setxattr = generic_setxattr,
	.getxattr = generic_getxattr,
	.removexattr = generic_removexattr,
	.listxattr = v9fs_listxattr,
};
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index c55c614500ad..dbaabe3b8131 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -141,6 +141,11 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags,
141 } 141 }
142 v9fs_fill_super(sb, v9ses, flags, data); 142 v9fs_fill_super(sb, v9ses, flags, data);
143 143
144 if (v9ses->cache)
145 sb->s_d_op = &v9fs_cached_dentry_operations;
146 else
147 sb->s_d_op = &v9fs_dentry_operations;
148
144 inode = v9fs_get_inode(sb, S_IFDIR | mode); 149 inode = v9fs_get_inode(sb, S_IFDIR | mode);
145 if (IS_ERR(inode)) { 150 if (IS_ERR(inode)) {
146 retval = PTR_ERR(inode); 151 retval = PTR_ERR(inode);
@@ -217,9 +222,6 @@ static void v9fs_kill_super(struct super_block *s)
217 222
218 P9_DPRINTK(P9_DEBUG_VFS, " %p\n", s); 223 P9_DPRINTK(P9_DEBUG_VFS, " %p\n", s);
219 224
220 if (s->s_root)
221 v9fs_dentry_release(s->s_root); /* clunk root */
222
223 kill_anon_super(s); 225 kill_anon_super(s);
224 226
225 v9fs_session_cancel(v9ses); 227 v9fs_session_cancel(v9ses);
diff --git a/fs/9p/xattr.c b/fs/9p/xattr.c
index 43ec7df84336..d288773871b3 100644
--- a/fs/9p/xattr.c
+++ b/fs/9p/xattr.c
@@ -133,7 +133,7 @@ int v9fs_xattr_set(struct dentry *dentry, const char *name,
133 "p9_client_xattrcreate failed %d\n", retval); 133 "p9_client_xattrcreate failed %d\n", retval);
134 goto error; 134 goto error;
135 } 135 }
136 msize = fid->clnt->msize;; 136 msize = fid->clnt->msize;
137 while (value_len) { 137 while (value_len) {
138 if (value_len > (msize - P9_IOHDRSZ)) 138 if (value_len > (msize - P9_IOHDRSZ))
139 write_count = msize - P9_IOHDRSZ; 139 write_count = msize - P9_IOHDRSZ;
diff --git a/fs/Kconfig b/fs/Kconfig
index 771f457402d4..3db9caa57edc 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -30,15 +30,6 @@ config FS_MBCACHE
30source "fs/reiserfs/Kconfig" 30source "fs/reiserfs/Kconfig"
31source "fs/jfs/Kconfig" 31source "fs/jfs/Kconfig"
32 32
33config FS_POSIX_ACL
34# Posix ACL utility routines (for now, only ext2/ext3/jfs/reiserfs/nfs4)
35#
36# NOTE: you can implement Posix ACLs without these helpers (XFS does).
37# Never use this symbol for ifdefs.
38#
39 bool
40 default n
41
42source "fs/xfs/Kconfig" 33source "fs/xfs/Kconfig"
43source "fs/gfs2/Kconfig" 34source "fs/gfs2/Kconfig"
44source "fs/ocfs2/Kconfig" 35source "fs/ocfs2/Kconfig"
@@ -47,11 +38,19 @@ source "fs/nilfs2/Kconfig"
47 38
48endif # BLOCK 39endif # BLOCK
49 40
41# Posix ACL utility routines
42#
43# Note: Posix ACLs can be implemented without these helpers. Never use
44# this symbol for ifdefs in core code.
45#
46config FS_POSIX_ACL
47 def_bool n
48
50config EXPORTFS 49config EXPORTFS
51 tristate 50 tristate
52 51
53config FILE_LOCKING 52config FILE_LOCKING
54 bool "Enable POSIX file locking API" if EMBEDDED 53 bool "Enable POSIX file locking API" if EXPERT
55 default y 54 default y
56 help 55 help
57 This option enables standard file locking support, required 56 This option enables standard file locking support, required
diff --git a/fs/adfs/dir.c b/fs/adfs/dir.c
index bf7693c384f9..3b4a764ed780 100644
--- a/fs/adfs/dir.c
+++ b/fs/adfs/dir.c
@@ -276,7 +276,6 @@ adfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
276 struct object_info obj; 276 struct object_info obj;
277 int error; 277 int error;
278 278
279 d_set_d_op(dentry, &adfs_dentry_operations);
280 lock_kernel(); 279 lock_kernel();
281 error = adfs_dir_lookup_byname(dir, &dentry->d_name, &obj); 280 error = adfs_dir_lookup_byname(dir, &dentry->d_name, &obj);
282 if (error == 0) { 281 if (error == 0) {
diff --git a/fs/adfs/super.c b/fs/adfs/super.c
index a4041b52fbca..2d7954049fbe 100644
--- a/fs/adfs/super.c
+++ b/fs/adfs/super.c
@@ -473,6 +473,7 @@ static int adfs_fill_super(struct super_block *sb, void *data, int silent)
473 asb->s_namelen = ADFS_F_NAME_LEN; 473 asb->s_namelen = ADFS_F_NAME_LEN;
474 } 474 }
475 475
476 sb->s_d_op = &adfs_dentry_operations;
476 root = adfs_iget(sb, &root_obj); 477 root = adfs_iget(sb, &root_obj);
477 sb->s_root = d_alloc_root(root); 478 sb->s_root = d_alloc_root(root);
478 if (!sb->s_root) { 479 if (!sb->s_root) {
@@ -483,8 +484,7 @@ static int adfs_fill_super(struct super_block *sb, void *data, int silent)
483 kfree(asb->s_map); 484 kfree(asb->s_map);
484 adfs_error(sb, "get root inode failed\n"); 485 adfs_error(sb, "get root inode failed\n");
485 goto error; 486 goto error;
486 } else 487 }
487 d_set_d_op(sb->s_root, &adfs_dentry_operations);
488 unlock_kernel(); 488 unlock_kernel();
489 return 0; 489 return 0;
490 490
diff --git a/fs/affs/affs.h b/fs/affs/affs.h
index a8cbdeb34025..0e95f73a7023 100644
--- a/fs/affs/affs.h
+++ b/fs/affs/affs.h
@@ -201,6 +201,7 @@ extern const struct address_space_operations affs_aops;
201extern const struct address_space_operations affs_aops_ofs; 201extern const struct address_space_operations affs_aops_ofs;
202 202
203extern const struct dentry_operations affs_dentry_operations; 203extern const struct dentry_operations affs_dentry_operations;
204extern const struct dentry_operations affs_intl_dentry_operations;
204 205
205static inline void 206static inline void
206affs_set_blocksize(struct super_block *sb, int size) 207affs_set_blocksize(struct super_block *sb, int size)
diff --git a/fs/affs/namei.c b/fs/affs/namei.c
index 944a4042fb65..e3e9efc1fdd8 100644
--- a/fs/affs/namei.c
+++ b/fs/affs/namei.c
@@ -32,7 +32,7 @@ const struct dentry_operations affs_dentry_operations = {
32 .d_compare = affs_compare_dentry, 32 .d_compare = affs_compare_dentry,
33}; 33};
34 34
35static const struct dentry_operations affs_intl_dentry_operations = { 35const struct dentry_operations affs_intl_dentry_operations = {
36 .d_hash = affs_intl_hash_dentry, 36 .d_hash = affs_intl_hash_dentry,
37 .d_compare = affs_intl_compare_dentry, 37 .d_compare = affs_intl_compare_dentry,
38}; 38};
@@ -240,7 +240,6 @@ affs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
240 if (IS_ERR(inode)) 240 if (IS_ERR(inode))
241 return ERR_CAST(inode); 241 return ERR_CAST(inode);
242 } 242 }
243 d_set_d_op(dentry, AFFS_SB(sb)->s_flags & SF_INTL ? &affs_intl_dentry_operations : &affs_dentry_operations);
244 d_add(dentry, inode); 243 d_add(dentry, inode);
245 return NULL; 244 return NULL;
246} 245}
diff --git a/fs/affs/super.c b/fs/affs/super.c
index d39081bbe7ce..b31507d0f9b9 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -477,12 +477,16 @@ got_root:
477 goto out_error_noinode; 477 goto out_error_noinode;
478 } 478 }
479 479
480 if (AFFS_SB(sb)->s_flags & SF_INTL)
481 sb->s_d_op = &affs_intl_dentry_operations;
482 else
483 sb->s_d_op = &affs_dentry_operations;
484
480 sb->s_root = d_alloc_root(root_inode); 485 sb->s_root = d_alloc_root(root_inode);
481 if (!sb->s_root) { 486 if (!sb->s_root) {
482 printk(KERN_ERR "AFFS: Get root inode failed\n"); 487 printk(KERN_ERR "AFFS: Get root inode failed\n");
483 goto out_error; 488 goto out_error;
484 } 489 }
485 d_set_d_op(sb->s_root, &affs_dentry_operations);
486 490
487 pr_debug("AFFS: s_flags=%lX\n",sb->s_flags); 491 pr_debug("AFFS: s_flags=%lX\n",sb->s_flags);
488 return 0; 492 return 0;
diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c
index a3bcec75c54a..1c8c6cc6de30 100644
--- a/fs/afs/cmservice.c
+++ b/fs/afs/cmservice.c
@@ -289,7 +289,7 @@ static int afs_deliver_cb_callback(struct afs_call *call, struct sk_buff *skb,
289 call->server = server; 289 call->server = server;
290 290
291 INIT_WORK(&call->work, SRXAFSCB_CallBack); 291 INIT_WORK(&call->work, SRXAFSCB_CallBack);
292 schedule_work(&call->work); 292 queue_work(afs_wq, &call->work);
293 return 0; 293 return 0;
294} 294}
295 295
@@ -336,7 +336,7 @@ static int afs_deliver_cb_init_call_back_state(struct afs_call *call,
336 call->server = server; 336 call->server = server;
337 337
338 INIT_WORK(&call->work, SRXAFSCB_InitCallBackState); 338 INIT_WORK(&call->work, SRXAFSCB_InitCallBackState);
339 schedule_work(&call->work); 339 queue_work(afs_wq, &call->work);
340 return 0; 340 return 0;
341} 341}
342 342
@@ -367,7 +367,7 @@ static int afs_deliver_cb_init_call_back_state3(struct afs_call *call,
367 call->server = server; 367 call->server = server;
368 368
369 INIT_WORK(&call->work, SRXAFSCB_InitCallBackState); 369 INIT_WORK(&call->work, SRXAFSCB_InitCallBackState);
370 schedule_work(&call->work); 370 queue_work(afs_wq, &call->work);
371 return 0; 371 return 0;
372} 372}
373 373
@@ -400,7 +400,7 @@ static int afs_deliver_cb_probe(struct afs_call *call, struct sk_buff *skb,
400 call->state = AFS_CALL_REPLYING; 400 call->state = AFS_CALL_REPLYING;
401 401
402 INIT_WORK(&call->work, SRXAFSCB_Probe); 402 INIT_WORK(&call->work, SRXAFSCB_Probe);
403 schedule_work(&call->work); 403 queue_work(afs_wq, &call->work);
404 return 0; 404 return 0;
405} 405}
406 406
@@ -496,7 +496,7 @@ static int afs_deliver_cb_probe_uuid(struct afs_call *call, struct sk_buff *skb,
496 call->state = AFS_CALL_REPLYING; 496 call->state = AFS_CALL_REPLYING;
497 497
498 INIT_WORK(&call->work, SRXAFSCB_ProbeUuid); 498 INIT_WORK(&call->work, SRXAFSCB_ProbeUuid);
499 schedule_work(&call->work); 499 queue_work(afs_wq, &call->work);
500 return 0; 500 return 0;
501} 501}
502 502
@@ -580,6 +580,6 @@ static int afs_deliver_cb_tell_me_about_yourself(struct afs_call *call,
580 call->state = AFS_CALL_REPLYING; 580 call->state = AFS_CALL_REPLYING;
581 581
582 INIT_WORK(&call->work, SRXAFSCB_TellMeAboutYourself); 582 INIT_WORK(&call->work, SRXAFSCB_TellMeAboutYourself);
583 schedule_work(&call->work); 583 queue_work(afs_wq, &call->work);
584 return 0; 584 return 0;
585} 585}
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 34a3263d60a4..20c106f24927 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -62,10 +62,11 @@ const struct inode_operations afs_dir_inode_operations = {
62 .setattr = afs_setattr, 62 .setattr = afs_setattr,
63}; 63};
64 64
65static const struct dentry_operations afs_fs_dentry_operations = { 65const struct dentry_operations afs_fs_dentry_operations = {
66 .d_revalidate = afs_d_revalidate, 66 .d_revalidate = afs_d_revalidate,
67 .d_delete = afs_d_delete, 67 .d_delete = afs_d_delete,
68 .d_release = afs_d_release, 68 .d_release = afs_d_release,
69 .d_automount = afs_d_automount,
69}; 70};
70 71
71#define AFS_DIR_HASHTBL_SIZE 128 72#define AFS_DIR_HASHTBL_SIZE 128
@@ -582,8 +583,6 @@ static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry,
582 } 583 }
583 584
584success: 585success:
585 d_set_d_op(dentry, &afs_fs_dentry_operations);
586
587 d_add(dentry, inode); 586 d_add(dentry, inode);
588 _leave(" = 0 { vn=%u u=%u } -> { ino=%lu v=%llu }", 587 _leave(" = 0 { vn=%u u=%u } -> { ino=%lu v=%llu }",
589 fid.vnode, 588 fid.vnode,
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index 0747339011c3..db66c5201474 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -184,7 +184,8 @@ struct inode *afs_iget_autocell(struct inode *dir, const char *dev_name,
184 inode->i_generation = 0; 184 inode->i_generation = 0;
185 185
186 set_bit(AFS_VNODE_PSEUDODIR, &vnode->flags); 186 set_bit(AFS_VNODE_PSEUDODIR, &vnode->flags);
187 inode->i_flags |= S_NOATIME; 187 set_bit(AFS_VNODE_MOUNTPOINT, &vnode->flags);
188 inode->i_flags |= S_AUTOMOUNT | S_NOATIME;
188 unlock_new_inode(inode); 189 unlock_new_inode(inode);
189 _leave(" = %p", inode); 190 _leave(" = %p", inode);
190 return inode; 191 return inode;
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 6d4bc1c8ff60..5a9b6843bac1 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -486,6 +486,7 @@ extern bool afs_cm_incoming_call(struct afs_call *);
486 * dir.c 486 * dir.c
487 */ 487 */
488extern const struct inode_operations afs_dir_inode_operations; 488extern const struct inode_operations afs_dir_inode_operations;
489extern const struct dentry_operations afs_fs_dentry_operations;
489extern const struct file_operations afs_dir_file_operations; 490extern const struct file_operations afs_dir_file_operations;
490 491
491/* 492/*
@@ -576,6 +577,7 @@ extern int afs_drop_inode(struct inode *);
576/* 577/*
577 * main.c 578 * main.c
578 */ 579 */
580extern struct workqueue_struct *afs_wq;
579extern struct afs_uuid afs_uuid; 581extern struct afs_uuid afs_uuid;
580 582
581/* 583/*
@@ -590,6 +592,7 @@ extern const struct inode_operations afs_mntpt_inode_operations;
590extern const struct inode_operations afs_autocell_inode_operations; 592extern const struct inode_operations afs_autocell_inode_operations;
591extern const struct file_operations afs_mntpt_file_operations; 593extern const struct file_operations afs_mntpt_file_operations;
592 594
595extern struct vfsmount *afs_d_automount(struct path *);
593extern int afs_mntpt_check_symlink(struct afs_vnode *, struct key *); 596extern int afs_mntpt_check_symlink(struct afs_vnode *, struct key *);
594extern void afs_mntpt_kill_timer(void); 597extern void afs_mntpt_kill_timer(void);
595 598
diff --git a/fs/afs/main.c b/fs/afs/main.c
index cfd1cbe25b22..42dd2e499ed8 100644
--- a/fs/afs/main.c
+++ b/fs/afs/main.c
@@ -30,6 +30,7 @@ module_param(rootcell, charp, 0);
30MODULE_PARM_DESC(rootcell, "root AFS cell name and VL server IP addr list"); 30MODULE_PARM_DESC(rootcell, "root AFS cell name and VL server IP addr list");
31 31
32struct afs_uuid afs_uuid; 32struct afs_uuid afs_uuid;
33struct workqueue_struct *afs_wq;
33 34
34/* 35/*
35 * get a client UUID 36 * get a client UUID
@@ -87,10 +88,16 @@ static int __init afs_init(void)
87 if (ret < 0) 88 if (ret < 0)
88 return ret; 89 return ret;
89 90
91 /* create workqueue */
92 ret = -ENOMEM;
93 afs_wq = alloc_workqueue("afs", 0, 0);
94 if (!afs_wq)
95 return ret;
96
90 /* register the /proc stuff */ 97 /* register the /proc stuff */
91 ret = afs_proc_init(); 98 ret = afs_proc_init();
92 if (ret < 0) 99 if (ret < 0)
93 return ret; 100 goto error_proc;
94 101
95#ifdef CONFIG_AFS_FSCACHE 102#ifdef CONFIG_AFS_FSCACHE
96 /* we want to be able to cache */ 103 /* we want to be able to cache */
@@ -140,6 +147,8 @@ error_cell_init:
140error_cache: 147error_cache:
141#endif 148#endif
142 afs_proc_cleanup(); 149 afs_proc_cleanup();
150error_proc:
151 destroy_workqueue(afs_wq);
143 rcu_barrier(); 152 rcu_barrier();
144 printk(KERN_ERR "kAFS: failed to register: %d\n", ret); 153 printk(KERN_ERR "kAFS: failed to register: %d\n", ret);
145 return ret; 154 return ret;
@@ -163,7 +172,7 @@ static void __exit afs_exit(void)
163 afs_purge_servers(); 172 afs_purge_servers();
164 afs_callback_update_kill(); 173 afs_callback_update_kill();
165 afs_vlocation_purge(); 174 afs_vlocation_purge();
166 flush_scheduled_work(); 175 destroy_workqueue(afs_wq);
167 afs_cell_purge(); 176 afs_cell_purge();
168#ifdef CONFIG_AFS_FSCACHE 177#ifdef CONFIG_AFS_FSCACHE
169 fscache_unregister_netfs(&afs_cache_netfs); 178 fscache_unregister_netfs(&afs_cache_netfs);
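The hunks above move kAFS from the shared system workqueue onto a private
afs_wq, so pending work can be flushed and freed deterministically at module
exit. A minimal sketch of that alloc_workqueue()/queue_work()/
destroy_workqueue() lifecycle, assuming a toy module (the names demo_wq and
demo_work_fn are illustrative):

	#include <linux/module.h>
	#include <linux/workqueue.h>

	static struct workqueue_struct *demo_wq;

	static void demo_work_fn(struct work_struct *work)
	{
		pr_info("demo work ran\n");
	}

	static DECLARE_WORK(demo_work, demo_work_fn);

	static int __init demo_init(void)
	{
		/* a private queue, independent of the system workqueue */
		demo_wq = alloc_workqueue("demo", 0, 0);
		if (!demo_wq)
			return -ENOMEM;
		queue_work(demo_wq, &demo_work);
		return 0;
	}

	static void __exit demo_exit(void)
	{
		/* flushes any pending work before freeing the queue */
		destroy_workqueue(demo_wq);
	}

	module_init(demo_init);
	module_exit(demo_exit);
	MODULE_LICENSE("GPL");

Because destroy_workqueue() flushes the queue first, the exit path above no
longer needs the old flush_scheduled_work() call.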
diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c
index 6153417caf57..aa59184151d0 100644
--- a/fs/afs/mntpt.c
+++ b/fs/afs/mntpt.c
@@ -24,7 +24,6 @@ static struct dentry *afs_mntpt_lookup(struct inode *dir,
24 struct dentry *dentry, 24 struct dentry *dentry,
25 struct nameidata *nd); 25 struct nameidata *nd);
26static int afs_mntpt_open(struct inode *inode, struct file *file); 26static int afs_mntpt_open(struct inode *inode, struct file *file);
27static void *afs_mntpt_follow_link(struct dentry *dentry, struct nameidata *nd);
28static void afs_mntpt_expiry_timed_out(struct work_struct *work); 27static void afs_mntpt_expiry_timed_out(struct work_struct *work);
29 28
30const struct file_operations afs_mntpt_file_operations = { 29const struct file_operations afs_mntpt_file_operations = {
@@ -34,13 +33,11 @@ const struct file_operations afs_mntpt_file_operations = {
34 33
35const struct inode_operations afs_mntpt_inode_operations = { 34const struct inode_operations afs_mntpt_inode_operations = {
36 .lookup = afs_mntpt_lookup, 35 .lookup = afs_mntpt_lookup,
37 .follow_link = afs_mntpt_follow_link,
38 .readlink = page_readlink, 36 .readlink = page_readlink,
39 .getattr = afs_getattr, 37 .getattr = afs_getattr,
40}; 38};
41 39
42const struct inode_operations afs_autocell_inode_operations = { 40const struct inode_operations afs_autocell_inode_operations = {
43 .follow_link = afs_mntpt_follow_link,
44 .getattr = afs_getattr, 41 .getattr = afs_getattr,
45}; 42};
46 43
@@ -88,6 +85,7 @@ int afs_mntpt_check_symlink(struct afs_vnode *vnode, struct key *key)
88 _debug("symlink is a mountpoint"); 85 _debug("symlink is a mountpoint");
89 spin_lock(&vnode->lock); 86 spin_lock(&vnode->lock);
90 set_bit(AFS_VNODE_MOUNTPOINT, &vnode->flags); 87 set_bit(AFS_VNODE_MOUNTPOINT, &vnode->flags);
88 vnode->vfs_inode.i_flags |= S_AUTOMOUNT;
91 spin_unlock(&vnode->lock); 89 spin_unlock(&vnode->lock);
92 } 90 }
93 91
@@ -238,52 +236,24 @@ error_no_devname:
238} 236}
239 237
240/* 238/*
241 * follow a link from a mountpoint directory, thus causing it to be mounted 239 * handle an automount point
242 */ 240 */
243static void *afs_mntpt_follow_link(struct dentry *dentry, struct nameidata *nd) 241struct vfsmount *afs_d_automount(struct path *path)
244{ 242{
245 struct vfsmount *newmnt; 243 struct vfsmount *newmnt;
246 int err;
247 244
248 _enter("%p{%s},{%s:%p{%s},}", 245 _enter("{%s,%s}", path->mnt->mnt_devname, path->dentry->d_name.name);
249 dentry,
250 dentry->d_name.name,
251 nd->path.mnt->mnt_devname,
252 dentry,
253 nd->path.dentry->d_name.name);
254
255 dput(nd->path.dentry);
256 nd->path.dentry = dget(dentry);
257 246
258 newmnt = afs_mntpt_do_automount(nd->path.dentry); 247 newmnt = afs_mntpt_do_automount(path->dentry);
259 if (IS_ERR(newmnt)) { 248 if (IS_ERR(newmnt))
260 path_put(&nd->path); 249 return newmnt;
261 return (void *)newmnt;
262 }
263
264 mntget(newmnt);
265 err = do_add_mount(newmnt, &nd->path, MNT_SHRINKABLE, &afs_vfsmounts);
266 switch (err) {
267 case 0:
268 path_put(&nd->path);
269 nd->path.mnt = newmnt;
270 nd->path.dentry = dget(newmnt->mnt_root);
271 schedule_delayed_work(&afs_mntpt_expiry_timer,
272 afs_mntpt_expiry_timeout * HZ);
273 break;
274 case -EBUSY:
275 /* someone else made a mount here whilst we were busy */
276 while (d_mountpoint(nd->path.dentry) &&
277 follow_down(&nd->path))
278 ;
279 err = 0;
280 default:
281 mntput(newmnt);
282 break;
283 }
284 250
285 _leave(" = %d", err); 251 mntget(newmnt); /* prevent immediate expiration */
286 return ERR_PTR(err); 252 mnt_set_expiry(newmnt, &afs_vfsmounts);
253 queue_delayed_work(afs_wq, &afs_mntpt_expiry_timer,
254 afs_mntpt_expiry_timeout * HZ);
255 _leave(" = %p {%s}", newmnt, newmnt->mnt_devname);
256 return newmnt;
287} 257}
288 258
289/* 259/*
@@ -295,8 +265,8 @@ static void afs_mntpt_expiry_timed_out(struct work_struct *work)
295 265
296 if (!list_empty(&afs_vfsmounts)) { 266 if (!list_empty(&afs_vfsmounts)) {
297 mark_mounts_for_expiry(&afs_vfsmounts); 267 mark_mounts_for_expiry(&afs_vfsmounts);
298 schedule_delayed_work(&afs_mntpt_expiry_timer, 268 queue_delayed_work(afs_wq, &afs_mntpt_expiry_timer,
299 afs_mntpt_expiry_timeout * HZ); 269 afs_mntpt_expiry_timeout * HZ);
300 } 270 }
301 271
302 _leave(""); 272 _leave("");
@@ -310,6 +280,5 @@ void afs_mntpt_kill_timer(void)
310 _enter(""); 280 _enter("");
311 281
312 ASSERT(list_empty(&afs_vfsmounts)); 282 ASSERT(list_empty(&afs_vfsmounts));
313 cancel_delayed_work(&afs_mntpt_expiry_timer); 283 cancel_delayed_work_sync(&afs_mntpt_expiry_timer);
314 flush_scheduled_work();
315} 284}
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index 654d8fdbf01f..e45a323aebb4 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -410,7 +410,7 @@ static void afs_rx_interceptor(struct sock *sk, unsigned long user_call_ID,
410 if (!call) { 410 if (!call) {
411 /* its an incoming call for our callback service */ 411 /* its an incoming call for our callback service */
412 skb_queue_tail(&afs_incoming_calls, skb); 412 skb_queue_tail(&afs_incoming_calls, skb);
413 schedule_work(&afs_collect_incoming_call_work); 413 queue_work(afs_wq, &afs_collect_incoming_call_work);
414 } else { 414 } else {
415 /* route the messages directly to the appropriate call */ 415 /* route the messages directly to the appropriate call */
416 skb_queue_tail(&call->rx_queue, skb); 416 skb_queue_tail(&call->rx_queue, skb);
diff --git a/fs/afs/server.c b/fs/afs/server.c
index 9fdc7fe3a7bc..d59b7516e943 100644
--- a/fs/afs/server.c
+++ b/fs/afs/server.c
@@ -238,8 +238,8 @@ void afs_put_server(struct afs_server *server)
238 if (atomic_read(&server->usage) == 0) { 238 if (atomic_read(&server->usage) == 0) {
239 list_move_tail(&server->grave, &afs_server_graveyard); 239 list_move_tail(&server->grave, &afs_server_graveyard);
240 server->time_of_death = get_seconds(); 240 server->time_of_death = get_seconds();
241 schedule_delayed_work(&afs_server_reaper, 241 queue_delayed_work(afs_wq, &afs_server_reaper,
242 afs_server_timeout * HZ); 242 afs_server_timeout * HZ);
243 } 243 }
244 spin_unlock(&afs_server_graveyard_lock); 244 spin_unlock(&afs_server_graveyard_lock);
245 _leave(" [dead]"); 245 _leave(" [dead]");
@@ -285,10 +285,11 @@ static void afs_reap_server(struct work_struct *work)
285 expiry = server->time_of_death + afs_server_timeout; 285 expiry = server->time_of_death + afs_server_timeout;
286 if (expiry > now) { 286 if (expiry > now) {
287 delay = (expiry - now) * HZ; 287 delay = (expiry - now) * HZ;
288 if (!schedule_delayed_work(&afs_server_reaper, delay)) { 288 if (!queue_delayed_work(afs_wq, &afs_server_reaper,
289 delay)) {
289 cancel_delayed_work(&afs_server_reaper); 290 cancel_delayed_work(&afs_server_reaper);
290 schedule_delayed_work(&afs_server_reaper, 291 queue_delayed_work(afs_wq, &afs_server_reaper,
291 delay); 292 delay);
292 } 293 }
293 break; 294 break;
294 } 295 }
@@ -323,5 +324,5 @@ void __exit afs_purge_servers(void)
323{ 324{
324 afs_server_timeout = 0; 325 afs_server_timeout = 0;
325 cancel_delayed_work(&afs_server_reaper); 326 cancel_delayed_work(&afs_server_reaper);
326 schedule_delayed_work(&afs_server_reaper, 0); 327 queue_delayed_work(afs_wq, &afs_server_reaper, 0);
327} 328}
diff --git a/fs/afs/super.c b/fs/afs/super.c
index f901a9d7c111..fb240e8766d6 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -336,6 +336,7 @@ static int afs_fill_super(struct super_block *sb, void *data)
336 if (!root) 336 if (!root)
337 goto error; 337 goto error;
338 338
339 sb->s_d_op = &afs_fs_dentry_operations;
339 sb->s_root = root; 340 sb->s_root = root;
340 341
341 _leave(" = 0"); 342 _leave(" = 0");
diff --git a/fs/afs/vlocation.c b/fs/afs/vlocation.c
index 9ac260d1361d..431984d2e372 100644
--- a/fs/afs/vlocation.c
+++ b/fs/afs/vlocation.c
@@ -507,8 +507,8 @@ void afs_put_vlocation(struct afs_vlocation *vl)
507 _debug("buried"); 507 _debug("buried");
508 list_move_tail(&vl->grave, &afs_vlocation_graveyard); 508 list_move_tail(&vl->grave, &afs_vlocation_graveyard);
509 vl->time_of_death = get_seconds(); 509 vl->time_of_death = get_seconds();
510 schedule_delayed_work(&afs_vlocation_reap, 510 queue_delayed_work(afs_wq, &afs_vlocation_reap,
511 afs_vlocation_timeout * HZ); 511 afs_vlocation_timeout * HZ);
512 512
513 /* suspend updates on this record */ 513 /* suspend updates on this record */
514 if (!list_empty(&vl->update)) { 514 if (!list_empty(&vl->update)) {
@@ -561,11 +561,11 @@ static void afs_vlocation_reaper(struct work_struct *work)
561 if (expiry > now) { 561 if (expiry > now) {
562 delay = (expiry - now) * HZ; 562 delay = (expiry - now) * HZ;
563 _debug("delay %lu", delay); 563 _debug("delay %lu", delay);
564 if (!schedule_delayed_work(&afs_vlocation_reap, 564 if (!queue_delayed_work(afs_wq, &afs_vlocation_reap,
565 delay)) { 565 delay)) {
566 cancel_delayed_work(&afs_vlocation_reap); 566 cancel_delayed_work(&afs_vlocation_reap);
567 schedule_delayed_work(&afs_vlocation_reap, 567 queue_delayed_work(afs_wq, &afs_vlocation_reap,
568 delay); 568 delay);
569 } 569 }
570 break; 570 break;
571 } 571 }
@@ -620,7 +620,7 @@ void afs_vlocation_purge(void)
620 destroy_workqueue(afs_vlocation_update_worker); 620 destroy_workqueue(afs_vlocation_update_worker);
621 621
622 cancel_delayed_work(&afs_vlocation_reap); 622 cancel_delayed_work(&afs_vlocation_reap);
623 schedule_delayed_work(&afs_vlocation_reap, 0); 623 queue_delayed_work(afs_wq, &afs_vlocation_reap, 0);
624} 624}
625 625
626/* 626/*
diff --git a/fs/aio.c b/fs/aio.c
index 8c8f6c5b6d79..fc557a3be0a9 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -87,7 +87,7 @@ static int __init aio_setup(void)
87 87
88 aio_wq = create_workqueue("aio"); 88 aio_wq = create_workqueue("aio");
89 abe_pool = mempool_create_kmalloc_pool(1, sizeof(struct aio_batch_entry)); 89 abe_pool = mempool_create_kmalloc_pool(1, sizeof(struct aio_batch_entry));
90 BUG_ON(!abe_pool); 90 BUG_ON(!aio_wq || !abe_pool);
91 91
92 pr_debug("aio_setup: sizeof(struct page) = %d\n", (int)sizeof(struct page)); 92 pr_debug("aio_setup: sizeof(struct page) = %d\n", (int)sizeof(struct page));
93 93
@@ -798,29 +798,12 @@ static void aio_queue_work(struct kioctx * ctx)
798 queue_delayed_work(aio_wq, &ctx->wq, timeout); 798 queue_delayed_work(aio_wq, &ctx->wq, timeout);
799} 799}
800 800
801
802/*
803 * aio_run_iocbs:
804 * Process all pending retries queued on the ioctx
805 * run list.
806 * Assumes it is operating within the aio issuer's mm
807 * context.
808 */
809static inline void aio_run_iocbs(struct kioctx *ctx)
810{
811 int requeue;
812
813 spin_lock_irq(&ctx->ctx_lock);
814
815 requeue = __aio_run_iocbs(ctx);
816 spin_unlock_irq(&ctx->ctx_lock);
817 if (requeue)
818 aio_queue_work(ctx);
819}
820
821/* 801/*
822 * just like aio_run_iocbs, but keeps running them until 802 * aio_run_all_iocbs:
823 * the list stays empty 803 * Process all pending retries queued on the ioctx
804 * run list, and keep running them until the list
805 * stays empty.
806 * Assumes it is operating within the aio issuer's mm context.
824 */ 807 */
825static inline void aio_run_all_iocbs(struct kioctx *ctx) 808static inline void aio_run_all_iocbs(struct kioctx *ctx)
826{ 809{
@@ -1839,7 +1822,7 @@ SYSCALL_DEFINE5(io_getevents, aio_context_t, ctx_id,
1839 long ret = -EINVAL; 1822 long ret = -EINVAL;
1840 1823
1841 if (likely(ioctx)) { 1824 if (likely(ioctx)) {
1842 if (likely(min_nr <= nr && min_nr >= 0 && nr >= 0)) 1825 if (likely(min_nr <= nr && min_nr >= 0))
1843 ret = read_events(ioctx, min_nr, nr, events, timeout); 1826 ret = read_events(ioctx, min_nr, nr, events, timeout);
1844 put_ioctx(ioctx); 1827 put_ioctx(ioctx);
1845 } 1828 }
diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index 5fd38112a6ca..c5567cb78432 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -26,12 +26,6 @@ static struct vfsmount *anon_inode_mnt __read_mostly;
26static struct inode *anon_inode_inode; 26static struct inode *anon_inode_inode;
27static const struct file_operations anon_inode_fops; 27static const struct file_operations anon_inode_fops;
28 28
29static struct dentry *anon_inodefs_mount(struct file_system_type *fs_type,
30 int flags, const char *dev_name, void *data)
31{
32 return mount_pseudo(fs_type, "anon_inode:", NULL, ANON_INODE_FS_MAGIC);
33}
34
35/* 29/*
36 * anon_inodefs_dname() is called from d_path(). 30 * anon_inodefs_dname() is called from d_path().
37 */ 31 */
@@ -41,14 +35,22 @@ static char *anon_inodefs_dname(struct dentry *dentry, char *buffer, int buflen)
41 dentry->d_name.name); 35 dentry->d_name.name);
42} 36}
43 37
38static const struct dentry_operations anon_inodefs_dentry_operations = {
39 .d_dname = anon_inodefs_dname,
40};
41
42static struct dentry *anon_inodefs_mount(struct file_system_type *fs_type,
43 int flags, const char *dev_name, void *data)
44{
45 return mount_pseudo(fs_type, "anon_inode:", NULL,
46 &anon_inodefs_dentry_operations, ANON_INODE_FS_MAGIC);
47}
48
44static struct file_system_type anon_inode_fs_type = { 49static struct file_system_type anon_inode_fs_type = {
45 .name = "anon_inodefs", 50 .name = "anon_inodefs",
46 .mount = anon_inodefs_mount, 51 .mount = anon_inodefs_mount,
47 .kill_sb = kill_anon_super, 52 .kill_sb = kill_anon_super,
48}; 53};
49static const struct dentry_operations anon_inodefs_dentry_operations = {
50 .d_dname = anon_inodefs_dname,
51};
52 54
53/* 55/*
54 * nop .set_page_dirty method so that people can use .page_mkwrite on 56 * nop .set_page_dirty method so that people can use .page_mkwrite on
@@ -64,9 +66,9 @@ static const struct address_space_operations anon_aops = {
64}; 66};
65 67
66/** 68/**
67 * anon_inode_getfd - creates a new file instance by hooking it up to an 69 * anon_inode_getfile - creates a new file instance by hooking it up to an
68 * anonymous inode, and a dentry that describe the "class" 70 * anonymous inode, and a dentry that describe the "class"
69 * of the file 71 * of the file
70 * 72 *
71 * @name: [in] name of the "class" of the new file 73 * @name: [in] name of the "class" of the new file
72 * @fops: [in] file operations for the new file 74 * @fops: [in] file operations for the new file
@@ -113,7 +115,6 @@ struct file *anon_inode_getfile(const char *name,
113 */ 115 */
114 ihold(anon_inode_inode); 116 ihold(anon_inode_inode);
115 117
116 d_set_d_op(path.dentry, &anon_inodefs_dentry_operations);
117 d_instantiate(path.dentry, anon_inode_inode); 118 d_instantiate(path.dentry, anon_inode_inode);
118 119
119 error = -ENFILE; 120 error = -ENFILE;
@@ -232,7 +233,7 @@ static int __init anon_inode_init(void)
232 return 0; 233 return 0;
233 234
234err_mntput: 235err_mntput:
235 mntput_long(anon_inode_mnt); 236 mntput(anon_inode_mnt);
236err_unregister_filesystem: 237err_unregister_filesystem:
237 unregister_filesystem(&anon_inode_fs_type); 238 unregister_filesystem(&anon_inode_fs_type);
238err_exit: 239err_exit:
diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index 0fffe1c24cec..54f923792728 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -88,18 +88,9 @@ struct autofs_info {
88 88
89 uid_t uid; 89 uid_t uid;
90 gid_t gid; 90 gid_t gid;
91
92 mode_t mode;
93 size_t size;
94
95 void (*free)(struct autofs_info *);
96 union {
97 const char *symlink;
98 } u;
99}; 91};
100 92
101#define AUTOFS_INF_EXPIRING (1<<0) /* dentry is in the process of expiring */ 93#define AUTOFS_INF_EXPIRING (1<<0) /* dentry is in the process of expiring */
102#define AUTOFS_INF_MOUNTPOINT (1<<1) /* mountpoint status for direct expire */
103#define AUTOFS_INF_PENDING (1<<2) /* dentry pending mount */ 94#define AUTOFS_INF_PENDING (1<<2) /* dentry pending mount */
104 95
105struct autofs_wait_queue { 96struct autofs_wait_queue {
@@ -176,14 +167,7 @@ static inline int autofs4_ispending(struct dentry *dentry)
176 return 0; 167 return 0;
177} 168}
178 169
179static inline void autofs4_copy_atime(struct file *src, struct file *dst) 170struct inode *autofs4_get_inode(struct super_block *, mode_t);
180{
181 dst->f_path.dentry->d_inode->i_atime =
182 src->f_path.dentry->d_inode->i_atime;
183 return;
184}
185
186struct inode *autofs4_get_inode(struct super_block *, struct autofs_info *);
187void autofs4_free_ino(struct autofs_info *); 171void autofs4_free_ino(struct autofs_info *);
188 172
189/* Expiration */ 173/* Expiration */
@@ -212,16 +196,89 @@ void autofs_dev_ioctl_exit(void);
212 196
213extern const struct inode_operations autofs4_symlink_inode_operations; 197extern const struct inode_operations autofs4_symlink_inode_operations;
214extern const struct inode_operations autofs4_dir_inode_operations; 198extern const struct inode_operations autofs4_dir_inode_operations;
215extern const struct inode_operations autofs4_root_inode_operations;
216extern const struct inode_operations autofs4_indirect_root_inode_operations;
217extern const struct inode_operations autofs4_direct_root_inode_operations;
218extern const struct file_operations autofs4_dir_operations; 199extern const struct file_operations autofs4_dir_operations;
219extern const struct file_operations autofs4_root_operations; 200extern const struct file_operations autofs4_root_operations;
201extern const struct dentry_operations autofs4_dentry_operations;
202
203/* VFS automount flags management functions */
204
205static inline void __managed_dentry_set_automount(struct dentry *dentry)
206{
207 dentry->d_flags |= DCACHE_NEED_AUTOMOUNT;
208}
209
210static inline void managed_dentry_set_automount(struct dentry *dentry)
211{
212 spin_lock(&dentry->d_lock);
213 __managed_dentry_set_automount(dentry);
214 spin_unlock(&dentry->d_lock);
215}
216
217static inline void __managed_dentry_clear_automount(struct dentry *dentry)
218{
219 dentry->d_flags &= ~DCACHE_NEED_AUTOMOUNT;
220}
221
222static inline void managed_dentry_clear_automount(struct dentry *dentry)
223{
224 spin_lock(&dentry->d_lock);
225 __managed_dentry_clear_automount(dentry);
226 spin_unlock(&dentry->d_lock);
227}
228
229static inline void __managed_dentry_set_transit(struct dentry *dentry)
230{
231 dentry->d_flags |= DCACHE_MANAGE_TRANSIT;
232}
233
234static inline void managed_dentry_set_transit(struct dentry *dentry)
235{
236 spin_lock(&dentry->d_lock);
237 __managed_dentry_set_transit(dentry);
238 spin_unlock(&dentry->d_lock);
239}
240
241static inline void __managed_dentry_clear_transit(struct dentry *dentry)
242{
243 dentry->d_flags &= ~DCACHE_MANAGE_TRANSIT;
244}
245
246static inline void managed_dentry_clear_transit(struct dentry *dentry)
247{
248 spin_lock(&dentry->d_lock);
249 __managed_dentry_clear_transit(dentry);
250 spin_unlock(&dentry->d_lock);
251}
252
253static inline void __managed_dentry_set_managed(struct dentry *dentry)
254{
255 dentry->d_flags |= (DCACHE_NEED_AUTOMOUNT|DCACHE_MANAGE_TRANSIT);
256}
257
258static inline void managed_dentry_set_managed(struct dentry *dentry)
259{
260 spin_lock(&dentry->d_lock);
261 __managed_dentry_set_managed(dentry);
262 spin_unlock(&dentry->d_lock);
263}
264
265static inline void __managed_dentry_clear_managed(struct dentry *dentry)
266{
267 dentry->d_flags &= ~(DCACHE_NEED_AUTOMOUNT|DCACHE_MANAGE_TRANSIT);
268}
269
270static inline void managed_dentry_clear_managed(struct dentry *dentry)
271{
272 spin_lock(&dentry->d_lock);
273 __managed_dentry_clear_managed(dentry);
274 spin_unlock(&dentry->d_lock);
275}
220 276
221/* Initializing function */ 277/* Initializing function */
222 278
223int autofs4_fill_super(struct super_block *, void *, int); 279int autofs4_fill_super(struct super_block *, void *, int);
224struct autofs_info *autofs4_init_ino(struct autofs_info *, struct autofs_sb_info *sbi, mode_t mode); 280struct autofs_info *autofs4_new_ino(struct autofs_sb_info *);
281void autofs4_clean_ino(struct autofs_info *);
225 282
226/* Queue management functions */ 283/* Queue management functions */
227 284
@@ -229,19 +286,6 @@ int autofs4_wait(struct autofs_sb_info *,struct dentry *, enum autofs_notify);
229int autofs4_wait_release(struct autofs_sb_info *,autofs_wqt_t,int); 286int autofs4_wait_release(struct autofs_sb_info *,autofs_wqt_t,int);
230void autofs4_catatonic_mode(struct autofs_sb_info *); 287void autofs4_catatonic_mode(struct autofs_sb_info *);
231 288
232static inline int autofs4_follow_mount(struct path *path)
233{
234 int res = 0;
235
236 while (d_mountpoint(path->dentry)) {
237 int followed = follow_down(path);
238 if (!followed)
239 break;
240 res = 1;
241 }
242 return res;
243}
244
245static inline u32 autofs4_get_dev(struct autofs_sb_info *sbi) 289static inline u32 autofs4_get_dev(struct autofs_sb_info *sbi)
246{ 290{
247 return new_encode_dev(sbi->sb->s_dev); 291 return new_encode_dev(sbi->sb->s_dev);
@@ -294,5 +338,4 @@ static inline void autofs4_del_expiring(struct dentry *dentry)
294 return; 338 return;
295} 339}
296 340
297void autofs4_dentry_release(struct dentry *);
298extern void autofs4_kill_sb(struct super_block *); 341extern void autofs4_kill_sb(struct super_block *);
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index eff9a419469a..1442da4860e5 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -551,7 +551,7 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp,
551 551
552 err = have_submounts(path.dentry); 552 err = have_submounts(path.dentry);
553 553
554 if (follow_down(&path)) 554 if (follow_down_one(&path))
555 magic = path.mnt->mnt_sb->s_magic; 555 magic = path.mnt->mnt_sb->s_magic;
556 } 556 }
557 557
diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index cc1d01365905..f43100b9662b 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -26,10 +26,6 @@ static inline int autofs4_can_expire(struct dentry *dentry,
26 if (ino == NULL) 26 if (ino == NULL)
27 return 0; 27 return 0;
28 28
29 /* No point expiring a pending mount */
30 if (ino->flags & AUTOFS_INF_PENDING)
31 return 0;
32
33 if (!do_now) { 29 if (!do_now) {
34 /* Too young to die */ 30 /* Too young to die */
35 if (!timeout || time_after(ino->last_used + timeout, now)) 31 if (!timeout || time_after(ino->last_used + timeout, now))
@@ -56,7 +52,7 @@ static int autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry)
56 52
57 path_get(&path); 53 path_get(&path);
58 54
59 if (!follow_down(&path)) 55 if (!follow_down_one(&path))
60 goto done; 56 goto done;
61 57
62 if (is_autofs4_dentry(path.dentry)) { 58 if (is_autofs4_dentry(path.dentry)) {
@@ -100,7 +96,7 @@ static struct dentry *get_next_positive_dentry(struct dentry *prev,
100 struct dentry *p, *ret; 96 struct dentry *p, *ret;
101 97
102 if (prev == NULL) 98 if (prev == NULL)
103 return dget(prev); 99 return dget(root);
104 100
105 spin_lock(&autofs4_lock); 101 spin_lock(&autofs4_lock);
106relock: 102relock:
@@ -137,7 +133,7 @@ again:
137 spin_lock_nested(&ret->d_lock, DENTRY_D_LOCK_NESTED); 133 spin_lock_nested(&ret->d_lock, DENTRY_D_LOCK_NESTED);
138 /* Negative dentry - try next */ 134 /* Negative dentry - try next */
139 if (!simple_positive(ret)) { 135 if (!simple_positive(ret)) {
140 spin_unlock(&ret->d_lock); 136 spin_unlock(&p->d_lock);
141 p = ret; 137 p = ret;
142 goto again; 138 goto again;
143 } 139 }
@@ -283,6 +279,7 @@ struct dentry *autofs4_expire_direct(struct super_block *sb,
283 unsigned long timeout; 279 unsigned long timeout;
284 struct dentry *root = dget(sb->s_root); 280 struct dentry *root = dget(sb->s_root);
285 int do_now = how & AUTOFS_EXP_IMMEDIATE; 281 int do_now = how & AUTOFS_EXP_IMMEDIATE;
282 struct autofs_info *ino;
286 283
287 if (!root) 284 if (!root)
288 return NULL; 285 return NULL;
@@ -291,19 +288,21 @@ struct dentry *autofs4_expire_direct(struct super_block *sb,
291 timeout = sbi->exp_timeout; 288 timeout = sbi->exp_timeout;
292 289
293 spin_lock(&sbi->fs_lock); 290 spin_lock(&sbi->fs_lock);
291 ino = autofs4_dentry_ino(root);
292 /* No point expiring a pending mount */
293 if (ino->flags & AUTOFS_INF_PENDING) {
294 spin_unlock(&sbi->fs_lock);
295 return NULL;
296 }
297 managed_dentry_set_transit(root);
294 if (!autofs4_direct_busy(mnt, root, timeout, do_now)) { 298 if (!autofs4_direct_busy(mnt, root, timeout, do_now)) {
295 struct autofs_info *ino = autofs4_dentry_ino(root); 299 struct autofs_info *ino = autofs4_dentry_ino(root);
296 if (d_mountpoint(root)) {
297 ino->flags |= AUTOFS_INF_MOUNTPOINT;
298 spin_lock(&root->d_lock);
299 root->d_flags &= ~DCACHE_MOUNTED;
300 spin_unlock(&root->d_lock);
301 }
302 ino->flags |= AUTOFS_INF_EXPIRING; 300 ino->flags |= AUTOFS_INF_EXPIRING;
303 init_completion(&ino->expire_complete); 301 init_completion(&ino->expire_complete);
304 spin_unlock(&sbi->fs_lock); 302 spin_unlock(&sbi->fs_lock);
305 return root; 303 return root;
306 } 304 }
305 managed_dentry_clear_transit(root);
307 spin_unlock(&sbi->fs_lock); 306 spin_unlock(&sbi->fs_lock);
308 dput(root); 307 dput(root);
309 308
@@ -340,6 +339,10 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
340 while ((dentry = get_next_positive_dentry(dentry, root))) { 339 while ((dentry = get_next_positive_dentry(dentry, root))) {
341 spin_lock(&sbi->fs_lock); 340 spin_lock(&sbi->fs_lock);
342 ino = autofs4_dentry_ino(dentry); 341 ino = autofs4_dentry_ino(dentry);
342 /* No point expiring a pending mount */
343 if (ino->flags & AUTOFS_INF_PENDING)
344 goto cont;
345 managed_dentry_set_transit(dentry);
343 346
344 /* 347 /*
345 * Case 1: (i) indirect mount or top level pseudo direct mount 348 * Case 1: (i) indirect mount or top level pseudo direct mount
@@ -399,6 +402,8 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
399 } 402 }
400 } 403 }
401next: 404next:
405 managed_dentry_clear_transit(dentry);
406cont:
402 spin_unlock(&sbi->fs_lock); 407 spin_unlock(&sbi->fs_lock);
403 } 408 }
404 return NULL; 409 return NULL;
@@ -479,6 +484,8 @@ int autofs4_expire_run(struct super_block *sb,
479 spin_lock(&sbi->fs_lock); 484 spin_lock(&sbi->fs_lock);
480 ino = autofs4_dentry_ino(dentry); 485 ino = autofs4_dentry_ino(dentry);
481 ino->flags &= ~AUTOFS_INF_EXPIRING; 486 ino->flags &= ~AUTOFS_INF_EXPIRING;
487 if (!d_unhashed(dentry))
488 managed_dentry_clear_transit(dentry);
482 complete_all(&ino->expire_complete); 489 complete_all(&ino->expire_complete);
483 spin_unlock(&sbi->fs_lock); 490 spin_unlock(&sbi->fs_lock);
484 491
@@ -504,18 +511,18 @@ int autofs4_do_expire_multi(struct super_block *sb, struct vfsmount *mnt,
504 ret = autofs4_wait(sbi, dentry, NFY_EXPIRE); 511 ret = autofs4_wait(sbi, dentry, NFY_EXPIRE);
505 512
506 spin_lock(&sbi->fs_lock); 513 spin_lock(&sbi->fs_lock);
507 if (ino->flags & AUTOFS_INF_MOUNTPOINT) {
508 spin_lock(&sb->s_root->d_lock);
509 /*
510 * If we haven't been expired away, then reset
511 * mounted status.
512 */
513 if (mnt->mnt_parent != mnt)
514 sb->s_root->d_flags |= DCACHE_MOUNTED;
515 spin_unlock(&sb->s_root->d_lock);
516 ino->flags &= ~AUTOFS_INF_MOUNTPOINT;
517 }
518 ino->flags &= ~AUTOFS_INF_EXPIRING; 514 ino->flags &= ~AUTOFS_INF_EXPIRING;
515 spin_lock(&dentry->d_lock);
516 if (ret)
517 __managed_dentry_clear_transit(dentry);
518 else {
519 if ((IS_ROOT(dentry) ||
520 (autofs_type_indirect(sbi->type) &&
521 IS_ROOT(dentry->d_parent))) &&
522 !(dentry->d_flags & DCACHE_NEED_AUTOMOUNT))
523 __managed_dentry_set_automount(dentry);
524 }
525 spin_unlock(&dentry->d_lock);
519 complete_all(&ino->expire_complete); 526 complete_all(&ino->expire_complete);
520 spin_unlock(&sbi->fs_lock); 527 spin_unlock(&sbi->fs_lock);
521 dput(dentry); 528 dput(dentry);
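
The expire paths now bracket each candidate dentry with managed_dentry_set_transit()/managed_dentry_clear_transit(). Those helpers are supplied by the dcache rather than by this patch; they should look roughly like the sketch below, setting DCACHE_MANAGE_TRANSIT so that every path walk through the dentry is diverted into ->d_manage(), where autofs4_d_manage() blocks walkers until the expire either completes or is abandoned:

        /* lockless form: caller already holds dentry->d_lock */
        static inline void __managed_dentry_set_transit(struct dentry *dentry)
        {
                dentry->d_flags |= DCACHE_MANAGE_TRANSIT;
        }

        static inline void managed_dentry_set_transit(struct dentry *dentry)
        {
                spin_lock(&dentry->d_lock);
                __managed_dentry_set_transit(dentry);
                spin_unlock(&dentry->d_lock);
        }

The clear variants are symmetric, removing the flag under the same locking rule.
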
diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c
index a7bdb9dcac84..180fa2425e49 100644
--- a/fs/autofs4/inode.c
+++ b/fs/autofs4/inode.c
@@ -22,77 +22,27 @@
22#include "autofs_i.h" 22#include "autofs_i.h"
23#include <linux/module.h> 23#include <linux/module.h>
24 24
25static void ino_lnkfree(struct autofs_info *ino) 25struct autofs_info *autofs4_new_ino(struct autofs_sb_info *sbi)
26{ 26{
27 if (ino->u.symlink) { 27 struct autofs_info *ino = kzalloc(sizeof(*ino), GFP_KERNEL);
28 kfree(ino->u.symlink); 28 if (ino) {
29 ino->u.symlink = NULL;
30 }
31}
32
33struct autofs_info *autofs4_init_ino(struct autofs_info *ino,
34 struct autofs_sb_info *sbi, mode_t mode)
35{
36 int reinit = 1;
37
38 if (ino == NULL) {
39 reinit = 0;
40 ino = kmalloc(sizeof(*ino), GFP_KERNEL);
41 }
42
43 if (ino == NULL)
44 return NULL;
45
46 if (!reinit) {
47 ino->flags = 0;
48 ino->inode = NULL;
49 ino->dentry = NULL;
50 ino->size = 0;
51 INIT_LIST_HEAD(&ino->active); 29 INIT_LIST_HEAD(&ino->active);
52 ino->active_count = 0;
53 INIT_LIST_HEAD(&ino->expiring); 30 INIT_LIST_HEAD(&ino->expiring);
54 atomic_set(&ino->count, 0); 31 ino->last_used = jiffies;
32 ino->sbi = sbi;
55 } 33 }
34 return ino;
35}
56 36
37void autofs4_clean_ino(struct autofs_info *ino)
38{
57 ino->uid = 0; 39 ino->uid = 0;
58 ino->gid = 0; 40 ino->gid = 0;
59 ino->mode = mode;
60 ino->last_used = jiffies; 41 ino->last_used = jiffies;
61
62 ino->sbi = sbi;
63
64 if (reinit && ino->free)
65 (ino->free)(ino);
66
67 memset(&ino->u, 0, sizeof(ino->u));
68
69 ino->free = NULL;
70
71 if (S_ISLNK(mode))
72 ino->free = ino_lnkfree;
73
74 return ino;
75} 42}
76 43
77void autofs4_free_ino(struct autofs_info *ino) 44void autofs4_free_ino(struct autofs_info *ino)
78{ 45{
79 struct autofs_info *p_ino;
80
81 if (ino->dentry) {
82 ino->dentry->d_fsdata = NULL;
83 if (ino->dentry->d_inode) {
84 struct dentry *parent = ino->dentry->d_parent;
85 if (atomic_dec_and_test(&ino->count)) {
86 p_ino = autofs4_dentry_ino(parent);
87 if (p_ino && parent != ino->dentry)
88 atomic_dec(&p_ino->count);
89 }
90 dput(ino->dentry);
91 }
92 ino->dentry = NULL;
93 }
94 if (ino->free)
95 (ino->free)(ino);
96 kfree(ino); 46 kfree(ino);
97} 47}
98 48
@@ -148,9 +98,16 @@ static int autofs4_show_options(struct seq_file *m, struct vfsmount *mnt)
148 return 0; 98 return 0;
149} 99}
150 100
101static void autofs4_evict_inode(struct inode *inode)
102{
103 end_writeback(inode);
104 kfree(inode->i_private);
105}
106
151static const struct super_operations autofs4_sops = { 107static const struct super_operations autofs4_sops = {
152 .statfs = simple_statfs, 108 .statfs = simple_statfs,
153 .show_options = autofs4_show_options, 109 .show_options = autofs4_show_options,
110 .evict_inode = autofs4_evict_inode,
154}; 111};
155 112
156enum {Opt_err, Opt_fd, Opt_uid, Opt_gid, Opt_pgrp, Opt_minproto, Opt_maxproto, 113enum {Opt_err, Opt_fd, Opt_uid, Opt_gid, Opt_pgrp, Opt_minproto, Opt_maxproto,
@@ -240,21 +197,6 @@ static int parse_options(char *options, int *pipefd, uid_t *uid, gid_t *gid,
240 return (*pipefd < 0); 197 return (*pipefd < 0);
241} 198}
242 199
243static struct autofs_info *autofs4_mkroot(struct autofs_sb_info *sbi)
244{
245 struct autofs_info *ino;
246
247 ino = autofs4_init_ino(NULL, sbi, S_IFDIR | 0755);
248 if (!ino)
249 return NULL;
250
251 return ino;
252}
253
254static const struct dentry_operations autofs4_sb_dentry_operations = {
255 .d_release = autofs4_dentry_release,
256};
257
258int autofs4_fill_super(struct super_block *s, void *data, int silent) 200int autofs4_fill_super(struct super_block *s, void *data, int silent)
259{ 201{
260 struct inode * root_inode; 202 struct inode * root_inode;
@@ -292,15 +234,16 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
292 s->s_blocksize_bits = 10; 234 s->s_blocksize_bits = 10;
293 s->s_magic = AUTOFS_SUPER_MAGIC; 235 s->s_magic = AUTOFS_SUPER_MAGIC;
294 s->s_op = &autofs4_sops; 236 s->s_op = &autofs4_sops;
237 s->s_d_op = &autofs4_dentry_operations;
295 s->s_time_gran = 1; 238 s->s_time_gran = 1;
296 239
297 /* 240 /*
298 * Get the root inode and dentry, but defer checking for errors. 241 * Get the root inode and dentry, but defer checking for errors.
299 */ 242 */
300 ino = autofs4_mkroot(sbi); 243 ino = autofs4_new_ino(sbi);
301 if (!ino) 244 if (!ino)
302 goto fail_free; 245 goto fail_free;
303 root_inode = autofs4_get_inode(s, ino); 246 root_inode = autofs4_get_inode(s, S_IFDIR | 0755);
304 if (!root_inode) 247 if (!root_inode)
305 goto fail_ino; 248 goto fail_ino;
306 249
@@ -309,7 +252,6 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
309 goto fail_iput; 252 goto fail_iput;
310 pipe = NULL; 253 pipe = NULL;
311 254
312 d_set_d_op(root, &autofs4_sb_dentry_operations);
313 root->d_fsdata = ino; 255 root->d_fsdata = ino;
314 256
315 /* Can this call block? */ 257 /* Can this call block? */
@@ -320,10 +262,11 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
320 goto fail_dput; 262 goto fail_dput;
321 } 263 }
322 264
265 if (autofs_type_trigger(sbi->type))
266 __managed_dentry_set_managed(root);
267
323 root_inode->i_fop = &autofs4_root_operations; 268 root_inode->i_fop = &autofs4_root_operations;
324 root_inode->i_op = autofs_type_trigger(sbi->type) ? 269 root_inode->i_op = &autofs4_dir_inode_operations;
325 &autofs4_direct_root_inode_operations :
326 &autofs4_indirect_root_inode_operations;
327 270
328 /* Couldn't this be tested earlier? */ 271 /* Couldn't this be tested earlier? */
329 if (sbi->max_proto < AUTOFS_MIN_PROTO_VERSION || 272 if (sbi->max_proto < AUTOFS_MIN_PROTO_VERSION ||
@@ -383,16 +326,14 @@ fail_unlock:
383 return -EINVAL; 326 return -EINVAL;
384} 327}
385 328
386struct inode *autofs4_get_inode(struct super_block *sb, 329struct inode *autofs4_get_inode(struct super_block *sb, mode_t mode)
387 struct autofs_info *inf)
388{ 330{
389 struct inode *inode = new_inode(sb); 331 struct inode *inode = new_inode(sb);
390 332
391 if (inode == NULL) 333 if (inode == NULL)
392 return NULL; 334 return NULL;
393 335
394 inf->inode = inode; 336 inode->i_mode = mode;
395 inode->i_mode = inf->mode;
396 if (sb->s_root) { 337 if (sb->s_root) {
397 inode->i_uid = sb->s_root->d_inode->i_uid; 338 inode->i_uid = sb->s_root->d_inode->i_uid;
398 inode->i_gid = sb->s_root->d_inode->i_gid; 339 inode->i_gid = sb->s_root->d_inode->i_gid;
@@ -400,12 +341,11 @@ struct inode *autofs4_get_inode(struct super_block *sb,
400 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 341 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
401 inode->i_ino = get_next_ino(); 342 inode->i_ino = get_next_ino();
402 343
403 if (S_ISDIR(inf->mode)) { 344 if (S_ISDIR(mode)) {
404 inode->i_nlink = 2; 345 inode->i_nlink = 2;
405 inode->i_op = &autofs4_dir_inode_operations; 346 inode->i_op = &autofs4_dir_inode_operations;
406 inode->i_fop = &autofs4_dir_operations; 347 inode->i_fop = &autofs4_dir_operations;
407 } else if (S_ISLNK(inf->mode)) { 348 } else if (S_ISLNK(mode)) {
408 inode->i_size = inf->size;
409 inode->i_op = &autofs4_symlink_inode_operations; 349 inode->i_op = &autofs4_symlink_inode_operations;
410 } 350 }
411 351
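
The net effect of the inode.c changes is a much simpler ownership model: the dentry owns the autofs_info through d_fsdata (freed in ->d_release()), and the inode owns the symlink body through i_private (freed in the new ->evict_inode()), with no back-pointers between them. Roughly, at creation time (a sketch of the pattern, error unwinding elided, not literal patch code):

        struct autofs_info *ino;
        struct inode *inode;

        ino = autofs4_new_ino(sbi);             /* kzalloc'd; freed by ->d_release() */
        if (!ino)
                return -ENOMEM;
        dentry->d_fsdata = ino;

        inode = autofs4_get_inode(sb, S_IFLNK | 0555);
        if (!inode)
                return -ENOMEM;
        inode->i_private = target;              /* kfree()'d by autofs4_evict_inode() */
        d_add(dentry, inode);

Because kzalloc() zeroes the structure, autofs4_new_ino() only has to initialise the list heads and the few non-zero fields, which is why the old reinit dance in autofs4_init_ino() could be deleted.
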
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index 651e4ef563b1..014e7aba3b08 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -35,10 +35,9 @@ static long autofs4_root_compat_ioctl(struct file *,unsigned int,unsigned long);
35#endif 35#endif
36static int autofs4_dir_open(struct inode *inode, struct file *file); 36static int autofs4_dir_open(struct inode *inode, struct file *file);
37static struct dentry *autofs4_lookup(struct inode *,struct dentry *, struct nameidata *); 37static struct dentry *autofs4_lookup(struct inode *,struct dentry *, struct nameidata *);
38static void *autofs4_follow_link(struct dentry *, struct nameidata *); 38static struct vfsmount *autofs4_d_automount(struct path *);
39 39static int autofs4_d_manage(struct dentry *, bool, bool);
40#define TRIGGER_FLAGS (LOOKUP_CONTINUE | LOOKUP_DIRECTORY) 40static void autofs4_dentry_release(struct dentry *);
41#define TRIGGER_INTENTS (LOOKUP_OPEN | LOOKUP_CREATE)
42 41
43const struct file_operations autofs4_root_operations = { 42const struct file_operations autofs4_root_operations = {
44 .open = dcache_dir_open, 43 .open = dcache_dir_open,
@@ -60,7 +59,7 @@ const struct file_operations autofs4_dir_operations = {
60 .llseek = dcache_dir_lseek, 59 .llseek = dcache_dir_lseek,
61}; 60};
62 61
63const struct inode_operations autofs4_indirect_root_inode_operations = { 62const struct inode_operations autofs4_dir_inode_operations = {
64 .lookup = autofs4_lookup, 63 .lookup = autofs4_lookup,
65 .unlink = autofs4_dir_unlink, 64 .unlink = autofs4_dir_unlink,
66 .symlink = autofs4_dir_symlink, 65 .symlink = autofs4_dir_symlink,
@@ -68,20 +67,10 @@ const struct inode_operations autofs4_indirect_root_inode_operations = {
68 .rmdir = autofs4_dir_rmdir, 67 .rmdir = autofs4_dir_rmdir,
69}; 68};
70 69
71const struct inode_operations autofs4_direct_root_inode_operations = { 70const struct dentry_operations autofs4_dentry_operations = {
72 .lookup = autofs4_lookup, 71 .d_automount = autofs4_d_automount,
73 .unlink = autofs4_dir_unlink, 72 .d_manage = autofs4_d_manage,
74 .mkdir = autofs4_dir_mkdir, 73 .d_release = autofs4_dentry_release,
75 .rmdir = autofs4_dir_rmdir,
76 .follow_link = autofs4_follow_link,
77};
78
79const struct inode_operations autofs4_dir_inode_operations = {
80 .lookup = autofs4_lookup,
81 .unlink = autofs4_dir_unlink,
82 .symlink = autofs4_dir_symlink,
83 .mkdir = autofs4_dir_mkdir,
84 .rmdir = autofs4_dir_rmdir,
85}; 74};
86 75
87static void autofs4_add_active(struct dentry *dentry) 76static void autofs4_add_active(struct dentry *dentry)
@@ -116,14 +105,6 @@ static void autofs4_del_active(struct dentry *dentry)
116 return; 105 return;
117} 106}
118 107
119static unsigned int autofs4_need_mount(unsigned int flags)
120{
121 unsigned int res = 0;
122 if (flags & (TRIGGER_FLAGS | TRIGGER_INTENTS))
123 res = 1;
124 return res;
125}
126
127static int autofs4_dir_open(struct inode *inode, struct file *file) 108static int autofs4_dir_open(struct inode *inode, struct file *file)
128{ 109{
129 struct dentry *dentry = file->f_path.dentry; 110 struct dentry *dentry = file->f_path.dentry;
@@ -158,278 +139,27 @@ out:
158 return dcache_dir_open(inode, file); 139 return dcache_dir_open(inode, file);
159} 140}
160 141
161static int try_to_fill_dentry(struct dentry *dentry, int flags) 142static void autofs4_dentry_release(struct dentry *de)
162{
163 struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
164 struct autofs_info *ino = autofs4_dentry_ino(dentry);
165 int status;
166
167 DPRINTK("dentry=%p %.*s ino=%p",
168 dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode);
169
170 /*
171 * Wait for a pending mount, triggering one if there
172 * isn't one already
173 */
174 if (dentry->d_inode == NULL) {
175 DPRINTK("waiting for mount name=%.*s",
176 dentry->d_name.len, dentry->d_name.name);
177
178 status = autofs4_wait(sbi, dentry, NFY_MOUNT);
179
180 DPRINTK("mount done status=%d", status);
181
182 /* Turn this into a real negative dentry? */
183 if (status == -ENOENT) {
184 spin_lock(&sbi->fs_lock);
185 ino->flags &= ~AUTOFS_INF_PENDING;
186 spin_unlock(&sbi->fs_lock);
187 return status;
188 } else if (status) {
189 /* Return a negative dentry, but leave it "pending" */
190 return status;
191 }
192 /* Trigger mount for path component or follow link */
193 } else if (ino->flags & AUTOFS_INF_PENDING ||
194 autofs4_need_mount(flags)) {
195 DPRINTK("waiting for mount name=%.*s",
196 dentry->d_name.len, dentry->d_name.name);
197
198 spin_lock(&sbi->fs_lock);
199 ino->flags |= AUTOFS_INF_PENDING;
200 spin_unlock(&sbi->fs_lock);
201 status = autofs4_wait(sbi, dentry, NFY_MOUNT);
202
203 DPRINTK("mount done status=%d", status);
204
205 if (status) {
206 spin_lock(&sbi->fs_lock);
207 ino->flags &= ~AUTOFS_INF_PENDING;
208 spin_unlock(&sbi->fs_lock);
209 return status;
210 }
211 }
212
213 /* Initialize expiry counter after successful mount */
214 ino->last_used = jiffies;
215
216 spin_lock(&sbi->fs_lock);
217 ino->flags &= ~AUTOFS_INF_PENDING;
218 spin_unlock(&sbi->fs_lock);
219
220 return 0;
221}
222
223/* For autofs direct mounts the follow link triggers the mount */
224static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd)
225{
226 struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
227 struct autofs_info *ino = autofs4_dentry_ino(dentry);
228 int oz_mode = autofs4_oz_mode(sbi);
229 unsigned int lookup_type;
230 int status;
231
232 DPRINTK("dentry=%p %.*s oz_mode=%d nd->flags=%d",
233 dentry, dentry->d_name.len, dentry->d_name.name, oz_mode,
234 nd->flags);
235 /*
236 * For an expire of a covered direct or offset mount we need
237 * to break out of follow_down() at the autofs mount trigger
238 * (d_mounted--), so we can see the expiring flag, and manage
239 * the blocking and following here until the expire is completed.
240 */
241 if (oz_mode) {
242 spin_lock(&sbi->fs_lock);
243 if (ino->flags & AUTOFS_INF_EXPIRING) {
244 spin_unlock(&sbi->fs_lock);
245 /* Follow down to our covering mount. */
246 if (!follow_down(&nd->path))
247 goto done;
248 goto follow;
249 }
250 spin_unlock(&sbi->fs_lock);
251 goto done;
252 }
253
254 /* If an expire request is pending everyone must wait. */
255 autofs4_expire_wait(dentry);
256
257 /* We trigger a mount for almost all flags */
258 lookup_type = autofs4_need_mount(nd->flags);
259 spin_lock(&sbi->fs_lock);
260 spin_lock(&autofs4_lock);
261 spin_lock(&dentry->d_lock);
262 if (!(lookup_type || ino->flags & AUTOFS_INF_PENDING)) {
263 spin_unlock(&dentry->d_lock);
264 spin_unlock(&autofs4_lock);
265 spin_unlock(&sbi->fs_lock);
266 goto follow;
267 }
268
269 /*
270 * If the dentry contains directories then it is an autofs
271 * multi-mount with no root mount offset. So don't try to
272 * mount it again.
273 */
274 if (ino->flags & AUTOFS_INF_PENDING ||
275 (!d_mountpoint(dentry) && list_empty(&dentry->d_subdirs))) {
276 spin_unlock(&dentry->d_lock);
277 spin_unlock(&autofs4_lock);
278 spin_unlock(&sbi->fs_lock);
279
280 status = try_to_fill_dentry(dentry, nd->flags);
281 if (status)
282 goto out_error;
283
284 goto follow;
285 }
286 spin_unlock(&dentry->d_lock);
287 spin_unlock(&autofs4_lock);
288 spin_unlock(&sbi->fs_lock);
289follow:
290 /*
291 * If there is no root mount it must be an autofs
292 * multi-mount with no root offset so we don't need
293 * to follow it.
294 */
295 if (d_mountpoint(dentry)) {
296 if (!autofs4_follow_mount(&nd->path)) {
297 status = -ENOENT;
298 goto out_error;
299 }
300 }
301
302done:
303 return NULL;
304
305out_error:
306 path_put(&nd->path);
307 return ERR_PTR(status);
308}
309
310/*
311 * Revalidate is called on every cache lookup. Some of those
312 * cache lookups may actually happen while the dentry is not
313 * yet completely filled in, and revalidate has to delay such
314 * lookups..
315 */
316static int autofs4_revalidate(struct dentry *dentry, struct nameidata *nd)
317{ 143{
318 struct inode *dir; 144 struct autofs_info *ino = autofs4_dentry_ino(de);
319 struct autofs_sb_info *sbi; 145 struct autofs_sb_info *sbi = autofs4_sbi(de->d_sb);
320 int oz_mode;
321 int flags = nd ? nd->flags : 0;
322 int status = 1;
323
324 if (flags & LOOKUP_RCU)
325 return -ECHILD;
326
327 dir = dentry->d_parent->d_inode;
328 sbi = autofs4_sbi(dir->i_sb);
329 oz_mode = autofs4_oz_mode(sbi);
330
331 /* Pending dentry */
332 spin_lock(&sbi->fs_lock);
333 if (autofs4_ispending(dentry)) {
334 /* The daemon never causes a mount to trigger */
335 spin_unlock(&sbi->fs_lock);
336
337 if (oz_mode)
338 return 1;
339
340 /*
341 * If the directory has gone away due to an expire
342 * we have been called as ->d_revalidate() and so
343 * we need to return false and proceed to ->lookup().
344 */
345 if (autofs4_expire_wait(dentry) == -EAGAIN)
346 return 0;
347
348 /*
349 * A zero status is success otherwise we have a
350 * negative error code.
351 */
352 status = try_to_fill_dentry(dentry, flags);
353 if (status == 0)
354 return 1;
355
356 return status;
357 }
358 spin_unlock(&sbi->fs_lock);
359
360 /* Negative dentry.. invalidate if "old" */
361 if (dentry->d_inode == NULL)
362 return 0;
363
364 /* Check for a non-mountpoint directory with no contents */
365 spin_lock(&autofs4_lock);
366 spin_lock(&dentry->d_lock);
367 if (S_ISDIR(dentry->d_inode->i_mode) &&
368 !d_mountpoint(dentry) && list_empty(&dentry->d_subdirs)) {
369 DPRINTK("dentry=%p %.*s, emptydir",
370 dentry, dentry->d_name.len, dentry->d_name.name);
371 spin_unlock(&dentry->d_lock);
372 spin_unlock(&autofs4_lock);
373
374 /* The daemon never causes a mount to trigger */
375 if (oz_mode)
376 return 1;
377
378 /*
379 * A zero status is success otherwise we have a
380 * negative error code.
381 */
382 status = try_to_fill_dentry(dentry, flags);
383 if (status == 0)
384 return 1;
385
386 return status;
387 }
388 spin_unlock(&dentry->d_lock);
389 spin_unlock(&autofs4_lock);
390
391 return 1;
392}
393
394void autofs4_dentry_release(struct dentry *de)
395{
396 struct autofs_info *inf;
397 146
398 DPRINTK("releasing %p", de); 147 DPRINTK("releasing %p", de);
399 148
400 inf = autofs4_dentry_ino(de); 149 if (!ino)
401 de->d_fsdata = NULL; 150 return;
402
403 if (inf) {
404 struct autofs_sb_info *sbi = autofs4_sbi(de->d_sb);
405
406 if (sbi) {
407 spin_lock(&sbi->lookup_lock);
408 if (!list_empty(&inf->active))
409 list_del(&inf->active);
410 if (!list_empty(&inf->expiring))
411 list_del(&inf->expiring);
412 spin_unlock(&sbi->lookup_lock);
413 }
414
415 inf->dentry = NULL;
416 inf->inode = NULL;
417 151
418 autofs4_free_ino(inf); 152 if (sbi) {
153 spin_lock(&sbi->lookup_lock);
154 if (!list_empty(&ino->active))
155 list_del(&ino->active);
156 if (!list_empty(&ino->expiring))
157 list_del(&ino->expiring);
158 spin_unlock(&sbi->lookup_lock);
419 } 159 }
420}
421 160
422/* For dentries of directories in the root dir */ 161 autofs4_free_ino(ino);
423static const struct dentry_operations autofs4_root_dentry_operations = { 162}
424 .d_revalidate = autofs4_revalidate,
425 .d_release = autofs4_dentry_release,
426};
427
428/* For other dentries */
429static const struct dentry_operations autofs4_dentry_operations = {
430 .d_revalidate = autofs4_revalidate,
431 .d_release = autofs4_dentry_release,
432};
433 163
434static struct dentry *autofs4_lookup_active(struct dentry *dentry) 164static struct dentry *autofs4_lookup_active(struct dentry *dentry)
435{ 165{
@@ -541,51 +271,246 @@ next:
541 return NULL; 271 return NULL;
542} 272}
543 273
274static int autofs4_mount_wait(struct dentry *dentry)
275{
276 struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
277 struct autofs_info *ino = autofs4_dentry_ino(dentry);
278 int status;
279
280 if (ino->flags & AUTOFS_INF_PENDING) {
281 DPRINTK("waiting for mount name=%.*s",
282 dentry->d_name.len, dentry->d_name.name);
283 status = autofs4_wait(sbi, dentry, NFY_MOUNT);
284 DPRINTK("mount wait done status=%d", status);
285 ino->last_used = jiffies;
286 return status;
287 }
288 return 0;
289}
290
291static int do_expire_wait(struct dentry *dentry)
292{
293 struct dentry *expiring;
294
295 expiring = autofs4_lookup_expiring(dentry);
296 if (!expiring)
297 return autofs4_expire_wait(dentry);
298 else {
299 /*
300 * If we are racing with expire the request might not
301 * be quite complete, but the directory has been removed
302 * so it must have been successful, just wait for it.
303 */
304 autofs4_expire_wait(expiring);
305 autofs4_del_expiring(expiring);
306 dput(expiring);
307 }
308 return 0;
309}
310
311static struct dentry *autofs4_mountpoint_changed(struct path *path)
312{
313 struct dentry *dentry = path->dentry;
314 struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
315
316 /*
317 * If this is an indirect mount the dentry could have gone away
318 * as a result of an expire and a new one created.
319 */
320 if (autofs_type_indirect(sbi->type) && d_unhashed(dentry)) {
321 struct dentry *parent = dentry->d_parent;
322 struct dentry *new = d_lookup(parent, &dentry->d_name);
323 if (!new)
324 return NULL;
325 dput(path->dentry);
326 path->dentry = new;
327 }
328 return path->dentry;
329}
330
331static struct vfsmount *autofs4_d_automount(struct path *path)
332{
333 struct dentry *dentry = path->dentry;
334 struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
335 struct autofs_info *ino = autofs4_dentry_ino(dentry);
336 int status;
337
338 DPRINTK("dentry=%p %.*s",
339 dentry, dentry->d_name.len, dentry->d_name.name);
340
341 /*
342 * Someone may have manually umounted this or it was a submount
343 * that has gone away.
344 */
345 spin_lock(&dentry->d_lock);
346 if (!d_mountpoint(dentry) && list_empty(&dentry->d_subdirs)) {
347 if (!(dentry->d_flags & DCACHE_MANAGE_TRANSIT) &&
348 (dentry->d_flags & DCACHE_NEED_AUTOMOUNT))
349 __managed_dentry_set_transit(path->dentry);
350 }
351 spin_unlock(&dentry->d_lock);
352
353 /* The daemon never triggers a mount. */
354 if (autofs4_oz_mode(sbi))
355 return NULL;
356
357 /*
358 * If an expire request is pending everyone must wait.
359 * If the expire fails we're still mounted so continue
360 * the follow and return. A return of -EAGAIN (which only
361 * happens with indirect mounts) means the expire completed
362 * and the directory was removed, so just go ahead and try
363 * the mount.
364 */
365 status = do_expire_wait(dentry);
366 if (status && status != -EAGAIN)
367 return NULL;
368
369 /* Callback to the daemon to perform the mount or wait */
370 spin_lock(&sbi->fs_lock);
371 if (ino->flags & AUTOFS_INF_PENDING) {
372 spin_unlock(&sbi->fs_lock);
373 status = autofs4_mount_wait(dentry);
374 if (status)
375 return ERR_PTR(status);
376 spin_lock(&sbi->fs_lock);
377 goto done;
378 }
379
380 /*
381 * If the dentry is a symlink it's equivalent to a directory
382 * having d_mountpoint() true, so there's no need to call back
383 * to the daemon.
384 */
385 if (dentry->d_inode && S_ISLNK(dentry->d_inode->i_mode))
386 goto done;
387 if (!d_mountpoint(dentry)) {
388 /*
389 * It's possible that user space hasn't removed directories
390 * after umounting a rootless multi-mount, although it
391 * should. For v5 have_submounts() is sufficient to handle
392 * this because the leaves of the directory tree under the
393 * mount never trigger mounts themselves (they have an autofs
394 * trigger mount mounted on them). But v4 pseudo direct mounts
395 * do need the leaves to trigger mounts. In this case we
396 * have no choice but to use the list_empty() check and
397 * require user space to behave.
398 */
399 if (sbi->version > 4) {
400 if (have_submounts(dentry))
401 goto done;
402 } else {
403 spin_lock(&dentry->d_lock);
404 if (!list_empty(&dentry->d_subdirs)) {
405 spin_unlock(&dentry->d_lock);
406 goto done;
407 }
408 spin_unlock(&dentry->d_lock);
409 }
410 ino->flags |= AUTOFS_INF_PENDING;
411 spin_unlock(&sbi->fs_lock);
412 status = autofs4_mount_wait(dentry);
413 if (status)
414 return ERR_PTR(status);
415 spin_lock(&sbi->fs_lock);
416 ino->flags &= ~AUTOFS_INF_PENDING;
417 }
418done:
419 if (!(ino->flags & AUTOFS_INF_EXPIRING)) {
420 /*
421 * Any needed mounting has been completed and the path updated
422 * so turn this into a normal dentry so we don't continually
423 * call ->d_automount() and ->d_manage().
424 */
425 spin_lock(&dentry->d_lock);
426 __managed_dentry_clear_transit(dentry);
427 /*
428 * Only clear DMANAGED_AUTOMOUNT for rootless multi-mounts and
429 * symlinks as in all other cases the dentry will be covered by
430 * an actual mount so ->d_automount() won't be called during
431 * the follow.
432 */
433 if ((!d_mountpoint(dentry) &&
434 !list_empty(&dentry->d_subdirs)) ||
435 (dentry->d_inode && S_ISLNK(dentry->d_inode->i_mode)))
436 __managed_dentry_clear_automount(dentry);
437 spin_unlock(&dentry->d_lock);
438 }
439 spin_unlock(&sbi->fs_lock);
440
441 /* Mount succeeded, check if we ended up with a new dentry */
442 dentry = autofs4_mountpoint_changed(path);
443 if (!dentry)
444 return ERR_PTR(-ENOENT);
445
446 return NULL;
447}
448
449int autofs4_d_manage(struct dentry *dentry, bool mounting_here, bool rcu_walk)
450{
451 struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
452
453 DPRINTK("dentry=%p %.*s",
454 dentry, dentry->d_name.len, dentry->d_name.name);
455
456 /* The daemon never waits. */
457 if (autofs4_oz_mode(sbi) || mounting_here) {
458 if (!d_mountpoint(dentry))
459 return -EISDIR;
460 return 0;
461 }
462
463 /* We need to sleep, so we need pathwalk to be in ref-mode */
464 if (rcu_walk)
465 return -ECHILD;
466
467 /* Wait for pending expires */
468 do_expire_wait(dentry);
469
470 /*
471 * This dentry may be under construction so wait on mount
472 * completion.
473 */
474 return autofs4_mount_wait(dentry);
475}
476
544/* Lookups in the root directory */ 477/* Lookups in the root directory */
545static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) 478static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
546{ 479{
547 struct autofs_sb_info *sbi; 480 struct autofs_sb_info *sbi;
548 struct autofs_info *ino; 481 struct autofs_info *ino;
549 struct dentry *expiring, *active; 482 struct dentry *active;
550 int oz_mode;
551 483
552 DPRINTK("name = %.*s", 484 DPRINTK("name = %.*s", dentry->d_name.len, dentry->d_name.name);
553 dentry->d_name.len, dentry->d_name.name);
554 485
555 /* File name too long to exist */ 486 /* File name too long to exist */
556 if (dentry->d_name.len > NAME_MAX) 487 if (dentry->d_name.len > NAME_MAX)
557 return ERR_PTR(-ENAMETOOLONG); 488 return ERR_PTR(-ENAMETOOLONG);
558 489
559 sbi = autofs4_sbi(dir->i_sb); 490 sbi = autofs4_sbi(dir->i_sb);
560 oz_mode = autofs4_oz_mode(sbi);
561 491
562 DPRINTK("pid = %u, pgrp = %u, catatonic = %d, oz_mode = %d", 492 DPRINTK("pid = %u, pgrp = %u, catatonic = %d, oz_mode = %d",
563 current->pid, task_pgrp_nr(current), sbi->catatonic, oz_mode); 493 current->pid, task_pgrp_nr(current), sbi->catatonic,
494 autofs4_oz_mode(sbi));
564 495
565 active = autofs4_lookup_active(dentry); 496 active = autofs4_lookup_active(dentry);
566 if (active) { 497 if (active) {
567 dentry = active; 498 return active;
568 ino = autofs4_dentry_ino(dentry);
569 } else { 499 } else {
570 /* 500 /*
571 * Mark the dentry incomplete but don't hash it. We do this 501 * A dentry that is not within the root can never trigger a
572 * to serialize our inode creation operations (symlink and 502 * mount operation, unless the directory already exists, so we
573 * mkdir) which prevents deadlock during the callback to 503 * can return fail immediately. The daemon however does need
574 * the daemon. Subsequent user space lookups for the same 504 * to create directories within the file system.
575 * dentry are placed on the wait queue while the daemon
576 * itself is allowed passage unrestricted so the create
577 * operation itself can then hash the dentry. Finally,
578 * we check for the hashed dentry and return the newly
579 * hashed dentry.
580 */ 505 */
581 d_set_d_op(dentry, &autofs4_root_dentry_operations); 506 if (!autofs4_oz_mode(sbi) && !IS_ROOT(dentry->d_parent))
507 return ERR_PTR(-ENOENT);
582 508
583 /* 509 /* Mark entries in the root as mount triggers */
584 * And we need to ensure that the same dentry is used for 510 if (autofs_type_indirect(sbi->type) && IS_ROOT(dentry->d_parent))
585 * all following lookup calls until it is hashed so that 511 __managed_dentry_set_managed(dentry);
586 * the dentry flags are persistent throughout the request. 512
587 */ 513 ino = autofs4_new_ino(sbi);
588 ino = autofs4_init_ino(NULL, sbi, 0555);
589 if (!ino) 514 if (!ino)
590 return ERR_PTR(-ENOMEM); 515 return ERR_PTR(-ENOMEM);
591 516
@@ -596,82 +521,6 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, s
596 521
597 d_instantiate(dentry, NULL); 522 d_instantiate(dentry, NULL);
598 } 523 }
599
600 if (!oz_mode) {
601 mutex_unlock(&dir->i_mutex);
602 expiring = autofs4_lookup_expiring(dentry);
603 if (expiring) {
604 /*
605 * If we are racing with expire the request might not
606 * be quite complete but the directory has been removed
607 * so it must have been successful, so just wait for it.
608 */
609 autofs4_expire_wait(expiring);
610 autofs4_del_expiring(expiring);
611 dput(expiring);
612 }
613
614 spin_lock(&sbi->fs_lock);
615 ino->flags |= AUTOFS_INF_PENDING;
616 spin_unlock(&sbi->fs_lock);
617 if (dentry->d_op && dentry->d_op->d_revalidate)
618 (dentry->d_op->d_revalidate)(dentry, nd);
619 mutex_lock(&dir->i_mutex);
620 }
621
622 /*
623 * If we are still pending, check if we had to handle
624 * a signal. If so we can force a restart..
625 */
626 if (ino->flags & AUTOFS_INF_PENDING) {
627 /* See if we were interrupted */
628 if (signal_pending(current)) {
629 sigset_t *sigset = &current->pending.signal;
630 if (sigismember (sigset, SIGKILL) ||
631 sigismember (sigset, SIGQUIT) ||
632 sigismember (sigset, SIGINT)) {
633 if (active)
634 dput(active);
635 return ERR_PTR(-ERESTARTNOINTR);
636 }
637 }
638 if (!oz_mode) {
639 spin_lock(&sbi->fs_lock);
640 ino->flags &= ~AUTOFS_INF_PENDING;
641 spin_unlock(&sbi->fs_lock);
642 }
643 }
644
645 /*
646 * If this dentry is unhashed, then we shouldn't honour this
647 * lookup. Returning ENOENT here doesn't do the right thing
648 * for all system calls, but it should be OK for the operations
649 * we permit from an autofs.
650 */
651 if (!oz_mode && d_unhashed(dentry)) {
652 /*
653 * A user space application can (and has done in the past)
654 * remove and re-create this directory during the callback.
655 * This can leave us with an unhashed dentry, but a
656 * successful mount! So we need to perform another
657 * cached lookup in case the dentry now exists.
658 */
659 struct dentry *parent = dentry->d_parent;
660 struct dentry *new = d_lookup(parent, &dentry->d_name);
661 if (new != NULL)
662 dentry = new;
663 else
664 dentry = ERR_PTR(-ENOENT);
665
666 if (active)
667 dput(active);
668
669 return dentry;
670 }
671
672 if (active)
673 return active;
674
675 return NULL; 524 return NULL;
676} 525}
677 526
@@ -683,6 +532,7 @@ static int autofs4_dir_symlink(struct inode *dir,
683 struct autofs_info *ino = autofs4_dentry_ino(dentry); 532 struct autofs_info *ino = autofs4_dentry_ino(dentry);
684 struct autofs_info *p_ino; 533 struct autofs_info *p_ino;
685 struct inode *inode; 534 struct inode *inode;
535 size_t size = strlen(symname);
686 char *cp; 536 char *cp;
687 537
688 DPRINTK("%s <- %.*s", symname, 538 DPRINTK("%s <- %.*s", symname,
@@ -691,45 +541,35 @@ static int autofs4_dir_symlink(struct inode *dir,
691 if (!autofs4_oz_mode(sbi)) 541 if (!autofs4_oz_mode(sbi))
692 return -EACCES; 542 return -EACCES;
693 543
694 ino = autofs4_init_ino(ino, sbi, S_IFLNK | 0555); 544 BUG_ON(!ino);
695 if (!ino) 545
696 return -ENOMEM; 546 autofs4_clean_ino(ino);
697 547
698 autofs4_del_active(dentry); 548 autofs4_del_active(dentry);
699 549
700 ino->size = strlen(symname); 550 cp = kmalloc(size + 1, GFP_KERNEL);
701 cp = kmalloc(ino->size + 1, GFP_KERNEL); 551 if (!cp)
702 if (!cp) {
703 if (!dentry->d_fsdata)
704 kfree(ino);
705 return -ENOMEM; 552 return -ENOMEM;
706 }
707 553
708 strcpy(cp, symname); 554 strcpy(cp, symname);
709 555
710 inode = autofs4_get_inode(dir->i_sb, ino); 556 inode = autofs4_get_inode(dir->i_sb, S_IFLNK | 0555);
711 if (!inode) { 557 if (!inode) {
712 kfree(cp); 558 kfree(cp);
713 if (!dentry->d_fsdata) 559 if (!dentry->d_fsdata)
714 kfree(ino); 560 kfree(ino);
715 return -ENOMEM; 561 return -ENOMEM;
716 } 562 }
563 inode->i_private = cp;
564 inode->i_size = size;
717 d_add(dentry, inode); 565 d_add(dentry, inode);
718 566
719 if (dir == dir->i_sb->s_root->d_inode) 567 dget(dentry);
720 d_set_d_op(dentry, &autofs4_root_dentry_operations);
721 else
722 d_set_d_op(dentry, &autofs4_dentry_operations);
723
724 dentry->d_fsdata = ino;
725 ino->dentry = dget(dentry);
726 atomic_inc(&ino->count); 568 atomic_inc(&ino->count);
727 p_ino = autofs4_dentry_ino(dentry->d_parent); 569 p_ino = autofs4_dentry_ino(dentry->d_parent);
728 if (p_ino && dentry->d_parent != dentry) 570 if (p_ino && dentry->d_parent != dentry)
729 atomic_inc(&p_ino->count); 571 atomic_inc(&p_ino->count);
730 ino->inode = inode;
731 572
732 ino->u.symlink = cp;
733 dir->i_mtime = CURRENT_TIME; 573 dir->i_mtime = CURRENT_TIME;
734 574
735 return 0; 575 return 0;
@@ -782,6 +622,58 @@ static int autofs4_dir_unlink(struct inode *dir, struct dentry *dentry)
782 return 0; 622 return 0;
783} 623}
784 624
625/*
626 * Version 4 of autofs provides a pseudo direct mount implementation
627 * that relies on directories at the leaves of a directory tree under
628 * an indirect mount to trigger mounts. To allow for this we need to
629 * set the DMANAGED_AUTOMOUNT and DMANAGED_TRANSIT flags on the leaves
630 * of the directory tree. There is no need to clear the automount flag
631 * following a mount or restore it after an expire because these mounts
632 * are always covered. However, it is necessary to ensure that these
633 * flags are clear on non-empty directories to avoid unnecessary calls
634 * during path walks.
635 */
636static void autofs_set_leaf_automount_flags(struct dentry *dentry)
637{
638 struct dentry *parent;
639
640 /* root and dentries in the root are already handled */
641 if (IS_ROOT(dentry->d_parent))
642 return;
643
644 managed_dentry_set_managed(dentry);
645
646 parent = dentry->d_parent;
647 /* only consider parents below dentries in the root */
648 if (IS_ROOT(parent->d_parent))
649 return;
650 managed_dentry_clear_managed(parent);
651 return;
652}
653
654static void autofs_clear_leaf_automount_flags(struct dentry *dentry)
655{
656 struct list_head *d_child;
657 struct dentry *parent;
658
659 /* flags for dentries in the root are handled elsewhere */
660 if (IS_ROOT(dentry->d_parent))
661 return;
662
663 managed_dentry_clear_managed(dentry);
664
665 parent = dentry->d_parent;
666 /* only consider parents below dentries in the root */
667 if (IS_ROOT(parent->d_parent))
668 return;
669 d_child = &dentry->d_u.d_child;
670 /* Set parent managed if it's becoming empty */
671 if (d_child->next == &parent->d_subdirs &&
672 d_child->prev == &parent->d_subdirs)
673 managed_dentry_set_managed(parent);
674 return;
675}
676
785static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry) 677static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry)
786{ 678{
787 struct autofs_sb_info *sbi = autofs4_sbi(dir->i_sb); 679 struct autofs_sb_info *sbi = autofs4_sbi(dir->i_sb);
@@ -809,6 +701,9 @@ static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry)
809 spin_unlock(&dentry->d_lock); 701 spin_unlock(&dentry->d_lock);
810 spin_unlock(&autofs4_lock); 702 spin_unlock(&autofs4_lock);
811 703
704 if (sbi->version < 5)
705 autofs_clear_leaf_automount_flags(dentry);
706
812 if (atomic_dec_and_test(&ino->count)) { 707 if (atomic_dec_and_test(&ino->count)) {
813 p_ino = autofs4_dentry_ino(dentry->d_parent); 708 p_ino = autofs4_dentry_ino(dentry->d_parent);
814 if (p_ino && dentry->d_parent != dentry) 709 if (p_ino && dentry->d_parent != dentry)
@@ -837,32 +732,25 @@ static int autofs4_dir_mkdir(struct inode *dir, struct dentry *dentry, int mode)
837 DPRINTK("dentry %p, creating %.*s", 732 DPRINTK("dentry %p, creating %.*s",
838 dentry, dentry->d_name.len, dentry->d_name.name); 733 dentry, dentry->d_name.len, dentry->d_name.name);
839 734
840 ino = autofs4_init_ino(ino, sbi, S_IFDIR | 0555); 735 BUG_ON(!ino);
841 if (!ino) 736
842 return -ENOMEM; 737 autofs4_clean_ino(ino);
843 738
844 autofs4_del_active(dentry); 739 autofs4_del_active(dentry);
845 740
846 inode = autofs4_get_inode(dir->i_sb, ino); 741 inode = autofs4_get_inode(dir->i_sb, S_IFDIR | 0555);
847 if (!inode) { 742 if (!inode)
848 if (!dentry->d_fsdata)
849 kfree(ino);
850 return -ENOMEM; 743 return -ENOMEM;
851 }
852 d_add(dentry, inode); 744 d_add(dentry, inode);
853 745
854 if (dir == dir->i_sb->s_root->d_inode) 746 if (sbi->version < 5)
855 d_set_d_op(dentry, &autofs4_root_dentry_operations); 747 autofs_set_leaf_automount_flags(dentry);
856 else
857 d_set_d_op(dentry, &autofs4_dentry_operations);
858 748
859 dentry->d_fsdata = ino; 749 dget(dentry);
860 ino->dentry = dget(dentry);
861 atomic_inc(&ino->count); 750 atomic_inc(&ino->count);
862 p_ino = autofs4_dentry_ino(dentry->d_parent); 751 p_ino = autofs4_dentry_ino(dentry->d_parent);
863 if (p_ino && dentry->d_parent != dentry) 752 if (p_ino && dentry->d_parent != dentry)
864 atomic_inc(&p_ino->count); 753 atomic_inc(&p_ino->count);
865 ino->inode = inode;
866 inc_nlink(dir); 754 inc_nlink(dir);
867 dir->i_mtime = CURRENT_TIME; 755 dir->i_mtime = CURRENT_TIME;
868 756
@@ -944,8 +832,7 @@ static inline int autofs4_ask_umount(struct vfsmount *mnt, int __user *p)
944int is_autofs4_dentry(struct dentry *dentry) 832int is_autofs4_dentry(struct dentry *dentry)
945{ 833{
946 return dentry && dentry->d_inode && 834 return dentry && dentry->d_inode &&
947 (dentry->d_op == &autofs4_root_dentry_operations || 835 dentry->d_op == &autofs4_dentry_operations &&
948 dentry->d_op == &autofs4_dentry_operations) &&
949 dentry->d_fsdata != NULL; 836 dentry->d_fsdata != NULL;
950} 837}
951 838
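
With ->follow_link() and ->d_revalidate() triggering gone, the mount/expire protocol is driven entirely by the two new dentry operations, keyed off the per-dentry flags this patch sets (DCACHE_NEED_AUTOMOUNT via the managed helpers, DCACHE_MANAGE_TRANSIT while a transition is in flight). The walk-side contract for one path component, sketched from the VFS's point of view (simplified; the real logic lives in the namei/dcache core):

        /* sketch: what the path walker does per component */
        if (dentry->d_flags & DCACHE_MANAGE_TRANSIT) {
                err = dentry->d_op->d_manage(dentry, false, rcu_walk);
                if (err < 0)
                        return err;     /* e.g. -EISDIR, or -ECHILD to leave RCU walk */
        }
        if (d_mountpoint(dentry)) {
                /* cross onto the mounted filesystem */
        } else if (dentry->d_flags & DCACHE_NEED_AUTOMOUNT) {
                struct vfsmount *mnt = dentry->d_op->d_automount(path);
                if (IS_ERR(mnt))
                        return PTR_ERR(mnt);
                /* NULL means nothing to mount; keep walking in place */
        }

So autofs4_d_manage() is where waiters sleep and autofs4_d_automount() is where the daemon callback happens, replacing the old try_to_fill_dentry()/follow-link contortions.
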
diff --git a/fs/autofs4/symlink.c b/fs/autofs4/symlink.c
index b4ea82934d2e..f27c094a1919 100644
--- a/fs/autofs4/symlink.c
+++ b/fs/autofs4/symlink.c
@@ -14,8 +14,7 @@
14 14
15static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd) 15static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd)
16{ 16{
17 struct autofs_info *ino = autofs4_dentry_ino(dentry); 17 nd_set_link(nd, dentry->d_inode->i_private);
18 nd_set_link(nd, (char *)ino->u.symlink);
19 return NULL; 18 return NULL;
20} 19}
21 20
diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c
index c5f8459c905e..56010056b2e6 100644
--- a/fs/autofs4/waitq.c
+++ b/fs/autofs4/waitq.c
@@ -309,6 +309,9 @@ static int validate_request(struct autofs_wait_queue **wait,
309 * completed while we waited on the mutex ... 309 * completed while we waited on the mutex ...
310 */ 310 */
311 if (notify == NFY_MOUNT) { 311 if (notify == NFY_MOUNT) {
312 struct dentry *new = NULL;
313 int valid = 1;
314
312 /* 315 /*
313 * If the dentry was successfully mounted while we slept 316 * If the dentry was successfully mounted while we slept
314 * on the wait queue mutex we can return success. If it 317 * on the wait queue mutex we can return success. If it
@@ -316,8 +319,20 @@ static int validate_request(struct autofs_wait_queue **wait,
316 * a multi-mount with no mount at its base) we can 319 * a multi-mount with no mount at its base) we can
317 * continue on and create a new request. 320 * continue on and create a new request.
318 */ 321 */
322 if (!IS_ROOT(dentry)) {
323 if (dentry->d_inode && d_unhashed(dentry)) {
324 struct dentry *parent = dentry->d_parent;
325 new = d_lookup(parent, &dentry->d_name);
326 if (new)
327 dentry = new;
328 }
329 }
319 if (have_submounts(dentry)) 330 if (have_submounts(dentry))
320 return 0; 331 valid = 0;
332
333 if (new)
334 dput(new);
335 return valid;
321 } 336 }
322 337
323 return 1; 338 return 1;
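
The validate_request() change handles a daemon that rmdir()s and re-creates the trigger directory while a waiter sleeps on the queue mutex: the waiter can wake holding an unhashed dentry even though the mount actually succeeded. The idiom, pulled out as a sketch (hypothetical helper name):

        /* find the dentry currently hashed under this name, if any */
        static struct dentry *current_dentry(struct dentry *dentry)
        {
                if (dentry->d_inode && d_unhashed(dentry))
                        return d_lookup(dentry->d_parent, &dentry->d_name);
                return NULL;    /* caller must dput() a non-NULL result */
        }

autofs4_mountpoint_changed() in root.c applies the same idea after a successful ->d_automount().
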
diff --git a/fs/befs/endian.h b/fs/befs/endian.h
index 6cb84d896d05..27223878ba9f 100644
--- a/fs/befs/endian.h
+++ b/fs/befs/endian.h
@@ -102,22 +102,22 @@ cpu_to_fsrun(const struct super_block *sb, befs_block_run n)
102} 102}
103 103
104static inline befs_data_stream 104static inline befs_data_stream
105fsds_to_cpu(const struct super_block *sb, befs_disk_data_stream n) 105fsds_to_cpu(const struct super_block *sb, const befs_disk_data_stream *n)
106{ 106{
107 befs_data_stream data; 107 befs_data_stream data;
108 int i; 108 int i;
109 109
110 for (i = 0; i < BEFS_NUM_DIRECT_BLOCKS; ++i) 110 for (i = 0; i < BEFS_NUM_DIRECT_BLOCKS; ++i)
111 data.direct[i] = fsrun_to_cpu(sb, n.direct[i]); 111 data.direct[i] = fsrun_to_cpu(sb, n->direct[i]);
112 112
113 data.max_direct_range = fs64_to_cpu(sb, n.max_direct_range); 113 data.max_direct_range = fs64_to_cpu(sb, n->max_direct_range);
114 data.indirect = fsrun_to_cpu(sb, n.indirect); 114 data.indirect = fsrun_to_cpu(sb, n->indirect);
115 data.max_indirect_range = fs64_to_cpu(sb, n.max_indirect_range); 115 data.max_indirect_range = fs64_to_cpu(sb, n->max_indirect_range);
116 data.double_indirect = fsrun_to_cpu(sb, n.double_indirect); 116 data.double_indirect = fsrun_to_cpu(sb, n->double_indirect);
117 data.max_double_indirect_range = fs64_to_cpu(sb, 117 data.max_double_indirect_range = fs64_to_cpu(sb,
118 n. 118 n->
119 max_double_indirect_range); 119 max_double_indirect_range);
120 data.size = fs64_to_cpu(sb, n.size); 120 data.size = fs64_to_cpu(sb, n->size);
121 121
122 return data; 122 return data;
123} 123}
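
fsds_to_cpu() previously took the whole befs_disk_data_stream by value, copying an on-disk structure of a dozen block runs plus range fields onto the stack at every call; passing a const pointer copies one machine word instead. The idiom in miniature (illustrative types only):

        struct big { __u64 v[64]; };                    /* 512 bytes */

        static __u64 first_by_value(struct big b)       /* copies 512 bytes per call */
        {
                return b.v[0];
        }

        static __u64 first_by_pointer(const struct big *b) /* copies 8 bytes per call */
        {
                return b->v[0];
        }

The const qualifier also lets the compiler and reviewers see that the endian conversion never modifies the on-disk image.
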
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index de93581b79a2..b1d0c794747b 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -390,7 +390,7 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino)
390 int num_blks; 390 int num_blks;
391 391
392 befs_ino->i_data.ds = 392 befs_ino->i_data.ds =
393 fsds_to_cpu(sb, raw_inode->data.datastream); 393 fsds_to_cpu(sb, &raw_inode->data.datastream);
394 394
395 num_blks = befs_count_blocks(sb, &befs_ino->i_data.ds); 395 num_blks = befs_count_blocks(sb, &befs_ino->i_data.ds);
396 inode->i_blocks = 396 inode->i_blocks =
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 6884e198e0c7..d5b640ba6cb1 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -66,12 +66,11 @@ static int elf_core_dump(struct coredump_params *cprm);
66#define ELF_PAGEALIGN(_v) (((_v) + ELF_MIN_ALIGN - 1) & ~(ELF_MIN_ALIGN - 1)) 66#define ELF_PAGEALIGN(_v) (((_v) + ELF_MIN_ALIGN - 1) & ~(ELF_MIN_ALIGN - 1))
67 67
68static struct linux_binfmt elf_format = { 68static struct linux_binfmt elf_format = {
69 .module = THIS_MODULE, 69 .module = THIS_MODULE,
70 .load_binary = load_elf_binary, 70 .load_binary = load_elf_binary,
71 .load_shlib = load_elf_library, 71 .load_shlib = load_elf_library,
72 .core_dump = elf_core_dump, 72 .core_dump = elf_core_dump,
73 .min_coredump = ELF_EXEC_PAGESIZE, 73 .min_coredump = ELF_EXEC_PAGESIZE,
74 .hasvdso = 1
75}; 74};
76 75
77#define BAD_ADDR(x) ((unsigned long)(x) >= TASK_SIZE) 76#define BAD_ADDR(x) ((unsigned long)(x) >= TASK_SIZE)
@@ -316,8 +315,6 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec,
316 return 0; 315 return 0;
317} 316}
318 317
319#ifndef elf_map
320
321static unsigned long elf_map(struct file *filep, unsigned long addr, 318static unsigned long elf_map(struct file *filep, unsigned long addr,
322 struct elf_phdr *eppnt, int prot, int type, 319 struct elf_phdr *eppnt, int prot, int type,
323 unsigned long total_size) 320 unsigned long total_size)
@@ -354,8 +351,6 @@ static unsigned long elf_map(struct file *filep, unsigned long addr,
354 return(map_addr); 351 return(map_addr);
355} 352}
356 353
357#endif /* !elf_map */
358
359static unsigned long total_mapping_size(struct elf_phdr *cmds, int nr) 354static unsigned long total_mapping_size(struct elf_phdr *cmds, int nr)
360{ 355{
361 int i, first_idx = -1, last_idx = -1; 356 int i, first_idx = -1, last_idx = -1;
@@ -421,7 +416,7 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
421 goto out; 416 goto out;
422 417
423 retval = kernel_read(interpreter, interp_elf_ex->e_phoff, 418 retval = kernel_read(interpreter, interp_elf_ex->e_phoff,
424 (char *)elf_phdata,size); 419 (char *)elf_phdata, size);
425 error = -EIO; 420 error = -EIO;
426 if (retval != size) { 421 if (retval != size) {
427 if (retval < 0) 422 if (retval < 0)
@@ -601,7 +596,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
601 goto out; 596 goto out;
602 if (!elf_check_arch(&loc->elf_ex)) 597 if (!elf_check_arch(&loc->elf_ex))
603 goto out; 598 goto out;
604 if (!bprm->file->f_op||!bprm->file->f_op->mmap) 599 if (!bprm->file->f_op || !bprm->file->f_op->mmap)
605 goto out; 600 goto out;
606 601
607 /* Now read in all of the header information */ 602 /* Now read in all of the header information */
@@ -761,8 +756,8 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
761 /* There was a PT_LOAD segment with p_memsz > p_filesz 756 /* There was a PT_LOAD segment with p_memsz > p_filesz
762 before this one. Map anonymous pages, if needed, 757 before this one. Map anonymous pages, if needed,
763 and clear the area. */ 758 and clear the area. */
764 retval = set_brk (elf_bss + load_bias, 759 retval = set_brk(elf_bss + load_bias,
765 elf_brk + load_bias); 760 elf_brk + load_bias);
766 if (retval) { 761 if (retval) {
767 send_sig(SIGKILL, current, 0); 762 send_sig(SIGKILL, current, 0);
768 goto out_free_dentry; 763 goto out_free_dentry;
diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c
index 4d0ff5ee27b8..e49cce234c65 100644
--- a/fs/bio-integrity.c
+++ b/fs/bio-integrity.c
@@ -782,7 +782,12 @@ void __init bio_integrity_init(void)
782{ 782{
783 unsigned int i; 783 unsigned int i;
784 784
785 kintegrityd_wq = create_workqueue("kintegrityd"); 785 /*
786 * kintegrityd won't block much but may burn a lot of CPU cycles.
787 * Make it highpri CPU intensive wq with max concurrency of 1.
788 */
789 kintegrityd_wq = alloc_workqueue("kintegrityd", WQ_MEM_RECLAIM |
790 WQ_HIGHPRI | WQ_CPU_INTENSIVE, 1);
786 if (!kintegrityd_wq) 791 if (!kintegrityd_wq)
787 panic("Failed to create kintegrityd\n"); 792 panic("Failed to create kintegrityd\n");
788 793
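
create_workqueue() predates the concurrency-managed workqueue rework and encodes no policy; alloc_workqueue() makes the policy explicit. Here WQ_MEM_RECLAIM guarantees a rescuer thread so integrity processing can make forward progress under memory pressure, WQ_HIGHPRI queues the work at elevated priority, WQ_CPU_INTENSIVE keeps the CPU-hungry items from stalling concurrency management for other work on the same CPU, and a max_active of 1 caps in-flight items. The same constructor in a generic, driver-style sketch (hypothetical names):

        #include <linux/workqueue.h>

        static struct workqueue_struct *example_wq;

        static int __init example_init(void)
        {
                example_wq = alloc_workqueue("example",
                                             WQ_MEM_RECLAIM | WQ_HIGHPRI |
                                             WQ_CPU_INTENSIVE, 1);
                if (!example_wq)
                        return -ENOMEM;
                return 0;
        }

        static void __exit example_exit(void)
        {
                destroy_workqueue(example_wq);
        }
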
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 771f23527010..333a7bb4cb9c 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -433,7 +433,7 @@ static void init_once(void *foo)
433 INIT_LIST_HEAD(&bdev->bd_inodes); 433 INIT_LIST_HEAD(&bdev->bd_inodes);
434 INIT_LIST_HEAD(&bdev->bd_list); 434 INIT_LIST_HEAD(&bdev->bd_list);
435#ifdef CONFIG_SYSFS 435#ifdef CONFIG_SYSFS
436 INIT_LIST_HEAD(&bdev->bd_holder_list); 436 INIT_LIST_HEAD(&bdev->bd_holder_disks);
437#endif 437#endif
438 inode_init_once(&ei->vfs_inode); 438 inode_init_once(&ei->vfs_inode);
439 /* Initialize mutex for freeze. */ 439 /* Initialize mutex for freeze. */
@@ -473,7 +473,7 @@ static const struct super_operations bdev_sops = {
473static struct dentry *bd_mount(struct file_system_type *fs_type, 473static struct dentry *bd_mount(struct file_system_type *fs_type,
474 int flags, const char *dev_name, void *data) 474 int flags, const char *dev_name, void *data)
475{ 475{
476 return mount_pseudo(fs_type, "bdev:", &bdev_sops, 0x62646576); 476 return mount_pseudo(fs_type, "bdev:", &bdev_sops, NULL, 0x62646576);
477} 477}
478 478
479static struct file_system_type bd_type = { 479static struct file_system_type bd_type = {
@@ -669,7 +669,7 @@ static bool bd_may_claim(struct block_device *bdev, struct block_device *whole,
669 else if (bdev->bd_contains == bdev) 669 else if (bdev->bd_contains == bdev)
670 return true; /* is a whole device which isn't held */ 670 return true; /* is a whole device which isn't held */
671 671
672 else if (whole->bd_holder == bd_claim) 672 else if (whole->bd_holder == bd_may_claim)
673 return true; /* is a partition of a device that is being partitioned */ 673 return true; /* is a partition of a device that is being partitioned */
674 else if (whole->bd_holder != NULL) 674 else if (whole->bd_holder != NULL)
675 return false; /* is a partition of a held device */ 675 return false; /* is a partition of a held device */
@@ -781,439 +781,142 @@ static struct block_device *bd_start_claiming(struct block_device *bdev,
781 } 781 }
782} 782}
783 783
784/* releases bdev_lock */ 784#ifdef CONFIG_SYSFS
785static void __bd_abort_claiming(struct block_device *whole, void *holder) 785struct bd_holder_disk {
786{ 786 struct list_head list;
787 BUG_ON(whole->bd_claiming != holder); 787 struct gendisk *disk;
788 whole->bd_claiming = NULL; 788 int refcnt;
789 wake_up_bit(&whole->bd_claiming, 0); 789};
790
791 spin_unlock(&bdev_lock);
792 bdput(whole);
793}
794
795/**
796 * bd_abort_claiming - abort claiming a block device
797 * @whole: whole block device returned by bd_start_claiming()
798 * @holder: holder trying to claim @bdev
799 *
800 * Abort a claiming block started by bd_start_claiming(). Note that
801 * @whole is not the block device to be claimed but the whole device
802 * returned by bd_start_claiming().
803 *
804 * CONTEXT:
805 * Grabs and releases bdev_lock.
806 */
807static void bd_abort_claiming(struct block_device *whole, void *holder)
808{
809 spin_lock(&bdev_lock);
810 __bd_abort_claiming(whole, holder); /* releases bdev_lock */
811}
812
813/* increment holders when we have a legitimate claim. requires bdev_lock */
814static void __bd_claim(struct block_device *bdev, struct block_device *whole,
815 void *holder)
816{
817 /* note that for a whole device bd_holders
818 * will be incremented twice, and bd_holder will
819 * be set to bd_claim before being set to holder
820 */
821 whole->bd_holders++;
822 whole->bd_holder = bd_claim;
823 bdev->bd_holders++;
824 bdev->bd_holder = holder;
825}
826
827/**
828 * bd_finish_claiming - finish claiming a block device
829 * @bdev: block device of interest (passed to bd_start_claiming())
830 * @whole: whole block device returned by bd_start_claiming()
831 * @holder: holder trying to claim @bdev
832 *
833 * Finish a claiming block started by bd_start_claiming().
834 *
835 * CONTEXT:
836 * Grabs and releases bdev_lock.
837 */
838static void bd_finish_claiming(struct block_device *bdev,
839 struct block_device *whole, void *holder)
840{
841 spin_lock(&bdev_lock);
842 BUG_ON(!bd_may_claim(bdev, whole, holder));
843 __bd_claim(bdev, whole, holder);
844 __bd_abort_claiming(whole, holder); /* not actually an abort */
845}
846 790
847/** 791static struct bd_holder_disk *bd_find_holder_disk(struct block_device *bdev,
848 * bd_claim - claim a block device 792 struct gendisk *disk)
849 * @bdev: block device to claim
850 * @holder: holder trying to claim @bdev
851 *
852 * Try to claim @bdev which must have been opened successfully.
853 *
854 * CONTEXT:
855 * Might sleep.
856 *
857 * RETURNS:
858 * 0 if successful, -EBUSY if @bdev is already claimed.
859 */
860int bd_claim(struct block_device *bdev, void *holder)
861{ 793{
862 struct block_device *whole = bdev->bd_contains; 794 struct bd_holder_disk *holder;
863 int res;
864 795
865 might_sleep(); 796 list_for_each_entry(holder, &bdev->bd_holder_disks, list)
866 797 if (holder->disk == disk)
867 spin_lock(&bdev_lock); 798 return holder;
868 res = bd_prepare_to_claim(bdev, whole, holder); 799 return NULL;
869 if (res == 0)
870 __bd_claim(bdev, whole, holder);
871 spin_unlock(&bdev_lock);
872
873 return res;
874}
875EXPORT_SYMBOL(bd_claim);
876
877void bd_release(struct block_device *bdev)
878{
879 spin_lock(&bdev_lock);
880 if (!--bdev->bd_contains->bd_holders)
881 bdev->bd_contains->bd_holder = NULL;
882 if (!--bdev->bd_holders)
883 bdev->bd_holder = NULL;
884 spin_unlock(&bdev_lock);
885} 800}
886 801
887EXPORT_SYMBOL(bd_release);
888
889#ifdef CONFIG_SYSFS
890/*
891 * Functions for bd_claim_by_kobject / bd_release_from_kobject
892 *
893 * If a kobject is passed to bd_claim_by_kobject()
894 * and the kobject has a parent directory,
895 * following symlinks are created:
896 * o from the kobject to the claimed bdev
897 * o from "holders" directory of the bdev to the parent of the kobject
898 * bd_release_from_kobject() removes these symlinks.
899 *
900 * Example:
901 * If /dev/dm-0 maps to /dev/sda, kobject corresponding to
902 * /sys/block/dm-0/slaves is passed to bd_claim_by_kobject(), then:
903 * /sys/block/dm-0/slaves/sda --> /sys/block/sda
904 * /sys/block/sda/holders/dm-0 --> /sys/block/dm-0
905 */
906
907static int add_symlink(struct kobject *from, struct kobject *to) 802static int add_symlink(struct kobject *from, struct kobject *to)
908{ 803{
909 if (!from || !to)
910 return 0;
911 return sysfs_create_link(from, to, kobject_name(to)); 804 return sysfs_create_link(from, to, kobject_name(to));
912} 805}
913 806
914static void del_symlink(struct kobject *from, struct kobject *to) 807static void del_symlink(struct kobject *from, struct kobject *to)
915{ 808{
916 if (!from || !to)
917 return;
918 sysfs_remove_link(from, kobject_name(to)); 809 sysfs_remove_link(from, kobject_name(to));
919} 810}
920 811
-/*
- * 'struct bd_holder' contains pointers to kobjects symlinked by
- * bd_claim_by_kobject.
- * It's connected to bd_holder_list which is protected by bdev->bd_sem.
- */
-struct bd_holder {
-        struct list_head list;  /* chain of holders of the bdev */
-        int count;              /* references from the holder */
-        struct kobject *sdir;   /* holder object, e.g. "/block/dm-0/slaves" */
-        struct kobject *hdev;   /* e.g. "/block/dm-0" */
-        struct kobject *hdir;   /* e.g. "/block/sda/holders" */
-        struct kobject *sdev;   /* e.g. "/block/sda" */
-};
-
-/*
- * Get references of related kobjects at once.
- * Returns 1 on success. 0 on failure.
- *
- * Should call bd_holder_release_dirs() after successful use.
- */
-static int bd_holder_grab_dirs(struct block_device *bdev,
-                               struct bd_holder *bo)
-{
-        if (!bdev || !bo)
-                return 0;
-
-        bo->sdir = kobject_get(bo->sdir);
-        if (!bo->sdir)
-                return 0;
-
-        bo->hdev = kobject_get(bo->sdir->parent);
-        if (!bo->hdev)
-                goto fail_put_sdir;
-
-        bo->sdev = kobject_get(&part_to_dev(bdev->bd_part)->kobj);
-        if (!bo->sdev)
-                goto fail_put_hdev;
-
-        bo->hdir = kobject_get(bdev->bd_part->holder_dir);
-        if (!bo->hdir)
-                goto fail_put_sdev;
-
-        return 1;
-
-fail_put_sdev:
-        kobject_put(bo->sdev);
-fail_put_hdev:
-        kobject_put(bo->hdev);
-fail_put_sdir:
-        kobject_put(bo->sdir);
-
-        return 0;
-}
-
-/* Put references of related kobjects at once. */
-static void bd_holder_release_dirs(struct bd_holder *bo)
-{
-        kobject_put(bo->hdir);
-        kobject_put(bo->sdev);
-        kobject_put(bo->hdev);
-        kobject_put(bo->sdir);
-}
-
-static struct bd_holder *alloc_bd_holder(struct kobject *kobj)
-{
-        struct bd_holder *bo;
-
-        bo = kzalloc(sizeof(*bo), GFP_KERNEL);
-        if (!bo)
-                return NULL;
-
-        bo->count = 1;
-        bo->sdir = kobj;
-
-        return bo;
-}
-
-static void free_bd_holder(struct bd_holder *bo)
-{
-        kfree(bo);
-}
-
-/**
- * find_bd_holder - find matching struct bd_holder from the block device
- *
- * @bdev:	struct block device to be searched
- * @bo:		target struct bd_holder
- *
- * Returns matching entry with @bo in @bdev->bd_holder_list.
- * If found, increment the reference count and return the pointer.
- * If not found, returns NULL.
- */
-static struct bd_holder *find_bd_holder(struct block_device *bdev,
-                                        struct bd_holder *bo)
-{
-        struct bd_holder *tmp;
-
-        list_for_each_entry(tmp, &bdev->bd_holder_list, list)
-                if (tmp->sdir == bo->sdir) {
-                        tmp->count++;
-                        return tmp;
-                }
-
-        return NULL;
-}
-
-/**
- * add_bd_holder - create sysfs symlinks for bd_claim() relationship
- *
- * @bdev:	block device to be bd_claimed
- * @bo:		preallocated and initialized by alloc_bd_holder()
- *
- * Add @bo to @bdev->bd_holder_list, create symlinks.
- *
- * Returns 0 if symlinks are created.
- * Returns -ve if something fails.
- */
-static int add_bd_holder(struct block_device *bdev, struct bd_holder *bo)
-{
-        int err;
-
-        if (!bo)
-                return -EINVAL;
-
-        if (!bd_holder_grab_dirs(bdev, bo))
-                return -EBUSY;
-
-        err = add_symlink(bo->sdir, bo->sdev);
-        if (err)
-                return err;
-
-        err = add_symlink(bo->hdir, bo->hdev);
-        if (err) {
-                del_symlink(bo->sdir, bo->sdev);
-                return err;
-        }
-
-        list_add_tail(&bo->list, &bdev->bd_holder_list);
-        return 0;
-}
-
-/**
- * del_bd_holder - delete sysfs symlinks for bd_claim() relationship
- *
- * @bdev:	block device to be bd_claimed
- * @kobj:	holder's kobject
- *
- * If there is matching entry with @kobj in @bdev->bd_holder_list
- * and no other bd_claim() from the same kobject,
- * remove the struct bd_holder from the list, delete symlinks for it.
- *
- * Returns a pointer to the struct bd_holder when it's removed from the list
- * and ready to be freed.
- * Returns NULL if matching claim isn't found or there is other bd_claim()
- * by the same kobject.
- */
-static struct bd_holder *del_bd_holder(struct block_device *bdev,
-                                       struct kobject *kobj)
-{
-        struct bd_holder *bo;
-
-        list_for_each_entry(bo, &bdev->bd_holder_list, list) {
-                if (bo->sdir == kobj) {
-                        bo->count--;
-                        BUG_ON(bo->count < 0);
-                        if (!bo->count) {
-                                list_del(&bo->list);
-                                del_symlink(bo->sdir, bo->sdev);
-                                del_symlink(bo->hdir, bo->hdev);
-                                bd_holder_release_dirs(bo);
-                                return bo;
-                        }
-                        break;
-                }
-        }
-
-        return NULL;
-}
-
-/**
- * bd_claim_by_kobject - bd_claim() with additional kobject signature
- *
- * @bdev:	block device to be claimed
- * @holder:	holder's signature
- * @kobj:	holder's kobject
- *
- * Do bd_claim() and if it succeeds, create sysfs symlinks between
- * the bdev and the holder's kobject.
- * Use bd_release_from_kobject() when releasing the claimed bdev.
- *
- * Returns 0 on success. (same as bd_claim())
- * Returns errno on failure.
- */
-static int bd_claim_by_kobject(struct block_device *bdev, void *holder,
-                               struct kobject *kobj)
-{
-        int err;
-        struct bd_holder *bo, *found;
-
-        if (!kobj)
-                return -EINVAL;
-
-        bo = alloc_bd_holder(kobj);
-        if (!bo)
-                return -ENOMEM;
-
-        mutex_lock(&bdev->bd_mutex);
-
-        err = bd_claim(bdev, holder);
-        if (err)
-                goto fail;
-
-        found = find_bd_holder(bdev, bo);
-        if (found)
-                goto fail;
-
-        err = add_bd_holder(bdev, bo);
-        if (err)
-                bd_release(bdev);
-        else
-                bo = NULL;
-fail:
-        mutex_unlock(&bdev->bd_mutex);
-        free_bd_holder(bo);
-        return err;
-}
-
-/**
- * bd_release_from_kobject - bd_release() with additional kobject signature
- *
- * @bdev:	block device to be released
- * @kobj:	holder's kobject
- *
- * Do bd_release() and remove sysfs symlinks created by bd_claim_by_kobject().
- */
-static void bd_release_from_kobject(struct block_device *bdev,
-                                    struct kobject *kobj)
-{
-        if (!kobj)
-                return;
-
-        mutex_lock(&bdev->bd_mutex);
-        bd_release(bdev);
-        free_bd_holder(del_bd_holder(bdev, kobj));
-        mutex_unlock(&bdev->bd_mutex);
-}
-
-/**
- * bd_claim_by_disk - wrapper function for bd_claim_by_kobject()
- *
- * @bdev:	block device to be claimed
- * @holder:	holder's signature
- * @disk:	holder's gendisk
- *
- * Call bd_claim_by_kobject() with getting @disk->slave_dir.
- */
-int bd_claim_by_disk(struct block_device *bdev, void *holder,
-                     struct gendisk *disk)
-{
-        return bd_claim_by_kobject(bdev, holder, kobject_get(disk->slave_dir));
-}
-EXPORT_SYMBOL_GPL(bd_claim_by_disk);
-
-/**
- * bd_release_from_disk - wrapper function for bd_release_from_kobject()
- *
- * @bdev:	block device to be claimed
- * @disk:	holder's gendisk
- *
- * Call bd_release_from_kobject() and put @disk->slave_dir.
- */
-void bd_release_from_disk(struct block_device *bdev, struct gendisk *disk)
-{
-        bd_release_from_kobject(bdev, disk->slave_dir);
-        kobject_put(disk->slave_dir);
-}
-EXPORT_SYMBOL_GPL(bd_release_from_disk);
-#endif
-
-/*
- * Tries to open block device by device number.  Use it ONLY if you
- * really do not have anything better - i.e. when you are behind a
- * truly sucky interface and all you are given is a device number.  _Never_
- * to be used for internal purposes.  If you ever need it - reconsider
- * your API.
- */
-struct block_device *open_by_devnum(dev_t dev, fmode_t mode)
-{
-        struct block_device *bdev = bdget(dev);
-        int err = -ENOMEM;
-        if (bdev)
-                err = blkdev_get(bdev, mode);
-        return err ? ERR_PTR(err) : bdev;
-}
-
-EXPORT_SYMBOL(open_by_devnum);
+/**
+ * bd_link_disk_holder - create symlinks between holding disk and slave bdev
+ * @bdev: the claimed slave bdev
+ * @disk: the holding disk
+ *
+ * DON'T USE THIS UNLESS YOU'RE ALREADY USING IT.
+ *
+ * This function creates the following sysfs symlinks.
+ *
+ * - from "slaves" directory of the holder @disk to the claimed @bdev
+ * - from "holders" directory of the @bdev to the holder @disk
+ *
+ * For example, if /dev/dm-0 maps to /dev/sda and disk for dm-0 is
+ * passed to bd_link_disk_holder(), then:
+ *
+ *   /sys/block/dm-0/slaves/sda --> /sys/block/sda
+ *   /sys/block/sda/holders/dm-0 --> /sys/block/dm-0
+ *
+ * The caller must have claimed @bdev before calling this function and
+ * ensure that both @bdev and @disk are valid during the creation and
+ * lifetime of these symlinks.
+ *
+ * CONTEXT:
+ * Might sleep.
+ *
+ * RETURNS:
+ * 0 on success, -errno on failure.
+ */
+int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk)
+{
+        struct bd_holder_disk *holder;
+        int ret = 0;
+
+        mutex_lock(&bdev->bd_mutex);
+
+        WARN_ON_ONCE(!bdev->bd_holder);
+
+        /* FIXME: remove the following once add_disk() handles errors */
+        if (WARN_ON(!disk->slave_dir || !bdev->bd_part->holder_dir))
+                goto out_unlock;
+
+        holder = bd_find_holder_disk(bdev, disk);
+        if (holder) {
+                holder->refcnt++;
+                goto out_unlock;
+        }
+
+        holder = kzalloc(sizeof(*holder), GFP_KERNEL);
+        if (!holder) {
+                ret = -ENOMEM;
+                goto out_unlock;
+        }
+
+        INIT_LIST_HEAD(&holder->list);
+        holder->disk = disk;
+        holder->refcnt = 1;
+
+        ret = add_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj);
+        if (ret)
+                goto out_free;
+
+        ret = add_symlink(bdev->bd_part->holder_dir, &disk_to_dev(disk)->kobj);
+        if (ret)
+                goto out_del;
+
+        list_add(&holder->list, &bdev->bd_holder_disks);
+        goto out_unlock;
+
+out_del:
+        del_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj);
+out_free:
+        kfree(holder);
+out_unlock:
+        mutex_unlock(&bdev->bd_mutex);
+        return ret;
+}
+EXPORT_SYMBOL_GPL(bd_link_disk_holder);
+
+/**
+ * bd_unlink_disk_holder - destroy symlinks created by bd_link_disk_holder()
+ * @bdev: the claimed slave bdev
+ * @disk: the holding disk
+ *
+ * DON'T USE THIS UNLESS YOU'RE ALREADY USING IT.
+ *
+ * CONTEXT:
+ * Might sleep.
+ */
+void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk)
+{
+        struct bd_holder_disk *holder;
+
+        mutex_lock(&bdev->bd_mutex);
+
+        holder = bd_find_holder_disk(bdev, disk);
+
+        if (!WARN_ON_ONCE(holder == NULL) && !--holder->refcnt) {
+                del_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj);
+                del_symlink(bdev->bd_part->holder_dir,
+                            &disk_to_dev(disk)->kobj);
+                list_del_init(&holder->list);
+                kfree(holder);
+        }
+
+        mutex_unlock(&bdev->bd_mutex);
+}
+EXPORT_SYMBOL_GPL(bd_unlink_disk_holder);
+#endif
 
 /**
  * flush_disk - invalidates all buffer-cache entries on a disk
@@ -1309,10 +1012,11 @@ int check_disk_change(struct block_device *bdev)
 {
         struct gendisk *disk = bdev->bd_disk;
         const struct block_device_operations *bdops = disk->fops;
+        unsigned int events;
 
-        if (!bdops->media_changed)
-                return 0;
-        if (!bdops->media_changed(bdev->bd_disk))
+        events = disk_clear_events(disk, DISK_EVENT_MEDIA_CHANGE |
+                                   DISK_EVENT_EJECT_REQUEST);
+        if (!(events & DISK_EVENT_MEDIA_CHANGE))
                 return 0;
 
         flush_disk(bdev);
@@ -1475,17 +1179,171 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
         return ret;
 }
 
-int blkdev_get(struct block_device *bdev, fmode_t mode)
+/**
+ * blkdev_get - open a block device
+ * @bdev: block_device to open
+ * @mode: FMODE_* mask
+ * @holder: exclusive holder identifier
+ *
+ * Open @bdev with @mode.  If @mode includes %FMODE_EXCL, @bdev is
+ * opened with exclusive access.  Specifying %FMODE_EXCL with %NULL
+ * @holder is invalid.  Exclusive opens may nest for the same @holder.
+ *
+ * On success, the reference count of @bdev is unchanged.  On failure,
+ * @bdev is put.
+ *
+ * CONTEXT:
+ * Might sleep.
+ *
+ * RETURNS:
+ * 0 on success, -errno on failure.
+ */
+int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder)
 {
-        return __blkdev_get(bdev, mode, 0);
+        struct block_device *whole = NULL;
+        int res;
+
+        WARN_ON_ONCE((mode & FMODE_EXCL) && !holder);
+
+        if ((mode & FMODE_EXCL) && holder) {
+                whole = bd_start_claiming(bdev, holder);
+                if (IS_ERR(whole)) {
+                        bdput(bdev);
+                        return PTR_ERR(whole);
+                }
+        }
+
+        res = __blkdev_get(bdev, mode, 0);
+
+        /* __blkdev_get() may alter read only status, check it afterwards */
+        if (!res && (mode & FMODE_WRITE) && bdev_read_only(bdev)) {
+                __blkdev_put(bdev, mode, 0);
+                res = -EACCES;
+        }
+
+        if (whole) {
+                /* finish claiming */
+                mutex_lock(&bdev->bd_mutex);
+                spin_lock(&bdev_lock);
+
+                if (!res) {
+                        BUG_ON(!bd_may_claim(bdev, whole, holder));
+                        /*
+                         * Note that for a whole device bd_holders
+                         * will be incremented twice, and bd_holder
+                         * will be set to bd_may_claim before being
+                         * set to holder
+                         */
+                        whole->bd_holders++;
+                        whole->bd_holder = bd_may_claim;
+                        bdev->bd_holders++;
+                        bdev->bd_holder = holder;
+                }
+
+                /* tell others that we're done */
+                BUG_ON(whole->bd_claiming != holder);
+                whole->bd_claiming = NULL;
+                wake_up_bit(&whole->bd_claiming, 0);
+
+                spin_unlock(&bdev_lock);
+
+                /*
+                 * Block event polling for write claims.  Any write
+                 * holder makes the write_holder state stick until all
+                 * are released.  This is good enough and tracking
+                 * individual writeable reference is too fragile given
+                 * the way @mode is used in blkdev_get/put().
+                 */
+                if (!res && (mode & FMODE_WRITE) && !bdev->bd_write_holder) {
+                        bdev->bd_write_holder = true;
+                        disk_block_events(bdev->bd_disk);
+                }
+
+                mutex_unlock(&bdev->bd_mutex);
+                bdput(whole);
+        }
+
+        return res;
 }
 EXPORT_SYMBOL(blkdev_get);
 
+/**
+ * blkdev_get_by_path - open a block device by name
+ * @path: path to the block device to open
+ * @mode: FMODE_* mask
+ * @holder: exclusive holder identifier
+ *
+ * Open the blockdevice described by the device file at @path.  @mode
+ * and @holder are identical to blkdev_get().
+ *
+ * On success, the returned block_device has reference count of one.
+ *
+ * CONTEXT:
+ * Might sleep.
+ *
+ * RETURNS:
+ * Pointer to block_device on success, ERR_PTR(-errno) on failure.
+ */
+struct block_device *blkdev_get_by_path(const char *path, fmode_t mode,
+                                        void *holder)
+{
+        struct block_device *bdev;
+        int err;
+
+        bdev = lookup_bdev(path);
+        if (IS_ERR(bdev))
+                return bdev;
+
+        err = blkdev_get(bdev, mode, holder);
+        if (err)
+                return ERR_PTR(err);
+
+        return bdev;
+}
+EXPORT_SYMBOL(blkdev_get_by_path);
+
+/**
+ * blkdev_get_by_dev - open a block device by device number
+ * @dev: device number of block device to open
+ * @mode: FMODE_* mask
+ * @holder: exclusive holder identifier
+ *
+ * Open the blockdevice described by device number @dev.  @mode and
+ * @holder are identical to blkdev_get().
+ *
+ * Use it ONLY if you really do not have anything better - i.e. when
+ * you are behind a truly sucky interface and all you are given is a
+ * device number.  _Never_ to be used for internal purposes.  If you
+ * ever need it - reconsider your API.
+ *
+ * On success, the returned block_device has reference count of one.
+ *
+ * CONTEXT:
+ * Might sleep.
+ *
+ * RETURNS:
+ * Pointer to block_device on success, ERR_PTR(-errno) on failure.
+ */
+struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder)
+{
+        struct block_device *bdev;
+        int err;
+
+        bdev = bdget(dev);
+        if (!bdev)
+                return ERR_PTR(-ENOMEM);
+
+        err = blkdev_get(bdev, mode, holder);
+        if (err)
+                return ERR_PTR(err);
+
+        return bdev;
+}
+EXPORT_SYMBOL(blkdev_get_by_dev);
+
 static int blkdev_open(struct inode * inode, struct file * filp)
 {
-        struct block_device *whole = NULL;
         struct block_device *bdev;
-        int res;
 
         /*
          * Preserve backwards compatibility and allow large file access
@@ -1506,26 +1364,9 @@ static int blkdev_open(struct inode * inode, struct file * filp)
         if (bdev == NULL)
                 return -ENOMEM;
 
-        if (filp->f_mode & FMODE_EXCL) {
-                whole = bd_start_claiming(bdev, filp);
-                if (IS_ERR(whole)) {
-                        bdput(bdev);
-                        return PTR_ERR(whole);
-                }
-        }
-
         filp->f_mapping = bdev->bd_inode->i_mapping;
 
-        res = blkdev_get(bdev, filp->f_mode);
-
-        if (whole) {
-                if (res == 0)
-                        bd_finish_claiming(bdev, whole, filp);
-                else
-                        bd_abort_claiming(whole, filp);
-        }
-
-        return res;
+        return blkdev_get(bdev, filp->f_mode, filp);
 }
 
 static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
@@ -1539,6 +1380,7 @@ static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
                 bdev->bd_part_count--;
 
         if (!--bdev->bd_openers) {
+                WARN_ON_ONCE(bdev->bd_holders);
                 sync_blockdev(bdev);
                 kill_bdev(bdev);
         }
@@ -1569,6 +1411,44 @@ static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
 
 int blkdev_put(struct block_device *bdev, fmode_t mode)
 {
+        if (mode & FMODE_EXCL) {
+                bool bdev_free;
+
+                /*
+                 * Release a claim on the device.  The holder fields
+                 * are protected with bdev_lock.  bd_mutex is to
+                 * synchronize disk_holder unlinking.
+                 */
+                mutex_lock(&bdev->bd_mutex);
+                spin_lock(&bdev_lock);
+
+                WARN_ON_ONCE(--bdev->bd_holders < 0);
+                WARN_ON_ONCE(--bdev->bd_contains->bd_holders < 0);
+
+                /* bd_contains might point to self, check in a separate step */
+                if ((bdev_free = !bdev->bd_holders))
+                        bdev->bd_holder = NULL;
+                if (!bdev->bd_contains->bd_holders)
+                        bdev->bd_contains->bd_holder = NULL;
+
+                spin_unlock(&bdev_lock);
+
+                /*
+                 * If this was the last claim, remove the holder link and
+                 * unblock event polling if it was a write holder.
+                 */
+                if (bdev_free) {
+                        if (bdev->bd_write_holder) {
+                                disk_unblock_events(bdev->bd_disk);
+                                bdev->bd_write_holder = false;
+                        } else
+                                disk_check_events(bdev->bd_disk);
+                }
+
+                mutex_unlock(&bdev->bd_mutex);
+        } else
+                disk_check_events(bdev->bd_disk);
+
         return __blkdev_put(bdev, mode, 0);
 }
 EXPORT_SYMBOL(blkdev_put);
@@ -1576,8 +1456,7 @@ EXPORT_SYMBOL(blkdev_put);
 static int blkdev_close(struct inode * inode, struct file * filp)
 {
         struct block_device *bdev = I_BDEV(filp->f_mapping->host);
-        if (bdev->bd_holder == filp)
-                bd_release(bdev);
+
         return blkdev_put(bdev, filp->f_mode);
 }
 
@@ -1722,67 +1601,6 @@ fail:
 }
 EXPORT_SYMBOL(lookup_bdev);
 
-/**
- * open_bdev_exclusive  -  open a block device by name and set it up for use
- *
- * @path:	special file representing the block device
- * @mode:	FMODE_... combination to be used
- * @holder:	owner for exclusion
- *
- * Open the blockdevice described by the special file at @path, claim it
- * for the @holder.
- */
-struct block_device *open_bdev_exclusive(const char *path, fmode_t mode, void *holder)
-{
-        struct block_device *bdev, *whole;
-        int error;
-
-        bdev = lookup_bdev(path);
-        if (IS_ERR(bdev))
-                return bdev;
-
-        whole = bd_start_claiming(bdev, holder);
-        if (IS_ERR(whole)) {
-                bdput(bdev);
-                return whole;
-        }
-
-        error = blkdev_get(bdev, mode);
-        if (error)
-                goto out_abort_claiming;
-
-        error = -EACCES;
-        if ((mode & FMODE_WRITE) && bdev_read_only(bdev))
-                goto out_blkdev_put;
-
-        bd_finish_claiming(bdev, whole, holder);
-        return bdev;
-
-out_blkdev_put:
-        blkdev_put(bdev, mode);
-out_abort_claiming:
-        bd_abort_claiming(whole, holder);
-        return ERR_PTR(error);
-}
-
-EXPORT_SYMBOL(open_bdev_exclusive);
-
-/**
- * close_bdev_exclusive  -  close a blockdevice opened by open_bdev_exclusive()
- *
- * @bdev:	blockdevice to close
- * @mode:	mode, must match that used to open.
- *
- * This is the counterpart to open_bdev_exclusive().
- */
-void close_bdev_exclusive(struct block_device *bdev, fmode_t mode)
-{
-        bd_release(bdev);
-        blkdev_put(bdev, mode);
-}
-
-EXPORT_SYMBOL(close_bdev_exclusive);
-
 int __invalidate_device(struct block_device *bdev)
 {
         struct super_block *sb = get_super(bdev);
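
With open_bdev_exclusive()/close_bdev_exclusive() and bd_claim()/bd_release() gone, exclusive opens travel through blkdev_get_by_path()/blkdev_put() with FMODE_EXCL in the mode mask plus a holder cookie. A minimal sketch of the replacement pattern, assuming a filesystem-side caller; the function names and the fs_holder token are illustrative, not part of this diff:

#include <linux/fs.h>
#include <linux/blkdev.h>
#include <linux/err.h>

/* any stable address works as the exclusive-holder cookie */
static void *fs_holder = &fs_holder;

static struct block_device *example_open_backing_dev(const char *path)
{
        /* was: open_bdev_exclusive(path, mode, holder) */
        return blkdev_get_by_path(path,
                                  FMODE_READ | FMODE_WRITE | FMODE_EXCL,
                                  fs_holder);  /* ERR_PTR(-EBUSY) if claimed */
}

static void example_close_backing_dev(struct block_device *bdev)
{
        /* was: close_bdev_exclusive(); mode must match the open, incl. FMODE_EXCL */
        blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
}

The same holder cookie must be passed for every nested exclusive open; blkdev_put() drops the claim when the last FMODE_EXCL reference goes away.
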
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
index 7bb3c020e570..ecb9fd3be143 100644
--- a/fs/btrfs/Kconfig
+++ b/fs/btrfs/Kconfig
@@ -4,6 +4,8 @@ config BTRFS_FS
 	select LIBCRC32C
 	select ZLIB_INFLATE
 	select ZLIB_DEFLATE
+	select LZO_COMPRESS
+	select LZO_DECOMPRESS
 	help
 	  Btrfs is a new filesystem with extents, writable snapshotting,
 	  support for multiple devices and many more features.
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index a35eb36b32fd..31610ea73aec 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -6,5 +6,5 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
 	   transaction.o inode.o file.o tree-defrag.o \
 	   extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
 	   extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
-	   export.o tree-log.o acl.o free-space-cache.o zlib.o \
+	   export.o tree-log.o acl.o free-space-cache.o zlib.o lzo.o \
 	   compression.o delayed-ref.o relocation.o
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 6ae2c8cac9d5..15b5ca2a2606 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -60,8 +60,10 @@ static struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
         size = __btrfs_getxattr(inode, name, value, size);
         if (size > 0) {
                 acl = posix_acl_from_xattr(value, size);
-                if (IS_ERR(acl))
+                if (IS_ERR(acl)) {
+                        kfree(value);
                         return acl;
+                }
                 set_cached_acl(inode, type, acl);
         }
         kfree(value);
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 6ad63f17eca0..ccc991c542df 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -157,7 +157,7 @@ struct btrfs_inode {
         /*
          * always compress this one file
          */
-        unsigned force_compress:1;
+        unsigned force_compress:4;
 
         struct inode vfs_inode;
 };
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index b50bc4bd5c56..f745287fbf2e 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -62,6 +62,9 @@ struct compressed_bio {
         /* number of bytes on disk */
         unsigned long compressed_len;
 
+        /* the compression algorithm for this bio */
+        int compress_type;
+
         /* number of compressed pages in the array */
         unsigned long nr_pages;
 
@@ -173,11 +176,12 @@ static void end_compressed_bio_read(struct bio *bio, int err)
         /* ok, we're the last bio for this extent, lets start
          * the decompression.
          */
-        ret = btrfs_zlib_decompress_biovec(cb->compressed_pages,
-                                        cb->start,
-                                        cb->orig_bio->bi_io_vec,
-                                        cb->orig_bio->bi_vcnt,
-                                        cb->compressed_len);
+        ret = btrfs_decompress_biovec(cb->compress_type,
+                                      cb->compressed_pages,
+                                      cb->start,
+                                      cb->orig_bio->bi_io_vec,
+                                      cb->orig_bio->bi_vcnt,
+                                      cb->compressed_len);
 csum_failed:
         if (ret)
                 cb->errors = 1;
@@ -588,6 +592,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 
         cb->len = uncompressed_len;
         cb->compressed_len = compressed_len;
+        cb->compress_type = extent_compress_type(bio_flags);
         cb->orig_bio = bio;
 
         nr_pages = (compressed_len + PAGE_CACHE_SIZE - 1) /
@@ -677,3 +682,317 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
         bio_put(comp_bio);
         return 0;
 }
+
+static struct list_head comp_idle_workspace[BTRFS_COMPRESS_TYPES];
+static spinlock_t comp_workspace_lock[BTRFS_COMPRESS_TYPES];
+static int comp_num_workspace[BTRFS_COMPRESS_TYPES];
+static atomic_t comp_alloc_workspace[BTRFS_COMPRESS_TYPES];
+static wait_queue_head_t comp_workspace_wait[BTRFS_COMPRESS_TYPES];
+
+struct btrfs_compress_op *btrfs_compress_op[] = {
+        &btrfs_zlib_compress,
+        &btrfs_lzo_compress,
+};
+
+int __init btrfs_init_compress(void)
+{
+        int i;
+
+        for (i = 0; i < BTRFS_COMPRESS_TYPES; i++) {
+                INIT_LIST_HEAD(&comp_idle_workspace[i]);
+                spin_lock_init(&comp_workspace_lock[i]);
+                atomic_set(&comp_alloc_workspace[i], 0);
+                init_waitqueue_head(&comp_workspace_wait[i]);
+        }
+        return 0;
+}
+
+/*
+ * This finds an available workspace or allocates a new one.
+ * ERR_PTR is returned if things go bad.
+ */
+static struct list_head *find_workspace(int type)
+{
+        struct list_head *workspace;
+        int cpus = num_online_cpus();
+        int idx = type - 1;
+
+        struct list_head *idle_workspace        = &comp_idle_workspace[idx];
+        spinlock_t *workspace_lock              = &comp_workspace_lock[idx];
+        atomic_t *alloc_workspace               = &comp_alloc_workspace[idx];
+        wait_queue_head_t *workspace_wait       = &comp_workspace_wait[idx];
+        int *num_workspace                      = &comp_num_workspace[idx];
+again:
+        spin_lock(workspace_lock);
+        if (!list_empty(idle_workspace)) {
+                workspace = idle_workspace->next;
+                list_del(workspace);
+                (*num_workspace)--;
+                spin_unlock(workspace_lock);
+                return workspace;
+        }
+        if (atomic_read(alloc_workspace) > cpus) {
+                DEFINE_WAIT(wait);
+
+                spin_unlock(workspace_lock);
+                prepare_to_wait(workspace_wait, &wait, TASK_UNINTERRUPTIBLE);
+                if (atomic_read(alloc_workspace) > cpus && !*num_workspace)
+                        schedule();
+                finish_wait(workspace_wait, &wait);
+                goto again;
+        }
+        atomic_inc(alloc_workspace);
+        spin_unlock(workspace_lock);
+
+        workspace = btrfs_compress_op[idx]->alloc_workspace();
+        if (IS_ERR(workspace)) {
+                atomic_dec(alloc_workspace);
+                wake_up(workspace_wait);
+        }
+        return workspace;
+}
+
+/*
+ * Put a workspace struct back on the list or free it if we have enough
+ * idle ones sitting around.
+ */
+static void free_workspace(int type, struct list_head *workspace)
+{
+        int idx = type - 1;
+        struct list_head *idle_workspace        = &comp_idle_workspace[idx];
+        spinlock_t *workspace_lock              = &comp_workspace_lock[idx];
+        atomic_t *alloc_workspace               = &comp_alloc_workspace[idx];
+        wait_queue_head_t *workspace_wait       = &comp_workspace_wait[idx];
+        int *num_workspace                      = &comp_num_workspace[idx];
+
+        spin_lock(workspace_lock);
+        if (*num_workspace < num_online_cpus()) {
+                list_add_tail(workspace, idle_workspace);
+                (*num_workspace)++;
+                spin_unlock(workspace_lock);
+                goto wake;
+        }
+        spin_unlock(workspace_lock);
+
+        btrfs_compress_op[idx]->free_workspace(workspace);
+        atomic_dec(alloc_workspace);
+wake:
+        if (waitqueue_active(workspace_wait))
+                wake_up(workspace_wait);
+}
+
+/*
+ * Cleanup function for module exit.
+ */
+static void free_workspaces(void)
+{
+        struct list_head *workspace;
+        int i;
+
+        for (i = 0; i < BTRFS_COMPRESS_TYPES; i++) {
+                while (!list_empty(&comp_idle_workspace[i])) {
+                        workspace = comp_idle_workspace[i].next;
+                        list_del(workspace);
+                        btrfs_compress_op[i]->free_workspace(workspace);
+                        atomic_dec(&comp_alloc_workspace[i]);
+                }
+        }
+}
+
+/*
+ * Given an address space and start/len, compress the bytes.
+ *
+ * Pages are allocated to hold the compressed result and stored
+ * in 'pages'.
+ *
+ * out_pages is used to return the number of pages allocated.  There
+ * may be pages allocated even if we return an error.
+ *
+ * total_in is used to return the number of bytes actually read.  It
+ * may be smaller than len if we had to exit early because we
+ * ran out of room in the pages array or because we crossed the
+ * max_out threshold.
+ *
+ * total_out is used to return the total number of compressed bytes.
+ *
+ * max_out tells us the max number of bytes that we're allowed to
+ * stuff into pages.
+ */
+int btrfs_compress_pages(int type, struct address_space *mapping,
+                         u64 start, unsigned long len,
+                         struct page **pages,
+                         unsigned long nr_dest_pages,
+                         unsigned long *out_pages,
+                         unsigned long *total_in,
+                         unsigned long *total_out,
+                         unsigned long max_out)
+{
+        struct list_head *workspace;
+        int ret;
+
+        workspace = find_workspace(type);
+        if (IS_ERR(workspace))
+                return -1;
+
+        ret = btrfs_compress_op[type-1]->compress_pages(workspace, mapping,
+                                                      start, len, pages,
+                                                      nr_dest_pages, out_pages,
+                                                      total_in, total_out,
+                                                      max_out);
+        free_workspace(type, workspace);
+        return ret;
+}
+
+/*
+ * pages_in is an array of pages with compressed data.
+ *
+ * disk_start is the starting logical offset of this array in the file.
+ *
+ * bvec is a bio_vec of pages from the file that we want to decompress into.
+ *
+ * vcnt is the count of pages in the biovec.
+ *
+ * srclen is the number of bytes in pages_in.
+ *
+ * The basic idea is that we have a bio that was created by readpages.
+ * The pages in the bio are for the uncompressed data, and they may not
+ * be contiguous.  They all correspond to the range of bytes covered by
+ * the compressed extent.
+ */
+int btrfs_decompress_biovec(int type, struct page **pages_in, u64 disk_start,
+                            struct bio_vec *bvec, int vcnt, size_t srclen)
+{
+        struct list_head *workspace;
+        int ret;
+
+        workspace = find_workspace(type);
+        if (IS_ERR(workspace))
+                return -ENOMEM;
+
+        ret = btrfs_compress_op[type-1]->decompress_biovec(workspace, pages_in,
+                                                         disk_start,
+                                                         bvec, vcnt, srclen);
+        free_workspace(type, workspace);
+        return ret;
+}
+
+/*
+ * A less complex decompression routine.  Our compressed data fits in a
+ * single page, and we want to read a single page out of it.
+ * start_byte tells us the offset into the compressed data we're interested in.
+ */
+int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page,
+                     unsigned long start_byte, size_t srclen, size_t destlen)
+{
+        struct list_head *workspace;
+        int ret;
+
+        workspace = find_workspace(type);
+        if (IS_ERR(workspace))
+                return -ENOMEM;
+
+        ret = btrfs_compress_op[type-1]->decompress(workspace, data_in,
+                                                  dest_page, start_byte,
+                                                  srclen, destlen);
+
+        free_workspace(type, workspace);
+        return ret;
+}
+
+void __exit btrfs_exit_compress(void)
+{
+        free_workspaces();
+}
+
+/*
+ * Copy uncompressed data from working buffer to pages.
+ *
+ * buf_start is the byte offset of the start of our working buffer
+ * within the uncompressed data.
+ *
+ * total_out is the last byte of the buffer.
+ */
+int btrfs_decompress_buf2page(char *buf, unsigned long buf_start,
+                              unsigned long total_out, u64 disk_start,
+                              struct bio_vec *bvec, int vcnt,
+                              unsigned long *page_index,
+                              unsigned long *pg_offset)
+{
+        unsigned long buf_offset;
+        unsigned long current_buf_start;
+        unsigned long start_byte;
+        unsigned long working_bytes = total_out - buf_start;
+        unsigned long bytes;
+        char *kaddr;
+        struct page *page_out = bvec[*page_index].bv_page;
+
+        /*
+         * start_byte is the first byte of the page we're currently
+         * copying into relative to the start of the compressed data.
+         */
+        start_byte = page_offset(page_out) - disk_start;
+
+        /* we haven't yet hit data corresponding to this page */
+        if (total_out <= start_byte)
+                return 1;
+
+        /*
+         * the start of the data we care about is offset into
+         * the middle of our working buffer
+         */
+        if (total_out > start_byte && buf_start < start_byte) {
+                buf_offset = start_byte - buf_start;
+                working_bytes -= buf_offset;
+        } else {
+                buf_offset = 0;
+        }
+        current_buf_start = buf_start;
+
+        /* copy bytes from the working buffer into the pages */
+        while (working_bytes > 0) {
+                bytes = min(PAGE_CACHE_SIZE - *pg_offset,
+                            PAGE_CACHE_SIZE - buf_offset);
+                bytes = min(bytes, working_bytes);
+                kaddr = kmap_atomic(page_out, KM_USER0);
+                memcpy(kaddr + *pg_offset, buf + buf_offset, bytes);
+                kunmap_atomic(kaddr, KM_USER0);
+                flush_dcache_page(page_out);
+
+                *pg_offset += bytes;
+                buf_offset += bytes;
+                working_bytes -= bytes;
+                current_buf_start += bytes;
+
+                /* check if we need to pick another page */
+                if (*pg_offset == PAGE_CACHE_SIZE) {
+                        (*page_index)++;
+                        if (*page_index >= vcnt)
+                                return 0;
+
+                        page_out = bvec[*page_index].bv_page;
+                        *pg_offset = 0;
+                        start_byte = page_offset(page_out) - disk_start;
+
+                        /*
+                         * make sure our new page is covered by this
+                         * working buffer
+                         */
+                        if (total_out <= start_byte)
+                                return 1;
+
+                        /*
+                         * the next page in the biovec might not be adjacent
+                         * to the last page, but it might still be found
+                         * inside this working buffer.  bump our offset pointer
+                         */
+                        if (total_out > start_byte &&
+                            current_buf_start < start_byte) {
+                                buf_offset = start_byte - buf_start;
+                                working_bytes = total_out - start_byte;
+                                current_buf_start = buf_start + buf_offset;
+                        }
+                }
+        }
+
+        return 1;
+}
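
The old btrfs_zlib_* entry points are now reached through these type-dispatched wrappers. A hedged sketch of a caller; the function and its arguments are hypothetical, and only btrfs_compress_pages() plus the BTRFS_COMPRESS_LZO constant come from this diff:

#include "ctree.h"
#include "compression.h"

/* compress one range of an inode's pages with LZO (illustrative only) */
static int example_compress_range(struct inode *inode, u64 start,
                                  unsigned long len, struct page **pages,
                                  unsigned long nr_dest_pages)
{
        unsigned long out_pages, total_in, total_out;

        /* type - 1 indexes btrfs_compress_op[], so LZO picks btrfs_lzo_compress */
        return btrfs_compress_pages(BTRFS_COMPRESS_LZO, inode->i_mapping,
                                    start, len, pages, nr_dest_pages,
                                    &out_pages, &total_in, &total_out,
                                    len /* max_out */);
}
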
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index 421f5b4aa715..51000174b9d7 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -19,24 +19,27 @@
 #ifndef __BTRFS_COMPRESSION_
 #define __BTRFS_COMPRESSION_
 
-int btrfs_zlib_decompress(unsigned char *data_in,
-                          struct page *dest_page,
-                          unsigned long start_byte,
-                          size_t srclen, size_t destlen);
-int btrfs_zlib_compress_pages(struct address_space *mapping,
-                              u64 start, unsigned long len,
-                              struct page **pages,
-                              unsigned long nr_dest_pages,
-                              unsigned long *out_pages,
-                              unsigned long *total_in,
-                              unsigned long *total_out,
-                              unsigned long max_out);
-int btrfs_zlib_decompress_biovec(struct page **pages_in,
-                              u64 disk_start,
-                              struct bio_vec *bvec,
-                              int vcnt,
-                              size_t srclen);
-void btrfs_zlib_exit(void);
+int btrfs_init_compress(void);
+void btrfs_exit_compress(void);
+
+int btrfs_compress_pages(int type, struct address_space *mapping,
+                         u64 start, unsigned long len,
+                         struct page **pages,
+                         unsigned long nr_dest_pages,
+                         unsigned long *out_pages,
+                         unsigned long *total_in,
+                         unsigned long *total_out,
+                         unsigned long max_out);
+int btrfs_decompress_biovec(int type, struct page **pages_in, u64 disk_start,
+                            struct bio_vec *bvec, int vcnt, size_t srclen);
+int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page,
+                     unsigned long start_byte, size_t srclen, size_t destlen);
+int btrfs_decompress_buf2page(char *buf, unsigned long buf_start,
+                              unsigned long total_out, u64 disk_start,
+                              struct bio_vec *bvec, int vcnt,
+                              unsigned long *page_index,
+                              unsigned long *pg_offset);
+
 int btrfs_submit_compressed_write(struct inode *inode, u64 start,
                                   unsigned long len, u64 disk_start,
                                   unsigned long compressed_len,
@@ -44,4 +47,37 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
                                   unsigned long nr_pages);
 int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
                                  int mirror_num, unsigned long bio_flags);
+
+struct btrfs_compress_op {
+        struct list_head *(*alloc_workspace)(void);
+
+        void (*free_workspace)(struct list_head *workspace);
+
+        int (*compress_pages)(struct list_head *workspace,
+                              struct address_space *mapping,
+                              u64 start, unsigned long len,
+                              struct page **pages,
+                              unsigned long nr_dest_pages,
+                              unsigned long *out_pages,
+                              unsigned long *total_in,
+                              unsigned long *total_out,
+                              unsigned long max_out);
+
+        int (*decompress_biovec)(struct list_head *workspace,
+                                 struct page **pages_in,
+                                 u64 disk_start,
+                                 struct bio_vec *bvec,
+                                 int vcnt,
+                                 size_t srclen);
+
+        int (*decompress)(struct list_head *workspace,
+                          unsigned char *data_in,
+                          struct page *dest_page,
+                          unsigned long start_byte,
+                          size_t srclen, size_t destlen);
+};
+
+extern struct btrfs_compress_op btrfs_zlib_compress;
+extern struct btrfs_compress_op btrfs_lzo_compress;
+
 #endif
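
For reference, wiring in a further algorithm means filling this op table the way zlib.c and the new lzo.c do. A compile-only skeleton under the assumption of stub callbacks; none of these dummy_* names exist in the tree:

#include <linux/err.h>
#include "compression.h"

static struct list_head *dummy_alloc_workspace(void)
{
        /* real implementations allocate per-thread scratch buffers here */
        return ERR_PTR(-ENOMEM);
}

static void dummy_free_workspace(struct list_head *workspace)
{
}

static int dummy_compress_pages(struct list_head *workspace,
                                struct address_space *mapping,
                                u64 start, unsigned long len,
                                struct page **pages,
                                unsigned long nr_dest_pages,
                                unsigned long *out_pages,
                                unsigned long *total_in,
                                unsigned long *total_out,
                                unsigned long max_out)
{
        return -1;      /* nonzero signals "range was not compressed" */
}

static int dummy_decompress_biovec(struct list_head *workspace,
                                   struct page **pages_in, u64 disk_start,
                                   struct bio_vec *bvec, int vcnt,
                                   size_t srclen)
{
        return -EIO;
}

static int dummy_decompress(struct list_head *workspace,
                            unsigned char *data_in, struct page *dest_page,
                            unsigned long start_byte, size_t srclen,
                            size_t destlen)
{
        return -EIO;
}

struct btrfs_compress_op btrfs_dummy_compress = {
        .alloc_workspace        = dummy_alloc_workspace,
        .free_workspace         = dummy_free_workspace,
        .compress_pages         = dummy_compress_pages,
        .decompress_biovec      = dummy_decompress_biovec,
        .decompress             = dummy_decompress,
};

The table would also need an entry in btrfs_compress_op[] in compression.c and a new enum btrfs_compression_type value; both are shown in the hunks above.
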
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 9ac171599258..b5baff0dccfe 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -105,6 +105,8 @@ noinline void btrfs_clear_path_blocking(struct btrfs_path *p,
 /* this also releases the path */
 void btrfs_free_path(struct btrfs_path *p)
 {
+        if (!p)
+                return;
         btrfs_release_path(NULL, p);
         kmem_cache_free(btrfs_path_cachep, p);
 }
@@ -2514,6 +2516,9 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
         btrfs_assert_tree_locked(path->nodes[1]);
 
         right = read_node_slot(root, upper, slot + 1);
+        if (right == NULL)
+                return 1;
+
         btrfs_tree_lock(right);
         btrfs_set_lock_blocking(right);
 
@@ -2764,6 +2769,9 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
         btrfs_assert_tree_locked(path->nodes[1]);
 
         left = read_node_slot(root, path->nodes[1], slot - 1);
+        if (left == NULL)
+                return 1;
+
         btrfs_tree_lock(left);
         btrfs_set_lock_blocking(left);
 
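
The btrfs_free_path() NULL check in the first hunk above exists so error paths can free unconditionally; an illustrative (not in-tree) caller:

static int example_lookup(struct btrfs_root *root)
{
        struct btrfs_path *path = btrfs_alloc_path();
        int ret = 0;

        if (!path)
                ret = -ENOMEM;
        /* ... on success a tree search would go here ... */

        btrfs_free_path(path);  /* now a safe no-op when path == NULL */
        return ret;
}
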
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index a142d204b526..2c98b3af6052 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -27,6 +27,7 @@
 #include <linux/backing-dev.h>
 #include <linux/wait.h>
 #include <linux/slab.h>
+#include <linux/kobject.h>
 #include <asm/kmap_types.h>
 #include "extent_io.h"
 #include "extent_map.h"
@@ -294,6 +295,14 @@ static inline unsigned long btrfs_chunk_item_size(int num_stripes)
 #define BTRFS_FSID_SIZE 16
 #define BTRFS_HEADER_FLAG_WRITTEN	(1ULL << 0)
 #define BTRFS_HEADER_FLAG_RELOC	(1ULL << 1)
+
+/*
+ * File system states
+ */
+
+/* Errors detected */
+#define BTRFS_SUPER_FLAG_ERROR	(1ULL << 2)
+
 #define BTRFS_SUPER_FLAG_SEEDING	(1ULL << 32)
 #define BTRFS_SUPER_FLAG_METADUMP	(1ULL << 33)
 
@@ -398,13 +407,15 @@ struct btrfs_super_block {
 #define BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF	(1ULL << 0)
 #define BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL	(1ULL << 1)
 #define BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS	(1ULL << 2)
+#define BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO	(1ULL << 3)
 
 #define BTRFS_FEATURE_COMPAT_SUPP		0ULL
 #define BTRFS_FEATURE_COMPAT_RO_SUPP		0ULL
 #define BTRFS_FEATURE_INCOMPAT_SUPP			\
 	(BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF |		\
 	 BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL |	\
-	 BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
+	 BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS |		\
+	 BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO)
 
 /*
  * A leaf is full of items. offset and size tell us where to find
@@ -551,9 +562,11 @@ struct btrfs_timespec {
 } __attribute__ ((__packed__));
 
 enum btrfs_compression_type {
 	BTRFS_COMPRESS_NONE  = 0,
 	BTRFS_COMPRESS_ZLIB  = 1,
-	BTRFS_COMPRESS_LAST  = 2,
+	BTRFS_COMPRESS_LZO   = 2,
+	BTRFS_COMPRESS_TYPES = 2,
+	BTRFS_COMPRESS_LAST  = 3,
 };
 
 struct btrfs_inode_item {
@@ -597,6 +610,8 @@ struct btrfs_dir_item {
 	u8 type;
 } __attribute__ ((__packed__));
 
+#define BTRFS_ROOT_SUBVOL_RDONLY	(1ULL << 0)
+
 struct btrfs_root_item {
 	struct btrfs_inode_item inode;
 	__le64 generation;
@@ -895,7 +910,8 @@ struct btrfs_fs_info {
 	 */
 	u64 last_trans_log_full_commit;
 	u64 open_ioctl_trans;
-	unsigned long mount_opt:20;
+	unsigned long mount_opt:20;
+	unsigned long compress_type:4;
 	u64 max_inline;
 	u64 alloc_start;
 	struct btrfs_transaction *running_transaction;
@@ -1050,6 +1066,9 @@ struct btrfs_fs_info {
 	unsigned metadata_ratio;
 
 	void *bdev_holder;
+
+	/* filesystem state */
+	u64 fs_state;
 };
 
 /*
@@ -1893,6 +1912,11 @@ BTRFS_SETGET_STACK_FUNCS(root_limit, struct btrfs_root_item, byte_limit, 64);
 BTRFS_SETGET_STACK_FUNCS(root_last_snapshot, struct btrfs_root_item,
 			 last_snapshot, 64);
 
+static inline bool btrfs_root_readonly(struct btrfs_root *root)
+{
+	return root->root_item.flags & BTRFS_ROOT_SUBVOL_RDONLY;
+}
+
 /* struct btrfs_super_block */
 
 BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64);
@@ -2145,6 +2169,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
 int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root, u64 group_start);
 u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags);
+u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data);
 void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *ionde);
 void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
 int btrfs_check_data_free_space(struct inode *inode, u64 bytes);
@@ -2188,6 +2213,12 @@ int btrfs_set_block_group_ro(struct btrfs_root *root,
 int btrfs_set_block_group_rw(struct btrfs_root *root,
 			     struct btrfs_block_group_cache *cache);
 void btrfs_put_block_group_cache(struct btrfs_fs_info *info);
+u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo);
+int btrfs_error_unpin_extent_range(struct btrfs_root *root,
+				   u64 start, u64 end);
+int btrfs_error_discard_extent(struct btrfs_root *root, u64 bytenr,
+			       u64 num_bytes);
+
 /* ctree.c */
 int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
 		     int level, int *slot);
@@ -2541,6 +2572,14 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size);
 /* super.c */
 int btrfs_parse_options(struct btrfs_root *root, char *options);
 int btrfs_sync_fs(struct super_block *sb, int wait);
+void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
+		       unsigned int line, int errno);
+
+#define btrfs_std_error(fs_info, errno)					\
+do {									\
+	if ((errno))							\
+		__btrfs_std_error((fs_info), __func__, __LINE__, (errno));\
+} while (0)
 
 /* acl.c */
 #ifdef CONFIG_BTRFS_FS_POSIX_ACL
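
A hypothetical call site for the new error macro; only btrfs_std_error() itself comes from this diff:

static void example_handle_commit_error(struct btrfs_fs_info *fs_info, int err)
{
        /* expands to __btrfs_std_error(fs_info, __func__, __LINE__, err)
         * and does nothing when err == 0 */
        btrfs_std_error(fs_info, err);
}
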
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 51d2e4de34eb..b531c36455d8 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -44,6 +44,20 @@
 static struct extent_io_ops btree_extent_io_ops;
 static void end_workqueue_fn(struct btrfs_work *work);
 static void free_fs_root(struct btrfs_root *root);
+static void btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
+                                    int read_only);
+static int btrfs_destroy_ordered_operations(struct btrfs_root *root);
+static int btrfs_destroy_ordered_extents(struct btrfs_root *root);
+static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
+                                      struct btrfs_root *root);
+static int btrfs_destroy_pending_snapshots(struct btrfs_transaction *t);
+static int btrfs_destroy_delalloc_inodes(struct btrfs_root *root);
+static int btrfs_destroy_marked_extents(struct btrfs_root *root,
+                                        struct extent_io_tree *dirty_pages,
+                                        int mark);
+static int btrfs_destroy_pinned_extent(struct btrfs_root *root,
+                                       struct extent_io_tree *pinned_extents);
+static int btrfs_cleanup_transaction(struct btrfs_root *root);
 
 /*
  * end_io_wq structs are used to do processing in task context when an IO is
@@ -353,6 +367,10 @@ static int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
         WARN_ON(len == 0);
 
         eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS);
+        if (eb == NULL) {
+                WARN_ON(1);
+                goto out;
+        }
         ret = btree_read_extent_buffer_pages(root, eb, start + PAGE_CACHE_SIZE,
                                              btrfs_header_generation(eb));
         BUG_ON(ret);
@@ -427,6 +445,10 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
         WARN_ON(len == 0);
 
         eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS);
+        if (eb == NULL) {
+                ret = -EIO;
+                goto out;
+        }
 
         found_start = btrfs_header_bytenr(eb);
         if (found_start != start) {
@@ -1145,6 +1167,7 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
         }
         btrfs_free_path(path);
         if (ret) {
+                kfree(root);
                 if (ret > 0)
                         ret = -ENOENT;
                 return ERR_PTR(ret);
@@ -1713,8 +1736,10 @@ struct btrfs_root *open_ctree(struct super_block *sb,
                              fs_info, BTRFS_ROOT_TREE_OBJECTID);
 
         bh = btrfs_read_dev_super(fs_devices->latest_bdev);
-        if (!bh)
+        if (!bh) {
+                err = -EINVAL;
                 goto fail_iput;
+        }
 
         memcpy(&fs_info->super_copy, bh->b_data, sizeof(fs_info->super_copy));
         memcpy(&fs_info->super_for_commit, &fs_info->super_copy,
@@ -1727,6 +1752,11 @@ struct btrfs_root *open_ctree(struct super_block *sb,
         if (!btrfs_super_root(disk_super))
                 goto fail_iput;
 
+        /* check FS state, whether FS is broken. */
+        fs_info->fs_state |= btrfs_super_flags(disk_super);
+
+        btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY);
+
         ret = btrfs_parse_options(tree_root, options);
         if (ret) {
                 err = ret;
@@ -1744,10 +1774,10 @@ struct btrfs_root *open_ctree(struct super_block *sb,
         }
 
         features = btrfs_super_incompat_flags(disk_super);
-        if (!(features & BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF)) {
-                features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF;
-                btrfs_set_super_incompat_flags(disk_super, features);
-        }
+        features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF;
+        if (tree_root->fs_info->compress_type & BTRFS_COMPRESS_LZO)
+                features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO;
+        btrfs_set_super_incompat_flags(disk_super, features);
 
         features = btrfs_super_compat_ro_flags(disk_super) &
                 ~BTRFS_FEATURE_COMPAT_RO_SUPP;
@@ -1957,7 +1987,9 @@ struct btrfs_root *open_ctree(struct super_block *sb,
                 btrfs_set_opt(fs_info->mount_opt, SSD);
         }
 
-        if (btrfs_super_log_root(disk_super) != 0) {
+        /* do not make disk changes in broken FS */
+        if (btrfs_super_log_root(disk_super) != 0 &&
+            !(fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)) {
                 u64 bytenr = btrfs_super_log_root(disk_super);
 
                 if (fs_devices->rw_devices == 0) {
@@ -2442,8 +2474,28 @@ int close_ctree(struct btrfs_root *root)
         smp_mb();
 
         btrfs_put_block_group_cache(fs_info);
+
+        /*
+         * Two situations arise when a broken btrfs flips readonly:
+         *
+         * 1. If btrfs flips readonly somewhere else before
+         *    btrfs_commit_super, sb->s_flags has the MS_RDONLY flag,
+         *    and btrfs will skip writing sb directly to keep the
+         *    ERROR state on disk.
+         *
+         * 2. If btrfs flips readonly just in btrfs_commit_super,
+         *    btrfs cannot write sb via btrfs_commit_super, and since
+         *    fs_state has the BTRFS_SUPER_FLAG_ERROR flag set, btrfs
+         *    will cleanup all FS resources first and write sb then.
+         */
         if (!(fs_info->sb->s_flags & MS_RDONLY)) {
                 ret = btrfs_commit_super(root);
+                if (ret)
+                        printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
+        }
+
+        if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
+                ret = btrfs_error_commit_super(root);
                 if (ret)
                         printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
         }
@@ -2619,6 +2671,352 @@ out:
2619 return 0; 2671 return 0;
2620} 2672}
2621 2673
2674static void btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
2675 int read_only)
2676{
2677 if (read_only)
2678 return;
2679
2680 if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)
2681 printk(KERN_WARNING "warning: mount fs with errors, "
2682 "running btrfsck is recommended\n");
2683}
2684
2685int btrfs_error_commit_super(struct btrfs_root *root)
2686{
2687 int ret;
2688
2689 mutex_lock(&root->fs_info->cleaner_mutex);
2690 btrfs_run_delayed_iputs(root);
2691 mutex_unlock(&root->fs_info->cleaner_mutex);
2692
2693 down_write(&root->fs_info->cleanup_work_sem);
2694 up_write(&root->fs_info->cleanup_work_sem);
2695
2696 /* cleanup FS via transaction */
2697 btrfs_cleanup_transaction(root);
2698
2699 ret = write_ctree_super(NULL, root, 0);
2700
2701 return ret;
2702}
2703
2704static int btrfs_destroy_ordered_operations(struct btrfs_root *root)
2705{
2706 struct btrfs_inode *btrfs_inode;
2707 struct list_head splice;
2708
2709 INIT_LIST_HEAD(&splice);
2710
2711 mutex_lock(&root->fs_info->ordered_operations_mutex);
2712 spin_lock(&root->fs_info->ordered_extent_lock);
2713
2714 list_splice_init(&root->fs_info->ordered_operations, &splice);
2715 while (!list_empty(&splice)) {
2716 btrfs_inode = list_entry(splice.next, struct btrfs_inode,
2717 ordered_operations);
2718
2719 list_del_init(&btrfs_inode->ordered_operations);
2720
2721 btrfs_invalidate_inodes(btrfs_inode->root);
2722 }
2723
2724 spin_unlock(&root->fs_info->ordered_extent_lock);
2725 mutex_unlock(&root->fs_info->ordered_operations_mutex);
2726
2727 return 0;
2728}
2729
2730static int btrfs_destroy_ordered_extents(struct btrfs_root *root)
2731{
2732 struct list_head splice;
2733 struct btrfs_ordered_extent *ordered;
2734 struct inode *inode;
2735
2736 INIT_LIST_HEAD(&splice);
2737
2738 spin_lock(&root->fs_info->ordered_extent_lock);
2739
2740 list_splice_init(&root->fs_info->ordered_extents, &splice);
2741 while (!list_empty(&splice)) {
2742 ordered = list_entry(splice.next, struct btrfs_ordered_extent,
2743 root_extent_list);
2744
2745 list_del_init(&ordered->root_extent_list);
2746 atomic_inc(&ordered->refs);
2747
2748 /* the inode may be getting freed (in sys_unlink path). */
2749 inode = igrab(ordered->inode);
2750
2751 spin_unlock(&root->fs_info->ordered_extent_lock);
2752 if (inode)
2753 iput(inode);
2754
2755 atomic_set(&ordered->refs, 1);
2756 btrfs_put_ordered_extent(ordered);
2757
2758 spin_lock(&root->fs_info->ordered_extent_lock);
2759 }
2760
2761 spin_unlock(&root->fs_info->ordered_extent_lock);
2762
2763 return 0;
2764}
2765
2766static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
2767 struct btrfs_root *root)
2768{
2769 struct rb_node *node;
2770 struct btrfs_delayed_ref_root *delayed_refs;
2771 struct btrfs_delayed_ref_node *ref;
2772 int ret = 0;
2773
2774 delayed_refs = &trans->delayed_refs;
2775
2776 spin_lock(&delayed_refs->lock);
2777 if (delayed_refs->num_entries == 0) {
2778 printk(KERN_INFO "delayed_refs has NO entry\n");
2779 return ret;
2780 }
2781
2782 node = rb_first(&delayed_refs->root);
2783 while (node) {
2784 ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
2785 node = rb_next(node);
2786
2787 ref->in_tree = 0;
2788 rb_erase(&ref->rb_node, &delayed_refs->root);
2789 delayed_refs->num_entries--;
2790
2791 atomic_set(&ref->refs, 1);
2792 if (btrfs_delayed_ref_is_head(ref)) {
2793 struct btrfs_delayed_ref_head *head;
2794
2795 head = btrfs_delayed_node_to_head(ref);
2796 mutex_lock(&head->mutex);
2797 kfree(head->extent_op);
2798 delayed_refs->num_heads--;
2799 if (list_empty(&head->cluster))
2800 delayed_refs->num_heads_ready--;
2801 list_del_init(&head->cluster);
2802 mutex_unlock(&head->mutex);
2803 }
2804
2805 spin_unlock(&delayed_refs->lock);
2806 btrfs_put_delayed_ref(ref);
2807
2808 cond_resched();
2809 spin_lock(&delayed_refs->lock);
2810 }
2811
2812 spin_unlock(&delayed_refs->lock);
2813
2814 return ret;
2815}
2816
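btrfs_destroy_delayed_refs above follows a standard teardown pattern: detach one node from the rbtree while holding the spinlock, then drop the lock around the blocking put and the cond_resched(), and retake it before visiting the next node. Boiled down to its skeleton (illustrative fragment; tree, lock, node_to_entry and put_entry are placeholders, not btrfs symbols):

	spin_lock(&lock);
	while ((node = rb_first(&tree)) != NULL) {
		rb_erase(node, &tree);            /* detach under the lock    */
		spin_unlock(&lock);               /* blocking work unlocked   */
		put_entry(node_to_entry(node));   /* may free the entry       */
		cond_resched();                   /* be nice to the scheduler */
		spin_lock(&lock);                 /* retake for the next one  */
	}
	spin_unlock(&lock);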
2817static int btrfs_destroy_pending_snapshots(struct btrfs_transaction *t)
2818{
2819 struct btrfs_pending_snapshot *snapshot;
2820 struct list_head splice;
2821
2822 INIT_LIST_HEAD(&splice);
2823
2824 list_splice_init(&t->pending_snapshots, &splice);
2825
2826 while (!list_empty(&splice)) {
2827 snapshot = list_entry(splice.next,
2828 struct btrfs_pending_snapshot,
2829 list);
2830
2831 list_del_init(&snapshot->list);
2832
2833 kfree(snapshot);
2834 }
2835
2836 return 0;
2837}
2838
2839static int btrfs_destroy_delalloc_inodes(struct btrfs_root *root)
2840{
2841 struct btrfs_inode *btrfs_inode;
2842 struct list_head splice;
2843
2844 INIT_LIST_HEAD(&splice);
2845
2846 list_splice_init(&root->fs_info->delalloc_inodes, &splice);
2847
2848 spin_lock(&root->fs_info->delalloc_lock);
2849
2850 while (!list_empty(&splice)) {
2851 btrfs_inode = list_entry(splice.next, struct btrfs_inode,
2852 delalloc_inodes);
2853
2854 list_del_init(&btrfs_inode->delalloc_inodes);
2855
2856 btrfs_invalidate_inodes(btrfs_inode->root);
2857 }
2858
2859 spin_unlock(&root->fs_info->delalloc_lock);
2860
2861 return 0;
2862}
2863
2864static int btrfs_destroy_marked_extents(struct btrfs_root *root,
2865 struct extent_io_tree *dirty_pages,
2866 int mark)
2867{
2868 int ret;
2869 struct page *page;
2870 struct inode *btree_inode = root->fs_info->btree_inode;
2871 struct extent_buffer *eb;
2872 u64 start = 0;
2873 u64 end;
2874 u64 offset;
2875 unsigned long index;
2876
2877 while (1) {
2878 ret = find_first_extent_bit(dirty_pages, start, &start, &end,
2879 mark);
2880 if (ret)
2881 break;
2882
2883 clear_extent_bits(dirty_pages, start, end, mark, GFP_NOFS);
2884 while (start <= end) {
2885 index = start >> PAGE_CACHE_SHIFT;
2886 start = (u64)(index + 1) << PAGE_CACHE_SHIFT;
2887 page = find_get_page(btree_inode->i_mapping, index);
2888 if (!page)
2889 continue;
2890 offset = page_offset(page);
2891
2892 spin_lock(&dirty_pages->buffer_lock);
2893 eb = radix_tree_lookup(
2894 &(&BTRFS_I(page->mapping->host)->io_tree)->buffer,
2895 offset >> PAGE_CACHE_SHIFT);
2896 spin_unlock(&dirty_pages->buffer_lock);
2897 if (eb) {
2898 ret = test_and_clear_bit(EXTENT_BUFFER_DIRTY,
2899 &eb->bflags);
2900 atomic_set(&eb->refs, 1);
2901 }
2902 if (PageWriteback(page))
2903 end_page_writeback(page);
2904
2905 lock_page(page);
2906 if (PageDirty(page)) {
2907 clear_page_dirty_for_io(page);
2908 spin_lock_irq(&page->mapping->tree_lock);
2909 radix_tree_tag_clear(&page->mapping->page_tree,
2910 page_index(page),
2911 PAGECACHE_TAG_DIRTY);
2912 spin_unlock_irq(&page->mapping->tree_lock);
2913 }
2914
2915 page->mapping->a_ops->invalidatepage(page, 0);
2916 unlock_page(page);
2917 }
2918 }
2919
2920 return ret;
2921}
2922
2923static int btrfs_destroy_pinned_extent(struct btrfs_root *root,
2924 struct extent_io_tree *pinned_extents)
2925{
2926 struct extent_io_tree *unpin;
2927 u64 start;
2928 u64 end;
2929 int ret;
2930
2931 unpin = pinned_extents;
2932 while (1) {
2933 ret = find_first_extent_bit(unpin, 0, &start, &end,
2934 EXTENT_DIRTY);
2935 if (ret)
2936 break;
2937
2938 /* opt_discard */
2939 ret = btrfs_error_discard_extent(root, start, end + 1 - start);
2940
2941 clear_extent_dirty(unpin, start, end, GFP_NOFS);
2942 btrfs_error_unpin_extent_range(root, start, end);
2943 cond_resched();
2944 }
2945
2946 return 0;
2947}
2948
2949static int btrfs_cleanup_transaction(struct btrfs_root *root)
2950{
2951 struct btrfs_transaction *t;
2952 LIST_HEAD(list);
2953
2954 WARN_ON(1);
2955
2956 mutex_lock(&root->fs_info->trans_mutex);
2957 mutex_lock(&root->fs_info->transaction_kthread_mutex);
2958
2959 list_splice_init(&root->fs_info->trans_list, &list);
2960 while (!list_empty(&list)) {
2961 t = list_entry(list.next, struct btrfs_transaction, list);
2962 if (!t)
2963 break;
2964
2965 btrfs_destroy_ordered_operations(root);
2966
2967 btrfs_destroy_ordered_extents(root);
2968
2969 btrfs_destroy_delayed_refs(t, root);
2970
2971 btrfs_block_rsv_release(root,
2972 &root->fs_info->trans_block_rsv,
2973 t->dirty_pages.dirty_bytes);
2974
2975 /* FIXME: cleanup wait for commit */
2976 t->in_commit = 1;
2977 t->blocked = 1;
2978 if (waitqueue_active(&root->fs_info->transaction_blocked_wait))
2979 wake_up(&root->fs_info->transaction_blocked_wait);
2980
2981 t->blocked = 0;
2982 if (waitqueue_active(&root->fs_info->transaction_wait))
2983 wake_up(&root->fs_info->transaction_wait);
2984 mutex_unlock(&root->fs_info->trans_mutex);
2985
2986 mutex_lock(&root->fs_info->trans_mutex);
2987 t->commit_done = 1;
2988 if (waitqueue_active(&t->commit_wait))
2989 wake_up(&t->commit_wait);
2990 mutex_unlock(&root->fs_info->trans_mutex);
2991
2992 mutex_lock(&root->fs_info->trans_mutex);
2993
2994 btrfs_destroy_pending_snapshots(t);
2995
2996 btrfs_destroy_delalloc_inodes(root);
2997
2998 spin_lock(&root->fs_info->new_trans_lock);
2999 root->fs_info->running_transaction = NULL;
3000 spin_unlock(&root->fs_info->new_trans_lock);
3001
3002 btrfs_destroy_marked_extents(root, &t->dirty_pages,
3003 EXTENT_DIRTY);
3004
3005 btrfs_destroy_pinned_extent(root,
3006 root->fs_info->pinned_extents);
3007
3008 t->use_count = 0;
3009 list_del_init(&t->list);
3010 memset(t, 0, sizeof(*t));
3011 kmem_cache_free(btrfs_transaction_cachep, t);
3012 }
3013
3014 mutex_unlock(&root->fs_info->transaction_kthread_mutex);
3015 mutex_unlock(&root->fs_info->trans_mutex);
3016
3017 return 0;
3018}
3019
2622 3020	static struct extent_io_ops btree_extent_io_ops = {
2623 3021	.write_cache_pages_lock_hook = btree_lock_page_hook,
2624 3022	.readpage_end_io_hook = btree_readpage_end_io_hook,
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 88e825a0bf21..07b20dc2fd95 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -52,6 +52,7 @@ int write_ctree_super(struct btrfs_trans_handle *trans,
52 52	struct btrfs_root *root, int max_mirrors);
53 53	struct buffer_head *btrfs_read_dev_super(struct block_device *bdev);
54 54	int btrfs_commit_super(struct btrfs_root *root);
55int btrfs_error_commit_super(struct btrfs_root *root);
55 56	struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
56 57	u64 bytenr, u32 blocksize);
57 58	struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index 0ccf9a8afcdf..9786963b07e5 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -65,7 +65,6 @@ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
65 65	{
66 66	struct btrfs_fs_info *fs_info = btrfs_sb(sb)->fs_info;
67 67	struct btrfs_root *root;
68 struct dentry *dentry;
69 68	struct inode *inode;
70 69	struct btrfs_key key;
71 70	int index;
@@ -108,10 +107,7 @@ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
108 107	return ERR_PTR(-ESTALE);
109 108	}
110 109
111 dentry = d_obtain_alias(inode);
112 if (!IS_ERR(dentry))
113 d_set_d_op(dentry, &btrfs_dentry_operations);
114 return dentry;
110	return d_obtain_alias(inode);
115 111	fail:
116 112	srcu_read_unlock(&fs_info->subvol_srcu, index);
117 113	return ERR_PTR(err);
@@ -166,7 +162,6 @@ static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh,
166 162	static struct dentry *btrfs_get_parent(struct dentry *child)
167 163	{
168 164	struct inode *dir = child->d_inode;
169 struct dentry *dentry;
170 165	struct btrfs_root *root = BTRFS_I(dir)->root;
171 166	struct btrfs_path *path;
172 167	struct extent_buffer *leaf;
@@ -223,10 +218,7 @@ static struct dentry *btrfs_get_parent(struct dentry *child)
223 218
224 219	key.type = BTRFS_INODE_ITEM_KEY;
225 220	key.offset = 0;
226 dentry = d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root, NULL));
227 if (!IS_ERR(dentry))
228 d_set_d_op(dentry, &btrfs_dentry_operations);
229 return dentry;
221	return d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root, NULL));
230 222	fail:
231 223	btrfs_free_path(path);
232 224	return ERR_PTR(ret);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 227e5815d838..b55269340cec 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3089,7 +3089,7 @@ static u64 get_alloc_profile(struct btrfs_root *root, u64 flags)
3089 3089	return btrfs_reduce_alloc_profile(root, flags);
3090 3090	}
3091 3091
3092	static u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
3092	u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
3093 3093	{
3094 3094	u64 flags;
3095 3095
@@ -3161,8 +3161,12 @@ alloc:
3161 3161	bytes + 2 * 1024 * 1024,
3162 3162	alloc_target, 0);
3163 3163	btrfs_end_transaction(trans, root);
3164	if (ret < 0)
3165	return ret;
3164	if (ret < 0) {
3165	if (ret != -ENOSPC)
3166	return ret;
3167	else
3168	goto commit_trans;
3169	}
3166 3170
3167 3171	if (!data_sinfo) {
3168 3172	btrfs_set_inode_space_info(root, inode);
@@ -3173,6 +3177,7 @@ alloc:
3173 3177	spin_unlock(&data_sinfo->lock);
3174 3178
3175 3179	/* commit the current transaction and try again */
3180	commit_trans:
3176 3181	if (!committed && !root->fs_info->open_ioctl_trans) {
3177 3182	committed = 1;
3178 3183	trans = btrfs_join_transaction(root, 1);
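The hunk above relaxes the data-allocation failure path: -ENOSPC from chunk allocation no longer fails the caller outright but falls through to this commit-and-retry branch, since committing the running transaction can release pinned space back to the allocator. Condensed (illustrative, not the literal code; alloc_chunk stands in for the allocation call):

	ret = alloc_chunk(...);
	if (ret < 0) {
		if (ret != -ENOSPC)
			return ret;    /* hard error: propagate */
		goto commit_trans;     /* ENOSPC: a commit may free
					  pinned bytes, so retry */
	}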
@@ -3721,11 +3726,6 @@ int btrfs_block_rsv_check(struct btrfs_trans_handle *trans,
3721 3726	return 0;
3722 3727	}
3723 3728
3724 WARN_ON(1);
3725 printk(KERN_INFO"block_rsv size %llu reserved %llu freed %llu %llu\n",
3726 block_rsv->size, block_rsv->reserved,
3727 block_rsv->freed[0], block_rsv->freed[1]);
3728
3729 3729	return -ENOSPC;
3730 3730	}
3731 3731
@@ -7970,13 +7970,14 @@ static int set_block_group_ro(struct btrfs_block_group_cache *cache)
7970 7970
7971 7971	if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned +
7972 7972	sinfo->bytes_may_use + sinfo->bytes_readonly +
7973	cache->reserved_pinned + num_bytes < sinfo->total_bytes) {
7973	cache->reserved_pinned + num_bytes <= sinfo->total_bytes) {
7974 7974	sinfo->bytes_readonly += num_bytes;
7975 7975	sinfo->bytes_reserved += cache->reserved_pinned;
7976 7976	cache->reserved_pinned = 0;
7977 7977	cache->ro = 1;
7978 7978	ret = 0;
7979 7979	}
7980
7980 7981	spin_unlock(&cache->lock);
7981 7982	spin_unlock(&sinfo->lock);
7982 7983	return ret;
@@ -8012,6 +8013,62 @@ out:
8012 8013	return ret;
8013 8014	}
8014 8015
8016/*
8017 * helper to account for the unused space of all the read-only block
8018 * groups in the list. takes mirrors into account.
8019 */
8020static u64 __btrfs_get_ro_block_group_free_space(struct list_head *groups_list)
8021{
8022 struct btrfs_block_group_cache *block_group;
8023 u64 free_bytes = 0;
8024 int factor;
8025
8026 list_for_each_entry(block_group, groups_list, list) {
8027 spin_lock(&block_group->lock);
8028
8029 if (!block_group->ro) {
8030 spin_unlock(&block_group->lock);
8031 continue;
8032 }
8033
8034 if (block_group->flags & (BTRFS_BLOCK_GROUP_RAID1 |
8035 BTRFS_BLOCK_GROUP_RAID10 |
8036 BTRFS_BLOCK_GROUP_DUP))
8037 factor = 2;
8038 else
8039 factor = 1;
8040
8041 free_bytes += (block_group->key.offset -
8042 btrfs_block_group_used(&block_group->item)) *
8043 factor;
8044
8045 spin_unlock(&block_group->lock);
8046 }
8047
8048 return free_bytes;
8049}
8050
8051/*
8052 * helper to account for the unused space of all the read-only block
8053 * groups in the space_info. takes mirrors into account.
8054 */
8055u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
8056{
8057 int i;
8058 u64 free_bytes = 0;
8059
8060 spin_lock(&sinfo->lock);
8061
8062 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
8063 if (!list_empty(&sinfo->block_groups[i]))
8064 free_bytes += __btrfs_get_ro_block_group_free_space(
8065 &sinfo->block_groups[i]);
8066
8067 spin_unlock(&sinfo->lock);
8068
8069 return free_bytes;
8070}
8071
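As a worked example of the mirror factor: a read-only RAID1 block group whose key.offset is 1 GiB with 300 MiB recorded as used contributes (1024 - 300) MiB x 2 = 1448 MiB of raw free space, because every logical byte occupies two on-disk copies. In code form (hypothetical numbers):

	u64 group_size = 1024ULL << 20;	/* block_group->key.offset */
	u64 used = 300ULL << 20;	/* btrfs_block_group_used(&item) */
	int factor = 2;			/* RAID1/RAID10/DUP keep two copies */
	u64 free_raw = (group_size - used) * factor;	/* 1448 MiB */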
8015 8072	int btrfs_set_block_group_rw(struct btrfs_root *root,
8016 8073	struct btrfs_block_group_cache *cache)
8017 8074	{
@@ -8092,7 +8149,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
8092 8149	mutex_lock(&root->fs_info->chunk_mutex);
8093 8150	list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
8094 8151	u64 min_free = btrfs_block_group_used(&block_group->item);
8095	u64 dev_offset, max_avail;
8152	u64 dev_offset;
8096 8153
8097 8154	/*
8098 8155	 * check to make sure we can actually find a chunk with enough
@@ -8100,7 +8157,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
8100 8157	 */
8101 8158	if (device->total_bytes > device->bytes_used + min_free) {
8102 8159	ret = find_free_dev_extent(NULL, device, min_free,
8103	&dev_offset, &max_avail);
8160	&dev_offset, NULL);
8104 8161	if (!ret)
8105 8162	break;
8106 8163	ret = -1;
@@ -8584,3 +8641,14 @@ out:
8584 8641	btrfs_free_path(path);
8585 8642	return ret;
8586 8643	}
8644
8645int btrfs_error_unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
8646{
8647 return unpin_extent_range(root, start, end);
8648}
8649
8650int btrfs_error_discard_extent(struct btrfs_root *root, u64 bytenr,
8651 u64 num_bytes)
8652{
8653 return btrfs_discard_extent(root, bytenr, num_bytes);
8654}
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 3e86b9f36507..2e993cf1766e 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2028,8 +2028,11 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
2028 2028	BUG_ON(extent_map_end(em) <= cur);
2029 2029	BUG_ON(end < cur);
2030 2030
2031	if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
2031	if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
2032 2032	this_bio_flag = EXTENT_BIO_COMPRESSED;
2033 extent_set_compress_type(&this_bio_flag,
2034 em->compress_type);
2035 }
2033 2036
2034 2037	iosize = min(extent_map_end(em) - cur, end - cur + 1);
2035 2038	cur_end = min(extent_map_end(em) - 1, end);
@@ -3072,6 +3075,8 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
3072 3075	#endif
3073 3076
3074 3077	eb = kmem_cache_zalloc(extent_buffer_cache, mask);
3078 if (eb == NULL)
3079 return NULL;
3075 3080	eb->start = start;
3076 3081	eb->len = len;
3077 3082	spin_lock_init(&eb->lock);
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 4183c8178f01..7083cfafd061 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -20,8 +20,12 @@
20 20	#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
21 21	#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC)
22 22
23	/* flags for bio submission */
23	/*
24 * flags for bio submission. The high bits indicate the compression
25 * type for this bio
26 */
24 27	#define EXTENT_BIO_COMPRESSED 1
28#define EXTENT_BIO_FLAG_SHIFT 16
25 29
26 30	/* these are bit numbers for test/set bit */
27 31	#define EXTENT_BUFFER_UPTODATE 0
@@ -135,6 +139,17 @@ struct extent_buffer {
135 139	wait_queue_head_t lock_wq;
136 140	};
137 141
142static inline void extent_set_compress_type(unsigned long *bio_flags,
143 int compress_type)
144{
145 *bio_flags |= compress_type << EXTENT_BIO_FLAG_SHIFT;
146}
147
148static inline int extent_compress_type(unsigned long bio_flags)
149{
150 return bio_flags >> EXTENT_BIO_FLAG_SHIFT;
151}
152
138 153	struct extent_map_tree;
139 154
140 155	static inline struct extent_state *extent_state_next(struct extent_state *state)
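The two inline helpers added above pack the compression type into the otherwise unused high bits of bio_flags, leaving the low EXTENT_BIO_FLAG_SHIFT bits for flags such as EXTENT_BIO_COMPRESSED. A round trip looks like this (illustrative fragment):

	unsigned long bio_flags = EXTENT_BIO_COMPRESSED;

	extent_set_compress_type(&bio_flags, BTRFS_COMPRESS_LZO);

	/* the low flag bits survive untouched ... */
	BUG_ON(!(bio_flags & EXTENT_BIO_COMPRESSED));
	/* ... and the type comes back out of the high bits */
	BUG_ON(extent_compress_type(bio_flags) != BTRFS_COMPRESS_LZO);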
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 23cb8da3ff66..b0e1fce12530 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -3,6 +3,7 @@
3 3	#include <linux/module.h>
4 4	#include <linux/spinlock.h>
5 5	#include <linux/hardirq.h>
6#include "ctree.h"
6 7	#include "extent_map.h"
7 8
8 9
@@ -54,6 +55,7 @@ struct extent_map *alloc_extent_map(gfp_t mask)
54 55	return em;
55 56	em->in_tree = 0;
56 57	em->flags = 0;
58 em->compress_type = BTRFS_COMPRESS_NONE;
57 59	atomic_set(&em->refs, 1);
58 60	return em;
59 61	}
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index ab6d74b6e647..28b44dbd1e35 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -26,7 +26,8 @@ struct extent_map {
26 26	unsigned long flags;
27 27	struct block_device *bdev;
28 28	atomic_t refs;
29	int in_tree;
29	unsigned int in_tree:1;
30 unsigned int compress_type:4;
30 31	};
31 32
32 33	struct extent_map_tree {
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 66836d85763b..c800d58f3013 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -24,6 +24,7 @@
24 24	#include <linux/string.h>
25 25	#include <linux/backing-dev.h>
26 26	#include <linux/mpage.h>
27#include <linux/falloc.h>
27 28	#include <linux/swap.h>
28 29	#include <linux/writeback.h>
29 30	#include <linux/statfs.h>
@@ -224,6 +225,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
224 225
225 226	split->bdev = em->bdev;
226 227	split->flags = flags;
228 split->compress_type = em->compress_type;
227 229	ret = add_extent_mapping(em_tree, split);
228 230	BUG_ON(ret);
229 231	free_extent_map(split);
@@ -238,6 +240,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
238 240	split->len = em->start + em->len - (start + len);
239 241	split->bdev = em->bdev;
240 242	split->flags = flags;
243 split->compress_type = em->compress_type;
241 244
242 245	if (compressed) {
243 246	split->block_len = em->block_len;
@@ -890,6 +893,17 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
890 893	if (err)
891 894	goto out;
892 895
896 /*
897 * If btrfs has flipped read-only due to an unrecoverable error
898 * (fs_info->fs_state now has BTRFS_SUPER_FLAG_ERROR set),
899 * we must refuse this write even though the file was opened
900 * writable, to keep the FS consistent.
901 */
902 if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
903 err = -EROFS;
904 goto out;
905 }
906
893 907	file_update_time(file);
894 908	BTRFS_I(inode)->sequence++;
895 909
@@ -1237,6 +1251,117 @@ static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma)
1237 1251	return 0;
1238 1252	}
1239 1253
1254static long btrfs_fallocate(struct file *file, int mode,
1255 loff_t offset, loff_t len)
1256{
1257 struct inode *inode = file->f_path.dentry->d_inode;
1258 struct extent_state *cached_state = NULL;
1259 u64 cur_offset;
1260 u64 last_byte;
1261 u64 alloc_start;
1262 u64 alloc_end;
1263 u64 alloc_hint = 0;
1264 u64 locked_end;
1265 u64 mask = BTRFS_I(inode)->root->sectorsize - 1;
1266 struct extent_map *em;
1267 int ret;
1268
1269 alloc_start = offset & ~mask;
1270 alloc_end = (offset + len + mask) & ~mask;
1271
1272 /* We only support the FALLOC_FL_KEEP_SIZE mode */
1273 if (mode & ~FALLOC_FL_KEEP_SIZE)
1274 return -EOPNOTSUPP;
1275
1276 /*
1277 * wait for ordered IO before we have any locks. We'll loop again
1278 * below with the locks held.
1279 */
1280 btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start);
1281
1282 mutex_lock(&inode->i_mutex);
1283 ret = inode_newsize_ok(inode, alloc_end);
1284 if (ret)
1285 goto out;
1286
1287 if (alloc_start > inode->i_size) {
1288 ret = btrfs_cont_expand(inode, alloc_start);
1289 if (ret)
1290 goto out;
1291 }
1292
1293 ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start);
1294 if (ret)
1295 goto out;
1296
1297 locked_end = alloc_end - 1;
1298 while (1) {
1299 struct btrfs_ordered_extent *ordered;
1300
1301 /* the extent lock is ordered inside the running
1302 * transaction
1303 */
1304 lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start,
1305 locked_end, 0, &cached_state, GFP_NOFS);
1306 ordered = btrfs_lookup_first_ordered_extent(inode,
1307 alloc_end - 1);
1308 if (ordered &&
1309 ordered->file_offset + ordered->len > alloc_start &&
1310 ordered->file_offset < alloc_end) {
1311 btrfs_put_ordered_extent(ordered);
1312 unlock_extent_cached(&BTRFS_I(inode)->io_tree,
1313 alloc_start, locked_end,
1314 &cached_state, GFP_NOFS);
1315 /*
1316 * we can't wait on the range with the transaction
1317 * running or with the extent lock held
1318 */
1319 btrfs_wait_ordered_range(inode, alloc_start,
1320 alloc_end - alloc_start);
1321 } else {
1322 if (ordered)
1323 btrfs_put_ordered_extent(ordered);
1324 break;
1325 }
1326 }
1327
1328 cur_offset = alloc_start;
1329 while (1) {
1330 em = btrfs_get_extent(inode, NULL, 0, cur_offset,
1331 alloc_end - cur_offset, 0);
1332 BUG_ON(IS_ERR(em) || !em);
1333 last_byte = min(extent_map_end(em), alloc_end);
1334 last_byte = (last_byte + mask) & ~mask;
1335 if (em->block_start == EXTENT_MAP_HOLE ||
1336 (cur_offset >= inode->i_size &&
1337 !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
1338 ret = btrfs_prealloc_file_range(inode, mode, cur_offset,
1339 last_byte - cur_offset,
1340 1 << inode->i_blkbits,
1341 offset + len,
1342 &alloc_hint);
1343 if (ret < 0) {
1344 free_extent_map(em);
1345 break;
1346 }
1347 }
1348 free_extent_map(em);
1349
1350 cur_offset = last_byte;
1351 if (cur_offset >= alloc_end) {
1352 ret = 0;
1353 break;
1354 }
1355 }
1356 unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
1357 &cached_state, GFP_NOFS);
1358
1359 btrfs_free_reserved_data_space(inode, alloc_end - alloc_start);
1360out:
1361 mutex_unlock(&inode->i_mutex);
1362 return ret;
1363}
1364
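With the .fallocate hook wired into btrfs_file_operations below, preallocation reaches btrfs through the regular syscall; per the mode check above, FALLOC_FL_KEEP_SIZE is the only accepted flag. A minimal userspace caller (illustrative; the path is made up):

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		int fd = open("/mnt/btrfs/file", O_RDWR | O_CREAT, 0644);

		if (fd < 0)
			return 1;
		/* preallocate 16 MiB of extents without growing i_size */
		if (fallocate(fd, FALLOC_FL_KEEP_SIZE, 0, 16 << 20) != 0)
			perror("fallocate");
		close(fd);
		return 0;
	}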
1240 1365	const struct file_operations btrfs_file_operations = {
1241 1366	.llseek = generic_file_llseek,
1242 1367	.read = do_sync_read,
@@ -1248,6 +1373,7 @@ const struct file_operations btrfs_file_operations = {
1248 1373	.open = generic_file_open,
1249 1374	.release = btrfs_release_file,
1250 1375	.fsync = btrfs_sync_file,
1376 .fallocate = btrfs_fallocate,
1251 1377	.unlocked_ioctl = btrfs_ioctl,
1252 1378	#ifdef CONFIG_COMPAT
1253 1379	.compat_ioctl = btrfs_ioctl,
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index a0ff46a47895..160b55b3e132 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -122,10 +122,10 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
122 122	size_t cur_size = size;
123 123	size_t datasize;
124 124	unsigned long offset;
125	int use_compress = 0;
125	int compress_type = BTRFS_COMPRESS_NONE;
126 126
127 127	if (compressed_size && compressed_pages) {
128	use_compress = 1;
128	compress_type = root->fs_info->compress_type;
129 129	cur_size = compressed_size;
130 130	}
131 131
@@ -159,7 +159,7 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
159 159	btrfs_set_file_extent_ram_bytes(leaf, ei, size);
160 160	ptr = btrfs_file_extent_inline_start(ei);
161 161
162	if (use_compress) {
162	if (compress_type != BTRFS_COMPRESS_NONE) {
163 163	struct page *cpage;
164 164	int i = 0;
165 165	while (compressed_size > 0) {
@@ -176,7 +176,7 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
176 176	compressed_size -= cur_size;
177 177	}
178 178	btrfs_set_file_extent_compression(leaf, ei,
179	BTRFS_COMPRESS_ZLIB);
179	compress_type);
180 180	} else {
181 181	page = find_get_page(inode->i_mapping,
182 182	start >> PAGE_CACHE_SHIFT);
@@ -263,6 +263,7 @@ struct async_extent {
263 263	u64 compressed_size;
264 264	struct page **pages;
265 265	unsigned long nr_pages;
266 int compress_type;
266 267	struct list_head list;
267 268	};
268 269
@@ -280,7 +281,8 @@ static noinline int add_async_extent(struct async_cow *cow,
280 281	u64 start, u64 ram_size,
281 282	u64 compressed_size,
282 283	struct page **pages,
283	unsigned long nr_pages)
284	unsigned long nr_pages,
285 int compress_type)
284 286	{
285 287	struct async_extent *async_extent;
286 288
@@ -290,6 +292,7 @@ static noinline int add_async_extent(struct async_cow *cow,
290 292	async_extent->compressed_size = compressed_size;
291 293	async_extent->pages = pages;
292 294	async_extent->nr_pages = nr_pages;
295 async_extent->compress_type = compress_type;
293 296	list_add_tail(&async_extent->list, &cow->extents);
294 297	return 0;
295 298	}
@@ -332,6 +335,7 @@ static noinline int compress_file_range(struct inode *inode,
332 335	unsigned long max_uncompressed = 128 * 1024;
333 336	int i;
334 337	int will_compress;
338 int compress_type = root->fs_info->compress_type;
335 339
336 340	actual_end = min_t(u64, isize, end + 1);
337 341	again:
@@ -381,12 +385,16 @@ again:
381 385	WARN_ON(pages);
382 386	pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS);
383 387
384	ret = btrfs_zlib_compress_pages(inode->i_mapping, start,
385	total_compressed, pages,
386	nr_pages, &nr_pages_ret,
387	&total_in,
388	&total_compressed,
389	max_compressed);
388	if (BTRFS_I(inode)->force_compress)
389	compress_type = BTRFS_I(inode)->force_compress;
390
391	ret = btrfs_compress_pages(compress_type,
392	inode->i_mapping, start,
393	total_compressed, pages,
394 nr_pages, &nr_pages_ret,
395 &total_in,
396 &total_compressed,
397 max_compressed);
390 398
391 399	if (!ret) {
392 400	unsigned long offset = total_compressed &
@@ -493,7 +501,8 @@ again:
493 501	* and will submit them to the elevator.
494 502	*/
495 503	add_async_extent(async_cow, start, num_bytes,
496	total_compressed, pages, nr_pages_ret);
504	total_compressed, pages, nr_pages_ret,
505 compress_type);
497 506
498 507	if (start + num_bytes < end) {
499 508	start += num_bytes;
@@ -515,7 +524,8 @@ cleanup_and_bail_uncompressed:
515 524	__set_page_dirty_nobuffers(locked_page);
516 525	/* unlocked later on in the async handlers */
517 526	}
518	add_async_extent(async_cow, start, end - start + 1, 0, NULL, 0);
527	add_async_extent(async_cow, start, end - start + 1,
528 0, NULL, 0, BTRFS_COMPRESS_NONE);
519 529	*num_added += 1;
520 } 530 }
521 531
@@ -640,6 +650,7 @@ retry:
640 650	em->block_start = ins.objectid;
641 651	em->block_len = ins.offset;
642 652	em->bdev = root->fs_info->fs_devices->latest_bdev;
653 em->compress_type = async_extent->compress_type;
643 654	set_bit(EXTENT_FLAG_PINNED, &em->flags);
644 655	set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
645 656
@@ -656,11 +667,13 @@ retry:
656 667	async_extent->ram_size - 1, 0);
657 668	}
658 669
659	ret = btrfs_add_ordered_extent(inode, async_extent->start,
660	ins.objectid,
661	async_extent->ram_size,
662	ins.offset,
663	BTRFS_ORDERED_COMPRESSED);
670	ret = btrfs_add_ordered_extent_compress(inode,
671	async_extent->start,
672	ins.objectid,
673	async_extent->ram_size,
674	ins.offset,
675 BTRFS_ORDERED_COMPRESSED,
676 async_extent->compress_type);
664 677	BUG_ON(ret);
665 678
666 679	/*
@@ -1670,7 +1683,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1670 1683	struct btrfs_ordered_extent *ordered_extent = NULL;
1671 1684	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
1672 1685	struct extent_state *cached_state = NULL;
1673	int compressed = 0;
1686	int compress_type = 0;
1674 1687	int ret;
1675 1688	bool nolock = false;
1676 1689
@@ -1711,9 +1724,9 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1711 1724	trans->block_rsv = &root->fs_info->delalloc_block_rsv;
1712 1725
1713 1726	if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
1714	compressed = 1;
1727	compress_type = ordered_extent->compress_type;
1715 1728	if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
1716	BUG_ON(compressed);
1729	BUG_ON(compress_type);
1717 1730	ret = btrfs_mark_extent_written(trans, inode,
1718 1731	ordered_extent->file_offset,
1719 1732	ordered_extent->file_offset +
@@ -1727,7 +1740,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1727 1740	ordered_extent->disk_len,
1728 1741	ordered_extent->len,
1729 1742	ordered_extent->len,
1730	compressed, 0, 0,
1743	compress_type, 0, 0,
1731 1744	BTRFS_FILE_EXTENT_REG);
1732 1745	unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
1733 1746	ordered_extent->file_offset,
@@ -1829,6 +1842,8 @@ static int btrfs_io_failed_hook(struct bio *failed_bio,
1829 1842	if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
1830 1843	logical = em->block_start;
1831 1844	failrec->bio_flags = EXTENT_BIO_COMPRESSED;
1845 extent_set_compress_type(&failrec->bio_flags,
1846 em->compress_type);
1832 1847	}
1833 1848	failrec->logical = logical;
1834 1849	free_extent_map(em);
@@ -3671,8 +3686,12 @@ static int btrfs_setattr_size(struct inode *inode, struct iattr *attr)
3671 3686	static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
3672 3687	{
3673 3688	struct inode *inode = dentry->d_inode;
3689 struct btrfs_root *root = BTRFS_I(inode)->root;
3674 3690	int err;
3675 3691
3692 if (btrfs_root_readonly(root))
3693 return -EROFS;
3694
3676 3695	err = inode_change_ok(inode, attr);
3677 3696	if (err)
3678 3697	return err;
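btrfs_root_readonly() itself is introduced elsewhere in this series; judging from the subvolume flag handling later in this patch, it amounts to testing BTRFS_ROOT_SUBVOL_RDONLY in the root item flags, roughly (an assumption, not shown in this diff):

	static inline int btrfs_root_readonly(struct btrfs_root *root)
	{
		return (btrfs_root_flags(&root->root_item) &
			BTRFS_ROOT_SUBVOL_RDONLY) != 0;
	}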
@@ -4084,8 +4103,6 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
4084 4103	int index;
4085 4104	int ret;
4086 4105
4087 d_set_d_op(dentry, &btrfs_dentry_operations);
4088
4089 4106	if (dentry->d_name.len > BTRFS_NAME_LEN)
4090 4107	return ERR_PTR(-ENAMETOOLONG);
4091 4108
@@ -4930,8 +4947,10 @@ static noinline int uncompress_inline(struct btrfs_path *path,
4930 4947	size_t max_size;
4931 4948	unsigned long inline_size;
4932 4949	unsigned long ptr;
4950 int compress_type;
4933 4951
4934 4952	WARN_ON(pg_offset != 0);
4953 compress_type = btrfs_file_extent_compression(leaf, item);
4935 4954	max_size = btrfs_file_extent_ram_bytes(leaf, item);
4936 4955	inline_size = btrfs_file_extent_inline_item_len(leaf,
4937 4956	btrfs_item_nr(leaf, path->slots[0]));
@@ -4941,8 +4960,8 @@ static noinline int uncompress_inline(struct btrfs_path *path,
4941 4960	read_extent_buffer(leaf, tmp, ptr, inline_size);
4942 4961
4943 4962	max_size = min_t(unsigned long, PAGE_CACHE_SIZE, max_size);
4944	ret = btrfs_zlib_decompress(tmp, page, extent_offset,
4945	inline_size, max_size);
4963	ret = btrfs_decompress(compress_type, tmp, page,
4964	extent_offset, inline_size, max_size);
4946 4965	if (ret) {
4947 4966	char *kaddr = kmap_atomic(page, KM_USER0);
4948 4967	unsigned long copy_size = min_t(u64,
@@ -4984,7 +5003,7 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
4984 5003	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
4985 5004	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
4986 5005	struct btrfs_trans_handle *trans = NULL;
4987	int compressed;
5006	int compress_type;
4988 5007
4989 5008	again:
4990 5009	read_lock(&em_tree->lock);
@@ -5043,7 +5062,7 @@ again:
5043 5062
5044 5063	found_type = btrfs_file_extent_type(leaf, item);
5045 5064	extent_start = found_key.offset;
5046	compressed = btrfs_file_extent_compression(leaf, item);
5065	compress_type = btrfs_file_extent_compression(leaf, item);
5047 5066	if (found_type == BTRFS_FILE_EXTENT_REG ||
5048 5067	found_type == BTRFS_FILE_EXTENT_PREALLOC) {
5049 5068	extent_end = extent_start +
@@ -5089,8 +5108,9 @@ again:
5089 5108	em->block_start = EXTENT_MAP_HOLE;
5090 5109	goto insert;
5091 5110	}
5092	if (compressed) {
5111	if (compress_type != BTRFS_COMPRESS_NONE) {
5093 5112	set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
5113 em->compress_type = compress_type;
5094 5114	em->block_start = bytenr;
5095 5115	em->block_len = btrfs_file_extent_disk_num_bytes(leaf,
5096 5116	item);
@@ -5124,12 +5144,14 @@ again:
5124 5144	em->len = (copy_size + root->sectorsize - 1) &
5125 5145	~((u64)root->sectorsize - 1);
5126 5146	em->orig_start = EXTENT_MAP_INLINE;
5127	if (compressed)
5147	if (compress_type) {
5128 5148	set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
5149 em->compress_type = compress_type;
5150 }
5129 5151	ptr = btrfs_file_extent_inline_start(item) + extent_offset;
5130 5152	if (create == 0 && !PageUptodate(page)) {
5131	if (btrfs_file_extent_compression(leaf, item) ==
5132	BTRFS_COMPRESS_ZLIB) {
5153	if (btrfs_file_extent_compression(leaf, item) !=
5154	BTRFS_COMPRESS_NONE) {
5133 5155	ret = uncompress_inline(path, inode, page,
5134 5156	pg_offset,
5135 5157	extent_offset, item);
@@ -6479,7 +6501,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
6479 6501	ei->ordered_data_close = 0;
6480 6502	ei->orphan_meta_reserved = 0;
6481 6503	ei->dummy_inode = 0;
6482	ei->force_compress = 0;
6504	ei->force_compress = BTRFS_COMPRESS_NONE;
6483 6505
6484 6506	inode = &ei->vfs_inode;
6485 6507	extent_map_tree_init(&ei->extent_tree, GFP_NOFS);
@@ -7100,112 +7122,6 @@ int btrfs_prealloc_file_range_trans(struct inode *inode,
7100 7122	min_size, actual_len, alloc_hint, trans);
7101 7123	}
7102 7124
7103static long btrfs_fallocate(struct inode *inode, int mode,
7104 loff_t offset, loff_t len)
7105{
7106 struct extent_state *cached_state = NULL;
7107 u64 cur_offset;
7108 u64 last_byte;
7109 u64 alloc_start;
7110 u64 alloc_end;
7111 u64 alloc_hint = 0;
7112 u64 locked_end;
7113 u64 mask = BTRFS_I(inode)->root->sectorsize - 1;
7114 struct extent_map *em;
7115 int ret;
7116
7117 alloc_start = offset & ~mask;
7118 alloc_end = (offset + len + mask) & ~mask;
7119
7120 /*
7121 * wait for ordered IO before we have any locks. We'll loop again
7122 * below with the locks held.
7123 */
7124 btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start);
7125
7126 mutex_lock(&inode->i_mutex);
7127 ret = inode_newsize_ok(inode, alloc_end);
7128 if (ret)
7129 goto out;
7130
7131 if (alloc_start > inode->i_size) {
7132 ret = btrfs_cont_expand(inode, alloc_start);
7133 if (ret)
7134 goto out;
7135 }
7136
7137 ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start);
7138 if (ret)
7139 goto out;
7140
7141 locked_end = alloc_end - 1;
7142 while (1) {
7143 struct btrfs_ordered_extent *ordered;
7144
7145 /* the extent lock is ordered inside the running
7146 * transaction
7147 */
7148 lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start,
7149 locked_end, 0, &cached_state, GFP_NOFS);
7150 ordered = btrfs_lookup_first_ordered_extent(inode,
7151 alloc_end - 1);
7152 if (ordered &&
7153 ordered->file_offset + ordered->len > alloc_start &&
7154 ordered->file_offset < alloc_end) {
7155 btrfs_put_ordered_extent(ordered);
7156 unlock_extent_cached(&BTRFS_I(inode)->io_tree,
7157 alloc_start, locked_end,
7158 &cached_state, GFP_NOFS);
7159 /*
7160 * we can't wait on the range with the transaction
7161 * running or with the extent lock held
7162 */
7163 btrfs_wait_ordered_range(inode, alloc_start,
7164 alloc_end - alloc_start);
7165 } else {
7166 if (ordered)
7167 btrfs_put_ordered_extent(ordered);
7168 break;
7169 }
7170 }
7171
7172 cur_offset = alloc_start;
7173 while (1) {
7174 em = btrfs_get_extent(inode, NULL, 0, cur_offset,
7175 alloc_end - cur_offset, 0);
7176 BUG_ON(IS_ERR(em) || !em);
7177 last_byte = min(extent_map_end(em), alloc_end);
7178 last_byte = (last_byte + mask) & ~mask;
7179 if (em->block_start == EXTENT_MAP_HOLE ||
7180 (cur_offset >= inode->i_size &&
7181 !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
7182 ret = btrfs_prealloc_file_range(inode, mode, cur_offset,
7183 last_byte - cur_offset,
7184 1 << inode->i_blkbits,
7185 offset + len,
7186 &alloc_hint);
7187 if (ret < 0) {
7188 free_extent_map(em);
7189 break;
7190 }
7191 }
7192 free_extent_map(em);
7193
7194 cur_offset = last_byte;
7195 if (cur_offset >= alloc_end) {
7196 ret = 0;
7197 break;
7198 }
7199 }
7200 unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
7201 &cached_state, GFP_NOFS);
7202
7203 btrfs_free_reserved_data_space(inode, alloc_end - alloc_start);
7204out:
7205 mutex_unlock(&inode->i_mutex);
7206 return ret;
7207}
7208
7209 7125	static int btrfs_set_page_dirty(struct page *page)
7210 7126	{
7211 7127	return __set_page_dirty_nobuffers(page);
@@ -7213,6 +7129,10 @@ static int btrfs_set_page_dirty(struct page *page)
7213 7129
7214 7130	static int btrfs_permission(struct inode *inode, int mask, unsigned int flags)
7215 7131	{
7132 struct btrfs_root *root = BTRFS_I(inode)->root;
7133
7134 if (btrfs_root_readonly(root) && (mask & MAY_WRITE))
7135 return -EROFS;
7216 7136	if ((BTRFS_I(inode)->flags & BTRFS_INODE_READONLY) && (mask & MAY_WRITE))
7217 7137	return -EACCES;
7218 7138	return generic_permission(inode, mask, flags, btrfs_check_acl);
@@ -7308,7 +7228,6 @@ static const struct inode_operations btrfs_file_inode_operations = {
7308 7228	.listxattr = btrfs_listxattr,
7309 7229	.removexattr = btrfs_removexattr,
7310 7230	.permission = btrfs_permission,
7311 .fallocate = btrfs_fallocate,
7312 7231	.fiemap = btrfs_fiemap,
7313 7232	};
7314 7233	static const struct inode_operations btrfs_special_inode_operations = {
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index f87552a1d7ea..a506a22b522a 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -147,6 +147,9 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
147 147	unsigned int flags, oldflags;
148 148	int ret;
149 149
150 if (btrfs_root_readonly(root))
151 return -EROFS;
152
150 153	if (copy_from_user(&flags, arg, sizeof(flags)))
151 154	return -EFAULT;
152 155
@@ -360,7 +363,8 @@ fail:
360 363	}
361 364
362 365	static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
363	char *name, int namelen, u64 *async_transid)
366	char *name, int namelen, u64 *async_transid,
367 bool readonly)
364 368	{
365 369	struct inode *inode;
366 370	struct dentry *parent;
@@ -378,6 +382,7 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
378 382	btrfs_init_block_rsv(&pending_snapshot->block_rsv);
379 383	pending_snapshot->dentry = dentry;
380 384	pending_snapshot->root = root;
385 pending_snapshot->readonly = readonly;
381 386
382 387	trans = btrfs_start_transaction(root->fs_info->extent_root, 5);
383 388	if (IS_ERR(trans)) {
@@ -509,7 +514,7 @@ static inline int btrfs_may_create(struct inode *dir, struct dentry *child)
509 514	static noinline int btrfs_mksubvol(struct path *parent,
510 515	char *name, int namelen,
511 516	struct btrfs_root *snap_src,
512	u64 *async_transid)
517	u64 *async_transid, bool readonly)
513 518	{
514 519	struct inode *dir = parent->dentry->d_inode;
515 520	struct dentry *dentry;
@@ -541,7 +546,7 @@ static noinline int btrfs_mksubvol(struct path *parent,
541 546
542 547	if (snap_src) {
543 548	error = create_snapshot(snap_src, dentry,
544	name, namelen, async_transid);
549	name, namelen, async_transid, readonly);
545 550	} else {
546 551	error = create_subvol(BTRFS_I(dir)->root, dentry,
547 552	name, namelen, async_transid);
@@ -638,9 +643,11 @@ static int btrfs_defrag_file(struct file *file,
638 643	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
639 644	struct btrfs_ordered_extent *ordered;
640 645	struct page *page;
646 struct btrfs_super_block *disk_super;
641 647	unsigned long last_index;
642 648	unsigned long ra_pages = root->fs_info->bdi.ra_pages;
643 649	unsigned long total_read = 0;
650 u64 features;
644 651	u64 page_start;
645 652	u64 page_end;
646 653	u64 last_len = 0;
@@ -648,6 +655,14 @@ static int btrfs_defrag_file(struct file *file,
648 655	u64 defrag_end = 0;
649 656	unsigned long i;
650 657	int ret;
658 int compress_type = BTRFS_COMPRESS_ZLIB;
659
660 if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) {
661 if (range->compress_type > BTRFS_COMPRESS_TYPES)
662 return -EINVAL;
663 if (range->compress_type)
664 compress_type = range->compress_type;
665 }
651 666
652 667	if (inode->i_size == 0)
653 668	return 0;
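Userspace picks the algorithm through the new compress_type member of btrfs_ioctl_defrag_range_args; values above BTRFS_COMPRESS_TYPES are rejected, and zlib stays the default. A sketch of a caller requesting LZO recompression (illustrative; the field names come from this patch):

	struct btrfs_ioctl_defrag_range_args range;

	memset(&range, 0, sizeof(range));
	range.len = (u64)-1;			/* whole file */
	range.flags = BTRFS_DEFRAG_RANGE_COMPRESS;
	range.compress_type = BTRFS_COMPRESS_LZO;

	if (ioctl(fd, BTRFS_IOC_DEFRAG_RANGE, &range) < 0)
		perror("BTRFS_IOC_DEFRAG_RANGE");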
@@ -683,7 +698,7 @@ static int btrfs_defrag_file(struct file *file,
683 698	total_read++;
684 699	mutex_lock(&inode->i_mutex);
685 700	if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)
686	BTRFS_I(inode)->force_compress = 1;
701	BTRFS_I(inode)->force_compress = compress_type;
687 702
688 703	ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
689 704	if (ret)
@@ -781,10 +796,17 @@ loop_unlock:
781 796	atomic_dec(&root->fs_info->async_submit_draining);
782 797
783 798	mutex_lock(&inode->i_mutex);
784	BTRFS_I(inode)->force_compress = 0;
799	BTRFS_I(inode)->force_compress = BTRFS_COMPRESS_NONE;
785 800	mutex_unlock(&inode->i_mutex);
786 801	}
787 802
803 disk_super = &root->fs_info->super_copy;
804 features = btrfs_super_incompat_flags(disk_super);
805 if (range->compress_type == BTRFS_COMPRESS_LZO) {
806 features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO;
807 btrfs_set_super_incompat_flags(disk_super, features);
808 }
809
788 810	return 0;
789 811
790 812	err_reservations:
@@ -901,7 +923,8 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file,
901 923	char *name,
902 924	unsigned long fd,
903 925	int subvol,
904	u64 *transid)
926	u64 *transid,
927 bool readonly)
905 928	{
906 929	struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
907 930	struct file *src_file;
@@ -919,7 +942,7 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file,
919 942
920 943	if (subvol) {
921 944	ret = btrfs_mksubvol(&file->f_path, name, namelen,
922	NULL, transid);
945	NULL, transid, readonly);
923 946	} else {
924 947	struct inode *src_inode;
925 948	src_file = fget(fd);
@@ -938,7 +961,7 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file,
938 961	}
939 962	ret = btrfs_mksubvol(&file->f_path, name, namelen,
940 963	BTRFS_I(src_inode)->root,
941	transid);
964	transid, readonly);
942 965	fput(src_file);
943 966	}
944 967	out:
@@ -946,58 +969,139 @@ out:
946 969	}
947 970
948 971	static noinline int btrfs_ioctl_snap_create(struct file *file,
949	void __user *arg, int subvol,
950	int v2)
972	void __user *arg, int subvol)
951 973	{
952	struct btrfs_ioctl_vol_args *vol_args = NULL;
974	struct btrfs_ioctl_vol_args *vol_args;
953 struct btrfs_ioctl_vol_args_v2 *vol_args_v2 = NULL;
954 char *name;
955 u64 fd;
956 975	int ret;
957 976
958	if (v2) {
959	u64 transid = 0;
960	u64 *ptr = NULL;
977	vol_args = memdup_user(arg, sizeof(*vol_args));
978	if (IS_ERR(vol_args))
979	return PTR_ERR(vol_args);
980 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
961 981
962	vol_args_v2 = memdup_user(arg, sizeof(*vol_args_v2));
963	if (IS_ERR(vol_args_v2))
964	return PTR_ERR(vol_args_v2);
982	ret = btrfs_ioctl_snap_create_transid(file, vol_args->name,
983	vol_args->fd, subvol,
984	NULL, false);
965 985
966	if (vol_args_v2->flags & ~BTRFS_SUBVOL_CREATE_ASYNC) {
967	ret = -EINVAL;
968	goto out;
986	kfree(vol_args);
987	return ret;
988	}
969 }
970
971 name = vol_args_v2->name;
972 fd = vol_args_v2->fd;
973 vol_args_v2->name[BTRFS_SUBVOL_NAME_MAX] = '\0';
974 989
975	if (vol_args_v2->flags & BTRFS_SUBVOL_CREATE_ASYNC)
976	ptr = &transid;
990	static noinline int btrfs_ioctl_snap_create_v2(struct file *file,
991	void __user *arg, int subvol)
992{
993 struct btrfs_ioctl_vol_args_v2 *vol_args;
994 int ret;
995 u64 transid = 0;
996 u64 *ptr = NULL;
997 bool readonly = false;
977 998
978	ret = btrfs_ioctl_snap_create_transid(file, name, fd,
979	subvol, ptr);
999	vol_args = memdup_user(arg, sizeof(*vol_args));
1000	if (IS_ERR(vol_args))
1001 return PTR_ERR(vol_args);
1002 vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0';
980 1003
981	if (ret == 0 && ptr &&
982	copy_to_user(arg +
983	offsetof(struct btrfs_ioctl_vol_args_v2,
984	transid), ptr, sizeof(*ptr)))
1004	if (vol_args->flags &
1005	~(BTRFS_SUBVOL_CREATE_ASYNC | BTRFS_SUBVOL_RDONLY)) {
1006	ret = -EOPNOTSUPP;
1007	goto out;
985 ret = -EFAULT;
986 } else {
987 vol_args = memdup_user(arg, sizeof(*vol_args));
988 if (IS_ERR(vol_args))
989 return PTR_ERR(vol_args);
990 name = vol_args->name;
991 fd = vol_args->fd;
992 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
993
994 ret = btrfs_ioctl_snap_create_transid(file, name, fd,
995	subvol, NULL);
996	}
1008	}
1009
1009
1010 if (vol_args->flags & BTRFS_SUBVOL_CREATE_ASYNC)
1011 ptr = &transid;
1012 if (vol_args->flags & BTRFS_SUBVOL_RDONLY)
1013 readonly = true;
1014
1015 ret = btrfs_ioctl_snap_create_transid(file, vol_args->name,
1016 vol_args->fd, subvol,
1017 ptr, readonly);
1018
1019 if (ret == 0 && ptr &&
1020 copy_to_user(arg +
1021 offsetof(struct btrfs_ioctl_vol_args_v2,
1022 transid), ptr, sizeof(*ptr)))
1023 ret = -EFAULT;
997 1024	out:
998 1025	kfree(vol_args);
999	kfree(vol_args_v2);
1026	return ret;
1027}
1000 1028
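With BTRFS_SUBVOL_RDONLY accepted in the v2 flags, a read-only snapshot can be requested from userspace like so (illustrative; the struct fields are the ones handled above):

	struct btrfs_ioctl_vol_args_v2 args;

	memset(&args, 0, sizeof(args));
	args.fd = src_fd;			/* fd of the source subvolume */
	args.flags = BTRFS_SUBVOL_RDONLY;
	strncpy(args.name, "snap-ro", BTRFS_SUBVOL_NAME_MAX);

	if (ioctl(dest_dir_fd, BTRFS_IOC_SNAP_CREATE_V2, &args) < 0)
		perror("BTRFS_IOC_SNAP_CREATE_V2");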
1029static noinline int btrfs_ioctl_subvol_getflags(struct file *file,
1030 void __user *arg)
1031{
1032 struct inode *inode = fdentry(file)->d_inode;
1033 struct btrfs_root *root = BTRFS_I(inode)->root;
1034 int ret = 0;
1035 u64 flags = 0;
1036
1037 if (inode->i_ino != BTRFS_FIRST_FREE_OBJECTID)
1038 return -EINVAL;
1039
1040 down_read(&root->fs_info->subvol_sem);
1041 if (btrfs_root_readonly(root))
1042 flags |= BTRFS_SUBVOL_RDONLY;
1043 up_read(&root->fs_info->subvol_sem);
1044
1045 if (copy_to_user(arg, &flags, sizeof(flags)))
1046 ret = -EFAULT;
1047
1048 return ret;
1049}
1050
1051static noinline int btrfs_ioctl_subvol_setflags(struct file *file,
1052 void __user *arg)
1053{
1054 struct inode *inode = fdentry(file)->d_inode;
1055 struct btrfs_root *root = BTRFS_I(inode)->root;
1056 struct btrfs_trans_handle *trans;
1057 u64 root_flags;
1058 u64 flags;
1059 int ret = 0;
1060
1061 if (root->fs_info->sb->s_flags & MS_RDONLY)
1062 return -EROFS;
1063
1064 if (inode->i_ino != BTRFS_FIRST_FREE_OBJECTID)
1065 return -EINVAL;
1066
1067 if (copy_from_user(&flags, arg, sizeof(flags)))
1068 return -EFAULT;
1069
1070 if (flags & ~BTRFS_SUBVOL_CREATE_ASYNC)
1071 return -EINVAL;
1072
1073 if (flags & ~BTRFS_SUBVOL_RDONLY)
1074 return -EOPNOTSUPP;
1075
1076 down_write(&root->fs_info->subvol_sem);
1077
1078 /* nothing to do */
1079 if (!!(flags & BTRFS_SUBVOL_RDONLY) == btrfs_root_readonly(root))
1080 goto out;
1081
1082 root_flags = btrfs_root_flags(&root->root_item);
1083 if (flags & BTRFS_SUBVOL_RDONLY)
1084 btrfs_set_root_flags(&root->root_item,
1085 root_flags | BTRFS_ROOT_SUBVOL_RDONLY);
1086 else
1087 btrfs_set_root_flags(&root->root_item,
1088 root_flags & ~BTRFS_ROOT_SUBVOL_RDONLY);
1089
1090 trans = btrfs_start_transaction(root, 1);
1091 if (IS_ERR(trans)) {
1092 ret = PTR_ERR(trans);
1093 goto out_reset;
1094 }
1095
1096 ret = btrfs_update_root(trans, root,
1097 &root->root_key, &root->root_item);
1098
1099 btrfs_commit_transaction(trans, root);
1100out_reset:
1101 if (ret)
1102 btrfs_set_root_flags(&root->root_item, root_flags);
1103out:
1104 up_write(&root->fs_info->subvol_sem);
1001 return ret; 1105 return ret;
1002} 1106}
1003 1107
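The pair of ioctls above forms a read-modify-write cycle over a single u64 of flags. For illustration, a minimal user-space sketch (not part of the patch; per the checks above, the descriptor must refer to the root of a subvolume, and the command encodings mirror the header below, even though GETFLAGS copies the value back out):

#include <fcntl.h>
#include <stdint.h>
#include <sys/ioctl.h>
#include <unistd.h>

#define BTRFS_IOCTL_MAGIC 0x94
#define BTRFS_SUBVOL_RDONLY (1ULL << 1)
#define BTRFS_IOC_SUBVOL_GETFLAGS _IOW(BTRFS_IOCTL_MAGIC, 25, uint64_t)
#define BTRFS_IOC_SUBVOL_SETFLAGS _IOW(BTRFS_IOCTL_MAGIC, 26, uint64_t)

/* toggle the read-only bit of the subvolume rooted at path */
int set_subvol_readonly(const char *path, int readonly)
{
	uint64_t flags;
	int ret;
	int fd = open(path, O_RDONLY);

	if (fd < 0)
		return -1;
	/* the kernel side copy_to_user()s the current flags into 'flags' */
	ret = ioctl(fd, BTRFS_IOC_SUBVOL_GETFLAGS, &flags);
	if (ret == 0) {
		if (readonly)
			flags |= BTRFS_SUBVOL_RDONLY;
		else
			flags &= ~BTRFS_SUBVOL_RDONLY;
		ret = ioctl(fd, BTRFS_IOC_SUBVOL_SETFLAGS, &flags);
	}
	close(fd);
	return ret;
}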
@@ -1509,6 +1613,9 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
1509 struct btrfs_ioctl_defrag_range_args *range; 1613 struct btrfs_ioctl_defrag_range_args *range;
1510 int ret; 1614 int ret;
1511 1615
1616 if (btrfs_root_readonly(root))
1617 return -EROFS;
1618
1512 ret = mnt_want_write(file->f_path.mnt); 1619 ret = mnt_want_write(file->f_path.mnt);
1513 if (ret) 1620 if (ret)
1514 return ret; 1621 return ret;
@@ -1637,6 +1744,9 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1637 if (!(file->f_mode & FMODE_WRITE) || (file->f_flags & O_APPEND)) 1744 if (!(file->f_mode & FMODE_WRITE) || (file->f_flags & O_APPEND))
1638 return -EINVAL; 1745 return -EINVAL;
1639 1746
1747 if (btrfs_root_readonly(root))
1748 return -EROFS;
1749
1640 ret = mnt_want_write(file->f_path.mnt); 1750 ret = mnt_want_write(file->f_path.mnt);
1641 if (ret) 1751 if (ret)
1642 return ret; 1752 return ret;
@@ -1958,6 +2068,10 @@ static long btrfs_ioctl_trans_start(struct file *file)
1958 if (file->private_data) 2068 if (file->private_data)
1959 goto out; 2069 goto out;
1960 2070
2071 ret = -EROFS;
2072 if (btrfs_root_readonly(root))
2073 goto out;
2074
1961 ret = mnt_want_write(file->f_path.mnt); 2075 ret = mnt_want_write(file->f_path.mnt);
1962 if (ret) 2076 if (ret)
1963 goto out; 2077 goto out;
@@ -2257,13 +2371,17 @@ long btrfs_ioctl(struct file *file, unsigned int
2257 case FS_IOC_GETVERSION: 2371 case FS_IOC_GETVERSION:
2258 return btrfs_ioctl_getversion(file, argp); 2372 return btrfs_ioctl_getversion(file, argp);
2259 case BTRFS_IOC_SNAP_CREATE: 2373 case BTRFS_IOC_SNAP_CREATE:
2260 return btrfs_ioctl_snap_create(file, argp, 0, 0);
2374 return btrfs_ioctl_snap_create(file, argp, 0);
2261 case BTRFS_IOC_SNAP_CREATE_V2: 2375 case BTRFS_IOC_SNAP_CREATE_V2:
2262 return btrfs_ioctl_snap_create(file, argp, 0, 1);
2376 return btrfs_ioctl_snap_create_v2(file, argp, 0);
2263 case BTRFS_IOC_SUBVOL_CREATE: 2377 case BTRFS_IOC_SUBVOL_CREATE:
2264 return btrfs_ioctl_snap_create(file, argp, 1, 0);
2378 return btrfs_ioctl_snap_create(file, argp, 1);
2265 case BTRFS_IOC_SNAP_DESTROY: 2379 case BTRFS_IOC_SNAP_DESTROY:
2266 return btrfs_ioctl_snap_destroy(file, argp); 2380 return btrfs_ioctl_snap_destroy(file, argp);
2381 case BTRFS_IOC_SUBVOL_GETFLAGS:
2382 return btrfs_ioctl_subvol_getflags(file, argp);
2383 case BTRFS_IOC_SUBVOL_SETFLAGS:
2384 return btrfs_ioctl_subvol_setflags(file, argp);
2267 case BTRFS_IOC_DEFAULT_SUBVOL: 2385 case BTRFS_IOC_DEFAULT_SUBVOL:
2268 return btrfs_ioctl_default_subvol(file, argp); 2386 return btrfs_ioctl_default_subvol(file, argp);
2269 case BTRFS_IOC_DEFRAG: 2387 case BTRFS_IOC_DEFRAG:
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index c344d12c646b..8fb382167b13 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -31,6 +31,7 @@ struct btrfs_ioctl_vol_args {
31}; 31};
32 32
33#define BTRFS_SUBVOL_CREATE_ASYNC (1ULL << 0) 33#define BTRFS_SUBVOL_CREATE_ASYNC (1ULL << 0)
34#define BTRFS_SUBVOL_RDONLY (1ULL << 1)
34 35
35#define BTRFS_SUBVOL_NAME_MAX 4039 36#define BTRFS_SUBVOL_NAME_MAX 4039
36struct btrfs_ioctl_vol_args_v2 { 37struct btrfs_ioctl_vol_args_v2 {
@@ -133,8 +134,15 @@ struct btrfs_ioctl_defrag_range_args {
133 */ 134 */
134 __u32 extent_thresh; 135 __u32 extent_thresh;
135 136
137 /*
138 * which compression method to use if turning on compression
139 * for this defrag operation. If unspecified, zlib will
140 * be used
141 */
142 __u32 compress_type;
143
136 /* spare for later */ 144 /* spare for later */
137 __u32 unused[5];
145 __u32 unused[4];
138}; 146};
139 147
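For illustration, a user-space defragmentation request that opts in to LZO recompression through the new field could look like the sketch below; BTRFS_IOC_DEFRAG_RANGE and BTRFS_DEFRAG_RANGE_COMPRESS are declared elsewhere in this header, and the numeric value of BTRFS_COMPRESS_LZO is an assumption taken from ctree.h in this series:

#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>
#include "ioctl.h"		/* this header */

#define BTRFS_COMPRESS_LZO 2	/* assumed value, from ctree.h */

int defrag_file_lzo(int fd)
{
	struct btrfs_ioctl_defrag_range_args range;

	memset(&range, 0, sizeof(range));
	range.len = (uint64_t)-1;	/* the whole file */
	range.flags = BTRFS_DEFRAG_RANGE_COMPRESS;
	range.compress_type = BTRFS_COMPRESS_LZO;

	return ioctl(fd, BTRFS_IOC_DEFRAG_RANGE, &range);
}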
140struct btrfs_ioctl_space_info { 148struct btrfs_ioctl_space_info {
@@ -193,4 +201,6 @@ struct btrfs_ioctl_space_args {
193#define BTRFS_IOC_WAIT_SYNC _IOW(BTRFS_IOCTL_MAGIC, 22, __u64) 201#define BTRFS_IOC_WAIT_SYNC _IOW(BTRFS_IOCTL_MAGIC, 22, __u64)
194#define BTRFS_IOC_SNAP_CREATE_V2 _IOW(BTRFS_IOCTL_MAGIC, 23, \ 202#define BTRFS_IOC_SNAP_CREATE_V2 _IOW(BTRFS_IOCTL_MAGIC, 23, \
195 struct btrfs_ioctl_vol_args_v2) 203 struct btrfs_ioctl_vol_args_v2)
204#define BTRFS_IOC_SUBVOL_GETFLAGS _IOW(BTRFS_IOCTL_MAGIC, 25, __u64)
205#define BTRFS_IOC_SUBVOL_SETFLAGS _IOW(BTRFS_IOCTL_MAGIC, 26, __u64)
196#endif 206#endif
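Together with BTRFS_IOC_WAIT_SYNC above, the v2 interface allows fire-and-forget snapshots: request BTRFS_SUBVOL_CREATE_ASYNC, read back the transaction id that btrfs_ioctl_snap_create_v2() copies out, and wait on it only when durability actually matters. A user-space sketch under those assumptions (dirfd names the directory that will contain the snapshot, fd the subvolume being snapshotted):

#include <string.h>
#include <sys/ioctl.h>
#include "ioctl.h"		/* this header */

int snapshot_async_readonly(int fd, int dirfd, const char *name)
{
	struct btrfs_ioctl_vol_args_v2 args;

	memset(&args, 0, sizeof(args));
	args.fd = fd;
	args.flags = BTRFS_SUBVOL_CREATE_ASYNC | BTRFS_SUBVOL_RDONLY;
	strncpy(args.name, name, BTRFS_SUBVOL_NAME_MAX);

	if (ioctl(dirfd, BTRFS_IOC_SNAP_CREATE_V2, &args) < 0)
		return -1;

	/* the kernel wrote the transaction id into args.transid */
	return ioctl(dirfd, BTRFS_IOC_WAIT_SYNC, &args.transid);
}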
diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
new file mode 100644
index 000000000000..cc9b450399df
--- /dev/null
+++ b/fs/btrfs/lzo.c
@@ -0,0 +1,420 @@
1/*
2 * Copyright (C) 2008 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/kernel.h>
20#include <linux/slab.h>
21#include <linux/vmalloc.h>
22#include <linux/init.h>
23#include <linux/err.h>
24#include <linux/sched.h>
25#include <linux/pagemap.h>
26#include <linux/bio.h>
27#include <linux/lzo.h>
28#include "compression.h"
29
30#define LZO_LEN 4
31
32struct workspace {
33 void *mem;
34 void *buf; /* where decompressed data goes */
35 void *cbuf; /* where compressed data goes */
36 struct list_head list;
37};
38
39static void lzo_free_workspace(struct list_head *ws)
40{
41 struct workspace *workspace = list_entry(ws, struct workspace, list);
42
43 vfree(workspace->buf);
44 vfree(workspace->cbuf);
45 vfree(workspace->mem);
46 kfree(workspace);
47}
48
49static struct list_head *lzo_alloc_workspace(void)
50{
51 struct workspace *workspace;
52
53 workspace = kzalloc(sizeof(*workspace), GFP_NOFS);
54 if (!workspace)
55 return ERR_PTR(-ENOMEM);
56
57 workspace->mem = vmalloc(LZO1X_MEM_COMPRESS);
58 workspace->buf = vmalloc(lzo1x_worst_compress(PAGE_CACHE_SIZE));
59 workspace->cbuf = vmalloc(lzo1x_worst_compress(PAGE_CACHE_SIZE));
60 if (!workspace->mem || !workspace->buf || !workspace->cbuf)
61 goto fail;
62
63 INIT_LIST_HEAD(&workspace->list);
64
65 return &workspace->list;
66fail:
67 lzo_free_workspace(&workspace->list);
68 return ERR_PTR(-ENOMEM);
69}
70
71static inline void write_compress_length(char *buf, size_t len)
72{
73 __le32 dlen;
74
75 dlen = cpu_to_le32(len);
76 memcpy(buf, &dlen, LZO_LEN);
77}
78
79static inline size_t read_compress_length(char *buf)
80{
81 __le32 dlen;
82
83 memcpy(&dlen, buf, LZO_LEN);
84 return le32_to_cpu(dlen);
85}
86
87static int lzo_compress_pages(struct list_head *ws,
88 struct address_space *mapping,
89 u64 start, unsigned long len,
90 struct page **pages,
91 unsigned long nr_dest_pages,
92 unsigned long *out_pages,
93 unsigned long *total_in,
94 unsigned long *total_out,
95 unsigned long max_out)
96{
97 struct workspace *workspace = list_entry(ws, struct workspace, list);
98 int ret = 0;
99 char *data_in;
100 char *cpage_out;
101 int nr_pages = 0;
102 struct page *in_page = NULL;
103 struct page *out_page = NULL;
104 unsigned long bytes_left;
105
106 size_t in_len;
107 size_t out_len;
108 char *buf;
109 unsigned long tot_in = 0;
110 unsigned long tot_out = 0;
111 unsigned long pg_bytes_left;
112 unsigned long out_offset;
113 unsigned long bytes;
114
115 *out_pages = 0;
116 *total_out = 0;
117 *total_in = 0;
118
119 in_page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT);
120 data_in = kmap(in_page);
121
122 /*
123 * store the size of all chunks of compressed data in
124 * the first 4 bytes
125 */
126 out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
127 if (out_page == NULL) {
128 ret = -ENOMEM;
129 goto out;
130 }
131 cpage_out = kmap(out_page);
132 out_offset = LZO_LEN;
133 tot_out = LZO_LEN;
134 pages[0] = out_page;
135 nr_pages = 1;
136 pg_bytes_left = PAGE_CACHE_SIZE - LZO_LEN;
137
138 /* compress at most one page of data each time */
139 in_len = min(len, PAGE_CACHE_SIZE);
140 while (tot_in < len) {
141 ret = lzo1x_1_compress(data_in, in_len, workspace->cbuf,
142 &out_len, workspace->mem);
143 if (ret != LZO_E_OK) {
144 printk(KERN_DEBUG "btrfs lzo compress in loop returned %d\n",
145 ret);
146 ret = -1;
147 goto out;
148 }
149
150 /* store the size of this chunk of compressed data */
151 write_compress_length(cpage_out + out_offset, out_len);
152 tot_out += LZO_LEN;
153 out_offset += LZO_LEN;
154 pg_bytes_left -= LZO_LEN;
155
156 tot_in += in_len;
157 tot_out += out_len;
158
159 /* copy bytes from the working buffer into the pages */
160 buf = workspace->cbuf;
161 while (out_len) {
162 bytes = min_t(unsigned long, pg_bytes_left, out_len);
163
164 memcpy(cpage_out + out_offset, buf, bytes);
165
166 out_len -= bytes;
167 pg_bytes_left -= bytes;
168 buf += bytes;
169 out_offset += bytes;
170
171 /*
172 * we need another page for writing out.
173 *
174 * Note if there's less than 4 bytes left, we just
175 * skip to a new page.
176 */
177 if ((out_len == 0 && pg_bytes_left < LZO_LEN) ||
178 pg_bytes_left == 0) {
179 if (pg_bytes_left) {
180 memset(cpage_out + out_offset, 0,
181 pg_bytes_left);
182 tot_out += pg_bytes_left;
183 }
184
185 /* we're done, don't allocate new page */
186 if (out_len == 0 && tot_in >= len)
187 break;
188
189 kunmap(out_page);
190 if (nr_pages == nr_dest_pages) {
191 out_page = NULL;
192 ret = -1;
193 goto out;
194 }
195
196 out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
197 if (out_page == NULL) {
198 ret = -ENOMEM;
199 goto out;
200 }
201 cpage_out = kmap(out_page);
202 pages[nr_pages++] = out_page;
203
204 pg_bytes_left = PAGE_CACHE_SIZE;
205 out_offset = 0;
206 }
207 }
208
209 /* we're making it bigger, give up */
210 if (tot_in > 8192 && tot_in < tot_out)
211 goto out;
212
213 /* we're all done */
214 if (tot_in >= len)
215 break;
216
217 if (tot_out > max_out)
218 break;
219
220 bytes_left = len - tot_in;
221 kunmap(in_page);
222 page_cache_release(in_page);
223
224 start += PAGE_CACHE_SIZE;
225 in_page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT);
226 data_in = kmap(in_page);
227 in_len = min(bytes_left, PAGE_CACHE_SIZE);
228 }
229
230 if (tot_out > tot_in)
231 goto out;
232
233 /* store the size of all chunks of compressed data */
234 cpage_out = kmap(pages[0]);
235 write_compress_length(cpage_out, tot_out);
236
237 kunmap(pages[0]);
238
239 ret = 0;
240 *total_out = tot_out;
241 *total_in = tot_in;
242out:
243 *out_pages = nr_pages;
244 if (out_page)
245 kunmap(out_page);
246
247 if (in_page) {
248 kunmap(in_page);
249 page_cache_release(in_page);
250 }
251
252 return ret;
253}
254
255static int lzo_decompress_biovec(struct list_head *ws,
256 struct page **pages_in,
257 u64 disk_start,
258 struct bio_vec *bvec,
259 int vcnt,
260 size_t srclen)
261{
262 struct workspace *workspace = list_entry(ws, struct workspace, list);
263 int ret = 0, ret2;
264 char *data_in;
265 unsigned long page_in_index = 0;
266 unsigned long page_out_index = 0;
267 unsigned long total_pages_in = (srclen + PAGE_CACHE_SIZE - 1) /
268 PAGE_CACHE_SIZE;
269 unsigned long buf_start;
270 unsigned long buf_offset = 0;
271 unsigned long bytes;
272 unsigned long working_bytes;
273 unsigned long pg_offset;
274
275 size_t in_len;
276 size_t out_len;
277 unsigned long in_offset;
278 unsigned long in_page_bytes_left;
279 unsigned long tot_in;
280 unsigned long tot_out;
281 unsigned long tot_len;
282 char *buf;
283
284 data_in = kmap(pages_in[0]);
285 tot_len = read_compress_length(data_in);
286
287 tot_in = LZO_LEN;
288 in_offset = LZO_LEN;
289 tot_len = min_t(size_t, srclen, tot_len);
290 in_page_bytes_left = PAGE_CACHE_SIZE - LZO_LEN;
291
292 tot_out = 0;
293 pg_offset = 0;
294
295 while (tot_in < tot_len) {
296 in_len = read_compress_length(data_in + in_offset);
297 in_page_bytes_left -= LZO_LEN;
298 in_offset += LZO_LEN;
299 tot_in += LZO_LEN;
300
301 tot_in += in_len;
302 working_bytes = in_len;
303
304 /* fast path: avoid using the working buffer */
305 if (in_page_bytes_left >= in_len) {
306 buf = data_in + in_offset;
307 bytes = in_len;
308 goto cont;
309 }
310
311 /* copy bytes from the pages into the working buffer */
312 buf = workspace->cbuf;
313 buf_offset = 0;
314 while (working_bytes) {
315 bytes = min(working_bytes, in_page_bytes_left);
316
317 memcpy(buf + buf_offset, data_in + in_offset, bytes);
318 buf_offset += bytes;
319cont:
320 working_bytes -= bytes;
321 in_page_bytes_left -= bytes;
322 in_offset += bytes;
323
324 /* check if we need to pick another page */
325 if ((working_bytes == 0 && in_page_bytes_left < LZO_LEN)
326 || in_page_bytes_left == 0) {
327 tot_in += in_page_bytes_left;
328
329 if (working_bytes == 0 && tot_in >= tot_len)
330 break;
331
332 kunmap(pages_in[page_in_index]);
333 page_in_index++;
334 if (page_in_index >= total_pages_in) {
335 ret = -1;
336 data_in = NULL;
337 goto done;
338 }
339 data_in = kmap(pages_in[page_in_index]);
340
341 in_page_bytes_left = PAGE_CACHE_SIZE;
342 in_offset = 0;
343 }
344 }
345
346 out_len = lzo1x_worst_compress(PAGE_CACHE_SIZE);
347 ret = lzo1x_decompress_safe(buf, in_len, workspace->buf,
348 &out_len);
349 if (ret != LZO_E_OK) {
350 printk(KERN_WARNING "btrfs decompress failed\n");
351 ret = -1;
352 break;
353 }
354
355 buf_start = tot_out;
356 tot_out += out_len;
357
358 ret2 = btrfs_decompress_buf2page(workspace->buf, buf_start,
359 tot_out, disk_start,
360 bvec, vcnt,
361 &page_out_index, &pg_offset);
362 if (ret2 == 0)
363 break;
364 }
365done:
366 if (data_in)
367 kunmap(pages_in[page_in_index]);
368 return ret;
369}
370
371static int lzo_decompress(struct list_head *ws, unsigned char *data_in,
372 struct page *dest_page,
373 unsigned long start_byte,
374 size_t srclen, size_t destlen)
375{
376 struct workspace *workspace = list_entry(ws, struct workspace, list);
377 size_t in_len;
378 size_t out_len;
379 size_t tot_len;
380 int ret = 0;
381 char *kaddr;
382 unsigned long bytes;
383
384 BUG_ON(srclen < LZO_LEN);
385
386 tot_len = read_compress_length(data_in);
387 data_in += LZO_LEN;
388
389 in_len = read_compress_length(data_in);
390 data_in += LZO_LEN;
391
392 out_len = PAGE_CACHE_SIZE;
393 ret = lzo1x_decompress_safe(data_in, in_len, workspace->buf, &out_len);
394 if (ret != LZO_E_OK) {
395 printk(KERN_WARNING "btrfs decompress failed!\n");
396 ret = -1;
397 goto out;
398 }
399
400 if (out_len < start_byte) {
401 ret = -1;
402 goto out;
403 }
404
405 bytes = min_t(unsigned long, destlen, out_len - start_byte);
406
407 kaddr = kmap_atomic(dest_page, KM_USER0);
408 memcpy(kaddr, workspace->buf + start_byte, bytes);
409 kunmap_atomic(kaddr, KM_USER0);
410out:
411 return ret;
412}
413
414struct btrfs_compress_op btrfs_lzo_compress = {
415 .alloc_workspace = lzo_alloc_workspace,
416 .free_workspace = lzo_free_workspace,
417 .compress_pages = lzo_compress_pages,
418 .decompress_biovec = lzo_decompress_biovec,
419 .decompress = lzo_decompress,
420};
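The framing that lzo_compress_pages() writes and both decompress paths parse: a 4-byte little-endian total length (headers and padding included) opens the extent, then each segment is a 4-byte length followed by that many compressed bytes; a length header never straddles a page, so when fewer than LZO_LEN bytes remain in a page they are zero padding and the next header starts at the following page boundary. A user-space model of that walk, for illustration only:

#include <stdint.h>
#include <string.h>

#define LZO_LEN   4
#define PAGE_SIZE 4096

static uint32_t read_le32(const uint8_t *p)
{
	uint32_t v;

	memcpy(&v, p, LZO_LEN);
	return v;	/* little-endian host assumed, like le32_to_cpu() */
}

/* count the compressed segments of one extent framed as described above */
int count_lzo_segments(const uint8_t *stream)
{
	uint32_t tot_len = read_le32(stream);
	uint32_t off = LZO_LEN;
	int nr = 0;

	while (off + LZO_LEN <= tot_len) {
		uint32_t room = PAGE_SIZE - (off % PAGE_SIZE);

		/* fewer than LZO_LEN bytes left in this page: zero padding */
		if (room < LZO_LEN) {
			off += room;
			continue;
		}
		off += LZO_LEN + read_le32(stream + off); /* header + payload */
		nr++;
	}
	return nr;
}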
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index ae7737e352c9..2b61e1ddcd99 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -172,7 +172,7 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
172 */ 172 */
173static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, 173static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
174 u64 start, u64 len, u64 disk_len, 174 u64 start, u64 len, u64 disk_len,
175 int type, int dio)
175 int type, int dio, int compress_type)
176{ 176{
177 struct btrfs_ordered_inode_tree *tree; 177 struct btrfs_ordered_inode_tree *tree;
178 struct rb_node *node; 178 struct rb_node *node;
@@ -189,6 +189,7 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
189 entry->disk_len = disk_len; 189 entry->disk_len = disk_len;
190 entry->bytes_left = len; 190 entry->bytes_left = len;
191 entry->inode = inode; 191 entry->inode = inode;
192 entry->compress_type = compress_type;
192 if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE) 193 if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE)
193 set_bit(type, &entry->flags); 194 set_bit(type, &entry->flags);
194 195
@@ -220,14 +221,25 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
220 u64 start, u64 len, u64 disk_len, int type) 221 u64 start, u64 len, u64 disk_len, int type)
221{ 222{
222 return __btrfs_add_ordered_extent(inode, file_offset, start, len, 223 return __btrfs_add_ordered_extent(inode, file_offset, start, len,
223 disk_len, type, 0);
224 disk_len, type, 0,
225 BTRFS_COMPRESS_NONE);
224} 226}
225 227
226int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset, 228int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset,
227 u64 start, u64 len, u64 disk_len, int type) 229 u64 start, u64 len, u64 disk_len, int type)
228{ 230{
229 return __btrfs_add_ordered_extent(inode, file_offset, start, len, 231 return __btrfs_add_ordered_extent(inode, file_offset, start, len,
230 disk_len, type, 1);
232 disk_len, type, 1,
233 BTRFS_COMPRESS_NONE);
234}
235
236int btrfs_add_ordered_extent_compress(struct inode *inode, u64 file_offset,
237 u64 start, u64 len, u64 disk_len,
238 int type, int compress_type)
239{
240 return __btrfs_add_ordered_extent(inode, file_offset, start, len,
241 disk_len, type, 0,
242 compress_type);
231} 243}
232 244
233/* 245/*
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 61dca83119dd..ff1f69aa1883 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -68,7 +68,7 @@ struct btrfs_ordered_sum {
68 68
69#define BTRFS_ORDERED_NOCOW 2 /* set when we want to write in place */ 69#define BTRFS_ORDERED_NOCOW 2 /* set when we want to write in place */
70 70
71#define BTRFS_ORDERED_COMPRESSED 3 /* writing a compressed extent */
71#define BTRFS_ORDERED_COMPRESSED 3 /* writing a zlib compressed extent */
72 72
73#define BTRFS_ORDERED_PREALLOC 4 /* set when writing to prealloced extent */ 73#define BTRFS_ORDERED_PREALLOC 4 /* set when writing to prealloced extent */
74 74
@@ -93,6 +93,9 @@ struct btrfs_ordered_extent {
93 /* flags (described above) */ 93 /* flags (described above) */
94 unsigned long flags; 94 unsigned long flags;
95 95
96 /* compression algorithm */
97 int compress_type;
98
96 /* reference count */ 99 /* reference count */
97 atomic_t refs; 100 atomic_t refs;
98 101
@@ -148,6 +151,9 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
148 u64 start, u64 len, u64 disk_len, int type); 151 u64 start, u64 len, u64 disk_len, int type);
149int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset, 152int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset,
150 u64 start, u64 len, u64 disk_len, int type); 153 u64 start, u64 len, u64 disk_len, int type);
154int btrfs_add_ordered_extent_compress(struct inode *inode, u64 file_offset,
155 u64 start, u64 len, u64 disk_len,
156 int type, int compress_type);
151int btrfs_add_ordered_sum(struct inode *inode, 157int btrfs_add_ordered_sum(struct inode *inode,
152 struct btrfs_ordered_extent *entry, 158 struct btrfs_ordered_extent *entry,
153 struct btrfs_ordered_sum *sum); 159 struct btrfs_ordered_sum *sum);
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 883c6fa1367e..b2130c46fdb5 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -54,6 +54,90 @@
54 54
55static const struct super_operations btrfs_super_ops; 55static const struct super_operations btrfs_super_ops;
56 56
57static const char *btrfs_decode_error(struct btrfs_fs_info *fs_info, int errno,
58 char nbuf[16])
59{
60 char *errstr = NULL;
61
62 switch (errno) {
63 case -EIO:
64 errstr = "IO failure";
65 break;
66 case -ENOMEM:
67 errstr = "Out of memory";
68 break;
69 case -EROFS:
70 errstr = "Readonly filesystem";
71 break;
72 default:
73 if (nbuf) {
74 if (snprintf(nbuf, 16, "error %d", -errno) >= 0)
75 errstr = nbuf;
76 }
77 break;
78 }
79
80 return errstr;
81}
82
83static void __save_error_info(struct btrfs_fs_info *fs_info)
84{
85 /*
86 * today we only save the error info into ram. Long term we'll
87 * also send it down to the disk
88 */
89 fs_info->fs_state = BTRFS_SUPER_FLAG_ERROR;
90}
91
92/* NOTE:
93 * We move the write_super work to umount time in order to avoid a
94 * deadlock, since umount holds all the locks.
95 */
96static void save_error_info(struct btrfs_fs_info *fs_info)
97{
98 __save_error_info(fs_info);
99}
100
101/* btrfs handles errors by forcing the filesystem read-only */
102static void btrfs_handle_error(struct btrfs_fs_info *fs_info)
103{
104 struct super_block *sb = fs_info->sb;
105
106 if (sb->s_flags & MS_RDONLY)
107 return;
108
109 if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
110 sb->s_flags |= MS_RDONLY;
111 printk(KERN_INFO "btrfs is forced readonly\n");
112 }
113}
114
115/*
116 * __btrfs_std_error decodes expected errors from the caller and
117 * invokes the appropriate error response.
118 */
119void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
120 unsigned int line, int errno)
121{
122 struct super_block *sb = fs_info->sb;
123 char nbuf[16];
124 const char *errstr;
125
126 /*
127 * Special case: if the error is EROFS, and we're already
128 * under MS_RDONLY, then it is safe here.
129 */
130 if (errno == -EROFS && (sb->s_flags & MS_RDONLY))
131 return;
132
133 errstr = btrfs_decode_error(fs_info, errno, nbuf);
134 printk(KERN_CRIT "BTRFS error (device %s) in %s:%d: %s\n",
135 sb->s_id, function, line, errstr);
136 save_error_info(fs_info);
137
138 btrfs_handle_error(fs_info);
139}
140
57static void btrfs_put_super(struct super_block *sb) 141static void btrfs_put_super(struct super_block *sb)
58{ 142{
59 struct btrfs_root *root = btrfs_sb(sb); 143 struct btrfs_root *root = btrfs_sb(sb);
@@ -69,9 +153,9 @@ enum {
69 Opt_degraded, Opt_subvol, Opt_subvolid, Opt_device, Opt_nodatasum, 153 Opt_degraded, Opt_subvol, Opt_subvolid, Opt_device, Opt_nodatasum,
70 Opt_nodatacow, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, Opt_ssd, 154 Opt_nodatacow, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, Opt_ssd,
71 Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl, Opt_compress, 155 Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl, Opt_compress,
72 Opt_compress_force, Opt_notreelog, Opt_ratio, Opt_flushoncommit,
73 Opt_discard, Opt_space_cache, Opt_clear_cache, Opt_err,
74 Opt_user_subvol_rm_allowed,
156 Opt_compress_type, Opt_compress_force, Opt_compress_force_type,
157 Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard,
158 Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed, Opt_err,
75}; 159};
76 160
77static match_table_t tokens = { 161static match_table_t tokens = {
@@ -86,7 +170,9 @@ static match_table_t tokens = {
86 {Opt_alloc_start, "alloc_start=%s"}, 170 {Opt_alloc_start, "alloc_start=%s"},
87 {Opt_thread_pool, "thread_pool=%d"}, 171 {Opt_thread_pool, "thread_pool=%d"},
88 {Opt_compress, "compress"}, 172 {Opt_compress, "compress"},
173 {Opt_compress_type, "compress=%s"},
89 {Opt_compress_force, "compress-force"}, 174 {Opt_compress_force, "compress-force"},
175 {Opt_compress_force_type, "compress-force=%s"},
90 {Opt_ssd, "ssd"}, 176 {Opt_ssd, "ssd"},
91 {Opt_ssd_spread, "ssd_spread"}, 177 {Opt_ssd_spread, "ssd_spread"},
92 {Opt_nossd, "nossd"}, 178 {Opt_nossd, "nossd"},
@@ -112,6 +198,8 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
112 char *p, *num, *orig; 198 char *p, *num, *orig;
113 int intarg; 199 int intarg;
114 int ret = 0; 200 int ret = 0;
201 char *compress_type;
202 bool compress_force = false;
115 203
116 if (!options) 204 if (!options)
117 return 0; 205 return 0;
@@ -154,14 +242,32 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
154 btrfs_set_opt(info->mount_opt, NODATACOW); 242 btrfs_set_opt(info->mount_opt, NODATACOW);
155 btrfs_set_opt(info->mount_opt, NODATASUM); 243 btrfs_set_opt(info->mount_opt, NODATASUM);
156 break; 244 break;
157 case Opt_compress:
158 printk(KERN_INFO "btrfs: use compression\n");
159 btrfs_set_opt(info->mount_opt, COMPRESS);
160 break;
161 case Opt_compress_force: 245 case Opt_compress_force:
162 printk(KERN_INFO "btrfs: forcing compression\n");
163 btrfs_set_opt(info->mount_opt, FORCE_COMPRESS);
246 case Opt_compress_force_type:
247 compress_force = true;
248 case Opt_compress:
249 case Opt_compress_type:
250 if (token == Opt_compress ||
251 token == Opt_compress_force ||
252 strcmp(args[0].from, "zlib") == 0) {
253 compress_type = "zlib";
254 info->compress_type = BTRFS_COMPRESS_ZLIB;
255 } else if (strcmp(args[0].from, "lzo") == 0) {
256 compress_type = "lzo";
257 info->compress_type = BTRFS_COMPRESS_LZO;
258 } else {
259 ret = -EINVAL;
260 goto out;
261 }
262
164 btrfs_set_opt(info->mount_opt, COMPRESS); 263 btrfs_set_opt(info->mount_opt, COMPRESS);
264 if (compress_force) {
265 btrfs_set_opt(info->mount_opt, FORCE_COMPRESS);
266 pr_info("btrfs: force %s compression\n",
267 compress_type);
268 } else
269 pr_info("btrfs: use %s compression\n",
270 compress_type);
165 break; 271 break;
166 case Opt_ssd: 272 case Opt_ssd:
167 printk(KERN_INFO "btrfs: use ssd allocation scheme\n"); 273 printk(KERN_INFO "btrfs: use ssd allocation scheme\n");
@@ -460,6 +566,7 @@ static int btrfs_fill_super(struct super_block *sb,
460 sb->s_maxbytes = MAX_LFS_FILESIZE; 566 sb->s_maxbytes = MAX_LFS_FILESIZE;
461 sb->s_magic = BTRFS_SUPER_MAGIC; 567 sb->s_magic = BTRFS_SUPER_MAGIC;
462 sb->s_op = &btrfs_super_ops; 568 sb->s_op = &btrfs_super_ops;
569 sb->s_d_op = &btrfs_dentry_operations;
463 sb->s_export_op = &btrfs_export_ops; 570 sb->s_export_op = &btrfs_export_ops;
464 sb->s_xattr = btrfs_xattr_handlers; 571 sb->s_xattr = btrfs_xattr_handlers;
465 sb->s_time_gran = 1; 572 sb->s_time_gran = 1;
@@ -752,6 +859,127 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
752 return 0; 859 return 0;
753} 860}
754 861
862/*
863 * Helper to calculate the free space on the devices that can be used to store
864 * file data.
865 */
866static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
867{
868 struct btrfs_fs_info *fs_info = root->fs_info;
869 struct btrfs_device_info *devices_info;
870 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
871 struct btrfs_device *device;
872 u64 skip_space;
873 u64 type;
874 u64 avail_space;
875 u64 used_space;
876 u64 min_stripe_size;
877 int min_stripes = 1;
878 int i = 0, nr_devices;
879 int ret;
880
881 nr_devices = fs_info->fs_devices->rw_devices;
882 BUG_ON(!nr_devices);
883
884 devices_info = kmalloc(sizeof(*devices_info) * nr_devices,
885 GFP_NOFS);
886 if (!devices_info)
887 return -ENOMEM;
888
889 /* calc min stripe number for data space allocation */
890 type = btrfs_get_alloc_profile(root, 1);
891 if (type & BTRFS_BLOCK_GROUP_RAID0)
892 min_stripes = 2;
893 else if (type & BTRFS_BLOCK_GROUP_RAID1)
894 min_stripes = 2;
895 else if (type & BTRFS_BLOCK_GROUP_RAID10)
896 min_stripes = 4;
897
898 if (type & BTRFS_BLOCK_GROUP_DUP)
899 min_stripe_size = 2 * BTRFS_STRIPE_LEN;
900 else
901 min_stripe_size = BTRFS_STRIPE_LEN;
902
903 list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
904 if (!device->in_fs_metadata)
905 continue;
906
907 avail_space = device->total_bytes - device->bytes_used;
908
909 /* align with stripe_len */
910 do_div(avail_space, BTRFS_STRIPE_LEN);
911 avail_space *= BTRFS_STRIPE_LEN;
912
913 /*
914 * In order to avoid overwriting the superblock on the drive,
915 * btrfs starts at an offset of at least 1MB when doing chunk
916 * allocation.
917 */
918 skip_space = 1024 * 1024;
919
920 /* user can set the offset in fs_info->alloc_start. */
921 if (fs_info->alloc_start + BTRFS_STRIPE_LEN <=
922 device->total_bytes)
923 skip_space = max(fs_info->alloc_start, skip_space);
924
925 /*
926 * btrfs can not use the free space in [0, skip_space - 1],
927 * we must subtract it from the total. In order to implement
928 * it, we account the used space in this range first.
929 */
930 ret = btrfs_account_dev_extents_size(device, 0, skip_space - 1,
931 &used_space);
932 if (ret) {
933 kfree(devices_info);
934 return ret;
935 }
936
937 /* calc the free space in [0, skip_space - 1] */
938 skip_space -= used_space;
939
940 /*
941 * we can not use the free space in [0, skip_space - 1], so we
942 * subtract it from the total.
943 */
944 if (avail_space && avail_space >= skip_space)
945 avail_space -= skip_space;
946 else
947 avail_space = 0;
948
949 if (avail_space < min_stripe_size)
950 continue;
951
952 devices_info[i].dev = device;
953 devices_info[i].max_avail = avail_space;
954
955 i++;
956 }
957
958 nr_devices = i;
959
960 btrfs_descending_sort_devices(devices_info, nr_devices);
961
962 i = nr_devices - 1;
963 avail_space = 0;
964 while (nr_devices >= min_stripes) {
965 if (devices_info[i].max_avail >= min_stripe_size) {
966 int j;
967 u64 alloc_size;
968
969 avail_space += devices_info[i].max_avail * min_stripes;
970 alloc_size = devices_info[i].max_avail;
971 for (j = i + 1 - min_stripes; j <= i; j++)
972 devices_info[j].max_avail -= alloc_size;
973 }
974 i--;
975 nr_devices--;
976 }
977
978 kfree(devices_info);
979 *free_bytes = avail_space;
980 return 0;
981}
982
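A worked example of the greedy pairing loop above, with assumed sizes: three writable devices holding 10, 8 and 3 GiB of stripe-aligned free space, RAID1 data (min_stripes = 2). After the descending sort the walk starts from the smallest device: the 3 GiB device pairs with the 8 GiB one (avail_space += 3 * 2, both shrink by 3, leaving 10/5/0), then the remaining 5 GiB pairs against the 10 GiB device (avail_space += 5 * 2). The estimate is therefore 16 GiB of raw space (8 GiB of mirrored file data), and the 5 GiB tail that has no partner is correctly left out.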
755static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) 983static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
756{ 984{
757 struct btrfs_root *root = btrfs_sb(dentry->d_sb); 985 struct btrfs_root *root = btrfs_sb(dentry->d_sb);
@@ -759,17 +987,21 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
759 struct list_head *head = &root->fs_info->space_info; 987 struct list_head *head = &root->fs_info->space_info;
760 struct btrfs_space_info *found; 988 struct btrfs_space_info *found;
761 u64 total_used = 0; 989 u64 total_used = 0;
762 u64 total_used_data = 0;
990 u64 total_free_data = 0;
763 int bits = dentry->d_sb->s_blocksize_bits; 991 int bits = dentry->d_sb->s_blocksize_bits;
764 __be32 *fsid = (__be32 *)root->fs_info->fsid; 992 __be32 *fsid = (__be32 *)root->fs_info->fsid;
993 int ret;
765 994
995 /* holding chunk_mutex to avoid allocating new chunks */
996 mutex_lock(&root->fs_info->chunk_mutex);
766 rcu_read_lock(); 997 rcu_read_lock();
767 list_for_each_entry_rcu(found, head, list) { 998 list_for_each_entry_rcu(found, head, list) {
768 if (found->flags & (BTRFS_BLOCK_GROUP_METADATA |
769 BTRFS_BLOCK_GROUP_SYSTEM))
770 total_used_data += found->disk_total;
771 else
772 total_used_data += found->disk_used;
999 if (found->flags & BTRFS_BLOCK_GROUP_DATA) {
1000 total_free_data += found->disk_total - found->disk_used;
1001 total_free_data -=
1002 btrfs_account_ro_block_groups_free_space(found);
1003 }
1004
773 total_used += found->disk_used; 1005 total_used += found->disk_used;
774 } 1006 }
775 rcu_read_unlock(); 1007 rcu_read_unlock();
@@ -777,9 +1009,17 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
777 buf->f_namelen = BTRFS_NAME_LEN; 1009 buf->f_namelen = BTRFS_NAME_LEN;
778 buf->f_blocks = btrfs_super_total_bytes(disk_super) >> bits; 1010 buf->f_blocks = btrfs_super_total_bytes(disk_super) >> bits;
779 buf->f_bfree = buf->f_blocks - (total_used >> bits); 1011 buf->f_bfree = buf->f_blocks - (total_used >> bits);
780 buf->f_bavail = buf->f_blocks - (total_used_data >> bits);
781 buf->f_bsize = dentry->d_sb->s_blocksize; 1012 buf->f_bsize = dentry->d_sb->s_blocksize;
782 buf->f_type = BTRFS_SUPER_MAGIC; 1013 buf->f_type = BTRFS_SUPER_MAGIC;
1014 buf->f_bavail = total_free_data;
1015 ret = btrfs_calc_avail_data_space(root, &total_free_data);
1016 if (ret) {
1017 mutex_unlock(&root->fs_info->chunk_mutex);
1018 return ret;
1019 }
1020 buf->f_bavail += total_free_data;
1021 buf->f_bavail = buf->f_bavail >> bits;
1022 mutex_unlock(&root->fs_info->chunk_mutex);
783 1023
784 /* We treat it as constant endianness (it doesn't matter _which_) 1024 /* We treat it as constant endianness (it doesn't matter _which_)
785 because we want the fsid to come out the same whether mounted 1025 because we want the fsid to come out the same whether mounted
@@ -896,10 +1136,14 @@ static int __init init_btrfs_fs(void)
896 if (err) 1136 if (err)
897 return err; 1137 return err;
898 1138
899 err = btrfs_init_cachep();
1139 err = btrfs_init_compress();
900 if (err) 1140 if (err)
901 goto free_sysfs; 1141 goto free_sysfs;
902 1142
1143 err = btrfs_init_cachep();
1144 if (err)
1145 goto free_compress;
1146
903 err = extent_io_init(); 1147 err = extent_io_init();
904 if (err) 1148 if (err)
905 goto free_cachep; 1149 goto free_cachep;
@@ -927,6 +1171,8 @@ free_extent_io:
927 extent_io_exit(); 1171 extent_io_exit();
928free_cachep: 1172free_cachep:
929 btrfs_destroy_cachep(); 1173 btrfs_destroy_cachep();
1174free_compress:
1175 btrfs_exit_compress();
930free_sysfs: 1176free_sysfs:
931 btrfs_exit_sysfs(); 1177 btrfs_exit_sysfs();
932 return err; 1178 return err;
@@ -941,7 +1187,7 @@ static void __exit exit_btrfs_fs(void)
941 unregister_filesystem(&btrfs_fs_type); 1187 unregister_filesystem(&btrfs_fs_type);
942 btrfs_exit_sysfs(); 1188 btrfs_exit_sysfs();
943 btrfs_cleanup_fs_uuids(); 1189 btrfs_cleanup_fs_uuids();
944 btrfs_zlib_exit();
1190 btrfs_exit_compress();
945} 1191}
946 1192
947module_init(init_btrfs_fs) 1193module_init(init_btrfs_fs)
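The resulting option grammar: bare "compress" and "compress-force" keep their old meaning and select zlib, while "compress=zlib", "compress=lzo", "compress-force=zlib" and "compress-force=lzo" name the algorithm explicitly; any other name fails the mount with -EINVAL. For illustration (device and mount point are made up):

#include <sys/mount.h>

int mount_with_lzo(void)
{
	return mount("/dev/sdb1", "/mnt/scratch", "btrfs", 0,
		     "compress=lzo");
}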
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index f50e931fc217..bae5c7b8bbe2 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -181,6 +181,9 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
181 struct btrfs_trans_handle *h; 181 struct btrfs_trans_handle *h;
182 struct btrfs_transaction *cur_trans; 182 struct btrfs_transaction *cur_trans;
183 int ret; 183 int ret;
184
185 if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)
186 return ERR_PTR(-EROFS);
184again: 187again:
185 h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS); 188 h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS);
186 if (!h) 189 if (!h)
@@ -910,6 +913,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
910 u64 to_reserve = 0; 913 u64 to_reserve = 0;
911 u64 index = 0; 914 u64 index = 0;
912 u64 objectid; 915 u64 objectid;
916 u64 root_flags;
913 917
914 new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS); 918 new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS);
915 if (!new_root_item) { 919 if (!new_root_item) {
@@ -967,6 +971,13 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
967 btrfs_set_root_last_snapshot(&root->root_item, trans->transid); 971 btrfs_set_root_last_snapshot(&root->root_item, trans->transid);
968 memcpy(new_root_item, &root->root_item, sizeof(*new_root_item)); 972 memcpy(new_root_item, &root->root_item, sizeof(*new_root_item));
969 973
974 root_flags = btrfs_root_flags(new_root_item);
975 if (pending->readonly)
976 root_flags |= BTRFS_ROOT_SUBVOL_RDONLY;
977 else
978 root_flags &= ~BTRFS_ROOT_SUBVOL_RDONLY;
979 btrfs_set_root_flags(new_root_item, root_flags);
980
970 old = btrfs_lock_root_node(root); 981 old = btrfs_lock_root_node(root);
971 btrfs_cow_block(trans, root, old, NULL, 0, &old); 982 btrfs_cow_block(trans, root, old, NULL, 0, &old);
972 btrfs_set_lock_blocking(old); 983 btrfs_set_lock_blocking(old);
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index f104b57ad4ef..229a594cacd5 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -62,6 +62,7 @@ struct btrfs_pending_snapshot {
62 struct btrfs_block_rsv block_rsv; 62 struct btrfs_block_rsv block_rsv;
63 /* extra metadata reseration for relocation */ 63 /* extra metadata reseration for relocation */
64 int error; 64 int error;
65 bool readonly;
65 struct list_head list; 66 struct list_head list;
66}; 67};
67 68
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 6b9884507837..d158530233b7 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -22,6 +22,7 @@
22#include <linux/blkdev.h> 22#include <linux/blkdev.h>
23#include <linux/random.h> 23#include <linux/random.h>
24#include <linux/iocontext.h> 24#include <linux/iocontext.h>
25#include <linux/capability.h>
25#include <asm/div64.h> 26#include <asm/div64.h>
26#include "compat.h" 27#include "compat.h"
27#include "ctree.h" 28#include "ctree.h"
@@ -493,7 +494,7 @@ again:
493 continue; 494 continue;
494 495
495 if (device->bdev) { 496 if (device->bdev) {
496 close_bdev_exclusive(device->bdev, device->mode);
497 blkdev_put(device->bdev, device->mode);
497 device->bdev = NULL; 498 device->bdev = NULL;
498 fs_devices->open_devices--; 499 fs_devices->open_devices--;
499 } 500 }
@@ -527,7 +528,7 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
527 528
528 list_for_each_entry(device, &fs_devices->devices, dev_list) { 529 list_for_each_entry(device, &fs_devices->devices, dev_list) {
529 if (device->bdev) { 530 if (device->bdev) {
530 close_bdev_exclusive(device->bdev, device->mode);
531 blkdev_put(device->bdev, device->mode);
531 fs_devices->open_devices--; 532 fs_devices->open_devices--;
532 } 533 }
533 if (device->writeable) { 534 if (device->writeable) {
@@ -584,13 +585,15 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
584 int seeding = 1; 585 int seeding = 1;
585 int ret = 0; 586 int ret = 0;
586 587
588 flags |= FMODE_EXCL;
589
587 list_for_each_entry(device, head, dev_list) { 590 list_for_each_entry(device, head, dev_list) {
588 if (device->bdev) 591 if (device->bdev)
589 continue; 592 continue;
590 if (!device->name) 593 if (!device->name)
591 continue; 594 continue;
592 595
593 bdev = open_bdev_exclusive(device->name, flags, holder);
596 bdev = blkdev_get_by_path(device->name, flags, holder);
594 if (IS_ERR(bdev)) { 597 if (IS_ERR(bdev)) {
595 printk(KERN_INFO "open %s failed\n", device->name); 598 printk(KERN_INFO "open %s failed\n", device->name);
596 goto error; 599 goto error;
@@ -598,8 +601,10 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
598 set_blocksize(bdev, 4096); 601 set_blocksize(bdev, 4096);
599 602
600 bh = btrfs_read_dev_super(bdev); 603 bh = btrfs_read_dev_super(bdev);
601 if (!bh)
604 if (!bh) {
605 ret = -EINVAL;
602 goto error_close; 606 goto error_close;
607 }
603 608
604 disk_super = (struct btrfs_super_block *)bh->b_data; 609 disk_super = (struct btrfs_super_block *)bh->b_data;
605 devid = btrfs_stack_device_id(&disk_super->dev_item); 610 devid = btrfs_stack_device_id(&disk_super->dev_item);
@@ -642,7 +647,7 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
642error_brelse: 647error_brelse:
643 brelse(bh); 648 brelse(bh);
644error_close: 649error_close:
645 close_bdev_exclusive(bdev, FMODE_READ);
650 blkdev_put(bdev, flags);
646error: 651error:
647 continue; 652 continue;
648 } 653 }
@@ -688,7 +693,8 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
688 693
689 mutex_lock(&uuid_mutex); 694 mutex_lock(&uuid_mutex);
690 695
691 bdev = open_bdev_exclusive(path, flags, holder);
696 flags |= FMODE_EXCL;
697 bdev = blkdev_get_by_path(path, flags, holder);
692 698
693 if (IS_ERR(bdev)) { 699 if (IS_ERR(bdev)) {
694 ret = PTR_ERR(bdev); 700 ret = PTR_ERR(bdev);
@@ -700,7 +706,7 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
700 goto error_close; 706 goto error_close;
701 bh = btrfs_read_dev_super(bdev); 707 bh = btrfs_read_dev_super(bdev);
702 if (!bh) { 708 if (!bh) {
703 ret = -EIO;
709 ret = -EINVAL;
704 goto error_close; 710 goto error_close;
705 } 711 }
706 disk_super = (struct btrfs_super_block *)bh->b_data; 712 disk_super = (struct btrfs_super_block *)bh->b_data;
@@ -720,65 +726,173 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
720 726
721 brelse(bh); 727 brelse(bh);
722error_close: 728error_close:
723 close_bdev_exclusive(bdev, flags);
729 blkdev_put(bdev, flags);
724error: 730error:
725 mutex_unlock(&uuid_mutex); 731 mutex_unlock(&uuid_mutex);
726 return ret; 732 return ret;
727} 733}
728 734
735/* helper to account the used device space in the range */
736int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
737 u64 end, u64 *length)
738{
739 struct btrfs_key key;
740 struct btrfs_root *root = device->dev_root;
741 struct btrfs_dev_extent *dev_extent;
742 struct btrfs_path *path;
743 u64 extent_end;
744 int ret;
745 int slot;
746 struct extent_buffer *l;
747
748 *length = 0;
749
750 if (start >= device->total_bytes)
751 return 0;
752
753 path = btrfs_alloc_path();
754 if (!path)
755 return -ENOMEM;
756 path->reada = 2;
757
758 key.objectid = device->devid;
759 key.offset = start;
760 key.type = BTRFS_DEV_EXTENT_KEY;
761
762 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
763 if (ret < 0)
764 goto out;
765 if (ret > 0) {
766 ret = btrfs_previous_item(root, path, key.objectid, key.type);
767 if (ret < 0)
768 goto out;
769 }
770
771 while (1) {
772 l = path->nodes[0];
773 slot = path->slots[0];
774 if (slot >= btrfs_header_nritems(l)) {
775 ret = btrfs_next_leaf(root, path);
776 if (ret == 0)
777 continue;
778 if (ret < 0)
779 goto out;
780
781 break;
782 }
783 btrfs_item_key_to_cpu(l, &key, slot);
784
785 if (key.objectid < device->devid)
786 goto next;
787
788 if (key.objectid > device->devid)
789 break;
790
791 if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
792 goto next;
793
794 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
795 extent_end = key.offset + btrfs_dev_extent_length(l,
796 dev_extent);
797 if (key.offset <= start && extent_end > end) {
798 *length = end - start + 1;
799 break;
800 } else if (key.offset <= start && extent_end > start)
801 *length += extent_end - start;
802 else if (key.offset > start && extent_end <= end)
803 *length += extent_end - key.offset;
804 else if (key.offset > start && key.offset <= end) {
805 *length += end - key.offset + 1;
806 break;
807 } else if (key.offset > end)
808 break;
809
810next:
811 path->slots[0]++;
812 }
813 ret = 0;
814out:
815 btrfs_free_path(path);
816 return ret;
817}
818
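The four overlap cases in the loop above, worked on an assumed range [start, end] = [0, 1MiB - 1]: an extent covering the whole range sets *length to the full 1MiB and stops; one that begins before start contributes only the part past start; one lying wholly inside contributes its own length; and one that begins inside but runs past end contributes end - key.offset + 1 and stops. A single dev extent at [512KiB, 2MiB), for instance, yields used_space = 512KiB, so the caller's reserved area still contains 512KiB of genuinely free space to subtract.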
729/* 819/*
820 * find_free_dev_extent - find free space in the specified device
821 * @trans: transaction handler
822 * @device: the device which we search the free space in
823 * @num_bytes: the size of the free space that we need
824 * @start: store the start of the free space.
825 * @len: the size of the free space that we find, or the size of the max
826 * free space if we don't find suitable free space
827 *
730 * this uses a pretty simple search, the expectation is that it is 828 * this uses a pretty simple search, the expectation is that it is
731 * called very infrequently and that a given device has a small number 829 * called very infrequently and that a given device has a small number
732 * of extents 830 * of extents
831 *
832 * @start is used to store the start of the free space if we find one. But if we
833 * don't find suitable free space, it will be used to store the start position
834 * of the max free space.
835 *
836 * @len is used to store the size of the free space that we find.
837 * But if we don't find suitable free space, it is used to store the size of
838 * the max free space.
733 */ 839 */
734int find_free_dev_extent(struct btrfs_trans_handle *trans, 840int find_free_dev_extent(struct btrfs_trans_handle *trans,
735 struct btrfs_device *device, u64 num_bytes, 841 struct btrfs_device *device, u64 num_bytes,
736 u64 *start, u64 *max_avail)
842 u64 *start, u64 *len)
737{ 843{
738 struct btrfs_key key; 844 struct btrfs_key key;
739 struct btrfs_root *root = device->dev_root; 845 struct btrfs_root *root = device->dev_root;
740 struct btrfs_dev_extent *dev_extent = NULL;
846 struct btrfs_dev_extent *dev_extent;
741 struct btrfs_path *path; 847 struct btrfs_path *path;
742 u64 hole_size = 0;
743 u64 last_byte = 0;
744 u64 search_start = 0;
848 u64 hole_size;
849 u64 max_hole_start;
850 u64 max_hole_size;
851 u64 extent_end;
852 u64 search_start;
745 u64 search_end = device->total_bytes; 853 u64 search_end = device->total_bytes;
746 int ret; 854 int ret;
747 int slot = 0;
855 int slot;
748 int start_found;
749 struct extent_buffer *l; 856 struct extent_buffer *l;
750 857
751 path = btrfs_alloc_path();
752 if (!path)
753 return -ENOMEM;
754 path->reada = 2;
755 start_found = 0;
756
757 /* FIXME use last free of some kind */ 858 /* FIXME use last free of some kind */
758 859
759 /* we don't want to overwrite the superblock on the drive, 860 /* we don't want to overwrite the superblock on the drive,
760 * so we make sure to start at an offset of at least 1MB 861 * so we make sure to start at an offset of at least 1MB
761 */ 862 */
762 search_start = max((u64)1024 * 1024, search_start);
863 search_start = 1024 * 1024;
763 864
764 if (root->fs_info->alloc_start + num_bytes <= device->total_bytes)
865 if (root->fs_info->alloc_start + num_bytes <= search_end)
765 search_start = max(root->fs_info->alloc_start, search_start); 866 search_start = max(root->fs_info->alloc_start, search_start);
766 867
868 max_hole_start = search_start;
869 max_hole_size = 0;
870
871 if (search_start >= search_end) {
872 ret = -ENOSPC;
873 goto error;
874 }
875
876 path = btrfs_alloc_path();
877 if (!path) {
878 ret = -ENOMEM;
879 goto error;
880 }
881 path->reada = 2;
882
767 key.objectid = device->devid; 883 key.objectid = device->devid;
768 key.offset = search_start; 884 key.offset = search_start;
769 key.type = BTRFS_DEV_EXTENT_KEY; 885 key.type = BTRFS_DEV_EXTENT_KEY;
886
770 ret = btrfs_search_slot(trans, root, &key, path, 0, 0); 887 ret = btrfs_search_slot(trans, root, &key, path, 0, 0);
771 if (ret < 0) 888 if (ret < 0)
772 goto error;
889 goto out;
773 if (ret > 0) { 890 if (ret > 0) {
774 ret = btrfs_previous_item(root, path, key.objectid, key.type); 891 ret = btrfs_previous_item(root, path, key.objectid, key.type);
775 if (ret < 0) 892 if (ret < 0)
776 goto error;
893 goto out;
777 if (ret > 0)
778 start_found = 1;
779 } 894 }
780 l = path->nodes[0];
781 btrfs_item_key_to_cpu(l, &key, path->slots[0]);
895
782 while (1) { 896 while (1) {
783 l = path->nodes[0]; 897 l = path->nodes[0];
784 slot = path->slots[0]; 898 slot = path->slots[0];
@@ -787,24 +901,9 @@ int find_free_dev_extent(struct btrfs_trans_handle *trans,
787 if (ret == 0) 901 if (ret == 0)
788 continue; 902 continue;
789 if (ret < 0) 903 if (ret < 0)
790 goto error;
791no_more_items:
792 if (!start_found) {
904 goto out;
905
906 break;
793 if (search_start >= search_end) {
794 ret = -ENOSPC;
795 goto error;
796 }
797 *start = search_start;
798 start_found = 1;
799 goto check_pending;
800 }
801 *start = last_byte > search_start ?
802 last_byte : search_start;
803 if (search_end <= *start) {
804 ret = -ENOSPC;
805 goto error;
806 }
807 goto check_pending;
808 } 907 }
809 btrfs_item_key_to_cpu(l, &key, slot); 908 btrfs_item_key_to_cpu(l, &key, slot);
810 909
@@ -812,48 +911,62 @@ no_more_items:
812 goto next; 911 goto next;
813 912
814 if (key.objectid > device->devid) 913 if (key.objectid > device->devid)
815 goto no_more_items;
914 break;
816 915
817 if (key.offset >= search_start && key.offset > last_byte &&
818 start_found) {
916 if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
917 goto next;
819 if (last_byte < search_start)
820 last_byte = search_start;
821 hole_size = key.offset - last_byte;
822 918
823 if (hole_size > *max_avail)
824 *max_avail = hole_size;
919 if (key.offset > search_start) {
920 hole_size = key.offset - search_start;
825 921
826 if (key.offset > last_byte &&
827 hole_size >= num_bytes) {
828 *start = last_byte;
829 goto check_pending;
922 if (hole_size > max_hole_size) {
923 max_hole_start = search_start;
924 max_hole_size = hole_size;
925 }
926
927 /*
928 * If this free space is greater than which we need,
929 * it must be the max free space that we have found
930 * until now, so max_hole_start must point to the start
931 * of this free space and the length of this free space
932 * is stored in max_hole_size. Thus, we return
933 * max_hole_start and max_hole_size and go back to the
934 * caller.
935 */
936 if (hole_size >= num_bytes) {
937 ret = 0;
938 goto out;
830 } 939 }
831 } 940 }
832 if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
833 goto next;
834 941
835 start_found = 1;
836 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); 942 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
837 last_byte = key.offset + btrfs_dev_extent_length(l, dev_extent);
943 extent_end = key.offset + btrfs_dev_extent_length(l,
944 dev_extent);
945 if (extent_end > search_start)
946 search_start = extent_end;
838next: 947next:
839 path->slots[0]++; 948 path->slots[0]++;
840 cond_resched(); 949 cond_resched();
841 } 950 }
842check_pending:
843 /* we have to make sure we didn't find an extent that has already
844 * been allocated by the map tree or the original allocation
845 */
846 BUG_ON(*start < search_start);
847 951
848 if (*start + num_bytes > search_end) {
849 ret = -ENOSPC;
850 goto error;
952 hole_size = search_end - search_start;
953 if (hole_size > max_hole_size) {
954 max_hole_start = search_start;
955 max_hole_size = hole_size;
851 } 956 }
852 /* check for pending inserts here */
853 ret = 0;
854 957
855error:
958 /* See above. */
959 if (hole_size < num_bytes)
960 ret = -ENOSPC;
961 else
962 ret = 0;
963
964out:
856 btrfs_free_path(path); 965 btrfs_free_path(path);
966error:
967 *start = max_hole_start;
968 if (len)
969 *len = max_hole_size;
857 return ret; 970 return ret;
858} 971}
859 972
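The rewrite changes the function's contract: instead of failing outright it now always reports the best hole it saw through *start and *len, and only the return value says whether that hole is big enough. A compact user-space model over a pre-sorted array of allocated extents, for illustration only:

#include <stdint.h>

struct extent { uint64_t start, len; };

/* returns 0 with a hole of at least num_bytes, else -1 with the max hole */
int model_find_free(const struct extent *e, int n, uint64_t search_start,
		    uint64_t search_end, uint64_t num_bytes,
		    uint64_t *start, uint64_t *len)
{
	uint64_t max_hole_start = search_start, max_hole_size = 0;

	for (int i = 0; i < n; i++) {
		if (e[i].start > search_start) {
			uint64_t hole = e[i].start - search_start;

			if (hole > max_hole_size) {
				max_hole_start = search_start;
				max_hole_size = hole;
			}
			if (hole >= num_bytes)	/* big enough, stop early */
				goto out;
		}
		if (e[i].start + e[i].len > search_start)
			search_start = e[i].start + e[i].len;
	}
	/* the trailing hole between the last extent and the device end */
	if (search_end - search_start > max_hole_size) {
		max_hole_start = search_start;
		max_hole_size = search_end - search_start;
	}
out:
	*start = max_hole_start;
	*len = max_hole_size;
	return max_hole_size >= num_bytes ? 0 : -1;
}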
@@ -1183,8 +1296,8 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1183 goto out; 1296 goto out;
1184 } 1297 }
1185 } else { 1298 } else {
1186 bdev = open_bdev_exclusive(device_path, FMODE_READ,
1187 root->fs_info->bdev_holder);
1299 bdev = blkdev_get_by_path(device_path, FMODE_READ | FMODE_EXCL,
1300 root->fs_info->bdev_holder);
1188 if (IS_ERR(bdev)) { 1301 if (IS_ERR(bdev)) {
1189 ret = PTR_ERR(bdev); 1302 ret = PTR_ERR(bdev);
1190 goto out; 1303 goto out;
@@ -1193,7 +1306,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1193 set_blocksize(bdev, 4096); 1306 set_blocksize(bdev, 4096);
1194 bh = btrfs_read_dev_super(bdev); 1307 bh = btrfs_read_dev_super(bdev);
1195 if (!bh) { 1308 if (!bh) {
1196 ret = -EIO;
1309 ret = -EINVAL;
1197 goto error_close; 1310 goto error_close;
1198 } 1311 }
1199 disk_super = (struct btrfs_super_block *)bh->b_data; 1312 disk_super = (struct btrfs_super_block *)bh->b_data;
@@ -1251,7 +1364,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1251 root->fs_info->fs_devices->latest_bdev = next_device->bdev; 1364 root->fs_info->fs_devices->latest_bdev = next_device->bdev;
1252 1365
1253 if (device->bdev) { 1366 if (device->bdev) {
1254 close_bdev_exclusive(device->bdev, device->mode);
1367 blkdev_put(device->bdev, device->mode);
1255 device->bdev = NULL; 1368 device->bdev = NULL;
1256 device->fs_devices->open_devices--; 1369 device->fs_devices->open_devices--;
1257 } 1370 }
@@ -1294,7 +1407,7 @@ error_brelse:
1294 brelse(bh); 1407 brelse(bh);
1295error_close: 1408error_close:
1296 if (bdev) 1409 if (bdev)
1297 close_bdev_exclusive(bdev, FMODE_READ);
1410 blkdev_put(bdev, FMODE_READ | FMODE_EXCL);
1298out: 1411out:
1299 mutex_unlock(&root->fs_info->volume_mutex); 1412 mutex_unlock(&root->fs_info->volume_mutex);
1300 mutex_unlock(&uuid_mutex); 1413 mutex_unlock(&uuid_mutex);
@@ -1446,7 +1559,8 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1446 if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding) 1559 if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding)
1447 return -EINVAL; 1560 return -EINVAL;
1448 1561
1449 bdev = open_bdev_exclusive(device_path, 0, root->fs_info->bdev_holder);
1562 bdev = blkdev_get_by_path(device_path, FMODE_EXCL,
1563 root->fs_info->bdev_holder);
1450 if (IS_ERR(bdev)) 1564 if (IS_ERR(bdev))
1451 return PTR_ERR(bdev); 1565 return PTR_ERR(bdev);
1452 1566
@@ -1572,7 +1686,7 @@ out:
1572 mutex_unlock(&root->fs_info->volume_mutex); 1686 mutex_unlock(&root->fs_info->volume_mutex);
1573 return ret; 1687 return ret;
1574error: 1688error:
1575 close_bdev_exclusive(bdev, 0);
1689 blkdev_put(bdev, FMODE_EXCL);
1576 if (seeding_dev) { 1690 if (seeding_dev) {
1577 mutex_unlock(&uuid_mutex); 1691 mutex_unlock(&uuid_mutex);
1578 up_write(&sb->s_umount); 1692 up_write(&sb->s_umount);
@@ -1912,6 +2026,9 @@ int btrfs_balance(struct btrfs_root *dev_root)
1912 if (dev_root->fs_info->sb->s_flags & MS_RDONLY) 2026 if (dev_root->fs_info->sb->s_flags & MS_RDONLY)
1913 return -EROFS; 2027 return -EROFS;
1914 2028
2029 if (!capable(CAP_SYS_ADMIN))
2030 return -EPERM;
2031
1915 mutex_lock(&dev_root->fs_info->volume_mutex); 2032 mutex_lock(&dev_root->fs_info->volume_mutex);
1916 dev_root = dev_root->fs_info->dev_root; 2033 dev_root = dev_root->fs_info->dev_root;
1917 2034
@@ -2150,66 +2267,67 @@ static noinline u64 chunk_bytes_by_type(u64 type, u64 calc_size,
2150 return calc_size * num_stripes; 2267 return calc_size * num_stripes;
2151} 2268}
2152 2269
2153static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
2154 struct btrfs_root *extent_root,
2155 struct map_lookup **map_ret,
2156 u64 *num_bytes, u64 *stripe_size,
2157 u64 start, u64 type)
2158{
2159 struct btrfs_fs_info *info = extent_root->fs_info;
2160 struct btrfs_device *device = NULL;
2161 struct btrfs_fs_devices *fs_devices = info->fs_devices;
2162 struct list_head *cur;
2163 struct map_lookup *map = NULL;
2164 struct extent_map_tree *em_tree;
2165 struct extent_map *em;
2166 struct list_head private_devs;
2167 int min_stripe_size = 1 * 1024 * 1024;
2270/* Used to sort the devices by max_avail (descending sort) */
2271int btrfs_cmp_device_free_bytes(const void *dev_info1, const void *dev_info2)
2272{
2273 if (((struct btrfs_device_info *)dev_info1)->max_avail >
2274 ((struct btrfs_device_info *)dev_info2)->max_avail)
2275 return -1;
2276 else if (((struct btrfs_device_info *)dev_info1)->max_avail <
2277 ((struct btrfs_device_info *)dev_info2)->max_avail)
2278 return 1;
2279 else
2280 return 0;
2281}
2168 u64 calc_size = 1024 * 1024 * 1024;
2169 u64 max_chunk_size = calc_size;
2170 u64 min_free;
2171 u64 avail;
2172 u64 max_avail = 0;
2173 u64 dev_offset;
2174 int num_stripes = 1;
2175 int min_stripes = 1;
2176 int sub_stripes = 0;
2177 int looped = 0;
2178 int ret;
2179 int index;
2180 int stripe_len = 64 * 1024;
2181 2282
2182 if ((type & BTRFS_BLOCK_GROUP_RAID1) &&
2183 (type & BTRFS_BLOCK_GROUP_DUP)) {
2184 WARN_ON(1);
2185 type &= ~BTRFS_BLOCK_GROUP_DUP;
2186 }
2187 if (list_empty(&fs_devices->alloc_list))
2188 return -ENOSPC;
2283static int __btrfs_calc_nstripes(struct btrfs_fs_devices *fs_devices, u64 type,
2284 int *num_stripes, int *min_stripes,
2285 int *sub_stripes)
2286{
2287 *num_stripes = 1;
2288 *min_stripes = 1;
2289 *sub_stripes = 0;
2189 2290
2190 if (type & (BTRFS_BLOCK_GROUP_RAID0)) { 2291 if (type & (BTRFS_BLOCK_GROUP_RAID0)) {
2191 num_stripes = fs_devices->rw_devices;
2192 min_stripes = 2;
2292 *num_stripes = fs_devices->rw_devices;
2293 *min_stripes = 2;
2193 } 2294 }
2194 if (type & (BTRFS_BLOCK_GROUP_DUP)) { 2295 if (type & (BTRFS_BLOCK_GROUP_DUP)) {
2195 num_stripes = 2;
2196 min_stripes = 2;
2296 *num_stripes = 2;
2297 *min_stripes = 2;
2197 } 2298 }
2198 if (type & (BTRFS_BLOCK_GROUP_RAID1)) { 2299 if (type & (BTRFS_BLOCK_GROUP_RAID1)) {
2199 if (fs_devices->rw_devices < 2) 2300 if (fs_devices->rw_devices < 2)
2200 return -ENOSPC; 2301 return -ENOSPC;
2201 num_stripes = 2;
2202 min_stripes = 2;
2302 *num_stripes = 2;
2303 *min_stripes = 2;
2203 } 2304 }
2204 if (type & (BTRFS_BLOCK_GROUP_RAID10)) { 2305 if (type & (BTRFS_BLOCK_GROUP_RAID10)) {
2205 num_stripes = fs_devices->rw_devices;
2206 if (num_stripes < 4)
2306 *num_stripes = fs_devices->rw_devices;
2307 if (*num_stripes < 4)
2207 return -ENOSPC; 2308 return -ENOSPC;
2208 num_stripes &= ~(u32)1;
2209 sub_stripes = 2;
2210 min_stripes = 4;
2309 *num_stripes &= ~(u32)1;
2310 *sub_stripes = 2;
2311 *min_stripes = 4;
2211 } 2312 }
2212 2313
2314 return 0;
2315}
2316
2317static u64 __btrfs_calc_stripe_size(struct btrfs_fs_devices *fs_devices,
2318 u64 proposed_size, u64 type,
2319 int num_stripes, int small_stripe)
2320{
2321 int min_stripe_size = 1 * 1024 * 1024;
2322 u64 calc_size = proposed_size;
2323 u64 max_chunk_size = calc_size;
2324 int ncopies = 1;
2325
2326 if (type & (BTRFS_BLOCK_GROUP_RAID1 |
2327 BTRFS_BLOCK_GROUP_DUP |
2328 BTRFS_BLOCK_GROUP_RAID10))
2329 ncopies = 2;
2330
2213 if (type & BTRFS_BLOCK_GROUP_DATA) { 2331 if (type & BTRFS_BLOCK_GROUP_DATA) {
2214 max_chunk_size = 10 * calc_size; 2332 max_chunk_size = 10 * calc_size;
2215 min_stripe_size = 64 * 1024 * 1024; 2333 min_stripe_size = 64 * 1024 * 1024;
@@ -2226,51 +2344,209 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
2226 max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1), 2344 max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
2227 max_chunk_size); 2345 max_chunk_size);
2228 2346
2229again: 2347 if (calc_size * num_stripes > max_chunk_size * ncopies) {
2230 max_avail = 0; 2348 calc_size = max_chunk_size * ncopies;
2231 if (!map || map->num_stripes != num_stripes) {
2232 kfree(map);
2233 map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
2234 if (!map)
2235 return -ENOMEM;
2236 map->num_stripes = num_stripes;
2237 }
2238
2239 if (calc_size * num_stripes > max_chunk_size) {
2240 calc_size = max_chunk_size;
2241 do_div(calc_size, num_stripes); 2349 do_div(calc_size, num_stripes);
2242 do_div(calc_size, stripe_len); 2350 do_div(calc_size, BTRFS_STRIPE_LEN);
2243 calc_size *= stripe_len; 2351 calc_size *= BTRFS_STRIPE_LEN;
2244 } 2352 }
2245 2353
2246 /* we don't want tiny stripes */ 2354 /* we don't want tiny stripes */
2247 if (!looped) 2355 if (!small_stripe)
2248 calc_size = max_t(u64, min_stripe_size, calc_size); 2356 calc_size = max_t(u64, min_stripe_size, calc_size);
2249 2357
2250 /* 2358 /*
2251 * we're about to do_div by the stripe_len so let's make sure 2359 * we're about to do_div by the BTRFS_STRIPE_LEN so let's make sure
2252 * we end up with something bigger than a stripe 2360 * we end up with something bigger than a stripe
2253 */ 2361 */
2254 calc_size = max_t(u64, calc_size, stripe_len * 4); 2362 calc_size = max_t(u64, calc_size, BTRFS_STRIPE_LEN);
2363
2364 do_div(calc_size, BTRFS_STRIPE_LEN);
2365 calc_size *= BTRFS_STRIPE_LEN;
2366
2367 return calc_size;
2368}
2369
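The net effect of the helper's tail is to clamp calc_size and align it down to a whole number of stripes. A standalone sketch of that rounding step, using plain 64-bit division where the kernel needs do_div():

    #include <stdio.h>
    #include <stdint.h>

    #define STRIPE_LEN (64 * 1024ULL)   /* same value as BTRFS_STRIPE_LEN */

    /* Align a proposed size down to a multiple of the stripe length. */
    static uint64_t align_to_stripe(uint64_t calc_size)
    {
        calc_size /= STRIPE_LEN;   /* do_div(calc_size, BTRFS_STRIPE_LEN) */
        calc_size *= STRIPE_LEN;
        return calc_size;
    }

    int main(void)
    {
        /* 100000 bytes rounds down to one 64KiB stripe: prints 65536 */
        printf("%llu\n", (unsigned long long)align_to_stripe(100000));
        return 0;
    }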
2370static struct map_lookup *__shrink_map_lookup_stripes(struct map_lookup *map,
2371 int num_stripes)
2372{
2373 struct map_lookup *new;
2374 size_t len = map_lookup_size(num_stripes);
2375
2376 BUG_ON(map->num_stripes < num_stripes);
2377
2378 if (map->num_stripes == num_stripes)
2379 return map;
2380
2381 new = kmalloc(len, GFP_NOFS);
2382 if (!new) {
2383 /* just change map->num_stripes */
2384 map->num_stripes = num_stripes;
2385 return map;
2386 }
2387
2388 memcpy(new, map, len);
2389 new->num_stripes = num_stripes;
2390 kfree(map);
2391 return new;
2392}
2393
2394/*
2395 * helper to allocate device space from btrfs_device_info, in which we store
2396 * the max free space of each device. It is used when we cannot allocate
2397 * chunks at the default size.
2398 *
2399 * With this helper, we can allocate a chunk as large as possible.
2400 */
2401static int __btrfs_alloc_tiny_space(struct btrfs_trans_handle *trans,
2402 struct btrfs_fs_devices *fs_devices,
2403 struct btrfs_device_info *devices,
2404 int nr_device, u64 type,
2405 struct map_lookup **map_lookup,
2406 int min_stripes, u64 *stripe_size)
2407{
2408 int i, index, sort_again = 0;
2409 int min_devices = min_stripes;
2410 u64 max_avail, min_free;
2411 struct map_lookup *map = *map_lookup;
2412 int ret;
2413
2414 if (nr_device < min_stripes)
2415 return -ENOSPC;
2416
2417 btrfs_descending_sort_devices(devices, nr_device);
2255 2418
2256 do_div(calc_size, stripe_len); 2419 max_avail = devices[0].max_avail;
2257 calc_size *= stripe_len; 2420 if (!max_avail)
2421 return -ENOSPC;
2422
2423 for (i = 0; i < nr_device; i++) {
2424 /*
2425 * if dev_offset == 0, the free space of this device is less
2426 * than what we need, and we haven't yet searched for the max
2427 * free extent on this device, so do it now.
2428 */
2429 if (!devices[i].dev_offset) {
2430 ret = find_free_dev_extent(trans, devices[i].dev,
2431 max_avail,
2432 &devices[i].dev_offset,
2433 &devices[i].max_avail);
2434 if (ret != 0 && ret != -ENOSPC)
2435 return ret;
2436 sort_again = 1;
2437 }
2438 }
2439
2440 /* we updated the max free extent of each device, so sort again */
2441 if (sort_again)
2442 btrfs_descending_sort_devices(devices, nr_device);
2443
2444 if (type & BTRFS_BLOCK_GROUP_DUP)
2445 min_devices = 1;
2446
2447 if (!devices[min_devices - 1].max_avail)
2448 return -ENOSPC;
2449
2450 max_avail = devices[min_devices - 1].max_avail;
2451 if (type & BTRFS_BLOCK_GROUP_DUP)
2452 do_div(max_avail, 2);
2453
2454 max_avail = __btrfs_calc_stripe_size(fs_devices, max_avail, type,
2455 min_stripes, 1);
2456 if (type & BTRFS_BLOCK_GROUP_DUP)
2457 min_free = max_avail * 2;
2458 else
2459 min_free = max_avail;
2460
2461 if (min_free > devices[min_devices - 1].max_avail)
2462 return -ENOSPC;
2463
2464 map = __shrink_map_lookup_stripes(map, min_stripes);
2465 *stripe_size = max_avail;
2466
2467 index = 0;
2468 for (i = 0; i < min_stripes; i++) {
2469 map->stripes[i].dev = devices[index].dev;
2470 map->stripes[i].physical = devices[index].dev_offset;
2471 if (type & BTRFS_BLOCK_GROUP_DUP) {
2472 i++;
2473 map->stripes[i].dev = devices[index].dev;
2474 map->stripes[i].physical = devices[index].dev_offset +
2475 max_avail;
2476 }
2477 index++;
2478 }
2479 *map_lookup = map;
2480
2481 return 0;
2482}
2483
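In the DUP case both copies of the chunk live on one device, which is why min_devices drops to 1, the candidate stripe size is that device's free space halved, and min_free is then doubled again for the final check. A small sketch of the arithmetic (sizes invented, BTRFS_STRIPE_LEN alignment omitted):

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
        uint64_t dev_free = 512ULL * 1024 * 1024; /* one device, 512MiB free */
        uint64_t stripe   = dev_free / 2;         /* per-copy stripe size */
        uint64_t min_free = stripe * 2;           /* both copies on one device */

        printf("stripe=%llu min_free=%llu fits=%d\n",
               (unsigned long long)stripe,
               (unsigned long long)min_free,
               min_free <= dev_free);             /* fits=1 */
        return 0;
    }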
2484static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
2485 struct btrfs_root *extent_root,
2486 struct map_lookup **map_ret,
2487 u64 *num_bytes, u64 *stripe_size,
2488 u64 start, u64 type)
2489{
2490 struct btrfs_fs_info *info = extent_root->fs_info;
2491 struct btrfs_device *device = NULL;
2492 struct btrfs_fs_devices *fs_devices = info->fs_devices;
2493 struct list_head *cur;
2494 struct map_lookup *map;
2495 struct extent_map_tree *em_tree;
2496 struct extent_map *em;
2497 struct btrfs_device_info *devices_info;
2498 struct list_head private_devs;
2499 u64 calc_size = 1024 * 1024 * 1024;
2500 u64 min_free;
2501 u64 avail;
2502 u64 dev_offset;
2503 int num_stripes;
2504 int min_stripes;
2505 int sub_stripes;
2506 int min_devices; /* the min number of devices we need */
2507 int i;
2508 int ret;
2509 int index;
2510
2511 if ((type & BTRFS_BLOCK_GROUP_RAID1) &&
2512 (type & BTRFS_BLOCK_GROUP_DUP)) {
2513 WARN_ON(1);
2514 type &= ~BTRFS_BLOCK_GROUP_DUP;
2515 }
2516 if (list_empty(&fs_devices->alloc_list))
2517 return -ENOSPC;
2518
2519 ret = __btrfs_calc_nstripes(fs_devices, type, &num_stripes,
2520 &min_stripes, &sub_stripes);
2521 if (ret)
2522 return ret;
2523
2524 devices_info = kzalloc(sizeof(*devices_info) * fs_devices->rw_devices,
2525 GFP_NOFS);
2526 if (!devices_info)
2527 return -ENOMEM;
2528
2529 map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
2530 if (!map) {
2531 ret = -ENOMEM;
2532 goto error;
2533 }
2534 map->num_stripes = num_stripes;
2258 2535
2259 cur = fs_devices->alloc_list.next; 2536 cur = fs_devices->alloc_list.next;
2260 index = 0; 2537 index = 0;
2538 i = 0;
2261 2539
2262 if (type & BTRFS_BLOCK_GROUP_DUP) 2540 calc_size = __btrfs_calc_stripe_size(fs_devices, calc_size, type,
2541 num_stripes, 0);
2542
2543 if (type & BTRFS_BLOCK_GROUP_DUP) {
2263 min_free = calc_size * 2; 2544 min_free = calc_size * 2;
2264 else 2545 min_devices = 1;
2546 } else {
2265 min_free = calc_size; 2547 min_free = calc_size;
2266 2548 min_devices = min_stripes;
2267 /* 2549 }
2268 * we add 1MB because we never use the first 1MB of the device, unless
2269 * we've looped, then we are likely allocating the maximum amount of
2270 * space left already
2271 */
2272 if (!looped)
2273 min_free += 1024 * 1024;
2274 2550
2275 INIT_LIST_HEAD(&private_devs); 2551 INIT_LIST_HEAD(&private_devs);
2276 while (index < num_stripes) { 2552 while (index < num_stripes) {
@@ -2283,27 +2559,39 @@ again:
2283 cur = cur->next; 2559 cur = cur->next;
2284 2560
2285 if (device->in_fs_metadata && avail >= min_free) { 2561 if (device->in_fs_metadata && avail >= min_free) {
2286 ret = find_free_dev_extent(trans, device, 2562 ret = find_free_dev_extent(trans, device, min_free,
2287 min_free, &dev_offset, 2563 &devices_info[i].dev_offset,
2288 &max_avail); 2564 &devices_info[i].max_avail);
2289 if (ret == 0) { 2565 if (ret == 0) {
2290 list_move_tail(&device->dev_alloc_list, 2566 list_move_tail(&device->dev_alloc_list,
2291 &private_devs); 2567 &private_devs);
2292 map->stripes[index].dev = device; 2568 map->stripes[index].dev = device;
2293 map->stripes[index].physical = dev_offset; 2569 map->stripes[index].physical =
2570 devices_info[i].dev_offset;
2294 index++; 2571 index++;
2295 if (type & BTRFS_BLOCK_GROUP_DUP) { 2572 if (type & BTRFS_BLOCK_GROUP_DUP) {
2296 map->stripes[index].dev = device; 2573 map->stripes[index].dev = device;
2297 map->stripes[index].physical = 2574 map->stripes[index].physical =
2298 dev_offset + calc_size; 2575 devices_info[i].dev_offset +
2576 calc_size;
2299 index++; 2577 index++;
2300 } 2578 }
2301 } 2579 } else if (ret != -ENOSPC)
2302 } else if (device->in_fs_metadata && avail > max_avail) 2580 goto error;
2303 max_avail = avail; 2581
2582 devices_info[i].dev = device;
2583 i++;
2584 } else if (device->in_fs_metadata &&
2585 avail >= BTRFS_STRIPE_LEN) {
2586 devices_info[i].dev = device;
2587 devices_info[i].max_avail = avail;
2588 i++;
2589 }
2590
2304 if (cur == &fs_devices->alloc_list) 2591 if (cur == &fs_devices->alloc_list)
2305 break; 2592 break;
2306 } 2593 }
2594
2307 list_splice(&private_devs, &fs_devices->alloc_list); 2595 list_splice(&private_devs, &fs_devices->alloc_list);
2308 if (index < num_stripes) { 2596 if (index < num_stripes) {
2309 if (index >= min_stripes) { 2597 if (index >= min_stripes) {
@@ -2312,34 +2600,36 @@ again:
2312 num_stripes /= sub_stripes; 2600 num_stripes /= sub_stripes;
2313 num_stripes *= sub_stripes; 2601 num_stripes *= sub_stripes;
2314 } 2602 }
2315 looped = 1; 2603
2316 goto again; 2604 map = __shrink_map_lookup_stripes(map, num_stripes);
2317 } 2605 } else if (i >= min_devices) {
2318 if (!looped && max_avail > 0) { 2606 ret = __btrfs_alloc_tiny_space(trans, fs_devices,
2319 looped = 1; 2607 devices_info, i, type,
2320 calc_size = max_avail; 2608 &map, min_stripes,
2321 goto again; 2609 &calc_size);
2610 if (ret)
2611 goto error;
2612 } else {
2613 ret = -ENOSPC;
2614 goto error;
2322 } 2615 }
2323 kfree(map);
2324 return -ENOSPC;
2325 } 2616 }
2326 map->sector_size = extent_root->sectorsize; 2617 map->sector_size = extent_root->sectorsize;
2327 map->stripe_len = stripe_len; 2618 map->stripe_len = BTRFS_STRIPE_LEN;
2328 map->io_align = stripe_len; 2619 map->io_align = BTRFS_STRIPE_LEN;
2329 map->io_width = stripe_len; 2620 map->io_width = BTRFS_STRIPE_LEN;
2330 map->type = type; 2621 map->type = type;
2331 map->num_stripes = num_stripes;
2332 map->sub_stripes = sub_stripes; 2622 map->sub_stripes = sub_stripes;
2333 2623
2334 *map_ret = map; 2624 *map_ret = map;
2335 *stripe_size = calc_size; 2625 *stripe_size = calc_size;
2336 *num_bytes = chunk_bytes_by_type(type, calc_size, 2626 *num_bytes = chunk_bytes_by_type(type, calc_size,
2337 num_stripes, sub_stripes); 2627 map->num_stripes, sub_stripes);
2338 2628
2339 em = alloc_extent_map(GFP_NOFS); 2629 em = alloc_extent_map(GFP_NOFS);
2340 if (!em) { 2630 if (!em) {
2341 kfree(map); 2631 ret = -ENOMEM;
2342 return -ENOMEM; 2632 goto error;
2343 } 2633 }
2344 em->bdev = (struct block_device *)map; 2634 em->bdev = (struct block_device *)map;
2345 em->start = start; 2635 em->start = start;
@@ -2372,7 +2662,13 @@ again:
2372 index++; 2662 index++;
2373 } 2663 }
2374 2664
2665 kfree(devices_info);
2375 return 0; 2666 return 0;
2667
2668error:
2669 kfree(map);
2670 kfree(devices_info);
2671 return ret;
2376} 2672}
2377 2673
2378static int __finish_chunk_alloc(struct btrfs_trans_handle *trans, 2674static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 2740db49eb04..7fb59d45fe8c 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -20,8 +20,11 @@
20#define __BTRFS_VOLUMES_ 20#define __BTRFS_VOLUMES_
21 21
22#include <linux/bio.h> 22#include <linux/bio.h>
23#include <linux/sort.h>
23#include "async-thread.h" 24#include "async-thread.h"
24 25
26#define BTRFS_STRIPE_LEN (64 * 1024)
27
25struct buffer_head; 28struct buffer_head;
26struct btrfs_pending_bios { 29struct btrfs_pending_bios {
27 struct bio *head; 30 struct bio *head;
@@ -50,7 +53,7 @@ struct btrfs_device {
50 53
51 struct block_device *bdev; 54 struct block_device *bdev;
52 55
53 /* the mode sent to open_bdev_exclusive */ 56 /* the mode sent to blkdev_get */
54 fmode_t mode; 57 fmode_t mode;
55 58
56 char *name; 59 char *name;
@@ -136,6 +139,30 @@ struct btrfs_multi_bio {
136 struct btrfs_bio_stripe stripes[]; 139 struct btrfs_bio_stripe stripes[];
137}; 140};
138 141
142struct btrfs_device_info {
143 struct btrfs_device *dev;
144 u64 dev_offset;
145 u64 max_avail;
146};
147
148/* Used to sort the devices by max_avail (descending sort) */
149int btrfs_cmp_device_free_bytes(const void *dev_info1, const void *dev_info2);
150
151/*
152 * sort the devices by max_avail, in which the max free extent size of each
153 * device is stored (descending sort)
154 */
155static inline void btrfs_descending_sort_devices(
156 struct btrfs_device_info *devices,
157 size_t nr_devices)
158{
159 sort(devices, nr_devices, sizeof(struct btrfs_device_info),
160 btrfs_cmp_device_free_bytes, NULL);
161}
162
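The comparator above is written for the kernel's sort(), but the same shape drops straight into userspace qsort(), which makes the descending order easy to verify. A runnable sketch (the struct and values are illustrative):

    #include <stdio.h>
    #include <stdlib.h>
    #include <stdint.h>

    struct dev_info { uint64_t max_avail; };

    /* Same comparator shape as btrfs_cmp_device_free_bytes:
     * larger max_avail sorts first. */
    static int cmp_desc(const void *a, const void *b)
    {
        uint64_t x = ((const struct dev_info *)a)->max_avail;
        uint64_t y = ((const struct dev_info *)b)->max_avail;

        if (x > y)
            return -1;
        if (x < y)
            return 1;
        return 0;
    }

    int main(void)
    {
        struct dev_info devs[] = { {10}, {30}, {20} };
        size_t i;

        qsort(devs, 3, sizeof(devs[0]), cmp_desc);
        for (i = 0; i < 3; i++)
            printf("%llu ", (unsigned long long)devs[i].max_avail);
        printf("\n");   /* prints: 30 20 10 */
        return 0;
    }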
163int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
164 u64 end, u64 *length);
165
139#define btrfs_multi_bio_size(n) (sizeof(struct btrfs_multi_bio) + \ 166#define btrfs_multi_bio_size(n) (sizeof(struct btrfs_multi_bio) + \
140 (sizeof(struct btrfs_bio_stripe) * (n))) 167 (sizeof(struct btrfs_bio_stripe) * (n)))
141 168
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 698fdd2c739c..a5776531dc2b 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -316,6 +316,15 @@ ssize_t btrfs_getxattr(struct dentry *dentry, const char *name,
316int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value, 316int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value,
317 size_t size, int flags) 317 size_t size, int flags)
318{ 318{
319 struct btrfs_root *root = BTRFS_I(dentry->d_inode)->root;
320
321 /*
322 * The permission on security.* and system.* is not checked
323 * in permission().
324 */
325 if (btrfs_root_readonly(root))
326 return -EROFS;
327
319 /* 328 /*
320 * If this is a request for a synthetic attribute in the system.* 329 * If this is a request for a synthetic attribute in the system.*
321 * namespace use the generic infrastructure to resolve a handler 330 * namespace use the generic infrastructure to resolve a handler
@@ -336,6 +345,15 @@ int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value,
336 345
337int btrfs_removexattr(struct dentry *dentry, const char *name) 346int btrfs_removexattr(struct dentry *dentry, const char *name)
338{ 347{
348 struct btrfs_root *root = BTRFS_I(dentry->d_inode)->root;
349
350 /*
351 * The permission on security.* and system.* is not checked
352 * in permission().
353 */
354 if (btrfs_root_readonly(root))
355 return -EROFS;
356
339 /* 357 /*
340 * If this is a request for a synthetic attribute in the system.* 358 * If this is a request for a synthetic attribute in the system.*
341 * namespace use the generic infrastructure to resolve a handler 359 * namespace use the generic infrastructure to resolve a handler
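With these guards in place, xattr writes on a read-only subvolume fail with EROFS even in the security.* and system.* namespaces that permission() does not cover. A hypothetical userspace probe (the mount path is made up):

    #include <stdio.h>
    #include <errno.h>
    #include <string.h>
    #include <sys/xattr.h>

    int main(void)
    {
        /* Expect EROFS if /mnt/ro-subvol is a read-only btrfs subvolume. */
        if (setxattr("/mnt/ro-subvol/file", "user.test", "x", 1, 0) == -1 &&
            errno == EROFS)
            printf("read-only subvolume: %s\n", strerror(errno));
        return 0;
    }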
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index b9cd5445f71c..f5ec2d44150d 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -32,15 +32,6 @@
32#include <linux/bio.h> 32#include <linux/bio.h>
33#include "compression.h" 33#include "compression.h"
34 34
35/* Plan: call deflate() with avail_in == *sourcelen,
36 avail_out = *dstlen - 12 and flush == Z_FINISH.
37 If it doesn't manage to finish, call it again with
38 avail_in == 0 and avail_out set to the remaining 12
39 bytes for it to clean up.
40 Q: Is 12 bytes sufficient?
41*/
42#define STREAM_END_SPACE 12
43
44struct workspace { 35struct workspace {
45 z_stream inf_strm; 36 z_stream inf_strm;
46 z_stream def_strm; 37 z_stream def_strm;
@@ -48,152 +39,51 @@ struct workspace {
48 struct list_head list; 39 struct list_head list;
49}; 40};
50 41
51static LIST_HEAD(idle_workspace); 42static void zlib_free_workspace(struct list_head *ws)
52static DEFINE_SPINLOCK(workspace_lock); 43{
53static unsigned long num_workspace; 44 struct workspace *workspace = list_entry(ws, struct workspace, list);
54static atomic_t alloc_workspace = ATOMIC_INIT(0);
55static DECLARE_WAIT_QUEUE_HEAD(workspace_wait);
56 45
57/* 46 vfree(workspace->def_strm.workspace);
58 * this finds an available zlib workspace or allocates a new one 47 vfree(workspace->inf_strm.workspace);
59 * NULL or an ERR_PTR is returned if things go bad. 48 kfree(workspace->buf);
60 */ 49 kfree(workspace);
61static struct workspace *find_zlib_workspace(void) 50}
51
52static struct list_head *zlib_alloc_workspace(void)
62{ 53{
63 struct workspace *workspace; 54 struct workspace *workspace;
64 int ret;
65 int cpus = num_online_cpus();
66
67again:
68 spin_lock(&workspace_lock);
69 if (!list_empty(&idle_workspace)) {
70 workspace = list_entry(idle_workspace.next, struct workspace,
71 list);
72 list_del(&workspace->list);
73 num_workspace--;
74 spin_unlock(&workspace_lock);
75 return workspace;
76 55
77 }
78 spin_unlock(&workspace_lock);
79 if (atomic_read(&alloc_workspace) > cpus) {
80 DEFINE_WAIT(wait);
81 prepare_to_wait(&workspace_wait, &wait, TASK_UNINTERRUPTIBLE);
82 if (atomic_read(&alloc_workspace) > cpus)
83 schedule();
84 finish_wait(&workspace_wait, &wait);
85 goto again;
86 }
87 atomic_inc(&alloc_workspace);
88 workspace = kzalloc(sizeof(*workspace), GFP_NOFS); 56 workspace = kzalloc(sizeof(*workspace), GFP_NOFS);
89 if (!workspace) { 57 if (!workspace)
90 ret = -ENOMEM; 58 return ERR_PTR(-ENOMEM);
91 goto fail;
92 }
93 59
94 workspace->def_strm.workspace = vmalloc(zlib_deflate_workspacesize()); 60 workspace->def_strm.workspace = vmalloc(zlib_deflate_workspacesize());
95 if (!workspace->def_strm.workspace) {
96 ret = -ENOMEM;
97 goto fail;
98 }
99 workspace->inf_strm.workspace = vmalloc(zlib_inflate_workspacesize()); 61 workspace->inf_strm.workspace = vmalloc(zlib_inflate_workspacesize());
100 if (!workspace->inf_strm.workspace) {
101 ret = -ENOMEM;
102 goto fail_inflate;
103 }
104 workspace->buf = kmalloc(PAGE_CACHE_SIZE, GFP_NOFS); 62 workspace->buf = kmalloc(PAGE_CACHE_SIZE, GFP_NOFS);
105 if (!workspace->buf) { 63 if (!workspace->def_strm.workspace ||
106 ret = -ENOMEM; 64 !workspace->inf_strm.workspace || !workspace->buf)
107 goto fail_kmalloc; 65 goto fail;
108 }
109 return workspace;
110
111fail_kmalloc:
112 vfree(workspace->inf_strm.workspace);
113fail_inflate:
114 vfree(workspace->def_strm.workspace);
115fail:
116 kfree(workspace);
117 atomic_dec(&alloc_workspace);
118 wake_up(&workspace_wait);
119 return ERR_PTR(ret);
120}
121
122/*
123 * put a workspace struct back on the list or free it if we have enough
124 * idle ones sitting around
125 */
126static int free_workspace(struct workspace *workspace)
127{
128 spin_lock(&workspace_lock);
129 if (num_workspace < num_online_cpus()) {
130 list_add_tail(&workspace->list, &idle_workspace);
131 num_workspace++;
132 spin_unlock(&workspace_lock);
133 if (waitqueue_active(&workspace_wait))
134 wake_up(&workspace_wait);
135 return 0;
136 }
137 spin_unlock(&workspace_lock);
138 vfree(workspace->def_strm.workspace);
139 vfree(workspace->inf_strm.workspace);
140 kfree(workspace->buf);
141 kfree(workspace);
142 66
143 atomic_dec(&alloc_workspace); 67 INIT_LIST_HEAD(&workspace->list);
144 if (waitqueue_active(&workspace_wait))
145 wake_up(&workspace_wait);
146 return 0;
147}
148 68
149/* 69 return &workspace->list;
150 * cleanup function for module exit 70fail:
151 */ 71 zlib_free_workspace(&workspace->list);
152static void free_workspaces(void) 72 return ERR_PTR(-ENOMEM);
153{
154 struct workspace *workspace;
155 while (!list_empty(&idle_workspace)) {
156 workspace = list_entry(idle_workspace.next, struct workspace,
157 list);
158 list_del(&workspace->list);
159 vfree(workspace->def_strm.workspace);
160 vfree(workspace->inf_strm.workspace);
161 kfree(workspace->buf);
162 kfree(workspace);
163 atomic_dec(&alloc_workspace);
164 }
165} 73}
166 74
167/* 75static int zlib_compress_pages(struct list_head *ws,
168 * given an address space and start/len, compress the bytes. 76 struct address_space *mapping,
169 * 77 u64 start, unsigned long len,
170 * pages are allocated to hold the compressed result and stored 78 struct page **pages,
171 * in 'pages' 79 unsigned long nr_dest_pages,
172 * 80 unsigned long *out_pages,
173 * out_pages is used to return the number of pages allocated. There 81 unsigned long *total_in,
174 * may be pages allocated even if we return an error 82 unsigned long *total_out,
175 * 83 unsigned long max_out)
176 * total_in is used to return the number of bytes actually read. It
177 * may be smaller than len if we had to exit early because we
178 * ran out of room in the pages array or because we cross the
179 * max_out threshold.
180 *
181 * total_out is used to return the total number of compressed bytes
182 *
183 * max_out tells us the max number of bytes that we're allowed to
184 * stuff into pages
185 */
186int btrfs_zlib_compress_pages(struct address_space *mapping,
187 u64 start, unsigned long len,
188 struct page **pages,
189 unsigned long nr_dest_pages,
190 unsigned long *out_pages,
191 unsigned long *total_in,
192 unsigned long *total_out,
193 unsigned long max_out)
194{ 84{
85 struct workspace *workspace = list_entry(ws, struct workspace, list);
195 int ret; 86 int ret;
196 struct workspace *workspace;
197 char *data_in; 87 char *data_in;
198 char *cpage_out; 88 char *cpage_out;
199 int nr_pages = 0; 89 int nr_pages = 0;
@@ -205,10 +95,6 @@ int btrfs_zlib_compress_pages(struct address_space *mapping,
205 *total_out = 0; 95 *total_out = 0;
206 *total_in = 0; 96 *total_in = 0;
207 97
208 workspace = find_zlib_workspace();
209 if (IS_ERR(workspace))
210 return -1;
211
212 if (Z_OK != zlib_deflateInit(&workspace->def_strm, 3)) { 98 if (Z_OK != zlib_deflateInit(&workspace->def_strm, 3)) {
213 printk(KERN_WARNING "deflateInit failed\n"); 99 printk(KERN_WARNING "deflateInit failed\n");
214 ret = -1; 100 ret = -1;
@@ -222,6 +108,10 @@ int btrfs_zlib_compress_pages(struct address_space *mapping,
222 data_in = kmap(in_page); 108 data_in = kmap(in_page);
223 109
224 out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); 110 out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
111 if (out_page == NULL) {
112 ret = -1;
113 goto out;
114 }
225 cpage_out = kmap(out_page); 115 cpage_out = kmap(out_page);
226 pages[0] = out_page; 116 pages[0] = out_page;
227 nr_pages = 1; 117 nr_pages = 1;
@@ -260,6 +150,10 @@ int btrfs_zlib_compress_pages(struct address_space *mapping,
260 goto out; 150 goto out;
261 } 151 }
262 out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); 152 out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
153 if (out_page == NULL) {
154 ret = -1;
155 goto out;
156 }
263 cpage_out = kmap(out_page); 157 cpage_out = kmap(out_page);
264 pages[nr_pages] = out_page; 158 pages[nr_pages] = out_page;
265 nr_pages++; 159 nr_pages++;
@@ -314,55 +208,26 @@ out:
314 kunmap(in_page); 208 kunmap(in_page);
315 page_cache_release(in_page); 209 page_cache_release(in_page);
316 } 210 }
317 free_workspace(workspace);
318 return ret; 211 return ret;
319} 212}
320 213
321/* 214static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in,
322 * pages_in is an array of pages with compressed data. 215 u64 disk_start,
323 * 216 struct bio_vec *bvec,
324 * disk_start is the starting logical offset of this array in the file 217 int vcnt,
325 * 218 size_t srclen)
326 * bvec is a bio_vec of pages from the file that we want to decompress into
327 *
328 * vcnt is the count of pages in the biovec
329 *
330 * srclen is the number of bytes in pages_in
331 *
332 * The basic idea is that we have a bio that was created by readpages.
333 * The pages in the bio are for the uncompressed data, and they may not
334 * be contiguous. They all correspond to the range of bytes covered by
335 * the compressed extent.
336 */
337int btrfs_zlib_decompress_biovec(struct page **pages_in,
338 u64 disk_start,
339 struct bio_vec *bvec,
340 int vcnt,
341 size_t srclen)
342{ 219{
343 int ret = 0; 220 struct workspace *workspace = list_entry(ws, struct workspace, list);
221 int ret = 0, ret2;
344 int wbits = MAX_WBITS; 222 int wbits = MAX_WBITS;
345 struct workspace *workspace;
346 char *data_in; 223 char *data_in;
347 size_t total_out = 0; 224 size_t total_out = 0;
348 unsigned long page_bytes_left;
349 unsigned long page_in_index = 0; 225 unsigned long page_in_index = 0;
350 unsigned long page_out_index = 0; 226 unsigned long page_out_index = 0;
351 struct page *page_out;
352 unsigned long total_pages_in = (srclen + PAGE_CACHE_SIZE - 1) / 227 unsigned long total_pages_in = (srclen + PAGE_CACHE_SIZE - 1) /
353 PAGE_CACHE_SIZE; 228 PAGE_CACHE_SIZE;
354 unsigned long buf_start; 229 unsigned long buf_start;
355 unsigned long buf_offset;
356 unsigned long bytes;
357 unsigned long working_bytes;
358 unsigned long pg_offset; 230 unsigned long pg_offset;
359 unsigned long start_byte;
360 unsigned long current_buf_start;
361 char *kaddr;
362
363 workspace = find_zlib_workspace();
364 if (IS_ERR(workspace))
365 return -ENOMEM;
366 231
367 data_in = kmap(pages_in[page_in_index]); 232 data_in = kmap(pages_in[page_in_index]);
368 workspace->inf_strm.next_in = data_in; 233 workspace->inf_strm.next_in = data_in;
@@ -372,8 +237,6 @@ int btrfs_zlib_decompress_biovec(struct page **pages_in,
372 workspace->inf_strm.total_out = 0; 237 workspace->inf_strm.total_out = 0;
373 workspace->inf_strm.next_out = workspace->buf; 238 workspace->inf_strm.next_out = workspace->buf;
374 workspace->inf_strm.avail_out = PAGE_CACHE_SIZE; 239 workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
375 page_out = bvec[page_out_index].bv_page;
376 page_bytes_left = PAGE_CACHE_SIZE;
377 pg_offset = 0; 240 pg_offset = 0;
378 241
379 /* If it's deflate, and it's got no preset dictionary, then 242 /* If it's deflate, and it's got no preset dictionary, then
@@ -389,107 +252,29 @@ int btrfs_zlib_decompress_biovec(struct page **pages_in,
389 252
390 if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) { 253 if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) {
391 printk(KERN_WARNING "inflateInit failed\n"); 254 printk(KERN_WARNING "inflateInit failed\n");
392 ret = -1; 255 return -1;
393 goto out;
394 } 256 }
395 while (workspace->inf_strm.total_in < srclen) { 257 while (workspace->inf_strm.total_in < srclen) {
396 ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH); 258 ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH);
397 if (ret != Z_OK && ret != Z_STREAM_END) 259 if (ret != Z_OK && ret != Z_STREAM_END)
398 break; 260 break;
399 /*
400 * buf start is the byte offset we're of the start of
401 * our workspace buffer
402 */
403 buf_start = total_out;
404 261
405 /* total_out is the last byte of the workspace buffer */ 262 buf_start = total_out;
406 total_out = workspace->inf_strm.total_out; 263 total_out = workspace->inf_strm.total_out;
407 264
408 working_bytes = total_out - buf_start; 265 /* we didn't make progress in this inflate call, we're done */
409 266 if (buf_start == total_out)
410 /*
411 * start byte is the first byte of the page we're currently
412 * copying into relative to the start of the compressed data.
413 */
414 start_byte = page_offset(page_out) - disk_start;
415
416 if (working_bytes == 0) {
417 /* we didn't make progress in this inflate
418 * call, we're done
419 */
420 if (ret != Z_STREAM_END)
421 ret = -1;
422 break; 267 break;
423 }
424 268
425 /* we haven't yet hit data corresponding to this page */ 269 ret2 = btrfs_decompress_buf2page(workspace->buf, buf_start,
426 if (total_out <= start_byte) 270 total_out, disk_start,
427 goto next; 271 bvec, vcnt,
428 272 &page_out_index, &pg_offset);
429 /* 273 if (ret2 == 0) {
430 * the start of the data we care about is offset into 274 ret = 0;
431 * the middle of our working buffer 275 goto done;
432 */
433 if (total_out > start_byte && buf_start < start_byte) {
434 buf_offset = start_byte - buf_start;
435 working_bytes -= buf_offset;
436 } else {
437 buf_offset = 0;
438 }
439 current_buf_start = buf_start;
440
441 /* copy bytes from the working buffer into the pages */
442 while (working_bytes > 0) {
443 bytes = min(PAGE_CACHE_SIZE - pg_offset,
444 PAGE_CACHE_SIZE - buf_offset);
445 bytes = min(bytes, working_bytes);
446 kaddr = kmap_atomic(page_out, KM_USER0);
447 memcpy(kaddr + pg_offset, workspace->buf + buf_offset,
448 bytes);
449 kunmap_atomic(kaddr, KM_USER0);
450 flush_dcache_page(page_out);
451
452 pg_offset += bytes;
453 page_bytes_left -= bytes;
454 buf_offset += bytes;
455 working_bytes -= bytes;
456 current_buf_start += bytes;
457
458 /* check if we need to pick another page */
459 if (page_bytes_left == 0) {
460 page_out_index++;
461 if (page_out_index >= vcnt) {
462 ret = 0;
463 goto done;
464 }
465
466 page_out = bvec[page_out_index].bv_page;
467 pg_offset = 0;
468 page_bytes_left = PAGE_CACHE_SIZE;
469 start_byte = page_offset(page_out) - disk_start;
470
471 /*
472 * make sure our new page is covered by this
473 * working buffer
474 */
475 if (total_out <= start_byte)
476 goto next;
477
478 /* the next page in the biovec might not
479 * be adjacent to the last page, but it
480 * might still be found inside this working
481 * buffer. bump our offset pointer
482 */
483 if (total_out > start_byte &&
484 current_buf_start < start_byte) {
485 buf_offset = start_byte - buf_start;
486 working_bytes = total_out - start_byte;
487 current_buf_start = buf_start +
488 buf_offset;
489 }
490 }
491 } 276 }
492next: 277
493 workspace->inf_strm.next_out = workspace->buf; 278 workspace->inf_strm.next_out = workspace->buf;
494 workspace->inf_strm.avail_out = PAGE_CACHE_SIZE; 279 workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
495 280
@@ -516,35 +301,21 @@ done:
516 zlib_inflateEnd(&workspace->inf_strm); 301 zlib_inflateEnd(&workspace->inf_strm);
517 if (data_in) 302 if (data_in)
518 kunmap(pages_in[page_in_index]); 303 kunmap(pages_in[page_in_index]);
519out:
520 free_workspace(workspace);
521 return ret; 304 return ret;
522} 305}
523 306
524/* 307static int zlib_decompress(struct list_head *ws, unsigned char *data_in,
525 * a less complex decompression routine. Our compressed data fits in a 308 struct page *dest_page,
526 * single page, and we want to read a single page out of it. 309 unsigned long start_byte,
527 * start_byte tells us the offset into the compressed data we're interested in 310 size_t srclen, size_t destlen)
528 */
529int btrfs_zlib_decompress(unsigned char *data_in,
530 struct page *dest_page,
531 unsigned long start_byte,
532 size_t srclen, size_t destlen)
533{ 311{
312 struct workspace *workspace = list_entry(ws, struct workspace, list);
534 int ret = 0; 313 int ret = 0;
535 int wbits = MAX_WBITS; 314 int wbits = MAX_WBITS;
536 struct workspace *workspace;
537 unsigned long bytes_left = destlen; 315 unsigned long bytes_left = destlen;
538 unsigned long total_out = 0; 316 unsigned long total_out = 0;
539 char *kaddr; 317 char *kaddr;
540 318
541 if (destlen > PAGE_CACHE_SIZE)
542 return -ENOMEM;
543
544 workspace = find_zlib_workspace();
545 if (IS_ERR(workspace))
546 return -ENOMEM;
547
548 workspace->inf_strm.next_in = data_in; 319 workspace->inf_strm.next_in = data_in;
549 workspace->inf_strm.avail_in = srclen; 320 workspace->inf_strm.avail_in = srclen;
550 workspace->inf_strm.total_in = 0; 321 workspace->inf_strm.total_in = 0;
@@ -565,8 +336,7 @@ int btrfs_zlib_decompress(unsigned char *data_in,
565 336
566 if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) { 337 if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) {
567 printk(KERN_WARNING "inflateInit failed\n"); 338 printk(KERN_WARNING "inflateInit failed\n");
568 ret = -1; 339 return -1;
569 goto out;
570 } 340 }
571 341
572 while (bytes_left > 0) { 342 while (bytes_left > 0) {
@@ -616,12 +386,13 @@ next:
616 ret = 0; 386 ret = 0;
617 387
618 zlib_inflateEnd(&workspace->inf_strm); 388 zlib_inflateEnd(&workspace->inf_strm);
619out:
620 free_workspace(workspace);
621 return ret; 389 return ret;
622} 390}
623 391
624void btrfs_zlib_exit(void) 392struct btrfs_compress_op btrfs_zlib_compress = {
625{ 393 .alloc_workspace = zlib_alloc_workspace,
626 free_workspaces(); 394 .free_workspace = zlib_free_workspace,
627} 395 .compress_pages = zlib_compress_pages,
396 .decompress_biovec = zlib_decompress_biovec,
397 .decompress = zlib_decompress,
398};
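These ops let the generic btrfs compression code drive zlib (or the new lzo backend) without knowing workspace internals. A rough, non-authoritative sketch of the dispatch pattern, with the ops struct trimmed to the two members the sketch uses; the real caller lives in compression.c and checks IS_ERR() on the returned workspace:

    struct list_head;

    /* Trimmed view of the ops table; the full struct also carries
     * compress_pages, decompress_biovec, and decompress. */
    struct compress_ops_sketch {
        struct list_head *(*alloc_workspace)(void);
        void (*free_workspace)(struct list_head *ws);
    };

    static int run_with_workspace(const struct compress_ops_sketch *op)
    {
        struct list_head *ws = op->alloc_workspace();

        if (!ws)                /* real code: if (IS_ERR(ws)) ... */
            return -1;
        /* ... hand ws to op->compress_pages() or op->decompress() ... */
        op->free_workspace(ws);
        return 0;
    }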
diff --git a/fs/ceph/Makefile b/fs/ceph/Makefile
index 9e6c4f2e8ff1..bd352125e829 100644
--- a/fs/ceph/Makefile
+++ b/fs/ceph/Makefile
@@ -2,31 +2,10 @@
2# Makefile for CEPH filesystem. 2# Makefile for CEPH filesystem.
3# 3#
4 4
5ifneq ($(KERNELRELEASE),)
6
7obj-$(CONFIG_CEPH_FS) += ceph.o 5obj-$(CONFIG_CEPH_FS) += ceph.o
8 6
9ceph-objs := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \ 7ceph-y := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \
10 export.o caps.o snap.o xattr.o \ 8 export.o caps.o snap.o xattr.o \
11 mds_client.o mdsmap.o strings.o ceph_frag.o \ 9 mds_client.o mdsmap.o strings.o ceph_frag.o \
12 debugfs.o 10 debugfs.o
13 11
14else
15#Otherwise we were called directly from the command
16# line; invoke the kernel build system.
17
18KERNELDIR ?= /lib/modules/$(shell uname -r)/build
19PWD := $(shell pwd)
20
21default: all
22
23all:
24 $(MAKE) -C $(KERNELDIR) M=$(PWD) CONFIG_CEPH_FS=m modules
25
26modules_install:
27 $(MAKE) -C $(KERNELDIR) M=$(PWD) CONFIG_CEPH_FS=m modules_install
28
29clean:
30 $(MAKE) -C $(KERNELDIR) M=$(PWD) clean
31
32endif
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index 7ae1b3d55b58..08f65faac112 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -60,10 +60,13 @@ static int mdsc_show(struct seq_file *s, void *p)
60 for (rp = rb_first(&mdsc->request_tree); rp; rp = rb_next(rp)) { 60 for (rp = rb_first(&mdsc->request_tree); rp; rp = rb_next(rp)) {
61 req = rb_entry(rp, struct ceph_mds_request, r_node); 61 req = rb_entry(rp, struct ceph_mds_request, r_node);
62 62
63 if (req->r_request) 63 if (req->r_request && req->r_session)
64 seq_printf(s, "%lld\tmds%d\t", req->r_tid, req->r_mds); 64 seq_printf(s, "%lld\tmds%d\t", req->r_tid,
65 else 65 req->r_session->s_mds);
66 else if (!req->r_request)
66 seq_printf(s, "%lld\t(no request)\t", req->r_tid); 67 seq_printf(s, "%lld\t(no request)\t", req->r_tid);
68 else
69 seq_printf(s, "%lld\t(no session)\t", req->r_tid);
67 70
68 seq_printf(s, "%s", ceph_mds_op_name(req->r_op)); 71 seq_printf(s, "%s", ceph_mds_op_name(req->r_op));
69 72
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index fa7ca04ee816..0bc68de8edd7 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -1224,6 +1224,26 @@ void ceph_dentry_lru_del(struct dentry *dn)
1224 } 1224 }
1225} 1225}
1226 1226
1227/*
1228 * Return name hash for a given dentry. This is dependent on
1229 * the parent directory's hash function.
1230 */
1231unsigned ceph_dentry_hash(struct dentry *dn)
1232{
1233 struct inode *dir = dn->d_parent->d_inode;
1234 struct ceph_inode_info *dci = ceph_inode(dir);
1235
1236 switch (dci->i_dir_layout.dl_dir_hash) {
1237 case 0: /* for backward compat */
1238 case CEPH_STR_HASH_LINUX:
1239 return dn->d_name.hash;
1240
1241 default:
1242 return ceph_str_hash(dci->i_dir_layout.dl_dir_hash,
1243 dn->d_name.name, dn->d_name.len);
1244 }
1245}
1246
1227const struct file_operations ceph_dir_fops = { 1247const struct file_operations ceph_dir_fops = {
1228 .read = ceph_read_dir, 1248 .read = ceph_read_dir,
1229 .readdir = ceph_readdir, 1249 .readdir = ceph_readdir,
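ceph_dentry_hash() keeps the VFS-computed hash for the legacy (0) and Linux algorithms and recomputes with the server's algorithm otherwise. A toy sketch of the same dispatch shape; the constants and hash functions here are placeholders, not the real ceph_str_hash():

    #include <stdio.h>

    enum { HASH_COMPAT = 0, HASH_LINUX = 1 };   /* placeholder values */

    static unsigned cached_vfs_hash(void) { return 0x1234; }
    static unsigned server_hash(int alg)  { return 0x5678u + alg; }

    static unsigned pick_hash(int dl_dir_hash)
    {
        switch (dl_dir_hash) {
        case HASH_COMPAT:        /* backward compat: old MDS sends 0 */
        case HASH_LINUX:
            return cached_vfs_hash();
        default:                 /* recompute with the server's algorithm */
            return server_hash(dl_dir_hash);
        }
    }

    int main(void)
    {
        printf("%x %x\n", pick_hash(0), pick_hash(2));  /* 1234 567a */
        return 0;
    }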
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
index 2297d9426992..e41056174bf8 100644
--- a/fs/ceph/export.c
+++ b/fs/ceph/export.c
@@ -59,7 +59,7 @@ static int ceph_encode_fh(struct dentry *dentry, u32 *rawfh, int *max_len,
59 dout("encode_fh %p connectable\n", dentry); 59 dout("encode_fh %p connectable\n", dentry);
60 cfh->ino = ceph_ino(dentry->d_inode); 60 cfh->ino = ceph_ino(dentry->d_inode);
61 cfh->parent_ino = ceph_ino(parent->d_inode); 61 cfh->parent_ino = ceph_ino(parent->d_inode);
62 cfh->parent_name_hash = parent->d_name.hash; 62 cfh->parent_name_hash = ceph_dentry_hash(parent);
63 *max_len = connected_handle_length; 63 *max_len = connected_handle_length;
64 type = 2; 64 type = 2;
65 } else if (*max_len >= handle_length) { 65 } else if (*max_len >= handle_length) {
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index e61de4f7b99d..e835eff551e3 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -297,6 +297,8 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
297 ci->i_release_count = 0; 297 ci->i_release_count = 0;
298 ci->i_symlink = NULL; 298 ci->i_symlink = NULL;
299 299
300 memset(&ci->i_dir_layout, 0, sizeof(ci->i_dir_layout));
301
300 ci->i_fragtree = RB_ROOT; 302 ci->i_fragtree = RB_ROOT;
301 mutex_init(&ci->i_fragtree_mutex); 303 mutex_init(&ci->i_fragtree_mutex);
302 304
@@ -689,6 +691,8 @@ static int fill_inode(struct inode *inode,
689 inode->i_op = &ceph_dir_iops; 691 inode->i_op = &ceph_dir_iops;
690 inode->i_fop = &ceph_dir_fops; 692 inode->i_fop = &ceph_dir_fops;
691 693
694 ci->i_dir_layout = iinfo->dir_layout;
695
692 ci->i_files = le64_to_cpu(info->files); 696 ci->i_files = le64_to_cpu(info->files);
693 ci->i_subdirs = le64_to_cpu(info->subdirs); 697 ci->i_subdirs = le64_to_cpu(info->subdirs);
694 ci->i_rbytes = le64_to_cpu(info->rbytes); 698 ci->i_rbytes = le64_to_cpu(info->rbytes);
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index a50fca1e03be..1e30d194a8e3 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -60,7 +60,8 @@ static const struct ceph_connection_operations mds_con_ops;
60 * parse individual inode info 60 * parse individual inode info
61 */ 61 */
62static int parse_reply_info_in(void **p, void *end, 62static int parse_reply_info_in(void **p, void *end,
63 struct ceph_mds_reply_info_in *info) 63 struct ceph_mds_reply_info_in *info,
64 int features)
64{ 65{
65 int err = -EIO; 66 int err = -EIO;
66 67
@@ -74,6 +75,12 @@ static int parse_reply_info_in(void **p, void *end,
74 info->symlink = *p; 75 info->symlink = *p;
75 *p += info->symlink_len; 76 *p += info->symlink_len;
76 77
78 if (features & CEPH_FEATURE_DIRLAYOUTHASH)
79 ceph_decode_copy_safe(p, end, &info->dir_layout,
80 sizeof(info->dir_layout), bad);
81 else
82 memset(&info->dir_layout, 0, sizeof(info->dir_layout));
83
77 ceph_decode_32_safe(p, end, info->xattr_len, bad); 84 ceph_decode_32_safe(p, end, info->xattr_len, bad);
78 ceph_decode_need(p, end, info->xattr_len, bad); 85 ceph_decode_need(p, end, info->xattr_len, bad);
79 info->xattr_data = *p; 86 info->xattr_data = *p;
@@ -88,12 +95,13 @@ bad:
88 * target inode. 95 * target inode.
89 */ 96 */
90static int parse_reply_info_trace(void **p, void *end, 97static int parse_reply_info_trace(void **p, void *end,
91 struct ceph_mds_reply_info_parsed *info) 98 struct ceph_mds_reply_info_parsed *info,
99 int features)
92{ 100{
93 int err; 101 int err;
94 102
95 if (info->head->is_dentry) { 103 if (info->head->is_dentry) {
96 err = parse_reply_info_in(p, end, &info->diri); 104 err = parse_reply_info_in(p, end, &info->diri, features);
97 if (err < 0) 105 if (err < 0)
98 goto out_bad; 106 goto out_bad;
99 107
@@ -114,7 +122,7 @@ static int parse_reply_info_trace(void **p, void *end,
114 } 122 }
115 123
116 if (info->head->is_target) { 124 if (info->head->is_target) {
117 err = parse_reply_info_in(p, end, &info->targeti); 125 err = parse_reply_info_in(p, end, &info->targeti, features);
118 if (err < 0) 126 if (err < 0)
119 goto out_bad; 127 goto out_bad;
120 } 128 }
@@ -134,7 +142,8 @@ out_bad:
134 * parse readdir results 142 * parse readdir results
135 */ 143 */
136static int parse_reply_info_dir(void **p, void *end, 144static int parse_reply_info_dir(void **p, void *end,
137 struct ceph_mds_reply_info_parsed *info) 145 struct ceph_mds_reply_info_parsed *info,
146 int features)
138{ 147{
139 u32 num, i = 0; 148 u32 num, i = 0;
140 int err; 149 int err;
@@ -182,7 +191,7 @@ static int parse_reply_info_dir(void **p, void *end,
182 *p += sizeof(struct ceph_mds_reply_lease); 191 *p += sizeof(struct ceph_mds_reply_lease);
183 192
184 /* inode */ 193 /* inode */
185 err = parse_reply_info_in(p, end, &info->dir_in[i]); 194 err = parse_reply_info_in(p, end, &info->dir_in[i], features);
186 if (err < 0) 195 if (err < 0)
187 goto out_bad; 196 goto out_bad;
188 i++; 197 i++;
@@ -205,7 +214,8 @@ out_bad:
205 * parse fcntl F_GETLK results 214 * parse fcntl F_GETLK results
206 */ 215 */
207static int parse_reply_info_filelock(void **p, void *end, 216static int parse_reply_info_filelock(void **p, void *end,
208 struct ceph_mds_reply_info_parsed *info) 217 struct ceph_mds_reply_info_parsed *info,
218 int features)
209{ 219{
210 if (*p + sizeof(*info->filelock_reply) > end) 220 if (*p + sizeof(*info->filelock_reply) > end)
211 goto bad; 221 goto bad;
@@ -225,19 +235,21 @@ bad:
225 * parse extra results 235 * parse extra results
226 */ 236 */
227static int parse_reply_info_extra(void **p, void *end, 237static int parse_reply_info_extra(void **p, void *end,
228 struct ceph_mds_reply_info_parsed *info) 238 struct ceph_mds_reply_info_parsed *info,
239 int features)
229{ 240{
230 if (info->head->op == CEPH_MDS_OP_GETFILELOCK) 241 if (info->head->op == CEPH_MDS_OP_GETFILELOCK)
231 return parse_reply_info_filelock(p, end, info); 242 return parse_reply_info_filelock(p, end, info, features);
232 else 243 else
233 return parse_reply_info_dir(p, end, info); 244 return parse_reply_info_dir(p, end, info, features);
234} 245}
235 246
236/* 247/*
237 * parse entire mds reply 248 * parse entire mds reply
238 */ 249 */
239static int parse_reply_info(struct ceph_msg *msg, 250static int parse_reply_info(struct ceph_msg *msg,
240 struct ceph_mds_reply_info_parsed *info) 251 struct ceph_mds_reply_info_parsed *info,
252 int features)
241{ 253{
242 void *p, *end; 254 void *p, *end;
243 u32 len; 255 u32 len;
@@ -250,7 +262,7 @@ static int parse_reply_info(struct ceph_msg *msg,
250 /* trace */ 262 /* trace */
251 ceph_decode_32_safe(&p, end, len, bad); 263 ceph_decode_32_safe(&p, end, len, bad);
252 if (len > 0) { 264 if (len > 0) {
253 err = parse_reply_info_trace(&p, p+len, info); 265 err = parse_reply_info_trace(&p, p+len, info, features);
254 if (err < 0) 266 if (err < 0)
255 goto out_bad; 267 goto out_bad;
256 } 268 }
@@ -258,7 +270,7 @@ static int parse_reply_info(struct ceph_msg *msg,
258 /* extra */ 270 /* extra */
259 ceph_decode_32_safe(&p, end, len, bad); 271 ceph_decode_32_safe(&p, end, len, bad);
260 if (len > 0) { 272 if (len > 0) {
261 err = parse_reply_info_extra(&p, p+len, info); 273 err = parse_reply_info_extra(&p, p+len, info, features);
262 if (err < 0) 274 if (err < 0)
263 goto out_bad; 275 goto out_bad;
264 } 276 }
@@ -654,7 +666,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
654 } else { 666 } else {
655 /* dir + name */ 667 /* dir + name */
656 inode = dir; 668 inode = dir;
657 hash = req->r_dentry->d_name.hash; 669 hash = ceph_dentry_hash(req->r_dentry);
658 is_hash = true; 670 is_hash = true;
659 } 671 }
660 } 672 }
@@ -1693,7 +1705,6 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc,
1693 struct ceph_msg *msg; 1705 struct ceph_msg *msg;
1694 int flags = 0; 1706 int flags = 0;
1695 1707
1696 req->r_mds = mds;
1697 req->r_attempts++; 1708 req->r_attempts++;
1698 if (req->r_inode) { 1709 if (req->r_inode) {
1699 struct ceph_cap *cap = 1710 struct ceph_cap *cap =
@@ -1780,6 +1791,8 @@ static int __do_request(struct ceph_mds_client *mdsc,
1780 goto finish; 1791 goto finish;
1781 } 1792 }
1782 1793
1794 put_request_session(req);
1795
1783 mds = __choose_mds(mdsc, req); 1796 mds = __choose_mds(mdsc, req);
1784 if (mds < 0 || 1797 if (mds < 0 ||
1785 ceph_mdsmap_get_state(mdsc->mdsmap, mds) < CEPH_MDS_STATE_ACTIVE) { 1798 ceph_mdsmap_get_state(mdsc->mdsmap, mds) < CEPH_MDS_STATE_ACTIVE) {
@@ -1797,6 +1810,8 @@ static int __do_request(struct ceph_mds_client *mdsc,
1797 goto finish; 1810 goto finish;
1798 } 1811 }
1799 } 1812 }
1813 req->r_session = get_session(session);
1814
1800 dout("do_request mds%d session %p state %s\n", mds, session, 1815 dout("do_request mds%d session %p state %s\n", mds, session,
1801 session_state_name(session->s_state)); 1816 session_state_name(session->s_state));
1802 if (session->s_state != CEPH_MDS_SESSION_OPEN && 1817 if (session->s_state != CEPH_MDS_SESSION_OPEN &&
@@ -1809,7 +1824,6 @@ static int __do_request(struct ceph_mds_client *mdsc,
1809 } 1824 }
1810 1825
1811 /* send request */ 1826 /* send request */
1812 req->r_session = get_session(session);
1813 req->r_resend_mds = -1; /* forget any previous mds hint */ 1827 req->r_resend_mds = -1; /* forget any previous mds hint */
1814 1828
1815 if (req->r_request_started == 0) /* note request start time */ 1829 if (req->r_request_started == 0) /* note request start time */
@@ -1863,7 +1877,6 @@ static void kick_requests(struct ceph_mds_client *mdsc, int mds)
1863 if (req->r_session && 1877 if (req->r_session &&
1864 req->r_session->s_mds == mds) { 1878 req->r_session->s_mds == mds) {
1865 dout(" kicking tid %llu\n", req->r_tid); 1879 dout(" kicking tid %llu\n", req->r_tid);
1866 put_request_session(req);
1867 __do_request(mdsc, req); 1880 __do_request(mdsc, req);
1868 } 1881 }
1869 } 1882 }
@@ -2056,8 +2069,11 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
2056 goto out; 2069 goto out;
2057 } else { 2070 } else {
2058 struct ceph_inode_info *ci = ceph_inode(req->r_inode); 2071 struct ceph_inode_info *ci = ceph_inode(req->r_inode);
2059 struct ceph_cap *cap = 2072 struct ceph_cap *cap = NULL;
2060 ceph_get_cap_for_mds(ci, req->r_mds);; 2073
2074 if (req->r_session)
2075 cap = ceph_get_cap_for_mds(ci,
2076 req->r_session->s_mds);
2061 2077
2062 dout("already using auth"); 2078 dout("already using auth");
2063 if ((!cap || cap != ci->i_auth_cap) || 2079 if ((!cap || cap != ci->i_auth_cap) ||
@@ -2101,7 +2117,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
2101 2117
2102 dout("handle_reply tid %lld result %d\n", tid, result); 2118 dout("handle_reply tid %lld result %d\n", tid, result);
2103 rinfo = &req->r_reply_info; 2119 rinfo = &req->r_reply_info;
2104 err = parse_reply_info(msg, rinfo); 2120 err = parse_reply_info(msg, rinfo, session->s_con.peer_features);
2105 mutex_unlock(&mdsc->mutex); 2121 mutex_unlock(&mdsc->mutex);
2106 2122
2107 mutex_lock(&session->s_mutex); 2123 mutex_lock(&session->s_mutex);
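Threading the session's peer feature bits into the parser lets a wire field be decoded only when the MDS actually sent it, and zeroed otherwise. A self-contained sketch of that feature-gated decode pattern; the feature bit, struct layout, and bounds check are illustrative, not the real ceph wire format:

    #include <stdio.h>
    #include <string.h>
    #include <stdint.h>

    #define FEAT_DIRLAYOUTHASH (1 << 7)     /* placeholder bit value */

    struct dir_layout { uint8_t dl_dir_hash; uint8_t pad[3]; };

    static int decode_in(const uint8_t **p, const uint8_t *end,
                         int features, struct dir_layout *out)
    {
        if (features & FEAT_DIRLAYOUTHASH) {
            if ((size_t)(end - *p) < sizeof(*out))
                return -1;                  /* truncated reply */
            memcpy(out, *p, sizeof(*out));
            *p += sizeof(*out);
        } else {
            memset(out, 0, sizeof(*out));   /* old peer: assume defaults */
        }
        return 0;
    }

    int main(void)
    {
        uint8_t buf[4] = { 1, 0, 0, 0 };
        const uint8_t *p = buf;
        struct dir_layout dl;

        decode_in(&p, buf + sizeof(buf), FEAT_DIRLAYOUTHASH, &dl);
        printf("hash alg %u\n", dl.dl_dir_hash);    /* hash alg 1 */
        return 0;
    }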
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index aabe563b54db..4e3a9cc0bba6 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -35,6 +35,7 @@ struct ceph_cap;
35 */ 35 */
36struct ceph_mds_reply_info_in { 36struct ceph_mds_reply_info_in {
37 struct ceph_mds_reply_inode *in; 37 struct ceph_mds_reply_inode *in;
38 struct ceph_dir_layout dir_layout;
38 u32 symlink_len; 39 u32 symlink_len;
39 char *symlink; 40 char *symlink;
40 u32 xattr_len; 41 u32 xattr_len;
@@ -165,7 +166,6 @@ struct ceph_mds_request {
165 struct ceph_mds_client *r_mdsc; 166 struct ceph_mds_client *r_mdsc;
166 167
167 int r_op; /* mds op code */ 168 int r_op; /* mds op code */
168 int r_mds;
169 169
170 /* operation on what? */ 170 /* operation on what? */
171 struct inode *r_inode; /* arg1 */ 171 struct inode *r_inode; /* arg1 */
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 08b460ae0539..bf6f0f34082a 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -428,7 +428,8 @@ struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
428 goto fail; 428 goto fail;
429 } 429 }
430 fsc->client->extra_mon_dispatch = extra_mon_dispatch; 430 fsc->client->extra_mon_dispatch = extra_mon_dispatch;
431 fsc->client->supported_features |= CEPH_FEATURE_FLOCK; 431 fsc->client->supported_features |= CEPH_FEATURE_FLOCK |
432 CEPH_FEATURE_DIRLAYOUTHASH;
432 fsc->client->monc.want_mdsmap = 1; 433 fsc->client->monc.want_mdsmap = 1;
433 434
434 fsc->mount_options = fsopt; 435 fsc->mount_options = fsopt;
@@ -443,13 +444,17 @@ struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
443 goto fail_client; 444 goto fail_client;
444 445
445 err = -ENOMEM; 446 err = -ENOMEM;
446 fsc->wb_wq = create_workqueue("ceph-writeback"); 447 /*
448 * The number of concurrent work items can be high, but they don't
449 * need to be processed in parallel; limit concurrency.
450 */
451 fsc->wb_wq = alloc_workqueue("ceph-writeback", 0, 1);
447 if (fsc->wb_wq == NULL) 452 if (fsc->wb_wq == NULL)
448 goto fail_bdi; 453 goto fail_bdi;
449 fsc->pg_inv_wq = create_singlethread_workqueue("ceph-pg-invalid"); 454 fsc->pg_inv_wq = alloc_workqueue("ceph-pg-invalid", 0, 1);
450 if (fsc->pg_inv_wq == NULL) 455 if (fsc->pg_inv_wq == NULL)
451 goto fail_wb_wq; 456 goto fail_wb_wq;
452 fsc->trunc_wq = create_singlethread_workqueue("ceph-trunc"); 457 fsc->trunc_wq = alloc_workqueue("ceph-trunc", 0, 1);
453 if (fsc->trunc_wq == NULL) 458 if (fsc->trunc_wq == NULL)
454 goto fail_pg_inv_wq; 459 goto fail_pg_inv_wq;
455 460
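alloc_workqueue() with max_active = 1 preserves the serialization that create_singlethread_workqueue() gave, without pinning a kernel thread per queue. A minimal kernel-style sketch of the conversion pattern (module boilerplate omitted):

    #include <linux/workqueue.h>
    #include <linux/errno.h>

    static struct workqueue_struct *example_wq;

    static int example_init(void)
    {
        /* was: example_wq = create_singlethread_workqueue("example"); */
        example_wq = alloc_workqueue("example", 0, 1);
        if (!example_wq)
            return -ENOMEM;
        return 0;
    }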
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 4553d8829edb..20b907d76ae2 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -239,6 +239,7 @@ struct ceph_inode_info {
239 unsigned i_ceph_flags; 239 unsigned i_ceph_flags;
240 unsigned long i_release_count; 240 unsigned long i_release_count;
241 241
242 struct ceph_dir_layout i_dir_layout;
242 struct ceph_file_layout i_layout; 243 struct ceph_file_layout i_layout;
243 char *i_symlink; 244 char *i_symlink;
244 245
@@ -768,6 +769,7 @@ extern void ceph_dentry_lru_add(struct dentry *dn);
768extern void ceph_dentry_lru_touch(struct dentry *dn); 769extern void ceph_dentry_lru_touch(struct dentry *dn);
769extern void ceph_dentry_lru_del(struct dentry *dn); 770extern void ceph_dentry_lru_del(struct dentry *dn);
770extern void ceph_invalidate_dentry_lease(struct dentry *dentry); 771extern void ceph_invalidate_dentry_lease(struct dentry *dentry);
772extern unsigned ceph_dentry_hash(struct dentry *dn);
771 773
772/* 774/*
773 * our d_ops vary depending on whether the inode is live, 775 * our d_ops vary depending on whether the inode is live,
diff --git a/fs/char_dev.c b/fs/char_dev.c
index e5b9df993b93..dca9e5e0f73b 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -59,7 +59,7 @@ static struct char_device_struct {
59} *chrdevs[CHRDEV_MAJOR_HASH_SIZE]; 59} *chrdevs[CHRDEV_MAJOR_HASH_SIZE];
60 60
61/* index in the above */ 61/* index in the above */
62static inline int major_to_index(int major) 62static inline int major_to_index(unsigned major)
63{ 63{
64 return major % CHRDEV_MAJOR_HASH_SIZE; 64 return major % CHRDEV_MAJOR_HASH_SIZE;
65} 65}
@@ -417,18 +417,6 @@ static int chrdev_open(struct inode *inode, struct file *filp)
417 return ret; 417 return ret;
418} 418}
419 419
420int cdev_index(struct inode *inode)
421{
422 int idx;
423 struct kobject *kobj;
424
425 kobj = kobj_lookup(cdev_map, inode->i_rdev, &idx);
426 if (!kobj)
427 return -1;
428 kobject_put(kobj);
429 return idx;
430}
431
432void cd_forget(struct inode *inode) 420void cd_forget(struct inode *inode)
433{ 421{
434 spin_lock(&cdev_lock); 422 spin_lock(&cdev_lock);
@@ -582,7 +570,6 @@ EXPORT_SYMBOL(cdev_init);
582EXPORT_SYMBOL(cdev_alloc); 570EXPORT_SYMBOL(cdev_alloc);
583EXPORT_SYMBOL(cdev_del); 571EXPORT_SYMBOL(cdev_del);
584EXPORT_SYMBOL(cdev_add); 572EXPORT_SYMBOL(cdev_add);
585EXPORT_SYMBOL(cdev_index);
586EXPORT_SYMBOL(__register_chrdev); 573EXPORT_SYMBOL(__register_chrdev);
587EXPORT_SYMBOL(__unregister_chrdev); 574EXPORT_SYMBOL(__unregister_chrdev);
588EXPORT_SYMBOL(directly_mappable_cdev_bdi); 575EXPORT_SYMBOL(directly_mappable_cdev_bdi);
diff --git a/fs/cifs/cache.c b/fs/cifs/cache.c
index 224d7bbd1fcc..e654dfd092c3 100644
--- a/fs/cifs/cache.c
+++ b/fs/cifs/cache.c
@@ -64,7 +64,9 @@ static uint16_t cifs_server_get_key(const void *cookie_netfs_data,
64 void *buffer, uint16_t maxbuf) 64 void *buffer, uint16_t maxbuf)
65{ 65{
66 const struct TCP_Server_Info *server = cookie_netfs_data; 66 const struct TCP_Server_Info *server = cookie_netfs_data;
67 const struct sockaddr *sa = (struct sockaddr *) &server->addr.sockAddr; 67 const struct sockaddr *sa = (struct sockaddr *) &server->dstaddr;
68 const struct sockaddr_in *addr = (struct sockaddr_in *) sa;
69 const struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *) sa;
68 struct cifs_server_key *key = buffer; 70 struct cifs_server_key *key = buffer;
69 uint16_t key_len = sizeof(struct cifs_server_key); 71 uint16_t key_len = sizeof(struct cifs_server_key);
70 72
@@ -76,16 +78,16 @@ static uint16_t cifs_server_get_key(const void *cookie_netfs_data,
76 */ 78 */
77 switch (sa->sa_family) { 79 switch (sa->sa_family) {
78 case AF_INET: 80 case AF_INET:
79 key->family = server->addr.sockAddr.sin_family; 81 key->family = sa->sa_family;
80 key->port = server->addr.sockAddr.sin_port; 82 key->port = addr->sin_port;
81 key->addr[0].ipv4_addr = server->addr.sockAddr.sin_addr; 83 key->addr[0].ipv4_addr = addr->sin_addr;
82 key_len += sizeof(key->addr[0].ipv4_addr); 84 key_len += sizeof(key->addr[0].ipv4_addr);
83 break; 85 break;
84 86
85 case AF_INET6: 87 case AF_INET6:
86 key->family = server->addr.sockAddr6.sin6_family; 88 key->family = sa->sa_family;
87 key->port = server->addr.sockAddr6.sin6_port; 89 key->port = addr6->sin6_port;
88 key->addr[0].ipv6_addr = server->addr.sockAddr6.sin6_addr; 90 key->addr[0].ipv6_addr = addr6->sin6_addr;
89 key_len += sizeof(key->addr[0].ipv6_addr); 91 key_len += sizeof(key->addr[0].ipv6_addr);
90 break; 92 break;
91 93
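The key-generation code now goes through a single generic sockaddr pointer and casts per address family instead of keeping separately typed fields. A runnable sketch of the same casting pattern:

    #include <stdio.h>
    #include <netinet/in.h>
    #include <sys/socket.h>

    static unsigned short peer_port(const struct sockaddr *sa)
    {
        switch (sa->sa_family) {
        case AF_INET:
            return ntohs(((const struct sockaddr_in *)sa)->sin_port);
        case AF_INET6:
            return ntohs(((const struct sockaddr_in6 *)sa)->sin6_port);
        default:
            return 0;
        }
    }

    int main(void)
    {
        struct sockaddr_in v4 = { .sin_family = AF_INET,
                                  .sin_port   = htons(445) };

        printf("%u\n", peer_port((struct sockaddr *)&v4));  /* 445 */
        return 0;
    }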
diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index 103ab8b605b0..65829d32128c 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -79,11 +79,11 @@ void cifs_dump_mids(struct TCP_Server_Info *server)
79 spin_lock(&GlobalMid_Lock); 79 spin_lock(&GlobalMid_Lock);
80 list_for_each(tmp, &server->pending_mid_q) { 80 list_for_each(tmp, &server->pending_mid_q) {
81 mid_entry = list_entry(tmp, struct mid_q_entry, qhead); 81 mid_entry = list_entry(tmp, struct mid_q_entry, qhead);
82 cERROR(1, "State: %d Cmd: %d Pid: %d Tsk: %p Mid %d", 82 cERROR(1, "State: %d Cmd: %d Pid: %d Cbdata: %p Mid %d",
83 mid_entry->midState, 83 mid_entry->midState,
84 (int)mid_entry->command, 84 (int)mid_entry->command,
85 mid_entry->pid, 85 mid_entry->pid,
86 mid_entry->tsk, 86 mid_entry->callback_data,
87 mid_entry->mid); 87 mid_entry->mid);
88#ifdef CONFIG_CIFS_STATS2 88#ifdef CONFIG_CIFS_STATS2
89 cERROR(1, "IsLarge: %d buf: %p time rcv: %ld now: %ld", 89 cERROR(1, "IsLarge: %d buf: %p time rcv: %ld now: %ld",
@@ -119,29 +119,27 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
119 "Display Internal CIFS Data Structures for Debugging\n" 119 "Display Internal CIFS Data Structures for Debugging\n"
120 "---------------------------------------------------\n"); 120 "---------------------------------------------------\n");
121 seq_printf(m, "CIFS Version %s\n", CIFS_VERSION); 121 seq_printf(m, "CIFS Version %s\n", CIFS_VERSION);
122 seq_printf(m, "Features: "); 122 seq_printf(m, "Features:");
123#ifdef CONFIG_CIFS_DFS_UPCALL 123#ifdef CONFIG_CIFS_DFS_UPCALL
124 seq_printf(m, "dfs"); 124 seq_printf(m, " dfs");
125 seq_putc(m, ' ');
126#endif 125#endif
127#ifdef CONFIG_CIFS_FSCACHE 126#ifdef CONFIG_CIFS_FSCACHE
128 seq_printf(m, "fscache"); 127 seq_printf(m, " fscache");
129 seq_putc(m, ' ');
130#endif 128#endif
131#ifdef CONFIG_CIFS_WEAK_PW_HASH 129#ifdef CONFIG_CIFS_WEAK_PW_HASH
132 seq_printf(m, "lanman"); 130 seq_printf(m, " lanman");
133 seq_putc(m, ' ');
134#endif 131#endif
135#ifdef CONFIG_CIFS_POSIX 132#ifdef CONFIG_CIFS_POSIX
136 seq_printf(m, "posix"); 133 seq_printf(m, " posix");
137 seq_putc(m, ' ');
138#endif 134#endif
139#ifdef CONFIG_CIFS_UPCALL 135#ifdef CONFIG_CIFS_UPCALL
140 seq_printf(m, "spnego"); 136 seq_printf(m, " spnego");
141 seq_putc(m, ' ');
142#endif 137#endif
143#ifdef CONFIG_CIFS_XATTR 138#ifdef CONFIG_CIFS_XATTR
144 seq_printf(m, "xattr"); 139 seq_printf(m, " xattr");
140#endif
141#ifdef CONFIG_CIFS_ACL
142 seq_printf(m, " acl");
145#endif 143#endif
146 seq_putc(m, '\n'); 144 seq_putc(m, '\n');
147 seq_printf(m, "Active VFS Requests: %d\n", GlobalTotalActiveXid); 145 seq_printf(m, "Active VFS Requests: %d\n", GlobalTotalActiveXid);
@@ -220,11 +218,11 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
220 mid_entry = list_entry(tmp3, struct mid_q_entry, 218 mid_entry = list_entry(tmp3, struct mid_q_entry,
221 qhead); 219 qhead);
222 seq_printf(m, "\tState: %d com: %d pid:" 220 seq_printf(m, "\tState: %d com: %d pid:"
223 " %d tsk: %p mid %d\n", 221 " %d cbdata: %p mid %d\n",
224 mid_entry->midState, 222 mid_entry->midState,
225 (int)mid_entry->command, 223 (int)mid_entry->command,
226 mid_entry->pid, 224 mid_entry->pid,
227 mid_entry->tsk, 225 mid_entry->callback_data,
228 mid_entry->mid); 226 mid_entry->mid);
229 } 227 }
230 spin_unlock(&GlobalMid_Lock); 228 spin_unlock(&GlobalMid_Lock);
@@ -333,7 +331,7 @@ static int cifs_stats_proc_show(struct seq_file *m, void *v)
333 atomic_read(&totSmBufAllocCount)); 331 atomic_read(&totSmBufAllocCount));
334#endif /* CONFIG_CIFS_STATS2 */ 332#endif /* CONFIG_CIFS_STATS2 */
335 333
336 seq_printf(m, "Operations (MIDs): %d\n", midCount.counter); 334 seq_printf(m, "Operations (MIDs): %d\n", atomic_read(&midCount));
337 seq_printf(m, 335 seq_printf(m,
338 "\n%d session %d share reconnects\n", 336 "\n%d session %d share reconnects\n",
339 tcpSesReconnectCount.counter, tconInfoReconnectCount.counter); 337 tcpSesReconnectCount.counter, tconInfoReconnectCount.counter);
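
The midCount change above swaps a read of the private .counter field for the atomic_read() accessor; the neighboring reconnect counters are left on the raw field by this patch. A minimal sketch of the accessor pattern, with a variable local to the example:

	#include <linux/atomic.h>

	static atomic_t example_count = ATOMIC_INIT(0);

	/* atomic_inc()/atomic_read() are the supported API; touching
	 * example_count.counter directly relies on internal layout. */
	static int example_snapshot(void)
	{
		atomic_inc(&example_count);
		return atomic_read(&example_count);
	}
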
diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index c68a056f27fd..7ed36536e754 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -255,35 +255,6 @@ static struct vfsmount *cifs_dfs_do_refmount(struct cifs_sb_info *cifs_sb,
255 255
256} 256}
257 257
258static int add_mount_helper(struct vfsmount *newmnt, struct nameidata *nd,
259 struct list_head *mntlist)
260{
261 /* stolen from afs code */
262 int err;
263
264 mntget(newmnt);
265 err = do_add_mount(newmnt, &nd->path, nd->path.mnt->mnt_flags | MNT_SHRINKABLE, mntlist);
266 switch (err) {
267 case 0:
268 path_put(&nd->path);
269 nd->path.mnt = newmnt;
270 nd->path.dentry = dget(newmnt->mnt_root);
271 schedule_delayed_work(&cifs_dfs_automount_task,
272 cifs_dfs_mountpoint_expiry_timeout);
273 break;
274 case -EBUSY:
275 /* someone else made a mount here whilst we were busy */
276 while (d_mountpoint(nd->path.dentry) &&
277 follow_down(&nd->path))
278 ;
279 err = 0;
280 default:
281 mntput(newmnt);
282 break;
283 }
284 return err;
285}
286
287static void dump_referral(const struct dfs_info3_param *ref) 258static void dump_referral(const struct dfs_info3_param *ref)
288{ 259{
289 cFYI(1, "DFS: ref path: %s", ref->path_name); 260 cFYI(1, "DFS: ref path: %s", ref->path_name);
@@ -293,45 +264,43 @@ static void dump_referral(const struct dfs_info3_param *ref)
293 ref->path_consumed); 264 ref->path_consumed);
294} 265}
295 266
296 267/*
297static void* 268 * Create a vfsmount that we can automount
298cifs_dfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd) 269 */
270static struct vfsmount *cifs_dfs_do_automount(struct dentry *mntpt)
299{ 271{
300 struct dfs_info3_param *referrals = NULL; 272 struct dfs_info3_param *referrals = NULL;
301 unsigned int num_referrals = 0; 273 unsigned int num_referrals = 0;
302 struct cifs_sb_info *cifs_sb; 274 struct cifs_sb_info *cifs_sb;
303 struct cifsSesInfo *ses; 275 struct cifsSesInfo *ses;
304 char *full_path = NULL; 276 char *full_path;
305 int xid, i; 277 int xid, i;
306 int rc = 0; 278 int rc;
307 struct vfsmount *mnt = ERR_PTR(-ENOENT); 279 struct vfsmount *mnt;
308 struct tcon_link *tlink; 280 struct tcon_link *tlink;
309 281
310 cFYI(1, "in %s", __func__); 282 cFYI(1, "in %s", __func__);
311 BUG_ON(IS_ROOT(dentry)); 283 BUG_ON(IS_ROOT(mntpt));
312 284
313 xid = GetXid(); 285 xid = GetXid();
314 286
315 dput(nd->path.dentry);
316 nd->path.dentry = dget(dentry);
317
318 /* 287 /*
319 * The MSDFS spec states that paths in DFS referral requests and 288 * The MSDFS spec states that paths in DFS referral requests and
320 * responses must be prefixed by a single '\' character instead of 289 * responses must be prefixed by a single '\' character instead of
321 * the double backslashes usually used in the UNC. This function 290 * the double backslashes usually used in the UNC. This function
322 * gives us the latter, so we must adjust the result. 291 * gives us the latter, so we must adjust the result.
323 */ 292 */
324 full_path = build_path_from_dentry(dentry); 293 mnt = ERR_PTR(-ENOMEM);
325 if (full_path == NULL) { 294 full_path = build_path_from_dentry(mntpt);
326 rc = -ENOMEM; 295 if (full_path == NULL)
327 goto out_err; 296 goto free_xid;
328 }
329 297
330 cifs_sb = CIFS_SB(dentry->d_inode->i_sb); 298 cifs_sb = CIFS_SB(mntpt->d_inode->i_sb);
331 tlink = cifs_sb_tlink(cifs_sb); 299 tlink = cifs_sb_tlink(cifs_sb);
300 mnt = ERR_PTR(-EINVAL);
332 if (IS_ERR(tlink)) { 301 if (IS_ERR(tlink)) {
333 rc = PTR_ERR(tlink); 302 mnt = ERR_CAST(tlink);
334 goto out_err; 303 goto free_full_path;
335 } 304 }
336 ses = tlink_tcon(tlink)->ses; 305 ses = tlink_tcon(tlink)->ses;
337 306
@@ -341,46 +310,63 @@ cifs_dfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
341 310
342 cifs_put_tlink(tlink); 311 cifs_put_tlink(tlink);
343 312
313 mnt = ERR_PTR(-ENOENT);
344 for (i = 0; i < num_referrals; i++) { 314 for (i = 0; i < num_referrals; i++) {
345 int len; 315 int len;
346 dump_referral(referrals+i); 316 dump_referral(referrals + i);
347 /* connect to a node */ 317 /* connect to a node */
348 len = strlen(referrals[i].node_name); 318 len = strlen(referrals[i].node_name);
349 if (len < 2) { 319 if (len < 2) {
350 cERROR(1, "%s: Net Address path too short: %s", 320 cERROR(1, "%s: Net Address path too short: %s",
351 __func__, referrals[i].node_name); 321 __func__, referrals[i].node_name);
352 rc = -EINVAL; 322 mnt = ERR_PTR(-EINVAL);
353 goto out_err; 323 break;
354 } 324 }
355 mnt = cifs_dfs_do_refmount(cifs_sb, 325 mnt = cifs_dfs_do_refmount(cifs_sb,
356 full_path, referrals + i); 326 full_path, referrals + i);
357 cFYI(1, "%s: cifs_dfs_do_refmount:%s , mnt:%p", __func__, 327 cFYI(1, "%s: cifs_dfs_do_refmount:%s , mnt:%p", __func__,
358 referrals[i].node_name, mnt); 328 referrals[i].node_name, mnt);
359
360 /* complete mount procedure if we accured submount */
361 if (!IS_ERR(mnt)) 329 if (!IS_ERR(mnt))
362 break; 330 goto success;
363 } 331 }
364 332
365 /* we need it cause for() above could exit without valid submount */ 333 /* no valid submounts were found; return error from get_dfs_path() by
366 rc = PTR_ERR(mnt); 334 * preference */
367 if (IS_ERR(mnt)) 335 if (rc != 0)
368 goto out_err; 336 mnt = ERR_PTR(rc);
369
370 rc = add_mount_helper(mnt, nd, &cifs_dfs_automount_list);
371 337
372out: 338success:
373 FreeXid(xid);
374 free_dfs_info_array(referrals, num_referrals); 339 free_dfs_info_array(referrals, num_referrals);
340free_full_path:
375 kfree(full_path); 341 kfree(full_path);
342free_xid:
343 FreeXid(xid);
376 cFYI(1, "leaving %s" , __func__); 344 cFYI(1, "leaving %s" , __func__);
377 return ERR_PTR(rc); 345 return mnt;
378out_err: 346}
379 path_put(&nd->path); 347
380 goto out; 348/*
349 * Attempt to automount the referral
350 */
351struct vfsmount *cifs_dfs_d_automount(struct path *path)
352{
353 struct vfsmount *newmnt;
354
355 cFYI(1, "in %s", __func__);
356
357 newmnt = cifs_dfs_do_automount(path->dentry);
358 if (IS_ERR(newmnt)) {
359 cFYI(1, "leaving %s [automount failed]" , __func__);
360 return newmnt;
361 }
362
363 mntget(newmnt); /* prevent immediate expiration */
364 mnt_set_expiry(newmnt, &cifs_dfs_automount_list);
365 schedule_delayed_work(&cifs_dfs_automount_task,
366 cifs_dfs_mountpoint_expiry_timeout);
367 cFYI(1, "leaving %s [ok]" , __func__);
368 return newmnt;
381} 369}
382 370
383const struct inode_operations cifs_dfs_referral_inode_operations = { 371const struct inode_operations cifs_dfs_referral_inode_operations = {
384 .follow_link = cifs_dfs_follow_mountpoint,
385}; 372};
386
diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h
index 7852cd677051..ac51cd2d33ae 100644
--- a/fs/cifs/cifs_fs_sb.h
+++ b/fs/cifs/cifs_fs_sb.h
@@ -40,6 +40,7 @@
40#define CIFS_MOUNT_FSCACHE 0x8000 /* local caching enabled */ 40#define CIFS_MOUNT_FSCACHE 0x8000 /* local caching enabled */
41#define CIFS_MOUNT_MF_SYMLINKS 0x10000 /* Minshall+French Symlinks enabled */ 41#define CIFS_MOUNT_MF_SYMLINKS 0x10000 /* Minshall+French Symlinks enabled */
42#define CIFS_MOUNT_MULTIUSER 0x20000 /* multiuser mount */ 42#define CIFS_MOUNT_MULTIUSER 0x20000 /* multiuser mount */
43#define CIFS_MOUNT_STRICT_IO 0x40000 /* strict cache mode */
43 44
44struct cifs_sb_info { 45struct cifs_sb_info {
45 struct rb_root tlink_tree; 46 struct rb_root tlink_tree;
diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c
index 87044906cd1f..4dfba8283165 100644
--- a/fs/cifs/cifs_spnego.c
+++ b/fs/cifs/cifs_spnego.c
@@ -98,6 +98,8 @@ struct key *
98cifs_get_spnego_key(struct cifsSesInfo *sesInfo) 98cifs_get_spnego_key(struct cifsSesInfo *sesInfo)
99{ 99{
100 struct TCP_Server_Info *server = sesInfo->server; 100 struct TCP_Server_Info *server = sesInfo->server;
101 struct sockaddr_in *sa = (struct sockaddr_in *) &server->dstaddr;
102 struct sockaddr_in6 *sa6 = (struct sockaddr_in6 *) &server->dstaddr;
101 char *description, *dp; 103 char *description, *dp;
102 size_t desc_len; 104 size_t desc_len;
103 struct key *spnego_key; 105 struct key *spnego_key;
@@ -127,10 +129,10 @@ cifs_get_spnego_key(struct cifsSesInfo *sesInfo)
127 dp = description + strlen(description); 129 dp = description + strlen(description);
128 130
129 /* add the server address */ 131 /* add the server address */
130 if (server->addr.sockAddr.sin_family == AF_INET) 132 if (server->dstaddr.ss_family == AF_INET)
131 sprintf(dp, "ip4=%pI4", &server->addr.sockAddr.sin_addr); 133 sprintf(dp, "ip4=%pI4", &sa->sin_addr);
132 else if (server->addr.sockAddr.sin_family == AF_INET6) 134 else if (server->dstaddr.ss_family == AF_INET6)
133 sprintf(dp, "ip6=%pI6", &server->addr.sockAddr6.sin6_addr); 135 sprintf(dp, "ip6=%pI6", &sa6->sin6_addr);
134 else 136 else
135 goto out; 137 goto out;
136 138
diff --git a/fs/cifs/cifs_unicode.c b/fs/cifs/cifs_unicode.c
index 430f510a1720..fc0fd4fde306 100644
--- a/fs/cifs/cifs_unicode.c
+++ b/fs/cifs/cifs_unicode.c
@@ -44,10 +44,14 @@ cifs_ucs2_bytes(const __le16 *from, int maxbytes,
44 int charlen, outlen = 0; 44 int charlen, outlen = 0;
45 int maxwords = maxbytes / 2; 45 int maxwords = maxbytes / 2;
46 char tmp[NLS_MAX_CHARSET_SIZE]; 46 char tmp[NLS_MAX_CHARSET_SIZE];
47 __u16 ftmp;
47 48
48 for (i = 0; i < maxwords && from[i]; i++) { 49 for (i = 0; i < maxwords; i++) {
49 charlen = codepage->uni2char(le16_to_cpu(from[i]), tmp, 50 ftmp = get_unaligned_le16(&from[i]);
50 NLS_MAX_CHARSET_SIZE); 51 if (ftmp == 0)
52 break;
53
54 charlen = codepage->uni2char(ftmp, tmp, NLS_MAX_CHARSET_SIZE);
51 if (charlen > 0) 55 if (charlen > 0)
52 outlen += charlen; 56 outlen += charlen;
53 else 57 else
@@ -58,9 +62,9 @@ cifs_ucs2_bytes(const __le16 *from, int maxbytes,
58} 62}
59 63
60/* 64/*
61 * cifs_mapchar - convert a little-endian char to proper char in codepage 65 * cifs_mapchar - convert a host-endian char to proper char in codepage
62 * @target - where converted character should be copied 66 * @target - where converted character should be copied
63 * @src_char - 2 byte little-endian source character 67 * @src_char - 2 byte host-endian source character
64 * @cp - codepage to which character should be converted 68 * @cp - codepage to which character should be converted
65 * @mapchar - should character be mapped according to mapchars mount option? 69 * @mapchar - should character be mapped according to mapchars mount option?
66 * 70 *
@@ -69,7 +73,7 @@ cifs_ucs2_bytes(const __le16 *from, int maxbytes,
69 * enough to hold the result of the conversion (at least NLS_MAX_CHARSET_SIZE). 73 * enough to hold the result of the conversion (at least NLS_MAX_CHARSET_SIZE).
70 */ 74 */
71static int 75static int
72cifs_mapchar(char *target, const __le16 src_char, const struct nls_table *cp, 76cifs_mapchar(char *target, const __u16 src_char, const struct nls_table *cp,
73 bool mapchar) 77 bool mapchar)
74{ 78{
75 int len = 1; 79 int len = 1;
@@ -82,7 +86,7 @@ cifs_mapchar(char *target, const __le16 src_char, const struct nls_table *cp,
82 * build_path_from_dentry are modified, as they use slash as 86 * build_path_from_dentry are modified, as they use slash as
83 * separator. 87 * separator.
84 */ 88 */
85 switch (le16_to_cpu(src_char)) { 89 switch (src_char) {
86 case UNI_COLON: 90 case UNI_COLON:
87 *target = ':'; 91 *target = ':';
88 break; 92 break;
@@ -109,8 +113,7 @@ out:
109 return len; 113 return len;
110 114
111cp_convert: 115cp_convert:
112 len = cp->uni2char(le16_to_cpu(src_char), target, 116 len = cp->uni2char(src_char, target, NLS_MAX_CHARSET_SIZE);
113 NLS_MAX_CHARSET_SIZE);
114 if (len <= 0) { 117 if (len <= 0) {
115 *target = '?'; 118 *target = '?';
116 len = 1; 119 len = 1;
@@ -149,6 +152,7 @@ cifs_from_ucs2(char *to, const __le16 *from, int tolen, int fromlen,
149 int nullsize = nls_nullsize(codepage); 152 int nullsize = nls_nullsize(codepage);
150 int fromwords = fromlen / 2; 153 int fromwords = fromlen / 2;
151 char tmp[NLS_MAX_CHARSET_SIZE]; 154 char tmp[NLS_MAX_CHARSET_SIZE];
155 __u16 ftmp;
152 156
153 /* 157 /*
154 * because the chars can be of varying widths, we need to take care 158 * because the chars can be of varying widths, we need to take care
@@ -158,19 +162,23 @@ cifs_from_ucs2(char *to, const __le16 *from, int tolen, int fromlen,
158 */ 162 */
159 safelen = tolen - (NLS_MAX_CHARSET_SIZE + nullsize); 163 safelen = tolen - (NLS_MAX_CHARSET_SIZE + nullsize);
160 164
161 for (i = 0; i < fromwords && from[i]; i++) { 165 for (i = 0; i < fromwords; i++) {
166 ftmp = get_unaligned_le16(&from[i]);
167 if (ftmp == 0)
168 break;
169
162 /* 170 /*
163 * check to see if converting this character might make the 171 * check to see if converting this character might make the
164 * conversion bleed into the null terminator 172 * conversion bleed into the null terminator
165 */ 173 */
166 if (outlen >= safelen) { 174 if (outlen >= safelen) {
167 charlen = cifs_mapchar(tmp, from[i], codepage, mapchar); 175 charlen = cifs_mapchar(tmp, ftmp, codepage, mapchar);
168 if ((outlen + charlen) > (tolen - nullsize)) 176 if ((outlen + charlen) > (tolen - nullsize))
169 break; 177 break;
170 } 178 }
171 179
172 /* put converted char into 'to' buffer */ 180 /* put converted char into 'to' buffer */
173 charlen = cifs_mapchar(&to[outlen], from[i], codepage, mapchar); 181 charlen = cifs_mapchar(&to[outlen], ftmp, codepage, mapchar);
174 outlen += charlen; 182 outlen += charlen;
175 } 183 }
176 184
@@ -193,24 +201,21 @@ cifs_strtoUCS(__le16 *to, const char *from, int len,
193{ 201{
194 int charlen; 202 int charlen;
195 int i; 203 int i;
196 wchar_t *wchar_to = (wchar_t *)to; /* needed to quiet sparse */ 204 wchar_t wchar_to; /* needed to quiet sparse */
197 205
198 for (i = 0; len && *from; i++, from += charlen, len -= charlen) { 206 for (i = 0; len && *from; i++, from += charlen, len -= charlen) {
199 207 charlen = codepage->char2uni(from, len, &wchar_to);
200 /* works for 2.4.0 kernel or later */
201 charlen = codepage->char2uni(from, len, &wchar_to[i]);
202 if (charlen < 1) { 208 if (charlen < 1) {
203 cERROR(1, "strtoUCS: char2uni of %d returned %d", 209 cERROR(1, "strtoUCS: char2uni of 0x%x returned %d",
204 (int)*from, charlen); 210 *from, charlen);
205 /* A question mark */ 211 /* A question mark */
206 to[i] = cpu_to_le16(0x003f); 212 wchar_to = 0x003f;
207 charlen = 1; 213 charlen = 1;
208 } else 214 }
209 to[i] = cpu_to_le16(wchar_to[i]); 215 put_unaligned_le16(wchar_to, &to[i]);
210
211 } 216 }
212 217
213 to[i] = 0; 218 put_unaligned_le16(0, &to[i]);
214 return i; 219 return i;
215} 220}
216 221
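
The rewrite above routes every 16-bit load and store through get_unaligned_le16()/put_unaligned_le16(), since wire buffers give no alignment guarantee. A userspace model of what those helpers do, with illustrative names:

	#include <stdint.h>
	#include <string.h>

	/* Model of put_unaligned_le16(): store a 16-bit little-endian value
	 * at any address via byte copies, so strict-alignment CPUs never
	 * take an alignment fault. */
	static inline void model_put_unaligned_le16(uint16_t val, void *p)
	{
		unsigned char b[2] = { val & 0xff, val >> 8 };

		memcpy(p, b, sizeof(b));
	}

	/* Model of get_unaligned_le16(): the matching unaligned load. */
	static inline uint16_t model_get_unaligned_le16(const void *p)
	{
		unsigned char b[2];

		memcpy(b, p, sizeof(b));
		return (uint16_t)(b[0] | (b[1] << 8));
	}
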
@@ -252,3 +257,79 @@ cifs_strndup_from_ucs(const char *src, const int maxlen, const bool is_unicode,
252 return dst; 257 return dst;
253} 258}
254 259
260/*
261 * Convert 16 bit Unicode pathname to wire format from string in current code
262 * page. Conversion may involve remapping the six characters that are
263 * only legal in a POSIX-like OS (if they are present in the string). Path
264 * names are little-endian 16-bit Unicode on the wire.
265 */
266int
267cifsConvertToUCS(__le16 *target, const char *source, int maxlen,
268 const struct nls_table *cp, int mapChars)
269{
270 int i, j, charlen;
271 int len_remaining = maxlen;
272 char src_char;
273 __u16 temp;
274
275 if (!mapChars)
276 return cifs_strtoUCS(target, source, PATH_MAX, cp);
277
278 for (i = 0, j = 0; i < maxlen; j++) {
279 src_char = source[i];
280 switch (src_char) {
281 case 0:
282 put_unaligned_le16(0, &target[j]);
283 goto ctoUCS_out;
284 case ':':
285 temp = UNI_COLON;
286 break;
287 case '*':
288 temp = UNI_ASTERIK;
289 break;
290 case '?':
291 temp = UNI_QUESTION;
292 break;
293 case '<':
294 temp = UNI_LESSTHAN;
295 break;
296 case '>':
297 temp = UNI_GRTRTHAN;
298 break;
299 case '|':
300 temp = UNI_PIPE;
301 break;
302 /*
303 * FIXME: We can not handle remapping backslash (UNI_SLASH)
304 * until all the calls to build_path_from_dentry are modified,
305 * as they use backslash as separator.
306 */
307 default:
308 charlen = cp->char2uni(source+i, len_remaining,
309 &temp);
310 /*
311 * if no match, use question mark, which at least in
312 * some cases serves as wild card
313 */
314 if (charlen < 1) {
315 temp = 0x003f;
316 charlen = 1;
317 }
318 len_remaining -= charlen;
319 /*
320 * character may take more than one byte in the source
321 * string, but will take exactly two bytes in the
322 * target string
323 */
324 i += charlen;
325 continue;
326 }
327 put_unaligned_le16(temp, &target[j]);
328 i++; /* move to next char in source string */
329 len_remaining--;
330 }
331
332ctoUCS_out:
333 return i;
334}
335
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index a437ec391a01..1e7636b145a8 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -41,9 +41,12 @@ static struct cifs_wksid wksidarr[NUM_WK_SIDS] = {
41; 41;
42 42
43 43
44/* security id for everyone */ 44/* security id for everyone/world system group */
45static const struct cifs_sid sid_everyone = { 45static const struct cifs_sid sid_everyone = {
46 1, 1, {0, 0, 0, 0, 0, 1}, {0} }; 46 1, 1, {0, 0, 0, 0, 0, 1}, {0} };
47/* security id for Authenticated Users system group */
48static const struct cifs_sid sid_authusers = {
49 1, 1, {0, 0, 0, 0, 0, 5}, {11} };
47/* group users */ 50/* group users */
48static const struct cifs_sid sid_user = {1, 2 , {0, 0, 0, 0, 0, 5}, {} }; 51static const struct cifs_sid sid_user = {1, 2 , {0, 0, 0, 0, 0, 5}, {} };
49 52
@@ -365,7 +368,7 @@ static void parse_dacl(struct cifs_acl *pdacl, char *end_of_acl,
365 if (num_aces > 0) { 368 if (num_aces > 0) {
366 umode_t user_mask = S_IRWXU; 369 umode_t user_mask = S_IRWXU;
367 umode_t group_mask = S_IRWXG; 370 umode_t group_mask = S_IRWXG;
368 umode_t other_mask = S_IRWXO; 371 umode_t other_mask = S_IRWXU | S_IRWXG | S_IRWXO;
369 372
370 ppace = kmalloc(num_aces * sizeof(struct cifs_ace *), 373 ppace = kmalloc(num_aces * sizeof(struct cifs_ace *),
371 GFP_KERNEL); 374 GFP_KERNEL);
@@ -390,6 +393,12 @@ static void parse_dacl(struct cifs_acl *pdacl, char *end_of_acl,
390 ppace[i]->type, 393 ppace[i]->type,
391 &fattr->cf_mode, 394 &fattr->cf_mode,
392 &other_mask); 395 &other_mask);
396 if (compare_sids(&(ppace[i]->sid), &sid_authusers))
397 access_flags_to_mode(ppace[i]->access_req,
398 ppace[i]->type,
399 &fattr->cf_mode,
400 &other_mask);
401
393 402
394/* memcpy((void *)(&(cifscred->aces[i])), 403/* memcpy((void *)(&(cifscred->aces[i])),
395 (void *)ppace[i], 404 (void *)ppace[i],
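
For reference, the initializer added above encodes the well-known SID S-1-5-11 (Authenticated Users): revision 1, one subauthority, 48-bit authority 5, subauthority 11; sid_everyone reads as S-1-1-0 the same way. A small userspace sketch that renders such a struct in the usual S-r-a-s notation (field names are illustrative):

	#include <stdint.h>
	#include <stdio.h>

	struct example_sid {
		uint8_t		revision;
		uint8_t		num_subauth;
		uint8_t		authority[6];	/* 48-bit big-endian authority */
		uint32_t	sub_auth[5];
	};

	static void example_print_sid(const struct example_sid *sid)
	{
		uint64_t auth = 0;
		int i;

		for (i = 0; i < 6; i++)
			auth = (auth << 8) | sid->authority[i];
		printf("S-%u-%llu", (unsigned)sid->revision,
		       (unsigned long long)auth);
		for (i = 0; i < sid->num_subauth; i++)
			printf("-%lu", (unsigned long)sid->sub_auth[i]);
		printf("\n");
	}

	int main(void)
	{
		/* mirrors sid_authusers above; prints S-1-5-11 */
		struct example_sid authusers = { 1, 1, {0, 0, 0, 0, 0, 5}, {11} };

		example_print_sid(&authusers);
		return 0;
	}
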
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index f856732161ab..66f3d50d0676 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -72,6 +72,7 @@ static int cifs_calculate_signature(const struct smb_hdr *cifs_pdu,
72 return 0; 72 return 0;
73} 73}
74 74
75/* must be called with server->srv_mutex held */
75int cifs_sign_smb(struct smb_hdr *cifs_pdu, struct TCP_Server_Info *server, 76int cifs_sign_smb(struct smb_hdr *cifs_pdu, struct TCP_Server_Info *server,
76 __u32 *pexpected_response_sequence_number) 77 __u32 *pexpected_response_sequence_number)
77{ 78{
@@ -84,14 +85,12 @@ int cifs_sign_smb(struct smb_hdr *cifs_pdu, struct TCP_Server_Info *server,
84 if ((cifs_pdu->Flags2 & SMBFLG2_SECURITY_SIGNATURE) == 0) 85 if ((cifs_pdu->Flags2 & SMBFLG2_SECURITY_SIGNATURE) == 0)
85 return rc; 86 return rc;
86 87
87 spin_lock(&GlobalMid_Lock);
88 cifs_pdu->Signature.Sequence.SequenceNumber = 88 cifs_pdu->Signature.Sequence.SequenceNumber =
89 cpu_to_le32(server->sequence_number); 89 cpu_to_le32(server->sequence_number);
90 cifs_pdu->Signature.Sequence.Reserved = 0; 90 cifs_pdu->Signature.Sequence.Reserved = 0;
91 91
92 *pexpected_response_sequence_number = server->sequence_number++; 92 *pexpected_response_sequence_number = server->sequence_number++;
93 server->sequence_number++; 93 server->sequence_number++;
94 spin_unlock(&GlobalMid_Lock);
95 94
96 rc = cifs_calculate_signature(cifs_pdu, server, smb_signature); 95 rc = cifs_calculate_signature(cifs_pdu, server, smb_signature);
97 if (rc) 96 if (rc)
@@ -149,6 +148,7 @@ static int cifs_calc_signature2(const struct kvec *iov, int n_vec,
149 return rc; 148 return rc;
150} 149}
151 150
151/* must be called with server->srv_mutex held */
152int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *server, 152int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *server,
153 __u32 *pexpected_response_sequence_number) 153 __u32 *pexpected_response_sequence_number)
154{ 154{
@@ -162,14 +162,12 @@ int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *server,
162 if ((cifs_pdu->Flags2 & SMBFLG2_SECURITY_SIGNATURE) == 0) 162 if ((cifs_pdu->Flags2 & SMBFLG2_SECURITY_SIGNATURE) == 0)
163 return rc; 163 return rc;
164 164
165 spin_lock(&GlobalMid_Lock);
166 cifs_pdu->Signature.Sequence.SequenceNumber = 165 cifs_pdu->Signature.Sequence.SequenceNumber =
167 cpu_to_le32(server->sequence_number); 166 cpu_to_le32(server->sequence_number);
168 cifs_pdu->Signature.Sequence.Reserved = 0; 167 cifs_pdu->Signature.Sequence.Reserved = 0;
169 168
170 *pexpected_response_sequence_number = server->sequence_number++; 169 *pexpected_response_sequence_number = server->sequence_number++;
171 server->sequence_number++; 170 server->sequence_number++;
172 spin_unlock(&GlobalMid_Lock);
173 171
174 rc = cifs_calc_signature2(iov, n_vec, server, smb_signature); 172 rc = cifs_calc_signature2(iov, n_vec, server, smb_signature);
175 if (rc) 173 if (rc)
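
The two hunks above drop the GlobalMid_Lock pair because the sequence number is now serialized by srv_mutex, which the caller must hold across both signing and the socket write so signatures and wire order stay in lockstep. A hedged sketch of that caller-side contract (the wrapper and the smb_send() shape are assumptions, not code from this patch):

	/* Sketch only: sign and send under one srv_mutex hold. */
	static int example_send_signed(struct TCP_Server_Info *server,
				       struct smb_hdr *buf, __u32 *expected_seq)
	{
		int rc;

		mutex_lock(&server->srv_mutex);
		/* bumps server->sequence_number; safe under srv_mutex */
		rc = cifs_sign_smb(buf, server, expected_seq);
		if (!rc)
			rc = smb_send(server, buf, buf->smb_buf_length);
		mutex_unlock(&server->srv_mutex);
		return rc;
	}
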
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 8e21e0fe65d5..a8323f1dc1c4 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -77,7 +77,11 @@ unsigned int cifs_max_pending = CIFS_MAX_REQ;
77module_param(cifs_max_pending, int, 0); 77module_param(cifs_max_pending, int, 0);
78MODULE_PARM_DESC(cifs_max_pending, "Simultaneous requests to server. " 78MODULE_PARM_DESC(cifs_max_pending, "Simultaneous requests to server. "
79 "Default: 50 Range: 2 to 256"); 79 "Default: 50 Range: 2 to 256");
80 80unsigned short echo_retries = 5;
81module_param(echo_retries, ushort, 0644);
82MODULE_PARM_DESC(echo_retries, "Number of echo attempts before giving up "
83 "and reconnecting to the server. Default: 5. "
84 "0 means never reconnect.");
81extern mempool_t *cifs_sm_req_poolp; 85extern mempool_t *cifs_sm_req_poolp;
82extern mempool_t *cifs_req_poolp; 86extern mempool_t *cifs_req_poolp;
83extern mempool_t *cifs_mid_poolp; 87extern mempool_t *cifs_mid_poolp;
@@ -174,6 +178,12 @@ cifs_read_super(struct super_block *sb, void *data,
174 goto out_no_root; 178 goto out_no_root;
175 } 179 }
176 180
181 /* do that *after* d_alloc_root() - we want NULL ->d_op for root here */
182 if (cifs_sb_master_tcon(cifs_sb)->nocase)
183 sb->s_d_op = &cifs_ci_dentry_ops;
184 else
185 sb->s_d_op = &cifs_dentry_ops;
186
177#ifdef CONFIG_CIFS_EXPERIMENTAL 187#ifdef CONFIG_CIFS_EXPERIMENTAL
178 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) { 188 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) {
179 cFYI(1, "export ops supported"); 189 cFYI(1, "export ops supported");
@@ -329,6 +339,8 @@ cifs_alloc_inode(struct super_block *sb)
329 cifs_inode->invalid_mapping = false; 339 cifs_inode->invalid_mapping = false;
330 cifs_inode->vfs_inode.i_blkbits = 14; /* 2**14 = CIFS_MAX_MSGSIZE */ 340 cifs_inode->vfs_inode.i_blkbits = 14; /* 2**14 = CIFS_MAX_MSGSIZE */
331 cifs_inode->server_eof = 0; 341 cifs_inode->server_eof = 0;
342 cifs_inode->uniqueid = 0;
343 cifs_inode->createtime = 0;
332 344
333 /* Can not set i_flags here - they get immediately overwritten 345 /* Can not set i_flags here - they get immediately overwritten
334 to zero by the VFS */ 346 to zero by the VFS */
@@ -361,18 +373,19 @@ cifs_evict_inode(struct inode *inode)
361static void 373static void
362cifs_show_address(struct seq_file *s, struct TCP_Server_Info *server) 374cifs_show_address(struct seq_file *s, struct TCP_Server_Info *server)
363{ 375{
376 struct sockaddr_in *sa = (struct sockaddr_in *) &server->dstaddr;
377 struct sockaddr_in6 *sa6 = (struct sockaddr_in6 *) &server->dstaddr;
378
364 seq_printf(s, ",addr="); 379 seq_printf(s, ",addr=");
365 380
366 switch (server->addr.sockAddr.sin_family) { 381 switch (server->dstaddr.ss_family) {
367 case AF_INET: 382 case AF_INET:
368 seq_printf(s, "%pI4", &server->addr.sockAddr.sin_addr.s_addr); 383 seq_printf(s, "%pI4", &sa->sin_addr.s_addr);
369 break; 384 break;
370 case AF_INET6: 385 case AF_INET6:
371 seq_printf(s, "%pI6", 386 seq_printf(s, "%pI6", &sa6->sin6_addr.s6_addr);
372 &server->addr.sockAddr6.sin6_addr.s6_addr); 387 if (sa6->sin6_scope_id)
373 if (server->addr.sockAddr6.sin6_scope_id) 388 seq_printf(s, "%%%u", sa6->sin6_scope_id);
374 seq_printf(s, "%%%u",
375 server->addr.sockAddr6.sin6_scope_id);
376 break; 389 break;
377 default: 390 default:
378 seq_printf(s, "(unknown)"); 391 seq_printf(s, "(unknown)");
@@ -720,6 +733,25 @@ const struct file_operations cifs_file_ops = {
720 .setlease = cifs_setlease, 733 .setlease = cifs_setlease,
721}; 734};
722 735
736const struct file_operations cifs_file_strict_ops = {
737 .read = do_sync_read,
738 .write = do_sync_write,
739 .aio_read = cifs_strict_readv,
740 .aio_write = cifs_file_aio_write,
741 .open = cifs_open,
742 .release = cifs_close,
743 .lock = cifs_lock,
744 .fsync = cifs_strict_fsync,
745 .flush = cifs_flush,
746 .mmap = cifs_file_strict_mmap,
747 .splice_read = generic_file_splice_read,
748 .llseek = cifs_llseek,
749#ifdef CONFIG_CIFS_POSIX
750 .unlocked_ioctl = cifs_ioctl,
751#endif /* CONFIG_CIFS_POSIX */
752 .setlease = cifs_setlease,
753};
754
723const struct file_operations cifs_file_direct_ops = { 755const struct file_operations cifs_file_direct_ops = {
724 /* no aio, no readv - 756 /* no aio, no readv -
725 BB reevaluate whether they can be done with directio, no cache */ 757 BB reevaluate whether they can be done with directio, no cache */
@@ -738,6 +770,7 @@ const struct file_operations cifs_file_direct_ops = {
738 .llseek = cifs_llseek, 770 .llseek = cifs_llseek,
739 .setlease = cifs_setlease, 771 .setlease = cifs_setlease,
740}; 772};
773
741const struct file_operations cifs_file_nobrl_ops = { 774const struct file_operations cifs_file_nobrl_ops = {
742 .read = do_sync_read, 775 .read = do_sync_read,
743 .write = do_sync_write, 776 .write = do_sync_write,
@@ -756,6 +789,24 @@ const struct file_operations cifs_file_nobrl_ops = {
756 .setlease = cifs_setlease, 789 .setlease = cifs_setlease,
757}; 790};
758 791
792const struct file_operations cifs_file_strict_nobrl_ops = {
793 .read = do_sync_read,
794 .write = do_sync_write,
795 .aio_read = cifs_strict_readv,
796 .aio_write = cifs_file_aio_write,
797 .open = cifs_open,
798 .release = cifs_close,
799 .fsync = cifs_strict_fsync,
800 .flush = cifs_flush,
801 .mmap = cifs_file_strict_mmap,
802 .splice_read = generic_file_splice_read,
803 .llseek = cifs_llseek,
804#ifdef CONFIG_CIFS_POSIX
805 .unlocked_ioctl = cifs_ioctl,
806#endif /* CONFIG_CIFS_POSIX */
807 .setlease = cifs_setlease,
808};
809
759const struct file_operations cifs_file_direct_nobrl_ops = { 810const struct file_operations cifs_file_direct_nobrl_ops = {
760 /* no mmap, no aio, no readv - 811 /* no mmap, no aio, no readv -
761 BB reevaluate whether they can be done with directio, no cache */ 812 BB reevaluate whether they can be done with directio, no cache */
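
The new strict-I/O op vectors above pair with the CIFS_MOUNT_STRICT_IO flag added in cifs_fs_sb.h. A hedged sketch of how the six vectors might be selected at open time (the helper is illustrative; the real selection sits in the inode/open paths):

	static const struct file_operations *
	example_select_file_ops(struct cifs_sb_info *cifs_sb, bool brl_ok)
	{
		unsigned int flags = cifs_sb->mnt_cifs_flags;

		if (flags & CIFS_MOUNT_DIRECT_IO)	/* no client caching */
			return brl_ok ? &cifs_file_direct_ops
				      : &cifs_file_direct_nobrl_ops;
		if (flags & CIFS_MOUNT_STRICT_IO)	/* cache only under oplock */
			return brl_ok ? &cifs_file_strict_ops
				      : &cifs_file_strict_nobrl_ops;
		return brl_ok ? &cifs_file_ops : &cifs_file_nobrl_ops;
	}
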
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 897b2b2b28b5..f23206d46531 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -61,6 +61,7 @@ extern int cifs_rename(struct inode *, struct dentry *, struct inode *,
61 struct dentry *); 61 struct dentry *);
62extern int cifs_revalidate_file(struct file *filp); 62extern int cifs_revalidate_file(struct file *filp);
63extern int cifs_revalidate_dentry(struct dentry *); 63extern int cifs_revalidate_dentry(struct dentry *);
64extern void cifs_invalidate_mapping(struct inode *inode);
64extern int cifs_getattr(struct vfsmount *, struct dentry *, struct kstat *); 65extern int cifs_getattr(struct vfsmount *, struct dentry *, struct kstat *);
65extern int cifs_setattr(struct dentry *, struct iattr *); 66extern int cifs_setattr(struct dentry *, struct iattr *);
66 67
@@ -72,19 +73,25 @@ extern const struct inode_operations cifs_dfs_referral_inode_operations;
72/* Functions related to files and directories */ 73/* Functions related to files and directories */
73extern const struct file_operations cifs_file_ops; 74extern const struct file_operations cifs_file_ops;
74extern const struct file_operations cifs_file_direct_ops; /* if directio mnt */ 75extern const struct file_operations cifs_file_direct_ops; /* if directio mnt */
75extern const struct file_operations cifs_file_nobrl_ops; 76extern const struct file_operations cifs_file_strict_ops; /* if strictio mnt */
76extern const struct file_operations cifs_file_direct_nobrl_ops; /* no brlocks */ 77extern const struct file_operations cifs_file_nobrl_ops; /* no brlocks */
78extern const struct file_operations cifs_file_direct_nobrl_ops;
79extern const struct file_operations cifs_file_strict_nobrl_ops;
77extern int cifs_open(struct inode *inode, struct file *file); 80extern int cifs_open(struct inode *inode, struct file *file);
78extern int cifs_close(struct inode *inode, struct file *file); 81extern int cifs_close(struct inode *inode, struct file *file);
79extern int cifs_closedir(struct inode *inode, struct file *file); 82extern int cifs_closedir(struct inode *inode, struct file *file);
80extern ssize_t cifs_user_read(struct file *file, char __user *read_data, 83extern ssize_t cifs_user_read(struct file *file, char __user *read_data,
81 size_t read_size, loff_t *poffset); 84 size_t read_size, loff_t *poffset);
85extern ssize_t cifs_strict_readv(struct kiocb *iocb, const struct iovec *iov,
86 unsigned long nr_segs, loff_t pos);
82extern ssize_t cifs_user_write(struct file *file, const char __user *write_data, 87extern ssize_t cifs_user_write(struct file *file, const char __user *write_data,
83 size_t write_size, loff_t *poffset); 88 size_t write_size, loff_t *poffset);
84extern int cifs_lock(struct file *, int, struct file_lock *); 89extern int cifs_lock(struct file *, int, struct file_lock *);
85extern int cifs_fsync(struct file *, int); 90extern int cifs_fsync(struct file *, int);
91extern int cifs_strict_fsync(struct file *, int);
86extern int cifs_flush(struct file *, fl_owner_t id); 92extern int cifs_flush(struct file *, fl_owner_t id);
87extern int cifs_file_mmap(struct file * , struct vm_area_struct *); 93extern int cifs_file_mmap(struct file * , struct vm_area_struct *);
94extern int cifs_file_strict_mmap(struct file * , struct vm_area_struct *);
88extern const struct file_operations cifs_dir_ops; 95extern const struct file_operations cifs_dir_ops;
89extern int cifs_dir_open(struct inode *inode, struct file *file); 96extern int cifs_dir_open(struct inode *inode, struct file *file);
90extern int cifs_readdir(struct file *file, void *direntry, filldir_t filldir); 97extern int cifs_readdir(struct file *file, void *direntry, filldir_t filldir);
@@ -93,6 +100,12 @@ extern int cifs_readdir(struct file *file, void *direntry, filldir_t filldir);
93extern const struct dentry_operations cifs_dentry_ops; 100extern const struct dentry_operations cifs_dentry_ops;
94extern const struct dentry_operations cifs_ci_dentry_ops; 101extern const struct dentry_operations cifs_ci_dentry_ops;
95 102
103#ifdef CONFIG_CIFS_DFS_UPCALL
104extern struct vfsmount *cifs_dfs_d_automount(struct path *path);
105#else
106#define cifs_dfs_d_automount NULL
107#endif
108
96/* Functions related to symlinks */ 109/* Functions related to symlinks */
97extern void *cifs_follow_link(struct dentry *direntry, struct nameidata *nd); 110extern void *cifs_follow_link(struct dentry *direntry, struct nameidata *nd);
98extern void cifs_put_link(struct dentry *direntry, 111extern void cifs_put_link(struct dentry *direntry,
@@ -112,5 +125,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
112extern const struct export_operations cifs_export_ops; 125extern const struct export_operations cifs_export_ops;
113#endif /* EXPERIMENTAL */ 126#endif /* EXPERIMENTAL */
114 127
115#define CIFS_VERSION "1.68" 128#define CIFS_VERSION "1.69"
116#endif /* _CIFSFS_H */ 129#endif /* _CIFSFS_H */
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 7136c0c3e2f9..5bfb75346cb0 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -161,35 +161,24 @@ struct TCP_Server_Info {
161 int srv_count; /* reference counter */ 161 int srv_count; /* reference counter */
162 /* 15 character server name + 0x20 16th byte indicating type = srv */ 162 /* 15 character server name + 0x20 16th byte indicating type = srv */
163 char server_RFC1001_name[RFC1001_NAME_LEN_WITH_NULL]; 163 char server_RFC1001_name[RFC1001_NAME_LEN_WITH_NULL];
164 enum statusEnum tcpStatus; /* what we think the status is */
164 char *hostname; /* hostname portion of UNC string */ 165 char *hostname; /* hostname portion of UNC string */
165 struct socket *ssocket; 166 struct socket *ssocket;
166 union { 167 struct sockaddr_storage dstaddr;
167 struct sockaddr_in sockAddr;
168 struct sockaddr_in6 sockAddr6;
169 } addr;
170 struct sockaddr_storage srcaddr; /* locally bind to this IP */ 168 struct sockaddr_storage srcaddr; /* locally bind to this IP */
171 wait_queue_head_t response_q; 169 wait_queue_head_t response_q;
172 wait_queue_head_t request_q; /* if more than maxmpx to srvr must block*/ 170 wait_queue_head_t request_q; /* if more than maxmpx to srvr must block*/
173 struct list_head pending_mid_q; 171 struct list_head pending_mid_q;
174 void *Server_NlsInfo; /* BB - placeholder for future NLS info */
175 unsigned short server_codepage; /* codepage for the server */
176 enum protocolEnum protocolType;
177 char versionMajor;
178 char versionMinor;
179 bool svlocal:1; /* local server or remote */
180 bool noblocksnd; /* use blocking sendmsg */ 172 bool noblocksnd; /* use blocking sendmsg */
181 bool noautotune; /* do not autotune send buf sizes */ 173 bool noautotune; /* do not autotune send buf sizes */
182 bool tcp_nodelay; 174 bool tcp_nodelay;
183 atomic_t inFlight; /* number of requests on the wire to server */ 175 atomic_t inFlight; /* number of requests on the wire to server */
184#ifdef CONFIG_CIFS_STATS2
185 atomic_t inSend; /* requests trying to send */
186 atomic_t num_waiters; /* blocked waiting to get in sendrecv */
187#endif
188 enum statusEnum tcpStatus; /* what we think the status is */
189 struct mutex srv_mutex; 176 struct mutex srv_mutex;
190 struct task_struct *tsk; 177 struct task_struct *tsk;
191 char server_GUID[16]; 178 char server_GUID[16];
192 char secMode; 179 char secMode;
180 bool session_estab; /* mark when very first sess is established */
181 u16 dialect; /* dialect index that server chose */
193 enum securityEnum secType; 182 enum securityEnum secType;
194 unsigned int maxReq; /* Clients should submit no more */ 183 unsigned int maxReq; /* Clients should submit no more */
195 /* than maxReq distinct unanswered SMBs to the server when using */ 184 /* than maxReq distinct unanswered SMBs to the server when using */
@@ -202,28 +191,29 @@ struct TCP_Server_Info {
202 unsigned int max_vcs; /* maximum number of smb sessions, at least 191 unsigned int max_vcs; /* maximum number of smb sessions, at least
203 those that can be specified uniquely with 192 those that can be specified uniquely with
204 vcnumbers */ 193 vcnumbers */
205 char sessid[4]; /* unique token id for this session */
206 /* (returned on Negotiate */
207 int capabilities; /* allow selective disabling of caps by smb sess */ 194 int capabilities; /* allow selective disabling of caps by smb sess */
208 int timeAdj; /* Adjust for difference in server time zone in sec */ 195 int timeAdj; /* Adjust for difference in server time zone in sec */
209 __u16 CurrentMid; /* multiplex id - rotating counter */ 196 __u16 CurrentMid; /* multiplex id - rotating counter */
210 char cryptkey[CIFS_CRYPTO_KEY_SIZE]; /* used by ntlm, ntlmv2 etc */ 197 char cryptkey[CIFS_CRYPTO_KEY_SIZE]; /* used by ntlm, ntlmv2 etc */
211 /* 16th byte of RFC1001 workstation name is always null */ 198 /* 16th byte of RFC1001 workstation name is always null */
212 char workstation_RFC1001_name[RFC1001_NAME_LEN_WITH_NULL]; 199 char workstation_RFC1001_name[RFC1001_NAME_LEN_WITH_NULL];
213 __u32 sequence_number; /* needed for CIFS PDU signature */ 200 __u32 sequence_number; /* for signing, protected by srv_mutex */
214 struct session_key session_key; 201 struct session_key session_key;
215 unsigned long lstrp; /* when we got last response from this server */ 202 unsigned long lstrp; /* when we got last response from this server */
216 u16 dialect; /* dialect index that server chose */
217 struct cifs_secmech secmech; /* crypto sec mech functs, descriptors */ 203 struct cifs_secmech secmech; /* crypto sec mech functs, descriptors */
218 /* extended security flavors that server supports */ 204 /* extended security flavors that server supports */
205 bool sec_ntlmssp; /* supports NTLMSSP */
206 bool sec_kerberosu2u; /* supports U2U Kerberos */
219 bool sec_kerberos; /* supports plain Kerberos */ 207 bool sec_kerberos; /* supports plain Kerberos */
220 bool sec_mskerberos; /* supports legacy MS Kerberos */ 208 bool sec_mskerberos; /* supports legacy MS Kerberos */
221 bool sec_kerberosu2u; /* supports U2U Kerberos */ 209 struct delayed_work echo; /* echo ping workqueue job */
222 bool sec_ntlmssp; /* supports NTLMSSP */
223 bool session_estab; /* mark when very first sess is established */
224#ifdef CONFIG_CIFS_FSCACHE 210#ifdef CONFIG_CIFS_FSCACHE
225 struct fscache_cookie *fscache; /* client index cache cookie */ 211 struct fscache_cookie *fscache; /* client index cache cookie */
226#endif 212#endif
213#ifdef CONFIG_CIFS_STATS2
214 atomic_t inSend; /* requests trying to send */
215 atomic_t num_waiters; /* blocked waiting to get in sendrecv */
216#endif
227}; 217};
228 218
229/* 219/*
@@ -449,13 +439,14 @@ struct cifsInodeInfo {
449 /* BB add in lists for dirty pages i.e. write caching info for oplock */ 439 /* BB add in lists for dirty pages i.e. write caching info for oplock */
450 struct list_head openFileList; 440 struct list_head openFileList;
451 __u32 cifsAttrs; /* e.g. DOS archive bit, sparse, compressed, system */ 441 __u32 cifsAttrs; /* e.g. DOS archive bit, sparse, compressed, system */
452 unsigned long time; /* jiffies of last update/check of inode */ 442 bool clientCanCacheRead; /* read oplock */
453 bool clientCanCacheRead:1; /* read oplock */ 443 bool clientCanCacheAll; /* read and writebehind oplock */
454 bool clientCanCacheAll:1; /* read and writebehind oplock */ 444 bool delete_pending; /* DELETE_ON_CLOSE is set */
455 bool delete_pending:1; /* DELETE_ON_CLOSE is set */ 445 bool invalid_mapping; /* pagecache is invalid */
456 bool invalid_mapping:1; /* pagecache is invalid */ 446 unsigned long time; /* jiffies of last update of inode */
457 u64 server_eof; /* current file size on server */ 447 u64 server_eof; /* current file size on server */
458 u64 uniqueid; /* server inode number */ 448 u64 uniqueid; /* server inode number */
449 u64 createtime; /* creation time on server */
459#ifdef CONFIG_CIFS_FSCACHE 450#ifdef CONFIG_CIFS_FSCACHE
460 struct fscache_cookie *fscache; 451 struct fscache_cookie *fscache;
461#endif 452#endif
@@ -510,6 +501,18 @@ static inline void cifs_stats_bytes_read(struct cifsTconInfo *tcon,
510 501
511#endif 502#endif
512 503
504struct mid_q_entry;
505
506/*
507 * This is the prototype for the mid callback function. When creating one,
508 * take special care to avoid deadlocks. Things to bear in mind:
509 *
510 * - it will be called by cifsd
511 * - the GlobalMid_Lock will be held
512 * - the mid will be removed from the pending_mid_q list
513 */
514typedef void (mid_callback_t)(struct mid_q_entry *mid);
515
513/* one of these for every pending CIFS request to the server */ 516/* one of these for every pending CIFS request to the server */
514struct mid_q_entry { 517struct mid_q_entry {
515 struct list_head qhead; /* mids waiting on reply from this server */ 518 struct list_head qhead; /* mids waiting on reply from this server */
@@ -521,7 +524,8 @@ struct mid_q_entry {
521 unsigned long when_sent; /* time when smb send finished */ 524 unsigned long when_sent; /* time when smb send finished */
522 unsigned long when_received; /* when demux complete (taken off wire) */ 525 unsigned long when_received; /* when demux complete (taken off wire) */
523#endif 526#endif
524 struct task_struct *tsk; /* task waiting for response */ 527 mid_callback_t *callback; /* call completion callback */
528 void *callback_data; /* general purpose pointer for callback */
525 struct smb_hdr *resp_buf; /* response buffer */ 529 struct smb_hdr *resp_buf; /* response buffer */
526 int midState; /* wish this were enum but can not pass to wait_event */ 530 int midState; /* wish this were enum but can not pass to wait_event */
527 __u8 command; /* smb command code */ 531 __u8 command; /* smb command code */
@@ -576,6 +580,7 @@ struct cifs_fattr {
576 u64 cf_uniqueid; 580 u64 cf_uniqueid;
577 u64 cf_eof; 581 u64 cf_eof;
578 u64 cf_bytes; 582 u64 cf_bytes;
583 u64 cf_createtime;
579 uid_t cf_uid; 584 uid_t cf_uid;
580 gid_t cf_gid; 585 gid_t cf_gid;
581 umode_t cf_mode; 586 umode_t cf_mode;
@@ -623,12 +628,9 @@ static inline void free_dfs_info_array(struct dfs_info3_param *param,
623#define CIFS_IOVEC 4 /* array of response buffers */ 628#define CIFS_IOVEC 4 /* array of response buffers */
624 629
625/* Type of Request to SendReceive2 */ 630/* Type of Request to SendReceive2 */
626#define CIFS_STD_OP 0 /* normal request timeout */ 631#define CIFS_BLOCKING_OP 1 /* operation can block */
627#define CIFS_LONG_OP 1 /* long op (up to 45 sec, oplock time) */ 632#define CIFS_ASYNC_OP 2 /* do not wait for response */
628#define CIFS_VLONG_OP 2 /* sloow op - can take up to 180 seconds */ 633#define CIFS_TIMEOUT_MASK 0x003 /* only one of above set in req */
629#define CIFS_BLOCKING_OP 4 /* operation can block */
630#define CIFS_ASYNC_OP 8 /* do not wait for response */
631#define CIFS_TIMEOUT_MASK 0x00F /* only one of 5 above set in req */
632#define CIFS_LOG_ERROR 0x010 /* log NT STATUS if non-zero */ 634#define CIFS_LOG_ERROR 0x010 /* log NT STATUS if non-zero */
633#define CIFS_LARGE_BUF_OP 0x020 /* large request buffer */ 635#define CIFS_LARGE_BUF_OP 0x020 /* large request buffer */
634#define CIFS_NO_RESP 0x040 /* no response buffer required */ 636#define CIFS_NO_RESP 0x040 /* no response buffer required */
@@ -791,6 +793,9 @@ GLOBAL_EXTERN unsigned int cifs_min_rcv; /* min size of big ntwrk buf pool */
791GLOBAL_EXTERN unsigned int cifs_min_small; /* min size of small buf pool */ 793GLOBAL_EXTERN unsigned int cifs_min_small; /* min size of small buf pool */
792GLOBAL_EXTERN unsigned int cifs_max_pending; /* MAX requests at once to server*/ 794GLOBAL_EXTERN unsigned int cifs_max_pending; /* MAX requests at once to server*/
793 795
796/* reconnect after this many failed echo attempts */
797GLOBAL_EXTERN unsigned short echo_retries;
798
794void cifs_oplock_break(struct work_struct *work); 799void cifs_oplock_break(struct work_struct *work);
795void cifs_oplock_break_get(struct cifsFileInfo *cfile); 800void cifs_oplock_break_get(struct cifsFileInfo *cfile);
796void cifs_oplock_break_put(struct cifsFileInfo *cfile); 801void cifs_oplock_break_put(struct cifsFileInfo *cfile);
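
The mid_callback_t machinery added above is what the echo code later in this patch builds on. A hedged sketch of the submit/complete split (names are illustrative; compare cifs_echo_callback() in cifssmb.c below):

	/* Runs in cifsd context under GlobalMid_Lock, per the comment above:
	 * no sleeping, no retaking GlobalMid_Lock, mid already off the queue. */
	static void example_mid_callback(struct mid_q_entry *mid)
	{
		struct TCP_Server_Info *server = mid->callback_data;

		DeleteMidQEntry(mid);		/* response consumed; free the mid */
		atomic_dec(&server->inFlight);	/* give the request slot back */
		wake_up(&server->request_q);	/* unblock a waiting sender */
	}

	static int example_submit(struct TCP_Server_Info *server,
				  struct smb_hdr *buf)
	{
		/* cifsd invokes example_mid_callback when the reply arrives */
		return cifs_call_async(server, buf, example_mid_callback, server);
	}
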
diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h
index de36b09763a8..b5c8cc5d7a7f 100644
--- a/fs/cifs/cifspdu.h
+++ b/fs/cifs/cifspdu.h
@@ -23,6 +23,7 @@
23#define _CIFSPDU_H 23#define _CIFSPDU_H
24 24
25#include <net/sock.h> 25#include <net/sock.h>
26#include <asm/unaligned.h>
26#include "smbfsctl.h" 27#include "smbfsctl.h"
27 28
28#ifdef CONFIG_CIFS_WEAK_PW_HASH 29#ifdef CONFIG_CIFS_WEAK_PW_HASH
@@ -50,6 +51,7 @@
50#define SMB_COM_SETATTR 0x09 /* trivial response */ 51#define SMB_COM_SETATTR 0x09 /* trivial response */
51#define SMB_COM_LOCKING_ANDX 0x24 /* trivial response */ 52#define SMB_COM_LOCKING_ANDX 0x24 /* trivial response */
52#define SMB_COM_COPY 0x29 /* trivial rsp, fail filename ignrd*/ 53#define SMB_COM_COPY 0x29 /* trivial rsp, fail filename ignrd*/
54#define SMB_COM_ECHO 0x2B /* echo request */
53#define SMB_COM_OPEN_ANDX 0x2D /* Legacy open for old servers */ 55#define SMB_COM_OPEN_ANDX 0x2D /* Legacy open for old servers */
54#define SMB_COM_READ_ANDX 0x2E 56#define SMB_COM_READ_ANDX 0x2E
55#define SMB_COM_WRITE_ANDX 0x2F 57#define SMB_COM_WRITE_ANDX 0x2F
@@ -425,11 +427,49 @@ struct smb_hdr {
425 __u16 Mid; 427 __u16 Mid;
426 __u8 WordCount; 428 __u8 WordCount;
427} __attribute__((packed)); 429} __attribute__((packed));
428/* given a pointer to an smb_hdr retrieve the value of byte count */ 430
429#define BCC(smb_var) (*(__u16 *)((char *)(smb_var) + sizeof(struct smb_hdr) + (2 * (smb_var)->WordCount))) 431/* given a pointer to an smb_hdr retrieve a char pointer to the byte count */
430#define BCC_LE(smb_var) (*(__le16 *)((char *)(smb_var) + sizeof(struct smb_hdr) + (2 * (smb_var)->WordCount))) 432#define BCC(smb_var) ((unsigned char *)(smb_var) + sizeof(struct smb_hdr) + \
433 (2 * (smb_var)->WordCount))
434
431/* given a pointer to an smb_hdr retrieve the pointer to the byte area */ 435/* given a pointer to an smb_hdr retrieve the pointer to the byte area */
432#define pByteArea(smb_var) ((unsigned char *)(smb_var) + sizeof(struct smb_hdr) + (2 * (smb_var)->WordCount) + 2) 436#define pByteArea(smb_var) (BCC(smb_var) + 2)
437
438/* get the converted ByteCount for a SMB packet and return it */
439static inline __u16
440get_bcc(struct smb_hdr *hdr)
441{
442 __u16 *bc_ptr = (__u16 *)BCC(hdr);
443
444 return get_unaligned(bc_ptr);
445}
446
447/* get the unconverted ByteCount for a SMB packet and return it */
448static inline __u16
449get_bcc_le(struct smb_hdr *hdr)
450{
451 __le16 *bc_ptr = (__le16 *)BCC(hdr);
452
453 return get_unaligned_le16(bc_ptr);
454}
455
456/* set the ByteCount for a SMB packet in host-byte order */
457static inline void
458put_bcc(__u16 count, struct smb_hdr *hdr)
459{
460 __u16 *bc_ptr = (__u16 *)BCC(hdr);
461
462 put_unaligned(count, bc_ptr);
463}
464
465/* set the ByteCount for a SMB packet in little-endian */
466static inline void
467put_bcc_le(__u16 count, struct smb_hdr *hdr)
468{
469 __le16 *bc_ptr = (__le16 *)BCC(hdr);
470
471 put_unaligned_le16(count, bc_ptr);
472}
433 473
434/* 474/*
435 * Computer Name Length (since Netbios name was length 16 with last byte 0x20) 475 * Computer Name Length (since Netbios name was length 16 with last byte 0x20)
@@ -760,6 +800,20 @@ typedef struct smb_com_tconx_rsp_ext {
760 * 800 *
761 */ 801 */
762 802
803typedef struct smb_com_echo_req {
804 struct smb_hdr hdr;
805 __le16 EchoCount;
806 __le16 ByteCount;
807 char Data[1];
808} __attribute__((packed)) ECHO_REQ;
809
810typedef struct smb_com_echo_rsp {
811 struct smb_hdr hdr;
812 __le16 SequenceNumber;
813 __le16 ByteCount;
814 char Data[1];
815} __attribute__((packed)) ECHO_RSP;
816
763typedef struct smb_com_logoff_andx_req { 817typedef struct smb_com_logoff_andx_req {
764 struct smb_hdr hdr; /* wct = 2 */ 818 struct smb_hdr hdr; /* wct = 2 */
765 __u8 AndXCommand; 819 __u8 AndXCommand;
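
The BCC() rework above turns the old lvalue macros into pointer arithmetic plus get_unaligned()/put_unaligned() accessors: the byte count sits right after a variable-length word block, so nothing guarantees it is 16-bit aligned. A hedged usage sketch of the conversion step the helpers enable (the fixup helper is illustrative; historically the receive path converted the wire-order BCC to host order in place):

	/* Sketch only: read the little-endian wire value, store it back in
	 * host order so later code can use get_bcc() everywhere. */
	static void example_fixup_bcc(struct smb_hdr *hdr)
	{
		__u16 bcc = get_bcc_le(hdr);	/* unconverted, off the wire */

		put_bcc(bcc, hdr);		/* host byte order from here on */
	}
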
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index e6d1481b16c1..982895fa7615 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -61,6 +61,12 @@ extern char *cifs_compose_mount_options(const char *sb_mountdata,
61 const char *fullpath, const struct dfs_info3_param *ref, 61 const char *fullpath, const struct dfs_info3_param *ref,
62 char **devname); 62 char **devname);
63/* extern void renew_parental_timestamps(struct dentry *direntry);*/ 63/* extern void renew_parental_timestamps(struct dentry *direntry);*/
64extern struct mid_q_entry *AllocMidQEntry(const struct smb_hdr *smb_buffer,
65 struct TCP_Server_Info *server);
66extern void DeleteMidQEntry(struct mid_q_entry *midEntry);
67extern int cifs_call_async(struct TCP_Server_Info *server,
68 struct smb_hdr *in_buf, mid_callback_t *callback,
69 void *cbdata);
64extern int SendReceive(const unsigned int /* xid */ , struct cifsSesInfo *, 70extern int SendReceive(const unsigned int /* xid */ , struct cifsSesInfo *,
65 struct smb_hdr * /* input */ , 71 struct smb_hdr * /* input */ ,
66 struct smb_hdr * /* out */ , 72 struct smb_hdr * /* out */ ,
@@ -347,12 +353,13 @@ extern int CIFSSMBLock(const int xid, struct cifsTconInfo *tcon,
347 const __u16 netfid, const __u64 len, 353 const __u16 netfid, const __u64 len,
348 const __u64 offset, const __u32 numUnlock, 354 const __u64 offset, const __u32 numUnlock,
349 const __u32 numLock, const __u8 lockType, 355 const __u32 numLock, const __u8 lockType,
350 const bool waitFlag); 356 const bool waitFlag, const __u8 oplock_level);
351extern int CIFSSMBPosixLock(const int xid, struct cifsTconInfo *tcon, 357extern int CIFSSMBPosixLock(const int xid, struct cifsTconInfo *tcon,
352 const __u16 smb_file_id, const int get_flag, 358 const __u16 smb_file_id, const int get_flag,
353 const __u64 len, struct file_lock *, 359 const __u64 len, struct file_lock *,
354 const __u16 lock_type, const bool waitFlag); 360 const __u16 lock_type, const bool waitFlag);
355extern int CIFSSMBTDis(const int xid, struct cifsTconInfo *tcon); 361extern int CIFSSMBTDis(const int xid, struct cifsTconInfo *tcon);
362extern int CIFSSMBEcho(struct TCP_Server_Info *server);
356extern int CIFSSMBLogoff(const int xid, struct cifsSesInfo *ses); 363extern int CIFSSMBLogoff(const int xid, struct cifsSesInfo *ses);
357 364
358extern struct cifsSesInfo *sesInfoAlloc(void); 365extern struct cifsSesInfo *sesInfoAlloc(void);
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 67acfb3acad2..3106f5e5c633 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -331,37 +331,35 @@ smb_init_no_reconnect(int smb_command, int wct, struct cifsTconInfo *tcon,
331 331
332static int validate_t2(struct smb_t2_rsp *pSMB) 332static int validate_t2(struct smb_t2_rsp *pSMB)
333{ 333{
334 int rc = -EINVAL; 334 unsigned int total_size;
335 int total_size; 335
336 char *pBCC; 336 /* check for plausible wct */
337 if (pSMB->hdr.WordCount < 10)
338 goto vt2_err;
337 339
338 /* check for plausible wct, bcc and t2 data and parm sizes */
339 /* check for parm and data offset going beyond end of smb */ 340 /* check for parm and data offset going beyond end of smb */
340 if (pSMB->hdr.WordCount >= 10) { 341 if (get_unaligned_le16(&pSMB->t2_rsp.ParameterOffset) > 1024 ||
341 if ((le16_to_cpu(pSMB->t2_rsp.ParameterOffset) <= 1024) && 342 get_unaligned_le16(&pSMB->t2_rsp.DataOffset) > 1024)
342 (le16_to_cpu(pSMB->t2_rsp.DataOffset) <= 1024)) { 343 goto vt2_err;
343 /* check that bcc is at least as big as parms + data */ 344
344 /* check that bcc is less than negotiated smb buffer */ 345 /* check that bcc is at least as big as parms + data */
345 total_size = le16_to_cpu(pSMB->t2_rsp.ParameterCount); 346 /* check that bcc is less than negotiated smb buffer */
346 if (total_size < 512) { 347 total_size = get_unaligned_le16(&pSMB->t2_rsp.ParameterCount);
347 total_size += 348 if (total_size >= 512)
348 le16_to_cpu(pSMB->t2_rsp.DataCount); 349 goto vt2_err;
349 /* BCC le converted in SendReceive */ 350
350 pBCC = (pSMB->hdr.WordCount * 2) + 351 total_size += get_unaligned_le16(&pSMB->t2_rsp.DataCount);
351 sizeof(struct smb_hdr) + 352 if (total_size > get_bcc(&pSMB->hdr) ||
352 (char *)pSMB; 353 total_size >= CIFSMaxBufSize + MAX_CIFS_HDR_SIZE)
353 if ((total_size <= (*(u16 *)pBCC)) && 354 goto vt2_err;
354 (total_size < 355
355 CIFSMaxBufSize+MAX_CIFS_HDR_SIZE)) { 356 return 0;
356 return 0; 357vt2_err:
357 }
358 }
359 }
360 }
361 cifs_dump_mem("Invalid transact2 SMB: ", (char *)pSMB, 358 cifs_dump_mem("Invalid transact2 SMB: ", (char *)pSMB,
362 sizeof(struct smb_t2_rsp) + 16); 359 sizeof(struct smb_t2_rsp) + 16);
363 return rc; 360 return -EINVAL;
364} 361}
362
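
The validate_t2() rewrite above flattens three levels of nesting into early-exit gotos and reads every 16-bit field through get_unaligned_le16(). A userspace model of the resulting size logic, with illustrative parameters:

	#include <stdint.h>

	static int example_t2_sizes_ok(uint16_t parm_off, uint16_t data_off,
				       uint16_t parm_count, uint16_t data_count,
				       uint16_t bcc, unsigned int max_buf)
	{
		unsigned int total;

		if (parm_off > 1024 || data_off > 1024)
			return 0;	/* offsets beyond any sane SMB */
		total = parm_count;
		if (total >= 512)
			return 0;	/* implausibly large parm block */
		total += data_count;
		if (total > bcc || total >= max_buf)
			return 0;	/* overruns byte area or buffer */
		return 1;
	}
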
365int 363int
366CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses) 364CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
367{ 365{
@@ -401,15 +399,12 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
401 else if ((secFlags & CIFSSEC_AUTH_MASK) == CIFSSEC_MAY_KRB5) { 399 else if ((secFlags & CIFSSEC_AUTH_MASK) == CIFSSEC_MAY_KRB5) {
402 cFYI(1, "Kerberos only mechanism, enable extended security"); 400 cFYI(1, "Kerberos only mechanism, enable extended security");
403 pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC; 401 pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC;
404 } 402 } else if ((secFlags & CIFSSEC_MUST_NTLMSSP) == CIFSSEC_MUST_NTLMSSP)
405#ifdef CONFIG_CIFS_EXPERIMENTAL
406 else if ((secFlags & CIFSSEC_MUST_NTLMSSP) == CIFSSEC_MUST_NTLMSSP)
407 pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC; 403 pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC;
408 else if ((secFlags & CIFSSEC_AUTH_MASK) == CIFSSEC_MAY_NTLMSSP) { 404 else if ((secFlags & CIFSSEC_AUTH_MASK) == CIFSSEC_MAY_NTLMSSP) {
409 cFYI(1, "NTLMSSP only mechanism, enable extended security"); 405 cFYI(1, "NTLMSSP only mechanism, enable extended security");
410 pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC; 406 pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC;
411 } 407 }
412#endif
413 408
414 count = 0; 409 count = 0;
415 for (i = 0; i < CIFS_NUM_PROT; i++) { 410 for (i = 0; i < CIFS_NUM_PROT; i++) {
@@ -455,7 +450,6 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
455 server->maxBuf = min((__u32)le16_to_cpu(rsp->MaxBufSize), 450 server->maxBuf = min((__u32)le16_to_cpu(rsp->MaxBufSize),
456 (__u32)CIFSMaxBufSize + MAX_CIFS_HDR_SIZE); 451 (__u32)CIFSMaxBufSize + MAX_CIFS_HDR_SIZE);
457 server->max_vcs = le16_to_cpu(rsp->MaxNumberVcs); 452 server->max_vcs = le16_to_cpu(rsp->MaxNumberVcs);
458 GETU32(server->sessid) = le32_to_cpu(rsp->SessionKey);
459 /* even though we do not use raw we might as well set this 453 /* even though we do not use raw we might as well set this
460 accurately, in case we ever find a need for it */ 454 accurately, in case we ever find a need for it */
461 if ((le16_to_cpu(rsp->RawMode) & RAW_ENABLE) == RAW_ENABLE) { 455 if ((le16_to_cpu(rsp->RawMode) & RAW_ENABLE) == RAW_ENABLE) {
@@ -569,7 +563,6 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
569 (__u32) CIFSMaxBufSize + MAX_CIFS_HDR_SIZE); 563 (__u32) CIFSMaxBufSize + MAX_CIFS_HDR_SIZE);
570 server->max_rw = le32_to_cpu(pSMBr->MaxRawSize); 564 server->max_rw = le32_to_cpu(pSMBr->MaxRawSize);
571 cFYI(DBG2, "Max buf = %d", ses->server->maxBuf); 565 cFYI(DBG2, "Max buf = %d", ses->server->maxBuf);
572 GETU32(ses->server->sessid) = le32_to_cpu(pSMBr->SessionKey);
573 server->capabilities = le32_to_cpu(pSMBr->Capabilities); 566 server->capabilities = le32_to_cpu(pSMBr->Capabilities);
574 server->timeAdj = (int)(__s16)le16_to_cpu(pSMBr->ServerTimeZone); 567 server->timeAdj = (int)(__s16)le16_to_cpu(pSMBr->ServerTimeZone);
575 server->timeAdj *= 60; 568 server->timeAdj *= 60;
@@ -709,6 +702,53 @@ CIFSSMBTDis(const int xid, struct cifsTconInfo *tcon)
709 return rc; 702 return rc;
710} 703}
711 704
705/*
706 * This is a no-op for now. We're not really interested in the reply, but
707 * rather in the fact that the server sent one and that server->lstrp
708 * gets updated.
709 *
710 * FIXME: consider checking that the reply matches the request?
711 */
712static void
713cifs_echo_callback(struct mid_q_entry *mid)
714{
715 struct TCP_Server_Info *server = mid->callback_data;
716
717 DeleteMidQEntry(mid);
718 atomic_dec(&server->inFlight);
719 wake_up(&server->request_q);
720}
721
722int
723CIFSSMBEcho(struct TCP_Server_Info *server)
724{
725 ECHO_REQ *smb;
726 int rc = 0;
727
728 cFYI(1, "In echo request");
729
730 rc = small_smb_init(SMB_COM_ECHO, 0, NULL, (void **)&smb);
731 if (rc)
732 return rc;
733
734 /* set up echo request */
735 smb->hdr.Tid = cpu_to_le16(0xffff);
736 smb->hdr.WordCount = 1;
737 put_unaligned_le16(1, &smb->EchoCount);
738 put_bcc_le(1, &smb->hdr);
739 smb->Data[0] = 'a';
740 smb->hdr.smb_buf_length += 3;
741
742 rc = cifs_call_async(server, (struct smb_hdr *)smb,
743 cifs_echo_callback, server);
744 if (rc)
745 cFYI(1, "Echo request failed: %d", rc);
746
747 cifs_small_buf_release(smb);
748
749 return rc;
750}
751
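
The echo is fire-and-forget: cifs_call_async() queues the MID and returns, and cifs_echo_callback() later frees the entry and releases the in-flight slot so blocked senders can proceed. A minimal user-space sketch of that slot-accounting pattern (the mutex, counter and condition variable below are stand-ins for the kernel's atomic inFlight counter and request_q wait queue, not CIFS code):

#include <pthread.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t request_q = PTHREAD_COND_INITIALIZER;
static int in_flight;

/* what cifs_echo_callback() does: release the slot, wake waiters */
static void request_done(void)
{
    pthread_mutex_lock(&lock);
    in_flight--;
    pthread_cond_broadcast(&request_q);
    pthread_mutex_unlock(&lock);
}

/* senders block until a slot frees up, roughly what the transport's
 * wait-for-free-request path does before issuing a new SMB */
static void take_slot(int max_in_flight)
{
    pthread_mutex_lock(&lock);
    while (in_flight >= max_in_flight)
        pthread_cond_wait(&request_q, &lock);
    in_flight++;
    pthread_mutex_unlock(&lock);
}
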
712int 752int
713CIFSSMBLogoff(const int xid, struct cifsSesInfo *ses) 753CIFSSMBLogoff(const int xid, struct cifsSesInfo *ses)
714{ 754{
@@ -1196,7 +1236,7 @@ OldOpenRetry:
1196 pSMB->ByteCount = cpu_to_le16(count); 1236 pSMB->ByteCount = cpu_to_le16(count);
1197 /* long_op set to 1 to allow for oplock break timeouts */ 1237 /* long_op set to 1 to allow for oplock break timeouts */
1198 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 1238 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
1199 (struct smb_hdr *)pSMBr, &bytes_returned, CIFS_LONG_OP); 1239 (struct smb_hdr *)pSMBr, &bytes_returned, 0);
1200 cifs_stats_inc(&tcon->num_opens); 1240 cifs_stats_inc(&tcon->num_opens);
1201 if (rc) { 1241 if (rc) {
1202 cFYI(1, "Error in Open = %d", rc); 1242 cFYI(1, "Error in Open = %d", rc);
@@ -1309,7 +1349,7 @@ openRetry:
1309 pSMB->ByteCount = cpu_to_le16(count); 1349 pSMB->ByteCount = cpu_to_le16(count);
1310 /* long_op set to 1 to allow for oplock break timeouts */ 1350 /* long_op set to 1 to allow for oplock break timeouts */
1311 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 1351 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
1312 (struct smb_hdr *)pSMBr, &bytes_returned, CIFS_LONG_OP); 1352 (struct smb_hdr *)pSMBr, &bytes_returned, 0);
1313 cifs_stats_inc(&tcon->num_opens); 1353 cifs_stats_inc(&tcon->num_opens);
1314 if (rc) { 1354 if (rc) {
1315 cFYI(1, "Error in Open = %d", rc); 1355 cFYI(1, "Error in Open = %d", rc);
@@ -1391,7 +1431,7 @@ CIFSSMBRead(const int xid, struct cifsTconInfo *tcon, const int netfid,
1391 iov[0].iov_base = (char *)pSMB; 1431 iov[0].iov_base = (char *)pSMB;
1392 iov[0].iov_len = pSMB->hdr.smb_buf_length + 4; 1432 iov[0].iov_len = pSMB->hdr.smb_buf_length + 4;
1393 rc = SendReceive2(xid, tcon->ses, iov, 1 /* num iovecs */, 1433 rc = SendReceive2(xid, tcon->ses, iov, 1 /* num iovecs */,
1394 &resp_buf_type, CIFS_STD_OP | CIFS_LOG_ERROR); 1434 &resp_buf_type, CIFS_LOG_ERROR);
1395 cifs_stats_inc(&tcon->num_reads); 1435 cifs_stats_inc(&tcon->num_reads);
1396 pSMBr = (READ_RSP *)iov[0].iov_base; 1436 pSMBr = (READ_RSP *)iov[0].iov_base;
1397 if (rc) { 1437 if (rc) {
@@ -1666,7 +1706,8 @@ int
1666CIFSSMBLock(const int xid, struct cifsTconInfo *tcon, 1706CIFSSMBLock(const int xid, struct cifsTconInfo *tcon,
1667 const __u16 smb_file_id, const __u64 len, 1707 const __u16 smb_file_id, const __u64 len,
1668 const __u64 offset, const __u32 numUnlock, 1708 const __u64 offset, const __u32 numUnlock,
1669 const __u32 numLock, const __u8 lockType, const bool waitFlag) 1709 const __u32 numLock, const __u8 lockType,
1710 const bool waitFlag, const __u8 oplock_level)
1670{ 1711{
1671 int rc = 0; 1712 int rc = 0;
1672 LOCK_REQ *pSMB = NULL; 1713 LOCK_REQ *pSMB = NULL;
@@ -1694,6 +1735,7 @@ CIFSSMBLock(const int xid, struct cifsTconInfo *tcon,
1694 pSMB->NumberOfLocks = cpu_to_le16(numLock); 1735 pSMB->NumberOfLocks = cpu_to_le16(numLock);
1695 pSMB->NumberOfUnlocks = cpu_to_le16(numUnlock); 1736 pSMB->NumberOfUnlocks = cpu_to_le16(numUnlock);
1696 pSMB->LockType = lockType; 1737 pSMB->LockType = lockType;
1738 pSMB->OplockLevel = oplock_level;
1697 pSMB->AndXCommand = 0xFF; /* none */ 1739 pSMB->AndXCommand = 0xFF; /* none */
1698 pSMB->Fid = smb_file_id; /* netfid stays le */ 1740 pSMB->Fid = smb_file_id; /* netfid stays le */
1699 1741
@@ -3090,7 +3132,7 @@ CIFSSMBGetCIFSACL(const int xid, struct cifsTconInfo *tcon, __u16 fid,
3090 iov[0].iov_len = pSMB->hdr.smb_buf_length + 4; 3132 iov[0].iov_len = pSMB->hdr.smb_buf_length + 4;
3091 3133
3092 rc = SendReceive2(xid, tcon->ses, iov, 1 /* num iovec */, &buf_type, 3134 rc = SendReceive2(xid, tcon->ses, iov, 1 /* num iovec */, &buf_type,
3093 CIFS_STD_OP); 3135 0);
3094 cifs_stats_inc(&tcon->num_acl_get); 3136 cifs_stats_inc(&tcon->num_acl_get);
3095 if (rc) { 3137 if (rc) {
3096 cFYI(1, "Send error in QuerySecDesc = %d", rc); 3138 cFYI(1, "Send error in QuerySecDesc = %d", rc);
@@ -5565,7 +5607,7 @@ QAllEAsRetry:
5565 } 5607 }
5566 5608
5567 /* make sure list_len doesn't go past end of SMB */ 5609 /* make sure list_len doesn't go past end of SMB */
5568 end_of_smb = (char *)pByteArea(&pSMBr->hdr) + BCC(&pSMBr->hdr); 5610 end_of_smb = (char *)pByteArea(&pSMBr->hdr) + get_bcc(&pSMBr->hdr);
5569 if ((char *)ea_response_data + list_len > end_of_smb) { 5611 if ((char *)ea_response_data + list_len > end_of_smb) {
5570 cFYI(1, "EA list appears to go beyond SMB"); 5612 cFYI(1, "EA list appears to go beyond SMB");
5571 rc = -EIO; 5613 rc = -EIO;
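
The last hunk above is a defensive bounds check: list_len comes off the wire, so it must be validated against the end of the received SMB before being walked, with get_bcc() supplying the byte count. A generic user-space sketch of the same idiom (check_field() and its names are illustrative, not a CIFS helper):

#include <stddef.h>

/* reject any wire-supplied length that would run past the end of
 * the received buffer (cf. the -EIO return above) */
static int check_field(const char *buf, size_t buf_len,
                       const char *field, size_t field_len)
{
    const char *end_of_buf = buf + buf_len;

    if (field < buf || field > end_of_buf)
        return -1;              /* field pointer out of range */
    if (field_len > (size_t)(end_of_buf - field))
        return -1;              /* length overruns the buffer */
    return 0;
}
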
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index cc1a8604a790..18d3c7724d6e 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -52,6 +52,9 @@
52#define CIFS_PORT 445 52#define CIFS_PORT 445
53#define RFC1001_PORT 139 53#define RFC1001_PORT 139
54 54
55/* SMB echo "timeout" -- FIXME: tunable? */
56#define SMB_ECHO_INTERVAL (60 * HZ)
57
55extern void SMBNTencrypt(unsigned char *passwd, unsigned char *c8, 58extern void SMBNTencrypt(unsigned char *passwd, unsigned char *c8,
56 unsigned char *p24); 59 unsigned char *p24);
57 60
@@ -64,8 +67,8 @@ struct smb_vol {
64 char *UNC; 67 char *UNC;
65 char *UNCip; 68 char *UNCip;
66 char *iocharset; /* local code page for mapping to and from Unicode */ 69 char *iocharset; /* local code page for mapping to and from Unicode */
67 char source_rfc1001_name[16]; /* netbios name of client */ 70 char source_rfc1001_name[RFC1001_NAME_LEN_WITH_NULL]; /* clnt nb name */
68 char target_rfc1001_name[16]; /* netbios name of server for Win9x/ME */ 71 char target_rfc1001_name[RFC1001_NAME_LEN_WITH_NULL]; /* srvr nb name */
69 uid_t cred_uid; 72 uid_t cred_uid;
70 uid_t linux_uid; 73 uid_t linux_uid;
71 gid_t linux_gid; 74 gid_t linux_gid;
@@ -115,8 +118,8 @@ struct smb_vol {
115#define TLINK_ERROR_EXPIRE (1 * HZ) 118#define TLINK_ERROR_EXPIRE (1 * HZ)
116#define TLINK_IDLE_EXPIRE (600 * HZ) 119#define TLINK_IDLE_EXPIRE (600 * HZ)
117 120
118static int ipv4_connect(struct TCP_Server_Info *server); 121static int ip_connect(struct TCP_Server_Info *server);
119static int ipv6_connect(struct TCP_Server_Info *server); 122static int generic_ip_connect(struct TCP_Server_Info *server);
120static void tlink_rb_insert(struct rb_root *root, struct tcon_link *new_tlink); 123static void tlink_rb_insert(struct rb_root *root, struct tcon_link *new_tlink);
121static void cifs_prune_tlinks(struct work_struct *work); 124static void cifs_prune_tlinks(struct work_struct *work);
122 125
@@ -152,6 +155,7 @@ cifs_reconnect(struct TCP_Server_Info *server)
152 155
153 /* before reconnecting the tcp session, mark the smb session (uid) 156 /* before reconnecting the tcp session, mark the smb session (uid)
154 and the tid bad so they are not used until reconnected */ 157 and the tid bad so they are not used until reconnected */
158 cFYI(1, "%s: marking sessions and tcons for reconnect", __func__);
155 spin_lock(&cifs_tcp_ses_lock); 159 spin_lock(&cifs_tcp_ses_lock);
156 list_for_each(tmp, &server->smb_ses_list) { 160 list_for_each(tmp, &server->smb_ses_list) {
157 ses = list_entry(tmp, struct cifsSesInfo, smb_ses_list); 161 ses = list_entry(tmp, struct cifsSesInfo, smb_ses_list);
@@ -163,7 +167,9 @@ cifs_reconnect(struct TCP_Server_Info *server)
163 } 167 }
164 } 168 }
165 spin_unlock(&cifs_tcp_ses_lock); 169 spin_unlock(&cifs_tcp_ses_lock);
170
166 /* do not want to be sending data on a socket we are freeing */ 171 /* do not want to be sending data on a socket we are freeing */
172 cFYI(1, "%s: tearing down socket", __func__);
167 mutex_lock(&server->srv_mutex); 173 mutex_lock(&server->srv_mutex);
168 if (server->ssocket) { 174 if (server->ssocket) {
169 cFYI(1, "State: 0x%x Flags: 0x%lx", server->ssocket->state, 175 cFYI(1, "State: 0x%x Flags: 0x%lx", server->ssocket->state,
@@ -180,30 +186,27 @@ cifs_reconnect(struct TCP_Server_Info *server)
180 kfree(server->session_key.response); 186 kfree(server->session_key.response);
181 server->session_key.response = NULL; 187 server->session_key.response = NULL;
182 server->session_key.len = 0; 188 server->session_key.len = 0;
189 server->lstrp = jiffies;
190 mutex_unlock(&server->srv_mutex);
183 191
192 /* mark submitted MIDs for retry and issue callback */
193 cFYI(1, "%s: issuing mid callbacks", __func__);
184 spin_lock(&GlobalMid_Lock); 194 spin_lock(&GlobalMid_Lock);
185 list_for_each(tmp, &server->pending_mid_q) { 195 list_for_each_safe(tmp, tmp2, &server->pending_mid_q) {
186 mid_entry = list_entry(tmp, struct 196 mid_entry = list_entry(tmp, struct mid_q_entry, qhead);
187 mid_q_entry, 197 if (mid_entry->midState == MID_REQUEST_SUBMITTED)
188 qhead);
189 if (mid_entry->midState == MID_REQUEST_SUBMITTED) {
190 /* Mark other intransit requests as needing
191 retry so we do not immediately mark the
192 session bad again (ie after we reconnect
193 below) as they timeout too */
194 mid_entry->midState = MID_RETRY_NEEDED; 198 mid_entry->midState = MID_RETRY_NEEDED;
195 } 199 list_del_init(&mid_entry->qhead);
200 mid_entry->callback(mid_entry);
196 } 201 }
197 spin_unlock(&GlobalMid_Lock); 202 spin_unlock(&GlobalMid_Lock);
198 mutex_unlock(&server->srv_mutex);
199 203
200 while ((server->tcpStatus != CifsExiting) && 204 while ((server->tcpStatus != CifsExiting) &&
201 (server->tcpStatus != CifsGood)) { 205 (server->tcpStatus != CifsGood)) {
202 try_to_freeze(); 206 try_to_freeze();
203 if (server->addr.sockAddr6.sin6_family == AF_INET6) 207
204 rc = ipv6_connect(server); 208 /* we should try only the port we connected to before */
205 else 209 rc = generic_ip_connect(server);
206 rc = ipv4_connect(server);
207 if (rc) { 210 if (rc) {
208 cFYI(1, "reconnect error %d", rc); 211 cFYI(1, "reconnect error %d", rc);
209 msleep(3000); 212 msleep(3000);
@@ -213,10 +216,9 @@ cifs_reconnect(struct TCP_Server_Info *server)
213 if (server->tcpStatus != CifsExiting) 216 if (server->tcpStatus != CifsExiting)
214 server->tcpStatus = CifsGood; 217 server->tcpStatus = CifsGood;
215 spin_unlock(&GlobalMid_Lock); 218 spin_unlock(&GlobalMid_Lock);
216 /* atomic_set(&server->inFlight,0);*/
217 wake_up(&server->response_q);
218 } 219 }
219 } 220 }
221
220 return rc; 222 return rc;
221} 223}
222 224
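
cifs_reconnect() now walks pending_mid_q with list_for_each_safe() because the loop body unlinks the current entry with list_del_init() before invoking its callback. The plain list_for_each() would read the next pointer after the entry has been unlinked (and possibly freed); the _safe variant caches the successor first. Its definition in linux/list.h is essentially:

struct list_head {
    struct list_head *next, *prev;
};

/* tmp2 holds the successor before the body runs, so the body may
 * safely delete (or free) the node that tmp points at */
#define list_for_each_safe(tmp, tmp2, head) \
    for (tmp = (head)->next, tmp2 = tmp->next; \
         tmp != (head); \
         tmp = tmp2, tmp2 = tmp->next)
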
@@ -230,9 +232,8 @@ cifs_reconnect(struct TCP_Server_Info *server)
230static int check2ndT2(struct smb_hdr *pSMB, unsigned int maxBufSize) 232static int check2ndT2(struct smb_hdr *pSMB, unsigned int maxBufSize)
231{ 233{
232 struct smb_t2_rsp *pSMBt; 234 struct smb_t2_rsp *pSMBt;
233 int total_data_size;
234 int data_in_this_rsp;
235 int remaining; 235 int remaining;
236 __u16 total_data_size, data_in_this_rsp;
236 237
237 if (pSMB->Command != SMB_COM_TRANSACTION2) 238 if (pSMB->Command != SMB_COM_TRANSACTION2)
238 return 0; 239 return 0;
@@ -246,8 +247,8 @@ static int check2ndT2(struct smb_hdr *pSMB, unsigned int maxBufSize)
246 247
247 pSMBt = (struct smb_t2_rsp *)pSMB; 248 pSMBt = (struct smb_t2_rsp *)pSMB;
248 249
249 total_data_size = le16_to_cpu(pSMBt->t2_rsp.TotalDataCount); 250 total_data_size = get_unaligned_le16(&pSMBt->t2_rsp.TotalDataCount);
250 data_in_this_rsp = le16_to_cpu(pSMBt->t2_rsp.DataCount); 251 data_in_this_rsp = get_unaligned_le16(&pSMBt->t2_rsp.DataCount);
251 252
252 remaining = total_data_size - data_in_this_rsp; 253 remaining = total_data_size - data_in_this_rsp;
253 254
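
This hunk and coalesce_t2() below switch from le16_to_cpu() on a struct member to get_unaligned_le16() on its address: the transaction2 response fields sit at arbitrary offsets inside the SMB buffer, so 16-bit loads through the struct may be misaligned and can fault on strict-alignment architectures. A portable sketch of what the unaligned accessors do, byte by byte:

#include <stdint.h>

/* assemble/store 16-bit little-endian values bytewise so that no
 * aligned 16-bit load or store is ever issued */
static uint16_t get_unaligned_le16(const void *p)
{
    const uint8_t *b = p;

    return (uint16_t)(b[0] | (b[1] << 8));
}

static void put_unaligned_le16(uint16_t v, void *p)
{
    uint8_t *b = p;

    b[0] = v & 0xff;
    b[1] = v >> 8;
}
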
@@ -273,21 +274,18 @@ static int coalesce_t2(struct smb_hdr *psecond, struct smb_hdr *pTargetSMB)
273{ 274{
274 struct smb_t2_rsp *pSMB2 = (struct smb_t2_rsp *)psecond; 275 struct smb_t2_rsp *pSMB2 = (struct smb_t2_rsp *)psecond;
275 struct smb_t2_rsp *pSMBt = (struct smb_t2_rsp *)pTargetSMB; 276 struct smb_t2_rsp *pSMBt = (struct smb_t2_rsp *)pTargetSMB;
276 int total_data_size;
277 int total_in_buf;
278 int remaining;
279 int total_in_buf2;
280 char *data_area_of_target; 277 char *data_area_of_target;
281 char *data_area_of_buf2; 278 char *data_area_of_buf2;
282 __u16 byte_count; 279 int remaining;
280 __u16 byte_count, total_data_size, total_in_buf, total_in_buf2;
283 281
284 total_data_size = le16_to_cpu(pSMBt->t2_rsp.TotalDataCount); 282 total_data_size = get_unaligned_le16(&pSMBt->t2_rsp.TotalDataCount);
285 283
286 if (total_data_size != le16_to_cpu(pSMB2->t2_rsp.TotalDataCount)) { 284 if (total_data_size !=
285 get_unaligned_le16(&pSMB2->t2_rsp.TotalDataCount))
287 cFYI(1, "total data size of primary and secondary t2 differ"); 286 cFYI(1, "total data size of primary and secondary t2 differ");
288 }
289 287
290 total_in_buf = le16_to_cpu(pSMBt->t2_rsp.DataCount); 288 total_in_buf = get_unaligned_le16(&pSMBt->t2_rsp.DataCount);
291 289
292 remaining = total_data_size - total_in_buf; 290 remaining = total_data_size - total_in_buf;
293 291
@@ -297,28 +295,28 @@ static int coalesce_t2(struct smb_hdr *psecond, struct smb_hdr *pTargetSMB)
297 if (remaining == 0) /* nothing to do, ignore */ 295 if (remaining == 0) /* nothing to do, ignore */
298 return 0; 296 return 0;
299 297
300 total_in_buf2 = le16_to_cpu(pSMB2->t2_rsp.DataCount); 298 total_in_buf2 = get_unaligned_le16(&pSMB2->t2_rsp.DataCount);
301 if (remaining < total_in_buf2) { 299 if (remaining < total_in_buf2) {
302 cFYI(1, "transact2 2nd response contains too much data"); 300 cFYI(1, "transact2 2nd response contains too much data");
303 } 301 }
304 302
305 /* find end of first SMB data area */ 303 /* find end of first SMB data area */
306 data_area_of_target = (char *)&pSMBt->hdr.Protocol + 304 data_area_of_target = (char *)&pSMBt->hdr.Protocol +
307 le16_to_cpu(pSMBt->t2_rsp.DataOffset); 305 get_unaligned_le16(&pSMBt->t2_rsp.DataOffset);
308 /* validate target area */ 306 /* validate target area */
309 307
310 data_area_of_buf2 = (char *) &pSMB2->hdr.Protocol + 308 data_area_of_buf2 = (char *)&pSMB2->hdr.Protocol +
311 le16_to_cpu(pSMB2->t2_rsp.DataOffset); 309 get_unaligned_le16(&pSMB2->t2_rsp.DataOffset);
312 310
313 data_area_of_target += total_in_buf; 311 data_area_of_target += total_in_buf;
314 312
315 /* copy second buffer into end of first buffer */ 313 /* copy second buffer into end of first buffer */
316 memcpy(data_area_of_target, data_area_of_buf2, total_in_buf2); 314 memcpy(data_area_of_target, data_area_of_buf2, total_in_buf2);
317 total_in_buf += total_in_buf2; 315 total_in_buf += total_in_buf2;
318 pSMBt->t2_rsp.DataCount = cpu_to_le16(total_in_buf); 316 put_unaligned_le16(total_in_buf, &pSMBt->t2_rsp.DataCount);
319 byte_count = le16_to_cpu(BCC_LE(pTargetSMB)); 317 byte_count = get_bcc_le(pTargetSMB);
320 byte_count += total_in_buf2; 318 byte_count += total_in_buf2;
321 BCC_LE(pTargetSMB) = cpu_to_le16(byte_count); 319 put_bcc_le(byte_count, pTargetSMB);
322 320
323 byte_count = pTargetSMB->smb_buf_length; 321 byte_count = pTargetSMB->smb_buf_length;
324 byte_count += total_in_buf2; 322 byte_count += total_in_buf2;
@@ -332,7 +330,26 @@ static int coalesce_t2(struct smb_hdr *psecond, struct smb_hdr *pTargetSMB)
332 return 0; /* we are done */ 330 return 0; /* we are done */
333 } else /* more responses to go */ 331 } else /* more responses to go */
334 return 1; 332 return 1;
333}
335 334
335static void
336cifs_echo_request(struct work_struct *work)
337{
338 int rc;
339 struct TCP_Server_Info *server = container_of(work,
340 struct TCP_Server_Info, echo.work);
341
342 /* no need to ping if we got a response recently */
343 if (time_before(jiffies, server->lstrp + SMB_ECHO_INTERVAL - HZ))
344 goto requeue_echo;
345
346 rc = CIFSSMBEcho(server);
347 if (rc)
348 cFYI(1, "Unable to send echo request to server: %s",
349 server->hostname);
350
351requeue_echo:
352 queue_delayed_work(system_nrt_wq, &server->echo, SMB_ECHO_INTERVAL);
336} 353}
337 354
338static int 355static int
@@ -346,8 +363,7 @@ cifs_demultiplex_thread(struct TCP_Server_Info *server)
346 struct msghdr smb_msg; 363 struct msghdr smb_msg;
347 struct kvec iov; 364 struct kvec iov;
348 struct socket *csocket = server->ssocket; 365 struct socket *csocket = server->ssocket;
349 struct list_head *tmp; 366 struct list_head *tmp, *tmp2;
350 struct cifsSesInfo *ses;
351 struct task_struct *task_to_wake = NULL; 367 struct task_struct *task_to_wake = NULL;
352 struct mid_q_entry *mid_entry; 368 struct mid_q_entry *mid_entry;
353 char temp; 369 char temp;
@@ -400,7 +416,20 @@ cifs_demultiplex_thread(struct TCP_Server_Info *server)
400 smb_msg.msg_control = NULL; 416 smb_msg.msg_control = NULL;
401 smb_msg.msg_controllen = 0; 417 smb_msg.msg_controllen = 0;
402 pdu_length = 4; /* enough to get RFC1001 header */ 418 pdu_length = 4; /* enough to get RFC1001 header */
419
403incomplete_rcv: 420incomplete_rcv:
421 if (echo_retries > 0 &&
422 time_after(jiffies, server->lstrp +
423 (echo_retries * SMB_ECHO_INTERVAL))) {
424 cERROR(1, "Server %s has not responded in %d seconds. "
425 "Reconnecting...", server->hostname,
426 (echo_retries * SMB_ECHO_INTERVAL / HZ));
427 cifs_reconnect(server);
428 csocket = server->ssocket;
429 wake_up(&server->response_q);
430 continue;
431 }
432
404 length = 433 length =
405 kernel_recvmsg(csocket, &smb_msg, 434 kernel_recvmsg(csocket, &smb_msg,
406 &iov, 1, pdu_length, 0 /* BB other flags? */); 435 &iov, 1, pdu_length, 0 /* BB other flags? */);
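
Together, cifs_echo_request() above and this receive-loop check form a keepalive: the ping is skipped while the server has answered within the last interval, and the session is torn down and reconnected once echo_retries intervals pass with no response. The jiffies comparisons stay correct across counter wraparound because they reduce to a signed subtraction. A self-contained model (HZ, the constants and need_echo()/server_stale() are stand-ins, not kernel code):

#include <stdbool.h>
#include <stdio.h>

#define HZ 1000                         /* stand-in tick rate */
#define SMB_ECHO_INTERVAL (60 * HZ)

/* wraparound-safe jiffies comparisons, as in linux/jiffies.h */
static bool time_before(unsigned long a, unsigned long b)
{
    return (long)(a - b) < 0;
}

static bool time_after(unsigned long a, unsigned long b)
{
    return (long)(b - a) < 0;
}

/* ping only if the server has been quiet for a full interval,
 * less one tick of scheduling slack (cf. cifs_echo_request) */
static bool need_echo(unsigned long now, unsigned long lstrp)
{
    return !time_before(now, lstrp + SMB_ECHO_INTERVAL - HZ);
}

/* declare the server dead after echo_retries silent intervals */
static bool server_stale(unsigned long now, unsigned long lstrp,
                         int echo_retries)
{
    return echo_retries > 0 &&
           time_after(now, lstrp + echo_retries * SMB_ECHO_INTERVAL);
}

int main(void)
{
    printf("%d\n", need_echo(58 * HZ, 0));        /* 0: answered recently */
    printf("%d\n", need_echo(61 * HZ, 0));        /* 1: interval elapsed */
    printf("%d\n", server_stale(301 * HZ, 0, 5)); /* 1: five intervals idle */
    return 0;
}
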
@@ -477,7 +506,7 @@ incomplete_rcv:
477 * initialize frame) 506 * initialize frame)
478 */ 507 */
479 cifs_set_port((struct sockaddr *) 508 cifs_set_port((struct sockaddr *)
480 &server->addr.sockAddr, CIFS_PORT); 509 &server->dstaddr, CIFS_PORT);
481 cifs_reconnect(server); 510 cifs_reconnect(server);
482 csocket = server->ssocket; 511 csocket = server->ssocket;
483 wake_up(&server->response_q); 512 wake_up(&server->response_q);
@@ -560,10 +589,11 @@ incomplete_rcv:
560 continue; 589 continue;
561 } 590 }
562 591
592 mid_entry = NULL;
593 server->lstrp = jiffies;
563 594
564 task_to_wake = NULL;
565 spin_lock(&GlobalMid_Lock); 595 spin_lock(&GlobalMid_Lock);
566 list_for_each(tmp, &server->pending_mid_q) { 596 list_for_each_safe(tmp, tmp2, &server->pending_mid_q) {
567 mid_entry = list_entry(tmp, struct mid_q_entry, qhead); 597 mid_entry = list_entry(tmp, struct mid_q_entry, qhead);
568 598
569 if ((mid_entry->mid == smb_buffer->Mid) && 599 if ((mid_entry->mid == smb_buffer->Mid) &&
@@ -604,20 +634,19 @@ incomplete_rcv:
604 mid_entry->resp_buf = smb_buffer; 634 mid_entry->resp_buf = smb_buffer;
605 mid_entry->largeBuf = isLargeBuf; 635 mid_entry->largeBuf = isLargeBuf;
606multi_t2_fnd: 636multi_t2_fnd:
607 task_to_wake = mid_entry->tsk;
608 mid_entry->midState = MID_RESPONSE_RECEIVED; 637 mid_entry->midState = MID_RESPONSE_RECEIVED;
638 list_del_init(&mid_entry->qhead);
639 mid_entry->callback(mid_entry);
609#ifdef CONFIG_CIFS_STATS2 640#ifdef CONFIG_CIFS_STATS2
610 mid_entry->when_received = jiffies; 641 mid_entry->when_received = jiffies;
611#endif 642#endif
612 /* so we do not time out requests to server
613 which is still responding (since server could
614 be busy but not dead) */
615 server->lstrp = jiffies;
616 break; 643 break;
617 } 644 }
645 mid_entry = NULL;
618 } 646 }
619 spin_unlock(&GlobalMid_Lock); 647 spin_unlock(&GlobalMid_Lock);
620 if (task_to_wake) { 648
649 if (mid_entry != NULL) {
621 /* Was previous buf put in mpx struct for multi-rsp? */ 650 /* Was previous buf put in mpx struct for multi-rsp? */
622 if (!isMultiRsp) { 651 if (!isMultiRsp) {
623 /* smb buffer will be freed by user thread */ 652 /* smb buffer will be freed by user thread */
@@ -626,11 +655,10 @@ multi_t2_fnd:
626 else 655 else
627 smallbuf = NULL; 656 smallbuf = NULL;
628 } 657 }
629 wake_up_process(task_to_wake);
630 } else if (!is_valid_oplock_break(smb_buffer, server) && 658 } else if (!is_valid_oplock_break(smb_buffer, server) &&
631 !isMultiRsp) { 659 !isMultiRsp) {
632 cERROR(1, "No task to wake, unknown frame received! " 660 cERROR(1, "No task to wake, unknown frame received! "
633 "NumMids %d", midCount.counter); 661 "NumMids %d", atomic_read(&midCount));
634 cifs_dump_mem("Received Data is: ", (char *)smb_buffer, 662 cifs_dump_mem("Received Data is: ", (char *)smb_buffer,
635 sizeof(struct smb_hdr)); 663 sizeof(struct smb_hdr));
636#ifdef CONFIG_CIFS_DEBUG2 664#ifdef CONFIG_CIFS_DEBUG2
@@ -678,44 +706,16 @@ multi_t2_fnd:
678 if (smallbuf) /* no sense logging a debug message if NULL */ 706 if (smallbuf) /* no sense logging a debug message if NULL */
679 cifs_small_buf_release(smallbuf); 707 cifs_small_buf_release(smallbuf);
680 708
681 /* 709 if (!list_empty(&server->pending_mid_q)) {
682 * BB: we shouldn't have to do any of this. It shouldn't be
683 * possible to exit from the thread with active SMB sessions
684 */
685 spin_lock(&cifs_tcp_ses_lock);
686 if (list_empty(&server->pending_mid_q)) {
687 /* loop through server session structures attached to this and
688 mark them dead */
689 list_for_each(tmp, &server->smb_ses_list) {
690 ses = list_entry(tmp, struct cifsSesInfo,
691 smb_ses_list);
692 ses->status = CifsExiting;
693 ses->server = NULL;
694 }
695 spin_unlock(&cifs_tcp_ses_lock);
696 } else {
697 /* although we cannot zero the server struct pointer yet,
698 since there are active requests which may depend on them,
699 mark the corresponding SMB sessions as exiting too */
700 list_for_each(tmp, &server->smb_ses_list) {
701 ses = list_entry(tmp, struct cifsSesInfo,
702 smb_ses_list);
703 ses->status = CifsExiting;
704 }
705
706 spin_lock(&GlobalMid_Lock); 710 spin_lock(&GlobalMid_Lock);
707 list_for_each(tmp, &server->pending_mid_q) { 711 list_for_each_safe(tmp, tmp2, &server->pending_mid_q) {
708 mid_entry = list_entry(tmp, struct mid_q_entry, qhead); 712 mid_entry = list_entry(tmp, struct mid_q_entry, qhead);
709 if (mid_entry->midState == MID_REQUEST_SUBMITTED) { 713 cFYI(1, "Clearing Mid 0x%x - issuing callback",
710 cFYI(1, "Clearing Mid 0x%x - waking up ",
711 mid_entry->mid); 714 mid_entry->mid);
712 task_to_wake = mid_entry->tsk; 715 list_del_init(&mid_entry->qhead);
713 if (task_to_wake) 716 mid_entry->callback(mid_entry);
714 wake_up_process(task_to_wake);
715 }
716 } 717 }
717 spin_unlock(&GlobalMid_Lock); 718 spin_unlock(&GlobalMid_Lock);
718 spin_unlock(&cifs_tcp_ses_lock);
719 /* 1/8th of sec is more than enough time for them to exit */ 719 /* 1/8th of sec is more than enough time for them to exit */
720 msleep(125); 720 msleep(125);
721 } 721 }
@@ -733,18 +733,6 @@ multi_t2_fnd:
733 coming home not much else we can do but free the memory */ 733 coming home not much else we can do but free the memory */
734 } 734 }
735 735
736 /* last chance to mark ses pointers invalid
737 if there are any pointing to this (e.g
738 if a crazy root user tried to kill cifsd
739 kernel thread explicitly this might happen) */
740 /* BB: This shouldn't be necessary, see above */
741 spin_lock(&cifs_tcp_ses_lock);
742 list_for_each(tmp, &server->smb_ses_list) {
743 ses = list_entry(tmp, struct cifsSesInfo, smb_ses_list);
744 ses->server = NULL;
745 }
746 spin_unlock(&cifs_tcp_ses_lock);
747
748 kfree(server->hostname); 736 kfree(server->hostname);
749 task_to_wake = xchg(&server->tsk, NULL); 737 task_to_wake = xchg(&server->tsk, NULL);
750 kfree(server); 738 kfree(server);
@@ -817,11 +805,11 @@ cifs_parse_mount_options(char *options, const char *devname,
817 * informational, only used for servers that do not support 805 * informational, only used for servers that do not support
818 * port 445 and it can be overridden at mount time 806 * port 445 and it can be overridden at mount time
819 */ 807 */
820 memset(vol->source_rfc1001_name, 0x20, 15); 808 memset(vol->source_rfc1001_name, 0x20, RFC1001_NAME_LEN);
821 for (i = 0; i < strnlen(nodename, 15); i++) 809 for (i = 0; i < strnlen(nodename, RFC1001_NAME_LEN); i++)
822 vol->source_rfc1001_name[i] = toupper(nodename[i]); 810 vol->source_rfc1001_name[i] = toupper(nodename[i]);
823 811
824 vol->source_rfc1001_name[15] = 0; 812 vol->source_rfc1001_name[RFC1001_NAME_LEN] = 0;
825 /* null target name indicates to use *SMBSERVR default called name 813 /* null target name indicates to use *SMBSERVR default called name
826 if we end up sending RFC1001 session initialize */ 814 if we end up sending RFC1001 session initialize */
827 vol->target_rfc1001_name[0] = 0; 815 vol->target_rfc1001_name[0] = 0;
@@ -985,13 +973,11 @@ cifs_parse_mount_options(char *options, const char *devname,
985 return 1; 973 return 1;
986 } else if (strnicmp(value, "krb5", 4) == 0) { 974 } else if (strnicmp(value, "krb5", 4) == 0) {
987 vol->secFlg |= CIFSSEC_MAY_KRB5; 975 vol->secFlg |= CIFSSEC_MAY_KRB5;
988#ifdef CONFIG_CIFS_EXPERIMENTAL
989 } else if (strnicmp(value, "ntlmsspi", 8) == 0) { 976 } else if (strnicmp(value, "ntlmsspi", 8) == 0) {
990 vol->secFlg |= CIFSSEC_MAY_NTLMSSP | 977 vol->secFlg |= CIFSSEC_MAY_NTLMSSP |
991 CIFSSEC_MUST_SIGN; 978 CIFSSEC_MUST_SIGN;
992 } else if (strnicmp(value, "ntlmssp", 7) == 0) { 979 } else if (strnicmp(value, "ntlmssp", 7) == 0) {
993 vol->secFlg |= CIFSSEC_MAY_NTLMSSP; 980 vol->secFlg |= CIFSSEC_MAY_NTLMSSP;
994#endif
995 } else if (strnicmp(value, "ntlmv2i", 7) == 0) { 981 } else if (strnicmp(value, "ntlmv2i", 7) == 0) {
996 vol->secFlg |= CIFSSEC_MAY_NTLMV2 | 982 vol->secFlg |= CIFSSEC_MAY_NTLMV2 |
997 CIFSSEC_MUST_SIGN; 983 CIFSSEC_MUST_SIGN;
@@ -1116,6 +1102,8 @@ cifs_parse_mount_options(char *options, const char *devname,
1116 } else if (!strnicmp(data, "uid", 3) && value && *value) { 1102 } else if (!strnicmp(data, "uid", 3) && value && *value) {
1117 vol->linux_uid = simple_strtoul(value, &value, 0); 1103 vol->linux_uid = simple_strtoul(value, &value, 0);
1118 uid_specified = true; 1104 uid_specified = true;
1105 } else if (!strnicmp(data, "cruid", 5) && value && *value) {
1106 vol->cred_uid = simple_strtoul(value, &value, 0);
1119 } else if (!strnicmp(data, "forceuid", 8)) { 1107 } else if (!strnicmp(data, "forceuid", 8)) {
1120 override_uid = 1; 1108 override_uid = 1;
1121 } else if (!strnicmp(data, "noforceuid", 10)) { 1109 } else if (!strnicmp(data, "noforceuid", 10)) {
@@ -1168,22 +1156,22 @@ cifs_parse_mount_options(char *options, const char *devname,
1168 if (!value || !*value || (*value == ' ')) { 1156 if (!value || !*value || (*value == ' ')) {
1169 cFYI(1, "invalid (empty) netbiosname"); 1157 cFYI(1, "invalid (empty) netbiosname");
1170 } else { 1158 } else {
1171 memset(vol->source_rfc1001_name, 0x20, 15); 1159 memset(vol->source_rfc1001_name, 0x20,
1172 for (i = 0; i < 15; i++) { 1160 RFC1001_NAME_LEN);
1173 /* BB are there cases in which a comma can be 1161 /*
1174 valid in this workstation netbios name (and need 1162 * FIXME: are there cases in which a comma can
1175 special handling)? */ 1163 * be valid in workstation netbios name (and
1176 1164 * need special handling)?
1177 /* We do not uppercase netbiosname for user */ 1165 */
1166 for (i = 0; i < RFC1001_NAME_LEN; i++) {
1167 /* don't ucase netbiosname for user */
1178 if (value[i] == 0) 1168 if (value[i] == 0)
1179 break; 1169 break;
1180 else 1170 vol->source_rfc1001_name[i] = value[i];
1181 vol->source_rfc1001_name[i] =
1182 value[i];
1183 } 1171 }
1184 /* The string's 16th byte is still zero from the 1172 /* The string's 16th byte is still zero from the
1185 initialization at the top of the function */ 1173 initialization at the top of the function */
1186 if ((i == 15) && (value[i] != 0)) 1174 if (i == RFC1001_NAME_LEN && value[i] != 0)
1187 printk(KERN_WARNING "CIFS: netbiosname" 1175 printk(KERN_WARNING "CIFS: netbiosname"
1188 " longer than 15 truncated.\n"); 1176 " longer than 15 truncated.\n");
1189 } 1177 }
@@ -1193,7 +1181,8 @@ cifs_parse_mount_options(char *options, const char *devname,
1193 cFYI(1, "empty server netbiosname specified"); 1181 cFYI(1, "empty server netbiosname specified");
1194 } else { 1182 } else {
1195 /* last byte, type, is 0x20 for server type */ 1183 /* last byte, type, is 0x20 for server type */
1196 memset(vol->target_rfc1001_name, 0x20, 16); 1184 memset(vol->target_rfc1001_name, 0x20,
1185 RFC1001_NAME_LEN_WITH_NULL);
1197 1186
1198 for (i = 0; i < 15; i++) { 1187 for (i = 0; i < 15; i++) {
1199 /* BB are there cases in which a comma can be 1188 /* BB are there cases in which a comma can be
@@ -1210,7 +1199,7 @@ cifs_parse_mount_options(char *options, const char *devname,
1210 } 1199 }
1211 /* The string's 16th byte is still zero from the 1200 /* The string's 16th byte is still zero from the
1212 initialization at the top of the function */ 1201 initialization at the top of the function */
1213 if ((i == 15) && (value[i] != 0)) 1202 if (i == RFC1001_NAME_LEN && value[i] != 0)
1214 printk(KERN_WARNING "CIFS: server net" 1203 printk(KERN_WARNING "CIFS: server net"
1215 "biosname longer than 15 truncated.\n"); 1204 "biosname longer than 15 truncated.\n");
1216 } 1205 }
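
Both netbiosname branches follow the same RFC1001 convention: space-pad the name to 15 bytes, copy at most 15 characters (uppercased only for the client's own name), and keep byte 16 as the terminator. A condensed model of that parsing (set_rfc1001_name() is illustrative, not a kernel helper):

#include <ctype.h>
#include <string.h>

#define RFC1001_NAME_LEN 15

/* NetBIOS names are space-padded to 15 bytes, uppercased only for
 * the client's own name, and always null-terminated in byte 16 */
static void set_rfc1001_name(char dst[RFC1001_NAME_LEN + 1],
                             const char *src, int ucase)
{
    size_t i;

    memset(dst, 0x20, RFC1001_NAME_LEN);
    for (i = 0; i < RFC1001_NAME_LEN && src[i] != 0; i++)
        dst[i] = ucase ? toupper((unsigned char)src[i]) : src[i];
    dst[RFC1001_NAME_LEN] = 0;
}
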
@@ -1341,10 +1330,8 @@ cifs_parse_mount_options(char *options, const char *devname,
1341 vol->no_psx_acl = 0; 1330 vol->no_psx_acl = 0;
1342 } else if (strnicmp(data, "noacl", 5) == 0) { 1331 } else if (strnicmp(data, "noacl", 5) == 0) {
1343 vol->no_psx_acl = 1; 1332 vol->no_psx_acl = 1;
1344#ifdef CONFIG_CIFS_EXPERIMENTAL
1345 } else if (strnicmp(data, "locallease", 6) == 0) { 1333 } else if (strnicmp(data, "locallease", 6) == 0) {
1346 vol->local_lease = 1; 1334 vol->local_lease = 1;
1347#endif
1348 } else if (strnicmp(data, "sign", 4) == 0) { 1335 } else if (strnicmp(data, "sign", 4) == 0) {
1349 vol->secFlg |= CIFSSEC_MUST_SIGN; 1336 vol->secFlg |= CIFSSEC_MUST_SIGN;
1350 } else if (strnicmp(data, "seal", 4) == 0) { 1337 } else if (strnicmp(data, "seal", 4) == 0) {
@@ -1454,35 +1441,71 @@ srcip_matches(struct sockaddr *srcaddr, struct sockaddr *rhs)
1454 } 1441 }
1455} 1442}
1456 1443
1444/*
1445 * If no port is specified in the addr structure, try to match port 445
1446 * first and port 139 if that fails. This should be called only if the
1447 * address families of server and addr are the same.
1448 */
1449static bool
1450match_port(struct TCP_Server_Info *server, struct sockaddr *addr)
1451{
1452 unsigned short int port, *sport;
1453
1454 switch (addr->sa_family) {
1455 case AF_INET:
1456 sport = &((struct sockaddr_in *) &server->dstaddr)->sin_port;
1457 port = ((struct sockaddr_in *) addr)->sin_port;
1458 break;
1459 case AF_INET6:
1460 sport = &((struct sockaddr_in6 *) &server->dstaddr)->sin6_port;
1461 port = ((struct sockaddr_in6 *) addr)->sin6_port;
1462 break;
1463 default:
1464 WARN_ON(1);
1465 return false;
1466 }
1467
1468 if (!port) {
1469 port = htons(CIFS_PORT);
1470 if (port == *sport)
1471 return true;
1472
1473 port = htons(RFC1001_PORT);
1474 }
1475
1476 return port == *sport;
1477}
1457 1478
1458static bool 1479static bool
1459match_address(struct TCP_Server_Info *server, struct sockaddr *addr, 1480match_address(struct TCP_Server_Info *server, struct sockaddr *addr,
1460 struct sockaddr *srcaddr) 1481 struct sockaddr *srcaddr)
1461{ 1482{
1462 struct sockaddr_in *addr4 = (struct sockaddr_in *)addr;
1463 struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)addr;
1464
1465 switch (addr->sa_family) { 1483 switch (addr->sa_family) {
1466 case AF_INET: 1484 case AF_INET: {
1467 if (addr4->sin_addr.s_addr != 1485 struct sockaddr_in *addr4 = (struct sockaddr_in *)addr;
1468 server->addr.sockAddr.sin_addr.s_addr) 1486 struct sockaddr_in *srv_addr4 =
1469 return false; 1487 (struct sockaddr_in *)&server->dstaddr;
1470 if (addr4->sin_port && 1488
1471 addr4->sin_port != server->addr.sockAddr.sin_port) 1489 if (addr4->sin_addr.s_addr != srv_addr4->sin_addr.s_addr)
1472 return false; 1490 return false;
1473 break; 1491 break;
1474 case AF_INET6: 1492 }
1493 case AF_INET6: {
1494 struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)addr;
1495 struct sockaddr_in6 *srv_addr6 =
1496 (struct sockaddr_in6 *)&server->dstaddr;
1497
1475 if (!ipv6_addr_equal(&addr6->sin6_addr, 1498 if (!ipv6_addr_equal(&addr6->sin6_addr,
1476 &server->addr.sockAddr6.sin6_addr)) 1499 &srv_addr6->sin6_addr))
1477 return false;
1478 if (addr6->sin6_scope_id !=
1479 server->addr.sockAddr6.sin6_scope_id)
1480 return false; 1500 return false;
1481 if (addr6->sin6_port && 1501 if (addr6->sin6_scope_id != srv_addr6->sin6_scope_id)
1482 addr6->sin6_port != server->addr.sockAddr6.sin6_port)
1483 return false; 1502 return false;
1484 break; 1503 break;
1485 } 1504 }
1505 default:
1506 WARN_ON(1);
1507 return false; /* don't expect to be here */
1508 }
1486 1509
1487 if (!srcip_matches(srcaddr, (struct sockaddr *)&server->srcaddr)) 1510 if (!srcip_matches(srcaddr, (struct sockaddr *)&server->srcaddr))
1488 return false; 1511 return false;
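
match_port() treats a zero (unspecified) port in the candidate address as a wildcard over the two well-known SMB ports, and compares everything in network byte order. A compact user-space model (port_matches() is illustrative):

#include <stdbool.h>
#include <stdint.h>
#include <arpa/inet.h>

#define CIFS_PORT 445
#define RFC1001_PORT 139

/* a zero (unspecified) port matches a server on either well-known
 * port; both values are kept and compared in network byte order */
static bool port_matches(uint16_t want_be, uint16_t have_be)
{
    if (want_be == 0)
        return have_be == htons(CIFS_PORT) ||
               have_be == htons(RFC1001_PORT);
    return want_be == have_be;
}
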
@@ -1549,6 +1572,9 @@ cifs_find_tcp_session(struct sockaddr *addr, struct smb_vol *vol)
1549 (struct sockaddr *)&vol->srcaddr)) 1572 (struct sockaddr *)&vol->srcaddr))
1550 continue; 1573 continue;
1551 1574
1575 if (!match_port(server, addr))
1576 continue;
1577
1552 if (!match_security(server, vol)) 1578 if (!match_security(server, vol))
1553 continue; 1579 continue;
1554 1580
@@ -1575,6 +1601,8 @@ cifs_put_tcp_session(struct TCP_Server_Info *server)
1575 list_del_init(&server->tcp_ses_list); 1601 list_del_init(&server->tcp_ses_list);
1576 spin_unlock(&cifs_tcp_ses_lock); 1602 spin_unlock(&cifs_tcp_ses_lock);
1577 1603
1604 cancel_delayed_work_sync(&server->echo);
1605
1578 spin_lock(&GlobalMid_Lock); 1606 spin_lock(&GlobalMid_Lock);
1579 server->tcpStatus = CifsExiting; 1607 server->tcpStatus = CifsExiting;
1580 spin_unlock(&GlobalMid_Lock); 1608 spin_unlock(&GlobalMid_Lock);
@@ -1664,8 +1692,10 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
1664 volume_info->target_rfc1001_name, RFC1001_NAME_LEN_WITH_NULL); 1692 volume_info->target_rfc1001_name, RFC1001_NAME_LEN_WITH_NULL);
1665 tcp_ses->session_estab = false; 1693 tcp_ses->session_estab = false;
1666 tcp_ses->sequence_number = 0; 1694 tcp_ses->sequence_number = 0;
1695 tcp_ses->lstrp = jiffies;
1667 INIT_LIST_HEAD(&tcp_ses->tcp_ses_list); 1696 INIT_LIST_HEAD(&tcp_ses->tcp_ses_list);
1668 INIT_LIST_HEAD(&tcp_ses->smb_ses_list); 1697 INIT_LIST_HEAD(&tcp_ses->smb_ses_list);
1698 INIT_DELAYED_WORK(&tcp_ses->echo, cifs_echo_request);
1669 1699
1670 /* 1700 /*
1671 * at this point we are the only ones with the pointer 1701 * at this point we are the only ones with the pointer
@@ -1681,14 +1711,13 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
1681 cFYI(1, "attempting ipv6 connect"); 1711 cFYI(1, "attempting ipv6 connect");
1682 /* BB should we allow ipv6 on port 139? */ 1712 /* BB should we allow ipv6 on port 139? */
1683 /* no other OS observed in the wild doing 139 with v6 */ 1713 /* no other OS observed in the wild doing 139 with v6 */
1684 memcpy(&tcp_ses->addr.sockAddr6, sin_server6, 1714 memcpy(&tcp_ses->dstaddr, sin_server6,
1685 sizeof(struct sockaddr_in6)); 1715 sizeof(struct sockaddr_in6));
1686 rc = ipv6_connect(tcp_ses); 1716 } else
1687 } else { 1717 memcpy(&tcp_ses->dstaddr, sin_server,
1688 memcpy(&tcp_ses->addr.sockAddr, sin_server, 1718 sizeof(struct sockaddr_in));
1689 sizeof(struct sockaddr_in)); 1719
1690 rc = ipv4_connect(tcp_ses); 1720 rc = ip_connect(tcp_ses);
1691 }
1692 if (rc < 0) { 1721 if (rc < 0) {
1693 cERROR(1, "Error connecting to socket. Aborting operation"); 1722 cERROR(1, "Error connecting to socket. Aborting operation");
1694 goto out_err_crypto_release; 1723 goto out_err_crypto_release;
@@ -1715,6 +1744,9 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
1715 1744
1716 cifs_fscache_get_client_cookie(tcp_ses); 1745 cifs_fscache_get_client_cookie(tcp_ses);
1717 1746
1747 /* queue echo request delayed work */
1748 queue_delayed_work(system_nrt_wq, &tcp_ses->echo, SMB_ECHO_INTERVAL);
1749
1718 return tcp_ses; 1750 return tcp_ses;
1719 1751
1720out_err_crypto_release: 1752out_err_crypto_release:
@@ -1793,6 +1825,8 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info)
1793{ 1825{
1794 int rc = -ENOMEM, xid; 1826 int rc = -ENOMEM, xid;
1795 struct cifsSesInfo *ses; 1827 struct cifsSesInfo *ses;
1828 struct sockaddr_in *addr = (struct sockaddr_in *)&server->dstaddr;
1829 struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)&server->dstaddr;
1796 1830
1797 xid = GetXid(); 1831 xid = GetXid();
1798 1832
@@ -1836,12 +1870,10 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info)
1836 1870
1837 /* new SMB session uses our server ref */ 1871 /* new SMB session uses our server ref */
1838 ses->server = server; 1872 ses->server = server;
1839 if (server->addr.sockAddr6.sin6_family == AF_INET6) 1873 if (server->dstaddr.ss_family == AF_INET6)
1840 sprintf(ses->serverName, "%pI6", 1874 sprintf(ses->serverName, "%pI6", &addr6->sin6_addr);
1841 &server->addr.sockAddr6.sin6_addr);
1842 else 1875 else
1843 sprintf(ses->serverName, "%pI4", 1876 sprintf(ses->serverName, "%pI4", &addr->sin_addr);
1844 &server->addr.sockAddr.sin_addr.s_addr);
1845 1877
1846 if (volume_info->username) 1878 if (volume_info->username)
1847 strncpy(ses->userName, volume_info->username, 1879 strncpy(ses->userName, volume_info->username,
@@ -2136,19 +2168,106 @@ bind_socket(struct TCP_Server_Info *server)
2136} 2168}
2137 2169
2138static int 2170static int
2139ipv4_connect(struct TCP_Server_Info *server) 2171ip_rfc1001_connect(struct TCP_Server_Info *server)
2172{
2173 int rc = 0;
2174 /*
2175	 * some servers require an RFC1001 sessinit before sending
2176	 * negprot - BB check reconnection in the case where a second
2177	 * sessinit is sent but no second negprot
2178 */
2179 struct rfc1002_session_packet *ses_init_buf;
2180 struct smb_hdr *smb_buf;
2181 ses_init_buf = kzalloc(sizeof(struct rfc1002_session_packet),
2182 GFP_KERNEL);
2183 if (ses_init_buf) {
2184 ses_init_buf->trailer.session_req.called_len = 32;
2185
2186 if (server->server_RFC1001_name &&
2187 server->server_RFC1001_name[0] != 0)
2188 rfc1002mangle(ses_init_buf->trailer.
2189 session_req.called_name,
2190 server->server_RFC1001_name,
2191 RFC1001_NAME_LEN_WITH_NULL);
2192 else
2193 rfc1002mangle(ses_init_buf->trailer.
2194 session_req.called_name,
2195 DEFAULT_CIFS_CALLED_NAME,
2196 RFC1001_NAME_LEN_WITH_NULL);
2197
2198 ses_init_buf->trailer.session_req.calling_len = 32;
2199
2200 /*
2201 * calling name ends in null (byte 16) from old smb
2202 * convention.
2203 */
2204 if (server->workstation_RFC1001_name &&
2205 server->workstation_RFC1001_name[0] != 0)
2206 rfc1002mangle(ses_init_buf->trailer.
2207 session_req.calling_name,
2208 server->workstation_RFC1001_name,
2209 RFC1001_NAME_LEN_WITH_NULL);
2210 else
2211 rfc1002mangle(ses_init_buf->trailer.
2212 session_req.calling_name,
2213 "LINUX_CIFS_CLNT",
2214 RFC1001_NAME_LEN_WITH_NULL);
2215
2216 ses_init_buf->trailer.session_req.scope1 = 0;
2217 ses_init_buf->trailer.session_req.scope2 = 0;
2218 smb_buf = (struct smb_hdr *)ses_init_buf;
2219
2220 /* sizeof RFC1002_SESSION_REQUEST with no scope */
2221 smb_buf->smb_buf_length = 0x81000044;
2222 rc = smb_send(server, smb_buf, 0x44);
2223 kfree(ses_init_buf);
2224 /*
2225		 * The RFC1001 layer in at least one server
2226		 * requires a very short break before negprot,
2227		 * presumably because it is not expecting negprot
2228		 * to follow so quickly. This is a simple
2229		 * solution that works without
2230		 * complicating the code and causes no
2231		 * significant slowdown on mount
2232		 * for everyone else.
2233 */
2234 usleep_range(1000, 2000);
2235 }
2236 /*
2237	 * if the allocation failed, the negprot may
2238	 * still work without the RFC1001 session request
2239 */
2240
2241 return rc;
2242}
2243
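
The 0x81000044 written to smb_buf_length in ip_rfc1001_connect() is the 4-byte RFC 1002 session-service header: type 0x81 (SESSION REQUEST), a zero flags byte, and a 16-bit payload length of 0x44. The length follows from the trailer layout: two encoded NetBIOS names, each a length byte (32), 32 encoded bytes, and a null scope byte. A small sketch of that arithmetic (my reading of the constant, not a kernel helper):

#include <stdint.h>

/* RFC 1002 header: TYPE (1 byte), FLAGS (1 byte), LENGTH (2 bytes) */
static uint32_t rfc1002_session_request_hdr(void)
{
    uint16_t name = 1 + 32 + 1;     /* length byte + encoded name + null */
    uint16_t payload = 2 * name;    /* called + calling = 68 = 0x44 */

    return (0x81u << 24) | payload; /* == 0x81000044 */
}
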
2244static int
2245generic_ip_connect(struct TCP_Server_Info *server)
2140{ 2246{
2141 int rc = 0; 2247 int rc = 0;
2142 int val; 2248 unsigned short int sport;
2143 bool connected = false; 2249 int slen, sfamily;
2144 __be16 orig_port = 0;
2145 struct socket *socket = server->ssocket; 2250 struct socket *socket = server->ssocket;
2251 struct sockaddr *saddr;
2252
2253 saddr = (struct sockaddr *) &server->dstaddr;
2254
2255 if (server->dstaddr.ss_family == AF_INET6) {
2256 sport = ((struct sockaddr_in6 *) saddr)->sin6_port;
2257 slen = sizeof(struct sockaddr_in6);
2258 sfamily = AF_INET6;
2259 } else {
2260 sport = ((struct sockaddr_in *) saddr)->sin_port;
2261 slen = sizeof(struct sockaddr_in);
2262 sfamily = AF_INET;
2263 }
2146 2264
2147 if (socket == NULL) { 2265 if (socket == NULL) {
2148 rc = sock_create_kern(PF_INET, SOCK_STREAM, 2266 rc = sock_create_kern(sfamily, SOCK_STREAM,
2149 IPPROTO_TCP, &socket); 2267 IPPROTO_TCP, &socket);
2150 if (rc < 0) { 2268 if (rc < 0) {
2151 cERROR(1, "Error %d creating socket", rc); 2269 cERROR(1, "Error %d creating socket", rc);
2270 server->ssocket = NULL;
2152 return rc; 2271 return rc;
2153 } 2272 }
2154 2273
@@ -2156,63 +2275,28 @@ ipv4_connect(struct TCP_Server_Info *server)
2156 cFYI(1, "Socket created"); 2275 cFYI(1, "Socket created");
2157 server->ssocket = socket; 2276 server->ssocket = socket;
2158 socket->sk->sk_allocation = GFP_NOFS; 2277 socket->sk->sk_allocation = GFP_NOFS;
2159 cifs_reclassify_socket4(socket); 2278 if (sfamily == AF_INET6)
2279 cifs_reclassify_socket6(socket);
2280 else
2281 cifs_reclassify_socket4(socket);
2160 } 2282 }
2161 2283
2162 rc = bind_socket(server); 2284 rc = bind_socket(server);
2163 if (rc < 0) 2285 if (rc < 0)
2164 return rc; 2286 return rc;
2165 2287
2166 /* user overrode default port */ 2288 rc = socket->ops->connect(socket, saddr, slen, 0);
2167 if (server->addr.sockAddr.sin_port) { 2289 if (rc < 0) {
2168 rc = socket->ops->connect(socket, (struct sockaddr *) 2290 cFYI(1, "Error %d connecting to server", rc);
2169 &server->addr.sockAddr,
2170 sizeof(struct sockaddr_in), 0);
2171 if (rc >= 0)
2172 connected = true;
2173 }
2174
2175 if (!connected) {
2176 /* save original port so we can retry user specified port
2177 later if fall back ports fail this time */
2178 orig_port = server->addr.sockAddr.sin_port;
2179
2180 /* do not retry on the same port we just failed on */
2181 if (server->addr.sockAddr.sin_port != htons(CIFS_PORT)) {
2182 server->addr.sockAddr.sin_port = htons(CIFS_PORT);
2183 rc = socket->ops->connect(socket,
2184 (struct sockaddr *)
2185 &server->addr.sockAddr,
2186 sizeof(struct sockaddr_in), 0);
2187 if (rc >= 0)
2188 connected = true;
2189 }
2190 }
2191 if (!connected) {
2192 server->addr.sockAddr.sin_port = htons(RFC1001_PORT);
2193 rc = socket->ops->connect(socket, (struct sockaddr *)
2194 &server->addr.sockAddr,
2195 sizeof(struct sockaddr_in), 0);
2196 if (rc >= 0)
2197 connected = true;
2198 }
2199
2200 /* give up here - unless we want to retry on different
2201 protocol families some day */
2202 if (!connected) {
2203 if (orig_port)
2204 server->addr.sockAddr.sin_port = orig_port;
2205 cFYI(1, "Error %d connecting to server via ipv4", rc);
2206 sock_release(socket); 2291 sock_release(socket);
2207 server->ssocket = NULL; 2292 server->ssocket = NULL;
2208 return rc; 2293 return rc;
2209 } 2294 }
2210 2295
2211
2212 /* 2296 /*
2213 * Eventually check for other socket options to change from 2297 * Eventually check for other socket options to change from
2214 * the default. sock_setsockopt not used because it expects 2298 * the default. sock_setsockopt not used because it expects
2215 * user space buffer 2299 * user space buffer
2216 */ 2300 */
2217 socket->sk->sk_rcvtimeo = 7 * HZ; 2301 socket->sk->sk_rcvtimeo = 7 * HZ;
2218 socket->sk->sk_sndtimeo = 5 * HZ; 2302 socket->sk->sk_sndtimeo = 5 * HZ;
@@ -2226,7 +2310,7 @@ ipv4_connect(struct TCP_Server_Info *server)
2226 } 2310 }
2227 2311
2228 if (server->tcp_nodelay) { 2312 if (server->tcp_nodelay) {
2229 val = 1; 2313 int val = 1;
2230 rc = kernel_setsockopt(socket, SOL_TCP, TCP_NODELAY, 2314 rc = kernel_setsockopt(socket, SOL_TCP, TCP_NODELAY,
2231 (char *)&val, sizeof(val)); 2315 (char *)&val, sizeof(val));
2232 if (rc) 2316 if (rc)
@@ -2237,161 +2321,39 @@ ipv4_connect(struct TCP_Server_Info *server)
2237 socket->sk->sk_sndbuf, 2321 socket->sk->sk_sndbuf,
2238 socket->sk->sk_rcvbuf, socket->sk->sk_rcvtimeo); 2322 socket->sk->sk_rcvbuf, socket->sk->sk_rcvtimeo);
2239 2323
2240 /* send RFC1001 sessinit */ 2324 if (sport == htons(RFC1001_PORT))
2241 if (server->addr.sockAddr.sin_port == htons(RFC1001_PORT)) { 2325 rc = ip_rfc1001_connect(server);
2242 /* some servers require RFC1001 sessinit before sending
2243 negprot - BB check reconnection in case where second
2244 sessinit is sent but no second negprot */
2245 struct rfc1002_session_packet *ses_init_buf;
2246 struct smb_hdr *smb_buf;
2247 ses_init_buf = kzalloc(sizeof(struct rfc1002_session_packet),
2248 GFP_KERNEL);
2249 if (ses_init_buf) {
2250 ses_init_buf->trailer.session_req.called_len = 32;
2251 if (server->server_RFC1001_name &&
2252 server->server_RFC1001_name[0] != 0)
2253 rfc1002mangle(ses_init_buf->trailer.
2254 session_req.called_name,
2255 server->server_RFC1001_name,
2256 RFC1001_NAME_LEN_WITH_NULL);
2257 else
2258 rfc1002mangle(ses_init_buf->trailer.
2259 session_req.called_name,
2260 DEFAULT_CIFS_CALLED_NAME,
2261 RFC1001_NAME_LEN_WITH_NULL);
2262
2263 ses_init_buf->trailer.session_req.calling_len = 32;
2264
2265 /* calling name ends in null (byte 16) from old smb
2266 convention. */
2267 if (server->workstation_RFC1001_name &&
2268 server->workstation_RFC1001_name[0] != 0)
2269 rfc1002mangle(ses_init_buf->trailer.
2270 session_req.calling_name,
2271 server->workstation_RFC1001_name,
2272 RFC1001_NAME_LEN_WITH_NULL);
2273 else
2274 rfc1002mangle(ses_init_buf->trailer.
2275 session_req.calling_name,
2276 "LINUX_CIFS_CLNT",
2277 RFC1001_NAME_LEN_WITH_NULL);
2278
2279 ses_init_buf->trailer.session_req.scope1 = 0;
2280 ses_init_buf->trailer.session_req.scope2 = 0;
2281 smb_buf = (struct smb_hdr *)ses_init_buf;
2282 /* sizeof RFC1002_SESSION_REQUEST with no scope */
2283 smb_buf->smb_buf_length = 0x81000044;
2284 rc = smb_send(server, smb_buf, 0x44);
2285 kfree(ses_init_buf);
2286 msleep(1); /* RFC1001 layer in at least one server
2287 requires very short break before negprot
2288 presumably because not expecting negprot
2289 to follow so fast. This is a simple
2290 solution that works without
2291 complicating the code and causes no
2292 significant slowing down on mount
2293 for everyone else */
2294 }
2295 /* else the negprot may still work without this
2296 even though malloc failed */
2297
2298 }
2299 2326
2300 return rc; 2327 return rc;
2301} 2328}
2302 2329
2303static int 2330static int
2304ipv6_connect(struct TCP_Server_Info *server) 2331ip_connect(struct TCP_Server_Info *server)
2305{ 2332{
2306 int rc = 0; 2333 unsigned short int *sport;
2307 int val; 2334 struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)&server->dstaddr;
2308 bool connected = false; 2335 struct sockaddr_in *addr = (struct sockaddr_in *)&server->dstaddr;
2309 __be16 orig_port = 0;
2310 struct socket *socket = server->ssocket;
2311 2336
2312 if (socket == NULL) { 2337 if (server->dstaddr.ss_family == AF_INET6)
2313 rc = sock_create_kern(PF_INET6, SOCK_STREAM, 2338 sport = &addr6->sin6_port;
2314 IPPROTO_TCP, &socket); 2339 else
2315 if (rc < 0) { 2340 sport = &addr->sin_port;
2316 cERROR(1, "Error %d creating ipv6 socket", rc);
2317 socket = NULL;
2318 return rc;
2319 }
2320 2341
2321 /* BB other socket options to set KEEPALIVE, NODELAY? */ 2342 if (*sport == 0) {
2322 cFYI(1, "ipv6 Socket created"); 2343 int rc;
2323 server->ssocket = socket;
2324 socket->sk->sk_allocation = GFP_NOFS;
2325 cifs_reclassify_socket6(socket);
2326 }
2327 2344
2328 rc = bind_socket(server); 2345 /* try with 445 port at first */
2329 if (rc < 0) 2346 *sport = htons(CIFS_PORT);
2330 return rc;
2331 2347
2332 /* user overrode default port */ 2348 rc = generic_ip_connect(server);
2333 if (server->addr.sockAddr6.sin6_port) {
2334 rc = socket->ops->connect(socket,
2335 (struct sockaddr *) &server->addr.sockAddr6,
2336 sizeof(struct sockaddr_in6), 0);
2337 if (rc >= 0) 2349 if (rc >= 0)
2338 connected = true; 2350 return rc;
2339 }
2340
2341 if (!connected) {
2342 /* save original port so we can retry user specified port
2343 later if fall back ports fail this time */
2344
2345 orig_port = server->addr.sockAddr6.sin6_port;
2346 /* do not retry on the same port we just failed on */
2347 if (server->addr.sockAddr6.sin6_port != htons(CIFS_PORT)) {
2348 server->addr.sockAddr6.sin6_port = htons(CIFS_PORT);
2349 rc = socket->ops->connect(socket, (struct sockaddr *)
2350 &server->addr.sockAddr6,
2351 sizeof(struct sockaddr_in6), 0);
2352 if (rc >= 0)
2353 connected = true;
2354 }
2355 }
2356 if (!connected) {
2357 server->addr.sockAddr6.sin6_port = htons(RFC1001_PORT);
2358 rc = socket->ops->connect(socket, (struct sockaddr *)
2359 &server->addr.sockAddr6,
2360 sizeof(struct sockaddr_in6), 0);
2361 if (rc >= 0)
2362 connected = true;
2363 }
2364
2365 /* give up here - unless we want to retry on different
2366 protocol families some day */
2367 if (!connected) {
2368 if (orig_port)
2369 server->addr.sockAddr6.sin6_port = orig_port;
2370 cFYI(1, "Error %d connecting to server via ipv6", rc);
2371 sock_release(socket);
2372 server->ssocket = NULL;
2373 return rc;
2374 }
2375
2376 /*
2377 * Eventually check for other socket options to change from
2378 * the default. sock_setsockopt not used because it expects
2379 * user space buffer
2380 */
2381 socket->sk->sk_rcvtimeo = 7 * HZ;
2382 socket->sk->sk_sndtimeo = 5 * HZ;
2383 2351
2384 if (server->tcp_nodelay) { 2352 /* if it failed, try with 139 port */
2385 val = 1; 2353 *sport = htons(RFC1001_PORT);
2386 rc = kernel_setsockopt(socket, SOL_TCP, TCP_NODELAY,
2387 (char *)&val, sizeof(val));
2388 if (rc)
2389 cFYI(1, "set TCP_NODELAY socket option error %d", rc);
2390 } 2354 }
2391 2355
2392 server->ssocket = socket; 2356 return generic_ip_connect(server);
2393
2394 return rc;
2395} 2357}
2396 2358
2397void reset_cifs_unix_caps(int xid, struct cifsTconInfo *tcon, 2359void reset_cifs_unix_caps(int xid, struct cifsTconInfo *tcon,
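
ip_connect() now holds the fallback policy that ipv4_connect() and ipv6_connect() used to duplicate: an explicitly requested port is used as-is, otherwise port 445 is tried first and 139 second, with generic_ip_connect() doing the family-independent work. A compact model of the policy (connect_fn stands in for generic_ip_connect()):

#include <stdint.h>
#include <arpa/inet.h>

typedef int (*connect_fn)(void *server);

/* honor an explicit port; otherwise try 445, then fall back to 139 */
static int connect_with_fallback(void *server, uint16_t *sport_be,
                                 connect_fn do_connect)
{
    int rc;

    if (*sport_be != 0)
        return do_connect(server);  /* the user picked the port */

    *sport_be = htons(445);
    rc = do_connect(server);
    if (rc >= 0)
        return rc;

    *sport_be = htons(139);         /* second and last attempt */
    return do_connect(server);
}
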
@@ -2970,8 +2932,8 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
2970 TCONX_RSP *pSMBr; 2932 TCONX_RSP *pSMBr;
2971 unsigned char *bcc_ptr; 2933 unsigned char *bcc_ptr;
2972 int rc = 0; 2934 int rc = 0;
2973 int length, bytes_left; 2935 int length;
2974 __u16 count; 2936 __u16 bytes_left, count;
2975 2937
2976 if (ses == NULL) 2938 if (ses == NULL)
2977 return -EIO; 2939 return -EIO;
@@ -2999,7 +2961,7 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
2999 bcc_ptr++; /* skip password */ 2961 bcc_ptr++; /* skip password */
3000 /* already aligned so no need to do it below */ 2962 /* already aligned so no need to do it below */
3001 } else { 2963 } else {
3002 pSMB->PasswordLength = cpu_to_le16(CIFS_SESS_KEY_SIZE); 2964 pSMB->PasswordLength = cpu_to_le16(CIFS_AUTH_RESP_SIZE);
3003 /* BB FIXME add code to fail this if NTLMv2 or Kerberos 2965 /* BB FIXME add code to fail this if NTLMv2 or Kerberos
3004 specified as required (when that support is added to 2966 specified as required (when that support is added to
3005 the vfs in the future) as only NTLM or the much 2967 the vfs in the future) as only NTLM or the much
@@ -3017,7 +2979,7 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
3017#endif /* CIFS_WEAK_PW_HASH */ 2979#endif /* CIFS_WEAK_PW_HASH */
3018 SMBNTencrypt(tcon->password, ses->server->cryptkey, bcc_ptr); 2980 SMBNTencrypt(tcon->password, ses->server->cryptkey, bcc_ptr);
3019 2981
3020 bcc_ptr += CIFS_SESS_KEY_SIZE; 2982 bcc_ptr += CIFS_AUTH_RESP_SIZE;
3021 if (ses->capabilities & CAP_UNICODE) { 2983 if (ses->capabilities & CAP_UNICODE) {
3022 /* must align unicode strings */ 2984 /* must align unicode strings */
3023 *bcc_ptr = 0; /* null byte password */ 2985 *bcc_ptr = 0; /* null byte password */
@@ -3055,7 +3017,7 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
3055 pSMB->ByteCount = cpu_to_le16(count); 3017 pSMB->ByteCount = cpu_to_le16(count);
3056 3018
3057 rc = SendReceive(xid, ses, smb_buffer, smb_buffer_response, &length, 3019 rc = SendReceive(xid, ses, smb_buffer, smb_buffer_response, &length,
3058 CIFS_STD_OP); 3020 0);
3059 3021
3060 /* above now done in SendReceive */ 3022 /* above now done in SendReceive */
3061 if ((rc == 0) && (tcon != NULL)) { 3023 if ((rc == 0) && (tcon != NULL)) {
@@ -3065,7 +3027,7 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
3065 tcon->need_reconnect = false; 3027 tcon->need_reconnect = false;
3066 tcon->tid = smb_buffer_response->Tid; 3028 tcon->tid = smb_buffer_response->Tid;
3067 bcc_ptr = pByteArea(smb_buffer_response); 3029 bcc_ptr = pByteArea(smb_buffer_response);
3068 bytes_left = BCC(smb_buffer_response); 3030 bytes_left = get_bcc(smb_buffer_response);
3069 length = strnlen(bcc_ptr, bytes_left - 2); 3031 length = strnlen(bcc_ptr, bytes_left - 2);
3070 if (smb_buffer->Flags2 & SMBFLG2_UNICODE) 3032 if (smb_buffer->Flags2 & SMBFLG2_UNICODE)
3071 is_unicode = true; 3033 is_unicode = true;
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index db2a58c00f7b..dd5f22918c33 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -130,17 +130,6 @@ cifs_bp_rename_retry:
130 return full_path; 130 return full_path;
131} 131}
132 132
133static void setup_cifs_dentry(struct cifsTconInfo *tcon,
134 struct dentry *direntry,
135 struct inode *newinode)
136{
137 if (tcon->nocase)
138 d_set_d_op(direntry, &cifs_ci_dentry_ops);
139 else
140 d_set_d_op(direntry, &cifs_dentry_ops);
141 d_instantiate(direntry, newinode);
142}
143
144/* Inode operations in similar order to how they appear in Linux file fs.h */ 133/* Inode operations in similar order to how they appear in Linux file fs.h */
 
 int
@@ -293,10 +282,8 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
 			args.uid = NO_CHANGE_64;
 			args.gid = NO_CHANGE_64;
 		}
-		CIFSSMBUnixSetPathInfo(xid, tcon, full_path, &args,
-				       cifs_sb->local_nls,
-				       cifs_sb->mnt_cifs_flags &
-					CIFS_MOUNT_MAP_SPECIAL_CHR);
+		CIFSSMBUnixSetFileInfo(xid, tcon, &args, fileHandle,
+				       current->tgid);
 	} else {
 		/* BB implement mode setting via Windows security
 		   descriptors e.g. */
@@ -329,7 +316,7 @@ cifs_create_get_file_info:
 
 cifs_create_set_dentry:
 	if (rc == 0)
-		setup_cifs_dentry(tcon, direntry, newinode);
+		d_instantiate(direntry, newinode);
 	else
 		cFYI(1, "Create worked, get_inode_info failed rc = %d", rc);
 
@@ -420,10 +407,6 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode,
 
 		rc = cifs_get_inode_info_unix(&newinode, full_path,
 						inode->i_sb, xid);
-		if (pTcon->nocase)
-			d_set_d_op(direntry, &cifs_ci_dentry_ops);
-		else
-			d_set_d_op(direntry, &cifs_dentry_ops);
 
 		if (rc == 0)
 			d_instantiate(direntry, newinode);
@@ -603,10 +586,6 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
 			parent_dir_inode->i_sb, xid, NULL);
 
 	if ((rc == 0) && (newInode != NULL)) {
-		if (pTcon->nocase)
-			d_set_d_op(direntry, &cifs_ci_dentry_ops);
-		else
-			d_set_d_op(direntry, &cifs_dentry_ops);
 		d_add(direntry, newInode);
 		if (posix_open) {
 			filp = lookup_instantiate_filp(nd, direntry,
@@ -633,10 +612,6 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
 	} else if (rc == -ENOENT) {
 		rc = 0;
 		direntry->d_time = jiffies;
-		if (pTcon->nocase)
-			d_set_d_op(direntry, &cifs_ci_dentry_ops);
-		else
-			d_set_d_op(direntry, &cifs_dentry_ops);
 		d_add(direntry, NULL);
 	/*	if it was once a directory (but how can we tell?) we could do
 		shrink_dcache_parent(direntry); */
@@ -700,6 +675,7 @@ cifs_d_revalidate(struct dentry *direntry, struct nameidata *nd)
 
 const struct dentry_operations cifs_dentry_ops = {
 	.d_revalidate = cifs_d_revalidate,
+	.d_automount = cifs_dfs_d_automount,
 /* d_delete:       cifs_d_delete,      */ /* not needed except for debugging */
 };
 
@@ -736,4 +712,5 @@ const struct dentry_operations cifs_ci_dentry_ops = {
 	.d_revalidate = cifs_d_revalidate,
 	.d_hash = cifs_ci_hash,
 	.d_compare = cifs_ci_compare,
+	.d_automount = cifs_dfs_d_automount,
 };
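
The two hunks above wire DFS referrals into the 2.6.38 automount infrastructure instead of per-dentry operation selection: any dentry using these tables can trigger cifs_dfs_d_automount when a path walk reaches an inode marked S_AUTOMOUNT (set in the fs/cifs/inode.c hunks below). A minimal sketch, assuming the 2.6.38 d_automount contract:

	/* Sketch only: the VFS calls this hook when a walk lands on a
	 * dentry whose inode has S_AUTOMOUNT set; the handler returns
	 * the vfsmount to splice in (here, the DFS target share). */
	struct vfsmount *cifs_dfs_d_automount(struct path *path);
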
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 5a28660ca2b5..d7d65a70678e 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -104,53 +104,6 @@ static inline int cifs_get_disposition(unsigned int flags)
 		return FILE_OPEN;
 }
 
-static inline int cifs_open_inode_helper(struct inode *inode,
-	struct cifsTconInfo *pTcon, __u32 oplock, FILE_ALL_INFO *buf,
-	char *full_path, int xid)
-{
-	struct cifsInodeInfo *pCifsInode = CIFS_I(inode);
-	struct timespec temp;
-	int rc;
-
-	if (pCifsInode->clientCanCacheRead) {
-		/* we have the inode open somewhere else
-		   no need to discard cache data */
-		goto client_can_cache;
-	}
-
-	/* BB need same check in cifs_create too? */
-	/* if not oplocked, invalidate inode pages if mtime or file
-	   size changed */
-	temp = cifs_NTtimeToUnix(buf->LastWriteTime);
-	if (timespec_equal(&inode->i_mtime, &temp) &&
-	   (inode->i_size ==
-	    (loff_t)le64_to_cpu(buf->EndOfFile))) {
-		cFYI(1, "inode unchanged on server");
-	} else {
-		if (inode->i_mapping) {
-			/* BB no need to lock inode until after invalidate
-			   since namei code should already have it locked? */
-			rc = filemap_write_and_wait(inode->i_mapping);
-			mapping_set_error(inode->i_mapping, rc);
-		}
-		cFYI(1, "invalidating remote inode since open detected it "
-			"changed");
-		invalidate_remote_inode(inode);
-	}
-
-client_can_cache:
-	if (pTcon->unix_ext)
-		rc = cifs_get_inode_info_unix(&inode, full_path, inode->i_sb,
-			xid);
-	else
-		rc = cifs_get_inode_info(&inode, full_path, buf, inode->i_sb,
-			 xid, NULL);
-
-	cifs_set_oplock_level(pCifsInode, oplock);
-
-	return rc;
-}
-
 int cifs_posix_open(char *full_path, struct inode **pinode,
 			struct super_block *sb, int mode, unsigned int f_flags,
 			__u32 *poplock, __u16 *pnetfid, int xid)
@@ -213,6 +166,76 @@ posix_open_ret:
 	return rc;
 }
 
+static int
+cifs_nt_open(char *full_path, struct inode *inode, struct cifs_sb_info *cifs_sb,
+	     struct cifsTconInfo *tcon, unsigned int f_flags, __u32 *poplock,
+	     __u16 *pnetfid, int xid)
+{
+	int rc;
+	int desiredAccess;
+	int disposition;
+	FILE_ALL_INFO *buf;
+
+	desiredAccess = cifs_convert_flags(f_flags);
+
+/*********************************************************************
+ *  open flag mapping table:
+ *
+ *	POSIX Flag            CIFS Disposition
+ *	----------            ----------------
+ *	O_CREAT               FILE_OPEN_IF
+ *	O_CREAT | O_EXCL      FILE_CREATE
+ *	O_CREAT | O_TRUNC     FILE_OVERWRITE_IF
+ *	O_TRUNC               FILE_OVERWRITE
+ *	none of the above     FILE_OPEN
+ *
+ *	Note that there is not a direct match between disposition
+ *	FILE_SUPERSEDE (ie create whether or not file exists although
+ *	O_CREAT | O_TRUNC is similar but truncates the existing
+ *	file rather than creating a new file as FILE_SUPERSEDE does
+ *	(which uses the attributes / metadata passed in on open call)
+ *?
+ *?  O_SYNC is a reasonable match to CIFS writethrough flag
+ *?  and the read write flags match reasonably.  O_LARGEFILE
+ *?  is irrelevant because largefile support is always used
+ *?  by this client. Flags O_APPEND, O_DIRECT, O_DIRECTORY,
+ *	 O_FASYNC, O_NOFOLLOW, O_NONBLOCK need further investigation
+ *********************************************************************/
+
+	disposition = cifs_get_disposition(f_flags);
+
+	/* BB pass O_SYNC flag through on file attributes .. BB */
+
+	buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+
+	if (tcon->ses->capabilities & CAP_NT_SMBS)
+		rc = CIFSSMBOpen(xid, tcon, full_path, disposition,
+			 desiredAccess, CREATE_NOT_DIR, pnetfid, poplock, buf,
+			 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags
+				 & CIFS_MOUNT_MAP_SPECIAL_CHR);
+	else
+		rc = SMBLegacyOpen(xid, tcon, full_path, disposition,
+			desiredAccess, CREATE_NOT_DIR, pnetfid, poplock, buf,
+			cifs_sb->local_nls, cifs_sb->mnt_cifs_flags
+				& CIFS_MOUNT_MAP_SPECIAL_CHR);
+
+	if (rc)
+		goto out;
+
+	if (tcon->unix_ext)
+		rc = cifs_get_inode_info_unix(&inode, full_path, inode->i_sb,
+					      xid);
+	else
+		rc = cifs_get_inode_info(&inode, full_path, buf, inode->i_sb,
+					 xid, pnetfid);
+
+out:
+	kfree(buf);
+	return rc;
+}
+
 struct cifsFileInfo *
 cifs_new_fileinfo(__u16 fileHandle, struct file *file,
 		  struct tcon_link *tlink, __u32 oplock)
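
For reference, the mapping table in the comment above can be read as straight-line code. A minimal sketch (the helper name here is illustrative; the real implementation is cifs_get_disposition(), defined earlier in fs/cifs/file.c and unchanged by this patch):

	static inline int disposition_from_flags(unsigned int flags)
	{
		if ((flags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL))
			return FILE_CREATE;		/* create, fail if it exists */
		else if ((flags & (O_CREAT | O_TRUNC)) == (O_CREAT | O_TRUNC))
			return FILE_OVERWRITE_IF;	/* create or truncate */
		else if (flags & O_CREAT)
			return FILE_OPEN_IF;		/* open, create if missing */
		else if (flags & O_TRUNC)
			return FILE_OVERWRITE;		/* truncate existing */
		else
			return FILE_OPEN;		/* plain open */
	}
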
@@ -264,6 +287,7 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file)
 	struct inode *inode = cifs_file->dentry->d_inode;
 	struct cifsTconInfo *tcon = tlink_tcon(cifs_file->tlink);
 	struct cifsInodeInfo *cifsi = CIFS_I(inode);
+	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
 	struct cifsLockInfo *li, *tmp;
 
 	spin_lock(&cifs_file_list_lock);
@@ -279,6 +303,13 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file)
 	if (list_empty(&cifsi->openFileList)) {
 		cFYI(1, "closing last open instance for inode %p",
 			cifs_file->dentry->d_inode);
+
+		/* in strict cache mode we need to invalidate the mapping on
+		   the last close because it may cause an error when we open
+		   this file again and get at least a level II oplock */
+		if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_STRICT_IO)
+			CIFS_I(inode)->invalid_mapping = true;
+
 		cifs_set_oplock_level(cifsi, 0);
 	}
 	spin_unlock(&cifs_file_list_lock);
@@ -317,10 +348,8 @@ int cifs_open(struct inode *inode, struct file *file)
 	struct cifsFileInfo *pCifsFile = NULL;
 	struct cifsInodeInfo *pCifsInode;
 	char *full_path = NULL;
-	int desiredAccess;
-	int disposition;
+	bool posix_open_ok = false;
 	__u16 netfid;
-	FILE_ALL_INFO *buf = NULL;
 
 	xid = GetXid();
 
@@ -358,17 +387,7 @@ int cifs_open(struct inode *inode, struct file *file)
 				file->f_flags, &oplock, &netfid, xid);
 		if (rc == 0) {
 			cFYI(1, "posix open succeeded");
-
-			pCifsFile = cifs_new_fileinfo(netfid, file, tlink,
-						      oplock);
-			if (pCifsFile == NULL) {
-				CIFSSMBClose(xid, tcon, netfid);
-				rc = -ENOMEM;
-			}
-
-			cifs_fscache_set_inode_cookie(inode, file);
-
-			goto out;
+			posix_open_ok = true;
 		} else if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) {
 			if (tcon->ses->serverNOS)
 				cERROR(1, "server %s of type %s returned"
@@ -385,103 +404,39 @@ int cifs_open(struct inode *inode, struct file *file)
 			   or DFS errors */
 	}
 
-	desiredAccess = cifs_convert_flags(file->f_flags);
-
-/*********************************************************************
- *  open flag mapping table:
- *
- *	POSIX Flag            CIFS Disposition
- *	----------            ----------------
- *	O_CREAT               FILE_OPEN_IF
- *	O_CREAT | O_EXCL      FILE_CREATE
- *	O_CREAT | O_TRUNC     FILE_OVERWRITE_IF
- *	O_TRUNC               FILE_OVERWRITE
- *	none of the above     FILE_OPEN
- *
- *	Note that there is not a direct match between disposition
- *	FILE_SUPERSEDE (ie create whether or not file exists although
- *	O_CREAT | O_TRUNC is similar but truncates the existing
- *	file rather than creating a new file as FILE_SUPERSEDE does
- *	(which uses the attributes / metadata passed in on open call)
- *?
- *?  O_SYNC is a reasonable match to CIFS writethrough flag
- *?  and the read write flags match reasonably.  O_LARGEFILE
- *?  is irrelevant because largefile support is always used
- *?  by this client. Flags O_APPEND, O_DIRECT, O_DIRECTORY,
- *	 O_FASYNC, O_NOFOLLOW, O_NONBLOCK need further investigation
- *********************************************************************/
-
-	disposition = cifs_get_disposition(file->f_flags);
-
-	/* BB pass O_SYNC flag through on file attributes .. BB */
-
-	/* Also refresh inode by passing in file_info buf returned by SMBOpen
-	   and calling get_inode_info with returned buf (at least helps
-	   non-Unix server case) */
-
-	/* BB we can not do this if this is the second open of a file
-	   and the first handle has writebehind data, we might be
-	   able to simply do a filemap_fdatawrite/filemap_fdatawait first */
-	buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL);
-	if (!buf) {
-		rc = -ENOMEM;
-		goto out;
-	}
-
-	if (tcon->ses->capabilities & CAP_NT_SMBS)
-		rc = CIFSSMBOpen(xid, tcon, full_path, disposition,
-			 desiredAccess, CREATE_NOT_DIR, &netfid, &oplock, buf,
-			 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags
-				 & CIFS_MOUNT_MAP_SPECIAL_CHR);
-	else
-		rc = -EIO; /* no NT SMB support fall into legacy open below */
-
-	if (rc == -EIO) {
-		/* Old server, try legacy style OpenX */
-		rc = SMBLegacyOpen(xid, tcon, full_path, disposition,
-			desiredAccess, CREATE_NOT_DIR, &netfid, &oplock, buf,
-			cifs_sb->local_nls, cifs_sb->mnt_cifs_flags
-				& CIFS_MOUNT_MAP_SPECIAL_CHR);
-	}
-	if (rc) {
-		cFYI(1, "cifs_open returned 0x%x", rc);
-		goto out;
+	if (!posix_open_ok) {
+		rc = cifs_nt_open(full_path, inode, cifs_sb, tcon,
+				  file->f_flags, &oplock, &netfid, xid);
+		if (rc)
+			goto out;
 	}
 
-	rc = cifs_open_inode_helper(inode, tcon, oplock, buf, full_path, xid);
-	if (rc != 0)
-		goto out;
-
 	pCifsFile = cifs_new_fileinfo(netfid, file, tlink, oplock);
 	if (pCifsFile == NULL) {
+		CIFSSMBClose(xid, tcon, netfid);
 		rc = -ENOMEM;
 		goto out;
 	}
 
 	cifs_fscache_set_inode_cookie(inode, file);
 
-	if (oplock & CIFS_CREATE_ACTION) {
+	if ((oplock & CIFS_CREATE_ACTION) && !posix_open_ok && tcon->unix_ext) {
 		/* time to set mode which we can not set earlier due to
 		   problems creating new read-only files */
-		if (tcon->unix_ext) {
-			struct cifs_unix_set_info_args args = {
-				.mode	= inode->i_mode,
-				.uid	= NO_CHANGE_64,
-				.gid	= NO_CHANGE_64,
-				.ctime	= NO_CHANGE_64,
-				.atime	= NO_CHANGE_64,
-				.mtime	= NO_CHANGE_64,
-				.device	= 0,
-			};
-			CIFSSMBUnixSetPathInfo(xid, tcon, full_path, &args,
-					       cifs_sb->local_nls,
-					       cifs_sb->mnt_cifs_flags &
-						CIFS_MOUNT_MAP_SPECIAL_CHR);
-		}
+		struct cifs_unix_set_info_args args = {
+			.mode	= inode->i_mode,
+			.uid	= NO_CHANGE_64,
+			.gid	= NO_CHANGE_64,
+			.ctime	= NO_CHANGE_64,
+			.atime	= NO_CHANGE_64,
+			.mtime	= NO_CHANGE_64,
+			.device	= 0,
+		};
+		CIFSSMBUnixSetFileInfo(xid, tcon, &args, netfid,
+					pCifsFile->pid);
 	}
 
 out:
-	kfree(buf);
 	kfree(full_path);
 	FreeXid(xid);
 	cifs_put_tlink(tlink);
@@ -779,12 +734,12 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
 
 		/* BB we could chain these into one lock request BB */
 		rc = CIFSSMBLock(xid, tcon, netfid, length, pfLock->fl_start,
-				 0, 1, lockType, 0 /* wait flag */ );
+				 0, 1, lockType, 0 /* wait flag */, 0);
 		if (rc == 0) {
 			rc = CIFSSMBLock(xid, tcon, netfid, length,
 					 pfLock->fl_start, 1 /* numUnlock */ ,
 					 0 /* numLock */ , lockType,
-					 0 /* wait flag */ );
+					 0 /* wait flag */, 0);
 			pfLock->fl_type = F_UNLCK;
 			if (rc != 0)
 				cERROR(1, "Error unlocking previously locked "
@@ -801,13 +756,13 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
 			rc = CIFSSMBLock(xid, tcon, netfid, length,
 					 pfLock->fl_start, 0, 1,
 					 lockType | LOCKING_ANDX_SHARED_LOCK,
-					 0 /* wait flag */);
+					 0 /* wait flag */, 0);
 			if (rc == 0) {
 				rc = CIFSSMBLock(xid, tcon, netfid,
 						 length, pfLock->fl_start, 1, 0,
 						 lockType |
 						 LOCKING_ANDX_SHARED_LOCK,
-						 0 /* wait flag */);
+						 0 /* wait flag */, 0);
 				pfLock->fl_type = F_RDLCK;
 				if (rc != 0)
 					cERROR(1, "Error unlocking "
@@ -850,8 +805,8 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
 
 	if (numLock) {
 		rc = CIFSSMBLock(xid, tcon, netfid, length,
-				 pfLock->fl_start,
-				 0, numLock, lockType, wait_flag);
+				 pfLock->fl_start, 0, numLock, lockType,
+				 wait_flag, 0);
 
 		if (rc == 0) {
 			/* For Windows locks we must store them. */
@@ -871,9 +826,9 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
 				    (pfLock->fl_start + length) >=
 				    (li->offset + li->length)) {
 					stored_rc = CIFSSMBLock(xid, tcon,
-							netfid,
-							li->length, li->offset,
-							1, 0, li->type, false);
+							netfid, li->length,
+							li->offset, 1, 0,
+							li->type, false, 0);
 					if (stored_rc)
 						rc = stored_rc;
 					else {
@@ -892,29 +847,6 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
 	return rc;
 }
 
-/*
- * Set the timeout on write requests past EOF. For some servers (Windows)
- * these calls can be very long.
- *
- * If we're writing >10M past the EOF we give a 180s timeout. Anything less
- * than that gets a 45s timeout. Writes not past EOF get 15s timeouts.
- * The 10M cutoff is totally arbitrary. A better scheme for this would be
- * welcome if someone wants to suggest one.
- *
- * We may be able to do a better job with this if there were some way to
- * declare that a file should be sparse.
- */
-static int
-cifs_write_timeout(struct cifsInodeInfo *cifsi, loff_t offset)
-{
-	if (offset <= cifsi->server_eof)
-		return CIFS_STD_OP;
-	else if (offset > (cifsi->server_eof + (10 * 1024 * 1024)))
-		return CIFS_VLONG_OP;
-	else
-		return CIFS_LONG_OP;
-}
-
 /* update the file size (if needed) after a write */
 static void
 cifs_update_eof(struct cifsInodeInfo *cifsi, loff_t offset,
@@ -935,7 +867,7 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data,
 	unsigned int total_written;
 	struct cifs_sb_info *cifs_sb;
 	struct cifsTconInfo *pTcon;
-	int xid, long_op;
+	int xid;
 	struct cifsFileInfo *open_file;
 	struct cifsInodeInfo *cifsi = CIFS_I(inode);
 
@@ -956,7 +888,6 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data,
 
 	xid = GetXid();
 
-	long_op = cifs_write_timeout(cifsi, *poffset);
 	for (total_written = 0; write_size > total_written;
 	     total_written += bytes_written) {
 		rc = -EAGAIN;
@@ -984,7 +915,7 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data,
 					min_t(const int, cifs_sb->wsize,
 					      write_size - total_written),
 					*poffset, &bytes_written,
-					NULL, write_data + total_written, long_op);
+					NULL, write_data + total_written, 0);
 		}
 		if (rc || (bytes_written == 0)) {
 			if (total_written)
@@ -997,8 +928,6 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data,
 			cifs_update_eof(cifsi, *poffset, bytes_written);
 			*poffset += bytes_written;
 		}
-		long_op = CIFS_STD_OP; /* subsequent writes fast -
-					  15 seconds is plenty */
 	}
 
 	cifs_stats_bytes_written(pTcon, total_written);
@@ -1027,7 +956,7 @@ static ssize_t cifs_write(struct cifsFileInfo *open_file,
 	unsigned int total_written;
 	struct cifs_sb_info *cifs_sb;
 	struct cifsTconInfo *pTcon;
-	int xid, long_op;
+	int xid;
 	struct dentry *dentry = open_file->dentry;
 	struct cifsInodeInfo *cifsi = CIFS_I(dentry->d_inode);
 
@@ -1040,7 +969,6 @@ static ssize_t cifs_write(struct cifsFileInfo *open_file,
 
 	xid = GetXid();
 
-	long_op = cifs_write_timeout(cifsi, *poffset);
 	for (total_written = 0; write_size > total_written;
 	     total_written += bytes_written) {
 		rc = -EAGAIN;
@@ -1070,7 +998,7 @@ static ssize_t cifs_write(struct cifsFileInfo *open_file,
 				rc = CIFSSMBWrite2(xid, pTcon,
 						open_file->netfid, len,
 						*poffset, &bytes_written,
-						iov, 1, long_op);
+						iov, 1, 0);
 			} else
 				rc = CIFSSMBWrite(xid, pTcon,
 					 open_file->netfid,
@@ -1078,7 +1006,7 @@ static ssize_t cifs_write(struct cifsFileInfo *open_file,
 					 write_size - total_written),
 					 *poffset, &bytes_written,
 					 write_data + total_written,
-					 NULL, long_op);
+					 NULL, 0);
 		}
 		if (rc || (bytes_written == 0)) {
 			if (total_written)
@@ -1091,8 +1019,6 @@ static ssize_t cifs_write(struct cifsFileInfo *open_file,
 			cifs_update_eof(cifsi, *poffset, bytes_written);
 			*poffset += bytes_written;
 		}
-		long_op = CIFS_STD_OP; /* subsequent writes fast -
-					  15 seconds is plenty */
 	}
 
 	cifs_stats_bytes_written(pTcon, total_written);
@@ -1292,7 +1218,7 @@ static int cifs_writepages(struct address_space *mapping,
 	struct pagevec pvec;
 	int rc = 0;
 	int scanned = 0;
-	int xid, long_op;
+	int xid;
 
 	cifs_sb = CIFS_SB(mapping->host->i_sb);
 
@@ -1430,43 +1356,67 @@ retry:
 				break;
 			}
 			if (n_iov) {
+retry_write:
 				open_file = find_writable_file(CIFS_I(mapping->host),
 							false);
 				if (!open_file) {
 					cERROR(1, "No writable handles for inode");
 					rc = -EBADF;
 				} else {
-					long_op = cifs_write_timeout(cifsi, offset);
 					rc = CIFSSMBWrite2(xid, tcon, open_file->netfid,
 						   bytes_to_write, offset,
 						   &bytes_written, iov, n_iov,
-						   long_op);
+						   0);
 					cifsFileInfo_put(open_file);
-					cifs_update_eof(cifsi, offset, bytes_written);
 				}
 
-				if (rc || bytes_written < bytes_to_write) {
-					cERROR(1, "Write2 ret %d, wrote %d",
-						  rc, bytes_written);
-					mapping_set_error(mapping, rc);
-				} else {
+				cFYI(1, "Write2 rc=%d, wrote=%u", rc, bytes_written);
+
+				/*
+				 * For now, treat a short write as if nothing got
+				 * written. A zero length write however indicates
+				 * ENOSPC or EFBIG. We have no way to know which
+				 * though, so call it ENOSPC for now. EFBIG would
+				 * get translated to AS_EIO anyway.
+				 *
+				 * FIXME: make it take into account the data that did
+				 * get written
+				 */
+				if (rc == 0) {
+					if (bytes_written == 0)
+						rc = -ENOSPC;
+					else if (bytes_written < bytes_to_write)
+						rc = -EAGAIN;
+				}
+
+				/* retry on data-integrity flush */
+				if (wbc->sync_mode == WB_SYNC_ALL && rc == -EAGAIN)
+					goto retry_write;
+
+				/* fix the stats and EOF */
+				if (bytes_written > 0) {
 					cifs_stats_bytes_written(tcon, bytes_written);
+					cifs_update_eof(cifsi, offset, bytes_written);
 				}
 
 				for (i = 0; i < n_iov; i++) {
 					page = pvec.pages[first + i];
-					/* Should we also set page error on
-					success rc but too little data written? */
-					/* BB investigate retry logic on temporary
-					server crash cases and how recovery works
-					when page marked as error */
-					if (rc)
+					/* on retryable write error, redirty page */
+					if (rc == -EAGAIN)
+						redirty_page_for_writepage(wbc, page);
+					else if (rc != 0)
 						SetPageError(page);
 					kunmap(page);
 					unlock_page(page);
 					end_page_writeback(page);
 					page_cache_release(page);
 				}
+
+				if (rc != -EAGAIN)
+					mapping_set_error(mapping, rc);
+				else
+					rc = 0;
+
 				if ((wbc->nr_to_write -= n_iov) <= 0)
 					done = 1;
 				index = next;
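
The retry policy introduced above boils down to a small decision table; restated as a hypothetical helper (the function name is illustrative, not part of the patch):

	/* Illustrative only: how the new hunk classifies a send result. */
	static int classify_write_result(int rc, unsigned int written,
					 unsigned int requested)
	{
		if (rc)
			return rc;	/* transport/server error: propagate */
		if (written == 0)
			return -ENOSPC;	/* could be EFBIG; both become AS_EIO */
		if (written < requested)
			return -EAGAIN;	/* short write: redirty, retry on WB_SYNC_ALL */
		return 0;
	}
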
@@ -1578,27 +1528,47 @@ static int cifs_write_end(struct file *file, struct address_space *mapping,
 	return rc;
 }
 
-int cifs_fsync(struct file *file, int datasync)
+int cifs_strict_fsync(struct file *file, int datasync)
 {
 	int xid;
 	int rc = 0;
 	struct cifsTconInfo *tcon;
 	struct cifsFileInfo *smbfile = file->private_data;
 	struct inode *inode = file->f_path.dentry->d_inode;
+	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
 
 	xid = GetXid();
 
 	cFYI(1, "Sync file - name: %s datasync: 0x%x",
 		file->f_path.dentry->d_name.name, datasync);
 
-	rc = filemap_write_and_wait(inode->i_mapping);
-	if (rc == 0) {
-		struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+	if (!CIFS_I(inode)->clientCanCacheRead)
+		cifs_invalidate_mapping(inode);
 
-		tcon = tlink_tcon(smbfile->tlink);
-		if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOSSYNC))
-			rc = CIFSSMBFlush(xid, tcon, smbfile->netfid);
-	}
+	tcon = tlink_tcon(smbfile->tlink);
+	if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOSSYNC))
+		rc = CIFSSMBFlush(xid, tcon, smbfile->netfid);
+
+	FreeXid(xid);
+	return rc;
+}
+
+int cifs_fsync(struct file *file, int datasync)
+{
+	int xid;
+	int rc = 0;
+	struct cifsTconInfo *tcon;
+	struct cifsFileInfo *smbfile = file->private_data;
+	struct cifs_sb_info *cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
+
+	xid = GetXid();
+
+	cFYI(1, "Sync file - name: %s datasync: 0x%x",
+		file->f_path.dentry->d_name.name, datasync);
+
+	tcon = tlink_tcon(smbfile->tlink);
+	if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOSSYNC))
+		rc = CIFSSMBFlush(xid, tcon, smbfile->netfid);
 
 	FreeXid(xid);
 	return rc;
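
The strict variant only matters if strict-I/O mounts are routed to it. The wiring lives in fs/cifs/cifsfs.c (not shown in this diff) and is presumably along these lines, matching the cifs_file_strict_ops tables referenced in the fs/cifs/inode.c hunks below:

	const struct file_operations cifs_file_strict_ops = {
		/* ... other methods as in cifs_file_ops ... */
		.aio_read	= cifs_strict_readv,	/* added later in this file */
		.fsync		= cifs_strict_fsync,
		.mmap		= cifs_file_strict_mmap,
	};
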
@@ -1649,42 +1619,42 @@ int cifs_flush(struct file *file, fl_owner_t id)
 	return rc;
 }
 
-ssize_t cifs_user_read(struct file *file, char __user *read_data,
-		       size_t read_size, loff_t *poffset)
+static ssize_t
+cifs_iovec_read(struct file *file, const struct iovec *iov,
+		unsigned long nr_segs, loff_t *poffset)
 {
-	int rc = -EACCES;
-	unsigned int bytes_read = 0;
-	unsigned int total_read = 0;
-	unsigned int current_read_size;
+	int rc;
+	int xid;
+	unsigned int total_read, bytes_read = 0;
+	size_t len, cur_len;
+	int iov_offset = 0;
 	struct cifs_sb_info *cifs_sb;
 	struct cifsTconInfo *pTcon;
-	int xid;
 	struct cifsFileInfo *open_file;
-	char *smb_read_data;
-	char __user *current_offset;
 	struct smb_com_read_rsp *pSMBr;
+	char *read_data;
+
+	if (!nr_segs)
+		return 0;
+
+	len = iov_length(iov, nr_segs);
+	if (!len)
+		return 0;
 
 	xid = GetXid();
 	cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
 
-	if (file->private_data == NULL) {
-		rc = -EBADF;
-		FreeXid(xid);
-		return rc;
-	}
 	open_file = file->private_data;
 	pTcon = tlink_tcon(open_file->tlink);
 
 	if ((file->f_flags & O_ACCMODE) == O_WRONLY)
 		cFYI(1, "attempting read on write only file instance");
 
-	for (total_read = 0, current_offset = read_data;
-	     read_size > total_read;
-	     total_read += bytes_read, current_offset += bytes_read) {
-		current_read_size = min_t(const int, read_size - total_read,
-					  cifs_sb->rsize);
+	for (total_read = 0; total_read < len; total_read += bytes_read) {
+		cur_len = min_t(const size_t, len - total_read, cifs_sb->rsize);
 		rc = -EAGAIN;
-		smb_read_data = NULL;
+		read_data = NULL;
+
 		while (rc == -EAGAIN) {
 			int buf_type = CIFS_NO_BUFFER;
 			if (open_file->invalidHandle) {
@@ -1692,27 +1662,25 @@ ssize_t cifs_user_read(struct file *file, char __user *read_data,
 				if (rc != 0)
 					break;
 			}
-			rc = CIFSSMBRead(xid, pTcon,
-					 open_file->netfid,
-					 current_read_size, *poffset,
-					 &bytes_read, &smb_read_data,
-					 &buf_type);
-			pSMBr = (struct smb_com_read_rsp *)smb_read_data;
-			if (smb_read_data) {
-				if (copy_to_user(current_offset,
-						 smb_read_data +
-						 4 /* RFC1001 length field */ +
-						 le16_to_cpu(pSMBr->DataOffset),
-						 bytes_read))
+			rc = CIFSSMBRead(xid, pTcon, open_file->netfid,
+					 cur_len, *poffset, &bytes_read,
+					 &read_data, &buf_type);
+			pSMBr = (struct smb_com_read_rsp *)read_data;
+			if (read_data) {
+				char *data_offset = read_data + 4 +
+						le16_to_cpu(pSMBr->DataOffset);
+				if (memcpy_toiovecend(iov, data_offset,
+						      iov_offset, bytes_read))
 					rc = -EFAULT;
-
-				if (buf_type == CIFS_SMALL_BUFFER)
-					cifs_small_buf_release(smb_read_data);
-				else if (buf_type == CIFS_LARGE_BUFFER)
-					cifs_buf_release(smb_read_data);
-				smb_read_data = NULL;
+				if (buf_type == CIFS_SMALL_BUFFER)
+					cifs_small_buf_release(read_data);
+				else if (buf_type == CIFS_LARGE_BUFFER)
+					cifs_buf_release(read_data);
+				read_data = NULL;
+				iov_offset += bytes_read;
 			}
 		}
+
 		if (rc || (bytes_read == 0)) {
 			if (total_read) {
 				break;
@@ -1725,13 +1693,57 @@ ssize_t cifs_user_read(struct file *file, char __user *read_data,
 			*poffset += bytes_read;
 		}
 	}
+
 	FreeXid(xid);
 	return total_read;
 }
 
+ssize_t cifs_user_read(struct file *file, char __user *read_data,
+		       size_t read_size, loff_t *poffset)
+{
+	struct iovec iov;
+	iov.iov_base = read_data;
+	iov.iov_len = read_size;
+
+	return cifs_iovec_read(file, &iov, 1, poffset);
+}
+
+static ssize_t cifs_user_readv(struct kiocb *iocb, const struct iovec *iov,
+			       unsigned long nr_segs, loff_t pos)
+{
+	ssize_t read;
+
+	read = cifs_iovec_read(iocb->ki_filp, iov, nr_segs, &pos);
+	if (read > 0)
+		iocb->ki_pos = pos;
+
+	return read;
+}
+
+ssize_t cifs_strict_readv(struct kiocb *iocb, const struct iovec *iov,
+			  unsigned long nr_segs, loff_t pos)
+{
+	struct inode *inode;
+
+	inode = iocb->ki_filp->f_path.dentry->d_inode;
+
+	if (CIFS_I(inode)->clientCanCacheRead)
+		return generic_file_aio_read(iocb, iov, nr_segs, pos);
+
+	/*
+	 * In strict cache mode we need to read from the server all the time
+	 * if we don't have level II oplock because the server can delay mtime
+	 * change - so we can't make a decision about inode invalidating.
+	 * And we can also fail with pagereading if there are mandatory locks
+	 * on pages affected by this read but not on the region from pos to
+	 * pos+len-1.
+	 */
+
+	return cifs_user_readv(iocb, iov, nr_segs, pos);
+}
 
 static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size,
 			 loff_t *poffset)
 {
 	int rc = -EACCES;
 	unsigned int bytes_read = 0;
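
cifs_user_read() above shows the intended pattern: a plain user buffer is just a one-element iovec. The other building block is memcpy_toiovecend() (net/core/iovec.c in this kernel generation), which copies kernel data into a user iovec array at a running byte offset — roughly this usage, assuming kbuf holds klen bytes of response data:

	if (memcpy_toiovecend(iov, kbuf, iov_offset, klen))
		rc = -EFAULT;		/* a user page was not writable */
	else
		iov_offset += klen;	/* advance within the iovec array */
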
@@ -1799,6 +1811,21 @@ static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size,
 	return total_read;
 }
 
+int cifs_file_strict_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	int rc, xid;
+	struct inode *inode = file->f_path.dentry->d_inode;
+
+	xid = GetXid();
+
+	if (!CIFS_I(inode)->clientCanCacheRead)
+		cifs_invalidate_mapping(inode);
+
+	rc = generic_file_mmap(file, vma);
+	FreeXid(xid);
+	return rc;
+}
+
 int cifs_file_mmap(struct file *file, struct vm_area_struct *vma)
 {
 	int rc, xid;
@@ -2245,7 +2272,8 @@ void cifs_oplock_break(struct work_struct *work)
 	 */
 	if (!cfile->oplock_break_cancelled) {
 		rc = CIFSSMBLock(0, tlink_tcon(cfile->tlink), cfile->netfid, 0,
-				 0, 0, 0, LOCKING_ANDX_OPLOCK_RELEASE, false);
+				 0, 0, 0, LOCKING_ANDX_OPLOCK_RELEASE, false,
+				 cinode->clientCanCacheRead ? 1 : 0);
 		cFYI(1, "Oplock release rc = %d", rc);
 	}
 
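
Every CIFSSMBLock() call in this patch gains a trailing argument; in the oplock-release path it carries the oplock level being acknowledged (1 while the client still holds read caching, i.e. a level II oplock, 0 otherwise). The updated prototype is presumably of this shape (parameter name illustrative):

	int CIFSSMBLock(const int xid, struct cifsTconInfo *tcon,
			const __u16 netfid, const __u64 len, const __u64 offset,
			const __u32 numUnlock, const __u32 numLock,
			const __u8 lockType, const bool waitFlag,
			const __u8 oplock_level);
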
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index a853a89857a5..8852470b4fbb 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -32,7 +32,7 @@
 #include "fscache.h"
 
 
-static void cifs_set_ops(struct inode *inode, const bool is_dfs_referral)
+static void cifs_set_ops(struct inode *inode)
 {
 	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
 
@@ -44,13 +44,17 @@ static void cifs_set_ops(struct inode *inode, const bool is_dfs_referral)
 				inode->i_fop = &cifs_file_direct_nobrl_ops;
 			else
 				inode->i_fop = &cifs_file_direct_ops;
+		} else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_STRICT_IO) {
+			if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL)
+				inode->i_fop = &cifs_file_strict_nobrl_ops;
+			else
+				inode->i_fop = &cifs_file_strict_ops;
 		} else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL)
 			inode->i_fop = &cifs_file_nobrl_ops;
 		else { /* not direct, send byte range locks */
 			inode->i_fop = &cifs_file_ops;
 		}
 
-
 		/* check if server can support readpages */
 		if (cifs_sb_master_tcon(cifs_sb)->ses->server->maxBuf <
 				PAGE_CACHE_SIZE + MAX_CIFS_HDR_SIZE)
@@ -60,7 +64,7 @@ static void cifs_set_ops(struct inode *inode, const bool is_dfs_referral)
 		break;
 	case S_IFDIR:
 #ifdef CONFIG_CIFS_DFS_UPCALL
-		if (is_dfs_referral) {
+		if (IS_AUTOMOUNT(inode)) {
 			inode->i_op = &cifs_dfs_referral_inode_operations;
 		} else {
 #else /* NO DFS support, treat as a directory */
@@ -167,7 +171,9 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr)
 	}
 	spin_unlock(&inode->i_lock);
 
-	cifs_set_ops(inode, fattr->cf_flags & CIFS_FATTR_DFS_REFERRAL);
+	if (fattr->cf_flags & CIFS_FATTR_DFS_REFERRAL)
+		inode->i_flags |= S_AUTOMOUNT;
+	cifs_set_ops(inode);
 }
 
 void
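
IS_AUTOMOUNT() is the generic VFS test the S_IFDIR case now relies on; it simply checks the inode flag set in cifs_fattr_to_inode() above. For reference, the 2.6.38-era definition in include/linux/fs.h:

	#define IS_AUTOMOUNT(inode)	((inode)->i_flags & S_AUTOMOUNT)
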
@@ -518,6 +524,7 @@ cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info,
 
 	fattr->cf_eof = le64_to_cpu(info->EndOfFile);
 	fattr->cf_bytes = le64_to_cpu(info->AllocationSize);
+	fattr->cf_createtime = le64_to_cpu(info->CreationTime);
 
 	if (fattr->cf_cifsattrs & ATTR_DIRECTORY) {
 		fattr->cf_mode = S_IFDIR | cifs_sb->mnt_dir_mode;
@@ -779,6 +786,10 @@ cifs_find_inode(struct inode *inode, void *opaque)
 	if (CIFS_I(inode)->uniqueid != fattr->cf_uniqueid)
 		return 0;
 
+	/* use createtime like an i_generation field */
+	if (CIFS_I(inode)->createtime != fattr->cf_createtime)
+		return 0;
+
 	/* don't match inode of different type */
 	if ((inode->i_mode & S_IFMT) != (fattr->cf_mode & S_IFMT))
 		return 0;
@@ -796,6 +807,7 @@ cifs_init_inode(struct inode *inode, void *opaque)
 	struct cifs_fattr *fattr = (struct cifs_fattr *) opaque;
 
 	CIFS_I(inode)->uniqueid = fattr->cf_uniqueid;
+	CIFS_I(inode)->createtime = fattr->cf_createtime;
 	return 0;
 }
 
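
These two callbacks feed the inode-cache lookup, which is why createtime has to be recorded on both sides. Sketched usage (cifs_iget() in this file passes them to iget5_locked(), simplified here): a recycled server uniqueid with a different create time now misses in the cache instead of reviving a stale inode.

	inode = iget5_locked(sb, hash, cifs_find_inode, cifs_init_inode, fattr);
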
@@ -1318,10 +1330,6 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
 /*BB check (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID ) to see if need
 	to set uid/gid */
 			inc_nlink(inode);
-			if (pTcon->nocase)
-				d_set_d_op(direntry, &cifs_ci_dentry_ops);
-			else
-				d_set_d_op(direntry, &cifs_dentry_ops);
 
 			cifs_unix_basic_to_fattr(&fattr, pInfo, cifs_sb);
 			cifs_fill_uniqueid(inode->i_sb, &fattr);
@@ -1362,10 +1370,6 @@ mkdir_get_info:
 			rc = cifs_get_inode_info(&newinode, full_path, NULL,
 						 inode->i_sb, xid, NULL);
 
-		if (pTcon->nocase)
-			d_set_d_op(direntry, &cifs_ci_dentry_ops);
-		else
-			d_set_d_op(direntry, &cifs_dentry_ops);
 		d_instantiate(direntry, newinode);
 		 /* setting nlink not necessary except in cases where we
 		  * failed to get it from the server or was set bogus */
@@ -1679,7 +1683,7 @@ cifs_inode_needs_reval(struct inode *inode)
 /*
  * Zap the cache. Called when invalid_mapping flag is set.
  */
-static void
+void
 cifs_invalidate_mapping(struct inode *inode)
 {
 	int rc;
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index fe2f6a93c49e..306769de2fb5 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -524,10 +524,6 @@ cifs_symlink(struct inode *inode, struct dentry *direntry, const char *symname)
 			cFYI(1, "Create symlink ok, getinodeinfo fail rc = %d",
 			     rc);
 		} else {
-			if (pTcon->nocase)
-				d_set_d_op(direntry, &cifs_ci_dentry_ops);
-			else
-				d_set_d_op(direntry, &cifs_dentry_ops);
 			d_instantiate(direntry, newinode);
 		}
 	}
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index 43f10281bc19..a09e077ba925 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -571,7 +571,7 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
 			pCifsInode = CIFS_I(netfile->dentry->d_inode);
 
 			cifs_set_oplock_level(pCifsInode,
-					      pSMB->OplockLevel);
+				pSMB->OplockLevel ? OPLOCK_READ : 0);
 			/*
 			 * cifs_oplock_break_put() can't be called
 			 * from here. Get reference after queueing
@@ -637,77 +637,6 @@ dump_smb(struct smb_hdr *smb_buf, int smb_buf_length)
 	return;
 }
 
-/* Convert 16 bit Unicode pathname to wire format from string in current code
-   page.  Conversion may involve remapping up the seven characters that are
-   only legal in POSIX-like OS (if they are present in the string). Path
-   names are little endian 16 bit Unicode on the wire */
-int
-cifsConvertToUCS(__le16 *target, const char *source, int maxlen,
-		 const struct nls_table *cp, int mapChars)
-{
-	int i, j, charlen;
-	int len_remaining = maxlen;
-	char src_char;
-	__u16 temp;
-
-	if (!mapChars)
-		return cifs_strtoUCS(target, source, PATH_MAX, cp);
-
-	for (i = 0, j = 0; i < maxlen; j++) {
-		src_char = source[i];
-		switch (src_char) {
-		case 0:
-			target[j] = 0;
-			goto ctoUCS_out;
-		case ':':
-			target[j] = cpu_to_le16(UNI_COLON);
-			break;
-		case '*':
-			target[j] = cpu_to_le16(UNI_ASTERIK);
-			break;
-		case '?':
-			target[j] = cpu_to_le16(UNI_QUESTION);
-			break;
-		case '<':
-			target[j] = cpu_to_le16(UNI_LESSTHAN);
-			break;
-		case '>':
-			target[j] = cpu_to_le16(UNI_GRTRTHAN);
-			break;
-		case '|':
-			target[j] = cpu_to_le16(UNI_PIPE);
-			break;
-		/* BB We can not handle remapping slash until
-		   all the calls to build_path_from_dentry
-		   are modified, as they use slash as separator BB */
-		/* case '\\':
-			target[j] = cpu_to_le16(UNI_SLASH);
-			break;*/
-		default:
-			charlen = cp->char2uni(source+i,
-				len_remaining, &temp);
-			/* if no match, use question mark, which
-			   at least in some cases servers as wild card */
-			if (charlen < 1) {
-				target[j] = cpu_to_le16(0x003f);
-				charlen = 1;
-			} else
-				target[j] = cpu_to_le16(temp);
-			len_remaining -= charlen;
-			/* character may take more than one byte in the
-			   the source string, but will take exactly two
-			   bytes in the target string */
-			i += charlen;
-			continue;
-		}
-		i++; /* move to next char in source string */
-		len_remaining--;
-	}
-
-ctoUCS_out:
-	return i;
-}
-
 void
 cifs_autodisable_serverino(struct cifs_sb_info *cifs_sb)
 {
diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c
index 9aad47a2d62f..8d9189f64477 100644
--- a/fs/cifs/netmisc.c
+++ b/fs/cifs/netmisc.c
@@ -899,8 +899,8 @@ map_smb_to_linux_error(struct smb_hdr *smb, int logErr)
 	}
 	/* else ERRHRD class errors or junk  - return EIO */
 
-	cFYI(1, "Mapping smb error code %d to POSIX err %d",
-		 smberrcode, rc);
+	cFYI(1, "Mapping smb error code 0x%x to POSIX err %d",
+		 le32_to_cpu(smb->Status.CifsError), rc);
 
 	/* generic corrective action e.g. reconnect SMB session on
 	 * ERRbaduid could be added */
@@ -916,14 +916,14 @@ unsigned int
 smbCalcSize(struct smb_hdr *ptr)
 {
 	return (sizeof(struct smb_hdr) + (2 * ptr->WordCount) +
-		2 /* size of the bcc field */ + BCC(ptr));
+		2 /* size of the bcc field */ + get_bcc(ptr));
 }
 
 unsigned int
 smbCalcSize_LE(struct smb_hdr *ptr)
 {
 	return (sizeof(struct smb_hdr) + (2 * ptr->WordCount) +
-		2 /* size of the bcc field */ + le16_to_cpu(BCC_LE(ptr)));
+		2 /* size of the bcc field */ + get_bcc_le(ptr));
 }
 
 /* The following are taken from fs/ntfs/util.c */
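
get_bcc()/get_bcc_le()/put_bcc_le() replace the raw BCC()/BCC_LE() macro uses throughout this series (see also the fs/cifs/sess.c hunks below). They are assumed to be thin accessors for the byte-count field, along these lines (a sketch, not the verbatim helpers):

	static inline __u16 get_bcc_le(struct smb_hdr *hdr)
	{
		__le16 *bc_ptr = (__le16 *)BCC(hdr);	/* byte count field */
		return le16_to_cpu(*bc_ptr);
	}

	static inline void put_bcc_le(__u16 count, struct smb_hdr *hdr)
	{
		__le16 *bc_ptr = (__le16 *)BCC(hdr);
		*bc_ptr = cpu_to_le16(count);
	}
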
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index ec5b68e3b928..7f25cc3d2256 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -102,11 +102,6 @@ cifs_readdir_lookup(struct dentry *parent, struct qstr *name,
 			return NULL;
 	}
 
-	if (cifs_sb_master_tcon(CIFS_SB(sb))->nocase)
-		d_set_d_op(dentry, &cifs_ci_dentry_ops);
-	else
-		d_set_d_op(dentry, &cifs_dentry_ops);
-
 	alias = d_materialise_unique(dentry, inode);
 	if (alias != NULL) {
 		dput(dentry);
@@ -160,6 +155,7 @@ cifs_dir_info_to_fattr(struct cifs_fattr *fattr, FILE_DIRECTORY_INFO *info,
 	fattr->cf_cifsattrs = le32_to_cpu(info->ExtFileAttributes);
 	fattr->cf_eof = le64_to_cpu(info->EndOfFile);
 	fattr->cf_bytes = le64_to_cpu(info->AllocationSize);
+	fattr->cf_createtime = le64_to_cpu(info->CreationTime);
 	fattr->cf_atime = cifs_NTtimeToUnix(info->LastAccessTime);
 	fattr->cf_ctime = cifs_NTtimeToUnix(info->ChangeTime);
 	fattr->cf_mtime = cifs_NTtimeToUnix(info->LastWriteTime);
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index 7b01d3f6eed6..1adc9625a344 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -277,7 +277,7 @@ static void ascii_ssetup_strings(char **pbcc_area, struct cifsSesInfo *ses,
 }
 
 static void
-decode_unicode_ssetup(char **pbcc_area, int bleft, struct cifsSesInfo *ses,
+decode_unicode_ssetup(char **pbcc_area, __u16 bleft, struct cifsSesInfo *ses,
 		      const struct nls_table *nls_cp)
 {
 	int len;
@@ -323,7 +323,7 @@ decode_unicode_ssetup(char **pbcc_area, int bleft, struct cifsSesInfo *ses,
 	return;
 }
 
-static int decode_ascii_ssetup(char **pbcc_area, int bleft,
+static int decode_ascii_ssetup(char **pbcc_area, __u16 bleft,
 			       struct cifsSesInfo *ses,
 			       const struct nls_table *nls_cp)
 {
@@ -420,7 +420,6 @@ static int decode_ntlmssp_challenge(char *bcc_ptr, int blob_len,
 	return 0;
 }
 
-#ifdef CONFIG_CIFS_EXPERIMENTAL
 /* BB Move to ntlmssp.c eventually */
 
 /* We do not malloc the blob, it is passed in pbuffer, because
@@ -431,13 +430,14 @@ static void build_ntlmssp_negotiate_blob(unsigned char *pbuffer,
 	NEGOTIATE_MESSAGE *sec_blob = (NEGOTIATE_MESSAGE *)pbuffer;
 	__u32 flags;
 
+	memset(pbuffer, 0, sizeof(NEGOTIATE_MESSAGE));
 	memcpy(sec_blob->Signature, NTLMSSP_SIGNATURE, 8);
 	sec_blob->MessageType = NtLmNegotiate;
 
 	/* BB is NTLMV2 session security format easier to use here? */
 	flags = NTLMSSP_NEGOTIATE_56 |	NTLMSSP_REQUEST_TARGET |
 		NTLMSSP_NEGOTIATE_128 | NTLMSSP_NEGOTIATE_UNICODE |
-		NTLMSSP_NEGOTIATE_NTLM;
+		NTLMSSP_NEGOTIATE_NTLM | NTLMSSP_NEGOTIATE_EXTENDED_SEC;
 	if (ses->server->secMode &
 	    (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) {
 		flags |= NTLMSSP_NEGOTIATE_SIGN;
@@ -446,7 +446,7 @@ static void build_ntlmssp_negotiate_blob(unsigned char *pbuffer,
 			NTLMSSP_NEGOTIATE_EXTENDED_SEC;
 	}
 
-	sec_blob->NegotiateFlags |= cpu_to_le32(flags);
+	sec_blob->NegotiateFlags = cpu_to_le32(flags);
 
 	sec_blob->WorkstationName.BufferOffset = 0;
 	sec_blob->WorkstationName.Length = 0;
@@ -477,7 +477,7 @@ static int build_ntlmssp_auth_blob(unsigned char *pbuffer,
 	flags = NTLMSSP_NEGOTIATE_56 |
 		NTLMSSP_REQUEST_TARGET | NTLMSSP_NEGOTIATE_TARGET_INFO |
 		NTLMSSP_NEGOTIATE_128 | NTLMSSP_NEGOTIATE_UNICODE |
-		NTLMSSP_NEGOTIATE_NTLM;
+		NTLMSSP_NEGOTIATE_NTLM | NTLMSSP_NEGOTIATE_EXTENDED_SEC;
 	if (ses->server->secMode &
 	   (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
 		flags |= NTLMSSP_NEGOTIATE_SIGN;
@@ -485,7 +485,7 @@ static int build_ntlmssp_auth_blob(unsigned char *pbuffer,
 		flags |= NTLMSSP_NEGOTIATE_ALWAYS_SIGN;
 
 	tmp = pbuffer + sizeof(AUTHENTICATE_MESSAGE);
-	sec_blob->NegotiateFlags |= cpu_to_le32(flags);
+	sec_blob->NegotiateFlags = cpu_to_le32(flags);
 
 	sec_blob->LmChallengeResponse.BufferOffset =
 				cpu_to_le32(sizeof(AUTHENTICATE_MESSAGE));
@@ -544,8 +544,9 @@ static int build_ntlmssp_auth_blob(unsigned char *pbuffer,
 	sec_blob->WorkstationName.MaximumLength = 0;
 	tmp += 2;
 
-	if ((ses->ntlmssp->server_flags & NTLMSSP_NEGOTIATE_KEY_XCH) &&
-			!calc_seckey(ses)) {
+	if (((ses->ntlmssp->server_flags & NTLMSSP_NEGOTIATE_KEY_XCH) ||
+		(ses->ntlmssp->server_flags & NTLMSSP_NEGOTIATE_EXTENDED_SEC))
+			&& !calc_seckey(ses)) {
 		memcpy(tmp, ses->ntlmssp->ciphertext, CIFS_CPHTXT_SIZE);
 		sec_blob->SessionKey.BufferOffset = cpu_to_le32(tmp - pbuffer);
 		sec_blob->SessionKey.Length = cpu_to_le16(CIFS_CPHTXT_SIZE);
@@ -563,17 +564,6 @@ setup_ntlmv2_ret:
 	return rc;
 }
 
-
-static void setup_ntlmssp_neg_req(SESSION_SETUP_ANDX *pSMB,
-				  struct cifsSesInfo *ses)
-{
-	build_ntlmssp_negotiate_blob(&pSMB->req.SecurityBlob[0], ses);
-	pSMB->req.SecurityBlobLength = cpu_to_le16(sizeof(NEGOTIATE_MESSAGE));
-
-	return;
-}
-#endif
-
 int
 CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses,
 	       const struct nls_table *nls_cp)
@@ -585,12 +575,11 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses,
 	char *str_area;
 	SESSION_SETUP_ANDX *pSMB;
 	__u32 capabilities;
-	int count;
+	__u16 count;
 	int resp_buf_type;
 	struct kvec iov[3];
 	enum securityEnum type;
-	__u16 action;
-	int bytes_remaining;
+	__u16 action, bytes_remaining;
 	struct key *spnego_key = NULL;
 	__le32 phase = NtLmNegotiate; /* NTLMSSP, if needed, is multistage */
 	u16 blob_len;
@@ -814,71 +803,70 @@ ssetup_ntlmssp_authenticate:
 		rc = -ENOSYS;
 		goto ssetup_exit;
 #endif /* CONFIG_CIFS_UPCALL */
-	} else {
-#ifdef CONFIG_CIFS_EXPERIMENTAL
-		if (type == RawNTLMSSP) {
-			if ((pSMB->req.hdr.Flags2 & SMBFLG2_UNICODE) == 0) {
-				cERROR(1, "NTLMSSP requires Unicode support");
-				rc = -ENOSYS;
+	} else if (type == RawNTLMSSP) {
+		if ((pSMB->req.hdr.Flags2 & SMBFLG2_UNICODE) == 0) {
+			cERROR(1, "NTLMSSP requires Unicode support");
+			rc = -ENOSYS;
+			goto ssetup_exit;
+		}
+
+		cFYI(1, "ntlmssp session setup phase %d", phase);
+		pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC;
+		capabilities |= CAP_EXTENDED_SECURITY;
+		pSMB->req.Capabilities |= cpu_to_le32(capabilities);
+		switch(phase) {
+		case NtLmNegotiate:
+			build_ntlmssp_negotiate_blob(
+				pSMB->req.SecurityBlob, ses);
+			iov[1].iov_len = sizeof(NEGOTIATE_MESSAGE);
+			iov[1].iov_base = pSMB->req.SecurityBlob;
+			pSMB->req.SecurityBlobLength =
+				cpu_to_le16(sizeof(NEGOTIATE_MESSAGE));
+			break;
+		case NtLmAuthenticate:
+			/*
+			 * 5 is an empirical value, large enough to hold
+			 * authenticate message plus max 10 of av pairs,
+			 * domain, user, workstation names, flags, etc.
+			 */
+			ntlmsspblob = kzalloc(
+				5*sizeof(struct _AUTHENTICATE_MESSAGE),
+				GFP_KERNEL);
+			if (!ntlmsspblob) {
+				cERROR(1, "Can't allocate NTLMSSP blob");
+				rc = -ENOMEM;
 				goto ssetup_exit;
 			}
 
-			cFYI(1, "ntlmssp session setup phase %d", phase);
-			pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC;
-			capabilities |= CAP_EXTENDED_SECURITY;
-			pSMB->req.Capabilities |= cpu_to_le32(capabilities);
-			if (phase == NtLmNegotiate) {
-				setup_ntlmssp_neg_req(pSMB, ses);
-				iov[1].iov_len = sizeof(NEGOTIATE_MESSAGE);
-				iov[1].iov_base = &pSMB->req.SecurityBlob[0];
-			} else if (phase == NtLmAuthenticate) {
-				/* 5 is an empirical value, large enought to
-				 * hold authenticate message, max 10 of
-				 * av paris, doamin,user,workstation mames,
-				 * flags etc..
-				 */
-				ntlmsspblob = kmalloc(
-					5*sizeof(struct _AUTHENTICATE_MESSAGE),
-					GFP_KERNEL);
-				if (!ntlmsspblob) {
-					cERROR(1, "Can't allocate NTLMSSP");
-					rc = -ENOMEM;
-					goto ssetup_exit;
-				}
-
-				rc = build_ntlmssp_auth_blob(ntlmsspblob,
-							&blob_len, ses, nls_cp);
-				if (rc)
-					goto ssetup_exit;
-				iov[1].iov_len = blob_len;
-				iov[1].iov_base = ntlmsspblob;
-				pSMB->req.SecurityBlobLength =
-						cpu_to_le16(blob_len);
-				/* Make sure that we tell the server that we
-				   are using the uid that it just gave us back
-				   on the response (challenge) */
-				smb_buf->Uid = ses->Suid;
-			} else {
-				cERROR(1, "invalid phase %d", phase);
-				rc = -ENOSYS;
+			rc = build_ntlmssp_auth_blob(ntlmsspblob,
+						&blob_len, ses, nls_cp);
+			if (rc)
 				goto ssetup_exit;
-			}
-			/* unicode strings must be word aligned */
-			if ((iov[0].iov_len + iov[1].iov_len) % 2) {
-				*bcc_ptr = 0;
-				bcc_ptr++;
-			}
-			unicode_oslm_strings(&bcc_ptr, nls_cp);
-		} else {
-			cERROR(1, "secType %d not supported!", type);
+			iov[1].iov_len = blob_len;
+			iov[1].iov_base = ntlmsspblob;
+			pSMB->req.SecurityBlobLength = cpu_to_le16(blob_len);
+			/*
+			 * Make sure that we tell the server that we are using
+			 * the uid that it just gave us back on the response
+			 * (challenge)
+			 */
+			smb_buf->Uid = ses->Suid;
+			break;
+		default:
+			cERROR(1, "invalid phase %d", phase);
 			rc = -ENOSYS;
 			goto ssetup_exit;
-#else
+		}
+		/* unicode strings must be word aligned */
+		if ((iov[0].iov_len + iov[1].iov_len) % 2) {
+			*bcc_ptr = 0;
+			bcc_ptr++;
+		}
+		unicode_oslm_strings(&bcc_ptr, nls_cp);
+	} else {
 		cERROR(1, "secType %d not supported!", type);
 		rc = -ENOSYS;
 		goto ssetup_exit;
-#endif
 	}
 
 	iov[2].iov_base = str_area;
@@ -887,10 +875,10 @@ ssetup_ntlmssp_authenticate:
887 count = iov[1].iov_len + iov[2].iov_len; 875 count = iov[1].iov_len + iov[2].iov_len;
888 smb_buf->smb_buf_length += count; 876 smb_buf->smb_buf_length += count;
889 877
890 BCC_LE(smb_buf) = cpu_to_le16(count); 878 put_bcc_le(count, smb_buf);
891 879
892 rc = SendReceive2(xid, ses, iov, 3 /* num_iovecs */, &resp_buf_type, 880 rc = SendReceive2(xid, ses, iov, 3 /* num_iovecs */, &resp_buf_type,
893 CIFS_STD_OP /* not long */ | CIFS_LOG_ERROR); 881 CIFS_LOG_ERROR);
894 /* SMB request buf freed in SendReceive2 */ 882 /* SMB request buf freed in SendReceive2 */
895 883
896 pSMB = (SESSION_SETUP_ANDX *)iov[0].iov_base; 884 pSMB = (SESSION_SETUP_ANDX *)iov[0].iov_base;
@@ -921,7 +909,7 @@ ssetup_ntlmssp_authenticate:
921 cFYI(1, "UID = %d ", ses->Suid); 909 cFYI(1, "UID = %d ", ses->Suid);
922 /* response can have either 3 or 4 word count - Samba sends 3 */ 910 /* response can have either 3 or 4 word count - Samba sends 3 */
923 /* and lanman response is 3 */ 911 /* and lanman response is 3 */
924 bytes_remaining = BCC(smb_buf); 912 bytes_remaining = get_bcc(smb_buf);
925 bcc_ptr = pByteArea(smb_buf); 913 bcc_ptr = pByteArea(smb_buf);
926 914
927 if (smb_buf->WordCount == 4) { 915 if (smb_buf->WordCount == 4) {
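
The sess.c hunks above replace the open-coded BCC()/BCC_LE() macros with get_bcc()/put_bcc_le() helper calls. As a minimal, non-authoritative sketch of what such helpers look like, assuming the byte count sits immediately after the parameter words (bcc_ptr_of() is a hypothetical name; the real definitions live in fs/cifs/cifspdu.h):

static inline __le16 *bcc_ptr_of(struct smb_hdr *hdr)
{
	/* BCC immediately follows the 2 * WordCount parameter words */
	return (__le16 *)((char *)hdr + sizeof(struct smb_hdr) +
			  (2 * hdr->WordCount));
}

static inline __u16 get_bcc(struct smb_hdr *hdr)
{
	return le16_to_cpu(*bcc_ptr_of(hdr));	/* CPU byte order */
}

static inline void put_bcc_le(__u16 count, struct smb_hdr *hdr)
{
	*bcc_ptr_of(hdr) = cpu_to_le16(count);	/* wire (LE) byte order */
}
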
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index e0588cdf4cc5..c1ccca1a933f 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -36,7 +36,13 @@
36 36
37extern mempool_t *cifs_mid_poolp; 37extern mempool_t *cifs_mid_poolp;
38 38
39static struct mid_q_entry * 39static void
40wake_up_task(struct mid_q_entry *mid)
41{
42 wake_up_process(mid->callback_data);
43}
44
45struct mid_q_entry *
40AllocMidQEntry(const struct smb_hdr *smb_buffer, struct TCP_Server_Info *server) 46AllocMidQEntry(const struct smb_hdr *smb_buffer, struct TCP_Server_Info *server)
41{ 47{
42 struct mid_q_entry *temp; 48 struct mid_q_entry *temp;
@@ -58,28 +64,28 @@ AllocMidQEntry(const struct smb_hdr *smb_buffer, struct TCP_Server_Info *server)
58 /* do_gettimeofday(&temp->when_sent);*/ /* easier to use jiffies */ 64 /* do_gettimeofday(&temp->when_sent);*/ /* easier to use jiffies */
59 /* when mid allocated can be before when sent */ 65 /* when mid allocated can be before when sent */
60 temp->when_alloc = jiffies; 66 temp->when_alloc = jiffies;
61 temp->tsk = current; 67
68 /*
69 * The default is for the mid to be synchronous, so the
70 * default callback just wakes up the current task.
71 */
72 temp->callback = wake_up_task;
73 temp->callback_data = current;
62 } 74 }
63 75
64 spin_lock(&GlobalMid_Lock);
65 list_add_tail(&temp->qhead, &server->pending_mid_q);
66 atomic_inc(&midCount); 76 atomic_inc(&midCount);
67 temp->midState = MID_REQUEST_ALLOCATED; 77 temp->midState = MID_REQUEST_ALLOCATED;
68 spin_unlock(&GlobalMid_Lock);
69 return temp; 78 return temp;
70} 79}
71 80
72static void 81void
73DeleteMidQEntry(struct mid_q_entry *midEntry) 82DeleteMidQEntry(struct mid_q_entry *midEntry)
74{ 83{
75#ifdef CONFIG_CIFS_STATS2 84#ifdef CONFIG_CIFS_STATS2
76 unsigned long now; 85 unsigned long now;
77#endif 86#endif
78 spin_lock(&GlobalMid_Lock);
79 midEntry->midState = MID_FREE; 87 midEntry->midState = MID_FREE;
80 list_del(&midEntry->qhead);
81 atomic_dec(&midCount); 88 atomic_dec(&midCount);
82 spin_unlock(&GlobalMid_Lock);
83 if (midEntry->largeBuf) 89 if (midEntry->largeBuf)
84 cifs_buf_release(midEntry->resp_buf); 90 cifs_buf_release(midEntry->resp_buf);
85 else 91 else
@@ -103,6 +109,16 @@ DeleteMidQEntry(struct mid_q_entry *midEntry)
103 mempool_free(midEntry, cifs_mid_poolp); 109 mempool_free(midEntry, cifs_mid_poolp);
104} 110}
105 111
112static void
113delete_mid(struct mid_q_entry *mid)
114{
115 spin_lock(&GlobalMid_Lock);
116 list_del(&mid->qhead);
117 spin_unlock(&GlobalMid_Lock);
118
119 DeleteMidQEntry(mid);
120}
121
106static int 122static int
107smb_sendv(struct TCP_Server_Info *server, struct kvec *iov, int n_vec) 123smb_sendv(struct TCP_Server_Info *server, struct kvec *iov, int n_vec)
108{ 124{
@@ -119,7 +135,7 @@ smb_sendv(struct TCP_Server_Info *server, struct kvec *iov, int n_vec)
119 if (ssocket == NULL) 135 if (ssocket == NULL)
120 return -ENOTSOCK; /* BB eventually add reconnect code here */ 136 return -ENOTSOCK; /* BB eventually add reconnect code here */
121 137
122 smb_msg.msg_name = (struct sockaddr *) &server->addr.sockAddr; 138 smb_msg.msg_name = (struct sockaddr *) &server->dstaddr;
123 smb_msg.msg_namelen = sizeof(struct sockaddr); 139 smb_msg.msg_namelen = sizeof(struct sockaddr);
124 smb_msg.msg_control = NULL; 140 smb_msg.msg_control = NULL;
125 smb_msg.msg_controllen = 0; 141 smb_msg.msg_controllen = 0;
@@ -244,31 +260,31 @@ smb_send(struct TCP_Server_Info *server, struct smb_hdr *smb_buffer,
244 return smb_sendv(server, &iov, 1); 260 return smb_sendv(server, &iov, 1);
245} 261}
246 262
247static int wait_for_free_request(struct cifsSesInfo *ses, const int long_op) 263static int wait_for_free_request(struct TCP_Server_Info *server,
264 const int long_op)
248{ 265{
249 if (long_op == CIFS_ASYNC_OP) { 266 if (long_op == CIFS_ASYNC_OP) {
250 /* oplock breaks must not be held up */ 267 /* oplock breaks must not be held up */
251 atomic_inc(&ses->server->inFlight); 268 atomic_inc(&server->inFlight);
252 return 0; 269 return 0;
253 } 270 }
254 271
255 spin_lock(&GlobalMid_Lock); 272 spin_lock(&GlobalMid_Lock);
256 while (1) { 273 while (1) {
257 if (atomic_read(&ses->server->inFlight) >= 274 if (atomic_read(&server->inFlight) >= cifs_max_pending) {
258 cifs_max_pending){
259 spin_unlock(&GlobalMid_Lock); 275 spin_unlock(&GlobalMid_Lock);
260#ifdef CONFIG_CIFS_STATS2 276#ifdef CONFIG_CIFS_STATS2
261 atomic_inc(&ses->server->num_waiters); 277 atomic_inc(&server->num_waiters);
262#endif 278#endif
263 wait_event(ses->server->request_q, 279 wait_event(server->request_q,
264 atomic_read(&ses->server->inFlight) 280 atomic_read(&server->inFlight)
265 < cifs_max_pending); 281 < cifs_max_pending);
266#ifdef CONFIG_CIFS_STATS2 282#ifdef CONFIG_CIFS_STATS2
267 atomic_dec(&ses->server->num_waiters); 283 atomic_dec(&server->num_waiters);
268#endif 284#endif
269 spin_lock(&GlobalMid_Lock); 285 spin_lock(&GlobalMid_Lock);
270 } else { 286 } else {
271 if (ses->server->tcpStatus == CifsExiting) { 287 if (server->tcpStatus == CifsExiting) {
272 spin_unlock(&GlobalMid_Lock); 288 spin_unlock(&GlobalMid_Lock);
273 return -ENOENT; 289 return -ENOENT;
274 } 290 }
@@ -278,7 +294,7 @@ static int wait_for_free_request(struct cifsSesInfo *ses, const int long_op)
278 294
279 /* update # of requests on the wire to server */ 295 /* update # of requests on the wire to server */
280 if (long_op != CIFS_BLOCKING_OP) 296 if (long_op != CIFS_BLOCKING_OP)
281 atomic_inc(&ses->server->inFlight); 297 atomic_inc(&server->inFlight);
282 spin_unlock(&GlobalMid_Lock); 298 spin_unlock(&GlobalMid_Lock);
283 break; 299 break;
284 } 300 }
@@ -308,53 +324,81 @@ static int allocate_mid(struct cifsSesInfo *ses, struct smb_hdr *in_buf,
308 *ppmidQ = AllocMidQEntry(in_buf, ses->server); 324 *ppmidQ = AllocMidQEntry(in_buf, ses->server);
309 if (*ppmidQ == NULL) 325 if (*ppmidQ == NULL)
310 return -ENOMEM; 326 return -ENOMEM;
327 spin_lock(&GlobalMid_Lock);
328 list_add_tail(&(*ppmidQ)->qhead, &ses->server->pending_mid_q);
329 spin_unlock(&GlobalMid_Lock);
311 return 0; 330 return 0;
312} 331}
313 332
314static int wait_for_response(struct cifsSesInfo *ses, 333static int
315 struct mid_q_entry *midQ, 334wait_for_response(struct TCP_Server_Info *server, struct mid_q_entry *midQ)
316 unsigned long timeout,
317 unsigned long time_to_wait)
318{ 335{
319 unsigned long curr_timeout; 336 int error;
320 337
321 for (;;) { 338 error = wait_event_killable(server->response_q,
322 curr_timeout = timeout + jiffies; 339 midQ->midState != MID_REQUEST_SUBMITTED);
323 wait_event_timeout(ses->server->response_q, 340 if (error < 0)
324 midQ->midState != MID_REQUEST_SUBMITTED, timeout); 341 return -ERESTARTSYS;
325 342
326 if (time_after(jiffies, curr_timeout) && 343 return 0;
327 (midQ->midState == MID_REQUEST_SUBMITTED) && 344}
328 ((ses->server->tcpStatus == CifsGood) ||
329 (ses->server->tcpStatus == CifsNew))) {
330 345
331 unsigned long lrt;
332 346
333 /* We timed out. Is the server still 347/*
334 sending replies ? */ 348 * Send an SMB request and set the callback function in the mid to handle
335 spin_lock(&GlobalMid_Lock); 349 * the result. Caller is responsible for dealing with timeouts.
336 lrt = ses->server->lstrp; 350 */
337 spin_unlock(&GlobalMid_Lock); 351int
352cifs_call_async(struct TCP_Server_Info *server, struct smb_hdr *in_buf,
353 mid_callback_t *callback, void *cbdata)
354{
355 int rc;
356 struct mid_q_entry *mid;
338 357
339 /* Calculate time_to_wait past last receive time. 358 rc = wait_for_free_request(server, CIFS_ASYNC_OP);
340 Although we prefer not to time out if the 359 if (rc)
341 server is still responding - we will time 360 return rc;
342 out if the server takes more than 15 (or 45 361
343 or 180) seconds to respond to this request 362 mutex_lock(&server->srv_mutex);
344 and has not responded to any request from 363 mid = AllocMidQEntry(in_buf, server);
345 other threads on the client within 10 seconds */ 364 if (mid == NULL) {
346 lrt += time_to_wait; 365 mutex_unlock(&server->srv_mutex);
347 if (time_after(jiffies, lrt)) { 366 return -ENOMEM;
348 /* No replies for time_to_wait. */
349 cERROR(1, "server not responding");
350 return -1;
351 }
352 } else {
353 return 0;
354 }
355 } 367 }
356}
357 368
369 /* put it on the pending_mid_q */
370 spin_lock(&GlobalMid_Lock);
371 list_add_tail(&mid->qhead, &server->pending_mid_q);
372 spin_unlock(&GlobalMid_Lock);
373
374 rc = cifs_sign_smb(in_buf, server, &mid->sequence_number);
375 if (rc) {
376 mutex_unlock(&server->srv_mutex);
377 goto out_err;
378 }
379
380 mid->callback = callback;
381 mid->callback_data = cbdata;
382 mid->midState = MID_REQUEST_SUBMITTED;
383#ifdef CONFIG_CIFS_STATS2
384 atomic_inc(&server->inSend);
385#endif
386 rc = smb_send(server, in_buf, in_buf->smb_buf_length);
387#ifdef CONFIG_CIFS_STATS2
388 atomic_dec(&server->inSend);
389 mid->when_sent = jiffies;
390#endif
391 mutex_unlock(&server->srv_mutex);
392 if (rc)
393 goto out_err;
394
395 return rc;
396out_err:
397 delete_mid(mid);
398 atomic_dec(&server->inFlight);
399 wake_up(&server->request_q);
400 return rc;
401}
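
For context, a hypothetical caller of the new async path (the my_* names are illustrative, not part of this patch). The callback fires from the demultiplex thread once a response is matched to the mid; this sketch assumes that thread has already unlinked the mid from pending_mid_q, so the callback only frees it and releases its inFlight slot, mirroring the error path of cifs_call_async() above:

static void my_probe_callback(struct mid_q_entry *mid)
{
	struct TCP_Server_Info *server = mid->callback_data;

	/* inspect mid->midState and mid->resp_buf as needed ... */
	DeleteMidQEntry(mid);
	atomic_dec(&server->inFlight);
	wake_up(&server->request_q);
}

static int my_send_probe(struct TCP_Server_Info *server, struct smb_hdr *buf)
{
	/* fire and forget; the reply is handled entirely in the callback */
	return cifs_call_async(server, buf, my_probe_callback, server);
}
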
358 402
359/* 403/*
360 * 404 *
@@ -382,6 +426,81 @@ SendReceiveNoRsp(const unsigned int xid, struct cifsSesInfo *ses,
382 return rc; 426 return rc;
383} 427}
384 428
429static int
430sync_mid_result(struct mid_q_entry *mid, struct TCP_Server_Info *server)
431{
432 int rc = 0;
433
434 cFYI(1, "%s: cmd=%d mid=%d state=%d", __func__, mid->command,
435 mid->mid, mid->midState);
436
437 spin_lock(&GlobalMid_Lock);
438 /* ensure that it's no longer on the pending_mid_q */
439 list_del_init(&mid->qhead);
440
441 switch (mid->midState) {
442 case MID_RESPONSE_RECEIVED:
443 spin_unlock(&GlobalMid_Lock);
444 return rc;
445 case MID_REQUEST_SUBMITTED:
446 /* socket is going down, reject all calls */
447 if (server->tcpStatus == CifsExiting) {
448 cERROR(1, "%s: canceling mid=%d cmd=0x%x state=%d",
449 __func__, mid->mid, mid->command, mid->midState);
450 rc = -EHOSTDOWN;
451 break;
452 }
453 case MID_RETRY_NEEDED:
454 rc = -EAGAIN;
455 break;
456 default:
457 cERROR(1, "%s: invalid mid state mid=%d state=%d", __func__,
458 mid->mid, mid->midState);
459 rc = -EIO;
460 }
461 spin_unlock(&GlobalMid_Lock);
462
463 DeleteMidQEntry(mid);
464 return rc;
465}
466
467/*
468 * An NT cancel request header looks just like the original request except:
469 *
470 * The Command is SMB_COM_NT_CANCEL
471 * The WordCount is zeroed out
472 * The ByteCount is zeroed out
473 *
474 * This function mangles an existing request buffer into a
475 * SMB_COM_NT_CANCEL request and then sends it.
476 */
477static int
478send_nt_cancel(struct TCP_Server_Info *server, struct smb_hdr *in_buf,
479 struct mid_q_entry *mid)
480{
481 int rc = 0;
482
483 /* -4 for RFC1001 length and +2 for BCC field */
484 in_buf->smb_buf_length = sizeof(struct smb_hdr) - 4 + 2;
485 in_buf->Command = SMB_COM_NT_CANCEL;
486 in_buf->WordCount = 0;
487 put_bcc_le(0, in_buf);
488
489 mutex_lock(&server->srv_mutex);
490 rc = cifs_sign_smb(in_buf, server, &mid->sequence_number);
491 if (rc) {
492 mutex_unlock(&server->srv_mutex);
493 return rc;
494 }
495 rc = smb_send(server, in_buf, in_buf->smb_buf_length);
496 mutex_unlock(&server->srv_mutex);
497
498 cFYI(1, "issued NT_CANCEL for mid %u, rc = %d",
499 in_buf->Mid, rc);
500
501 return rc;
502}
503
385int 504int
386SendReceive2(const unsigned int xid, struct cifsSesInfo *ses, 505SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
387 struct kvec *iov, int n_vec, int *pRespBufType /* ret */, 506 struct kvec *iov, int n_vec, int *pRespBufType /* ret */,
@@ -390,7 +509,6 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
390 int rc = 0; 509 int rc = 0;
391 int long_op; 510 int long_op;
392 unsigned int receive_len; 511 unsigned int receive_len;
393 unsigned long timeout;
394 struct mid_q_entry *midQ; 512 struct mid_q_entry *midQ;
395 struct smb_hdr *in_buf = iov[0].iov_base; 513 struct smb_hdr *in_buf = iov[0].iov_base;
396 514
@@ -413,7 +531,7 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
413 to the same server. We may make this configurable later or 531 to the same server. We may make this configurable later or
414 use ses->maxReq */ 532 use ses->maxReq */
415 533
416 rc = wait_for_free_request(ses, long_op); 534 rc = wait_for_free_request(ses->server, long_op);
417 if (rc) { 535 if (rc) {
418 cifs_small_buf_release(in_buf); 536 cifs_small_buf_release(in_buf);
419 return rc; 537 return rc;
@@ -457,65 +575,20 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
457 if (rc < 0) 575 if (rc < 0)
458 goto out; 576 goto out;
459 577
460 if (long_op == CIFS_STD_OP) 578 if (long_op == CIFS_ASYNC_OP)
461 timeout = 15 * HZ;
462 else if (long_op == CIFS_VLONG_OP) /* e.g. slow writes past EOF */
463 timeout = 180 * HZ;
464 else if (long_op == CIFS_LONG_OP)
465 timeout = 45 * HZ; /* should be greater than
466 servers oplock break timeout (about 43 seconds) */
467 else if (long_op == CIFS_ASYNC_OP)
468 goto out; 579 goto out;
469 else if (long_op == CIFS_BLOCKING_OP)
470 timeout = 0x7FFFFFFF; /* large, but not so large as to wrap */
471 else {
472 cERROR(1, "unknown timeout flag %d", long_op);
473 rc = -EIO;
474 goto out;
475 }
476
477 /* wait for 15 seconds or until woken up due to response arriving or
478 due to last connection to this server being unmounted */
479 if (signal_pending(current)) {
480 /* if signal pending do not hold up user for full smb timeout
481 but we still give response a chance to complete */
482 timeout = 2 * HZ;
483 }
484
485 /* No user interrupts in wait - wreaks havoc with performance */
486 wait_for_response(ses, midQ, timeout, 10 * HZ);
487
488 spin_lock(&GlobalMid_Lock);
489 580
490 if (midQ->resp_buf == NULL) { 581 rc = wait_for_response(ses->server, midQ);
491 cERROR(1, "No response to cmd %d mid %d", 582 if (rc != 0)
492 midQ->command, midQ->mid); 583 goto out;
493 if (midQ->midState == MID_REQUEST_SUBMITTED) {
494 if (ses->server->tcpStatus == CifsExiting)
495 rc = -EHOSTDOWN;
496 else {
497 ses->server->tcpStatus = CifsNeedReconnect;
498 midQ->midState = MID_RETRY_NEEDED;
499 }
500 }
501 584
502 if (rc != -EHOSTDOWN) { 585 rc = sync_mid_result(midQ, ses->server);
503 if (midQ->midState == MID_RETRY_NEEDED) { 586 if (rc != 0) {
504 rc = -EAGAIN;
505 cFYI(1, "marking request for retry");
506 } else {
507 rc = -EIO;
508 }
509 }
510 spin_unlock(&GlobalMid_Lock);
511 DeleteMidQEntry(midQ);
512 /* Update # of requests on wire to server */
513 atomic_dec(&ses->server->inFlight); 587 atomic_dec(&ses->server->inFlight);
514 wake_up(&ses->server->request_q); 588 wake_up(&ses->server->request_q);
515 return rc; 589 return rc;
516 } 590 }
517 591
518 spin_unlock(&GlobalMid_Lock);
519 receive_len = midQ->resp_buf->smb_buf_length; 592 receive_len = midQ->resp_buf->smb_buf_length;
520 593
521 if (receive_len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE) { 594 if (receive_len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE) {
@@ -559,19 +632,18 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
559 if (receive_len >= sizeof(struct smb_hdr) - 4 632 if (receive_len >= sizeof(struct smb_hdr) - 4
560 /* do not count RFC1001 header */ + 633 /* do not count RFC1001 header */ +
561 (2 * midQ->resp_buf->WordCount) + 2 /* bcc */ ) 634 (2 * midQ->resp_buf->WordCount) + 2 /* bcc */ )
562 BCC(midQ->resp_buf) = 635 put_bcc(get_bcc_le(midQ->resp_buf), midQ->resp_buf);
563 le16_to_cpu(BCC_LE(midQ->resp_buf));
564 if ((flags & CIFS_NO_RESP) == 0) 636 if ((flags & CIFS_NO_RESP) == 0)
565 midQ->resp_buf = NULL; /* mark it so buf will 637 midQ->resp_buf = NULL; /* mark it so buf will
566 not be freed by 638 not be freed by
567 DeleteMidQEntry */ 639 delete_mid */
568 } else { 640 } else {
569 rc = -EIO; 641 rc = -EIO;
570 cFYI(1, "Bad MID state?"); 642 cFYI(1, "Bad MID state?");
571 } 643 }
572 644
573out: 645out:
574 DeleteMidQEntry(midQ); 646 delete_mid(midQ);
575 atomic_dec(&ses->server->inFlight); 647 atomic_dec(&ses->server->inFlight);
576 wake_up(&ses->server->request_q); 648 wake_up(&ses->server->request_q);
577 649
@@ -585,7 +657,6 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
585{ 657{
586 int rc = 0; 658 int rc = 0;
587 unsigned int receive_len; 659 unsigned int receive_len;
588 unsigned long timeout;
589 struct mid_q_entry *midQ; 660 struct mid_q_entry *midQ;
590 661
591 if (ses == NULL) { 662 if (ses == NULL) {
@@ -610,7 +681,7 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
610 return -EIO; 681 return -EIO;
611 } 682 }
612 683
613 rc = wait_for_free_request(ses, long_op); 684 rc = wait_for_free_request(ses->server, long_op);
614 if (rc) 685 if (rc)
615 return rc; 686 return rc;
616 687
@@ -649,64 +720,20 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
649 if (rc < 0) 720 if (rc < 0)
650 goto out; 721 goto out;
651 722
652 if (long_op == CIFS_STD_OP) 723 if (long_op == CIFS_ASYNC_OP)
653 timeout = 15 * HZ;
654 /* wait for 15 seconds or until woken up due to response arriving or
655 due to last connection to this server being unmounted */
656 else if (long_op == CIFS_ASYNC_OP)
657 goto out; 724 goto out;
658 else if (long_op == CIFS_VLONG_OP) /* writes past EOF can be slow */
659 timeout = 180 * HZ;
660 else if (long_op == CIFS_LONG_OP)
661 timeout = 45 * HZ; /* should be greater than
662 server's oplock break timeout (about 43 seconds) */
663 else if (long_op == CIFS_BLOCKING_OP)
664 timeout = 0x7FFFFFFF; /* large but not so large as to wrap */
665 else {
666 cERROR(1, "unknown timeout flag %d", long_op);
667 rc = -EIO;
668 goto out;
669 }
670 725
671 if (signal_pending(current)) { 726 rc = wait_for_response(ses->server, midQ);
672 /* if signal pending do not hold up user for full smb timeout 727 if (rc != 0)
673 but we still give response a chance to complete */ 728 goto out;
674 timeout = 2 * HZ;
675 }
676
677 /* No user interrupts in wait - wreaks havoc with performance */
678 wait_for_response(ses, midQ, timeout, 10 * HZ);
679
680 spin_lock(&GlobalMid_Lock);
681 if (midQ->resp_buf == NULL) {
682 cERROR(1, "No response for cmd %d mid %d",
683 midQ->command, midQ->mid);
684 if (midQ->midState == MID_REQUEST_SUBMITTED) {
685 if (ses->server->tcpStatus == CifsExiting)
686 rc = -EHOSTDOWN;
687 else {
688 ses->server->tcpStatus = CifsNeedReconnect;
689 midQ->midState = MID_RETRY_NEEDED;
690 }
691 }
692 729
693 if (rc != -EHOSTDOWN) { 730 rc = sync_mid_result(midQ, ses->server);
694 if (midQ->midState == MID_RETRY_NEEDED) { 731 if (rc != 0) {
695 rc = -EAGAIN;
696 cFYI(1, "marking request for retry");
697 } else {
698 rc = -EIO;
699 }
700 }
701 spin_unlock(&GlobalMid_Lock);
702 DeleteMidQEntry(midQ);
703 /* Update # of requests on wire to server */
704 atomic_dec(&ses->server->inFlight); 732 atomic_dec(&ses->server->inFlight);
705 wake_up(&ses->server->request_q); 733 wake_up(&ses->server->request_q);
706 return rc; 734 return rc;
707 } 735 }
708 736
709 spin_unlock(&GlobalMid_Lock);
710 receive_len = midQ->resp_buf->smb_buf_length; 737 receive_len = midQ->resp_buf->smb_buf_length;
711 738
712 if (receive_len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE) { 739 if (receive_len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE) {
@@ -748,43 +775,20 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
748 if (receive_len >= sizeof(struct smb_hdr) - 4 775 if (receive_len >= sizeof(struct smb_hdr) - 4
749 /* do not count RFC1001 header */ + 776 /* do not count RFC1001 header */ +
750 (2 * out_buf->WordCount) + 2 /* bcc */ ) 777 (2 * out_buf->WordCount) + 2 /* bcc */ )
751 BCC(out_buf) = le16_to_cpu(BCC_LE(out_buf)); 778 put_bcc(get_bcc_le(midQ->resp_buf), midQ->resp_buf);
752 } else { 779 } else {
753 rc = -EIO; 780 rc = -EIO;
754 cERROR(1, "Bad MID state?"); 781 cERROR(1, "Bad MID state?");
755 } 782 }
756 783
757out: 784out:
758 DeleteMidQEntry(midQ); 785 delete_mid(midQ);
759 atomic_dec(&ses->server->inFlight); 786 atomic_dec(&ses->server->inFlight);
760 wake_up(&ses->server->request_q); 787 wake_up(&ses->server->request_q);
761 788
762 return rc; 789 return rc;
763} 790}
764 791
765/* Send an NT_CANCEL SMB to cause the POSIX blocking lock to return. */
766
767static int
768send_nt_cancel(struct cifsTconInfo *tcon, struct smb_hdr *in_buf,
769 struct mid_q_entry *midQ)
770{
771 int rc = 0;
772 struct cifsSesInfo *ses = tcon->ses;
773 __u16 mid = in_buf->Mid;
774
775 header_assemble(in_buf, SMB_COM_NT_CANCEL, tcon, 0);
776 in_buf->Mid = mid;
777 mutex_lock(&ses->server->srv_mutex);
778 rc = cifs_sign_smb(in_buf, ses->server, &midQ->sequence_number);
779 if (rc) {
780 mutex_unlock(&ses->server->srv_mutex);
781 return rc;
782 }
783 rc = smb_send(ses->server, in_buf, in_buf->smb_buf_length);
784 mutex_unlock(&ses->server->srv_mutex);
785 return rc;
786}
787
788/* We send a LOCKINGX_CANCEL_LOCK to cause the Windows 792/* We send a LOCKINGX_CANCEL_LOCK to cause the Windows
789 blocking lock to return. */ 793 blocking lock to return. */
790 794
@@ -807,7 +811,7 @@ send_lock_cancel(const unsigned int xid, struct cifsTconInfo *tcon,
807 pSMB->hdr.Mid = GetNextMid(ses->server); 811 pSMB->hdr.Mid = GetNextMid(ses->server);
808 812
809 return SendReceive(xid, ses, in_buf, out_buf, 813 return SendReceive(xid, ses, in_buf, out_buf,
810 &bytes_returned, CIFS_STD_OP); 814 &bytes_returned, 0);
811} 815}
812 816
813int 817int
@@ -845,7 +849,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
845 return -EIO; 849 return -EIO;
846 } 850 }
847 851
848 rc = wait_for_free_request(ses, CIFS_BLOCKING_OP); 852 rc = wait_for_free_request(ses->server, CIFS_BLOCKING_OP);
849 if (rc) 853 if (rc)
850 return rc; 854 return rc;
851 855
@@ -863,7 +867,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
863 867
864 rc = cifs_sign_smb(in_buf, ses->server, &midQ->sequence_number); 868 rc = cifs_sign_smb(in_buf, ses->server, &midQ->sequence_number);
865 if (rc) { 869 if (rc) {
866 DeleteMidQEntry(midQ); 870 delete_mid(midQ);
867 mutex_unlock(&ses->server->srv_mutex); 871 mutex_unlock(&ses->server->srv_mutex);
868 return rc; 872 return rc;
869 } 873 }
@@ -880,7 +884,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
880 mutex_unlock(&ses->server->srv_mutex); 884 mutex_unlock(&ses->server->srv_mutex);
881 885
882 if (rc < 0) { 886 if (rc < 0) {
883 DeleteMidQEntry(midQ); 887 delete_mid(midQ);
884 return rc; 888 return rc;
885 } 889 }
886 890
@@ -899,10 +903,9 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
899 if (in_buf->Command == SMB_COM_TRANSACTION2) { 903 if (in_buf->Command == SMB_COM_TRANSACTION2) {
900 /* POSIX lock. We send a NT_CANCEL SMB to cause the 904 /* POSIX lock. We send a NT_CANCEL SMB to cause the
901 blocking lock to return. */ 905 blocking lock to return. */
902 906 rc = send_nt_cancel(ses->server, in_buf, midQ);
903 rc = send_nt_cancel(tcon, in_buf, midQ);
904 if (rc) { 907 if (rc) {
905 DeleteMidQEntry(midQ); 908 delete_mid(midQ);
906 return rc; 909 return rc;
907 } 910 }
908 } else { 911 } else {
@@ -914,47 +917,22 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
914 /* If we get -ENOLCK back the lock may have 917 /* If we get -ENOLCK back the lock may have
915 already been removed. Don't exit in this case. */ 918 already been removed. Don't exit in this case. */
916 if (rc && rc != -ENOLCK) { 919 if (rc && rc != -ENOLCK) {
917 DeleteMidQEntry(midQ); 920 delete_mid(midQ);
918 return rc; 921 return rc;
919 } 922 }
920 } 923 }
921 924
922 /* Wait 5 seconds for the response. */ 925 if (wait_for_response(ses->server, midQ) == 0) {
923 if (wait_for_response(ses, midQ, 5 * HZ, 5 * HZ) == 0) {
924 /* We got the response - restart system call. */ 926 /* We got the response - restart system call. */
925 rstart = 1; 927 rstart = 1;
926 } 928 }
927 } 929 }
928 930
929 spin_lock(&GlobalMid_Lock); 931 rc = sync_mid_result(midQ, ses->server);
930 if (midQ->resp_buf) { 932 if (rc != 0)
931 spin_unlock(&GlobalMid_Lock);
932 receive_len = midQ->resp_buf->smb_buf_length;
933 } else {
934 cERROR(1, "No response for cmd %d mid %d",
935 midQ->command, midQ->mid);
936 if (midQ->midState == MID_REQUEST_SUBMITTED) {
937 if (ses->server->tcpStatus == CifsExiting)
938 rc = -EHOSTDOWN;
939 else {
940 ses->server->tcpStatus = CifsNeedReconnect;
941 midQ->midState = MID_RETRY_NEEDED;
942 }
943 }
944
945 if (rc != -EHOSTDOWN) {
946 if (midQ->midState == MID_RETRY_NEEDED) {
947 rc = -EAGAIN;
948 cFYI(1, "marking request for retry");
949 } else {
950 rc = -EIO;
951 }
952 }
953 spin_unlock(&GlobalMid_Lock);
954 DeleteMidQEntry(midQ);
955 return rc; 933 return rc;
956 }
957 934
935 receive_len = midQ->resp_buf->smb_buf_length;
958 if (receive_len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE) { 936 if (receive_len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE) {
959 cERROR(1, "Frame too large received. Length: %d Xid: %d", 937 cERROR(1, "Frame too large received. Length: %d Xid: %d",
960 receive_len, xid); 938 receive_len, xid);
@@ -998,10 +976,10 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
998 if (receive_len >= sizeof(struct smb_hdr) - 4 976 if (receive_len >= sizeof(struct smb_hdr) - 4
999 /* do not count RFC1001 header */ + 977 /* do not count RFC1001 header */ +
1000 (2 * out_buf->WordCount) + 2 /* bcc */ ) 978 (2 * out_buf->WordCount) + 2 /* bcc */ )
1001 BCC(out_buf) = le16_to_cpu(BCC_LE(out_buf)); 979 put_bcc(get_bcc_le(out_buf), out_buf);
1002 980
1003out: 981out:
1004 DeleteMidQEntry(midQ); 982 delete_mid(midQ);
1005 if (rstart && rc == -EACCES) 983 if (rstart && rc == -EACCES)
1006 return -ERESTARTSYS; 984 return -ERESTARTSYS;
1007 return rc; 985 return rc;
diff --git a/fs/coda/cache.c b/fs/coda/cache.c
index 5525e1c660fd..690157876184 100644
--- a/fs/coda/cache.c
+++ b/fs/coda/cache.c
@@ -20,10 +20,9 @@
20#include <linux/spinlock.h> 20#include <linux/spinlock.h>
21 21
22#include <linux/coda.h> 22#include <linux/coda.h>
23#include <linux/coda_linux.h>
24#include <linux/coda_psdev.h> 23#include <linux/coda_psdev.h>
25#include <linux/coda_fs_i.h> 24#include "coda_linux.h"
26#include <linux/coda_cache.h> 25#include "coda_cache.h"
27 26
28static atomic_t permission_epoch = ATOMIC_INIT(0); 27static atomic_t permission_epoch = ATOMIC_INIT(0);
29 28
diff --git a/fs/coda/cnode.c b/fs/coda/cnode.c
index 602240569c89..6475877b0763 100644
--- a/fs/coda/cnode.c
+++ b/fs/coda/cnode.c
@@ -7,9 +7,8 @@
7#include <linux/time.h> 7#include <linux/time.h>
8 8
9#include <linux/coda.h> 9#include <linux/coda.h>
10#include <linux/coda_linux.h>
11#include <linux/coda_fs_i.h>
12#include <linux/coda_psdev.h> 10#include <linux/coda_psdev.h>
11#include "coda_linux.h"
13 12
14static inline int coda_fideq(struct CodaFid *fid1, struct CodaFid *fid2) 13static inline int coda_fideq(struct CodaFid *fid1, struct CodaFid *fid2)
15{ 14{
diff --git a/fs/coda/coda_cache.h b/fs/coda/coda_cache.h
new file mode 100644
index 000000000000..c910b5eb1ceb
--- /dev/null
+++ b/fs/coda/coda_cache.h
@@ -0,0 +1,22 @@
1/* Coda filesystem -- Linux Minicache
2 *
3 * Copyright (C) 1989 - 1997 Carnegie Mellon University
4 *
5 * Carnegie Mellon University encourages users of this software to
6 * contribute improvements to the Coda project. Contact Peter Braam
7 * <coda@cs.cmu.edu>
8 */
9
10#ifndef _CFSNC_HEADER_
11#define _CFSNC_HEADER_
12
13/* credential cache */
14void coda_cache_enter(struct inode *inode, int mask);
15void coda_cache_clear_inode(struct inode *);
16void coda_cache_clear_all(struct super_block *sb);
17int coda_cache_check(struct inode *inode, int mask);
18
19/* for downcalls and attributes and lookups */
20void coda_flag_inode_children(struct inode *inode, int flag);
21
22#endif /* _CFSNC_HEADER_ */
diff --git a/fs/coda/coda_fs_i.h b/fs/coda/coda_fs_i.h
new file mode 100644
index 000000000000..e35071b1de0e
--- /dev/null
+++ b/fs/coda/coda_fs_i.h
@@ -0,0 +1,58 @@
1/*
2 * coda_fs_i.h
3 *
4 * Copyright (C) 1998 Carnegie Mellon University
5 *
6 */
7
8#ifndef _LINUX_CODA_FS_I
9#define _LINUX_CODA_FS_I
10
11#include <linux/types.h>
12#include <linux/list.h>
13#include <linux/spinlock.h>
14#include <linux/coda.h>
15
16/*
17 * coda fs inode data
18 * c_lock protects accesses to c_flags, c_mapcount, c_cached_epoch, c_uid and
19 * c_cached_perm.
20 * vfs_inode is set only when the inode is created and never changes.
21 * c_fid is set when the inode is created and should be considered immutable.
22 */
23struct coda_inode_info {
24 struct CodaFid c_fid; /* Coda identifier */
25 u_short c_flags; /* flags (see below) */
26 unsigned int c_mapcount; /* nr of times this inode is mapped */
27 unsigned int c_cached_epoch; /* epoch for cached permissions */
28 vuid_t c_uid; /* fsuid for cached permissions */
29 unsigned int c_cached_perm; /* cached access permissions */
30 spinlock_t c_lock;
31 struct inode vfs_inode;
32};
33
34/*
35 * coda fs file private data
36 */
37#define CODA_MAGIC 0xC0DAC0DA
38struct coda_file_info {
39 int cfi_magic; /* magic number */
40 struct file *cfi_container; /* container file for this cnode */
41 unsigned int cfi_mapcount; /* nr of times this file is mapped */
42};
43
44#define CODA_FTOC(file) ((struct coda_file_info *)((file)->private_data))
45
46/* flags */
47#define C_VATTR 0x1 /* Validity of vattr in inode */
48#define C_FLUSH 0x2 /* used after a flush */
49#define C_DYING 0x4 /* from venus (which died) */
50#define C_PURGE 0x8
51
52int coda_cnode_make(struct inode **, struct CodaFid *, struct super_block *);
53struct inode *coda_iget(struct super_block *sb, struct CodaFid *fid, struct coda_vattr *attr);
54int coda_cnode_makectl(struct inode **inode, struct super_block *sb);
55struct inode *coda_fid_to_inode(struct CodaFid *fid, struct super_block *sb);
56void coda_replace_fid(struct inode *, struct CodaFid *, struct CodaFid *);
57
58#endif
diff --git a/fs/coda/coda_linux.c b/fs/coda/coda_linux.c
index bf4a3fd3c8e3..2bdbcc11b373 100644
--- a/fs/coda/coda_linux.c
+++ b/fs/coda/coda_linux.c
@@ -17,9 +17,8 @@
17#include <linux/string.h> 17#include <linux/string.h>
18 18
19#include <linux/coda.h> 19#include <linux/coda.h>
20#include <linux/coda_linux.h>
21#include <linux/coda_psdev.h> 20#include <linux/coda_psdev.h>
22#include <linux/coda_fs_i.h> 21#include "coda_linux.h"
23 22
24/* initialize the debugging variables */ 23/* initialize the debugging variables */
25int coda_fake_statfs; 24int coda_fake_statfs;
diff --git a/fs/coda/coda_linux.h b/fs/coda/coda_linux.h
new file mode 100644
index 000000000000..9b0c5323890b
--- /dev/null
+++ b/fs/coda/coda_linux.h
@@ -0,0 +1,101 @@
1/*
2 * Coda File System, Linux Kernel module
3 *
4 * Original version, adapted from cfs_mach.c, (C) Carnegie Mellon University
5 * Linux modifications (C) 1996, Peter J. Braam
6 * Rewritten for Linux 2.1 (C) 1997 Carnegie Mellon University
7 *
8 * Carnegie Mellon University encourages users of this software to
9 * contribute improvements to the Coda project.
10 */
11
12#ifndef _LINUX_CODA_FS
13#define _LINUX_CODA_FS
14
15#include <linux/kernel.h>
16#include <linux/param.h>
17#include <linux/mm.h>
18#include <linux/vmalloc.h>
19#include <linux/slab.h>
20#include <linux/wait.h>
21#include <linux/types.h>
22#include <linux/fs.h>
23#include "coda_fs_i.h"
24
25/* operations */
26extern const struct inode_operations coda_dir_inode_operations;
27extern const struct inode_operations coda_file_inode_operations;
28extern const struct inode_operations coda_ioctl_inode_operations;
29
30extern const struct dentry_operations coda_dentry_operations;
31
32extern const struct address_space_operations coda_file_aops;
33extern const struct address_space_operations coda_symlink_aops;
34
35extern const struct file_operations coda_dir_operations;
36extern const struct file_operations coda_file_operations;
37extern const struct file_operations coda_ioctl_operations;
38
39/* operations shared over more than one file */
40int coda_open(struct inode *i, struct file *f);
41int coda_release(struct inode *i, struct file *f);
42int coda_permission(struct inode *inode, int mask, unsigned int flags);
43int coda_revalidate_inode(struct dentry *);
44int coda_getattr(struct vfsmount *, struct dentry *, struct kstat *);
45int coda_setattr(struct dentry *, struct iattr *);
46
47/* this file: helpers */
48char *coda_f2s(struct CodaFid *f);
49int coda_isroot(struct inode *i);
50int coda_iscontrol(const char *name, size_t length);
51
52void coda_vattr_to_iattr(struct inode *, struct coda_vattr *);
53void coda_iattr_to_vattr(struct iattr *, struct coda_vattr *);
54unsigned short coda_flags_to_cflags(unsigned short);
55
56/* sysctl.h */
57void coda_sysctl_init(void);
58void coda_sysctl_clean(void);
59
60#define CODA_ALLOC(ptr, cast, size) do { \
61 if (size < PAGE_SIZE) \
62 ptr = kmalloc((unsigned long) size, GFP_KERNEL); \
63 else \
64 ptr = (cast)vmalloc((unsigned long) size); \
65 if (!ptr) \
66 printk("kernel malloc returns 0 at %s:%d\n", __FILE__, __LINE__); \
67 else memset( ptr, 0, size ); \
68} while (0)
69
70
71#define CODA_FREE(ptr,size) \
72 do { if (size < PAGE_SIZE) kfree((ptr)); else vfree((ptr)); } while (0)
73
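Illustrative usage of the pair above (not taken from the patch): CODA_ALLOC() falls back from kmalloc() to vmalloc() for requests of a page or more and zeroes the result, so allocation and free must agree on the size:

	struct coda_vattr *attr;

	CODA_ALLOC(attr, struct coda_vattr *, sizeof(*attr));
	if (!attr)
		return -ENOMEM;
	/* ... fill in and use attr ... */
	CODA_FREE(attr, sizeof(*attr));
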
74/* inode to cnode access functions */
75
76static inline struct coda_inode_info *ITOC(struct inode *inode)
77{
78 return list_entry(inode, struct coda_inode_info, vfs_inode);
79}
80
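list_entry() above is container_of() under another name, so ITOC() is equivalent to the sketch below; container_of() arguably states the intent more directly, since no list is involved:

static inline struct coda_inode_info *ITOC(struct inode *inode)
{
	return container_of(inode, struct coda_inode_info, vfs_inode);
}
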
81static __inline__ struct CodaFid *coda_i2f(struct inode *inode)
82{
83 return &(ITOC(inode)->c_fid);
84}
85
86static __inline__ char *coda_i2s(struct inode *inode)
87{
88 return coda_f2s(&(ITOC(inode)->c_fid));
89}
90
91/* this will not zap the inode away */
92static __inline__ void coda_flag_inode(struct inode *inode, int flag)
93{
94 struct coda_inode_info *cii = ITOC(inode);
95
96 spin_lock(&cii->c_lock);
97 cii->c_flags |= flag;
98 spin_unlock(&cii->c_lock);
99}
100
101#endif
diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index 29badd91360f..2b8dae4d121e 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -23,10 +23,9 @@
23#include <asm/uaccess.h> 23#include <asm/uaccess.h>
24 24
25#include <linux/coda.h> 25#include <linux/coda.h>
26#include <linux/coda_linux.h>
27#include <linux/coda_psdev.h> 26#include <linux/coda_psdev.h>
28#include <linux/coda_fs_i.h> 27#include "coda_linux.h"
29#include <linux/coda_cache.h> 28#include "coda_cache.h"
30 29
31#include "coda_int.h" 30#include "coda_int.h"
32 31
@@ -61,7 +60,7 @@ static int coda_return_EIO(void)
61} 60}
62#define CODA_EIO_ERROR ((void *) (coda_return_EIO)) 61#define CODA_EIO_ERROR ((void *) (coda_return_EIO))
63 62
64static const struct dentry_operations coda_dentry_operations = 63const struct dentry_operations coda_dentry_operations =
65{ 64{
66 .d_revalidate = coda_dentry_revalidate, 65 .d_revalidate = coda_dentry_revalidate,
67 .d_delete = coda_dentry_delete, 66 .d_delete = coda_dentry_delete,
@@ -126,8 +125,6 @@ static struct dentry *coda_lookup(struct inode *dir, struct dentry *entry, struc
126 return ERR_PTR(error); 125 return ERR_PTR(error);
127 126
128exit: 127exit:
129 d_set_d_op(entry, &coda_dentry_operations);
130
131 if (inode && (type & CODA_NOCACHE)) 128 if (inode && (type & CODA_NOCACHE))
132 coda_flag_inode(inode, C_VATTR | C_PURGE); 129 coda_flag_inode(inode, C_VATTR | C_PURGE);
133 130
diff --git a/fs/coda/file.c b/fs/coda/file.c
index c8b50ba4366a..0433057be330 100644
--- a/fs/coda/file.c
+++ b/fs/coda/file.c
@@ -21,10 +21,9 @@
21#include <asm/uaccess.h> 21#include <asm/uaccess.h>
22 22
23#include <linux/coda.h> 23#include <linux/coda.h>
24#include <linux/coda_linux.h>
25#include <linux/coda_fs_i.h>
26#include <linux/coda_psdev.h> 24#include <linux/coda_psdev.h>
27 25
26#include "coda_linux.h"
28#include "coda_int.h" 27#include "coda_int.h"
29 28
30static ssize_t 29static ssize_t
diff --git a/fs/coda/inode.c b/fs/coda/inode.c
index 50dc7d189f56..871b27715465 100644
--- a/fs/coda/inode.c
+++ b/fs/coda/inode.c
@@ -28,10 +28,9 @@
28#include <linux/vmalloc.h> 28#include <linux/vmalloc.h>
29 29
30#include <linux/coda.h> 30#include <linux/coda.h>
31#include <linux/coda_linux.h>
32#include <linux/coda_psdev.h> 31#include <linux/coda_psdev.h>
33#include <linux/coda_fs_i.h> 32#include "coda_linux.h"
34#include <linux/coda_cache.h> 33#include "coda_cache.h"
35 34
36#include "coda_int.h" 35#include "coda_int.h"
37 36
@@ -45,7 +44,7 @@ static struct kmem_cache * coda_inode_cachep;
45static struct inode *coda_alloc_inode(struct super_block *sb) 44static struct inode *coda_alloc_inode(struct super_block *sb)
46{ 45{
47 struct coda_inode_info *ei; 46 struct coda_inode_info *ei;
48 ei = (struct coda_inode_info *)kmem_cache_alloc(coda_inode_cachep, GFP_KERNEL); 47 ei = kmem_cache_alloc(coda_inode_cachep, GFP_KERNEL);
49 if (!ei) 48 if (!ei)
50 return NULL; 49 return NULL;
51 memset(&ei->c_fid, 0, sizeof(struct CodaFid)); 50 memset(&ei->c_fid, 0, sizeof(struct CodaFid));
@@ -193,6 +192,7 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent)
193 sb->s_blocksize_bits = 12; 192 sb->s_blocksize_bits = 12;
194 sb->s_magic = CODA_SUPER_MAGIC; 193 sb->s_magic = CODA_SUPER_MAGIC;
195 sb->s_op = &coda_super_operations; 194 sb->s_op = &coda_super_operations;
195 sb->s_d_op = &coda_dentry_operations;
196 sb->s_bdi = &vc->bdi; 196 sb->s_bdi = &vc->bdi;
197 197
198 /* get root fid from Venus: this needs the root inode */ 198 /* get root fid from Venus: this needs the root inode */
diff --git a/fs/coda/pioctl.c b/fs/coda/pioctl.c
index 741f0bd03918..6cbb3afb36dc 100644
--- a/fs/coda/pioctl.c
+++ b/fs/coda/pioctl.c
@@ -19,10 +19,10 @@
19#include <asm/uaccess.h> 19#include <asm/uaccess.h>
20 20
21#include <linux/coda.h> 21#include <linux/coda.h>
22#include <linux/coda_linux.h>
23#include <linux/coda_fs_i.h>
24#include <linux/coda_psdev.h> 22#include <linux/coda_psdev.h>
25 23
24#include "coda_linux.h"
25
26/* pioctl ops */ 26/* pioctl ops */
27static int coda_ioctl_permission(struct inode *inode, int mask, unsigned int flags); 27static int coda_ioctl_permission(struct inode *inode, int mask, unsigned int flags);
28static long coda_pioctl(struct file *filp, unsigned int cmd, 28static long coda_pioctl(struct file *filp, unsigned int cmd,
diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c
index 62647a8595e4..8f616e0e252c 100644
--- a/fs/coda/psdev.c
+++ b/fs/coda/psdev.c
@@ -43,10 +43,10 @@
43#include <asm/uaccess.h> 43#include <asm/uaccess.h>
44 44
45#include <linux/coda.h> 45#include <linux/coda.h>
46#include <linux/coda_linux.h>
47#include <linux/coda_fs_i.h>
48#include <linux/coda_psdev.h> 46#include <linux/coda_psdev.h>
49 47
48#include "coda_linux.h"
49
50#include "coda_int.h" 50#include "coda_int.h"
51 51
52/* statistics */ 52/* statistics */
diff --git a/fs/coda/symlink.c b/fs/coda/symlink.c
index af78f007a2b0..ab94ef63caef 100644
--- a/fs/coda/symlink.c
+++ b/fs/coda/symlink.c
@@ -16,9 +16,9 @@
16#include <linux/pagemap.h> 16#include <linux/pagemap.h>
17 17
18#include <linux/coda.h> 18#include <linux/coda.h>
19#include <linux/coda_linux.h>
20#include <linux/coda_psdev.h> 19#include <linux/coda_psdev.h>
21#include <linux/coda_fs_i.h> 20
21#include "coda_linux.h"
22 22
23static int coda_symlink_filler(struct file *file, struct page *page) 23static int coda_symlink_filler(struct file *file, struct page *page)
24{ 24{
diff --git a/fs/coda/upcall.c b/fs/coda/upcall.c
index c3563cab9758..9727e0c52579 100644
--- a/fs/coda/upcall.c
+++ b/fs/coda/upcall.c
@@ -33,10 +33,9 @@
33#include <linux/vfs.h> 33#include <linux/vfs.h>
34 34
35#include <linux/coda.h> 35#include <linux/coda.h>
36#include <linux/coda_linux.h>
37#include <linux/coda_psdev.h> 36#include <linux/coda_psdev.h>
38#include <linux/coda_fs_i.h> 37#include "coda_linux.h"
39#include <linux/coda_cache.h> 38#include "coda_cache.h"
40 39
41#include "coda_int.h" 40#include "coda_int.h"
42 41
diff --git a/fs/compat.c b/fs/compat.c
index eb1740ac8c0a..f6fd0a00e6cc 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -257,7 +257,7 @@ static int put_compat_statfs(struct compat_statfs __user *ubuf, struct kstatfs *
257} 257}
258 258
259/* 259/*
260 * The following statfs calls are copies of code from fs/open.c and 260 * The following statfs calls are copies of code from fs/statfs.c and
261 * should be checked against those from time to time 261 * should be checked against those from time to time
262 */ 262 */
263asmlinkage long compat_sys_statfs(const char __user *pathname, struct compat_statfs __user *buf) 263asmlinkage long compat_sys_statfs(const char __user *pathname, struct compat_statfs __user *buf)
@@ -320,7 +320,9 @@ static int put_compat_statfs64(struct compat_statfs64 __user *ubuf, struct kstat
320 __put_user(kbuf->f_namelen, &ubuf->f_namelen) || 320 __put_user(kbuf->f_namelen, &ubuf->f_namelen) ||
321 __put_user(kbuf->f_fsid.val[0], &ubuf->f_fsid.val[0]) || 321 __put_user(kbuf->f_fsid.val[0], &ubuf->f_fsid.val[0]) ||
322 __put_user(kbuf->f_fsid.val[1], &ubuf->f_fsid.val[1]) || 322 __put_user(kbuf->f_fsid.val[1], &ubuf->f_fsid.val[1]) ||
323 __put_user(kbuf->f_frsize, &ubuf->f_frsize)) 323 __put_user(kbuf->f_frsize, &ubuf->f_frsize) ||
324 __put_user(kbuf->f_flags, &ubuf->f_flags) ||
325 __clear_user(ubuf->f_spare, sizeof(ubuf->f_spare)))
324 return -EFAULT; 326 return -EFAULT;
325 return 0; 327 return 0;
326} 328}
@@ -597,10 +599,8 @@ ssize_t compat_rw_copy_check_uvector(int type,
597 if (nr_segs > fast_segs) { 599 if (nr_segs > fast_segs) {
598 ret = -ENOMEM; 600 ret = -ENOMEM;
599 iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL); 601 iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL);
600 if (iov == NULL) { 602 if (iov == NULL)
601 *ret_pointer = fast_pointer;
602 goto out; 603 goto out;
603 }
604 } 604 }
605 *ret_pointer = iov; 605 *ret_pointer = iov;
606 606
diff --git a/fs/configfs/Kconfig b/fs/configfs/Kconfig
index 13587cc97a0b..9febcdefdfdc 100644
--- a/fs/configfs/Kconfig
+++ b/fs/configfs/Kconfig
@@ -1,8 +1,8 @@
1config CONFIGFS_FS 1config CONFIGFS_FS
2 tristate "Userspace-driven configuration filesystem" 2 tristate "Userspace-driven configuration filesystem"
3 depends on SYSFS 3 select SYSFS
4 help 4 help
5 configfs is a ram-based filesystem that provides the converse 5 configfs is a RAM-based filesystem that provides the converse
6 of sysfs's functionality. Where sysfs is a filesystem-based 6 of sysfs's functionality. Where sysfs is a filesystem-based
7 view of kernel objects, configfs is a filesystem-based manager 7 view of kernel objects, configfs is a filesystem-based manager
8 of kernel objects, or config_items. 8 of kernel objects, or config_items.
diff --git a/fs/configfs/configfs_internal.h b/fs/configfs/configfs_internal.h
index 026cf68553a4..82bda8fdfc1c 100644
--- a/fs/configfs/configfs_internal.h
+++ b/fs/configfs/configfs_internal.h
@@ -90,6 +90,7 @@ extern const struct file_operations configfs_file_operations;
90extern const struct file_operations bin_fops; 90extern const struct file_operations bin_fops;
91extern const struct inode_operations configfs_dir_inode_operations; 91extern const struct inode_operations configfs_dir_inode_operations;
92extern const struct inode_operations configfs_symlink_inode_operations; 92extern const struct inode_operations configfs_symlink_inode_operations;
93extern const struct dentry_operations configfs_dentry_ops;
93 94
94extern int configfs_symlink(struct inode *dir, struct dentry *dentry, 95extern int configfs_symlink(struct inode *dir, struct dentry *dentry,
95 const char *symname); 96 const char *symname);
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index 36637a8c1ed3..90ff3cb10de3 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -72,7 +72,7 @@ static int configfs_d_delete(const struct dentry *dentry)
72 return 1; 72 return 1;
73} 73}
74 74
75static const struct dentry_operations configfs_dentry_ops = { 75const struct dentry_operations configfs_dentry_ops = {
76 .d_iput = configfs_d_iput, 76 .d_iput = configfs_d_iput,
77 /* simple_delete_dentry() isn't exported */ 77 /* simple_delete_dentry() isn't exported */
78 .d_delete = configfs_d_delete, 78 .d_delete = configfs_d_delete,
@@ -442,7 +442,6 @@ static int configfs_attach_attr(struct configfs_dirent * sd, struct dentry * den
442 return error; 442 return error;
443 } 443 }
444 444
445 d_set_d_op(dentry, &configfs_dentry_ops);
446 d_rehash(dentry); 445 d_rehash(dentry);
447 446
448 return 0; 447 return 0;
@@ -489,7 +488,6 @@ static struct dentry * configfs_lookup(struct inode *dir,
489 */ 488 */
490 if (dentry->d_name.len > NAME_MAX) 489 if (dentry->d_name.len > NAME_MAX)
491 return ERR_PTR(-ENAMETOOLONG); 490 return ERR_PTR(-ENAMETOOLONG);
492 d_set_d_op(dentry, &configfs_dentry_ops);
493 d_add(dentry, NULL); 491 d_add(dentry, NULL);
494 return NULL; 492 return NULL;
495 } 493 }
@@ -683,7 +681,6 @@ static int create_default_group(struct config_group *parent_group,
683 ret = -ENOMEM; 681 ret = -ENOMEM;
684 child = d_alloc(parent, &name); 682 child = d_alloc(parent, &name);
685 if (child) { 683 if (child) {
686 d_set_d_op(child, &configfs_dentry_ops);
687 d_add(child, NULL); 684 d_add(child, NULL);
688 685
689 ret = configfs_attach_group(&parent_group->cg_item, 686 ret = configfs_attach_group(&parent_group->cg_item,
@@ -1681,7 +1678,6 @@ int configfs_register_subsystem(struct configfs_subsystem *subsys)
1681 err = -ENOMEM; 1678 err = -ENOMEM;
1682 dentry = d_alloc(configfs_sb->s_root, &name); 1679 dentry = d_alloc(configfs_sb->s_root, &name);
1683 if (dentry) { 1680 if (dentry) {
1684 d_set_d_op(dentry, &configfs_dentry_ops);
1685 d_add(dentry, NULL); 1681 d_add(dentry, NULL);
1686 1682
1687 err = configfs_attach_group(sd->s_element, &group->cg_item, 1683 err = configfs_attach_group(sd->s_element, &group->cg_item,
diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c
index 7d3607febe1c..ecc62178beda 100644
--- a/fs/configfs/mount.c
+++ b/fs/configfs/mount.c
@@ -101,6 +101,7 @@ static int configfs_fill_super(struct super_block *sb, void *data, int silent)
101 configfs_root_group.cg_item.ci_dentry = root; 101 configfs_root_group.cg_item.ci_dentry = root;
102 root->d_fsdata = &configfs_root; 102 root->d_fsdata = &configfs_root;
103 sb->s_root = root; 103 sb->s_root = root;
104 sb->s_d_op = &configfs_dentry_ops; /* the rest get that */
104 return 0; 105 return 0;
105} 106}
106 107
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index 32fd5fe9ca0e..e141939080f0 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -34,57 +34,81 @@ static const struct address_space_operations cramfs_aops;
34static DEFINE_MUTEX(read_mutex); 34static DEFINE_MUTEX(read_mutex);
35 35
36 36
37/* These two macros may change in future, to provide better st_ino 37/* These macros may change in future, to provide better st_ino semantics. */
38 semantics. */
39#define CRAMINO(x) (((x)->offset && (x)->size)?(x)->offset<<2:1)
40#define OFFSET(x) ((x)->i_ino) 38#define OFFSET(x) ((x)->i_ino)
41 39
42static void setup_inode(struct inode *inode, struct cramfs_inode * cramfs_inode) 40static unsigned long cramino(struct cramfs_inode *cino, unsigned int offset)
43{ 41{
42 if (!cino->offset)
43 return offset + 1;
44 if (!cino->size)
45 return offset + 1;
46
47 /*
48 * The file mode test fixes buggy mkcramfs implementations where
49 * cramfs_inode->offset is set to a non-zero value for entries
50 * which did not contain data, like device nodes and fifos.
51 */
52 switch (cino->mode & S_IFMT) {
53 case S_IFREG:
54 case S_IFDIR:
55 case S_IFLNK:
56 return cino->offset << 2;
57 default:
58 break;
59 }
60 return offset + 1;
61}
62
63static struct inode *get_cramfs_inode(struct super_block *sb,
64 struct cramfs_inode *cramfs_inode, unsigned int offset)
65{
66 struct inode *inode;
44 static struct timespec zerotime; 67 static struct timespec zerotime;
68
69 inode = iget_locked(sb, cramino(cramfs_inode, offset));
70 if (!inode)
71 return ERR_PTR(-ENOMEM);
72 if (!(inode->i_state & I_NEW))
73 return inode;
74
75 switch (cramfs_inode->mode & S_IFMT) {
76 case S_IFREG:
77 inode->i_fop = &generic_ro_fops;
78 inode->i_data.a_ops = &cramfs_aops;
79 break;
80 case S_IFDIR:
81 inode->i_op = &cramfs_dir_inode_operations;
82 inode->i_fop = &cramfs_directory_operations;
83 break;
84 case S_IFLNK:
85 inode->i_op = &page_symlink_inode_operations;
86 inode->i_data.a_ops = &cramfs_aops;
87 break;
88 default:
89 init_special_inode(inode, cramfs_inode->mode,
90 old_decode_dev(cramfs_inode->size));
91 }
92
45 inode->i_mode = cramfs_inode->mode; 93 inode->i_mode = cramfs_inode->mode;
46 inode->i_uid = cramfs_inode->uid; 94 inode->i_uid = cramfs_inode->uid;
47 inode->i_size = cramfs_inode->size;
48 inode->i_blocks = (cramfs_inode->size - 1) / 512 + 1;
49 inode->i_gid = cramfs_inode->gid; 95 inode->i_gid = cramfs_inode->gid;
96
97 /* if the lower 2 bits are zero, the inode contains data */
98 if (!(inode->i_ino & 3)) {
99 inode->i_size = cramfs_inode->size;
100 inode->i_blocks = (cramfs_inode->size - 1) / 512 + 1;
101 }
102
50 /* Struct copy intentional */ 103 /* Struct copy intentional */
51 inode->i_mtime = inode->i_atime = inode->i_ctime = zerotime; 104 inode->i_mtime = inode->i_atime = inode->i_ctime = zerotime;
52 /* inode->i_nlink is left 1 - arguably wrong for directories, 105 /* inode->i_nlink is left 1 - arguably wrong for directories,
53 but it's the best we can do without reading the directory 106 but it's the best we can do without reading the directory
54 contents. 1 yields the right result in GNU find, even 107 contents. 1 yields the right result in GNU find, even
55 without -noleaf option. */ 108 without -noleaf option. */
56 if (S_ISREG(inode->i_mode)) {
57 inode->i_fop = &generic_ro_fops;
58 inode->i_data.a_ops = &cramfs_aops;
59 } else if (S_ISDIR(inode->i_mode)) {
60 inode->i_op = &cramfs_dir_inode_operations;
61 inode->i_fop = &cramfs_directory_operations;
62 } else if (S_ISLNK(inode->i_mode)) {
63 inode->i_op = &page_symlink_inode_operations;
64 inode->i_data.a_ops = &cramfs_aops;
65 } else {
66 init_special_inode(inode, inode->i_mode,
67 old_decode_dev(cramfs_inode->size));
68 }
69}
70 109
71static struct inode *get_cramfs_inode(struct super_block *sb, 110 unlock_new_inode(inode);
72 struct cramfs_inode * cramfs_inode) 111
73{
74 struct inode *inode;
75 if (CRAMINO(cramfs_inode) == 1) {
76 inode = new_inode(sb);
77 if (inode) {
78 inode->i_ino = 1;
79 setup_inode(inode, cramfs_inode);
80 }
81 } else {
82 inode = iget_locked(sb, CRAMINO(cramfs_inode));
83 if (inode && (inode->i_state & I_NEW)) {
84 setup_inode(inode, cramfs_inode);
85 unlock_new_inode(inode);
86 }
87 }
88 return inode; 112 return inode;
89} 113}
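
A worked example of the numbering scheme, with illustrative offsets: a regular file whose cramfs_inode->offset is 0x100 (stored in 4-byte units) gets ino = 0x100 << 2 = 0x400; the low two bits are clear, so get_cramfs_inode() trusts the size field and fills in i_size and i_blocks. A fifo whose directory entry sits at byte offset 0x2e8 gets ino = 0x2e8 + 1 = 0x2e9; directory entries are 4-byte aligned, so offset + 1 always has a low bit set, the (i_ino & 3) test fails, and the size field (which doubles as the device number for special files) is never misread as a byte count.
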
90 114
@@ -265,6 +289,9 @@ static int cramfs_fill_super(struct super_block *sb, void *data, int silent)
265 printk(KERN_ERR "cramfs: root is not a directory\n"); 289 printk(KERN_ERR "cramfs: root is not a directory\n");
266 goto out; 290 goto out;
267 } 291 }
292 /* correct strange, hard-coded permissions of mkcramfs */
293 super.root.mode |= (S_IRUSR | S_IXUSR | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH);
294
268 root_offset = super.root.offset << 2; 295 root_offset = super.root.offset << 2;
269 if (super.flags & CRAMFS_FLAG_FSID_VERSION_2) { 296 if (super.flags & CRAMFS_FLAG_FSID_VERSION_2) {
270 sbi->size=super.size; 297 sbi->size=super.size;
@@ -289,7 +316,7 @@ static int cramfs_fill_super(struct super_block *sb, void *data, int silent)
289 316
290 /* Set it all up.. */ 317 /* Set it all up.. */
291 sb->s_op = &cramfs_ops; 318 sb->s_op = &cramfs_ops;
292 root = get_cramfs_inode(sb, &super.root); 319 root = get_cramfs_inode(sb, &super.root, 0);
293 if (!root) 320 if (!root)
294 goto out; 321 goto out;
295 sb->s_root = d_alloc_root(root); 322 sb->s_root = d_alloc_root(root);
@@ -365,7 +392,7 @@ static int cramfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
365 */ 392 */
366 namelen = de->namelen << 2; 393 namelen = de->namelen << 2;
367 memcpy(buf, name, namelen); 394 memcpy(buf, name, namelen);
368 ino = CRAMINO(de); 395 ino = cramino(de, OFFSET(inode) + offset);
369 mode = de->mode; 396 mode = de->mode;
370 mutex_unlock(&read_mutex); 397 mutex_unlock(&read_mutex);
371 nextoffset = offset + sizeof(*de) + namelen; 398 nextoffset = offset + sizeof(*de) + namelen;
@@ -404,8 +431,9 @@ static struct dentry * cramfs_lookup(struct inode *dir, struct dentry *dentry, s
404 struct cramfs_inode *de; 431 struct cramfs_inode *de;
405 char *name; 432 char *name;
406 int namelen, retval; 433 int namelen, retval;
434 int dir_off = OFFSET(dir) + offset;
407 435
408 de = cramfs_read(dir->i_sb, OFFSET(dir) + offset, sizeof(*de)+CRAMFS_MAXPATHLEN); 436 de = cramfs_read(dir->i_sb, dir_off, sizeof(*de)+CRAMFS_MAXPATHLEN);
409 name = (char *)(de+1); 437 name = (char *)(de+1);
410 438
411 /* Try to take advantage of sorted directories */ 439 /* Try to take advantage of sorted directories */
@@ -436,7 +464,7 @@ static struct dentry * cramfs_lookup(struct inode *dir, struct dentry *dentry, s
436 if (!retval) { 464 if (!retval) {
437 struct cramfs_inode entry = *de; 465 struct cramfs_inode entry = *de;
438 mutex_unlock(&read_mutex); 466 mutex_unlock(&read_mutex);
439 d_add(dentry, get_cramfs_inode(dir->i_sb, &entry)); 467 d_add(dentry, get_cramfs_inode(dir->i_sb, &entry, dir_off));
440 return NULL; 468 return NULL;
441 } 469 }
442 /* else (retval < 0) */ 470 /* else (retval < 0) */
diff --git a/fs/dcache.c b/fs/dcache.c
index 5699d4c027cb..2a6bd9a4ae97 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -176,6 +176,7 @@ static void d_free(struct dentry *dentry)
 
 /**
  * dentry_rcuwalk_barrier - invalidate in-progress rcu-walk lookups
+ * @dentry: the target dentry
  * After this call, in-progress rcu-walk path lookup will fail. This
  * should be called after unhashing, and after changing d_inode (if
  * the dentry has not already been unhashed).
@@ -281,6 +282,7 @@ static void dentry_lru_move_tail(struct dentry *dentry)
 /**
  * d_kill - kill dentry and return parent
  * @dentry: dentry to kill
+ * @parent: parent dentry
  *
  * The dentry must already be unhashed and removed from the LRU.
  *
@@ -1320,6 +1322,7 @@ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name)
 		__dget_dlock(parent);
 		dentry->d_parent = parent;
 		dentry->d_sb = parent->d_sb;
+		d_set_d_op(dentry, dentry->d_sb->s_d_op);
 		list_add(&dentry->d_u.d_child, &parent->d_subdirs);
 		spin_unlock(&parent->d_lock);
 	}
@@ -1335,6 +1338,7 @@ struct dentry *d_alloc_pseudo(struct super_block *sb, const struct qstr *name)
 	struct dentry *dentry = d_alloc(NULL, name);
 	if (dentry) {
 		dentry->d_sb = sb;
+		d_set_d_op(dentry, dentry->d_sb->s_d_op);
 		dentry->d_parent = dentry;
 		dentry->d_flags |= DCACHE_DISCONNECTED;
 	}
@@ -1355,8 +1359,8 @@ EXPORT_SYMBOL(d_alloc_name);
 
 void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op)
 {
-	BUG_ON(dentry->d_op);
-	BUG_ON(dentry->d_flags & (DCACHE_OP_HASH |
+	WARN_ON_ONCE(dentry->d_op);
+	WARN_ON_ONCE(dentry->d_flags & (DCACHE_OP_HASH |
 				DCACHE_OP_COMPARE |
 				DCACHE_OP_REVALIDATE |
 				DCACHE_OP_DELETE ));
@@ -1378,8 +1382,11 @@ EXPORT_SYMBOL(d_set_d_op);
 static void __d_instantiate(struct dentry *dentry, struct inode *inode)
 {
 	spin_lock(&dentry->d_lock);
-	if (inode)
+	if (inode) {
+		if (unlikely(IS_AUTOMOUNT(inode)))
+			dentry->d_flags |= DCACHE_NEED_AUTOMOUNT;
 		list_add(&dentry->d_alias, &inode->i_dentry);
+	}
 	dentry->d_inode = inode;
 	dentry_rcuwalk_barrier(dentry);
 	spin_unlock(&dentry->d_lock);
@@ -1507,6 +1514,7 @@ struct dentry * d_alloc_root(struct inode * root_inode)
 	res = d_alloc(NULL, &name);
 	if (res) {
 		res->d_sb = root_inode->i_sb;
+		d_set_d_op(res, res->d_sb->s_d_op);
 		res->d_parent = res;
 		d_instantiate(res, root_inode);
 	}
@@ -1567,6 +1575,7 @@ struct dentry *d_obtain_alias(struct inode *inode)
 	/* attach a disconnected dentry */
 	spin_lock(&tmp->d_lock);
 	tmp->d_sb = inode->i_sb;
+	d_set_d_op(tmp, tmp->d_sb->s_d_op);
 	tmp->d_inode = inode;
 	tmp->d_flags |= DCACHE_DISCONNECTED;
 	list_add(&tmp->d_alias, &inode->i_dentry);
@@ -2449,8 +2458,7 @@ static int prepend_name(char **buffer, int *buflen, struct qstr *name)
 }
 
 /**
- * Prepend path string to a buffer
- *
+ * prepend_path - Prepend path string to a buffer
  * @path: the dentry/vfsmount to report
 * @root: root vfsmnt/dentry (may be modified by this function)
 * @buffer: pointer to the end of the buffer
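The theme of the fs/dcache.c hunks: dentry_operations wiring moves from per-dentry d_set_d_op() calls into the superblock, so d_alloc(), d_alloc_root() and d_obtain_alias() pick the ops up from sb->s_d_op automatically (and d_set_d_op() now warns instead of BUGing on double assignment). A minimal sketch of the pattern this enables for a filesystem; the examplefs_* names are hypothetical and not part of this patch:

	static const struct dentry_operations examplefs_dops = {
		.d_revalidate	= examplefs_d_revalidate,
	};

	static int examplefs_fill_super(struct super_block *sb, void *data, int silent)
	{
		sb->s_d_op = &examplefs_dops;	/* copied to every dentry d_alloc() creates */
		/* ... remaining fill_super work ... */
		return 0;
	}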
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 85882f6ba5f7..b044705eedd4 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -325,12 +325,16 @@ void dio_end_io(struct bio *bio, int error)
 }
 EXPORT_SYMBOL_GPL(dio_end_io);
 
-static int
+static void
 dio_bio_alloc(struct dio *dio, struct block_device *bdev,
 	      sector_t first_sector, int nr_vecs)
 {
 	struct bio *bio;
 
+	/*
+	 * bio_alloc() is guaranteed to return a bio when called with
+	 * __GFP_WAIT and we request a valid number of vectors.
+	 */
 	bio = bio_alloc(GFP_KERNEL, nr_vecs);
 
 	bio->bi_bdev = bdev;
@@ -342,7 +346,6 @@ dio_bio_alloc(struct dio *dio, struct block_device *bdev,
 
 	dio->bio = bio;
 	dio->logical_offset_in_bio = dio->cur_page_fs_offset;
-	return 0;
 }
 
 /*
@@ -583,8 +586,9 @@ static int dio_new_bio(struct dio *dio, sector_t start_sector)
 		goto out;
 	sector = start_sector << (dio->blkbits - 9);
 	nr_pages = min(dio->pages_in_io, bio_get_nr_vecs(dio->map_bh.b_bdev));
+	nr_pages = min(nr_pages, BIO_MAX_PAGES);
 	BUG_ON(nr_pages <= 0);
-	ret = dio_bio_alloc(dio, dio->map_bh.b_bdev, sector, nr_pages);
+	dio_bio_alloc(dio, dio->map_bh.b_bdev, sector, nr_pages);
 	dio->boundary = 0;
 out:
 	return ret;
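The dio_bio_alloc() signature change works because bio_alloc() called with GFP_KERNEL (a __GFP_WAIT allocation) and a vector count clamped to BIO_MAX_PAGES blocks until it succeeds instead of returning NULL. The resulting call-site contract, as an illustrative sketch rather than additional patch content:

	nr_pages = min(nr_pages, BIO_MAX_PAGES);	/* keep the request valid */
	bio = bio_alloc(GFP_KERNEL, nr_pages);		/* may sleep, never NULL here */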
diff --git a/fs/dlm/Kconfig b/fs/dlm/Kconfig
index 2dbb422e8116..1897eb1b4b6a 100644
--- a/fs/dlm/Kconfig
+++ b/fs/dlm/Kconfig
@@ -1,8 +1,7 @@
 menuconfig DLM
 	tristate "Distributed Lock Manager (DLM)"
 	depends on EXPERIMENTAL && INET
-	depends on SYSFS && (IPV6 || IPV6=n)
-	select CONFIGFS_FS
+	depends on SYSFS && CONFIGFS_FS && (IPV6 || IPV6=n)
 	select IP_SCTP
 	help
 	A general purpose distributed lock manager for kernel or userspace
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 37a34c2c622a..9c64ae9e4c1a 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -63,6 +63,9 @@
 #define NEEDED_RMEM (4*1024*1024)
 #define CONN_HASH_SIZE 32
 
+/* Number of messages to send before rescheduling */
+#define MAX_SEND_MSG_COUNT 25
+
 struct cbuf {
 	unsigned int base;
 	unsigned int len;
@@ -108,6 +111,7 @@ struct connection {
 #define CF_INIT_PENDING 4
 #define CF_IS_OTHERCON 5
 #define CF_CLOSE 6
+#define CF_APP_LIMITED 7
 	struct list_head writequeue;  /* List of outgoing writequeue_entries */
 	spinlock_t writequeue_lock;
 	int (*rx_action) (struct connection *);	/* What to do when active */
@@ -295,7 +299,17 @@ static void lowcomms_write_space(struct sock *sk)
 {
 	struct connection *con = sock2con(sk);
 
-	if (con && !test_and_set_bit(CF_WRITE_PENDING, &con->flags))
+	if (!con)
+		return;
+
+	clear_bit(SOCK_NOSPACE, &con->sock->flags);
+
+	if (test_and_clear_bit(CF_APP_LIMITED, &con->flags)) {
+		con->sock->sk->sk_write_pending--;
+		clear_bit(SOCK_ASYNC_NOSPACE, &con->sock->flags);
+	}
+
+	if (!test_and_set_bit(CF_WRITE_PENDING, &con->flags))
 		queue_work(send_workqueue, &con->swork);
 }
 
@@ -915,6 +929,7 @@ static void tcp_connect_to_sock(struct connection *con)
 	struct sockaddr_storage saddr, src_addr;
 	int addr_len;
 	struct socket *sock = NULL;
+	int one = 1;
 
 	if (con->nodeid == 0) {
 		log_print("attempt to connect sock 0 foiled");
@@ -960,6 +975,11 @@ static void tcp_connect_to_sock(struct connection *con)
 	make_sockaddr(&saddr, dlm_config.ci_tcp_port, &addr_len);
 
 	log_print("connecting to %d", con->nodeid);
+
+	/* Turn off Nagle's algorithm */
+	kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY, (char *)&one,
+			  sizeof(one));
+
 	result =
 		sock->ops->connect(sock, (struct sockaddr *)&saddr, addr_len,
 				   O_NONBLOCK);
@@ -1011,6 +1031,10 @@ static struct socket *tcp_create_listen_sock(struct connection *con,
 		goto create_out;
 	}
 
+	/* Turn off Nagle's algorithm */
+	kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY, (char *)&one,
+			  sizeof(one));
+
 	result = kernel_setsockopt(sock, SOL_SOCKET, SO_REUSEADDR,
 				   (char *)&one, sizeof(one));
 
@@ -1297,6 +1321,7 @@ static void send_to_sock(struct connection *con)
 	const int msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL;
 	struct writequeue_entry *e;
 	int len, offset;
+	int count = 0;
 
 	mutex_lock(&con->sock_mutex);
 	if (con->sock == NULL)
@@ -1319,14 +1344,27 @@ static void send_to_sock(struct connection *con)
 		ret = kernel_sendpage(con->sock, e->page, offset, len,
 				      msg_flags);
 		if (ret == -EAGAIN || ret == 0) {
+			if (ret == -EAGAIN &&
+			    test_bit(SOCK_ASYNC_NOSPACE, &con->sock->flags) &&
+			    !test_and_set_bit(CF_APP_LIMITED, &con->flags)) {
+				/* Notify TCP that we're limited by the
+				 * application window size.
+				 */
+				set_bit(SOCK_NOSPACE, &con->sock->flags);
+				con->sock->sk->sk_write_pending++;
+			}
 			cond_resched();
 			goto out;
 		}
 		if (ret <= 0)
 			goto send_error;
 	}
-	/* Don't starve people filling buffers */
+
+	/* Don't starve people filling buffers */
+	if (++count >= MAX_SEND_MSG_COUNT) {
 		cond_resched();
+		count = 0;
+	}
 
 	spin_lock(&con->writequeue_lock);
 	e->offset += ret;
@@ -1430,20 +1468,19 @@ static void work_stop(void)
 
 static int work_start(void)
 {
-	int error;
-	recv_workqueue = create_workqueue("dlm_recv");
-	error = IS_ERR(recv_workqueue);
-	if (error) {
-		log_print("can't start dlm_recv %d", error);
-		return error;
+	recv_workqueue = alloc_workqueue("dlm_recv", WQ_MEM_RECLAIM |
+					 WQ_HIGHPRI | WQ_FREEZEABLE, 0);
+	if (!recv_workqueue) {
+		log_print("can't start dlm_recv");
+		return -ENOMEM;
 	}
 
-	send_workqueue = create_singlethread_workqueue("dlm_send");
-	error = IS_ERR(send_workqueue);
-	if (error) {
-		log_print("can't start dlm_send %d", error);
+	send_workqueue = alloc_workqueue("dlm_send", WQ_MEM_RECLAIM |
					 WQ_HIGHPRI | WQ_FREEZEABLE, 0);
+	if (!send_workqueue) {
+		log_print("can't start dlm_send");
 		destroy_workqueue(recv_workqueue);
-		return error;
+		return -ENOMEM;
 	}
 
 	return 0;
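One subtlety in the work_start() rewrite: the old code tested the workqueue pointers with IS_ERR(), but create_workqueue() (like the alloc_workqueue() now used here) signals failure with a plain NULL, which IS_ERR() never matches, so the old error paths were unreachable. The corrected convention in isolation, as a sketch:

	struct workqueue_struct *wq;

	wq = alloc_workqueue("example", WQ_MEM_RECLAIM, 0);
	if (!wq)		/* NULL on failure; IS_ERR(NULL) is false */
		return -ENOMEM;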
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index cbadc1bee6e7..bfd8b680e648 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -348,7 +348,7 @@ static int encrypt_scatterlist(struct ecryptfs_crypt_stat *crypt_stat,
 	BUG_ON(!crypt_stat || !crypt_stat->tfm
 	       || !(crypt_stat->flags & ECRYPTFS_STRUCT_INITIALIZED));
 	if (unlikely(ecryptfs_verbosity > 0)) {
-		ecryptfs_printk(KERN_DEBUG, "Key size [%d]; key:\n",
+		ecryptfs_printk(KERN_DEBUG, "Key size [%zd]; key:\n",
 				crypt_stat->key_size);
 		ecryptfs_dump_hex(crypt_stat->key,
 				  crypt_stat->key_size);
@@ -413,10 +413,9 @@ static int ecryptfs_encrypt_extent(struct page *enc_extent_page,
 	rc = ecryptfs_derive_iv(extent_iv, crypt_stat,
 				(extent_base + extent_offset));
 	if (rc) {
-		ecryptfs_printk(KERN_ERR, "Error attempting to "
-				"derive IV for extent [0x%.16x]; "
-				"rc = [%d]\n", (extent_base + extent_offset),
-				rc);
+		ecryptfs_printk(KERN_ERR, "Error attempting to derive IV for "
+				"extent [0x%.16llx]; rc = [%d]\n",
+				(unsigned long long)(extent_base + extent_offset), rc);
 		goto out;
 	}
 	if (unlikely(ecryptfs_verbosity > 0)) {
@@ -443,9 +442,9 @@ static int ecryptfs_encrypt_extent(struct page *enc_extent_page,
 	}
 	rc = 0;
 	if (unlikely(ecryptfs_verbosity > 0)) {
-		ecryptfs_printk(KERN_DEBUG, "Encrypt extent [0x%.16x]; "
-				"rc = [%d]\n", (extent_base + extent_offset),
-				rc);
+		ecryptfs_printk(KERN_DEBUG, "Encrypt extent [0x%.16llx]; "
+				"rc = [%d]\n",
+				(unsigned long long)(extent_base + extent_offset), rc);
 		ecryptfs_printk(KERN_DEBUG, "First 8 bytes after "
 				"encryption:\n");
 		ecryptfs_dump_hex((char *)(page_address(enc_extent_page)), 8);
@@ -540,10 +539,9 @@ static int ecryptfs_decrypt_extent(struct page *page,
 	rc = ecryptfs_derive_iv(extent_iv, crypt_stat,
 				(extent_base + extent_offset));
 	if (rc) {
-		ecryptfs_printk(KERN_ERR, "Error attempting to "
-				"derive IV for extent [0x%.16x]; "
-				"rc = [%d]\n", (extent_base + extent_offset),
-				rc);
+		ecryptfs_printk(KERN_ERR, "Error attempting to derive IV for "
+				"extent [0x%.16llx]; rc = [%d]\n",
+				(unsigned long long)(extent_base + extent_offset), rc);
 		goto out;
 	}
 	if (unlikely(ecryptfs_verbosity > 0)) {
@@ -571,9 +569,9 @@ static int ecryptfs_decrypt_extent(struct page *page,
 	}
 	rc = 0;
 	if (unlikely(ecryptfs_verbosity > 0)) {
-		ecryptfs_printk(KERN_DEBUG, "Decrypt extent [0x%.16x]; "
-				"rc = [%d]\n", (extent_base + extent_offset),
-				rc);
+		ecryptfs_printk(KERN_DEBUG, "Decrypt extent [0x%.16llx]; "
+				"rc = [%d]\n",
+				(unsigned long long)(extent_base + extent_offset), rc);
 		ecryptfs_printk(KERN_DEBUG, "First 8 bytes after "
 				"decryption:\n");
 		ecryptfs_dump_hex((char *)(page_address(page)
@@ -780,7 +778,7 @@ int ecryptfs_init_crypt_ctx(struct ecryptfs_crypt_stat *crypt_stat)
 	}
 	ecryptfs_printk(KERN_DEBUG,
 			"Initializing cipher [%s]; strlen = [%d]; "
-			"key_size_bits = [%d]\n",
+			"key_size_bits = [%zd]\n",
 			crypt_stat->cipher, (int)strlen(crypt_stat->cipher),
 			crypt_stat->key_size << 3);
 	if (crypt_stat->tfm) {
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index 413a3c48f0bb..dbc84ed96336 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -192,7 +192,6 @@ ecryptfs_get_key_payload_data(struct key *key)
 		(((struct user_key_payload*)key->payload.data)->data);
 }
 
-#define ECRYPTFS_SUPER_MAGIC 0xf15f
 #define ECRYPTFS_MAX_KEYSET_SIZE 1024
 #define ECRYPTFS_MAX_CIPHER_NAME_SIZE 32
 #define ECRYPTFS_MAX_NUM_ENC_KEYS 64
@@ -584,6 +583,7 @@ ecryptfs_set_dentry_lower_mnt(struct dentry *dentry, struct vfsmount *lower_mnt)
 
 #define ecryptfs_printk(type, fmt, arg...) \
 	__ecryptfs_printk(type "%s: " fmt, __func__, ## arg);
+__attribute__ ((format(printf, 1, 2)))
 void __ecryptfs_printk(const char *fmt, ...);
 
 extern const struct file_operations ecryptfs_main_fops;
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index 91da02987bff..81e10e6a9443 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -47,7 +47,7 @@ static ssize_t ecryptfs_read_update_atime(struct kiocb *iocb,
 					const struct iovec *iov,
 					unsigned long nr_segs, loff_t pos)
 {
-	int rc;
+	ssize_t rc;
 	struct dentry *lower_dentry;
 	struct vfsmount *lower_vfsmount;
 	struct file *file = iocb->ki_filp;
@@ -191,18 +191,16 @@ static int ecryptfs_open(struct inode *inode, struct file *file)
 				      | ECRYPTFS_ENCRYPTED);
 	}
 	mutex_unlock(&crypt_stat->cs_mutex);
-	if (!ecryptfs_inode_to_private(inode)->lower_file) {
-		rc = ecryptfs_init_persistent_file(ecryptfs_dentry);
-		if (rc) {
-			printk(KERN_ERR "%s: Error attempting to initialize "
-			       "the persistent file for the dentry with name "
-			       "[%s]; rc = [%d]\n", __func__,
-			       ecryptfs_dentry->d_name.name, rc);
-			goto out_free;
-		}
+	rc = ecryptfs_init_persistent_file(ecryptfs_dentry);
+	if (rc) {
+		printk(KERN_ERR "%s: Error attempting to initialize "
+		       "the persistent file for the dentry with name "
+		       "[%s]; rc = [%d]\n", __func__,
+		       ecryptfs_dentry->d_name.name, rc);
+		goto out_free;
 	}
-	if ((ecryptfs_inode_to_private(inode)->lower_file->f_flags & O_RDONLY)
-	    && !(file->f_flags & O_RDONLY)) {
+	if ((ecryptfs_inode_to_private(inode)->lower_file->f_flags & O_ACCMODE)
+	    == O_RDONLY && (file->f_flags & O_ACCMODE) != O_RDONLY) {
 		rc = -EPERM;
 		printk(KERN_WARNING "%s: Lower persistent file is RO; eCryptfs "
 		       "file must hence be opened RO\n", __func__);
@@ -243,9 +241,9 @@ static int ecryptfs_open(struct inode *inode, struct file *file)
 		}
 	}
 	mutex_unlock(&crypt_stat->cs_mutex);
-	ecryptfs_printk(KERN_DEBUG, "inode w/ addr = [0x%p], i_ino = [0x%.16x] "
-			"size: [0x%.16x]\n", inode, inode->i_ino,
-			i_size_read(inode));
+	ecryptfs_printk(KERN_DEBUG, "inode w/ addr = [0x%p], i_ino = "
+			"[0x%.16lx] size: [0x%.16llx]\n", inode, inode->i_ino,
+			(unsigned long long)i_size_read(inode));
 	goto out;
 out_free:
 	kmem_cache_free(ecryptfs_file_info_cache,
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 337352a94751..bd33f87a1907 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -185,15 +185,13 @@ static int ecryptfs_initialize_file(struct dentry *ecryptfs_dentry)
185 "context; rc = [%d]\n", rc); 185 "context; rc = [%d]\n", rc);
186 goto out; 186 goto out;
187 } 187 }
188 if (!ecryptfs_inode_to_private(ecryptfs_dentry->d_inode)->lower_file) { 188 rc = ecryptfs_init_persistent_file(ecryptfs_dentry);
189 rc = ecryptfs_init_persistent_file(ecryptfs_dentry); 189 if (rc) {
190 if (rc) { 190 printk(KERN_ERR "%s: Error attempting to initialize "
191 printk(KERN_ERR "%s: Error attempting to initialize " 191 "the persistent file for the dentry with name "
192 "the persistent file for the dentry with name " 192 "[%s]; rc = [%d]\n", __func__,
193 "[%s]; rc = [%d]\n", __func__, 193 ecryptfs_dentry->d_name.name, rc);
194 ecryptfs_dentry->d_name.name, rc); 194 goto out;
195 goto out;
196 }
197 } 195 }
198 rc = ecryptfs_write_metadata(ecryptfs_dentry); 196 rc = ecryptfs_write_metadata(ecryptfs_dentry);
199 if (rc) { 197 if (rc) {
@@ -302,15 +300,13 @@ int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry,
302 rc = -ENOMEM; 300 rc = -ENOMEM;
303 goto out; 301 goto out;
304 } 302 }
305 if (!ecryptfs_inode_to_private(ecryptfs_dentry->d_inode)->lower_file) { 303 rc = ecryptfs_init_persistent_file(ecryptfs_dentry);
306 rc = ecryptfs_init_persistent_file(ecryptfs_dentry); 304 if (rc) {
307 if (rc) { 305 printk(KERN_ERR "%s: Error attempting to initialize "
308 printk(KERN_ERR "%s: Error attempting to initialize " 306 "the persistent file for the dentry with name "
309 "the persistent file for the dentry with name " 307 "[%s]; rc = [%d]\n", __func__,
310 "[%s]; rc = [%d]\n", __func__, 308 ecryptfs_dentry->d_name.name, rc);
311 ecryptfs_dentry->d_name.name, rc); 309 goto out_free_kmem;
312 goto out_free_kmem;
313 }
314 } 310 }
315 crypt_stat = &ecryptfs_inode_to_private( 311 crypt_stat = &ecryptfs_inode_to_private(
316 ecryptfs_dentry->d_inode)->crypt_stat; 312 ecryptfs_dentry->d_inode)->crypt_stat;
@@ -441,7 +437,6 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
441 struct qstr lower_name; 437 struct qstr lower_name;
442 int rc = 0; 438 int rc = 0;
443 439
444 d_set_d_op(ecryptfs_dentry, &ecryptfs_dops);
445 if ((ecryptfs_dentry->d_name.len == 1 440 if ((ecryptfs_dentry->d_name.len == 1
446 && !strcmp(ecryptfs_dentry->d_name.name, ".")) 441 && !strcmp(ecryptfs_dentry->d_name.name, "."))
447 || (ecryptfs_dentry->d_name.len == 2 442 || (ecryptfs_dentry->d_name.len == 2
diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index b1f6858a5223..c1436cff6f2d 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -59,7 +59,7 @@ static int process_request_key_err(long err_code)
 		break;
 	default:
 		ecryptfs_printk(KERN_WARNING, "Unknown error code: "
-				"[0x%.16x]\n", err_code);
+				"[0x%.16lx]\n", err_code);
 		rc = -EINVAL;
 	}
 	return rc;
@@ -130,7 +130,7 @@ int ecryptfs_write_packet_length(char *dest, size_t size,
 	} else {
 		rc = -EINVAL;
 		ecryptfs_printk(KERN_WARNING,
-				"Unsupported packet size: [%d]\n", size);
+				"Unsupported packet size: [%zd]\n", size);
 	}
 	return rc;
 }
@@ -1672,7 +1672,7 @@ decrypt_passphrase_encrypted_session_key(struct ecryptfs_auth_tok *auth_tok,
 			auth_tok->session_key.decrypted_key_size);
 	crypt_stat->flags |= ECRYPTFS_KEY_VALID;
 	if (unlikely(ecryptfs_verbosity > 0)) {
-		ecryptfs_printk(KERN_DEBUG, "FEK of size [%d]:\n",
+		ecryptfs_printk(KERN_DEBUG, "FEK of size [%zd]:\n",
 				crypt_stat->key_size);
 		ecryptfs_dump_hex(crypt_stat->key,
 				  crypt_stat->key_size);
@@ -1754,7 +1754,7 @@ int ecryptfs_parse_packet_set(struct ecryptfs_crypt_stat *crypt_stat,
 			if (ECRYPTFS_SIG_SIZE != tag_11_contents_size) {
 				ecryptfs_printk(KERN_ERR, "Expected "
 						"signature of size [%d]; "
-						"read size [%d]\n",
+						"read size [%zd]\n",
 						ECRYPTFS_SIG_SIZE,
 						tag_11_contents_size);
 				rc = -EIO;
@@ -1787,8 +1787,8 @@ int ecryptfs_parse_packet_set(struct ecryptfs_crypt_stat *crypt_stat,
 				goto out_wipe_list;
 			break;
 		default:
-			ecryptfs_printk(KERN_DEBUG, "No packet at offset "
-					"[%d] of the file header; hex value of "
+			ecryptfs_printk(KERN_DEBUG, "No packet at offset [%zd] "
+					"of the file header; hex value of "
 					"character is [0x%.2x]\n", i, src[i]);
 			next_packet_is_auth_tok_packet = 0;
 		}
@@ -1864,8 +1864,8 @@ found_matching_auth_tok:
 			"session key for authentication token with sig "
 			"[%.*s]; rc = [%d]. Removing auth tok "
 			"candidate from the list and searching for "
-			"the next match.\n", candidate_auth_tok_sig,
-			ECRYPTFS_SIG_SIZE_HEX, rc);
+			"the next match.\n", ECRYPTFS_SIG_SIZE_HEX,
+			candidate_auth_tok_sig, rc);
 		list_for_each_entry_safe(auth_tok_list_item,
 					 auth_tok_list_item_tmp,
 					 &auth_tok_list, list) {
@@ -2168,7 +2168,7 @@ write_tag_3_packet(char *dest, size_t *remaining_bytes,
 	if (encrypted_session_key_valid) {
 		ecryptfs_printk(KERN_DEBUG, "encrypted_session_key_valid != 0; "
 				"using auth_tok->session_key.encrypted_key, "
-				"where key_rec->enc_key_size = [%d]\n",
+				"where key_rec->enc_key_size = [%zd]\n",
 				key_rec->enc_key_size);
 		memcpy(key_rec->enc_key,
 		       auth_tok->session_key.encrypted_key,
@@ -2198,7 +2198,7 @@ write_tag_3_packet(char *dest, size_t *remaining_bytes,
 	if (rc < 1 || rc > 2) {
 		ecryptfs_printk(KERN_ERR, "Error generating scatterlist "
 				"for crypt_stat session key; expected rc = 1; "
-				"got rc = [%d]. key_rec->enc_key_size = [%d]\n",
+				"got rc = [%d]. key_rec->enc_key_size = [%zd]\n",
 				rc, key_rec->enc_key_size);
 		rc = -ENOMEM;
 		goto out;
@@ -2209,7 +2209,7 @@ write_tag_3_packet(char *dest, size_t *remaining_bytes,
 		ecryptfs_printk(KERN_ERR, "Error generating scatterlist "
 				"for crypt_stat encrypted session key; "
 				"expected rc = 1; got rc = [%d]. "
-				"key_rec->enc_key_size = [%d]\n", rc,
+				"key_rec->enc_key_size = [%zd]\n", rc,
 				key_rec->enc_key_size);
 		rc = -ENOMEM;
 		goto out;
@@ -2224,7 +2224,7 @@ write_tag_3_packet(char *dest, size_t *remaining_bytes,
 		goto out;
 	}
 	rc = 0;
-	ecryptfs_printk(KERN_DEBUG, "Encrypting [%d] bytes of the key\n",
+	ecryptfs_printk(KERN_DEBUG, "Encrypting [%zd] bytes of the key\n",
 			crypt_stat->key_size);
 	rc = crypto_blkcipher_encrypt(&desc, dst_sg, src_sg,
 				      (*key_rec).enc_key_size);
@@ -2235,7 +2235,7 @@ write_tag_3_packet(char *dest, size_t *remaining_bytes,
 	}
 	ecryptfs_printk(KERN_DEBUG, "This should be the encrypted key:\n");
 	if (ecryptfs_verbosity > 0) {
-		ecryptfs_printk(KERN_DEBUG, "EFEK of size [%d]:\n",
+		ecryptfs_printk(KERN_DEBUG, "EFEK of size [%zd]:\n",
 				key_rec->enc_key_size);
 		ecryptfs_dump_hex(key_rec->enc_key,
 				  key_rec->enc_key_size);
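Most of the eCryptfs changes above are printk format hygiene: size_t arguments take %zd/%zu, long values take %lx, and 64-bit offsets are cast to unsigned long long for %llx. A small userspace illustration of why %d on a size_t is wrong on LP64 targets (values hypothetical):

	#include <stdio.h>

	int main(void)
	{
		size_t key_size = 16;	/* 8 bytes wide on LP64 */
		/* "%d" would consume only half of the 8-byte vararg slot and
		 * desynchronize any arguments that follow it. */
		printf("Key size [%zu]\n", key_size);
		return 0;
	}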
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index 351038675376..758323a0f09a 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -36,6 +36,7 @@
 #include <linux/parser.h>
 #include <linux/fs_stack.h>
 #include <linux/slab.h>
+#include <linux/magic.h>
 #include "ecryptfs_kernel.h"
 
 /**
@@ -141,25 +142,12 @@ int ecryptfs_init_persistent_file(struct dentry *ecryptfs_dentry)
 	return rc;
 }
 
-/**
- * ecryptfs_interpose
- * @lower_dentry: Existing dentry in the lower filesystem
- * @dentry: ecryptfs' dentry
- * @sb: ecryptfs's super_block
- * @flags: flags to govern behavior of interpose procedure
- *
- * Interposes upper and lower dentries.
- *
- * Returns zero on success; non-zero otherwise
- */
-int ecryptfs_interpose(struct dentry *lower_dentry, struct dentry *dentry,
-		       struct super_block *sb, u32 flags)
+static struct inode *ecryptfs_get_inode(struct inode *lower_inode,
+		       struct super_block *sb)
 {
-	struct inode *lower_inode;
 	struct inode *inode;
 	int rc = 0;
 
-	lower_inode = lower_dentry->d_inode;
 	if (lower_inode->i_sb != ecryptfs_superblock_to_lower(sb)) {
 		rc = -EXDEV;
 		goto out;
@@ -189,17 +177,38 @@ int ecryptfs_interpose(struct dentry *lower_dentry, struct dentry *dentry,
 	if (special_file(lower_inode->i_mode))
 		init_special_inode(inode, lower_inode->i_mode,
 				   lower_inode->i_rdev);
-	d_set_d_op(dentry, &ecryptfs_dops);
 	fsstack_copy_attr_all(inode, lower_inode);
 	/* This size will be overwritten for real files w/ headers and
 	 * other metadata */
 	fsstack_copy_inode_size(inode, lower_inode);
+	return inode;
+out:
+	return ERR_PTR(rc);
+}
+
+/**
+ * ecryptfs_interpose
+ * @lower_dentry: Existing dentry in the lower filesystem
+ * @dentry: ecryptfs' dentry
+ * @sb: ecryptfs's super_block
+ * @flags: flags to govern behavior of interpose procedure
+ *
+ * Interposes upper and lower dentries.
+ *
+ * Returns zero on success; non-zero otherwise
+ */
+int ecryptfs_interpose(struct dentry *lower_dentry, struct dentry *dentry,
+		       struct super_block *sb, u32 flags)
+{
+	struct inode *lower_inode = lower_dentry->d_inode;
+	struct inode *inode = ecryptfs_get_inode(lower_inode, sb);
+	if (IS_ERR(inode))
+		return PTR_ERR(inode);
 	if (flags & ECRYPTFS_INTERPOSE_FLAG_D_ADD)
 		d_add(dentry, inode);
 	else
 		d_instantiate(dentry, inode);
-out:
-	return rc;
+	return 0;
 }
 
 enum { ecryptfs_opt_sig, ecryptfs_opt_ecryptfs_sig,
@@ -492,59 +501,11 @@ struct kmem_cache *ecryptfs_sb_info_cache;
 static struct file_system_type ecryptfs_fs_type;
 
 /**
- * ecryptfs_read_super
- * @sb: The ecryptfs super block
- * @dev_name: The path to mount over
- *
- * Read the super block of the lower filesystem, and use
- * ecryptfs_interpose to create our initial inode and super block
- * struct.
- */
-static int ecryptfs_read_super(struct super_block *sb, const char *dev_name)
-{
-	struct path path;
-	int rc;
-
-	rc = kern_path(dev_name, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &path);
-	if (rc) {
-		ecryptfs_printk(KERN_WARNING, "path_lookup() failed\n");
-		goto out;
-	}
-	if (path.dentry->d_sb->s_type == &ecryptfs_fs_type) {
-		rc = -EINVAL;
-		printk(KERN_ERR "Mount on filesystem of type "
-			"eCryptfs explicitly disallowed due to "
-			"known incompatibilities\n");
-		goto out_free;
-	}
-	ecryptfs_set_superblock_lower(sb, path.dentry->d_sb);
-	sb->s_maxbytes = path.dentry->d_sb->s_maxbytes;
-	sb->s_blocksize = path.dentry->d_sb->s_blocksize;
-	ecryptfs_set_dentry_lower(sb->s_root, path.dentry);
-	ecryptfs_set_dentry_lower_mnt(sb->s_root, path.mnt);
-	rc = ecryptfs_interpose(path.dentry, sb->s_root, sb, 0);
-	if (rc)
-		goto out_free;
-	rc = 0;
-	goto out;
-out_free:
-	path_put(&path);
-out:
-	return rc;
-}
-
-/**
  * ecryptfs_get_sb
  * @fs_type
  * @flags
 * @dev_name: The path to mount over
 * @raw_data: The options passed into the kernel
- *
- * The whole ecryptfs_get_sb process is broken into 3 functions:
- * ecryptfs_parse_options(): handle options passed to ecryptfs, if any
- * ecryptfs_read_super(): this accesses the lower filesystem and uses
- *                        ecryptfs_interpose to perform most of the linking
- * ecryptfs_interpose(): links the lower filesystem into ecryptfs (inode.c)
 */
 static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags,
 			const char *dev_name, void *raw_data)
@@ -553,6 +514,8 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags
 	struct ecryptfs_sb_info *sbi;
 	struct ecryptfs_dentry_info *root_info;
 	const char *err = "Getting sb failed";
+	struct inode *inode;
+	struct path path;
 	int rc;
 
 	sbi = kmem_cache_zalloc(ecryptfs_sb_info_cache, GFP_KERNEL);
@@ -575,10 +538,8 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags
 
 	s->s_flags = flags;
 	rc = bdi_setup_and_register(&sbi->bdi, "ecryptfs", BDI_CAP_MAP_COPY);
-	if (rc) {
-		deactivate_locked_super(s);
-		goto out;
-	}
+	if (rc)
+		goto out1;
 
 	ecryptfs_set_superblock_private(s, sbi);
 	s->s_bdi = &sbi->bdi;
@@ -586,34 +547,55 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags
 	/* ->kill_sb() will take care of sbi after that point */
 	sbi = NULL;
 	s->s_op = &ecryptfs_sops;
+	s->s_d_op = &ecryptfs_dops;
 
-	rc = -ENOMEM;
-	s->s_root = d_alloc(NULL, &(const struct qstr) {
-		.hash = 0,.name = "/",.len = 1});
+	err = "Reading sb failed";
+	rc = kern_path(dev_name, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &path);
+	if (rc) {
+		ecryptfs_printk(KERN_WARNING, "kern_path() failed\n");
+		goto out1;
+	}
+	if (path.dentry->d_sb->s_type == &ecryptfs_fs_type) {
+		rc = -EINVAL;
+		printk(KERN_ERR "Mount on filesystem of type "
+			"eCryptfs explicitly disallowed due to "
+			"known incompatibilities\n");
+		goto out_free;
+	}
+	ecryptfs_set_superblock_lower(s, path.dentry->d_sb);
+	s->s_maxbytes = path.dentry->d_sb->s_maxbytes;
+	s->s_blocksize = path.dentry->d_sb->s_blocksize;
+	s->s_magic = ECRYPTFS_SUPER_MAGIC;
+
+	inode = ecryptfs_get_inode(path.dentry->d_inode, s);
+	rc = PTR_ERR(inode);
+	if (IS_ERR(inode))
+		goto out_free;
+
+	s->s_root = d_alloc_root(inode);
 	if (!s->s_root) {
-		deactivate_locked_super(s);
-		goto out;
+		iput(inode);
+		rc = -ENOMEM;
+		goto out_free;
 	}
-	d_set_d_op(s->s_root, &ecryptfs_dops);
-	s->s_root->d_sb = s;
-	s->s_root->d_parent = s->s_root;
 
+	rc = -ENOMEM;
 	root_info = kmem_cache_zalloc(ecryptfs_dentry_info_cache, GFP_KERNEL);
-	if (!root_info) {
-		deactivate_locked_super(s);
-		goto out;
-	}
+	if (!root_info)
+		goto out_free;
+
 	/* ->kill_sb() will take care of root_info */
 	ecryptfs_set_dentry_private(s->s_root, root_info);
+	ecryptfs_set_dentry_lower(s->s_root, path.dentry);
+	ecryptfs_set_dentry_lower_mnt(s->s_root, path.mnt);
+
 	s->s_flags |= MS_ACTIVE;
-	rc = ecryptfs_read_super(s, dev_name);
-	if (rc) {
-		deactivate_locked_super(s);
-		err = "Reading sb failed";
-		goto out;
-	}
 	return dget(s->s_root);
 
+out_free:
+	path_put(&path);
+out1:
+	deactivate_locked_super(s);
 out:
 	if (sbi) {
 		ecryptfs_destroy_mount_crypt_stat(&sbi->mount_crypt_stat);
@@ -828,9 +810,10 @@ static int __init ecryptfs_init(void)
 		ecryptfs_printk(KERN_ERR, "The eCryptfs extent size is "
 				"larger than the host's page size, and so "
 				"eCryptfs cannot run on this system. The "
-				"default eCryptfs extent size is [%d] bytes; "
-				"the page size is [%d] bytes.\n",
-				ECRYPTFS_DEFAULT_EXTENT_SIZE, PAGE_CACHE_SIZE);
+				"default eCryptfs extent size is [%u] bytes; "
+				"the page size is [%lu] bytes.\n",
+				ECRYPTFS_DEFAULT_EXTENT_SIZE,
+				(unsigned long)PAGE_CACHE_SIZE);
 		goto out;
 	}
 	rc = ecryptfs_init_kmem_caches();
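A note on the new mount path: d_alloc_root() takes over the inode reference only when it succeeds, so the failure branch must drop it by hand. The contract, extracted from the hunk above as a sketch:

	inode = ecryptfs_get_inode(path.dentry->d_inode, s);
	if (IS_ERR(inode))
		goto out_free;		/* no dentry yet, nothing extra to drop */

	s->s_root = d_alloc_root(inode);
	if (!s->s_root) {
		iput(inode);		/* d_alloc_root() failed; release our reference */
		rc = -ENOMEM;
		goto out_free;
	}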
diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c
index b1d82756544b..cc64fca89f8d 100644
--- a/fs/ecryptfs/mmap.c
+++ b/fs/ecryptfs/mmap.c
@@ -65,7 +65,7 @@ static int ecryptfs_writepage(struct page *page, struct writeback_control *wbc)
 	rc = ecryptfs_encrypt_page(page);
 	if (rc) {
 		ecryptfs_printk(KERN_WARNING, "Error encrypting "
-				"page (upper index [0x%.16x])\n", page->index);
+				"page (upper index [0x%.16lx])\n", page->index);
 		ClearPageUptodate(page);
 		goto out;
 	}
@@ -237,7 +237,7 @@ out:
 		ClearPageUptodate(page);
 	else
 		SetPageUptodate(page);
-	ecryptfs_printk(KERN_DEBUG, "Unlocking page with index = [0x%.16x]\n",
+	ecryptfs_printk(KERN_DEBUG, "Unlocking page with index = [0x%.16lx]\n",
 			page->index);
 	unlock_page(page);
 	return rc;
@@ -290,6 +290,7 @@ static int ecryptfs_write_begin(struct file *file,
 		return -ENOMEM;
 	*pagep = page;
 
+	prev_page_end_size = ((loff_t)index << PAGE_CACHE_SHIFT);
 	if (!PageUptodate(page)) {
 		struct ecryptfs_crypt_stat *crypt_stat =
 			&ecryptfs_inode_to_private(mapping->host)->crypt_stat;
@@ -335,18 +336,23 @@ static int ecryptfs_write_begin(struct file *file,
 				SetPageUptodate(page);
 			}
 		} else {
-			rc = ecryptfs_decrypt_page(page);
-			if (rc) {
-				printk(KERN_ERR "%s: Error decrypting page "
-				       "at index [%ld]; rc = [%d]\n",
-				       __func__, page->index, rc);
-				ClearPageUptodate(page);
-				goto out;
+			if (prev_page_end_size
+			    >= i_size_read(page->mapping->host)) {
+				zero_user(page, 0, PAGE_CACHE_SIZE);
+			} else {
+				rc = ecryptfs_decrypt_page(page);
+				if (rc) {
+					printk(KERN_ERR "%s: Error decrypting "
+					       "page at index [%ld]; "
+					       "rc = [%d]\n",
+					       __func__, page->index, rc);
+					ClearPageUptodate(page);
+					goto out;
+				}
 			}
 			SetPageUptodate(page);
 		}
 	}
-	prev_page_end_size = ((loff_t)index << PAGE_CACHE_SHIFT);
 	/* If creating a page or more of holes, zero them out via truncate.
 	 * Note, this will increase i_size. */
 	if (index != 0) {
@@ -488,7 +494,7 @@ static int ecryptfs_write_end(struct file *file,
 	} else
 		ecryptfs_printk(KERN_DEBUG, "Not a new file\n");
 	ecryptfs_printk(KERN_DEBUG, "Calling fill_zeros_to_end_of_page"
-			"(page w/ index = [0x%.16x], to = [%d])\n", index, to);
+			"(page w/ index = [0x%.16lx], to = [%d])\n", index, to);
 	if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) {
 		rc = ecryptfs_write_lower_page_segment(ecryptfs_inode, page, 0,
 						       to);
@@ -503,19 +509,20 @@ static int ecryptfs_write_end(struct file *file,
 	rc = fill_zeros_to_end_of_page(page, to);
 	if (rc) {
 		ecryptfs_printk(KERN_WARNING, "Error attempting to fill "
-			"zeros in page with index = [0x%.16x]\n", index);
+			"zeros in page with index = [0x%.16lx]\n", index);
 		goto out;
 	}
 	rc = ecryptfs_encrypt_page(page);
 	if (rc) {
 		ecryptfs_printk(KERN_WARNING, "Error encrypting page (upper "
-				"index [0x%.16x])\n", index);
+				"index [0x%.16lx])\n", index);
 		goto out;
 	}
 	if (pos + copied > i_size_read(ecryptfs_inode)) {
 		i_size_write(ecryptfs_inode, pos + copied);
 		ecryptfs_printk(KERN_DEBUG, "Expanded file size to "
-				"[0x%.16x]\n", i_size_read(ecryptfs_inode));
+				"[0x%.16llx]\n",
+				(unsigned long long)i_size_read(ecryptfs_inode));
 	}
 	rc = ecryptfs_write_inode_size_to_metadata(ecryptfs_inode);
 	if (rc)
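The write_begin fix does two things: prev_page_end_size is now computed before its first use, and a page that starts at or beyond i_size is zero-filled rather than decrypted, since the lower file holds no data for it. The decision in isolation, as a sketch of the logic above:

	loff_t page_start = (loff_t)index << PAGE_CACHE_SHIFT;

	if (page_start >= i_size_read(mapping->host))
		zero_user(page, 0, PAGE_CACHE_SIZE);	/* hole: nothing on disk to decrypt */
	else
		rc = ecryptfs_decrypt_page(page);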
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 8cf07242067d..cc8a9b7d6064 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -217,7 +217,7 @@ struct ep_send_events_data {
  * Configuration options available inside /proc/sys/fs/epoll/
  */
 /* Maximum number of epoll watched descriptors, per user */
-static int max_user_watches __read_mostly;
+static long max_user_watches __read_mostly;
 
 /*
  * This mutex is used to serialize ep_free() and eventpoll_release_file().
@@ -240,16 +240,18 @@ static struct kmem_cache *pwq_cache __read_mostly;
 
 #include <linux/sysctl.h>
 
-static int zero;
+static long zero;
+static long long_max = LONG_MAX;
 
 ctl_table epoll_table[] = {
 	{
 		.procname = "max_user_watches",
 		.data = &max_user_watches,
-		.maxlen = sizeof(int),
+		.maxlen = sizeof(max_user_watches),
 		.mode = 0644,
-		.proc_handler = proc_dointvec_minmax,
+		.proc_handler = proc_doulongvec_minmax,
 		.extra1 = &zero,
+		.extra2 = &long_max,
 	},
 	{ }
 };
@@ -561,7 +563,7 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi)
 	/* At this point it is safe to free the eventpoll item */
 	kmem_cache_free(epi_cache, epi);
 
-	atomic_dec(&ep->user->epoll_watches);
+	atomic_long_dec(&ep->user->epoll_watches);
 
 	return 0;
 }
@@ -898,11 +900,12 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
 {
 	int error, revents, pwake = 0;
 	unsigned long flags;
+	long user_watches;
 	struct epitem *epi;
 	struct ep_pqueue epq;
 
-	if (unlikely(atomic_read(&ep->user->epoll_watches) >=
-		     max_user_watches))
+	user_watches = atomic_long_read(&ep->user->epoll_watches);
+	if (unlikely(user_watches >= max_user_watches))
 		return -ENOSPC;
 	if (!(epi = kmem_cache_alloc(epi_cache, GFP_KERNEL)))
 		return -ENOMEM;
@@ -966,7 +969,7 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
 
 	spin_unlock_irqrestore(&ep->lock, flags);
 
-	atomic_inc(&ep->user->epoll_watches);
+	atomic_long_inc(&ep->user->epoll_watches);
 
 	/* We have to call this outside the lock */
 	if (pwake)
@@ -1426,6 +1429,7 @@ static int __init eventpoll_init(void)
 	 */
 	max_user_watches = (((si.totalram - si.totalhigh) / 25) << PAGE_SHIFT) /
 		EP_ITEM_COST;
+	BUG_ON(max_user_watches < 0);
 
 	/* Initialize the structure used to perform safe poll wait head wake ups */
 	ep_nested_calls_init(&poll_safewake_ncalls);
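Why max_user_watches becomes a long: the budget is computed in bytes as (lowmem / 25) << PAGE_SHIFT, and with enough RAM that intermediate value exceeds what a 32-bit int can hold, so the stored limit could wrap negative. A userspace sketch of the truncation on an LP64 machine (memory size and per-watch cost hypothetical):

	#include <stdio.h>

	int main(void)
	{
		unsigned long totalram = 1UL << 31;	/* ~8 TiB worth of 4 KiB pages */
		long budget = ((totalram / 25) << 12) / 90;	/* ~90 bytes per watch */

		printf("as long: %ld, as int: %d\n", budget, (int)budget);
		return 0;
	}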
diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c
index 2709b34206ab..47cda410b548 100644
--- a/fs/ext2/dir.c
+++ b/fs/ext2/dir.c
@@ -28,21 +28,30 @@
 
 typedef struct ext2_dir_entry_2 ext2_dirent;
 
+/*
+ * Tests against MAX_REC_LEN etc were put in place for 64k block
+ * sizes; if that is not possible on this arch, we can skip
+ * those tests and speed things up.
+ */
 static inline unsigned ext2_rec_len_from_disk(__le16 dlen)
 {
 	unsigned len = le16_to_cpu(dlen);
 
+#if (PAGE_CACHE_SIZE >= 65536)
 	if (len == EXT2_MAX_REC_LEN)
 		return 1 << 16;
+#endif
 	return len;
 }
 
 static inline __le16 ext2_rec_len_to_disk(unsigned len)
 {
+#if (PAGE_CACHE_SIZE >= 65536)
 	if (len == (1 << 16))
 		return cpu_to_le16(EXT2_MAX_REC_LEN);
 	else
 		BUG_ON(len > (1 << 16));
+#endif
 	return cpu_to_le16(len);
 }
 
@@ -129,15 +138,15 @@ static void ext2_check_page(struct page *page, int quiet)
 		p = (ext2_dirent *)(kaddr + offs);
 		rec_len = ext2_rec_len_from_disk(p->rec_len);
 
-		if (rec_len < EXT2_DIR_REC_LEN(1))
+		if (unlikely(rec_len < EXT2_DIR_REC_LEN(1)))
 			goto Eshort;
-		if (rec_len & 3)
+		if (unlikely(rec_len & 3))
 			goto Ealign;
-		if (rec_len < EXT2_DIR_REC_LEN(p->name_len))
+		if (unlikely(rec_len < EXT2_DIR_REC_LEN(p->name_len)))
 			goto Enamelen;
-		if (((offs + rec_len - 1) ^ offs) & ~(chunk_size-1))
+		if (unlikely(((offs + rec_len - 1) ^ offs) & ~(chunk_size-1)))
 			goto Espan;
-		if (le32_to_cpu(p->inode) > max_inumber)
+		if (unlikely(le32_to_cpu(p->inode) > max_inumber))
 			goto Einumber;
 	}
 	if (offs != limit)
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index f8aecd2e3297..2e1d8341d827 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -67,7 +67,7 @@ static struct dentry *ext2_lookup(struct inode * dir, struct dentry *dentry, str
 	inode = NULL;
 	if (ino) {
 		inode = ext2_iget(dir->i_sb, ino);
-		if (unlikely(IS_ERR(inode))) {
+		if (IS_ERR(inode)) {
 			if (PTR_ERR(inode) == -ESTALE) {
 				ext2_error(dir->i_sb, __func__,
 					"deleted inode referenced: %lu",
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index e0c6380ff992..7731695e65d9 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -43,9 +43,10 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data);
 static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf);
 static int ext2_sync_fs(struct super_block *sb, int wait);
 
-void ext2_error (struct super_block * sb, const char * function,
-		 const char * fmt, ...)
+void ext2_error(struct super_block *sb, const char *function,
+		const char *fmt, ...)
 {
+	struct va_format vaf;
 	va_list args;
 	struct ext2_sb_info *sbi = EXT2_SB(sb);
 	struct ext2_super_block *es = sbi->s_es;
@@ -59,9 +60,13 @@ void ext2_error (struct super_block * sb, const char * function,
 	}
 
 	va_start(args, fmt);
-	printk(KERN_CRIT "EXT2-fs (%s): error: %s: ", sb->s_id, function);
-	vprintk(fmt, args);
-	printk("\n");
+
+	vaf.fmt = fmt;
+	vaf.va = &args;
+
+	printk(KERN_CRIT "EXT2-fs (%s): error: %s: %pV\n",
+	       sb->s_id, function, &vaf);
+
 	va_end(args);
 
 	if (test_opt(sb, ERRORS_PANIC))
@@ -76,12 +81,16 @@ void ext2_msg(struct super_block *sb, const char *prefix,
 void ext2_msg(struct super_block *sb, const char *prefix,
 	      const char *fmt, ...)
 {
+	struct va_format vaf;
 	va_list args;
 
 	va_start(args, fmt);
-	printk("%sEXT2-fs (%s): ", prefix, sb->s_id);
-	vprintk(fmt, args);
-	printk("\n");
+
+	vaf.fmt = fmt;
+	vaf.va = &args;
+
+	printk("%sEXT2-fs (%s): %pV\n", prefix, sb->s_id, &vaf);
+
 	va_end(args);
 }
 
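The ext2_error()/ext2_msg() conversion relies on the kernel's %pV printk extension: the format string and va_list are bundled into a struct va_format and expanded inside a single printk() call, so the prefix, message and trailing newline can no longer be interleaved with concurrent printk output. The pattern on its own, as a sketch mirroring the hunks above:

	struct va_format vaf;
	va_list args;

	va_start(args, fmt);
	vaf.fmt = fmt;
	vaf.va = &args;
	printk(KERN_CRIT "EXT2-fs (%s): error: %s: %pV\n", sb->s_id, function, &vaf);
	va_end(args);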
diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c
index f84700be3274..c2e4dce984d2 100644
--- a/fs/ext2/xattr.c
+++ b/fs/ext2/xattr.c
@@ -199,14 +199,6 @@ bad_block: ext2_error(inode->i_sb, "ext2_xattr_get",
 			goto found;
 		entry = next;
 	}
-	/* Check the remaining name entries */
-	while (!IS_LAST_ENTRY(entry)) {
-		struct ext2_xattr_entry *next =
-			EXT2_XATTR_NEXT(entry);
-		if ((char *)next >= end)
-			goto bad_block;
-		entry = next;
-	}
 	if (ext2_xattr_cache_insert(bh))
 		ea_idebug(inode, "cache insert failed");
 	error = -ENODATA;
@@ -355,7 +347,7 @@ static void ext2_xattr_update_super_block(struct super_block *sb)
 /*
  * ext2_xattr_set()
  *
- * Create, replace or remove an extended attribute for this inode. Buffer
+ * Create, replace or remove an extended attribute for this inode. Value
  * is NULL to remove an existing extended attribute, and non-NULL to
 * either replace an existing extended attribute, or create a new extended
 * attribute. The flags XATTR_REPLACE and XATTR_CREATE
diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c
index b3db22649426..045995c8ce5a 100644
--- a/fs/ext3/balloc.c
+++ b/fs/ext3/balloc.c
@@ -20,6 +20,7 @@
 #include <linux/ext3_jbd.h>
 #include <linux/quotaops.h>
 #include <linux/buffer_head.h>
+#include <linux/blkdev.h>
 
 /*
  * balloc.c contains the blocks allocation and deallocation routines
@@ -39,6 +40,21 @@
 
 #define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)
 
+/*
+ * Calculate the block group number and offset, given a block number
+ */
+static void ext3_get_group_no_and_offset(struct super_block *sb,
+	ext3_fsblk_t blocknr, unsigned long *blockgrpp, ext3_grpblk_t *offsetp)
+{
+	struct ext3_super_block *es = EXT3_SB(sb)->s_es;
+
+	blocknr = blocknr - le32_to_cpu(es->s_first_data_block);
+	if (offsetp)
+		*offsetp = blocknr % EXT3_BLOCKS_PER_GROUP(sb);
+	if (blockgrpp)
+		*blockgrpp = blocknr / EXT3_BLOCKS_PER_GROUP(sb);
+}
+
 /**
  * ext3_get_group_desc() -- load group descriptor from disk
 * @sb: super block
@@ -1885,3 +1901,253 @@ unsigned long ext3_bg_num_gdb(struct super_block *sb, int group)
1885 return ext3_bg_num_gdb_meta(sb,group); 1901 return ext3_bg_num_gdb_meta(sb,group);
1886 1902
1887} 1903}
1904
1905/**
1906 * ext3_trim_all_free -- function to trim all free space in alloc. group
1907 * @sb: super block for file system
1908 * @group: allocation group to trim
1909 * @start: first group block to examine
1910 * @max: last group block to examine
1911 * @gdp: allocation group description structure
1912 * @minblocks: minimum extent block count
1913 *
1914 * ext3_trim_all_free walks through group's block bitmap searching for free
1915 * blocks. When the free block is found, it tries to allocate this block and
1916 * consequent free block to get the biggest free extent possible, until it
1917 * reaches any used block. Then issue a TRIM command on this extent and free
1918 * the extent in the block bitmap. This is done until whole group is scanned.
1919 */
1920ext3_grpblk_t ext3_trim_all_free(struct super_block *sb, unsigned int group,
1921 ext3_grpblk_t start, ext3_grpblk_t max,
1922 ext3_grpblk_t minblocks)
1923{
1924 handle_t *handle;
1925 ext3_grpblk_t next, free_blocks, bit, freed, count = 0;
1926 ext3_fsblk_t discard_block;
1927 struct ext3_sb_info *sbi;
1928 struct buffer_head *gdp_bh, *bitmap_bh = NULL;
1929 struct ext3_group_desc *gdp;
1930 int err = 0, ret = 0;
1931
1932 /*
1933 * We will update one block bitmap, and one group descriptor
1934 */
1935 handle = ext3_journal_start_sb(sb, 2);
1936 if (IS_ERR(handle))
1937 return PTR_ERR(handle);
1938
1939 bitmap_bh = read_block_bitmap(sb, group);
1940 if (!bitmap_bh) {
1941 err = -EIO;
1942 goto err_out;
1943 }
1944
1945 BUFFER_TRACE(bitmap_bh, "getting undo access");
1946 err = ext3_journal_get_undo_access(handle, bitmap_bh);
1947 if (err)
1948 goto err_out;
1949
1950 gdp = ext3_get_group_desc(sb, group, &gdp_bh);
1951 if (!gdp) {
1952 err = -EIO;
1953 goto err_out;
1954 }
1955
1956 BUFFER_TRACE(gdp_bh, "get_write_access");
1957 err = ext3_journal_get_write_access(handle, gdp_bh);
1958 if (err)
1959 goto err_out;
1960
1961 free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
1962 sbi = EXT3_SB(sb);
1963
1964 /* Walk through the whole group */
1965 while (start < max) {
1966 start = bitmap_search_next_usable_block(start, bitmap_bh, max);
1967 if (start < 0)
1968 break;
1969 next = start;
1970
1971 /*
1972 * Allocate contiguous free extents by setting bits in the
1973 * block bitmap
1974 */
1975 while (next < max
1976 && claim_block(sb_bgl_lock(sbi, group),
1977 next, bitmap_bh)) {
1978 next++;
1979 }
1980
1981 /* We did not claim any blocks */
1982 if (next == start)
1983 continue;
1984
1985 discard_block = (ext3_fsblk_t)start +
1986 ext3_group_first_block_no(sb, group);
1987
1988 /* Update counters */
1989 spin_lock(sb_bgl_lock(sbi, group));
1990 le16_add_cpu(&gdp->bg_free_blocks_count, start - next);
1991 spin_unlock(sb_bgl_lock(sbi, group));
1992 percpu_counter_sub(&sbi->s_freeblocks_counter, next - start);
1993
1994 /* Do not issue a TRIM on extents smaller than minblocks */
1995 if ((next - start) < minblocks)
1996 goto free_extent;
1997
1998 /* Send the TRIM command down to the device */
1999 err = sb_issue_discard(sb, discard_block, next - start,
2000 GFP_NOFS, 0);
2001 count += (next - start);
2002free_extent:
2003 freed = 0;
2004
2005 /*
2006 * Clear bits in the bitmap
2007 */
2008 for (bit = start; bit < next; bit++) {
2009 BUFFER_TRACE(bitmap_bh, "clear bit");
2010 if (!ext3_clear_bit_atomic(sb_bgl_lock(sbi, group),
2011 bit, bitmap_bh->b_data)) {
2012 ext3_error(sb, __func__,
2013 "bit already cleared for block "E3FSBLK,
2014 (unsigned long)bit);
2015 BUFFER_TRACE(bitmap_bh, "bit already cleared");
2016 } else {
2017 freed++;
2018 }
2019 }
2020
 2021 /* Update counters */
2022 spin_lock(sb_bgl_lock(sbi, group));
2023 le16_add_cpu(&gdp->bg_free_blocks_count, freed);
2024 spin_unlock(sb_bgl_lock(sbi, group));
2025 percpu_counter_add(&sbi->s_freeblocks_counter, freed);
2026
2027 start = next;
2028 if (err < 0) {
2029 if (err != -EOPNOTSUPP)
2030 ext3_warning(sb, __func__, "Discard command "
2031 "returned error %d\n", err);
2032 break;
2033 }
2034
2035 if (fatal_signal_pending(current)) {
2036 err = -ERESTARTSYS;
2037 break;
2038 }
2039
2040 cond_resched();
2041
2042 /* No more suitable extents */
2043 if ((free_blocks - count) < minblocks)
2044 break;
2045 }
2046
2047 /* We dirtied the bitmap block */
2048 BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
2049 ret = ext3_journal_dirty_metadata(handle, bitmap_bh);
2050 if (!err)
2051 err = ret;
2052
2053 /* And the group descriptor block */
2054 BUFFER_TRACE(gdp_bh, "dirtied group descriptor block");
2055 ret = ext3_journal_dirty_metadata(handle, gdp_bh);
2056 if (!err)
2057 err = ret;
2058
2059 ext3_debug("trimmed %d blocks in the group %d\n",
2060 count, group);
2061
2062err_out:
2063 if (err)
2064 count = err;
2065 ext3_journal_stop(handle);
2066 brelse(bitmap_bh);
2067
2068 return count;
2069}
2070
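
Condensed, illustrative sketch of the scan loop above (journaling, counter updates and error handling elided; identifiers are those used in the patch). The key point is that each run of free blocks is first claimed in the bitmap, so the allocator cannot hand it out while the discard is in flight, and it is only released back afterwards:

	while (start < max) {
		start = bitmap_search_next_usable_block(start, bitmap_bh, max);
		if (start < 0)
			break;
		/* claim as many consecutive free blocks as possible */
		next = start;
		while (next < max &&
		       claim_block(sb_bgl_lock(sbi, group), next, bitmap_bh))
			next++;
		if (next == start)		/* lost the race, move on */
			continue;
		if (next - start >= minblocks)
			/* discard_block = start + ext3_group_first_block_no() */
			sb_issue_discard(sb, discard_block, next - start,
					 GFP_NOFS, 0);
		/* release the claimed run back to the bitmap */
		for (bit = start; bit < next; bit++)
			ext3_clear_bit_atomic(sb_bgl_lock(sbi, group),
					      bit, bitmap_bh->b_data);
		start = next;
	}
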
2071/**
 2072 * ext3_trim_fs() -- trim ioctl handler
 2073 * @sb: superblock for filesystem
 2074 * @start: first byte to trim
 2075 * @len: number of bytes to trim from start
 2076 * @minlen: minimum extent length in bytes
2077 *
 2078 * ext3_trim_fs goes through all the allocation groups containing bytes from
 2079 * start to start+len. For each such group, the ext3_trim_all_free function
 2080 * is invoked to trim all free space.
2081 */
2082int ext3_trim_fs(struct super_block *sb, struct fstrim_range *range)
2083{
2084 ext3_grpblk_t last_block, first_block, free_blocks;
2085 unsigned long first_group, last_group;
2086 unsigned long group, ngroups;
2087 struct ext3_group_desc *gdp;
2088 struct ext3_super_block *es = EXT3_SB(sb)->s_es;
2089 uint64_t start, len, minlen, trimmed;
2090 ext3_fsblk_t max_blks = le32_to_cpu(es->s_blocks_count);
2091 int ret = 0;
2092
2093 start = range->start >> sb->s_blocksize_bits;
2094 len = range->len >> sb->s_blocksize_bits;
2095 minlen = range->minlen >> sb->s_blocksize_bits;
2096 trimmed = 0;
2097
2098 if (unlikely(minlen > EXT3_BLOCKS_PER_GROUP(sb)))
2099 return -EINVAL;
2100 if (start >= max_blks)
2101 goto out;
2102 if (start < le32_to_cpu(es->s_first_data_block)) {
2103 len -= le32_to_cpu(es->s_first_data_block) - start;
2104 start = le32_to_cpu(es->s_first_data_block);
2105 }
2106 if (start + len > max_blks)
2107 len = max_blks - start;
2108
2109 ngroups = EXT3_SB(sb)->s_groups_count;
2110 smp_rmb();
2111
2112 /* Determine first and last group to examine based on start and len */
2113 ext3_get_group_no_and_offset(sb, (ext3_fsblk_t) start,
2114 &first_group, &first_block);
2115 ext3_get_group_no_and_offset(sb, (ext3_fsblk_t) (start + len),
2116 &last_group, &last_block);
2117 last_group = (last_group > ngroups - 1) ? ngroups - 1 : last_group;
2118 last_block = EXT3_BLOCKS_PER_GROUP(sb);
2119
2120 if (first_group > last_group)
2121 return -EINVAL;
2122
2123 for (group = first_group; group <= last_group; group++) {
2124 gdp = ext3_get_group_desc(sb, group, NULL);
2125 if (!gdp)
2126 break;
2127
2128 free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
2129 if (free_blocks < minlen)
2130 continue;
2131
2132 if (len >= EXT3_BLOCKS_PER_GROUP(sb))
2133 len -= (EXT3_BLOCKS_PER_GROUP(sb) - first_block);
2134 else
2135 last_block = first_block + len;
2136
2137 ret = ext3_trim_all_free(sb, group, first_block,
2138 last_block, minlen);
2139 if (ret < 0)
2140 break;
2141
2142 trimmed += ret;
2143 first_block = 0;
2144 }
2145
2146 if (ret >= 0)
2147 ret = 0;
2148
2149out:
2150 range->len = trimmed * sb->s_blocksize;
2151
2152 return ret;
2153}
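
A worked example of the range-to-group mapping, assuming 4 KiB blocks (s_blocksize_bits == 12, so 32768 blocks per group) and s_first_data_block == 0; all numbers are illustrative:

	/*
	 * range->start = 1 GiB   ->  start = 1 GiB >> 12   = block 262144
	 * range->len   = 512 MiB ->  len   = 512 MiB >> 12 = 131072 blocks
	 *
	 * ext3_get_group_no_and_offset(sb, 262144, ...) -> group 8,  offset 0
	 * ext3_get_group_no_and_offset(sb, 393216, ...) -> group 12, offset 0
	 *
	 * so the loop hands groups 8 through 12 to ext3_trim_all_free().
	 */
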
diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c
index e2e72c367cf6..34f0a072b935 100644
--- a/fs/ext3/dir.c
+++ b/fs/ext3/dir.c
@@ -69,25 +69,26 @@ int ext3_check_dir_entry (const char * function, struct inode * dir,
69 const char * error_msg = NULL; 69 const char * error_msg = NULL;
70 const int rlen = ext3_rec_len_from_disk(de->rec_len); 70 const int rlen = ext3_rec_len_from_disk(de->rec_len);
71 71
72 if (rlen < EXT3_DIR_REC_LEN(1)) 72 if (unlikely(rlen < EXT3_DIR_REC_LEN(1)))
73 error_msg = "rec_len is smaller than minimal"; 73 error_msg = "rec_len is smaller than minimal";
74 else if (rlen % 4 != 0) 74 else if (unlikely(rlen % 4 != 0))
75 error_msg = "rec_len % 4 != 0"; 75 error_msg = "rec_len % 4 != 0";
76 else if (rlen < EXT3_DIR_REC_LEN(de->name_len)) 76 else if (unlikely(rlen < EXT3_DIR_REC_LEN(de->name_len)))
77 error_msg = "rec_len is too small for name_len"; 77 error_msg = "rec_len is too small for name_len";
78 else if (((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize) 78 else if (unlikely((((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize)))
79 error_msg = "directory entry across blocks"; 79 error_msg = "directory entry across blocks";
80 else if (le32_to_cpu(de->inode) > 80 else if (unlikely(le32_to_cpu(de->inode) >
81 le32_to_cpu(EXT3_SB(dir->i_sb)->s_es->s_inodes_count)) 81 le32_to_cpu(EXT3_SB(dir->i_sb)->s_es->s_inodes_count)))
82 error_msg = "inode out of bounds"; 82 error_msg = "inode out of bounds";
83 83
84 if (error_msg != NULL) 84 if (unlikely(error_msg != NULL))
85 ext3_error (dir->i_sb, function, 85 ext3_error (dir->i_sb, function,
86 "bad entry in directory #%lu: %s - " 86 "bad entry in directory #%lu: %s - "
87 "offset=%lu, inode=%lu, rec_len=%d, name_len=%d", 87 "offset=%lu, inode=%lu, rec_len=%d, name_len=%d",
88 dir->i_ino, error_msg, offset, 88 dir->i_ino, error_msg, offset,
89 (unsigned long) le32_to_cpu(de->inode), 89 (unsigned long) le32_to_cpu(de->inode),
90 rlen, de->name_len); 90 rlen, de->name_len);
91
91 return error_msg == NULL ? 1 : 0; 92 return error_msg == NULL ? 1 : 0;
92} 93}
93 94
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index a9580617edd2..ae94f6d949f5 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -2145,13 +2145,15 @@ static void ext3_clear_blocks(handle_t *handle, struct inode *inode,
2145 if (try_to_extend_transaction(handle, inode)) { 2145 if (try_to_extend_transaction(handle, inode)) {
2146 if (bh) { 2146 if (bh) {
2147 BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); 2147 BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
2148 ext3_journal_dirty_metadata(handle, bh); 2148 if (ext3_journal_dirty_metadata(handle, bh))
2149 return;
2149 } 2150 }
2150 ext3_mark_inode_dirty(handle, inode); 2151 ext3_mark_inode_dirty(handle, inode);
2151 truncate_restart_transaction(handle, inode); 2152 truncate_restart_transaction(handle, inode);
2152 if (bh) { 2153 if (bh) {
2153 BUFFER_TRACE(bh, "retaking write access"); 2154 BUFFER_TRACE(bh, "retaking write access");
2154 ext3_journal_get_write_access(handle, bh); 2155 if (ext3_journal_get_write_access(handle, bh))
2156 return;
2155 } 2157 }
2156 } 2158 }
2157 2159
diff --git a/fs/ext3/ioctl.c b/fs/ext3/ioctl.c
index 88974814783a..fc080dd561f7 100644
--- a/fs/ext3/ioctl.c
+++ b/fs/ext3/ioctl.c
@@ -276,7 +276,29 @@ group_add_out:
276 mnt_drop_write(filp->f_path.mnt); 276 mnt_drop_write(filp->f_path.mnt);
277 return err; 277 return err;
278 } 278 }
279 case FITRIM: {
279 280
281 struct super_block *sb = inode->i_sb;
282 struct fstrim_range range;
283 int ret = 0;
284
285 if (!capable(CAP_SYS_ADMIN))
286 return -EPERM;
287
288 if (copy_from_user(&range, (struct fstrim_range *)arg,
289 sizeof(range)))
290 return -EFAULT;
291
292 ret = ext3_trim_fs(sb, &range);
293 if (ret < 0)
294 return ret;
295
296 if (copy_to_user((struct fstrim_range *)arg, &range,
297 sizeof(range)))
298 return -EFAULT;
299
300 return 0;
301 }
280 302
281 default: 303 default:
282 return -ENOTTY; 304 return -ENOTTY;
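
For context, a minimal userspace sketch of how this new case is driven; it uses only the standard FITRIM interface from <linux/fs.h> (the mount-point path is an example, and the caller needs CAP_SYS_ADMIN, as checked above):

	#include <fcntl.h>
	#include <stdio.h>
	#include <stdint.h>
	#include <sys/ioctl.h>
	#include <linux/fs.h>		/* FITRIM, struct fstrim_range */

	int main(void)
	{
		struct fstrim_range range = {
			.start  = 0,
			.len    = UINT64_MAX,	/* trim the whole filesystem */
			.minlen = 0,		/* no minimum extent size */
		};
		int fd = open("/mnt", O_RDONLY); /* any fd on the ext3 fs */

		if (fd < 0 || ioctl(fd, FITRIM, &range) < 0) {
			perror("FITRIM");
			return 1;
		}
		/* on return the kernel has stored the trimmed byte count */
		printf("trimmed %llu bytes\n", (unsigned long long)range.len);
		return 0;
	}
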
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index bce9dce639b8..b27ba71810ec 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -858,6 +858,7 @@ static struct buffer_head *ext3_find_entry(struct inode *dir,
858 struct buffer_head * bh_use[NAMEI_RA_SIZE]; 858 struct buffer_head * bh_use[NAMEI_RA_SIZE];
859 struct buffer_head * bh, *ret = NULL; 859 struct buffer_head * bh, *ret = NULL;
860 unsigned long start, block, b; 860 unsigned long start, block, b;
861 const u8 *name = entry->name;
861 int ra_max = 0; /* Number of bh's in the readahead 862 int ra_max = 0; /* Number of bh's in the readahead
862 buffer, bh_use[] */ 863 buffer, bh_use[] */
863 int ra_ptr = 0; /* Current index into readahead 864 int ra_ptr = 0; /* Current index into readahead
@@ -871,6 +872,16 @@ static struct buffer_head *ext3_find_entry(struct inode *dir,
871 namelen = entry->len; 872 namelen = entry->len;
872 if (namelen > EXT3_NAME_LEN) 873 if (namelen > EXT3_NAME_LEN)
873 return NULL; 874 return NULL;
875 if ((namelen <= 2) && (name[0] == '.') &&
876 (name[1] == '.' || name[1] == 0)) {
877 /*
878 * "." or ".." will only be in the first block
879 * NFS may look up ".."; "." should be handled by the VFS
880 */
881 block = start = 0;
882 nblocks = 1;
883 goto restart;
884 }
874 if (is_dx(dir)) { 885 if (is_dx(dir)) {
875 bh = ext3_dx_find_entry(dir, entry, res_dir, &err); 886 bh = ext3_dx_find_entry(dir, entry, res_dir, &err);
876 /* 887 /*
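
The fast path is safe because of the on-disk layout that ext3_mkdir (later in this patch) establishes: every directory's first block begins with the "." and ".." entries, so neither name ever requires the htree walk. Schematically:

	/*
	 * First block of an ext3 directory (illustrative):
	 *
	 *   offset 0 : { inode = <self>,   name_len = 1, name = "."  }
	 *   next     : { inode = <parent>, name_len = 2, name = ".." }
	 *   ...      : regular entries (or dx_root for indexed dirs)
	 */
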
@@ -961,55 +972,35 @@ static struct buffer_head * ext3_dx_find_entry(struct inode *dir,
961 struct qstr *entry, struct ext3_dir_entry_2 **res_dir, 972 struct qstr *entry, struct ext3_dir_entry_2 **res_dir,
962 int *err) 973 int *err)
963{ 974{
964 struct super_block * sb; 975 struct super_block *sb = dir->i_sb;
965 struct dx_hash_info hinfo; 976 struct dx_hash_info hinfo;
966 u32 hash;
967 struct dx_frame frames[2], *frame; 977 struct dx_frame frames[2], *frame;
968 struct ext3_dir_entry_2 *de, *top;
969 struct buffer_head *bh; 978 struct buffer_head *bh;
970 unsigned long block; 979 unsigned long block;
971 int retval; 980 int retval;
972 int namelen = entry->len;
973 const u8 *name = entry->name;
974 981
975 sb = dir->i_sb; 982 if (!(frame = dx_probe(entry, dir, &hinfo, frames, err)))
976 /* NFS may look up ".." - look at dx_root directory block */ 983 return NULL;
977 if (namelen > 2 || name[0] != '.'|| (namelen == 2 && name[1] != '.')) {
978 if (!(frame = dx_probe(entry, dir, &hinfo, frames, err)))
979 return NULL;
980 } else {
981 frame = frames;
982 frame->bh = NULL; /* for dx_release() */
983 frame->at = (struct dx_entry *)frames; /* hack for zero entry*/
984 dx_set_block(frame->at, 0); /* dx_root block is 0 */
985 }
986 hash = hinfo.hash;
987 do { 984 do {
988 block = dx_get_block(frame->at); 985 block = dx_get_block(frame->at);
989 if (!(bh = ext3_bread (NULL,dir, block, 0, err))) 986 if (!(bh = ext3_bread (NULL,dir, block, 0, err)))
990 goto errout; 987 goto errout;
991 de = (struct ext3_dir_entry_2 *) bh->b_data;
992 top = (struct ext3_dir_entry_2 *) ((char *) de + sb->s_blocksize -
993 EXT3_DIR_REC_LEN(0));
994 for (; de < top; de = ext3_next_entry(de)) {
995 int off = (block << EXT3_BLOCK_SIZE_BITS(sb))
996 + ((char *) de - bh->b_data);
997
998 if (!ext3_check_dir_entry(__func__, dir, de, bh, off)) {
999 brelse(bh);
1000 *err = ERR_BAD_DX_DIR;
1001 goto errout;
1002 }
1003 988
1004 if (ext3_match(namelen, name, de)) { 989 retval = search_dirblock(bh, dir, entry,
1005 *res_dir = de; 990 block << EXT3_BLOCK_SIZE_BITS(sb),
1006 dx_release(frames); 991 res_dir);
1007 return bh; 992 if (retval == 1) {
1008 } 993 dx_release(frames);
994 return bh;
1009 } 995 }
1010 brelse (bh); 996 brelse(bh);
997 if (retval == -1) {
998 *err = ERR_BAD_DX_DIR;
999 goto errout;
1000 }
1001
1011 /* Check to see if we should continue to search */ 1002 /* Check to see if we should continue to search */
1012 retval = ext3_htree_next_block(dir, hash, frame, 1003 retval = ext3_htree_next_block(dir, hinfo.hash, frame,
1013 frames, NULL); 1004 frames, NULL);
1014 if (retval < 0) { 1005 if (retval < 0) {
1015 ext3_warning(sb, __func__, 1006 ext3_warning(sb, __func__,
@@ -1047,7 +1038,7 @@ static struct dentry *ext3_lookup(struct inode * dir, struct dentry *dentry, str
1047 return ERR_PTR(-EIO); 1038 return ERR_PTR(-EIO);
1048 } 1039 }
1049 inode = ext3_iget(dir->i_sb, ino); 1040 inode = ext3_iget(dir->i_sb, ino);
1050 if (unlikely(IS_ERR(inode))) { 1041 if (IS_ERR(inode)) {
1051 if (PTR_ERR(inode) == -ESTALE) { 1042 if (PTR_ERR(inode) == -ESTALE) {
1052 ext3_error(dir->i_sb, __func__, 1043 ext3_error(dir->i_sb, __func__,
1053 "deleted inode referenced: %lu", 1044 "deleted inode referenced: %lu",
@@ -1607,7 +1598,9 @@ static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry,
1607 if (err) 1598 if (err)
1608 goto journal_error; 1599 goto journal_error;
1609 } 1600 }
1610 ext3_journal_dirty_metadata(handle, frames[0].bh); 1601 err = ext3_journal_dirty_metadata(handle, frames[0].bh);
1602 if (err)
1603 goto journal_error;
1611 } 1604 }
1612 de = do_split(handle, dir, &bh, frame, &hinfo, &err); 1605 de = do_split(handle, dir, &bh, frame, &hinfo, &err);
1613 if (!de) 1606 if (!de)
@@ -1644,8 +1637,13 @@ static int ext3_delete_entry (handle_t *handle,
1644 if (!ext3_check_dir_entry("ext3_delete_entry", dir, de, bh, i)) 1637 if (!ext3_check_dir_entry("ext3_delete_entry", dir, de, bh, i))
1645 return -EIO; 1638 return -EIO;
1646 if (de == de_del) { 1639 if (de == de_del) {
1640 int err;
1641
1647 BUFFER_TRACE(bh, "get_write_access"); 1642 BUFFER_TRACE(bh, "get_write_access");
1648 ext3_journal_get_write_access(handle, bh); 1643 err = ext3_journal_get_write_access(handle, bh);
1644 if (err)
1645 goto journal_error;
1646
1649 if (pde) 1647 if (pde)
1650 pde->rec_len = ext3_rec_len_to_disk( 1648 pde->rec_len = ext3_rec_len_to_disk(
1651 ext3_rec_len_from_disk(pde->rec_len) + 1649 ext3_rec_len_from_disk(pde->rec_len) +
@@ -1654,7 +1652,12 @@ static int ext3_delete_entry (handle_t *handle,
1654 de->inode = 0; 1652 de->inode = 0;
1655 dir->i_version++; 1653 dir->i_version++;
1656 BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); 1654 BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
1657 ext3_journal_dirty_metadata(handle, bh); 1655 err = ext3_journal_dirty_metadata(handle, bh);
1656 if (err) {
1657journal_error:
1658 ext3_std_error(dir->i_sb, err);
1659 return err;
1660 }
1658 return 0; 1661 return 0;
1659 } 1662 }
1660 i += ext3_rec_len_from_disk(de->rec_len); 1663 i += ext3_rec_len_from_disk(de->rec_len);
@@ -1762,7 +1765,7 @@ static int ext3_mkdir(struct inode * dir, struct dentry * dentry, int mode)
1762{ 1765{
1763 handle_t *handle; 1766 handle_t *handle;
1764 struct inode * inode; 1767 struct inode * inode;
1765 struct buffer_head * dir_block; 1768 struct buffer_head * dir_block = NULL;
1766 struct ext3_dir_entry_2 * de; 1769 struct ext3_dir_entry_2 * de;
1767 int err, retries = 0; 1770 int err, retries = 0;
1768 1771
@@ -1790,15 +1793,14 @@ retry:
1790 inode->i_fop = &ext3_dir_operations; 1793 inode->i_fop = &ext3_dir_operations;
1791 inode->i_size = EXT3_I(inode)->i_disksize = inode->i_sb->s_blocksize; 1794 inode->i_size = EXT3_I(inode)->i_disksize = inode->i_sb->s_blocksize;
1792 dir_block = ext3_bread (handle, inode, 0, 1, &err); 1795 dir_block = ext3_bread (handle, inode, 0, 1, &err);
1793 if (!dir_block) { 1796 if (!dir_block)
1794 drop_nlink(inode); /* is this nlink == 0? */ 1797 goto out_clear_inode;
1795 unlock_new_inode(inode); 1798
1796 ext3_mark_inode_dirty(handle, inode);
1797 iput (inode);
1798 goto out_stop;
1799 }
1800 BUFFER_TRACE(dir_block, "get_write_access"); 1799 BUFFER_TRACE(dir_block, "get_write_access");
1801 ext3_journal_get_write_access(handle, dir_block); 1800 err = ext3_journal_get_write_access(handle, dir_block);
1801 if (err)
1802 goto out_clear_inode;
1803
1802 de = (struct ext3_dir_entry_2 *) dir_block->b_data; 1804 de = (struct ext3_dir_entry_2 *) dir_block->b_data;
1803 de->inode = cpu_to_le32(inode->i_ino); 1805 de->inode = cpu_to_le32(inode->i_ino);
1804 de->name_len = 1; 1806 de->name_len = 1;
@@ -1814,11 +1816,16 @@ retry:
1814 ext3_set_de_type(dir->i_sb, de, S_IFDIR); 1816 ext3_set_de_type(dir->i_sb, de, S_IFDIR);
1815 inode->i_nlink = 2; 1817 inode->i_nlink = 2;
1816 BUFFER_TRACE(dir_block, "call ext3_journal_dirty_metadata"); 1818 BUFFER_TRACE(dir_block, "call ext3_journal_dirty_metadata");
1817 ext3_journal_dirty_metadata(handle, dir_block); 1819 err = ext3_journal_dirty_metadata(handle, dir_block);
1818 brelse (dir_block); 1820 if (err)
1819 ext3_mark_inode_dirty(handle, inode); 1821 goto out_clear_inode;
1820 err = ext3_add_entry (handle, dentry, inode); 1822
1823 err = ext3_mark_inode_dirty(handle, inode);
1824 if (!err)
1825 err = ext3_add_entry (handle, dentry, inode);
1826
1821 if (err) { 1827 if (err) {
1828out_clear_inode:
1822 inode->i_nlink = 0; 1829 inode->i_nlink = 0;
1823 unlock_new_inode(inode); 1830 unlock_new_inode(inode);
1824 ext3_mark_inode_dirty(handle, inode); 1831 ext3_mark_inode_dirty(handle, inode);
@@ -1827,10 +1834,14 @@ retry:
1827 } 1834 }
1828 inc_nlink(dir); 1835 inc_nlink(dir);
1829 ext3_update_dx_flag(dir); 1836 ext3_update_dx_flag(dir);
1830 ext3_mark_inode_dirty(handle, dir); 1837 err = ext3_mark_inode_dirty(handle, dir);
1838 if (err)
1839 goto out_clear_inode;
1840
1831 d_instantiate(dentry, inode); 1841 d_instantiate(dentry, inode);
1832 unlock_new_inode(inode); 1842 unlock_new_inode(inode);
1833out_stop: 1843out_stop:
1844 brelse(dir_block);
1834 ext3_journal_stop(handle); 1845 ext3_journal_stop(handle);
1835 if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries)) 1846 if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries))
1836 goto retry; 1847 goto retry;
@@ -2353,7 +2364,9 @@ static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry,
2353 goto end_rename; 2364 goto end_rename;
2354 } else { 2365 } else {
2355 BUFFER_TRACE(new_bh, "get write access"); 2366 BUFFER_TRACE(new_bh, "get write access");
2356 ext3_journal_get_write_access(handle, new_bh); 2367 retval = ext3_journal_get_write_access(handle, new_bh);
2368 if (retval)
2369 goto journal_error;
2357 new_de->inode = cpu_to_le32(old_inode->i_ino); 2370 new_de->inode = cpu_to_le32(old_inode->i_ino);
2358 if (EXT3_HAS_INCOMPAT_FEATURE(new_dir->i_sb, 2371 if (EXT3_HAS_INCOMPAT_FEATURE(new_dir->i_sb,
2359 EXT3_FEATURE_INCOMPAT_FILETYPE)) 2372 EXT3_FEATURE_INCOMPAT_FILETYPE))
@@ -2362,7 +2375,9 @@ static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry,
2362 new_dir->i_ctime = new_dir->i_mtime = CURRENT_TIME_SEC; 2375 new_dir->i_ctime = new_dir->i_mtime = CURRENT_TIME_SEC;
2363 ext3_mark_inode_dirty(handle, new_dir); 2376 ext3_mark_inode_dirty(handle, new_dir);
2364 BUFFER_TRACE(new_bh, "call ext3_journal_dirty_metadata"); 2377 BUFFER_TRACE(new_bh, "call ext3_journal_dirty_metadata");
2365 ext3_journal_dirty_metadata(handle, new_bh); 2378 retval = ext3_journal_dirty_metadata(handle, new_bh);
2379 if (retval)
2380 goto journal_error;
2366 brelse(new_bh); 2381 brelse(new_bh);
2367 new_bh = NULL; 2382 new_bh = NULL;
2368 } 2383 }
@@ -2411,10 +2426,17 @@ static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry,
2411 ext3_update_dx_flag(old_dir); 2426 ext3_update_dx_flag(old_dir);
2412 if (dir_bh) { 2427 if (dir_bh) {
2413 BUFFER_TRACE(dir_bh, "get_write_access"); 2428 BUFFER_TRACE(dir_bh, "get_write_access");
2414 ext3_journal_get_write_access(handle, dir_bh); 2429 retval = ext3_journal_get_write_access(handle, dir_bh);
2430 if (retval)
2431 goto journal_error;
2415 PARENT_INO(dir_bh->b_data) = cpu_to_le32(new_dir->i_ino); 2432 PARENT_INO(dir_bh->b_data) = cpu_to_le32(new_dir->i_ino);
2416 BUFFER_TRACE(dir_bh, "call ext3_journal_dirty_metadata"); 2433 BUFFER_TRACE(dir_bh, "call ext3_journal_dirty_metadata");
2417 ext3_journal_dirty_metadata(handle, dir_bh); 2434 retval = ext3_journal_dirty_metadata(handle, dir_bh);
2435 if (retval) {
2436journal_error:
2437 ext3_std_error(new_dir->i_sb, retval);
2438 goto end_rename;
2439 }
2418 drop_nlink(old_dir); 2440 drop_nlink(old_dir);
2419 if (new_inode) { 2441 if (new_inode) {
2420 drop_nlink(new_inode); 2442 drop_nlink(new_inode);
diff --git a/fs/ext3/resize.c b/fs/ext3/resize.c
index e746d30b1232..108b142e11ed 100644
--- a/fs/ext3/resize.c
+++ b/fs/ext3/resize.c
@@ -249,7 +249,11 @@ static int setup_new_group_blocks(struct super_block *sb,
249 memcpy(gdb->b_data, sbi->s_group_desc[i]->b_data, gdb->b_size); 249 memcpy(gdb->b_data, sbi->s_group_desc[i]->b_data, gdb->b_size);
250 set_buffer_uptodate(gdb); 250 set_buffer_uptodate(gdb);
251 unlock_buffer(gdb); 251 unlock_buffer(gdb);
252 ext3_journal_dirty_metadata(handle, gdb); 252 err = ext3_journal_dirty_metadata(handle, gdb);
253 if (err) {
254 brelse(gdb);
255 goto exit_bh;
256 }
253 ext3_set_bit(bit, bh->b_data); 257 ext3_set_bit(bit, bh->b_data);
254 brelse(gdb); 258 brelse(gdb);
255 } 259 }
@@ -269,7 +273,11 @@ static int setup_new_group_blocks(struct super_block *sb,
269 err = PTR_ERR(gdb); 273 err = PTR_ERR(gdb);
270 goto exit_bh; 274 goto exit_bh;
271 } 275 }
272 ext3_journal_dirty_metadata(handle, gdb); 276 err = ext3_journal_dirty_metadata(handle, gdb);
277 if (err) {
278 brelse(gdb);
279 goto exit_bh;
280 }
273 ext3_set_bit(bit, bh->b_data); 281 ext3_set_bit(bit, bh->b_data);
274 brelse(gdb); 282 brelse(gdb);
275 } 283 }
@@ -295,7 +303,11 @@ static int setup_new_group_blocks(struct super_block *sb,
295 err = PTR_ERR(it); 303 err = PTR_ERR(it);
296 goto exit_bh; 304 goto exit_bh;
297 } 305 }
298 ext3_journal_dirty_metadata(handle, it); 306 err = ext3_journal_dirty_metadata(handle, it);
307 if (err) {
308 brelse(it);
309 goto exit_bh;
310 }
299 brelse(it); 311 brelse(it);
300 ext3_set_bit(bit, bh->b_data); 312 ext3_set_bit(bit, bh->b_data);
301 } 313 }
@@ -306,7 +318,9 @@ static int setup_new_group_blocks(struct super_block *sb,
306 318
307 mark_bitmap_end(input->blocks_count, EXT3_BLOCKS_PER_GROUP(sb), 319 mark_bitmap_end(input->blocks_count, EXT3_BLOCKS_PER_GROUP(sb),
308 bh->b_data); 320 bh->b_data);
309 ext3_journal_dirty_metadata(handle, bh); 321 err = ext3_journal_dirty_metadata(handle, bh);
322 if (err)
323 goto exit_bh;
310 brelse(bh); 324 brelse(bh);
311 325
312 /* Mark unused entries in inode bitmap used */ 326 /* Mark unused entries in inode bitmap used */
@@ -319,7 +333,7 @@ static int setup_new_group_blocks(struct super_block *sb,
319 333
320 mark_bitmap_end(EXT3_INODES_PER_GROUP(sb), EXT3_BLOCKS_PER_GROUP(sb), 334 mark_bitmap_end(EXT3_INODES_PER_GROUP(sb), EXT3_BLOCKS_PER_GROUP(sb),
321 bh->b_data); 335 bh->b_data);
322 ext3_journal_dirty_metadata(handle, bh); 336 err = ext3_journal_dirty_metadata(handle, bh);
323exit_bh: 337exit_bh:
324 brelse(bh); 338 brelse(bh);
325 339
@@ -503,12 +517,19 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
503 * reserved inode, and will become GDT blocks (primary and backup). 517 * reserved inode, and will become GDT blocks (primary and backup).
504 */ 518 */
505 data[gdb_num % EXT3_ADDR_PER_BLOCK(sb)] = 0; 519 data[gdb_num % EXT3_ADDR_PER_BLOCK(sb)] = 0;
506 ext3_journal_dirty_metadata(handle, dind); 520 err = ext3_journal_dirty_metadata(handle, dind);
521 if (err)
522 goto exit_group_desc;
507 brelse(dind); 523 brelse(dind);
524 dind = NULL;
508 inode->i_blocks -= (gdbackups + 1) * sb->s_blocksize >> 9; 525 inode->i_blocks -= (gdbackups + 1) * sb->s_blocksize >> 9;
509 ext3_mark_iloc_dirty(handle, inode, &iloc); 526 err = ext3_mark_iloc_dirty(handle, inode, &iloc);
527 if (err)
528 goto exit_group_desc;
510 memset((*primary)->b_data, 0, sb->s_blocksize); 529 memset((*primary)->b_data, 0, sb->s_blocksize);
511 ext3_journal_dirty_metadata(handle, *primary); 530 err = ext3_journal_dirty_metadata(handle, *primary);
531 if (err)
532 goto exit_group_desc;
512 533
513 o_group_desc = EXT3_SB(sb)->s_group_desc; 534 o_group_desc = EXT3_SB(sb)->s_group_desc;
514 memcpy(n_group_desc, o_group_desc, 535 memcpy(n_group_desc, o_group_desc,
@@ -519,10 +540,14 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
519 kfree(o_group_desc); 540 kfree(o_group_desc);
520 541
521 le16_add_cpu(&es->s_reserved_gdt_blocks, -1); 542 le16_add_cpu(&es->s_reserved_gdt_blocks, -1);
522 ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh); 543 err = ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
544 if (err)
545 goto exit_inode;
523 546
524 return 0; 547 return 0;
525 548
549exit_group_desc:
550 kfree(n_group_desc);
526exit_inode: 551exit_inode:
527 //ext3_journal_release_buffer(handle, iloc.bh); 552 //ext3_journal_release_buffer(handle, iloc.bh);
528 brelse(iloc.bh); 553 brelse(iloc.bh);
@@ -706,16 +731,20 @@ static void update_backups(struct super_block *sb,
706 } 731 }
707 ext3_debug("update metadata backup %#04lx\n", 732 ext3_debug("update metadata backup %#04lx\n",
708 (unsigned long)bh->b_blocknr); 733 (unsigned long)bh->b_blocknr);
709 if ((err = ext3_journal_get_write_access(handle, bh))) 734 if ((err = ext3_journal_get_write_access(handle, bh))) {
735 brelse(bh);
710 break; 736 break;
737 }
711 lock_buffer(bh); 738 lock_buffer(bh);
712 memcpy(bh->b_data, data, size); 739 memcpy(bh->b_data, data, size);
713 if (rest) 740 if (rest)
714 memset(bh->b_data + size, 0, rest); 741 memset(bh->b_data + size, 0, rest);
715 set_buffer_uptodate(bh); 742 set_buffer_uptodate(bh);
716 unlock_buffer(bh); 743 unlock_buffer(bh);
717 ext3_journal_dirty_metadata(handle, bh); 744 err = ext3_journal_dirty_metadata(handle, bh);
718 brelse(bh); 745 brelse(bh);
746 if (err)
747 break;
719 } 748 }
720 if ((err2 = ext3_journal_stop(handle)) && !err) 749 if ((err2 = ext3_journal_stop(handle)) && !err)
721 err = err2; 750 err = err2;
@@ -922,7 +951,9 @@ int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input)
922 /* Update the global fs size fields */ 951 /* Update the global fs size fields */
923 sbi->s_groups_count++; 952 sbi->s_groups_count++;
924 953
925 ext3_journal_dirty_metadata(handle, primary); 954 err = ext3_journal_dirty_metadata(handle, primary);
955 if (err)
956 goto exit_journal;
926 957
927 /* Update the reserved block counts only once the new group is 958 /* Update the reserved block counts only once the new group is
928 * active. */ 959 * active. */
@@ -934,7 +965,7 @@ int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input)
934 percpu_counter_add(&sbi->s_freeinodes_counter, 965 percpu_counter_add(&sbi->s_freeinodes_counter,
935 EXT3_INODES_PER_GROUP(sb)); 966 EXT3_INODES_PER_GROUP(sb));
936 967
937 ext3_journal_dirty_metadata(handle, sbi->s_sbh); 968 err = ext3_journal_dirty_metadata(handle, sbi->s_sbh);
938 969
939exit_journal: 970exit_journal:
940 mutex_unlock(&sbi->s_resize_lock); 971 mutex_unlock(&sbi->s_resize_lock);
@@ -1064,8 +1095,14 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es,
1064 goto exit_put; 1095 goto exit_put;
1065 } 1096 }
1066 es->s_blocks_count = cpu_to_le32(o_blocks_count + add); 1097 es->s_blocks_count = cpu_to_le32(o_blocks_count + add);
1067 ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh); 1098 err = ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
1068 mutex_unlock(&EXT3_SB(sb)->s_resize_lock); 1099 mutex_unlock(&EXT3_SB(sb)->s_resize_lock);
1100 if (err) {
1101 ext3_warning(sb, __func__,
1102 "error %d on journal dirty metadata", err);
1103 ext3_journal_stop(handle);
1104 goto exit_put;
1105 }
1069 ext3_debug("freeing blocks "E3FSBLK" through "E3FSBLK"\n", 1106 ext3_debug("freeing blocks "E3FSBLK" through "E3FSBLK"\n",
1070 o_blocks_count, o_blocks_count + add); 1107 o_blocks_count, o_blocks_count + add);
1071 ext3_free_blocks_sb(handle, sb, o_blocks_count, add, &freed_blocks); 1108 ext3_free_blocks_sb(handle, sb, o_blocks_count, add, &freed_blocks);
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 77ce1616f725..85c8cc8f2473 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -143,12 +143,16 @@ void ext3_journal_abort_handle(const char *caller, const char *err_fn,
143void ext3_msg(struct super_block *sb, const char *prefix, 143void ext3_msg(struct super_block *sb, const char *prefix,
144 const char *fmt, ...) 144 const char *fmt, ...)
145{ 145{
146 struct va_format vaf;
146 va_list args; 147 va_list args;
147 148
148 va_start(args, fmt); 149 va_start(args, fmt);
149 printk("%sEXT3-fs (%s): ", prefix, sb->s_id); 150
150 vprintk(fmt, args); 151 vaf.fmt = fmt;
151 printk("\n"); 152 vaf.va = &args;
153
154 printk("%sEXT3-fs (%s): %pV\n", prefix, sb->s_id, &vaf);
155
152 va_end(args); 156 va_end(args);
153} 157}
154 158
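
This conversion replaces the printk()/vprintk()/printk() triple with a single printk() using the kernel's %pV extension and struct va_format (from <linux/kernel.h>), so output from a concurrent printk can no longer be interleaved between the prefix and the message body. The same pattern is applied to ext3_error, ext3_abort and ext3_warning below; in general form:

	struct va_format vaf;
	va_list args;

	va_start(args, fmt);
	vaf.fmt = fmt;
	vaf.va = &args;
	printk("%s%s: %pV\n", level, id, &vaf);	/* emitted as one line */
	va_end(args);
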
@@ -195,15 +199,20 @@ static void ext3_handle_error(struct super_block *sb)
195 sb->s_id); 199 sb->s_id);
196} 200}
197 201
198void ext3_error (struct super_block * sb, const char * function, 202void ext3_error(struct super_block *sb, const char *function,
199 const char * fmt, ...) 203 const char *fmt, ...)
200{ 204{
205 struct va_format vaf;
201 va_list args; 206 va_list args;
202 207
203 va_start(args, fmt); 208 va_start(args, fmt);
204 printk(KERN_CRIT "EXT3-fs error (device %s): %s: ",sb->s_id, function); 209
205 vprintk(fmt, args); 210 vaf.fmt = fmt;
206 printk("\n"); 211 vaf.va = &args;
212
213 printk(KERN_CRIT "EXT3-fs error (device %s): %s: %pV\n",
214 sb->s_id, function, &vaf);
215
207 va_end(args); 216 va_end(args);
208 217
209 ext3_handle_error(sb); 218 ext3_handle_error(sb);
@@ -274,15 +283,20 @@ void __ext3_std_error (struct super_block * sb, const char * function,
274 * case we take the easy way out and panic immediately. 283 * case we take the easy way out and panic immediately.
275 */ 284 */
276 285
277void ext3_abort (struct super_block * sb, const char * function, 286void ext3_abort(struct super_block *sb, const char *function,
278 const char * fmt, ...) 287 const char *fmt, ...)
279{ 288{
289 struct va_format vaf;
280 va_list args; 290 va_list args;
281 291
282 va_start(args, fmt); 292 va_start(args, fmt);
283 printk(KERN_CRIT "EXT3-fs (%s): error: %s: ", sb->s_id, function); 293
284 vprintk(fmt, args); 294 vaf.fmt = fmt;
285 printk("\n"); 295 vaf.va = &args;
296
297 printk(KERN_CRIT "EXT3-fs (%s): error: %s: %pV\n",
298 sb->s_id, function, &vaf);
299
286 va_end(args); 300 va_end(args);
287 301
288 if (test_opt(sb, ERRORS_PANIC)) 302 if (test_opt(sb, ERRORS_PANIC))
@@ -300,16 +314,20 @@ void ext3_abort (struct super_block * sb, const char * function,
300 journal_abort(EXT3_SB(sb)->s_journal, -EIO); 314 journal_abort(EXT3_SB(sb)->s_journal, -EIO);
301} 315}
302 316
303void ext3_warning (struct super_block * sb, const char * function, 317void ext3_warning(struct super_block *sb, const char *function,
304 const char * fmt, ...) 318 const char *fmt, ...)
305{ 319{
320 struct va_format vaf;
306 va_list args; 321 va_list args;
307 322
308 va_start(args, fmt); 323 va_start(args, fmt);
309 printk(KERN_WARNING "EXT3-fs (%s): warning: %s: ", 324
310 sb->s_id, function); 325 vaf.fmt = fmt;
311 vprintk(fmt, args); 326 vaf.va = &args;
312 printk("\n"); 327
328 printk(KERN_WARNING "EXT3-fs (%s): warning: %s: %pV\n",
329 sb->s_id, function, &vaf);
330
313 va_end(args); 331 va_end(args);
314} 332}
315 333
@@ -346,7 +364,7 @@ static struct block_device *ext3_blkdev_get(dev_t dev, struct super_block *sb)
346 struct block_device *bdev; 364 struct block_device *bdev;
347 char b[BDEVNAME_SIZE]; 365 char b[BDEVNAME_SIZE];
348 366
349 bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE); 367 bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, sb);
350 if (IS_ERR(bdev)) 368 if (IS_ERR(bdev))
351 goto fail; 369 goto fail;
352 return bdev; 370 return bdev;
@@ -363,8 +381,7 @@ fail:
363 */ 381 */
364static int ext3_blkdev_put(struct block_device *bdev) 382static int ext3_blkdev_put(struct block_device *bdev)
365{ 383{
366 bd_release(bdev); 384 return blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
367 return blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
368} 385}
369 386
370static int ext3_blkdev_remove(struct ext3_sb_info *sbi) 387static int ext3_blkdev_remove(struct ext3_sb_info *sbi)
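
blkdev_get_by_dev() with FMODE_EXCL folds the old open_by_devnum() + bd_claim() pair into a single call: the last argument (here the super block) becomes the exclusive holder of the device, and a blkdev_put() whose mode includes FMODE_EXCL drops that claim again, which is why bd_release() disappears from ext3_blkdev_put() above. The paired calls, schematically:

	/* claim: sb becomes the exclusive holder of the device */
	bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, sb);
	...
	/* release: mode must include FMODE_EXCL to drop the claim */
	blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
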
@@ -737,7 +754,7 @@ static int ext3_release_dquot(struct dquot *dquot);
737static int ext3_mark_dquot_dirty(struct dquot *dquot); 754static int ext3_mark_dquot_dirty(struct dquot *dquot);
738static int ext3_write_info(struct super_block *sb, int type); 755static int ext3_write_info(struct super_block *sb, int type);
739static int ext3_quota_on(struct super_block *sb, int type, int format_id, 756static int ext3_quota_on(struct super_block *sb, int type, int format_id,
740 char *path); 757 struct path *path);
741static int ext3_quota_on_mount(struct super_block *sb, int type); 758static int ext3_quota_on_mount(struct super_block *sb, int type);
742static ssize_t ext3_quota_read(struct super_block *sb, int type, char *data, 759static ssize_t ext3_quota_read(struct super_block *sb, int type, char *data,
743 size_t len, loff_t off); 760 size_t len, loff_t off);
@@ -1848,13 +1865,15 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
1848 goto failed_mount; 1865 goto failed_mount;
1849 } 1866 }
1850 1867
1851 if (generic_check_addressable(sb->s_blocksize_bits, 1868 err = generic_check_addressable(sb->s_blocksize_bits,
1852 le32_to_cpu(es->s_blocks_count))) { 1869 le32_to_cpu(es->s_blocks_count));
1870 if (err) {
1853 ext3_msg(sb, KERN_ERR, 1871 ext3_msg(sb, KERN_ERR,
1854 "error: filesystem is too large to mount safely"); 1872 "error: filesystem is too large to mount safely");
1855 if (sizeof(sector_t) < 8) 1873 if (sizeof(sector_t) < 8)
1856 ext3_msg(sb, KERN_ERR, 1874 ext3_msg(sb, KERN_ERR,
1857 "error: CONFIG_LBDAF not enabled"); 1875 "error: CONFIG_LBDAF not enabled");
1876 ret = err;
1858 goto failed_mount; 1877 goto failed_mount;
1859 } 1878 }
1860 1879
@@ -2142,13 +2161,6 @@ static journal_t *ext3_get_dev_journal(struct super_block *sb,
2142 if (bdev == NULL) 2161 if (bdev == NULL)
2143 return NULL; 2162 return NULL;
2144 2163
2145 if (bd_claim(bdev, sb)) {
2146 ext3_msg(sb, KERN_ERR,
2147 "error: failed to claim external journal device");
2148 blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
2149 return NULL;
2150 }
2151
2152 blocksize = sb->s_blocksize; 2164 blocksize = sb->s_blocksize;
2153 hblock = bdev_logical_block_size(bdev); 2165 hblock = bdev_logical_block_size(bdev);
2154 if (blocksize < hblock) { 2166 if (blocksize < hblock) {
@@ -2297,7 +2309,7 @@ static int ext3_load_journal(struct super_block *sb,
2297 EXT3_SB(sb)->s_journal = journal; 2309 EXT3_SB(sb)->s_journal = journal;
2298 ext3_clear_journal_err(sb, es); 2310 ext3_clear_journal_err(sb, es);
2299 2311
2300 if (journal_devnum && 2312 if (!really_read_only && journal_devnum &&
2301 journal_devnum != le32_to_cpu(es->s_journal_dev)) { 2313 journal_devnum != le32_to_cpu(es->s_journal_dev)) {
2302 es->s_journal_dev = cpu_to_le32(journal_devnum); 2314 es->s_journal_dev = cpu_to_le32(journal_devnum);
2303 2315
@@ -2865,27 +2877,20 @@ static int ext3_quota_on_mount(struct super_block *sb, int type)
2865 * Standard function to be called on quota_on 2877 * Standard function to be called on quota_on
2866 */ 2878 */
2867static int ext3_quota_on(struct super_block *sb, int type, int format_id, 2879static int ext3_quota_on(struct super_block *sb, int type, int format_id,
2868 char *name) 2880 struct path *path)
2869{ 2881{
2870 int err; 2882 int err;
2871 struct path path;
2872 2883
2873 if (!test_opt(sb, QUOTA)) 2884 if (!test_opt(sb, QUOTA))
2874 return -EINVAL; 2885 return -EINVAL;
2875 2886
2876 err = kern_path(name, LOOKUP_FOLLOW, &path);
2877 if (err)
2878 return err;
2879
2880 /* Quotafile not on the same filesystem? */ 2887 /* Quotafile not on the same filesystem? */
2881 if (path.mnt->mnt_sb != sb) { 2888 if (path->mnt->mnt_sb != sb)
2882 path_put(&path);
2883 return -EXDEV; 2889 return -EXDEV;
2884 }
2885 /* Journaling quota? */ 2890 /* Journaling quota? */
2886 if (EXT3_SB(sb)->s_qf_names[type]) { 2891 if (EXT3_SB(sb)->s_qf_names[type]) {
2887 /* Quotafile not of fs root? */ 2892 /* Quotafile not of fs root? */
2888 if (path.dentry->d_parent != sb->s_root) 2893 if (path->dentry->d_parent != sb->s_root)
2889 ext3_msg(sb, KERN_WARNING, 2894 ext3_msg(sb, KERN_WARNING,
2890 "warning: Quota file not on filesystem root. " 2895 "warning: Quota file not on filesystem root. "
2891 "Journaled quota will not work."); 2896 "Journaled quota will not work.");
@@ -2895,7 +2900,7 @@ static int ext3_quota_on(struct super_block *sb, int type, int format_id,
2895 * When we journal data on quota file, we have to flush journal to see 2900 * When we journal data on quota file, we have to flush journal to see
2896 * all updates to the file when we bypass pagecache... 2901 * all updates to the file when we bypass pagecache...
2897 */ 2902 */
2898 if (ext3_should_journal_data(path.dentry->d_inode)) { 2903 if (ext3_should_journal_data(path->dentry->d_inode)) {
2899 /* 2904 /*
2900 * We don't need to lock updates but journal_flush() could 2905 * We don't need to lock updates but journal_flush() could
2901 * otherwise be livelocked... 2906 * otherwise be livelocked...
@@ -2903,15 +2908,11 @@ static int ext3_quota_on(struct super_block *sb, int type, int format_id,
2903 journal_lock_updates(EXT3_SB(sb)->s_journal); 2908 journal_lock_updates(EXT3_SB(sb)->s_journal);
2904 err = journal_flush(EXT3_SB(sb)->s_journal); 2909 err = journal_flush(EXT3_SB(sb)->s_journal);
2905 journal_unlock_updates(EXT3_SB(sb)->s_journal); 2910 journal_unlock_updates(EXT3_SB(sb)->s_journal);
2906 if (err) { 2911 if (err)
2907 path_put(&path);
2908 return err; 2912 return err;
2909 }
2910 } 2913 }
2911 2914
2912 err = dquot_quota_on_path(sb, type, format_id, &path); 2915 return dquot_quota_on(sb, type, format_id, path);
2913 path_put(&path);
2914 return err;
2915} 2916}
2916 2917
2917/* Read data from quotafile - avoid pagecache and such because we cannot afford 2918/* Read data from quotafile - avoid pagecache and such because we cannot afford
diff --git a/fs/ext3/xattr.c b/fs/ext3/xattr.c
index e69dc6dfaa89..32e6cc23bd9a 100644
--- a/fs/ext3/xattr.c
+++ b/fs/ext3/xattr.c
@@ -925,7 +925,7 @@ ext3_xattr_ibody_set(handle_t *handle, struct inode *inode,
925/* 925/*
926 * ext3_xattr_set_handle() 926 * ext3_xattr_set_handle()
927 * 927 *
928 * Create, replace or remove an extended attribute for this inode. Buffer 928 * Create, replace or remove an extended attribute for this inode. Value
929 * is NULL to remove an existing extended attribute, and non-NULL to 929 * is NULL to remove an existing extended attribute, and non-NULL to
930 * either replace an existing extended attribute, or create a new extended 930 * either replace an existing extended attribute, or create a new extended
931 * attribute. The flags XATTR_REPLACE and XATTR_CREATE 931 * attribute. The flags XATTR_REPLACE and XATTR_CREATE
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 14c3af26c671..adf96b822781 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -592,7 +592,8 @@ ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
592 * Account for the allocated meta blocks. We will never 592 * Account for the allocated meta blocks. We will never
 593 * fail EDQUOT for metadata, but we do account for it. 593 * fail EDQUOT for metadata, but we do account for it.
594 */ 594 */
595 if (!(*errp) && EXT4_I(inode)->i_delalloc_reserved_flag) { 595 if (!(*errp) &&
596 ext4_test_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED)) {
596 spin_lock(&EXT4_I(inode)->i_block_reservation_lock); 597 spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
597 EXT4_I(inode)->i_allocated_meta_blocks += ar.len; 598 EXT4_I(inode)->i_allocated_meta_blocks += ar.len;
598 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 599 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index ece76fb6a40c..164c56092e58 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -60,9 +60,13 @@ static unsigned char get_dtype(struct super_block *sb, int filetype)
60 return (ext4_filetype_table[filetype]); 60 return (ext4_filetype_table[filetype]);
61} 61}
62 62
63 63/*
64 * Return 0 if the directory entry is OK, and 1 if there is a problem
65 *
66 * Note: this is the opposite of what ext2 and ext3 historically returned...
67 */
64int __ext4_check_dir_entry(const char *function, unsigned int line, 68int __ext4_check_dir_entry(const char *function, unsigned int line,
65 struct inode *dir, 69 struct inode *dir, struct file *filp,
66 struct ext4_dir_entry_2 *de, 70 struct ext4_dir_entry_2 *de,
67 struct buffer_head *bh, 71 struct buffer_head *bh,
68 unsigned int offset) 72 unsigned int offset)
@@ -71,26 +75,37 @@ int __ext4_check_dir_entry(const char *function, unsigned int line,
71 const int rlen = ext4_rec_len_from_disk(de->rec_len, 75 const int rlen = ext4_rec_len_from_disk(de->rec_len,
72 dir->i_sb->s_blocksize); 76 dir->i_sb->s_blocksize);
73 77
74 if (rlen < EXT4_DIR_REC_LEN(1)) 78 if (unlikely(rlen < EXT4_DIR_REC_LEN(1)))
75 error_msg = "rec_len is smaller than minimal"; 79 error_msg = "rec_len is smaller than minimal";
76 else if (rlen % 4 != 0) 80 else if (unlikely(rlen % 4 != 0))
77 error_msg = "rec_len % 4 != 0"; 81 error_msg = "rec_len % 4 != 0";
78 else if (rlen < EXT4_DIR_REC_LEN(de->name_len)) 82 else if (unlikely(rlen < EXT4_DIR_REC_LEN(de->name_len)))
79 error_msg = "rec_len is too small for name_len"; 83 error_msg = "rec_len is too small for name_len";
80 else if (((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize) 84 else if (unlikely(((char *) de - bh->b_data) + rlen >
85 dir->i_sb->s_blocksize))
81 error_msg = "directory entry across blocks"; 86 error_msg = "directory entry across blocks";
82 else if (le32_to_cpu(de->inode) > 87 else if (unlikely(le32_to_cpu(de->inode) >
83 le32_to_cpu(EXT4_SB(dir->i_sb)->s_es->s_inodes_count)) 88 le32_to_cpu(EXT4_SB(dir->i_sb)->s_es->s_inodes_count)))
84 error_msg = "inode out of bounds"; 89 error_msg = "inode out of bounds";
90 else
91 return 0;
85 92
86 if (error_msg != NULL) 93 if (filp)
87 ext4_error_inode(dir, function, line, bh->b_blocknr, 94 ext4_error_file(filp, function, line, bh ? bh->b_blocknr : 0,
88 "bad entry in directory: %s - " 95 "bad entry in directory: %s - offset=%u(%u), "
89 "offset=%u(%u), inode=%u, rec_len=%d, name_len=%d", 96 "inode=%u, rec_len=%d, name_len=%d",
90 error_msg, (unsigned) (offset%bh->b_size), offset, 97 error_msg, (unsigned) (offset%bh->b_size),
91 le32_to_cpu(de->inode), 98 offset, le32_to_cpu(de->inode),
92 rlen, de->name_len); 99 rlen, de->name_len);
93 return error_msg == NULL ? 1 : 0; 100 else
101 ext4_error_inode(dir, function, line, bh ? bh->b_blocknr : 0,
102 "bad entry in directory: %s - offset=%u(%u), "
103 "inode=%u, rec_len=%d, name_len=%d",
104 error_msg, (unsigned) (offset%bh->b_size),
105 offset, le32_to_cpu(de->inode),
106 rlen, de->name_len);
107
108 return 1;
94} 109}
95 110
96static int ext4_readdir(struct file *filp, 111static int ext4_readdir(struct file *filp,
@@ -152,8 +167,9 @@ static int ext4_readdir(struct file *filp,
152 */ 167 */
153 if (!bh) { 168 if (!bh) {
154 if (!dir_has_error) { 169 if (!dir_has_error) {
155 EXT4_ERROR_INODE(inode, "directory " 170 EXT4_ERROR_FILE(filp, 0,
156 "contains a hole at offset %Lu", 171 "directory contains a "
172 "hole at offset %llu",
157 (unsigned long long) filp->f_pos); 173 (unsigned long long) filp->f_pos);
158 dir_has_error = 1; 174 dir_has_error = 1;
159 } 175 }
@@ -194,8 +210,8 @@ revalidate:
194 while (!error && filp->f_pos < inode->i_size 210 while (!error && filp->f_pos < inode->i_size
195 && offset < sb->s_blocksize) { 211 && offset < sb->s_blocksize) {
196 de = (struct ext4_dir_entry_2 *) (bh->b_data + offset); 212 de = (struct ext4_dir_entry_2 *) (bh->b_data + offset);
197 if (!ext4_check_dir_entry(inode, de, 213 if (ext4_check_dir_entry(inode, filp, de,
198 bh, offset)) { 214 bh, offset)) {
199 /* 215 /*
200 * On error, skip the f_pos to the next block 216 * On error, skip the f_pos to the next block
201 */ 217 */
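
Since the return convention is now inverted relative to ext2/ext3 (0 is success, non-zero means a corrupt entry), every caller flips its test, as the readdir hunk above shows. Schematically:

	/*
	 *   ext3 (old): if (!ext3_check_dir_entry(...))           -> corrupt
	 *   ext4 (new): if (ext4_check_dir_entry(dir, filp,
	 *                                        de, bh, offset))  -> corrupt
	 */
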
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 94ce3d7a1c4b..0c8d97b56f34 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -62,8 +62,8 @@
62#define EXT4_ERROR_INODE_BLOCK(inode, block, fmt, a...) \ 62#define EXT4_ERROR_INODE_BLOCK(inode, block, fmt, a...) \
63 ext4_error_inode((inode), __func__, __LINE__, (block), (fmt), ## a) 63 ext4_error_inode((inode), __func__, __LINE__, (block), (fmt), ## a)
64 64
65#define EXT4_ERROR_FILE(file, fmt, a...) \ 65#define EXT4_ERROR_FILE(file, block, fmt, a...) \
66 ext4_error_file(__func__, __LINE__, (file), (fmt), ## a) 66 ext4_error_file((file), __func__, __LINE__, (block), (fmt), ## a)
67 67
68/* data type for block offset of block group */ 68/* data type for block offset of block group */
69typedef int ext4_grpblk_t; 69typedef int ext4_grpblk_t;
@@ -561,23 +561,7 @@ struct ext4_new_group_data {
561#define EXT4_IOC32_SETVERSION_OLD FS_IOC32_SETVERSION 561#define EXT4_IOC32_SETVERSION_OLD FS_IOC32_SETVERSION
562#endif 562#endif
563 563
564 564/* Max physical block we can address w/o extents */
565/*
566 * Mount options
567 */
568struct ext4_mount_options {
569 unsigned long s_mount_opt;
570 uid_t s_resuid;
571 gid_t s_resgid;
572 unsigned long s_commit_interval;
573 u32 s_min_batch_time, s_max_batch_time;
574#ifdef CONFIG_QUOTA
575 int s_jquota_fmt;
576 char *s_qf_names[MAXQUOTAS];
577#endif
578};
579
580/* Max physical block we can addres w/o extents */
581#define EXT4_MAX_BLOCK_FILE_PHYS 0xFFFFFFFF 565#define EXT4_MAX_BLOCK_FILE_PHYS 0xFFFFFFFF
582 566
583/* 567/*
@@ -709,6 +693,8 @@ do { \
709 if (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra)) \ 693 if (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra)) \
710 ext4_decode_extra_time(&(inode)->xtime, \ 694 ext4_decode_extra_time(&(inode)->xtime, \
711 raw_inode->xtime ## _extra); \ 695 raw_inode->xtime ## _extra); \
696 else \
697 (inode)->xtime.tv_nsec = 0; \
712} while (0) 698} while (0)
713 699
714#define EXT4_EINODE_GET_XTIME(xtime, einode, raw_inode) \ 700#define EXT4_EINODE_GET_XTIME(xtime, einode, raw_inode) \
@@ -719,6 +705,8 @@ do { \
719 if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime ## _extra)) \ 705 if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime ## _extra)) \
720 ext4_decode_extra_time(&(einode)->xtime, \ 706 ext4_decode_extra_time(&(einode)->xtime, \
721 raw_inode->xtime ## _extra); \ 707 raw_inode->xtime ## _extra); \
708 else \
709 (einode)->xtime.tv_nsec = 0; \
722} while (0) 710} while (0)
723 711
724#define i_disk_version osd1.linux1.l_i_version 712#define i_disk_version osd1.linux1.l_i_version
@@ -750,12 +738,13 @@ do { \
750 738
751/* 739/*
752 * storage for cached extent 740 * storage for cached extent
741 * If ec_len == 0, then the cache is invalid.
742 * If ec_start == 0, then the cache represents a gap (null mapping)
753 */ 743 */
754struct ext4_ext_cache { 744struct ext4_ext_cache {
755 ext4_fsblk_t ec_start; 745 ext4_fsblk_t ec_start;
756 ext4_lblk_t ec_block; 746 ext4_lblk_t ec_block;
757 __u32 ec_len; /* must be 32bit to return holes */ 747 __u32 ec_len; /* must be 32bit to return holes */
758 __u32 ec_type;
759}; 748};
760 749
761/* 750/*
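
With ec_type gone, validity and gap-ness are encoded entirely in the remaining fields; a sketch of the resulting checks (field names as in the struct above):

	/*
	 *   cache invalid : ec_len == 0
	 *   cached gap    : ec_len != 0 && ec_start == 0
	 *   cached extent : ec_len != 0 && ec_start != 0
	 */
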
@@ -774,10 +763,12 @@ struct ext4_inode_info {
774 * near to their parent directory's inode. 763 * near to their parent directory's inode.
775 */ 764 */
776 ext4_group_t i_block_group; 765 ext4_group_t i_block_group;
766 ext4_lblk_t i_dir_start_lookup;
767#if (BITS_PER_LONG < 64)
777 unsigned long i_state_flags; /* Dynamic state flags */ 768 unsigned long i_state_flags; /* Dynamic state flags */
769#endif
778 unsigned long i_flags; 770 unsigned long i_flags;
779 771
780 ext4_lblk_t i_dir_start_lookup;
781#ifdef CONFIG_EXT4_FS_XATTR 772#ifdef CONFIG_EXT4_FS_XATTR
782 /* 773 /*
783 * Extended attributes can be read independently of the main file 774 * Extended attributes can be read independently of the main file
@@ -820,7 +811,7 @@ struct ext4_inode_info {
820 */ 811 */
821 struct rw_semaphore i_data_sem; 812 struct rw_semaphore i_data_sem;
822 struct inode vfs_inode; 813 struct inode vfs_inode;
823 struct jbd2_inode jinode; 814 struct jbd2_inode *jinode;
824 815
825 struct ext4_ext_cache i_cached_extent; 816 struct ext4_ext_cache i_cached_extent;
826 /* 817 /*
@@ -840,14 +831,12 @@ struct ext4_inode_info {
840 unsigned int i_reserved_data_blocks; 831 unsigned int i_reserved_data_blocks;
841 unsigned int i_reserved_meta_blocks; 832 unsigned int i_reserved_meta_blocks;
842 unsigned int i_allocated_meta_blocks; 833 unsigned int i_allocated_meta_blocks;
843 unsigned short i_delalloc_reserved_flag; 834 ext4_lblk_t i_da_metadata_calc_last_lblock;
844 sector_t i_da_metadata_calc_last_lblock;
845 int i_da_metadata_calc_len; 835 int i_da_metadata_calc_len;
846 836
847 /* on-disk additional length */ 837 /* on-disk additional length */
848 __u16 i_extra_isize; 838 __u16 i_extra_isize;
849 839
850 spinlock_t i_block_reservation_lock;
851#ifdef CONFIG_QUOTA 840#ifdef CONFIG_QUOTA
852 /* quota space reservation, managed internally by quota code */ 841 /* quota space reservation, managed internally by quota code */
853 qsize_t i_reserved_quota; 842 qsize_t i_reserved_quota;
@@ -856,9 +845,11 @@ struct ext4_inode_info {
856 /* completed IOs that might need unwritten extents handling */ 845 /* completed IOs that might need unwritten extents handling */
857 struct list_head i_completed_io_list; 846 struct list_head i_completed_io_list;
858 spinlock_t i_completed_io_lock; 847 spinlock_t i_completed_io_lock;
848 atomic_t i_ioend_count; /* Number of outstanding io_end structs */
859 /* current io_end structure for async DIO write*/ 849 /* current io_end structure for async DIO write*/
860 ext4_io_end_t *cur_aio_dio; 850 ext4_io_end_t *cur_aio_dio;
861 atomic_t i_ioend_count; /* Number of outstanding io_end structs */ 851
852 spinlock_t i_block_reservation_lock;
862 853
863 /* 854 /*
864 * Transactions that contain inode's metadata needed to complete 855 * Transactions that contain inode's metadata needed to complete
@@ -917,11 +908,20 @@ struct ext4_inode_info {
917#define EXT4_MOUNT_DISCARD 0x40000000 /* Issue DISCARD requests */ 908#define EXT4_MOUNT_DISCARD 0x40000000 /* Issue DISCARD requests */
918#define EXT4_MOUNT_INIT_INODE_TABLE 0x80000000 /* Initialize uninitialized itables */ 909#define EXT4_MOUNT_INIT_INODE_TABLE 0x80000000 /* Initialize uninitialized itables */
919 910
920#define clear_opt(o, opt) o &= ~EXT4_MOUNT_##opt 911#define clear_opt(sb, opt) EXT4_SB(sb)->s_mount_opt &= \
921#define set_opt(o, opt) o |= EXT4_MOUNT_##opt 912 ~EXT4_MOUNT_##opt
913#define set_opt(sb, opt) EXT4_SB(sb)->s_mount_opt |= \
914 EXT4_MOUNT_##opt
922#define test_opt(sb, opt) (EXT4_SB(sb)->s_mount_opt & \ 915#define test_opt(sb, opt) (EXT4_SB(sb)->s_mount_opt & \
923 EXT4_MOUNT_##opt) 916 EXT4_MOUNT_##opt)
924 917
918#define clear_opt2(sb, opt) EXT4_SB(sb)->s_mount_opt2 &= \
919 ~EXT4_MOUNT2_##opt
920#define set_opt2(sb, opt) EXT4_SB(sb)->s_mount_opt2 |= \
921 EXT4_MOUNT2_##opt
922#define test_opt2(sb, opt) (EXT4_SB(sb)->s_mount_opt2 & \
923 EXT4_MOUNT2_##opt)
924
925#define ext4_set_bit ext2_set_bit 925#define ext4_set_bit ext2_set_bit
926#define ext4_set_bit_atomic ext2_set_bit_atomic 926#define ext4_set_bit_atomic ext2_set_bit_atomic
927#define ext4_clear_bit ext2_clear_bit 927#define ext4_clear_bit ext2_clear_bit
@@ -1087,6 +1087,7 @@ struct ext4_sb_info {
1087 struct ext4_super_block *s_es; /* Pointer to the super block in the buffer */ 1087 struct ext4_super_block *s_es; /* Pointer to the super block in the buffer */
1088 struct buffer_head **s_group_desc; 1088 struct buffer_head **s_group_desc;
1089 unsigned int s_mount_opt; 1089 unsigned int s_mount_opt;
1090 unsigned int s_mount_opt2;
1090 unsigned int s_mount_flags; 1091 unsigned int s_mount_flags;
1091 ext4_fsblk_t s_sb_block; 1092 ext4_fsblk_t s_sb_block;
1092 uid_t s_resuid; 1093 uid_t s_resuid;
@@ -1237,24 +1238,39 @@ enum {
1237 EXT4_STATE_EXT_MIGRATE, /* Inode is migrating */ 1238 EXT4_STATE_EXT_MIGRATE, /* Inode is migrating */
1238 EXT4_STATE_DIO_UNWRITTEN, /* need convert on dio done*/ 1239 EXT4_STATE_DIO_UNWRITTEN, /* need convert on dio done*/
1239 EXT4_STATE_NEWENTRY, /* File just added to dir */ 1240 EXT4_STATE_NEWENTRY, /* File just added to dir */
1241 EXT4_STATE_DELALLOC_RESERVED, /* blks already reserved for delalloc */
1240}; 1242};
1241 1243
1242#define EXT4_INODE_BIT_FNS(name, field) \ 1244#define EXT4_INODE_BIT_FNS(name, field, offset) \
1243static inline int ext4_test_inode_##name(struct inode *inode, int bit) \ 1245static inline int ext4_test_inode_##name(struct inode *inode, int bit) \
1244{ \ 1246{ \
1245 return test_bit(bit, &EXT4_I(inode)->i_##field); \ 1247 return test_bit(bit + (offset), &EXT4_I(inode)->i_##field); \
1246} \ 1248} \
1247static inline void ext4_set_inode_##name(struct inode *inode, int bit) \ 1249static inline void ext4_set_inode_##name(struct inode *inode, int bit) \
1248{ \ 1250{ \
1249 set_bit(bit, &EXT4_I(inode)->i_##field); \ 1251 set_bit(bit + (offset), &EXT4_I(inode)->i_##field); \
1250} \ 1252} \
1251static inline void ext4_clear_inode_##name(struct inode *inode, int bit) \ 1253static inline void ext4_clear_inode_##name(struct inode *inode, int bit) \
1252{ \ 1254{ \
1253 clear_bit(bit, &EXT4_I(inode)->i_##field); \ 1255 clear_bit(bit + (offset), &EXT4_I(inode)->i_##field); \
1254} 1256}
1255 1257
1256EXT4_INODE_BIT_FNS(flag, flags) 1258EXT4_INODE_BIT_FNS(flag, flags, 0)
1257EXT4_INODE_BIT_FNS(state, state_flags) 1259#if (BITS_PER_LONG < 64)
1260EXT4_INODE_BIT_FNS(state, state_flags, 0)
1261
1262static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
1263{
1264 (ei)->i_state_flags = 0;
1265}
1266#else
1267EXT4_INODE_BIT_FNS(state, flags, 32)
1268
1269static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
1270{
1271 /* We depend on the fact that callers will set i_flags */
1272}
1273#endif
1258#else 1274#else
1259/* Assume that user mode programs are passing in an ext4fs superblock, not 1275/* Assume that user mode programs are passing in an ext4fs superblock, not
1260 * a kernel struct super_block. This will allow us to call the feature-test 1276 * a kernel struct super_block. This will allow us to call the feature-test
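
On 64-bit builds the dynamic state bits now share the single i_flags word with the on-disk flag bits; the new offset parameter shifts them into the upper half. For example (following the macro expansion above):

	/*
	 *   EXT4_INODE_BIT_FNS(state, flags, 32) makes
	 *       ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
	 *   expand to
	 *       set_bit(EXT4_STATE_DIO_UNWRITTEN + 32,
	 *               &EXT4_I(inode)->i_flags);
	 *
	 *   bits  0..31 : on-disk EXT4_*_FL flags
	 *   bits 32..63 : dynamic EXT4_STATE_* flags
	 */
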
@@ -1642,10 +1658,12 @@ extern unsigned ext4_init_block_bitmap(struct super_block *sb,
1642 1658
1643/* dir.c */ 1659/* dir.c */
1644extern int __ext4_check_dir_entry(const char *, unsigned int, struct inode *, 1660extern int __ext4_check_dir_entry(const char *, unsigned int, struct inode *,
1661 struct file *,
1645 struct ext4_dir_entry_2 *, 1662 struct ext4_dir_entry_2 *,
1646 struct buffer_head *, unsigned int); 1663 struct buffer_head *, unsigned int);
1647#define ext4_check_dir_entry(dir, de, bh, offset) \ 1664#define ext4_check_dir_entry(dir, filp, de, bh, offset) \
1648 __ext4_check_dir_entry(__func__, __LINE__, (dir), (de), (bh), (offset)) 1665 unlikely(__ext4_check_dir_entry(__func__, __LINE__, (dir), (filp), \
1666 (de), (bh), (offset)))
1649extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, 1667extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
1650 __u32 minor_hash, 1668 __u32 minor_hash,
1651 struct ext4_dir_entry_2 *dirent); 1669 struct ext4_dir_entry_2 *dirent);
@@ -1653,6 +1671,7 @@ extern void ext4_htree_free_dir_info(struct dir_private_info *p);
 
 /* fsync.c */
 extern int ext4_sync_file(struct file *, int);
+extern int ext4_flush_completed_IO(struct inode *);
 
 /* hash.c */
 extern int ext4fs_dirhash(const char *name, int len, struct
@@ -1752,8 +1771,8 @@ extern void ext4_error_inode(struct inode *, const char *, unsigned int,
                         ext4_fsblk_t, const char *, ...)
         __attribute__ ((format (printf, 5, 6)));
 extern void ext4_error_file(struct file *, const char *, unsigned int,
-                        const char *, ...)
-        __attribute__ ((format (printf, 4, 5)));
+                        ext4_fsblk_t, const char *, ...)
+        __attribute__ ((format (printf, 5, 6)));
 extern void __ext4_std_error(struct super_block *, const char *,
                              unsigned int, int);
 extern void __ext4_abort(struct super_block *, const char *, unsigned int,
@@ -2046,7 +2065,7 @@ extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 extern void ext4_ext_truncate(struct inode *);
 extern void ext4_ext_init(struct super_block *);
 extern void ext4_ext_release(struct super_block *);
-extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset,
+extern long ext4_fallocate(struct file *file, int mode, loff_t offset,
                           loff_t len);
 extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
                                           ssize_t len);
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index 28ce70fd9cd0..2e29abb30f76 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -119,10 +119,6 @@ struct ext4_ext_path {
  * structure for external API
  */
 
-#define EXT4_EXT_CACHE_NO 0
-#define EXT4_EXT_CACHE_GAP 1
-#define EXT4_EXT_CACHE_EXTENT 2
-
 /*
  * to be called by ext4_ext_walk_space()
  * negative retcode - error
@@ -197,7 +193,7 @@ static inline unsigned short ext_depth(struct inode *inode)
 static inline void
 ext4_ext_invalidate_cache(struct inode *inode)
 {
-        EXT4_I(inode)->i_cached_extent.ec_type = EXT4_EXT_CACHE_NO;
+        EXT4_I(inode)->i_cached_extent.ec_len = 0;
 }
 
 static inline void ext4_ext_mark_uninitialized(struct ext4_extent *ext)
@@ -278,7 +274,7 @@ static inline void ext4_idx_store_pblock(struct ext4_extent_idx *ix,
 }
 
 extern int ext4_ext_calc_metadata_amount(struct inode *inode,
-                                         sector_t lblocks);
+                                         ext4_lblk_t lblocks);
 extern int ext4_extent_tree_init(handle_t *, struct inode *);
 extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode,
                                                    int num,
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index b0bd792c58c5..d8b992e658c1 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -253,7 +253,7 @@ static inline int ext4_journal_force_commit(journal_t *journal)
 static inline int ext4_jbd2_file_inode(handle_t *handle, struct inode *inode)
 {
         if (ext4_handle_valid(handle))
-                return jbd2_journal_file_inode(handle, &EXT4_I(inode)->jinode);
+                return jbd2_journal_file_inode(handle, EXT4_I(inode)->jinode);
         return 0;
 }
 
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 0554c48cb1fd..63a75810b7c3 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -117,11 +117,33 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
         struct ext4_extent *ex;
         depth = path->p_depth;
 
-        /* try to predict block placement */
+        /*
+         * Try to predict block placement assuming that we are
+         * filling in a file which will eventually be
+         * non-sparse --- i.e., in the case of libbfd writing
+         * an ELF object sections out-of-order but in a way
+         * that eventually results in a contiguous object or
+         * executable file, or some database extending a table
+         * space file.  However, this is actually somewhat
+         * non-ideal if we are writing a sparse file such as
+         * qemu or KVM writing a raw image file that is going
+         * to stay fairly sparse, since it will end up
+         * fragmenting the file system's free space.  Maybe we
+         * should have some heuristics or some way to allow
+         * userspace to pass a hint to file system,
+         * especially if the latter case turns out to be
+         * common.
+         */
         ex = path[depth].p_ext;
-        if (ex)
-                return (ext4_ext_pblock(ex) +
-                        (block - le32_to_cpu(ex->ee_block)));
+        if (ex) {
+                ext4_fsblk_t ext_pblk = ext4_ext_pblock(ex);
+                ext4_lblk_t ext_block = le32_to_cpu(ex->ee_block);
+
+                if (block > ext_block)
+                        return ext_pblk + (block - ext_block);
+                else
+                        return ext_pblk - (ext_block - block);
+        }
 
         /* it looks like index is empty;
          * try to find starting block from index itself */
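The reason for the new branch in ext4_ext_find_goal() is unsigned arithmetic: `block` and `ee_block` are 32-bit logical block numbers, so the old `block - ext_block` wrapped around whenever the target block sat *before* the nearest extent, producing a wildly wrong goal. A self-contained demonstration with made-up block numbers (the typedefs mirror ext4's, the values are illustrative only):

#include <stdio.h>
#include <stdint.h>

typedef uint64_t ext4_fsblk_t;  /* physical block number (64-bit) */
typedef uint32_t ext4_lblk_t;   /* logical block number (32-bit) */

int main(void)
{
        ext4_fsblk_t ext_pblk = 1000;   /* extent's physical start */
        ext4_lblk_t ext_block = 50;     /* extent's logical start */
        ext4_lblk_t block = 10;         /* target block, before the extent */

        /* Old expression: the 32-bit subtraction wraps for block < ext_block. */
        ext4_fsblk_t old_goal = ext_pblk + (block - ext_block);

        /* New code branches on the ordering, so the distance is exact. */
        ext4_fsblk_t new_goal = (block > ext_block)
                ? ext_pblk + (block - ext_block)
                : ext_pblk - (ext_block - block);

        printf("old goal: %llu (wrapped)\n", (unsigned long long)old_goal);
        printf("new goal: %llu\n", (unsigned long long)new_goal);      /* 960 */
        return 0;
}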
@@ -244,7 +266,7 @@ static inline int ext4_ext_space_root_idx(struct inode *inode, int check)
  * to allocate @blocks
  * Worse case is one block per extent
  */
-int ext4_ext_calc_metadata_amount(struct inode *inode, sector_t lblock)
+int ext4_ext_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock)
 {
         struct ext4_inode_info *ei = EXT4_I(inode);
         int idxs, num = 0;
@@ -1872,12 +1894,10 @@ static int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
                         cbex.ec_block = start;
                         cbex.ec_len = end - start;
                         cbex.ec_start = 0;
-                        cbex.ec_type = EXT4_EXT_CACHE_GAP;
                 } else {
                         cbex.ec_block = le32_to_cpu(ex->ee_block);
                         cbex.ec_len = ext4_ext_get_actual_len(ex);
                         cbex.ec_start = ext4_ext_pblock(ex);
-                        cbex.ec_type = EXT4_EXT_CACHE_EXTENT;
                 }
 
                 if (unlikely(cbex.ec_len == 0)) {
@@ -1917,13 +1937,12 @@ static int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
 
 static void
 ext4_ext_put_in_cache(struct inode *inode, ext4_lblk_t block,
-                        __u32 len, ext4_fsblk_t start, int type)
+                        __u32 len, ext4_fsblk_t start)
 {
         struct ext4_ext_cache *cex;
         BUG_ON(len == 0);
         spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
         cex = &EXT4_I(inode)->i_cached_extent;
-        cex->ec_type = type;
         cex->ec_block = block;
         cex->ec_len = len;
         cex->ec_start = start;
@@ -1976,15 +1995,18 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
         }
 
         ext_debug(" -> %u:%lu\n", lblock, len);
-        ext4_ext_put_in_cache(inode, lblock, len, 0, EXT4_EXT_CACHE_GAP);
+        ext4_ext_put_in_cache(inode, lblock, len, 0);
 }
 
+/*
+ * Return 0 if cache is invalid; 1 if the cache is valid
+ */
 static int
 ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
                         struct ext4_extent *ex)
 {
         struct ext4_ext_cache *cex;
-        int ret = EXT4_EXT_CACHE_NO;
+        int ret = 0;
 
         /*
          * We borrow i_block_reservation_lock to protect i_cached_extent
@@ -1993,11 +2015,9 @@ ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
         cex = &EXT4_I(inode)->i_cached_extent;
 
         /* has cache valid data? */
-        if (cex->ec_type == EXT4_EXT_CACHE_NO)
+        if (cex->ec_len == 0)
                 goto errout;
 
-        BUG_ON(cex->ec_type != EXT4_EXT_CACHE_GAP &&
-                        cex->ec_type != EXT4_EXT_CACHE_EXTENT);
         if (in_range(block, cex->ec_block, cex->ec_len)) {
                 ex->ee_block = cpu_to_le32(cex->ec_block);
                 ext4_ext_store_pblock(ex, cex->ec_start);
@@ -2005,7 +2025,7 @@ ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
                 ext_debug("%u cached by %u:%u:%llu\n",
                                 block,
                                 cex->ec_block, cex->ec_len, cex->ec_start);
-                ret = cex->ec_type;
+                ret = 1;
         }
 errout:
         spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
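With ec_type gone, the single-extent cache encodes its state implicitly: ec_len == 0 means the cache is empty, and, because physical block 0 holds the superblock and can never start a data extent, ec_start == 0 marks a cached hole. A simplified stand-in for struct ext4_ext_cache showing the two predicates this series now relies on (ext4_ext_map_blocks() below checks `newex.ee_start_lo/hi`, the fiemap callback checks `ec_start == 0`):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Simplified stand-in for struct ext4_ext_cache after this change. */
struct ext_cache {
        uint32_t ec_block;      /* first logical block covered */
        uint32_t ec_len;        /* 0 => nothing cached */
        uint64_t ec_start;      /* 0 => cached region is a hole */
};

static bool cache_valid(const struct ext_cache *c)  { return c->ec_len != 0; }
static bool cache_is_gap(const struct ext_cache *c) { return c->ec_start == 0; }

int main(void)
{
        struct ext_cache hole = { .ec_block = 100, .ec_len = 8, .ec_start = 0 };
        printf("valid=%d gap=%d\n", cache_valid(&hole), cache_is_gap(&hole));
        return 0;
}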
@@ -2825,14 +2845,14 @@ fix_extent_len:
  * to an uninitialized extent.
  *
  * Writing to an uninitized extent may result in splitting the uninitialized
- * extent into multiple /intialized unintialized extents (up to three)
+ * extent into multiple /initialized uninitialized extents (up to three)
  * There are three possibilities:
  *   a> There is no split required: Entire extent should be uninitialized
  *   b> Splits in two extents: Write is happening at either end of the extent
  *   c> Splits in three extents: Somone is writing in middle of the extent
  *
  * One of more index blocks maybe needed if the extent tree grow after
- * the unintialized extent split. To prevent ENOSPC occur at the IO
+ * the uninitialized extent split. To prevent ENOSPC occur at the IO
  * complete, we need to split the uninitialized extent before DIO submit
  * the IO. The uninitialized extent called at this time will be split
  * into three uninitialized extent(at most). After IO complete, the part
@@ -3082,7 +3102,7 @@ static void unmap_underlying_metadata_blocks(struct block_device *bdev,
  * Handle EOFBLOCKS_FL flag, clearing it if necessary
  */
 static int check_eofblocks_fl(handle_t *handle, struct inode *inode,
-                              struct ext4_map_blocks *map,
+                              ext4_lblk_t lblk,
                               struct ext4_ext_path *path,
                               unsigned int len)
 {
@@ -3112,7 +3132,7 @@ static int check_eofblocks_fl(handle_t *handle, struct inode *inode,
          * this turns out to be false, we can bail out from this
          * function immediately.
          */
-        if (map->m_lblk + len < le32_to_cpu(last_ex->ee_block) +
+        if (lblk + len < le32_to_cpu(last_ex->ee_block) +
             ext4_ext_get_actual_len(last_ex))
                 return 0;
         /*
@@ -3168,8 +3188,8 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
                                                         path);
                 if (ret >= 0) {
                         ext4_update_inode_fsync_trans(handle, inode, 1);
-                        err = check_eofblocks_fl(handle, inode, map, path,
-                                                 map->m_len);
+                        err = check_eofblocks_fl(handle, inode, map->m_lblk,
+                                                 path, map->m_len);
                 } else
                         err = ret;
                 goto out2;
@@ -3199,7 +3219,8 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
                 ret = ext4_ext_convert_to_initialized(handle, inode, map, path);
                 if (ret >= 0) {
                         ext4_update_inode_fsync_trans(handle, inode, 1);
-                        err = check_eofblocks_fl(handle, inode, map, path, map->m_len);
+                        err = check_eofblocks_fl(handle, inode, map->m_lblk, path,
+                                                 map->m_len);
                         if (err < 0)
                                 goto out2;
                 }
@@ -3276,7 +3297,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
         struct ext4_extent_header *eh;
         struct ext4_extent newex, *ex;
         ext4_fsblk_t newblock;
-        int err = 0, depth, ret, cache_type;
+        int err = 0, depth, ret;
         unsigned int allocated = 0;
         struct ext4_allocation_request ar;
         ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
@@ -3285,9 +3306,8 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
                   map->m_lblk, map->m_len, inode->i_ino);
 
         /* check in cache */
-        cache_type = ext4_ext_in_cache(inode, map->m_lblk, &newex);
-        if (cache_type) {
-                if (cache_type == EXT4_EXT_CACHE_GAP) {
+        if (ext4_ext_in_cache(inode, map->m_lblk, &newex)) {
+                if (!newex.ee_start_lo && !newex.ee_start_hi) {
                         if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
                                 /*
                                  * block isn't allocated yet and
@@ -3296,7 +3316,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
                                 goto out2;
                         }
                         /* we should allocate requested block */
-                } else if (cache_type == EXT4_EXT_CACHE_EXTENT) {
+                } else {
                         /* block is already allocated */
                         newblock = map->m_lblk
                                    - le32_to_cpu(newex.ee_block)
@@ -3305,8 +3325,6 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
                         allocated = ext4_ext_get_actual_len(&newex) -
                                 (map->m_lblk - le32_to_cpu(newex.ee_block));
                         goto out;
-                } else {
-                        BUG();
                 }
         }
 
@@ -3357,8 +3375,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
                         /* Do not put uninitialized extent in the cache */
                         if (!ext4_ext_is_uninitialized(ex)) {
                                 ext4_ext_put_in_cache(inode, ee_block,
-                                                        ee_len, ee_start,
-                                                        EXT4_EXT_CACHE_EXTENT);
+                                                        ee_len, ee_start);
                                 goto out;
                         }
                         ret = ext4_ext_handle_uninitialized_extents(handle,
@@ -3456,7 +3473,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
                 map->m_flags |= EXT4_MAP_UNINIT;
         }
 
-        err = check_eofblocks_fl(handle, inode, map, path, ar.len);
+        err = check_eofblocks_fl(handle, inode, map->m_lblk, path, ar.len);
         if (err)
                 goto out2;
 
@@ -3490,8 +3507,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
          * when it is _not_ an uninitialized extent.
          */
         if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) {
-                ext4_ext_put_in_cache(inode, map->m_lblk, allocated, newblock,
-                                                EXT4_EXT_CACHE_EXTENT);
+                ext4_ext_put_in_cache(inode, map->m_lblk, allocated, newblock);
                 ext4_update_inode_fsync_trans(handle, inode, 1);
         } else
                 ext4_update_inode_fsync_trans(handle, inode, 0);
@@ -3519,6 +3535,12 @@ void ext4_ext_truncate(struct inode *inode)
         int err = 0;
 
         /*
+         * finish any pending end_io work so we won't run the risk of
+         * converting any truncated blocks to initialized later
+         */
+        ext4_flush_completed_IO(inode);
+
+        /*
          * probably first extent we're gonna free will be last in block
          */
         err = ext4_writepage_trans_blocks(inode);
@@ -3605,14 +3627,15 @@ static void ext4_falloc_update_inode(struct inode *inode,
 }
 
 /*
- * preallocate space for a file. This implements ext4's fallocate inode
+ * preallocate space for a file. This implements ext4's fallocate file
  * operation, which gets called from sys_fallocate system call.
  * For block-mapped files, posix_fallocate should fall back to the method
  * of writing zeroes to the required new blocks (the same behavior which is
  * expected for file systems which do not support fallocate() system call).
  */
-long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len)
+long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
 {
+        struct inode *inode = file->f_path.dentry->d_inode;
         handle_t *handle;
         loff_t new_size;
         unsigned int max_blocks;
@@ -3622,6 +3645,10 @@ long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len)
         struct ext4_map_blocks map;
         unsigned int credits, blkbits = inode->i_blkbits;
 
+        /* We only support the FALLOC_FL_KEEP_SIZE mode */
+        if (mode & ~FALLOC_FL_KEEP_SIZE)
+                return -EOPNOTSUPP;
+
         /*
          * currently supporting (pre)allocate mode for extent-based
          * files _only_
@@ -3629,10 +3656,6 @@ long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len)
         if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
                 return -EOPNOTSUPP;
 
-        /* preallocation to directories is currently not supported */
-        if (S_ISDIR(inode->i_mode))
-                return -ENODEV;
-
         map.m_lblk = offset >> blkbits;
         /*
          * We can't just convert len to max_blocks because
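Two things change at the top of ext4_fallocate(): it now receives a struct file (matching the new file_operations hook added in the file.c hunk below), and it rejects any mode bit other than FALLOC_FL_KEEP_SIZE before touching the inode. The S_ISDIR() check can go because the VFS refuses fallocate on directories before the filesystem hook runs. A user-space sketch of the mask-based mode check (the macro value matches linux/falloc.h; check_mode() is illustrative only):

#include <stdio.h>

#define FALLOC_FL_KEEP_SIZE     0x01    /* same value as linux/falloc.h */

/* Reject any flag we do not understand; unknown bits must fail early
 * so that future mode flags are not silently misinterpreted. */
static int check_mode(int mode)
{
        if (mode & ~FALLOC_FL_KEEP_SIZE)
                return -1;              /* -EOPNOTSUPP in the kernel */
        return 0;
}

int main(void)
{
        printf("KEEP_SIZE: %d, unknown 0x02: %d\n",
               check_mode(FALLOC_FL_KEEP_SIZE), check_mode(0x02));
        return 0;
}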
@@ -3767,7 +3790,7 @@ static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path,
 
         logical = (__u64)newex->ec_block << blksize_bits;
 
-        if (newex->ec_type == EXT4_EXT_CACHE_GAP) {
+        if (newex->ec_start == 0) {
                 pgoff_t offset;
                 struct page *page;
                 struct buffer_head *bh = NULL;
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 5a5c55ddceef..2e8322c8aa88 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -104,6 +104,7 @@ static int ext4_file_open(struct inode * inode, struct file * filp)
 {
         struct super_block *sb = inode->i_sb;
         struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+        struct ext4_inode_info *ei = EXT4_I(inode);
         struct vfsmount *mnt = filp->f_path.mnt;
         struct path path;
         char buf[64], *cp;
@@ -127,6 +128,27 @@ static int ext4_file_open(struct inode * inode, struct file * filp)
                         ext4_mark_super_dirty(sb);
                 }
         }
+        /*
+         * Set up the jbd2_inode if we are opening the inode for
+         * writing and the journal is present
+         */
+        if (sbi->s_journal && !ei->jinode && (filp->f_mode & FMODE_WRITE)) {
+                struct jbd2_inode *jinode = jbd2_alloc_inode(GFP_KERNEL);
+
+                spin_lock(&inode->i_lock);
+                if (!ei->jinode) {
+                        if (!jinode) {
+                                spin_unlock(&inode->i_lock);
+                                return -ENOMEM;
+                        }
+                        ei->jinode = jinode;
+                        jbd2_journal_init_jbd_inode(ei->jinode, inode);
+                        jinode = NULL;
+                }
+                spin_unlock(&inode->i_lock);
+                if (unlikely(jinode != NULL))
+                        jbd2_free_inode(jinode);
+        }
         return dquot_file_open(inode, filp);
 }
 
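The jinode setup above is a classic optimistic-allocation pattern: the jbd2_inode is allocated with GFP_KERNEL before taking i_lock (a sleeping allocation is not allowed under a spinlock), installed only if no other opener got there first, and the spare copy is freed after unlocking. A user-space sketch of the same shape; struct state, malloc() and the commented-out lock calls stand in for the kernel objects:

#include <stdio.h>
#include <stdlib.h>

struct state { void *jinode; };

static void *install_jinode(struct state *ei)
{
        void *jinode = malloc(64);      /* jbd2_alloc_inode(GFP_KERNEL) */

        /* spin_lock(&inode->i_lock); */
        if (!ei->jinode) {
                if (!jinode)
                        return NULL;    /* report -ENOMEM (after unlocking) */
                ei->jinode = jinode;    /* we won the race: install ours */
                jinode = NULL;
        }
        /* spin_unlock(&inode->i_lock); */

        if (jinode)                     /* another opener beat us to it */
                free(jinode);
        return ei->jinode;
}

int main(void)
{
        struct state ei = { NULL };
        printf("installed: %s\n", install_jinode(&ei) ? "yes" : "no");
        return 0;
}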
@@ -188,6 +210,7 @@ const struct file_operations ext4_file_operations = {
         .fsync = ext4_sync_file,
         .splice_read = generic_file_splice_read,
         .splice_write = generic_file_splice_write,
+        .fallocate = ext4_fallocate,
 };
 
 const struct inode_operations ext4_file_inode_operations = {
@@ -201,7 +224,6 @@ const struct inode_operations ext4_file_inode_operations = {
         .removexattr = generic_removexattr,
 #endif
         .check_acl = ext4_check_acl,
-        .fallocate = ext4_fallocate,
         .fiemap = ext4_fiemap,
 };
 
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index c1a7bc923cf6..7829b287822a 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -75,7 +75,7 @@ static void dump_completed_IO(struct inode * inode)
  * to written.
  * The function return the number of pending IOs on success.
  */
-static int flush_completed_IO(struct inode *inode)
+extern int ext4_flush_completed_IO(struct inode *inode)
 {
         ext4_io_end_t *io;
         struct ext4_inode_info *ei = EXT4_I(inode);
@@ -169,7 +169,7 @@ int ext4_sync_file(struct file *file, int datasync)
         if (inode->i_sb->s_flags & MS_RDONLY)
                 return 0;
 
-        ret = flush_completed_IO(inode);
+        ret = ext4_flush_completed_IO(inode);
         if (ret < 0)
                 return ret;
 
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 1ce240a23ebb..eb9097aec6f0 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -1027,7 +1027,7 @@ got:
         inode->i_generation = sbi->s_next_generation++;
         spin_unlock(&sbi->s_next_gen_lock);
 
-        ei->i_state_flags = 0;
+        ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */
         ext4_set_inode_state(inode, EXT4_STATE_NEW);
 
         ei->i_extra_isize = EXT4_SB(sb)->s_want_extra_isize;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index e659597b690b..9f7f9e49914f 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -39,7 +39,9 @@
 #include <linux/bio.h>
 #include <linux/workqueue.h>
 #include <linux/kernel.h>
+#include <linux/printk.h>
 #include <linux/slab.h>
+#include <linux/ratelimit.h>
 
 #include "ext4_jbd2.h"
 #include "xattr.h"
@@ -54,10 +56,17 @@ static inline int ext4_begin_ordered_truncate(struct inode *inode,
                                               loff_t new_size)
 {
         trace_ext4_begin_ordered_truncate(inode, new_size);
-        return jbd2_journal_begin_ordered_truncate(
-                        EXT4_SB(inode->i_sb)->s_journal,
-                        &EXT4_I(inode)->jinode,
-                        new_size);
+        /*
+         * If jinode is zero, then we never opened the file for
+         * writing, so there's no need to call
+         * jbd2_journal_begin_ordered_truncate() since there's no
+         * outstanding writes we need to flush.
+         */
+        if (!EXT4_I(inode)->jinode)
+                return 0;
+        return jbd2_journal_begin_ordered_truncate(EXT4_JOURNAL(inode),
+                                                   EXT4_I(inode)->jinode,
+                                                   new_size);
 }
 
 static void ext4_invalidatepage(struct page *page, unsigned long offset);
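Because the jbd2_inode is now allocated lazily on first open-for-write (the file.c hunk above), a NULL ei->jinode is a reliable sign that the file can have no ordered data outstanding, so truncate can skip the jbd2 flush entirely. A condensed user-space sketch of that guard clause (struct inode_info and the printf stand in for the real structures and the jbd2 call):

#include <stdio.h>

struct inode_info { void *jinode; };    /* stand-in for ext4_inode_info */

static int begin_ordered_truncate(struct inode_info *ei, long new_size)
{
        if (!ei->jinode)
                return 0;       /* never opened for write: nothing to flush */
        printf("would flush ordered data up to %ld\n", new_size);
        return 0;
}

int main(void)
{
        struct inode_info never_written = { NULL };
        return begin_ordered_truncate(&never_written, 4096);
}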
@@ -552,7 +561,7 @@ static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block,
 }
 
 /**
- * ext4_blks_to_allocate: Look up the block map and count the number
+ * ext4_blks_to_allocate - Look up the block map and count the number
  * of direct blocks need to be allocated for the given branch.
  *
  * @branch: chain of indirect blocks
@@ -591,13 +600,19 @@ static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks,
 
 /**
  * ext4_alloc_blocks: multiple allocate blocks needed for a branch
+ * @handle: handle for this transaction
+ * @inode: inode which needs allocated blocks
+ * @iblock: the logical block to start allocated at
+ * @goal: preferred physical block of allocation
  * @indirect_blks: the number of blocks need to allocate for indirect
  *                 blocks
- *
+ * @blks: number of desired blocks
  * @new_blocks: on return it will store the new block numbers for
  * the indirect blocks(if needed) and the first direct block,
- * @blks: on return it will store the total number of allocated
- *        direct blocks
+ * @err: on return it will store the error code
+ *
+ * This function will return the number of blocks allocated as
+ * requested by the passed-in parameters.
  */
 static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
                              ext4_lblk_t iblock, ext4_fsblk_t goal,
@@ -711,9 +726,11 @@ failed_out:
 
 /**
  * ext4_alloc_branch - allocate and set up a chain of blocks.
+ * @handle: handle for this transaction
  * @inode: owner
 * @indirect_blks: number of allocated indirect blocks
 * @blks: number of allocated direct blocks
+ * @goal: preferred place for allocation
 * @offsets: offsets (in the blocks) to store the pointers to next.
 * @branch: place to store the chain in.
 *
@@ -826,6 +843,7 @@ failed:
 
 /**
 * ext4_splice_branch - splice the allocated branch onto inode.
+ * @handle: handle for this transaction
 * @inode: owner
 * @block: (logical) number of block we are adding
 * @chain: chain of indirect blocks (with a missing link - see
@@ -1081,7 +1099,7 @@ static int ext4_indirect_calc_metadata_amount(struct inode *inode,
 * Calculate the number of metadata blocks need to reserve
 * to allocate a block located at @lblock
 */
-static int ext4_calc_metadata_amount(struct inode *inode, sector_t lblock)
+static int ext4_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock)
 {
         if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
                 return ext4_ext_calc_metadata_amount(inode, lblock);
@@ -1320,7 +1338,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
          * avoid double accounting
          */
         if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
-                EXT4_I(inode)->i_delalloc_reserved_flag = 1;
+                ext4_set_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED);
         /*
          * We need to check for EXT4 here because migrate
          * could have changed the inode type in between
@@ -1350,7 +1368,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
                         ext4_da_update_reserve_space(inode, retval, 1);
         }
         if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
-                EXT4_I(inode)->i_delalloc_reserved_flag = 0;
+                ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED);
 
         up_write((&EXT4_I(inode)->i_data_sem));
         if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
@@ -1878,7 +1896,7 @@ static int ext4_journalled_write_end(struct file *file,
 /*
  * Reserve a single block located at lblock
  */
-static int ext4_da_reserve_space(struct inode *inode, sector_t lblock)
+static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock)
 {
         int retries = 0;
         struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
@@ -2239,7 +2257,7 @@ static void mpage_da_map_and_submit(struct mpage_da_data *mpd)
          * affects functions in many different parts of the allocation
          * call path. This flag exists primarily because we don't
          * want to change *many* call functions, so ext4_map_blocks()
-         * will set the magic i_delalloc_reserved_flag once the
+         * will set the EXT4_STATE_DELALLOC_RESERVED flag once the
          * inode's allocation semaphore is taken.
          *
          * If the blocks in questions were delalloc blocks, set
@@ -3362,7 +3380,7 @@ int ext4_alloc_da_blocks(struct inode *inode)
          * doing I/O at all.
          *
          * We could call write_cache_pages(), and then redirty all of
-         * the pages by calling redirty_page_for_writeback() but that
+         * the pages by calling redirty_page_for_writepage() but that
          * would be ugly in the extreme. So instead we would need to
          * replicate parts of the code in the above functions,
         * simplifying them becuase we wouldn't actually intend to
@@ -3720,8 +3738,7 @@ static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode)
 retry:
         io_end = ext4_init_io_end(inode, GFP_ATOMIC);
         if (!io_end) {
-                if (printk_ratelimit())
-                        printk(KERN_WARNING "%s: allocation fail\n", __func__);
+                pr_warn_ratelimited("%s: allocation fail\n", __func__);
                 schedule();
                 goto retry;
         }
@@ -3745,9 +3762,9 @@ retry:
 * preallocated extents, and those write extend the file, no need to
 * fall back to buffered IO.
 *
- * For holes, we fallocate those blocks, mark them as unintialized
+ * For holes, we fallocate those blocks, mark them as uninitialized
 * If those blocks were preallocated, we mark sure they are splited, but
- * still keep the range to write as unintialized.
+ * still keep the range to write as uninitialized.
 *
 * The unwrritten extents will be converted to written when DIO is completed.
 * For async direct IO, since the IO may still pending when return, we
@@ -4045,7 +4062,7 @@ int ext4_block_truncate_page(handle_t *handle,
         if (ext4_should_journal_data(inode)) {
                 err = ext4_handle_dirty_metadata(handle, inode, bh);
         } else {
-                if (ext4_should_order_data(inode))
+                if (ext4_should_order_data(inode) && EXT4_I(inode)->jinode)
                         err = ext4_jbd2_file_inode(handle, inode);
                 mark_buffer_dirty(bh);
         }
@@ -4169,6 +4186,7 @@ static int ext4_clear_blocks(handle_t *handle, struct inode *inode,
 {
         __le32 *p;
         int flags = EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_VALIDATED;
+        int err;
 
         if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
                 flags |= EXT4_FREE_BLOCKS_METADATA;
@@ -4184,11 +4202,23 @@ static int ext4_clear_blocks(handle_t *handle, struct inode *inode,
         if (try_to_extend_transaction(handle, inode)) {
                 if (bh) {
                         BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
-                        ext4_handle_dirty_metadata(handle, inode, bh);
+                        err = ext4_handle_dirty_metadata(handle, inode, bh);
+                        if (unlikely(err)) {
+                                ext4_std_error(inode->i_sb, err);
+                                return 1;
+                        }
+                }
+                err = ext4_mark_inode_dirty(handle, inode);
+                if (unlikely(err)) {
+                        ext4_std_error(inode->i_sb, err);
+                        return 1;
+                }
+                err = ext4_truncate_restart_trans(handle, inode,
+                                                  blocks_for_truncate(inode));
+                if (unlikely(err)) {
+                        ext4_std_error(inode->i_sb, err);
+                        return 1;
                 }
-                ext4_mark_inode_dirty(handle, inode);
-                ext4_truncate_restart_trans(handle, inode,
-                                            blocks_for_truncate(inode));
                 if (bh) {
                         BUFFER_TRACE(bh, "retaking write access");
                         ext4_journal_get_write_access(handle, bh);
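The pattern repeated here and in the namei.c, rename and resize hunks below is the same: journal helpers such as ext4_handle_dirty_metadata() return an error that used to be dropped on the floor; now the result is captured, reported through ext4_std_error(), and the operation bails out instead of continuing on a failed journal update. A compact sketch of that fail-fast shape (the two helpers are stand-ins that simulate an -EIO):

#include <stdio.h>

static int dirty_metadata(void) { return -5; }  /* pretend -EIO from jbd2 */
static void std_error(int err)  { printf("ext4 error: %d\n", err); }

static int clear_blocks_step(void)
{
        int err = dirty_metadata();
        if (err) {
                std_error(err); /* record the error... */
                return 1;       /* ...and stop instead of carrying on */
        }
        return 0;
}

int main(void) { return clear_blocks_step(); }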
@@ -4349,6 +4379,7 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
                                            (__le32 *) bh->b_data,
                                            (__le32 *) bh->b_data + addr_per_block,
                                            depth);
+                        brelse(bh);
 
                         /*
                          * Everything below this this pointer has been
@@ -4859,7 +4890,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
         }
         inode->i_nlink = le16_to_cpu(raw_inode->i_links_count);
 
-        ei->i_state_flags = 0;
+        ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */
         ei->i_dir_start_lookup = 0;
         ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
         /* We now have enough fields to check if the inode was active or not.
@@ -5118,7 +5149,7 @@ static int ext4_do_update_inode(handle_t *handle,
         if (ext4_inode_blocks_set(handle, raw_inode, ei))
                 goto out_brelse;
         raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
-        raw_inode->i_flags = cpu_to_le32(ei->i_flags);
+        raw_inode->i_flags = cpu_to_le32(ei->i_flags & 0xFFFFFFFF);
         if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
             cpu_to_le32(EXT4_OS_HURD))
                 raw_inode->i_file_acl_high =
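The `& 0xFFFFFFFF` mask is the flip side of EXT4_INODE_BIT_FNS(state, flags, 32) from the ext4.h hunk: on 64-bit kernels the transient state bits live in the upper half of the in-memory i_flags word, and they must never leak into the 32-bit on-disk i_flags field. A small demonstration, assuming an LP64 machine where unsigned long is 64 bits (the bit positions are made up):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        unsigned long i_flags = 0;      /* 64-bit on LP64 */

        i_flags |= 1UL << 4;            /* a persistent flag: low 32 bits */
        i_flags |= 1UL << (32 + 2);     /* an in-memory state bit: high half */

        /* Only the persistent half may reach the on-disk inode. */
        uint32_t on_disk = (uint32_t)(i_flags & 0xFFFFFFFF);

        printf("in core: %#lx  on disk: %#x\n", i_flags, on_disk);
        return 0;
}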
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 5b4d4e3a4d58..851f49b2f9d2 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2608,18 +2608,12 @@ int ext4_mb_release(struct super_block *sb)
 static inline int ext4_issue_discard(struct super_block *sb,
                 ext4_group_t block_group, ext4_grpblk_t block, int count)
 {
-        int ret;
         ext4_fsblk_t discard_block;
 
         discard_block = block + ext4_group_first_block_no(sb, block_group);
         trace_ext4_discard_blocks(sb,
                         (unsigned long long) discard_block, count);
-        ret = sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0);
-        if (ret == -EOPNOTSUPP) {
-                ext4_warning(sb, "discard not supported, disabling");
-                clear_opt(EXT4_SB(sb)->s_mount_opt, DISCARD);
-        }
-        return ret;
+        return sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0);
 }
 
 /*
@@ -2631,7 +2625,7 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
         struct super_block *sb = journal->j_private;
         struct ext4_buddy e4b;
         struct ext4_group_info *db;
-        int err, count = 0, count2 = 0;
+        int err, ret, count = 0, count2 = 0;
         struct ext4_free_data *entry;
         struct list_head *l, *ltmp;
 
@@ -2641,9 +2635,15 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
                 mb_debug(1, "gonna free %u blocks in group %u (0x%p):",
                          entry->count, entry->group, entry);
 
-                if (test_opt(sb, DISCARD))
-                        ext4_issue_discard(sb, entry->group,
+                if (test_opt(sb, DISCARD)) {
+                        ret = ext4_issue_discard(sb, entry->group,
                                         entry->start_blk, entry->count);
+                        if (unlikely(ret == -EOPNOTSUPP)) {
+                                ext4_warning(sb, "discard not supported, "
+                                                 "disabling");
+                                clear_opt(sb, DISCARD);
+                        }
+                }
 
                 err = ext4_mb_load_buddy(sb, entry->group, &e4b);
                 /* we expect to find existing buddy because it's pinned */
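Moving the -EOPNOTSUPP handling out of ext4_issue_discard() and into this commit callback separates mechanism from policy: the helper now just reports what the device said, and only the journal-commit path decides to warn and clear the DISCARD mount option, which also lets ext4_trim_extent() below see the raw return for FITRIM. A user-space sketch of the split (issue_discard() and the flag are stand-ins):

#include <stdio.h>

#define EOPNOTSUPP 95

static int issue_discard(void) { return -EOPNOTSUPP; }  /* mechanism */

static int discard_enabled = 1;                         /* policy state */

static void on_commit(void)
{
        if (!discard_enabled)
                return;
        if (issue_discard() == -EOPNOTSUPP) {
                printf("discard not supported, disabling\n");
                discard_enabled = 0;    /* clear_opt(sb, DISCARD) */
        }
}

int main(void)
{
        on_commit();    /* warns once and disables */
        on_commit();    /* silent: option already cleared */
        return 0;
}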
@@ -3881,19 +3881,6 @@ repeat:
                 }
         }
 
-/*
- * finds all preallocated spaces and return blocks being freed to them
- * if preallocated space becomes full (no block is used from the space)
- * then the function frees space in buddy
- * XXX: at the moment, truncate (which is the only way to free blocks)
- * discards all preallocations
- */
-static void ext4_mb_return_to_preallocation(struct inode *inode,
-                                        struct ext4_buddy *e4b,
-                                        sector_t block, int count)
-{
-        BUG_ON(!list_empty(&EXT4_I(inode)->i_prealloc_list));
-}
 #ifdef CONFIG_EXT4_DEBUG
 static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
 {
@@ -4283,7 +4270,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
          * EDQUOT check, as blocks and quotas have been already
          * reserved when data being copied into pagecache.
          */
-        if (EXT4_I(ar->inode)->i_delalloc_reserved_flag)
+        if (ext4_test_inode_state(ar->inode, EXT4_STATE_DELALLOC_RESERVED))
                 ar->flags |= EXT4_MB_DELALLOC_RESERVED;
         else {
                 /* Without delayed allocation we need to verify
@@ -4380,7 +4367,8 @@ out:
         if (inquota && ar->len < inquota)
                 dquot_free_block(ar->inode, inquota - ar->len);
         if (!ar->len) {
-                if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag)
+                if (!ext4_test_inode_state(ar->inode,
+                                           EXT4_STATE_DELALLOC_RESERVED))
                         /* release all the reserved blocks if non delalloc */
                         percpu_counter_sub(&sbi->s_dirtyblocks_counter,
                                                 reserv_blks);
@@ -4626,7 +4614,11 @@ do_more:
                  * blocks being freed are metadata. these blocks shouldn't
                  * be used until this transaction is committed
                  */
                 new_entry = kmem_cache_alloc(ext4_free_ext_cachep, GFP_NOFS);
+                if (!new_entry) {
+                        err = -ENOMEM;
+                        goto error_return;
+                }
                 new_entry->start_blk = bit;
                 new_entry->group = block_group;
                 new_entry->count = count;
@@ -4643,7 +4635,6 @@ do_more:
                 ext4_lock_group(sb, block_group);
                 mb_clear_bits(bitmap_bh->b_data, bit, count);
                 mb_free_blocks(inode, &e4b, bit, count);
-                ext4_mb_return_to_preallocation(inode, &e4b, block, count);
         }
 
         ret = ext4_free_blks_count(sb, gdp) + count;
@@ -4718,8 +4709,6 @@ static int ext4_trim_extent(struct super_block *sb, int start, int count,
         ext4_unlock_group(sb, group);
 
         ret = ext4_issue_discard(sb, group, start, count);
-        if (ret)
-                ext4_std_error(sb, ret);
 
         ext4_lock_group(sb, group);
         mb_free_blocks(NULL, e4b, start, ex.fe_len);
@@ -4819,6 +4808,8 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
         ext4_group_t group, ngroups = ext4_get_groups_count(sb);
         ext4_grpblk_t cnt = 0, first_block, last_block;
         uint64_t start, len, minlen, trimmed;
+        ext4_fsblk_t first_data_blk =
+                        le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
         int ret = 0;
 
         start = range->start >> sb->s_blocksize_bits;
@@ -4828,6 +4819,10 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
 
         if (unlikely(minlen > EXT4_BLOCKS_PER_GROUP(sb)))
                 return -EINVAL;
+        if (start < first_data_blk) {
+                len -= first_data_blk - start;
+                start = first_data_blk;
+        }
 
         /* Determine first and last group to examine based on start and len */
         ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) start,
@@ -4851,7 +4846,7 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
                 if (len >= EXT4_BLOCKS_PER_GROUP(sb))
                         len -= (EXT4_BLOCKS_PER_GROUP(sb) - first_block);
                 else
-                        last_block = len;
+                        last_block = first_block + len;
 
                 if (e4b.bd_info->bb_free >= minlen) {
                         cnt = ext4_trim_all_free(sb, &e4b, first_block,
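The last_block fix corrects an off-by-range bug: when the remaining length ends inside a group, the old code trimmed [first_block, len) -- an empty or truncated range whenever len <= first_block -- instead of [first_block, first_block + len). Worked numbers (made up, but the arithmetic is the point):

#include <stdio.h>

int main(void)
{
        unsigned first_block = 1000;    /* range starts here within the group */
        unsigned len = 500;             /* blocks left to trim */

        unsigned old_last = len;                /* 500: ends *before* the start */
        unsigned new_last = first_block + len;  /* 1500: the intended end */

        printf("old range: [%u, %u)  new range: [%u, %u)\n",
               first_block, old_last, first_block, new_last);
        return 0;
}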
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index 25f3a974b725..b0a126f23c20 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -496,7 +496,7 @@ int ext4_ext_migrate(struct inode *inode)
         goal = (((inode->i_ino - 1) / EXT4_INODES_PER_GROUP(inode->i_sb)) *
                 EXT4_INODES_PER_GROUP(inode->i_sb)) + 1;
         tmp_inode = ext4_new_inode(handle, inode->i_sb->s_root->d_inode,
-                                   S_IFREG, 0, goal);
+                                   S_IFREG, NULL, goal);
         if (IS_ERR(tmp_inode)) {
                 retval = -ENOMEM;
                 ext4_journal_stop(handle);
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index dc40e75cba88..5485390d32c5 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -581,9 +581,9 @@ static int htree_dirblock_to_tree(struct file *dir_file,
                                         dir->i_sb->s_blocksize -
                                         EXT4_DIR_REC_LEN(0));
         for (; de < top; de = ext4_next_entry(de, dir->i_sb->s_blocksize)) {
-                if (!ext4_check_dir_entry(dir, de, bh,
+                if (ext4_check_dir_entry(dir, NULL, de, bh,
                                 (block<<EXT4_BLOCK_SIZE_BITS(dir->i_sb))
-                                        +((char *)de - bh->b_data))) {
+                                        + ((char *)de - bh->b_data))) {
                         /* On error, skip the f_pos to the next block. */
                         dir_file->f_pos = (dir_file->f_pos |
                                         (dir->i_sb->s_blocksize - 1)) + 1;
@@ -820,7 +820,7 @@ static inline int search_dirblock(struct buffer_head *bh,
                 if ((char *) de + namelen <= dlimit &&
                     ext4_match (namelen, name, de)) {
                         /* found a match - just to be sure, do a full check */
-                        if (!ext4_check_dir_entry(dir, de, bh, offset))
+                        if (ext4_check_dir_entry(dir, NULL, de, bh, offset))
                                 return -1;
                         *res_dir = de;
                         return 1;
@@ -1036,7 +1036,7 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, stru
                 return ERR_PTR(-EIO);
         }
         inode = ext4_iget(dir->i_sb, ino);
-        if (unlikely(IS_ERR(inode))) {
+        if (IS_ERR(inode)) {
                 if (PTR_ERR(inode) == -ESTALE) {
                         EXT4_ERROR_INODE(dir,
                                          "deleted inode referenced: %u",
@@ -1269,7 +1269,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
                 de = (struct ext4_dir_entry_2 *)bh->b_data;
                 top = bh->b_data + blocksize - reclen;
                 while ((char *) de <= top) {
-                        if (!ext4_check_dir_entry(dir, de, bh, offset))
+                        if (ext4_check_dir_entry(dir, NULL, de, bh, offset))
                                 return -EIO;
                         if (ext4_match(namelen, name, de))
                                 return -EEXIST;
@@ -1602,7 +1602,11 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
                         if (err)
                                 goto journal_error;
                 }
-                ext4_handle_dirty_metadata(handle, inode, frames[0].bh);
+                err = ext4_handle_dirty_metadata(handle, inode, frames[0].bh);
+                if (err) {
+                        ext4_std_error(inode->i_sb, err);
+                        goto cleanup;
+                }
         }
         de = do_split(handle, dir, &bh, frame, &hinfo, &err);
         if (!de)
@@ -1630,17 +1634,21 @@ static int ext4_delete_entry(handle_t *handle,
 {
         struct ext4_dir_entry_2 *de, *pde;
         unsigned int blocksize = dir->i_sb->s_blocksize;
-        int i;
+        int i, err;
 
         i = 0;
         pde = NULL;
         de = (struct ext4_dir_entry_2 *) bh->b_data;
         while (i < bh->b_size) {
-                if (!ext4_check_dir_entry(dir, de, bh, i))
+                if (ext4_check_dir_entry(dir, NULL, de, bh, i))
                         return -EIO;
                 if (de == de_del) {
                         BUFFER_TRACE(bh, "get_write_access");
-                        ext4_journal_get_write_access(handle, bh);
+                        err = ext4_journal_get_write_access(handle, bh);
+                        if (unlikely(err)) {
+                                ext4_std_error(dir->i_sb, err);
+                                return err;
+                        }
                         if (pde)
                                 pde->rec_len = ext4_rec_len_to_disk(
                                         ext4_rec_len_from_disk(pde->rec_len,
@@ -1652,7 +1660,11 @@ static int ext4_delete_entry(handle_t *handle,
                         de->inode = 0;
                         dir->i_version++;
                         BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
-                        ext4_handle_dirty_metadata(handle, dir, bh);
+                        err = ext4_handle_dirty_metadata(handle, dir, bh);
+                        if (unlikely(err)) {
+                                ext4_std_error(dir->i_sb, err);
+                                return err;
+                        }
                         return 0;
                 }
                 i += ext4_rec_len_from_disk(de->rec_len, blocksize);
@@ -1789,7 +1801,7 @@ static int ext4_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 {
         handle_t *handle;
         struct inode *inode;
-        struct buffer_head *dir_block;
+        struct buffer_head *dir_block = NULL;
         struct ext4_dir_entry_2 *de;
         unsigned int blocksize = dir->i_sb->s_blocksize;
         int err, retries = 0;
@@ -1822,7 +1834,9 @@ retry:
         if (!dir_block)
                 goto out_clear_inode;
         BUFFER_TRACE(dir_block, "get_write_access");
-        ext4_journal_get_write_access(handle, dir_block);
+        err = ext4_journal_get_write_access(handle, dir_block);
+        if (err)
+                goto out_clear_inode;
         de = (struct ext4_dir_entry_2 *) dir_block->b_data;
         de->inode = cpu_to_le32(inode->i_ino);
         de->name_len = 1;
@@ -1839,10 +1853,12 @@ retry:
         ext4_set_de_type(dir->i_sb, de, S_IFDIR);
         inode->i_nlink = 2;
         BUFFER_TRACE(dir_block, "call ext4_handle_dirty_metadata");
-        ext4_handle_dirty_metadata(handle, dir, dir_block);
-        brelse(dir_block);
-        ext4_mark_inode_dirty(handle, inode);
-        err = ext4_add_entry(handle, dentry, inode);
+        err = ext4_handle_dirty_metadata(handle, dir, dir_block);
+        if (err)
+                goto out_clear_inode;
+        err = ext4_mark_inode_dirty(handle, inode);
+        if (!err)
+                err = ext4_add_entry(handle, dentry, inode);
         if (err) {
 out_clear_inode:
                 clear_nlink(inode);
@@ -1853,10 +1869,13 @@ out_clear_inode:
         }
         ext4_inc_count(handle, dir);
         ext4_update_dx_flag(dir);
-        ext4_mark_inode_dirty(handle, dir);
+        err = ext4_mark_inode_dirty(handle, dir);
+        if (err)
+                goto out_clear_inode;
         d_instantiate(dentry, inode);
         unlock_new_inode(inode);
 out_stop:
+        brelse(dir_block);
         ext4_journal_stop(handle);
         if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
                 goto retry;
@@ -1919,7 +1938,7 @@ static int empty_dir(struct inode *inode)
1919 } 1938 }
1920 de = (struct ext4_dir_entry_2 *) bh->b_data; 1939 de = (struct ext4_dir_entry_2 *) bh->b_data;
1921 } 1940 }
1922 if (!ext4_check_dir_entry(inode, de, bh, offset)) { 1941 if (ext4_check_dir_entry(inode, NULL, de, bh, offset)) {
1923 de = (struct ext4_dir_entry_2 *)(bh->b_data + 1942 de = (struct ext4_dir_entry_2 *)(bh->b_data +
1924 sb->s_blocksize); 1943 sb->s_blocksize);
1925 offset = (offset | (sb->s_blocksize - 1)) + 1; 1944 offset = (offset | (sb->s_blocksize - 1)) + 1;
@@ -2407,7 +2426,11 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
2407 ext4_current_time(new_dir); 2426 ext4_current_time(new_dir);
2408 ext4_mark_inode_dirty(handle, new_dir); 2427 ext4_mark_inode_dirty(handle, new_dir);
2409 BUFFER_TRACE(new_bh, "call ext4_handle_dirty_metadata"); 2428 BUFFER_TRACE(new_bh, "call ext4_handle_dirty_metadata");
2410 ext4_handle_dirty_metadata(handle, new_dir, new_bh); 2429 retval = ext4_handle_dirty_metadata(handle, new_dir, new_bh);
2430 if (unlikely(retval)) {
2431 ext4_std_error(new_dir->i_sb, retval);
2432 goto end_rename;
2433 }
2411 brelse(new_bh); 2434 brelse(new_bh);
2412 new_bh = NULL; 2435 new_bh = NULL;
2413 } 2436 }
@@ -2459,7 +2482,11 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
2459 PARENT_INO(dir_bh->b_data, new_dir->i_sb->s_blocksize) = 2482 PARENT_INO(dir_bh->b_data, new_dir->i_sb->s_blocksize) =
2460 cpu_to_le32(new_dir->i_ino); 2483 cpu_to_le32(new_dir->i_ino);
2461 BUFFER_TRACE(dir_bh, "call ext4_handle_dirty_metadata"); 2484 BUFFER_TRACE(dir_bh, "call ext4_handle_dirty_metadata");
2462 ext4_handle_dirty_metadata(handle, old_dir, dir_bh); 2485 retval = ext4_handle_dirty_metadata(handle, old_dir, dir_bh);
2486 if (retval) {
2487 ext4_std_error(old_dir->i_sb, retval);
2488 goto end_rename;
2489 }
2463 ext4_dec_count(handle, old_dir); 2490 ext4_dec_count(handle, old_dir);
2464 if (new_inode) { 2491 if (new_inode) {
2465 /* checked empty_dir above, can't have another parent, 2492 /* checked empty_dir above, can't have another parent,
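The rename hunks adopt the same discipline: capture the return value of ext4_handle_dirty_metadata(), report it through ext4_std_error(), and bail out via end_rename. A loose userspace analogue of such a report-at-the-failure-site macro is sketched below; std_error and dirty_metadata are invented stand-ins, and unlike the real ext4_std_error() this one records nothing in a superblock.

#include <stdio.h>

#define std_error(err)                                          \
        do {                                                    \
                if (err)                                        \
                        fprintf(stderr, "error %d at %s:%d\n",  \
                                (err), __func__, __LINE__);     \
        } while (0)

static int dirty_metadata(int fail)     /* stand-in for the jbd2 call */
{
        return fail ? -5 : 0;           /* -EIO-style error */
}

int main(void)
{
        int retval = dirty_metadata(1);

        std_error(retval);
        return retval ? 1 : 0;
}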
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index beacce11ac50..7270dcfca92a 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -44,7 +44,7 @@ int __init ext4_init_pageio(void)
44 if (io_page_cachep == NULL) 44 if (io_page_cachep == NULL)
45 return -ENOMEM; 45 return -ENOMEM;
46 io_end_cachep = KMEM_CACHE(ext4_io_end, SLAB_RECLAIM_ACCOUNT); 46 io_end_cachep = KMEM_CACHE(ext4_io_end, SLAB_RECLAIM_ACCOUNT);
47 if (io_page_cachep == NULL) { 47 if (io_end_cachep == NULL) {
48 kmem_cache_destroy(io_page_cachep); 48 kmem_cache_destroy(io_page_cachep);
49 return -ENOMEM; 49 return -ENOMEM;
50 } 50 }
@@ -158,11 +158,8 @@ static void ext4_end_io_work(struct work_struct *work)
158 158
159ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags) 159ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags)
160{ 160{
161 ext4_io_end_t *io = NULL; 161 ext4_io_end_t *io = kmem_cache_zalloc(io_end_cachep, flags);
162
163 io = kmem_cache_alloc(io_end_cachep, flags);
164 if (io) { 162 if (io) {
165 memset(io, 0, sizeof(*io));
166 atomic_inc(&EXT4_I(inode)->i_ioend_count); 163 atomic_inc(&EXT4_I(inode)->i_ioend_count);
167 io->inode = inode; 164 io->inode = inode;
168 INIT_WORK(&io->work, ext4_end_io_work); 165 INIT_WORK(&io->work, ext4_end_io_work);
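kmem_cache_zalloc() returns memory that is already zeroed, which is what lets the hunk above drop both the NULL initialization and the memset(). The userspace counterpart is calloc() versus malloc()+memset(); a small sketch, where struct io_end is a made-up stand-in rather than the real ext4_io_end:

#include <stdlib.h>
#include <string.h>

struct io_end { int flags; void *inode; };

int main(void)
{
        /* Old shape: allocate, then zero by hand. */
        struct io_end *a = malloc(sizeof(*a));
        if (a)
                memset(a, 0, sizeof(*a));

        /* New shape: one call that hands back zeroed memory. */
        struct io_end *b = calloc(1, sizeof(*b));

        free(a);
        free(b);
        return 0;
}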
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 981c8477adab..3ecc6e45d2f9 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -220,7 +220,11 @@ static int setup_new_group_blocks(struct super_block *sb,
220 memcpy(gdb->b_data, sbi->s_group_desc[i]->b_data, gdb->b_size); 220 memcpy(gdb->b_data, sbi->s_group_desc[i]->b_data, gdb->b_size);
221 set_buffer_uptodate(gdb); 221 set_buffer_uptodate(gdb);
222 unlock_buffer(gdb); 222 unlock_buffer(gdb);
223 ext4_handle_dirty_metadata(handle, NULL, gdb); 223 err = ext4_handle_dirty_metadata(handle, NULL, gdb);
224 if (unlikely(err)) {
225 brelse(gdb);
226 goto exit_bh;
227 }
224 ext4_set_bit(bit, bh->b_data); 228 ext4_set_bit(bit, bh->b_data);
225 brelse(gdb); 229 brelse(gdb);
226 } 230 }
@@ -258,7 +262,11 @@ static int setup_new_group_blocks(struct super_block *sb,
258 262
259 ext4_mark_bitmap_end(input->blocks_count, sb->s_blocksize * 8, 263 ext4_mark_bitmap_end(input->blocks_count, sb->s_blocksize * 8,
260 bh->b_data); 264 bh->b_data);
261 ext4_handle_dirty_metadata(handle, NULL, bh); 265 err = ext4_handle_dirty_metadata(handle, NULL, bh);
266 if (unlikely(err)) {
267 ext4_std_error(sb, err);
268 goto exit_bh;
269 }
262 brelse(bh); 270 brelse(bh);
263 /* Mark unused entries in inode bitmap used */ 271 /* Mark unused entries in inode bitmap used */
264 ext4_debug("clear inode bitmap %#04llx (+%llu)\n", 272 ext4_debug("clear inode bitmap %#04llx (+%llu)\n",
@@ -270,7 +278,9 @@ static int setup_new_group_blocks(struct super_block *sb,
270 278
271 ext4_mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8, 279 ext4_mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8,
272 bh->b_data); 280 bh->b_data);
273 ext4_handle_dirty_metadata(handle, NULL, bh); 281 err = ext4_handle_dirty_metadata(handle, NULL, bh);
282 if (unlikely(err))
283 ext4_std_error(sb, err);
274exit_bh: 284exit_bh:
275 brelse(bh); 285 brelse(bh);
276 286
@@ -422,17 +432,21 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
422 goto exit_dind; 432 goto exit_dind;
423 } 433 }
424 434
425 if ((err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh))) 435 err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh);
436 if (unlikely(err))
426 goto exit_dind; 437 goto exit_dind;
427 438
428 if ((err = ext4_journal_get_write_access(handle, *primary))) 439 err = ext4_journal_get_write_access(handle, *primary);
440 if (unlikely(err))
429 goto exit_sbh; 441 goto exit_sbh;
430 442
431 if ((err = ext4_journal_get_write_access(handle, dind))) 443 err = ext4_journal_get_write_access(handle, dind);
432 goto exit_primary; 444 if (unlikely(err))
445 ext4_std_error(sb, err);
433 446
434 /* ext4_reserve_inode_write() gets a reference on the iloc */ 447 /* ext4_reserve_inode_write() gets a reference on the iloc */
435 if ((err = ext4_reserve_inode_write(handle, inode, &iloc))) 448 err = ext4_reserve_inode_write(handle, inode, &iloc);
449 if (unlikely(err))
436 goto exit_dindj; 450 goto exit_dindj;
437 451
438 n_group_desc = kmalloc((gdb_num + 1) * sizeof(struct buffer_head *), 452 n_group_desc = kmalloc((gdb_num + 1) * sizeof(struct buffer_head *),
@@ -454,12 +468,20 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
454 * reserved inode, and will become GDT blocks (primary and backup). 468 * reserved inode, and will become GDT blocks (primary and backup).
455 */ 469 */
456 data[gdb_num % EXT4_ADDR_PER_BLOCK(sb)] = 0; 470 data[gdb_num % EXT4_ADDR_PER_BLOCK(sb)] = 0;
457 ext4_handle_dirty_metadata(handle, NULL, dind); 471 err = ext4_handle_dirty_metadata(handle, NULL, dind);
458 brelse(dind); 472 if (unlikely(err)) {
473 ext4_std_error(sb, err);
474 goto exit_inode;
475 }
459 inode->i_blocks -= (gdbackups + 1) * sb->s_blocksize >> 9; 476 inode->i_blocks -= (gdbackups + 1) * sb->s_blocksize >> 9;
460 ext4_mark_iloc_dirty(handle, inode, &iloc); 477 ext4_mark_iloc_dirty(handle, inode, &iloc);
461 memset((*primary)->b_data, 0, sb->s_blocksize); 478 memset((*primary)->b_data, 0, sb->s_blocksize);
462 ext4_handle_dirty_metadata(handle, NULL, *primary); 479 err = ext4_handle_dirty_metadata(handle, NULL, *primary);
480 if (unlikely(err)) {
481 ext4_std_error(sb, err);
482 goto exit_inode;
483 }
484 brelse(dind);
463 485
464 o_group_desc = EXT4_SB(sb)->s_group_desc; 486 o_group_desc = EXT4_SB(sb)->s_group_desc;
465 memcpy(n_group_desc, o_group_desc, 487 memcpy(n_group_desc, o_group_desc,
@@ -470,19 +492,19 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
470 kfree(o_group_desc); 492 kfree(o_group_desc);
471 493
472 le16_add_cpu(&es->s_reserved_gdt_blocks, -1); 494 le16_add_cpu(&es->s_reserved_gdt_blocks, -1);
473 ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh); 495 err = ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh);
496 if (err)
497 ext4_std_error(sb, err);
474 498
475 return 0; 499 return err;
476 500
477exit_inode: 501exit_inode:
478 /* ext4_journal_release_buffer(handle, iloc.bh); */ 502 /* ext4_journal_release_buffer(handle, iloc.bh); */
479 brelse(iloc.bh); 503 brelse(iloc.bh);
480exit_dindj: 504exit_dindj:
481 /* ext4_journal_release_buffer(handle, dind); */ 505 /* ext4_journal_release_buffer(handle, dind); */
482exit_primary:
483 /* ext4_journal_release_buffer(handle, *primary); */
484exit_sbh: 506exit_sbh:
485 /* ext4_journal_release_buffer(handle, *primary); */ 507 /* ext4_journal_release_buffer(handle, EXT4_SB(sb)->s_sbh); */
486exit_dind: 508exit_dind:
487 brelse(dind); 509 brelse(dind);
488exit_bh: 510exit_bh:
@@ -665,7 +687,9 @@ static void update_backups(struct super_block *sb,
665 memset(bh->b_data + size, 0, rest); 687 memset(bh->b_data + size, 0, rest);
666 set_buffer_uptodate(bh); 688 set_buffer_uptodate(bh);
667 unlock_buffer(bh); 689 unlock_buffer(bh);
668 ext4_handle_dirty_metadata(handle, NULL, bh); 690 err = ext4_handle_dirty_metadata(handle, NULL, bh);
691 if (unlikely(err))
692 ext4_std_error(sb, err);
669 brelse(bh); 693 brelse(bh);
670 } 694 }
671 if ((err2 = ext4_journal_stop(handle)) && !err) 695 if ((err2 = ext4_journal_stop(handle)) && !err)
@@ -883,7 +907,11 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
883 /* Update the global fs size fields */ 907 /* Update the global fs size fields */
884 sbi->s_groups_count++; 908 sbi->s_groups_count++;
885 909
886 ext4_handle_dirty_metadata(handle, NULL, primary); 910 err = ext4_handle_dirty_metadata(handle, NULL, primary);
911 if (unlikely(err)) {
912 ext4_std_error(sb, err);
913 goto exit_journal;
914 }
887 915
888 /* Update the reserved block counts only once the new group is 916 /* Update the reserved block counts only once the new group is
889 * active. */ 917 * active. */
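A recurring change in the resize.c hunks is unfolding `if ((err = f(...)))` into an assignment followed by a test, so the error path can grow extra statements without reflowing the condition. A trivial sketch of the two shapes, with get_write_access() as an invented stand-in:

#include <stdio.h>

static int get_write_access(int id)     /* stand-in; fails for id < 0 */
{
        return id < 0 ? -13 : 0;
}

int main(void)
{
        int err;

        /* Old shape: assignment buried inside the condition. */
        if ((err = get_write_access(-1)))
                fprintf(stderr, "old style: err=%d\n", err);

        /* New shape from the patch: assign first, then test. */
        err = get_write_access(-1);
        if (err)
                fprintf(stderr, "new style: err=%d\n", err);

        return 0;
}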
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index cd37f9d5e447..48ce561fafac 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -388,13 +388,14 @@ static void ext4_handle_error(struct super_block *sb)
388void __ext4_error(struct super_block *sb, const char *function, 388void __ext4_error(struct super_block *sb, const char *function,
389 unsigned int line, const char *fmt, ...) 389 unsigned int line, const char *fmt, ...)
390{ 390{
391 struct va_format vaf;
391 va_list args; 392 va_list args;
392 393
393 va_start(args, fmt); 394 va_start(args, fmt);
394 printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: comm %s: ", 395 vaf.fmt = fmt;
395 sb->s_id, function, line, current->comm); 396 vaf.va = &args;
396 vprintk(fmt, args); 397 printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: comm %s: %pV\n",
397 printk("\n"); 398 sb->s_id, function, line, current->comm, &vaf);
398 va_end(args); 399 va_end(args);
399 400
400 ext4_handle_error(sb); 401 ext4_handle_error(sb);
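The struct va_format/%pV rework here and in the hunks that follow replaces printk-vprintk-printk triples with a single printk(), which keeps the log level attached to the whole message and stops concurrent messages from interleaving. %pV is a kernel printk extension, so the sketch below fakes the effect in userspace by pre-formatting the variadic part with vsnprintf(); fs_error() and the buffer size are invented.

#include <stdarg.h>
#include <stdio.h>

static void fs_error(const char *dev, const char *fmt, ...)
{
        char msg[256];
        va_list args;

        va_start(args, fmt);
        vsnprintf(msg, sizeof(msg), fmt, args);
        va_end(args);

        /* Prefix and message go out in one call, as %pV allows. */
        fprintf(stderr, "EXT4-fs error (device %s): %s\n", dev, msg);
}

int main(void)
{
        fs_error("sda1", "bad block %llu", 12345ULL);
        return 0;
}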
@@ -405,28 +406,31 @@ void ext4_error_inode(struct inode *inode, const char *function,
405 const char *fmt, ...) 406 const char *fmt, ...)
406{ 407{
407 va_list args; 408 va_list args;
409 struct va_format vaf;
408 struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es; 410 struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
409 411
410 es->s_last_error_ino = cpu_to_le32(inode->i_ino); 412 es->s_last_error_ino = cpu_to_le32(inode->i_ino);
411 es->s_last_error_block = cpu_to_le64(block); 413 es->s_last_error_block = cpu_to_le64(block);
412 save_error_info(inode->i_sb, function, line); 414 save_error_info(inode->i_sb, function, line);
413 va_start(args, fmt); 415 va_start(args, fmt);
416 vaf.fmt = fmt;
417 vaf.va = &args;
414 printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: inode #%lu: ", 418 printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: inode #%lu: ",
415 inode->i_sb->s_id, function, line, inode->i_ino); 419 inode->i_sb->s_id, function, line, inode->i_ino);
416 if (block) 420 if (block)
417 printk("block %llu: ", block); 421 printk(KERN_CONT "block %llu: ", block);
418 printk("comm %s: ", current->comm); 422 printk(KERN_CONT "comm %s: %pV\n", current->comm, &vaf);
419 vprintk(fmt, args);
420 printk("\n");
421 va_end(args); 423 va_end(args);
422 424
423 ext4_handle_error(inode->i_sb); 425 ext4_handle_error(inode->i_sb);
424} 426}
425 427
426void ext4_error_file(struct file *file, const char *function, 428void ext4_error_file(struct file *file, const char *function,
427 unsigned int line, const char *fmt, ...) 429 unsigned int line, ext4_fsblk_t block,
430 const char *fmt, ...)
428{ 431{
429 va_list args; 432 va_list args;
433 struct va_format vaf;
430 struct ext4_super_block *es; 434 struct ext4_super_block *es;
431 struct inode *inode = file->f_dentry->d_inode; 435 struct inode *inode = file->f_dentry->d_inode;
432 char pathname[80], *path; 436 char pathname[80], *path;
@@ -434,17 +438,18 @@ void ext4_error_file(struct file *file, const char *function,
434 es = EXT4_SB(inode->i_sb)->s_es; 438 es = EXT4_SB(inode->i_sb)->s_es;
435 es->s_last_error_ino = cpu_to_le32(inode->i_ino); 439 es->s_last_error_ino = cpu_to_le32(inode->i_ino);
436 save_error_info(inode->i_sb, function, line); 440 save_error_info(inode->i_sb, function, line);
437 va_start(args, fmt);
438 path = d_path(&(file->f_path), pathname, sizeof(pathname)); 441 path = d_path(&(file->f_path), pathname, sizeof(pathname));
439 if (!path) 442 if (IS_ERR(path))
440 path = "(unknown)"; 443 path = "(unknown)";
441 printk(KERN_CRIT 444 printk(KERN_CRIT
442 "EXT4-fs error (device %s): %s:%d: inode #%lu " 445 "EXT4-fs error (device %s): %s:%d: inode #%lu: ",
443 "(comm %s path %s): ", 446 inode->i_sb->s_id, function, line, inode->i_ino);
444 inode->i_sb->s_id, function, line, inode->i_ino, 447 if (block)
445 current->comm, path); 448 printk(KERN_CONT "block %llu: ", block);
446 vprintk(fmt, args); 449 va_start(args, fmt);
447 printk("\n"); 450 vaf.fmt = fmt;
451 vaf.va = &args;
452 printk(KERN_CONT "comm %s: path %s: %pV\n", current->comm, path, &vaf);
448 va_end(args); 453 va_end(args);
449 454
450 ext4_handle_error(inode->i_sb); 455 ext4_handle_error(inode->i_sb);
@@ -543,28 +548,29 @@ void __ext4_abort(struct super_block *sb, const char *function,
543 panic("EXT4-fs panic from previous error\n"); 548 panic("EXT4-fs panic from previous error\n");
544} 549}
545 550
546void ext4_msg (struct super_block * sb, const char *prefix, 551void ext4_msg(struct super_block *sb, const char *prefix, const char *fmt, ...)
547 const char *fmt, ...)
548{ 552{
553 struct va_format vaf;
549 va_list args; 554 va_list args;
550 555
551 va_start(args, fmt); 556 va_start(args, fmt);
552 printk("%sEXT4-fs (%s): ", prefix, sb->s_id); 557 vaf.fmt = fmt;
553 vprintk(fmt, args); 558 vaf.va = &args;
554 printk("\n"); 559 printk("%sEXT4-fs (%s): %pV\n", prefix, sb->s_id, &vaf);
555 va_end(args); 560 va_end(args);
556} 561}
557 562
558void __ext4_warning(struct super_block *sb, const char *function, 563void __ext4_warning(struct super_block *sb, const char *function,
559 unsigned int line, const char *fmt, ...) 564 unsigned int line, const char *fmt, ...)
560{ 565{
566 struct va_format vaf;
561 va_list args; 567 va_list args;
562 568
563 va_start(args, fmt); 569 va_start(args, fmt);
564 printk(KERN_WARNING "EXT4-fs warning (device %s): %s:%d: ", 570 vaf.fmt = fmt;
565 sb->s_id, function, line); 571 vaf.va = &args;
566 vprintk(fmt, args); 572 printk(KERN_WARNING "EXT4-fs warning (device %s): %s:%d: %pV\n",
567 printk("\n"); 573 sb->s_id, function, line, &vaf);
568 va_end(args); 574 va_end(args);
569} 575}
570 576
@@ -575,21 +581,25 @@ void __ext4_grp_locked_error(const char *function, unsigned int line,
575__releases(bitlock) 581__releases(bitlock)
576__acquires(bitlock) 582__acquires(bitlock)
577{ 583{
584 struct va_format vaf;
578 va_list args; 585 va_list args;
579 struct ext4_super_block *es = EXT4_SB(sb)->s_es; 586 struct ext4_super_block *es = EXT4_SB(sb)->s_es;
580 587
581 es->s_last_error_ino = cpu_to_le32(ino); 588 es->s_last_error_ino = cpu_to_le32(ino);
582 es->s_last_error_block = cpu_to_le64(block); 589 es->s_last_error_block = cpu_to_le64(block);
583 __save_error_info(sb, function, line); 590 __save_error_info(sb, function, line);
591
584 va_start(args, fmt); 592 va_start(args, fmt);
593
594 vaf.fmt = fmt;
595 vaf.va = &args;
585 printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: group %u", 596 printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: group %u",
586 sb->s_id, function, line, grp); 597 sb->s_id, function, line, grp);
587 if (ino) 598 if (ino)
588 printk("inode %lu: ", ino); 599 printk(KERN_CONT "inode %lu: ", ino);
589 if (block) 600 if (block)
590 printk("block %llu:", (unsigned long long) block); 601 printk(KERN_CONT "block %llu:", (unsigned long long) block);
591 vprintk(fmt, args); 602 printk(KERN_CONT "%pV\n", &vaf);
592 printk("\n");
593 va_end(args); 603 va_end(args);
594 604
595 if (test_opt(sb, ERRORS_CONT)) { 605 if (test_opt(sb, ERRORS_CONT)) {
@@ -647,7 +657,7 @@ static struct block_device *ext4_blkdev_get(dev_t dev, struct super_block *sb)
647 struct block_device *bdev; 657 struct block_device *bdev;
648 char b[BDEVNAME_SIZE]; 658 char b[BDEVNAME_SIZE];
649 659
650 bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE); 660 bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, sb);
651 if (IS_ERR(bdev)) 661 if (IS_ERR(bdev))
652 goto fail; 662 goto fail;
653 return bdev; 663 return bdev;
@@ -663,8 +673,7 @@ fail:
663 */ 673 */
664static int ext4_blkdev_put(struct block_device *bdev) 674static int ext4_blkdev_put(struct block_device *bdev)
665{ 675{
666 bd_release(bdev); 676 return blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
667 return blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
668} 677}
669 678
670static int ext4_blkdev_remove(struct ext4_sb_info *sbi) 679static int ext4_blkdev_remove(struct ext4_sb_info *sbi)
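blkdev_get_by_dev() with FMODE_EXCL folds the old open_by_devnum()+bd_claim() pair into one call that both opens the device and registers sb as its exclusive holder, which is why the bd_release() in ext4_blkdev_put() also disappears. Userspace can ask for a similar exclusive claim with O_EXCL on a block device node, as open(2) documents for Linux 2.6 and later; /dev/sdX below is a placeholder, so point it at a scratch device.

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        /* Fails with EBUSY if the device is already in use
         * (mounted, claimed by another holder, ...). */
        int fd = open("/dev/sdX", O_RDWR | O_EXCL);

        if (fd < 0) {
                perror("open");
                return 1;
        }
        close(fd);
        return 0;
}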
@@ -808,21 +817,15 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
808 memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache)); 817 memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache));
809 INIT_LIST_HEAD(&ei->i_prealloc_list); 818 INIT_LIST_HEAD(&ei->i_prealloc_list);
810 spin_lock_init(&ei->i_prealloc_lock); 819 spin_lock_init(&ei->i_prealloc_lock);
811 /*
812 * Note: We can be called before EXT4_SB(sb)->s_journal is set,
813 * therefore it can be null here. Don't check it, just initialize
814 * jinode.
815 */
816 jbd2_journal_init_jbd_inode(&ei->jinode, &ei->vfs_inode);
817 ei->i_reserved_data_blocks = 0; 820 ei->i_reserved_data_blocks = 0;
818 ei->i_reserved_meta_blocks = 0; 821 ei->i_reserved_meta_blocks = 0;
819 ei->i_allocated_meta_blocks = 0; 822 ei->i_allocated_meta_blocks = 0;
820 ei->i_da_metadata_calc_len = 0; 823 ei->i_da_metadata_calc_len = 0;
821 ei->i_delalloc_reserved_flag = 0;
822 spin_lock_init(&(ei->i_block_reservation_lock)); 824 spin_lock_init(&(ei->i_block_reservation_lock));
823#ifdef CONFIG_QUOTA 825#ifdef CONFIG_QUOTA
824 ei->i_reserved_quota = 0; 826 ei->i_reserved_quota = 0;
825#endif 827#endif
828 ei->jinode = NULL;
826 INIT_LIST_HEAD(&ei->i_completed_io_list); 829 INIT_LIST_HEAD(&ei->i_completed_io_list);
827 spin_lock_init(&ei->i_completed_io_lock); 830 spin_lock_init(&ei->i_completed_io_lock);
828 ei->cur_aio_dio = NULL; 831 ei->cur_aio_dio = NULL;
@@ -898,9 +901,12 @@ void ext4_clear_inode(struct inode *inode)
898 end_writeback(inode); 901 end_writeback(inode);
899 dquot_drop(inode); 902 dquot_drop(inode);
900 ext4_discard_preallocations(inode); 903 ext4_discard_preallocations(inode);
901 if (EXT4_JOURNAL(inode)) 904 if (EXT4_I(inode)->jinode) {
902 jbd2_journal_release_jbd_inode(EXT4_SB(inode->i_sb)->s_journal, 905 jbd2_journal_release_jbd_inode(EXT4_JOURNAL(inode),
903 &EXT4_I(inode)->jinode); 906 EXT4_I(inode)->jinode);
907 jbd2_free_inode(EXT4_I(inode)->jinode);
908 EXT4_I(inode)->jinode = NULL;
909 }
904} 910}
905 911
906static inline void ext4_show_quota_options(struct seq_file *seq, 912static inline void ext4_show_quota_options(struct seq_file *seq,
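The jinode field turns into a pointer that starts out NULL and is only allocated for inodes that actually join the journal, with ext4_clear_inode() releasing it; the allocation site itself lies outside this excerpt. A sketch of the lazy-allocation shape, with all type and function names invented:

#include <stdlib.h>

struct jinode { int dummy; };
struct inode_info { struct jinode *jinode; };

/* Allocate journal bookkeeping only when an inode first needs it,
 * instead of embedding it in every inode as before. */
static struct jinode *get_jinode(struct inode_info *ei)
{
        if (!ei->jinode)
                ei->jinode = calloc(1, sizeof(*ei->jinode));
        return ei->jinode;
}

static void clear_inode(struct inode_info *ei)
{
        free(ei->jinode);       /* free(NULL) is a no-op */
        ei->jinode = NULL;
}

int main(void)
{
        struct inode_info ei = { 0 };

        if (!get_jinode(&ei))
                return 1;
        clear_inode(&ei);
        return 0;
}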
@@ -1155,7 +1161,7 @@ static int ext4_release_dquot(struct dquot *dquot);
1155static int ext4_mark_dquot_dirty(struct dquot *dquot); 1161static int ext4_mark_dquot_dirty(struct dquot *dquot);
1156static int ext4_write_info(struct super_block *sb, int type); 1162static int ext4_write_info(struct super_block *sb, int type);
1157static int ext4_quota_on(struct super_block *sb, int type, int format_id, 1163static int ext4_quota_on(struct super_block *sb, int type, int format_id,
1158 char *path); 1164 struct path *path);
1159static int ext4_quota_off(struct super_block *sb, int type); 1165static int ext4_quota_off(struct super_block *sb, int type);
1160static int ext4_quota_on_mount(struct super_block *sb, int type); 1166static int ext4_quota_on_mount(struct super_block *sb, int type);
1161static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data, 1167static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data,
@@ -1393,7 +1399,7 @@ static int set_qf_name(struct super_block *sb, int qtype, substring_t *args)
1393 sbi->s_qf_names[qtype] = NULL; 1399 sbi->s_qf_names[qtype] = NULL;
1394 return 0; 1400 return 0;
1395 } 1401 }
1396 set_opt(sbi->s_mount_opt, QUOTA); 1402 set_opt(sb, QUOTA);
1397 return 1; 1403 return 1;
1398} 1404}
1399 1405
@@ -1448,21 +1454,21 @@ static int parse_options(char *options, struct super_block *sb,
1448 switch (token) { 1454 switch (token) {
1449 case Opt_bsd_df: 1455 case Opt_bsd_df:
1450 ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38"); 1456 ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38");
1451 clear_opt(sbi->s_mount_opt, MINIX_DF); 1457 clear_opt(sb, MINIX_DF);
1452 break; 1458 break;
1453 case Opt_minix_df: 1459 case Opt_minix_df:
1454 ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38"); 1460 ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38");
1455 set_opt(sbi->s_mount_opt, MINIX_DF); 1461 set_opt(sb, MINIX_DF);
1456 1462
1457 break; 1463 break;
1458 case Opt_grpid: 1464 case Opt_grpid:
1459 ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38"); 1465 ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38");
1460 set_opt(sbi->s_mount_opt, GRPID); 1466 set_opt(sb, GRPID);
1461 1467
1462 break; 1468 break;
1463 case Opt_nogrpid: 1469 case Opt_nogrpid:
1464 ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38"); 1470 ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38");
1465 clear_opt(sbi->s_mount_opt, GRPID); 1471 clear_opt(sb, GRPID);
1466 1472
1467 break; 1473 break;
1468 case Opt_resuid: 1474 case Opt_resuid:
@@ -1480,38 +1486,38 @@ static int parse_options(char *options, struct super_block *sb,
1480 /* *sb_block = match_int(&args[0]); */ 1486 /* *sb_block = match_int(&args[0]); */
1481 break; 1487 break;
1482 case Opt_err_panic: 1488 case Opt_err_panic:
1483 clear_opt(sbi->s_mount_opt, ERRORS_CONT); 1489 clear_opt(sb, ERRORS_CONT);
1484 clear_opt(sbi->s_mount_opt, ERRORS_RO); 1490 clear_opt(sb, ERRORS_RO);
1485 set_opt(sbi->s_mount_opt, ERRORS_PANIC); 1491 set_opt(sb, ERRORS_PANIC);
1486 break; 1492 break;
1487 case Opt_err_ro: 1493 case Opt_err_ro:
1488 clear_opt(sbi->s_mount_opt, ERRORS_CONT); 1494 clear_opt(sb, ERRORS_CONT);
1489 clear_opt(sbi->s_mount_opt, ERRORS_PANIC); 1495 clear_opt(sb, ERRORS_PANIC);
1490 set_opt(sbi->s_mount_opt, ERRORS_RO); 1496 set_opt(sb, ERRORS_RO);
1491 break; 1497 break;
1492 case Opt_err_cont: 1498 case Opt_err_cont:
1493 clear_opt(sbi->s_mount_opt, ERRORS_RO); 1499 clear_opt(sb, ERRORS_RO);
1494 clear_opt(sbi->s_mount_opt, ERRORS_PANIC); 1500 clear_opt(sb, ERRORS_PANIC);
1495 set_opt(sbi->s_mount_opt, ERRORS_CONT); 1501 set_opt(sb, ERRORS_CONT);
1496 break; 1502 break;
1497 case Opt_nouid32: 1503 case Opt_nouid32:
1498 set_opt(sbi->s_mount_opt, NO_UID32); 1504 set_opt(sb, NO_UID32);
1499 break; 1505 break;
1500 case Opt_debug: 1506 case Opt_debug:
1501 set_opt(sbi->s_mount_opt, DEBUG); 1507 set_opt(sb, DEBUG);
1502 break; 1508 break;
1503 case Opt_oldalloc: 1509 case Opt_oldalloc:
1504 set_opt(sbi->s_mount_opt, OLDALLOC); 1510 set_opt(sb, OLDALLOC);
1505 break; 1511 break;
1506 case Opt_orlov: 1512 case Opt_orlov:
1507 clear_opt(sbi->s_mount_opt, OLDALLOC); 1513 clear_opt(sb, OLDALLOC);
1508 break; 1514 break;
1509#ifdef CONFIG_EXT4_FS_XATTR 1515#ifdef CONFIG_EXT4_FS_XATTR
1510 case Opt_user_xattr: 1516 case Opt_user_xattr:
1511 set_opt(sbi->s_mount_opt, XATTR_USER); 1517 set_opt(sb, XATTR_USER);
1512 break; 1518 break;
1513 case Opt_nouser_xattr: 1519 case Opt_nouser_xattr:
1514 clear_opt(sbi->s_mount_opt, XATTR_USER); 1520 clear_opt(sb, XATTR_USER);
1515 break; 1521 break;
1516#else 1522#else
1517 case Opt_user_xattr: 1523 case Opt_user_xattr:
@@ -1521,10 +1527,10 @@ static int parse_options(char *options, struct super_block *sb,
1521#endif 1527#endif
1522#ifdef CONFIG_EXT4_FS_POSIX_ACL 1528#ifdef CONFIG_EXT4_FS_POSIX_ACL
1523 case Opt_acl: 1529 case Opt_acl:
1524 set_opt(sbi->s_mount_opt, POSIX_ACL); 1530 set_opt(sb, POSIX_ACL);
1525 break; 1531 break;
1526 case Opt_noacl: 1532 case Opt_noacl:
1527 clear_opt(sbi->s_mount_opt, POSIX_ACL); 1533 clear_opt(sb, POSIX_ACL);
1528 break; 1534 break;
1529#else 1535#else
1530 case Opt_acl: 1536 case Opt_acl:
@@ -1543,7 +1549,7 @@ static int parse_options(char *options, struct super_block *sb,
1543 "Cannot specify journal on remount"); 1549 "Cannot specify journal on remount");
1544 return 0; 1550 return 0;
1545 } 1551 }
1546 set_opt(sbi->s_mount_opt, UPDATE_JOURNAL); 1552 set_opt(sb, UPDATE_JOURNAL);
1547 break; 1553 break;
1548 case Opt_journal_dev: 1554 case Opt_journal_dev:
1549 if (is_remount) { 1555 if (is_remount) {
@@ -1556,14 +1562,14 @@ static int parse_options(char *options, struct super_block *sb,
1556 *journal_devnum = option; 1562 *journal_devnum = option;
1557 break; 1563 break;
1558 case Opt_journal_checksum: 1564 case Opt_journal_checksum:
1559 set_opt(sbi->s_mount_opt, JOURNAL_CHECKSUM); 1565 set_opt(sb, JOURNAL_CHECKSUM);
1560 break; 1566 break;
1561 case Opt_journal_async_commit: 1567 case Opt_journal_async_commit:
1562 set_opt(sbi->s_mount_opt, JOURNAL_ASYNC_COMMIT); 1568 set_opt(sb, JOURNAL_ASYNC_COMMIT);
1563 set_opt(sbi->s_mount_opt, JOURNAL_CHECKSUM); 1569 set_opt(sb, JOURNAL_CHECKSUM);
1564 break; 1570 break;
1565 case Opt_noload: 1571 case Opt_noload:
1566 set_opt(sbi->s_mount_opt, NOLOAD); 1572 set_opt(sb, NOLOAD);
1567 break; 1573 break;
1568 case Opt_commit: 1574 case Opt_commit:
1569 if (match_int(&args[0], &option)) 1575 if (match_int(&args[0], &option))
@@ -1606,15 +1612,15 @@ static int parse_options(char *options, struct super_block *sb,
1606 return 0; 1612 return 0;
1607 } 1613 }
1608 } else { 1614 } else {
1609 clear_opt(sbi->s_mount_opt, DATA_FLAGS); 1615 clear_opt(sb, DATA_FLAGS);
1610 sbi->s_mount_opt |= data_opt; 1616 sbi->s_mount_opt |= data_opt;
1611 } 1617 }
1612 break; 1618 break;
1613 case Opt_data_err_abort: 1619 case Opt_data_err_abort:
1614 set_opt(sbi->s_mount_opt, DATA_ERR_ABORT); 1620 set_opt(sb, DATA_ERR_ABORT);
1615 break; 1621 break;
1616 case Opt_data_err_ignore: 1622 case Opt_data_err_ignore:
1617 clear_opt(sbi->s_mount_opt, DATA_ERR_ABORT); 1623 clear_opt(sb, DATA_ERR_ABORT);
1618 break; 1624 break;
1619#ifdef CONFIG_QUOTA 1625#ifdef CONFIG_QUOTA
1620 case Opt_usrjquota: 1626 case Opt_usrjquota:
@@ -1654,12 +1660,12 @@ set_qf_format:
1654 break; 1660 break;
1655 case Opt_quota: 1661 case Opt_quota:
1656 case Opt_usrquota: 1662 case Opt_usrquota:
1657 set_opt(sbi->s_mount_opt, QUOTA); 1663 set_opt(sb, QUOTA);
1658 set_opt(sbi->s_mount_opt, USRQUOTA); 1664 set_opt(sb, USRQUOTA);
1659 break; 1665 break;
1660 case Opt_grpquota: 1666 case Opt_grpquota:
1661 set_opt(sbi->s_mount_opt, QUOTA); 1667 set_opt(sb, QUOTA);
1662 set_opt(sbi->s_mount_opt, GRPQUOTA); 1668 set_opt(sb, GRPQUOTA);
1663 break; 1669 break;
1664 case Opt_noquota: 1670 case Opt_noquota:
1665 if (sb_any_quota_loaded(sb)) { 1671 if (sb_any_quota_loaded(sb)) {
@@ -1667,9 +1673,9 @@ set_qf_format:
1667 "options when quota turned on"); 1673 "options when quota turned on");
1668 return 0; 1674 return 0;
1669 } 1675 }
1670 clear_opt(sbi->s_mount_opt, QUOTA); 1676 clear_opt(sb, QUOTA);
1671 clear_opt(sbi->s_mount_opt, USRQUOTA); 1677 clear_opt(sb, USRQUOTA);
1672 clear_opt(sbi->s_mount_opt, GRPQUOTA); 1678 clear_opt(sb, GRPQUOTA);
1673 break; 1679 break;
1674#else 1680#else
1675 case Opt_quota: 1681 case Opt_quota:
@@ -1695,7 +1701,7 @@ set_qf_format:
1695 sbi->s_mount_flags |= EXT4_MF_FS_ABORTED; 1701 sbi->s_mount_flags |= EXT4_MF_FS_ABORTED;
1696 break; 1702 break;
1697 case Opt_nobarrier: 1703 case Opt_nobarrier:
1698 clear_opt(sbi->s_mount_opt, BARRIER); 1704 clear_opt(sb, BARRIER);
1699 break; 1705 break;
1700 case Opt_barrier: 1706 case Opt_barrier:
1701 if (args[0].from) { 1707 if (args[0].from) {
@@ -1704,9 +1710,9 @@ set_qf_format:
1704 } else 1710 } else
1705 option = 1; /* No argument, default to 1 */ 1711 option = 1; /* No argument, default to 1 */
1706 if (option) 1712 if (option)
1707 set_opt(sbi->s_mount_opt, BARRIER); 1713 set_opt(sb, BARRIER);
1708 else 1714 else
1709 clear_opt(sbi->s_mount_opt, BARRIER); 1715 clear_opt(sb, BARRIER);
1710 break; 1716 break;
1711 case Opt_ignore: 1717 case Opt_ignore:
1712 break; 1718 break;
@@ -1730,17 +1736,17 @@ set_qf_format:
1730 "Ignoring deprecated bh option"); 1736 "Ignoring deprecated bh option");
1731 break; 1737 break;
1732 case Opt_i_version: 1738 case Opt_i_version:
1733 set_opt(sbi->s_mount_opt, I_VERSION); 1739 set_opt(sb, I_VERSION);
1734 sb->s_flags |= MS_I_VERSION; 1740 sb->s_flags |= MS_I_VERSION;
1735 break; 1741 break;
1736 case Opt_nodelalloc: 1742 case Opt_nodelalloc:
1737 clear_opt(sbi->s_mount_opt, DELALLOC); 1743 clear_opt(sb, DELALLOC);
1738 break; 1744 break;
1739 case Opt_mblk_io_submit: 1745 case Opt_mblk_io_submit:
1740 set_opt(sbi->s_mount_opt, MBLK_IO_SUBMIT); 1746 set_opt(sb, MBLK_IO_SUBMIT);
1741 break; 1747 break;
1742 case Opt_nomblk_io_submit: 1748 case Opt_nomblk_io_submit:
1743 clear_opt(sbi->s_mount_opt, MBLK_IO_SUBMIT); 1749 clear_opt(sb, MBLK_IO_SUBMIT);
1744 break; 1750 break;
1745 case Opt_stripe: 1751 case Opt_stripe:
1746 if (match_int(&args[0], &option)) 1752 if (match_int(&args[0], &option))
@@ -1750,13 +1756,13 @@ set_qf_format:
1750 sbi->s_stripe = option; 1756 sbi->s_stripe = option;
1751 break; 1757 break;
1752 case Opt_delalloc: 1758 case Opt_delalloc:
1753 set_opt(sbi->s_mount_opt, DELALLOC); 1759 set_opt(sb, DELALLOC);
1754 break; 1760 break;
1755 case Opt_block_validity: 1761 case Opt_block_validity:
1756 set_opt(sbi->s_mount_opt, BLOCK_VALIDITY); 1762 set_opt(sb, BLOCK_VALIDITY);
1757 break; 1763 break;
1758 case Opt_noblock_validity: 1764 case Opt_noblock_validity:
1759 clear_opt(sbi->s_mount_opt, BLOCK_VALIDITY); 1765 clear_opt(sb, BLOCK_VALIDITY);
1760 break; 1766 break;
1761 case Opt_inode_readahead_blks: 1767 case Opt_inode_readahead_blks:
1762 if (match_int(&args[0], &option)) 1768 if (match_int(&args[0], &option))
@@ -1780,7 +1786,7 @@ set_qf_format:
1780 option); 1786 option);
1781 break; 1787 break;
1782 case Opt_noauto_da_alloc: 1788 case Opt_noauto_da_alloc:
1783 set_opt(sbi->s_mount_opt,NO_AUTO_DA_ALLOC); 1789 set_opt(sb, NO_AUTO_DA_ALLOC);
1784 break; 1790 break;
1785 case Opt_auto_da_alloc: 1791 case Opt_auto_da_alloc:
1786 if (args[0].from) { 1792 if (args[0].from) {
@@ -1789,24 +1795,24 @@ set_qf_format:
1789 } else 1795 } else
1790 option = 1; /* No argument, default to 1 */ 1796 option = 1; /* No argument, default to 1 */
1791 if (option) 1797 if (option)
1792 clear_opt(sbi->s_mount_opt, NO_AUTO_DA_ALLOC); 1798 clear_opt(sb, NO_AUTO_DA_ALLOC);
1793 else 1799 else
1794 set_opt(sbi->s_mount_opt,NO_AUTO_DA_ALLOC); 1800 set_opt(sb,NO_AUTO_DA_ALLOC);
1795 break; 1801 break;
1796 case Opt_discard: 1802 case Opt_discard:
1797 set_opt(sbi->s_mount_opt, DISCARD); 1803 set_opt(sb, DISCARD);
1798 break; 1804 break;
1799 case Opt_nodiscard: 1805 case Opt_nodiscard:
1800 clear_opt(sbi->s_mount_opt, DISCARD); 1806 clear_opt(sb, DISCARD);
1801 break; 1807 break;
1802 case Opt_dioread_nolock: 1808 case Opt_dioread_nolock:
1803 set_opt(sbi->s_mount_opt, DIOREAD_NOLOCK); 1809 set_opt(sb, DIOREAD_NOLOCK);
1804 break; 1810 break;
1805 case Opt_dioread_lock: 1811 case Opt_dioread_lock:
1806 clear_opt(sbi->s_mount_opt, DIOREAD_NOLOCK); 1812 clear_opt(sb, DIOREAD_NOLOCK);
1807 break; 1813 break;
1808 case Opt_init_inode_table: 1814 case Opt_init_inode_table:
1809 set_opt(sbi->s_mount_opt, INIT_INODE_TABLE); 1815 set_opt(sb, INIT_INODE_TABLE);
1810 if (args[0].from) { 1816 if (args[0].from) {
1811 if (match_int(&args[0], &option)) 1817 if (match_int(&args[0], &option))
1812 return 0; 1818 return 0;
@@ -1817,7 +1823,7 @@ set_qf_format:
1817 sbi->s_li_wait_mult = option; 1823 sbi->s_li_wait_mult = option;
1818 break; 1824 break;
1819 case Opt_noinit_inode_table: 1825 case Opt_noinit_inode_table:
1820 clear_opt(sbi->s_mount_opt, INIT_INODE_TABLE); 1826 clear_opt(sb, INIT_INODE_TABLE);
1821 break; 1827 break;
1822 default: 1828 default:
1823 ext4_msg(sb, KERN_ERR, 1829 ext4_msg(sb, KERN_ERR,
@@ -1829,10 +1835,10 @@ set_qf_format:
1829#ifdef CONFIG_QUOTA 1835#ifdef CONFIG_QUOTA
1830 if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) { 1836 if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) {
1831 if (test_opt(sb, USRQUOTA) && sbi->s_qf_names[USRQUOTA]) 1837 if (test_opt(sb, USRQUOTA) && sbi->s_qf_names[USRQUOTA])
1832 clear_opt(sbi->s_mount_opt, USRQUOTA); 1838 clear_opt(sb, USRQUOTA);
1833 1839
1834 if (test_opt(sb, GRPQUOTA) && sbi->s_qf_names[GRPQUOTA]) 1840 if (test_opt(sb, GRPQUOTA) && sbi->s_qf_names[GRPQUOTA])
1835 clear_opt(sbi->s_mount_opt, GRPQUOTA); 1841 clear_opt(sb, GRPQUOTA);
1836 1842
1837 if (test_opt(sb, GRPQUOTA) || test_opt(sb, USRQUOTA)) { 1843 if (test_opt(sb, GRPQUOTA) || test_opt(sb, USRQUOTA)) {
1838 ext4_msg(sb, KERN_ERR, "old and new quota " 1844 ext4_msg(sb, KERN_ERR, "old and new quota "
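The long run of set_opt()/clear_opt() changes above is mechanical: the macros now take the superblock and dig out the mount-option word themselves, which is what makes room for the second word (s_mount_opt2) seen in the remount hunks below. A compilable sketch of macros with that shape, using made-up option bits and a simplified super_block:

#include <stdio.h>

#define MOUNT_DEBUG     0x0001
#define MOUNT_DISCARD   0x0002

struct sb_info { unsigned long s_mount_opt; };
struct super_block { struct sb_info *s_fs_info; };

#define SBI(sb)                 ((sb)->s_fs_info)
#define set_opt(sb, opt)        (SBI(sb)->s_mount_opt |= MOUNT_##opt)
#define clear_opt(sb, opt)      (SBI(sb)->s_mount_opt &= ~MOUNT_##opt)
#define test_opt(sb, opt)       (SBI(sb)->s_mount_opt & MOUNT_##opt)

int main(void)
{
        struct sb_info sbi = { 0 };
        struct super_block sb = { &sbi };

        set_opt(&sb, DEBUG);
        set_opt(&sb, DISCARD);
        clear_opt(&sb, DEBUG);
        printf("opts=%#lx discard=%d\n",
               sbi.s_mount_opt, !!test_opt(&sb, DISCARD));
        return 0;
}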
@@ -1902,12 +1908,12 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
1902 ext4_commit_super(sb, 1); 1908 ext4_commit_super(sb, 1);
1903 if (test_opt(sb, DEBUG)) 1909 if (test_opt(sb, DEBUG))
1904 printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%u, " 1910 printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%u, "
1905 "bpg=%lu, ipg=%lu, mo=%04x]\n", 1911 "bpg=%lu, ipg=%lu, mo=%04x, mo2=%04x]\n",
1906 sb->s_blocksize, 1912 sb->s_blocksize,
1907 sbi->s_groups_count, 1913 sbi->s_groups_count,
1908 EXT4_BLOCKS_PER_GROUP(sb), 1914 EXT4_BLOCKS_PER_GROUP(sb),
1909 EXT4_INODES_PER_GROUP(sb), 1915 EXT4_INODES_PER_GROUP(sb),
1910 sbi->s_mount_opt); 1916 sbi->s_mount_opt, sbi->s_mount_opt2);
1911 1917
1912 return res; 1918 return res;
1913} 1919}
@@ -1937,14 +1943,13 @@ static int ext4_fill_flex_info(struct super_block *sb)
1937 size = flex_group_count * sizeof(struct flex_groups); 1943 size = flex_group_count * sizeof(struct flex_groups);
1938 sbi->s_flex_groups = kzalloc(size, GFP_KERNEL); 1944 sbi->s_flex_groups = kzalloc(size, GFP_KERNEL);
1939 if (sbi->s_flex_groups == NULL) { 1945 if (sbi->s_flex_groups == NULL) {
1940 sbi->s_flex_groups = vmalloc(size); 1946 sbi->s_flex_groups = vzalloc(size);
1941 if (sbi->s_flex_groups) 1947 if (sbi->s_flex_groups == NULL) {
1942 memset(sbi->s_flex_groups, 0, size); 1948 ext4_msg(sb, KERN_ERR,
1943 } 1949 "not enough memory for %u flex groups",
1944 if (sbi->s_flex_groups == NULL) { 1950 flex_group_count);
1945 ext4_msg(sb, KERN_ERR, "not enough memory for " 1951 goto failed;
1946 "%u flex groups", flex_group_count); 1952 }
1947 goto failed;
1948 } 1953 }
1949 1954
1950 for (i = 0; i < sbi->s_groups_count; i++) { 1955 for (i = 0; i < sbi->s_groups_count; i++) {
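vzalloc() gives the vmalloc() path the same zeroed-on-return guarantee kzalloc() already gave the small-allocation path, so the fallback collapses to a single branch. The sketch below mirrors only the control flow; calloc() stands in for both allocators, which is of course not how the kernel distinguishes them.

#include <stdlib.h>

static void *flex_groups_alloc(size_t size)
{
        void *p = calloc(1, size);      /* "kzalloc": may fail when large */

        if (!p)
                p = calloc(1, size);    /* "vzalloc": large-buffer fallback */
        return p;                       /* zeroed either way */
}

int main(void)
{
        void *g = flex_groups_alloc(1 << 20);

        free(g);
        return g ? 0 : 1;
}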
@@ -2923,7 +2928,7 @@ static int ext4_register_li_request(struct super_block *sb,
2923 struct ext4_sb_info *sbi = EXT4_SB(sb); 2928 struct ext4_sb_info *sbi = EXT4_SB(sb);
2924 struct ext4_li_request *elr; 2929 struct ext4_li_request *elr;
2925 ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count; 2930 ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
2926 int ret; 2931 int ret = 0;
2927 2932
2928 if (sbi->s_li_request != NULL) 2933 if (sbi->s_li_request != NULL)
2929 return 0; 2934 return 0;
@@ -3078,41 +3083,41 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3078 3083
3079 /* Set defaults before we parse the mount options */ 3084 /* Set defaults before we parse the mount options */
3080 def_mount_opts = le32_to_cpu(es->s_default_mount_opts); 3085 def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
3081 set_opt(sbi->s_mount_opt, INIT_INODE_TABLE); 3086 set_opt(sb, INIT_INODE_TABLE);
3082 if (def_mount_opts & EXT4_DEFM_DEBUG) 3087 if (def_mount_opts & EXT4_DEFM_DEBUG)
3083 set_opt(sbi->s_mount_opt, DEBUG); 3088 set_opt(sb, DEBUG);
3084 if (def_mount_opts & EXT4_DEFM_BSDGROUPS) { 3089 if (def_mount_opts & EXT4_DEFM_BSDGROUPS) {
3085 ext4_msg(sb, KERN_WARNING, deprecated_msg, "bsdgroups", 3090 ext4_msg(sb, KERN_WARNING, deprecated_msg, "bsdgroups",
3086 "2.6.38"); 3091 "2.6.38");
3087 set_opt(sbi->s_mount_opt, GRPID); 3092 set_opt(sb, GRPID);
3088 } 3093 }
3089 if (def_mount_opts & EXT4_DEFM_UID16) 3094 if (def_mount_opts & EXT4_DEFM_UID16)
3090 set_opt(sbi->s_mount_opt, NO_UID32); 3095 set_opt(sb, NO_UID32);
3091#ifdef CONFIG_EXT4_FS_XATTR 3096#ifdef CONFIG_EXT4_FS_XATTR
3092 if (def_mount_opts & EXT4_DEFM_XATTR_USER) 3097 if (def_mount_opts & EXT4_DEFM_XATTR_USER)
3093 set_opt(sbi->s_mount_opt, XATTR_USER); 3098 set_opt(sb, XATTR_USER);
3094#endif 3099#endif
3095#ifdef CONFIG_EXT4_FS_POSIX_ACL 3100#ifdef CONFIG_EXT4_FS_POSIX_ACL
3096 if (def_mount_opts & EXT4_DEFM_ACL) 3101 if (def_mount_opts & EXT4_DEFM_ACL)
3097 set_opt(sbi->s_mount_opt, POSIX_ACL); 3102 set_opt(sb, POSIX_ACL);
3098#endif 3103#endif
3099 if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_DATA) 3104 if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_DATA)
3100 set_opt(sbi->s_mount_opt, JOURNAL_DATA); 3105 set_opt(sb, JOURNAL_DATA);
3101 else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_ORDERED) 3106 else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_ORDERED)
3102 set_opt(sbi->s_mount_opt, ORDERED_DATA); 3107 set_opt(sb, ORDERED_DATA);
3103 else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_WBACK) 3108 else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_WBACK)
3104 set_opt(sbi->s_mount_opt, WRITEBACK_DATA); 3109 set_opt(sb, WRITEBACK_DATA);
3105 3110
3106 if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_PANIC) 3111 if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_PANIC)
3107 set_opt(sbi->s_mount_opt, ERRORS_PANIC); 3112 set_opt(sb, ERRORS_PANIC);
3108 else if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_CONTINUE) 3113 else if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_CONTINUE)
3109 set_opt(sbi->s_mount_opt, ERRORS_CONT); 3114 set_opt(sb, ERRORS_CONT);
3110 else 3115 else
3111 set_opt(sbi->s_mount_opt, ERRORS_RO); 3116 set_opt(sb, ERRORS_RO);
3112 if (def_mount_opts & EXT4_DEFM_BLOCK_VALIDITY) 3117 if (def_mount_opts & EXT4_DEFM_BLOCK_VALIDITY)
3113 set_opt(sbi->s_mount_opt, BLOCK_VALIDITY); 3118 set_opt(sb, BLOCK_VALIDITY);
3114 if (def_mount_opts & EXT4_DEFM_DISCARD) 3119 if (def_mount_opts & EXT4_DEFM_DISCARD)
3115 set_opt(sbi->s_mount_opt, DISCARD); 3120 set_opt(sb, DISCARD);
3116 3121
3117 sbi->s_resuid = le16_to_cpu(es->s_def_resuid); 3122 sbi->s_resuid = le16_to_cpu(es->s_def_resuid);
3118 sbi->s_resgid = le16_to_cpu(es->s_def_resgid); 3123 sbi->s_resgid = le16_to_cpu(es->s_def_resgid);
@@ -3121,7 +3126,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3121 sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME; 3126 sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME;
3122 3127
3123 if ((def_mount_opts & EXT4_DEFM_NOBARRIER) == 0) 3128 if ((def_mount_opts & EXT4_DEFM_NOBARRIER) == 0)
3124 set_opt(sbi->s_mount_opt, BARRIER); 3129 set_opt(sb, BARRIER);
3125 3130
3126 /* 3131 /*
3127 * enable delayed allocation by default 3132 * enable delayed allocation by default
@@ -3129,7 +3134,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3129 */ 3134 */
3130 if (!IS_EXT3_SB(sb) && 3135 if (!IS_EXT3_SB(sb) &&
3131 ((def_mount_opts & EXT4_DEFM_NODELALLOC) == 0)) 3136 ((def_mount_opts & EXT4_DEFM_NODELALLOC) == 0))
3132 set_opt(sbi->s_mount_opt, DELALLOC); 3137 set_opt(sb, DELALLOC);
3133 3138
3134 if (!parse_options((char *) sbi->s_es->s_mount_opts, sb, 3139 if (!parse_options((char *) sbi->s_es->s_mount_opts, sb,
3135 &journal_devnum, &journal_ioprio, NULL, 0)) { 3140 &journal_devnum, &journal_ioprio, NULL, 0)) {
@@ -3432,8 +3437,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3432 "suppressed and not mounted read-only"); 3437 "suppressed and not mounted read-only");
3433 goto failed_mount_wq; 3438 goto failed_mount_wq;
3434 } else { 3439 } else {
3435 clear_opt(sbi->s_mount_opt, DATA_FLAGS); 3440 clear_opt(sb, DATA_FLAGS);
3436 set_opt(sbi->s_mount_opt, WRITEBACK_DATA); 3441 set_opt(sb, WRITEBACK_DATA);
3437 sbi->s_journal = NULL; 3442 sbi->s_journal = NULL;
3438 needs_recovery = 0; 3443 needs_recovery = 0;
3439 goto no_journal; 3444 goto no_journal;
@@ -3471,9 +3476,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3471 */ 3476 */
3472 if (jbd2_journal_check_available_features 3477 if (jbd2_journal_check_available_features
3473 (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) 3478 (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE))
3474 set_opt(sbi->s_mount_opt, ORDERED_DATA); 3479 set_opt(sb, ORDERED_DATA);
3475 else 3480 else
3476 set_opt(sbi->s_mount_opt, JOURNAL_DATA); 3481 set_opt(sb, JOURNAL_DATA);
3477 break; 3482 break;
3478 3483
3479 case EXT4_MOUNT_ORDERED_DATA: 3484 case EXT4_MOUNT_ORDERED_DATA:
@@ -3563,18 +3568,18 @@ no_journal:
3563 (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)) { 3568 (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)) {
3564 ext4_msg(sb, KERN_WARNING, "Ignoring delalloc option - " 3569 ext4_msg(sb, KERN_WARNING, "Ignoring delalloc option - "
3565 "requested data journaling mode"); 3570 "requested data journaling mode");
3566 clear_opt(sbi->s_mount_opt, DELALLOC); 3571 clear_opt(sb, DELALLOC);
3567 } 3572 }
3568 if (test_opt(sb, DIOREAD_NOLOCK)) { 3573 if (test_opt(sb, DIOREAD_NOLOCK)) {
3569 if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { 3574 if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
3570 ext4_msg(sb, KERN_WARNING, "Ignoring dioread_nolock " 3575 ext4_msg(sb, KERN_WARNING, "Ignoring dioread_nolock "
3571 "option - requested data journaling mode"); 3576 "option - requested data journaling mode");
3572 clear_opt(sbi->s_mount_opt, DIOREAD_NOLOCK); 3577 clear_opt(sb, DIOREAD_NOLOCK);
3573 } 3578 }
3574 if (sb->s_blocksize < PAGE_SIZE) { 3579 if (sb->s_blocksize < PAGE_SIZE) {
3575 ext4_msg(sb, KERN_WARNING, "Ignoring dioread_nolock " 3580 ext4_msg(sb, KERN_WARNING, "Ignoring dioread_nolock "
3576 "option - block size is too small"); 3581 "option - block size is too small");
3577 clear_opt(sbi->s_mount_opt, DIOREAD_NOLOCK); 3582 clear_opt(sb, DIOREAD_NOLOCK);
3578 } 3583 }
3579 } 3584 }
3580 3585
@@ -3772,13 +3777,6 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb,
3772 if (bdev == NULL) 3777 if (bdev == NULL)
3773 return NULL; 3778 return NULL;
3774 3779
3775 if (bd_claim(bdev, sb)) {
3776 ext4_msg(sb, KERN_ERR,
3777 "failed to claim external journal device");
3778 blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
3779 return NULL;
3780 }
3781
3782 blocksize = sb->s_blocksize; 3780 blocksize = sb->s_blocksize;
3783 hblock = bdev_logical_block_size(bdev); 3781 hblock = bdev_logical_block_size(bdev);
3784 if (blocksize < hblock) { 3782 if (blocksize < hblock) {
@@ -4173,6 +4171,22 @@ static int ext4_unfreeze(struct super_block *sb)
4173 return 0; 4171 return 0;
4174} 4172}
4175 4173
4174/*
4175 * Structure to save mount options for ext4_remount's benefit
4176 */
4177struct ext4_mount_options {
4178 unsigned long s_mount_opt;
4179 unsigned long s_mount_opt2;
4180 uid_t s_resuid;
4181 gid_t s_resgid;
4182 unsigned long s_commit_interval;
4183 u32 s_min_batch_time, s_max_batch_time;
4184#ifdef CONFIG_QUOTA
4185 int s_jquota_fmt;
4186 char *s_qf_names[MAXQUOTAS];
4187#endif
4188};
4189
4176static int ext4_remount(struct super_block *sb, int *flags, char *data) 4190static int ext4_remount(struct super_block *sb, int *flags, char *data)
4177{ 4191{
4178 struct ext4_super_block *es; 4192 struct ext4_super_block *es;
@@ -4193,6 +4207,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
4193 lock_super(sb); 4207 lock_super(sb);
4194 old_sb_flags = sb->s_flags; 4208 old_sb_flags = sb->s_flags;
4195 old_opts.s_mount_opt = sbi->s_mount_opt; 4209 old_opts.s_mount_opt = sbi->s_mount_opt;
4210 old_opts.s_mount_opt2 = sbi->s_mount_opt2;
4196 old_opts.s_resuid = sbi->s_resuid; 4211 old_opts.s_resuid = sbi->s_resuid;
4197 old_opts.s_resgid = sbi->s_resgid; 4212 old_opts.s_resgid = sbi->s_resgid;
4198 old_opts.s_commit_interval = sbi->s_commit_interval; 4213 old_opts.s_commit_interval = sbi->s_commit_interval;
@@ -4346,6 +4361,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
4346restore_opts: 4361restore_opts:
4347 sb->s_flags = old_sb_flags; 4362 sb->s_flags = old_sb_flags;
4348 sbi->s_mount_opt = old_opts.s_mount_opt; 4363 sbi->s_mount_opt = old_opts.s_mount_opt;
4364 sbi->s_mount_opt2 = old_opts.s_mount_opt2;
4349 sbi->s_resuid = old_opts.s_resuid; 4365 sbi->s_resuid = old_opts.s_resuid;
4350 sbi->s_resgid = old_opts.s_resgid; 4366 sbi->s_resgid = old_opts.s_resgid;
4351 sbi->s_commit_interval = old_opts.s_commit_interval; 4367 sbi->s_commit_interval = old_opts.s_commit_interval;
@@ -4542,27 +4558,20 @@ static int ext4_quota_on_mount(struct super_block *sb, int type)
4542 * Standard function to be called on quota_on 4558 * Standard function to be called on quota_on
4543 */ 4559 */
4544static int ext4_quota_on(struct super_block *sb, int type, int format_id, 4560static int ext4_quota_on(struct super_block *sb, int type, int format_id,
4545 char *name) 4561 struct path *path)
4546{ 4562{
4547 int err; 4563 int err;
4548 struct path path;
4549 4564
4550 if (!test_opt(sb, QUOTA)) 4565 if (!test_opt(sb, QUOTA))
4551 return -EINVAL; 4566 return -EINVAL;
4552 4567
4553 err = kern_path(name, LOOKUP_FOLLOW, &path);
4554 if (err)
4555 return err;
4556
4557 /* Quotafile not on the same filesystem? */ 4568 /* Quotafile not on the same filesystem? */
4558 if (path.mnt->mnt_sb != sb) { 4569 if (path->mnt->mnt_sb != sb)
4559 path_put(&path);
4560 return -EXDEV; 4570 return -EXDEV;
4561 }
4562 /* Journaling quota? */ 4571 /* Journaling quota? */
4563 if (EXT4_SB(sb)->s_qf_names[type]) { 4572 if (EXT4_SB(sb)->s_qf_names[type]) {
4564 /* Quotafile not in fs root? */ 4573 /* Quotafile not in fs root? */
4565 if (path.dentry->d_parent != sb->s_root) 4574 if (path->dentry->d_parent != sb->s_root)
4566 ext4_msg(sb, KERN_WARNING, 4575 ext4_msg(sb, KERN_WARNING,
4567 "Quota file not on filesystem root. " 4576 "Quota file not on filesystem root. "
4568 "Journaled quota will not work"); 4577 "Journaled quota will not work");
@@ -4573,7 +4582,7 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id,
4573 * all updates to the file when we bypass pagecache... 4582 * all updates to the file when we bypass pagecache...
4574 */ 4583 */
4575 if (EXT4_SB(sb)->s_journal && 4584 if (EXT4_SB(sb)->s_journal &&
4576 ext4_should_journal_data(path.dentry->d_inode)) { 4585 ext4_should_journal_data(path->dentry->d_inode)) {
4577 /* 4586 /*
4578 * We don't need to lock updates but journal_flush() could 4587 * We don't need to lock updates but journal_flush() could
4579 * otherwise be livelocked... 4588 * otherwise be livelocked...
@@ -4581,15 +4590,11 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id,
4581 jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); 4590 jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
4582 err = jbd2_journal_flush(EXT4_SB(sb)->s_journal); 4591 err = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
4583 jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); 4592 jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
4584 if (err) { 4593 if (err)
4585 path_put(&path);
4586 return err; 4594 return err;
4587 }
4588 } 4595 }
4589 4596
4590 err = dquot_quota_on_path(sb, type, format_id, &path); 4597 return dquot_quota_on(sb, type, format_id, path);
4591 path_put(&path);
4592 return err;
4593} 4598}
4594 4599
4595static int ext4_quota_off(struct super_block *sb, int type) 4600static int ext4_quota_off(struct super_block *sb, int type)
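The quota_on signature change moves path resolution to the caller: the VFS hands ext4 a resolved struct path that the hook merely borrows, so kern_path()/path_put() disappear from it. A sketch of that borrowed-argument convention, with a dummy struct path and an illustrative quota file name:

#include <stdio.h>

struct path { const char *name; };      /* stand-in for the VFS struct */

/* The hook only inspects the path; it neither looks it up
 * nor releases it. */
static int quota_on(struct path *path)
{
        printf("quota file: %s\n", path->name);
        return 0;
}

int main(void)
{
        struct path p = { "/aquota.user" };     /* caller did the lookup */
        int err = quota_on(&p);

        /* caller releases the path after the call */
        return err;
}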
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index fa4b899da4b3..fc32176eee39 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -427,23 +427,23 @@ cleanup:
427static int 427static int
428ext4_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) 428ext4_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size)
429{ 429{
430 int i_error, b_error; 430 int ret, ret2;
431 431
432 down_read(&EXT4_I(dentry->d_inode)->xattr_sem); 432 down_read(&EXT4_I(dentry->d_inode)->xattr_sem);
433 i_error = ext4_xattr_ibody_list(dentry, buffer, buffer_size); 433 ret = ret2 = ext4_xattr_ibody_list(dentry, buffer, buffer_size);
434 if (i_error < 0) { 434 if (ret < 0)
435 b_error = 0; 435 goto errout;
436 } else { 436 if (buffer) {
437 if (buffer) { 437 buffer += ret;
438 buffer += i_error; 438 buffer_size -= ret;
439 buffer_size -= i_error;
440 }
441 b_error = ext4_xattr_block_list(dentry, buffer, buffer_size);
442 if (b_error < 0)
443 i_error = 0;
444 } 439 }
440 ret = ext4_xattr_block_list(dentry, buffer, buffer_size);
441 if (ret < 0)
442 goto errout;
443 ret += ret2;
444errout:
445 up_read(&EXT4_I(dentry->d_inode)->xattr_sem); 445 up_read(&EXT4_I(dentry->d_inode)->xattr_sem);
446 return i_error + b_error; 446 return ret;
447} 447}
448 448
449/* 449/*
@@ -947,7 +947,7 @@ ext4_xattr_ibody_set(handle_t *handle, struct inode *inode,
947/* 947/*
948 * ext4_xattr_set_handle() 948 * ext4_xattr_set_handle()
949 * 949 *
950 * Create, replace or remove an extended attribute for this inode. Buffer 950 * Create, replace or remove an extended attribute for this inode. Value
951 * is NULL to remove an existing extended attribute, and non-NULL to 951 * is NULL to remove an existing extended attribute, and non-NULL to
952 * either replace an existing extended attribute, or create a new extended 952 * either replace an existing extended attribute, or create a new extended
953 * attribute. The flags XATTR_REPLACE and XATTR_CREATE 953 * attribute. The flags XATTR_REPLACE and XATTR_CREATE
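The rewritten ext4_xattr_list() above keeps the first lister's byte count in ret2, advances the caller's buffer, then adds the second count, returning the first negative error otherwise. The sketch below reproduces exactly that shape in userspace; list_names() and the attribute names are invented.

#include <stdio.h>
#include <string.h>

/* Stand-in for the in-inode/block listers: writes one name into buf
 * (when non-NULL) and returns bytes used, or a negative error. */
static int list_names(const char *src, char *buf, size_t size)
{
        size_t n = strlen(src) + 1;

        if (buf) {
                if (size < n)
                        return -34;     /* -ERANGE-style */
                memcpy(buf, src, n);
        }
        return (int)n;
}

static int xattr_list(char *buffer, size_t buffer_size)
{
        int ret, ret2;

        ret = ret2 = list_names("user.inline", buffer, buffer_size);
        if (ret < 0)
                goto errout;
        if (buffer) {
                buffer += ret;
                buffer_size -= ret;
        }
        ret = list_names("user.block", buffer, buffer_size);
        if (ret < 0)
                goto errout;
        ret += ret2;
errout:
        return ret;
}

int main(void)
{
        char buf[64];
        int n = xattr_list(buf, sizeof(buf));

        printf("total bytes: %d\n", n);
        return n < 0;
}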
diff --git a/fs/fat/fat.h b/fs/fat/fat.h
index d75a77f85c28..f50408901f7e 100644
--- a/fs/fat/fat.h
+++ b/fs/fat/fat.h
@@ -319,7 +319,8 @@ extern struct inode *fat_build_inode(struct super_block *sb,
319 struct msdos_dir_entry *de, loff_t i_pos); 319 struct msdos_dir_entry *de, loff_t i_pos);
320extern int fat_sync_inode(struct inode *inode); 320extern int fat_sync_inode(struct inode *inode);
321extern int fat_fill_super(struct super_block *sb, void *data, int silent, 321extern int fat_fill_super(struct super_block *sb, void *data, int silent,
322 const struct inode_operations *fs_dir_inode_ops, int isvfat); 322 const struct inode_operations *fs_dir_inode_ops,
323 int isvfat, void (*setup)(struct super_block *));
323 324
324extern int fat_flush_inodes(struct super_block *sb, struct inode *i1, 325extern int fat_flush_inodes(struct super_block *sb, struct inode *i1,
325 struct inode *i2); 326 struct inode *i2);
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 206351af7c58..86753fe10bd1 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -703,7 +703,6 @@ static struct dentry *fat_fh_to_dentry(struct super_block *sb,
703 struct fid *fid, int fh_len, int fh_type) 703 struct fid *fid, int fh_len, int fh_type)
704{ 704{
705 struct inode *inode = NULL; 705 struct inode *inode = NULL;
706 struct dentry *result;
707 u32 *fh = fid->raw; 706 u32 *fh = fid->raw;
708 707
709 if (fh_len < 5 || fh_type != 3) 708 if (fh_len < 5 || fh_type != 3)
@@ -748,10 +747,7 @@ static struct dentry *fat_fh_to_dentry(struct super_block *sb,
748 * the fat_iget lookup again. If that fails, then we are totally out 747 * the fat_iget lookup again. If that fails, then we are totally out
749 * of luck. But all that is for another day 748 * of luck. But all that is for another day
750 */ 749 */
751 result = d_obtain_alias(inode); 750 return d_obtain_alias(inode);
752 if (!IS_ERR(result))
753 d_set_d_op(result, sb->s_root->d_op);
754 return result;
755} 751}
756 752
757static int 753static int
@@ -799,8 +795,6 @@ static struct dentry *fat_get_parent(struct dentry *child)
799 brelse(bh); 795 brelse(bh);
800 796
801 parent = d_obtain_alias(inode); 797 parent = d_obtain_alias(inode);
802 if (!IS_ERR(parent))
803 d_set_d_op(parent, sb->s_root->d_op);
804out: 798out:
805 unlock_super(sb); 799 unlock_super(sb);
806 800
@@ -1244,7 +1238,8 @@ static int fat_read_root(struct inode *inode)
1244 * Read the super block of an MS-DOS FS. 1238 * Read the super block of an MS-DOS FS.
1245 */ 1239 */
1246int fat_fill_super(struct super_block *sb, void *data, int silent, 1240int fat_fill_super(struct super_block *sb, void *data, int silent,
1247 const struct inode_operations *fs_dir_inode_ops, int isvfat) 1241 const struct inode_operations *fs_dir_inode_ops, int isvfat,
1242 void (*setup)(struct super_block *))
1248{ 1243{
1249 struct inode *root_inode = NULL, *fat_inode = NULL; 1244 struct inode *root_inode = NULL, *fat_inode = NULL;
1250 struct buffer_head *bh; 1245 struct buffer_head *bh;
@@ -1280,6 +1275,8 @@ int fat_fill_super(struct super_block *sb, void *data, int silent,
1280 if (error) 1275 if (error)
1281 goto out_fail; 1276 goto out_fail;
1282 1277
1278 setup(sb); /* flavour-specific stuff that needs options */
1279
1283 error = -EIO; 1280 error = -EIO;
1284 sb_min_blocksize(sb, 512); 1281 sb_min_blocksize(sb, 512);
1285 bh = sb_bread(sb, 0); 1282 bh = sb_bread(sb, 0);
diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c
index 35ffe43afa4b..711499040eb6 100644
--- a/fs/fat/namei_msdos.c
+++ b/fs/fat/namei_msdos.c
@@ -227,11 +227,7 @@ static struct dentry *msdos_lookup(struct inode *dir, struct dentry *dentry,
227 } 227 }
228out: 228out:
229 unlock_super(sb); 229 unlock_super(sb);
230 d_set_d_op(dentry, &msdos_dentry_operations); 230 return d_splice_alias(inode, dentry);
231 dentry = d_splice_alias(inode, dentry);
232 if (dentry)
233 d_set_d_op(dentry, &msdos_dentry_operations);
234 return dentry;
235 231
236error: 232error:
237 unlock_super(sb); 233 unlock_super(sb);
@@ -661,21 +657,16 @@ static const struct inode_operations msdos_dir_inode_operations = {
661 .getattr = fat_getattr, 657 .getattr = fat_getattr,
662}; 658};
663 659
664static int msdos_fill_super(struct super_block *sb, void *data, int silent) 660static void setup(struct super_block *sb)
665{ 661{
666 int res; 662 sb->s_d_op = &msdos_dentry_operations;
667
668 lock_super(sb);
669 res = fat_fill_super(sb, data, silent, &msdos_dir_inode_operations, 0);
670 if (res) {
671 unlock_super(sb);
672 return res;
673 }
674
675 sb->s_flags |= MS_NOATIME; 663 sb->s_flags |= MS_NOATIME;
676 d_set_d_op(sb->s_root, &msdos_dentry_operations); 664}
677 unlock_super(sb); 665
678 return 0; 666static int msdos_fill_super(struct super_block *sb, void *data, int silent)
667{
668 return fat_fill_super(sb, data, silent, &msdos_dir_inode_operations,
669 0, setup);
679} 670}
680 671
681static struct dentry *msdos_mount(struct file_system_type *fs_type, 672static struct dentry *msdos_mount(struct file_system_type *fs_type,
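fat_fill_super() gaining a setup() callback lets each flavour install sb->s_d_op after options are parsed but before the root dentry exists, so the scattered d_set_d_op() calls in the lookup paths can go (in this kernel era, d_alloc() applies s_d_op to every new dentry). A toy sketch of the callback shape; the tag string stands in for the real dentry_operations.

#include <stdio.h>

struct super_block { const char *s_d_op; };     /* tag instead of ops */

static int fill_super(struct super_block *sb,
                      void (*setup)(struct super_block *))
{
        /* ...option parsing happens first... */
        setup(sb);
        /* ...root and all later dentries inherit sb->s_d_op... */
        return 0;
}

static void msdos_setup(struct super_block *sb)
{
        sb->s_d_op = "msdos_dentry_operations";
}

int main(void)
{
        struct super_block sb = { 0 };

        fill_super(&sb, msdos_setup);
        printf("s_d_op = %s\n", sb.s_d_op);
        return 0;
}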
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index e3ffc5e12332..f88f752babd9 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -772,13 +772,10 @@ static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry,
772 772
773out: 773out:
774 unlock_super(sb); 774 unlock_super(sb);
775 d_set_d_op(dentry, sb->s_root->d_op);
776 dentry->d_time = dentry->d_parent->d_inode->i_version; 775 dentry->d_time = dentry->d_parent->d_inode->i_version;
777 dentry = d_splice_alias(inode, dentry); 776 dentry = d_splice_alias(inode, dentry);
778 if (dentry) { 777 if (dentry)
779 d_set_d_op(dentry, sb->s_root->d_op);
780 dentry->d_time = dentry->d_parent->d_inode->i_version; 778 dentry->d_time = dentry->d_parent->d_inode->i_version;
781 }
782 return dentry; 779 return dentry;
783 780
784error: 781error:
@@ -1066,24 +1063,18 @@ static const struct inode_operations vfat_dir_inode_operations = {
1066 .getattr = fat_getattr, 1063 .getattr = fat_getattr,
1067}; 1064};
1068 1065
1069static int vfat_fill_super(struct super_block *sb, void *data, int silent) 1066static void setup(struct super_block *sb)
1070{ 1067{
1071 int res;
1072
1073 lock_super(sb);
1074 res = fat_fill_super(sb, data, silent, &vfat_dir_inode_operations, 1);
1075 if (res) {
1076 unlock_super(sb);
1077 return res;
1078 }
1079
1080 if (MSDOS_SB(sb)->options.name_check != 's') 1068 if (MSDOS_SB(sb)->options.name_check != 's')
1081 d_set_d_op(sb->s_root, &vfat_ci_dentry_ops); 1069 sb->s_d_op = &vfat_ci_dentry_ops;
1082 else 1070 else
1083 d_set_d_op(sb->s_root, &vfat_dentry_ops); 1071 sb->s_d_op = &vfat_dentry_ops;
1072}
1084 1073
1085 unlock_super(sb); 1074static int vfat_fill_super(struct super_block *sb, void *data, int silent)
1086 return 0; 1075{
1076 return fat_fill_super(sb, data, silent, &vfat_dir_inode_operations,
1077 1, setup);
1087} 1078}
1088 1079
1089static struct dentry *vfat_mount(struct file_system_type *fs_type, 1080static struct dentry *vfat_mount(struct file_system_type *fs_type,
diff --git a/fs/file_table.c b/fs/file_table.c
index c3dee381f1b4..c3e89adf53c0 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -311,7 +311,7 @@ struct file *fget_light(unsigned int fd, int *fput_needed)
311 struct files_struct *files = current->files; 311 struct files_struct *files = current->files;
312 312
313 *fput_needed = 0; 313 *fput_needed = 0;
314 if (likely((atomic_read(&files->count) == 1))) { 314 if (atomic_read(&files->count) == 1) {
315 file = fcheck_files(files, fd); 315 file = fcheck_files(files, fd);
316 } else { 316 } else {
317 rcu_read_lock(); 317 rcu_read_lock();
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 3d06ccc953aa..59c6e4956786 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -84,13 +84,9 @@ static inline struct inode *wb_inode(struct list_head *head)
84 return list_entry(head, struct inode, i_wb_list); 84 return list_entry(head, struct inode, i_wb_list);
85} 85}
86 86
87static void bdi_queue_work(struct backing_dev_info *bdi, 87/* Wakeup flusher thread or forker thread to fork it. Requires bdi->wb_lock. */
88 struct wb_writeback_work *work) 88static void bdi_wakeup_flusher(struct backing_dev_info *bdi)
89{ 89{
90 trace_writeback_queue(bdi, work);
91
92 spin_lock_bh(&bdi->wb_lock);
93 list_add_tail(&work->list, &bdi->work_list);
94 if (bdi->wb.task) { 90 if (bdi->wb.task) {
95 wake_up_process(bdi->wb.task); 91 wake_up_process(bdi->wb.task);
96 } else { 92 } else {
@@ -98,15 +94,26 @@ static void bdi_queue_work(struct backing_dev_info *bdi,
98 * The bdi thread isn't there, wake up the forker thread which 94 * The bdi thread isn't there, wake up the forker thread which
99 * will create and run it. 95 * will create and run it.
100 */ 96 */
101 trace_writeback_nothread(bdi, work);
102 wake_up_process(default_backing_dev_info.wb.task); 97 wake_up_process(default_backing_dev_info.wb.task);
103 } 98 }
99}
100
101static void bdi_queue_work(struct backing_dev_info *bdi,
102 struct wb_writeback_work *work)
103{
104 trace_writeback_queue(bdi, work);
105
106 spin_lock_bh(&bdi->wb_lock);
107 list_add_tail(&work->list, &bdi->work_list);
108 if (!bdi->wb.task)
109 trace_writeback_nothread(bdi, work);
110 bdi_wakeup_flusher(bdi);
104 spin_unlock_bh(&bdi->wb_lock); 111 spin_unlock_bh(&bdi->wb_lock);
105} 112}
106 113
107static void 114static void
108__bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, 115__bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
109 bool range_cyclic, bool for_background) 116 bool range_cyclic)
110{ 117{
111 struct wb_writeback_work *work; 118 struct wb_writeback_work *work;
112 119
@@ -126,7 +133,6 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
126 work->sync_mode = WB_SYNC_NONE; 133 work->sync_mode = WB_SYNC_NONE;
127 work->nr_pages = nr_pages; 134 work->nr_pages = nr_pages;
128 work->range_cyclic = range_cyclic; 135 work->range_cyclic = range_cyclic;
129 work->for_background = for_background;
130 136
131 bdi_queue_work(bdi, work); 137 bdi_queue_work(bdi, work);
132} 138}
@@ -144,7 +150,7 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
144 */ 150 */
145void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages) 151void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages)
146{ 152{
147 __bdi_start_writeback(bdi, nr_pages, true, false); 153 __bdi_start_writeback(bdi, nr_pages, true);
148} 154}
149 155
150/** 156/**
@@ -152,13 +158,21 @@ void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages)
152 * @bdi: the backing device to write from 158 * @bdi: the backing device to write from
153 * 159 *
154 * Description: 160 * Description:
155 * This does WB_SYNC_NONE background writeback. The IO is only 161 * This makes sure WB_SYNC_NONE background writeback happens. When
156 * started when this function returns, we make no guarentees on 162 * this function returns, it is only guaranteed that for given BDI
157 * completion. Caller need not hold sb s_umount semaphore. 163 * some IO is happening if we are over background dirty threshold.
164 * Caller need not hold sb s_umount semaphore.
158 */ 165 */
159void bdi_start_background_writeback(struct backing_dev_info *bdi) 166void bdi_start_background_writeback(struct backing_dev_info *bdi)
160{ 167{
161 __bdi_start_writeback(bdi, LONG_MAX, true, true); 168 /*
169 * We just wake up the flusher thread. It will perform background
170 * writeback as soon as there is no other work to do.
171 */
172 trace_writeback_wake_background(bdi);
173 spin_lock_bh(&bdi->wb_lock);
174 bdi_wakeup_flusher(bdi);
175 spin_unlock_bh(&bdi->wb_lock);
162} 176}
163 177
164/* 178/*
@@ -616,6 +630,7 @@ static long wb_writeback(struct bdi_writeback *wb,
616 }; 630 };
617 unsigned long oldest_jif; 631 unsigned long oldest_jif;
618 long wrote = 0; 632 long wrote = 0;
633 long write_chunk;
619 struct inode *inode; 634 struct inode *inode;
620 635
621 if (wbc.for_kupdate) { 636 if (wbc.for_kupdate) {
@@ -628,6 +643,24 @@ static long wb_writeback(struct bdi_writeback *wb,
628 wbc.range_end = LLONG_MAX; 643 wbc.range_end = LLONG_MAX;
629 } 644 }
630 645
646 /*
647 * WB_SYNC_ALL mode does livelock avoidance by syncing dirty
648 * inodes/pages in one big loop. Setting wbc.nr_to_write=LONG_MAX
649 * here avoids calling into writeback_inodes_wb() more than once.
650 *
651 * The intended call sequence for WB_SYNC_ALL writeback is:
652 *
653 * wb_writeback()
654 * __writeback_inodes_sb() <== called only once
655 * write_cache_pages() <== called once for each inode
656 * (quickly) tag currently dirty pages
657 * (maybe slowly) sync all tagged pages
658 */
659 if (wbc.sync_mode == WB_SYNC_NONE)
660 write_chunk = MAX_WRITEBACK_PAGES;
661 else
662 write_chunk = LONG_MAX;
663
631 wbc.wb_start = jiffies; /* livelock avoidance */ 664 wbc.wb_start = jiffies; /* livelock avoidance */
632 for (;;) { 665 for (;;) {
633 /* 666 /*
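
The comment above is the key to the chunking change: WB_SYNC_NONE keeps writing in MAX_WRITEBACK_PAGES slices so no single work item monopolizes the thread, while WB_SYNC_ALL uses LONG_MAX so __writeback_inodes_sb() runs once and page tagging, not chunking, prevents livelock. A tiny model of the quota selection and the per-iteration accounting (names hypothetical):

#include <limits.h>

#define MAX_WRITEBACK_PAGES 1024   /* kernel value, for illustration */

enum sync_mode { WB_SYNC_NONE, WB_SYNC_ALL };

/* Pick the per-iteration write quota as wb_writeback() now does:
 * bounded chunks for best-effort writeback, effectively unbounded
 * for data-integrity sync, which relies on page tagging instead. */
static long pick_write_chunk(enum sync_mode mode)
{
    return mode == WB_SYNC_NONE ? MAX_WRITEBACK_PAGES : LONG_MAX;
}

/* One loop iteration's accounting, mirroring the hunks below:
 * what was consumed is write_chunk minus what is left of nr_to_write. */
static long account_progress(long write_chunk, long nr_to_write_left,
                             long *nr_pages)
{
    long wrote = write_chunk - nr_to_write_left;
    *nr_pages -= wrote;
    return wrote;
}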
@@ -637,6 +670,16 @@ static long wb_writeback(struct bdi_writeback *wb,
637 break; 670 break;
638 671
639 /* 672 /*
673 * Background writeout and kupdate-style writeback may
674 * run forever. Stop them if there is other work to do
675 * so that e.g. sync can proceed. They'll be restarted
676 * after the other works are all done.
677 */
678 if ((work->for_background || work->for_kupdate) &&
679 !list_empty(&wb->bdi->work_list))
680 break;
681
682 /*
640 * For background writeout, stop when we are below the 683 * For background writeout, stop when we are below the
641 * background dirty threshold 684 * background dirty threshold
642 */ 685 */
@@ -644,7 +687,7 @@ static long wb_writeback(struct bdi_writeback *wb,
644 break; 687 break;
645 688
646 wbc.more_io = 0; 689 wbc.more_io = 0;
647 wbc.nr_to_write = MAX_WRITEBACK_PAGES; 690 wbc.nr_to_write = write_chunk;
648 wbc.pages_skipped = 0; 691 wbc.pages_skipped = 0;
649 692
650 trace_wbc_writeback_start(&wbc, wb->bdi); 693 trace_wbc_writeback_start(&wbc, wb->bdi);
@@ -654,8 +697,8 @@ static long wb_writeback(struct bdi_writeback *wb,
654 writeback_inodes_wb(wb, &wbc); 697 writeback_inodes_wb(wb, &wbc);
655 trace_wbc_writeback_written(&wbc, wb->bdi); 698 trace_wbc_writeback_written(&wbc, wb->bdi);
656 699
657 work->nr_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write; 700 work->nr_pages -= write_chunk - wbc.nr_to_write;
658 wrote += MAX_WRITEBACK_PAGES - wbc.nr_to_write; 701 wrote += write_chunk - wbc.nr_to_write;
659 702
660 /* 703 /*
661 * If we consumed everything, see if we have more 704 * If we consumed everything, see if we have more
@@ -670,7 +713,7 @@ static long wb_writeback(struct bdi_writeback *wb,
670 /* 713 /*
671 * Did we write something? Try for more 714 * Did we write something? Try for more
672 */ 715 */
673 if (wbc.nr_to_write < MAX_WRITEBACK_PAGES) 716 if (wbc.nr_to_write < write_chunk)
674 continue; 717 continue;
675 /* 718 /*
676 * Nothing written. Wait for some inode to 719 * Nothing written. Wait for some inode to
@@ -718,6 +761,23 @@ static unsigned long get_nr_dirty_pages(void)
718 get_nr_dirty_inodes(); 761 get_nr_dirty_inodes();
719} 762}
720 763
764static long wb_check_background_flush(struct bdi_writeback *wb)
765{
766 if (over_bground_thresh()) {
767
768 struct wb_writeback_work work = {
769 .nr_pages = LONG_MAX,
770 .sync_mode = WB_SYNC_NONE,
771 .for_background = 1,
772 .range_cyclic = 1,
773 };
774
775 return wb_writeback(wb, &work);
776 }
777
778 return 0;
779}
780
721static long wb_check_old_data_flush(struct bdi_writeback *wb) 781static long wb_check_old_data_flush(struct bdi_writeback *wb)
722{ 782{
723 unsigned long expired; 783 unsigned long expired;
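
This is the flusher-side counterpart of the wake-only API above: the background work description lives on the thread's stack and is executed whenever the dirty threshold is exceeded, so no caller ever has to allocate it. A sketch with the threshold check stubbed out (hypothetical names):

#include <limits.h>
#include <stdbool.h>

struct wb_work {
    long nr_pages;
    bool for_background;
    bool range_cyclic;
};

static bool over_bground_thresh(void)
{
    return true;                    /* stub: the real check reads dirty stats */
}

static long do_writeback(const struct wb_work *w)
{
    /* stub: the real wb_writeback() loops until back under the threshold */
    return w->for_background ? 0 : w->nr_pages;
}

/* Runs after queued work and kupdate-style flushing, as in the hunk:
 * background writeback needs no queued item, just the thread's attention. */
static long check_background_flush(void)
{
    if (over_bground_thresh()) {
        struct wb_work work = {
            .nr_pages       = LONG_MAX,
            .for_background = true,
            .range_cyclic   = true,
        };
        return do_writeback(&work);
    }
    return 0;
}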
@@ -787,6 +847,7 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
787 * Check for periodic writeback, kupdated() style 847 * Check for periodic writeback, kupdated() style
788 */ 848 */
789 wrote += wb_check_old_data_flush(wb); 849 wrote += wb_check_old_data_flush(wb);
850 wrote += wb_check_background_flush(wb);
790 clear_bit(BDI_writeback_running, &wb->bdi->state); 851 clear_bit(BDI_writeback_running, &wb->bdi->state);
791 852
792 return wrote; 853 return wrote;
@@ -873,7 +934,7 @@ void wakeup_flusher_threads(long nr_pages)
873 list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) { 934 list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
874 if (!bdi_has_dirty_io(bdi)) 935 if (!bdi_has_dirty_io(bdi))
875 continue; 936 continue;
876 __bdi_start_writeback(bdi, nr_pages, false, false); 937 __bdi_start_writeback(bdi, nr_pages, false);
877 } 938 }
878 rcu_read_unlock(); 939 rcu_read_unlock();
879} 940}
@@ -1164,7 +1225,7 @@ EXPORT_SYMBOL(writeback_inodes_sb_nr_if_idle);
1164 * @sb: the superblock 1225 * @sb: the superblock
1165 * 1226 *
1166 * This function writes and waits on any dirty inode belonging to this 1227 * This function writes and waits on any dirty inode belonging to this
1167 * super_block. The number of pages synced is returned. 1228 * super_block.
1168 */ 1229 */
1169void sync_inodes_sb(struct super_block *sb) 1230void sync_inodes_sb(struct super_block *sb)
1170{ 1231{
@@ -1242,11 +1303,11 @@ int sync_inode(struct inode *inode, struct writeback_control *wbc)
1242EXPORT_SYMBOL(sync_inode); 1303EXPORT_SYMBOL(sync_inode);
1243 1304
1244/** 1305/**
1245 * sync_inode - write an inode to disk 1306 * sync_inode_metadata - write an inode to disk
1246 * @inode: the inode to sync 1307 * @inode: the inode to sync
1247 * @wait: wait for I/O to complete. 1308 * @wait: wait for I/O to complete.
1248 * 1309 *
1249 * Write an inode to disk and adjust it's dirty state after completion. 1310 * Write an inode to disk and adjust its dirty state after completion.
1250 * 1311 *
1251 * Note: only writes the actual inode, no associated data or other metadata. 1312 * Note: only writes the actual inode, no associated data or other metadata.
1252 */ 1313 */
diff --git a/fs/fs_struct.c b/fs/fs_struct.c
index 68ca487bedb1..78b519c13536 100644
--- a/fs/fs_struct.c
+++ b/fs/fs_struct.c
@@ -4,6 +4,19 @@
4#include <linux/path.h> 4#include <linux/path.h>
5#include <linux/slab.h> 5#include <linux/slab.h>
6#include <linux/fs_struct.h> 6#include <linux/fs_struct.h>
7#include "internal.h"
8
9static inline void path_get_longterm(struct path *path)
10{
11 path_get(path);
12 mnt_make_longterm(path->mnt);
13}
14
15static inline void path_put_longterm(struct path *path)
16{
17 mnt_make_shortterm(path->mnt);
18 path_put(path);
19}
7 20
8/* 21/*
9 * Replace the fs->{rootmnt,root} with {mnt,dentry}. Put the old values. 22 * Replace the fs->{rootmnt,root} with {mnt,dentry}. Put the old values.
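
The new helpers pair the ordinary path reference with a long-term marker on the vfsmount, signalling that fs->root and fs->pwd references outlive any single RCU-walk; teardown reverses the order. A toy model of the paired counters (hypothetical names, no locking):

#include <assert.h>

struct mount {
    int refs;        /* ordinary references (path_get/path_put) */
    int longterm;    /* long-lived holders (mnt_make_longterm/shortterm) */
};

static void path_get_longterm_model(struct mount *m)
{
    m->refs++;           /* path_get() */
    m->longterm++;       /* mnt_make_longterm() */
}

static void path_put_longterm_model(struct mount *m)
{
    m->longterm--;       /* mnt_make_shortterm() first... */
    m->refs--;           /* ...then path_put(), mirroring the helper */
}

int main(void)
{
    struct mount m = { 0, 0 };
    path_get_longterm_model(&m);
    path_put_longterm_model(&m);
    assert(m.refs == 0 && m.longterm == 0);
    return 0;
}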
@@ -17,11 +30,11 @@ void set_fs_root(struct fs_struct *fs, struct path *path)
17 write_seqcount_begin(&fs->seq); 30 write_seqcount_begin(&fs->seq);
18 old_root = fs->root; 31 old_root = fs->root;
19 fs->root = *path; 32 fs->root = *path;
20 path_get_long(path); 33 path_get_longterm(path);
21 write_seqcount_end(&fs->seq); 34 write_seqcount_end(&fs->seq);
22 spin_unlock(&fs->lock); 35 spin_unlock(&fs->lock);
23 if (old_root.dentry) 36 if (old_root.dentry)
24 path_put_long(&old_root); 37 path_put_longterm(&old_root);
25} 38}
26 39
27/* 40/*
@@ -36,12 +49,12 @@ void set_fs_pwd(struct fs_struct *fs, struct path *path)
36 write_seqcount_begin(&fs->seq); 49 write_seqcount_begin(&fs->seq);
37 old_pwd = fs->pwd; 50 old_pwd = fs->pwd;
38 fs->pwd = *path; 51 fs->pwd = *path;
39 path_get_long(path); 52 path_get_longterm(path);
40 write_seqcount_end(&fs->seq); 53 write_seqcount_end(&fs->seq);
41 spin_unlock(&fs->lock); 54 spin_unlock(&fs->lock);
42 55
43 if (old_pwd.dentry) 56 if (old_pwd.dentry)
44 path_put_long(&old_pwd); 57 path_put_longterm(&old_pwd);
45} 58}
46 59
47void chroot_fs_refs(struct path *old_root, struct path *new_root) 60void chroot_fs_refs(struct path *old_root, struct path *new_root)
@@ -59,13 +72,13 @@ void chroot_fs_refs(struct path *old_root, struct path *new_root)
59 write_seqcount_begin(&fs->seq); 72 write_seqcount_begin(&fs->seq);
60 if (fs->root.dentry == old_root->dentry 73 if (fs->root.dentry == old_root->dentry
61 && fs->root.mnt == old_root->mnt) { 74 && fs->root.mnt == old_root->mnt) {
62 path_get_long(new_root); 75 path_get_longterm(new_root);
63 fs->root = *new_root; 76 fs->root = *new_root;
64 count++; 77 count++;
65 } 78 }
66 if (fs->pwd.dentry == old_root->dentry 79 if (fs->pwd.dentry == old_root->dentry
67 && fs->pwd.mnt == old_root->mnt) { 80 && fs->pwd.mnt == old_root->mnt) {
68 path_get_long(new_root); 81 path_get_longterm(new_root);
69 fs->pwd = *new_root; 82 fs->pwd = *new_root;
70 count++; 83 count++;
71 } 84 }
@@ -76,13 +89,13 @@ void chroot_fs_refs(struct path *old_root, struct path *new_root)
76 } while_each_thread(g, p); 89 } while_each_thread(g, p);
77 read_unlock(&tasklist_lock); 90 read_unlock(&tasklist_lock);
78 while (count--) 91 while (count--)
79 path_put_long(old_root); 92 path_put_longterm(old_root);
80} 93}
81 94
82void free_fs_struct(struct fs_struct *fs) 95void free_fs_struct(struct fs_struct *fs)
83{ 96{
84 path_put_long(&fs->root); 97 path_put_longterm(&fs->root);
85 path_put_long(&fs->pwd); 98 path_put_longterm(&fs->pwd);
86 kmem_cache_free(fs_cachep, fs); 99 kmem_cache_free(fs_cachep, fs);
87} 100}
88 101
@@ -118,9 +131,9 @@ struct fs_struct *copy_fs_struct(struct fs_struct *old)
118 131
119 spin_lock(&old->lock); 132 spin_lock(&old->lock);
120 fs->root = old->root; 133 fs->root = old->root;
121 path_get_long(&fs->root); 134 path_get_longterm(&fs->root);
122 fs->pwd = old->pwd; 135 fs->pwd = old->pwd;
123 path_get_long(&fs->pwd); 136 path_get_longterm(&fs->pwd);
124 spin_unlock(&old->lock); 137 spin_unlock(&old->lock);
125 } 138 }
126 return fs; 139 return fs;
diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c
index b9f34eaede09..48a18f184d50 100644
--- a/fs/fscache/operation.c
+++ b/fs/fscache/operation.c
@@ -101,7 +101,7 @@ int fscache_submit_exclusive_op(struct fscache_object *object,
101 object->n_ops++; 101 object->n_ops++;
102 object->n_exclusive++; /* reads and writes must wait */ 102 object->n_exclusive++; /* reads and writes must wait */
103 103
104 if (object->n_ops > 0) { 104 if (object->n_ops > 1) {
105 atomic_inc(&op->usage); 105 atomic_inc(&op->usage);
106 list_add_tail(&op->pend_link, &object->pending_ops); 106 list_add_tail(&op->pend_link, &object->pending_ops);
107 fscache_stat(&fscache_n_op_pend); 107 fscache_stat(&fscache_n_op_pend);
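
A one-character fix with a real consequence: n_ops was incremented two lines earlier, so the submitted op always counts itself, and the old n_ops > 0 test parked every exclusive op on pending_ops even on an idle object. A trivial demonstration of the two tests:

#include <stdio.h>

int main(void)
{
    int n_ops = 0;

    n_ops++;   /* the op being submitted is counted here */

    /* old test: true even when we are the only op -> always pends */
    printf("n_ops > 0: %s\n", n_ops > 0 ? "pend (wrong)" : "run");
    /* fixed test: pend only behind a genuinely earlier op */
    printf("n_ops > 1: %s\n", n_ops > 1 ? "pend" : "run (correct)");
    return 0;
}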
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 6e07696308dc..cf8d28d1fbad 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -251,6 +251,20 @@ static void queue_request(struct fuse_conn *fc, struct fuse_req *req)
251 kill_fasync(&fc->fasync, SIGIO, POLL_IN); 251 kill_fasync(&fc->fasync, SIGIO, POLL_IN);
252} 252}
253 253
254void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget,
255 u64 nodeid, u64 nlookup)
256{
257 forget->forget_one.nodeid = nodeid;
258 forget->forget_one.nlookup = nlookup;
259
260 spin_lock(&fc->lock);
261 fc->forget_list_tail->next = forget;
262 fc->forget_list_tail = forget;
263 wake_up(&fc->waitq);
264 kill_fasync(&fc->fasync, SIGIO, POLL_IN);
265 spin_unlock(&fc->lock);
266}
267
254static void flush_bg_queue(struct fuse_conn *fc) 268static void flush_bg_queue(struct fuse_conn *fc)
255{ 269{
256 while (fc->active_background < fc->max_background && 270 while (fc->active_background < fc->max_background &&
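
FORGET messages no longer consume a full fuse_req; each is a small fuse_forget_link strung onto a singly linked list with a tail pointer, making the enqueue under fc->lock O(1). A userspace sketch of the queue (names hypothetical; locking omitted):

#include <stdint.h>
#include <stddef.h>

struct forget_link {
    uint64_t nodeid;
    uint64_t nlookup;
    struct forget_link *next;
};

struct conn {
    struct forget_link head;       /* dummy head, like fc->forget_list_head */
    struct forget_link *tail;      /* fc->forget_list_tail */
};

static void conn_init(struct conn *c)
{
    c->head.next = NULL;
    c->tail = &c->head;            /* empty list: tail points at the head */
}

/* O(1) enqueue; the kernel does this under fc->lock and then wakes
 * the reader, as in fuse_queue_forget() above. */
static void queue_forget(struct conn *c, struct forget_link *f,
                         uint64_t nodeid, uint64_t nlookup)
{
    f->nodeid = nodeid;
    f->nlookup = nlookup;
    f->next = NULL;
    c->tail->next = f;
    c->tail = f;
}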
@@ -438,12 +452,6 @@ static void fuse_request_send_nowait(struct fuse_conn *fc, struct fuse_req *req)
438 } 452 }
439} 453}
440 454
441void fuse_request_send_noreply(struct fuse_conn *fc, struct fuse_req *req)
442{
443 req->isreply = 0;
444 fuse_request_send_nowait(fc, req);
445}
446
447void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req) 455void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req)
448{ 456{
449 req->isreply = 1; 457 req->isreply = 1;
@@ -896,9 +904,15 @@ static int fuse_copy_args(struct fuse_copy_state *cs, unsigned numargs,
896 return err; 904 return err;
897} 905}
898 906
907static int forget_pending(struct fuse_conn *fc)
908{
909 return fc->forget_list_head.next != NULL;
910}
911
899static int request_pending(struct fuse_conn *fc) 912static int request_pending(struct fuse_conn *fc)
900{ 913{
901 return !list_empty(&fc->pending) || !list_empty(&fc->interrupts); 914 return !list_empty(&fc->pending) || !list_empty(&fc->interrupts) ||
915 forget_pending(fc);
902} 916}
903 917
904/* Wait until a request is available on the pending list */ 918/* Wait until a request is available on the pending list */
@@ -960,6 +974,120 @@ __releases(fc->lock)
960 return err ? err : reqsize; 974 return err ? err : reqsize;
961} 975}
962 976
977static struct fuse_forget_link *dequeue_forget(struct fuse_conn *fc,
978 unsigned max,
979 unsigned *countp)
980{
981 struct fuse_forget_link *head = fc->forget_list_head.next;
982 struct fuse_forget_link **newhead = &head;
983 unsigned count;
984
985 for (count = 0; *newhead != NULL && count < max; count++)
986 newhead = &(*newhead)->next;
987
988 fc->forget_list_head.next = *newhead;
989 *newhead = NULL;
990 if (fc->forget_list_head.next == NULL)
991 fc->forget_list_tail = &fc->forget_list_head;
992
993 if (countp != NULL)
994 *countp = count;
995
996 return head;
997}
998
999static int fuse_read_single_forget(struct fuse_conn *fc,
1000 struct fuse_copy_state *cs,
1001 size_t nbytes)
1002__releases(fc->lock)
1003{
1004 int err;
1005 struct fuse_forget_link *forget = dequeue_forget(fc, 1, NULL);
1006 struct fuse_forget_in arg = {
1007 .nlookup = forget->forget_one.nlookup,
1008 };
1009 struct fuse_in_header ih = {
1010 .opcode = FUSE_FORGET,
1011 .nodeid = forget->forget_one.nodeid,
1012 .unique = fuse_get_unique(fc),
1013 .len = sizeof(ih) + sizeof(arg),
1014 };
1015
1016 spin_unlock(&fc->lock);
1017 kfree(forget);
1018 if (nbytes < ih.len)
1019 return -EINVAL;
1020
1021 err = fuse_copy_one(cs, &ih, sizeof(ih));
1022 if (!err)
1023 err = fuse_copy_one(cs, &arg, sizeof(arg));
1024 fuse_copy_finish(cs);
1025
1026 if (err)
1027 return err;
1028
1029 return ih.len;
1030}
1031
1032static int fuse_read_batch_forget(struct fuse_conn *fc,
1033 struct fuse_copy_state *cs, size_t nbytes)
1034__releases(fc->lock)
1035{
1036 int err;
1037 unsigned max_forgets;
1038 unsigned count;
1039 struct fuse_forget_link *head;
1040 struct fuse_batch_forget_in arg = { .count = 0 };
1041 struct fuse_in_header ih = {
1042 .opcode = FUSE_BATCH_FORGET,
1043 .unique = fuse_get_unique(fc),
1044 .len = sizeof(ih) + sizeof(arg),
1045 };
1046
1047 if (nbytes < ih.len) {
1048 spin_unlock(&fc->lock);
1049 return -EINVAL;
1050 }
1051
1052 max_forgets = (nbytes - ih.len) / sizeof(struct fuse_forget_one);
1053 head = dequeue_forget(fc, max_forgets, &count);
1054 spin_unlock(&fc->lock);
1055
1056 arg.count = count;
1057 ih.len += count * sizeof(struct fuse_forget_one);
1058 err = fuse_copy_one(cs, &ih, sizeof(ih));
1059 if (!err)
1060 err = fuse_copy_one(cs, &arg, sizeof(arg));
1061
1062 while (head) {
1063 struct fuse_forget_link *forget = head;
1064
1065 if (!err) {
1066 err = fuse_copy_one(cs, &forget->forget_one,
1067 sizeof(forget->forget_one));
1068 }
1069 head = forget->next;
1070 kfree(forget);
1071 }
1072
1073 fuse_copy_finish(cs);
1074
1075 if (err)
1076 return err;
1077
1078 return ih.len;
1079}
1080
1081static int fuse_read_forget(struct fuse_conn *fc, struct fuse_copy_state *cs,
1082 size_t nbytes)
1083__releases(fc->lock)
1084{
1085 if (fc->minor < 16 || fc->forget_list_head.next->next == NULL)
1086 return fuse_read_single_forget(fc, cs, nbytes);
1087 else
1088 return fuse_read_batch_forget(fc, cs, nbytes);
1089}
1090
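
dequeue_forget() above is a classic pointer-to-pointer walk: after advancing across at most max nodes, *newhead names the first node left behind, so splitting off the batch is two assignments, and the tail pointer needs repair only when the queue drains. A runnable model:

#include <stdio.h>
#include <stddef.h>

struct link { int v; struct link *next; };

struct queue { struct link head; struct link *tail; };

/* Detach up to max nodes from the front; returns the detached chain. */
static struct link *dequeue_batch(struct queue *q, unsigned max,
                                  unsigned *countp)
{
    struct link *batch = q->head.next;
    struct link **newhead = &batch;
    unsigned count;

    for (count = 0; *newhead != NULL && count < max; count++)
        newhead = &(*newhead)->next;

    q->head.next = *newhead;   /* remainder stays queued */
    *newhead = NULL;           /* terminate the detached chain */
    if (q->head.next == NULL)
        q->tail = &q->head;    /* list drained: reset tail */

    if (countp)
        *countp = count;
    return batch;
}

int main(void)
{
    struct link n[4] = { {0, &n[1]}, {1, &n[2]}, {2, &n[3]}, {3, NULL} };
    struct queue q = { { -1, &n[0] }, &n[3] };
    unsigned got;
    struct link *b = dequeue_batch(&q, 3, &got);

    for (; b; b = b->next)
        printf("%d ", b->v);                       /* prints: 0 1 2 */
    printf("(count=%u, left=%d)\n", got, q.head.next->v);   /* left=3 */
    return 0;
}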
963/* 1091/*
964 * Read a single request into the userspace filesystem's buffer. This 1092 * Read a single request into the userspace filesystem's buffer. This
965 * function waits until a request is available, then removes it from 1093 * function waits until a request is available, then removes it from
@@ -998,6 +1126,14 @@ static ssize_t fuse_dev_do_read(struct fuse_conn *fc, struct file *file,
998 return fuse_read_interrupt(fc, cs, nbytes, req); 1126 return fuse_read_interrupt(fc, cs, nbytes, req);
999 } 1127 }
1000 1128
1129 if (forget_pending(fc)) {
1130 if (list_empty(&fc->pending) || fc->forget_batch-- > 0)
1131 return fuse_read_forget(fc, cs, nbytes);
1132
1133 if (fc->forget_batch <= -8)
1134 fc->forget_batch = 16;
1135 }
1136
1001 req = list_entry(fc->pending.next, struct fuse_req, list); 1137 req = list_entry(fc->pending.next, struct fuse_req, list);
1002 req->state = FUSE_REQ_READING; 1138 req->state = FUSE_REQ_READING;
1003 list_move(&req->list, &fc->io); 1139 list_move(&req->list, &fc->io);
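
fuse_dev_do_read() now interleaves the two queues: forgets drain freely when nothing else is pending; otherwise a counter admits a burst of about 16 forgets, then goes negative to let up to 8 regular requests through before resetting. A compact model of that decision (hypothetical names):

#include <stdbool.h>

/* Decide what to hand to the userspace server next, modelling the
 * fc->forget_batch heuristic: bursts of forgets, but regular requests
 * are never starved for long. */
static bool take_forget(bool forgets_pending, bool requests_pending,
                        int *forget_batch)
{
    if (!forgets_pending)
        return false;
    if (!requests_pending || (*forget_batch)-- > 0)
        return true;                 /* drain a forget */
    if (*forget_batch <= -8)
        *forget_batch = 16;          /* arm the next burst of forgets */
    return false;                    /* serve a regular request */
}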
@@ -1090,7 +1226,7 @@ static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos,
1090 if (!fc) 1226 if (!fc)
1091 return -EPERM; 1227 return -EPERM;
1092 1228
1093 bufs = kmalloc(pipe->buffers * sizeof (struct pipe_buffer), GFP_KERNEL); 1229 bufs = kmalloc(pipe->buffers * sizeof(struct pipe_buffer), GFP_KERNEL);
1094 if (!bufs) 1230 if (!bufs)
1095 return -ENOMEM; 1231 return -ENOMEM;
1096 1232
@@ -1626,7 +1762,7 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe,
1626 if (!fc) 1762 if (!fc)
1627 return -EPERM; 1763 return -EPERM;
1628 1764
1629 bufs = kmalloc(pipe->buffers * sizeof (struct pipe_buffer), GFP_KERNEL); 1765 bufs = kmalloc(pipe->buffers * sizeof(struct pipe_buffer), GFP_KERNEL);
1630 if (!bufs) 1766 if (!bufs)
1631 return -ENOMEM; 1767 return -ENOMEM;
1632 1768
@@ -1770,6 +1906,8 @@ __acquires(fc->lock)
1770 flush_bg_queue(fc); 1906 flush_bg_queue(fc);
1771 end_requests(fc, &fc->pending); 1907 end_requests(fc, &fc->pending);
1772 end_requests(fc, &fc->processing); 1908 end_requests(fc, &fc->processing);
1909 while (forget_pending(fc))
1910 kfree(dequeue_forget(fc, 1, NULL));
1773} 1911}
1774 1912
1775/* 1913/*
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index f738599fd8cd..bfed8447ed80 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -10,9 +10,9 @@
10 10
11#include <linux/pagemap.h> 11#include <linux/pagemap.h>
12#include <linux/file.h> 12#include <linux/file.h>
13#include <linux/gfp.h>
14#include <linux/sched.h> 13#include <linux/sched.h>
15#include <linux/namei.h> 14#include <linux/namei.h>
15#include <linux/slab.h>
16 16
17#if BITS_PER_LONG >= 64 17#if BITS_PER_LONG >= 64
18static inline void fuse_dentry_settime(struct dentry *entry, u64 time) 18static inline void fuse_dentry_settime(struct dentry *entry, u64 time)
@@ -169,7 +169,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd)
169 struct fuse_entry_out outarg; 169 struct fuse_entry_out outarg;
170 struct fuse_conn *fc; 170 struct fuse_conn *fc;
171 struct fuse_req *req; 171 struct fuse_req *req;
172 struct fuse_req *forget_req; 172 struct fuse_forget_link *forget;
173 struct dentry *parent; 173 struct dentry *parent;
174 u64 attr_version; 174 u64 attr_version;
175 175
@@ -182,8 +182,8 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd)
182 if (IS_ERR(req)) 182 if (IS_ERR(req))
183 return 0; 183 return 0;
184 184
185 forget_req = fuse_get_req(fc); 185 forget = fuse_alloc_forget();
186 if (IS_ERR(forget_req)) { 186 if (!forget) {
187 fuse_put_request(fc, req); 187 fuse_put_request(fc, req);
188 return 0; 188 return 0;
189 } 189 }
@@ -203,15 +203,14 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd)
203 if (!err) { 203 if (!err) {
204 struct fuse_inode *fi = get_fuse_inode(inode); 204 struct fuse_inode *fi = get_fuse_inode(inode);
205 if (outarg.nodeid != get_node_id(inode)) { 205 if (outarg.nodeid != get_node_id(inode)) {
206 fuse_send_forget(fc, forget_req, 206 fuse_queue_forget(fc, forget, outarg.nodeid, 1);
207 outarg.nodeid, 1);
208 return 0; 207 return 0;
209 } 208 }
210 spin_lock(&fc->lock); 209 spin_lock(&fc->lock);
211 fi->nlookup++; 210 fi->nlookup++;
212 spin_unlock(&fc->lock); 211 spin_unlock(&fc->lock);
213 } 212 }
214 fuse_put_request(fc, forget_req); 213 kfree(forget);
215 if (err || (outarg.attr.mode ^ inode->i_mode) & S_IFMT) 214 if (err || (outarg.attr.mode ^ inode->i_mode) & S_IFMT)
216 return 0; 215 return 0;
217 216
@@ -263,7 +262,7 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, struct qstr *name,
263{ 262{
264 struct fuse_conn *fc = get_fuse_conn_super(sb); 263 struct fuse_conn *fc = get_fuse_conn_super(sb);
265 struct fuse_req *req; 264 struct fuse_req *req;
266 struct fuse_req *forget_req; 265 struct fuse_forget_link *forget;
267 u64 attr_version; 266 u64 attr_version;
268 int err; 267 int err;
269 268
@@ -277,9 +276,9 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, struct qstr *name,
277 if (IS_ERR(req)) 276 if (IS_ERR(req))
278 goto out; 277 goto out;
279 278
280 forget_req = fuse_get_req(fc); 279 forget = fuse_alloc_forget();
281 err = PTR_ERR(forget_req); 280 err = -ENOMEM;
282 if (IS_ERR(forget_req)) { 281 if (!forget) {
283 fuse_put_request(fc, req); 282 fuse_put_request(fc, req);
284 goto out; 283 goto out;
285 } 284 }
@@ -305,13 +304,13 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, struct qstr *name,
305 attr_version); 304 attr_version);
306 err = -ENOMEM; 305 err = -ENOMEM;
307 if (!*inode) { 306 if (!*inode) {
308 fuse_send_forget(fc, forget_req, outarg->nodeid, 1); 307 fuse_queue_forget(fc, forget, outarg->nodeid, 1);
309 goto out; 308 goto out;
310 } 309 }
311 err = 0; 310 err = 0;
312 311
313 out_put_forget: 312 out_put_forget:
314 fuse_put_request(fc, forget_req); 313 kfree(forget);
315 out: 314 out:
316 return err; 315 return err;
317} 316}
@@ -351,7 +350,6 @@ static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry,
351 } 350 }
352 351
353 entry = newent ? newent : entry; 352 entry = newent ? newent : entry;
354 d_set_d_op(entry, &fuse_dentry_operations);
355 if (outarg_valid) 353 if (outarg_valid)
356 fuse_change_entry_timeout(entry, &outarg); 354 fuse_change_entry_timeout(entry, &outarg);
357 else 355 else
@@ -378,7 +376,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,
378 struct inode *inode; 376 struct inode *inode;
379 struct fuse_conn *fc = get_fuse_conn(dir); 377 struct fuse_conn *fc = get_fuse_conn(dir);
380 struct fuse_req *req; 378 struct fuse_req *req;
381 struct fuse_req *forget_req; 379 struct fuse_forget_link *forget;
382 struct fuse_create_in inarg; 380 struct fuse_create_in inarg;
383 struct fuse_open_out outopen; 381 struct fuse_open_out outopen;
384 struct fuse_entry_out outentry; 382 struct fuse_entry_out outentry;
@@ -392,9 +390,9 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,
392 if (flags & O_DIRECT) 390 if (flags & O_DIRECT)
393 return -EINVAL; 391 return -EINVAL;
394 392
395 forget_req = fuse_get_req(fc); 393 forget = fuse_alloc_forget();
396 if (IS_ERR(forget_req)) 394 if (!forget)
397 return PTR_ERR(forget_req); 395 return -ENOMEM;
398 396
399 req = fuse_get_req(fc); 397 req = fuse_get_req(fc);
400 err = PTR_ERR(req); 398 err = PTR_ERR(req);
@@ -452,10 +450,10 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,
452 if (!inode) { 450 if (!inode) {
453 flags &= ~(O_CREAT | O_EXCL | O_TRUNC); 451 flags &= ~(O_CREAT | O_EXCL | O_TRUNC);
454 fuse_sync_release(ff, flags); 452 fuse_sync_release(ff, flags);
455 fuse_send_forget(fc, forget_req, outentry.nodeid, 1); 453 fuse_queue_forget(fc, forget, outentry.nodeid, 1);
456 return -ENOMEM; 454 return -ENOMEM;
457 } 455 }
458 fuse_put_request(fc, forget_req); 456 kfree(forget);
459 d_instantiate(entry, inode); 457 d_instantiate(entry, inode);
460 fuse_change_entry_timeout(entry, &outentry); 458 fuse_change_entry_timeout(entry, &outentry);
461 fuse_invalidate_attr(dir); 459 fuse_invalidate_attr(dir);
@@ -473,7 +471,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,
473 out_put_request: 471 out_put_request:
474 fuse_put_request(fc, req); 472 fuse_put_request(fc, req);
475 out_put_forget_req: 473 out_put_forget_req:
476 fuse_put_request(fc, forget_req); 474 kfree(forget);
477 return err; 475 return err;
478} 476}
479 477
@@ -487,12 +485,12 @@ static int create_new_entry(struct fuse_conn *fc, struct fuse_req *req,
487 struct fuse_entry_out outarg; 485 struct fuse_entry_out outarg;
488 struct inode *inode; 486 struct inode *inode;
489 int err; 487 int err;
490 struct fuse_req *forget_req; 488 struct fuse_forget_link *forget;
491 489
492 forget_req = fuse_get_req(fc); 490 forget = fuse_alloc_forget();
493 if (IS_ERR(forget_req)) { 491 if (!forget) {
494 fuse_put_request(fc, req); 492 fuse_put_request(fc, req);
495 return PTR_ERR(forget_req); 493 return -ENOMEM;
496 } 494 }
497 495
498 memset(&outarg, 0, sizeof(outarg)); 496 memset(&outarg, 0, sizeof(outarg));
@@ -519,10 +517,10 @@ static int create_new_entry(struct fuse_conn *fc, struct fuse_req *req,
519 inode = fuse_iget(dir->i_sb, outarg.nodeid, outarg.generation, 517 inode = fuse_iget(dir->i_sb, outarg.nodeid, outarg.generation,
520 &outarg.attr, entry_attr_timeout(&outarg), 0); 518 &outarg.attr, entry_attr_timeout(&outarg), 0);
521 if (!inode) { 519 if (!inode) {
522 fuse_send_forget(fc, forget_req, outarg.nodeid, 1); 520 fuse_queue_forget(fc, forget, outarg.nodeid, 1);
523 return -ENOMEM; 521 return -ENOMEM;
524 } 522 }
525 fuse_put_request(fc, forget_req); 523 kfree(forget);
526 524
527 if (S_ISDIR(inode->i_mode)) { 525 if (S_ISDIR(inode->i_mode)) {
528 struct dentry *alias; 526 struct dentry *alias;
@@ -545,7 +543,7 @@ static int create_new_entry(struct fuse_conn *fc, struct fuse_req *req,
545 return 0; 543 return 0;
546 544
547 out_put_forget_req: 545 out_put_forget_req:
548 fuse_put_request(fc, forget_req); 546 kfree(forget);
549 return err; 547 return err;
550} 548}
551 549
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 8b984a2cebbd..95da1bc1c826 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1634,9 +1634,9 @@ static int fuse_ioctl_copy_user(struct page **pages, struct iovec *iov,
1634 * and 64bit. Fortunately we can determine which structure the server 1634 * and 64bit. Fortunately we can determine which structure the server
1635 * used from the size of the reply. 1635 * used from the size of the reply.
1636 */ 1636 */
1637static int fuse_copy_ioctl_iovec(struct iovec *dst, void *src, 1637static int fuse_copy_ioctl_iovec_old(struct iovec *dst, void *src,
1638 size_t transferred, unsigned count, 1638 size_t transferred, unsigned count,
1639 bool is_compat) 1639 bool is_compat)
1640{ 1640{
1641#ifdef CONFIG_COMPAT 1641#ifdef CONFIG_COMPAT
1642 if (count * sizeof(struct compat_iovec) == transferred) { 1642 if (count * sizeof(struct compat_iovec) == transferred) {
@@ -1680,6 +1680,42 @@ static int fuse_verify_ioctl_iov(struct iovec *iov, size_t count)
1680 return 0; 1680 return 0;
1681} 1681}
1682 1682
1683static int fuse_copy_ioctl_iovec(struct fuse_conn *fc, struct iovec *dst,
1684 void *src, size_t transferred, unsigned count,
1685 bool is_compat)
1686{
1687 unsigned i;
1688 struct fuse_ioctl_iovec *fiov = src;
1689
1690 if (fc->minor < 16) {
1691 return fuse_copy_ioctl_iovec_old(dst, src, transferred,
1692 count, is_compat);
1693 }
1694
1695 if (count * sizeof(struct fuse_ioctl_iovec) != transferred)
1696 return -EIO;
1697
1698 for (i = 0; i < count; i++) {
1699 /* Did the server supply an inappropriate value? */
1700 if (fiov[i].base != (unsigned long) fiov[i].base ||
1701 fiov[i].len != (unsigned long) fiov[i].len)
1702 return -EIO;
1703
1704 dst[i].iov_base = (void __user *) (unsigned long) fiov[i].base;
1705 dst[i].iov_len = (size_t) fiov[i].len;
1706
1707#ifdef CONFIG_COMPAT
1708 if (is_compat &&
1709 (ptr_to_compat(dst[i].iov_base) != fiov[i].base ||
1710 (compat_size_t) dst[i].iov_len != fiov[i].len))
1711 return -EIO;
1712#endif
1713 }
1714
1715 return 0;
1716}
1717
1718
1683/* 1719/*
1684 * For ioctls, there is no generic way to determine how much memory 1720 * For ioctls, there is no generic way to determine how much memory
1685 * needs to be read and/or written. Furthermore, ioctls are allowed 1721 * needs to be read and/or written. Furthermore, ioctls are allowed
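
From protocol minor 16 on, the server replies with fixed-width struct fuse_ioctl_iovec entries (64-bit base and len) regardless of its word size, and the kernel verifies that each value round-trips through the native iovec types (plus compat_iovec for 32-bit callers) before trusting it. A sketch of the round-trip check:

#include <stdint.h>
#include <stddef.h>

struct wire_iovec {                  /* struct fuse_ioctl_iovec: always 64-bit */
    uint64_t base;
    uint64_t len;
};

/* Reject entries this kernel cannot represent: the value must survive
 * conversion to the native pointer/size types unchanged (only ever
 * fails on 32-bit, exactly like the fiov[i] checks above). */
static int check_entry(const struct wire_iovec *w,
                       void **base_out, size_t *len_out)
{
    if (w->base != (uintptr_t)w->base || w->len != (size_t)w->len)
        return -1;                   /* -EIO in the kernel */
    *base_out = (void *)(uintptr_t)w->base;
    *len_out = (size_t)w->len;
    return 0;
}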
@@ -1740,18 +1776,25 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
1740 struct fuse_ioctl_out outarg; 1776 struct fuse_ioctl_out outarg;
1741 struct fuse_req *req = NULL; 1777 struct fuse_req *req = NULL;
1742 struct page **pages = NULL; 1778 struct page **pages = NULL;
1743 struct page *iov_page = NULL; 1779 struct iovec *iov_page = NULL;
1744 struct iovec *in_iov = NULL, *out_iov = NULL; 1780 struct iovec *in_iov = NULL, *out_iov = NULL;
1745 unsigned int in_iovs = 0, out_iovs = 0, num_pages = 0, max_pages; 1781 unsigned int in_iovs = 0, out_iovs = 0, num_pages = 0, max_pages;
1746 size_t in_size, out_size, transferred; 1782 size_t in_size, out_size, transferred;
1747 int err; 1783 int err;
1748 1784
1785#if BITS_PER_LONG == 32
1786 inarg.flags |= FUSE_IOCTL_32BIT;
1787#else
1788 if (flags & FUSE_IOCTL_COMPAT)
1789 inarg.flags |= FUSE_IOCTL_32BIT;
1790#endif
1791
1749 /* assume all the iovs returned by client always fits in a page */ 1792 /* assume all the iovs returned by client always fits in a page */
1750 BUILD_BUG_ON(sizeof(struct iovec) * FUSE_IOCTL_MAX_IOV > PAGE_SIZE); 1793 BUILD_BUG_ON(sizeof(struct fuse_ioctl_iovec) * FUSE_IOCTL_MAX_IOV > PAGE_SIZE);
1751 1794
1752 err = -ENOMEM; 1795 err = -ENOMEM;
1753 pages = kzalloc(sizeof(pages[0]) * FUSE_MAX_PAGES_PER_REQ, GFP_KERNEL); 1796 pages = kzalloc(sizeof(pages[0]) * FUSE_MAX_PAGES_PER_REQ, GFP_KERNEL);
1754 iov_page = alloc_page(GFP_KERNEL); 1797 iov_page = (struct iovec *) __get_free_page(GFP_KERNEL);
1755 if (!pages || !iov_page) 1798 if (!pages || !iov_page)
1756 goto out; 1799 goto out;
1757 1800
@@ -1760,7 +1803,7 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
1760 * RETRY from server is not allowed. 1803 * RETRY from server is not allowed.
1761 */ 1804 */
1762 if (!(flags & FUSE_IOCTL_UNRESTRICTED)) { 1805 if (!(flags & FUSE_IOCTL_UNRESTRICTED)) {
1763 struct iovec *iov = page_address(iov_page); 1806 struct iovec *iov = iov_page;
1764 1807
1765 iov->iov_base = (void __user *)arg; 1808 iov->iov_base = (void __user *)arg;
1766 iov->iov_len = _IOC_SIZE(cmd); 1809 iov->iov_len = _IOC_SIZE(cmd);
@@ -1841,7 +1884,7 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
1841 1884
1842 /* did it ask for retry? */ 1885 /* did it ask for retry? */
1843 if (outarg.flags & FUSE_IOCTL_RETRY) { 1886 if (outarg.flags & FUSE_IOCTL_RETRY) {
1844 char *vaddr; 1887 void *vaddr;
1845 1888
1846 /* no retry if in restricted mode */ 1889 /* no retry if in restricted mode */
1847 err = -EIO; 1890 err = -EIO;
@@ -1862,14 +1905,14 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
1862 goto out; 1905 goto out;
1863 1906
1864 vaddr = kmap_atomic(pages[0], KM_USER0); 1907 vaddr = kmap_atomic(pages[0], KM_USER0);
1865 err = fuse_copy_ioctl_iovec(page_address(iov_page), vaddr, 1908 err = fuse_copy_ioctl_iovec(fc, iov_page, vaddr,
1866 transferred, in_iovs + out_iovs, 1909 transferred, in_iovs + out_iovs,
1867 (flags & FUSE_IOCTL_COMPAT) != 0); 1910 (flags & FUSE_IOCTL_COMPAT) != 0);
1868 kunmap_atomic(vaddr, KM_USER0); 1911 kunmap_atomic(vaddr, KM_USER0);
1869 if (err) 1912 if (err)
1870 goto out; 1913 goto out;
1871 1914
1872 in_iov = page_address(iov_page); 1915 in_iov = iov_page;
1873 out_iov = in_iov + in_iovs; 1916 out_iov = in_iov + in_iovs;
1874 1917
1875 err = fuse_verify_ioctl_iov(in_iov, in_iovs); 1918 err = fuse_verify_ioctl_iov(in_iov, in_iovs);
@@ -1891,8 +1934,7 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
1891 out: 1934 out:
1892 if (req) 1935 if (req)
1893 fuse_put_request(fc, req); 1936 fuse_put_request(fc, req);
1894 if (iov_page) 1937 free_page((unsigned long) iov_page);
1895 __free_page(iov_page);
1896 while (num_pages) 1938 while (num_pages)
1897 __free_page(pages[--num_pages]); 1939 __free_page(pages[--num_pages]);
1898 kfree(pages); 1940 kfree(pages);
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 57d4a3a0f102..ae5744a2f9e9 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -53,6 +53,12 @@ extern struct mutex fuse_mutex;
53extern unsigned max_user_bgreq; 53extern unsigned max_user_bgreq;
54extern unsigned max_user_congthresh; 54extern unsigned max_user_congthresh;
55 55
56/* One forget request */
57struct fuse_forget_link {
58 struct fuse_forget_one forget_one;
59 struct fuse_forget_link *next;
60};
61
56/** FUSE inode */ 62/** FUSE inode */
57struct fuse_inode { 63struct fuse_inode {
58 /** Inode data */ 64 /** Inode data */
@@ -66,7 +72,7 @@ struct fuse_inode {
66 u64 nlookup; 72 u64 nlookup;
67 73
68 /** The request used for sending the FORGET message */ 74 /** The request used for sending the FORGET message */
69 struct fuse_req *forget_req; 75 struct fuse_forget_link *forget;
70 76
71 /** Time in jiffies until the file attributes are valid */ 77 /** Time in jiffies until the file attributes are valid */
72 u64 i_time; 78 u64 i_time;
@@ -255,7 +261,6 @@ struct fuse_req {
255 261
256 /** Data for asynchronous requests */ 262 /** Data for asynchronous requests */
257 union { 263 union {
258 struct fuse_forget_in forget_in;
259 struct { 264 struct {
260 struct fuse_release_in in; 265 struct fuse_release_in in;
261 struct path path; 266 struct path path;
@@ -369,6 +374,13 @@ struct fuse_conn {
369 /** Pending interrupts */ 374 /** Pending interrupts */
370 struct list_head interrupts; 375 struct list_head interrupts;
371 376
377 /** Queue of pending forgets */
378 struct fuse_forget_link forget_list_head;
379 struct fuse_forget_link *forget_list_tail;
380
381 /** Batching of FORGET requests (positive indicates FORGET batch) */
382 int forget_batch;
383
372 /** Flag indicating if connection is blocked. This will be 384 /** Flag indicating if connection is blocked. This will be
373 the case before the INIT reply is received, and if there 385 the case before the INIT reply is received, and if there
 374 are too many outstanding background requests */ 386
@@ -543,8 +555,10 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, struct qstr *name,
543/** 555/**
544 * Send FORGET command 556 * Send FORGET command
545 */ 557 */
546void fuse_send_forget(struct fuse_conn *fc, struct fuse_req *req, 558void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget,
547 u64 nodeid, u64 nlookup); 559 u64 nodeid, u64 nlookup);
560
561struct fuse_forget_link *fuse_alloc_forget(void);
548 562
549/** 563/**
550 * Initialize READ or READDIR request 564 * Initialize READ or READDIR request
@@ -656,11 +670,6 @@ void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req);
656void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req); 670void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req);
657 671
658/** 672/**
659 * Send a request with no reply
660 */
661void fuse_request_send_noreply(struct fuse_conn *fc, struct fuse_req *req);
662
663/**
664 * Send a request in the background 673 * Send a request in the background
665 */ 674 */
666void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req); 675void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req);
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index a8b31da19b93..9e3f68cc1bd1 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -71,6 +71,11 @@ struct fuse_mount_data {
71 unsigned blksize; 71 unsigned blksize;
72}; 72};
73 73
74struct fuse_forget_link *fuse_alloc_forget()
75{
76 return kzalloc(sizeof(struct fuse_forget_link), GFP_KERNEL);
77}
78
74static struct inode *fuse_alloc_inode(struct super_block *sb) 79static struct inode *fuse_alloc_inode(struct super_block *sb)
75{ 80{
76 struct inode *inode; 81 struct inode *inode;
@@ -90,8 +95,8 @@ static struct inode *fuse_alloc_inode(struct super_block *sb)
90 INIT_LIST_HEAD(&fi->queued_writes); 95 INIT_LIST_HEAD(&fi->queued_writes);
91 INIT_LIST_HEAD(&fi->writepages); 96 INIT_LIST_HEAD(&fi->writepages);
92 init_waitqueue_head(&fi->page_waitq); 97 init_waitqueue_head(&fi->page_waitq);
93 fi->forget_req = fuse_request_alloc(); 98 fi->forget = fuse_alloc_forget();
94 if (!fi->forget_req) { 99 if (!fi->forget) {
95 kmem_cache_free(fuse_inode_cachep, inode); 100 kmem_cache_free(fuse_inode_cachep, inode);
96 return NULL; 101 return NULL;
97 } 102 }
@@ -111,24 +116,10 @@ static void fuse_destroy_inode(struct inode *inode)
111 struct fuse_inode *fi = get_fuse_inode(inode); 116 struct fuse_inode *fi = get_fuse_inode(inode);
112 BUG_ON(!list_empty(&fi->write_files)); 117 BUG_ON(!list_empty(&fi->write_files));
113 BUG_ON(!list_empty(&fi->queued_writes)); 118 BUG_ON(!list_empty(&fi->queued_writes));
114 if (fi->forget_req) 119 kfree(fi->forget);
115 fuse_request_free(fi->forget_req);
116 call_rcu(&inode->i_rcu, fuse_i_callback); 120 call_rcu(&inode->i_rcu, fuse_i_callback);
117} 121}
118 122
119void fuse_send_forget(struct fuse_conn *fc, struct fuse_req *req,
120 u64 nodeid, u64 nlookup)
121{
122 struct fuse_forget_in *inarg = &req->misc.forget_in;
123 inarg->nlookup = nlookup;
124 req->in.h.opcode = FUSE_FORGET;
125 req->in.h.nodeid = nodeid;
126 req->in.numargs = 1;
127 req->in.args[0].size = sizeof(struct fuse_forget_in);
128 req->in.args[0].value = inarg;
129 fuse_request_send_noreply(fc, req);
130}
131
132static void fuse_evict_inode(struct inode *inode) 123static void fuse_evict_inode(struct inode *inode)
133{ 124{
134 truncate_inode_pages(&inode->i_data, 0); 125 truncate_inode_pages(&inode->i_data, 0);
@@ -136,8 +127,8 @@ static void fuse_evict_inode(struct inode *inode)
136 if (inode->i_sb->s_flags & MS_ACTIVE) { 127 if (inode->i_sb->s_flags & MS_ACTIVE) {
137 struct fuse_conn *fc = get_fuse_conn(inode); 128 struct fuse_conn *fc = get_fuse_conn(inode);
138 struct fuse_inode *fi = get_fuse_inode(inode); 129 struct fuse_inode *fi = get_fuse_inode(inode);
139 fuse_send_forget(fc, fi->forget_req, fi->nodeid, fi->nlookup); 130 fuse_queue_forget(fc, fi->forget, fi->nodeid, fi->nlookup);
140 fi->forget_req = NULL; 131 fi->forget = NULL;
141 } 132 }
142} 133}
143 134
@@ -541,6 +532,7 @@ void fuse_conn_init(struct fuse_conn *fc)
541 INIT_LIST_HEAD(&fc->interrupts); 532 INIT_LIST_HEAD(&fc->interrupts);
542 INIT_LIST_HEAD(&fc->bg_queue); 533 INIT_LIST_HEAD(&fc->bg_queue);
543 INIT_LIST_HEAD(&fc->entry); 534 INIT_LIST_HEAD(&fc->entry);
535 fc->forget_list_tail = &fc->forget_list_head;
544 atomic_set(&fc->num_waiting, 0); 536 atomic_set(&fc->num_waiting, 0);
545 fc->max_background = FUSE_DEFAULT_MAX_BACKGROUND; 537 fc->max_background = FUSE_DEFAULT_MAX_BACKGROUND;
546 fc->congestion_threshold = FUSE_DEFAULT_CONGESTION_THRESHOLD; 538 fc->congestion_threshold = FUSE_DEFAULT_CONGESTION_THRESHOLD;
@@ -625,10 +617,8 @@ static struct dentry *fuse_get_dentry(struct super_block *sb,
625 goto out_iput; 617 goto out_iput;
626 618
627 entry = d_obtain_alias(inode); 619 entry = d_obtain_alias(inode);
628 if (!IS_ERR(entry) && get_node_id(inode) != FUSE_ROOT_ID) { 620 if (!IS_ERR(entry) && get_node_id(inode) != FUSE_ROOT_ID)
629 d_set_d_op(entry, &fuse_dentry_operations);
630 fuse_invalidate_entry_cache(entry); 621 fuse_invalidate_entry_cache(entry);
631 }
632 622
633 return entry; 623 return entry;
634 624
@@ -727,10 +717,8 @@ static struct dentry *fuse_get_parent(struct dentry *child)
727 } 717 }
728 718
729 parent = d_obtain_alias(inode); 719 parent = d_obtain_alias(inode);
730 if (!IS_ERR(parent) && get_node_id(inode) != FUSE_ROOT_ID) { 720 if (!IS_ERR(parent) && get_node_id(inode) != FUSE_ROOT_ID)
731 d_set_d_op(parent, &fuse_dentry_operations);
732 fuse_invalidate_entry_cache(parent); 721 fuse_invalidate_entry_cache(parent);
733 }
734 722
735 return parent; 723 return parent;
736} 724}
@@ -997,6 +985,8 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
997 iput(root); 985 iput(root);
998 goto err_put_conn; 986 goto err_put_conn;
999 } 987 }
988 /* only now - we want root dentry with NULL ->d_op */
989 sb->s_d_op = &fuse_dentry_operations;
1000 990
1001 init_req = fuse_request_alloc(); 991 init_req = fuse_request_alloc();
1002 if (!init_req) 992 if (!init_req)
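
sb->s_d_op only applies to dentries allocated after it is set, which is exactly what the new comment relies on: the root is created first and keeps a NULL ->d_op, then fuse_dentry_operations becomes the default for everything looked up later. A tiny model of that ordering (hypothetical names):

#include <stdio.h>

struct sb { const void *s_d_op; };
struct dentry { const void *d_op; };

/* d_alloc(): new dentries copy the superblock default at creation time */
static struct dentry make_dentry(const struct sb *sb)
{
    struct dentry d = { sb->s_d_op };
    return d;
}

static const int fuse_dops;          /* stands in for fuse_dentry_operations */

int main(void)
{
    struct sb sb = { NULL };
    struct dentry root = make_dentry(&sb);  /* before: d_op stays NULL */
    sb.s_d_op = &fuse_dops;                 /* install the default afterwards */
    struct dentry child = make_dentry(&sb); /* inherits fuse_dops */

    printf("root d_op %s, child d_op %s\n",
           root.d_op ? "set" : "NULL", child.d_op ? "set" : "NULL");
    return 0;
}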
diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c
index 97012ecff560..9023db8184f9 100644
--- a/fs/gfs2/export.c
+++ b/fs/gfs2/export.c
@@ -126,12 +126,7 @@ static int gfs2_get_name(struct dentry *parent, char *name,
126 126
127static struct dentry *gfs2_get_parent(struct dentry *child) 127static struct dentry *gfs2_get_parent(struct dentry *child)
128{ 128{
129 struct dentry *dentry; 129 return d_obtain_alias(gfs2_lookupi(child->d_inode, &gfs2_qdotdot, 1));
130
131 dentry = d_obtain_alias(gfs2_lookupi(child->d_inode, &gfs2_qdotdot, 1));
132 if (!IS_ERR(dentry))
133 d_set_d_op(dentry, &gfs2_dops);
134 return dentry;
135} 130}
136 131
137static struct dentry *gfs2_get_dentry(struct super_block *sb, 132static struct dentry *gfs2_get_dentry(struct super_block *sb,
@@ -139,7 +134,6 @@ static struct dentry *gfs2_get_dentry(struct super_block *sb,
139{ 134{
140 struct gfs2_sbd *sdp = sb->s_fs_info; 135 struct gfs2_sbd *sdp = sb->s_fs_info;
141 struct inode *inode; 136 struct inode *inode;
142 struct dentry *dentry;
143 137
144 inode = gfs2_ilookup(sb, inum->no_addr); 138 inode = gfs2_ilookup(sb, inum->no_addr);
145 if (inode) { 139 if (inode) {
@@ -156,10 +150,7 @@ static struct dentry *gfs2_get_dentry(struct super_block *sb,
156 return ERR_CAST(inode); 150 return ERR_CAST(inode);
157 151
158out_inode: 152out_inode:
159 dentry = d_obtain_alias(inode); 153 return d_obtain_alias(inode);
160 if (!IS_ERR(dentry))
161 d_set_d_op(dentry, &gfs2_dops);
162 return dentry;
163} 154}
164 155
165static struct dentry *gfs2_fh_to_dentry(struct super_block *sb, struct fid *fid, 156static struct dentry *gfs2_fh_to_dentry(struct super_block *sb, struct fid *fid,
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index fca6689e12e6..7cfdcb913363 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -19,6 +19,8 @@
19#include <linux/fs.h> 19#include <linux/fs.h>
20#include <linux/gfs2_ondisk.h> 20#include <linux/gfs2_ondisk.h>
21#include <linux/ext2_fs.h> 21#include <linux/ext2_fs.h>
22#include <linux/falloc.h>
23#include <linux/swap.h>
22#include <linux/crc32.h> 24#include <linux/crc32.h>
23#include <linux/writeback.h> 25#include <linux/writeback.h>
24#include <asm/uaccess.h> 26#include <asm/uaccess.h>
@@ -610,6 +612,260 @@ static ssize_t gfs2_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
610 return generic_file_aio_write(iocb, iov, nr_segs, pos); 612 return generic_file_aio_write(iocb, iov, nr_segs, pos);
611} 613}
612 614
615static void empty_write_end(struct page *page, unsigned from,
616 unsigned to)
617{
618 struct gfs2_inode *ip = GFS2_I(page->mapping->host);
619
620 page_zero_new_buffers(page, from, to);
621 flush_dcache_page(page);
622 mark_page_accessed(page);
623
624 if (!gfs2_is_writeback(ip))
625 gfs2_page_add_databufs(ip, page, from, to);
626
627 block_commit_write(page, from, to);
628}
629
630static int write_empty_blocks(struct page *page, unsigned from, unsigned to)
631{
632 unsigned start, end, next;
633 struct buffer_head *bh, *head;
634 int error;
635
636 if (!page_has_buffers(page)) {
637 error = __block_write_begin(page, from, to - from, gfs2_block_map);
638 if (unlikely(error))
639 return error;
640
641 empty_write_end(page, from, to);
642 return 0;
643 }
644
645 bh = head = page_buffers(page);
646 next = end = 0;
647 while (next < from) {
648 next += bh->b_size;
649 bh = bh->b_this_page;
650 }
651 start = next;
652 do {
653 next += bh->b_size;
654 if (buffer_mapped(bh)) {
655 if (end) {
656 error = __block_write_begin(page, start, end - start,
657 gfs2_block_map);
658 if (unlikely(error))
659 return error;
660 empty_write_end(page, start, end);
661 end = 0;
662 }
663 start = next;
664 }
665 else
666 end = next;
667 bh = bh->b_this_page;
668 } while (next < to);
669
670 if (end) {
671 error = __block_write_begin(page, start, end - start, gfs2_block_map);
672 if (unlikely(error))
673 return error;
674 empty_write_end(page, start, end);
675 }
676
677 return 0;
678}
679
680static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len,
681 int mode)
682{
683 struct gfs2_inode *ip = GFS2_I(inode);
684 struct buffer_head *dibh;
685 int error;
686 u64 start = offset >> PAGE_CACHE_SHIFT;
687 unsigned int start_offset = offset & ~PAGE_CACHE_MASK;
688 u64 end = (offset + len - 1) >> PAGE_CACHE_SHIFT;
689 pgoff_t curr;
690 struct page *page;
691 unsigned int end_offset = (offset + len) & ~PAGE_CACHE_MASK;
692 unsigned int from, to;
693
694 if (!end_offset)
695 end_offset = PAGE_CACHE_SIZE;
696
697 error = gfs2_meta_inode_buffer(ip, &dibh);
698 if (unlikely(error))
699 goto out;
700
701 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
702
703 if (gfs2_is_stuffed(ip)) {
704 error = gfs2_unstuff_dinode(ip, NULL);
705 if (unlikely(error))
706 goto out;
707 }
708
709 curr = start;
710 offset = start << PAGE_CACHE_SHIFT;
711 from = start_offset;
712 to = PAGE_CACHE_SIZE;
713 while (curr <= end) {
714 page = grab_cache_page_write_begin(inode->i_mapping, curr,
715 AOP_FLAG_NOFS);
716 if (unlikely(!page)) {
717 error = -ENOMEM;
718 goto out;
719 }
720
721 if (curr == end)
722 to = end_offset;
723 error = write_empty_blocks(page, from, to);
724 if (!error && offset + to > inode->i_size &&
725 !(mode & FALLOC_FL_KEEP_SIZE)) {
726 i_size_write(inode, offset + to);
727 }
728 unlock_page(page);
729 page_cache_release(page);
730 if (error)
731 goto out;
732 curr++;
733 offset += PAGE_CACHE_SIZE;
734 from = 0;
735 }
736
737 gfs2_dinode_out(ip, dibh->b_data);
738 mark_inode_dirty(inode);
739
740 brelse(dibh);
741
742out:
743 return error;
744}
745
746static void calc_max_reserv(struct gfs2_inode *ip, loff_t max, loff_t *len,
747 unsigned int *data_blocks, unsigned int *ind_blocks)
748{
749 const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
750 unsigned int max_blocks = ip->i_alloc->al_rgd->rd_free_clone;
751 unsigned int tmp, max_data = max_blocks - 3 * (sdp->sd_max_height - 1);
752
753 for (tmp = max_data; tmp > sdp->sd_diptrs;) {
754 tmp = DIV_ROUND_UP(tmp, sdp->sd_inptrs);
755 max_data -= tmp;
756 }
 757 /* This calculation isn't the exact reverse of gfs2_write_calc_reserv,
758 so it might end up with fewer data blocks */
759 if (max_data <= *data_blocks)
760 return;
761 *data_blocks = max_data;
762 *ind_blocks = max_blocks - max_data;
763 *len = ((loff_t)max_data - 3) << sdp->sd_sb.sb_bsize_shift;
764 if (*len > max) {
765 *len = max;
766 gfs2_write_calc_reserv(ip, max, data_blocks, ind_blocks);
767 }
768}
769
770static long gfs2_fallocate(struct file *file, int mode, loff_t offset,
771 loff_t len)
772{
773 struct inode *inode = file->f_path.dentry->d_inode;
774 struct gfs2_sbd *sdp = GFS2_SB(inode);
775 struct gfs2_inode *ip = GFS2_I(inode);
776 unsigned int data_blocks = 0, ind_blocks = 0, rblocks;
777 loff_t bytes, max_bytes;
778 struct gfs2_alloc *al;
779 int error;
780 loff_t next = (offset + len - 1) >> sdp->sd_sb.sb_bsize_shift;
781 next = (next + 1) << sdp->sd_sb.sb_bsize_shift;
782
783 /* We only support the FALLOC_FL_KEEP_SIZE mode */
784 if (mode & ~FALLOC_FL_KEEP_SIZE)
785 return -EOPNOTSUPP;
786
787 offset = (offset >> sdp->sd_sb.sb_bsize_shift) <<
788 sdp->sd_sb.sb_bsize_shift;
789
790 len = next - offset;
791 bytes = sdp->sd_max_rg_data * sdp->sd_sb.sb_bsize / 2;
792 if (!bytes)
793 bytes = UINT_MAX;
794
795 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh);
796 error = gfs2_glock_nq(&ip->i_gh);
797 if (unlikely(error))
798 goto out_uninit;
799
800 if (!gfs2_write_alloc_required(ip, offset, len))
801 goto out_unlock;
802
803 while (len > 0) {
804 if (len < bytes)
805 bytes = len;
806 al = gfs2_alloc_get(ip);
807 if (!al) {
808 error = -ENOMEM;
809 goto out_unlock;
810 }
811
812 error = gfs2_quota_lock_check(ip);
813 if (error)
814 goto out_alloc_put;
815
816retry:
817 gfs2_write_calc_reserv(ip, bytes, &data_blocks, &ind_blocks);
818
819 al->al_requested = data_blocks + ind_blocks;
820 error = gfs2_inplace_reserve(ip);
821 if (error) {
822 if (error == -ENOSPC && bytes > sdp->sd_sb.sb_bsize) {
823 bytes >>= 1;
824 goto retry;
825 }
826 goto out_qunlock;
827 }
828 max_bytes = bytes;
829 calc_max_reserv(ip, len, &max_bytes, &data_blocks, &ind_blocks);
830 al->al_requested = data_blocks + ind_blocks;
831
832 rblocks = RES_DINODE + ind_blocks + RES_STATFS + RES_QUOTA +
833 RES_RG_HDR + gfs2_rg_blocks(al);
834 if (gfs2_is_jdata(ip))
835 rblocks += data_blocks ? data_blocks : 1;
836
837 error = gfs2_trans_begin(sdp, rblocks,
838 PAGE_CACHE_SIZE/sdp->sd_sb.sb_bsize);
839 if (error)
840 goto out_trans_fail;
841
842 error = fallocate_chunk(inode, offset, max_bytes, mode);
843 gfs2_trans_end(sdp);
844
845 if (error)
846 goto out_trans_fail;
847
848 len -= max_bytes;
849 offset += max_bytes;
850 gfs2_inplace_release(ip);
851 gfs2_quota_unlock(ip);
852 gfs2_alloc_put(ip);
853 }
854 goto out_unlock;
855
856out_trans_fail:
857 gfs2_inplace_release(ip);
858out_qunlock:
859 gfs2_quota_unlock(ip);
860out_alloc_put:
861 gfs2_alloc_put(ip);
862out_unlock:
863 gfs2_glock_dq(&ip->i_gh);
864out_uninit:
865 gfs2_holder_uninit(&ip->i_gh);
866 return error;
867}
868
613#ifdef CONFIG_GFS2_FS_LOCKING_DLM 869#ifdef CONFIG_GFS2_FS_LOCKING_DLM
614 870
615/** 871/**
@@ -765,6 +1021,7 @@ const struct file_operations gfs2_file_fops = {
765 .splice_read = generic_file_splice_read, 1021 .splice_read = generic_file_splice_read,
766 .splice_write = generic_file_splice_write, 1022 .splice_write = generic_file_splice_write,
767 .setlease = gfs2_setlease, 1023 .setlease = gfs2_setlease,
1024 .fallocate = gfs2_fallocate,
768}; 1025};
769 1026
770const struct file_operations gfs2_dir_fops = { 1027const struct file_operations gfs2_dir_fops = {
@@ -794,6 +1051,7 @@ const struct file_operations gfs2_file_fops_nolock = {
794 .splice_read = generic_file_splice_read, 1051 .splice_read = generic_file_splice_read,
795 .splice_write = generic_file_splice_write, 1052 .splice_write = generic_file_splice_write,
796 .setlease = generic_setlease, 1053 .setlease = generic_setlease,
1054 .fallocate = gfs2_fallocate,
797}; 1055};
798 1056
799const struct file_operations gfs2_dir_fops_nolock = { 1057const struct file_operations gfs2_dir_fops_nolock = {
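
write_empty_blocks() above walks the page's buffer list and batches maximal runs of still-unmapped buffers into __block_write_begin() calls (which allocate and zero them), flushing a pending run whenever it meets an already-mapped buffer and once more at the end. A runnable model of the run splitting over a boolean "mapped" array:

#include <stdio.h>
#include <stdbool.h>

/* Model of write_empty_blocks(): walk fixed-size "buffers" across
 * [from, to) and report each maximal run that still needs allocation. */
static void split_runs(const bool *mapped, unsigned bsize,
                       unsigned from, unsigned to)
{
    unsigned start, end = 0, next = 0, i = 0;

    while (next < from) {            /* skip buffers before the range */
        next += bsize;
        i++;
    }
    start = next;
    do {
        next += bsize;
        if (mapped[i]) {
            if (end) {               /* flush the pending unmapped run */
                printf("alloc+zero [%u,%u)\n", start, end);
                end = 0;
            }
            start = next;            /* restart past the mapped buffer */
        } else {
            end = next;              /* extend the run */
        }
        i++;
    } while (next < to);

    if (end)
        printf("alloc+zero [%u,%u)\n", start, end);
}

int main(void)
{
    bool mapped[8] = { false, false, true, false, true, true, false, false };
    split_runs(mapped, 512, 0, 4096);
    /* prints [0,1024), [1536,2048), [3072,4096) */
    return 0;
}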
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 8d3d2b4a0a7d..a79790c06275 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -11,6 +11,7 @@
11#define __INCORE_DOT_H__ 11#define __INCORE_DOT_H__
12 12
13#include <linux/fs.h> 13#include <linux/fs.h>
14#include <linux/kobject.h>
14#include <linux/workqueue.h> 15#include <linux/workqueue.h>
15#include <linux/dlm.h> 16#include <linux/dlm.h>
16#include <linux/buffer_head.h> 17#include <linux/buffer_head.h>
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 2232b3c780bd..7aa7d4f8984a 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -74,16 +74,14 @@ static struct inode *gfs2_iget(struct super_block *sb, u64 no_addr)
74} 74}
75 75
76/** 76/**
77 * GFS2 lookup code fills in vfs inode contents based on info obtained 77 * gfs2_set_iop - Sets inode operations
78 * from directory entry inside gfs2_inode_lookup(). This has caused issues 78 * @inode: The inode with correct i_mode filled in
79 * with NFS code path since its get_dentry routine doesn't have the relevant
80 * directory entry when gfs2_inode_lookup() is invoked. Part of the code
81 * segment inside gfs2_inode_lookup code needs to get moved around.
82 * 79 *
83 * Clears I_NEW as well. 80 * GFS2 lookup code fills in vfs inode contents based on info obtained
84 **/ 81 * from directory entry inside gfs2_inode_lookup().
82 */
85 83
86void gfs2_set_iop(struct inode *inode) 84static void gfs2_set_iop(struct inode *inode)
87{ 85{
88 struct gfs2_sbd *sdp = GFS2_SB(inode); 86 struct gfs2_sbd *sdp = GFS2_SB(inode);
89 umode_t mode = inode->i_mode; 87 umode_t mode = inode->i_mode;
@@ -106,8 +104,6 @@ void gfs2_set_iop(struct inode *inode)
106 inode->i_op = &gfs2_file_iops; 104 inode->i_op = &gfs2_file_iops;
107 init_special_inode(inode, inode->i_mode, inode->i_rdev); 105 init_special_inode(inode, inode->i_mode, inode->i_rdev);
108 } 106 }
109
110 unlock_new_inode(inode);
111} 107}
112 108
113/** 109/**
@@ -119,10 +115,8 @@ void gfs2_set_iop(struct inode *inode)
119 * Returns: A VFS inode, or an error 115 * Returns: A VFS inode, or an error
120 */ 116 */
121 117
122struct inode *gfs2_inode_lookup(struct super_block *sb, 118struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type,
123 unsigned int type, 119 u64 no_addr, u64 no_formal_ino)
124 u64 no_addr,
125 u64 no_formal_ino)
126{ 120{
127 struct inode *inode; 121 struct inode *inode;
128 struct gfs2_inode *ip; 122 struct gfs2_inode *ip;
@@ -152,51 +146,37 @@ struct inode *gfs2_inode_lookup(struct super_block *sb,
152 error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, GL_EXACT, &ip->i_iopen_gh); 146 error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, GL_EXACT, &ip->i_iopen_gh);
153 if (unlikely(error)) 147 if (unlikely(error))
154 goto fail_iopen; 148 goto fail_iopen;
155 ip->i_iopen_gh.gh_gl->gl_object = ip;
156 149
150 ip->i_iopen_gh.gh_gl->gl_object = ip;
157 gfs2_glock_put(io_gl); 151 gfs2_glock_put(io_gl);
158 io_gl = NULL; 152 io_gl = NULL;
159 153
160 if ((type == DT_UNKNOWN) && (no_formal_ino == 0))
161 goto gfs2_nfsbypass;
162
163 inode->i_mode = DT2IF(type);
164
165 /*
166 * We must read the inode in order to work out its type in
167 * this case. Note that this doesn't happen often as we normally
168 * know the type beforehand. This code path only occurs during
169 * unlinked inode recovery (where it is safe to do this glock,
170 * which is not true in the general case).
171 */
172 if (type == DT_UNKNOWN) { 154 if (type == DT_UNKNOWN) {
173 struct gfs2_holder gh; 155 /* Inode glock must be locked already */
174 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); 156 error = gfs2_inode_refresh(GFS2_I(inode));
175 if (unlikely(error)) 157 if (error)
176 goto fail_glock; 158 goto fail_refresh;
177 /* Inode is now uptodate */ 159 } else {
178 gfs2_glock_dq_uninit(&gh); 160 inode->i_mode = DT2IF(type);
179 } 161 }
180 162
181 gfs2_set_iop(inode); 163 gfs2_set_iop(inode);
164 unlock_new_inode(inode);
182 } 165 }
183 166
184gfs2_nfsbypass:
185 return inode; 167 return inode;
186fail_glock: 168
187 gfs2_glock_dq(&ip->i_iopen_gh); 169fail_refresh:
170 ip->i_iopen_gh.gh_gl->gl_object = NULL;
171 gfs2_glock_dq_uninit(&ip->i_iopen_gh);
188fail_iopen: 172fail_iopen:
189 if (io_gl) 173 if (io_gl)
190 gfs2_glock_put(io_gl); 174 gfs2_glock_put(io_gl);
191fail_put: 175fail_put:
192 if (inode->i_state & I_NEW) 176 ip->i_gl->gl_object = NULL;
193 ip->i_gl->gl_object = NULL;
194 gfs2_glock_put(ip->i_gl); 177 gfs2_glock_put(ip->i_gl);
195fail: 178fail:
196 if (inode->i_state & I_NEW) 179 iget_failed(inode);
197 iget_failed(inode);
198 else
199 iput(inode);
200 return ERR_PTR(error); 180 return ERR_PTR(error);
201} 181}
202 182
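
The rewritten error paths in gfs2_inode_lookup() follow the standard
I_NEW protocol: a newly hashed inode stays locked until it is fully set
up, and must end in exactly one of unlock_new_inode() (success) or
iget_failed() (error), which is why the old i_state checks in the fail
labels could go. A generic sketch of the protocol (the test/set helpers
are hypothetical, not gfs2's):

    struct inode *example_iget(struct super_block *sb, u64 no_addr)
    {
            struct inode *inode;

            inode = iget5_locked(sb, no_addr, example_test, example_set,
                                 &no_addr);
            if (!inode)
                    return ERR_PTR(-ENOMEM);
            if (!(inode->i_state & I_NEW))
                    return inode;            /* cached and initialized */

            /* ... read the on-disk inode, set i_op/i_fop ... */

            /* on error instead: iget_failed(inode), which unhashes,
             * unlocks and drops the half-built inode */
            unlock_new_inode(inode);         /* publish, wake I_NEW waiters */
            return inode;
    }
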
@@ -221,14 +201,6 @@ struct inode *gfs2_lookup_by_inum(struct gfs2_sbd *sdp, u64 no_addr,
221 if (IS_ERR(inode)) 201 if (IS_ERR(inode))
222 goto fail; 202 goto fail;
223 203
224 error = gfs2_inode_refresh(GFS2_I(inode));
225 if (error)
226 goto fail_iput;
227
228 /* Pick up the works we bypass in gfs2_inode_lookup */
229 if (inode->i_state & I_NEW)
230 gfs2_set_iop(inode);
231
232 /* Two extra checks for NFS only */ 204 /* Two extra checks for NFS only */
233 if (no_formal_ino) { 205 if (no_formal_ino) {
234 error = -ESTALE; 206 error = -ESTALE;
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index 732a183efdb3..3e00a66e7cbd 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -96,7 +96,6 @@ err:
96 return -EIO; 96 return -EIO;
97} 97}
98 98
99extern void gfs2_set_iop(struct inode *inode);
100extern struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type, 99extern struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type,
101 u64 no_addr, u64 no_formal_ino); 100 u64 no_addr, u64 no_formal_ino);
102extern struct inode *gfs2_lookup_by_inum(struct gfs2_sbd *sdp, u64 no_addr, 101extern struct inode *gfs2_lookup_by_inum(struct gfs2_sbd *sdp, u64 no_addr,
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 2aeabd4218cc..777927ce6f79 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -440,7 +440,6 @@ static int gfs2_lookup_root(struct super_block *sb, struct dentry **dptr,
440 iput(inode); 440 iput(inode);
441 return -ENOMEM; 441 return -ENOMEM;
442 } 442 }
443 d_set_d_op(dentry, &gfs2_dops);
444 *dptr = dentry; 443 *dptr = dentry;
445 return 0; 444 return 0;
446} 445}
@@ -1106,6 +1105,7 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent
1106 1105
1107 sb->s_magic = GFS2_MAGIC; 1106 sb->s_magic = GFS2_MAGIC;
1108 sb->s_op = &gfs2_super_ops; 1107 sb->s_op = &gfs2_super_ops;
1108 sb->s_d_op = &gfs2_dops;
1109 sb->s_export_op = &gfs2_export_ops; 1109 sb->s_export_op = &gfs2_export_ops;
1110 sb->s_xattr = gfs2_xattr_handlers; 1110 sb->s_xattr = gfs2_xattr_handlers;
1111 sb->s_qcop = &gfs2_quotactl_ops; 1111 sb->s_qcop = &gfs2_quotactl_ops;
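
The sb->s_d_op line added here is the pattern applied throughout this
diff (gfs2, hfs, hfsplus, hostfs, hpfs, isofs, jfs): rather than calling
d_set_d_op() on each dentry inside ->lookup(), a filesystem now sets a
default dentry_operations table on the superblock once, and d_alloc()
copies it into every new dentry. A sketch with hypothetical names:

    static int examplefs_fill_super(struct super_block *sb, void *data,
                                    int silent)
    {
            sb->s_op = &examplefs_super_ops;     /* hypothetical ops */
            sb->s_d_op = &examplefs_dentry_ops;  /* inherited by d_alloc() */
            /* ->lookup() no longer needs per-dentry d_set_d_op() calls */
            return 0;
    }
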
@@ -1268,7 +1268,7 @@ static struct dentry *gfs2_mount(struct file_system_type *fs_type, int flags,
1268{ 1268{
1269 struct block_device *bdev; 1269 struct block_device *bdev;
1270 struct super_block *s; 1270 struct super_block *s;
1271 fmode_t mode = FMODE_READ; 1271 fmode_t mode = FMODE_READ | FMODE_EXCL;
1272 int error; 1272 int error;
1273 struct gfs2_args args; 1273 struct gfs2_args args;
1274 struct gfs2_sbd *sdp; 1274 struct gfs2_sbd *sdp;
@@ -1276,7 +1276,7 @@ static struct dentry *gfs2_mount(struct file_system_type *fs_type, int flags,
1276 if (!(flags & MS_RDONLY)) 1276 if (!(flags & MS_RDONLY))
1277 mode |= FMODE_WRITE; 1277 mode |= FMODE_WRITE;
1278 1278
1279 bdev = open_bdev_exclusive(dev_name, mode, fs_type); 1279 bdev = blkdev_get_by_path(dev_name, mode, fs_type);
1280 if (IS_ERR(bdev)) 1280 if (IS_ERR(bdev))
1281 return ERR_CAST(bdev); 1281 return ERR_CAST(bdev);
1282 1282
@@ -1298,7 +1298,7 @@ static struct dentry *gfs2_mount(struct file_system_type *fs_type, int flags,
1298 goto error_bdev; 1298 goto error_bdev;
1299 1299
1300 if (s->s_root) 1300 if (s->s_root)
1301 close_bdev_exclusive(bdev, mode); 1301 blkdev_put(bdev, mode);
1302 1302
1303 memset(&args, 0, sizeof(args)); 1303 memset(&args, 0, sizeof(args));
1304 args.ar_quota = GFS2_QUOTA_DEFAULT; 1304 args.ar_quota = GFS2_QUOTA_DEFAULT;
@@ -1342,7 +1342,7 @@ error_super:
1342 deactivate_locked_super(s); 1342 deactivate_locked_super(s);
1343 return ERR_PTR(error); 1343 return ERR_PTR(error);
1344error_bdev: 1344error_bdev:
1345 close_bdev_exclusive(bdev, mode); 1345 blkdev_put(bdev, mode);
1346 return ERR_PTR(error); 1346 return ERR_PTR(error);
1347} 1347}
1348 1348
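
The mount-path changes above are part of the block-device API
conversion: open_bdev_exclusive()/close_bdev_exclusive() give way to
blkdev_get_by_path()/blkdev_put(), with exclusivity expressed as
FMODE_EXCL in the mode bits and the claim identified by the holder
argument (fs_type here) instead of a separate bd_claim() step. Roughly:

    fmode_t mode = FMODE_READ | FMODE_EXCL;
    struct block_device *bdev;

    bdev = blkdev_get_by_path(dev_name, mode, fs_type); /* open + claim */
    if (IS_ERR(bdev))
            return ERR_CAST(bdev);
    /* ... */
    blkdev_put(bdev, mode);  /* mode must still include FMODE_EXCL */
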
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index 1501db4f0e6d..d8b26ac2e20b 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -18,8 +18,6 @@
18#include <linux/gfs2_ondisk.h> 18#include <linux/gfs2_ondisk.h>
19#include <linux/crc32.h> 19#include <linux/crc32.h>
20#include <linux/fiemap.h> 20#include <linux/fiemap.h>
21#include <linux/swap.h>
22#include <linux/falloc.h>
23#include <asm/uaccess.h> 21#include <asm/uaccess.h>
24 22
25#include "gfs2.h" 23#include "gfs2.h"
@@ -106,8 +104,6 @@ static struct dentry *gfs2_lookup(struct inode *dir, struct dentry *dentry,
106{ 104{
107 struct inode *inode = NULL; 105 struct inode *inode = NULL;
108 106
109 d_set_d_op(dentry, &gfs2_dops);
110
111 inode = gfs2_lookupi(dir, &dentry->d_name, 0); 107 inode = gfs2_lookupi(dir, &dentry->d_name, 0);
112 if (inode && IS_ERR(inode)) 108 if (inode && IS_ERR(inode))
113 return ERR_CAST(inode); 109 return ERR_CAST(inode);
@@ -1259,257 +1255,6 @@ static int gfs2_removexattr(struct dentry *dentry, const char *name)
1259 return ret; 1255 return ret;
1260} 1256}
1261 1257
1262static void empty_write_end(struct page *page, unsigned from,
1263 unsigned to)
1264{
1265 struct gfs2_inode *ip = GFS2_I(page->mapping->host);
1266
1267 page_zero_new_buffers(page, from, to);
1268 flush_dcache_page(page);
1269 mark_page_accessed(page);
1270
1271 if (!gfs2_is_writeback(ip))
1272 gfs2_page_add_databufs(ip, page, from, to);
1273
1274 block_commit_write(page, from, to);
1275}
1276
1277
1278static int write_empty_blocks(struct page *page, unsigned from, unsigned to)
1279{
1280 unsigned start, end, next;
1281 struct buffer_head *bh, *head;
1282 int error;
1283
1284 if (!page_has_buffers(page)) {
1285 error = __block_write_begin(page, from, to - from, gfs2_block_map);
1286 if (unlikely(error))
1287 return error;
1288
1289 empty_write_end(page, from, to);
1290 return 0;
1291 }
1292
1293 bh = head = page_buffers(page);
1294 next = end = 0;
1295 while (next < from) {
1296 next += bh->b_size;
1297 bh = bh->b_this_page;
1298 }
1299 start = next;
1300 do {
1301 next += bh->b_size;
1302 if (buffer_mapped(bh)) {
1303 if (end) {
1304 error = __block_write_begin(page, start, end - start,
1305 gfs2_block_map);
1306 if (unlikely(error))
1307 return error;
1308 empty_write_end(page, start, end);
1309 end = 0;
1310 }
1311 start = next;
1312 }
1313 else
1314 end = next;
1315 bh = bh->b_this_page;
1316 } while (next < to);
1317
1318 if (end) {
1319 error = __block_write_begin(page, start, end - start, gfs2_block_map);
1320 if (unlikely(error))
1321 return error;
1322 empty_write_end(page, start, end);
1323 }
1324
1325 return 0;
1326}
1327
1328static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len,
1329 int mode)
1330{
1331 struct gfs2_inode *ip = GFS2_I(inode);
1332 struct buffer_head *dibh;
1333 int error;
1334 u64 start = offset >> PAGE_CACHE_SHIFT;
1335 unsigned int start_offset = offset & ~PAGE_CACHE_MASK;
1336 u64 end = (offset + len - 1) >> PAGE_CACHE_SHIFT;
1337 pgoff_t curr;
1338 struct page *page;
1339 unsigned int end_offset = (offset + len) & ~PAGE_CACHE_MASK;
1340 unsigned int from, to;
1341
1342 if (!end_offset)
1343 end_offset = PAGE_CACHE_SIZE;
1344
1345 error = gfs2_meta_inode_buffer(ip, &dibh);
1346 if (unlikely(error))
1347 goto out;
1348
1349 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1350
1351 if (gfs2_is_stuffed(ip)) {
1352 error = gfs2_unstuff_dinode(ip, NULL);
1353 if (unlikely(error))
1354 goto out;
1355 }
1356
1357 curr = start;
1358 offset = start << PAGE_CACHE_SHIFT;
1359 from = start_offset;
1360 to = PAGE_CACHE_SIZE;
1361 while (curr <= end) {
1362 page = grab_cache_page_write_begin(inode->i_mapping, curr,
1363 AOP_FLAG_NOFS);
1364 if (unlikely(!page)) {
1365 error = -ENOMEM;
1366 goto out;
1367 }
1368
1369 if (curr == end)
1370 to = end_offset;
1371 error = write_empty_blocks(page, from, to);
1372 if (!error && offset + to > inode->i_size &&
1373 !(mode & FALLOC_FL_KEEP_SIZE)) {
1374 i_size_write(inode, offset + to);
1375 }
1376 unlock_page(page);
1377 page_cache_release(page);
1378 if (error)
1379 goto out;
1380 curr++;
1381 offset += PAGE_CACHE_SIZE;
1382 from = 0;
1383 }
1384
1385 gfs2_dinode_out(ip, dibh->b_data);
1386 mark_inode_dirty(inode);
1387
1388 brelse(dibh);
1389
1390out:
1391 return error;
1392}
1393
1394static void calc_max_reserv(struct gfs2_inode *ip, loff_t max, loff_t *len,
1395 unsigned int *data_blocks, unsigned int *ind_blocks)
1396{
1397 const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1398 unsigned int max_blocks = ip->i_alloc->al_rgd->rd_free_clone;
1399 unsigned int tmp, max_data = max_blocks - 3 * (sdp->sd_max_height - 1);
1400
1401 for (tmp = max_data; tmp > sdp->sd_diptrs;) {
1402 tmp = DIV_ROUND_UP(tmp, sdp->sd_inptrs);
1403 max_data -= tmp;
1404 }
1405 /* This calculation isn't the exact reverse of gfs2_write_calc_reserve,
1406 so it might end up with fewer data blocks */
1407 if (max_data <= *data_blocks)
1408 return;
1409 *data_blocks = max_data;
1410 *ind_blocks = max_blocks - max_data;
1411 *len = ((loff_t)max_data - 3) << sdp->sd_sb.sb_bsize_shift;
1412 if (*len > max) {
1413 *len = max;
1414 gfs2_write_calc_reserv(ip, max, data_blocks, ind_blocks);
1415 }
1416}
1417
1418static long gfs2_fallocate(struct inode *inode, int mode, loff_t offset,
1419 loff_t len)
1420{
1421 struct gfs2_sbd *sdp = GFS2_SB(inode);
1422 struct gfs2_inode *ip = GFS2_I(inode);
1423 unsigned int data_blocks = 0, ind_blocks = 0, rblocks;
1424 loff_t bytes, max_bytes;
1425 struct gfs2_alloc *al;
1426 int error;
1427 loff_t next = (offset + len - 1) >> sdp->sd_sb.sb_bsize_shift;
1428 next = (next + 1) << sdp->sd_sb.sb_bsize_shift;
1429
1430 offset = (offset >> sdp->sd_sb.sb_bsize_shift) <<
1431 sdp->sd_sb.sb_bsize_shift;
1432
1433 len = next - offset;
1434 bytes = sdp->sd_max_rg_data * sdp->sd_sb.sb_bsize / 2;
1435 if (!bytes)
1436 bytes = UINT_MAX;
1437
1438 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh);
1439 error = gfs2_glock_nq(&ip->i_gh);
1440 if (unlikely(error))
1441 goto out_uninit;
1442
1443 if (!gfs2_write_alloc_required(ip, offset, len))
1444 goto out_unlock;
1445
1446 while (len > 0) {
1447 if (len < bytes)
1448 bytes = len;
1449 al = gfs2_alloc_get(ip);
1450 if (!al) {
1451 error = -ENOMEM;
1452 goto out_unlock;
1453 }
1454
1455 error = gfs2_quota_lock_check(ip);
1456 if (error)
1457 goto out_alloc_put;
1458
1459retry:
1460 gfs2_write_calc_reserv(ip, bytes, &data_blocks, &ind_blocks);
1461
1462 al->al_requested = data_blocks + ind_blocks;
1463 error = gfs2_inplace_reserve(ip);
1464 if (error) {
1465 if (error == -ENOSPC && bytes > sdp->sd_sb.sb_bsize) {
1466 bytes >>= 1;
1467 goto retry;
1468 }
1469 goto out_qunlock;
1470 }
1471 max_bytes = bytes;
1472 calc_max_reserv(ip, len, &max_bytes, &data_blocks, &ind_blocks);
1473 al->al_requested = data_blocks + ind_blocks;
1474
1475 rblocks = RES_DINODE + ind_blocks + RES_STATFS + RES_QUOTA +
1476 RES_RG_HDR + gfs2_rg_blocks(al);
1477 if (gfs2_is_jdata(ip))
1478 rblocks += data_blocks ? data_blocks : 1;
1479
1480 error = gfs2_trans_begin(sdp, rblocks,
1481 PAGE_CACHE_SIZE/sdp->sd_sb.sb_bsize);
1482 if (error)
1483 goto out_trans_fail;
1484
1485 error = fallocate_chunk(inode, offset, max_bytes, mode);
1486 gfs2_trans_end(sdp);
1487
1488 if (error)
1489 goto out_trans_fail;
1490
1491 len -= max_bytes;
1492 offset += max_bytes;
1493 gfs2_inplace_release(ip);
1494 gfs2_quota_unlock(ip);
1495 gfs2_alloc_put(ip);
1496 }
1497 goto out_unlock;
1498
1499out_trans_fail:
1500 gfs2_inplace_release(ip);
1501out_qunlock:
1502 gfs2_quota_unlock(ip);
1503out_alloc_put:
1504 gfs2_alloc_put(ip);
1505out_unlock:
1506 gfs2_glock_dq(&ip->i_gh);
1507out_uninit:
1508 gfs2_holder_uninit(&ip->i_gh);
1509 return error;
1510}
1511
1512
1513static int gfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 1258static int gfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
1514 u64 start, u64 len) 1259 u64 start, u64 len)
1515{ 1260{
@@ -1560,7 +1305,6 @@ const struct inode_operations gfs2_file_iops = {
1560 .getxattr = gfs2_getxattr, 1305 .getxattr = gfs2_getxattr,
1561 .listxattr = gfs2_listxattr, 1306 .listxattr = gfs2_listxattr,
1562 .removexattr = gfs2_removexattr, 1307 .removexattr = gfs2_removexattr,
1563 .fallocate = gfs2_fallocate,
1564 .fiemap = gfs2_fiemap, 1308 .fiemap = gfs2_fiemap,
1565}; 1309};
1566 1310
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 16c2ecac7eb7..ec73ed70bae1 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -1336,6 +1336,7 @@ static void gfs2_evict_inode(struct inode *inode)
1336 if (error) 1336 if (error)
1337 goto out_truncate; 1337 goto out_truncate;
1338 1338
1339 ip->i_iopen_gh.gh_flags |= GL_NOCACHE;
1339 gfs2_glock_dq_wait(&ip->i_iopen_gh); 1340 gfs2_glock_dq_wait(&ip->i_iopen_gh);
1340 gfs2_holder_reinit(LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB | GL_NOCACHE, &ip->i_iopen_gh); 1341 gfs2_holder_reinit(LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB | GL_NOCACHE, &ip->i_iopen_gh);
1341 error = gfs2_glock_nq(&ip->i_iopen_gh); 1342 error = gfs2_glock_nq(&ip->i_iopen_gh);
diff --git a/fs/hfs/dir.c b/fs/hfs/dir.c
index ea4aefe7c652..afa66aaa2237 100644
--- a/fs/hfs/dir.c
+++ b/fs/hfs/dir.c
@@ -25,8 +25,6 @@ static struct dentry *hfs_lookup(struct inode *dir, struct dentry *dentry,
25 struct inode *inode = NULL; 25 struct inode *inode = NULL;
26 int res; 26 int res;
27 27
28 d_set_d_op(dentry, &hfs_dentry_operations);
29
30 hfs_find_init(HFS_SB(dir->i_sb)->cat_tree, &fd); 28 hfs_find_init(HFS_SB(dir->i_sb)->cat_tree, &fd);
31 hfs_cat_build_key(dir->i_sb, fd.search_key, dir->i_ino, &dentry->d_name); 29 hfs_cat_build_key(dir->i_sb, fd.search_key, dir->i_ino, &dentry->d_name);
32 res = hfs_brec_read(&fd, &rec, sizeof(rec)); 30 res = hfs_brec_read(&fd, &rec, sizeof(rec));
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index 0bef62aa4f42..1b55f704fb22 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -429,13 +429,12 @@ static int hfs_fill_super(struct super_block *sb, void *data, int silent)
429 if (!root_inode) 429 if (!root_inode)
430 goto bail_no_root; 430 goto bail_no_root;
431 431
432 sb->s_d_op = &hfs_dentry_operations;
432 res = -ENOMEM; 433 res = -ENOMEM;
433 sb->s_root = d_alloc_root(root_inode); 434 sb->s_root = d_alloc_root(root_inode);
434 if (!sb->s_root) 435 if (!sb->s_root)
435 goto bail_iput; 436 goto bail_iput;
436 437
437 d_set_d_op(sb->s_root, &hfs_dentry_operations);
438
439 /* everything's okay */ 438 /* everything's okay */
440 return 0; 439 return 0;
441 440
diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c
index f896dc843026..4df5059c25da 100644
--- a/fs/hfsplus/dir.c
+++ b/fs/hfsplus/dir.c
@@ -37,7 +37,6 @@ static struct dentry *hfsplus_lookup(struct inode *dir, struct dentry *dentry,
37 37
38 sb = dir->i_sb; 38 sb = dir->i_sb;
39 39
40 d_set_d_op(dentry, &hfsplus_dentry_operations);
41 dentry->d_fsdata = NULL; 40 dentry->d_fsdata = NULL;
42 hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd); 41 hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd);
43 hfsplus_cat_build_key(sb, fd.search_key, dir->i_ino, &dentry->d_name); 42 hfsplus_cat_build_key(sb, fd.search_key, dir->i_ino, &dentry->d_name);
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 6ee6ad20acf2..9a3b4795f43c 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -444,13 +444,13 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
444 err = PTR_ERR(root); 444 err = PTR_ERR(root);
445 goto cleanup; 445 goto cleanup;
446 } 446 }
447 sb->s_d_op = &hfsplus_dentry_operations;
447 sb->s_root = d_alloc_root(root); 448 sb->s_root = d_alloc_root(root);
448 if (!sb->s_root) { 449 if (!sb->s_root) {
449 iput(root); 450 iput(root);
450 err = -ENOMEM; 451 err = -ENOMEM;
451 goto cleanup; 452 goto cleanup;
452 } 453 }
453 d_set_d_op(sb->s_root, &hfsplus_dentry_operations);
454 454
455 str.len = sizeof(HFSP_HIDDENDIR_NAME) - 1; 455 str.len = sizeof(HFSP_HIDDENDIR_NAME) - 1;
456 str.name = HFSP_HIDDENDIR_NAME; 456 str.name = HFSP_HIDDENDIR_NAME;
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index d3244d949a4e..2638c834ed28 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -612,7 +612,6 @@ struct dentry *hostfs_lookup(struct inode *ino, struct dentry *dentry,
612 goto out_put; 612 goto out_put;
613 613
614 d_add(dentry, inode); 614 d_add(dentry, inode);
615 d_set_d_op(dentry, &hostfs_dentry_ops);
616 return NULL; 615 return NULL;
617 616
618 out_put: 617 out_put:
@@ -922,6 +921,7 @@ static int hostfs_fill_sb_common(struct super_block *sb, void *d, int silent)
922 sb->s_blocksize_bits = 10; 921 sb->s_blocksize_bits = 10;
923 sb->s_magic = HOSTFS_SUPER_MAGIC; 922 sb->s_magic = HOSTFS_SUPER_MAGIC;
924 sb->s_op = &hostfs_sbops; 923 sb->s_op = &hostfs_sbops;
924 sb->s_d_op = &hostfs_dentry_ops;
925 sb->s_maxbytes = MAX_LFS_FILESIZE; 925 sb->s_maxbytes = MAX_LFS_FILESIZE;
926 926
927 /* NULL is printed as <NULL> by sprintf: avoid that. */ 927 /* NULL is printed as <NULL> by sprintf: avoid that. */
diff --git a/fs/hpfs/dentry.c b/fs/hpfs/dentry.c
index 32c13a94e1e9..05d4816e4e77 100644
--- a/fs/hpfs/dentry.c
+++ b/fs/hpfs/dentry.c
@@ -58,12 +58,7 @@ static int hpfs_compare_dentry(const struct dentry *parent,
58 return 0; 58 return 0;
59} 59}
60 60
61static const struct dentry_operations hpfs_dentry_operations = { 61const struct dentry_operations hpfs_dentry_operations = {
62 .d_hash = hpfs_hash_dentry, 62 .d_hash = hpfs_hash_dentry,
63 .d_compare = hpfs_compare_dentry, 63 .d_compare = hpfs_compare_dentry,
64}; 64};
65
66void hpfs_set_dentry_operations(struct dentry *dentry)
67{
68 d_set_d_op(dentry, &hpfs_dentry_operations);
69}
diff --git a/fs/hpfs/dir.c b/fs/hpfs/dir.c
index 2338130cceba..d32f63a569f7 100644
--- a/fs/hpfs/dir.c
+++ b/fs/hpfs/dir.c
@@ -298,7 +298,6 @@ struct dentry *hpfs_lookup(struct inode *dir, struct dentry *dentry, struct name
298 298
299 end: 299 end:
300 end_add: 300 end_add:
301 hpfs_set_dentry_operations(dentry);
302 unlock_kernel(); 301 unlock_kernel();
303 d_add(dentry, result); 302 d_add(dentry, result);
304 return NULL; 303 return NULL;
diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h
index 2fee17d0d9ab..1c43dbea55e8 100644
--- a/fs/hpfs/hpfs_fn.h
+++ b/fs/hpfs/hpfs_fn.h
@@ -233,7 +233,7 @@ void hpfs_mark_4buffers_dirty(struct quad_buffer_head *);
233 233
234/* dentry.c */ 234/* dentry.c */
235 235
236void hpfs_set_dentry_operations(struct dentry *); 236extern const struct dentry_operations hpfs_dentry_operations;
237 237
238/* dir.c */ 238/* dir.c */
239 239
diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c
index 56f0da1cfd10..1ae35baa539e 100644
--- a/fs/hpfs/inode.c
+++ b/fs/hpfs/inode.c
@@ -281,7 +281,7 @@ int hpfs_setattr(struct dentry *dentry, struct iattr *attr)
281 attr->ia_size != i_size_read(inode)) { 281 attr->ia_size != i_size_read(inode)) {
282 error = vmtruncate(inode, attr->ia_size); 282 error = vmtruncate(inode, attr->ia_size);
283 if (error) 283 if (error)
284 return error; 284 goto out_unlock;
285 } 285 }
286 286
287 setattr_copy(inode, attr); 287 setattr_copy(inode, attr);
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index 49935ba78db8..b30426b1fc97 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -550,6 +550,7 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent)
550 /* Fill superblock stuff */ 550 /* Fill superblock stuff */
551 s->s_magic = HPFS_SUPER_MAGIC; 551 s->s_magic = HPFS_SUPER_MAGIC;
552 s->s_op = &hpfs_sops; 552 s->s_op = &hpfs_sops;
553 s->s_d_op = &hpfs_dentry_operations;
553 554
554 sbi->sb_root = superblock->root; 555 sbi->sb_root = superblock->root;
555 sbi->sb_fs_size = superblock->n_sectors; 556 sbi->sb_fs_size = superblock->n_sectors;
@@ -651,7 +652,6 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent)
651 iput(root); 652 iput(root);
652 goto bail0; 653 goto bail0;
653 } 654 }
654 hpfs_set_dentry_operations(s->s_root);
655 655
656 /* 656 /*
657 * find the root directory's . pointer & finish filling in the inode 657 * find the root directory's . pointer & finish filling in the inode
diff --git a/fs/internal.h b/fs/internal.h
index 9687c2ee2735..0663568b1247 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -70,6 +70,10 @@ extern void mnt_set_mountpoint(struct vfsmount *, struct dentry *,
70extern void release_mounts(struct list_head *); 70extern void release_mounts(struct list_head *);
71extern void umount_tree(struct vfsmount *, int, struct list_head *); 71extern void umount_tree(struct vfsmount *, int, struct list_head *);
72extern struct vfsmount *copy_tree(struct vfsmount *, struct dentry *, int); 72extern struct vfsmount *copy_tree(struct vfsmount *, struct dentry *, int);
73extern int finish_automount(struct vfsmount *, struct path *);
74
75extern void mnt_make_longterm(struct vfsmount *);
76extern void mnt_make_shortterm(struct vfsmount *);
73 77
74extern void __init mnt_init(void); 78extern void __init mnt_init(void);
75 79
diff --git a/fs/ioctl.c b/fs/ioctl.c
index d6cc16476620..a59635e295fa 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -86,7 +86,7 @@ int fiemap_fill_next_extent(struct fiemap_extent_info *fieinfo, u64 logical,
86 u64 phys, u64 len, u32 flags) 86 u64 phys, u64 len, u32 flags)
87{ 87{
88 struct fiemap_extent extent; 88 struct fiemap_extent extent;
89 struct fiemap_extent *dest = fieinfo->fi_extents_start; 89 struct fiemap_extent __user *dest = fieinfo->fi_extents_start;
90 90
91 /* only count the extents */ 91 /* only count the extents */
92 if (fieinfo->fi_extents_max == 0) { 92 if (fieinfo->fi_extents_max == 0) {
@@ -173,6 +173,7 @@ static int fiemap_check_ranges(struct super_block *sb,
173static int ioctl_fiemap(struct file *filp, unsigned long arg) 173static int ioctl_fiemap(struct file *filp, unsigned long arg)
174{ 174{
175 struct fiemap fiemap; 175 struct fiemap fiemap;
176 struct fiemap __user *ufiemap = (struct fiemap __user *) arg;
176 struct fiemap_extent_info fieinfo = { 0, }; 177 struct fiemap_extent_info fieinfo = { 0, };
177 struct inode *inode = filp->f_path.dentry->d_inode; 178 struct inode *inode = filp->f_path.dentry->d_inode;
178 struct super_block *sb = inode->i_sb; 179 struct super_block *sb = inode->i_sb;
@@ -182,8 +183,7 @@ static int ioctl_fiemap(struct file *filp, unsigned long arg)
182 if (!inode->i_op->fiemap) 183 if (!inode->i_op->fiemap)
183 return -EOPNOTSUPP; 184 return -EOPNOTSUPP;
184 185
185 if (copy_from_user(&fiemap, (struct fiemap __user *)arg, 186 if (copy_from_user(&fiemap, ufiemap, sizeof(fiemap)))
186 sizeof(struct fiemap)))
187 return -EFAULT; 187 return -EFAULT;
188 188
189 if (fiemap.fm_extent_count > FIEMAP_MAX_EXTENTS) 189 if (fiemap.fm_extent_count > FIEMAP_MAX_EXTENTS)
@@ -196,7 +196,7 @@ static int ioctl_fiemap(struct file *filp, unsigned long arg)
196 196
197 fieinfo.fi_flags = fiemap.fm_flags; 197 fieinfo.fi_flags = fiemap.fm_flags;
198 fieinfo.fi_extents_max = fiemap.fm_extent_count; 198 fieinfo.fi_extents_max = fiemap.fm_extent_count;
199 fieinfo.fi_extents_start = (struct fiemap_extent *)(arg + sizeof(fiemap)); 199 fieinfo.fi_extents_start = ufiemap->fm_extents;
200 200
201 if (fiemap.fm_extent_count != 0 && 201 if (fiemap.fm_extent_count != 0 &&
202 !access_ok(VERIFY_WRITE, fieinfo.fi_extents_start, 202 !access_ok(VERIFY_WRITE, fieinfo.fi_extents_start,
@@ -209,7 +209,7 @@ static int ioctl_fiemap(struct file *filp, unsigned long arg)
209 error = inode->i_op->fiemap(inode, &fieinfo, fiemap.fm_start, len); 209 error = inode->i_op->fiemap(inode, &fieinfo, fiemap.fm_start, len);
210 fiemap.fm_flags = fieinfo.fi_flags; 210 fiemap.fm_flags = fieinfo.fi_flags;
211 fiemap.fm_mapped_extents = fieinfo.fi_extents_mapped; 211 fiemap.fm_mapped_extents = fieinfo.fi_extents_mapped;
212 if (copy_to_user((char *)arg, &fiemap, sizeof(fiemap))) 212 if (copy_to_user(ufiemap, &fiemap, sizeof(fiemap)))
213 error = -EFAULT; 213 error = -EFAULT;
214 214
215 return error; 215 return error;
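
The ioctl_fiemap() hunks are sparse-annotation fixes rather than
behaviour changes: user-space pointers gain __user so sparse can flag
any direct dereference, and the repeated casts of arg collapse into the
single ufiemap alias. Note that ufiemap->fm_extents is safe without a
copy because fm_extents is an array member, so the expression only
forms an address and never touches user memory:

    struct fiemap __user *ufiemap = (struct fiemap __user *)arg;
    struct fiemap fiemap;

    if (copy_from_user(&fiemap, ufiemap, sizeof(fiemap)))  /* checked copy */
            return -EFAULT;
    fieinfo.fi_extents_start = ufiemap->fm_extents;  /* address only */
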
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index 844a7903c72f..a0f3833c0dbf 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -939,17 +939,18 @@ root_found:
939 goto out_iput; 939 goto out_iput;
940 } 940 }
941 941
942 /* get the root dentry */
943 s->s_root = d_alloc_root(inode);
944 if (!(s->s_root))
945 goto out_no_root;
946
947 table = 0; 942 table = 0;
948 if (joliet_level) 943 if (joliet_level)
949 table += 2; 944 table += 2;
950 if (opt.check == 'r') 945 if (opt.check == 'r')
951 table++; 946 table++;
952 d_set_d_op(s->s_root, &isofs_dentry_ops[table]); 947
948 s->s_d_op = &isofs_dentry_ops[table];
949
950 /* get the root dentry */
951 s->s_root = d_alloc_root(inode);
952 if (!(s->s_root))
953 goto out_no_root;
953 954
954 kfree(opt.iocharset); 955 kfree(opt.iocharset);
955 956
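
The reordering in isofs is load-bearing: with the new scheme,
d_alloc_root() inherits s_d_op into the root dentry, so the
dentry-operations table must be selected before the root dentry is
created rather than patched in afterwards. In short:

    s->s_d_op = &isofs_dentry_ops[table];  /* choose the table first */

    s->s_root = d_alloc_root(inode);       /* root dentry inherits s_d_op */
    if (!s->s_root)
            goto out_no_root;
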
diff --git a/fs/isofs/namei.c b/fs/isofs/namei.c
index 679a849c3b27..4fb3e8074fd4 100644
--- a/fs/isofs/namei.c
+++ b/fs/isofs/namei.c
@@ -172,8 +172,6 @@ struct dentry *isofs_lookup(struct inode *dir, struct dentry *dentry, struct nam
172 struct inode *inode; 172 struct inode *inode;
173 struct page *page; 173 struct page *page;
174 174
175 d_set_d_op(dentry, dir->i_sb->s_root->d_op);
176
177 page = alloc_page(GFP_USER); 175 page = alloc_page(GFP_USER);
178 if (!page) 176 if (!page)
179 return ERR_PTR(-ENOMEM); 177 return ERR_PTR(-ENOMEM);
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index 846a3f314111..5b2e4c30a2a1 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -207,7 +207,7 @@ repeat_locked:
207 * the committing transaction. Really, we only need to give it 207 * the committing transaction. Really, we only need to give it
208 * committing_transaction->t_outstanding_credits plus "enough" for 208 * committing_transaction->t_outstanding_credits plus "enough" for
209 * the log control blocks. 209 * the log control blocks.
210 * Also, this test is inconsitent with the matching one in 210 * Also, this test is inconsistent with the matching one in
211 * journal_extend(). 211 * journal_extend().
212 */ 212 */
213 if (__log_space_left(journal) < jbd_space_needed(journal)) { 213 if (__log_space_left(journal) < jbd_space_needed(journal)) {
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index f837ba953529..9e4686900f18 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -43,6 +43,7 @@
43#include <linux/vmalloc.h> 43#include <linux/vmalloc.h>
44#include <linux/backing-dev.h> 44#include <linux/backing-dev.h>
45#include <linux/bitops.h> 45#include <linux/bitops.h>
46#include <linux/ratelimit.h>
46 47
47#define CREATE_TRACE_POINTS 48#define CREATE_TRACE_POINTS
48#include <trace/events/jbd2.h> 49#include <trace/events/jbd2.h>
@@ -93,6 +94,7 @@ EXPORT_SYMBOL(jbd2_journal_file_inode);
93EXPORT_SYMBOL(jbd2_journal_init_jbd_inode); 94EXPORT_SYMBOL(jbd2_journal_init_jbd_inode);
94EXPORT_SYMBOL(jbd2_journal_release_jbd_inode); 95EXPORT_SYMBOL(jbd2_journal_release_jbd_inode);
95EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate); 96EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate);
97EXPORT_SYMBOL(jbd2_inode_cache);
96 98
97static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *); 99static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *);
98static void __journal_abort_soft (journal_t *journal, int errno); 100static void __journal_abort_soft (journal_t *journal, int errno);
@@ -827,7 +829,7 @@ static journal_t * journal_init_common (void)
827 829
828 journal = kzalloc(sizeof(*journal), GFP_KERNEL); 830 journal = kzalloc(sizeof(*journal), GFP_KERNEL);
829 if (!journal) 831 if (!journal)
830 goto fail; 832 return NULL;
831 833
832 init_waitqueue_head(&journal->j_wait_transaction_locked); 834 init_waitqueue_head(&journal->j_wait_transaction_locked);
833 init_waitqueue_head(&journal->j_wait_logspace); 835 init_waitqueue_head(&journal->j_wait_logspace);
@@ -852,14 +854,12 @@ static journal_t * journal_init_common (void)
852 err = jbd2_journal_init_revoke(journal, JOURNAL_REVOKE_DEFAULT_HASH); 854 err = jbd2_journal_init_revoke(journal, JOURNAL_REVOKE_DEFAULT_HASH);
853 if (err) { 855 if (err) {
854 kfree(journal); 856 kfree(journal);
855 goto fail; 857 return NULL;
856 } 858 }
857 859
858 spin_lock_init(&journal->j_history_lock); 860 spin_lock_init(&journal->j_history_lock);
859 861
860 return journal; 862 return journal;
861fail:
862 return NULL;
863} 863}
864 864
865/* jbd2_journal_init_dev and jbd2_journal_init_inode: 865/* jbd2_journal_init_dev and jbd2_journal_init_inode:
@@ -1982,7 +1982,6 @@ static void jbd2_journal_destroy_jbd2_journal_head_cache(void)
1982static struct journal_head *journal_alloc_journal_head(void) 1982static struct journal_head *journal_alloc_journal_head(void)
1983{ 1983{
1984 struct journal_head *ret; 1984 struct journal_head *ret;
1985 static unsigned long last_warning;
1986 1985
1987#ifdef CONFIG_JBD2_DEBUG 1986#ifdef CONFIG_JBD2_DEBUG
1988 atomic_inc(&nr_journal_heads); 1987 atomic_inc(&nr_journal_heads);
@@ -1990,11 +1989,7 @@ static struct journal_head *journal_alloc_journal_head(void)
1990 ret = kmem_cache_alloc(jbd2_journal_head_cache, GFP_NOFS); 1989 ret = kmem_cache_alloc(jbd2_journal_head_cache, GFP_NOFS);
1991 if (!ret) { 1990 if (!ret) {
1992 jbd_debug(1, "out of memory for journal_head\n"); 1991 jbd_debug(1, "out of memory for journal_head\n");
1993 if (time_after(jiffies, last_warning + 5*HZ)) { 1992 pr_notice_ratelimited("ENOMEM in %s, retrying.\n", __func__);
1994 printk(KERN_NOTICE "ENOMEM in %s, retrying.\n",
1995 __func__);
1996 last_warning = jiffies;
1997 }
1998 while (!ret) { 1993 while (!ret) {
1999 yield(); 1994 yield();
2000 ret = kmem_cache_alloc(jbd2_journal_head_cache, GFP_NOFS); 1995 ret = kmem_cache_alloc(jbd2_journal_head_cache, GFP_NOFS);
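
pr_notice_ratelimited() (hence the linux/ratelimit.h include added at
the top of this file) replaces the hand-rolled last_warning/jiffies
bookkeeping; the helper keeps static per-call-site state and by default
allows only a short burst of messages per 5-second window, roughly what
the removed code approximated. Equivalent usage:

    #include <linux/ratelimit.h>

    /* Rate-limit state is kept inside the macro, per call site. */
    pr_notice_ratelimited("ENOMEM in %s, retrying.\n", __func__);
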
@@ -2292,17 +2287,19 @@ static void __exit jbd2_remove_jbd_stats_proc_entry(void)
2292 2287
2293#endif 2288#endif
2294 2289
2295struct kmem_cache *jbd2_handle_cache; 2290struct kmem_cache *jbd2_handle_cache, *jbd2_inode_cache;
2296 2291
2297static int __init journal_init_handle_cache(void) 2292static int __init journal_init_handle_cache(void)
2298{ 2293{
2299 jbd2_handle_cache = kmem_cache_create("jbd2_journal_handle", 2294 jbd2_handle_cache = KMEM_CACHE(jbd2_journal_handle, SLAB_TEMPORARY);
2300 sizeof(handle_t),
2301 0, /* offset */
2302 SLAB_TEMPORARY, /* flags */
2303 NULL); /* ctor */
2304 if (jbd2_handle_cache == NULL) { 2295 if (jbd2_handle_cache == NULL) {
2305 printk(KERN_EMERG "JBD: failed to create handle cache\n"); 2296 printk(KERN_EMERG "JBD2: failed to create handle cache\n");
2297 return -ENOMEM;
2298 }
2299 jbd2_inode_cache = KMEM_CACHE(jbd2_inode, 0);
2300 if (jbd2_inode_cache == NULL) {
2301 printk(KERN_EMERG "JBD2: failed to create inode cache\n");
2302 kmem_cache_destroy(jbd2_handle_cache);
2306 return -ENOMEM; 2303 return -ENOMEM;
2307 } 2304 }
2308 return 0; 2305 return 0;
@@ -2312,6 +2309,9 @@ static void jbd2_journal_destroy_handle_cache(void)
2312{ 2309{
2313 if (jbd2_handle_cache) 2310 if (jbd2_handle_cache)
2314 kmem_cache_destroy(jbd2_handle_cache); 2311 kmem_cache_destroy(jbd2_handle_cache);
2312 if (jbd2_inode_cache)
2313 kmem_cache_destroy(jbd2_inode_cache);
2314
2315} 2315}
2316 2316
2317/* 2317/*
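
KMEM_CACHE() derives the cache name, object size and alignment from the
struct type itself, which is why the open-coded kmem_cache_create()
arguments above disappear. The call in this hunk expands to roughly:

    kmem_cache_create("jbd2_journal_handle",
                      sizeof(struct jbd2_journal_handle),
                      __alignof__(struct jbd2_journal_handle),
                      SLAB_TEMPORARY, NULL);
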
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index 2bc4d5f116f1..1cad869494f0 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -299,10 +299,10 @@ int jbd2_journal_skip_recovery(journal_t *journal)
299#ifdef CONFIG_JBD2_DEBUG 299#ifdef CONFIG_JBD2_DEBUG
300 int dropped = info.end_transaction - 300 int dropped = info.end_transaction -
301 be32_to_cpu(journal->j_superblock->s_sequence); 301 be32_to_cpu(journal->j_superblock->s_sequence);
302#endif
303 jbd_debug(1, 302 jbd_debug(1,
304 "JBD: ignoring %d transaction%s from the journal.\n", 303 "JBD: ignoring %d transaction%s from the journal.\n",
305 dropped, (dropped == 1) ? "" : "s"); 304 dropped, (dropped == 1) ? "" : "s");
305#endif
306 journal->j_transaction_sequence = ++info.end_transaction; 306 journal->j_transaction_sequence = ++info.end_transaction;
307 } 307 }
308 308
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 6bf0a242613e..faad2bd787c7 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -251,7 +251,7 @@ repeat:
251 * the committing transaction. Really, we only need to give it 251 * the committing transaction. Really, we only need to give it
252 * committing_transaction->t_outstanding_credits plus "enough" for 252 * committing_transaction->t_outstanding_credits plus "enough" for
253 * the log control blocks. 253 * the log control blocks.
254 * Also, this test is inconsitent with the matching one in 254 * Also, this test is inconsistent with the matching one in
255 * jbd2_journal_extend(). 255 * jbd2_journal_extend().
256 */ 256 */
257 if (__jbd2_log_space_left(journal) < jbd_space_needed(journal)) { 257 if (__jbd2_log_space_left(journal) < jbd_space_needed(journal)) {
@@ -340,9 +340,7 @@ handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int gfp_mask)
340 jbd2_free_handle(handle); 340 jbd2_free_handle(handle);
341 current->journal_info = NULL; 341 current->journal_info = NULL;
342 handle = ERR_PTR(err); 342 handle = ERR_PTR(err);
343 goto out;
344 } 343 }
345out:
346 return handle; 344 return handle;
347} 345}
348EXPORT_SYMBOL(jbd2__journal_start); 346EXPORT_SYMBOL(jbd2__journal_start);
@@ -589,7 +587,7 @@ do_get_write_access(handle_t *handle, struct journal_head *jh,
589 transaction = handle->h_transaction; 587 transaction = handle->h_transaction;
590 journal = transaction->t_journal; 588 journal = transaction->t_journal;
591 589
592 jbd_debug(5, "buffer_head %p, force_copy %d\n", jh, force_copy); 590 jbd_debug(5, "journal_head %p, force_copy %d\n", jh, force_copy);
593 591
594 JBUFFER_TRACE(jh, "entry"); 592 JBUFFER_TRACE(jh, "entry");
595repeat: 593repeat:
@@ -774,7 +772,7 @@ done:
774 J_EXPECT_JH(jh, buffer_uptodate(jh2bh(jh)), 772 J_EXPECT_JH(jh, buffer_uptodate(jh2bh(jh)),
775 "Possible IO failure.\n"); 773 "Possible IO failure.\n");
776 page = jh2bh(jh)->b_page; 774 page = jh2bh(jh)->b_page;
777 offset = ((unsigned long) jh2bh(jh)->b_data) & ~PAGE_MASK; 775 offset = offset_in_page(jh2bh(jh)->b_data);
778 source = kmap_atomic(page, KM_USER0); 776 source = kmap_atomic(page, KM_USER0);
779 /* Fire data frozen trigger just before we copy the data */ 777 /* Fire data frozen trigger just before we copy the data */
780 jbd2_buffer_frozen_trigger(jh, source + offset, 778 jbd2_buffer_frozen_trigger(jh, source + offset,
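
offset_in_page() is the canonical spelling of the masking arithmetic it
replaces here; behaviour is unchanged. For reference, the helper is
defined as:

    #define offset_in_page(p)  ((unsigned long)(p) & ~PAGE_MASK)
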
diff --git a/fs/jffs2/build.c b/fs/jffs2/build.c
index 85c6be2db02f..3005ec4520ad 100644
--- a/fs/jffs2/build.c
+++ b/fs/jffs2/build.c
@@ -336,14 +336,13 @@ int jffs2_do_mount_fs(struct jffs2_sb_info *c)
336 size = sizeof(struct jffs2_eraseblock) * c->nr_blocks; 336 size = sizeof(struct jffs2_eraseblock) * c->nr_blocks;
337#ifndef __ECOS 337#ifndef __ECOS
338 if (jffs2_blocks_use_vmalloc(c)) 338 if (jffs2_blocks_use_vmalloc(c))
339 c->blocks = vmalloc(size); 339 c->blocks = vzalloc(size);
340 else 340 else
341#endif 341#endif
342 c->blocks = kmalloc(size, GFP_KERNEL); 342 c->blocks = kzalloc(size, GFP_KERNEL);
343 if (!c->blocks) 343 if (!c->blocks)
344 return -ENOMEM; 344 return -ENOMEM;
345 345
346 memset(c->blocks, 0, size);
347 for (i=0; i<c->nr_blocks; i++) { 346 for (i=0; i<c->nr_blocks; i++) {
348 INIT_LIST_HEAD(&c->blocks[i].list); 347 INIT_LIST_HEAD(&c->blocks[i].list);
349 c->blocks[i].offset = i * c->sector_size; 348 c->blocks[i].offset = i * c->sector_size;
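
vzalloc() and kzalloc() fold the follow-up memset() into the
allocation, which is why the explicit memset(c->blocks, 0, size) could
be dropped above. vzalloc(size) behaves like:

    void *p = vmalloc(size);
    if (p)
            memset(p, 0, size);
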
diff --git a/fs/jffs2/jffs2_fs_sb.h b/fs/jffs2/jffs2_fs_sb.h
index f864005de64c..0bc6a6c80a56 100644
--- a/fs/jffs2/jffs2_fs_sb.h
+++ b/fs/jffs2/jffs2_fs_sb.h
@@ -144,4 +144,4 @@ struct jffs2_sb_info {
144 void *os_priv; 144 void *os_priv;
145}; 145};
146 146
147#endif /* _JFFS2_FB_SB */ 147#endif /* _JFFS2_FS_SB */
diff --git a/fs/jffs2/xattr.c b/fs/jffs2/xattr.c
index 9b572ca40a49..4f9cc0482949 100644
--- a/fs/jffs2/xattr.c
+++ b/fs/jffs2/xattr.c
@@ -151,7 +151,7 @@ static int do_verify_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_dat
151 JFFS2_ERROR("node CRC failed at %#08x, read=%#08x, calc=%#08x\n", 151 JFFS2_ERROR("node CRC failed at %#08x, read=%#08x, calc=%#08x\n",
152 offset, je32_to_cpu(rx.hdr_crc), crc); 152 offset, je32_to_cpu(rx.hdr_crc), crc);
153 xd->flags |= JFFS2_XFLAGS_INVALID; 153 xd->flags |= JFFS2_XFLAGS_INVALID;
154 return EIO; 154 return -EIO;
155 } 155 }
156 totlen = PAD(sizeof(rx) + rx.name_len + 1 + je16_to_cpu(rx.value_len)); 156 totlen = PAD(sizeof(rx) + rx.name_len + 1 + je16_to_cpu(rx.value_len));
157 if (je16_to_cpu(rx.magic) != JFFS2_MAGIC_BITMASK 157 if (je16_to_cpu(rx.magic) != JFFS2_MAGIC_BITMASK
@@ -167,7 +167,7 @@ static int do_verify_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_dat
167 je32_to_cpu(rx.xid), xd->xid, 167 je32_to_cpu(rx.xid), xd->xid,
168 je32_to_cpu(rx.version), xd->version); 168 je32_to_cpu(rx.version), xd->version);
169 xd->flags |= JFFS2_XFLAGS_INVALID; 169 xd->flags |= JFFS2_XFLAGS_INVALID;
170 return EIO; 170 return -EIO;
171 } 171 }
172 xd->xprefix = rx.xprefix; 172 xd->xprefix = rx.xprefix;
173 xd->name_len = rx.name_len; 173 xd->name_len = rx.name_len;
@@ -230,7 +230,7 @@ static int do_load_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum
230 ref_offset(xd->node), xd->data_crc, crc); 230 ref_offset(xd->node), xd->data_crc, crc);
231 kfree(data); 231 kfree(data);
232 xd->flags |= JFFS2_XFLAGS_INVALID; 232 xd->flags |= JFFS2_XFLAGS_INVALID;
233 return EIO; 233 return -EIO;
234 } 234 }
235 235
236 xd->flags |= JFFS2_XFLAGS_HOT; 236 xd->flags |= JFFS2_XFLAGS_HOT;
@@ -268,7 +268,7 @@ static int load_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *x
268 if (xd->xname) 268 if (xd->xname)
269 return 0; 269 return 0;
270 if (xd->flags & JFFS2_XFLAGS_INVALID) 270 if (xd->flags & JFFS2_XFLAGS_INVALID)
271 return EIO; 271 return -EIO;
272 if (unlikely(is_xattr_datum_unchecked(c, xd))) 272 if (unlikely(is_xattr_datum_unchecked(c, xd)))
273 rc = do_verify_xattr_datum(c, xd); 273 rc = do_verify_xattr_datum(c, xd);
274 if (!rc) 274 if (!rc)
@@ -460,7 +460,7 @@ static int verify_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref
460 if (crc != je32_to_cpu(rr.node_crc)) { 460 if (crc != je32_to_cpu(rr.node_crc)) {
461 JFFS2_ERROR("node CRC failed at %#08x, read=%#08x, calc=%#08x\n", 461 JFFS2_ERROR("node CRC failed at %#08x, read=%#08x, calc=%#08x\n",
462 offset, je32_to_cpu(rr.node_crc), crc); 462 offset, je32_to_cpu(rr.node_crc), crc);
463 return EIO; 463 return -EIO;
464 } 464 }
465 if (je16_to_cpu(rr.magic) != JFFS2_MAGIC_BITMASK 465 if (je16_to_cpu(rr.magic) != JFFS2_MAGIC_BITMASK
466 || je16_to_cpu(rr.nodetype) != JFFS2_NODETYPE_XREF 466 || je16_to_cpu(rr.nodetype) != JFFS2_NODETYPE_XREF
@@ -470,7 +470,7 @@ static int verify_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref
470 offset, je16_to_cpu(rr.magic), JFFS2_MAGIC_BITMASK, 470 offset, je16_to_cpu(rr.magic), JFFS2_MAGIC_BITMASK,
471 je16_to_cpu(rr.nodetype), JFFS2_NODETYPE_XREF, 471 je16_to_cpu(rr.nodetype), JFFS2_NODETYPE_XREF,
472 je32_to_cpu(rr.totlen), PAD(sizeof(rr))); 472 je32_to_cpu(rr.totlen), PAD(sizeof(rr)));
473 return EIO; 473 return -EIO;
474 } 474 }
475 ref->ino = je32_to_cpu(rr.ino); 475 ref->ino = je32_to_cpu(rr.ino);
476 ref->xid = je32_to_cpu(rr.xid); 476 ref->xid = je32_to_cpu(rr.xid);
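
All six changes in this file are the same sign fix: the kernel
convention is to return negative errno values, and a bare EIO is a
positive constant that callers testing rc < 0 would silently treat as
success. The corrected pattern:

    if (crc != je32_to_cpu(rx.hdr_crc)) {
            xd->flags |= JFFS2_XFLAGS_INVALID;
            return -EIO;            /* negative errno, not bare EIO */
    }
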
diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c
index e1b8493b9aaa..278e3fb40b71 100644
--- a/fs/jfs/jfs_logmgr.c
+++ b/fs/jfs/jfs_logmgr.c
@@ -1120,16 +1120,13 @@ int lmLogOpen(struct super_block *sb)
1120 * file systems to log may have n-to-1 relationship; 1120 * file systems to log may have n-to-1 relationship;
1121 */ 1121 */
1122 1122
1123 bdev = open_by_devnum(sbi->logdev, FMODE_READ|FMODE_WRITE); 1123 bdev = blkdev_get_by_dev(sbi->logdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL,
1124 log);
1124 if (IS_ERR(bdev)) { 1125 if (IS_ERR(bdev)) {
1125 rc = -PTR_ERR(bdev); 1126 rc = -PTR_ERR(bdev);
1126 goto free; 1127 goto free;
1127 } 1128 }
1128 1129
1129 if ((rc = bd_claim(bdev, log))) {
1130 goto close;
1131 }
1132
1133 log->bdev = bdev; 1130 log->bdev = bdev;
1134 memcpy(log->uuid, sbi->loguuid, sizeof(log->uuid)); 1131 memcpy(log->uuid, sbi->loguuid, sizeof(log->uuid));
1135 1132
@@ -1137,7 +1134,7 @@ int lmLogOpen(struct super_block *sb)
1137 * initialize log: 1134 * initialize log:
1138 */ 1135 */
1139 if ((rc = lmLogInit(log))) 1136 if ((rc = lmLogInit(log)))
1140 goto unclaim; 1137 goto close;
1141 1138
1142 list_add(&log->journal_list, &jfs_external_logs); 1139 list_add(&log->journal_list, &jfs_external_logs);
1143 1140
@@ -1163,11 +1160,8 @@ journal_found:
1163 list_del(&log->journal_list); 1160 list_del(&log->journal_list);
1164 lbmLogShutdown(log); 1161 lbmLogShutdown(log);
1165 1162
1166 unclaim:
1167 bd_release(bdev);
1168
1169 close: /* close external log device */ 1163 close: /* close external log device */
1170 blkdev_put(bdev, FMODE_READ|FMODE_WRITE); 1164 blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
1171 1165
1172 free: /* free log descriptor */ 1166 free: /* free log descriptor */
1173 mutex_unlock(&jfs_log_mutex); 1167 mutex_unlock(&jfs_log_mutex);
@@ -1512,8 +1506,7 @@ int lmLogClose(struct super_block *sb)
1512 bdev = log->bdev; 1506 bdev = log->bdev;
1513 rc = lmLogShutdown(log); 1507 rc = lmLogShutdown(log);
1514 1508
1515 bd_release(bdev); 1509 blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
1516 blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
1517 1510
1518 kfree(log); 1511 kfree(log);
1519 1512
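
The jfs external-log open gets the same block-device API conversion as
the gfs2 mount path earlier in this diff: blkdev_get_by_dev() opens by
dev_t and, with FMODE_EXCL set, claims the device for the given holder
(the log) in one call, so the separate bd_claim()/bd_release() pair and
the unclaim: label disappear. Shape of the new sequence:

    bdev = blkdev_get_by_dev(sbi->logdev,
                             FMODE_READ | FMODE_WRITE | FMODE_EXCL, log);
    if (IS_ERR(bdev))
            return PTR_ERR(bdev);
    /* ... */
    blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
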
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index 4414e3a42264..81ead850ddb6 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -1465,9 +1465,6 @@ static struct dentry *jfs_lookup(struct inode *dip, struct dentry *dentry, struc
1465 1465
1466 jfs_info("jfs_lookup: name = %s", name); 1466 jfs_info("jfs_lookup: name = %s", name);
1467 1467
1468 if (JFS_SBI(dip->i_sb)->mntflag & JFS_OS2)
1469 d_set_d_op(dentry, &jfs_ci_dentry_operations);
1470
1471 if ((name[0] == '.') && (len == 1)) 1468 if ((name[0] == '.') && (len == 1))
1472 inum = dip->i_ino; 1469 inum = dip->i_ino;
1473 else if (strcmp(name, "..") == 0) 1470 else if (strcmp(name, "..") == 0)
@@ -1492,12 +1489,7 @@ static struct dentry *jfs_lookup(struct inode *dip, struct dentry *dentry, struc
1492 return ERR_CAST(ip); 1489 return ERR_CAST(ip);
1493 } 1490 }
1494 1491
1495 dentry = d_splice_alias(ip, dentry); 1492 return d_splice_alias(ip, dentry);
1496
1497 if (dentry && (JFS_SBI(dip->i_sb)->mntflag & JFS_OS2))
1498 d_set_d_op(dentry, &jfs_ci_dentry_operations);
1499
1500 return dentry;
1501} 1493}
1502 1494
1503static struct inode *jfs_nfs_get_inode(struct super_block *sb, 1495static struct inode *jfs_nfs_get_inode(struct super_block *sb,
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 3150d766e0d4..eeca48a031ab 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -515,6 +515,9 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent)
515 515
516 sb->s_magic = JFS_SUPER_MAGIC; 516 sb->s_magic = JFS_SUPER_MAGIC;
517 517
518 if (sbi->mntflag & JFS_OS2)
519 sb->s_d_op = &jfs_ci_dentry_operations;
520
518 inode = jfs_iget(sb, ROOT_I); 521 inode = jfs_iget(sb, ROOT_I);
519 if (IS_ERR(inode)) { 522 if (IS_ERR(inode)) {
520 ret = PTR_ERR(inode); 523 ret = PTR_ERR(inode);
@@ -524,9 +527,6 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent)
524 if (!sb->s_root) 527 if (!sb->s_root)
525 goto out_no_root; 528 goto out_no_root;
526 529
527 if (sbi->mntflag & JFS_OS2)
528 d_set_d_op(sb->s_root, &jfs_ci_dentry_operations);
529
530 /* logical blocks are represented by 40 bits in pxd_t, etc. */ 530 /* logical blocks are represented by 40 bits in pxd_t, etc. */
531 sb->s_maxbytes = ((u64) sb->s_blocksize) << 40; 531 sb->s_maxbytes = ((u64) sb->s_blocksize) << 40;
532#if BITS_PER_LONG == 32 532#if BITS_PER_LONG == 32
diff --git a/fs/libfs.c b/fs/libfs.c
index 889311e3d06b..c88eab55aec9 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -217,7 +217,8 @@ static const struct super_operations simple_super_operations = {
217 * will never be mountable) 217 * will never be mountable)
218 */ 218 */
219struct dentry *mount_pseudo(struct file_system_type *fs_type, char *name, 219struct dentry *mount_pseudo(struct file_system_type *fs_type, char *name,
220 const struct super_operations *ops, unsigned long magic) 220 const struct super_operations *ops,
221 const struct dentry_operations *dops, unsigned long magic)
221{ 222{
222 struct super_block *s = sget(fs_type, NULL, set_anon_super, NULL); 223 struct super_block *s = sget(fs_type, NULL, set_anon_super, NULL);
223 struct dentry *dentry; 224 struct dentry *dentry;
@@ -254,6 +255,7 @@ struct dentry *mount_pseudo(struct file_system_type *fs_type, char *name,
254 dentry->d_parent = dentry; 255 dentry->d_parent = dentry;
255 d_instantiate(dentry, root); 256 d_instantiate(dentry, root);
256 s->s_root = dentry; 257 s->s_root = dentry;
258 s->s_d_op = dops;
257 s->s_flags |= MS_ACTIVE; 259 s->s_flags |= MS_ACTIVE;
258 return dget(s->s_root); 260 return dget(s->s_root);
259 261
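
mount_pseudo() grows a dentry_operations parameter so that pseudo
filesystems can seed s_d_op the same way the disk filesystems in this
series do; callers pass their ops table or NULL. A hypothetical caller
(names illustrative, not from this patch):

    static struct dentry *examplefs_mount(struct file_system_type *fs_type,
                                          int flags, const char *dev_name,
                                          void *data)
    {
            return mount_pseudo(fs_type, "examplefs:", &examplefs_sops,
                                &examplefs_dops, EXAMPLEFS_MAGIC);
    }
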
diff --git a/fs/lockd/Makefile b/fs/lockd/Makefile
index 97f6073ab339..ca58d64374ca 100644
--- a/fs/lockd/Makefile
+++ b/fs/lockd/Makefile
@@ -4,7 +4,7 @@
4 4
5obj-$(CONFIG_LOCKD) += lockd.o 5obj-$(CONFIG_LOCKD) += lockd.o
6 6
7lockd-objs-y := clntlock.o clntproc.o host.o svc.o svclock.o svcshare.o \ 7lockd-objs-y := clntlock.o clntproc.o clntxdr.o host.o svc.o svclock.o \
8 svcproc.o svcsubs.o mon.o xdr.o grace.o 8 svcshare.o svcproc.o svcsubs.o mon.o xdr.o grace.o
9lockd-objs-$(CONFIG_LOCKD_V4) += xdr4.o svc4proc.o 9lockd-objs-$(CONFIG_LOCKD_V4) += clnt4xdr.o xdr4.o svc4proc.o
10lockd-objs := $(lockd-objs-y) 10lockd-objs := $(lockd-objs-y)
diff --git a/fs/lockd/clnt4xdr.c b/fs/lockd/clnt4xdr.c
new file mode 100644
index 000000000000..f848b52c67b1
--- /dev/null
+++ b/fs/lockd/clnt4xdr.c
@@ -0,0 +1,605 @@
1/*
2 * linux/fs/lockd/clnt4xdr.c
3 *
4 * XDR functions to encode/decode NLM version 4 RPC arguments and results.
5 *
6 * NLM client-side only.
7 *
8 * Copyright (C) 2010, Oracle. All rights reserved.
9 */
10
11#include <linux/types.h>
12#include <linux/sunrpc/xdr.h>
13#include <linux/sunrpc/clnt.h>
14#include <linux/sunrpc/stats.h>
15#include <linux/lockd/lockd.h>
16
17#define NLMDBG_FACILITY NLMDBG_XDR
18
19#if (NLMCLNT_OHSIZE > XDR_MAX_NETOBJ)
20# error "NLM host name cannot be larger than XDR_MAX_NETOBJ!"
21#endif
22
23#if (NLMCLNT_OHSIZE > NLM_MAXSTRLEN)
24# error "NLM host name cannot be larger than NLM's maximum string length!"
25#endif
26
27/*
28 * Declare the space requirements for NLM arguments and replies as
29 * number of 32bit-words
30 */
31#define NLM4_void_sz (0)
32#define NLM4_cookie_sz (1+(NLM_MAXCOOKIELEN>>2))
33#define NLM4_caller_sz (1+(NLMCLNT_OHSIZE>>2))
34#define NLM4_owner_sz (1+(NLMCLNT_OHSIZE>>2))
35#define NLM4_fhandle_sz (1+(NFS3_FHSIZE>>2))
36#define NLM4_lock_sz (5+NLM4_caller_sz+NLM4_owner_sz+NLM4_fhandle_sz)
37#define NLM4_holder_sz (6+NLM4_owner_sz)
38
39#define NLM4_testargs_sz (NLM4_cookie_sz+1+NLM4_lock_sz)
40#define NLM4_lockargs_sz (NLM4_cookie_sz+4+NLM4_lock_sz)
41#define NLM4_cancargs_sz (NLM4_cookie_sz+2+NLM4_lock_sz)
42#define NLM4_unlockargs_sz (NLM4_cookie_sz+NLM4_lock_sz)
43
44#define NLM4_testres_sz (NLM4_cookie_sz+1+NLM4_holder_sz)
45#define NLM4_res_sz (NLM4_cookie_sz+1)
46#define NLM4_norep_sz (0)
47
48
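
These _sz constants count 32-bit XDR words and are typically consumed
as maximum argument/reply buffer sizes by the rpc_procinfo table that
registers the encode/decode functions below. A sketch of one such entry
(the table, the decoder name and the cast typedefs are assumptions
following this series' naming scheme, not part of this hunk):

    static struct rpc_procinfo nlm4_procedures[] = {
            [NLMPROC_TEST] = {
                    .p_proc    = NLMPROC_TEST,
                    .p_encode  = (kxdreproc_t)nlm4_xdr_enc_testargs,
                    .p_decode  = (kxdrdproc_t)nlm4_xdr_dec_testres,
                    .p_arglen  = NLM4_testargs_sz,
                    .p_replen  = NLM4_testres_sz,
                    .p_name    = "TEST",
            },
    };
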
49static s64 loff_t_to_s64(loff_t offset)
50{
51 s64 res;
52
53 if (offset >= NLM4_OFFSET_MAX)
54 res = NLM4_OFFSET_MAX;
55 else if (offset <= -NLM4_OFFSET_MAX)
56 res = -NLM4_OFFSET_MAX;
57 else
58 res = offset;
59 return res;
60}
61
62static void nlm4_compute_offsets(const struct nlm_lock *lock,
63 u64 *l_offset, u64 *l_len)
64{
65 const struct file_lock *fl = &lock->fl;
66
67 BUG_ON(fl->fl_start > NLM4_OFFSET_MAX);
68 BUG_ON(fl->fl_end > NLM4_OFFSET_MAX &&
69 fl->fl_end != OFFSET_MAX);
70
71 *l_offset = loff_t_to_s64(fl->fl_start);
72 if (fl->fl_end == OFFSET_MAX)
73 *l_len = 0;
74 else
75 *l_len = loff_t_to_s64(fl->fl_end - fl->fl_start + 1);
76}
77
78/*
79 * Handle decode buffer overflows out-of-line.
80 */
81static void print_overflow_msg(const char *func, const struct xdr_stream *xdr)
82{
83 dprintk("lockd: %s prematurely hit the end of our receive buffer. "
84 "Remaining buffer length is %tu words.\n",
85 func, xdr->end - xdr->p);
86}
87
88
89/*
90 * Encode/decode NLMv4 basic data types
91 *
92 * Basic NLMv4 data types are defined in Appendix II, section 6.1.4
93 * of RFC 1813: "NFS Version 3 Protocol Specification" and in Chapter
94 * 10 of X/Open's "Protocols for Interworking: XNFS, Version 3W".
95 *
96 * Not all basic data types have their own encoding and decoding
97 * functions. For run-time efficiency, some data types are encoded
98 * or decoded inline.
99 */
100
101static void encode_bool(struct xdr_stream *xdr, const int value)
102{
103 __be32 *p;
104
105 p = xdr_reserve_space(xdr, 4);
106 *p = value ? xdr_one : xdr_zero;
107}
108
109static void encode_int32(struct xdr_stream *xdr, const s32 value)
110{
111 __be32 *p;
112
113 p = xdr_reserve_space(xdr, 4);
114 *p = cpu_to_be32(value);
115}
116
117/*
118 * typedef opaque netobj<MAXNETOBJ_SZ>
119 */
120static void encode_netobj(struct xdr_stream *xdr,
121 const u8 *data, const unsigned int length)
122{
123 __be32 *p;
124
125 BUG_ON(length > XDR_MAX_NETOBJ);
126 p = xdr_reserve_space(xdr, 4 + length);
127 xdr_encode_opaque(p, data, length);
128}
129
130static int decode_netobj(struct xdr_stream *xdr,
131 struct xdr_netobj *obj)
132{
133 u32 length;
134 __be32 *p;
135
136 p = xdr_inline_decode(xdr, 4);
137 if (unlikely(p == NULL))
138 goto out_overflow;
139 length = be32_to_cpup(p++);
140 if (unlikely(length > XDR_MAX_NETOBJ))
141 goto out_size;
142 obj->len = length;
143 obj->data = (u8 *)p;
144 return 0;
145out_size:
146 dprintk("NFS: returned netobj was too long: %u\n", length);
147 return -EIO;
148out_overflow:
149 print_overflow_msg(__func__, xdr);
150 return -EIO;
151}
152
153/*
154 * netobj cookie;
155 */
156static void encode_cookie(struct xdr_stream *xdr,
157 const struct nlm_cookie *cookie)
158{
159 BUG_ON(cookie->len > NLM_MAXCOOKIELEN);
160 encode_netobj(xdr, (u8 *)&cookie->data, cookie->len);
161}
162
163static int decode_cookie(struct xdr_stream *xdr,
164 struct nlm_cookie *cookie)
165{
166 u32 length;
167 __be32 *p;
168
169 p = xdr_inline_decode(xdr, 4);
170 if (unlikely(p == NULL))
171 goto out_overflow;
172 length = be32_to_cpup(p++);
173 /* apparently HPUX can return empty cookies */
174 if (length == 0)
175 goto out_hpux;
176 if (length > NLM_MAXCOOKIELEN)
177 goto out_size;
178 p = xdr_inline_decode(xdr, length);
179 if (unlikely(p == NULL))
180 goto out_overflow;
181 cookie->len = length;
182 memcpy(cookie->data, p, length);
183 return 0;
184out_hpux:
185 cookie->len = 4;
186 memset(cookie->data, 0, 4);
187 return 0;
188out_size:
189 dprintk("NFS: returned cookie was too long: %u\n", length);
190 return -EIO;
191out_overflow:
192 print_overflow_msg(__func__, xdr);
193 return -EIO;
194}
195
196/*
197 * netobj fh;
198 */
199static void encode_fh(struct xdr_stream *xdr, const struct nfs_fh *fh)
200{
201 BUG_ON(fh->size > NFS3_FHSIZE);
202 encode_netobj(xdr, (u8 *)&fh->data, fh->size);
203}
204
205/*
206 * enum nlm4_stats {
207 * NLM4_GRANTED = 0,
208 * NLM4_DENIED = 1,
209 * NLM4_DENIED_NOLOCKS = 2,
210 * NLM4_BLOCKED = 3,
211 * NLM4_DENIED_GRACE_PERIOD = 4,
212 * NLM4_DEADLCK = 5,
213 * NLM4_ROFS = 6,
214 * NLM4_STALE_FH = 7,
215 * NLM4_FBIG = 8,
216 * NLM4_FAILED = 9
217 * };
218 *
219 * struct nlm4_stat {
220 * nlm4_stats stat;
221 * };
222 *
223 * NB: we don't swap bytes for the NLM status values. The upper
224 * layers deal directly with the status value in network byte
225 * order.
226 */
227static void encode_nlm4_stat(struct xdr_stream *xdr,
228 const __be32 stat)
229{
230 __be32 *p;
231
232 BUG_ON(be32_to_cpu(stat) > NLM_FAILED);
233 p = xdr_reserve_space(xdr, 4);
234 *p = stat;
235}
236
237static int decode_nlm4_stat(struct xdr_stream *xdr, __be32 *stat)
238{
239 __be32 *p;
240
241 p = xdr_inline_decode(xdr, 4);
242 if (unlikely(p == NULL))
243 goto out_overflow;
244 if (unlikely(*p > nlm4_failed))
245 goto out_bad_xdr;
246 *stat = *p;
247 return 0;
248out_bad_xdr:
249 dprintk("%s: server returned invalid nlm4_stats value: %u\n",
250 __func__, be32_to_cpup(p));
251 return -EIO;
252out_overflow:
253 print_overflow_msg(__func__, xdr);
254 return -EIO;
255}
256
257/*
258 * struct nlm4_holder {
259 * bool exclusive;
260 * int32 svid;
261 * netobj oh;
262 * uint64 l_offset;
263 * uint64 l_len;
264 * };
265 */
266static void encode_nlm4_holder(struct xdr_stream *xdr,
267 const struct nlm_res *result)
268{
269 const struct nlm_lock *lock = &result->lock;
270 u64 l_offset, l_len;
271 __be32 *p;
272
273 encode_bool(xdr, lock->fl.fl_type == F_RDLCK);
274 encode_int32(xdr, lock->svid);
275 encode_netobj(xdr, lock->oh.data, lock->oh.len);
276
277 p = xdr_reserve_space(xdr, 4 + 4);
278 nlm4_compute_offsets(lock, &l_offset, &l_len);
279 p = xdr_encode_hyper(p, l_offset);
280 xdr_encode_hyper(p, l_len);
281}
282
283static int decode_nlm4_holder(struct xdr_stream *xdr, struct nlm_res *result)
284{
285 struct nlm_lock *lock = &result->lock;
286 struct file_lock *fl = &lock->fl;
287 u64 l_offset, l_len;
288 u32 exclusive;
289 int error;
290 __be32 *p;
291 s32 end;
292
293 memset(lock, 0, sizeof(*lock));
294 locks_init_lock(fl);
295
296 p = xdr_inline_decode(xdr, 4 + 4);
297 if (unlikely(p == NULL))
298 goto out_overflow;
299 exclusive = be32_to_cpup(p++);
300 lock->svid = be32_to_cpup(p);
301 fl->fl_pid = (pid_t)lock->svid;
302
303 error = decode_netobj(xdr, &lock->oh);
304 if (unlikely(error))
305 goto out;
306
307 p = xdr_inline_decode(xdr, 8 + 8);
308 if (unlikely(p == NULL))
309 goto out_overflow;
310
311 fl->fl_flags = FL_POSIX;
312 fl->fl_type = exclusive != 0 ? F_WRLCK : F_RDLCK;
313 p = xdr_decode_hyper(p, &l_offset);
314 xdr_decode_hyper(p, &l_len);
315 end = l_offset + l_len - 1;
316
317 fl->fl_start = (loff_t)l_offset;
318 if (l_len == 0 || end < 0)
319 fl->fl_end = OFFSET_MAX;
320 else
321 fl->fl_end = (loff_t)end;
322 error = 0;
323out:
324 return error;
325out_overflow:
326 print_overflow_msg(__func__, xdr);
327 return -EIO;
328}
329
330/*
331 * string caller_name<LM_MAXSTRLEN>;
332 */
333static void encode_caller_name(struct xdr_stream *xdr, const char *name)
334{
335 /* NB: client-side does not set lock->len */
336 u32 length = strlen(name);
337 __be32 *p;
338
339 BUG_ON(length > NLM_MAXSTRLEN);
340 p = xdr_reserve_space(xdr, 4 + length);
341 xdr_encode_opaque(p, name, length);
342}
343
344/*
345 * struct nlm4_lock {
346 * string caller_name<LM_MAXSTRLEN>;
347 * netobj fh;
348 * netobj oh;
349 * int32 svid;
350 * uint64 l_offset;
351 * uint64 l_len;
352 * };
353 */
354static void encode_nlm4_lock(struct xdr_stream *xdr,
355 const struct nlm_lock *lock)
356{
357 u64 l_offset, l_len;
358 __be32 *p;
359
360 encode_caller_name(xdr, lock->caller);
361 encode_fh(xdr, &lock->fh);
362 encode_netobj(xdr, lock->oh.data, lock->oh.len);
363
364 p = xdr_reserve_space(xdr, 4 + 8 + 8);
365 *p++ = cpu_to_be32(lock->svid);
366
367 nlm4_compute_offsets(lock, &l_offset, &l_len);
368 p = xdr_encode_hyper(p, l_offset);
369 xdr_encode_hyper(p, l_len);
370}
371
372
373/*
374 * NLMv4 XDR encode functions
375 *
376 * NLMv4 argument types are defined in Appendix II of RFC 1813:
377 * "NFS Version 3 Protocol Specification" and Chapter 10 of X/Open's
378 * "Protocols for Interworking: XNFS, Version 3W".
379 */
380
381/*
382 * struct nlm4_testargs {
383 * netobj cookie;
384 * bool exclusive;
385 * struct nlm4_lock alock;
386 * };
387 */
388static void nlm4_xdr_enc_testargs(struct rpc_rqst *req,
389 struct xdr_stream *xdr,
390 const struct nlm_args *args)
391{
392 const struct nlm_lock *lock = &args->lock;
393
394 encode_cookie(xdr, &args->cookie);
395 encode_bool(xdr, lock->fl.fl_type == F_WRLCK);
396 encode_nlm4_lock(xdr, lock);
397}
398
399/*
400 * struct nlm4_lockargs {
401 * netobj cookie;
402 * bool block;
403 * bool exclusive;
404 * struct nlm4_lock alock;
405 * bool reclaim;
406 * int state;
407 * };
408 */
409static void nlm4_xdr_enc_lockargs(struct rpc_rqst *req,
410 struct xdr_stream *xdr,
411 const struct nlm_args *args)
412{
413 const struct nlm_lock *lock = &args->lock;
414
415 encode_cookie(xdr, &args->cookie);
416 encode_bool(xdr, args->block);
417 encode_bool(xdr, lock->fl.fl_type == F_WRLCK);
418 encode_nlm4_lock(xdr, lock);
419 encode_bool(xdr, args->reclaim);
420 encode_int32(xdr, args->state);
421}
422
423/*
424 * struct nlm4_cancargs {
425 * netobj cookie;
426 * bool block;
427 * bool exclusive;
428 * struct nlm4_lock alock;
429 * };
430 */
431static void nlm4_xdr_enc_cancargs(struct rpc_rqst *req,
432 struct xdr_stream *xdr,
433 const struct nlm_args *args)
434{
435 const struct nlm_lock *lock = &args->lock;
436
437 encode_cookie(xdr, &args->cookie);
438 encode_bool(xdr, args->block);
439 encode_bool(xdr, lock->fl.fl_type == F_WRLCK);
440 encode_nlm4_lock(xdr, lock);
441}
442
443/*
444 * struct nlm4_unlockargs {
445 * netobj cookie;
446 * struct nlm4_lock alock;
447 * };
448 */
449static void nlm4_xdr_enc_unlockargs(struct rpc_rqst *req,
450 struct xdr_stream *xdr,
451 const struct nlm_args *args)
452{
453 const struct nlm_lock *lock = &args->lock;
454
455 encode_cookie(xdr, &args->cookie);
456 encode_nlm4_lock(xdr, lock);
457}
458
459/*
460 * struct nlm4_res {
461 * netobj cookie;
462 * nlm4_stat stat;
463 * };
464 */
465static void nlm4_xdr_enc_res(struct rpc_rqst *req,
466 struct xdr_stream *xdr,
467 const struct nlm_res *result)
468{
469 encode_cookie(xdr, &result->cookie);
470 encode_nlm4_stat(xdr, result->status);
471}
472
473/*
474 * union nlm4_testrply switch (nlm4_stats stat) {
475 * case NLM4_DENIED:
476 * struct nlm4_holder holder;
477 * default:
478 * void;
479 * };
480 *
481 * struct nlm4_testres {
482 * netobj cookie;
483 * nlm4_testrply test_stat;
484 * };
485 */
486static void nlm4_xdr_enc_testres(struct rpc_rqst *req,
487 struct xdr_stream *xdr,
488 const struct nlm_res *result)
489{
490 encode_cookie(xdr, &result->cookie);
491 encode_nlm4_stat(xdr, result->status);
492 if (result->status == nlm_lck_denied)
493 encode_nlm4_holder(xdr, result);
494}
495
496
497/*
498 * NLMv4 XDR decode functions
499 *
500 * NLMv4 result types are defined in Appendix II of RFC 1813:
501 * "NFS Version 3 Protocol Specification" and Chapter 10 of X/Open's
502 * "Protocols for Interworking: XNFS, Version 3W".
503 */
504
505/*
506 * union nlm4_testrply switch (nlm4_stats stat) {
507 * case NLM4_DENIED:
508 * struct nlm4_holder holder;
509 * default:
510 * void;
511 * };
512 *
513 * struct nlm4_testres {
514 * netobj cookie;
515 * nlm4_testrply test_stat;
516 * };
517 */
518static int decode_nlm4_testrply(struct xdr_stream *xdr,
519 struct nlm_res *result)
520{
521 int error;
522
523 error = decode_nlm4_stat(xdr, &result->status);
524 if (unlikely(error))
525 goto out;
526 if (result->status == nlm_lck_denied)
527 error = decode_nlm4_holder(xdr, result);
528out:
529 return error;
530}
531
532static int nlm4_xdr_dec_testres(struct rpc_rqst *req,
533 struct xdr_stream *xdr,
534 struct nlm_res *result)
535{
536 int error;
537
538 error = decode_cookie(xdr, &result->cookie);
539 if (unlikely(error))
540 goto out;
541 error = decode_nlm4_testrply(xdr, result);
542out:
543 return error;
544}
545
546/*
547 * struct nlm4_res {
548 * netobj cookie;
549 * nlm4_stat stat;
550 * };
551 */
552static int nlm4_xdr_dec_res(struct rpc_rqst *req,
553 struct xdr_stream *xdr,
554 struct nlm_res *result)
555{
556 int error;
557
558 error = decode_cookie(xdr, &result->cookie);
559 if (unlikely(error))
560 goto out;
561 error = decode_nlm4_stat(xdr, &result->status);
562out:
563 return error;
564}
565
566
567/*
568 * For NLM, a void procedure really returns nothing
569 */
570#define nlm4_xdr_dec_norep NULL
571
572#define PROC(proc, argtype, restype) \
573[NLMPROC_##proc] = { \
574 .p_proc = NLMPROC_##proc, \
575 .p_encode = (kxdreproc_t)nlm4_xdr_enc_##argtype, \
576 .p_decode = (kxdrdproc_t)nlm4_xdr_dec_##restype, \
577 .p_arglen = NLM4_##argtype##_sz, \
578 .p_replen = NLM4_##restype##_sz, \
579 .p_statidx = NLMPROC_##proc, \
580 .p_name = #proc, \
581 }
582
583static struct rpc_procinfo nlm4_procedures[] = {
584 PROC(TEST, testargs, testres),
585 PROC(LOCK, lockargs, res),
586 PROC(CANCEL, cancargs, res),
587 PROC(UNLOCK, unlockargs, res),
588 PROC(GRANTED, testargs, res),
589 PROC(TEST_MSG, testargs, norep),
590 PROC(LOCK_MSG, lockargs, norep),
591 PROC(CANCEL_MSG, cancargs, norep),
592 PROC(UNLOCK_MSG, unlockargs, norep),
593 PROC(GRANTED_MSG, testargs, norep),
594 PROC(TEST_RES, testres, norep),
595 PROC(LOCK_RES, res, norep),
596 PROC(CANCEL_RES, res, norep),
597 PROC(UNLOCK_RES, res, norep),
598 PROC(GRANTED_RES, res, norep),
599};
600
601struct rpc_version nlm_version4 = {
602 .number = 4,
603 .nrprocs = ARRAY_SIZE(nlm4_procedures),
604 .procs = nlm4_procedures,
605};
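The PROC() macro above builds the procedure table by token pasting (##) and stringification (#). A toy stand-alone version of the same pattern, with hypothetical types in place of struct rpc_procinfo:

#include <stdio.h>

enum { DEMOPROC_TEST = 1, DEMOPROC_LOCK = 2, DEMOPROC_MAX };

struct demo_procinfo {
	unsigned int p_proc;		/* procedure number */
	const char *p_name;		/* stringified name, for stats */
};

#define DEMO_PROC(proc)				\
[DEMOPROC_##proc] = {				\
	.p_proc = DEMOPROC_##proc,		\
	.p_name = #proc,			\
}

static const struct demo_procinfo demo_procedures[DEMOPROC_MAX] = {
	DEMO_PROC(TEST),	/* [1] = { .p_proc = 1, .p_name = "TEST" } */
	DEMO_PROC(LOCK),
};

int main(void)
{
	printf("%u %s\n", demo_procedures[DEMOPROC_TEST].p_proc,
	       demo_procedures[DEMOPROC_TEST].p_name);
	return 0;
}
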
diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c
index 25509eb28fd7..8d4ea8351e3d 100644
--- a/fs/lockd/clntlock.c
+++ b/fs/lockd/clntlock.c
@@ -79,7 +79,7 @@ EXPORT_SYMBOL_GPL(nlmclnt_init);
79 */ 79 */
80void nlmclnt_done(struct nlm_host *host) 80void nlmclnt_done(struct nlm_host *host)
81{ 81{
82 nlm_release_host(host); 82 nlmclnt_release_host(host);
83 lockd_down(); 83 lockd_down();
84} 84}
85EXPORT_SYMBOL_GPL(nlmclnt_done); 85EXPORT_SYMBOL_GPL(nlmclnt_done);
@@ -273,7 +273,7 @@ restart:
273 spin_unlock(&nlm_blocked_lock); 273 spin_unlock(&nlm_blocked_lock);
274 274
275 /* Release host handle after use */ 275 /* Release host handle after use */
276 nlm_release_host(host); 276 nlmclnt_release_host(host);
277 lockd_down(); 277 lockd_down();
278 return 0; 278 return 0;
279} 279}
diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index 332c54cf75e0..adb45ec9038c 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -58,7 +58,7 @@ static void nlm_put_lockowner(struct nlm_lockowner *lockowner)
58 return; 58 return;
59 list_del(&lockowner->list); 59 list_del(&lockowner->list);
60 spin_unlock(&lockowner->host->h_lock); 60 spin_unlock(&lockowner->host->h_lock);
61 nlm_release_host(lockowner->host); 61 nlmclnt_release_host(lockowner->host);
62 kfree(lockowner); 62 kfree(lockowner);
63} 63}
64 64
@@ -207,22 +207,22 @@ struct nlm_rqst *nlm_alloc_call(struct nlm_host *host)
207 printk("nlm_alloc_call: failed, waiting for memory\n"); 207 printk("nlm_alloc_call: failed, waiting for memory\n");
208 schedule_timeout_interruptible(5*HZ); 208 schedule_timeout_interruptible(5*HZ);
209 } 209 }
210 nlm_release_host(host); 210 nlmclnt_release_host(host);
211 return NULL; 211 return NULL;
212} 212}
213 213
214void nlm_release_call(struct nlm_rqst *call) 214void nlmclnt_release_call(struct nlm_rqst *call)
215{ 215{
216 if (!atomic_dec_and_test(&call->a_count)) 216 if (!atomic_dec_and_test(&call->a_count))
217 return; 217 return;
218 nlm_release_host(call->a_host); 218 nlmclnt_release_host(call->a_host);
219 nlmclnt_release_lockargs(call); 219 nlmclnt_release_lockargs(call);
220 kfree(call); 220 kfree(call);
221} 221}
222 222
223static void nlmclnt_rpc_release(void *data) 223static void nlmclnt_rpc_release(void *data)
224{ 224{
225 nlm_release_call(data); 225 nlmclnt_release_call(data);
226} 226}
227 227
228static int nlm_wait_on_grace(wait_queue_head_t *queue) 228static int nlm_wait_on_grace(wait_queue_head_t *queue)
@@ -436,7 +436,7 @@ nlmclnt_test(struct nlm_rqst *req, struct file_lock *fl)
436 status = nlm_stat_to_errno(req->a_res.status); 436 status = nlm_stat_to_errno(req->a_res.status);
437 } 437 }
438out: 438out:
439 nlm_release_call(req); 439 nlmclnt_release_call(req);
440 return status; 440 return status;
441} 441}
442 442
@@ -593,7 +593,7 @@ again:
593out_unblock: 593out_unblock:
594 nlmclnt_finish_block(block); 594 nlmclnt_finish_block(block);
595out: 595out:
596 nlm_release_call(req); 596 nlmclnt_release_call(req);
597 return status; 597 return status;
598out_unlock: 598out_unlock:
599 /* Fatal error: ensure that we remove the lock altogether */ 599 /* Fatal error: ensure that we remove the lock altogether */
@@ -694,7 +694,7 @@ nlmclnt_unlock(struct nlm_rqst *req, struct file_lock *fl)
694 /* What to do now? I'm out of my depth... */ 694 /* What to do now? I'm out of my depth... */
695 status = -ENOLCK; 695 status = -ENOLCK;
696out: 696out:
697 nlm_release_call(req); 697 nlmclnt_release_call(req);
698 return status; 698 return status;
699} 699}
700 700
@@ -755,7 +755,7 @@ static int nlmclnt_cancel(struct nlm_host *host, int block, struct file_lock *fl
755 NLMPROC_CANCEL, &nlmclnt_cancel_ops); 755 NLMPROC_CANCEL, &nlmclnt_cancel_ops);
756 if (status == 0 && req->a_res.status == nlm_lck_denied) 756 if (status == 0 && req->a_res.status == nlm_lck_denied)
757 status = -ENOLCK; 757 status = -ENOLCK;
758 nlm_release_call(req); 758 nlmclnt_release_call(req);
759 return status; 759 return status;
760} 760}
761 761
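The renamed nlmclnt_release_call() above is the classic drop-the-last-reference idiom: every holder decrements, and only the caller that brings the count to zero tears the object down. A user-space sketch of the same shape using C11 atomics (all names hypothetical):

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct demo_call {
	atomic_int a_count;
};

static void demo_release_call(struct demo_call *call)
{
	/* fetch_sub returns the old value; 1 means we held the last ref */
	if (atomic_fetch_sub(&call->a_count, 1) != 1)
		return;
	/* last reference gone: free owned resources, then the object */
	free(call);
}

int main(void)
{
	struct demo_call *call = malloc(sizeof(*call));

	if (call == NULL)
		return 1;
	atomic_init(&call->a_count, 2);
	demo_release_call(call);	/* another holder remains */
	demo_release_call(call);	/* frees here */
	return 0;
}
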
diff --git a/fs/lockd/clntxdr.c b/fs/lockd/clntxdr.c
new file mode 100644
index 000000000000..180ac34feb9a
--- /dev/null
+++ b/fs/lockd/clntxdr.c
@@ -0,0 +1,627 @@
1/*
2 * linux/fs/lockd/clntxdr.c
3 *
4 * XDR functions to encode/decode NLM version 3 RPC arguments and results.
5 * NLM version 3 is backwards compatible with NLM versions 1 and 2.
6 *
7 * NLM client-side only.
8 *
9 * Copyright (C) 2010, Oracle. All rights reserved.
10 */
11
12#include <linux/types.h>
13#include <linux/sunrpc/xdr.h>
14#include <linux/sunrpc/clnt.h>
15#include <linux/sunrpc/stats.h>
16#include <linux/lockd/lockd.h>
17
18#define NLMDBG_FACILITY NLMDBG_XDR
19
20#if (NLMCLNT_OHSIZE > XDR_MAX_NETOBJ)
21# error "NLM host name cannot be larger than XDR_MAX_NETOBJ!"
22#endif
23
24/*
25 * Declare the space requirements for NLM arguments and replies as
26 * number of 32-bit words
27 */
28#define NLM_cookie_sz (1+(NLM_MAXCOOKIELEN>>2))
29#define NLM_caller_sz (1+(NLMCLNT_OHSIZE>>2))
30#define NLM_owner_sz (1+(NLMCLNT_OHSIZE>>2))
31#define NLM_fhandle_sz (1+(NFS2_FHSIZE>>2))
32#define NLM_lock_sz (3+NLM_caller_sz+NLM_owner_sz+NLM_fhandle_sz)
33#define NLM_holder_sz (4+NLM_owner_sz)
34
35#define NLM_testargs_sz (NLM_cookie_sz+1+NLM_lock_sz)
36#define NLM_lockargs_sz (NLM_cookie_sz+4+NLM_lock_sz)
37#define NLM_cancargs_sz (NLM_cookie_sz+2+NLM_lock_sz)
38#define NLM_unlockargs_sz (NLM_cookie_sz+NLM_lock_sz)
39
40#define NLM_testres_sz (NLM_cookie_sz+1+NLM_holder_sz)
41#define NLM_res_sz (NLM_cookie_sz+1)
42#define NLM_norep_sz (0)
43
44
45static s32 loff_t_to_s32(loff_t offset)
46{
47 s32 res;
48
49 if (offset >= NLM_OFFSET_MAX)
50 res = NLM_OFFSET_MAX;
51 else if (offset <= -NLM_OFFSET_MAX)
52 res = -NLM_OFFSET_MAX;
53 else
54 res = offset;
55 return res;
56}
57
58static void nlm_compute_offsets(const struct nlm_lock *lock,
59 u32 *l_offset, u32 *l_len)
60{
61 const struct file_lock *fl = &lock->fl;
62
63 BUG_ON(fl->fl_start > NLM_OFFSET_MAX);
64 BUG_ON(fl->fl_end > NLM_OFFSET_MAX &&
65 fl->fl_end != OFFSET_MAX);
66
67 *l_offset = loff_t_to_s32(fl->fl_start);
68 if (fl->fl_end == OFFSET_MAX)
69 *l_len = 0;
70 else
71 *l_len = loff_t_to_s32(fl->fl_end - fl->fl_start + 1);
72}
73
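NLMv3 carries only 32-bit offsets on the wire, which is why loff_t_to_s32() above clamps out-of-range values instead of letting them wrap. A stand-alone sketch of the clamp, with NLM_OFFSET_MAX_DEMO standing in for the kernel's NLM_OFFSET_MAX:

#include <stdint.h>
#include <stdio.h>

#define NLM_OFFSET_MAX_DEMO ((int64_t)0x7fffffff)

static int32_t demo_loff_to_s32(int64_t offset)
{
	if (offset >= NLM_OFFSET_MAX_DEMO)
		return (int32_t)NLM_OFFSET_MAX_DEMO;	/* clamp, don't wrap */
	if (offset <= -NLM_OFFSET_MAX_DEMO)
		return (int32_t)-NLM_OFFSET_MAX_DEMO;
	return (int32_t)offset;
}

int main(void)
{
	printf("%d\n", (int)demo_loff_to_s32(5LL << 32));	/* 2147483647 */
	printf("%d\n", (int)demo_loff_to_s32(4096));		/* 4096 */
	return 0;
}
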
74/*
75 * Handle decode buffer overflows out-of-line.
76 */
77static void print_overflow_msg(const char *func, const struct xdr_stream *xdr)
78{
79 dprintk("lockd: %s prematurely hit the end of our receive buffer. "
80 "Remaining buffer length is %tu words.\n",
81 func, xdr->end - xdr->p);
82}
83
84
85/*
86 * Encode/decode NLMv3 basic data types
87 *
88 * Basic NLMv3 data types are not defined in an IETF standards
89 * document. X/Open has a description of these data types that
90 * is useful. See Chapter 10 of "Protocols for Interworking:
91 * XNFS, Version 3W".
92 *
93 * Not all basic data types have their own encoding and decoding
94 * functions. For run-time efficiency, some data types are encoded
95 * or decoded inline.
96 */
97
98static void encode_bool(struct xdr_stream *xdr, const int value)
99{
100 __be32 *p;
101
102 p = xdr_reserve_space(xdr, 4);
103 *p = value ? xdr_one : xdr_zero;
104}
105
106static void encode_int32(struct xdr_stream *xdr, const s32 value)
107{
108 __be32 *p;
109
110 p = xdr_reserve_space(xdr, 4);
111 *p = cpu_to_be32(value);
112}
113
114/*
115 * typedef opaque netobj<MAXNETOBJ_SZ>
116 */
117static void encode_netobj(struct xdr_stream *xdr,
118 const u8 *data, const unsigned int length)
119{
120 __be32 *p;
121
122 BUG_ON(length > XDR_MAX_NETOBJ);
123 p = xdr_reserve_space(xdr, 4 + length);
124 xdr_encode_opaque(p, data, length);
125}
126
127static int decode_netobj(struct xdr_stream *xdr,
128 struct xdr_netobj *obj)
129{
130 u32 length;
131 __be32 *p;
132
133 p = xdr_inline_decode(xdr, 4);
134 if (unlikely(p == NULL))
135 goto out_overflow;
136 length = be32_to_cpup(p++);
137 if (unlikely(length > XDR_MAX_NETOBJ))
138 goto out_size;
139 obj->len = length;
140 obj->data = (u8 *)p;
141 return 0;
142out_size:
143 dprintk("NFS: returned netobj was too long: %u\n", length);
144 return -EIO;
145out_overflow:
146 print_overflow_msg(__func__, xdr);
147 return -EIO;
148}
149
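A netobj is a standard XDR variable-length opaque: a four-byte big-endian length, the payload, then zero padding to the next four-byte boundary. A minimal user-space encoder showing that wire layout (the caller guarantees buf is large enough):

#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static size_t demo_encode_netobj(uint8_t *buf, const uint8_t *data,
				 uint32_t len)
{
	uint32_t be_len = htonl(len);
	size_t padded = (len + 3) & ~3u;	/* round up to 4 bytes */

	memcpy(buf, &be_len, 4);		/* length word */
	memcpy(buf + 4, data, len);		/* payload */
	memset(buf + 4 + len, 0, padded - len);	/* XDR pad bytes */
	return 4 + padded;
}

int main(void)
{
	uint8_t buf[32];
	size_t n = demo_encode_netobj(buf, (const uint8_t *)"abcde", 5);

	printf("encoded %zu bytes\n", n);	/* 4 + 8 = 12 */
	return 0;
}
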
150/*
151 * netobj cookie;
152 */
153static void encode_cookie(struct xdr_stream *xdr,
154 const struct nlm_cookie *cookie)
155{
156 BUG_ON(cookie->len > NLM_MAXCOOKIELEN);
157 encode_netobj(xdr, (u8 *)&cookie->data, cookie->len);
158}
159
160static int decode_cookie(struct xdr_stream *xdr,
161 struct nlm_cookie *cookie)
162{
163 u32 length;
164 __be32 *p;
165
166 p = xdr_inline_decode(xdr, 4);
167 if (unlikely(p == NULL))
168 goto out_overflow;
169 length = be32_to_cpup(p++);
170 /* apparently HPUX can return empty cookies */
171 if (length == 0)
172 goto out_hpux;
173 if (length > NLM_MAXCOOKIELEN)
174 goto out_size;
175 p = xdr_inline_decode(xdr, length);
176 if (unlikely(p == NULL))
177 goto out_overflow;
178 cookie->len = length;
179 memcpy(cookie->data, p, length);
180 return 0;
181out_hpux:
182 cookie->len = 4;
183 memset(cookie->data, 0, 4);
184 return 0;
185out_size:
186 dprintk("NFS: returned cookie was too long: %u\n", length);
187 return -EIO;
188out_overflow:
189 print_overflow_msg(__func__, xdr);
190 return -EIO;
191}
192
193/*
194 * netobj fh;
195 */
196static void encode_fh(struct xdr_stream *xdr, const struct nfs_fh *fh)
197{
198 BUG_ON(fh->size != NFS2_FHSIZE);
199 encode_netobj(xdr, (u8 *)&fh->data, NFS2_FHSIZE);
200}
201
202/*
203 * enum nlm_stats {
204 * LCK_GRANTED = 0,
205 * LCK_DENIED = 1,
206 * LCK_DENIED_NOLOCKS = 2,
207 * LCK_BLOCKED = 3,
208 * LCK_DENIED_GRACE_PERIOD = 4
209 * };
210 *
211 *
212 * struct nlm_stat {
213 * nlm_stats stat;
214 * };
215 *
216 * NB: we don't swap bytes for the NLM status values. The upper
217 * layers deal directly with the status value in network byte
218 * order.
219 */
220
221static void encode_nlm_stat(struct xdr_stream *xdr,
222 const __be32 stat)
223{
224 __be32 *p;
225
226 BUG_ON(be32_to_cpu(stat) > NLM_LCK_DENIED_GRACE_PERIOD);
227 p = xdr_reserve_space(xdr, 4);
228 *p = stat;
229}
230
231static int decode_nlm_stat(struct xdr_stream *xdr,
232 __be32 *stat)
233{
234 __be32 *p;
235
236 p = xdr_inline_decode(xdr, 4);
237 if (unlikely(p == NULL))
238 goto out_overflow;
239	if (unlikely(be32_to_cpup(p) > be32_to_cpu(nlm_lck_denied_grace_period)))
240 goto out_enum;
241 *stat = *p;
242 return 0;
243out_enum:
244 dprintk("%s: server returned invalid nlm_stats value: %u\n",
245 __func__, be32_to_cpup(p));
246 return -EIO;
247out_overflow:
248 print_overflow_msg(__func__, xdr);
249 return -EIO;
250}
251
252/*
253 * struct nlm_holder {
254 * bool exclusive;
255 * int uppid;
256 * netobj oh;
257 * unsigned l_offset;
258 * unsigned l_len;
259 * };
260 */
261static void encode_nlm_holder(struct xdr_stream *xdr,
262 const struct nlm_res *result)
263{
264 const struct nlm_lock *lock = &result->lock;
265 u32 l_offset, l_len;
266 __be32 *p;
267
268 encode_bool(xdr, lock->fl.fl_type == F_RDLCK);
269 encode_int32(xdr, lock->svid);
270 encode_netobj(xdr, lock->oh.data, lock->oh.len);
271
272 p = xdr_reserve_space(xdr, 4 + 4);
273 nlm_compute_offsets(lock, &l_offset, &l_len);
274 *p++ = cpu_to_be32(l_offset);
275 *p = cpu_to_be32(l_len);
276}
277
278static int decode_nlm_holder(struct xdr_stream *xdr, struct nlm_res *result)
279{
280 struct nlm_lock *lock = &result->lock;
281 struct file_lock *fl = &lock->fl;
282 u32 exclusive, l_offset, l_len;
283 int error;
284 __be32 *p;
285 s32 end;
286
287 memset(lock, 0, sizeof(*lock));
288 locks_init_lock(fl);
289
290 p = xdr_inline_decode(xdr, 4 + 4);
291 if (unlikely(p == NULL))
292 goto out_overflow;
293 exclusive = be32_to_cpup(p++);
294 lock->svid = be32_to_cpup(p);
295 fl->fl_pid = (pid_t)lock->svid;
296
297 error = decode_netobj(xdr, &lock->oh);
298 if (unlikely(error))
299 goto out;
300
301 p = xdr_inline_decode(xdr, 4 + 4);
302 if (unlikely(p == NULL))
303 goto out_overflow;
304
305 fl->fl_flags = FL_POSIX;
306 fl->fl_type = exclusive != 0 ? F_WRLCK : F_RDLCK;
307 l_offset = be32_to_cpup(p++);
308 l_len = be32_to_cpup(p);
309 end = l_offset + l_len - 1;
310
311 fl->fl_start = (loff_t)l_offset;
312 if (l_len == 0 || end < 0)
313 fl->fl_end = OFFSET_MAX;
314 else
315 fl->fl_end = (loff_t)end;
316 error = 0;
317out:
318 return error;
319out_overflow:
320 print_overflow_msg(__func__, xdr);
321 return -EIO;
322}
323
324/*
325 * string caller_name<LM_MAXSTRLEN>;
326 */
327static void encode_caller_name(struct xdr_stream *xdr, const char *name)
328{
329 /* NB: client-side does not set lock->len */
330 u32 length = strlen(name);
331 __be32 *p;
332
333 BUG_ON(length > NLM_MAXSTRLEN);
334 p = xdr_reserve_space(xdr, 4 + length);
335 xdr_encode_opaque(p, name, length);
336}
337
338/*
339 * struct nlm_lock {
340 * string caller_name<LM_MAXSTRLEN>;
341 * netobj fh;
342 * netobj oh;
343 * int uppid;
344 * unsigned l_offset;
345 * unsigned l_len;
346 * };
347 */
348static void encode_nlm_lock(struct xdr_stream *xdr,
349 const struct nlm_lock *lock)
350{
351 u32 l_offset, l_len;
352 __be32 *p;
353
354 encode_caller_name(xdr, lock->caller);
355 encode_fh(xdr, &lock->fh);
356 encode_netobj(xdr, lock->oh.data, lock->oh.len);
357
358 p = xdr_reserve_space(xdr, 4 + 4 + 4);
359 *p++ = cpu_to_be32(lock->svid);
360
361 nlm_compute_offsets(lock, &l_offset, &l_len);
362 *p++ = cpu_to_be32(l_offset);
363 *p = cpu_to_be32(l_len);
364}
365
366
367/*
368 * NLMv3 XDR encode functions
369 *
370 * NLMv3 argument types are defined in Chapter 10 of The Open Group's
371 * "Protocols for Interworking: XNFS, Version 3W".
372 */
373
374/*
375 * struct nlm_testargs {
376 * netobj cookie;
377 * bool exclusive;
378 * struct nlm_lock alock;
379 * };
380 */
381static void nlm_xdr_enc_testargs(struct rpc_rqst *req,
382 struct xdr_stream *xdr,
383 const struct nlm_args *args)
384{
385 const struct nlm_lock *lock = &args->lock;
386
387 encode_cookie(xdr, &args->cookie);
388 encode_bool(xdr, lock->fl.fl_type == F_WRLCK);
389 encode_nlm_lock(xdr, lock);
390}
391
392/*
393 * struct nlm_lockargs {
394 * netobj cookie;
395 * bool block;
396 * bool exclusive;
397 * struct nlm_lock alock;
398 * bool reclaim;
399 * int state;
400 * };
401 */
402static void nlm_xdr_enc_lockargs(struct rpc_rqst *req,
403 struct xdr_stream *xdr,
404 const struct nlm_args *args)
405{
406 const struct nlm_lock *lock = &args->lock;
407
408 encode_cookie(xdr, &args->cookie);
409 encode_bool(xdr, args->block);
410 encode_bool(xdr, lock->fl.fl_type == F_WRLCK);
411 encode_nlm_lock(xdr, lock);
412 encode_bool(xdr, args->reclaim);
413 encode_int32(xdr, args->state);
414}
415
416/*
417 * struct nlm_cancargs {
418 * netobj cookie;
419 * bool block;
420 * bool exclusive;
421 * struct nlm_lock alock;
422 * };
423 */
424static void nlm_xdr_enc_cancargs(struct rpc_rqst *req,
425 struct xdr_stream *xdr,
426 const struct nlm_args *args)
427{
428 const struct nlm_lock *lock = &args->lock;
429
430 encode_cookie(xdr, &args->cookie);
431 encode_bool(xdr, args->block);
432 encode_bool(xdr, lock->fl.fl_type == F_WRLCK);
433 encode_nlm_lock(xdr, lock);
434}
435
436/*
437 * struct nlm_unlockargs {
438 * netobj cookie;
439 * struct nlm_lock alock;
440 * };
441 */
442static void nlm_xdr_enc_unlockargs(struct rpc_rqst *req,
443 struct xdr_stream *xdr,
444 const struct nlm_args *args)
445{
446 const struct nlm_lock *lock = &args->lock;
447
448 encode_cookie(xdr, &args->cookie);
449 encode_nlm_lock(xdr, lock);
450}
451
452/*
453 * struct nlm_res {
454 * netobj cookie;
455 * nlm_stat stat;
456 * };
457 */
458static void nlm_xdr_enc_res(struct rpc_rqst *req,
459 struct xdr_stream *xdr,
460 const struct nlm_res *result)
461{
462 encode_cookie(xdr, &result->cookie);
463 encode_nlm_stat(xdr, result->status);
464}
465
466/*
467 * union nlm_testrply switch (nlm_stats stat) {
468 * case LCK_DENIED:
469 * struct nlm_holder holder;
470 * default:
471 * void;
472 * };
473 *
474 * struct nlm_testres {
475 * netobj cookie;
476 * nlm_testrply test_stat;
477 * };
478 */
479static void encode_nlm_testrply(struct xdr_stream *xdr,
480 const struct nlm_res *result)
481{
482 if (result->status == nlm_lck_denied)
483 encode_nlm_holder(xdr, result);
484}
485
486static void nlm_xdr_enc_testres(struct rpc_rqst *req,
487 struct xdr_stream *xdr,
488 const struct nlm_res *result)
489{
490 encode_cookie(xdr, &result->cookie);
491 encode_nlm_stat(xdr, result->status);
492 encode_nlm_testrply(xdr, result);
493}
494
495
496/*
497 * NLMv3 XDR decode functions
498 *
499 * NLMv3 result types are defined in Chapter 10 of The Open Group's
500 * "Protocols for Interworking: XNFS, Version 3W".
501 */
502
503/*
504 * union nlm_testrply switch (nlm_stats stat) {
505 * case LCK_DENIED:
506 * struct nlm_holder holder;
507 * default:
508 * void;
509 * };
510 *
511 * struct nlm_testres {
512 * netobj cookie;
513 * nlm_testrply test_stat;
514 * };
515 */
516static int decode_nlm_testrply(struct xdr_stream *xdr,
517 struct nlm_res *result)
518{
519 int error;
520
521 error = decode_nlm_stat(xdr, &result->status);
522 if (unlikely(error))
523 goto out;
524 if (result->status == nlm_lck_denied)
525 error = decode_nlm_holder(xdr, result);
526out:
527 return error;
528}
529
530static int nlm_xdr_dec_testres(struct rpc_rqst *req,
531 struct xdr_stream *xdr,
532 struct nlm_res *result)
533{
534 int error;
535
536 error = decode_cookie(xdr, &result->cookie);
537 if (unlikely(error))
538 goto out;
539 error = decode_nlm_testrply(xdr, result);
540out:
541 return error;
542}
543
544/*
545 * struct nlm_res {
546 * netobj cookie;
547 * nlm_stat stat;
548 * };
549 */
550static int nlm_xdr_dec_res(struct rpc_rqst *req,
551 struct xdr_stream *xdr,
552 struct nlm_res *result)
553{
554 int error;
555
556 error = decode_cookie(xdr, &result->cookie);
557 if (unlikely(error))
558 goto out;
559 error = decode_nlm_stat(xdr, &result->status);
560out:
561 return error;
562}
563
564
565/*
566 * For NLM, a void procedure really returns nothing
567 */
568#define nlm_xdr_dec_norep NULL
569
570#define PROC(proc, argtype, restype) \
571[NLMPROC_##proc] = { \
572 .p_proc = NLMPROC_##proc, \
573 .p_encode = (kxdreproc_t)nlm_xdr_enc_##argtype, \
574 .p_decode = (kxdrdproc_t)nlm_xdr_dec_##restype, \
575 .p_arglen = NLM_##argtype##_sz, \
576 .p_replen = NLM_##restype##_sz, \
577 .p_statidx = NLMPROC_##proc, \
578 .p_name = #proc, \
579 }
580
581static struct rpc_procinfo nlm_procedures[] = {
582 PROC(TEST, testargs, testres),
583 PROC(LOCK, lockargs, res),
584 PROC(CANCEL, cancargs, res),
585 PROC(UNLOCK, unlockargs, res),
586 PROC(GRANTED, testargs, res),
587 PROC(TEST_MSG, testargs, norep),
588 PROC(LOCK_MSG, lockargs, norep),
589 PROC(CANCEL_MSG, cancargs, norep),
590 PROC(UNLOCK_MSG, unlockargs, norep),
591 PROC(GRANTED_MSG, testargs, norep),
592 PROC(TEST_RES, testres, norep),
593 PROC(LOCK_RES, res, norep),
594 PROC(CANCEL_RES, res, norep),
595 PROC(UNLOCK_RES, res, norep),
596 PROC(GRANTED_RES, res, norep),
597};
598
599static struct rpc_version nlm_version1 = {
600 .number = 1,
601 .nrprocs = ARRAY_SIZE(nlm_procedures),
602 .procs = nlm_procedures,
603};
604
605static struct rpc_version nlm_version3 = {
606 .number = 3,
607 .nrprocs = ARRAY_SIZE(nlm_procedures),
608 .procs = nlm_procedures,
609};
610
611static struct rpc_version *nlm_versions[] = {
612 [1] = &nlm_version1,
613 [3] = &nlm_version3,
614#ifdef CONFIG_LOCKD_V4
615 [4] = &nlm_version4,
616#endif
617};
618
619static struct rpc_stat nlm_rpc_stats;
620
621struct rpc_program nlm_program = {
622 .name = "lockd",
623 .number = NLM_PROGRAM,
624 .nrvers = ARRAY_SIZE(nlm_versions),
625 .version = nlm_versions,
626 .stats = &nlm_rpc_stats,
627};
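nlm_versions[] above is indexed directly by protocol version number, so the holes stay NULL (there is no NLM version 2) and lookup is a bounds check plus a NULL check. A user-space sketch of that dispatch pattern, names illustrative:

#include <stdio.h>

struct demo_version { unsigned int number; };

static const struct demo_version v1 = { 1 }, v3 = { 3 }, v4 = { 4 };

static const struct demo_version *demo_versions[] = {
	[1] = &v1,
	[3] = &v3,
	[4] = &v4,	/* present only when the v4 option is built in */
};

static const struct demo_version *demo_lookup(unsigned int vers)
{
	size_t nrvers = sizeof(demo_versions) / sizeof(demo_versions[0]);

	if (vers >= nrvers || demo_versions[vers] == NULL)
		return NULL;		/* e.g. version 2 */
	return demo_versions[vers];
}

int main(void)
{
	printf("%s\n", demo_lookup(2) ? "found" : "no such version");
	printf("%u\n", demo_lookup(3)->number);	/* 3 */
	return 0;
}
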
diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index ed0c59fe23ce..5f1bcb2f06f3 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -25,9 +25,22 @@
25#define NLM_HOST_EXPIRE (300 * HZ) 25#define NLM_HOST_EXPIRE (300 * HZ)
26#define NLM_HOST_COLLECT (120 * HZ) 26#define NLM_HOST_COLLECT (120 * HZ)
27 27
28static struct hlist_head nlm_hosts[NLM_HOST_NRHASH]; 28static struct hlist_head nlm_server_hosts[NLM_HOST_NRHASH];
29static struct hlist_head nlm_client_hosts[NLM_HOST_NRHASH];
30
31#define for_each_host(host, pos, chain, table) \
32 for ((chain) = (table); \
33 (chain) < (table) + NLM_HOST_NRHASH; ++(chain)) \
34 hlist_for_each_entry((host), (pos), (chain), h_hash)
35
36#define for_each_host_safe(host, pos, next, chain, table) \
37 for ((chain) = (table); \
38 (chain) < (table) + NLM_HOST_NRHASH; ++(chain)) \
39 hlist_for_each_entry_safe((host), (pos), (next), \
40 (chain), h_hash)
41
29static unsigned long next_gc; 42static unsigned long next_gc;
30static int nrhosts; 43static unsigned long nrhosts;
31static DEFINE_MUTEX(nlm_host_mutex); 44static DEFINE_MUTEX(nlm_host_mutex);
32 45
33static void nlm_gc_hosts(void); 46static void nlm_gc_hosts(void);
@@ -40,8 +53,6 @@ struct nlm_lookup_host_info {
40 const u32 version; /* NLM version to search for */ 53 const u32 version; /* NLM version to search for */
41 const char *hostname; /* remote's hostname */ 54 const char *hostname; /* remote's hostname */
42 const size_t hostname_len; /* its length */ 55 const size_t hostname_len; /* its length */
43 const struct sockaddr *src_sap; /* our address (optional) */
44 const size_t src_len; /* it's length */
45 const int noresvport; /* use non-priv port */ 56 const int noresvport; /* use non-priv port */
46}; 57};
47 58
@@ -88,127 +99,83 @@ static unsigned int nlm_hash_address(const struct sockaddr *sap)
88} 99}
89 100
90/* 101/*
91 * Common host lookup routine for server & client 102 * Allocate and initialize an nlm_host. Common to both client and server.
92 */ 103 */
93static struct nlm_host *nlm_lookup_host(struct nlm_lookup_host_info *ni) 104static struct nlm_host *nlm_alloc_host(struct nlm_lookup_host_info *ni,
105 struct nsm_handle *nsm)
94{ 106{
95 struct hlist_head *chain; 107 struct nlm_host *host = NULL;
96 struct hlist_node *pos; 108 unsigned long now = jiffies;
97 struct nlm_host *host;
98 struct nsm_handle *nsm = NULL;
99
100 mutex_lock(&nlm_host_mutex);
101 109
102 if (time_after_eq(jiffies, next_gc)) 110 if (nsm != NULL)
103 nlm_gc_hosts();
104
105 /* We may keep several nlm_host objects for a peer, because each
106 * nlm_host is identified by
107 * (address, protocol, version, server/client)
108 * We could probably simplify this a little by putting all those
109 * different NLM rpc_clients into one single nlm_host object.
110 * This would allow us to have one nlm_host per address.
111 */
112 chain = &nlm_hosts[nlm_hash_address(ni->sap)];
113 hlist_for_each_entry(host, pos, chain, h_hash) {
114 if (!rpc_cmp_addr(nlm_addr(host), ni->sap))
115 continue;
116
117 /* See if we have an NSM handle for this client */
118 if (!nsm)
119 nsm = host->h_nsmhandle;
120
121 if (host->h_proto != ni->protocol)
122 continue;
123 if (host->h_version != ni->version)
124 continue;
125 if (host->h_server != ni->server)
126 continue;
127 if (ni->server && ni->src_len != 0 &&
128 !rpc_cmp_addr(nlm_srcaddr(host), ni->src_sap))
129 continue;
130
131 /* Move to head of hash chain. */
132 hlist_del(&host->h_hash);
133 hlist_add_head(&host->h_hash, chain);
134
135 nlm_get_host(host);
136 dprintk("lockd: nlm_lookup_host found host %s (%s)\n",
137 host->h_name, host->h_addrbuf);
138 goto out;
139 }
140
141 /*
142 * The host wasn't in our hash table. If we don't
143 * have an NSM handle for it yet, create one.
144 */
145 if (nsm)
146 atomic_inc(&nsm->sm_count); 111 atomic_inc(&nsm->sm_count);
147 else { 112 else {
148 host = NULL; 113 host = NULL;
149 nsm = nsm_get_handle(ni->sap, ni->salen, 114 nsm = nsm_get_handle(ni->sap, ni->salen,
150 ni->hostname, ni->hostname_len); 115 ni->hostname, ni->hostname_len);
151 if (!nsm) { 116 if (unlikely(nsm == NULL)) {
152 dprintk("lockd: nlm_lookup_host failed; " 117 dprintk("lockd: %s failed; no nsm handle\n",
153 "no nsm handle\n"); 118 __func__);
154 goto out; 119 goto out;
155 } 120 }
156 } 121 }
157 122
158 host = kzalloc(sizeof(*host), GFP_KERNEL); 123 host = kmalloc(sizeof(*host), GFP_KERNEL);
159 if (!host) { 124 if (unlikely(host == NULL)) {
125 dprintk("lockd: %s failed; no memory\n", __func__);
160 nsm_release(nsm); 126 nsm_release(nsm);
161 dprintk("lockd: nlm_lookup_host failed; no memory\n");
162 goto out; 127 goto out;
163 } 128 }
164 host->h_name = nsm->sm_name; 129
165 host->h_addrbuf = nsm->sm_addrbuf;
166 memcpy(nlm_addr(host), ni->sap, ni->salen); 130 memcpy(nlm_addr(host), ni->sap, ni->salen);
167 host->h_addrlen = ni->salen; 131 host->h_addrlen = ni->salen;
168 rpc_set_port(nlm_addr(host), 0); 132 rpc_set_port(nlm_addr(host), 0);
169 memcpy(nlm_srcaddr(host), ni->src_sap, ni->src_len); 133 host->h_srcaddrlen = 0;
170 host->h_srcaddrlen = ni->src_len; 134
135 host->h_rpcclnt = NULL;
136 host->h_name = nsm->sm_name;
171 host->h_version = ni->version; 137 host->h_version = ni->version;
172 host->h_proto = ni->protocol; 138 host->h_proto = ni->protocol;
173 host->h_rpcclnt = NULL; 139 host->h_reclaiming = 0;
174 mutex_init(&host->h_mutex); 140 host->h_server = ni->server;
175 host->h_nextrebind = jiffies + NLM_HOST_REBIND; 141 host->h_noresvport = ni->noresvport;
176 host->h_expires = jiffies + NLM_HOST_EXPIRE; 142 host->h_inuse = 0;
177 atomic_set(&host->h_count, 1);
178 init_waitqueue_head(&host->h_gracewait); 143 init_waitqueue_head(&host->h_gracewait);
179 init_rwsem(&host->h_rwsem); 144 init_rwsem(&host->h_rwsem);
180 host->h_state = 0; /* pseudo NSM state */ 145 host->h_state = 0;
181 host->h_nsmstate = 0; /* real NSM state */ 146 host->h_nsmstate = 0;
182 host->h_nsmhandle = nsm; 147 host->h_pidcount = 0;
183 host->h_server = ni->server; 148 atomic_set(&host->h_count, 1);
184 host->h_noresvport = ni->noresvport; 149 mutex_init(&host->h_mutex);
185 hlist_add_head(&host->h_hash, chain); 150 host->h_nextrebind = now + NLM_HOST_REBIND;
151 host->h_expires = now + NLM_HOST_EXPIRE;
186 INIT_LIST_HEAD(&host->h_lockowners); 152 INIT_LIST_HEAD(&host->h_lockowners);
187 spin_lock_init(&host->h_lock); 153 spin_lock_init(&host->h_lock);
188 INIT_LIST_HEAD(&host->h_granted); 154 INIT_LIST_HEAD(&host->h_granted);
189 INIT_LIST_HEAD(&host->h_reclaim); 155 INIT_LIST_HEAD(&host->h_reclaim);
190 156 host->h_nsmhandle = nsm;
191 nrhosts++; 157 host->h_addrbuf = nsm->sm_addrbuf;
192
193 dprintk("lockd: nlm_lookup_host created host %s\n",
194 host->h_name);
195 158
196out: 159out:
197 mutex_unlock(&nlm_host_mutex);
198 return host; 160 return host;
199} 161}
200 162
201/* 163/*
202 * Destroy a host 164 * Destroy an nlm_host and free associated resources
165 *
166 * Caller must hold nlm_host_mutex.
203 */ 167 */
204static void 168static void nlm_destroy_host_locked(struct nlm_host *host)
205nlm_destroy_host(struct nlm_host *host)
206{ 169{
207 struct rpc_clnt *clnt; 170 struct rpc_clnt *clnt;
208 171
172 dprintk("lockd: destroy host %s\n", host->h_name);
173
209 BUG_ON(!list_empty(&host->h_lockowners)); 174 BUG_ON(!list_empty(&host->h_lockowners));
210 BUG_ON(atomic_read(&host->h_count)); 175 BUG_ON(atomic_read(&host->h_count));
211 176
177 hlist_del_init(&host->h_hash);
178
212 nsm_unmonitor(host); 179 nsm_unmonitor(host);
213 nsm_release(host->h_nsmhandle); 180 nsm_release(host->h_nsmhandle);
214 181
@@ -216,6 +183,8 @@ nlm_destroy_host(struct nlm_host *host)
216 if (clnt != NULL) 183 if (clnt != NULL)
217 rpc_shutdown_client(clnt); 184 rpc_shutdown_client(clnt);
218 kfree(host); 185 kfree(host);
186
187 nrhosts--;
219} 188}
220 189
221/** 190/**
@@ -249,12 +218,76 @@ struct nlm_host *nlmclnt_lookup_host(const struct sockaddr *sap,
249 .hostname_len = strlen(hostname), 218 .hostname_len = strlen(hostname),
250 .noresvport = noresvport, 219 .noresvport = noresvport,
251 }; 220 };
221 struct hlist_head *chain;
222 struct hlist_node *pos;
223 struct nlm_host *host;
224 struct nsm_handle *nsm = NULL;
252 225
253 dprintk("lockd: %s(host='%s', vers=%u, proto=%s)\n", __func__, 226 dprintk("lockd: %s(host='%s', vers=%u, proto=%s)\n", __func__,
254 (hostname ? hostname : "<none>"), version, 227 (hostname ? hostname : "<none>"), version,
255 (protocol == IPPROTO_UDP ? "udp" : "tcp")); 228 (protocol == IPPROTO_UDP ? "udp" : "tcp"));
256 229
257 return nlm_lookup_host(&ni); 230 mutex_lock(&nlm_host_mutex);
231
232 chain = &nlm_client_hosts[nlm_hash_address(sap)];
233 hlist_for_each_entry(host, pos, chain, h_hash) {
234 if (!rpc_cmp_addr(nlm_addr(host), sap))
235 continue;
236
237 /* Same address. Share an NSM handle if we already have one */
238 if (nsm == NULL)
239 nsm = host->h_nsmhandle;
240
241 if (host->h_proto != protocol)
242 continue;
243 if (host->h_version != version)
244 continue;
245
246 nlm_get_host(host);
247 dprintk("lockd: %s found host %s (%s)\n", __func__,
248 host->h_name, host->h_addrbuf);
249 goto out;
250 }
251
252 host = nlm_alloc_host(&ni, nsm);
253 if (unlikely(host == NULL))
254 goto out;
255
256 hlist_add_head(&host->h_hash, chain);
257 nrhosts++;
258
259 dprintk("lockd: %s created host %s (%s)\n", __func__,
260 host->h_name, host->h_addrbuf);
261
262out:
263 mutex_unlock(&nlm_host_mutex);
264 return host;
265}
266
267/**
268 * nlmclnt_release_host - release client nlm_host
269 * @host: nlm_host to release
270 *
271 */
272void nlmclnt_release_host(struct nlm_host *host)
273{
274 if (host == NULL)
275 return;
276
277 dprintk("lockd: release client host %s\n", host->h_name);
278
279 BUG_ON(atomic_read(&host->h_count) < 0);
280 BUG_ON(host->h_server);
281
282 if (atomic_dec_and_test(&host->h_count)) {
283 BUG_ON(!list_empty(&host->h_lockowners));
284 BUG_ON(!list_empty(&host->h_granted));
285 BUG_ON(!list_empty(&host->h_reclaim));
286
287 mutex_lock(&nlm_host_mutex);
288 nlm_destroy_host_locked(host);
289 mutex_unlock(&nlm_host_mutex);
290 }
258} 291}
259 292
260/** 293/**
@@ -279,12 +312,18 @@ struct nlm_host *nlmsvc_lookup_host(const struct svc_rqst *rqstp,
279 const char *hostname, 312 const char *hostname,
280 const size_t hostname_len) 313 const size_t hostname_len)
281{ 314{
315 struct hlist_head *chain;
316 struct hlist_node *pos;
317 struct nlm_host *host = NULL;
318 struct nsm_handle *nsm = NULL;
282 struct sockaddr_in sin = { 319 struct sockaddr_in sin = {
283 .sin_family = AF_INET, 320 .sin_family = AF_INET,
284 }; 321 };
285 struct sockaddr_in6 sin6 = { 322 struct sockaddr_in6 sin6 = {
286 .sin6_family = AF_INET6, 323 .sin6_family = AF_INET6,
287 }; 324 };
325 struct sockaddr *src_sap;
326 size_t src_len = rqstp->rq_addrlen;
288 struct nlm_lookup_host_info ni = { 327 struct nlm_lookup_host_info ni = {
289 .server = 1, 328 .server = 1,
290 .sap = svc_addr(rqstp), 329 .sap = svc_addr(rqstp),
@@ -293,27 +332,91 @@ struct nlm_host *nlmsvc_lookup_host(const struct svc_rqst *rqstp,
293 .version = rqstp->rq_vers, 332 .version = rqstp->rq_vers,
294 .hostname = hostname, 333 .hostname = hostname,
295 .hostname_len = hostname_len, 334 .hostname_len = hostname_len,
296 .src_len = rqstp->rq_addrlen,
297 }; 335 };
298 336
299 dprintk("lockd: %s(host='%*s', vers=%u, proto=%s)\n", __func__, 337 dprintk("lockd: %s(host='%*s', vers=%u, proto=%s)\n", __func__,
300 (int)hostname_len, hostname, rqstp->rq_vers, 338 (int)hostname_len, hostname, rqstp->rq_vers,
301 (rqstp->rq_prot == IPPROTO_UDP ? "udp" : "tcp")); 339 (rqstp->rq_prot == IPPROTO_UDP ? "udp" : "tcp"));
302 340
341 mutex_lock(&nlm_host_mutex);
342
303 switch (ni.sap->sa_family) { 343 switch (ni.sap->sa_family) {
304 case AF_INET: 344 case AF_INET:
305 sin.sin_addr.s_addr = rqstp->rq_daddr.addr.s_addr; 345 sin.sin_addr.s_addr = rqstp->rq_daddr.addr.s_addr;
306 ni.src_sap = (struct sockaddr *)&sin; 346 src_sap = (struct sockaddr *)&sin;
307 break; 347 break;
308 case AF_INET6: 348 case AF_INET6:
309 ipv6_addr_copy(&sin6.sin6_addr, &rqstp->rq_daddr.addr6); 349 ipv6_addr_copy(&sin6.sin6_addr, &rqstp->rq_daddr.addr6);
310 ni.src_sap = (struct sockaddr *)&sin6; 350 src_sap = (struct sockaddr *)&sin6;
311 break; 351 break;
312 default: 352 default:
313 return NULL; 353 dprintk("lockd: %s failed; unrecognized address family\n",
354 __func__);
355 goto out;
356 }
357
358 if (time_after_eq(jiffies, next_gc))
359 nlm_gc_hosts();
360
361 chain = &nlm_server_hosts[nlm_hash_address(ni.sap)];
362 hlist_for_each_entry(host, pos, chain, h_hash) {
363 if (!rpc_cmp_addr(nlm_addr(host), ni.sap))
364 continue;
365
366 /* Same address. Share an NSM handle if we already have one */
367 if (nsm == NULL)
368 nsm = host->h_nsmhandle;
369
370 if (host->h_proto != ni.protocol)
371 continue;
372 if (host->h_version != ni.version)
373 continue;
374 if (!rpc_cmp_addr(nlm_srcaddr(host), src_sap))
375 continue;
376
377 /* Move to head of hash chain. */
378 hlist_del(&host->h_hash);
379 hlist_add_head(&host->h_hash, chain);
380
381 nlm_get_host(host);
382 dprintk("lockd: %s found host %s (%s)\n",
383 __func__, host->h_name, host->h_addrbuf);
384 goto out;
314 } 385 }
315 386
316 return nlm_lookup_host(&ni); 387 host = nlm_alloc_host(&ni, nsm);
388 if (unlikely(host == NULL))
389 goto out;
390
391 memcpy(nlm_srcaddr(host), src_sap, src_len);
392 host->h_srcaddrlen = src_len;
393 hlist_add_head(&host->h_hash, chain);
394 nrhosts++;
395
396 dprintk("lockd: %s created host %s (%s)\n",
397 __func__, host->h_name, host->h_addrbuf);
398
399out:
400 mutex_unlock(&nlm_host_mutex);
401 return host;
402}
403
404/**
405 * nlmsvc_release_host - release server nlm_host
406 * @host: nlm_host to release
407 *
408 * Host is destroyed later in nlm_gc_hosts().
409 */
410void nlmsvc_release_host(struct nlm_host *host)
411{
412 if (host == NULL)
413 return;
414
415 dprintk("lockd: release server host %s\n", host->h_name);
416
417 BUG_ON(atomic_read(&host->h_count) < 0);
418 BUG_ON(!host->h_server);
419 atomic_dec(&host->h_count);
317} 420}
318 421
319/* 422/*
@@ -413,20 +516,28 @@ struct nlm_host * nlm_get_host(struct nlm_host *host)
413 return host; 516 return host;
414} 517}
415 518
416/* 519static struct nlm_host *next_host_state(struct hlist_head *cache,
417 * Release NLM host after use 520 struct nsm_handle *nsm,
418 */ 521 const struct nlm_reboot *info)
419void nlm_release_host(struct nlm_host *host)
420{ 522{
421 if (host != NULL) { 523 struct nlm_host *host = NULL;
422 dprintk("lockd: release host %s\n", host->h_name); 524 struct hlist_head *chain;
423 BUG_ON(atomic_read(&host->h_count) < 0); 525 struct hlist_node *pos;
424 if (atomic_dec_and_test(&host->h_count)) { 526
425 BUG_ON(!list_empty(&host->h_lockowners)); 527 mutex_lock(&nlm_host_mutex);
426 BUG_ON(!list_empty(&host->h_granted)); 528 for_each_host(host, pos, chain, cache) {
427 BUG_ON(!list_empty(&host->h_reclaim)); 529 if (host->h_nsmhandle == nsm
530 && host->h_nsmstate != info->state) {
531 host->h_nsmstate = info->state;
532 host->h_state++;
533
534 nlm_get_host(host);
535 goto out;
428 } 536 }
429 } 537 }
538out:
539 mutex_unlock(&nlm_host_mutex);
540 return host;
430} 541}
431 542
432/** 543/**
@@ -438,8 +549,6 @@ void nlm_release_host(struct nlm_host *host)
438 */ 549 */
439void nlm_host_rebooted(const struct nlm_reboot *info) 550void nlm_host_rebooted(const struct nlm_reboot *info)
440{ 551{
441 struct hlist_head *chain;
442 struct hlist_node *pos;
443 struct nsm_handle *nsm; 552 struct nsm_handle *nsm;
444 struct nlm_host *host; 553 struct nlm_host *host;
445 554
@@ -452,32 +561,15 @@ void nlm_host_rebooted(const struct nlm_reboot *info)
452 * lock for this. 561 * lock for this.
453 * To avoid processing a host several times, we match the nsmstate. 562 * To avoid processing a host several times, we match the nsmstate.
454 */ 563 */
455again: mutex_lock(&nlm_host_mutex); 564 while ((host = next_host_state(nlm_server_hosts, nsm, info)) != NULL) {
456 for (chain = nlm_hosts; chain < nlm_hosts + NLM_HOST_NRHASH; ++chain) { 565 nlmsvc_free_host_resources(host);
457 hlist_for_each_entry(host, pos, chain, h_hash) { 566 nlmsvc_release_host(host);
458 if (host->h_nsmhandle == nsm
459 && host->h_nsmstate != info->state) {
460 host->h_nsmstate = info->state;
461 host->h_state++;
462
463 nlm_get_host(host);
464 mutex_unlock(&nlm_host_mutex);
465
466 if (host->h_server) {
467 /* We're server for this guy, just ditch
468 * all the locks he held. */
469 nlmsvc_free_host_resources(host);
470 } else {
471 /* He's the server, initiate lock recovery. */
472 nlmclnt_recovery(host);
473 }
474
475 nlm_release_host(host);
476 goto again;
477 }
478 }
479 } 567 }
480 mutex_unlock(&nlm_host_mutex); 568 while ((host = next_host_state(nlm_client_hosts, nsm, info)) != NULL) {
569 nlmclnt_recovery(host);
570 nlmclnt_release_host(host);
571 }
572
481 nsm_release(nsm); 573 nsm_release(nsm);
482} 574}
483 575
@@ -497,13 +589,11 @@ nlm_shutdown_hosts(void)
497 589
498 /* First, make all hosts eligible for gc */ 590 /* First, make all hosts eligible for gc */
499 dprintk("lockd: nuking all hosts...\n"); 591 dprintk("lockd: nuking all hosts...\n");
500 for (chain = nlm_hosts; chain < nlm_hosts + NLM_HOST_NRHASH; ++chain) { 592 for_each_host(host, pos, chain, nlm_server_hosts) {
501 hlist_for_each_entry(host, pos, chain, h_hash) { 593 host->h_expires = jiffies - 1;
502 host->h_expires = jiffies - 1; 594 if (host->h_rpcclnt) {
503 if (host->h_rpcclnt) { 595 rpc_shutdown_client(host->h_rpcclnt);
504 rpc_shutdown_client(host->h_rpcclnt); 596 host->h_rpcclnt = NULL;
505 host->h_rpcclnt = NULL;
506 }
507 } 597 }
508 } 598 }
509 599
@@ -512,15 +602,13 @@ nlm_shutdown_hosts(void)
512 mutex_unlock(&nlm_host_mutex); 602 mutex_unlock(&nlm_host_mutex);
513 603
514 /* complain if any hosts are left */ 604 /* complain if any hosts are left */
515 if (nrhosts) { 605 if (nrhosts != 0) {
516 printk(KERN_WARNING "lockd: couldn't shutdown host module!\n"); 606 printk(KERN_WARNING "lockd: couldn't shutdown host module!\n");
517 dprintk("lockd: %d hosts left:\n", nrhosts); 607 dprintk("lockd: %lu hosts left:\n", nrhosts);
518 for (chain = nlm_hosts; chain < nlm_hosts + NLM_HOST_NRHASH; ++chain) { 608 for_each_host(host, pos, chain, nlm_server_hosts) {
519 hlist_for_each_entry(host, pos, chain, h_hash) { 609 dprintk(" %s (cnt %d use %d exp %ld)\n",
520 dprintk(" %s (cnt %d use %d exp %ld)\n", 610 host->h_name, atomic_read(&host->h_count),
521 host->h_name, atomic_read(&host->h_count), 611 host->h_inuse, host->h_expires);
522 host->h_inuse, host->h_expires);
523 }
524 } 612 }
525 } 613 }
526} 614}
@@ -538,29 +626,22 @@ nlm_gc_hosts(void)
538 struct nlm_host *host; 626 struct nlm_host *host;
539 627
540 dprintk("lockd: host garbage collection\n"); 628 dprintk("lockd: host garbage collection\n");
541 for (chain = nlm_hosts; chain < nlm_hosts + NLM_HOST_NRHASH; ++chain) { 629 for_each_host(host, pos, chain, nlm_server_hosts)
542 hlist_for_each_entry(host, pos, chain, h_hash) 630 host->h_inuse = 0;
543 host->h_inuse = 0;
544 }
545 631
546 /* Mark all hosts that hold locks, blocks or shares */ 632 /* Mark all hosts that hold locks, blocks or shares */
547 nlmsvc_mark_resources(); 633 nlmsvc_mark_resources();
548 634
549 for (chain = nlm_hosts; chain < nlm_hosts + NLM_HOST_NRHASH; ++chain) { 635 for_each_host_safe(host, pos, next, chain, nlm_server_hosts) {
550 hlist_for_each_entry_safe(host, pos, next, chain, h_hash) { 636 if (atomic_read(&host->h_count) || host->h_inuse
551 if (atomic_read(&host->h_count) || host->h_inuse 637 || time_before(jiffies, host->h_expires)) {
552 || time_before(jiffies, host->h_expires)) { 638 dprintk("nlm_gc_hosts skipping %s "
553 dprintk("nlm_gc_hosts skipping %s (cnt %d use %d exp %ld)\n", 639 "(cnt %d use %d exp %ld)\n",
554 host->h_name, atomic_read(&host->h_count), 640 host->h_name, atomic_read(&host->h_count),
555 host->h_inuse, host->h_expires); 641 host->h_inuse, host->h_expires);
556 continue; 642 continue;
557 }
558 dprintk("lockd: delete host %s\n", host->h_name);
559 hlist_del_init(&host->h_hash);
560
561 nlm_destroy_host(host);
562 nrhosts--;
563 } 643 }
644 nlm_destroy_host_locked(host);
564 } 645 }
565 646
566 next_gc = jiffies + NLM_HOST_COLLECT; 647 next_gc = jiffies + NLM_HOST_COLLECT;
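The for_each_host() helper introduced above wraps an outer walk over the hash chains around an inner walk over each chain's members, so callers traverse the whole table with a single statement. A stand-alone sketch of the same double-loop macro over a toy table (plain singly linked lists in place of hlists):

#include <stdio.h>

#define NRHASH 4

struct demo_host {
	const char *name;
	struct demo_host *next;
};

/* Nested for loops: the whole construct governs one statement,
 * just like the kernel macro. */
#define demo_for_each_host(host, chain, table)			\
	for ((chain) = (table);					\
	     (chain) < (table) + NRHASH; ++(chain))		\
		for ((host) = *(chain); (host) != NULL;		\
		     (host) = (host)->next)

int main(void)
{
	struct demo_host a = { "alpha", NULL }, b = { "beta", NULL };
	struct demo_host *table[NRHASH] = { [0] = &a, [2] = &b };
	struct demo_host **chain;
	struct demo_host *host;

	demo_for_each_host(host, chain, table)
		printf("%s\n", host->name);
	return 0;
}
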
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index e0c918949644..23d7451b2938 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -401,26 +401,22 @@ void nsm_release(struct nsm_handle *nsm)
401 * Status Monitor wire protocol. 401 * Status Monitor wire protocol.
402 */ 402 */
403 403
404static int encode_nsm_string(struct xdr_stream *xdr, const char *string) 404static void encode_nsm_string(struct xdr_stream *xdr, const char *string)
405{ 405{
406 const u32 len = strlen(string); 406 const u32 len = strlen(string);
407 __be32 *p; 407 __be32 *p;
408 408
409 if (unlikely(len > SM_MAXSTRLEN)) 409 BUG_ON(len > SM_MAXSTRLEN);
410 return -EIO; 410 p = xdr_reserve_space(xdr, 4 + len);
411 p = xdr_reserve_space(xdr, sizeof(u32) + len);
412 if (unlikely(p == NULL))
413 return -EIO;
414 xdr_encode_opaque(p, string, len); 411 xdr_encode_opaque(p, string, len);
415 return 0;
416} 412}
417 413
418/* 414/*
419 * "mon_name" specifies the host to be monitored. 415 * "mon_name" specifies the host to be monitored.
420 */ 416 */
421static int encode_mon_name(struct xdr_stream *xdr, const struct nsm_args *argp) 417static void encode_mon_name(struct xdr_stream *xdr, const struct nsm_args *argp)
422{ 418{
423 return encode_nsm_string(xdr, argp->mon_name); 419 encode_nsm_string(xdr, argp->mon_name);
424} 420}
425 421
426/* 422/*
@@ -429,35 +425,25 @@ static int encode_mon_name(struct xdr_stream *xdr, const struct nsm_args *argp)
429 * (via the NLMPROC_SM_NOTIFY call) that the state of host "mon_name" 425 * (via the NLMPROC_SM_NOTIFY call) that the state of host "mon_name"
430 * has changed. 426 * has changed.
431 */ 427 */
432static int encode_my_id(struct xdr_stream *xdr, const struct nsm_args *argp) 428static void encode_my_id(struct xdr_stream *xdr, const struct nsm_args *argp)
433{ 429{
434 int status;
435 __be32 *p; 430 __be32 *p;
436 431
437 status = encode_nsm_string(xdr, utsname()->nodename); 432 encode_nsm_string(xdr, utsname()->nodename);
438 if (unlikely(status != 0)) 433 p = xdr_reserve_space(xdr, 4 + 4 + 4);
439 return status; 434 *p++ = cpu_to_be32(argp->prog);
440 p = xdr_reserve_space(xdr, 3 * sizeof(u32)); 435 *p++ = cpu_to_be32(argp->vers);
441 if (unlikely(p == NULL)) 436 *p = cpu_to_be32(argp->proc);
442 return -EIO;
443 *p++ = htonl(argp->prog);
444 *p++ = htonl(argp->vers);
445 *p++ = htonl(argp->proc);
446 return 0;
447} 437}
448 438
449/* 439/*
450 * The "mon_id" argument specifies the non-private arguments 440 * The "mon_id" argument specifies the non-private arguments
451 * of an NSMPROC_MON or NSMPROC_UNMON call. 441 * of an NSMPROC_MON or NSMPROC_UNMON call.
452 */ 442 */
453static int encode_mon_id(struct xdr_stream *xdr, const struct nsm_args *argp) 443static void encode_mon_id(struct xdr_stream *xdr, const struct nsm_args *argp)
454{ 444{
455 int status; 445 encode_mon_name(xdr, argp);
456 446 encode_my_id(xdr, argp);
457 status = encode_mon_name(xdr, argp);
458 if (unlikely(status != 0))
459 return status;
460 return encode_my_id(xdr, argp);
461} 447}
462 448
463/* 449/*
@@ -465,68 +451,56 @@ static int encode_mon_id(struct xdr_stream *xdr, const struct nsm_args *argp)
465 * by the NSMPROC_MON call. This information will be supplied in the 451 * by the NSMPROC_MON call. This information will be supplied in the
466 * NLMPROC_SM_NOTIFY call. 452 * NLMPROC_SM_NOTIFY call.
467 */ 453 */
468static int encode_priv(struct xdr_stream *xdr, const struct nsm_args *argp) 454static void encode_priv(struct xdr_stream *xdr, const struct nsm_args *argp)
469{ 455{
470 __be32 *p; 456 __be32 *p;
471 457
472 p = xdr_reserve_space(xdr, SM_PRIV_SIZE); 458 p = xdr_reserve_space(xdr, SM_PRIV_SIZE);
473 if (unlikely(p == NULL))
474 return -EIO;
475 xdr_encode_opaque_fixed(p, argp->priv->data, SM_PRIV_SIZE); 459 xdr_encode_opaque_fixed(p, argp->priv->data, SM_PRIV_SIZE);
476 return 0;
477} 460}
478 461
479static int xdr_enc_mon(struct rpc_rqst *req, __be32 *p, 462static void nsm_xdr_enc_mon(struct rpc_rqst *req, struct xdr_stream *xdr,
480 const struct nsm_args *argp) 463 const struct nsm_args *argp)
481{ 464{
482 struct xdr_stream xdr; 465 encode_mon_id(xdr, argp);
483 int status; 466 encode_priv(xdr, argp);
484
485 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
486 status = encode_mon_id(&xdr, argp);
487 if (unlikely(status))
488 return status;
489 return encode_priv(&xdr, argp);
490} 467}
491 468
492static int xdr_enc_unmon(struct rpc_rqst *req, __be32 *p, 469static void nsm_xdr_enc_unmon(struct rpc_rqst *req, struct xdr_stream *xdr,
493 const struct nsm_args *argp) 470 const struct nsm_args *argp)
494{ 471{
495 struct xdr_stream xdr; 472 encode_mon_id(xdr, argp);
496
497 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
498 return encode_mon_id(&xdr, argp);
499} 473}
500 474
501static int xdr_dec_stat_res(struct rpc_rqst *rqstp, __be32 *p, 475static int nsm_xdr_dec_stat_res(struct rpc_rqst *rqstp,
502 struct nsm_res *resp) 476 struct xdr_stream *xdr,
477 struct nsm_res *resp)
503{ 478{
504 struct xdr_stream xdr; 479 __be32 *p;
505 480
506 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 481 p = xdr_inline_decode(xdr, 4 + 4);
507 p = xdr_inline_decode(&xdr, 2 * sizeof(u32));
508 if (unlikely(p == NULL)) 482 if (unlikely(p == NULL))
509 return -EIO; 483 return -EIO;
510 resp->status = ntohl(*p++); 484 resp->status = be32_to_cpup(p++);
511 resp->state = ntohl(*p); 485 resp->state = be32_to_cpup(p);
512 486
513 dprintk("lockd: xdr_dec_stat_res status %d state %d\n", 487 dprintk("lockd: %s status %d state %d\n",
514 resp->status, resp->state); 488 __func__, resp->status, resp->state);
515 return 0; 489 return 0;
516} 490}
517 491
518static int xdr_dec_stat(struct rpc_rqst *rqstp, __be32 *p, 492static int nsm_xdr_dec_stat(struct rpc_rqst *rqstp,
519 struct nsm_res *resp) 493 struct xdr_stream *xdr,
494 struct nsm_res *resp)
520{ 495{
521 struct xdr_stream xdr; 496 __be32 *p;
522 497
523 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 498 p = xdr_inline_decode(xdr, 4);
524 p = xdr_inline_decode(&xdr, sizeof(u32));
525 if (unlikely(p == NULL)) 499 if (unlikely(p == NULL))
526 return -EIO; 500 return -EIO;
527 resp->state = ntohl(*p); 501 resp->state = be32_to_cpup(p);
528 502
529 dprintk("lockd: xdr_dec_stat state %d\n", resp->state); 503 dprintk("lockd: %s state %d\n", __func__, resp->state);
530 return 0; 504 return 0;
531} 505}
532 506
@@ -542,8 +516,8 @@ static int xdr_dec_stat(struct rpc_rqst *rqstp, __be32 *p,
542static struct rpc_procinfo nsm_procedures[] = { 516static struct rpc_procinfo nsm_procedures[] = {
543[NSMPROC_MON] = { 517[NSMPROC_MON] = {
544 .p_proc = NSMPROC_MON, 518 .p_proc = NSMPROC_MON,
545 .p_encode = (kxdrproc_t)xdr_enc_mon, 519 .p_encode = (kxdreproc_t)nsm_xdr_enc_mon,
546 .p_decode = (kxdrproc_t)xdr_dec_stat_res, 520 .p_decode = (kxdrdproc_t)nsm_xdr_dec_stat_res,
547 .p_arglen = SM_mon_sz, 521 .p_arglen = SM_mon_sz,
548 .p_replen = SM_monres_sz, 522 .p_replen = SM_monres_sz,
549 .p_statidx = NSMPROC_MON, 523 .p_statidx = NSMPROC_MON,
@@ -551,8 +525,8 @@ static struct rpc_procinfo nsm_procedures[] = {
551 }, 525 },
552[NSMPROC_UNMON] = { 526[NSMPROC_UNMON] = {
553 .p_proc = NSMPROC_UNMON, 527 .p_proc = NSMPROC_UNMON,
554 .p_encode = (kxdrproc_t)xdr_enc_unmon, 528 .p_encode = (kxdreproc_t)nsm_xdr_enc_unmon,
555 .p_decode = (kxdrproc_t)xdr_dec_stat, 529 .p_decode = (kxdrdproc_t)nsm_xdr_dec_stat,
556 .p_arglen = SM_mon_id_sz, 530 .p_arglen = SM_mon_id_sz,
557 .p_replen = SM_unmonres_sz, 531 .p_replen = SM_unmonres_sz,
558 .p_statidx = NSMPROC_UNMON, 532 .p_statidx = NSMPROC_UNMON,
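nsm_xdr_dec_stat_res() above pulls a fixed two-word body off the wire: a big-endian status followed by a big-endian state number. A user-space sketch of that decode, with the kernel's -EIO replaced by a plain error return:

#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct demo_stat_res {
	uint32_t status;
	uint32_t state;
};

static int demo_dec_stat_res(const uint8_t *buf, size_t len,
			     struct demo_stat_res *res)
{
	uint32_t w[2];

	if (len < sizeof(w))
		return -1;		/* short reply */
	memcpy(w, buf, sizeof(w));	/* avoids unaligned loads */
	res->status = ntohl(w[0]);
	res->state = ntohl(w[1]);
	return 0;
}

int main(void)
{
	uint32_t wire[2] = { htonl(0), htonl(7) };
	struct demo_stat_res res;

	if (demo_dec_stat_res((const uint8_t *)wire, sizeof(wire), &res) == 0)
		printf("status %u state %u\n", res.status, res.state);
	return 0;
}
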
diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c
index 38d261192453..9a41fdc19511 100644
--- a/fs/lockd/svc4proc.c
+++ b/fs/lockd/svc4proc.c
@@ -51,7 +51,7 @@ nlm4svc_retrieve_args(struct svc_rqst *rqstp, struct nlm_args *argp,
51 return 0; 51 return 0;
52 52
53no_locks: 53no_locks:
54 nlm_release_host(host); 54 nlmsvc_release_host(host);
55 if (error) 55 if (error)
56 return error; 56 return error;
57 return nlm_lck_denied_nolocks; 57 return nlm_lck_denied_nolocks;
@@ -92,7 +92,7 @@ nlm4svc_proc_test(struct svc_rqst *rqstp, struct nlm_args *argp,
92 else 92 else
93 dprintk("lockd: TEST4 status %d\n", ntohl(resp->status)); 93 dprintk("lockd: TEST4 status %d\n", ntohl(resp->status));
94 94
95 nlm_release_host(host); 95 nlmsvc_release_host(host);
96 nlm_release_file(file); 96 nlm_release_file(file);
97 return rc; 97 return rc;
98} 98}
@@ -134,7 +134,7 @@ nlm4svc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp,
134 else 134 else
135 dprintk("lockd: LOCK status %d\n", ntohl(resp->status)); 135 dprintk("lockd: LOCK status %d\n", ntohl(resp->status));
136 136
137 nlm_release_host(host); 137 nlmsvc_release_host(host);
138 nlm_release_file(file); 138 nlm_release_file(file);
139 return rc; 139 return rc;
140} 140}
@@ -164,7 +164,7 @@ nlm4svc_proc_cancel(struct svc_rqst *rqstp, struct nlm_args *argp,
164 resp->status = nlmsvc_cancel_blocked(file, &argp->lock); 164 resp->status = nlmsvc_cancel_blocked(file, &argp->lock);
165 165
166 dprintk("lockd: CANCEL status %d\n", ntohl(resp->status)); 166 dprintk("lockd: CANCEL status %d\n", ntohl(resp->status));
167 nlm_release_host(host); 167 nlmsvc_release_host(host);
168 nlm_release_file(file); 168 nlm_release_file(file);
169 return rpc_success; 169 return rpc_success;
170} 170}
@@ -197,7 +197,7 @@ nlm4svc_proc_unlock(struct svc_rqst *rqstp, struct nlm_args *argp,
197 resp->status = nlmsvc_unlock(file, &argp->lock); 197 resp->status = nlmsvc_unlock(file, &argp->lock);
198 198
199 dprintk("lockd: UNLOCK status %d\n", ntohl(resp->status)); 199 dprintk("lockd: UNLOCK status %d\n", ntohl(resp->status));
200 nlm_release_host(host); 200 nlmsvc_release_host(host);
201 nlm_release_file(file); 201 nlm_release_file(file);
202 return rpc_success; 202 return rpc_success;
203} 203}
@@ -229,7 +229,7 @@ static void nlm4svc_callback_exit(struct rpc_task *task, void *data)
229 229
230static void nlm4svc_callback_release(void *data) 230static void nlm4svc_callback_release(void *data)
231{ 231{
232 nlm_release_call(data); 232 nlmsvc_release_call(data);
233} 233}
234 234
235static const struct rpc_call_ops nlm4svc_callback_ops = { 235static const struct rpc_call_ops nlm4svc_callback_ops = {
@@ -261,7 +261,7 @@ static __be32 nlm4svc_callback(struct svc_rqst *rqstp, u32 proc, struct nlm_args
261 261
262 stat = func(rqstp, argp, &call->a_res); 262 stat = func(rqstp, argp, &call->a_res);
263 if (stat != 0) { 263 if (stat != 0) {
264 nlm_release_call(call); 264 nlmsvc_release_call(call);
265 return stat; 265 return stat;
266 } 266 }
267 267
@@ -334,7 +334,7 @@ nlm4svc_proc_share(struct svc_rqst *rqstp, struct nlm_args *argp,
334 resp->status = nlmsvc_share_file(host, file, argp); 334 resp->status = nlmsvc_share_file(host, file, argp);
335 335
336 dprintk("lockd: SHARE status %d\n", ntohl(resp->status)); 336 dprintk("lockd: SHARE status %d\n", ntohl(resp->status));
337 nlm_release_host(host); 337 nlmsvc_release_host(host);
338 nlm_release_file(file); 338 nlm_release_file(file);
339 return rpc_success; 339 return rpc_success;
340} 340}
@@ -367,7 +367,7 @@ nlm4svc_proc_unshare(struct svc_rqst *rqstp, struct nlm_args *argp,
367 resp->status = nlmsvc_unshare_file(host, file, argp); 367 resp->status = nlmsvc_unshare_file(host, file, argp);
368 368
369 dprintk("lockd: UNSHARE status %d\n", ntohl(resp->status)); 369 dprintk("lockd: UNSHARE status %d\n", ntohl(resp->status));
370 nlm_release_host(host); 370 nlmsvc_release_host(host);
371 nlm_release_file(file); 371 nlm_release_file(file);
372 return rpc_success; 372 return rpc_success;
373} 373}
@@ -399,7 +399,7 @@ nlm4svc_proc_free_all(struct svc_rqst *rqstp, struct nlm_args *argp,
399 return rpc_success; 399 return rpc_success;
400 400
401 nlmsvc_free_host_resources(host); 401 nlmsvc_free_host_resources(host);
402 nlm_release_host(host); 402 nlmsvc_release_host(host);
403 return rpc_success; 403 return rpc_success;
404} 404}
405 405
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index ef5659b211e9..6e31695d046f 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -46,6 +46,7 @@ static void nlmsvc_remove_block(struct nlm_block *block);
46static int nlmsvc_setgrantargs(struct nlm_rqst *call, struct nlm_lock *lock); 46static int nlmsvc_setgrantargs(struct nlm_rqst *call, struct nlm_lock *lock);
47static void nlmsvc_freegrantargs(struct nlm_rqst *call); 47static void nlmsvc_freegrantargs(struct nlm_rqst *call);
48static const struct rpc_call_ops nlmsvc_grant_ops; 48static const struct rpc_call_ops nlmsvc_grant_ops;
49static const char *nlmdbg_cookie2a(const struct nlm_cookie *cookie);
49 50
50/* 51/*
51 * The list of blocked locks to retry 52 * The list of blocked locks to retry
@@ -233,7 +234,7 @@ nlmsvc_create_block(struct svc_rqst *rqstp, struct nlm_host *host,
233failed_free: 234failed_free:
234 kfree(block); 235 kfree(block);
235failed: 236failed:
236 nlm_release_call(call); 237 nlmsvc_release_call(call);
237 return NULL; 238 return NULL;
238} 239}
239 240
@@ -266,7 +267,7 @@ static void nlmsvc_free_block(struct kref *kref)
266 mutex_unlock(&file->f_mutex); 267 mutex_unlock(&file->f_mutex);
267 268
268 nlmsvc_freegrantargs(block->b_call); 269 nlmsvc_freegrantargs(block->b_call);
269 nlm_release_call(block->b_call); 270 nlmsvc_release_call(block->b_call);
270 nlm_release_file(block->b_file); 271 nlm_release_file(block->b_file);
271 kfree(block->b_fl); 272 kfree(block->b_fl);
272 kfree(block); 273 kfree(block);
@@ -934,3 +935,32 @@ nlmsvc_retry_blocked(void)
934 935
935 return timeout; 936 return timeout;
936} 937}
938
939#ifdef RPC_DEBUG
940static const char *nlmdbg_cookie2a(const struct nlm_cookie *cookie)
941{
942 /*
943 * We can get away with a static buffer because we're only
944 * called with BKL held.
945 */
946 static char buf[2*NLM_MAXCOOKIELEN+1];
947 unsigned int i, len = sizeof(buf);
948 char *p = buf;
949
950 len--; /* allow for trailing \0 */
951 if (len < 3)
952 return "???";
953 for (i = 0 ; i < cookie->len ; i++) {
954 if (len < 2) {
955 strcpy(p-3, "...");
956 break;
957 }
958 sprintf(p, "%02x", cookie->data[i]);
959 p += 2;
960 len -= 2;
961 }
962 *p = '\0';
963
964 return buf;
965}
966#endif
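
The nlmdbg_cookie2a() helper moved into svclock.c above renders an opaque NLM cookie as a hex string, truncating with "..." when the cookie would overflow the buffer. A standalone userspace sketch of the same logic, assuming a 32-byte cookie limit (the static buffer mirrors the kernel helper and, as its comment notes, is only safe because callers are serialized):

#include <stdio.h>
#include <string.h>

#define MAXCOOKIELEN 32			/* stand-in for NLM_MAXCOOKIELEN */

static const char *cookie2a(const unsigned char *data, unsigned int datalen)
{
	/* Static buffer: safe only if callers are serialized. */
	static char buf[2 * MAXCOOKIELEN + 1];
	unsigned int i, len = sizeof(buf) - 1;	/* reserve trailing '\0' */
	char *p = buf;

	if (len < 3)
		return "???";
	for (i = 0; i < datalen; i++) {
		if (len < 2) {
			strcpy(p - 3, "...");	/* mark the truncation */
			break;
		}
		sprintf(p, "%02x", data[i]);	/* two hex chars per byte */
		p += 2;
		len -= 2;
	}
	*p = '\0';
	return buf;
}

int main(void)
{
	unsigned char cookie[] = { 0xde, 0xad, 0xbe, 0xef };
	printf("%s\n", cookie2a(cookie, sizeof(cookie)));	/* deadbeef */
	return 0;
}
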
diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c
index 0caea5310ac3..d27aab11f324 100644
--- a/fs/lockd/svcproc.c
+++ b/fs/lockd/svcproc.c
@@ -80,7 +80,7 @@ nlmsvc_retrieve_args(struct svc_rqst *rqstp, struct nlm_args *argp,
80 return 0; 80 return 0;
81 81
82no_locks: 82no_locks:
83 nlm_release_host(host); 83 nlmsvc_release_host(host);
84 if (error) 84 if (error)
85 return error; 85 return error;
86 return nlm_lck_denied_nolocks; 86 return nlm_lck_denied_nolocks;
@@ -122,7 +122,7 @@ nlmsvc_proc_test(struct svc_rqst *rqstp, struct nlm_args *argp,
122 dprintk("lockd: TEST status %d vers %d\n", 122 dprintk("lockd: TEST status %d vers %d\n",
123 ntohl(resp->status), rqstp->rq_vers); 123 ntohl(resp->status), rqstp->rq_vers);
124 124
125 nlm_release_host(host); 125 nlmsvc_release_host(host);
126 nlm_release_file(file); 126 nlm_release_file(file);
127 return rc; 127 return rc;
128} 128}
@@ -164,7 +164,7 @@ nlmsvc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp,
164 else 164 else
165 dprintk("lockd: LOCK status %d\n", ntohl(resp->status)); 165 dprintk("lockd: LOCK status %d\n", ntohl(resp->status));
166 166
167 nlm_release_host(host); 167 nlmsvc_release_host(host);
168 nlm_release_file(file); 168 nlm_release_file(file);
169 return rc; 169 return rc;
170} 170}
@@ -194,7 +194,7 @@ nlmsvc_proc_cancel(struct svc_rqst *rqstp, struct nlm_args *argp,
194 resp->status = cast_status(nlmsvc_cancel_blocked(file, &argp->lock)); 194 resp->status = cast_status(nlmsvc_cancel_blocked(file, &argp->lock));
195 195
196 dprintk("lockd: CANCEL status %d\n", ntohl(resp->status)); 196 dprintk("lockd: CANCEL status %d\n", ntohl(resp->status));
197 nlm_release_host(host); 197 nlmsvc_release_host(host);
198 nlm_release_file(file); 198 nlm_release_file(file);
199 return rpc_success; 199 return rpc_success;
200} 200}
@@ -227,7 +227,7 @@ nlmsvc_proc_unlock(struct svc_rqst *rqstp, struct nlm_args *argp,
227 resp->status = cast_status(nlmsvc_unlock(file, &argp->lock)); 227 resp->status = cast_status(nlmsvc_unlock(file, &argp->lock));
228 228
229 dprintk("lockd: UNLOCK status %d\n", ntohl(resp->status)); 229 dprintk("lockd: UNLOCK status %d\n", ntohl(resp->status));
230 nlm_release_host(host); 230 nlmsvc_release_host(host);
231 nlm_release_file(file); 231 nlm_release_file(file);
232 return rpc_success; 232 return rpc_success;
233} 233}
@@ -257,9 +257,17 @@ static void nlmsvc_callback_exit(struct rpc_task *task, void *data)
257 -task->tk_status); 257 -task->tk_status);
258} 258}
259 259
260void nlmsvc_release_call(struct nlm_rqst *call)
261{
262 if (!atomic_dec_and_test(&call->a_count))
263 return;
264 nlmsvc_release_host(call->a_host);
265 kfree(call);
266}
267
260static void nlmsvc_callback_release(void *data) 268static void nlmsvc_callback_release(void *data)
261{ 269{
262 nlm_release_call(data); 270 nlmsvc_release_call(data);
263} 271}
264 272
265static const struct rpc_call_ops nlmsvc_callback_ops = { 273static const struct rpc_call_ops nlmsvc_callback_ops = {
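
The new nlmsvc_release_call() above is the standard drop-the-last-reference idiom: every holder decrements, and only the caller that takes the count to zero tears the object down, releasing the embedded host reference first. A minimal userspace model with C11 atomics (struct and function names here are illustrative stand-ins, not the kernel's):

#include <stdatomic.h>
#include <stdlib.h>

struct host;
static void release_host(struct host *h) { (void)h; /* drop host ref */ }

struct call {
	atomic_int a_count;
	struct host *a_host;
};

static void release_call(struct call *call)
{
	/* fetch_sub returns the old value: 1 means we dropped the last ref */
	if (atomic_fetch_sub(&call->a_count, 1) != 1)
		return;
	release_host(call->a_host);	/* drop the embedded reference */
	free(call);
}

int main(void)
{
	struct call *c = malloc(sizeof(*c));
	atomic_init(&c->a_count, 2);
	c->a_host = 0;
	release_call(c);	/* 2 -> 1: still referenced elsewhere */
	release_call(c);	/* 1 -> 0: host released, call freed */
	return 0;
}
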
@@ -291,7 +299,7 @@ static __be32 nlmsvc_callback(struct svc_rqst *rqstp, u32 proc, struct nlm_args
291 299
292 stat = func(rqstp, argp, &call->a_res); 300 stat = func(rqstp, argp, &call->a_res);
293 if (stat != 0) { 301 if (stat != 0) {
294 nlm_release_call(call); 302 nlmsvc_release_call(call);
295 return stat; 303 return stat;
296 } 304 }
297 305
@@ -366,7 +374,7 @@ nlmsvc_proc_share(struct svc_rqst *rqstp, struct nlm_args *argp,
366 resp->status = cast_status(nlmsvc_share_file(host, file, argp)); 374 resp->status = cast_status(nlmsvc_share_file(host, file, argp));
367 375
368 dprintk("lockd: SHARE status %d\n", ntohl(resp->status)); 376 dprintk("lockd: SHARE status %d\n", ntohl(resp->status));
369 nlm_release_host(host); 377 nlmsvc_release_host(host);
370 nlm_release_file(file); 378 nlm_release_file(file);
371 return rpc_success; 379 return rpc_success;
372} 380}
@@ -399,7 +407,7 @@ nlmsvc_proc_unshare(struct svc_rqst *rqstp, struct nlm_args *argp,
399 resp->status = cast_status(nlmsvc_unshare_file(host, file, argp)); 407 resp->status = cast_status(nlmsvc_unshare_file(host, file, argp));
400 408
401 dprintk("lockd: UNSHARE status %d\n", ntohl(resp->status)); 409 dprintk("lockd: UNSHARE status %d\n", ntohl(resp->status));
402 nlm_release_host(host); 410 nlmsvc_release_host(host);
403 nlm_release_file(file); 411 nlm_release_file(file);
404 return rpc_success; 412 return rpc_success;
405} 413}
@@ -431,7 +439,7 @@ nlmsvc_proc_free_all(struct svc_rqst *rqstp, struct nlm_args *argp,
431 return rpc_success; 439 return rpc_success;
432 440
433 nlmsvc_free_host_resources(host); 441 nlmsvc_free_host_resources(host);
434 nlm_release_host(host); 442 nlmsvc_release_host(host);
435 return rpc_success; 443 return rpc_success;
436} 444}
437 445
diff --git a/fs/lockd/xdr.c b/fs/lockd/xdr.c
index b583ab0a4cbb..964666c68a86 100644
--- a/fs/lockd/xdr.c
+++ b/fs/lockd/xdr.c
@@ -149,37 +149,6 @@ nlm_decode_lock(__be32 *p, struct nlm_lock *lock)
149} 149}
150 150
151/* 151/*
152 * Encode a lock as part of an NLM call
153 */
154static __be32 *
155nlm_encode_lock(__be32 *p, struct nlm_lock *lock)
156{
157 struct file_lock *fl = &lock->fl;
158 __s32 start, len;
159
160 if (!(p = xdr_encode_string(p, lock->caller))
161 || !(p = nlm_encode_fh(p, &lock->fh))
162 || !(p = nlm_encode_oh(p, &lock->oh)))
163 return NULL;
164
165 if (fl->fl_start > NLM_OFFSET_MAX
166 || (fl->fl_end > NLM_OFFSET_MAX && fl->fl_end != OFFSET_MAX))
167 return NULL;
168
169 start = loff_t_to_s32(fl->fl_start);
170 if (fl->fl_end == OFFSET_MAX)
171 len = 0;
172 else
173 len = loff_t_to_s32(fl->fl_end - fl->fl_start + 1);
174
175 *p++ = htonl(lock->svid);
176 *p++ = htonl(start);
177 *p++ = htonl(len);
178
179 return p;
180}
181
182/*
183 * Encode result of a TEST/TEST_MSG call 152 * Encode result of a TEST/TEST_MSG call
184 */ 153 */
185static __be32 * 154static __be32 *
@@ -372,259 +341,3 @@ nlmsvc_encode_void(struct svc_rqst *rqstp, __be32 *p, void *dummy)
372{ 341{
373 return xdr_ressize_check(rqstp, p); 342 return xdr_ressize_check(rqstp, p);
374} 343}
375
376/*
377 * Now, the client side XDR functions
378 */
379#ifdef NLMCLNT_SUPPORT_SHARES
380static int
381nlmclt_decode_void(struct rpc_rqst *req, u32 *p, void *ptr)
382{
383 return 0;
384}
385#endif
386
387static int
388nlmclt_encode_testargs(struct rpc_rqst *req, __be32 *p, nlm_args *argp)
389{
390 struct nlm_lock *lock = &argp->lock;
391
392 if (!(p = nlm_encode_cookie(p, &argp->cookie)))
393 return -EIO;
394 *p++ = (lock->fl.fl_type == F_WRLCK)? xdr_one : xdr_zero;
395 if (!(p = nlm_encode_lock(p, lock)))
396 return -EIO;
397 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
398 return 0;
399}
400
401static int
402nlmclt_decode_testres(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp)
403{
404 if (!(p = nlm_decode_cookie(p, &resp->cookie)))
405 return -EIO;
406 resp->status = *p++;
407 if (resp->status == nlm_lck_denied) {
408 struct file_lock *fl = &resp->lock.fl;
409 u32 excl;
410 s32 start, len, end;
411
412 memset(&resp->lock, 0, sizeof(resp->lock));
413 locks_init_lock(fl);
414 excl = ntohl(*p++);
415 resp->lock.svid = ntohl(*p++);
416 fl->fl_pid = (pid_t)resp->lock.svid;
417 if (!(p = nlm_decode_oh(p, &resp->lock.oh)))
418 return -EIO;
419
420 fl->fl_flags = FL_POSIX;
421 fl->fl_type = excl? F_WRLCK : F_RDLCK;
422 start = ntohl(*p++);
423 len = ntohl(*p++);
424 end = start + len - 1;
425
426 fl->fl_start = s32_to_loff_t(start);
427 if (len == 0 || end < 0)
428 fl->fl_end = OFFSET_MAX;
429 else
430 fl->fl_end = s32_to_loff_t(end);
431 }
432 return 0;
433}
434
435
436static int
437nlmclt_encode_lockargs(struct rpc_rqst *req, __be32 *p, nlm_args *argp)
438{
439 struct nlm_lock *lock = &argp->lock;
440
441 if (!(p = nlm_encode_cookie(p, &argp->cookie)))
442 return -EIO;
443 *p++ = argp->block? xdr_one : xdr_zero;
444 *p++ = (lock->fl.fl_type == F_WRLCK)? xdr_one : xdr_zero;
445 if (!(p = nlm_encode_lock(p, lock)))
446 return -EIO;
447 *p++ = argp->reclaim? xdr_one : xdr_zero;
448 *p++ = htonl(argp->state);
449 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
450 return 0;
451}
452
453static int
454nlmclt_encode_cancargs(struct rpc_rqst *req, __be32 *p, nlm_args *argp)
455{
456 struct nlm_lock *lock = &argp->lock;
457
458 if (!(p = nlm_encode_cookie(p, &argp->cookie)))
459 return -EIO;
460 *p++ = argp->block? xdr_one : xdr_zero;
461 *p++ = (lock->fl.fl_type == F_WRLCK)? xdr_one : xdr_zero;
462 if (!(p = nlm_encode_lock(p, lock)))
463 return -EIO;
464 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
465 return 0;
466}
467
468static int
469nlmclt_encode_unlockargs(struct rpc_rqst *req, __be32 *p, nlm_args *argp)
470{
471 struct nlm_lock *lock = &argp->lock;
472
473 if (!(p = nlm_encode_cookie(p, &argp->cookie)))
474 return -EIO;
475 if (!(p = nlm_encode_lock(p, lock)))
476 return -EIO;
477 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
478 return 0;
479}
480
481static int
482nlmclt_encode_res(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp)
483{
484 if (!(p = nlm_encode_cookie(p, &resp->cookie)))
485 return -EIO;
486 *p++ = resp->status;
487 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
488 return 0;
489}
490
491static int
492nlmclt_encode_testres(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp)
493{
494 if (!(p = nlm_encode_testres(p, resp)))
495 return -EIO;
496 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
497 return 0;
498}
499
500static int
501nlmclt_decode_res(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp)
502{
503 if (!(p = nlm_decode_cookie(p, &resp->cookie)))
504 return -EIO;
505 resp->status = *p++;
506 return 0;
507}
508
509#if (NLMCLNT_OHSIZE > XDR_MAX_NETOBJ)
510# error "NLM host name cannot be larger than XDR_MAX_NETOBJ!"
511#endif
512
513/*
514 * Buffer requirements for NLM
515 */
516#define NLM_void_sz 0
517#define NLM_cookie_sz 1+XDR_QUADLEN(NLM_MAXCOOKIELEN)
518#define NLM_caller_sz 1+XDR_QUADLEN(NLMCLNT_OHSIZE)
519#define NLM_owner_sz 1+XDR_QUADLEN(NLMCLNT_OHSIZE)
520#define NLM_fhandle_sz 1+XDR_QUADLEN(NFS2_FHSIZE)
521#define NLM_lock_sz 3+NLM_caller_sz+NLM_owner_sz+NLM_fhandle_sz
522#define NLM_holder_sz 4+NLM_owner_sz
523
524#define NLM_testargs_sz NLM_cookie_sz+1+NLM_lock_sz
525#define NLM_lockargs_sz NLM_cookie_sz+4+NLM_lock_sz
526#define NLM_cancargs_sz NLM_cookie_sz+2+NLM_lock_sz
527#define NLM_unlockargs_sz NLM_cookie_sz+NLM_lock_sz
528
529#define NLM_testres_sz NLM_cookie_sz+1+NLM_holder_sz
530#define NLM_res_sz NLM_cookie_sz+1
531#define NLM_norep_sz 0
532
533/*
534 * For NLM, a void procedure really returns nothing
535 */
536#define nlmclt_decode_norep NULL
537
538#define PROC(proc, argtype, restype) \
539[NLMPROC_##proc] = { \
540 .p_proc = NLMPROC_##proc, \
541 .p_encode = (kxdrproc_t) nlmclt_encode_##argtype, \
542 .p_decode = (kxdrproc_t) nlmclt_decode_##restype, \
543 .p_arglen = NLM_##argtype##_sz, \
544 .p_replen = NLM_##restype##_sz, \
545 .p_statidx = NLMPROC_##proc, \
546 .p_name = #proc, \
547 }
548
549static struct rpc_procinfo nlm_procedures[] = {
550 PROC(TEST, testargs, testres),
551 PROC(LOCK, lockargs, res),
552 PROC(CANCEL, cancargs, res),
553 PROC(UNLOCK, unlockargs, res),
554 PROC(GRANTED, testargs, res),
555 PROC(TEST_MSG, testargs, norep),
556 PROC(LOCK_MSG, lockargs, norep),
557 PROC(CANCEL_MSG, cancargs, norep),
558 PROC(UNLOCK_MSG, unlockargs, norep),
559 PROC(GRANTED_MSG, testargs, norep),
560 PROC(TEST_RES, testres, norep),
561 PROC(LOCK_RES, res, norep),
562 PROC(CANCEL_RES, res, norep),
563 PROC(UNLOCK_RES, res, norep),
564 PROC(GRANTED_RES, res, norep),
565#ifdef NLMCLNT_SUPPORT_SHARES
566 PROC(SHARE, shareargs, shareres),
567 PROC(UNSHARE, shareargs, shareres),
568 PROC(NM_LOCK, lockargs, res),
569 PROC(FREE_ALL, notify, void),
570#endif
571};
572
573static struct rpc_version nlm_version1 = {
574 .number = 1,
575 .nrprocs = 16,
576 .procs = nlm_procedures,
577};
578
579static struct rpc_version nlm_version3 = {
580 .number = 3,
581 .nrprocs = 24,
582 .procs = nlm_procedures,
583};
584
585static struct rpc_version * nlm_versions[] = {
586 [1] = &nlm_version1,
587 [3] = &nlm_version3,
588#ifdef CONFIG_LOCKD_V4
589 [4] = &nlm_version4,
590#endif
591};
592
593static struct rpc_stat nlm_stats;
594
595struct rpc_program nlm_program = {
596 .name = "lockd",
597 .number = NLM_PROGRAM,
598 .nrvers = ARRAY_SIZE(nlm_versions),
599 .version = nlm_versions,
600 .stats = &nlm_stats,
601};
602
603#ifdef RPC_DEBUG
604const char *nlmdbg_cookie2a(const struct nlm_cookie *cookie)
605{
606 /*
607 * We can get away with a static buffer because we're only
608 * called with BKL held.
609 */
610 static char buf[2*NLM_MAXCOOKIELEN+1];
611 unsigned int i, len = sizeof(buf);
612 char *p = buf;
613
614 len--; /* allow for trailing \0 */
615 if (len < 3)
616 return "???";
617 for (i = 0 ; i < cookie->len ; i++) {
618 if (len < 2) {
619 strcpy(p-3, "...");
620 break;
621 }
622 sprintf(p, "%02x", cookie->data[i]);
623 p += 2;
624 len -= 2;
625 }
626 *p = '\0';
627
628 return buf;
629}
630#endif
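
The *_sz macros retained above count XDR words, not bytes: XDR_QUADLEN() rounds a byte length up to whole 32-bit words, and a variable-length opaque costs one extra word for its length prefix, which is where the leading 1+ in each macro comes from. A sketch of the arithmetic (the limit values below are representative assumptions, not necessarily the kernel's):

#include <stdio.h>

#define XDR_QUADLEN(l)	(((l) + 3) >> 2)	/* bytes -> 32-bit words */

/* Representative limits, for illustration only. */
#define NLM_MAXCOOKIELEN	32
#define NLMCLNT_OHSIZE		64
#define NFS2_FHSIZE		32

#define NLM_cookie_sz	(1 + XDR_QUADLEN(NLM_MAXCOOKIELEN))
#define NLM_caller_sz	(1 + XDR_QUADLEN(NLMCLNT_OHSIZE))
#define NLM_owner_sz	(1 + XDR_QUADLEN(NLMCLNT_OHSIZE))
#define NLM_fhandle_sz	(1 + XDR_QUADLEN(NFS2_FHSIZE))
#define NLM_lock_sz	(3 + NLM_caller_sz + NLM_owner_sz + NLM_fhandle_sz)

int main(void)
{
	/* TESTargs = cookie + one word for the exclusive flag + lock */
	int words = NLM_cookie_sz + 1 + NLM_lock_sz;
	printf("NLM_testargs_sz = %d words = %d bytes\n", words, 4 * words);
	return 0;
}
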
diff --git a/fs/lockd/xdr4.c b/fs/lockd/xdr4.c
index ad9dbbc9145d..dfa4789cd460 100644
--- a/fs/lockd/xdr4.c
+++ b/fs/lockd/xdr4.c
@@ -93,15 +93,6 @@ nlm4_decode_fh(__be32 *p, struct nfs_fh *f)
93 return p + XDR_QUADLEN(f->size); 93 return p + XDR_QUADLEN(f->size);
94} 94}
95 95
96static __be32 *
97nlm4_encode_fh(__be32 *p, struct nfs_fh *f)
98{
99 *p++ = htonl(f->size);
100 if (f->size) p[XDR_QUADLEN(f->size)-1] = 0; /* don't leak anything */
101 memcpy(p, f->data, f->size);
102 return p + XDR_QUADLEN(f->size);
103}
104
105/* 96/*
106 * Encode and decode owner handle 97 * Encode and decode owner handle
107 */ 98 */
@@ -112,12 +103,6 @@ nlm4_decode_oh(__be32 *p, struct xdr_netobj *oh)
112} 103}
113 104
114static __be32 * 105static __be32 *
115nlm4_encode_oh(__be32 *p, struct xdr_netobj *oh)
116{
117 return xdr_encode_netobj(p, oh);
118}
119
120static __be32 *
121nlm4_decode_lock(__be32 *p, struct nlm_lock *lock) 106nlm4_decode_lock(__be32 *p, struct nlm_lock *lock)
122{ 107{
123 struct file_lock *fl = &lock->fl; 108 struct file_lock *fl = &lock->fl;
@@ -150,38 +135,6 @@ nlm4_decode_lock(__be32 *p, struct nlm_lock *lock)
150} 135}
151 136
152/* 137/*
153 * Encode a lock as part of an NLM call
154 */
155static __be32 *
156nlm4_encode_lock(__be32 *p, struct nlm_lock *lock)
157{
158 struct file_lock *fl = &lock->fl;
159 __s64 start, len;
160
161 if (!(p = xdr_encode_string(p, lock->caller))
162 || !(p = nlm4_encode_fh(p, &lock->fh))
163 || !(p = nlm4_encode_oh(p, &lock->oh)))
164 return NULL;
165
166 if (fl->fl_start > NLM4_OFFSET_MAX
167 || (fl->fl_end > NLM4_OFFSET_MAX && fl->fl_end != OFFSET_MAX))
168 return NULL;
169
170 *p++ = htonl(lock->svid);
171
172 start = loff_t_to_s64(fl->fl_start);
173 if (fl->fl_end == OFFSET_MAX)
174 len = 0;
175 else
176 len = loff_t_to_s64(fl->fl_end - fl->fl_start + 1);
177
178 p = xdr_encode_hyper(p, start);
179 p = xdr_encode_hyper(p, len);
180
181 return p;
182}
183
184/*
185 * Encode result of a TEST/TEST_MSG call 138 * Encode result of a TEST/TEST_MSG call
186 */ 139 */
187static __be32 * 140static __be32 *
@@ -379,211 +332,3 @@ nlm4svc_encode_void(struct svc_rqst *rqstp, __be32 *p, void *dummy)
379{ 332{
380 return xdr_ressize_check(rqstp, p); 333 return xdr_ressize_check(rqstp, p);
381} 334}
382
383/*
384 * Now, the client side XDR functions
385 */
386#ifdef NLMCLNT_SUPPORT_SHARES
387static int
388nlm4clt_decode_void(struct rpc_rqst *req, __be32 *p, void *ptr)
389{
390 return 0;
391}
392#endif
393
394static int
395nlm4clt_encode_testargs(struct rpc_rqst *req, __be32 *p, nlm_args *argp)
396{
397 struct nlm_lock *lock = &argp->lock;
398
399 if (!(p = nlm4_encode_cookie(p, &argp->cookie)))
400 return -EIO;
401 *p++ = (lock->fl.fl_type == F_WRLCK)? xdr_one : xdr_zero;
402 if (!(p = nlm4_encode_lock(p, lock)))
403 return -EIO;
404 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
405 return 0;
406}
407
408static int
409nlm4clt_decode_testres(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp)
410{
411 if (!(p = nlm4_decode_cookie(p, &resp->cookie)))
412 return -EIO;
413 resp->status = *p++;
414 if (resp->status == nlm_lck_denied) {
415 struct file_lock *fl = &resp->lock.fl;
416 u32 excl;
417 __u64 start, len;
418 __s64 end;
419
420 memset(&resp->lock, 0, sizeof(resp->lock));
421 locks_init_lock(fl);
422 excl = ntohl(*p++);
423 resp->lock.svid = ntohl(*p++);
424 fl->fl_pid = (pid_t)resp->lock.svid;
425 if (!(p = nlm4_decode_oh(p, &resp->lock.oh)))
426 return -EIO;
427
428 fl->fl_flags = FL_POSIX;
429 fl->fl_type = excl? F_WRLCK : F_RDLCK;
430 p = xdr_decode_hyper(p, &start);
431 p = xdr_decode_hyper(p, &len);
432 end = start + len - 1;
433
434 fl->fl_start = s64_to_loff_t(start);
435 if (len == 0 || end < 0)
436 fl->fl_end = OFFSET_MAX;
437 else
438 fl->fl_end = s64_to_loff_t(end);
439 }
440 return 0;
441}
442
443
444static int
445nlm4clt_encode_lockargs(struct rpc_rqst *req, __be32 *p, nlm_args *argp)
446{
447 struct nlm_lock *lock = &argp->lock;
448
449 if (!(p = nlm4_encode_cookie(p, &argp->cookie)))
450 return -EIO;
451 *p++ = argp->block? xdr_one : xdr_zero;
452 *p++ = (lock->fl.fl_type == F_WRLCK)? xdr_one : xdr_zero;
453 if (!(p = nlm4_encode_lock(p, lock)))
454 return -EIO;
455 *p++ = argp->reclaim? xdr_one : xdr_zero;
456 *p++ = htonl(argp->state);
457 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
458 return 0;
459}
460
461static int
462nlm4clt_encode_cancargs(struct rpc_rqst *req, __be32 *p, nlm_args *argp)
463{
464 struct nlm_lock *lock = &argp->lock;
465
466 if (!(p = nlm4_encode_cookie(p, &argp->cookie)))
467 return -EIO;
468 *p++ = argp->block? xdr_one : xdr_zero;
469 *p++ = (lock->fl.fl_type == F_WRLCK)? xdr_one : xdr_zero;
470 if (!(p = nlm4_encode_lock(p, lock)))
471 return -EIO;
472 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
473 return 0;
474}
475
476static int
477nlm4clt_encode_unlockargs(struct rpc_rqst *req, __be32 *p, nlm_args *argp)
478{
479 struct nlm_lock *lock = &argp->lock;
480
481 if (!(p = nlm4_encode_cookie(p, &argp->cookie)))
482 return -EIO;
483 if (!(p = nlm4_encode_lock(p, lock)))
484 return -EIO;
485 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
486 return 0;
487}
488
489static int
490nlm4clt_encode_res(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp)
491{
492 if (!(p = nlm4_encode_cookie(p, &resp->cookie)))
493 return -EIO;
494 *p++ = resp->status;
495 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
496 return 0;
497}
498
499static int
500nlm4clt_encode_testres(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp)
501{
502 if (!(p = nlm4_encode_testres(p, resp)))
503 return -EIO;
504 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
505 return 0;
506}
507
508static int
509nlm4clt_decode_res(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp)
510{
511 if (!(p = nlm4_decode_cookie(p, &resp->cookie)))
512 return -EIO;
513 resp->status = *p++;
514 return 0;
515}
516
517#if (NLMCLNT_OHSIZE > XDR_MAX_NETOBJ)
518# error "NLM host name cannot be larger than XDR_MAX_NETOBJ!"
519#endif
520
521#if (NLMCLNT_OHSIZE > NLM_MAXSTRLEN)
522# error "NLM host name cannot be larger than NLM's maximum string length!"
523#endif
524
525/*
526 * Buffer requirements for NLM
527 */
528#define NLM4_void_sz 0
529#define NLM4_cookie_sz 1+XDR_QUADLEN(NLM_MAXCOOKIELEN)
530#define NLM4_caller_sz 1+XDR_QUADLEN(NLMCLNT_OHSIZE)
531#define NLM4_owner_sz 1+XDR_QUADLEN(NLMCLNT_OHSIZE)
532#define NLM4_fhandle_sz 1+XDR_QUADLEN(NFS3_FHSIZE)
533#define NLM4_lock_sz 5+NLM4_caller_sz+NLM4_owner_sz+NLM4_fhandle_sz
534#define NLM4_holder_sz 6+NLM4_owner_sz
535
536#define NLM4_testargs_sz NLM4_cookie_sz+1+NLM4_lock_sz
537#define NLM4_lockargs_sz NLM4_cookie_sz+4+NLM4_lock_sz
538#define NLM4_cancargs_sz NLM4_cookie_sz+2+NLM4_lock_sz
539#define NLM4_unlockargs_sz NLM4_cookie_sz+NLM4_lock_sz
540
541#define NLM4_testres_sz NLM4_cookie_sz+1+NLM4_holder_sz
542#define NLM4_res_sz NLM4_cookie_sz+1
543#define NLM4_norep_sz 0
544
545/*
546 * For NLM, a void procedure really returns nothing
547 */
548#define nlm4clt_decode_norep NULL
549
550#define PROC(proc, argtype, restype) \
551[NLMPROC_##proc] = { \
552 .p_proc = NLMPROC_##proc, \
553 .p_encode = (kxdrproc_t) nlm4clt_encode_##argtype, \
554 .p_decode = (kxdrproc_t) nlm4clt_decode_##restype, \
555 .p_arglen = NLM4_##argtype##_sz, \
556 .p_replen = NLM4_##restype##_sz, \
557 .p_statidx = NLMPROC_##proc, \
558 .p_name = #proc, \
559 }
560
561static struct rpc_procinfo nlm4_procedures[] = {
562 PROC(TEST, testargs, testres),
563 PROC(LOCK, lockargs, res),
564 PROC(CANCEL, cancargs, res),
565 PROC(UNLOCK, unlockargs, res),
566 PROC(GRANTED, testargs, res),
567 PROC(TEST_MSG, testargs, norep),
568 PROC(LOCK_MSG, lockargs, norep),
569 PROC(CANCEL_MSG, cancargs, norep),
570 PROC(UNLOCK_MSG, unlockargs, norep),
571 PROC(GRANTED_MSG, testargs, norep),
572 PROC(TEST_RES, testres, norep),
573 PROC(LOCK_RES, res, norep),
574 PROC(CANCEL_RES, res, norep),
575 PROC(UNLOCK_RES, res, norep),
576 PROC(GRANTED_RES, res, norep),
577#ifdef NLMCLNT_SUPPORT_SHARES
578 PROC(SHARE, shareargs, shareres),
579 PROC(UNSHARE, shareargs, shareres),
580 PROC(NM_LOCK, lockargs, res),
581 PROC(FREE_ALL, notify, void),
582#endif
583};
584
585struct rpc_version nlm_version4 = {
586 .number = 4,
587 .nrprocs = 24,
588 .procs = nlm4_procedures,
589};
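
The deleted NLMv4 client encoders used xdr_encode_hyper() for the 64-bit offsets that distinguish v4 from v1/v3: a hyper goes on the wire as two big-endian 32-bit words, high word first, and a length of zero means "to end of file". A userspace sketch of that convention (encode_hyper/encode_range are illustrative names, and offset_max stands in for OFFSET_MAX):

#include <stdint.h>
#include <stdio.h>
#include <arpa/inet.h>

/* A 64-bit "hyper": two big-endian words, most significant first. */
static uint32_t *encode_hyper(uint32_t *p, uint64_t val)
{
	*p++ = htonl((uint32_t)(val >> 32));
	*p++ = htonl((uint32_t)(val & 0xffffffffu));
	return p;
}

/* NLMv4 sends (start, len); len == 0 encodes "lock to end of file". */
static uint32_t *encode_range(uint32_t *p, uint64_t start, uint64_t end,
			      uint64_t offset_max)
{
	uint64_t len = (end == offset_max) ? 0 : end - start + 1;

	p = encode_hyper(p, start);
	return encode_hyper(p, len);
}

int main(void)
{
	uint32_t buf[4], *p = encode_range(buf, 100, 199, UINT64_MAX);

	printf("used %u words\n", (unsigned)(p - buf));	/* 4 */
	return 0;
}
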
diff --git a/fs/locks.c b/fs/locks.c
index 08415b2a6d36..0f3998291f78 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -444,15 +444,9 @@ static void lease_release_private_callback(struct file_lock *fl)
444 fl->fl_file->f_owner.signum = 0; 444 fl->fl_file->f_owner.signum = 0;
445} 445}
446 446
447static int lease_mylease_callback(struct file_lock *fl, struct file_lock *try)
448{
449 return fl->fl_file == try->fl_file;
450}
451
452static const struct lock_manager_operations lease_manager_ops = { 447static const struct lock_manager_operations lease_manager_ops = {
453 .fl_break = lease_break_callback, 448 .fl_break = lease_break_callback,
454 .fl_release_private = lease_release_private_callback, 449 .fl_release_private = lease_release_private_callback,
455 .fl_mylease = lease_mylease_callback,
456 .fl_change = lease_modify, 450 .fl_change = lease_modify,
457}; 451};
458 452
@@ -1405,7 +1399,7 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp)
1405 for (before = &inode->i_flock; 1399 for (before = &inode->i_flock;
1406 ((fl = *before) != NULL) && IS_LEASE(fl); 1400 ((fl = *before) != NULL) && IS_LEASE(fl);
1407 before = &fl->fl_next) { 1401 before = &fl->fl_next) {
1408 if (lease->fl_lmops->fl_mylease(fl, lease)) 1402 if (fl->fl_file == filp)
1409 my_before = before; 1403 my_before = before;
1410 else if (fl->fl_type == (F_INPROGRESS | F_UNLCK)) 1404 else if (fl->fl_type == (F_INPROGRESS | F_UNLCK))
1411 /* 1405 /*
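
The only fl_mylease() implementation compared struct file pointers, so generic_setlease() can do that comparison inline and the callback goes away. A cut-down model of the scan it replaces (types reduced to the two fields that matter; names are illustrative):

#include <stdio.h>
#include <stddef.h>

struct file { int id; };

struct file_lock {
	struct file_lock *fl_next;
	struct file *fl_file;
};

/* A lease on this inode is "mine" iff it refers to the same open file. */
static struct file_lock **find_my_lease(struct file_lock **before,
					struct file *filp)
{
	struct file_lock *fl;

	for (; (fl = *before) != NULL; before = &fl->fl_next)
		if (fl->fl_file == filp)
			return before;	/* slot holding my lease */
	return NULL;
}

int main(void)
{
	struct file a = { 1 }, b = { 2 };
	struct file_lock l2 = { NULL, &b }, l1 = { &l2, &a };
	struct file_lock *head = &l1;
	struct file_lock **slot = find_my_lease(&head, &b);

	printf("found file %d\n", (*slot)->fl_file->id);	/* 2 */
	return 0;
}
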
diff --git a/fs/logfs/dev_bdev.c b/fs/logfs/dev_bdev.c
index 92ca6fbe09bd..723bc5bca09a 100644
--- a/fs/logfs/dev_bdev.c
+++ b/fs/logfs/dev_bdev.c
@@ -300,7 +300,7 @@ static int bdev_write_sb(struct super_block *sb, struct page *page)
300 300
301static void bdev_put_device(struct logfs_super *s) 301static void bdev_put_device(struct logfs_super *s)
302{ 302{
303 close_bdev_exclusive(s->s_bdev, FMODE_READ|FMODE_WRITE); 303 blkdev_put(s->s_bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
304} 304}
305 305
306static int bdev_can_write_buf(struct super_block *sb, u64 ofs) 306static int bdev_can_write_buf(struct super_block *sb, u64 ofs)
@@ -325,13 +325,14 @@ int logfs_get_sb_bdev(struct logfs_super *p, struct file_system_type *type,
325{ 325{
326 struct block_device *bdev; 326 struct block_device *bdev;
327 327
328 bdev = open_bdev_exclusive(devname, FMODE_READ|FMODE_WRITE, type); 328 bdev = blkdev_get_by_path(devname, FMODE_READ|FMODE_WRITE|FMODE_EXCL,
329 type);
329 if (IS_ERR(bdev)) 330 if (IS_ERR(bdev))
330 return PTR_ERR(bdev); 331 return PTR_ERR(bdev);
331 332
332 if (MAJOR(bdev->bd_dev) == MTD_BLOCK_MAJOR) { 333 if (MAJOR(bdev->bd_dev) == MTD_BLOCK_MAJOR) {
333 int mtdnr = MINOR(bdev->bd_dev); 334 int mtdnr = MINOR(bdev->bd_dev);
334 close_bdev_exclusive(bdev, FMODE_READ|FMODE_WRITE); 335 blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
335 return logfs_get_sb_mtd(p, mtdnr); 336 return logfs_get_sb_mtd(p, mtdnr);
336 } 337 }
337 338
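
Here the open_bdev_exclusive()/close_bdev_exclusive() pair becomes blkdev_get_by_path()/blkdev_put() with FMODE_EXCL folded into the mode mask; the same mask, FMODE_EXCL included, has to be passed back at put time or the exclusive claim is not dropped. A kernel-context sketch of the pairing (not a standalone program; the holder cookie is whatever pointer identifies the claimant, the filesystem type in the hunk above):

#include <linux/blkdev.h>
#include <linux/err.h>

static int open_backing_dev(const char *devname, void *holder,
			    struct block_device **out)
{
	struct block_device *bdev;

	/* FMODE_EXCL makes this an exclusive claim keyed on 'holder'. */
	bdev = blkdev_get_by_path(devname,
				  FMODE_READ | FMODE_WRITE | FMODE_EXCL,
				  holder);
	if (IS_ERR(bdev))
		return PTR_ERR(bdev);
	*out = bdev;
	return 0;
}

static void close_backing_dev(struct block_device *bdev)
{
	/* The mode mask must match the get, FMODE_EXCL included. */
	blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
}
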
diff --git a/fs/mbcache.c b/fs/mbcache.c
index 93444747237b..a25444ab2baf 100644
--- a/fs/mbcache.c
+++ b/fs/mbcache.c
@@ -76,18 +76,6 @@ EXPORT_SYMBOL(mb_cache_entry_find_first);
76EXPORT_SYMBOL(mb_cache_entry_find_next); 76EXPORT_SYMBOL(mb_cache_entry_find_next);
77#endif 77#endif
78 78
79struct mb_cache {
80 struct list_head c_cache_list;
81 const char *c_name;
82 atomic_t c_entry_count;
83 int c_max_entries;
84 int c_bucket_bits;
85 struct kmem_cache *c_entry_cache;
86 struct list_head *c_block_hash;
87 struct list_head *c_index_hash;
88};
89
90
91/* 79/*
92 * Global data: list of all mbcache's, lru list, and a spinlock for 80 * Global data: list of all mbcache's, lru list, and a spinlock for
93 * accessing cache data structures on SMP machines. The lru list is 81 * accessing cache data structures on SMP machines. The lru list is
diff --git a/fs/minix/namei.c b/fs/minix/namei.c
index 1b9e07728a9f..ce7337ddfdbf 100644
--- a/fs/minix/namei.c
+++ b/fs/minix/namei.c
@@ -23,8 +23,6 @@ static struct dentry *minix_lookup(struct inode * dir, struct dentry *dentry, st
23 struct inode * inode = NULL; 23 struct inode * inode = NULL;
24 ino_t ino; 24 ino_t ino;
25 25
26 d_set_d_op(dentry, dir->i_sb->s_root->d_op);
27
28 if (dentry->d_name.len > minix_sb(dir->i_sb)->s_namelen) 26 if (dentry->d_name.len > minix_sb(dir->i_sb)->s_namelen)
29 return ERR_PTR(-ENAMETOOLONG); 27 return ERR_PTR(-ENAMETOOLONG);
30 28
diff --git a/fs/mpage.c b/fs/mpage.c
index fd56ca2ea556..d78455a81ec9 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -40,7 +40,7 @@
40 * status of that page is hard. See end_buffer_async_read() for the details. 40 * status of that page is hard. See end_buffer_async_read() for the details.
41 * There is no point in duplicating all that complexity. 41 * There is no point in duplicating all that complexity.
42 */ 42 */
43static void mpage_end_io_read(struct bio *bio, int err) 43static void mpage_end_io(struct bio *bio, int err)
44{ 44{
45 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 45 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
46 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; 46 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
@@ -50,44 +50,29 @@ static void mpage_end_io_read(struct bio *bio, int err)
50 50
51 if (--bvec >= bio->bi_io_vec) 51 if (--bvec >= bio->bi_io_vec)
52 prefetchw(&bvec->bv_page->flags); 52 prefetchw(&bvec->bv_page->flags);
53 53 if (bio_data_dir(bio) == READ) {
54 if (uptodate) { 54 if (uptodate) {
55 SetPageUptodate(page); 55 SetPageUptodate(page);
56 } else { 56 } else {
57 ClearPageUptodate(page); 57 ClearPageUptodate(page);
58 SetPageError(page); 58 SetPageError(page);
59 } 59 }
60 unlock_page(page); 60 unlock_page(page);
61 } while (bvec >= bio->bi_io_vec); 61 } else { /* bio_data_dir(bio) == WRITE */
62 bio_put(bio); 62 if (!uptodate) {
63} 63 SetPageError(page);
64 64 if (page->mapping)
65static void mpage_end_io_write(struct bio *bio, int err) 65 set_bit(AS_EIO, &page->mapping->flags);
66{ 66 }
67 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 67 end_page_writeback(page);
68 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
69
70 do {
71 struct page *page = bvec->bv_page;
72
73 if (--bvec >= bio->bi_io_vec)
74 prefetchw(&bvec->bv_page->flags);
75
76 if (!uptodate){
77 SetPageError(page);
78 if (page->mapping)
79 set_bit(AS_EIO, &page->mapping->flags);
80 } 68 }
81 end_page_writeback(page);
82 } while (bvec >= bio->bi_io_vec); 69 } while (bvec >= bio->bi_io_vec);
83 bio_put(bio); 70 bio_put(bio);
84} 71}
85 72
86static struct bio *mpage_bio_submit(int rw, struct bio *bio) 73static struct bio *mpage_bio_submit(int rw, struct bio *bio)
87{ 74{
88 bio->bi_end_io = mpage_end_io_read; 75 bio->bi_end_io = mpage_end_io;
89 if (rw == WRITE)
90 bio->bi_end_io = mpage_end_io_write;
91 submit_bio(rw, bio); 76 submit_bio(rw, bio);
92 return NULL; 77 return NULL;
93} 78}
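
mpage_end_io_read() and mpage_end_io_write() walked the bio vector identically and differed only in the per-page action, so they collapse into one completion keyed on bio_data_dir(). A userspace model of the per-page branch (the page_state struct is a stand-in for the real page flags):

#include <stdbool.h>
#include <stdio.h>

enum dir { DIR_READ, DIR_WRITE };

struct page_state { bool uptodate, error, locked, writeback; };

static void end_io_one(struct page_state *pg, enum dir d, bool ok)
{
	if (d == DIR_READ) {
		pg->uptodate = ok;	/* SetPageUptodate / ClearPageUptodate */
		pg->error = !ok;
		pg->locked = false;	/* unlock_page() */
	} else {
		if (!ok)
			pg->error = true;	/* SetPageError + AS_EIO */
		pg->writeback = false;		/* end_page_writeback() */
	}
}

int main(void)
{
	struct page_state pg = { false, false, true, false };

	end_io_one(&pg, DIR_READ, true);
	printf("uptodate=%d locked=%d\n", pg.uptodate, pg.locked);
	return 0;
}
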
diff --git a/fs/namei.c b/fs/namei.c
index 19433cdba011..7d77f24d32a9 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -202,7 +202,7 @@ static int acl_permission_check(struct inode *inode, int mask, unsigned int flag
202 * @inode: inode to check access rights for 202 * @inode: inode to check access rights for
203 * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC) 203 * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
204 * @check_acl: optional callback to check for Posix ACLs 204 * @check_acl: optional callback to check for Posix ACLs
205 * @flags IPERM_FLAG_ flags. 205 * @flags: IPERM_FLAG_ flags.
206 * 206 *
207 * Used to check for read/write/execute permissions on a file. 207 * Used to check for read/write/execute permissions on a file.
208 * We use "fsuid" for this, letting us set arbitrary permissions 208 * We use "fsuid" for this, letting us set arbitrary permissions
@@ -368,18 +368,6 @@ void path_get(struct path *path)
368EXPORT_SYMBOL(path_get); 368EXPORT_SYMBOL(path_get);
369 369
370/** 370/**
371 * path_get_long - get a long reference to a path
372 * @path: path to get the reference to
373 *
374 * Given a path increment the reference count to the dentry and the vfsmount.
375 */
376void path_get_long(struct path *path)
377{
378 mntget_long(path->mnt);
379 dget(path->dentry);
380}
381
382/**
383 * path_put - put a reference to a path 371 * path_put - put a reference to a path
384 * @path: path to put the reference to 372 * @path: path to put the reference to
385 * 373 *
@@ -393,21 +381,9 @@ void path_put(struct path *path)
393EXPORT_SYMBOL(path_put); 381EXPORT_SYMBOL(path_put);
394 382
395/** 383/**
396 * path_put_long - put a long reference to a path
397 * @path: path to put the reference to
398 *
399 * Given a path decrement the reference count to the dentry and the vfsmount.
400 */
401void path_put_long(struct path *path)
402{
403 dput(path->dentry);
404 mntput_long(path->mnt);
405}
406
407/**
408 * nameidata_drop_rcu - drop this nameidata out of rcu-walk 384 * nameidata_drop_rcu - drop this nameidata out of rcu-walk
409 * @nd: nameidata pathwalk data to drop 385 * @nd: nameidata pathwalk data to drop
410 * @Returns: 0 on success, -ECHLID on failure 386 * Returns: 0 on success, -ECHILD on failure
411 * 387 *
412 * Path walking has 2 modes, rcu-walk and ref-walk (see 388 * Path walking has 2 modes, rcu-walk and ref-walk (see
413 * Documentation/filesystems/path-lookup.txt). __drop_rcu* functions attempt 389 * Documentation/filesystems/path-lookup.txt). __drop_rcu* functions attempt
@@ -468,7 +444,7 @@ static inline int nameidata_drop_rcu_maybe(struct nameidata *nd)
468 * nameidata_dentry_drop_rcu - drop nameidata and dentry out of rcu-walk 444 * nameidata_dentry_drop_rcu - drop nameidata and dentry out of rcu-walk
469 * @nd: nameidata pathwalk data to drop 445 * @nd: nameidata pathwalk data to drop
470 * @dentry: dentry to drop 446 * @dentry: dentry to drop
471 * @Returns: 0 on success, -ECHLID on failure 447 * Returns: 0 on success, -ECHILD on failure
472 * 448 *
473 * nameidata_dentry_drop_rcu attempts to drop the current nd->path and nd->root, 449 * nameidata_dentry_drop_rcu attempts to drop the current nd->path and nd->root,
474 * and dentry into ref-walk. @dentry must be a path found by a do_lookup call on 450 * and dentry into ref-walk. @dentry must be a path found by a do_lookup call on
@@ -479,6 +455,14 @@ static int nameidata_dentry_drop_rcu(struct nameidata *nd, struct dentry *dentry
479 struct fs_struct *fs = current->fs; 455 struct fs_struct *fs = current->fs;
480 struct dentry *parent = nd->path.dentry; 456 struct dentry *parent = nd->path.dentry;
481 457
458 /*
459 * It can be possible to revalidate the dentry that we started
460 * the path walk with. force_reval_path may also revalidate the
461 * dentry already committed to the nameidata.
462 */
463 if (unlikely(parent == dentry))
464 return nameidata_drop_rcu(nd);
465
482 BUG_ON(!(nd->flags & LOOKUP_RCU)); 466 BUG_ON(!(nd->flags & LOOKUP_RCU));
483 if (nd->root.mnt) { 467 if (nd->root.mnt) {
484 spin_lock(&fs->lock); 468 spin_lock(&fs->lock);
@@ -530,7 +514,7 @@ static inline int nameidata_dentry_drop_rcu_maybe(struct nameidata *nd, struct d
530/** 514/**
531 * nameidata_drop_rcu_last - drop nameidata ending path walk out of rcu-walk 515 * nameidata_drop_rcu_last - drop nameidata ending path walk out of rcu-walk
532 * @nd: nameidata pathwalk data to drop 516 * @nd: nameidata pathwalk data to drop
533 * @Returns: 0 on success, -ECHLID on failure 517 * Returns: 0 on success, -ECHILD on failure
534 * 518 *
535 * nameidata_drop_rcu_last attempts to drop the current nd->path into ref-walk. 519 * nameidata_drop_rcu_last attempts to drop the current nd->path into ref-walk.
536 * nd->path should be the final element of the lookup, so nd->root is discarded. 520 * nd->path should be the final element of the lookup, so nd->root is discarded.
@@ -583,6 +567,13 @@ void release_open_intent(struct nameidata *nd)
583 fput(nd->intent.open.file); 567 fput(nd->intent.open.file);
584} 568}
585 569
570/*
571 * Call d_revalidate and handle filesystems that request rcu-walk
572 * to be dropped. This may be called and return in rcu-walk mode,
573 * regardless of success or error. If -ECHILD is returned, the caller
574 * must return -ECHILD back up the path walk stack so path walk may
575 * be restarted in ref-walk mode.
576 */
586static int d_revalidate(struct dentry *dentry, struct nameidata *nd) 577static int d_revalidate(struct dentry *dentry, struct nameidata *nd)
587{ 578{
588 int status; 579 int status;
@@ -673,6 +664,9 @@ force_reval_path(struct path *path, struct nameidata *nd)
673 return 0; 664 return 0;
674 665
675 if (!status) { 666 if (!status) {
667 /* Don't d_invalidate in rcu-walk mode */
668 if (nameidata_drop_rcu(nd))
669 return -ECHILD;
676 d_invalidate(dentry); 670 d_invalidate(dentry);
677 status = -ESTALE; 671 status = -ESTALE;
678 } 672 }
@@ -761,7 +755,8 @@ static void path_put_conditional(struct path *path, struct nameidata *nd)
761 mntput(path->mnt); 755 mntput(path->mnt);
762} 756}
763 757
764static inline void path_to_nameidata(struct path *path, struct nameidata *nd) 758static inline void path_to_nameidata(const struct path *path,
759 struct nameidata *nd)
765{ 760{
766 if (!(nd->flags & LOOKUP_RCU)) { 761 if (!(nd->flags & LOOKUP_RCU)) {
767 dput(nd->path.dentry); 762 dput(nd->path.dentry);
@@ -773,20 +768,16 @@ static inline void path_to_nameidata(struct path *path, struct nameidata *nd)
773} 768}
774 769
775static __always_inline int 770static __always_inline int
776__do_follow_link(struct path *path, struct nameidata *nd, void **p) 771__do_follow_link(const struct path *link, struct nameidata *nd, void **p)
777{ 772{
778 int error; 773 int error;
779 struct dentry *dentry = path->dentry; 774 struct dentry *dentry = link->dentry;
780 775
781 touch_atime(path->mnt, dentry); 776 touch_atime(link->mnt, dentry);
782 nd_set_link(nd, NULL); 777 nd_set_link(nd, NULL);
783 778
784 if (path->mnt != nd->path.mnt) { 779 if (link->mnt == nd->path.mnt)
785 path_to_nameidata(path, nd); 780 mntget(link->mnt);
786 nd->inode = nd->path.dentry->d_inode;
787 dget(dentry);
788 }
789 mntget(path->mnt);
790 781
791 nd->last_type = LAST_BIND; 782 nd->last_type = LAST_BIND;
792 *p = dentry->d_inode->i_op->follow_link(dentry, nd); 783 *p = dentry->d_inode->i_op->follow_link(dentry, nd);
@@ -877,54 +868,148 @@ int follow_up(struct path *path)
877} 868}
878 869
879/* 870/*
880 * serialization is taken care of in namespace.c 871 * Perform an automount
872 * - return -EISDIR to tell follow_managed() to stop and return the path we
873 * were called with.
881 */ 874 */
882static void __follow_mount_rcu(struct nameidata *nd, struct path *path, 875static int follow_automount(struct path *path, unsigned flags,
883 struct inode **inode) 876 bool *need_mntput)
884{ 877{
885 while (d_mountpoint(path->dentry)) { 878 struct vfsmount *mnt;
886 struct vfsmount *mounted; 879 int err;
887 mounted = __lookup_mnt(path->mnt, path->dentry, 1); 880
888 if (!mounted) 881 if (!path->dentry->d_op || !path->dentry->d_op->d_automount)
889 return; 882 return -EREMOTE;
890 path->mnt = mounted; 883
891 path->dentry = mounted->mnt_root; 884 /* We don't want to mount if someone supplied AT_NO_AUTOMOUNT
892 nd->seq = read_seqcount_begin(&path->dentry->d_seq); 885 * and this is the terminal part of the path.
893 *inode = path->dentry->d_inode; 886 */
887 if ((flags & LOOKUP_NO_AUTOMOUNT) && !(flags & LOOKUP_CONTINUE))
888 return -EISDIR; /* we actually want to stop here */
889
890 /* We want to mount if someone is trying to open/create a file of any
891 * type under the mountpoint, wants to traverse through the mountpoint
892 * or wants to open the mounted directory.
893 *
894 * We don't want to mount if someone's just doing a stat and they've
895 * set AT_SYMLINK_NOFOLLOW - unless they're stat'ing a directory and
896 * appended a '/' to the name.
897 */
898 if (!(flags & LOOKUP_FOLLOW) &&
899 !(flags & (LOOKUP_CONTINUE | LOOKUP_DIRECTORY |
900 LOOKUP_OPEN | LOOKUP_CREATE)))
901 return -EISDIR;
902
903 current->total_link_count++;
904 if (current->total_link_count >= 40)
905 return -ELOOP;
906
907 mnt = path->dentry->d_op->d_automount(path);
908 if (IS_ERR(mnt)) {
909 /*
910 * The filesystem is allowed to return -EISDIR here to indicate
911 * it doesn't want to automount. For instance, autofs would do
912 * this so that its userspace daemon can mount on this dentry.
913 *
914 * However, we can only permit this if it's a terminal point in
915 * the path being looked up; if it wasn't then the remainder of
916 * the path is inaccessible and we should say so.
917 */
918 if (PTR_ERR(mnt) == -EISDIR && (flags & LOOKUP_CONTINUE))
919 return -EREMOTE;
920 return PTR_ERR(mnt);
894 } 921 }
895}
896 922
897static int __follow_mount(struct path *path) 923 if (!mnt) /* mount collision */
898{ 924 return 0;
899 int res = 0; 925
900 while (d_mountpoint(path->dentry)) { 926 err = finish_automount(mnt, path);
901 struct vfsmount *mounted = lookup_mnt(path); 927
902 if (!mounted) 928 switch (err) {
903 break; 929 case -EBUSY:
930 /* Someone else made a mount here whilst we were busy */
931 return 0;
932 case 0:
904 dput(path->dentry); 933 dput(path->dentry);
905 if (res) 934 if (*need_mntput)
906 mntput(path->mnt); 935 mntput(path->mnt);
907 path->mnt = mounted; 936 path->mnt = mnt;
908 path->dentry = dget(mounted->mnt_root); 937 path->dentry = dget(mnt->mnt_root);
909 res = 1; 938 *need_mntput = true;
939 return 0;
940 default:
941 return err;
910 } 942 }
911 return res; 943
912} 944}
913 945
914static void follow_mount(struct path *path) 946/*
947 * Handle a dentry that is managed in some way.
948 * - Flagged for transit management (autofs)
949 * - Flagged as mountpoint
950 * - Flagged as automount point
951 *
952 * This may only be called in refwalk mode.
953 *
954 * Serialization is taken care of in namespace.c
955 */
956static int follow_managed(struct path *path, unsigned flags)
915{ 957{
916 while (d_mountpoint(path->dentry)) { 958 unsigned managed;
917 struct vfsmount *mounted = lookup_mnt(path); 959 bool need_mntput = false;
918 if (!mounted) 960 int ret;
919 break; 961
920 dput(path->dentry); 962 /* Given that we're not holding a lock here, we retain the value in a
921 mntput(path->mnt); 963 * local variable for each dentry as we look at it so that we don't see
922 path->mnt = mounted; 964 * the components of that value change under us */
923 path->dentry = dget(mounted->mnt_root); 965 while (managed = ACCESS_ONCE(path->dentry->d_flags),
966 managed &= DCACHE_MANAGED_DENTRY,
967 unlikely(managed != 0)) {
968 /* Allow the filesystem to manage the transit without i_mutex
969 * being held. */
970 if (managed & DCACHE_MANAGE_TRANSIT) {
971 BUG_ON(!path->dentry->d_op);
972 BUG_ON(!path->dentry->d_op->d_manage);
973 ret = path->dentry->d_op->d_manage(path->dentry,
974 false, false);
975 if (ret < 0)
976 return ret == -EISDIR ? 0 : ret;
977 }
978
979 /* Transit to a mounted filesystem. */
980 if (managed & DCACHE_MOUNTED) {
981 struct vfsmount *mounted = lookup_mnt(path);
982 if (mounted) {
983 dput(path->dentry);
984 if (need_mntput)
985 mntput(path->mnt);
986 path->mnt = mounted;
987 path->dentry = dget(mounted->mnt_root);
988 need_mntput = true;
989 continue;
990 }
991
992 /* Something is mounted on this dentry in another
993 * namespace and/or whatever was mounted there in this
994 * namespace got unmounted before we managed to get the
995 * vfsmount_lock */
996 }
997
998 /* Handle an automount point */
999 if (managed & DCACHE_NEED_AUTOMOUNT) {
1000 ret = follow_automount(path, flags, &need_mntput);
1001 if (ret < 0)
1002 return ret == -EISDIR ? 0 : ret;
1003 continue;
1004 }
1005
1006 /* We didn't change the current path point */
1007 break;
924 } 1008 }
1009 return 0;
925} 1010}
926 1011
927int follow_down(struct path *path) 1012int follow_down_one(struct path *path)
928{ 1013{
929 struct vfsmount *mounted; 1014 struct vfsmount *mounted;
930 1015
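
Most of follow_automount() above is policy: decide whether this lookup should trigger d_automount() at all, and reuse the symlink loop counter as the recursion limit. The trigger policy can be modelled as a pure predicate (the flag values below are illustrative, not the kernel's):

#include <stdbool.h>
#include <stdio.h>

/* Illustrative flag values. */
#define LOOKUP_FOLLOW		0x0001
#define LOOKUP_DIRECTORY	0x0002
#define LOOKUP_CONTINUE		0x0004
#define LOOKUP_OPEN		0x0100
#define LOOKUP_CREATE		0x0200
#define LOOKUP_NO_AUTOMOUNT	0x0800

static bool should_automount(unsigned flags)
{
	/* AT_NO_AUTOMOUNT on the terminal component: caller said stop. */
	if ((flags & LOOKUP_NO_AUTOMOUNT) && !(flags & LOOKUP_CONTINUE))
		return false;
	/* A bare stat-like lookup of the last component: don't mount. */
	if (!(flags & LOOKUP_FOLLOW) &&
	    !(flags & (LOOKUP_CONTINUE | LOOKUP_DIRECTORY |
		       LOOKUP_OPEN | LOOKUP_CREATE)))
		return false;
	return true;
}

int main(void)
{
	printf("stat: %d, open: %d\n",
	       should_automount(0), should_automount(LOOKUP_OPEN));
	return 0;
}
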
@@ -939,13 +1024,41 @@ int follow_down(struct path *path)
939 return 0; 1024 return 0;
940} 1025}
941 1026
1027/*
1028 * Skip to top of mountpoint pile in rcuwalk mode. We abort the rcu-walk if we
1029 * meet a managed dentry and we're not walking to "..". True is returned to
1030 * continue, false to abort.
1031 */
1032static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
1033 struct inode **inode, bool reverse_transit)
1034{
1035 while (d_mountpoint(path->dentry)) {
1036 struct vfsmount *mounted;
1037 if (unlikely(path->dentry->d_flags & DCACHE_MANAGE_TRANSIT) &&
1038 !reverse_transit &&
1039 path->dentry->d_op->d_manage(path->dentry, false, true) < 0)
1040 return false;
1041 mounted = __lookup_mnt(path->mnt, path->dentry, 1);
1042 if (!mounted)
1043 break;
1044 path->mnt = mounted;
1045 path->dentry = mounted->mnt_root;
1046 nd->seq = read_seqcount_begin(&path->dentry->d_seq);
1047 *inode = path->dentry->d_inode;
1048 }
1049
1050 if (unlikely(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT))
1051 return reverse_transit;
1052 return true;
1053}
1054
942static int follow_dotdot_rcu(struct nameidata *nd) 1055static int follow_dotdot_rcu(struct nameidata *nd)
943{ 1056{
944 struct inode *inode = nd->inode; 1057 struct inode *inode = nd->inode;
945 1058
946 set_root_rcu(nd); 1059 set_root_rcu(nd);
947 1060
948 while(1) { 1061 while (1) {
949 if (nd->path.dentry == nd->root.dentry && 1062 if (nd->path.dentry == nd->root.dentry &&
950 nd->path.mnt == nd->root.mnt) { 1063 nd->path.mnt == nd->root.mnt) {
951 break; 1064 break;
@@ -968,12 +1081,80 @@ static int follow_dotdot_rcu(struct nameidata *nd)
968 nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq); 1081 nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
969 inode = nd->path.dentry->d_inode; 1082 inode = nd->path.dentry->d_inode;
970 } 1083 }
971 __follow_mount_rcu(nd, &nd->path, &inode); 1084 __follow_mount_rcu(nd, &nd->path, &inode, true);
972 nd->inode = inode; 1085 nd->inode = inode;
973 1086
974 return 0; 1087 return 0;
975} 1088}
976 1089
1090/*
1091 * Follow down to the covering mount currently visible to userspace. At each
1092 * point, the filesystem owning that dentry may be queried as to whether the
1093 * caller is permitted to proceed or not.
1094 *
1095 * Care must be taken as namespace_sem may be held (indicated by mounting_here
1096 * being true).
1097 */
1098int follow_down(struct path *path, bool mounting_here)
1099{
1100 unsigned managed;
1101 int ret;
1102
1103 while (managed = ACCESS_ONCE(path->dentry->d_flags),
1104 unlikely(managed & DCACHE_MANAGED_DENTRY)) {
1105 /* Allow the filesystem to manage the transit without i_mutex
1106 * being held.
1107 *
1108 * We indicate to the filesystem if someone is trying to mount
1109 * something here. This gives autofs the chance to deny anyone
1110 * other than its daemon the right to mount on its
1111 * superstructure.
1112 *
1113 * The filesystem may sleep at this point.
1114 */
1115 if (managed & DCACHE_MANAGE_TRANSIT) {
1116 BUG_ON(!path->dentry->d_op);
1117 BUG_ON(!path->dentry->d_op->d_manage);
1118 ret = path->dentry->d_op->d_manage(
1119 path->dentry, mounting_here, false);
1120 if (ret < 0)
1121 return ret == -EISDIR ? 0 : ret;
1122 }
1123
1124 /* Transit to a mounted filesystem. */
1125 if (managed & DCACHE_MOUNTED) {
1126 struct vfsmount *mounted = lookup_mnt(path);
1127 if (!mounted)
1128 break;
1129 dput(path->dentry);
1130 mntput(path->mnt);
1131 path->mnt = mounted;
1132 path->dentry = dget(mounted->mnt_root);
1133 continue;
1134 }
1135
1136 /* Don't handle automount points here */
1137 break;
1138 }
1139 return 0;
1140}
1141
1142/*
1143 * Skip to top of mountpoint pile in refwalk mode for follow_dotdot()
1144 */
1145static void follow_mount(struct path *path)
1146{
1147 while (d_mountpoint(path->dentry)) {
1148 struct vfsmount *mounted = lookup_mnt(path);
1149 if (!mounted)
1150 break;
1151 dput(path->dentry);
1152 mntput(path->mnt);
1153 path->mnt = mounted;
1154 path->dentry = dget(mounted->mnt_root);
1155 }
1156}
1157
977static void follow_dotdot(struct nameidata *nd) 1158static void follow_dotdot(struct nameidata *nd)
978{ 1159{
979 set_root(nd); 1160 set_root(nd);
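
follow_managed() and the new follow_down() share one discipline: read d_flags once per pass (ACCESS_ONCE) so the three managed bits are tested against a consistent snapshot, then handle transit, mountpoint, and automount in that order, looping until nothing moves the path. A simplified userspace model (flag values and helpers are illustrative; the real code also distinguishes the error returns and re-loops after an automount collision, where this sketch just breaks):

#include <stdatomic.h>

/* Illustrative flag values; the kernel's DCACHE_* bits differ. */
#define DCACHE_MOUNTED		0x1
#define DCACHE_NEED_AUTOMOUNT	0x2
#define DCACHE_MANAGE_TRANSIT	0x4
#define DCACHE_MANAGED_DENTRY \
	(DCACHE_MOUNTED | DCACHE_NEED_AUTOMOUNT | DCACHE_MANAGE_TRANSIT)

struct dentry { _Atomic unsigned d_flags; };

static int transit(struct dentry *d) { (void)d; return 0; }
static int cross_mount(struct dentry **d) { (void)d; return 0; }	/* 1 = crossed */
static int automount(struct dentry **d) { (void)d; return 0; }		/* 1 = mounted */

static int walk_managed(struct dentry **dp)
{
	unsigned managed;

	/* One snapshot per pass: don't chase d_flags as it changes. */
	while (managed = atomic_load(&(*dp)->d_flags),
	       (managed &= DCACHE_MANAGED_DENTRY) != 0) {
		if (managed & DCACHE_MANAGE_TRANSIT) {
			int ret = transit(*dp);	/* fs may pause or veto */
			if (ret < 0)
				return ret;
		}
		if ((managed & DCACHE_MOUNTED) && cross_mount(dp))
			continue;	/* now at the mounted root: re-check */
		if ((managed & DCACHE_NEED_AUTOMOUNT) && automount(dp))
			continue;	/* new mount appeared: re-check */
		break;			/* nothing changed the path point */
	}
	return 0;
}
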
@@ -1038,12 +1219,14 @@ static int do_lookup(struct nameidata *nd, struct qstr *name,
1038 struct vfsmount *mnt = nd->path.mnt; 1219 struct vfsmount *mnt = nd->path.mnt;
1039 struct dentry *dentry, *parent = nd->path.dentry; 1220 struct dentry *dentry, *parent = nd->path.dentry;
1040 struct inode *dir; 1221 struct inode *dir;
1222 int err;
1223
1041 /* 1224 /*
1042 * See if the low-level filesystem might want 1225 * See if the low-level filesystem might want
1043 * to use its own hash.. 1226 * to use its own hash..
1044 */ 1227 */
1045 if (unlikely(parent->d_flags & DCACHE_OP_HASH)) { 1228 if (unlikely(parent->d_flags & DCACHE_OP_HASH)) {
1046 int err = parent->d_op->d_hash(parent, nd->inode, name); 1229 err = parent->d_op->d_hash(parent, nd->inode, name);
1047 if (err < 0) 1230 if (err < 0)
1048 return err; 1231 return err;
1049 } 1232 }
@@ -1070,22 +1253,30 @@ static int do_lookup(struct nameidata *nd, struct qstr *name,
1070 nd->seq = seq; 1253 nd->seq = seq;
1071 if (dentry->d_flags & DCACHE_OP_REVALIDATE) 1254 if (dentry->d_flags & DCACHE_OP_REVALIDATE)
1072 goto need_revalidate; 1255 goto need_revalidate;
1256done2:
1073 path->mnt = mnt; 1257 path->mnt = mnt;
1074 path->dentry = dentry; 1258 path->dentry = dentry;
1075 __follow_mount_rcu(nd, path, inode); 1259 if (likely(__follow_mount_rcu(nd, path, inode, false)))
1076 } else { 1260 return 0;
1077 dentry = __d_lookup(parent, name); 1261 if (nameidata_drop_rcu(nd))
1078 if (!dentry) 1262 return -ECHILD;
1079 goto need_lookup; 1263 /* fallthru */
1264 }
1265 dentry = __d_lookup(parent, name);
1266 if (!dentry)
1267 goto need_lookup;
1080found: 1268found:
1081 if (dentry->d_flags & DCACHE_OP_REVALIDATE) 1269 if (dentry->d_flags & DCACHE_OP_REVALIDATE)
1082 goto need_revalidate; 1270 goto need_revalidate;
1083done: 1271done:
1084 path->mnt = mnt; 1272 path->mnt = mnt;
1085 path->dentry = dentry; 1273 path->dentry = dentry;
1086 __follow_mount(path); 1274 err = follow_managed(path, nd->flags);
1087 *inode = path->dentry->d_inode; 1275 if (unlikely(err < 0)) {
1088 } 1276 path_put_conditional(path, nd);
1277 return err;
1278 }
1279 *inode = path->dentry->d_inode;
1089 return 0; 1280 return 0;
1090 1281
1091need_lookup: 1282need_lookup:
@@ -1124,6 +1315,8 @@ need_revalidate:
1124 goto need_lookup; 1315 goto need_lookup;
1125 if (IS_ERR(dentry)) 1316 if (IS_ERR(dentry))
1126 goto fail; 1317 goto fail;
1318 if (nd->flags & LOOKUP_RCU)
1319 goto done2;
1127 goto done; 1320 goto done;
1128 1321
1129fail: 1322fail:
@@ -1131,17 +1324,6 @@ fail:
1131} 1324}
1132 1325
1133/* 1326/*
1134 * This is a temporary kludge to deal with "automount" symlinks; proper
1135 * solution is to trigger them on follow_mount(), so that do_lookup()
1136 * would DTRT. To be killed before 2.6.34-final.
1137 */
1138static inline int follow_on_final(struct inode *inode, unsigned lookup_flags)
1139{
1140 return inode && unlikely(inode->i_op->follow_link) &&
1141 ((lookup_flags & LOOKUP_FOLLOW) || S_ISDIR(inode->i_mode));
1142}
1143
1144/*
1145 * Name resolution. 1327 * Name resolution.
1146 * This is the basic name resolution function, turning a pathname into 1328 * This is the basic name resolution function, turning a pathname into
1147 * the final dentry. We expect 'base' to be positive and a directory. 1329 * the final dentry. We expect 'base' to be positive and a directory.
@@ -1279,7 +1461,8 @@ last_component:
1279 err = do_lookup(nd, &this, &next, &inode); 1461 err = do_lookup(nd, &this, &next, &inode);
1280 if (err) 1462 if (err)
1281 break; 1463 break;
1282 if (follow_on_final(inode, lookup_flags)) { 1464 if (inode && unlikely(inode->i_op->follow_link) &&
1465 (lookup_flags & LOOKUP_FOLLOW)) {
1283 if (nameidata_dentry_drop_rcu_maybe(nd, next.dentry)) 1466 if (nameidata_dentry_drop_rcu_maybe(nd, next.dentry))
1284 return -ECHILD; 1467 return -ECHILD;
1285 BUG_ON(inode != next.dentry->d_inode); 1468 BUG_ON(inode != next.dentry->d_inode);
@@ -1950,8 +2133,9 @@ int may_open(struct path *path, int acc_mode, int flag)
1950 return break_lease(inode, flag); 2133 return break_lease(inode, flag);
1951} 2134}
1952 2135
1953static int handle_truncate(struct path *path) 2136static int handle_truncate(struct file *filp)
1954{ 2137{
2138 struct path *path = &filp->f_path;
1955 struct inode *inode = path->dentry->d_inode; 2139 struct inode *inode = path->dentry->d_inode;
1956 int error = get_write_access(inode); 2140 int error = get_write_access(inode);
1957 if (error) 2141 if (error)
@@ -1965,7 +2149,7 @@ static int handle_truncate(struct path *path)
1965 if (!error) { 2149 if (!error) {
1966 error = do_truncate(path->dentry, 0, 2150 error = do_truncate(path->dentry, 0,
1967 ATTR_MTIME|ATTR_CTIME|ATTR_OPEN, 2151 ATTR_MTIME|ATTR_CTIME|ATTR_OPEN,
1968 NULL); 2152 filp);
1969 } 2153 }
1970 put_write_access(inode); 2154 put_write_access(inode);
1971 return error; 2155 return error;
@@ -2063,7 +2247,7 @@ static struct file *finish_open(struct nameidata *nd,
2063 } 2247 }
2064 if (!IS_ERR(filp)) { 2248 if (!IS_ERR(filp)) {
2065 if (will_truncate) { 2249 if (will_truncate) {
2066 error = handle_truncate(&nd->path); 2250 error = handle_truncate(filp);
2067 if (error) { 2251 if (error) {
2068 fput(filp); 2252 fput(filp);
2069 filp = ERR_PTR(error); 2253 filp = ERR_PTR(error);
@@ -2104,11 +2288,13 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
2104 dir = nd->path.dentry; 2288 dir = nd->path.dentry;
2105 case LAST_DOT: 2289 case LAST_DOT:
2106 if (need_reval_dot(dir)) { 2290 if (need_reval_dot(dir)) {
2107 error = d_revalidate(nd->path.dentry, nd); 2291 int status = d_revalidate(nd->path.dentry, nd);
2108 if (!error) 2292 if (!status)
2109 error = -ESTALE; 2293 status = -ESTALE;
2110 if (error < 0) 2294 if (status < 0) {
2295 error = status;
2111 goto exit; 2296 goto exit;
2297 }
2112 } 2298 }
2113 /* fallthrough */ 2299 /* fallthrough */
2114 case LAST_ROOT: 2300 case LAST_ROOT:
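
The LAST_DOT change above looks cosmetic but guards a convention: d_revalidate() returns positive for a valid dentry, 0 for invalid, or a negative errno, so folding its result straight into `error` could leave a positive "valid" value behind after the check. Keeping it in a local `status` means `error` only ever carries a failure. A small model of the corrected mapping (reval_status_to_error is an illustrative name):

#include <errno.h>

/* d_revalidate(): >0 valid, 0 invalid, <0 hard error. */
static int reval_status_to_error(int status)
{
	if (!status)
		status = -ESTALE;	/* invalid counts as stale */
	return status < 0 ? status : 0;	/* positive means "proceed" */
}
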
@@ -2178,11 +2364,9 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
2178 if (open_flag & O_EXCL) 2364 if (open_flag & O_EXCL)
2179 goto exit_dput; 2365 goto exit_dput;
2180 2366
2181 if (__follow_mount(path)) { 2367 error = follow_managed(path, nd->flags);
2182 error = -ELOOP; 2368 if (error < 0)
2183 if (open_flag & O_NOFOLLOW) 2369 goto exit_dput;
2184 goto exit_dput;
2185 }
2186 2370
2187 error = -ENOENT; 2371 error = -ENOENT;
2188 if (!path->dentry->d_inode) 2372 if (!path->dentry->d_inode)
@@ -2327,11 +2511,11 @@ reval:
2327 nd.flags = flags; 2511 nd.flags = flags;
2328 filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname); 2512 filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname);
2329 while (unlikely(!filp)) { /* trailing symlink */ 2513 while (unlikely(!filp)) { /* trailing symlink */
2330 struct path holder; 2514 struct path link = path;
2515 struct inode *linki = link.dentry->d_inode;
2331 void *cookie; 2516 void *cookie;
2332 error = -ELOOP; 2517 error = -ELOOP;
2333 /* S_ISDIR part is a temporary automount kludge */ 2518 if (!(nd.flags & LOOKUP_FOLLOW))
2334 if (!(nd.flags & LOOKUP_FOLLOW) && !S_ISDIR(nd.inode->i_mode))
2335 goto exit_dput; 2519 goto exit_dput;
2336 if (count++ == 32) 2520 if (count++ == 32)
2337 goto exit_dput; 2521 goto exit_dput;
@@ -2347,23 +2531,22 @@ reval:
2347 * just set LAST_BIND. 2531 * just set LAST_BIND.
2348 */ 2532 */
2349 nd.flags |= LOOKUP_PARENT; 2533 nd.flags |= LOOKUP_PARENT;
2350 error = security_inode_follow_link(path.dentry, &nd); 2534 error = security_inode_follow_link(link.dentry, &nd);
2351 if (error) 2535 if (error)
2352 goto exit_dput; 2536 goto exit_dput;
2353 error = __do_follow_link(&path, &nd, &cookie); 2537 error = __do_follow_link(&link, &nd, &cookie);
2354 if (unlikely(error)) { 2538 if (unlikely(error)) {
2355 if (!IS_ERR(cookie) && nd.inode->i_op->put_link) 2539 if (!IS_ERR(cookie) && linki->i_op->put_link)
2356 nd.inode->i_op->put_link(path.dentry, &nd, cookie); 2540 linki->i_op->put_link(link.dentry, &nd, cookie);
2357 /* nd.path had been dropped */ 2541 /* nd.path had been dropped */
2358 nd.path = path; 2542 nd.path = link;
2359 goto out_path; 2543 goto out_path;
2360 } 2544 }
2361 holder = path;
2362 nd.flags &= ~LOOKUP_PARENT; 2545 nd.flags &= ~LOOKUP_PARENT;
2363 filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname); 2546 filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname);
2364 if (nd.inode->i_op->put_link) 2547 if (linki->i_op->put_link)
2365 nd.inode->i_op->put_link(holder.dentry, &nd, cookie); 2548 linki->i_op->put_link(link.dentry, &nd, cookie);
2366 path_put(&holder); 2549 path_put(&link);
2367 } 2550 }
2368out: 2551out:
2369 if (nd.root.mnt) 2552 if (nd.root.mnt)
@@ -3391,6 +3574,7 @@ const struct inode_operations page_symlink_inode_operations = {
3391}; 3574};
3392 3575
3393EXPORT_SYMBOL(user_path_at); 3576EXPORT_SYMBOL(user_path_at);
3577EXPORT_SYMBOL(follow_down_one);
3394EXPORT_SYMBOL(follow_down); 3578EXPORT_SYMBOL(follow_down);
3395EXPORT_SYMBOL(follow_up); 3579EXPORT_SYMBOL(follow_up);
3396EXPORT_SYMBOL(get_write_access); /* binfmt_aout */ 3580EXPORT_SYMBOL(get_write_access); /* binfmt_aout */
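[Editor's sketch] The reworked trailing-symlink loop above follows one link per pass (security check, __do_follow_link(), a retry of do_last(), then the paired put_link() on that same link) and bails out with -ELOOP once count reaches 32. Below is a minimal user-space model of that bound using POSIX readlink(2); it is an illustrative sketch, not kernel code, and it deliberately skips rebasing relative link targets.

#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static int resolve_trailing_links(const char *path, char *out, size_t outsz)
{
	char cur[PATH_MAX];
	int count = 0;

	snprintf(cur, sizeof(cur), "%s", path);
	for (;;) {
		char target[PATH_MAX];
		ssize_t n = readlink(cur, target, sizeof(target) - 1);

		if (n < 0) {
			if (errno == EINVAL)	/* not a symlink: resolved */
				break;
			return -errno;
		}
		target[n] = '\0';
		if (count++ == 32)	/* same cap as the loop above */
			return -ELOOP;
		/* the kernel rebases relative targets; skipped here */
		snprintf(cur, sizeof(cur), "%s", target);
	}
	snprintf(out, outsz, "%s", cur);
	return 0;
}

int main(int argc, char **argv)
{
	char buf[PATH_MAX];
	int err = resolve_trailing_links(argc > 1 ? argv[1] : "/tmp", buf,
					 sizeof(buf));

	if (err)
		fprintf(stderr, "error: %s\n", strerror(-err));
	else
		printf("resolved: %s\n", buf);
	return err ? 1 : 0;
}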
diff --git a/fs/namespace.c b/fs/namespace.c
index 3ddfd9046c44..7b0b95371696 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -183,7 +183,7 @@ static inline void mnt_dec_count(struct vfsmount *mnt)
183unsigned int mnt_get_count(struct vfsmount *mnt) 183unsigned int mnt_get_count(struct vfsmount *mnt)
184{ 184{
185#ifdef CONFIG_SMP 185#ifdef CONFIG_SMP
186 unsigned int count = atomic_read(&mnt->mnt_longrefs); 186 unsigned int count = 0;
187 int cpu; 187 int cpu;
188 188
189 for_each_possible_cpu(cpu) { 189 for_each_possible_cpu(cpu) {
@@ -217,7 +217,7 @@ struct vfsmount *alloc_vfsmnt(const char *name)
217 if (!mnt->mnt_pcp) 217 if (!mnt->mnt_pcp)
218 goto out_free_devname; 218 goto out_free_devname;
219 219
220 atomic_set(&mnt->mnt_longrefs, 1); 220 this_cpu_add(mnt->mnt_pcp->mnt_count, 1);
221#else 221#else
222 mnt->mnt_count = 1; 222 mnt->mnt_count = 1;
223 mnt->mnt_writers = 0; 223 mnt->mnt_writers = 0;
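[Editor's sketch] With mnt_longrefs gone, the hunk above makes mnt_get_count() a pure sum of the per-CPU counters (seeded with 0), and alloc_vfsmnt() now plants the first reference in the allocating CPU's slot. A minimal user-space model of such a distributed counter follows; the names and the fixed CPU array are illustrative stand-ins for real per-CPU data.

#include <stdatomic.h>
#include <stdio.h>

#define NR_CPUS 4

struct pcp_count {
	atomic_int count[NR_CPUS];	/* stands in for mnt_pcp->mnt_count */
};

static void pcp_add(struct pcp_count *c, int cpu, int n)
{
	atomic_fetch_add_explicit(&c->count[cpu], n, memory_order_relaxed);
}

static int pcp_sum(struct pcp_count *c)
{
	int total = 0;	/* the fix: no longer seeded from mnt_longrefs */
	int cpu;

	for (cpu = 0; cpu < NR_CPUS; cpu++)
		total += atomic_load_explicit(&c->count[cpu],
					      memory_order_relaxed);
	return total;
}

int main(void)
{
	struct pcp_count c = { { 0 } };

	pcp_add(&c, 0, 1);	/* alloc: first reference, local slot */
	pcp_add(&c, 2, 1);	/* a get on another CPU */
	pcp_add(&c, 1, -1);	/* a put on a third CPU */
	printf("count = %d\n", pcp_sum(&c));	/* 1 */
	return 0;
}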
@@ -611,6 +611,21 @@ static void attach_mnt(struct vfsmount *mnt, struct path *path)
611 list_add_tail(&mnt->mnt_child, &path->mnt->mnt_mounts); 611 list_add_tail(&mnt->mnt_child, &path->mnt->mnt_mounts);
612} 612}
613 613
614static inline void __mnt_make_longterm(struct vfsmount *mnt)
615{
616#ifdef CONFIG_SMP
617 atomic_inc(&mnt->mnt_longterm);
618#endif
619}
620
621/* needs vfsmount lock for write */
622static inline void __mnt_make_shortterm(struct vfsmount *mnt)
623{
624#ifdef CONFIG_SMP
625 atomic_dec(&mnt->mnt_longterm);
626#endif
627}
628
614/* 629/*
615 * vfsmount lock must be held for write 630 * vfsmount lock must be held for write
616 */ 631 */
@@ -624,8 +639,11 @@ static void commit_tree(struct vfsmount *mnt)
624 BUG_ON(parent == mnt); 639 BUG_ON(parent == mnt);
625 640
626 list_add_tail(&head, &mnt->mnt_list); 641 list_add_tail(&head, &mnt->mnt_list);
627 list_for_each_entry(m, &head, mnt_list) 642 list_for_each_entry(m, &head, mnt_list) {
628 m->mnt_ns = n; 643 m->mnt_ns = n;
644 __mnt_make_longterm(m);
645 }
646
629 list_splice(&head, n->list.prev); 647 list_splice(&head, n->list.prev);
630 648
631 list_add_tail(&mnt->mnt_hash, mount_hashtable + 649 list_add_tail(&mnt->mnt_hash, mount_hashtable +
@@ -734,51 +752,30 @@ static inline void mntfree(struct vfsmount *mnt)
734 deactivate_super(sb); 752 deactivate_super(sb);
735} 753}
736 754
737#ifdef CONFIG_SMP 755static void mntput_no_expire(struct vfsmount *mnt)
738static inline void __mntput(struct vfsmount *mnt, int longrefs)
739{ 756{
740 if (!longrefs) {
741put_again: 757put_again:
742 br_read_lock(vfsmount_lock); 758#ifdef CONFIG_SMP
743 if (likely(atomic_read(&mnt->mnt_longrefs))) { 759 br_read_lock(vfsmount_lock);
744 mnt_dec_count(mnt); 760 if (likely(atomic_read(&mnt->mnt_longterm))) {
745 br_read_unlock(vfsmount_lock); 761 mnt_dec_count(mnt);
746 return;
747 }
748 br_read_unlock(vfsmount_lock); 762 br_read_unlock(vfsmount_lock);
749 } else { 763 return;
750 BUG_ON(!atomic_read(&mnt->mnt_longrefs));
751 if (atomic_add_unless(&mnt->mnt_longrefs, -1, 1))
752 return;
753 } 764 }
765 br_read_unlock(vfsmount_lock);
754 766
755 br_write_lock(vfsmount_lock); 767 br_write_lock(vfsmount_lock);
756 if (!longrefs) 768 mnt_dec_count(mnt);
757 mnt_dec_count(mnt);
758 else
759 atomic_dec(&mnt->mnt_longrefs);
760 if (mnt_get_count(mnt)) { 769 if (mnt_get_count(mnt)) {
761 br_write_unlock(vfsmount_lock); 770 br_write_unlock(vfsmount_lock);
762 return; 771 return;
763 } 772 }
764 if (unlikely(mnt->mnt_pinned)) {
765 mnt_add_count(mnt, mnt->mnt_pinned + 1);
766 mnt->mnt_pinned = 0;
767 br_write_unlock(vfsmount_lock);
768 acct_auto_close_mnt(mnt);
769 goto put_again;
770 }
771 br_write_unlock(vfsmount_lock);
772 mntfree(mnt);
773}
774#else 773#else
775static inline void __mntput(struct vfsmount *mnt, int longrefs)
776{
777put_again:
778 mnt_dec_count(mnt); 774 mnt_dec_count(mnt);
779 if (likely(mnt_get_count(mnt))) 775 if (likely(mnt_get_count(mnt)))
780 return; 776 return;
781 br_write_lock(vfsmount_lock); 777 br_write_lock(vfsmount_lock);
778#endif
782 if (unlikely(mnt->mnt_pinned)) { 779 if (unlikely(mnt->mnt_pinned)) {
783 mnt_add_count(mnt, mnt->mnt_pinned + 1); 780 mnt_add_count(mnt, mnt->mnt_pinned + 1);
784 mnt->mnt_pinned = 0; 781 mnt->mnt_pinned = 0;
@@ -789,12 +786,6 @@ put_again:
789 br_write_unlock(vfsmount_lock); 786 br_write_unlock(vfsmount_lock);
790 mntfree(mnt); 787 mntfree(mnt);
791} 788}
792#endif
793
794static void mntput_no_expire(struct vfsmount *mnt)
795{
796 __mntput(mnt, 0);
797}
798 789
799void mntput(struct vfsmount *mnt) 790void mntput(struct vfsmount *mnt)
800{ 791{
@@ -802,7 +793,7 @@ void mntput(struct vfsmount *mnt)
802 /* avoid cacheline pingpong, hope gcc doesn't get "smart" */ 793 /* avoid cacheline pingpong, hope gcc doesn't get "smart" */
803 if (unlikely(mnt->mnt_expiry_mark)) 794 if (unlikely(mnt->mnt_expiry_mark))
804 mnt->mnt_expiry_mark = 0; 795 mnt->mnt_expiry_mark = 0;
805 __mntput(mnt, 0); 796 mntput_no_expire(mnt);
806 } 797 }
807} 798}
808EXPORT_SYMBOL(mntput); 799EXPORT_SYMBOL(mntput);
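[Editor's sketch] mntput_no_expire() above is left with a single SMP fast path: while mnt_longterm is non-zero the mount is pinned (by a namespace, fs->root, or fs->pwd), so a put only decrements under br_read_lock(); only when no long-term reference exists does it take the write lock and re-check the exact count. A rough user-space rendition with a rwlock, names illustrative only:

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct mnt {
	pthread_rwlock_t lock;	/* stands in for vfsmount_lock */
	atomic_int longterm;	/* cf. mnt_longterm */
	atomic_int count;	/* stands in for the per-CPU sum */
};

static bool mnt_put(struct mnt *m)
{
	pthread_rwlock_rdlock(&m->lock);
	if (atomic_load(&m->longterm)) {	/* pinned: cheap path */
		atomic_fetch_sub(&m->count, 1);
		pthread_rwlock_unlock(&m->lock);
		return false;
	}
	pthread_rwlock_unlock(&m->lock);

	pthread_rwlock_wrlock(&m->lock);	/* maybe the last reference */
	if (atomic_fetch_sub(&m->count, 1) > 1) {
		pthread_rwlock_unlock(&m->lock);
		return false;
	}
	pthread_rwlock_unlock(&m->lock);
	return true;	/* caller frees, cf. mntfree() */
}

int main(void)
{
	struct mnt m = { PTHREAD_RWLOCK_INITIALIZER, 0, 2 };

	printf("freed? %d\n", mnt_put(&m));	/* 0: one reference left */
	printf("freed? %d\n", mnt_put(&m));	/* 1: last reference */
	return 0;
}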
@@ -815,33 +806,6 @@ struct vfsmount *mntget(struct vfsmount *mnt)
815} 806}
816EXPORT_SYMBOL(mntget); 807EXPORT_SYMBOL(mntget);
817 808
818void mntput_long(struct vfsmount *mnt)
819{
820#ifdef CONFIG_SMP
821 if (mnt) {
822 /* avoid cacheline pingpong, hope gcc doesn't get "smart" */
823 if (unlikely(mnt->mnt_expiry_mark))
824 mnt->mnt_expiry_mark = 0;
825 __mntput(mnt, 1);
826 }
827#else
828 mntput(mnt);
829#endif
830}
831EXPORT_SYMBOL(mntput_long);
832
833struct vfsmount *mntget_long(struct vfsmount *mnt)
834{
835#ifdef CONFIG_SMP
836 if (mnt)
837 atomic_inc(&mnt->mnt_longrefs);
838 return mnt;
839#else
840 return mntget(mnt);
841#endif
842}
843EXPORT_SYMBOL(mntget_long);
844
845void mnt_pin(struct vfsmount *mnt) 809void mnt_pin(struct vfsmount *mnt)
846{ 810{
847 br_write_lock(vfsmount_lock); 811 br_write_lock(vfsmount_lock);
@@ -1216,7 +1180,7 @@ void release_mounts(struct list_head *head)
1216 dput(dentry); 1180 dput(dentry);
1217 mntput(m); 1181 mntput(m);
1218 } 1182 }
1219 mntput_long(mnt); 1183 mntput(mnt);
1220 } 1184 }
1221} 1185}
1222 1186
@@ -1226,19 +1190,21 @@ void release_mounts(struct list_head *head)
1226 */ 1190 */
1227void umount_tree(struct vfsmount *mnt, int propagate, struct list_head *kill) 1191void umount_tree(struct vfsmount *mnt, int propagate, struct list_head *kill)
1228{ 1192{
1193 LIST_HEAD(tmp_list);
1229 struct vfsmount *p; 1194 struct vfsmount *p;
1230 1195
1231 for (p = mnt; p; p = next_mnt(p, mnt)) 1196 for (p = mnt; p; p = next_mnt(p, mnt))
1232 list_move(&p->mnt_hash, kill); 1197 list_move(&p->mnt_hash, &tmp_list);
1233 1198
1234 if (propagate) 1199 if (propagate)
1235 propagate_umount(kill); 1200 propagate_umount(&tmp_list);
1236 1201
1237 list_for_each_entry(p, kill, mnt_hash) { 1202 list_for_each_entry(p, &tmp_list, mnt_hash) {
1238 list_del_init(&p->mnt_expire); 1203 list_del_init(&p->mnt_expire);
1239 list_del_init(&p->mnt_list); 1204 list_del_init(&p->mnt_list);
1240 __touch_mnt_namespace(p->mnt_ns); 1205 __touch_mnt_namespace(p->mnt_ns);
1241 p->mnt_ns = NULL; 1206 p->mnt_ns = NULL;
1207 __mnt_make_shortterm(p);
1242 list_del_init(&p->mnt_child); 1208 list_del_init(&p->mnt_child);
1243 if (p->mnt_parent != p) { 1209 if (p->mnt_parent != p) {
1244 p->mnt_parent->mnt_ghosts++; 1210 p->mnt_parent->mnt_ghosts++;
@@ -1246,6 +1212,7 @@ void umount_tree(struct vfsmount *mnt, int propagate, struct list_head *kill)
1246 } 1212 }
1247 change_mnt_propagation(p, MS_PRIVATE); 1213 change_mnt_propagation(p, MS_PRIVATE);
1248 } 1214 }
1215 list_splice(&tmp_list, kill);
1249} 1216}
1250 1217
1251static void shrink_submounts(struct vfsmount *mnt, struct list_head *umounts); 1218static void shrink_submounts(struct vfsmount *mnt, struct list_head *umounts);
@@ -1844,9 +1811,10 @@ static int do_move_mount(struct path *path, char *old_name)
1844 return err; 1811 return err;
1845 1812
1846 down_write(&namespace_sem); 1813 down_write(&namespace_sem);
1847 while (d_mountpoint(path->dentry) && 1814 err = follow_down(path, true);
1848 follow_down(path)) 1815 if (err < 0)
1849 ; 1816 goto out;
1817
1850 err = -EINVAL; 1818 err = -EINVAL;
1851 if (!check_mnt(path->mnt) || !check_mnt(old_path.mnt)) 1819 if (!check_mnt(path->mnt) || !check_mnt(old_path.mnt))
1852 goto out; 1820 goto out;
@@ -1904,6 +1872,8 @@ out:
1904 return err; 1872 return err;
1905} 1873}
1906 1874
1875static int do_add_mount(struct vfsmount *, struct path *, int);
1876
1907/* 1877/*
1908 * create a new mount for userspace and request it to be added into the 1878 * create a new mount for userspace and request it to be added into the
1909 * namespace's tree 1879 * namespace's tree
@@ -1912,6 +1882,7 @@ static int do_new_mount(struct path *path, char *type, int flags,
1912 int mnt_flags, char *name, void *data) 1882 int mnt_flags, char *name, void *data)
1913{ 1883{
1914 struct vfsmount *mnt; 1884 struct vfsmount *mnt;
1885 int err;
1915 1886
1916 if (!type) 1887 if (!type)
1917 return -EINVAL; 1888 return -EINVAL;
@@ -1924,15 +1895,47 @@ static int do_new_mount(struct path *path, char *type, int flags,
1924 if (IS_ERR(mnt)) 1895 if (IS_ERR(mnt))
1925 return PTR_ERR(mnt); 1896 return PTR_ERR(mnt);
1926 1897
1927 return do_add_mount(mnt, path, mnt_flags, NULL); 1898 err = do_add_mount(mnt, path, mnt_flags);
1899 if (err)
1900 mntput(mnt);
1901 return err;
1902}
1903
1904int finish_automount(struct vfsmount *m, struct path *path)
1905{
1906 int err;
1907 /* The new mount record should have at least 2 refs to prevent it being
1908 * expired before we get a chance to add it
1909 */
1910 BUG_ON(mnt_get_count(m) < 2);
1911
1912 if (m->mnt_sb == path->mnt->mnt_sb &&
1913 m->mnt_root == path->dentry) {
1914 err = -ELOOP;
1915 goto fail;
1916 }
1917
1918 err = do_add_mount(m, path, path->mnt->mnt_flags | MNT_SHRINKABLE);
1919 if (!err)
1920 return 0;
1921fail:
1922 /* remove m from any expiration list it may be on */
1923 if (!list_empty(&m->mnt_expire)) {
1924 down_write(&namespace_sem);
1925 br_write_lock(vfsmount_lock);
1926 list_del_init(&m->mnt_expire);
1927 br_write_unlock(vfsmount_lock);
1928 up_write(&namespace_sem);
1929 }
1930 mntput(m);
1931 mntput(m);
1932 return err;
1928} 1933}
1929 1934
1930/* 1935/*
1931 * add a mount into a namespace's mount tree 1936 * add a mount into a namespace's mount tree
1932 * - provide the option of adding the new mount to an expiration list
1933 */ 1937 */
1934int do_add_mount(struct vfsmount *newmnt, struct path *path, 1938static int do_add_mount(struct vfsmount *newmnt, struct path *path, int mnt_flags)
1935 int mnt_flags, struct list_head *fslist)
1936{ 1939{
1937 int err; 1940 int err;
1938 1941
@@ -1940,9 +1943,10 @@ int do_add_mount(struct vfsmount *newmnt, struct path *path,
1940 1943
1941 down_write(&namespace_sem); 1944 down_write(&namespace_sem);
1942 /* Something was mounted here while we slept */ 1945 /* Something was mounted here while we slept */
1943 while (d_mountpoint(path->dentry) && 1946 err = follow_down(path, true);
1944 follow_down(path)) 1947 if (err < 0)
1945 ; 1948 goto unlock;
1949
1946 err = -EINVAL; 1950 err = -EINVAL;
1947 if (!(mnt_flags & MNT_SHRINKABLE) && !check_mnt(path->mnt)) 1951 if (!(mnt_flags & MNT_SHRINKABLE) && !check_mnt(path->mnt))
1948 goto unlock; 1952 goto unlock;
@@ -1958,22 +1962,29 @@ int do_add_mount(struct vfsmount *newmnt, struct path *path,
1958 goto unlock; 1962 goto unlock;
1959 1963
1960 newmnt->mnt_flags = mnt_flags; 1964 newmnt->mnt_flags = mnt_flags;
1961 if ((err = graft_tree(newmnt, path))) 1965 err = graft_tree(newmnt, path);
1962 goto unlock;
1963
1964 if (fslist) /* add to the specified expiration list */
1965 list_add_tail(&newmnt->mnt_expire, fslist);
1966
1967 up_write(&namespace_sem);
1968 return 0;
1969 1966
1970unlock: 1967unlock:
1971 up_write(&namespace_sem); 1968 up_write(&namespace_sem);
1972 mntput_long(newmnt);
1973 return err; 1969 return err;
1974} 1970}
1975 1971
1976EXPORT_SYMBOL_GPL(do_add_mount); 1972/**
1973 * mnt_set_expiry - Put a mount on an expiration list
1974 * @mnt: The mount to list.
1975 * @expiry_list: The list to add the mount to.
1976 */
1977void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list)
1978{
1979 down_write(&namespace_sem);
1980 br_write_lock(vfsmount_lock);
1981
1982 list_add_tail(&mnt->mnt_expire, expiry_list);
1983
1984 br_write_unlock(vfsmount_lock);
1985 up_write(&namespace_sem);
1986}
1987EXPORT_SYMBOL(mnt_set_expiry);
1977 1988
1978/* 1989/*
1979 * process a list of expirable mountpoints with the intent of discarding any 1990 * process a list of expirable mountpoints with the intent of discarding any
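[Editor's sketch] finish_automount() above leans on a two-reference convention: the mount arrives holding at least two references (hence the BUG_ON), one for the caller and one guarding against premature expiry, so the failure path drops mntput(m) twice after unhooking any expiry-list linkage; mnt_set_expiry() is now the exported way to queue a mount for expiry. A toy refcount model of that failure path, illustrative only:

#include <stdio.h>
#include <stdlib.h>

struct mount_rec {
	int refs;
};

static struct mount_rec *alloc_automount(void)
{
	struct mount_rec *m = malloc(sizeof(*m));

	if (m)
		m->refs = 2;	/* caller's ref + anti-expiry ref */
	return m;
}

static void rec_put(struct mount_rec *m)
{
	if (--m->refs == 0) {
		printf("freeing record\n");
		free(m);
	}
}

int main(void)
{
	struct mount_rec *m = alloc_automount();

	if (!m)
		return 1;
	/* graft failed (say, -ELOOP): undo both references */
	rec_put(m);
	rec_put(m);	/* mirrors the doubled mntput(m) above */
	return 0;
}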
@@ -2262,6 +2273,22 @@ static struct mnt_namespace *alloc_mnt_ns(void)
2262 return new_ns; 2273 return new_ns;
2263} 2274}
2264 2275
2276void mnt_make_longterm(struct vfsmount *mnt)
2277{
2278 __mnt_make_longterm(mnt);
2279}
2280
2281void mnt_make_shortterm(struct vfsmount *mnt)
2282{
2283#ifdef CONFIG_SMP
2284 if (atomic_add_unless(&mnt->mnt_longterm, -1, 1))
2285 return;
2286 br_write_lock(vfsmount_lock);
2287 atomic_dec(&mnt->mnt_longterm);
2288 br_write_unlock(vfsmount_lock);
2289#endif
2290}
2291
2265/* 2292/*
2266 * Allocate a new namespace structure and populate it with contents 2293 * Allocate a new namespace structure and populate it with contents
2267 * copied from the namespace of the passed in task structure. 2294 * copied from the namespace of the passed in task structure.
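[Editor's sketch] mnt_make_shortterm() above uses atomic_add_unless() so that dropping a long-term mark stays lock-free unless it is the last one; only that final transition takes the vfsmount write lock, keeping it ordered against the mntput() fast path. A self-contained sketch of the same pattern, with a C11 compare-exchange loop and a plain mutex standing in for the lock:

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t write_lock = PTHREAD_MUTEX_INITIALIZER;

/* atomic_add_unless(v, a, u): add a to *v unless *v == u; true if added */
static bool add_unless(atomic_int *v, int a, int u)
{
	int old = atomic_load(v);

	while (old != u) {
		if (atomic_compare_exchange_weak(v, &old, old + a))
			return true;
	}
	return false;
}

static void make_shortterm(atomic_int *longterm)
{
	if (add_unless(longterm, -1, 1))
		return;		/* not the last mark: lock-free */
	pthread_mutex_lock(&write_lock);
	atomic_fetch_sub(longterm, 1);	/* last mark drops under the lock */
	pthread_mutex_unlock(&write_lock);
}

int main(void)
{
	atomic_int lt = 2;

	make_shortterm(&lt);	/* lock-free */
	make_shortterm(&lt);	/* takes the lock */
	printf("longterm = %d\n", atomic_load(&lt));	/* 0 */
	return 0;
}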
@@ -2299,14 +2326,19 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns,
2299 q = new_ns->root; 2326 q = new_ns->root;
2300 while (p) { 2327 while (p) {
2301 q->mnt_ns = new_ns; 2328 q->mnt_ns = new_ns;
2329 __mnt_make_longterm(q);
2302 if (fs) { 2330 if (fs) {
2303 if (p == fs->root.mnt) { 2331 if (p == fs->root.mnt) {
2332 fs->root.mnt = mntget(q);
2333 __mnt_make_longterm(q);
2334 mnt_make_shortterm(p);
2304 rootmnt = p; 2335 rootmnt = p;
2305 fs->root.mnt = mntget_long(q);
2306 } 2336 }
2307 if (p == fs->pwd.mnt) { 2337 if (p == fs->pwd.mnt) {
2338 fs->pwd.mnt = mntget(q);
2339 __mnt_make_longterm(q);
2340 mnt_make_shortterm(p);
2308 pwdmnt = p; 2341 pwdmnt = p;
2309 fs->pwd.mnt = mntget_long(q);
2310 } 2342 }
2311 } 2343 }
2312 p = next_mnt(p, mnt_ns->root); 2344 p = next_mnt(p, mnt_ns->root);
@@ -2315,9 +2347,9 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns,
2315 up_write(&namespace_sem); 2347 up_write(&namespace_sem);
2316 2348
2317 if (rootmnt) 2349 if (rootmnt)
2318 mntput_long(rootmnt); 2350 mntput(rootmnt);
2319 if (pwdmnt) 2351 if (pwdmnt)
2320 mntput_long(pwdmnt); 2352 mntput(pwdmnt);
2321 2353
2322 return new_ns; 2354 return new_ns;
2323} 2355}
@@ -2350,6 +2382,7 @@ struct mnt_namespace *create_mnt_ns(struct vfsmount *mnt)
2350 new_ns = alloc_mnt_ns(); 2382 new_ns = alloc_mnt_ns();
2351 if (!IS_ERR(new_ns)) { 2383 if (!IS_ERR(new_ns)) {
2352 mnt->mnt_ns = new_ns; 2384 mnt->mnt_ns = new_ns;
2385 __mnt_make_longterm(mnt);
2353 new_ns->root = mnt; 2386 new_ns->root = mnt;
2354 list_add(&new_ns->list, &new_ns->root->mnt_list); 2387 list_add(&new_ns->list, &new_ns->root->mnt_list);
2355 } 2388 }
diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c
index 28f136d4aaec..f6946bb5cb55 100644
--- a/fs/ncpfs/dir.c
+++ b/fs/ncpfs/dir.c
@@ -21,9 +21,7 @@
21#include <asm/uaccess.h> 21#include <asm/uaccess.h>
22#include <asm/byteorder.h> 22#include <asm/byteorder.h>
23 23
24#include <linux/ncp_fs.h> 24#include "ncp_fs.h"
25
26#include "ncplib_kernel.h"
27 25
28static void ncp_read_volume_list(struct file *, void *, filldir_t, 26static void ncp_read_volume_list(struct file *, void *, filldir_t,
29 struct ncp_cache_control *); 27 struct ncp_cache_control *);
@@ -82,7 +80,7 @@ static int ncp_compare_dentry(const struct dentry *, const struct inode *,
82 unsigned int, const char *, const struct qstr *); 80 unsigned int, const char *, const struct qstr *);
83static int ncp_delete_dentry(const struct dentry *); 81static int ncp_delete_dentry(const struct dentry *);
84 82
85static const struct dentry_operations ncp_dentry_operations = 83const struct dentry_operations ncp_dentry_operations =
86{ 84{
87 .d_revalidate = ncp_lookup_validate, 85 .d_revalidate = ncp_lookup_validate,
88 .d_hash = ncp_hash_dentry, 86 .d_hash = ncp_hash_dentry,
@@ -90,14 +88,6 @@ static const struct dentry_operations ncp_dentry_operations =
90 .d_delete = ncp_delete_dentry, 88 .d_delete = ncp_delete_dentry,
91}; 89};
92 90
93const struct dentry_operations ncp_root_dentry_operations =
94{
95 .d_hash = ncp_hash_dentry,
96 .d_compare = ncp_compare_dentry,
97 .d_delete = ncp_delete_dentry,
98};
99
100
101#define ncp_namespace(i) (NCP_SERVER(i)->name_space[NCP_FINFO(i)->volNumber]) 91#define ncp_namespace(i) (NCP_SERVER(i)->name_space[NCP_FINFO(i)->volNumber])
102 92
103static inline int ncp_preserve_entry_case(struct inode *i, __u32 nscreator) 93static inline int ncp_preserve_entry_case(struct inode *i, __u32 nscreator)
@@ -309,6 +299,9 @@ ncp_lookup_validate(struct dentry *dentry, struct nameidata *nd)
309 int res, val = 0, len; 299 int res, val = 0, len;
310 __u8 __name[NCP_MAXPATHLEN + 1]; 300 __u8 __name[NCP_MAXPATHLEN + 1];
311 301
302 if (dentry == dentry->d_sb->s_root)
303 return 1;
304
312 if (nd->flags & LOOKUP_RCU) 305 if (nd->flags & LOOKUP_RCU)
313 return -ECHILD; 306 return -ECHILD;
314 307
@@ -637,7 +630,6 @@ ncp_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
637 entry->ino = iunique(dir->i_sb, 2); 630 entry->ino = iunique(dir->i_sb, 2);
638 inode = ncp_iget(dir->i_sb, entry); 631 inode = ncp_iget(dir->i_sb, entry);
639 if (inode) { 632 if (inode) {
640 d_set_d_op(newdent, &ncp_dentry_operations);
641 d_instantiate(newdent, inode); 633 d_instantiate(newdent, inode);
642 if (!hashed) 634 if (!hashed)
643 d_rehash(newdent); 635 d_rehash(newdent);
@@ -893,7 +885,6 @@ static struct dentry *ncp_lookup(struct inode *dir, struct dentry *dentry, struc
893 if (inode) { 885 if (inode) {
894 ncp_new_dentry(dentry); 886 ncp_new_dentry(dentry);
895add_entry: 887add_entry:
896 d_set_d_op(dentry, &ncp_dentry_operations);
897 d_add(dentry, inode); 888 d_add(dentry, inode);
898 error = 0; 889 error = 0;
899 } 890 }
diff --git a/fs/ncpfs/file.c b/fs/ncpfs/file.c
index cb50aaf981df..0ed65e0c3dfe 100644
--- a/fs/ncpfs/file.c
+++ b/fs/ncpfs/file.c
@@ -18,8 +18,7 @@
18#include <linux/vmalloc.h> 18#include <linux/vmalloc.h>
19#include <linux/sched.h> 19#include <linux/sched.h>
20 20
21#include <linux/ncp_fs.h> 21#include "ncp_fs.h"
22#include "ncplib_kernel.h"
23 22
24static int ncp_fsync(struct file *file, int datasync) 23static int ncp_fsync(struct file *file, int datasync)
25{ 24{
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index 9b39a5dd4131..00a1d1c3d3a4 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -31,11 +31,9 @@
31#include <linux/seq_file.h> 31#include <linux/seq_file.h>
32#include <linux/namei.h> 32#include <linux/namei.h>
33 33
34#include <linux/ncp_fs.h>
35
36#include <net/sock.h> 34#include <net/sock.h>
37 35
38#include "ncplib_kernel.h" 36#include "ncp_fs.h"
39#include "getopt.h" 37#include "getopt.h"
40 38
41#define NCP_DEFAULT_FILE_MODE 0600 39#define NCP_DEFAULT_FILE_MODE 0600
@@ -544,6 +542,7 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
544 sb->s_blocksize_bits = 10; 542 sb->s_blocksize_bits = 10;
545 sb->s_magic = NCP_SUPER_MAGIC; 543 sb->s_magic = NCP_SUPER_MAGIC;
546 sb->s_op = &ncp_sops; 544 sb->s_op = &ncp_sops;
545 sb->s_d_op = &ncp_dentry_operations;
547 sb->s_bdi = &server->bdi; 546 sb->s_bdi = &server->bdi;
548 547
549 server = NCP_SBP(sb); 548 server = NCP_SBP(sb);
@@ -723,7 +722,6 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
723 sb->s_root = d_alloc_root(root_inode); 722 sb->s_root = d_alloc_root(root_inode);
724 if (!sb->s_root) 723 if (!sb->s_root)
725 goto out_no_root; 724 goto out_no_root;
726 d_set_d_op(sb->s_root, &ncp_root_dentry_operations);
727 return 0; 725 return 0;
728 726
729out_no_root: 727out_no_root:
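[Editor's sketch] Taken together, the dir.c and inode.c hunks replace per-dentry d_set_d_op() calls (and the special root-only ops table) with a single superblock-wide default, sb->s_d_op, plus an explicit root check at the top of ncp_lookup_validate(). A hypothetical user-space rendition of that shape, not the kernel API:

#include <stdio.h>

struct dentry;

struct dentry_ops {
	int (*revalidate)(struct dentry *);
};

struct super {
	const struct dentry_ops *d_op;	/* cf. sb->s_d_op */
	struct dentry *root;
};

struct dentry {
	struct super *sb;
	const char *name;
};

static int model_revalidate(struct dentry *d)
{
	if (d == d->sb->root) {		/* cf. the new root check */
		printf("%s: root, trivially valid\n", d->name);
		return 1;
	}
	printf("%s: would ask the server\n", d->name);
	return 1;
}

static const struct dentry_ops default_ops = {
	.revalidate = model_revalidate,
};

int main(void)
{
	struct super sb = { .d_op = &default_ops };
	struct dentry root = { .sb = &sb, .name = "/" };
	struct dentry file = { .sb = &sb, .name = "file" };

	sb.root = &root;	/* every dentry inherits sb.d_op */
	sb.d_op->revalidate(&root);
	sb.d_op->revalidate(&file);
	return 0;
}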
diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c
index d40a547e3377..790e92a9ec63 100644
--- a/fs/ncpfs/ioctl.c
+++ b/fs/ncpfs/ioctl.c
@@ -20,11 +20,9 @@
20#include <linux/vmalloc.h> 20#include <linux/vmalloc.h>
21#include <linux/sched.h> 21#include <linux/sched.h>
22 22
23#include <linux/ncp_fs.h>
24
25#include <asm/uaccess.h> 23#include <asm/uaccess.h>
26 24
27#include "ncplib_kernel.h" 25#include "ncp_fs.h"
28 26
29/* maximum limit for ncp_objectname_ioctl */ 27/* maximum limit for ncp_objectname_ioctl */
30#define NCP_OBJECT_NAME_MAX_LEN 4096 28#define NCP_OBJECT_NAME_MAX_LEN 4096
diff --git a/fs/ncpfs/mmap.c b/fs/ncpfs/mmap.c
index 56f5b3a0e1ee..a7c07b44b100 100644
--- a/fs/ncpfs/mmap.c
+++ b/fs/ncpfs/mmap.c
@@ -16,12 +16,12 @@
16#include <linux/mman.h> 16#include <linux/mman.h>
17#include <linux/string.h> 17#include <linux/string.h>
18#include <linux/fcntl.h> 18#include <linux/fcntl.h>
19#include <linux/ncp_fs.h>
20 19
21#include "ncplib_kernel.h"
22#include <asm/uaccess.h> 20#include <asm/uaccess.h>
23#include <asm/system.h> 21#include <asm/system.h>
24 22
23#include "ncp_fs.h"
24
25/* 25/*
26 * Fill in the supplied page for mmap 26 * Fill in the supplied page for mmap
27 * XXX: how are we excluding truncate/invalidate here? Maybe need to lock 27 * XXX: how are we excluding truncate/invalidate here? Maybe need to lock
diff --git a/fs/ncpfs/ncp_fs.h b/fs/ncpfs/ncp_fs.h
new file mode 100644
index 000000000000..31831afe1c3b
--- /dev/null
+++ b/fs/ncpfs/ncp_fs.h
@@ -0,0 +1,98 @@
1#include <linux/ncp_fs.h>
2#include "ncp_fs_i.h"
3#include "ncp_fs_sb.h"
4
5/* define because it is easy to change PRINTK to {*}PRINTK */
6#define PRINTK(format, args...) printk(KERN_DEBUG format , ## args)
7
8#undef NCPFS_PARANOIA
9#ifdef NCPFS_PARANOIA
10#define PPRINTK(format, args...) PRINTK(format , ## args)
11#else
12#define PPRINTK(format, args...)
13#endif
14
15#ifndef DEBUG_NCP
16#define DEBUG_NCP 0
17#endif
18#if DEBUG_NCP > 0
19#define DPRINTK(format, args...) PRINTK(format , ## args)
20#else
21#define DPRINTK(format, args...)
22#endif
23#if DEBUG_NCP > 1
24#define DDPRINTK(format, args...) PRINTK(format , ## args)
25#else
26#define DDPRINTK(format, args...)
27#endif
28
29#define NCP_MAX_RPC_TIMEOUT (6*HZ)
30
31
32struct ncp_entry_info {
33 struct nw_info_struct i;
34 ino_t ino;
35 int opened;
36 int access;
37 unsigned int volume;
38 __u8 file_handle[6];
39};
40
41static inline struct ncp_server *NCP_SBP(const struct super_block *sb)
42{
43 return sb->s_fs_info;
44}
45
46#define NCP_SERVER(inode) NCP_SBP((inode)->i_sb)
47static inline struct ncp_inode_info *NCP_FINFO(const struct inode *inode)
48{
49 return container_of(inode, struct ncp_inode_info, vfs_inode);
50}
51
52/* linux/fs/ncpfs/inode.c */
53int ncp_notify_change(struct dentry *, struct iattr *);
54struct inode *ncp_iget(struct super_block *, struct ncp_entry_info *);
55void ncp_update_inode(struct inode *, struct ncp_entry_info *);
56void ncp_update_inode2(struct inode *, struct ncp_entry_info *);
57
58/* linux/fs/ncpfs/dir.c */
59extern const struct inode_operations ncp_dir_inode_operations;
60extern const struct file_operations ncp_dir_operations;
61extern const struct dentry_operations ncp_dentry_operations;
62int ncp_conn_logged_in(struct super_block *);
63int ncp_date_dos2unix(__le16 time, __le16 date);
64void ncp_date_unix2dos(int unix_date, __le16 * time, __le16 * date);
65
66/* linux/fs/ncpfs/ioctl.c */
67long ncp_ioctl(struct file *, unsigned int, unsigned long);
68long ncp_compat_ioctl(struct file *, unsigned int, unsigned long);
69
70/* linux/fs/ncpfs/sock.c */
71int ncp_request2(struct ncp_server *server, int function,
72 void* reply, int max_reply_size);
73static inline int ncp_request(struct ncp_server *server, int function) {
74 return ncp_request2(server, function, server->packet, server->packet_size);
75}
76int ncp_connect(struct ncp_server *server);
77int ncp_disconnect(struct ncp_server *server);
78void ncp_lock_server(struct ncp_server *server);
79void ncp_unlock_server(struct ncp_server *server);
80
81/* linux/fs/ncpfs/symlink.c */
82#if defined(CONFIG_NCPFS_EXTRAS) || defined(CONFIG_NCPFS_NFS_NS)
83extern const struct address_space_operations ncp_symlink_aops;
84int ncp_symlink(struct inode*, struct dentry*, const char*);
85#endif
86
87/* linux/fs/ncpfs/file.c */
88extern const struct inode_operations ncp_file_inode_operations;
89extern const struct file_operations ncp_file_operations;
90int ncp_make_open(struct inode *, int);
91
92/* linux/fs/ncpfs/mmap.c */
93int ncp_mmap(struct file *, struct vm_area_struct *);
94
95/* linux/fs/ncpfs/ncplib_kernel.c */
96int ncp_make_closed(struct inode *);
97
98#include "ncplib_kernel.h"
diff --git a/fs/ncpfs/ncp_fs_i.h b/fs/ncpfs/ncp_fs_i.h
new file mode 100644
index 000000000000..4b0bec477846
--- /dev/null
+++ b/fs/ncpfs/ncp_fs_i.h
@@ -0,0 +1,29 @@
1/*
2 * ncp_fs_i.h
3 *
4 * Copyright (C) 1995 Volker Lendecke
5 *
6 */
7
8#ifndef _LINUX_NCP_FS_I
9#define _LINUX_NCP_FS_I
10
11/*
12 * This is the ncpfs part of the inode structure. This must contain
13 * all the information we need to work with an inode after creation.
14 */
15struct ncp_inode_info {
16 __le32 dirEntNum;
17 __le32 DosDirNum;
18 __u8 volNumber;
19 __le32 nwattr;
20 struct mutex open_mutex;
21 atomic_t opened;
22 int access;
23 int flags;
24#define NCPI_KLUDGE_SYMLINK 0x0001
25 __u8 file_handle[6];
26 struct inode vfs_inode;
27};
28
29#endif /* _LINUX_NCP_FS_I */
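[Editor's sketch] The struct ends with an embedded struct inode, which is what lets NCP_FINFO() in ncp_fs.h map a generic inode pointer back to its containing ncp_inode_info via container_of(). A minimal runnable demo of that idiom, using trimmed-down stand-in structs:

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct inode {
	unsigned long ino;
};

struct ncp_inode_info_model {
	unsigned char volNumber;
	struct inode vfs_inode;		/* embedded generic part */
};

int main(void)
{
	struct ncp_inode_info_model info = {
		.volNumber = 3,
		.vfs_inode = { .ino = 42 },
	};
	struct inode *inode = &info.vfs_inode;	/* what the VFS hands out */
	struct ncp_inode_info_model *fi =
		container_of(inode, struct ncp_inode_info_model, vfs_inode);

	printf("ino %lu lives on volume %d\n", inode->ino, fi->volNumber);
	return 0;
}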
diff --git a/fs/ncpfs/ncp_fs_sb.h b/fs/ncpfs/ncp_fs_sb.h
new file mode 100644
index 000000000000..4af803f13516
--- /dev/null
+++ b/fs/ncpfs/ncp_fs_sb.h
@@ -0,0 +1,176 @@
1/*
2 * ncp_fs_sb.h
3 *
4 * Copyright (C) 1995, 1996 by Volker Lendecke
5 *
6 */
7
8#ifndef _NCP_FS_SB
9#define _NCP_FS_SB
10
11#include <linux/types.h>
12#include <linux/ncp_mount.h>
13#include <linux/net.h>
14#include <linux/mutex.h>
15#include <linux/backing-dev.h>
16#include <linux/workqueue.h>
17
18#define NCP_DEFAULT_OPTIONS 0 /* 2 for packet signatures */
19
20struct sock;
21
22struct ncp_mount_data_kernel {
23 unsigned long flags; /* NCP_MOUNT_* flags */
24 unsigned int int_flags; /* internal flags */
25#define NCP_IMOUNT_LOGGEDIN_POSSIBLE 0x0001
26 __kernel_uid32_t mounted_uid; /* Who may umount() this filesystem? */
27 struct pid *wdog_pid; /* Who cares for our watchdog packets? */
28 unsigned int ncp_fd; /* The socket to the ncp port */
29 unsigned int time_out; /* How long should I wait after
30 sending a NCP request? */
31 unsigned int retry_count; /* And how often should I retry? */
32 unsigned char mounted_vol[NCP_VOLNAME_LEN + 1];
33 __kernel_uid32_t uid;
34 __kernel_gid32_t gid;
35 __kernel_mode_t file_mode;
36 __kernel_mode_t dir_mode;
37 int info_fd;
38};
39
40struct ncp_server {
41
42 struct ncp_mount_data_kernel m; /* Nearly all of the mount data is of
43 interest for us later, so we store
44 it completely. */
45
46 __u8 name_space[NCP_NUMBER_OF_VOLUMES + 2];
47
48 struct file *ncp_filp; /* File pointer to ncp socket */
49 struct socket *ncp_sock;/* ncp socket */
50 struct file *info_filp;
51 struct socket *info_sock;
52
53 u8 sequence;
54 u8 task;
55 u16 connection; /* Remote connection number */
56
57 u8 completion; /* Status message from server */
58 u8 conn_status; /* Bit 4 = 1 ==> Server going down, no
59 requests allowed anymore.
60 Bit 0 = 1 ==> Server is down. */
61
62 int buffer_size; /* Negotiated bufsize */
63
64 int reply_size; /* Size of last reply */
65
66 int packet_size;
67 unsigned char *packet; /* Here we prepare requests and
68 receive replies */
69 unsigned char *txbuf; /* Storage for current request */
70 unsigned char *rxbuf; /* Storage for reply to current request */
71
72 int lock; /* To prevent mismatch in protocols. */
73 struct mutex mutex;
74
75 int current_size; /* for packet preparation */
76 int has_subfunction;
77 int ncp_reply_size;
78
79 int root_setuped;
80 struct mutex root_setup_lock;
81
82 /* info for packet signing */
83 int sign_wanted; /* 1=Server needs signed packets */
84 int sign_active; /* 0=don't do signing, 1=do */
85 char sign_root[8]; /* generated from password and encr. key */
86 char sign_last[16];
87
88 /* Authentication info: NDS or BINDERY, username */
89 struct {
90 int auth_type;
91 size_t object_name_len;
92 void* object_name;
93 int object_type;
94 } auth;
95 /* Password info */
96 struct {
97 size_t len;
98 void* data;
99 } priv;
100 struct rw_semaphore auth_rwsem;
101
102 /* nls info: codepage for volume and charset for I/O */
103 struct nls_table *nls_vol;
104 struct nls_table *nls_io;
105
106 /* maximum age in jiffies */
107 atomic_t dentry_ttl;
108
109 /* miscellaneous */
110 unsigned int flags;
111
112 spinlock_t requests_lock; /* Lock accesses to tx.requests, tx.creq and rcv.creq when STREAM mode */
113
114 void (*data_ready)(struct sock* sk, int len);
115 void (*error_report)(struct sock* sk);
116 void (*write_space)(struct sock* sk); /* STREAM mode only */
117 struct {
118 struct work_struct tq; /* STREAM/DGRAM: data/error ready */
119 struct ncp_request_reply* creq; /* STREAM/DGRAM: awaiting reply from this request */
120 struct mutex creq_mutex; /* DGRAM only: lock accesses to rcv.creq */
121
122 unsigned int state; /* STREAM only: receiver state */
123 struct {
124 __u32 magic __packed;
125 __u32 len __packed;
126 __u16 type __packed;
127 __u16 p1 __packed;
128 __u16 p2 __packed;
129 __u16 p3 __packed;
130 __u16 type2 __packed;
131 } buf; /* STREAM only: temporary buffer */
132 unsigned char* ptr; /* STREAM only: pointer to data */
133 size_t len; /* STREAM only: length of data to receive */
134 } rcv;
135 struct {
136 struct list_head requests; /* STREAM only: queued requests */
137 struct work_struct tq; /* STREAM only: transmitter ready */
138 struct ncp_request_reply* creq; /* STREAM only: currently transmitted entry */
139 } tx;
140 struct timer_list timeout_tm; /* DGRAM only: timeout timer */
141 struct work_struct timeout_tq; /* DGRAM only: associated queue, we run timers from process context */
142 int timeout_last; /* DGRAM only: current timeout length */
143 int timeout_retries; /* DGRAM only: retries left */
144 struct {
145 size_t len;
146 __u8 data[128];
147 } unexpected_packet;
148 struct backing_dev_info bdi;
149};
150
151extern void ncp_tcp_rcv_proc(struct work_struct *work);
152extern void ncp_tcp_tx_proc(struct work_struct *work);
153extern void ncpdgram_rcv_proc(struct work_struct *work);
154extern void ncpdgram_timeout_proc(struct work_struct *work);
155extern void ncpdgram_timeout_call(unsigned long server);
156extern void ncp_tcp_data_ready(struct sock* sk, int len);
157extern void ncp_tcp_write_space(struct sock* sk);
158extern void ncp_tcp_error_report(struct sock* sk);
159
160#define NCP_FLAG_UTF8 1
161
162#define NCP_CLR_FLAG(server, flag) ((server)->flags &= ~(flag))
163#define NCP_SET_FLAG(server, flag) ((server)->flags |= (flag))
164#define NCP_IS_FLAG(server, flag) ((server)->flags & (flag))
165
166static inline int ncp_conn_valid(struct ncp_server *server)
167{
168 return ((server->conn_status & 0x11) == 0);
169}
170
171static inline void ncp_invalidate_conn(struct ncp_server *server)
172{
173 server->conn_status |= 0x01;
174}
175
176#endif
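[Editor's sketch] Per the conn_status comment above, bit 0 means the server is down and bit 4 means it is going down, so ncp_conn_valid() tests mask 0x11 and ncp_invalidate_conn() sets bit 0. The same checks in a stand-alone form, with named constants added purely for illustration:

#include <stdio.h>

#define NCP_CONN_DOWN		0x01	/* bit 0: server is down */
#define NCP_CONN_GOING_DOWN	0x10	/* bit 4: going down */

static int conn_valid(unsigned char conn_status)
{
	/* same test as ncp_conn_valid(): (conn_status & 0x11) == 0 */
	return (conn_status & (NCP_CONN_DOWN | NCP_CONN_GOING_DOWN)) == 0;
}

int main(void)
{
	unsigned char status = 0;

	printf("fresh:      %d\n", conn_valid(status));	/* 1 */
	status |= NCP_CONN_GOING_DOWN;
	printf("going down: %d\n", conn_valid(status));	/* 0 */
	status |= NCP_CONN_DOWN;	/* cf. ncp_invalidate_conn() */
	printf("down:       %d\n", conn_valid(status));	/* 0 */
	return 0;
}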
diff --git a/fs/ncpfs/ncplib_kernel.c b/fs/ncpfs/ncplib_kernel.c
index a95615a0b6ac..981a95617fc9 100644
--- a/fs/ncpfs/ncplib_kernel.c
+++ b/fs/ncpfs/ncplib_kernel.c
@@ -11,7 +11,7 @@
11 11
12 12
13 13
14#include "ncplib_kernel.h" 14#include "ncp_fs.h"
15 15
16static inline void assert_server_locked(struct ncp_server *server) 16static inline void assert_server_locked(struct ncp_server *server)
17{ 17{
diff --git a/fs/ncpfs/ncplib_kernel.h b/fs/ncpfs/ncplib_kernel.h
index 1220df75ff22..09881e6aa5ad 100644
--- a/fs/ncpfs/ncplib_kernel.h
+++ b/fs/ncpfs/ncplib_kernel.h
@@ -32,8 +32,6 @@
32#include <linux/ctype.h> 32#include <linux/ctype.h>
33#endif /* CONFIG_NCPFS_NLS */ 33#endif /* CONFIG_NCPFS_NLS */
34 34
35#include <linux/ncp_fs.h>
36
37#define NCP_MIN_SYMLINK_SIZE 8 35#define NCP_MIN_SYMLINK_SIZE 8
38#define NCP_MAX_SYMLINK_SIZE 512 36#define NCP_MAX_SYMLINK_SIZE 512
39 37
diff --git a/fs/ncpfs/ncpsign_kernel.c b/fs/ncpfs/ncpsign_kernel.c
index d8b2d7e6910b..08907599dcd2 100644
--- a/fs/ncpfs/ncpsign_kernel.c
+++ b/fs/ncpfs/ncpsign_kernel.c
@@ -11,6 +11,7 @@
11#include <linux/string.h> 11#include <linux/string.h>
12#include <linux/ncp.h> 12#include <linux/ncp.h>
13#include <linux/bitops.h> 13#include <linux/bitops.h>
14#include "ncp_fs.h"
14#include "ncpsign_kernel.h" 15#include "ncpsign_kernel.h"
15 16
16/* i386: 32-bit, little endian, handles mis-alignment */ 17/* i386: 32-bit, little endian, handles mis-alignment */
diff --git a/fs/ncpfs/ncpsign_kernel.h b/fs/ncpfs/ncpsign_kernel.h
index 6451a68381cc..d9a1438bb1f6 100644
--- a/fs/ncpfs/ncpsign_kernel.h
+++ b/fs/ncpfs/ncpsign_kernel.h
@@ -8,8 +8,6 @@
8#ifndef _NCPSIGN_KERNEL_H 8#ifndef _NCPSIGN_KERNEL_H
9#define _NCPSIGN_KERNEL_H 9#define _NCPSIGN_KERNEL_H
10 10
11#include <linux/ncp_fs.h>
12
13#ifdef CONFIG_NCPFS_PACKET_SIGNING 11#ifdef CONFIG_NCPFS_PACKET_SIGNING
14void __sign_packet(struct ncp_server *server, const char *data, size_t size, __u32 totalsize, void *sign_buff); 12void __sign_packet(struct ncp_server *server, const char *data, size_t size, __u32 totalsize, void *sign_buff);
15int sign_verify_reply(struct ncp_server *server, const char *data, size_t size, __u32 totalsize, const void *sign_buff); 13int sign_verify_reply(struct ncp_server *server, const char *data, size_t size, __u32 totalsize, const void *sign_buff);
diff --git a/fs/ncpfs/sock.c b/fs/ncpfs/sock.c
index 668bd267346e..3a1587222c8a 100644
--- a/fs/ncpfs/sock.c
+++ b/fs/ncpfs/sock.c
@@ -28,7 +28,7 @@
28#include <linux/poll.h> 28#include <linux/poll.h>
29#include <linux/file.h> 29#include <linux/file.h>
30 30
31#include <linux/ncp_fs.h> 31#include "ncp_fs.h"
32 32
33#include "ncpsign_kernel.h" 33#include "ncpsign_kernel.h"
34 34
diff --git a/fs/ncpfs/symlink.c b/fs/ncpfs/symlink.c
index c634fd17b337..661f861d80c6 100644
--- a/fs/ncpfs/symlink.c
+++ b/fs/ncpfs/symlink.c
@@ -25,13 +25,11 @@
25 25
26#include <linux/errno.h> 26#include <linux/errno.h>
27#include <linux/fs.h> 27#include <linux/fs.h>
28#include <linux/ncp_fs.h>
29#include <linux/time.h> 28#include <linux/time.h>
30#include <linux/slab.h> 29#include <linux/slab.h>
31#include <linux/mm.h> 30#include <linux/mm.h>
32#include <linux/stat.h> 31#include <linux/stat.h>
33#include "ncplib_kernel.h" 32#include "ncp_fs.h"
34
35 33
36/* these magic numbers must appear in the symlink file -- this makes it a bit 34/* these magic numbers must appear in the symlink file -- this makes it a bit
37 more resilient against the magic attributes being set on random files. */ 35 more resilient against the magic attributes being set on random files. */
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 93a8b3bd69e3..199016528fcb 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -16,9 +16,7 @@
16#include <linux/freezer.h> 16#include <linux/freezer.h>
17#include <linux/kthread.h> 17#include <linux/kthread.h>
18#include <linux/sunrpc/svcauth_gss.h> 18#include <linux/sunrpc/svcauth_gss.h>
19#if defined(CONFIG_NFS_V4_1)
20#include <linux/sunrpc/bc_xprt.h> 19#include <linux/sunrpc/bc_xprt.h>
21#endif
22 20
23#include <net/inet_sock.h> 21#include <net/inet_sock.h>
24 22
@@ -137,6 +135,33 @@ out_err:
137 135
138#if defined(CONFIG_NFS_V4_1) 136#if defined(CONFIG_NFS_V4_1)
139/* 137/*
138 * CB_SEQUENCE operations will fail until the callback sessionid is set.
139 */
138 * CB_SEQUENCE operations will fail until the callback sessionid is set.
140int nfs4_set_callback_sessionid(struct nfs_client *clp)
141{
142 struct svc_serv *serv = clp->cl_rpcclient->cl_xprt->bc_serv;
143 struct nfs4_sessionid *bc_sid;
144
145 if (!serv->sv_bc_xprt)
146 return -EINVAL;
147
148 /* on success freed in xprt_free */
149 bc_sid = kmalloc(sizeof(struct nfs4_sessionid), GFP_KERNEL);
150 if (!bc_sid)
151 return -ENOMEM;
152 memcpy(bc_sid->data, &clp->cl_session->sess_id.data,
153 NFS4_MAX_SESSIONID_LEN);
154 spin_lock_bh(&serv->sv_cb_lock);
155 serv->sv_bc_xprt->xpt_bc_sid = bc_sid;
156 spin_unlock_bh(&serv->sv_cb_lock);
157 dprintk("%s set xpt_bc_sid=%u:%u:%u:%u for sv_bc_xprt %p\n", __func__,
158 ((u32 *)bc_sid->data)[0], ((u32 *)bc_sid->data)[1],
159 ((u32 *)bc_sid->data)[2], ((u32 *)bc_sid->data)[3],
160 serv->sv_bc_xprt);
161 return 0;
162}
163
164/*
140 * The callback service for NFSv4.1 callbacks 165 * The callback service for NFSv4.1 callbacks
141 */ 166 */
142static int 167static int
@@ -177,30 +202,38 @@ nfs41_callback_svc(void *vrqstp)
177struct svc_rqst * 202struct svc_rqst *
178nfs41_callback_up(struct svc_serv *serv, struct rpc_xprt *xprt) 203nfs41_callback_up(struct svc_serv *serv, struct rpc_xprt *xprt)
179{ 204{
180 struct svc_xprt *bc_xprt; 205 struct svc_rqst *rqstp;
181 struct svc_rqst *rqstp = ERR_PTR(-ENOMEM); 206 int ret;
182 207
183 dprintk("--> %s\n", __func__); 208 /*
184 /* Create a svc_sock for the service */ 209 * Create an svc_sock for the back channel service that shares the
185 bc_xprt = svc_sock_create(serv, xprt->prot); 210 * fore channel connection.
186 if (!bc_xprt) 211 * Returns the input port (0) and sets the svc_serv bc_xprt on success
212 */
213 ret = svc_create_xprt(serv, "tcp-bc", &init_net, PF_INET, 0,
214 SVC_SOCK_ANONYMOUS);
215 if (ret < 0) {
216 rqstp = ERR_PTR(ret);
187 goto out; 217 goto out;
218 }
188 219
189 /* 220 /*
190 * Save the svc_serv in the transport so that it can 221 * Save the svc_serv in the transport so that it can
191 * be referenced when the session backchannel is initialized 222 * be referenced when the session backchannel is initialized
192 */ 223 */
193 serv->bc_xprt = bc_xprt;
194 xprt->bc_serv = serv; 224 xprt->bc_serv = serv;
195 225
196 INIT_LIST_HEAD(&serv->sv_cb_list); 226 INIT_LIST_HEAD(&serv->sv_cb_list);
197 spin_lock_init(&serv->sv_cb_lock); 227 spin_lock_init(&serv->sv_cb_lock);
198 init_waitqueue_head(&serv->sv_cb_waitq); 228 init_waitqueue_head(&serv->sv_cb_waitq);
199 rqstp = svc_prepare_thread(serv, &serv->sv_pools[0]); 229 rqstp = svc_prepare_thread(serv, &serv->sv_pools[0]);
200 if (IS_ERR(rqstp)) 230 if (IS_ERR(rqstp)) {
201 svc_sock_destroy(bc_xprt); 231 svc_xprt_put(serv->sv_bc_xprt);
232 serv->sv_bc_xprt = NULL;
233 }
202out: 234out:
203 dprintk("--> %s return %p\n", __func__, rqstp); 235 dprintk("--> %s return %ld\n", __func__,
236 IS_ERR(rqstp) ? PTR_ERR(rqstp) : 0);
204 return rqstp; 237 return rqstp;
205} 238}
206 239
@@ -233,6 +266,10 @@ static inline void nfs_callback_bc_serv(u32 minorversion, struct rpc_xprt *xprt,
233 struct nfs_callback_data *cb_info) 266 struct nfs_callback_data *cb_info)
234{ 267{
235} 268}
269int nfs4_set_callback_sessionid(struct nfs_client *clp)
270{
271 return 0;
272}
236#endif /* CONFIG_NFS_V4_1 */ 273#endif /* CONFIG_NFS_V4_1 */
237 274
238/* 275/*
@@ -328,6 +365,9 @@ static int check_gss_callback_principal(struct nfs_client *clp,
328 struct rpc_clnt *r = clp->cl_rpcclient; 365 struct rpc_clnt *r = clp->cl_rpcclient;
329 char *p = svc_gss_principal(rqstp); 366 char *p = svc_gss_principal(rqstp);
330 367
368 /* No RPC_AUTH_GSS on NFSv4.1 back channel yet */
369 if (clp->cl_minorversion != 0)
370 return SVC_DROP;
331 /* 371 /*
332 * It might just be a normal user principal, in which case 372 * It might just be a normal user principal, in which case
333 * userspace won't bother to tell us the name at all. 373 * userspace won't bother to tell us the name at all.
@@ -345,6 +385,23 @@ static int check_gss_callback_principal(struct nfs_client *clp,
345 return SVC_OK; 385 return SVC_OK;
346} 386}
347 387
388/* pg_authenticate method helper */
389static struct nfs_client *nfs_cb_find_client(struct svc_rqst *rqstp)
390{
391 struct nfs4_sessionid *sessionid = bc_xprt_sid(rqstp);
392 int is_cb_compound = rqstp->rq_proc == CB_COMPOUND ? 1 : 0;
393
394 dprintk("--> %s rq_proc %d\n", __func__, rqstp->rq_proc);
395 if (svc_is_backchannel(rqstp))
396 /* Sessionid (usually) set after CB_NULL ping */
397 return nfs4_find_client_sessionid(svc_addr(rqstp), sessionid,
398 is_cb_compound);
399 else
400 /* No callback identifier in pg_authenticate */
401 return nfs4_find_client_no_ident(svc_addr(rqstp));
402}
403
404/* pg_authenticate method for nfsv4 callback threads. */
348static int nfs_callback_authenticate(struct svc_rqst *rqstp) 405static int nfs_callback_authenticate(struct svc_rqst *rqstp)
349{ 406{
350 struct nfs_client *clp; 407 struct nfs_client *clp;
@@ -352,7 +409,7 @@ static int nfs_callback_authenticate(struct svc_rqst *rqstp)
352 int ret = SVC_OK; 409 int ret = SVC_OK;
353 410
354 /* Don't talk to strangers */ 411 /* Don't talk to strangers */
355 clp = nfs_find_client(svc_addr(rqstp), 4); 412 clp = nfs_cb_find_client(rqstp);
356 if (clp == NULL) 413 if (clp == NULL)
357 return SVC_DROP; 414 return SVC_DROP;
358 415
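[Editor's sketch] The callback.c changes make the v4.1 back channel share the fore-channel connection and record the 16-byte session ID on the shared transport; nfs4_callback_sequence() (in callback_proc.c, below) then accepts a CB_SEQUENCE only if its session ID memcmp()-matches. A compact user-space model of that handshake, with sizes matching NFS4_MAX_SESSIONID_LEN and everything else illustrative:

#include <stdio.h>
#include <string.h>

#define SESSIONID_LEN 16	/* mirrors NFS4_MAX_SESSIONID_LEN */

struct bc_xprt_model {
	unsigned char sid[SESSIONID_LEN];	/* cf. xpt_bc_sid */
	int sid_set;
};

static void set_callback_sessionid(struct bc_xprt_model *x,
				   const unsigned char *sid)
{
	memcpy(x->sid, sid, SESSIONID_LEN);
	x->sid_set = 1;
}

static int cb_sequence_accept(const struct bc_xprt_model *x,
			      const unsigned char *sid)
{
	if (!x->sid_set || memcmp(x->sid, sid, SESSIONID_LEN))
		return -1;	/* cf. NFS4ERR_BADSESSION */
	return 0;
}

int main(void)
{
	struct bc_xprt_model x = { { 0 }, 0 };
	unsigned char good[SESSIONID_LEN] = "abcdefghijklmno";
	unsigned char bad[SESSIONID_LEN] = { 0 };

	set_callback_sessionid(&x, good);
	printf("matching id: %d\n", cb_sequence_accept(&x, good));	/* 0 */
	printf("foreign id:  %d\n", cb_sequence_accept(&x, bad));	/* -1 */
	return 0;
}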
diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h
index 85a7cfd1b8dd..d3b44f9bd747 100644
--- a/fs/nfs/callback.h
+++ b/fs/nfs/callback.h
@@ -34,10 +34,17 @@ enum nfs4_callback_opnum {
34 OP_CB_ILLEGAL = 10044, 34 OP_CB_ILLEGAL = 10044,
35}; 35};
36 36
37struct cb_process_state {
38 __be32 drc_status;
39 struct nfs_client *clp;
40 struct nfs4_sessionid *svc_sid; /* v4.1 callback service sessionid */
41};
42
37struct cb_compound_hdr_arg { 43struct cb_compound_hdr_arg {
38 unsigned int taglen; 44 unsigned int taglen;
39 const char *tag; 45 const char *tag;
40 unsigned int minorversion; 46 unsigned int minorversion;
47 unsigned int cb_ident; /* v4.0 callback identifier */
41 unsigned nops; 48 unsigned nops;
42}; 49};
43 50
@@ -103,14 +110,23 @@ struct cb_sequenceres {
103 uint32_t csr_target_highestslotid; 110 uint32_t csr_target_highestslotid;
104}; 111};
105 112
106extern unsigned nfs4_callback_sequence(struct cb_sequenceargs *args, 113extern __be32 nfs4_callback_sequence(struct cb_sequenceargs *args,
107 struct cb_sequenceres *res); 114 struct cb_sequenceres *res,
115 struct cb_process_state *cps);
108 116
109extern int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation, 117extern int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation,
110 const nfs4_stateid *stateid); 118 const nfs4_stateid *stateid);
111 119
112#define RCA4_TYPE_MASK_RDATA_DLG 0 120#define RCA4_TYPE_MASK_RDATA_DLG 0
113#define RCA4_TYPE_MASK_WDATA_DLG 1 121#define RCA4_TYPE_MASK_WDATA_DLG 1
122#define RCA4_TYPE_MASK_DIR_DLG 2
123#define RCA4_TYPE_MASK_FILE_LAYOUT 3
124#define RCA4_TYPE_MASK_BLK_LAYOUT 4
125#define RCA4_TYPE_MASK_OBJ_LAYOUT_MIN 8
126#define RCA4_TYPE_MASK_OBJ_LAYOUT_MAX 9
127#define RCA4_TYPE_MASK_OTHER_LAYOUT_MIN 12
128#define RCA4_TYPE_MASK_OTHER_LAYOUT_MAX 15
129#define RCA4_TYPE_MASK_ALL 0xf31f
114 130
115struct cb_recallanyargs { 131struct cb_recallanyargs {
116 struct sockaddr *craa_addr; 132 struct sockaddr *craa_addr;
@@ -118,25 +134,52 @@ struct cb_recallanyargs {
118 uint32_t craa_type_mask; 134 uint32_t craa_type_mask;
119}; 135};
120 136
121extern unsigned nfs4_callback_recallany(struct cb_recallanyargs *args, void *dummy); 137extern __be32 nfs4_callback_recallany(struct cb_recallanyargs *args,
138 void *dummy,
139 struct cb_process_state *cps);
122 140
123struct cb_recallslotargs { 141struct cb_recallslotargs {
124 struct sockaddr *crsa_addr; 142 struct sockaddr *crsa_addr;
125 uint32_t crsa_target_max_slots; 143 uint32_t crsa_target_max_slots;
126}; 144};
127extern unsigned nfs4_callback_recallslot(struct cb_recallslotargs *args, 145extern __be32 nfs4_callback_recallslot(struct cb_recallslotargs *args,
128 void *dummy); 146 void *dummy,
147 struct cb_process_state *cps);
148
149struct cb_layoutrecallargs {
150 struct sockaddr *cbl_addr;
151 uint32_t cbl_recall_type;
152 uint32_t cbl_layout_type;
153 uint32_t cbl_layoutchanged;
154 union {
155 struct {
156 struct nfs_fh cbl_fh;
157 struct pnfs_layout_range cbl_range;
158 nfs4_stateid cbl_stateid;
159 };
160 struct nfs_fsid cbl_fsid;
161 };
162};
129 163
130#endif /* CONFIG_NFS_V4_1 */ 164extern unsigned nfs4_callback_layoutrecall(
165 struct cb_layoutrecallargs *args,
166 void *dummy, struct cb_process_state *cps);
131 167
132extern __be32 nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres *res); 168extern void nfs4_check_drain_bc_complete(struct nfs4_session *ses);
133extern __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy); 169extern void nfs4_cb_take_slot(struct nfs_client *clp);
170#endif /* CONFIG_NFS_V4_1 */
134 171
172extern __be32 nfs4_callback_getattr(struct cb_getattrargs *args,
173 struct cb_getattrres *res,
174 struct cb_process_state *cps);
175extern __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy,
176 struct cb_process_state *cps);
135#ifdef CONFIG_NFS_V4 177#ifdef CONFIG_NFS_V4
136extern int nfs_callback_up(u32 minorversion, struct rpc_xprt *xprt); 178extern int nfs_callback_up(u32 minorversion, struct rpc_xprt *xprt);
137extern void nfs_callback_down(int minorversion); 179extern void nfs_callback_down(int minorversion);
138extern int nfs4_validate_delegation_stateid(struct nfs_delegation *delegation, 180extern int nfs4_validate_delegation_stateid(struct nfs_delegation *delegation,
139 const nfs4_stateid *stateid); 181 const nfs4_stateid *stateid);
182extern int nfs4_set_callback_sessionid(struct nfs_client *clp);
140#endif /* CONFIG_NFS_V4 */ 183#endif /* CONFIG_NFS_V4 */
141/* 184/*
142 * nfs41: Callbacks are expected to not cause substantial latency, 185 * nfs41: Callbacks are expected to not cause substantial latency,
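[Editor's sketch] RCA4_TYPE_MASK_ALL (0xf31f) above is exactly the union of the defined recall-any bit positions: 0-4 for the delegation and file/block layout types, 8-9 for the object-layout range, and 12-15 for the "other" layout range. The short program below recomputes the mask and mirrors the validate_bitmap_values() check added in callback_proc.c:

#include <stdio.h>

#define BIT(n)	(1UL << (n))

static unsigned long bit_range(int lo, int hi)
{
	unsigned long mask = 0;
	int b;

	for (b = lo; b <= hi; b++)
		mask |= BIT(b);
	return mask;
}

int main(void)
{
	/* delegations + file/block layouts, object layouts, "other" range */
	unsigned long all = bit_range(0, 4) | bit_range(8, 9) |
			    bit_range(12, 15);

	printf("mask = %#lx\n", all);	/* 0xf31f */
	/* validate_bitmap_values(): reject any undefined bit */
	printf("0x3 valid:  %d\n", (0x3UL & ~all) == 0);	/* 1 */
	printf("0x40 valid: %d\n", (0x40UL & ~all) == 0);	/* 0 */
	return 0;
}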
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 2950fca0c61b..4bb91cb2620d 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -12,30 +12,33 @@
12#include "callback.h" 12#include "callback.h"
13#include "delegation.h" 13#include "delegation.h"
14#include "internal.h" 14#include "internal.h"
15#include "pnfs.h"
15 16
16#ifdef NFS_DEBUG 17#ifdef NFS_DEBUG
17#define NFSDBG_FACILITY NFSDBG_CALLBACK 18#define NFSDBG_FACILITY NFSDBG_CALLBACK
18#endif 19#endif
19 20
20__be32 nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres *res) 21__be32 nfs4_callback_getattr(struct cb_getattrargs *args,
22 struct cb_getattrres *res,
23 struct cb_process_state *cps)
21{ 24{
22 struct nfs_client *clp;
23 struct nfs_delegation *delegation; 25 struct nfs_delegation *delegation;
24 struct nfs_inode *nfsi; 26 struct nfs_inode *nfsi;
25 struct inode *inode; 27 struct inode *inode;
26 28
29 res->status = htonl(NFS4ERR_OP_NOT_IN_SESSION);
30 if (!cps->clp) /* Always set for v4.0. Set in cb_sequence for v4.1 */
31 goto out;
32
27 res->bitmap[0] = res->bitmap[1] = 0; 33 res->bitmap[0] = res->bitmap[1] = 0;
28 res->status = htonl(NFS4ERR_BADHANDLE); 34 res->status = htonl(NFS4ERR_BADHANDLE);
29 clp = nfs_find_client(args->addr, 4);
30 if (clp == NULL)
31 goto out;
32 35
33 dprintk("NFS: GETATTR callback request from %s\n", 36 dprintk("NFS: GETATTR callback request from %s\n",
34 rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR)); 37 rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR));
35 38
36 inode = nfs_delegation_find_inode(clp, &args->fh); 39 inode = nfs_delegation_find_inode(cps->clp, &args->fh);
37 if (inode == NULL) 40 if (inode == NULL)
38 goto out_putclient; 41 goto out;
39 nfsi = NFS_I(inode); 42 nfsi = NFS_I(inode);
40 rcu_read_lock(); 43 rcu_read_lock();
41 delegation = rcu_dereference(nfsi->delegation); 44 delegation = rcu_dereference(nfsi->delegation);
@@ -55,49 +58,41 @@ __be32 nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres *
55out_iput: 58out_iput:
56 rcu_read_unlock(); 59 rcu_read_unlock();
57 iput(inode); 60 iput(inode);
58out_putclient:
59 nfs_put_client(clp);
60out: 61out:
61 dprintk("%s: exit with status = %d\n", __func__, ntohl(res->status)); 62 dprintk("%s: exit with status = %d\n", __func__, ntohl(res->status));
62 return res->status; 63 return res->status;
63} 64}
64 65
65__be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy) 66__be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy,
67 struct cb_process_state *cps)
66{ 68{
67 struct nfs_client *clp;
68 struct inode *inode; 69 struct inode *inode;
69 __be32 res; 70 __be32 res;
70 71
71 res = htonl(NFS4ERR_BADHANDLE); 72 res = htonl(NFS4ERR_OP_NOT_IN_SESSION);
72 clp = nfs_find_client(args->addr, 4); 73 if (!cps->clp) /* Always set for v4.0. Set in cb_sequence for v4.1 */
73 if (clp == NULL)
74 goto out; 74 goto out;
75 75
76 dprintk("NFS: RECALL callback request from %s\n", 76 dprintk("NFS: RECALL callback request from %s\n",
77 rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR)); 77 rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR));
78 78
79 do { 79 res = htonl(NFS4ERR_BADHANDLE);
80 struct nfs_client *prev = clp; 80 inode = nfs_delegation_find_inode(cps->clp, &args->fh);
81 81 if (inode == NULL)
82 inode = nfs_delegation_find_inode(clp, &args->fh); 82 goto out;
83 if (inode != NULL) { 83 /* Set up a helper thread to actually return the delegation */
84 /* Set up a helper thread to actually return the delegation */ 84 switch (nfs_async_inode_return_delegation(inode, &args->stateid)) {
85 switch (nfs_async_inode_return_delegation(inode, &args->stateid)) { 85 case 0:
86 case 0: 86 res = 0;
87 res = 0; 87 break;
88 break; 88 case -ENOENT:
89 case -ENOENT: 89 if (res != 0)
90 if (res != 0) 90 res = htonl(NFS4ERR_BAD_STATEID);
91 res = htonl(NFS4ERR_BAD_STATEID); 91 break;
92 break; 92 default:
93 default: 93 res = htonl(NFS4ERR_RESOURCE);
94 res = htonl(NFS4ERR_RESOURCE); 94 }
95 } 95 iput(inode);
96 iput(inode);
97 }
98 clp = nfs_find_client_next(prev);
99 nfs_put_client(prev);
100 } while (clp != NULL);
101out: 96out:
102 dprintk("%s: exit with status = %d\n", __func__, ntohl(res)); 97 dprintk("%s: exit with status = %d\n", __func__, ntohl(res));
103 return res; 98 return res;
@@ -113,6 +108,139 @@ int nfs4_validate_delegation_stateid(struct nfs_delegation *delegation, const nf
113 108
114#if defined(CONFIG_NFS_V4_1) 109#if defined(CONFIG_NFS_V4_1)
115 110
111static u32 initiate_file_draining(struct nfs_client *clp,
112 struct cb_layoutrecallargs *args)
113{
114 struct pnfs_layout_hdr *lo;
115 struct inode *ino;
116 bool found = false;
117 u32 rv = NFS4ERR_NOMATCHING_LAYOUT;
118 LIST_HEAD(free_me_list);
119
120 spin_lock(&clp->cl_lock);
121 list_for_each_entry(lo, &clp->cl_layouts, plh_layouts) {
122 if (nfs_compare_fh(&args->cbl_fh,
123 &NFS_I(lo->plh_inode)->fh))
124 continue;
125 ino = igrab(lo->plh_inode);
126 if (!ino)
127 continue;
128 found = true;
129 /* Without this, layout can be freed as soon
130 * as we release cl_lock.
131 */
132 get_layout_hdr(lo);
133 break;
134 }
135 spin_unlock(&clp->cl_lock);
136 if (!found)
137 return NFS4ERR_NOMATCHING_LAYOUT;
138
139 spin_lock(&ino->i_lock);
140 if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) ||
141 mark_matching_lsegs_invalid(lo, &free_me_list,
142 args->cbl_range.iomode))
143 rv = NFS4ERR_DELAY;
144 else
145 rv = NFS4ERR_NOMATCHING_LAYOUT;
146 pnfs_set_layout_stateid(lo, &args->cbl_stateid, true);
147 spin_unlock(&ino->i_lock);
148 pnfs_free_lseg_list(&free_me_list);
149 put_layout_hdr(lo);
150 iput(ino);
151 return rv;
152}
153
154static u32 initiate_bulk_draining(struct nfs_client *clp,
155 struct cb_layoutrecallargs *args)
156{
157 struct pnfs_layout_hdr *lo;
158 struct inode *ino;
159 u32 rv = NFS4ERR_NOMATCHING_LAYOUT;
160 struct pnfs_layout_hdr *tmp;
161 LIST_HEAD(recall_list);
162 LIST_HEAD(free_me_list);
163 struct pnfs_layout_range range = {
164 .iomode = IOMODE_ANY,
165 .offset = 0,
166 .length = NFS4_MAX_UINT64,
167 };
168
169 spin_lock(&clp->cl_lock);
170 list_for_each_entry(lo, &clp->cl_layouts, plh_layouts) {
171 if ((args->cbl_recall_type == RETURN_FSID) &&
172 memcmp(&NFS_SERVER(lo->plh_inode)->fsid,
173 &args->cbl_fsid, sizeof(struct nfs_fsid)))
174 continue;
175 if (!igrab(lo->plh_inode))
176 continue;
177 get_layout_hdr(lo);
178 BUG_ON(!list_empty(&lo->plh_bulk_recall));
179 list_add(&lo->plh_bulk_recall, &recall_list);
180 }
181 spin_unlock(&clp->cl_lock);
182 list_for_each_entry_safe(lo, tmp,
183 &recall_list, plh_bulk_recall) {
184 ino = lo->plh_inode;
185 spin_lock(&ino->i_lock);
186 set_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
187 if (mark_matching_lsegs_invalid(lo, &free_me_list, range.iomode))
188 rv = NFS4ERR_DELAY;
189 list_del_init(&lo->plh_bulk_recall);
190 spin_unlock(&ino->i_lock);
191 put_layout_hdr(lo);
192 iput(ino);
193 }
194 pnfs_free_lseg_list(&free_me_list);
195 return rv;
196}
197
198static u32 do_callback_layoutrecall(struct nfs_client *clp,
199 struct cb_layoutrecallargs *args)
200{
201 u32 res = NFS4ERR_DELAY;
202
203 dprintk("%s enter, type=%i\n", __func__, args->cbl_recall_type);
204 if (test_and_set_bit(NFS4CLNT_LAYOUTRECALL, &clp->cl_state))
205 goto out;
206 if (args->cbl_recall_type == RETURN_FILE)
207 res = initiate_file_draining(clp, args);
208 else
209 res = initiate_bulk_draining(clp, args);
210 clear_bit(NFS4CLNT_LAYOUTRECALL, &clp->cl_state);
211out:
212 dprintk("%s returning %i\n", __func__, res);
213 return res;
214
215}
216
217__be32 nfs4_callback_layoutrecall(struct cb_layoutrecallargs *args,
218 void *dummy, struct cb_process_state *cps)
219{
220 u32 res;
221
222 dprintk("%s: -->\n", __func__);
223
224 if (cps->clp)
225 res = do_callback_layoutrecall(cps->clp, args);
226 else
227 res = NFS4ERR_OP_NOT_IN_SESSION;
228
229 dprintk("%s: exit with status = %d\n", __func__, res);
230 return cpu_to_be32(res);
231}
232
233static void pnfs_recall_all_layouts(struct nfs_client *clp)
234{
235 struct cb_layoutrecallargs args;
236
237 /* Pretend we got a CB_LAYOUTRECALL(ALL) */
238 memset(&args, 0, sizeof(args));
239 args.cbl_recall_type = RETURN_ALL;
240 /* FIXME we ignore errors, what should we do? */
241 do_callback_layoutrecall(clp, &args);
242}
243
116int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation, const nfs4_stateid *stateid) 244int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation, const nfs4_stateid *stateid)
117{ 245{
118 if (delegation == NULL) 246 if (delegation == NULL)
@@ -185,42 +313,6 @@ validate_seqid(struct nfs4_slot_table *tbl, struct cb_sequenceargs * args)
185} 313}
186 314
187/* 315/*
188 * Returns a pointer to a held 'struct nfs_client' that matches the server's
189 * address, major version number, and session ID. It is the caller's
190 * responsibility to release the returned reference.
191 *
192 * Returns NULL if there are no connections with sessions, or if no session
193 * matches the one of interest.
194 */
195 static struct nfs_client *find_client_with_session(
196 const struct sockaddr *addr, u32 nfsversion,
197 struct nfs4_sessionid *sessionid)
198{
199 struct nfs_client *clp;
200
201 clp = nfs_find_client(addr, 4);
202 if (clp == NULL)
203 return NULL;
204
205 do {
206 struct nfs_client *prev = clp;
207
208 if (clp->cl_session != NULL) {
209 if (memcmp(clp->cl_session->sess_id.data,
210 sessionid->data,
211 NFS4_MAX_SESSIONID_LEN) == 0) {
212 /* Returns a held reference to clp */
213 return clp;
214 }
215 }
216 clp = nfs_find_client_next(prev);
217 nfs_put_client(prev);
218 } while (clp != NULL);
219
220 return NULL;
221}
222
223/*
224 * For each referring call triple, check the session's slot table for 316 * For each referring call triple, check the session's slot table for
225 * a match. If the slot is in use and the sequence numbers match, the 317 * a match. If the slot is in use and the sequence numbers match, the
226 * client is still waiting for a response to the original request. 318 * client is still waiting for a response to the original request.
@@ -276,20 +368,34 @@ out:
276} 368}
277 369
278__be32 nfs4_callback_sequence(struct cb_sequenceargs *args, 370__be32 nfs4_callback_sequence(struct cb_sequenceargs *args,
279 struct cb_sequenceres *res) 371 struct cb_sequenceres *res,
372 struct cb_process_state *cps)
280{ 373{
281 struct nfs_client *clp; 374 struct nfs_client *clp;
282 int i; 375 int i;
283 __be32 status; 376 __be32 status;
284 377
378 cps->clp = NULL;
379
285 status = htonl(NFS4ERR_BADSESSION); 380 status = htonl(NFS4ERR_BADSESSION);
286 clp = find_client_with_session(args->csa_addr, 4, &args->csa_sessionid); 381 /* Incoming session must match the callback session */
382 if (memcmp(&args->csa_sessionid, cps->svc_sid, NFS4_MAX_SESSIONID_LEN))
383 goto out;
384
385 clp = nfs4_find_client_sessionid(args->csa_addr,
386 &args->csa_sessionid, 1);
287 if (clp == NULL) 387 if (clp == NULL)
288 goto out; 388 goto out;
289 389
390 /* state manager is resetting the session */
391 if (test_bit(NFS4_SESSION_DRAINING, &clp->cl_session->session_state)) {
 392 status = htonl(NFS4ERR_DELAY);
393 goto out;
394 }
395
290 status = validate_seqid(&clp->cl_session->bc_slot_table, args); 396 status = validate_seqid(&clp->cl_session->bc_slot_table, args);
291 if (status) 397 if (status)
292 goto out_putclient; 398 goto out;
293 399
294 /* 400 /*
295 * Check for pending referring calls. If a match is found, a 401 * Check for pending referring calls. If a match is found, a
@@ -298,7 +404,7 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args,
298 */ 404 */
299 if (referring_call_exists(clp, args->csa_nrclists, args->csa_rclists)) { 405 if (referring_call_exists(clp, args->csa_nrclists, args->csa_rclists)) {
300 status = htonl(NFS4ERR_DELAY); 406 status = htonl(NFS4ERR_DELAY);
301 goto out_putclient; 407 goto out;
302 } 408 }
303 409
304 memcpy(&res->csr_sessionid, &args->csa_sessionid, 410 memcpy(&res->csr_sessionid, &args->csa_sessionid,
@@ -307,83 +413,93 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args,
307 res->csr_slotid = args->csa_slotid; 413 res->csr_slotid = args->csa_slotid;
308 res->csr_highestslotid = NFS41_BC_MAX_CALLBACKS - 1; 414 res->csr_highestslotid = NFS41_BC_MAX_CALLBACKS - 1;
309 res->csr_target_highestslotid = NFS41_BC_MAX_CALLBACKS - 1; 415 res->csr_target_highestslotid = NFS41_BC_MAX_CALLBACKS - 1;
416 nfs4_cb_take_slot(clp);
417 cps->clp = clp; /* put in nfs4_callback_compound */
310 418
311out_putclient:
312 nfs_put_client(clp);
313out: 419out:
314 for (i = 0; i < args->csa_nrclists; i++) 420 for (i = 0; i < args->csa_nrclists; i++)
315 kfree(args->csa_rclists[i].rcl_refcalls); 421 kfree(args->csa_rclists[i].rcl_refcalls);
316 kfree(args->csa_rclists); 422 kfree(args->csa_rclists);
317 423
318 if (status == htonl(NFS4ERR_RETRY_UNCACHED_REP)) 424 if (status == htonl(NFS4ERR_RETRY_UNCACHED_REP)) {
319 res->csr_status = 0; 425 cps->drc_status = status;
320 else 426 status = 0;
427 } else
321 res->csr_status = status; 428 res->csr_status = status;
429
322 dprintk("%s: exit with status = %d res->csr_status %d\n", __func__, 430 dprintk("%s: exit with status = %d res->csr_status %d\n", __func__,
323 ntohl(status), ntohl(res->csr_status)); 431 ntohl(status), ntohl(res->csr_status));
324 return status; 432 return status;
325} 433}
326 434
327__be32 nfs4_callback_recallany(struct cb_recallanyargs *args, void *dummy) 435static bool
436validate_bitmap_values(unsigned long mask)
437{
438 return (mask & ~RCA4_TYPE_MASK_ALL) == 0;
439}
440
441__be32 nfs4_callback_recallany(struct cb_recallanyargs *args, void *dummy,
442 struct cb_process_state *cps)
328{ 443{
329 struct nfs_client *clp;
330 __be32 status; 444 __be32 status;
331 fmode_t flags = 0; 445 fmode_t flags = 0;
332 446
333 status = htonl(NFS4ERR_OP_NOT_IN_SESSION); 447 status = cpu_to_be32(NFS4ERR_OP_NOT_IN_SESSION);
334 clp = nfs_find_client(args->craa_addr, 4); 448 if (!cps->clp) /* set in cb_sequence */
335 if (clp == NULL)
336 goto out; 449 goto out;
337 450
338 dprintk("NFS: RECALL_ANY callback request from %s\n", 451 dprintk("NFS: RECALL_ANY callback request from %s\n",
339 rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR)); 452 rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR));
453
454 status = cpu_to_be32(NFS4ERR_INVAL);
455 if (!validate_bitmap_values(args->craa_type_mask))
456 goto out;
340 457
458 status = cpu_to_be32(NFS4_OK);
341 if (test_bit(RCA4_TYPE_MASK_RDATA_DLG, (const unsigned long *) 459 if (test_bit(RCA4_TYPE_MASK_RDATA_DLG, (const unsigned long *)
342 &args->craa_type_mask)) 460 &args->craa_type_mask))
343 flags = FMODE_READ; 461 flags = FMODE_READ;
344 if (test_bit(RCA4_TYPE_MASK_WDATA_DLG, (const unsigned long *) 462 if (test_bit(RCA4_TYPE_MASK_WDATA_DLG, (const unsigned long *)
345 &args->craa_type_mask)) 463 &args->craa_type_mask))
346 flags |= FMODE_WRITE; 464 flags |= FMODE_WRITE;
347 465 if (test_bit(RCA4_TYPE_MASK_FILE_LAYOUT, (const unsigned long *)
466 &args->craa_type_mask))
467 pnfs_recall_all_layouts(cps->clp);
348 if (flags) 468 if (flags)
349 nfs_expire_all_delegation_types(clp, flags); 469 nfs_expire_all_delegation_types(cps->clp, flags);
350 status = htonl(NFS4_OK);
351out: 470out:
352 dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); 471 dprintk("%s: exit with status = %d\n", __func__, ntohl(status));
353 return status; 472 return status;
354} 473}
355 474
356/* Reduce the fore channel's max_slots to the target value */ 475/* Reduce the fore channel's max_slots to the target value */
357__be32 nfs4_callback_recallslot(struct cb_recallslotargs *args, void *dummy) 476__be32 nfs4_callback_recallslot(struct cb_recallslotargs *args, void *dummy,
477 struct cb_process_state *cps)
358{ 478{
359 struct nfs_client *clp;
360 struct nfs4_slot_table *fc_tbl; 479 struct nfs4_slot_table *fc_tbl;
361 __be32 status; 480 __be32 status;
362 481
363 status = htonl(NFS4ERR_OP_NOT_IN_SESSION); 482 status = htonl(NFS4ERR_OP_NOT_IN_SESSION);
364 clp = nfs_find_client(args->crsa_addr, 4); 483 if (!cps->clp) /* set in cb_sequence */
365 if (clp == NULL)
366 goto out; 484 goto out;
367 485
368 dprintk("NFS: CB_RECALL_SLOT request from %s target max slots %d\n", 486 dprintk("NFS: CB_RECALL_SLOT request from %s target max slots %d\n",
369 rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR), 487 rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR),
370 args->crsa_target_max_slots); 488 args->crsa_target_max_slots);
371 489
372 fc_tbl = &clp->cl_session->fc_slot_table; 490 fc_tbl = &cps->clp->cl_session->fc_slot_table;
373 491
374 status = htonl(NFS4ERR_BAD_HIGH_SLOT); 492 status = htonl(NFS4ERR_BAD_HIGH_SLOT);
375 if (args->crsa_target_max_slots > fc_tbl->max_slots || 493 if (args->crsa_target_max_slots > fc_tbl->max_slots ||
376 args->crsa_target_max_slots < 1) 494 args->crsa_target_max_slots < 1)
377 goto out_putclient; 495 goto out;
378 496
379 status = htonl(NFS4_OK); 497 status = htonl(NFS4_OK);
380 if (args->crsa_target_max_slots == fc_tbl->max_slots) 498 if (args->crsa_target_max_slots == fc_tbl->max_slots)
381 goto out_putclient; 499 goto out;
382 500
383 fc_tbl->target_max_slots = args->crsa_target_max_slots; 501 fc_tbl->target_max_slots = args->crsa_target_max_slots;
384 nfs41_handle_recall_slot(clp); 502 nfs41_handle_recall_slot(cps->clp);
385out_putclient:
386 nfs_put_client(clp); /* balance nfs_find_client */
387out: 503out:
388 dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); 504 dprintk("%s: exit with status = %d\n", __func__, ntohl(status));
389 return status; 505 return status;
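
The callback_proc.c changes above all follow one pattern: CB_SEQUENCE resolves and references the client once per CB_COMPOUND, stashes it in struct cb_process_state, and every later op consumes cps->clp instead of doing its own nfs_find_client()/nfs_put_client() pair. A minimal sketch of that shape in plain C, with stand-in types and error values (none of this is kernel code):

#include <stddef.h>

struct client;				/* stand-in for struct nfs_client */

struct cb_process_state {
	int drc_status;			/* replay status cached by CB_SEQUENCE */
	struct client *clp;		/* set by CB_SEQUENCE, put when compound ends */
};

/* CB_SEQUENCE is the only op allowed to establish the client binding. */
static int op_sequence(struct cb_process_state *cps, struct client *found)
{
	cps->clp = found;		/* reference held until the compound completes */
	return found ? 0 : -1;		/* -1 stands in for NFS4ERR_BADSESSION */
}

/* Every other op refuses to run unless CB_SEQUENCE came first. */
static int op_recall_any(struct cb_process_state *cps)
{
	if (cps->clp == NULL)
		return -2;		/* stands in for NFS4ERR_OP_NOT_IN_SESSION */
	/* ... act on cps->clp ... */
	return 0;
}

In the patch itself, nfs4_callback_compound() owns the final nfs_put_client(cps.clp), which is why the per-op out_putclient labels disappear above.
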
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index 05af212f0edf..23112c263f81 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -10,8 +10,10 @@
10#include <linux/nfs4.h> 10#include <linux/nfs4.h>
11#include <linux/nfs_fs.h> 11#include <linux/nfs_fs.h>
12#include <linux/slab.h> 12#include <linux/slab.h>
13#include <linux/sunrpc/bc_xprt.h>
13#include "nfs4_fs.h" 14#include "nfs4_fs.h"
14#include "callback.h" 15#include "callback.h"
16#include "internal.h"
15 17
16#define CB_OP_TAGLEN_MAXSZ (512) 18#define CB_OP_TAGLEN_MAXSZ (512)
17#define CB_OP_HDR_RES_MAXSZ (2 + CB_OP_TAGLEN_MAXSZ) 19#define CB_OP_HDR_RES_MAXSZ (2 + CB_OP_TAGLEN_MAXSZ)
@@ -22,6 +24,7 @@
22#define CB_OP_RECALL_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ) 24#define CB_OP_RECALL_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ)
23 25
24#if defined(CONFIG_NFS_V4_1) 26#if defined(CONFIG_NFS_V4_1)
27#define CB_OP_LAYOUTRECALL_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ)
25#define CB_OP_SEQUENCE_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ + \ 28#define CB_OP_SEQUENCE_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ + \
26 4 + 1 + 3) 29 4 + 1 + 3)
27#define CB_OP_RECALLANY_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ) 30#define CB_OP_RECALLANY_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ)
@@ -33,7 +36,8 @@
33/* Internal error code */ 36/* Internal error code */
34#define NFS4ERR_RESOURCE_HDR 11050 37#define NFS4ERR_RESOURCE_HDR 11050
35 38
36typedef __be32 (*callback_process_op_t)(void *, void *); 39typedef __be32 (*callback_process_op_t)(void *, void *,
40 struct cb_process_state *);
37typedef __be32 (*callback_decode_arg_t)(struct svc_rqst *, struct xdr_stream *, void *); 41typedef __be32 (*callback_decode_arg_t)(struct svc_rqst *, struct xdr_stream *, void *);
38typedef __be32 (*callback_encode_res_t)(struct svc_rqst *, struct xdr_stream *, void *); 42typedef __be32 (*callback_encode_res_t)(struct svc_rqst *, struct xdr_stream *, void *);
39 43
@@ -160,7 +164,7 @@ static __be32 decode_compound_hdr_arg(struct xdr_stream *xdr, struct cb_compound
160 hdr->minorversion = ntohl(*p++); 164 hdr->minorversion = ntohl(*p++);
161 /* Check minor version is zero or one. */ 165 /* Check minor version is zero or one. */
162 if (hdr->minorversion <= 1) { 166 if (hdr->minorversion <= 1) {
163 p++; /* skip callback_ident */ 167 hdr->cb_ident = ntohl(*p++); /* ignored by v4.1 */
164 } else { 168 } else {
165 printk(KERN_WARNING "%s: NFSv4 server callback with " 169 printk(KERN_WARNING "%s: NFSv4 server callback with "
166 "illegal minor version %u!\n", 170 "illegal minor version %u!\n",
@@ -220,6 +224,66 @@ out:
220 224
221#if defined(CONFIG_NFS_V4_1) 225#if defined(CONFIG_NFS_V4_1)
222 226
227static __be32 decode_layoutrecall_args(struct svc_rqst *rqstp,
228 struct xdr_stream *xdr,
229 struct cb_layoutrecallargs *args)
230{
231 __be32 *p;
232 __be32 status = 0;
233 uint32_t iomode;
234
235 args->cbl_addr = svc_addr(rqstp);
236 p = read_buf(xdr, 4 * sizeof(uint32_t));
237 if (unlikely(p == NULL)) {
238 status = htonl(NFS4ERR_BADXDR);
239 goto out;
240 }
241
242 args->cbl_layout_type = ntohl(*p++);
 243 /* Despite the spec's xdr, iomode really belongs in the FILE switch,
 244 * as it is unusable and ignored with the other types.
245 */
246 iomode = ntohl(*p++);
247 args->cbl_layoutchanged = ntohl(*p++);
248 args->cbl_recall_type = ntohl(*p++);
249
250 if (args->cbl_recall_type == RETURN_FILE) {
251 args->cbl_range.iomode = iomode;
252 status = decode_fh(xdr, &args->cbl_fh);
253 if (unlikely(status != 0))
254 goto out;
255
256 p = read_buf(xdr, 2 * sizeof(uint64_t));
257 if (unlikely(p == NULL)) {
258 status = htonl(NFS4ERR_BADXDR);
259 goto out;
260 }
261 p = xdr_decode_hyper(p, &args->cbl_range.offset);
262 p = xdr_decode_hyper(p, &args->cbl_range.length);
263 status = decode_stateid(xdr, &args->cbl_stateid);
264 if (unlikely(status != 0))
265 goto out;
266 } else if (args->cbl_recall_type == RETURN_FSID) {
267 p = read_buf(xdr, 2 * sizeof(uint64_t));
268 if (unlikely(p == NULL)) {
269 status = htonl(NFS4ERR_BADXDR);
270 goto out;
271 }
272 p = xdr_decode_hyper(p, &args->cbl_fsid.major);
273 p = xdr_decode_hyper(p, &args->cbl_fsid.minor);
274 } else if (args->cbl_recall_type != RETURN_ALL) {
275 status = htonl(NFS4ERR_BADXDR);
276 goto out;
277 }
278 dprintk("%s: ltype 0x%x iomode %d changed %d recall_type %d\n",
279 __func__,
280 args->cbl_layout_type, iomode,
281 args->cbl_layoutchanged, args->cbl_recall_type);
282out:
283 dprintk("%s: exit with status = %d\n", __func__, ntohl(status));
284 return status;
285}
286
223static __be32 decode_sessionid(struct xdr_stream *xdr, 287static __be32 decode_sessionid(struct xdr_stream *xdr,
224 struct nfs4_sessionid *sid) 288 struct nfs4_sessionid *sid)
225{ 289{
@@ -574,10 +638,10 @@ preprocess_nfs41_op(int nop, unsigned int op_nr, struct callback_op **op)
574 case OP_CB_SEQUENCE: 638 case OP_CB_SEQUENCE:
575 case OP_CB_RECALL_ANY: 639 case OP_CB_RECALL_ANY:
576 case OP_CB_RECALL_SLOT: 640 case OP_CB_RECALL_SLOT:
641 case OP_CB_LAYOUTRECALL:
577 *op = &callback_ops[op_nr]; 642 *op = &callback_ops[op_nr];
578 break; 643 break;
579 644
580 case OP_CB_LAYOUTRECALL:
581 case OP_CB_NOTIFY_DEVICEID: 645 case OP_CB_NOTIFY_DEVICEID:
582 case OP_CB_NOTIFY: 646 case OP_CB_NOTIFY:
583 case OP_CB_PUSH_DELEG: 647 case OP_CB_PUSH_DELEG:
@@ -593,6 +657,37 @@ preprocess_nfs41_op(int nop, unsigned int op_nr, struct callback_op **op)
593 return htonl(NFS_OK); 657 return htonl(NFS_OK);
594} 658}
595 659
660static void nfs4_callback_free_slot(struct nfs4_session *session)
661{
662 struct nfs4_slot_table *tbl = &session->bc_slot_table;
663
664 spin_lock(&tbl->slot_tbl_lock);
665 /*
 666 * Let the state manager know callback processing is done.
667 * A single slot, so highest used slotid is either 0 or -1
668 */
669 tbl->highest_used_slotid--;
670 nfs4_check_drain_bc_complete(session);
671 spin_unlock(&tbl->slot_tbl_lock);
672}
673
674static void nfs4_cb_free_slot(struct nfs_client *clp)
675{
676 if (clp && clp->cl_session)
677 nfs4_callback_free_slot(clp->cl_session);
678}
679
680/* A single slot, so highest used slotid is either 0 or -1 */
681void nfs4_cb_take_slot(struct nfs_client *clp)
682{
683 struct nfs4_slot_table *tbl = &clp->cl_session->bc_slot_table;
684
685 spin_lock(&tbl->slot_tbl_lock);
686 tbl->highest_used_slotid++;
687 BUG_ON(tbl->highest_used_slotid != 0);
688 spin_unlock(&tbl->slot_tbl_lock);
689}
690
596#else /* CONFIG_NFS_V4_1 */ 691#else /* CONFIG_NFS_V4_1 */
597 692
598static __be32 693static __be32
@@ -601,6 +696,9 @@ preprocess_nfs41_op(int nop, unsigned int op_nr, struct callback_op **op)
601 return htonl(NFS4ERR_MINOR_VERS_MISMATCH); 696 return htonl(NFS4ERR_MINOR_VERS_MISMATCH);
602} 697}
603 698
699static void nfs4_cb_free_slot(struct nfs_client *clp)
700{
701}
604#endif /* CONFIG_NFS_V4_1 */ 702#endif /* CONFIG_NFS_V4_1 */
605 703
606static __be32 704static __be32
@@ -621,7 +719,8 @@ preprocess_nfs4_op(unsigned int op_nr, struct callback_op **op)
621static __be32 process_op(uint32_t minorversion, int nop, 719static __be32 process_op(uint32_t minorversion, int nop,
622 struct svc_rqst *rqstp, 720 struct svc_rqst *rqstp,
623 struct xdr_stream *xdr_in, void *argp, 721 struct xdr_stream *xdr_in, void *argp,
624 struct xdr_stream *xdr_out, void *resp, int* drc_status) 722 struct xdr_stream *xdr_out, void *resp,
723 struct cb_process_state *cps)
625{ 724{
626 struct callback_op *op = &callback_ops[0]; 725 struct callback_op *op = &callback_ops[0];
627 unsigned int op_nr; 726 unsigned int op_nr;
@@ -644,8 +743,8 @@ static __be32 process_op(uint32_t minorversion, int nop,
644 if (status) 743 if (status)
645 goto encode_hdr; 744 goto encode_hdr;
646 745
647 if (*drc_status) { 746 if (cps->drc_status) {
648 status = *drc_status; 747 status = cps->drc_status;
649 goto encode_hdr; 748 goto encode_hdr;
650 } 749 }
651 750
@@ -653,16 +752,10 @@ static __be32 process_op(uint32_t minorversion, int nop,
653 if (maxlen > 0 && maxlen < PAGE_SIZE) { 752 if (maxlen > 0 && maxlen < PAGE_SIZE) {
654 status = op->decode_args(rqstp, xdr_in, argp); 753 status = op->decode_args(rqstp, xdr_in, argp);
655 if (likely(status == 0)) 754 if (likely(status == 0))
656 status = op->process_op(argp, resp); 755 status = op->process_op(argp, resp, cps);
657 } else 756 } else
658 status = htonl(NFS4ERR_RESOURCE); 757 status = htonl(NFS4ERR_RESOURCE);
659 758
660 /* Only set by OP_CB_SEQUENCE processing */
661 if (status == htonl(NFS4ERR_RETRY_UNCACHED_REP)) {
662 *drc_status = status;
663 status = 0;
664 }
665
666encode_hdr: 759encode_hdr:
667 res = encode_op_hdr(xdr_out, op_nr, status); 760 res = encode_op_hdr(xdr_out, op_nr, status);
668 if (unlikely(res)) 761 if (unlikely(res))
@@ -681,8 +774,11 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r
681 struct cb_compound_hdr_arg hdr_arg = { 0 }; 774 struct cb_compound_hdr_arg hdr_arg = { 0 };
682 struct cb_compound_hdr_res hdr_res = { NULL }; 775 struct cb_compound_hdr_res hdr_res = { NULL };
683 struct xdr_stream xdr_in, xdr_out; 776 struct xdr_stream xdr_in, xdr_out;
684 __be32 *p; 777 __be32 *p, status;
685 __be32 status, drc_status = 0; 778 struct cb_process_state cps = {
779 .drc_status = 0,
780 .clp = NULL,
781 };
686 unsigned int nops = 0; 782 unsigned int nops = 0;
687 783
688 dprintk("%s: start\n", __func__); 784 dprintk("%s: start\n", __func__);
@@ -696,6 +792,13 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r
696 if (status == __constant_htonl(NFS4ERR_RESOURCE)) 792 if (status == __constant_htonl(NFS4ERR_RESOURCE))
697 return rpc_garbage_args; 793 return rpc_garbage_args;
698 794
795 if (hdr_arg.minorversion == 0) {
796 cps.clp = nfs4_find_client_ident(hdr_arg.cb_ident);
797 if (!cps.clp)
798 return rpc_drop_reply;
799 } else
800 cps.svc_sid = bc_xprt_sid(rqstp);
801
699 hdr_res.taglen = hdr_arg.taglen; 802 hdr_res.taglen = hdr_arg.taglen;
700 hdr_res.tag = hdr_arg.tag; 803 hdr_res.tag = hdr_arg.tag;
701 if (encode_compound_hdr_res(&xdr_out, &hdr_res) != 0) 804 if (encode_compound_hdr_res(&xdr_out, &hdr_res) != 0)
@@ -703,7 +806,7 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r
703 806
704 while (status == 0 && nops != hdr_arg.nops) { 807 while (status == 0 && nops != hdr_arg.nops) {
705 status = process_op(hdr_arg.minorversion, nops, rqstp, 808 status = process_op(hdr_arg.minorversion, nops, rqstp,
706 &xdr_in, argp, &xdr_out, resp, &drc_status); 809 &xdr_in, argp, &xdr_out, resp, &cps);
707 nops++; 810 nops++;
708 } 811 }
709 812
@@ -716,6 +819,8 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r
716 819
717 *hdr_res.status = status; 820 *hdr_res.status = status;
718 *hdr_res.nops = htonl(nops); 821 *hdr_res.nops = htonl(nops);
822 nfs4_cb_free_slot(cps.clp);
823 nfs_put_client(cps.clp);
719 dprintk("%s: done, status = %u\n", __func__, ntohl(status)); 824 dprintk("%s: done, status = %u\n", __func__, ntohl(status));
720 return rpc_success; 825 return rpc_success;
721} 826}
@@ -739,6 +844,12 @@ static struct callback_op callback_ops[] = {
739 .res_maxsize = CB_OP_RECALL_RES_MAXSZ, 844 .res_maxsize = CB_OP_RECALL_RES_MAXSZ,
740 }, 845 },
741#if defined(CONFIG_NFS_V4_1) 846#if defined(CONFIG_NFS_V4_1)
847 [OP_CB_LAYOUTRECALL] = {
848 .process_op = (callback_process_op_t)nfs4_callback_layoutrecall,
849 .decode_args =
850 (callback_decode_arg_t)decode_layoutrecall_args,
851 .res_maxsize = CB_OP_LAYOUTRECALL_RES_MAXSZ,
852 },
742 [OP_CB_SEQUENCE] = { 853 [OP_CB_SEQUENCE] = {
743 .process_op = (callback_process_op_t)nfs4_callback_sequence, 854 .process_op = (callback_process_op_t)nfs4_callback_sequence,
744 .decode_args = (callback_decode_arg_t)decode_cb_sequence_args, 855 .decode_args = (callback_decode_arg_t)decode_cb_sequence_args,
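
The nfs4_cb_take_slot()/nfs4_callback_free_slot() pair added above maintains a one-slot back channel, so highest_used_slotid only ever toggles between -1 (slot free) and 0 (slot in use). A self-contained sketch of that invariant, with a plain int standing in for the spinlock-protected slot table (illustrative, not kernel code):

#include <assert.h>

static int highest_used_slotid = -1;	/* -1: slot free, 0: slot in use */

static void cb_take_slot(void)
{
	highest_used_slotid++;
	assert(highest_used_slotid == 0);	/* mirrors the BUG_ON() above */
}

static void cb_free_slot(void)
{
	highest_used_slotid--;
	assert(highest_used_slotid == -1);	/* slot is free again */
}

nfs4_callback_compound() calls nfs4_cb_free_slot() on exit, so take and free stay paired once per compound.
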
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 0870d0d4efc0..192f2f860265 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -56,6 +56,30 @@ static DEFINE_SPINLOCK(nfs_client_lock);
56static LIST_HEAD(nfs_client_list); 56static LIST_HEAD(nfs_client_list);
57static LIST_HEAD(nfs_volume_list); 57static LIST_HEAD(nfs_volume_list);
58static DECLARE_WAIT_QUEUE_HEAD(nfs_client_active_wq); 58static DECLARE_WAIT_QUEUE_HEAD(nfs_client_active_wq);
59#ifdef CONFIG_NFS_V4
60static DEFINE_IDR(cb_ident_idr); /* Protected by nfs_client_lock */
61
62/*
63 * Get a unique NFSv4.0 callback identifier which will be used
 64 * by the V4.0 callback service to look up the nfs_client struct
65 */
66static int nfs_get_cb_ident_idr(struct nfs_client *clp, int minorversion)
67{
68 int ret = 0;
69
70 if (clp->rpc_ops->version != 4 || minorversion != 0)
71 return ret;
72retry:
73 if (!idr_pre_get(&cb_ident_idr, GFP_KERNEL))
74 return -ENOMEM;
75 spin_lock(&nfs_client_lock);
76 ret = idr_get_new(&cb_ident_idr, clp, &clp->cl_cb_ident);
77 spin_unlock(&nfs_client_lock);
78 if (ret == -EAGAIN)
79 goto retry;
80 return ret;
81}
82#endif /* CONFIG_NFS_V4 */
59 83
60/* 84/*
61 * RPC cruft for NFS 85 * RPC cruft for NFS
@@ -144,7 +168,10 @@ static struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_
144 clp->cl_proto = cl_init->proto; 168 clp->cl_proto = cl_init->proto;
145 169
146#ifdef CONFIG_NFS_V4 170#ifdef CONFIG_NFS_V4
147 INIT_LIST_HEAD(&clp->cl_delegations); 171 err = nfs_get_cb_ident_idr(clp, cl_init->minorversion);
172 if (err)
173 goto error_cleanup;
174
148 spin_lock_init(&clp->cl_lock); 175 spin_lock_init(&clp->cl_lock);
149 INIT_DELAYED_WORK(&clp->cl_renewd, nfs4_renew_state); 176 INIT_DELAYED_WORK(&clp->cl_renewd, nfs4_renew_state);
150 rpc_init_wait_queue(&clp->cl_rpcwaitq, "NFS client"); 177 rpc_init_wait_queue(&clp->cl_rpcwaitq, "NFS client");
@@ -170,21 +197,17 @@ error_0:
170} 197}
171 198
172#ifdef CONFIG_NFS_V4 199#ifdef CONFIG_NFS_V4
173/*
174 * Clears/puts all minor version specific parts from an nfs_client struct
175 * reverting it to minorversion 0.
176 */
177static void nfs4_clear_client_minor_version(struct nfs_client *clp)
178{
179#ifdef CONFIG_NFS_V4_1 200#ifdef CONFIG_NFS_V4_1
180 if (nfs4_has_session(clp)) { 201static void nfs4_shutdown_session(struct nfs_client *clp)
202{
203 if (nfs4_has_session(clp))
181 nfs4_destroy_session(clp->cl_session); 204 nfs4_destroy_session(clp->cl_session);
182 clp->cl_session = NULL;
183 }
184
185 clp->cl_mvops = nfs_v4_minor_ops[0];
186#endif /* CONFIG_NFS_V4_1 */
187} 205}
206#else /* CONFIG_NFS_V4_1 */
207static void nfs4_shutdown_session(struct nfs_client *clp)
208{
209}
210#endif /* CONFIG_NFS_V4_1 */
188 211
189/* 212/*
190 * Destroy the NFS4 callback service 213 * Destroy the NFS4 callback service
@@ -199,17 +222,49 @@ static void nfs4_shutdown_client(struct nfs_client *clp)
199{ 222{
200 if (__test_and_clear_bit(NFS_CS_RENEWD, &clp->cl_res_state)) 223 if (__test_and_clear_bit(NFS_CS_RENEWD, &clp->cl_res_state))
201 nfs4_kill_renewd(clp); 224 nfs4_kill_renewd(clp);
202 nfs4_clear_client_minor_version(clp); 225 nfs4_shutdown_session(clp);
203 nfs4_destroy_callback(clp); 226 nfs4_destroy_callback(clp);
204 if (__test_and_clear_bit(NFS_CS_IDMAP, &clp->cl_res_state)) 227 if (__test_and_clear_bit(NFS_CS_IDMAP, &clp->cl_res_state))
205 nfs_idmap_delete(clp); 228 nfs_idmap_delete(clp);
206 229
207 rpc_destroy_wait_queue(&clp->cl_rpcwaitq); 230 rpc_destroy_wait_queue(&clp->cl_rpcwaitq);
208} 231}
232
 233/* idr_remove_all is not needed as all ids are removed by nfs_put_client */
234void nfs_cleanup_cb_ident_idr(void)
235{
236 idr_destroy(&cb_ident_idr);
237}
238
239/* nfs_client_lock held */
240static void nfs_cb_idr_remove_locked(struct nfs_client *clp)
241{
242 if (clp->cl_cb_ident)
243 idr_remove(&cb_ident_idr, clp->cl_cb_ident);
244}
245
246static void pnfs_init_server(struct nfs_server *server)
247{
248 rpc_init_wait_queue(&server->roc_rpcwaitq, "pNFS ROC");
249}
250
209#else 251#else
210static void nfs4_shutdown_client(struct nfs_client *clp) 252static void nfs4_shutdown_client(struct nfs_client *clp)
211{ 253{
212} 254}
255
256void nfs_cleanup_cb_ident_idr(void)
257{
258}
259
260static void nfs_cb_idr_remove_locked(struct nfs_client *clp)
261{
262}
263
264static void pnfs_init_server(struct nfs_server *server)
265{
266}
267
213#endif /* CONFIG_NFS_V4 */ 268#endif /* CONFIG_NFS_V4 */
214 269
215/* 270/*
@@ -248,6 +303,7 @@ void nfs_put_client(struct nfs_client *clp)
248 303
249 if (atomic_dec_and_lock(&clp->cl_count, &nfs_client_lock)) { 304 if (atomic_dec_and_lock(&clp->cl_count, &nfs_client_lock)) {
250 list_del(&clp->cl_share_link); 305 list_del(&clp->cl_share_link);
306 nfs_cb_idr_remove_locked(clp);
251 spin_unlock(&nfs_client_lock); 307 spin_unlock(&nfs_client_lock);
252 308
253 BUG_ON(!list_empty(&clp->cl_superblocks)); 309 BUG_ON(!list_empty(&clp->cl_superblocks));
@@ -363,70 +419,28 @@ static int nfs_sockaddr_cmp(const struct sockaddr *sa1,
363 return 0; 419 return 0;
364} 420}
365 421
366/* 422/* Common match routine for v4.0 and v4.1 callback services */
367 * Find a client by IP address and protocol version 423bool
368 * - returns NULL if no such client 424nfs4_cb_match_client(const struct sockaddr *addr, struct nfs_client *clp,
369 */ 425 u32 minorversion)
370struct nfs_client *nfs_find_client(const struct sockaddr *addr, u32 nfsversion)
371{
372 struct nfs_client *clp;
373
374 spin_lock(&nfs_client_lock);
375 list_for_each_entry(clp, &nfs_client_list, cl_share_link) {
376 struct sockaddr *clap = (struct sockaddr *)&clp->cl_addr;
377
378 /* Don't match clients that failed to initialise properly */
379 if (!(clp->cl_cons_state == NFS_CS_READY ||
380 clp->cl_cons_state == NFS_CS_SESSION_INITING))
381 continue;
382
383 /* Different NFS versions cannot share the same nfs_client */
384 if (clp->rpc_ops->version != nfsversion)
385 continue;
386
387 /* Match only the IP address, not the port number */
388 if (!nfs_sockaddr_match_ipaddr(addr, clap))
389 continue;
390
391 atomic_inc(&clp->cl_count);
392 spin_unlock(&nfs_client_lock);
393 return clp;
394 }
395 spin_unlock(&nfs_client_lock);
396 return NULL;
397}
398
399/*
400 * Find a client by IP address and protocol version
401 * - returns NULL if no such client
402 */
403struct nfs_client *nfs_find_client_next(struct nfs_client *clp)
404{ 426{
405 struct sockaddr *sap = (struct sockaddr *)&clp->cl_addr; 427 struct sockaddr *clap = (struct sockaddr *)&clp->cl_addr;
406 u32 nfsvers = clp->rpc_ops->version;
407 428
408 spin_lock(&nfs_client_lock); 429 /* Don't match clients that failed to initialise */
409 list_for_each_entry_continue(clp, &nfs_client_list, cl_share_link) { 430 if (!(clp->cl_cons_state == NFS_CS_READY ||
410 struct sockaddr *clap = (struct sockaddr *)&clp->cl_addr; 431 clp->cl_cons_state == NFS_CS_SESSION_INITING))
432 return false;
411 433
412 /* Don't match clients that failed to initialise properly */ 434 /* Match the version and minorversion */
413 if (clp->cl_cons_state != NFS_CS_READY) 435 if (clp->rpc_ops->version != 4 ||
414 continue; 436 clp->cl_minorversion != minorversion)
437 return false;
415 438
416 /* Different NFS versions cannot share the same nfs_client */ 439 /* Match only the IP address, not the port number */
417 if (clp->rpc_ops->version != nfsvers) 440 if (!nfs_sockaddr_match_ipaddr(addr, clap))
418 continue; 441 return false;
419 442
420 /* Match only the IP address, not the port number */ 443 return true;
421 if (!nfs_sockaddr_match_ipaddr(sap, clap))
422 continue;
423
424 atomic_inc(&clp->cl_count);
425 spin_unlock(&nfs_client_lock);
426 return clp;
427 }
428 spin_unlock(&nfs_client_lock);
429 return NULL;
430} 444}
431 445
432/* 446/*
@@ -988,6 +1002,27 @@ static void nfs_server_copy_userdata(struct nfs_server *target, struct nfs_serve
988 target->options = source->options; 1002 target->options = source->options;
989} 1003}
990 1004
1005static void nfs_server_insert_lists(struct nfs_server *server)
1006{
1007 struct nfs_client *clp = server->nfs_client;
1008
1009 spin_lock(&nfs_client_lock);
1010 list_add_tail_rcu(&server->client_link, &clp->cl_superblocks);
1011 list_add_tail(&server->master_link, &nfs_volume_list);
1012 spin_unlock(&nfs_client_lock);
1013
1014}
1015
1016static void nfs_server_remove_lists(struct nfs_server *server)
1017{
1018 spin_lock(&nfs_client_lock);
1019 list_del_rcu(&server->client_link);
1020 list_del(&server->master_link);
1021 spin_unlock(&nfs_client_lock);
1022
1023 synchronize_rcu();
1024}
1025
991/* 1026/*
992 * Allocate and initialise a server record 1027 * Allocate and initialise a server record
993 */ 1028 */
@@ -1004,6 +1039,7 @@ static struct nfs_server *nfs_alloc_server(void)
1004 /* Zero out the NFS state stuff */ 1039 /* Zero out the NFS state stuff */
1005 INIT_LIST_HEAD(&server->client_link); 1040 INIT_LIST_HEAD(&server->client_link);
1006 INIT_LIST_HEAD(&server->master_link); 1041 INIT_LIST_HEAD(&server->master_link);
1042 INIT_LIST_HEAD(&server->delegations);
1007 1043
1008 atomic_set(&server->active, 0); 1044 atomic_set(&server->active, 0);
1009 1045
@@ -1019,6 +1055,8 @@ static struct nfs_server *nfs_alloc_server(void)
1019 return NULL; 1055 return NULL;
1020 } 1056 }
1021 1057
1058 pnfs_init_server(server);
1059
1022 return server; 1060 return server;
1023} 1061}
1024 1062
@@ -1029,11 +1067,8 @@ void nfs_free_server(struct nfs_server *server)
1029{ 1067{
1030 dprintk("--> nfs_free_server()\n"); 1068 dprintk("--> nfs_free_server()\n");
1031 1069
1070 nfs_server_remove_lists(server);
1032 unset_pnfs_layoutdriver(server); 1071 unset_pnfs_layoutdriver(server);
1033 spin_lock(&nfs_client_lock);
1034 list_del(&server->client_link);
1035 list_del(&server->master_link);
1036 spin_unlock(&nfs_client_lock);
1037 1072
1038 if (server->destroy != NULL) 1073 if (server->destroy != NULL)
1039 server->destroy(server); 1074 server->destroy(server);
@@ -1108,11 +1143,7 @@ struct nfs_server *nfs_create_server(const struct nfs_parsed_mount_data *data,
1108 (unsigned long long) server->fsid.major, 1143 (unsigned long long) server->fsid.major,
1109 (unsigned long long) server->fsid.minor); 1144 (unsigned long long) server->fsid.minor);
1110 1145
1111 spin_lock(&nfs_client_lock); 1146 nfs_server_insert_lists(server);
1112 list_add_tail(&server->client_link, &server->nfs_client->cl_superblocks);
1113 list_add_tail(&server->master_link, &nfs_volume_list);
1114 spin_unlock(&nfs_client_lock);
1115
1116 server->mount_time = jiffies; 1147 server->mount_time = jiffies;
1117 nfs_free_fattr(fattr); 1148 nfs_free_fattr(fattr);
1118 return server; 1149 return server;
@@ -1125,6 +1156,101 @@ error:
1125 1156
1126#ifdef CONFIG_NFS_V4 1157#ifdef CONFIG_NFS_V4
1127/* 1158/*
1159 * NFSv4.0 callback thread helper
1160 *
1161 * Find a client by IP address, protocol version, and minorversion
1162 *
1163 * Called from the pg_authenticate method. The callback identifier
1164 * is not used as it has not been decoded.
1165 *
1166 * Returns NULL if no such client
1167 */
1168struct nfs_client *
1169nfs4_find_client_no_ident(const struct sockaddr *addr)
1170{
1171 struct nfs_client *clp;
1172
1173 spin_lock(&nfs_client_lock);
1174 list_for_each_entry(clp, &nfs_client_list, cl_share_link) {
1175 if (nfs4_cb_match_client(addr, clp, 0) == false)
1176 continue;
1177 atomic_inc(&clp->cl_count);
1178 spin_unlock(&nfs_client_lock);
1179 return clp;
1180 }
1181 spin_unlock(&nfs_client_lock);
1182 return NULL;
1183}
1184
1185/*
1186 * NFSv4.0 callback thread helper
1187 *
1188 * Find a client by callback identifier
1189 */
1190struct nfs_client *
1191nfs4_find_client_ident(int cb_ident)
1192{
1193 struct nfs_client *clp;
1194
1195 spin_lock(&nfs_client_lock);
1196 clp = idr_find(&cb_ident_idr, cb_ident);
1197 if (clp)
1198 atomic_inc(&clp->cl_count);
1199 spin_unlock(&nfs_client_lock);
1200 return clp;
1201}
1202
1203#if defined(CONFIG_NFS_V4_1)
1204/*
1205 * NFSv4.1 callback thread helper
1206 * For CB_COMPOUND calls, find a client by IP address, protocol version,
1207 * minorversion, and sessionID
1208 *
1209 * CREATE_SESSION triggers a CB_NULL ping from servers. The callback service
1210 * sessionid can only be set after the CREATE_SESSION return, so a CB_NULL
1211 * can arrive before the callback sessionid is set. For CB_NULL calls,
 1212 * find a client by IP address, protocol version, and minorversion.
1213 *
1214 * Returns NULL if no such client
1215 */
1216struct nfs_client *
1217nfs4_find_client_sessionid(const struct sockaddr *addr,
1218 struct nfs4_sessionid *sid, int is_cb_compound)
1219{
1220 struct nfs_client *clp;
1221
1222 spin_lock(&nfs_client_lock);
1223 list_for_each_entry(clp, &nfs_client_list, cl_share_link) {
1224 if (nfs4_cb_match_client(addr, clp, 1) == false)
1225 continue;
1226
1227 if (!nfs4_has_session(clp))
1228 continue;
1229
 1230 /* Match sessionid unless cb_null call */
1231 if (is_cb_compound && (memcmp(clp->cl_session->sess_id.data,
1232 sid->data, NFS4_MAX_SESSIONID_LEN) != 0))
1233 continue;
1234
1235 atomic_inc(&clp->cl_count);
1236 spin_unlock(&nfs_client_lock);
1237 return clp;
1238 }
1239 spin_unlock(&nfs_client_lock);
1240 return NULL;
1241}
1242
1243#else /* CONFIG_NFS_V4_1 */
1244
1245struct nfs_client *
1246nfs4_find_client_sessionid(const struct sockaddr *addr,
1247 struct nfs4_sessionid *sid, int is_cb_compound)
1248{
1249 return NULL;
1250}
1251#endif /* CONFIG_NFS_V4_1 */
1252
1253/*
1128 * Initialize the NFS4 callback service 1254 * Initialize the NFS4 callback service
1129 */ 1255 */
1130static int nfs4_init_callback(struct nfs_client *clp) 1256static int nfs4_init_callback(struct nfs_client *clp)
@@ -1342,11 +1468,7 @@ static int nfs4_server_common_setup(struct nfs_server *server,
1342 if (server->namelen == 0 || server->namelen > NFS4_MAXNAMLEN) 1468 if (server->namelen == 0 || server->namelen > NFS4_MAXNAMLEN)
1343 server->namelen = NFS4_MAXNAMLEN; 1469 server->namelen = NFS4_MAXNAMLEN;
1344 1470
1345 spin_lock(&nfs_client_lock); 1471 nfs_server_insert_lists(server);
1346 list_add_tail(&server->client_link, &server->nfs_client->cl_superblocks);
1347 list_add_tail(&server->master_link, &nfs_volume_list);
1348 spin_unlock(&nfs_client_lock);
1349
1350 server->mount_time = jiffies; 1472 server->mount_time = jiffies;
1351out: 1473out:
1352 nfs_free_fattr(fattr); 1474 nfs_free_fattr(fattr);
@@ -1551,11 +1673,7 @@ struct nfs_server *nfs_clone_server(struct nfs_server *source,
1551 if (error < 0) 1673 if (error < 0)
1552 goto out_free_server; 1674 goto out_free_server;
1553 1675
1554 spin_lock(&nfs_client_lock); 1676 nfs_server_insert_lists(server);
1555 list_add_tail(&server->client_link, &server->nfs_client->cl_superblocks);
1556 list_add_tail(&server->master_link, &nfs_volume_list);
1557 spin_unlock(&nfs_client_lock);
1558
1559 server->mount_time = jiffies; 1677 server->mount_time = jiffies;
1560 1678
1561 nfs_free_fattr(fattr_fsinfo); 1679 nfs_free_fattr(fattr_fsinfo);
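
With the cb_ident IDR in place, the v4.0 callback service resolves a client with one ID lookup plus a reference bump under the lock, instead of walking nfs_client_list by source address. A toy sketch of the lookup side (a fixed array stands in for the IDR; nothing here is kernel code):

#include <stddef.h>

struct client {
	int refcount;
};

#define MAX_IDENTS 64
static struct client *ident_table[MAX_IDENTS];	/* toy stand-in for the IDR */

static struct client *find_client_ident(int cb_ident)
{
	struct client *clp = NULL;

	if (cb_ident < 0 || cb_ident >= MAX_IDENTS)
		return NULL;
	/* nfs_client_lock would be held across the next two steps */
	clp = ident_table[cb_ident];
	if (clp)
		clp->refcount++;	/* caller must drop this reference */
	return clp;
}

nfs_put_client() removes the ID under the same lock (nfs_cb_idr_remove_locked), so an entry found in the table can always be referenced safely.
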
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 1fd62fc49be3..364e4328f392 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -40,11 +40,23 @@ static void nfs_free_delegation(struct nfs_delegation *delegation)
40 call_rcu(&delegation->rcu, nfs_free_delegation_callback); 40 call_rcu(&delegation->rcu, nfs_free_delegation_callback);
41} 41}
42 42
43/**
44 * nfs_mark_delegation_referenced - set delegation's REFERENCED flag
45 * @delegation: delegation to process
46 *
47 */
43void nfs_mark_delegation_referenced(struct nfs_delegation *delegation) 48void nfs_mark_delegation_referenced(struct nfs_delegation *delegation)
44{ 49{
45 set_bit(NFS_DELEGATION_REFERENCED, &delegation->flags); 50 set_bit(NFS_DELEGATION_REFERENCED, &delegation->flags);
46} 51}
47 52
53/**
54 * nfs_have_delegation - check if inode has a delegation
55 * @inode: inode to check
56 * @flags: delegation types to check for
57 *
58 * Returns one if inode has the indicated delegation, otherwise zero.
59 */
48int nfs_have_delegation(struct inode *inode, fmode_t flags) 60int nfs_have_delegation(struct inode *inode, fmode_t flags)
49{ 61{
50 struct nfs_delegation *delegation; 62 struct nfs_delegation *delegation;
@@ -119,10 +131,15 @@ again:
119 return 0; 131 return 0;
120} 132}
121 133
122/* 134/**
123 * Set up a delegation on an inode 135 * nfs_inode_reclaim_delegation - process a delegation reclaim request
136 * @inode: inode to process
137 * @cred: credential to use for request
138 * @res: new delegation state from server
139 *
124 */ 140 */
125void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res) 141void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred,
142 struct nfs_openres *res)
126{ 143{
127 struct nfs_delegation *delegation; 144 struct nfs_delegation *delegation;
128 struct rpc_cred *oldcred = NULL; 145 struct rpc_cred *oldcred = NULL;
@@ -175,38 +192,52 @@ static struct inode *nfs_delegation_grab_inode(struct nfs_delegation *delegation
175 return inode; 192 return inode;
176} 193}
177 194
178static struct nfs_delegation *nfs_detach_delegation_locked(struct nfs_inode *nfsi, 195static struct nfs_delegation *
179 const nfs4_stateid *stateid, 196nfs_detach_delegation_locked(struct nfs_inode *nfsi,
180 struct nfs_client *clp) 197 struct nfs_server *server)
181{ 198{
182 struct nfs_delegation *delegation = 199 struct nfs_delegation *delegation =
183 rcu_dereference_protected(nfsi->delegation, 200 rcu_dereference_protected(nfsi->delegation,
184 lockdep_is_held(&clp->cl_lock)); 201 lockdep_is_held(&server->nfs_client->cl_lock));
185 202
186 if (delegation == NULL) 203 if (delegation == NULL)
187 goto nomatch; 204 goto nomatch;
205
188 spin_lock(&delegation->lock); 206 spin_lock(&delegation->lock);
189 if (stateid != NULL && memcmp(delegation->stateid.data, stateid->data,
190 sizeof(delegation->stateid.data)) != 0)
191 goto nomatch_unlock;
192 list_del_rcu(&delegation->super_list); 207 list_del_rcu(&delegation->super_list);
193 delegation->inode = NULL; 208 delegation->inode = NULL;
194 nfsi->delegation_state = 0; 209 nfsi->delegation_state = 0;
195 rcu_assign_pointer(nfsi->delegation, NULL); 210 rcu_assign_pointer(nfsi->delegation, NULL);
196 spin_unlock(&delegation->lock); 211 spin_unlock(&delegation->lock);
197 return delegation; 212 return delegation;
198nomatch_unlock:
199 spin_unlock(&delegation->lock);
200nomatch: 213nomatch:
201 return NULL; 214 return NULL;
202} 215}
203 216
204/* 217static struct nfs_delegation *nfs_detach_delegation(struct nfs_inode *nfsi,
205 * Set up a delegation on an inode 218 struct nfs_server *server)
219{
220 struct nfs_client *clp = server->nfs_client;
221 struct nfs_delegation *delegation;
222
223 spin_lock(&clp->cl_lock);
224 delegation = nfs_detach_delegation_locked(nfsi, server);
225 spin_unlock(&clp->cl_lock);
226 return delegation;
227}
228
229/**
230 * nfs_inode_set_delegation - set up a delegation on an inode
231 * @inode: inode to which delegation applies
232 * @cred: cred to use for subsequent delegation processing
233 * @res: new delegation state from server
234 *
235 * Returns zero on success, or a negative errno value.
206 */ 236 */
207int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res) 237int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res)
208{ 238{
209 struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; 239 struct nfs_server *server = NFS_SERVER(inode);
240 struct nfs_client *clp = server->nfs_client;
210 struct nfs_inode *nfsi = NFS_I(inode); 241 struct nfs_inode *nfsi = NFS_I(inode);
211 struct nfs_delegation *delegation, *old_delegation; 242 struct nfs_delegation *delegation, *old_delegation;
212 struct nfs_delegation *freeme = NULL; 243 struct nfs_delegation *freeme = NULL;
@@ -227,7 +258,7 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct
227 258
228 spin_lock(&clp->cl_lock); 259 spin_lock(&clp->cl_lock);
229 old_delegation = rcu_dereference_protected(nfsi->delegation, 260 old_delegation = rcu_dereference_protected(nfsi->delegation,
230 lockdep_is_held(&clp->cl_lock)); 261 lockdep_is_held(&clp->cl_lock));
231 if (old_delegation != NULL) { 262 if (old_delegation != NULL) {
232 if (memcmp(&delegation->stateid, &old_delegation->stateid, 263 if (memcmp(&delegation->stateid, &old_delegation->stateid,
233 sizeof(old_delegation->stateid)) == 0 && 264 sizeof(old_delegation->stateid)) == 0 &&
@@ -246,9 +277,9 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct
246 delegation = NULL; 277 delegation = NULL;
247 goto out; 278 goto out;
248 } 279 }
249 freeme = nfs_detach_delegation_locked(nfsi, NULL, clp); 280 freeme = nfs_detach_delegation_locked(nfsi, server);
250 } 281 }
251 list_add_rcu(&delegation->super_list, &clp->cl_delegations); 282 list_add_rcu(&delegation->super_list, &server->delegations);
252 nfsi->delegation_state = delegation->type; 283 nfsi->delegation_state = delegation->type;
253 rcu_assign_pointer(nfsi->delegation, delegation); 284 rcu_assign_pointer(nfsi->delegation, delegation);
254 delegation = NULL; 285 delegation = NULL;
@@ -290,73 +321,85 @@ out:
290 return err; 321 return err;
291} 322}
292 323
293/* 324/**
294 * Return all delegations that have been marked for return 325 * nfs_client_return_marked_delegations - return previously marked delegations
326 * @clp: nfs_client to process
327 *
328 * Returns zero on success, or a negative errno value.
295 */ 329 */
296int nfs_client_return_marked_delegations(struct nfs_client *clp) 330int nfs_client_return_marked_delegations(struct nfs_client *clp)
297{ 331{
298 struct nfs_delegation *delegation; 332 struct nfs_delegation *delegation;
333 struct nfs_server *server;
299 struct inode *inode; 334 struct inode *inode;
300 int err = 0; 335 int err = 0;
301 336
302restart: 337restart:
303 rcu_read_lock(); 338 rcu_read_lock();
304 list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) { 339 list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
305 if (!test_and_clear_bit(NFS_DELEGATION_RETURN, &delegation->flags)) 340 list_for_each_entry_rcu(delegation, &server->delegations,
306 continue; 341 super_list) {
307 inode = nfs_delegation_grab_inode(delegation); 342 if (!test_and_clear_bit(NFS_DELEGATION_RETURN,
308 if (inode == NULL) 343 &delegation->flags))
309 continue; 344 continue;
310 spin_lock(&clp->cl_lock); 345 inode = nfs_delegation_grab_inode(delegation);
311 delegation = nfs_detach_delegation_locked(NFS_I(inode), NULL, clp); 346 if (inode == NULL)
312 spin_unlock(&clp->cl_lock); 347 continue;
313 rcu_read_unlock(); 348 delegation = nfs_detach_delegation(NFS_I(inode),
314 if (delegation != NULL) { 349 server);
315 filemap_flush(inode->i_mapping); 350 rcu_read_unlock();
316 err = __nfs_inode_return_delegation(inode, delegation, 0); 351
352 if (delegation != NULL) {
353 filemap_flush(inode->i_mapping);
354 err = __nfs_inode_return_delegation(inode,
355 delegation, 0);
356 }
357 iput(inode);
358 if (!err)
359 goto restart;
360 set_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state);
361 return err;
317 } 362 }
318 iput(inode);
319 if (!err)
320 goto restart;
321 set_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state);
322 return err;
323 } 363 }
324 rcu_read_unlock(); 364 rcu_read_unlock();
325 return 0; 365 return 0;
326} 366}
327 367
328/* 368/**
329 * This function returns the delegation without reclaiming opens 369 * nfs_inode_return_delegation_noreclaim - return delegation, don't reclaim opens
330 * or protecting against delegation reclaims. 370 * @inode: inode to process
331 * It is therefore really only safe to be called from 371 *
332 * nfs4_clear_inode() 372 * Does not protect against delegation reclaims, therefore really only safe
373 * to be called from nfs4_clear_inode().
333 */ 374 */
334void nfs_inode_return_delegation_noreclaim(struct inode *inode) 375void nfs_inode_return_delegation_noreclaim(struct inode *inode)
335{ 376{
336 struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; 377 struct nfs_server *server = NFS_SERVER(inode);
337 struct nfs_inode *nfsi = NFS_I(inode); 378 struct nfs_inode *nfsi = NFS_I(inode);
338 struct nfs_delegation *delegation; 379 struct nfs_delegation *delegation;
339 380
340 if (rcu_access_pointer(nfsi->delegation) != NULL) { 381 if (rcu_access_pointer(nfsi->delegation) != NULL) {
341 spin_lock(&clp->cl_lock); 382 delegation = nfs_detach_delegation(nfsi, server);
342 delegation = nfs_detach_delegation_locked(nfsi, NULL, clp);
343 spin_unlock(&clp->cl_lock);
344 if (delegation != NULL) 383 if (delegation != NULL)
345 nfs_do_return_delegation(inode, delegation, 0); 384 nfs_do_return_delegation(inode, delegation, 0);
346 } 385 }
347} 386}
348 387
388/**
389 * nfs_inode_return_delegation - synchronously return a delegation
390 * @inode: inode to process
391 *
392 * Returns zero on success, or a negative errno value.
393 */
349int nfs_inode_return_delegation(struct inode *inode) 394int nfs_inode_return_delegation(struct inode *inode)
350{ 395{
351 struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; 396 struct nfs_server *server = NFS_SERVER(inode);
352 struct nfs_inode *nfsi = NFS_I(inode); 397 struct nfs_inode *nfsi = NFS_I(inode);
353 struct nfs_delegation *delegation; 398 struct nfs_delegation *delegation;
354 int err = 0; 399 int err = 0;
355 400
356 if (rcu_access_pointer(nfsi->delegation) != NULL) { 401 if (rcu_access_pointer(nfsi->delegation) != NULL) {
357 spin_lock(&clp->cl_lock); 402 delegation = nfs_detach_delegation(nfsi, server);
358 delegation = nfs_detach_delegation_locked(nfsi, NULL, clp);
359 spin_unlock(&clp->cl_lock);
360 if (delegation != NULL) { 403 if (delegation != NULL) {
361 nfs_wb_all(inode); 404 nfs_wb_all(inode);
362 err = __nfs_inode_return_delegation(inode, delegation, 1); 405 err = __nfs_inode_return_delegation(inode, delegation, 1);
@@ -365,46 +408,61 @@ int nfs_inode_return_delegation(struct inode *inode)
365 return err; 408 return err;
366} 409}
367 410
368static void nfs_mark_return_delegation(struct nfs_client *clp, struct nfs_delegation *delegation) 411static void nfs_mark_return_delegation(struct nfs_delegation *delegation)
369{ 412{
413 struct nfs_client *clp = NFS_SERVER(delegation->inode)->nfs_client;
414
370 set_bit(NFS_DELEGATION_RETURN, &delegation->flags); 415 set_bit(NFS_DELEGATION_RETURN, &delegation->flags);
371 set_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state); 416 set_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state);
372} 417}
373 418
374/* 419/**
375 * Return all delegations associated to a super block 420 * nfs_super_return_all_delegations - return delegations for one superblock
421 * @sb: sb to process
422 *
376 */ 423 */
377void nfs_super_return_all_delegations(struct super_block *sb) 424void nfs_super_return_all_delegations(struct super_block *sb)
378{ 425{
379 struct nfs_client *clp = NFS_SB(sb)->nfs_client; 426 struct nfs_server *server = NFS_SB(sb);
427 struct nfs_client *clp = server->nfs_client;
380 struct nfs_delegation *delegation; 428 struct nfs_delegation *delegation;
381 429
382 if (clp == NULL) 430 if (clp == NULL)
383 return; 431 return;
432
384 rcu_read_lock(); 433 rcu_read_lock();
385 list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) { 434 list_for_each_entry_rcu(delegation, &server->delegations, super_list) {
386 spin_lock(&delegation->lock); 435 spin_lock(&delegation->lock);
387 if (delegation->inode != NULL && delegation->inode->i_sb == sb) 436 set_bit(NFS_DELEGATION_RETURN, &delegation->flags);
388 set_bit(NFS_DELEGATION_RETURN, &delegation->flags);
389 spin_unlock(&delegation->lock); 437 spin_unlock(&delegation->lock);
390 } 438 }
391 rcu_read_unlock(); 439 rcu_read_unlock();
440
392 if (nfs_client_return_marked_delegations(clp) != 0) 441 if (nfs_client_return_marked_delegations(clp) != 0)
393 nfs4_schedule_state_manager(clp); 442 nfs4_schedule_state_manager(clp);
394} 443}
395 444
396static 445static void nfs_mark_return_all_delegation_types(struct nfs_server *server,
397void nfs_client_mark_return_all_delegation_types(struct nfs_client *clp, fmode_t flags) 446 fmode_t flags)
398{ 447{
399 struct nfs_delegation *delegation; 448 struct nfs_delegation *delegation;
400 449
401 rcu_read_lock(); 450 list_for_each_entry_rcu(delegation, &server->delegations, super_list) {
402 list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) {
403 if ((delegation->type == (FMODE_READ|FMODE_WRITE)) && !(flags & FMODE_WRITE)) 451 if ((delegation->type == (FMODE_READ|FMODE_WRITE)) && !(flags & FMODE_WRITE))
404 continue; 452 continue;
405 if (delegation->type & flags) 453 if (delegation->type & flags)
406 nfs_mark_return_delegation(clp, delegation); 454 nfs_mark_return_delegation(delegation);
407 } 455 }
456}
457
458static void nfs_client_mark_return_all_delegation_types(struct nfs_client *clp,
459 fmode_t flags)
460{
461 struct nfs_server *server;
462
463 rcu_read_lock();
464 list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link)
465 nfs_mark_return_all_delegation_types(server, flags);
408 rcu_read_unlock(); 466 rcu_read_unlock();
409} 467}
410 468
@@ -419,19 +477,32 @@ static void nfs_delegation_run_state_manager(struct nfs_client *clp)
419 nfs4_schedule_state_manager(clp); 477 nfs4_schedule_state_manager(clp);
420} 478}
421 479
480/**
 481 * nfs_expire_all_delegation_types - expire delegations of the given types
482 * @clp: client to process
483 * @flags: delegation types to expire
484 *
485 */
422void nfs_expire_all_delegation_types(struct nfs_client *clp, fmode_t flags) 486void nfs_expire_all_delegation_types(struct nfs_client *clp, fmode_t flags)
423{ 487{
424 nfs_client_mark_return_all_delegation_types(clp, flags); 488 nfs_client_mark_return_all_delegation_types(clp, flags);
425 nfs_delegation_run_state_manager(clp); 489 nfs_delegation_run_state_manager(clp);
426} 490}
427 491
492/**
 493 * nfs_expire_all_delegations - expire all delegations held by a client
494 * @clp: client to process
495 *
496 */
428void nfs_expire_all_delegations(struct nfs_client *clp) 497void nfs_expire_all_delegations(struct nfs_client *clp)
429{ 498{
430 nfs_expire_all_delegation_types(clp, FMODE_READ|FMODE_WRITE); 499 nfs_expire_all_delegation_types(clp, FMODE_READ|FMODE_WRITE);
431} 500}
432 501
433/* 502/**
434 * Return all delegations following an NFS4ERR_CB_PATH_DOWN error. 503 * nfs_handle_cb_pathdown - return all delegations after NFS4ERR_CB_PATH_DOWN
504 * @clp: client to process
505 *
435 */ 506 */
436void nfs_handle_cb_pathdown(struct nfs_client *clp) 507void nfs_handle_cb_pathdown(struct nfs_client *clp)
437{ 508{
@@ -440,29 +511,43 @@ void nfs_handle_cb_pathdown(struct nfs_client *clp)
440 nfs_client_mark_return_all_delegations(clp); 511 nfs_client_mark_return_all_delegations(clp);
441} 512}
442 513
443static void nfs_client_mark_return_unreferenced_delegations(struct nfs_client *clp) 514static void nfs_mark_return_unreferenced_delegations(struct nfs_server *server)
444{ 515{
445 struct nfs_delegation *delegation; 516 struct nfs_delegation *delegation;
446 517
447 rcu_read_lock(); 518 list_for_each_entry_rcu(delegation, &server->delegations, super_list) {
448 list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) {
449 if (test_and_clear_bit(NFS_DELEGATION_REFERENCED, &delegation->flags)) 519 if (test_and_clear_bit(NFS_DELEGATION_REFERENCED, &delegation->flags))
450 continue; 520 continue;
451 nfs_mark_return_delegation(clp, delegation); 521 nfs_mark_return_delegation(delegation);
452 } 522 }
453 rcu_read_unlock();
454} 523}
455 524
525/**
526 * nfs_expire_unreferenced_delegations - Eliminate unused delegations
527 * @clp: nfs_client to process
528 *
529 */
456void nfs_expire_unreferenced_delegations(struct nfs_client *clp) 530void nfs_expire_unreferenced_delegations(struct nfs_client *clp)
457{ 531{
458 nfs_client_mark_return_unreferenced_delegations(clp); 532 struct nfs_server *server;
533
534 rcu_read_lock();
535 list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link)
536 nfs_mark_return_unreferenced_delegations(server);
537 rcu_read_unlock();
538
459 nfs_delegation_run_state_manager(clp); 539 nfs_delegation_run_state_manager(clp);
460} 540}
461 541
462/* 542/**
463 * Asynchronous delegation recall! 543 * nfs_async_inode_return_delegation - asynchronously return a delegation
544 * @inode: inode to process
545 * @stateid: state ID information from CB_RECALL arguments
546 *
547 * Returns zero on success, or a negative errno value.
464 */ 548 */
465int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid) 549int nfs_async_inode_return_delegation(struct inode *inode,
550 const nfs4_stateid *stateid)
466{ 551{
467 struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; 552 struct nfs_client *clp = NFS_SERVER(inode)->nfs_client;
468 struct nfs_delegation *delegation; 553 struct nfs_delegation *delegation;
@@ -474,22 +559,21 @@ int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *s
474 rcu_read_unlock(); 559 rcu_read_unlock();
475 return -ENOENT; 560 return -ENOENT;
476 } 561 }
477 562 nfs_mark_return_delegation(delegation);
478 nfs_mark_return_delegation(clp, delegation);
479 rcu_read_unlock(); 563 rcu_read_unlock();
564
480 nfs_delegation_run_state_manager(clp); 565 nfs_delegation_run_state_manager(clp);
481 return 0; 566 return 0;
482} 567}
483 568
484/* 569static struct inode *
485 * Retrieve the inode associated with a delegation 570nfs_delegation_find_inode_server(struct nfs_server *server,
486 */ 571 const struct nfs_fh *fhandle)
487struct inode *nfs_delegation_find_inode(struct nfs_client *clp, const struct nfs_fh *fhandle)
488{ 572{
489 struct nfs_delegation *delegation; 573 struct nfs_delegation *delegation;
490 struct inode *res = NULL; 574 struct inode *res = NULL;
491 rcu_read_lock(); 575
492 list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) { 576 list_for_each_entry_rcu(delegation, &server->delegations, super_list) {
493 spin_lock(&delegation->lock); 577 spin_lock(&delegation->lock);
494 if (delegation->inode != NULL && 578 if (delegation->inode != NULL &&
495 nfs_compare_fh(fhandle, &NFS_I(delegation->inode)->fh) == 0) { 579 nfs_compare_fh(fhandle, &NFS_I(delegation->inode)->fh) == 0) {
@@ -499,49 +583,121 @@ struct inode *nfs_delegation_find_inode(struct nfs_client *clp, const struct nfs
499 if (res != NULL) 583 if (res != NULL)
500 break; 584 break;
501 } 585 }
586 return res;
587}
588
589/**
590 * nfs_delegation_find_inode - retrieve the inode associated with a delegation
591 * @clp: client state handle
592 * @fhandle: filehandle from a delegation recall
593 *
594 * Returns pointer to inode matching "fhandle," or NULL if a matching inode
595 * cannot be found.
596 */
597struct inode *nfs_delegation_find_inode(struct nfs_client *clp,
598 const struct nfs_fh *fhandle)
599{
600 struct nfs_server *server;
601 struct inode *res = NULL;
602
603 rcu_read_lock();
604 list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
+		res = nfs_delegation_find_inode_server(server, fhandle);
+		if (res != NULL)
+			break;
+	}
 	rcu_read_unlock();
 	return res;
 }
 
-/*
- * Mark all delegations as needing to be reclaimed
+static void nfs_delegation_mark_reclaim_server(struct nfs_server *server)
+{
+	struct nfs_delegation *delegation;
+
+	list_for_each_entry_rcu(delegation, &server->delegations, super_list)
+		set_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags);
+}
+
+/**
+ * nfs_delegation_mark_reclaim - mark all delegations as needing to be reclaimed
+ * @clp: nfs_client to process
+ *
  */
 void nfs_delegation_mark_reclaim(struct nfs_client *clp)
 {
-	struct nfs_delegation *delegation;
+	struct nfs_server *server;
+
 	rcu_read_lock();
-	list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list)
-		set_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags);
+	list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link)
+		nfs_delegation_mark_reclaim_server(server);
 	rcu_read_unlock();
 }
 
-/*
- * Reap all unclaimed delegations after reboot recovery is done
+/**
+ * nfs_delegation_reap_unclaimed - reap unclaimed delegations after reboot recovery is done
+ * @clp: nfs_client to process
+ *
  */
 void nfs_delegation_reap_unclaimed(struct nfs_client *clp)
 {
 	struct nfs_delegation *delegation;
+	struct nfs_server *server;
 	struct inode *inode;
+
 restart:
 	rcu_read_lock();
-	list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) {
-		if (test_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags) == 0)
-			continue;
-		inode = nfs_delegation_grab_inode(delegation);
-		if (inode == NULL)
-			continue;
-		spin_lock(&clp->cl_lock);
-		delegation = nfs_detach_delegation_locked(NFS_I(inode), NULL, clp);
-		spin_unlock(&clp->cl_lock);
-		rcu_read_unlock();
-		if (delegation != NULL)
-			nfs_free_delegation(delegation);
-		iput(inode);
-		goto restart;
+	list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
+		list_for_each_entry_rcu(delegation, &server->delegations,
+								super_list) {
+			if (test_bit(NFS_DELEGATION_NEED_RECLAIM,
+						&delegation->flags) == 0)
+				continue;
+			inode = nfs_delegation_grab_inode(delegation);
+			if (inode == NULL)
+				continue;
+			delegation = nfs_detach_delegation(NFS_I(inode),
+								server);
+			rcu_read_unlock();
+
+			if (delegation != NULL)
+				nfs_free_delegation(delegation);
+			iput(inode);
+			goto restart;
+		}
 	}
 	rcu_read_unlock();
 }
 
+/**
+ * nfs_delegations_present - check for existence of delegations
+ * @clp: client state handle
+ *
+ * Returns one if there are any nfs_delegation structures attached
+ * to this nfs_client.
+ */
+int nfs_delegations_present(struct nfs_client *clp)
+{
+	struct nfs_server *server;
+	int ret = 0;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link)
+		if (!list_empty(&server->delegations)) {
+			ret = 1;
+			break;
+		}
+	rcu_read_unlock();
+	return ret;
+}
+
+/**
+ * nfs4_copy_delegation_stateid - Copy inode's state ID information
+ * @dst: stateid data structure to fill in
+ * @inode: inode to check
+ *
+ * Returns one and fills in "dst->data" if inode had a delegation,
+ * otherwise zero is returned.
+ */
 int nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode)
 {
 	struct nfs_inode *nfsi = NFS_I(inode);
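The reap loop above is built on a common RCU idiom: iput() and nfs_free_delegation() can block, and blocking is forbidden under rcu_read_lock(), so the walk drops the RCU lock before the blocking calls and restarts from scratch. A distilled sketch of the pattern (names as in the patch; illustration only, not additional patch code):

	static void example_reap(struct nfs_client *clp)
	{
		struct nfs_delegation *delegation;
		struct nfs_server *server;
		struct inode *inode;

	restart:
		rcu_read_lock();
		list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
			list_for_each_entry_rcu(delegation, &server->delegations,
						super_list) {
				inode = nfs_delegation_grab_inode(delegation);
				if (inode == NULL)
					continue;
				delegation = nfs_detach_delegation(NFS_I(inode), server);
				rcu_read_unlock();	/* must not block while held */
				if (delegation != NULL)
					nfs_free_delegation(delegation);
				iput(inode);
				goto restart;	/* lists may have changed meanwhile */
			}
		}
		rcu_read_unlock();
	}

Restarting after every detach is quadratic in the worst case, but it is the only safe option once the RCU read lock has been dropped mid-walk.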
diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h
index 2026304bda19..d9322e490c56 100644
--- a/fs/nfs/delegation.h
+++ b/fs/nfs/delegation.h
@@ -44,6 +44,7 @@ void nfs_expire_all_delegation_types(struct nfs_client *clp, fmode_t flags);
44void nfs_expire_unreferenced_delegations(struct nfs_client *clp); 44void nfs_expire_unreferenced_delegations(struct nfs_client *clp);
45void nfs_handle_cb_pathdown(struct nfs_client *clp); 45void nfs_handle_cb_pathdown(struct nfs_client *clp);
46int nfs_client_return_marked_delegations(struct nfs_client *clp); 46int nfs_client_return_marked_delegations(struct nfs_client *clp);
47int nfs_delegations_present(struct nfs_client *clp);
47 48
48void nfs_delegation_mark_reclaim(struct nfs_client *clp); 49void nfs_delegation_mark_reclaim(struct nfs_client *clp);
49void nfs_delegation_reap_unclaimed(struct nfs_client *clp); 50void nfs_delegation_reap_unclaimed(struct nfs_client *clp);
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index d33da530097a..2c3eb33b904d 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -33,8 +33,8 @@
33#include <linux/namei.h> 33#include <linux/namei.h>
34#include <linux/mount.h> 34#include <linux/mount.h>
35#include <linux/sched.h> 35#include <linux/sched.h>
36#include <linux/vmalloc.h>
37#include <linux/kmemleak.h> 36#include <linux/kmemleak.h>
37#include <linux/xattr.h>
38 38
39#include "delegation.h" 39#include "delegation.h"
40#include "iostat.h" 40#include "iostat.h"
@@ -125,9 +125,10 @@ const struct inode_operations nfs4_dir_inode_operations = {
125 .permission = nfs_permission, 125 .permission = nfs_permission,
126 .getattr = nfs_getattr, 126 .getattr = nfs_getattr,
127 .setattr = nfs_setattr, 127 .setattr = nfs_setattr,
128 .getxattr = nfs4_getxattr, 128 .getxattr = generic_getxattr,
129 .setxattr = nfs4_setxattr, 129 .setxattr = generic_setxattr,
130 .listxattr = nfs4_listxattr, 130 .listxattr = generic_listxattr,
131 .removexattr = generic_removexattr,
131}; 132};
132 133
133#endif /* CONFIG_NFS_V4 */ 134#endif /* CONFIG_NFS_V4 */
@@ -172,7 +173,7 @@ struct nfs_cache_array {
172 struct nfs_cache_array_entry array[0]; 173 struct nfs_cache_array_entry array[0];
173}; 174};
174 175
175typedef __be32 * (*decode_dirent_t)(struct xdr_stream *, struct nfs_entry *, struct nfs_server *, int); 176typedef int (*decode_dirent_t)(struct xdr_stream *, struct nfs_entry *, int);
176typedef struct { 177typedef struct {
177 struct file *file; 178 struct file *file;
178 struct page *page; 179 struct page *page;
@@ -378,14 +379,14 @@ error:
 	return error;
 }
 
-/* Fill in an entry based on the xdr code stored in desc->page */
-static
-int xdr_decode(nfs_readdir_descriptor_t *desc, struct nfs_entry *entry, struct xdr_stream *stream)
+static int xdr_decode(nfs_readdir_descriptor_t *desc,
+		      struct nfs_entry *entry, struct xdr_stream *xdr)
 {
-	__be32 *p = desc->decode(stream, entry, NFS_SERVER(desc->file->f_path.dentry->d_inode), desc->plus);
-	if (IS_ERR(p))
-		return PTR_ERR(p);
+	int error;
 
+	error = desc->decode(xdr, entry, desc->plus);
+	if (error)
+		return error;
 	entry->fattr->time_start = desc->timestamp;
 	entry->fattr->gencount = desc->gencount;
 	return 0;
@@ -438,7 +439,6 @@ void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry)
438 if (dentry == NULL) 439 if (dentry == NULL)
439 return; 440 return;
440 441
441 d_set_d_op(dentry, NFS_PROTO(dir)->dentry_ops);
442 inode = nfs_fhget(dentry->d_sb, entry->fh, entry->fattr); 442 inode = nfs_fhget(dentry->d_sb, entry->fh, entry->fattr);
443 if (IS_ERR(inode)) 443 if (IS_ERR(inode))
444 goto out; 444 goto out;
@@ -459,25 +459,26 @@ out:
 /* Perform conversion from xdr to cache array */
 static
 int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *entry,
-				void *xdr_page, struct page *page, unsigned int buflen)
+				struct page **xdr_pages, struct page *page, unsigned int buflen)
 {
 	struct xdr_stream stream;
-	struct xdr_buf buf;
-	__be32 *ptr = xdr_page;
+	struct xdr_buf buf = {
+		.pages = xdr_pages,
+		.page_len = buflen,
+		.buflen = buflen,
+		.len = buflen,
+	};
+	struct page *scratch;
 	struct nfs_cache_array *array;
 	unsigned int count = 0;
 	int status;
 
-	buf.head->iov_base = xdr_page;
-	buf.head->iov_len = buflen;
-	buf.tail->iov_len = 0;
-	buf.page_base = 0;
-	buf.page_len = 0;
-	buf.buflen = buf.head->iov_len;
-	buf.len = buf.head->iov_len;
-
-	xdr_init_decode(&stream, &buf, ptr);
+	scratch = alloc_page(GFP_KERNEL);
+	if (scratch == NULL)
+		return -ENOMEM;
 
+	xdr_init_decode(&stream, &buf, NULL);
+	xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
 
 	do {
 		status = xdr_decode(desc, entry, &stream);
@@ -506,6 +507,8 @@ int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *en
 	} else
 		status = PTR_ERR(array);
 	}
+
+	put_page(scratch);
 	return status;
 }
511 514
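The new nfs_readdir_page_filler() decodes straight out of the page array instead of a vm_map_ram() mapping. The key is the scratch page: xdr_inline_decode() copies any item that straddles a page boundary into it, so decode callbacks always see contiguous bytes. A minimal sketch of the pattern, using only the xdr_stream calls visible in this patch:

	static int example_decode_paged(struct page **pages, unsigned int buflen)
	{
		struct xdr_buf buf = {
			.pages		= pages,
			.page_len	= buflen,
			.buflen		= buflen,
			.len		= buflen,
		};
		struct xdr_stream stream;
		struct page *scratch;

		scratch = alloc_page(GFP_KERNEL);
		if (scratch == NULL)
			return -ENOMEM;
		xdr_init_decode(&stream, &buf, NULL);
		xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
		/* ... xdr_inline_decode()-based helpers run here ... */
		put_page(scratch);
		return 0;
	}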
@@ -521,7 +524,6 @@ static
521void nfs_readdir_free_large_page(void *ptr, struct page **pages, 524void nfs_readdir_free_large_page(void *ptr, struct page **pages,
522 unsigned int npages) 525 unsigned int npages)
523{ 526{
524 vm_unmap_ram(ptr, npages);
525 nfs_readdir_free_pagearray(pages, npages); 527 nfs_readdir_free_pagearray(pages, npages);
526} 528}
527 529
@@ -530,9 +532,8 @@ void nfs_readdir_free_large_page(void *ptr, struct page **pages,
  * to nfs_readdir_free_large_page
  */
 static
-void *nfs_readdir_large_page(struct page **pages, unsigned int npages)
+int nfs_readdir_large_page(struct page **pages, unsigned int npages)
 {
-	void *ptr;
 	unsigned int i;
 
 	for (i = 0; i < npages; i++) {
@@ -541,13 +542,11 @@ void *nfs_readdir_large_page(struct page **pages, unsigned int npages)
 			goto out_freepages;
 		pages[i] = page;
 	}
+	return 0;
 
-	ptr = vm_map_ram(pages, npages, 0, PAGE_KERNEL);
-	if (!IS_ERR_OR_NULL(ptr))
-		return ptr;
 out_freepages:
 	nfs_readdir_free_pagearray(pages, i);
-	return NULL;
+	return -ENOMEM;
 }
552 551
553static 552static
@@ -566,6 +565,7 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page,
566 entry.eof = 0; 565 entry.eof = 0;
567 entry.fh = nfs_alloc_fhandle(); 566 entry.fh = nfs_alloc_fhandle();
568 entry.fattr = nfs_alloc_fattr(); 567 entry.fattr = nfs_alloc_fattr();
568 entry.server = NFS_SERVER(inode);
569 if (entry.fh == NULL || entry.fattr == NULL) 569 if (entry.fh == NULL || entry.fattr == NULL)
570 goto out; 570 goto out;
571 571
@@ -577,8 +577,8 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page,
577 memset(array, 0, sizeof(struct nfs_cache_array)); 577 memset(array, 0, sizeof(struct nfs_cache_array));
578 array->eof_index = -1; 578 array->eof_index = -1;
579 579
580 pages_ptr = nfs_readdir_large_page(pages, array_size); 580 status = nfs_readdir_large_page(pages, array_size);
581 if (!pages_ptr) 581 if (status < 0)
582 goto out_release_array; 582 goto out_release_array;
583 do { 583 do {
584 unsigned int pglen; 584 unsigned int pglen;
@@ -587,7 +587,7 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page,
587 if (status < 0) 587 if (status < 0)
588 break; 588 break;
589 pglen = status; 589 pglen = status;
590 status = nfs_readdir_page_filler(desc, &entry, pages_ptr, page, pglen); 590 status = nfs_readdir_page_filler(desc, &entry, pages, page, pglen);
591 if (status < 0) { 591 if (status < 0) {
592 if (status == -ENOSPC) 592 if (status == -ENOSPC)
593 status = 0; 593 status = 0;
@@ -970,7 +970,7 @@ int nfs_lookup_verify_inode(struct inode *inode, struct nameidata *nd)
970{ 970{
971 struct nfs_server *server = NFS_SERVER(inode); 971 struct nfs_server *server = NFS_SERVER(inode);
972 972
973 if (test_bit(NFS_INO_MOUNTPOINT, &NFS_I(inode)->flags)) 973 if (IS_AUTOMOUNT(inode))
974 return 0; 974 return 0;
975 if (nd != NULL) { 975 if (nd != NULL) {
976 /* VFS wants an on-the-wire revalidation */ 976 /* VFS wants an on-the-wire revalidation */
@@ -1173,6 +1173,7 @@ const struct dentry_operations nfs_dentry_operations = {
1173 .d_revalidate = nfs_lookup_revalidate, 1173 .d_revalidate = nfs_lookup_revalidate,
1174 .d_delete = nfs_dentry_delete, 1174 .d_delete = nfs_dentry_delete,
1175 .d_iput = nfs_dentry_iput, 1175 .d_iput = nfs_dentry_iput,
1176 .d_automount = nfs_d_automount,
1176}; 1177};
1177 1178
1178static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd) 1179static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd)
@@ -1192,8 +1193,6 @@ static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, stru
1192 if (dentry->d_name.len > NFS_SERVER(dir)->namelen) 1193 if (dentry->d_name.len > NFS_SERVER(dir)->namelen)
1193 goto out; 1194 goto out;
1194 1195
1195 d_set_d_op(dentry, NFS_PROTO(dir)->dentry_ops);
1196
1197 /* 1196 /*
1198 * If we're doing an exclusive create, optimize away the lookup 1197 * If we're doing an exclusive create, optimize away the lookup
1199 * but don't hash the dentry. 1198 * but don't hash the dentry.
@@ -1221,7 +1220,7 @@ static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, stru
1221 goto out_unblock_sillyrename; 1220 goto out_unblock_sillyrename;
1222 } 1221 }
1223 inode = nfs_fhget(dentry->d_sb, fhandle, fattr); 1222 inode = nfs_fhget(dentry->d_sb, fhandle, fattr);
1224 res = (struct dentry *)inode; 1223 res = ERR_CAST(inode);
1225 if (IS_ERR(res)) 1224 if (IS_ERR(res))
1226 goto out_unblock_sillyrename; 1225 goto out_unblock_sillyrename;
1227 1226
@@ -1248,6 +1247,7 @@ const struct dentry_operations nfs4_dentry_operations = {
1248 .d_revalidate = nfs_open_revalidate, 1247 .d_revalidate = nfs_open_revalidate,
1249 .d_delete = nfs_dentry_delete, 1248 .d_delete = nfs_dentry_delete,
1250 .d_iput = nfs_dentry_iput, 1249 .d_iput = nfs_dentry_iput,
1250 .d_automount = nfs_d_automount,
1251}; 1251};
1252 1252
1253/* 1253/*
@@ -1337,7 +1337,6 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry
1337 res = ERR_PTR(-ENAMETOOLONG); 1337 res = ERR_PTR(-ENAMETOOLONG);
1338 goto out; 1338 goto out;
1339 } 1339 }
1340 d_set_d_op(dentry, NFS_PROTO(dir)->dentry_ops);
1341 1340
1342 /* Let vfs_create() deal with O_EXCL. Instantiate, but don't hash 1341 /* Let vfs_create() deal with O_EXCL. Instantiate, but don't hash
1343 * the dentry. */ 1342 * the dentry. */
@@ -1355,8 +1354,7 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry
1355 if (nd->flags & LOOKUP_CREATE) { 1354 if (nd->flags & LOOKUP_CREATE) {
1356 attr.ia_mode = nd->intent.open.create_mode; 1355 attr.ia_mode = nd->intent.open.create_mode;
1357 attr.ia_valid = ATTR_MODE; 1356 attr.ia_valid = ATTR_MODE;
1358 if (!IS_POSIXACL(dir)) 1357 attr.ia_mode &= ~current_umask();
1359 attr.ia_mode &= ~current_umask();
1360 } else { 1358 } else {
1361 open_flags &= ~(O_EXCL | O_CREAT); 1359 open_flags &= ~(O_EXCL | O_CREAT);
1362 attr.ia_valid = 0; 1360 attr.ia_valid = 0;
@@ -1410,11 +1408,15 @@ no_open:
 static int nfs_open_revalidate(struct dentry *dentry, struct nameidata *nd)
 {
 	struct dentry *parent = NULL;
-	struct inode *inode = dentry->d_inode;
+	struct inode *inode;
 	struct inode *dir;
 	struct nfs_open_context *ctx;
 	int openflags, ret = 0;
 
+	if (nd->flags & LOOKUP_RCU)
+		return -ECHILD;
+
+	inode = dentry->d_inode;
 	if (!is_atomic_open(nd) || d_mountpoint(dentry))
 		goto no_open;
 
@@ -1583,6 +1585,7 @@ static int nfs_create(struct inode *dir, struct dentry *dentry, int mode,
 {
 	struct iattr attr;
 	int error;
+	int open_flags = 0;
 
 	dfprintk(VFS, "NFS: create(%s/%ld), %s\n",
 			dir->i_sb->s_id, dir->i_ino, dentry->d_name.name);
@@ -1590,7 +1593,10 @@ static int nfs_create(struct inode *dir, struct dentry *dentry, int mode,
 	attr.ia_mode = mode;
 	attr.ia_valid = ATTR_MODE;
 
-	error = NFS_PROTO(dir)->create(dir, dentry, &attr, 0, NULL);
+	if ((nd->flags & LOOKUP_CREATE) != 0)
+		open_flags = nd->intent.open.flags;
+
+	error = NFS_PROTO(dir)->create(dir, dentry, &attr, open_flags, NULL);
 	if (error != 0)
 		goto out_err;
 	return 0;
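The d_set_d_op() calls removed in this file (and in getroot.c below) are not just dropped; they are replaced by the 2.6.38 superblock-wide default. A sketch of the other half of the arrangement, assuming the usual fill_super shape (this assignment is not part of the diff shown here):

	static int example_fill_super(struct super_block *sb)
	{
		/* Every dentry allocated under this superblock inherits these
		 * operations, so no per-dentry d_set_d_op() is needed. */
		sb->s_d_op = &nfs_dentry_operations;
		return 0;
	}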
diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c
index 5596c6a2881e..b5ffe8fa291f 100644
--- a/fs/nfs/getroot.c
+++ b/fs/nfs/getroot.c
@@ -119,9 +119,6 @@ struct dentry *nfs_get_root(struct super_block *sb, struct nfs_fh *mntfh)
119 } 119 }
120 120
121 security_d_instantiate(ret, inode); 121 security_d_instantiate(ret, inode);
122
123 if (ret->d_op == NULL)
124 d_set_d_op(ret, server->nfs_client->rpc_ops->dentry_ops);
125out: 122out:
126 nfs_free_fattr(fsinfo.fattr); 123 nfs_free_fattr(fsinfo.fattr);
127 return ret; 124 return ret;
@@ -227,9 +224,6 @@ struct dentry *nfs4_get_root(struct super_block *sb, struct nfs_fh *mntfh)
227 224
228 security_d_instantiate(ret, inode); 225 security_d_instantiate(ret, inode);
229 226
230 if (ret->d_op == NULL)
231 d_set_d_op(ret, server->nfs_client->rpc_ops->dentry_ops);
232
233out: 227out:
234 nfs_free_fattr(fattr); 228 nfs_free_fattr(fattr);
235 dprintk("<-- nfs4_get_root()\n"); 229 dprintk("<-- nfs4_get_root()\n");
diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c
index 4e2d9b6b1380..18696882f1c6 100644
--- a/fs/nfs/idmap.c
+++ b/fs/nfs/idmap.c
@@ -238,7 +238,7 @@ int nfs_map_gid_to_group(struct nfs_client *clp, __u32 gid, char *buf, size_t bu
238 return nfs_idmap_lookup_name(gid, "group", buf, buflen); 238 return nfs_idmap_lookup_name(gid, "group", buf, buflen);
239} 239}
240 240
241#else /* CONFIG_NFS_USE_IDMAPPER not defined */ 241#else /* CONFIG_NFS_USE_NEW_IDMAPPER not defined */
242 242
243#include <linux/module.h> 243#include <linux/module.h>
244#include <linux/mutex.h> 244#include <linux/mutex.h>
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 017daa3bed38..d8512423ba72 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -300,7 +300,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
300 else 300 else
301 inode->i_op = &nfs_mountpoint_inode_operations; 301 inode->i_op = &nfs_mountpoint_inode_operations;
302 inode->i_fop = NULL; 302 inode->i_fop = NULL;
303 set_bit(NFS_INO_MOUNTPOINT, &nfsi->flags); 303 inode->i_flags |= S_AUTOMOUNT;
304 } 304 }
305 } else if (S_ISLNK(inode->i_mode)) 305 } else if (S_ISLNK(inode->i_mode))
306 inode->i_op = &nfs_symlink_inode_operations; 306 inode->i_op = &nfs_symlink_inode_operations;
@@ -1208,7 +1208,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
1208 /* Update the fsid? */ 1208 /* Update the fsid? */
1209 if (S_ISDIR(inode->i_mode) && (fattr->valid & NFS_ATTR_FATTR_FSID) && 1209 if (S_ISDIR(inode->i_mode) && (fattr->valid & NFS_ATTR_FATTR_FSID) &&
1210 !nfs_fsid_equal(&server->fsid, &fattr->fsid) && 1210 !nfs_fsid_equal(&server->fsid, &fattr->fsid) &&
1211 !test_bit(NFS_INO_MOUNTPOINT, &nfsi->flags)) 1211 !IS_AUTOMOUNT(inode))
1212 server->fsid = fattr->fsid; 1212 server->fsid = fattr->fsid;
1213 1213
1214 /* 1214 /*
@@ -1410,9 +1410,9 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
1410 */ 1410 */
1411void nfs4_evict_inode(struct inode *inode) 1411void nfs4_evict_inode(struct inode *inode)
1412{ 1412{
1413 pnfs_destroy_layout(NFS_I(inode));
1413 truncate_inode_pages(&inode->i_data, 0); 1414 truncate_inode_pages(&inode->i_data, 0);
1414 end_writeback(inode); 1415 end_writeback(inode);
1415 pnfs_destroy_layout(NFS_I(inode));
1416 /* If we are holding a delegation, return it! */ 1416 /* If we are holding a delegation, return it! */
1417 nfs_inode_return_delegation_noreclaim(inode); 1417 nfs_inode_return_delegation_noreclaim(inode);
1418 /* First call standard NFS clear_inode() code */ 1418 /* First call standard NFS clear_inode() code */
@@ -1619,6 +1619,7 @@ static void __exit exit_nfs_fs(void)
1619#ifdef CONFIG_PROC_FS 1619#ifdef CONFIG_PROC_FS
1620 rpc_proc_unregister("nfs"); 1620 rpc_proc_unregister("nfs");
1621#endif 1621#endif
1622 nfs_cleanup_cb_ident_idr();
1622 unregister_nfs_fs(); 1623 unregister_nfs_fs();
1623 nfs_fs_proc_exit(); 1624 nfs_fs_proc_exit();
1624 nfsiod_stop(); 1625 nfsiod_stop();
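With NFS_INO_MOUNTPOINT gone, the VFS owns the mountpoint-crossing state: S_AUTOMOUNT on the inode (surfaced through the IS_AUTOMOUNT() test above) tells the path walker to call the dentry's new ->d_automount() op. Roughly, as a paraphrase of the VFS side rather than code from this diff:

	static struct vfsmount *example_cross_mountpoint(struct path *path)
	{
		struct inode *inode = path->dentry->d_inode;

		if (inode != NULL && IS_AUTOMOUNT(inode) &&
		    path->dentry->d_op != NULL &&
		    path->dentry->d_op->d_automount != NULL)
			return path->dentry->d_op->d_automount(path);
		return NULL;
	}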
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index e6356b750b77..4644f04b4b46 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -128,9 +128,13 @@ extern void nfs_umount(const struct nfs_mount_request *info);
128/* client.c */ 128/* client.c */
129extern struct rpc_program nfs_program; 129extern struct rpc_program nfs_program;
130 130
131extern void nfs_cleanup_cb_ident_idr(void);
131extern void nfs_put_client(struct nfs_client *); 132extern void nfs_put_client(struct nfs_client *);
132extern struct nfs_client *nfs_find_client(const struct sockaddr *, u32); 133extern struct nfs_client *nfs4_find_client_no_ident(const struct sockaddr *);
133extern struct nfs_client *nfs_find_client_next(struct nfs_client *); 134extern struct nfs_client *nfs4_find_client_ident(int);
135extern struct nfs_client *
136nfs4_find_client_sessionid(const struct sockaddr *, struct nfs4_sessionid *,
137 int);
134extern struct nfs_server *nfs_create_server( 138extern struct nfs_server *nfs_create_server(
135 const struct nfs_parsed_mount_data *, 139 const struct nfs_parsed_mount_data *,
136 struct nfs_fh *); 140 struct nfs_fh *);
@@ -185,17 +189,20 @@ extern int __init nfs_init_directcache(void);
185extern void nfs_destroy_directcache(void); 189extern void nfs_destroy_directcache(void);
186 190
187/* nfs2xdr.c */ 191/* nfs2xdr.c */
188extern int nfs_stat_to_errno(int); 192extern int nfs_stat_to_errno(enum nfs_stat);
189extern struct rpc_procinfo nfs_procedures[]; 193extern struct rpc_procinfo nfs_procedures[];
190extern __be32 *nfs_decode_dirent(struct xdr_stream *, struct nfs_entry *, struct nfs_server *, int); 194extern int nfs2_decode_dirent(struct xdr_stream *,
195 struct nfs_entry *, int);
191 196
192/* nfs3xdr.c */ 197/* nfs3xdr.c */
193extern struct rpc_procinfo nfs3_procedures[]; 198extern struct rpc_procinfo nfs3_procedures[];
194extern __be32 *nfs3_decode_dirent(struct xdr_stream *, struct nfs_entry *, struct nfs_server *, int); 199extern int nfs3_decode_dirent(struct xdr_stream *,
200 struct nfs_entry *, int);
195 201
196/* nfs4xdr.c */ 202/* nfs4xdr.c */
197#ifdef CONFIG_NFS_V4 203#ifdef CONFIG_NFS_V4
198extern __be32 *nfs4_decode_dirent(struct xdr_stream *, struct nfs_entry *, struct nfs_server *, int); 204extern int nfs4_decode_dirent(struct xdr_stream *,
205 struct nfs_entry *, int);
199#endif 206#endif
200#ifdef CONFIG_NFS_V4_1 207#ifdef CONFIG_NFS_V4_1
201extern const u32 nfs41_maxread_overhead; 208extern const u32 nfs41_maxread_overhead;
@@ -245,6 +252,7 @@ extern char *nfs_path(const char *base,
245 const struct dentry *droot, 252 const struct dentry *droot,
246 const struct dentry *dentry, 253 const struct dentry *dentry,
247 char *buffer, ssize_t buflen); 254 char *buffer, ssize_t buflen);
255extern struct vfsmount *nfs_d_automount(struct path *path);
248 256
249/* getroot.c */ 257/* getroot.c */
250extern struct dentry *nfs_get_root(struct super_block *, struct nfs_fh *); 258extern struct dentry *nfs_get_root(struct super_block *, struct nfs_fh *);
diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c
index 4f981f1f6689..d4c2d6b7507e 100644
--- a/fs/nfs/mount_clnt.c
+++ b/fs/nfs/mount_clnt.c
@@ -236,10 +236,8 @@ void nfs_umount(const struct nfs_mount_request *info)
236 .authflavor = RPC_AUTH_UNIX, 236 .authflavor = RPC_AUTH_UNIX,
237 .flags = RPC_CLNT_CREATE_NOPING, 237 .flags = RPC_CLNT_CREATE_NOPING,
238 }; 238 };
239 struct mountres result;
240 struct rpc_message msg = { 239 struct rpc_message msg = {
241 .rpc_argp = info->dirpath, 240 .rpc_argp = info->dirpath,
242 .rpc_resp = &result,
243 }; 241 };
244 struct rpc_clnt *clnt; 242 struct rpc_clnt *clnt;
245 int status; 243 int status;
@@ -248,7 +246,7 @@ void nfs_umount(const struct nfs_mount_request *info)
248 args.flags |= RPC_CLNT_CREATE_NONPRIVPORT; 246 args.flags |= RPC_CLNT_CREATE_NONPRIVPORT;
249 247
250 clnt = rpc_create(&args); 248 clnt = rpc_create(&args);
251 if (unlikely(IS_ERR(clnt))) 249 if (IS_ERR(clnt))
252 goto out_clnt_err; 250 goto out_clnt_err;
253 251
254 dprintk("NFS: sending UMNT request for %s:%s\n", 252 dprintk("NFS: sending UMNT request for %s:%s\n",
@@ -280,29 +278,20 @@ out_call_err:
  * XDR encode/decode functions for MOUNT
  */
 
-static int encode_mntdirpath(struct xdr_stream *xdr, const char *pathname)
+static void encode_mntdirpath(struct xdr_stream *xdr, const char *pathname)
 {
 	const u32 pathname_len = strlen(pathname);
 	__be32 *p;
 
-	if (unlikely(pathname_len > MNTPATHLEN))
-		return -EIO;
-
-	p = xdr_reserve_space(xdr, sizeof(u32) + pathname_len);
-	if (unlikely(p == NULL))
-		return -EIO;
+	BUG_ON(pathname_len > MNTPATHLEN);
+	p = xdr_reserve_space(xdr, 4 + pathname_len);
 	xdr_encode_opaque(p, pathname, pathname_len);
-
-	return 0;
 }
 
-static int mnt_enc_dirpath(struct rpc_rqst *req, __be32 *p,
-			   const char *dirpath)
+static void mnt_xdr_enc_dirpath(struct rpc_rqst *req, struct xdr_stream *xdr,
+				const char *dirpath)
 {
-	struct xdr_stream xdr;
-
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	return encode_mntdirpath(&xdr, dirpath);
+	encode_mntdirpath(xdr, dirpath);
 }
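Note the behavioral shift above: encode_mntdirpath() used to return -EIO for an oversized path, while the rewrite BUG_ON()s. That is only safe if every caller screens the length before the request is encoded; a hypothetical caller-side guard would be:

	static int example_validate_dirpath(const char *dirpath)
	{
		if (strlen(dirpath) > MNTPATHLEN)
			return -ENAMETOOLONG;
		return 0;
	}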
307 296
308/* 297/*
@@ -320,10 +309,10 @@ static int decode_status(struct xdr_stream *xdr, struct mountres *res)
320 u32 status; 309 u32 status;
321 __be32 *p; 310 __be32 *p;
322 311
323 p = xdr_inline_decode(xdr, sizeof(status)); 312 p = xdr_inline_decode(xdr, 4);
324 if (unlikely(p == NULL)) 313 if (unlikely(p == NULL))
325 return -EIO; 314 return -EIO;
326 status = ntohl(*p); 315 status = be32_to_cpup(p);
327 316
328 for (i = 0; i < ARRAY_SIZE(mnt_errtbl); i++) { 317 for (i = 0; i < ARRAY_SIZE(mnt_errtbl); i++) {
329 if (mnt_errtbl[i].status == status) { 318 if (mnt_errtbl[i].status == status) {
@@ -351,18 +340,16 @@ static int decode_fhandle(struct xdr_stream *xdr, struct mountres *res)
 	return 0;
 }
 
-static int mnt_dec_mountres(struct rpc_rqst *req, __be32 *p,
-			    struct mountres *res)
+static int mnt_xdr_dec_mountres(struct rpc_rqst *req,
+				struct xdr_stream *xdr,
+				struct mountres *res)
 {
-	struct xdr_stream xdr;
 	int status;
 
-	xdr_init_decode(&xdr, &req->rq_rcv_buf, p);
-
-	status = decode_status(&xdr, res);
+	status = decode_status(xdr, res);
 	if (unlikely(status != 0 || res->errno != 0))
 		return status;
-	return decode_fhandle(&xdr, res);
+	return decode_fhandle(xdr, res);
 }
 
368static int decode_fhs_status(struct xdr_stream *xdr, struct mountres *res) 355static int decode_fhs_status(struct xdr_stream *xdr, struct mountres *res)
@@ -371,10 +358,10 @@ static int decode_fhs_status(struct xdr_stream *xdr, struct mountres *res)
371 u32 status; 358 u32 status;
372 __be32 *p; 359 __be32 *p;
373 360
374 p = xdr_inline_decode(xdr, sizeof(status)); 361 p = xdr_inline_decode(xdr, 4);
375 if (unlikely(p == NULL)) 362 if (unlikely(p == NULL))
376 return -EIO; 363 return -EIO;
377 status = ntohl(*p); 364 status = be32_to_cpup(p);
378 365
379 for (i = 0; i < ARRAY_SIZE(mnt3_errtbl); i++) { 366 for (i = 0; i < ARRAY_SIZE(mnt3_errtbl); i++) {
380 if (mnt3_errtbl[i].status == status) { 367 if (mnt3_errtbl[i].status == status) {
@@ -394,11 +381,11 @@ static int decode_fhandle3(struct xdr_stream *xdr, struct mountres *res)
394 u32 size; 381 u32 size;
395 __be32 *p; 382 __be32 *p;
396 383
397 p = xdr_inline_decode(xdr, sizeof(size)); 384 p = xdr_inline_decode(xdr, 4);
398 if (unlikely(p == NULL)) 385 if (unlikely(p == NULL))
399 return -EIO; 386 return -EIO;
400 387
401 size = ntohl(*p++); 388 size = be32_to_cpup(p);
402 if (size > NFS3_FHSIZE || size == 0) 389 if (size > NFS3_FHSIZE || size == 0)
403 return -EIO; 390 return -EIO;
404 391
@@ -421,15 +408,15 @@ static int decode_auth_flavors(struct xdr_stream *xdr, struct mountres *res)
421 if (*count == 0) 408 if (*count == 0)
422 return 0; 409 return 0;
423 410
424 p = xdr_inline_decode(xdr, sizeof(entries)); 411 p = xdr_inline_decode(xdr, 4);
425 if (unlikely(p == NULL)) 412 if (unlikely(p == NULL))
426 return -EIO; 413 return -EIO;
427 entries = ntohl(*p); 414 entries = be32_to_cpup(p);
428 dprintk("NFS: received %u auth flavors\n", entries); 415 dprintk("NFS: received %u auth flavors\n", entries);
429 if (entries > NFS_MAX_SECFLAVORS) 416 if (entries > NFS_MAX_SECFLAVORS)
430 entries = NFS_MAX_SECFLAVORS; 417 entries = NFS_MAX_SECFLAVORS;
431 418
432 p = xdr_inline_decode(xdr, sizeof(u32) * entries); 419 p = xdr_inline_decode(xdr, 4 * entries);
433 if (unlikely(p == NULL)) 420 if (unlikely(p == NULL))
434 return -EIO; 421 return -EIO;
435 422
@@ -437,7 +424,7 @@ static int decode_auth_flavors(struct xdr_stream *xdr, struct mountres *res)
437 entries = *count; 424 entries = *count;
438 425
439 for (i = 0; i < entries; i++) { 426 for (i = 0; i < entries; i++) {
440 flavors[i] = ntohl(*p++); 427 flavors[i] = be32_to_cpup(p++);
441 dprintk("NFS: auth flavor[%u]: %d\n", i, flavors[i]); 428 dprintk("NFS: auth flavor[%u]: %d\n", i, flavors[i]);
442 } 429 }
443 *count = i; 430 *count = i;
@@ -445,30 +432,28 @@ static int decode_auth_flavors(struct xdr_stream *xdr, struct mountres *res)
 	return 0;
 }
 
-static int mnt_dec_mountres3(struct rpc_rqst *req, __be32 *p,
-			     struct mountres *res)
+static int mnt_xdr_dec_mountres3(struct rpc_rqst *req,
+				 struct xdr_stream *xdr,
+				 struct mountres *res)
 {
-	struct xdr_stream xdr;
 	int status;
 
-	xdr_init_decode(&xdr, &req->rq_rcv_buf, p);
-
-	status = decode_fhs_status(&xdr, res);
+	status = decode_fhs_status(xdr, res);
 	if (unlikely(status != 0 || res->errno != 0))
 		return status;
-	status = decode_fhandle3(&xdr, res);
+	status = decode_fhandle3(xdr, res);
 	if (unlikely(status != 0)) {
 		res->errno = -EBADHANDLE;
 		return 0;
 	}
-	return decode_auth_flavors(&xdr, res);
+	return decode_auth_flavors(xdr, res);
 }
466 451
467static struct rpc_procinfo mnt_procedures[] = { 452static struct rpc_procinfo mnt_procedures[] = {
468 [MOUNTPROC_MNT] = { 453 [MOUNTPROC_MNT] = {
469 .p_proc = MOUNTPROC_MNT, 454 .p_proc = MOUNTPROC_MNT,
470 .p_encode = (kxdrproc_t)mnt_enc_dirpath, 455 .p_encode = (kxdreproc_t)mnt_xdr_enc_dirpath,
471 .p_decode = (kxdrproc_t)mnt_dec_mountres, 456 .p_decode = (kxdrdproc_t)mnt_xdr_dec_mountres,
472 .p_arglen = MNT_enc_dirpath_sz, 457 .p_arglen = MNT_enc_dirpath_sz,
473 .p_replen = MNT_dec_mountres_sz, 458 .p_replen = MNT_dec_mountres_sz,
474 .p_statidx = MOUNTPROC_MNT, 459 .p_statidx = MOUNTPROC_MNT,
@@ -476,7 +461,7 @@ static struct rpc_procinfo mnt_procedures[] = {
476 }, 461 },
477 [MOUNTPROC_UMNT] = { 462 [MOUNTPROC_UMNT] = {
478 .p_proc = MOUNTPROC_UMNT, 463 .p_proc = MOUNTPROC_UMNT,
479 .p_encode = (kxdrproc_t)mnt_enc_dirpath, 464 .p_encode = (kxdreproc_t)mnt_xdr_enc_dirpath,
480 .p_arglen = MNT_enc_dirpath_sz, 465 .p_arglen = MNT_enc_dirpath_sz,
481 .p_statidx = MOUNTPROC_UMNT, 466 .p_statidx = MOUNTPROC_UMNT,
482 .p_name = "UMOUNT", 467 .p_name = "UMOUNT",
@@ -486,8 +471,8 @@ static struct rpc_procinfo mnt_procedures[] = {
486static struct rpc_procinfo mnt3_procedures[] = { 471static struct rpc_procinfo mnt3_procedures[] = {
487 [MOUNTPROC3_MNT] = { 472 [MOUNTPROC3_MNT] = {
488 .p_proc = MOUNTPROC3_MNT, 473 .p_proc = MOUNTPROC3_MNT,
489 .p_encode = (kxdrproc_t)mnt_enc_dirpath, 474 .p_encode = (kxdreproc_t)mnt_xdr_enc_dirpath,
490 .p_decode = (kxdrproc_t)mnt_dec_mountres3, 475 .p_decode = (kxdrdproc_t)mnt_xdr_dec_mountres3,
491 .p_arglen = MNT_enc_dirpath_sz, 476 .p_arglen = MNT_enc_dirpath_sz,
492 .p_replen = MNT_dec_mountres3_sz, 477 .p_replen = MNT_dec_mountres3_sz,
493 .p_statidx = MOUNTPROC3_MNT, 478 .p_statidx = MOUNTPROC3_MNT,
@@ -495,7 +480,7 @@ static struct rpc_procinfo mnt3_procedures[] = {
495 }, 480 },
496 [MOUNTPROC3_UMNT] = { 481 [MOUNTPROC3_UMNT] = {
497 .p_proc = MOUNTPROC3_UMNT, 482 .p_proc = MOUNTPROC3_UMNT,
498 .p_encode = (kxdrproc_t)mnt_enc_dirpath, 483 .p_encode = (kxdreproc_t)mnt_xdr_enc_dirpath,
499 .p_arglen = MNT_enc_dirpath_sz, 484 .p_arglen = MNT_enc_dirpath_sz,
500 .p_statidx = MOUNTPROC3_UMNT, 485 .p_statidx = MOUNTPROC3_UMNT,
501 .p_name = "UMOUNT", 486 .p_name = "UMOUNT",
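The (kxdreproc_t)/(kxdrdproc_t) casts above are the visible edge of a sunrpc-wide conversion: the transport now initializes the xdr_stream, so per-procedure code never touches raw __be32 pointers or rq_snd_buf/rq_rcv_buf itself. For reference, the hook types are approximately the following (see include/linux/sunrpc/xdr.h in this kernel for the authoritative definitions):

	typedef void	(*kxdreproc_t)(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
				       const void *obj);
	typedef int	(*kxdrdproc_t)(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
				       void *obj);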
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index 74aaf3963c10..f32b8603dca8 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -97,9 +97,8 @@ Elong:
 }
 
 /*
- * nfs_follow_mountpoint - handle crossing a mountpoint on the server
- * @dentry - dentry of mountpoint
- * @nd - nameidata info
+ * nfs_d_automount - Handle crossing a mountpoint on the server
+ * @path - The mountpoint
  *
  * When we encounter a mountpoint on the server, we want to set up
  * a mountpoint on the client too, to prevent inode numbers from
@@ -109,87 +108,65 @@ Elong:
  * situation, and that different filesystems may want to use
  * different security flavours.
  */
-static void * nfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
+struct vfsmount *nfs_d_automount(struct path *path)
 {
 	struct vfsmount *mnt;
-	struct nfs_server *server = NFS_SERVER(dentry->d_inode);
+	struct nfs_server *server = NFS_SERVER(path->dentry->d_inode);
 	struct dentry *parent;
 	struct nfs_fh *fh = NULL;
 	struct nfs_fattr *fattr = NULL;
 	int err;
 
-	dprintk("--> nfs_follow_mountpoint()\n");
+	dprintk("--> nfs_d_automount()\n");
 
-	err = -ESTALE;
-	if (IS_ROOT(dentry))
-		goto out_err;
+	mnt = ERR_PTR(-ESTALE);
+	if (IS_ROOT(path->dentry))
+		goto out_nofree;
 
-	err = -ENOMEM;
+	mnt = ERR_PTR(-ENOMEM);
 	fh = nfs_alloc_fhandle();
 	fattr = nfs_alloc_fattr();
 	if (fh == NULL || fattr == NULL)
-		goto out_err;
+		goto out;
 
 	dprintk("%s: enter\n", __func__);
-	dput(nd->path.dentry);
-	nd->path.dentry = dget(dentry);
 
-	/* Look it up again */
-	parent = dget_parent(nd->path.dentry);
+	/* Look it up again to get its attributes */
+	parent = dget_parent(path->dentry);
 	err = server->nfs_client->rpc_ops->lookup(parent->d_inode,
-						  &nd->path.dentry->d_name,
+						  &path->dentry->d_name,
 						  fh, fattr);
 	dput(parent);
-	if (err != 0)
-		goto out_err;
+	if (err != 0) {
+		mnt = ERR_PTR(err);
+		goto out;
+	}
 
 	if (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL)
-		mnt = nfs_do_refmount(nd->path.mnt, nd->path.dentry);
+		mnt = nfs_do_refmount(path->mnt, path->dentry);
 	else
-		mnt = nfs_do_submount(nd->path.mnt, nd->path.dentry, fh,
-				      fattr);
-	err = PTR_ERR(mnt);
+		mnt = nfs_do_submount(path->mnt, path->dentry, fh, fattr);
 	if (IS_ERR(mnt))
-		goto out_err;
+		goto out;
 
-	mntget(mnt);
-	err = do_add_mount(mnt, &nd->path, nd->path.mnt->mnt_flags|MNT_SHRINKABLE,
-			   &nfs_automount_list);
-	if (err < 0) {
-		mntput(mnt);
-		if (err == -EBUSY)
-			goto out_follow;
-		goto out_err;
-	}
-	path_put(&nd->path);
-	nd->path.mnt = mnt;
-	nd->path.dentry = dget(mnt->mnt_root);
+	dprintk("%s: done, success\n", __func__);
+	mntget(mnt); /* prevent immediate expiration */
+	mnt_set_expiry(mnt, &nfs_automount_list);
 	schedule_delayed_work(&nfs_automount_task, nfs_mountpoint_expiry_timeout);
+
 out:
 	nfs_free_fattr(fattr);
 	nfs_free_fhandle(fh);
-	dprintk("%s: done, returned %d\n", __func__, err);
-
-	dprintk("<-- nfs_follow_mountpoint() = %d\n", err);
-	return ERR_PTR(err);
-out_err:
-	path_put(&nd->path);
-	goto out;
-out_follow:
-	while (d_mountpoint(nd->path.dentry) &&
-	       follow_down(&nd->path))
-		;
-	err = 0;
-	goto out;
+out_nofree:
+	dprintk("<-- nfs_follow_mountpoint() = %p\n", mnt);
+	return mnt;
 }
 
 const struct inode_operations nfs_mountpoint_inode_operations = {
-	.follow_link	= nfs_follow_mountpoint,
 	.getattr	= nfs_getattr,
 };
 
 const struct inode_operations nfs_referral_inode_operations = {
-	.follow_link	= nfs_follow_mountpoint,
 };
 
 static void nfs_expire_automounts(struct work_struct *work)
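Two things are worth flagging in the converted function. First, the exit dprintk still prints the old name nfs_follow_mountpoint, a leftover from the mechanical conversion. Second, the ->d_automount() contract it demonstrates: return the vfsmount to attach (the VFS performs the actual mount), take an extra reference before parking it on an expiry list, and return an ERR_PTR on failure. A minimal handler shape, with hypothetical helper names:

	static LIST_HEAD(example_expiry_list);
	static struct vfsmount *example_build_submount(struct path *path);

	static struct vfsmount *example_d_automount(struct path *path)
	{
		/* example_build_submount() stands in for the
		 * nfs_do_refmount()/nfs_do_submount() pair above. */
		struct vfsmount *mnt = example_build_submount(path);

		if (IS_ERR(mnt))
			return mnt;
		mntget(mnt);	/* hold the mount past the next expiry scan */
		mnt_set_expiry(mnt, &example_expiry_list);
		return mnt;
	}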
diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index 5914a1911c95..792cb13a4304 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -61,584 +61,1008 @@
 #define NFS_readdirres_sz	(1)
 #define NFS_statfsres_sz	(1+NFS_info_sz)
 
+
 /*
- * Common NFS XDR functions as inlines
+ * While encoding arguments, set up the reply buffer in advance to
+ * receive reply data directly into the page cache.
  */
-static inline __be32 *
-xdr_encode_fhandle(__be32 *p, const struct nfs_fh *fhandle)
+static void prepare_reply_buffer(struct rpc_rqst *req, struct page **pages,
+				 unsigned int base, unsigned int len,
+				 unsigned int bufsize)
 {
-	memcpy(p, fhandle->data, NFS2_FHSIZE);
-	return p + XDR_QUADLEN(NFS2_FHSIZE);
+	struct rpc_auth *auth = req->rq_cred->cr_auth;
+	unsigned int replen;
+
+	replen = RPC_REPHDRSIZE + auth->au_rslack + bufsize;
+	xdr_inline_pages(&req->rq_rcv_buf, replen << 2, pages, base, len);
 }
 
-static inline __be32 *
-xdr_decode_fhandle(__be32 *p, struct nfs_fh *fhandle)
+/*
+ * Handle decode buffer overflows out-of-line.
+ */
+static void print_overflow_msg(const char *func, const struct xdr_stream *xdr)
 {
-	/* NFSv2 handles have a fixed length */
-	fhandle->size = NFS2_FHSIZE;
-	memcpy(fhandle->data, p, NFS2_FHSIZE);
-	return p + XDR_QUADLEN(NFS2_FHSIZE);
+	dprintk("NFS: %s prematurely hit the end of our receive buffer. "
+		"Remaining buffer length is %tu words.\n",
+		func, xdr->end - xdr->p);
+}
+
+
+/*
+ * Encode/decode NFSv2 basic data types
+ *
+ * Basic NFSv2 data types are defined in section 2.3 of RFC 1094:
+ * "NFS: Network File System Protocol Specification".
+ *
+ * Not all basic data types have their own encoding and decoding
+ * functions.  For run-time efficiency, some data types are encoded
+ * or decoded inline.
+ */
+
+/*
+ * typedef opaque nfsdata<>;
+ */
+static int decode_nfsdata(struct xdr_stream *xdr, struct nfs_readres *result)
+{
+	u32 recvd, count;
+	size_t hdrlen;
+	__be32 *p;
+
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(p == NULL))
+		goto out_overflow;
+	count = be32_to_cpup(p);
+	hdrlen = (u8 *)xdr->p - (u8 *)xdr->iov->iov_base;
+	recvd = xdr->buf->len - hdrlen;
+	if (unlikely(count > recvd))
+		goto out_cheating;
+out:
+	xdr_read_pages(xdr, count);
+	result->eof = 0;	/* NFSv2 does not pass EOF flag on the wire. */
+	result->count = count;
+	return count;
+out_cheating:
+	dprintk("NFS: server cheating in read result: "
+		"count %u > recvd %u\n", count, recvd);
+	count = recvd;
+	goto out;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+/*
+ * enum stat {
+ *	NFS_OK = 0,
+ *	NFSERR_PERM = 1,
+ *	NFSERR_NOENT = 2,
+ *	NFSERR_IO = 5,
+ *	NFSERR_NXIO = 6,
+ *	NFSERR_ACCES = 13,
+ *	NFSERR_EXIST = 17,
+ *	NFSERR_NODEV = 19,
+ *	NFSERR_NOTDIR = 20,
+ *	NFSERR_ISDIR = 21,
+ *	NFSERR_FBIG = 27,
+ *	NFSERR_NOSPC = 28,
+ *	NFSERR_ROFS = 30,
+ *	NFSERR_NAMETOOLONG = 63,
+ *	NFSERR_NOTEMPTY = 66,
+ *	NFSERR_DQUOT = 69,
+ *	NFSERR_STALE = 70,
+ *	NFSERR_WFLUSH = 99
+ * };
+ */
+static int decode_stat(struct xdr_stream *xdr, enum nfs_stat *status)
+{
+	__be32 *p;
+
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(p == NULL))
+		goto out_overflow;
+	*status = be32_to_cpup(p);
+	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
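prepare_reply_buffer() is now the single place this file computes reply offsets. All sunrpc size constants (RPC_REPHDRSIZE, au_rslack, the NFS_*res_sz macros) count 32-bit XDR words, hence the `replen << 2` conversion to bytes before xdr_inline_pages() splices the caller's pages into the receive buffer. A usage sketch that mirrors the readlink encoder appearing further down:

	static void example_enc_readlinkargs(struct rpc_rqst *req,
					     struct xdr_stream *xdr,
					     const struct nfs_readlinkargs *args)
	{
		encode_fhandle(xdr, args->fh);
		/* Reply page data will start at byte offset
		 * (RPC_REPHDRSIZE + au_rslack + NFS_readlinkres_sz) << 2. */
		prepare_reply_buffer(req, args->pages, args->pgbase,
				     args->pglen, NFS_readlinkres_sz);
	}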
 
-static inline __be32*
-xdr_encode_time(__be32 *p, struct timespec *timep)
+/*
+ * 2.3.2. ftype
+ *
+ * enum ftype {
+ *	NFNON = 0,
+ *	NFREG = 1,
+ *	NFDIR = 2,
+ *	NFBLK = 3,
+ *	NFCHR = 4,
+ *	NFLNK = 5
+ * };
+ *
+ */
+static __be32 *xdr_decode_ftype(__be32 *p, u32 *type)
 {
-	*p++ = htonl(timep->tv_sec);
-	/* Convert nanoseconds into microseconds */
-	*p++ = htonl(timep->tv_nsec ? timep->tv_nsec / 1000 : 0);
+	*type = be32_to_cpup(p++);
+	if (unlikely(*type > NF2FIFO))
+		*type = NFBAD;
 	return p;
 }
 
-static inline __be32*
-xdr_encode_current_server_time(__be32 *p, struct timespec *timep)
+/*
+ * 2.3.3. fhandle
+ *
+ * typedef opaque fhandle[FHSIZE];
+ */
+static void encode_fhandle(struct xdr_stream *xdr, const struct nfs_fh *fh)
 {
-	/*
-	 * Passing the invalid value useconds=1000000 is a
-	 * Sun convention for "set to current server time".
-	 * It's needed to make permissions checks for the
-	 * "touch" program across v2 mounts to Solaris and
-	 * Irix boxes work correctly. See description of
-	 * sattr in section 6.1 of "NFS Illustrated" by
-	 * Brent Callaghan, Addison-Wesley, ISBN 0-201-32750-5
-	 */
-	*p++ = htonl(timep->tv_sec);
-	*p++ = htonl(1000000);
+	__be32 *p;
+
+	BUG_ON(fh->size != NFS2_FHSIZE);
+	p = xdr_reserve_space(xdr, NFS2_FHSIZE);
+	memcpy(p, fh->data, NFS2_FHSIZE);
+}
+
+static int decode_fhandle(struct xdr_stream *xdr, struct nfs_fh *fh)
+{
+	__be32 *p;
+
+	p = xdr_inline_decode(xdr, NFS2_FHSIZE);
+	if (unlikely(p == NULL))
+		goto out_overflow;
+	fh->size = NFS2_FHSIZE;
+	memcpy(fh->data, p, NFS2_FHSIZE);
+	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+/*
+ * 2.3.4. timeval
+ *
+ * struct timeval {
+ *	unsigned int seconds;
+ *	unsigned int useconds;
+ * };
+ */
+static __be32 *xdr_encode_time(__be32 *p, const struct timespec *timep)
+{
+	*p++ = cpu_to_be32(timep->tv_sec);
+	if (timep->tv_nsec != 0)
+		*p++ = cpu_to_be32(timep->tv_nsec / NSEC_PER_USEC);
+	else
+		*p++ = cpu_to_be32(0);
 	return p;
 }
 
-static inline __be32*
-xdr_decode_time(__be32 *p, struct timespec *timep)
+/*
+ * Passing the invalid value useconds=1000000 is a Sun convention for
+ * "set to current server time". It's needed to make permissions checks
+ * for the "touch" program across v2 mounts to Solaris and Irix servers
+ * work correctly. See description of sattr in section 6.1 of "NFS
+ * Illustrated" by Brent Callaghan, Addison-Wesley, ISBN 0-201-32750-5.
+ */
+static __be32 *xdr_encode_current_server_time(__be32 *p,
+					      const struct timespec *timep)
 {
-	timep->tv_sec = ntohl(*p++);
-	/* Convert microseconds into nanoseconds */
-	timep->tv_nsec = ntohl(*p++) * 1000;
+	*p++ = cpu_to_be32(timep->tv_sec);
+	*p++ = cpu_to_be32(1000000);
 	return p;
 }
 
-static __be32 *
-xdr_decode_fattr(__be32 *p, struct nfs_fattr *fattr)
+static __be32 *xdr_decode_time(__be32 *p, struct timespec *timep)
+{
+	timep->tv_sec = be32_to_cpup(p++);
+	timep->tv_nsec = be32_to_cpup(p++) * NSEC_PER_USEC;
+	return p;
+}
+
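A quick round trip pins down the unit conversion above: NFSv2 timevals carry microseconds on the wire, struct timespec carries nanoseconds, so the encoder divides by NSEC_PER_USEC and the decoder multiplies it back. Illustrative helper, not part of the patch:

	static void example_time_roundtrip(void)
	{
		struct timespec in = { .tv_sec = 5, .tv_nsec = 1500 * NSEC_PER_USEC };
		struct timespec out;
		__be32 wire[2];

		xdr_encode_time(wire, &in);	/* seconds = 5, useconds = 1500 */
		xdr_decode_time(wire, &out);	/* out.tv_nsec == 1500000 again */
	}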
+/*
+ * 2.3.5. fattr
+ *
+ * struct fattr {
+ *	ftype        type;
+ *	unsigned int mode;
+ *	unsigned int nlink;
+ *	unsigned int uid;
+ *	unsigned int gid;
+ *	unsigned int size;
+ *	unsigned int blocksize;
+ *	unsigned int rdev;
+ *	unsigned int blocks;
+ *	unsigned int fsid;
+ *	unsigned int fileid;
+ *	timeval      atime;
+ *	timeval      mtime;
+ *	timeval      ctime;
+ * };
+ *
+ */
+static int decode_fattr(struct xdr_stream *xdr, struct nfs_fattr *fattr)
 {
 	u32 rdev, type;
-	type = ntohl(*p++);
-	fattr->mode = ntohl(*p++);
-	fattr->nlink = ntohl(*p++);
-	fattr->uid = ntohl(*p++);
-	fattr->gid = ntohl(*p++);
-	fattr->size = ntohl(*p++);
-	fattr->du.nfs2.blocksize = ntohl(*p++);
-	rdev = ntohl(*p++);
-	fattr->du.nfs2.blocks = ntohl(*p++);
-	fattr->fsid.major = ntohl(*p++);
-	fattr->fsid.minor = 0;
-	fattr->fileid = ntohl(*p++);
-	p = xdr_decode_time(p, &fattr->atime);
-	p = xdr_decode_time(p, &fattr->mtime);
-	p = xdr_decode_time(p, &fattr->ctime);
+	__be32 *p;
+
+	p = xdr_inline_decode(xdr, NFS_fattr_sz << 2);
+	if (unlikely(p == NULL))
+		goto out_overflow;
+
 	fattr->valid |= NFS_ATTR_FATTR_V2;
+
+	p = xdr_decode_ftype(p, &type);
+
+	fattr->mode = be32_to_cpup(p++);
+	fattr->nlink = be32_to_cpup(p++);
+	fattr->uid = be32_to_cpup(p++);
+	fattr->gid = be32_to_cpup(p++);
+	fattr->size = be32_to_cpup(p++);
+	fattr->du.nfs2.blocksize = be32_to_cpup(p++);
+
+	rdev = be32_to_cpup(p++);
 	fattr->rdev = new_decode_dev(rdev);
-	if (type == NFCHR && rdev == NFS2_FIFO_DEV) {
+	if (type == (u32)NFCHR && rdev == (u32)NFS2_FIFO_DEV) {
 		fattr->mode = (fattr->mode & ~S_IFMT) | S_IFIFO;
 		fattr->rdev = 0;
 	}
+
+	fattr->du.nfs2.blocks = be32_to_cpup(p++);
+	fattr->fsid.major = be32_to_cpup(p++);
+	fattr->fsid.minor = 0;
+	fattr->fileid = be32_to_cpup(p++);
+
+	p = xdr_decode_time(p, &fattr->atime);
+	p = xdr_decode_time(p, &fattr->mtime);
+	xdr_decode_time(p, &fattr->ctime);
+	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
321
322/*
323 * 2.3.6. sattr
324 *
325 * struct sattr {
326 * unsigned int mode;
327 * unsigned int uid;
328 * unsigned int gid;
329 * unsigned int size;
330 * timeval atime;
331 * timeval mtime;
332 * };
333 */
334
335#define NFS2_SATTR_NOT_SET (0xffffffff)
336
337static __be32 *xdr_time_not_set(__be32 *p)
338{
339 *p++ = cpu_to_be32(NFS2_SATTR_NOT_SET);
340 *p++ = cpu_to_be32(NFS2_SATTR_NOT_SET);
143 return p; 341 return p;
144} 342}
145 343
146static inline __be32 * 344static void encode_sattr(struct xdr_stream *xdr, const struct iattr *attr)
147xdr_encode_sattr(__be32 *p, struct iattr *attr)
148{ 345{
149 const __be32 not_set = __constant_htonl(0xFFFFFFFF); 346 __be32 *p;
150 347
151 *p++ = (attr->ia_valid & ATTR_MODE) ? htonl(attr->ia_mode) : not_set; 348 p = xdr_reserve_space(xdr, NFS_sattr_sz << 2);
152 *p++ = (attr->ia_valid & ATTR_UID) ? htonl(attr->ia_uid) : not_set;
153 *p++ = (attr->ia_valid & ATTR_GID) ? htonl(attr->ia_gid) : not_set;
154 *p++ = (attr->ia_valid & ATTR_SIZE) ? htonl(attr->ia_size) : not_set;
155 349
156 if (attr->ia_valid & ATTR_ATIME_SET) { 350 if (attr->ia_valid & ATTR_MODE)
351 *p++ = cpu_to_be32(attr->ia_mode);
352 else
353 *p++ = cpu_to_be32(NFS2_SATTR_NOT_SET);
354 if (attr->ia_valid & ATTR_UID)
355 *p++ = cpu_to_be32(attr->ia_uid);
356 else
357 *p++ = cpu_to_be32(NFS2_SATTR_NOT_SET);
358 if (attr->ia_valid & ATTR_GID)
359 *p++ = cpu_to_be32(attr->ia_gid);
360 else
361 *p++ = cpu_to_be32(NFS2_SATTR_NOT_SET);
362 if (attr->ia_valid & ATTR_SIZE)
363 *p++ = cpu_to_be32((u32)attr->ia_size);
364 else
365 *p++ = cpu_to_be32(NFS2_SATTR_NOT_SET);
366
367 if (attr->ia_valid & ATTR_ATIME_SET)
157 p = xdr_encode_time(p, &attr->ia_atime); 368 p = xdr_encode_time(p, &attr->ia_atime);
158 } else if (attr->ia_valid & ATTR_ATIME) { 369 else if (attr->ia_valid & ATTR_ATIME)
159 p = xdr_encode_current_server_time(p, &attr->ia_atime); 370 p = xdr_encode_current_server_time(p, &attr->ia_atime);
160 } else { 371 else
161 *p++ = not_set; 372 p = xdr_time_not_set(p);
162 *p++ = not_set; 373 if (attr->ia_valid & ATTR_MTIME_SET)
163 } 374 xdr_encode_time(p, &attr->ia_mtime);
164 375 else if (attr->ia_valid & ATTR_MTIME)
165 if (attr->ia_valid & ATTR_MTIME_SET) { 376 xdr_encode_current_server_time(p, &attr->ia_mtime);
166 p = xdr_encode_time(p, &attr->ia_mtime); 377 else
167 } else if (attr->ia_valid & ATTR_MTIME) { 378 xdr_time_not_set(p);
168 p = xdr_encode_current_server_time(p, &attr->ia_mtime);
169 } else {
170 *p++ = not_set;
171 *p++ = not_set;
172 }
173 return p;
174} 379}
175 380
176/* 381/*
177 * NFS encode functions 382 * 2.3.7. filename
383 *
384 * typedef string filename<MAXNAMLEN>;
178 */ 385 */
386static void encode_filename(struct xdr_stream *xdr,
387 const char *name, u32 length)
388{
389 __be32 *p;
390
391 BUG_ON(length > NFS2_MAXNAMLEN);
392 p = xdr_reserve_space(xdr, 4 + length);
393 xdr_encode_opaque(p, name, length);
394}
395
396static int decode_filename_inline(struct xdr_stream *xdr,
397 const char **name, u32 *length)
398{
399 __be32 *p;
400 u32 count;
401
402 p = xdr_inline_decode(xdr, 4);
403 if (unlikely(p == NULL))
404 goto out_overflow;
405 count = be32_to_cpup(p);
406 if (count > NFS3_MAXNAMLEN)
407 goto out_nametoolong;
408 p = xdr_inline_decode(xdr, count);
409 if (unlikely(p == NULL))
410 goto out_overflow;
411 *name = (const char *)p;
412 *length = count;
413 return 0;
414out_nametoolong:
415 dprintk("NFS: returned filename too long: %u\n", count);
416 return -ENAMETOOLONG;
417out_overflow:
418 print_overflow_msg(__func__, xdr);
419 return -EIO;
420}
421
179/* 422/*
180 * Encode file handle argument 423 * 2.3.8. path
181 * GETATTR, READLINK, STATFS 424 *
425 * typedef string path<MAXPATHLEN>;
182 */ 426 */
183static int 427static void encode_path(struct xdr_stream *xdr, struct page **pages, u32 length)
184nfs_xdr_fhandle(struct rpc_rqst *req, __be32 *p, struct nfs_fh *fh)
185{ 428{
186 p = xdr_encode_fhandle(p, fh); 429 __be32 *p;
187 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); 430
431 BUG_ON(length > NFS2_MAXPATHLEN);
432 p = xdr_reserve_space(xdr, 4);
433 *p = cpu_to_be32(length);
434 xdr_write_pages(xdr, pages, 0, length);
435}
436
437static int decode_path(struct xdr_stream *xdr)
438{
439 u32 length, recvd;
440 size_t hdrlen;
441 __be32 *p;
442
443 p = xdr_inline_decode(xdr, 4);
444 if (unlikely(p == NULL))
445 goto out_overflow;
446 length = be32_to_cpup(p);
447 if (unlikely(length >= xdr->buf->page_len || length > NFS_MAXPATHLEN))
448 goto out_size;
449 hdrlen = (u8 *)xdr->p - (u8 *)xdr->iov->iov_base;
450 recvd = xdr->buf->len - hdrlen;
451 if (unlikely(length > recvd))
452 goto out_cheating;
453
454 xdr_read_pages(xdr, length);
455 xdr_terminate_string(xdr->buf, length);
188 return 0; 456 return 0;
457out_size:
458 dprintk("NFS: returned pathname too long: %u\n", length);
459 return -ENAMETOOLONG;
460out_cheating:
461 dprintk("NFS: server cheating in pathname result: "
462 "length %u > received %u\n", length, recvd);
463 return -EIO;
464out_overflow:
465 print_overflow_msg(__func__, xdr);
466 return -EIO;
189} 467}
190 468
191/* 469/*
192 * Encode SETATTR arguments 470 * 2.3.9. attrstat
471 *
472 * union attrstat switch (stat status) {
473 * case NFS_OK:
474 * fattr attributes;
475 * default:
476 * void;
477 * };
193 */ 478 */
194static int 479static int decode_attrstat(struct xdr_stream *xdr, struct nfs_fattr *result)
195nfs_xdr_sattrargs(struct rpc_rqst *req, __be32 *p, struct nfs_sattrargs *args)
196{ 480{
197 p = xdr_encode_fhandle(p, args->fh); 481 enum nfs_stat status;
198 p = xdr_encode_sattr(p, args->sattr); 482 int error;
199 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); 483
200 return 0; 484 error = decode_stat(xdr, &status);
485 if (unlikely(error))
486 goto out;
487 if (status != NFS_OK)
488 goto out_default;
489 error = decode_fattr(xdr, result);
490out:
491 return error;
492out_default:
493 return nfs_stat_to_errno(status);
201} 494}
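decode_attrstat() sets the template every result decoder in this rewrite follows, and the two failure paths are distinct on purpose: a decode_stat() failure means the reply itself was short or garbled (-EIO), while a non-NFS_OK status is a well-formed protocol error mapped through nfs_stat_to_errno(). As a sketch, with decode_result_body() standing in for the per-procedure payload decoder:

	/* Hypothetical stand-in for decode_fattr(), decode_diropok(), etc. */
	static int decode_result_body(struct xdr_stream *xdr, void *result);

	static int example_dec_result(struct xdr_stream *xdr, void *result)
	{
		enum nfs_stat status;
		int error;

		error = decode_stat(xdr, &status);
		if (unlikely(error))
			return error;			/* transport/buffer problem */
		if (status != NFS_OK)
			return nfs_stat_to_errno(status);	/* NFS-level error */
		return decode_result_body(xdr, result);
	}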
202 495
203/* 496/*
204 * Encode directory ops argument 497 * 2.3.10. diropargs
205 * LOOKUP, RMDIR 498 *
499 * struct diropargs {
500 * fhandle dir;
501 * filename name;
502 * };
206 */ 503 */
207static int 504static void encode_diropargs(struct xdr_stream *xdr, const struct nfs_fh *fh,
208nfs_xdr_diropargs(struct rpc_rqst *req, __be32 *p, struct nfs_diropargs *args) 505 const char *name, u32 length)
209{ 506{
210 p = xdr_encode_fhandle(p, args->fh); 507 encode_fhandle(xdr, fh);
211 p = xdr_encode_array(p, args->name, args->len); 508 encode_filename(xdr, name, length);
212 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
213 return 0;
214} 509}
215 510
216/* 511/*
217 * Encode REMOVE argument 512 * 2.3.11. diropres
513 *
514 * union diropres switch (stat status) {
515 * case NFS_OK:
516 * struct {
517 * fhandle file;
518 * fattr attributes;
519 * } diropok;
520 * default:
521 * void;
522 * };
218 */ 523 */
219static int 524static int decode_diropok(struct xdr_stream *xdr, struct nfs_diropok *result)
220nfs_xdr_removeargs(struct rpc_rqst *req, __be32 *p, const struct nfs_removeargs *args)
221{ 525{
222 p = xdr_encode_fhandle(p, args->fh); 526 int error;
223 p = xdr_encode_array(p, args->name.name, args->name.len); 527
224 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); 528 error = decode_fhandle(xdr, result->fh);
225 return 0; 529 if (unlikely(error))
530 goto out;
531 error = decode_fattr(xdr, result->fattr);
532out:
533 return error;
534}
535
536static int decode_diropres(struct xdr_stream *xdr, struct nfs_diropok *result)
537{
538 enum nfs_stat status;
539 int error;
540
541 error = decode_stat(xdr, &status);
542 if (unlikely(error))
543 goto out;
544 if (status != NFS_OK)
545 goto out_default;
546 error = decode_diropok(xdr, result);
547out:
548 return error;
549out_default:
550 return nfs_stat_to_errno(status);
226} 551}
227 552
553
228/* 554/*
229 * Arguments to a READ call. Since we read data directly into the page 555 * NFSv2 XDR encode functions
230 * cache, we also set up the reply iovec here so that iov[1] points 556 *
231 * exactly to the page we want to fetch. 557 * NFSv2 argument types are defined in section 2.2 of RFC 1094:
558 * "NFS: Network File System Protocol Specification".
232 */ 559 */
233static int 560
234nfs_xdr_readargs(struct rpc_rqst *req, __be32 *p, struct nfs_readargs *args) 561static void nfs2_xdr_enc_fhandle(struct rpc_rqst *req,
562 struct xdr_stream *xdr,
563 const struct nfs_fh *fh)
235{ 564{
236 struct rpc_auth *auth = req->rq_cred->cr_auth; 565 encode_fhandle(xdr, fh);
237 unsigned int replen; 566}
238 u32 offset = (u32)args->offset; 567
568/*
569 * 2.2.3. sattrargs
570 *
571 * struct sattrargs {
572 * fhandle file;
573 * sattr attributes;
574 * };
575 */
576static void nfs2_xdr_enc_sattrargs(struct rpc_rqst *req,
577 struct xdr_stream *xdr,
578 const struct nfs_sattrargs *args)
579{
580 encode_fhandle(xdr, args->fh);
581 encode_sattr(xdr, args->sattr);
582}
583
584static void nfs2_xdr_enc_diropargs(struct rpc_rqst *req,
585 struct xdr_stream *xdr,
586 const struct nfs_diropargs *args)
587{
588 encode_diropargs(xdr, args->fh, args->name, args->len);
589}
590
591static void nfs2_xdr_enc_readlinkargs(struct rpc_rqst *req,
592 struct xdr_stream *xdr,
593 const struct nfs_readlinkargs *args)
594{
595 encode_fhandle(xdr, args->fh);
596 prepare_reply_buffer(req, args->pages, args->pgbase,
597 args->pglen, NFS_readlinkres_sz);
598}
599
600/*
601 * 2.2.7. readargs
602 *
603 * struct readargs {
604 * fhandle file;
605 * unsigned offset;
606 * unsigned count;
607 * unsigned totalcount;
608 * };
609 */
610static void encode_readargs(struct xdr_stream *xdr,
611 const struct nfs_readargs *args)
612{
613 u32 offset = args->offset;
239 u32 count = args->count; 614 u32 count = args->count;
615 __be32 *p;
240 616
241 p = xdr_encode_fhandle(p, args->fh); 617 encode_fhandle(xdr, args->fh);
242 *p++ = htonl(offset);
243 *p++ = htonl(count);
244 *p++ = htonl(count);
245 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
246 618
247 /* Inline the page array */ 619 p = xdr_reserve_space(xdr, 4 + 4 + 4);
248 replen = (RPC_REPHDRSIZE + auth->au_rslack + NFS_readres_sz) << 2; 620 *p++ = cpu_to_be32(offset);
249 xdr_inline_pages(&req->rq_rcv_buf, replen, 621 *p++ = cpu_to_be32(count);
250 args->pages, args->pgbase, count); 622 *p = cpu_to_be32(count);
623}
624
625static void nfs2_xdr_enc_readargs(struct rpc_rqst *req,
626 struct xdr_stream *xdr,
627 const struct nfs_readargs *args)
628{
629 encode_readargs(xdr, args);
630 prepare_reply_buffer(req, args->pages, args->pgbase,
631 args->count, NFS_readres_sz);
251 req->rq_rcv_buf.flags |= XDRBUF_READ; 632 req->rq_rcv_buf.flags |= XDRBUF_READ;
252 return 0;
253} 633}
 
 /*
- * Decode READ reply
+ * 2.2.9. writeargs
+ *
+ * struct writeargs {
+ *	fhandle file;
+ *	unsigned beginoffset;
+ *	unsigned offset;
+ *	unsigned totalcount;
+ *	nfsdata data;
+ * };
  */
-static int
-nfs_xdr_readres(struct rpc_rqst *req, __be32 *p, struct nfs_readres *res)
+static void encode_writeargs(struct xdr_stream *xdr,
+			     const struct nfs_writeargs *args)
 {
-	struct kvec *iov = req->rq_rcv_buf.head;
-	size_t hdrlen;
-	u32 count, recvd;
-	int status;
-
-	if ((status = ntohl(*p++)))
-		return nfs_stat_to_errno(status);
-	p = xdr_decode_fattr(p, res->fattr);
-
-	count = ntohl(*p++);
-	res->eof = 0;
-	hdrlen = (u8 *) p - (u8 *) iov->iov_base;
-	if (iov->iov_len < hdrlen) {
-		dprintk("NFS: READ reply header overflowed:"
-				"length %Zu > %Zu\n", hdrlen, iov->iov_len);
-		return -errno_NFSERR_IO;
-	} else if (iov->iov_len != hdrlen) {
-		dprintk("NFS: READ header is short. iovec will be shifted.\n");
-		xdr_shift_buf(&req->rq_rcv_buf, iov->iov_len - hdrlen);
-	}
+	u32 offset = args->offset;
+	u32 count = args->count;
+	__be32 *p;
 
-	recvd = req->rq_rcv_buf.len - hdrlen;
-	if (count > recvd) {
-		dprintk("NFS: server cheating in read reply: "
-				"count %u > recvd %u\n", count, recvd);
-		count = recvd;
-	}
+	encode_fhandle(xdr, args->fh);
 
-	dprintk("RPC: readres OK count %u\n", count);
-	if (count < res->count)
-		res->count = count;
-
-	return count;
+	p = xdr_reserve_space(xdr, 4 + 4 + 4 + 4);
+	*p++ = cpu_to_be32(offset);
+	*p++ = cpu_to_be32(offset);
+	*p++ = cpu_to_be32(count);
+
+	/* nfsdata */
+	*p = cpu_to_be32(count);
+	xdr_write_pages(xdr, args->pages, args->pgbase, count);
 }
 
+static void nfs2_xdr_enc_writeargs(struct rpc_rqst *req,
+				   struct xdr_stream *xdr,
+				   const struct nfs_writeargs *args)
+{
+	encode_writeargs(xdr, args);
+	xdr->buf->flags |= XDRBUF_WRITE;
+}
296 672
 /*
- * Write arguments. Splice the buffer to be written into the iovec.
+ * 2.2.10. createargs
+ *
+ *	struct createargs {
+ *		diropargs where;
+ *		sattr attributes;
+ *	};
  */
-static int
-nfs_xdr_writeargs(struct rpc_rqst *req, __be32 *p, struct nfs_writeargs *args)
+static void nfs2_xdr_enc_createargs(struct rpc_rqst *req,
+				    struct xdr_stream *xdr,
+				    const struct nfs_createargs *args)
 {
-	struct xdr_buf *sndbuf = &req->rq_snd_buf;
-	u32 offset = (u32)args->offset;
-	u32 count = args->count;
-
-	p = xdr_encode_fhandle(p, args->fh);
-	*p++ = htonl(offset);
-	*p++ = htonl(offset);
-	*p++ = htonl(count);
-	*p++ = htonl(count);
-	sndbuf->len = xdr_adjust_iovec(sndbuf->head, p);
+	encode_diropargs(xdr, args->fh, args->name, args->len);
+	encode_sattr(xdr, args->sattr);
+}
 
-	/* Copy the page array */
-	xdr_encode_pages(sndbuf, args->pages, args->pgbase, count);
-	sndbuf->flags |= XDRBUF_WRITE;
-	return 0;
+static void nfs2_xdr_enc_removeargs(struct rpc_rqst *req,
+				    struct xdr_stream *xdr,
+				    const struct nfs_removeargs *args)
+{
+	encode_diropargs(xdr, args->fh, args->name.name, args->name.len);
 }
 
 /*
- * Encode create arguments
- * CREATE, MKDIR
+ * 2.2.12. renameargs
+ *
+ *	struct renameargs {
+ *		diropargs from;
+ *		diropargs to;
+ *	};
  */
-static int
-nfs_xdr_createargs(struct rpc_rqst *req, __be32 *p, struct nfs_createargs *args)
+static void nfs2_xdr_enc_renameargs(struct rpc_rqst *req,
+				    struct xdr_stream *xdr,
+				    const struct nfs_renameargs *args)
 {
-	p = xdr_encode_fhandle(p, args->fh);
-	p = xdr_encode_array(p, args->name, args->len);
-	p = xdr_encode_sattr(p, args->sattr);
-	req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
-	return 0;
+	const struct qstr *old = args->old_name;
+	const struct qstr *new = args->new_name;
+
+	encode_diropargs(xdr, args->old_dir, old->name, old->len);
+	encode_diropargs(xdr, args->new_dir, new->name, new->len);
 }
 
 /*
- * Encode RENAME arguments
+ * 2.2.13. linkargs
+ *
+ *	struct linkargs {
+ *		fhandle from;
+ *		diropargs to;
+ *	};
  */
-static int
-nfs_xdr_renameargs(struct rpc_rqst *req, __be32 *p, struct nfs_renameargs *args)
+static void nfs2_xdr_enc_linkargs(struct rpc_rqst *req,
+				  struct xdr_stream *xdr,
+				  const struct nfs_linkargs *args)
 {
-	p = xdr_encode_fhandle(p, args->old_dir);
-	p = xdr_encode_array(p, args->old_name->name, args->old_name->len);
-	p = xdr_encode_fhandle(p, args->new_dir);
-	p = xdr_encode_array(p, args->new_name->name, args->new_name->len);
-	req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
-	return 0;
+	encode_fhandle(xdr, args->fromfh);
+	encode_diropargs(xdr, args->tofh, args->toname, args->tolen);
 }
 
 /*
- * Encode LINK arguments
+ * 2.2.14. symlinkargs
+ *
+ *	struct symlinkargs {
+ *		diropargs from;
+ *		path to;
+ *		sattr attributes;
+ *	};
  */
-static int
-nfs_xdr_linkargs(struct rpc_rqst *req, __be32 *p, struct nfs_linkargs *args)
+static void nfs2_xdr_enc_symlinkargs(struct rpc_rqst *req,
+				     struct xdr_stream *xdr,
+				     const struct nfs_symlinkargs *args)
 {
-	p = xdr_encode_fhandle(p, args->fromfh);
-	p = xdr_encode_fhandle(p, args->tofh);
-	p = xdr_encode_array(p, args->toname, args->tolen);
-	req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
-	return 0;
+	encode_diropargs(xdr, args->fromfh, args->fromname, args->fromlen);
+	encode_path(xdr, args->pages, args->pathlen);
+	encode_sattr(xdr, args->sattr);
 }
 
 /*
- * Encode SYMLINK arguments
+ * 2.2.17. readdirargs
+ *
+ *	struct readdirargs {
+ *		fhandle dir;
+ *		nfscookie cookie;
+ *		unsigned count;
+ *	};
  */
-static int
-nfs_xdr_symlinkargs(struct rpc_rqst *req, __be32 *p, struct nfs_symlinkargs *args)
+static void encode_readdirargs(struct xdr_stream *xdr,
+			       const struct nfs_readdirargs *args)
 {
-	struct xdr_buf *sndbuf = &req->rq_snd_buf;
-	size_t pad;
+	__be32 *p;
 
-	p = xdr_encode_fhandle(p, args->fromfh);
-	p = xdr_encode_array(p, args->fromname, args->fromlen);
-	*p++ = htonl(args->pathlen);
-	sndbuf->len = xdr_adjust_iovec(sndbuf->head, p);
+	encode_fhandle(xdr, args->fh);
 
-	xdr_encode_pages(sndbuf, args->pages, 0, args->pathlen);
+	p = xdr_reserve_space(xdr, 4 + 4);
+	*p++ = cpu_to_be32(args->cookie);
+	*p = cpu_to_be32(args->count);
+}
 
-	/*
-	 * xdr_encode_pages may have added a few bytes to ensure the
-	 * pathname ends on a 4-byte boundary.  Start encoding the
-	 * attributes after the pad bytes.
-	 */
-	pad = sndbuf->tail->iov_len;
-	if (pad > 0)
-		p++;
-	p = xdr_encode_sattr(p, args->sattr);
-	sndbuf->len += xdr_adjust_iovec(sndbuf->tail, p) - pad;
-	return 0;
+static void nfs2_xdr_enc_readdirargs(struct rpc_rqst *req,
+				     struct xdr_stream *xdr,
+				     const struct nfs_readdirargs *args)
+{
+	encode_readdirargs(xdr, args);
+	prepare_reply_buffer(req, args->pages, 0,
+			     args->count, NFS_readdirres_sz);
 }
 
 /*
- * Encode arguments to readdir call
+ * NFSv2 XDR decode functions
+ *
+ * NFSv2 result types are defined in section 2.2 of RFC 1094:
+ * "NFS: Network File System Protocol Specification".
  */
-static int
-nfs_xdr_readdirargs(struct rpc_rqst *req, __be32 *p, struct nfs_readdirargs *args)
+
+static int nfs2_xdr_dec_stat(struct rpc_rqst *req, struct xdr_stream *xdr,
+			     void *__unused)
 {
-	struct rpc_auth *auth = req->rq_cred->cr_auth;
-	unsigned int replen;
-	u32 count = args->count;
+	enum nfs_stat status;
+	int error;
+
+	error = decode_stat(xdr, &status);
+	if (unlikely(error))
+		goto out;
+	if (status != NFS_OK)
+		goto out_default;
+out:
+	return error;
+out_default:
+	return nfs_stat_to_errno(status);
+}
 
-	p = xdr_encode_fhandle(p, args->fh);
-	*p++ = htonl(args->cookie);
-	*p++ = htonl(count); /* see above */
-	req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
+static int nfs2_xdr_dec_attrstat(struct rpc_rqst *req, struct xdr_stream *xdr,
+				 struct nfs_fattr *result)
+{
+	return decode_attrstat(xdr, result);
+}
 
-	/* Inline the page array */
-	replen = (RPC_REPHDRSIZE + auth->au_rslack + NFS_readdirres_sz) << 2;
-	xdr_inline_pages(&req->rq_rcv_buf, replen, args->pages, 0, count);
-	return 0;
+static int nfs2_xdr_dec_diropres(struct rpc_rqst *req, struct xdr_stream *xdr,
+				 struct nfs_diropok *result)
+{
+	return decode_diropres(xdr, result);
 }
 
 /*
- * Decode the result of a readdir call.
- * We're not really decoding anymore, we just leave the buffer untouched
- * and only check that it is syntactically correct.
- * The real decoding happens in nfs_decode_entry below, called directly
- * from nfs_readdir for each entry.
+ * 2.2.6. readlinkres
+ *
+ *	union readlinkres switch (stat status) {
+ *	case NFS_OK:
+ *		path data;
+ *	default:
+ *		void;
+ *	};
  */
-static int
-nfs_xdr_readdirres(struct rpc_rqst *req, __be32 *p, void *dummy)
+static int nfs2_xdr_dec_readlinkres(struct rpc_rqst *req,
+				    struct xdr_stream *xdr, void *__unused)
 {
-	struct xdr_buf *rcvbuf = &req->rq_rcv_buf;
-	struct kvec *iov = rcvbuf->head;
-	struct page **page;
-	size_t hdrlen;
-	unsigned int pglen, recvd;
-	int status;
-
-	if ((status = ntohl(*p++)))
-		return nfs_stat_to_errno(status);
-
-	hdrlen = (u8 *) p - (u8 *) iov->iov_base;
-	if (iov->iov_len < hdrlen) {
-		dprintk("NFS: READDIR reply header overflowed:"
-				"length %Zu > %Zu\n", hdrlen, iov->iov_len);
-		return -errno_NFSERR_IO;
-	} else if (iov->iov_len != hdrlen) {
-		dprintk("NFS: READDIR header is short. iovec will be shifted.\n");
-		xdr_shift_buf(rcvbuf, iov->iov_len - hdrlen);
-	}
+	enum nfs_stat status;
+	int error;
+
+	error = decode_stat(xdr, &status);
+	if (unlikely(error))
+		goto out;
+	if (status != NFS_OK)
+		goto out_default;
+	error = decode_path(xdr);
+out:
+	return error;
+out_default:
+	return nfs_stat_to_errno(status);
+}
 
-	pglen = rcvbuf->page_len;
-	recvd = rcvbuf->len - hdrlen;
-	if (pglen > recvd)
-		pglen = recvd;
-	page = rcvbuf->pages;
-	return pglen;
+/*
+ * 2.2.7. readres
+ *
+ *	union readres switch (stat status) {
+ *	case NFS_OK:
+ *		fattr attributes;
+ *		nfsdata data;
+ *	default:
+ *		void;
+ *	};
+ */
+static int nfs2_xdr_dec_readres(struct rpc_rqst *req, struct xdr_stream *xdr,
+				struct nfs_readres *result)
+{
+	enum nfs_stat status;
+	int error;
+
+	error = decode_stat(xdr, &status);
+	if (unlikely(error))
+		goto out;
+	if (status != NFS_OK)
+		goto out_default;
+	error = decode_fattr(xdr, result->fattr);
+	if (unlikely(error))
+		goto out;
+	error = decode_nfsdata(xdr, result);
+out:
+	return error;
+out_default:
+	return nfs_stat_to_errno(status);
 }
 
-static void print_overflow_msg(const char *func, const struct xdr_stream *xdr)
+static int nfs2_xdr_dec_writeres(struct rpc_rqst *req, struct xdr_stream *xdr,
+				 struct nfs_writeres *result)
 {
-	dprintk("nfs: %s: prematurely hit end of receive buffer. "
-		"Remaining buffer length is %tu words.\n",
-		func, xdr->end - xdr->p);
+	/* All NFSv2 writes are "file sync" writes */
+	result->verf->committed = NFS_FILE_SYNC;
+	return decode_attrstat(xdr, result->fattr);
 }
 
-__be32 *
-nfs_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, struct nfs_server *server, int plus)
+/**
+ * nfs2_decode_dirent - Decode a single NFSv2 directory entry stored in
+ *			the local page cache.
+ * @xdr: XDR stream where entry resides
+ * @entry: buffer to fill in with entry data
+ * @plus: boolean indicating whether this should be a readdirplus entry
+ *
+ * Returns zero if successful, otherwise a negative errno value is
+ * returned.
+ *
+ * This function is not invoked during READDIR reply decoding, but
+ * rather whenever an application invokes the getdents(2) system call
+ * on a directory already in our cache.
+ *
+ * 2.2.17. entry
+ *
+ *	struct entry {
+ *		unsigned fileid;
+ *		filename name;
+ *		nfscookie cookie;
+ *		entry *nextentry;
+ *	};
+ */
+int nfs2_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
+		       int plus)
 {
 	__be32 *p;
+	int error;
+
 	p = xdr_inline_decode(xdr, 4);
-	if (unlikely(!p))
+	if (unlikely(p == NULL))
 		goto out_overflow;
-	if (!ntohl(*p++)) {
+	if (*p++ == xdr_zero) {
 		p = xdr_inline_decode(xdr, 4);
-		if (unlikely(!p))
+		if (unlikely(p == NULL))
 			goto out_overflow;
-		if (!ntohl(*p++))
-			return ERR_PTR(-EAGAIN);
+		if (*p++ == xdr_zero)
+			return -EAGAIN;
 		entry->eof = 1;
-		return ERR_PTR(-EBADCOOKIE);
+		return -EBADCOOKIE;
 	}
 
-	p = xdr_inline_decode(xdr, 8);
-	if (unlikely(!p))
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(p == NULL))
 		goto out_overflow;
+	entry->ino = be32_to_cpup(p);
 
-	entry->ino = ntohl(*p++);
-	entry->len = ntohl(*p++);
+	error = decode_filename_inline(xdr, &entry->name, &entry->len);
+	if (unlikely(error))
+		return error;
 
-	p = xdr_inline_decode(xdr, entry->len + 4);
-	if (unlikely(!p))
+	/*
+	 * The type (size and byte order) of nfscookie isn't defined in
+	 * RFC 1094.  This implementation assumes that it's an XDR uint32.
+	 */
+	entry->prev_cookie = entry->cookie;
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(p == NULL))
 		goto out_overflow;
-	entry->name = (const char *) p;
-	p += XDR_QUADLEN(entry->len);
-	entry->prev_cookie = entry->cookie;
-	entry->cookie = ntohl(*p++);
+	entry->cookie = be32_to_cpup(p);
 
 	entry->d_type = DT_UNKNOWN;
 
-	p = xdr_inline_peek(xdr, 8);
-	if (p != NULL)
-		entry->eof = !p[0] && p[1];
-	else
-		entry->eof = 0;
-
-	return p;
+	return 0;
 
 out_overflow:
 	print_overflow_msg(__func__, xdr);
-	return ERR_PTR(-EAGAIN);
-}
-
-/*
- * NFS XDR decode functions
- */
-/*
- * Decode simple status reply
- */
-static int
-nfs_xdr_stat(struct rpc_rqst *req, __be32 *p, void *dummy)
-{
-	int status;
-
-	if ((status = ntohl(*p++)) != 0)
-		status = nfs_stat_to_errno(status);
-	return status;
+	return -EAGAIN;
 }
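
The return contract documented above (0 per decoded entry, -EAGAIN when no further entry follows in the current page, -EBADCOOKIE once the eof flag is set) implies the shape of the caller's loop. A minimal sketch, assuming an xdr_stream already positioned over cached READDIR data; this is not the kernel's actual nfs_readdir() logic, and emit_entry() is a hypothetical consumer:

	/*
	 * Illustration only: walking cached READDIR data using the
	 * nfs2_decode_dirent() return contract.  emit_entry() is
	 * hypothetical; the real consumer is nfs_readdir().
	 */
	static int walk_cached_entries(struct xdr_stream *xdr,
				       struct nfs_entry *entry)
	{
		for (;;) {
			int error = nfs2_decode_dirent(xdr, entry, 0);

			if (error == -EAGAIN)
				break;		/* no more entries in this page */
			if (error == -EBADCOOKIE && entry->eof)
				break;		/* true end of directory */
			if (error)
				return error;
			emit_entry(entry);	/* hypothetical consumer */
		}
		return 0;
	}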
 
 /*
- * Decode attrstat reply
- * GETATTR, SETATTR, WRITE
+ * 2.2.17. readdirres
+ *
+ *	union readdirres switch (stat status) {
+ *	case NFS_OK:
+ *		struct {
+ *			entry *entries;
+ *			bool eof;
+ *		} readdirok;
+ *	default:
+ *		void;
+ *	};
+ *
+ * Read the directory contents into the page cache, but don't
+ * touch them.  The actual decoding is done by nfs2_decode_dirent()
+ * during subsequent nfs_readdir() calls.
  */
-static int
-nfs_xdr_attrstat(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr)
+static int decode_readdirok(struct xdr_stream *xdr)
 {
-	int status;
-
-	if ((status = ntohl(*p++)))
-		return nfs_stat_to_errno(status);
-	xdr_decode_fattr(p, fattr);
-	return 0;
-}
-
-/*
- * Decode diropres reply
- * LOOKUP, CREATE, MKDIR
- */
-static int
-nfs_xdr_diropres(struct rpc_rqst *req, __be32 *p, struct nfs_diropok *res)
-{
-	int status;
-
-	if ((status = ntohl(*p++)))
-		return nfs_stat_to_errno(status);
-	p = xdr_decode_fhandle(p, res->fh);
-	xdr_decode_fattr(p, res->fattr);
-	return 0;
+	u32 recvd, pglen;
+	size_t hdrlen;
+
+	pglen = xdr->buf->page_len;
+	hdrlen = (u8 *)xdr->p - (u8 *)xdr->iov->iov_base;
+	recvd = xdr->buf->len - hdrlen;
+	if (unlikely(pglen > recvd))
+		goto out_cheating;
+out:
+	xdr_read_pages(xdr, pglen);
+	return pglen;
+out_cheating:
+	dprintk("NFS: server cheating in readdir result: "
+		"pglen %u > recvd %u\n", pglen, recvd);
+	pglen = recvd;
+	goto out;
 }
 
-/*
- * Encode READLINK args
- */
-static int
-nfs_xdr_readlinkargs(struct rpc_rqst *req, __be32 *p, struct nfs_readlinkargs *args)
+static int nfs2_xdr_dec_readdirres(struct rpc_rqst *req,
+				   struct xdr_stream *xdr, void *__unused)
 {
-	struct rpc_auth *auth = req->rq_cred->cr_auth;
-	unsigned int replen;
-
-	p = xdr_encode_fhandle(p, args->fh);
-	req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
-
-	/* Inline the page array */
-	replen = (RPC_REPHDRSIZE + auth->au_rslack + NFS_readlinkres_sz) << 2;
-	xdr_inline_pages(&req->rq_rcv_buf, replen, args->pages, args->pgbase, args->pglen);
-	return 0;
+	enum nfs_stat status;
+	int error;
+
+	error = decode_stat(xdr, &status);
+	if (unlikely(error))
+		goto out;
+	if (status != NFS_OK)
+		goto out_default;
+	error = decode_readdirok(xdr);
+out:
+	return error;
+out_default:
+	return nfs_stat_to_errno(status);
 }
 
 /*
- * Decode READLINK reply
+ * 2.2.18. statfsres
+ *
+ *	union statfsres (stat status) {
+ *	case NFS_OK:
+ *		struct {
+ *			unsigned tsize;
+ *			unsigned bsize;
+ *			unsigned blocks;
+ *			unsigned bfree;
+ *			unsigned bavail;
+ *		} info;
+ *	default:
+ *		void;
+ *	};
  */
-static int
-nfs_xdr_readlinkres(struct rpc_rqst *req, __be32 *p, void *dummy)
+static int decode_info(struct xdr_stream *xdr, struct nfs2_fsstat *result)
 {
-	struct xdr_buf *rcvbuf = &req->rq_rcv_buf;
-	struct kvec *iov = rcvbuf->head;
-	size_t hdrlen;
-	u32 len, recvd;
-	int status;
-
-	if ((status = ntohl(*p++)))
-		return nfs_stat_to_errno(status);
-	/* Convert length of symlink */
-	len = ntohl(*p++);
-	if (len >= rcvbuf->page_len) {
-		dprintk("nfs: server returned giant symlink!\n");
-		return -ENAMETOOLONG;
-	}
-	hdrlen = (u8 *) p - (u8 *) iov->iov_base;
-	if (iov->iov_len < hdrlen) {
-		dprintk("NFS: READLINK reply header overflowed:"
-				"length %Zu > %Zu\n", hdrlen, iov->iov_len);
-		return -errno_NFSERR_IO;
-	} else if (iov->iov_len != hdrlen) {
-		dprintk("NFS: READLINK header is short. iovec will be shifted.\n");
-		xdr_shift_buf(rcvbuf, iov->iov_len - hdrlen);
-	}
-	recvd = req->rq_rcv_buf.len - hdrlen;
-	if (recvd < len) {
-		dprintk("NFS: server cheating in readlink reply: "
-				"count %u > recvd %u\n", len, recvd);
-		return -EIO;
-	}
+	__be32 *p;
 
-	xdr_terminate_string(rcvbuf, len);
+	p = xdr_inline_decode(xdr, NFS_info_sz << 2);
+	if (unlikely(p == NULL))
+		goto out_overflow;
+	result->tsize = be32_to_cpup(p++);
+	result->bsize = be32_to_cpup(p++);
+	result->blocks = be32_to_cpup(p++);
+	result->bfree = be32_to_cpup(p++);
+	result->bavail = be32_to_cpup(p);
 	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
-/*
- * Decode WRITE reply
- */
-static int
-nfs_xdr_writeres(struct rpc_rqst *req, __be32 *p, struct nfs_writeres *res)
+static int nfs2_xdr_dec_statfsres(struct rpc_rqst *req, struct xdr_stream *xdr,
+				  struct nfs2_fsstat *result)
 {
-	res->verf->committed = NFS_FILE_SYNC;
-	return nfs_xdr_attrstat(req, p, res->fattr);
+	enum nfs_stat status;
+	int error;
+
+	error = decode_stat(xdr, &status);
+	if (unlikely(error))
+		goto out;
+	if (status != NFS_OK)
+		goto out_default;
+	error = decode_info(xdr, result);
+out:
+	return error;
+out_default:
+	return nfs_stat_to_errno(status);
 }
 
-/*
- * Decode STATFS reply
- */
-static int
-nfs_xdr_statfsres(struct rpc_rqst *req, __be32 *p, struct nfs2_fsstat *res)
-{
-	int status;
-
-	if ((status = ntohl(*p++)))
-		return nfs_stat_to_errno(status);
-
-	res->tsize = ntohl(*p++);
-	res->bsize = ntohl(*p++);
-	res->blocks = ntohl(*p++);
-	res->bfree = ntohl(*p++);
-	res->bavail = ntohl(*p++);
-	return 0;
-}
-
 
 /*
  * We need to translate between nfs status return values and
  * the local errno values which may not be the same.
  */
-static struct {
+static const struct {
 	int stat;
 	int errno;
 } nfs_errtbl[] = {
@@ -678,28 +1102,30 @@ static struct {
 	{ -1,			-EIO		}
 };
 
-/*
- * Convert an NFS error code to a local one.
- * This one is used jointly by NFSv2 and NFSv3.
+/**
+ * nfs_stat_to_errno - convert an NFS status code to a local errno
+ * @status: NFS status code to convert
+ *
+ * Returns a local errno value, or -EIO if the NFS status code is
+ * not recognized.  This function is used jointly by NFSv2 and NFSv3.
  */
-int
-nfs_stat_to_errno(int stat)
+int nfs_stat_to_errno(enum nfs_stat status)
 {
 	int i;
 
 	for (i = 0; nfs_errtbl[i].stat != -1; i++) {
-		if (nfs_errtbl[i].stat == stat)
+		if (nfs_errtbl[i].stat == (int)status)
 			return nfs_errtbl[i].errno;
 	}
-	dprintk("nfs_stat_to_errno: bad nfs status return value: %d\n", stat);
+	dprintk("NFS: Unrecognized nfs status value: %u\n", status);
 	return nfs_errtbl[i].errno;
 }
 
 #define PROC(proc, argtype, restype, timer)				\
 [NFSPROC_##proc] = {							\
 	.p_proc = NFSPROC_##proc,					\
-	.p_encode = (kxdrproc_t) nfs_xdr_##argtype,			\
-	.p_decode = (kxdrproc_t) nfs_xdr_##restype,			\
+	.p_encode = (kxdreproc_t)nfs2_xdr_enc_##argtype,		\
+	.p_decode = (kxdrdproc_t)nfs2_xdr_dec_##restype,		\
 	.p_arglen = NFS_##argtype##_sz,					\
 	.p_replen = NFS_##restype##_sz,					\
 	.p_timer = timer,						\
@@ -707,21 +1133,21 @@ nfs_stat_to_errno(int stat)
 	.p_name = #proc,						\
 	}
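
Expanded by hand, a single table entry shows how the naming convention ties the encoders, decoders, and size macros together. PROC(GETATTR, fhandle, attrstat, 1) becomes roughly the following (one field between .p_timer and .p_name is elided by the hunk header above, so it is omitted here too):

	/* Illustration: manual expansion of PROC(GETATTR, fhandle, attrstat, 1) */
	[NFSPROC_GETATTR] = {
		.p_proc   = NFSPROC_GETATTR,
		.p_encode = (kxdreproc_t)nfs2_xdr_enc_fhandle,
		.p_decode = (kxdrdproc_t)nfs2_xdr_dec_attrstat,
		.p_arglen = NFS_fhandle_sz,
		.p_replen = NFS_attrstat_sz,
		.p_timer  = 1,
		/* one field elided here by the hunk header above */
		.p_name   = "GETATTR",
	},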
 struct rpc_procinfo	nfs_procedures[] = {
 	PROC(GETATTR,	fhandle,	attrstat, 1),
 	PROC(SETATTR,	sattrargs,	attrstat, 0),
 	PROC(LOOKUP,	diropargs,	diropres, 2),
 	PROC(READLINK,	readlinkargs,	readlinkres, 3),
 	PROC(READ,	readargs,	readres, 3),
 	PROC(WRITE,	writeargs,	writeres, 4),
 	PROC(CREATE,	createargs,	diropres, 0),
 	PROC(REMOVE,	removeargs,	stat, 0),
 	PROC(RENAME,	renameargs,	stat, 0),
 	PROC(LINK,	linkargs,	stat, 0),
 	PROC(SYMLINK,	symlinkargs,	stat, 0),
 	PROC(MKDIR,	createargs,	diropres, 0),
 	PROC(RMDIR,	diropargs,	stat, 0),
 	PROC(READDIR,	readdirargs,	readdirres, 3),
 	PROC(STATFS,	fhandle,	statfsres, 0),
 };
 
 struct rpc_version	nfs_version2 = {
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index f6cc60f06dac..01c5e8b1941d 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -37,18 +37,16 @@
 #define NFS3_filename_sz	(1+(NFS3_MAXNAMLEN>>2))
 #define NFS3_path_sz		(1+(NFS3_MAXPATHLEN>>2))
 #define NFS3_fattr_sz		(21)
+#define NFS3_cookieverf_sz	(NFS3_COOKIEVERFSIZE>>2)
 #define NFS3_wcc_attr_sz	(6)
 #define NFS3_pre_op_attr_sz	(1+NFS3_wcc_attr_sz)
 #define NFS3_post_op_attr_sz	(1+NFS3_fattr_sz)
 #define NFS3_wcc_data_sz	(NFS3_pre_op_attr_sz+NFS3_post_op_attr_sz)
-#define NFS3_fsstat_sz
-#define NFS3_fsinfo_sz
-#define NFS3_pathconf_sz
-#define NFS3_entry_sz		(NFS3_filename_sz+3)
-
-#define NFS3_sattrargs_sz	(NFS3_fh_sz+NFS3_sattr_sz+3)
 #define NFS3_diropargs_sz	(NFS3_fh_sz+NFS3_filename_sz)
-#define NFS3_removeargs_sz	(NFS3_fh_sz+NFS3_filename_sz)
+
+#define NFS3_getattrargs_sz	(NFS3_fh_sz)
+#define NFS3_setattrargs_sz	(NFS3_fh_sz+NFS3_sattr_sz+3)
+#define NFS3_lookupargs_sz	(NFS3_fh_sz+NFS3_filename_sz)
 #define NFS3_accessargs_sz	(NFS3_fh_sz+1)
 #define NFS3_readlinkargs_sz	(NFS3_fh_sz)
 #define NFS3_readargs_sz	(NFS3_fh_sz+3)
@@ -57,14 +55,16 @@
 #define NFS3_mkdirargs_sz	(NFS3_diropargs_sz+NFS3_sattr_sz)
 #define NFS3_symlinkargs_sz	(NFS3_diropargs_sz+1+NFS3_sattr_sz)
 #define NFS3_mknodargs_sz	(NFS3_diropargs_sz+2+NFS3_sattr_sz)
+#define NFS3_removeargs_sz	(NFS3_fh_sz+NFS3_filename_sz)
 #define NFS3_renameargs_sz	(NFS3_diropargs_sz+NFS3_diropargs_sz)
 #define NFS3_linkargs_sz	(NFS3_fh_sz+NFS3_diropargs_sz)
-#define NFS3_readdirargs_sz	(NFS3_fh_sz+2)
+#define NFS3_readdirargs_sz	(NFS3_fh_sz+NFS3_cookieverf_sz+3)
+#define NFS3_readdirplusargs_sz	(NFS3_fh_sz+NFS3_cookieverf_sz+4)
 #define NFS3_commitargs_sz	(NFS3_fh_sz+3)
 
-#define NFS3_attrstat_sz	(1+NFS3_fattr_sz)
-#define NFS3_wccstat_sz		(1+NFS3_wcc_data_sz)
-#define NFS3_removeres_sz	(NFS3_wccstat_sz)
+#define NFS3_getattrres_sz	(1+NFS3_fattr_sz)
+#define NFS3_setattrres_sz	(1+NFS3_wcc_data_sz)
+#define NFS3_removeres_sz	(NFS3_setattrres_sz)
 #define NFS3_lookupres_sz	(1+NFS3_fh_sz+(2 * NFS3_post_op_attr_sz))
 #define NFS3_accessres_sz	(1+NFS3_post_op_attr_sz+1)
 #define NFS3_readlinkres_sz	(1+NFS3_post_op_attr_sz+1)
@@ -100,1079 +100,2362 @@ static const umode_t nfs_type2fmt[] = {
 	[NF3FIFO] = S_IFIFO,
 };
 
+/*
+ * While encoding arguments, set up the reply buffer in advance to
+ * receive reply data directly into the page cache.
+ */
+static void prepare_reply_buffer(struct rpc_rqst *req, struct page **pages,
+				 unsigned int base, unsigned int len,
+				 unsigned int bufsize)
+{
+	struct rpc_auth	*auth = req->rq_cred->cr_auth;
+	unsigned int replen;
+
+	replen = RPC_REPHDRSIZE + auth->au_rslack + bufsize;
+	xdr_inline_pages(&req->rq_rcv_buf, replen << 2, pages, base, len);
+}
+
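Every *_sz constant in these files counts 32-bit XDR words rather than bytes, which is why prepare_reply_buffer() shifts the sum left by two before handing it to xdr_inline_pages(). A minimal sketch of the arithmetic, kept symbolic since the numeric values of the constants are not shown in this hunk:

	/*
	 * Illustration only: deriving the byte offset at which reply page
	 * data begins.  All *_sz macros count 4-byte XDR words.
	 */
	static unsigned int reply_data_offset(unsigned int rephdr_words,
					      unsigned int rslack_words,
					      unsigned int bufsize_words)
	{
		unsigned int words = rephdr_words + rslack_words + bufsize_words;

		return words << 2;	/* one XDR word == 4 bytes */
	}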
+/*
+ * Handle decode buffer overflows out-of-line.
+ */
 static void print_overflow_msg(const char *func, const struct xdr_stream *xdr)
 {
-	dprintk("nfs: %s: prematurely hit end of receive buffer. "
+	dprintk("NFS: %s prematurely hit the end of our receive buffer. "
 		"Remaining buffer length is %tu words.\n",
 		func, xdr->end - xdr->p);
 }
 
+
 /*
- * Common NFS XDR functions as inlines
+ * Encode/decode NFSv3 basic data types
+ *
+ * Basic NFSv3 data types are defined in section 2.5 of RFC 1813:
+ * "NFS Version 3 Protocol Specification".
+ *
+ * Not all basic data types have their own encoding and decoding
+ * functions.  For run-time efficiency, some data types are encoded
+ * or decoded inline.
  */
-static inline __be32 *
-xdr_encode_fhandle(__be32 *p, const struct nfs_fh *fh)
+
+static void encode_uint32(struct xdr_stream *xdr, u32 value)
 {
-	return xdr_encode_array(p, fh->data, fh->size);
+	__be32 *p = xdr_reserve_space(xdr, 4);
+	*p = cpu_to_be32(value);
 }
 
-static inline __be32 *
-xdr_decode_fhandle(__be32 *p, struct nfs_fh *fh)
+static int decode_uint32(struct xdr_stream *xdr, u32 *value)
 {
-	if ((fh->size = ntohl(*p++)) <= NFS3_FHSIZE) {
-		memcpy(fh->data, p, fh->size);
-		return p + XDR_QUADLEN(fh->size);
-	}
-	return NULL;
+	__be32 *p;
+
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(p == NULL))
+		goto out_overflow;
+	*value = be32_to_cpup(p);
+	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+static int decode_uint64(struct xdr_stream *xdr, u64 *value)
+{
+	__be32 *p;
+
+	p = xdr_inline_decode(xdr, 8);
+	if (unlikely(p == NULL))
+		goto out_overflow;
+	xdr_decode_hyper(p, value);
+	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
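
decode_uint64() leans on xdr_decode_hyper() to reassemble an XDR "hyper": per the XDR standard (RFC 4506), a 64-bit value travels as two big-endian 32-bit words, most significant word first. A sketch of the equivalent open-coded logic, for illustration only (the real helper lives in the sunrpc XDR headers):

	/* Sketch: what xdr_decode_hyper() does for an XDR hyper. */
	static inline u64 decode_hyper_sketch(const __be32 *p)
	{
		return ((u64)be32_to_cpup(p) << 32) | be32_to_cpup(p + 1);
	}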
 
+/*
+ * fileid3
+ *
+ *	typedef uint64 fileid3;
+ */
+static __be32 *xdr_decode_fileid3(__be32 *p, u64 *fileid)
+{
+	return xdr_decode_hyper(p, fileid);
+}
+
+static int decode_fileid3(struct xdr_stream *xdr, u64 *fileid)
+{
+	return decode_uint64(xdr, fileid);
+}
+
+/*
+ * filename3
+ *
+ *	typedef string filename3<>;
+ */
+static void encode_filename3(struct xdr_stream *xdr,
+			     const char *name, u32 length)
+{
+	__be32 *p;
+
+	BUG_ON(length > NFS3_MAXNAMLEN);
+	p = xdr_reserve_space(xdr, 4 + length);
+	xdr_encode_opaque(p, name, length);
+}
 
-static inline __be32 *
-xdr_decode_fhandle_stream(struct xdr_stream *xdr, struct nfs_fh *fh)
+static int decode_inline_filename3(struct xdr_stream *xdr,
+				   const char **name, u32 *length)
 {
 	__be32 *p;
+	u32 count;
+
 	p = xdr_inline_decode(xdr, 4);
-	if (unlikely(!p))
+	if (unlikely(p == NULL))
+		goto out_overflow;
+	count = be32_to_cpup(p);
+	if (count > NFS3_MAXNAMLEN)
+		goto out_nametoolong;
+	p = xdr_inline_decode(xdr, count);
+	if (unlikely(p == NULL))
 		goto out_overflow;
-	fh->size = ntohl(*p++);
+	*name = (const char *)p;
+	*length = count;
+	return 0;
 
-	if (fh->size <= NFS3_FHSIZE) {
-		p = xdr_inline_decode(xdr, fh->size);
-		if (unlikely(!p))
-			goto out_overflow;
-		memcpy(fh->data, p, fh->size);
-		return p + XDR_QUADLEN(fh->size);
-	}
-	return NULL;
+out_nametoolong:
+	dprintk("NFS: returned filename too long: %u\n", count);
+	return -ENAMETOOLONG;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+/*
+ * nfspath3
+ *
+ *	typedef string nfspath3<>;
+ */
+static void encode_nfspath3(struct xdr_stream *xdr, struct page **pages,
+			    const u32 length)
+{
+	BUG_ON(length > NFS3_MAXPATHLEN);
+	encode_uint32(xdr, length);
+	xdr_write_pages(xdr, pages, 0, length);
+}
 
+static int decode_nfspath3(struct xdr_stream *xdr)
+{
+	u32 recvd, count;
+	size_t hdrlen;
+	__be32 *p;
+
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(p == NULL))
+		goto out_overflow;
+	count = be32_to_cpup(p);
+	if (unlikely(count >= xdr->buf->page_len || count > NFS3_MAXPATHLEN))
+		goto out_nametoolong;
+	hdrlen = (u8 *)xdr->p - (u8 *)xdr->iov->iov_base;
+	recvd = xdr->buf->len - hdrlen;
+	if (unlikely(count > recvd))
+		goto out_cheating;
+
+	xdr_read_pages(xdr, count);
+	xdr_terminate_string(xdr->buf, count);
+	return 0;
+
+out_nametoolong:
+	dprintk("NFS: returned pathname too long: %u\n", count);
+	return -ENAMETOOLONG;
+out_cheating:
+	dprintk("NFS: server cheating in pathname result: "
+		"count %u > recvd %u\n", count, recvd);
+	return -EIO;
 out_overflow:
 	print_overflow_msg(__func__, xdr);
-	return ERR_PTR(-EIO);
+	return -EIO;
 }
 
 /*
- * Encode/decode time.
+ * cookie3
+ *
+ *	typedef uint64 cookie3
  */
-static inline __be32 *
-xdr_encode_time3(__be32 *p, struct timespec *timep)
+static __be32 *xdr_encode_cookie3(__be32 *p, u64 cookie)
 {
-	*p++ = htonl(timep->tv_sec);
-	*p++ = htonl(timep->tv_nsec);
-	return p;
+	return xdr_encode_hyper(p, cookie);
 }
 
-static inline __be32 *
-xdr_decode_time3(__be32 *p, struct timespec *timep)
+static int decode_cookie3(struct xdr_stream *xdr, u64 *cookie)
 {
-	timep->tv_sec = ntohl(*p++);
-	timep->tv_nsec = ntohl(*p++);
-	return p;
+	return decode_uint64(xdr, cookie);
+}
+
+/*
+ * cookieverf3
+ *
+ *	typedef opaque cookieverf3[NFS3_COOKIEVERFSIZE];
+ */
+static __be32 *xdr_encode_cookieverf3(__be32 *p, const __be32 *verifier)
+{
+	memcpy(p, verifier, NFS3_COOKIEVERFSIZE);
+	return p + XDR_QUADLEN(NFS3_COOKIEVERFSIZE);
+}
+
+static int decode_cookieverf3(struct xdr_stream *xdr, __be32 *verifier)
+{
+	__be32 *p;
+
+	p = xdr_inline_decode(xdr, NFS3_COOKIEVERFSIZE);
+	if (unlikely(p == NULL))
+		goto out_overflow;
+	memcpy(verifier, p, NFS3_COOKIEVERFSIZE);
+	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+/*
+ * createverf3
+ *
+ *	typedef opaque createverf3[NFS3_CREATEVERFSIZE];
+ */
+static void encode_createverf3(struct xdr_stream *xdr, const __be32 *verifier)
+{
+	__be32 *p;
+
+	p = xdr_reserve_space(xdr, NFS3_CREATEVERFSIZE);
+	memcpy(p, verifier, NFS3_CREATEVERFSIZE);
+}
+
+static int decode_writeverf3(struct xdr_stream *xdr, __be32 *verifier)
+{
+	__be32 *p;
+
+	p = xdr_inline_decode(xdr, NFS3_WRITEVERFSIZE);
+	if (unlikely(p == NULL))
+		goto out_overflow;
+	memcpy(verifier, p, NFS3_WRITEVERFSIZE);
+	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+/*
+ * size3
+ *
+ *	typedef uint64 size3;
+ */
+static __be32 *xdr_decode_size3(__be32 *p, u64 *size)
+{
+	return xdr_decode_hyper(p, size);
+}
+
+/*
+ * nfsstat3
+ *
+ *	enum nfsstat3 {
+ *		NFS3_OK = 0,
+ *		...
+ *	}
+ */
+#define NFS3_OK		NFS_OK
+
+static int decode_nfsstat3(struct xdr_stream *xdr, enum nfs_stat *status)
+{
+	__be32 *p;
+
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(p == NULL))
+		goto out_overflow;
+	*status = be32_to_cpup(p);
+	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+/*
+ * ftype3
+ *
+ *	enum ftype3 {
+ *		NF3REG	= 1,
+ *		NF3DIR	= 2,
+ *		NF3BLK	= 3,
+ *		NF3CHR	= 4,
+ *		NF3LNK	= 5,
+ *		NF3SOCK	= 6,
+ *		NF3FIFO	= 7
+ *	};
+ */
+static void encode_ftype3(struct xdr_stream *xdr, const u32 type)
+{
+	BUG_ON(type > NF3FIFO);
+	encode_uint32(xdr, type);
 }
 
-static __be32 *
-xdr_decode_fattr(__be32 *p, struct nfs_fattr *fattr)
-{
-	unsigned int	type, major, minor;
-	umode_t fmode;
-
-	type = ntohl(*p++);
-	if (type > NF3FIFO)
-		type = NF3NON;
-	fmode = nfs_type2fmt[type];
-	fattr->mode = (ntohl(*p++) & ~S_IFMT) | fmode;
-	fattr->nlink = ntohl(*p++);
-	fattr->uid = ntohl(*p++);
-	fattr->gid = ntohl(*p++);
-	p = xdr_decode_hyper(p, &fattr->size);
-	p = xdr_decode_hyper(p, &fattr->du.nfs3.used);
-
-	/* Turn remote device info into Linux-specific dev_t */
-	major = ntohl(*p++);
-	minor = ntohl(*p++);
-	fattr->rdev = MKDEV(major, minor);
-	if (MAJOR(fattr->rdev) != major || MINOR(fattr->rdev) != minor)
-		fattr->rdev = 0;
-
-	p = xdr_decode_hyper(p, &fattr->fsid.major);
-	fattr->fsid.minor = 0;
-	p = xdr_decode_hyper(p, &fattr->fileid);
-	p = xdr_decode_time3(p, &fattr->atime);
-	p = xdr_decode_time3(p, &fattr->mtime);
-	p = xdr_decode_time3(p, &fattr->ctime);
-
-	/* Update the mode bits */
-	fattr->valid |= NFS_ATTR_FATTR_V3;
+static __be32 *xdr_decode_ftype3(__be32 *p, umode_t *mode)
+{
+	u32 type;
+
+	type = be32_to_cpup(p++);
+	if (type > NF3FIFO)
+		type = NF3NON;
+	*mode = nfs_type2fmt[type];
+	return p;
+}
+
+/*
+ * specdata3
+ *
+ *	struct specdata3 {
+ *		uint32	specdata1;
+ *		uint32	specdata2;
+ *	};
+ */
+static void encode_specdata3(struct xdr_stream *xdr, const dev_t rdev)
+{
+	__be32 *p;
+
+	p = xdr_reserve_space(xdr, 8);
+	*p++ = cpu_to_be32(MAJOR(rdev));
+	*p = cpu_to_be32(MINOR(rdev));
+}
+
+static __be32 *xdr_decode_specdata3(__be32 *p, dev_t *rdev)
+{
+	unsigned int major, minor;
+
+	major = be32_to_cpup(p++);
+	minor = be32_to_cpup(p++);
+	*rdev = MKDEV(major, minor);
+	if (MAJOR(*rdev) != major || MINOR(*rdev) != minor)
+		*rdev = 0;
+	return p;
+}
+
+/*
+ * nfs_fh3
+ *
+ *	struct nfs_fh3 {
+ *		opaque	data<NFS3_FHSIZE>;
+ *	};
+ */
+static void encode_nfs_fh3(struct xdr_stream *xdr, const struct nfs_fh *fh)
+{
+	__be32 *p;
+
+	BUG_ON(fh->size > NFS3_FHSIZE);
+	p = xdr_reserve_space(xdr, 4 + fh->size);
+	xdr_encode_opaque(p, fh->data, fh->size);
+}
+
+static int decode_nfs_fh3(struct xdr_stream *xdr, struct nfs_fh *fh)
+{
+	u32 length;
+	__be32 *p;
+
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(p == NULL))
+		goto out_overflow;
+	length = be32_to_cpup(p++);
+	if (unlikely(length > NFS3_FHSIZE))
+		goto out_toobig;
+	p = xdr_inline_decode(xdr, length);
+	if (unlikely(p == NULL))
+		goto out_overflow;
+	fh->size = length;
+	memcpy(fh->data, p, length);
+	return 0;
+out_toobig:
+	dprintk("NFS: file handle size (%u) too big\n", length);
+	return -E2BIG;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+static void zero_nfs_fh3(struct nfs_fh *fh)
+{
+	memset(fh, 0, sizeof(*fh));
+}
+
+/*
+ * nfstime3
+ *
+ *	struct nfstime3 {
+ *		uint32	seconds;
+ *		uint32	nseconds;
+ *	};
+ */
+static __be32 *xdr_encode_nfstime3(__be32 *p, const struct timespec *timep)
+{
+	*p++ = cpu_to_be32(timep->tv_sec);
+	*p++ = cpu_to_be32(timep->tv_nsec);
 	return p;
 }
 
-static inline __be32 *
-xdr_encode_sattr(__be32 *p, struct iattr *attr)
+static __be32 *xdr_decode_nfstime3(__be32 *p, struct timespec *timep)
 {
+	timep->tv_sec = be32_to_cpup(p++);
+	timep->tv_nsec = be32_to_cpup(p++);
+	return p;
+}
+
+/*
+ * sattr3
+ *
+ *	enum time_how {
+ *		DONT_CHANGE		= 0,
+ *		SET_TO_SERVER_TIME	= 1,
+ *		SET_TO_CLIENT_TIME	= 2
+ *	};
+ *
+ *	union set_mode3 switch (bool set_it) {
+ *	case TRUE:
+ *		mode3	mode;
+ *	default:
+ *		void;
+ *	};
+ *
+ *	union set_uid3 switch (bool set_it) {
+ *	case TRUE:
+ *		uid3	uid;
+ *	default:
+ *		void;
+ *	};
+ *
+ *	union set_gid3 switch (bool set_it) {
+ *	case TRUE:
+ *		gid3	gid;
+ *	default:
+ *		void;
+ *	};
+ *
+ *	union set_size3 switch (bool set_it) {
+ *	case TRUE:
+ *		size3	size;
+ *	default:
+ *		void;
+ *	};
+ *
+ *	union set_atime switch (time_how set_it) {
+ *	case SET_TO_CLIENT_TIME:
+ *		nfstime3	atime;
+ *	default:
+ *		void;
+ *	};
+ *
+ *	union set_mtime switch (time_how set_it) {
+ *	case SET_TO_CLIENT_TIME:
+ *		nfstime3	mtime;
+ *	default:
+ *		void;
+ *	};
+ *
+ *	struct sattr3 {
+ *		set_mode3	mode;
+ *		set_uid3	uid;
+ *		set_gid3	gid;
+ *		set_size3	size;
+ *		set_atime	atime;
+ *		set_mtime	mtime;
+ *	};
+ */
+static void encode_sattr3(struct xdr_stream *xdr, const struct iattr *attr)
+{
+	u32 nbytes;
+	__be32 *p;
+
+	/*
+	 * In order to make only a single xdr_reserve_space() call,
+	 * pre-compute the total number of bytes to be reserved.
+	 * Six boolean values, one for each set_foo field, are always
+	 * present in the encoded result, so start there.
+	 */
+	nbytes = 6 * 4;
+	if (attr->ia_valid & ATTR_MODE)
+		nbytes += 4;
+	if (attr->ia_valid & ATTR_UID)
+		nbytes += 4;
+	if (attr->ia_valid & ATTR_GID)
+		nbytes += 4;
+	if (attr->ia_valid & ATTR_SIZE)
+		nbytes += 8;
+	if (attr->ia_valid & ATTR_ATIME_SET)
+		nbytes += 8;
+	if (attr->ia_valid & ATTR_MTIME_SET)
+		nbytes += 8;
+	p = xdr_reserve_space(xdr, nbytes);
+
 	if (attr->ia_valid & ATTR_MODE) {
 		*p++ = xdr_one;
-		*p++ = htonl(attr->ia_mode & S_IALLUGO);
-	} else {
+		*p++ = cpu_to_be32(attr->ia_mode & S_IALLUGO);
+	} else
 		*p++ = xdr_zero;
-	}
+
 	if (attr->ia_valid & ATTR_UID) {
 		*p++ = xdr_one;
-		*p++ = htonl(attr->ia_uid);
-	} else {
+		*p++ = cpu_to_be32(attr->ia_uid);
+	} else
 		*p++ = xdr_zero;
-	}
+
 	if (attr->ia_valid & ATTR_GID) {
 		*p++ = xdr_one;
-		*p++ = htonl(attr->ia_gid);
-	} else {
+		*p++ = cpu_to_be32(attr->ia_gid);
+	} else
 		*p++ = xdr_zero;
-	}
+
 	if (attr->ia_valid & ATTR_SIZE) {
 		*p++ = xdr_one;
-		p = xdr_encode_hyper(p, (__u64) attr->ia_size);
-	} else {
+		p = xdr_encode_hyper(p, (u64)attr->ia_size);
+	} else
 		*p++ = xdr_zero;
-	}
+
 	if (attr->ia_valid & ATTR_ATIME_SET) {
 		*p++ = xdr_two;
-		p = xdr_encode_time3(p, &attr->ia_atime);
+		p = xdr_encode_nfstime3(p, &attr->ia_atime);
 	} else if (attr->ia_valid & ATTR_ATIME) {
 		*p++ = xdr_one;
-	} else {
+	} else
 		*p++ = xdr_zero;
-	}
+
 	if (attr->ia_valid & ATTR_MTIME_SET) {
 		*p++ = xdr_two;
-		p = xdr_encode_time3(p, &attr->ia_mtime);
+		xdr_encode_nfstime3(p, &attr->ia_mtime);
 	} else if (attr->ia_valid & ATTR_MTIME) {
-		*p++ = xdr_one;
-	} else {
-		*p++ = xdr_zero;
-	}
+		*p = xdr_one;
+	} else
+		*p = xdr_zero;
 }
-	return p;
-}
+
+/*
+ * fattr3
+ *
+ *	struct fattr3 {
+ *		ftype3		type;
+ *		mode3		mode;
+ *		uint32		nlink;
+ *		uid3		uid;
+ *		gid3		gid;
+ *		size3		size;
+ *		size3		used;
+ *		specdata3	rdev;
+ *		uint64		fsid;
+ *		fileid3		fileid;
+ *		nfstime3	atime;
+ *		nfstime3	mtime;
+ *		nfstime3	ctime;
+ *	};
+ */
+static int decode_fattr3(struct xdr_stream *xdr, struct nfs_fattr *fattr)
+{
+	umode_t fmode;
+	__be32 *p;
+
+	p = xdr_inline_decode(xdr, NFS3_fattr_sz << 2);
+	if (unlikely(p == NULL))
+		goto out_overflow;
+
+	p = xdr_decode_ftype3(p, &fmode);
+
+	fattr->mode = (be32_to_cpup(p++) & ~S_IFMT) | fmode;
+	fattr->nlink = be32_to_cpup(p++);
+	fattr->uid = be32_to_cpup(p++);
+	fattr->gid = be32_to_cpup(p++);
+
+	p = xdr_decode_size3(p, &fattr->size);
+	p = xdr_decode_size3(p, &fattr->du.nfs3.used);
+	p = xdr_decode_specdata3(p, &fattr->rdev);
+
+	p = xdr_decode_hyper(p, &fattr->fsid.major);
+	fattr->fsid.minor = 0;
+
+	p = xdr_decode_fileid3(p, &fattr->fileid);
+	p = xdr_decode_nfstime3(p, &fattr->atime);
+	p = xdr_decode_nfstime3(p, &fattr->mtime);
+	xdr_decode_nfstime3(p, &fattr->ctime);
+
+	fattr->valid |= NFS_ATTR_FATTR_V3;
+	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
 
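To make the pre-computation in encode_sattr3() above concrete: for an iattr that sets only the mode and the size, the reservation works out to 6 * 4 + 4 + 8 = 36 bytes. A worked sketch of that arithmetic, for illustration only:

	/* Worked example (illustration): ATTR_MODE | ATTR_SIZE. */
	u32 nbytes = 6 * 4;	/* six set_foo discriminator words = 24 */
	nbytes += 4;		/* ATTR_MODE: one extra mode3 word  = 28 */
	nbytes += 8;		/* ATTR_SIZE: one extra size3 hyper = 36 */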
-static inline __be32 *
-xdr_decode_wcc_attr(__be32 *p, struct nfs_fattr *fattr)
+/*
+ * post_op_attr
+ *
+ *	union post_op_attr switch (bool attributes_follow) {
+ *	case TRUE:
+ *		fattr3	attributes;
+ *	case FALSE:
+ *		void;
+ *	};
+ */
+static int decode_post_op_attr(struct xdr_stream *xdr, struct nfs_fattr *fattr)
 {
-	p = xdr_decode_hyper(p, &fattr->pre_size);
-	p = xdr_decode_time3(p, &fattr->pre_mtime);
-	p = xdr_decode_time3(p, &fattr->pre_ctime);
+	__be32 *p;
+
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(p == NULL))
+		goto out_overflow;
+	if (*p != xdr_zero)
+		return decode_fattr3(xdr, fattr);
+	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+/*
+ * wcc_attr
+ *	struct wcc_attr {
+ *		size3		size;
+ *		nfstime3	mtime;
+ *		nfstime3	ctime;
+ *	};
+ */
+static int decode_wcc_attr(struct xdr_stream *xdr, struct nfs_fattr *fattr)
+{
+	__be32 *p;
+
+	p = xdr_inline_decode(xdr, NFS3_wcc_attr_sz << 2);
+	if (unlikely(p == NULL))
+		goto out_overflow;
+
 	fattr->valid |= NFS_ATTR_FATTR_PRESIZE
-			| NFS_ATTR_FATTR_PREMTIME
-			| NFS_ATTR_FATTR_PRECTIME;
-	return p;
-}
+		| NFS_ATTR_FATTR_PREMTIME
+		| NFS_ATTR_FATTR_PRECTIME;
 
-static inline __be32 *
-xdr_decode_post_op_attr(__be32 *p, struct nfs_fattr *fattr)
-{
-	if (*p++)
-		p = xdr_decode_fattr(p, fattr);
-	return p;
+	p = xdr_decode_size3(p, &fattr->pre_size);
+	p = xdr_decode_nfstime3(p, &fattr->pre_mtime);
+	xdr_decode_nfstime3(p, &fattr->pre_ctime);
+
+	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
-static inline __be32 *
-xdr_decode_post_op_attr_stream(struct xdr_stream *xdr, struct nfs_fattr *fattr)
+/*
+ * pre_op_attr
+ *	union pre_op_attr switch (bool attributes_follow) {
+ *	case TRUE:
+ *		wcc_attr	attributes;
+ *	case FALSE:
+ *		void;
+ *	};
+ *
+ * wcc_data
+ *
+ *	struct wcc_data {
+ *		pre_op_attr	before;
+ *		post_op_attr	after;
+ *	};
+ */
+static int decode_pre_op_attr(struct xdr_stream *xdr, struct nfs_fattr *fattr)
 {
 	__be32 *p;
 
 	p = xdr_inline_decode(xdr, 4);
-	if (unlikely(!p))
+	if (unlikely(p == NULL))
 		goto out_overflow;
-	if (ntohl(*p++)) {
-		p = xdr_inline_decode(xdr, 84);
-		if (unlikely(!p))
-			goto out_overflow;
-		p = xdr_decode_fattr(p, fattr);
-	}
-	return p;
+	if (*p != xdr_zero)
+		return decode_wcc_attr(xdr, fattr);
+	return 0;
 out_overflow:
 	print_overflow_msg(__func__, xdr);
-	return ERR_PTR(-EIO);
+	return -EIO;
 }
 
-static inline __be32 *
-xdr_decode_pre_op_attr(__be32 *p, struct nfs_fattr *fattr)
+static int decode_wcc_data(struct xdr_stream *xdr, struct nfs_fattr *fattr)
 {
-	if (*p++)
-		return xdr_decode_wcc_attr(p, fattr);
-	return p;
+	int error;
+
+	error = decode_pre_op_attr(xdr, fattr);
+	if (unlikely(error))
+		goto out;
+	error = decode_post_op_attr(xdr, fattr);
+out:
+	return error;
 }
 
+/*
+ * post_op_fh3
+ *
+ *	union post_op_fh3 switch (bool handle_follows) {
+ *	case TRUE:
+ *		nfs_fh3	handle;
+ *	case FALSE:
+ *		void;
+ *	};
+ */
+static int decode_post_op_fh3(struct xdr_stream *xdr, struct nfs_fh *fh)
+{
+	__be32 *p = xdr_inline_decode(xdr, 4);
+	if (unlikely(p == NULL))
+		goto out_overflow;
+	if (*p != xdr_zero)
+		return decode_nfs_fh3(xdr, fh);
+	zero_nfs_fh3(fh);
+	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
 
-static inline __be32 *
-xdr_decode_wcc_data(__be32 *p, struct nfs_fattr *fattr)
+/*
+ * diropargs3
+ *
+ *	struct diropargs3 {
+ *		nfs_fh3		dir;
+ *		filename3	name;
+ *	};
+ */
+static void encode_diropargs3(struct xdr_stream *xdr, const struct nfs_fh *fh,
+			      const char *name, u32 length)
 {
-	p = xdr_decode_pre_op_attr(p, fattr);
-	return xdr_decode_post_op_attr(p, fattr);
+	encode_nfs_fh3(xdr, fh);
+	encode_filename3(xdr, name, length);
 }
 
+
 /*
- * NFS encode functions
+ * NFSv3 XDR encode functions
+ *
+ * NFSv3 argument types are defined in section 3.3 of RFC 1813:
+ * "NFS Version 3 Protocol Specification".
  */
 
 /*
- * Encode file handle argument
+ * 3.3.1  GETATTR3args
+ *
+ *	struct GETATTR3args {
+ *		nfs_fh3	object;
+ *	};
  */
-static int
-nfs3_xdr_fhandle(struct rpc_rqst *req, __be32 *p, struct nfs_fh *fh)
+static void nfs3_xdr_enc_getattr3args(struct rpc_rqst *req,
+				      struct xdr_stream *xdr,
+				      const struct nfs_fh *fh)
 {
-	p = xdr_encode_fhandle(p, fh);
-	req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
-	return 0;
+	encode_nfs_fh3(xdr, fh);
 }
 
 /*
- * Encode SETATTR arguments
+ * 3.3.2  SETATTR3args
+ *
+ *	union sattrguard3 switch (bool check) {
+ *	case TRUE:
+ *		nfstime3	obj_ctime;
+ *	case FALSE:
+ *		void;
+ *	};
+ *
+ *	struct SETATTR3args {
+ *		nfs_fh3		object;
+ *		sattr3		new_attributes;
+ *		sattrguard3	guard;
+ *	};
  */
-static int
-nfs3_xdr_sattrargs(struct rpc_rqst *req, __be32 *p, struct nfs3_sattrargs *args)
+static void encode_sattrguard3(struct xdr_stream *xdr,
+			       const struct nfs3_sattrargs *args)
 {
-	p = xdr_encode_fhandle(p, args->fh);
-	p = xdr_encode_sattr(p, args->sattr);
-	*p++ = htonl(args->guard);
-	if (args->guard)
-		p = xdr_encode_time3(p, &args->guardtime);
-	req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
-	return 0;
+	__be32 *p;
+
+	if (args->guard) {
+		p = xdr_reserve_space(xdr, 4 + 8);
+		*p++ = xdr_one;
+		xdr_encode_nfstime3(p, &args->guardtime);
+	} else {
+		p = xdr_reserve_space(xdr, 4);
+		*p = xdr_zero;
+	}
+}
+
+static void nfs3_xdr_enc_setattr3args(struct rpc_rqst *req,
+				      struct xdr_stream *xdr,
+				      const struct nfs3_sattrargs *args)
+{
+	encode_nfs_fh3(xdr, args->fh);
+	encode_sattr3(xdr, args->sattr);
+	encode_sattrguard3(xdr, args);
 }
 
 /*
- * Encode directory ops argument
+ * 3.3.3  LOOKUP3args
+ *
+ *	struct LOOKUP3args {
+ *		diropargs3  what;
+ *	};
  */
-static int
-nfs3_xdr_diropargs(struct rpc_rqst *req, __be32 *p, struct nfs3_diropargs *args)
+static void nfs3_xdr_enc_lookup3args(struct rpc_rqst *req,
+				     struct xdr_stream *xdr,
+				     const struct nfs3_diropargs *args)
 {
-	p = xdr_encode_fhandle(p, args->fh);
-	p = xdr_encode_array(p, args->name, args->len);
-	req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
-	return 0;
+	encode_diropargs3(xdr, args->fh, args->name, args->len);
 }
 
 /*
- * Encode REMOVE argument
+ * 3.3.4  ACCESS3args
+ *
+ *	struct ACCESS3args {
+ *		nfs_fh3	object;
+ *		uint32	access;
+ *	};
  */
-static int
-nfs3_xdr_removeargs(struct rpc_rqst *req, __be32 *p, const struct nfs_removeargs *args)
+static void encode_access3args(struct xdr_stream *xdr,
+			       const struct nfs3_accessargs *args)
 {
-	p = xdr_encode_fhandle(p, args->fh);
-	p = xdr_encode_array(p, args->name.name, args->name.len);
-	req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
-	return 0;
+	encode_nfs_fh3(xdr, args->fh);
+	encode_uint32(xdr, args->access);
+}
+
+static void nfs3_xdr_enc_access3args(struct rpc_rqst *req,
+				     struct xdr_stream *xdr,
+				     const struct nfs3_accessargs *args)
+{
+	encode_access3args(xdr, args);
 }
 
 /*
- * Encode access() argument
+ * 3.3.5  READLINK3args
+ *
+ *	struct READLINK3args {
+ *		nfs_fh3	symlink;
+ *	};
  */
-static int
-nfs3_xdr_accessargs(struct rpc_rqst *req, __be32 *p, struct nfs3_accessargs *args)
+static void nfs3_xdr_enc_readlink3args(struct rpc_rqst *req,
+				       struct xdr_stream *xdr,
+				       const struct nfs3_readlinkargs *args)
 {
-	p = xdr_encode_fhandle(p, args->fh);
-	*p++ = htonl(args->access);
-	req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
-	return 0;
+	encode_nfs_fh3(xdr, args->fh);
+	prepare_reply_buffer(req, args->pages, args->pgbase,
+			     args->pglen, NFS3_readlinkres_sz);
 }
 
 /*
- * Arguments to a READ call. Since we read data directly into the page
- * cache, we also set up the reply iovec here so that iov[1] points
- * exactly to the page we want to fetch.
+ * 3.3.6  READ3args
+ *
+ *	struct READ3args {
+ *		nfs_fh3	file;
+ *		offset3	offset;
+ *		count3	count;
+ *	};
  */
-static int
-nfs3_xdr_readargs(struct rpc_rqst *req, __be32 *p, struct nfs_readargs *args)
+static void encode_read3args(struct xdr_stream *xdr,
+			     const struct nfs_readargs *args)
 {
-	struct rpc_auth	*auth = req->rq_cred->cr_auth;
-	unsigned int replen;
-	u32 count = args->count;
+	__be32 *p;
+
+	encode_nfs_fh3(xdr, args->fh);
 
-	p = xdr_encode_fhandle(p, args->fh);
+	p = xdr_reserve_space(xdr, 8 + 4);
 	p = xdr_encode_hyper(p, args->offset);
-	*p++ = htonl(count);
-	req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
+	*p = cpu_to_be32(args->count);
+}
 
-	/* Inline the page array */
-	replen = (RPC_REPHDRSIZE + auth->au_rslack + NFS3_readres_sz) << 2;
-	xdr_inline_pages(&req->rq_rcv_buf, replen,
-			 args->pages, args->pgbase, count);
+static void nfs3_xdr_enc_read3args(struct rpc_rqst *req,
+				   struct xdr_stream *xdr,
+				   const struct nfs_readargs *args)
+{
+	encode_read3args(xdr, args);
+	prepare_reply_buffer(req, args->pages, args->pgbase,
+			     args->count, NFS3_readres_sz);
 	req->rq_rcv_buf.flags |= XDRBUF_READ;
-	return 0;
 }
 
 /*
- * Write arguments. Splice the buffer to be written into the iovec.
+ * 3.3.7  WRITE3args
+ *
+ *	enum stable_how {
+ *		UNSTABLE  = 0,
+ *		DATA_SYNC = 1,
+ *		FILE_SYNC = 2
+ *	};
+ *
+ *	struct WRITE3args {
+ *		nfs_fh3		file;
+ *		offset3		offset;
+ *		count3		count;
+ *		stable_how	stable;
+ *		opaque		data<>;
+ *	};
  */
-static int
-nfs3_xdr_writeargs(struct rpc_rqst *req, __be32 *p, struct nfs_writeargs *args)
+static void encode_write3args(struct xdr_stream *xdr,
+			      const struct nfs_writeargs *args)
 {
-	struct xdr_buf *sndbuf = &req->rq_snd_buf;
-	u32 count = args->count;
+	__be32 *p;
+
+	encode_nfs_fh3(xdr, args->fh);
 
-	p = xdr_encode_fhandle(p, args->fh);
+	p = xdr_reserve_space(xdr, 8 + 4 + 4 + 4);
 	p = xdr_encode_hyper(p, args->offset);
-	*p++ = htonl(count);
-	*p++ = htonl(args->stable);
-	*p++ = htonl(count);
-	sndbuf->len = xdr_adjust_iovec(sndbuf->head, p);
+	*p++ = cpu_to_be32(args->count);
+	*p++ = cpu_to_be32(args->stable);
+	*p = cpu_to_be32(args->count);
+	xdr_write_pages(xdr, args->pages, args->pgbase, args->count);
+}
 
-	/* Copy the page array */
-	xdr_encode_pages(sndbuf, args->pages, args->pgbase, count);
-	sndbuf->flags |= XDRBUF_WRITE;
-	return 0;
+static void nfs3_xdr_enc_write3args(struct rpc_rqst *req,
+				    struct xdr_stream *xdr,
+				    const struct nfs_writeargs *args)
+{
+	encode_write3args(xdr, args);
+	xdr->buf->flags |= XDRBUF_WRITE;
 }
 
 /*
- * Encode CREATE arguments
+ * 3.3.8  CREATE3args
+ *
+ *	enum createmode3 {
+ *		UNCHECKED = 0,
+ *		GUARDED   = 1,
+ *		EXCLUSIVE = 2
+ *	};
+ *
+ *	union createhow3 switch (createmode3 mode) {
+ *	case UNCHECKED:
+ *	case GUARDED:
+ *		sattr3		obj_attributes;
+ *	case EXCLUSIVE:
+ *		createverf3	verf;
+ *	};
+ *
+ *	struct CREATE3args {
+ *		diropargs3	where;
+ *		createhow3	how;
+ *	};
  */
-static int
-nfs3_xdr_createargs(struct rpc_rqst *req, __be32 *p, struct nfs3_createargs *args)
+static void encode_createhow3(struct xdr_stream *xdr,
+			      const struct nfs3_createargs *args)
 {
-	p = xdr_encode_fhandle(p, args->fh);
-	p = xdr_encode_array(p, args->name, args->len);
-
-	*p++ = htonl(args->createmode);
-	if (args->createmode == NFS3_CREATE_EXCLUSIVE) {
-		*p++ = args->verifier[0];
-		*p++ = args->verifier[1];
-	} else
-		p = xdr_encode_sattr(p, args->sattr);
+	encode_uint32(xdr, args->createmode);
+	switch (args->createmode) {
+	case NFS3_CREATE_UNCHECKED:
+	case NFS3_CREATE_GUARDED:
+		encode_sattr3(xdr, args->sattr);
+		break;
+	case NFS3_CREATE_EXCLUSIVE:
+		encode_createverf3(xdr, args->verifier);
+		break;
+	default:
+		BUG();
+	}
+}
 
-	req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
-	return 0;
+static void nfs3_xdr_enc_create3args(struct rpc_rqst *req,
+				     struct xdr_stream *xdr,
+				     const struct nfs3_createargs *args)
+{
+	encode_diropargs3(xdr, args->fh, args->name, args->len);
+	encode_createhow3(xdr, args);
 }
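
The EXCLUSIVE arm of createhow3 is what gives O_EXCL its over-the-wire atomicity: instead of attributes, the client sends an opaque verifier that the server records with the new file, so a retransmitted CREATE can be recognized rather than failed (RFC 1813, section 3.3.8). A hedged sketch of how a caller might pick the mode; the helper below is purely illustrative and not part of this file:

	/* Illustration only: selecting an NFSv3 createmode for open(O_CREAT). */
	static u32 choose_createmode3(int open_flags)
	{
		if (open_flags & O_EXCL)
			return NFS3_CREATE_EXCLUSIVE;	/* verifier-based, replay-safe */
		return NFS3_CREATE_UNCHECKED;		/* sattr3 sent instead */
	}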
 
 /*
- * Encode MKDIR arguments
+ * 3.3.9  MKDIR3args
+ *
+ *	struct MKDIR3args {
+ *		diropargs3	where;
+ *		sattr3		attributes;
+ *	};
  */
-static int
-nfs3_xdr_mkdirargs(struct rpc_rqst *req, __be32 *p, struct nfs3_mkdirargs *args)
+static void nfs3_xdr_enc_mkdir3args(struct rpc_rqst *req,
+				    struct xdr_stream *xdr,
+				    const struct nfs3_mkdirargs *args)
 {
-	p = xdr_encode_fhandle(p, args->fh);
-	p = xdr_encode_array(p, args->name, args->len);
-	p = xdr_encode_sattr(p, args->sattr);
-	req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
-	return 0;
+	encode_diropargs3(xdr, args->fh, args->name, args->len);
+	encode_sattr3(xdr, args->sattr);
 }
 
 /*
- * Encode SYMLINK arguments
+ * 3.3.10  SYMLINK3args
+ *
+ *	struct symlinkdata3 {
+ *		sattr3		symlink_attributes;
+ *		nfspath3	symlink_data;
+ *	};
+ *
+ *	struct SYMLINK3args {
+ *		diropargs3	where;
+ *		symlinkdata3	symlink;
+ *	};
  */
-static int
-nfs3_xdr_symlinkargs(struct rpc_rqst *req, __be32 *p, struct nfs3_symlinkargs *args)
+static void encode_symlinkdata3(struct xdr_stream *xdr,
+				const struct nfs3_symlinkargs *args)
 {
-	p = xdr_encode_fhandle(p, args->fromfh);
-	p = xdr_encode_array(p, args->fromname, args->fromlen);
-	p = xdr_encode_sattr(p, args->sattr);
-	*p++ = htonl(args->pathlen);
-	req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
+	encode_sattr3(xdr, args->sattr);
+	encode_nfspath3(xdr, args->pages, args->pathlen);
+}
 
-	/* Copy the page */
-	xdr_encode_pages(&req->rq_snd_buf, args->pages, 0, args->pathlen);
-	return 0;
+static void nfs3_xdr_enc_symlink3args(struct rpc_rqst *req,
+				      struct xdr_stream *xdr,
+				      const struct nfs3_symlinkargs *args)
+{
+	encode_diropargs3(xdr, args->fromfh, args->fromname, args->fromlen);
+	encode_symlinkdata3(xdr, args);
 }
 
 /*
- * Encode MKNOD arguments
+ * 3.3.11  MKNOD3args
+ *
+ *	struct devicedata3 {
+ *		sattr3		dev_attributes;
+ *		specdata3	spec;
+ *	};
+ *
+ *	union mknoddata3 switch (ftype3 type) {
+ *	case NF3CHR:
+ *	case NF3BLK:
+ *		devicedata3	device;
+ *	case NF3SOCK:
+ *	case NF3FIFO:
+ *		sattr3		pipe_attributes;
+ *	default:
+ *		void;
+ *	};
+ *
+ *	struct MKNOD3args {
+ *		diropargs3	where;
+ *		mknoddata3	what;
+ *	};
  */
-static int
-nfs3_xdr_mknodargs(struct rpc_rqst *req, __be32 *p, struct nfs3_mknodargs *args)
+static void encode_devicedata3(struct xdr_stream *xdr,
+			       const struct nfs3_mknodargs *args)
 {
-	p = xdr_encode_fhandle(p, args->fh);
-	p = xdr_encode_array(p, args->name, args->len);
-	*p++ = htonl(args->type);
-	p = xdr_encode_sattr(p, args->sattr);
-	if (args->type == NF3CHR || args->type == NF3BLK) {
-		*p++ = htonl(MAJOR(args->rdev));
-		*p++ = htonl(MINOR(args->rdev));
+	encode_sattr3(xdr, args->sattr);
+	encode_specdata3(xdr, args->rdev);
+}
+
+static void encode_mknoddata3(struct xdr_stream *xdr,
+			      const struct nfs3_mknodargs *args)
+{
+	encode_ftype3(xdr, args->type);
+	switch (args->type) {
+	case NF3CHR:
+	case NF3BLK:
+		encode_devicedata3(xdr, args);
+		break;
+	case NF3SOCK:
+	case NF3FIFO:
+		encode_sattr3(xdr, args->sattr);
+		break;
+	case NF3REG:
+	case NF3DIR:
+		break;
+	default:
+		BUG();
 	}
+}
 
-	req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
-	return 0;
+static void nfs3_xdr_enc_mknod3args(struct rpc_rqst *req,
+				    struct xdr_stream *xdr,
+				    const struct nfs3_mknodargs *args)
+{
+	encode_diropargs3(xdr, args->fh, args->name, args->len);
+	encode_mknoddata3(xdr, args);
 }
 
 /*
- * Encode RENAME arguments
+ * 3.3.12  REMOVE3args
+ *
+ *	struct REMOVE3args {
+ *		diropargs3  object;
+ *	};
  */
-static int
-nfs3_xdr_renameargs(struct rpc_rqst *req, __be32 *p, struct nfs_renameargs *args)
+static void nfs3_xdr_enc_remove3args(struct rpc_rqst *req,
+				     struct xdr_stream *xdr,
496{ 1168 const struct nfs_removeargs *args)
497 p = xdr_encode_fhandle(p, args->old_dir); 1169{
498 p = xdr_encode_array(p, args->old_name->name, args->old_name->len); 1170 encode_diropargs3(xdr, args->fh, args->name.name, args->name.len);
499 p = xdr_encode_fhandle(p, args->new_dir);
500 p = xdr_encode_array(p, args->new_name->name, args->new_name->len);
501 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
502 return 0;
503} 1171}
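Annotation: every name passed to encode_diropargs3() travels as an XDR opaque — a 4-byte length word followed by the bytes rounded up to a 4-byte boundary. A small sketch of the wire-size arithmetic; the helper name is invented for illustration:

	#include <stdio.h>
	#include <string.h>

	/* XDR opaque sizing: length word plus bytes padded to 4. */
	static size_t xdr_opaque_size(size_t len)
	{
		return 4 + ((len + 3) & ~(size_t)3);
	}

	int main(void)
	{
		const char *name = "a.txt";	/* 5 bytes -> padded to 8 */

		printf("filename3 on the wire: %zu bytes\n",
		       xdr_opaque_size(strlen(name)));	/* prints 12 */
		return 0;
	}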
504 1172
505/* 1173/*
506 * Encode LINK arguments 1174 * 3.3.14 RENAME3args
1175 *
1176 * struct RENAME3args {
1177 * diropargs3 from;
1178 * diropargs3 to;
1179 * };
507 */ 1180 */
508static int 1181static void nfs3_xdr_enc_rename3args(struct rpc_rqst *req,
509nfs3_xdr_linkargs(struct rpc_rqst *req, __be32 *p, struct nfs3_linkargs *args) 1182 struct xdr_stream *xdr,
1183 const struct nfs_renameargs *args)
510{ 1184{
511 p = xdr_encode_fhandle(p, args->fromfh); 1185 const struct qstr *old = args->old_name;
512 p = xdr_encode_fhandle(p, args->tofh); 1186 const struct qstr *new = args->new_name;
513 p = xdr_encode_array(p, args->toname, args->tolen); 1187
514 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); 1188 encode_diropargs3(xdr, args->old_dir, old->name, old->len);
515 return 0; 1189 encode_diropargs3(xdr, args->new_dir, new->name, new->len);
516} 1190}
517 1191
518/* 1192/*
519 * Encode arguments to readdir call 1193 * 3.3.15 LINK3args
1194 *
1195 * struct LINK3args {
1196 * nfs_fh3 file;
1197 * diropargs3 link;
1198 * };
520 */ 1199 */
521static int 1200static void nfs3_xdr_enc_link3args(struct rpc_rqst *req,
522nfs3_xdr_readdirargs(struct rpc_rqst *req, __be32 *p, struct nfs3_readdirargs *args) 1201 struct xdr_stream *xdr,
1202 const struct nfs3_linkargs *args)
523{ 1203{
524 struct rpc_auth *auth = req->rq_cred->cr_auth; 1204 encode_nfs_fh3(xdr, args->fromfh);
525 unsigned int replen; 1205 encode_diropargs3(xdr, args->tofh, args->toname, args->tolen);
526 u32 count = args->count;
527
528 p = xdr_encode_fhandle(p, args->fh);
529 p = xdr_encode_hyper(p, args->cookie);
530 *p++ = args->verf[0];
531 *p++ = args->verf[1];
532 if (args->plus) {
533 /* readdirplus: need dircount + buffer size.
534 * We just make sure we make dircount big enough */
535 *p++ = htonl(count >> 3);
536 }
537 *p++ = htonl(count);
538 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
539
540 /* Inline the page array */
541 replen = (RPC_REPHDRSIZE + auth->au_rslack + NFS3_readdirres_sz) << 2;
542 xdr_inline_pages(&req->rq_rcv_buf, replen, args->pages, 0, count);
543 return 0;
544} 1206}
545 1207
546/* 1208/*
547 * Decode the result of a readdir call. 1209 * 3.3.16 READDIR3args
548 * We just check for syntactical correctness. 1210 *
1211 * struct READDIR3args {
1212 * nfs_fh3 dir;
1213 * cookie3 cookie;
1214 * cookieverf3 cookieverf;
1215 * count3 count;
1216 * };
549 */ 1217 */
550static int 1218static void encode_readdir3args(struct xdr_stream *xdr,
551nfs3_xdr_readdirres(struct rpc_rqst *req, __be32 *p, struct nfs3_readdirres *res) 1219 const struct nfs3_readdirargs *args)
552{ 1220{
553 struct xdr_buf *rcvbuf = &req->rq_rcv_buf; 1221 __be32 *p;
554 struct kvec *iov = rcvbuf->head;
555 struct page **page;
556 size_t hdrlen;
557 u32 recvd, pglen;
558 int status;
559
560 status = ntohl(*p++);
561 /* Decode post_op_attrs */
562 p = xdr_decode_post_op_attr(p, res->dir_attr);
563 if (status)
564 return nfs_stat_to_errno(status);
565 /* Decode verifier cookie */
566 if (res->verf) {
567 res->verf[0] = *p++;
568 res->verf[1] = *p++;
569 } else {
570 p += 2;
571 }
572 1222
573 hdrlen = (u8 *) p - (u8 *) iov->iov_base; 1223 encode_nfs_fh3(xdr, args->fh);
574 if (iov->iov_len < hdrlen) {
575 dprintk("NFS: READDIR reply header overflowed:"
576 "length %Zu > %Zu\n", hdrlen, iov->iov_len);
577 return -errno_NFSERR_IO;
578 } else if (iov->iov_len != hdrlen) {
579 dprintk("NFS: READDIR header is short. iovec will be shifted.\n");
580 xdr_shift_buf(rcvbuf, iov->iov_len - hdrlen);
581 }
582 1224
583 pglen = rcvbuf->page_len; 1225 p = xdr_reserve_space(xdr, 8 + NFS3_COOKIEVERFSIZE + 4);
584 recvd = rcvbuf->len - hdrlen; 1226 p = xdr_encode_cookie3(p, args->cookie);
585 if (pglen > recvd) 1227 p = xdr_encode_cookieverf3(p, args->verf);
586 pglen = recvd; 1228 *p = cpu_to_be32(args->count);
587 page = rcvbuf->pages; 1229}
588 1230
589 return pglen; 1231static void nfs3_xdr_enc_readdir3args(struct rpc_rqst *req,
1232 struct xdr_stream *xdr,
1233 const struct nfs3_readdirargs *args)
1234{
1235 encode_readdir3args(xdr, args);
1236 prepare_reply_buffer(req, args->pages, 0,
1237 args->count, NFS3_readdirres_sz);
590} 1238}
591 1239
592__be32 * 1240/*
593nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, struct nfs_server *server, int plus) 1241 * 3.3.17 READDIRPLUS3args
1242 *
1243 * struct READDIRPLUS3args {
1244 * nfs_fh3 dir;
1245 * cookie3 cookie;
1246 * cookieverf3 cookieverf;
1247 * count3 dircount;
1248 * count3 maxcount;
1249 * };
1250 */
1251static void encode_readdirplus3args(struct xdr_stream *xdr,
1252 const struct nfs3_readdirargs *args)
594{ 1253{
595 __be32 *p; 1254 __be32 *p;
596 struct nfs_entry old = *entry;
597
598 p = xdr_inline_decode(xdr, 4);
599 if (unlikely(!p))
600 goto out_overflow;
601 if (!ntohl(*p++)) {
602 p = xdr_inline_decode(xdr, 4);
603 if (unlikely(!p))
604 goto out_overflow;
605 if (!ntohl(*p++))
606 return ERR_PTR(-EAGAIN);
607 entry->eof = 1;
608 return ERR_PTR(-EBADCOOKIE);
609 }
610 1255
611 p = xdr_inline_decode(xdr, 12); 1256 encode_nfs_fh3(xdr, args->fh);
612 if (unlikely(!p))
613 goto out_overflow;
614 p = xdr_decode_hyper(p, &entry->ino);
615 entry->len = ntohl(*p++);
616 1257
617 p = xdr_inline_decode(xdr, entry->len + 8); 1258 p = xdr_reserve_space(xdr, 8 + NFS3_COOKIEVERFSIZE + 4 + 4);
618 if (unlikely(!p)) 1259 p = xdr_encode_cookie3(p, args->cookie);
619 goto out_overflow; 1260 p = xdr_encode_cookieverf3(p, args->verf);
620 entry->name = (const char *) p;
621 p += XDR_QUADLEN(entry->len);
622 entry->prev_cookie = entry->cookie;
623 p = xdr_decode_hyper(p, &entry->cookie);
624
625 entry->d_type = DT_UNKNOWN;
626 if (plus) {
627 entry->fattr->valid = 0;
628 p = xdr_decode_post_op_attr_stream(xdr, entry->fattr);
629 if (IS_ERR(p))
630 goto out_overflow_exit;
631 entry->d_type = nfs_umode_to_dtype(entry->fattr->mode);
632 /* In fact, a post_op_fh3: */
633 p = xdr_inline_decode(xdr, 4);
634 if (unlikely(!p))
635 goto out_overflow;
636 if (*p++) {
637 p = xdr_decode_fhandle_stream(xdr, entry->fh);
638 if (IS_ERR(p))
639 goto out_overflow_exit;
640 /* Ugh -- server reply was truncated */
641 if (p == NULL) {
642 dprintk("NFS: FH truncated\n");
643 *entry = old;
644 return ERR_PTR(-EAGAIN);
645 }
646 } else
647 memset((u8*)(entry->fh), 0, sizeof(*entry->fh));
648 }
649 1261
650 p = xdr_inline_peek(xdr, 8); 1262 /*
651 if (p != NULL) 1263 * readdirplus: need dircount + buffer size.
652 entry->eof = !p[0] && p[1]; 1264 * We just make sure we make dircount big enough
653 else 1265 */
654 entry->eof = 0; 1266 *p++ = cpu_to_be32(args->count >> 3);
655 1267
656 return p; 1268 *p = cpu_to_be32(args->count);
1269}
657 1270
658out_overflow: 1271static void nfs3_xdr_enc_readdirplus3args(struct rpc_rqst *req,
659 print_overflow_msg(__func__, xdr); 1272 struct xdr_stream *xdr,
660out_overflow_exit: 1273 const struct nfs3_readdirargs *args)
661 return ERR_PTR(-EAGAIN); 1274{
1275 encode_readdirplus3args(xdr, args);
1276 prepare_reply_buffer(req, args->pages, 0,
1277 args->count, NFS3_readdirres_sz);
662} 1278}
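Annotation: READDIRPLUS carries two limits where READDIR has one — dircount bounds just the names and cookies, maxcount the whole reply. As the kernel comment notes, the client derives dircount as one eighth of the caller's buffer rather than tracking it exactly. A tiny illustration of the resulting values:

	#include <stdio.h>

	int main(void)
	{
		unsigned int count = 32768;	/* caller's reply-buffer size */

		/* Matches encode_readdirplus3args(): dircount is a heuristic
		 * one eighth of the buffer, maxcount is the buffer itself. */
		printf("dircount=%u maxcount=%u\n", count >> 3, count);
		return 0;
	}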
663 1279
664/* 1280/*
665 * Encode COMMIT arguments 1281 * 3.3.21 COMMIT3args
1282 *
1283 * struct COMMIT3args {
1284 * nfs_fh3 file;
1285 * offset3 offset;
1286 * count3 count;
1287 * };
666 */ 1288 */
667static int 1289static void encode_commit3args(struct xdr_stream *xdr,
668nfs3_xdr_commitargs(struct rpc_rqst *req, __be32 *p, struct nfs_writeargs *args) 1290 const struct nfs_writeargs *args)
669{ 1291{
670 p = xdr_encode_fhandle(p, args->fh); 1292 __be32 *p;
1293
1294 encode_nfs_fh3(xdr, args->fh);
1295
1296 p = xdr_reserve_space(xdr, 8 + 4);
671 p = xdr_encode_hyper(p, args->offset); 1297 p = xdr_encode_hyper(p, args->offset);
672 *p++ = htonl(args->count); 1298 *p = cpu_to_be32(args->count);
673 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
674 return 0;
675} 1299}
676 1300
677#ifdef CONFIG_NFS_V3_ACL 1301static void nfs3_xdr_enc_commit3args(struct rpc_rqst *req,
678/* 1302 struct xdr_stream *xdr,
679 * Encode GETACL arguments 1303 const struct nfs_writeargs *args)
680 */
681static int
682nfs3_xdr_getaclargs(struct rpc_rqst *req, __be32 *p,
683 struct nfs3_getaclargs *args)
684{ 1304{
685 struct rpc_auth *auth = req->rq_cred->cr_auth; 1305 encode_commit3args(xdr, args);
686 unsigned int replen; 1306}
687 1307
688 p = xdr_encode_fhandle(p, args->fh); 1308#ifdef CONFIG_NFS_V3_ACL
689 *p++ = htonl(args->mask);
690 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
691 1309
692 if (args->mask & (NFS_ACL | NFS_DFACL)) { 1310static void nfs3_xdr_enc_getacl3args(struct rpc_rqst *req,
693 /* Inline the page array */ 1311 struct xdr_stream *xdr,
694 replen = (RPC_REPHDRSIZE + auth->au_rslack + 1312 const struct nfs3_getaclargs *args)
695 ACL3_getaclres_sz) << 2; 1313{
696 xdr_inline_pages(&req->rq_rcv_buf, replen, args->pages, 0, 1314 encode_nfs_fh3(xdr, args->fh);
697 NFSACL_MAXPAGES << PAGE_SHIFT); 1315 encode_uint32(xdr, args->mask);
698 } 1316 if (args->mask & (NFS_ACL | NFS_DFACL))
699 return 0; 1317 prepare_reply_buffer(req, args->pages, 0,
1318 NFSACL_MAXPAGES << PAGE_SHIFT,
1319 ACL3_getaclres_sz);
700} 1320}
701 1321
702/* 1322static void nfs3_xdr_enc_setacl3args(struct rpc_rqst *req,
703 * Encode SETACL arguments 1323 struct xdr_stream *xdr,
704 */ 1324 const struct nfs3_setaclargs *args)
705static int
706nfs3_xdr_setaclargs(struct rpc_rqst *req, __be32 *p,
707 struct nfs3_setaclargs *args)
708{ 1325{
709 struct xdr_buf *buf = &req->rq_snd_buf;
710 unsigned int base; 1326 unsigned int base;
711 int err; 1327 int error;
712
713 p = xdr_encode_fhandle(p, NFS_FH(args->inode));
714 *p++ = htonl(args->mask);
715 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
716 base = req->rq_slen;
717 1328
1329 encode_nfs_fh3(xdr, NFS_FH(args->inode));
1330 encode_uint32(xdr, args->mask);
718 if (args->npages != 0) 1331 if (args->npages != 0)
719 xdr_encode_pages(buf, args->pages, 0, args->len); 1332 xdr_write_pages(xdr, args->pages, 0, args->len);
720 else
721 req->rq_slen = xdr_adjust_iovec(req->rq_svec,
722 p + XDR_QUADLEN(args->len));
723 1333
724 err = nfsacl_encode(buf, base, args->inode, 1334 base = req->rq_slen;
1335 error = nfsacl_encode(xdr->buf, base, args->inode,
725 (args->mask & NFS_ACL) ? 1336 (args->mask & NFS_ACL) ?
726 args->acl_access : NULL, 1, 0); 1337 args->acl_access : NULL, 1, 0);
727 if (err > 0) 1338 BUG_ON(error < 0);
728 err = nfsacl_encode(buf, base + err, args->inode, 1339 error = nfsacl_encode(xdr->buf, base + error, args->inode,
729 (args->mask & NFS_DFACL) ? 1340 (args->mask & NFS_DFACL) ?
730 args->acl_default : NULL, 1, 1341 args->acl_default : NULL, 1,
731 NFS_ACL_DEFAULT); 1342 NFS_ACL_DEFAULT);
732 return (err > 0) ? 0 : err; 1343 BUG_ON(error < 0);
733} 1344}
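Annotation: nfsacl_encode() returns the number of bytes it wrote, which is what lets the SETACL encoder above chain the default ACL directly after the access ACL at base + error. A sketch of that offset-chaining pattern; encode_section is a placeholder invented for illustration:

	#include <stdio.h>

	/* Hypothetical stand-in for nfsacl_encode(): reports where it wrote
	 * and returns the length, so a second section can chain after it. */
	static int encode_section(unsigned int base, unsigned int len)
	{
		printf("section at offset %u, %u bytes\n", base, len);
		return (int)len;
	}

	int main(void)
	{
		unsigned int base = 128;	/* bytes already in the send buffer */
		int n;

		/* Same chaining as nfs3_xdr_enc_setacl3args(): the default ACL
		 * lands at base plus whatever the access ACL consumed. */
		n = encode_section(base, 44);
		if (n > 0)
			encode_section(base + (unsigned int)n, 28);
		return 0;
	}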
1345
734#endif /* CONFIG_NFS_V3_ACL */ 1346#endif /* CONFIG_NFS_V3_ACL */
735 1347
736/* 1348/*
737 * NFS XDR decode functions 1349 * NFSv3 XDR decode functions
1350 *
1351 * NFSv3 result types are defined in section 3.3 of RFC 1813:
1352 * "NFS Version 3 Protocol Specification".
738 */ 1353 */
739 1354
740/* 1355/*
741 * Decode attrstat reply. 1356 * 3.3.1 GETATTR3res
1357 *
1358 * struct GETATTR3resok {
1359 * fattr3 obj_attributes;
1360 * };
1361 *
1362 * union GETATTR3res switch (nfsstat3 status) {
1363 * case NFS3_OK:
1364 * GETATTR3resok resok;
1365 * default:
1366 * void;
1367 * };
742 */ 1368 */
743static int 1369static int nfs3_xdr_dec_getattr3res(struct rpc_rqst *req,
744nfs3_xdr_attrstat(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr) 1370 struct xdr_stream *xdr,
1371 struct nfs_fattr *result)
745{ 1372{
746 int status; 1373 enum nfs_stat status;
747 1374 int error;
748 if ((status = ntohl(*p++))) 1375
749 return nfs_stat_to_errno(status); 1376 error = decode_nfsstat3(xdr, &status);
750 xdr_decode_fattr(p, fattr); 1377 if (unlikely(error))
751 return 0; 1378 goto out;
1379 if (status != NFS3_OK)
1380 goto out_default;
1381 error = decode_fattr3(xdr, result);
1382out:
1383 return error;
1384out_default:
1385 return nfs_stat_to_errno(status);
752} 1386}
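Annotation: every decoder that follows repeats the same skeleton — decode the nfsstat3 word, let stream errors win outright, map NFS3ERR_* through nfs_stat_to_errno(), and only then decode the resok body. A self-contained sketch of that control flow; the arguments stand in for what decode_nfsstat3() would pull off the wire, and the mapping is a placeholder:

	#include <stdio.h>

	#define NFS3_OK 0

	static int map_nfs_status(unsigned int status)
	{
		return -(int)status;	/* placeholder for nfs_stat_to_errno() */
	}

	/* Shared shape of every nfs3_xdr_dec_*3res() above. */
	static int decode_result(int stream_error, unsigned int status)
	{
		if (stream_error)
			return stream_error;	/* short or corrupt reply */
		if (status != NFS3_OK)
			return map_nfs_status(status);
		puts("decode resok body");
		return 0;
	}

	int main(void)
	{
		printf("%d\n", decode_result(0, 0));	/* resok path */
		printf("%d\n", decode_result(0, 2));	/* NFS3ERR_NOENT, mapped */
		return 0;
	}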
753 1387
754/* 1388/*
755 * Decode status+wcc_data reply 1389 * 3.3.2 SETATTR3res
756 * SATTR, REMOVE, RMDIR 1390 *
1391 * struct SETATTR3resok {
1392 * wcc_data obj_wcc;
1393 * };
1394 *
1395 * struct SETATTR3resfail {
1396 * wcc_data obj_wcc;
1397 * };
1398 *
1399 * union SETATTR3res switch (nfsstat3 status) {
1400 * case NFS3_OK:
1401 * SETATTR3resok resok;
1402 * default:
1403 * SETATTR3resfail resfail;
1404 * };
757 */ 1405 */
758static int 1406static int nfs3_xdr_dec_setattr3res(struct rpc_rqst *req,
759nfs3_xdr_wccstat(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr) 1407 struct xdr_stream *xdr,
1408 struct nfs_fattr *result)
760{ 1409{
761 int status; 1410 enum nfs_stat status;
762 1411 int error;
763 if ((status = ntohl(*p++))) 1412
764 status = nfs_stat_to_errno(status); 1413 error = decode_nfsstat3(xdr, &status);
765 xdr_decode_wcc_data(p, fattr); 1414 if (unlikely(error))
766 return status; 1415 goto out;
1416 error = decode_wcc_data(xdr, result);
1417 if (unlikely(error))
1418 goto out;
1419 if (status != NFS3_OK)
1420 goto out_status;
1421out:
1422 return error;
1423out_status:
1424 return nfs_stat_to_errno(status);
767} 1425}
768 1426
769static int 1427/*
770nfs3_xdr_removeres(struct rpc_rqst *req, __be32 *p, struct nfs_removeres *res) 1428 * 3.3.3 LOOKUP3res
1429 *
1430 * struct LOOKUP3resok {
1431 * nfs_fh3 object;
1432 * post_op_attr obj_attributes;
1433 * post_op_attr dir_attributes;
1434 * };
1435 *
1436 * struct LOOKUP3resfail {
1437 * post_op_attr dir_attributes;
1438 * };
1439 *
1440 * union LOOKUP3res switch (nfsstat3 status) {
1441 * case NFS3_OK:
1442 * LOOKUP3resok resok;
1443 * default:
1444 * LOOKUP3resfail resfail;
1445 * };
1446 */
1447static int nfs3_xdr_dec_lookup3res(struct rpc_rqst *req,
1448 struct xdr_stream *xdr,
1449 struct nfs3_diropres *result)
771{ 1450{
772 return nfs3_xdr_wccstat(req, p, res->dir_attr); 1451 enum nfs_stat status;
1452 int error;
1453
1454 error = decode_nfsstat3(xdr, &status);
1455 if (unlikely(error))
1456 goto out;
1457 if (status != NFS3_OK)
1458 goto out_default;
1459 error = decode_nfs_fh3(xdr, result->fh);
1460 if (unlikely(error))
1461 goto out;
1462 error = decode_post_op_attr(xdr, result->fattr);
1463 if (unlikely(error))
1464 goto out;
1465 error = decode_post_op_attr(xdr, result->dir_attr);
1466out:
1467 return error;
1468out_default:
1469 error = decode_post_op_attr(xdr, result->dir_attr);
1470 if (unlikely(error))
1471 goto out;
1472 return nfs_stat_to_errno(status);
773} 1473}
774 1474
775/* 1475/*
776 * Decode LOOKUP reply 1476 * 3.3.4 ACCESS3res
1477 *
1478 * struct ACCESS3resok {
1479 * post_op_attr obj_attributes;
1480 * uint32 access;
1481 * };
1482 *
1483 * struct ACCESS3resfail {
1484 * post_op_attr obj_attributes;
1485 * };
1486 *
1487 * union ACCESS3res switch (nfsstat3 status) {
1488 * case NFS3_OK:
1489 * ACCESS3resok resok;
1490 * default:
1491 * ACCESS3resfail resfail;
1492 * };
777 */ 1493 */
778static int 1494static int nfs3_xdr_dec_access3res(struct rpc_rqst *req,
779nfs3_xdr_lookupres(struct rpc_rqst *req, __be32 *p, struct nfs3_diropres *res) 1495 struct xdr_stream *xdr,
1496 struct nfs3_accessres *result)
780{ 1497{
781 int status; 1498 enum nfs_stat status;
782 1499 int error;
783 if ((status = ntohl(*p++))) { 1500
784 status = nfs_stat_to_errno(status); 1501 error = decode_nfsstat3(xdr, &status);
785 } else { 1502 if (unlikely(error))
786 if (!(p = xdr_decode_fhandle(p, res->fh))) 1503 goto out;
787 return -errno_NFSERR_IO; 1504 error = decode_post_op_attr(xdr, result->fattr);
788 p = xdr_decode_post_op_attr(p, res->fattr); 1505 if (unlikely(error))
789 } 1506 goto out;
790 xdr_decode_post_op_attr(p, res->dir_attr); 1507 if (status != NFS3_OK)
791 return status; 1508 goto out_default;
1509 error = decode_uint32(xdr, &result->access);
1510out:
1511 return error;
1512out_default:
1513 return nfs_stat_to_errno(status);
792} 1514}
793 1515
794/* 1516/*
795 * Decode ACCESS reply 1517 * 3.3.5 READLINK3res
1518 *
1519 * struct READLINK3resok {
1520 * post_op_attr symlink_attributes;
1521 * nfspath3 data;
1522 * };
1523 *
1524 * struct READLINK3resfail {
1525 * post_op_attr symlink_attributes;
1526 * };
1527 *
1528 * union READLINK3res switch (nfsstat3 status) {
1529 * case NFS3_OK:
1530 * READLINK3resok resok;
1531 * default:
1532 * READLINK3resfail resfail;
1533 * };
796 */ 1534 */
797static int 1535static int nfs3_xdr_dec_readlink3res(struct rpc_rqst *req,
798nfs3_xdr_accessres(struct rpc_rqst *req, __be32 *p, struct nfs3_accessres *res) 1536 struct xdr_stream *xdr,
1537 struct nfs_fattr *result)
799{ 1538{
800 int status = ntohl(*p++); 1539 enum nfs_stat status;
801 1540 int error;
802 p = xdr_decode_post_op_attr(p, res->fattr); 1541
803 if (status) 1542 error = decode_nfsstat3(xdr, &status);
804 return nfs_stat_to_errno(status); 1543 if (unlikely(error))
805 res->access = ntohl(*p++); 1544 goto out;
806 return 0; 1545 error = decode_post_op_attr(xdr, result);
1546 if (unlikely(error))
1547 goto out;
1548 if (status != NFS3_OK)
1549 goto out_default;
1550 error = decode_nfspath3(xdr);
1551out:
1552 return error;
1553out_default:
1554 return nfs_stat_to_errno(status);
807} 1555}
808 1556
809static int 1557/*
810nfs3_xdr_readlinkargs(struct rpc_rqst *req, __be32 *p, struct nfs3_readlinkargs *args) 1558 * 3.3.6 READ3res
1559 *
1560 * struct READ3resok {
1561 * post_op_attr file_attributes;
1562 * count3 count;
1563 * bool eof;
1564 * opaque data<>;
1565 * };
1566 *
1567 * struct READ3resfail {
1568 * post_op_attr file_attributes;
1569 * };
1570 *
1571 * union READ3res switch (nfsstat3 status) {
1572 * case NFS3_OK:
1573 * READ3resok resok;
1574 * default:
1575 * READ3resfail resfail;
1576 * };
1577 */
1578static int decode_read3resok(struct xdr_stream *xdr,
1579 struct nfs_readres *result)
811{ 1580{
812 struct rpc_auth *auth = req->rq_cred->cr_auth; 1581 u32 eof, count, ocount, recvd;
813 unsigned int replen; 1582 size_t hdrlen;
1583 __be32 *p;
814 1584
815 p = xdr_encode_fhandle(p, args->fh); 1585 p = xdr_inline_decode(xdr, 4 + 4 + 4);
816 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); 1586 if (unlikely(p == NULL))
1587 goto out_overflow;
1588 count = be32_to_cpup(p++);
1589 eof = be32_to_cpup(p++);
1590 ocount = be32_to_cpup(p++);
1591 if (unlikely(ocount != count))
1592 goto out_mismatch;
1593 hdrlen = (u8 *)xdr->p - (u8 *)xdr->iov->iov_base;
1594 recvd = xdr->buf->len - hdrlen;
1595 if (unlikely(count > recvd))
1596 goto out_cheating;
1597
1598out:
1599 xdr_read_pages(xdr, count);
1600 result->eof = eof;
1601 result->count = count;
1602 return count;
1603out_mismatch:
1604 dprintk("NFS: READ count doesn't match length of opaque: "
1605 "count %u != ocount %u\n", count, ocount);
1606 return -EIO;
1607out_cheating:
1608 dprintk("NFS: server cheating in read result: "
1609 "count %u > recvd %u\n", count, recvd);
1610 count = recvd;
1611 eof = 0;
1612 goto out;
1613out_overflow:
1614 print_overflow_msg(__func__, xdr);
1615 return -EIO;
1616}
817 1617
818 /* Inline the page array */ 1618static int nfs3_xdr_dec_read3res(struct rpc_rqst *req, struct xdr_stream *xdr,
819 replen = (RPC_REPHDRSIZE + auth->au_rslack + NFS3_readlinkres_sz) << 2; 1619 struct nfs_readres *result)
820 xdr_inline_pages(&req->rq_rcv_buf, replen, args->pages, args->pgbase, args->pglen); 1620{
821 return 0; 1621 enum nfs_stat status;
1622 int error;
1623
1624 error = decode_nfsstat3(xdr, &status);
1625 if (unlikely(error))
1626 goto out;
1627 error = decode_post_op_attr(xdr, result->fattr);
1628 if (unlikely(error))
1629 goto out;
1630 if (status != NFS3_OK)
1631 goto out_status;
1632 error = decode_read3resok(xdr, result);
1633out:
1634 return error;
1635out_status:
1636 return nfs_stat_to_errno(status);
822} 1637}
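Annotation: decode_read3resok() cross-checks three lengths — the server's count, the opaque array length (ocount), and the bytes actually received. A mismatched opaque is fatal, while an over-long count is clamped to what arrived and the EOF flag discarded. A compact model of those checks; the function name is invented:

	#include <stdio.h>

	/* Mirrors decode_read3resok(): returns the usable byte count,
	 * or -1 for a malformed reply. 'eof' may be cleared. */
	static int sanitize_read_reply(unsigned int count, unsigned int ocount,
				       unsigned int recvd, unsigned int *eof)
	{
		if (ocount != count)
			return -1;	/* opaque length must match count */
		if (count > recvd) {
			count = recvd;	/* server claimed more than it sent */
			*eof = 0;	/* so its EOF flag cannot be trusted */
		}
		return (int)count;
	}

	int main(void)
	{
		unsigned int eof = 1;

		printf("%d eof=%u\n",
		       sanitize_read_reply(8192, 8192, 4096, &eof), eof);
		return 0;
	}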
823 1638
824/* 1639/*
825 * Decode READLINK reply 1640 * 3.3.7 WRITE3res
1641 *
1642 * enum stable_how {
1643 * UNSTABLE = 0,
1644 * DATA_SYNC = 1,
1645 * FILE_SYNC = 2
1646 * };
1647 *
1648 * struct WRITE3resok {
1649 * wcc_data file_wcc;
1650 * count3 count;
1651 * stable_how committed;
1652 * writeverf3 verf;
1653 * };
1654 *
1655 * struct WRITE3resfail {
1656 * wcc_data file_wcc;
1657 * };
1658 *
1659 * union WRITE3res switch (nfsstat3 status) {
1660 * case NFS3_OK:
1661 * WRITE3resok resok;
1662 * default:
1663 * WRITE3resfail resfail;
1664 * };
826 */ 1665 */
827static int 1666static int decode_write3resok(struct xdr_stream *xdr,
828nfs3_xdr_readlinkres(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr) 1667 struct nfs_writeres *result)
829{ 1668{
830 struct xdr_buf *rcvbuf = &req->rq_rcv_buf; 1669 __be32 *p;
831 struct kvec *iov = rcvbuf->head;
832 size_t hdrlen;
833 u32 len, recvd;
834 int status;
835
836 status = ntohl(*p++);
837 p = xdr_decode_post_op_attr(p, fattr);
838
839 if (status != 0)
840 return nfs_stat_to_errno(status);
841
842 /* Convert length of symlink */
843 len = ntohl(*p++);
844 if (len >= rcvbuf->page_len) {
845 dprintk("nfs: server returned giant symlink!\n");
846 return -ENAMETOOLONG;
847 }
848 1670
849 hdrlen = (u8 *) p - (u8 *) iov->iov_base; 1671 p = xdr_inline_decode(xdr, 4 + 4 + NFS3_WRITEVERFSIZE);
850 if (iov->iov_len < hdrlen) { 1672 if (unlikely(p == NULL))
851 dprintk("NFS: READLINK reply header overflowed:" 1673 goto out_overflow;
852 "length %Zu > %Zu\n", hdrlen, iov->iov_len); 1674 result->count = be32_to_cpup(p++);
853 return -errno_NFSERR_IO; 1675 result->verf->committed = be32_to_cpup(p++);
854 } else if (iov->iov_len != hdrlen) { 1676 if (unlikely(result->verf->committed > NFS_FILE_SYNC))
855 dprintk("NFS: READLINK header is short. " 1677 goto out_badvalue;
856 "iovec will be shifted.\n"); 1678 memcpy(result->verf->verifier, p, NFS3_WRITEVERFSIZE);
857 xdr_shift_buf(rcvbuf, iov->iov_len - hdrlen); 1679 return result->count;
858 } 1680out_badvalue:
859 recvd = req->rq_rcv_buf.len - hdrlen; 1681 dprintk("NFS: bad stable_how value: %u\n", result->verf->committed);
860 if (recvd < len) { 1682 return -EIO;
861 dprintk("NFS: server cheating in readlink reply: " 1683out_overflow:
862 "count %u > recvd %u\n", len, recvd); 1684 print_overflow_msg(__func__, xdr);
863 return -EIO; 1685 return -EIO;
864 } 1686}
865 1687
866 xdr_terminate_string(rcvbuf, len); 1688static int nfs3_xdr_dec_write3res(struct rpc_rqst *req, struct xdr_stream *xdr,
867 return 0; 1689 struct nfs_writeres *result)
1690{
1691 enum nfs_stat status;
1692 int error;
1693
1694 error = decode_nfsstat3(xdr, &status);
1695 if (unlikely(error))
1696 goto out;
1697 error = decode_wcc_data(xdr, result->fattr);
1698 if (unlikely(error))
1699 goto out;
1700 if (status != NFS3_OK)
1701 goto out_status;
1702 error = decode_write3resok(xdr, result);
1703out:
1704 return error;
1705out_status:
1706 return nfs_stat_to_errno(status);
868} 1707}
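Annotation: note the new guard on stable_how — where the old decoder blindly cast the word into the enum, decode_write3resok() rejects anything above FILE_SYNC, since the committed value later drives COMMIT and verifier handling. A sketch of the same range check:

	#include <stdio.h>

	enum stable_how { UNSTABLE = 0, DATA_SYNC = 1, FILE_SYNC = 2 };

	/* Same guard as decode_write3resok(): values above FILE_SYNC are
	 * a protocol violation, not something to propagate. */
	static int check_committed(unsigned int committed)
	{
		return committed <= FILE_SYNC ? 0 : -1;
	}

	int main(void)
	{
		printf("%d %d\n", check_committed(DATA_SYNC), check_committed(7));
		return 0;
	}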
869 1708
870/* 1709/*
871 * Decode READ reply 1710 * 3.3.8 CREATE3res
1711 *
1712 * struct CREATE3resok {
1713 * post_op_fh3 obj;
1714 * post_op_attr obj_attributes;
1715 * wcc_data dir_wcc;
1716 * };
1717 *
1718 * struct CREATE3resfail {
1719 * wcc_data dir_wcc;
1720 * };
1721 *
1722 * union CREATE3res switch (nfsstat3 status) {
1723 * case NFS3_OK:
1724 * CREATE3resok resok;
1725 * default:
1726 * CREATE3resfail resfail;
1727 * };
872 */ 1728 */
873static int 1729static int decode_create3resok(struct xdr_stream *xdr,
874nfs3_xdr_readres(struct rpc_rqst *req, __be32 *p, struct nfs_readres *res) 1730 struct nfs3_diropres *result)
875{ 1731{
876 struct kvec *iov = req->rq_rcv_buf.head; 1732 int error;
877 size_t hdrlen; 1733
878 u32 count, ocount, recvd; 1734 error = decode_post_op_fh3(xdr, result->fh);
879 int status; 1735 if (unlikely(error))
1736 goto out;
1737 error = decode_post_op_attr(xdr, result->fattr);
1738 if (unlikely(error))
1739 goto out;
1740 /* The server isn't required to return a file handle.
1741 * If it didn't, force the client to perform a LOOKUP
1742 * to determine the correct file handle and attribute
1743 * values for the new object. */
1744 if (result->fh->size == 0)
1745 result->fattr->valid = 0;
1746 error = decode_wcc_data(xdr, result->dir_attr);
1747out:
1748 return error;
1749}
880 1750
881 status = ntohl(*p++); 1751static int nfs3_xdr_dec_create3res(struct rpc_rqst *req,
882 p = xdr_decode_post_op_attr(p, res->fattr); 1752 struct xdr_stream *xdr,
1753 struct nfs3_diropres *result)
1754{
1755 enum nfs_stat status;
1756 int error;
1757
1758 error = decode_nfsstat3(xdr, &status);
1759 if (unlikely(error))
1760 goto out;
1761 if (status != NFS3_OK)
1762 goto out_default;
1763 error = decode_create3resok(xdr, result);
1764out:
1765 return error;
1766out_default:
1767 error = decode_wcc_data(xdr, result->dir_attr);
1768 if (unlikely(error))
1769 goto out;
1770 return nfs_stat_to_errno(status);
1771}
883 1772
884 if (status != 0) 1773/*
885 return nfs_stat_to_errno(status); 1774 * 3.3.12 REMOVE3res
1775 *
1776 * struct REMOVE3resok {
1777 * wcc_data dir_wcc;
1778 * };
1779 *
1780 * struct REMOVE3resfail {
1781 * wcc_data dir_wcc;
1782 * };
1783 *
1784 * union REMOVE3res switch (nfsstat3 status) {
1785 * case NFS3_OK:
1786 * REMOVE3resok resok;
1787 * default:
1788 * REMOVE3resfail resfail;
1789 * };
1790 */
1791static int nfs3_xdr_dec_remove3res(struct rpc_rqst *req,
1792 struct xdr_stream *xdr,
1793 struct nfs_removeres *result)
1794{
1795 enum nfs_stat status;
1796 int error;
1797
1798 error = decode_nfsstat3(xdr, &status);
1799 if (unlikely(error))
1800 goto out;
1801 error = decode_wcc_data(xdr, result->dir_attr);
1802 if (unlikely(error))
1803 goto out;
1804 if (status != NFS3_OK)
1805 goto out_status;
1806out:
1807 return error;
1808out_status:
1809 return nfs_stat_to_errno(status);
1810}
886 1811
887 /* Decode reply count and EOF flag. NFSv3 is somewhat redundant 1812/*
888 * in that it puts the count both in the res struct and in the 1813 * 3.3.14 RENAME3res
889 * opaque data count. */ 1814 *
890 count = ntohl(*p++); 1815 * struct RENAME3resok {
891 res->eof = ntohl(*p++); 1816 * wcc_data fromdir_wcc;
892 ocount = ntohl(*p++); 1817 * wcc_data todir_wcc;
1818 * };
1819 *
1820 * struct RENAME3resfail {
1821 * wcc_data fromdir_wcc;
1822 * wcc_data todir_wcc;
1823 * };
1824 *
1825 * union RENAME3res switch (nfsstat3 status) {
1826 * case NFS3_OK:
1827 * RENAME3resok resok;
1828 * default:
1829 * RENAME3resfail resfail;
1830 * };
1831 */
1832static int nfs3_xdr_dec_rename3res(struct rpc_rqst *req,
1833 struct xdr_stream *xdr,
1834 struct nfs_renameres *result)
1835{
1836 enum nfs_stat status;
1837 int error;
1838
1839 error = decode_nfsstat3(xdr, &status);
1840 if (unlikely(error))
1841 goto out;
1842 error = decode_wcc_data(xdr, result->old_fattr);
1843 if (unlikely(error))
1844 goto out;
1845 error = decode_wcc_data(xdr, result->new_fattr);
1846 if (unlikely(error))
1847 goto out;
1848 if (status != NFS3_OK)
1849 goto out_status;
1850out:
1851 return error;
1852out_status:
1853 return nfs_stat_to_errno(status);
1854}
893 1855
894 if (ocount != count) { 1856/*
895 dprintk("NFS: READ count doesn't match RPC opaque count.\n"); 1857 * 3.3.15 LINK3res
896 return -errno_NFSERR_IO; 1858 *
897 } 1859 * struct LINK3resok {
1860 * post_op_attr file_attributes;
1861 * wcc_data linkdir_wcc;
1862 * };
1863 *
1864 * struct LINK3resfail {
1865 * post_op_attr file_attributes;
1866 * wcc_data linkdir_wcc;
1867 * };
1868 *
1869 * union LINK3res switch (nfsstat3 status) {
1870 * case NFS3_OK:
1871 * LINK3resok resok;
1872 * default:
1873 * LINK3resfail resfail;
1874 * };
1875 */
1876static int nfs3_xdr_dec_link3res(struct rpc_rqst *req, struct xdr_stream *xdr,
1877 struct nfs3_linkres *result)
1878{
1879 enum nfs_stat status;
1880 int error;
1881
1882 error = decode_nfsstat3(xdr, &status);
1883 if (unlikely(error))
1884 goto out;
1885 error = decode_post_op_attr(xdr, result->fattr);
1886 if (unlikely(error))
1887 goto out;
1888 error = decode_wcc_data(xdr, result->dir_attr);
1889 if (unlikely(error))
1890 goto out;
1891 if (status != NFS3_OK)
1892 goto out_status;
1893out:
1894 return error;
1895out_status:
1896 return nfs_stat_to_errno(status);
1897}
898 1898
899 hdrlen = (u8 *) p - (u8 *) iov->iov_base; 1899/**
900 if (iov->iov_len < hdrlen) { 1900 * nfs3_decode_dirent - Decode a single NFSv3 directory entry stored in
901 dprintk("NFS: READ reply header overflowed:" 1901 * the local page cache
902 "length %Zu > %Zu\n", hdrlen, iov->iov_len); 1902 * @xdr: XDR stream where entry resides
903 return -errno_NFSERR_IO; 1903 * @entry: buffer to fill in with entry data
904 } else if (iov->iov_len != hdrlen) { 1904 * @plus: boolean indicating whether this should be a readdirplus entry
905 dprintk("NFS: READ header is short. iovec will be shifted.\n"); 1905 *
906 xdr_shift_buf(&req->rq_rcv_buf, iov->iov_len - hdrlen); 1906 * Returns zero if successful, otherwise a negative errno value is
907 } 1907 * returned.
1908 *
1909 * This function is not invoked during READDIR reply decoding, but
1910 * rather whenever an application invokes the getdents(2) system call
1911 * on a directory already in our cache.
1912 *
1913 * 3.3.16 entry3
1914 *
1915 * struct entry3 {
1916 * fileid3 fileid;
1917 * filename3 name;
1918 * cookie3 cookie;
1919 * fhandle3 filehandle;
1920 * post_op_attr3 attributes;
1921 * entry3 *nextentry;
1922 * };
1923 *
1924 * 3.3.17 entryplus3
1925 * struct entryplus3 {
1926 * fileid3 fileid;
1927 * filename3 name;
1928 * cookie3 cookie;
1929 * post_op_attr name_attributes;
1930 * post_op_fh3 name_handle;
1931 * entryplus3 *nextentry;
1932 * };
1933 */
1934int nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
1935 int plus)
1936{
1937 struct nfs_entry old = *entry;
1938 __be32 *p;
1939 int error;
908 1940
909 recvd = req->rq_rcv_buf.len - hdrlen; 1941 p = xdr_inline_decode(xdr, 4);
910 if (count > recvd) { 1942 if (unlikely(p == NULL))
911 dprintk("NFS: server cheating in read reply: " 1943 goto out_overflow;
912 "count %u > recvd %u\n", count, recvd); 1944 if (*p == xdr_zero) {
913 count = recvd; 1945 p = xdr_inline_decode(xdr, 4);
914 res->eof = 0; 1946 if (unlikely(p == NULL))
1947 goto out_overflow;
1948 if (*p == xdr_zero)
1949 return -EAGAIN;
1950 entry->eof = 1;
1951 return -EBADCOOKIE;
915 } 1952 }
916 1953
917 if (count < res->count) 1954 error = decode_fileid3(xdr, &entry->ino);
918 res->count = count; 1955 if (unlikely(error))
1956 return error;
919 1957
920 return count; 1958 error = decode_inline_filename3(xdr, &entry->name, &entry->len);
921} 1959 if (unlikely(error))
1960 return error;
922 1961
923/* 1962 entry->prev_cookie = entry->cookie;
924 * Decode WRITE response 1963 error = decode_cookie3(xdr, &entry->cookie);
925 */ 1964 if (unlikely(error))
926static int 1965 return error;
927nfs3_xdr_writeres(struct rpc_rqst *req, __be32 *p, struct nfs_writeres *res)
928{
929 int status;
930 1966
931 status = ntohl(*p++); 1967 entry->d_type = DT_UNKNOWN;
932 p = xdr_decode_wcc_data(p, res->fattr);
933 1968
934 if (status != 0) 1969 if (plus) {
935 return nfs_stat_to_errno(status); 1970 entry->fattr->valid = 0;
1971 error = decode_post_op_attr(xdr, entry->fattr);
1972 if (unlikely(error))
1973 return error;
1974 if (entry->fattr->valid & NFS_ATTR_FATTR_V3)
1975 entry->d_type = nfs_umode_to_dtype(entry->fattr->mode);
936 1976
937 res->count = ntohl(*p++); 1977 /* In fact, a post_op_fh3: */
938 res->verf->committed = (enum nfs3_stable_how)ntohl(*p++); 1978 p = xdr_inline_decode(xdr, 4);
939 res->verf->verifier[0] = *p++; 1979 if (unlikely(p == NULL))
940 res->verf->verifier[1] = *p++; 1980 goto out_overflow;
1981 if (*p != xdr_zero) {
1982 error = decode_nfs_fh3(xdr, entry->fh);
1983 if (unlikely(error)) {
1984 if (error == -E2BIG)
1985 goto out_truncated;
1986 return error;
1987 }
1988 } else
1989 zero_nfs_fh3(entry->fh);
1990 }
941 1991
942 return res->count; 1992 return 0;
943}
944 1993
945/* 1994out_overflow:
946 * Decode a CREATE response 1995 print_overflow_msg(__func__, xdr);
947 */ 1996 return -EAGAIN;
948static int 1997out_truncated:
949nfs3_xdr_createres(struct rpc_rqst *req, __be32 *p, struct nfs3_diropres *res) 1998 dprintk("NFS: directory entry contains invalid file handle\n");
950{ 1999 *entry = old;
951 int status; 2000 return -EAGAIN;
952
953 status = ntohl(*p++);
954 if (status == 0) {
955 if (*p++) {
956 if (!(p = xdr_decode_fhandle(p, res->fh)))
957 return -errno_NFSERR_IO;
958 p = xdr_decode_post_op_attr(p, res->fattr);
959 } else {
960 memset(res->fh, 0, sizeof(*res->fh));
961 /* Do decode post_op_attr but set it to NULL */
962 p = xdr_decode_post_op_attr(p, res->fattr);
963 res->fattr->valid = 0;
964 }
965 } else {
966 status = nfs_stat_to_errno(status);
967 }
968 p = xdr_decode_wcc_data(p, res->dir_attr);
969 return status;
970} 2001}
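Annotation: the entry stream that nfs3_decode_dirent() consumes is XDR's linked-list encoding — a "value follows" boolean before each entry, a zero to end the list, then the dirlist eof boolean. That is why the function returns -EAGAIN (list exhausted, not yet at EOF) or -EBADCOOKIE (EOF reached). A toy walk over such a stream, with entries reduced to a bare fileid for brevity:

	#include <stdio.h>

	/* A toy dirlist3: "value follows" flags interleaved with entries,
	 * then the trailing eof boolean. */
	static const unsigned int stream[] = { 1, 101, 1, 102, 0, 1 };

	int main(void)
	{
		unsigned int i = 0;

		while (stream[i++])			/* entry present? */
			printf("entry fileid=%u\n", stream[i++]);
		printf("eof=%u\n", stream[i]);		/* dirlist3.eof */
		return 0;
	}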
971 2002
972/* 2003/*
973 * Decode RENAME reply 2004 * 3.3.16 READDIR3res
2005 *
2006 * struct dirlist3 {
2007 * entry3 *entries;
2008 * bool eof;
2009 * };
2010 *
2011 * struct READDIR3resok {
2012 * post_op_attr dir_attributes;
2013 * cookieverf3 cookieverf;
2014 * dirlist3 reply;
2015 * };
2016 *
2017 * struct READDIR3resfail {
2018 * post_op_attr dir_attributes;
2019 * };
2020 *
2021 * union READDIR3res switch (nfsstat3 status) {
2022 * case NFS3_OK:
2023 * READDIR3resok resok;
2024 * default:
2025 * READDIR3resfail resfail;
2026 * };
2027 *
2028 * Read the directory contents into the page cache, but otherwise
2029 * don't touch them. The actual decoding is done by nfs3_decode_entry()
2030 * during subsequent nfs_readdir() calls.
974 */ 2031 */
975static int 2032static int decode_dirlist3(struct xdr_stream *xdr)
976nfs3_xdr_renameres(struct rpc_rqst *req, __be32 *p, struct nfs_renameres *res)
977{ 2033{
978 int status; 2034 u32 recvd, pglen;
2035 size_t hdrlen;
979 2036
980 if ((status = ntohl(*p++)) != 0) 2037 pglen = xdr->buf->page_len;
981 status = nfs_stat_to_errno(status); 2038 hdrlen = (u8 *)xdr->p - (u8 *)xdr->iov->iov_base;
982 p = xdr_decode_wcc_data(p, res->old_fattr); 2039 recvd = xdr->buf->len - hdrlen;
983 p = xdr_decode_wcc_data(p, res->new_fattr); 2040 if (unlikely(pglen > recvd))
984 return status; 2041 goto out_cheating;
2042out:
2043 xdr_read_pages(xdr, pglen);
2044 return pglen;
2045out_cheating:
2046 dprintk("NFS: server cheating in readdir result: "
2047 "pglen %u > recvd %u\n", pglen, recvd);
2048 pglen = recvd;
2049 goto out;
985} 2050}
986 2051
987/* 2052static int decode_readdir3resok(struct xdr_stream *xdr,
988 * Decode LINK reply 2053 struct nfs3_readdirres *result)
989 */
990static int
991nfs3_xdr_linkres(struct rpc_rqst *req, __be32 *p, struct nfs3_linkres *res)
992{ 2054{
993 int status; 2055 int error;
2056
2057 error = decode_post_op_attr(xdr, result->dir_attr);
2058 if (unlikely(error))
2059 goto out;
2060 /* XXX: do we need to check if result->verf != NULL ? */
2061 error = decode_cookieverf3(xdr, result->verf);
2062 if (unlikely(error))
2063 goto out;
2064 error = decode_dirlist3(xdr);
2065out:
2066 return error;
2067}
994 2068
995 if ((status = ntohl(*p++)) != 0) 2069static int nfs3_xdr_dec_readdir3res(struct rpc_rqst *req,
996 status = nfs_stat_to_errno(status); 2070 struct xdr_stream *xdr,
997 p = xdr_decode_post_op_attr(p, res->fattr); 2071 struct nfs3_readdirres *result)
998 p = xdr_decode_wcc_data(p, res->dir_attr); 2072{
999 return status; 2073 enum nfs_stat status;
2074 int error;
2075
2076 error = decode_nfsstat3(xdr, &status);
2077 if (unlikely(error))
2078 goto out;
2079 if (status != NFS3_OK)
2080 goto out_default;
2081 error = decode_readdir3resok(xdr, result);
2082out:
2083 return error;
2084out_default:
2085 error = decode_post_op_attr(xdr, result->dir_attr);
2086 if (unlikely(error))
2087 goto out;
2088 return nfs_stat_to_errno(status);
1000} 2089}
1001 2090
1002/* 2091/*
1003 * Decode FSSTAT reply 2092 * 3.3.18 FSSTAT3res
2093 *
2094 * struct FSSTAT3resok {
2095 * post_op_attr obj_attributes;
2096 * size3 tbytes;
2097 * size3 fbytes;
2098 * size3 abytes;
2099 * size3 tfiles;
2100 * size3 ffiles;
2101 * size3 afiles;
2102 * uint32 invarsec;
2103 * };
2104 *
2105 * struct FSSTAT3resfail {
2106 * post_op_attr obj_attributes;
2107 * };
2108 *
2109 * union FSSTAT3res switch (nfsstat3 status) {
2110 * case NFS3_OK:
2111 * FSSTAT3resok resok;
2112 * default:
2113 * FSSTAT3resfail resfail;
2114 * };
1004 */ 2115 */
1005static int 2116static int decode_fsstat3resok(struct xdr_stream *xdr,
1006nfs3_xdr_fsstatres(struct rpc_rqst *req, __be32 *p, struct nfs_fsstat *res) 2117 struct nfs_fsstat *result)
1007{ 2118{
1008 int status; 2119 __be32 *p;
1009
1010 status = ntohl(*p++);
1011
1012 p = xdr_decode_post_op_attr(p, res->fattr);
1013 if (status != 0)
1014 return nfs_stat_to_errno(status);
1015
1016 p = xdr_decode_hyper(p, &res->tbytes);
1017 p = xdr_decode_hyper(p, &res->fbytes);
1018 p = xdr_decode_hyper(p, &res->abytes);
1019 p = xdr_decode_hyper(p, &res->tfiles);
1020 p = xdr_decode_hyper(p, &res->ffiles);
1021 p = xdr_decode_hyper(p, &res->afiles);
1022 2120
2121 p = xdr_inline_decode(xdr, 8 * 6 + 4);
2122 if (unlikely(p == NULL))
2123 goto out_overflow;
2124 p = xdr_decode_size3(p, &result->tbytes);
2125 p = xdr_decode_size3(p, &result->fbytes);
2126 p = xdr_decode_size3(p, &result->abytes);
2127 p = xdr_decode_size3(p, &result->tfiles);
2128 p = xdr_decode_size3(p, &result->ffiles);
2129 xdr_decode_size3(p, &result->afiles);
1023 /* ignore invarsec */ 2130 /* ignore invarsec */
1024 return 0; 2131 return 0;
2132out_overflow:
2133 print_overflow_msg(__func__, xdr);
2134 return -EIO;
2135}
2136
2137static int nfs3_xdr_dec_fsstat3res(struct rpc_rqst *req,
2138 struct xdr_stream *xdr,
2139 struct nfs_fsstat *result)
2140{
2141 enum nfs_stat status;
2142 int error;
2143
2144 error = decode_nfsstat3(xdr, &status);
2145 if (unlikely(error))
2146 goto out;
2147 error = decode_post_op_attr(xdr, result->fattr);
2148 if (unlikely(error))
2149 goto out;
2150 if (status != NFS3_OK)
2151 goto out_status;
2152 error = decode_fsstat3resok(xdr, result);
2153out:
2154 return error;
2155out_status:
2156 return nfs_stat_to_errno(status);
1025} 2157}
1026 2158
1027/* 2159/*
1028 * Decode FSINFO reply 2160 * 3.3.19 FSINFO3res
2161 *
2162 * struct FSINFO3resok {
2163 * post_op_attr obj_attributes;
2164 * uint32 rtmax;
2165 * uint32 rtpref;
2166 * uint32 rtmult;
2167 * uint32 wtmax;
2168 * uint32 wtpref;
2169 * uint32 wtmult;
2170 * uint32 dtpref;
2171 * size3 maxfilesize;
2172 * nfstime3 time_delta;
2173 * uint32 properties;
2174 * };
2175 *
2176 * struct FSINFO3resfail {
2177 * post_op_attr obj_attributes;
2178 * };
2179 *
2180 * union FSINFO3res switch (nfsstat3 status) {
2181 * case NFS3_OK:
2182 * FSINFO3resok resok;
2183 * default:
2184 * FSINFO3resfail resfail;
2185 * };
1029 */ 2186 */
1030static int 2187static int decode_fsinfo3resok(struct xdr_stream *xdr,
1031nfs3_xdr_fsinfores(struct rpc_rqst *req, __be32 *p, struct nfs_fsinfo *res) 2188 struct nfs_fsinfo *result)
1032{ 2189{
1033 int status; 2190 __be32 *p;
1034
1035 status = ntohl(*p++);
1036
1037 p = xdr_decode_post_op_attr(p, res->fattr);
1038 if (status != 0)
1039 return nfs_stat_to_errno(status);
1040 2191
1041 res->rtmax = ntohl(*p++); 2192 p = xdr_inline_decode(xdr, 4 * 7 + 8 + 8 + 4);
1042 res->rtpref = ntohl(*p++); 2193 if (unlikely(p == NULL))
1043 res->rtmult = ntohl(*p++); 2194 goto out_overflow;
1044 res->wtmax = ntohl(*p++); 2195 result->rtmax = be32_to_cpup(p++);
1045 res->wtpref = ntohl(*p++); 2196 result->rtpref = be32_to_cpup(p++);
1046 res->wtmult = ntohl(*p++); 2197 result->rtmult = be32_to_cpup(p++);
1047 res->dtpref = ntohl(*p++); 2198 result->wtmax = be32_to_cpup(p++);
1048 p = xdr_decode_hyper(p, &res->maxfilesize); 2199 result->wtpref = be32_to_cpup(p++);
1049 p = xdr_decode_time3(p, &res->time_delta); 2200 result->wtmult = be32_to_cpup(p++);
2201 result->dtpref = be32_to_cpup(p++);
2202 p = xdr_decode_size3(p, &result->maxfilesize);
2203 xdr_decode_nfstime3(p, &result->time_delta);
1050 2204
1051 /* ignore properties */ 2205 /* ignore properties */
1052 res->lease_time = 0; 2206 result->lease_time = 0;
1053 return 0; 2207 return 0;
2208out_overflow:
2209 print_overflow_msg(__func__, xdr);
2210 return -EIO;
2211}
2212
2213static int nfs3_xdr_dec_fsinfo3res(struct rpc_rqst *req,
2214 struct xdr_stream *xdr,
2215 struct nfs_fsinfo *result)
2216{
2217 enum nfs_stat status;
2218 int error;
2219
2220 error = decode_nfsstat3(xdr, &status);
2221 if (unlikely(error))
2222 goto out;
2223 error = decode_post_op_attr(xdr, result->fattr);
2224 if (unlikely(error))
2225 goto out;
2226 if (status != NFS3_OK)
2227 goto out_status;
2228 error = decode_fsinfo3resok(xdr, result);
2229out:
2230 return error;
2231out_status:
2232 return nfs_stat_to_errno(status);
1054} 2233}
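Annotation: the FSINFO3resok body after the post_op_attr is fixed-size, which is why decode_fsinfo3resok() can pull it off the wire in a single inline decode; the reserve length 4 * 7 + 8 + 8 + 4 is seven uint32s, one size3, one nfstime3, and the properties word:

	#include <stdio.h>

	int main(void)
	{
		/* rtmax..dtpref (7 words) + maxfilesize + time_delta
		 * + properties, matching the inline-decode length above. */
		printf("%d bytes\n", 4 * 7 + 8 + 8 + 4);	/* prints 48 */
		return 0;
	}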
1055 2234
1056/* 2235/*
1057 * Decode PATHCONF reply 2236 * 3.3.20 PATHCONF3res
2237 *
2238 * struct PATHCONF3resok {
2239 * post_op_attr obj_attributes;
2240 * uint32 linkmax;
2241 * uint32 name_max;
2242 * bool no_trunc;
2243 * bool chown_restricted;
2244 * bool case_insensitive;
2245 * bool case_preserving;
2246 * };
2247 *
2248 * struct PATHCONF3resfail {
2249 * post_op_attr obj_attributes;
2250 * };
2251 *
2252 * union PATHCONF3res switch (nfsstat3 status) {
2253 * case NFS3_OK:
2254 * PATHCONF3resok resok;
2255 * default:
2256 * PATHCONF3resfail resfail;
2257 * };
1058 */ 2258 */
1059static int 2259static int decode_pathconf3resok(struct xdr_stream *xdr,
1060nfs3_xdr_pathconfres(struct rpc_rqst *req, __be32 *p, struct nfs_pathconf *res) 2260 struct nfs_pathconf *result)
1061{ 2261{
1062 int status; 2262 __be32 *p;
1063
1064 status = ntohl(*p++);
1065
1066 p = xdr_decode_post_op_attr(p, res->fattr);
1067 if (status != 0)
1068 return nfs_stat_to_errno(status);
1069 res->max_link = ntohl(*p++);
1070 res->max_namelen = ntohl(*p++);
1071 2263
2264 p = xdr_inline_decode(xdr, 4 * 6);
2265 if (unlikely(p == NULL))
2266 goto out_overflow;
2267 result->max_link = be32_to_cpup(p++);
2268 result->max_namelen = be32_to_cpup(p);
1072 /* ignore remaining fields */ 2269 /* ignore remaining fields */
1073 return 0; 2270 return 0;
2271out_overflow:
2272 print_overflow_msg(__func__, xdr);
2273 return -EIO;
2274}
2275
2276static int nfs3_xdr_dec_pathconf3res(struct rpc_rqst *req,
2277 struct xdr_stream *xdr,
2278 struct nfs_pathconf *result)
2279{
2280 enum nfs_stat status;
2281 int error;
2282
2283 error = decode_nfsstat3(xdr, &status);
2284 if (unlikely(error))
2285 goto out;
2286 error = decode_post_op_attr(xdr, result->fattr);
2287 if (unlikely(error))
2288 goto out;
2289 if (status != NFS3_OK)
2290 goto out_status;
2291 error = decode_pathconf3resok(xdr, result);
2292out:
2293 return error;
2294out_status:
2295 return nfs_stat_to_errno(status);
1074} 2296}
1075 2297
1076/* 2298/*
1077 * Decode COMMIT reply 2299 * 3.3.21 COMMIT3res
2300 *
2301 * struct COMMIT3resok {
2302 * wcc_data file_wcc;
2303 * writeverf3 verf;
2304 * };
2305 *
2306 * struct COMMIT3resfail {
2307 * wcc_data file_wcc;
2308 * };
2309 *
2310 * union COMMIT3res switch (nfsstat3 status) {
2311 * case NFS3_OK:
2312 * COMMIT3resok resok;
2313 * default:
2314 * COMMIT3resfail resfail;
2315 * };
1078 */ 2316 */
1079static int 2317static int nfs3_xdr_dec_commit3res(struct rpc_rqst *req,
1080nfs3_xdr_commitres(struct rpc_rqst *req, __be32 *p, struct nfs_writeres *res) 2318 struct xdr_stream *xdr,
2319 struct nfs_writeres *result)
1081{ 2320{
1082 int status; 2321 enum nfs_stat status;
1083 2322 int error;
1084 status = ntohl(*p++); 2323
1085 p = xdr_decode_wcc_data(p, res->fattr); 2324 error = decode_nfsstat3(xdr, &status);
1086 if (status != 0) 2325 if (unlikely(error))
1087 return nfs_stat_to_errno(status); 2326 goto out;
1088 2327 error = decode_wcc_data(xdr, result->fattr);
1089 res->verf->verifier[0] = *p++; 2328 if (unlikely(error))
1090 res->verf->verifier[1] = *p++; 2329 goto out;
1091 return 0; 2330 if (status != NFS3_OK)
2331 goto out_status;
2332 error = decode_writeverf3(xdr, result->verf->verifier);
2333out:
2334 return error;
2335out_status:
2336 return nfs_stat_to_errno(status);
1092} 2337}
1093 2338
1094#ifdef CONFIG_NFS_V3_ACL 2339#ifdef CONFIG_NFS_V3_ACL
1095/* 2340
1096 * Decode GETACL reply 2341static inline int decode_getacl3resok(struct xdr_stream *xdr,
1097 */ 2342 struct nfs3_getaclres *result)
1098static int
1099nfs3_xdr_getaclres(struct rpc_rqst *req, __be32 *p,
1100 struct nfs3_getaclres *res)
1101{ 2343{
1102 struct xdr_buf *buf = &req->rq_rcv_buf;
1103 int status = ntohl(*p++);
1104 struct posix_acl **acl; 2344 struct posix_acl **acl;
1105 unsigned int *aclcnt; 2345 unsigned int *aclcnt;
1106 int err, base; 2346 size_t hdrlen;
1107 2347 int error;
1108 if (status != 0) 2348
1109 return nfs_stat_to_errno(status); 2349 error = decode_post_op_attr(xdr, result->fattr);
1110 p = xdr_decode_post_op_attr(p, res->fattr); 2350 if (unlikely(error))
1111 res->mask = ntohl(*p++); 2351 goto out;
1112 if (res->mask & ~(NFS_ACL|NFS_ACLCNT|NFS_DFACL|NFS_DFACLCNT)) 2352 error = decode_uint32(xdr, &result->mask);
1113 return -EINVAL; 2353 if (unlikely(error))
1114 base = (char *)p - (char *)req->rq_rcv_buf.head->iov_base; 2354 goto out;
1115 2355 error = -EINVAL;
1116 acl = (res->mask & NFS_ACL) ? &res->acl_access : NULL; 2356 if (result->mask & ~(NFS_ACL|NFS_ACLCNT|NFS_DFACL|NFS_DFACLCNT))
1117 aclcnt = (res->mask & NFS_ACLCNT) ? &res->acl_access_count : NULL; 2357 goto out;
1118 err = nfsacl_decode(buf, base, aclcnt, acl); 2358
1119 2359 hdrlen = (u8 *)xdr->p - (u8 *)xdr->iov->iov_base;
1120 acl = (res->mask & NFS_DFACL) ? &res->acl_default : NULL; 2360
1121 aclcnt = (res->mask & NFS_DFACLCNT) ? &res->acl_default_count : NULL; 2361 acl = NULL;
1122 if (err > 0) 2362 if (result->mask & NFS_ACL)
1123 err = nfsacl_decode(buf, base + err, aclcnt, acl); 2363 acl = &result->acl_access;
1124 return (err > 0) ? 0 : err; 2364 aclcnt = NULL;
2365 if (result->mask & NFS_ACLCNT)
2366 aclcnt = &result->acl_access_count;
2367 error = nfsacl_decode(xdr->buf, hdrlen, aclcnt, acl);
2368 if (unlikely(error <= 0))
2369 goto out;
2370
2371 acl = NULL;
2372 if (result->mask & NFS_DFACL)
2373 acl = &result->acl_default;
2374 aclcnt = NULL;
2375 if (result->mask & NFS_DFACLCNT)
2376 aclcnt = &result->acl_default_count;
2377 error = nfsacl_decode(xdr->buf, hdrlen + error, aclcnt, acl);
2378 if (unlikely(error <= 0))
2379 return error;
2380 error = 0;
2381out:
2382 return error;
1125} 2383}
1126 2384
1127/* 2385static int nfs3_xdr_dec_getacl3res(struct rpc_rqst *req,
1128 * Decode setacl reply. 2386 struct xdr_stream *xdr,
1129 */ 2387 struct nfs3_getaclres *result)
1130static int
1131nfs3_xdr_setaclres(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr)
1132{ 2388{
1133 int status = ntohl(*p++); 2389 enum nfs_stat status;
2390 int error;
2391
2392 error = decode_nfsstat3(xdr, &status);
2393 if (unlikely(error))
2394 goto out;
2395 if (status != NFS3_OK)
2396 goto out_default;
2397 error = decode_getacl3resok(xdr, result);
2398out:
2399 return error;
2400out_default:
2401 return nfs_stat_to_errno(status);
2402}
1134 2403
1135 if (status) 2404static int nfs3_xdr_dec_setacl3res(struct rpc_rqst *req,
1136 return nfs_stat_to_errno(status); 2405 struct xdr_stream *xdr,
1137 xdr_decode_post_op_attr(p, fattr); 2406 struct nfs_fattr *result)
1138 return 0; 2407{
2408 enum nfs_stat status;
2409 int error;
2410
2411 error = decode_nfsstat3(xdr, &status);
2412 if (unlikely(error))
2413 goto out;
2414 if (status != NFS3_OK)
2415 goto out_default;
2416 error = decode_post_op_attr(xdr, result);
2417out:
2418 return error;
2419out_default:
2420 return nfs_stat_to_errno(status);
1139} 2421}
2422
1140#endif /* CONFIG_NFS_V3_ACL */ 2423#endif /* CONFIG_NFS_V3_ACL */
1141 2424
1142#define PROC(proc, argtype, restype, timer) \ 2425#define PROC(proc, argtype, restype, timer) \
1143[NFS3PROC_##proc] = { \ 2426[NFS3PROC_##proc] = { \
1144 .p_proc = NFS3PROC_##proc, \ 2427 .p_proc = NFS3PROC_##proc, \
1145 .p_encode = (kxdrproc_t) nfs3_xdr_##argtype, \ 2428 .p_encode = (kxdreproc_t)nfs3_xdr_enc_##argtype##3args, \
1146 .p_decode = (kxdrproc_t) nfs3_xdr_##restype, \ 2429 .p_decode = (kxdrdproc_t)nfs3_xdr_dec_##restype##3res, \
1147 .p_arglen = NFS3_##argtype##_sz, \ 2430 .p_arglen = NFS3_##argtype##args_sz, \
1148 .p_replen = NFS3_##restype##_sz, \ 2431 .p_replen = NFS3_##restype##res_sz, \
1149 .p_timer = timer, \ 2432 .p_timer = timer, \
1150 .p_statidx = NFS3PROC_##proc, \ 2433 .p_statidx = NFS3PROC_##proc, \
1151 .p_name = #proc, \ 2434 .p_name = #proc, \
1152 } 2435 }
1153 2436
1154struct rpc_procinfo nfs3_procedures[] = { 2437struct rpc_procinfo nfs3_procedures[] = {
1155 PROC(GETATTR, fhandle, attrstat, 1), 2438 PROC(GETATTR, getattr, getattr, 1),
1156 PROC(SETATTR, sattrargs, wccstat, 0), 2439 PROC(SETATTR, setattr, setattr, 0),
1157 PROC(LOOKUP, diropargs, lookupres, 2), 2440 PROC(LOOKUP, lookup, lookup, 2),
1158 PROC(ACCESS, accessargs, accessres, 1), 2441 PROC(ACCESS, access, access, 1),
1159 PROC(READLINK, readlinkargs, readlinkres, 3), 2442 PROC(READLINK, readlink, readlink, 3),
1160 PROC(READ, readargs, readres, 3), 2443 PROC(READ, read, read, 3),
1161 PROC(WRITE, writeargs, writeres, 4), 2444 PROC(WRITE, write, write, 4),
1162 PROC(CREATE, createargs, createres, 0), 2445 PROC(CREATE, create, create, 0),
1163 PROC(MKDIR, mkdirargs, createres, 0), 2446 PROC(MKDIR, mkdir, create, 0),
1164 PROC(SYMLINK, symlinkargs, createres, 0), 2447 PROC(SYMLINK, symlink, create, 0),
1165 PROC(MKNOD, mknodargs, createres, 0), 2448 PROC(MKNOD, mknod, create, 0),
1166 PROC(REMOVE, removeargs, removeres, 0), 2449 PROC(REMOVE, remove, remove, 0),
1167 PROC(RMDIR, diropargs, wccstat, 0), 2450 PROC(RMDIR, lookup, setattr, 0),
1168 PROC(RENAME, renameargs, renameres, 0), 2451 PROC(RENAME, rename, rename, 0),
1169 PROC(LINK, linkargs, linkres, 0), 2452 PROC(LINK, link, link, 0),
1170 PROC(READDIR, readdirargs, readdirres, 3), 2453 PROC(READDIR, readdir, readdir, 3),
1171 PROC(READDIRPLUS, readdirargs, readdirres, 3), 2454 PROC(READDIRPLUS, readdirplus, readdir, 3),
1172 PROC(FSSTAT, fhandle, fsstatres, 0), 2455 PROC(FSSTAT, getattr, fsstat, 0),
1173 PROC(FSINFO, fhandle, fsinfores, 0), 2456 PROC(FSINFO, getattr, fsinfo, 0),
1174 PROC(PATHCONF, fhandle, pathconfres, 0), 2457 PROC(PATHCONF, getattr, pathconf, 0),
1175 PROC(COMMIT, commitargs, commitres, 5), 2458 PROC(COMMIT, commit, commit, 5),
1176}; 2459};
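Annotation: each table row is generated by the PROC() macro defined above; token pasting and stringization produce the full initializer, so PROC(GETATTR, getattr, getattr, 1) expands to the designated initializer below (types are the kernel's — shown for reading, not as standalone code). Rows like RMDIR reuse the lookup encoder and the setattr decoder because RMDIR3args is just a diropargs3 and RMDIR3res is a status plus wcc_data:

	[NFS3PROC_GETATTR] = {
		.p_proc    = NFS3PROC_GETATTR,
		.p_encode  = (kxdreproc_t)nfs3_xdr_enc_getattr3args,
		.p_decode  = (kxdrdproc_t)nfs3_xdr_dec_getattr3res,
		.p_arglen  = NFS3_getattrargs_sz,
		.p_replen  = NFS3_getattrres_sz,
		.p_timer   = 1,
		.p_statidx = NFS3PROC_GETATTR,
		.p_name    = "GETATTR",
	},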
1177 2460
1178struct rpc_version nfs_version3 = { 2461struct rpc_version nfs_version3 = {
@@ -1185,8 +2468,8 @@ struct rpc_version nfs_version3 = {
1185static struct rpc_procinfo nfs3_acl_procedures[] = { 2468static struct rpc_procinfo nfs3_acl_procedures[] = {
1186 [ACLPROC3_GETACL] = { 2469 [ACLPROC3_GETACL] = {
1187 .p_proc = ACLPROC3_GETACL, 2470 .p_proc = ACLPROC3_GETACL,
1188 .p_encode = (kxdrproc_t) nfs3_xdr_getaclargs, 2471 .p_encode = (kxdreproc_t)nfs3_xdr_enc_getacl3args,
1189 .p_decode = (kxdrproc_t) nfs3_xdr_getaclres, 2472 .p_decode = (kxdrdproc_t)nfs3_xdr_dec_getacl3res,
1190 .p_arglen = ACL3_getaclargs_sz, 2473 .p_arglen = ACL3_getaclargs_sz,
1191 .p_replen = ACL3_getaclres_sz, 2474 .p_replen = ACL3_getaclres_sz,
1192 .p_timer = 1, 2475 .p_timer = 1,
@@ -1194,8 +2477,8 @@ static struct rpc_procinfo nfs3_acl_procedures[] = {
1194 }, 2477 },
1195 [ACLPROC3_SETACL] = { 2478 [ACLPROC3_SETACL] = {
1196 .p_proc = ACLPROC3_SETACL, 2479 .p_proc = ACLPROC3_SETACL,
1197 .p_encode = (kxdrproc_t) nfs3_xdr_setaclargs, 2480 .p_encode = (kxdreproc_t)nfs3_xdr_enc_setacl3args,
1198 .p_decode = (kxdrproc_t) nfs3_xdr_setaclres, 2481 .p_decode = (kxdrdproc_t)nfs3_xdr_dec_setacl3res,
1199 .p_arglen = ACL3_setaclargs_sz, 2482 .p_arglen = ACL3_setaclargs_sz,
1200 .p_replen = ACL3_setaclres_sz, 2483 .p_replen = ACL3_setaclres_sz,
1201 .p_timer = 0, 2484 .p_timer = 0,
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 9fa496387fdf..7a7474073148 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -44,6 +44,7 @@ enum nfs4_client_state {
44 NFS4CLNT_RECLAIM_REBOOT, 44 NFS4CLNT_RECLAIM_REBOOT,
45 NFS4CLNT_RECLAIM_NOGRACE, 45 NFS4CLNT_RECLAIM_NOGRACE,
46 NFS4CLNT_DELEGRETURN, 46 NFS4CLNT_DELEGRETURN,
47 NFS4CLNT_LAYOUTRECALL,
47 NFS4CLNT_SESSION_RESET, 48 NFS4CLNT_SESSION_RESET,
48 NFS4CLNT_RECALL_SLOT, 49 NFS4CLNT_RECALL_SLOT,
49}; 50};
@@ -109,7 +110,7 @@ struct nfs_unique_id {
109struct nfs4_state_owner { 110struct nfs4_state_owner {
110 struct nfs_unique_id so_owner_id; 111 struct nfs_unique_id so_owner_id;
111 struct nfs_server *so_server; 112 struct nfs_server *so_server;
112 struct rb_node so_client_node; 113 struct rb_node so_server_node;
113 114
114 struct rpc_cred *so_cred; /* Associated cred */ 115 struct rpc_cred *so_cred; /* Associated cred */
115 116
@@ -227,12 +228,6 @@ struct nfs4_state_maintenance_ops {
227extern const struct dentry_operations nfs4_dentry_operations; 228extern const struct dentry_operations nfs4_dentry_operations;
228extern const struct inode_operations nfs4_dir_inode_operations; 229extern const struct inode_operations nfs4_dir_inode_operations;
229 230
230/* inode.c */
231extern ssize_t nfs4_getxattr(struct dentry *, const char *, void *, size_t);
232extern int nfs4_setxattr(struct dentry *, const char *, const void *, size_t, int);
233extern ssize_t nfs4_listxattr(struct dentry *, char *, size_t);
234
235
236/* nfs4proc.c */ 231/* nfs4proc.c */
237extern int nfs4_proc_setclientid(struct nfs_client *, u32, unsigned short, struct rpc_cred *, struct nfs4_setclientid_res *); 232extern int nfs4_proc_setclientid(struct nfs_client *, u32, unsigned short, struct rpc_cred *, struct nfs4_setclientid_res *);
238extern int nfs4_proc_setclientid_confirm(struct nfs_client *, struct nfs4_setclientid_res *arg, struct rpc_cred *); 233extern int nfs4_proc_setclientid_confirm(struct nfs_client *, struct nfs4_setclientid_res *arg, struct rpc_cred *);
@@ -241,11 +236,12 @@ extern int nfs4_proc_async_renew(struct nfs_client *, struct rpc_cred *);
241extern int nfs4_proc_renew(struct nfs_client *, struct rpc_cred *); 236extern int nfs4_proc_renew(struct nfs_client *, struct rpc_cred *);
242extern int nfs4_init_clientid(struct nfs_client *, struct rpc_cred *); 237extern int nfs4_init_clientid(struct nfs_client *, struct rpc_cred *);
243extern int nfs41_init_clientid(struct nfs_client *, struct rpc_cred *); 238extern int nfs41_init_clientid(struct nfs_client *, struct rpc_cred *);
244extern int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, int wait); 239extern int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, int wait, bool roc);
245extern int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle); 240extern int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle);
246extern int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name, 241extern int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name,
247 struct nfs4_fs_locations *fs_locations, struct page *page); 242 struct nfs4_fs_locations *fs_locations, struct page *page);
248extern void nfs4_release_lockowner(const struct nfs4_lock_state *); 243extern void nfs4_release_lockowner(const struct nfs4_lock_state *);
244extern const struct xattr_handler *nfs4_xattr_handlers[];
249 245
250#if defined(CONFIG_NFS_V4_1) 246#if defined(CONFIG_NFS_V4_1)
251static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *server) 247static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *server)
@@ -331,7 +327,6 @@ extern void nfs_free_seqid(struct nfs_seqid *seqid);
331extern const nfs4_stateid zero_stateid; 327extern const nfs4_stateid zero_stateid;
332 328
333/* nfs4xdr.c */ 329/* nfs4xdr.c */
334extern __be32 *nfs4_decode_dirent(struct xdr_stream *, struct nfs_entry *, struct nfs_server *, int);
335extern struct rpc_procinfo nfs4_procedures[]; 330extern struct rpc_procinfo nfs4_procedures[];
336 331
337struct nfs4_mount_data; 332struct nfs4_mount_data;
diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index 2e92f0d8d654..23f930caf1e2 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -82,7 +82,7 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo,
82{ 82{
83 struct nfs4_file_layout_dsaddr *dsaddr; 83 struct nfs4_file_layout_dsaddr *dsaddr;
84 int status = -EINVAL; 84 int status = -EINVAL;
85 struct nfs_server *nfss = NFS_SERVER(lo->inode); 85 struct nfs_server *nfss = NFS_SERVER(lo->plh_inode);
86 86
87 dprintk("--> %s\n", __func__); 87 dprintk("--> %s\n", __func__);
88 88
@@ -101,7 +101,7 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo,
101 /* find and reference the deviceid */ 101 /* find and reference the deviceid */
102 dsaddr = nfs4_fl_find_get_deviceid(nfss->nfs_client, id); 102 dsaddr = nfs4_fl_find_get_deviceid(nfss->nfs_client, id);
103 if (dsaddr == NULL) { 103 if (dsaddr == NULL) {
104 dsaddr = get_device_info(lo->inode, id); 104 dsaddr = get_device_info(lo->plh_inode, id);
105 if (dsaddr == NULL) 105 if (dsaddr == NULL)
106 goto out; 106 goto out;
107 } 107 }
@@ -243,7 +243,7 @@ filelayout_alloc_lseg(struct pnfs_layout_hdr *layoutid,
243static void 243static void
244filelayout_free_lseg(struct pnfs_layout_segment *lseg) 244filelayout_free_lseg(struct pnfs_layout_segment *lseg)
245{ 245{
246 struct nfs_server *nfss = NFS_SERVER(lseg->layout->inode); 246 struct nfs_server *nfss = NFS_SERVER(lseg->pls_layout->plh_inode);
247 struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg); 247 struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg);
248 248
249 dprintk("--> %s\n", __func__); 249 dprintk("--> %s\n", __func__);
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 4435e5e1f904..9d992b0346e3 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -49,6 +49,7 @@
49#include <linux/mount.h> 49#include <linux/mount.h>
50#include <linux/module.h> 50#include <linux/module.h>
51#include <linux/sunrpc/bc_xprt.h> 51#include <linux/sunrpc/bc_xprt.h>
52#include <linux/xattr.h>
52 53
53#include "nfs4_fs.h" 54#include "nfs4_fs.h"
54#include "delegation.h" 55#include "delegation.h"
@@ -355,9 +356,9 @@ nfs4_free_slot(struct nfs4_slot_table *tbl, struct nfs4_slot *free_slot)
355} 356}
356 357
357/* 358/*
358 * Signal state manager thread if session is drained 359 * Signal state manager thread if session fore channel is drained
359 */ 360 */
360static void nfs41_check_drain_session_complete(struct nfs4_session *ses) 361static void nfs4_check_drain_fc_complete(struct nfs4_session *ses)
361{ 362{
362 struct rpc_task *task; 363 struct rpc_task *task;
363 364
@@ -371,8 +372,20 @@ static void nfs41_check_drain_session_complete(struct nfs4_session *ses)
371 if (ses->fc_slot_table.highest_used_slotid != -1) 372 if (ses->fc_slot_table.highest_used_slotid != -1)
372 return; 373 return;
373 374
374 dprintk("%s COMPLETE: Session Drained\n", __func__); 375 dprintk("%s COMPLETE: Session Fore Channel Drained\n", __func__);
375 complete(&ses->complete); 376 complete(&ses->fc_slot_table.complete);
377}
378
379/*
380 * Signal state manager thread if session back channel is drained
381 */
382void nfs4_check_drain_bc_complete(struct nfs4_session *ses)
383{
384 if (!test_bit(NFS4_SESSION_DRAINING, &ses->session_state) ||
385 ses->bc_slot_table.highest_used_slotid != -1)
386 return;
387 dprintk("%s COMPLETE: Session Back Channel Drained\n", __func__);
388 complete(&ses->bc_slot_table.complete);
376} 389}
377 390
378static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res) 391static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res)
@@ -389,7 +402,7 @@ static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res)
389 402
390 spin_lock(&tbl->slot_tbl_lock); 403 spin_lock(&tbl->slot_tbl_lock);
391 nfs4_free_slot(tbl, res->sr_slot); 404 nfs4_free_slot(tbl, res->sr_slot);
392 nfs41_check_drain_session_complete(res->sr_session); 405 nfs4_check_drain_fc_complete(res->sr_session);
393 spin_unlock(&tbl->slot_tbl_lock); 406 spin_unlock(&tbl->slot_tbl_lock);
394 res->sr_slot = NULL; 407 res->sr_slot = NULL;
395} 408}
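
The two hunks above split the session-wide drain signal into one completion per slot table: the slot-freeing path signals only when highest_used_slotid falls back to -1, and the fore and back channels can now drain independently. A rough userspace analogue using a condition variable per table (the kernel's completion API differs in detail; this is only a sketch):

/* Sketch: one "drained" signal per slot table, fired when the last
 * in-flight slot is released. Pthreads stand in for the kernel's
 * spinlock + completion.
 */
#include <pthread.h>
#include <stdio.h>

struct slot_table {
	pthread_mutex_t lock;
	pthread_cond_t drained;
	int highest_used_slotid;	/* -1 means no slots in use */
};

static void free_slot(struct slot_table *tbl)
{
	pthread_mutex_lock(&tbl->lock);
	tbl->highest_used_slotid--;	/* simplified bookkeeping */
	if (tbl->highest_used_slotid == -1) {
		printf("table drained, waking waiter\n");
		pthread_cond_broadcast(&tbl->drained);
	}
	pthread_mutex_unlock(&tbl->lock);
}

static void *io_thread(void *arg)
{
	struct slot_table *tbl = arg;
	free_slot(tbl);		/* pretend one RPC completed */
	return NULL;
}

int main(void)
{
	struct slot_table fc = {
		PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER, 0
	};
	pthread_t t;

	pthread_create(&t, NULL, io_thread, &fc);

	/* Waiter side, as in nfs4_begin_drain_session(): block until
	 * the table reports no slots in use. */
	pthread_mutex_lock(&fc.lock);
	while (fc.highest_used_slotid != -1)
		pthread_cond_wait(&fc.drained, &fc.lock);
	pthread_mutex_unlock(&fc.lock);

	pthread_join(t, NULL);
	puts("fore channel drained");
	return 0;
}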
@@ -1826,6 +1839,8 @@ struct nfs4_closedata {
1826 struct nfs_closeres res; 1839 struct nfs_closeres res;
1827 struct nfs_fattr fattr; 1840 struct nfs_fattr fattr;
1828 unsigned long timestamp; 1841 unsigned long timestamp;
1842 bool roc;
1843 u32 roc_barrier;
1829}; 1844};
1830 1845
1831static void nfs4_free_closedata(void *data) 1846static void nfs4_free_closedata(void *data)
@@ -1833,6 +1848,8 @@ static void nfs4_free_closedata(void *data)
1833 struct nfs4_closedata *calldata = data; 1848 struct nfs4_closedata *calldata = data;
1834 struct nfs4_state_owner *sp = calldata->state->owner; 1849 struct nfs4_state_owner *sp = calldata->state->owner;
1835 1850
1851 if (calldata->roc)
1852 pnfs_roc_release(calldata->state->inode);
1836 nfs4_put_open_state(calldata->state); 1853 nfs4_put_open_state(calldata->state);
1837 nfs_free_seqid(calldata->arg.seqid); 1854 nfs_free_seqid(calldata->arg.seqid);
1838 nfs4_put_state_owner(sp); 1855 nfs4_put_state_owner(sp);
@@ -1865,6 +1882,9 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
1865 */ 1882 */
1866 switch (task->tk_status) { 1883 switch (task->tk_status) {
1867 case 0: 1884 case 0:
1885 if (calldata->roc)
1886 pnfs_roc_set_barrier(state->inode,
1887 calldata->roc_barrier);
1868 nfs_set_open_stateid(state, &calldata->res.stateid, 0); 1888 nfs_set_open_stateid(state, &calldata->res.stateid, 0);
1869 renew_lease(server, calldata->timestamp); 1889 renew_lease(server, calldata->timestamp);
1870 nfs4_close_clear_stateid_flags(state, 1890 nfs4_close_clear_stateid_flags(state,
@@ -1917,8 +1937,15 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
1917 return; 1937 return;
1918 } 1938 }
1919 1939
1920 if (calldata->arg.fmode == 0) 1940 if (calldata->arg.fmode == 0) {
1921 task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CLOSE]; 1941 task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CLOSE];
1942 if (calldata->roc &&
1943 pnfs_roc_drain(calldata->inode, &calldata->roc_barrier)) {
1944 rpc_sleep_on(&NFS_SERVER(calldata->inode)->roc_rpcwaitq,
1945 task, NULL);
1946 return;
1947 }
1948 }
1922 1949
1923 nfs_fattr_init(calldata->res.fattr); 1950 nfs_fattr_init(calldata->res.fattr);
1924 calldata->timestamp = jiffies; 1951 calldata->timestamp = jiffies;
@@ -1946,7 +1973,7 @@ static const struct rpc_call_ops nfs4_close_ops = {
1946 * 1973 *
1947 * NOTE: Caller must be holding the sp->so_owner semaphore! 1974 * NOTE: Caller must be holding the sp->so_owner semaphore!
1948 */ 1975 */
1949int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, int wait) 1976int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, int wait, bool roc)
1950{ 1977{
1951 struct nfs_server *server = NFS_SERVER(state->inode); 1978 struct nfs_server *server = NFS_SERVER(state->inode);
1952 struct nfs4_closedata *calldata; 1979 struct nfs4_closedata *calldata;
@@ -1981,11 +2008,12 @@ int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, i
1981 calldata->res.fattr = &calldata->fattr; 2008 calldata->res.fattr = &calldata->fattr;
1982 calldata->res.seqid = calldata->arg.seqid; 2009 calldata->res.seqid = calldata->arg.seqid;
1983 calldata->res.server = server; 2010 calldata->res.server = server;
2011 calldata->roc = roc;
1984 path_get(path); 2012 path_get(path);
1985 calldata->path = *path; 2013 calldata->path = *path;
1986 2014
1987 msg.rpc_argp = &calldata->arg, 2015 msg.rpc_argp = &calldata->arg;
1988 msg.rpc_resp = &calldata->res, 2016 msg.rpc_resp = &calldata->res;
1989 task_setup_data.callback_data = calldata; 2017 task_setup_data.callback_data = calldata;
1990 task = rpc_run_task(&task_setup_data); 2018 task = rpc_run_task(&task_setup_data);
1991 if (IS_ERR(task)) 2019 if (IS_ERR(task))
@@ -1998,6 +2026,8 @@ int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, i
1998out_free_calldata: 2026out_free_calldata:
1999 kfree(calldata); 2027 kfree(calldata);
2000out: 2028out:
2029 if (roc)
2030 pnfs_roc_release(state->inode);
2001 nfs4_put_open_state(state); 2031 nfs4_put_open_state(state);
2002 nfs4_put_state_owner(sp); 2032 nfs4_put_state_owner(sp);
2003 return status; 2033 return status;
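
Every exit from the extended nfs4_do_close() must balance the pnfs_roc() reference its caller took: on success the RPC release callback drops it via nfs4_free_closedata(), and the error path above drops it directly. A small sketch of that single-release discipline (all names invented for illustration):

/* Sketch: a caller-acquired reference that must be released on exactly
 * one path -- either by the completion callback or by the error path
 * of the submit function.
 */
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct closedata {
	bool roc;		/* did the caller take the layout ref? */
};

static void roc_release(void)
{
	puts("roc reference released");
}

static void free_closedata(struct closedata *d)
{
	if (d->roc)		/* success path: callback releases */
		roc_release();
	free(d);
}

static int do_close(bool roc, bool simulate_alloc_failure)
{
	struct closedata *d = NULL;

	if (!simulate_alloc_failure)
		d = malloc(sizeof(*d));
	if (d == NULL)
		goto out;	/* error path: release here instead */
	d->roc = roc;
	free_closedata(d);	/* stands in for the RPC completing */
	return 0;
out:
	if (roc)
		roc_release();
	return -1;
}

int main(void)
{
	do_close(true, false);	/* released by the "callback" */
	do_close(true, true);	/* released by the error path */
	return 0;
}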
@@ -2486,6 +2516,7 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
2486 path = &ctx->path; 2516 path = &ctx->path;
2487 fmode = ctx->mode; 2517 fmode = ctx->mode;
2488 } 2518 }
2519 sattr->ia_mode &= ~current_umask();
2489 state = nfs4_do_open(dir, path, fmode, flags, sattr, cred); 2520 state = nfs4_do_open(dir, path, fmode, flags, sattr, cred);
2490 d_drop(dentry); 2521 d_drop(dentry);
2491 if (IS_ERR(state)) { 2522 if (IS_ERR(state)) {
@@ -2816,6 +2847,8 @@ static int nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry,
2816{ 2847{
2817 struct nfs4_exception exception = { }; 2848 struct nfs4_exception exception = { };
2818 int err; 2849 int err;
2850
2851 sattr->ia_mode &= ~current_umask();
2819 do { 2852 do {
2820 err = nfs4_handle_exception(NFS_SERVER(dir), 2853 err = nfs4_handle_exception(NFS_SERVER(dir),
2821 _nfs4_proc_mkdir(dir, dentry, sattr), 2854 _nfs4_proc_mkdir(dir, dentry, sattr),
@@ -2916,6 +2949,8 @@ static int nfs4_proc_mknod(struct inode *dir, struct dentry *dentry,
2916{ 2949{
2917 struct nfs4_exception exception = { }; 2950 struct nfs4_exception exception = { };
2918 int err; 2951 int err;
2952
2953 sattr->ia_mode &= ~current_umask();
2919 do { 2954 do {
2920 err = nfs4_handle_exception(NFS_SERVER(dir), 2955 err = nfs4_handle_exception(NFS_SERVER(dir),
2921 _nfs4_proc_mknod(dir, dentry, sattr, rdev), 2956 _nfs4_proc_mknod(dir, dentry, sattr, rdev),
@@ -3478,6 +3513,7 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program,
3478 struct nfs4_setclientid setclientid = { 3513 struct nfs4_setclientid setclientid = {
3479 .sc_verifier = &sc_verifier, 3514 .sc_verifier = &sc_verifier,
3480 .sc_prog = program, 3515 .sc_prog = program,
3516 .sc_cb_ident = clp->cl_cb_ident,
3481 }; 3517 };
3482 struct rpc_message msg = { 3518 struct rpc_message msg = {
3483 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETCLIENTID], 3519 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETCLIENTID],
@@ -3517,7 +3553,7 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program,
3517 if (signalled()) 3553 if (signalled())
3518 break; 3554 break;
3519 if (loop++ & 1) 3555 if (loop++ & 1)
3520 ssleep(clp->cl_lease_time + 1); 3556 ssleep(clp->cl_lease_time / HZ + 1);
3521 else 3557 else
3522 if (++clp->cl_id_uniquifier == 0) 3558 if (++clp->cl_id_uniquifier == 0)
3523 break; 3559 break;
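
The ssleep() change above is a unit fix: cl_lease_time is stored in jiffies while ssleep() takes seconds, so the old call could sleep for hours instead of one lease period. A worked example of the magnitude, with an assumed tick rate:

/* Worked example: cl_lease_time is in jiffies, ssleep() wants seconds.
 * With HZ = 250 and a 90-second lease, the old code asked for a
 * 22501-second nap instead of 91 seconds.
 */
#include <stdio.h>

#define HZ 250			/* assumed kernel tick rate */

int main(void)
{
	unsigned long lease_seconds = 90;
	unsigned long cl_lease_time = lease_seconds * HZ;	/* jiffies */

	printf("old: ssleep(%lu) -> ~%.2f hours\n",
	       cl_lease_time + 1, (cl_lease_time + 1) / 3600.0);
	printf("new: ssleep(%lu) -> %lu seconds\n",
	       cl_lease_time / HZ + 1, cl_lease_time / HZ + 1);
	return 0;
}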
@@ -3663,8 +3699,8 @@ static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, co
3663 data->rpc_status = 0; 3699 data->rpc_status = 0;
3664 3700
3665 task_setup_data.callback_data = data; 3701 task_setup_data.callback_data = data;
3666 msg.rpc_argp = &data->args, 3702 msg.rpc_argp = &data->args;
3667 msg.rpc_resp = &data->res, 3703 msg.rpc_resp = &data->res;
3668 task = rpc_run_task(&task_setup_data); 3704 task = rpc_run_task(&task_setup_data);
3669 if (IS_ERR(task)) 3705 if (IS_ERR(task))
3670 return PTR_ERR(task); 3706 return PTR_ERR(task);
@@ -3743,6 +3779,7 @@ static int _nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock
3743 goto out; 3779 goto out;
3744 lsp = request->fl_u.nfs4_fl.owner; 3780 lsp = request->fl_u.nfs4_fl.owner;
3745 arg.lock_owner.id = lsp->ls_id.id; 3781 arg.lock_owner.id = lsp->ls_id.id;
3782 arg.lock_owner.s_dev = server->s_dev;
3746 status = nfs4_call_sync(server, &msg, &arg, &res, 1); 3783 status = nfs4_call_sync(server, &msg, &arg, &res, 1);
3747 switch (status) { 3784 switch (status) {
3748 case 0: 3785 case 0:
@@ -3908,8 +3945,8 @@ static struct rpc_task *nfs4_do_unlck(struct file_lock *fl,
3908 return ERR_PTR(-ENOMEM); 3945 return ERR_PTR(-ENOMEM);
3909 } 3946 }
3910 3947
3911 msg.rpc_argp = &data->arg, 3948 msg.rpc_argp = &data->arg;
3912 msg.rpc_resp = &data->res, 3949 msg.rpc_resp = &data->res;
3913 task_setup_data.callback_data = data; 3950 task_setup_data.callback_data = data;
3914 return rpc_run_task(&task_setup_data); 3951 return rpc_run_task(&task_setup_data);
3915} 3952}
@@ -3988,6 +4025,7 @@ static struct nfs4_lockdata *nfs4_alloc_lockdata(struct file_lock *fl,
3988 p->arg.lock_stateid = &lsp->ls_stateid; 4025 p->arg.lock_stateid = &lsp->ls_stateid;
3989 p->arg.lock_owner.clientid = server->nfs_client->cl_clientid; 4026 p->arg.lock_owner.clientid = server->nfs_client->cl_clientid;
3990 p->arg.lock_owner.id = lsp->ls_id.id; 4027 p->arg.lock_owner.id = lsp->ls_id.id;
4028 p->arg.lock_owner.s_dev = server->s_dev;
3991 p->res.lock_seqid = p->arg.lock_seqid; 4029 p->res.lock_seqid = p->arg.lock_seqid;
3992 p->lsp = lsp; 4030 p->lsp = lsp;
3993 p->server = server; 4031 p->server = server;
@@ -4145,8 +4183,8 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *f
4145 data->arg.reclaim = NFS_LOCK_RECLAIM; 4183 data->arg.reclaim = NFS_LOCK_RECLAIM;
4146 task_setup_data.callback_ops = &nfs4_recover_lock_ops; 4184 task_setup_data.callback_ops = &nfs4_recover_lock_ops;
4147 } 4185 }
4148 msg.rpc_argp = &data->arg, 4186 msg.rpc_argp = &data->arg;
4149 msg.rpc_resp = &data->res, 4187 msg.rpc_resp = &data->res;
4150 task_setup_data.callback_data = data; 4188 task_setup_data.callback_data = data;
4151 task = rpc_run_task(&task_setup_data); 4189 task = rpc_run_task(&task_setup_data);
4152 if (IS_ERR(task)) 4190 if (IS_ERR(task))
@@ -4392,48 +4430,43 @@ void nfs4_release_lockowner(const struct nfs4_lock_state *lsp)
4392 return; 4430 return;
4393 args->lock_owner.clientid = server->nfs_client->cl_clientid; 4431 args->lock_owner.clientid = server->nfs_client->cl_clientid;
4394 args->lock_owner.id = lsp->ls_id.id; 4432 args->lock_owner.id = lsp->ls_id.id;
4433 args->lock_owner.s_dev = server->s_dev;
4395 msg.rpc_argp = args; 4434 msg.rpc_argp = args;
4396 rpc_call_async(server->client, &msg, 0, &nfs4_release_lockowner_ops, args); 4435 rpc_call_async(server->client, &msg, 0, &nfs4_release_lockowner_ops, args);
4397} 4436}
4398 4437
4399#define XATTR_NAME_NFSV4_ACL "system.nfs4_acl" 4438#define XATTR_NAME_NFSV4_ACL "system.nfs4_acl"
4400 4439
4401int nfs4_setxattr(struct dentry *dentry, const char *key, const void *buf, 4440static int nfs4_xattr_set_nfs4_acl(struct dentry *dentry, const char *key,
4402 size_t buflen, int flags) 4441 const void *buf, size_t buflen,
4442 int flags, int type)
4403{ 4443{
4404 struct inode *inode = dentry->d_inode; 4444 if (strcmp(key, "") != 0)
4405 4445 return -EINVAL;
4406 if (strcmp(key, XATTR_NAME_NFSV4_ACL) != 0)
4407 return -EOPNOTSUPP;
4408 4446
4409 return nfs4_proc_set_acl(inode, buf, buflen); 4447 return nfs4_proc_set_acl(dentry->d_inode, buf, buflen);
4410} 4448}
4411 4449
4412/* The getxattr man page suggests returning -ENODATA for unknown attributes, 4450static int nfs4_xattr_get_nfs4_acl(struct dentry *dentry, const char *key,
4413 * and that's what we'll do for e.g. user attributes that haven't been set. 4451 void *buf, size_t buflen, int type)
4414 * But we'll follow ext2/ext3's lead by returning -EOPNOTSUPP for unsupported
4415 * attributes in kernel-managed attribute namespaces. */
4416ssize_t nfs4_getxattr(struct dentry *dentry, const char *key, void *buf,
4417 size_t buflen)
4418{ 4452{
4419 struct inode *inode = dentry->d_inode; 4453 if (strcmp(key, "") != 0)
4420 4454 return -EINVAL;
4421 if (strcmp(key, XATTR_NAME_NFSV4_ACL) != 0)
4422 return -EOPNOTSUPP;
4423 4455
4424 return nfs4_proc_get_acl(inode, buf, buflen); 4456 return nfs4_proc_get_acl(dentry->d_inode, buf, buflen);
4425} 4457}
4426 4458
4427ssize_t nfs4_listxattr(struct dentry *dentry, char *buf, size_t buflen) 4459static size_t nfs4_xattr_list_nfs4_acl(struct dentry *dentry, char *list,
4460 size_t list_len, const char *name,
4461 size_t name_len, int type)
4428{ 4462{
4429 size_t len = strlen(XATTR_NAME_NFSV4_ACL) + 1; 4463 size_t len = sizeof(XATTR_NAME_NFSV4_ACL);
4430 4464
4431 if (!nfs4_server_supports_acls(NFS_SERVER(dentry->d_inode))) 4465 if (!nfs4_server_supports_acls(NFS_SERVER(dentry->d_inode)))
4432 return 0; 4466 return 0;
4433 if (buf && buflen < len) 4467
4434 return -ERANGE; 4468 if (list && len <= list_len)
4435 if (buf) 4469 memcpy(list, XATTR_NAME_NFSV4_ACL, len);
4436 memcpy(buf, XATTR_NAME_NFSV4_ACL, len);
4437 return len; 4470 return len;
4438} 4471}
4439 4472
@@ -4486,6 +4519,25 @@ int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name,
4486 4519
4487#ifdef CONFIG_NFS_V4_1 4520#ifdef CONFIG_NFS_V4_1
4488/* 4521/*
 4522 * Check the exchange flags returned by the server: reject unknown flags,
 4523 * the combination of both PNFS and NON_PNFS set, and replies that set none
 4524 * of the NON_PNFS, PNFS, or DS flags.
4525 */
4526static int nfs4_check_cl_exchange_flags(u32 flags)
4527{
4528 if (flags & ~EXCHGID4_FLAG_MASK_R)
4529 goto out_inval;
4530 if ((flags & EXCHGID4_FLAG_USE_PNFS_MDS) &&
4531 (flags & EXCHGID4_FLAG_USE_NON_PNFS))
4532 goto out_inval;
4533 if (!(flags & (EXCHGID4_FLAG_MASK_PNFS)))
4534 goto out_inval;
4535 return NFS_OK;
4536out_inval:
4537 return -NFS4ERR_INVAL;
4538}
4539
4540/*
4489 * nfs4_proc_exchange_id() 4541 * nfs4_proc_exchange_id()
4490 * 4542 *
4491 * Since the clientid has expired, all compounds using sessions 4543 * Since the clientid has expired, all compounds using sessions
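
nfs4_check_cl_exchange_flags() above enforces three invariants on the server's EXCHANGE_ID reply. The same logic as a standalone program, with the flag values made up for the demo (the real masks live in the NFSv4.1 headers):

/* Standalone rendering of the three EXCHANGE_ID flag checks. Only the
 * logic matches the kernel; the constants are invented.
 */
#include <stdio.h>

#define EXCHGID4_FLAG_USE_NON_PNFS	0x00010000
#define EXCHGID4_FLAG_USE_PNFS_MDS	0x00020000
#define EXCHGID4_FLAG_USE_PNFS_DS	0x00040000
#define EXCHGID4_FLAG_MASK_PNFS		0x00070000
#define EXCHGID4_FLAG_MASK_R		0x00070003	/* all valid reply bits */

static int check_exchange_flags(unsigned int flags)
{
	if (flags & ~EXCHGID4_FLAG_MASK_R)
		return -1;	/* unknown bit in the reply */
	if ((flags & EXCHGID4_FLAG_USE_PNFS_MDS) &&
	    (flags & EXCHGID4_FLAG_USE_NON_PNFS))
		return -1;	/* MDS and NON_PNFS are exclusive */
	if (!(flags & EXCHGID4_FLAG_MASK_PNFS))
		return -1;	/* server must claim one pNFS role */
	return 0;
}

int main(void)
{
	printf("%d\n", check_exchange_flags(EXCHGID4_FLAG_USE_PNFS_MDS)); /* 0 */
	printf("%d\n", check_exchange_flags(EXCHGID4_FLAG_USE_PNFS_MDS |
					    EXCHGID4_FLAG_USE_NON_PNFS)); /* -1 */
	printf("%d\n", check_exchange_flags(0));                          /* -1 */
	return 0;
}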
@@ -4498,7 +4550,7 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred)
4498 nfs4_verifier verifier; 4550 nfs4_verifier verifier;
4499 struct nfs41_exchange_id_args args = { 4551 struct nfs41_exchange_id_args args = {
4500 .client = clp, 4552 .client = clp,
4501 .flags = clp->cl_exchange_flags, 4553 .flags = EXCHGID4_FLAG_SUPP_MOVED_REFER,
4502 }; 4554 };
4503 struct nfs41_exchange_id_res res = { 4555 struct nfs41_exchange_id_res res = {
4504 .client = clp, 4556 .client = clp,
@@ -4515,9 +4567,6 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred)
4515 dprintk("--> %s\n", __func__); 4567 dprintk("--> %s\n", __func__);
4516 BUG_ON(clp == NULL); 4568 BUG_ON(clp == NULL);
4517 4569
4518 /* Remove server-only flags */
4519 args.flags &= ~EXCHGID4_FLAG_CONFIRMED_R;
4520
4521 p = (u32 *)verifier.data; 4570 p = (u32 *)verifier.data;
4522 *p++ = htonl((u32)clp->cl_boot_time.tv_sec); 4571 *p++ = htonl((u32)clp->cl_boot_time.tv_sec);
4523 *p = htonl((u32)clp->cl_boot_time.tv_nsec); 4572 *p = htonl((u32)clp->cl_boot_time.tv_nsec);
@@ -4543,6 +4592,7 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred)
4543 break; 4592 break;
4544 } 4593 }
4545 4594
4595 status = nfs4_check_cl_exchange_flags(clp->cl_exchange_flags);
4546 dprintk("<-- %s status= %d\n", __func__, status); 4596 dprintk("<-- %s status= %d\n", __func__, status);
4547 return status; 4597 return status;
4548} 4598}
@@ -4776,17 +4826,17 @@ struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp)
4776 if (!session) 4826 if (!session)
4777 return NULL; 4827 return NULL;
4778 4828
4779 init_completion(&session->complete);
4780
4781 tbl = &session->fc_slot_table; 4829 tbl = &session->fc_slot_table;
4782 tbl->highest_used_slotid = -1; 4830 tbl->highest_used_slotid = -1;
4783 spin_lock_init(&tbl->slot_tbl_lock); 4831 spin_lock_init(&tbl->slot_tbl_lock);
4784 rpc_init_priority_wait_queue(&tbl->slot_tbl_waitq, "ForeChannel Slot table"); 4832 rpc_init_priority_wait_queue(&tbl->slot_tbl_waitq, "ForeChannel Slot table");
4833 init_completion(&tbl->complete);
4785 4834
4786 tbl = &session->bc_slot_table; 4835 tbl = &session->bc_slot_table;
4787 tbl->highest_used_slotid = -1; 4836 tbl->highest_used_slotid = -1;
4788 spin_lock_init(&tbl->slot_tbl_lock); 4837 spin_lock_init(&tbl->slot_tbl_lock);
4789 rpc_init_wait_queue(&tbl->slot_tbl_waitq, "BackChannel Slot table"); 4838 rpc_init_wait_queue(&tbl->slot_tbl_waitq, "BackChannel Slot table");
4839 init_completion(&tbl->complete);
4790 4840
4791 session->session_state = 1<<NFS4_SESSION_INITING; 4841 session->session_state = 1<<NFS4_SESSION_INITING;
4792 4842
@@ -5280,13 +5330,23 @@ static void
5280nfs4_layoutget_prepare(struct rpc_task *task, void *calldata) 5330nfs4_layoutget_prepare(struct rpc_task *task, void *calldata)
5281{ 5331{
5282 struct nfs4_layoutget *lgp = calldata; 5332 struct nfs4_layoutget *lgp = calldata;
5283 struct inode *ino = lgp->args.inode; 5333 struct nfs_server *server = NFS_SERVER(lgp->args.inode);
5284 struct nfs_server *server = NFS_SERVER(ino);
5285 5334
5286 dprintk("--> %s\n", __func__); 5335 dprintk("--> %s\n", __func__);
 5336 /* Note there is a race here, where a CB_LAYOUTRECALL can come in
5337 * right now covering the LAYOUTGET we are about to send.
5338 * However, that is not so catastrophic, and there seems
5339 * to be no way to prevent it completely.
5340 */
5287 if (nfs4_setup_sequence(server, &lgp->args.seq_args, 5341 if (nfs4_setup_sequence(server, &lgp->args.seq_args,
5288 &lgp->res.seq_res, 0, task)) 5342 &lgp->res.seq_res, 0, task))
5289 return; 5343 return;
5344 if (pnfs_choose_layoutget_stateid(&lgp->args.stateid,
5345 NFS_I(lgp->args.inode)->layout,
5346 lgp->args.ctx->state)) {
5347 rpc_exit(task, NFS4_OK);
5348 return;
5349 }
5290 rpc_call_start(task); 5350 rpc_call_start(task);
5291} 5351}
5292 5352
@@ -5313,7 +5373,6 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)
5313 return; 5373 return;
5314 } 5374 }
5315 } 5375 }
5316 lgp->status = task->tk_status;
5317 dprintk("<-- %s\n", __func__); 5376 dprintk("<-- %s\n", __func__);
5318} 5377}
5319 5378
@@ -5322,7 +5381,6 @@ static void nfs4_layoutget_release(void *calldata)
5322 struct nfs4_layoutget *lgp = calldata; 5381 struct nfs4_layoutget *lgp = calldata;
5323 5382
5324 dprintk("--> %s\n", __func__); 5383 dprintk("--> %s\n", __func__);
5325 put_layout_hdr(lgp->args.inode);
5326 if (lgp->res.layout.buf != NULL) 5384 if (lgp->res.layout.buf != NULL)
5327 free_page((unsigned long) lgp->res.layout.buf); 5385 free_page((unsigned long) lgp->res.layout.buf);
5328 put_nfs_open_context(lgp->args.ctx); 5386 put_nfs_open_context(lgp->args.ctx);
@@ -5367,13 +5425,10 @@ int nfs4_proc_layoutget(struct nfs4_layoutget *lgp)
5367 if (IS_ERR(task)) 5425 if (IS_ERR(task))
5368 return PTR_ERR(task); 5426 return PTR_ERR(task);
5369 status = nfs4_wait_for_completion_rpc_task(task); 5427 status = nfs4_wait_for_completion_rpc_task(task);
5370 if (status != 0) 5428 if (status == 0)
5371 goto out; 5429 status = task->tk_status;
5372 status = lgp->status; 5430 if (status == 0)
5373 if (status != 0) 5431 status = pnfs_layout_process(lgp);
5374 goto out;
5375 status = pnfs_layout_process(lgp);
5376out:
5377 rpc_put_task(task); 5432 rpc_put_task(task);
5378 dprintk("<-- %s status=%d\n", __func__, status); 5433 dprintk("<-- %s status=%d\n", __func__, status);
5379 return status; 5434 return status;
@@ -5504,9 +5559,10 @@ static const struct inode_operations nfs4_file_inode_operations = {
5504 .permission = nfs_permission, 5559 .permission = nfs_permission,
5505 .getattr = nfs_getattr, 5560 .getattr = nfs_getattr,
5506 .setattr = nfs_setattr, 5561 .setattr = nfs_setattr,
5507 .getxattr = nfs4_getxattr, 5562 .getxattr = generic_getxattr,
5508 .setxattr = nfs4_setxattr, 5563 .setxattr = generic_setxattr,
5509 .listxattr = nfs4_listxattr, 5564 .listxattr = generic_listxattr,
5565 .removexattr = generic_removexattr,
5510}; 5566};
5511 5567
5512const struct nfs_rpc_ops nfs_v4_clientops = { 5568const struct nfs_rpc_ops nfs_v4_clientops = {
@@ -5551,6 +5607,18 @@ const struct nfs_rpc_ops nfs_v4_clientops = {
5551 .open_context = nfs4_atomic_open, 5607 .open_context = nfs4_atomic_open,
5552}; 5608};
5553 5609
5610static const struct xattr_handler nfs4_xattr_nfs4_acl_handler = {
5611 .prefix = XATTR_NAME_NFSV4_ACL,
5612 .list = nfs4_xattr_list_nfs4_acl,
5613 .get = nfs4_xattr_get_nfs4_acl,
5614 .set = nfs4_xattr_set_nfs4_acl,
5615};
5616
5617const struct xattr_handler *nfs4_xattr_handlers[] = {
5618 &nfs4_xattr_nfs4_acl_handler,
5619 NULL
5620};
5621
5554/* 5622/*
5555 * Local variables: 5623 * Local variables:
5556 * c-basic-offset: 8 5624 * c-basic-offset: 8
diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c
index 72b6c580af13..402143d75fc5 100644
--- a/fs/nfs/nfs4renewd.c
+++ b/fs/nfs/nfs4renewd.c
@@ -63,9 +63,14 @@ nfs4_renew_state(struct work_struct *work)
63 63
64 ops = clp->cl_mvops->state_renewal_ops; 64 ops = clp->cl_mvops->state_renewal_ops;
65 dprintk("%s: start\n", __func__); 65 dprintk("%s: start\n", __func__);
66 /* Are there any active superblocks? */ 66
67 if (list_empty(&clp->cl_superblocks)) 67 rcu_read_lock();
68 if (list_empty(&clp->cl_superblocks)) {
69 rcu_read_unlock();
68 goto out; 70 goto out;
71 }
72 rcu_read_unlock();
73
69 spin_lock(&clp->cl_lock); 74 spin_lock(&clp->cl_lock);
70 lease = clp->cl_lease_time; 75 lease = clp->cl_lease_time;
71 last = clp->cl_last_renewal; 76 last = clp->cl_last_renewal;
@@ -75,7 +80,7 @@ nfs4_renew_state(struct work_struct *work)
75 cred = ops->get_state_renewal_cred_locked(clp); 80 cred = ops->get_state_renewal_cred_locked(clp);
76 spin_unlock(&clp->cl_lock); 81 spin_unlock(&clp->cl_lock);
77 if (cred == NULL) { 82 if (cred == NULL) {
78 if (list_empty(&clp->cl_delegations)) { 83 if (!nfs_delegations_present(clp)) {
79 set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state); 84 set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
80 goto out; 85 goto out;
81 } 86 }
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index f575a3126737..2336d532cf66 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -105,14 +105,17 @@ static void nfs4_clear_machine_cred(struct nfs_client *clp)
105 put_rpccred(cred); 105 put_rpccred(cred);
106} 106}
107 107
108struct rpc_cred *nfs4_get_renew_cred_locked(struct nfs_client *clp) 108static struct rpc_cred *
109nfs4_get_renew_cred_server_locked(struct nfs_server *server)
109{ 110{
111 struct rpc_cred *cred = NULL;
110 struct nfs4_state_owner *sp; 112 struct nfs4_state_owner *sp;
111 struct rb_node *pos; 113 struct rb_node *pos;
112 struct rpc_cred *cred = NULL;
113 114
114 for (pos = rb_first(&clp->cl_state_owners); pos != NULL; pos = rb_next(pos)) { 115 for (pos = rb_first(&server->state_owners);
115 sp = rb_entry(pos, struct nfs4_state_owner, so_client_node); 116 pos != NULL;
117 pos = rb_next(pos)) {
118 sp = rb_entry(pos, struct nfs4_state_owner, so_server_node);
116 if (list_empty(&sp->so_states)) 119 if (list_empty(&sp->so_states))
117 continue; 120 continue;
118 cred = get_rpccred(sp->so_cred); 121 cred = get_rpccred(sp->so_cred);
@@ -121,6 +124,28 @@ struct rpc_cred *nfs4_get_renew_cred_locked(struct nfs_client *clp)
121 return cred; 124 return cred;
122} 125}
123 126
127/**
128 * nfs4_get_renew_cred_locked - Acquire credential for a renew operation
129 * @clp: client state handle
130 *
131 * Returns an rpc_cred with reference count bumped, or NULL.
132 * Caller must hold clp->cl_lock.
133 */
134struct rpc_cred *nfs4_get_renew_cred_locked(struct nfs_client *clp)
135{
136 struct rpc_cred *cred = NULL;
137 struct nfs_server *server;
138
139 rcu_read_lock();
140 list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
141 cred = nfs4_get_renew_cred_server_locked(server);
142 if (cred != NULL)
143 break;
144 }
145 rcu_read_unlock();
146 return cred;
147}
148
124#if defined(CONFIG_NFS_V4_1) 149#if defined(CONFIG_NFS_V4_1)
125 150
126static int nfs41_setup_state_renewal(struct nfs_client *clp) 151static int nfs41_setup_state_renewal(struct nfs_client *clp)
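
This is the recurring shape of the patch: a single walk of clp->cl_state_owners becomes a per-server *_server_locked() helper plus an RCU iteration over clp->cl_superblocks. A compressed illustration of the split, with plain arrays standing in for the rbtree and the RCU list:

/* Shape of the refactoring: "search everything under the client" turns
 * into "search one server (caller holds the lock)" plus an outer walk
 * over the client's servers.
 */
#include <stdio.h>
#include <stddef.h>

struct state_owner { const char *cred; };

struct server {
	struct state_owner owners[2];
	size_t nowners;
};

/* per-server helper: the caller is assumed to hold the client lock */
static const char *get_cred_server_locked(const struct server *s)
{
	return s->nowners ? s->owners[0].cred : NULL;
}

static const char *get_cred(const struct server *servers, size_t n)
{
	const char *cred = NULL;
	size_t i;

	/* outer walk; in the kernel this is list_for_each_entry_rcu() */
	for (i = 0; i < n && cred == NULL; i++)
		cred = get_cred_server_locked(&servers[i]);
	return cred;
}

int main(void)
{
	struct server servers[2] = {
		{ .nowners = 0 },
		{ .owners = { { "krb5-machine-cred" } }, .nowners = 1 },
	};

	printf("found: %s\n", get_cred(servers, 2));
	return 0;
}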
@@ -142,6 +167,11 @@ static int nfs41_setup_state_renewal(struct nfs_client *clp)
142 return status; 167 return status;
143} 168}
144 169
170/*
 171 * The back channel returns NFS4ERR_DELAY for new requests while
 172 * NFS4_SESSION_DRAINING is set, so there is no work to be done when
 173 * draining ends.
174 */
145static void nfs4_end_drain_session(struct nfs_client *clp) 175static void nfs4_end_drain_session(struct nfs_client *clp)
146{ 176{
147 struct nfs4_session *ses = clp->cl_session; 177 struct nfs4_session *ses = clp->cl_session;
@@ -165,22 +195,32 @@ static void nfs4_end_drain_session(struct nfs_client *clp)
165 } 195 }
166} 196}
167 197
168static int nfs4_begin_drain_session(struct nfs_client *clp) 198static int nfs4_wait_on_slot_tbl(struct nfs4_slot_table *tbl)
169{ 199{
170 struct nfs4_session *ses = clp->cl_session;
171 struct nfs4_slot_table *tbl = &ses->fc_slot_table;
172
173 spin_lock(&tbl->slot_tbl_lock); 200 spin_lock(&tbl->slot_tbl_lock);
174 set_bit(NFS4_SESSION_DRAINING, &ses->session_state);
175 if (tbl->highest_used_slotid != -1) { 201 if (tbl->highest_used_slotid != -1) {
176 INIT_COMPLETION(ses->complete); 202 INIT_COMPLETION(tbl->complete);
177 spin_unlock(&tbl->slot_tbl_lock); 203 spin_unlock(&tbl->slot_tbl_lock);
178 return wait_for_completion_interruptible(&ses->complete); 204 return wait_for_completion_interruptible(&tbl->complete);
179 } 205 }
180 spin_unlock(&tbl->slot_tbl_lock); 206 spin_unlock(&tbl->slot_tbl_lock);
181 return 0; 207 return 0;
182} 208}
183 209
210static int nfs4_begin_drain_session(struct nfs_client *clp)
211{
212 struct nfs4_session *ses = clp->cl_session;
213 int ret = 0;
214
215 set_bit(NFS4_SESSION_DRAINING, &ses->session_state);
216 /* back channel */
217 ret = nfs4_wait_on_slot_tbl(&ses->bc_slot_table);
218 if (ret)
219 return ret;
220 /* fore channel */
221 return nfs4_wait_on_slot_tbl(&ses->fc_slot_table);
222}
223
184int nfs41_init_clientid(struct nfs_client *clp, struct rpc_cred *cred) 224int nfs41_init_clientid(struct nfs_client *clp, struct rpc_cred *cred)
185{ 225{
186 int status; 226 int status;
@@ -192,6 +232,12 @@ int nfs41_init_clientid(struct nfs_client *clp, struct rpc_cred *cred)
192 status = nfs4_proc_create_session(clp); 232 status = nfs4_proc_create_session(clp);
193 if (status != 0) 233 if (status != 0)
194 goto out; 234 goto out;
235 status = nfs4_set_callback_sessionid(clp);
236 if (status != 0) {
237 printk(KERN_WARNING "Sessionid not set. No callback service\n");
238 nfs_callback_down(1);
239 status = 0;
240 }
195 nfs41_setup_state_renewal(clp); 241 nfs41_setup_state_renewal(clp);
196 nfs_mark_client_ready(clp, NFS_CS_READY); 242 nfs_mark_client_ready(clp, NFS_CS_READY);
197out: 243out:
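
nfs41_init_clientid() now treats a failed nfs4_set_callback_sessionid() as degraded service rather than a fatal error: warn, take the callback service down, and clear the status so session setup continues without a back channel. The control flow in isolation (every function here is an illustrative stub):

/* Control flow of the new fallback: an optional subsystem that fails
 * to initialize is torn down and the error swallowed, instead of
 * failing the whole setup.
 */
#include <stdio.h>

static int set_callback_sessionid(int fail)
{
	return fail ? -1 : 0;
}

static void callback_down(void)
{
	puts("callback service stopped");
}

static int init_clientid(int fail_callback)
{
	int status = set_callback_sessionid(fail_callback);

	if (status != 0) {
		fprintf(stderr, "Sessionid not set. No callback service\n");
		callback_down();
		status = 0;	/* degrade gracefully, don't fail init */
	}
	/* ...continue with state renewal setup... */
	return status;
}

int main(void)
{
	return init_clientid(1);	/* returns 0 despite the failure */
}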
@@ -210,28 +256,56 @@ struct rpc_cred *nfs4_get_exchange_id_cred(struct nfs_client *clp)
210 256
211#endif /* CONFIG_NFS_V4_1 */ 257#endif /* CONFIG_NFS_V4_1 */
212 258
213struct rpc_cred *nfs4_get_setclientid_cred(struct nfs_client *clp) 259static struct rpc_cred *
260nfs4_get_setclientid_cred_server(struct nfs_server *server)
214{ 261{
262 struct nfs_client *clp = server->nfs_client;
263 struct rpc_cred *cred = NULL;
215 struct nfs4_state_owner *sp; 264 struct nfs4_state_owner *sp;
216 struct rb_node *pos; 265 struct rb_node *pos;
266
267 spin_lock(&clp->cl_lock);
268 pos = rb_first(&server->state_owners);
269 if (pos != NULL) {
270 sp = rb_entry(pos, struct nfs4_state_owner, so_server_node);
271 cred = get_rpccred(sp->so_cred);
272 }
273 spin_unlock(&clp->cl_lock);
274 return cred;
275}
276
277/**
278 * nfs4_get_setclientid_cred - Acquire credential for a setclientid operation
279 * @clp: client state handle
280 *
281 * Returns an rpc_cred with reference count bumped, or NULL.
282 */
283struct rpc_cred *nfs4_get_setclientid_cred(struct nfs_client *clp)
284{
285 struct nfs_server *server;
217 struct rpc_cred *cred; 286 struct rpc_cred *cred;
218 287
219 spin_lock(&clp->cl_lock); 288 spin_lock(&clp->cl_lock);
220 cred = nfs4_get_machine_cred_locked(clp); 289 cred = nfs4_get_machine_cred_locked(clp);
290 spin_unlock(&clp->cl_lock);
221 if (cred != NULL) 291 if (cred != NULL)
222 goto out; 292 goto out;
223 pos = rb_first(&clp->cl_state_owners); 293
224 if (pos != NULL) { 294 rcu_read_lock();
225 sp = rb_entry(pos, struct nfs4_state_owner, so_client_node); 295 list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
226 cred = get_rpccred(sp->so_cred); 296 cred = nfs4_get_setclientid_cred_server(server);
297 if (cred != NULL)
298 break;
227 } 299 }
300 rcu_read_unlock();
301
228out: 302out:
229 spin_unlock(&clp->cl_lock);
230 return cred; 303 return cred;
231} 304}
232 305
233static void nfs_alloc_unique_id(struct rb_root *root, struct nfs_unique_id *new, 306static void nfs_alloc_unique_id_locked(struct rb_root *root,
234 __u64 minval, int maxbits) 307 struct nfs_unique_id *new,
308 __u64 minval, int maxbits)
235{ 309{
236 struct rb_node **p, *parent; 310 struct rb_node **p, *parent;
237 struct nfs_unique_id *pos; 311 struct nfs_unique_id *pos;
@@ -286,16 +360,15 @@ static void nfs_free_unique_id(struct rb_root *root, struct nfs_unique_id *id)
286} 360}
287 361
288static struct nfs4_state_owner * 362static struct nfs4_state_owner *
289nfs4_find_state_owner(struct nfs_server *server, struct rpc_cred *cred) 363nfs4_find_state_owner_locked(struct nfs_server *server, struct rpc_cred *cred)
290{ 364{
291 struct nfs_client *clp = server->nfs_client; 365 struct rb_node **p = &server->state_owners.rb_node,
292 struct rb_node **p = &clp->cl_state_owners.rb_node,
293 *parent = NULL; 366 *parent = NULL;
294 struct nfs4_state_owner *sp, *res = NULL; 367 struct nfs4_state_owner *sp, *res = NULL;
295 368
296 while (*p != NULL) { 369 while (*p != NULL) {
297 parent = *p; 370 parent = *p;
298 sp = rb_entry(parent, struct nfs4_state_owner, so_client_node); 371 sp = rb_entry(parent, struct nfs4_state_owner, so_server_node);
299 372
300 if (server < sp->so_server) { 373 if (server < sp->so_server) {
301 p = &parent->rb_left; 374 p = &parent->rb_left;
@@ -319,24 +392,17 @@ nfs4_find_state_owner(struct nfs_server *server, struct rpc_cred *cred)
319} 392}
320 393
321static struct nfs4_state_owner * 394static struct nfs4_state_owner *
322nfs4_insert_state_owner(struct nfs_client *clp, struct nfs4_state_owner *new) 395nfs4_insert_state_owner_locked(struct nfs4_state_owner *new)
323{ 396{
324 struct rb_node **p = &clp->cl_state_owners.rb_node, 397 struct nfs_server *server = new->so_server;
398 struct rb_node **p = &server->state_owners.rb_node,
325 *parent = NULL; 399 *parent = NULL;
326 struct nfs4_state_owner *sp; 400 struct nfs4_state_owner *sp;
327 401
328 while (*p != NULL) { 402 while (*p != NULL) {
329 parent = *p; 403 parent = *p;
330 sp = rb_entry(parent, struct nfs4_state_owner, so_client_node); 404 sp = rb_entry(parent, struct nfs4_state_owner, so_server_node);
331 405
332 if (new->so_server < sp->so_server) {
333 p = &parent->rb_left;
334 continue;
335 }
336 if (new->so_server > sp->so_server) {
337 p = &parent->rb_right;
338 continue;
339 }
340 if (new->so_cred < sp->so_cred) 406 if (new->so_cred < sp->so_cred)
341 p = &parent->rb_left; 407 p = &parent->rb_left;
342 else if (new->so_cred > sp->so_cred) 408 else if (new->so_cred > sp->so_cred)
@@ -346,18 +412,21 @@ nfs4_insert_state_owner(struct nfs_client *clp, struct nfs4_state_owner *new)
346 return sp; 412 return sp;
347 } 413 }
348 } 414 }
349 nfs_alloc_unique_id(&clp->cl_openowner_id, &new->so_owner_id, 1, 64); 415 nfs_alloc_unique_id_locked(&server->openowner_id,
350 rb_link_node(&new->so_client_node, parent, p); 416 &new->so_owner_id, 1, 64);
351 rb_insert_color(&new->so_client_node, &clp->cl_state_owners); 417 rb_link_node(&new->so_server_node, parent, p);
418 rb_insert_color(&new->so_server_node, &server->state_owners);
352 return new; 419 return new;
353} 420}
354 421
355static void 422static void
356nfs4_remove_state_owner(struct nfs_client *clp, struct nfs4_state_owner *sp) 423nfs4_remove_state_owner_locked(struct nfs4_state_owner *sp)
357{ 424{
358 if (!RB_EMPTY_NODE(&sp->so_client_node)) 425 struct nfs_server *server = sp->so_server;
359 rb_erase(&sp->so_client_node, &clp->cl_state_owners); 426
360 nfs_free_unique_id(&clp->cl_openowner_id, &sp->so_owner_id); 427 if (!RB_EMPTY_NODE(&sp->so_server_node))
428 rb_erase(&sp->so_server_node, &server->state_owners);
429 nfs_free_unique_id(&server->openowner_id, &sp->so_owner_id);
361} 430}
362 431
363/* 432/*
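
With each nfs_server owning its own state_owners tree, the insert comparator drops the so_server tiebreak and keys on the credential alone. A minimal unbalanced BST with the reduced key (the kernel uses an rbtree; only the keying is shown):

/* Minimal BST keyed only on the credential pointer, showing the
 * reduced comparator after the move to per-server trees.
 */
#include <stdio.h>
#include <stdint.h>

struct owner {
	const void *cred;	/* the sole key now */
	struct owner *left, *right;
};

static struct owner *insert(struct owner *root, struct owner *node)
{
	struct owner **p = &root;

	while (*p != NULL) {
		if ((uintptr_t)node->cred < (uintptr_t)(*p)->cred)
			p = &(*p)->left;
		else if ((uintptr_t)node->cred > (uintptr_t)(*p)->cred)
			p = &(*p)->right;
		else
			return *p;	/* existing owner wins */
	}
	*p = node;
	return node;
}

int main(void)
{
	static int cred_a, cred_b;
	struct owner a = { &cred_a, NULL, NULL };
	struct owner b = { &cred_b, NULL, NULL };
	struct owner dup = { &cred_a, NULL, NULL };
	struct owner *root = NULL;

	root = insert(root, &a);
	insert(root, &b);
	printf("dup found existing: %s\n",
	       insert(root, &dup) == &a ? "yes" : "no");
	return 0;
}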
@@ -386,23 +455,32 @@ nfs4_alloc_state_owner(void)
386static void 455static void
387nfs4_drop_state_owner(struct nfs4_state_owner *sp) 456nfs4_drop_state_owner(struct nfs4_state_owner *sp)
388{ 457{
389 if (!RB_EMPTY_NODE(&sp->so_client_node)) { 458 if (!RB_EMPTY_NODE(&sp->so_server_node)) {
390 struct nfs_client *clp = sp->so_server->nfs_client; 459 struct nfs_server *server = sp->so_server;
460 struct nfs_client *clp = server->nfs_client;
391 461
392 spin_lock(&clp->cl_lock); 462 spin_lock(&clp->cl_lock);
393 rb_erase(&sp->so_client_node, &clp->cl_state_owners); 463 rb_erase(&sp->so_server_node, &server->state_owners);
394 RB_CLEAR_NODE(&sp->so_client_node); 464 RB_CLEAR_NODE(&sp->so_server_node);
395 spin_unlock(&clp->cl_lock); 465 spin_unlock(&clp->cl_lock);
396 } 466 }
397} 467}
398 468
399struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server, struct rpc_cred *cred) 469/**
470 * nfs4_get_state_owner - Look up a state owner given a credential
471 * @server: nfs_server to search
472 * @cred: RPC credential to match
473 *
474 * Returns a pointer to an instantiated nfs4_state_owner struct, or NULL.
475 */
476struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server,
477 struct rpc_cred *cred)
400{ 478{
401 struct nfs_client *clp = server->nfs_client; 479 struct nfs_client *clp = server->nfs_client;
402 struct nfs4_state_owner *sp, *new; 480 struct nfs4_state_owner *sp, *new;
403 481
404 spin_lock(&clp->cl_lock); 482 spin_lock(&clp->cl_lock);
405 sp = nfs4_find_state_owner(server, cred); 483 sp = nfs4_find_state_owner_locked(server, cred);
406 spin_unlock(&clp->cl_lock); 484 spin_unlock(&clp->cl_lock);
407 if (sp != NULL) 485 if (sp != NULL)
408 return sp; 486 return sp;
@@ -412,7 +490,7 @@ struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server, struct
412 new->so_server = server; 490 new->so_server = server;
413 new->so_cred = cred; 491 new->so_cred = cred;
414 spin_lock(&clp->cl_lock); 492 spin_lock(&clp->cl_lock);
415 sp = nfs4_insert_state_owner(clp, new); 493 sp = nfs4_insert_state_owner_locked(new);
416 spin_unlock(&clp->cl_lock); 494 spin_unlock(&clp->cl_lock);
417 if (sp == new) 495 if (sp == new)
418 get_rpccred(cred); 496 get_rpccred(cred);
@@ -423,6 +501,11 @@ struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server, struct
423 return sp; 501 return sp;
424} 502}
425 503
504/**
505 * nfs4_put_state_owner - Release a nfs4_state_owner
506 * @sp: state owner data to release
507 *
508 */
426void nfs4_put_state_owner(struct nfs4_state_owner *sp) 509void nfs4_put_state_owner(struct nfs4_state_owner *sp)
427{ 510{
428 struct nfs_client *clp = sp->so_server->nfs_client; 511 struct nfs_client *clp = sp->so_server->nfs_client;
@@ -430,7 +513,7 @@ void nfs4_put_state_owner(struct nfs4_state_owner *sp)
430 513
431 if (!atomic_dec_and_lock(&sp->so_count, &clp->cl_lock)) 514 if (!atomic_dec_and_lock(&sp->so_count, &clp->cl_lock))
432 return; 515 return;
433 nfs4_remove_state_owner(clp, sp); 516 nfs4_remove_state_owner_locked(sp);
434 spin_unlock(&clp->cl_lock); 517 spin_unlock(&clp->cl_lock);
435 rpc_destroy_wait_queue(&sp->so_sequence.wait); 518 rpc_destroy_wait_queue(&sp->so_sequence.wait);
436 put_rpccred(cred); 519 put_rpccred(cred);
@@ -585,8 +668,11 @@ static void __nfs4_close(struct path *path, struct nfs4_state *state,
585 if (!call_close) { 668 if (!call_close) {
586 nfs4_put_open_state(state); 669 nfs4_put_open_state(state);
587 nfs4_put_state_owner(owner); 670 nfs4_put_state_owner(owner);
588 } else 671 } else {
589 nfs4_do_close(path, state, gfp_mask, wait); 672 bool roc = pnfs_roc(state->inode);
673
674 nfs4_do_close(path, state, gfp_mask, wait, roc);
675 }
590} 676}
591 677
592void nfs4_close_state(struct path *path, struct nfs4_state *state, fmode_t fmode) 678void nfs4_close_state(struct path *path, struct nfs4_state *state, fmode_t fmode)
@@ -633,7 +719,8 @@ __nfs4_find_lock_state(struct nfs4_state *state, fl_owner_t fl_owner, pid_t fl_p
633static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, fl_owner_t fl_owner, pid_t fl_pid, unsigned int type) 719static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, fl_owner_t fl_owner, pid_t fl_pid, unsigned int type)
634{ 720{
635 struct nfs4_lock_state *lsp; 721 struct nfs4_lock_state *lsp;
636 struct nfs_client *clp = state->owner->so_server->nfs_client; 722 struct nfs_server *server = state->owner->so_server;
723 struct nfs_client *clp = server->nfs_client;
637 724
638 lsp = kzalloc(sizeof(*lsp), GFP_NOFS); 725 lsp = kzalloc(sizeof(*lsp), GFP_NOFS);
639 if (lsp == NULL) 726 if (lsp == NULL)
@@ -657,7 +744,7 @@ static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, f
657 return NULL; 744 return NULL;
658 } 745 }
659 spin_lock(&clp->cl_lock); 746 spin_lock(&clp->cl_lock);
660 nfs_alloc_unique_id(&clp->cl_lockowner_id, &lsp->ls_id, 1, 64); 747 nfs_alloc_unique_id_locked(&server->lockowner_id, &lsp->ls_id, 1, 64);
661 spin_unlock(&clp->cl_lock); 748 spin_unlock(&clp->cl_lock);
662 INIT_LIST_HEAD(&lsp->ls_locks); 749 INIT_LIST_HEAD(&lsp->ls_locks);
663 return lsp; 750 return lsp;
@@ -665,10 +752,11 @@ static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, f
665 752
666static void nfs4_free_lock_state(struct nfs4_lock_state *lsp) 753static void nfs4_free_lock_state(struct nfs4_lock_state *lsp)
667{ 754{
668 struct nfs_client *clp = lsp->ls_state->owner->so_server->nfs_client; 755 struct nfs_server *server = lsp->ls_state->owner->so_server;
756 struct nfs_client *clp = server->nfs_client;
669 757
670 spin_lock(&clp->cl_lock); 758 spin_lock(&clp->cl_lock);
671 nfs_free_unique_id(&clp->cl_lockowner_id, &lsp->ls_id); 759 nfs_free_unique_id(&server->lockowner_id, &lsp->ls_id);
672 spin_unlock(&clp->cl_lock); 760 spin_unlock(&clp->cl_lock);
673 rpc_destroy_wait_queue(&lsp->ls_sequence.wait); 761 rpc_destroy_wait_queue(&lsp->ls_sequence.wait);
674 kfree(lsp); 762 kfree(lsp);
@@ -1114,15 +1202,19 @@ static void nfs4_clear_open_state(struct nfs4_state *state)
1114 } 1202 }
1115} 1203}
1116 1204
1117static void nfs4_state_mark_reclaim_helper(struct nfs_client *clp, int (*mark_reclaim)(struct nfs_client *clp, struct nfs4_state *state)) 1205static void nfs4_reset_seqids(struct nfs_server *server,
1206 int (*mark_reclaim)(struct nfs_client *clp, struct nfs4_state *state))
1118{ 1207{
1208 struct nfs_client *clp = server->nfs_client;
1119 struct nfs4_state_owner *sp; 1209 struct nfs4_state_owner *sp;
1120 struct rb_node *pos; 1210 struct rb_node *pos;
1121 struct nfs4_state *state; 1211 struct nfs4_state *state;
1122 1212
1123 /* Reset all sequence ids to zero */ 1213 spin_lock(&clp->cl_lock);
1124 for (pos = rb_first(&clp->cl_state_owners); pos != NULL; pos = rb_next(pos)) { 1214 for (pos = rb_first(&server->state_owners);
1125 sp = rb_entry(pos, struct nfs4_state_owner, so_client_node); 1215 pos != NULL;
1216 pos = rb_next(pos)) {
1217 sp = rb_entry(pos, struct nfs4_state_owner, so_server_node);
1126 sp->so_seqid.flags = 0; 1218 sp->so_seqid.flags = 0;
1127 spin_lock(&sp->so_lock); 1219 spin_lock(&sp->so_lock);
1128 list_for_each_entry(state, &sp->so_states, open_states) { 1220 list_for_each_entry(state, &sp->so_states, open_states) {
@@ -1131,6 +1223,18 @@ static void nfs4_state_mark_reclaim_helper(struct nfs_client *clp, int (*mark_re
1131 } 1223 }
1132 spin_unlock(&sp->so_lock); 1224 spin_unlock(&sp->so_lock);
1133 } 1225 }
1226 spin_unlock(&clp->cl_lock);
1227}
1228
1229static void nfs4_state_mark_reclaim_helper(struct nfs_client *clp,
1230 int (*mark_reclaim)(struct nfs_client *clp, struct nfs4_state *state))
1231{
1232 struct nfs_server *server;
1233
1234 rcu_read_lock();
1235 list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link)
1236 nfs4_reset_seqids(server, mark_reclaim);
1237 rcu_read_unlock();
1134} 1238}
1135 1239
1136static void nfs4_state_start_reclaim_reboot(struct nfs_client *clp) 1240static void nfs4_state_start_reclaim_reboot(struct nfs_client *clp)
@@ -1148,25 +1252,41 @@ static void nfs4_reclaim_complete(struct nfs_client *clp,
1148 (void)ops->reclaim_complete(clp); 1252 (void)ops->reclaim_complete(clp);
1149} 1253}
1150 1254
1151static int nfs4_state_clear_reclaim_reboot(struct nfs_client *clp) 1255static void nfs4_clear_reclaim_server(struct nfs_server *server)
1152{ 1256{
1257 struct nfs_client *clp = server->nfs_client;
1153 struct nfs4_state_owner *sp; 1258 struct nfs4_state_owner *sp;
1154 struct rb_node *pos; 1259 struct rb_node *pos;
1155 struct nfs4_state *state; 1260 struct nfs4_state *state;
1156 1261
1157 if (!test_and_clear_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state)) 1262 spin_lock(&clp->cl_lock);
1158 return 0; 1263 for (pos = rb_first(&server->state_owners);
1159 1264 pos != NULL;
1160 for (pos = rb_first(&clp->cl_state_owners); pos != NULL; pos = rb_next(pos)) { 1265 pos = rb_next(pos)) {
1161 sp = rb_entry(pos, struct nfs4_state_owner, so_client_node); 1266 sp = rb_entry(pos, struct nfs4_state_owner, so_server_node);
1162 spin_lock(&sp->so_lock); 1267 spin_lock(&sp->so_lock);
1163 list_for_each_entry(state, &sp->so_states, open_states) { 1268 list_for_each_entry(state, &sp->so_states, open_states) {
1164 if (!test_and_clear_bit(NFS_STATE_RECLAIM_REBOOT, &state->flags)) 1269 if (!test_and_clear_bit(NFS_STATE_RECLAIM_REBOOT,
1270 &state->flags))
1165 continue; 1271 continue;
1166 nfs4_state_mark_reclaim_nograce(clp, state); 1272 nfs4_state_mark_reclaim_nograce(clp, state);
1167 } 1273 }
1168 spin_unlock(&sp->so_lock); 1274 spin_unlock(&sp->so_lock);
1169 } 1275 }
1276 spin_unlock(&clp->cl_lock);
1277}
1278
1279static int nfs4_state_clear_reclaim_reboot(struct nfs_client *clp)
1280{
1281 struct nfs_server *server;
1282
1283 if (!test_and_clear_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state))
1284 return 0;
1285
1286 rcu_read_lock();
1287 list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link)
1288 nfs4_clear_reclaim_server(server);
1289 rcu_read_unlock();
1170 1290
1171 nfs_delegation_reap_unclaimed(clp); 1291 nfs_delegation_reap_unclaimed(clp);
1172 return 1; 1292 return 1;
@@ -1238,27 +1358,40 @@ static int nfs4_recovery_handle_error(struct nfs_client *clp, int error)
1238 1358
1239static int nfs4_do_reclaim(struct nfs_client *clp, const struct nfs4_state_recovery_ops *ops) 1359static int nfs4_do_reclaim(struct nfs_client *clp, const struct nfs4_state_recovery_ops *ops)
1240{ 1360{
1361 struct nfs4_state_owner *sp;
1362 struct nfs_server *server;
1241 struct rb_node *pos; 1363 struct rb_node *pos;
1242 int status = 0; 1364 int status = 0;
1243 1365
1244restart: 1366restart:
1245 spin_lock(&clp->cl_lock); 1367 rcu_read_lock();
1246 for (pos = rb_first(&clp->cl_state_owners); pos != NULL; pos = rb_next(pos)) { 1368 list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
1247 struct nfs4_state_owner *sp = rb_entry(pos, struct nfs4_state_owner, so_client_node); 1369 spin_lock(&clp->cl_lock);
1248 if (!test_and_clear_bit(ops->owner_flag_bit, &sp->so_flags)) 1370 for (pos = rb_first(&server->state_owners);
1249 continue; 1371 pos != NULL;
1250 atomic_inc(&sp->so_count); 1372 pos = rb_next(pos)) {
1251 spin_unlock(&clp->cl_lock); 1373 sp = rb_entry(pos,
1252 status = nfs4_reclaim_open_state(sp, ops); 1374 struct nfs4_state_owner, so_server_node);
1253 if (status < 0) { 1375 if (!test_and_clear_bit(ops->owner_flag_bit,
1254 set_bit(ops->owner_flag_bit, &sp->so_flags); 1376 &sp->so_flags))
1377 continue;
1378 atomic_inc(&sp->so_count);
1379 spin_unlock(&clp->cl_lock);
1380 rcu_read_unlock();
1381
1382 status = nfs4_reclaim_open_state(sp, ops);
1383 if (status < 0) {
1384 set_bit(ops->owner_flag_bit, &sp->so_flags);
1385 nfs4_put_state_owner(sp);
1386 return nfs4_recovery_handle_error(clp, status);
1387 }
1388
1255 nfs4_put_state_owner(sp); 1389 nfs4_put_state_owner(sp);
1256 return nfs4_recovery_handle_error(clp, status); 1390 goto restart;
1257 } 1391 }
1258 nfs4_put_state_owner(sp); 1392 spin_unlock(&clp->cl_lock);
1259 goto restart;
1260 } 1393 }
1261 spin_unlock(&clp->cl_lock); 1394 rcu_read_unlock();
1262 return status; 1395 return status;
1263} 1396}
1264 1397
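
nfs4_do_reclaim() keeps the classic drop-lock-and-restart scan, now nested under an RCU walk of the servers: pin a flagged owner, leave the lock and the RCU section before the blocking reclaim, then rescan from the top because the trees may have changed. The skeleton of that idiom, reduced to one flat list and single-threaded for brevity:

/* Skeleton of the drop-lock-and-restart scan: never hold the lock
 * across the blocking work, and rescan from the start afterwards
 * because the structure may have changed.
 */
#include <stdbool.h>
#include <stdio.h>

#define NOWNERS 3

struct owner {
	int id;
	bool needs_reclaim;
};

static void reclaim(struct owner *sp)
{
	/* blocking work happens with no locks held */
	printf("reclaiming owner %d\n", sp->id);
	sp->needs_reclaim = false;
}

int main(void)
{
	struct owner owners[NOWNERS] = {
		{ 1, true }, { 2, false }, { 3, true }
	};
	int i;

restart:
	/* lock(); */
	for (i = 0; i < NOWNERS; i++) {
		if (!owners[i].needs_reclaim)
			continue;
		/* pin owners[i], then unlock() before blocking */
		reclaim(&owners[i]);
		goto restart;	/* list may have changed: rescan */
	}
	/* unlock(); */
	return 0;
}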
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 9f1826b012e6..2ab8e5cb8f59 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -71,8 +71,8 @@ static int nfs4_stat_to_errno(int);
71/* lock,open owner id: 71/* lock,open owner id:
72 * we currently use size 2 (u64) out of (NFS4_OPAQUE_LIMIT >> 2) 72 * we currently use size 2 (u64) out of (NFS4_OPAQUE_LIMIT >> 2)
73 */ 73 */
74#define open_owner_id_maxsz (1 + 4) 74#define open_owner_id_maxsz (1 + 1 + 4)
75#define lock_owner_id_maxsz (1 + 4) 75#define lock_owner_id_maxsz (1 + 1 + 4)
76#define decode_lockowner_maxsz (1 + XDR_QUADLEN(IDMAP_NAMESZ)) 76#define decode_lockowner_maxsz (1 + XDR_QUADLEN(IDMAP_NAMESZ))
77#define compound_encode_hdr_maxsz (3 + (NFS4_MAXTAGLEN >> 2)) 77#define compound_encode_hdr_maxsz (3 + (NFS4_MAXTAGLEN >> 2))
78#define compound_decode_hdr_maxsz (3 + (NFS4_MAXTAGLEN >> 2)) 78#define compound_decode_hdr_maxsz (3 + (NFS4_MAXTAGLEN >> 2))
@@ -1088,10 +1088,11 @@ static void encode_lockowner(struct xdr_stream *xdr, const struct nfs_lowner *lo
1088{ 1088{
1089 __be32 *p; 1089 __be32 *p;
1090 1090
1091 p = reserve_space(xdr, 28); 1091 p = reserve_space(xdr, 32);
1092 p = xdr_encode_hyper(p, lowner->clientid); 1092 p = xdr_encode_hyper(p, lowner->clientid);
1093 *p++ = cpu_to_be32(16); 1093 *p++ = cpu_to_be32(20);
1094 p = xdr_encode_opaque_fixed(p, "lock id:", 8); 1094 p = xdr_encode_opaque_fixed(p, "lock id:", 8);
1095 *p++ = cpu_to_be32(lowner->s_dev);
1095 xdr_encode_hyper(p, lowner->id); 1096 xdr_encode_hyper(p, lowner->id);
1096} 1097}
1097 1098
@@ -1210,10 +1211,11 @@ static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_opena
1210 *p++ = cpu_to_be32(OP_OPEN); 1211 *p++ = cpu_to_be32(OP_OPEN);
1211 *p = cpu_to_be32(arg->seqid->sequence->counter); 1212 *p = cpu_to_be32(arg->seqid->sequence->counter);
1212 encode_share_access(xdr, arg->fmode); 1213 encode_share_access(xdr, arg->fmode);
1213 p = reserve_space(xdr, 28); 1214 p = reserve_space(xdr, 32);
1214 p = xdr_encode_hyper(p, arg->clientid); 1215 p = xdr_encode_hyper(p, arg->clientid);
1215 *p++ = cpu_to_be32(16); 1216 *p++ = cpu_to_be32(20);
1216 p = xdr_encode_opaque_fixed(p, "open id:", 8); 1217 p = xdr_encode_opaque_fixed(p, "open id:", 8);
1218 *p++ = cpu_to_be32(arg->server->s_dev);
1217 xdr_encode_hyper(p, arg->id); 1219 xdr_encode_hyper(p, arg->id);
1218} 1220}
1219 1221
@@ -1510,7 +1512,7 @@ encode_restorefh(struct xdr_stream *xdr, struct compound_hdr *hdr)
1510 hdr->replen += decode_restorefh_maxsz; 1512 hdr->replen += decode_restorefh_maxsz;
1511} 1513}
1512 1514
1513static int 1515static void
1514encode_setacl(struct xdr_stream *xdr, struct nfs_setaclargs *arg, struct compound_hdr *hdr) 1516encode_setacl(struct xdr_stream *xdr, struct nfs_setaclargs *arg, struct compound_hdr *hdr)
1515{ 1517{
1516 __be32 *p; 1518 __be32 *p;
@@ -1521,14 +1523,12 @@ encode_setacl(struct xdr_stream *xdr, struct nfs_setaclargs *arg, struct compoun
1521 p = reserve_space(xdr, 2*4); 1523 p = reserve_space(xdr, 2*4);
1522 *p++ = cpu_to_be32(1); 1524 *p++ = cpu_to_be32(1);
1523 *p = cpu_to_be32(FATTR4_WORD0_ACL); 1525 *p = cpu_to_be32(FATTR4_WORD0_ACL);
1524 if (arg->acl_len % 4) 1526 BUG_ON(arg->acl_len % 4);
1525 return -EINVAL;
1526 p = reserve_space(xdr, 4); 1527 p = reserve_space(xdr, 4);
1527 *p = cpu_to_be32(arg->acl_len); 1528 *p = cpu_to_be32(arg->acl_len);
1528 xdr_write_pages(xdr, arg->acl_pages, arg->acl_pgbase, arg->acl_len); 1529 xdr_write_pages(xdr, arg->acl_pages, arg->acl_pgbase, arg->acl_len);
1529 hdr->nops++; 1530 hdr->nops++;
1530 hdr->replen += decode_setacl_maxsz; 1531 hdr->replen += decode_setacl_maxsz;
1531 return 0;
1532} 1532}
1533 1533
1534static void 1534static void
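
encode_setacl() joins the other encoders in returning void: by the time encoding runs, a misaligned acl_len is a programming error rather than a condition callers can recover from, so the check becomes BUG_ON(). The same trade in miniature, with userspace assert() standing in:

/* Miniature of the encode_setacl() change: an alignment requirement
 * enforced by assertion rather than an error return, since encoders
 * now return void.
 */
#include <assert.h>
#include <stdio.h>
#include <stddef.h>

static void encode_acl(const unsigned char *acl, size_t acl_len)
{
	(void)acl;
	/* XDR opaques are padded to 4 bytes; callers must pre-align. */
	assert(acl_len % 4 == 0);
	printf("encoding %zu-byte ACL\n", acl_len);
}

int main(void)
{
	unsigned char acl[8] = { 0 };

	encode_acl(acl, sizeof(acl));	/* fine: 8 % 4 == 0 */
	return 0;
}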
@@ -1789,7 +1789,6 @@ encode_layoutget(struct xdr_stream *xdr,
1789 const struct nfs4_layoutget_args *args, 1789 const struct nfs4_layoutget_args *args,
1790 struct compound_hdr *hdr) 1790 struct compound_hdr *hdr)
1791{ 1791{
1792 nfs4_stateid stateid;
1793 __be32 *p; 1792 __be32 *p;
1794 1793
1795 p = reserve_space(xdr, 44 + NFS4_STATEID_SIZE); 1794 p = reserve_space(xdr, 44 + NFS4_STATEID_SIZE);
@@ -1800,9 +1799,7 @@ encode_layoutget(struct xdr_stream *xdr,
1800 p = xdr_encode_hyper(p, args->range.offset); 1799 p = xdr_encode_hyper(p, args->range.offset);
1801 p = xdr_encode_hyper(p, args->range.length); 1800 p = xdr_encode_hyper(p, args->range.length);
1802 p = xdr_encode_hyper(p, args->minlength); 1801 p = xdr_encode_hyper(p, args->minlength);
1803 pnfs_get_layout_stateid(&stateid, NFS_I(args->inode)->layout, 1802 p = xdr_encode_opaque_fixed(p, &args->stateid.data, NFS4_STATEID_SIZE);
1804 args->ctx->state);
1805 p = xdr_encode_opaque_fixed(p, &stateid.data, NFS4_STATEID_SIZE);
1806 *p = cpu_to_be32(args->maxcount); 1803 *p = cpu_to_be32(args->maxcount);
1807 1804
1808 dprintk("%s: 1st type:0x%x iomode:%d off:%lu len:%lu mc:%d\n", 1805 dprintk("%s: 1st type:0x%x iomode:%d off:%lu len:%lu mc:%d\n",
@@ -1833,393 +1830,362 @@ static u32 nfs4_xdr_minorversion(const struct nfs4_sequence_args *args)
 /*
  * Encode an ACCESS request
  */
-static int nfs4_xdr_enc_access(struct rpc_rqst *req, __be32 *p, const struct nfs4_accessargs *args)
+static void nfs4_xdr_enc_access(struct rpc_rqst *req, struct xdr_stream *xdr,
+				const struct nfs4_accessargs *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
 
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_sequence(&xdr, &args->seq_args, &hdr);
-	encode_putfh(&xdr, args->fh, &hdr);
-	encode_access(&xdr, args->access, &hdr);
-	encode_getfattr(&xdr, args->bitmask, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->fh, &hdr);
+	encode_access(xdr, args->access, &hdr);
+	encode_getfattr(xdr, args->bitmask, &hdr);
 	encode_nops(&hdr);
-	return 0;
 }
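Every wrapper in this hunk gets the same mechanical treatment: under the new SUNRPC calling convention the transport initialises the xdr_stream before invoking the per-procedure encoder, so each wrapper loses its local stream, its xdr_init_encode() call, and its always-zero return value. A sketch of the shared shape (nfs4_xdr_enc_example and example_args are made-up names, not part of the patch):

	/* The shape every converted encoder now follows (illustrative only). */
	static void nfs4_xdr_enc_example(struct rpc_rqst *req, struct xdr_stream *xdr,
					 const struct example_args *args)
	{
		struct compound_hdr hdr = {
			.minorversion = nfs4_xdr_minorversion(&args->seq_args),
		};

		encode_compound_hdr(xdr, req, &hdr);		/* COMPOUND tag */
		encode_sequence(xdr, &args->seq_args, &hdr);	/* v4.1 session op */
		/* ... per-procedure operations go here ... */
		encode_nops(&hdr);	/* backpatch the operation count */
	}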
 
 /*
  * Encode LOOKUP request
  */
-static int nfs4_xdr_enc_lookup(struct rpc_rqst *req, __be32 *p, const struct nfs4_lookup_arg *args)
+static void nfs4_xdr_enc_lookup(struct rpc_rqst *req, struct xdr_stream *xdr,
+				const struct nfs4_lookup_arg *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
 
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_sequence(&xdr, &args->seq_args, &hdr);
-	encode_putfh(&xdr, args->dir_fh, &hdr);
-	encode_lookup(&xdr, args->name, &hdr);
-	encode_getfh(&xdr, &hdr);
-	encode_getfattr(&xdr, args->bitmask, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->dir_fh, &hdr);
+	encode_lookup(xdr, args->name, &hdr);
+	encode_getfh(xdr, &hdr);
+	encode_getfattr(xdr, args->bitmask, &hdr);
 	encode_nops(&hdr);
-	return 0;
 }
 
 /*
  * Encode LOOKUP_ROOT request
  */
-static int nfs4_xdr_enc_lookup_root(struct rpc_rqst *req, __be32 *p, const struct nfs4_lookup_root_arg *args)
+static void nfs4_xdr_enc_lookup_root(struct rpc_rqst *req,
+				     struct xdr_stream *xdr,
+				     const struct nfs4_lookup_root_arg *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
 
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_sequence(&xdr, &args->seq_args, &hdr);
-	encode_putrootfh(&xdr, &hdr);
-	encode_getfh(&xdr, &hdr);
-	encode_getfattr(&xdr, args->bitmask, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putrootfh(xdr, &hdr);
+	encode_getfh(xdr, &hdr);
+	encode_getfattr(xdr, args->bitmask, &hdr);
 	encode_nops(&hdr);
-	return 0;
 }
 
 /*
  * Encode REMOVE request
  */
-static int nfs4_xdr_enc_remove(struct rpc_rqst *req, __be32 *p, const struct nfs_removeargs *args)
+static void nfs4_xdr_enc_remove(struct rpc_rqst *req, struct xdr_stream *xdr,
+				const struct nfs_removeargs *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
 
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_sequence(&xdr, &args->seq_args, &hdr);
-	encode_putfh(&xdr, args->fh, &hdr);
-	encode_remove(&xdr, &args->name, &hdr);
-	encode_getfattr(&xdr, args->bitmask, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->fh, &hdr);
+	encode_remove(xdr, &args->name, &hdr);
+	encode_getfattr(xdr, args->bitmask, &hdr);
 	encode_nops(&hdr);
-	return 0;
 }
 
 /*
  * Encode RENAME request
  */
-static int nfs4_xdr_enc_rename(struct rpc_rqst *req, __be32 *p, const struct nfs_renameargs *args)
+static void nfs4_xdr_enc_rename(struct rpc_rqst *req, struct xdr_stream *xdr,
+				const struct nfs_renameargs *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
 
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_sequence(&xdr, &args->seq_args, &hdr);
-	encode_putfh(&xdr, args->old_dir, &hdr);
-	encode_savefh(&xdr, &hdr);
-	encode_putfh(&xdr, args->new_dir, &hdr);
-	encode_rename(&xdr, args->old_name, args->new_name, &hdr);
-	encode_getfattr(&xdr, args->bitmask, &hdr);
-	encode_restorefh(&xdr, &hdr);
-	encode_getfattr(&xdr, args->bitmask, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->old_dir, &hdr);
+	encode_savefh(xdr, &hdr);
+	encode_putfh(xdr, args->new_dir, &hdr);
+	encode_rename(xdr, args->old_name, args->new_name, &hdr);
+	encode_getfattr(xdr, args->bitmask, &hdr);
+	encode_restorefh(xdr, &hdr);
+	encode_getfattr(xdr, args->bitmask, &hdr);
 	encode_nops(&hdr);
-	return 0;
 }
 
 /*
  * Encode LINK request
  */
-static int nfs4_xdr_enc_link(struct rpc_rqst *req, __be32 *p, const struct nfs4_link_arg *args)
+static void nfs4_xdr_enc_link(struct rpc_rqst *req, struct xdr_stream *xdr,
+			      const struct nfs4_link_arg *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
 
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_sequence(&xdr, &args->seq_args, &hdr);
-	encode_putfh(&xdr, args->fh, &hdr);
-	encode_savefh(&xdr, &hdr);
-	encode_putfh(&xdr, args->dir_fh, &hdr);
-	encode_link(&xdr, args->name, &hdr);
-	encode_getfattr(&xdr, args->bitmask, &hdr);
-	encode_restorefh(&xdr, &hdr);
-	encode_getfattr(&xdr, args->bitmask, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->fh, &hdr);
+	encode_savefh(xdr, &hdr);
+	encode_putfh(xdr, args->dir_fh, &hdr);
+	encode_link(xdr, args->name, &hdr);
+	encode_getfattr(xdr, args->bitmask, &hdr);
+	encode_restorefh(xdr, &hdr);
+	encode_getfattr(xdr, args->bitmask, &hdr);
 	encode_nops(&hdr);
-	return 0;
 }
 
 /*
  * Encode CREATE request
  */
-static int nfs4_xdr_enc_create(struct rpc_rqst *req, __be32 *p, const struct nfs4_create_arg *args)
+static void nfs4_xdr_enc_create(struct rpc_rqst *req, struct xdr_stream *xdr,
+				const struct nfs4_create_arg *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
 
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_sequence(&xdr, &args->seq_args, &hdr);
-	encode_putfh(&xdr, args->dir_fh, &hdr);
-	encode_savefh(&xdr, &hdr);
-	encode_create(&xdr, args, &hdr);
-	encode_getfh(&xdr, &hdr);
-	encode_getfattr(&xdr, args->bitmask, &hdr);
-	encode_restorefh(&xdr, &hdr);
-	encode_getfattr(&xdr, args->bitmask, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->dir_fh, &hdr);
+	encode_savefh(xdr, &hdr);
+	encode_create(xdr, args, &hdr);
+	encode_getfh(xdr, &hdr);
+	encode_getfattr(xdr, args->bitmask, &hdr);
+	encode_restorefh(xdr, &hdr);
+	encode_getfattr(xdr, args->bitmask, &hdr);
 	encode_nops(&hdr);
-	return 0;
 }
 
 /*
  * Encode SYMLINK request
  */
-static int nfs4_xdr_enc_symlink(struct rpc_rqst *req, __be32 *p, const struct nfs4_create_arg *args)
+static void nfs4_xdr_enc_symlink(struct rpc_rqst *req, struct xdr_stream *xdr,
+				 const struct nfs4_create_arg *args)
 {
-	return nfs4_xdr_enc_create(req, p, args);
+	nfs4_xdr_enc_create(req, xdr, args);
 }
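RENAME, LINK and CREATE above all use the same filehandle choreography, sketched here for reference (not part of the patch): SAVEFH parks the first directory's filehandle so a later RESTOREFH can bring it back for a second GETATTR, letting one round trip refresh both directories' attributes.

	/*
	 * PUTFH(dir A) -> SAVEFH -> PUTFH(dir B) -> op -> GETATTR(dir B)
	 *              -> RESTOREFH -> GETATTR(dir A)
	 */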
 
 /*
  * Encode GETATTR request
  */
-static int nfs4_xdr_enc_getattr(struct rpc_rqst *req, __be32 *p, const struct nfs4_getattr_arg *args)
+static void nfs4_xdr_enc_getattr(struct rpc_rqst *req, struct xdr_stream *xdr,
+				 const struct nfs4_getattr_arg *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
 
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_sequence(&xdr, &args->seq_args, &hdr);
-	encode_putfh(&xdr, args->fh, &hdr);
-	encode_getfattr(&xdr, args->bitmask, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->fh, &hdr);
+	encode_getfattr(xdr, args->bitmask, &hdr);
 	encode_nops(&hdr);
-	return 0;
 }
 
 /*
  * Encode a CLOSE request
  */
-static int nfs4_xdr_enc_close(struct rpc_rqst *req, __be32 *p, struct nfs_closeargs *args)
+static void nfs4_xdr_enc_close(struct rpc_rqst *req, struct xdr_stream *xdr,
+			       struct nfs_closeargs *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
 
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_sequence(&xdr, &args->seq_args, &hdr);
-	encode_putfh(&xdr, args->fh, &hdr);
-	encode_close(&xdr, args, &hdr);
-	encode_getfattr(&xdr, args->bitmask, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->fh, &hdr);
+	encode_close(xdr, args, &hdr);
+	encode_getfattr(xdr, args->bitmask, &hdr);
 	encode_nops(&hdr);
-	return 0;
 }
 
 /*
  * Encode an OPEN request
  */
-static int nfs4_xdr_enc_open(struct rpc_rqst *req, __be32 *p, struct nfs_openargs *args)
+static void nfs4_xdr_enc_open(struct rpc_rqst *req, struct xdr_stream *xdr,
+			      struct nfs_openargs *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
 
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_sequence(&xdr, &args->seq_args, &hdr);
-	encode_putfh(&xdr, args->fh, &hdr);
-	encode_savefh(&xdr, &hdr);
-	encode_open(&xdr, args, &hdr);
-	encode_getfh(&xdr, &hdr);
-	encode_getfattr(&xdr, args->bitmask, &hdr);
-	encode_restorefh(&xdr, &hdr);
-	encode_getfattr(&xdr, args->bitmask, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->fh, &hdr);
+	encode_savefh(xdr, &hdr);
+	encode_open(xdr, args, &hdr);
+	encode_getfh(xdr, &hdr);
+	encode_getfattr(xdr, args->bitmask, &hdr);
+	encode_restorefh(xdr, &hdr);
+	encode_getfattr(xdr, args->bitmask, &hdr);
 	encode_nops(&hdr);
-	return 0;
 }
 
 /*
  * Encode an OPEN_CONFIRM request
  */
-static int nfs4_xdr_enc_open_confirm(struct rpc_rqst *req, __be32 *p, struct nfs_open_confirmargs *args)
+static void nfs4_xdr_enc_open_confirm(struct rpc_rqst *req,
+				      struct xdr_stream *xdr,
+				      struct nfs_open_confirmargs *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.nops   = 0,
 	};
 
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_putfh(&xdr, args->fh, &hdr);
-	encode_open_confirm(&xdr, args, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_putfh(xdr, args->fh, &hdr);
+	encode_open_confirm(xdr, args, &hdr);
 	encode_nops(&hdr);
-	return 0;
 }
 
 /*
  * Encode an OPEN request with no attributes.
  */
-static int nfs4_xdr_enc_open_noattr(struct rpc_rqst *req, __be32 *p, struct nfs_openargs *args)
+static void nfs4_xdr_enc_open_noattr(struct rpc_rqst *req,
+				     struct xdr_stream *xdr,
+				     struct nfs_openargs *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
 
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_sequence(&xdr, &args->seq_args, &hdr);
-	encode_putfh(&xdr, args->fh, &hdr);
-	encode_open(&xdr, args, &hdr);
-	encode_getfattr(&xdr, args->bitmask, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->fh, &hdr);
+	encode_open(xdr, args, &hdr);
+	encode_getfattr(xdr, args->bitmask, &hdr);
 	encode_nops(&hdr);
-	return 0;
 }
 
 /*
  * Encode an OPEN_DOWNGRADE request
  */
-static int nfs4_xdr_enc_open_downgrade(struct rpc_rqst *req, __be32 *p, struct nfs_closeargs *args)
+static void nfs4_xdr_enc_open_downgrade(struct rpc_rqst *req,
+					struct xdr_stream *xdr,
+					struct nfs_closeargs *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
 
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_sequence(&xdr, &args->seq_args, &hdr);
-	encode_putfh(&xdr, args->fh, &hdr);
-	encode_open_downgrade(&xdr, args, &hdr);
-	encode_getfattr(&xdr, args->bitmask, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->fh, &hdr);
+	encode_open_downgrade(xdr, args, &hdr);
+	encode_getfattr(xdr, args->bitmask, &hdr);
 	encode_nops(&hdr);
-	return 0;
 }
 
 /*
  * Encode a LOCK request
  */
-static int nfs4_xdr_enc_lock(struct rpc_rqst *req, __be32 *p, struct nfs_lock_args *args)
+static void nfs4_xdr_enc_lock(struct rpc_rqst *req, struct xdr_stream *xdr,
+			      struct nfs_lock_args *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
 
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_sequence(&xdr, &args->seq_args, &hdr);
-	encode_putfh(&xdr, args->fh, &hdr);
-	encode_lock(&xdr, args, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->fh, &hdr);
+	encode_lock(xdr, args, &hdr);
 	encode_nops(&hdr);
-	return 0;
 }
 
 /*
  * Encode a LOCKT request
  */
-static int nfs4_xdr_enc_lockt(struct rpc_rqst *req, __be32 *p, struct nfs_lockt_args *args)
+static void nfs4_xdr_enc_lockt(struct rpc_rqst *req, struct xdr_stream *xdr,
+			       struct nfs_lockt_args *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
 
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_sequence(&xdr, &args->seq_args, &hdr);
-	encode_putfh(&xdr, args->fh, &hdr);
-	encode_lockt(&xdr, args, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->fh, &hdr);
+	encode_lockt(xdr, args, &hdr);
 	encode_nops(&hdr);
-	return 0;
 }
 
 /*
  * Encode a LOCKU request
  */
-static int nfs4_xdr_enc_locku(struct rpc_rqst *req, __be32 *p, struct nfs_locku_args *args)
+static void nfs4_xdr_enc_locku(struct rpc_rqst *req, struct xdr_stream *xdr,
+			       struct nfs_locku_args *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
 
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_sequence(&xdr, &args->seq_args, &hdr);
-	encode_putfh(&xdr, args->fh, &hdr);
-	encode_locku(&xdr, args, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->fh, &hdr);
+	encode_locku(xdr, args, &hdr);
 	encode_nops(&hdr);
-	return 0;
 }
 
-static int nfs4_xdr_enc_release_lockowner(struct rpc_rqst *req, __be32 *p, struct nfs_release_lockowner_args *args)
+static void nfs4_xdr_enc_release_lockowner(struct rpc_rqst *req,
+					   struct xdr_stream *xdr,
+					   struct nfs_release_lockowner_args *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = 0,
 	};
 
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_release_lockowner(&xdr, &args->lock_owner, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_release_lockowner(xdr, &args->lock_owner, &hdr);
 	encode_nops(&hdr);
-	return 0;
 }
 
 /*
  * Encode a READLINK request
  */
-static int nfs4_xdr_enc_readlink(struct rpc_rqst *req, __be32 *p, const struct nfs4_readlink *args)
+static void nfs4_xdr_enc_readlink(struct rpc_rqst *req, struct xdr_stream *xdr,
+				  const struct nfs4_readlink *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
 
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_sequence(&xdr, &args->seq_args, &hdr);
-	encode_putfh(&xdr, args->fh, &hdr);
-	encode_readlink(&xdr, args, req, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->fh, &hdr);
+	encode_readlink(xdr, args, req, &hdr);
 
 	xdr_inline_pages(&req->rq_rcv_buf, hdr.replen << 2, args->pages,
 			args->pgbase, args->pglen);
 	encode_nops(&hdr);
-	return 0;
 }
 
 /*
  * Encode a READDIR request
  */
-static int nfs4_xdr_enc_readdir(struct rpc_rqst *req, __be32 *p, const struct nfs4_readdir_arg *args)
+static void nfs4_xdr_enc_readdir(struct rpc_rqst *req, struct xdr_stream *xdr,
+				 const struct nfs4_readdir_arg *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
 
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_sequence(&xdr, &args->seq_args, &hdr);
-	encode_putfh(&xdr, args->fh, &hdr);
-	encode_readdir(&xdr, args, req, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->fh, &hdr);
+	encode_readdir(xdr, args, req, &hdr);
 
 	xdr_inline_pages(&req->rq_rcv_buf, hdr.replen << 2, args->pages,
 			 args->pgbase, args->count);
@@ -2227,428 +2193,387 @@ static int nfs4_xdr_enc_readdir(struct rpc_rqst *req, __be32 *p, const struct nf
 		__func__, hdr.replen << 2, args->pages,
 		args->pgbase, args->count);
 	encode_nops(&hdr);
-	return 0;
 }
 
 /*
  * Encode a READ request
  */
-static int nfs4_xdr_enc_read(struct rpc_rqst *req, __be32 *p, struct nfs_readargs *args)
+static void nfs4_xdr_enc_read(struct rpc_rqst *req, struct xdr_stream *xdr,
+			      struct nfs_readargs *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
 
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_sequence(&xdr, &args->seq_args, &hdr);
-	encode_putfh(&xdr, args->fh, &hdr);
-	encode_read(&xdr, args, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->fh, &hdr);
+	encode_read(xdr, args, &hdr);
 
 	xdr_inline_pages(&req->rq_rcv_buf, hdr.replen << 2,
 			 args->pages, args->pgbase, args->count);
 	req->rq_rcv_buf.flags |= XDRBUF_READ;
 	encode_nops(&hdr);
-	return 0;
 }
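For READ the reply buffer is split the same way, plus XDRBUF_READ is set on rq_rcv_buf; as I read it, this marks the buffer as carrying bulk READ payload so a transport that supports direct data placement (e.g. RPC/RDMA) can steer the bytes straight into args->pages. Sketch of the resulting layout (illustrative only):

	/*
	 * rq_rcv_buf for a READ reply (sketch):
	 *   head  : COMPOUND status, op results, READ eof + count words
	 *   pages : the READ payload itself (args->pages)
	 *   tail  : any bytes following the payload
	 */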
 
 /*
  * Encode an SETATTR request
  */
-static int nfs4_xdr_enc_setattr(struct rpc_rqst *req, __be32 *p, struct nfs_setattrargs *args)
+static void nfs4_xdr_enc_setattr(struct rpc_rqst *req, struct xdr_stream *xdr,
+				 struct nfs_setattrargs *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
 
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_sequence(&xdr, &args->seq_args, &hdr);
-	encode_putfh(&xdr, args->fh, &hdr);
-	encode_setattr(&xdr, args, args->server, &hdr);
-	encode_getfattr(&xdr, args->bitmask, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->fh, &hdr);
+	encode_setattr(xdr, args, args->server, &hdr);
+	encode_getfattr(xdr, args->bitmask, &hdr);
 	encode_nops(&hdr);
-	return 0;
 }
 
 /*
  * Encode a GETACL request
  */
-static int
-nfs4_xdr_enc_getacl(struct rpc_rqst *req, __be32 *p,
-		struct nfs_getaclargs *args)
+static void nfs4_xdr_enc_getacl(struct rpc_rqst *req, struct xdr_stream *xdr,
+				struct nfs_getaclargs *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
 	uint32_t replen;
 
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_sequence(&xdr, &args->seq_args, &hdr);
-	encode_putfh(&xdr, args->fh, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->fh, &hdr);
 	replen = hdr.replen + op_decode_hdr_maxsz + nfs4_fattr_bitmap_maxsz + 1;
-	encode_getattr_two(&xdr, FATTR4_WORD0_ACL, 0, &hdr);
+	encode_getattr_two(xdr, FATTR4_WORD0_ACL, 0, &hdr);
 
 	xdr_inline_pages(&req->rq_rcv_buf, replen << 2,
 		args->acl_pages, args->acl_pgbase, args->acl_len);
 	encode_nops(&hdr);
-	return 0;
 }
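GETACL snapshots replen before encoding the GETATTR so it knows where the attribute payload will start in the reply: the extra op_decode_hdr_maxsz + nfs4_fattr_bitmap_maxsz + 1 words cover the op header, the attribute bitmap, and the attribute-length word, after which the variable-length ACL opaque is received directly into args->acl_pages. In outline (values in XDR words; sketch, not part of the patch):

	/*
	 * replen = reply-so-far
	 *        + op header (op_decode_hdr_maxsz)
	 *        + attr bitmap (nfs4_fattr_bitmap_maxsz)
	 *        + 1 attribute-length word
	 * The ACL opaque bytes then land in args->acl_pages
	 * via xdr_inline_pages().
	 */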
 
 /*
  * Encode a WRITE request
  */
-static int nfs4_xdr_enc_write(struct rpc_rqst *req, __be32 *p, struct nfs_writeargs *args)
+static void nfs4_xdr_enc_write(struct rpc_rqst *req, struct xdr_stream *xdr,
+			       struct nfs_writeargs *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
 
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_sequence(&xdr, &args->seq_args, &hdr);
-	encode_putfh(&xdr, args->fh, &hdr);
-	encode_write(&xdr, args, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->fh, &hdr);
+	encode_write(xdr, args, &hdr);
 	req->rq_snd_buf.flags |= XDRBUF_WRITE;
-	encode_getfattr(&xdr, args->bitmask, &hdr);
+	encode_getfattr(xdr, args->bitmask, &hdr);
 	encode_nops(&hdr);
-	return 0;
 }
 
 /*
  * a COMMIT request
  */
-static int nfs4_xdr_enc_commit(struct rpc_rqst *req, __be32 *p, struct nfs_writeargs *args)
+static void nfs4_xdr_enc_commit(struct rpc_rqst *req, struct xdr_stream *xdr,
+				struct nfs_writeargs *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
 
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_sequence(&xdr, &args->seq_args, &hdr);
-	encode_putfh(&xdr, args->fh, &hdr);
-	encode_commit(&xdr, args, &hdr);
-	encode_getfattr(&xdr, args->bitmask, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->fh, &hdr);
+	encode_commit(xdr, args, &hdr);
+	encode_getfattr(xdr, args->bitmask, &hdr);
 	encode_nops(&hdr);
-	return 0;
 }
 
 /*
  * FSINFO request
  */
-static int nfs4_xdr_enc_fsinfo(struct rpc_rqst *req, __be32 *p, struct nfs4_fsinfo_arg *args)
+static void nfs4_xdr_enc_fsinfo(struct rpc_rqst *req, struct xdr_stream *xdr,
+				struct nfs4_fsinfo_arg *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
 
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_sequence(&xdr, &args->seq_args, &hdr);
-	encode_putfh(&xdr, args->fh, &hdr);
-	encode_fsinfo(&xdr, args->bitmask, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->fh, &hdr);
+	encode_fsinfo(xdr, args->bitmask, &hdr);
 	encode_nops(&hdr);
-	return 0;
 }
 
 /*
  * a PATHCONF request
  */
-static int nfs4_xdr_enc_pathconf(struct rpc_rqst *req, __be32 *p, const struct nfs4_pathconf_arg *args)
+static void nfs4_xdr_enc_pathconf(struct rpc_rqst *req, struct xdr_stream *xdr,
+				  const struct nfs4_pathconf_arg *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
 
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_sequence(&xdr, &args->seq_args, &hdr);
-	encode_putfh(&xdr, args->fh, &hdr);
-	encode_getattr_one(&xdr, args->bitmask[0] & nfs4_pathconf_bitmap[0],
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->fh, &hdr);
+	encode_getattr_one(xdr, args->bitmask[0] & nfs4_pathconf_bitmap[0],
 			   &hdr);
 	encode_nops(&hdr);
-	return 0;
 }
 
 /*
  * a STATFS request
  */
-static int nfs4_xdr_enc_statfs(struct rpc_rqst *req, __be32 *p, const struct nfs4_statfs_arg *args)
+static void nfs4_xdr_enc_statfs(struct rpc_rqst *req, struct xdr_stream *xdr,
+				const struct nfs4_statfs_arg *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
 
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_sequence(&xdr, &args->seq_args, &hdr);
-	encode_putfh(&xdr, args->fh, &hdr);
-	encode_getattr_two(&xdr, args->bitmask[0] & nfs4_statfs_bitmap[0],
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->fh, &hdr);
+	encode_getattr_two(xdr, args->bitmask[0] & nfs4_statfs_bitmap[0],
 			args->bitmask[1] & nfs4_statfs_bitmap[1], &hdr);
 	encode_nops(&hdr);
-	return 0;
 }
 
 /*
  * GETATTR_BITMAP request
  */
-static int nfs4_xdr_enc_server_caps(struct rpc_rqst *req, __be32 *p,
-				    struct nfs4_server_caps_arg *args)
+static void nfs4_xdr_enc_server_caps(struct rpc_rqst *req,
+				     struct xdr_stream *xdr,
+				     struct nfs4_server_caps_arg *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
 
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_sequence(&xdr, &args->seq_args, &hdr);
-	encode_putfh(&xdr, args->fhandle, &hdr);
-	encode_getattr_one(&xdr, FATTR4_WORD0_SUPPORTED_ATTRS|
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->fhandle, &hdr);
+	encode_getattr_one(xdr, FATTR4_WORD0_SUPPORTED_ATTRS|
 			   FATTR4_WORD0_LINK_SUPPORT|
 			   FATTR4_WORD0_SYMLINK_SUPPORT|
 			   FATTR4_WORD0_ACLSUPPORT, &hdr);
 	encode_nops(&hdr);
-	return 0;
 }
 
 /*
  * a RENEW request
  */
-static int nfs4_xdr_enc_renew(struct rpc_rqst *req, __be32 *p, struct nfs_client *clp)
+static void nfs4_xdr_enc_renew(struct rpc_rqst *req, struct xdr_stream *xdr,
+			       struct nfs_client *clp)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.nops	= 0,
 	};
 
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_renew(&xdr, clp, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_renew(xdr, clp, &hdr);
 	encode_nops(&hdr);
-	return 0;
 }
 
 /*
  * a SETCLIENTID request
  */
-static int nfs4_xdr_enc_setclientid(struct rpc_rqst *req, __be32 *p, struct nfs4_setclientid *sc)
+static void nfs4_xdr_enc_setclientid(struct rpc_rqst *req,
+				     struct xdr_stream *xdr,
+				     struct nfs4_setclientid *sc)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.nops	= 0,
 	};
 
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_setclientid(&xdr, sc, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_setclientid(xdr, sc, &hdr);
 	encode_nops(&hdr);
-	return 0;
 }
 
 /*
  * a SETCLIENTID_CONFIRM request
  */
-static int nfs4_xdr_enc_setclientid_confirm(struct rpc_rqst *req, __be32 *p, struct nfs4_setclientid_res *arg)
+static void nfs4_xdr_enc_setclientid_confirm(struct rpc_rqst *req,
+					     struct xdr_stream *xdr,
+					     struct nfs4_setclientid_res *arg)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.nops	= 0,
 	};
 	const u32 lease_bitmap[2] = { FATTR4_WORD0_LEASE_TIME, 0 };
 
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_setclientid_confirm(&xdr, arg, &hdr);
-	encode_putrootfh(&xdr, &hdr);
-	encode_fsinfo(&xdr, lease_bitmap, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_setclientid_confirm(xdr, arg, &hdr);
+	encode_putrootfh(xdr, &hdr);
+	encode_fsinfo(xdr, lease_bitmap, &hdr);
 	encode_nops(&hdr);
-	return 0;
 }
 
 /*
  * DELEGRETURN request
  */
-static int nfs4_xdr_enc_delegreturn(struct rpc_rqst *req, __be32 *p, const struct nfs4_delegreturnargs *args)
+static void nfs4_xdr_enc_delegreturn(struct rpc_rqst *req,
+				     struct xdr_stream *xdr,
+				     const struct nfs4_delegreturnargs *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
 
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_sequence(&xdr, &args->seq_args, &hdr);
-	encode_putfh(&xdr, args->fhandle, &hdr);
-	encode_delegreturn(&xdr, args->stateid, &hdr);
-	encode_getfattr(&xdr, args->bitmask, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->fhandle, &hdr);
+	encode_delegreturn(xdr, args->stateid, &hdr);
+	encode_getfattr(xdr, args->bitmask, &hdr);
 	encode_nops(&hdr);
-	return 0;
 }
 
 /*
  * Encode FS_LOCATIONS request
  */
-static int nfs4_xdr_enc_fs_locations(struct rpc_rqst *req, __be32 *p, struct nfs4_fs_locations_arg *args)
+static void nfs4_xdr_enc_fs_locations(struct rpc_rqst *req,
+				      struct xdr_stream *xdr,
+				      struct nfs4_fs_locations_arg *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
 	uint32_t replen;
 
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_sequence(&xdr, &args->seq_args, &hdr);
-	encode_putfh(&xdr, args->dir_fh, &hdr);
-	encode_lookup(&xdr, args->name, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->dir_fh, &hdr);
+	encode_lookup(xdr, args->name, &hdr);
 	replen = hdr.replen;	/* get the attribute into args->page */
-	encode_fs_locations(&xdr, args->bitmask, &hdr);
+	encode_fs_locations(xdr, args->bitmask, &hdr);
 
 	xdr_inline_pages(&req->rq_rcv_buf, replen << 2, &args->page,
 			0, PAGE_SIZE);
 	encode_nops(&hdr);
-	return 0;
 }
 
 #if defined(CONFIG_NFS_V4_1)
 /*
  * EXCHANGE_ID request
  */
-static int nfs4_xdr_enc_exchange_id(struct rpc_rqst *req, uint32_t *p,
-				    struct nfs41_exchange_id_args *args)
+static void nfs4_xdr_enc_exchange_id(struct rpc_rqst *req,
+				     struct xdr_stream *xdr,
+				     struct nfs41_exchange_id_args *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = args->client->cl_mvops->minor_version,
 	};
 
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_exchange_id(&xdr, args, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_exchange_id(xdr, args, &hdr);
 	encode_nops(&hdr);
-	return 0;
 }
 
 /*
  * a CREATE_SESSION request
  */
-static int nfs4_xdr_enc_create_session(struct rpc_rqst *req, uint32_t *p,
-				       struct nfs41_create_session_args *args)
+static void nfs4_xdr_enc_create_session(struct rpc_rqst *req,
+					struct xdr_stream *xdr,
+					struct nfs41_create_session_args *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = args->client->cl_mvops->minor_version,
 	};
 
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_create_session(&xdr, args, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_create_session(xdr, args, &hdr);
 	encode_nops(&hdr);
-	return 0;
 }
 
 /*
  * a DESTROY_SESSION request
  */
-static int nfs4_xdr_enc_destroy_session(struct rpc_rqst *req, uint32_t *p,
-					struct nfs4_session *session)
+static void nfs4_xdr_enc_destroy_session(struct rpc_rqst *req,
+					 struct xdr_stream *xdr,
+					 struct nfs4_session *session)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = session->clp->cl_mvops->minor_version,
 	};
 
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_destroy_session(&xdr, session, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_destroy_session(xdr, session, &hdr);
 	encode_nops(&hdr);
-	return 0;
 }
 
 /*
  * a SEQUENCE request
  */
-static int nfs4_xdr_enc_sequence(struct rpc_rqst *req, uint32_t *p,
-				 struct nfs4_sequence_args *args)
+static void nfs4_xdr_enc_sequence(struct rpc_rqst *req, struct xdr_stream *xdr,
+				  struct nfs4_sequence_args *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(args),
 	};
 
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_sequence(&xdr, args, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, args, &hdr);
 	encode_nops(&hdr);
-	return 0;
 }
 
 /*
  * a GET_LEASE_TIME request
  */
-static int nfs4_xdr_enc_get_lease_time(struct rpc_rqst *req, uint32_t *p,
-				       struct nfs4_get_lease_time_args *args)
+static void nfs4_xdr_enc_get_lease_time(struct rpc_rqst *req,
+					struct xdr_stream *xdr,
+					struct nfs4_get_lease_time_args *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(&args->la_seq_args),
 	};
 	const u32 lease_bitmap[2] = { FATTR4_WORD0_LEASE_TIME, 0 };
 
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_sequence(&xdr, &args->la_seq_args, &hdr);
-	encode_putrootfh(&xdr, &hdr);
-	encode_fsinfo(&xdr, lease_bitmap, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->la_seq_args, &hdr);
+	encode_putrootfh(xdr, &hdr);
+	encode_fsinfo(xdr, lease_bitmap, &hdr);
 	encode_nops(&hdr);
-	return 0;
 }
 
 /*
  * a RECLAIM_COMPLETE request
  */
-static int nfs4_xdr_enc_reclaim_complete(struct rpc_rqst *req, uint32_t *p,
-				     struct nfs41_reclaim_complete_args *args)
+static void nfs4_xdr_enc_reclaim_complete(struct rpc_rqst *req,
+					  struct xdr_stream *xdr,
+					  struct nfs41_reclaim_complete_args *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(&args->seq_args)
 	};
 
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_sequence(&xdr, &args->seq_args, &hdr);
-	encode_reclaim_complete(&xdr, args, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_reclaim_complete(xdr, args, &hdr);
 	encode_nops(&hdr);
-	return 0;
 }
 
 /*
  * Encode GETDEVICEINFO request
  */
-static int nfs4_xdr_enc_getdeviceinfo(struct rpc_rqst *req, uint32_t *p,
-				      struct nfs4_getdeviceinfo_args *args)
+static void nfs4_xdr_enc_getdeviceinfo(struct rpc_rqst *req,
+				       struct xdr_stream *xdr,
+				       struct nfs4_getdeviceinfo_args *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
 
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_sequence(&xdr, &args->seq_args, &hdr);
-	encode_getdeviceinfo(&xdr, args, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_getdeviceinfo(xdr, args, &hdr);
 
 	/* set up reply kvec. Subtract notification bitmap max size (2)
 	 * so that notification bitmap is put in xdr_buf tail */
@@ -2657,27 +2582,24 @@ static int nfs4_xdr_enc_getdeviceinfo(struct rpc_rqst *req, uint32_t *p,
 		 args->pdev->pglen);
 
 	encode_nops(&hdr);
-	return 0;
 }
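The comment in the hunk above explains the unusual reply setup for GETDEVICEINFO: the opaque device address body is received into args->pdev's pages, while the trailing notification bitmap (at most two XDR words, which is why two words are subtracted from the page offset) is deliberately left to spill into the xdr_buf tail. Sketched layout (illustrative only):

	/*
	 * rq_rcv_buf for GETDEVICEINFO (sketch):
	 *   head  : ops up to and including the fixed GETDEVICEINFO fields
	 *   pages : opaque device_addr4 body (args->pdev's pages)
	 *   tail  : trailing notification bitmap (<= 2 XDR words)
	 */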
 
 /*
  * Encode LAYOUTGET request
  */
-static int nfs4_xdr_enc_layoutget(struct rpc_rqst *req, uint32_t *p,
-				  struct nfs4_layoutget_args *args)
+static void nfs4_xdr_enc_layoutget(struct rpc_rqst *req,
+				   struct xdr_stream *xdr,
+				   struct nfs4_layoutget_args *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
 
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_sequence(&xdr, &args->seq_args, &hdr);
-	encode_putfh(&xdr, NFS_FH(args->inode), &hdr);
-	encode_layoutget(&xdr, args, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, NFS_FH(args->inode), &hdr);
+	encode_layoutget(xdr, args, &hdr);
 	encode_nops(&hdr);
-	return 0;
 }
 #endif /* CONFIG_NFS_V4_1 */
 
@@ -4475,7 +4397,7 @@ static int decode_read(struct xdr_stream *xdr, struct rpc_rqst *req, struct nfs_
 		goto out_overflow;
 	eof = be32_to_cpup(p++);
 	count = be32_to_cpup(p);
-	hdrlen = (u8 *) p - (u8 *) iov->iov_base;
+	hdrlen = (u8 *) xdr->p - (u8 *) iov->iov_base;
 	recvd = req->rq_rcv_buf.len - hdrlen;
 	if (count > recvd) {
 		dprintk("NFS: server cheating in read reply: "
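One functional tweak in decode_read(): hdrlen is now computed from the stream cursor xdr->p rather than the local p. As I read it, xdr->p sits just past the last word pulled with xdr_inline_decode() (the count word), so it reliably measures how much of the reply head precedes the READ data, whereas the local p still points at the count word itself. A sketch of the computation (hypothetical helper, not part of the patch):

	/* Bytes of the reply head consumed so far. */
	static unsigned int head_bytes_consumed(const struct xdr_stream *xdr,
						const struct kvec *iov)
	{
		return (u8 *)xdr->p - (u8 *)iov->iov_base;
	}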
@@ -5000,7 +4922,7 @@ static int decode_getdeviceinfo(struct xdr_stream *xdr,
 		goto out_overflow;
 	len = be32_to_cpup(p);
 	if (len) {
-		int i;
+		uint32_t i;
 
 		p = xdr_inline_decode(xdr, 4 * len);
 		if (unlikely(!p))
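len comes off the wire via be32_to_cpup() and is therefore a 32-bit unsigned quantity; looping over it with a signed int invites signed/unsigned comparison warnings and subtle misbehaviour for values above INT_MAX, hence the switch to uint32_t. A sketch of the pattern (helper name invented):

	/* Consume len 4-byte words with a counter that matches len's type. */
	static void skip_words(__be32 *p, uint32_t len)
	{
		uint32_t i;

		for (i = 0; i < len; i++)
			(void)be32_to_cpup(p++);
	}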
@@ -5090,26 +5012,26 @@ out_overflow:
 /*
  * Decode OPEN_DOWNGRADE response
  */
-static int nfs4_xdr_dec_open_downgrade(struct rpc_rqst *rqstp, __be32 *p, struct nfs_closeres *res)
+static int nfs4_xdr_dec_open_downgrade(struct rpc_rqst *rqstp,
+				       struct xdr_stream *xdr,
+				       struct nfs_closeres *res)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
 
-	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (status)
 		goto out;
-	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
 	if (status)
 		goto out;
-	status = decode_putfh(&xdr);
+	status = decode_putfh(xdr);
 	if (status)
 		goto out;
-	status = decode_open_downgrade(&xdr, res);
+	status = decode_open_downgrade(xdr, res);
 	if (status != 0)
 		goto out;
-	decode_getfattr(&xdr, res->fattr, res->server,
+	decode_getfattr(xdr, res->fattr, res->server,
 			!RPC_IS_ASYNC(rqstp->rq_task));
 out:
 	return status;
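The decoders follow the same conversion: SUNRPC now calls xdr_init_decode() before invoking the per-procedure ->p_decode routine, so each wrapper simply walks the COMPOUND results in request order and bails out at the first failure. The shared shape, as a sketch (nfs4_xdr_dec_example and example_res are made-up names, not part of the patch):

	static int nfs4_xdr_dec_example(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
					struct example_res *res)
	{
		struct compound_hdr hdr;
		int status;

		status = decode_compound_hdr(xdr, &hdr);
		if (status)
			goto out;
		status = decode_sequence(xdr, &res->seq_res, rqstp);
		if (status)
			goto out;
		/* ... one decode_*() per operation, in COMPOUND order ... */
	out:
		return status;
	}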
@@ -5118,26 +5040,25 @@ out:
 /*
  * Decode ACCESS response
  */
-static int nfs4_xdr_dec_access(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_accessres *res)
+static int nfs4_xdr_dec_access(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+			       struct nfs4_accessres *res)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
 
-	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (status)
 		goto out;
-	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
 	if (status)
 		goto out;
-	status = decode_putfh(&xdr);
+	status = decode_putfh(xdr);
 	if (status != 0)
 		goto out;
-	status = decode_access(&xdr, res);
+	status = decode_access(xdr, res);
 	if (status != 0)
 		goto out;
-	decode_getfattr(&xdr, res->fattr, res->server,
+	decode_getfattr(xdr, res->fattr, res->server,
 			!RPC_IS_ASYNC(rqstp->rq_task));
 out:
 	return status;
@@ -5146,26 +5067,28 @@ out:
 /*
  * Decode LOOKUP response
  */
-static int nfs4_xdr_dec_lookup(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_lookup_res *res)
+static int nfs4_xdr_dec_lookup(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+			       struct nfs4_lookup_res *res)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
 
-	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (status)
 		goto out;
-	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
 	if (status)
 		goto out;
-	if ((status = decode_putfh(&xdr)) != 0)
+	status = decode_putfh(xdr);
+	if (status)
 		goto out;
-	if ((status = decode_lookup(&xdr)) != 0)
+	status = decode_lookup(xdr);
+	if (status)
 		goto out;
-	if ((status = decode_getfh(&xdr, res->fh)) != 0)
+	status = decode_getfh(xdr, res->fh);
+	if (status)
 		goto out;
-	status = decode_getfattr(&xdr, res->fattr, res->server
+	status = decode_getfattr(xdr, res->fattr, res->server
 			,!RPC_IS_ASYNC(rqstp->rq_task));
 out:
 	return status;
@@ -5174,23 +5097,25 @@ out:
 /*
  * Decode LOOKUP_ROOT response
  */
-static int nfs4_xdr_dec_lookup_root(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_lookup_res *res)
+static int nfs4_xdr_dec_lookup_root(struct rpc_rqst *rqstp,
+				    struct xdr_stream *xdr,
+				    struct nfs4_lookup_res *res)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
 
-	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (status)
 		goto out;
-	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
 	if (status)
 		goto out;
-	if ((status = decode_putrootfh(&xdr)) != 0)
+	status = decode_putrootfh(xdr);
+	if (status)
 		goto out;
-	if ((status = decode_getfh(&xdr, res->fh)) == 0)
-		status = decode_getfattr(&xdr, res->fattr, res->server,
+	status = decode_getfh(xdr, res->fh);
+	if (status == 0)
+		status = decode_getfattr(xdr, res->fattr, res->server,
 				!RPC_IS_ASYNC(rqstp->rq_task));
 out:
 	return status;
@@ -5199,24 +5124,25 @@ out:
 /*
  * Decode REMOVE response
  */
-static int nfs4_xdr_dec_remove(struct rpc_rqst *rqstp, __be32 *p, struct nfs_removeres *res)
+static int nfs4_xdr_dec_remove(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+			       struct nfs_removeres *res)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
 
-	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (status)
 		goto out;
-	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
 	if (status)
 		goto out;
-	if ((status = decode_putfh(&xdr)) != 0)
+	status = decode_putfh(xdr);
+	if (status)
 		goto out;
-	if ((status = decode_remove(&xdr, &res->cinfo)) != 0)
+	status = decode_remove(xdr, &res->cinfo);
+	if (status)
 		goto out;
-	decode_getfattr(&xdr, res->dir_attr, res->server,
+	decode_getfattr(xdr, res->dir_attr, res->server,
 			!RPC_IS_ASYNC(rqstp->rq_task));
 out:
 	return status;
@@ -5225,34 +5151,38 @@ out:
 /*
  * Decode RENAME response
  */
-static int nfs4_xdr_dec_rename(struct rpc_rqst *rqstp, __be32 *p, struct nfs_renameres *res)
+static int nfs4_xdr_dec_rename(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+			       struct nfs_renameres *res)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
 
-	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (status)
 		goto out;
-	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
 	if (status)
 		goto out;
-	if ((status = decode_putfh(&xdr)) != 0)
+	status = decode_putfh(xdr);
+	if (status)
 		goto out;
-	if ((status = decode_savefh(&xdr)) != 0)
+	status = decode_savefh(xdr);
+	if (status)
 		goto out;
-	if ((status = decode_putfh(&xdr)) != 0)
+	status = decode_putfh(xdr);
+	if (status)
 		goto out;
-	if ((status = decode_rename(&xdr, &res->old_cinfo, &res->new_cinfo)) != 0)
+	status = decode_rename(xdr, &res->old_cinfo, &res->new_cinfo);
+	if (status)
 		goto out;
 	/* Current FH is target directory */
-	if (decode_getfattr(&xdr, res->new_fattr, res->server,
+	if (decode_getfattr(xdr, res->new_fattr, res->server,
 			!RPC_IS_ASYNC(rqstp->rq_task)) != 0)
 		goto out;
-	if ((status = decode_restorefh(&xdr)) != 0)
+	status = decode_restorefh(xdr);
+	if (status)
 		goto out;
-	decode_getfattr(&xdr, res->old_fattr, res->server,
+	decode_getfattr(xdr, res->old_fattr, res->server,
 			!RPC_IS_ASYNC(rqstp->rq_task));
 out:
 	return status;
@@ -5261,37 +5191,41 @@ out:
5261/* 5191/*
5262 * Decode LINK response 5192 * Decode LINK response
5263 */ 5193 */
5264static int nfs4_xdr_dec_link(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_link_res *res) 5194static int nfs4_xdr_dec_link(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
5195 struct nfs4_link_res *res)
5265{ 5196{
5266 struct xdr_stream xdr;
5267 struct compound_hdr hdr; 5197 struct compound_hdr hdr;
5268 int status; 5198 int status;
5269 5199
5270 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 5200 status = decode_compound_hdr(xdr, &hdr);
5271 status = decode_compound_hdr(&xdr, &hdr);
5272 if (status) 5201 if (status)
5273 goto out; 5202 goto out;
5274 status = decode_sequence(&xdr, &res->seq_res, rqstp); 5203 status = decode_sequence(xdr, &res->seq_res, rqstp);
5275 if (status) 5204 if (status)
5276 goto out; 5205 goto out;
5277 if ((status = decode_putfh(&xdr)) != 0) 5206 status = decode_putfh(xdr);
5207 if (status)
5278 goto out; 5208 goto out;
5279 if ((status = decode_savefh(&xdr)) != 0) 5209 status = decode_savefh(xdr);
5210 if (status)
5280 goto out; 5211 goto out;
5281 if ((status = decode_putfh(&xdr)) != 0) 5212 status = decode_putfh(xdr);
5213 if (status)
5282 goto out; 5214 goto out;
5283 if ((status = decode_link(&xdr, &res->cinfo)) != 0) 5215 status = decode_link(xdr, &res->cinfo);
5216 if (status)
5284 goto out; 5217 goto out;
5285 /* 5218 /*
5286 * Note order: OP_LINK leaves the directory as the current 5219 * Note order: OP_LINK leaves the directory as the current
5287 * filehandle. 5220 * filehandle.
5288 */ 5221 */
5289 if (decode_getfattr(&xdr, res->dir_attr, res->server, 5222 if (decode_getfattr(xdr, res->dir_attr, res->server,
5290 !RPC_IS_ASYNC(rqstp->rq_task)) != 0) 5223 !RPC_IS_ASYNC(rqstp->rq_task)) != 0)
5291 goto out; 5224 goto out;
5292 if ((status = decode_restorefh(&xdr)) != 0) 5225 status = decode_restorefh(xdr);
5226 if (status)
5293 goto out; 5227 goto out;
5294 decode_getfattr(&xdr, res->fattr, res->server, 5228 decode_getfattr(xdr, res->fattr, res->server,
5295 !RPC_IS_ASYNC(rqstp->rq_task)); 5229 !RPC_IS_ASYNC(rqstp->rq_task));
5296out: 5230out:
5297 return status; 5231 return status;
@@ -5300,33 +5234,37 @@ out:
5300/* 5234/*
5301 * Decode CREATE response 5235 * Decode CREATE response
5302 */ 5236 */
5303static int nfs4_xdr_dec_create(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_create_res *res) 5237static int nfs4_xdr_dec_create(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
5238 struct nfs4_create_res *res)
5304{ 5239{
5305 struct xdr_stream xdr;
5306 struct compound_hdr hdr; 5240 struct compound_hdr hdr;
5307 int status; 5241 int status;
5308 5242
5309 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 5243 status = decode_compound_hdr(xdr, &hdr);
5310 status = decode_compound_hdr(&xdr, &hdr);
5311 if (status) 5244 if (status)
5312 goto out; 5245 goto out;
5313 status = decode_sequence(&xdr, &res->seq_res, rqstp); 5246 status = decode_sequence(xdr, &res->seq_res, rqstp);
5314 if (status) 5247 if (status)
5315 goto out; 5248 goto out;
5316 if ((status = decode_putfh(&xdr)) != 0) 5249 status = decode_putfh(xdr);
5250 if (status)
5317 goto out; 5251 goto out;
5318 if ((status = decode_savefh(&xdr)) != 0) 5252 status = decode_savefh(xdr);
5253 if (status)
5319 goto out; 5254 goto out;
5320 if ((status = decode_create(&xdr,&res->dir_cinfo)) != 0) 5255 status = decode_create(xdr, &res->dir_cinfo);
5256 if (status)
5321 goto out; 5257 goto out;
5322 if ((status = decode_getfh(&xdr, res->fh)) != 0) 5258 status = decode_getfh(xdr, res->fh);
5259 if (status)
5323 goto out; 5260 goto out;
5324 if (decode_getfattr(&xdr, res->fattr, res->server, 5261 if (decode_getfattr(xdr, res->fattr, res->server,
5325 !RPC_IS_ASYNC(rqstp->rq_task)) != 0) 5262 !RPC_IS_ASYNC(rqstp->rq_task)) != 0)
5326 goto out; 5263 goto out;
5327 if ((status = decode_restorefh(&xdr)) != 0) 5264 status = decode_restorefh(xdr);
5265 if (status)
5328 goto out; 5266 goto out;
5329 decode_getfattr(&xdr, res->dir_fattr, res->server, 5267 decode_getfattr(xdr, res->dir_fattr, res->server,
5330 !RPC_IS_ASYNC(rqstp->rq_task)); 5268 !RPC_IS_ASYNC(rqstp->rq_task));
5331out: 5269out:
5332 return status; 5270 return status;
@@ -5335,31 +5273,31 @@ out:
5335/* 5273/*
5336 * Decode SYMLINK response 5274 * Decode SYMLINK response
5337 */ 5275 */
5338static int nfs4_xdr_dec_symlink(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_create_res *res) 5276static int nfs4_xdr_dec_symlink(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
5277 struct nfs4_create_res *res)
5339{ 5278{
5340 return nfs4_xdr_dec_create(rqstp, p, res); 5279 return nfs4_xdr_dec_create(rqstp, xdr, res);
5341} 5280}
5342 5281
5343/* 5282/*
5344 * Decode GETATTR response 5283 * Decode GETATTR response
5345 */ 5284 */
5346static int nfs4_xdr_dec_getattr(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_getattr_res *res) 5285static int nfs4_xdr_dec_getattr(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
5286 struct nfs4_getattr_res *res)
5347{ 5287{
5348 struct xdr_stream xdr;
5349 struct compound_hdr hdr; 5288 struct compound_hdr hdr;
5350 int status; 5289 int status;
5351 5290
5352 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 5291 status = decode_compound_hdr(xdr, &hdr);
5353 status = decode_compound_hdr(&xdr, &hdr);
5354 if (status) 5292 if (status)
5355 goto out; 5293 goto out;
5356 status = decode_sequence(&xdr, &res->seq_res, rqstp); 5294 status = decode_sequence(xdr, &res->seq_res, rqstp);
5357 if (status) 5295 if (status)
5358 goto out; 5296 goto out;
5359 status = decode_putfh(&xdr); 5297 status = decode_putfh(xdr);
5360 if (status) 5298 if (status)
5361 goto out; 5299 goto out;
5362 status = decode_getfattr(&xdr, res->fattr, res->server, 5300 status = decode_getfattr(xdr, res->fattr, res->server,
5363 !RPC_IS_ASYNC(rqstp->rq_task)); 5301 !RPC_IS_ASYNC(rqstp->rq_task));
5364out: 5302out:
5365 return status; 5303 return status;
@@ -5368,46 +5306,40 @@ out:
5368/* 5306/*
 5369 * Encode a SETACL request 5307 * Encode a SETACL request
5370 */ 5308 */
5371static int 5309static void nfs4_xdr_enc_setacl(struct rpc_rqst *req, struct xdr_stream *xdr,
5372nfs4_xdr_enc_setacl(struct rpc_rqst *req, __be32 *p, struct nfs_setaclargs *args) 5310 struct nfs_setaclargs *args)
5373{ 5311{
5374 struct xdr_stream xdr;
5375 struct compound_hdr hdr = { 5312 struct compound_hdr hdr = {
5376 .minorversion = nfs4_xdr_minorversion(&args->seq_args), 5313 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
5377 }; 5314 };
5378 int status;
5379 5315
5380 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 5316 encode_compound_hdr(xdr, req, &hdr);
5381 encode_compound_hdr(&xdr, req, &hdr); 5317 encode_sequence(xdr, &args->seq_args, &hdr);
5382 encode_sequence(&xdr, &args->seq_args, &hdr); 5318 encode_putfh(xdr, args->fh, &hdr);
5383 encode_putfh(&xdr, args->fh, &hdr); 5319 encode_setacl(xdr, args, &hdr);
5384 status = encode_setacl(&xdr, args, &hdr);
5385 encode_nops(&hdr); 5320 encode_nops(&hdr);
5386 return status;
5387} 5321}
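
The encoder half of the conversion goes a step further: with buffer setup and error handling hoisted into the caller, nfs4_xdr_enc_setacl() loses its int return value entirely, and encode_setacl()'s status is no longer propagated. A hedged sketch of the matching caller (assumed shape, not taken from this diff):

/* Illustrative sketch only: encoders are now void, so the generic
 * client just prepares the send buffer and invokes them. */
static void rpc_encode_sketch(struct rpc_rqst *req, kxdreproc_t encode,
			      void *argp, __be32 *p)
{
	struct xdr_stream xdr;

	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
	encode(req, &xdr, argp);
}
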
5388 5322
5389/* 5323/*
5390 * Decode SETACL response 5324 * Decode SETACL response
5391 */ 5325 */
5392static int 5326static int
5393nfs4_xdr_dec_setacl(struct rpc_rqst *rqstp, __be32 *p, 5327nfs4_xdr_dec_setacl(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
5394 struct nfs_setaclres *res) 5328 struct nfs_setaclres *res)
5395{ 5329{
5396 struct xdr_stream xdr;
5397 struct compound_hdr hdr; 5330 struct compound_hdr hdr;
5398 int status; 5331 int status;
5399 5332
5400 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 5333 status = decode_compound_hdr(xdr, &hdr);
5401 status = decode_compound_hdr(&xdr, &hdr);
5402 if (status) 5334 if (status)
5403 goto out; 5335 goto out;
5404 status = decode_sequence(&xdr, &res->seq_res, rqstp); 5336 status = decode_sequence(xdr, &res->seq_res, rqstp);
5405 if (status) 5337 if (status)
5406 goto out; 5338 goto out;
5407 status = decode_putfh(&xdr); 5339 status = decode_putfh(xdr);
5408 if (status) 5340 if (status)
5409 goto out; 5341 goto out;
5410 status = decode_setattr(&xdr); 5342 status = decode_setattr(xdr);
5411out: 5343out:
5412 return status; 5344 return status;
5413} 5345}
@@ -5416,24 +5348,22 @@ out:
5416 * Decode GETACL response 5348 * Decode GETACL response
5417 */ 5349 */
5418static int 5350static int
5419nfs4_xdr_dec_getacl(struct rpc_rqst *rqstp, __be32 *p, 5351nfs4_xdr_dec_getacl(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
5420 struct nfs_getaclres *res) 5352 struct nfs_getaclres *res)
5421{ 5353{
5422 struct xdr_stream xdr;
5423 struct compound_hdr hdr; 5354 struct compound_hdr hdr;
5424 int status; 5355 int status;
5425 5356
5426 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 5357 status = decode_compound_hdr(xdr, &hdr);
5427 status = decode_compound_hdr(&xdr, &hdr);
5428 if (status) 5358 if (status)
5429 goto out; 5359 goto out;
5430 status = decode_sequence(&xdr, &res->seq_res, rqstp); 5360 status = decode_sequence(xdr, &res->seq_res, rqstp);
5431 if (status) 5361 if (status)
5432 goto out; 5362 goto out;
5433 status = decode_putfh(&xdr); 5363 status = decode_putfh(xdr);
5434 if (status) 5364 if (status)
5435 goto out; 5365 goto out;
5436 status = decode_getacl(&xdr, rqstp, &res->acl_len); 5366 status = decode_getacl(xdr, rqstp, &res->acl_len);
5437 5367
5438out: 5368out:
5439 return status; 5369 return status;
@@ -5442,23 +5372,22 @@ out:
5442/* 5372/*
5443 * Decode CLOSE response 5373 * Decode CLOSE response
5444 */ 5374 */
5445static int nfs4_xdr_dec_close(struct rpc_rqst *rqstp, __be32 *p, struct nfs_closeres *res) 5375static int nfs4_xdr_dec_close(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
5376 struct nfs_closeres *res)
5446{ 5377{
5447 struct xdr_stream xdr;
5448 struct compound_hdr hdr; 5378 struct compound_hdr hdr;
5449 int status; 5379 int status;
5450 5380
5451 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 5381 status = decode_compound_hdr(xdr, &hdr);
5452 status = decode_compound_hdr(&xdr, &hdr);
5453 if (status) 5382 if (status)
5454 goto out; 5383 goto out;
5455 status = decode_sequence(&xdr, &res->seq_res, rqstp); 5384 status = decode_sequence(xdr, &res->seq_res, rqstp);
5456 if (status) 5385 if (status)
5457 goto out; 5386 goto out;
5458 status = decode_putfh(&xdr); 5387 status = decode_putfh(xdr);
5459 if (status) 5388 if (status)
5460 goto out; 5389 goto out;
5461 status = decode_close(&xdr, res); 5390 status = decode_close(xdr, res);
5462 if (status != 0) 5391 if (status != 0)
5463 goto out; 5392 goto out;
5464 /* 5393 /*
@@ -5467,7 +5396,7 @@ static int nfs4_xdr_dec_close(struct rpc_rqst *rqstp, __be32 *p, struct nfs_clos
5467 * an ESTALE error. Shouldn't be a problem, 5396 * an ESTALE error. Shouldn't be a problem,
5468 * though, since fattr->valid will remain unset. 5397 * though, since fattr->valid will remain unset.
5469 */ 5398 */
5470 decode_getfattr(&xdr, res->fattr, res->server, 5399 decode_getfattr(xdr, res->fattr, res->server,
5471 !RPC_IS_ASYNC(rqstp->rq_task)); 5400 !RPC_IS_ASYNC(rqstp->rq_task));
5472out: 5401out:
5473 return status; 5402 return status;
@@ -5476,36 +5405,35 @@ out:
5476/* 5405/*
5477 * Decode OPEN response 5406 * Decode OPEN response
5478 */ 5407 */
5479static int nfs4_xdr_dec_open(struct rpc_rqst *rqstp, __be32 *p, struct nfs_openres *res) 5408static int nfs4_xdr_dec_open(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
5409 struct nfs_openres *res)
5480{ 5410{
5481 struct xdr_stream xdr;
5482 struct compound_hdr hdr; 5411 struct compound_hdr hdr;
5483 int status; 5412 int status;
5484 5413
5485 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 5414 status = decode_compound_hdr(xdr, &hdr);
5486 status = decode_compound_hdr(&xdr, &hdr);
5487 if (status) 5415 if (status)
5488 goto out; 5416 goto out;
5489 status = decode_sequence(&xdr, &res->seq_res, rqstp); 5417 status = decode_sequence(xdr, &res->seq_res, rqstp);
5490 if (status) 5418 if (status)
5491 goto out; 5419 goto out;
5492 status = decode_putfh(&xdr); 5420 status = decode_putfh(xdr);
5493 if (status) 5421 if (status)
5494 goto out; 5422 goto out;
5495 status = decode_savefh(&xdr); 5423 status = decode_savefh(xdr);
5496 if (status) 5424 if (status)
5497 goto out; 5425 goto out;
5498 status = decode_open(&xdr, res); 5426 status = decode_open(xdr, res);
5499 if (status) 5427 if (status)
5500 goto out; 5428 goto out;
5501 if (decode_getfh(&xdr, &res->fh) != 0) 5429 if (decode_getfh(xdr, &res->fh) != 0)
5502 goto out; 5430 goto out;
5503 if (decode_getfattr(&xdr, res->f_attr, res->server, 5431 if (decode_getfattr(xdr, res->f_attr, res->server,
5504 !RPC_IS_ASYNC(rqstp->rq_task)) != 0) 5432 !RPC_IS_ASYNC(rqstp->rq_task)) != 0)
5505 goto out; 5433 goto out;
5506 if (decode_restorefh(&xdr) != 0) 5434 if (decode_restorefh(xdr) != 0)
5507 goto out; 5435 goto out;
5508 decode_getfattr(&xdr, res->dir_attr, res->server, 5436 decode_getfattr(xdr, res->dir_attr, res->server,
5509 !RPC_IS_ASYNC(rqstp->rq_task)); 5437 !RPC_IS_ASYNC(rqstp->rq_task));
5510out: 5438out:
5511 return status; 5439 return status;
@@ -5514,20 +5442,20 @@ out:
5514/* 5442/*
5515 * Decode OPEN_CONFIRM response 5443 * Decode OPEN_CONFIRM response
5516 */ 5444 */
5517static int nfs4_xdr_dec_open_confirm(struct rpc_rqst *rqstp, __be32 *p, struct nfs_open_confirmres *res) 5445static int nfs4_xdr_dec_open_confirm(struct rpc_rqst *rqstp,
5446 struct xdr_stream *xdr,
5447 struct nfs_open_confirmres *res)
5518{ 5448{
5519 struct xdr_stream xdr;
5520 struct compound_hdr hdr; 5449 struct compound_hdr hdr;
5521 int status; 5450 int status;
5522 5451
5523 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 5452 status = decode_compound_hdr(xdr, &hdr);
5524 status = decode_compound_hdr(&xdr, &hdr);
5525 if (status) 5453 if (status)
5526 goto out; 5454 goto out;
5527 status = decode_putfh(&xdr); 5455 status = decode_putfh(xdr);
5528 if (status) 5456 if (status)
5529 goto out; 5457 goto out;
5530 status = decode_open_confirm(&xdr, res); 5458 status = decode_open_confirm(xdr, res);
5531out: 5459out:
5532 return status; 5460 return status;
5533} 5461}
@@ -5535,26 +5463,26 @@ out:
5535/* 5463/*
 5536 * Decode OPEN_NOATTR response 5464 * Decode OPEN_NOATTR response
5537 */ 5465 */
5538static int nfs4_xdr_dec_open_noattr(struct rpc_rqst *rqstp, __be32 *p, struct nfs_openres *res) 5466static int nfs4_xdr_dec_open_noattr(struct rpc_rqst *rqstp,
5467 struct xdr_stream *xdr,
5468 struct nfs_openres *res)
5539{ 5469{
5540 struct xdr_stream xdr;
5541 struct compound_hdr hdr; 5470 struct compound_hdr hdr;
5542 int status; 5471 int status;
5543 5472
5544 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 5473 status = decode_compound_hdr(xdr, &hdr);
5545 status = decode_compound_hdr(&xdr, &hdr);
5546 if (status) 5474 if (status)
5547 goto out; 5475 goto out;
5548 status = decode_sequence(&xdr, &res->seq_res, rqstp); 5476 status = decode_sequence(xdr, &res->seq_res, rqstp);
5549 if (status) 5477 if (status)
5550 goto out; 5478 goto out;
5551 status = decode_putfh(&xdr); 5479 status = decode_putfh(xdr);
5552 if (status) 5480 if (status)
5553 goto out; 5481 goto out;
5554 status = decode_open(&xdr, res); 5482 status = decode_open(xdr, res);
5555 if (status) 5483 if (status)
5556 goto out; 5484 goto out;
5557 decode_getfattr(&xdr, res->f_attr, res->server, 5485 decode_getfattr(xdr, res->f_attr, res->server,
5558 !RPC_IS_ASYNC(rqstp->rq_task)); 5486 !RPC_IS_ASYNC(rqstp->rq_task));
5559out: 5487out:
5560 return status; 5488 return status;
@@ -5563,26 +5491,26 @@ out:
5563/* 5491/*
5564 * Decode SETATTR response 5492 * Decode SETATTR response
5565 */ 5493 */
5566static int nfs4_xdr_dec_setattr(struct rpc_rqst *rqstp, __be32 *p, struct nfs_setattrres *res) 5494static int nfs4_xdr_dec_setattr(struct rpc_rqst *rqstp,
5495 struct xdr_stream *xdr,
5496 struct nfs_setattrres *res)
5567{ 5497{
5568 struct xdr_stream xdr;
5569 struct compound_hdr hdr; 5498 struct compound_hdr hdr;
5570 int status; 5499 int status;
5571 5500
5572 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 5501 status = decode_compound_hdr(xdr, &hdr);
5573 status = decode_compound_hdr(&xdr, &hdr);
5574 if (status) 5502 if (status)
5575 goto out; 5503 goto out;
5576 status = decode_sequence(&xdr, &res->seq_res, rqstp); 5504 status = decode_sequence(xdr, &res->seq_res, rqstp);
5577 if (status) 5505 if (status)
5578 goto out; 5506 goto out;
5579 status = decode_putfh(&xdr); 5507 status = decode_putfh(xdr);
5580 if (status) 5508 if (status)
5581 goto out; 5509 goto out;
5582 status = decode_setattr(&xdr); 5510 status = decode_setattr(xdr);
5583 if (status) 5511 if (status)
5584 goto out; 5512 goto out;
5585 decode_getfattr(&xdr, res->fattr, res->server, 5513 decode_getfattr(xdr, res->fattr, res->server,
5586 !RPC_IS_ASYNC(rqstp->rq_task)); 5514 !RPC_IS_ASYNC(rqstp->rq_task));
5587out: 5515out:
5588 return status; 5516 return status;
@@ -5591,23 +5519,22 @@ out:
5591/* 5519/*
5592 * Decode LOCK response 5520 * Decode LOCK response
5593 */ 5521 */
5594static int nfs4_xdr_dec_lock(struct rpc_rqst *rqstp, __be32 *p, struct nfs_lock_res *res) 5522static int nfs4_xdr_dec_lock(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
5523 struct nfs_lock_res *res)
5595{ 5524{
5596 struct xdr_stream xdr;
5597 struct compound_hdr hdr; 5525 struct compound_hdr hdr;
5598 int status; 5526 int status;
5599 5527
5600 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 5528 status = decode_compound_hdr(xdr, &hdr);
5601 status = decode_compound_hdr(&xdr, &hdr);
5602 if (status) 5529 if (status)
5603 goto out; 5530 goto out;
5604 status = decode_sequence(&xdr, &res->seq_res, rqstp); 5531 status = decode_sequence(xdr, &res->seq_res, rqstp);
5605 if (status) 5532 if (status)
5606 goto out; 5533 goto out;
5607 status = decode_putfh(&xdr); 5534 status = decode_putfh(xdr);
5608 if (status) 5535 if (status)
5609 goto out; 5536 goto out;
5610 status = decode_lock(&xdr, res); 5537 status = decode_lock(xdr, res);
5611out: 5538out:
5612 return status; 5539 return status;
5613} 5540}
@@ -5615,23 +5542,22 @@ out:
5615/* 5542/*
5616 * Decode LOCKT response 5543 * Decode LOCKT response
5617 */ 5544 */
5618static int nfs4_xdr_dec_lockt(struct rpc_rqst *rqstp, __be32 *p, struct nfs_lockt_res *res) 5545static int nfs4_xdr_dec_lockt(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
5546 struct nfs_lockt_res *res)
5619{ 5547{
5620 struct xdr_stream xdr;
5621 struct compound_hdr hdr; 5548 struct compound_hdr hdr;
5622 int status; 5549 int status;
5623 5550
5624 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 5551 status = decode_compound_hdr(xdr, &hdr);
5625 status = decode_compound_hdr(&xdr, &hdr);
5626 if (status) 5552 if (status)
5627 goto out; 5553 goto out;
5628 status = decode_sequence(&xdr, &res->seq_res, rqstp); 5554 status = decode_sequence(xdr, &res->seq_res, rqstp);
5629 if (status) 5555 if (status)
5630 goto out; 5556 goto out;
5631 status = decode_putfh(&xdr); 5557 status = decode_putfh(xdr);
5632 if (status) 5558 if (status)
5633 goto out; 5559 goto out;
5634 status = decode_lockt(&xdr, res); 5560 status = decode_lockt(xdr, res);
5635out: 5561out:
5636 return status; 5562 return status;
5637} 5563}
@@ -5639,61 +5565,58 @@ out:
5639/* 5565/*
5640 * Decode LOCKU response 5566 * Decode LOCKU response
5641 */ 5567 */
5642static int nfs4_xdr_dec_locku(struct rpc_rqst *rqstp, __be32 *p, struct nfs_locku_res *res) 5568static int nfs4_xdr_dec_locku(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
5569 struct nfs_locku_res *res)
5643{ 5570{
5644 struct xdr_stream xdr;
5645 struct compound_hdr hdr; 5571 struct compound_hdr hdr;
5646 int status; 5572 int status;
5647 5573
5648 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 5574 status = decode_compound_hdr(xdr, &hdr);
5649 status = decode_compound_hdr(&xdr, &hdr);
5650 if (status) 5575 if (status)
5651 goto out; 5576 goto out;
5652 status = decode_sequence(&xdr, &res->seq_res, rqstp); 5577 status = decode_sequence(xdr, &res->seq_res, rqstp);
5653 if (status) 5578 if (status)
5654 goto out; 5579 goto out;
5655 status = decode_putfh(&xdr); 5580 status = decode_putfh(xdr);
5656 if (status) 5581 if (status)
5657 goto out; 5582 goto out;
5658 status = decode_locku(&xdr, res); 5583 status = decode_locku(xdr, res);
5659out: 5584out:
5660 return status; 5585 return status;
5661} 5586}
5662 5587
5663static int nfs4_xdr_dec_release_lockowner(struct rpc_rqst *rqstp, __be32 *p, void *dummy) 5588static int nfs4_xdr_dec_release_lockowner(struct rpc_rqst *rqstp,
5589 struct xdr_stream *xdr, void *dummy)
5664{ 5590{
5665 struct xdr_stream xdr;
5666 struct compound_hdr hdr; 5591 struct compound_hdr hdr;
5667 int status; 5592 int status;
5668 5593
5669 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 5594 status = decode_compound_hdr(xdr, &hdr);
5670 status = decode_compound_hdr(&xdr, &hdr);
5671 if (!status) 5595 if (!status)
5672 status = decode_release_lockowner(&xdr); 5596 status = decode_release_lockowner(xdr);
5673 return status; 5597 return status;
5674} 5598}
5675 5599
5676/* 5600/*
5677 * Decode READLINK response 5601 * Decode READLINK response
5678 */ 5602 */
5679static int nfs4_xdr_dec_readlink(struct rpc_rqst *rqstp, __be32 *p, 5603static int nfs4_xdr_dec_readlink(struct rpc_rqst *rqstp,
5604 struct xdr_stream *xdr,
5680 struct nfs4_readlink_res *res) 5605 struct nfs4_readlink_res *res)
5681{ 5606{
5682 struct xdr_stream xdr;
5683 struct compound_hdr hdr; 5607 struct compound_hdr hdr;
5684 int status; 5608 int status;
5685 5609
5686 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 5610 status = decode_compound_hdr(xdr, &hdr);
5687 status = decode_compound_hdr(&xdr, &hdr);
5688 if (status) 5611 if (status)
5689 goto out; 5612 goto out;
5690 status = decode_sequence(&xdr, &res->seq_res, rqstp); 5613 status = decode_sequence(xdr, &res->seq_res, rqstp);
5691 if (status) 5614 if (status)
5692 goto out; 5615 goto out;
5693 status = decode_putfh(&xdr); 5616 status = decode_putfh(xdr);
5694 if (status) 5617 if (status)
5695 goto out; 5618 goto out;
5696 status = decode_readlink(&xdr, rqstp); 5619 status = decode_readlink(xdr, rqstp);
5697out: 5620out:
5698 return status; 5621 return status;
5699} 5622}
@@ -5701,23 +5624,22 @@ out:
5701/* 5624/*
5702 * Decode READDIR response 5625 * Decode READDIR response
5703 */ 5626 */
5704static int nfs4_xdr_dec_readdir(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_readdir_res *res) 5627static int nfs4_xdr_dec_readdir(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
5628 struct nfs4_readdir_res *res)
5705{ 5629{
5706 struct xdr_stream xdr;
5707 struct compound_hdr hdr; 5630 struct compound_hdr hdr;
5708 int status; 5631 int status;
5709 5632
5710 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 5633 status = decode_compound_hdr(xdr, &hdr);
5711 status = decode_compound_hdr(&xdr, &hdr);
5712 if (status) 5634 if (status)
5713 goto out; 5635 goto out;
5714 status = decode_sequence(&xdr, &res->seq_res, rqstp); 5636 status = decode_sequence(xdr, &res->seq_res, rqstp);
5715 if (status) 5637 if (status)
5716 goto out; 5638 goto out;
5717 status = decode_putfh(&xdr); 5639 status = decode_putfh(xdr);
5718 if (status) 5640 if (status)
5719 goto out; 5641 goto out;
5720 status = decode_readdir(&xdr, rqstp, res); 5642 status = decode_readdir(xdr, rqstp, res);
5721out: 5643out:
5722 return status; 5644 return status;
5723} 5645}
@@ -5725,23 +5647,22 @@ out:
5725/* 5647/*
5726 * Decode Read response 5648 * Decode Read response
5727 */ 5649 */
5728static int nfs4_xdr_dec_read(struct rpc_rqst *rqstp, __be32 *p, struct nfs_readres *res) 5650static int nfs4_xdr_dec_read(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
5651 struct nfs_readres *res)
5729{ 5652{
5730 struct xdr_stream xdr;
5731 struct compound_hdr hdr; 5653 struct compound_hdr hdr;
5732 int status; 5654 int status;
5733 5655
5734 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 5656 status = decode_compound_hdr(xdr, &hdr);
5735 status = decode_compound_hdr(&xdr, &hdr);
5736 if (status) 5657 if (status)
5737 goto out; 5658 goto out;
5738 status = decode_sequence(&xdr, &res->seq_res, rqstp); 5659 status = decode_sequence(xdr, &res->seq_res, rqstp);
5739 if (status) 5660 if (status)
5740 goto out; 5661 goto out;
5741 status = decode_putfh(&xdr); 5662 status = decode_putfh(xdr);
5742 if (status) 5663 if (status)
5743 goto out; 5664 goto out;
5744 status = decode_read(&xdr, rqstp, res); 5665 status = decode_read(xdr, rqstp, res);
5745 if (!status) 5666 if (!status)
5746 status = res->count; 5667 status = res->count;
5747out: 5668out:
@@ -5751,26 +5672,25 @@ out:
5751/* 5672/*
5752 * Decode WRITE response 5673 * Decode WRITE response
5753 */ 5674 */
5754static int nfs4_xdr_dec_write(struct rpc_rqst *rqstp, __be32 *p, struct nfs_writeres *res) 5675static int nfs4_xdr_dec_write(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
5676 struct nfs_writeres *res)
5755{ 5677{
5756 struct xdr_stream xdr;
5757 struct compound_hdr hdr; 5678 struct compound_hdr hdr;
5758 int status; 5679 int status;
5759 5680
5760 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 5681 status = decode_compound_hdr(xdr, &hdr);
5761 status = decode_compound_hdr(&xdr, &hdr);
5762 if (status) 5682 if (status)
5763 goto out; 5683 goto out;
5764 status = decode_sequence(&xdr, &res->seq_res, rqstp); 5684 status = decode_sequence(xdr, &res->seq_res, rqstp);
5765 if (status) 5685 if (status)
5766 goto out; 5686 goto out;
5767 status = decode_putfh(&xdr); 5687 status = decode_putfh(xdr);
5768 if (status) 5688 if (status)
5769 goto out; 5689 goto out;
5770 status = decode_write(&xdr, res); 5690 status = decode_write(xdr, res);
5771 if (status) 5691 if (status)
5772 goto out; 5692 goto out;
5773 decode_getfattr(&xdr, res->fattr, res->server, 5693 decode_getfattr(xdr, res->fattr, res->server,
5774 !RPC_IS_ASYNC(rqstp->rq_task)); 5694 !RPC_IS_ASYNC(rqstp->rq_task));
5775 if (!status) 5695 if (!status)
5776 status = res->count; 5696 status = res->count;
@@ -5781,26 +5701,25 @@ out:
5781/* 5701/*
5782 * Decode COMMIT response 5702 * Decode COMMIT response
5783 */ 5703 */
5784static int nfs4_xdr_dec_commit(struct rpc_rqst *rqstp, __be32 *p, struct nfs_writeres *res) 5704static int nfs4_xdr_dec_commit(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
5705 struct nfs_writeres *res)
5785{ 5706{
5786 struct xdr_stream xdr;
5787 struct compound_hdr hdr; 5707 struct compound_hdr hdr;
5788 int status; 5708 int status;
5789 5709
5790 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 5710 status = decode_compound_hdr(xdr, &hdr);
5791 status = decode_compound_hdr(&xdr, &hdr);
5792 if (status) 5711 if (status)
5793 goto out; 5712 goto out;
5794 status = decode_sequence(&xdr, &res->seq_res, rqstp); 5713 status = decode_sequence(xdr, &res->seq_res, rqstp);
5795 if (status) 5714 if (status)
5796 goto out; 5715 goto out;
5797 status = decode_putfh(&xdr); 5716 status = decode_putfh(xdr);
5798 if (status) 5717 if (status)
5799 goto out; 5718 goto out;
5800 status = decode_commit(&xdr, res); 5719 status = decode_commit(xdr, res);
5801 if (status) 5720 if (status)
5802 goto out; 5721 goto out;
5803 decode_getfattr(&xdr, res->fattr, res->server, 5722 decode_getfattr(xdr, res->fattr, res->server,
5804 !RPC_IS_ASYNC(rqstp->rq_task)); 5723 !RPC_IS_ASYNC(rqstp->rq_task));
5805out: 5724out:
5806 return status; 5725 return status;
@@ -5809,85 +5728,80 @@ out:
5809/* 5728/*
5810 * Decode FSINFO response 5729 * Decode FSINFO response
5811 */ 5730 */
5812static int nfs4_xdr_dec_fsinfo(struct rpc_rqst *req, __be32 *p, 5731static int nfs4_xdr_dec_fsinfo(struct rpc_rqst *req, struct xdr_stream *xdr,
5813 struct nfs4_fsinfo_res *res) 5732 struct nfs4_fsinfo_res *res)
5814{ 5733{
5815 struct xdr_stream xdr;
5816 struct compound_hdr hdr; 5734 struct compound_hdr hdr;
5817 int status; 5735 int status;
5818 5736
5819 xdr_init_decode(&xdr, &req->rq_rcv_buf, p); 5737 status = decode_compound_hdr(xdr, &hdr);
5820 status = decode_compound_hdr(&xdr, &hdr);
5821 if (!status) 5738 if (!status)
5822 status = decode_sequence(&xdr, &res->seq_res, req); 5739 status = decode_sequence(xdr, &res->seq_res, req);
5823 if (!status) 5740 if (!status)
5824 status = decode_putfh(&xdr); 5741 status = decode_putfh(xdr);
5825 if (!status) 5742 if (!status)
5826 status = decode_fsinfo(&xdr, res->fsinfo); 5743 status = decode_fsinfo(xdr, res->fsinfo);
5827 return status; 5744 return status;
5828} 5745}
5829 5746
5830/* 5747/*
5831 * Decode PATHCONF response 5748 * Decode PATHCONF response
5832 */ 5749 */
5833static int nfs4_xdr_dec_pathconf(struct rpc_rqst *req, __be32 *p, 5750static int nfs4_xdr_dec_pathconf(struct rpc_rqst *req, struct xdr_stream *xdr,
5834 struct nfs4_pathconf_res *res) 5751 struct nfs4_pathconf_res *res)
5835{ 5752{
5836 struct xdr_stream xdr;
5837 struct compound_hdr hdr; 5753 struct compound_hdr hdr;
5838 int status; 5754 int status;
5839 5755
5840 xdr_init_decode(&xdr, &req->rq_rcv_buf, p); 5756 status = decode_compound_hdr(xdr, &hdr);
5841 status = decode_compound_hdr(&xdr, &hdr);
5842 if (!status) 5757 if (!status)
5843 status = decode_sequence(&xdr, &res->seq_res, req); 5758 status = decode_sequence(xdr, &res->seq_res, req);
5844 if (!status) 5759 if (!status)
5845 status = decode_putfh(&xdr); 5760 status = decode_putfh(xdr);
5846 if (!status) 5761 if (!status)
5847 status = decode_pathconf(&xdr, res->pathconf); 5762 status = decode_pathconf(xdr, res->pathconf);
5848 return status; 5763 return status;
5849} 5764}
5850 5765
5851/* 5766/*
5852 * Decode STATFS response 5767 * Decode STATFS response
5853 */ 5768 */
5854static int nfs4_xdr_dec_statfs(struct rpc_rqst *req, __be32 *p, 5769static int nfs4_xdr_dec_statfs(struct rpc_rqst *req, struct xdr_stream *xdr,
5855 struct nfs4_statfs_res *res) 5770 struct nfs4_statfs_res *res)
5856{ 5771{
5857 struct xdr_stream xdr;
5858 struct compound_hdr hdr; 5772 struct compound_hdr hdr;
5859 int status; 5773 int status;
5860 5774
5861 xdr_init_decode(&xdr, &req->rq_rcv_buf, p); 5775 status = decode_compound_hdr(xdr, &hdr);
5862 status = decode_compound_hdr(&xdr, &hdr);
5863 if (!status) 5776 if (!status)
5864 status = decode_sequence(&xdr, &res->seq_res, req); 5777 status = decode_sequence(xdr, &res->seq_res, req);
5865 if (!status) 5778 if (!status)
5866 status = decode_putfh(&xdr); 5779 status = decode_putfh(xdr);
5867 if (!status) 5780 if (!status)
5868 status = decode_statfs(&xdr, res->fsstat); 5781 status = decode_statfs(xdr, res->fsstat);
5869 return status; 5782 return status;
5870} 5783}
5871 5784
5872/* 5785/*
5873 * Decode GETATTR_BITMAP response 5786 * Decode GETATTR_BITMAP response
5874 */ 5787 */
5875static int nfs4_xdr_dec_server_caps(struct rpc_rqst *req, __be32 *p, struct nfs4_server_caps_res *res) 5788static int nfs4_xdr_dec_server_caps(struct rpc_rqst *req,
5789 struct xdr_stream *xdr,
5790 struct nfs4_server_caps_res *res)
5876{ 5791{
5877 struct xdr_stream xdr;
5878 struct compound_hdr hdr; 5792 struct compound_hdr hdr;
5879 int status; 5793 int status;
5880 5794
5881 xdr_init_decode(&xdr, &req->rq_rcv_buf, p); 5795 status = decode_compound_hdr(xdr, &hdr);
5882 status = decode_compound_hdr(&xdr, &hdr);
5883 if (status) 5796 if (status)
5884 goto out; 5797 goto out;
5885 status = decode_sequence(&xdr, &res->seq_res, req); 5798 status = decode_sequence(xdr, &res->seq_res, req);
5886 if (status) 5799 if (status)
5887 goto out; 5800 goto out;
5888 if ((status = decode_putfh(&xdr)) != 0) 5801 status = decode_putfh(xdr);
5802 if (status)
5889 goto out; 5803 goto out;
5890 status = decode_server_caps(&xdr, res); 5804 status = decode_server_caps(xdr, res);
5891out: 5805out:
5892 return status; 5806 return status;
5893} 5807}
@@ -5895,79 +5809,77 @@ out:
5895/* 5809/*
5896 * Decode RENEW response 5810 * Decode RENEW response
5897 */ 5811 */
5898static int nfs4_xdr_dec_renew(struct rpc_rqst *rqstp, __be32 *p, void *dummy) 5812static int nfs4_xdr_dec_renew(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
5813 void *__unused)
5899{ 5814{
5900 struct xdr_stream xdr;
5901 struct compound_hdr hdr; 5815 struct compound_hdr hdr;
5902 int status; 5816 int status;
5903 5817
5904 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 5818 status = decode_compound_hdr(xdr, &hdr);
5905 status = decode_compound_hdr(&xdr, &hdr);
5906 if (!status) 5819 if (!status)
5907 status = decode_renew(&xdr); 5820 status = decode_renew(xdr);
5908 return status; 5821 return status;
5909} 5822}
5910 5823
5911/* 5824/*
5912 * Decode SETCLIENTID response 5825 * Decode SETCLIENTID response
5913 */ 5826 */
5914static int nfs4_xdr_dec_setclientid(struct rpc_rqst *req, __be32 *p, 5827static int nfs4_xdr_dec_setclientid(struct rpc_rqst *req,
5915 struct nfs4_setclientid_res *res) 5828 struct xdr_stream *xdr,
5829 struct nfs4_setclientid_res *res)
5916{ 5830{
5917 struct xdr_stream xdr;
5918 struct compound_hdr hdr; 5831 struct compound_hdr hdr;
5919 int status; 5832 int status;
5920 5833
5921 xdr_init_decode(&xdr, &req->rq_rcv_buf, p); 5834 status = decode_compound_hdr(xdr, &hdr);
5922 status = decode_compound_hdr(&xdr, &hdr);
5923 if (!status) 5835 if (!status)
5924 status = decode_setclientid(&xdr, res); 5836 status = decode_setclientid(xdr, res);
5925 return status; 5837 return status;
5926} 5838}
5927 5839
5928/* 5840/*
5929 * Decode SETCLIENTID_CONFIRM response 5841 * Decode SETCLIENTID_CONFIRM response
5930 */ 5842 */
5931static int nfs4_xdr_dec_setclientid_confirm(struct rpc_rqst *req, __be32 *p, struct nfs_fsinfo *fsinfo) 5843static int nfs4_xdr_dec_setclientid_confirm(struct rpc_rqst *req,
5844 struct xdr_stream *xdr,
5845 struct nfs_fsinfo *fsinfo)
5932{ 5846{
5933 struct xdr_stream xdr;
5934 struct compound_hdr hdr; 5847 struct compound_hdr hdr;
5935 int status; 5848 int status;
5936 5849
5937 xdr_init_decode(&xdr, &req->rq_rcv_buf, p); 5850 status = decode_compound_hdr(xdr, &hdr);
5938 status = decode_compound_hdr(&xdr, &hdr);
5939 if (!status) 5851 if (!status)
5940 status = decode_setclientid_confirm(&xdr); 5852 status = decode_setclientid_confirm(xdr);
5941 if (!status) 5853 if (!status)
5942 status = decode_putrootfh(&xdr); 5854 status = decode_putrootfh(xdr);
5943 if (!status) 5855 if (!status)
5944 status = decode_fsinfo(&xdr, fsinfo); 5856 status = decode_fsinfo(xdr, fsinfo);
5945 return status; 5857 return status;
5946} 5858}
5947 5859
5948/* 5860/*
5949 * Decode DELEGRETURN response 5861 * Decode DELEGRETURN response
5950 */ 5862 */
5951static int nfs4_xdr_dec_delegreturn(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_delegreturnres *res) 5863static int nfs4_xdr_dec_delegreturn(struct rpc_rqst *rqstp,
5864 struct xdr_stream *xdr,
5865 struct nfs4_delegreturnres *res)
5952{ 5866{
5953 struct xdr_stream xdr;
5954 struct compound_hdr hdr; 5867 struct compound_hdr hdr;
5955 int status; 5868 int status;
5956 5869
5957 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 5870 status = decode_compound_hdr(xdr, &hdr);
5958 status = decode_compound_hdr(&xdr, &hdr);
5959 if (status) 5871 if (status)
5960 goto out; 5872 goto out;
5961 status = decode_sequence(&xdr, &res->seq_res, rqstp); 5873 status = decode_sequence(xdr, &res->seq_res, rqstp);
5962 if (status) 5874 if (status)
5963 goto out; 5875 goto out;
5964 status = decode_putfh(&xdr); 5876 status = decode_putfh(xdr);
5965 if (status != 0) 5877 if (status != 0)
5966 goto out; 5878 goto out;
5967 status = decode_delegreturn(&xdr); 5879 status = decode_delegreturn(xdr);
5968 if (status != 0) 5880 if (status != 0)
5969 goto out; 5881 goto out;
5970 decode_getfattr(&xdr, res->fattr, res->server, 5882 decode_getfattr(xdr, res->fattr, res->server,
5971 !RPC_IS_ASYNC(rqstp->rq_task)); 5883 !RPC_IS_ASYNC(rqstp->rq_task));
5972out: 5884out:
5973 return status; 5885 return status;
@@ -5976,26 +5888,27 @@ out:
5976/* 5888/*
5977 * Decode FS_LOCATIONS response 5889 * Decode FS_LOCATIONS response
5978 */ 5890 */
5979static int nfs4_xdr_dec_fs_locations(struct rpc_rqst *req, __be32 *p, 5891static int nfs4_xdr_dec_fs_locations(struct rpc_rqst *req,
5892 struct xdr_stream *xdr,
5980 struct nfs4_fs_locations_res *res) 5893 struct nfs4_fs_locations_res *res)
5981{ 5894{
5982 struct xdr_stream xdr;
5983 struct compound_hdr hdr; 5895 struct compound_hdr hdr;
5984 int status; 5896 int status;
5985 5897
5986 xdr_init_decode(&xdr, &req->rq_rcv_buf, p); 5898 status = decode_compound_hdr(xdr, &hdr);
5987 status = decode_compound_hdr(&xdr, &hdr);
5988 if (status) 5899 if (status)
5989 goto out; 5900 goto out;
5990 status = decode_sequence(&xdr, &res->seq_res, req); 5901 status = decode_sequence(xdr, &res->seq_res, req);
5991 if (status) 5902 if (status)
5992 goto out; 5903 goto out;
5993 if ((status = decode_putfh(&xdr)) != 0) 5904 status = decode_putfh(xdr);
5905 if (status)
5994 goto out; 5906 goto out;
5995 if ((status = decode_lookup(&xdr)) != 0) 5907 status = decode_lookup(xdr);
5908 if (status)
5996 goto out; 5909 goto out;
5997 xdr_enter_page(&xdr, PAGE_SIZE); 5910 xdr_enter_page(xdr, PAGE_SIZE);
5998 status = decode_getfattr(&xdr, &res->fs_locations->fattr, 5911 status = decode_getfattr(xdr, &res->fs_locations->fattr,
5999 res->fs_locations->server, 5912 res->fs_locations->server,
6000 !RPC_IS_ASYNC(req->rq_task)); 5913 !RPC_IS_ASYNC(req->rq_task));
6001out: 5914out:
@@ -6006,129 +5919,122 @@ out:
6006/* 5919/*
6007 * Decode EXCHANGE_ID response 5920 * Decode EXCHANGE_ID response
6008 */ 5921 */
6009static int nfs4_xdr_dec_exchange_id(struct rpc_rqst *rqstp, uint32_t *p, 5922static int nfs4_xdr_dec_exchange_id(struct rpc_rqst *rqstp,
5923 struct xdr_stream *xdr,
6010 void *res) 5924 void *res)
6011{ 5925{
6012 struct xdr_stream xdr;
6013 struct compound_hdr hdr; 5926 struct compound_hdr hdr;
6014 int status; 5927 int status;
6015 5928
6016 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 5929 status = decode_compound_hdr(xdr, &hdr);
6017 status = decode_compound_hdr(&xdr, &hdr);
6018 if (!status) 5930 if (!status)
6019 status = decode_exchange_id(&xdr, res); 5931 status = decode_exchange_id(xdr, res);
6020 return status; 5932 return status;
6021} 5933}
6022 5934
6023/* 5935/*
6024 * Decode CREATE_SESSION response 5936 * Decode CREATE_SESSION response
6025 */ 5937 */
6026static int nfs4_xdr_dec_create_session(struct rpc_rqst *rqstp, uint32_t *p, 5938static int nfs4_xdr_dec_create_session(struct rpc_rqst *rqstp,
5939 struct xdr_stream *xdr,
6027 struct nfs41_create_session_res *res) 5940 struct nfs41_create_session_res *res)
6028{ 5941{
6029 struct xdr_stream xdr;
6030 struct compound_hdr hdr; 5942 struct compound_hdr hdr;
6031 int status; 5943 int status;
6032 5944
6033 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 5945 status = decode_compound_hdr(xdr, &hdr);
6034 status = decode_compound_hdr(&xdr, &hdr);
6035 if (!status) 5946 if (!status)
6036 status = decode_create_session(&xdr, res); 5947 status = decode_create_session(xdr, res);
6037 return status; 5948 return status;
6038} 5949}
6039 5950
6040/* 5951/*
6041 * Decode DESTROY_SESSION response 5952 * Decode DESTROY_SESSION response
6042 */ 5953 */
6043static int nfs4_xdr_dec_destroy_session(struct rpc_rqst *rqstp, uint32_t *p, 5954static int nfs4_xdr_dec_destroy_session(struct rpc_rqst *rqstp,
6044 void *dummy) 5955 struct xdr_stream *xdr,
5956 void *res)
6045{ 5957{
6046 struct xdr_stream xdr;
6047 struct compound_hdr hdr; 5958 struct compound_hdr hdr;
6048 int status; 5959 int status;
6049 5960
6050 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 5961 status = decode_compound_hdr(xdr, &hdr);
6051 status = decode_compound_hdr(&xdr, &hdr);
6052 if (!status) 5962 if (!status)
6053 status = decode_destroy_session(&xdr, dummy); 5963 status = decode_destroy_session(xdr, res);
6054 return status; 5964 return status;
6055} 5965}
6056 5966
6057/* 5967/*
6058 * Decode SEQUENCE response 5968 * Decode SEQUENCE response
6059 */ 5969 */
6060static int nfs4_xdr_dec_sequence(struct rpc_rqst *rqstp, uint32_t *p, 5970static int nfs4_xdr_dec_sequence(struct rpc_rqst *rqstp,
5971 struct xdr_stream *xdr,
6061 struct nfs4_sequence_res *res) 5972 struct nfs4_sequence_res *res)
6062{ 5973{
6063 struct xdr_stream xdr;
6064 struct compound_hdr hdr; 5974 struct compound_hdr hdr;
6065 int status; 5975 int status;
6066 5976
6067 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 5977 status = decode_compound_hdr(xdr, &hdr);
6068 status = decode_compound_hdr(&xdr, &hdr);
6069 if (!status) 5978 if (!status)
6070 status = decode_sequence(&xdr, res, rqstp); 5979 status = decode_sequence(xdr, res, rqstp);
6071 return status; 5980 return status;
6072} 5981}
6073 5982
6074/* 5983/*
6075 * Decode GET_LEASE_TIME response 5984 * Decode GET_LEASE_TIME response
6076 */ 5985 */
6077static int nfs4_xdr_dec_get_lease_time(struct rpc_rqst *rqstp, uint32_t *p, 5986static int nfs4_xdr_dec_get_lease_time(struct rpc_rqst *rqstp,
5987 struct xdr_stream *xdr,
6078 struct nfs4_get_lease_time_res *res) 5988 struct nfs4_get_lease_time_res *res)
6079{ 5989{
6080 struct xdr_stream xdr;
6081 struct compound_hdr hdr; 5990 struct compound_hdr hdr;
6082 int status; 5991 int status;
6083 5992
6084 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 5993 status = decode_compound_hdr(xdr, &hdr);
6085 status = decode_compound_hdr(&xdr, &hdr);
6086 if (!status) 5994 if (!status)
6087 status = decode_sequence(&xdr, &res->lr_seq_res, rqstp); 5995 status = decode_sequence(xdr, &res->lr_seq_res, rqstp);
6088 if (!status) 5996 if (!status)
6089 status = decode_putrootfh(&xdr); 5997 status = decode_putrootfh(xdr);
6090 if (!status) 5998 if (!status)
6091 status = decode_fsinfo(&xdr, res->lr_fsinfo); 5999 status = decode_fsinfo(xdr, res->lr_fsinfo);
6092 return status; 6000 return status;
6093} 6001}
6094 6002
6095/* 6003/*
6096 * Decode RECLAIM_COMPLETE response 6004 * Decode RECLAIM_COMPLETE response
6097 */ 6005 */
6098static int nfs4_xdr_dec_reclaim_complete(struct rpc_rqst *rqstp, uint32_t *p, 6006static int nfs4_xdr_dec_reclaim_complete(struct rpc_rqst *rqstp,
6007 struct xdr_stream *xdr,
6099 struct nfs41_reclaim_complete_res *res) 6008 struct nfs41_reclaim_complete_res *res)
6100{ 6009{
6101 struct xdr_stream xdr;
6102 struct compound_hdr hdr; 6010 struct compound_hdr hdr;
6103 int status; 6011 int status;
6104 6012
6105 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 6013 status = decode_compound_hdr(xdr, &hdr);
6106 status = decode_compound_hdr(&xdr, &hdr);
6107 if (!status) 6014 if (!status)
6108 status = decode_sequence(&xdr, &res->seq_res, rqstp); 6015 status = decode_sequence(xdr, &res->seq_res, rqstp);
6109 if (!status) 6016 if (!status)
6110 status = decode_reclaim_complete(&xdr, (void *)NULL); 6017 status = decode_reclaim_complete(xdr, (void *)NULL);
6111 return status; 6018 return status;
6112} 6019}
6113 6020
6114/* 6021/*
6115 * Decode GETDEVINFO response 6022 * Decode GETDEVINFO response
6116 */ 6023 */
6117static int nfs4_xdr_dec_getdeviceinfo(struct rpc_rqst *rqstp, uint32_t *p, 6024static int nfs4_xdr_dec_getdeviceinfo(struct rpc_rqst *rqstp,
6025 struct xdr_stream *xdr,
6118 struct nfs4_getdeviceinfo_res *res) 6026 struct nfs4_getdeviceinfo_res *res)
6119{ 6027{
6120 struct xdr_stream xdr;
6121 struct compound_hdr hdr; 6028 struct compound_hdr hdr;
6122 int status; 6029 int status;
6123 6030
6124 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 6031 status = decode_compound_hdr(xdr, &hdr);
6125 status = decode_compound_hdr(&xdr, &hdr);
6126 if (status != 0) 6032 if (status != 0)
6127 goto out; 6033 goto out;
6128 status = decode_sequence(&xdr, &res->seq_res, rqstp); 6034 status = decode_sequence(xdr, &res->seq_res, rqstp);
6129 if (status != 0) 6035 if (status != 0)
6130 goto out; 6036 goto out;
6131 status = decode_getdeviceinfo(&xdr, res->pdev); 6037 status = decode_getdeviceinfo(xdr, res->pdev);
6132out: 6038out:
6133 return status; 6039 return status;
6134} 6040}
@@ -6136,31 +6042,44 @@ out:
6136/* 6042/*
6137 * Decode LAYOUTGET response 6043 * Decode LAYOUTGET response
6138 */ 6044 */
6139static int nfs4_xdr_dec_layoutget(struct rpc_rqst *rqstp, uint32_t *p, 6045static int nfs4_xdr_dec_layoutget(struct rpc_rqst *rqstp,
6046 struct xdr_stream *xdr,
6140 struct nfs4_layoutget_res *res) 6047 struct nfs4_layoutget_res *res)
6141{ 6048{
6142 struct xdr_stream xdr;
6143 struct compound_hdr hdr; 6049 struct compound_hdr hdr;
6144 int status; 6050 int status;
6145 6051
6146 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 6052 status = decode_compound_hdr(xdr, &hdr);
6147 status = decode_compound_hdr(&xdr, &hdr);
6148 if (status) 6053 if (status)
6149 goto out; 6054 goto out;
6150 status = decode_sequence(&xdr, &res->seq_res, rqstp); 6055 status = decode_sequence(xdr, &res->seq_res, rqstp);
6151 if (status) 6056 if (status)
6152 goto out; 6057 goto out;
6153 status = decode_putfh(&xdr); 6058 status = decode_putfh(xdr);
6154 if (status) 6059 if (status)
6155 goto out; 6060 goto out;
6156 status = decode_layoutget(&xdr, rqstp, res); 6061 status = decode_layoutget(xdr, rqstp, res);
6157out: 6062out:
6158 return status; 6063 return status;
6159} 6064}
6160#endif /* CONFIG_NFS_V4_1 */ 6065#endif /* CONFIG_NFS_V4_1 */
6161 6066
6162__be32 *nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, 6067/**
6163 struct nfs_server *server, int plus) 6068 * nfs4_decode_dirent - Decode a single NFSv4 directory entry stored in
6069 * the local page cache.
6070 * @xdr: XDR stream where entry resides
6071 * @entry: buffer to fill in with entry data
6072 * @plus: boolean indicating whether this should be a readdirplus entry
6073 *
 6074 * Returns zero if successful, otherwise a negative
 6075 * errno value.
6076 *
6077 * This function is not invoked during READDIR reply decoding, but
6078 * rather whenever an application invokes the getdents(2) system call
6079 * on a directory already in our cache.
6080 */
6081int nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
6082 int plus)
6164{ 6083{
6165 uint32_t bitmap[2] = {0}; 6084 uint32_t bitmap[2] = {0};
6166 uint32_t len; 6085 uint32_t len;
@@ -6172,9 +6091,9 @@ __be32 *nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
6172 if (unlikely(!p)) 6091 if (unlikely(!p))
6173 goto out_overflow; 6092 goto out_overflow;
6174 if (!ntohl(*p++)) 6093 if (!ntohl(*p++))
6175 return ERR_PTR(-EAGAIN); 6094 return -EAGAIN;
6176 entry->eof = 1; 6095 entry->eof = 1;
6177 return ERR_PTR(-EBADCOOKIE); 6096 return -EBADCOOKIE;
6178 } 6097 }
6179 6098
6180 p = xdr_inline_decode(xdr, 12); 6099 p = xdr_inline_decode(xdr, 12);
@@ -6203,7 +6122,8 @@ __be32 *nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
6203 if (decode_attr_length(xdr, &len, &p) < 0) 6122 if (decode_attr_length(xdr, &len, &p) < 0)
6204 goto out_overflow; 6123 goto out_overflow;
6205 6124
6206 if (decode_getfattr_attrs(xdr, bitmap, entry->fattr, entry->fh, server, 1) < 0) 6125 if (decode_getfattr_attrs(xdr, bitmap, entry->fattr, entry->fh,
6126 entry->server, 1) < 0)
6207 goto out_overflow; 6127 goto out_overflow;
6208 if (entry->fattr->valid & NFS_ATTR_FATTR_FILEID) 6128 if (entry->fattr->valid & NFS_ATTR_FATTR_FILEID)
6209 entry->ino = entry->fattr->fileid; 6129 entry->ino = entry->fattr->fileid;
@@ -6215,17 +6135,11 @@ __be32 *nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
6215 if (verify_attr_len(xdr, p, len) < 0) 6135 if (verify_attr_len(xdr, p, len) < 0)
6216 goto out_overflow; 6136 goto out_overflow;
6217 6137
6218 p = xdr_inline_peek(xdr, 8); 6138 return 0;
6219 if (p != NULL)
6220 entry->eof = !p[0] && p[1];
6221 else
6222 entry->eof = 0;
6223
6224 return p;
6225 6139
6226out_overflow: 6140out_overflow:
6227 print_overflow_msg(__func__, xdr); 6141 print_overflow_msg(__func__, xdr);
6228 return ERR_PTR(-EAGAIN); 6142 return -EAGAIN;
6229} 6143}
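
Since nfs4_decode_dirent() now returns an int instead of an ERR_PTR-encoded __be32 *, its callers switch from IS_ERR()/PTR_ERR() checks to plain negative-errno handling. A minimal sketch of a consuming loop (hypothetical; the real caller lives in fs/nfs/dir.c and differs in detail):

/* Hypothetical consumer, for illustration only. Conventions as visible
 * above: 0 = one entry decoded, -EBADCOOKIE with entry->eof set = clean
 * end of directory, -EAGAIN = no further entry decodable here. */
static int fill_entries_sketch(struct xdr_stream *xdr,
			       struct nfs_entry *entry, int plus)
{
	int status;

	while ((status = nfs4_decode_dirent(xdr, entry, plus)) == 0) {
		/* ... copy *entry into the getdents(2) buffer ... */
	}
	if (status == -EBADCOOKIE && entry->eof)
		status = 0;
	return status;
}
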
6230 6144
6231/* 6145/*
@@ -6301,8 +6215,8 @@ nfs4_stat_to_errno(int stat)
6301#define PROC(proc, argtype, restype) \ 6215#define PROC(proc, argtype, restype) \
6302[NFSPROC4_CLNT_##proc] = { \ 6216[NFSPROC4_CLNT_##proc] = { \
6303 .p_proc = NFSPROC4_COMPOUND, \ 6217 .p_proc = NFSPROC4_COMPOUND, \
6304 .p_encode = (kxdrproc_t) nfs4_xdr_##argtype, \ 6218 .p_encode = (kxdreproc_t)nfs4_xdr_##argtype, \
6305 .p_decode = (kxdrproc_t) nfs4_xdr_##restype, \ 6219 .p_decode = (kxdrdproc_t)nfs4_xdr_##restype, \
6306 .p_arglen = NFS4_##argtype##_sz, \ 6220 .p_arglen = NFS4_##argtype##_sz, \
6307 .p_replen = NFS4_##restype##_sz, \ 6221 .p_replen = NFS4_##restype##_sz, \
6308 .p_statidx = NFSPROC4_CLNT_##proc, \ 6222 .p_statidx = NFSPROC4_CLNT_##proc, \
@@ -6310,50 +6224,50 @@ nfs4_stat_to_errno(int stat)
6310} 6224}
6311 6225
6312struct rpc_procinfo nfs4_procedures[] = { 6226struct rpc_procinfo nfs4_procedures[] = {
6313 PROC(READ, enc_read, dec_read), 6227 PROC(READ, enc_read, dec_read),
6314 PROC(WRITE, enc_write, dec_write), 6228 PROC(WRITE, enc_write, dec_write),
6315 PROC(COMMIT, enc_commit, dec_commit), 6229 PROC(COMMIT, enc_commit, dec_commit),
6316 PROC(OPEN, enc_open, dec_open), 6230 PROC(OPEN, enc_open, dec_open),
6317 PROC(OPEN_CONFIRM, enc_open_confirm, dec_open_confirm), 6231 PROC(OPEN_CONFIRM, enc_open_confirm, dec_open_confirm),
6318 PROC(OPEN_NOATTR, enc_open_noattr, dec_open_noattr), 6232 PROC(OPEN_NOATTR, enc_open_noattr, dec_open_noattr),
6319 PROC(OPEN_DOWNGRADE, enc_open_downgrade, dec_open_downgrade), 6233 PROC(OPEN_DOWNGRADE, enc_open_downgrade, dec_open_downgrade),
6320 PROC(CLOSE, enc_close, dec_close), 6234 PROC(CLOSE, enc_close, dec_close),
6321 PROC(SETATTR, enc_setattr, dec_setattr), 6235 PROC(SETATTR, enc_setattr, dec_setattr),
6322 PROC(FSINFO, enc_fsinfo, dec_fsinfo), 6236 PROC(FSINFO, enc_fsinfo, dec_fsinfo),
6323 PROC(RENEW, enc_renew, dec_renew), 6237 PROC(RENEW, enc_renew, dec_renew),
6324 PROC(SETCLIENTID, enc_setclientid, dec_setclientid), 6238 PROC(SETCLIENTID, enc_setclientid, dec_setclientid),
6325 PROC(SETCLIENTID_CONFIRM, enc_setclientid_confirm, dec_setclientid_confirm), 6239 PROC(SETCLIENTID_CONFIRM, enc_setclientid_confirm, dec_setclientid_confirm),
6326 PROC(LOCK, enc_lock, dec_lock), 6240 PROC(LOCK, enc_lock, dec_lock),
6327 PROC(LOCKT, enc_lockt, dec_lockt), 6241 PROC(LOCKT, enc_lockt, dec_lockt),
6328 PROC(LOCKU, enc_locku, dec_locku), 6242 PROC(LOCKU, enc_locku, dec_locku),
6329 PROC(ACCESS, enc_access, dec_access), 6243 PROC(ACCESS, enc_access, dec_access),
6330 PROC(GETATTR, enc_getattr, dec_getattr), 6244 PROC(GETATTR, enc_getattr, dec_getattr),
6331 PROC(LOOKUP, enc_lookup, dec_lookup), 6245 PROC(LOOKUP, enc_lookup, dec_lookup),
6332 PROC(LOOKUP_ROOT, enc_lookup_root, dec_lookup_root), 6246 PROC(LOOKUP_ROOT, enc_lookup_root, dec_lookup_root),
6333 PROC(REMOVE, enc_remove, dec_remove), 6247 PROC(REMOVE, enc_remove, dec_remove),
6334 PROC(RENAME, enc_rename, dec_rename), 6248 PROC(RENAME, enc_rename, dec_rename),
6335 PROC(LINK, enc_link, dec_link), 6249 PROC(LINK, enc_link, dec_link),
6336 PROC(SYMLINK, enc_symlink, dec_symlink), 6250 PROC(SYMLINK, enc_symlink, dec_symlink),
6337 PROC(CREATE, enc_create, dec_create), 6251 PROC(CREATE, enc_create, dec_create),
6338 PROC(PATHCONF, enc_pathconf, dec_pathconf), 6252 PROC(PATHCONF, enc_pathconf, dec_pathconf),
6339 PROC(STATFS, enc_statfs, dec_statfs), 6253 PROC(STATFS, enc_statfs, dec_statfs),
6340 PROC(READLINK, enc_readlink, dec_readlink), 6254 PROC(READLINK, enc_readlink, dec_readlink),
6341 PROC(READDIR, enc_readdir, dec_readdir), 6255 PROC(READDIR, enc_readdir, dec_readdir),
6342 PROC(SERVER_CAPS, enc_server_caps, dec_server_caps), 6256 PROC(SERVER_CAPS, enc_server_caps, dec_server_caps),
6343 PROC(DELEGRETURN, enc_delegreturn, dec_delegreturn), 6257 PROC(DELEGRETURN, enc_delegreturn, dec_delegreturn),
6344 PROC(GETACL, enc_getacl, dec_getacl), 6258 PROC(GETACL, enc_getacl, dec_getacl),
6345 PROC(SETACL, enc_setacl, dec_setacl), 6259 PROC(SETACL, enc_setacl, dec_setacl),
6346 PROC(FS_LOCATIONS, enc_fs_locations, dec_fs_locations), 6260 PROC(FS_LOCATIONS, enc_fs_locations, dec_fs_locations),
6347 PROC(RELEASE_LOCKOWNER, enc_release_lockowner, dec_release_lockowner), 6261 PROC(RELEASE_LOCKOWNER, enc_release_lockowner, dec_release_lockowner),
6348#if defined(CONFIG_NFS_V4_1) 6262#if defined(CONFIG_NFS_V4_1)
6349 PROC(EXCHANGE_ID, enc_exchange_id, dec_exchange_id), 6263 PROC(EXCHANGE_ID, enc_exchange_id, dec_exchange_id),
6350 PROC(CREATE_SESSION, enc_create_session, dec_create_session), 6264 PROC(CREATE_SESSION, enc_create_session, dec_create_session),
6351 PROC(DESTROY_SESSION, enc_destroy_session, dec_destroy_session), 6265 PROC(DESTROY_SESSION, enc_destroy_session, dec_destroy_session),
6352 PROC(SEQUENCE, enc_sequence, dec_sequence), 6266 PROC(SEQUENCE, enc_sequence, dec_sequence),
6353 PROC(GET_LEASE_TIME, enc_get_lease_time, dec_get_lease_time), 6267 PROC(GET_LEASE_TIME, enc_get_lease_time, dec_get_lease_time),
6354 PROC(RECLAIM_COMPLETE, enc_reclaim_complete, dec_reclaim_complete), 6268 PROC(RECLAIM_COMPLETE, enc_reclaim_complete, dec_reclaim_complete),
6355 PROC(GETDEVICEINFO, enc_getdeviceinfo, dec_getdeviceinfo), 6269 PROC(GETDEVICEINFO, enc_getdeviceinfo, dec_getdeviceinfo),
6356 PROC(LAYOUTGET, enc_layoutget, dec_layoutget), 6270 PROC(LAYOUTGET, enc_layoutget, dec_layoutget),
6357#endif /* CONFIG_NFS_V4_1 */ 6271#endif /* CONFIG_NFS_V4_1 */
6358}; 6272};
6359 6273
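The cast change in the PROC() macro above is the visible edge of a type split in SUNRPC: the old catch-all kxdrproc_t is replaced by separate encoder and decoder function-pointer types, matching the void-returning encoders and int-returning decoders produced by this conversion. Roughly (declarations paraphrased from the same kernel generation; treat the exact parameter types as assumptions):

/* Paraphrased, not quoted: the function-pointer types the PROC() casts
 * now target. Encoders return void; decoders return an errno status. */
typedef void (*kxdreproc_t)(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
			    void *obj);
typedef int (*kxdrdproc_t)(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
			   void *obj);
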
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index b68536cc9046..e1164e3f9e69 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -26,12 +26,9 @@ static struct kmem_cache *nfs_page_cachep;
26static inline struct nfs_page * 26static inline struct nfs_page *
27nfs_page_alloc(void) 27nfs_page_alloc(void)
28{ 28{
29 struct nfs_page *p; 29 struct nfs_page *p = kmem_cache_zalloc(nfs_page_cachep, GFP_KERNEL);
30 p = kmem_cache_alloc(nfs_page_cachep, GFP_KERNEL); 30 if (p)
31 if (p) {
32 memset(p, 0, sizeof(*p));
33 INIT_LIST_HEAD(&p->wb_list); 31 INIT_LIST_HEAD(&p->wb_list);
34 }
35 return p; 32 return p;
36} 33}
37 34
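The pagelist.c hunk is a pure simplification: kmem_cache_zalloc() allocates and zeroes in one call, so the open-coded memset() goes away. The helper is equivalent to requesting zero-filled memory from the allocator, roughly:

/* Sketch of the equivalence relied on above: kmem_cache_zalloc() is
 * kmem_cache_alloc() with zero-fill requested. */
static inline void *kmem_cache_zalloc_sketch(struct kmem_cache *c, gfp_t flags)
{
	return kmem_cache_alloc(c, flags | __GFP_ZERO);
}
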
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index db773428f95f..bc4089769735 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -177,105 +177,149 @@ EXPORT_SYMBOL_GPL(pnfs_unregister_layoutdriver);
177 * pNFS client layout cache 177 * pNFS client layout cache
178 */ 178 */
179 179
180/* Need to hold i_lock if caller does not already hold reference */
181void
182get_layout_hdr(struct pnfs_layout_hdr *lo)
183{
184 atomic_inc(&lo->plh_refcount);
185}
186
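The comment on the new get_layout_hdr() is terse; the rule it encodes is the usual one for lock-protected caches: a bare plh_refcount increment is safe only if you already hold a reference, and otherwise you must hold i_lock so that a concurrent final put cannot free the header between lookup and get. Sketch of the safe lookup pattern (assumed usage, not quoted from this patch):

/* Illustrative only: find the layout header via the inode and pin it.
 * Holding i_lock across lookup + get is what makes this safe. */
static struct pnfs_layout_hdr *pin_layout_hdr(struct inode *inode)
{
	struct pnfs_layout_hdr *lo;

	spin_lock(&inode->i_lock);
	lo = NFS_I(inode)->layout;
	if (lo)
		get_layout_hdr(lo);
	spin_unlock(&inode->i_lock);
	return lo;
}
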
180static void 187static void
181get_layout_hdr_locked(struct pnfs_layout_hdr *lo) 188destroy_layout_hdr(struct pnfs_layout_hdr *lo)
182{ 189{
183 assert_spin_locked(&lo->inode->i_lock); 190 dprintk("%s: freeing layout cache %p\n", __func__, lo);
184 lo->refcount++; 191 BUG_ON(!list_empty(&lo->plh_layouts));
192 NFS_I(lo->plh_inode)->layout = NULL;
193 kfree(lo);
185} 194}
186 195
187static void 196static void
188put_layout_hdr_locked(struct pnfs_layout_hdr *lo) 197put_layout_hdr_locked(struct pnfs_layout_hdr *lo)
189{ 198{
190 assert_spin_locked(&lo->inode->i_lock); 199 if (atomic_dec_and_test(&lo->plh_refcount))
191 BUG_ON(lo->refcount == 0); 200 destroy_layout_hdr(lo);
192
193 lo->refcount--;
194 if (!lo->refcount) {
195 dprintk("%s: freeing layout cache %p\n", __func__, lo);
196 BUG_ON(!list_empty(&lo->layouts));
197 NFS_I(lo->inode)->layout = NULL;
198 kfree(lo);
199 }
200} 201}
201 202
202void 203void
203put_layout_hdr(struct inode *inode) 204put_layout_hdr(struct pnfs_layout_hdr *lo)
204{ 205{
205 spin_lock(&inode->i_lock); 206 struct inode *inode = lo->plh_inode;
206 put_layout_hdr_locked(NFS_I(inode)->layout); 207
207 spin_unlock(&inode->i_lock); 208 if (atomic_dec_and_lock(&lo->plh_refcount, &inode->i_lock)) {
209 destroy_layout_hdr(lo);
210 spin_unlock(&inode->i_lock);
211 }
208} 212}
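
put_layout_hdr() now uses the classic atomic_dec_and_lock() idiom: the common-case decrement takes no lock at all, and i_lock is acquired atomically with the final decrement only when the count is about to reach zero, so teardown is serialized against lookups done under that lock. A runnable user-space analog of the idiom (a sketch of the pattern, with a pthread mutex standing in for the spinlock; this is not kernel code):

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct obj {
	atomic_int refcount;
	pthread_mutex_t *table_lock;	/* stands in for inode->i_lock */
};

/* Analog of atomic_dec_and_lock(): returns 1 with *lock held if and
 * only if the counter dropped to zero. */
static int dec_and_lock(atomic_int *cnt, pthread_mutex_t *lock)
{
	int old = atomic_load(cnt);

	/* Fast path: not the last reference, so avoid the lock. */
	while (old > 1) {
		if (atomic_compare_exchange_weak(cnt, &old, old - 1))
			return 0;
	}
	/* Slow path: take the lock, then perform the final decrement. */
	pthread_mutex_lock(lock);
	if (atomic_fetch_sub(cnt, 1) == 1)
		return 1;	/* caller unlocks after teardown */
	pthread_mutex_unlock(lock);
	return 0;
}

static void put_obj(struct obj *o)
{
	pthread_mutex_t *lock = o->table_lock;

	if (dec_and_lock(&o->refcount, lock)) {
		/* unhashing/teardown happens here, under the lock */
		free(o);
		pthread_mutex_unlock(lock);
	}
}

int main(void)
{
	static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
	struct obj *o = malloc(sizeof(*o));

	atomic_init(&o->refcount, 2);
	o->table_lock = &lock;
	put_obj(o);	/* fast path: lock never taken */
	put_obj(o);	/* last reference: frees under the lock */
	puts("done");
	return 0;
}
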
209 213
210static void 214static void
211init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg) 215init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg)
212{ 216{
213 INIT_LIST_HEAD(&lseg->fi_list); 217 INIT_LIST_HEAD(&lseg->pls_list);
214 kref_init(&lseg->kref); 218 atomic_set(&lseg->pls_refcount, 1);
215 lseg->layout = lo; 219 smp_mb();
220 set_bit(NFS_LSEG_VALID, &lseg->pls_flags);
221 lseg->pls_layout = lo;
216} 222}
217 223
218/* Called without i_lock held, as the free_lseg call may sleep */ 224static void free_lseg(struct pnfs_layout_segment *lseg)
219static void
220destroy_lseg(struct kref *kref)
221{ 225{
222 struct pnfs_layout_segment *lseg = 226 struct inode *ino = lseg->pls_layout->plh_inode;
223 container_of(kref, struct pnfs_layout_segment, kref);
224 struct inode *ino = lseg->layout->inode;
225 227
226 dprintk("--> %s\n", __func__);
227 NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg); 228 NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg);
228 /* Matched by get_layout_hdr_locked in pnfs_insert_layout */ 229 /* Matched by get_layout_hdr in pnfs_insert_layout */
229 put_layout_hdr(ino); 230 put_layout_hdr(NFS_I(ino)->layout);
230} 231}
231 232
232static void 233/* The use of tmp_list is necessary because pnfs_curr_ld->free_lseg
233put_lseg(struct pnfs_layout_segment *lseg) 234 * could sleep, so must be called outside of the lock.
235 * Returns 1 if object was removed, otherwise return 0.
236 */
237static int
238put_lseg_locked(struct pnfs_layout_segment *lseg,
239 struct list_head *tmp_list)
240{
241 dprintk("%s: lseg %p ref %d valid %d\n", __func__, lseg,
242 atomic_read(&lseg->pls_refcount),
243 test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
244 if (atomic_dec_and_test(&lseg->pls_refcount)) {
245 struct inode *ino = lseg->pls_layout->plh_inode;
246
247 BUG_ON(test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
248 list_del(&lseg->pls_list);
249 if (list_empty(&lseg->pls_layout->plh_segs)) {
250 struct nfs_client *clp;
251
252 clp = NFS_SERVER(ino)->nfs_client;
253 spin_lock(&clp->cl_lock);
254 /* List does not take a reference, so no need for put here */
255 list_del_init(&lseg->pls_layout->plh_layouts);
256 spin_unlock(&clp->cl_lock);
257 clear_bit(NFS_LAYOUT_BULK_RECALL, &lseg->pls_layout->plh_flags);
258 }
259 rpc_wake_up(&NFS_SERVER(ino)->roc_rpcwaitq);
260 list_add(&lseg->pls_list, tmp_list);
261 return 1;
262 }
263 return 0;
264}
265
266static bool
267should_free_lseg(u32 lseg_iomode, u32 recall_iomode)
234{ 268{
235 if (!lseg) 269 return (recall_iomode == IOMODE_ANY ||
236 return; 270 lseg_iomode == recall_iomode);
271}
237 272
238 dprintk("%s: lseg %p ref %d\n", __func__, lseg, 273/* Returns 1 if lseg is removed from list, 0 otherwise */
239 atomic_read(&lseg->kref.refcount)); 274static int mark_lseg_invalid(struct pnfs_layout_segment *lseg,
240 kref_put(&lseg->kref, destroy_lseg); 275 struct list_head *tmp_list)
276{
277 int rv = 0;
278
279 if (test_and_clear_bit(NFS_LSEG_VALID, &lseg->pls_flags)) {
280 /* Remove the reference keeping the lseg in the
281 * list. It will now be removed when all
282 * outstanding io is finished.
283 */
284 rv = put_lseg_locked(lseg, tmp_list);
285 }
286 return rv;
241} 287}
242 288
243static void 289/* Returns count of number of matching invalid lsegs remaining in list
244pnfs_clear_lseg_list(struct pnfs_layout_hdr *lo, struct list_head *tmp_list) 290 * after call.
291 */
292int
293mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
294 struct list_head *tmp_list,
295 u32 iomode)
245{ 296{
246 struct pnfs_layout_segment *lseg, *next; 297 struct pnfs_layout_segment *lseg, *next;
247 struct nfs_client *clp; 298 int invalid = 0, removed = 0;
248 299
249 dprintk("%s:Begin lo %p\n", __func__, lo); 300 dprintk("%s:Begin lo %p\n", __func__, lo);
250 301
251 assert_spin_locked(&lo->inode->i_lock); 302 list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list)
252 list_for_each_entry_safe(lseg, next, &lo->segs, fi_list) { 303 if (should_free_lseg(lseg->pls_range.iomode, iomode)) {
253 dprintk("%s: freeing lseg %p\n", __func__, lseg); 304 dprintk("%s: freeing lseg %p iomode %d "
254 list_move(&lseg->fi_list, tmp_list); 305 "offset %llu length %llu\n", __func__,
255 } 306 lseg, lseg->pls_range.iomode, lseg->pls_range.offset,
256 clp = NFS_SERVER(lo->inode)->nfs_client; 307 lseg->pls_range.length);
257 spin_lock(&clp->cl_lock); 308 invalid++;
258 /* List does not take a reference, so no need for put here */ 309 removed += mark_lseg_invalid(lseg, tmp_list);
259 list_del_init(&lo->layouts); 310 }
260 spin_unlock(&clp->cl_lock); 311 dprintk("%s:Return %i\n", __func__, invalid - removed);
261 write_seqlock(&lo->seqlock); 312 return invalid - removed;
262 clear_bit(NFS_LAYOUT_STATEID_SET, &lo->state);
263 write_sequnlock(&lo->seqlock);
264
265 dprintk("%s:Return\n", __func__);
266} 313}
267 314
268static void 315void
269pnfs_free_lseg_list(struct list_head *tmp_list) 316pnfs_free_lseg_list(struct list_head *free_me)
270{ 317{
271 struct pnfs_layout_segment *lseg; 318 struct pnfs_layout_segment *lseg, *tmp;
272 319
273 while (!list_empty(tmp_list)) { 320 list_for_each_entry_safe(lseg, tmp, free_me, pls_list) {
274 lseg = list_entry(tmp_list->next, struct pnfs_layout_segment, 321 list_del(&lseg->pls_list);
275 fi_list); 322 free_lseg(lseg);
276 dprintk("%s calling put_lseg on %p\n", __func__, lseg);
277 list_del(&lseg->fi_list);
278 put_lseg(lseg);
279 } 323 }
280} 324}
281 325
@@ -288,7 +332,8 @@ pnfs_destroy_layout(struct nfs_inode *nfsi)
288 spin_lock(&nfsi->vfs_inode.i_lock); 332 spin_lock(&nfsi->vfs_inode.i_lock);
289 lo = nfsi->layout; 333 lo = nfsi->layout;
290 if (lo) { 334 if (lo) {
291 pnfs_clear_lseg_list(lo, &tmp_list); 335 set_bit(NFS_LAYOUT_DESTROYED, &nfsi->layout->plh_flags);
336 mark_matching_lsegs_invalid(lo, &tmp_list, IOMODE_ANY);
292 /* Matched by refcount set to 1 in alloc_init_layout_hdr */ 337 /* Matched by refcount set to 1 in alloc_init_layout_hdr */
293 put_layout_hdr_locked(lo); 338 put_layout_hdr_locked(lo);
294 } 339 }
@@ -312,76 +357,80 @@ pnfs_destroy_all_layouts(struct nfs_client *clp)
312 357
313 while (!list_empty(&tmp_list)) { 358 while (!list_empty(&tmp_list)) {
314 lo = list_entry(tmp_list.next, struct pnfs_layout_hdr, 359 lo = list_entry(tmp_list.next, struct pnfs_layout_hdr,
315 layouts); 360 plh_layouts);
316 dprintk("%s freeing layout for inode %lu\n", __func__, 361 dprintk("%s freeing layout for inode %lu\n", __func__,
317 lo->inode->i_ino); 362 lo->plh_inode->i_ino);
318 pnfs_destroy_layout(NFS_I(lo->inode)); 363 pnfs_destroy_layout(NFS_I(lo->plh_inode));
319 } 364 }
320} 365}
321 366
322/* update lo->stateid with new if is more recent 367/* update lo->plh_stateid with new if is more recent */
323 * 368void
324 * lo->stateid could be the open stateid, in which case we just use what given. 369pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new,
325 */ 370 bool update_barrier)
326static void 371{
327pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, 372 u32 oldseq, newseq;
328 const nfs4_stateid *new) 373
329{ 374 oldseq = be32_to_cpu(lo->plh_stateid.stateid.seqid);
330 nfs4_stateid *old = &lo->stateid; 375 newseq = be32_to_cpu(new->stateid.seqid);
331 bool overwrite = false; 376 if ((int)(newseq - oldseq) > 0) {
332 377 memcpy(&lo->plh_stateid, &new->stateid, sizeof(new->stateid));
333 write_seqlock(&lo->seqlock); 378 if (update_barrier) {
334 if (!test_bit(NFS_LAYOUT_STATEID_SET, &lo->state) || 379 u32 new_barrier = be32_to_cpu(new->stateid.seqid);
335 memcmp(old->stateid.other, new->stateid.other, sizeof(new->stateid.other))) 380
336 overwrite = true; 381 if ((int)(new_barrier - lo->plh_barrier))
337 else { 382 lo->plh_barrier = new_barrier;
338 u32 oldseq, newseq; 383 } else {
339 384 /* Because of wraparound, we want to keep the barrier
340 oldseq = be32_to_cpu(old->stateid.seqid); 385 * "close" to the current seqids. It needs to be
341 newseq = be32_to_cpu(new->stateid.seqid); 386 * within 2**31 to count as "behind", so if it
342 if ((int)(newseq - oldseq) > 0) 387 * gets too near that limit, give us a litle leeway
343 overwrite = true; 388 * and bring it to within 2**30.
389 * NOTE - and yes, this is all unsigned arithmetic.
390 */
391 if (unlikely((newseq - lo->plh_barrier) > (3 << 29)))
392 lo->plh_barrier = newseq - (1 << 30);
393 }
344 } 394 }
345 if (overwrite)
346 memcpy(&old->stateid, &new->stateid, sizeof(new->stateid));
347 write_sequnlock(&lo->seqlock);
348} 395}
349 396
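The barrier logic above leans on wraparound-safe sequence comparison: two u32 seqids are ordered by casting their difference to a signed int, so the ordering survives the counter wrapping back through zero. A small self-checking example of the idiom:

#include <assert.h>
#include <stdint.h>

/* (int)(a - b) > 0 means "a is newer than b" as long as the two ids
 * are within 2**31 of each other - hence the hunk's care to keep the
 * barrier within 2**30 of the current seqid. */
static int seqid_newer(uint32_t a, uint32_t b)
{
	return (int32_t)(a - b) > 0;
}

int main(void)
{
	assert(seqid_newer(2, 1));                /* ordinary case */
	assert(seqid_newer(5, 0xfffffffeu));      /* newer across the wrap */
	assert(!seqid_newer(0xfffffffeu, 5));
	return 0;
}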
350static void 397/* lget is set to 1 if called from inside send_layoutget call chain */
351pnfs_layout_from_open_stateid(struct pnfs_layout_hdr *lo, 398static bool
352 struct nfs4_state *state) 399pnfs_layoutgets_blocked(struct pnfs_layout_hdr *lo, nfs4_stateid *stateid,
400 int lget)
353{ 401{
354 int seq; 402 if ((stateid) &&
355 403 (int)(lo->plh_barrier - be32_to_cpu(stateid->stateid.seqid)) >= 0)
356 dprintk("--> %s\n", __func__); 404 return true;
357 write_seqlock(&lo->seqlock); 405 return lo->plh_block_lgets ||
358 do { 406 test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) ||
359 seq = read_seqbegin(&state->seqlock); 407 (list_empty(&lo->plh_segs) &&
360 memcpy(lo->stateid.data, state->stateid.data, 408 (atomic_read(&lo->plh_outstanding) > lget));
361 sizeof(state->stateid.data));
362 } while (read_seqretry(&state->seqlock, seq));
363 set_bit(NFS_LAYOUT_STATEID_SET, &lo->state);
364 write_sequnlock(&lo->seqlock);
365 dprintk("<-- %s\n", __func__);
366} 409}
367 410
368void 411int
369pnfs_get_layout_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo, 412pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
370 struct nfs4_state *open_state) 413 struct nfs4_state *open_state)
371{ 414{
372 int seq; 415 int status = 0;
373 416
374 dprintk("--> %s\n", __func__); 417 dprintk("--> %s\n", __func__);
375 do { 418 spin_lock(&lo->plh_inode->i_lock);
376 seq = read_seqbegin(&lo->seqlock); 419 if (pnfs_layoutgets_blocked(lo, NULL, 1)) {
377 if (!test_bit(NFS_LAYOUT_STATEID_SET, &lo->state)) { 420 status = -EAGAIN;
378 /* This will trigger retry of the read */ 421 } else if (list_empty(&lo->plh_segs)) {
379 pnfs_layout_from_open_stateid(lo, open_state); 422 int seq;
380 } else 423
381 memcpy(dst->data, lo->stateid.data, 424 do {
382 sizeof(lo->stateid.data)); 425 seq = read_seqbegin(&open_state->seqlock);
383 } while (read_seqretry(&lo->seqlock, seq)); 426 memcpy(dst->data, open_state->stateid.data,
427 sizeof(open_state->stateid.data));
428 } while (read_seqretry(&open_state->seqlock, seq));
429 } else
430 memcpy(dst->data, lo->plh_stateid.data, sizeof(lo->plh_stateid.data));
431 spin_unlock(&lo->plh_inode->i_lock);
384 dprintk("<-- %s\n", __func__); 432 dprintk("<-- %s\n", __func__);
433 return status;
385} 434}
386 435
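pnfs_choose_layoutget_stateid() above copies the open stateid inside a read_seqbegin()/read_seqretry() loop, retrying whenever a writer touched the state mid-copy. A userspace sketch of that reader pattern (real seqlocks additionally insert memory barriers, omitted here):

#include <stdatomic.h>
#include <string.h>

struct seq_stateid {
	atomic_uint seq;          /* odd while a writer is active */
	unsigned char data[16];
};

static void read_stateid(struct seq_stateid *s, unsigned char out[16])
{
	unsigned int start;

	do {
		do {              /* wait out an active writer */
			start = atomic_load(&s->seq);
		} while (start & 1);
		memcpy(out, s->data, sizeof(s->data));
	} while (atomic_load(&s->seq) != start);  /* raced: retry */
}

Readers never block the writer, which is why the pattern suits a hot, rarely-updated field like a stateid.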
387/* 436/*
@@ -395,7 +444,7 @@ send_layoutget(struct pnfs_layout_hdr *lo,
395 struct nfs_open_context *ctx, 444 struct nfs_open_context *ctx,
396 u32 iomode) 445 u32 iomode)
397{ 446{
398 struct inode *ino = lo->inode; 447 struct inode *ino = lo->plh_inode;
399 struct nfs_server *server = NFS_SERVER(ino); 448 struct nfs_server *server = NFS_SERVER(ino);
400 struct nfs4_layoutget *lgp; 449 struct nfs4_layoutget *lgp;
401 struct pnfs_layout_segment *lseg = NULL; 450 struct pnfs_layout_segment *lseg = NULL;
@@ -404,10 +453,8 @@ send_layoutget(struct pnfs_layout_hdr *lo,
404 453
405 BUG_ON(ctx == NULL); 454 BUG_ON(ctx == NULL);
406 lgp = kzalloc(sizeof(*lgp), GFP_KERNEL); 455 lgp = kzalloc(sizeof(*lgp), GFP_KERNEL);
407 if (lgp == NULL) { 456 if (lgp == NULL)
408 put_layout_hdr(lo->inode);
409 return NULL; 457 return NULL;
410 }
411 lgp->args.minlength = NFS4_MAX_UINT64; 458 lgp->args.minlength = NFS4_MAX_UINT64;
412 lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE; 459 lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE;
413 lgp->args.range.iomode = iomode; 460 lgp->args.range.iomode = iomode;
@@ -424,11 +471,88 @@ send_layoutget(struct pnfs_layout_hdr *lo,
424 nfs4_proc_layoutget(lgp); 471 nfs4_proc_layoutget(lgp);
425 if (!lseg) { 472 if (!lseg) {
426 /* remember that LAYOUTGET failed and suspend trying */ 473 /* remember that LAYOUTGET failed and suspend trying */
427 set_bit(lo_fail_bit(iomode), &lo->state); 474 set_bit(lo_fail_bit(iomode), &lo->plh_flags);
428 } 475 }
429 return lseg; 476 return lseg;
430} 477}
431 478
479bool pnfs_roc(struct inode *ino)
480{
481 struct pnfs_layout_hdr *lo;
482 struct pnfs_layout_segment *lseg, *tmp;
483 LIST_HEAD(tmp_list);
484 bool found = false;
485
486 spin_lock(&ino->i_lock);
487 lo = NFS_I(ino)->layout;
488 if (!lo || !test_and_clear_bit(NFS_LAYOUT_ROC, &lo->plh_flags) ||
489 test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags))
490 goto out_nolayout;
491 list_for_each_entry_safe(lseg, tmp, &lo->plh_segs, pls_list)
492 if (test_bit(NFS_LSEG_ROC, &lseg->pls_flags)) {
493 mark_lseg_invalid(lseg, &tmp_list);
494 found = true;
495 }
496 if (!found)
497 goto out_nolayout;
498 lo->plh_block_lgets++;
499 get_layout_hdr(lo); /* matched in pnfs_roc_release */
500 spin_unlock(&ino->i_lock);
501 pnfs_free_lseg_list(&tmp_list);
502 return true;
503
504out_nolayout:
505 spin_unlock(&ino->i_lock);
506 return false;
507}
508
509void pnfs_roc_release(struct inode *ino)
510{
511 struct pnfs_layout_hdr *lo;
512
513 spin_lock(&ino->i_lock);
514 lo = NFS_I(ino)->layout;
515 lo->plh_block_lgets--;
516 put_layout_hdr_locked(lo);
517 spin_unlock(&ino->i_lock);
518}
519
520void pnfs_roc_set_barrier(struct inode *ino, u32 barrier)
521{
522 struct pnfs_layout_hdr *lo;
523
524 spin_lock(&ino->i_lock);
525 lo = NFS_I(ino)->layout;
526 if ((int)(barrier - lo->plh_barrier) > 0)
527 lo->plh_barrier = barrier;
528 spin_unlock(&ino->i_lock);
529}
530
531bool pnfs_roc_drain(struct inode *ino, u32 *barrier)
532{
533 struct nfs_inode *nfsi = NFS_I(ino);
534 struct pnfs_layout_segment *lseg;
535 bool found = false;
536
537 spin_lock(&ino->i_lock);
538 list_for_each_entry(lseg, &nfsi->layout->plh_segs, pls_list)
539 if (test_bit(NFS_LSEG_ROC, &lseg->pls_flags)) {
540 found = true;
541 break;
542 }
543 if (!found) {
544 struct pnfs_layout_hdr *lo = nfsi->layout;
545 u32 current_seqid = be32_to_cpu(lo->plh_stateid.stateid.seqid);
546
547 /* Since close does not return a layout stateid for use as
548 * a barrier, we choose the worst-case barrier.
549 */
550 *barrier = current_seqid + atomic_read(&lo->plh_outstanding);
551 }
552 spin_unlock(&ino->i_lock);
553 return found;
554}
555
432/* 556/*
433 * Compare two layout segments for sorting into layout cache. 557 * Compare two layout segments for sorting into layout cache.
434 * We want to preferentially return RW over RO layouts, so ensure those 558 * We want to preferentially return RW over RO layouts, so ensure those
@@ -450,37 +574,29 @@ pnfs_insert_layout(struct pnfs_layout_hdr *lo,
450 574
451 dprintk("%s:Begin\n", __func__); 575 dprintk("%s:Begin\n", __func__);
452 576
453 assert_spin_locked(&lo->inode->i_lock); 577 assert_spin_locked(&lo->plh_inode->i_lock);
454 if (list_empty(&lo->segs)) { 578 list_for_each_entry(lp, &lo->plh_segs, pls_list) {
455 struct nfs_client *clp = NFS_SERVER(lo->inode)->nfs_client; 579 if (cmp_layout(lp->pls_range.iomode, lseg->pls_range.iomode) > 0)
456
457 spin_lock(&clp->cl_lock);
458 BUG_ON(!list_empty(&lo->layouts));
459 list_add_tail(&lo->layouts, &clp->cl_layouts);
460 spin_unlock(&clp->cl_lock);
461 }
462 list_for_each_entry(lp, &lo->segs, fi_list) {
463 if (cmp_layout(lp->range.iomode, lseg->range.iomode) > 0)
464 continue; 580 continue;
465 list_add_tail(&lseg->fi_list, &lp->fi_list); 581 list_add_tail(&lseg->pls_list, &lp->pls_list);
466 dprintk("%s: inserted lseg %p " 582 dprintk("%s: inserted lseg %p "
467 "iomode %d offset %llu length %llu before " 583 "iomode %d offset %llu length %llu before "
468 "lp %p iomode %d offset %llu length %llu\n", 584 "lp %p iomode %d offset %llu length %llu\n",
469 __func__, lseg, lseg->range.iomode, 585 __func__, lseg, lseg->pls_range.iomode,
470 lseg->range.offset, lseg->range.length, 586 lseg->pls_range.offset, lseg->pls_range.length,
471 lp, lp->range.iomode, lp->range.offset, 587 lp, lp->pls_range.iomode, lp->pls_range.offset,
472 lp->range.length); 588 lp->pls_range.length);
473 found = 1; 589 found = 1;
474 break; 590 break;
475 } 591 }
476 if (!found) { 592 if (!found) {
477 list_add_tail(&lseg->fi_list, &lo->segs); 593 list_add_tail(&lseg->pls_list, &lo->plh_segs);
478 dprintk("%s: inserted lseg %p " 594 dprintk("%s: inserted lseg %p "
479 "iomode %d offset %llu length %llu at tail\n", 595 "iomode %d offset %llu length %llu at tail\n",
480 __func__, lseg, lseg->range.iomode, 596 __func__, lseg, lseg->pls_range.iomode,
481 lseg->range.offset, lseg->range.length); 597 lseg->pls_range.offset, lseg->pls_range.length);
482 } 598 }
483 get_layout_hdr_locked(lo); 599 get_layout_hdr(lo);
484 600
485 dprintk("%s:Return\n", __func__); 601 dprintk("%s:Return\n", __func__);
486} 602}
@@ -493,11 +609,11 @@ alloc_init_layout_hdr(struct inode *ino)
493 lo = kzalloc(sizeof(struct pnfs_layout_hdr), GFP_KERNEL); 609 lo = kzalloc(sizeof(struct pnfs_layout_hdr), GFP_KERNEL);
494 if (!lo) 610 if (!lo)
495 return NULL; 611 return NULL;
496 lo->refcount = 1; 612 atomic_set(&lo->plh_refcount, 1);
497 INIT_LIST_HEAD(&lo->layouts); 613 INIT_LIST_HEAD(&lo->plh_layouts);
498 INIT_LIST_HEAD(&lo->segs); 614 INIT_LIST_HEAD(&lo->plh_segs);
499 seqlock_init(&lo->seqlock); 615 INIT_LIST_HEAD(&lo->plh_bulk_recall);
500 lo->inode = ino; 616 lo->plh_inode = ino;
501 return lo; 617 return lo;
502} 618}
503 619
@@ -510,9 +626,12 @@ pnfs_find_alloc_layout(struct inode *ino)
510 dprintk("%s Begin ino=%p layout=%p\n", __func__, ino, nfsi->layout); 626 dprintk("%s Begin ino=%p layout=%p\n", __func__, ino, nfsi->layout);
511 627
512 assert_spin_locked(&ino->i_lock); 628 assert_spin_locked(&ino->i_lock);
513 if (nfsi->layout) 629 if (nfsi->layout) {
514 return nfsi->layout; 630 if (test_bit(NFS_LAYOUT_DESTROYED, &nfsi->layout->plh_flags))
515 631 return NULL;
632 else
633 return nfsi->layout;
634 }
516 spin_unlock(&ino->i_lock); 635 spin_unlock(&ino->i_lock);
517 new = alloc_init_layout_hdr(ino); 636 new = alloc_init_layout_hdr(ino);
518 spin_lock(&ino->i_lock); 637 spin_lock(&ino->i_lock);
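pnfs_find_alloc_layout() drops i_lock before alloc_init_layout_hdr() (a GFP_KERNEL allocation may sleep) and retakes it afterwards, which forces a re-check in case another task installed a layout meanwhile. A pthread sketch of that drop-allocate-recheck dance:

#include <pthread.h>
#include <stdlib.h>

struct cache {
	pthread_mutex_t lock;
	void *obj;
};

static void *find_alloc(struct cache *c)
{
	void *obj, *new;

	pthread_mutex_lock(&c->lock);
	obj = c->obj;
	if (obj)
		goto out;
	pthread_mutex_unlock(&c->lock);

	new = malloc(64);       /* may "sleep", so done unlocked */

	pthread_mutex_lock(&c->lock);
	if (c->obj)
		free(new);      /* lost the race: keep the winner */
	else
		c->obj = new;
	obj = c->obj;
out:
	pthread_mutex_unlock(&c->lock);
	return obj;
}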
@@ -538,31 +657,32 @@ pnfs_find_alloc_layout(struct inode *ino)
538static int 657static int
539is_matching_lseg(struct pnfs_layout_segment *lseg, u32 iomode) 658is_matching_lseg(struct pnfs_layout_segment *lseg, u32 iomode)
540{ 659{
541 return (iomode != IOMODE_RW || lseg->range.iomode == IOMODE_RW); 660 return (iomode != IOMODE_RW || lseg->pls_range.iomode == IOMODE_RW);
542} 661}
543 662
544/* 663/*
545 * lookup range in layout 664 * lookup range in layout
546 */ 665 */
547static struct pnfs_layout_segment * 666static struct pnfs_layout_segment *
548pnfs_has_layout(struct pnfs_layout_hdr *lo, u32 iomode) 667pnfs_find_lseg(struct pnfs_layout_hdr *lo, u32 iomode)
549{ 668{
550 struct pnfs_layout_segment *lseg, *ret = NULL; 669 struct pnfs_layout_segment *lseg, *ret = NULL;
551 670
552 dprintk("%s:Begin\n", __func__); 671 dprintk("%s:Begin\n", __func__);
553 672
554 assert_spin_locked(&lo->inode->i_lock); 673 assert_spin_locked(&lo->plh_inode->i_lock);
555 list_for_each_entry(lseg, &lo->segs, fi_list) { 674 list_for_each_entry(lseg, &lo->plh_segs, pls_list) {
556 if (is_matching_lseg(lseg, iomode)) { 675 if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags) &&
676 is_matching_lseg(lseg, iomode)) {
557 ret = lseg; 677 ret = lseg;
558 break; 678 break;
559 } 679 }
560 if (cmp_layout(iomode, lseg->range.iomode) > 0) 680 if (cmp_layout(iomode, lseg->pls_range.iomode) > 0)
561 break; 681 break;
562 } 682 }
563 683
564 dprintk("%s:Return lseg %p ref %d\n", 684 dprintk("%s:Return lseg %p ref %d\n",
565 __func__, ret, ret ? atomic_read(&ret->kref.refcount) : 0); 685 __func__, ret, ret ? atomic_read(&ret->pls_refcount) : 0);
566 return ret; 686 return ret;
567} 687}
568 688
@@ -576,6 +696,7 @@ pnfs_update_layout(struct inode *ino,
576 enum pnfs_iomode iomode) 696 enum pnfs_iomode iomode)
577{ 697{
578 struct nfs_inode *nfsi = NFS_I(ino); 698 struct nfs_inode *nfsi = NFS_I(ino);
699 struct nfs_client *clp = NFS_SERVER(ino)->nfs_client;
579 struct pnfs_layout_hdr *lo; 700 struct pnfs_layout_hdr *lo;
580 struct pnfs_layout_segment *lseg = NULL; 701 struct pnfs_layout_segment *lseg = NULL;
581 702
@@ -588,25 +709,53 @@ pnfs_update_layout(struct inode *ino,
588 goto out_unlock; 709 goto out_unlock;
589 } 710 }
590 711
591 /* Check to see if the layout for the given range already exists */ 712 /* Do we even need to bother with this? */
592 lseg = pnfs_has_layout(lo, iomode); 713 if (test_bit(NFS4CLNT_LAYOUTRECALL, &clp->cl_state) ||
593 if (lseg) { 714 test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) {
594 dprintk("%s: Using cached lseg %p for iomode %d)\n", 715 dprintk("%s matches recall, use MDS\n", __func__);
595 __func__, lseg, iomode);
596 goto out_unlock; 716 goto out_unlock;
597 } 717 }
718 /* Check to see if the layout for the given range already exists */
719 lseg = pnfs_find_lseg(lo, iomode);
720 if (lseg)
721 goto out_unlock;
598 722
599 /* if LAYOUTGET already failed once we don't try again */ 723 /* if LAYOUTGET already failed once we don't try again */
600 if (test_bit(lo_fail_bit(iomode), &nfsi->layout->state)) 724 if (test_bit(lo_fail_bit(iomode), &nfsi->layout->plh_flags))
725 goto out_unlock;
726
727 if (pnfs_layoutgets_blocked(lo, NULL, 0))
601 goto out_unlock; 728 goto out_unlock;
729 atomic_inc(&lo->plh_outstanding);
602 730
603 get_layout_hdr_locked(lo); /* Matched in nfs4_layoutget_release */ 731 get_layout_hdr(lo);
732 if (list_empty(&lo->plh_segs)) {
733 /* The lo must be on the clp list if there is any
734 * chance of a CB_LAYOUTRECALL(FILE) coming in.
735 */
736 spin_lock(&clp->cl_lock);
737 BUG_ON(!list_empty(&lo->plh_layouts));
738 list_add_tail(&lo->plh_layouts, &clp->cl_layouts);
739 spin_unlock(&clp->cl_lock);
740 }
604 spin_unlock(&ino->i_lock); 741 spin_unlock(&ino->i_lock);
605 742
606 lseg = send_layoutget(lo, ctx, iomode); 743 lseg = send_layoutget(lo, ctx, iomode);
744 if (!lseg) {
745 spin_lock(&ino->i_lock);
746 if (list_empty(&lo->plh_segs)) {
747 spin_lock(&clp->cl_lock);
748 list_del_init(&lo->plh_layouts);
749 spin_unlock(&clp->cl_lock);
750 clear_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
751 }
752 spin_unlock(&ino->i_lock);
753 }
754 atomic_dec(&lo->plh_outstanding);
755 put_layout_hdr(lo);
607out: 756out:
608 dprintk("%s end, state 0x%lx lseg %p\n", __func__, 757 dprintk("%s end, state 0x%lx lseg %p\n", __func__,
609 nfsi->layout->state, lseg); 758 nfsi->layout->plh_flags, lseg);
610 return lseg; 759 return lseg;
611out_unlock: 760out_unlock:
612 spin_unlock(&ino->i_lock); 761 spin_unlock(&ino->i_lock);
@@ -619,9 +768,21 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
619 struct pnfs_layout_hdr *lo = NFS_I(lgp->args.inode)->layout; 768 struct pnfs_layout_hdr *lo = NFS_I(lgp->args.inode)->layout;
620 struct nfs4_layoutget_res *res = &lgp->res; 769 struct nfs4_layoutget_res *res = &lgp->res;
621 struct pnfs_layout_segment *lseg; 770 struct pnfs_layout_segment *lseg;
622 struct inode *ino = lo->inode; 771 struct inode *ino = lo->plh_inode;
772 struct nfs_client *clp = NFS_SERVER(ino)->nfs_client;
623 int status = 0; 773 int status = 0;
624 774
775 /* Verify we got what we asked for.
776 * Note that because the xdr parsing only accepts a single
777 * element array, this can fail even if the server is behaving
778 * correctly.
779 */
780 if (lgp->args.range.iomode > res->range.iomode ||
781 res->range.offset != 0 ||
782 res->range.length != NFS4_MAX_UINT64) {
783 status = -EINVAL;
784 goto out;
785 }
625 /* Inject layout blob into I/O device driver */ 786 /* Inject layout blob into I/O device driver */
626 lseg = NFS_SERVER(ino)->pnfs_curr_ld->alloc_lseg(lo, res); 787 lseg = NFS_SERVER(ino)->pnfs_curr_ld->alloc_lseg(lo, res);
627 if (!lseg || IS_ERR(lseg)) { 788 if (!lseg || IS_ERR(lseg)) {
@@ -635,16 +796,37 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
635 } 796 }
636 797
637 spin_lock(&ino->i_lock); 798 spin_lock(&ino->i_lock);
799 if (test_bit(NFS4CLNT_LAYOUTRECALL, &clp->cl_state) ||
800 test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) {
801 dprintk("%s forget reply due to recall\n", __func__);
802 goto out_forget_reply;
803 }
804
805 if (pnfs_layoutgets_blocked(lo, &res->stateid, 1)) {
806 dprintk("%s forget reply due to state\n", __func__);
807 goto out_forget_reply;
808 }
638 init_lseg(lo, lseg); 809 init_lseg(lo, lseg);
639 lseg->range = res->range; 810 lseg->pls_range = res->range;
640 *lgp->lsegpp = lseg; 811 *lgp->lsegpp = lseg;
641 pnfs_insert_layout(lo, lseg); 812 pnfs_insert_layout(lo, lseg);
642 813
814 if (res->return_on_close) {
815 set_bit(NFS_LSEG_ROC, &lseg->pls_flags);
816 set_bit(NFS_LAYOUT_ROC, &lo->plh_flags);
817 }
818
643 /* Done processing layoutget. Set the layout stateid */ 819 /* Done processing layoutget. Set the layout stateid */
644 pnfs_set_layout_stateid(lo, &res->stateid); 820 pnfs_set_layout_stateid(lo, &res->stateid, false);
645 spin_unlock(&ino->i_lock); 821 spin_unlock(&ino->i_lock);
646out: 822out:
647 return status; 823 return status;
824
825out_forget_reply:
826 spin_unlock(&ino->i_lock);
827 lseg->pls_layout = lo;
828 NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg);
829 goto out;
648} 830}
649 831
650/* 832/*
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index e12367d50489..e2612ea0cbed 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -30,11 +30,17 @@
30#ifndef FS_NFS_PNFS_H 30#ifndef FS_NFS_PNFS_H
31#define FS_NFS_PNFS_H 31#define FS_NFS_PNFS_H
32 32
33enum {
34 NFS_LSEG_VALID = 0, /* cleared when lseg is recalled/returned */
35 NFS_LSEG_ROC, /* roc bit received from server */
36};
37
33struct pnfs_layout_segment { 38struct pnfs_layout_segment {
34 struct list_head fi_list; 39 struct list_head pls_list;
35 struct pnfs_layout_range range; 40 struct pnfs_layout_range pls_range;
36 struct kref kref; 41 atomic_t pls_refcount;
37 struct pnfs_layout_hdr *layout; 42 unsigned long pls_flags;
43 struct pnfs_layout_hdr *pls_layout;
38}; 44};
39 45
40#ifdef CONFIG_NFS_V4_1 46#ifdef CONFIG_NFS_V4_1
@@ -44,7 +50,9 @@ struct pnfs_layout_segment {
44enum { 50enum {
45 NFS_LAYOUT_RO_FAILED = 0, /* get ro layout failed stop trying */ 51 NFS_LAYOUT_RO_FAILED = 0, /* get ro layout failed stop trying */
46 NFS_LAYOUT_RW_FAILED, /* get rw layout failed stop trying */ 52 NFS_LAYOUT_RW_FAILED, /* get rw layout failed stop trying */
47 NFS_LAYOUT_STATEID_SET, /* have a valid layout stateid */ 53 NFS_LAYOUT_BULK_RECALL, /* bulk recall affecting layout */
54 NFS_LAYOUT_ROC, /* some lseg had roc bit set */
55 NFS_LAYOUT_DESTROYED, /* no new use of layout allowed */
48}; 56};
49 57
50/* Per-layout driver specific registration structure */ 58/* Per-layout driver specific registration structure */
@@ -60,13 +68,16 @@ struct pnfs_layoutdriver_type {
60}; 68};
61 69
62struct pnfs_layout_hdr { 70struct pnfs_layout_hdr {
63 unsigned long refcount; 71 atomic_t plh_refcount;
64 struct list_head layouts; /* other client layouts */ 72 struct list_head plh_layouts; /* other client layouts */
65 struct list_head segs; /* layout segments list */ 73 struct list_head plh_bulk_recall; /* clnt list of bulk recalls */
66 seqlock_t seqlock; /* Protects the stateid */ 74 struct list_head plh_segs; /* layout segments list */
67 nfs4_stateid stateid; 75 nfs4_stateid plh_stateid;
68 unsigned long state; 76 atomic_t plh_outstanding; /* number of RPCs out */
69 struct inode *inode; 77 unsigned long plh_block_lgets; /* block LAYOUTGET if >0 */
78 u32 plh_barrier; /* ignore lower seqids */
79 unsigned long plh_flags;
80 struct inode *plh_inode;
70}; 81};
71 82
72struct pnfs_device { 83struct pnfs_device {
@@ -134,17 +145,30 @@ extern int nfs4_proc_getdeviceinfo(struct nfs_server *server,
134extern int nfs4_proc_layoutget(struct nfs4_layoutget *lgp); 145extern int nfs4_proc_layoutget(struct nfs4_layoutget *lgp);
135 146
136/* pnfs.c */ 147/* pnfs.c */
148void get_layout_hdr(struct pnfs_layout_hdr *lo);
137struct pnfs_layout_segment * 149struct pnfs_layout_segment *
138pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx, 150pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx,
139 enum pnfs_iomode access_type); 151 enum pnfs_iomode access_type);
140void set_pnfs_layoutdriver(struct nfs_server *, u32 id); 152void set_pnfs_layoutdriver(struct nfs_server *, u32 id);
141void unset_pnfs_layoutdriver(struct nfs_server *); 153void unset_pnfs_layoutdriver(struct nfs_server *);
142int pnfs_layout_process(struct nfs4_layoutget *lgp); 154int pnfs_layout_process(struct nfs4_layoutget *lgp);
155void pnfs_free_lseg_list(struct list_head *tmp_list);
143void pnfs_destroy_layout(struct nfs_inode *); 156void pnfs_destroy_layout(struct nfs_inode *);
144void pnfs_destroy_all_layouts(struct nfs_client *); 157void pnfs_destroy_all_layouts(struct nfs_client *);
145void put_layout_hdr(struct inode *inode); 158void put_layout_hdr(struct pnfs_layout_hdr *lo);
146void pnfs_get_layout_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo, 159void pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo,
147 struct nfs4_state *open_state); 160 const nfs4_stateid *new,
161 bool update_barrier);
162int pnfs_choose_layoutget_stateid(nfs4_stateid *dst,
163 struct pnfs_layout_hdr *lo,
164 struct nfs4_state *open_state);
165int mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
166 struct list_head *tmp_list,
167 u32 iomode);
168bool pnfs_roc(struct inode *ino);
169void pnfs_roc_release(struct inode *ino);
170void pnfs_roc_set_barrier(struct inode *ino, u32 barrier);
171bool pnfs_roc_drain(struct inode *ino, u32 *barrier);
148 172
149 173
150static inline int lo_fail_bit(u32 iomode) 174static inline int lo_fail_bit(u32 iomode)
@@ -176,6 +200,28 @@ pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx,
176 return NULL; 200 return NULL;
177} 201}
178 202
203static inline bool
204pnfs_roc(struct inode *ino)
205{
206 return false;
207}
208
209static inline void
210pnfs_roc_release(struct inode *ino)
211{
212}
213
214static inline void
215pnfs_roc_set_barrier(struct inode *ino, u32 barrier)
216{
217}
218
219static inline bool
220pnfs_roc_drain(struct inode *ino, u32 *barrier)
221{
222 return false;
223}
224
179static inline void set_pnfs_layoutdriver(struct nfs_server *s, u32 id) 225static inline void set_pnfs_layoutdriver(struct nfs_server *s, u32 id)
180{ 226{
181} 227}
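The pnfs.h hunks pair each new entry point (pnfs_roc() and friends) with a no-op static inline stub in the !CONFIG_NFS_V4_1 branch, so callers never need their own ifdefs. A generic sketch of the pattern, with CONFIG_FEATURE_X and the feature_x_* names purely hypothetical:

#include <stdbool.h>

struct inode;   /* opaque to this sketch */

#ifdef CONFIG_FEATURE_X
bool feature_x_begin(struct inode *ino);
void feature_x_end(struct inode *ino);
#else
static inline bool feature_x_begin(struct inode *ino)
{
	return false;   /* compiled out: caller takes the fallback path */
}
static inline void feature_x_end(struct inode *ino)
{
}
#endif

Because the stubs are static inline and trivially constant, the compiler deletes the disabled code paths entirely.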
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index 58e7f84fc1fd..77d5e21c4ad6 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -458,7 +458,7 @@ nfs_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page,
458 fattr = nfs_alloc_fattr(); 458 fattr = nfs_alloc_fattr();
459 status = -ENOMEM; 459 status = -ENOMEM;
460 if (fh == NULL || fattr == NULL) 460 if (fh == NULL || fattr == NULL)
461 goto out; 461 goto out_free;
462 462
463 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); 463 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
464 nfs_mark_for_revalidate(dir); 464 nfs_mark_for_revalidate(dir);
@@ -471,6 +471,7 @@ nfs_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page,
471 if (status == 0) 471 if (status == 0)
472 status = nfs_instantiate(dentry, fh, fattr); 472 status = nfs_instantiate(dentry, fh, fattr);
473 473
474out_free:
474 nfs_free_fattr(fattr); 475 nfs_free_fattr(fattr);
475 nfs_free_fhandle(fh); 476 nfs_free_fhandle(fh);
476out: 477out:
@@ -731,7 +732,7 @@ const struct nfs_rpc_ops nfs_v2_clientops = {
731 .statfs = nfs_proc_statfs, 732 .statfs = nfs_proc_statfs,
732 .fsinfo = nfs_proc_fsinfo, 733 .fsinfo = nfs_proc_fsinfo,
733 .pathconf = nfs_proc_pathconf, 734 .pathconf = nfs_proc_pathconf,
734 .decode_dirent = nfs_decode_dirent, 735 .decode_dirent = nfs2_decode_dirent,
735 .read_setup = nfs_proc_read_setup, 736 .read_setup = nfs_proc_read_setup,
736 .read_done = nfs_read_done, 737 .read_done = nfs_read_done,
737 .write_setup = nfs_proc_write_setup, 738 .write_setup = nfs_proc_write_setup,
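The nfs_proc_symlink() fix above retargets the failure jump from out to a new out_free label, so the fattr/fh allocations are released on the error path too. The shape is the standard kernel goto-unwind idiom; a userspace sketch:

#include <stdlib.h>

static int do_op(void)
{
	int status = -1;
	char *fh = malloc(64);
	char *fattr = malloc(64);

	if (fh == NULL || fattr == NULL)
		goto out_free;  /* free(NULL) is a no-op, so this is safe */

	/* ... issue the request here ... */
	status = 0;
out_free:
	free(fattr);
	free(fh);
	return status;
}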
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 4100630c9a5b..b68c8607770f 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -598,7 +598,9 @@ static void nfs_show_mountd_options(struct seq_file *m, struct nfs_server *nfss,
598 598
599 if (nfss->mountd_version || showdefaults) 599 if (nfss->mountd_version || showdefaults)
600 seq_printf(m, ",mountvers=%u", nfss->mountd_version); 600 seq_printf(m, ",mountvers=%u", nfss->mountd_version);
601 if (nfss->mountd_port || showdefaults) 601 if ((nfss->mountd_port &&
602 nfss->mountd_port != (unsigned short)NFS_UNSPEC_PORT) ||
603 showdefaults)
602 seq_printf(m, ",mountport=%u", nfss->mountd_port); 604 seq_printf(m, ",mountport=%u", nfss->mountd_port);
603 605
604 nfs_show_mountd_netid(m, nfss, showdefaults); 606 nfs_show_mountd_netid(m, nfss, showdefaults);
@@ -2200,6 +2202,7 @@ static int nfs_set_super(struct super_block *s, void *data)
2200 2202
2201 s->s_flags = sb_mntdata->mntflags; 2203 s->s_flags = sb_mntdata->mntflags;
2202 s->s_fs_info = server; 2204 s->s_fs_info = server;
2205 s->s_d_op = server->nfs_client->rpc_ops->dentry_ops;
2203 ret = set_anon_super(s, server); 2206 ret = set_anon_super(s, server);
2204 if (ret == 0) 2207 if (ret == 0)
2205 server->s_dev = s->s_dev; 2208 server->s_dev = s->s_dev;
@@ -2494,7 +2497,13 @@ static void nfs4_clone_super(struct super_block *sb,
2494 sb->s_maxbytes = old_sb->s_maxbytes; 2497 sb->s_maxbytes = old_sb->s_maxbytes;
2495 sb->s_time_gran = 1; 2498 sb->s_time_gran = 1;
2496 sb->s_op = old_sb->s_op; 2499 sb->s_op = old_sb->s_op;
2497 nfs_initialise_sb(sb); 2500 /*
2501 * The VFS shouldn't apply the umask to mode bits. We will do
2502 * so ourselves when necessary.
2503 */
2504 sb->s_flags |= MS_POSIXACL;
2505 sb->s_xattr = old_sb->s_xattr;
2506 nfs_initialise_sb(sb);
2498} 2507}
2499 2508
2500/* 2509/*
@@ -2504,6 +2513,12 @@ static void nfs4_fill_super(struct super_block *sb)
2504{ 2513{
2505 sb->s_time_gran = 1; 2514 sb->s_time_gran = 1;
2506 sb->s_op = &nfs4_sops; 2515 sb->s_op = &nfs4_sops;
2516 /*
2517 * The VFS shouldn't apply the umask to mode bits. We will do
2518 * so ourselves when necessary.
2519 */
2520 sb->s_flags |= MS_POSIXACL;
2521 sb->s_xattr = nfs4_xattr_handlers;
2507 nfs_initialise_sb(sb); 2522 nfs_initialise_sb(sb);
2508} 2523}
2509 2524
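Setting MS_POSIXACL above is how a filesystem opts out of VFS-side umask handling: with the flag clear, the VFS strips the umask bits from every create mode itself; with it set, the filesystem applies the umask only when no ACL says otherwise. The masking the VFS would otherwise do is one expression:

#include <sys/stat.h>

/* What the VFS does on create when MS_POSIXACL is clear:
 * strip the process umask bits from the requested mode.
 * e.g. apply_umask(0666, 022) == 0644 */
static mode_t apply_umask(mode_t requested, mode_t umask_bits)
{
	return requested & ~umask_bits;
}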
diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
index 8fe9eb47a97f..e313a51acdd1 100644
--- a/fs/nfs/unlink.c
+++ b/fs/nfs/unlink.c
@@ -429,7 +429,7 @@ nfs_async_rename(struct inode *old_dir, struct inode *new_dir,
429 data = kzalloc(sizeof(*data), GFP_KERNEL); 429 data = kzalloc(sizeof(*data), GFP_KERNEL);
430 if (data == NULL) 430 if (data == NULL)
431 return ERR_PTR(-ENOMEM); 431 return ERR_PTR(-ENOMEM);
432 task_setup_data.callback_data = data, 432 task_setup_data.callback_data = data;
433 433
434 data->cred = rpc_lookup_cred(); 434 data->cred = rpc_lookup_cred();
435 if (IS_ERR(data->cred)) { 435 if (IS_ERR(data->cred)) {
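The unlink.c one-liner swaps a stray comma for a semicolon. The original statement was still legal C, because the comma operator evaluates both operands and yields only the right-hand one - which is exactly why such typos compile silently. A self-checking illustration:

#include <assert.h>

int main(void)
{
	int a = 0, b = 1, c = 2;

	a = b, c;       /* parses as (a = b), c; c is computed and dropped */
	assert(a == 1);

	a = (b, c);     /* parenthesized, the comma yields its right side */
	assert(a == 2);
	return 0;
}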
diff --git a/fs/nfsd/acl.h b/fs/nfsd/acl.h
new file mode 100644
index 000000000000..34e5c40af5ef
--- /dev/null
+++ b/fs/nfsd/acl.h
@@ -0,0 +1,59 @@
1/*
2 * Common NFSv4 ACL handling definitions.
3 *
4 * Copyright (c) 2002 The Regents of the University of Michigan.
5 * All rights reserved.
6 *
7 * Marius Aamodt Eriksen <marius@umich.edu>
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 *
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 3. Neither the name of the University nor the names of its
19 * contributors may be used to endorse or promote products derived
20 * from this software without specific prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
23 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
24 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
25 * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
27 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
28 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
29 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
30 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
31 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
32 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33 */
34
35#ifndef LINUX_NFS4_ACL_H
36#define LINUX_NFS4_ACL_H
37
38#include <linux/posix_acl.h>
39
40/* Maximum ACL we'll accept from client; chosen (somewhat arbitrarily) to
41 * fit in a page: */
42#define NFS4_ACL_MAX 170
43
44struct nfs4_acl *nfs4_acl_new(int);
45int nfs4_acl_get_whotype(char *, u32);
46int nfs4_acl_write_who(int who, char *p);
47int nfs4_acl_permission(struct nfs4_acl *acl, uid_t owner, gid_t group,
48 uid_t who, u32 mask);
49
50#define NFS4_ACL_TYPE_DEFAULT 0x01
51#define NFS4_ACL_DIR 0x02
52#define NFS4_ACL_OWNER 0x04
53
54struct nfs4_acl *nfs4_acl_posix_to_nfsv4(struct posix_acl *,
55 struct posix_acl *, unsigned int flags);
56int nfs4_acl_nfsv4_to_posix(struct nfs4_acl *, struct posix_acl **,
57 struct posix_acl **, unsigned int flags);
58
59#endif /* LINUX_NFS4_ACL_H */
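The NFS4_ACL_MAX comment says the cap was chosen to fit in a page. Assuming roughly 24 bytes of per-ACE state (type, flag, access mask, and a who identifier - an assumption of this sketch, not a figure from the header), the arithmetic lands just under a 4 KiB page:

/* 170 ACEs * ~24 bytes/ACE = 4080 bytes <= 4096-byte page */
#define DEMO_PAGE_SIZE  4096
#define DEMO_ACE_BYTES  24      /* assumed per-ACE footprint */
#define DEMO_ACL_MAX    170

_Static_assert(DEMO_ACL_MAX * DEMO_ACE_BYTES <= DEMO_PAGE_SIZE,
	       "ACL cap must fit in one page");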
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index c0fcb7ab7f6d..8b31e5f8795d 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -1,4 +1,3 @@
1#define MSNFS /* HACK HACK */
2/* 1/*
3 * NFS exporting and validation. 2 * NFS exporting and validation.
4 * 3 *
@@ -1444,9 +1443,6 @@ static struct flags {
1444 { NFSEXP_NOSUBTREECHECK, {"no_subtree_check", ""}}, 1443 { NFSEXP_NOSUBTREECHECK, {"no_subtree_check", ""}},
1445 { NFSEXP_NOAUTHNLM, {"insecure_locks", ""}}, 1444 { NFSEXP_NOAUTHNLM, {"insecure_locks", ""}},
1446 { NFSEXP_V4ROOT, {"v4root", ""}}, 1445 { NFSEXP_V4ROOT, {"v4root", ""}},
1447#ifdef MSNFS
1448 { NFSEXP_MSNFS, {"msnfs", ""}},
1449#endif
1450 { 0, {"", ""}} 1446 { 0, {"", ""}}
1451}; 1447};
1452 1448
diff --git a/fs/nfsd/idmap.h b/fs/nfsd/idmap.h
new file mode 100644
index 000000000000..2f3be1321534
--- /dev/null
+++ b/fs/nfsd/idmap.h
@@ -0,0 +1,62 @@
1/*
2 * Mapping of UID to name and vice versa.
3 *
4 * Copyright (c) 2002, 2003 The Regents of the University of
5 * Michigan. All rights reserved.
 6 *
7 * Marius Aamodt Eriksen <marius@umich.edu>
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 *
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 3. Neither the name of the University nor the names of its
19 * contributors may be used to endorse or promote products derived
20 * from this software without specific prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
23 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
24 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
25 * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
27 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
28 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
29 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
30 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
31 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
32 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33 */
34
35#ifndef LINUX_NFSD_IDMAP_H
36#define LINUX_NFSD_IDMAP_H
37
38#include <linux/in.h>
39#include <linux/sunrpc/svc.h>
40
41/* XXX from linux/nfs_idmap.h */
42#define IDMAP_NAMESZ 128
43
44#ifdef CONFIG_NFSD_V4
45int nfsd_idmap_init(void);
46void nfsd_idmap_shutdown(void);
47#else
48static inline int nfsd_idmap_init(void)
49{
50 return 0;
51}
52static inline void nfsd_idmap_shutdown(void)
53{
54}
55#endif
56
57__be32 nfsd_map_name_to_uid(struct svc_rqst *, const char *, size_t, __u32 *);
58__be32 nfsd_map_name_to_gid(struct svc_rqst *, const char *, size_t, __u32 *);
59int nfsd_map_uid_to_name(struct svc_rqst *, __u32, char *);
60int nfsd_map_gid_to_name(struct svc_rqst *, __u32, char *);
61
62#endif /* LINUX_NFSD_IDMAP_H */
diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index 5b7e3021e06b..2247fc91d5e9 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -151,10 +151,10 @@ nfsd3_proc_read(struct svc_rqst *rqstp, struct nfsd3_readargs *argp,
151 __be32 nfserr; 151 __be32 nfserr;
152 u32 max_blocksize = svc_max_payload(rqstp); 152 u32 max_blocksize = svc_max_payload(rqstp);
153 153
154 dprintk("nfsd: READ(3) %s %lu bytes at %lu\n", 154 dprintk("nfsd: READ(3) %s %lu bytes at %Lu\n",
155 SVCFH_fmt(&argp->fh), 155 SVCFH_fmt(&argp->fh),
156 (unsigned long) argp->count, 156 (unsigned long) argp->count,
157 (unsigned long) argp->offset); 157 (unsigned long long) argp->offset);
158 158
159 /* Obtain buffer pointer for payload. 159 /* Obtain buffer pointer for payload.
160 * 1 (status) + 22 (post_op_attr) + 1 (count) + 1 (eof) 160 * 1 (status) + 22 (post_op_attr) + 1 (count) + 1 (eof)
@@ -191,10 +191,10 @@ nfsd3_proc_write(struct svc_rqst *rqstp, struct nfsd3_writeargs *argp,
191 __be32 nfserr; 191 __be32 nfserr;
192 unsigned long cnt = argp->len; 192 unsigned long cnt = argp->len;
193 193
194 dprintk("nfsd: WRITE(3) %s %d bytes at %ld%s\n", 194 dprintk("nfsd: WRITE(3) %s %d bytes at %Lu%s\n",
195 SVCFH_fmt(&argp->fh), 195 SVCFH_fmt(&argp->fh),
196 argp->len, 196 argp->len,
197 (unsigned long) argp->offset, 197 (unsigned long long) argp->offset,
198 argp->stable? " stable" : ""); 198 argp->stable? " stable" : "");
199 199
200 fh_copy(&resp->fh, &argp->fh); 200 fh_copy(&resp->fh, &argp->fh);
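The two dprintk fixes above stop pushing a 64-bit offset through a %lu/(unsigned long) pair, which truncates on 32-bit builds where unsigned long is only 32 bits wide. Casting to unsigned long long and using %llu (or %Lu, which the kernel's vsprintf also accepts, as these hunks do) is width-correct on every architecture:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t offset = 5ULL << 32;   /* deliberately wider than 32 bits */

	printf("offset=%llu\n", (unsigned long long)offset);
	return 0;
}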
diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c
index e48052615159..ad88f1c0a4c3 100644
--- a/fs/nfsd/nfs4acl.c
+++ b/fs/nfsd/nfs4acl.c
@@ -36,7 +36,7 @@
36 36
37#include <linux/slab.h> 37#include <linux/slab.h>
38#include <linux/nfs_fs.h> 38#include <linux/nfs_fs.h>
39#include <linux/nfs4_acl.h> 39#include "acl.h"
40 40
41 41
42/* mode bit translations: */ 42/* mode bit translations: */
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 143da2eecd7b..3be975e18919 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -50,11 +50,6 @@ enum {
50 NFSPROC4_CLNT_CB_SEQUENCE, 50 NFSPROC4_CLNT_CB_SEQUENCE,
51}; 51};
52 52
53enum nfs_cb_opnum4 {
54 OP_CB_RECALL = 4,
55 OP_CB_SEQUENCE = 11,
56};
57
58#define NFS4_MAXTAGLEN 20 53#define NFS4_MAXTAGLEN 20
59 54
60#define NFS4_enc_cb_null_sz 0 55#define NFS4_enc_cb_null_sz 0
@@ -79,61 +74,6 @@ enum nfs_cb_opnum4 {
79 cb_sequence_dec_sz + \ 74 cb_sequence_dec_sz + \
80 op_dec_sz) 75 op_dec_sz)
81 76
82/*
83* Generic encode routines from fs/nfs/nfs4xdr.c
84*/
85static inline __be32 *
86xdr_writemem(__be32 *p, const void *ptr, int nbytes)
87{
88 int tmp = XDR_QUADLEN(nbytes);
89 if (!tmp)
90 return p;
91 p[tmp-1] = 0;
92 memcpy(p, ptr, nbytes);
93 return p + tmp;
94}
95
96#define WRITE32(n) *p++ = htonl(n)
97#define WRITEMEM(ptr,nbytes) do { \
98 p = xdr_writemem(p, ptr, nbytes); \
99} while (0)
100#define RESERVE_SPACE(nbytes) do { \
101 p = xdr_reserve_space(xdr, nbytes); \
102 if (!p) dprintk("NFSD: RESERVE_SPACE(%d) failed in function %s\n", (int) (nbytes), __func__); \
103 BUG_ON(!p); \
104} while (0)
105
106/*
107 * Generic decode routines from fs/nfs/nfs4xdr.c
108 */
109#define DECODE_TAIL \
110 status = 0; \
111out: \
112 return status; \
113xdr_error: \
114 dprintk("NFSD: xdr error! (%s:%d)\n", __FILE__, __LINE__); \
115 status = -EIO; \
116 goto out
117
118#define READ32(x) (x) = ntohl(*p++)
119#define READ64(x) do { \
120 (x) = (u64)ntohl(*p++) << 32; \
121 (x) |= ntohl(*p++); \
122} while (0)
123#define READTIME(x) do { \
124 p++; \
125 (x.tv_sec) = ntohl(*p++); \
126 (x.tv_nsec) = ntohl(*p++); \
127} while (0)
128#define READ_BUF(nbytes) do { \
129 p = xdr_inline_decode(xdr, nbytes); \
130 if (!p) { \
131 dprintk("NFSD: %s: reply buffer overflowed in line %d.\n", \
132 __func__, __LINE__); \
133 return -EIO; \
134 } \
135} while (0)
136
137struct nfs4_cb_compound_hdr { 77struct nfs4_cb_compound_hdr {
138 /* args */ 78 /* args */
139 u32 ident; /* minorversion 0 only */ 79 u32 ident; /* minorversion 0 only */
@@ -144,295 +84,513 @@ struct nfs4_cb_compound_hdr {
144 int status; 84 int status;
145}; 85};
146 86
147static struct { 87/*
148int stat; 88 * Handle decode buffer overflows out-of-line.
149int errno; 89 */
150} nfs_cb_errtbl[] = { 90static void print_overflow_msg(const char *func, const struct xdr_stream *xdr)
151 { NFS4_OK, 0 }, 91{
152 { NFS4ERR_PERM, EPERM }, 92 dprintk("NFS: %s prematurely hit the end of our receive buffer. "
153 { NFS4ERR_NOENT, ENOENT }, 93 "Remaining buffer length is %tu words.\n",
154 { NFS4ERR_IO, EIO }, 94 func, xdr->end - xdr->p);
155 { NFS4ERR_NXIO, ENXIO }, 95}
156 { NFS4ERR_ACCESS, EACCES },
157 { NFS4ERR_EXIST, EEXIST },
158 { NFS4ERR_XDEV, EXDEV },
159 { NFS4ERR_NOTDIR, ENOTDIR },
160 { NFS4ERR_ISDIR, EISDIR },
161 { NFS4ERR_INVAL, EINVAL },
162 { NFS4ERR_FBIG, EFBIG },
163 { NFS4ERR_NOSPC, ENOSPC },
164 { NFS4ERR_ROFS, EROFS },
165 { NFS4ERR_MLINK, EMLINK },
166 { NFS4ERR_NAMETOOLONG, ENAMETOOLONG },
167 { NFS4ERR_NOTEMPTY, ENOTEMPTY },
168 { NFS4ERR_DQUOT, EDQUOT },
169 { NFS4ERR_STALE, ESTALE },
170 { NFS4ERR_BADHANDLE, EBADHANDLE },
171 { NFS4ERR_BAD_COOKIE, EBADCOOKIE },
172 { NFS4ERR_NOTSUPP, ENOTSUPP },
173 { NFS4ERR_TOOSMALL, ETOOSMALL },
174 { NFS4ERR_SERVERFAULT, ESERVERFAULT },
175 { NFS4ERR_BADTYPE, EBADTYPE },
176 { NFS4ERR_LOCKED, EAGAIN },
177 { NFS4ERR_RESOURCE, EREMOTEIO },
178 { NFS4ERR_SYMLINK, ELOOP },
179 { NFS4ERR_OP_ILLEGAL, EOPNOTSUPP },
180 { NFS4ERR_DEADLOCK, EDEADLK },
181 { -1, EIO }
182};
183 96
184static int 97static __be32 *xdr_encode_empty_array(__be32 *p)
185nfs_cb_stat_to_errno(int stat)
186{ 98{
187 int i; 99 *p++ = xdr_zero;
188 for (i = 0; nfs_cb_errtbl[i].stat != -1; i++) { 100 return p;
189 if (nfs_cb_errtbl[i].stat == stat)
190 return nfs_cb_errtbl[i].errno;
191 }
192 /* If we cannot translate the error, the recovery routines should
193 * handle it.
194 * Note: remaining NFSv4 error codes have values > 10000, so should
195 * not conflict with native Linux error codes.
196 */
197 return stat;
198} 101}
199 102
200/* 103/*
201 * XDR encode 104 * Encode/decode NFSv4 CB basic data types
105 *
106 * Basic NFSv4 callback data types are defined in section 15 of RFC
107 * 3530: "Network File System (NFS) version 4 Protocol" and section
108 * 20 of RFC 5661: "Network File System (NFS) Version 4 Minor Version
109 * 1 Protocol"
202 */ 110 */
203 111
204static void 112/*
205encode_stateid(struct xdr_stream *xdr, stateid_t *sid) 113 * nfs_cb_opnum4
114 *
115 * enum nfs_cb_opnum4 {
116 * OP_CB_GETATTR = 3,
117 * ...
118 * };
119 */
120enum nfs_cb_opnum4 {
121 OP_CB_GETATTR = 3,
122 OP_CB_RECALL = 4,
123 OP_CB_LAYOUTRECALL = 5,
124 OP_CB_NOTIFY = 6,
125 OP_CB_PUSH_DELEG = 7,
126 OP_CB_RECALL_ANY = 8,
127 OP_CB_RECALLABLE_OBJ_AVAIL = 9,
128 OP_CB_RECALL_SLOT = 10,
129 OP_CB_SEQUENCE = 11,
130 OP_CB_WANTS_CANCELLED = 12,
131 OP_CB_NOTIFY_LOCK = 13,
132 OP_CB_NOTIFY_DEVICEID = 14,
133 OP_CB_ILLEGAL = 10044
134};
135
136static void encode_nfs_cb_opnum4(struct xdr_stream *xdr, enum nfs_cb_opnum4 op)
206{ 137{
207 __be32 *p; 138 __be32 *p;
208 139
209 RESERVE_SPACE(sizeof(stateid_t)); 140 p = xdr_reserve_space(xdr, 4);
210 WRITE32(sid->si_generation); 141 *p = cpu_to_be32(op);
211 WRITEMEM(&sid->si_opaque, sizeof(stateid_opaque_t));
212} 142}
213 143
214static void 144/*
215encode_cb_compound_hdr(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr) 145 * nfs_fh4
146 *
147 * typedef opaque nfs_fh4<NFS4_FHSIZE>;
148 */
149static void encode_nfs_fh4(struct xdr_stream *xdr, const struct knfsd_fh *fh)
216{ 150{
217 __be32 * p; 151 u32 length = fh->fh_size;
152 __be32 *p;
218 153
219 RESERVE_SPACE(16); 154 BUG_ON(length > NFS4_FHSIZE);
220 WRITE32(0); /* tag length is always 0 */ 155 p = xdr_reserve_space(xdr, 4 + length);
221 WRITE32(hdr->minorversion); 156 xdr_encode_opaque(p, &fh->fh_base, length);
222 WRITE32(hdr->ident);
223 hdr->nops_p = p;
224 WRITE32(hdr->nops);
225} 157}
226 158
227static void encode_cb_nops(struct nfs4_cb_compound_hdr *hdr) 159/*
160 * stateid4
161 *
162 * struct stateid4 {
163 * uint32_t seqid;
164 * opaque other[12];
165 * };
166 */
167static void encode_stateid4(struct xdr_stream *xdr, const stateid_t *sid)
228{ 168{
229 *hdr->nops_p = htonl(hdr->nops); 169 __be32 *p;
170
171 p = xdr_reserve_space(xdr, NFS4_STATEID_SIZE);
172 *p++ = cpu_to_be32(sid->si_generation);
173 xdr_encode_opaque_fixed(p, &sid->si_opaque, NFS4_STATEID_OTHER_SIZE);
230} 174}
231 175
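encode_stateid4() and its siblings follow sunrpc's reserve-then-fill discipline: xdr_reserve_space() claims a 4-byte-aligned slot in the send buffer, and the caller writes big-endian words into it. A toy userspace stream showing the shape (the real xdr_stream also spans page vectors and handles overflow differently):

#include <arpa/inet.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

struct toy_xdr {
	unsigned char *p, *end;
};

/* claim nbytes (padded to the XDR 4-byte boundary) or fail */
static unsigned char *toy_reserve(struct toy_xdr *x, size_t nbytes)
{
	size_t padded = (nbytes + 3) & ~(size_t)3;
	unsigned char *p = x->p;

	if ((size_t)(x->end - x->p) < padded)
		return NULL;
	x->p += padded;
	return p;
}

static int toy_encode_u32(struct toy_xdr *x, uint32_t val)
{
	unsigned char *p = toy_reserve(x, 4);
	uint32_t be = htonl(val);

	if (!p)
		return -1;
	memcpy(p, &be, 4);
	return 0;
}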
232static void 176/*
233encode_cb_recall(struct xdr_stream *xdr, struct nfs4_delegation *dp, 177 * sessionid4
234 struct nfs4_cb_compound_hdr *hdr) 178 *
179 * typedef opaque sessionid4[NFS4_SESSIONID_SIZE];
180 */
181static void encode_sessionid4(struct xdr_stream *xdr,
182 const struct nfsd4_session *session)
235{ 183{
236 __be32 *p; 184 __be32 *p;
237 int len = dp->dl_fh.fh_size; 185
238 186 p = xdr_reserve_space(xdr, NFS4_MAX_SESSIONID_LEN);
239 RESERVE_SPACE(4); 187 xdr_encode_opaque_fixed(p, session->se_sessionid.data,
240 WRITE32(OP_CB_RECALL); 188 NFS4_MAX_SESSIONID_LEN);
241 encode_stateid(xdr, &dp->dl_stateid);
242 RESERVE_SPACE(8 + (XDR_QUADLEN(len) << 2));
243 WRITE32(0); /* truncate optimization not implemented */
244 WRITE32(len);
245 WRITEMEM(&dp->dl_fh.fh_base, len);
246 hdr->nops++;
247} 189}
248 190
249static void 191/*
250encode_cb_sequence(struct xdr_stream *xdr, struct nfsd4_callback *cb, 192 * nfsstat4
251 struct nfs4_cb_compound_hdr *hdr) 193 */
252{ 194static const struct {
253 __be32 *p; 195 int stat;
254 struct nfsd4_session *ses = cb->cb_clp->cl_cb_session; 196 int errno;
197} nfs_cb_errtbl[] = {
198 { NFS4_OK, 0 },
199 { NFS4ERR_PERM, -EPERM },
200 { NFS4ERR_NOENT, -ENOENT },
201 { NFS4ERR_IO, -EIO },
202 { NFS4ERR_NXIO, -ENXIO },
203 { NFS4ERR_ACCESS, -EACCES },
204 { NFS4ERR_EXIST, -EEXIST },
205 { NFS4ERR_XDEV, -EXDEV },
206 { NFS4ERR_NOTDIR, -ENOTDIR },
207 { NFS4ERR_ISDIR, -EISDIR },
208 { NFS4ERR_INVAL, -EINVAL },
209 { NFS4ERR_FBIG, -EFBIG },
210 { NFS4ERR_NOSPC, -ENOSPC },
211 { NFS4ERR_ROFS, -EROFS },
212 { NFS4ERR_MLINK, -EMLINK },
213 { NFS4ERR_NAMETOOLONG, -ENAMETOOLONG },
214 { NFS4ERR_NOTEMPTY, -ENOTEMPTY },
215 { NFS4ERR_DQUOT, -EDQUOT },
216 { NFS4ERR_STALE, -ESTALE },
217 { NFS4ERR_BADHANDLE, -EBADHANDLE },
218 { NFS4ERR_BAD_COOKIE, -EBADCOOKIE },
219 { NFS4ERR_NOTSUPP, -ENOTSUPP },
220 { NFS4ERR_TOOSMALL, -ETOOSMALL },
221 { NFS4ERR_SERVERFAULT, -ESERVERFAULT },
222 { NFS4ERR_BADTYPE, -EBADTYPE },
223 { NFS4ERR_LOCKED, -EAGAIN },
224 { NFS4ERR_RESOURCE, -EREMOTEIO },
225 { NFS4ERR_SYMLINK, -ELOOP },
226 { NFS4ERR_OP_ILLEGAL, -EOPNOTSUPP },
227 { NFS4ERR_DEADLOCK, -EDEADLK },
228 { -1, -EIO }
229};
255 230
256 if (hdr->minorversion == 0) 231/*
257 return; 232 * If we cannot translate the error, the recovery routines should
233 * handle it.
234 *
235 * Note: remaining NFSv4 error codes have values > 10000, so should
236 * not conflict with native Linux error codes.
237 */
238static int nfs_cb_stat_to_errno(int status)
239{
240 int i;
258 241
259 RESERVE_SPACE(1 + NFS4_MAX_SESSIONID_LEN + 20); 242 for (i = 0; nfs_cb_errtbl[i].stat != -1; i++) {
243 if (nfs_cb_errtbl[i].stat == status)
244 return nfs_cb_errtbl[i].errno;
245 }
260 246
261 WRITE32(OP_CB_SEQUENCE); 247 dprintk("NFSD: Unrecognized NFS CB status value: %u\n", status);
262 WRITEMEM(ses->se_sessionid.data, NFS4_MAX_SESSIONID_LEN); 248 return -status;
263 WRITE32(ses->se_cb_seq_nr);
264 WRITE32(0); /* slotid, always 0 */
265 WRITE32(0); /* highest slotid always 0 */
266 WRITE32(0); /* cachethis always 0 */
267 WRITE32(0); /* FIXME: support referring_call_lists */
268 hdr->nops++;
269} 249}
270 250
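nfs_cb_stat_to_errno() is a sentinel-terminated table scan: walk nfs_cb_errtbl until the { -1, ... } row, and treat anything unrecognized as a fallback case. The same shape in miniature, with illustrative values (this demo reuses the sentinel row as the default, whereas the real helper logs and negates the unknown status instead):

static const struct {
	int stat;
	int err;
} demo_errtbl[] = {
	{  0,  0 },
	{  1, -1 },
	{ -1, -5 },     /* sentinel row doubles as the fallback here */
};

static int demo_stat_to_errno(int stat)
{
	int i;

	for (i = 0; demo_errtbl[i].stat != -1; i++)
		if (demo_errtbl[i].stat == stat)
			return demo_errtbl[i].err;
	return demo_errtbl[i].err;
}

Note the rewritten kernel table stores negative errnos directly, so callers no longer have to negate the return value themselves.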
271static int 251static int decode_cb_op_status(struct xdr_stream *xdr, enum nfs_opnum4 expected,
272nfs4_xdr_enc_cb_null(struct rpc_rqst *req, __be32 *p) 252 enum nfsstat4 *status)
273{ 253{
274 struct xdr_stream xdrs, *xdr = &xdrs; 254 __be32 *p;
255 u32 op;
275 256
276 xdr_init_encode(&xdrs, &req->rq_snd_buf, p); 257 p = xdr_inline_decode(xdr, 4 + 4);
277 RESERVE_SPACE(0); 258 if (unlikely(p == NULL))
259 goto out_overflow;
260 op = be32_to_cpup(p++);
261 if (unlikely(op != expected))
262 goto out_unexpected;
263 *status = be32_to_cpup(p);
278 return 0; 264 return 0;
265out_overflow:
266 print_overflow_msg(__func__, xdr);
267 return -EIO;
268out_unexpected:
269 dprintk("NFSD: Callback server returned operation %d but "
270 "we issued a request for %d\n", op, expected);
271 return -EIO;
279} 272}
280 273
281static int 274/*
282nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, __be32 *p, 275 * CB_COMPOUND4args
283 struct nfsd4_callback *cb) 276 *
277 * struct CB_COMPOUND4args {
278 * utf8str_cs tag;
279 * uint32_t minorversion;
280 * uint32_t callback_ident;
281 * nfs_cb_argop4 argarray<>;
282 * };
283*/
284static void encode_cb_compound4args(struct xdr_stream *xdr,
285 struct nfs4_cb_compound_hdr *hdr)
284{ 286{
285 struct xdr_stream xdr; 287 __be32 * p;
286 struct nfs4_delegation *args = cb->cb_op;
287 struct nfs4_cb_compound_hdr hdr = {
288 .ident = cb->cb_clp->cl_cb_ident,
289 .minorversion = cb->cb_minorversion,
290 };
291 288
292 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 289 p = xdr_reserve_space(xdr, 4 + 4 + 4 + 4);
293 encode_cb_compound_hdr(&xdr, &hdr); 290 p = xdr_encode_empty_array(p); /* empty tag */
294 encode_cb_sequence(&xdr, cb, &hdr); 291 *p++ = cpu_to_be32(hdr->minorversion);
295 encode_cb_recall(&xdr, args, &hdr); 292 *p++ = cpu_to_be32(hdr->ident);
296 encode_cb_nops(&hdr); 293
294 hdr->nops_p = p;
295 *p = cpu_to_be32(hdr->nops); /* argarray element count */
296}
297
298/*
299 * Update argarray element count
300 */
301static void encode_cb_nops(struct nfs4_cb_compound_hdr *hdr)
302{
303 BUG_ON(hdr->nops > NFS4_MAX_BACK_CHANNEL_OPS);
304 *hdr->nops_p = cpu_to_be32(hdr->nops);
305}
306
307/*
308 * CB_COMPOUND4res
309 *
310 * struct CB_COMPOUND4res {
311 * nfsstat4 status;
312 * utf8str_cs tag;
313 * nfs_cb_resop4 resarray<>;
314 * };
315 */
316static int decode_cb_compound4res(struct xdr_stream *xdr,
317 struct nfs4_cb_compound_hdr *hdr)
318{
319 u32 length;
320 __be32 *p;
321
322 p = xdr_inline_decode(xdr, 4 + 4);
323 if (unlikely(p == NULL))
324 goto out_overflow;
325 hdr->status = be32_to_cpup(p++);
326 /* Ignore the tag */
327 length = be32_to_cpup(p++);
328 p = xdr_inline_decode(xdr, length + 4);
329 if (unlikely(p == NULL))
330 goto out_overflow;
331 hdr->nops = be32_to_cpup(p);
297 return 0; 332 return 0;
333out_overflow:
334 print_overflow_msg(__func__, xdr);
335 return -EIO;
298} 336}
299 337
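On the decode side, the new helpers never trust the wire: every read goes through xdr_inline_decode(), which returns NULL when the receive buffer runs short, and the caller turns that into print_overflow_msg() plus -EIO. A decode-side counterpart to the toy stream sketched earlier:

#include <arpa/inet.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

struct toy_xdr {
	unsigned char *p, *end;
};

/* hand out nbytes (XDR-padded) only if they are actually present */
static unsigned char *toy_inline_decode(struct toy_xdr *x, size_t nbytes)
{
	size_t padded = (nbytes + 3) & ~(size_t)3;
	unsigned char *p = x->p;

	if ((size_t)(x->end - x->p) < padded)
		return NULL;    /* caller logs the overflow, returns -EIO */
	x->p += padded;
	return p;
}

static int toy_decode_u32(struct toy_xdr *x, uint32_t *val)
{
	unsigned char *p = toy_inline_decode(x, 4);
	uint32_t be;

	if (!p)
		return -1;
	memcpy(&be, p, 4);
	*val = ntohl(be);
	return 0;
}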
338/*
339 * CB_RECALL4args
340 *
341 * struct CB_RECALL4args {
342 * stateid4 stateid;
343 * bool truncate;
344 * nfs_fh4 fh;
345 * };
346 */
347static void encode_cb_recall4args(struct xdr_stream *xdr,
348 const struct nfs4_delegation *dp,
349 struct nfs4_cb_compound_hdr *hdr)
350{
351 __be32 *p;
352
353 encode_nfs_cb_opnum4(xdr, OP_CB_RECALL);
354 encode_stateid4(xdr, &dp->dl_stateid);
355
356 p = xdr_reserve_space(xdr, 4);
357 *p++ = xdr_zero; /* truncate */
300 358
301static int 359 encode_nfs_fh4(xdr, &dp->dl_fh);
302decode_cb_compound_hdr(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr){
303 __be32 *p;
304 u32 taglen;
305 360
306 READ_BUF(8); 361 hdr->nops++;
307 READ32(hdr->status);
308 /* We've got no use for the tag; ignore it: */
309 READ32(taglen);
310 READ_BUF(taglen + 4);
311 p += XDR_QUADLEN(taglen);
312 READ32(hdr->nops);
313 return 0;
314} 362}
315 363
316static int 364/*
317decode_cb_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected) 365 * CB_SEQUENCE4args
366 *
367 * struct CB_SEQUENCE4args {
368 * sessionid4 csa_sessionid;
369 * sequenceid4 csa_sequenceid;
370 * slotid4 csa_slotid;
371 * slotid4 csa_highest_slotid;
372 * bool csa_cachethis;
373 * referring_call_list4 csa_referring_call_lists<>;
374 * };
375 */
376static void encode_cb_sequence4args(struct xdr_stream *xdr,
377 const struct nfsd4_callback *cb,
378 struct nfs4_cb_compound_hdr *hdr)
318{ 379{
380 struct nfsd4_session *session = cb->cb_clp->cl_cb_session;
319 __be32 *p; 381 __be32 *p;
320 u32 op; 382
321 int32_t nfserr; 383 if (hdr->minorversion == 0)
322 384 return;
323 READ_BUF(8); 385
324 READ32(op); 386 encode_nfs_cb_opnum4(xdr, OP_CB_SEQUENCE);
325 if (op != expected) { 387 encode_sessionid4(xdr, session);
326 dprintk("NFSD: decode_cb_op_hdr: Callback server returned " 388
327 " operation %d but we issued a request for %d\n", 389 p = xdr_reserve_space(xdr, 4 + 4 + 4 + 4 + 4);
328 op, expected); 390 *p++ = cpu_to_be32(session->se_cb_seq_nr); /* csa_sequenceid */
329 return -EIO; 391 *p++ = xdr_zero; /* csa_slotid */
330 } 392 *p++ = xdr_zero; /* csa_highest_slotid */
331 READ32(nfserr); 393 *p++ = xdr_zero; /* csa_cachethis */
332 if (nfserr != NFS_OK) 394 xdr_encode_empty_array(p); /* csa_referring_call_lists */
333 return -nfs_cb_stat_to_errno(nfserr); 395
334 return 0; 396 hdr->nops++;
335} 397}
336 398
337/* 399/*
400 * CB_SEQUENCE4resok
401 *
402 * struct CB_SEQUENCE4resok {
403 * sessionid4 csr_sessionid;
404 * sequenceid4 csr_sequenceid;
405 * slotid4 csr_slotid;
406 * slotid4 csr_highest_slotid;
407 * slotid4 csr_target_highest_slotid;
408 * };
409 *
410 * union CB_SEQUENCE4res switch (nfsstat4 csr_status) {
411 * case NFS4_OK:
412 * CB_SEQUENCE4resok csr_resok4;
413 * default:
414 * void;
415 * };
416 *
 338 * Our current back channel implementation supports a single backchannel 417
339 * with a single slot. 418 * with a single slot.
340 */ 419 */
420static int decode_cb_sequence4resok(struct xdr_stream *xdr,
421				    struct nfsd4_callback *cb)
422{
423	struct nfsd4_session *session = cb->cb_clp->cl_cb_session;
424	struct nfs4_sessionid id;
425	int status;
426	__be32 *p;
427	u32 dummy;
428
429	status = -ESERVERFAULT;
430
431	/*
432	 * If the server returns different values for sessionID, slotID or
433	 * sequence number, the server is looney tunes.
434	 */
435	p = xdr_inline_decode(xdr, NFS4_MAX_SESSIONID_LEN + 4 + 4);
436	if (unlikely(p == NULL))
437		goto out_overflow;
438	memcpy(id.data, p, NFS4_MAX_SESSIONID_LEN);
439	if (memcmp(id.data, session->se_sessionid.data,
440					NFS4_MAX_SESSIONID_LEN) != 0) {
441		dprintk("NFS: %s Invalid session id\n", __func__);
442		goto out;
443	}
444	p += XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN);
445
446	dummy = be32_to_cpup(p++);
447	if (dummy != session->se_cb_seq_nr) {
448		dprintk("NFS: %s Invalid sequence number\n", __func__);
449		goto out;
450	}
451
452	dummy = be32_to_cpup(p++);
453	if (dummy != 0) {
454		dprintk("NFS: %s Invalid slotid\n", __func__);
455		goto out;
456	}
457
458	/*
459	 * FIXME: process highest slotid and target highest slotid
460	 */
461	status = 0;
462out:
463	return status;
464out_overflow:
465	print_overflow_msg(__func__, xdr);
466	return -EIO;
467}

341static int
342decode_cb_sequence(struct xdr_stream *xdr, struct nfsd4_callback *cb,
343		struct rpc_rqst *rqstp)
344{
345	struct nfsd4_session *ses = cb->cb_clp->cl_cb_session;
346	struct nfs4_sessionid id;
347	int status;
348	u32 dummy;
349	__be32 *p;
350
351	if (cb->cb_minorversion == 0)
352		return 0;
353
354	status = decode_cb_op_hdr(xdr, OP_CB_SEQUENCE);
355	if (status)
356		return status;
357
358	/*
359	 * If the server returns different values for sessionID, slotID or
360	 * sequence number, the server is looney tunes.
361	 */
362	status = -ESERVERFAULT;
363
364	READ_BUF(NFS4_MAX_SESSIONID_LEN + 16);
365	memcpy(id.data, p, NFS4_MAX_SESSIONID_LEN);
366	p += XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN);
367	if (memcmp(id.data, ses->se_sessionid.data, NFS4_MAX_SESSIONID_LEN)) {
368		dprintk("%s Invalid session id\n", __func__);
369		goto out;
370	}
371	READ32(dummy);
372	if (dummy != ses->se_cb_seq_nr) {
373		dprintk("%s Invalid sequence number\n", __func__);
374		goto out;
375	}
376	READ32(dummy); /* slotid must be 0 */
377	if (dummy != 0) {
378		dprintk("%s Invalid slotid\n", __func__);
379		goto out;
380	}
381	/* FIXME: process highest slotid and target highest slotid */
382	status = 0;
383out:
384	return status;
385}
386 468
469static int decode_cb_sequence4res(struct xdr_stream *xdr,
470 struct nfsd4_callback *cb)
471{
472 enum nfsstat4 nfserr;
473 int status;
474
475 if (cb->cb_minorversion == 0)
476 return 0;
387 477
478	status = decode_cb_op_status(xdr, OP_CB_SEQUENCE, &nfserr);
479	if (unlikely(status))
480		goto out;
481	if (unlikely(nfserr != NFS4_OK))
482		goto out_default;
483	status = decode_cb_sequence4resok(xdr, cb);
484out:
485	return status;
486out_default:
487	return nfs_cb_stat_to_errno(nfserr);
488}
489
490/*
491 * NFSv4.0 and NFSv4.1 XDR encode functions
492 *
493 * NFSv4.0 callback argument types are defined in section 15 of RFC
494 * 3530: "Network File System (NFS) version 4 Protocol" and section 20
495 * of RFC 5661: "Network File System (NFS) Version 4 Minor Version 1
496 * Protocol".
497 */
498
499/*
500 * NB: Without this zero space reservation, callbacks over krb5p fail
501 */
502static void nfs4_xdr_enc_cb_null(struct rpc_rqst *req, struct xdr_stream *xdr,
503				 void *__unused)
504{
505	xdr_reserve_space(xdr, 0);
506}
507
508/*
509 * 20.2. Operation 4: CB_RECALL - Recall a Delegation
510 */
511static void nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, struct xdr_stream *xdr,
512				   const struct nfsd4_callback *cb)
513{
514	const struct nfs4_delegation *args = cb->cb_op;
515	struct nfs4_cb_compound_hdr hdr = {
516		.ident = cb->cb_clp->cl_cb_ident,
517		.minorversion = cb->cb_minorversion,
518	};
519
520	encode_cb_compound4args(xdr, &hdr);
521	encode_cb_sequence4args(xdr, cb, &hdr);
522	encode_cb_recall4args(xdr, args, &hdr);
523	encode_cb_nops(&hdr);
524}
525
526
527/*
528 * NFSv4.0 and NFSv4.1 XDR decode functions
529 *
530 * NFSv4.0 callback result types are defined in section 15 of RFC
531 * 3530: "Network File System (NFS) version 4 Protocol" and section 20
532 * of RFC 5661: "Network File System (NFS) Version 4 Minor Version 1
533 * Protocol".
534 */
535
536static int nfs4_xdr_dec_cb_null(struct rpc_rqst *req, struct xdr_stream *xdr,
537				void *__unused)

388static int
389nfs4_xdr_dec_cb_null(struct rpc_rqst *req, __be32 *p)
390{ 538{
391 return 0; 539 return 0;
392} 540}
393 541
394static int
395nfs4_xdr_dec_cb_recall(struct rpc_rqst *rqstp, __be32 *p,
396		struct nfsd4_callback *cb)
397{
398	struct xdr_stream xdr;
399	struct nfs4_cb_compound_hdr hdr;
400	int status;
401
402	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
403	status = decode_cb_compound_hdr(&xdr, &hdr);
404	if (status)
405		goto out;
406	if (cb) {
407		status = decode_cb_sequence(&xdr, cb, rqstp);
408		if (status)
409			goto out;
410	}
411	status = decode_cb_op_hdr(&xdr, OP_CB_RECALL);
412out:
413	return status;
414}

542/*
543 * 20.2. Operation 4: CB_RECALL - Recall a Delegation
544 */
545static int nfs4_xdr_dec_cb_recall(struct rpc_rqst *rqstp,
546				  struct xdr_stream *xdr,
547				  struct nfsd4_callback *cb)
548{
549	struct nfs4_cb_compound_hdr hdr;
550	enum nfsstat4 nfserr;
551	int status;
552
553	status = decode_cb_compound4res(xdr, &hdr);
554	if (unlikely(status))
555		goto out;
556
557	if (cb != NULL) {
558		status = decode_cb_sequence4res(xdr, cb);
559		if (unlikely(status))
560			goto out;
561	}
562
563	status = decode_cb_op_status(xdr, OP_CB_RECALL, &nfserr);
564	if (unlikely(status))
565		goto out;
566	if (unlikely(nfserr != NFS4_OK))
567		goto out_default;
568out:
569	return status;
570out_default:
571	return nfs_cb_stat_to_errno(nfserr);
572}
415 573
416/* 574/*
417 * RPC procedure tables 575 * RPC procedure tables
418 */ 576 */
419#define PROC(proc, call, argtype, restype) \ 577#define PROC(proc, call, argtype, restype) \
420[NFSPROC4_CLNT_##proc] = { \ 578[NFSPROC4_CLNT_##proc] = { \
421 .p_proc = NFSPROC4_CB_##call, \ 579 .p_proc = NFSPROC4_CB_##call, \
422 .p_encode = (kxdrproc_t) nfs4_xdr_##argtype, \ 580 .p_encode = (kxdreproc_t)nfs4_xdr_enc_##argtype, \
423 .p_decode = (kxdrproc_t) nfs4_xdr_##restype, \ 581 .p_decode = (kxdrdproc_t)nfs4_xdr_dec_##restype, \
424 .p_arglen = NFS4_##argtype##_sz, \ 582 .p_arglen = NFS4_enc_##argtype##_sz, \
425 .p_replen = NFS4_##restype##_sz, \ 583 .p_replen = NFS4_dec_##restype##_sz, \
426 .p_statidx = NFSPROC4_CB_##call, \ 584 .p_statidx = NFSPROC4_CB_##call, \
427 .p_name = #proc, \ 585 .p_name = #proc, \
428} 586}
429 587
430static struct rpc_procinfo nfs4_cb_procedures[] = { 588static struct rpc_procinfo nfs4_cb_procedures[] = {
431 PROC(CB_NULL, NULL, enc_cb_null, dec_cb_null), 589 PROC(CB_NULL, NULL, cb_null, cb_null),
432 PROC(CB_RECALL, COMPOUND, enc_cb_recall, dec_cb_recall), 590 PROC(CB_RECALL, COMPOUND, cb_recall, cb_recall),
433}; 591};
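Each table entry binds a wire procedure number to its encode/decode pair. The nfsd4_cb_recall() hunk later in this file consumes an entry roughly like this (field names taken from that hunk):

	cb->cb_msg.rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_RECALL];
	cb->cb_msg.rpc_argp = cb;
	cb->cb_msg.rpc_resp = cb;

so the RPC client reaches nfs4_xdr_enc_cb_recall()/nfs4_xdr_dec_cb_recall() through .p_encode/.p_decode without the caller naming them directly.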
434 592
435static struct rpc_version nfs_cb_version4 = { 593static struct rpc_version nfs_cb_version4 = {
436/* 594/*
437 * Note on the callback rpc program version number: despite language in rfc 595 * Note on the callback rpc program version number: despite language in rfc
438 * 5661 section 18.36.3 requiring servers to use 4 in this field, the 596 * 5661 section 18.36.3 requiring servers to use 4 in this field, the
@@ -440,29 +598,29 @@ static struct rpc_version nfs_cb_version4 = {
440 * in practice that appears to be what implementations use. The section 598 * in practice that appears to be what implementations use. The section
441 * 18.36.3 language is expected to be fixed in an erratum. 599 * 18.36.3 language is expected to be fixed in an erratum.
442 */ 600 */
443 .number = 1, 601 .number = 1,
444 .nrprocs = ARRAY_SIZE(nfs4_cb_procedures), 602 .nrprocs = ARRAY_SIZE(nfs4_cb_procedures),
445 .procs = nfs4_cb_procedures 603 .procs = nfs4_cb_procedures
446}; 604};
447 605
448static struct rpc_version * nfs_cb_version[] = { 606static struct rpc_version *nfs_cb_version[] = {
449 &nfs_cb_version4, 607 &nfs_cb_version4,
450}; 608};
451 609
452static struct rpc_program cb_program; 610static struct rpc_program cb_program;
453 611
454static struct rpc_stat cb_stats = { 612static struct rpc_stat cb_stats = {
455 .program = &cb_program 613 .program = &cb_program
456}; 614};
457 615
458#define NFS4_CALLBACK 0x40000000 616#define NFS4_CALLBACK 0x40000000
459static struct rpc_program cb_program = { 617static struct rpc_program cb_program = {
460 .name = "nfs4_cb", 618 .name = "nfs4_cb",
461 .number = NFS4_CALLBACK, 619 .number = NFS4_CALLBACK,
462 .nrvers = ARRAY_SIZE(nfs_cb_version), 620 .nrvers = ARRAY_SIZE(nfs_cb_version),
463 .version = nfs_cb_version, 621 .version = nfs_cb_version,
464 .stats = &cb_stats, 622 .stats = &cb_stats,
465 .pipe_dir_name = "/nfsd4_cb", 623 .pipe_dir_name = "/nfsd4_cb",
466}; 624};
467 625
468static int max_cb_time(void) 626static int max_cb_time(void)
@@ -470,10 +628,8 @@ static int max_cb_time(void)
470 return max(nfsd4_lease/10, (time_t)1) * HZ; 628 return max(nfsd4_lease/10, (time_t)1) * HZ;
471} 629}
472 630
473/* Reference counting, callback cleanup, etc., all look racy as heck.
474 * And why is cl_cb_set an atomic? */
475 631
476int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *conn) 632static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *conn, struct nfsd4_session *ses)
477{ 633{
478 struct rpc_timeout timeparms = { 634 struct rpc_timeout timeparms = {
479 .to_initval = max_cb_time(), 635 .to_initval = max_cb_time(),
@@ -483,6 +639,7 @@ int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *conn)
483 .net = &init_net, 639 .net = &init_net,
484 .address = (struct sockaddr *) &conn->cb_addr, 640 .address = (struct sockaddr *) &conn->cb_addr,
485 .addrsize = conn->cb_addrlen, 641 .addrsize = conn->cb_addrlen,
642 .saddress = (struct sockaddr *) &conn->cb_saddr,
486 .timeout = &timeparms, 643 .timeout = &timeparms,
487 .program = &cb_program, 644 .program = &cb_program,
488 .version = 0, 645 .version = 0,
@@ -499,6 +656,10 @@ int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *conn)
499 args.protocol = XPRT_TRANSPORT_TCP; 656 args.protocol = XPRT_TRANSPORT_TCP;
500 clp->cl_cb_ident = conn->cb_ident; 657 clp->cl_cb_ident = conn->cb_ident;
501 } else { 658 } else {
659 if (!conn->cb_xprt)
660 return -EINVAL;
661 clp->cl_cb_conn.cb_xprt = conn->cb_xprt;
662 clp->cl_cb_session = ses;
502 args.bc_xprt = conn->cb_xprt; 663 args.bc_xprt = conn->cb_xprt;
503 args.prognumber = clp->cl_cb_session->se_cb_prog; 664 args.prognumber = clp->cl_cb_session->se_cb_prog;
504 args.protocol = XPRT_TRANSPORT_BC_TCP; 665 args.protocol = XPRT_TRANSPORT_BC_TCP;
@@ -521,14 +682,20 @@ static void warn_no_callback_path(struct nfs4_client *clp, int reason)
521 (int)clp->cl_name.len, clp->cl_name.data, reason); 682 (int)clp->cl_name.len, clp->cl_name.data, reason);
522} 683}
523 684
685static void nfsd4_mark_cb_down(struct nfs4_client *clp, int reason)
686{
687 clp->cl_cb_state = NFSD4_CB_DOWN;
688 warn_no_callback_path(clp, reason);
689}
690
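nfsd4_mark_cb_down() and the probe/recall completion paths below drive a three-state channel flag that replaces the old atomic cl_cb_set. The enum lives in the state header, which is not part of this fs/ diff; its assumed shape is:

enum nfsd4_cb_state {	/* sketch; declaration assumed, not shown in this diff */
	NFSD4_CB_UP,
	NFSD4_CB_UNKNOWN,
	NFSD4_CB_DOWN,
};

NFSD4_CB_UNKNOWN is the optimistic initial state (see create_client() and nfsd4_probe_callback() below); a successful NULL probe moves the client to UP, and any failure to DOWN.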
524static void nfsd4_cb_probe_done(struct rpc_task *task, void *calldata) 691static void nfsd4_cb_probe_done(struct rpc_task *task, void *calldata)
525{ 692{
526 struct nfs4_client *clp = container_of(calldata, struct nfs4_client, cl_cb_null); 693 struct nfs4_client *clp = container_of(calldata, struct nfs4_client, cl_cb_null);
527 694
528 if (task->tk_status) 695 if (task->tk_status)
529 warn_no_callback_path(clp, task->tk_status); 696 nfsd4_mark_cb_down(clp, task->tk_status);
530 else 697 else
531 atomic_set(&clp->cl_cb_set, 1); 698 clp->cl_cb_state = NFSD4_CB_UP;
532} 699}
533 700
534static const struct rpc_call_ops nfsd4_cb_probe_ops = { 701static const struct rpc_call_ops nfsd4_cb_probe_ops = {
@@ -551,6 +718,11 @@ int set_callback_cred(void)
551 718
552static struct workqueue_struct *callback_wq; 719static struct workqueue_struct *callback_wq;
553 720
721static void run_nfsd4_cb(struct nfsd4_callback *cb)
722{
723 queue_work(callback_wq, &cb->cb_work);
724}
725
554static void do_probe_callback(struct nfs4_client *clp) 726static void do_probe_callback(struct nfs4_client *clp)
555{ 727{
556 struct nfsd4_callback *cb = &clp->cl_cb_null; 728 struct nfsd4_callback *cb = &clp->cl_cb_null;
@@ -565,7 +737,7 @@ static void do_probe_callback(struct nfs4_client *clp)
565 737
566 cb->cb_ops = &nfsd4_cb_probe_ops; 738 cb->cb_ops = &nfsd4_cb_probe_ops;
567 739
568 queue_work(callback_wq, &cb->cb_work); 740 run_nfsd4_cb(cb);
569} 741}
570 742
571/* 743/*
@@ -574,14 +746,21 @@ static void do_probe_callback(struct nfs4_client *clp)
574 */ 746 */
575void nfsd4_probe_callback(struct nfs4_client *clp) 747void nfsd4_probe_callback(struct nfs4_client *clp)
576{ 748{
749 /* XXX: atomicity? Also, should we be using cl_cb_flags? */
750 clp->cl_cb_state = NFSD4_CB_UNKNOWN;
577 set_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_cb_flags); 751 set_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_cb_flags);
578 do_probe_callback(clp); 752 do_probe_callback(clp);
579} 753}
580 754
581void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *conn)
582{
583	BUG_ON(atomic_read(&clp->cl_cb_set));
584

755void nfsd4_probe_callback_sync(struct nfs4_client *clp)
756{
757	nfsd4_probe_callback(clp);
758	flush_workqueue(callback_wq);
759}
760
761void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *conn)
762{
763	clp->cl_cb_state = NFSD4_CB_UNKNOWN;
585 spin_lock(&clp->cl_lock); 764 spin_lock(&clp->cl_lock);
586 memcpy(&clp->cl_cb_conn, conn, sizeof(struct nfs4_cb_conn)); 765 memcpy(&clp->cl_cb_conn, conn, sizeof(struct nfs4_cb_conn));
587 spin_unlock(&clp->cl_lock); 766 spin_unlock(&clp->cl_lock);
@@ -592,24 +771,14 @@ void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *conn)
592 * If the slot is available, then mark it busy. Otherwise, set the 771 * If the slot is available, then mark it busy. Otherwise, set the
593 * thread for sleeping on the callback RPC wait queue. 772 * thread for sleeping on the callback RPC wait queue.
594 */ 773 */
595static int nfsd41_cb_setup_sequence(struct nfs4_client *clp,
596		struct rpc_task *task)
597{
598	u32 *ptr = (u32 *)clp->cl_cb_session->se_sessionid.data;
599	int status = 0;
600
601	dprintk("%s: %u:%u:%u:%u\n", __func__,
602		ptr[0], ptr[1], ptr[2], ptr[3]);
603
604	if (test_and_set_bit(0, &clp->cl_cb_slot_busy) != 0) {
605		rpc_sleep_on(&clp->cl_cb_waitq, task, NULL);
606		dprintk("%s slot is busy\n", __func__);
607		status = -EAGAIN;
608		goto out;
609	}
610out:
611	dprintk("%s status=%d\n", __func__, status);
612	return status;
613}

774static bool nfsd41_cb_get_slot(struct nfs4_client *clp, struct rpc_task *task)
775{
776	if (test_and_set_bit(0, &clp->cl_cb_slot_busy) != 0) {
777		rpc_sleep_on(&clp->cl_cb_waitq, task, NULL);
778		dprintk("%s slot is busy\n", __func__);
779		return false;
780	}
781	return true;
782}
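nfsd41_cb_get_slot() is a try-lock over the single backchannel slot: bit 0 of cl_cb_slot_busy stands in for a slot table of size one. A caller that finds the slot busy parks its task on cl_cb_waitq before reporting failure, so the prepare path can simply bail out:

	if (!nfsd41_cb_get_slot(clp, task))
		return;	/* task sleeps; the holder's completion path
			 * wakes it and ->rpc_call_prepare runs again */

The matching release (a clear_bit() plus rpc_wake_up_next() in the done path, elsewhere in this file) is unchanged by this patch.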
614 783
615/* 784/*
@@ -622,20 +791,19 @@ static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata)
622 struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall); 791 struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall);
623 struct nfs4_client *clp = dp->dl_client; 792 struct nfs4_client *clp = dp->dl_client;
624 u32 minorversion = clp->cl_minorversion; 793 u32 minorversion = clp->cl_minorversion;
625 int status = 0;
626 794
627 cb->cb_minorversion = minorversion; 795 cb->cb_minorversion = minorversion;
628 if (minorversion) { 796 if (minorversion) {
629 status = nfsd41_cb_setup_sequence(clp, task); 797 if (!nfsd41_cb_get_slot(clp, task))
630 if (status) {
631 if (status != -EAGAIN) {
632 /* terminate rpc task */
633 task->tk_status = status;
634 task->tk_action = NULL;
635 }
636 return; 798 return;
637 }
638 } 799 }
800 spin_lock(&clp->cl_lock);
801 if (list_empty(&cb->cb_per_client)) {
802 /* This is the first call, not a restart */
803 cb->cb_done = false;
804 list_add(&cb->cb_per_client, &clp->cl_callbacks);
805 }
806 spin_unlock(&clp->cl_lock);
639 rpc_call_start(task); 807 rpc_call_start(task);
640} 808}
641 809
@@ -671,15 +839,18 @@ static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata)
671 839
672 nfsd4_cb_done(task, calldata); 840 nfsd4_cb_done(task, calldata);
673 841
674 if (current_rpc_client == NULL) { 842 if (current_rpc_client != task->tk_client) {
675 /* We're shutting down; give up. */ 843 /* We're shutting down or changing cl_cb_client; leave
676 /* XXX: err, or is it ok just to fall through 844 * it to nfsd4_process_cb_update to restart the call if
677 * and rpc_restart_call? */ 845 * necessary. */
678 return; 846 return;
679 } 847 }
680 848
849 if (cb->cb_done)
850 return;
681 switch (task->tk_status) { 851 switch (task->tk_status) {
682 case 0: 852 case 0:
853 cb->cb_done = true;
683 return; 854 return;
684 case -EBADHANDLE: 855 case -EBADHANDLE:
685 case -NFS4ERR_BAD_STATEID: 856 case -NFS4ERR_BAD_STATEID:
@@ -688,32 +859,30 @@ static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata)
688 break; 859 break;
689 default: 860 default:
690 /* Network partition? */ 861 /* Network partition? */
691 atomic_set(&clp->cl_cb_set, 0); 862 nfsd4_mark_cb_down(clp, task->tk_status);
692 warn_no_callback_path(clp, task->tk_status);
693 if (current_rpc_client != task->tk_client) {
694 /* queue a callback on the new connection: */
695 atomic_inc(&dp->dl_count);
696 nfsd4_cb_recall(dp);
697 return;
698 }
699 } 863 }
700 if (dp->dl_retries--) { 864 if (dp->dl_retries--) {
701 rpc_delay(task, 2*HZ); 865 rpc_delay(task, 2*HZ);
702 task->tk_status = 0; 866 task->tk_status = 0;
703 rpc_restart_call_prepare(task); 867 rpc_restart_call_prepare(task);
704 return; 868 return;
705 } else {
706 atomic_set(&clp->cl_cb_set, 0);
707 warn_no_callback_path(clp, task->tk_status);
708 } 869 }
870 nfsd4_mark_cb_down(clp, task->tk_status);
871 cb->cb_done = true;
709} 872}
710 873
711static void nfsd4_cb_recall_release(void *calldata) 874static void nfsd4_cb_recall_release(void *calldata)
712{ 875{
713 struct nfsd4_callback *cb = calldata; 876 struct nfsd4_callback *cb = calldata;
877 struct nfs4_client *clp = cb->cb_clp;
714 struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall); 878 struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall);
715 879
716 nfs4_put_delegation(dp); 880 if (cb->cb_done) {
881 spin_lock(&clp->cl_lock);
882 list_del(&cb->cb_per_client);
883 spin_unlock(&clp->cl_lock);
884 nfs4_put_delegation(dp);
885 }
717} 886}
718 887
719static const struct rpc_call_ops nfsd4_cb_recall_ops = { 888static const struct rpc_call_ops nfsd4_cb_recall_ops = {
@@ -748,16 +917,33 @@ void nfsd4_shutdown_callback(struct nfs4_client *clp)
748 flush_workqueue(callback_wq); 917 flush_workqueue(callback_wq);
749} 918}
750 919
751void nfsd4_release_cb(struct nfsd4_callback *cb) 920static void nfsd4_release_cb(struct nfsd4_callback *cb)
752{ 921{
753 if (cb->cb_ops->rpc_release) 922 if (cb->cb_ops->rpc_release)
754 cb->cb_ops->rpc_release(cb); 923 cb->cb_ops->rpc_release(cb);
755} 924}
756 925
757void nfsd4_process_cb_update(struct nfsd4_callback *cb) 926/* requires cl_lock: */
927static struct nfsd4_conn * __nfsd4_find_backchannel(struct nfs4_client *clp)
928{
929 struct nfsd4_session *s;
930 struct nfsd4_conn *c;
931
932 list_for_each_entry(s, &clp->cl_sessions, se_perclnt) {
933 list_for_each_entry(c, &s->se_conns, cn_persession) {
934 if (c->cn_flags & NFS4_CDFC4_BACK)
935 return c;
936 }
937 }
938 return NULL;
939}
940
941static void nfsd4_process_cb_update(struct nfsd4_callback *cb)
758{ 942{
759 struct nfs4_cb_conn conn; 943 struct nfs4_cb_conn conn;
760 struct nfs4_client *clp = cb->cb_clp; 944 struct nfs4_client *clp = cb->cb_clp;
945 struct nfsd4_session *ses = NULL;
946 struct nfsd4_conn *c;
761 int err; 947 int err;
762 948
763 /* 949 /*
@@ -768,6 +954,10 @@ void nfsd4_process_cb_update(struct nfsd4_callback *cb)
768 rpc_shutdown_client(clp->cl_cb_client); 954 rpc_shutdown_client(clp->cl_cb_client);
769 clp->cl_cb_client = NULL; 955 clp->cl_cb_client = NULL;
770 } 956 }
957 if (clp->cl_cb_conn.cb_xprt) {
958 svc_xprt_put(clp->cl_cb_conn.cb_xprt);
959 clp->cl_cb_conn.cb_xprt = NULL;
960 }
771 if (test_bit(NFSD4_CLIENT_KILL, &clp->cl_cb_flags)) 961 if (test_bit(NFSD4_CLIENT_KILL, &clp->cl_cb_flags))
772 return; 962 return;
773 spin_lock(&clp->cl_lock); 963 spin_lock(&clp->cl_lock);
@@ -778,11 +968,22 @@ void nfsd4_process_cb_update(struct nfsd4_callback *cb)
778 BUG_ON(!clp->cl_cb_flags); 968 BUG_ON(!clp->cl_cb_flags);
779 clear_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_cb_flags); 969 clear_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_cb_flags);
780 memcpy(&conn, &cb->cb_clp->cl_cb_conn, sizeof(struct nfs4_cb_conn)); 970 memcpy(&conn, &cb->cb_clp->cl_cb_conn, sizeof(struct nfs4_cb_conn));
971 c = __nfsd4_find_backchannel(clp);
972 if (c) {
973 svc_xprt_get(c->cn_xprt);
974 conn.cb_xprt = c->cn_xprt;
975 ses = c->cn_session;
976 }
781 spin_unlock(&clp->cl_lock); 977 spin_unlock(&clp->cl_lock);
782 978
783 err = setup_callback_client(clp, &conn); 979 err = setup_callback_client(clp, &conn, ses);
784 if (err) 980 if (err) {
785 warn_no_callback_path(clp, err); 981 warn_no_callback_path(clp, err);
982 return;
983 }
984 /* Yay, the callback channel's back! Restart any callbacks: */
985 list_for_each_entry(cb, &clp->cl_callbacks, cb_per_client)
986 run_nfsd4_cb(cb);
786} 987}
787 988
788void nfsd4_do_callback_rpc(struct work_struct *w) 989void nfsd4_do_callback_rpc(struct work_struct *w)
@@ -807,10 +1008,11 @@ void nfsd4_do_callback_rpc(struct work_struct *w)
807void nfsd4_cb_recall(struct nfs4_delegation *dp) 1008void nfsd4_cb_recall(struct nfs4_delegation *dp)
808{ 1009{
809 struct nfsd4_callback *cb = &dp->dl_recall; 1010 struct nfsd4_callback *cb = &dp->dl_recall;
1011 struct nfs4_client *clp = dp->dl_client;
810 1012
811 dp->dl_retries = 1; 1013 dp->dl_retries = 1;
812 cb->cb_op = dp; 1014 cb->cb_op = dp;
813 cb->cb_clp = dp->dl_client; 1015 cb->cb_clp = clp;
814 cb->cb_msg.rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_RECALL]; 1016 cb->cb_msg.rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_RECALL];
815 cb->cb_msg.rpc_argp = cb; 1017 cb->cb_msg.rpc_argp = cb;
816 cb->cb_msg.rpc_resp = cb; 1018 cb->cb_msg.rpc_resp = cb;
@@ -819,5 +1021,8 @@ void nfsd4_cb_recall(struct nfs4_delegation *dp)
819 cb->cb_ops = &nfsd4_cb_recall_ops; 1021 cb->cb_ops = &nfsd4_cb_recall_ops;
820 dp->dl_retries = 1; 1022 dp->dl_retries = 1;
821 1023
822 queue_work(callback_wq, &dp->dl_recall.cb_work); 1024 INIT_LIST_HEAD(&cb->cb_per_client);
1025 cb->cb_done = true;
1026
1027 run_nfsd4_cb(&dp->dl_recall);
823} 1028}
diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c
index f0695e815f0e..6d2c397d458b 100644
--- a/fs/nfsd/nfs4idmap.c
+++ b/fs/nfsd/nfs4idmap.c
@@ -33,10 +33,11 @@
33 */ 33 */
34 34
35#include <linux/module.h> 35#include <linux/module.h>
36#include <linux/nfsd_idmap.h>
37#include <linux/seq_file.h> 36#include <linux/seq_file.h>
38#include <linux/sched.h> 37#include <linux/sched.h>
39#include <linux/slab.h> 38#include <linux/slab.h>
39#include "idmap.h"
40#include "nfsd.h"
40 41
41/* 42/*
42 * Cache entry 43 * Cache entry
@@ -514,7 +515,7 @@ rqst_authname(struct svc_rqst *rqstp)
514 return clp->name; 515 return clp->name;
515} 516}
516 517
517static int 518static __be32
518idmap_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen, 519idmap_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen,
519 uid_t *id) 520 uid_t *id)
520{ 521{
@@ -524,15 +525,15 @@ idmap_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen
524 int ret; 525 int ret;
525 526
526 if (namelen + 1 > sizeof(key.name)) 527 if (namelen + 1 > sizeof(key.name))
527 return -EINVAL; 528 return nfserr_badowner;
528 memcpy(key.name, name, namelen); 529 memcpy(key.name, name, namelen);
529 key.name[namelen] = '\0'; 530 key.name[namelen] = '\0';
530 strlcpy(key.authname, rqst_authname(rqstp), sizeof(key.authname)); 531 strlcpy(key.authname, rqst_authname(rqstp), sizeof(key.authname));
531 ret = idmap_lookup(rqstp, nametoid_lookup, &key, &nametoid_cache, &item); 532 ret = idmap_lookup(rqstp, nametoid_lookup, &key, &nametoid_cache, &item);
532 if (ret == -ENOENT) 533 if (ret == -ENOENT)
533 ret = -ESRCH; /* nfserr_badname */ 534 return nfserr_badowner;
534 if (ret) 535 if (ret)
535 return ret; 536 return nfserrno(ret);
536 *id = item->id; 537 *id = item->id;
537 cache_put(&item->h, &nametoid_cache); 538 cache_put(&item->h, &nametoid_cache);
538 return 0; 539 return 0;
@@ -560,14 +561,14 @@ idmap_id_to_name(struct svc_rqst *rqstp, int type, uid_t id, char *name)
560 return ret; 561 return ret;
561} 562}
562 563
563int 564__be32
564nfsd_map_name_to_uid(struct svc_rqst *rqstp, const char *name, size_t namelen, 565nfsd_map_name_to_uid(struct svc_rqst *rqstp, const char *name, size_t namelen,
565 __u32 *id) 566 __u32 *id)
566{ 567{
567 return idmap_name_to_id(rqstp, IDMAP_TYPE_USER, name, namelen, id); 568 return idmap_name_to_id(rqstp, IDMAP_TYPE_USER, name, namelen, id);
568} 569}
569 570
570int 571__be32
571nfsd_map_name_to_gid(struct svc_rqst *rqstp, const char *name, size_t namelen, 572nfsd_map_name_to_gid(struct svc_rqst *rqstp, const char *name, size_t namelen,
572 __u32 *id) 573 __u32 *id)
573{ 574{
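The signature changes above move the idmap entry points from host-order errno returns (int) to wire-order NFS status returns (__be32), so the XDR code can propagate the value directly. The conversion helper used in the hunk behaves like this sketch:

	__be32 status = nfserrno(-ENOENT);
	/* status now holds the big-endian on-the-wire status word */

nfserrno() is the existing nfsd helper mapping a negative errno to the nearest NFS status code; errnos it does not recognize fall back to nfserr_io with a warning.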
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 0cdfd022bb7b..db52546143d1 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -604,9 +604,7 @@ nfsd4_link(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
604 return status; 604 return status;
605} 605}
606 606
607static __be32 607static __be32 nfsd4_do_lookupp(struct svc_rqst *rqstp, struct svc_fh *fh)
608nfsd4_lookupp(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
609 void *arg)
610{ 608{
611 struct svc_fh tmp_fh; 609 struct svc_fh tmp_fh;
612 __be32 ret; 610 __be32 ret;
@@ -615,13 +613,19 @@ nfsd4_lookupp(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
615 ret = exp_pseudoroot(rqstp, &tmp_fh); 613 ret = exp_pseudoroot(rqstp, &tmp_fh);
616 if (ret) 614 if (ret)
617 return ret; 615 return ret;
618 if (tmp_fh.fh_dentry == cstate->current_fh.fh_dentry) { 616 if (tmp_fh.fh_dentry == fh->fh_dentry) {
619 fh_put(&tmp_fh); 617 fh_put(&tmp_fh);
620 return nfserr_noent; 618 return nfserr_noent;
621 } 619 }
622 fh_put(&tmp_fh); 620 fh_put(&tmp_fh);
623 return nfsd_lookup(rqstp, &cstate->current_fh, 621 return nfsd_lookup(rqstp, fh, "..", 2, fh);
624 "..", 2, &cstate->current_fh); 622}
623
624static __be32
625nfsd4_lookupp(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
626 void *arg)
627{
628 return nfsd4_do_lookupp(rqstp, &cstate->current_fh);
625} 629}
626 630
627static __be32 631static __be32
@@ -769,10 +773,36 @@ nfsd4_secinfo(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
769 } else 773 } else
770 secinfo->si_exp = exp; 774 secinfo->si_exp = exp;
771 dput(dentry); 775 dput(dentry);
776 if (cstate->minorversion)
777 /* See rfc 5661 section 2.6.3.1.1.8 */
778 fh_put(&cstate->current_fh);
772 return err; 779 return err;
773} 780}
774 781
775static __be32 782static __be32
783nfsd4_secinfo_no_name(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
784 struct nfsd4_secinfo_no_name *sin)
785{
786 __be32 err;
787
788 switch (sin->sin_style) {
789 case NFS4_SECINFO_STYLE4_CURRENT_FH:
790 break;
791 case NFS4_SECINFO_STYLE4_PARENT:
792 err = nfsd4_do_lookupp(rqstp, &cstate->current_fh);
793 if (err)
794 return err;
795 break;
796 default:
797 return nfserr_inval;
798 }
799 exp_get(cstate->current_fh.fh_export);
800 sin->sin_exp = cstate->current_fh.fh_export;
801 fh_put(&cstate->current_fh);
802 return nfs_ok;
803}
804
805static __be32
776nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, 806nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
777 struct nfsd4_setattr *setattr) 807 struct nfsd4_setattr *setattr)
778{ 808{
@@ -974,8 +1004,8 @@ static const char *nfsd4_op_name(unsigned opnum);
974 * Also note, enforced elsewhere: 1004 * Also note, enforced elsewhere:
975 * - SEQUENCE other than as first op results in 1005 * - SEQUENCE other than as first op results in
976 * NFS4ERR_SEQUENCE_POS. (Enforced in nfsd4_sequence().) 1006 * NFS4ERR_SEQUENCE_POS. (Enforced in nfsd4_sequence().)
977 * - BIND_CONN_TO_SESSION must be the only op in its compound 1007 * - BIND_CONN_TO_SESSION must be the only op in its compound.
978 * (Will be enforced in nfsd4_bind_conn_to_session().) 1008 * (Enforced in nfsd4_bind_conn_to_session().)
979 * - DESTROY_SESSION must be the final operation in a compound, if 1009 * - DESTROY_SESSION must be the final operation in a compound, if
980 * sessionid's in SEQUENCE and DESTROY_SESSION are the same. 1010 * sessionid's in SEQUENCE and DESTROY_SESSION are the same.
981 * (Enforced in nfsd4_destroy_session().) 1011 * (Enforced in nfsd4_destroy_session().)
@@ -1126,10 +1156,6 @@ encode_op:
1126 1156
1127 nfsd4_increment_op_stats(op->opnum); 1157 nfsd4_increment_op_stats(op->opnum);
1128 } 1158 }
1129 if (!rqstp->rq_usedeferral && status == nfserr_dropit) {
1130 dprintk("%s Dropit - send NFS4ERR_DELAY\n", __func__);
1131 status = nfserr_jukebox;
1132 }
1133 1159
1134 resp->cstate.status = status; 1160 resp->cstate.status = status;
1135 fh_put(&resp->cstate.current_fh); 1161 fh_put(&resp->cstate.current_fh);
@@ -1300,6 +1326,11 @@ static struct nfsd4_operation nfsd4_ops[] = {
1300 .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP, 1326 .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP,
1301 .op_name = "OP_EXCHANGE_ID", 1327 .op_name = "OP_EXCHANGE_ID",
1302 }, 1328 },
1329 [OP_BIND_CONN_TO_SESSION] = {
1330 .op_func = (nfsd4op_func)nfsd4_bind_conn_to_session,
1331 .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP,
1332 .op_name = "OP_BIND_CONN_TO_SESSION",
1333 },
1303 [OP_CREATE_SESSION] = { 1334 [OP_CREATE_SESSION] = {
1304 .op_func = (nfsd4op_func)nfsd4_create_session, 1335 .op_func = (nfsd4op_func)nfsd4_create_session,
1305 .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP, 1336 .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP,
@@ -1320,6 +1351,10 @@ static struct nfsd4_operation nfsd4_ops[] = {
1320 .op_flags = ALLOWED_WITHOUT_FH, 1351 .op_flags = ALLOWED_WITHOUT_FH,
1321 .op_name = "OP_RECLAIM_COMPLETE", 1352 .op_name = "OP_RECLAIM_COMPLETE",
1322 }, 1353 },
1354 [OP_SECINFO_NO_NAME] = {
1355 .op_func = (nfsd4op_func)nfsd4_secinfo_no_name,
1356 .op_name = "OP_SECINFO_NO_NAME",
1357 },
1323}; 1358};
1324 1359
1325static const char *nfsd4_op_name(unsigned opnum) 1360static const char *nfsd4_op_name(unsigned opnum)
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 7e26caab2a26..ffb59ef6f82f 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -302,7 +302,6 @@ purge_old(struct dentry *parent, struct dentry *child)
302{ 302{
303 int status; 303 int status;
304 304
305 /* note: we currently use this path only for minorversion 0 */
306 if (nfs4_has_reclaimed_state(child->d_name.name, false)) 305 if (nfs4_has_reclaimed_state(child->d_name.name, false))
307 return 0; 306 return 0;
308 307
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index fbd18c3074bb..d98d0213285d 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -230,7 +230,8 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_f
230 dp->dl_client = clp; 230 dp->dl_client = clp;
231 get_nfs4_file(fp); 231 get_nfs4_file(fp);
232 dp->dl_file = fp; 232 dp->dl_file = fp;
233 nfs4_file_get_access(fp, O_RDONLY); 233 dp->dl_vfs_file = find_readable_file(fp);
234 get_file(dp->dl_vfs_file);
234 dp->dl_flock = NULL; 235 dp->dl_flock = NULL;
235 dp->dl_type = type; 236 dp->dl_type = type;
236 dp->dl_stateid.si_boot = boot_time; 237 dp->dl_stateid.si_boot = boot_time;
@@ -252,6 +253,7 @@ nfs4_put_delegation(struct nfs4_delegation *dp)
252 if (atomic_dec_and_test(&dp->dl_count)) { 253 if (atomic_dec_and_test(&dp->dl_count)) {
253 dprintk("NFSD: freeing dp %p\n",dp); 254 dprintk("NFSD: freeing dp %p\n",dp);
254 put_nfs4_file(dp->dl_file); 255 put_nfs4_file(dp->dl_file);
256 fput(dp->dl_vfs_file);
255 kmem_cache_free(deleg_slab, dp); 257 kmem_cache_free(deleg_slab, dp);
256 num_delegations--; 258 num_delegations--;
257 } 259 }
@@ -265,12 +267,10 @@ nfs4_put_delegation(struct nfs4_delegation *dp)
265static void 267static void
266nfs4_close_delegation(struct nfs4_delegation *dp) 268nfs4_close_delegation(struct nfs4_delegation *dp)
267{ 269{
268 struct file *filp = find_readable_file(dp->dl_file);
269
270 dprintk("NFSD: close_delegation dp %p\n",dp); 270 dprintk("NFSD: close_delegation dp %p\n",dp);
271 /* XXX: do we even need this check?: */
271 if (dp->dl_flock) 272 if (dp->dl_flock)
272 vfs_setlease(filp, F_UNLCK, &dp->dl_flock); 273 vfs_setlease(dp->dl_vfs_file, F_UNLCK, &dp->dl_flock);
273 nfs4_file_put_access(dp->dl_file, O_RDONLY);
274} 274}
275 275
276/* Called under the state lock. */ 276/* Called under the state lock. */
@@ -642,6 +642,7 @@ static void nfsd4_conn_lost(struct svc_xpt_user *u)
642 free_conn(c); 642 free_conn(c);
643 } 643 }
644 spin_unlock(&clp->cl_lock); 644 spin_unlock(&clp->cl_lock);
645 nfsd4_probe_callback(clp);
645} 646}
646 647
647static struct nfsd4_conn *alloc_conn(struct svc_rqst *rqstp, u32 flags) 648static struct nfsd4_conn *alloc_conn(struct svc_rqst *rqstp, u32 flags)
@@ -679,15 +680,12 @@ static int nfsd4_register_conn(struct nfsd4_conn *conn)
679 return register_xpt_user(conn->cn_xprt, &conn->cn_xpt_user); 680 return register_xpt_user(conn->cn_xprt, &conn->cn_xpt_user);
680} 681}
681 682
682static __be32 nfsd4_new_conn(struct svc_rqst *rqstp, struct nfsd4_session *ses) 683static __be32 nfsd4_new_conn(struct svc_rqst *rqstp, struct nfsd4_session *ses, u32 dir)
683{ 684{
684 struct nfsd4_conn *conn; 685 struct nfsd4_conn *conn;
685 u32 flags = NFS4_CDFC4_FORE;
686 int ret; 686 int ret;
687 687
688 if (ses->se_flags & SESSION4_BACK_CHAN) 688 conn = alloc_conn(rqstp, dir);
689 flags |= NFS4_CDFC4_BACK;
690 conn = alloc_conn(rqstp, flags);
691 if (!conn) 689 if (!conn)
692 return nfserr_jukebox; 690 return nfserr_jukebox;
693 nfsd4_hash_conn(conn, ses); 691 nfsd4_hash_conn(conn, ses);
@@ -698,6 +696,17 @@ static __be32 nfsd4_new_conn(struct svc_rqst *rqstp, struct nfsd4_session *ses)
698 return nfs_ok; 696 return nfs_ok;
699} 697}
700 698
699static __be32 nfsd4_new_conn_from_crses(struct svc_rqst *rqstp, struct nfsd4_session *ses)
700{
701 u32 dir = NFS4_CDFC4_FORE;
702
703 if (ses->se_flags & SESSION4_BACK_CHAN)
704 dir |= NFS4_CDFC4_BACK;
705
706 return nfsd4_new_conn(rqstp, ses, dir);
707}
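nfsd4_new_conn_from_crses() exists because CREATE_SESSION expresses the backchannel as a session flag, while BIND_CONN_TO_SESSION (added later in this patch) passes an explicit direction word. Assuming the usual nfs4.h flag values (NFS4_CDFC4_FORE = 0x1, NFS4_CDFC4_BACK = 0x2), a session created with SESSION4_BACK_CHAN registers its connection for both directions:

	/* equivalent to the helper above for a back-channel session */
	nfsd4_new_conn(rqstp, ses, NFS4_CDFC4_FORE | NFS4_CDFC4_BACK);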
708
709/* must be called under client_lock */
701static void nfsd4_del_conns(struct nfsd4_session *s) 710static void nfsd4_del_conns(struct nfsd4_session *s)
702{ 711{
703 struct nfs4_client *clp = s->se_client; 712 struct nfs4_client *clp = s->se_client;
@@ -749,6 +758,8 @@ static struct nfsd4_session *alloc_init_session(struct svc_rqst *rqstp, struct n
749 */ 758 */
750 slotsize = nfsd4_sanitize_slot_size(fchan->maxresp_cached); 759 slotsize = nfsd4_sanitize_slot_size(fchan->maxresp_cached);
751 numslots = nfsd4_get_drc_mem(slotsize, fchan->maxreqs); 760 numslots = nfsd4_get_drc_mem(slotsize, fchan->maxreqs);
761 if (numslots < 1)
762 return NULL;
752 763
753 new = alloc_session(slotsize, numslots); 764 new = alloc_session(slotsize, numslots);
754 if (!new) { 765 if (!new) {
@@ -769,25 +780,30 @@ static struct nfsd4_session *alloc_init_session(struct svc_rqst *rqstp, struct n
769 idx = hash_sessionid(&new->se_sessionid); 780 idx = hash_sessionid(&new->se_sessionid);
770 spin_lock(&client_lock); 781 spin_lock(&client_lock);
771 list_add(&new->se_hash, &sessionid_hashtbl[idx]); 782 list_add(&new->se_hash, &sessionid_hashtbl[idx]);
783 spin_lock(&clp->cl_lock);
772 list_add(&new->se_perclnt, &clp->cl_sessions); 784 list_add(&new->se_perclnt, &clp->cl_sessions);
785 spin_unlock(&clp->cl_lock);
773 spin_unlock(&client_lock); 786 spin_unlock(&client_lock);
774 787
775 status = nfsd4_new_conn(rqstp, new); 788 status = nfsd4_new_conn_from_crses(rqstp, new);
776 /* whoops: benny points out, status is ignored! (err, or bogus) */ 789 /* whoops: benny points out, status is ignored! (err, or bogus) */
777 if (status) { 790 if (status) {
778 free_session(&new->se_ref); 791 free_session(&new->se_ref);
779 return NULL; 792 return NULL;
780 } 793 }
781 if (!clp->cl_cb_session && (cses->flags & SESSION4_BACK_CHAN)) { 794 if (cses->flags & SESSION4_BACK_CHAN) {
782 struct sockaddr *sa = svc_addr(rqstp); 795 struct sockaddr *sa = svc_addr(rqstp);
783 796 /*
784 clp->cl_cb_session = new; 797 * This is a little silly; with sessions there's no real
785 clp->cl_cb_conn.cb_xprt = rqstp->rq_xprt; 798 * use for the callback address. Use the peer address
786 svc_xprt_get(rqstp->rq_xprt); 799 * as a reasonable default for now, but consider fixing
800 * the rpc client not to require an address in the
801 * future:
802 */
787 rpc_copy_addr((struct sockaddr *)&clp->cl_cb_conn.cb_addr, sa); 803 rpc_copy_addr((struct sockaddr *)&clp->cl_cb_conn.cb_addr, sa);
788 clp->cl_cb_conn.cb_addrlen = svc_addr_len(sa); 804 clp->cl_cb_conn.cb_addrlen = svc_addr_len(sa);
789 nfsd4_probe_callback(clp);
790 } 805 }
806 nfsd4_probe_callback(clp);
791 return new; 807 return new;
792} 808}
793 809
@@ -817,7 +833,9 @@ static void
817unhash_session(struct nfsd4_session *ses) 833unhash_session(struct nfsd4_session *ses)
818{ 834{
819 list_del(&ses->se_hash); 835 list_del(&ses->se_hash);
836 spin_lock(&ses->se_client->cl_lock);
820 list_del(&ses->se_perclnt); 837 list_del(&ses->se_perclnt);
838 spin_unlock(&ses->se_client->cl_lock);
821} 839}
822 840
823/* must be called under the client_lock */ 841/* must be called under the client_lock */
@@ -923,8 +941,10 @@ unhash_client_locked(struct nfs4_client *clp)
923 941
924 mark_client_expired(clp); 942 mark_client_expired(clp);
925 list_del(&clp->cl_lru); 943 list_del(&clp->cl_lru);
944 spin_lock(&clp->cl_lock);
926 list_for_each_entry(ses, &clp->cl_sessions, se_perclnt) 945 list_for_each_entry(ses, &clp->cl_sessions, se_perclnt)
927 list_del_init(&ses->se_hash); 946 list_del_init(&ses->se_hash);
947 spin_unlock(&clp->cl_lock);
928} 948}
929 949
930static void 950static void
@@ -1051,12 +1071,13 @@ static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir,
1051 1071
1052 memcpy(clp->cl_recdir, recdir, HEXDIR_LEN); 1072 memcpy(clp->cl_recdir, recdir, HEXDIR_LEN);
1053 atomic_set(&clp->cl_refcount, 0); 1073 atomic_set(&clp->cl_refcount, 0);
1054 atomic_set(&clp->cl_cb_set, 0); 1074 clp->cl_cb_state = NFSD4_CB_UNKNOWN;
1055 INIT_LIST_HEAD(&clp->cl_idhash); 1075 INIT_LIST_HEAD(&clp->cl_idhash);
1056 INIT_LIST_HEAD(&clp->cl_strhash); 1076 INIT_LIST_HEAD(&clp->cl_strhash);
1057 INIT_LIST_HEAD(&clp->cl_openowners); 1077 INIT_LIST_HEAD(&clp->cl_openowners);
1058 INIT_LIST_HEAD(&clp->cl_delegations); 1078 INIT_LIST_HEAD(&clp->cl_delegations);
1059 INIT_LIST_HEAD(&clp->cl_lru); 1079 INIT_LIST_HEAD(&clp->cl_lru);
1080 INIT_LIST_HEAD(&clp->cl_callbacks);
1060 spin_lock_init(&clp->cl_lock); 1081 spin_lock_init(&clp->cl_lock);
1061 INIT_WORK(&clp->cl_cb_null.cb_work, nfsd4_do_callback_rpc); 1082 INIT_WORK(&clp->cl_cb_null.cb_work, nfsd4_do_callback_rpc);
1062 clp->cl_time = get_seconds(); 1083 clp->cl_time = get_seconds();
@@ -1132,54 +1153,55 @@ find_unconfirmed_client(clientid_t *clid)
1132 return NULL; 1153 return NULL;
1133} 1154}
1134 1155
1135/*
1136 * Return 1 iff clp's clientid establishment method matches the use_exchange_id
1137 * parameter. Matching is based on the fact that at least one of the
1138 * EXCHGID4_FLAG_USE_{NON_PNFS,PNFS_MDS,PNFS_DS} flags must be set for v4.1
1139 *
1140 * FIXME: we need to unify the clientid namespaces for nfsv4.x
1141 * and correctly deal with client upgrade/downgrade in EXCHANGE_ID
1142 * and SET_CLIENTID{,_CONFIRM}
1143 */
1144static inline int
1145match_clientid_establishment(struct nfs4_client *clp, bool use_exchange_id)
1146{
1147	bool has_exchange_flags = (clp->cl_exchange_flags != 0);
1148	return use_exchange_id == has_exchange_flags;
1149}

1156static bool clp_used_exchangeid(struct nfs4_client *clp)
1157{
1158	return clp->cl_exchange_flags != 0;
1159}
1150 1160
1151static struct nfs4_client * 1161static struct nfs4_client *
1152find_confirmed_client_by_str(const char *dname, unsigned int hashval, 1162find_confirmed_client_by_str(const char *dname, unsigned int hashval)
1153 bool use_exchange_id)
1154{ 1163{
1155 struct nfs4_client *clp; 1164 struct nfs4_client *clp;
1156 1165
1157 list_for_each_entry(clp, &conf_str_hashtbl[hashval], cl_strhash) { 1166 list_for_each_entry(clp, &conf_str_hashtbl[hashval], cl_strhash) {
1158 if (same_name(clp->cl_recdir, dname) && 1167 if (same_name(clp->cl_recdir, dname))
1159 match_clientid_establishment(clp, use_exchange_id))
1160 return clp; 1168 return clp;
1161 } 1169 }
1162 return NULL; 1170 return NULL;
1163} 1171}
1164 1172
1165static struct nfs4_client * 1173static struct nfs4_client *
1166find_unconfirmed_client_by_str(const char *dname, unsigned int hashval, 1174find_unconfirmed_client_by_str(const char *dname, unsigned int hashval)
1167 bool use_exchange_id)
1168{ 1175{
1169 struct nfs4_client *clp; 1176 struct nfs4_client *clp;
1170 1177
1171 list_for_each_entry(clp, &unconf_str_hashtbl[hashval], cl_strhash) { 1178 list_for_each_entry(clp, &unconf_str_hashtbl[hashval], cl_strhash) {
1172 if (same_name(clp->cl_recdir, dname) && 1179 if (same_name(clp->cl_recdir, dname))
1173 match_clientid_establishment(clp, use_exchange_id))
1174 return clp; 1180 return clp;
1175 } 1181 }
1176 return NULL; 1182 return NULL;
1177} 1183}
1178 1184
1185static void rpc_svcaddr2sockaddr(struct sockaddr *sa, unsigned short family, union svc_addr_u *svcaddr)
1186{
1187 switch (family) {
1188 case AF_INET:
1189 ((struct sockaddr_in *)sa)->sin_family = AF_INET;
1190 ((struct sockaddr_in *)sa)->sin_addr = svcaddr->addr;
1191 return;
1192 case AF_INET6:
1193 ((struct sockaddr_in6 *)sa)->sin6_family = AF_INET6;
1194 ((struct sockaddr_in6 *)sa)->sin6_addr = svcaddr->addr6;
1195 return;
1196 }
1197}
1198
1179static void 1199static void
1180gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se, u32 scopeid) 1200gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se, struct svc_rqst *rqstp)
1181{ 1201{
1182 struct nfs4_cb_conn *conn = &clp->cl_cb_conn; 1202 struct nfs4_cb_conn *conn = &clp->cl_cb_conn;
1203 struct sockaddr *sa = svc_addr(rqstp);
1204 u32 scopeid = rpc_get_scope_id(sa);
1183 unsigned short expected_family; 1205 unsigned short expected_family;
1184 1206
1185 /* Currently, we only support tcp and tcp6 for the callback channel */ 1207 /* Currently, we only support tcp and tcp6 for the callback channel */
@@ -1205,6 +1227,7 @@ gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se, u32 scopeid)
1205 1227
1206 conn->cb_prog = se->se_callback_prog; 1228 conn->cb_prog = se->se_callback_prog;
1207 conn->cb_ident = se->se_callback_ident; 1229 conn->cb_ident = se->se_callback_ident;
1230 rpc_svcaddr2sockaddr((struct sockaddr *)&conn->cb_saddr, expected_family, &rqstp->rq_daddr);
1208 return; 1231 return;
1209out_err: 1232out_err:
1210 conn->cb_addr.ss_family = AF_UNSPEC; 1233 conn->cb_addr.ss_family = AF_UNSPEC;
@@ -1344,7 +1367,7 @@ nfsd4_exchange_id(struct svc_rqst *rqstp,
1344 case SP4_NONE: 1367 case SP4_NONE:
1345 break; 1368 break;
1346 case SP4_SSV: 1369 case SP4_SSV:
1347 return nfserr_encr_alg_unsupp; 1370 return nfserr_serverfault;
1348 default: 1371 default:
1349 BUG(); /* checked by xdr code */ 1372 BUG(); /* checked by xdr code */
1350 case SP4_MACH_CRED: 1373 case SP4_MACH_CRED:
@@ -1361,8 +1384,12 @@ nfsd4_exchange_id(struct svc_rqst *rqstp,
1361 nfs4_lock_state(); 1384 nfs4_lock_state();
1362 status = nfs_ok; 1385 status = nfs_ok;
1363 1386
1364 conf = find_confirmed_client_by_str(dname, strhashval, true); 1387 conf = find_confirmed_client_by_str(dname, strhashval);
1365 if (conf) { 1388 if (conf) {
1389 if (!clp_used_exchangeid(conf)) {
1390 status = nfserr_clid_inuse; /* XXX: ? */
1391 goto out;
1392 }
1366 if (!same_verf(&verf, &conf->cl_verifier)) { 1393 if (!same_verf(&verf, &conf->cl_verifier)) {
1367 /* 18.35.4 case 8 */ 1394 /* 18.35.4 case 8 */
1368 if (exid->flags & EXCHGID4_FLAG_UPD_CONFIRMED_REC_A) { 1395 if (exid->flags & EXCHGID4_FLAG_UPD_CONFIRMED_REC_A) {
@@ -1403,7 +1430,7 @@ nfsd4_exchange_id(struct svc_rqst *rqstp,
1403 goto out; 1430 goto out;
1404 } 1431 }
1405 1432
1406 unconf = find_unconfirmed_client_by_str(dname, strhashval, true); 1433 unconf = find_unconfirmed_client_by_str(dname, strhashval);
1407 if (unconf) { 1434 if (unconf) {
1408 /* 1435 /*
1409 * Possible retry or client restart. Per 18.35.4 case 4, 1436 * Possible retry or client restart. Per 18.35.4 case 4,
@@ -1560,6 +1587,8 @@ nfsd4_create_session(struct svc_rqst *rqstp,
1560 status = nfs_ok; 1587 status = nfs_ok;
1561 memcpy(cr_ses->sessionid.data, new->se_sessionid.data, 1588 memcpy(cr_ses->sessionid.data, new->se_sessionid.data,
1562 NFS4_MAX_SESSIONID_LEN); 1589 NFS4_MAX_SESSIONID_LEN);
1590 memcpy(&cr_ses->fore_channel, &new->se_fchannel,
1591 sizeof(struct nfsd4_channel_attrs));
1563 cs_slot->sl_seqid++; 1592 cs_slot->sl_seqid++;
1564 cr_ses->seqid = cs_slot->sl_seqid; 1593 cr_ses->seqid = cs_slot->sl_seqid;
1565 1594
@@ -1581,6 +1610,45 @@ static bool nfsd4_last_compound_op(struct svc_rqst *rqstp)
1581 return argp->opcnt == resp->opcnt; 1610 return argp->opcnt == resp->opcnt;
1582} 1611}
1583 1612
1613static __be32 nfsd4_map_bcts_dir(u32 *dir)
1614{
1615 switch (*dir) {
1616 case NFS4_CDFC4_FORE:
1617 case NFS4_CDFC4_BACK:
1618 return nfs_ok;
1619 case NFS4_CDFC4_FORE_OR_BOTH:
1620 case NFS4_CDFC4_BACK_OR_BOTH:
1621 *dir = NFS4_CDFC4_BOTH;
1622 return nfs_ok;
1623 };
1624 return nfserr_inval;
1625}
1626
1627__be32 nfsd4_bind_conn_to_session(struct svc_rqst *rqstp,
1628 struct nfsd4_compound_state *cstate,
1629 struct nfsd4_bind_conn_to_session *bcts)
1630{
1631 __be32 status;
1632
1633 if (!nfsd4_last_compound_op(rqstp))
1634 return nfserr_not_only_op;
1635 spin_lock(&client_lock);
1636 cstate->session = find_in_sessionid_hashtbl(&bcts->sessionid);
1637 /* Sorta weird: we only need the refcnt'ing because new_conn acquires
1638 * client_lock itself: */
1639 if (cstate->session) {
1640 nfsd4_get_session(cstate->session);
1641 atomic_inc(&cstate->session->se_client->cl_refcount);
1642 }
1643 spin_unlock(&client_lock);
1644 if (!cstate->session)
1645 return nfserr_badsession;
1646
1647 status = nfsd4_map_bcts_dir(&bcts->dir);
1648 nfsd4_new_conn(rqstp, cstate->session, bcts->dir);
1649 return nfs_ok;
1650}
1651
1584static bool nfsd4_compound_in_session(struct nfsd4_session *session, struct nfs4_sessionid *sid) 1652static bool nfsd4_compound_in_session(struct nfsd4_session *session, struct nfs4_sessionid *sid)
1585{ 1653{
1586 if (!session) 1654 if (!session)
@@ -1619,8 +1687,7 @@ nfsd4_destroy_session(struct svc_rqst *r,
1619 spin_unlock(&client_lock); 1687 spin_unlock(&client_lock);
1620 1688
1621 nfs4_lock_state(); 1689 nfs4_lock_state();
1622 /* wait for callbacks */ 1690 nfsd4_probe_callback_sync(ses->se_client);
1623 nfsd4_shutdown_callback(ses->se_client);
1624 nfs4_unlock_state(); 1691 nfs4_unlock_state();
1625 1692
1626 nfsd4_del_conns(ses); 1693 nfsd4_del_conns(ses);
@@ -1733,8 +1800,12 @@ nfsd4_sequence(struct svc_rqst *rqstp,
1733out: 1800out:
1734 /* Hold a session reference until done processing the compound. */ 1801 /* Hold a session reference until done processing the compound. */
1735 if (cstate->session) { 1802 if (cstate->session) {
1803 struct nfs4_client *clp = session->se_client;
1804
1736 nfsd4_get_session(cstate->session); 1805 nfsd4_get_session(cstate->session);
1737 atomic_inc(&session->se_client->cl_refcount); 1806 atomic_inc(&clp->cl_refcount);
1807 if (clp->cl_cb_state == NFSD4_CB_DOWN)
1808 seq->status_flags |= SEQ4_STATUS_CB_PATH_DOWN;
1738 } 1809 }
1739 kfree(conn); 1810 kfree(conn);
1740 spin_unlock(&client_lock); 1811 spin_unlock(&client_lock);
@@ -1775,7 +1846,6 @@ __be32
1775nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, 1846nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
1776 struct nfsd4_setclientid *setclid) 1847 struct nfsd4_setclientid *setclid)
1777{ 1848{
1778 struct sockaddr *sa = svc_addr(rqstp);
1779 struct xdr_netobj clname = { 1849 struct xdr_netobj clname = {
1780 .len = setclid->se_namelen, 1850 .len = setclid->se_namelen,
1781 .data = setclid->se_name, 1851 .data = setclid->se_name,
@@ -1801,10 +1871,12 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
1801 strhashval = clientstr_hashval(dname); 1871 strhashval = clientstr_hashval(dname);
1802 1872
1803 nfs4_lock_state(); 1873 nfs4_lock_state();
1804 conf = find_confirmed_client_by_str(dname, strhashval, false); 1874 conf = find_confirmed_client_by_str(dname, strhashval);
1805 if (conf) { 1875 if (conf) {
1806 /* RFC 3530 14.2.33 CASE 0: */ 1876 /* RFC 3530 14.2.33 CASE 0: */
1807 status = nfserr_clid_inuse; 1877 status = nfserr_clid_inuse;
1878 if (clp_used_exchangeid(conf))
1879 goto out;
1808 if (!same_creds(&conf->cl_cred, &rqstp->rq_cred)) { 1880 if (!same_creds(&conf->cl_cred, &rqstp->rq_cred)) {
1809 char addr_str[INET6_ADDRSTRLEN]; 1881 char addr_str[INET6_ADDRSTRLEN];
1810 rpc_ntop((struct sockaddr *) &conf->cl_addr, addr_str, 1882 rpc_ntop((struct sockaddr *) &conf->cl_addr, addr_str,
@@ -1819,7 +1891,7 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
1819 * has a description of SETCLIENTID request processing consisting 1891 * has a description of SETCLIENTID request processing consisting
1820 * of 5 bullet points, labeled as CASE0 - CASE4 below. 1892 * of 5 bullet points, labeled as CASE0 - CASE4 below.
1821 */ 1893 */
1822 unconf = find_unconfirmed_client_by_str(dname, strhashval, false); 1894 unconf = find_unconfirmed_client_by_str(dname, strhashval);
1823 status = nfserr_resource; 1895 status = nfserr_resource;
1824 if (!conf) { 1896 if (!conf) {
1825 /* 1897 /*
@@ -1876,7 +1948,7 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
1876 * for consistent minorversion use throughout: 1948 * for consistent minorversion use throughout:
1877 */ 1949 */
1878 new->cl_minorversion = 0; 1950 new->cl_minorversion = 0;
1879 gen_callback(new, setclid, rpc_get_scope_id(sa)); 1951 gen_callback(new, setclid, rqstp);
1880 add_to_unconfirmed(new, strhashval); 1952 add_to_unconfirmed(new, strhashval);
1881 setclid->se_clientid.cl_boot = new->cl_clientid.cl_boot; 1953 setclid->se_clientid.cl_boot = new->cl_clientid.cl_boot;
1882 setclid->se_clientid.cl_id = new->cl_clientid.cl_id; 1954 setclid->se_clientid.cl_id = new->cl_clientid.cl_id;
@@ -1935,7 +2007,6 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
1935 if (!same_creds(&conf->cl_cred, &unconf->cl_cred)) 2007 if (!same_creds(&conf->cl_cred, &unconf->cl_cred))
1936 status = nfserr_clid_inuse; 2008 status = nfserr_clid_inuse;
1937 else { 2009 else {
1938 atomic_set(&conf->cl_cb_set, 0);
1939 nfsd4_change_callback(conf, &unconf->cl_cb_conn); 2010 nfsd4_change_callback(conf, &unconf->cl_cb_conn);
1940 nfsd4_probe_callback(conf); 2011 nfsd4_probe_callback(conf);
1941 expire_client(unconf); 2012 expire_client(unconf);
@@ -1964,7 +2035,7 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
1964 unsigned int hash = 2035 unsigned int hash =
1965 clientstr_hashval(unconf->cl_recdir); 2036 clientstr_hashval(unconf->cl_recdir);
1966 conf = find_confirmed_client_by_str(unconf->cl_recdir, 2037 conf = find_confirmed_client_by_str(unconf->cl_recdir,
1967 hash, false); 2038 hash);
1968 if (conf) { 2039 if (conf) {
1969 nfsd4_remove_clid_dir(conf); 2040 nfsd4_remove_clid_dir(conf);
1970 expire_client(conf); 2041 expire_client(conf);
@@ -2300,41 +2371,6 @@ void nfsd_break_deleg_cb(struct file_lock *fl)
2300 nfsd4_cb_recall(dp); 2371 nfsd4_cb_recall(dp);
2301} 2372}
2302 2373
2303/*
2304 * The file_lock is being reaped.
2305 *
2306 * Called by locks_free_lock() with lock_flocks() held.
2307 */
2308static
2309void nfsd_release_deleg_cb(struct file_lock *fl)
2310{
2311 struct nfs4_delegation *dp = (struct nfs4_delegation *)fl->fl_owner;
2312
2313 dprintk("NFSD nfsd_release_deleg_cb: fl %p dp %p dl_count %d\n", fl,dp, atomic_read(&dp->dl_count));
2314
2315 if (!(fl->fl_flags & FL_LEASE) || !dp)
2316 return;
2317 dp->dl_flock = NULL;
2318}
2319
2320/*
2321 * Called from setlease() with lock_flocks() held
2322 */
2323static
2324int nfsd_same_client_deleg_cb(struct file_lock *onlist, struct file_lock *try)
2325{
2326 struct nfs4_delegation *onlistd =
2327 (struct nfs4_delegation *)onlist->fl_owner;
2328 struct nfs4_delegation *tryd =
2329 (struct nfs4_delegation *)try->fl_owner;
2330
2331 if (onlist->fl_lmops != try->fl_lmops)
2332 return 0;
2333
2334 return onlistd->dl_client == tryd->dl_client;
2335}
2336
2337
2338static 2374static
2339int nfsd_change_deleg_cb(struct file_lock **onlist, int arg) 2375int nfsd_change_deleg_cb(struct file_lock **onlist, int arg)
2340{ 2376{
@@ -2346,8 +2382,6 @@ int nfsd_change_deleg_cb(struct file_lock **onlist, int arg)
2346 2382
2347static const struct lock_manager_operations nfsd_lease_mng_ops = { 2383static const struct lock_manager_operations nfsd_lease_mng_ops = {
2348 .fl_break = nfsd_break_deleg_cb, 2384 .fl_break = nfsd_break_deleg_cb,
2349 .fl_release_private = nfsd_release_deleg_cb,
2350 .fl_mylease = nfsd_same_client_deleg_cb,
2351 .fl_change = nfsd_change_deleg_cb, 2385 .fl_change = nfsd_change_deleg_cb,
2352}; 2386};
2353 2387
@@ -2514,8 +2548,6 @@ static __be32 nfs4_get_vfs_file(struct svc_rqst *rqstp, struct nfs4_file
2514 if (!fp->fi_fds[oflag]) { 2548 if (!fp->fi_fds[oflag]) {
2515 status = nfsd_open(rqstp, cur_fh, S_IFREG, access, 2549 status = nfsd_open(rqstp, cur_fh, S_IFREG, access,
2516 &fp->fi_fds[oflag]); 2550 &fp->fi_fds[oflag]);
2517 if (status == nfserr_dropit)
2518 status = nfserr_jukebox;
2519 if (status) 2551 if (status)
2520 return status; 2552 return status;
2521 } 2553 }
@@ -2596,6 +2628,19 @@ nfs4_set_claim_prev(struct nfsd4_open *open)
2596 open->op_stateowner->so_client->cl_firststate = 1; 2628 open->op_stateowner->so_client->cl_firststate = 1;
2597} 2629}
2598 2630
2631/* Should we give out recallable state?: */
2632static bool nfsd4_cb_channel_good(struct nfs4_client *clp)
2633{
2634 if (clp->cl_cb_state == NFSD4_CB_UP)
2635 return true;
2636 /*
2637 * In the sessions case, since we don't have to establish a
2638 * separate connection for callbacks, we assume it's OK
2639 * until we hear otherwise:
2640 */
2641 return clp->cl_minorversion && clp->cl_cb_state == NFSD4_CB_UNKNOWN;
2642}
2643
2599/* 2644/*
2600 * Attempt to hand out a delegation. 2645 * Attempt to hand out a delegation.
2601 */ 2646 */
@@ -2604,10 +2649,11 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta
2604{ 2649{
2605 struct nfs4_delegation *dp; 2650 struct nfs4_delegation *dp;
2606 struct nfs4_stateowner *sop = stp->st_stateowner; 2651 struct nfs4_stateowner *sop = stp->st_stateowner;
2607 int cb_up = atomic_read(&sop->so_client->cl_cb_set); 2652 int cb_up;
2608 struct file_lock *fl; 2653 struct file_lock *fl;
2609 int status, flag = 0; 2654 int status, flag = 0;
2610 2655
2656 cb_up = nfsd4_cb_channel_good(sop->so_client);
2611 flag = NFS4_OPEN_DELEGATE_NONE; 2657 flag = NFS4_OPEN_DELEGATE_NONE;
2612 open->op_recall = 0; 2658 open->op_recall = 0;
2613 switch (open->op_claim_type) { 2659 switch (open->op_claim_type) {
@@ -2655,7 +2701,7 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta
2655 dp->dl_flock = fl; 2701 dp->dl_flock = fl;
2656 2702
2657 /* vfs_setlease checks to see if delegation should be handed out. 2703 /* vfs_setlease checks to see if delegation should be handed out.
2658 * the lock_manager callbacks fl_mylease and fl_change are used 2704 * the lock_manager callback fl_change is used
2659 */ 2705 */
2660 if ((status = vfs_setlease(fl->fl_file, fl->fl_type, &fl))) { 2706 if ((status = vfs_setlease(fl->fl_file, fl->fl_type, &fl))) {
2661 dprintk("NFSD: setlease failed [%d], no delegation\n", status); 2707 dprintk("NFSD: setlease failed [%d], no delegation\n", status);
@@ -2794,7 +2840,7 @@ nfsd4_renew(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
2794 renew_client(clp); 2840 renew_client(clp);
2795 status = nfserr_cb_path_down; 2841 status = nfserr_cb_path_down;
2796 if (!list_empty(&clp->cl_delegations) 2842 if (!list_empty(&clp->cl_delegations)
2797 && !atomic_read(&clp->cl_cb_set)) 2843 && clp->cl_cb_state != NFSD4_CB_UP)
2798 goto out; 2844 goto out;
2799 status = nfs_ok; 2845 status = nfs_ok;
2800out: 2846out:
@@ -3081,9 +3127,10 @@ nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate,
3081 if (status) 3127 if (status)
3082 goto out; 3128 goto out;
3083 renew_client(dp->dl_client); 3129 renew_client(dp->dl_client);
3084 if (filpp) 3130 if (filpp) {
3085 *filpp = find_readable_file(dp->dl_file); 3131 *filpp = find_readable_file(dp->dl_file);
3086 BUG_ON(!*filpp); 3132 BUG_ON(!*filpp);
3133 }
3087 } else { /* open or lock stateid */ 3134 } else { /* open or lock stateid */
3088 stp = find_stateid(stateid, flags); 3135 stp = find_stateid(stateid, flags);
3089 if (!stp) 3136 if (!stp)
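The added filpp check lets validation-only callers skip the file lookup entirely. A hypothetical call-site sketch (do_io is a placeholder, not a real kernel variable):

        struct file *filp = NULL;
        __be32 status;

        /* Only ask for the struct file when the operation will do I/O;
         * a caller that merely validates the stateid passes NULL. */
        status = nfs4_preprocess_stateid_op(cstate, stateid, flags,
                                            do_io ? &filp : NULL);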
@@ -4107,7 +4154,7 @@ nfs4_has_reclaimed_state(const char *name, bool use_exchange_id)
4107 unsigned int strhashval = clientstr_hashval(name); 4154 unsigned int strhashval = clientstr_hashval(name);
4108 struct nfs4_client *clp; 4155 struct nfs4_client *clp;
4109 4156
4110 clp = find_confirmed_client_by_str(name, strhashval, use_exchange_id); 4157 clp = find_confirmed_client_by_str(name, strhashval);
4111 return clp ? 1 : 0; 4158 return clp ? 1 : 0;
4112} 4159}
4113 4160
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index f35a94a04026..956629b9cdc9 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -44,13 +44,14 @@
44#include <linux/namei.h> 44#include <linux/namei.h>
45#include <linux/statfs.h> 45#include <linux/statfs.h>
46#include <linux/utsname.h> 46#include <linux/utsname.h>
47#include <linux/nfsd_idmap.h>
48#include <linux/nfs4_acl.h>
49#include <linux/sunrpc/svcauth_gss.h> 47#include <linux/sunrpc/svcauth_gss.h>
50 48
49#include "idmap.h"
50#include "acl.h"
51#include "xdr4.h" 51#include "xdr4.h"
52#include "vfs.h" 52#include "vfs.h"
53 53
54
54#define NFSDDBG_FACILITY NFSDDBG_XDR 55#define NFSDDBG_FACILITY NFSDDBG_XDR
55 56
56/* 57/*
@@ -288,17 +289,17 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval,
288 len += XDR_QUADLEN(dummy32) << 2; 289 len += XDR_QUADLEN(dummy32) << 2;
289 READMEM(buf, dummy32); 290 READMEM(buf, dummy32);
290 ace->whotype = nfs4_acl_get_whotype(buf, dummy32); 291 ace->whotype = nfs4_acl_get_whotype(buf, dummy32);
291 host_err = 0; 292 status = nfs_ok;
292 if (ace->whotype != NFS4_ACL_WHO_NAMED) 293 if (ace->whotype != NFS4_ACL_WHO_NAMED)
293 ace->who = 0; 294 ace->who = 0;
294 else if (ace->flag & NFS4_ACE_IDENTIFIER_GROUP) 295 else if (ace->flag & NFS4_ACE_IDENTIFIER_GROUP)
295 host_err = nfsd_map_name_to_gid(argp->rqstp, 296 status = nfsd_map_name_to_gid(argp->rqstp,
296 buf, dummy32, &ace->who); 297 buf, dummy32, &ace->who);
297 else 298 else
298 host_err = nfsd_map_name_to_uid(argp->rqstp, 299 status = nfsd_map_name_to_uid(argp->rqstp,
299 buf, dummy32, &ace->who); 300 buf, dummy32, &ace->who);
300 if (host_err) 301 if (status)
301 goto out_nfserr; 302 return status;
302 } 303 }
303 } else 304 } else
304 *acl = NULL; 305 *acl = NULL;
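With this hunk the idmap helpers return an NFS status (__be32) rather than a host errno, so a failed name lookup no longer detours through out_nfserr. A sketch of the new contract, relying on the nfserr_badowner code added to fs/nfsd/nfsd.h below:

        /* Sketch: a failed owner lookup now yields an NFS error directly. */
        status = nfsd_map_name_to_uid(argp->rqstp, buf, dummy32, &ace->who);
        if (status)
                return status;  /* already an XDR-ready __be32 NFS error */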
@@ -420,6 +421,21 @@ nfsd4_decode_access(struct nfsd4_compoundargs *argp, struct nfsd4_access *access
420 DECODE_TAIL; 421 DECODE_TAIL;
421} 422}
422 423
424static __be32 nfsd4_decode_bind_conn_to_session(struct nfsd4_compoundargs *argp, struct nfsd4_bind_conn_to_session *bcts)
425{
426 DECODE_HEAD;
427 u32 dummy;
428
429 READ_BUF(NFS4_MAX_SESSIONID_LEN + 8);
430 COPYMEM(bcts->sessionid.data, NFS4_MAX_SESSIONID_LEN);
431 READ32(bcts->dir);
432 /* XXX: Perhaps Tom Tucker could help us figure out how we
433 * should be using ctsa_use_conn_in_rdma_mode: */
434 READ32(dummy);
435
436 DECODE_TAIL;
437}
438
423static __be32 439static __be32
424nfsd4_decode_close(struct nfsd4_compoundargs *argp, struct nfsd4_close *close) 440nfsd4_decode_close(struct nfsd4_compoundargs *argp, struct nfsd4_close *close)
425{ 441{
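For reference, the fixed-size argument block consumed by the decoder above, per the BIND_CONN_TO_SESSION definition in RFC 5661 (a sketch of the layout, which is why READ_BUF reserves NFS4_MAX_SESSIONID_LEN + 8 bytes in one go):

        /*
         * opaque   bctsa_sessid[NFS4_MAX_SESSIONID_LEN];   16 bytes
         * uint32_t bctsa_dir;                  requested channel direction
         * bool     bctsa_use_conn_in_rdma_mode;  read into 'dummy' and
         *                                        discarded for now (see XXX)
         */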
@@ -847,6 +863,17 @@ nfsd4_decode_secinfo(struct nfsd4_compoundargs *argp,
847} 863}
848 864
849static __be32 865static __be32
866nfsd4_decode_secinfo_no_name(struct nfsd4_compoundargs *argp,
867 struct nfsd4_secinfo_no_name *sin)
868{
869 DECODE_HEAD;
870
871 READ_BUF(4);
872 READ32(sin->sin_style);
873 DECODE_TAIL;
874}
875
876static __be32
850nfsd4_decode_setattr(struct nfsd4_compoundargs *argp, struct nfsd4_setattr *setattr) 877nfsd4_decode_setattr(struct nfsd4_compoundargs *argp, struct nfsd4_setattr *setattr)
851{ 878{
852 __be32 status; 879 __be32 status;
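SECINFO_NO_NAME carries a single enum argument; the two styles defined by RFC 5661 are (sketch, values as specified there):

        /*
         * SECINFO_STYLE4_CURRENT_FH = 0    negotiate for the current fh
         * SECINFO_STYLE4_PARENT     = 1    negotiate for its parent dir
         */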
@@ -1005,7 +1032,7 @@ static __be32
1005nfsd4_decode_exchange_id(struct nfsd4_compoundargs *argp, 1032nfsd4_decode_exchange_id(struct nfsd4_compoundargs *argp,
1006 struct nfsd4_exchange_id *exid) 1033 struct nfsd4_exchange_id *exid)
1007{ 1034{
1008 int dummy; 1035 int dummy, tmp;
1009 DECODE_HEAD; 1036 DECODE_HEAD;
1010 1037
1011 READ_BUF(NFS4_VERIFIER_SIZE); 1038 READ_BUF(NFS4_VERIFIER_SIZE);
@@ -1053,15 +1080,23 @@ nfsd4_decode_exchange_id(struct nfsd4_compoundargs *argp,
1053 1080
1054 /* ssp_hash_algs<> */ 1081 /* ssp_hash_algs<> */
1055 READ_BUF(4); 1082 READ_BUF(4);
1056 READ32(dummy); 1083 READ32(tmp);
1057 READ_BUF(dummy); 1084 while (tmp--) {
1058 p += XDR_QUADLEN(dummy); 1085 READ_BUF(4);
1086 READ32(dummy);
1087 READ_BUF(dummy);
1088 p += XDR_QUADLEN(dummy);
1089 }
1059 1090
1060 /* ssp_encr_algs<> */ 1091 /* ssp_encr_algs<> */
1061 READ_BUF(4); 1092 READ_BUF(4);
1062 READ32(dummy); 1093 READ32(tmp);
1063 READ_BUF(dummy); 1094 while (tmp--) {
1064 p += XDR_QUADLEN(dummy); 1095 READ_BUF(4);
1096 READ32(dummy);
1097 READ_BUF(dummy);
1098 p += XDR_QUADLEN(dummy);
1099 }
1065 1100
1066 /* ssp_window and ssp_num_gss_handles */ 1101 /* ssp_window and ssp_num_gss_handles */
1067 READ_BUF(8); 1102 READ_BUF(8);
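The old code read one length per algorithm list and skipped a single opaque, which mis-parses a list with more than one entry; the loops above apply the standard pattern for a counted XDR array of variable-length opaques. Written out once with this file's macros (sketch; count and len are illustrative names):

        READ_BUF(4);
        READ32(count);                  /* number of array entries */
        while (count--) {
                READ_BUF(4);
                READ32(len);            /* length of this opaque<> */
                READ_BUF(len);
                p += XDR_QUADLEN(len);  /* skip the padded payload */
        }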
@@ -1339,7 +1374,7 @@ static nfsd4_dec nfsd41_dec_ops[] = {
1339 1374
1340 /* new operations for NFSv4.1 */ 1375 /* new operations for NFSv4.1 */
1341 [OP_BACKCHANNEL_CTL] = (nfsd4_dec)nfsd4_decode_notsupp, 1376 [OP_BACKCHANNEL_CTL] = (nfsd4_dec)nfsd4_decode_notsupp,
1342 [OP_BIND_CONN_TO_SESSION]= (nfsd4_dec)nfsd4_decode_notsupp, 1377 [OP_BIND_CONN_TO_SESSION]= (nfsd4_dec)nfsd4_decode_bind_conn_to_session,
1343 [OP_EXCHANGE_ID] = (nfsd4_dec)nfsd4_decode_exchange_id, 1378 [OP_EXCHANGE_ID] = (nfsd4_dec)nfsd4_decode_exchange_id,
1344 [OP_CREATE_SESSION] = (nfsd4_dec)nfsd4_decode_create_session, 1379 [OP_CREATE_SESSION] = (nfsd4_dec)nfsd4_decode_create_session,
1345 [OP_DESTROY_SESSION] = (nfsd4_dec)nfsd4_decode_destroy_session, 1380 [OP_DESTROY_SESSION] = (nfsd4_dec)nfsd4_decode_destroy_session,
@@ -1350,7 +1385,7 @@ static nfsd4_dec nfsd41_dec_ops[] = {
1350 [OP_LAYOUTCOMMIT] = (nfsd4_dec)nfsd4_decode_notsupp, 1385 [OP_LAYOUTCOMMIT] = (nfsd4_dec)nfsd4_decode_notsupp,
1351 [OP_LAYOUTGET] = (nfsd4_dec)nfsd4_decode_notsupp, 1386 [OP_LAYOUTGET] = (nfsd4_dec)nfsd4_decode_notsupp,
1352 [OP_LAYOUTRETURN] = (nfsd4_dec)nfsd4_decode_notsupp, 1387 [OP_LAYOUTRETURN] = (nfsd4_dec)nfsd4_decode_notsupp,
1353 [OP_SECINFO_NO_NAME] = (nfsd4_dec)nfsd4_decode_notsupp, 1388 [OP_SECINFO_NO_NAME] = (nfsd4_dec)nfsd4_decode_secinfo_no_name,
1354 [OP_SEQUENCE] = (nfsd4_dec)nfsd4_decode_sequence, 1389 [OP_SEQUENCE] = (nfsd4_dec)nfsd4_decode_sequence,
1355 [OP_SET_SSV] = (nfsd4_dec)nfsd4_decode_notsupp, 1390 [OP_SET_SSV] = (nfsd4_dec)nfsd4_decode_notsupp,
1356 [OP_TEST_STATEID] = (nfsd4_dec)nfsd4_decode_notsupp, 1391 [OP_TEST_STATEID] = (nfsd4_dec)nfsd4_decode_notsupp,
@@ -2309,8 +2344,6 @@ nfsd4_encode_dirent(void *ccdv, const char *name, int namlen,
2309 case nfserr_resource: 2344 case nfserr_resource:
2310 nfserr = nfserr_toosmall; 2345 nfserr = nfserr_toosmall;
2311 goto fail; 2346 goto fail;
2312 case nfserr_dropit:
2313 goto fail;
2314 case nfserr_noent: 2347 case nfserr_noent:
2315 goto skip_entry; 2348 goto skip_entry;
2316 default: 2349 default:
@@ -2365,6 +2398,21 @@ nfsd4_encode_access(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_
2365 return nfserr; 2398 return nfserr;
2366} 2399}
2367 2400
2401static __be32 nfsd4_encode_bind_conn_to_session(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_bind_conn_to_session *bcts)
2402{
2403 __be32 *p;
2404
2405 if (!nfserr) {
2406 RESERVE_SPACE(NFS4_MAX_SESSIONID_LEN + 8);
2407 WRITEMEM(bcts->sessionid.data, NFS4_MAX_SESSIONID_LEN);
2408 WRITE32(bcts->dir);
2409 /* XXX: ? */
2410 WRITE32(0);
2411 ADJUST_ARGS();
2412 }
2413 return nfserr;
2414}
2415
2368static __be32 2416static __be32
2369nfsd4_encode_close(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_close *close) 2417nfsd4_encode_close(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_close *close)
2370{ 2418{
@@ -2826,11 +2874,10 @@ nfsd4_encode_rename(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_
2826} 2874}
2827 2875
2828static __be32 2876static __be32
2829nfsd4_encode_secinfo(struct nfsd4_compoundres *resp, __be32 nfserr, 2877nfsd4_do_encode_secinfo(struct nfsd4_compoundres *resp,
2830 struct nfsd4_secinfo *secinfo) 2878 __be32 nfserr, struct svc_export *exp)
2831{ 2879{
2832 int i = 0; 2880 int i = 0;
2833 struct svc_export *exp = secinfo->si_exp;
2834 u32 nflavs; 2881 u32 nflavs;
2835 struct exp_flavor_info *flavs; 2882 struct exp_flavor_info *flavs;
2836 struct exp_flavor_info def_flavs[2]; 2883 struct exp_flavor_info def_flavs[2];
@@ -2892,6 +2939,20 @@ out:
2892 return nfserr; 2939 return nfserr;
2893} 2940}
2894 2941
2942static __be32
2943nfsd4_encode_secinfo(struct nfsd4_compoundres *resp, __be32 nfserr,
2944 struct nfsd4_secinfo *secinfo)
2945{
2946 return nfsd4_do_encode_secinfo(resp, nfserr, secinfo->si_exp);
2947}
2948
2949static __be32
2950nfsd4_encode_secinfo_no_name(struct nfsd4_compoundres *resp, __be32 nfserr,
2951 struct nfsd4_secinfo_no_name *secinfo)
2952{
2953 return nfsd4_do_encode_secinfo(resp, nfserr, secinfo->sin_exp);
2954}
2955
2895/* 2956/*
2896 * The SETATTR encode routine is special -- it always encodes a bitmap, 2957 * The SETATTR encode routine is special -- it always encodes a bitmap,
2897 * regardless of the error status. 2958 * regardless of the error status.
@@ -3076,13 +3137,9 @@ nfsd4_encode_sequence(struct nfsd4_compoundres *resp, int nfserr,
3076 WRITE32(seq->seqid); 3137 WRITE32(seq->seqid);
3077 WRITE32(seq->slotid); 3138 WRITE32(seq->slotid);
3078 WRITE32(seq->maxslots); 3139 WRITE32(seq->maxslots);
3079 /* 3140 /* For now: target_maxslots = maxslots */
3080 * FIXME: for now:
3081 * target_maxslots = maxslots
3082 * status_flags = 0
3083 */
3084 WRITE32(seq->maxslots); 3141 WRITE32(seq->maxslots);
3085 WRITE32(0); 3142 WRITE32(seq->status_flags);
3086 3143
3087 ADJUST_ARGS(); 3144 ADJUST_ARGS();
3088 resp->cstate.datap = p; /* DRC cache data pointer */ 3145 resp->cstate.datap = p; /* DRC cache data pointer */
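Wiring status_flags through lets SEQUENCE report backchannel health to NFSv4.1 clients instead of always encoding zero. A sketch, assuming the SEQ4_STATUS_CB_PATH_DOWN flag defined by RFC 5661:

        /* Sketch: flag a dead backchannel in the next SEQUENCE reply. */
        if (clp->cl_cb_state == NFSD4_CB_DOWN)
                seq->status_flags = SEQ4_STATUS_CB_PATH_DOWN;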
@@ -3143,7 +3200,7 @@ static nfsd4_enc nfsd4_enc_ops[] = {
3143 3200
3144 /* NFSv4.1 operations */ 3201 /* NFSv4.1 operations */
3145 [OP_BACKCHANNEL_CTL] = (nfsd4_enc)nfsd4_encode_noop, 3202 [OP_BACKCHANNEL_CTL] = (nfsd4_enc)nfsd4_encode_noop,
3146 [OP_BIND_CONN_TO_SESSION] = (nfsd4_enc)nfsd4_encode_noop, 3203 [OP_BIND_CONN_TO_SESSION] = (nfsd4_enc)nfsd4_encode_bind_conn_to_session,
3147 [OP_EXCHANGE_ID] = (nfsd4_enc)nfsd4_encode_exchange_id, 3204 [OP_EXCHANGE_ID] = (nfsd4_enc)nfsd4_encode_exchange_id,
3148 [OP_CREATE_SESSION] = (nfsd4_enc)nfsd4_encode_create_session, 3205 [OP_CREATE_SESSION] = (nfsd4_enc)nfsd4_encode_create_session,
3149 [OP_DESTROY_SESSION] = (nfsd4_enc)nfsd4_encode_destroy_session, 3206 [OP_DESTROY_SESSION] = (nfsd4_enc)nfsd4_encode_destroy_session,
@@ -3154,7 +3211,7 @@ static nfsd4_enc nfsd4_enc_ops[] = {
3154 [OP_LAYOUTCOMMIT] = (nfsd4_enc)nfsd4_encode_noop, 3211 [OP_LAYOUTCOMMIT] = (nfsd4_enc)nfsd4_encode_noop,
3155 [OP_LAYOUTGET] = (nfsd4_enc)nfsd4_encode_noop, 3212 [OP_LAYOUTGET] = (nfsd4_enc)nfsd4_encode_noop,
3156 [OP_LAYOUTRETURN] = (nfsd4_enc)nfsd4_encode_noop, 3213 [OP_LAYOUTRETURN] = (nfsd4_enc)nfsd4_encode_noop,
3157 [OP_SECINFO_NO_NAME] = (nfsd4_enc)nfsd4_encode_noop, 3214 [OP_SECINFO_NO_NAME] = (nfsd4_enc)nfsd4_encode_secinfo_no_name,
3158 [OP_SEQUENCE] = (nfsd4_enc)nfsd4_encode_sequence, 3215 [OP_SEQUENCE] = (nfsd4_enc)nfsd4_encode_sequence,
3159 [OP_SET_SSV] = (nfsd4_enc)nfsd4_encode_noop, 3216 [OP_SET_SSV] = (nfsd4_enc)nfsd4_encode_noop,
3160 [OP_TEST_STATEID] = (nfsd4_enc)nfsd4_encode_noop, 3217 [OP_TEST_STATEID] = (nfsd4_enc)nfsd4_encode_noop,
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 4514ebbee4d6..33b3e2b06779 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -8,12 +8,12 @@
8#include <linux/namei.h> 8#include <linux/namei.h>
9#include <linux/ctype.h> 9#include <linux/ctype.h>
10 10
11#include <linux/nfsd_idmap.h>
12#include <linux/sunrpc/svcsock.h> 11#include <linux/sunrpc/svcsock.h>
13#include <linux/nfsd/syscall.h> 12#include <linux/nfsd/syscall.h>
14#include <linux/lockd/lockd.h> 13#include <linux/lockd/lockd.h>
15#include <linux/sunrpc/clnt.h> 14#include <linux/sunrpc/clnt.h>
16 15
16#include "idmap.h"
17#include "nfsd.h" 17#include "nfsd.h"
18#include "cache.h" 18#include "cache.h"
19 19
@@ -127,6 +127,7 @@ static ssize_t nfsctl_transaction_write(struct file *file, const char __user *bu
127 127
128static ssize_t nfsctl_transaction_read(struct file *file, char __user *buf, size_t size, loff_t *pos) 128static ssize_t nfsctl_transaction_read(struct file *file, char __user *buf, size_t size, loff_t *pos)
129{ 129{
130#ifdef CONFIG_NFSD_DEPRECATED
130 static int warned; 131 static int warned;
131 if (file->f_dentry->d_name.name[0] == '.' && !warned) { 132 if (file->f_dentry->d_name.name[0] == '.' && !warned) {
132 printk(KERN_INFO 133 printk(KERN_INFO
@@ -135,6 +136,7 @@ static ssize_t nfsctl_transaction_read(struct file *file, char __user *buf, size
135 current->comm, file->f_dentry->d_name.name); 136 current->comm, file->f_dentry->d_name.name);
136 warned = 1; 137 warned = 1;
137 } 138 }
139#endif
138 if (! file->private_data) { 140 if (! file->private_data) {
139 /* An attempt to read a transaction file without writing 141 /* An attempt to read a transaction file without writing
140 * causes a 0-byte write so that the file can return 142 * causes a 0-byte write so that the file can return
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index 6b641cf2c19a..7ecfa2420307 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -158,6 +158,7 @@ void nfsd_lockd_shutdown(void);
158#define nfserr_attrnotsupp cpu_to_be32(NFSERR_ATTRNOTSUPP) 158#define nfserr_attrnotsupp cpu_to_be32(NFSERR_ATTRNOTSUPP)
159#define nfserr_bad_xdr cpu_to_be32(NFSERR_BAD_XDR) 159#define nfserr_bad_xdr cpu_to_be32(NFSERR_BAD_XDR)
160#define nfserr_openmode cpu_to_be32(NFSERR_OPENMODE) 160#define nfserr_openmode cpu_to_be32(NFSERR_OPENMODE)
161#define nfserr_badowner cpu_to_be32(NFSERR_BADOWNER)
161#define nfserr_locks_held cpu_to_be32(NFSERR_LOCKS_HELD) 162#define nfserr_locks_held cpu_to_be32(NFSERR_LOCKS_HELD)
162#define nfserr_op_illegal cpu_to_be32(NFSERR_OP_ILLEGAL) 163#define nfserr_op_illegal cpu_to_be32(NFSERR_OP_ILLEGAL)
163#define nfserr_grace cpu_to_be32(NFSERR_GRACE) 164#define nfserr_grace cpu_to_be32(NFSERR_GRACE)
diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
index 08e17264784b..e15dc45fc5ec 100644
--- a/fs/nfsd/nfsproc.c
+++ b/fs/nfsd/nfsproc.c
@@ -735,9 +735,9 @@ nfserrno (int errno)
735 { nfserr_stale, -ESTALE }, 735 { nfserr_stale, -ESTALE },
736 { nfserr_jukebox, -ETIMEDOUT }, 736 { nfserr_jukebox, -ETIMEDOUT },
737 { nfserr_jukebox, -ERESTARTSYS }, 737 { nfserr_jukebox, -ERESTARTSYS },
738 { nfserr_dropit, -EAGAIN }, 738 { nfserr_jukebox, -EAGAIN },
739 { nfserr_dropit, -ENOMEM }, 739 { nfserr_jukebox, -EWOULDBLOCK },
740 { nfserr_badname, -ESRCH }, 740 { nfserr_jukebox, -ENOMEM },
741 { nfserr_io, -ETXTBSY }, 741 { nfserr_io, -ETXTBSY },
742 { nfserr_notsupp, -EOPNOTSUPP }, 742 { nfserr_notsupp, -EOPNOTSUPP },
743 { nfserr_toosmall, -ETOOSMALL }, 743 { nfserr_toosmall, -ETOOSMALL },
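Mapping -EAGAIN, -EWOULDBLOCK, and -ENOMEM to nfserr_jukebox (NFS4ERR_DELAY on v4) tells the client to retry later rather than dropping the request. The effect at a lease-break call site (sketch):

        host_err = break_lease(inode, O_WRONLY | O_NONBLOCK);
        if (host_err)   /* e.g. -EWOULDBLOCK while the lease is recalled */
                return nfserrno(host_err);      /* -> nfserr_jukebox */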
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 2bae1d86f5f2..18743c4d8bca 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -608,7 +608,7 @@ nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp)
608 /* Now call the procedure handler, and encode NFS status. */ 608 /* Now call the procedure handler, and encode NFS status. */
609 nfserr = proc->pc_func(rqstp, rqstp->rq_argp, rqstp->rq_resp); 609 nfserr = proc->pc_func(rqstp, rqstp->rq_argp, rqstp->rq_resp);
610 nfserr = map_new_errors(rqstp->rq_vers, nfserr); 610 nfserr = map_new_errors(rqstp->rq_vers, nfserr);
611 if (nfserr == nfserr_dropit) { 611 if (nfserr == nfserr_dropit || rqstp->rq_dropme) {
612 dprintk("nfsd: Dropping request; may be revisited later\n"); 612 dprintk("nfsd: Dropping request; may be revisited later\n");
613 nfsd_cache_update(rqstp, RC_NOCACHE, NULL); 613 nfsd_cache_update(rqstp, RC_NOCACHE, NULL);
614 return 0; 614 return 0;
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 39adc27b0685..3074656ba7bf 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -68,10 +68,12 @@ typedef struct {
68struct nfsd4_callback { 68struct nfsd4_callback {
69 void *cb_op; 69 void *cb_op;
70 struct nfs4_client *cb_clp; 70 struct nfs4_client *cb_clp;
71 struct list_head cb_per_client;
71 u32 cb_minorversion; 72 u32 cb_minorversion;
72 struct rpc_message cb_msg; 73 struct rpc_message cb_msg;
73 const struct rpc_call_ops *cb_ops; 74 const struct rpc_call_ops *cb_ops;
74 struct work_struct cb_work; 75 struct work_struct cb_work;
76 bool cb_done;
75}; 77};
76 78
77struct nfs4_delegation { 79struct nfs4_delegation {
@@ -81,6 +83,7 @@ struct nfs4_delegation {
81 atomic_t dl_count; /* ref count */ 83 atomic_t dl_count; /* ref count */
82 struct nfs4_client *dl_client; 84 struct nfs4_client *dl_client;
83 struct nfs4_file *dl_file; 85 struct nfs4_file *dl_file;
86 struct file *dl_vfs_file;
84 struct file_lock *dl_flock; 87 struct file_lock *dl_flock;
85 u32 dl_type; 88 u32 dl_type;
86 time_t dl_time; 89 time_t dl_time;
@@ -95,6 +98,7 @@ struct nfs4_delegation {
95struct nfs4_cb_conn { 98struct nfs4_cb_conn {
96 /* SETCLIENTID info */ 99 /* SETCLIENTID info */
97 struct sockaddr_storage cb_addr; 100 struct sockaddr_storage cb_addr;
101 struct sockaddr_storage cb_saddr;
98 size_t cb_addrlen; 102 size_t cb_addrlen;
99 u32 cb_prog; /* used only in 4.0 case; 103 u32 cb_prog; /* used only in 4.0 case;
100 per-session otherwise */ 104 per-session otherwise */
@@ -146,6 +150,11 @@ struct nfsd4_create_session {
146 u32 gid; 150 u32 gid;
147}; 151};
148 152
153struct nfsd4_bind_conn_to_session {
154 struct nfs4_sessionid sessionid;
155 u32 dir;
156};
157
149/* The single slot clientid cache structure */ 158/* The single slot clientid cache structure */
150struct nfsd4_clid_slot { 159struct nfsd4_clid_slot {
151 u32 sl_seqid; 160 u32 sl_seqid;
@@ -235,9 +244,13 @@ struct nfs4_client {
235 unsigned long cl_cb_flags; 244 unsigned long cl_cb_flags;
236 struct rpc_clnt *cl_cb_client; 245 struct rpc_clnt *cl_cb_client;
237 u32 cl_cb_ident; 246 u32 cl_cb_ident;
238 atomic_t cl_cb_set; 247#define NFSD4_CB_UP 0
248#define NFSD4_CB_UNKNOWN 1
249#define NFSD4_CB_DOWN 2
250 int cl_cb_state;
239 struct nfsd4_callback cl_cb_null; 251 struct nfsd4_callback cl_cb_null;
240 struct nfsd4_session *cl_cb_session; 252 struct nfsd4_session *cl_cb_session;
253 struct list_head cl_callbacks; /* list of in-progress callbacks */
241 254
242 /* for all client information that callback code might need: */ 255 /* for all client information that callback code might need: */
243 spinlock_t cl_lock; 256 spinlock_t cl_lock;
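The tri-state cl_cb_state replaces the old boolean cl_cb_set, so "not yet probed" is now distinguishable from "known dead". The intended transitions (sketch):

        /*
         * NFSD4_CB_UNKNOWN -- probe succeeds ------> NFSD4_CB_UP
         * NFSD4_CB_UNKNOWN -- probe fails ---------> NFSD4_CB_DOWN
         * NFSD4_CB_UP ------- callback RPC error --> NFSD4_CB_DOWN
         *
         * nfsd4_cb_channel_good() treats UNKNOWN as usable only for 4.1
         * clients, whose backchannel shares the fore connection.
         */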
@@ -454,6 +467,7 @@ extern __be32 nfs4_check_open_reclaim(clientid_t *clid);
454extern void nfs4_free_stateowner(struct kref *kref); 467extern void nfs4_free_stateowner(struct kref *kref);
455extern int set_callback_cred(void); 468extern int set_callback_cred(void);
456extern void nfsd4_probe_callback(struct nfs4_client *clp); 469extern void nfsd4_probe_callback(struct nfs4_client *clp);
470extern void nfsd4_probe_callback_sync(struct nfs4_client *clp);
457extern void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *); 471extern void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *);
458extern void nfsd4_do_callback_rpc(struct work_struct *); 472extern void nfsd4_do_callback_rpc(struct work_struct *);
459extern void nfsd4_cb_recall(struct nfs4_delegation *dp); 473extern void nfsd4_cb_recall(struct nfs4_delegation *dp);
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 3a359023c9f7..641117f2188d 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1,4 +1,3 @@
1#define MSNFS /* HACK HACK */
2/* 1/*
3 * File operations used by nfsd. Some of these have been ripped from 2 * File operations used by nfsd. Some of these have been ripped from
4 * other parts of the kernel because they weren't exported, others 3 * other parts of the kernel because they weren't exported, others
@@ -35,8 +34,8 @@
35#endif /* CONFIG_NFSD_V3 */ 34#endif /* CONFIG_NFSD_V3 */
36 35
37#ifdef CONFIG_NFSD_V4 36#ifdef CONFIG_NFSD_V4
38#include <linux/nfs4_acl.h> 37#include "acl.h"
39#include <linux/nfsd_idmap.h> 38#include "idmap.h"
40#endif /* CONFIG_NFSD_V4 */ 39#endif /* CONFIG_NFSD_V4 */
41 40
42#include "nfsd.h" 41#include "nfsd.h"
@@ -88,8 +87,9 @@ nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp,
88 .dentry = dget(dentry)}; 87 .dentry = dget(dentry)};
89 int err = 0; 88 int err = 0;
90 89
91 while (d_mountpoint(path.dentry) && follow_down(&path)) 90 err = follow_down(&path, false);
92 ; 91 if (err < 0)
92 goto out;
93 93
94 exp2 = rqst_exp_get_by_name(rqstp, &path); 94 exp2 = rqst_exp_get_by_name(rqstp, &path);
95 if (IS_ERR(exp2)) { 95 if (IS_ERR(exp2)) {
@@ -273,6 +273,13 @@ out:
273 return err; 273 return err;
274} 274}
275 275
276static int nfsd_break_lease(struct inode *inode)
277{
278 if (!S_ISREG(inode->i_mode))
279 return 0;
280 return break_lease(inode, O_WRONLY | O_NONBLOCK);
281}
282
276/* 283/*
277 * Commit metadata changes to stable storage. 284 * Commit metadata changes to stable storage.
278 */ 285 */
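nfsd_break_lease() centralizes recalling outstanding delegations before metadata-changing operations; the S_ISREG check matters because only regular files carry leases. The typical call-site pattern used by the setattr, link, rename, and unlink hunks below (sketch):

        host_err = nfsd_break_lease(dentry->d_inode);
        if (host_err)           /* -EWOULDBLOCK -> nfserr_jukebox */
                goto out_nfserr;
        host_err = vfs_unlink(dirp, dentry);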
@@ -375,16 +382,6 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
375 goto out; 382 goto out;
376 } 383 }
377 384
378 /*
379 * If we are changing the size of the file, then
380 * we need to break all leases.
381 */
382 host_err = break_lease(inode, O_WRONLY | O_NONBLOCK);
383 if (host_err == -EWOULDBLOCK)
384 host_err = -ETIMEDOUT;
385 if (host_err) /* ENOMEM or EWOULDBLOCK */
386 goto out_nfserr;
387
388 host_err = get_write_access(inode); 385 host_err = get_write_access(inode);
389 if (host_err) 386 if (host_err)
390 goto out_nfserr; 387 goto out_nfserr;
@@ -425,7 +422,11 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
425 422
426 err = nfserr_notsync; 423 err = nfserr_notsync;
427 if (!check_guard || guardtime == inode->i_ctime.tv_sec) { 424 if (!check_guard || guardtime == inode->i_ctime.tv_sec) {
425 host_err = nfsd_break_lease(inode);
426 if (host_err)
427 goto out_nfserr;
428 fh_lock(fhp); 428 fh_lock(fhp);
429
429 host_err = notify_change(dentry, iap); 430 host_err = notify_change(dentry, iap);
430 err = nfserrno(host_err); 431 err = nfserrno(host_err);
431 fh_unlock(fhp); 432 fh_unlock(fhp);
@@ -752,8 +753,6 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
752 */ 753 */
753 if (!(access & NFSD_MAY_NOT_BREAK_LEASE)) 754 if (!(access & NFSD_MAY_NOT_BREAK_LEASE))
754 host_err = break_lease(inode, O_NONBLOCK | ((access & NFSD_MAY_WRITE) ? O_WRONLY : 0)); 755 host_err = break_lease(inode, O_NONBLOCK | ((access & NFSD_MAY_WRITE) ? O_WRONLY : 0));
755 if (host_err == -EWOULDBLOCK)
756 host_err = -ETIMEDOUT;
757 if (host_err) /* NOMEM or WOULDBLOCK */ 756 if (host_err) /* NOMEM or WOULDBLOCK */
758 goto out_nfserr; 757 goto out_nfserr;
759 758
@@ -845,11 +844,6 @@ nfsd_splice_actor(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
845 struct page **pp = rqstp->rq_respages + rqstp->rq_resused; 844 struct page **pp = rqstp->rq_respages + rqstp->rq_resused;
846 struct page *page = buf->page; 845 struct page *page = buf->page;
847 size_t size; 846 size_t size;
848 int ret;
849
850 ret = buf->ops->confirm(pipe, buf);
851 if (unlikely(ret))
852 return ret;
853 847
854 size = sd->len; 848 size = sd->len;
855 849
@@ -879,15 +873,6 @@ static int nfsd_direct_splice_actor(struct pipe_inode_info *pipe,
879 return __splice_from_pipe(pipe, sd, nfsd_splice_actor); 873 return __splice_from_pipe(pipe, sd, nfsd_splice_actor);
880} 874}
881 875
882static inline int svc_msnfs(struct svc_fh *ffhp)
883{
884#ifdef MSNFS
885 return (ffhp->fh_export->ex_flags & NFSEXP_MSNFS);
886#else
887 return 0;
888#endif
889}
890
891static __be32 876static __be32
892nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, 877nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
893 loff_t offset, struct kvec *vec, int vlen, unsigned long *count) 878 loff_t offset, struct kvec *vec, int vlen, unsigned long *count)
@@ -900,9 +885,6 @@ nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
900 err = nfserr_perm; 885 err = nfserr_perm;
901 inode = file->f_path.dentry->d_inode; 886 inode = file->f_path.dentry->d_inode;
902 887
903 if (svc_msnfs(fhp) && !lock_may_read(inode, offset, *count))
904 goto out;
905
906 if (file->f_op->splice_read && rqstp->rq_splice_ok) { 888 if (file->f_op->splice_read && rqstp->rq_splice_ok) {
907 struct splice_desc sd = { 889 struct splice_desc sd = {
908 .len = 0, 890 .len = 0,
@@ -927,7 +909,6 @@ nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
927 fsnotify_access(file); 909 fsnotify_access(file);
928 } else 910 } else
929 err = nfserrno(host_err); 911 err = nfserrno(host_err);
930out:
931 return err; 912 return err;
932} 913}
933 914
@@ -992,14 +973,6 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
992 int stable = *stablep; 973 int stable = *stablep;
993 int use_wgather; 974 int use_wgather;
994 975
995#ifdef MSNFS
996 err = nfserr_perm;
997
998 if ((fhp->fh_export->ex_flags & NFSEXP_MSNFS) &&
999 (!lock_may_write(file->f_path.dentry->d_inode, offset, *cnt)))
1000 goto out;
1001#endif
1002
1003 dentry = file->f_path.dentry; 976 dentry = file->f_path.dentry;
1004 inode = dentry->d_inode; 977 inode = dentry->d_inode;
1005 exp = fhp->fh_export; 978 exp = fhp->fh_export;
@@ -1050,7 +1023,6 @@ out_nfserr:
1050 err = 0; 1023 err = 0;
1051 else 1024 else
1052 err = nfserrno(host_err); 1025 err = nfserrno(host_err);
1053out:
1054 return err; 1026 return err;
1055} 1027}
1056 1028
@@ -1670,6 +1642,12 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp,
1670 err = nfserrno(host_err); 1642 err = nfserrno(host_err);
1671 goto out_dput; 1643 goto out_dput;
1672 } 1644 }
1645 err = nfserr_noent;
1646 if (!dold->d_inode)
1647 goto out_drop_write;
1648 host_err = nfsd_break_lease(dold->d_inode);
1649 if (host_err)
1650 goto out_drop_write;
1673 host_err = vfs_link(dold, dirp, dnew); 1651 host_err = vfs_link(dold, dirp, dnew);
1674 if (!host_err) { 1652 if (!host_err) {
1675 err = nfserrno(commit_metadata(ffhp)); 1653 err = nfserrno(commit_metadata(ffhp));
@@ -1681,6 +1659,7 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp,
1681 else 1659 else
1682 err = nfserrno(host_err); 1660 err = nfserrno(host_err);
1683 } 1661 }
1662out_drop_write:
1684 mnt_drop_write(tfhp->fh_export->ex_path.mnt); 1663 mnt_drop_write(tfhp->fh_export->ex_path.mnt);
1685out_dput: 1664out_dput:
1686 dput(dnew); 1665 dput(dnew);
@@ -1755,12 +1734,6 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
1755 if (ndentry == trap) 1734 if (ndentry == trap)
1756 goto out_dput_new; 1735 goto out_dput_new;
1757 1736
1758 if (svc_msnfs(ffhp) &&
1759 ((odentry->d_count > 1) || (ndentry->d_count > 1))) {
1760 host_err = -EPERM;
1761 goto out_dput_new;
1762 }
1763
1764 host_err = -EXDEV; 1737 host_err = -EXDEV;
1765 if (ffhp->fh_export->ex_path.mnt != tfhp->fh_export->ex_path.mnt) 1738 if (ffhp->fh_export->ex_path.mnt != tfhp->fh_export->ex_path.mnt)
1766 goto out_dput_new; 1739 goto out_dput_new;
@@ -1768,15 +1741,17 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
1768 if (host_err) 1741 if (host_err)
1769 goto out_dput_new; 1742 goto out_dput_new;
1770 1743
1744 host_err = nfsd_break_lease(odentry->d_inode);
1745 if (host_err)
1746 goto out_drop_write;
1771 host_err = vfs_rename(fdir, odentry, tdir, ndentry); 1747 host_err = vfs_rename(fdir, odentry, tdir, ndentry);
1772 if (!host_err) { 1748 if (!host_err) {
1773 host_err = commit_metadata(tfhp); 1749 host_err = commit_metadata(tfhp);
1774 if (!host_err) 1750 if (!host_err)
1775 host_err = commit_metadata(ffhp); 1751 host_err = commit_metadata(ffhp);
1776 } 1752 }
1777 1753out_drop_write:
1778 mnt_drop_write(ffhp->fh_export->ex_path.mnt); 1754 mnt_drop_write(ffhp->fh_export->ex_path.mnt);
1779
1780 out_dput_new: 1755 out_dput_new:
1781 dput(ndentry); 1756 dput(ndentry);
1782 out_dput_old: 1757 out_dput_old:
@@ -1839,18 +1814,14 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
1839 if (host_err) 1814 if (host_err)
1840 goto out_nfserr; 1815 goto out_nfserr;
1841 1816
1842 if (type != S_IFDIR) { /* It's UNLINK */ 1817 host_err = nfsd_break_lease(rdentry->d_inode);
1843#ifdef MSNFS 1818 if (host_err)
1844 if ((fhp->fh_export->ex_flags & NFSEXP_MSNFS) && 1819 goto out_put;
1845 (rdentry->d_count > 1)) { 1820 if (type != S_IFDIR)
1846 host_err = -EPERM;
1847 } else
1848#endif
1849 host_err = vfs_unlink(dirp, rdentry); 1821 host_err = vfs_unlink(dirp, rdentry);
1850 } else { /* It's RMDIR */ 1822 else
1851 host_err = vfs_rmdir(dirp, rdentry); 1823 host_err = vfs_rmdir(dirp, rdentry);
1852 } 1824out_put:
1853
1854 dput(rdentry); 1825 dput(rdentry);
1855 1826
1856 if (!host_err) 1827 if (!host_err)
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index 60fce3dc5cb5..366401e1a536 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -311,6 +311,11 @@ struct nfsd4_secinfo {
311 struct svc_export *si_exp; /* response */ 311 struct svc_export *si_exp; /* response */
312}; 312};
313 313
314struct nfsd4_secinfo_no_name {
315 u32 sin_style; /* request */
316 struct svc_export *sin_exp; /* response */
317};
318
314struct nfsd4_setattr { 319struct nfsd4_setattr {
315 stateid_t sa_stateid; /* request */ 320 stateid_t sa_stateid; /* request */
316 u32 sa_bmval[3]; /* request */ 321 u32 sa_bmval[3]; /* request */
@@ -373,8 +378,8 @@ struct nfsd4_sequence {
373 u32 cachethis; /* request */ 378 u32 cachethis; /* request */
374#if 0 379#if 0
375 u32 target_maxslots; /* response */ 380 u32 target_maxslots; /* response */
376 u32 status_flags; /* response */
377#endif /* not yet */ 381#endif /* not yet */
382 u32 status_flags; /* response */
378}; 383};
379 384
380struct nfsd4_destroy_session { 385struct nfsd4_destroy_session {
@@ -422,6 +427,7 @@ struct nfsd4_op {
422 427
423 /* NFSv4.1 */ 428 /* NFSv4.1 */
424 struct nfsd4_exchange_id exchange_id; 429 struct nfsd4_exchange_id exchange_id;
430 struct nfsd4_bind_conn_to_session bind_conn_to_session;
425 struct nfsd4_create_session create_session; 431 struct nfsd4_create_session create_session;
426 struct nfsd4_destroy_session destroy_session; 432 struct nfsd4_destroy_session destroy_session;
427 struct nfsd4_sequence sequence; 433 struct nfsd4_sequence sequence;
@@ -518,6 +524,7 @@ extern __be32 nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp,
518 struct nfsd4_sequence *seq); 524 struct nfsd4_sequence *seq);
519extern __be32 nfsd4_exchange_id(struct svc_rqst *rqstp, 525extern __be32 nfsd4_exchange_id(struct svc_rqst *rqstp,
520 struct nfsd4_compound_state *, struct nfsd4_exchange_id *); 526 struct nfsd4_compound_state *, struct nfsd4_exchange_id *);
527extern __be32 nfsd4_bind_conn_to_session(struct svc_rqst *, struct nfsd4_compound_state *, struct nfsd4_bind_conn_to_session *);
521extern __be32 nfsd4_create_session(struct svc_rqst *, 528extern __be32 nfsd4_create_session(struct svc_rqst *,
522 struct nfsd4_compound_state *, 529 struct nfsd4_compound_state *,
523 struct nfsd4_create_session *); 530 struct nfsd4_create_session *);
diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c
index 8b782b062baa..3ee67c67cc52 100644
--- a/fs/nilfs2/bmap.c
+++ b/fs/nilfs2/bmap.c
@@ -35,7 +35,20 @@
35 35
36struct inode *nilfs_bmap_get_dat(const struct nilfs_bmap *bmap) 36struct inode *nilfs_bmap_get_dat(const struct nilfs_bmap *bmap)
37{ 37{
38 return nilfs_dat_inode(NILFS_I_NILFS(bmap->b_inode)); 38 return NILFS_I_NILFS(bmap->b_inode)->ns_dat;
39}
40
41static int nilfs_bmap_convert_error(struct nilfs_bmap *bmap,
42 const char *fname, int err)
43{
44 struct inode *inode = bmap->b_inode;
45
46 if (err == -EINVAL) {
47 nilfs_error(inode->i_sb, fname,
48 "broken bmap (inode number=%lu)\n", inode->i_ino);
49 err = -EIO;
50 }
51 return err;
39} 52}
40 53
41/** 54/**
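nilfs_bmap_convert_error() funnels the -EINVAL "broken bmap" case through nilfs_error() in exactly one place, so the per-caller boilerplate removed elsewhere in this patch collapses to a single pattern (sketch; bop_foo stands for any bmap operation):

        down_write(&bmap->b_sem);
        ret = bmap->b_ops->bop_foo(bmap, ...);
        up_write(&bmap->b_sem);

        return nilfs_bmap_convert_error(bmap, __func__, ret);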
@@ -66,8 +79,10 @@ int nilfs_bmap_lookup_at_level(struct nilfs_bmap *bmap, __u64 key, int level,
66 79
67 down_read(&bmap->b_sem); 80 down_read(&bmap->b_sem);
68 ret = bmap->b_ops->bop_lookup(bmap, key, level, ptrp); 81 ret = bmap->b_ops->bop_lookup(bmap, key, level, ptrp);
69 if (ret < 0) 82 if (ret < 0) {
83 ret = nilfs_bmap_convert_error(bmap, __func__, ret);
70 goto out; 84 goto out;
85 }
71 if (NILFS_BMAP_USE_VBN(bmap)) { 86 if (NILFS_BMAP_USE_VBN(bmap)) {
72 ret = nilfs_dat_translate(nilfs_bmap_get_dat(bmap), *ptrp, 87 ret = nilfs_dat_translate(nilfs_bmap_get_dat(bmap), *ptrp,
73 &blocknr); 88 &blocknr);
@@ -88,7 +103,8 @@ int nilfs_bmap_lookup_contig(struct nilfs_bmap *bmap, __u64 key, __u64 *ptrp,
88 down_read(&bmap->b_sem); 103 down_read(&bmap->b_sem);
89 ret = bmap->b_ops->bop_lookup_contig(bmap, key, ptrp, maxblocks); 104 ret = bmap->b_ops->bop_lookup_contig(bmap, key, ptrp, maxblocks);
90 up_read(&bmap->b_sem); 105 up_read(&bmap->b_sem);
91 return ret; 106
107 return nilfs_bmap_convert_error(bmap, __func__, ret);
92} 108}
93 109
94static int nilfs_bmap_do_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr) 110static int nilfs_bmap_do_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr)
@@ -144,7 +160,8 @@ int nilfs_bmap_insert(struct nilfs_bmap *bmap,
144 down_write(&bmap->b_sem); 160 down_write(&bmap->b_sem);
145 ret = nilfs_bmap_do_insert(bmap, key, rec); 161 ret = nilfs_bmap_do_insert(bmap, key, rec);
146 up_write(&bmap->b_sem); 162 up_write(&bmap->b_sem);
147 return ret; 163
164 return nilfs_bmap_convert_error(bmap, __func__, ret);
148} 165}
149 166
150static int nilfs_bmap_do_delete(struct nilfs_bmap *bmap, __u64 key) 167static int nilfs_bmap_do_delete(struct nilfs_bmap *bmap, __u64 key)
@@ -180,9 +197,12 @@ int nilfs_bmap_last_key(struct nilfs_bmap *bmap, unsigned long *key)
180 197
181 down_read(&bmap->b_sem); 198 down_read(&bmap->b_sem);
182 ret = bmap->b_ops->bop_last_key(bmap, &lastkey); 199 ret = bmap->b_ops->bop_last_key(bmap, &lastkey);
183 if (!ret)
184 *key = lastkey;
185 up_read(&bmap->b_sem); 200 up_read(&bmap->b_sem);
201
202 if (ret < 0)
203 ret = nilfs_bmap_convert_error(bmap, __func__, ret);
204 else
205 *key = lastkey;
186 return ret; 206 return ret;
187} 207}
188 208
@@ -210,7 +230,8 @@ int nilfs_bmap_delete(struct nilfs_bmap *bmap, unsigned long key)
210 down_write(&bmap->b_sem); 230 down_write(&bmap->b_sem);
211 ret = nilfs_bmap_do_delete(bmap, key); 231 ret = nilfs_bmap_do_delete(bmap, key);
212 up_write(&bmap->b_sem); 232 up_write(&bmap->b_sem);
213 return ret; 233
234 return nilfs_bmap_convert_error(bmap, __func__, ret);
214} 235}
215 236
216static int nilfs_bmap_do_truncate(struct nilfs_bmap *bmap, unsigned long key) 237static int nilfs_bmap_do_truncate(struct nilfs_bmap *bmap, unsigned long key)
@@ -261,7 +282,8 @@ int nilfs_bmap_truncate(struct nilfs_bmap *bmap, unsigned long key)
261 down_write(&bmap->b_sem); 282 down_write(&bmap->b_sem);
262 ret = nilfs_bmap_do_truncate(bmap, key); 283 ret = nilfs_bmap_do_truncate(bmap, key);
263 up_write(&bmap->b_sem); 284 up_write(&bmap->b_sem);
264 return ret; 285
286 return nilfs_bmap_convert_error(bmap, __func__, ret);
265} 287}
266 288
267/** 289/**
@@ -300,7 +322,8 @@ int nilfs_bmap_propagate(struct nilfs_bmap *bmap, struct buffer_head *bh)
300 down_write(&bmap->b_sem); 322 down_write(&bmap->b_sem);
301 ret = bmap->b_ops->bop_propagate(bmap, bh); 323 ret = bmap->b_ops->bop_propagate(bmap, bh);
302 up_write(&bmap->b_sem); 324 up_write(&bmap->b_sem);
303 return ret; 325
326 return nilfs_bmap_convert_error(bmap, __func__, ret);
304} 327}
305 328
306/** 329/**
@@ -344,7 +367,8 @@ int nilfs_bmap_assign(struct nilfs_bmap *bmap,
344 down_write(&bmap->b_sem); 367 down_write(&bmap->b_sem);
345 ret = bmap->b_ops->bop_assign(bmap, bh, blocknr, binfo); 368 ret = bmap->b_ops->bop_assign(bmap, bh, blocknr, binfo);
346 up_write(&bmap->b_sem); 369 up_write(&bmap->b_sem);
347 return ret; 370
371 return nilfs_bmap_convert_error(bmap, __func__, ret);
348} 372}
349 373
350/** 374/**
@@ -373,7 +397,8 @@ int nilfs_bmap_mark(struct nilfs_bmap *bmap, __u64 key, int level)
373 down_write(&bmap->b_sem); 397 down_write(&bmap->b_sem);
374 ret = bmap->b_ops->bop_mark(bmap, key, level); 398 ret = bmap->b_ops->bop_mark(bmap, key, level);
375 up_write(&bmap->b_sem); 399 up_write(&bmap->b_sem);
376 return ret; 400
401 return nilfs_bmap_convert_error(bmap, __func__, ret);
377} 402}
378 403
379/** 404/**
diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c
index 5115814cb745..388e9e8f5286 100644
--- a/fs/nilfs2/btnode.c
+++ b/fs/nilfs2/btnode.c
@@ -104,8 +104,7 @@ int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr,
104 if (pblocknr == 0) { 104 if (pblocknr == 0) {
105 pblocknr = blocknr; 105 pblocknr = blocknr;
106 if (inode->i_ino != NILFS_DAT_INO) { 106 if (inode->i_ino != NILFS_DAT_INO) {
107 struct inode *dat = 107 struct inode *dat = NILFS_I_NILFS(inode)->ns_dat;
108 nilfs_dat_inode(NILFS_I_NILFS(inode));
109 108
110 /* blocknr is a virtual block number */ 109 /* blocknr is a virtual block number */
111 err = nilfs_dat_translate(dat, blocknr, &pblocknr); 110 err = nilfs_dat_translate(dat, blocknr, &pblocknr);
diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c
index cb003c8ee1f6..9d45773b79e6 100644
--- a/fs/nilfs2/dir.c
+++ b/fs/nilfs2/dir.c
@@ -91,7 +91,6 @@ static void nilfs_commit_chunk(struct page *page,
91 unsigned from, unsigned to) 91 unsigned from, unsigned to)
92{ 92{
93 struct inode *dir = mapping->host; 93 struct inode *dir = mapping->host;
94 struct nilfs_sb_info *sbi = NILFS_SB(dir->i_sb);
95 loff_t pos = page_offset(page) + from; 94 loff_t pos = page_offset(page) + from;
96 unsigned len = to - from; 95 unsigned len = to - from;
97 unsigned nr_dirty, copied; 96 unsigned nr_dirty, copied;
@@ -103,7 +102,7 @@ static void nilfs_commit_chunk(struct page *page,
103 i_size_write(dir, pos + copied); 102 i_size_write(dir, pos + copied);
104 if (IS_DIRSYNC(dir)) 103 if (IS_DIRSYNC(dir))
105 nilfs_set_transaction_flag(NILFS_TI_SYNC); 104 nilfs_set_transaction_flag(NILFS_TI_SYNC);
106 err = nilfs_set_file_dirty(sbi, dir, nr_dirty); 105 err = nilfs_set_file_dirty(dir, nr_dirty);
107 WARN_ON(err); /* do not happen */ 106 WARN_ON(err); /* do not happen */
108 unlock_page(page); 107 unlock_page(page);
109} 108}
diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c
index c9a30d7ff6fc..2f560c9fb808 100644
--- a/fs/nilfs2/file.c
+++ b/fs/nilfs2/file.c
@@ -155,6 +155,7 @@ const struct inode_operations nilfs_file_inode_operations = {
155 .truncate = nilfs_truncate, 155 .truncate = nilfs_truncate,
156 .setattr = nilfs_setattr, 156 .setattr = nilfs_setattr,
157 .permission = nilfs_permission, 157 .permission = nilfs_permission,
158 .fiemap = nilfs_fiemap,
158}; 159};
159 160
160/* end of file */ 161/* end of file */
diff --git a/fs/nilfs2/ifile.c b/fs/nilfs2/ifile.c
index 9f8a2da67f90..bfc73d3a30ed 100644
--- a/fs/nilfs2/ifile.c
+++ b/fs/nilfs2/ifile.c
@@ -149,14 +149,9 @@ int nilfs_ifile_get_inode_block(struct inode *ifile, ino_t ino,
149 } 149 }
150 150
151 err = nilfs_palloc_get_entry_block(ifile, ino, 0, out_bh); 151 err = nilfs_palloc_get_entry_block(ifile, ino, 0, out_bh);
152 if (unlikely(err)) { 152 if (unlikely(err))
153 if (err == -EINVAL) 153 nilfs_warning(sb, __func__, "unable to read inode: %lu",
154 nilfs_error(sb, __func__, "ifile is broken"); 154 (unsigned long) ino);
155 else
156 nilfs_warning(sb, __func__,
157 "unable to read inode: %lu",
158 (unsigned long) ino);
159 }
160 return err; 155 return err;
161} 156}
162 157
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 77b48c8fab17..2fd440d8d6b8 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -58,7 +58,7 @@ int nilfs_get_block(struct inode *inode, sector_t blkoff,
58 struct nilfs_inode_info *ii = NILFS_I(inode); 58 struct nilfs_inode_info *ii = NILFS_I(inode);
59 __u64 blknum = 0; 59 __u64 blknum = 0;
60 int err = 0, ret; 60 int err = 0, ret;
61 struct inode *dat = nilfs_dat_inode(NILFS_I_NILFS(inode)); 61 struct inode *dat = NILFS_I_NILFS(inode)->ns_dat;
62 unsigned maxblocks = bh_result->b_size >> inode->i_blkbits; 62 unsigned maxblocks = bh_result->b_size >> inode->i_blkbits;
63 63
64 down_read(&NILFS_MDT(dat)->mi_sem); 64 down_read(&NILFS_MDT(dat)->mi_sem);
@@ -96,11 +96,6 @@ int nilfs_get_block(struct inode *inode, sector_t blkoff,
96 inode->i_ino, 96 inode->i_ino,
97 (unsigned long long)blkoff); 97 (unsigned long long)blkoff);
98 err = 0; 98 err = 0;
99 } else if (err == -EINVAL) {
100 nilfs_error(inode->i_sb, __func__,
101 "broken bmap (inode=%lu)\n",
102 inode->i_ino);
103 err = -EIO;
104 } 99 }
105 nilfs_transaction_abort(inode->i_sb); 100 nilfs_transaction_abort(inode->i_sb);
106 goto out; 101 goto out;
@@ -109,6 +104,7 @@ int nilfs_get_block(struct inode *inode, sector_t blkoff,
109 nilfs_transaction_commit(inode->i_sb); /* never fails */ 104 nilfs_transaction_commit(inode->i_sb); /* never fails */
110 /* Error handling should be detailed */ 105 /* Error handling should be detailed */
111 set_buffer_new(bh_result); 106 set_buffer_new(bh_result);
107 set_buffer_delay(bh_result);
112 map_bh(bh_result, inode->i_sb, 0); /* dbn must be changed 108 map_bh(bh_result, inode->i_sb, 0); /* dbn must be changed
113 to proper value */ 109 to proper value */
114 } else if (ret == -ENOENT) { 110 } else if (ret == -ENOENT) {
@@ -185,10 +181,9 @@ static int nilfs_set_page_dirty(struct page *page)
185 181
186 if (ret) { 182 if (ret) {
187 struct inode *inode = page->mapping->host; 183 struct inode *inode = page->mapping->host;
188 struct nilfs_sb_info *sbi = NILFS_SB(inode->i_sb);
189 unsigned nr_dirty = 1 << (PAGE_SHIFT - inode->i_blkbits); 184 unsigned nr_dirty = 1 << (PAGE_SHIFT - inode->i_blkbits);
190 185
191 nilfs_set_file_dirty(sbi, inode, nr_dirty); 186 nilfs_set_file_dirty(inode, nr_dirty);
192 } 187 }
193 return ret; 188 return ret;
194} 189}
@@ -229,7 +224,7 @@ static int nilfs_write_end(struct file *file, struct address_space *mapping,
229 start + copied); 224 start + copied);
230 copied = generic_write_end(file, mapping, pos, len, copied, page, 225 copied = generic_write_end(file, mapping, pos, len, copied, page,
231 fsdata); 226 fsdata);
232 nilfs_set_file_dirty(NILFS_SB(inode->i_sb), inode, nr_dirty); 227 nilfs_set_file_dirty(inode, nr_dirty);
233 err = nilfs_transaction_commit(inode->i_sb); 228 err = nilfs_transaction_commit(inode->i_sb);
234 return err ? : copied; 229 return err ? : copied;
235} 230}
@@ -425,13 +420,12 @@ static int __nilfs_read_inode(struct super_block *sb,
425 struct nilfs_root *root, unsigned long ino, 420 struct nilfs_root *root, unsigned long ino,
426 struct inode *inode) 421 struct inode *inode)
427{ 422{
428 struct nilfs_sb_info *sbi = NILFS_SB(sb); 423 struct the_nilfs *nilfs = NILFS_SB(sb)->s_nilfs;
429 struct inode *dat = nilfs_dat_inode(sbi->s_nilfs);
430 struct buffer_head *bh; 424 struct buffer_head *bh;
431 struct nilfs_inode *raw_inode; 425 struct nilfs_inode *raw_inode;
432 int err; 426 int err;
433 427
434 down_read(&NILFS_MDT(dat)->mi_sem); /* XXX */ 428 down_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
435 err = nilfs_ifile_get_inode_block(root->ifile, ino, &bh); 429 err = nilfs_ifile_get_inode_block(root->ifile, ino, &bh);
436 if (unlikely(err)) 430 if (unlikely(err))
437 goto bad_inode; 431 goto bad_inode;
@@ -461,7 +455,7 @@ static int __nilfs_read_inode(struct super_block *sb,
461 } 455 }
462 nilfs_ifile_unmap_inode(root->ifile, ino, bh); 456 nilfs_ifile_unmap_inode(root->ifile, ino, bh);
463 brelse(bh); 457 brelse(bh);
464 up_read(&NILFS_MDT(dat)->mi_sem); /* XXX */ 458 up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
465 nilfs_set_inode_flags(inode); 459 nilfs_set_inode_flags(inode);
466 return 0; 460 return 0;
467 461
@@ -470,7 +464,7 @@ static int __nilfs_read_inode(struct super_block *sb,
470 brelse(bh); 464 brelse(bh);
471 465
472 bad_inode: 466 bad_inode:
473 up_read(&NILFS_MDT(dat)->mi_sem); /* XXX */ 467 up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
474 return err; 468 return err;
475} 469}
476 470
@@ -629,7 +623,7 @@ static void nilfs_truncate_bmap(struct nilfs_inode_info *ii,
629 623
630 if (!test_bit(NILFS_I_BMAP, &ii->i_state)) 624 if (!test_bit(NILFS_I_BMAP, &ii->i_state))
631 return; 625 return;
632 repeat: 626repeat:
633 ret = nilfs_bmap_last_key(ii->i_bmap, &b); 627 ret = nilfs_bmap_last_key(ii->i_bmap, &b);
634 if (ret == -ENOENT) 628 if (ret == -ENOENT)
635 return; 629 return;
@@ -646,14 +640,10 @@ static void nilfs_truncate_bmap(struct nilfs_inode_info *ii,
646 nilfs_bmap_truncate(ii->i_bmap, b) == 0)) 640 nilfs_bmap_truncate(ii->i_bmap, b) == 0))
647 goto repeat; 641 goto repeat;
648 642
649 failed: 643failed:
650 if (ret == -EINVAL) 644 nilfs_warning(ii->vfs_inode.i_sb, __func__,
651 nilfs_error(ii->vfs_inode.i_sb, __func__, 645 "failed to truncate bmap (ino=%lu, err=%d)",
652 "bmap is broken (ino=%lu)", ii->vfs_inode.i_ino); 646 ii->vfs_inode.i_ino, ret);
653 else
654 nilfs_warning(ii->vfs_inode.i_sb, __func__,
655 "failed to truncate bmap (ino=%lu, err=%d)",
656 ii->vfs_inode.i_ino, ret);
657} 647}
658 648
659void nilfs_truncate(struct inode *inode) 649void nilfs_truncate(struct inode *inode)
@@ -682,7 +672,7 @@ void nilfs_truncate(struct inode *inode)
682 nilfs_set_transaction_flag(NILFS_TI_SYNC); 672 nilfs_set_transaction_flag(NILFS_TI_SYNC);
683 673
684 nilfs_mark_inode_dirty(inode); 674 nilfs_mark_inode_dirty(inode);
685 nilfs_set_file_dirty(NILFS_SB(sb), inode, 0); 675 nilfs_set_file_dirty(inode, 0);
686 nilfs_transaction_commit(sb); 676 nilfs_transaction_commit(sb);
687 /* May construct a logical segment and may fail in sync mode. 677 /* May construct a logical segment and may fail in sync mode.
688 But truncate has no return value. */ 678 But truncate has no return value. */
@@ -800,9 +790,9 @@ int nilfs_permission(struct inode *inode, int mask, unsigned int flags)
800 return generic_permission(inode, mask, flags, NULL); 790 return generic_permission(inode, mask, flags, NULL);
801} 791}
802 792
803int nilfs_load_inode_block(struct nilfs_sb_info *sbi, struct inode *inode, 793int nilfs_load_inode_block(struct inode *inode, struct buffer_head **pbh)
804 struct buffer_head **pbh)
805{ 794{
795 struct nilfs_sb_info *sbi = NILFS_SB(inode->i_sb);
806 struct nilfs_inode_info *ii = NILFS_I(inode); 796 struct nilfs_inode_info *ii = NILFS_I(inode);
807 int err; 797 int err;
808 798
@@ -843,9 +833,9 @@ int nilfs_inode_dirty(struct inode *inode)
843 return ret; 833 return ret;
844} 834}
845 835
846int nilfs_set_file_dirty(struct nilfs_sb_info *sbi, struct inode *inode, 836int nilfs_set_file_dirty(struct inode *inode, unsigned nr_dirty)
847 unsigned nr_dirty)
848{ 837{
838 struct nilfs_sb_info *sbi = NILFS_SB(inode->i_sb);
849 struct nilfs_inode_info *ii = NILFS_I(inode); 839 struct nilfs_inode_info *ii = NILFS_I(inode);
850 840
851 atomic_add(nr_dirty, &sbi->s_nilfs->ns_ndirtyblks); 841 atomic_add(nr_dirty, &sbi->s_nilfs->ns_ndirtyblks);
@@ -878,11 +868,10 @@ int nilfs_set_file_dirty(struct nilfs_sb_info *sbi, struct inode *inode,
878 868
879int nilfs_mark_inode_dirty(struct inode *inode) 869int nilfs_mark_inode_dirty(struct inode *inode)
880{ 870{
881 struct nilfs_sb_info *sbi = NILFS_SB(inode->i_sb);
882 struct buffer_head *ibh; 871 struct buffer_head *ibh;
883 int err; 872 int err;
884 873
885 err = nilfs_load_inode_block(sbi, inode, &ibh); 874 err = nilfs_load_inode_block(inode, &ibh);
886 if (unlikely(err)) { 875 if (unlikely(err)) {
887 nilfs_warning(inode->i_sb, __func__, 876 nilfs_warning(inode->i_sb, __func__,
888 "failed to reget inode block.\n"); 877 "failed to reget inode block.\n");
@@ -924,3 +913,134 @@ void nilfs_dirty_inode(struct inode *inode)
924 nilfs_mark_inode_dirty(inode); 913 nilfs_mark_inode_dirty(inode);
925 nilfs_transaction_commit(inode->i_sb); /* never fails */ 914 nilfs_transaction_commit(inode->i_sb); /* never fails */
926} 915}
916
917int nilfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
918 __u64 start, __u64 len)
919{
920 struct the_nilfs *nilfs = NILFS_I_NILFS(inode);
921 __u64 logical = 0, phys = 0, size = 0;
922 __u32 flags = 0;
923 loff_t isize;
924 sector_t blkoff, end_blkoff;
925 sector_t delalloc_blkoff;
926 unsigned long delalloc_blklen;
927 unsigned int blkbits = inode->i_blkbits;
928 int ret, n;
929
930 ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC);
931 if (ret)
932 return ret;
933
934 mutex_lock(&inode->i_mutex);
935
936 isize = i_size_read(inode);
937
938 blkoff = start >> blkbits;
939 end_blkoff = (start + len - 1) >> blkbits;
940
941 delalloc_blklen = nilfs_find_uncommitted_extent(inode, blkoff,
942 &delalloc_blkoff);
943
944 do {
945 __u64 blkphy;
946 unsigned int maxblocks;
947
948 if (delalloc_blklen && blkoff == delalloc_blkoff) {
949 if (size) {
950 /* End of the current extent */
951 ret = fiemap_fill_next_extent(
952 fieinfo, logical, phys, size, flags);
953 if (ret)
954 break;
955 }
956 if (blkoff > end_blkoff)
957 break;
958
959 flags = FIEMAP_EXTENT_MERGED | FIEMAP_EXTENT_DELALLOC;
960 logical = blkoff << blkbits;
961 phys = 0;
962 size = delalloc_blklen << blkbits;
963
964 blkoff = delalloc_blkoff + delalloc_blklen;
965 delalloc_blklen = nilfs_find_uncommitted_extent(
966 inode, blkoff, &delalloc_blkoff);
967 continue;
968 }
969
970 /*
971 * Limit the number of blocks that we look up so as
972 * not to get into the next delayed allocation extent.
973 */
974 maxblocks = INT_MAX;
975 if (delalloc_blklen)
976 maxblocks = min_t(sector_t, delalloc_blkoff - blkoff,
977 maxblocks);
978 blkphy = 0;
979
980 down_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
981 n = nilfs_bmap_lookup_contig(
982 NILFS_I(inode)->i_bmap, blkoff, &blkphy, maxblocks);
983 up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
984
985 if (n < 0) {
986 int past_eof;
987
988 if (unlikely(n != -ENOENT))
989 break; /* error */
990
991 /* HOLE */
992 blkoff++;
993 past_eof = ((blkoff << blkbits) >= isize);
994
995 if (size) {
996 /* End of the current extent */
997
998 if (past_eof)
999 flags |= FIEMAP_EXTENT_LAST;
1000
1001 ret = fiemap_fill_next_extent(
1002 fieinfo, logical, phys, size, flags);
1003 if (ret)
1004 break;
1005 size = 0;
1006 }
1007 if (blkoff > end_blkoff || past_eof)
1008 break;
1009 } else {
1010 if (size) {
1011 if (phys && blkphy << blkbits == phys + size) {
1012 /* The current extent goes on */
1013 size += n << blkbits;
1014 } else {
1015 /* Terminate the current extent */
1016 ret = fiemap_fill_next_extent(
1017 fieinfo, logical, phys, size,
1018 flags);
1019 if (ret || blkoff > end_blkoff)
1020 break;
1021
1022 /* Start another extent */
1023 flags = FIEMAP_EXTENT_MERGED;
1024 logical = blkoff << blkbits;
1025 phys = blkphy << blkbits;
1026 size = n << blkbits;
1027 }
1028 } else {
1029 /* Start a new extent */
1030 flags = FIEMAP_EXTENT_MERGED;
1031 logical = blkoff << blkbits;
1032 phys = blkphy << blkbits;
1033 size = n << blkbits;
1034 }
1035 blkoff += n;
1036 }
1037 cond_resched();
1038 } while (true);
1039
1040 /* If ret is 1 then we just hit the end of the extent array */
1041 if (ret == 1)
1042 ret = 0;
1043
1044 mutex_unlock(&inode->i_mutex);
1045 return ret;
1046}
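With ->fiemap wired into nilfs_file_inode_operations above and nilfs_dir_inode_operations below, userspace can query nilfs2 extents through the generic ioctl. A minimal caller sketch (error handling trimmed; FS_IOC_FIEMAP comes from <linux/fs.h>, struct fiemap from <linux/fiemap.h>):

        #include <linux/fiemap.h>
        #include <linux/fs.h>
        #include <sys/ioctl.h>
        #include <stdlib.h>

        static int dump_first_extent(int fd)
        {
                struct fiemap *fm;

                /* room for the header plus one returned extent */
                fm = calloc(1, sizeof(*fm) + sizeof(struct fiemap_extent));
                if (!fm)
                        return -1;
                fm->fm_length = FIEMAP_MAX_OFFSET;      /* whole file */
                fm->fm_flags = FIEMAP_FLAG_SYNC;        /* the only flag checked above */
                fm->fm_extent_count = 1;
                if (ioctl(fd, FS_IOC_FIEMAP, fm) < 0) {
                        free(fm);
                        return -1;
                }
                /* fm->fm_mapped_extents / fm->fm_extents[0] now describe the file */
                free(fm);
                return 0;
        }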
diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index b185e937a335..496738963fdb 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -233,7 +233,7 @@ nilfs_ioctl_do_get_vinfo(struct the_nilfs *nilfs, __u64 *posp, int flags,
233 int ret; 233 int ret;
234 234
235 down_read(&nilfs->ns_segctor_sem); 235 down_read(&nilfs->ns_segctor_sem);
236 ret = nilfs_dat_get_vinfo(nilfs_dat_inode(nilfs), buf, size, nmembs); 236 ret = nilfs_dat_get_vinfo(nilfs->ns_dat, buf, size, nmembs);
237 up_read(&nilfs->ns_segctor_sem); 237 up_read(&nilfs->ns_segctor_sem);
238 return ret; 238 return ret;
239} 239}
@@ -242,8 +242,7 @@ static ssize_t
242nilfs_ioctl_do_get_bdescs(struct the_nilfs *nilfs, __u64 *posp, int flags, 242nilfs_ioctl_do_get_bdescs(struct the_nilfs *nilfs, __u64 *posp, int flags,
243 void *buf, size_t size, size_t nmembs) 243 void *buf, size_t size, size_t nmembs)
244{ 244{
245 struct inode *dat = nilfs_dat_inode(nilfs); 245 struct nilfs_bmap *bmap = NILFS_I(nilfs->ns_dat)->i_bmap;
246 struct nilfs_bmap *bmap = NILFS_I(dat)->i_bmap;
247 struct nilfs_bdesc *bdescs = buf; 246 struct nilfs_bdesc *bdescs = buf;
248 int ret, i; 247 int ret, i;
249 248
@@ -421,7 +420,7 @@ static int nilfs_ioctl_free_vblocknrs(struct the_nilfs *nilfs,
421 size_t nmembs = argv->v_nmembs; 420 size_t nmembs = argv->v_nmembs;
422 int ret; 421 int ret;
423 422
424 ret = nilfs_dat_freev(nilfs_dat_inode(nilfs), buf, nmembs); 423 ret = nilfs_dat_freev(nilfs->ns_dat, buf, nmembs);
425 424
426 return (ret < 0) ? ret : nmembs; 425 return (ret < 0) ? ret : nmembs;
427} 426}
@@ -430,8 +429,7 @@ static int nilfs_ioctl_mark_blocks_dirty(struct the_nilfs *nilfs,
430 struct nilfs_argv *argv, void *buf) 429 struct nilfs_argv *argv, void *buf)
431{ 430{
432 size_t nmembs = argv->v_nmembs; 431 size_t nmembs = argv->v_nmembs;
433 struct inode *dat = nilfs_dat_inode(nilfs); 432 struct nilfs_bmap *bmap = NILFS_I(nilfs->ns_dat)->i_bmap;
434 struct nilfs_bmap *bmap = NILFS_I(dat)->i_bmap;
435 struct nilfs_bdesc *bdescs = buf; 433 struct nilfs_bdesc *bdescs = buf;
436 int ret, i; 434 int ret, i;
437 435
@@ -450,7 +448,7 @@ static int nilfs_ioctl_mark_blocks_dirty(struct the_nilfs *nilfs,
450 /* skip dead block */ 448 /* skip dead block */
451 continue; 449 continue;
452 if (bdescs[i].bd_level == 0) { 450 if (bdescs[i].bd_level == 0) {
453 ret = nilfs_mdt_mark_block_dirty(dat, 451 ret = nilfs_mdt_mark_block_dirty(nilfs->ns_dat,
454 bdescs[i].bd_offset); 452 bdescs[i].bd_offset);
455 if (ret < 0) { 453 if (ret < 0) {
456 WARN_ON(ret == -ENOENT); 454 WARN_ON(ret == -ENOENT);
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index 39a5b84e2c9f..6a0e2a189f60 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -237,8 +237,6 @@ static int nilfs_mdt_read_block(struct inode *inode, unsigned long block,
237 * 237 *
238 * %-ENOENT - the specified block does not exist (hole block) 238 * %-ENOENT - the specified block does not exist (hole block)
239 * 239 *
240 * %-EINVAL - bmap is broken. (the caller should call nilfs_error())
241 *
242 * %-EROFS - Read only filesystem (for create mode) 240 * %-EROFS - Read only filesystem (for create mode)
243 */ 241 */
244int nilfs_mdt_get_block(struct inode *inode, unsigned long blkoff, int create, 242int nilfs_mdt_get_block(struct inode *inode, unsigned long blkoff, int create,
@@ -273,8 +271,6 @@ int nilfs_mdt_get_block(struct inode *inode, unsigned long blkoff, int create,
273 * %-ENOMEM - Insufficient memory available. 271 * %-ENOMEM - Insufficient memory available.
274 * 272 *
275 * %-EIO - I/O error 273 * %-EIO - I/O error
276 *
277 * %-EINVAL - bmap is broken. (the caller should call nilfs_error())
278 */ 274 */
279int nilfs_mdt_delete_block(struct inode *inode, unsigned long block) 275int nilfs_mdt_delete_block(struct inode *inode, unsigned long block)
280{ 276{
@@ -350,8 +346,6 @@ int nilfs_mdt_forget_block(struct inode *inode, unsigned long block)
350 * %-EIO - I/O error 346 * %-EIO - I/O error
351 * 347 *
352 * %-ENOENT - the specified block does not exist (hole block) 348 * %-ENOENT - the specified block does not exist (hole block)
353 *
354 * %-EINVAL - bmap is broken. (the caller should call nilfs_error())
355 */ 349 */
356int nilfs_mdt_mark_block_dirty(struct inode *inode, unsigned long block) 350int nilfs_mdt_mark_block_dirty(struct inode *inode, unsigned long block)
357{ 351{
@@ -499,31 +493,29 @@ int nilfs_mdt_freeze_buffer(struct inode *inode, struct buffer_head *bh)
499 struct buffer_head *bh_frozen; 493 struct buffer_head *bh_frozen;
500 struct page *page; 494 struct page *page;
501 int blkbits = inode->i_blkbits; 495 int blkbits = inode->i_blkbits;
502 int ret = -ENOMEM;
503 496
504 page = grab_cache_page(&shadow->frozen_data, bh->b_page->index); 497 page = grab_cache_page(&shadow->frozen_data, bh->b_page->index);
505 if (!page) 498 if (!page)
506 return ret; 499 return -ENOMEM;
507 500
508 if (!page_has_buffers(page)) 501 if (!page_has_buffers(page))
509 create_empty_buffers(page, 1 << blkbits, 0); 502 create_empty_buffers(page, 1 << blkbits, 0);
510 503
511 bh_frozen = nilfs_page_get_nth_block(page, bh_offset(bh) >> blkbits); 504 bh_frozen = nilfs_page_get_nth_block(page, bh_offset(bh) >> blkbits);
512 if (bh_frozen) { 505
513 if (!buffer_uptodate(bh_frozen)) 506 if (!buffer_uptodate(bh_frozen))
514 nilfs_copy_buffer(bh_frozen, bh); 507 nilfs_copy_buffer(bh_frozen, bh);
515 if (list_empty(&bh_frozen->b_assoc_buffers)) { 508 if (list_empty(&bh_frozen->b_assoc_buffers)) {
516 list_add_tail(&bh_frozen->b_assoc_buffers, 509 list_add_tail(&bh_frozen->b_assoc_buffers,
517 &shadow->frozen_buffers); 510 &shadow->frozen_buffers);
518 set_buffer_nilfs_redirected(bh); 511 set_buffer_nilfs_redirected(bh);
519 } else { 512 } else {
520 brelse(bh_frozen); /* already frozen */ 513 brelse(bh_frozen); /* already frozen */
521 }
522 ret = 0;
523 } 514 }
515
524 unlock_page(page); 516 unlock_page(page);
525 page_cache_release(page); 517 page_cache_release(page);
526 return ret; 518 return 0;
527} 519}
528 520
529struct buffer_head * 521struct buffer_head *
diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c
index 6e9557ecf161..98034271cd02 100644
--- a/fs/nilfs2/namei.c
+++ b/fs/nilfs2/namei.c
@@ -577,6 +577,7 @@ const struct inode_operations nilfs_dir_inode_operations = {
577 .rename = nilfs_rename, 577 .rename = nilfs_rename,
578 .setattr = nilfs_setattr, 578 .setattr = nilfs_setattr,
579 .permission = nilfs_permission, 579 .permission = nilfs_permission,
580 .fiemap = nilfs_fiemap,
580}; 581};
581 582
582const struct inode_operations nilfs_special_inode_operations = { 583const struct inode_operations nilfs_special_inode_operations = {
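The .fiemap hook added above is reached through the standard FS_IOC_FIEMAP ioctl, so the new nilfs2 handler needs no filesystem-specific userspace. A minimal caller, as a sketch (the mount path is hypothetical and error handling is trimmed):

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/fs.h>
#include <linux/fiemap.h>

int main(void)
{
	struct fiemap *fm;
	int fd = open("/mnt/nilfs2/file", O_RDONLY);	/* hypothetical path */

	if (fd < 0)
		return 1;
	/* header plus room for 32 extent records */
	fm = calloc(1, sizeof(*fm) + 32 * sizeof(struct fiemap_extent));
	fm->fm_start = 0;
	fm->fm_length = ~0ULL;		/* map the whole file */
	fm->fm_extent_count = 32;
	if (ioctl(fd, FS_IOC_FIEMAP, fm) == 0)
		printf("%u extents mapped\n", fm->fm_mapped_extents);
	free(fm);
	return 0;
}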
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index 0ca98823db59..777e8fd04304 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -190,11 +190,6 @@ static inline int nilfs_doing_construction(void)
190 return nilfs_test_transaction_flag(NILFS_TI_WRITER); 190 return nilfs_test_transaction_flag(NILFS_TI_WRITER);
191} 191}
192 192
193static inline struct inode *nilfs_dat_inode(const struct the_nilfs *nilfs)
194{
195 return nilfs->ns_dat;
196}
197
198/* 193/*
199 * function prototype 194 * function prototype
200 */ 195 */
@@ -257,13 +252,13 @@ extern void nilfs_truncate(struct inode *);
257extern void nilfs_evict_inode(struct inode *); 252extern void nilfs_evict_inode(struct inode *);
258extern int nilfs_setattr(struct dentry *, struct iattr *); 253extern int nilfs_setattr(struct dentry *, struct iattr *);
259int nilfs_permission(struct inode *inode, int mask, unsigned int flags); 254int nilfs_permission(struct inode *inode, int mask, unsigned int flags);
260extern int nilfs_load_inode_block(struct nilfs_sb_info *, struct inode *, 255int nilfs_load_inode_block(struct inode *inode, struct buffer_head **pbh);
261 struct buffer_head **);
262extern int nilfs_inode_dirty(struct inode *); 256extern int nilfs_inode_dirty(struct inode *);
263extern int nilfs_set_file_dirty(struct nilfs_sb_info *, struct inode *, 257int nilfs_set_file_dirty(struct inode *inode, unsigned nr_dirty);
264 unsigned);
265extern int nilfs_mark_inode_dirty(struct inode *); 258extern int nilfs_mark_inode_dirty(struct inode *);
266extern void nilfs_dirty_inode(struct inode *); 259extern void nilfs_dirty_inode(struct inode *);
260int nilfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
261 __u64 start, __u64 len);
267 262
268/* super.c */ 263/* super.c */
269extern struct inode *nilfs_alloc_inode(struct super_block *); 264extern struct inode *nilfs_alloc_inode(struct super_block *);
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index a6c3c2e817f8..0c432416cfef 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -491,7 +491,7 @@ unsigned nilfs_page_count_clean_buffers(struct page *page,
491 } 491 }
492 return nc; 492 return nc;
493} 493}
494 494
495void nilfs_mapping_init_once(struct address_space *mapping) 495void nilfs_mapping_init_once(struct address_space *mapping)
496{ 496{
497 memset(mapping, 0, sizeof(*mapping)); 497 memset(mapping, 0, sizeof(*mapping));
@@ -546,3 +546,87 @@ int __nilfs_clear_page_dirty(struct page *page)
546 } 546 }
547 return TestClearPageDirty(page); 547 return TestClearPageDirty(page);
548} 548}
549
550/**
551 * nilfs_find_uncommitted_extent - find extent of uncommitted data
552 * @inode: inode
553 * @start_blk: start block offset (in)
554 * @blkoff: start offset of the found extent (out)
555 *
556 * This function searches an extent of buffers marked "delayed" which
557 * starts from a block offset equal to or larger than @start_blk. If
558 * such an extent was found, this will store the start offset in
559 * @blkoff and return its length in blocks. Otherwise, zero is
560 * returned.
561 */
562unsigned long nilfs_find_uncommitted_extent(struct inode *inode,
563 sector_t start_blk,
564 sector_t *blkoff)
565{
566 unsigned int i;
567 pgoff_t index;
568 unsigned int nblocks_in_page;
569 unsigned long length = 0;
570 sector_t b;
571 struct pagevec pvec;
572 struct page *page;
573
574 if (inode->i_mapping->nrpages == 0)
575 return 0;
576
577 index = start_blk >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
578 nblocks_in_page = 1U << (PAGE_CACHE_SHIFT - inode->i_blkbits);
579
580 pagevec_init(&pvec, 0);
581
582repeat:
583 pvec.nr = find_get_pages_contig(inode->i_mapping, index, PAGEVEC_SIZE,
584 pvec.pages);
585 if (pvec.nr == 0)
586 return length;
587
588 if (length > 0 && pvec.pages[0]->index > index)
589 goto out;
590
591 b = pvec.pages[0]->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
592 i = 0;
593 do {
594 page = pvec.pages[i];
595
596 lock_page(page);
597 if (page_has_buffers(page)) {
598 struct buffer_head *bh, *head;
599
600 bh = head = page_buffers(page);
601 do {
602 if (b < start_blk)
603 continue;
604 if (buffer_delay(bh)) {
605 if (length == 0)
606 *blkoff = b;
607 length++;
608 } else if (length > 0) {
609 goto out_locked;
610 }
611 } while (++b, bh = bh->b_this_page, bh != head);
612 } else {
613 if (length > 0)
614 goto out_locked;
615
616 b += nblocks_in_page;
617 }
618 unlock_page(page);
619
620 } while (++i < pagevec_count(&pvec));
621
622 index = page->index + 1;
623 pagevec_release(&pvec);
624 cond_resched();
625 goto repeat;
626
627out_locked:
628 unlock_page(page);
629out:
630 pagevec_release(&pvec);
631 return length;
632}
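nilfs_find_uncommitted_extent() above exists so the new nilfs_fiemap() can report delayed-allocated ranges that have no disk mapping yet. A sketch of the calling convention (report_extent() and the loop bounds are illustrative, not the actual fiemap code):

	sector_t blkoff, b = start_blk;
	unsigned long n;

	while (b < end_blk) {
		n = nilfs_find_uncommitted_extent(inode, b, &blkoff);
		if (n == 0)
			break;		/* no further delayed extents */
		/* blocks [blkoff, blkoff + n) are uncommitted */
		report_extent(blkoff, n);	/* hypothetical consumer */
		b = blkoff + n;
	}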
diff --git a/fs/nilfs2/page.h b/fs/nilfs2/page.h
index fb9e8a8a2038..622df27cd891 100644
--- a/fs/nilfs2/page.h
+++ b/fs/nilfs2/page.h
@@ -66,6 +66,9 @@ void nilfs_mapping_init(struct address_space *mapping,
66 struct backing_dev_info *bdi, 66 struct backing_dev_info *bdi,
67 const struct address_space_operations *aops); 67 const struct address_space_operations *aops);
68unsigned nilfs_page_count_clean_buffers(struct page *, unsigned, unsigned); 68unsigned nilfs_page_count_clean_buffers(struct page *, unsigned, unsigned);
69unsigned long nilfs_find_uncommitted_extent(struct inode *inode,
70 sector_t start_blk,
71 sector_t *blkoff);
69 72
70#define NILFS_PAGE_BUG(page, m, a...) \ 73#define NILFS_PAGE_BUG(page, m, a...) \
71 do { nilfs_page_bug(page); BUG(); } while (0) 74 do { nilfs_page_bug(page); BUG(); } while (0)
diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c
index 5d2711c28da7..3dfcd3b7d389 100644
--- a/fs/nilfs2/recovery.c
+++ b/fs/nilfs2/recovery.c
@@ -535,7 +535,7 @@ static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs,
535 if (unlikely(err)) 535 if (unlikely(err))
536 goto failed_page; 536 goto failed_page;
537 537
538 err = nilfs_set_file_dirty(sbi, inode, 1); 538 err = nilfs_set_file_dirty(inode, 1);
539 if (unlikely(err)) 539 if (unlikely(err))
540 goto failed_page; 540 goto failed_page;
541 541
diff --git a/fs/nilfs2/sb.h b/fs/nilfs2/sb.h
index 35a07157b980..7a17715f215f 100644
--- a/fs/nilfs2/sb.h
+++ b/fs/nilfs2/sb.h
@@ -27,14 +27,6 @@
27#include <linux/types.h> 27#include <linux/types.h>
28#include <linux/fs.h> 28#include <linux/fs.h>
29 29
30/*
31 * Mount options
32 */
33struct nilfs_mount_options {
34 unsigned long mount_opt;
35 __u64 snapshot_cno;
36};
37
38struct the_nilfs; 30struct the_nilfs;
39struct nilfs_sc_info; 31struct nilfs_sc_info;
40 32
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 687d090cea34..55ebae5c7f39 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -504,17 +504,6 @@ static int nilfs_segctor_add_file_block(struct nilfs_sc_info *sci,
504 return err; 504 return err;
505} 505}
506 506
507static int nilfs_handle_bmap_error(int err, const char *fname,
508 struct inode *inode, struct super_block *sb)
509{
510 if (err == -EINVAL) {
511 nilfs_error(sb, fname, "broken bmap (inode=%lu)\n",
512 inode->i_ino);
513 err = -EIO;
514 }
515 return err;
516}
517
518/* 507/*
519 * Callback functions that enumerate, mark, and collect dirty blocks 508 * Callback functions that enumerate, mark, and collect dirty blocks
520 */ 509 */
@@ -524,9 +513,8 @@ static int nilfs_collect_file_data(struct nilfs_sc_info *sci,
524 int err; 513 int err;
525 514
526 err = nilfs_bmap_propagate(NILFS_I(inode)->i_bmap, bh); 515 err = nilfs_bmap_propagate(NILFS_I(inode)->i_bmap, bh);
527 if (unlikely(err < 0)) 516 if (err < 0)
528 return nilfs_handle_bmap_error(err, __func__, inode, 517 return err;
529 sci->sc_super);
530 518
531 err = nilfs_segctor_add_file_block(sci, bh, inode, 519 err = nilfs_segctor_add_file_block(sci, bh, inode,
532 sizeof(struct nilfs_binfo_v)); 520 sizeof(struct nilfs_binfo_v));
@@ -539,13 +527,7 @@ static int nilfs_collect_file_node(struct nilfs_sc_info *sci,
539 struct buffer_head *bh, 527 struct buffer_head *bh,
540 struct inode *inode) 528 struct inode *inode)
541{ 529{
542 int err; 530 return nilfs_bmap_propagate(NILFS_I(inode)->i_bmap, bh);
543
544 err = nilfs_bmap_propagate(NILFS_I(inode)->i_bmap, bh);
545 if (unlikely(err < 0))
546 return nilfs_handle_bmap_error(err, __func__, inode,
547 sci->sc_super);
548 return 0;
549} 531}
550 532
551static int nilfs_collect_file_bmap(struct nilfs_sc_info *sci, 533static int nilfs_collect_file_bmap(struct nilfs_sc_info *sci,
@@ -588,9 +570,8 @@ static int nilfs_collect_dat_data(struct nilfs_sc_info *sci,
588 int err; 570 int err;
589 571
590 err = nilfs_bmap_propagate(NILFS_I(inode)->i_bmap, bh); 572 err = nilfs_bmap_propagate(NILFS_I(inode)->i_bmap, bh);
591 if (unlikely(err < 0)) 573 if (err < 0)
592 return nilfs_handle_bmap_error(err, __func__, inode, 574 return err;
593 sci->sc_super);
594 575
595 err = nilfs_segctor_add_file_block(sci, bh, inode, sizeof(__le64)); 576 err = nilfs_segctor_add_file_block(sci, bh, inode, sizeof(__le64));
596 if (!err) 577 if (!err)
@@ -776,9 +757,8 @@ static int nilfs_test_metadata_dirty(struct the_nilfs *nilfs,
776 ret++; 757 ret++;
777 if (nilfs_mdt_fetch_dirty(nilfs->ns_sufile)) 758 if (nilfs_mdt_fetch_dirty(nilfs->ns_sufile))
778 ret++; 759 ret++;
779 if (ret || nilfs_doing_gc()) 760 if ((ret || nilfs_doing_gc()) && nilfs_mdt_fetch_dirty(nilfs->ns_dat))
780 if (nilfs_mdt_fetch_dirty(nilfs_dat_inode(nilfs))) 761 ret++;
781 ret++;
782 return ret; 762 return ret;
783} 763}
784 764
@@ -814,7 +794,7 @@ static void nilfs_segctor_clear_metadata_dirty(struct nilfs_sc_info *sci)
814 nilfs_mdt_clear_dirty(sci->sc_root->ifile); 794 nilfs_mdt_clear_dirty(sci->sc_root->ifile);
815 nilfs_mdt_clear_dirty(nilfs->ns_cpfile); 795 nilfs_mdt_clear_dirty(nilfs->ns_cpfile);
816 nilfs_mdt_clear_dirty(nilfs->ns_sufile); 796 nilfs_mdt_clear_dirty(nilfs->ns_sufile);
817 nilfs_mdt_clear_dirty(nilfs_dat_inode(nilfs)); 797 nilfs_mdt_clear_dirty(nilfs->ns_dat);
818} 798}
819 799
820static int nilfs_segctor_create_checkpoint(struct nilfs_sc_info *sci) 800static int nilfs_segctor_create_checkpoint(struct nilfs_sc_info *sci)
@@ -923,7 +903,7 @@ static void nilfs_segctor_fill_in_super_root(struct nilfs_sc_info *sci,
923 nilfs->ns_nongc_ctime : sci->sc_seg_ctime); 903 nilfs->ns_nongc_ctime : sci->sc_seg_ctime);
924 raw_sr->sr_flags = 0; 904 raw_sr->sr_flags = 0;
925 905
926 nilfs_write_inode_common(nilfs_dat_inode(nilfs), (void *)raw_sr + 906 nilfs_write_inode_common(nilfs->ns_dat, (void *)raw_sr +
927 NILFS_SR_DAT_OFFSET(isz), 1); 907 NILFS_SR_DAT_OFFSET(isz), 1);
928 nilfs_write_inode_common(nilfs->ns_cpfile, (void *)raw_sr + 908 nilfs_write_inode_common(nilfs->ns_cpfile, (void *)raw_sr +
929 NILFS_SR_CPFILE_OFFSET(isz), 1); 909 NILFS_SR_CPFILE_OFFSET(isz), 1);
@@ -1179,7 +1159,7 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode)
1179 sci->sc_stage.scnt++; /* Fall through */ 1159 sci->sc_stage.scnt++; /* Fall through */
1180 case NILFS_ST_DAT: 1160 case NILFS_ST_DAT:
1181 dat_stage: 1161 dat_stage:
1182 err = nilfs_segctor_scan_file(sci, nilfs_dat_inode(nilfs), 1162 err = nilfs_segctor_scan_file(sci, nilfs->ns_dat,
1183 &nilfs_sc_dat_ops); 1163 &nilfs_sc_dat_ops);
1184 if (unlikely(err)) 1164 if (unlikely(err))
1185 break; 1165 break;
@@ -1563,7 +1543,6 @@ nilfs_segctor_update_payload_blocknr(struct nilfs_sc_info *sci,
1563 return 0; 1543 return 0;
1564 1544
1565 failed_bmap: 1545 failed_bmap:
1566 err = nilfs_handle_bmap_error(err, __func__, inode, sci->sc_super);
1567 return err; 1546 return err;
1568} 1547}
1569 1548
@@ -1783,6 +1762,7 @@ static void nilfs_clear_copied_buffers(struct list_head *list, int err)
1783 if (!err) { 1762 if (!err) {
1784 set_buffer_uptodate(bh); 1763 set_buffer_uptodate(bh);
1785 clear_buffer_dirty(bh); 1764 clear_buffer_dirty(bh);
1765 clear_buffer_delay(bh);
1786 clear_buffer_nilfs_volatile(bh); 1766 clear_buffer_nilfs_volatile(bh);
1787 } 1767 }
1788 brelse(bh); /* for b_assoc_buffers */ 1768 brelse(bh); /* for b_assoc_buffers */
@@ -1909,6 +1889,7 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
1909 b_assoc_buffers) { 1889 b_assoc_buffers) {
1910 set_buffer_uptodate(bh); 1890 set_buffer_uptodate(bh);
1911 clear_buffer_dirty(bh); 1891 clear_buffer_dirty(bh);
1892 clear_buffer_delay(bh);
1912 clear_buffer_nilfs_volatile(bh); 1893 clear_buffer_nilfs_volatile(bh);
1913 clear_buffer_nilfs_redirected(bh); 1894 clear_buffer_nilfs_redirected(bh);
1914 if (bh == segbuf->sb_super_root) { 1895 if (bh == segbuf->sb_super_root) {
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index e2dcc9c733f7..0994f6a76c07 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -47,7 +47,6 @@
47#include <linux/crc32.h> 47#include <linux/crc32.h>
48#include <linux/vfs.h> 48#include <linux/vfs.h>
49#include <linux/writeback.h> 49#include <linux/writeback.h>
50#include <linux/kobject.h>
51#include <linux/seq_file.h> 50#include <linux/seq_file.h>
52#include <linux/mount.h> 51#include <linux/mount.h>
53#include "nilfs.h" 52#include "nilfs.h"
@@ -111,12 +110,17 @@ void nilfs_error(struct super_block *sb, const char *function,
111 const char *fmt, ...) 110 const char *fmt, ...)
112{ 111{
113 struct nilfs_sb_info *sbi = NILFS_SB(sb); 112 struct nilfs_sb_info *sbi = NILFS_SB(sb);
113 struct va_format vaf;
114 va_list args; 114 va_list args;
115 115
116 va_start(args, fmt); 116 va_start(args, fmt);
117 printk(KERN_CRIT "NILFS error (device %s): %s: ", sb->s_id, function); 117
118 vprintk(fmt, args); 118 vaf.fmt = fmt;
119 printk("\n"); 119 vaf.va = &args;
120
121 printk(KERN_CRIT "NILFS error (device %s): %s: %pV\n",
122 sb->s_id, function, &vaf);
123
120 va_end(args); 124 va_end(args);
121 125
122 if (!(sb->s_flags & MS_RDONLY)) { 126 if (!(sb->s_flags & MS_RDONLY)) {
@@ -136,13 +140,17 @@ void nilfs_error(struct super_block *sb, const char *function,
136void nilfs_warning(struct super_block *sb, const char *function, 140void nilfs_warning(struct super_block *sb, const char *function,
137 const char *fmt, ...) 141 const char *fmt, ...)
138{ 142{
143 struct va_format vaf;
139 va_list args; 144 va_list args;
140 145
141 va_start(args, fmt); 146 va_start(args, fmt);
142 printk(KERN_WARNING "NILFS warning (device %s): %s: ", 147
143 sb->s_id, function); 148 vaf.fmt = fmt;
144 vprintk(fmt, args); 149 vaf.va = &args;
145 printk("\n"); 150
151 printk(KERN_WARNING "NILFS warning (device %s): %s: %pV\n",
152 sb->s_id, function, &vaf);
153
146 va_end(args); 154 va_end(args);
147} 155}
148 156
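Both conversions above use the %pV printk extension: the caller's format/argument pair is wrapped in a struct va_format and expanded inside a single printk() call, so the device prefix and the message can no longer be interleaved by concurrent printks. The idiom in isolation, as a generic sketch rather than the nilfs code:

#include <linux/kernel.h>

void my_warn(const char *prefix, const char *fmt, ...)
{
	struct va_format vaf;
	va_list args;

	va_start(args, fmt);
	vaf.fmt = fmt;
	vaf.va = &args;
	/* %pV expands the embedded format and arguments in place */
	printk(KERN_WARNING "%s: %pV\n", prefix, &vaf);
	va_end(args);
}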
@@ -1010,11 +1018,11 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
1010 struct nilfs_sb_info *sbi = NILFS_SB(sb); 1018 struct nilfs_sb_info *sbi = NILFS_SB(sb);
1011 struct the_nilfs *nilfs = sbi->s_nilfs; 1019 struct the_nilfs *nilfs = sbi->s_nilfs;
1012 unsigned long old_sb_flags; 1020 unsigned long old_sb_flags;
1013 struct nilfs_mount_options old_opts; 1021 unsigned long old_mount_opt;
1014 int err; 1022 int err;
1015 1023
1016 old_sb_flags = sb->s_flags; 1024 old_sb_flags = sb->s_flags;
1017 old_opts.mount_opt = sbi->s_mount_opt; 1025 old_mount_opt = sbi->s_mount_opt;
1018 1026
1019 if (!parse_options(data, sb, 1)) { 1027 if (!parse_options(data, sb, 1)) {
1020 err = -EINVAL; 1028 err = -EINVAL;
@@ -1083,7 +1091,7 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
1083 1091
1084 restore_opts: 1092 restore_opts:
1085 sb->s_flags = old_sb_flags; 1093 sb->s_flags = old_sb_flags;
1086 sbi->s_mount_opt = old_opts.mount_opt; 1094 sbi->s_mount_opt = old_mount_opt;
1087 return err; 1095 return err;
1088} 1096}
1089 1097
@@ -1155,14 +1163,14 @@ nilfs_mount(struct file_system_type *fs_type, int flags,
1155{ 1163{
1156 struct nilfs_super_data sd; 1164 struct nilfs_super_data sd;
1157 struct super_block *s; 1165 struct super_block *s;
1158 fmode_t mode = FMODE_READ; 1166 fmode_t mode = FMODE_READ | FMODE_EXCL;
1159 struct dentry *root_dentry; 1167 struct dentry *root_dentry;
1160 int err, s_new = false; 1168 int err, s_new = false;
1161 1169
1162 if (!(flags & MS_RDONLY)) 1170 if (!(flags & MS_RDONLY))
1163 mode |= FMODE_WRITE; 1171 mode |= FMODE_WRITE;
1164 1172
1165 sd.bdev = open_bdev_exclusive(dev_name, mode, fs_type); 1173 sd.bdev = blkdev_get_by_path(dev_name, mode, fs_type);
1166 if (IS_ERR(sd.bdev)) 1174 if (IS_ERR(sd.bdev))
1167 return ERR_CAST(sd.bdev); 1175 return ERR_CAST(sd.bdev);
1168 1176
@@ -1241,7 +1249,7 @@ nilfs_mount(struct file_system_type *fs_type, int flags,
1241 } 1249 }
1242 1250
1243 if (!s_new) 1251 if (!s_new)
1244 close_bdev_exclusive(sd.bdev, mode); 1252 blkdev_put(sd.bdev, mode);
1245 1253
1246 return root_dentry; 1254 return root_dentry;
1247 1255
@@ -1250,7 +1258,7 @@ nilfs_mount(struct file_system_type *fs_type, int flags,
1250 1258
1251 failed: 1259 failed:
1252 if (!s_new) 1260 if (!s_new)
1253 close_bdev_exclusive(sd.bdev, mode); 1261 blkdev_put(sd.bdev, mode);
1254 return ERR_PTR(err); 1262 return ERR_PTR(err);
1255} 1263}
1256 1264
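This hunk tracks the block-layer API change that folded exclusive opens into blkdev_get_by_path(): exclusivity is now requested with FMODE_EXCL in the mode mask plus a holder cookie (here fs_type), and the matching blkdev_put() must pass the same mode. The pairing, reduced to a fragment:

	fmode_t mode = FMODE_READ | FMODE_EXCL;
	struct block_device *bdev;

	if (!(flags & MS_RDONLY))
		mode |= FMODE_WRITE;

	bdev = blkdev_get_by_path(dev_name, mode, fs_type); /* fs_type = holder */
	if (IS_ERR(bdev))
		return ERR_CAST(bdev);
	/* ... use the device ... */
	blkdev_put(bdev, mode);		/* mode must match the get */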
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index 0254be2d73c6..ad4ac607cf57 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -329,7 +329,6 @@ int load_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
329 printk(KERN_INFO "NILFS: recovery complete.\n"); 329 printk(KERN_INFO "NILFS: recovery complete.\n");
330 330
331 skip_recovery: 331 skip_recovery:
332 set_nilfs_loaded(nilfs);
333 nilfs_clear_recovery_info(&ri); 332 nilfs_clear_recovery_info(&ri);
334 sbi->s_super->s_flags = s_flags; 333 sbi->s_super->s_flags = s_flags;
335 return 0; 334 return 0;
@@ -651,12 +650,11 @@ int nilfs_discard_segments(struct the_nilfs *nilfs, __u64 *segnump,
651 650
652int nilfs_count_free_blocks(struct the_nilfs *nilfs, sector_t *nblocks) 651int nilfs_count_free_blocks(struct the_nilfs *nilfs, sector_t *nblocks)
653{ 652{
654 struct inode *dat = nilfs_dat_inode(nilfs);
655 unsigned long ncleansegs; 653 unsigned long ncleansegs;
656 654
657 down_read(&NILFS_MDT(dat)->mi_sem); /* XXX */ 655 down_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
658 ncleansegs = nilfs_sufile_get_ncleansegs(nilfs->ns_sufile); 656 ncleansegs = nilfs_sufile_get_ncleansegs(nilfs->ns_sufile);
659 up_read(&NILFS_MDT(dat)->mi_sem); /* XXX */ 657 up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
660 *nblocks = (sector_t)ncleansegs * nilfs->ns_blocks_per_segment; 658 *nblocks = (sector_t)ncleansegs * nilfs->ns_blocks_per_segment;
661 return 0; 659 return 0;
662} 660}
diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
index 69226e14b745..fd85e4c05c6b 100644
--- a/fs/nilfs2/the_nilfs.h
+++ b/fs/nilfs2/the_nilfs.h
@@ -36,8 +36,6 @@
36/* the_nilfs struct */ 36/* the_nilfs struct */
37enum { 37enum {
38 THE_NILFS_INIT = 0, /* Information from super_block is set */ 38 THE_NILFS_INIT = 0, /* Information from super_block is set */
39 THE_NILFS_LOADED, /* Roll-back/roll-forward has done and
40 the latest checkpoint was loaded */
41 THE_NILFS_DISCONTINUED, /* 'next' pointer chain has broken */ 39 THE_NILFS_DISCONTINUED, /* 'next' pointer chain has broken */
42 THE_NILFS_GC_RUNNING, /* gc process is running */ 40 THE_NILFS_GC_RUNNING, /* gc process is running */
43 THE_NILFS_SB_DIRTY, /* super block is dirty */ 41 THE_NILFS_SB_DIRTY, /* super block is dirty */
@@ -178,7 +176,6 @@ static inline int nilfs_##name(struct the_nilfs *nilfs) \
178} 176}
179 177
180THE_NILFS_FNS(INIT, init) 178THE_NILFS_FNS(INIT, init)
181THE_NILFS_FNS(LOADED, loaded)
182THE_NILFS_FNS(DISCONTINUED, discontinued) 179THE_NILFS_FNS(DISCONTINUED, discontinued)
183THE_NILFS_FNS(GC_RUNNING, gc_running) 180THE_NILFS_FNS(GC_RUNNING, gc_running)
184THE_NILFS_FNS(SB_DIRTY, sb_dirty) 181THE_NILFS_FNS(SB_DIRTY, sb_dirty)
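THE_NILFS_FNS() is the usual bit-flag accessor generator: each invocation emits set_nilfs_*()/clear_nilfs_*()/nilfs_*() helpers over the ns_flags word, which is why deleting the LOADED line removes the whole set_nilfs_loaded() family at once. Roughly (reconstructed from the visible call sites, so treat the exact body as an assumption):

#define THE_NILFS_FNS(bit, name)					\
static inline void set_nilfs_##name(struct the_nilfs *nilfs)		\
{									\
	set_bit(THE_NILFS_##bit, &(nilfs)->ns_flags);			\
}									\
static inline void clear_nilfs_##name(struct the_nilfs *nilfs)		\
{									\
	clear_bit(THE_NILFS_##bit, &(nilfs)->ns_flags);			\
}									\
static inline int nilfs_##name(struct the_nilfs *nilfs)		\
{									\
	return test_bit(THE_NILFS_##bit, &(nilfs)->ns_flags);		\
}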
diff --git a/fs/notify/fanotify/Kconfig b/fs/notify/fanotify/Kconfig
index 3ac36b7bf6b9..7dceff005a67 100644
--- a/fs/notify/fanotify/Kconfig
+++ b/fs/notify/fanotify/Kconfig
@@ -6,7 +6,7 @@ config FANOTIFY
6 ---help--- 6 ---help---
7 Say Y here to enable fanotify support. fanotify is a file access 7 Say Y here to enable fanotify support. fanotify is a file access
8 notification system which differs from inotify in that it sends 8 notification system which differs from inotify in that it sends
9 and open file descriptor to the userspace listener along with 9 an open file descriptor to the userspace listener along with
10 the event. 10 the event.
11 11
12 If unsure, say Y. 12 If unsure, say Y.
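The corrected help text is the whole point of this hunk: each fanotify event carries an already-open descriptor on the accessed file. A minimal listener showing that contract, as a sketch (requires CAP_SYS_ADMIN; error handling omitted):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/fanotify.h>

int main(void)
{
	char buf[4096];
	int fan = fanotify_init(FAN_CLASS_NOTIF, O_RDONLY);

	fanotify_mark(fan, FAN_MARK_ADD | FAN_MARK_MOUNT,
		      FAN_OPEN | FAN_CLOSE, AT_FDCWD, "/");

	for (;;) {
		ssize_t len = read(fan, buf, sizeof(buf));
		struct fanotify_event_metadata *md = (void *)buf;

		for (; FAN_EVENT_OK(md, len); md = FAN_EVENT_NEXT(md, len)) {
			printf("mask 0x%llx on fd %d\n",
			       (unsigned long long)md->mask, md->fd);
			if (md->fd >= 0)
				close(md->fd);	/* the fd sent with the event */
		}
	}
}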
diff --git a/fs/ntfs/Makefile b/fs/ntfs/Makefile
index 58b6be992544..4ff028fcfd6e 100644
--- a/fs/ntfs/Makefile
+++ b/fs/ntfs/Makefile
@@ -6,7 +6,7 @@ ntfs-objs := aops.o attrib.o collate.o compress.o debug.o dir.o file.o \
6 index.o inode.o mft.o mst.o namei.o runlist.o super.o sysctl.o \ 6 index.o inode.o mft.o mst.o namei.o runlist.o super.o sysctl.o \
7 unistr.o upcase.o 7 unistr.o upcase.o
8 8
9EXTRA_CFLAGS = -DNTFS_VERSION=\"2.1.29\" 9EXTRA_CFLAGS = -DNTFS_VERSION=\"2.1.30\"
10 10
11ifeq ($(CONFIG_NTFS_DEBUG),y) 11ifeq ($(CONFIG_NTFS_DEBUG),y)
12EXTRA_CFLAGS += -DDEBUG 12EXTRA_CFLAGS += -DDEBUG
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index 113ebd9f25a4..f4b1057abdd2 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * file.c - NTFS kernel file operations. Part of the Linux-NTFS project. 2 * file.c - NTFS kernel file operations. Part of the Linux-NTFS project.
3 * 3 *
4 * Copyright (c) 2001-2007 Anton Altaparmakov 4 * Copyright (c) 2001-2011 Anton Altaparmakov and Tuxera Inc.
5 * 5 *
6 * This program/include file is free software; you can redistribute it and/or 6 * This program/include file is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License as published 7 * modify it under the terms of the GNU General Public License as published
@@ -1380,15 +1380,14 @@ static inline void ntfs_set_next_iovec(const struct iovec **iovp,
1380 * pages (out to offset + bytes), to emulate ntfs_copy_from_user()'s 1380 * pages (out to offset + bytes), to emulate ntfs_copy_from_user()'s
1381 * single-segment behaviour. 1381 * single-segment behaviour.
1382 * 1382 *
1383 * We call the same helper (__ntfs_copy_from_user_iovec_inatomic()) both 1383 * We call the same helper (__ntfs_copy_from_user_iovec_inatomic()) both when
1384 * when atomic and when not atomic. This is ok because 1384 * atomic and when not atomic. This is ok because it calls
1385 * __ntfs_copy_from_user_iovec_inatomic() calls __copy_from_user_inatomic() 1385 * __copy_from_user_inatomic() and it is ok to call this when non-atomic. In
1386 * and it is ok to call this when non-atomic. 1386 * fact, the only difference between __copy_from_user_inatomic() and
1387 * Infact, the only difference between __copy_from_user_inatomic() and
1388 * __copy_from_user() is that the latter calls might_sleep() and the former 1387 * __copy_from_user() is that the latter calls might_sleep() and the former
1389 * should not zero the tail of the buffer on error. And on many 1388 * should not zero the tail of the buffer on error. And on many architectures
1390 * architectures __copy_from_user_inatomic() is just defined to 1389 * __copy_from_user_inatomic() is just defined to __copy_from_user() so it
1391 * __copy_from_user() so it makes no difference at all on those architectures. 1390 * makes no difference at all on those architectures.
1392 */ 1391 */
1393static inline size_t ntfs_copy_from_user_iovec(struct page **pages, 1392static inline size_t ntfs_copy_from_user_iovec(struct page **pages,
1394 unsigned nr_pages, unsigned ofs, const struct iovec **iov, 1393 unsigned nr_pages, unsigned ofs, const struct iovec **iov,
@@ -1409,28 +1408,28 @@ static inline size_t ntfs_copy_from_user_iovec(struct page **pages,
1409 if (unlikely(copied != len)) { 1408 if (unlikely(copied != len)) {
1410 /* Do it the slow way. */ 1409 /* Do it the slow way. */
1411 addr = kmap(*pages); 1410 addr = kmap(*pages);
1412 copied = __ntfs_copy_from_user_iovec_inatomic(addr + ofs, 1411 copied = __ntfs_copy_from_user_iovec_inatomic(addr +
1413 *iov, *iov_ofs, len); 1412 ofs, *iov, *iov_ofs, len);
1414 /*
1415 * Zero the rest of the target like __copy_from_user().
1416 */
1417 memset(addr + ofs + copied, 0, len - copied);
1418 kunmap(*pages);
1419 if (unlikely(copied != len)) 1413 if (unlikely(copied != len))
1420 goto err_out; 1414 goto err_out;
1415 kunmap(*pages);
1421 } 1416 }
1422 total += len; 1417 total += len;
1418 ntfs_set_next_iovec(iov, iov_ofs, len);
1423 bytes -= len; 1419 bytes -= len;
1424 if (!bytes) 1420 if (!bytes)
1425 break; 1421 break;
1426 ntfs_set_next_iovec(iov, iov_ofs, len);
1427 ofs = 0; 1422 ofs = 0;
1428 } while (++pages < last_page); 1423 } while (++pages < last_page);
1429out: 1424out:
1430 return total; 1425 return total;
1431err_out: 1426err_out:
1432 total += copied; 1427 BUG_ON(copied > len);
1433 /* Zero the rest of the target like __copy_from_user(). */ 1428 /* Zero the rest of the target like __copy_from_user(). */
1429 memset(addr + ofs + copied, 0, len - copied);
1430 kunmap(*pages);
1431 total += copied;
1432 ntfs_set_next_iovec(iov, iov_ofs, copied);
1434 while (++pages < last_page) { 1433 while (++pages < last_page) {
1435 bytes -= len; 1434 bytes -= len;
1436 if (!bytes) 1435 if (!bytes)
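The rewritten comment describes the standard two-tier copy pattern: attempt the copy under an atomic mapping first, and fall back to the sleeping kmap()/__copy_from_user() path only when the inatomic copy comes up short. Schematically, for the 2.6.38-era API (a generic sketch, not the NTFS helper itself):

	size_t left;
	char *addr;

	addr = kmap_atomic(page, KM_USER0);
	left = __copy_from_user_inatomic(addr + ofs, ubuf, len);
	kunmap_atomic(addr, KM_USER0);

	if (unlikely(left)) {
		/* Slow way: kmap() may sleep, which is fine here. */
		addr = kmap(page);
		left = __copy_from_user(addr + ofs, ubuf, len);
		kunmap(page);
	}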
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index a30ecacc01f2..29099a07b9fe 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * super.c - NTFS kernel super block handling. Part of the Linux-NTFS project. 2 * super.c - NTFS kernel super block handling. Part of the Linux-NTFS project.
3 * 3 *
4 * Copyright (c) 2001-2007 Anton Altaparmakov 4 * Copyright (c) 2001-2011 Anton Altaparmakov and Tuxera Inc.
5 * Copyright (c) 2001,2002 Richard Russon 5 * Copyright (c) 2001,2002 Richard Russon
6 * 6 *
7 * This program/include file is free software; you can redistribute it and/or 7 * This program/include file is free software; you can redistribute it and/or
@@ -3193,8 +3193,8 @@ static void __exit exit_ntfs_fs(void)
3193 ntfs_sysctl(0); 3193 ntfs_sysctl(0);
3194} 3194}
3195 3195
3196MODULE_AUTHOR("Anton Altaparmakov <aia21@cantab.net>"); 3196MODULE_AUTHOR("Anton Altaparmakov <anton@tuxera.com>");
3197MODULE_DESCRIPTION("NTFS 1.2/3.x driver - Copyright (c) 2001-2007 Anton Altaparmakov"); 3197MODULE_DESCRIPTION("NTFS 1.2/3.x driver - Copyright (c) 2001-2011 Anton Altaparmakov and Tuxera Inc.");
3198MODULE_VERSION(NTFS_VERSION); 3198MODULE_VERSION(NTFS_VERSION);
3199MODULE_LICENSE("GPL"); 3199MODULE_LICENSE("GPL");
3200#ifdef DEBUG 3200#ifdef DEBUG
diff --git a/fs/ocfs2/Kconfig b/fs/ocfs2/Kconfig
index 0d840669698e..77a8de5f7119 100644
--- a/fs/ocfs2/Kconfig
+++ b/fs/ocfs2/Kconfig
@@ -1,7 +1,6 @@
1config OCFS2_FS 1config OCFS2_FS
2 tristate "OCFS2 file system support" 2 tristate "OCFS2 file system support"
3 depends on NET && SYSFS 3 depends on NET && SYSFS && CONFIGFS_FS
4 select CONFIGFS_FS
5 select JBD2 4 select JBD2
6 select CRC32 5 select CRC32
7 select QUOTA 6 select QUOTA
@@ -51,7 +50,7 @@ config OCFS2_FS_USERSPACE_CLUSTER
51 50
52config OCFS2_FS_STATS 51config OCFS2_FS_STATS
53 bool "OCFS2 statistics" 52 bool "OCFS2 statistics"
54 depends on OCFS2_FS 53 depends on OCFS2_FS && DEBUG_FS
55 default y 54 default y
56 help 55 help
57 This option allows some fs statistics to be captured. Enabling 56 This option allows some fs statistics to be captured. Enabling
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 592fae5007d1..e4984e259cb6 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -565,7 +565,6 @@ static inline int ocfs2_et_sanity_check(struct ocfs2_extent_tree *et)
565 return ret; 565 return ret;
566} 566}
567 567
568static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc);
569static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt, 568static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt,
570 struct ocfs2_extent_block *eb); 569 struct ocfs2_extent_block *eb);
571static void ocfs2_adjust_rightmost_records(handle_t *handle, 570static void ocfs2_adjust_rightmost_records(handle_t *handle,
@@ -5858,6 +5857,7 @@ int ocfs2_truncate_log_append(struct ocfs2_super *osb,
5858 5857
5859 ocfs2_journal_dirty(handle, tl_bh); 5858 ocfs2_journal_dirty(handle, tl_bh);
5860 5859
5860 osb->truncated_clusters += num_clusters;
5861bail: 5861bail:
5862 mlog_exit(status); 5862 mlog_exit(status);
5863 return status; 5863 return status;
@@ -5929,6 +5929,8 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
5929 i--; 5929 i--;
5930 } 5930 }
5931 5931
5932 osb->truncated_clusters = 0;
5933
5932bail: 5934bail:
5933 mlog_exit(status); 5935 mlog_exit(status);
5934 return status; 5936 return status;
@@ -7139,64 +7141,6 @@ bail:
7139} 7141}
7140 7142
7141/* 7143/*
7142 * Expects the inode to already be locked.
7143 */
7144int ocfs2_prepare_truncate(struct ocfs2_super *osb,
7145 struct inode *inode,
7146 struct buffer_head *fe_bh,
7147 struct ocfs2_truncate_context **tc)
7148{
7149 int status;
7150 unsigned int new_i_clusters;
7151 struct ocfs2_dinode *fe;
7152 struct ocfs2_extent_block *eb;
7153 struct buffer_head *last_eb_bh = NULL;
7154
7155 mlog_entry_void();
7156
7157 *tc = NULL;
7158
7159 new_i_clusters = ocfs2_clusters_for_bytes(osb->sb,
7160 i_size_read(inode));
7161 fe = (struct ocfs2_dinode *) fe_bh->b_data;
7162
7163 mlog(0, "fe->i_clusters = %u, new_i_clusters = %u, fe->i_size ="
7164 "%llu\n", le32_to_cpu(fe->i_clusters), new_i_clusters,
7165 (unsigned long long)le64_to_cpu(fe->i_size));
7166
7167 *tc = kzalloc(sizeof(struct ocfs2_truncate_context), GFP_KERNEL);
7168 if (!(*tc)) {
7169 status = -ENOMEM;
7170 mlog_errno(status);
7171 goto bail;
7172 }
7173 ocfs2_init_dealloc_ctxt(&(*tc)->tc_dealloc);
7174
7175 if (fe->id2.i_list.l_tree_depth) {
7176 status = ocfs2_read_extent_block(INODE_CACHE(inode),
7177 le64_to_cpu(fe->i_last_eb_blk),
7178 &last_eb_bh);
7179 if (status < 0) {
7180 mlog_errno(status);
7181 goto bail;
7182 }
7183 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
7184 }
7185
7186 (*tc)->tc_last_eb_bh = last_eb_bh;
7187
7188 status = 0;
7189bail:
7190 if (status < 0) {
7191 if (*tc)
7192 ocfs2_free_truncate_context(*tc);
7193 *tc = NULL;
7194 }
7195 mlog_exit_void();
7196 return status;
7197}
7198
7199/*
7200 * 'start' is inclusive, 'end' is not. 7144 * 'start' is inclusive, 'end' is not.
7201 */ 7145 */
7202int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh, 7146int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh,
@@ -7270,18 +7214,3 @@ out_commit:
7270out: 7214out:
7271 return ret; 7215 return ret;
7272} 7216}
7273
7274static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc)
7275{
7276 /*
7277 * The caller is responsible for completing deallocation
7278 * before freeing the context.
7279 */
7280 if (tc->tc_dealloc.c_first_suballocator != NULL)
7281 mlog(ML_NOTICE,
7282 "Truncate completion has non-empty dealloc context\n");
7283
7284 brelse(tc->tc_last_eb_bh);
7285
7286 kfree(tc);
7287}
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index 55762b554b99..3bd08a03251c 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -228,10 +228,6 @@ struct ocfs2_truncate_context {
228 228
229int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle, 229int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle,
230 u64 range_start, u64 range_end); 230 u64 range_start, u64 range_end);
231int ocfs2_prepare_truncate(struct ocfs2_super *osb,
232 struct inode *inode,
233 struct buffer_head *fe_bh,
234 struct ocfs2_truncate_context **tc);
235int ocfs2_commit_truncate(struct ocfs2_super *osb, 231int ocfs2_commit_truncate(struct ocfs2_super *osb,
236 struct inode *inode, 232 struct inode *inode,
237 struct buffer_head *di_bh); 233 struct buffer_head *di_bh);
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 0d7c5540ad66..1fbb0e20131b 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -1630,6 +1630,43 @@ static int ocfs2_zero_tail(struct inode *inode, struct buffer_head *di_bh,
1630 return ret; 1630 return ret;
1631} 1631}
1632 1632
1633/*
1634 * Try to flush the truncate log if we can free enough clusters from it.
1635 * As for the return value, "< 0" means error, "0" means no space, and "1"
1636 * means we have freed enough space and the caller may try to allocate again.
1637 */
1638static int ocfs2_try_to_free_truncate_log(struct ocfs2_super *osb,
1639 unsigned int needed)
1640{
1641 tid_t target;
1642 int ret = 0;
1643 unsigned int truncated_clusters;
1644
1645 mutex_lock(&osb->osb_tl_inode->i_mutex);
1646 truncated_clusters = osb->truncated_clusters;
1647 mutex_unlock(&osb->osb_tl_inode->i_mutex);
1648
1649 /*
1650 * Check whether we can succeed in allocating if we free
1651 * the truncate log.
1652 */
1653 if (truncated_clusters < needed)
1654 goto out;
1655
1656 ret = ocfs2_flush_truncate_log(osb);
1657 if (ret) {
1658 mlog_errno(ret);
1659 goto out;
1660 }
1661
1662 if (jbd2_journal_start_commit(osb->journal->j_journal, &target)) {
1663 jbd2_log_wait_commit(osb->journal->j_journal, target);
1664 ret = 1;
1665 }
1666out:
1667 return ret;
1668}
1669
1633int ocfs2_write_begin_nolock(struct file *filp, 1670int ocfs2_write_begin_nolock(struct file *filp,
1634 struct address_space *mapping, 1671 struct address_space *mapping,
1635 loff_t pos, unsigned len, unsigned flags, 1672 loff_t pos, unsigned len, unsigned flags,
@@ -1637,7 +1674,7 @@ int ocfs2_write_begin_nolock(struct file *filp,
1637 struct buffer_head *di_bh, struct page *mmap_page) 1674 struct buffer_head *di_bh, struct page *mmap_page)
1638{ 1675{
1639 int ret, cluster_of_pages, credits = OCFS2_INODE_UPDATE_CREDITS; 1676 int ret, cluster_of_pages, credits = OCFS2_INODE_UPDATE_CREDITS;
1640 unsigned int clusters_to_alloc, extents_to_split; 1677 unsigned int clusters_to_alloc, extents_to_split, clusters_need = 0;
1641 struct ocfs2_write_ctxt *wc; 1678 struct ocfs2_write_ctxt *wc;
1642 struct inode *inode = mapping->host; 1679 struct inode *inode = mapping->host;
1643 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1680 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
@@ -1646,7 +1683,9 @@ int ocfs2_write_begin_nolock(struct file *filp,
1646 struct ocfs2_alloc_context *meta_ac = NULL; 1683 struct ocfs2_alloc_context *meta_ac = NULL;
1647 handle_t *handle; 1684 handle_t *handle;
1648 struct ocfs2_extent_tree et; 1685 struct ocfs2_extent_tree et;
1686 int try_free = 1, ret1;
1649 1687
1688try_again:
1650 ret = ocfs2_alloc_write_ctxt(&wc, osb, pos, len, di_bh); 1689 ret = ocfs2_alloc_write_ctxt(&wc, osb, pos, len, di_bh);
1651 if (ret) { 1690 if (ret) {
1652 mlog_errno(ret); 1691 mlog_errno(ret);
@@ -1681,6 +1720,7 @@ int ocfs2_write_begin_nolock(struct file *filp,
1681 mlog_errno(ret); 1720 mlog_errno(ret);
1682 goto out; 1721 goto out;
1683 } else if (ret == 1) { 1722 } else if (ret == 1) {
1723 clusters_need = wc->w_clen;
1684 ret = ocfs2_refcount_cow(inode, filp, di_bh, 1724 ret = ocfs2_refcount_cow(inode, filp, di_bh,
1685 wc->w_cpos, wc->w_clen, UINT_MAX); 1725 wc->w_cpos, wc->w_clen, UINT_MAX);
1686 if (ret) { 1726 if (ret) {
@@ -1695,6 +1735,7 @@ int ocfs2_write_begin_nolock(struct file *filp,
1695 mlog_errno(ret); 1735 mlog_errno(ret);
1696 goto out; 1736 goto out;
1697 } 1737 }
1738 clusters_need += clusters_to_alloc;
1698 1739
1699 di = (struct ocfs2_dinode *)wc->w_di_bh->b_data; 1740 di = (struct ocfs2_dinode *)wc->w_di_bh->b_data;
1700 1741
@@ -1817,6 +1858,22 @@ out:
1817 ocfs2_free_alloc_context(data_ac); 1858 ocfs2_free_alloc_context(data_ac);
1818 if (meta_ac) 1859 if (meta_ac)
1819 ocfs2_free_alloc_context(meta_ac); 1860 ocfs2_free_alloc_context(meta_ac);
1861
1862 if (ret == -ENOSPC && try_free) {
1863 /*
1864 * Try to free some truncate log so that we can have enough
1865 * clusters to allocate.
1866 */
1867 try_free = 0;
1868
1869 ret1 = ocfs2_try_to_free_truncate_log(osb, clusters_need);
1870 if (ret1 == 1)
1871 goto try_again;
1872
1873 if (ret1 < 0)
1874 mlog_errno(ret1);
1875 }
1876
1820 return ret; 1877 return ret;
1821} 1878}
1822 1879
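The out-path addition gives ocfs2_write_begin_nolock() exactly one retry: on -ENOSPC it flushes the truncate log (try_free drops to 0 so the path cannot loop) and restarts from try_again. The control flow, reduced to a skeleton (do_allocation() is a hypothetical stand-in for the real write path):

	int try_free = 1, ret, ret1;

try_again:
	ret = do_allocation();			/* hypothetical stand-in */
	/* ... */
out:
	if (ret == -ENOSPC && try_free) {
		try_free = 0;			/* at most one retry */
		ret1 = ocfs2_try_to_free_truncate_log(osb, clusters_need);
		if (ret1 == 1)
			goto try_again;		/* enough clusters freed */
		if (ret1 < 0)
			mlog_errno(ret1);
	}
	return ret;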
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 9e3d45bcb5fd..b108e863d8f6 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -82,6 +82,7 @@ static unsigned long o2hb_failed_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)];
82#define O2HB_DB_TYPE_REGION_LIVENODES 4 82#define O2HB_DB_TYPE_REGION_LIVENODES 4
83#define O2HB_DB_TYPE_REGION_NUMBER 5 83#define O2HB_DB_TYPE_REGION_NUMBER 5
84#define O2HB_DB_TYPE_REGION_ELAPSED_TIME 6 84#define O2HB_DB_TYPE_REGION_ELAPSED_TIME 6
85#define O2HB_DB_TYPE_REGION_PINNED 7
85struct o2hb_debug_buf { 86struct o2hb_debug_buf {
86 int db_type; 87 int db_type;
87 int db_size; 88 int db_size;
@@ -101,6 +102,7 @@ static struct o2hb_debug_buf *o2hb_db_failedregions;
101#define O2HB_DEBUG_FAILEDREGIONS "failed_regions" 102#define O2HB_DEBUG_FAILEDREGIONS "failed_regions"
102#define O2HB_DEBUG_REGION_NUMBER "num" 103#define O2HB_DEBUG_REGION_NUMBER "num"
103#define O2HB_DEBUG_REGION_ELAPSED_TIME "elapsed_time_in_ms" 104#define O2HB_DEBUG_REGION_ELAPSED_TIME "elapsed_time_in_ms"
105#define O2HB_DEBUG_REGION_PINNED "pinned"
104 106
105static struct dentry *o2hb_debug_dir; 107static struct dentry *o2hb_debug_dir;
106static struct dentry *o2hb_debug_livenodes; 108static struct dentry *o2hb_debug_livenodes;
@@ -132,6 +134,33 @@ char *o2hb_heartbeat_mode_desc[O2HB_HEARTBEAT_NUM_MODES] = {
132unsigned int o2hb_dead_threshold = O2HB_DEFAULT_DEAD_THRESHOLD; 134unsigned int o2hb_dead_threshold = O2HB_DEFAULT_DEAD_THRESHOLD;
133unsigned int o2hb_heartbeat_mode = O2HB_HEARTBEAT_LOCAL; 135unsigned int o2hb_heartbeat_mode = O2HB_HEARTBEAT_LOCAL;
134 136
137/*
138 * o2hb_dependent_users tracks the number of registered callbacks that depend
139 * on heartbeat. o2net and o2dlm are two entities that register this callback.
140 * However only o2dlm depends on the heartbeat. It does not want the heartbeat
141 * to stop while a dlm domain is still active.
142 */
143unsigned int o2hb_dependent_users;
144
145/*
146 * In global heartbeat mode, all regions are pinned if there are one or more
147 * dependent users and the quorum region count is <= O2HB_PIN_CUT_OFF. All
148 * regions are unpinned if the region count exceeds the cut off or the number
149 * of dependent users falls to zero.
150 */
151#define O2HB_PIN_CUT_OFF 3
152
153/*
154 * In local heartbeat mode, we assume the dlm domain name to be the same as
155 * region uuid. This is true for domains created for the file system but not
156 * necessarily true for userdlm domains. This is a known limitation.
157 *
158 * In global heartbeat mode, we pin/unpin all o2hb regions. This solution
159 * works for both file system and userdlm domains.
160 */
161static int o2hb_region_pin(const char *region_uuid);
162static void o2hb_region_unpin(const char *region_uuid);
163
135/* Only sets a new threshold if there are no active regions. 164/* Only sets a new threshold if there are no active regions.
136 * 165 *
137 * No locking or otherwise interesting code is required for reading 166 * No locking or otherwise interesting code is required for reading
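The two comment blocks above reduce to a single predicate for global heartbeat mode: regions are pinned exactly while at least one dependent user exists and the quorum region count is at most O2HB_PIN_CUT_OFF. As a sketch (the helper name is illustrative; the patch open-codes this check at each transition point):

	/* Called under o2hb_live_lock, global heartbeat mode only. */
	static void o2hb_recheck_pinning(void)
	{
		int quorum = o2hb_pop_count(&o2hb_quorum_region_bitmap,
					    O2NM_MAX_REGIONS);

		if (o2hb_dependent_users && quorum <= O2HB_PIN_CUT_OFF)
			o2hb_region_pin(NULL);		/* pin all regions */
		else
			o2hb_region_unpin(NULL);	/* unpin all regions */
	}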
@@ -186,7 +215,9 @@ struct o2hb_region {
186 struct config_item hr_item; 215 struct config_item hr_item;
187 216
188 struct list_head hr_all_item; 217 struct list_head hr_all_item;
189 unsigned hr_unclean_stop:1; 218 unsigned hr_unclean_stop:1,
219 hr_item_pinned:1,
220 hr_item_dropped:1;
190 221
191 /* protected by the hr_callback_sem */ 222 /* protected by the hr_callback_sem */
192 struct task_struct *hr_task; 223 struct task_struct *hr_task;
@@ -212,9 +243,11 @@ struct o2hb_region {
212 struct dentry *hr_debug_livenodes; 243 struct dentry *hr_debug_livenodes;
213 struct dentry *hr_debug_regnum; 244 struct dentry *hr_debug_regnum;
214 struct dentry *hr_debug_elapsed_time; 245 struct dentry *hr_debug_elapsed_time;
246 struct dentry *hr_debug_pinned;
215 struct o2hb_debug_buf *hr_db_livenodes; 247 struct o2hb_debug_buf *hr_db_livenodes;
216 struct o2hb_debug_buf *hr_db_regnum; 248 struct o2hb_debug_buf *hr_db_regnum;
217 struct o2hb_debug_buf *hr_db_elapsed_time; 249 struct o2hb_debug_buf *hr_db_elapsed_time;
250 struct o2hb_debug_buf *hr_db_pinned;
218 251
219 /* let the person setting up hb wait for it to return until it 252 /* let the person setting up hb wait for it to return until it
220 * has reached a 'steady' state. This will be fixed when we have 253 * has reached a 'steady' state. This will be fixed when we have
@@ -701,6 +734,14 @@ static void o2hb_set_quorum_device(struct o2hb_region *reg,
701 config_item_name(&reg->hr_item)); 734 config_item_name(&reg->hr_item));
702 735
703 set_bit(reg->hr_region_num, o2hb_quorum_region_bitmap); 736 set_bit(reg->hr_region_num, o2hb_quorum_region_bitmap);
737
738 /*
739 * If global heartbeat active, unpin all regions if the
740 * region count > CUT_OFF
741 */
742 if (o2hb_pop_count(&o2hb_quorum_region_bitmap,
743 O2NM_MAX_REGIONS) > O2HB_PIN_CUT_OFF)
744 o2hb_region_unpin(NULL);
704} 745}
705 746
706static int o2hb_check_slot(struct o2hb_region *reg, 747static int o2hb_check_slot(struct o2hb_region *reg,
@@ -1041,6 +1082,9 @@ static int o2hb_thread(void *data)
1041 1082
1042 set_user_nice(current, -20); 1083 set_user_nice(current, -20);
1043 1084
1085 /* Pin node */
1086 o2nm_depend_this_node();
1087
1044 while (!kthread_should_stop() && !reg->hr_unclean_stop) { 1088 while (!kthread_should_stop() && !reg->hr_unclean_stop) {
1045 /* We track the time spent inside 1089 /* We track the time spent inside
1046 * o2hb_do_disk_heartbeat so that we avoid more than 1090 * o2hb_do_disk_heartbeat so that we avoid more than
@@ -1090,6 +1134,9 @@ static int o2hb_thread(void *data)
1090 mlog_errno(ret); 1134 mlog_errno(ret);
1091 } 1135 }
1092 1136
1137 /* Unpin node */
1138 o2nm_undepend_this_node();
1139
1093 mlog(ML_HEARTBEAT|ML_KTHREAD, "hb thread exiting\n"); 1140 mlog(ML_HEARTBEAT|ML_KTHREAD, "hb thread exiting\n");
1094 1141
1095 return 0; 1142 return 0;
@@ -1142,6 +1189,12 @@ static int o2hb_debug_open(struct inode *inode, struct file *file)
1142 reg->hr_last_timeout_start)); 1189 reg->hr_last_timeout_start));
1143 goto done; 1190 goto done;
1144 1191
1192 case O2HB_DB_TYPE_REGION_PINNED:
1193 reg = (struct o2hb_region *)db->db_data;
1194 out += snprintf(buf + out, PAGE_SIZE - out, "%u\n",
1195 !!reg->hr_item_pinned);
1196 goto done;
1197
1145 default: 1198 default:
1146 goto done; 1199 goto done;
1147 } 1200 }
@@ -1315,6 +1368,8 @@ int o2hb_init(void)
1315 memset(o2hb_quorum_region_bitmap, 0, sizeof(o2hb_quorum_region_bitmap)); 1368 memset(o2hb_quorum_region_bitmap, 0, sizeof(o2hb_quorum_region_bitmap));
1316 memset(o2hb_failed_region_bitmap, 0, sizeof(o2hb_failed_region_bitmap)); 1369 memset(o2hb_failed_region_bitmap, 0, sizeof(o2hb_failed_region_bitmap));
1317 1370
1371 o2hb_dependent_users = 0;
1372
1318 return o2hb_debug_init(); 1373 return o2hb_debug_init();
1319} 1374}
1320 1375
@@ -1384,6 +1439,7 @@ static void o2hb_region_release(struct config_item *item)
1384 debugfs_remove(reg->hr_debug_livenodes); 1439 debugfs_remove(reg->hr_debug_livenodes);
1385 debugfs_remove(reg->hr_debug_regnum); 1440 debugfs_remove(reg->hr_debug_regnum);
1386 debugfs_remove(reg->hr_debug_elapsed_time); 1441 debugfs_remove(reg->hr_debug_elapsed_time);
1442 debugfs_remove(reg->hr_debug_pinned);
1387 debugfs_remove(reg->hr_debug_dir); 1443 debugfs_remove(reg->hr_debug_dir);
1388 1444
1389 spin_lock(&o2hb_live_lock); 1445 spin_lock(&o2hb_live_lock);
@@ -1673,7 +1729,7 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
1673 goto out; 1729 goto out;
1674 1730
1675 reg->hr_bdev = I_BDEV(filp->f_mapping->host); 1731 reg->hr_bdev = I_BDEV(filp->f_mapping->host);
1676 ret = blkdev_get(reg->hr_bdev, FMODE_WRITE | FMODE_READ); 1732 ret = blkdev_get(reg->hr_bdev, FMODE_WRITE | FMODE_READ, NULL);
1677 if (ret) { 1733 if (ret) {
1678 reg->hr_bdev = NULL; 1734 reg->hr_bdev = NULL;
1679 goto out; 1735 goto out;
@@ -1948,6 +2004,18 @@ static int o2hb_debug_region_init(struct o2hb_region *reg, struct dentry *dir)
1948 goto bail; 2004 goto bail;
1949 } 2005 }
1950 2006
2007 reg->hr_debug_pinned =
2008 o2hb_debug_create(O2HB_DEBUG_REGION_PINNED,
2009 reg->hr_debug_dir,
2010 &(reg->hr_db_pinned),
2011 sizeof(*(reg->hr_db_pinned)),
2012 O2HB_DB_TYPE_REGION_PINNED,
2013 0, 0, reg);
2014 if (!reg->hr_debug_pinned) {
2015 mlog_errno(ret);
2016 goto bail;
2017 }
2018
1951 ret = 0; 2019 ret = 0;
1952bail: 2020bail:
1953 return ret; 2021 return ret;
@@ -2002,15 +2070,20 @@ static void o2hb_heartbeat_group_drop_item(struct config_group *group,
2002{ 2070{
2003 struct task_struct *hb_task; 2071 struct task_struct *hb_task;
2004 struct o2hb_region *reg = to_o2hb_region(item); 2072 struct o2hb_region *reg = to_o2hb_region(item);
2073 int quorum_region = 0;
2005 2074
2006 /* stop the thread when the user removes the region dir */ 2075 /* stop the thread when the user removes the region dir */
2007 spin_lock(&o2hb_live_lock); 2076 spin_lock(&o2hb_live_lock);
2008 if (o2hb_global_heartbeat_active()) { 2077 if (o2hb_global_heartbeat_active()) {
2009 clear_bit(reg->hr_region_num, o2hb_region_bitmap); 2078 clear_bit(reg->hr_region_num, o2hb_region_bitmap);
2010 clear_bit(reg->hr_region_num, o2hb_live_region_bitmap); 2079 clear_bit(reg->hr_region_num, o2hb_live_region_bitmap);
2080 if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap))
2081 quorum_region = 1;
2082 clear_bit(reg->hr_region_num, o2hb_quorum_region_bitmap);
2011 } 2083 }
2012 hb_task = reg->hr_task; 2084 hb_task = reg->hr_task;
2013 reg->hr_task = NULL; 2085 reg->hr_task = NULL;
2086 reg->hr_item_dropped = 1;
2014 spin_unlock(&o2hb_live_lock); 2087 spin_unlock(&o2hb_live_lock);
2015 2088
2016 if (hb_task) 2089 if (hb_task)
@@ -2028,7 +2101,27 @@ static void o2hb_heartbeat_group_drop_item(struct config_group *group,
2028 if (o2hb_global_heartbeat_active()) 2101 if (o2hb_global_heartbeat_active())
2029 printk(KERN_NOTICE "o2hb: Heartbeat stopped on region %s\n", 2102 printk(KERN_NOTICE "o2hb: Heartbeat stopped on region %s\n",
2030 config_item_name(&reg->hr_item)); 2103 config_item_name(&reg->hr_item));
2104
2031 config_item_put(item); 2105 config_item_put(item);
2106
2107 if (!o2hb_global_heartbeat_active() || !quorum_region)
2108 return;
2109
2110 /*
2111 * If global heartbeat active and there are dependent users,
2112 * pin all regions if quorum region count <= CUT_OFF
2113 */
2114 spin_lock(&o2hb_live_lock);
2115
2116 if (!o2hb_dependent_users)
2117 goto unlock;
2118
2119 if (o2hb_pop_count(&o2hb_quorum_region_bitmap,
2120 O2NM_MAX_REGIONS) <= O2HB_PIN_CUT_OFF)
2121 o2hb_region_pin(NULL);
2122
2123unlock:
2124 spin_unlock(&o2hb_live_lock);
2032} 2125}
2033 2126
2034struct o2hb_heartbeat_group_attribute { 2127struct o2hb_heartbeat_group_attribute {
@@ -2214,63 +2307,138 @@ void o2hb_setup_callback(struct o2hb_callback_func *hc,
2214} 2307}
2215EXPORT_SYMBOL_GPL(o2hb_setup_callback); 2308EXPORT_SYMBOL_GPL(o2hb_setup_callback);
2216 2309
2217static struct o2hb_region *o2hb_find_region(const char *region_uuid) 2310/*
2311 * In local heartbeat mode, region_uuid passed matches the dlm domain name.
2312 * In global heartbeat mode, region_uuid passed is NULL.
2313 *
2314 * In local, we only pin the matching region. In global we pin all the active
2315 * regions.
2316 */
2317static int o2hb_region_pin(const char *region_uuid)
2218{ 2318{
2219 struct o2hb_region *p, *reg = NULL; 2319 int ret = 0, found = 0;
2320 struct o2hb_region *reg;
2321 char *uuid;
2220 2322
2221 assert_spin_locked(&o2hb_live_lock); 2323 assert_spin_locked(&o2hb_live_lock);
2222 2324
2223 list_for_each_entry(p, &o2hb_all_regions, hr_all_item) { 2325 list_for_each_entry(reg, &o2hb_all_regions, hr_all_item) {
2224 if (!strcmp(region_uuid, config_item_name(&p->hr_item))) { 2326 uuid = config_item_name(&reg->hr_item);
2225 reg = p; 2327
2226 break; 2328 /* local heartbeat */
2329 if (region_uuid) {
2330 if (strcmp(region_uuid, uuid))
2331 continue;
2332 found = 1;
2333 }
2334
2335 if (reg->hr_item_pinned || reg->hr_item_dropped)
2336 goto skip_pin;
2337
2338 /* Ignore ENOENT only for local hb (userdlm domain) */
2339 ret = o2nm_depend_item(&reg->hr_item);
2340 if (!ret) {
2341 mlog(ML_CLUSTER, "Pin region %s\n", uuid);
2342 reg->hr_item_pinned = 1;
2343 } else {
2344 if (ret == -ENOENT && found)
2345 ret = 0;
2346 else {
2347 mlog(ML_ERROR, "Pin region %s fails with %d\n",
2348 uuid, ret);
2349 break;
2350 }
2227 } 2351 }
2352skip_pin:
2353 if (found)
2354 break;
2228 } 2355 }
2229 2356
2230 return reg; 2357 return ret;
2231} 2358}
2232 2359
2233static int o2hb_region_get(const char *region_uuid) 2360/*
2361 * In local heartbeat mode, region_uuid passed matches the dlm domain name.
2362 * In global heartbeat mode, region_uuid passed is NULL.
2363 *
2364 * In local, we only unpin the matching region. In global we unpin all the
2365 * active regions.
2366 */
2367static void o2hb_region_unpin(const char *region_uuid)
2234{ 2368{
2235 int ret = 0;
2236 struct o2hb_region *reg; 2369 struct o2hb_region *reg;
2370 char *uuid;
2371 int found = 0;
2237 2372
2238 spin_lock(&o2hb_live_lock); 2373 assert_spin_locked(&o2hb_live_lock);
2239 2374
2240 reg = o2hb_find_region(region_uuid); 2375 list_for_each_entry(reg, &o2hb_all_regions, hr_all_item) {
2241 if (!reg) 2376 uuid = config_item_name(&reg->hr_item);
2242 ret = -ENOENT; 2377 if (region_uuid) {
2243 spin_unlock(&o2hb_live_lock); 2378 if (strcmp(region_uuid, uuid))
2379 continue;
2380 found = 1;
2381 }
2244 2382
2245 if (ret) 2383 if (reg->hr_item_pinned) {
2246 goto out; 2384 mlog(ML_CLUSTER, "Unpin region %s\n", uuid);
2385 o2nm_undepend_item(&reg->hr_item);
2386 reg->hr_item_pinned = 0;
2387 }
2388 if (found)
2389 break;
2390 }
2391}
2247 2392
2248 ret = o2nm_depend_this_node(); 2393static int o2hb_region_inc_user(const char *region_uuid)
2249 if (ret) 2394{
2250 goto out; 2395 int ret = 0;
2251 2396
2252 ret = o2nm_depend_item(&reg->hr_item); 2397 spin_lock(&o2hb_live_lock);
2253 if (ret)
2254 o2nm_undepend_this_node();
2255 2398
2256out: 2399 /* local heartbeat */
2400 if (!o2hb_global_heartbeat_active()) {
2401 ret = o2hb_region_pin(region_uuid);
2402 goto unlock;
2403 }
2404
2405 /*
2406 * if global heartbeat active and this is the first dependent user,
2407 * pin all regions if quorum region count <= CUT_OFF
2408 */
2409 o2hb_dependent_users++;
2410 if (o2hb_dependent_users > 1)
2411 goto unlock;
2412
2413 if (o2hb_pop_count(&o2hb_quorum_region_bitmap,
2414 O2NM_MAX_REGIONS) <= O2HB_PIN_CUT_OFF)
2415 ret = o2hb_region_pin(NULL);
2416
2417unlock:
2418 spin_unlock(&o2hb_live_lock);
2257 return ret; 2419 return ret;
2258} 2420}
2259 2421
2260static void o2hb_region_put(const char *region_uuid) 2422void o2hb_region_dec_user(const char *region_uuid)
2261{ 2423{
2262 struct o2hb_region *reg;
2263
2264 spin_lock(&o2hb_live_lock); 2424 spin_lock(&o2hb_live_lock);
2265 2425
2266 reg = o2hb_find_region(region_uuid); 2426 /* local heartbeat */
2427 if (!o2hb_global_heartbeat_active()) {
2428 o2hb_region_unpin(region_uuid);
2429 goto unlock;
2430 }
2267 2431
2268 spin_unlock(&o2hb_live_lock); 2432 /*
2433 * if global heartbeat active and there are no dependent users,
2434 * unpin all quorum regions
2435 */
2436 o2hb_dependent_users--;
2437 if (!o2hb_dependent_users)
2438 o2hb_region_unpin(NULL);
2269 2439
2270 if (reg) { 2440unlock:
2271 o2nm_undepend_item(&reg->hr_item); 2441 spin_unlock(&o2hb_live_lock);
2272 o2nm_undepend_this_node();
2273 }
2274} 2442}
2275 2443
2276int o2hb_register_callback(const char *region_uuid, 2444int o2hb_register_callback(const char *region_uuid,
@@ -2291,9 +2459,11 @@ int o2hb_register_callback(const char *region_uuid,
2291 } 2459 }
2292 2460
2293 if (region_uuid) { 2461 if (region_uuid) {
2294 ret = o2hb_region_get(region_uuid); 2462 ret = o2hb_region_inc_user(region_uuid);
2295 if (ret) 2463 if (ret) {
2464 mlog_errno(ret);
2296 goto out; 2465 goto out;
2466 }
2297 } 2467 }
2298 2468
2299 down_write(&o2hb_callback_sem); 2469 down_write(&o2hb_callback_sem);
@@ -2311,7 +2481,7 @@ int o2hb_register_callback(const char *region_uuid,
2311 up_write(&o2hb_callback_sem); 2481 up_write(&o2hb_callback_sem);
2312 ret = 0; 2482 ret = 0;
2313out: 2483out:
2314 mlog(ML_HEARTBEAT, "returning %d on behalf of %p for funcs %p\n", 2484 mlog(ML_CLUSTER, "returning %d on behalf of %p for funcs %p\n",
2315 ret, __builtin_return_address(0), hc); 2485 ret, __builtin_return_address(0), hc);
2316 return ret; 2486 return ret;
2317} 2487}
@@ -2322,7 +2492,7 @@ void o2hb_unregister_callback(const char *region_uuid,
2322{ 2492{
2323 BUG_ON(hc->hc_magic != O2HB_CB_MAGIC); 2493 BUG_ON(hc->hc_magic != O2HB_CB_MAGIC);
2324 2494
2325 mlog(ML_HEARTBEAT, "on behalf of %p for funcs %p\n", 2495 mlog(ML_CLUSTER, "on behalf of %p for funcs %p\n",
2326 __builtin_return_address(0), hc); 2496 __builtin_return_address(0), hc);
2327 2497
2328 /* XXX Can this happen _with_ a region reference? */ 2498 /* XXX Can this happen _with_ a region reference? */
@@ -2330,7 +2500,7 @@ void o2hb_unregister_callback(const char *region_uuid,
2330 return; 2500 return;
2331 2501
2332 if (region_uuid) 2502 if (region_uuid)
2333 o2hb_region_put(region_uuid); 2503 o2hb_region_dec_user(region_uuid);
2334 2504
2335 down_write(&o2hb_callback_sem); 2505 down_write(&o2hb_callback_sem);
2336 2506
diff --git a/fs/ocfs2/cluster/netdebug.c b/fs/ocfs2/cluster/netdebug.c
index a3f150e52b02..3a5835904b3d 100644
--- a/fs/ocfs2/cluster/netdebug.c
+++ b/fs/ocfs2/cluster/netdebug.c
@@ -46,10 +46,15 @@
46#define O2NET_DEBUG_DIR "o2net" 46#define O2NET_DEBUG_DIR "o2net"
47#define SC_DEBUG_NAME "sock_containers" 47#define SC_DEBUG_NAME "sock_containers"
48#define NST_DEBUG_NAME "send_tracking" 48#define NST_DEBUG_NAME "send_tracking"
49#define STATS_DEBUG_NAME "stats"
50
51#define SHOW_SOCK_CONTAINERS 0
52#define SHOW_SOCK_STATS 1
49 53
50static struct dentry *o2net_dentry; 54static struct dentry *o2net_dentry;
51static struct dentry *sc_dentry; 55static struct dentry *sc_dentry;
52static struct dentry *nst_dentry; 56static struct dentry *nst_dentry;
57static struct dentry *stats_dentry;
53 58
54static DEFINE_SPINLOCK(o2net_debug_lock); 59static DEFINE_SPINLOCK(o2net_debug_lock);
55 60
@@ -123,37 +128,42 @@ static void *nst_seq_next(struct seq_file *seq, void *v, loff_t *pos)
123static int nst_seq_show(struct seq_file *seq, void *v) 128static int nst_seq_show(struct seq_file *seq, void *v)
124{ 129{
125 struct o2net_send_tracking *nst, *dummy_nst = seq->private; 130 struct o2net_send_tracking *nst, *dummy_nst = seq->private;
131 ktime_t now;
132 s64 sock, send, status;
126 133
127 spin_lock(&o2net_debug_lock); 134 spin_lock(&o2net_debug_lock);
128 nst = next_nst(dummy_nst); 135 nst = next_nst(dummy_nst);
136 if (!nst)
137 goto out;
129 138
130 if (nst != NULL) { 139 now = ktime_get();
131 /* get_task_comm isn't exported. oh well. */ 140 sock = ktime_to_us(ktime_sub(now, nst->st_sock_time));
132 seq_printf(seq, "%p:\n" 141 send = ktime_to_us(ktime_sub(now, nst->st_send_time));
133 " pid: %lu\n" 142 status = ktime_to_us(ktime_sub(now, nst->st_status_time));
134 " tgid: %lu\n" 143
135 " process name: %s\n" 144 /* get_task_comm isn't exported. oh well. */
136 " node: %u\n" 145 seq_printf(seq, "%p:\n"
137 " sc: %p\n" 146 " pid: %lu\n"
138 " message id: %d\n" 147 " tgid: %lu\n"
139 " message type: %u\n" 148 " process name: %s\n"
140 " message key: 0x%08x\n" 149 " node: %u\n"
141 " sock acquiry: %lu.%ld\n" 150 " sc: %p\n"
142 " send start: %lu.%ld\n" 151 " message id: %d\n"
143 " wait start: %lu.%ld\n", 152 " message type: %u\n"
144 nst, (unsigned long)nst->st_task->pid, 153 " message key: 0x%08x\n"
145 (unsigned long)nst->st_task->tgid, 154 " sock acquiry: %lld usecs ago\n"
146 nst->st_task->comm, nst->st_node, 155 " send start: %lld usecs ago\n"
147 nst->st_sc, nst->st_id, nst->st_msg_type, 156 " wait start: %lld usecs ago\n",
148 nst->st_msg_key, 157 nst, (unsigned long)task_pid_nr(nst->st_task),
149 nst->st_sock_time.tv_sec, 158 (unsigned long)nst->st_task->tgid,
150 (long)nst->st_sock_time.tv_usec, 159 nst->st_task->comm, nst->st_node,
151 nst->st_send_time.tv_sec, 160 nst->st_sc, nst->st_id, nst->st_msg_type,
152 (long)nst->st_send_time.tv_usec, 161 nst->st_msg_key,
153 nst->st_status_time.tv_sec, 162 (long long)sock,
154 (long)nst->st_status_time.tv_usec); 163 (long long)send,
155 } 164 (long long)status);
156 165
166out:
157 spin_unlock(&o2net_debug_lock); 167 spin_unlock(&o2net_debug_lock);
158 168
159 return 0; 169 return 0;
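
The reworked nst_seq_show() computes each "usecs ago" value by subtracting the stored ktime_t stamp from a fresh ktime_get() and converting to microseconds. A minimal userspace sketch of that pattern, using clock_gettime(CLOCK_MONOTONIC) as a stand-in for ktime_get(); mono_ns() and st_sock_time here are illustrative names, not kernel APIs:

#include <stdio.h>
#include <stdint.h>
#include <time.h>
#include <unistd.h>

/* Stand-in for ktime_get(): monotonic time as signed nanoseconds. */
static int64_t mono_ns(void)
{
	struct timespec ts;
	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (int64_t)ts.tv_sec * 1000000000LL + ts.tv_nsec;
}

int main(void)
{
	int64_t st_sock_time = mono_ns();	/* stamped at "sock acquiry" */
	usleep(1500);				/* pretend work happens */
	int64_t now = mono_ns();

	/* ktime_to_us(ktime_sub(now, then)) is just (now - then) / 1000 */
	printf(" sock acquiry: %lld usecs ago\n",
	       (long long)((now - st_sock_time) / 1000));
	return 0;
}

A monotonic source also removes the wall-clock jumps that could make the old timeval-based deltas go negative.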
@@ -228,6 +238,11 @@ void o2net_debug_del_sc(struct o2net_sock_container *sc)
228 spin_unlock(&o2net_debug_lock); 238 spin_unlock(&o2net_debug_lock);
229} 239}
230 240
241struct o2net_sock_debug {
242 int dbg_ctxt;
243 struct o2net_sock_container *dbg_sock;
244};
245
231static struct o2net_sock_container 246static struct o2net_sock_container
232 *next_sc(struct o2net_sock_container *sc_start) 247 *next_sc(struct o2net_sock_container *sc_start)
233{ 248{
@@ -253,7 +268,8 @@ static struct o2net_sock_container
253 268
254static void *sc_seq_start(struct seq_file *seq, loff_t *pos) 269static void *sc_seq_start(struct seq_file *seq, loff_t *pos)
255{ 270{
256 struct o2net_sock_container *sc, *dummy_sc = seq->private; 271 struct o2net_sock_debug *sd = seq->private;
272 struct o2net_sock_container *sc, *dummy_sc = sd->dbg_sock;
257 273
258 spin_lock(&o2net_debug_lock); 274 spin_lock(&o2net_debug_lock);
259 sc = next_sc(dummy_sc); 275 sc = next_sc(dummy_sc);
@@ -264,7 +280,8 @@ static void *sc_seq_start(struct seq_file *seq, loff_t *pos)
264 280
265static void *sc_seq_next(struct seq_file *seq, void *v, loff_t *pos) 281static void *sc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
266{ 282{
267 struct o2net_sock_container *sc, *dummy_sc = seq->private; 283 struct o2net_sock_debug *sd = seq->private;
284 struct o2net_sock_container *sc, *dummy_sc = sd->dbg_sock;
268 285
269 spin_lock(&o2net_debug_lock); 286 spin_lock(&o2net_debug_lock);
270 sc = next_sc(dummy_sc); 287 sc = next_sc(dummy_sc);
@@ -276,65 +293,107 @@ static void *sc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
276 return sc; /* unused, just needs to be null when done */ 293 return sc; /* unused, just needs to be null when done */
277} 294}
278 295
279#define TV_SEC_USEC(TV) TV.tv_sec, (long)TV.tv_usec 296#ifdef CONFIG_OCFS2_FS_STATS
297# define sc_send_count(_s) ((_s)->sc_send_count)
298# define sc_recv_count(_s) ((_s)->sc_recv_count)
299# define sc_tv_acquiry_total_ns(_s) (ktime_to_ns((_s)->sc_tv_acquiry_total))
300# define sc_tv_send_total_ns(_s) (ktime_to_ns((_s)->sc_tv_send_total))
301# define sc_tv_status_total_ns(_s) (ktime_to_ns((_s)->sc_tv_status_total))
302# define sc_tv_process_total_ns(_s) (ktime_to_ns((_s)->sc_tv_process_total))
303#else
304# define sc_send_count(_s) (0U)
305# define sc_recv_count(_s) (0U)
306# define sc_tv_acquiry_total_ns(_s) (0LL)
307# define sc_tv_send_total_ns(_s) (0LL)
308# define sc_tv_status_total_ns(_s) (0LL)
309# define sc_tv_process_total_ns(_s) (0LL)
310#endif
311
312/* So that debugfs.ocfs2 can determine which format is being used */
313#define O2NET_STATS_STR_VERSION 1
314static void sc_show_sock_stats(struct seq_file *seq,
315 struct o2net_sock_container *sc)
316{
317 if (!sc)
318 return;
319
320 seq_printf(seq, "%d,%u,%lu,%lld,%lld,%lld,%lu,%lld\n", O2NET_STATS_STR_VERSION,
321 sc->sc_node->nd_num, (unsigned long)sc_send_count(sc),
322 (long long)sc_tv_acquiry_total_ns(sc),
323 (long long)sc_tv_send_total_ns(sc),
324 (long long)sc_tv_status_total_ns(sc),
325 (unsigned long)sc_recv_count(sc),
326 (long long)sc_tv_process_total_ns(sc));
327}
328
329static void sc_show_sock_container(struct seq_file *seq,
330 struct o2net_sock_container *sc)
331{
332 struct inet_sock *inet = NULL;
333 __be32 saddr = 0, daddr = 0;
334 __be16 sport = 0, dport = 0;
335
336 if (!sc)
337 return;
338
339 if (sc->sc_sock) {
340 inet = inet_sk(sc->sc_sock->sk);
341 /* the stack's structs aren't sparse endian clean */
342 saddr = (__force __be32)inet->inet_saddr;
343 daddr = (__force __be32)inet->inet_daddr;
344 sport = (__force __be16)inet->inet_sport;
345 dport = (__force __be16)inet->inet_dport;
346 }
347
348 /* XXX sigh, inet-> doesn't have sparse annotation so any
349 * use of it here generates a warning with -Wbitwise */
350 seq_printf(seq, "%p:\n"
351 " krefs: %d\n"
352 " sock: %pI4:%u -> "
353 "%pI4:%u\n"
354 " remote node: %s\n"
355 " page off: %zu\n"
356 " handshake ok: %u\n"
357 " timer: %lld usecs\n"
358 " data ready: %lld usecs\n"
359 " advance start: %lld usecs\n"
360 " advance stop: %lld usecs\n"
361 " func start: %lld usecs\n"
362 " func stop: %lld usecs\n"
363 " func key: 0x%08x\n"
364 " func type: %u\n",
365 sc,
366 atomic_read(&sc->sc_kref.refcount),
367 &saddr, inet ? ntohs(sport) : 0,
368 &daddr, inet ? ntohs(dport) : 0,
369 sc->sc_node->nd_name,
370 sc->sc_page_off,
371 sc->sc_handshake_ok,
372 (long long)ktime_to_us(sc->sc_tv_timer),
373 (long long)ktime_to_us(sc->sc_tv_data_ready),
374 (long long)ktime_to_us(sc->sc_tv_advance_start),
375 (long long)ktime_to_us(sc->sc_tv_advance_stop),
376 (long long)ktime_to_us(sc->sc_tv_func_start),
377 (long long)ktime_to_us(sc->sc_tv_func_stop),
378 sc->sc_msg_key,
379 sc->sc_msg_type);
380}
280 381
281static int sc_seq_show(struct seq_file *seq, void *v) 382static int sc_seq_show(struct seq_file *seq, void *v)
282{ 383{
283 struct o2net_sock_container *sc, *dummy_sc = seq->private; 384 struct o2net_sock_debug *sd = seq->private;
385 struct o2net_sock_container *sc, *dummy_sc = sd->dbg_sock;
284 386
285 spin_lock(&o2net_debug_lock); 387 spin_lock(&o2net_debug_lock);
286 sc = next_sc(dummy_sc); 388 sc = next_sc(dummy_sc);
287 389
288 if (sc != NULL) { 390 if (sc) {
289 struct inet_sock *inet = NULL; 391 if (sd->dbg_ctxt == SHOW_SOCK_CONTAINERS)
290 392 sc_show_sock_container(seq, sc);
291 __be32 saddr = 0, daddr = 0; 393 else
292 __be16 sport = 0, dport = 0; 394 sc_show_sock_stats(seq, sc);
293
294 if (sc->sc_sock) {
295 inet = inet_sk(sc->sc_sock->sk);
296 /* the stack's structs aren't sparse endian clean */
297 saddr = (__force __be32)inet->inet_saddr;
298 daddr = (__force __be32)inet->inet_daddr;
299 sport = (__force __be16)inet->inet_sport;
300 dport = (__force __be16)inet->inet_dport;
301 }
302
303 /* XXX sigh, inet-> doesn't have sparse annotation so any
304 * use of it here generates a warning with -Wbitwise */
305 seq_printf(seq, "%p:\n"
306 " krefs: %d\n"
307 " sock: %pI4:%u -> "
308 "%pI4:%u\n"
309 " remote node: %s\n"
310 " page off: %zu\n"
311 " handshake ok: %u\n"
312 " timer: %lu.%ld\n"
313 " data ready: %lu.%ld\n"
314 " advance start: %lu.%ld\n"
315 " advance stop: %lu.%ld\n"
316 " func start: %lu.%ld\n"
317 " func stop: %lu.%ld\n"
318 " func key: %u\n"
319 " func type: %u\n",
320 sc,
321 atomic_read(&sc->sc_kref.refcount),
322 &saddr, inet ? ntohs(sport) : 0,
323 &daddr, inet ? ntohs(dport) : 0,
324 sc->sc_node->nd_name,
325 sc->sc_page_off,
326 sc->sc_handshake_ok,
327 TV_SEC_USEC(sc->sc_tv_timer),
328 TV_SEC_USEC(sc->sc_tv_data_ready),
329 TV_SEC_USEC(sc->sc_tv_advance_start),
330 TV_SEC_USEC(sc->sc_tv_advance_stop),
331 TV_SEC_USEC(sc->sc_tv_func_start),
332 TV_SEC_USEC(sc->sc_tv_func_stop),
333 sc->sc_msg_key,
334 sc->sc_msg_type);
335 } 395 }
336 396
337
338 spin_unlock(&o2net_debug_lock); 397 spin_unlock(&o2net_debug_lock);
339 398
340 return 0; 399 return 0;
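
The sc_send_count()/sc_tv_*_ns() accessor macros above let sc_show_sock_stats() keep one fixed, versioned output format whether or not CONFIG_OCFS2_FS_STATS is set: with stats compiled out, every field reads as zero instead of sprouting #ifdefs at each call site. A compressed userspace sketch of the idiom (HAVE_STATS and the conn names are invented for illustration):

#include <stdio.h>

/* #define HAVE_STATS */	/* toggle, standing in for CONFIG_OCFS2_FS_STATS */

struct conn {
	int node;
#ifdef HAVE_STATS
	unsigned long send_count;
	long long send_total_ns;
#endif
};

#ifdef HAVE_STATS
# define conn_send_count(c)	((c)->send_count)
# define conn_send_total_ns(c)	((c)->send_total_ns)
#else
# define conn_send_count(c)	(0UL)
# define conn_send_total_ns(c)	(0LL)
#endif

int main(void)
{
	struct conn c = { .node = 7 };
	/* One format string serves both configurations; with stats
	 * compiled out every counter simply reads as zero. */
	printf("1,%d,%lu,%lld\n", c.node,
	       conn_send_count(&c), conn_send_total_ns(&c));
	return 0;
}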
@@ -351,7 +410,7 @@ static const struct seq_operations sc_seq_ops = {
351 .show = sc_seq_show, 410 .show = sc_seq_show,
352}; 411};
353 412
354static int sc_fop_open(struct inode *inode, struct file *file) 413static int sc_common_open(struct file *file, struct o2net_sock_debug *sd)
355{ 414{
356 struct o2net_sock_container *dummy_sc; 415 struct o2net_sock_container *dummy_sc;
357 struct seq_file *seq; 416 struct seq_file *seq;
@@ -369,7 +428,8 @@ static int sc_fop_open(struct inode *inode, struct file *file)
369 goto out; 428 goto out;
370 429
371 seq = file->private_data; 430 seq = file->private_data;
372 seq->private = dummy_sc; 431 seq->private = sd;
432 sd->dbg_sock = dummy_sc;
373 o2net_debug_add_sc(dummy_sc); 433 o2net_debug_add_sc(dummy_sc);
374 434
375 dummy_sc = NULL; 435 dummy_sc = NULL;
@@ -382,12 +442,48 @@ out:
382static int sc_fop_release(struct inode *inode, struct file *file) 442static int sc_fop_release(struct inode *inode, struct file *file)
383{ 443{
384 struct seq_file *seq = file->private_data; 444 struct seq_file *seq = file->private_data;
385 struct o2net_sock_container *dummy_sc = seq->private; 445 struct o2net_sock_debug *sd = seq->private;
446 struct o2net_sock_container *dummy_sc = sd->dbg_sock;
386 447
387 o2net_debug_del_sc(dummy_sc); 448 o2net_debug_del_sc(dummy_sc);
388 return seq_release_private(inode, file); 449 return seq_release_private(inode, file);
389} 450}
390 451
452static int stats_fop_open(struct inode *inode, struct file *file)
453{
454 struct o2net_sock_debug *sd;
455
456 sd = kmalloc(sizeof(struct o2net_sock_debug), GFP_KERNEL);
457 if (sd == NULL)
458 return -ENOMEM;
459
460 sd->dbg_ctxt = SHOW_SOCK_STATS;
461 sd->dbg_sock = NULL;
462
463 return sc_common_open(file, sd);
464}
465
466static const struct file_operations stats_seq_fops = {
467 .open = stats_fop_open,
468 .read = seq_read,
469 .llseek = seq_lseek,
470 .release = sc_fop_release,
471};
472
473static int sc_fop_open(struct inode *inode, struct file *file)
474{
475 struct o2net_sock_debug *sd;
476
477 sd = kmalloc(sizeof(struct o2net_sock_debug), GFP_KERNEL);
478 if (sd == NULL)
479 return -ENOMEM;
480
481 sd->dbg_ctxt = SHOW_SOCK_CONTAINERS;
482 sd->dbg_sock = NULL;
483
484 return sc_common_open(file, sd);
485}
486
391static const struct file_operations sc_seq_fops = { 487static const struct file_operations sc_seq_fops = {
392 .open = sc_fop_open, 488 .open = sc_fop_open,
393 .read = seq_read, 489 .read = seq_read,
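
With sc_common_open(), the "sock_containers" and "stats" files share one open path and one show routine; the per-open o2net_sock_debug carries only the display mode, and sc_seq_show() branches on dbg_ctxt. The shape of that refactor, reduced to plain runnable C (all names here are illustrative, not the kernel seq_file API):

#include <stdio.h>
#include <stdlib.h>

enum { SHOW_CONTAINERS, SHOW_STATS };

struct sock_debug {
	int ctxt;		/* which view this open() wants */
	/* the dummy list anchor would live here too */
};

static void show(const struct sock_debug *sd)
{
	if (sd->ctxt == SHOW_CONTAINERS)
		printf("full container dump\n");
	else
		printf("1,<node>,<counts>,<totals>\n");	/* versioned CSV */
}

static struct sock_debug *common_open(int ctxt)
{
	struct sock_debug *sd = malloc(sizeof(*sd));
	if (sd)
		sd->ctxt = ctxt;
	return sd;
}

int main(void)
{
	struct sock_debug *a = common_open(SHOW_CONTAINERS);
	struct sock_debug *b = common_open(SHOW_STATS);
	if (!a || !b)
		return 1;
	show(a);
	show(b);
	free(a);
	free(b);
	return 0;
}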
@@ -419,25 +515,29 @@ int o2net_debugfs_init(void)
419 goto bail; 515 goto bail;
420 } 516 }
421 517
518 stats_dentry = debugfs_create_file(STATS_DEBUG_NAME, S_IFREG|S_IRUSR,
519 o2net_dentry, NULL,
520 &stats_seq_fops);
521 if (!stats_dentry) {
522 mlog_errno(-ENOMEM);
523 goto bail;
524 }
525
422 return 0; 526 return 0;
423bail: 527bail:
424 if (sc_dentry) 528 debugfs_remove(stats_dentry);
425 debugfs_remove(sc_dentry); 529 debugfs_remove(sc_dentry);
426 if (nst_dentry) 530 debugfs_remove(nst_dentry);
427 debugfs_remove(nst_dentry); 531 debugfs_remove(o2net_dentry);
428 if (o2net_dentry)
429 debugfs_remove(o2net_dentry);
430 return -ENOMEM; 532 return -ENOMEM;
431} 533}
432 534
433void o2net_debugfs_exit(void) 535void o2net_debugfs_exit(void)
434{ 536{
435 if (sc_dentry) 537 debugfs_remove(stats_dentry);
436 debugfs_remove(sc_dentry); 538 debugfs_remove(sc_dentry);
437 if (nst_dentry) 539 debugfs_remove(nst_dentry);
438 debugfs_remove(nst_dentry); 540 debugfs_remove(o2net_dentry);
439 if (o2net_dentry)
440 debugfs_remove(o2net_dentry);
441} 541}
442 542
443#endif /* CONFIG_DEBUG_FS */ 543#endif /* CONFIG_DEBUG_FS */
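
The simplified bail/exit paths above rely on debugfs_remove() being a no-op when handed a NULL dentry, so every handle can be removed unconditionally, in reverse creation order, with no if-guards. The same property holds for free() in userspace, which makes for a runnable illustration (the struct and names are invented):

#include <stdlib.h>

struct files { char *nst, *sc, *stats; };

static int debug_init(struct files *f)
{
	f->nst = f->sc = f->stats = NULL;
	if (!(f->nst = malloc(16)))
		goto bail;
	if (!(f->sc = malloc(16)))
		goto bail;
	if (!(f->stats = malloc(16)))
		goto bail;
	return 0;
bail:
	/* free(NULL) is a no-op, the same property the patch leans on
	 * for debugfs_remove(): no guards, reverse order of creation. */
	free(f->stats);
	free(f->sc);
	free(f->nst);
	return -1;
}

static void debug_exit(struct files *f)
{
	free(f->stats);
	free(f->sc);
	free(f->nst);
}

int main(void)
{
	struct files f;
	if (debug_init(&f))
		return 1;
	debug_exit(&f);
	return 0;
}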
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index 9aa426e42123..3b11cb1e38fc 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -153,63 +153,114 @@ static void o2net_init_nst(struct o2net_send_tracking *nst, u32 msgtype,
153 nst->st_node = node; 153 nst->st_node = node;
154} 154}
155 155
156static void o2net_set_nst_sock_time(struct o2net_send_tracking *nst) 156static inline void o2net_set_nst_sock_time(struct o2net_send_tracking *nst)
157{ 157{
158 do_gettimeofday(&nst->st_sock_time); 158 nst->st_sock_time = ktime_get();
159} 159}
160 160
161static void o2net_set_nst_send_time(struct o2net_send_tracking *nst) 161static inline void o2net_set_nst_send_time(struct o2net_send_tracking *nst)
162{ 162{
163 do_gettimeofday(&nst->st_send_time); 163 nst->st_send_time = ktime_get();
164} 164}
165 165
166static void o2net_set_nst_status_time(struct o2net_send_tracking *nst) 166static inline void o2net_set_nst_status_time(struct o2net_send_tracking *nst)
167{ 167{
168 do_gettimeofday(&nst->st_status_time); 168 nst->st_status_time = ktime_get();
169} 169}
170 170
171static void o2net_set_nst_sock_container(struct o2net_send_tracking *nst, 171static inline void o2net_set_nst_sock_container(struct o2net_send_tracking *nst,
172 struct o2net_sock_container *sc) 172 struct o2net_sock_container *sc)
173{ 173{
174 nst->st_sc = sc; 174 nst->st_sc = sc;
175} 175}
176 176
177static void o2net_set_nst_msg_id(struct o2net_send_tracking *nst, u32 msg_id) 177static inline void o2net_set_nst_msg_id(struct o2net_send_tracking *nst,
178 u32 msg_id)
178{ 179{
179 nst->st_id = msg_id; 180 nst->st_id = msg_id;
180} 181}
181 182
182#else /* CONFIG_DEBUG_FS */ 183static inline void o2net_set_sock_timer(struct o2net_sock_container *sc)
183
184static inline void o2net_init_nst(struct o2net_send_tracking *nst, u32 msgtype,
185 u32 msgkey, struct task_struct *task, u8 node)
186{ 184{
185 sc->sc_tv_timer = ktime_get();
187} 186}
188 187
189static inline void o2net_set_nst_sock_time(struct o2net_send_tracking *nst) 188static inline void o2net_set_data_ready_time(struct o2net_sock_container *sc)
190{ 189{
190 sc->sc_tv_data_ready = ktime_get();
191} 191}
192 192
193static inline void o2net_set_nst_send_time(struct o2net_send_tracking *nst) 193static inline void o2net_set_advance_start_time(struct o2net_sock_container *sc)
194{ 194{
195 sc->sc_tv_advance_start = ktime_get();
195} 196}
196 197
197static inline void o2net_set_nst_status_time(struct o2net_send_tracking *nst) 198static inline void o2net_set_advance_stop_time(struct o2net_sock_container *sc)
198{ 199{
200 sc->sc_tv_advance_stop = ktime_get();
199} 201}
200 202
201static inline void o2net_set_nst_sock_container(struct o2net_send_tracking *nst, 203static inline void o2net_set_func_start_time(struct o2net_sock_container *sc)
202 struct o2net_sock_container *sc)
203{ 204{
205 sc->sc_tv_func_start = ktime_get();
204} 206}
205 207
206static inline void o2net_set_nst_msg_id(struct o2net_send_tracking *nst, 208static inline void o2net_set_func_stop_time(struct o2net_sock_container *sc)
207 u32 msg_id)
208{ 209{
210 sc->sc_tv_func_stop = ktime_get();
209} 211}
210 212
213static ktime_t o2net_get_func_run_time(struct o2net_sock_container *sc)
214{
215 return ktime_sub(sc->sc_tv_func_stop, sc->sc_tv_func_start);
216}
217#else /* CONFIG_DEBUG_FS */
218# define o2net_init_nst(a, b, c, d, e)
219# define o2net_set_nst_sock_time(a)
220# define o2net_set_nst_send_time(a)
221# define o2net_set_nst_status_time(a)
222# define o2net_set_nst_sock_container(a, b)
223# define o2net_set_nst_msg_id(a, b)
224# define o2net_set_sock_timer(a)
225# define o2net_set_data_ready_time(a)
226# define o2net_set_advance_start_time(a)
227# define o2net_set_advance_stop_time(a)
228# define o2net_set_func_start_time(a)
229# define o2net_set_func_stop_time(a)
230# define o2net_get_func_run_time(a) (ktime_t)0
211#endif /* CONFIG_DEBUG_FS */ 231#endif /* CONFIG_DEBUG_FS */
212 232
233#ifdef CONFIG_OCFS2_FS_STATS
234static void o2net_update_send_stats(struct o2net_send_tracking *nst,
235 struct o2net_sock_container *sc)
236{
237 sc->sc_tv_status_total = ktime_add(sc->sc_tv_status_total,
238 ktime_sub(ktime_get(),
239 nst->st_status_time));
240 sc->sc_tv_send_total = ktime_add(sc->sc_tv_send_total,
241 ktime_sub(nst->st_status_time,
242 nst->st_send_time));
243 sc->sc_tv_acquiry_total = ktime_add(sc->sc_tv_acquiry_total,
244 ktime_sub(nst->st_send_time,
245 nst->st_sock_time));
246 sc->sc_send_count++;
247}
248
249static void o2net_update_recv_stats(struct o2net_sock_container *sc)
250{
251 sc->sc_tv_process_total = ktime_add(sc->sc_tv_process_total,
252 o2net_get_func_run_time(sc));
253 sc->sc_recv_count++;
254}
255
256#else
257
258# define o2net_update_send_stats(a, b)
259
260# define o2net_update_recv_stats(sc)
261
262#endif /* CONFIG_OCFS2_FS_STATS */
263
213static inline int o2net_reconnect_delay(void) 264static inline int o2net_reconnect_delay(void)
214{ 265{
215 return o2nm_single_cluster->cl_reconnect_delay_ms; 266 return o2nm_single_cluster->cl_reconnect_delay_ms;
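
o2net_update_send_stats() splits each send into three intervals from the per-send stamps: sock_time to send_time (acquiry), send_time to status_time (send), and status_time to now (status wait), accumulating each into the per-socket totals. A userspace sketch of the same bookkeeping, with mono_ns() again standing in for ktime_get() and the usleep() calls faking the real work:

#include <stdint.h>
#include <stdio.h>
#include <time.h>
#include <unistd.h>

static int64_t mono_ns(void)
{
	struct timespec ts;
	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (int64_t)ts.tv_sec * 1000000000LL + ts.tv_nsec;
}

/* Per-connection running totals, as in sc_tv_*_total. */
static int64_t acquiry_total, send_total, status_total;
static unsigned long send_count;

static void one_send(void)
{
	int64_t sock_time = mono_ns();	/* o2net_set_nst_sock_time() */
	usleep(100);			/* ...acquire the socket... */
	int64_t send_time = mono_ns();	/* o2net_set_nst_send_time() */
	usleep(200);			/* ...transmit... */
	int64_t status_time = mono_ns();/* o2net_set_nst_status_time() */
	usleep(300);			/* ...wait for remote status... */

	/* The same three deltas o2net_update_send_stats() accumulates. */
	status_total  += mono_ns() - status_time;
	send_total    += status_time - send_time;
	acquiry_total += send_time - sock_time;
	send_count++;
}

int main(void)
{
	for (int i = 0; i < 3; i++)
		one_send();
	printf("%lu sends: acquiry %lld ns, send %lld ns, status %lld ns\n",
	       send_count, (long long)acquiry_total,
	       (long long)send_total, (long long)status_total);
	return 0;
}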
@@ -355,6 +406,7 @@ static void sc_kref_release(struct kref *kref)
355 sc->sc_sock = NULL; 406 sc->sc_sock = NULL;
356 } 407 }
357 408
409 o2nm_undepend_item(&sc->sc_node->nd_item);
358 o2nm_node_put(sc->sc_node); 410 o2nm_node_put(sc->sc_node);
359 sc->sc_node = NULL; 411 sc->sc_node = NULL;
360 412
@@ -376,6 +428,7 @@ static struct o2net_sock_container *sc_alloc(struct o2nm_node *node)
376{ 428{
377 struct o2net_sock_container *sc, *ret = NULL; 429 struct o2net_sock_container *sc, *ret = NULL;
378 struct page *page = NULL; 430 struct page *page = NULL;
431 int status = 0;
379 432
380 page = alloc_page(GFP_NOFS); 433 page = alloc_page(GFP_NOFS);
381 sc = kzalloc(sizeof(*sc), GFP_NOFS); 434 sc = kzalloc(sizeof(*sc), GFP_NOFS);
@@ -386,6 +439,13 @@ static struct o2net_sock_container *sc_alloc(struct o2nm_node *node)
386 o2nm_node_get(node); 439 o2nm_node_get(node);
387 sc->sc_node = node; 440 sc->sc_node = node;
388 441
442 /* pin the node item of the remote node */
443 status = o2nm_depend_item(&node->nd_item);
444 if (status) {
445 mlog_errno(status);
446 o2nm_node_put(node);
447 goto out;
448 }
389 INIT_WORK(&sc->sc_connect_work, o2net_sc_connect_completed); 449 INIT_WORK(&sc->sc_connect_work, o2net_sc_connect_completed);
390 INIT_WORK(&sc->sc_rx_work, o2net_rx_until_empty); 450 INIT_WORK(&sc->sc_rx_work, o2net_rx_until_empty);
391 INIT_WORK(&sc->sc_shutdown_work, o2net_shutdown_sc); 451 INIT_WORK(&sc->sc_shutdown_work, o2net_shutdown_sc);
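
sc_alloc() now pins the remote node's configfs item for the socket container's lifetime, and the failure path drops the node reference taken just before; sc_kref_release() correspondingly undepends before o2nm_node_put(), tearing down in reverse order of acquisition. A generic runnable sketch of that acquire/unwind discipline (all functions here are invented stand-ins, not the o2nm API):

#include <stdio.h>

struct node { int refs; int pinned; };

static void node_get(struct node *n) { n->refs++; }
static void node_put(struct node *n) { n->refs--; }

/* Stand-in for o2nm_depend_item(): may fail. */
static int depend_item(struct node *n, int fail)
{
	if (fail)
		return -1;
	n->pinned++;
	return 0;
}

static void undepend_item(struct node *n) { n->pinned--; }

static int sc_alloc_like(struct node *n, int fail)
{
	node_get(n);
	if (depend_item(n, fail)) {
		node_put(n);	/* unwind the ref taken just above */
		return -1;
	}
	return 0;
}

static void sc_release_like(struct node *n)
{
	undepend_item(n);	/* reverse order of acquisition */
	node_put(n);
}

int main(void)
{
	struct node n = { 0, 0 };
	if (sc_alloc_like(&n, 0) == 0)
		sc_release_like(&n);
	printf("refs=%d pinned=%d\n", n.refs, n.pinned);	/* both 0 */
	return (sc_alloc_like(&n, 1) == -1 && n.refs == 0) ? 0 : 1;
}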
@@ -546,7 +606,7 @@ static void o2net_data_ready(struct sock *sk, int bytes)
546 if (sk->sk_user_data) { 606 if (sk->sk_user_data) {
547 struct o2net_sock_container *sc = sk->sk_user_data; 607 struct o2net_sock_container *sc = sk->sk_user_data;
548 sclog(sc, "data_ready hit\n"); 608 sclog(sc, "data_ready hit\n");
549 do_gettimeofday(&sc->sc_tv_data_ready); 609 o2net_set_data_ready_time(sc);
550 o2net_sc_queue_work(sc, &sc->sc_rx_work); 610 o2net_sc_queue_work(sc, &sc->sc_rx_work);
551 ready = sc->sc_data_ready; 611 ready = sc->sc_data_ready;
552 } else { 612 } else {
@@ -1070,6 +1130,8 @@ int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec,
1070 o2net_set_nst_status_time(&nst); 1130 o2net_set_nst_status_time(&nst);
1071 wait_event(nsw.ns_wq, o2net_nsw_completed(nn, &nsw)); 1131 wait_event(nsw.ns_wq, o2net_nsw_completed(nn, &nsw));
1072 1132
1133 o2net_update_send_stats(&nst, sc);
1134
1073 /* Note that we avoid overwriting the callers status return 1135 /* Note that we avoid overwriting the callers status return
1074 * variable if a system error was reported on the other 1136 * variable if a system error was reported on the other
1075 * side. Callers beware. */ 1137 * side. Callers beware. */
@@ -1183,13 +1245,15 @@ static int o2net_process_message(struct o2net_sock_container *sc,
1183 if (syserr != O2NET_ERR_NONE) 1245 if (syserr != O2NET_ERR_NONE)
1184 goto out_respond; 1246 goto out_respond;
1185 1247
1186 do_gettimeofday(&sc->sc_tv_func_start); 1248 o2net_set_func_start_time(sc);
1187 sc->sc_msg_key = be32_to_cpu(hdr->key); 1249 sc->sc_msg_key = be32_to_cpu(hdr->key);
1188 sc->sc_msg_type = be16_to_cpu(hdr->msg_type); 1250 sc->sc_msg_type = be16_to_cpu(hdr->msg_type);
1189 handler_status = (nmh->nh_func)(hdr, sizeof(struct o2net_msg) + 1251 handler_status = (nmh->nh_func)(hdr, sizeof(struct o2net_msg) +
1190 be16_to_cpu(hdr->data_len), 1252 be16_to_cpu(hdr->data_len),
1191 nmh->nh_func_data, &ret_data); 1253 nmh->nh_func_data, &ret_data);
1192 do_gettimeofday(&sc->sc_tv_func_stop); 1254 o2net_set_func_stop_time(sc);
1255
1256 o2net_update_recv_stats(sc);
1193 1257
1194out_respond: 1258out_respond:
1195 /* this destroys the hdr, so don't use it after this */ 1259 /* this destroys the hdr, so don't use it after this */
@@ -1300,7 +1364,7 @@ static int o2net_advance_rx(struct o2net_sock_container *sc)
1300 size_t datalen; 1364 size_t datalen;
1301 1365
1302 sclog(sc, "receiving\n"); 1366 sclog(sc, "receiving\n");
1303 do_gettimeofday(&sc->sc_tv_advance_start); 1367 o2net_set_advance_start_time(sc);
1304 1368
1305 if (unlikely(sc->sc_handshake_ok == 0)) { 1369 if (unlikely(sc->sc_handshake_ok == 0)) {
1306 if(sc->sc_page_off < sizeof(struct o2net_handshake)) { 1370 if(sc->sc_page_off < sizeof(struct o2net_handshake)) {
@@ -1375,7 +1439,7 @@ static int o2net_advance_rx(struct o2net_sock_container *sc)
1375 1439
1376out: 1440out:
1377 sclog(sc, "ret = %d\n", ret); 1441 sclog(sc, "ret = %d\n", ret);
1378 do_gettimeofday(&sc->sc_tv_advance_stop); 1442 o2net_set_advance_stop_time(sc);
1379 return ret; 1443 return ret;
1380} 1444}
1381 1445
@@ -1475,27 +1539,28 @@ static void o2net_idle_timer(unsigned long data)
1475{ 1539{
1476 struct o2net_sock_container *sc = (struct o2net_sock_container *)data; 1540 struct o2net_sock_container *sc = (struct o2net_sock_container *)data;
1477 struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num); 1541 struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num);
1478 struct timeval now;
1479 1542
1480 do_gettimeofday(&now); 1543#ifdef CONFIG_DEBUG_FS
1544 ktime_t now = ktime_get();
1545#endif
1481 1546
1482 printk(KERN_NOTICE "o2net: connection to " SC_NODEF_FMT " has been idle for %u.%u " 1547 printk(KERN_NOTICE "o2net: connection to " SC_NODEF_FMT " has been idle for %u.%u "
1483 "seconds, shutting it down.\n", SC_NODEF_ARGS(sc), 1548 "seconds, shutting it down.\n", SC_NODEF_ARGS(sc),
1484 o2net_idle_timeout() / 1000, 1549 o2net_idle_timeout() / 1000,
1485 o2net_idle_timeout() % 1000); 1550 o2net_idle_timeout() % 1000);
1486 mlog(ML_NOTICE, "here are some times that might help debug the " 1551
1487 "situation: (tmr %ld.%ld now %ld.%ld dr %ld.%ld adv " 1552#ifdef CONFIG_DEBUG_FS
1488 "%ld.%ld:%ld.%ld func (%08x:%u) %ld.%ld:%ld.%ld)\n", 1553 mlog(ML_NOTICE, "Here are some times that might help debug the "
1489 sc->sc_tv_timer.tv_sec, (long) sc->sc_tv_timer.tv_usec, 1554 "situation: (Timer: %lld, Now %lld, DataReady %lld, Advance %lld-%lld, "
1490 now.tv_sec, (long) now.tv_usec, 1555 "Key 0x%08x, Func %u, FuncTime %lld-%lld)\n",
1491 sc->sc_tv_data_ready.tv_sec, (long) sc->sc_tv_data_ready.tv_usec, 1556 (long long)ktime_to_us(sc->sc_tv_timer), (long long)ktime_to_us(now),
1492 sc->sc_tv_advance_start.tv_sec, 1557 (long long)ktime_to_us(sc->sc_tv_data_ready),
1493 (long) sc->sc_tv_advance_start.tv_usec, 1558 (long long)ktime_to_us(sc->sc_tv_advance_start),
1494 sc->sc_tv_advance_stop.tv_sec, 1559 (long long)ktime_to_us(sc->sc_tv_advance_stop),
1495 (long) sc->sc_tv_advance_stop.tv_usec,
1496 sc->sc_msg_key, sc->sc_msg_type, 1560 sc->sc_msg_key, sc->sc_msg_type,
1497 sc->sc_tv_func_start.tv_sec, (long) sc->sc_tv_func_start.tv_usec, 1561 (long long)ktime_to_us(sc->sc_tv_func_start),
1498 sc->sc_tv_func_stop.tv_sec, (long) sc->sc_tv_func_stop.tv_usec); 1562 (long long)ktime_to_us(sc->sc_tv_func_stop));
1563#endif
1499 1564
1500 /* 1565 /*
1501 * Initialize the nn_timeout so that the next connection attempt 1566 * Initialize the nn_timeout so that the next connection attempt
@@ -1511,7 +1576,7 @@ static void o2net_sc_reset_idle_timer(struct o2net_sock_container *sc)
1511 o2net_sc_cancel_delayed_work(sc, &sc->sc_keepalive_work); 1576 o2net_sc_cancel_delayed_work(sc, &sc->sc_keepalive_work);
1512 o2net_sc_queue_delayed_work(sc, &sc->sc_keepalive_work, 1577 o2net_sc_queue_delayed_work(sc, &sc->sc_keepalive_work,
1513 msecs_to_jiffies(o2net_keepalive_delay())); 1578 msecs_to_jiffies(o2net_keepalive_delay()));
1514 do_gettimeofday(&sc->sc_tv_timer); 1579 o2net_set_sock_timer(sc);
1515 mod_timer(&sc->sc_idle_timeout, 1580 mod_timer(&sc->sc_idle_timeout,
1516 jiffies + msecs_to_jiffies(o2net_idle_timeout())); 1581 jiffies + msecs_to_jiffies(o2net_idle_timeout()));
1517} 1582}
diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h
index 15fdbdf9eb4b..4cbcb65784a3 100644
--- a/fs/ocfs2/cluster/tcp_internal.h
+++ b/fs/ocfs2/cluster/tcp_internal.h
@@ -166,18 +166,27 @@ struct o2net_sock_container {
166 /* original handlers for the sockets */ 166 /* original handlers for the sockets */
167 void (*sc_state_change)(struct sock *sk); 167 void (*sc_state_change)(struct sock *sk);
168 void (*sc_data_ready)(struct sock *sk, int bytes); 168 void (*sc_data_ready)(struct sock *sk, int bytes);
169#ifdef CONFIG_DEBUG_FS 169
170 struct list_head sc_net_debug_item;
171#endif
172 struct timeval sc_tv_timer;
173 struct timeval sc_tv_data_ready;
174 struct timeval sc_tv_advance_start;
175 struct timeval sc_tv_advance_stop;
176 struct timeval sc_tv_func_start;
177 struct timeval sc_tv_func_stop;
178 u32 sc_msg_key; 170 u32 sc_msg_key;
179 u16 sc_msg_type; 171 u16 sc_msg_type;
180 172
173#ifdef CONFIG_DEBUG_FS
174 struct list_head sc_net_debug_item;
175 ktime_t sc_tv_timer;
176 ktime_t sc_tv_data_ready;
177 ktime_t sc_tv_advance_start;
178 ktime_t sc_tv_advance_stop;
179 ktime_t sc_tv_func_start;
180 ktime_t sc_tv_func_stop;
181#endif
182#ifdef CONFIG_OCFS2_FS_STATS
183 ktime_t sc_tv_acquiry_total;
184 ktime_t sc_tv_send_total;
185 ktime_t sc_tv_status_total;
186 u32 sc_send_count;
187 u32 sc_recv_count;
188 ktime_t sc_tv_process_total;
189#endif
181 struct mutex sc_send_lock; 190 struct mutex sc_send_lock;
182}; 191};
183 192
@@ -220,9 +229,9 @@ struct o2net_send_tracking {
220 u32 st_msg_type; 229 u32 st_msg_type;
221 u32 st_msg_key; 230 u32 st_msg_key;
222 u8 st_node; 231 u8 st_node;
223 struct timeval st_sock_time; 232 ktime_t st_sock_time;
224 struct timeval st_send_time; 233 ktime_t st_send_time;
225 struct timeval st_status_time; 234 ktime_t st_status_time;
226}; 235};
227#else 236#else
228struct o2net_send_tracking { 237struct o2net_send_tracking {
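
Switching the st_*_time and sc_tv_* fields from struct timeval to ktime_t trades two wall-clock fields per stamp for a single monotonic value, and all the arithmetic reduces to integer math on nanoseconds. A tiny model of the two helpers used throughout this patch; note ktime_t's exact in-kernel representation has varied, so this treats it simply as signed 64-bit nanoseconds, which is its arithmetic behavior:

#include <stdint.h>
#include <stdio.h>

typedef int64_t ktime_model_t;	/* ktime_t modeled as signed ns */

static ktime_model_t ktime_sub_model(ktime_model_t a, ktime_model_t b)
{
	return a - b;		/* ktime_sub(): plain ns subtraction */
}

static int64_t ktime_to_us_model(ktime_model_t k)
{
	return k / 1000;	/* ktime_to_us(): ns -> us */
}

int main(void)
{
	ktime_model_t start = 5000000, stop = 7500000;	/* ns */
	printf("%lld usecs\n",
	       (long long)ktime_to_us_model(ktime_sub_model(stop, start)));
	return 0;	/* prints 2500 */
}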
diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c
index f44999156839..3a3ed4bb794b 100644
--- a/fs/ocfs2/dlm/dlmast.c
+++ b/fs/ocfs2/dlm/dlmast.c
@@ -90,19 +90,29 @@ static int dlm_should_cancel_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
90 90
91void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock) 91void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
92{ 92{
93 mlog_entry_void(); 93 struct dlm_lock_resource *res;
94 94
95 BUG_ON(!dlm); 95 BUG_ON(!dlm);
96 BUG_ON(!lock); 96 BUG_ON(!lock);
97 97
98 res = lock->lockres;
99
98 assert_spin_locked(&dlm->ast_lock); 100 assert_spin_locked(&dlm->ast_lock);
101
99 if (!list_empty(&lock->ast_list)) { 102 if (!list_empty(&lock->ast_list)) {
100 mlog(ML_ERROR, "ast list not empty!! pending=%d, newlevel=%d\n", 103 mlog(ML_ERROR, "%s: res %.*s, lock %u:%llu, "
104 "AST list not empty, pending %d, newlevel %d\n",
105 dlm->name, res->lockname.len, res->lockname.name,
106 dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
107 dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
101 lock->ast_pending, lock->ml.type); 108 lock->ast_pending, lock->ml.type);
102 BUG(); 109 BUG();
103 } 110 }
104 if (lock->ast_pending) 111 if (lock->ast_pending)
105 mlog(0, "lock has an ast getting flushed right now\n"); 112 mlog(0, "%s: res %.*s, lock %u:%llu, AST getting flushed\n",
113 dlm->name, res->lockname.len, res->lockname.name,
114 dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
115 dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)));
106 116
107 /* putting lock on list, add a ref */ 117 /* putting lock on list, add a ref */
108 dlm_lock_get(lock); 118 dlm_lock_get(lock);
@@ -110,9 +120,10 @@ void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
110 120
111 /* check to see if this ast obsoletes the bast */ 121 /* check to see if this ast obsoletes the bast */
112 if (dlm_should_cancel_bast(dlm, lock)) { 122 if (dlm_should_cancel_bast(dlm, lock)) {
113 struct dlm_lock_resource *res = lock->lockres; 123 mlog(0, "%s: res %.*s, lock %u:%llu, Cancelling BAST\n",
114 mlog(0, "%s: cancelling bast for %.*s\n", 124 dlm->name, res->lockname.len, res->lockname.name,
115 dlm->name, res->lockname.len, res->lockname.name); 125 dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
126 dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)));
116 lock->bast_pending = 0; 127 lock->bast_pending = 0;
117 list_del_init(&lock->bast_list); 128 list_del_init(&lock->bast_list);
118 lock->ml.highest_blocked = LKM_IVMODE; 129 lock->ml.highest_blocked = LKM_IVMODE;
@@ -134,8 +145,6 @@ void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
134 145
135void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock) 146void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
136{ 147{
137 mlog_entry_void();
138
139 BUG_ON(!dlm); 148 BUG_ON(!dlm);
140 BUG_ON(!lock); 149 BUG_ON(!lock);
141 150
@@ -147,15 +156,21 @@ void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
147 156
148void __dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock) 157void __dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
149{ 158{
150 mlog_entry_void(); 159 struct dlm_lock_resource *res;
151 160
152 BUG_ON(!dlm); 161 BUG_ON(!dlm);
153 BUG_ON(!lock); 162 BUG_ON(!lock);
163
154 assert_spin_locked(&dlm->ast_lock); 164 assert_spin_locked(&dlm->ast_lock);
155 165
166 res = lock->lockres;
167
156 BUG_ON(!list_empty(&lock->bast_list)); 168 BUG_ON(!list_empty(&lock->bast_list));
157 if (lock->bast_pending) 169 if (lock->bast_pending)
158 mlog(0, "lock has a bast getting flushed right now\n"); 170 mlog(0, "%s: res %.*s, lock %u:%llu, BAST getting flushed\n",
171 dlm->name, res->lockname.len, res->lockname.name,
172 dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
173 dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)));
159 174
160 /* putting lock on list, add a ref */ 175 /* putting lock on list, add a ref */
161 dlm_lock_get(lock); 176 dlm_lock_get(lock);
@@ -167,8 +182,6 @@ void __dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
167 182
168void dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock) 183void dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
169{ 184{
170 mlog_entry_void();
171
172 BUG_ON(!dlm); 185 BUG_ON(!dlm);
173 BUG_ON(!lock); 186 BUG_ON(!lock);
174 187
@@ -213,7 +226,10 @@ void dlm_do_local_ast(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
213 dlm_astlockfunc_t *fn; 226 dlm_astlockfunc_t *fn;
214 struct dlm_lockstatus *lksb; 227 struct dlm_lockstatus *lksb;
215 228
216 mlog_entry_void(); 229 mlog(0, "%s: res %.*s, lock %u:%llu, Local AST\n", dlm->name,
230 res->lockname.len, res->lockname.name,
231 dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
232 dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)));
217 233
218 lksb = lock->lksb; 234 lksb = lock->lksb;
219 fn = lock->ast; 235 fn = lock->ast;
@@ -231,7 +247,10 @@ int dlm_do_remote_ast(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
231 struct dlm_lockstatus *lksb; 247 struct dlm_lockstatus *lksb;
232 int lksbflags; 248 int lksbflags;
233 249
234 mlog_entry_void(); 250 mlog(0, "%s: res %.*s, lock %u:%llu, Remote AST\n", dlm->name,
251 res->lockname.len, res->lockname.name,
252 dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
253 dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)));
235 254
236 lksb = lock->lksb; 255 lksb = lock->lksb;
237 BUG_ON(lock->ml.node == dlm->node_num); 256 BUG_ON(lock->ml.node == dlm->node_num);
@@ -250,9 +269,14 @@ void dlm_do_local_bast(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
250{ 269{
251 dlm_bastlockfunc_t *fn = lock->bast; 270 dlm_bastlockfunc_t *fn = lock->bast;
252 271
253 mlog_entry_void();
254 BUG_ON(lock->ml.node != dlm->node_num); 272 BUG_ON(lock->ml.node != dlm->node_num);
255 273
274 mlog(0, "%s: res %.*s, lock %u:%llu, Local BAST, blocked %d\n",
275 dlm->name, res->lockname.len, res->lockname.name,
276 dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
277 dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
278 blocked_type);
279
256 (*fn)(lock->astdata, blocked_type); 280 (*fn)(lock->astdata, blocked_type);
257} 281}
258 282
@@ -332,7 +356,8 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data,
332 /* cannot get a proxy ast message if this node owns it */ 356 /* cannot get a proxy ast message if this node owns it */
333 BUG_ON(res->owner == dlm->node_num); 357 BUG_ON(res->owner == dlm->node_num);
334 358
335 mlog(0, "lockres %.*s\n", res->lockname.len, res->lockname.name); 359 mlog(0, "%s: res %.*s\n", dlm->name, res->lockname.len,
360 res->lockname.name);
336 361
337 spin_lock(&res->spinlock); 362 spin_lock(&res->spinlock);
338 if (res->state & DLM_LOCK_RES_RECOVERING) { 363 if (res->state & DLM_LOCK_RES_RECOVERING) {
@@ -382,8 +407,12 @@ do_ast:
382 if (past->type == DLM_AST) { 407 if (past->type == DLM_AST) {
383 /* do not alter lock refcount. switching lists. */ 408 /* do not alter lock refcount. switching lists. */
384 list_move_tail(&lock->list, &res->granted); 409 list_move_tail(&lock->list, &res->granted);
385 mlog(0, "ast: Adding to granted list... type=%d, " 410 mlog(0, "%s: res %.*s, lock %u:%llu, Granted type %d => %d\n",
386 "convert_type=%d\n", lock->ml.type, lock->ml.convert_type); 411 dlm->name, res->lockname.len, res->lockname.name,
412 dlm_get_lock_cookie_node(be64_to_cpu(cookie)),
413 dlm_get_lock_cookie_seq(be64_to_cpu(cookie)),
414 lock->ml.type, lock->ml.convert_type);
415
387 if (lock->ml.convert_type != LKM_IVMODE) { 416 if (lock->ml.convert_type != LKM_IVMODE) {
388 lock->ml.type = lock->ml.convert_type; 417 lock->ml.type = lock->ml.convert_type;
389 lock->ml.convert_type = LKM_IVMODE; 418 lock->ml.convert_type = LKM_IVMODE;
@@ -426,9 +455,9 @@ int dlm_send_proxy_ast_msg(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
426 size_t veclen = 1; 455 size_t veclen = 1;
427 int status; 456 int status;
428 457
429 mlog_entry("res %.*s, to=%u, type=%d, blocked_type=%d\n", 458 mlog(0, "%s: res %.*s, to %u, type %d, blocked_type %d\n", dlm->name,
430 res->lockname.len, res->lockname.name, lock->ml.node, 459 res->lockname.len, res->lockname.name, lock->ml.node, msg_type,
431 msg_type, blocked_type); 460 blocked_type);
432 461
433 memset(&past, 0, sizeof(struct dlm_proxy_ast)); 462 memset(&past, 0, sizeof(struct dlm_proxy_ast));
434 past.node_idx = dlm->node_num; 463 past.node_idx = dlm->node_num;
@@ -441,7 +470,6 @@ int dlm_send_proxy_ast_msg(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
441 vec[0].iov_len = sizeof(struct dlm_proxy_ast); 470 vec[0].iov_len = sizeof(struct dlm_proxy_ast);
442 vec[0].iov_base = &past; 471 vec[0].iov_base = &past;
443 if (flags & DLM_LKSB_GET_LVB) { 472 if (flags & DLM_LKSB_GET_LVB) {
444 mlog(0, "returning requested LVB data\n");
445 be32_add_cpu(&past.flags, LKM_GET_LVB); 473 be32_add_cpu(&past.flags, LKM_GET_LVB);
446 vec[1].iov_len = DLM_LVB_LEN; 474 vec[1].iov_len = DLM_LVB_LEN;
447 vec[1].iov_base = lock->lksb->lvb; 475 vec[1].iov_base = lock->lksb->lvb;
@@ -451,8 +479,8 @@ int dlm_send_proxy_ast_msg(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
451 ret = o2net_send_message_vec(DLM_PROXY_AST_MSG, dlm->key, vec, veclen, 479 ret = o2net_send_message_vec(DLM_PROXY_AST_MSG, dlm->key, vec, veclen,
452 lock->ml.node, &status); 480 lock->ml.node, &status);
453 if (ret < 0) 481 if (ret < 0)
454 mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to " 482 mlog(ML_ERROR, "%s: res %.*s, error %d send AST to node %u\n",
455 "node %u\n", ret, DLM_PROXY_AST_MSG, dlm->key, 483 dlm->name, res->lockname.len, res->lockname.name, ret,
456 lock->ml.node); 484 lock->ml.node);
457 else { 485 else {
458 if (status == DLM_RECOVERING) { 486 if (status == DLM_RECOVERING) {
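
The reworked dlmast.c messages identify every lock as "node:seq", decoded from the 64-bit cookie by dlm_get_lock_cookie_node()/dlm_get_lock_cookie_seq(). Assuming the usual ocfs2 packing (owning node number in the top byte, per-node sequence in the remaining bits; dlmcommon.h holds the authoritative macros), the split looks like this illustrative sketch:

#include <stdint.h>
#include <stdio.h>

/* Illustrative only: mirrors the assumed cookie layout, node number
 * in the high byte, sequence in the low 56 bits. */
static uint8_t cookie_node(uint64_t cookie)
{
	return (uint8_t)(cookie >> 56);
}

static uint64_t cookie_seq(uint64_t cookie)
{
	return cookie & ((1ULL << 56) - 1);
}

int main(void)
{
	uint64_t cookie = ((uint64_t)3 << 56) | 42;	/* node 3, seq 42 */
	printf("lock %u:%llu\n", cookie_node(cookie),
	       (unsigned long long)cookie_seq(cookie));
	return 0;
}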
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index b36d0bf77a5a..4bdf7baee344 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -50,10 +50,10 @@
50#define dlm_lockid_hash(_n, _l) full_name_hash(_n, _l) 50#define dlm_lockid_hash(_n, _l) full_name_hash(_n, _l)
51 51
52enum dlm_mle_type { 52enum dlm_mle_type {
53 DLM_MLE_BLOCK, 53 DLM_MLE_BLOCK = 0,
54 DLM_MLE_MASTER, 54 DLM_MLE_MASTER = 1,
55 DLM_MLE_MIGRATION, 55 DLM_MLE_MIGRATION = 2,
56 DLM_MLE_NUM_TYPES 56 DLM_MLE_NUM_TYPES = 3,
57}; 57};
58 58
59struct dlm_master_list_entry { 59struct dlm_master_list_entry {
@@ -82,8 +82,8 @@ struct dlm_master_list_entry {
82 82
83enum dlm_ast_type { 83enum dlm_ast_type {
84 DLM_AST = 0, 84 DLM_AST = 0,
85 DLM_BAST, 85 DLM_BAST = 1,
86 DLM_ASTUNLOCK 86 DLM_ASTUNLOCK = 2,
87}; 87};
88 88
89 89
@@ -119,9 +119,9 @@ struct dlm_recovery_ctxt
119 119
120enum dlm_ctxt_state { 120enum dlm_ctxt_state {
121 DLM_CTXT_NEW = 0, 121 DLM_CTXT_NEW = 0,
122 DLM_CTXT_JOINED, 122 DLM_CTXT_JOINED = 1,
123 DLM_CTXT_IN_SHUTDOWN, 123 DLM_CTXT_IN_SHUTDOWN = 2,
124 DLM_CTXT_LEAVING, 124 DLM_CTXT_LEAVING = 3,
125}; 125};
126 126
127struct dlm_ctxt 127struct dlm_ctxt
@@ -388,8 +388,8 @@ struct dlm_lock
388 388
389enum dlm_lockres_list { 389enum dlm_lockres_list {
390 DLM_GRANTED_LIST = 0, 390 DLM_GRANTED_LIST = 0,
391 DLM_CONVERTING_LIST, 391 DLM_CONVERTING_LIST = 1,
392 DLM_BLOCKED_LIST 392 DLM_BLOCKED_LIST = 2,
393}; 393};
394 394
395static inline int dlm_lvb_is_empty(char *lvb) 395static inline int dlm_lvb_is_empty(char *lvb)
@@ -427,27 +427,27 @@ struct dlm_node_iter
427 427
428 428
429enum { 429enum {
430 DLM_MASTER_REQUEST_MSG = 500, 430 DLM_MASTER_REQUEST_MSG = 500,
431 DLM_UNUSED_MSG1, /* 501 */ 431 DLM_UNUSED_MSG1 = 501,
432 DLM_ASSERT_MASTER_MSG, /* 502 */ 432 DLM_ASSERT_MASTER_MSG = 502,
433 DLM_CREATE_LOCK_MSG, /* 503 */ 433 DLM_CREATE_LOCK_MSG = 503,
434 DLM_CONVERT_LOCK_MSG, /* 504 */ 434 DLM_CONVERT_LOCK_MSG = 504,
435 DLM_PROXY_AST_MSG, /* 505 */ 435 DLM_PROXY_AST_MSG = 505,
436 DLM_UNLOCK_LOCK_MSG, /* 506 */ 436 DLM_UNLOCK_LOCK_MSG = 506,
437 DLM_DEREF_LOCKRES_MSG, /* 507 */ 437 DLM_DEREF_LOCKRES_MSG = 507,
438 DLM_MIGRATE_REQUEST_MSG, /* 508 */ 438 DLM_MIGRATE_REQUEST_MSG = 508,
439 DLM_MIG_LOCKRES_MSG, /* 509 */ 439 DLM_MIG_LOCKRES_MSG = 509,
440 DLM_QUERY_JOIN_MSG, /* 510 */ 440 DLM_QUERY_JOIN_MSG = 510,
441 DLM_ASSERT_JOINED_MSG, /* 511 */ 441 DLM_ASSERT_JOINED_MSG = 511,
442 DLM_CANCEL_JOIN_MSG, /* 512 */ 442 DLM_CANCEL_JOIN_MSG = 512,
443 DLM_EXIT_DOMAIN_MSG, /* 513 */ 443 DLM_EXIT_DOMAIN_MSG = 513,
444 DLM_MASTER_REQUERY_MSG, /* 514 */ 444 DLM_MASTER_REQUERY_MSG = 514,
445 DLM_LOCK_REQUEST_MSG, /* 515 */ 445 DLM_LOCK_REQUEST_MSG = 515,
446 DLM_RECO_DATA_DONE_MSG, /* 516 */ 446 DLM_RECO_DATA_DONE_MSG = 516,
447 DLM_BEGIN_RECO_MSG, /* 517 */ 447 DLM_BEGIN_RECO_MSG = 517,
448 DLM_FINALIZE_RECO_MSG, /* 518 */ 448 DLM_FINALIZE_RECO_MSG = 518,
449 DLM_QUERY_REGION, /* 519 */ 449 DLM_QUERY_REGION = 519,
450 DLM_QUERY_NODEINFO, /* 520 */ 450 DLM_QUERY_NODEINFO = 520,
451}; 451};
452 452
453struct dlm_reco_node_data 453struct dlm_reco_node_data
@@ -460,19 +460,19 @@ struct dlm_reco_node_data
460enum { 460enum {
461 DLM_RECO_NODE_DATA_DEAD = -1, 461 DLM_RECO_NODE_DATA_DEAD = -1,
462 DLM_RECO_NODE_DATA_INIT = 0, 462 DLM_RECO_NODE_DATA_INIT = 0,
463 DLM_RECO_NODE_DATA_REQUESTING, 463 DLM_RECO_NODE_DATA_REQUESTING = 1,
464 DLM_RECO_NODE_DATA_REQUESTED, 464 DLM_RECO_NODE_DATA_REQUESTED = 2,
465 DLM_RECO_NODE_DATA_RECEIVING, 465 DLM_RECO_NODE_DATA_RECEIVING = 3,
466 DLM_RECO_NODE_DATA_DONE, 466 DLM_RECO_NODE_DATA_DONE = 4,
467 DLM_RECO_NODE_DATA_FINALIZE_SENT, 467 DLM_RECO_NODE_DATA_FINALIZE_SENT = 5,
468}; 468};
469 469
470 470
471enum { 471enum {
472 DLM_MASTER_RESP_NO = 0, 472 DLM_MASTER_RESP_NO = 0,
473 DLM_MASTER_RESP_YES, 473 DLM_MASTER_RESP_YES = 1,
474 DLM_MASTER_RESP_MAYBE, 474 DLM_MASTER_RESP_MAYBE = 2,
475 DLM_MASTER_RESP_ERROR 475 DLM_MASTER_RESP_ERROR = 3,
476}; 476};
477 477
478 478
@@ -649,9 +649,9 @@ struct dlm_proxy_ast
649#define DLM_MOD_KEY (0x666c6172) 649#define DLM_MOD_KEY (0x666c6172)
650enum dlm_query_join_response_code { 650enum dlm_query_join_response_code {
651 JOIN_DISALLOW = 0, 651 JOIN_DISALLOW = 0,
652 JOIN_OK, 652 JOIN_OK = 1,
653 JOIN_OK_NO_MAP, 653 JOIN_OK_NO_MAP = 2,
654 JOIN_PROTOCOL_MISMATCH, 654 JOIN_PROTOCOL_MISMATCH = 3,
655}; 655};
656 656
657struct dlm_query_join_packet { 657struct dlm_query_join_packet {
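
Pinning every dlm message number and enum value explicitly documents the wire protocol in the header itself and guards against an insertion silently renumbering everything after it, which is what implicit enumeration allows. A small illustration of the failure mode the explicit form prevents (the MSG_* names are invented):

#include <stdio.h>

/* Implicit numbering: adding a member in the middle shifts everything
 * after it, breaking interop with peers built from the older header. */
enum msg_implicit {
	MSG_A = 500,
	/* MSG_NEW, */	/* uncommenting this renumbers MSG_B and MSG_C */
	MSG_B,		/* 501 today, 502 after the insertion */
	MSG_C,		/* 502 today, 503 after the insertion */
};

/* Explicit numbering: an insertion must pick a fresh value; existing
 * constants cannot drift. */
enum msg_explicit {
	XMSG_A = 500,
	XMSG_B = 501,
	XMSG_C = 502,
};

int main(void)
{
	printf("MSG_B=%d XMSG_B=%d\n", MSG_B, XMSG_B);
	return 0;
}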
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index 272ec8631a51..04a32be0aeb9 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -370,92 +370,46 @@ static void dlm_debug_get(struct dlm_debug_ctxt *dc)
370 kref_get(&dc->debug_refcnt); 370 kref_get(&dc->debug_refcnt);
371} 371}
372 372
373static struct debug_buffer *debug_buffer_allocate(void) 373static int debug_release(struct inode *inode, struct file *file)
374{ 374{
375 struct debug_buffer *db = NULL; 375 free_page((unsigned long)file->private_data);
376 376 return 0;
377 db = kzalloc(sizeof(struct debug_buffer), GFP_KERNEL);
378 if (!db)
379 goto bail;
380
381 db->len = PAGE_SIZE;
382 db->buf = kmalloc(db->len, GFP_KERNEL);
383 if (!db->buf)
384 goto bail;
385
386 return db;
387bail:
388 kfree(db);
389 return NULL;
390}
391
392static ssize_t debug_buffer_read(struct file *file, char __user *buf,
393 size_t nbytes, loff_t *ppos)
394{
395 struct debug_buffer *db = file->private_data;
396
397 return simple_read_from_buffer(buf, nbytes, ppos, db->buf, db->len);
398}
399
400static loff_t debug_buffer_llseek(struct file *file, loff_t off, int whence)
401{
402 struct debug_buffer *db = file->private_data;
403 loff_t new = -1;
404
405 switch (whence) {
406 case 0:
407 new = off;
408 break;
409 case 1:
410 new = file->f_pos + off;
411 break;
412 }
413
414 if (new < 0 || new > db->len)
415 return -EINVAL;
416
417 return (file->f_pos = new);
418} 377}
419 378
420static int debug_buffer_release(struct inode *inode, struct file *file) 379static ssize_t debug_read(struct file *file, char __user *buf,
380 size_t nbytes, loff_t *ppos)
421{ 381{
422 struct debug_buffer *db = file->private_data; 382 return simple_read_from_buffer(buf, nbytes, ppos, file->private_data,
423 383 i_size_read(file->f_mapping->host));
424 if (db)
425 kfree(db->buf);
426 kfree(db);
427
428 return 0;
429} 384}
430/* end - util funcs */ 385/* end - util funcs */
431 386
432/* begin - purge list funcs */ 387/* begin - purge list funcs */
433static int debug_purgelist_print(struct dlm_ctxt *dlm, struct debug_buffer *db) 388static int debug_purgelist_print(struct dlm_ctxt *dlm, char *buf, int len)
434{ 389{
435 struct dlm_lock_resource *res; 390 struct dlm_lock_resource *res;
436 int out = 0; 391 int out = 0;
437 unsigned long total = 0; 392 unsigned long total = 0;
438 393
439 out += snprintf(db->buf + out, db->len - out, 394 out += snprintf(buf + out, len - out,
440 "Dumping Purgelist for Domain: %s\n", dlm->name); 395 "Dumping Purgelist for Domain: %s\n", dlm->name);
441 396
442 spin_lock(&dlm->spinlock); 397 spin_lock(&dlm->spinlock);
443 list_for_each_entry(res, &dlm->purge_list, purge) { 398 list_for_each_entry(res, &dlm->purge_list, purge) {
444 ++total; 399 ++total;
445 if (db->len - out < 100) 400 if (len - out < 100)
446 continue; 401 continue;
447 spin_lock(&res->spinlock); 402 spin_lock(&res->spinlock);
448 out += stringify_lockname(res->lockname.name, 403 out += stringify_lockname(res->lockname.name,
449 res->lockname.len, 404 res->lockname.len,
450 db->buf + out, db->len - out); 405 buf + out, len - out);
451 out += snprintf(db->buf + out, db->len - out, "\t%ld\n", 406 out += snprintf(buf + out, len - out, "\t%ld\n",
452 (jiffies - res->last_used)/HZ); 407 (jiffies - res->last_used)/HZ);
453 spin_unlock(&res->spinlock); 408 spin_unlock(&res->spinlock);
454 } 409 }
455 spin_unlock(&dlm->spinlock); 410 spin_unlock(&dlm->spinlock);
456 411
457 out += snprintf(db->buf + out, db->len - out, 412 out += snprintf(buf + out, len - out, "Total on list: %ld\n", total);
458 "Total on list: %ld\n", total);
459 413
460 return out; 414 return out;
461} 415}
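
The dlmdebug rewrite drops struct debug_buffer for a bare zeroed page: each open() formats into it with the out += snprintf(buf + out, len - out, ...) idiom, records are skipped once remaining space runs low, the final length is published via i_size_write(), and reads become simple_read_from_buffer() over the page. A userspace sketch of the accumulation idiom (PAGE_SZ and the record format are invented; the low-space guard mirrors the one in debug_purgelist_print()):

#include <stdio.h>

#define PAGE_SZ 4096

/* Format a sequence of records into one fixed buffer, skipping records
 * once space runs low, the same guard the purgelist dump uses. */
static int dump(char *buf, int len)
{
	int out = 0;

	out += snprintf(buf + out, len - out, "Dumping list\n");
	for (int i = 0; i < 200; i++) {
		if (len - out < 100)	/* low-space guard */
			continue;
		out += snprintf(buf + out, len - out, "res%03d\t%d\n", i, i);
	}
	out += snprintf(buf + out, len - out, "Total: 200\n");
	return out;		/* becomes the i_size readers see */
}

int main(void)
{
	static char page[PAGE_SZ];
	int size = dump(page, PAGE_SZ - 1);
	fwrite(page, 1, (size_t)size, stdout);
	printf("-- %d bytes --\n", size);
	return 0;
}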
@@ -463,15 +417,15 @@ static int debug_purgelist_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
463static int debug_purgelist_open(struct inode *inode, struct file *file) 417static int debug_purgelist_open(struct inode *inode, struct file *file)
464{ 418{
465 struct dlm_ctxt *dlm = inode->i_private; 419 struct dlm_ctxt *dlm = inode->i_private;
466 struct debug_buffer *db; 420 char *buf = NULL;
467 421
468 db = debug_buffer_allocate(); 422 buf = (char *) get_zeroed_page(GFP_NOFS);
469 if (!db) 423 if (!buf)
470 goto bail; 424 goto bail;
471 425
472 db->len = debug_purgelist_print(dlm, db); 426 i_size_write(inode, debug_purgelist_print(dlm, buf, PAGE_SIZE - 1));
473 427
474 file->private_data = db; 428 file->private_data = buf;
475 429
476 return 0; 430 return 0;
477bail: 431bail:
@@ -480,14 +434,14 @@ bail:
480 434
481static const struct file_operations debug_purgelist_fops = { 435static const struct file_operations debug_purgelist_fops = {
482 .open = debug_purgelist_open, 436 .open = debug_purgelist_open,
483 .release = debug_buffer_release, 437 .release = debug_release,
484 .read = debug_buffer_read, 438 .read = debug_read,
485 .llseek = debug_buffer_llseek, 439 .llseek = generic_file_llseek,
486}; 440};
487/* end - purge list funcs */ 441/* end - purge list funcs */
488 442
489/* begin - debug mle funcs */ 443/* begin - debug mle funcs */
490static int debug_mle_print(struct dlm_ctxt *dlm, struct debug_buffer *db) 444static int debug_mle_print(struct dlm_ctxt *dlm, char *buf, int len)
491{ 445{
492 struct dlm_master_list_entry *mle; 446 struct dlm_master_list_entry *mle;
493 struct hlist_head *bucket; 447 struct hlist_head *bucket;
@@ -495,7 +449,7 @@ static int debug_mle_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
495 int i, out = 0; 449 int i, out = 0;
496 unsigned long total = 0, longest = 0, bucket_count = 0; 450 unsigned long total = 0, longest = 0, bucket_count = 0;
497 451
498 out += snprintf(db->buf + out, db->len - out, 452 out += snprintf(buf + out, len - out,
499 "Dumping MLEs for Domain: %s\n", dlm->name); 453 "Dumping MLEs for Domain: %s\n", dlm->name);
500 454
501 spin_lock(&dlm->master_lock); 455 spin_lock(&dlm->master_lock);
@@ -506,16 +460,16 @@ static int debug_mle_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
506 master_hash_node); 460 master_hash_node);
507 ++total; 461 ++total;
508 ++bucket_count; 462 ++bucket_count;
509 if (db->len - out < 200) 463 if (len - out < 200)
510 continue; 464 continue;
511 out += dump_mle(mle, db->buf + out, db->len - out); 465 out += dump_mle(mle, buf + out, len - out);
512 } 466 }
513 longest = max(longest, bucket_count); 467 longest = max(longest, bucket_count);
514 bucket_count = 0; 468 bucket_count = 0;
515 } 469 }
516 spin_unlock(&dlm->master_lock); 470 spin_unlock(&dlm->master_lock);
517 471
518 out += snprintf(db->buf + out, db->len - out, 472 out += snprintf(buf + out, len - out,
519 "Total: %ld, Longest: %ld\n", total, longest); 473 "Total: %ld, Longest: %ld\n", total, longest);
520 return out; 474 return out;
521} 475}
@@ -523,15 +477,15 @@ static int debug_mle_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
523static int debug_mle_open(struct inode *inode, struct file *file) 477static int debug_mle_open(struct inode *inode, struct file *file)
524{ 478{
525 struct dlm_ctxt *dlm = inode->i_private; 479 struct dlm_ctxt *dlm = inode->i_private;
526 struct debug_buffer *db; 480 char *buf = NULL;
527 481
528 db = debug_buffer_allocate(); 482 buf = (char *) get_zeroed_page(GFP_NOFS);
529 if (!db) 483 if (!buf)
530 goto bail; 484 goto bail;
531 485
532 db->len = debug_mle_print(dlm, db); 486 i_size_write(inode, debug_mle_print(dlm, buf, PAGE_SIZE - 1));
533 487
534 file->private_data = db; 488 file->private_data = buf;
535 489
536 return 0; 490 return 0;
537bail: 491bail:
@@ -540,9 +494,9 @@ bail:
540 494
541static const struct file_operations debug_mle_fops = { 495static const struct file_operations debug_mle_fops = {
542 .open = debug_mle_open, 496 .open = debug_mle_open,
543 .release = debug_buffer_release, 497 .release = debug_release,
544 .read = debug_buffer_read, 498 .read = debug_read,
545 .llseek = debug_buffer_llseek, 499 .llseek = generic_file_llseek,
546}; 500};
547 501
548/* end - debug mle funcs */ 502/* end - debug mle funcs */
@@ -757,7 +711,7 @@ static const struct file_operations debug_lockres_fops = {
757/* end - debug lockres funcs */ 711/* end - debug lockres funcs */
758 712
759/* begin - debug state funcs */ 713/* begin - debug state funcs */
760static int debug_state_print(struct dlm_ctxt *dlm, struct debug_buffer *db) 714static int debug_state_print(struct dlm_ctxt *dlm, char *buf, int len)
761{ 715{
762 int out = 0; 716 int out = 0;
763 struct dlm_reco_node_data *node; 717 struct dlm_reco_node_data *node;
@@ -781,35 +735,35 @@ static int debug_state_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
781 } 735 }
782 736
783 /* Domain: xxxxxxxxxx Key: 0xdfbac769 */ 737 /* Domain: xxxxxxxxxx Key: 0xdfbac769 */
784 out += snprintf(db->buf + out, db->len - out, 738 out += snprintf(buf + out, len - out,
785 "Domain: %s Key: 0x%08x Protocol: %d.%d\n", 739 "Domain: %s Key: 0x%08x Protocol: %d.%d\n",
786 dlm->name, dlm->key, dlm->dlm_locking_proto.pv_major, 740 dlm->name, dlm->key, dlm->dlm_locking_proto.pv_major,
787 dlm->dlm_locking_proto.pv_minor); 741 dlm->dlm_locking_proto.pv_minor);
788 742
789 /* Thread Pid: xxx Node: xxx State: xxxxx */ 743 /* Thread Pid: xxx Node: xxx State: xxxxx */
790 out += snprintf(db->buf + out, db->len - out, 744 out += snprintf(buf + out, len - out,
791 "Thread Pid: %d Node: %d State: %s\n", 745 "Thread Pid: %d Node: %d State: %s\n",
792 dlm->dlm_thread_task->pid, dlm->node_num, state); 746 task_pid_nr(dlm->dlm_thread_task), dlm->node_num, state);
793 747
794 /* Number of Joins: xxx Joining Node: xxx */ 748 /* Number of Joins: xxx Joining Node: xxx */
795 out += snprintf(db->buf + out, db->len - out, 749 out += snprintf(buf + out, len - out,
796 "Number of Joins: %d Joining Node: %d\n", 750 "Number of Joins: %d Joining Node: %d\n",
797 dlm->num_joins, dlm->joining_node); 751 dlm->num_joins, dlm->joining_node);
798 752
799 /* Domain Map: xx xx xx */ 753 /* Domain Map: xx xx xx */
800 out += snprintf(db->buf + out, db->len - out, "Domain Map: "); 754 out += snprintf(buf + out, len - out, "Domain Map: ");
801 out += stringify_nodemap(dlm->domain_map, O2NM_MAX_NODES, 755 out += stringify_nodemap(dlm->domain_map, O2NM_MAX_NODES,
802 db->buf + out, db->len - out); 756 buf + out, len - out);
803 out += snprintf(db->buf + out, db->len - out, "\n"); 757 out += snprintf(buf + out, len - out, "\n");
804 758
805 /* Live Map: xx xx xx */ 759 /* Live Map: xx xx xx */
806 out += snprintf(db->buf + out, db->len - out, "Live Map: "); 760 out += snprintf(buf + out, len - out, "Live Map: ");
807 out += stringify_nodemap(dlm->live_nodes_map, O2NM_MAX_NODES, 761 out += stringify_nodemap(dlm->live_nodes_map, O2NM_MAX_NODES,
808 db->buf + out, db->len - out); 762 buf + out, len - out);
809 out += snprintf(db->buf + out, db->len - out, "\n"); 763 out += snprintf(buf + out, len - out, "\n");
810 764
811 /* Lock Resources: xxx (xxx) */ 765 /* Lock Resources: xxx (xxx) */
812 out += snprintf(db->buf + out, db->len - out, 766 out += snprintf(buf + out, len - out,
813 "Lock Resources: %d (%d)\n", 767 "Lock Resources: %d (%d)\n",
814 atomic_read(&dlm->res_cur_count), 768 atomic_read(&dlm->res_cur_count),
815 atomic_read(&dlm->res_tot_count)); 769 atomic_read(&dlm->res_tot_count));
@@ -821,29 +775,29 @@ static int debug_state_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
821 cur_mles += atomic_read(&dlm->mle_cur_count[i]); 775 cur_mles += atomic_read(&dlm->mle_cur_count[i]);
822 776
823 /* MLEs: xxx (xxx) */ 777 /* MLEs: xxx (xxx) */
824 out += snprintf(db->buf + out, db->len - out, 778 out += snprintf(buf + out, len - out,
825 "MLEs: %d (%d)\n", cur_mles, tot_mles); 779 "MLEs: %d (%d)\n", cur_mles, tot_mles);
826 780
827 /* Blocking: xxx (xxx) */ 781 /* Blocking: xxx (xxx) */
828 out += snprintf(db->buf + out, db->len - out, 782 out += snprintf(buf + out, len - out,
829 " Blocking: %d (%d)\n", 783 " Blocking: %d (%d)\n",
830 atomic_read(&dlm->mle_cur_count[DLM_MLE_BLOCK]), 784 atomic_read(&dlm->mle_cur_count[DLM_MLE_BLOCK]),
831 atomic_read(&dlm->mle_tot_count[DLM_MLE_BLOCK])); 785 atomic_read(&dlm->mle_tot_count[DLM_MLE_BLOCK]));
832 786
833 /* Mastery: xxx (xxx) */ 787 /* Mastery: xxx (xxx) */
834 out += snprintf(db->buf + out, db->len - out, 788 out += snprintf(buf + out, len - out,
835 " Mastery: %d (%d)\n", 789 " Mastery: %d (%d)\n",
836 atomic_read(&dlm->mle_cur_count[DLM_MLE_MASTER]), 790 atomic_read(&dlm->mle_cur_count[DLM_MLE_MASTER]),
837 atomic_read(&dlm->mle_tot_count[DLM_MLE_MASTER])); 791 atomic_read(&dlm->mle_tot_count[DLM_MLE_MASTER]));
838 792
839 /* Migration: xxx (xxx) */ 793 /* Migration: xxx (xxx) */
840 out += snprintf(db->buf + out, db->len - out, 794 out += snprintf(buf + out, len - out,
841 " Migration: %d (%d)\n", 795 " Migration: %d (%d)\n",
842 atomic_read(&dlm->mle_cur_count[DLM_MLE_MIGRATION]), 796 atomic_read(&dlm->mle_cur_count[DLM_MLE_MIGRATION]),
843 atomic_read(&dlm->mle_tot_count[DLM_MLE_MIGRATION])); 797 atomic_read(&dlm->mle_tot_count[DLM_MLE_MIGRATION]));
844 798
845 /* Lists: Dirty=Empty Purge=InUse PendingASTs=Empty ... */ 799 /* Lists: Dirty=Empty Purge=InUse PendingASTs=Empty ... */
846 out += snprintf(db->buf + out, db->len - out, 800 out += snprintf(buf + out, len - out,
847 "Lists: Dirty=%s Purge=%s PendingASTs=%s " 801 "Lists: Dirty=%s Purge=%s PendingASTs=%s "
848 "PendingBASTs=%s\n", 802 "PendingBASTs=%s\n",
849 (list_empty(&dlm->dirty_list) ? "Empty" : "InUse"), 803 (list_empty(&dlm->dirty_list) ? "Empty" : "InUse"),
@@ -852,12 +806,12 @@ static int debug_state_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
852 (list_empty(&dlm->pending_basts) ? "Empty" : "InUse")); 806 (list_empty(&dlm->pending_basts) ? "Empty" : "InUse"));
853 807
854 /* Purge Count: xxx Refs: xxx */ 808 /* Purge Count: xxx Refs: xxx */
855 out += snprintf(db->buf + out, db->len - out, 809 out += snprintf(buf + out, len - out,
856 "Purge Count: %d Refs: %d\n", dlm->purge_count, 810 "Purge Count: %d Refs: %d\n", dlm->purge_count,
857 atomic_read(&dlm->dlm_refs.refcount)); 811 atomic_read(&dlm->dlm_refs.refcount));
858 812
859 /* Dead Node: xxx */ 813 /* Dead Node: xxx */
860 out += snprintf(db->buf + out, db->len - out, 814 out += snprintf(buf + out, len - out,
861 "Dead Node: %d\n", dlm->reco.dead_node); 815 "Dead Node: %d\n", dlm->reco.dead_node);
862 816
863 /* What about DLM_RECO_STATE_FINALIZE? */ 817 /* What about DLM_RECO_STATE_FINALIZE? */
@@ -867,19 +821,19 @@ static int debug_state_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
867 state = "INACTIVE"; 821 state = "INACTIVE";
868 822
869 /* Recovery Pid: xxxx Master: xxx State: xxxx */ 823 /* Recovery Pid: xxxx Master: xxx State: xxxx */
870 out += snprintf(db->buf + out, db->len - out, 824 out += snprintf(buf + out, len - out,
871 "Recovery Pid: %d Master: %d State: %s\n", 825 "Recovery Pid: %d Master: %d State: %s\n",
872 dlm->dlm_reco_thread_task->pid, 826 task_pid_nr(dlm->dlm_reco_thread_task),
873 dlm->reco.new_master, state); 827 dlm->reco.new_master, state);
874 828
875 /* Recovery Map: xx xx */ 829 /* Recovery Map: xx xx */
876 out += snprintf(db->buf + out, db->len - out, "Recovery Map: "); 830 out += snprintf(buf + out, len - out, "Recovery Map: ");
877 out += stringify_nodemap(dlm->recovery_map, O2NM_MAX_NODES, 831 out += stringify_nodemap(dlm->recovery_map, O2NM_MAX_NODES,
878 db->buf + out, db->len - out); 832 buf + out, len - out);
879 out += snprintf(db->buf + out, db->len - out, "\n"); 833 out += snprintf(buf + out, len - out, "\n");
880 834
881 /* Recovery Node State: */ 835 /* Recovery Node State: */
882 out += snprintf(db->buf + out, db->len - out, "Recovery Node State:\n"); 836 out += snprintf(buf + out, len - out, "Recovery Node State:\n");
883 list_for_each_entry(node, &dlm->reco.node_data, list) { 837 list_for_each_entry(node, &dlm->reco.node_data, list) {
884 switch (node->state) { 838 switch (node->state) {
885 case DLM_RECO_NODE_DATA_INIT: 839 case DLM_RECO_NODE_DATA_INIT:
@@ -907,7 +861,7 @@ static int debug_state_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
907 state = "BAD"; 861 state = "BAD";
908 break; 862 break;
909 } 863 }
910 out += snprintf(db->buf + out, db->len - out, "\t%u - %s\n", 864 out += snprintf(buf + out, len - out, "\t%u - %s\n",
911 node->node_num, state); 865 node->node_num, state);
912 } 866 }
913 867
@@ -919,15 +873,15 @@ static int debug_state_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
919static int debug_state_open(struct inode *inode, struct file *file) 873static int debug_state_open(struct inode *inode, struct file *file)
920{ 874{
921 struct dlm_ctxt *dlm = inode->i_private; 875 struct dlm_ctxt *dlm = inode->i_private;
922 struct debug_buffer *db = NULL; 876 char *buf = NULL;
923 877
924 db = debug_buffer_allocate(); 878 buf = (char *) get_zeroed_page(GFP_NOFS);
925 if (!db) 879 if (!buf)
926 goto bail; 880 goto bail;
927 881
928 db->len = debug_state_print(dlm, db); 882 i_size_write(inode, debug_state_print(dlm, buf, PAGE_SIZE - 1));
929 883
930 file->private_data = db; 884 file->private_data = buf;
931 885
932 return 0; 886 return 0;
933bail: 887bail:
@@ -936,9 +890,9 @@ bail:
936 890
937static const struct file_operations debug_state_fops = { 891static const struct file_operations debug_state_fops = {
938 .open = debug_state_open, 892 .open = debug_state_open,
939 .release = debug_buffer_release, 893 .release = debug_release,
940 .read = debug_buffer_read, 894 .read = debug_read,
941 .llseek = debug_buffer_llseek, 895 .llseek = generic_file_llseek,
942}; 896};
943/* end - debug state funcs */ 897/* end - debug state funcs */
944 898
@@ -1002,14 +956,10 @@ void dlm_debug_shutdown(struct dlm_ctxt *dlm)
1002 struct dlm_debug_ctxt *dc = dlm->dlm_debug_ctxt; 956 struct dlm_debug_ctxt *dc = dlm->dlm_debug_ctxt;
1003 957
1004 if (dc) { 958 if (dc) {
1005 if (dc->debug_purgelist_dentry) 959 debugfs_remove(dc->debug_purgelist_dentry);
1006 debugfs_remove(dc->debug_purgelist_dentry); 960 debugfs_remove(dc->debug_mle_dentry);
1007 if (dc->debug_mle_dentry) 961 debugfs_remove(dc->debug_lockres_dentry);
1008 debugfs_remove(dc->debug_mle_dentry); 962 debugfs_remove(dc->debug_state_dentry);
1009 if (dc->debug_lockres_dentry)
1010 debugfs_remove(dc->debug_lockres_dentry);
1011 if (dc->debug_state_dentry)
1012 debugfs_remove(dc->debug_state_dentry);
1013 dlm_debug_put(dc); 963 dlm_debug_put(dc);
1014 } 964 }
1015} 965}
@@ -1040,8 +990,7 @@ bail:
1040 990
1041void dlm_destroy_debugfs_subroot(struct dlm_ctxt *dlm) 991void dlm_destroy_debugfs_subroot(struct dlm_ctxt *dlm)
1042{ 992{
1043 if (dlm->dlm_debugfs_subroot) 993 debugfs_remove(dlm->dlm_debugfs_subroot);
1044 debugfs_remove(dlm->dlm_debugfs_subroot);
1045} 994}
1046 995
1047/* debugfs root */ 996/* debugfs root */
@@ -1057,7 +1006,6 @@ int dlm_create_debugfs_root(void)
1057 1006
1058void dlm_destroy_debugfs_root(void) 1007void dlm_destroy_debugfs_root(void)
1059{ 1008{
1060 if (dlm_debugfs_root) 1009 debugfs_remove(dlm_debugfs_root);
1061 debugfs_remove(dlm_debugfs_root);
1062} 1010}
1063#endif /* CONFIG_DEBUG_FS */ 1011#endif /* CONFIG_DEBUG_FS */
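The dlmdebug.c rework above drops the intermediate struct debug_buffer: the open handler formats straight into a zeroed page, records the formatted length via i_size_write(), and hands read/llseek off to the shared debug_read/generic_file_llseek helpers. The if-wrappers around debugfs_remove() disappear because debugfs_remove() is a no-op on a NULL dentry. A minimal sketch of the adopted pattern, with my_print() standing in for debug_state_print() (illustrative name, not part of the patch):

static int my_debug_open(struct inode *inode, struct file *file)
{
	char *buf = (char *)get_zeroed_page(GFP_NOFS);

	if (!buf)
		return -ENOMEM;
	/* Publish the formatted length as i_size so generic_file_llseek()
	 * sees real file bounds; the matching ->release must free_page()
	 * file->private_data. */
	i_size_write(inode, my_print(inode->i_private, buf, PAGE_SIZE - 1));
	file->private_data = buf;
	return 0;
}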
diff --git a/fs/ocfs2/dlm/dlmdebug.h b/fs/ocfs2/dlm/dlmdebug.h
index 8c686d22f9c7..1f27c4812d1a 100644
--- a/fs/ocfs2/dlm/dlmdebug.h
+++ b/fs/ocfs2/dlm/dlmdebug.h
@@ -37,11 +37,6 @@ struct dlm_debug_ctxt {
37 struct dentry *debug_purgelist_dentry; 37 struct dentry *debug_purgelist_dentry;
38}; 38};
39 39
40struct debug_buffer {
41 int len;
42 char *buf;
43};
44
45struct debug_lockres { 40struct debug_lockres {
46 int dl_len; 41 int dl_len;
47 char *dl_buf; 42 char *dl_buf;
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index cc2aaa96cfe5..7e38a072d720 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -460,8 +460,6 @@ redo_bucket:
460 } 460 }
461 cond_resched_lock(&dlm->spinlock); 461 cond_resched_lock(&dlm->spinlock);
462 num += n; 462 num += n;
463 mlog(0, "%s: touched %d lockreses in bucket %d "
464 "(tot=%d)\n", dlm->name, n, i, num);
465 } 463 }
466 spin_unlock(&dlm->spinlock); 464 spin_unlock(&dlm->spinlock);
467 wake_up(&dlm->dlm_thread_wq); 465 wake_up(&dlm->dlm_thread_wq);
@@ -1661,8 +1659,8 @@ bail:
1661 1659
1662static void dlm_unregister_domain_handlers(struct dlm_ctxt *dlm) 1660static void dlm_unregister_domain_handlers(struct dlm_ctxt *dlm)
1663{ 1661{
1664 o2hb_unregister_callback(NULL, &dlm->dlm_hb_up); 1662 o2hb_unregister_callback(dlm->name, &dlm->dlm_hb_up);
1665 o2hb_unregister_callback(NULL, &dlm->dlm_hb_down); 1663 o2hb_unregister_callback(dlm->name, &dlm->dlm_hb_down);
1666 o2net_unregister_handler_list(&dlm->dlm_domain_handlers); 1664 o2net_unregister_handler_list(&dlm->dlm_domain_handlers);
1667} 1665}
1668 1666
@@ -1674,13 +1672,13 @@ static int dlm_register_domain_handlers(struct dlm_ctxt *dlm)
1674 1672
1675 o2hb_setup_callback(&dlm->dlm_hb_down, O2HB_NODE_DOWN_CB, 1673 o2hb_setup_callback(&dlm->dlm_hb_down, O2HB_NODE_DOWN_CB,
1676 dlm_hb_node_down_cb, dlm, DLM_HB_NODE_DOWN_PRI); 1674 dlm_hb_node_down_cb, dlm, DLM_HB_NODE_DOWN_PRI);
1677 status = o2hb_register_callback(NULL, &dlm->dlm_hb_down); 1675 status = o2hb_register_callback(dlm->name, &dlm->dlm_hb_down);
1678 if (status) 1676 if (status)
1679 goto bail; 1677 goto bail;
1680 1678
1681 o2hb_setup_callback(&dlm->dlm_hb_up, O2HB_NODE_UP_CB, 1679 o2hb_setup_callback(&dlm->dlm_hb_up, O2HB_NODE_UP_CB,
1682 dlm_hb_node_up_cb, dlm, DLM_HB_NODE_UP_PRI); 1680 dlm_hb_node_up_cb, dlm, DLM_HB_NODE_UP_PRI);
1683 status = o2hb_register_callback(NULL, &dlm->dlm_hb_up); 1681 status = o2hb_register_callback(dlm->name, &dlm->dlm_hb_up);
1684 if (status) 1682 if (status)
1685 goto bail; 1683 goto bail;
1686 1684
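The dlmdomain.c change registers the heartbeat up/down callbacks against the domain's own region key (dlm->name) instead of NULL, which appears intended to let o2hb track the region per domain; what matters for correctness is that registration and unregistration use the same key. Condensed sketch, error handling trimmed:

o2hb_setup_callback(&dlm->dlm_hb_down, O2HB_NODE_DOWN_CB,
		    dlm_hb_node_down_cb, dlm, DLM_HB_NODE_DOWN_PRI);
status = o2hb_register_callback(dlm->name, &dlm->dlm_hb_down);
/* ... teardown must mirror the registration key: */
o2hb_unregister_callback(dlm->name, &dlm->dlm_hb_down);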
diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c
index 69cf369961c4..7009292aac5a 100644
--- a/fs/ocfs2/dlm/dlmlock.c
+++ b/fs/ocfs2/dlm/dlmlock.c
@@ -106,6 +106,9 @@ static int dlm_can_grant_new_lock(struct dlm_lock_resource *res,
106 106
107 if (!dlm_lock_compatible(tmplock->ml.type, lock->ml.type)) 107 if (!dlm_lock_compatible(tmplock->ml.type, lock->ml.type))
108 return 0; 108 return 0;
109 if (!dlm_lock_compatible(tmplock->ml.convert_type,
110 lock->ml.type))
111 return 0;
109 } 112 }
110 113
111 return 1; 114 return 1;
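The dlmlock.c hunk closes a grant-ordering hole: a new lock request was checked only against the mode each queued lock currently holds, not the mode it is converting to, so a new lock could be granted ahead of a pending conversion it conflicts with. A condensed sketch of the converting-queue walk after the fix (the real function also walks the granted queue):

static int can_grant_new_lock(struct dlm_lock_resource *res,
			      struct dlm_lock *lock)
{
	struct dlm_lock *tmplock;

	list_for_each_entry(tmplock, &res->converting, list) {
		/* incompatible with the currently granted mode... */
		if (!dlm_lock_compatible(tmplock->ml.type, lock->ml.type))
			return 0;
		/* ...or with the mode being converted to (the new check) */
		if (!dlm_lock_compatible(tmplock->ml.convert_type,
					 lock->ml.type))
			return 0;
	}
	return 1;
}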
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index 2211acf33d9b..1d6d1d22c471 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -122,15 +122,13 @@ int __dlm_lockres_unused(struct dlm_lock_resource *res)
122void __dlm_lockres_calc_usage(struct dlm_ctxt *dlm, 122void __dlm_lockres_calc_usage(struct dlm_ctxt *dlm,
123 struct dlm_lock_resource *res) 123 struct dlm_lock_resource *res)
124{ 124{
125 mlog_entry("%.*s\n", res->lockname.len, res->lockname.name);
126
127 assert_spin_locked(&dlm->spinlock); 125 assert_spin_locked(&dlm->spinlock);
128 assert_spin_locked(&res->spinlock); 126 assert_spin_locked(&res->spinlock);
129 127
130 if (__dlm_lockres_unused(res)){ 128 if (__dlm_lockres_unused(res)){
131 if (list_empty(&res->purge)) { 129 if (list_empty(&res->purge)) {
132 mlog(0, "putting lockres %.*s:%p onto purge list\n", 130 mlog(0, "%s: Adding res %.*s to purge list\n",
133 res->lockname.len, res->lockname.name, res); 131 dlm->name, res->lockname.len, res->lockname.name);
134 132
135 res->last_used = jiffies; 133 res->last_used = jiffies;
136 dlm_lockres_get(res); 134 dlm_lockres_get(res);
@@ -138,8 +136,8 @@ void __dlm_lockres_calc_usage(struct dlm_ctxt *dlm,
138 dlm->purge_count++; 136 dlm->purge_count++;
139 } 137 }
140 } else if (!list_empty(&res->purge)) { 138 } else if (!list_empty(&res->purge)) {
141 mlog(0, "removing lockres %.*s:%p from purge list, owner=%u\n", 139 mlog(0, "%s: Removing res %.*s from purge list\n",
142 res->lockname.len, res->lockname.name, res, res->owner); 140 dlm->name, res->lockname.len, res->lockname.name);
143 141
144 list_del_init(&res->purge); 142 list_del_init(&res->purge);
145 dlm_lockres_put(res); 143 dlm_lockres_put(res);
@@ -150,7 +148,6 @@ void __dlm_lockres_calc_usage(struct dlm_ctxt *dlm,
150void dlm_lockres_calc_usage(struct dlm_ctxt *dlm, 148void dlm_lockres_calc_usage(struct dlm_ctxt *dlm,
151 struct dlm_lock_resource *res) 149 struct dlm_lock_resource *res)
152{ 150{
153 mlog_entry("%.*s\n", res->lockname.len, res->lockname.name);
154 spin_lock(&dlm->spinlock); 151 spin_lock(&dlm->spinlock);
155 spin_lock(&res->spinlock); 152 spin_lock(&res->spinlock);
156 153
@@ -171,9 +168,8 @@ static void dlm_purge_lockres(struct dlm_ctxt *dlm,
171 168
172 master = (res->owner == dlm->node_num); 169 master = (res->owner == dlm->node_num);
173 170
174 171 mlog(0, "%s: Purging res %.*s, master %d\n", dlm->name,
175 mlog(0, "purging lockres %.*s, master = %d\n", res->lockname.len, 172 res->lockname.len, res->lockname.name, master);
176 res->lockname.name, master);
177 173
178 if (!master) { 174 if (!master) {
179 res->state |= DLM_LOCK_RES_DROPPING_REF; 175 res->state |= DLM_LOCK_RES_DROPPING_REF;
@@ -189,27 +185,25 @@ static void dlm_purge_lockres(struct dlm_ctxt *dlm,
189 /* clear our bit from the master's refmap, ignore errors */ 185 /* clear our bit from the master's refmap, ignore errors */
190 ret = dlm_drop_lockres_ref(dlm, res); 186 ret = dlm_drop_lockres_ref(dlm, res);
191 if (ret < 0) { 187 if (ret < 0) {
192 mlog_errno(ret); 188 mlog(ML_ERROR, "%s: deref %.*s failed %d\n", dlm->name,
189 res->lockname.len, res->lockname.name, ret);
193 if (!dlm_is_host_down(ret)) 190 if (!dlm_is_host_down(ret))
194 BUG(); 191 BUG();
195 } 192 }
196 mlog(0, "%s:%.*s: dlm_deref_lockres returned %d\n",
197 dlm->name, res->lockname.len, res->lockname.name, ret);
198 spin_lock(&dlm->spinlock); 193 spin_lock(&dlm->spinlock);
199 spin_lock(&res->spinlock); 194 spin_lock(&res->spinlock);
200 } 195 }
201 196
202 if (!list_empty(&res->purge)) { 197 if (!list_empty(&res->purge)) {
203 mlog(0, "removing lockres %.*s:%p from purgelist, " 198 mlog(0, "%s: Removing res %.*s from purgelist, master %d\n",
204 "master = %d\n", res->lockname.len, res->lockname.name, 199 dlm->name, res->lockname.len, res->lockname.name, master);
205 res, master);
206 list_del_init(&res->purge); 200 list_del_init(&res->purge);
207 dlm_lockres_put(res); 201 dlm_lockres_put(res);
208 dlm->purge_count--; 202 dlm->purge_count--;
209 } 203 }
210 204
211 if (!__dlm_lockres_unused(res)) { 205 if (!__dlm_lockres_unused(res)) {
212 mlog(ML_ERROR, "found lockres %s:%.*s: in use after deref\n", 206 mlog(ML_ERROR, "%s: res %.*s in use after deref\n",
213 dlm->name, res->lockname.len, res->lockname.name); 207 dlm->name, res->lockname.len, res->lockname.name);
214 __dlm_print_one_lock_resource(res); 208 __dlm_print_one_lock_resource(res);
215 BUG(); 209 BUG();
@@ -266,10 +260,10 @@ static void dlm_run_purge_list(struct dlm_ctxt *dlm,
266 unused = __dlm_lockres_unused(lockres); 260 unused = __dlm_lockres_unused(lockres);
267 if (!unused || 261 if (!unused ||
268 (lockres->state & DLM_LOCK_RES_MIGRATING)) { 262 (lockres->state & DLM_LOCK_RES_MIGRATING)) {
269 mlog(0, "lockres %s:%.*s: is in use or " 263 mlog(0, "%s: res %.*s is in use or being remastered, "
270 "being remastered, used %d, state %d\n", 264 "used %d, state %d\n", dlm->name,
271 dlm->name, lockres->lockname.len, 265 lockres->lockname.len, lockres->lockname.name,
272 lockres->lockname.name, !unused, lockres->state); 266 !unused, lockres->state);
273 list_move_tail(&dlm->purge_list, &lockres->purge); 267 list_move_tail(&dlm->purge_list, &lockres->purge);
274 spin_unlock(&lockres->spinlock); 268 spin_unlock(&lockres->spinlock);
275 continue; 269 continue;
@@ -296,15 +290,12 @@ static void dlm_shuffle_lists(struct dlm_ctxt *dlm,
296 struct list_head *head; 290 struct list_head *head;
297 int can_grant = 1; 291 int can_grant = 1;
298 292
299 //mlog(0, "res->lockname.len=%d\n", res->lockname.len); 293 /*
300 //mlog(0, "res->lockname.name=%p\n", res->lockname.name); 294 * Because this function is called with the lockres
301 //mlog(0, "shuffle res %.*s\n", res->lockname.len,
302 // res->lockname.name);
303
304 /* because this function is called with the lockres
305 * spinlock, and because we know that it is not migrating/ 295 * spinlock, and because we know that it is not migrating/
306 * recovering/in-progress, it is fine to reserve asts and 296 * recovering/in-progress, it is fine to reserve asts and
307 * basts right before queueing them all throughout */ 297 * basts right before queueing them all throughout
298 */
308 assert_spin_locked(&dlm->ast_lock); 299 assert_spin_locked(&dlm->ast_lock);
309 assert_spin_locked(&res->spinlock); 300 assert_spin_locked(&res->spinlock);
310 BUG_ON((res->state & (DLM_LOCK_RES_MIGRATING| 301 BUG_ON((res->state & (DLM_LOCK_RES_MIGRATING|
@@ -314,13 +305,13 @@ static void dlm_shuffle_lists(struct dlm_ctxt *dlm,
314converting: 305converting:
315 if (list_empty(&res->converting)) 306 if (list_empty(&res->converting))
316 goto blocked; 307 goto blocked;
317 mlog(0, "res %.*s has locks on a convert queue\n", res->lockname.len, 308 mlog(0, "%s: res %.*s has locks on the convert queue\n", dlm->name,
318 res->lockname.name); 309 res->lockname.len, res->lockname.name);
319 310
320 target = list_entry(res->converting.next, struct dlm_lock, list); 311 target = list_entry(res->converting.next, struct dlm_lock, list);
321 if (target->ml.convert_type == LKM_IVMODE) { 312 if (target->ml.convert_type == LKM_IVMODE) {
322 mlog(ML_ERROR, "%.*s: converting a lock with no " 313 mlog(ML_ERROR, "%s: res %.*s converting lock to invalid mode\n",
323 "convert_type!\n", res->lockname.len, res->lockname.name); 314 dlm->name, res->lockname.len, res->lockname.name);
324 BUG(); 315 BUG();
325 } 316 }
326 head = &res->granted; 317 head = &res->granted;
@@ -365,9 +356,12 @@ converting:
365 spin_lock(&target->spinlock); 356 spin_lock(&target->spinlock);
366 BUG_ON(target->ml.highest_blocked != LKM_IVMODE); 357 BUG_ON(target->ml.highest_blocked != LKM_IVMODE);
367 358
368 mlog(0, "calling ast for converting lock: %.*s, have: %d, " 359 mlog(0, "%s: res %.*s, AST for Converting lock %u:%llu, type "
369 "granting: %d, node: %u\n", res->lockname.len, 360 "%d => %d, node %u\n", dlm->name, res->lockname.len,
370 res->lockname.name, target->ml.type, 361 res->lockname.name,
362 dlm_get_lock_cookie_node(be64_to_cpu(target->ml.cookie)),
363 dlm_get_lock_cookie_seq(be64_to_cpu(target->ml.cookie)),
364 target->ml.type,
371 target->ml.convert_type, target->ml.node); 365 target->ml.convert_type, target->ml.node);
372 366
373 target->ml.type = target->ml.convert_type; 367 target->ml.type = target->ml.convert_type;
@@ -428,11 +422,14 @@ blocked:
428 spin_lock(&target->spinlock); 422 spin_lock(&target->spinlock);
429 BUG_ON(target->ml.highest_blocked != LKM_IVMODE); 423 BUG_ON(target->ml.highest_blocked != LKM_IVMODE);
430 424
431 mlog(0, "calling ast for blocked lock: %.*s, granting: %d, " 425 mlog(0, "%s: res %.*s, AST for Blocked lock %u:%llu, type %d, "
432 "node: %u\n", res->lockname.len, res->lockname.name, 426 "node %u\n", dlm->name, res->lockname.len,
427 res->lockname.name,
428 dlm_get_lock_cookie_node(be64_to_cpu(target->ml.cookie)),
429 dlm_get_lock_cookie_seq(be64_to_cpu(target->ml.cookie)),
433 target->ml.type, target->ml.node); 430 target->ml.type, target->ml.node);
434 431
435 // target->ml.type is already correct 432 /* target->ml.type is already correct */
436 list_move_tail(&target->list, &res->granted); 433 list_move_tail(&target->list, &res->granted);
437 434
438 BUG_ON(!target->lksb); 435 BUG_ON(!target->lksb);
@@ -453,7 +450,6 @@ leave:
453/* must have NO locks when calling this with res !=NULL * */ 450/* must have NO locks when calling this with res !=NULL * */
454void dlm_kick_thread(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) 451void dlm_kick_thread(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
455{ 452{
456 mlog_entry("dlm=%p, res=%p\n", dlm, res);
457 if (res) { 453 if (res) {
458 spin_lock(&dlm->spinlock); 454 spin_lock(&dlm->spinlock);
459 spin_lock(&res->spinlock); 455 spin_lock(&res->spinlock);
@@ -466,8 +462,6 @@ void dlm_kick_thread(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
466 462
467void __dlm_dirty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) 463void __dlm_dirty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
468{ 464{
469 mlog_entry("dlm=%p, res=%p\n", dlm, res);
470
471 assert_spin_locked(&dlm->spinlock); 465 assert_spin_locked(&dlm->spinlock);
472 assert_spin_locked(&res->spinlock); 466 assert_spin_locked(&res->spinlock);
473 467
@@ -484,13 +478,16 @@ void __dlm_dirty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
484 res->state |= DLM_LOCK_RES_DIRTY; 478 res->state |= DLM_LOCK_RES_DIRTY;
485 } 479 }
486 } 480 }
481
482 mlog(0, "%s: res %.*s\n", dlm->name, res->lockname.len,
483 res->lockname.name);
487} 484}
488 485
489 486
490/* Launch the NM thread for the mounted volume */ 487/* Launch the NM thread for the mounted volume */
491int dlm_launch_thread(struct dlm_ctxt *dlm) 488int dlm_launch_thread(struct dlm_ctxt *dlm)
492{ 489{
493 mlog(0, "starting dlm thread...\n"); 490 mlog(0, "Starting dlm_thread...\n");
494 491
495 dlm->dlm_thread_task = kthread_run(dlm_thread, dlm, "dlm_thread"); 492 dlm->dlm_thread_task = kthread_run(dlm_thread, dlm, "dlm_thread");
496 if (IS_ERR(dlm->dlm_thread_task)) { 493 if (IS_ERR(dlm->dlm_thread_task)) {
@@ -505,7 +502,7 @@ int dlm_launch_thread(struct dlm_ctxt *dlm)
505void dlm_complete_thread(struct dlm_ctxt *dlm) 502void dlm_complete_thread(struct dlm_ctxt *dlm)
506{ 503{
507 if (dlm->dlm_thread_task) { 504 if (dlm->dlm_thread_task) {
508 mlog(ML_KTHREAD, "waiting for dlm thread to exit\n"); 505 mlog(ML_KTHREAD, "Waiting for dlm thread to exit\n");
509 kthread_stop(dlm->dlm_thread_task); 506 kthread_stop(dlm->dlm_thread_task);
510 dlm->dlm_thread_task = NULL; 507 dlm->dlm_thread_task = NULL;
511 } 508 }
@@ -536,7 +533,12 @@ static void dlm_flush_asts(struct dlm_ctxt *dlm)
536 /* get an extra ref on lock */ 533 /* get an extra ref on lock */
537 dlm_lock_get(lock); 534 dlm_lock_get(lock);
538 res = lock->lockres; 535 res = lock->lockres;
539 mlog(0, "delivering an ast for this lockres\n"); 536 mlog(0, "%s: res %.*s, Flush AST for lock %u:%llu, type %d, "
537 "node %u\n", dlm->name, res->lockname.len,
538 res->lockname.name,
539 dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
540 dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
541 lock->ml.type, lock->ml.node);
540 542
541 BUG_ON(!lock->ast_pending); 543 BUG_ON(!lock->ast_pending);
542 544
@@ -557,9 +559,9 @@ static void dlm_flush_asts(struct dlm_ctxt *dlm)
557 /* possible that another ast was queued while 559 /* possible that another ast was queued while
558 * we were delivering the last one */ 560 * we were delivering the last one */
559 if (!list_empty(&lock->ast_list)) { 561 if (!list_empty(&lock->ast_list)) {
560 mlog(0, "aha another ast got queued while " 562 mlog(0, "%s: res %.*s, AST queued while flushing last "
561 "we were finishing the last one. will " 563 "one\n", dlm->name, res->lockname.len,
562 "keep the ast_pending flag set.\n"); 564 res->lockname.name);
563 } else 565 } else
564 lock->ast_pending = 0; 566 lock->ast_pending = 0;
565 567
@@ -590,8 +592,12 @@ static void dlm_flush_asts(struct dlm_ctxt *dlm)
590 dlm_lock_put(lock); 592 dlm_lock_put(lock);
591 spin_unlock(&dlm->ast_lock); 593 spin_unlock(&dlm->ast_lock);
592 594
593 mlog(0, "delivering a bast for this lockres " 595 mlog(0, "%s: res %.*s, Flush BAST for lock %u:%llu, "
594 "(blocked = %d\n", hi); 596 "blocked %d, node %u\n",
597 dlm->name, res->lockname.len, res->lockname.name,
598 dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
599 dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
600 hi, lock->ml.node);
595 601
596 if (lock->ml.node != dlm->node_num) { 602 if (lock->ml.node != dlm->node_num) {
597 ret = dlm_send_proxy_bast(dlm, res, lock, hi); 603 ret = dlm_send_proxy_bast(dlm, res, lock, hi);
@@ -605,9 +611,9 @@ static void dlm_flush_asts(struct dlm_ctxt *dlm)
605 /* possible that another bast was queued while 611 /* possible that another bast was queued while
606 * we were delivering the last one */ 612 * we were delivering the last one */
607 if (!list_empty(&lock->bast_list)) { 613 if (!list_empty(&lock->bast_list)) {
608 mlog(0, "aha another bast got queued while " 614 mlog(0, "%s: res %.*s, BAST queued while flushing last "
609 "we were finishing the last one. will " 615 "one\n", dlm->name, res->lockname.len,
610 "keep the bast_pending flag set.\n"); 616 res->lockname.name);
611 } else 617 } else
612 lock->bast_pending = 0; 618 lock->bast_pending = 0;
613 619
@@ -675,11 +681,12 @@ static int dlm_thread(void *data)
675 spin_lock(&res->spinlock); 681 spin_lock(&res->spinlock);
676 if (res->owner != dlm->node_num) { 682 if (res->owner != dlm->node_num) {
677 __dlm_print_one_lock_resource(res); 683 __dlm_print_one_lock_resource(res);
678 mlog(ML_ERROR, "inprog:%s, mig:%s, reco:%s, dirty:%s\n", 684 mlog(ML_ERROR, "%s: inprog %d, mig %d, reco %d,"
679 res->state & DLM_LOCK_RES_IN_PROGRESS ? "yes" : "no", 685 " dirty %d\n", dlm->name,
680 res->state & DLM_LOCK_RES_MIGRATING ? "yes" : "no", 686 !!(res->state & DLM_LOCK_RES_IN_PROGRESS),
681 res->state & DLM_LOCK_RES_RECOVERING ? "yes" : "no", 687 !!(res->state & DLM_LOCK_RES_MIGRATING),
682 res->state & DLM_LOCK_RES_DIRTY ? "yes" : "no"); 688 !!(res->state & DLM_LOCK_RES_RECOVERING),
689 !!(res->state & DLM_LOCK_RES_DIRTY));
683 } 690 }
684 BUG_ON(res->owner != dlm->node_num); 691 BUG_ON(res->owner != dlm->node_num);
685 692
@@ -693,8 +700,8 @@ static int dlm_thread(void *data)
693 res->state &= ~DLM_LOCK_RES_DIRTY; 700 res->state &= ~DLM_LOCK_RES_DIRTY;
694 spin_unlock(&res->spinlock); 701 spin_unlock(&res->spinlock);
695 spin_unlock(&dlm->ast_lock); 702 spin_unlock(&dlm->ast_lock);
696 mlog(0, "delaying list shuffling for in-" 703 mlog(0, "%s: res %.*s, inprogress, delay list "
697 "progress lockres %.*s, state=%d\n", 704 "shuffle, state %d\n", dlm->name,
698 res->lockname.len, res->lockname.name, 705 res->lockname.len, res->lockname.name,
699 res->state); 706 res->state);
700 delay = 1; 707 delay = 1;
@@ -706,10 +713,6 @@ static int dlm_thread(void *data)
706 * spinlock and do NOT have the dlm lock. 713 * spinlock and do NOT have the dlm lock.
707 * safe to reserve/queue asts and run the lists. */ 714 * safe to reserve/queue asts and run the lists. */
708 715
709 mlog(0, "calling dlm_shuffle_lists with dlm=%s, "
710 "res=%.*s\n", dlm->name,
711 res->lockname.len, res->lockname.name);
712
713 /* called while holding lockres lock */ 716 /* called while holding lockres lock */
714 dlm_shuffle_lists(dlm, res); 717 dlm_shuffle_lists(dlm, res);
715 res->state &= ~DLM_LOCK_RES_DIRTY; 718 res->state &= ~DLM_LOCK_RES_DIRTY;
@@ -733,7 +736,8 @@ in_progress:
733 /* unlikely, but we may need to give time to 736 /* unlikely, but we may need to give time to
734 * other tasks */ 737 * other tasks */
735 if (!--n) { 738 if (!--n) {
736 mlog(0, "throttling dlm_thread\n"); 739 mlog(0, "%s: Throttling dlm thread\n",
740 dlm->name);
737 break; 741 break;
738 } 742 }
739 } 743 }
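The dlmthread.c changes are a logging cleanup: mlog_entry() calls and ad-hoc messages converge on one "domain: res name" convention, and a lock is identified by node:sequence decoded from its big-endian cookie. An illustrative helper capturing the convention (not a function added by the patch):

static void log_lock_event(struct dlm_ctxt *dlm,
			   struct dlm_lock_resource *res,
			   struct dlm_lock *lock, const char *event)
{
	u64 cookie = be64_to_cpu(lock->ml.cookie);

	mlog(0, "%s: res %.*s, %s lock %u:%llu, type %d, node %u\n",
	     dlm->name, res->lockname.len, res->lockname.name, event,
	     dlm_get_lock_cookie_node(cookie),
	     dlm_get_lock_cookie_seq(cookie),
	     lock->ml.type, lock->ml.node);
}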
diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c
index 6adafa576065..5dbc3062b4fd 100644
--- a/fs/ocfs2/export.c
+++ b/fs/ocfs2/export.c
@@ -137,9 +137,7 @@ check_gen:
137 } 137 }
138 138
139 result = d_obtain_alias(inode); 139 result = d_obtain_alias(inode);
140 if (!IS_ERR(result)) 140 if (IS_ERR(result))
141 d_set_d_op(result, &ocfs2_dentry_ops);
142 else
143 mlog_errno(PTR_ERR(result)); 141 mlog_errno(PTR_ERR(result));
144 142
145bail: 143bail:
@@ -175,8 +173,6 @@ static struct dentry *ocfs2_get_parent(struct dentry *child)
175 } 173 }
176 174
177 parent = d_obtain_alias(ocfs2_iget(OCFS2_SB(dir->i_sb), blkno, 0, 0)); 175 parent = d_obtain_alias(ocfs2_iget(OCFS2_SB(dir->i_sb), blkno, 0, 0));
178 if (!IS_ERR(parent))
179 d_set_d_op(parent, &ocfs2_dentry_ops);
180 176
181bail_unlock: 177bail_unlock:
182 ocfs2_inode_unlock(dir, 0); 178 ocfs2_inode_unlock(dir, 0);
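The export.c deletions are one piece of the s_d_op conversion completed in super.c below: once the superblock carries the dentry operations, d_obtain_alias() attaches them automatically. Before/after shape, condensed for illustration:

/* before: every caller tagged the dentry by hand */
result = d_obtain_alias(inode);
if (!IS_ERR(result))
	d_set_d_op(result, &ocfs2_dentry_ops);

/* after: sb->s_d_op is inherited, only the error report remains */
result = d_obtain_alias(inode);
if (IS_ERR(result))
	mlog_errno(PTR_ERR(result));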
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index bdadbae09094..a6651956482e 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1989,28 +1989,32 @@ int ocfs2_change_file_space(struct file *file, unsigned int cmd,
1989 return __ocfs2_change_file_space(file, inode, file->f_pos, cmd, sr, 0); 1989 return __ocfs2_change_file_space(file, inode, file->f_pos, cmd, sr, 0);
1990} 1990}
1991 1991
1992static long ocfs2_fallocate(struct inode *inode, int mode, loff_t offset, 1992static long ocfs2_fallocate(struct file *file, int mode, loff_t offset,
1993 loff_t len) 1993 loff_t len)
1994{ 1994{
1995 struct inode *inode = file->f_path.dentry->d_inode;
1995 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1996 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1996 struct ocfs2_space_resv sr; 1997 struct ocfs2_space_resv sr;
1997 int change_size = 1; 1998 int change_size = 1;
1999 int cmd = OCFS2_IOC_RESVSP64;
1998 2000
2001 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
2002 return -EOPNOTSUPP;
1999 if (!ocfs2_writes_unwritten_extents(osb)) 2003 if (!ocfs2_writes_unwritten_extents(osb))
2000 return -EOPNOTSUPP; 2004 return -EOPNOTSUPP;
2001 2005
2002 if (S_ISDIR(inode->i_mode))
2003 return -ENODEV;
2004
2005 if (mode & FALLOC_FL_KEEP_SIZE) 2006 if (mode & FALLOC_FL_KEEP_SIZE)
2006 change_size = 0; 2007 change_size = 0;
2007 2008
2009 if (mode & FALLOC_FL_PUNCH_HOLE)
2010 cmd = OCFS2_IOC_UNRESVSP64;
2011
2008 sr.l_whence = 0; 2012 sr.l_whence = 0;
2009 sr.l_start = (s64)offset; 2013 sr.l_start = (s64)offset;
2010 sr.l_len = (s64)len; 2014 sr.l_len = (s64)len;
2011 2015
2012 return __ocfs2_change_file_space(NULL, inode, offset, 2016 return __ocfs2_change_file_space(NULL, inode, offset, cmd, &sr,
2013 OCFS2_IOC_RESVSP64, &sr, change_size); 2017 change_size);
2014} 2018}
2015 2019
2016int ocfs2_check_range_for_refcount(struct inode *inode, loff_t pos, 2020int ocfs2_check_range_for_refcount(struct inode *inode, loff_t pos,
@@ -2606,7 +2610,6 @@ const struct inode_operations ocfs2_file_iops = {
2606 .getxattr = generic_getxattr, 2610 .getxattr = generic_getxattr,
2607 .listxattr = ocfs2_listxattr, 2611 .listxattr = ocfs2_listxattr,
2608 .removexattr = generic_removexattr, 2612 .removexattr = generic_removexattr,
2609 .fallocate = ocfs2_fallocate,
2610 .fiemap = ocfs2_fiemap, 2613 .fiemap = ocfs2_fiemap,
2611}; 2614};
2612 2615
@@ -2638,6 +2641,7 @@ const struct file_operations ocfs2_fops = {
2638 .flock = ocfs2_flock, 2641 .flock = ocfs2_flock,
2639 .splice_read = ocfs2_file_splice_read, 2642 .splice_read = ocfs2_file_splice_read,
2640 .splice_write = ocfs2_file_splice_write, 2643 .splice_write = ocfs2_file_splice_write,
2644 .fallocate = ocfs2_fallocate,
2641}; 2645};
2642 2646
2643const struct file_operations ocfs2_dops = { 2647const struct file_operations ocfs2_dops = {
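ocfs2_fallocate() moves from inode_operations to file_operations, takes the file rather than the inode, and learns FALLOC_FL_PUNCH_HOLE by mapping it onto the existing OCFS2_IOC_UNRESVSP64 path; the S_ISDIR() check can go because the VFS-level do_fallocate() (changed later in this patch) already rejects non-regular files. Generic shape of a file_operations-based handler after this change, with illustrative names that are not ocfs2 code:

static long myfs_fallocate(struct file *file, int mode, loff_t offset,
			   loff_t len)
{
	struct inode *inode = file->f_path.dentry->d_inode;

	/* reject mode bits this filesystem does not implement */
	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
		return -EOPNOTSUPP;
	/* ... reserve or unreserve [offset, offset + len) on inode ... */
	return 0;
}

static const struct file_operations myfs_fops = {
	.fallocate	= myfs_fallocate,	/* was in inode_operations */
};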
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index f935fd6600dd..4068c6c4c6f6 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -434,7 +434,7 @@ static int ocfs2_read_locked_inode(struct inode *inode,
434 * #1 and #2 can be simply solved by never taking the lock 434 * #1 and #2 can be simply solved by never taking the lock
435 * here for system files (which are the only type we read 435 * here for system files (which are the only type we read
436 * during mount). It's a heavier approach, but our main 436 * during mount). It's a heavier approach, but our main
437 * concern is user-accesible files anyway. 437 * concern is user-accessible files anyway.
438 * 438 *
439 * #3 works itself out because we'll eventually take the 439 * #3 works itself out because we'll eventually take the
440 * cluster lock before trusting anything anyway. 440 * cluster lock before trusting anything anyway.
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index d14cad6e2e41..849fb4a2e814 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -147,7 +147,6 @@ static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry,
147 spin_unlock(&oi->ip_lock); 147 spin_unlock(&oi->ip_lock);
148 148
149bail_add: 149bail_add:
150 d_set_d_op(dentry, &ocfs2_dentry_ops);
151 ret = d_splice_alias(inode, dentry); 150 ret = d_splice_alias(inode, dentry);
152 151
153 if (inode) { 152 if (inode) {
@@ -415,7 +414,6 @@ static int ocfs2_mknod(struct inode *dir,
415 mlog_errno(status); 414 mlog_errno(status);
416 goto leave; 415 goto leave;
417 } 416 }
418 d_set_d_op(dentry, &ocfs2_dentry_ops);
419 417
420 status = ocfs2_add_entry(handle, dentry, inode, 418 status = ocfs2_add_entry(handle, dentry, inode,
421 OCFS2_I(inode)->ip_blkno, parent_fe_bh, 419 OCFS2_I(inode)->ip_blkno, parent_fe_bh,
@@ -743,7 +741,6 @@ static int ocfs2_link(struct dentry *old_dentry,
743 } 741 }
744 742
745 ihold(inode); 743 ihold(inode);
746 d_set_d_op(dentry, &ocfs2_dentry_ops);
747 d_instantiate(dentry, inode); 744 d_instantiate(dentry, inode);
748 745
749out_commit: 746out_commit:
@@ -1017,8 +1014,11 @@ static int ocfs2_double_lock(struct ocfs2_super *osb,
1017 * An error return must mean that no cluster locks 1014 * An error return must mean that no cluster locks
1018 * were held on function exit. 1015 * were held on function exit.
1019 */ 1016 */
1020 if (oi1->ip_blkno != oi2->ip_blkno) 1017 if (oi1->ip_blkno != oi2->ip_blkno) {
1021 ocfs2_inode_unlock(inode2, 1); 1018 ocfs2_inode_unlock(inode2, 1);
1019 brelse(*bh2);
1020 *bh2 = NULL;
1021 }
1022 1022
1023 if (status != -ENOENT) 1023 if (status != -ENOENT)
1024 mlog_errno(status); 1024 mlog_errno(status);
@@ -1794,7 +1794,6 @@ static int ocfs2_symlink(struct inode *dir,
1794 mlog_errno(status); 1794 mlog_errno(status);
1795 goto bail; 1795 goto bail;
1796 } 1796 }
1797 d_set_d_op(dentry, &ocfs2_dentry_ops);
1798 1797
1799 status = ocfs2_add_entry(handle, dentry, inode, 1798 status = ocfs2_add_entry(handle, dentry, inode,
1800 le64_to_cpu(fe->i_blkno), parent_fe_bh, 1799 le64_to_cpu(fe->i_blkno), parent_fe_bh,
@@ -2459,7 +2458,6 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
2459 goto out_commit; 2458 goto out_commit;
2460 } 2459 }
2461 2460
2462 d_set_d_op(dentry, &ocfs2_dentry_ops);
2463 d_instantiate(dentry, inode); 2461 d_instantiate(dentry, inode);
2464 status = 0; 2462 status = 0;
2465out_commit: 2463out_commit:
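Besides more d_set_d_op() removals, namei.c fixes the ocfs2_double_lock() error path: when the second inode's cluster lock is dropped, the buffer head read under that lock is now released and cleared too, so the caller can never act on a stale *bh2. The general pattern, condensed (the real function's lock-ordering details are elided):

status = ocfs2_inode_lock(inode2, bh2, 1);
if (status < 0)
	goto bail;
status = ocfs2_inode_lock(inode1, bh1, 1);
if (status < 0 && oi1->ip_blkno != oi2->ip_blkno) {
	/* drop lock 2 and everything materialized under it */
	ocfs2_inode_unlock(inode2, 1);
	brelse(*bh2);
	*bh2 = NULL;
}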
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 70dd3b1798f1..51cd6898e7f1 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -420,6 +420,11 @@ struct ocfs2_super
420 struct inode *osb_tl_inode; 420 struct inode *osb_tl_inode;
421 struct buffer_head *osb_tl_bh; 421 struct buffer_head *osb_tl_bh;
422 struct delayed_work osb_truncate_log_wq; 422 struct delayed_work osb_truncate_log_wq;
423 /*
424 * How many clusters in our truncate log.
425 * It must be protected by osb_tl_inode->i_mutex.
426 */
427 unsigned int truncated_clusters;
423 428
424 struct ocfs2_node_map osb_recovering_orphan_dirs; 429 struct ocfs2_node_map osb_recovering_orphan_dirs;
425 unsigned int *osb_orphan_wipes; 430 unsigned int *osb_orphan_wipes;
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index 5fed60de7630..71998d4d61d5 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -1916,7 +1916,7 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
1916 if (res->sr_bg_blkno) { 1916 if (res->sr_bg_blkno) {
1917 /* Attempt to short-circuit the usual search mechanism 1917 /* Attempt to short-circuit the usual search mechanism
1918 * by jumping straight to the most recently used 1918 * by jumping straight to the most recently used
1919 * allocation group. This helps us mantain some 1919 * allocation group. This helps us maintain some
1920 * contiguousness across allocations. */ 1920 * contiguousness across allocations. */
1921 status = ocfs2_search_one_group(ac, handle, bits_wanted, 1921 status = ocfs2_search_one_group(ac, handle, bits_wanted,
1922 min_bits, res, &bits_left); 1922 min_bits, res, &bits_left);
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 17ff46fa8a10..38f986d2447e 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -993,8 +993,7 @@ static void ocfs2_disable_quotas(struct ocfs2_super *osb)
993} 993}
994 994
995/* Handle quota on quotactl */ 995/* Handle quota on quotactl */
996static int ocfs2_quota_on(struct super_block *sb, int type, int format_id, 996static int ocfs2_quota_on(struct super_block *sb, int type, int format_id)
997 char *path)
998{ 997{
999 unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA, 998 unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
1000 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA}; 999 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA};
@@ -1013,7 +1012,7 @@ static int ocfs2_quota_off(struct super_block *sb, int type)
1013} 1012}
1014 1013
1015static const struct quotactl_ops ocfs2_quotactl_ops = { 1014static const struct quotactl_ops ocfs2_quotactl_ops = {
1016 .quota_on = ocfs2_quota_on, 1015 .quota_on_meta = ocfs2_quota_on,
1017 .quota_off = ocfs2_quota_off, 1016 .quota_off = ocfs2_quota_off,
1018 .quota_sync = dquot_quota_sync, 1017 .quota_sync = dquot_quota_sync,
1019 .get_info = dquot_get_dqinfo, 1018 .get_info = dquot_get_dqinfo,
@@ -2097,6 +2096,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
2097 2096
2098 sb->s_fs_info = osb; 2097 sb->s_fs_info = osb;
2099 sb->s_op = &ocfs2_sops; 2098 sb->s_op = &ocfs2_sops;
2099 sb->s_d_op = &ocfs2_dentry_ops;
2100 sb->s_export_op = &ocfs2_export_ops; 2100 sb->s_export_op = &ocfs2_export_ops;
2101 sb->s_qcop = &ocfs2_quotactl_ops; 2101 sb->s_qcop = &ocfs2_quotactl_ops;
2102 sb->dq_op = &ocfs2_quota_operations; 2102 sb->dq_op = &ocfs2_quota_operations;
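Two independent changes land in super.c: quota-on switches to the ->quota_on_meta callback, whose (sb, type, format_id) signature drops the userspace path argument because ocfs2 keeps its quota files in internal system inodes, and the superblock now carries the dentry operations, the one-line replacement for all the d_set_d_op() calls removed above. Illustrative mount-time skeleton (names other than the struct fields are placeholders):

static int myfs_fill_super(struct super_block *sb, void *data, int silent)
{
	sb->s_op = &myfs_sops;
	sb->s_d_op = &myfs_dentry_ops;	/* inherited by every new dentry */
	sb->s_export_op = &myfs_export_ops;
	/* ... */
	return 0;
}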
diff --git a/fs/open.c b/fs/open.c
index 4197b9ed023d..e52389e1f05b 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -223,7 +223,12 @@ int do_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
223 return -EINVAL; 223 return -EINVAL;
224 224
225 /* Return error if mode is not supported */ 225 /* Return error if mode is not supported */
226 if (mode && !(mode & FALLOC_FL_KEEP_SIZE)) 226 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
227 return -EOPNOTSUPP;
228
229 /* Punch hole must have keep size set */
230 if ((mode & FALLOC_FL_PUNCH_HOLE) &&
231 !(mode & FALLOC_FL_KEEP_SIZE))
227 return -EOPNOTSUPP; 232 return -EOPNOTSUPP;
228 233
229 if (!(file->f_mode & FMODE_WRITE)) 234 if (!(file->f_mode & FMODE_WRITE))
@@ -250,10 +255,10 @@ int do_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
250 if (((offset + len) > inode->i_sb->s_maxbytes) || ((offset + len) < 0)) 255 if (((offset + len) > inode->i_sb->s_maxbytes) || ((offset + len) < 0))
251 return -EFBIG; 256 return -EFBIG;
252 257
253 if (!inode->i_op->fallocate) 258 if (!file->f_op->fallocate)
254 return -EOPNOTSUPP; 259 return -EOPNOTSUPP;
255 260
256 return inode->i_op->fallocate(inode, mode, offset, len); 261 return file->f_op->fallocate(file, mode, offset, len);
257} 262}
258 263
259SYSCALL_DEFINE(fallocate)(int fd, int mode, loff_t offset, loff_t len) 264SYSCALL_DEFINE(fallocate)(int fd, int mode, loff_t offset, loff_t len)
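do_fallocate() now validates the full mode mask, requires FALLOC_FL_KEEP_SIZE whenever FALLOC_FL_PUNCH_HOLE is set, and dispatches through file->f_op->fallocate instead of the old inode_operations hook. From userspace the new capability looks like the sketch below; the mount point and sizes are arbitrary examples:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <linux/falloc.h>

int main(void)
{
	int fd = open("/mnt/ocfs2/testfile", O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* punch a 1 MiB hole at offset 4 MiB; omitting KEEP_SIZE here
	 * now fails with EOPNOTSUPP per the check added above */
	if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
		      4 << 20, 1 << 20) < 0)
		perror("fallocate");
	close(fd);
	return 0;
}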
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 0a8b0ad0c7e2..9c21119512b9 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -237,6 +237,13 @@ ssize_t part_size_show(struct device *dev,
237 return sprintf(buf, "%llu\n",(unsigned long long)p->nr_sects); 237 return sprintf(buf, "%llu\n",(unsigned long long)p->nr_sects);
238} 238}
239 239
240ssize_t part_ro_show(struct device *dev,
241 struct device_attribute *attr, char *buf)
242{
243 struct hd_struct *p = dev_to_part(dev);
244 return sprintf(buf, "%d\n", p->policy ? 1 : 0);
245}
246
240ssize_t part_alignment_offset_show(struct device *dev, 247ssize_t part_alignment_offset_show(struct device *dev,
241 struct device_attribute *attr, char *buf) 248 struct device_attribute *attr, char *buf)
242{ 249{
@@ -312,6 +319,7 @@ ssize_t part_fail_store(struct device *dev,
312static DEVICE_ATTR(partition, S_IRUGO, part_partition_show, NULL); 319static DEVICE_ATTR(partition, S_IRUGO, part_partition_show, NULL);
313static DEVICE_ATTR(start, S_IRUGO, part_start_show, NULL); 320static DEVICE_ATTR(start, S_IRUGO, part_start_show, NULL);
314static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL); 321static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL);
322static DEVICE_ATTR(ro, S_IRUGO, part_ro_show, NULL);
315static DEVICE_ATTR(alignment_offset, S_IRUGO, part_alignment_offset_show, NULL); 323static DEVICE_ATTR(alignment_offset, S_IRUGO, part_alignment_offset_show, NULL);
316static DEVICE_ATTR(discard_alignment, S_IRUGO, part_discard_alignment_show, 324static DEVICE_ATTR(discard_alignment, S_IRUGO, part_discard_alignment_show,
317 NULL); 325 NULL);
@@ -326,6 +334,7 @@ static struct attribute *part_attrs[] = {
326 &dev_attr_partition.attr, 334 &dev_attr_partition.attr,
327 &dev_attr_start.attr, 335 &dev_attr_start.attr,
328 &dev_attr_size.attr, 336 &dev_attr_size.attr,
337 &dev_attr_ro.attr,
329 &dev_attr_alignment_offset.attr, 338 &dev_attr_alignment_offset.attr,
330 &dev_attr_discard_alignment.attr, 339 &dev_attr_discard_alignment.attr,
331 &dev_attr_stat.attr, 340 &dev_attr_stat.attr,
@@ -372,6 +381,11 @@ static void delete_partition_rcu_cb(struct rcu_head *head)
372 put_device(part_to_dev(part)); 381 put_device(part_to_dev(part));
373} 382}
374 383
384void __delete_partition(struct hd_struct *part)
385{
386 call_rcu(&part->rcu_head, delete_partition_rcu_cb);
387}
388
375void delete_partition(struct gendisk *disk, int partno) 389void delete_partition(struct gendisk *disk, int partno)
376{ 390{
377 struct disk_part_tbl *ptbl = disk->part_tbl; 391 struct disk_part_tbl *ptbl = disk->part_tbl;
@@ -390,7 +404,7 @@ void delete_partition(struct gendisk *disk, int partno)
390 kobject_put(part->holder_dir); 404 kobject_put(part->holder_dir);
391 device_del(part_to_dev(part)); 405 device_del(part_to_dev(part));
392 406
393 call_rcu(&part->rcu_head, delete_partition_rcu_cb); 407 hd_struct_put(part);
394} 408}
395 409
396static ssize_t whole_disk_show(struct device *dev, 410static ssize_t whole_disk_show(struct device *dev,
@@ -489,6 +503,7 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno,
489 if (!dev_get_uevent_suppress(ddev)) 503 if (!dev_get_uevent_suppress(ddev))
490 kobject_uevent(&pdev->kobj, KOBJ_ADD); 504 kobject_uevent(&pdev->kobj, KOBJ_ADD);
491 505
506 hd_ref_init(p);
492 return p; 507 return p;
493 508
494out_free_info: 509out_free_info:
@@ -507,65 +522,6 @@ out_put:
507 return ERR_PTR(err); 522 return ERR_PTR(err);
508} 523}
509 524
510/* Not exported, helper to add_disk(). */
511void register_disk(struct gendisk *disk)
512{
513 struct device *ddev = disk_to_dev(disk);
514 struct block_device *bdev;
515 struct disk_part_iter piter;
516 struct hd_struct *part;
517 int err;
518
519 ddev->parent = disk->driverfs_dev;
520
521 dev_set_name(ddev, disk->disk_name);
522
523 /* delay uevents, until we scanned partition table */
524 dev_set_uevent_suppress(ddev, 1);
525
526 if (device_add(ddev))
527 return;
528 if (!sysfs_deprecated) {
529 err = sysfs_create_link(block_depr, &ddev->kobj,
530 kobject_name(&ddev->kobj));
531 if (err) {
532 device_del(ddev);
533 return;
534 }
535 }
536 disk->part0.holder_dir = kobject_create_and_add("holders", &ddev->kobj);
537 disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj);
538
539 /* No minors to use for partitions */
540 if (!disk_partitionable(disk))
541 goto exit;
542
543 /* No such device (e.g., media were just removed) */
544 if (!get_capacity(disk))
545 goto exit;
546
547 bdev = bdget_disk(disk, 0);
548 if (!bdev)
549 goto exit;
550
551 bdev->bd_invalidated = 1;
552 err = blkdev_get(bdev, FMODE_READ);
553 if (err < 0)
554 goto exit;
555 blkdev_put(bdev, FMODE_READ);
556
557exit:
558 /* announce disk after possible partitions are created */
559 dev_set_uevent_suppress(ddev, 0);
560 kobject_uevent(&ddev->kobj, KOBJ_ADD);
561
562 /* announce possible partitions */
563 disk_part_iter_init(&piter, disk, 0);
564 while ((part = disk_part_iter_next(&piter)))
565 kobject_uevent(&part_to_dev(part)->kobj, KOBJ_ADD);
566 disk_part_iter_exit(&piter);
567}
568
569static bool disk_unlock_native_capacity(struct gendisk *disk) 525static bool disk_unlock_native_capacity(struct gendisk *disk)
570{ 526{
571 const struct block_device_operations *bdops = disk->fops; 527 const struct block_device_operations *bdops = disk->fops;
@@ -728,33 +684,3 @@ fail:
728} 684}
729 685
730EXPORT_SYMBOL(read_dev_sector); 686EXPORT_SYMBOL(read_dev_sector);
731
732void del_gendisk(struct gendisk *disk)
733{
734 struct disk_part_iter piter;
735 struct hd_struct *part;
736
737 /* invalidate stuff */
738 disk_part_iter_init(&piter, disk,
739 DISK_PITER_INCL_EMPTY | DISK_PITER_REVERSE);
740 while ((part = disk_part_iter_next(&piter))) {
741 invalidate_partition(disk, part->partno);
742 delete_partition(disk, part->partno);
743 }
744 disk_part_iter_exit(&piter);
745
746 invalidate_partition(disk, 0);
747 blk_free_devt(disk_to_dev(disk)->devt);
748 set_capacity(disk, 0);
749 disk->flags &= ~GENHD_FL_UP;
750 unlink_gendisk(disk);
751 part_stat_set_all(&disk->part0, 0);
752 disk->part0.stamp = 0;
753
754 kobject_put(disk->part0.holder_dir);
755 kobject_put(disk->slave_dir);
756 disk->driverfs_dev = NULL;
757 if (!sysfs_deprecated)
758 sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk)));
759 device_del(disk_to_dev(disk));
760}
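The partitions/check.c hunk does three things: exposes the per-partition read-only flag as a new "ro" sysfs attribute, converts partition teardown from a direct call_rcu() to reference counting (delete_partition() now drops a reference, and the final hd_struct_put() reaches __delete_partition(), which schedules the RCU free), and removes register_disk()/del_gendisk() from this file, presumably consolidating them elsewhere in the block layer. Reading the new attribute from userspace, with an illustrative device name:

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/sys/block/sda/sda1/ro", "r");
	int ro;

	if (f && fscanf(f, "%d", &ro) == 1)
		printf("sda1 is %s\n", ro ? "read-only" : "read-write");
	if (f)
		fclose(f);
	return 0;
}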
diff --git a/fs/pipe.c b/fs/pipe.c
index 68f1f8e4e23b..da42f7db50de 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -441,7 +441,7 @@ redo:
441 break; 441 break;
442 } 442 }
443 if (do_wakeup) { 443 if (do_wakeup) {
444 wake_up_interruptible_sync(&pipe->wait); 444 wake_up_interruptible_sync_poll(&pipe->wait, POLLOUT | POLLWRNORM);
445 kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); 445 kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
446 } 446 }
447 pipe_wait(pipe); 447 pipe_wait(pipe);
@@ -450,7 +450,7 @@ redo:
450 450
451 /* Signal writers asynchronously that there is more room. */ 451 /* Signal writers asynchronously that there is more room. */
452 if (do_wakeup) { 452 if (do_wakeup) {
453 wake_up_interruptible_sync(&pipe->wait); 453 wake_up_interruptible_sync_poll(&pipe->wait, POLLOUT | POLLWRNORM);
454 kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); 454 kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
455 } 455 }
456 if (ret > 0) 456 if (ret > 0)
@@ -612,7 +612,7 @@ redo2:
612 break; 612 break;
613 } 613 }
614 if (do_wakeup) { 614 if (do_wakeup) {
615 wake_up_interruptible_sync(&pipe->wait); 615 wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLRDNORM);
616 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); 616 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
617 do_wakeup = 0; 617 do_wakeup = 0;
618 } 618 }
@@ -623,7 +623,7 @@ redo2:
623out: 623out:
624 mutex_unlock(&inode->i_mutex); 624 mutex_unlock(&inode->i_mutex);
625 if (do_wakeup) { 625 if (do_wakeup) {
626 wake_up_interruptible_sync(&pipe->wait); 626 wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLRDNORM);
627 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); 627 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
628 } 628 }
629 if (ret > 0) 629 if (ret > 0)
@@ -715,7 +715,7 @@ pipe_release(struct inode *inode, int decr, int decw)
715 if (!pipe->readers && !pipe->writers) { 715 if (!pipe->readers && !pipe->writers) {
716 free_pipe_info(inode); 716 free_pipe_info(inode);
717 } else { 717 } else {
718 wake_up_interruptible_sync(&pipe->wait); 718 wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM | POLLERR | POLLHUP);
719 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); 719 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
720 kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); 720 kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
721 } 721 }
@@ -1004,7 +1004,6 @@ struct file *create_write_pipe(int flags)
1004 goto err_inode; 1004 goto err_inode;
1005 path.mnt = mntget(pipe_mnt); 1005 path.mnt = mntget(pipe_mnt);
1006 1006
1007 d_set_d_op(path.dentry, &pipefs_dentry_operations);
1008 d_instantiate(path.dentry, inode); 1007 d_instantiate(path.dentry, inode);
1009 1008
1010 err = -ENFILE; 1009 err = -ENFILE;
@@ -1266,7 +1265,8 @@ static const struct super_operations pipefs_ops = {
1266static struct dentry *pipefs_mount(struct file_system_type *fs_type, 1265static struct dentry *pipefs_mount(struct file_system_type *fs_type,
1267 int flags, const char *dev_name, void *data) 1266 int flags, const char *dev_name, void *data)
1268{ 1267{
1269 return mount_pseudo(fs_type, "pipe:", &pipefs_ops, PIPEFS_MAGIC); 1268 return mount_pseudo(fs_type, "pipe:", &pipefs_ops,
1269 &pipefs_dentry_operations, PIPEFS_MAGIC);
1270} 1270}
1271 1271
1272static struct file_system_type pipe_fs_type = { 1272static struct file_system_type pipe_fs_type = {
@@ -1292,7 +1292,7 @@ static int __init init_pipe_fs(void)
1292static void __exit exit_pipe_fs(void) 1292static void __exit exit_pipe_fs(void)
1293{ 1293{
1294 unregister_filesystem(&pipe_fs_type); 1294 unregister_filesystem(&pipe_fs_type);
1295 mntput_long(pipe_mnt); 1295 mntput(pipe_mnt);
1296} 1296}
1297 1297
1298fs_initcall(init_pipe_fs); 1298fs_initcall(init_pipe_fs);
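The pipe.c wakeups switch to the _poll variants so the poll mask travels as the wakeup key: wait entries that filter on the key (epoll's callback does) are only woken when the event could satisfy them, rather than on every state change of the shared pipe->wait queue. Conceptually:

/* a reader consumed data; say exactly what became possible */
wake_up_interruptible_sync_poll(&pipe->wait, POLLOUT | POLLWRNORM);
/* an epoll waiter registered for POLLIN on this pipe stays asleep;
 * one registered for POLLOUT runs */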
diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig
index 6a0068841d96..15af6222f8a4 100644
--- a/fs/proc/Kconfig
+++ b/fs/proc/Kconfig
@@ -1,5 +1,5 @@
1config PROC_FS 1config PROC_FS
2 bool "/proc file system support" if EMBEDDED 2 bool "/proc file system support" if EXPERT
3 default y 3 default y
4 help 4 help
5 This is a virtual file system providing information about the status 5 This is a virtual file system providing information about the status
@@ -40,7 +40,7 @@ config PROC_VMCORE
40 Exports the dump image of crashed kernel in ELF format. 40 Exports the dump image of crashed kernel in ELF format.
41 41
42config PROC_SYSCTL 42config PROC_SYSCTL
43 bool "Sysctl support (/proc/sys)" if EMBEDDED 43 bool "Sysctl support (/proc/sys)" if EXPERT
44 depends on PROC_FS 44 depends on PROC_FS
45 select SYSCTL 45 select SYSCTL
46 default y 46 default y
@@ -61,7 +61,7 @@ config PROC_SYSCTL
61config PROC_PAGE_MONITOR 61config PROC_PAGE_MONITOR
62 default y 62 default y
63 depends on PROC_FS && MMU 63 depends on PROC_FS && MMU
64 bool "Enable /proc page monitoring" if EMBEDDED 64 bool "Enable /proc page monitoring" if EXPERT
65 help 65 help
66 Various /proc files exist to monitor process memory utilization: 66 Various /proc files exist to monitor process memory utilization:
67 /proc/pid/smaps, /proc/pid/clear_refs, /proc/pid/pagemap, 67 /proc/pid/smaps, /proc/pid/clear_refs, /proc/pid/pagemap,
diff --git a/fs/proc/Makefile b/fs/proc/Makefile
index 288a49e098bf..df434c5f28fb 100644
--- a/fs/proc/Makefile
+++ b/fs/proc/Makefile
@@ -10,12 +10,12 @@ proc-$(CONFIG_MMU) := mmu.o task_mmu.o
10proc-y += inode.o root.o base.o generic.o array.o \ 10proc-y += inode.o root.o base.o generic.o array.o \
11 proc_tty.o 11 proc_tty.o
12proc-y += cmdline.o 12proc-y += cmdline.o
13proc-y += consoles.o
13proc-y += cpuinfo.o 14proc-y += cpuinfo.o
14proc-y += devices.o 15proc-y += devices.o
15proc-y += interrupts.o 16proc-y += interrupts.o
16proc-y += loadavg.o 17proc-y += loadavg.o
17proc-y += meminfo.o 18proc-y += meminfo.o
18proc-y += proc_console.o
19proc-y += stat.o 19proc-y += stat.o
20proc-y += uptime.o 20proc-y += uptime.o
21proc-y += version.o 21proc-y += version.o
diff --git a/fs/proc/array.c b/fs/proc/array.c
index fff6572676ae..df2b703b9d0f 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -95,7 +95,7 @@ static inline void task_name(struct seq_file *m, struct task_struct *p)
95 95
96 get_task_comm(tcomm, p); 96 get_task_comm(tcomm, p);
97 97
98 seq_printf(m, "Name:\t"); 98 seq_puts(m, "Name:\t");
99 end = m->buf + m->size; 99 end = m->buf + m->size;
100 buf = m->buf + m->count; 100 buf = m->buf + m->count;
101 name = tcomm; 101 name = tcomm;
@@ -122,7 +122,7 @@ static inline void task_name(struct seq_file *m, struct task_struct *p)
122 buf++; 122 buf++;
123 } 123 }
124 m->count = buf - m->buf; 124 m->count = buf - m->buf;
125 seq_printf(m, "\n"); 125 seq_putc(m, '\n');
126} 126}
127 127
128/* 128/*
@@ -208,7 +208,7 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
208 seq_printf(m, "%d ", GROUP_AT(group_info, g)); 208 seq_printf(m, "%d ", GROUP_AT(group_info, g));
209 put_cred(cred); 209 put_cred(cred);
210 210
211 seq_printf(m, "\n"); 211 seq_putc(m, '\n');
212} 212}
213 213
214static void render_sigset_t(struct seq_file *m, const char *header, 214static void render_sigset_t(struct seq_file *m, const char *header,
@@ -216,7 +216,7 @@ static void render_sigset_t(struct seq_file *m, const char *header,
216{ 216{
217 int i; 217 int i;
218 218
219 seq_printf(m, "%s", header); 219 seq_puts(m, header);
220 220
221 i = _NSIG; 221 i = _NSIG;
222 do { 222 do {
@@ -230,7 +230,7 @@ static void render_sigset_t(struct seq_file *m, const char *header,
230 seq_printf(m, "%x", x); 230 seq_printf(m, "%x", x);
231 } while (i >= 4); 231 } while (i >= 4);
232 232
233 seq_printf(m, "\n"); 233 seq_putc(m, '\n');
234} 234}
235 235
236static void collect_sigign_sigcatch(struct task_struct *p, sigset_t *ign, 236static void collect_sigign_sigcatch(struct task_struct *p, sigset_t *ign,
@@ -291,12 +291,12 @@ static void render_cap_t(struct seq_file *m, const char *header,
291{ 291{
292 unsigned __capi; 292 unsigned __capi;
293 293
294 seq_printf(m, "%s", header); 294 seq_puts(m, header);
295 CAP_FOR_EACH_U32(__capi) { 295 CAP_FOR_EACH_U32(__capi) {
296 seq_printf(m, "%08x", 296 seq_printf(m, "%08x",
297 a->cap[(_KERNEL_CAPABILITY_U32S-1) - __capi]); 297 a->cap[(_KERNEL_CAPABILITY_U32S-1) - __capi]);
298 } 298 }
299 seq_printf(m, "\n"); 299 seq_putc(m, '\n');
300} 300}
301 301
302static inline void task_cap(struct seq_file *m, struct task_struct *p) 302static inline void task_cap(struct seq_file *m, struct task_struct *p)
@@ -329,12 +329,12 @@ static inline void task_context_switch_counts(struct seq_file *m,
329 329
330static void task_cpus_allowed(struct seq_file *m, struct task_struct *task) 330static void task_cpus_allowed(struct seq_file *m, struct task_struct *task)
331{ 331{
332 seq_printf(m, "Cpus_allowed:\t"); 332 seq_puts(m, "Cpus_allowed:\t");
333 seq_cpumask(m, &task->cpus_allowed); 333 seq_cpumask(m, &task->cpus_allowed);
334 seq_printf(m, "\n"); 334 seq_putc(m, '\n');
335 seq_printf(m, "Cpus_allowed_list:\t"); 335 seq_puts(m, "Cpus_allowed_list:\t");
336 seq_cpumask_list(m, &task->cpus_allowed); 336 seq_cpumask_list(m, &task->cpus_allowed);
337 seq_printf(m, "\n"); 337 seq_putc(m, '\n');
338} 338}
339 339
340int proc_pid_status(struct seq_file *m, struct pid_namespace *ns, 340int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
@@ -535,15 +535,15 @@ int proc_tgid_stat(struct seq_file *m, struct pid_namespace *ns,
535int proc_pid_statm(struct seq_file *m, struct pid_namespace *ns, 535int proc_pid_statm(struct seq_file *m, struct pid_namespace *ns,
536 struct pid *pid, struct task_struct *task) 536 struct pid *pid, struct task_struct *task)
537{ 537{
538 int size = 0, resident = 0, shared = 0, text = 0, lib = 0, data = 0; 538 unsigned long size = 0, resident = 0, shared = 0, text = 0, data = 0;
539 struct mm_struct *mm = get_task_mm(task); 539 struct mm_struct *mm = get_task_mm(task);
540 540
541 if (mm) { 541 if (mm) {
542 size = task_statm(mm, &shared, &text, &data, &resident); 542 size = task_statm(mm, &shared, &text, &data, &resident);
543 mmput(mm); 543 mmput(mm);
544 } 544 }
545 seq_printf(m, "%d %d %d %d %d %d %d\n", 545 seq_printf(m, "%lu %lu %lu %lu 0 %lu 0\n",
546 size, resident, shared, text, lib, data, 0); 546 size, resident, shared, text, data);
547 547
548 return 0; 548 return 0;
549} 549}
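proc/array.c swaps seq_printf() for seq_puts()/seq_putc() wherever the output is a constant string or single character, skipping format parsing, and task_statm() switches to unsigned long counts while the unused lib and dirty columns are emitted as literal zeros. The output pattern in miniature (my_show is an illustrative name):

static int my_show(struct seq_file *m, void *v)
{
	seq_puts(m, "Name:\t");		/* constant string: no format parse */
	seq_printf(m, "%d", 42);	/* real formatting keeps seq_printf */
	seq_putc(m, '\n');		/* single character */
	return 0;
}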
diff --git a/fs/proc/base.c b/fs/proc/base.c
index b20962c71a52..9d096e82b201 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -373,26 +373,20 @@ static int lstats_show_proc(struct seq_file *m, void *v)
373 return -ESRCH; 373 return -ESRCH;
374 seq_puts(m, "Latency Top version : v0.1\n"); 374 seq_puts(m, "Latency Top version : v0.1\n");
375 for (i = 0; i < 32; i++) { 375 for (i = 0; i < 32; i++) {
376 if (task->latency_record[i].backtrace[0]) { 376 struct latency_record *lr = &task->latency_record[i];
377 if (lr->backtrace[0]) {
377 int q; 378 int q;
378 seq_printf(m, "%i %li %li ", 379 seq_printf(m, "%i %li %li",
379 task->latency_record[i].count, 380 lr->count, lr->time, lr->max);
380 task->latency_record[i].time,
381 task->latency_record[i].max);
382 for (q = 0; q < LT_BACKTRACEDEPTH; q++) { 381 for (q = 0; q < LT_BACKTRACEDEPTH; q++) {
383 char sym[KSYM_SYMBOL_LEN]; 382 unsigned long bt = lr->backtrace[q];
384 char *c; 383 if (!bt)
385 if (!task->latency_record[i].backtrace[q])
386 break; 384 break;
387 if (task->latency_record[i].backtrace[q] == ULONG_MAX) 385 if (bt == ULONG_MAX)
388 break; 386 break;
389 sprint_symbol(sym, task->latency_record[i].backtrace[q]); 387 seq_printf(m, " %ps", (void *)bt);
390 c = strchr(sym, '+');
391 if (c)
392 *c = 0;
393 seq_printf(m, "%s ", sym);
394 } 388 }
395 seq_printf(m, "\n"); 389 seq_putc(m, '\n');
396 } 390 }
397 391
398 } 392 }
@@ -751,14 +745,7 @@ static int proc_single_show(struct seq_file *m, void *v)
751 745
752static int proc_single_open(struct inode *inode, struct file *filp) 746static int proc_single_open(struct inode *inode, struct file *filp)
753{ 747{
754 int ret; 748 return single_open(filp, proc_single_show, inode);
755 ret = single_open(filp, proc_single_show, NULL);
756 if (!ret) {
757 struct seq_file *m = filp->private_data;
758
759 m->private = inode;
760 }
761 return ret;
762} 749}
763 750
764static const struct file_operations proc_single_file_operations = { 751static const struct file_operations proc_single_file_operations = {
@@ -1164,7 +1151,7 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,
1164 goto err_task_lock; 1151 goto err_task_lock;
1165 } 1152 }
1166 1153
1167 if (oom_score_adj < task->signal->oom_score_adj && 1154 if (oom_score_adj < task->signal->oom_score_adj_min &&
1168 !capable(CAP_SYS_RESOURCE)) { 1155 !capable(CAP_SYS_RESOURCE)) {
1169 err = -EACCES; 1156 err = -EACCES;
1170 goto err_sighand; 1157 goto err_sighand;
@@ -1177,6 +1164,8 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,
1177 atomic_dec(&task->mm->oom_disable_count); 1164 atomic_dec(&task->mm->oom_disable_count);
1178 } 1165 }
1179 task->signal->oom_score_adj = oom_score_adj; 1166 task->signal->oom_score_adj = oom_score_adj;
1167 if (has_capability_noaudit(current, CAP_SYS_RESOURCE))
1168 task->signal->oom_score_adj_min = oom_score_adj;
1180 /* 1169 /*
1181 * Scale /proc/pid/oom_adj appropriately ensuring that OOM_DISABLE is 1170 * Scale /proc/pid/oom_adj appropriately ensuring that OOM_DISABLE is
1182 * always attainable. 1171 * always attainable.
@@ -1386,15 +1375,7 @@ sched_write(struct file *file, const char __user *buf,
1386 1375
1387static int sched_open(struct inode *inode, struct file *filp) 1376static int sched_open(struct inode *inode, struct file *filp)
1388{ 1377{
1389 int ret; 1378 return single_open(filp, sched_show, inode);
1390
1391 ret = single_open(filp, sched_show, NULL);
1392 if (!ret) {
1393 struct seq_file *m = filp->private_data;
1394
1395 m->private = inode;
1396 }
1397 return ret;
1398} 1379}
1399 1380
1400static const struct file_operations proc_pid_sched_operations = { 1381static const struct file_operations proc_pid_sched_operations = {
@@ -1530,15 +1511,7 @@ static int comm_show(struct seq_file *m, void *v)
1530 1511
1531static int comm_open(struct inode *inode, struct file *filp) 1512static int comm_open(struct inode *inode, struct file *filp)
1532{ 1513{
1533 int ret; 1514 return single_open(filp, comm_show, inode);
1534
1535 ret = single_open(filp, comm_show, NULL);
1536 if (!ret) {
1537 struct seq_file *m = filp->private_data;
1538
1539 m->private = inode;
1540 }
1541 return ret;
1542} 1515}
1543 1516
1544static const struct file_operations proc_pid_set_comm_operations = { 1517static const struct file_operations proc_pid_set_comm_operations = {
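Two simplifications recur through proc/base.c: single_open() already stores its third argument in seq_file->private, so the open-then-patch-m->private dance collapses to a single call, and the %ps vsnprintf extension prints a symbol name without its offset, replacing the sprint_symbol() buffer plus the manual '+' truncation. In miniature (my_show/my_open are illustrative names):

static int my_open(struct inode *inode, struct file *filp)
{
	/* inode lands in ((struct seq_file *)filp->private_data)->private */
	return single_open(filp, my_show, inode);
}

/* symbol name only, no +offset/size suffix: */
seq_printf(m, " %ps", (void *)lr->backtrace[q]);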
diff --git a/fs/proc/proc_console.c b/fs/proc/consoles.c
index 8a707609f528..eafc22ab1fdd 100644
--- a/fs/proc/proc_console.c
+++ b/fs/proc/consoles.c
@@ -106,9 +106,9 @@ static const struct file_operations proc_consoles_operations = {
106 .release = seq_release, 106 .release = seq_release,
107}; 107};
108 108
109static int register_proc_consoles(void) 109static int __init proc_consoles_init(void)
110{ 110{
111 proc_create("consoles", 0, NULL, &proc_consoles_operations); 111 proc_create("consoles", 0, NULL, &proc_consoles_operations);
112 return 0; 112 return 0;
113} 113}
114module_init(register_proc_consoles); 114module_init(proc_consoles_init);
diff --git a/fs/proc/devices.c b/fs/proc/devices.c
index 59ee7da959c9..b14347167c35 100644
--- a/fs/proc/devices.c
+++ b/fs/proc/devices.c
@@ -9,14 +9,14 @@ static int devinfo_show(struct seq_file *f, void *v)
9 9
10 if (i < CHRDEV_MAJOR_HASH_SIZE) { 10 if (i < CHRDEV_MAJOR_HASH_SIZE) {
11 if (i == 0) 11 if (i == 0)
12 seq_printf(f, "Character devices:\n"); 12 seq_puts(f, "Character devices:\n");
13 chrdev_show(f, i); 13 chrdev_show(f, i);
14 } 14 }
15#ifdef CONFIG_BLOCK 15#ifdef CONFIG_BLOCK
16 else { 16 else {
17 i -= CHRDEV_MAJOR_HASH_SIZE; 17 i -= CHRDEV_MAJOR_HASH_SIZE;
18 if (i == 0) 18 if (i == 0)
19 seq_printf(f, "\nBlock devices:\n"); 19 seq_puts(f, "\nBlock devices:\n");
20 blkdev_show(f, i); 20 blkdev_show(f, i);
21 } 21 }
22#endif 22#endif
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index f766be29d2c7..01e07f2a188f 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -425,13 +425,10 @@ struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir,
425 if (de->namelen != dentry->d_name.len) 425 if (de->namelen != dentry->d_name.len)
426 continue; 426 continue;
427 if (!memcmp(dentry->d_name.name, de->name, de->namelen)) { 427 if (!memcmp(dentry->d_name.name, de->name, de->namelen)) {
428 unsigned int ino;
429
430 ino = de->low_ino;
431 pde_get(de); 428 pde_get(de);
432 spin_unlock(&proc_subdir_lock); 429 spin_unlock(&proc_subdir_lock);
433 error = -EINVAL; 430 error = -EINVAL;
434 inode = proc_get_inode(dir->i_sb, ino, de); 431 inode = proc_get_inode(dir->i_sb, de);
435 goto out_unlock; 432 goto out_unlock;
436 } 433 }
437 } 434 }
@@ -768,12 +765,7 @@ EXPORT_SYMBOL(proc_create_data);
768 765
769static void free_proc_entry(struct proc_dir_entry *de) 766static void free_proc_entry(struct proc_dir_entry *de)
770{ 767{
771 unsigned int ino = de->low_ino; 768 release_inode_number(de->low_ino);
772
773 if (ino < PROC_DYNAMIC_FIRST)
774 return;
775
776 release_inode_number(ino);
777 769
778 if (S_ISLNK(de->mode)) 770 if (S_ISLNK(de->mode))
779 kfree(de->data); 771 kfree(de->data);
@@ -834,12 +826,9 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent)
834 826
835 wait_for_completion(de->pde_unload_completion); 827 wait_for_completion(de->pde_unload_completion);
836 828
837 goto continue_removing; 829 spin_lock(&de->pde_unload_lock);
838 } 830 }
839 spin_unlock(&de->pde_unload_lock);
840 831
841continue_removing:
842 spin_lock(&de->pde_unload_lock);
843 while (!list_empty(&de->pde_openers)) { 832 while (!list_empty(&de->pde_openers)) {
844 struct pde_opener *pdeo; 833 struct pde_opener *pdeo;
845 834
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 6bcb926b101b..176ce4cda68a 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -416,12 +416,11 @@ static const struct file_operations proc_reg_file_ops_no_compat = {
416}; 416};
417#endif 417#endif
418 418
419struct inode *proc_get_inode(struct super_block *sb, unsigned int ino, 419struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de)
420 struct proc_dir_entry *de)
421{ 420{
422 struct inode * inode; 421 struct inode * inode;
423 422
424 inode = iget_locked(sb, ino); 423 inode = iget_locked(sb, de->low_ino);
425 if (!inode) 424 if (!inode)
426 return NULL; 425 return NULL;
427 if (inode->i_state & I_NEW) { 426 if (inode->i_state & I_NEW) {
@@ -471,7 +470,7 @@ int proc_fill_super(struct super_block *s)
471 s->s_time_gran = 1; 470 s->s_time_gran = 1;
472 471
473 pde_get(&proc_root); 472 pde_get(&proc_root);
474 root_inode = proc_get_inode(s, PROC_ROOT_INO, &proc_root); 473 root_inode = proc_get_inode(s, &proc_root);
475 if (!root_inode) 474 if (!root_inode)
476 goto out_no_root; 475 goto out_no_root;
477 root_inode->i_uid = 0; 476 root_inode->i_uid = 0;
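After the signature change the inode number is always taken from the entry itself. A condensed sketch of the iget_locked() idiom that proc_get_inode() follows, with the initialisation details elided:

struct inode *example_get_inode(struct super_block *sb,
				struct proc_dir_entry *de)
{
	struct inode *inode = iget_locked(sb, de->low_ino);

	if (!inode)
		return NULL;
	if (inode->i_state & I_NEW) {
		inode->i_mtime = inode->i_atime = inode->i_ctime =
			CURRENT_TIME;
		/* ... attach de and copy i_mode/i_uid/i_gid/i_op/i_fop ... */
		unlock_new_inode(inode);
	}
	return inode;
}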
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 1f24a3eddd12..9ad561ded409 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -96,7 +96,8 @@ extern spinlock_t proc_subdir_lock;
96struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *); 96struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *);
97int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir); 97int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir);
98unsigned long task_vsize(struct mm_struct *); 98unsigned long task_vsize(struct mm_struct *);
99int task_statm(struct mm_struct *, int *, int *, int *, int *); 99unsigned long task_statm(struct mm_struct *,
100 unsigned long *, unsigned long *, unsigned long *, unsigned long *);
100void task_mem(struct seq_file *, struct mm_struct *); 101void task_mem(struct seq_file *, struct mm_struct *);
101 102
102static inline struct proc_dir_entry *pde_get(struct proc_dir_entry *pde) 103static inline struct proc_dir_entry *pde_get(struct proc_dir_entry *pde)
@@ -108,7 +109,7 @@ void pde_put(struct proc_dir_entry *pde);
108 109
109extern struct vfsmount *proc_mnt; 110extern struct vfsmount *proc_mnt;
110int proc_fill_super(struct super_block *); 111int proc_fill_super(struct super_block *);
111struct inode *proc_get_inode(struct super_block *, unsigned int, struct proc_dir_entry *); 112struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *);
112 113
113/* 114/*
114 * These are generic /proc routines that use the internal 115 * These are generic /proc routines that use the internal
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 6f37c391468d..d245cb23dd72 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -558,7 +558,7 @@ static int open_kcore(struct inode *inode, struct file *filp)
558static const struct file_operations proc_kcore_operations = { 558static const struct file_operations proc_kcore_operations = {
559 .read = read_kcore, 559 .read = read_kcore,
560 .open = open_kcore, 560 .open = open_kcore,
561 .llseek = generic_file_llseek, 561 .llseek = default_llseek,
562}; 562};
563 563
564#ifdef CONFIG_MEMORY_HOTPLUG 564#ifdef CONFIG_MEMORY_HOTPLUG
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index a65239cfd97e..ed257d141568 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -101,6 +101,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
101#ifdef CONFIG_MEMORY_FAILURE 101#ifdef CONFIG_MEMORY_FAILURE
102 "HardwareCorrupted: %5lu kB\n" 102 "HardwareCorrupted: %5lu kB\n"
103#endif 103#endif
104#ifdef CONFIG_TRANSPARENT_HUGEPAGE
105 "AnonHugePages: %8lu kB\n"
106#endif
104 , 107 ,
105 K(i.totalram), 108 K(i.totalram),
106 K(i.freeram), 109 K(i.freeram),
@@ -128,7 +131,12 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
128 K(i.freeswap), 131 K(i.freeswap),
129 K(global_page_state(NR_FILE_DIRTY)), 132 K(global_page_state(NR_FILE_DIRTY)),
130 K(global_page_state(NR_WRITEBACK)), 133 K(global_page_state(NR_WRITEBACK)),
131 K(global_page_state(NR_ANON_PAGES)), 134 K(global_page_state(NR_ANON_PAGES)
135#ifdef CONFIG_TRANSPARENT_HUGEPAGE
136 + global_page_state(NR_ANON_TRANSPARENT_HUGEPAGES) *
137 HPAGE_PMD_NR
138#endif
139 ),
132 K(global_page_state(NR_FILE_MAPPED)), 140 K(global_page_state(NR_FILE_MAPPED)),
133 K(global_page_state(NR_SHMEM)), 141 K(global_page_state(NR_SHMEM)),
134 K(global_page_state(NR_SLAB_RECLAIMABLE) + 142 K(global_page_state(NR_SLAB_RECLAIMABLE) +
@@ -151,6 +159,10 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
151#ifdef CONFIG_MEMORY_FAILURE 159#ifdef CONFIG_MEMORY_FAILURE
152 ,atomic_long_read(&mce_bad_pages) << (PAGE_SHIFT - 10) 160 ,atomic_long_read(&mce_bad_pages) << (PAGE_SHIFT - 10)
153#endif 161#endif
162#ifdef CONFIG_TRANSPARENT_HUGEPAGE
163 ,K(global_page_state(NR_ANON_TRANSPARENT_HUGEPAGES) *
164 HPAGE_PMD_NR)
165#endif
154 ); 166 );
155 167
156 hugetlb_report_meminfo(m); 168 hugetlb_report_meminfo(m);
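The new AnonHugePages field only appears on CONFIG_TRANSPARENT_HUGEPAGE kernels. A minimal userspace reader:

#include <stdio.h>

int main(void)
{
	char line[256];
	unsigned long kb;
	FILE *f = fopen("/proc/meminfo", "r");

	if (!f)
		return 1;
	while (fgets(line, sizeof(line), f)) {
		if (sscanf(line, "AnonHugePages: %lu kB", &kb) == 1) {
			printf("anon THP: %lu kB\n", kb);
			break;
		}
	}
	fclose(f);
	return 0;
}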
diff --git a/fs/proc/page.c b/fs/proc/page.c
index 3b8b45660331..6d8e6a9e93ab 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -40,7 +40,7 @@ static ssize_t kpagecount_read(struct file *file, char __user *buf,
40 ppage = pfn_to_page(pfn); 40 ppage = pfn_to_page(pfn);
41 else 41 else
42 ppage = NULL; 42 ppage = NULL;
43 if (!ppage) 43 if (!ppage || PageSlab(ppage))
44 pcount = 0; 44 pcount = 0;
45 else 45 else
46 pcount = page_mapcount(ppage); 46 pcount = page_mapcount(ppage);
@@ -116,15 +116,17 @@ u64 stable_page_flags(struct page *page)
116 if (PageHuge(page)) 116 if (PageHuge(page))
117 u |= 1 << KPF_HUGE; 117 u |= 1 << KPF_HUGE;
118 118
119 u |= kpf_copy_bit(k, KPF_LOCKED, PG_locked);
120
121 /* 119 /*
122 * Caveats on high order pages: 120 * Caveats on high order pages: page->_count will only be set
123 * PG_buddy will only be set on the head page; SLUB/SLQB do the same 121 * -1 on the head page; SLUB/SLQB do the same for PG_slab;
124 * for PG_slab; SLOB won't set PG_slab at all on compound pages. 122 * SLOB won't set PG_slab at all on compound pages.
125 */ 123 */
124 if (PageBuddy(page))
125 u |= 1 << KPF_BUDDY;
126
127 u |= kpf_copy_bit(k, KPF_LOCKED, PG_locked);
128
126 u |= kpf_copy_bit(k, KPF_SLAB, PG_slab); 129 u |= kpf_copy_bit(k, KPF_SLAB, PG_slab);
127 u |= kpf_copy_bit(k, KPF_BUDDY, PG_buddy);
128 130
129 u |= kpf_copy_bit(k, KPF_ERROR, PG_error); 131 u |= kpf_copy_bit(k, KPF_ERROR, PG_error);
130 u |= kpf_copy_bit(k, KPF_DIRTY, PG_dirty); 132 u |= kpf_copy_bit(k, KPF_DIRTY, PG_dirty);
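With the PageSlab() check, slab pages now report a map count of 0 rather than a misleading page_mapcount(). /proc/kpagecount is a flat array of u64 counts indexed by page frame number, as in this sketch (needs root):

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	uint64_t pfn = argc > 1 ? strtoull(argv[1], NULL, 0) : 0;
	uint64_t count;
	int fd = open("/proc/kpagecount", O_RDONLY);

	/* each pfn owns one 8-byte slot */
	if (fd < 0 || pread(fd, &count, 8, pfn * 8) != 8) {
		perror("kpagecount");
		return 1;
	}
	printf("pfn %llu mapcount %llu\n",
	       (unsigned long long)pfn, (unsigned long long)count);
	return 0;
}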
diff --git a/fs/proc/proc_tty.c b/fs/proc/proc_tty.c
index 83adcc869437..cb761f010300 100644
--- a/fs/proc/proc_tty.c
+++ b/fs/proc/proc_tty.c
@@ -36,27 +36,27 @@ static void show_tty_range(struct seq_file *m, struct tty_driver *p,
36 } 36 }
37 switch (p->type) { 37 switch (p->type) {
38 case TTY_DRIVER_TYPE_SYSTEM: 38 case TTY_DRIVER_TYPE_SYSTEM:
39 seq_printf(m, "system"); 39 seq_puts(m, "system");
40 if (p->subtype == SYSTEM_TYPE_TTY) 40 if (p->subtype == SYSTEM_TYPE_TTY)
41 seq_printf(m, ":/dev/tty"); 41 seq_puts(m, ":/dev/tty");
42 else if (p->subtype == SYSTEM_TYPE_SYSCONS) 42 else if (p->subtype == SYSTEM_TYPE_SYSCONS)
43 seq_printf(m, ":console"); 43 seq_puts(m, ":console");
44 else if (p->subtype == SYSTEM_TYPE_CONSOLE) 44 else if (p->subtype == SYSTEM_TYPE_CONSOLE)
45 seq_printf(m, ":vtmaster"); 45 seq_puts(m, ":vtmaster");
46 break; 46 break;
47 case TTY_DRIVER_TYPE_CONSOLE: 47 case TTY_DRIVER_TYPE_CONSOLE:
48 seq_printf(m, "console"); 48 seq_puts(m, "console");
49 break; 49 break;
50 case TTY_DRIVER_TYPE_SERIAL: 50 case TTY_DRIVER_TYPE_SERIAL:
51 seq_printf(m, "serial"); 51 seq_puts(m, "serial");
52 break; 52 break;
53 case TTY_DRIVER_TYPE_PTY: 53 case TTY_DRIVER_TYPE_PTY:
54 if (p->subtype == PTY_TYPE_MASTER) 54 if (p->subtype == PTY_TYPE_MASTER)
55 seq_printf(m, "pty:master"); 55 seq_puts(m, "pty:master");
56 else if (p->subtype == PTY_TYPE_SLAVE) 56 else if (p->subtype == PTY_TYPE_SLAVE)
57 seq_printf(m, "pty:slave"); 57 seq_puts(m, "pty:slave");
58 else 58 else
59 seq_printf(m, "pty"); 59 seq_puts(m, "pty");
60 break; 60 break;
61 default: 61 default:
62 seq_printf(m, "type:%d.%d", p->type, p->subtype); 62 seq_printf(m, "type:%d.%d", p->type, p->subtype);
@@ -74,19 +74,19 @@ static int show_tty_driver(struct seq_file *m, void *v)
74 /* pseudo-drivers first */ 74 /* pseudo-drivers first */
75 seq_printf(m, "%-20s /dev/%-8s ", "/dev/tty", "tty"); 75 seq_printf(m, "%-20s /dev/%-8s ", "/dev/tty", "tty");
76 seq_printf(m, "%3d %7d ", TTYAUX_MAJOR, 0); 76 seq_printf(m, "%3d %7d ", TTYAUX_MAJOR, 0);
77 seq_printf(m, "system:/dev/tty\n"); 77 seq_puts(m, "system:/dev/tty\n");
78 seq_printf(m, "%-20s /dev/%-8s ", "/dev/console", "console"); 78 seq_printf(m, "%-20s /dev/%-8s ", "/dev/console", "console");
79 seq_printf(m, "%3d %7d ", TTYAUX_MAJOR, 1); 79 seq_printf(m, "%3d %7d ", TTYAUX_MAJOR, 1);
80 seq_printf(m, "system:console\n"); 80 seq_puts(m, "system:console\n");
81#ifdef CONFIG_UNIX98_PTYS 81#ifdef CONFIG_UNIX98_PTYS
82 seq_printf(m, "%-20s /dev/%-8s ", "/dev/ptmx", "ptmx"); 82 seq_printf(m, "%-20s /dev/%-8s ", "/dev/ptmx", "ptmx");
83 seq_printf(m, "%3d %7d ", TTYAUX_MAJOR, 2); 83 seq_printf(m, "%3d %7d ", TTYAUX_MAJOR, 2);
84 seq_printf(m, "system\n"); 84 seq_puts(m, "system\n");
85#endif 85#endif
86#ifdef CONFIG_VT 86#ifdef CONFIG_VT
87 seq_printf(m, "%-20s /dev/%-8s ", "/dev/vc/0", "vc/0"); 87 seq_printf(m, "%-20s /dev/%-8s ", "/dev/vc/0", "vc/0");
88 seq_printf(m, "%3d %7d ", TTY_MAJOR, 0); 88 seq_printf(m, "%3d %7d ", TTY_MAJOR, 0);
89 seq_printf(m, "system:vtmaster\n"); 89 seq_puts(m, "system:vtmaster\n");
90#endif 90#endif
91 } 91 }
92 92
diff --git a/fs/proc/softirqs.c b/fs/proc/softirqs.c
index 37994737c983..62604be9f58d 100644
--- a/fs/proc/softirqs.c
+++ b/fs/proc/softirqs.c
@@ -10,16 +10,16 @@ static int show_softirqs(struct seq_file *p, void *v)
10{ 10{
11 int i, j; 11 int i, j;
12 12
13 seq_printf(p, " "); 13 seq_puts(p, " ");
14 for_each_possible_cpu(i) 14 for_each_possible_cpu(i)
15 seq_printf(p, "CPU%-8d", i); 15 seq_printf(p, "CPU%-8d", i);
16 seq_printf(p, "\n"); 16 seq_putc(p, '\n');
17 17
18 for (i = 0; i < NR_SOFTIRQS; i++) { 18 for (i = 0; i < NR_SOFTIRQS; i++) {
19 seq_printf(p, "%12s:", softirq_to_name[i]); 19 seq_printf(p, "%12s:", softirq_to_name[i]);
20 for_each_possible_cpu(j) 20 for_each_possible_cpu(j)
21 seq_printf(p, " %10u", kstat_softirqs_cpu(i, j)); 21 seq_printf(p, " %10u", kstat_softirqs_cpu(i, j));
22 seq_printf(p, "\n"); 22 seq_putc(p, '\n');
23 } 23 }
24 return 0; 24 return 0;
25} 25}
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index e15a19c93bae..1cffa2b8a2fc 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -126,7 +126,7 @@ static int show_stat(struct seq_file *p, void *v)
126 126
127 for (i = 0; i < NR_SOFTIRQS; i++) 127 for (i = 0; i < NR_SOFTIRQS; i++)
128 seq_printf(p, " %u", per_softirq_sums[i]); 128 seq_printf(p, " %u", per_softirq_sums[i]);
129 seq_printf(p, "\n"); 129 seq_putc(p, '\n');
130 130
131 return 0; 131 return 0;
132} 132}
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index c126c83b9a45..60b914860f81 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -66,8 +66,9 @@ unsigned long task_vsize(struct mm_struct *mm)
66 return PAGE_SIZE * mm->total_vm; 66 return PAGE_SIZE * mm->total_vm;
67} 67}
68 68
69int task_statm(struct mm_struct *mm, int *shared, int *text, 69unsigned long task_statm(struct mm_struct *mm,
70 int *data, int *resident) 70 unsigned long *shared, unsigned long *text,
71 unsigned long *data, unsigned long *resident)
71{ 72{
72 *shared = get_mm_counter(mm, MM_FILEPAGES); 73 *shared = get_mm_counter(mm, MM_FILEPAGES);
73 *text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) 74 *text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK))
@@ -417,7 +418,8 @@ static int show_smap(struct seq_file *m, void *v)
417 "Anonymous: %8lu kB\n" 418 "Anonymous: %8lu kB\n"
418 "Swap: %8lu kB\n" 419 "Swap: %8lu kB\n"
419 "KernelPageSize: %8lu kB\n" 420 "KernelPageSize: %8lu kB\n"
420 "MMUPageSize: %8lu kB\n", 421 "MMUPageSize: %8lu kB\n"
422 "Locked: %8lu kB\n",
421 (vma->vm_end - vma->vm_start) >> 10, 423 (vma->vm_end - vma->vm_start) >> 10,
422 mss.resident >> 10, 424 mss.resident >> 10,
423 (unsigned long)(mss.pss >> (10 + PSS_SHIFT)), 425 (unsigned long)(mss.pss >> (10 + PSS_SHIFT)),
@@ -429,7 +431,9 @@ static int show_smap(struct seq_file *m, void *v)
429 mss.anonymous >> 10, 431 mss.anonymous >> 10,
430 mss.swap >> 10, 432 mss.swap >> 10,
431 vma_kernel_pagesize(vma) >> 10, 433 vma_kernel_pagesize(vma) >> 10,
432 vma_mmu_pagesize(vma) >> 10); 434 vma_mmu_pagesize(vma) >> 10,
435 (vma->vm_flags & VM_LOCKED) ?
436 (unsigned long)(mss.pss >> (10 + PSS_SHIFT)) : 0);
433 437
434 if (m->count < m->size) /* vma is copied successfully */ 438 if (m->count < m->size) /* vma is copied successfully */
435 m->version = (vma != get_gate_vma(task)) ? vma->vm_start : 0; 439 m->version = (vma != get_gate_vma(task)) ? vma->vm_start : 0;
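The int-to-unsigned-long widening matters because statm reports page counts: with 4 KiB pages a 32-bit counter overflows past 2^31 pages, i.e. 8 TiB of address space. The fields were always printed in decimal, so existing readers like this sketch are unaffected:

#include <stdio.h>

int main(void)
{
	unsigned long size, resident, shared, text, lib, data, dt;
	FILE *f = fopen("/proc/self/statm", "r");

	if (!f || fscanf(f, "%lu %lu %lu %lu %lu %lu %lu",
			 &size, &resident, &shared, &text,
			 &lib, &data, &dt) != 7)
		return 1;
	printf("vsize %lu pages, rss %lu pages\n", size, resident);
	fclose(f);
	return 0;
}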
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index cb6306e63843..b535d3e5d5f1 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -92,13 +92,14 @@ unsigned long task_vsize(struct mm_struct *mm)
92 return vsize; 92 return vsize;
93} 93}
94 94
95int task_statm(struct mm_struct *mm, int *shared, int *text, 95unsigned long task_statm(struct mm_struct *mm,
96 int *data, int *resident) 96 unsigned long *shared, unsigned long *text,
97 unsigned long *data, unsigned long *resident)
97{ 98{
98 struct vm_area_struct *vma; 99 struct vm_area_struct *vma;
99 struct vm_region *region; 100 struct vm_region *region;
100 struct rb_node *p; 101 struct rb_node *p;
101 int size = kobjsize(mm); 102 unsigned long size = kobjsize(mm);
102 103
103 down_read(&mm->mmap_sem); 104 down_read(&mm->mmap_sem);
104 for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) { 105 for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) {
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 0fed41e6efcd..a2a622e079f0 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -133,16 +133,20 @@ __cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_data_lock);
133EXPORT_SYMBOL(dq_data_lock); 133EXPORT_SYMBOL(dq_data_lock);
134 134
135void __quota_error(struct super_block *sb, const char *func, 135void __quota_error(struct super_block *sb, const char *func,
136 const char *fmt, ...) 136 const char *fmt, ...)
137{ 137{
138 va_list args;
139
140 if (printk_ratelimit()) { 138 if (printk_ratelimit()) {
139 va_list args;
140 struct va_format vaf;
141
141 va_start(args, fmt); 142 va_start(args, fmt);
142 printk(KERN_ERR "Quota error (device %s): %s: ", 143
143 sb->s_id, func); 144 vaf.fmt = fmt;
144 vprintk(fmt, args); 145 vaf.va = &args;
145 printk("\n"); 146
147 printk(KERN_ERR "Quota error (device %s): %s: %pV\n",
148 sb->s_id, func, &vaf);
149
146 va_end(args); 150 va_end(args);
147 } 151 }
148} 152}
@@ -2185,8 +2189,8 @@ int dquot_resume(struct super_block *sb, int type)
2185} 2189}
2186EXPORT_SYMBOL(dquot_resume); 2190EXPORT_SYMBOL(dquot_resume);
2187 2191
2188int dquot_quota_on_path(struct super_block *sb, int type, int format_id, 2192int dquot_quota_on(struct super_block *sb, int type, int format_id,
2189 struct path *path) 2193 struct path *path)
2190{ 2194{
2191 int error = security_quota_on(path->dentry); 2195 int error = security_quota_on(path->dentry);
2192 if (error) 2196 if (error)
@@ -2200,20 +2204,6 @@ int dquot_quota_on_path(struct super_block *sb, int type, int format_id,
2200 DQUOT_LIMITS_ENABLED); 2204 DQUOT_LIMITS_ENABLED);
2201 return error; 2205 return error;
2202} 2206}
2203EXPORT_SYMBOL(dquot_quota_on_path);
2204
2205int dquot_quota_on(struct super_block *sb, int type, int format_id, char *name)
2206{
2207 struct path path;
2208 int error;
2209
2210 error = kern_path(name, LOOKUP_FOLLOW, &path);
2211 if (!error) {
2212 error = dquot_quota_on_path(sb, type, format_id, &path);
2213 path_put(&path);
2214 }
2215 return error;
2216}
2217EXPORT_SYMBOL(dquot_quota_on); 2207EXPORT_SYMBOL(dquot_quota_on);
2218 2208
2219/* 2209/*
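The %pV conversion folds the prefix and the caller's format into a single printk(), so a ratelimited message can no longer be split or interleaved with other output. The idiom, as a self-contained sketch (example_error is a hypothetical name):

void example_error(struct super_block *sb, const char *func,
		   const char *fmt, ...)
{
	struct va_format vaf;
	va_list args;

	va_start(args, fmt);
	vaf.fmt = fmt;
	vaf.va = &args;
	/* %pV expands the wrapped format/va_list in place */
	printk(KERN_ERR "Example error (device %s): %s: %pV\n",
	       sb->s_id, func, &vaf);
	va_end(args);
}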
diff --git a/fs/quota/quota.c b/fs/quota/quota.c
index b299961e1edb..b34bdb25490c 100644
--- a/fs/quota/quota.c
+++ b/fs/quota/quota.c
@@ -64,18 +64,15 @@ static int quota_sync_all(int type)
64} 64}
65 65
66static int quota_quotaon(struct super_block *sb, int type, int cmd, qid_t id, 66static int quota_quotaon(struct super_block *sb, int type, int cmd, qid_t id,
67 void __user *addr) 67 struct path *path)
68{ 68{
69 char *pathname; 69 if (!sb->s_qcop->quota_on && !sb->s_qcop->quota_on_meta)
70 int ret = -ENOSYS; 70 return -ENOSYS;
71 71 if (sb->s_qcop->quota_on_meta)
72 pathname = getname(addr); 72 return sb->s_qcop->quota_on_meta(sb, type, id);
73 if (IS_ERR(pathname)) 73 if (IS_ERR(path))
74 return PTR_ERR(pathname); 74 return PTR_ERR(path);
75 if (sb->s_qcop->quota_on) 75 return sb->s_qcop->quota_on(sb, type, id, path);
76 ret = sb->s_qcop->quota_on(sb, type, id, pathname);
77 putname(pathname);
78 return ret;
79} 76}
80 77
81static int quota_getfmt(struct super_block *sb, int type, void __user *addr) 78static int quota_getfmt(struct super_block *sb, int type, void __user *addr)
@@ -241,7 +238,7 @@ static int quota_getxquota(struct super_block *sb, int type, qid_t id,
241 238
242/* Copy parameters and call proper function */ 239/* Copy parameters and call proper function */
243static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id, 240static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id,
244 void __user *addr) 241 void __user *addr, struct path *path)
245{ 242{
246 int ret; 243 int ret;
247 244
@@ -256,7 +253,7 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id,
256 253
257 switch (cmd) { 254 switch (cmd) {
258 case Q_QUOTAON: 255 case Q_QUOTAON:
259 return quota_quotaon(sb, type, cmd, id, addr); 256 return quota_quotaon(sb, type, cmd, id, path);
260 case Q_QUOTAOFF: 257 case Q_QUOTAOFF:
261 if (!sb->s_qcop->quota_off) 258 if (!sb->s_qcop->quota_off)
262 return -ENOSYS; 259 return -ENOSYS;
@@ -335,6 +332,7 @@ SYSCALL_DEFINE4(quotactl, unsigned int, cmd, const char __user *, special,
335{ 332{
336 uint cmds, type; 333 uint cmds, type;
337 struct super_block *sb = NULL; 334 struct super_block *sb = NULL;
335 struct path path, *pathp = NULL;
338 int ret; 336 int ret;
339 337
340 cmds = cmd >> SUBCMDSHIFT; 338 cmds = cmd >> SUBCMDSHIFT;
@@ -351,12 +349,27 @@ SYSCALL_DEFINE4(quotactl, unsigned int, cmd, const char __user *, special,
351 return -ENODEV; 349 return -ENODEV;
352 } 350 }
353 351
352 /*
353 * Path for quotaon has to be resolved before grabbing superblock
354 * because that gets s_umount sem which is also possibly needed by path
355 * resolution (think about autofs) and thus deadlocks could arise.
356 */
357 if (cmds == Q_QUOTAON) {
358 ret = user_path_at(AT_FDCWD, addr, LOOKUP_FOLLOW, &path);
359 if (ret)
360 pathp = ERR_PTR(ret);
361 else
362 pathp = &path;
363 }
364
354 sb = quotactl_block(special); 365 sb = quotactl_block(special);
355 if (IS_ERR(sb)) 366 if (IS_ERR(sb))
356 return PTR_ERR(sb); 367 return PTR_ERR(sb);
357 368
358 ret = do_quotactl(sb, type, cmds, id, addr); 369 ret = do_quotactl(sb, type, cmds, id, addr, pathp);
359 370
360 drop_super(sb); 371 drop_super(sb);
372 if (pathp && !IS_ERR(pathp))
373 path_put(pathp);
361 return ret; 374 return ret;
362} 375}
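From userspace nothing changes: Q_QUOTAON still passes the quota file path in addr, the kernel merely resolves it before taking s_umount. A hedged usage sketch; the device and quota file paths are placeholders, and QFMT_VFS_V0 may need <linux/quota.h> on some libcs:

#include <stdio.h>
#include <sys/types.h>
#include <sys/quota.h>

#ifndef QFMT_VFS_V0
#define QFMT_VFS_V0 2	/* vfsv0 quota format, from <linux/quota.h> */
#endif

int main(void)
{
	/* hypothetical device and quota file */
	if (quotactl(QCMD(Q_QUOTAON, USRQUOTA), "/dev/sda1",
		     QFMT_VFS_V0, (caddr_t)"/mnt/aquota.user") < 0) {
		perror("quotactl");
		return 1;
	}
	return 0;
}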
diff --git a/fs/quota/quota_tree.c b/fs/quota/quota_tree.c
index 9e48874eabcc..e41c1becf096 100644
--- a/fs/quota/quota_tree.c
+++ b/fs/quota/quota_tree.c
@@ -468,8 +468,8 @@ static int remove_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot,
468 return -ENOMEM; 468 return -ENOMEM;
469 ret = read_blk(info, *blk, buf); 469 ret = read_blk(info, *blk, buf);
470 if (ret < 0) { 470 if (ret < 0) {
471 quota_error(dquot->dq_sb, "Can't read quota data " 471 quota_error(dquot->dq_sb, "Can't read quota data block %u",
472 "block %u", blk); 472 *blk);
473 goto out_buf; 473 goto out_buf;
474 } 474 }
475 newblk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]); 475 newblk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]);
@@ -493,8 +493,9 @@ static int remove_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot,
493 } else { 493 } else {
494 ret = write_blk(info, *blk, buf); 494 ret = write_blk(info, *blk, buf);
495 if (ret < 0) 495 if (ret < 0)
496 quota_error(dquot->dq_sb, "Can't write quota " 496 quota_error(dquot->dq_sb,
497 "tree block %u", blk); 497 "Can't write quota tree block %u",
498 *blk);
498 } 499 }
499 } 500 }
500out_buf: 501out_buf:
diff --git a/fs/read_write.c b/fs/read_write.c
index 5d431bacbea9..5520f8ad5504 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -30,18 +30,9 @@ const struct file_operations generic_ro_fops = {
30 30
31EXPORT_SYMBOL(generic_ro_fops); 31EXPORT_SYMBOL(generic_ro_fops);
32 32
33static int 33static inline int unsigned_offsets(struct file *file)
34__negative_fpos_check(struct file *file, loff_t pos, size_t count)
35{ 34{
36 /* 35 return file->f_mode & FMODE_UNSIGNED_OFFSET;
37 * pos or pos+count is negative here, check overflow.
38 * too big "count" will be caught in rw_verify_area().
39 */
40 if ((pos < 0) && (pos + count < pos))
41 return -EOVERFLOW;
42 if (file->f_mode & FMODE_UNSIGNED_OFFSET)
43 return 0;
44 return -EINVAL;
45} 36}
46 37
47/** 38/**
@@ -75,7 +66,7 @@ generic_file_llseek_unlocked(struct file *file, loff_t offset, int origin)
75 break; 66 break;
76 } 67 }
77 68
78 if (offset < 0 && __negative_fpos_check(file, offset, 0)) 69 if (offset < 0 && !unsigned_offsets(file))
79 return -EINVAL; 70 return -EINVAL;
80 if (offset > inode->i_sb->s_maxbytes) 71 if (offset > inode->i_sb->s_maxbytes)
81 return -EINVAL; 72 return -EINVAL;
@@ -152,7 +143,7 @@ loff_t default_llseek(struct file *file, loff_t offset, int origin)
152 offset += file->f_pos; 143 offset += file->f_pos;
153 } 144 }
154 retval = -EINVAL; 145 retval = -EINVAL;
155 if (offset >= 0 || !__negative_fpos_check(file, offset, 0)) { 146 if (offset >= 0 || unsigned_offsets(file)) {
156 if (offset != file->f_pos) { 147 if (offset != file->f_pos) {
157 file->f_pos = offset; 148 file->f_pos = offset;
158 file->f_version = 0; 149 file->f_version = 0;
@@ -252,9 +243,13 @@ int rw_verify_area(int read_write, struct file *file, loff_t *ppos, size_t count
252 if (unlikely((ssize_t) count < 0)) 243 if (unlikely((ssize_t) count < 0))
253 return retval; 244 return retval;
254 pos = *ppos; 245 pos = *ppos;
255 if (unlikely((pos < 0) || (loff_t) (pos + count) < 0)) { 246 if (unlikely(pos < 0)) {
256 retval = __negative_fpos_check(file, pos, count); 247 if (!unsigned_offsets(file))
257 if (retval) 248 return retval;
249 if (count >= -pos) /* both values are in 0..LLONG_MAX */
250 return -EOVERFLOW;
251 } else if (unlikely((loff_t) (pos + count) < 0)) {
252 if (!unsigned_offsets(file))
258 return retval; 253 return retval;
259 } 254 }
260 255
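The rewritten check splits the old combined test: a negative pos is only legal for FMODE_UNSIGNED_OFFSET files, and even then pos + count must not wrap back into the non-negative range. Since -pos and count both lie in 0..LLONG_MAX, "count >= -pos" is exactly the wrapping case. A userspace analogue of the logic (it assumes pos > LLONG_MIN, as the kernel's arithmetic effectively does):

#include <limits.h>

static int example_verify(long long pos, unsigned long long count,
			  int unsigned_offsets)
{
	if (pos < 0) {
		if (!unsigned_offsets)
			return -1;			/* -EINVAL */
		if (count >= (unsigned long long)-pos)
			return -2;			/* -EOVERFLOW: wraps past 0 */
	} else if (count > (unsigned long long)(LLONG_MAX - pos)) {
		if (!unsigned_offsets)
			return -1;			/* -EINVAL: pos + count < 0 */
	}
	return 0;
}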
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index d31bce1a9f90..3eea859e6990 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -2551,8 +2551,6 @@ static int release_journal_dev(struct super_block *super,
2551 result = 0; 2551 result = 0;
2552 2552
2553 if (journal->j_dev_bd != NULL) { 2553 if (journal->j_dev_bd != NULL) {
2554 if (journal->j_dev_bd->bd_dev != super->s_dev)
2555 bd_release(journal->j_dev_bd);
2556 result = blkdev_put(journal->j_dev_bd, journal->j_dev_mode); 2554 result = blkdev_put(journal->j_dev_bd, journal->j_dev_mode);
2557 journal->j_dev_bd = NULL; 2555 journal->j_dev_bd = NULL;
2558 } 2556 }
@@ -2570,7 +2568,7 @@ static int journal_init_dev(struct super_block *super,
2570{ 2568{
2571 int result; 2569 int result;
2572 dev_t jdev; 2570 dev_t jdev;
2573 fmode_t blkdev_mode = FMODE_READ | FMODE_WRITE; 2571 fmode_t blkdev_mode = FMODE_READ | FMODE_WRITE | FMODE_EXCL;
2574 char b[BDEVNAME_SIZE]; 2572 char b[BDEVNAME_SIZE];
2575 2573
2576 result = 0; 2574 result = 0;
@@ -2584,7 +2582,10 @@ static int journal_init_dev(struct super_block *super,
2584 2582
2585 /* there is no "jdev" option and journal is on separate device */ 2583 /* there is no "jdev" option and journal is on separate device */
2586 if ((!jdev_name || !jdev_name[0])) { 2584 if ((!jdev_name || !jdev_name[0])) {
2587 journal->j_dev_bd = open_by_devnum(jdev, blkdev_mode); 2585 if (jdev == super->s_dev)
2586 blkdev_mode &= ~FMODE_EXCL;
2587 journal->j_dev_bd = blkdev_get_by_dev(jdev, blkdev_mode,
2588 journal);
2588 journal->j_dev_mode = blkdev_mode; 2589 journal->j_dev_mode = blkdev_mode;
2589 if (IS_ERR(journal->j_dev_bd)) { 2590 if (IS_ERR(journal->j_dev_bd)) {
2590 result = PTR_ERR(journal->j_dev_bd); 2591 result = PTR_ERR(journal->j_dev_bd);
@@ -2593,22 +2594,14 @@ static int journal_init_dev(struct super_block *super,
2593 "cannot init journal device '%s': %i", 2594 "cannot init journal device '%s': %i",
2594 __bdevname(jdev, b), result); 2595 __bdevname(jdev, b), result);
2595 return result; 2596 return result;
2596 } else if (jdev != super->s_dev) { 2597 } else if (jdev != super->s_dev)
2597 result = bd_claim(journal->j_dev_bd, journal);
2598 if (result) {
2599 blkdev_put(journal->j_dev_bd, blkdev_mode);
2600 return result;
2601 }
2602
2603 set_blocksize(journal->j_dev_bd, super->s_blocksize); 2598 set_blocksize(journal->j_dev_bd, super->s_blocksize);
2604 }
2605 2599
2606 return 0; 2600 return 0;
2607 } 2601 }
2608 2602
2609 journal->j_dev_mode = blkdev_mode; 2603 journal->j_dev_mode = blkdev_mode;
2610 journal->j_dev_bd = open_bdev_exclusive(jdev_name, 2604 journal->j_dev_bd = blkdev_get_by_path(jdev_name, blkdev_mode, journal);
2611 blkdev_mode, journal);
2612 if (IS_ERR(journal->j_dev_bd)) { 2605 if (IS_ERR(journal->j_dev_bd)) {
2613 result = PTR_ERR(journal->j_dev_bd); 2606 result = PTR_ERR(journal->j_dev_bd);
2614 journal->j_dev_bd = NULL; 2607 journal->j_dev_bd = NULL;
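This is part of the tree-wide move from open_by_devnum()/open_bdev_exclusive()/bd_claim() to blkdev_get_by_dev()/blkdev_get_by_path() with FMODE_EXCL and a holder; note the FMODE_EXCL strip when the journal shares the filesystem device, since a second exclusive claim with a different holder would fail. The shape of the new API, sketched (example_probe is hypothetical):

static int example_probe(const char *path, void *holder)
{
	fmode_t mode = FMODE_READ | FMODE_WRITE | FMODE_EXCL;
	struct block_device *bdev;

	bdev = blkdev_get_by_path(path, mode, holder);
	if (IS_ERR(bdev))
		return PTR_ERR(bdev);
	/* ... the device is exclusively claimed by 'holder' here ... */
	blkdev_put(bdev, mode);	/* mode must still include FMODE_EXCL */
	return 0;
}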
diff --git a/fs/reiserfs/prints.c b/fs/reiserfs/prints.c
index adbc6f538515..45de98b59466 100644
--- a/fs/reiserfs/prints.c
+++ b/fs/reiserfs/prints.c
@@ -586,13 +586,13 @@ void print_block(struct buffer_head *bh, ...) //int print_mode, int first, int l
586 va_list args; 586 va_list args;
587 int mode, first, last; 587 int mode, first, last;
588 588
589 va_start(args, bh);
590
591 if (!bh) { 589 if (!bh) {
592 printk("print_block: buffer is NULL\n"); 590 printk("print_block: buffer is NULL\n");
593 return; 591 return;
594 } 592 }
595 593
594 va_start(args, bh);
595
596 mode = va_arg(args, int); 596 mode = va_arg(args, int);
597 first = va_arg(args, int); 597 first = va_arg(args, int);
598 last = va_arg(args, int); 598 last = va_arg(args, int);
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 2575682a9ead..0aab04f46827 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -632,7 +632,7 @@ static int reiserfs_acquire_dquot(struct dquot *);
632static int reiserfs_release_dquot(struct dquot *); 632static int reiserfs_release_dquot(struct dquot *);
633static int reiserfs_mark_dquot_dirty(struct dquot *); 633static int reiserfs_mark_dquot_dirty(struct dquot *);
634static int reiserfs_write_info(struct super_block *, int); 634static int reiserfs_write_info(struct super_block *, int);
635static int reiserfs_quota_on(struct super_block *, int, int, char *); 635static int reiserfs_quota_on(struct super_block *, int, int, struct path *);
636 636
637static const struct dquot_operations reiserfs_quota_operations = { 637static const struct dquot_operations reiserfs_quota_operations = {
638 .write_dquot = reiserfs_write_dquot, 638 .write_dquot = reiserfs_write_dquot,
@@ -2048,25 +2048,21 @@ static int reiserfs_quota_on_mount(struct super_block *sb, int type)
2048 * Standard function to be called on quota_on 2048 * Standard function to be called on quota_on
2049 */ 2049 */
2050static int reiserfs_quota_on(struct super_block *sb, int type, int format_id, 2050static int reiserfs_quota_on(struct super_block *sb, int type, int format_id,
2051 char *name) 2051 struct path *path)
2052{ 2052{
2053 int err; 2053 int err;
2054 struct path path;
2055 struct inode *inode; 2054 struct inode *inode;
2056 struct reiserfs_transaction_handle th; 2055 struct reiserfs_transaction_handle th;
2057 2056
2058 if (!(REISERFS_SB(sb)->s_mount_opt & (1 << REISERFS_QUOTA))) 2057 if (!(REISERFS_SB(sb)->s_mount_opt & (1 << REISERFS_QUOTA)))
2059 return -EINVAL; 2058 return -EINVAL;
2060 2059
2061 err = kern_path(name, LOOKUP_FOLLOW, &path);
2062 if (err)
2063 return err;
2064 /* Quotafile not on the same filesystem? */ 2060 /* Quotafile not on the same filesystem? */
2065 if (path.mnt->mnt_sb != sb) { 2061 if (path->mnt->mnt_sb != sb) {
2066 err = -EXDEV; 2062 err = -EXDEV;
2067 goto out; 2063 goto out;
2068 } 2064 }
2069 inode = path.dentry->d_inode; 2065 inode = path->dentry->d_inode;
2070 /* We must not pack tails for quota files on reiserfs for quota IO to work */ 2066 /* We must not pack tails for quota files on reiserfs for quota IO to work */
2071 if (!(REISERFS_I(inode)->i_flags & i_nopack_mask)) { 2067 if (!(REISERFS_I(inode)->i_flags & i_nopack_mask)) {
2072 err = reiserfs_unpack(inode, NULL); 2068 err = reiserfs_unpack(inode, NULL);
@@ -2082,7 +2078,7 @@ static int reiserfs_quota_on(struct super_block *sb, int type, int format_id,
2082 /* Journaling quota? */ 2078 /* Journaling quota? */
2083 if (REISERFS_SB(sb)->s_qf_names[type]) { 2079 if (REISERFS_SB(sb)->s_qf_names[type]) {
2084 /* Quotafile not of fs root? */ 2080 /* Quotafile not of fs root? */
2085 if (path.dentry->d_parent != sb->s_root) 2081 if (path->dentry->d_parent != sb->s_root)
2086 reiserfs_warning(sb, "super-6521", 2082 reiserfs_warning(sb, "super-6521",
2087 "Quota file not on filesystem root. " 2083 "Quota file not on filesystem root. "
2088 "Journalled quota will not work."); 2084 "Journalled quota will not work.");
@@ -2101,9 +2097,8 @@ static int reiserfs_quota_on(struct super_block *sb, int type, int format_id,
2101 if (err) 2097 if (err)
2102 goto out; 2098 goto out;
2103 } 2099 }
2104 err = dquot_quota_on_path(sb, type, format_id, &path); 2100 err = dquot_quota_on(sb, type, format_id, path);
2105out: 2101out:
2106 path_put(&path);
2107 return err; 2102 return err;
2108} 2103}
2109 2104
diff --git a/fs/select.c b/fs/select.c
index b7b10aa30861..e56560d2b08a 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -306,6 +306,8 @@ static int poll_select_copy_remaining(struct timespec *end_time, void __user *p,
306 rts.tv_sec = rts.tv_nsec = 0; 306 rts.tv_sec = rts.tv_nsec = 0;
307 307
308 if (timeval) { 308 if (timeval) {
309 if (sizeof(rtv) > sizeof(rtv.tv_sec) + sizeof(rtv.tv_usec))
310 memset(&rtv, 0, sizeof(rtv));
309 rtv.tv_sec = rts.tv_sec; 311 rtv.tv_sec = rts.tv_sec;
310 rtv.tv_usec = rts.tv_nsec / NSEC_PER_USEC; 312 rtv.tv_usec = rts.tv_nsec / NSEC_PER_USEC;
311 313
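The memset closes a kernel stack information leak: on ABIs where struct timeval carries padding (for instance a 32-bit tv_usec in an 8-byte-aligned struct), the pad bytes were copied to userspace uninitialised. The sizeof test is a compile-time constant, so unaffected architectures pay nothing. A userspace probe for the padding condition:

#include <stdio.h>
#include <sys/time.h>

int main(void)
{
	struct timeval tv;
	size_t fields = sizeof(tv.tv_sec) + sizeof(tv.tv_usec);

	printf("fields %zu bytes, struct %zu bytes: %s padding\n",
	       fields, sizeof(tv),
	       sizeof(tv) > fields ? "has" : "no");
	return 0;
}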
diff --git a/fs/splice.c b/fs/splice.c
index ce2f02579e35..50a5d978da16 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -682,19 +682,14 @@ static int pipe_to_sendpage(struct pipe_inode_info *pipe,
682{ 682{
683 struct file *file = sd->u.file; 683 struct file *file = sd->u.file;
684 loff_t pos = sd->pos; 684 loff_t pos = sd->pos;
685 int ret, more; 685 int more;
686
687 ret = buf->ops->confirm(pipe, buf);
688 if (!ret) {
689 more = (sd->flags & SPLICE_F_MORE) || sd->len < sd->total_len;
690 if (file->f_op && file->f_op->sendpage)
691 ret = file->f_op->sendpage(file, buf->page, buf->offset,
692 sd->len, &pos, more);
693 else
694 ret = -EINVAL;
695 }
696 686
697 return ret; 687 if (!likely(file->f_op && file->f_op->sendpage))
688 return -EINVAL;
689
690 more = (sd->flags & SPLICE_F_MORE) || sd->len < sd->total_len;
691 return file->f_op->sendpage(file, buf->page, buf->offset,
692 sd->len, &pos, more);
698} 693}
699 694
700/* 695/*
@@ -727,13 +722,6 @@ int pipe_to_file(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
727 void *fsdata; 722 void *fsdata;
728 int ret; 723 int ret;
729 724
730 /*
731 * make sure the data in this buffer is uptodate
732 */
733 ret = buf->ops->confirm(pipe, buf);
734 if (unlikely(ret))
735 return ret;
736
737 offset = sd->pos & ~PAGE_CACHE_MASK; 725 offset = sd->pos & ~PAGE_CACHE_MASK;
738 726
739 this_len = sd->len; 727 this_len = sd->len;
@@ -805,12 +793,17 @@ int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_desc *sd,
805 if (sd->len > sd->total_len) 793 if (sd->len > sd->total_len)
806 sd->len = sd->total_len; 794 sd->len = sd->total_len;
807 795
808 ret = actor(pipe, buf, sd); 796 ret = buf->ops->confirm(pipe, buf);
809 if (ret <= 0) { 797 if (unlikely(ret)) {
810 if (ret == -ENODATA) 798 if (ret == -ENODATA)
811 ret = 0; 799 ret = 0;
812 return ret; 800 return ret;
813 } 801 }
802
803 ret = actor(pipe, buf, sd);
804 if (ret <= 0)
805 return ret;
806
814 buf->offset += ret; 807 buf->offset += ret;
815 buf->len -= ret; 808 buf->len -= ret;
816 809
@@ -1044,10 +1037,6 @@ static int write_pipe_buf(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
1044 int ret; 1037 int ret;
1045 void *data; 1038 void *data;
1046 1039
1047 ret = buf->ops->confirm(pipe, buf);
1048 if (ret)
1049 return ret;
1050
1051 data = buf->ops->map(pipe, buf, 0); 1040 data = buf->ops->map(pipe, buf, 0);
1052 ret = kernel_write(sd->u.file, data + buf->offset, sd->len, sd->pos); 1041 ret = kernel_write(sd->u.file, data + buf->offset, sd->len, sd->pos);
1053 buf->ops->unmap(pipe, buf, data); 1042 buf->ops->unmap(pipe, buf, data);
@@ -1495,10 +1484,6 @@ static int pipe_to_user(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
1495 char *src; 1484 char *src;
1496 int ret; 1485 int ret;
1497 1486
1498 ret = buf->ops->confirm(pipe, buf);
1499 if (unlikely(ret))
1500 return ret;
1501
1502 /* 1487 /*
1503 * See if we can use the atomic maps, by prefaulting in the 1488 * See if we can use the atomic maps, by prefaulting in the
1504 * pages and doing an atomic copy 1489 * pages and doing an atomic copy
diff --git a/fs/squashfs/Kconfig b/fs/squashfs/Kconfig
index e5f63da64d04..aa68a8a31518 100644
--- a/fs/squashfs/Kconfig
+++ b/fs/squashfs/Kconfig
@@ -29,7 +29,6 @@ config SQUASHFS
29config SQUASHFS_XATTR 29config SQUASHFS_XATTR
30 bool "Squashfs XATTR support" 30 bool "Squashfs XATTR support"
31 depends on SQUASHFS 31 depends on SQUASHFS
32 default n
33 help 32 help
34 Saying Y here includes support for extended attributes (xattrs). 33 Saying Y here includes support for extended attributes (xattrs).
35 Xattrs are name:value pairs associated with inodes by 34 Xattrs are name:value pairs associated with inodes by
@@ -40,7 +39,6 @@ config SQUASHFS_XATTR
40config SQUASHFS_LZO 39config SQUASHFS_LZO
41 bool "Include support for LZO compressed file systems" 40 bool "Include support for LZO compressed file systems"
42 depends on SQUASHFS 41 depends on SQUASHFS
43 default n
44 select LZO_DECOMPRESS 42 select LZO_DECOMPRESS
45 help 43 help
46 Saying Y here includes support for reading Squashfs file systems 44 Saying Y here includes support for reading Squashfs file systems
@@ -53,10 +51,24 @@ config SQUASHFS_LZO
53 51
54 If unsure, say N. 52 If unsure, say N.
55 53
54config SQUASHFS_XZ
55 bool "Include support for XZ compressed file systems"
56 depends on SQUASHFS
57 select XZ_DEC
58 help
59 Saying Y here includes support for reading Squashfs file systems
60 compressed with XZ compression. XZ gives better compression than
61 the default zlib compression, at the expense of greater CPU and
62 memory overhead.
63
64 XZ is not the standard compression used in Squashfs and so most
65 file systems will be readable without selecting this option.
66
67 If unsure, say N.
68
56config SQUASHFS_EMBEDDED 69config SQUASHFS_EMBEDDED
57 bool "Additional option for memory-constrained systems" 70 bool "Additional option for memory-constrained systems"
58 depends on SQUASHFS 71 depends on SQUASHFS
59 default n
60 help 72 help
61 Saying Y here allows you to specify cache size. 73 Saying Y here allows you to specify cache size.
62 74
diff --git a/fs/squashfs/Makefile b/fs/squashfs/Makefile
index 7672bac8d328..cecf2bea07af 100644
--- a/fs/squashfs/Makefile
+++ b/fs/squashfs/Makefile
@@ -7,3 +7,4 @@ squashfs-y += block.o cache.o dir.o export.o file.o fragment.o id.o inode.o
7squashfs-y += namei.o super.o symlink.o zlib_wrapper.o decompressor.o 7squashfs-y += namei.o super.o symlink.o zlib_wrapper.o decompressor.o
8squashfs-$(CONFIG_SQUASHFS_XATTR) += xattr.o xattr_id.o 8squashfs-$(CONFIG_SQUASHFS_XATTR) += xattr.o xattr_id.o
9squashfs-$(CONFIG_SQUASHFS_LZO) += lzo_wrapper.o 9squashfs-$(CONFIG_SQUASHFS_LZO) += lzo_wrapper.o
10squashfs-$(CONFIG_SQUASHFS_XZ) += xz_wrapper.o
diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c
index 653c030eb840..2fb2882f0fa7 100644
--- a/fs/squashfs/block.c
+++ b/fs/squashfs/block.c
@@ -34,7 +34,6 @@
34 34
35#include "squashfs_fs.h" 35#include "squashfs_fs.h"
36#include "squashfs_fs_sb.h" 36#include "squashfs_fs_sb.h"
37#include "squashfs_fs_i.h"
38#include "squashfs.h" 37#include "squashfs.h"
39#include "decompressor.h" 38#include "decompressor.h"
40 39
diff --git a/fs/squashfs/cache.c b/fs/squashfs/cache.c
index 57314bee9059..26b15ae34d6f 100644
--- a/fs/squashfs/cache.c
+++ b/fs/squashfs/cache.c
@@ -55,7 +55,6 @@
55 55
56#include "squashfs_fs.h" 56#include "squashfs_fs.h"
57#include "squashfs_fs_sb.h" 57#include "squashfs_fs_sb.h"
58#include "squashfs_fs_i.h"
59#include "squashfs.h" 58#include "squashfs.h"
60 59
61/* 60/*
diff --git a/fs/squashfs/decompressor.c b/fs/squashfs/decompressor.c
index 24af9ce9722f..a5940e54c4dd 100644
--- a/fs/squashfs/decompressor.c
+++ b/fs/squashfs/decompressor.c
@@ -27,7 +27,6 @@
27 27
28#include "squashfs_fs.h" 28#include "squashfs_fs.h"
29#include "squashfs_fs_sb.h" 29#include "squashfs_fs_sb.h"
30#include "squashfs_fs_i.h"
31#include "decompressor.h" 30#include "decompressor.h"
32#include "squashfs.h" 31#include "squashfs.h"
33 32
@@ -41,23 +40,26 @@ static const struct squashfs_decompressor squashfs_lzma_unsupported_comp_ops = {
41}; 40};
42 41
43#ifndef CONFIG_SQUASHFS_LZO 42#ifndef CONFIG_SQUASHFS_LZO
44static const struct squashfs_decompressor squashfs_lzo_unsupported_comp_ops = { 43static const struct squashfs_decompressor squashfs_lzo_comp_ops = {
45 NULL, NULL, NULL, LZO_COMPRESSION, "lzo", 0 44 NULL, NULL, NULL, LZO_COMPRESSION, "lzo", 0
46}; 45};
47#endif 46#endif
48 47
48#ifndef CONFIG_SQUASHFS_XZ
49static const struct squashfs_decompressor squashfs_xz_comp_ops = {
50 NULL, NULL, NULL, XZ_COMPRESSION, "xz", 0
51};
52#endif
53
49static const struct squashfs_decompressor squashfs_unknown_comp_ops = { 54static const struct squashfs_decompressor squashfs_unknown_comp_ops = {
50 NULL, NULL, NULL, 0, "unknown", 0 55 NULL, NULL, NULL, 0, "unknown", 0
51}; 56};
52 57
53static const struct squashfs_decompressor *decompressor[] = { 58static const struct squashfs_decompressor *decompressor[] = {
54 &squashfs_zlib_comp_ops, 59 &squashfs_zlib_comp_ops,
55 &squashfs_lzma_unsupported_comp_ops,
56#ifdef CONFIG_SQUASHFS_LZO
57 &squashfs_lzo_comp_ops, 60 &squashfs_lzo_comp_ops,
58#else 61 &squashfs_xz_comp_ops,
59 &squashfs_lzo_unsupported_comp_ops, 62 &squashfs_lzma_unsupported_comp_ops,
60#endif
61 &squashfs_unknown_comp_ops 63 &squashfs_unknown_comp_ops
62}; 64};
63 65
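The table rework means every compression id, supported or not, resolves to a named entry, with the zero-id "unknown" entry acting as the sentinel. Modelled on squashfs_lookup_decompressor(), the scan looks like:

static const struct squashfs_decompressor *example_lookup(int id)
{
	int i;

	for (i = 0; decompressor[i]->id; i++)
		if (id == decompressor[i]->id)
			break;

	/* falls through to squashfs_unknown_comp_ops (id 0) */
	return decompressor[i];
}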
diff --git a/fs/squashfs/decompressor.h b/fs/squashfs/decompressor.h
index 7425f80783f6..3b305a70f7aa 100644
--- a/fs/squashfs/decompressor.h
+++ b/fs/squashfs/decompressor.h
@@ -52,4 +52,13 @@ static inline int squashfs_decompress(struct squashfs_sb_info *msblk,
52 return msblk->decompressor->decompress(msblk, buffer, bh, b, offset, 52 return msblk->decompressor->decompress(msblk, buffer, bh, b, offset,
53 length, srclength, pages); 53 length, srclength, pages);
54} 54}
55
56#ifdef CONFIG_SQUASHFS_XZ
57extern const struct squashfs_decompressor squashfs_xz_comp_ops;
58#endif
59
60#ifdef CONFIG_SQUASHFS_LZO
61extern const struct squashfs_decompressor squashfs_lzo_comp_ops;
62#endif
63
55#endif 64#endif
diff --git a/fs/squashfs/fragment.c b/fs/squashfs/fragment.c
index 7c90bbd6879d..7eef571443c6 100644
--- a/fs/squashfs/fragment.c
+++ b/fs/squashfs/fragment.c
@@ -39,7 +39,6 @@
39 39
40#include "squashfs_fs.h" 40#include "squashfs_fs.h"
41#include "squashfs_fs_sb.h" 41#include "squashfs_fs_sb.h"
42#include "squashfs_fs_i.h"
43#include "squashfs.h" 42#include "squashfs.h"
44 43
45/* 44/*
diff --git a/fs/squashfs/id.c b/fs/squashfs/id.c
index b7f64bcd2b70..d8f32452638e 100644
--- a/fs/squashfs/id.c
+++ b/fs/squashfs/id.c
@@ -37,7 +37,6 @@
37 37
38#include "squashfs_fs.h" 38#include "squashfs_fs.h"
39#include "squashfs_fs_sb.h" 39#include "squashfs_fs_sb.h"
40#include "squashfs_fs_i.h"
41#include "squashfs.h" 40#include "squashfs.h"
42 41
43/* 42/*
diff --git a/fs/squashfs/lzo_wrapper.c b/fs/squashfs/lzo_wrapper.c
index 5d87789bf1c1..7da759e34c52 100644
--- a/fs/squashfs/lzo_wrapper.c
+++ b/fs/squashfs/lzo_wrapper.c
@@ -29,7 +29,6 @@
29 29
30#include "squashfs_fs.h" 30#include "squashfs_fs.h"
31#include "squashfs_fs_sb.h" 31#include "squashfs_fs_sb.h"
32#include "squashfs_fs_i.h"
33#include "squashfs.h" 32#include "squashfs.h"
34#include "decompressor.h" 33#include "decompressor.h"
35 34
diff --git a/fs/squashfs/squashfs.h b/fs/squashfs/squashfs.h
index 5d45569d5f72..ba729d808876 100644
--- a/fs/squashfs/squashfs.h
+++ b/fs/squashfs/squashfs.h
@@ -27,11 +27,6 @@
27 27
28#define WARNING(s, args...) pr_warning("SQUASHFS: "s, ## args) 28#define WARNING(s, args...) pr_warning("SQUASHFS: "s, ## args)
29 29
30static inline struct squashfs_inode_info *squashfs_i(struct inode *inode)
31{
32 return list_entry(inode, struct squashfs_inode_info, vfs_inode);
33}
34
35/* block.c */ 30/* block.c */
36extern int squashfs_read_data(struct super_block *, void **, u64, int, u64 *, 31extern int squashfs_read_data(struct super_block *, void **, u64, int, u64 *,
37 int, int); 32 int, int);
@@ -104,6 +99,3 @@ extern const struct xattr_handler *squashfs_xattr_handlers[];
104 99
105/* zlib_wrapper.c */ 100/* zlib_wrapper.c */
106extern const struct squashfs_decompressor squashfs_zlib_comp_ops; 101extern const struct squashfs_decompressor squashfs_zlib_comp_ops;
107
108/* lzo_wrapper.c */
109extern const struct squashfs_decompressor squashfs_lzo_comp_ops;
diff --git a/fs/squashfs/squashfs_fs.h b/fs/squashfs/squashfs_fs.h
index c5137fc9ab11..39533feffd6d 100644
--- a/fs/squashfs/squashfs_fs.h
+++ b/fs/squashfs/squashfs_fs.h
@@ -238,6 +238,7 @@ struct meta_index {
238#define ZLIB_COMPRESSION 1 238#define ZLIB_COMPRESSION 1
239#define LZMA_COMPRESSION 2 239#define LZMA_COMPRESSION 2
240#define LZO_COMPRESSION 3 240#define LZO_COMPRESSION 3
241#define XZ_COMPRESSION 4
241 242
242struct squashfs_super_block { 243struct squashfs_super_block {
243 __le32 s_magic; 244 __le32 s_magic;
diff --git a/fs/squashfs/squashfs_fs_i.h b/fs/squashfs/squashfs_fs_i.h
index d3e3a37f28a1..359baefc01fc 100644
--- a/fs/squashfs/squashfs_fs_i.h
+++ b/fs/squashfs/squashfs_fs_i.h
@@ -45,4 +45,10 @@ struct squashfs_inode_info {
45 }; 45 };
46 struct inode vfs_inode; 46 struct inode vfs_inode;
47}; 47};
48
49
50static inline struct squashfs_inode_info *squashfs_i(struct inode *inode)
51{
52 return list_entry(inode, struct squashfs_inode_info, vfs_inode);
53}
48#endif 54#endif
diff --git a/fs/squashfs/xattr_id.c b/fs/squashfs/xattr_id.c
index d33be5dd6c32..05385dbe1465 100644
--- a/fs/squashfs/xattr_id.c
+++ b/fs/squashfs/xattr_id.c
@@ -32,7 +32,6 @@
32 32
33#include "squashfs_fs.h" 33#include "squashfs_fs.h"
34#include "squashfs_fs_sb.h" 34#include "squashfs_fs_sb.h"
35#include "squashfs_fs_i.h"
36#include "squashfs.h" 35#include "squashfs.h"
37#include "xattr.h" 36#include "xattr.h"
38 37
diff --git a/fs/squashfs/xz_wrapper.c b/fs/squashfs/xz_wrapper.c
new file mode 100644
index 000000000000..856756ca5ee4
--- /dev/null
+++ b/fs/squashfs/xz_wrapper.c
@@ -0,0 +1,153 @@
1/*
2 * Squashfs - a compressed read only filesystem for Linux
3 *
4 * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010
5 * Phillip Lougher <phillip@lougher.demon.co.uk>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version 2,
10 * or (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
20 *
21 * xz_wrapper.c
22 */
23
24
25#include <linux/mutex.h>
26#include <linux/buffer_head.h>
27#include <linux/slab.h>
28#include <linux/xz.h>
29
30#include "squashfs_fs.h"
31#include "squashfs_fs_sb.h"
32#include "squashfs_fs_i.h"
33#include "squashfs.h"
34#include "decompressor.h"
35
36struct squashfs_xz {
37 struct xz_dec *state;
38 struct xz_buf buf;
39};
40
41static void *squashfs_xz_init(struct squashfs_sb_info *msblk)
42{
43 int block_size = max_t(int, msblk->block_size, SQUASHFS_METADATA_SIZE);
44
45 struct squashfs_xz *stream = kmalloc(sizeof(*stream), GFP_KERNEL);
46 if (stream == NULL)
47 goto failed;
48
49 stream->state = xz_dec_init(XZ_PREALLOC, block_size);
50 if (stream->state == NULL)
51 goto failed;
52
53 return stream;
54
55failed:
56 ERROR("Failed to allocate xz workspace\n");
57 kfree(stream);
58 return NULL;
59}
60
61
62static void squashfs_xz_free(void *strm)
63{
64 struct squashfs_xz *stream = strm;
65
66 if (stream) {
67 xz_dec_end(stream->state);
68 kfree(stream);
69 }
70}
71
72
73static int squashfs_xz_uncompress(struct squashfs_sb_info *msblk, void **buffer,
74 struct buffer_head **bh, int b, int offset, int length, int srclength,
75 int pages)
76{
77 enum xz_ret xz_err;
78 int avail, total = 0, k = 0, page = 0;
79 struct squashfs_xz *stream = msblk->stream;
80
81 mutex_lock(&msblk->read_data_mutex);
82
83 xz_dec_reset(stream->state);
84 stream->buf.in_pos = 0;
85 stream->buf.in_size = 0;
86 stream->buf.out_pos = 0;
87 stream->buf.out_size = PAGE_CACHE_SIZE;
88 stream->buf.out = buffer[page++];
89
90 do {
91 if (stream->buf.in_pos == stream->buf.in_size && k < b) {
92 avail = min(length, msblk->devblksize - offset);
93 length -= avail;
94 wait_on_buffer(bh[k]);
95 if (!buffer_uptodate(bh[k]))
96 goto release_mutex;
97
98 if (avail == 0) {
99 offset = 0;
100 put_bh(bh[k++]);
101 continue;
102 }
103
104 stream->buf.in = bh[k]->b_data + offset;
105 stream->buf.in_size = avail;
106 stream->buf.in_pos = 0;
107 offset = 0;
108 }
109
110 if (stream->buf.out_pos == stream->buf.out_size
111 && page < pages) {
112 stream->buf.out = buffer[page++];
113 stream->buf.out_pos = 0;
114 total += PAGE_CACHE_SIZE;
115 }
116
117 xz_err = xz_dec_run(stream->state, &stream->buf);
118
119 if (stream->buf.in_pos == stream->buf.in_size && k < b)
120 put_bh(bh[k++]);
121 } while (xz_err == XZ_OK);
122
123 if (xz_err != XZ_STREAM_END) {
124 ERROR("xz_dec_run error, data probably corrupt\n");
125 goto release_mutex;
126 }
127
128 if (k < b) {
129 ERROR("xz_uncompress error, input remaining\n");
130 goto release_mutex;
131 }
132
133 total += stream->buf.out_pos;
134 mutex_unlock(&msblk->read_data_mutex);
135 return total;
136
137release_mutex:
138 mutex_unlock(&msblk->read_data_mutex);
139
140 for (; k < b; k++)
141 put_bh(bh[k]);
142
143 return -EIO;
144}
145
146const struct squashfs_decompressor squashfs_xz_comp_ops = {
147 .init = squashfs_xz_init,
148 .free = squashfs_xz_free,
149 .decompress = squashfs_xz_uncompress,
150 .id = XZ_COMPRESSION,
151 .name = "xz",
152 .supported = 1
153};
diff --git a/fs/squashfs/zlib_wrapper.c b/fs/squashfs/zlib_wrapper.c
index 7a603874e483..818a5e063faf 100644
--- a/fs/squashfs/zlib_wrapper.c
+++ b/fs/squashfs/zlib_wrapper.c
@@ -29,7 +29,6 @@
29 29
30#include "squashfs_fs.h" 30#include "squashfs_fs.h"
31#include "squashfs_fs_sb.h" 31#include "squashfs_fs_sb.h"
32#include "squashfs_fs_i.h"
33#include "squashfs.h" 32#include "squashfs.h"
34#include "decompressor.h" 33#include "decompressor.h"
35 34
@@ -66,8 +65,8 @@ static int zlib_uncompress(struct squashfs_sb_info *msblk, void **buffer,
66 struct buffer_head **bh, int b, int offset, int length, int srclength, 65 struct buffer_head **bh, int b, int offset, int length, int srclength,
67 int pages) 66 int pages)
68{ 67{
69 int zlib_err = 0, zlib_init = 0; 68 int zlib_err, zlib_init = 0;
70 int avail, bytes, k = 0, page = 0; 69 int k = 0, page = 0;
71 z_stream *stream = msblk->stream; 70 z_stream *stream = msblk->stream;
72 71
73 mutex_lock(&msblk->read_data_mutex); 72 mutex_lock(&msblk->read_data_mutex);
@@ -75,11 +74,10 @@ static int zlib_uncompress(struct squashfs_sb_info *msblk, void **buffer,
75 stream->avail_out = 0; 74 stream->avail_out = 0;
76 stream->avail_in = 0; 75 stream->avail_in = 0;
77 76
78 bytes = length;
79 do { 77 do {
80 if (stream->avail_in == 0 && k < b) { 78 if (stream->avail_in == 0 && k < b) {
81 avail = min(bytes, msblk->devblksize - offset); 79 int avail = min(length, msblk->devblksize - offset);
82 bytes -= avail; 80 length -= avail;
83 wait_on_buffer(bh[k]); 81 wait_on_buffer(bh[k]);
84 if (!buffer_uptodate(bh[k])) 82 if (!buffer_uptodate(bh[k]))
85 goto release_mutex; 83 goto release_mutex;
@@ -128,6 +126,11 @@ static int zlib_uncompress(struct squashfs_sb_info *msblk, void **buffer,
128 goto release_mutex; 126 goto release_mutex;
129 } 127 }
130 128
129 if (k < b) {
130 ERROR("zlib_uncompress error, data remaining\n");
131 goto release_mutex;
132 }
133
131 length = stream->total_out; 134 length = stream->total_out;
132 mutex_unlock(&msblk->read_data_mutex); 135 mutex_unlock(&msblk->read_data_mutex);
133 return length; 136 return length;
diff --git a/fs/stat.c b/fs/stat.c
index 12e90e213900..d5c61cf2b703 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -75,11 +75,13 @@ int vfs_fstatat(int dfd, const char __user *filename, struct kstat *stat,
75 int error = -EINVAL; 75 int error = -EINVAL;
76 int lookup_flags = 0; 76 int lookup_flags = 0;
77 77
78 if ((flag & ~AT_SYMLINK_NOFOLLOW) != 0) 78 if ((flag & ~(AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT)) != 0)
79 goto out; 79 goto out;
80 80
81 if (!(flag & AT_SYMLINK_NOFOLLOW)) 81 if (!(flag & AT_SYMLINK_NOFOLLOW))
82 lookup_flags |= LOOKUP_FOLLOW; 82 lookup_flags |= LOOKUP_FOLLOW;
83 if (flag & AT_NO_AUTOMOUNT)
84 lookup_flags |= LOOKUP_NO_AUTOMOUNT;
83 85
84 error = user_path_at(dfd, filename, lookup_flags, &path); 86 error = user_path_at(dfd, filename, lookup_flags, &path);
85 if (error) 87 if (error)
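AT_NO_AUTOMOUNT lets the stat family examine an automount point without triggering the automount. A userspace sketch; the fallback flag value mirrors the kernel ABI for libcs that do not define it yet:

#define _GNU_SOURCE
#include <stdio.h>
#include <fcntl.h>
#include <sys/stat.h>

#ifndef AT_NO_AUTOMOUNT
#define AT_NO_AUTOMOUNT 0x800
#endif

int main(int argc, char **argv)
{
	struct stat st;

	if (argc < 2 || fstatat(AT_FDCWD, argv[1], &st, AT_NO_AUTOMOUNT) < 0) {
		perror("fstatat");
		return 1;
	}
	printf("ino %llu\n", (unsigned long long)st.st_ino);
	return 0;
}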
diff --git a/fs/super.c b/fs/super.c
index 823e061faa87..74e149efed81 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -767,13 +767,13 @@ struct dentry *mount_bdev(struct file_system_type *fs_type,
767{ 767{
768 struct block_device *bdev; 768 struct block_device *bdev;
769 struct super_block *s; 769 struct super_block *s;
770 fmode_t mode = FMODE_READ; 770 fmode_t mode = FMODE_READ | FMODE_EXCL;
771 int error = 0; 771 int error = 0;
772 772
773 if (!(flags & MS_RDONLY)) 773 if (!(flags & MS_RDONLY))
774 mode |= FMODE_WRITE; 774 mode |= FMODE_WRITE;
775 775
776 bdev = open_bdev_exclusive(dev_name, mode, fs_type); 776 bdev = blkdev_get_by_path(dev_name, mode, fs_type);
777 if (IS_ERR(bdev)) 777 if (IS_ERR(bdev))
778 return ERR_CAST(bdev); 778 return ERR_CAST(bdev);
779 779
@@ -802,13 +802,13 @@ struct dentry *mount_bdev(struct file_system_type *fs_type,
802 802
803 /* 803 /*
804 * s_umount nests inside bd_mutex during 804 * s_umount nests inside bd_mutex during
805 * __invalidate_device(). close_bdev_exclusive() 805 * __invalidate_device(). blkdev_put() acquires
806 * acquires bd_mutex and can't be called under 806 * bd_mutex and can't be called under s_umount. Drop
807 * s_umount. Drop s_umount temporarily. This is safe 807 * s_umount temporarily. This is safe as we're
808 * as we're holding an active reference. 808 * holding an active reference.
809 */ 809 */
810 up_write(&s->s_umount); 810 up_write(&s->s_umount);
811 close_bdev_exclusive(bdev, mode); 811 blkdev_put(bdev, mode);
812 down_write(&s->s_umount); 812 down_write(&s->s_umount);
813 } else { 813 } else {
814 char b[BDEVNAME_SIZE]; 814 char b[BDEVNAME_SIZE];
@@ -832,7 +832,7 @@ struct dentry *mount_bdev(struct file_system_type *fs_type,
832error_s: 832error_s:
833 error = PTR_ERR(s); 833 error = PTR_ERR(s);
834error_bdev: 834error_bdev:
835 close_bdev_exclusive(bdev, mode); 835 blkdev_put(bdev, mode);
836error: 836error:
837 return ERR_PTR(error); 837 return ERR_PTR(error);
838} 838}
@@ -863,7 +863,8 @@ void kill_block_super(struct super_block *sb)
863 bdev->bd_super = NULL; 863 bdev->bd_super = NULL;
864 generic_shutdown_super(sb); 864 generic_shutdown_super(sb);
865 sync_blockdev(bdev); 865 sync_blockdev(bdev);
866 close_bdev_exclusive(bdev, mode); 866 WARN_ON_ONCE(!(mode & FMODE_EXCL));
867 blkdev_put(bdev, mode | FMODE_EXCL);
867} 868}
868 869
869EXPORT_SYMBOL(kill_block_super); 870EXPORT_SYMBOL(kill_block_super);
@@ -1140,7 +1141,7 @@ static struct vfsmount *fs_set_subtype(struct vfsmount *mnt, const char *fstype)
1140 return mnt; 1141 return mnt;
1141 1142
1142 err: 1143 err:
1143 mntput_long(mnt); 1144 mntput(mnt);
1144 return ERR_PTR(err); 1145 return ERR_PTR(err);
1145} 1146}
1146 1147
diff --git a/fs/sysfs/Kconfig b/fs/sysfs/Kconfig
index f4b67588b9d6..8c41feacbac5 100644
--- a/fs/sysfs/Kconfig
+++ b/fs/sysfs/Kconfig
@@ -1,5 +1,5 @@
1config SYSFS 1config SYSFS
2 bool "sysfs file system support" if EMBEDDED 2 bool "sysfs file system support" if EXPERT
3 default y 3 default y
4 help 4 help
5 The sysfs filesystem is a virtual filesystem that the kernel uses to 5 The sysfs filesystem is a virtual filesystem that the kernel uses to
diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c
index 442f34ff1af8..c8769dc222d8 100644
--- a/fs/sysfs/group.c
+++ b/fs/sysfs/group.c
@@ -165,10 +165,7 @@ int sysfs_merge_group(struct kobject *kobj,
165 struct attribute *const *attr; 165 struct attribute *const *attr;
166 int i; 166 int i;
167 167
168 if (grp) 168 dir_sd = sysfs_get_dirent(kobj->sd, NULL, grp->name);
169 dir_sd = sysfs_get_dirent(kobj->sd, NULL, grp->name);
170 else
171 dir_sd = sysfs_get(kobj->sd);
172 if (!dir_sd) 169 if (!dir_sd)
173 return -ENOENT; 170 return -ENOENT;
174 171
@@ -195,10 +192,7 @@ void sysfs_unmerge_group(struct kobject *kobj,
195 struct sysfs_dirent *dir_sd; 192 struct sysfs_dirent *dir_sd;
196 struct attribute *const *attr; 193 struct attribute *const *attr;
197 194
198 if (grp) 195 dir_sd = sysfs_get_dirent(kobj->sd, NULL, grp->name);
199 dir_sd = sysfs_get_dirent(kobj->sd, NULL, grp->name);
200 else
201 dir_sd = sysfs_get(kobj->sd);
202 if (dir_sd) { 196 if (dir_sd) {
203 for (attr = grp->attrs; *attr; ++attr) 197 for (attr = grp->attrs; *attr; ++attr)
204 sysfs_hash_and_remove(dir_sd, NULL, (*attr)->name); 198 sysfs_hash_and_remove(dir_sd, NULL, (*attr)->name);
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index 30ac27345586..0a12eb89cd32 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -19,6 +19,7 @@
19#include <linux/errno.h> 19#include <linux/errno.h>
20#include <linux/sched.h> 20#include <linux/sched.h>
21#include <linux/slab.h> 21#include <linux/slab.h>
22#include <linux/sysfs.h>
22#include <linux/xattr.h> 23#include <linux/xattr.h>
23#include <linux/security.h> 24#include <linux/security.h>
24#include "sysfs.h" 25#include "sysfs.h"
diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
index ffaaa816bfba..3d28af31d863 100644
--- a/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -9,6 +9,7 @@
9 */ 9 */
10 10
11#include <linux/lockdep.h> 11#include <linux/lockdep.h>
12#include <linux/kobject_ns.h>
12#include <linux/fs.h> 13#include <linux/fs.h>
13 14
14struct sysfs_open_dirent; 15struct sysfs_open_dirent;
diff --git a/fs/sysv/namei.c b/fs/sysv/namei.c
index b5e68da2db32..b427b1208c26 100644
--- a/fs/sysv/namei.c
+++ b/fs/sysv/namei.c
@@ -48,7 +48,6 @@ static struct dentry *sysv_lookup(struct inode * dir, struct dentry * dentry, st
48 struct inode * inode = NULL; 48 struct inode * inode = NULL;
49 ino_t ino; 49 ino_t ino;
50 50
51 d_set_d_op(dentry, dir->i_sb->s_root->d_op);
52 if (dentry->d_name.len > SYSV_NAMELEN) 51 if (dentry->d_name.len > SYSV_NAMELEN)
53 return ERR_PTR(-ENAMETOOLONG); 52 return ERR_PTR(-ENAMETOOLONG);
54 ino = sysv_inode_by_name(dentry); 53 ino = sysv_inode_by_name(dentry);
diff --git a/fs/sysv/super.c b/fs/sysv/super.c
index 76712aefc4ab..f60c196913ea 100644
--- a/fs/sysv/super.c
+++ b/fs/sysv/super.c
@@ -332,6 +332,10 @@ static int complete_read_super(struct super_block *sb, int silent, int size)
332 sb->s_magic = SYSV_MAGIC_BASE + sbi->s_type; 332 sb->s_magic = SYSV_MAGIC_BASE + sbi->s_type;
333 /* set up enough so that it can read an inode */ 333 /* set up enough so that it can read an inode */
334 sb->s_op = &sysv_sops; 334 sb->s_op = &sysv_sops;
335 if (sbi->s_forced_ro)
336 sb->s_flags |= MS_RDONLY;
337 if (sbi->s_truncate)
338 sb->s_d_op = &sysv_dentry_operations;
335 root_inode = sysv_iget(sb, SYSV_ROOT_INO); 339 root_inode = sysv_iget(sb, SYSV_ROOT_INO);
336 if (IS_ERR(root_inode)) { 340 if (IS_ERR(root_inode)) {
337 printk("SysV FS: get root inode failed\n"); 341 printk("SysV FS: get root inode failed\n");
@@ -343,10 +347,6 @@ static int complete_read_super(struct super_block *sb, int silent, int size)
343 printk("SysV FS: get root dentry failed\n"); 347 printk("SysV FS: get root dentry failed\n");
344 return 0; 348 return 0;
345 } 349 }
346 if (sbi->s_forced_ro)
347 sb->s_flags |= MS_RDONLY;
348 if (sbi->s_truncate)
349 d_set_d_op(sb->s_root, &sysv_dentry_operations);
350 return 1; 350 return 1;
351} 351}
352 352
diff --git a/fs/udf/Kconfig b/fs/udf/Kconfig
index f8def3c8ea4c..0e0e99bd6bce 100644
--- a/fs/udf/Kconfig
+++ b/fs/udf/Kconfig
@@ -1,6 +1,5 @@
1config UDF_FS 1config UDF_FS
2 tristate "UDF file system support" 2 tristate "UDF file system support"
3 depends on BKL # needs serious work to remove
4 select CRC_ITU_T 3 select CRC_ITU_T
5 help 4 help
6 This is the new file system used on some CD-ROMs and DVDs. Say Y if 5 This is the new file system used on some CD-ROMs and DVDs. Say Y if
diff --git a/fs/udf/balloc.c b/fs/udf/balloc.c
index b608efaa4cee..306ee39ef2c3 100644
--- a/fs/udf/balloc.c
+++ b/fs/udf/balloc.c
@@ -157,10 +157,9 @@ static void udf_bitmap_free_blocks(struct super_block *sb,
157 udf_debug("bit %ld already set\n", bit + i); 157 udf_debug("bit %ld already set\n", bit + i);
158 udf_debug("byte=%2x\n", 158 udf_debug("byte=%2x\n",
159 ((char *)bh->b_data)[(bit + i) >> 3]); 159 ((char *)bh->b_data)[(bit + i) >> 3]);
160 } else {
161 udf_add_free_space(sb, sbi->s_partition, 1);
162 } 160 }
163 } 161 }
162 udf_add_free_space(sb, sbi->s_partition, count);
164 mark_buffer_dirty(bh); 163 mark_buffer_dirty(bh);
165 if (overflow) { 164 if (overflow) {
166 block += count; 165 block += count;
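
The balloc.c hunk batches the free-space accounting: one udf_add_free_space() call for the whole extent instead of one per bit, with already-set bits (a corruption indicator, since a set bit in the UDF space bitmap means "free") now only logged but still included in the count. In miniature, under hypothetical names:

	#include <linux/bitops.h>

	/* Freeing marks bits set; a bit that was already set means the
	 * block was already free. Hedged sketch, not the UDF code. */
	static void free_range(unsigned long *map, long bit, long count,
			       long *free_count)
	{
		long i;

		for (i = 0; i < count; i++)
			if (test_and_set_bit(bit + i, map))
				/* already free: report only, keep going */;
		*free_count += count;	/* single batched update, as above */
	}
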
diff --git a/fs/udf/dir.c b/fs/udf/dir.c
index 51552bf50225..eb8bfe2b89a5 100644
--- a/fs/udf/dir.c
+++ b/fs/udf/dir.c
@@ -30,7 +30,6 @@
30#include <linux/errno.h> 30#include <linux/errno.h>
31#include <linux/mm.h> 31#include <linux/mm.h>
32#include <linux/slab.h> 32#include <linux/slab.h>
33#include <linux/smp_lock.h>
34#include <linux/buffer_head.h> 33#include <linux/buffer_head.h>
35 34
36#include "udf_i.h" 35#include "udf_i.h"
@@ -190,18 +189,14 @@ static int udf_readdir(struct file *filp, void *dirent, filldir_t filldir)
190 struct inode *dir = filp->f_path.dentry->d_inode; 189 struct inode *dir = filp->f_path.dentry->d_inode;
191 int result; 190 int result;
192 191
193 lock_kernel();
194
195 if (filp->f_pos == 0) { 192 if (filp->f_pos == 0) {
196 if (filldir(dirent, ".", 1, filp->f_pos, dir->i_ino, DT_DIR) < 0) { 193 if (filldir(dirent, ".", 1, filp->f_pos, dir->i_ino, DT_DIR) < 0) {
197 unlock_kernel();
198 return 0; 194 return 0;
199 } 195 }
200 filp->f_pos++; 196 filp->f_pos++;
201 } 197 }
202 198
203 result = do_udf_readdir(dir, filp, filldir, dirent); 199 result = do_udf_readdir(dir, filp, filldir, dirent);
204 unlock_kernel();
205 return result; 200 return result;
206} 201}
207 202
diff --git a/fs/udf/file.c b/fs/udf/file.c
index 66b9e7e7e4c5..89c78486cbbe 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -32,7 +32,6 @@
32#include <linux/string.h> /* memset */ 32#include <linux/string.h> /* memset */
33#include <linux/capability.h> 33#include <linux/capability.h>
34#include <linux/errno.h> 34#include <linux/errno.h>
35#include <linux/smp_lock.h>
36#include <linux/pagemap.h> 35#include <linux/pagemap.h>
37#include <linux/buffer_head.h> 36#include <linux/buffer_head.h>
38#include <linux/aio.h> 37#include <linux/aio.h>
@@ -114,6 +113,7 @@ static ssize_t udf_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
114 size_t count = iocb->ki_left; 113 size_t count = iocb->ki_left;
115 struct udf_inode_info *iinfo = UDF_I(inode); 114 struct udf_inode_info *iinfo = UDF_I(inode);
116 115
116 down_write(&iinfo->i_data_sem);
117 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { 117 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) {
118 if (file->f_flags & O_APPEND) 118 if (file->f_flags & O_APPEND)
119 pos = inode->i_size; 119 pos = inode->i_size;
@@ -126,6 +126,7 @@ static ssize_t udf_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
126 udf_expand_file_adinicb(inode, pos + count, &err); 126 udf_expand_file_adinicb(inode, pos + count, &err);
127 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { 127 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) {
128 udf_debug("udf_expand_adinicb: err=%d\n", err); 128 udf_debug("udf_expand_adinicb: err=%d\n", err);
129 up_write(&iinfo->i_data_sem);
129 return err; 130 return err;
130 } 131 }
131 } else { 132 } else {
@@ -135,6 +136,7 @@ static ssize_t udf_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
135 iinfo->i_lenAlloc = inode->i_size; 136 iinfo->i_lenAlloc = inode->i_size;
136 } 137 }
137 } 138 }
139 up_write(&iinfo->i_data_sem);
138 140
139 retval = generic_file_aio_write(iocb, iov, nr_segs, ppos); 141 retval = generic_file_aio_write(iocb, iov, nr_segs, ppos);
140 if (retval > 0) 142 if (retval > 0)
@@ -149,8 +151,6 @@ long udf_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
149 long old_block, new_block; 151 long old_block, new_block;
150 int result = -EINVAL; 152 int result = -EINVAL;
151 153
152 lock_kernel();
153
154 if (file_permission(filp, MAY_READ) != 0) { 154 if (file_permission(filp, MAY_READ) != 0) {
155 udf_debug("no permission to access inode %lu\n", inode->i_ino); 155 udf_debug("no permission to access inode %lu\n", inode->i_ino);
156 result = -EPERM; 156 result = -EPERM;
@@ -196,7 +196,6 @@ long udf_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
196 } 196 }
197 197
198out: 198out:
199 unlock_kernel();
200 return result; 199 return result;
201} 200}
202 201
@@ -204,10 +203,10 @@ static int udf_release_file(struct inode *inode, struct file *filp)
204{ 203{
205 if (filp->f_mode & FMODE_WRITE) { 204 if (filp->f_mode & FMODE_WRITE) {
206 mutex_lock(&inode->i_mutex); 205 mutex_lock(&inode->i_mutex);
207 lock_kernel(); 206 down_write(&UDF_I(inode)->i_data_sem);
208 udf_discard_prealloc(inode); 207 udf_discard_prealloc(inode);
209 udf_truncate_tail_extent(inode); 208 udf_truncate_tail_extent(inode);
210 unlock_kernel(); 209 up_write(&UDF_I(inode)->i_data_sem);
211 mutex_unlock(&inode->i_mutex); 210 mutex_unlock(&inode->i_mutex);
212 } 211 }
213 return 0; 212 return 0;
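
The file.c conversion is representative of the whole series: the BKL around in-ICB state becomes the per-inode i_data_sem, and the early-return error path in the middle of the function must now unlock explicitly. The shape, hedged and with hypothetical helpers:

	#include <linux/rwsem.h>

	struct my_inode_info { struct rw_semaphore i_data_sem; /* ... */ };
	/* hypothetical helpers standing in for the ICB checks above */
	int stored_in_inode(struct my_inode_info *ii);
	size_t inline_capacity(struct my_inode_info *ii);
	int expand_to_blocks(struct my_inode_info *ii, loff_t new_size);

	static ssize_t write_begin_sketch(struct my_inode_info *ii,
					  loff_t pos, size_t count)
	{
		down_write(&ii->i_data_sem);	/* protects in-inode layout */
		if (stored_in_inode(ii) && pos + count > inline_capacity(ii)) {
			int err = expand_to_blocks(ii, pos + count);
			if (err) {
				up_write(&ii->i_data_sem); /* every exit unlocks */
				return err;
			}
		}
		up_write(&ii->i_data_sem);
		return 0;	/* the generic write path runs without the rwsem */
	}
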
diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c
index 75d9304d0dc3..6fb7e0adcda0 100644
--- a/fs/udf/ialloc.c
+++ b/fs/udf/ialloc.c
@@ -92,28 +92,19 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err)
92 return NULL; 92 return NULL;
93 } 93 }
94 94
95 mutex_lock(&sbi->s_alloc_mutex);
96 if (sbi->s_lvid_bh) { 95 if (sbi->s_lvid_bh) {
97 struct logicalVolIntegrityDesc *lvid = 96 struct logicalVolIntegrityDescImpUse *lvidiu;
98 (struct logicalVolIntegrityDesc *) 97
99 sbi->s_lvid_bh->b_data; 98 iinfo->i_unique = lvid_get_unique_id(sb);
100 struct logicalVolIntegrityDescImpUse *lvidiu = 99 mutex_lock(&sbi->s_alloc_mutex);
101 udf_sb_lvidiu(sbi); 100 lvidiu = udf_sb_lvidiu(sbi);
102 struct logicalVolHeaderDesc *lvhd;
103 uint64_t uniqueID;
104 lvhd = (struct logicalVolHeaderDesc *)
105 (lvid->logicalVolContentsUse);
106 if (S_ISDIR(mode)) 101 if (S_ISDIR(mode))
107 le32_add_cpu(&lvidiu->numDirs, 1); 102 le32_add_cpu(&lvidiu->numDirs, 1);
108 else 103 else
109 le32_add_cpu(&lvidiu->numFiles, 1); 104 le32_add_cpu(&lvidiu->numFiles, 1);
110 iinfo->i_unique = uniqueID = le64_to_cpu(lvhd->uniqueID);
111 if (!(++uniqueID & 0x00000000FFFFFFFFUL))
112 uniqueID += 16;
113 lvhd->uniqueID = cpu_to_le64(uniqueID);
114 udf_updated_lvid(sb); 105 udf_updated_lvid(sb);
106 mutex_unlock(&sbi->s_alloc_mutex);
115 } 107 }
116 mutex_unlock(&sbi->s_alloc_mutex);
117 108
118 inode_init_owner(inode, dir, mode); 109 inode_init_owner(inode, dir, mode);
119 110
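
The open-coded uniqueID bumping in udf_new_inode() moves into lvid_get_unique_id() (added in the super.c hunks below), shrinking the s_alloc_mutex hold time to the LVID update itself. The helper keeps the UDF quirk that the low 32 bits of uniqueID skip 0-15 on wraparound, since those values are reserved by the spec; a worked check:

	#include <stdint.h>

	static uint64_t next_unique(uint64_t id)
	{
		if (!(++id & 0xFFFFFFFFULL))	/* low 32 bits wrapped to zero */
			id += 16;		/* skip reserved IDs 0..15 */
		return id;
	}
	/* next_unique(0xFFFFFFFF) == 0x100000010, not 0x100000000 */
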
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index fc48f37aa2dd..c6a2e782b97b 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -31,7 +31,6 @@
31 31
32#include "udfdecl.h" 32#include "udfdecl.h"
33#include <linux/mm.h> 33#include <linux/mm.h>
34#include <linux/smp_lock.h>
35#include <linux/module.h> 34#include <linux/module.h>
36#include <linux/pagemap.h> 35#include <linux/pagemap.h>
37#include <linux/buffer_head.h> 36#include <linux/buffer_head.h>
@@ -51,6 +50,7 @@ MODULE_LICENSE("GPL");
51static mode_t udf_convert_permissions(struct fileEntry *); 50static mode_t udf_convert_permissions(struct fileEntry *);
52static int udf_update_inode(struct inode *, int); 51static int udf_update_inode(struct inode *, int);
53static void udf_fill_inode(struct inode *, struct buffer_head *); 52static void udf_fill_inode(struct inode *, struct buffer_head *);
53static int udf_sync_inode(struct inode *inode);
54static int udf_alloc_i_data(struct inode *inode, size_t size); 54static int udf_alloc_i_data(struct inode *inode, size_t size);
55static struct buffer_head *inode_getblk(struct inode *, sector_t, int *, 55static struct buffer_head *inode_getblk(struct inode *, sector_t, int *,
56 sector_t *, int *); 56 sector_t *, int *);
@@ -79,9 +79,7 @@ void udf_evict_inode(struct inode *inode)
79 want_delete = 1; 79 want_delete = 1;
80 inode->i_size = 0; 80 inode->i_size = 0;
81 udf_truncate(inode); 81 udf_truncate(inode);
82 lock_kernel();
83 udf_update_inode(inode, IS_SYNC(inode)); 82 udf_update_inode(inode, IS_SYNC(inode));
84 unlock_kernel();
85 } 83 }
86 invalidate_inode_buffers(inode); 84 invalidate_inode_buffers(inode);
87 end_writeback(inode); 85 end_writeback(inode);
@@ -97,9 +95,7 @@ void udf_evict_inode(struct inode *inode)
97 kfree(iinfo->i_ext.i_data); 95 kfree(iinfo->i_ext.i_data);
98 iinfo->i_ext.i_data = NULL; 96 iinfo->i_ext.i_data = NULL;
99 if (want_delete) { 97 if (want_delete) {
100 lock_kernel();
101 udf_free_inode(inode); 98 udf_free_inode(inode);
102 unlock_kernel();
103 } 99 }
104} 100}
105 101
@@ -302,10 +298,9 @@ static int udf_get_block(struct inode *inode, sector_t block,
302 err = -EIO; 298 err = -EIO;
303 new = 0; 299 new = 0;
304 bh = NULL; 300 bh = NULL;
305
306 lock_kernel();
307
308 iinfo = UDF_I(inode); 301 iinfo = UDF_I(inode);
302
303 down_write(&iinfo->i_data_sem);
309 if (block == iinfo->i_next_alloc_block + 1) { 304 if (block == iinfo->i_next_alloc_block + 1) {
310 iinfo->i_next_alloc_block++; 305 iinfo->i_next_alloc_block++;
311 iinfo->i_next_alloc_goal++; 306 iinfo->i_next_alloc_goal++;
@@ -324,7 +319,7 @@ static int udf_get_block(struct inode *inode, sector_t block,
324 map_bh(bh_result, inode->i_sb, phys); 319 map_bh(bh_result, inode->i_sb, phys);
325 320
326abort: 321abort:
327 unlock_kernel(); 322 up_write(&iinfo->i_data_sem);
328 return err; 323 return err;
329} 324}
330 325
@@ -1022,16 +1017,16 @@ void udf_truncate(struct inode *inode)
1022 if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) 1017 if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
1023 return; 1018 return;
1024 1019
1025 lock_kernel();
1026 iinfo = UDF_I(inode); 1020 iinfo = UDF_I(inode);
1027 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { 1021 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) {
1022 down_write(&iinfo->i_data_sem);
1028 if (inode->i_sb->s_blocksize < 1023 if (inode->i_sb->s_blocksize <
1029 (udf_file_entry_alloc_offset(inode) + 1024 (udf_file_entry_alloc_offset(inode) +
1030 inode->i_size)) { 1025 inode->i_size)) {
1031 udf_expand_file_adinicb(inode, inode->i_size, &err); 1026 udf_expand_file_adinicb(inode, inode->i_size, &err);
1032 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { 1027 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) {
1033 inode->i_size = iinfo->i_lenAlloc; 1028 inode->i_size = iinfo->i_lenAlloc;
1034 unlock_kernel(); 1029 up_write(&iinfo->i_data_sem);
1035 return; 1030 return;
1036 } else 1031 } else
1037 udf_truncate_extents(inode); 1032 udf_truncate_extents(inode);
@@ -1042,10 +1037,13 @@ void udf_truncate(struct inode *inode)
1042 offset - udf_file_entry_alloc_offset(inode)); 1037 offset - udf_file_entry_alloc_offset(inode));
1043 iinfo->i_lenAlloc = inode->i_size; 1038 iinfo->i_lenAlloc = inode->i_size;
1044 } 1039 }
1040 up_write(&iinfo->i_data_sem);
1045 } else { 1041 } else {
1046 block_truncate_page(inode->i_mapping, inode->i_size, 1042 block_truncate_page(inode->i_mapping, inode->i_size,
1047 udf_get_block); 1043 udf_get_block);
1044 down_write(&iinfo->i_data_sem);
1048 udf_truncate_extents(inode); 1045 udf_truncate_extents(inode);
1046 up_write(&iinfo->i_data_sem);
1049 } 1047 }
1050 1048
1051 inode->i_mtime = inode->i_ctime = current_fs_time(inode->i_sb); 1049 inode->i_mtime = inode->i_ctime = current_fs_time(inode->i_sb);
@@ -1053,7 +1051,6 @@ void udf_truncate(struct inode *inode)
1053 udf_sync_inode(inode); 1051 udf_sync_inode(inode);
1054 else 1052 else
1055 mark_inode_dirty(inode); 1053 mark_inode_dirty(inode);
1056 unlock_kernel();
1057} 1054}
1058 1055
1059static void __udf_read_inode(struct inode *inode) 1056static void __udf_read_inode(struct inode *inode)
@@ -1202,6 +1199,7 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
1202 return; 1199 return;
1203 } 1200 }
1204 1201
1202 read_lock(&sbi->s_cred_lock);
1205 inode->i_uid = le32_to_cpu(fe->uid); 1203 inode->i_uid = le32_to_cpu(fe->uid);
1206 if (inode->i_uid == -1 || 1204 if (inode->i_uid == -1 ||
1207 UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_UID_IGNORE) || 1205 UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_UID_IGNORE) ||
@@ -1214,13 +1212,6 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
1214 UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_GID_SET)) 1212 UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_GID_SET))
1215 inode->i_gid = UDF_SB(inode->i_sb)->s_gid; 1213 inode->i_gid = UDF_SB(inode->i_sb)->s_gid;
1216 1214
1217 inode->i_nlink = le16_to_cpu(fe->fileLinkCount);
1218 if (!inode->i_nlink)
1219 inode->i_nlink = 1;
1220
1221 inode->i_size = le64_to_cpu(fe->informationLength);
1222 iinfo->i_lenExtents = inode->i_size;
1223
1224 if (fe->icbTag.fileType != ICBTAG_FILE_TYPE_DIRECTORY && 1215 if (fe->icbTag.fileType != ICBTAG_FILE_TYPE_DIRECTORY &&
1225 sbi->s_fmode != UDF_INVALID_MODE) 1216 sbi->s_fmode != UDF_INVALID_MODE)
1226 inode->i_mode = sbi->s_fmode; 1217 inode->i_mode = sbi->s_fmode;
@@ -1230,6 +1221,14 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
1230 else 1221 else
1231 inode->i_mode = udf_convert_permissions(fe); 1222 inode->i_mode = udf_convert_permissions(fe);
1232 inode->i_mode &= ~sbi->s_umask; 1223 inode->i_mode &= ~sbi->s_umask;
1224 read_unlock(&sbi->s_cred_lock);
1225
1226 inode->i_nlink = le16_to_cpu(fe->fileLinkCount);
1227 if (!inode->i_nlink)
1228 inode->i_nlink = 1;
1229
1230 inode->i_size = le64_to_cpu(fe->informationLength);
1231 iinfo->i_lenExtents = inode->i_size;
1233 1232
1234 if (iinfo->i_efe == 0) { 1233 if (iinfo->i_efe == 0) {
1235 inode->i_blocks = le64_to_cpu(fe->logicalBlocksRecorded) << 1234 inode->i_blocks = le64_to_cpu(fe->logicalBlocksRecorded) <<
@@ -1373,16 +1372,10 @@ static mode_t udf_convert_permissions(struct fileEntry *fe)
1373 1372
1374int udf_write_inode(struct inode *inode, struct writeback_control *wbc) 1373int udf_write_inode(struct inode *inode, struct writeback_control *wbc)
1375{ 1374{
1376 int ret; 1375 return udf_update_inode(inode, wbc->sync_mode == WB_SYNC_ALL);
1377
1378 lock_kernel();
1379 ret = udf_update_inode(inode, wbc->sync_mode == WB_SYNC_ALL);
1380 unlock_kernel();
1381
1382 return ret;
1383} 1376}
1384 1377
1385int udf_sync_inode(struct inode *inode) 1378static int udf_sync_inode(struct inode *inode)
1386{ 1379{
1387 return udf_update_inode(inode, 1); 1380 return udf_update_inode(inode, 1);
1388} 1381}
@@ -2048,7 +2041,7 @@ long udf_block_map(struct inode *inode, sector_t block)
2048 struct extent_position epos = {}; 2041 struct extent_position epos = {};
2049 int ret; 2042 int ret;
2050 2043
2051 lock_kernel(); 2044 down_read(&UDF_I(inode)->i_data_sem);
2052 2045
2053 if (inode_bmap(inode, block, &epos, &eloc, &elen, &offset) == 2046 if (inode_bmap(inode, block, &epos, &eloc, &elen, &offset) ==
2054 (EXT_RECORDED_ALLOCATED >> 30)) 2047 (EXT_RECORDED_ALLOCATED >> 30))
@@ -2056,7 +2049,7 @@ long udf_block_map(struct inode *inode, sector_t block)
2056 else 2049 else
2057 ret = 0; 2050 ret = 0;
2058 2051
2059 unlock_kernel(); 2052 up_read(&UDF_I(inode)->i_data_sem);
2060 brelse(epos.bh); 2053 brelse(epos.bh);
2061 2054
2062 if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_VARCONV)) 2055 if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_VARCONV))
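
Note the asymmetry the inode.c hunks establish: udf_block_map() only walks extents, so it takes i_data_sem shared, while udf_get_block() may allocate and takes it exclusive. Reduced to its essentials (names hypothetical):

	#include <linux/rwsem.h>

	struct extents;				/* hypothetical */
	long lookup_extent(struct extents *e, long blk);
	long alloc_extent(struct extents *e, long blk);

	static long map_block(struct rw_semaphore *sem, struct extents *e,
			      long blk)
	{
		long phys;

		down_read(sem);		/* extent list is only read */
		phys = lookup_extent(e, blk);
		up_read(sem);
		return phys;
	}

	static long get_block(struct rw_semaphore *sem, struct extents *e,
			      long blk)
	{
		long phys;

		down_write(sem);	/* may append to the extent list */
		phys = alloc_extent(e, blk);
		up_write(sem);
		return phys;
	}
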
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index 6d8dc02baebb..2be0f9eb86d2 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -27,7 +27,6 @@
27#include <linux/errno.h> 27#include <linux/errno.h>
28#include <linux/mm.h> 28#include <linux/mm.h>
29#include <linux/slab.h> 29#include <linux/slab.h>
30#include <linux/smp_lock.h>
31#include <linux/buffer_head.h> 30#include <linux/buffer_head.h>
32#include <linux/sched.h> 31#include <linux/sched.h>
33#include <linux/crc-itu-t.h> 32#include <linux/crc-itu-t.h>
@@ -228,10 +227,8 @@ static struct fileIdentDesc *udf_find_entry(struct inode *dir,
228 } 227 }
229 228
230 if ((cfi->fileCharacteristics & FID_FILE_CHAR_PARENT) && 229 if ((cfi->fileCharacteristics & FID_FILE_CHAR_PARENT) &&
231 isdotdot) { 230 isdotdot)
232 brelse(epos.bh); 231 goto out_ok;
233 return fi;
234 }
235 232
236 if (!lfi) 233 if (!lfi)
237 continue; 234 continue;
@@ -263,7 +260,6 @@ static struct dentry *udf_lookup(struct inode *dir, struct dentry *dentry,
263 if (dentry->d_name.len > UDF_NAME_LEN - 2) 260 if (dentry->d_name.len > UDF_NAME_LEN - 2)
264 return ERR_PTR(-ENAMETOOLONG); 261 return ERR_PTR(-ENAMETOOLONG);
265 262
266 lock_kernel();
267#ifdef UDF_RECOVERY 263#ifdef UDF_RECOVERY
268 /* temporary shorthand for specifying files by inode number */ 264 /* temporary shorthand for specifying files by inode number */
269 if (!strncmp(dentry->d_name.name, ".B=", 3)) { 265 if (!strncmp(dentry->d_name.name, ".B=", 3)) {
@@ -275,7 +271,6 @@ static struct dentry *udf_lookup(struct inode *dir, struct dentry *dentry,
275 }; 271 };
276 inode = udf_iget(dir->i_sb, lb); 272 inode = udf_iget(dir->i_sb, lb);
277 if (!inode) { 273 if (!inode) {
278 unlock_kernel();
279 return ERR_PTR(-EACCES); 274 return ERR_PTR(-EACCES);
280 } 275 }
281 } else 276 } else
@@ -291,11 +286,9 @@ static struct dentry *udf_lookup(struct inode *dir, struct dentry *dentry,
291 loc = lelb_to_cpu(cfi.icb.extLocation); 286 loc = lelb_to_cpu(cfi.icb.extLocation);
292 inode = udf_iget(dir->i_sb, &loc); 287 inode = udf_iget(dir->i_sb, &loc);
293 if (!inode) { 288 if (!inode) {
294 unlock_kernel();
295 return ERR_PTR(-EACCES); 289 return ERR_PTR(-EACCES);
296 } 290 }
297 } 291 }
298 unlock_kernel();
299 292
300 return d_splice_alias(inode, dentry); 293 return d_splice_alias(inode, dentry);
301} 294}
@@ -476,15 +469,19 @@ add:
476 f_pos >> dir->i_sb->s_blocksize_bits, 1, err); 469 f_pos >> dir->i_sb->s_blocksize_bits, 1, err);
477 if (!fibh->ebh) 470 if (!fibh->ebh)
478 goto out_err; 471 goto out_err;
472 /* Extents could have been merged, invalidate our position */
473 brelse(epos.bh);
474 epos.bh = NULL;
475 epos.block = dinfo->i_location;
476 epos.offset = udf_file_entry_alloc_offset(dir);
479 477
480 if (!fibh->soffset) { 478 if (!fibh->soffset) {
481 if (udf_next_aext(dir, &epos, &eloc, &elen, 1) == 479 /* Find the freshly allocated block */
482 (EXT_RECORDED_ALLOCATED >> 30)) { 480 while (udf_next_aext(dir, &epos, &eloc, &elen, 1) ==
483 block = eloc.logicalBlockNum + ((elen - 1) >> 481 (EXT_RECORDED_ALLOCATED >> 30))
482 ;
483 block = eloc.logicalBlockNum + ((elen - 1) >>
484 dir->i_sb->s_blocksize_bits); 484 dir->i_sb->s_blocksize_bits);
485 } else
486 block++;
487
488 brelse(fibh->sbh); 485 brelse(fibh->sbh);
489 fibh->sbh = fibh->ebh; 486 fibh->sbh = fibh->ebh;
490 fi = (struct fileIdentDesc *)(fibh->sbh->b_data); 487 fi = (struct fileIdentDesc *)(fibh->sbh->b_data);
@@ -562,10 +559,8 @@ static int udf_create(struct inode *dir, struct dentry *dentry, int mode,
562 int err; 559 int err;
563 struct udf_inode_info *iinfo; 560 struct udf_inode_info *iinfo;
564 561
565 lock_kernel();
566 inode = udf_new_inode(dir, mode, &err); 562 inode = udf_new_inode(dir, mode, &err);
567 if (!inode) { 563 if (!inode) {
568 unlock_kernel();
569 return err; 564 return err;
570 } 565 }
571 566
@@ -583,7 +578,6 @@ static int udf_create(struct inode *dir, struct dentry *dentry, int mode,
583 inode->i_nlink--; 578 inode->i_nlink--;
584 mark_inode_dirty(inode); 579 mark_inode_dirty(inode);
585 iput(inode); 580 iput(inode);
586 unlock_kernel();
587 return err; 581 return err;
588 } 582 }
589 cfi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize); 583 cfi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize);
@@ -596,7 +590,6 @@ static int udf_create(struct inode *dir, struct dentry *dentry, int mode,
596 if (fibh.sbh != fibh.ebh) 590 if (fibh.sbh != fibh.ebh)
597 brelse(fibh.ebh); 591 brelse(fibh.ebh);
598 brelse(fibh.sbh); 592 brelse(fibh.sbh);
599 unlock_kernel();
600 d_instantiate(dentry, inode); 593 d_instantiate(dentry, inode);
601 594
602 return 0; 595 return 0;
@@ -614,7 +607,6 @@ static int udf_mknod(struct inode *dir, struct dentry *dentry, int mode,
614 if (!old_valid_dev(rdev)) 607 if (!old_valid_dev(rdev))
615 return -EINVAL; 608 return -EINVAL;
616 609
617 lock_kernel();
618 err = -EIO; 610 err = -EIO;
619 inode = udf_new_inode(dir, mode, &err); 611 inode = udf_new_inode(dir, mode, &err);
620 if (!inode) 612 if (!inode)
@@ -627,7 +619,6 @@ static int udf_mknod(struct inode *dir, struct dentry *dentry, int mode,
627 inode->i_nlink--; 619 inode->i_nlink--;
628 mark_inode_dirty(inode); 620 mark_inode_dirty(inode);
629 iput(inode); 621 iput(inode);
630 unlock_kernel();
631 return err; 622 return err;
632 } 623 }
633 cfi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize); 624 cfi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize);
@@ -646,7 +637,6 @@ static int udf_mknod(struct inode *dir, struct dentry *dentry, int mode,
646 err = 0; 637 err = 0;
647 638
648out: 639out:
649 unlock_kernel();
650 return err; 640 return err;
651} 641}
652 642
@@ -659,7 +649,6 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, int mode)
659 struct udf_inode_info *dinfo = UDF_I(dir); 649 struct udf_inode_info *dinfo = UDF_I(dir);
660 struct udf_inode_info *iinfo; 650 struct udf_inode_info *iinfo;
661 651
662 lock_kernel();
663 err = -EMLINK; 652 err = -EMLINK;
664 if (dir->i_nlink >= (256 << sizeof(dir->i_nlink)) - 1) 653 if (dir->i_nlink >= (256 << sizeof(dir->i_nlink)) - 1)
665 goto out; 654 goto out;
@@ -712,7 +701,6 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, int mode)
712 err = 0; 701 err = 0;
713 702
714out: 703out:
715 unlock_kernel();
716 return err; 704 return err;
717} 705}
718 706
@@ -794,7 +782,6 @@ static int udf_rmdir(struct inode *dir, struct dentry *dentry)
794 struct kernel_lb_addr tloc; 782 struct kernel_lb_addr tloc;
795 783
796 retval = -ENOENT; 784 retval = -ENOENT;
797 lock_kernel();
798 fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi); 785 fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi);
799 if (!fi) 786 if (!fi)
800 goto out; 787 goto out;
@@ -826,7 +813,6 @@ end_rmdir:
826 brelse(fibh.sbh); 813 brelse(fibh.sbh);
827 814
828out: 815out:
829 unlock_kernel();
830 return retval; 816 return retval;
831} 817}
832 818
@@ -840,7 +826,6 @@ static int udf_unlink(struct inode *dir, struct dentry *dentry)
840 struct kernel_lb_addr tloc; 826 struct kernel_lb_addr tloc;
841 827
842 retval = -ENOENT; 828 retval = -ENOENT;
843 lock_kernel();
844 fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi); 829 fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi);
845 if (!fi) 830 if (!fi)
846 goto out; 831 goto out;
@@ -870,7 +855,6 @@ end_unlink:
870 brelse(fibh.sbh); 855 brelse(fibh.sbh);
871 856
872out: 857out:
873 unlock_kernel();
874 return retval; 858 return retval;
875} 859}
876 860
@@ -890,21 +874,21 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
890 int block; 874 int block;
891 unsigned char *name = NULL; 875 unsigned char *name = NULL;
892 int namelen; 876 int namelen;
893 struct buffer_head *bh;
894 struct udf_inode_info *iinfo; 877 struct udf_inode_info *iinfo;
878 struct super_block *sb = dir->i_sb;
895 879
896 lock_kernel();
897 inode = udf_new_inode(dir, S_IFLNK | S_IRWXUGO, &err); 880 inode = udf_new_inode(dir, S_IFLNK | S_IRWXUGO, &err);
898 if (!inode) 881 if (!inode)
899 goto out; 882 goto out;
900 883
884 iinfo = UDF_I(inode);
885 down_write(&iinfo->i_data_sem);
901 name = kmalloc(UDF_NAME_LEN, GFP_NOFS); 886 name = kmalloc(UDF_NAME_LEN, GFP_NOFS);
902 if (!name) { 887 if (!name) {
903 err = -ENOMEM; 888 err = -ENOMEM;
904 goto out_no_entry; 889 goto out_no_entry;
905 } 890 }
906 891
907 iinfo = UDF_I(inode);
908 inode->i_data.a_ops = &udf_symlink_aops; 892 inode->i_data.a_ops = &udf_symlink_aops;
909 inode->i_op = &udf_symlink_inode_operations; 893 inode->i_op = &udf_symlink_inode_operations;
910 894
@@ -912,7 +896,7 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
912 struct kernel_lb_addr eloc; 896 struct kernel_lb_addr eloc;
913 uint32_t bsize; 897 uint32_t bsize;
914 898
915 block = udf_new_block(inode->i_sb, inode, 899 block = udf_new_block(sb, inode,
916 iinfo->i_location.partitionReferenceNum, 900 iinfo->i_location.partitionReferenceNum,
917 iinfo->i_location.logicalBlockNum, &err); 901 iinfo->i_location.logicalBlockNum, &err);
918 if (!block) 902 if (!block)
@@ -923,17 +907,17 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
923 eloc.logicalBlockNum = block; 907 eloc.logicalBlockNum = block;
924 eloc.partitionReferenceNum = 908 eloc.partitionReferenceNum =
925 iinfo->i_location.partitionReferenceNum; 909 iinfo->i_location.partitionReferenceNum;
926 bsize = inode->i_sb->s_blocksize; 910 bsize = sb->s_blocksize;
927 iinfo->i_lenExtents = bsize; 911 iinfo->i_lenExtents = bsize;
928 udf_add_aext(inode, &epos, &eloc, bsize, 0); 912 udf_add_aext(inode, &epos, &eloc, bsize, 0);
929 brelse(epos.bh); 913 brelse(epos.bh);
930 914
931 block = udf_get_pblock(inode->i_sb, block, 915 block = udf_get_pblock(sb, block,
932 iinfo->i_location.partitionReferenceNum, 916 iinfo->i_location.partitionReferenceNum,
933 0); 917 0);
934 epos.bh = udf_tgetblk(inode->i_sb, block); 918 epos.bh = udf_tgetblk(sb, block);
935 lock_buffer(epos.bh); 919 lock_buffer(epos.bh);
936 memset(epos.bh->b_data, 0x00, inode->i_sb->s_blocksize); 920 memset(epos.bh->b_data, 0x00, bsize);
937 set_buffer_uptodate(epos.bh); 921 set_buffer_uptodate(epos.bh);
938 unlock_buffer(epos.bh); 922 unlock_buffer(epos.bh);
939 mark_buffer_dirty_inode(epos.bh, inode); 923 mark_buffer_dirty_inode(epos.bh, inode);
@@ -941,7 +925,7 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
941 } else 925 } else
942 ea = iinfo->i_ext.i_data + iinfo->i_lenEAttr; 926 ea = iinfo->i_ext.i_data + iinfo->i_lenEAttr;
943 927
944 eoffset = inode->i_sb->s_blocksize - udf_ext0_offset(inode); 928 eoffset = sb->s_blocksize - udf_ext0_offset(inode);
945 pc = (struct pathComponent *)ea; 929 pc = (struct pathComponent *)ea;
946 930
947 if (*symname == '/') { 931 if (*symname == '/') {
@@ -981,7 +965,7 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
981 } 965 }
982 966
983 if (pc->componentType == 5) { 967 if (pc->componentType == 5) {
984 namelen = udf_put_filename(inode->i_sb, compstart, name, 968 namelen = udf_put_filename(sb, compstart, name,
985 symname - compstart); 969 symname - compstart);
986 if (!namelen) 970 if (!namelen)
987 goto out_no_entry; 971 goto out_no_entry;
@@ -1015,27 +999,16 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
1015 fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err); 999 fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err);
1016 if (!fi) 1000 if (!fi)
1017 goto out_no_entry; 1001 goto out_no_entry;
1018 cfi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize); 1002 cfi.icb.extLength = cpu_to_le32(sb->s_blocksize);
1019 cfi.icb.extLocation = cpu_to_lelb(iinfo->i_location); 1003 cfi.icb.extLocation = cpu_to_lelb(iinfo->i_location);
1020 bh = UDF_SB(inode->i_sb)->s_lvid_bh; 1004 if (UDF_SB(inode->i_sb)->s_lvid_bh) {
1021 if (bh) {
1022 struct logicalVolIntegrityDesc *lvid =
1023 (struct logicalVolIntegrityDesc *)bh->b_data;
1024 struct logicalVolHeaderDesc *lvhd;
1025 uint64_t uniqueID;
1026 lvhd = (struct logicalVolHeaderDesc *)
1027 lvid->logicalVolContentsUse;
1028 uniqueID = le64_to_cpu(lvhd->uniqueID);
1029 *(__le32 *)((struct allocDescImpUse *)cfi.icb.impUse)->impUse = 1005 *(__le32 *)((struct allocDescImpUse *)cfi.icb.impUse)->impUse =
1030 cpu_to_le32(uniqueID & 0x00000000FFFFFFFFUL); 1006 cpu_to_le32(lvid_get_unique_id(sb));
1031 if (!(++uniqueID & 0x00000000FFFFFFFFUL))
1032 uniqueID += 16;
1033 lvhd->uniqueID = cpu_to_le64(uniqueID);
1034 mark_buffer_dirty(bh);
1035 } 1007 }
1036 udf_write_fi(dir, &cfi, fi, &fibh, NULL, NULL); 1008 udf_write_fi(dir, &cfi, fi, &fibh, NULL, NULL);
1037 if (UDF_I(dir)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) 1009 if (UDF_I(dir)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB)
1038 mark_inode_dirty(dir); 1010 mark_inode_dirty(dir);
1011 up_write(&iinfo->i_data_sem);
1039 if (fibh.sbh != fibh.ebh) 1012 if (fibh.sbh != fibh.ebh)
1040 brelse(fibh.ebh); 1013 brelse(fibh.ebh);
1041 brelse(fibh.sbh); 1014 brelse(fibh.sbh);
@@ -1044,10 +1017,10 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
1044 1017
1045out: 1018out:
1046 kfree(name); 1019 kfree(name);
1047 unlock_kernel();
1048 return err; 1020 return err;
1049 1021
1050out_no_entry: 1022out_no_entry:
1023 up_write(&iinfo->i_data_sem);
1051 inode_dec_link_count(inode); 1024 inode_dec_link_count(inode);
1052 iput(inode); 1025 iput(inode);
1053 goto out; 1026 goto out;
@@ -1060,36 +1033,20 @@ static int udf_link(struct dentry *old_dentry, struct inode *dir,
1060 struct udf_fileident_bh fibh; 1033 struct udf_fileident_bh fibh;
1061 struct fileIdentDesc cfi, *fi; 1034 struct fileIdentDesc cfi, *fi;
1062 int err; 1035 int err;
1063 struct buffer_head *bh;
1064 1036
1065 lock_kernel();
1066 if (inode->i_nlink >= (256 << sizeof(inode->i_nlink)) - 1) { 1037 if (inode->i_nlink >= (256 << sizeof(inode->i_nlink)) - 1) {
1067 unlock_kernel();
1068 return -EMLINK; 1038 return -EMLINK;
1069 } 1039 }
1070 1040
1071 fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err); 1041 fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err);
1072 if (!fi) { 1042 if (!fi) {
1073 unlock_kernel();
1074 return err; 1043 return err;
1075 } 1044 }
1076 cfi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize); 1045 cfi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize);
1077 cfi.icb.extLocation = cpu_to_lelb(UDF_I(inode)->i_location); 1046 cfi.icb.extLocation = cpu_to_lelb(UDF_I(inode)->i_location);
1078 bh = UDF_SB(inode->i_sb)->s_lvid_bh; 1047 if (UDF_SB(inode->i_sb)->s_lvid_bh) {
1079 if (bh) {
1080 struct logicalVolIntegrityDesc *lvid =
1081 (struct logicalVolIntegrityDesc *)bh->b_data;
1082 struct logicalVolHeaderDesc *lvhd;
1083 uint64_t uniqueID;
1084 lvhd = (struct logicalVolHeaderDesc *)
1085 (lvid->logicalVolContentsUse);
1086 uniqueID = le64_to_cpu(lvhd->uniqueID);
1087 *(__le32 *)((struct allocDescImpUse *)cfi.icb.impUse)->impUse = 1048 *(__le32 *)((struct allocDescImpUse *)cfi.icb.impUse)->impUse =
1088 cpu_to_le32(uniqueID & 0x00000000FFFFFFFFUL); 1049 cpu_to_le32(lvid_get_unique_id(inode->i_sb));
1089 if (!(++uniqueID & 0x00000000FFFFFFFFUL))
1090 uniqueID += 16;
1091 lvhd->uniqueID = cpu_to_le64(uniqueID);
1092 mark_buffer_dirty(bh);
1093 } 1050 }
1094 udf_write_fi(dir, &cfi, fi, &fibh, NULL, NULL); 1051 udf_write_fi(dir, &cfi, fi, &fibh, NULL, NULL);
1095 if (UDF_I(dir)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) 1052 if (UDF_I(dir)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB)
@@ -1103,7 +1060,6 @@ static int udf_link(struct dentry *old_dentry, struct inode *dir,
1103 mark_inode_dirty(inode); 1060 mark_inode_dirty(inode);
1104 ihold(inode); 1061 ihold(inode);
1105 d_instantiate(dentry, inode); 1062 d_instantiate(dentry, inode);
1106 unlock_kernel();
1107 1063
1108 return 0; 1064 return 0;
1109} 1065}
@@ -1124,7 +1080,6 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry,
1124 struct kernel_lb_addr tloc; 1080 struct kernel_lb_addr tloc;
1125 struct udf_inode_info *old_iinfo = UDF_I(old_inode); 1081 struct udf_inode_info *old_iinfo = UDF_I(old_inode);
1126 1082
1127 lock_kernel();
1128 ofi = udf_find_entry(old_dir, &old_dentry->d_name, &ofibh, &ocfi); 1083 ofi = udf_find_entry(old_dir, &old_dentry->d_name, &ofibh, &ocfi);
1129 if (ofi) { 1084 if (ofi) {
1130 if (ofibh.sbh != ofibh.ebh) 1085 if (ofibh.sbh != ofibh.ebh)
@@ -1248,7 +1203,6 @@ end_rename:
1248 brelse(nfibh.ebh); 1203 brelse(nfibh.ebh);
1249 brelse(nfibh.sbh); 1204 brelse(nfibh.sbh);
1250 } 1205 }
1251 unlock_kernel();
1252 1206
1253 return retval; 1207 return retval;
1254} 1208}
@@ -1261,7 +1215,6 @@ static struct dentry *udf_get_parent(struct dentry *child)
1261 struct fileIdentDesc cfi; 1215 struct fileIdentDesc cfi;
1262 struct udf_fileident_bh fibh; 1216 struct udf_fileident_bh fibh;
1263 1217
1264 lock_kernel();
1265 if (!udf_find_entry(child->d_inode, &dotdot, &fibh, &cfi)) 1218 if (!udf_find_entry(child->d_inode, &dotdot, &fibh, &cfi))
1266 goto out_unlock; 1219 goto out_unlock;
1267 1220
@@ -1273,11 +1226,9 @@ static struct dentry *udf_get_parent(struct dentry *child)
1273 inode = udf_iget(child->d_inode->i_sb, &tloc); 1226 inode = udf_iget(child->d_inode->i_sb, &tloc);
1274 if (!inode) 1227 if (!inode)
1275 goto out_unlock; 1228 goto out_unlock;
1276 unlock_kernel();
1277 1229
1278 return d_obtain_alias(inode); 1230 return d_obtain_alias(inode);
1279out_unlock: 1231out_unlock:
1280 unlock_kernel();
1281 return ERR_PTR(-EACCES); 1232 return ERR_PTR(-EACCES);
1282} 1233}
1283 1234
diff --git a/fs/udf/partition.c b/fs/udf/partition.c
index 745eb209be0c..a71090ea0e07 100644
--- a/fs/udf/partition.c
+++ b/fs/udf/partition.c
@@ -25,6 +25,7 @@
25#include <linux/fs.h> 25#include <linux/fs.h>
26#include <linux/string.h> 26#include <linux/string.h>
27#include <linux/buffer_head.h> 27#include <linux/buffer_head.h>
28#include <linux/mutex.h>
28 29
29uint32_t udf_get_pblock(struct super_block *sb, uint32_t block, 30uint32_t udf_get_pblock(struct super_block *sb, uint32_t block,
30 uint16_t partition, uint32_t offset) 31 uint16_t partition, uint32_t offset)
@@ -159,7 +160,9 @@ int udf_relocate_blocks(struct super_block *sb, long old_block, long *new_block)
159 struct udf_sb_info *sbi = UDF_SB(sb); 160 struct udf_sb_info *sbi = UDF_SB(sb);
160 u16 reallocationTableLen; 161 u16 reallocationTableLen;
161 struct buffer_head *bh; 162 struct buffer_head *bh;
163 int ret = 0;
162 164
165 mutex_lock(&sbi->s_alloc_mutex);
163 for (i = 0; i < sbi->s_partitions; i++) { 166 for (i = 0; i < sbi->s_partitions; i++) {
164 struct udf_part_map *map = &sbi->s_partmaps[i]; 167 struct udf_part_map *map = &sbi->s_partmaps[i];
165 if (old_block > map->s_partition_root && 168 if (old_block > map->s_partition_root &&
@@ -175,8 +178,10 @@ int udf_relocate_blocks(struct super_block *sb, long old_block, long *new_block)
175 break; 178 break;
176 } 179 }
177 180
178 if (!st) 181 if (!st) {
179 return 1; 182 ret = 1;
183 goto out;
184 }
180 185
181 reallocationTableLen = 186 reallocationTableLen =
182 le16_to_cpu(st->reallocationTableLen); 187 le16_to_cpu(st->reallocationTableLen);
@@ -207,14 +212,16 @@ int udf_relocate_blocks(struct super_block *sb, long old_block, long *new_block)
207 ((old_block - 212 ((old_block -
208 map->s_partition_root) & 213 map->s_partition_root) &
209 (sdata->s_packet_len - 1)); 214 (sdata->s_packet_len - 1));
210 return 0; 215 ret = 0;
216 goto out;
211 } else if (origLoc == packet) { 217 } else if (origLoc == packet) {
212 *new_block = le32_to_cpu( 218 *new_block = le32_to_cpu(
213 entry->mappedLocation) + 219 entry->mappedLocation) +
214 ((old_block - 220 ((old_block -
215 map->s_partition_root) & 221 map->s_partition_root) &
216 (sdata->s_packet_len - 1)); 222 (sdata->s_packet_len - 1));
217 return 0; 223 ret = 0;
224 goto out;
218 } else if (origLoc > packet) 225 } else if (origLoc > packet)
219 break; 226 break;
220 } 227 }
@@ -251,20 +258,24 @@ int udf_relocate_blocks(struct super_block *sb, long old_block, long *new_block)
251 st->mapEntry[k].mappedLocation) + 258 st->mapEntry[k].mappedLocation) +
252 ((old_block - map->s_partition_root) & 259 ((old_block - map->s_partition_root) &
253 (sdata->s_packet_len - 1)); 260 (sdata->s_packet_len - 1));
254 return 0; 261 ret = 0;
262 goto out;
255 } 263 }
256 264
257 return 1; 265 ret = 1;
266 goto out;
258 } /* if old_block */ 267 } /* if old_block */
259 } 268 }
260 269
261 if (i == sbi->s_partitions) { 270 if (i == sbi->s_partitions) {
262 /* outside of partitions */ 271 /* outside of partitions */
263 /* for now, fail =) */ 272 /* for now, fail =) */
264 return 1; 273 ret = 1;
265 } 274 }
266 275
267 return 0; 276out:
277 mutex_unlock(&sbi->s_alloc_mutex);
278 return ret;
268} 279}
269 280
270static uint32_t udf_try_read_meta(struct inode *inode, uint32_t block, 281static uint32_t udf_try_read_meta(struct inode *inode, uint32_t block,
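
udf_relocate_blocks() gains s_alloc_mutex, so its many early returns are rewritten as ret/goto and the lock is released on every path. The generic single-exit shape, as a hedged sketch with hypothetical lookups:

	#include <linux/mutex.h>

	int find_table(void);		/* hypothetical */
	int do_relocation(void);

	static int relocate_sketch(struct mutex *lock)
	{
		int ret = 0;

		mutex_lock(lock);
		if (!find_table()) {
			ret = 1;		/* was: return 1 */
			goto out;
		}
		ret = do_relocation();		/* was: return 0 / return 1 */
	out:
		mutex_unlock(lock);
		return ret;
	}
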
diff --git a/fs/udf/super.c b/fs/udf/super.c
index b539d53320fb..7b27b063ff6d 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -48,7 +48,6 @@
48#include <linux/stat.h> 48#include <linux/stat.h>
49#include <linux/cdrom.h> 49#include <linux/cdrom.h>
50#include <linux/nls.h> 50#include <linux/nls.h>
51#include <linux/smp_lock.h>
52#include <linux/buffer_head.h> 51#include <linux/buffer_head.h>
53#include <linux/vfs.h> 52#include <linux/vfs.h>
54#include <linux/vmalloc.h> 53#include <linux/vmalloc.h>
@@ -135,6 +134,7 @@ static struct inode *udf_alloc_inode(struct super_block *sb)
135 ei->i_next_alloc_block = 0; 134 ei->i_next_alloc_block = 0;
136 ei->i_next_alloc_goal = 0; 135 ei->i_next_alloc_goal = 0;
137 ei->i_strat4096 = 0; 136 ei->i_strat4096 = 0;
137 init_rwsem(&ei->i_data_sem);
138 138
139 return &ei->vfs_inode; 139 return &ei->vfs_inode;
140} 140}
@@ -574,13 +574,14 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options)
574 if (!udf_parse_options(options, &uopt, true)) 574 if (!udf_parse_options(options, &uopt, true))
575 return -EINVAL; 575 return -EINVAL;
576 576
577 lock_kernel(); 577 write_lock(&sbi->s_cred_lock);
578 sbi->s_flags = uopt.flags; 578 sbi->s_flags = uopt.flags;
579 sbi->s_uid = uopt.uid; 579 sbi->s_uid = uopt.uid;
580 sbi->s_gid = uopt.gid; 580 sbi->s_gid = uopt.gid;
581 sbi->s_umask = uopt.umask; 581 sbi->s_umask = uopt.umask;
582 sbi->s_fmode = uopt.fmode; 582 sbi->s_fmode = uopt.fmode;
583 sbi->s_dmode = uopt.dmode; 583 sbi->s_dmode = uopt.dmode;
584 write_unlock(&sbi->s_cred_lock);
584 585
585 if (sbi->s_lvid_bh) { 586 if (sbi->s_lvid_bh) {
586 int write_rev = le16_to_cpu(udf_sb_lvidiu(sbi)->minUDFWriteRev); 587 int write_rev = le16_to_cpu(udf_sb_lvidiu(sbi)->minUDFWriteRev);
@@ -597,7 +598,6 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options)
597 udf_open_lvid(sb); 598 udf_open_lvid(sb);
598 599
599out_unlock: 600out_unlock:
600 unlock_kernel();
601 return error; 601 return error;
602} 602}
603 603
@@ -966,9 +966,9 @@ static struct udf_bitmap *udf_sb_alloc_bitmap(struct super_block *sb, u32 index)
966 (sizeof(struct buffer_head *) * nr_groups); 966 (sizeof(struct buffer_head *) * nr_groups);
967 967
968 if (size <= PAGE_SIZE) 968 if (size <= PAGE_SIZE)
969 bitmap = kmalloc(size, GFP_KERNEL); 969 bitmap = kzalloc(size, GFP_KERNEL);
970 else 970 else
971 bitmap = vmalloc(size); /* TODO: get rid of vmalloc */ 971 bitmap = vzalloc(size); /* TODO: get rid of vzalloc */
972 972
973 if (bitmap == NULL) { 973 if (bitmap == NULL) {
974 udf_error(sb, __func__, 974 udf_error(sb, __func__,
@@ -977,7 +977,6 @@ static struct udf_bitmap *udf_sb_alloc_bitmap(struct super_block *sb, u32 index)
977 return NULL; 977 return NULL;
978 } 978 }
979 979
980 memset(bitmap, 0x00, size);
981 bitmap->s_block_bitmap = (struct buffer_head **)(bitmap + 1); 980 bitmap->s_block_bitmap = (struct buffer_head **)(bitmap + 1);
982 bitmap->s_nr_groups = nr_groups; 981 bitmap->s_nr_groups = nr_groups;
983 return bitmap; 982 return bitmap;
@@ -1781,6 +1780,8 @@ static void udf_open_lvid(struct super_block *sb)
1781 1780
1782 if (!bh) 1781 if (!bh)
1783 return; 1782 return;
1783
1784 mutex_lock(&sbi->s_alloc_mutex);
1784 lvid = (struct logicalVolIntegrityDesc *)bh->b_data; 1785 lvid = (struct logicalVolIntegrityDesc *)bh->b_data;
1785 lvidiu = udf_sb_lvidiu(sbi); 1786 lvidiu = udf_sb_lvidiu(sbi);
1786 1787
@@ -1797,6 +1798,7 @@ static void udf_open_lvid(struct super_block *sb)
1797 lvid->descTag.tagChecksum = udf_tag_checksum(&lvid->descTag); 1798 lvid->descTag.tagChecksum = udf_tag_checksum(&lvid->descTag);
1798 mark_buffer_dirty(bh); 1799 mark_buffer_dirty(bh);
1799 sbi->s_lvid_dirty = 0; 1800 sbi->s_lvid_dirty = 0;
1801 mutex_unlock(&sbi->s_alloc_mutex);
1800} 1802}
1801 1803
1802static void udf_close_lvid(struct super_block *sb) 1804static void udf_close_lvid(struct super_block *sb)
@@ -1809,6 +1811,7 @@ static void udf_close_lvid(struct super_block *sb)
1809 if (!bh) 1811 if (!bh)
1810 return; 1812 return;
1811 1813
1814 mutex_lock(&sbi->s_alloc_mutex);
1812 lvid = (struct logicalVolIntegrityDesc *)bh->b_data; 1815 lvid = (struct logicalVolIntegrityDesc *)bh->b_data;
1813 lvidiu = udf_sb_lvidiu(sbi); 1816 lvidiu = udf_sb_lvidiu(sbi);
1814 lvidiu->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX; 1817 lvidiu->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX;
@@ -1829,6 +1832,34 @@ static void udf_close_lvid(struct super_block *sb)
1829 lvid->descTag.tagChecksum = udf_tag_checksum(&lvid->descTag); 1832 lvid->descTag.tagChecksum = udf_tag_checksum(&lvid->descTag);
1830 mark_buffer_dirty(bh); 1833 mark_buffer_dirty(bh);
1831 sbi->s_lvid_dirty = 0; 1834 sbi->s_lvid_dirty = 0;
1835 mutex_unlock(&sbi->s_alloc_mutex);
1836}
1837
1838u64 lvid_get_unique_id(struct super_block *sb)
1839{
1840 struct buffer_head *bh;
1841 struct udf_sb_info *sbi = UDF_SB(sb);
1842 struct logicalVolIntegrityDesc *lvid;
1843 struct logicalVolHeaderDesc *lvhd;
1844 u64 uniqueID;
1845 u64 ret;
1846
1847 bh = sbi->s_lvid_bh;
1848 if (!bh)
1849 return 0;
1850
1851 lvid = (struct logicalVolIntegrityDesc *)bh->b_data;
1852 lvhd = (struct logicalVolHeaderDesc *)lvid->logicalVolContentsUse;
1853
1854 mutex_lock(&sbi->s_alloc_mutex);
1855 ret = uniqueID = le64_to_cpu(lvhd->uniqueID);
1856 if (!(++uniqueID & 0xFFFFFFFF))
1857 uniqueID += 16;
1858 lvhd->uniqueID = cpu_to_le64(uniqueID);
1859 mutex_unlock(&sbi->s_alloc_mutex);
1860 mark_buffer_dirty(bh);
1861
1862 return ret;
1832} 1863}
1833 1864
1834static void udf_sb_free_bitmap(struct udf_bitmap *bitmap) 1865static void udf_sb_free_bitmap(struct udf_bitmap *bitmap)
@@ -1886,8 +1917,6 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
1886 struct kernel_lb_addr rootdir, fileset; 1917 struct kernel_lb_addr rootdir, fileset;
1887 struct udf_sb_info *sbi; 1918 struct udf_sb_info *sbi;
1888 1919
1889 lock_kernel();
1890
1891 uopt.flags = (1 << UDF_FLAG_USE_AD_IN_ICB) | (1 << UDF_FLAG_STRICT); 1920 uopt.flags = (1 << UDF_FLAG_USE_AD_IN_ICB) | (1 << UDF_FLAG_STRICT);
1892 uopt.uid = -1; 1921 uopt.uid = -1;
1893 uopt.gid = -1; 1922 uopt.gid = -1;
@@ -1896,10 +1925,8 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
1896 uopt.dmode = UDF_INVALID_MODE; 1925 uopt.dmode = UDF_INVALID_MODE;
1897 1926
1898 sbi = kzalloc(sizeof(struct udf_sb_info), GFP_KERNEL); 1927 sbi = kzalloc(sizeof(struct udf_sb_info), GFP_KERNEL);
1899 if (!sbi) { 1928 if (!sbi)
1900 unlock_kernel();
1901 return -ENOMEM; 1929 return -ENOMEM;
1902 }
1903 1930
1904 sb->s_fs_info = sbi; 1931 sb->s_fs_info = sbi;
1905 1932
@@ -1936,6 +1963,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
1936 sbi->s_fmode = uopt.fmode; 1963 sbi->s_fmode = uopt.fmode;
1937 sbi->s_dmode = uopt.dmode; 1964 sbi->s_dmode = uopt.dmode;
1938 sbi->s_nls_map = uopt.nls_map; 1965 sbi->s_nls_map = uopt.nls_map;
1966 rwlock_init(&sbi->s_cred_lock);
1939 1967
1940 if (uopt.session == 0xFFFFFFFF) 1968 if (uopt.session == 0xFFFFFFFF)
1941 sbi->s_session = udf_get_last_session(sb); 1969 sbi->s_session = udf_get_last_session(sb);
@@ -2045,7 +2073,6 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
2045 goto error_out; 2073 goto error_out;
2046 } 2074 }
2047 sb->s_maxbytes = MAX_LFS_FILESIZE; 2075 sb->s_maxbytes = MAX_LFS_FILESIZE;
2048 unlock_kernel();
2049 return 0; 2076 return 0;
2050 2077
2051error_out: 2078error_out:
@@ -2066,7 +2093,6 @@ error_out:
2066 kfree(sbi); 2093 kfree(sbi);
2067 sb->s_fs_info = NULL; 2094 sb->s_fs_info = NULL;
2068 2095
2069 unlock_kernel();
2070 return -EINVAL; 2096 return -EINVAL;
2071} 2097}
2072 2098
@@ -2105,8 +2131,6 @@ static void udf_put_super(struct super_block *sb)
2105 2131
2106 sbi = UDF_SB(sb); 2132 sbi = UDF_SB(sb);
2107 2133
2108 lock_kernel();
2109
2110 if (sbi->s_vat_inode) 2134 if (sbi->s_vat_inode)
2111 iput(sbi->s_vat_inode); 2135 iput(sbi->s_vat_inode);
2112 if (sbi->s_partitions) 2136 if (sbi->s_partitions)
@@ -2122,8 +2146,6 @@ static void udf_put_super(struct super_block *sb)
2122 kfree(sbi->s_partmaps); 2146 kfree(sbi->s_partmaps);
2123 kfree(sb->s_fs_info); 2147 kfree(sb->s_fs_info);
2124 sb->s_fs_info = NULL; 2148 sb->s_fs_info = NULL;
2125
2126 unlock_kernel();
2127} 2149}
2128 2150
2129static int udf_sync_fs(struct super_block *sb, int wait) 2151static int udf_sync_fs(struct super_block *sb, int wait)
@@ -2186,8 +2208,6 @@ static unsigned int udf_count_free_bitmap(struct super_block *sb,
2186 uint16_t ident; 2208 uint16_t ident;
2187 struct spaceBitmapDesc *bm; 2209 struct spaceBitmapDesc *bm;
2188 2210
2189 lock_kernel();
2190
2191 loc.logicalBlockNum = bitmap->s_extPosition; 2211 loc.logicalBlockNum = bitmap->s_extPosition;
2192 loc.partitionReferenceNum = UDF_SB(sb)->s_partition; 2212 loc.partitionReferenceNum = UDF_SB(sb)->s_partition;
2193 bh = udf_read_ptagged(sb, &loc, 0, &ident); 2213 bh = udf_read_ptagged(sb, &loc, 0, &ident);
@@ -2224,10 +2244,7 @@ static unsigned int udf_count_free_bitmap(struct super_block *sb,
2224 } 2244 }
2225 } 2245 }
2226 brelse(bh); 2246 brelse(bh);
2227
2228out: 2247out:
2229 unlock_kernel();
2230
2231 return accum; 2248 return accum;
2232} 2249}
2233 2250
@@ -2240,8 +2257,7 @@ static unsigned int udf_count_free_table(struct super_block *sb,
2240 int8_t etype; 2257 int8_t etype;
2241 struct extent_position epos; 2258 struct extent_position epos;
2242 2259
2243 lock_kernel(); 2260 mutex_lock(&UDF_SB(sb)->s_alloc_mutex);
2244
2245 epos.block = UDF_I(table)->i_location; 2261 epos.block = UDF_I(table)->i_location;
2246 epos.offset = sizeof(struct unallocSpaceEntry); 2262 epos.offset = sizeof(struct unallocSpaceEntry);
2247 epos.bh = NULL; 2263 epos.bh = NULL;
@@ -2250,8 +2266,7 @@ static unsigned int udf_count_free_table(struct super_block *sb,
2250 accum += (elen >> table->i_sb->s_blocksize_bits); 2266 accum += (elen >> table->i_sb->s_blocksize_bits);
2251 2267
2252 brelse(epos.bh); 2268 brelse(epos.bh);
2253 2269 mutex_unlock(&UDF_SB(sb)->s_alloc_mutex);
2254 unlock_kernel();
2255 2270
2256 return accum; 2271 return accum;
2257} 2272}
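
The new s_cred_lock that threads through these super.c hunks is a plain rwlock around the mount-option credentials: remount is the rare writer, every udf_fill_inode() a reader, so uid/gid/umask are seen as a consistent set without the BKL. In outline (fields hypothetical):

	#include <linux/spinlock.h>

	struct cred_opts { int uid, gid, umask; };
	static DEFINE_RWLOCK(cred_lock);

	static void remount_creds(struct cred_opts *c, int uid, int gid,
				  int umask)
	{
		write_lock(&cred_lock);		/* writer excludes everyone */
		c->uid = uid;
		c->gid = gid;
		c->umask = umask;
		write_unlock(&cred_lock);
	}

	static int effective_mode(const struct cred_opts *c, int mode)
	{
		int m;

		read_lock(&cred_lock);		/* readers run concurrently */
		m = mode & ~c->umask;
		read_unlock(&cred_lock);
		return m;
	}
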
diff --git a/fs/udf/symlink.c b/fs/udf/symlink.c
index 16064787d2b7..b1d4488b0f14 100644
--- a/fs/udf/symlink.c
+++ b/fs/udf/symlink.c
@@ -27,7 +27,6 @@
27#include <linux/mm.h> 27#include <linux/mm.h>
28#include <linux/stat.h> 28#include <linux/stat.h>
29#include <linux/pagemap.h> 29#include <linux/pagemap.h>
30#include <linux/smp_lock.h>
31#include <linux/buffer_head.h> 30#include <linux/buffer_head.h>
32#include "udf_i.h" 31#include "udf_i.h"
33 32
@@ -78,13 +77,16 @@ static int udf_symlink_filler(struct file *file, struct page *page)
78 int err = -EIO; 77 int err = -EIO;
79 unsigned char *p = kmap(page); 78 unsigned char *p = kmap(page);
80 struct udf_inode_info *iinfo; 79 struct udf_inode_info *iinfo;
80 uint32_t pos;
81 81
82 lock_kernel();
83 iinfo = UDF_I(inode); 82 iinfo = UDF_I(inode);
83 pos = udf_block_map(inode, 0);
84
85 down_read(&iinfo->i_data_sem);
84 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { 86 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) {
85 symlink = iinfo->i_ext.i_data + iinfo->i_lenEAttr; 87 symlink = iinfo->i_ext.i_data + iinfo->i_lenEAttr;
86 } else { 88 } else {
87 bh = sb_bread(inode->i_sb, udf_block_map(inode, 0)); 89 bh = sb_bread(inode->i_sb, pos);
88 90
89 if (!bh) 91 if (!bh)
90 goto out; 92 goto out;
@@ -95,14 +97,14 @@ static int udf_symlink_filler(struct file *file, struct page *page)
95 udf_pc_to_char(inode->i_sb, symlink, inode->i_size, p); 97 udf_pc_to_char(inode->i_sb, symlink, inode->i_size, p);
96 brelse(bh); 98 brelse(bh);
97 99
98 unlock_kernel(); 100 up_read(&iinfo->i_data_sem);
99 SetPageUptodate(page); 101 SetPageUptodate(page);
100 kunmap(page); 102 kunmap(page);
101 unlock_page(page); 103 unlock_page(page);
102 return 0; 104 return 0;
103 105
104out: 106out:
105 unlock_kernel(); 107 up_read(&iinfo->i_data_sem);
106 SetPageError(page); 108 SetPageError(page);
107 kunmap(page); 109 kunmap(page);
108 unlock_page(page); 110 unlock_page(page);
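
The reordering in udf_symlink_filler() matters: udf_block_map() now takes i_data_sem for reading itself (see the inode.c hunk above), so it has to run before the filler's own down_read(). Taking the same rwsem for reading twice can deadlock once a writer queues in between; illustrated with a deliberately broken sketch:

	#include <linux/rwsem.h>

	static void recursive_read_deadlock(struct rw_semaphore *sem)
	{
		down_read(sem);		/* first shared acquisition succeeds */
		/*
		 * If another task calls down_write(sem) here, it queues and
		 * blocks readers arriving after it; our second down_read()
		 * then waits on the writer, which waits on us: deadlock.
		 */
		down_read(sem);		/* do not do this */
		up_read(sem);
		up_read(sem);
	}
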
diff --git a/fs/udf/udf_i.h b/fs/udf/udf_i.h
index e58d1de41073..d1bd31ea724e 100644
--- a/fs/udf/udf_i.h
+++ b/fs/udf/udf_i.h
@@ -1,6 +1,18 @@
1#ifndef _UDF_I_H 1#ifndef _UDF_I_H
2#define _UDF_I_H 2#define _UDF_I_H
3 3
4/*
5 * The i_data_sem and i_mutex protect the allocation information of
6 * regular files and symlinks. This includes all extents belonging to
7 * the file/symlink, whether the data lives in-inode or in external
8 * data blocks, preallocation, and goal block information... When
9 * extents are read, i_mutex or i_data_sem must be held (holding
10 * i_data_sem for reading is enough). When extents are changed,
11 * i_data_sem must be held for writing and i_mutex must be held as well.
12 *
13 * For directories, i_mutex is used for all the necessary protection.
14 */
15
4struct udf_inode_info { 16struct udf_inode_info {
5 struct timespec i_crtime; 17 struct timespec i_crtime;
6 /* Physical address of inode */ 18 /* Physical address of inode */
@@ -21,6 +33,7 @@ struct udf_inode_info {
21 struct long_ad *i_lad; 33 struct long_ad *i_lad;
22 __u8 *i_data; 34 __u8 *i_data;
23 } i_ext; 35 } i_ext;
36 struct rw_semaphore i_data_sem;
24 struct inode vfs_inode; 37 struct inode vfs_inode;
25}; 38};
26 39
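
Applied literally, the comment above gives writers a two-lock discipline: i_mutex serializes writers against each other, while i_data_sem held for writing excludes the lock-light readers. A sketch of an extent-changing path under that rule (the modification itself is elided):

	#include <linux/fs.h>
	#include <linux/mutex.h>
	#include <linux/rwsem.h>

	static void change_extents(struct inode *inode,
				   struct rw_semaphore *data_sem)
	{
		mutex_lock(&inode->i_mutex);	/* writer vs. writer */
		down_write(data_sem);		/* writer vs. reader */
		/* ... modify the extent list here ... */
		up_write(data_sem);
		mutex_unlock(&inode->i_mutex);
	}
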
diff --git a/fs/udf/udf_sb.h b/fs/udf/udf_sb.h
index d113b72c2768..4858c191242b 100644
--- a/fs/udf/udf_sb.h
+++ b/fs/udf/udf_sb.h
@@ -2,6 +2,7 @@
2#define __LINUX_UDF_SB_H 2#define __LINUX_UDF_SB_H
3 3
4#include <linux/mutex.h> 4#include <linux/mutex.h>
5#include <linux/bitops.h>
5 6
6/* Since UDF 2.01 is ISO 13346 based... */ 7/* Since UDF 2.01 is ISO 13346 based... */
7#define UDF_SUPER_MAGIC 0x15013346 8#define UDF_SUPER_MAGIC 0x15013346
@@ -128,6 +129,8 @@ struct udf_sb_info {
128 uid_t s_uid; 129 uid_t s_uid;
129 mode_t s_fmode; 130 mode_t s_fmode;
130 mode_t s_dmode; 131 mode_t s_dmode;
132 /* Lock protecting consistency of above permission settings */
133 rwlock_t s_cred_lock;
131 134
132 /* Root Info */ 135 /* Root Info */
133 struct timespec s_record_time; 136 struct timespec s_record_time;
@@ -139,7 +142,7 @@ struct udf_sb_info {
139 __u16 s_udfrev; 142 __u16 s_udfrev;
140 143
141 /* Miscellaneous flags */ 144 /* Miscellaneous flags */
142 __u32 s_flags; 145 unsigned long s_flags;
143 146
144 /* Encoding info */ 147 /* Encoding info */
145 struct nls_table *s_nls_map; 148 struct nls_table *s_nls_map;
@@ -161,8 +164,19 @@ struct logicalVolIntegrityDescImpUse *udf_sb_lvidiu(struct udf_sb_info *sbi);
161 164
162int udf_compute_nr_groups(struct super_block *sb, u32 partition); 165int udf_compute_nr_groups(struct super_block *sb, u32 partition);
163 166
164#define UDF_QUERY_FLAG(X,Y) ( UDF_SB(X)->s_flags & ( 1 << (Y) ) ) 167static inline int UDF_QUERY_FLAG(struct super_block *sb, int flag)
165#define UDF_SET_FLAG(X,Y) ( UDF_SB(X)->s_flags |= ( 1 << (Y) ) ) 168{
166#define UDF_CLEAR_FLAG(X,Y) ( UDF_SB(X)->s_flags &= ~( 1 << (Y) ) ) 169 return test_bit(flag, &UDF_SB(sb)->s_flags);
170}
171
172static inline void UDF_SET_FLAG(struct super_block *sb, int flag)
173{
174 set_bit(flag, &UDF_SB(sb)->s_flags);
175}
176
177static inline void UDF_CLEAR_FLAG(struct super_block *sb, int flag)
178{
179 clear_bit(flag, &UDF_SB(sb)->s_flags);
180}
167 181
168#endif /* __LINUX_UDF_SB_H */ 182#endif /* __LINUX_UDF_SB_H */
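
Converting the UDF flag macros to test_bit()/set_bit()/clear_bit() is what forces s_flags from __u32 to unsigned long: the atomic bitops operate on unsigned long words. It also makes flag updates safe without the BKL, since set_bit() is an atomic read-modify-write. A toy equivalent:

	#include <linux/bitops.h>

	static unsigned long flags;	/* bitops require unsigned long */

	static void toggle_demo(int flag)
	{
		set_bit(flag, &flags);		/* atomic; no lock needed */
		if (test_bit(flag, &flags))
			clear_bit(flag, &flags);
	}
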
diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h
index 6995ab1f4305..eba48209f9f3 100644
--- a/fs/udf/udfdecl.h
+++ b/fs/udf/udfdecl.h
@@ -111,6 +111,8 @@ struct extent_position {
111}; 111};
112 112
113/* super.c */ 113/* super.c */
114
115__attribute__((format(printf, 3, 4)))
114extern void udf_warning(struct super_block *, const char *, const char *, ...); 116extern void udf_warning(struct super_block *, const char *, const char *, ...);
115static inline void udf_updated_lvid(struct super_block *sb) 117static inline void udf_updated_lvid(struct super_block *sb)
116{ 118{
@@ -123,6 +125,7 @@ static inline void udf_updated_lvid(struct super_block *sb)
123 sb->s_dirt = 1; 125 sb->s_dirt = 1;
124 UDF_SB(sb)->s_lvid_dirty = 1; 126 UDF_SB(sb)->s_lvid_dirty = 1;
125} 127}
128extern u64 lvid_get_unique_id(struct super_block *sb);
126 129
127/* namei.c */ 130/* namei.c */
128extern int udf_write_fi(struct inode *inode, struct fileIdentDesc *, 131extern int udf_write_fi(struct inode *inode, struct fileIdentDesc *,
@@ -133,7 +136,6 @@ extern int udf_write_fi(struct inode *inode, struct fileIdentDesc *,
133extern long udf_ioctl(struct file *, unsigned int, unsigned long); 136extern long udf_ioctl(struct file *, unsigned int, unsigned long);
134/* inode.c */ 137/* inode.c */
135extern struct inode *udf_iget(struct super_block *, struct kernel_lb_addr *); 138extern struct inode *udf_iget(struct super_block *, struct kernel_lb_addr *);
136extern int udf_sync_inode(struct inode *);
137extern void udf_expand_file_adinicb(struct inode *, int, int *); 139extern void udf_expand_file_adinicb(struct inode *, int, int *);
138extern struct buffer_head *udf_expand_dir_adinicb(struct inode *, int *, int *); 140extern struct buffer_head *udf_expand_dir_adinicb(struct inode *, int *, int *);
139extern struct buffer_head *udf_bread(struct inode *, int, int, int *); 141extern struct buffer_head *udf_bread(struct inode *, int, int, int *);
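
The format attribute added before udf_warning() lets the compiler type-check every call site's varargs against the format string, which is cheap insurance once warnings are printed from many newly unlocked paths. A deliberately bad call it would now catch:

	struct super_block;

	__attribute__((format(printf, 3, 4)))
	void udf_warning(struct super_block *sb, const char *function,
			 const char *fmt, ...);

	static void demo(struct super_block *sb)
	{
		/* -Wformat: '%d' expects int, but the argument is long */
		udf_warning(sb, __func__, "bad block %d\n", 123L);
	}
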
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 0dce969d6cad..faca44997099 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -98,6 +98,7 @@ xfs-y += $(addprefix $(XFS_LINUX)/, \
98 kmem.o \ 98 kmem.o \
99 xfs_aops.o \ 99 xfs_aops.o \
100 xfs_buf.o \ 100 xfs_buf.o \
101 xfs_discard.o \
101 xfs_export.o \ 102 xfs_export.o \
102 xfs_file.o \ 103 xfs_file.o \
103 xfs_fs_subr.o \ 104 xfs_fs_subr.o \
diff --git a/fs/xfs/linux-2.6/sv.h b/fs/xfs/linux-2.6/sv.h
deleted file mode 100644
index 4dfc7c370819..000000000000
--- a/fs/xfs/linux-2.6/sv.h
+++ /dev/null
@@ -1,59 +0,0 @@
1/*
2 * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_SUPPORT_SV_H__
19#define __XFS_SUPPORT_SV_H__
20
21#include <linux/wait.h>
22#include <linux/sched.h>
23#include <linux/spinlock.h>
24
25/*
26 * Synchronisation variables.
27 *
28 * (Parameters "pri", "svf" and "rts" are not implemented)
29 */
30
31typedef struct sv_s {
32 wait_queue_head_t waiters;
33} sv_t;
34
35static inline void _sv_wait(sv_t *sv, spinlock_t *lock)
36{
37 DECLARE_WAITQUEUE(wait, current);
38
39 add_wait_queue_exclusive(&sv->waiters, &wait);
40 __set_current_state(TASK_UNINTERRUPTIBLE);
41 spin_unlock(lock);
42
43 schedule();
44
45 remove_wait_queue(&sv->waiters, &wait);
46}
47
48#define sv_init(sv,flag,name) \
49 init_waitqueue_head(&(sv)->waiters)
50#define sv_destroy(sv) \
51 /*NOTHING*/
52#define sv_wait(sv, pri, lock, s) \
53 _sv_wait(sv, lock)
54#define sv_signal(sv) \
55 wake_up(&(sv)->waiters)
56#define sv_broadcast(sv) \
57 wake_up_all(&(sv)->waiters)
58
59#endif /* __XFS_SUPPORT_SV_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index 691f61223ed6..ec7bbb5645b6 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -38,15 +38,6 @@
38#include <linux/pagevec.h> 38#include <linux/pagevec.h>
39#include <linux/writeback.h> 39#include <linux/writeback.h>
40 40
41/*
42 * Types of I/O for bmap clustering and I/O completion tracking.
43 */
44enum {
45 IO_READ, /* mapping for a read */
46 IO_DELAY, /* mapping covers delalloc region */
47 IO_UNWRITTEN, /* mapping covers allocated but uninitialized data */
48 IO_NEW /* just allocated */
49};
50 41
51/* 42/*
52 * Prime number of hash buckets since address is used as the key. 43 * Prime number of hash buckets since address is used as the key.
@@ -182,9 +173,6 @@ xfs_setfilesize(
182 xfs_inode_t *ip = XFS_I(ioend->io_inode); 173 xfs_inode_t *ip = XFS_I(ioend->io_inode);
183 xfs_fsize_t isize; 174 xfs_fsize_t isize;
184 175
185 ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG);
186 ASSERT(ioend->io_type != IO_READ);
187
188 if (unlikely(ioend->io_error)) 176 if (unlikely(ioend->io_error))
189 return 0; 177 return 0;
190 178
@@ -244,10 +232,8 @@ xfs_end_io(
244 * We might have to update the on-disk file size after extending 232 * We might have to update the on-disk file size after extending
245 * writes. 233 * writes.
246 */ 234 */
247 if (ioend->io_type != IO_READ) { 235 error = xfs_setfilesize(ioend);
248 error = xfs_setfilesize(ioend); 236 ASSERT(!error || error == EAGAIN);
249 ASSERT(!error || error == EAGAIN);
250 }
251 237
252 /* 238 /*
253 * If we didn't complete processing of the ioend, requeue it to the 239 * If we didn't complete processing of the ioend, requeue it to the
@@ -318,14 +304,63 @@ STATIC int
318xfs_map_blocks( 304xfs_map_blocks(
319 struct inode *inode, 305 struct inode *inode,
320 loff_t offset, 306 loff_t offset,
321 ssize_t count,
322 struct xfs_bmbt_irec *imap, 307 struct xfs_bmbt_irec *imap,
323 int flags) 308 int type,
309 int nonblocking)
324{ 310{
325 int nmaps = 1; 311 struct xfs_inode *ip = XFS_I(inode);
326 int new = 0; 312 struct xfs_mount *mp = ip->i_mount;
313 ssize_t count = 1 << inode->i_blkbits;
314 xfs_fileoff_t offset_fsb, end_fsb;
315 int error = 0;
316 int bmapi_flags = XFS_BMAPI_ENTIRE;
317 int nimaps = 1;
318
319 if (XFS_FORCED_SHUTDOWN(mp))
320 return -XFS_ERROR(EIO);
321
322 if (type == IO_UNWRITTEN)
323 bmapi_flags |= XFS_BMAPI_IGSTATE;
324
325 if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
326 if (nonblocking)
327 return -XFS_ERROR(EAGAIN);
328 xfs_ilock(ip, XFS_ILOCK_SHARED);
329 }
327 330
328 return -xfs_iomap(XFS_I(inode), offset, count, flags, imap, &nmaps, &new); 331 ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
332 (ip->i_df.if_flags & XFS_IFEXTENTS));
333 ASSERT(offset <= mp->m_maxioffset);
334
335 if (offset + count > mp->m_maxioffset)
336 count = mp->m_maxioffset - offset;
337 end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count);
338 offset_fsb = XFS_B_TO_FSBT(mp, offset);
339 error = xfs_bmapi(NULL, ip, offset_fsb, end_fsb - offset_fsb,
340 bmapi_flags, NULL, 0, imap, &nimaps, NULL);
341 xfs_iunlock(ip, XFS_ILOCK_SHARED);
342
343 if (error)
344 return -XFS_ERROR(error);
345
346 if (type == IO_DELALLOC &&
347 (!nimaps || isnullstartblock(imap->br_startblock))) {
348 error = xfs_iomap_write_allocate(ip, offset, count, imap);
349 if (!error)
350 trace_xfs_map_blocks_alloc(ip, offset, count, type, imap);
351 return -XFS_ERROR(error);
352 }
353
354#ifdef DEBUG
355 if (type == IO_UNWRITTEN) {
356 ASSERT(nimaps);
357 ASSERT(imap->br_startblock != HOLESTARTBLOCK);
358 ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
359 }
360#endif
361 if (nimaps)
362 trace_xfs_map_blocks_found(ip, offset, count, type, imap);
363 return 0;
329} 364}
330 365
331STATIC int 366STATIC int
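
The rewritten xfs_map_blocks() above tries xfs_ilock_nowait() first and only
falls back to a blocking xfs_ilock() when the caller permits it; otherwise it
returns EAGAIN so non-blocking writeback can defer the page rather than stall.
A userspace analogue of that trylock-then-block pattern, with a pthread mutex
standing in for the inode lock:

#include <errno.h>
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t ilock = PTHREAD_MUTEX_INITIALIZER;

static int map_blocks(int nonblocking)
{
        if (pthread_mutex_trylock(&ilock) != 0) {
                if (nonblocking)
                        return -EAGAIN;         /* caller retries later */
                pthread_mutex_lock(&ilock);     /* blocking fallback */
        }

        /* ... look up the block mapping while holding the lock ... */

        pthread_mutex_unlock(&ilock);
        return 0;
}

int main(void)
{
        printf("blocking lookup: %d\n", map_blocks(0));
        printf("nonblocking lookup: %d\n", map_blocks(1));
        return 0;
}
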
@@ -380,26 +415,18 @@ xfs_submit_ioend_bio(
380 415
381 submit_bio(wbc->sync_mode == WB_SYNC_ALL ? 416 submit_bio(wbc->sync_mode == WB_SYNC_ALL ?
382 WRITE_SYNC_PLUG : WRITE, bio); 417 WRITE_SYNC_PLUG : WRITE, bio);
383 ASSERT(!bio_flagged(bio, BIO_EOPNOTSUPP));
384 bio_put(bio);
385} 418}
386 419
387STATIC struct bio * 420STATIC struct bio *
388xfs_alloc_ioend_bio( 421xfs_alloc_ioend_bio(
389 struct buffer_head *bh) 422 struct buffer_head *bh)
390{ 423{
391 struct bio *bio;
392 int nvecs = bio_get_nr_vecs(bh->b_bdev); 424 int nvecs = bio_get_nr_vecs(bh->b_bdev);
393 425 struct bio *bio = bio_alloc(GFP_NOIO, nvecs);
394 do {
395 bio = bio_alloc(GFP_NOIO, nvecs);
396 nvecs >>= 1;
397 } while (!bio);
398 426
399 ASSERT(bio->bi_private == NULL); 427 ASSERT(bio->bi_private == NULL);
400 bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9); 428 bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
401 bio->bi_bdev = bh->b_bdev; 429 bio->bi_bdev = bh->b_bdev;
402 bio_get(bio);
403 return bio; 430 return bio;
404} 431}
405 432
@@ -470,9 +497,8 @@ xfs_submit_ioend(
470 /* Pass 1 - start writeback */ 497 /* Pass 1 - start writeback */
471 do { 498 do {
472 next = ioend->io_list; 499 next = ioend->io_list;
473 for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) { 500 for (bh = ioend->io_buffer_head; bh; bh = bh->b_private)
474 xfs_start_buffer_writeback(bh); 501 xfs_start_buffer_writeback(bh);
475 }
476 } while ((ioend = next) != NULL); 502 } while ((ioend = next) != NULL);
477 503
478 /* Pass 2 - submit I/O */ 504 /* Pass 2 - submit I/O */
@@ -600,117 +626,13 @@ xfs_map_at_offset(
600 ASSERT(imap->br_startblock != HOLESTARTBLOCK); 626 ASSERT(imap->br_startblock != HOLESTARTBLOCK);
601 ASSERT(imap->br_startblock != DELAYSTARTBLOCK); 627 ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
602 628
603 lock_buffer(bh);
604 xfs_map_buffer(inode, bh, imap, offset); 629 xfs_map_buffer(inode, bh, imap, offset);
605 bh->b_bdev = xfs_find_bdev_for_inode(inode);
606 set_buffer_mapped(bh); 630 set_buffer_mapped(bh);
607 clear_buffer_delay(bh); 631 clear_buffer_delay(bh);
608 clear_buffer_unwritten(bh); 632 clear_buffer_unwritten(bh);
609} 633}
610 634
611/* 635/*
612 * Look for a page at index that is suitable for clustering.
613 */
614STATIC unsigned int
615xfs_probe_page(
616 struct page *page,
617 unsigned int pg_offset)
618{
619 struct buffer_head *bh, *head;
620 int ret = 0;
621
622 if (PageWriteback(page))
623 return 0;
624 if (!PageDirty(page))
625 return 0;
626 if (!page->mapping)
627 return 0;
628 if (!page_has_buffers(page))
629 return 0;
630
631 bh = head = page_buffers(page);
632 do {
633 if (!buffer_uptodate(bh))
634 break;
635 if (!buffer_mapped(bh))
636 break;
637 ret += bh->b_size;
638 if (ret >= pg_offset)
639 break;
640 } while ((bh = bh->b_this_page) != head);
641
642 return ret;
643}
644
645STATIC size_t
646xfs_probe_cluster(
647 struct inode *inode,
648 struct page *startpage,
649 struct buffer_head *bh,
650 struct buffer_head *head)
651{
652 struct pagevec pvec;
653 pgoff_t tindex, tlast, tloff;
654 size_t total = 0;
655 int done = 0, i;
656
657 /* First sum forwards in this page */
658 do {
659 if (!buffer_uptodate(bh) || !buffer_mapped(bh))
660 return total;
661 total += bh->b_size;
662 } while ((bh = bh->b_this_page) != head);
663
664 /* if we reached the end of the page, sum forwards in following pages */
665 tlast = i_size_read(inode) >> PAGE_CACHE_SHIFT;
666 tindex = startpage->index + 1;
667
668 /* Prune this back to avoid pathological behavior */
669 tloff = min(tlast, startpage->index + 64);
670
671 pagevec_init(&pvec, 0);
672 while (!done && tindex <= tloff) {
673 unsigned len = min_t(pgoff_t, PAGEVEC_SIZE, tlast - tindex + 1);
674
675 if (!pagevec_lookup(&pvec, inode->i_mapping, tindex, len))
676 break;
677
678 for (i = 0; i < pagevec_count(&pvec); i++) {
679 struct page *page = pvec.pages[i];
680 size_t pg_offset, pg_len = 0;
681
682 if (tindex == tlast) {
683 pg_offset =
684 i_size_read(inode) & (PAGE_CACHE_SIZE - 1);
685 if (!pg_offset) {
686 done = 1;
687 break;
688 }
689 } else
690 pg_offset = PAGE_CACHE_SIZE;
691
692 if (page->index == tindex && trylock_page(page)) {
693 pg_len = xfs_probe_page(page, pg_offset);
694 unlock_page(page);
695 }
696
697 if (!pg_len) {
698 done = 1;
699 break;
700 }
701
702 total += pg_len;
703 tindex++;
704 }
705
706 pagevec_release(&pvec);
707 cond_resched();
708 }
709
710 return total;
711}
712
713/*
714 * Test if a given page is suitable for writing as part of an unwritten 636 * Test if a given page is suitable for writing as part of an unwritten
715 * or delayed allocate extent. 637 * or delayed allocate extent.
716 */ 638 */
@@ -731,9 +653,9 @@ xfs_is_delayed_page(
731 if (buffer_unwritten(bh)) 653 if (buffer_unwritten(bh))
732 acceptable = (type == IO_UNWRITTEN); 654 acceptable = (type == IO_UNWRITTEN);
733 else if (buffer_delay(bh)) 655 else if (buffer_delay(bh))
734 acceptable = (type == IO_DELAY); 656 acceptable = (type == IO_DELALLOC);
735 else if (buffer_dirty(bh) && buffer_mapped(bh)) 657 else if (buffer_dirty(bh) && buffer_mapped(bh))
736 acceptable = (type == IO_NEW); 658 acceptable = (type == IO_OVERWRITE);
737 else 659 else
738 break; 660 break;
739 } while ((bh = bh->b_this_page) != head); 661 } while ((bh = bh->b_this_page) != head);
@@ -758,8 +680,7 @@ xfs_convert_page(
758 loff_t tindex, 680 loff_t tindex,
759 struct xfs_bmbt_irec *imap, 681 struct xfs_bmbt_irec *imap,
760 xfs_ioend_t **ioendp, 682 xfs_ioend_t **ioendp,
761 struct writeback_control *wbc, 683 struct writeback_control *wbc)
762 int all_bh)
763{ 684{
764 struct buffer_head *bh, *head; 685 struct buffer_head *bh, *head;
765 xfs_off_t end_offset; 686 xfs_off_t end_offset;
@@ -814,37 +735,30 @@ xfs_convert_page(
814 continue; 735 continue;
815 } 736 }
816 737
817 if (buffer_unwritten(bh) || buffer_delay(bh)) { 738 if (buffer_unwritten(bh) || buffer_delay(bh) ||
739 buffer_mapped(bh)) {
818 if (buffer_unwritten(bh)) 740 if (buffer_unwritten(bh))
819 type = IO_UNWRITTEN; 741 type = IO_UNWRITTEN;
742 else if (buffer_delay(bh))
743 type = IO_DELALLOC;
820 else 744 else
821 type = IO_DELAY; 745 type = IO_OVERWRITE;
822 746
823 if (!xfs_imap_valid(inode, imap, offset)) { 747 if (!xfs_imap_valid(inode, imap, offset)) {
824 done = 1; 748 done = 1;
825 continue; 749 continue;
826 } 750 }
827 751
828 ASSERT(imap->br_startblock != HOLESTARTBLOCK); 752 lock_buffer(bh);
829 ASSERT(imap->br_startblock != DELAYSTARTBLOCK); 753 if (type != IO_OVERWRITE)
830 754 xfs_map_at_offset(inode, bh, imap, offset);
831 xfs_map_at_offset(inode, bh, imap, offset);
832 xfs_add_to_ioend(inode, bh, offset, type, 755 xfs_add_to_ioend(inode, bh, offset, type,
833 ioendp, done); 756 ioendp, done);
834 757
835 page_dirty--; 758 page_dirty--;
836 count++; 759 count++;
837 } else { 760 } else {
838 type = IO_NEW; 761 done = 1;
839 if (buffer_mapped(bh) && all_bh) {
840 lock_buffer(bh);
841 xfs_add_to_ioend(inode, bh, offset,
842 type, ioendp, done);
843 count++;
844 page_dirty--;
845 } else {
846 done = 1;
847 }
848 } 762 }
849 } while (offset += len, (bh = bh->b_this_page) != head); 763 } while (offset += len, (bh = bh->b_this_page) != head);
850 764
@@ -876,7 +790,6 @@ xfs_cluster_write(
876 struct xfs_bmbt_irec *imap, 790 struct xfs_bmbt_irec *imap,
877 xfs_ioend_t **ioendp, 791 xfs_ioend_t **ioendp,
878 struct writeback_control *wbc, 792 struct writeback_control *wbc,
879 int all_bh,
880 pgoff_t tlast) 793 pgoff_t tlast)
881{ 794{
882 struct pagevec pvec; 795 struct pagevec pvec;
@@ -891,7 +804,7 @@ xfs_cluster_write(
891 804
892 for (i = 0; i < pagevec_count(&pvec); i++) { 805 for (i = 0; i < pagevec_count(&pvec); i++) {
893 done = xfs_convert_page(inode, pvec.pages[i], tindex++, 806 done = xfs_convert_page(inode, pvec.pages[i], tindex++,
894 imap, ioendp, wbc, all_bh); 807 imap, ioendp, wbc);
895 if (done) 808 if (done)
896 break; 809 break;
897 } 810 }
@@ -935,7 +848,7 @@ xfs_aops_discard_page(
935 struct buffer_head *bh, *head; 848 struct buffer_head *bh, *head;
936 loff_t offset = page_offset(page); 849 loff_t offset = page_offset(page);
937 850
938 if (!xfs_is_delayed_page(page, IO_DELAY)) 851 if (!xfs_is_delayed_page(page, IO_DELALLOC))
939 goto out_invalidate; 852 goto out_invalidate;
940 853
941 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) 854 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
@@ -1002,10 +915,10 @@ xfs_vm_writepage(
1002 unsigned int type; 915 unsigned int type;
1003 __uint64_t end_offset; 916 __uint64_t end_offset;
1004 pgoff_t end_index, last_index; 917 pgoff_t end_index, last_index;
1005 ssize_t size, len; 918 ssize_t len;
1006 int flags, err, imap_valid = 0, uptodate = 1; 919 int err, imap_valid = 0, uptodate = 1;
1007 int count = 0; 920 int count = 0;
1008 int all_bh = 0; 921 int nonblocking = 0;
1009 922
1010 trace_xfs_writepage(inode, page, 0); 923 trace_xfs_writepage(inode, page, 0);
1011 924
@@ -1056,10 +969,14 @@ xfs_vm_writepage(
1056 969
1057 bh = head = page_buffers(page); 970 bh = head = page_buffers(page);
1058 offset = page_offset(page); 971 offset = page_offset(page);
1059 flags = BMAPI_READ; 972 type = IO_OVERWRITE;
1060 type = IO_NEW; 973
974 if (wbc->sync_mode == WB_SYNC_NONE && wbc->nonblocking)
975 nonblocking = 1;
1061 976
1062 do { 977 do {
978 int new_ioend = 0;
979
1063 if (offset >= end_offset) 980 if (offset >= end_offset)
1064 break; 981 break;
1065 if (!buffer_uptodate(bh)) 982 if (!buffer_uptodate(bh))
@@ -1076,90 +993,54 @@ xfs_vm_writepage(
1076 continue; 993 continue;
1077 } 994 }
1078 995
1079 if (imap_valid) 996 if (buffer_unwritten(bh)) {
1080 imap_valid = xfs_imap_valid(inode, &imap, offset); 997 if (type != IO_UNWRITTEN) {
1081
1082 if (buffer_unwritten(bh) || buffer_delay(bh)) {
1083 int new_ioend = 0;
1084
1085 /*
1086 * Make sure we don't use a read-only iomap
1087 */
1088 if (flags == BMAPI_READ)
1089 imap_valid = 0;
1090
1091 if (buffer_unwritten(bh)) {
1092 type = IO_UNWRITTEN; 998 type = IO_UNWRITTEN;
1093 flags = BMAPI_WRITE | BMAPI_IGNSTATE; 999 imap_valid = 0;
1094 } else if (buffer_delay(bh)) {
1095 type = IO_DELAY;
1096 flags = BMAPI_ALLOCATE;
1097
1098 if (wbc->sync_mode == WB_SYNC_NONE)
1099 flags |= BMAPI_TRYLOCK;
1100 }
1101
1102 if (!imap_valid) {
1103 /*
1104 * If we didn't have a valid mapping then we
1105 * need to ensure that we put the new mapping
1106 * in a new ioend structure. This needs to be
1107 * done to ensure that the ioends correctly
1108 * reflect the block mappings at io completion
1109 * for unwritten extent conversion.
1110 */
1111 new_ioend = 1;
1112 err = xfs_map_blocks(inode, offset, len,
1113 &imap, flags);
1114 if (err)
1115 goto error;
1116 imap_valid = xfs_imap_valid(inode, &imap,
1117 offset);
1118 } 1000 }
1119 if (imap_valid) { 1001 } else if (buffer_delay(bh)) {
1120 xfs_map_at_offset(inode, bh, &imap, offset); 1002 if (type != IO_DELALLOC) {
1121 xfs_add_to_ioend(inode, bh, offset, type, 1003 type = IO_DELALLOC;
1122 &ioend, new_ioend); 1004 imap_valid = 0;
1123 count++;
1124 } 1005 }
1125 } else if (buffer_uptodate(bh)) { 1006 } else if (buffer_uptodate(bh)) {
1126 /* 1007 if (type != IO_OVERWRITE) {
1127 * we got here because the buffer is already mapped. 1008 type = IO_OVERWRITE;
1128 * That means it must already have extents allocated 1009 imap_valid = 0;
1129 * underneath it. Map the extent by reading it.
1130 */
1131 if (!imap_valid || flags != BMAPI_READ) {
1132 flags = BMAPI_READ;
1133 size = xfs_probe_cluster(inode, page, bh, head);
1134 err = xfs_map_blocks(inode, offset, size,
1135 &imap, flags);
1136 if (err)
1137 goto error;
1138 imap_valid = xfs_imap_valid(inode, &imap,
1139 offset);
1140 } 1010 }
1011 } else {
1012 if (PageUptodate(page)) {
1013 ASSERT(buffer_mapped(bh));
1014 imap_valid = 0;
1015 }
1016 continue;
1017 }
1141 1018
1019 if (imap_valid)
1020 imap_valid = xfs_imap_valid(inode, &imap, offset);
1021 if (!imap_valid) {
1142 /* 1022 /*
1143 * We set the type to IO_NEW in case we are doing a 1023 * If we didn't have a valid mapping then we need to
1144 * small write at EOF that is extending the file but 1024 * put the new mapping into a separate ioend structure.
1145 * without needing an allocation. We need to update the 1025 * This ensures non-contiguous extents always have
1146 * file size on I/O completion in this case so it is 1026 * separate ioends, which is particularly important
1147 * the same case as having just allocated a new extent 1027 * for unwritten extent conversion at I/O completion
1148 * that we are writing into for the first time. 1028 * time.
1149 */ 1029 */
1150 type = IO_NEW; 1030 new_ioend = 1;
1151 if (trylock_buffer(bh)) { 1031 err = xfs_map_blocks(inode, offset, &imap, type,
1152 if (imap_valid) 1032 nonblocking);
1153 all_bh = 1; 1033 if (err)
1154 xfs_add_to_ioend(inode, bh, offset, type, 1034 goto error;
1155 &ioend, !imap_valid); 1035 imap_valid = xfs_imap_valid(inode, &imap, offset);
1156 count++; 1036 }
1157 } else { 1037 if (imap_valid) {
1158 imap_valid = 0; 1038 lock_buffer(bh);
1159 } 1039 if (type != IO_OVERWRITE)
1160 } else if (PageUptodate(page)) { 1040 xfs_map_at_offset(inode, bh, &imap, offset);
1161 ASSERT(buffer_mapped(bh)); 1041 xfs_add_to_ioend(inode, bh, offset, type, &ioend,
1162 imap_valid = 0; 1042 new_ioend);
1043 count++;
1163 } 1044 }
1164 1045
1165 if (!iohead) 1046 if (!iohead)
@@ -1188,7 +1069,7 @@ xfs_vm_writepage(
1188 end_index = last_index; 1069 end_index = last_index;
1189 1070
1190 xfs_cluster_write(inode, page->index + 1, &imap, &ioend, 1071 xfs_cluster_write(inode, page->index + 1, &imap, &ioend,
1191 wbc, all_bh, end_index); 1072 wbc, end_index);
1192 } 1073 }
1193 1074
1194 if (iohead) 1075 if (iohead)
@@ -1257,13 +1138,19 @@ __xfs_get_blocks(
1257 int create, 1138 int create,
1258 int direct) 1139 int direct)
1259{ 1140{
1260 int flags = create ? BMAPI_WRITE : BMAPI_READ; 1141 struct xfs_inode *ip = XFS_I(inode);
1142 struct xfs_mount *mp = ip->i_mount;
1143 xfs_fileoff_t offset_fsb, end_fsb;
1144 int error = 0;
1145 int lockmode = 0;
1261 struct xfs_bmbt_irec imap; 1146 struct xfs_bmbt_irec imap;
1147 int nimaps = 1;
1262 xfs_off_t offset; 1148 xfs_off_t offset;
1263 ssize_t size; 1149 ssize_t size;
1264 int nimap = 1;
1265 int new = 0; 1150 int new = 0;
1266 int error; 1151
1152 if (XFS_FORCED_SHUTDOWN(mp))
1153 return -XFS_ERROR(EIO);
1267 1154
1268 offset = (xfs_off_t)iblock << inode->i_blkbits; 1155 offset = (xfs_off_t)iblock << inode->i_blkbits;
1269 ASSERT(bh_result->b_size >= (1 << inode->i_blkbits)); 1156 ASSERT(bh_result->b_size >= (1 << inode->i_blkbits));
@@ -1272,15 +1159,45 @@ __xfs_get_blocks(
1272 if (!create && direct && offset >= i_size_read(inode)) 1159 if (!create && direct && offset >= i_size_read(inode))
1273 return 0; 1160 return 0;
1274 1161
1275 if (direct && create) 1162 if (create) {
1276 flags |= BMAPI_DIRECT; 1163 lockmode = XFS_ILOCK_EXCL;
1164 xfs_ilock(ip, lockmode);
1165 } else {
1166 lockmode = xfs_ilock_map_shared(ip);
1167 }
1168
1169 ASSERT(offset <= mp->m_maxioffset);
1170 if (offset + size > mp->m_maxioffset)
1171 size = mp->m_maxioffset - offset;
1172 end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + size);
1173 offset_fsb = XFS_B_TO_FSBT(mp, offset);
1277 1174
1278 error = xfs_iomap(XFS_I(inode), offset, size, flags, &imap, &nimap, 1175 error = xfs_bmapi(NULL, ip, offset_fsb, end_fsb - offset_fsb,
1279 &new); 1176 XFS_BMAPI_ENTIRE, NULL, 0, &imap, &nimaps, NULL);
1280 if (error) 1177 if (error)
1281 return -error; 1178 goto out_unlock;
1282 if (nimap == 0) 1179
1283 return 0; 1180 if (create &&
1181 (!nimaps ||
1182 (imap.br_startblock == HOLESTARTBLOCK ||
1183 imap.br_startblock == DELAYSTARTBLOCK))) {
1184 if (direct) {
1185 error = xfs_iomap_write_direct(ip, offset, size,
1186 &imap, nimaps);
1187 } else {
1188 error = xfs_iomap_write_delay(ip, offset, size, &imap);
1189 }
1190 if (error)
1191 goto out_unlock;
1192
1193 trace_xfs_get_blocks_alloc(ip, offset, size, 0, &imap);
1194 } else if (nimaps) {
1195 trace_xfs_get_blocks_found(ip, offset, size, 0, &imap);
1196 } else {
1197 trace_xfs_get_blocks_notfound(ip, offset, size);
1198 goto out_unlock;
1199 }
1200 xfs_iunlock(ip, lockmode);
1284 1201
1285 if (imap.br_startblock != HOLESTARTBLOCK && 1202 if (imap.br_startblock != HOLESTARTBLOCK &&
1286 imap.br_startblock != DELAYSTARTBLOCK) { 1203 imap.br_startblock != DELAYSTARTBLOCK) {
@@ -1347,6 +1264,10 @@ __xfs_get_blocks(
1347 } 1264 }
1348 1265
1349 return 0; 1266 return 0;
1267
1268out_unlock:
1269 xfs_iunlock(ip, lockmode);
1270 return -error;
1350} 1271}
1351 1272
1352int 1273int
@@ -1434,7 +1355,7 @@ xfs_vm_direct_IO(
1434 ssize_t ret; 1355 ssize_t ret;
1435 1356
1436 if (rw & WRITE) { 1357 if (rw & WRITE) {
1437 iocb->private = xfs_alloc_ioend(inode, IO_NEW); 1358 iocb->private = xfs_alloc_ioend(inode, IO_DIRECT);
1438 1359
1439 ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov, 1360 ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov,
1440 offset, nr_segs, 1361 offset, nr_segs,
diff --git a/fs/xfs/linux-2.6/xfs_aops.h b/fs/xfs/linux-2.6/xfs_aops.h
index c5057fb6237a..71f721e1a71f 100644
--- a/fs/xfs/linux-2.6/xfs_aops.h
+++ b/fs/xfs/linux-2.6/xfs_aops.h
@@ -23,6 +23,22 @@ extern struct workqueue_struct *xfsconvertd_workqueue;
23extern mempool_t *xfs_ioend_pool; 23extern mempool_t *xfs_ioend_pool;
24 24
25/* 25/*
26 * Types of I/O for bmap clustering and I/O completion tracking.
27 */
28enum {
29 IO_DIRECT = 0, /* special case for direct I/O ioends */
30 IO_DELALLOC, /* mapping covers delalloc region */
31 IO_UNWRITTEN, /* mapping covers allocated but uninitialized data */
32 IO_OVERWRITE, /* mapping covers already allocated extent */
33};
34
35#define XFS_IO_TYPES \
36 { 0, "" }, \
37 { IO_DELALLOC, "delalloc" }, \
38 { IO_UNWRITTEN, "unwritten" }, \
39 { IO_OVERWRITE, "overwrite" }
40
41/*
26 * xfs_ioend struct manages large extent writes for XFS. 42 * xfs_ioend struct manages large extent writes for XFS.
27 * It can manage several multi-page bios at once. 43 * It can manage several multi-page bios at once.
28 */ 44 */
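
The XFS_IO_TYPES table pairs each ioend type with a printable name; lists of
this { value, "string" } shape are what the kernel's tracepoint helper
__print_symbolic() consumes, so trace output can show "delalloc" rather than a
raw enum value (that use is an assumption here; the diff only shows the table).
A userspace sketch of the lookup it enables:

#include <stdio.h>

enum { IO_DIRECT = 0, IO_DELALLOC, IO_UNWRITTEN, IO_OVERWRITE };

static const struct {
        int             type;
        const char      *name;
} io_types[] = {
        { IO_DIRECT,    "" },
        { IO_DELALLOC,  "delalloc" },
        { IO_UNWRITTEN, "unwritten" },
        { IO_OVERWRITE, "overwrite" },
};

static const char *io_type_name(int type)
{
        size_t i;

        for (i = 0; i < sizeof(io_types) / sizeof(io_types[0]); i++)
                if (io_types[i].type == type)
                        return io_types[i].name;
        return "?";
}

int main(void)
{
        printf("ioend type: %s\n", io_type_name(IO_UNWRITTEN));
        return 0;
}
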
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 4c5deb6e9e31..ac1c7e8378dd 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -44,12 +44,7 @@
44 44
45static kmem_zone_t *xfs_buf_zone; 45static kmem_zone_t *xfs_buf_zone;
46STATIC int xfsbufd(void *); 46STATIC int xfsbufd(void *);
47STATIC int xfsbufd_wakeup(struct shrinker *, int, gfp_t);
48STATIC void xfs_buf_delwri_queue(xfs_buf_t *, int); 47STATIC void xfs_buf_delwri_queue(xfs_buf_t *, int);
49static struct shrinker xfs_buf_shake = {
50 .shrink = xfsbufd_wakeup,
51 .seeks = DEFAULT_SEEKS,
52};
53 48
54static struct workqueue_struct *xfslogd_workqueue; 49static struct workqueue_struct *xfslogd_workqueue;
55struct workqueue_struct *xfsdatad_workqueue; 50struct workqueue_struct *xfsdatad_workqueue;
@@ -168,8 +163,79 @@ test_page_region(
168} 163}
169 164
170/* 165/*
171 * Internal xfs_buf_t object manipulation 166 * xfs_buf_lru_add - add a buffer to the LRU.
167 *
168 * The LRU takes a new reference to the buffer so that it will only be freed
169 * once the shrinker takes the buffer off the LRU.
172 */ 170 */
171STATIC void
172xfs_buf_lru_add(
173 struct xfs_buf *bp)
174{
175 struct xfs_buftarg *btp = bp->b_target;
176
177 spin_lock(&btp->bt_lru_lock);
178 if (list_empty(&bp->b_lru)) {
179 atomic_inc(&bp->b_hold);
180 list_add_tail(&bp->b_lru, &btp->bt_lru);
181 btp->bt_lru_nr++;
182 }
183 spin_unlock(&btp->bt_lru_lock);
184}
185
186/*
187 * xfs_buf_lru_del - remove a buffer from the LRU
188 *
189 * The unlocked check is safe here because it only occurs when there are no
190 * b_lru_ref counts left on the buffer under the pag->pag_buf_lock. It is
191 * there to optimise the shrinker removing the buffer from the LRU and
192 * calling xfs_buf_free(), i.e. it removes an unnecessary round trip on the
193 * bt_lru_lock.
194 */
195STATIC void
196xfs_buf_lru_del(
197 struct xfs_buf *bp)
198{
199 struct xfs_buftarg *btp = bp->b_target;
200
201 if (list_empty(&bp->b_lru))
202 return;
203
204 spin_lock(&btp->bt_lru_lock);
205 if (!list_empty(&bp->b_lru)) {
206 list_del_init(&bp->b_lru);
207 btp->bt_lru_nr--;
208 }
209 spin_unlock(&btp->bt_lru_lock);
210}
211
212/*
213 * When we mark a buffer stale, we remove the buffer from the LRU and clear the
214 * b_lru_ref count so that the buffer is freed immediately when the buffer
215 * reference count falls to zero. If the buffer is already on the LRU, we need
216 * to remove the reference that the LRU holds on the buffer.
217 *
218 * This prevents build-up of stale buffers on the LRU.
219 */
220void
221xfs_buf_stale(
222 struct xfs_buf *bp)
223{
224 bp->b_flags |= XBF_STALE;
225 atomic_set(&(bp)->b_lru_ref, 0);
226 if (!list_empty(&bp->b_lru)) {
227 struct xfs_buftarg *btp = bp->b_target;
228
229 spin_lock(&btp->bt_lru_lock);
230 if (!list_empty(&bp->b_lru)) {
231 list_del_init(&bp->b_lru);
232 btp->bt_lru_nr--;
233 atomic_dec(&bp->b_hold);
234 }
235 spin_unlock(&btp->bt_lru_lock);
236 }
237 ASSERT(atomic_read(&bp->b_hold) >= 1);
238}
173 239
174STATIC void 240STATIC void
175_xfs_buf_initialize( 241_xfs_buf_initialize(
@@ -186,7 +252,9 @@ _xfs_buf_initialize(
186 252
187 memset(bp, 0, sizeof(xfs_buf_t)); 253 memset(bp, 0, sizeof(xfs_buf_t));
188 atomic_set(&bp->b_hold, 1); 254 atomic_set(&bp->b_hold, 1);
255 atomic_set(&bp->b_lru_ref, 1);
189 init_completion(&bp->b_iowait); 256 init_completion(&bp->b_iowait);
257 INIT_LIST_HEAD(&bp->b_lru);
190 INIT_LIST_HEAD(&bp->b_list); 258 INIT_LIST_HEAD(&bp->b_list);
191 RB_CLEAR_NODE(&bp->b_rbnode); 259 RB_CLEAR_NODE(&bp->b_rbnode);
192 sema_init(&bp->b_sema, 0); /* held, no waiters */ 260 sema_init(&bp->b_sema, 0); /* held, no waiters */
@@ -262,6 +330,8 @@ xfs_buf_free(
262{ 330{
263 trace_xfs_buf_free(bp, _RET_IP_); 331 trace_xfs_buf_free(bp, _RET_IP_);
264 332
333 ASSERT(list_empty(&bp->b_lru));
334
265 if (bp->b_flags & (_XBF_PAGE_CACHE|_XBF_PAGES)) { 335 if (bp->b_flags & (_XBF_PAGE_CACHE|_XBF_PAGES)) {
266 uint i; 336 uint i;
267 337
@@ -337,7 +407,6 @@ _xfs_buf_lookup_pages(
337 __func__, gfp_mask); 407 __func__, gfp_mask);
338 408
339 XFS_STATS_INC(xb_page_retries); 409 XFS_STATS_INC(xb_page_retries);
340 xfsbufd_wakeup(NULL, 0, gfp_mask);
341 congestion_wait(BLK_RW_ASYNC, HZ/50); 410 congestion_wait(BLK_RW_ASYNC, HZ/50);
342 goto retry; 411 goto retry;
343 } 412 }
@@ -827,7 +896,7 @@ xfs_buf_rele(
827 trace_xfs_buf_rele(bp, _RET_IP_); 896 trace_xfs_buf_rele(bp, _RET_IP_);
828 897
829 if (!pag) { 898 if (!pag) {
830 ASSERT(!bp->b_relse); 899 ASSERT(list_empty(&bp->b_lru));
831 ASSERT(RB_EMPTY_NODE(&bp->b_rbnode)); 900 ASSERT(RB_EMPTY_NODE(&bp->b_rbnode));
832 if (atomic_dec_and_test(&bp->b_hold)) 901 if (atomic_dec_and_test(&bp->b_hold))
833 xfs_buf_free(bp); 902 xfs_buf_free(bp);
@@ -835,13 +904,15 @@ xfs_buf_rele(
835 } 904 }
836 905
837 ASSERT(!RB_EMPTY_NODE(&bp->b_rbnode)); 906 ASSERT(!RB_EMPTY_NODE(&bp->b_rbnode));
907
838 ASSERT(atomic_read(&bp->b_hold) > 0); 908 ASSERT(atomic_read(&bp->b_hold) > 0);
839 if (atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock)) { 909 if (atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock)) {
840 if (bp->b_relse) { 910 if (!(bp->b_flags & XBF_STALE) &&
841 atomic_inc(&bp->b_hold); 911 atomic_read(&bp->b_lru_ref)) {
912 xfs_buf_lru_add(bp);
842 spin_unlock(&pag->pag_buf_lock); 913 spin_unlock(&pag->pag_buf_lock);
843 bp->b_relse(bp);
844 } else { 914 } else {
915 xfs_buf_lru_del(bp);
845 ASSERT(!(bp->b_flags & (XBF_DELWRI|_XBF_DELWRI_Q))); 916 ASSERT(!(bp->b_flags & (XBF_DELWRI|_XBF_DELWRI_Q)));
846 rb_erase(&bp->b_rbnode, &pag->pag_buf_tree); 917 rb_erase(&bp->b_rbnode, &pag->pag_buf_tree);
847 spin_unlock(&pag->pag_buf_lock); 918 spin_unlock(&pag->pag_buf_lock);
@@ -1438,51 +1509,84 @@ xfs_buf_iomove(
1438 */ 1509 */
1439 1510
1440/* 1511/*
1441 * Wait for any bufs with callbacks that have been submitted but 1512 * Wait for any bufs with callbacks that have been submitted but have not yet
1442 * have not yet returned... walk the hash list for the target. 1513 * returned. These buffers will have an elevated hold count, so wait on those
1514 * while freeing all the buffers only held by the LRU.
1443 */ 1515 */
1444void 1516void
1445xfs_wait_buftarg( 1517xfs_wait_buftarg(
1446 struct xfs_buftarg *btp) 1518 struct xfs_buftarg *btp)
1447{ 1519{
1448 struct xfs_perag *pag; 1520 struct xfs_buf *bp;
1449 uint i;
1450 1521
1451 for (i = 0; i < btp->bt_mount->m_sb.sb_agcount; i++) { 1522restart:
1452 pag = xfs_perag_get(btp->bt_mount, i); 1523 spin_lock(&btp->bt_lru_lock);
1453 spin_lock(&pag->pag_buf_lock); 1524 while (!list_empty(&btp->bt_lru)) {
1454 while (rb_first(&pag->pag_buf_tree)) { 1525 bp = list_first_entry(&btp->bt_lru, struct xfs_buf, b_lru);
1455 spin_unlock(&pag->pag_buf_lock); 1526 if (atomic_read(&bp->b_hold) > 1) {
1527 spin_unlock(&btp->bt_lru_lock);
1456 delay(100); 1528 delay(100);
1457 spin_lock(&pag->pag_buf_lock); 1529 goto restart;
1458 } 1530 }
1459 spin_unlock(&pag->pag_buf_lock); 1531 /*
1460 xfs_perag_put(pag); 1532 * clear the LRU reference count so the buffer doesn't get
1533 * ignored in xfs_buf_rele().
1534 */
1535 atomic_set(&bp->b_lru_ref, 0);
1536 spin_unlock(&btp->bt_lru_lock);
1537 xfs_buf_rele(bp);
1538 spin_lock(&btp->bt_lru_lock);
1461 } 1539 }
1540 spin_unlock(&btp->bt_lru_lock);
1462} 1541}
1463 1542
1464/* 1543int
1465 * buftarg list for delwrite queue processing 1544xfs_buftarg_shrink(
1466 */ 1545 struct shrinker *shrink,
1467static LIST_HEAD(xfs_buftarg_list); 1546 int nr_to_scan,
1468static DEFINE_SPINLOCK(xfs_buftarg_lock); 1547 gfp_t mask)
1469
1470STATIC void
1471xfs_register_buftarg(
1472 xfs_buftarg_t *btp)
1473{ 1548{
1474 spin_lock(&xfs_buftarg_lock); 1549 struct xfs_buftarg *btp = container_of(shrink,
1475 list_add(&btp->bt_list, &xfs_buftarg_list); 1550 struct xfs_buftarg, bt_shrinker);
1476 spin_unlock(&xfs_buftarg_lock); 1551 struct xfs_buf *bp;
1477} 1552 LIST_HEAD(dispose);
1478 1553
1479STATIC void 1554 if (!nr_to_scan)
1480xfs_unregister_buftarg( 1555 return btp->bt_lru_nr;
1481 xfs_buftarg_t *btp) 1556
1482{ 1557 spin_lock(&btp->bt_lru_lock);
1483 spin_lock(&xfs_buftarg_lock); 1558 while (!list_empty(&btp->bt_lru)) {
1484 list_del(&btp->bt_list); 1559 if (nr_to_scan-- <= 0)
1485 spin_unlock(&xfs_buftarg_lock); 1560 break;
1561
1562 bp = list_first_entry(&btp->bt_lru, struct xfs_buf, b_lru);
1563
1564 /*
1565 * Decrement the b_lru_ref count unless the value is already
1566 * zero. If the value is already zero, we need to reclaim the
1567 * buffer, otherwise it gets another trip through the LRU.
1568 */
1569 if (!atomic_add_unless(&bp->b_lru_ref, -1, 0)) {
1570 list_move_tail(&bp->b_lru, &btp->bt_lru);
1571 continue;
1572 }
1573
1574 /*
1575 * remove the buffer from the LRU now to avoid needing another
1576 * lock round trip inside xfs_buf_rele().
1577 */
1578 list_move(&bp->b_lru, &dispose);
1579 btp->bt_lru_nr--;
1580 }
1581 spin_unlock(&btp->bt_lru_lock);
1582
1583 while (!list_empty(&dispose)) {
1584 bp = list_first_entry(&dispose, struct xfs_buf, b_lru);
1585 list_del_init(&bp->b_lru);
1586 xfs_buf_rele(bp);
1587 }
1588
1589 return btp->bt_lru_nr;
1486} 1590}
1487 1591
1488void 1592void
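
xfs_buftarg_shrink() implements the second-chance policy described in its
comments: each scan wears b_lru_ref down by one via atomic_add_unless(), and a
buffer is only moved to the dispose list once that count reaches zero;
otherwise it is rotated to the LRU tail for another lap. A small sketch of the
wear-down logic (single-threaded, so a plain int replaces the atomic):

#include <stdio.h>

/* Decrement *ref and return 1, unless *ref is already zero (a non-atomic
 * stand-in for atomic_add_unless(ref, -1, 0)). */
static int dec_unless_zero(int *ref)
{
        if (*ref == 0)
                return 0;
        (*ref)--;
        return 1;
}

int main(void)
{
        int lru_ref = 2;        /* e.g. set via xfs_buf_set_ref() */
        int scan;

        for (scan = 1; scan <= 3; scan++) {
                if (dec_unless_zero(&lru_ref))
                        printf("scan %d: rotate to LRU tail\n", scan);
                else
                        printf("scan %d: reclaim buffer\n", scan);
        }
        return 0;
}
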
@@ -1490,17 +1594,14 @@ xfs_free_buftarg(
1490 struct xfs_mount *mp, 1594 struct xfs_mount *mp,
1491 struct xfs_buftarg *btp) 1595 struct xfs_buftarg *btp)
1492{ 1596{
1597 unregister_shrinker(&btp->bt_shrinker);
1598
1493 xfs_flush_buftarg(btp, 1); 1599 xfs_flush_buftarg(btp, 1);
1494 if (mp->m_flags & XFS_MOUNT_BARRIER) 1600 if (mp->m_flags & XFS_MOUNT_BARRIER)
1495 xfs_blkdev_issue_flush(btp); 1601 xfs_blkdev_issue_flush(btp);
1496 iput(btp->bt_mapping->host); 1602 iput(btp->bt_mapping->host);
1497 1603
1498 /* Unregister the buftarg first so that we don't get a
1499 * wakeup finding a non-existent task
1500 */
1501 xfs_unregister_buftarg(btp);
1502 kthread_stop(btp->bt_task); 1604 kthread_stop(btp->bt_task);
1503
1504 kmem_free(btp); 1605 kmem_free(btp);
1505} 1606}
1506 1607
@@ -1597,20 +1698,13 @@ xfs_alloc_delwrite_queue(
1597 xfs_buftarg_t *btp, 1698 xfs_buftarg_t *btp,
1598 const char *fsname) 1699 const char *fsname)
1599{ 1700{
1600 int error = 0;
1601
1602 INIT_LIST_HEAD(&btp->bt_list);
1603 INIT_LIST_HEAD(&btp->bt_delwrite_queue); 1701 INIT_LIST_HEAD(&btp->bt_delwrite_queue);
1604 spin_lock_init(&btp->bt_delwrite_lock); 1702 spin_lock_init(&btp->bt_delwrite_lock);
1605 btp->bt_flags = 0; 1703 btp->bt_flags = 0;
1606 btp->bt_task = kthread_run(xfsbufd, btp, "xfsbufd/%s", fsname); 1704 btp->bt_task = kthread_run(xfsbufd, btp, "xfsbufd/%s", fsname);
1607 if (IS_ERR(btp->bt_task)) { 1705 if (IS_ERR(btp->bt_task))
1608 error = PTR_ERR(btp->bt_task); 1706 return PTR_ERR(btp->bt_task);
1609 goto out_error; 1707 return 0;
1610 }
1611 xfs_register_buftarg(btp);
1612out_error:
1613 return error;
1614} 1708}
1615 1709
1616xfs_buftarg_t * 1710xfs_buftarg_t *
@@ -1627,12 +1721,17 @@ xfs_alloc_buftarg(
1627 btp->bt_mount = mp; 1721 btp->bt_mount = mp;
1628 btp->bt_dev = bdev->bd_dev; 1722 btp->bt_dev = bdev->bd_dev;
1629 btp->bt_bdev = bdev; 1723 btp->bt_bdev = bdev;
1724 INIT_LIST_HEAD(&btp->bt_lru);
1725 spin_lock_init(&btp->bt_lru_lock);
1630 if (xfs_setsize_buftarg_early(btp, bdev)) 1726 if (xfs_setsize_buftarg_early(btp, bdev))
1631 goto error; 1727 goto error;
1632 if (xfs_mapping_buftarg(btp, bdev)) 1728 if (xfs_mapping_buftarg(btp, bdev))
1633 goto error; 1729 goto error;
1634 if (xfs_alloc_delwrite_queue(btp, fsname)) 1730 if (xfs_alloc_delwrite_queue(btp, fsname))
1635 goto error; 1731 goto error;
1732 btp->bt_shrinker.shrink = xfs_buftarg_shrink;
1733 btp->bt_shrinker.seeks = DEFAULT_SEEKS;
1734 register_shrinker(&btp->bt_shrinker);
1636 return btp; 1735 return btp;
1637 1736
1638error: 1737error:
@@ -1737,27 +1836,6 @@ xfs_buf_runall_queues(
1737 flush_workqueue(queue); 1836 flush_workqueue(queue);
1738} 1837}
1739 1838
1740STATIC int
1741xfsbufd_wakeup(
1742 struct shrinker *shrink,
1743 int priority,
1744 gfp_t mask)
1745{
1746 xfs_buftarg_t *btp;
1747
1748 spin_lock(&xfs_buftarg_lock);
1749 list_for_each_entry(btp, &xfs_buftarg_list, bt_list) {
1750 if (test_bit(XBT_FORCE_SLEEP, &btp->bt_flags))
1751 continue;
1752 if (list_empty(&btp->bt_delwrite_queue))
1753 continue;
1754 set_bit(XBT_FORCE_FLUSH, &btp->bt_flags);
1755 wake_up_process(btp->bt_task);
1756 }
1757 spin_unlock(&xfs_buftarg_lock);
1758 return 0;
1759}
1760
1761/* 1839/*
1762 * Move as many buffers as specified to the supplied list 1840 * Move as many buffers as specified to the supplied list
1763 * indicating if we skipped any buffers to prevent deadlocks. 1841 * indicating if we skipped any buffers to prevent deadlocks.
@@ -1952,7 +2030,6 @@ xfs_buf_init(void)
1952 if (!xfsconvertd_workqueue) 2030 if (!xfsconvertd_workqueue)
1953 goto out_destroy_xfsdatad_workqueue; 2031 goto out_destroy_xfsdatad_workqueue;
1954 2032
1955 register_shrinker(&xfs_buf_shake);
1956 return 0; 2033 return 0;
1957 2034
1958 out_destroy_xfsdatad_workqueue: 2035 out_destroy_xfsdatad_workqueue:
@@ -1968,7 +2045,6 @@ xfs_buf_init(void)
1968void 2045void
1969xfs_buf_terminate(void) 2046xfs_buf_terminate(void)
1970{ 2047{
1971 unregister_shrinker(&xfs_buf_shake);
1972 destroy_workqueue(xfsconvertd_workqueue); 2048 destroy_workqueue(xfsconvertd_workqueue);
1973 destroy_workqueue(xfsdatad_workqueue); 2049 destroy_workqueue(xfsdatad_workqueue);
1974 destroy_workqueue(xfslogd_workqueue); 2050 destroy_workqueue(xfslogd_workqueue);
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index 383a3f37cf98..cbe65950e524 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -128,10 +128,15 @@ typedef struct xfs_buftarg {
128 128
129 /* per device delwri queue */ 129 /* per device delwri queue */
130 struct task_struct *bt_task; 130 struct task_struct *bt_task;
131 struct list_head bt_list;
132 struct list_head bt_delwrite_queue; 131 struct list_head bt_delwrite_queue;
133 spinlock_t bt_delwrite_lock; 132 spinlock_t bt_delwrite_lock;
134 unsigned long bt_flags; 133 unsigned long bt_flags;
134
135 /* LRU control structures */
136 struct shrinker bt_shrinker;
137 struct list_head bt_lru;
138 spinlock_t bt_lru_lock;
139 unsigned int bt_lru_nr;
135} xfs_buftarg_t; 140} xfs_buftarg_t;
136 141
137/* 142/*
@@ -147,8 +152,6 @@ typedef struct xfs_buftarg {
147 152
148struct xfs_buf; 153struct xfs_buf;
149typedef void (*xfs_buf_iodone_t)(struct xfs_buf *); 154typedef void (*xfs_buf_iodone_t)(struct xfs_buf *);
150typedef void (*xfs_buf_relse_t)(struct xfs_buf *);
151typedef int (*xfs_buf_bdstrat_t)(struct xfs_buf *);
152 155
153#define XB_PAGES 2 156#define XB_PAGES 2
154 157
@@ -164,9 +167,11 @@ typedef struct xfs_buf {
164 xfs_off_t b_file_offset; /* offset in file */ 167 xfs_off_t b_file_offset; /* offset in file */
165 size_t b_buffer_length;/* size of buffer in bytes */ 168 size_t b_buffer_length;/* size of buffer in bytes */
166 atomic_t b_hold; /* reference count */ 169 atomic_t b_hold; /* reference count */
170 atomic_t b_lru_ref; /* lru reclaim ref count */
167 xfs_buf_flags_t b_flags; /* status flags */ 171 xfs_buf_flags_t b_flags; /* status flags */
168 struct semaphore b_sema; /* semaphore for lockables */ 172 struct semaphore b_sema; /* semaphore for lockables */
169 173
174 struct list_head b_lru; /* lru list */
170 wait_queue_head_t b_waiters; /* unpin waiters */ 175 wait_queue_head_t b_waiters; /* unpin waiters */
171 struct list_head b_list; 176 struct list_head b_list;
172 struct xfs_perag *b_pag; /* contains rbtree root */ 177 struct xfs_perag *b_pag; /* contains rbtree root */
@@ -176,7 +181,6 @@ typedef struct xfs_buf {
176 void *b_addr; /* virtual address of buffer */ 181 void *b_addr; /* virtual address of buffer */
177 struct work_struct b_iodone_work; 182 struct work_struct b_iodone_work;
178 xfs_buf_iodone_t b_iodone; /* I/O completion function */ 183 xfs_buf_iodone_t b_iodone; /* I/O completion function */
179 xfs_buf_relse_t b_relse; /* releasing function */
180 struct completion b_iowait; /* queue for I/O waiters */ 184 struct completion b_iowait; /* queue for I/O waiters */
181 void *b_fspriv; 185 void *b_fspriv;
182 void *b_fspriv2; 186 void *b_fspriv2;
@@ -264,7 +268,8 @@ extern void xfs_buf_terminate(void);
264#define XFS_BUF_ZEROFLAGS(bp) ((bp)->b_flags &= \ 268#define XFS_BUF_ZEROFLAGS(bp) ((bp)->b_flags &= \
265 ~(XBF_READ|XBF_WRITE|XBF_ASYNC|XBF_DELWRI|XBF_ORDERED)) 269 ~(XBF_READ|XBF_WRITE|XBF_ASYNC|XBF_DELWRI|XBF_ORDERED))
266 270
267#define XFS_BUF_STALE(bp) ((bp)->b_flags |= XBF_STALE) 271void xfs_buf_stale(struct xfs_buf *bp);
272#define XFS_BUF_STALE(bp) xfs_buf_stale(bp);
268#define XFS_BUF_UNSTALE(bp) ((bp)->b_flags &= ~XBF_STALE) 273#define XFS_BUF_UNSTALE(bp) ((bp)->b_flags &= ~XBF_STALE)
269#define XFS_BUF_ISSTALE(bp) ((bp)->b_flags & XBF_STALE) 274#define XFS_BUF_ISSTALE(bp) ((bp)->b_flags & XBF_STALE)
270#define XFS_BUF_SUPER_STALE(bp) do { \ 275#define XFS_BUF_SUPER_STALE(bp) do { \
@@ -315,7 +320,6 @@ extern void xfs_buf_terminate(void);
315#define XFS_BUF_FSPRIVATE2(bp, type) ((type)(bp)->b_fspriv2) 320#define XFS_BUF_FSPRIVATE2(bp, type) ((type)(bp)->b_fspriv2)
316#define XFS_BUF_SET_FSPRIVATE2(bp, val) ((bp)->b_fspriv2 = (void*)(val)) 321#define XFS_BUF_SET_FSPRIVATE2(bp, val) ((bp)->b_fspriv2 = (void*)(val))
317#define XFS_BUF_SET_START(bp) do { } while (0) 322#define XFS_BUF_SET_START(bp) do { } while (0)
318#define XFS_BUF_SET_BRELSE_FUNC(bp, func) ((bp)->b_relse = (func))
319 323
320#define XFS_BUF_PTR(bp) (xfs_caddr_t)((bp)->b_addr) 324#define XFS_BUF_PTR(bp) (xfs_caddr_t)((bp)->b_addr)
321#define XFS_BUF_SET_PTR(bp, val, cnt) xfs_buf_associate_memory(bp, val, cnt) 325#define XFS_BUF_SET_PTR(bp, val, cnt) xfs_buf_associate_memory(bp, val, cnt)
@@ -328,9 +332,15 @@ extern void xfs_buf_terminate(void);
328#define XFS_BUF_SIZE(bp) ((bp)->b_buffer_length) 332#define XFS_BUF_SIZE(bp) ((bp)->b_buffer_length)
329#define XFS_BUF_SET_SIZE(bp, cnt) ((bp)->b_buffer_length = (cnt)) 333#define XFS_BUF_SET_SIZE(bp, cnt) ((bp)->b_buffer_length = (cnt))
330 334
331#define XFS_BUF_SET_VTYPE_REF(bp, type, ref) do { } while (0) 335static inline void
336xfs_buf_set_ref(
337 struct xfs_buf *bp,
338 int lru_ref)
339{
340 atomic_set(&bp->b_lru_ref, lru_ref);
341}
342#define XFS_BUF_SET_VTYPE_REF(bp, type, ref) xfs_buf_set_ref(bp, ref)
332#define XFS_BUF_SET_VTYPE(bp, type) do { } while (0) 343#define XFS_BUF_SET_VTYPE(bp, type) do { } while (0)
333#define XFS_BUF_SET_REF(bp, ref) do { } while (0)
334 344
335#define XFS_BUF_ISPINNED(bp) atomic_read(&((bp)->b_pin_count)) 345#define XFS_BUF_ISPINNED(bp) atomic_read(&((bp)->b_pin_count))
336 346
@@ -346,8 +356,7 @@ extern void xfs_buf_terminate(void);
346 356
347static inline void xfs_buf_relse(xfs_buf_t *bp) 357static inline void xfs_buf_relse(xfs_buf_t *bp)
348{ 358{
349 if (!bp->b_relse) 359 xfs_buf_unlock(bp);
350 xfs_buf_unlock(bp);
351 xfs_buf_rele(bp); 360 xfs_buf_rele(bp);
352} 361}
353 362
diff --git a/fs/xfs/linux-2.6/xfs_discard.c b/fs/xfs/linux-2.6/xfs_discard.c
new file mode 100644
index 000000000000..05201ae719e5
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_discard.c
@@ -0,0 +1,191 @@
1/*
2 * Copyright (C) 2010 Red Hat, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#include "xfs.h"
19#include "xfs_sb.h"
20#include "xfs_inum.h"
21#include "xfs_log.h"
22#include "xfs_ag.h"
23#include "xfs_mount.h"
24#include "xfs_quota.h"
25#include "xfs_trans.h"
26#include "xfs_alloc_btree.h"
27#include "xfs_bmap_btree.h"
28#include "xfs_ialloc_btree.h"
29#include "xfs_btree.h"
30#include "xfs_inode.h"
31#include "xfs_alloc.h"
32#include "xfs_error.h"
33#include "xfs_discard.h"
34#include "xfs_trace.h"
35
36STATIC int
37xfs_trim_extents(
38 struct xfs_mount *mp,
39 xfs_agnumber_t agno,
40 xfs_fsblock_t start,
41 xfs_fsblock_t len,
42 xfs_fsblock_t minlen,
43 __uint64_t *blocks_trimmed)
44{
45 struct block_device *bdev = mp->m_ddev_targp->bt_bdev;
46 struct xfs_btree_cur *cur;
47 struct xfs_buf *agbp;
48 struct xfs_perag *pag;
49 int error;
50 int i;
51
52 pag = xfs_perag_get(mp, agno);
53
54 error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp);
55 if (error || !agbp)
56 goto out_put_perag;
57
58 cur = xfs_allocbt_init_cursor(mp, NULL, agbp, agno, XFS_BTNUM_CNT);
59
60 /*
61 * Force out the log. This means any transactions that might have freed
62 * space before we took the AGF buffer lock are now on disk, and the
63 * volatile disk cache is flushed.
64 */
65 xfs_log_force(mp, XFS_LOG_SYNC);
66
67 /*
68 * Look up the longest btree in the AGF and start with it.
69 */
70 error = xfs_alloc_lookup_le(cur, 0,
71 XFS_BUF_TO_AGF(agbp)->agf_longest, &i);
72 if (error)
73 goto out_del_cursor;
74
75 /*
76 * Loop until we are done with all extents that are large
77 * enough to be worth discarding.
78 */
79 while (i) {
80 xfs_agblock_t fbno;
81 xfs_extlen_t flen;
82
83 error = xfs_alloc_get_rec(cur, &fbno, &flen, &i);
84 if (error)
85 goto out_del_cursor;
86 XFS_WANT_CORRUPTED_GOTO(i == 1, out_del_cursor);
87 ASSERT(flen <= XFS_BUF_TO_AGF(agbp)->agf_longest);
88
89 /*
90 * Too small? Give up.
91 */
92 if (flen < minlen) {
93 trace_xfs_discard_toosmall(mp, agno, fbno, flen);
94 goto out_del_cursor;
95 }
96
97 /*
98 * If the extent is entirely outside of the range we are
99 * supposed to discard skip it. Do not bother to trim
100 * down partially overlapping ranges for now.
101 */
102 if (XFS_AGB_TO_FSB(mp, agno, fbno) + flen < start ||
103 XFS_AGB_TO_FSB(mp, agno, fbno) >= start + len) {
104 trace_xfs_discard_exclude(mp, agno, fbno, flen);
105 goto next_extent;
106 }
107
108 /*
109 * If any blocks in the range are still busy, skip the
110 * discard and try again the next time.
111 */
112 if (xfs_alloc_busy_search(mp, agno, fbno, flen)) {
113 trace_xfs_discard_busy(mp, agno, fbno, flen);
114 goto next_extent;
115 }
116
117 trace_xfs_discard_extent(mp, agno, fbno, flen);
118 error = -blkdev_issue_discard(bdev,
119 XFS_AGB_TO_DADDR(mp, agno, fbno),
120 XFS_FSB_TO_BB(mp, flen),
121 GFP_NOFS, 0);
122 if (error)
123 goto out_del_cursor;
124 *blocks_trimmed += flen;
125
126next_extent:
127 error = xfs_btree_decrement(cur, 0, &i);
128 if (error)
129 goto out_del_cursor;
130 }
131
132out_del_cursor:
133 xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
134 xfs_buf_relse(agbp);
135out_put_perag:
136 xfs_perag_put(pag);
137 return error;
138}
139
140int
141xfs_ioc_trim(
142 struct xfs_mount *mp,
143 struct fstrim_range __user *urange)
144{
145 struct request_queue *q = mp->m_ddev_targp->bt_bdev->bd_disk->queue;
146 unsigned int granularity = q->limits.discard_granularity;
147 struct fstrim_range range;
148 xfs_fsblock_t start, len, minlen;
149 xfs_agnumber_t start_agno, end_agno, agno;
150 __uint64_t blocks_trimmed = 0;
151 int error, last_error = 0;
152
153 if (!capable(CAP_SYS_ADMIN))
154 return -XFS_ERROR(EPERM);
155 if (copy_from_user(&range, urange, sizeof(range)))
156 return -XFS_ERROR(EFAULT);
157
158 /*
159 * Truncating down the len isn't actually quite correct, but using
160 * XFS_B_TO_FSB would mean we trivially get overflows for values
161 * of ULLONG_MAX or slightly lower. And ULLONG_MAX is the default
162 * used by the fstrim application. In the end it really doesn't
163 * matter as trimming blocks is an advisory interface.
164 */
165 start = XFS_B_TO_FSBT(mp, range.start);
166 len = XFS_B_TO_FSBT(mp, range.len);
167 minlen = XFS_B_TO_FSB(mp, max_t(u64, granularity, range.minlen));
168
169 start_agno = XFS_FSB_TO_AGNO(mp, start);
170 if (start_agno >= mp->m_sb.sb_agcount)
171 return -XFS_ERROR(EINVAL);
172
173 end_agno = XFS_FSB_TO_AGNO(mp, start + len);
174 if (end_agno >= mp->m_sb.sb_agcount)
175 end_agno = mp->m_sb.sb_agcount - 1;
176
177 for (agno = start_agno; agno <= end_agno; agno++) {
178 error = -xfs_trim_extents(mp, agno, start, len, minlen,
179 &blocks_trimmed);
180 if (error)
181 last_error = error;
182 }
183
184 if (last_error)
185 return last_error;
186
187 range.len = XFS_FSB_TO_B(mp, blocks_trimmed);
188 if (copy_to_user(urange, &range, sizeof(range)))
189 return -XFS_ERROR(EFAULT);
190 return 0;
191}
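
xfs_ioc_trim() is the kernel half of the FITRIM interface: userspace fills a
struct fstrim_range and issues the ioctl on any descriptor within the mounted
filesystem, and len comes back as the number of bytes trimmed. A minimal
caller in the style of fstrim(8) (assumes kernel headers that define FITRIM):

#include <fcntl.h>
#include <limits.h>
#include <linux/fs.h>           /* FITRIM, struct fstrim_range */
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>

int main(int argc, char **argv)
{
        struct fstrim_range range = {
                .start  = 0,
                .len    = ULLONG_MAX,   /* whole filesystem */
                .minlen = 0,            /* raised to the discard granularity */
        };
        int fd;

        if (argc != 2) {
                fprintf(stderr, "usage: %s <mountpoint>\n", argv[0]);
                return 1;
        }
        fd = open(argv[1], O_RDONLY);
        if (fd < 0 || ioctl(fd, FITRIM, &range) != 0) {
                perror("FITRIM");
                return 1;
        }
        printf("trimmed %llu bytes\n", (unsigned long long)range.len);
        close(fd);
        return 0;
}
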
diff --git a/fs/xfs/linux-2.6/xfs_discard.h b/fs/xfs/linux-2.6/xfs_discard.h
new file mode 100644
index 000000000000..e82b6dd3e127
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_discard.h
@@ -0,0 +1,8 @@
1#ifndef XFS_DISCARD_H
2#define XFS_DISCARD_H 1
3
4struct fstrim_range;
5
6extern int xfs_ioc_trim(struct xfs_mount *, struct fstrim_range __user *);
7
8#endif /* XFS_DISCARD_H */
diff --git a/fs/xfs/linux-2.6/xfs_export.c b/fs/xfs/linux-2.6/xfs_export.c
index 3764d74790ec..fc0114da7fdd 100644
--- a/fs/xfs/linux-2.6/xfs_export.c
+++ b/fs/xfs/linux-2.6/xfs_export.c
@@ -70,8 +70,16 @@ xfs_fs_encode_fh(
70 else 70 else
71 fileid_type = FILEID_INO32_GEN_PARENT; 71 fileid_type = FILEID_INO32_GEN_PARENT;
72 72
73 /* filesystem may contain 64bit inode numbers */ 73 /*
74 if (!(XFS_M(inode->i_sb)->m_flags & XFS_MOUNT_SMALL_INUMS)) 74 * If the the filesystem may contain 64bit inode numbers, we need
75 * to use larger file handles that can represent them.
76 *
77 * While inodes that do not fit into 32 bits are only allocated when
78 * the mount options allow it, any large enough filesystem may already
79 * contain them, thus the slightly confusing-looking conditional below.
80 */
81 if (!(XFS_M(inode->i_sb)->m_flags & XFS_MOUNT_SMALL_INUMS) ||
82 (XFS_M(inode->i_sb)->m_flags & XFS_MOUNT_32BITINODES))
75 fileid_type |= XFS_FILEID_TYPE_64FLAG; 83 fileid_type |= XFS_FILEID_TYPE_64FLAG;
76 84
77 /* 85 /*
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index ba8ad422a165..a55c1b46b219 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -37,10 +37,45 @@
37#include "xfs_trace.h" 37#include "xfs_trace.h"
38 38
39#include <linux/dcache.h> 39#include <linux/dcache.h>
40#include <linux/falloc.h>
40 41
41static const struct vm_operations_struct xfs_file_vm_ops; 42static const struct vm_operations_struct xfs_file_vm_ops;
42 43
43/* 44/*
45 * Locking primitives for read and write IO paths to ensure we consistently use
46 * and order the inode->i_mutex, ip->i_lock and ip->i_iolock.
47 */
48static inline void
49xfs_rw_ilock(
50 struct xfs_inode *ip,
51 int type)
52{
53 if (type & XFS_IOLOCK_EXCL)
54 mutex_lock(&VFS_I(ip)->i_mutex);
55 xfs_ilock(ip, type);
56}
57
58static inline void
59xfs_rw_iunlock(
60 struct xfs_inode *ip,
61 int type)
62{
63 xfs_iunlock(ip, type);
64 if (type & XFS_IOLOCK_EXCL)
65 mutex_unlock(&VFS_I(ip)->i_mutex);
66}
67
68static inline void
69xfs_rw_ilock_demote(
70 struct xfs_inode *ip,
71 int type)
72{
73 xfs_ilock_demote(ip, type);
74 if (type & XFS_IOLOCK_EXCL)
75 mutex_unlock(&VFS_I(ip)->i_mutex);
76}
77
78/*
44 * xfs_iozero 79 * xfs_iozero
45 * 80 *
46 * xfs_iozero clears the specified range of buffer supplied, 81 * xfs_iozero clears the specified range of buffer supplied,
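
The xfs_rw_ilock()/xfs_rw_iunlock() helpers above encode one ordering rule:
i_mutex is only taken for exclusive IO locking, always before the XFS inode
lock, and always released after it. A userspace sketch of that discipline
(pthread locks stand in for the kernel primitives; the demote case is omitted
since pthread rwlocks cannot be downgraded):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t i_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_rwlock_t i_iolock = PTHREAD_RWLOCK_INITIALIZER;

enum { IOLOCK_SHARED, IOLOCK_EXCL };

static void rw_ilock(int type)
{
        if (type == IOLOCK_EXCL) {
                pthread_mutex_lock(&i_mutex);           /* outer lock first */
                pthread_rwlock_wrlock(&i_iolock);
        } else {
                pthread_rwlock_rdlock(&i_iolock);
        }
}

static void rw_iunlock(int type)
{
        pthread_rwlock_unlock(&i_iolock);               /* inner lock first */
        if (type == IOLOCK_EXCL)
                pthread_mutex_unlock(&i_mutex);
}

int main(void)
{
        rw_ilock(IOLOCK_EXCL);
        printf("exclusive: flush and zero cached pages here\n");
        rw_iunlock(IOLOCK_EXCL);

        rw_ilock(IOLOCK_SHARED);
        printf("shared: issue the direct read\n");
        rw_iunlock(IOLOCK_SHARED);
        return 0;
}
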
@@ -262,22 +297,21 @@ xfs_file_aio_read(
262 if (XFS_FORCED_SHUTDOWN(mp)) 297 if (XFS_FORCED_SHUTDOWN(mp))
263 return -EIO; 298 return -EIO;
264 299
265 if (unlikely(ioflags & IO_ISDIRECT))
266 mutex_lock(&inode->i_mutex);
267 xfs_ilock(ip, XFS_IOLOCK_SHARED);
268
269 if (unlikely(ioflags & IO_ISDIRECT)) { 300 if (unlikely(ioflags & IO_ISDIRECT)) {
301 xfs_rw_ilock(ip, XFS_IOLOCK_EXCL);
302
270 if (inode->i_mapping->nrpages) { 303 if (inode->i_mapping->nrpages) {
271 ret = -xfs_flushinval_pages(ip, 304 ret = -xfs_flushinval_pages(ip,
272 (iocb->ki_pos & PAGE_CACHE_MASK), 305 (iocb->ki_pos & PAGE_CACHE_MASK),
273 -1, FI_REMAPF_LOCKED); 306 -1, FI_REMAPF_LOCKED);
307 if (ret) {
308 xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL);
309 return ret;
310 }
274 } 311 }
275 mutex_unlock(&inode->i_mutex); 312 xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
276 if (ret) { 313 } else
277 xfs_iunlock(ip, XFS_IOLOCK_SHARED); 314 xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
278 return ret;
279 }
280 }
281 315
282 trace_xfs_file_read(ip, size, iocb->ki_pos, ioflags); 316 trace_xfs_file_read(ip, size, iocb->ki_pos, ioflags);
283 317
@@ -285,7 +319,7 @@ xfs_file_aio_read(
285 if (ret > 0) 319 if (ret > 0)
286 XFS_STATS_ADD(xs_read_bytes, ret); 320 XFS_STATS_ADD(xs_read_bytes, ret);
287 321
288 xfs_iunlock(ip, XFS_IOLOCK_SHARED); 322 xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
289 return ret; 323 return ret;
290} 324}
291 325
@@ -309,7 +343,7 @@ xfs_file_splice_read(
309 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) 343 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
310 return -EIO; 344 return -EIO;
311 345
312 xfs_ilock(ip, XFS_IOLOCK_SHARED); 346 xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
313 347
314 trace_xfs_file_splice_read(ip, count, *ppos, ioflags); 348 trace_xfs_file_splice_read(ip, count, *ppos, ioflags);
315 349
@@ -317,10 +351,61 @@ xfs_file_splice_read(
317 if (ret > 0) 351 if (ret > 0)
318 XFS_STATS_ADD(xs_read_bytes, ret); 352 XFS_STATS_ADD(xs_read_bytes, ret);
319 353
320 xfs_iunlock(ip, XFS_IOLOCK_SHARED); 354 xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
321 return ret; 355 return ret;
322} 356}
323 357
358STATIC void
359xfs_aio_write_isize_update(
360 struct inode *inode,
361 loff_t *ppos,
362 ssize_t bytes_written)
363{
364 struct xfs_inode *ip = XFS_I(inode);
365 xfs_fsize_t isize = i_size_read(inode);
366
367 if (bytes_written > 0)
368 XFS_STATS_ADD(xs_write_bytes, bytes_written);
369
370 if (unlikely(bytes_written < 0 && bytes_written != -EFAULT &&
371 *ppos > isize))
372 *ppos = isize;
373
374 if (*ppos > ip->i_size) {
375 xfs_rw_ilock(ip, XFS_ILOCK_EXCL);
376 if (*ppos > ip->i_size)
377 ip->i_size = *ppos;
378 xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
379 }
380}
381
382/*
383 * If this was a direct or synchronous I/O that failed (such as ENOSPC) then
384 * part of the I/O may have been written to disk before the error occurred. In
385 * this case the on-disk file size may have been adjusted beyond the in-memory
386 * file size and now needs to be truncated back.
387 */
388STATIC void
389xfs_aio_write_newsize_update(
390 struct xfs_inode *ip)
391{
392 if (ip->i_new_size) {
393 xfs_rw_ilock(ip, XFS_ILOCK_EXCL);
394 ip->i_new_size = 0;
395 if (ip->i_d.di_size > ip->i_size)
396 ip->i_d.di_size = ip->i_size;
397 xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
398 }
399}
400
401/*
402 * xfs_file_splice_write() does not use xfs_rw_ilock() because
403 * generic_file_splice_write() takes the i_mutex itself. This, in theory,
404 * could cause lock inversions between the aio_write path and the splice path
405 * if someone is doing concurrent splice(2) based writes and write(2) based
406 * writes to the same inode. The only real way to fix this is to re-implement
407 * the generic code here with correct locking orders.
408 */
324STATIC ssize_t 409STATIC ssize_t
325xfs_file_splice_write( 410xfs_file_splice_write(
326 struct pipe_inode_info *pipe, 411 struct pipe_inode_info *pipe,
@@ -331,7 +416,7 @@ xfs_file_splice_write(
331{ 416{
332 struct inode *inode = outfilp->f_mapping->host; 417 struct inode *inode = outfilp->f_mapping->host;
333 struct xfs_inode *ip = XFS_I(inode); 418 struct xfs_inode *ip = XFS_I(inode);
334 xfs_fsize_t isize, new_size; 419 xfs_fsize_t new_size;
335 int ioflags = 0; 420 int ioflags = 0;
336 ssize_t ret; 421 ssize_t ret;
337 422
@@ -355,27 +440,9 @@ xfs_file_splice_write(
355 trace_xfs_file_splice_write(ip, count, *ppos, ioflags); 440 trace_xfs_file_splice_write(ip, count, *ppos, ioflags);
356 441
357 ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags); 442 ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags);
358 if (ret > 0)
359 XFS_STATS_ADD(xs_write_bytes, ret);
360
361 isize = i_size_read(inode);
362 if (unlikely(ret < 0 && ret != -EFAULT && *ppos > isize))
363 *ppos = isize;
364
365 if (*ppos > ip->i_size) {
366 xfs_ilock(ip, XFS_ILOCK_EXCL);
367 if (*ppos > ip->i_size)
368 ip->i_size = *ppos;
369 xfs_iunlock(ip, XFS_ILOCK_EXCL);
370 }
371 443
372 if (ip->i_new_size) { 444 xfs_aio_write_isize_update(inode, ppos, ret);
373 xfs_ilock(ip, XFS_ILOCK_EXCL); 445 xfs_aio_write_newsize_update(ip);
374 ip->i_new_size = 0;
375 if (ip->i_d.di_size > ip->i_size)
376 ip->i_d.di_size = ip->i_size;
377 xfs_iunlock(ip, XFS_ILOCK_EXCL);
378 }
379 xfs_iunlock(ip, XFS_IOLOCK_EXCL); 446 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
380 return ret; 447 return ret;
381} 448}
@@ -562,247 +629,314 @@ out_lock:
562 return error; 629 return error;
563} 630}
564 631
632/*
633 * Common pre-write limit and setup checks.
634 *
635 * Returns with iolock held according to @iolock.
636 */
565STATIC ssize_t 637STATIC ssize_t
566xfs_file_aio_write( 638xfs_file_aio_write_checks(
567 struct kiocb *iocb, 639 struct file *file,
568 const struct iovec *iovp, 640 loff_t *pos,
569 unsigned long nr_segs, 641 size_t *count,
570 loff_t pos) 642 int *iolock)
571{ 643{
572 struct file *file = iocb->ki_filp; 644 struct inode *inode = file->f_mapping->host;
573 struct address_space *mapping = file->f_mapping;
574 struct inode *inode = mapping->host;
575 struct xfs_inode *ip = XFS_I(inode); 645 struct xfs_inode *ip = XFS_I(inode);
576 struct xfs_mount *mp = ip->i_mount; 646 xfs_fsize_t new_size;
577 ssize_t ret = 0, error = 0; 647 int error = 0;
578 int ioflags = 0;
579 xfs_fsize_t isize, new_size;
580 int iolock;
581 size_t ocount = 0, count;
582 int need_i_mutex;
583 648
584 XFS_STATS_INC(xs_write_calls); 649 error = generic_write_checks(file, pos, count, S_ISBLK(inode->i_mode));
650 if (error) {
651 xfs_rw_iunlock(ip, XFS_ILOCK_EXCL | *iolock);
652 *iolock = 0;
653 return error;
654 }
585 655
586 BUG_ON(iocb->ki_pos != pos); 656 new_size = *pos + *count;
657 if (new_size > ip->i_size)
658 ip->i_new_size = new_size;
587 659
588 if (unlikely(file->f_flags & O_DIRECT)) 660 if (likely(!(file->f_mode & FMODE_NOCMTIME)))
589 ioflags |= IO_ISDIRECT; 661 file_update_time(file);
590 if (file->f_mode & FMODE_NOCMTIME) 662
591 ioflags |= IO_INVIS; 663 /*
664 * If the offset is beyond the size of the file, we need to zero any
665 * blocks that fall between the existing EOF and the start of this
666 * write.
667 */
668 if (*pos > ip->i_size)
669 error = -xfs_zero_eof(ip, *pos, ip->i_size);
592 670
593 error = generic_segment_checks(iovp, &nr_segs, &ocount, VERIFY_READ); 671 xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
594 if (error) 672 if (error)
595 return error; 673 return error;
596 674
597 count = ocount; 675 /*
598 if (count == 0) 676 * If we're writing the file then make sure to clear the setuid and
599 return 0; 677 * setgid bits if the process is not being run by root. This keeps
600 678 * people from modifying setuid and setgid binaries.
601 xfs_wait_for_freeze(mp, SB_FREEZE_WRITE); 679 */
680 return file_remove_suid(file);
602 681
603 if (XFS_FORCED_SHUTDOWN(mp)) 682}
604 return -EIO;
605 683
606relock: 684/*
607 if (ioflags & IO_ISDIRECT) { 685 * xfs_file_dio_aio_write - handle direct IO writes
608 iolock = XFS_IOLOCK_SHARED; 686 *
609 need_i_mutex = 0; 687 * Lock the inode appropriately to prepare for and issue a direct IO write.
 610 } else { 688 * By separating it from the buffered write path we remove all the tricky-to-follow
 611 iolock = XFS_IOLOCK_EXCL; 689 * locking changes and looping.
612 need_i_mutex = 1; 690 *
613 mutex_lock(&inode->i_mutex); 691 * If there are cached pages or we're extending the file, we need IOLOCK_EXCL
692 * until we're sure the bytes at the new EOF have been zeroed and/or the cached
693 * pages are flushed out.
694 *
695 * In most cases the direct IO writes will be done holding IOLOCK_SHARED
696 * allowing them to be done in parallel with reads and other direct IO writes.
697 * However, if the IO is not aligned to filesystem blocks, the direct IO layer
698 * needs to do sub-block zeroing and that requires serialisation against other
699 * direct IOs to the same block. In this case we need to serialise the
700 * submission of the unaligned IOs so that we don't get racing block zeroing in
701 * the dio layer. To avoid the problem with aio, we also need to wait for
702 * outstanding IOs to complete so that unwritten extent conversion is completed
703 * before we try to map the overlapping block. This is currently implemented by
704 * hitting it with a big hammer (i.e. xfs_ioend_wait()).
705 *
 706 * Returns with the iolock held as indicated by @iolock and errors indicated by
707 * negative return values.
708 */
709STATIC ssize_t
710xfs_file_dio_aio_write(
711 struct kiocb *iocb,
712 const struct iovec *iovp,
713 unsigned long nr_segs,
714 loff_t pos,
715 size_t ocount,
716 int *iolock)
717{
718 struct file *file = iocb->ki_filp;
719 struct address_space *mapping = file->f_mapping;
720 struct inode *inode = mapping->host;
721 struct xfs_inode *ip = XFS_I(inode);
722 struct xfs_mount *mp = ip->i_mount;
723 ssize_t ret = 0;
724 size_t count = ocount;
725 int unaligned_io = 0;
726 struct xfs_buftarg *target = XFS_IS_REALTIME_INODE(ip) ?
727 mp->m_rtdev_targp : mp->m_ddev_targp;
728
729 *iolock = 0;
730 if ((pos & target->bt_smask) || (count & target->bt_smask))
731 return -XFS_ERROR(EINVAL);
732
733 if ((pos & mp->m_blockmask) || ((pos + count) & mp->m_blockmask))
734 unaligned_io = 1;
735
736 if (unaligned_io || mapping->nrpages || pos > ip->i_size)
737 *iolock = XFS_IOLOCK_EXCL;
738 else
739 *iolock = XFS_IOLOCK_SHARED;
740 xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock);
741
742 ret = xfs_file_aio_write_checks(file, &pos, &count, iolock);
743 if (ret)
744 return ret;
745
746 if (mapping->nrpages) {
747 WARN_ON(*iolock != XFS_IOLOCK_EXCL);
748 ret = -xfs_flushinval_pages(ip, (pos & PAGE_CACHE_MASK), -1,
749 FI_REMAPF_LOCKED);
750 if (ret)
751 return ret;
614 } 752 }
615 753
616 xfs_ilock(ip, XFS_ILOCK_EXCL|iolock); 754 /*
617 755 * If we are doing unaligned IO, wait for all other IO to drain,
618start: 756 * otherwise demote the lock if we had to flush cached pages
619 error = -generic_write_checks(file, &pos, &count, 757 */
620 S_ISBLK(inode->i_mode)); 758 if (unaligned_io)
621 if (error) { 759 xfs_ioend_wait(ip);
622 xfs_iunlock(ip, XFS_ILOCK_EXCL|iolock); 760 else if (*iolock == XFS_IOLOCK_EXCL) {
623 goto out_unlock_mutex; 761 xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
762 *iolock = XFS_IOLOCK_SHARED;
624 } 763 }
625 764
626 if (ioflags & IO_ISDIRECT) { 765 trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0);
627 xfs_buftarg_t *target = 766 ret = generic_file_direct_write(iocb, iovp,
628 XFS_IS_REALTIME_INODE(ip) ? 767 &nr_segs, pos, &iocb->ki_pos, count, ocount);
629 mp->m_rtdev_targp : mp->m_ddev_targp;
630 768
631 if ((pos & target->bt_smask) || (count & target->bt_smask)) { 769 /* No fallback to buffered IO on errors for XFS. */
632 xfs_iunlock(ip, XFS_ILOCK_EXCL|iolock); 770 ASSERT(ret < 0 || ret == count);
633 return XFS_ERROR(-EINVAL); 771 return ret;
634 } 772}
635 773
636 if (!need_i_mutex && (mapping->nrpages || pos > ip->i_size)) { 774STATIC ssize_t
637 xfs_iunlock(ip, XFS_ILOCK_EXCL|iolock); 775xfs_file_buffered_aio_write(
638 iolock = XFS_IOLOCK_EXCL; 776 struct kiocb *iocb,
639 need_i_mutex = 1; 777 const struct iovec *iovp,
640 mutex_lock(&inode->i_mutex); 778 unsigned long nr_segs,
641 xfs_ilock(ip, XFS_ILOCK_EXCL|iolock); 779 loff_t pos,
642 goto start; 780 size_t ocount,
643 } 781 int *iolock)
644 } 782{
783 struct file *file = iocb->ki_filp;
784 struct address_space *mapping = file->f_mapping;
785 struct inode *inode = mapping->host;
786 struct xfs_inode *ip = XFS_I(inode);
787 ssize_t ret;
788 int enospc = 0;
789 size_t count = ocount;
645 790
646 new_size = pos + count; 791 *iolock = XFS_IOLOCK_EXCL;
647 if (new_size > ip->i_size) 792 xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock);
648 ip->i_new_size = new_size;
649 793
650 if (likely(!(ioflags & IO_INVIS))) 794 ret = xfs_file_aio_write_checks(file, &pos, &count, iolock);
651 file_update_time(file); 795 if (ret)
796 return ret;
652 797
798 /* We can write back this queue in page reclaim */
799 current->backing_dev_info = mapping->backing_dev_info;
800
801write_retry:
802 trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, 0);
803 ret = generic_file_buffered_write(iocb, iovp, nr_segs,
804 pos, &iocb->ki_pos, count, ret);
653 /* 805 /*
 654 * If the offset is beyond the size of the file, we have a couple 806 * if we just got an ENOSPC, flush the inode now that we aren't holding any
655 * of things to do. First, if there is already space allocated 807 * page locks and retry *once*
656 * we need to either create holes or zero the disk or ...
657 *
658 * If there is a page where the previous size lands, we need
659 * to zero it out up to the new size.
660 */ 808 */
661 809 if (ret == -ENOSPC && !enospc) {
662 if (pos > ip->i_size) { 810 ret = -xfs_flush_pages(ip, 0, -1, 0, FI_NONE);
663 error = xfs_zero_eof(ip, pos, ip->i_size); 811 if (ret)
664 if (error) { 812 return ret;
665 xfs_iunlock(ip, XFS_ILOCK_EXCL); 813 enospc = 1;
666 goto out_unlock_internal; 814 goto write_retry;
667 }
668 } 815 }
669 xfs_iunlock(ip, XFS_ILOCK_EXCL); 816 current->backing_dev_info = NULL;
817 return ret;
818}
670 819
671 /* 820STATIC ssize_t
672 * If we're writing the file then make sure to clear the 821xfs_file_aio_write(
673 * setuid and setgid bits if the process is not being run 822 struct kiocb *iocb,
674 * by root. This keeps people from modifying setuid and 823 const struct iovec *iovp,
675 * setgid binaries. 824 unsigned long nr_segs,
676 */ 825 loff_t pos)
677 error = -file_remove_suid(file); 826{
678 if (unlikely(error)) 827 struct file *file = iocb->ki_filp;
679 goto out_unlock_internal; 828 struct address_space *mapping = file->f_mapping;
829 struct inode *inode = mapping->host;
830 struct xfs_inode *ip = XFS_I(inode);
831 ssize_t ret;
832 int iolock;
833 size_t ocount = 0;
680 834
681 /* We can write back this queue in page reclaim */ 835 XFS_STATS_INC(xs_write_calls);
682 current->backing_dev_info = mapping->backing_dev_info;
683 836
684 if ((ioflags & IO_ISDIRECT)) { 837 BUG_ON(iocb->ki_pos != pos);
685 if (mapping->nrpages) {
686 WARN_ON(need_i_mutex == 0);
687 error = xfs_flushinval_pages(ip,
688 (pos & PAGE_CACHE_MASK),
689 -1, FI_REMAPF_LOCKED);
690 if (error)
691 goto out_unlock_internal;
692 }
693 838
694 if (need_i_mutex) { 839 ret = generic_segment_checks(iovp, &nr_segs, &ocount, VERIFY_READ);
695 /* demote the lock now the cached pages are gone */ 840 if (ret)
696 xfs_ilock_demote(ip, XFS_IOLOCK_EXCL); 841 return ret;
697 mutex_unlock(&inode->i_mutex);
698 842
699 iolock = XFS_IOLOCK_SHARED; 843 if (ocount == 0)
700 need_i_mutex = 0; 844 return 0;
701 }
702 845
703 trace_xfs_file_direct_write(ip, count, iocb->ki_pos, ioflags); 846 xfs_wait_for_freeze(ip->i_mount, SB_FREEZE_WRITE);
704 ret = generic_file_direct_write(iocb, iovp,
705 &nr_segs, pos, &iocb->ki_pos, count, ocount);
706 847
707 /* 848 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
708 * direct-io write to a hole: fall through to buffered I/O 849 return -EIO;
709 * for completing the rest of the request.
710 */
711 if (ret >= 0 && ret != count) {
712 XFS_STATS_ADD(xs_write_bytes, ret);
713 850
714 pos += ret; 851 if (unlikely(file->f_flags & O_DIRECT))
715 count -= ret; 852 ret = xfs_file_dio_aio_write(iocb, iovp, nr_segs, pos,
853 ocount, &iolock);
854 else
855 ret = xfs_file_buffered_aio_write(iocb, iovp, nr_segs, pos,
856 ocount, &iolock);
716 857
717 ioflags &= ~IO_ISDIRECT; 858 xfs_aio_write_isize_update(inode, &iocb->ki_pos, ret);
718 xfs_iunlock(ip, iolock);
719 goto relock;
720 }
721 } else {
722 int enospc = 0;
723 ssize_t ret2 = 0;
724 859
725write_retry: 860 if (ret <= 0)
726 trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, ioflags); 861 goto out_unlock;
727 ret2 = generic_file_buffered_write(iocb, iovp, nr_segs,
728 pos, &iocb->ki_pos, count, ret);
729 /*
730 * if we just got an ENOSPC, flush the inode now we
731 * aren't holding any page locks and retry *once*
732 */
733 if (ret2 == -ENOSPC && !enospc) {
734 error = xfs_flush_pages(ip, 0, -1, 0, FI_NONE);
735 if (error)
736 goto out_unlock_internal;
737 enospc = 1;
738 goto write_retry;
739 }
740 ret = ret2;
741 }
742 862
743 current->backing_dev_info = NULL; 863 /* Handle various SYNC-type writes */
864 if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) {
865 loff_t end = pos + ret - 1;
866 int error, error2;
744 867
745 isize = i_size_read(inode); 868 xfs_rw_iunlock(ip, iolock);
746 if (unlikely(ret < 0 && ret != -EFAULT && iocb->ki_pos > isize)) 869 error = filemap_write_and_wait_range(mapping, pos, end);
747 iocb->ki_pos = isize; 870 xfs_rw_ilock(ip, iolock);
748 871
749 if (iocb->ki_pos > ip->i_size) { 872 error2 = -xfs_file_fsync(file,
750 xfs_ilock(ip, XFS_ILOCK_EXCL); 873 (file->f_flags & __O_SYNC) ? 0 : 1);
751 if (iocb->ki_pos > ip->i_size) 874 if (error)
752 ip->i_size = iocb->ki_pos; 875 ret = error;
753 xfs_iunlock(ip, XFS_ILOCK_EXCL); 876 else if (error2)
877 ret = error2;
754 } 878 }
755 879
756 error = -ret; 880out_unlock:
757 if (ret <= 0) 881 xfs_aio_write_newsize_update(ip);
758 goto out_unlock_internal; 882 xfs_rw_iunlock(ip, iolock);
883 return ret;
884}
759 885
760 XFS_STATS_ADD(xs_write_bytes, ret); 886STATIC long
887xfs_file_fallocate(
888 struct file *file,
889 int mode,
890 loff_t offset,
891 loff_t len)
892{
893 struct inode *inode = file->f_path.dentry->d_inode;
894 long error;
895 loff_t new_size = 0;
896 xfs_flock64_t bf;
897 xfs_inode_t *ip = XFS_I(inode);
898 int cmd = XFS_IOC_RESVSP;
761 899
762 /* Handle various SYNC-type writes */ 900 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
763 if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) { 901 return -EOPNOTSUPP;
764 loff_t end = pos + ret - 1;
765 int error2;
766 902
767 xfs_iunlock(ip, iolock); 903 bf.l_whence = 0;
768 if (need_i_mutex) 904 bf.l_start = offset;
769 mutex_unlock(&inode->i_mutex); 905 bf.l_len = len;
770 906
771 error2 = filemap_write_and_wait_range(mapping, pos, end); 907 xfs_ilock(ip, XFS_IOLOCK_EXCL);
772 if (!error)
773 error = error2;
774 if (need_i_mutex)
775 mutex_lock(&inode->i_mutex);
776 xfs_ilock(ip, iolock);
777 908
778 error2 = -xfs_file_fsync(file, 909 if (mode & FALLOC_FL_PUNCH_HOLE)
779 (file->f_flags & __O_SYNC) ? 0 : 1); 910 cmd = XFS_IOC_UNRESVSP;
780 if (!error) 911
781 error = error2; 912 /* check the new inode size is valid before allocating */
913 if (!(mode & FALLOC_FL_KEEP_SIZE) &&
914 offset + len > i_size_read(inode)) {
915 new_size = offset + len;
916 error = inode_newsize_ok(inode, new_size);
917 if (error)
918 goto out_unlock;
782 } 919 }
783 920
784 out_unlock_internal: 921 error = -xfs_change_file_space(ip, cmd, &bf, 0, XFS_ATTR_NOLOCK);
785 if (ip->i_new_size) { 922 if (error)
786 xfs_ilock(ip, XFS_ILOCK_EXCL); 923 goto out_unlock;
787 ip->i_new_size = 0; 924
788 /* 925 /* Change file size if needed */
789 * If this was a direct or synchronous I/O that failed (such 926 if (new_size) {
790 * as ENOSPC) then part of the I/O may have been written to 927 struct iattr iattr;
791 * disk before the error occured. In this case the on-disk 928
792 * file size may have been adjusted beyond the in-memory file 929 iattr.ia_valid = ATTR_SIZE;
793 * size and now needs to be truncated back. 930 iattr.ia_size = new_size;
794 */ 931 error = -xfs_setattr(ip, &iattr, XFS_ATTR_NOLOCK);
795 if (ip->i_d.di_size > ip->i_size)
796 ip->i_d.di_size = ip->i_size;
797 xfs_iunlock(ip, XFS_ILOCK_EXCL);
798 } 932 }
799 xfs_iunlock(ip, iolock); 933
800 out_unlock_mutex: 934out_unlock:
801 if (need_i_mutex) 935 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
802 mutex_unlock(&inode->i_mutex); 936 return error;
803 return -error;
804} 937}
805 938
939
806STATIC int 940STATIC int
807xfs_file_open( 941xfs_file_open(
808 struct inode *inode, 942 struct inode *inode,
@@ -921,6 +1055,7 @@ const struct file_operations xfs_file_operations = {
921 .open = xfs_file_open, 1055 .open = xfs_file_open,
922 .release = xfs_file_release, 1056 .release = xfs_file_release,
923 .fsync = xfs_file_fsync, 1057 .fsync = xfs_file_fsync,
1058 .fallocate = xfs_file_fallocate,
924}; 1059};
925 1060
926const struct file_operations xfs_dir_file_operations = { 1061const struct file_operations xfs_dir_file_operations = {
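
The iolock policy described in the xfs_file_dio_aio_write() comment above boils down to a small decision: exclusive when sub-block alignment, cached pages or a file-extending write demand zeroing or flushing, shared otherwise. The following stand-alone C sketch restates that decision outside the kernel; all names here are hypothetical, and only the inputs mirror the real code (m_blockmask, mapping->nrpages, ip->i_size):

#include <stdbool.h>
#include <stddef.h>

enum iolock_mode { IOLOCK_SHARED, IOLOCK_EXCL };

/*
 * Illustrative only: exclusive iolock if the IO start or end is not
 * block aligned, if there are cached pages to flush, or if the write
 * extends the file; shared otherwise so direct IOs can run in parallel.
 */
static enum iolock_mode
dio_iolock_mode(long long pos, size_t count, unsigned long blockmask,
		unsigned long nrpages, long long isize, bool *unaligned)
{
	*unaligned = (pos & blockmask) || ((pos + count) & blockmask);

	if (*unaligned || nrpages > 0 || pos > isize)
		return IOLOCK_EXCL;
	return IOLOCK_SHARED;
}

As in the kernel code above, a caller that took the exclusive lock only to flush cached pages can demote back to shared afterwards, while an unaligned IO must stay serialised until outstanding IO has drained.
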
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index ad442d9e392e..b06ede1d0bed 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -39,6 +39,7 @@
39#include "xfs_dfrag.h" 39#include "xfs_dfrag.h"
40#include "xfs_fsops.h" 40#include "xfs_fsops.h"
41#include "xfs_vnodeops.h" 41#include "xfs_vnodeops.h"
42#include "xfs_discard.h"
42#include "xfs_quota.h" 43#include "xfs_quota.h"
43#include "xfs_inode_item.h" 44#include "xfs_inode_item.h"
44#include "xfs_export.h" 45#include "xfs_export.h"
@@ -1294,6 +1295,8 @@ xfs_file_ioctl(
1294 trace_xfs_file_ioctl(ip); 1295 trace_xfs_file_ioctl(ip);
1295 1296
1296 switch (cmd) { 1297 switch (cmd) {
1298 case FITRIM:
1299 return xfs_ioc_trim(mp, arg);
1297 case XFS_IOC_ALLOCSP: 1300 case XFS_IOC_ALLOCSP:
1298 case XFS_IOC_FREESP: 1301 case XFS_IOC_FREESP:
1299 case XFS_IOC_RESVSP: 1302 case XFS_IOC_RESVSP:
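
With FITRIM dispatched to xfs_ioc_trim() above, userspace can ask XFS to discard unused space on the underlying device. A hedged, minimal caller (the mount point path is an assumption) looks like this:

#include <fcntl.h>
#include <limits.h>
#include <linux/fs.h>		/* FITRIM, struct fstrim_range */
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>

int main(void)
{
	struct fstrim_range range;
	int fd = open("/mnt", O_RDONLY);	/* hypothetical mount point */

	if (fd < 0) {
		perror("open");
		return 1;
	}
	memset(&range, 0, sizeof(range));
	range.len = ULLONG_MAX;		/* trim the whole filesystem */
	range.minlen = 0;		/* let the fs choose a floor */

	if (ioctl(fd, FITRIM, &range) < 0)
		perror("FITRIM");
	else
		printf("trimmed %llu bytes\n", (unsigned long long)range.len);
	close(fd);
	return 0;
}

On return the kernel updates range.len to the number of bytes it actually discarded, which is what the printf reports.
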
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index 94d5fd6a2973..bd5727852fd6 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -46,7 +46,6 @@
46#include <linux/namei.h> 46#include <linux/namei.h>
47#include <linux/posix_acl.h> 47#include <linux/posix_acl.h>
48#include <linux/security.h> 48#include <linux/security.h>
49#include <linux/falloc.h>
50#include <linux/fiemap.h> 49#include <linux/fiemap.h>
51#include <linux/slab.h> 50#include <linux/slab.h>
52 51
@@ -505,58 +504,6 @@ xfs_vn_setattr(
505 return -xfs_setattr(XFS_I(dentry->d_inode), iattr, 0); 504 return -xfs_setattr(XFS_I(dentry->d_inode), iattr, 0);
506} 505}
507 506
508STATIC long
509xfs_vn_fallocate(
510 struct inode *inode,
511 int mode,
512 loff_t offset,
513 loff_t len)
514{
515 long error;
516 loff_t new_size = 0;
517 xfs_flock64_t bf;
518 xfs_inode_t *ip = XFS_I(inode);
519
520 /* preallocation on directories not yet supported */
521 error = -ENODEV;
522 if (S_ISDIR(inode->i_mode))
523 goto out_error;
524
525 bf.l_whence = 0;
526 bf.l_start = offset;
527 bf.l_len = len;
528
529 xfs_ilock(ip, XFS_IOLOCK_EXCL);
530
531 /* check the new inode size is valid before allocating */
532 if (!(mode & FALLOC_FL_KEEP_SIZE) &&
533 offset + len > i_size_read(inode)) {
534 new_size = offset + len;
535 error = inode_newsize_ok(inode, new_size);
536 if (error)
537 goto out_unlock;
538 }
539
540 error = -xfs_change_file_space(ip, XFS_IOC_RESVSP, &bf,
541 0, XFS_ATTR_NOLOCK);
542 if (error)
543 goto out_unlock;
544
545 /* Change file size if needed */
546 if (new_size) {
547 struct iattr iattr;
548
549 iattr.ia_valid = ATTR_SIZE;
550 iattr.ia_size = new_size;
551 error = -xfs_setattr(ip, &iattr, XFS_ATTR_NOLOCK);
552 }
553
554out_unlock:
555 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
556out_error:
557 return error;
558}
559
560#define XFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR) 507#define XFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR)
561 508
562/* 509/*
@@ -650,7 +597,6 @@ static const struct inode_operations xfs_inode_operations = {
650 .getxattr = generic_getxattr, 597 .getxattr = generic_getxattr,
651 .removexattr = generic_removexattr, 598 .removexattr = generic_removexattr,
652 .listxattr = xfs_vn_listxattr, 599 .listxattr = xfs_vn_listxattr,
653 .fallocate = xfs_vn_fallocate,
654 .fiemap = xfs_vn_fiemap, 600 .fiemap = xfs_vn_fiemap,
655}; 601};
656 602
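
The removal of xfs_vn_fallocate() above completes the move of preallocation to the file_operations-based xfs_file_fallocate() added earlier in this diff, which also accepts FALLOC_FL_PUNCH_HOLE. From userspace nothing but the feature set changes; a hedged example (the file name and sizes are assumptions, and hole punching needs a kernel and filesystem that support it):

#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/falloc.h>	/* FALLOC_FL_KEEP_SIZE, FALLOC_FL_PUNCH_HOLE */
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("testfile", O_RDWR | O_CREAT, 0644);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* reserve 1 MiB of blocks without changing the file size */
	if (fallocate(fd, FALLOC_FL_KEEP_SIZE, 0, 1 << 20) < 0)
		perror("fallocate");
	/* punch a 64 KiB hole; PUNCH_HOLE must be paired with KEEP_SIZE */
	if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
		      0, 64 << 10) < 0)
		perror("punch hole");
	close(fd);
	return 0;
}
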
diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h
index 214ddd71ff79..096494997747 100644
--- a/fs/xfs/linux-2.6/xfs_linux.h
+++ b/fs/xfs/linux-2.6/xfs_linux.h
@@ -37,7 +37,6 @@
37 37
38#include <kmem.h> 38#include <kmem.h>
39#include <mrlock.h> 39#include <mrlock.h>
40#include <sv.h>
41#include <time.h> 40#include <time.h>
42 41
43#include <support/debug.h> 42#include <support/debug.h>
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 064f964d4f3c..9731898083ae 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -606,7 +606,8 @@ xfs_blkdev_get(
606{ 606{
607 int error = 0; 607 int error = 0;
608 608
609 *bdevp = open_bdev_exclusive(name, FMODE_READ|FMODE_WRITE, mp); 609 *bdevp = blkdev_get_by_path(name, FMODE_READ|FMODE_WRITE|FMODE_EXCL,
610 mp);
610 if (IS_ERR(*bdevp)) { 611 if (IS_ERR(*bdevp)) {
611 error = PTR_ERR(*bdevp); 612 error = PTR_ERR(*bdevp);
612 printk("XFS: Invalid device [%s], error=%d\n", name, error); 613 printk("XFS: Invalid device [%s], error=%d\n", name, error);
@@ -620,7 +621,7 @@ xfs_blkdev_put(
620 struct block_device *bdev) 621 struct block_device *bdev)
621{ 622{
622 if (bdev) 623 if (bdev)
623 close_bdev_exclusive(bdev, FMODE_READ|FMODE_WRITE); 624 blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
624} 625}
625 626
626/* 627/*
@@ -834,8 +835,11 @@ xfsaild_wakeup(
834 struct xfs_ail *ailp, 835 struct xfs_ail *ailp,
835 xfs_lsn_t threshold_lsn) 836 xfs_lsn_t threshold_lsn)
836{ 837{
837 ailp->xa_target = threshold_lsn; 838 /* only ever move the target forwards */
838 wake_up_process(ailp->xa_task); 839 if (XFS_LSN_CMP(threshold_lsn, ailp->xa_target) > 0) {
840 ailp->xa_target = threshold_lsn;
841 wake_up_process(ailp->xa_task);
842 }
839} 843}
840 844
841STATIC int 845STATIC int
@@ -847,8 +851,17 @@ xfsaild(
847 long tout = 0; /* milliseconds */ 851 long tout = 0; /* milliseconds */
848 852
849 while (!kthread_should_stop()) { 853 while (!kthread_should_stop()) {
850 schedule_timeout_interruptible(tout ? 854 /*
851 msecs_to_jiffies(tout) : MAX_SCHEDULE_TIMEOUT); 855 * for short sleeps indicating congestion, don't allow us to
856 * get woken early. Otherwise all we do is bang on the AIL lock
857 * without making progress.
858 */
859 if (tout && tout <= 20)
860 __set_current_state(TASK_KILLABLE);
861 else
862 __set_current_state(TASK_INTERRUPTIBLE);
863 schedule_timeout(tout ?
864 msecs_to_jiffies(tout) : MAX_SCHEDULE_TIMEOUT);
852 865
853 /* swsusp */ 866 /* swsusp */
854 try_to_freeze(); 867 try_to_freeze();
@@ -935,7 +948,7 @@ out_reclaim:
935 * Slab object creation initialisation for the XFS inode. 948 * Slab object creation initialisation for the XFS inode.
936 * This covers only the idempotent fields in the XFS inode; 949 * This covers only the idempotent fields in the XFS inode;
937 * all other fields need to be initialised on allocation 950 * all other fields need to be initialised on allocation
938 * from the slab. This avoids the need to repeatedly intialise 951 * from the slab. This avoids the need to repeatedly initialise
 939 * fields in the xfs inode that are left in the initialised state 952 * fields in the xfs inode that are left in the initialised state
940 * when freeing the inode. 953 * when freeing the inode.
941 */ 954 */
@@ -1118,6 +1131,8 @@ xfs_fs_evict_inode(
1118 */ 1131 */
1119 ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock)); 1132 ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock));
1120 mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); 1133 mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
1134 lockdep_set_class_and_name(&ip->i_iolock.mr_lock,
1135 &xfs_iolock_reclaimable, "xfs_iolock_reclaimable");
1121 1136
1122 xfs_inactive(ip); 1137 xfs_inactive(ip);
1123} 1138}
@@ -1399,7 +1414,7 @@ xfs_fs_freeze(
1399 1414
1400 xfs_save_resvblks(mp); 1415 xfs_save_resvblks(mp);
1401 xfs_quiesce_attr(mp); 1416 xfs_quiesce_attr(mp);
1402 return -xfs_fs_log_dummy(mp, SYNC_WAIT); 1417 return -xfs_fs_log_dummy(mp);
1403} 1418}
1404 1419
1405STATIC int 1420STATIC int
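
The xfs_blkdev_get()/xfs_blkdev_put() hunks above adopt the blkdev_get_by_path() interface, where the old open_bdev_exclusive() semantics are expressed by FMODE_EXCL plus a holder cookie. A hedged sketch of the pairing contract (the function names are hypothetical):

#include <linux/blkdev.h>
#include <linux/err.h>

/* FMODE_EXCL makes the claim exclusive, keyed on the holder pointer. */
static int example_open_data_dev(const char *path, void *holder,
				 struct block_device **bdevp)
{
	*bdevp = blkdev_get_by_path(path,
				    FMODE_READ | FMODE_WRITE | FMODE_EXCL,
				    holder);
	if (IS_ERR(*bdevp))
		return PTR_ERR(*bdevp);
	return 0;
}

/* The put must use the same mode bits that the get was made with. */
static void example_close_data_dev(struct block_device *bdev)
{
	if (bdev)
		blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
}
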
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index afb0d7cfad1c..e22f0057d21f 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -53,14 +53,30 @@ xfs_inode_ag_walk_grab(
53{ 53{
54 struct inode *inode = VFS_I(ip); 54 struct inode *inode = VFS_I(ip);
55 55
56 ASSERT(rcu_read_lock_held());
57
58 /*
59 * check for stale RCU freed inode
60 *
61 * If the inode has been reallocated, it doesn't matter if it's not in
62 * the AG we are walking - we are walking for writeback, so if it
63 * passes all the "valid inode" checks and is dirty, then we'll write
64 * it back anyway. If it has been reallocated and still being
65 * initialised, the XFS_INEW check below will catch it.
66 */
67 spin_lock(&ip->i_flags_lock);
68 if (!ip->i_ino)
69 goto out_unlock_noent;
70
71 /* avoid new or reclaimable inodes. Leave for reclaim code to flush */
72 if (__xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM))
73 goto out_unlock_noent;
74 spin_unlock(&ip->i_flags_lock);
75
56 /* nothing to sync during shutdown */ 76 /* nothing to sync during shutdown */
57 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) 77 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
58 return EFSCORRUPTED; 78 return EFSCORRUPTED;
59 79
60 /* avoid new or reclaimable inodes. Leave for reclaim code to flush */
61 if (xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM))
62 return ENOENT;
63
 64 /* If we can't grab the inode, it must be on its way to reclaim. */ 80 /* If we can't grab the inode, it must be on its way to reclaim. */
65 if (!igrab(inode)) 81 if (!igrab(inode))
66 return ENOENT; 82 return ENOENT;
@@ -72,6 +88,10 @@ xfs_inode_ag_walk_grab(
72 88
73 /* inode is valid */ 89 /* inode is valid */
74 return 0; 90 return 0;
91
92out_unlock_noent:
93 spin_unlock(&ip->i_flags_lock);
94 return ENOENT;
75} 95}
76 96
77STATIC int 97STATIC int
@@ -98,12 +118,12 @@ restart:
98 int error = 0; 118 int error = 0;
99 int i; 119 int i;
100 120
101 read_lock(&pag->pag_ici_lock); 121 rcu_read_lock();
102 nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, 122 nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
103 (void **)batch, first_index, 123 (void **)batch, first_index,
104 XFS_LOOKUP_BATCH); 124 XFS_LOOKUP_BATCH);
105 if (!nr_found) { 125 if (!nr_found) {
106 read_unlock(&pag->pag_ici_lock); 126 rcu_read_unlock();
107 break; 127 break;
108 } 128 }
109 129
@@ -118,18 +138,26 @@ restart:
118 batch[i] = NULL; 138 batch[i] = NULL;
119 139
120 /* 140 /*
121 * Update the index for the next lookup. Catch overflows 141 * Update the index for the next lookup. Catch
122 * into the next AG range which can occur if we have inodes 142 * overflows into the next AG range which can occur if
123 * in the last block of the AG and we are currently 143 * we have inodes in the last block of the AG and we
124 * pointing to the last inode. 144 * are currently pointing to the last inode.
145 *
146 * Because we may see inodes that are from the wrong AG
147 * due to RCU freeing and reallocation, only update the
 148 * index if it lies in this AG. It was a race that led
149 * us to see this inode, so another lookup from the
150 * same index will not find it again.
125 */ 151 */
152 if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag->pag_agno)
153 continue;
126 first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1); 154 first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
127 if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) 155 if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
128 done = 1; 156 done = 1;
129 } 157 }
130 158
131 /* unlock now we've grabbed the inodes. */ 159 /* unlock now we've grabbed the inodes. */
132 read_unlock(&pag->pag_ici_lock); 160 rcu_read_unlock();
133 161
134 for (i = 0; i < nr_found; i++) { 162 for (i = 0; i < nr_found; i++) {
135 if (!batch[i]) 163 if (!batch[i])
@@ -334,7 +362,7 @@ xfs_quiesce_data(
334 362
335 /* mark the log as covered if needed */ 363 /* mark the log as covered if needed */
336 if (xfs_log_need_covered(mp)) 364 if (xfs_log_need_covered(mp))
337 error2 = xfs_fs_log_dummy(mp, SYNC_WAIT); 365 error2 = xfs_fs_log_dummy(mp);
338 366
339 /* flush data-only devices */ 367 /* flush data-only devices */
340 if (mp->m_rtdev_targp) 368 if (mp->m_rtdev_targp)
@@ -475,13 +503,14 @@ xfs_sync_worker(
475 int error; 503 int error;
476 504
477 if (!(mp->m_flags & XFS_MOUNT_RDONLY)) { 505 if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {
478 xfs_log_force(mp, 0);
479 xfs_reclaim_inodes(mp, 0);
480 /* dgc: errors ignored here */ 506 /* dgc: errors ignored here */
481 error = xfs_qm_sync(mp, SYNC_TRYLOCK);
482 if (mp->m_super->s_frozen == SB_UNFROZEN && 507 if (mp->m_super->s_frozen == SB_UNFROZEN &&
483 xfs_log_need_covered(mp)) 508 xfs_log_need_covered(mp))
484 error = xfs_fs_log_dummy(mp, 0); 509 error = xfs_fs_log_dummy(mp);
510 else
511 xfs_log_force(mp, 0);
512 xfs_reclaim_inodes(mp, 0);
513 error = xfs_qm_sync(mp, SYNC_TRYLOCK);
485 } 514 }
486 mp->m_sync_seq++; 515 mp->m_sync_seq++;
487 wake_up(&mp->m_wait_single_sync_task); 516 wake_up(&mp->m_wait_single_sync_task);
@@ -592,12 +621,12 @@ xfs_inode_set_reclaim_tag(
592 struct xfs_perag *pag; 621 struct xfs_perag *pag;
593 622
594 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); 623 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
595 write_lock(&pag->pag_ici_lock); 624 spin_lock(&pag->pag_ici_lock);
596 spin_lock(&ip->i_flags_lock); 625 spin_lock(&ip->i_flags_lock);
597 __xfs_inode_set_reclaim_tag(pag, ip); 626 __xfs_inode_set_reclaim_tag(pag, ip);
598 __xfs_iflags_set(ip, XFS_IRECLAIMABLE); 627 __xfs_iflags_set(ip, XFS_IRECLAIMABLE);
599 spin_unlock(&ip->i_flags_lock); 628 spin_unlock(&ip->i_flags_lock);
600 write_unlock(&pag->pag_ici_lock); 629 spin_unlock(&pag->pag_ici_lock);
601 xfs_perag_put(pag); 630 xfs_perag_put(pag);
602} 631}
603 632
@@ -639,9 +668,14 @@ xfs_reclaim_inode_grab(
639 struct xfs_inode *ip, 668 struct xfs_inode *ip,
640 int flags) 669 int flags)
641{ 670{
671 ASSERT(rcu_read_lock_held());
672
673 /* quick check for stale RCU freed inode */
674 if (!ip->i_ino)
675 return 1;
642 676
643 /* 677 /*
644 * do some unlocked checks first to avoid unnecceary lock traffic. 678 * do some unlocked checks first to avoid unnecessary lock traffic.
 645 * The first is a flush lock check, the second is an already-in-reclaim 679 * The first is a flush lock check, the second is an already-in-reclaim
646 * check. Only do these checks if we are not going to block on locks. 680 * check. Only do these checks if we are not going to block on locks.
647 */ 681 */
@@ -654,11 +688,16 @@ xfs_reclaim_inode_grab(
654 * The radix tree lock here protects a thread in xfs_iget from racing 688 * The radix tree lock here protects a thread in xfs_iget from racing
655 * with us starting reclaim on the inode. Once we have the 689 * with us starting reclaim on the inode. Once we have the
656 * XFS_IRECLAIM flag set it will not touch us. 690 * XFS_IRECLAIM flag set it will not touch us.
691 *
692 * Due to RCU lookup, we may find inodes that have been freed and only
693 * have XFS_IRECLAIM set. Indeed, we may see reallocated inodes that
 694 * aren't candidates for reclaim at all, so we must check that
695 * XFS_IRECLAIMABLE is set first before proceeding to reclaim.
657 */ 696 */
658 spin_lock(&ip->i_flags_lock); 697 spin_lock(&ip->i_flags_lock);
659 ASSERT_ALWAYS(__xfs_iflags_test(ip, XFS_IRECLAIMABLE)); 698 if (!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) ||
660 if (__xfs_iflags_test(ip, XFS_IRECLAIM)) { 699 __xfs_iflags_test(ip, XFS_IRECLAIM)) {
661 /* ignore as it is already under reclaim */ 700 /* not a reclaim candidate. */
662 spin_unlock(&ip->i_flags_lock); 701 spin_unlock(&ip->i_flags_lock);
663 return 1; 702 return 1;
664 } 703 }
@@ -795,12 +834,12 @@ reclaim:
795 * added to the tree assert that it's been there before to catch 834 * added to the tree assert that it's been there before to catch
796 * problems with the inode life time early on. 835 * problems with the inode life time early on.
797 */ 836 */
798 write_lock(&pag->pag_ici_lock); 837 spin_lock(&pag->pag_ici_lock);
799 if (!radix_tree_delete(&pag->pag_ici_root, 838 if (!radix_tree_delete(&pag->pag_ici_root,
800 XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino))) 839 XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino)))
801 ASSERT(0); 840 ASSERT(0);
802 __xfs_inode_clear_reclaim(pag, ip); 841 __xfs_inode_clear_reclaim(pag, ip);
803 write_unlock(&pag->pag_ici_lock); 842 spin_unlock(&pag->pag_ici_lock);
804 843
805 /* 844 /*
806 * Here we do an (almost) spurious inode lock in order to coordinate 845 * Here we do an (almost) spurious inode lock in order to coordinate
@@ -864,14 +903,14 @@ restart:
864 struct xfs_inode *batch[XFS_LOOKUP_BATCH]; 903 struct xfs_inode *batch[XFS_LOOKUP_BATCH];
865 int i; 904 int i;
866 905
867 write_lock(&pag->pag_ici_lock); 906 rcu_read_lock();
868 nr_found = radix_tree_gang_lookup_tag( 907 nr_found = radix_tree_gang_lookup_tag(
869 &pag->pag_ici_root, 908 &pag->pag_ici_root,
870 (void **)batch, first_index, 909 (void **)batch, first_index,
871 XFS_LOOKUP_BATCH, 910 XFS_LOOKUP_BATCH,
872 XFS_ICI_RECLAIM_TAG); 911 XFS_ICI_RECLAIM_TAG);
873 if (!nr_found) { 912 if (!nr_found) {
874 write_unlock(&pag->pag_ici_lock); 913 rcu_read_unlock();
875 break; 914 break;
876 } 915 }
877 916
@@ -891,14 +930,24 @@ restart:
891 * occur if we have inodes in the last block of 930 * occur if we have inodes in the last block of
892 * the AG and we are currently pointing to the 931 * the AG and we are currently pointing to the
893 * last inode. 932 * last inode.
933 *
934 * Because we may see inodes that are from the
935 * wrong AG due to RCU freeing and
936 * reallocation, only update the index if it
 937 * lies in this AG. It was a race that led us
938 * to see this inode, so another lookup from
939 * the same index will not find it again.
894 */ 940 */
941 if (XFS_INO_TO_AGNO(mp, ip->i_ino) !=
942 pag->pag_agno)
943 continue;
895 first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1); 944 first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
896 if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) 945 if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
897 done = 1; 946 done = 1;
898 } 947 }
899 948
900 /* unlock now we've grabbed the inodes. */ 949 /* unlock now we've grabbed the inodes. */
901 write_unlock(&pag->pag_ici_lock); 950 rcu_read_unlock();
902 951
903 for (i = 0; i < nr_found; i++) { 952 for (i = 0; i < nr_found; i++) {
904 if (!batch[i]) 953 if (!batch[i])
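
Both walkers in this file now follow the same RCU lookup discipline spelled out in the comments above: take rcu_read_lock() around the gang lookup, revalidate each object under its own lock before using it, advance the cursor from data read inside the RCU section, and re-check that the object still belongs to the range being walked. A generic, hedged sketch of that shape (all names are hypothetical, not XFS symbols):

#include <linux/radix-tree.h>
#include <linux/rcupdate.h>
#include <linux/spinlock.h>
#include <linux/types.h>

#define LOOKUP_BATCH	32

struct obj {
	unsigned long	index;	/* 0 means freed/stale, as with ip->i_ino */
	spinlock_t	lock;
};

/* Revalidate under the object lock; RCU only guarantees the memory. */
static bool obj_grab(struct obj *obj)
{
	bool ok;

	spin_lock(&obj->lock);
	ok = obj->index != 0;	/* a real walker also takes a reference */
	spin_unlock(&obj->lock);
	return ok;
}

static void walk_all(struct radix_tree_root *root)
{
	unsigned long first_index = 0;
	struct obj *batch[LOOKUP_BATCH];
	int nr_found, i;

	do {
		rcu_read_lock();
		nr_found = radix_tree_gang_lookup(root, (void **)batch,
						  first_index, LOOKUP_BATCH);
		if (!nr_found) {
			rcu_read_unlock();
			break;
		}
		for (i = 0; i < nr_found; i++) {
			struct obj *obj = batch[i];

			/*
			 * Advance the cursor while still inside the RCU
			 * section, and only ever forwards: a freed and
			 * reallocated object may carry an index from
			 * outside the range being walked, which is what
			 * the AG checks above guard against.
			 */
			if (obj->index >= first_index)
				first_index = obj->index + 1;
			if (!obj_grab(obj))
				batch[i] = NULL;
		}
		rcu_read_unlock();

		for (i = 0; i < nr_found; i++) {
			if (!batch[i])
				continue;
			/* ... process the grabbed object, then release it ... */
		}
	} while (1);
}
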
diff --git a/fs/xfs/linux-2.6/xfs_sysctl.c b/fs/xfs/linux-2.6/xfs_sysctl.c
index 7bb5092d6ae4..ee3cee097e7e 100644
--- a/fs/xfs/linux-2.6/xfs_sysctl.c
+++ b/fs/xfs/linux-2.6/xfs_sysctl.c
@@ -18,6 +18,7 @@
18#include "xfs.h" 18#include "xfs.h"
19#include <linux/sysctl.h> 19#include <linux/sysctl.h>
20#include <linux/proc_fs.h> 20#include <linux/proc_fs.h>
21#include "xfs_error.h"
21 22
22static struct ctl_table_header *xfs_table_header; 23static struct ctl_table_header *xfs_table_header;
23 24
@@ -51,6 +52,26 @@ xfs_stats_clear_proc_handler(
51 52
52 return ret; 53 return ret;
53} 54}
55
56STATIC int
57xfs_panic_mask_proc_handler(
58 ctl_table *ctl,
59 int write,
60 void __user *buffer,
61 size_t *lenp,
62 loff_t *ppos)
63{
64 int ret, *valp = ctl->data;
65
66 ret = proc_dointvec_minmax(ctl, write, buffer, lenp, ppos);
67 if (!ret && write) {
68 xfs_panic_mask = *valp;
69#ifdef DEBUG
70 xfs_panic_mask |= (XFS_PTAG_SHUTDOWN_CORRUPT | XFS_PTAG_LOGRES);
71#endif
72 }
73 return ret;
74}
54#endif /* CONFIG_PROC_FS */ 75#endif /* CONFIG_PROC_FS */
55 76
56static ctl_table xfs_table[] = { 77static ctl_table xfs_table[] = {
@@ -77,7 +98,7 @@ static ctl_table xfs_table[] = {
77 .data = &xfs_params.panic_mask.val, 98 .data = &xfs_params.panic_mask.val,
78 .maxlen = sizeof(int), 99 .maxlen = sizeof(int),
79 .mode = 0644, 100 .mode = 0644,
80 .proc_handler = proc_dointvec_minmax, 101 .proc_handler = xfs_panic_mask_proc_handler,
81 .extra1 = &xfs_params.panic_mask.min, 102 .extra1 = &xfs_params.panic_mask.min,
82 .extra2 = &xfs_params.panic_mask.max 103 .extra2 = &xfs_params.panic_mask.max
83 }, 104 },
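
The handler change above means a write to the panic_mask sysctl now lands in xfs_panic_mask_proc_handler(), which copies the value into the global xfs_panic_mask (and, on DEBUG kernels, forces the shutdown-corrupt and log-reservation tags on). The knob sits in procfs under the standard XFS sysctl directory; a hedged userspace example:

#include <stdio.h>

int main(void)
{
	/* path follows the fs/xfs sysctl table registration above */
	FILE *f = fopen("/proc/sys/fs/xfs/panic_mask", "w");

	if (!f) {
		perror("fopen");
		return 1;
	}
	fprintf(f, "%d\n", 0);	/* 0 clears all XFS_PTAG_* panic bits */
	fclose(f);
	return 0;
}
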
diff --git a/fs/xfs/linux-2.6/xfs_trace.h b/fs/xfs/linux-2.6/xfs_trace.h
index acef2e98c594..2d0bcb479075 100644
--- a/fs/xfs/linux-2.6/xfs_trace.h
+++ b/fs/xfs/linux-2.6/xfs_trace.h
@@ -766,8 +766,8 @@ DECLARE_EVENT_CLASS(xfs_loggrant_class,
766 __field(int, curr_res) 766 __field(int, curr_res)
767 __field(int, unit_res) 767 __field(int, unit_res)
768 __field(unsigned int, flags) 768 __field(unsigned int, flags)
769 __field(void *, reserve_headq) 769 __field(int, reserveq)
770 __field(void *, write_headq) 770 __field(int, writeq)
771 __field(int, grant_reserve_cycle) 771 __field(int, grant_reserve_cycle)
772 __field(int, grant_reserve_bytes) 772 __field(int, grant_reserve_bytes)
773 __field(int, grant_write_cycle) 773 __field(int, grant_write_cycle)
@@ -784,19 +784,21 @@ DECLARE_EVENT_CLASS(xfs_loggrant_class,
784 __entry->curr_res = tic->t_curr_res; 784 __entry->curr_res = tic->t_curr_res;
785 __entry->unit_res = tic->t_unit_res; 785 __entry->unit_res = tic->t_unit_res;
786 __entry->flags = tic->t_flags; 786 __entry->flags = tic->t_flags;
787 __entry->reserve_headq = log->l_reserve_headq; 787 __entry->reserveq = list_empty(&log->l_reserveq);
788 __entry->write_headq = log->l_write_headq; 788 __entry->writeq = list_empty(&log->l_writeq);
789 __entry->grant_reserve_cycle = log->l_grant_reserve_cycle; 789 xlog_crack_grant_head(&log->l_grant_reserve_head,
790 __entry->grant_reserve_bytes = log->l_grant_reserve_bytes; 790 &__entry->grant_reserve_cycle,
791 __entry->grant_write_cycle = log->l_grant_write_cycle; 791 &__entry->grant_reserve_bytes);
792 __entry->grant_write_bytes = log->l_grant_write_bytes; 792 xlog_crack_grant_head(&log->l_grant_write_head,
793 &__entry->grant_write_cycle,
794 &__entry->grant_write_bytes);
793 __entry->curr_cycle = log->l_curr_cycle; 795 __entry->curr_cycle = log->l_curr_cycle;
794 __entry->curr_block = log->l_curr_block; 796 __entry->curr_block = log->l_curr_block;
795 __entry->tail_lsn = log->l_tail_lsn; 797 __entry->tail_lsn = atomic64_read(&log->l_tail_lsn);
796 ), 798 ),
797 TP_printk("dev %d:%d type %s t_ocnt %u t_cnt %u t_curr_res %u " 799 TP_printk("dev %d:%d type %s t_ocnt %u t_cnt %u t_curr_res %u "
798 "t_unit_res %u t_flags %s reserve_headq 0x%p " 800 "t_unit_res %u t_flags %s reserveq %s "
799 "write_headq 0x%p grant_reserve_cycle %d " 801 "writeq %s grant_reserve_cycle %d "
800 "grant_reserve_bytes %d grant_write_cycle %d " 802 "grant_reserve_bytes %d grant_write_cycle %d "
801 "grant_write_bytes %d curr_cycle %d curr_block %d " 803 "grant_write_bytes %d curr_cycle %d curr_block %d "
802 "tail_cycle %d tail_block %d", 804 "tail_cycle %d tail_block %d",
@@ -807,8 +809,8 @@ DECLARE_EVENT_CLASS(xfs_loggrant_class,
807 __entry->curr_res, 809 __entry->curr_res,
808 __entry->unit_res, 810 __entry->unit_res,
809 __print_flags(__entry->flags, "|", XLOG_TIC_FLAGS), 811 __print_flags(__entry->flags, "|", XLOG_TIC_FLAGS),
810 __entry->reserve_headq, 812 __entry->reserveq ? "empty" : "active",
811 __entry->write_headq, 813 __entry->writeq ? "empty" : "active",
812 __entry->grant_reserve_cycle, 814 __entry->grant_reserve_cycle,
813 __entry->grant_reserve_bytes, 815 __entry->grant_reserve_bytes,
814 __entry->grant_write_cycle, 816 __entry->grant_write_cycle,
@@ -835,6 +837,7 @@ DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep1);
835DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake1); 837DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake1);
836DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep2); 838DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep2);
837DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake2); 839DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake2);
840DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake_up);
838DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_enter); 841DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_enter);
839DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_exit); 842DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_exit);
840DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_error); 843DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_error);
@@ -842,6 +845,7 @@ DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_sleep1);
842DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake1); 845DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake1);
843DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_sleep2); 846DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_sleep2);
844DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake2); 847DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake2);
848DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake_up);
845DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_enter); 849DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_enter);
846DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_exit); 850DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_exit);
847DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_sub); 851DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_sub);
@@ -935,10 +939,10 @@ DEFINE_PAGE_EVENT(xfs_writepage);
935DEFINE_PAGE_EVENT(xfs_releasepage); 939DEFINE_PAGE_EVENT(xfs_releasepage);
936DEFINE_PAGE_EVENT(xfs_invalidatepage); 940DEFINE_PAGE_EVENT(xfs_invalidatepage);
937 941
938DECLARE_EVENT_CLASS(xfs_iomap_class, 942DECLARE_EVENT_CLASS(xfs_imap_class,
939 TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, 943 TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count,
940 int flags, struct xfs_bmbt_irec *irec), 944 int type, struct xfs_bmbt_irec *irec),
941 TP_ARGS(ip, offset, count, flags, irec), 945 TP_ARGS(ip, offset, count, type, irec),
942 TP_STRUCT__entry( 946 TP_STRUCT__entry(
943 __field(dev_t, dev) 947 __field(dev_t, dev)
944 __field(xfs_ino_t, ino) 948 __field(xfs_ino_t, ino)
@@ -946,7 +950,7 @@ DECLARE_EVENT_CLASS(xfs_iomap_class,
946 __field(loff_t, new_size) 950 __field(loff_t, new_size)
947 __field(loff_t, offset) 951 __field(loff_t, offset)
948 __field(size_t, count) 952 __field(size_t, count)
949 __field(int, flags) 953 __field(int, type)
950 __field(xfs_fileoff_t, startoff) 954 __field(xfs_fileoff_t, startoff)
951 __field(xfs_fsblock_t, startblock) 955 __field(xfs_fsblock_t, startblock)
952 __field(xfs_filblks_t, blockcount) 956 __field(xfs_filblks_t, blockcount)
@@ -958,13 +962,13 @@ DECLARE_EVENT_CLASS(xfs_iomap_class,
958 __entry->new_size = ip->i_new_size; 962 __entry->new_size = ip->i_new_size;
959 __entry->offset = offset; 963 __entry->offset = offset;
960 __entry->count = count; 964 __entry->count = count;
961 __entry->flags = flags; 965 __entry->type = type;
962 __entry->startoff = irec ? irec->br_startoff : 0; 966 __entry->startoff = irec ? irec->br_startoff : 0;
963 __entry->startblock = irec ? irec->br_startblock : 0; 967 __entry->startblock = irec ? irec->br_startblock : 0;
964 __entry->blockcount = irec ? irec->br_blockcount : 0; 968 __entry->blockcount = irec ? irec->br_blockcount : 0;
965 ), 969 ),
966 TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx " 970 TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx "
967 "offset 0x%llx count %zd flags %s " 971 "offset 0x%llx count %zd type %s "
968 "startoff 0x%llx startblock %lld blockcount 0x%llx", 972 "startoff 0x%llx startblock %lld blockcount 0x%llx",
969 MAJOR(__entry->dev), MINOR(__entry->dev), 973 MAJOR(__entry->dev), MINOR(__entry->dev),
970 __entry->ino, 974 __entry->ino,
@@ -972,20 +976,21 @@ DECLARE_EVENT_CLASS(xfs_iomap_class,
972 __entry->new_size, 976 __entry->new_size,
973 __entry->offset, 977 __entry->offset,
974 __entry->count, 978 __entry->count,
975 __print_flags(__entry->flags, "|", BMAPI_FLAGS), 979 __print_symbolic(__entry->type, XFS_IO_TYPES),
976 __entry->startoff, 980 __entry->startoff,
977 (__int64_t)__entry->startblock, 981 (__int64_t)__entry->startblock,
978 __entry->blockcount) 982 __entry->blockcount)
979) 983)
980 984
981#define DEFINE_IOMAP_EVENT(name) \ 985#define DEFINE_IOMAP_EVENT(name) \
982DEFINE_EVENT(xfs_iomap_class, name, \ 986DEFINE_EVENT(xfs_imap_class, name, \
983 TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, \ 987 TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, \
984 int flags, struct xfs_bmbt_irec *irec), \ 988 int type, struct xfs_bmbt_irec *irec), \
985 TP_ARGS(ip, offset, count, flags, irec)) 989 TP_ARGS(ip, offset, count, type, irec))
986DEFINE_IOMAP_EVENT(xfs_iomap_enter); 990DEFINE_IOMAP_EVENT(xfs_map_blocks_found);
987DEFINE_IOMAP_EVENT(xfs_iomap_found); 991DEFINE_IOMAP_EVENT(xfs_map_blocks_alloc);
988DEFINE_IOMAP_EVENT(xfs_iomap_alloc); 992DEFINE_IOMAP_EVENT(xfs_get_blocks_found);
993DEFINE_IOMAP_EVENT(xfs_get_blocks_alloc);
989 994
990DECLARE_EVENT_CLASS(xfs_simple_io_class, 995DECLARE_EVENT_CLASS(xfs_simple_io_class,
991 TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count), 996 TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count),
@@ -1022,6 +1027,7 @@ DEFINE_EVENT(xfs_simple_io_class, name, \
1022 TP_ARGS(ip, offset, count)) 1027 TP_ARGS(ip, offset, count))
1023DEFINE_SIMPLE_IO_EVENT(xfs_delalloc_enospc); 1028DEFINE_SIMPLE_IO_EVENT(xfs_delalloc_enospc);
1024DEFINE_SIMPLE_IO_EVENT(xfs_unwritten_convert); 1029DEFINE_SIMPLE_IO_EVENT(xfs_unwritten_convert);
1030DEFINE_SIMPLE_IO_EVENT(xfs_get_blocks_notfound);
1025 1031
1026 1032
1027TRACE_EVENT(xfs_itruncate_start, 1033TRACE_EVENT(xfs_itruncate_start,
@@ -1420,6 +1426,7 @@ DEFINE_EVENT(xfs_alloc_class, name, \
1420 TP_PROTO(struct xfs_alloc_arg *args), \ 1426 TP_PROTO(struct xfs_alloc_arg *args), \
1421 TP_ARGS(args)) 1427 TP_ARGS(args))
1422DEFINE_ALLOC_EVENT(xfs_alloc_exact_done); 1428DEFINE_ALLOC_EVENT(xfs_alloc_exact_done);
1429DEFINE_ALLOC_EVENT(xfs_alloc_exact_notfound);
1423DEFINE_ALLOC_EVENT(xfs_alloc_exact_error); 1430DEFINE_ALLOC_EVENT(xfs_alloc_exact_error);
1424DEFINE_ALLOC_EVENT(xfs_alloc_near_nominleft); 1431DEFINE_ALLOC_EVENT(xfs_alloc_near_nominleft);
1425DEFINE_ALLOC_EVENT(xfs_alloc_near_first); 1432DEFINE_ALLOC_EVENT(xfs_alloc_near_first);
@@ -1752,6 +1759,39 @@ DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_recover);
1752DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_cancel); 1759DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_cancel);
1753DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_skip); 1760DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_skip);
1754 1761
1762DECLARE_EVENT_CLASS(xfs_discard_class,
1763 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
1764 xfs_agblock_t agbno, xfs_extlen_t len),
1765 TP_ARGS(mp, agno, agbno, len),
1766 TP_STRUCT__entry(
1767 __field(dev_t, dev)
1768 __field(xfs_agnumber_t, agno)
1769 __field(xfs_agblock_t, agbno)
1770 __field(xfs_extlen_t, len)
1771 ),
1772 TP_fast_assign(
1773 __entry->dev = mp->m_super->s_dev;
1774 __entry->agno = agno;
1775 __entry->agbno = agbno;
1776 __entry->len = len;
1777 ),
 1778 TP_printk("dev %d:%d agno %u agbno %u len %u",
1779 MAJOR(__entry->dev), MINOR(__entry->dev),
1780 __entry->agno,
1781 __entry->agbno,
1782 __entry->len)
1783)
1784
1785#define DEFINE_DISCARD_EVENT(name) \
1786DEFINE_EVENT(xfs_discard_class, name, \
1787 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
1788 xfs_agblock_t agbno, xfs_extlen_t len), \
1789 TP_ARGS(mp, agno, agbno, len))
1790DEFINE_DISCARD_EVENT(xfs_discard_extent);
1791DEFINE_DISCARD_EVENT(xfs_discard_toosmall);
1792DEFINE_DISCARD_EVENT(xfs_discard_exclude);
1793DEFINE_DISCARD_EVENT(xfs_discard_busy);
1794
1755#endif /* _TRACE_XFS_H */ 1795#endif /* _TRACE_XFS_H */
1756 1796
1757#undef TRACE_INCLUDE_PATH 1797#undef TRACE_INCLUDE_PATH
diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c
index faf8e1a83a12..d22aa3103106 100644
--- a/fs/xfs/quota/xfs_dquot.c
+++ b/fs/xfs/quota/xfs_dquot.c
@@ -149,7 +149,6 @@ xfs_qm_dqdestroy(
149 ASSERT(list_empty(&dqp->q_freelist)); 149 ASSERT(list_empty(&dqp->q_freelist));
150 150
151 mutex_destroy(&dqp->q_qlock); 151 mutex_destroy(&dqp->q_qlock);
152 sv_destroy(&dqp->q_pinwait);
153 kmem_zone_free(xfs_Gqm->qm_dqzone, dqp); 152 kmem_zone_free(xfs_Gqm->qm_dqzone, dqp);
154 153
155 atomic_dec(&xfs_Gqm->qm_totaldquots); 154 atomic_dec(&xfs_Gqm->qm_totaldquots);
diff --git a/fs/xfs/support/debug.c b/fs/xfs/support/debug.c
index 975aa10e1a47..0df88897ef84 100644
--- a/fs/xfs/support/debug.c
+++ b/fs/xfs/support/debug.c
@@ -25,86 +25,78 @@
25#include "xfs_mount.h" 25#include "xfs_mount.h"
26#include "xfs_error.h" 26#include "xfs_error.h"
27 27
28static char message[1024]; /* keep it off the stack */
29static DEFINE_SPINLOCK(xfs_err_lock);
30
31/* Translate from CE_FOO to KERN_FOO, err_level(CE_FOO) == KERN_FOO */
32#define XFS_MAX_ERR_LEVEL 7
33#define XFS_ERR_MASK ((1 << 3) - 1)
34static const char * const err_level[XFS_MAX_ERR_LEVEL+1] =
35 {KERN_EMERG, KERN_ALERT, KERN_CRIT,
36 KERN_ERR, KERN_WARNING, KERN_NOTICE,
37 KERN_INFO, KERN_DEBUG};
38
39void 28void
40cmn_err(register int level, char *fmt, ...) 29cmn_err(
30 const char *lvl,
31 const char *fmt,
32 ...)
41{ 33{
42 char *fp = fmt; 34 struct va_format vaf;
43 int len; 35 va_list args;
44 ulong flags; 36
45 va_list ap; 37 va_start(args, fmt);
46 38 vaf.fmt = fmt;
47 level &= XFS_ERR_MASK; 39 vaf.va = &args;
48 if (level > XFS_MAX_ERR_LEVEL) 40
49 level = XFS_MAX_ERR_LEVEL; 41 printk("%s%pV", lvl, &vaf);
50 spin_lock_irqsave(&xfs_err_lock,flags); 42 va_end(args);
51 va_start(ap, fmt); 43
52 if (*fmt == '!') fp++; 44 BUG_ON(strncmp(lvl, KERN_EMERG, strlen(KERN_EMERG)) == 0);
53 len = vsnprintf(message, sizeof(message), fp, ap);
54 if (len >= sizeof(message))
55 len = sizeof(message) - 1;
56 if (message[len-1] == '\n')
57 message[len-1] = 0;
58 printk("%s%s\n", err_level[level], message);
59 va_end(ap);
60 spin_unlock_irqrestore(&xfs_err_lock,flags);
61 BUG_ON(level == CE_PANIC);
62} 45}
63 46
64void 47void
65xfs_fs_vcmn_err( 48xfs_fs_cmn_err(
66 int level, 49 const char *lvl,
67 struct xfs_mount *mp, 50 struct xfs_mount *mp,
68 char *fmt, 51 const char *fmt,
69 va_list ap) 52 ...)
70{ 53{
71 unsigned long flags; 54 struct va_format vaf;
72 int len = 0; 55 va_list args;
73 56
74 level &= XFS_ERR_MASK; 57 va_start(args, fmt);
75 if (level > XFS_MAX_ERR_LEVEL) 58 vaf.fmt = fmt;
76 level = XFS_MAX_ERR_LEVEL; 59 vaf.va = &args;
77 60
78 spin_lock_irqsave(&xfs_err_lock,flags); 61 printk("%sFilesystem %s: %pV", lvl, mp->m_fsname, &vaf);
62 va_end(args);
79 63
80 if (mp) { 64 BUG_ON(strncmp(lvl, KERN_EMERG, strlen(KERN_EMERG)) == 0);
81 len = sprintf(message, "Filesystem \"%s\": ", mp->m_fsname); 65}
66
67/* All callers to xfs_cmn_err use CE_ALERT, so don't bother testing lvl */
68void
69xfs_cmn_err(
70 int panic_tag,
71 const char *lvl,
72 struct xfs_mount *mp,
73 const char *fmt,
74 ...)
75{
76 struct va_format vaf;
77 va_list args;
78 int do_panic = 0;
82 79
83 /* 80 if (xfs_panic_mask && (xfs_panic_mask & panic_tag)) {
84 * Skip the printk if we can't print anything useful 81 printk(KERN_ALERT "XFS: Transforming an alert into a BUG.");
85 * due to an over-long device name. 82 do_panic = 1;
86 */
87 if (len >= sizeof(message))
88 goto out;
89 } 83 }
90 84
91 len = vsnprintf(message + len, sizeof(message) - len, fmt, ap); 85 va_start(args, fmt);
92 if (len >= sizeof(message)) 86 vaf.fmt = fmt;
93 len = sizeof(message) - 1; 87 vaf.va = &args;
94 if (message[len-1] == '\n')
95 message[len-1] = 0;
96 88
97 printk("%s%s\n", err_level[level], message); 89 printk(KERN_ALERT "Filesystem %s: %pV", mp->m_fsname, &vaf);
98 out: 90 va_end(args);
99 spin_unlock_irqrestore(&xfs_err_lock,flags);
100 91
101 BUG_ON(level == CE_PANIC); 92 BUG_ON(do_panic);
102} 93}
103 94
104void 95void
105assfail(char *expr, char *file, int line) 96assfail(char *expr, char *file, int line)
106{ 97{
107 printk("Assertion failed: %s, file: %s, line: %d\n", expr, file, line); 98 printk(KERN_CRIT "Assertion failed: %s, file: %s, line: %d\n", expr,
99 file, line);
108 BUG(); 100 BUG();
109} 101}
110 102
diff --git a/fs/xfs/support/debug.h b/fs/xfs/support/debug.h
index d2d20462fd4f..05699f67d475 100644
--- a/fs/xfs/support/debug.h
+++ b/fs/xfs/support/debug.h
@@ -20,15 +20,22 @@
20 20
21#include <stdarg.h> 21#include <stdarg.h>
22 22
23#define CE_DEBUG 7 /* debug */ 23struct xfs_mount;
24#define CE_CONT 6 /* continuation */ 24
25#define CE_NOTE 5 /* notice */ 25#define CE_DEBUG KERN_DEBUG
26#define CE_WARN 4 /* warning */ 26#define CE_CONT KERN_INFO
27#define CE_ALERT 1 /* alert */ 27#define CE_NOTE KERN_NOTICE
28#define CE_PANIC 0 /* panic */ 28#define CE_WARN KERN_WARNING
29 29#define CE_ALERT KERN_ALERT
30extern void cmn_err(int, char *, ...) 30#define CE_PANIC KERN_EMERG
31 __attribute__ ((format (printf, 2, 3))); 31
32void cmn_err(const char *lvl, const char *fmt, ...)
33 __attribute__ ((format (printf, 2, 3)));
34void xfs_fs_cmn_err( const char *lvl, struct xfs_mount *mp,
35 const char *fmt, ...) __attribute__ ((format (printf, 3, 4)));
36void xfs_cmn_err( int panic_tag, const char *lvl, struct xfs_mount *mp,
37 const char *fmt, ...) __attribute__ ((format (printf, 4, 5)));
38
32extern void assfail(char *expr, char *f, int l); 39extern void assfail(char *expr, char *f, int l);
33 40
34#define ASSERT_ALWAYS(expr) \ 41#define ASSERT_ALWAYS(expr) \
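
The debug.c rewrite above leans on the kernel's %pV extension: a struct va_format couples a format string with its va_list, so a wrapper can forward caller-supplied varargs through a single printk() with a prefix, with no intermediate buffer and no lock. A hedged sketch of the idiom (the wrapper name is hypothetical):

#include <linux/kernel.h>

static void example_warn(const char *prefix, const char *fmt, ...)
{
	struct va_format vaf;
	va_list args;

	va_start(args, fmt);
	vaf.fmt = fmt;
	vaf.va = &args;
	/* printk expands %pV by formatting vaf's fmt/va pair in place */
	printk(KERN_WARNING "%s: %pV", prefix, &vaf);
	va_end(args);
}

The same shape appears three times above (cmn_err, xfs_fs_cmn_err, xfs_cmn_err), differing only in the prefix printed and the BUG_ON policy applied afterwards.
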
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index 63c7a1a6c022..58632cc17f2d 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -227,7 +227,7 @@ typedef struct xfs_perag {
227 227
228 atomic_t pagf_fstrms; /* # of filestreams active in this AG */ 228 atomic_t pagf_fstrms; /* # of filestreams active in this AG */
229 229
230 rwlock_t pag_ici_lock; /* incore inode lock */ 230 spinlock_t pag_ici_lock; /* incore inode cache lock */
231 struct radix_tree_root pag_ici_root; /* incore inode cache root */ 231 struct radix_tree_root pag_ici_root; /* incore inode cache root */
232 int pag_ici_reclaimable; /* reclaimable inodes */ 232 int pag_ici_reclaimable; /* reclaimable inodes */
233 struct mutex pag_ici_reclaim_lock; /* serialisation point */ 233 struct mutex pag_ici_reclaim_lock; /* serialisation point */
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index 112abc439ca5..f3227984a9bf 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -41,10 +41,6 @@
41#define XFSA_FIXUP_BNO_OK 1 41#define XFSA_FIXUP_BNO_OK 1
42#define XFSA_FIXUP_CNT_OK 2 42#define XFSA_FIXUP_CNT_OK 2
43 43
44static int
45xfs_alloc_busy_search(struct xfs_mount *mp, xfs_agnumber_t agno,
46 xfs_agblock_t bno, xfs_extlen_t len);
47
48/* 44/*
49 * Prototypes for per-ag allocation routines 45 * Prototypes for per-ag allocation routines
50 */ 46 */
@@ -94,7 +90,7 @@ xfs_alloc_lookup_ge(
94 * Lookup the first record less than or equal to [bno, len] 90 * Lookup the first record less than or equal to [bno, len]
95 * in the btree given by cur. 91 * in the btree given by cur.
96 */ 92 */
97STATIC int /* error */ 93int /* error */
98xfs_alloc_lookup_le( 94xfs_alloc_lookup_le(
99 struct xfs_btree_cur *cur, /* btree cursor */ 95 struct xfs_btree_cur *cur, /* btree cursor */
100 xfs_agblock_t bno, /* starting block of extent */ 96 xfs_agblock_t bno, /* starting block of extent */
@@ -127,7 +123,7 @@ xfs_alloc_update(
127/* 123/*
128 * Get the data from the pointed-to record. 124 * Get the data from the pointed-to record.
129 */ 125 */
130STATIC int /* error */ 126int /* error */
131xfs_alloc_get_rec( 127xfs_alloc_get_rec(
132 struct xfs_btree_cur *cur, /* btree cursor */ 128 struct xfs_btree_cur *cur, /* btree cursor */
133 xfs_agblock_t *bno, /* output: starting block of extent */ 129 xfs_agblock_t *bno, /* output: starting block of extent */
@@ -577,61 +573,58 @@ xfs_alloc_ag_vextent_exact(
577 xfs_extlen_t rlen; /* length of returned extent */ 573 xfs_extlen_t rlen; /* length of returned extent */
578 574
579 ASSERT(args->alignment == 1); 575 ASSERT(args->alignment == 1);
576
580 /* 577 /*
581 * Allocate/initialize a cursor for the by-number freespace btree. 578 * Allocate/initialize a cursor for the by-number freespace btree.
582 */ 579 */
583 bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp, 580 bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
584 args->agno, XFS_BTNUM_BNO); 581 args->agno, XFS_BTNUM_BNO);
582
585 /* 583 /*
586 * Lookup bno and minlen in the btree (minlen is irrelevant, really). 584 * Lookup bno and minlen in the btree (minlen is irrelevant, really).
587 * Look for the closest free block <= bno, it must contain bno 585 * Look for the closest free block <= bno, it must contain bno
588 * if any free block does. 586 * if any free block does.
589 */ 587 */
590 if ((error = xfs_alloc_lookup_le(bno_cur, args->agbno, args->minlen, &i))) 588 error = xfs_alloc_lookup_le(bno_cur, args->agbno, args->minlen, &i);
589 if (error)
591 goto error0; 590 goto error0;
592 if (!i) { 591 if (!i)
593 /* 592 goto not_found;
594 * Didn't find it, return null. 593
595 */
596 xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
597 args->agbno = NULLAGBLOCK;
598 return 0;
599 }
600 /* 594 /*
601 * Grab the freespace record. 595 * Grab the freespace record.
602 */ 596 */
603 if ((error = xfs_alloc_get_rec(bno_cur, &fbno, &flen, &i))) 597 error = xfs_alloc_get_rec(bno_cur, &fbno, &flen, &i);
598 if (error)
604 goto error0; 599 goto error0;
605 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 600 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
606 ASSERT(fbno <= args->agbno); 601 ASSERT(fbno <= args->agbno);
607 minend = args->agbno + args->minlen; 602 minend = args->agbno + args->minlen;
608 maxend = args->agbno + args->maxlen; 603 maxend = args->agbno + args->maxlen;
609 fend = fbno + flen; 604 fend = fbno + flen;
605
610 /* 606 /*
611 * Give up if the freespace isn't long enough for the minimum request. 607 * Give up if the freespace isn't long enough for the minimum request.
612 */ 608 */
613 if (fend < minend) { 609 if (fend < minend)
614 xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR); 610 goto not_found;
615 args->agbno = NULLAGBLOCK; 611
616 return 0;
617 }
618 /* 612 /*
619 * End of extent will be smaller of the freespace end and the 613 * End of extent will be smaller of the freespace end and the
620 * maximal requested end. 614 * maximal requested end.
621 */ 615 *
622 end = XFS_AGBLOCK_MIN(fend, maxend);
623 /*
624 * Fix the length according to mod and prod if given. 616 * Fix the length according to mod and prod if given.
625 */ 617 */
618 end = XFS_AGBLOCK_MIN(fend, maxend);
626 args->len = end - args->agbno; 619 args->len = end - args->agbno;
627 xfs_alloc_fix_len(args); 620 xfs_alloc_fix_len(args);
628 if (!xfs_alloc_fix_minleft(args)) { 621 if (!xfs_alloc_fix_minleft(args))
629 xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR); 622 goto not_found;
630 return 0; 623
631 }
632 rlen = args->len; 624 rlen = args->len;
633 ASSERT(args->agbno + rlen <= fend); 625 ASSERT(args->agbno + rlen <= fend);
634 end = args->agbno + rlen; 626 end = args->agbno + rlen;
627
635 /* 628 /*
636 * We are allocating agbno for rlen [agbno .. end] 629 * We are allocating agbno for rlen [agbno .. end]
637 * Allocate/initialize a cursor for the by-size btree. 630 * Allocate/initialize a cursor for the by-size btree.
@@ -640,16 +633,25 @@ xfs_alloc_ag_vextent_exact(
640 args->agno, XFS_BTNUM_CNT); 633 args->agno, XFS_BTNUM_CNT);
641 ASSERT(args->agbno + args->len <= 634 ASSERT(args->agbno + args->len <=
642 be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length)); 635 be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
643 if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, 636 error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, args->agbno,
644 args->agbno, args->len, XFSA_FIXUP_BNO_OK))) { 637 args->len, XFSA_FIXUP_BNO_OK);
638 if (error) {
645 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR); 639 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
646 goto error0; 640 goto error0;
647 } 641 }
642
648 xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR); 643 xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
649 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); 644 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
650 645
651 trace_xfs_alloc_exact_done(args);
652 args->wasfromfl = 0; 646 args->wasfromfl = 0;
647 trace_xfs_alloc_exact_done(args);
648 return 0;
649
650not_found:
651 /* Didn't find it, return null. */
652 xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
653 args->agbno = NULLAGBLOCK;
654 trace_xfs_alloc_exact_notfound(args);
653 return 0; 655 return 0;
654 656
655error0: 657error0:
@@ -659,6 +661,95 @@ error0:
659} 661}
660 662
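The hunk above collapses three copies of the cursor-teardown-and-return sequence into a single not_found label. A minimal userspace sketch of the same idiom, with hypothetical cursor helpers standing in for the btree API:

#include <stdlib.h>

struct cursor { int dummy; };

/* hypothetical stand-ins for the btree cursor API */
static struct cursor *cursor_init(void)
{
	return malloc(sizeof(struct cursor));
}

static void cursor_free(struct cursor *cur)
{
	free(cur);
}

static int cursor_seek(struct cursor *cur, unsigned key, int *found)
{
	(void)cur;
	*found = (key != 0);
	return 0;
}

/*
 * A single not_found label keeps the "release cursor, return null
 * extent" teardown in one place instead of three duplicated copies.
 */
static int lookup_exact(unsigned key, unsigned *out)
{
	struct cursor *cur = cursor_init();
	int found, error;

	error = cursor_seek(cur, key, &found);
	if (error)
		goto out_error;
	if (!found)
		goto not_found;

	*out = key;
	cursor_free(cur);
	return 0;

not_found:
	cursor_free(cur);
	*out = 0;
	return 0;

out_error:
	cursor_free(cur);
	return error;
}

int main(void)
{
	unsigned bno;

	return lookup_exact(42, &bno);
}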
661/* 663/*
664 * Search the btree in a given direction via the search cursor and compare
665 * the records found against the good extent we've already found.
666 */
667STATIC int
668xfs_alloc_find_best_extent(
669 struct xfs_alloc_arg *args, /* allocation argument structure */
670 struct xfs_btree_cur **gcur, /* good cursor */
671 struct xfs_btree_cur **scur, /* searching cursor */
672 xfs_agblock_t gdiff, /* difference for search comparison */
673 xfs_agblock_t *sbno, /* extent found by search */
674 xfs_extlen_t *slen,
675 xfs_extlen_t *slena, /* aligned length */
676 int dir) /* 0 = search right, 1 = search left */
677{
678 xfs_agblock_t bno;
679 xfs_agblock_t new;
680 xfs_agblock_t sdiff;
681 int error;
682 int i;
683
684 /* The good extent is perfect, no need to search. */
685 if (!gdiff)
686 goto out_use_good;
687
688 /*
689 * Look until we find a better one, run out of space, or run off the end.
690 */
691 do {
692 error = xfs_alloc_get_rec(*scur, sbno, slen, &i);
693 if (error)
694 goto error0;
695 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
696 xfs_alloc_compute_aligned(*sbno, *slen, args->alignment,
697 args->minlen, &bno, slena);
698
699 /*
700 * The good extent is closer than this one.
701 */
702 if (!dir) {
703 if (bno >= args->agbno + gdiff)
704 goto out_use_good;
705 } else {
706 if (bno <= args->agbno - gdiff)
707 goto out_use_good;
708 }
709
710 /*
711 * Same distance, compare length and pick the best.
712 */
713 if (*slena >= args->minlen) {
714 args->len = XFS_EXTLEN_MIN(*slena, args->maxlen);
715 xfs_alloc_fix_len(args);
716
717 sdiff = xfs_alloc_compute_diff(args->agbno, args->len,
718 args->alignment, *sbno,
719 *slen, &new);
720
721 /*
722 * Choose closer size and invalidate other cursor.
723 */
724 if (sdiff < gdiff)
725 goto out_use_search;
726 goto out_use_good;
727 }
728
729 if (!dir)
730 error = xfs_btree_increment(*scur, 0, &i);
731 else
732 error = xfs_btree_decrement(*scur, 0, &i);
733 if (error)
734 goto error0;
735 } while (i);
736
737out_use_good:
738 xfs_btree_del_cursor(*scur, XFS_BTREE_NOERROR);
739 *scur = NULL;
740 return 0;
741
742out_use_search:
743 xfs_btree_del_cursor(*gcur, XFS_BTREE_NOERROR);
744 *gcur = NULL;
745 return 0;
746
747error0:
748 /* caller invalidates cursors */
749 return error;
750}
751
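xfs_alloc_find_best_extent() above stops scanning as soon as the next candidate starts farther from the target block than the best difference already in hand. A simplified, compilable sketch of that pruning rule (hypothetical names; alignment and mod/prod fixup are reduced to a plain length check):

#include <stdio.h>

struct extent { unsigned bno; unsigned len; };

static unsigned dist(unsigned a, unsigned b)
{
	return a > b ? a - b : b - a;
}

/*
 * Walk candidates moving away from 'target'; once a candidate starts
 * at least 'gdiff' blocks away, nothing later can beat the good extent.
 */
static const struct extent *
find_best(const struct extent *c, int n, unsigned target,
	  unsigned gdiff, unsigned minlen)
{
	for (int i = 0; i < n; i++) {
		if (dist(c[i].bno, target) >= gdiff)
			break;			/* the good extent wins */
		if (c[i].len >= minlen)
			return &c[i];		/* closer and big enough */
	}
	return NULL;
}

int main(void)
{
	struct extent right[] = { { 103, 2 }, { 110, 8 }, { 140, 16 } };
	const struct extent *best = find_best(right, 3, 100, 25, 4);

	printf("best candidate: %u\n", best ? best->bno : 0);
	return 0;
}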
752/*
662 * Allocate a variable extent near bno in the allocation group agno. 753 * Allocate a variable extent near bno in the allocation group agno.
663 * Extent's length (returned in len) will be between minlen and maxlen, 754 * Extent's length (returned in len) will be between minlen and maxlen,
664 * and of the form k * prod + mod unless there's nothing that large. 755 * and of the form k * prod + mod unless there's nothing that large.
@@ -925,203 +1016,45 @@ xfs_alloc_ag_vextent_near(
925 } 1016 }
926 } 1017 }
927 } while (bno_cur_lt || bno_cur_gt); 1018 } while (bno_cur_lt || bno_cur_gt);
1019
928 /* 1020 /*
929 * Got both cursors still active, need to find better entry. 1021 * Got both cursors still active, need to find better entry.
930 */ 1022 */
931 if (bno_cur_lt && bno_cur_gt) { 1023 if (bno_cur_lt && bno_cur_gt) {
932 /*
933 * Left side is long enough, look for a right side entry.
934 */
935 if (ltlena >= args->minlen) { 1024 if (ltlena >= args->minlen) {
936 /* 1025 /*
937 * Fix up the length. 1026 * Left side is good, look for a right side entry.
938 */ 1027 */
939 args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen); 1028 args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen);
940 xfs_alloc_fix_len(args); 1029 xfs_alloc_fix_len(args);
941 rlen = args->len; 1030 ltdiff = xfs_alloc_compute_diff(args->agbno, args->len,
942 ltdiff = xfs_alloc_compute_diff(args->agbno, rlen,
943 args->alignment, ltbno, ltlen, &ltnew); 1031 args->alignment, ltbno, ltlen, &ltnew);
1032
1033 error = xfs_alloc_find_best_extent(args,
1034 &bno_cur_lt, &bno_cur_gt,
1035 ltdiff, &gtbno, &gtlen, &gtlena,
1036 0 /* search right */);
1037 } else {
1038 ASSERT(gtlena >= args->minlen);
1039
944 /* 1040 /*
945 * Not perfect. 1041 * Right side is good, look for a left side entry.
946 */
947 if (ltdiff) {
948 /*
949 * Look until we find a better one, run out of
950 * space, or run off the end.
951 */
952 while (bno_cur_lt && bno_cur_gt) {
953 if ((error = xfs_alloc_get_rec(
954 bno_cur_gt, &gtbno,
955 &gtlen, &i)))
956 goto error0;
957 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
958 xfs_alloc_compute_aligned(gtbno, gtlen,
959 args->alignment, args->minlen,
960 &gtbnoa, &gtlena);
961 /*
962 * The left one is clearly better.
963 */
964 if (gtbnoa >= args->agbno + ltdiff) {
965 xfs_btree_del_cursor(
966 bno_cur_gt,
967 XFS_BTREE_NOERROR);
968 bno_cur_gt = NULL;
969 break;
970 }
971 /*
972 * If we reach a big enough entry,
973 * compare the two and pick the best.
974 */
975 if (gtlena >= args->minlen) {
976 args->len =
977 XFS_EXTLEN_MIN(gtlena,
978 args->maxlen);
979 xfs_alloc_fix_len(args);
980 rlen = args->len;
981 gtdiff = xfs_alloc_compute_diff(
982 args->agbno, rlen,
983 args->alignment,
984 gtbno, gtlen, &gtnew);
985 /*
986 * Right side is better.
987 */
988 if (gtdiff < ltdiff) {
989 xfs_btree_del_cursor(
990 bno_cur_lt,
991 XFS_BTREE_NOERROR);
992 bno_cur_lt = NULL;
993 }
994 /*
995 * Left side is better.
996 */
997 else {
998 xfs_btree_del_cursor(
999 bno_cur_gt,
1000 XFS_BTREE_NOERROR);
1001 bno_cur_gt = NULL;
1002 }
1003 break;
1004 }
1005 /*
1006 * Fell off the right end.
1007 */
1008 if ((error = xfs_btree_increment(
1009 bno_cur_gt, 0, &i)))
1010 goto error0;
1011 if (!i) {
1012 xfs_btree_del_cursor(
1013 bno_cur_gt,
1014 XFS_BTREE_NOERROR);
1015 bno_cur_gt = NULL;
1016 break;
1017 }
1018 }
1019 }
1020 /*
1021 * The left side is perfect, trash the right side.
1022 */
1023 else {
1024 xfs_btree_del_cursor(bno_cur_gt,
1025 XFS_BTREE_NOERROR);
1026 bno_cur_gt = NULL;
1027 }
1028 }
1029 /*
1030 * It's the right side that was found first, look left.
1031 */
1032 else {
1033 /*
1034 * Fix up the length.
1035 */ 1042 */
1036 args->len = XFS_EXTLEN_MIN(gtlena, args->maxlen); 1043 args->len = XFS_EXTLEN_MIN(gtlena, args->maxlen);
1037 xfs_alloc_fix_len(args); 1044 xfs_alloc_fix_len(args);
1038 rlen = args->len; 1045 gtdiff = xfs_alloc_compute_diff(args->agbno, args->len,
1039 gtdiff = xfs_alloc_compute_diff(args->agbno, rlen,
1040 args->alignment, gtbno, gtlen, &gtnew); 1046 args->alignment, gtbno, gtlen, &gtnew);
1041 /* 1047
1042 * Right side entry isn't perfect. 1048 error = xfs_alloc_find_best_extent(args,
1043 */ 1049 &bno_cur_gt, &bno_cur_lt,
1044 if (gtdiff) { 1050 gtdiff, &ltbno, &ltlen, &ltlena,
1045 /* 1051 1 /* search left */);
1046 * Look until we find a better one, run out of
1047 * space, or run off the end.
1048 */
1049 while (bno_cur_lt && bno_cur_gt) {
1050 if ((error = xfs_alloc_get_rec(
1051 bno_cur_lt, &ltbno,
1052 &ltlen, &i)))
1053 goto error0;
1054 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1055 xfs_alloc_compute_aligned(ltbno, ltlen,
1056 args->alignment, args->minlen,
1057 &ltbnoa, &ltlena);
1058 /*
1059 * The right one is clearly better.
1060 */
1061 if (ltbnoa <= args->agbno - gtdiff) {
1062 xfs_btree_del_cursor(
1063 bno_cur_lt,
1064 XFS_BTREE_NOERROR);
1065 bno_cur_lt = NULL;
1066 break;
1067 }
1068 /*
1069 * If we reach a big enough entry,
1070 * compare the two and pick the best.
1071 */
1072 if (ltlena >= args->minlen) {
1073 args->len = XFS_EXTLEN_MIN(
1074 ltlena, args->maxlen);
1075 xfs_alloc_fix_len(args);
1076 rlen = args->len;
1077 ltdiff = xfs_alloc_compute_diff(
1078 args->agbno, rlen,
1079 args->alignment,
1080 ltbno, ltlen, &ltnew);
1081 /*
1082 * Left side is better.
1083 */
1084 if (ltdiff < gtdiff) {
1085 xfs_btree_del_cursor(
1086 bno_cur_gt,
1087 XFS_BTREE_NOERROR);
1088 bno_cur_gt = NULL;
1089 }
1090 /*
1091 * Right side is better.
1092 */
1093 else {
1094 xfs_btree_del_cursor(
1095 bno_cur_lt,
1096 XFS_BTREE_NOERROR);
1097 bno_cur_lt = NULL;
1098 }
1099 break;
1100 }
1101 /*
1102 * Fell off the left end.
1103 */
1104 if ((error = xfs_btree_decrement(
1105 bno_cur_lt, 0, &i)))
1106 goto error0;
1107 if (!i) {
1108 xfs_btree_del_cursor(bno_cur_lt,
1109 XFS_BTREE_NOERROR);
1110 bno_cur_lt = NULL;
1111 break;
1112 }
1113 }
1114 }
1115 /*
1116 * The right side is perfect, trash the left side.
1117 */
1118 else {
1119 xfs_btree_del_cursor(bno_cur_lt,
1120 XFS_BTREE_NOERROR);
1121 bno_cur_lt = NULL;
1122 }
1123 } 1052 }
1053
1054 if (error)
1055 goto error0;
1124 } 1056 }
1057
1125 /* 1058 /*
1126 * If we couldn't get anything, give up. 1059 * If we couldn't get anything, give up.
1127 */ 1060 */
@@ -1130,6 +1063,7 @@ xfs_alloc_ag_vextent_near(
1130 args->agbno = NULLAGBLOCK; 1063 args->agbno = NULLAGBLOCK;
1131 return 0; 1064 return 0;
1132 } 1065 }
1066
1133 /* 1067 /*
1134 * At this point we have selected a freespace entry, either to the 1068 * At this point we have selected a freespace entry, either to the
1135 * left or to the right. If it's on the right, copy all the 1069 * left or to the right. If it's on the right, copy all the
@@ -1146,6 +1080,7 @@ xfs_alloc_ag_vextent_near(
1146 j = 1; 1080 j = 1;
1147 } else 1081 } else
1148 j = 0; 1082 j = 0;
1083
1149 /* 1084 /*
1150 * Fix up the length and compute the useful address. 1085 * Fix up the length and compute the useful address.
1151 */ 1086 */
@@ -2676,7 +2611,7 @@ restart:
2676 * will require a synchronous transaction, but it can still be 2611 * will require a synchronous transaction, but it can still be
2677 * used to distinguish between a partial or exact match. 2612 * used to distinguish between a partial or exact match.
2678 */ 2613 */
2679static int 2614int
2680xfs_alloc_busy_search( 2615xfs_alloc_busy_search(
2681 struct xfs_mount *mp, 2616 struct xfs_mount *mp,
2682 xfs_agnumber_t agno, 2617 xfs_agnumber_t agno,
diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h
index 895009a97271..0ab56b32c7eb 100644
--- a/fs/xfs/xfs_alloc.h
+++ b/fs/xfs/xfs_alloc.h
@@ -19,6 +19,7 @@
19#define __XFS_ALLOC_H__ 19#define __XFS_ALLOC_H__
20 20
21struct xfs_buf; 21struct xfs_buf;
22struct xfs_btree_cur;
22struct xfs_mount; 23struct xfs_mount;
23struct xfs_perag; 24struct xfs_perag;
24struct xfs_trans; 25struct xfs_trans;
@@ -118,16 +119,16 @@ xfs_alloc_longest_free_extent(struct xfs_mount *mp,
118 struct xfs_perag *pag); 119 struct xfs_perag *pag);
119 120
120#ifdef __KERNEL__ 121#ifdef __KERNEL__
121
122void 122void
123xfs_alloc_busy_insert(xfs_trans_t *tp, 123xfs_alloc_busy_insert(struct xfs_trans *tp, xfs_agnumber_t agno,
124 xfs_agnumber_t agno, 124 xfs_agblock_t bno, xfs_extlen_t len);
125 xfs_agblock_t bno,
126 xfs_extlen_t len);
127 125
128void 126void
129xfs_alloc_busy_clear(struct xfs_mount *mp, struct xfs_busy_extent *busyp); 127xfs_alloc_busy_clear(struct xfs_mount *mp, struct xfs_busy_extent *busyp);
130 128
129int
130xfs_alloc_busy_search(struct xfs_mount *mp, xfs_agnumber_t agno,
131 xfs_agblock_t bno, xfs_extlen_t len);
131#endif /* __KERNEL__ */ 132#endif /* __KERNEL__ */
132 133
133/* 134/*
@@ -205,4 +206,18 @@ xfs_free_extent(
205 xfs_fsblock_t bno, /* starting block number of extent */ 206 xfs_fsblock_t bno, /* starting block number of extent */
206 xfs_extlen_t len); /* length of extent */ 207 xfs_extlen_t len); /* length of extent */
207 208
209int /* error */
210xfs_alloc_lookup_le(
211 struct xfs_btree_cur *cur, /* btree cursor */
212 xfs_agblock_t bno, /* starting block of extent */
213 xfs_extlen_t len, /* length of extent */
214 int *stat); /* success/failure */
215
216int /* error */
217xfs_alloc_get_rec(
218 struct xfs_btree_cur *cur, /* btree cursor */
219 xfs_agblock_t *bno, /* output: starting block of extent */
220 xfs_extlen_t *len, /* output: length of extent */
221 int *stat); /* output: success/failure */
222
208#endif /* __XFS_ALLOC_H__ */ 223#endif /* __XFS_ALLOC_H__ */
diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c
index a6cff8edcdb6..71e90dc2aeb1 100644
--- a/fs/xfs/xfs_attr_leaf.c
+++ b/fs/xfs/xfs_attr_leaf.c
@@ -637,7 +637,7 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context)
637 * It didn't all fit, so we have to sort everything on hashval. 637 * It didn't all fit, so we have to sort everything on hashval.
638 */ 638 */
639 sbsize = sf->hdr.count * sizeof(*sbuf); 639 sbsize = sf->hdr.count * sizeof(*sbuf);
640 sbp = sbuf = kmem_alloc(sbsize, KM_SLEEP); 640 sbp = sbuf = kmem_alloc(sbsize, KM_SLEEP | KM_NOFS);
641 641
642 /* 642 /*
643 * Scan the attribute list for the rest of the entries, storing 643 * Scan the attribute list for the rest of the entries, storing
@@ -2386,7 +2386,7 @@ xfs_attr_leaf_list_int(xfs_dabuf_t *bp, xfs_attr_list_context_t *context)
2386 args.dp = context->dp; 2386 args.dp = context->dp;
2387 args.whichfork = XFS_ATTR_FORK; 2387 args.whichfork = XFS_ATTR_FORK;
2388 args.valuelen = valuelen; 2388 args.valuelen = valuelen;
2389 args.value = kmem_alloc(valuelen, KM_SLEEP); 2389 args.value = kmem_alloc(valuelen, KM_SLEEP | KM_NOFS);
2390 args.rmtblkno = be32_to_cpu(name_rmt->valueblk); 2390 args.rmtblkno = be32_to_cpu(name_rmt->valueblk);
2391 args.rmtblkcnt = XFS_B_TO_FSB(args.dp->i_mount, valuelen); 2391 args.rmtblkcnt = XFS_B_TO_FSB(args.dp->i_mount, valuelen);
2392 retval = xfs_attr_rmtval_get(&args); 2392 retval = xfs_attr_rmtval_get(&args);
diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c
index 04f9cca8da7e..2f9e97c128a0 100644
--- a/fs/xfs/xfs_btree.c
+++ b/fs/xfs/xfs_btree.c
@@ -634,9 +634,8 @@ xfs_btree_read_bufl(
634 return error; 634 return error;
635 } 635 }
636 ASSERT(!bp || !XFS_BUF_GETERROR(bp)); 636 ASSERT(!bp || !XFS_BUF_GETERROR(bp));
637 if (bp != NULL) { 637 if (bp)
638 XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, refval); 638 XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, refval);
639 }
640 *bpp = bp; 639 *bpp = bp;
641 return 0; 640 return 0;
642} 641}
@@ -944,13 +943,13 @@ xfs_btree_set_refs(
944 switch (cur->bc_btnum) { 943 switch (cur->bc_btnum) {
945 case XFS_BTNUM_BNO: 944 case XFS_BTNUM_BNO:
946 case XFS_BTNUM_CNT: 945 case XFS_BTNUM_CNT:
947 XFS_BUF_SET_VTYPE_REF(*bpp, B_FS_MAP, XFS_ALLOC_BTREE_REF); 946 XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, XFS_ALLOC_BTREE_REF);
948 break; 947 break;
949 case XFS_BTNUM_INO: 948 case XFS_BTNUM_INO:
950 XFS_BUF_SET_VTYPE_REF(*bpp, B_FS_INOMAP, XFS_INO_BTREE_REF); 949 XFS_BUF_SET_VTYPE_REF(bp, B_FS_INOMAP, XFS_INO_BTREE_REF);
951 break; 950 break;
952 case XFS_BTNUM_BMAP: 951 case XFS_BTNUM_BMAP:
953 XFS_BUF_SET_VTYPE_REF(*bpp, B_FS_MAP, XFS_BMAP_BTREE_REF); 952 XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, XFS_BMAP_BTREE_REF);
954 break; 953 break;
955 default: 954 default:
956 ASSERT(0); 955 ASSERT(0);
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 2686d0d54c5b..98c6f73b6752 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -141,8 +141,7 @@ xfs_buf_item_log_check(
141#define xfs_buf_item_log_check(x) 141#define xfs_buf_item_log_check(x)
142#endif 142#endif
143 143
144STATIC void xfs_buf_error_relse(xfs_buf_t *bp); 144STATIC void xfs_buf_do_callbacks(struct xfs_buf *bp);
145STATIC void xfs_buf_do_callbacks(xfs_buf_t *bp, xfs_log_item_t *lip);
146 145
147/* 146/*
148 * This returns the number of log iovecs needed to log the 147 * This returns the number of log iovecs needed to log the
@@ -450,7 +449,7 @@ xfs_buf_item_unpin(
450 * xfs_trans_ail_delete() drops the AIL lock. 449 * xfs_trans_ail_delete() drops the AIL lock.
451 */ 450 */
452 if (bip->bli_flags & XFS_BLI_STALE_INODE) { 451 if (bip->bli_flags & XFS_BLI_STALE_INODE) {
453 xfs_buf_do_callbacks(bp, (xfs_log_item_t *)bip); 452 xfs_buf_do_callbacks(bp);
454 XFS_BUF_SET_FSPRIVATE(bp, NULL); 453 XFS_BUF_SET_FSPRIVATE(bp, NULL);
455 XFS_BUF_CLR_IODONE_FUNC(bp); 454 XFS_BUF_CLR_IODONE_FUNC(bp);
456 } else { 455 } else {
@@ -918,15 +917,26 @@ xfs_buf_attach_iodone(
918 XFS_BUF_SET_IODONE_FUNC(bp, xfs_buf_iodone_callbacks); 917 XFS_BUF_SET_IODONE_FUNC(bp, xfs_buf_iodone_callbacks);
919} 918}
920 919
920/*
921 * We can have many callbacks on a buffer. Running the callbacks individually
922 * can cause a lot of contention on the AIL lock, so we allow for a single
923 * callback to be able to scan the remaining lip->li_bio_list for other items
924 * of the same type and callback to be processed in the first call.
925 *
926 * As a result, the loop walking the callback list below will also modify the
927 * list. It removes the first item from the list and then runs the callback.
928 * The loop then restarts from the new head of the list. This allows the
929 * callback to scan and modify the list attached to the buffer and we don't
930 * have to care about maintaining a next item pointer.
931 */
921STATIC void 932STATIC void
922xfs_buf_do_callbacks( 933xfs_buf_do_callbacks(
923 xfs_buf_t *bp, 934 struct xfs_buf *bp)
924 xfs_log_item_t *lip)
925{ 935{
926 xfs_log_item_t *nlip; 936 struct xfs_log_item *lip;
927 937
928 while (lip != NULL) { 938 while ((lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *)) != NULL) {
929 nlip = lip->li_bio_list; 939 XFS_BUF_SET_FSPRIVATE(bp, lip->li_bio_list);
930 ASSERT(lip->li_cb != NULL); 940 ASSERT(lip->li_cb != NULL);
931 /* 941 /*
932 * Clear the next pointer so we don't have any 942 * Clear the next pointer so we don't have any
@@ -936,7 +946,6 @@ xfs_buf_do_callbacks(
936 */ 946 */
937 lip->li_bio_list = NULL; 947 lip->li_bio_list = NULL;
938 lip->li_cb(bp, lip); 948 lip->li_cb(bp, lip);
939 lip = nlip;
940 } 949 }
941} 950}
942 951
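The restart-from-head walk described in the comment above can be sketched in isolation. A minimal userspace analogue (hypothetical item type; the real callback also receives the buffer and may splice further items off the list):

#include <stdio.h>
#include <stddef.h>

struct item {
	struct item *next;
	void (*cb)(struct item *it);
};

static void print_cb(struct item *it)
{
	printf("item %p\n", (void *)it);
}

/*
 * Detach the head, clear its link, run the callback, then restart from
 * whatever the head is now -- no next pointer needs to stay valid while
 * the callback modifies the list.
 */
static void run_callbacks(struct item **head)
{
	struct item *it;

	while ((it = *head) != NULL) {
		*head = it->next;
		it->next = NULL;
		it->cb(it);
	}
}

int main(void)
{
	struct item b = { NULL, print_cb };
	struct item a = { &b, print_cb };
	struct item *head = &a;

	run_callbacks(&head);
	return 0;
}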
@@ -949,128 +958,76 @@ xfs_buf_do_callbacks(
949 */ 958 */
950void 959void
951xfs_buf_iodone_callbacks( 960xfs_buf_iodone_callbacks(
952 xfs_buf_t *bp) 961 struct xfs_buf *bp)
953{ 962{
954 xfs_log_item_t *lip; 963 struct xfs_log_item *lip = bp->b_fspriv;
955 static ulong lasttime; 964 struct xfs_mount *mp = lip->li_mountp;
956 static xfs_buftarg_t *lasttarg; 965 static ulong lasttime;
957 xfs_mount_t *mp; 966 static xfs_buftarg_t *lasttarg;
958 967
959 ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL); 968 if (likely(!XFS_BUF_GETERROR(bp)))
960 lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *); 969 goto do_callbacks;
961 970
962 if (XFS_BUF_GETERROR(bp) != 0) { 971 /*
963 /* 972 * If we've already decided to shutdown the filesystem because of
964 * If we've already decided to shutdown the filesystem 973 * I/O errors, there's no point in giving this a retry.
965 * because of IO errors, there's no point in giving this 974 */
966 * a retry. 975 if (XFS_FORCED_SHUTDOWN(mp)) {
967 */ 976 XFS_BUF_SUPER_STALE(bp);
968 mp = lip->li_mountp; 977 trace_xfs_buf_item_iodone(bp, _RET_IP_);
969 if (XFS_FORCED_SHUTDOWN(mp)) { 978 goto do_callbacks;
970 ASSERT(XFS_BUF_TARGET(bp) == mp->m_ddev_targp); 979 }
971 XFS_BUF_SUPER_STALE(bp);
972 trace_xfs_buf_item_iodone(bp, _RET_IP_);
973 xfs_buf_do_callbacks(bp, lip);
974 XFS_BUF_SET_FSPRIVATE(bp, NULL);
975 XFS_BUF_CLR_IODONE_FUNC(bp);
976 xfs_buf_ioend(bp, 0);
977 return;
978 }
979 980
980 if ((XFS_BUF_TARGET(bp) != lasttarg) || 981 if (XFS_BUF_TARGET(bp) != lasttarg ||
981 (time_after(jiffies, (lasttime + 5*HZ)))) { 982 time_after(jiffies, (lasttime + 5*HZ))) {
982 lasttime = jiffies; 983 lasttime = jiffies;
983 cmn_err(CE_ALERT, "Device %s, XFS metadata write error" 984 cmn_err(CE_ALERT, "Device %s, XFS metadata write error"
984 " block 0x%llx in %s", 985 " block 0x%llx in %s",
985 XFS_BUFTARG_NAME(XFS_BUF_TARGET(bp)), 986 XFS_BUFTARG_NAME(XFS_BUF_TARGET(bp)),
986 (__uint64_t)XFS_BUF_ADDR(bp), mp->m_fsname); 987 (__uint64_t)XFS_BUF_ADDR(bp), mp->m_fsname);
987 } 988 }
988 lasttarg = XFS_BUF_TARGET(bp); 989 lasttarg = XFS_BUF_TARGET(bp);
989 990
990 if (XFS_BUF_ISASYNC(bp)) { 991 /*
991 /* 992 * If the write was asynchronous then noone will be looking for the
992 * If the write was asynchronous then noone will be 993 * error. Clear the error state and write the buffer out again.
993 * looking for the error. Clear the error state 994 *
994 * and write the buffer out again delayed write. 995 * During sync or umount we'll write all pending buffers again
995 * 996 * synchronous, which will catch these errors if they keep hanging
996 * XXXsup This is OK, so long as we catch these 997 * around.
997 * before we start the umount; we don't want these 998 */
998 * DELWRI metadata bufs to be hanging around. 999 if (XFS_BUF_ISASYNC(bp)) {
999 */ 1000 XFS_BUF_ERROR(bp, 0); /* errno of 0 unsets the flag */
1000 XFS_BUF_ERROR(bp,0); /* errno of 0 unsets the flag */ 1001
1001 1002 if (!XFS_BUF_ISSTALE(bp)) {
1002 if (!(XFS_BUF_ISSTALE(bp))) { 1003 XFS_BUF_DELAYWRITE(bp);
1003 XFS_BUF_DELAYWRITE(bp);
1004 XFS_BUF_DONE(bp);
1005 XFS_BUF_SET_START(bp);
1006 }
1007 ASSERT(XFS_BUF_IODONE_FUNC(bp));
1008 trace_xfs_buf_item_iodone_async(bp, _RET_IP_);
1009 xfs_buf_relse(bp);
1010 } else {
1011 /*
1012 * If the write of the buffer was not asynchronous,
1013 * then we want to make sure to return the error
1014 * to the caller of bwrite(). Because of this we
1015 * cannot clear the B_ERROR state at this point.
1016 * Instead we install a callback function that
1017 * will be called when the buffer is released, and
1018 * that routine will clear the error state and
1019 * set the buffer to be written out again after
1020 * some delay.
1021 */
1022 /* We actually overwrite the existing b-relse
1023 function at times, but we're gonna be shutting down
1024 anyway. */
1025 XFS_BUF_SET_BRELSE_FUNC(bp,xfs_buf_error_relse);
1026 XFS_BUF_DONE(bp); 1004 XFS_BUF_DONE(bp);
1027 XFS_BUF_FINISH_IOWAIT(bp); 1005 XFS_BUF_SET_START(bp);
1028 } 1006 }
1007 ASSERT(XFS_BUF_IODONE_FUNC(bp));
1008 trace_xfs_buf_item_iodone_async(bp, _RET_IP_);
1009 xfs_buf_relse(bp);
1029 return; 1010 return;
1030 } 1011 }
1031 1012
1032 xfs_buf_do_callbacks(bp, lip); 1013 /*
1033 XFS_BUF_SET_FSPRIVATE(bp, NULL); 1014 * If the write of the buffer was synchronous, we want to make
1034 XFS_BUF_CLR_IODONE_FUNC(bp); 1015 * sure to return the error to the caller of xfs_bwrite().
1035 xfs_buf_ioend(bp, 0); 1016 */
1036}
1037
1038/*
1039 * This is a callback routine attached to a buffer which gets an error
1040 * when being written out synchronously.
1041 */
1042STATIC void
1043xfs_buf_error_relse(
1044 xfs_buf_t *bp)
1045{
1046 xfs_log_item_t *lip;
1047 xfs_mount_t *mp;
1048
1049 lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
1050 mp = (xfs_mount_t *)lip->li_mountp;
1051 ASSERT(XFS_BUF_TARGET(bp) == mp->m_ddev_targp);
1052
1053 XFS_BUF_STALE(bp); 1017 XFS_BUF_STALE(bp);
1054 XFS_BUF_DONE(bp); 1018 XFS_BUF_DONE(bp);
1055 XFS_BUF_UNDELAYWRITE(bp); 1019 XFS_BUF_UNDELAYWRITE(bp);
1056 XFS_BUF_ERROR(bp,0);
1057 1020
1058 trace_xfs_buf_error_relse(bp, _RET_IP_); 1021 trace_xfs_buf_error_relse(bp, _RET_IP_);
1022 xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
1059 1023
1060 if (! XFS_FORCED_SHUTDOWN(mp)) 1024do_callbacks:
1061 xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); 1025 xfs_buf_do_callbacks(bp);
1062 /*
1063 * We have to unpin the pinned buffers so do the
1064 * callbacks.
1065 */
1066 xfs_buf_do_callbacks(bp, lip);
1067 XFS_BUF_SET_FSPRIVATE(bp, NULL); 1026 XFS_BUF_SET_FSPRIVATE(bp, NULL);
1068 XFS_BUF_CLR_IODONE_FUNC(bp); 1027 XFS_BUF_CLR_IODONE_FUNC(bp);
1069 XFS_BUF_SET_BRELSE_FUNC(bp,NULL); 1028 xfs_buf_ioend(bp, 0);
1070 xfs_buf_relse(bp);
1071} 1029}
1072 1030
1073
1074/* 1031/*
1075 * This is the iodone() function for buffers which have been 1032 * This is the iodone() function for buffers which have been
1076 * logged. It is called when they are eventually flushed out. 1033 * logged. It is called when they are eventually flushed out.
diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h
index 0e2ed43f16c7..b6ecd2061e7c 100644
--- a/fs/xfs/xfs_buf_item.h
+++ b/fs/xfs/xfs_buf_item.h
@@ -105,17 +105,6 @@ typedef struct xfs_buf_log_item {
105 xfs_buf_log_format_t bli_format; /* in-log header */ 105 xfs_buf_log_format_t bli_format; /* in-log header */
106} xfs_buf_log_item_t; 106} xfs_buf_log_item_t;
107 107
108/*
109 * This structure is used during recovery to record the buf log
110 * items which have been canceled and should not be replayed.
111 */
112typedef struct xfs_buf_cancel {
113 xfs_daddr_t bc_blkno;
114 uint bc_len;
115 int bc_refcount;
116 struct xfs_buf_cancel *bc_next;
117} xfs_buf_cancel_t;
118
119void xfs_buf_item_init(struct xfs_buf *, struct xfs_mount *); 108void xfs_buf_item_init(struct xfs_buf *, struct xfs_mount *);
120void xfs_buf_item_relse(struct xfs_buf *); 109void xfs_buf_item_relse(struct xfs_buf *);
121void xfs_buf_item_log(xfs_buf_log_item_t *, uint, uint); 110void xfs_buf_item_log(xfs_buf_log_item_t *, uint, uint);
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index c78cc6a3d87c..4c7db74a05f7 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -152,37 +152,6 @@ xfs_errortag_clearall(xfs_mount_t *mp, int loud)
152} 152}
153#endif /* DEBUG */ 153#endif /* DEBUG */
154 154
155
156void
157xfs_fs_cmn_err(int level, xfs_mount_t *mp, char *fmt, ...)
158{
159 va_list ap;
160
161 va_start(ap, fmt);
162 xfs_fs_vcmn_err(level, mp, fmt, ap);
163 va_end(ap);
164}
165
166void
167xfs_cmn_err(int panic_tag, int level, xfs_mount_t *mp, char *fmt, ...)
168{
169 va_list ap;
170
171#ifdef DEBUG
172 xfs_panic_mask |= (XFS_PTAG_SHUTDOWN_CORRUPT | XFS_PTAG_LOGRES);
173#endif
174
175 if (xfs_panic_mask && (xfs_panic_mask & panic_tag)
176 && (level & CE_ALERT)) {
177 level &= ~CE_ALERT;
178 level |= CE_PANIC;
179 cmn_err(CE_ALERT, "XFS: Transforming an alert into a BUG.");
180 }
181 va_start(ap, fmt);
182 xfs_fs_vcmn_err(level, mp, fmt, ap);
183 va_end(ap);
184}
185
186void 155void
187xfs_error_report( 156xfs_error_report(
188 const char *tag, 157 const char *tag,
diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h
index f338847f80b8..10dce5475f02 100644
--- a/fs/xfs/xfs_error.h
+++ b/fs/xfs/xfs_error.h
@@ -136,8 +136,8 @@ extern int xfs_error_test(int, int *, char *, int, char *, unsigned long);
136 xfs_error_test((tag), (mp)->m_fixedfsid, "expr", __LINE__, __FILE__, \ 136 xfs_error_test((tag), (mp)->m_fixedfsid, "expr", __LINE__, __FILE__, \
137 (rf)))) 137 (rf))))
138 138
139extern int xfs_errortag_add(int error_tag, xfs_mount_t *mp); 139extern int xfs_errortag_add(int error_tag, struct xfs_mount *mp);
140extern int xfs_errortag_clearall(xfs_mount_t *mp, int loud); 140extern int xfs_errortag_clearall(struct xfs_mount *mp, int loud);
141#else 141#else
142#define XFS_TEST_ERROR(expr, mp, tag, rf) (expr) 142#define XFS_TEST_ERROR(expr, mp, tag, rf) (expr)
143#define xfs_errortag_add(tag, mp) (ENOSYS) 143#define xfs_errortag_add(tag, mp) (ENOSYS)
@@ -162,21 +162,15 @@ extern int xfs_errortag_clearall(xfs_mount_t *mp, int loud);
162 162
163struct xfs_mount; 163struct xfs_mount;
164 164
165extern void xfs_fs_vcmn_err(int level, struct xfs_mount *mp,
166 char *fmt, va_list ap)
167 __attribute__ ((format (printf, 3, 0)));
168extern void xfs_cmn_err(int panic_tag, int level, struct xfs_mount *mp,
169 char *fmt, ...)
170 __attribute__ ((format (printf, 4, 5)));
171extern void xfs_fs_cmn_err(int level, struct xfs_mount *mp, char *fmt, ...)
172 __attribute__ ((format (printf, 3, 4)));
173
174extern void xfs_hex_dump(void *p, int length); 165extern void xfs_hex_dump(void *p, int length);
175 166
176#define xfs_fs_repair_cmn_err(level, mp, fmt, args...) \ 167#define xfs_fs_repair_cmn_err(level, mp, fmt, args...) \
177 xfs_fs_cmn_err(level, mp, fmt " Unmount and run xfs_repair.", ## args) 168 xfs_fs_cmn_err(level, mp, fmt " Unmount and run xfs_repair.", ## args)
178 169
179#define xfs_fs_mount_cmn_err(f, fmt, args...) \ 170#define xfs_fs_mount_cmn_err(f, fmt, args...) \
180 ((f & XFS_MFSI_QUIET)? (void)0 : cmn_err(CE_WARN, "XFS: " fmt, ## args)) 171 do { \
172 if (!(f & XFS_MFSI_QUIET)) \
173 cmn_err(CE_WARN, "XFS: " fmt, ## args); \
174 } while (0)
181 175
182#endif /* __XFS_ERROR_H__ */ 176#endif /* __XFS_ERROR_H__ */
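The new do { } while (0) form of xfs_fs_mount_cmn_err is the standard idiom for making a multi-statement macro behave as a single statement. A small standalone demonstration of why the wrapper matters (hypothetical QUIET flag and messages):

#include <stdio.h>

#define QUIET	0x1

/*
 * Without the do { } while (0) wrapper, the if inside the macro would
 * steal the 'else' of any surrounding if/else statement.
 */
#define mount_warn(f, fmt, args...) \
	do { \
		if (!((f) & QUIET)) \
			fprintf(stderr, "XFS: " fmt "\n", ## args); \
	} while (0)

int main(void)
{
	int flags = 0;

	if (flags & QUIET)
		mount_warn(flags, "never printed");
	else
		mount_warn(flags, "mounting with flags %#x", flags);
	return 0;
}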
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index a55e687bf562..75f2ef60e579 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -48,6 +48,28 @@ xfs_efi_item_free(
48} 48}
49 49
50/* 50/*
51 * Freeing the efi requires that we remove it from the AIL if it has already
52 * been placed there. However, the EFI may not yet have been placed in the AIL
53 * when called by xfs_efi_release() from EFD processing due to the ordering of
54 * committed vs unpin operations in bulk insert operations. Hence the
55 * test_and_clear_bit(XFS_EFI_COMMITTED) to ensure only the last caller frees
56 * the EFI.
57 */
58STATIC void
59__xfs_efi_release(
60 struct xfs_efi_log_item *efip)
61{
62 struct xfs_ail *ailp = efip->efi_item.li_ailp;
63
64 if (!test_and_clear_bit(XFS_EFI_COMMITTED, &efip->efi_flags)) {
65 spin_lock(&ailp->xa_lock);
66 /* xfs_trans_ail_delete() drops the AIL lock. */
67 xfs_trans_ail_delete(ailp, &efip->efi_item);
68 xfs_efi_item_free(efip);
69 }
70}
71
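__xfs_efi_release() uses test_and_clear_bit() as a two-party handshake: the first caller clears XFS_EFI_COMMITTED, and whichever caller finds the bit already clear knows it is last and may free. A userspace analogue using C11 atomics, with atomic_exchange() standing in for test_and_clear_bit() and the AIL removal elided:

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct intent {
	atomic_bool committed;		/* analogue of XFS_EFI_COMMITTED */
};

/*
 * Called once from the commit path and once from the release path, in
 * either order. atomic_exchange() returns the previous value, so
 * exactly one caller observes the flag already clear and frees.
 */
static void intent_release(struct intent *ip)
{
	if (!atomic_exchange(&ip->committed, false)) {
		printf("last caller, freeing\n");
		free(ip);
	}
}

int main(void)
{
	struct intent *ip = malloc(sizeof(*ip));

	atomic_init(&ip->committed, true);
	intent_release(ip);	/* clears the bit, does not free */
	intent_release(ip);	/* finds it clear: frees */
	return 0;
}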
72/*
51 * This returns the number of iovecs needed to log the given efi item. 73 * This returns the number of iovecs needed to log the given efi item.
52 * We only need 1 iovec for an efi item. It just logs the efi_log_format 74 * We only need 1 iovec for an efi item. It just logs the efi_log_format
53 * structure. 75 * structure.
@@ -74,7 +96,8 @@ xfs_efi_item_format(
74 struct xfs_efi_log_item *efip = EFI_ITEM(lip); 96 struct xfs_efi_log_item *efip = EFI_ITEM(lip);
75 uint size; 97 uint size;
76 98
77 ASSERT(efip->efi_next_extent == efip->efi_format.efi_nextents); 99 ASSERT(atomic_read(&efip->efi_next_extent) ==
100 efip->efi_format.efi_nextents);
78 101
79 efip->efi_format.efi_type = XFS_LI_EFI; 102 efip->efi_format.efi_type = XFS_LI_EFI;
80 103
@@ -99,10 +122,12 @@ xfs_efi_item_pin(
99} 122}
100 123
101/* 124/*
102 * While EFIs cannot really be pinned, the unpin operation is the 125 * While EFIs cannot really be pinned, the unpin operation is the last place at
103 * last place at which the EFI is manipulated during a transaction. 126 * which the EFI is manipulated during a transaction. If we are being asked to
104 * Here we coordinate with xfs_efi_cancel() to determine who gets to 127 * remove the EFI it's because the transaction has been cancelled and by
105 * free the EFI. 128 * definition that means the EFI cannot be in the AIL so remove it from the
129 * transaction and free it. Otherwise coordinate with xfs_efi_release() (via
130 * XFS_EFI_COMMITTED) to determine who gets to free the EFI.
106 */ 131 */
107STATIC void 132STATIC void
108xfs_efi_item_unpin( 133xfs_efi_item_unpin(
@@ -110,20 +135,14 @@ xfs_efi_item_unpin(
110 int remove) 135 int remove)
111{ 136{
112 struct xfs_efi_log_item *efip = EFI_ITEM(lip); 137 struct xfs_efi_log_item *efip = EFI_ITEM(lip);
113 struct xfs_ail *ailp = lip->li_ailp;
114
115 spin_lock(&ailp->xa_lock);
116 if (efip->efi_flags & XFS_EFI_CANCELED) {
117 if (remove)
118 xfs_trans_del_item(lip);
119 138
120 /* xfs_trans_ail_delete() drops the AIL lock. */ 139 if (remove) {
121 xfs_trans_ail_delete(ailp, lip); 140 ASSERT(!(lip->li_flags & XFS_LI_IN_AIL));
141 xfs_trans_del_item(lip);
122 xfs_efi_item_free(efip); 142 xfs_efi_item_free(efip);
123 } else { 143 return;
124 efip->efi_flags |= XFS_EFI_COMMITTED;
125 spin_unlock(&ailp->xa_lock);
126 } 144 }
145 __xfs_efi_release(efip);
127} 146}
128 147
129/* 148/*
@@ -152,16 +171,20 @@ xfs_efi_item_unlock(
152} 171}
153 172
154/* 173/*
155 * The EFI is logged only once and cannot be moved in the log, so 174 * The EFI is logged only once and cannot be moved in the log, so simply return
156 * simply return the lsn at which it's been logged. The canceled 175 * the lsn at which it's been logged. For bulk transaction committed
157 * flag is not paid any attention here. Checking for that is delayed 176 * processing, the EFI may be processed but not yet unpinned prior to the EFD
158 * until the EFI is unpinned. 177 * being processed. Set the XFS_EFI_COMMITTED flag so this case can be detected
178 * when processing the EFD.
159 */ 179 */
160STATIC xfs_lsn_t 180STATIC xfs_lsn_t
161xfs_efi_item_committed( 181xfs_efi_item_committed(
162 struct xfs_log_item *lip, 182 struct xfs_log_item *lip,
163 xfs_lsn_t lsn) 183 xfs_lsn_t lsn)
164{ 184{
185 struct xfs_efi_log_item *efip = EFI_ITEM(lip);
186
187 set_bit(XFS_EFI_COMMITTED, &efip->efi_flags);
165 return lsn; 188 return lsn;
166} 189}
167 190
@@ -230,6 +253,7 @@ xfs_efi_init(
230 xfs_log_item_init(mp, &efip->efi_item, XFS_LI_EFI, &xfs_efi_item_ops); 253 xfs_log_item_init(mp, &efip->efi_item, XFS_LI_EFI, &xfs_efi_item_ops);
231 efip->efi_format.efi_nextents = nextents; 254 efip->efi_format.efi_nextents = nextents;
232 efip->efi_format.efi_id = (__psint_t)(void*)efip; 255 efip->efi_format.efi_id = (__psint_t)(void*)efip;
256 atomic_set(&efip->efi_next_extent, 0);
233 257
234 return efip; 258 return efip;
235} 259}
@@ -289,37 +313,18 @@ xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt)
289} 313}
290 314
291/* 315/*
292 * This is called by the efd item code below to release references to 316 * This is called by the efd item code below to release references to the given
293 * the given efi item. Each efd calls this with the number of 317 * efi item. Each efd calls this with the number of extents that it has
294 * extents that it has logged, and when the sum of these reaches 318 * logged, and when the sum of these reaches the total number of extents logged
295 * the total number of extents logged by this efi item we can free 319 * by this efi item we can free the efi item.
296 * the efi item.
297 *
298 * Freeing the efi item requires that we remove it from the AIL.
299 * We'll use the AIL lock to protect our counters as well as
300 * the removal from the AIL.
301 */ 320 */
302void 321void
303xfs_efi_release(xfs_efi_log_item_t *efip, 322xfs_efi_release(xfs_efi_log_item_t *efip,
304 uint nextents) 323 uint nextents)
305{ 324{
306 struct xfs_ail *ailp = efip->efi_item.li_ailp; 325 ASSERT(atomic_read(&efip->efi_next_extent) >= nextents);
307 int extents_left; 326 if (atomic_sub_and_test(nextents, &efip->efi_next_extent))
308 327 __xfs_efi_release(efip);
309 ASSERT(efip->efi_next_extent > 0);
310 ASSERT(efip->efi_flags & XFS_EFI_COMMITTED);
311
312 spin_lock(&ailp->xa_lock);
313 ASSERT(efip->efi_next_extent >= nextents);
314 efip->efi_next_extent -= nextents;
315 extents_left = efip->efi_next_extent;
316 if (extents_left == 0) {
317 /* xfs_trans_ail_delete() drops the AIL lock. */
318 xfs_trans_ail_delete(ailp, (xfs_log_item_t *)efip);
319 xfs_efi_item_free(efip);
320 } else {
321 spin_unlock(&ailp->xa_lock);
322 }
323} 328}
324 329
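The reworked xfs_efi_release() replaces the AIL-lock-protected counter with an atomic: atomic_sub_and_test() is true only for the caller that takes efi_next_extent to zero, and only that caller tears the item down. A C11 sketch of the same last-reference rule (hypothetical names):

#include <stdatomic.h>
#include <stdlib.h>

struct efi {
	atomic_uint next_extent;	/* extents not yet accounted for */
};

static void efi_free(struct efi *efip)
{
	free(efip);
}

/*
 * atomic_fetch_sub() returns the value *before* the subtraction, so
 * comparing it with nextents detects the transition to zero -- the
 * same guarantee atomic_sub_and_test() provides in the kernel.
 */
static void efi_release(struct efi *efip, unsigned int nextents)
{
	if (atomic_fetch_sub(&efip->next_extent, nextents) == nextents)
		efi_free(efip);
}

int main(void)
{
	struct efi *efip = malloc(sizeof(*efip));

	atomic_init(&efip->next_extent, 8);
	efi_release(efip, 5);	/* three extents still outstanding */
	efi_release(efip, 3);	/* reaches zero: frees */
	return 0;
}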
325static inline struct xfs_efd_log_item *EFD_ITEM(struct xfs_log_item *lip) 330static inline struct xfs_efd_log_item *EFD_ITEM(struct xfs_log_item *lip)
diff --git a/fs/xfs/xfs_extfree_item.h b/fs/xfs/xfs_extfree_item.h
index 0d22c56fdf64..375f68e42531 100644
--- a/fs/xfs/xfs_extfree_item.h
+++ b/fs/xfs/xfs_extfree_item.h
@@ -111,11 +111,10 @@ typedef struct xfs_efd_log_format_64 {
111#define XFS_EFI_MAX_FAST_EXTENTS 16 111#define XFS_EFI_MAX_FAST_EXTENTS 16
112 112
113/* 113/*
114 * Define EFI flags. 114 * Define EFI flag bits. Manipulated by set/clear/test_bit operators.
115 */ 115 */
116#define XFS_EFI_RECOVERED 0x1 116#define XFS_EFI_RECOVERED 1
117#define XFS_EFI_COMMITTED 0x2 117#define XFS_EFI_COMMITTED 2
118#define XFS_EFI_CANCELED 0x4
119 118
120/* 119/*
121 * This is the "extent free intention" log item. It is used 120 * This is the "extent free intention" log item. It is used
@@ -125,8 +124,8 @@ typedef struct xfs_efd_log_format_64 {
125 */ 124 */
126typedef struct xfs_efi_log_item { 125typedef struct xfs_efi_log_item {
127 xfs_log_item_t efi_item; 126 xfs_log_item_t efi_item;
128 uint efi_flags; /* misc flags */ 127 atomic_t efi_next_extent;
129 uint efi_next_extent; 128 unsigned long efi_flags; /* misc flags */
130 xfs_efi_log_format_t efi_format; 129 xfs_efi_log_format_t efi_format;
131} xfs_efi_log_item_t; 130} xfs_efi_log_item_t;
132 131
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index a7c116e814af..cec89dd5d7d2 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -374,6 +374,7 @@ xfs_growfs_data_private(
374 mp->m_maxicount = icount << mp->m_sb.sb_inopblog; 374 mp->m_maxicount = icount << mp->m_sb.sb_inopblog;
375 } else 375 } else
376 mp->m_maxicount = 0; 376 mp->m_maxicount = 0;
377 xfs_set_low_space_thresholds(mp);
377 378
378 /* update secondary superblocks. */ 379 /* update secondary superblocks. */
379 for (agno = 1; agno < nagcount; agno++) { 380 for (agno = 1; agno < nagcount; agno++) {
@@ -611,12 +612,13 @@ out:
611 * 612 *
612 * We cannot use an inode here for this - that will push dirty state back up 613 * We cannot use an inode here for this - that will push dirty state back up
613 * into the VFS and then periodic inode flushing will prevent log covering from 614 * into the VFS and then periodic inode flushing will prevent log covering from
614 * making progress. Hence we log a field in the superblock instead. 615 * making progress. Hence we log a field in the superblock instead and use a
616 * synchronous transaction to ensure the superblock is immediately unpinned
617 * and can be written back.
615 */ 618 */
616int 619int
617xfs_fs_log_dummy( 620xfs_fs_log_dummy(
618 xfs_mount_t *mp, 621 xfs_mount_t *mp)
619 int flags)
620{ 622{
621 xfs_trans_t *tp; 623 xfs_trans_t *tp;
622 int error; 624 int error;
@@ -631,8 +633,7 @@ xfs_fs_log_dummy(
631 633
632 /* log the UUID because it is an unchanging field */ 634 /* log the UUID because it is an unchanging field */
633 xfs_mod_sb(tp, XFS_SB_UUID); 635 xfs_mod_sb(tp, XFS_SB_UUID);
634 if (flags & SYNC_WAIT) 636 xfs_trans_set_sync(tp);
635 xfs_trans_set_sync(tp);
636 return xfs_trans_commit(tp, 0); 637 return xfs_trans_commit(tp, 0);
637} 638}
638 639
diff --git a/fs/xfs/xfs_fsops.h b/fs/xfs/xfs_fsops.h
index a786c5212c1e..1b6a98b66886 100644
--- a/fs/xfs/xfs_fsops.h
+++ b/fs/xfs/xfs_fsops.h
@@ -25,6 +25,6 @@ extern int xfs_fs_counts(xfs_mount_t *mp, xfs_fsop_counts_t *cnt);
25extern int xfs_reserve_blocks(xfs_mount_t *mp, __uint64_t *inval, 25extern int xfs_reserve_blocks(xfs_mount_t *mp, __uint64_t *inval,
26 xfs_fsop_resblks_t *outval); 26 xfs_fsop_resblks_t *outval);
27extern int xfs_fs_goingdown(xfs_mount_t *mp, __uint32_t inflags); 27extern int xfs_fs_goingdown(xfs_mount_t *mp, __uint32_t inflags);
28extern int xfs_fs_log_dummy(xfs_mount_t *mp, int flags); 28extern int xfs_fs_log_dummy(struct xfs_mount *mp);
29 29
30#endif /* __XFS_FSOPS_H__ */ 30#endif /* __XFS_FSOPS_H__ */
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index d7de5a3f7867..cb9b6d1469f7 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -43,6 +43,17 @@
43 43
44 44
45/* 45/*
46 * Define xfs inode iolock lockdep classes. We need to ensure that all active
47 * inodes are considered the same for lockdep purposes, including inodes that
48 * are recycled through the XFS_IRECLAIMABLE state. This is the only way to
49 * guarantee the locks are considered the same when there are multiple lock
50 * initialisation sites. Also, define a reclaimable inode class so it is
51 * obvious in lockdep reports which class the report is against.
52 */
53static struct lock_class_key xfs_iolock_active;
54struct lock_class_key xfs_iolock_reclaimable;
55
56/*
46 * Allocate and initialise an xfs_inode. 57 * Allocate and initialise an xfs_inode.
47 */ 58 */
48STATIC struct xfs_inode * 59STATIC struct xfs_inode *
@@ -69,8 +80,11 @@ xfs_inode_alloc(
69 ASSERT(atomic_read(&ip->i_pincount) == 0); 80 ASSERT(atomic_read(&ip->i_pincount) == 0);
70 ASSERT(!spin_is_locked(&ip->i_flags_lock)); 81 ASSERT(!spin_is_locked(&ip->i_flags_lock));
71 ASSERT(completion_done(&ip->i_flush)); 82 ASSERT(completion_done(&ip->i_flush));
83 ASSERT(ip->i_ino == 0);
72 84
73 mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); 85 mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
86 lockdep_set_class_and_name(&ip->i_iolock.mr_lock,
87 &xfs_iolock_active, "xfs_iolock_active");
74 88
75 /* initialise the xfs inode */ 89 /* initialise the xfs inode */
76 ip->i_ino = ino; 90 ip->i_ino = ino;
@@ -85,9 +99,6 @@ xfs_inode_alloc(
85 ip->i_size = 0; 99 ip->i_size = 0;
86 ip->i_new_size = 0; 100 ip->i_new_size = 0;
87 101
88 /* prevent anyone from using this yet */
89 VFS_I(ip)->i_state = I_NEW;
90
91 return ip; 102 return ip;
92} 103}
93 104
@@ -145,7 +156,18 @@ xfs_inode_free(
145 ASSERT(!spin_is_locked(&ip->i_flags_lock)); 156 ASSERT(!spin_is_locked(&ip->i_flags_lock));
146 ASSERT(completion_done(&ip->i_flush)); 157 ASSERT(completion_done(&ip->i_flush));
147 158
148 call_rcu(&ip->i_vnode.i_rcu, xfs_inode_free_callback); 159 /*
160 * Because we use RCU freeing we need to ensure the inode always
161 * appears to be reclaimed with an invalid inode number when in the
162 * free state. The ip->i_flags_lock provides the barrier against lookup
163 * races.
164 */
165 spin_lock(&ip->i_flags_lock);
166 ip->i_flags = XFS_IRECLAIM;
167 ip->i_ino = 0;
168 spin_unlock(&ip->i_flags_lock);
169
170 call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback);
149} 171}
150 172
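With lookups now running under rcu_read_lock(), a radix tree walk can return an inode already queued for freeing, or one that has been recycled. The scheme above therefore zeroes i_ino under i_flags_lock before the deferred free, and lookups recheck the number under the same lock. A userspace analogue (a mutex stands in for the spinlock; the RCU deferral itself is elided):

#include <pthread.h>
#include <stdbool.h>

struct inode {
	pthread_mutex_t	flags_lock;	/* stands in for ip->i_flags_lock */
	unsigned long	ino;		/* 0 means "being freed" */
};

/*
 * Free side: invalidate the identity under the lock *before* handing
 * the object to the deferred free, so concurrent lookups can tell.
 */
static void inode_free(struct inode *ip)
{
	pthread_mutex_lock(&ip->flags_lock);
	ip->ino = 0;
	pthread_mutex_unlock(&ip->flags_lock);
	/* a call_rcu()-style deferred free would happen here */
}

/*
 * Lookup side: the pointer from the index may be stale, so confirm the
 * inode number under the same lock before trusting the object.
 */
static bool inode_valid(struct inode *ip, unsigned long ino)
{
	bool ok;

	pthread_mutex_lock(&ip->flags_lock);
	ok = (ip->ino == ino);
	pthread_mutex_unlock(&ip->flags_lock);
	return ok;
}

int main(void)
{
	struct inode ip = { PTHREAD_MUTEX_INITIALIZER, 42 };

	if (inode_valid(&ip, 42))
		inode_free(&ip);
	return 0;
}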
151/* 173/*
@@ -155,14 +177,29 @@ static int
155xfs_iget_cache_hit( 177xfs_iget_cache_hit(
156 struct xfs_perag *pag, 178 struct xfs_perag *pag,
157 struct xfs_inode *ip, 179 struct xfs_inode *ip,
180 xfs_ino_t ino,
158 int flags, 181 int flags,
159 int lock_flags) __releases(pag->pag_ici_lock) 182 int lock_flags) __releases(RCU)
160{ 183{
161 struct inode *inode = VFS_I(ip); 184 struct inode *inode = VFS_I(ip);
162 struct xfs_mount *mp = ip->i_mount; 185 struct xfs_mount *mp = ip->i_mount;
163 int error; 186 int error;
164 187
188 /*
189 * check for re-use of an inode within an RCU grace period due to the
190 * radix tree nodes not being updated yet. We monitor for this by
191 * setting the inode number to zero before freeing the inode structure.
192 * If the inode has been reallocated and set up, then the inode number
193 * will not match, so check for that, too.
194 */
165 spin_lock(&ip->i_flags_lock); 195 spin_lock(&ip->i_flags_lock);
196 if (ip->i_ino != ino) {
197 trace_xfs_iget_skip(ip);
198 XFS_STATS_INC(xs_ig_frecycle);
199 error = EAGAIN;
200 goto out_error;
201 }
202
166 203
167 /* 204 /*
168 * If we are racing with another cache hit that is currently 205 * If we are racing with another cache hit that is currently
@@ -205,7 +242,7 @@ xfs_iget_cache_hit(
205 ip->i_flags |= XFS_IRECLAIM; 242 ip->i_flags |= XFS_IRECLAIM;
206 243
207 spin_unlock(&ip->i_flags_lock); 244 spin_unlock(&ip->i_flags_lock);
208 read_unlock(&pag->pag_ici_lock); 245 rcu_read_unlock();
209 246
210 error = -inode_init_always(mp->m_super, inode); 247 error = -inode_init_always(mp->m_super, inode);
211 if (error) { 248 if (error) {
@@ -213,7 +250,7 @@ xfs_iget_cache_hit(
213 * Re-initializing the inode failed, and we are in deep 250 * Re-initializing the inode failed, and we are in deep
214 * trouble. Try to re-add it to the reclaim list. 251 * trouble. Try to re-add it to the reclaim list.
215 */ 252 */
216 read_lock(&pag->pag_ici_lock); 253 rcu_read_lock();
217 spin_lock(&ip->i_flags_lock); 254 spin_lock(&ip->i_flags_lock);
218 255
219 ip->i_flags &= ~XFS_INEW; 256 ip->i_flags &= ~XFS_INEW;
@@ -223,14 +260,20 @@ xfs_iget_cache_hit(
223 goto out_error; 260 goto out_error;
224 } 261 }
225 262
226 write_lock(&pag->pag_ici_lock); 263 spin_lock(&pag->pag_ici_lock);
227 spin_lock(&ip->i_flags_lock); 264 spin_lock(&ip->i_flags_lock);
228 ip->i_flags &= ~(XFS_IRECLAIMABLE | XFS_IRECLAIM); 265 ip->i_flags &= ~(XFS_IRECLAIMABLE | XFS_IRECLAIM);
229 ip->i_flags |= XFS_INEW; 266 ip->i_flags |= XFS_INEW;
230 __xfs_inode_clear_reclaim_tag(mp, pag, ip); 267 __xfs_inode_clear_reclaim_tag(mp, pag, ip);
231 inode->i_state = I_NEW; 268 inode->i_state = I_NEW;
269
270 ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock));
271 mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
272 lockdep_set_class_and_name(&ip->i_iolock.mr_lock,
273 &xfs_iolock_active, "xfs_iolock_active");
274
232 spin_unlock(&ip->i_flags_lock); 275 spin_unlock(&ip->i_flags_lock);
233 write_unlock(&pag->pag_ici_lock); 276 spin_unlock(&pag->pag_ici_lock);
234 } else { 277 } else {
235 /* If the VFS inode is being torn down, pause and try again. */ 278 /* If the VFS inode is being torn down, pause and try again. */
236 if (!igrab(inode)) { 279 if (!igrab(inode)) {
@@ -241,7 +284,7 @@ xfs_iget_cache_hit(
241 284
242 /* We've got a live one. */ 285 /* We've got a live one. */
243 spin_unlock(&ip->i_flags_lock); 286 spin_unlock(&ip->i_flags_lock);
244 read_unlock(&pag->pag_ici_lock); 287 rcu_read_unlock();
245 trace_xfs_iget_hit(ip); 288 trace_xfs_iget_hit(ip);
246 } 289 }
247 290
@@ -255,7 +298,7 @@ xfs_iget_cache_hit(
255 298
256out_error: 299out_error:
257 spin_unlock(&ip->i_flags_lock); 300 spin_unlock(&ip->i_flags_lock);
258 read_unlock(&pag->pag_ici_lock); 301 rcu_read_unlock();
259 return error; 302 return error;
260} 303}
261 304
@@ -308,7 +351,7 @@ xfs_iget_cache_miss(
308 BUG(); 351 BUG();
309 } 352 }
310 353
311 write_lock(&pag->pag_ici_lock); 354 spin_lock(&pag->pag_ici_lock);
312 355
313 /* insert the new inode */ 356 /* insert the new inode */
314 error = radix_tree_insert(&pag->pag_ici_root, agino, ip); 357 error = radix_tree_insert(&pag->pag_ici_root, agino, ip);
@@ -323,14 +366,14 @@ xfs_iget_cache_miss(
323 ip->i_udquot = ip->i_gdquot = NULL; 366 ip->i_udquot = ip->i_gdquot = NULL;
324 xfs_iflags_set(ip, XFS_INEW); 367 xfs_iflags_set(ip, XFS_INEW);
325 368
326 write_unlock(&pag->pag_ici_lock); 369 spin_unlock(&pag->pag_ici_lock);
327 radix_tree_preload_end(); 370 radix_tree_preload_end();
328 371
329 *ipp = ip; 372 *ipp = ip;
330 return 0; 373 return 0;
331 374
332out_preload_end: 375out_preload_end:
333 write_unlock(&pag->pag_ici_lock); 376 spin_unlock(&pag->pag_ici_lock);
334 radix_tree_preload_end(); 377 radix_tree_preload_end();
335 if (lock_flags) 378 if (lock_flags)
336 xfs_iunlock(ip, lock_flags); 379 xfs_iunlock(ip, lock_flags);
@@ -377,7 +420,7 @@ xfs_iget(
377 xfs_agino_t agino; 420 xfs_agino_t agino;
378 421
379 /* reject inode numbers outside existing AGs */ 422 /* reject inode numbers outside existing AGs */
380 if (XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount) 423 if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount)
381 return EINVAL; 424 return EINVAL;
382 425
383 /* get the perag structure and ensure that it's inode capable */ 426 /* get the perag structure and ensure that it's inode capable */
@@ -386,15 +429,15 @@ xfs_iget(
386 429
387again: 430again:
388 error = 0; 431 error = 0;
389 read_lock(&pag->pag_ici_lock); 432 rcu_read_lock();
390 ip = radix_tree_lookup(&pag->pag_ici_root, agino); 433 ip = radix_tree_lookup(&pag->pag_ici_root, agino);
391 434
392 if (ip) { 435 if (ip) {
393 error = xfs_iget_cache_hit(pag, ip, flags, lock_flags); 436 error = xfs_iget_cache_hit(pag, ip, ino, flags, lock_flags);
394 if (error) 437 if (error)
395 goto out_error_or_again; 438 goto out_error_or_again;
396 } else { 439 } else {
397 read_unlock(&pag->pag_ici_lock); 440 rcu_read_unlock();
398 XFS_STATS_INC(xs_ig_missed); 441 XFS_STATS_INC(xs_ig_missed);
399 442
400 error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip, 443 error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip,
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 108c7a085f94..be7cf625421f 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -887,7 +887,7 @@ xfs_iread(
887 * around for a while. This helps to keep recently accessed 887 * around for a while. This helps to keep recently accessed
888 * meta-data in-core longer. 888 * meta-data in-core longer.
889 */ 889 */
890 XFS_BUF_SET_REF(bp, XFS_INO_REF); 890 xfs_buf_set_ref(bp, XFS_INO_REF);
891 891
892 /* 892 /*
893 * Use xfs_trans_brelse() to release the buffer containing the 893 * Use xfs_trans_brelse() to release the buffer containing the
@@ -2000,17 +2000,33 @@ xfs_ifree_cluster(
2000 */ 2000 */
2001 for (i = 0; i < ninodes; i++) { 2001 for (i = 0; i < ninodes; i++) {
2002retry: 2002retry:
2003 read_lock(&pag->pag_ici_lock); 2003 rcu_read_lock();
2004 ip = radix_tree_lookup(&pag->pag_ici_root, 2004 ip = radix_tree_lookup(&pag->pag_ici_root,
2005 XFS_INO_TO_AGINO(mp, (inum + i))); 2005 XFS_INO_TO_AGINO(mp, (inum + i)));
2006 2006
2007 /* Inode not in memory or stale, nothing to do */ 2007 /* Inode not in memory, nothing to do */
2008 if (!ip || xfs_iflags_test(ip, XFS_ISTALE)) { 2008 if (!ip) {
2009 read_unlock(&pag->pag_ici_lock); 2009 rcu_read_unlock();
2010 continue; 2010 continue;
2011 } 2011 }
2012 2012
2013 /* 2013 /*
2014 * because this is an RCU protected lookup, we could
2015 * find a recently freed or even reallocated inode
2016 * during the lookup. We need to check under the
2017 * i_flags_lock for a valid inode here. Skip it if it
2018 * is not valid, the wrong inode or stale.
2019 */
2020 spin_lock(&ip->i_flags_lock);
2021 if (ip->i_ino != inum + i ||
2022 __xfs_iflags_test(ip, XFS_ISTALE)) {
2023 spin_unlock(&ip->i_flags_lock);
2024 rcu_read_unlock();
2025 continue;
2026 }
2027 spin_unlock(&ip->i_flags_lock);
2028
2029 /*
2014 * Don't try to lock/unlock the current inode, but we 2030 * Don't try to lock/unlock the current inode, but we
2015 * _cannot_ skip the other inodes that we did not find 2031 * _cannot_ skip the other inodes that we did not find
2016 * in the list attached to the buffer and are not 2032 * in the list attached to the buffer and are not
@@ -2019,11 +2035,11 @@ retry:
2019 */ 2035 */
2020 if (ip != free_ip && 2036 if (ip != free_ip &&
2021 !xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) { 2037 !xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
2022 read_unlock(&pag->pag_ici_lock); 2038 rcu_read_unlock();
2023 delay(1); 2039 delay(1);
2024 goto retry; 2040 goto retry;
2025 } 2041 }
2026 read_unlock(&pag->pag_ici_lock); 2042 rcu_read_unlock();
2027 2043
2028 xfs_iflock(ip); 2044 xfs_iflock(ip);
2029 xfs_iflags_set(ip, XFS_ISTALE); 2045 xfs_iflags_set(ip, XFS_ISTALE);
@@ -2629,7 +2645,7 @@ xfs_iflush_cluster(
2629 2645
2630 mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1); 2646 mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1);
2631 first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask; 2647 first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask;
2632 read_lock(&pag->pag_ici_lock); 2648 rcu_read_lock();
2633 /* really need a gang lookup range call here */ 2649 /* really need a gang lookup range call here */
2634 nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)ilist, 2650 nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)ilist,
2635 first_index, inodes_per_cluster); 2651 first_index, inodes_per_cluster);
@@ -2640,9 +2656,21 @@ xfs_iflush_cluster(
2640 iq = ilist[i]; 2656 iq = ilist[i];
2641 if (iq == ip) 2657 if (iq == ip)
2642 continue; 2658 continue;
2643 /* if the inode lies outside this cluster, we're done. */ 2659
2644 if ((XFS_INO_TO_AGINO(mp, iq->i_ino) & mask) != first_index) 2660 /*
2645 break; 2661 * because this is an RCU protected lookup, we could find a
2662 * recently freed or even reallocated inode during the lookup.
2663 * We need to check under the i_flags_lock for a valid inode
2664 * here. Skip it if it is not valid or the wrong inode.
2665 */
2666 spin_lock(&iq->i_flags_lock);
2667 if (!iq->i_ino ||
2668 (XFS_INO_TO_AGINO(mp, iq->i_ino) & mask) != first_index) {
2669 spin_unlock(&iq->i_flags_lock);
2670 continue;
2671 }
2672 spin_unlock(&iq->i_flags_lock);
2673
2646 /* 2674 /*
2647 * Do an un-protected check to see if the inode is dirty and 2675 * Do an un-protected check to see if the inode is dirty and
2648 * is a candidate for flushing. These checks will be repeated 2676 * is a candidate for flushing. These checks will be repeated
@@ -2692,7 +2720,7 @@ xfs_iflush_cluster(
2692 } 2720 }
2693 2721
2694out_free: 2722out_free:
2695 read_unlock(&pag->pag_ici_lock); 2723 rcu_read_unlock();
2696 kmem_free(ilist); 2724 kmem_free(ilist);
2697out_put: 2725out_put:
2698 xfs_perag_put(pag); 2726 xfs_perag_put(pag);
@@ -2704,7 +2732,7 @@ cluster_corrupt_out:
2704 * Corruption detected in the clustering loop. Invalidate the 2732 * Corruption detected in the clustering loop. Invalidate the
2705 * inode buffer and shut down the filesystem. 2733 * inode buffer and shut down the filesystem.
2706 */ 2734 */
2707 read_unlock(&pag->pag_ici_lock); 2735 rcu_read_unlock();
2708 /* 2736 /*
2709 * Clean up the buffer. If it was B_DELWRI, just release it -- 2737 * Clean up the buffer. If it was B_DELWRI, just release it --
2710 * brelse can handle it with no problems. If not, shut down the 2738 * brelse can handle it with no problems. If not, shut down the
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index fb2ca2e4cdc9..5c95fa8ec11d 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -376,12 +376,13 @@ static inline void xfs_ifunlock(xfs_inode_t *ip)
376/* 376/*
377 * In-core inode flags. 377 * In-core inode flags.
378 */ 378 */
379#define XFS_IRECLAIM 0x0001 /* we have started reclaiming this inode */ 379#define XFS_IRECLAIM 0x0001 /* started reclaiming this inode */
380#define XFS_ISTALE 0x0002 /* inode has been staled */ 380#define XFS_ISTALE 0x0002 /* inode has been staled */
381#define XFS_IRECLAIMABLE 0x0004 /* inode can be reclaimed */ 381#define XFS_IRECLAIMABLE 0x0004 /* inode can be reclaimed */
382#define XFS_INEW 0x0008 /* inode has just been allocated */ 382#define XFS_INEW 0x0008 /* inode has just been allocated */
383#define XFS_IFILESTREAM 0x0010 /* inode is in a filestream directory */ 383#define XFS_IFILESTREAM 0x0010 /* inode is in a filestream directory */
384#define XFS_ITRUNCATED 0x0020 /* truncated down so flush-on-close */ 384#define XFS_ITRUNCATED 0x0020 /* truncated down so flush-on-close */
385#define XFS_IDIRTY_RELEASE 0x0040 /* dirty release already seen */
385 386
386/* 387/*
387 * Flags for inode locking. 388 * Flags for inode locking.
@@ -438,6 +439,8 @@ static inline void xfs_ifunlock(xfs_inode_t *ip)
438#define XFS_IOLOCK_DEP(flags) (((flags) & XFS_IOLOCK_DEP_MASK) >> XFS_IOLOCK_SHIFT) 439#define XFS_IOLOCK_DEP(flags) (((flags) & XFS_IOLOCK_DEP_MASK) >> XFS_IOLOCK_SHIFT)
439#define XFS_ILOCK_DEP(flags) (((flags) & XFS_ILOCK_DEP_MASK) >> XFS_ILOCK_SHIFT) 440#define XFS_ILOCK_DEP(flags) (((flags) & XFS_ILOCK_DEP_MASK) >> XFS_ILOCK_SHIFT)
440 441
442extern struct lock_class_key xfs_iolock_reclaimable;
443
441/* 444/*
442 * Flags for xfs_itruncate_start(). 445 * Flags for xfs_itruncate_start().
443 */ 446 */
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 7c8d30c453c3..fd4f398bd6f1 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -842,15 +842,64 @@ xfs_inode_item_destroy(
842 * flushed to disk. It is responsible for removing the inode item 842 * flushed to disk. It is responsible for removing the inode item
843 * from the AIL if it has not been re-logged, and unlocking the inode's 843 * from the AIL if it has not been re-logged, and unlocking the inode's
844 * flush lock. 844 * flush lock.
845 *
846 * To reduce AIL lock traffic as much as possible, we scan the buffer log item
847 * list for other inodes that will run this function. We remove them from the
848 * buffer list so we can process all the inode IO completions in one AIL lock
849 * traversal.
845 */ 850 */
846void 851void
847xfs_iflush_done( 852xfs_iflush_done(
848 struct xfs_buf *bp, 853 struct xfs_buf *bp,
849 struct xfs_log_item *lip) 854 struct xfs_log_item *lip)
850{ 855{
851 struct xfs_inode_log_item *iip = INODE_ITEM(lip); 856 struct xfs_inode_log_item *iip;
852 xfs_inode_t *ip = iip->ili_inode; 857 struct xfs_log_item *blip;
858 struct xfs_log_item *next;
859 struct xfs_log_item *prev;
853 struct xfs_ail *ailp = lip->li_ailp; 860 struct xfs_ail *ailp = lip->li_ailp;
861 int need_ail = 0;
862
863 /*
864 * Scan the buffer IO completions for other inodes being completed and
865 * attach them to the current inode log item.
866 */
867 blip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
868 prev = NULL;
869 while (blip != NULL) {
 870 if (blip->li_cb != xfs_iflush_done) { 871 prev = blip;
871 prev = blip;
872 blip = blip->li_bio_list;
873 continue;
874 }
875
876 /* remove from list */
877 next = blip->li_bio_list;
878 if (!prev) {
879 XFS_BUF_SET_FSPRIVATE(bp, next);
880 } else {
881 prev->li_bio_list = next;
882 }
883
884 /* add to current list */
885 blip->li_bio_list = lip->li_bio_list;
886 lip->li_bio_list = blip;
887
888 /*
889 * while we have the item, do the unlocked check for needing
890 * the AIL lock.
891 */
892 iip = INODE_ITEM(blip);
893 if (iip->ili_logged && blip->li_lsn == iip->ili_flush_lsn)
894 need_ail++;
895
896 blip = next;
897 }
898
899 /* make sure we capture the state of the initial inode. */
900 iip = INODE_ITEM(lip);
901 if (iip->ili_logged && lip->li_lsn == iip->ili_flush_lsn)
902 need_ail++;
854 903
855 /* 904 /*
856 * We only want to pull the item from the AIL if it is 905 * We only want to pull the item from the AIL if it is
@@ -861,28 +910,37 @@ xfs_iflush_done(
861 * the lock since it's cheaper, and then we recheck while 910 * the lock since it's cheaper, and then we recheck while
862 * holding the lock before removing the inode from the AIL. 911 * holding the lock before removing the inode from the AIL.
863 */ 912 */
864 if (iip->ili_logged && lip->li_lsn == iip->ili_flush_lsn) { 913 if (need_ail) {
914 struct xfs_log_item *log_items[need_ail];
915 int i = 0;
865 spin_lock(&ailp->xa_lock); 916 spin_lock(&ailp->xa_lock);
866 if (lip->li_lsn == iip->ili_flush_lsn) { 917 for (blip = lip; blip; blip = blip->li_bio_list) {
867 /* xfs_trans_ail_delete() drops the AIL lock. */ 918 iip = INODE_ITEM(blip);
868 xfs_trans_ail_delete(ailp, lip); 919 if (iip->ili_logged &&
869 } else { 920 blip->li_lsn == iip->ili_flush_lsn) {
870 spin_unlock(&ailp->xa_lock); 921 log_items[i++] = blip;
922 }
923 ASSERT(i <= need_ail);
871 } 924 }
925 /* xfs_trans_ail_delete_bulk() drops the AIL lock. */
926 xfs_trans_ail_delete_bulk(ailp, log_items, i);
872 } 927 }
873 928
874 iip->ili_logged = 0;
875 929
876 /* 930 /*
 877 * Clear the ili_last_fields bits now that we know that the 931 * clean up and unlock the flush lock now that we are done. We can clear the
878 * data corresponding to them is safely on disk. 932 * ili_last_fields bits now that we know that the data corresponding to
933 * them is safely on disk.
879 */ 934 */
880 iip->ili_last_fields = 0; 935 for (blip = lip; blip; blip = next) {
936 next = blip->li_bio_list;
937 blip->li_bio_list = NULL;
881 938
882 /* 939 iip = INODE_ITEM(blip);
883 * Release the inode's flush lock since we're done with it. 940 iip->ili_logged = 0;
884 */ 941 iip->ili_last_fields = 0;
885 xfs_ifunlock(ip); 942 xfs_ifunlock(iip->ili_inode);
943 }
886} 944}
887 945
888/* 946/*
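
The scan the new comment describes can be modelled in userspace: one pass down the buffer's singly linked completion list unlinks every item that belongs to this handler, collects it on a private list, and counts how many will need the AIL lock. All names below are invented for the sketch; only the shape of the remove-and-collect pass comes from the hunk above:

        #include <stdio.h>

        /* invented stand-in for struct xfs_log_item */
        struct item {
                int          mine;        /* models li_cb == xfs_iflush_done */
                int          logged;      /* models ili_logged && lsn match */
                struct item *next;        /* models li_bio_list */
        };

        /*
         * Unlink every item on *headp that is ours, push it onto a private
         * list, and count how many need the (expensive) AIL lock, all in
         * one pass, like the scan at the top of xfs_iflush_done().
         */
        static struct item *collect(struct item **headp, int *need_ail)
        {
                struct item *mine = NULL, *prev = NULL, *blip = *headp, *next;

                while (blip != NULL) {
                        if (!blip->mine) {              /* skip foreign items */
                                prev = blip;
                                blip = blip->next;
                                continue;
                        }
                        next = blip->next;              /* remove from buffer list */
                        if (!prev)
                                *headp = next;
                        else
                                prev->next = next;

                        blip->next = mine;              /* add to our list */
                        mine = blip;
                        if (blip->logged)               /* unlocked need-AIL check */
                                (*need_ail)++;
                        blip = next;
                }
                return mine;
        }

        int main(void)
        {
                struct item c = { 1, 1, NULL }, b = { 0, 0, &c }, a = { 1, 1, &b };
                struct item *head = &a;
                int need_ail = 0;

                collect(&head, &need_ail);
                printf("need_ail = %d, foreign item kept = %d\n",
                       need_ail, head == &b);
                return 0;
        }

With need_ail known, the real code sizes an on-stack array, rechecks each candidate under the AIL lock, and hands the survivors to xfs_trans_ail_delete_bulk() so the lock is taken once instead of once per inode.
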
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 20576146369f..55582bd66659 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -47,127 +47,8 @@
47 47
48#define XFS_WRITEIO_ALIGN(mp,off) (((off) >> mp->m_writeio_log) \ 48#define XFS_WRITEIO_ALIGN(mp,off) (((off) >> mp->m_writeio_log) \
49 << mp->m_writeio_log) 49 << mp->m_writeio_log)
50#define XFS_STRAT_WRITE_IMAPS 2
51#define XFS_WRITE_IMAPS XFS_BMAP_MAX_NMAP 50#define XFS_WRITE_IMAPS XFS_BMAP_MAX_NMAP
52 51
53STATIC int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t,
54 int, struct xfs_bmbt_irec *, int *);
55STATIC int xfs_iomap_write_delay(struct xfs_inode *, xfs_off_t, size_t, int,
56 struct xfs_bmbt_irec *, int *);
57STATIC int xfs_iomap_write_allocate(struct xfs_inode *, xfs_off_t, size_t,
58 struct xfs_bmbt_irec *, int *);
59
60int
61xfs_iomap(
62 struct xfs_inode *ip,
63 xfs_off_t offset,
64 ssize_t count,
65 int flags,
66 struct xfs_bmbt_irec *imap,
67 int *nimaps,
68 int *new)
69{
70 struct xfs_mount *mp = ip->i_mount;
71 xfs_fileoff_t offset_fsb, end_fsb;
72 int error = 0;
73 int lockmode = 0;
74 int bmapi_flags = 0;
75
76 ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG);
77
78 *new = 0;
79
80 if (XFS_FORCED_SHUTDOWN(mp))
81 return XFS_ERROR(EIO);
82
83 trace_xfs_iomap_enter(ip, offset, count, flags, NULL);
84
85 switch (flags & (BMAPI_READ | BMAPI_WRITE | BMAPI_ALLOCATE)) {
86 case BMAPI_READ:
87 lockmode = xfs_ilock_map_shared(ip);
88 bmapi_flags = XFS_BMAPI_ENTIRE;
89 break;
90 case BMAPI_WRITE:
91 lockmode = XFS_ILOCK_EXCL;
92 if (flags & BMAPI_IGNSTATE)
93 bmapi_flags |= XFS_BMAPI_IGSTATE|XFS_BMAPI_ENTIRE;
94 xfs_ilock(ip, lockmode);
95 break;
96 case BMAPI_ALLOCATE:
97 lockmode = XFS_ILOCK_SHARED;
98 bmapi_flags = XFS_BMAPI_ENTIRE;
99
100 /* Attempt non-blocking lock */
101 if (flags & BMAPI_TRYLOCK) {
102 if (!xfs_ilock_nowait(ip, lockmode))
103 return XFS_ERROR(EAGAIN);
104 } else {
105 xfs_ilock(ip, lockmode);
106 }
107 break;
108 default:
109 BUG();
110 }
111
112 ASSERT(offset <= mp->m_maxioffset);
113 if ((xfs_fsize_t)offset + count > mp->m_maxioffset)
114 count = mp->m_maxioffset - offset;
115 end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count);
116 offset_fsb = XFS_B_TO_FSBT(mp, offset);
117
118 error = xfs_bmapi(NULL, ip, offset_fsb,
119 (xfs_filblks_t)(end_fsb - offset_fsb),
120 bmapi_flags, NULL, 0, imap,
121 nimaps, NULL);
122
123 if (error)
124 goto out;
125
126 switch (flags & (BMAPI_WRITE|BMAPI_ALLOCATE)) {
127 case BMAPI_WRITE:
128 /* If we found an extent, return it */
129 if (*nimaps &&
130 (imap->br_startblock != HOLESTARTBLOCK) &&
131 (imap->br_startblock != DELAYSTARTBLOCK)) {
132 trace_xfs_iomap_found(ip, offset, count, flags, imap);
133 break;
134 }
135
136 if (flags & BMAPI_DIRECT) {
137 error = xfs_iomap_write_direct(ip, offset, count, flags,
138 imap, nimaps);
139 } else {
140 error = xfs_iomap_write_delay(ip, offset, count, flags,
141 imap, nimaps);
142 }
143 if (!error) {
144 trace_xfs_iomap_alloc(ip, offset, count, flags, imap);
145 }
146 *new = 1;
147 break;
148 case BMAPI_ALLOCATE:
149 /* If we found an extent, return it */
150 xfs_iunlock(ip, lockmode);
151 lockmode = 0;
152
153 if (*nimaps && !isnullstartblock(imap->br_startblock)) {
154 trace_xfs_iomap_found(ip, offset, count, flags, imap);
155 break;
156 }
157
158 error = xfs_iomap_write_allocate(ip, offset, count,
159 imap, nimaps);
160 break;
161 }
162
163 ASSERT(*nimaps <= 1);
164
165out:
166 if (lockmode)
167 xfs_iunlock(ip, lockmode);
168 return XFS_ERROR(error);
169}
170
171STATIC int 52STATIC int
172xfs_iomap_eof_align_last_fsb( 53xfs_iomap_eof_align_last_fsb(
173 xfs_mount_t *mp, 54 xfs_mount_t *mp,
@@ -236,14 +117,13 @@ xfs_cmn_err_fsblock_zero(
236 return EFSCORRUPTED; 117 return EFSCORRUPTED;
237} 118}
238 119
239STATIC int 120int
240xfs_iomap_write_direct( 121xfs_iomap_write_direct(
241 xfs_inode_t *ip, 122 xfs_inode_t *ip,
242 xfs_off_t offset, 123 xfs_off_t offset,
243 size_t count, 124 size_t count,
244 int flags,
245 xfs_bmbt_irec_t *imap, 125 xfs_bmbt_irec_t *imap,
246 int *nmaps) 126 int nmaps)
247{ 127{
248 xfs_mount_t *mp = ip->i_mount; 128 xfs_mount_t *mp = ip->i_mount;
249 xfs_fileoff_t offset_fsb; 129 xfs_fileoff_t offset_fsb;
@@ -279,7 +159,7 @@ xfs_iomap_write_direct(
279 if (error) 159 if (error)
280 goto error_out; 160 goto error_out;
281 } else { 161 } else {
282 if (*nmaps && (imap->br_startblock == HOLESTARTBLOCK)) 162 if (nmaps && (imap->br_startblock == HOLESTARTBLOCK))
283 last_fsb = MIN(last_fsb, (xfs_fileoff_t) 163 last_fsb = MIN(last_fsb, (xfs_fileoff_t)
284 imap->br_blockcount + 164 imap->br_blockcount +
285 imap->br_startoff); 165 imap->br_startoff);
@@ -331,7 +211,7 @@ xfs_iomap_write_direct(
331 xfs_trans_ijoin(tp, ip); 211 xfs_trans_ijoin(tp, ip);
332 212
333 bmapi_flag = XFS_BMAPI_WRITE; 213 bmapi_flag = XFS_BMAPI_WRITE;
334 if ((flags & BMAPI_DIRECT) && (offset < ip->i_size || extsz)) 214 if (offset < ip->i_size || extsz)
335 bmapi_flag |= XFS_BMAPI_PREALLOC; 215 bmapi_flag |= XFS_BMAPI_PREALLOC;
336 216
337 /* 217 /*
@@ -370,7 +250,6 @@ xfs_iomap_write_direct(
370 goto error_out; 250 goto error_out;
371 } 251 }
372 252
373 *nmaps = 1;
374 return 0; 253 return 0;
375 254
376error0: /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */ 255error0: /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */
@@ -379,7 +258,6 @@ error0: /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */
379 258
380error1: /* Just cancel transaction */ 259error1: /* Just cancel transaction */
381 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); 260 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
382 *nmaps = 0; /* nothing set-up here */
383 261
384error_out: 262error_out:
385 return XFS_ERROR(error); 263 return XFS_ERROR(error);
@@ -389,6 +267,9 @@ error_out:
389 * If the caller is doing a write at the end of the file, then extend the 267 * If the caller is doing a write at the end of the file, then extend the
390 * allocation out to the file system's write iosize. We clean up any extra 268 * allocation out to the file system's write iosize. We clean up any extra
391 * space left over when the file is closed in xfs_inactive(). 269 * space left over when the file is closed in xfs_inactive().
270 *
271 * If we find we already have delalloc preallocation beyond EOF, don't do more
 272 * preallocation as it is not needed.
392 */ 273 */
393STATIC int 274STATIC int
394xfs_iomap_eof_want_preallocate( 275xfs_iomap_eof_want_preallocate(
@@ -396,7 +277,6 @@ xfs_iomap_eof_want_preallocate(
396 xfs_inode_t *ip, 277 xfs_inode_t *ip,
397 xfs_off_t offset, 278 xfs_off_t offset,
398 size_t count, 279 size_t count,
399 int ioflag,
400 xfs_bmbt_irec_t *imap, 280 xfs_bmbt_irec_t *imap,
401 int nimaps, 281 int nimaps,
402 int *prealloc) 282 int *prealloc)
@@ -405,6 +285,7 @@ xfs_iomap_eof_want_preallocate(
405 xfs_filblks_t count_fsb; 285 xfs_filblks_t count_fsb;
406 xfs_fsblock_t firstblock; 286 xfs_fsblock_t firstblock;
407 int n, error, imaps; 287 int n, error, imaps;
288 int found_delalloc = 0;
408 289
409 *prealloc = 0; 290 *prealloc = 0;
410 if ((offset + count) <= ip->i_size) 291 if ((offset + count) <= ip->i_size)
@@ -429,20 +310,66 @@ xfs_iomap_eof_want_preallocate(
429 return 0; 310 return 0;
430 start_fsb += imap[n].br_blockcount; 311 start_fsb += imap[n].br_blockcount;
431 count_fsb -= imap[n].br_blockcount; 312 count_fsb -= imap[n].br_blockcount;
313
314 if (imap[n].br_startblock == DELAYSTARTBLOCK)
315 found_delalloc = 1;
432 } 316 }
433 } 317 }
434 *prealloc = 1; 318 if (!found_delalloc)
319 *prealloc = 1;
435 return 0; 320 return 0;
436} 321}
437 322
438STATIC int 323/*
324 * If we don't have a user specified preallocation size, dynamically increase
325 * the preallocation size as the size of the file grows. Cap the maximum size
326 * at a single extent or less if the filesystem is near full. The closer the
 327 * filesystem is to full, the smaller the maximum preallocation.
328 */
329STATIC xfs_fsblock_t
330xfs_iomap_prealloc_size(
331 struct xfs_mount *mp,
332 struct xfs_inode *ip)
333{
334 xfs_fsblock_t alloc_blocks = 0;
335
336 if (!(mp->m_flags & XFS_MOUNT_DFLT_IOSIZE)) {
337 int shift = 0;
338 int64_t freesp;
339
340 alloc_blocks = XFS_B_TO_FSB(mp, ip->i_size);
341 alloc_blocks = XFS_FILEOFF_MIN(MAXEXTLEN,
342 rounddown_pow_of_two(alloc_blocks));
343
344 xfs_icsb_sync_counters(mp, XFS_ICSB_LAZY_COUNT);
345 freesp = mp->m_sb.sb_fdblocks;
346 if (freesp < mp->m_low_space[XFS_LOWSP_5_PCNT]) {
347 shift = 2;
348 if (freesp < mp->m_low_space[XFS_LOWSP_4_PCNT])
349 shift++;
350 if (freesp < mp->m_low_space[XFS_LOWSP_3_PCNT])
351 shift++;
352 if (freesp < mp->m_low_space[XFS_LOWSP_2_PCNT])
353 shift++;
354 if (freesp < mp->m_low_space[XFS_LOWSP_1_PCNT])
355 shift++;
356 }
357 if (shift)
358 alloc_blocks >>= shift;
359 }
360
361 if (alloc_blocks < mp->m_writeio_blocks)
362 alloc_blocks = mp->m_writeio_blocks;
363
364 return alloc_blocks;
365}
366
367int
439xfs_iomap_write_delay( 368xfs_iomap_write_delay(
440 xfs_inode_t *ip, 369 xfs_inode_t *ip,
441 xfs_off_t offset, 370 xfs_off_t offset,
442 size_t count, 371 size_t count,
443 int ioflag, 372 xfs_bmbt_irec_t *ret_imap)
444 xfs_bmbt_irec_t *ret_imap,
445 int *nmaps)
446{ 373{
447 xfs_mount_t *mp = ip->i_mount; 374 xfs_mount_t *mp = ip->i_mount;
448 xfs_fileoff_t offset_fsb; 375 xfs_fileoff_t offset_fsb;
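
A worked model of xfs_iomap_prealloc_size(). The m_low_space[] thresholds are filled in elsewhere in this series (5, 4, 3, 2 and 1 percent of the filesystem, per the xfs_mount changes), so the numbers below are invented for the demonstration:

        #include <stdint.h>
        #include <stdio.h>

        #define MAXEXTLEN ((1 << 21) - 1)       /* max blocks in one extent */

        /* userspace stand-in for the kernel helper of the same name */
        static uint64_t rounddown_pow_of_two(uint64_t n)
        {
                uint64_t p = 1;

                while (p * 2 <= n)
                        p *= 2;
                return p;
        }

        /*
         * Start from the file size in blocks, cap at one extent, then halve
         * once per low-space threshold crossed; low[] holds the 5/4/3/2/1
         * percent marks, largest first.
         */
        static uint64_t prealloc_size(uint64_t isize_blocks, int64_t freesp,
                                      const int64_t low[5], uint64_t writeio_blocks)
        {
                uint64_t alloc_blocks = rounddown_pow_of_two(isize_blocks);
                int shift = 0;

                if (alloc_blocks > MAXEXTLEN)
                        alloc_blocks = MAXEXTLEN;

                if (freesp < low[0]) {          /* below 5 percent free */
                        shift = 2;
                        for (int i = 1; i < 5; i++)
                                if (freesp < low[i])
                                        shift++;
                }
                alloc_blocks >>= shift;

                if (alloc_blocks < writeio_blocks)
                        alloc_blocks = writeio_blocks;
                return alloc_blocks;
        }

        int main(void)
        {
                /* invented: 1M-block fs, 64k-block file, 16-block writeio */
                const int64_t low[5] = { 50000, 40000, 30000, 20000, 10000 };

                printf("plenty free: %llu blocks\n",
                       (unsigned long long)prealloc_size(65536, 600000, low, 16));
                printf("25k free:    %llu blocks\n",
                       (unsigned long long)prealloc_size(65536, 25000, low, 16));
                return 0;
        }

The first call preallocates the full 65536 blocks; the second, with free space below the 3 percent mark, shifts right by four and preallocates 4096.
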
@@ -469,16 +396,19 @@ xfs_iomap_write_delay(
469 extsz = xfs_get_extsz_hint(ip); 396 extsz = xfs_get_extsz_hint(ip);
470 offset_fsb = XFS_B_TO_FSBT(mp, offset); 397 offset_fsb = XFS_B_TO_FSBT(mp, offset);
471 398
399
472 error = xfs_iomap_eof_want_preallocate(mp, ip, offset, count, 400 error = xfs_iomap_eof_want_preallocate(mp, ip, offset, count,
473 ioflag, imap, XFS_WRITE_IMAPS, &prealloc); 401 imap, XFS_WRITE_IMAPS, &prealloc);
474 if (error) 402 if (error)
475 return error; 403 return error;
476 404
477retry: 405retry:
478 if (prealloc) { 406 if (prealloc) {
407 xfs_fsblock_t alloc_blocks = xfs_iomap_prealloc_size(mp, ip);
408
479 aligned_offset = XFS_WRITEIO_ALIGN(mp, (offset + count - 1)); 409 aligned_offset = XFS_WRITEIO_ALIGN(mp, (offset + count - 1));
480 ioalign = XFS_B_TO_FSBT(mp, aligned_offset); 410 ioalign = XFS_B_TO_FSBT(mp, aligned_offset);
481 last_fsb = ioalign + mp->m_writeio_blocks; 411 last_fsb = ioalign + alloc_blocks;
482 } else { 412 } else {
483 last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count))); 413 last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count)));
484 } 414 }
@@ -496,22 +426,31 @@ retry:
496 XFS_BMAPI_DELAY | XFS_BMAPI_WRITE | 426 XFS_BMAPI_DELAY | XFS_BMAPI_WRITE |
497 XFS_BMAPI_ENTIRE, &firstblock, 1, imap, 427 XFS_BMAPI_ENTIRE, &firstblock, 1, imap,
498 &nimaps, NULL); 428 &nimaps, NULL);
499 if (error && (error != ENOSPC)) 429 switch (error) {
430 case 0:
431 case ENOSPC:
432 case EDQUOT:
433 break;
434 default:
500 return XFS_ERROR(error); 435 return XFS_ERROR(error);
436 }
501 437
502 /* 438 /*
503 * If bmapi returned us nothing, and if we didn't get back EDQUOT, 439 * If bmapi returned us nothing, we got either ENOSPC or EDQUOT. For
 504 * then we must have run out of space - flush all other inodes with 440 * ENOSPC, flush all other inodes with delalloc blocks to free up
505 * delalloc blocks and retry without EOF preallocation. 441 * some of the excess reserved metadata space. For both cases, retry
442 * without EOF preallocation.
506 */ 443 */
507 if (nimaps == 0) { 444 if (nimaps == 0) {
508 trace_xfs_delalloc_enospc(ip, offset, count); 445 trace_xfs_delalloc_enospc(ip, offset, count);
509 if (flushed) 446 if (flushed)
510 return XFS_ERROR(ENOSPC); 447 return XFS_ERROR(error ? error : ENOSPC);
511 448
512 xfs_iunlock(ip, XFS_ILOCK_EXCL); 449 if (error == ENOSPC) {
513 xfs_flush_inodes(ip); 450 xfs_iunlock(ip, XFS_ILOCK_EXCL);
514 xfs_ilock(ip, XFS_ILOCK_EXCL); 451 xfs_flush_inodes(ip);
452 xfs_ilock(ip, XFS_ILOCK_EXCL);
453 }
515 454
516 flushed = 1; 455 flushed = 1;
517 error = 0; 456 error = 0;
@@ -523,8 +462,6 @@ retry:
523 return xfs_cmn_err_fsblock_zero(ip, &imap[0]); 462 return xfs_cmn_err_fsblock_zero(ip, &imap[0]);
524 463
525 *ret_imap = imap[0]; 464 *ret_imap = imap[0];
526 *nmaps = 1;
527
528 return 0; 465 return 0;
529} 466}
530 467
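
The retry policy above (accept 0, ENOSPC and EDQUOT from the allocation, flush other delalloc inodes only for ENOSPC, and retry exactly once without EOF preallocation) reduces to a small state machine. A runnable model, with a fake allocator standing in for xfs_bmapi():

        #include <errno.h>
        #include <stdio.h>

        /* fake allocator: pretend the quota cannot cover the preallocation */
        static int try_alloc(int prealloc, int *nmaps)
        {
                if (prealloc) {
                        *nmaps = 0;
                        return EDQUOT;
                }
                *nmaps = 1;
                return 0;
        }

        /* control flow modelled on the retry loop in xfs_iomap_write_delay() */
        static int write_delay(void)
        {
                int flushed = 0, prealloc = 1;
                int error, nmaps;

        retry:
                error = try_alloc(prealloc, &nmaps);
                switch (error) {
                case 0:
                case ENOSPC:
                case EDQUOT:
                        break;                  /* fall through to the nmaps check */
                default:
                        return error;           /* anything else is fatal */
                }

                if (nmaps == 0) {
                        if (flushed)            /* second failure: give up */
                                return error ? error : ENOSPC;
                        if (error == ENOSPC)
                                puts("flushing other delalloc inodes");
                        flushed = 1;
                        error = 0;
                        prealloc = 0;           /* retry without EOF preallocation */
                        goto retry;
                }
                return 0;
        }

        int main(void)
        {
                printf("write_delay() = %d\n", write_delay());
                return 0;
        }
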
@@ -538,13 +475,12 @@ retry:
538 * We no longer bother to look at the incoming map - all we have to 475 * We no longer bother to look at the incoming map - all we have to
539 * guarantee is that whatever we allocate fills the required range. 476 * guarantee is that whatever we allocate fills the required range.
540 */ 477 */
541STATIC int 478int
542xfs_iomap_write_allocate( 479xfs_iomap_write_allocate(
543 xfs_inode_t *ip, 480 xfs_inode_t *ip,
544 xfs_off_t offset, 481 xfs_off_t offset,
545 size_t count, 482 size_t count,
546 xfs_bmbt_irec_t *imap, 483 xfs_bmbt_irec_t *imap)
547 int *retmap)
548{ 484{
549 xfs_mount_t *mp = ip->i_mount; 485 xfs_mount_t *mp = ip->i_mount;
550 xfs_fileoff_t offset_fsb, last_block; 486 xfs_fileoff_t offset_fsb, last_block;
@@ -557,8 +493,6 @@ xfs_iomap_write_allocate(
557 int error = 0; 493 int error = 0;
558 int nres; 494 int nres;
559 495
560 *retmap = 0;
561
562 /* 496 /*
563 * Make sure that the dquots are there. 497 * Make sure that the dquots are there.
564 */ 498 */
@@ -680,7 +614,6 @@ xfs_iomap_write_allocate(
680 if ((offset_fsb >= imap->br_startoff) && 614 if ((offset_fsb >= imap->br_startoff) &&
681 (offset_fsb < (imap->br_startoff + 615 (offset_fsb < (imap->br_startoff +
682 imap->br_blockcount))) { 616 imap->br_blockcount))) {
683 *retmap = 1;
684 XFS_STATS_INC(xs_xstrat_quick); 617 XFS_STATS_INC(xs_xstrat_quick);
685 return 0; 618 return 0;
686 } 619 }
diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
index 7748a430f50d..80615760959a 100644
--- a/fs/xfs/xfs_iomap.h
+++ b/fs/xfs/xfs_iomap.h
@@ -18,30 +18,15 @@
18#ifndef __XFS_IOMAP_H__ 18#ifndef __XFS_IOMAP_H__
19#define __XFS_IOMAP_H__ 19#define __XFS_IOMAP_H__
20 20
21/* base extent manipulation calls */
22#define BMAPI_READ (1 << 0) /* read extents */
23#define BMAPI_WRITE (1 << 1) /* create extents */
24#define BMAPI_ALLOCATE (1 << 2) /* delayed allocate to real extents */
25
26/* modifiers */
27#define BMAPI_IGNSTATE (1 << 4) /* ignore unwritten state on read */
28#define BMAPI_DIRECT (1 << 5) /* direct instead of buffered write */
29#define BMAPI_MMA (1 << 6) /* allocate for mmap write */
30#define BMAPI_TRYLOCK (1 << 7) /* non-blocking request */
31
32#define BMAPI_FLAGS \
33 { BMAPI_READ, "READ" }, \
34 { BMAPI_WRITE, "WRITE" }, \
35 { BMAPI_ALLOCATE, "ALLOCATE" }, \
36 { BMAPI_IGNSTATE, "IGNSTATE" }, \
37 { BMAPI_DIRECT, "DIRECT" }, \
38 { BMAPI_TRYLOCK, "TRYLOCK" }
39
40struct xfs_inode; 21struct xfs_inode;
41struct xfs_bmbt_irec; 22struct xfs_bmbt_irec;
42 23
43extern int xfs_iomap(struct xfs_inode *, xfs_off_t, ssize_t, int, 24extern int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t,
44 struct xfs_bmbt_irec *, int *, int *); 25 struct xfs_bmbt_irec *, int);
26extern int xfs_iomap_write_delay(struct xfs_inode *, xfs_off_t, size_t,
27 struct xfs_bmbt_irec *);
28extern int xfs_iomap_write_allocate(struct xfs_inode *, xfs_off_t, size_t,
29 struct xfs_bmbt_irec *);
45extern int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, size_t); 30extern int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, size_t);
46 31
47#endif /* __XFS_IOMAP_H__*/ 32#endif /* __XFS_IOMAP_H__*/
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index cee4ab9f8a9e..ae6fef1ff563 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -47,7 +47,7 @@ STATIC xlog_t * xlog_alloc_log(xfs_mount_t *mp,
47 xfs_buftarg_t *log_target, 47 xfs_buftarg_t *log_target,
48 xfs_daddr_t blk_offset, 48 xfs_daddr_t blk_offset,
49 int num_bblks); 49 int num_bblks);
50STATIC int xlog_space_left(xlog_t *log, int cycle, int bytes); 50STATIC int xlog_space_left(struct log *log, atomic64_t *head);
51STATIC int xlog_sync(xlog_t *log, xlog_in_core_t *iclog); 51STATIC int xlog_sync(xlog_t *log, xlog_in_core_t *iclog);
52STATIC void xlog_dealloc_log(xlog_t *log); 52STATIC void xlog_dealloc_log(xlog_t *log);
53 53
@@ -70,7 +70,7 @@ STATIC void xlog_state_want_sync(xlog_t *log, xlog_in_core_t *iclog);
70/* local functions to manipulate grant head */ 70/* local functions to manipulate grant head */
71STATIC int xlog_grant_log_space(xlog_t *log, 71STATIC int xlog_grant_log_space(xlog_t *log,
72 xlog_ticket_t *xtic); 72 xlog_ticket_t *xtic);
73STATIC void xlog_grant_push_ail(xfs_mount_t *mp, 73STATIC void xlog_grant_push_ail(struct log *log,
74 int need_bytes); 74 int need_bytes);
75STATIC void xlog_regrant_reserve_log_space(xlog_t *log, 75STATIC void xlog_regrant_reserve_log_space(xlog_t *log,
76 xlog_ticket_t *ticket); 76 xlog_ticket_t *ticket);
@@ -81,98 +81,73 @@ STATIC void xlog_ungrant_log_space(xlog_t *log,
81 81
82#if defined(DEBUG) 82#if defined(DEBUG)
83STATIC void xlog_verify_dest_ptr(xlog_t *log, char *ptr); 83STATIC void xlog_verify_dest_ptr(xlog_t *log, char *ptr);
84STATIC void xlog_verify_grant_head(xlog_t *log, int equals); 84STATIC void xlog_verify_grant_tail(struct log *log);
85STATIC void xlog_verify_iclog(xlog_t *log, xlog_in_core_t *iclog, 85STATIC void xlog_verify_iclog(xlog_t *log, xlog_in_core_t *iclog,
86 int count, boolean_t syncing); 86 int count, boolean_t syncing);
87STATIC void xlog_verify_tail_lsn(xlog_t *log, xlog_in_core_t *iclog, 87STATIC void xlog_verify_tail_lsn(xlog_t *log, xlog_in_core_t *iclog,
88 xfs_lsn_t tail_lsn); 88 xfs_lsn_t tail_lsn);
89#else 89#else
90#define xlog_verify_dest_ptr(a,b) 90#define xlog_verify_dest_ptr(a,b)
91#define xlog_verify_grant_head(a,b) 91#define xlog_verify_grant_tail(a)
92#define xlog_verify_iclog(a,b,c,d) 92#define xlog_verify_iclog(a,b,c,d)
93#define xlog_verify_tail_lsn(a,b,c) 93#define xlog_verify_tail_lsn(a,b,c)
94#endif 94#endif
95 95
96STATIC int xlog_iclogs_empty(xlog_t *log); 96STATIC int xlog_iclogs_empty(xlog_t *log);
97 97
98
99static void 98static void
100xlog_ins_ticketq(struct xlog_ticket **qp, struct xlog_ticket *tic) 99xlog_grant_sub_space(
100 struct log *log,
101 atomic64_t *head,
102 int bytes)
101{ 103{
102 if (*qp) { 104 int64_t head_val = atomic64_read(head);
103 tic->t_next = (*qp); 105 int64_t new, old;
104 tic->t_prev = (*qp)->t_prev;
105 (*qp)->t_prev->t_next = tic;
106 (*qp)->t_prev = tic;
107 } else {
108 tic->t_prev = tic->t_next = tic;
109 *qp = tic;
110 }
111 106
112 tic->t_flags |= XLOG_TIC_IN_Q; 107 do {
113} 108 int cycle, space;
114 109
115static void 110 xlog_crack_grant_head_val(head_val, &cycle, &space);
116xlog_del_ticketq(struct xlog_ticket **qp, struct xlog_ticket *tic)
117{
118 if (tic == tic->t_next) {
119 *qp = NULL;
120 } else {
121 *qp = tic->t_next;
122 tic->t_next->t_prev = tic->t_prev;
123 tic->t_prev->t_next = tic->t_next;
124 }
125 111
126 tic->t_next = tic->t_prev = NULL; 112 space -= bytes;
127 tic->t_flags &= ~XLOG_TIC_IN_Q; 113 if (space < 0) {
114 space += log->l_logsize;
115 cycle--;
116 }
117
118 old = head_val;
119 new = xlog_assign_grant_head_val(cycle, space);
120 head_val = atomic64_cmpxchg(head, old, new);
121 } while (head_val != old);
128} 122}
129 123
130static void 124static void
131xlog_grant_sub_space(struct log *log, int bytes) 125xlog_grant_add_space(
126 struct log *log,
127 atomic64_t *head,
128 int bytes)
132{ 129{
133 log->l_grant_write_bytes -= bytes; 130 int64_t head_val = atomic64_read(head);
134 if (log->l_grant_write_bytes < 0) { 131 int64_t new, old;
135 log->l_grant_write_bytes += log->l_logsize;
136 log->l_grant_write_cycle--;
137 }
138
139 log->l_grant_reserve_bytes -= bytes;
140 if ((log)->l_grant_reserve_bytes < 0) {
141 log->l_grant_reserve_bytes += log->l_logsize;
142 log->l_grant_reserve_cycle--;
143 }
144 132
145} 133 do {
134 int tmp;
135 int cycle, space;
146 136
147static void 137 xlog_crack_grant_head_val(head_val, &cycle, &space);
148xlog_grant_add_space_write(struct log *log, int bytes)
149{
150 int tmp = log->l_logsize - log->l_grant_write_bytes;
151 if (tmp > bytes)
152 log->l_grant_write_bytes += bytes;
153 else {
154 log->l_grant_write_cycle++;
155 log->l_grant_write_bytes = bytes - tmp;
156 }
157}
158 138
159static void 139 tmp = log->l_logsize - space;
160xlog_grant_add_space_reserve(struct log *log, int bytes) 140 if (tmp > bytes)
161{ 141 space += bytes;
162 int tmp = log->l_logsize - log->l_grant_reserve_bytes; 142 else {
163 if (tmp > bytes) 143 space = bytes - tmp;
164 log->l_grant_reserve_bytes += bytes; 144 cycle++;
165 else { 145 }
166 log->l_grant_reserve_cycle++;
167 log->l_grant_reserve_bytes = bytes - tmp;
168 }
169}
170 146
171static inline void 147 old = head_val;
172xlog_grant_add_space(struct log *log, int bytes) 148 new = xlog_assign_grant_head_val(cycle, space);
173{ 149 head_val = atomic64_cmpxchg(head, old, new);
174 xlog_grant_add_space_write(log, bytes); 150 } while (head_val != old);
175 xlog_grant_add_space_reserve(log, bytes);
176} 151}
177 152
178static void 153static void
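
Both helpers above follow the same lock-free pattern: crack the packed 64-bit head, adjust (cycle, space) with wrap handling, and retry the cmpxchg until no one else raced. A userspace model using C11 atomics; the packing (cycle in the high 32 bits, byte count in the low 32) is assumed to match xlog_assign_grant_head_val()/xlog_crack_grant_head_val(), which are defined elsewhere in this series:

        #include <stdatomic.h>
        #include <stdint.h>
        #include <stdio.h>

        #define LOGSIZE (16 * 1024 * 1024)      /* invented 16MB log */

        /* assumed packing: cycle in the high 32 bits, bytes in the low 32 */
        static void crack(int64_t val, int *cycle, int *space)
        {
                *cycle = (int)(val >> 32);
                *space = (int)(val & 0xffffffff);
        }

        static int64_t assign(int cycle, int space)
        {
                return ((int64_t)cycle << 32) | (uint32_t)space;
        }

        /* lock-free analogue of xlog_grant_add_space(): loop until cmpxchg wins */
        static void grant_add_space(_Atomic int64_t *head, int bytes)
        {
                int64_t head_val = atomic_load(head);
                int64_t new;

                do {
                        int cycle, space, tmp;

                        crack(head_val, &cycle, &space);
                        tmp = LOGSIZE - space;
                        if (tmp > bytes) {
                                space += bytes;
                        } else {
                                space = bytes - tmp;    /* wrapped past the end */
                                cycle++;                /* so bump the cycle */
                        }
                        new = assign(cycle, space);
                        /* on failure, head_val is reloaded with the current value */
                } while (!atomic_compare_exchange_weak(head, &head_val, new));
        }

        int main(void)
        {
                _Atomic int64_t head = assign(1, LOGSIZE - 100);
                int cycle, space;

                grant_add_space(&head, 512);    /* 100 bytes left: forces a wrap */
                crack(atomic_load(&head), &cycle, &space);
                printf("cycle %d, space %d\n", cycle, space);  /* cycle 2, space 412 */
                return 0;
        }
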
@@ -355,7 +330,7 @@ xfs_log_reserve(
355 330
356 trace_xfs_log_reserve(log, internal_ticket); 331 trace_xfs_log_reserve(log, internal_ticket);
357 332
358 xlog_grant_push_ail(mp, internal_ticket->t_unit_res); 333 xlog_grant_push_ail(log, internal_ticket->t_unit_res);
359 retval = xlog_regrant_write_log_space(log, internal_ticket); 334 retval = xlog_regrant_write_log_space(log, internal_ticket);
360 } else { 335 } else {
361 /* may sleep if need to allocate more tickets */ 336 /* may sleep if need to allocate more tickets */
@@ -369,7 +344,7 @@ xfs_log_reserve(
369 344
370 trace_xfs_log_reserve(log, internal_ticket); 345 trace_xfs_log_reserve(log, internal_ticket);
371 346
372 xlog_grant_push_ail(mp, 347 xlog_grant_push_ail(log,
373 (internal_ticket->t_unit_res * 348 (internal_ticket->t_unit_res *
374 internal_ticket->t_cnt)); 349 internal_ticket->t_cnt));
375 retval = xlog_grant_log_space(log, internal_ticket); 350 retval = xlog_grant_log_space(log, internal_ticket);
@@ -402,7 +377,7 @@ xfs_log_mount(
402 cmn_err(CE_NOTE, "XFS mounting filesystem %s", mp->m_fsname); 377 cmn_err(CE_NOTE, "XFS mounting filesystem %s", mp->m_fsname);
403 else { 378 else {
404 cmn_err(CE_NOTE, 379 cmn_err(CE_NOTE,
405 "!Mounting filesystem \"%s\" in no-recovery mode. Filesystem will be inconsistent.", 380 "Mounting filesystem \"%s\" in no-recovery mode. Filesystem will be inconsistent.",
406 mp->m_fsname); 381 mp->m_fsname);
407 ASSERT(mp->m_flags & XFS_MOUNT_RDONLY); 382 ASSERT(mp->m_flags & XFS_MOUNT_RDONLY);
408 } 383 }
@@ -584,8 +559,8 @@ xfs_log_unmount_write(xfs_mount_t *mp)
584 if (!(iclog->ic_state == XLOG_STATE_ACTIVE || 559 if (!(iclog->ic_state == XLOG_STATE_ACTIVE ||
585 iclog->ic_state == XLOG_STATE_DIRTY)) { 560 iclog->ic_state == XLOG_STATE_DIRTY)) {
586 if (!XLOG_FORCED_SHUTDOWN(log)) { 561 if (!XLOG_FORCED_SHUTDOWN(log)) {
587 sv_wait(&iclog->ic_force_wait, PMEM, 562 xlog_wait(&iclog->ic_force_wait,
588 &log->l_icloglock, s); 563 &log->l_icloglock);
589 } else { 564 } else {
590 spin_unlock(&log->l_icloglock); 565 spin_unlock(&log->l_icloglock);
591 } 566 }
@@ -625,8 +600,8 @@ xfs_log_unmount_write(xfs_mount_t *mp)
625 || iclog->ic_state == XLOG_STATE_DIRTY 600 || iclog->ic_state == XLOG_STATE_DIRTY
626 || iclog->ic_state == XLOG_STATE_IOERROR) ) { 601 || iclog->ic_state == XLOG_STATE_IOERROR) ) {
627 602
628 sv_wait(&iclog->ic_force_wait, PMEM, 603 xlog_wait(&iclog->ic_force_wait,
629 &log->l_icloglock, s); 604 &log->l_icloglock);
630 } else { 605 } else {
631 spin_unlock(&log->l_icloglock); 606 spin_unlock(&log->l_icloglock);
632 } 607 }
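
sv_wait() both dropped the given lock and slept; its replacement xlog_wait() (a small helper added elsewhere in this series, presumably in xfs_log_priv.h) does the same with a plain wait_queue_head_t, and notably returns without retaking the lock. In userspace the closest analogue of the atomically-release-and-sleep contract is pthread_cond_wait(), which differs only in reacquiring the mutex before returning:

        #include <pthread.h>
        #include <stdio.h>
        #include <unistd.h>

        static pthread_mutex_t grant_lock = PTHREAD_MUTEX_INITIALIZER;
        static pthread_cond_t  t_wait     = PTHREAD_COND_INITIALIZER;
        static int free_bytes;                  /* protected by grant_lock */

        static void *waiter(void *arg)
        {
                pthread_mutex_lock(&grant_lock);
                while (free_bytes < 1024) {
                        /*
                         * Like xlog_wait(): atomically drop grant_lock and
                         * sleep.  Unlike the kernel helper, the mutex is
                         * reacquired before pthread_cond_wait() returns.
                         */
                        pthread_cond_wait(&t_wait, &grant_lock);
                }
                printf("waiter: got %d bytes\n", free_bytes);
                pthread_mutex_unlock(&grant_lock);
                return arg;
        }

        int main(void)
        {
                pthread_t tid;

                pthread_create(&tid, NULL, waiter, NULL);
                sleep(1);                       /* let the waiter block */

                pthread_mutex_lock(&grant_lock);
                free_bytes = 4096;              /* the log tail moved */
                pthread_mutex_unlock(&grant_lock);
                pthread_cond_signal(&t_wait);   /* like wake_up(&tic->t_wait) */

                pthread_join(tid, NULL);
                return 0;
        }

Compile with -pthread.
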
@@ -703,55 +678,46 @@ xfs_log_move_tail(xfs_mount_t *mp,
703{ 678{
704 xlog_ticket_t *tic; 679 xlog_ticket_t *tic;
705 xlog_t *log = mp->m_log; 680 xlog_t *log = mp->m_log;
706 int need_bytes, free_bytes, cycle, bytes; 681 int need_bytes, free_bytes;
707 682
708 if (XLOG_FORCED_SHUTDOWN(log)) 683 if (XLOG_FORCED_SHUTDOWN(log))
709 return; 684 return;
710 685
711 if (tail_lsn == 0) { 686 if (tail_lsn == 0)
712 /* needed since sync_lsn is 64 bits */ 687 tail_lsn = atomic64_read(&log->l_last_sync_lsn);
713 spin_lock(&log->l_icloglock);
714 tail_lsn = log->l_last_sync_lsn;
715 spin_unlock(&log->l_icloglock);
716 }
717
718 spin_lock(&log->l_grant_lock);
719 688
720 /* Also an invalid lsn. 1 implies that we aren't passing in a valid 689 /* tail_lsn == 1 implies that we weren't passed a valid value. */
721 * tail_lsn. 690 if (tail_lsn != 1)
722 */ 691 atomic64_set(&log->l_tail_lsn, tail_lsn);
723 if (tail_lsn != 1) {
724 log->l_tail_lsn = tail_lsn;
725 }
726 692
727 if ((tic = log->l_write_headq)) { 693 if (!list_empty_careful(&log->l_writeq)) {
728#ifdef DEBUG 694#ifdef DEBUG
729 if (log->l_flags & XLOG_ACTIVE_RECOVERY) 695 if (log->l_flags & XLOG_ACTIVE_RECOVERY)
730 panic("Recovery problem"); 696 panic("Recovery problem");
731#endif 697#endif
732 cycle = log->l_grant_write_cycle; 698 spin_lock(&log->l_grant_write_lock);
733 bytes = log->l_grant_write_bytes; 699 free_bytes = xlog_space_left(log, &log->l_grant_write_head);
734 free_bytes = xlog_space_left(log, cycle, bytes); 700 list_for_each_entry(tic, &log->l_writeq, t_queue) {
735 do {
736 ASSERT(tic->t_flags & XLOG_TIC_PERM_RESERV); 701 ASSERT(tic->t_flags & XLOG_TIC_PERM_RESERV);
737 702
738 if (free_bytes < tic->t_unit_res && tail_lsn != 1) 703 if (free_bytes < tic->t_unit_res && tail_lsn != 1)
739 break; 704 break;
740 tail_lsn = 0; 705 tail_lsn = 0;
741 free_bytes -= tic->t_unit_res; 706 free_bytes -= tic->t_unit_res;
742 sv_signal(&tic->t_wait); 707 trace_xfs_log_regrant_write_wake_up(log, tic);
743 tic = tic->t_next; 708 wake_up(&tic->t_wait);
744 } while (tic != log->l_write_headq); 709 }
710 spin_unlock(&log->l_grant_write_lock);
745 } 711 }
746 if ((tic = log->l_reserve_headq)) { 712
713 if (!list_empty_careful(&log->l_reserveq)) {
747#ifdef DEBUG 714#ifdef DEBUG
748 if (log->l_flags & XLOG_ACTIVE_RECOVERY) 715 if (log->l_flags & XLOG_ACTIVE_RECOVERY)
749 panic("Recovery problem"); 716 panic("Recovery problem");
750#endif 717#endif
751 cycle = log->l_grant_reserve_cycle; 718 spin_lock(&log->l_grant_reserve_lock);
752 bytes = log->l_grant_reserve_bytes; 719 free_bytes = xlog_space_left(log, &log->l_grant_reserve_head);
753 free_bytes = xlog_space_left(log, cycle, bytes); 720 list_for_each_entry(tic, &log->l_reserveq, t_queue) {
754 do {
755 if (tic->t_flags & XLOG_TIC_PERM_RESERV) 721 if (tic->t_flags & XLOG_TIC_PERM_RESERV)
756 need_bytes = tic->t_unit_res*tic->t_cnt; 722 need_bytes = tic->t_unit_res*tic->t_cnt;
757 else 723 else
@@ -760,12 +726,12 @@ xfs_log_move_tail(xfs_mount_t *mp,
760 break; 726 break;
761 tail_lsn = 0; 727 tail_lsn = 0;
762 free_bytes -= need_bytes; 728 free_bytes -= need_bytes;
763 sv_signal(&tic->t_wait); 729 trace_xfs_log_grant_wake_up(log, tic);
764 tic = tic->t_next; 730 wake_up(&tic->t_wait);
765 } while (tic != log->l_reserve_headq); 731 }
732 spin_unlock(&log->l_grant_reserve_lock);
766 } 733 }
767 spin_unlock(&log->l_grant_lock); 734}
768} /* xfs_log_move_tail */
769 735
770/* 736/*
771 * Determine if we have a transaction that has gone to disk 737 * Determine if we have a transaction that has gone to disk
@@ -831,23 +797,19 @@ xfs_log_need_covered(xfs_mount_t *mp)
831 * We may be holding the log iclog lock upon entering this routine. 797 * We may be holding the log iclog lock upon entering this routine.
832 */ 798 */
833xfs_lsn_t 799xfs_lsn_t
834xlog_assign_tail_lsn(xfs_mount_t *mp) 800xlog_assign_tail_lsn(
801 struct xfs_mount *mp)
835{ 802{
836 xfs_lsn_t tail_lsn; 803 xfs_lsn_t tail_lsn;
837 xlog_t *log = mp->m_log; 804 struct log *log = mp->m_log;
838 805
839 tail_lsn = xfs_trans_ail_tail(mp->m_ail); 806 tail_lsn = xfs_trans_ail_tail(mp->m_ail);
840 spin_lock(&log->l_grant_lock); 807 if (!tail_lsn)
841 if (tail_lsn != 0) { 808 tail_lsn = atomic64_read(&log->l_last_sync_lsn);
842 log->l_tail_lsn = tail_lsn;
843 } else {
844 tail_lsn = log->l_tail_lsn = log->l_last_sync_lsn;
845 }
846 spin_unlock(&log->l_grant_lock);
847 809
810 atomic64_set(&log->l_tail_lsn, tail_lsn);
848 return tail_lsn; 811 return tail_lsn;
849} /* xlog_assign_tail_lsn */ 812}
850
851 813
852/* 814/*
853 * Return the space in the log between the tail and the head. The head 815 * Return the space in the log between the tail and the head. The head
@@ -864,21 +826,26 @@ xlog_assign_tail_lsn(xfs_mount_t *mp)
864 * result is that we return the size of the log as the amount of space left. 826 * result is that we return the size of the log as the amount of space left.
865 */ 827 */
866STATIC int 828STATIC int
867xlog_space_left(xlog_t *log, int cycle, int bytes) 829xlog_space_left(
868{ 830 struct log *log,
869 int free_bytes; 831 atomic64_t *head)
870 int tail_bytes; 832{
871 int tail_cycle; 833 int free_bytes;
872 834 int tail_bytes;
873 tail_bytes = BBTOB(BLOCK_LSN(log->l_tail_lsn)); 835 int tail_cycle;
874 tail_cycle = CYCLE_LSN(log->l_tail_lsn); 836 int head_cycle;
875 if ((tail_cycle == cycle) && (bytes >= tail_bytes)) { 837 int head_bytes;
876 free_bytes = log->l_logsize - (bytes - tail_bytes); 838
877 } else if ((tail_cycle + 1) < cycle) { 839 xlog_crack_grant_head(head, &head_cycle, &head_bytes);
840 xlog_crack_atomic_lsn(&log->l_tail_lsn, &tail_cycle, &tail_bytes);
841 tail_bytes = BBTOB(tail_bytes);
842 if (tail_cycle == head_cycle && head_bytes >= tail_bytes)
843 free_bytes = log->l_logsize - (head_bytes - tail_bytes);
844 else if (tail_cycle + 1 < head_cycle)
878 return 0; 845 return 0;
879 } else if (tail_cycle < cycle) { 846 else if (tail_cycle < head_cycle) {
880 ASSERT(tail_cycle == (cycle - 1)); 847 ASSERT(tail_cycle == (head_cycle - 1));
881 free_bytes = tail_bytes - bytes; 848 free_bytes = tail_bytes - head_bytes;
882 } else { 849 } else {
883 /* 850 /*
884 * The reservation head is behind the tail. 851 * The reservation head is behind the tail.
@@ -889,12 +856,12 @@ xlog_space_left(xlog_t *log, int cycle, int bytes)
889 "xlog_space_left: head behind tail\n" 856 "xlog_space_left: head behind tail\n"
890 " tail_cycle = %d, tail_bytes = %d\n" 857 " tail_cycle = %d, tail_bytes = %d\n"
891 " GH cycle = %d, GH bytes = %d", 858 " GH cycle = %d, GH bytes = %d",
892 tail_cycle, tail_bytes, cycle, bytes); 859 tail_cycle, tail_bytes, head_cycle, head_bytes);
893 ASSERT(0); 860 ASSERT(0);
894 free_bytes = log->l_logsize; 861 free_bytes = log->l_logsize;
895 } 862 }
896 return free_bytes; 863 return free_bytes;
897} /* xlog_space_left */ 864}
898 865
899 866
900/* 867/*
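
The three live cases of xlog_space_left(), head and tail in the same cycle, head one cycle ahead, and head having lapped the tail, are easy to check with concrete numbers (the fourth case, head behind tail, is the corruption path that trips the ASSERT). A runnable model with head and tail already cracked into (cycle, bytes):

        #include <stdio.h>

        #define LOGSIZE (8 * 1024 * 1024)       /* invented 8MB log */

        static int space_left(int tail_cycle, int tail_bytes,
                              int head_cycle, int head_bytes)
        {
                if (tail_cycle == head_cycle && head_bytes >= tail_bytes)
                        return LOGSIZE - (head_bytes - tail_bytes);
                if (tail_cycle + 1 < head_cycle)
                        return 0;               /* head lapped the tail: full */
                /* head one cycle ahead: free space is the gap up to the tail */
                return tail_bytes - head_bytes;
        }

        int main(void)
        {
                printf("same cycle: %d\n", space_left(5, 1 << 20, 5, 3 << 20));
                printf("one ahead:  %d\n", space_left(5, 6 << 20, 6, 1 << 20));
                printf("lapped:     %d\n", space_left(5, 1 << 20, 7, 0));
                return 0;
        }
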
@@ -1047,12 +1014,16 @@ xlog_alloc_log(xfs_mount_t *mp,
1047 log->l_flags |= XLOG_ACTIVE_RECOVERY; 1014 log->l_flags |= XLOG_ACTIVE_RECOVERY;
1048 1015
1049 log->l_prev_block = -1; 1016 log->l_prev_block = -1;
1050 log->l_tail_lsn = xlog_assign_lsn(1, 0);
1051 /* log->l_tail_lsn = 0x100000000LL; cycle = 1; current block = 0 */ 1017 /* log->l_tail_lsn = 0x100000000LL; cycle = 1; current block = 0 */
1052 log->l_last_sync_lsn = log->l_tail_lsn; 1018 xlog_assign_atomic_lsn(&log->l_tail_lsn, 1, 0);
1019 xlog_assign_atomic_lsn(&log->l_last_sync_lsn, 1, 0);
1053 log->l_curr_cycle = 1; /* 0 is bad since this is initial value */ 1020 log->l_curr_cycle = 1; /* 0 is bad since this is initial value */
1054 log->l_grant_reserve_cycle = 1; 1021 xlog_assign_grant_head(&log->l_grant_reserve_head, 1, 0);
1055 log->l_grant_write_cycle = 1; 1022 xlog_assign_grant_head(&log->l_grant_write_head, 1, 0);
1023 INIT_LIST_HEAD(&log->l_reserveq);
1024 INIT_LIST_HEAD(&log->l_writeq);
1025 spin_lock_init(&log->l_grant_reserve_lock);
1026 spin_lock_init(&log->l_grant_write_lock);
1056 1027
1057 error = EFSCORRUPTED; 1028 error = EFSCORRUPTED;
1058 if (xfs_sb_version_hassector(&mp->m_sb)) { 1029 if (xfs_sb_version_hassector(&mp->m_sb)) {
@@ -1094,8 +1065,7 @@ xlog_alloc_log(xfs_mount_t *mp,
1094 log->l_xbuf = bp; 1065 log->l_xbuf = bp;
1095 1066
1096 spin_lock_init(&log->l_icloglock); 1067 spin_lock_init(&log->l_icloglock);
1097 spin_lock_init(&log->l_grant_lock); 1068 init_waitqueue_head(&log->l_flush_wait);
1098 sv_init(&log->l_flush_wait, 0, "flush_wait");
1099 1069
1100 /* log record size must be multiple of BBSIZE; see xlog_rec_header_t */ 1070 /* log record size must be multiple of BBSIZE; see xlog_rec_header_t */
1101 ASSERT((XFS_BUF_SIZE(bp) & BBMASK) == 0); 1071 ASSERT((XFS_BUF_SIZE(bp) & BBMASK) == 0);
@@ -1151,8 +1121,8 @@ xlog_alloc_log(xfs_mount_t *mp,
1151 1121
1152 ASSERT(XFS_BUF_ISBUSY(iclog->ic_bp)); 1122 ASSERT(XFS_BUF_ISBUSY(iclog->ic_bp));
1153 ASSERT(XFS_BUF_VALUSEMA(iclog->ic_bp) <= 0); 1123 ASSERT(XFS_BUF_VALUSEMA(iclog->ic_bp) <= 0);
1154 sv_init(&iclog->ic_force_wait, SV_DEFAULT, "iclog-force"); 1124 init_waitqueue_head(&iclog->ic_force_wait);
1155 sv_init(&iclog->ic_write_wait, SV_DEFAULT, "iclog-write"); 1125 init_waitqueue_head(&iclog->ic_write_wait);
1156 1126
1157 iclogp = &iclog->ic_next; 1127 iclogp = &iclog->ic_next;
1158 } 1128 }
@@ -1167,15 +1137,11 @@ xlog_alloc_log(xfs_mount_t *mp,
1167out_free_iclog: 1137out_free_iclog:
1168 for (iclog = log->l_iclog; iclog; iclog = prev_iclog) { 1138 for (iclog = log->l_iclog; iclog; iclog = prev_iclog) {
1169 prev_iclog = iclog->ic_next; 1139 prev_iclog = iclog->ic_next;
1170 if (iclog->ic_bp) { 1140 if (iclog->ic_bp)
1171 sv_destroy(&iclog->ic_force_wait);
1172 sv_destroy(&iclog->ic_write_wait);
1173 xfs_buf_free(iclog->ic_bp); 1141 xfs_buf_free(iclog->ic_bp);
1174 }
1175 kmem_free(iclog); 1142 kmem_free(iclog);
1176 } 1143 }
1177 spinlock_destroy(&log->l_icloglock); 1144 spinlock_destroy(&log->l_icloglock);
1178 spinlock_destroy(&log->l_grant_lock);
1179 xfs_buf_free(log->l_xbuf); 1145 xfs_buf_free(log->l_xbuf);
1180out_free_log: 1146out_free_log:
1181 kmem_free(log); 1147 kmem_free(log);
@@ -1223,61 +1189,60 @@ xlog_commit_record(
1223 * water mark. In this manner, we would be creating a low water mark. 1189 * water mark. In this manner, we would be creating a low water mark.
1224 */ 1190 */
1225STATIC void 1191STATIC void
1226xlog_grant_push_ail(xfs_mount_t *mp, 1192xlog_grant_push_ail(
1227 int need_bytes) 1193 struct log *log,
1194 int need_bytes)
1228{ 1195{
1229 xlog_t *log = mp->m_log; /* pointer to the log */ 1196 xfs_lsn_t threshold_lsn = 0;
1230 xfs_lsn_t tail_lsn; /* lsn of the log tail */ 1197 xfs_lsn_t last_sync_lsn;
1231 xfs_lsn_t threshold_lsn = 0; /* lsn we'd like to be at */ 1198 int free_blocks;
1232 int free_blocks; /* free blocks left to write to */ 1199 int free_bytes;
1233 int free_bytes; /* free bytes left to write to */ 1200 int threshold_block;
1234 int threshold_block; /* block in lsn we'd like to be at */ 1201 int threshold_cycle;
1235 int threshold_cycle; /* lsn cycle we'd like to be at */ 1202 int free_threshold;
1236 int free_threshold; 1203
1237 1204 ASSERT(BTOBB(need_bytes) < log->l_logBBsize);
1238 ASSERT(BTOBB(need_bytes) < log->l_logBBsize); 1205
1239 1206 free_bytes = xlog_space_left(log, &log->l_grant_reserve_head);
1240 spin_lock(&log->l_grant_lock); 1207 free_blocks = BTOBBT(free_bytes);
1241 free_bytes = xlog_space_left(log, 1208
1242 log->l_grant_reserve_cycle, 1209 /*
1243 log->l_grant_reserve_bytes); 1210 * Set the threshold for the minimum number of free blocks in the
1244 tail_lsn = log->l_tail_lsn; 1211 * log to the maximum of what the caller needs, one quarter of the
1245 free_blocks = BTOBBT(free_bytes); 1212 * log, and 256 blocks.
1246 1213 */
1247 /* 1214 free_threshold = BTOBB(need_bytes);
1248 * Set the threshold for the minimum number of free blocks in the 1215 free_threshold = MAX(free_threshold, (log->l_logBBsize >> 2));
1249 * log to the maximum of what the caller needs, one quarter of the 1216 free_threshold = MAX(free_threshold, 256);
1250 * log, and 256 blocks. 1217 if (free_blocks >= free_threshold)
1251 */ 1218 return;
1252 free_threshold = BTOBB(need_bytes); 1219
1253 free_threshold = MAX(free_threshold, (log->l_logBBsize >> 2)); 1220 xlog_crack_atomic_lsn(&log->l_tail_lsn, &threshold_cycle,
1254 free_threshold = MAX(free_threshold, 256); 1221 &threshold_block);
1255 if (free_blocks < free_threshold) { 1222 threshold_block += free_threshold;
1256 threshold_block = BLOCK_LSN(tail_lsn) + free_threshold;
1257 threshold_cycle = CYCLE_LSN(tail_lsn);
1258 if (threshold_block >= log->l_logBBsize) { 1223 if (threshold_block >= log->l_logBBsize) {
1259 threshold_block -= log->l_logBBsize; 1224 threshold_block -= log->l_logBBsize;
1260 threshold_cycle += 1; 1225 threshold_cycle += 1;
1261 } 1226 }
1262 threshold_lsn = xlog_assign_lsn(threshold_cycle, threshold_block); 1227 threshold_lsn = xlog_assign_lsn(threshold_cycle,
1228 threshold_block);
1229 /*
1230 * Don't pass in an lsn greater than the lsn of the last
1231 * log record known to be on disk. Use a snapshot of the last sync lsn
1232 * so that it doesn't change between the compare and the set.
1233 */
1234 last_sync_lsn = atomic64_read(&log->l_last_sync_lsn);
1235 if (XFS_LSN_CMP(threshold_lsn, last_sync_lsn) > 0)
1236 threshold_lsn = last_sync_lsn;
1263 1237
1264 /* Don't pass in an lsn greater than the lsn of the last 1238 /*
1265 * log record known to be on disk. 1239 * Get the transaction layer to kick the dirty buffers out to
1240 * disk asynchronously. No point in trying to do this if
1241 * the filesystem is shutting down.
1266 */ 1242 */
1267 if (XFS_LSN_CMP(threshold_lsn, log->l_last_sync_lsn) > 0) 1243 if (!XLOG_FORCED_SHUTDOWN(log))
1268 threshold_lsn = log->l_last_sync_lsn; 1244 xfs_trans_ail_push(log->l_ailp, threshold_lsn);
1269 } 1245}
1270 spin_unlock(&log->l_grant_lock);
1271
1272 /*
1273 * Get the transaction layer to kick the dirty buffers out to
1274 * disk asynchronously. No point in trying to do this if
1275 * the filesystem is shutting down.
1276 */
1277 if (threshold_lsn &&
1278 !XLOG_FORCED_SHUTDOWN(log))
1279 xfs_trans_ail_push(log->l_ailp, threshold_lsn);
1280} /* xlog_grant_push_ail */
1281 1246
1282/* 1247/*
1283 * The bdstrat callback function for log bufs. This gives us a central 1248 * The bdstrat callback function for log bufs. This gives us a central
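
With the grant lock gone, the push target above is computed from an unlocked snapshot of the tail. Plugging invented numbers into the threshold arithmetic shows the wrap into the next cycle:

        #include <stdio.h>

        #define BBSHIFT   9                     /* 512-byte basic blocks */
        #define BTOBB(b)  (((b) + (1 << BBSHIFT) - 1) >> BBSHIFT)
        #define MAX(a, b) ((a) > (b) ? (a) : (b))

        int main(void)
        {
                int logBBsize  = 32768;         /* invented: 16MB log in blocks */
                int need_bytes = 65536;         /* the reservation being taken */
                int tail_cycle = 4, tail_block = 30000;
                int free_threshold, threshold_cycle, threshold_block;

                /* max(what the caller needs, a quarter of the log, 256) */
                free_threshold = BTOBB(need_bytes);
                free_threshold = MAX(free_threshold, logBBsize >> 2);
                free_threshold = MAX(free_threshold, 256);

                /* push target = tail + threshold, wrapping into next cycle */
                threshold_cycle = tail_cycle;
                threshold_block = tail_block + free_threshold;
                if (threshold_block >= logBBsize) {
                        threshold_block -= logBBsize;
                        threshold_cycle++;
                }
                printf("threshold %d blocks, push to cycle %d block %d\n",
                       free_threshold, threshold_cycle, threshold_block);
                return 0;
        }

Here the quarter-of-the-log term wins (8192 blocks), and the target wraps to block 5424 of cycle 5; the real function then clamps that against l_last_sync_lsn before pushing the AIL.
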
@@ -1372,9 +1337,8 @@ xlog_sync(xlog_t *log,
1372 roundoff < BBTOB(1))); 1337 roundoff < BBTOB(1)));
1373 1338
1374 /* move grant heads by roundoff in sync */ 1339 /* move grant heads by roundoff in sync */
1375 spin_lock(&log->l_grant_lock); 1340 xlog_grant_add_space(log, &log->l_grant_reserve_head, roundoff);
1376 xlog_grant_add_space(log, roundoff); 1341 xlog_grant_add_space(log, &log->l_grant_write_head, roundoff);
1377 spin_unlock(&log->l_grant_lock);
1378 1342
1379 /* put cycle number in every block */ 1343 /* put cycle number in every block */
1380 xlog_pack_data(log, iclog, roundoff); 1344 xlog_pack_data(log, iclog, roundoff);
@@ -1489,15 +1453,12 @@ xlog_dealloc_log(xlog_t *log)
1489 1453
1490 iclog = log->l_iclog; 1454 iclog = log->l_iclog;
1491 for (i=0; i<log->l_iclog_bufs; i++) { 1455 for (i=0; i<log->l_iclog_bufs; i++) {
1492 sv_destroy(&iclog->ic_force_wait);
1493 sv_destroy(&iclog->ic_write_wait);
1494 xfs_buf_free(iclog->ic_bp); 1456 xfs_buf_free(iclog->ic_bp);
1495 next_iclog = iclog->ic_next; 1457 next_iclog = iclog->ic_next;
1496 kmem_free(iclog); 1458 kmem_free(iclog);
1497 iclog = next_iclog; 1459 iclog = next_iclog;
1498 } 1460 }
1499 spinlock_destroy(&log->l_icloglock); 1461 spinlock_destroy(&log->l_icloglock);
1500 spinlock_destroy(&log->l_grant_lock);
1501 1462
1502 xfs_buf_free(log->l_xbuf); 1463 xfs_buf_free(log->l_xbuf);
1503 log->l_mp->m_log = NULL; 1464 log->l_mp->m_log = NULL;
@@ -2232,7 +2193,7 @@ xlog_state_do_callback(
2232 lowest_lsn = xlog_get_lowest_lsn(log); 2193 lowest_lsn = xlog_get_lowest_lsn(log);
2233 if (lowest_lsn && 2194 if (lowest_lsn &&
2234 XFS_LSN_CMP(lowest_lsn, 2195 XFS_LSN_CMP(lowest_lsn,
2235 be64_to_cpu(iclog->ic_header.h_lsn)) < 0) { 2196 be64_to_cpu(iclog->ic_header.h_lsn)) < 0) {
2236 iclog = iclog->ic_next; 2197 iclog = iclog->ic_next;
2237 continue; /* Leave this iclog for 2198 continue; /* Leave this iclog for
2238 * another thread */ 2199 * another thread */
@@ -2240,23 +2201,21 @@ xlog_state_do_callback(
2240 2201
2241 iclog->ic_state = XLOG_STATE_CALLBACK; 2202 iclog->ic_state = XLOG_STATE_CALLBACK;
2242 2203
2243 spin_unlock(&log->l_icloglock);
2244 2204
2245 /* l_last_sync_lsn field protected by 2205 /*
2246 * l_grant_lock. Don't worry about iclog's lsn. 2206 * update the last_sync_lsn before we drop the
2247 * No one else can be here except us. 2207 * icloglock to ensure we are the only one that
2208 * can update it.
2248 */ 2209 */
2249 spin_lock(&log->l_grant_lock); 2210 ASSERT(XFS_LSN_CMP(atomic64_read(&log->l_last_sync_lsn),
2250 ASSERT(XFS_LSN_CMP(log->l_last_sync_lsn, 2211 be64_to_cpu(iclog->ic_header.h_lsn)) <= 0);
2251 be64_to_cpu(iclog->ic_header.h_lsn)) <= 0); 2212 atomic64_set(&log->l_last_sync_lsn,
2252 log->l_last_sync_lsn = 2213 be64_to_cpu(iclog->ic_header.h_lsn));
2253 be64_to_cpu(iclog->ic_header.h_lsn);
2254 spin_unlock(&log->l_grant_lock);
2255 2214
2256 } else { 2215 } else
2257 spin_unlock(&log->l_icloglock);
2258 ioerrors++; 2216 ioerrors++;
2259 } 2217
2218 spin_unlock(&log->l_icloglock);
2260 2219
2261 /* 2220 /*
2262 * Keep processing entries in the callback list until 2221 * Keep processing entries in the callback list until
@@ -2297,7 +2256,7 @@ xlog_state_do_callback(
2297 xlog_state_clean_log(log); 2256 xlog_state_clean_log(log);
2298 2257
2299 /* wake up threads waiting in xfs_log_force() */ 2258 /* wake up threads waiting in xfs_log_force() */
2300 sv_broadcast(&iclog->ic_force_wait); 2259 wake_up_all(&iclog->ic_force_wait);
2301 2260
2302 iclog = iclog->ic_next; 2261 iclog = iclog->ic_next;
2303 } while (first_iclog != iclog); 2262 } while (first_iclog != iclog);
@@ -2344,7 +2303,7 @@ xlog_state_do_callback(
2344 spin_unlock(&log->l_icloglock); 2303 spin_unlock(&log->l_icloglock);
2345 2304
2346 if (wake) 2305 if (wake)
2347 sv_broadcast(&log->l_flush_wait); 2306 wake_up_all(&log->l_flush_wait);
2348} 2307}
2349 2308
2350 2309
@@ -2395,7 +2354,7 @@ xlog_state_done_syncing(
2395 * iclog buffer, we wake them all, one will get to do the 2354 * iclog buffer, we wake them all, one will get to do the
2396 * I/O, the others get to wait for the result. 2355 * I/O, the others get to wait for the result.
2397 */ 2356 */
2398 sv_broadcast(&iclog->ic_write_wait); 2357 wake_up_all(&iclog->ic_write_wait);
2399 spin_unlock(&log->l_icloglock); 2358 spin_unlock(&log->l_icloglock);
2400 xlog_state_do_callback(log, aborted, iclog); /* also cleans log */ 2359 xlog_state_do_callback(log, aborted, iclog); /* also cleans log */
2401} /* xlog_state_done_syncing */ 2360} /* xlog_state_done_syncing */
@@ -2444,7 +2403,7 @@ restart:
2444 XFS_STATS_INC(xs_log_noiclogs); 2403 XFS_STATS_INC(xs_log_noiclogs);
2445 2404
2446 /* Wait for log writes to have flushed */ 2405 /* Wait for log writes to have flushed */
2447 sv_wait(&log->l_flush_wait, 0, &log->l_icloglock, 0); 2406 xlog_wait(&log->l_flush_wait, &log->l_icloglock);
2448 goto restart; 2407 goto restart;
2449 } 2408 }
2450 2409
@@ -2527,6 +2486,18 @@ restart:
2527 * 2486 *
2528 * Once a ticket gets put onto the reserveq, it will only return after 2487 * Once a ticket gets put onto the reserveq, it will only return after
2529 * the needed reservation is satisfied. 2488 * the needed reservation is satisfied.
2489 *
2490 * This function is structured so that it has a lock free fast path. This is
2491 * necessary because every new transaction reservation will come through this
2492 * path. Hence any lock will be globally hot if we take it unconditionally on
2493 * every pass.
2494 *
2495 * As tickets are only ever moved on and off the reserveq under the
2496 * l_grant_reserve_lock, we only need to take that lock if we are going
2497 * to add the ticket to the queue and sleep. We can avoid taking the lock if the
2498 * ticket was never added to the reserveq because the t_queue list head will be
2499 * empty and we hold the only reference to it so it can safely be checked
2500 * unlocked.
2530 */ 2501 */
2531STATIC int 2502STATIC int
2532xlog_grant_log_space(xlog_t *log, 2503xlog_grant_log_space(xlog_t *log,
@@ -2534,24 +2505,27 @@ xlog_grant_log_space(xlog_t *log,
2534{ 2505{
2535 int free_bytes; 2506 int free_bytes;
2536 int need_bytes; 2507 int need_bytes;
2537#ifdef DEBUG
2538 xfs_lsn_t tail_lsn;
2539#endif
2540
2541 2508
2542#ifdef DEBUG 2509#ifdef DEBUG
2543 if (log->l_flags & XLOG_ACTIVE_RECOVERY) 2510 if (log->l_flags & XLOG_ACTIVE_RECOVERY)
2544 panic("grant Recovery problem"); 2511 panic("grant Recovery problem");
2545#endif 2512#endif
2546 2513
2547 /* Is there space or do we need to sleep? */
2548 spin_lock(&log->l_grant_lock);
2549
2550 trace_xfs_log_grant_enter(log, tic); 2514 trace_xfs_log_grant_enter(log, tic);
2551 2515
2516 need_bytes = tic->t_unit_res;
2517 if (tic->t_flags & XFS_LOG_PERM_RESERV)
2518 need_bytes *= tic->t_ocnt;
2519
2552 /* something is already sleeping; insert new transaction at end */ 2520 /* something is already sleeping; insert new transaction at end */
2553 if (log->l_reserve_headq) { 2521 if (!list_empty_careful(&log->l_reserveq)) {
2554 xlog_ins_ticketq(&log->l_reserve_headq, tic); 2522 spin_lock(&log->l_grant_reserve_lock);
2523 /* recheck the queue now we are locked */
2524 if (list_empty(&log->l_reserveq)) {
2525 spin_unlock(&log->l_grant_reserve_lock);
2526 goto redo;
2527 }
2528 list_add_tail(&tic->t_queue, &log->l_reserveq);
2555 2529
2556 trace_xfs_log_grant_sleep1(log, tic); 2530 trace_xfs_log_grant_sleep1(log, tic);
2557 2531
@@ -2563,72 +2537,57 @@ xlog_grant_log_space(xlog_t *log,
2563 goto error_return; 2537 goto error_return;
2564 2538
2565 XFS_STATS_INC(xs_sleep_logspace); 2539 XFS_STATS_INC(xs_sleep_logspace);
2566 sv_wait(&tic->t_wait, PINOD|PLTWAIT, &log->l_grant_lock, s); 2540 xlog_wait(&tic->t_wait, &log->l_grant_reserve_lock);
2541
2567 /* 2542 /*
2568 * If we got an error, and the filesystem is shutting down, 2543 * If we got an error, and the filesystem is shutting down,
2569 * we'll catch it down below. So just continue... 2544 * we'll catch it down below. So just continue...
2570 */ 2545 */
2571 trace_xfs_log_grant_wake1(log, tic); 2546 trace_xfs_log_grant_wake1(log, tic);
2572 spin_lock(&log->l_grant_lock);
2573 } 2547 }
2574 if (tic->t_flags & XFS_LOG_PERM_RESERV)
2575 need_bytes = tic->t_unit_res*tic->t_ocnt;
2576 else
2577 need_bytes = tic->t_unit_res;
2578 2548
2579redo: 2549redo:
2580 if (XLOG_FORCED_SHUTDOWN(log)) 2550 if (XLOG_FORCED_SHUTDOWN(log))
2581 goto error_return; 2551 goto error_return_unlocked;
2582 2552
2583 free_bytes = xlog_space_left(log, log->l_grant_reserve_cycle, 2553 free_bytes = xlog_space_left(log, &log->l_grant_reserve_head);
2584 log->l_grant_reserve_bytes);
2585 if (free_bytes < need_bytes) { 2554 if (free_bytes < need_bytes) {
2586 if ((tic->t_flags & XLOG_TIC_IN_Q) == 0) 2555 spin_lock(&log->l_grant_reserve_lock);
2587 xlog_ins_ticketq(&log->l_reserve_headq, tic); 2556 if (list_empty(&tic->t_queue))
2557 list_add_tail(&tic->t_queue, &log->l_reserveq);
2588 2558
2589 trace_xfs_log_grant_sleep2(log, tic); 2559 trace_xfs_log_grant_sleep2(log, tic);
2590 2560
2591 spin_unlock(&log->l_grant_lock);
2592 xlog_grant_push_ail(log->l_mp, need_bytes);
2593 spin_lock(&log->l_grant_lock);
2594
2595 XFS_STATS_INC(xs_sleep_logspace);
2596 sv_wait(&tic->t_wait, PINOD|PLTWAIT, &log->l_grant_lock, s);
2597
2598 spin_lock(&log->l_grant_lock);
2599 if (XLOG_FORCED_SHUTDOWN(log)) 2561 if (XLOG_FORCED_SHUTDOWN(log))
2600 goto error_return; 2562 goto error_return;
2601 2563
2602 trace_xfs_log_grant_wake2(log, tic); 2564 xlog_grant_push_ail(log, need_bytes);
2565
2566 XFS_STATS_INC(xs_sleep_logspace);
2567 xlog_wait(&tic->t_wait, &log->l_grant_reserve_lock);
2603 2568
2569 trace_xfs_log_grant_wake2(log, tic);
2604 goto redo; 2570 goto redo;
2605 } else if (tic->t_flags & XLOG_TIC_IN_Q) 2571 }
2606 xlog_del_ticketq(&log->l_reserve_headq, tic);
2607 2572
2608 /* we've got enough space */ 2573 if (!list_empty(&tic->t_queue)) {
2609 xlog_grant_add_space(log, need_bytes); 2574 spin_lock(&log->l_grant_reserve_lock);
2610#ifdef DEBUG 2575 list_del_init(&tic->t_queue);
2611 tail_lsn = log->l_tail_lsn; 2576 spin_unlock(&log->l_grant_reserve_lock);
2612 /*
2613 * Check to make sure the grant write head didn't just over lap the
2614 * tail. If the cycles are the same, we can't be overlapping.
2615 * Otherwise, make sure that the cycles differ by exactly one and
2616 * check the byte count.
2617 */
2618 if (CYCLE_LSN(tail_lsn) != log->l_grant_write_cycle) {
2619 ASSERT(log->l_grant_write_cycle-1 == CYCLE_LSN(tail_lsn));
2620 ASSERT(log->l_grant_write_bytes <= BBTOB(BLOCK_LSN(tail_lsn)));
2621 } 2577 }
2622#endif 2578
2579 /* we've got enough space */
2580 xlog_grant_add_space(log, &log->l_grant_reserve_head, need_bytes);
2581 xlog_grant_add_space(log, &log->l_grant_write_head, need_bytes);
2623 trace_xfs_log_grant_exit(log, tic); 2582 trace_xfs_log_grant_exit(log, tic);
2624 xlog_verify_grant_head(log, 1); 2583 xlog_verify_grant_tail(log);
2625 spin_unlock(&log->l_grant_lock);
2626 return 0; 2584 return 0;
2627 2585
2628 error_return: 2586error_return_unlocked:
2629 if (tic->t_flags & XLOG_TIC_IN_Q) 2587 spin_lock(&log->l_grant_reserve_lock);
2630 xlog_del_ticketq(&log->l_reserve_headq, tic); 2588error_return:
2631 2589 list_del_init(&tic->t_queue);
2590 spin_unlock(&log->l_grant_reserve_lock);
2632 trace_xfs_log_grant_error(log, tic); 2591 trace_xfs_log_grant_error(log, tic);
2633 2592
2634 /* 2593 /*
@@ -2638,7 +2597,6 @@ redo:
2638 */ 2597 */
2639 tic->t_curr_res = 0; 2598 tic->t_curr_res = 0;
2640 tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */ 2599 tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */
2641 spin_unlock(&log->l_grant_lock);
2642 return XFS_ERROR(EIO); 2600 return XFS_ERROR(EIO);
2643} /* xlog_grant_log_space */ 2601} /* xlog_grant_log_space */
2644 2602
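
The fast path described in the comment before xlog_grant_log_space() is the classic unlocked-check-then-locked-recheck idiom. A userspace model, where the racy unlocked peek plays the role of list_empty_careful():

        #include <pthread.h>
        #include <stdio.h>

        struct waitq {
                pthread_mutex_t lock;           /* models l_grant_reserve_lock */
                int             nr_waiters;     /* models the l_reserveq list */
        };

        /* unlocked peek, like list_empty_careful() on the reserve queue */
        static int queue_empty_careful(struct waitq *q)
        {
                return q->nr_waiters == 0;      /* racy by design; rechecked below */
        }

        static void reserve(struct waitq *q)
        {
                if (!queue_empty_careful(q)) {
                        pthread_mutex_lock(&q->lock);
                        /* recheck now that we are locked */
                        if (q->nr_waiters == 0) {
                                pthread_mutex_unlock(&q->lock);
                                goto fast;      /* queue drained while we raced */
                        }
                        q->nr_waiters++;        /* join the queue and sleep */
                        pthread_mutex_unlock(&q->lock);
                        printf("slow path: queued behind other waiters\n");
                        return;
                }
        fast:
                printf("fast path: no waiters, lock never taken\n");
        }

        int main(void)
        {
                struct waitq q = { PTHREAD_MUTEX_INITIALIZER, 0 };

                reserve(&q);                    /* empty queue: fast path */
                q.nr_waiters = 1;
                reserve(&q);                    /* waiters present: lock + recheck */
                return 0;
        }
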
@@ -2646,17 +2604,14 @@ redo:
2646/* 2604/*
2647 * Replenish the byte reservation required by moving the grant write head. 2605 * Replenish the byte reservation required by moving the grant write head.
2648 * 2606 *
2649 * 2607 * Similar to xlog_grant_log_space, the function is structured to have a lock
2608 * free fast path.
2650 */ 2609 */
2651STATIC int 2610STATIC int
2652xlog_regrant_write_log_space(xlog_t *log, 2611xlog_regrant_write_log_space(xlog_t *log,
2653 xlog_ticket_t *tic) 2612 xlog_ticket_t *tic)
2654{ 2613{
2655 int free_bytes, need_bytes; 2614 int free_bytes, need_bytes;
2656 xlog_ticket_t *ntic;
2657#ifdef DEBUG
2658 xfs_lsn_t tail_lsn;
2659#endif
2660 2615
2661 tic->t_curr_res = tic->t_unit_res; 2616 tic->t_curr_res = tic->t_unit_res;
2662 xlog_tic_reset_res(tic); 2617 xlog_tic_reset_res(tic);
@@ -2669,12 +2624,9 @@ xlog_regrant_write_log_space(xlog_t *log,
2669 panic("regrant Recovery problem"); 2624 panic("regrant Recovery problem");
2670#endif 2625#endif
2671 2626
2672 spin_lock(&log->l_grant_lock);
2673
2674 trace_xfs_log_regrant_write_enter(log, tic); 2627 trace_xfs_log_regrant_write_enter(log, tic);
2675
2676 if (XLOG_FORCED_SHUTDOWN(log)) 2628 if (XLOG_FORCED_SHUTDOWN(log))
2677 goto error_return; 2629 goto error_return_unlocked;
2678 2630
2679 /* If there are other waiters on the queue then give them a 2631 /* If there are other waiters on the queue then give them a
2680 * chance at logspace before us. Wake up the first waiters, 2632 * chance at logspace before us. Wake up the first waiters,
@@ -2683,92 +2635,76 @@ xlog_regrant_write_log_space(xlog_t *log,
2683 * this transaction. 2635 * this transaction.
2684 */ 2636 */
2685 need_bytes = tic->t_unit_res; 2637 need_bytes = tic->t_unit_res;
2686 if ((ntic = log->l_write_headq)) { 2638 if (!list_empty_careful(&log->l_writeq)) {
2687 free_bytes = xlog_space_left(log, log->l_grant_write_cycle, 2639 struct xlog_ticket *ntic;
2688 log->l_grant_write_bytes); 2640
2689 do { 2641 spin_lock(&log->l_grant_write_lock);
2642 free_bytes = xlog_space_left(log, &log->l_grant_write_head);
2643 list_for_each_entry(ntic, &log->l_writeq, t_queue) {
2690 ASSERT(ntic->t_flags & XLOG_TIC_PERM_RESERV); 2644 ASSERT(ntic->t_flags & XLOG_TIC_PERM_RESERV);
2691 2645
2692 if (free_bytes < ntic->t_unit_res) 2646 if (free_bytes < ntic->t_unit_res)
2693 break; 2647 break;
2694 free_bytes -= ntic->t_unit_res; 2648 free_bytes -= ntic->t_unit_res;
2695 sv_signal(&ntic->t_wait); 2649 wake_up(&ntic->t_wait);
2696 ntic = ntic->t_next; 2650 }
2697 } while (ntic != log->l_write_headq);
2698
2699 if (ntic != log->l_write_headq) {
2700 if ((tic->t_flags & XLOG_TIC_IN_Q) == 0)
2701 xlog_ins_ticketq(&log->l_write_headq, tic);
2702 2651
2652 if (ntic != list_first_entry(&log->l_writeq,
2653 struct xlog_ticket, t_queue)) {
2654 if (list_empty(&tic->t_queue))
2655 list_add_tail(&tic->t_queue, &log->l_writeq);
2703 trace_xfs_log_regrant_write_sleep1(log, tic); 2656 trace_xfs_log_regrant_write_sleep1(log, tic);
2704 2657
2705 spin_unlock(&log->l_grant_lock); 2658 xlog_grant_push_ail(log, need_bytes);
2706 xlog_grant_push_ail(log->l_mp, need_bytes);
2707 spin_lock(&log->l_grant_lock);
2708 2659
2709 XFS_STATS_INC(xs_sleep_logspace); 2660 XFS_STATS_INC(xs_sleep_logspace);
2710 sv_wait(&tic->t_wait, PINOD|PLTWAIT, 2661 xlog_wait(&tic->t_wait, &log->l_grant_write_lock);
2711 &log->l_grant_lock, s);
2712
2713 /* If we're shutting down, this tic is already
2714 * off the queue */
2715 spin_lock(&log->l_grant_lock);
2716 if (XLOG_FORCED_SHUTDOWN(log))
2717 goto error_return;
2718
2719 trace_xfs_log_regrant_write_wake1(log, tic); 2662 trace_xfs_log_regrant_write_wake1(log, tic);
2720 } 2663 } else
2664 spin_unlock(&log->l_grant_write_lock);
2721 } 2665 }
2722 2666
2723redo: 2667redo:
2724 if (XLOG_FORCED_SHUTDOWN(log)) 2668 if (XLOG_FORCED_SHUTDOWN(log))
2725 goto error_return; 2669 goto error_return_unlocked;
2726 2670
2727 free_bytes = xlog_space_left(log, log->l_grant_write_cycle, 2671 free_bytes = xlog_space_left(log, &log->l_grant_write_head);
2728 log->l_grant_write_bytes);
2729 if (free_bytes < need_bytes) { 2672 if (free_bytes < need_bytes) {
2730 if ((tic->t_flags & XLOG_TIC_IN_Q) == 0) 2673 spin_lock(&log->l_grant_write_lock);
2731 xlog_ins_ticketq(&log->l_write_headq, tic); 2674 if (list_empty(&tic->t_queue))
2732 spin_unlock(&log->l_grant_lock); 2675 list_add_tail(&tic->t_queue, &log->l_writeq);
2733 xlog_grant_push_ail(log->l_mp, need_bytes);
2734 spin_lock(&log->l_grant_lock);
2735
2736 XFS_STATS_INC(xs_sleep_logspace);
2737 trace_xfs_log_regrant_write_sleep2(log, tic);
2738
2739 sv_wait(&tic->t_wait, PINOD|PLTWAIT, &log->l_grant_lock, s);
2740 2676
2741 /* If we're shutting down, this tic is already off the queue */
2742 spin_lock(&log->l_grant_lock);
2743 if (XLOG_FORCED_SHUTDOWN(log)) 2677 if (XLOG_FORCED_SHUTDOWN(log))
2744 goto error_return; 2678 goto error_return;
2745 2679
2680 xlog_grant_push_ail(log, need_bytes);
2681
2682 XFS_STATS_INC(xs_sleep_logspace);
2683 trace_xfs_log_regrant_write_sleep2(log, tic);
2684 xlog_wait(&tic->t_wait, &log->l_grant_write_lock);
2685
2746 trace_xfs_log_regrant_write_wake2(log, tic); 2686 trace_xfs_log_regrant_write_wake2(log, tic);
2747 goto redo; 2687 goto redo;
2748 } else if (tic->t_flags & XLOG_TIC_IN_Q) 2688 }
2749 xlog_del_ticketq(&log->l_write_headq, tic);
2750 2689
2751 /* we've got enough space */ 2690 if (!list_empty(&tic->t_queue)) {
2752 xlog_grant_add_space_write(log, need_bytes); 2691 spin_lock(&log->l_grant_write_lock);
2753#ifdef DEBUG 2692 list_del_init(&tic->t_queue);
2754 tail_lsn = log->l_tail_lsn; 2693 spin_unlock(&log->l_grant_write_lock);
2755 if (CYCLE_LSN(tail_lsn) != log->l_grant_write_cycle) {
2756 ASSERT(log->l_grant_write_cycle-1 == CYCLE_LSN(tail_lsn));
2757 ASSERT(log->l_grant_write_bytes <= BBTOB(BLOCK_LSN(tail_lsn)));
2758 } 2694 }
2759#endif
2760 2695
2696 /* we've got enough space */
2697 xlog_grant_add_space(log, &log->l_grant_write_head, need_bytes);
2761 trace_xfs_log_regrant_write_exit(log, tic); 2698 trace_xfs_log_regrant_write_exit(log, tic);
2762 2699 xlog_verify_grant_tail(log);
2763 xlog_verify_grant_head(log, 1);
2764 spin_unlock(&log->l_grant_lock);
2765 return 0; 2700 return 0;
2766 2701
2767 2702
2703 error_return_unlocked:
2704 spin_lock(&log->l_grant_write_lock);
2768 error_return: 2705 error_return:
2769 if (tic->t_flags & XLOG_TIC_IN_Q) 2706 list_del_init(&tic->t_queue);
2770 xlog_del_ticketq(&log->l_reserve_headq, tic); 2707 spin_unlock(&log->l_grant_write_lock);
2771
2772 trace_xfs_log_regrant_write_error(log, tic); 2708 trace_xfs_log_regrant_write_error(log, tic);
2773 2709
2774 /* 2710 /*
@@ -2778,7 +2714,6 @@ redo:
2778 */ 2714 */
2779 tic->t_curr_res = 0; 2715 tic->t_curr_res = 0;
2780 tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */ 2716 tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */
2781 spin_unlock(&log->l_grant_lock);
2782 return XFS_ERROR(EIO); 2717 return XFS_ERROR(EIO);
2783} /* xlog_regrant_write_log_space */ 2718} /* xlog_regrant_write_log_space */
2784 2719
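A compact user-space sketch of the wake-up policy implemented by the list_for_each_entry() loop in xlog_regrant_write_log_space() above (illustrative values and names only, not part of the patch): queued tickets are woken in FIFO order for as long as the remaining free space covers each waiter's t_unit_res.

	/* Sketch: wake queued write reservations in FIFO order while the
	 * remaining free space still covers each waiter's unit reservation. */
	#include <stdio.h>

	int main(void)
	{
		int unit_res[] = { 300, 500, 400 };	/* queued tickets */
		int free_bytes = 900;
		unsigned i;

		for (i = 0; i < sizeof(unit_res) / sizeof(unit_res[0]); i++) {
			if (free_bytes < unit_res[i])
				break;		/* first ticket that doesn't fit */
			free_bytes -= unit_res[i];
			printf("wake ticket %u (%d bytes)\n", i, unit_res[i]);
		}
		return 0;	/* wakes tickets 0 and 1; 400 > 100 left over */
	}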
@@ -2799,27 +2734,24 @@ xlog_regrant_reserve_log_space(xlog_t *log,
2799 if (ticket->t_cnt > 0) 2734 if (ticket->t_cnt > 0)
2800 ticket->t_cnt--; 2735 ticket->t_cnt--;
2801 2736
2802 spin_lock(&log->l_grant_lock); 2737 xlog_grant_sub_space(log, &log->l_grant_reserve_head,
2803 xlog_grant_sub_space(log, ticket->t_curr_res); 2738 ticket->t_curr_res);
2739 xlog_grant_sub_space(log, &log->l_grant_write_head,
2740 ticket->t_curr_res);
2804 ticket->t_curr_res = ticket->t_unit_res; 2741 ticket->t_curr_res = ticket->t_unit_res;
2805 xlog_tic_reset_res(ticket); 2742 xlog_tic_reset_res(ticket);
2806 2743
2807 trace_xfs_log_regrant_reserve_sub(log, ticket); 2744 trace_xfs_log_regrant_reserve_sub(log, ticket);
2808 2745
2809 xlog_verify_grant_head(log, 1);
2810
2811 /* just return if we still have some of the pre-reserved space */ 2746 /* just return if we still have some of the pre-reserved space */
2812 if (ticket->t_cnt > 0) { 2747 if (ticket->t_cnt > 0)
2813 spin_unlock(&log->l_grant_lock);
2814 return; 2748 return;
2815 }
2816 2749
2817 xlog_grant_add_space_reserve(log, ticket->t_unit_res); 2750 xlog_grant_add_space(log, &log->l_grant_reserve_head,
2751 ticket->t_unit_res);
2818 2752
2819 trace_xfs_log_regrant_reserve_exit(log, ticket); 2753 trace_xfs_log_regrant_reserve_exit(log, ticket);
2820 2754
2821 xlog_verify_grant_head(log, 0);
2822 spin_unlock(&log->l_grant_lock);
2823 ticket->t_curr_res = ticket->t_unit_res; 2755 ticket->t_curr_res = ticket->t_unit_res;
2824 xlog_tic_reset_res(ticket); 2756 xlog_tic_reset_res(ticket);
2825} /* xlog_regrant_reserve_log_space */ 2757} /* xlog_regrant_reserve_log_space */
@@ -2843,28 +2775,29 @@ STATIC void
2843xlog_ungrant_log_space(xlog_t *log, 2775xlog_ungrant_log_space(xlog_t *log,
2844 xlog_ticket_t *ticket) 2776 xlog_ticket_t *ticket)
2845{ 2777{
2778 int bytes;
2779
2846 if (ticket->t_cnt > 0) 2780 if (ticket->t_cnt > 0)
2847 ticket->t_cnt--; 2781 ticket->t_cnt--;
2848 2782
2849 spin_lock(&log->l_grant_lock);
2850 trace_xfs_log_ungrant_enter(log, ticket); 2783 trace_xfs_log_ungrant_enter(log, ticket);
2851
2852 xlog_grant_sub_space(log, ticket->t_curr_res);
2853
2854 trace_xfs_log_ungrant_sub(log, ticket); 2784 trace_xfs_log_ungrant_sub(log, ticket);
2855 2785
2856 /* If this is a permanent reservation ticket, we may be able to free 2786 /*
2787 * If this is a permanent reservation ticket, we may be able to free
2857 * up more space based on the remaining count. 2788 * up more space based on the remaining count.
2858 */ 2789 */
2790 bytes = ticket->t_curr_res;
2859 if (ticket->t_cnt > 0) { 2791 if (ticket->t_cnt > 0) {
2860 ASSERT(ticket->t_flags & XLOG_TIC_PERM_RESERV); 2792 ASSERT(ticket->t_flags & XLOG_TIC_PERM_RESERV);
2861 xlog_grant_sub_space(log, ticket->t_unit_res*ticket->t_cnt); 2793 bytes += ticket->t_unit_res*ticket->t_cnt;
2862 } 2794 }
2863 2795
2796 xlog_grant_sub_space(log, &log->l_grant_reserve_head, bytes);
2797 xlog_grant_sub_space(log, &log->l_grant_write_head, bytes);
2798
2864 trace_xfs_log_ungrant_exit(log, ticket); 2799 trace_xfs_log_ungrant_exit(log, ticket);
2865 2800
2866 xlog_verify_grant_head(log, 1);
2867 spin_unlock(&log->l_grant_lock);
2868 xfs_log_move_tail(log->l_mp, 1); 2801 xfs_log_move_tail(log->l_mp, 1);
2869} /* xlog_ungrant_log_space */ 2802} /* xlog_ungrant_log_space */
2870 2803
@@ -2901,11 +2834,11 @@ xlog_state_release_iclog(
2901 2834
2902 if (iclog->ic_state == XLOG_STATE_WANT_SYNC) { 2835 if (iclog->ic_state == XLOG_STATE_WANT_SYNC) {
2903 /* update tail before writing to iclog */ 2836 /* update tail before writing to iclog */
2904 xlog_assign_tail_lsn(log->l_mp); 2837 xfs_lsn_t tail_lsn = xlog_assign_tail_lsn(log->l_mp);
2905 sync++; 2838 sync++;
2906 iclog->ic_state = XLOG_STATE_SYNCING; 2839 iclog->ic_state = XLOG_STATE_SYNCING;
2907 iclog->ic_header.h_tail_lsn = cpu_to_be64(log->l_tail_lsn); 2840 iclog->ic_header.h_tail_lsn = cpu_to_be64(tail_lsn);
2908 xlog_verify_tail_lsn(log, iclog, log->l_tail_lsn); 2841 xlog_verify_tail_lsn(log, iclog, tail_lsn);
2909 /* cycle incremented when incrementing curr_block */ 2842 /* cycle incremented when incrementing curr_block */
2910 } 2843 }
2911 spin_unlock(&log->l_icloglock); 2844 spin_unlock(&log->l_icloglock);
@@ -3088,7 +3021,7 @@ maybe_sleep:
3088 return XFS_ERROR(EIO); 3021 return XFS_ERROR(EIO);
3089 } 3022 }
3090 XFS_STATS_INC(xs_log_force_sleep); 3023 XFS_STATS_INC(xs_log_force_sleep);
3091 sv_wait(&iclog->ic_force_wait, PINOD, &log->l_icloglock, s); 3024 xlog_wait(&iclog->ic_force_wait, &log->l_icloglock);
3092 /* 3025 /*
3093 * No need to grab the log lock here since we're 3026 * No need to grab the log lock here since we're
3094 * only deciding whether or not to return EIO 3027 * only deciding whether or not to return EIO
@@ -3206,8 +3139,8 @@ try_again:
3206 3139
3207 XFS_STATS_INC(xs_log_force_sleep); 3140 XFS_STATS_INC(xs_log_force_sleep);
3208 3141
3209 sv_wait(&iclog->ic_prev->ic_write_wait, 3142 xlog_wait(&iclog->ic_prev->ic_write_wait,
3210 PSWP, &log->l_icloglock, s); 3143 &log->l_icloglock);
3211 if (log_flushed) 3144 if (log_flushed)
3212 *log_flushed = 1; 3145 *log_flushed = 1;
3213 already_slept = 1; 3146 already_slept = 1;
@@ -3235,7 +3168,7 @@ try_again:
3235 return XFS_ERROR(EIO); 3168 return XFS_ERROR(EIO);
3236 } 3169 }
3237 XFS_STATS_INC(xs_log_force_sleep); 3170 XFS_STATS_INC(xs_log_force_sleep);
3238 sv_wait(&iclog->ic_force_wait, PSWP, &log->l_icloglock, s); 3171 xlog_wait(&iclog->ic_force_wait, &log->l_icloglock);
3239 /* 3172 /*
3240 * No need to grab the log lock here since we're 3173 * No need to grab the log lock here since we're
3241 * only deciding whether or not to return EIO 3174 * only deciding whether or not to return EIO
@@ -3310,10 +3243,8 @@ xfs_log_ticket_put(
3310 xlog_ticket_t *ticket) 3243 xlog_ticket_t *ticket)
3311{ 3244{
3312 ASSERT(atomic_read(&ticket->t_ref) > 0); 3245 ASSERT(atomic_read(&ticket->t_ref) > 0);
3313 if (atomic_dec_and_test(&ticket->t_ref)) { 3246 if (atomic_dec_and_test(&ticket->t_ref))
3314 sv_destroy(&ticket->t_wait);
3315 kmem_zone_free(xfs_log_ticket_zone, ticket); 3247 kmem_zone_free(xfs_log_ticket_zone, ticket);
3316 }
3317} 3248}
3318 3249
3319xlog_ticket_t * 3250xlog_ticket_t *
@@ -3435,6 +3366,7 @@ xlog_ticket_alloc(
3435 } 3366 }
3436 3367
3437 atomic_set(&tic->t_ref, 1); 3368 atomic_set(&tic->t_ref, 1);
3369 INIT_LIST_HEAD(&tic->t_queue);
3438 tic->t_unit_res = unit_bytes; 3370 tic->t_unit_res = unit_bytes;
3439 tic->t_curr_res = unit_bytes; 3371 tic->t_curr_res = unit_bytes;
3440 tic->t_cnt = cnt; 3372 tic->t_cnt = cnt;
@@ -3445,7 +3377,7 @@ xlog_ticket_alloc(
3445 tic->t_trans_type = 0; 3377 tic->t_trans_type = 0;
3446 if (xflags & XFS_LOG_PERM_RESERV) 3378 if (xflags & XFS_LOG_PERM_RESERV)
3447 tic->t_flags |= XLOG_TIC_PERM_RESERV; 3379 tic->t_flags |= XLOG_TIC_PERM_RESERV;
3448 sv_init(&tic->t_wait, SV_DEFAULT, "logtick"); 3380 init_waitqueue_head(&tic->t_wait);
3449 3381
3450 xlog_tic_reset_res(tic); 3382 xlog_tic_reset_res(tic);
3451 3383
@@ -3484,18 +3416,25 @@ xlog_verify_dest_ptr(
3484} 3416}
3485 3417
3486STATIC void 3418STATIC void
3487xlog_verify_grant_head(xlog_t *log, int equals) 3419xlog_verify_grant_tail(
3420 struct log *log)
3488{ 3421{
3489 if (log->l_grant_reserve_cycle == log->l_grant_write_cycle) { 3422 int tail_cycle, tail_blocks;
3490 if (equals) 3423 int cycle, space;
3491 ASSERT(log->l_grant_reserve_bytes >= log->l_grant_write_bytes); 3424
3492 else 3425 /*
3493 ASSERT(log->l_grant_reserve_bytes > log->l_grant_write_bytes); 3426 * Check to make sure the grant write head didn't just overlap the
3494 } else { 3427 * tail. If the cycles are the same, we can't be overlapping.
3495 ASSERT(log->l_grant_reserve_cycle-1 == log->l_grant_write_cycle); 3428 * Otherwise, make sure that the cycles differ by exactly one and
3496 ASSERT(log->l_grant_write_bytes >= log->l_grant_reserve_bytes); 3429 * check the byte count.
3497 } 3430 */
3498} /* xlog_verify_grant_head */ 3431 xlog_crack_grant_head(&log->l_grant_write_head, &cycle, &space);
3432 xlog_crack_atomic_lsn(&log->l_tail_lsn, &tail_cycle, &tail_blocks);
3433 if (tail_cycle != cycle) {
3434 ASSERT(cycle - 1 == tail_cycle);
3435 ASSERT(space <= BBTOB(tail_blocks));
3436 }
3437}
3499 3438
3500/* check if it will fit */ 3439/* check if it will fit */
3501STATIC void 3440STATIC void
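The invariant xlog_verify_grant_tail() asserts above can be illustrated with a small standalone sketch (made-up cycle/block values; BBTOB is assumed to be the usual 512-byte basic-block-to-byte conversion):

	/* Sketch: the write head may wrap at most one cycle past the tail,
	 * and then only up to the tail's byte offset. */
	#include <assert.h>

	#define BBTOB(bbs)	((long)(bbs) << 9)	/* basic blocks -> bytes */

	static void verify_grant_tail(int cycle, int space,
				      int tail_cycle, int tail_blocks)
	{
		if (tail_cycle != cycle) {
			assert(cycle - 1 == tail_cycle);	/* wrapped once */
			assert(space <= BBTOB(tail_blocks));	/* not past tail */
		}
	}

	int main(void)
	{
		verify_grant_tail(5, 4096, 5, 100);	/* same cycle: no check */
		verify_grant_tail(6, 4096, 5, 100);	/* 4096 <= 51200: OK */
		return 0;
	}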
@@ -3716,12 +3655,10 @@ xfs_log_force_umount(
3716 xlog_cil_force(log); 3655 xlog_cil_force(log);
3717 3656
3718 /* 3657 /*
3719 * We must hold both the GRANT lock and the LOG lock, 3658 * mark the filesystem and the log as in a shutdown state and wake
3720 * before we mark the filesystem SHUTDOWN and wake 3659 * everybody up to tell them the bad news.
3721 * everybody up to tell the bad news.
3722 */ 3660 */
3723 spin_lock(&log->l_icloglock); 3661 spin_lock(&log->l_icloglock);
3724 spin_lock(&log->l_grant_lock);
3725 mp->m_flags |= XFS_MOUNT_FS_SHUTDOWN; 3662 mp->m_flags |= XFS_MOUNT_FS_SHUTDOWN;
3726 if (mp->m_sb_bp) 3663 if (mp->m_sb_bp)
3727 XFS_BUF_DONE(mp->m_sb_bp); 3664 XFS_BUF_DONE(mp->m_sb_bp);
@@ -3742,27 +3679,21 @@ xfs_log_force_umount(
3742 spin_unlock(&log->l_icloglock); 3679 spin_unlock(&log->l_icloglock);
3743 3680
3744 /* 3681 /*
3745 * We don't want anybody waiting for log reservations 3682 * We don't want anybody waiting for log reservations after this. That
3746 * after this. That means we have to wake up everybody 3683 * means we have to wake up everybody queued up on reserveq as well as
3747 * queued up on reserve_headq as well as write_headq. 3684 * writeq. In addition, we make sure in xlog_{re}grant_log_space that
3748 * In addition, we make sure in xlog_{re}grant_log_space 3685 * we don't enqueue anything once the SHUTDOWN flag is set, and this
3749 * that we don't enqueue anything once the SHUTDOWN flag 3686 * action is protected by the grant locks.
3750 * is set, and this action is protected by the GRANTLOCK.
3751 */ 3687 */
3752 if ((tic = log->l_reserve_headq)) { 3688 spin_lock(&log->l_grant_reserve_lock);
3753 do { 3689 list_for_each_entry(tic, &log->l_reserveq, t_queue)
3754 sv_signal(&tic->t_wait); 3690 wake_up(&tic->t_wait);
3755 tic = tic->t_next; 3691 spin_unlock(&log->l_grant_reserve_lock);
3756 } while (tic != log->l_reserve_headq); 3692
3757 } 3693 spin_lock(&log->l_grant_write_lock);
3758 3694 list_for_each_entry(tic, &log->l_writeq, t_queue)
3759 if ((tic = log->l_write_headq)) { 3695 wake_up(&tic->t_wait);
3760 do { 3696 spin_unlock(&log->l_grant_write_lock);
3761 sv_signal(&tic->t_wait);
3762 tic = tic->t_next;
3763 } while (tic != log->l_write_headq);
3764 }
3765 spin_unlock(&log->l_grant_lock);
3766 3697
3767 if (!(log->l_iclog->ic_state & XLOG_STATE_IOERROR)) { 3698 if (!(log->l_iclog->ic_state & XLOG_STATE_IOERROR)) {
3768 ASSERT(!logerror); 3699 ASSERT(!logerror);
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index 23d6ceb5e97b..9dc8125d04e5 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -61,7 +61,7 @@ xlog_cil_init(
61 INIT_LIST_HEAD(&cil->xc_committing); 61 INIT_LIST_HEAD(&cil->xc_committing);
62 spin_lock_init(&cil->xc_cil_lock); 62 spin_lock_init(&cil->xc_cil_lock);
63 init_rwsem(&cil->xc_ctx_lock); 63 init_rwsem(&cil->xc_ctx_lock);
64 sv_init(&cil->xc_commit_wait, SV_DEFAULT, "cilwait"); 64 init_waitqueue_head(&cil->xc_commit_wait);
65 65
66 INIT_LIST_HEAD(&ctx->committing); 66 INIT_LIST_HEAD(&ctx->committing);
67 INIT_LIST_HEAD(&ctx->busy_extents); 67 INIT_LIST_HEAD(&ctx->busy_extents);
@@ -361,15 +361,10 @@ xlog_cil_committed(
361 int abort) 361 int abort)
362{ 362{
363 struct xfs_cil_ctx *ctx = args; 363 struct xfs_cil_ctx *ctx = args;
364 struct xfs_log_vec *lv;
365 int abortflag = abort ? XFS_LI_ABORTED : 0;
366 struct xfs_busy_extent *busyp, *n; 364 struct xfs_busy_extent *busyp, *n;
367 365
368 /* unpin all the log items */ 366 xfs_trans_committed_bulk(ctx->cil->xc_log->l_ailp, ctx->lv_chain,
369 for (lv = ctx->lv_chain; lv; lv = lv->lv_next ) { 367 ctx->start_lsn, abort);
370 xfs_trans_item_committed(lv->lv_item, ctx->start_lsn,
371 abortflag);
372 }
373 368
374 list_for_each_entry_safe(busyp, n, &ctx->busy_extents, list) 369 list_for_each_entry_safe(busyp, n, &ctx->busy_extents, list)
375 xfs_alloc_busy_clear(ctx->cil->xc_log->l_mp, busyp); 370 xfs_alloc_busy_clear(ctx->cil->xc_log->l_mp, busyp);
@@ -568,7 +563,7 @@ restart:
568 * It is still being pushed! Wait for the push to 563 * It is still being pushed! Wait for the push to
569 * complete, then start again from the beginning. 564 * complete, then start again from the beginning.
570 */ 565 */
571 sv_wait(&cil->xc_commit_wait, 0, &cil->xc_cil_lock, 0); 566 xlog_wait(&cil->xc_commit_wait, &cil->xc_cil_lock);
572 goto restart; 567 goto restart;
573 } 568 }
574 } 569 }
@@ -592,7 +587,7 @@ restart:
592 */ 587 */
593 spin_lock(&cil->xc_cil_lock); 588 spin_lock(&cil->xc_cil_lock);
594 ctx->commit_lsn = commit_lsn; 589 ctx->commit_lsn = commit_lsn;
595 sv_broadcast(&cil->xc_commit_wait); 590 wake_up_all(&cil->xc_commit_wait);
596 spin_unlock(&cil->xc_cil_lock); 591 spin_unlock(&cil->xc_cil_lock);
597 592
598 /* release the hounds! */ 593 /* release the hounds! */
@@ -757,7 +752,7 @@ restart:
757 * It is still being pushed! Wait for the push to 752 * It is still being pushed! Wait for the push to
758 * complete, then start again from the beginning. 753 * complete, then start again from the beginning.
759 */ 754 */
760 sv_wait(&cil->xc_commit_wait, 0, &cil->xc_cil_lock, 0); 755 xlog_wait(&cil->xc_commit_wait, &cil->xc_cil_lock);
761 goto restart; 756 goto restart;
762 } 757 }
763 if (ctx->sequence != sequence) 758 if (ctx->sequence != sequence)
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index edcdfe01617f..d5f8be8f4bf6 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -21,7 +21,6 @@
21struct xfs_buf; 21struct xfs_buf;
22struct log; 22struct log;
23struct xlog_ticket; 23struct xlog_ticket;
24struct xfs_buf_cancel;
25struct xfs_mount; 24struct xfs_mount;
26 25
27/* 26/*
@@ -54,7 +53,6 @@ struct xfs_mount;
54 BTOBB(XLOG_MAX_ICLOGS << (xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? \ 53 BTOBB(XLOG_MAX_ICLOGS << (xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? \
55 XLOG_MAX_RECORD_BSHIFT : XLOG_BIG_RECORD_BSHIFT)) 54 XLOG_MAX_RECORD_BSHIFT : XLOG_BIG_RECORD_BSHIFT))
56 55
57
58static inline xfs_lsn_t xlog_assign_lsn(uint cycle, uint block) 56static inline xfs_lsn_t xlog_assign_lsn(uint cycle, uint block)
59{ 57{
60 return ((xfs_lsn_t)cycle << 32) | block; 58 return ((xfs_lsn_t)cycle << 32) | block;
@@ -133,12 +131,10 @@ static inline uint xlog_get_client_id(__be32 i)
133 */ 131 */
134#define XLOG_TIC_INITED 0x1 /* has been initialized */ 132#define XLOG_TIC_INITED 0x1 /* has been initialized */
135#define XLOG_TIC_PERM_RESERV 0x2 /* permanent reservation */ 133#define XLOG_TIC_PERM_RESERV 0x2 /* permanent reservation */
136#define XLOG_TIC_IN_Q 0x4
137 134
138#define XLOG_TIC_FLAGS \ 135#define XLOG_TIC_FLAGS \
139 { XLOG_TIC_INITED, "XLOG_TIC_INITED" }, \ 136 { XLOG_TIC_INITED, "XLOG_TIC_INITED" }, \
140 { XLOG_TIC_PERM_RESERV, "XLOG_TIC_PERM_RESERV" }, \ 137 { XLOG_TIC_PERM_RESERV, "XLOG_TIC_PERM_RESERV" }
141 { XLOG_TIC_IN_Q, "XLOG_TIC_IN_Q" }
142 138
143#endif /* __KERNEL__ */ 139#endif /* __KERNEL__ */
144 140
@@ -244,9 +240,8 @@ typedef struct xlog_res {
244} xlog_res_t; 240} xlog_res_t;
245 241
246typedef struct xlog_ticket { 242typedef struct xlog_ticket {
247 sv_t t_wait; /* ticket wait queue : 20 */ 243 wait_queue_head_t t_wait; /* ticket wait queue */
248 struct xlog_ticket *t_next; /* :4|8 */ 244 struct list_head t_queue; /* reserve/write queue */
249 struct xlog_ticket *t_prev; /* :4|8 */
250 xlog_tid_t t_tid; /* transaction identifier : 4 */ 245 xlog_tid_t t_tid; /* transaction identifier : 4 */
251 atomic_t t_ref; /* ticket reference count : 4 */ 246 atomic_t t_ref; /* ticket reference count : 4 */
252 int t_curr_res; /* current reservation in bytes : 4 */ 247 int t_curr_res; /* current reservation in bytes : 4 */
@@ -353,8 +348,8 @@ typedef union xlog_in_core2 {
353 * and move everything else out to subsequent cachelines. 348 * and move everything else out to subsequent cachelines.
354 */ 349 */
355typedef struct xlog_in_core { 350typedef struct xlog_in_core {
356 sv_t ic_force_wait; 351 wait_queue_head_t ic_force_wait;
357 sv_t ic_write_wait; 352 wait_queue_head_t ic_write_wait;
358 struct xlog_in_core *ic_next; 353 struct xlog_in_core *ic_next;
359 struct xlog_in_core *ic_prev; 354 struct xlog_in_core *ic_prev;
360 struct xfs_buf *ic_bp; 355 struct xfs_buf *ic_bp;
@@ -421,7 +416,7 @@ struct xfs_cil {
421 struct xfs_cil_ctx *xc_ctx; 416 struct xfs_cil_ctx *xc_ctx;
422 struct rw_semaphore xc_ctx_lock; 417 struct rw_semaphore xc_ctx_lock;
423 struct list_head xc_committing; 418 struct list_head xc_committing;
424 sv_t xc_commit_wait; 419 wait_queue_head_t xc_commit_wait;
425 xfs_lsn_t xc_current_sequence; 420 xfs_lsn_t xc_current_sequence;
426}; 421};
427 422
@@ -491,7 +486,7 @@ typedef struct log {
491 struct xfs_buftarg *l_targ; /* buftarg of log */ 486 struct xfs_buftarg *l_targ; /* buftarg of log */
492 uint l_flags; 487 uint l_flags;
493 uint l_quotaoffs_flag; /* XFS_DQ_*, for QUOTAOFFs */ 488 uint l_quotaoffs_flag; /* XFS_DQ_*, for QUOTAOFFs */
494 struct xfs_buf_cancel **l_buf_cancel_table; 489 struct list_head *l_buf_cancel_table;
495 int l_iclog_hsize; /* size of iclog header */ 490 int l_iclog_hsize; /* size of iclog header */
496 int l_iclog_heads; /* # of iclog header sectors */ 491 int l_iclog_heads; /* # of iclog header sectors */
497 uint l_sectBBsize; /* sector size in BBs (2^n) */ 492 uint l_sectBBsize; /* sector size in BBs (2^n) */
@@ -503,29 +498,40 @@ typedef struct log {
503 int l_logBBsize; /* size of log in BB chunks */ 498 int l_logBBsize; /* size of log in BB chunks */
504 499
505 /* The following block of fields are changed while holding icloglock */ 500 /* The following block of fields are changed while holding icloglock */
506 sv_t l_flush_wait ____cacheline_aligned_in_smp; 501 wait_queue_head_t l_flush_wait ____cacheline_aligned_in_smp;
507 /* waiting for iclog flush */ 502 /* waiting for iclog flush */
508 int l_covered_state;/* state of "covering disk 503 int l_covered_state;/* state of "covering disk
509 * log entries" */ 504 * log entries" */
510 xlog_in_core_t *l_iclog; /* head log queue */ 505 xlog_in_core_t *l_iclog; /* head log queue */
511 spinlock_t l_icloglock; /* grab to change iclog state */ 506 spinlock_t l_icloglock; /* grab to change iclog state */
512 xfs_lsn_t l_tail_lsn; /* lsn of 1st LR with unflushed
513 * buffers */
514 xfs_lsn_t l_last_sync_lsn;/* lsn of last LR on disk */
515 int l_curr_cycle; /* Cycle number of log writes */ 507 int l_curr_cycle; /* Cycle number of log writes */
516 int l_prev_cycle; /* Cycle number before last 508 int l_prev_cycle; /* Cycle number before last
517 * block increment */ 509 * block increment */
518 int l_curr_block; /* current logical log block */ 510 int l_curr_block; /* current logical log block */
519 int l_prev_block; /* previous logical log block */ 511 int l_prev_block; /* previous logical log block */
520 512
521 /* The following block of fields are changed while holding grant_lock */ 513 /*
522 spinlock_t l_grant_lock ____cacheline_aligned_in_smp; 514 * l_last_sync_lsn and l_tail_lsn are atomics so they can be set and
523 xlog_ticket_t *l_reserve_headq; 515 * read without needing to hold specific locks. To avoid operations
524 xlog_ticket_t *l_write_headq; 516 * contending with other hot objects, place each of them on a separate
525 int l_grant_reserve_cycle; 517 * cacheline.
526 int l_grant_reserve_bytes; 518 */
527 int l_grant_write_cycle; 519 /* lsn of last LR on disk */
528 int l_grant_write_bytes; 520 atomic64_t l_last_sync_lsn ____cacheline_aligned_in_smp;
 521 /* lsn of 1st LR with unflushed buffers */
522 atomic64_t l_tail_lsn ____cacheline_aligned_in_smp;
523
524 /*
 525 * ticket grant locks, queues and accounting have their own cachelines
526 * as these are quite hot and can be operated on concurrently.
527 */
528 spinlock_t l_grant_reserve_lock ____cacheline_aligned_in_smp;
529 struct list_head l_reserveq;
530 atomic64_t l_grant_reserve_head;
531
532 spinlock_t l_grant_write_lock ____cacheline_aligned_in_smp;
533 struct list_head l_writeq;
534 atomic64_t l_grant_write_head;
529 535
530 /* The following field are used for debugging; need to hold icloglock */ 536 /* The following field are used for debugging; need to hold icloglock */
531#ifdef DEBUG 537#ifdef DEBUG
@@ -534,6 +540,9 @@ typedef struct log {
534 540
535} xlog_t; 541} xlog_t;
536 542
543#define XLOG_BUF_CANCEL_BUCKET(log, blkno) \
544 ((log)->l_buf_cancel_table + ((__uint64_t)blkno % XLOG_BC_TABLE_SIZE))
545
537#define XLOG_FORCED_SHUTDOWN(log) ((log)->l_flags & XLOG_IO_ERROR) 546#define XLOG_FORCED_SHUTDOWN(log) ((log)->l_flags & XLOG_IO_ERROR)
538 547
539/* common routines */ 548/* common routines */
@@ -562,6 +571,61 @@ int xlog_write(struct log *log, struct xfs_log_vec *log_vector,
562 xlog_in_core_t **commit_iclog, uint flags); 571 xlog_in_core_t **commit_iclog, uint flags);
563 572
564/* 573/*
574 * When we crack an atomic LSN, we sample it first so that the value will not
575 * change while we are cracking it into the component values. This means we
576 * will always get consistent component values to work from. This should always
 577 * be used to sample and crack LSNs that are stored and updated in atomic
578 * variables.
579 */
580static inline void
581xlog_crack_atomic_lsn(atomic64_t *lsn, uint *cycle, uint *block)
582{
583 xfs_lsn_t val = atomic64_read(lsn);
584
585 *cycle = CYCLE_LSN(val);
586 *block = BLOCK_LSN(val);
587}
588
589/*
590 * Calculate and assign a value to an atomic LSN variable from component pieces.
591 */
592static inline void
593xlog_assign_atomic_lsn(atomic64_t *lsn, uint cycle, uint block)
594{
595 atomic64_set(lsn, xlog_assign_lsn(cycle, block));
596}
597
598/*
599 * When we crack the grant head, we sample it first so that the value will not
600 * change while we are cracking it into the component values. This means we
601 * will always get consistent component values to work from.
602 */
603static inline void
604xlog_crack_grant_head_val(int64_t val, int *cycle, int *space)
605{
606 *cycle = val >> 32;
607 *space = val & 0xffffffff;
608}
609
610static inline void
611xlog_crack_grant_head(atomic64_t *head, int *cycle, int *space)
612{
613 xlog_crack_grant_head_val(atomic64_read(head), cycle, space);
614}
615
616static inline int64_t
617xlog_assign_grant_head_val(int cycle, int space)
618{
619 return ((int64_t)cycle << 32) | space;
620}
621
622static inline void
623xlog_assign_grant_head(atomic64_t *head, int cycle, int space)
624{
625 atomic64_set(head, xlog_assign_grant_head_val(cycle, space));
626}
627
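As a sanity check on the encoding the helpers above implement, a hedged round-trip sketch in plain user-space C (the shift/mask arithmetic mirrors xlog_assign_grant_head_val() and xlog_crack_grant_head_val()):

	#include <assert.h>
	#include <stdint.h>

	int main(void)
	{
		int cycle = 7, space = 123456;

		/* assign: cycle in the high 32 bits, byte count in the low 32 */
		int64_t head = ((int64_t)cycle << 32) | space;

		/* crack: recover the components from the sampled 64-bit value */
		assert((int)(head >> 32) == cycle);
		assert((int)(head & 0xffffffff) == space);
		return 0;
	}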
628/*
565 * Committed Item List interfaces 629 * Committed Item List interfaces
566 */ 630 */
567int xlog_cil_init(struct log *log); 631int xlog_cil_init(struct log *log);
@@ -585,6 +649,21 @@ xlog_cil_force(struct log *log)
585 */ 649 */
586#define XLOG_UNMOUNT_REC_TYPE (-1U) 650#define XLOG_UNMOUNT_REC_TYPE (-1U)
587 651
652/*
653 * Wrapper function for waiting on a wait queue serialised against wakeups
654 * by a spinlock. This matches the semantics of all the wait queues used in the
655 * log code.
656 */
657static inline void xlog_wait(wait_queue_head_t *wq, spinlock_t *lock)
658{
659 DECLARE_WAITQUEUE(wait, current);
660
661 add_wait_queue_exclusive(wq, &wait);
662 __set_current_state(TASK_UNINTERRUPTIBLE);
663 spin_unlock(lock);
664 schedule();
665 remove_wait_queue(wq, &wait);
666}
588#endif /* __KERNEL__ */ 667#endif /* __KERNEL__ */
589 668
590#endif /* __XFS_LOG_PRIV_H__ */ 669#endif /* __XFS_LOG_PRIV_H__ */
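For readers unfamiliar with the pattern, a user-space analogue of the xlog_wait()/wake_up() pairing introduced above (pthread-based, hypothetical names): pthread_cond_wait() drops the lock and sleeps atomically, mirroring xlog_wait()'s unlock-then-schedule under the wait queue; callers still recheck their condition after waking, just as the goto redo loops in xfs_log.c do.

	#include <pthread.h>
	#include <stdio.h>

	static pthread_mutex_t grant_lock = PTHREAD_MUTEX_INITIALIZER;
	static pthread_cond_t  grant_wait = PTHREAD_COND_INITIALIZER;
	static int space_available;	/* stands in for free log space */

	static void *reserver(void *arg)
	{
		pthread_mutex_lock(&grant_lock);
		while (!space_available)	/* recheck after every wakeup */
			pthread_cond_wait(&grant_wait, &grant_lock);
		pthread_mutex_unlock(&grant_lock);
		puts("reservation granted");
		return NULL;
	}

	int main(void)
	{
		pthread_t t;

		pthread_create(&t, NULL, reserver, NULL);

		pthread_mutex_lock(&grant_lock);
		space_available = 1;		/* e.g. the log tail moved */
		pthread_cond_signal(&grant_wait); /* like wake_up(&tic->t_wait) */
		pthread_mutex_unlock(&grant_lock);

		pthread_join(t, NULL);
		return 0;
	}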
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 966d3f97458c..aa0ebb776903 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -53,6 +53,17 @@ STATIC void xlog_recover_check_summary(xlog_t *);
53#endif 53#endif
54 54
55/* 55/*
56 * This structure is used during recovery to record the buf log items which
57 * have been canceled and should not be replayed.
58 */
59struct xfs_buf_cancel {
60 xfs_daddr_t bc_blkno;
61 uint bc_len;
62 int bc_refcount;
63 struct list_head bc_list;
64};
65
66/*
56 * Sector aligned buffer routines for buffer create/read/write/access 67 * Sector aligned buffer routines for buffer create/read/write/access
57 */ 68 */
58 69
@@ -925,12 +936,12 @@ xlog_find_tail(
925 log->l_curr_cycle = be32_to_cpu(rhead->h_cycle); 936 log->l_curr_cycle = be32_to_cpu(rhead->h_cycle);
926 if (found == 2) 937 if (found == 2)
927 log->l_curr_cycle++; 938 log->l_curr_cycle++;
928 log->l_tail_lsn = be64_to_cpu(rhead->h_tail_lsn); 939 atomic64_set(&log->l_tail_lsn, be64_to_cpu(rhead->h_tail_lsn));
929 log->l_last_sync_lsn = be64_to_cpu(rhead->h_lsn); 940 atomic64_set(&log->l_last_sync_lsn, be64_to_cpu(rhead->h_lsn));
930 log->l_grant_reserve_cycle = log->l_curr_cycle; 941 xlog_assign_grant_head(&log->l_grant_reserve_head, log->l_curr_cycle,
931 log->l_grant_reserve_bytes = BBTOB(log->l_curr_block); 942 BBTOB(log->l_curr_block));
932 log->l_grant_write_cycle = log->l_curr_cycle; 943 xlog_assign_grant_head(&log->l_grant_write_head, log->l_curr_cycle,
933 log->l_grant_write_bytes = BBTOB(log->l_curr_block); 944 BBTOB(log->l_curr_block));
934 945
935 /* 946 /*
936 * Look for unmount record. If we find it, then we know there 947 * Look for unmount record. If we find it, then we know there
@@ -960,7 +971,7 @@ xlog_find_tail(
960 } 971 }
961 after_umount_blk = (i + hblks + (int) 972 after_umount_blk = (i + hblks + (int)
962 BTOBB(be32_to_cpu(rhead->h_len))) % log->l_logBBsize; 973 BTOBB(be32_to_cpu(rhead->h_len))) % log->l_logBBsize;
963 tail_lsn = log->l_tail_lsn; 974 tail_lsn = atomic64_read(&log->l_tail_lsn);
964 if (*head_blk == after_umount_blk && 975 if (*head_blk == after_umount_blk &&
965 be32_to_cpu(rhead->h_num_logops) == 1) { 976 be32_to_cpu(rhead->h_num_logops) == 1) {
966 umount_data_blk = (i + hblks) % log->l_logBBsize; 977 umount_data_blk = (i + hblks) % log->l_logBBsize;
@@ -975,12 +986,10 @@ xlog_find_tail(
975 * log records will point recovery to after the 986 * log records will point recovery to after the
976 * current unmount record. 987 * current unmount record.
977 */ 988 */
978 log->l_tail_lsn = 989 xlog_assign_atomic_lsn(&log->l_tail_lsn,
979 xlog_assign_lsn(log->l_curr_cycle, 990 log->l_curr_cycle, after_umount_blk);
980 after_umount_blk); 991 xlog_assign_atomic_lsn(&log->l_last_sync_lsn,
981 log->l_last_sync_lsn = 992 log->l_curr_cycle, after_umount_blk);
982 xlog_assign_lsn(log->l_curr_cycle,
983 after_umount_blk);
984 *tail_blk = after_umount_blk; 993 *tail_blk = after_umount_blk;
985 994
986 /* 995 /*
@@ -1605,82 +1614,45 @@ xlog_recover_reorder_trans(
1605 * record in the table to tell us how many times we expect to see this 1614 * record in the table to tell us how many times we expect to see this
1606 * record during the second pass. 1615 * record during the second pass.
1607 */ 1616 */
1608STATIC void 1617STATIC int
1609xlog_recover_do_buffer_pass1( 1618xlog_recover_buffer_pass1(
1610 xlog_t *log, 1619 struct log *log,
1611 xfs_buf_log_format_t *buf_f) 1620 xlog_recover_item_t *item)
1612{ 1621{
1613 xfs_buf_cancel_t *bcp; 1622 xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr;
1614 xfs_buf_cancel_t *nextp; 1623 struct list_head *bucket;
1615 xfs_buf_cancel_t *prevp; 1624 struct xfs_buf_cancel *bcp;
1616 xfs_buf_cancel_t **bucket;
1617 xfs_daddr_t blkno = 0;
1618 uint len = 0;
1619 ushort flags = 0;
1620
1621 switch (buf_f->blf_type) {
1622 case XFS_LI_BUF:
1623 blkno = buf_f->blf_blkno;
1624 len = buf_f->blf_len;
1625 flags = buf_f->blf_flags;
1626 break;
1627 }
1628 1625
1629 /* 1626 /*
1630 * If this isn't a cancel buffer item, then just return. 1627 * If this isn't a cancel buffer item, then just return.
1631 */ 1628 */
1632 if (!(flags & XFS_BLF_CANCEL)) { 1629 if (!(buf_f->blf_flags & XFS_BLF_CANCEL)) {
1633 trace_xfs_log_recover_buf_not_cancel(log, buf_f); 1630 trace_xfs_log_recover_buf_not_cancel(log, buf_f);
1634 return; 1631 return 0;
1635 }
1636
1637 /*
1638 * Insert an xfs_buf_cancel record into the hash table of
1639 * them. If there is already an identical record, bump
1640 * its reference count.
1641 */
1642 bucket = &log->l_buf_cancel_table[(__uint64_t)blkno %
1643 XLOG_BC_TABLE_SIZE];
1644 /*
1645 * If the hash bucket is empty then just insert a new record into
1646 * the bucket.
1647 */
1648 if (*bucket == NULL) {
1649 bcp = (xfs_buf_cancel_t *)kmem_alloc(sizeof(xfs_buf_cancel_t),
1650 KM_SLEEP);
1651 bcp->bc_blkno = blkno;
1652 bcp->bc_len = len;
1653 bcp->bc_refcount = 1;
1654 bcp->bc_next = NULL;
1655 *bucket = bcp;
1656 return;
1657 } 1632 }
1658 1633
1659 /* 1634 /*
1660 * The hash bucket is not empty, so search for duplicates of our 1635 * Insert an xfs_buf_cancel record into the hash table of cancel records.
1661 * record. If we find one then just bump its refcount. If not 1636 * If there is already an identical record, bump its reference count.
1662 * then add us at the end of the list.
1663 */ 1637 */
1664 prevp = NULL; 1638 bucket = XLOG_BUF_CANCEL_BUCKET(log, buf_f->blf_blkno);
1665 nextp = *bucket; 1639 list_for_each_entry(bcp, bucket, bc_list) {
1666 while (nextp != NULL) { 1640 if (bcp->bc_blkno == buf_f->blf_blkno &&
1667 if (nextp->bc_blkno == blkno && nextp->bc_len == len) { 1641 bcp->bc_len == buf_f->blf_len) {
1668 nextp->bc_refcount++; 1642 bcp->bc_refcount++;
1669 trace_xfs_log_recover_buf_cancel_ref_inc(log, buf_f); 1643 trace_xfs_log_recover_buf_cancel_ref_inc(log, buf_f);
1670 return; 1644 return 0;
1671 } 1645 }
1672 prevp = nextp; 1646 }
1673 nextp = nextp->bc_next; 1647
1674 } 1648 bcp = kmem_alloc(sizeof(struct xfs_buf_cancel), KM_SLEEP);
1675 ASSERT(prevp != NULL); 1649 bcp->bc_blkno = buf_f->blf_blkno;
1676 bcp = (xfs_buf_cancel_t *)kmem_alloc(sizeof(xfs_buf_cancel_t), 1650 bcp->bc_len = buf_f->blf_len;
1677 KM_SLEEP);
1678 bcp->bc_blkno = blkno;
1679 bcp->bc_len = len;
1680 bcp->bc_refcount = 1; 1651 bcp->bc_refcount = 1;
1681 bcp->bc_next = NULL; 1652 list_add_tail(&bcp->bc_list, bucket);
1682 prevp->bc_next = bcp; 1653
1683 trace_xfs_log_recover_buf_cancel_add(log, buf_f); 1654 trace_xfs_log_recover_buf_cancel_add(log, buf_f);
1655 return 0;
1684} 1656}
1685 1657
1686/* 1658/*
@@ -1698,14 +1670,13 @@ xlog_recover_do_buffer_pass1(
1698 */ 1670 */
1699STATIC int 1671STATIC int
1700xlog_check_buffer_cancelled( 1672xlog_check_buffer_cancelled(
1701 xlog_t *log, 1673 struct log *log,
1702 xfs_daddr_t blkno, 1674 xfs_daddr_t blkno,
1703 uint len, 1675 uint len,
1704 ushort flags) 1676 ushort flags)
1705{ 1677{
1706 xfs_buf_cancel_t *bcp; 1678 struct list_head *bucket;
1707 xfs_buf_cancel_t *prevp; 1679 struct xfs_buf_cancel *bcp;
1708 xfs_buf_cancel_t **bucket;
1709 1680
1710 if (log->l_buf_cancel_table == NULL) { 1681 if (log->l_buf_cancel_table == NULL) {
1711 /* 1682 /*
@@ -1716,128 +1687,70 @@ xlog_check_buffer_cancelled(
1716 return 0; 1687 return 0;
1717 } 1688 }
1718 1689
1719 bucket = &log->l_buf_cancel_table[(__uint64_t)blkno %
1720 XLOG_BC_TABLE_SIZE];
1721 bcp = *bucket;
1722 if (bcp == NULL) {
1723 /*
1724 * There is no corresponding entry in the table built
1725 * in pass one, so this buffer has not been cancelled.
1726 */
1727 ASSERT(!(flags & XFS_BLF_CANCEL));
1728 return 0;
1729 }
1730
1731 /* 1690 /*
1732 * Search for an entry in the buffer cancel table that 1691 * Search for an entry in the cancel table that matches our buffer.
1733 * matches our buffer.
1734 */ 1692 */
1735 prevp = NULL; 1693 bucket = XLOG_BUF_CANCEL_BUCKET(log, blkno);
1736 while (bcp != NULL) { 1694 list_for_each_entry(bcp, bucket, bc_list) {
1737 if (bcp->bc_blkno == blkno && bcp->bc_len == len) { 1695 if (bcp->bc_blkno == blkno && bcp->bc_len == len)
1738 /* 1696 goto found;
1739 * We've got a match, so return 1 so that the
1740 * recovery of this buffer is cancelled.
1741 * If this buffer is actually a buffer cancel
1742 * log item, then decrement the refcount on the
1743 * one in the table and remove it if this is the
1744 * last reference.
1745 */
1746 if (flags & XFS_BLF_CANCEL) {
1747 bcp->bc_refcount--;
1748 if (bcp->bc_refcount == 0) {
1749 if (prevp == NULL) {
1750 *bucket = bcp->bc_next;
1751 } else {
1752 prevp->bc_next = bcp->bc_next;
1753 }
1754 kmem_free(bcp);
1755 }
1756 }
1757 return 1;
1758 }
1759 prevp = bcp;
1760 bcp = bcp->bc_next;
1761 } 1697 }
1698
1762 /* 1699 /*
1763 * We didn't find a corresponding entry in the table, so 1700 * We didn't find a corresponding entry in the table, so return 0 so
1764 * return 0 so that the buffer is NOT cancelled. 1701 * that the buffer is NOT cancelled.
1765 */ 1702 */
1766 ASSERT(!(flags & XFS_BLF_CANCEL)); 1703 ASSERT(!(flags & XFS_BLF_CANCEL));
1767 return 0; 1704 return 0;
1768}
1769 1705
1770STATIC int 1706found:
1771xlog_recover_do_buffer_pass2( 1707 /*
1772 xlog_t *log, 1708 * We've go a match, so return 1 so that the recovery of this buffer
1773 xfs_buf_log_format_t *buf_f) 1709 * is cancelled. If this buffer is actually a buffer cancel log
1774{ 1710 * item, then decrement the refcount on the one in the table and
1775 xfs_daddr_t blkno = 0; 1711 * remove it if this is the last reference.
1776 ushort flags = 0; 1712 */
1777 uint len = 0; 1713 if (flags & XFS_BLF_CANCEL) {
1778 1714 if (--bcp->bc_refcount == 0) {
1779 switch (buf_f->blf_type) { 1715 list_del(&bcp->bc_list);
1780 case XFS_LI_BUF: 1716 kmem_free(bcp);
1781 blkno = buf_f->blf_blkno; 1717 }
1782 flags = buf_f->blf_flags;
1783 len = buf_f->blf_len;
1784 break;
1785 } 1718 }
1786 1719 return 1;
1787 return xlog_check_buffer_cancelled(log, blkno, len, flags);
1788} 1720}
1789 1721
1790/* 1722/*
1791 * Perform recovery for a buffer full of inodes. In these buffers, 1723 * Perform recovery for a buffer full of inodes. In these buffers, the only
1792 * the only data which should be recovered is that which corresponds 1724 * data which should be recovered is that which corresponds to the
1793 * to the di_next_unlinked pointers in the on disk inode structures. 1725 * di_next_unlinked pointers in the on disk inode structures. The rest of the
1794 * The rest of the data for the inodes is always logged through the 1726 * data for the inodes is always logged through the inodes themselves rather
1795 * inodes themselves rather than the inode buffer and is recovered 1727 * than the inode buffer and is recovered in xlog_recover_inode_pass2().
1796 * in xlog_recover_do_inode_trans().
1797 * 1728 *
1798 * The only time when buffers full of inodes are fully recovered is 1729 * The only time when buffers full of inodes are fully recovered is when the
1799 * when the buffer is full of newly allocated inodes. In this case 1730 * buffer is full of newly allocated inodes. In this case the buffer will
1800 * the buffer will not be marked as an inode buffer and so will be 1731 * not be marked as an inode buffer and so will be sent to
1801 * sent to xlog_recover_do_reg_buffer() below during recovery. 1732 * xlog_recover_do_reg_buffer() below during recovery.
1802 */ 1733 */
1803STATIC int 1734STATIC int
1804xlog_recover_do_inode_buffer( 1735xlog_recover_do_inode_buffer(
1805 xfs_mount_t *mp, 1736 struct xfs_mount *mp,
1806 xlog_recover_item_t *item, 1737 xlog_recover_item_t *item,
1807 xfs_buf_t *bp, 1738 struct xfs_buf *bp,
1808 xfs_buf_log_format_t *buf_f) 1739 xfs_buf_log_format_t *buf_f)
1809{ 1740{
1810 int i; 1741 int i;
1811 int item_index; 1742 int item_index = 0;
1812 int bit; 1743 int bit = 0;
1813 int nbits; 1744 int nbits = 0;
1814 int reg_buf_offset; 1745 int reg_buf_offset = 0;
1815 int reg_buf_bytes; 1746 int reg_buf_bytes = 0;
1816 int next_unlinked_offset; 1747 int next_unlinked_offset;
1817 int inodes_per_buf; 1748 int inodes_per_buf;
1818 xfs_agino_t *logged_nextp; 1749 xfs_agino_t *logged_nextp;
1819 xfs_agino_t *buffer_nextp; 1750 xfs_agino_t *buffer_nextp;
1820 unsigned int *data_map = NULL;
1821 unsigned int map_size = 0;
1822 1751
1823 trace_xfs_log_recover_buf_inode_buf(mp->m_log, buf_f); 1752 trace_xfs_log_recover_buf_inode_buf(mp->m_log, buf_f);
1824 1753
1825 switch (buf_f->blf_type) {
1826 case XFS_LI_BUF:
1827 data_map = buf_f->blf_data_map;
1828 map_size = buf_f->blf_map_size;
1829 break;
1830 }
1831 /*
1832 * Set the variables corresponding to the current region to
1833 * 0 so that we'll initialize them on the first pass through
1834 * the loop.
1835 */
1836 reg_buf_offset = 0;
1837 reg_buf_bytes = 0;
1838 bit = 0;
1839 nbits = 0;
1840 item_index = 0;
1841 inodes_per_buf = XFS_BUF_COUNT(bp) >> mp->m_sb.sb_inodelog; 1754 inodes_per_buf = XFS_BUF_COUNT(bp) >> mp->m_sb.sb_inodelog;
1842 for (i = 0; i < inodes_per_buf; i++) { 1755 for (i = 0; i < inodes_per_buf; i++) {
1843 next_unlinked_offset = (i * mp->m_sb.sb_inodesize) + 1756 next_unlinked_offset = (i * mp->m_sb.sb_inodesize) +
@@ -1852,18 +1765,18 @@ xlog_recover_do_inode_buffer(
1852 * the current di_next_unlinked field. 1765 * the current di_next_unlinked field.
1853 */ 1766 */
1854 bit += nbits; 1767 bit += nbits;
1855 bit = xfs_next_bit(data_map, map_size, bit); 1768 bit = xfs_next_bit(buf_f->blf_data_map,
1769 buf_f->blf_map_size, bit);
1856 1770
1857 /* 1771 /*
1858 * If there are no more logged regions in the 1772 * If there are no more logged regions in the
1859 * buffer, then we're done. 1773 * buffer, then we're done.
1860 */ 1774 */
1861 if (bit == -1) { 1775 if (bit == -1)
1862 return 0; 1776 return 0;
1863 }
1864 1777
1865 nbits = xfs_contig_bits(data_map, map_size, 1778 nbits = xfs_contig_bits(buf_f->blf_data_map,
1866 bit); 1779 buf_f->blf_map_size, bit);
1867 ASSERT(nbits > 0); 1780 ASSERT(nbits > 0);
1868 reg_buf_offset = bit << XFS_BLF_SHIFT; 1781 reg_buf_offset = bit << XFS_BLF_SHIFT;
1869 reg_buf_bytes = nbits << XFS_BLF_SHIFT; 1782 reg_buf_bytes = nbits << XFS_BLF_SHIFT;
@@ -1875,9 +1788,8 @@ xlog_recover_do_inode_buffer(
1875 * di_next_unlinked field, then move on to the next 1788 * di_next_unlinked field, then move on to the next
1876 * di_next_unlinked field. 1789 * di_next_unlinked field.
1877 */ 1790 */
1878 if (next_unlinked_offset < reg_buf_offset) { 1791 if (next_unlinked_offset < reg_buf_offset)
1879 continue; 1792 continue;
1880 }
1881 1793
1882 ASSERT(item->ri_buf[item_index].i_addr != NULL); 1794 ASSERT(item->ri_buf[item_index].i_addr != NULL);
1883 ASSERT((item->ri_buf[item_index].i_len % XFS_BLF_CHUNK) == 0); 1795 ASSERT((item->ri_buf[item_index].i_len % XFS_BLF_CHUNK) == 0);
@@ -1913,36 +1825,29 @@ xlog_recover_do_inode_buffer(
1913 * given buffer. The bitmap in the buf log format structure indicates 1825 * given buffer. The bitmap in the buf log format structure indicates
1914 * where to place the logged data. 1826 * where to place the logged data.
1915 */ 1827 */
1916/*ARGSUSED*/
1917STATIC void 1828STATIC void
1918xlog_recover_do_reg_buffer( 1829xlog_recover_do_reg_buffer(
1919 struct xfs_mount *mp, 1830 struct xfs_mount *mp,
1920 xlog_recover_item_t *item, 1831 xlog_recover_item_t *item,
1921 xfs_buf_t *bp, 1832 struct xfs_buf *bp,
1922 xfs_buf_log_format_t *buf_f) 1833 xfs_buf_log_format_t *buf_f)
1923{ 1834{
1924 int i; 1835 int i;
1925 int bit; 1836 int bit;
1926 int nbits; 1837 int nbits;
1927 unsigned int *data_map = NULL;
1928 unsigned int map_size = 0;
1929 int error; 1838 int error;
1930 1839
1931 trace_xfs_log_recover_buf_reg_buf(mp->m_log, buf_f); 1840 trace_xfs_log_recover_buf_reg_buf(mp->m_log, buf_f);
1932 1841
1933 switch (buf_f->blf_type) {
1934 case XFS_LI_BUF:
1935 data_map = buf_f->blf_data_map;
1936 map_size = buf_f->blf_map_size;
1937 break;
1938 }
1939 bit = 0; 1842 bit = 0;
1940 i = 1; /* 0 is the buf format structure */ 1843 i = 1; /* 0 is the buf format structure */
1941 while (1) { 1844 while (1) {
1942 bit = xfs_next_bit(data_map, map_size, bit); 1845 bit = xfs_next_bit(buf_f->blf_data_map,
1846 buf_f->blf_map_size, bit);
1943 if (bit == -1) 1847 if (bit == -1)
1944 break; 1848 break;
1945 nbits = xfs_contig_bits(data_map, map_size, bit); 1849 nbits = xfs_contig_bits(buf_f->blf_data_map,
1850 buf_f->blf_map_size, bit);
1946 ASSERT(nbits > 0); 1851 ASSERT(nbits > 0);
1947 ASSERT(item->ri_buf[i].i_addr != NULL); 1852 ASSERT(item->ri_buf[i].i_addr != NULL);
1948 ASSERT(item->ri_buf[i].i_len % XFS_BLF_CHUNK == 0); 1853 ASSERT(item->ri_buf[i].i_len % XFS_BLF_CHUNK == 0);
@@ -2176,77 +2081,46 @@ xlog_recover_do_dquot_buffer(
2176 * for more details on the implementation of the table of cancel records. 2081 * for more details on the implementation of the table of cancel records.
2177 */ 2082 */
2178STATIC int 2083STATIC int
2179xlog_recover_do_buffer_trans( 2084xlog_recover_buffer_pass2(
2180 xlog_t *log, 2085 xlog_t *log,
2181 xlog_recover_item_t *item, 2086 xlog_recover_item_t *item)
2182 int pass)
2183{ 2087{
2184 xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr; 2088 xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr;
2185 xfs_mount_t *mp; 2089 xfs_mount_t *mp = log->l_mp;
2186 xfs_buf_t *bp; 2090 xfs_buf_t *bp;
2187 int error; 2091 int error;
2188 int cancel;
2189 xfs_daddr_t blkno;
2190 int len;
2191 ushort flags;
2192 uint buf_flags; 2092 uint buf_flags;
2193 2093
2194 if (pass == XLOG_RECOVER_PASS1) { 2094 /*
2195 /* 2095 * In this pass we only want to recover all the buffers which have
2196 * In this pass we're only looking for buf items 2096 * not been cancelled and are not cancellation buffers themselves.
2197 * with the XFS_BLF_CANCEL bit set. 2097 */
2198 */ 2098 if (xlog_check_buffer_cancelled(log, buf_f->blf_blkno,
2199 xlog_recover_do_buffer_pass1(log, buf_f); 2099 buf_f->blf_len, buf_f->blf_flags)) {
2100 trace_xfs_log_recover_buf_cancel(log, buf_f);
2200 return 0; 2101 return 0;
2201 } else {
2202 /*
2203 * In this pass we want to recover all the buffers
2204 * which have not been cancelled and are not
2205 * cancellation buffers themselves. The routine
2206 * we call here will tell us whether or not to
2207 * continue with the replay of this buffer.
2208 */
2209 cancel = xlog_recover_do_buffer_pass2(log, buf_f);
2210 if (cancel) {
2211 trace_xfs_log_recover_buf_cancel(log, buf_f);
2212 return 0;
2213 }
2214 } 2102 }
2103
2215 trace_xfs_log_recover_buf_recover(log, buf_f); 2104 trace_xfs_log_recover_buf_recover(log, buf_f);
2216 switch (buf_f->blf_type) {
2217 case XFS_LI_BUF:
2218 blkno = buf_f->blf_blkno;
2219 len = buf_f->blf_len;
2220 flags = buf_f->blf_flags;
2221 break;
2222 default:
2223 xfs_fs_cmn_err(CE_ALERT, log->l_mp,
2224 "xfs_log_recover: unknown buffer type 0x%x, logdev %s",
2225 buf_f->blf_type, log->l_mp->m_logname ?
2226 log->l_mp->m_logname : "internal");
2227 XFS_ERROR_REPORT("xlog_recover_do_buffer_trans",
2228 XFS_ERRLEVEL_LOW, log->l_mp);
2229 return XFS_ERROR(EFSCORRUPTED);
2230 }
2231 2105
2232 mp = log->l_mp;
2233 buf_flags = XBF_LOCK; 2106 buf_flags = XBF_LOCK;
2234 if (!(flags & XFS_BLF_INODE_BUF)) 2107 if (!(buf_f->blf_flags & XFS_BLF_INODE_BUF))
2235 buf_flags |= XBF_MAPPED; 2108 buf_flags |= XBF_MAPPED;
2236 2109
2237 bp = xfs_buf_read(mp->m_ddev_targp, blkno, len, buf_flags); 2110 bp = xfs_buf_read(mp->m_ddev_targp, buf_f->blf_blkno, buf_f->blf_len,
2111 buf_flags);
2238 if (XFS_BUF_ISERROR(bp)) { 2112 if (XFS_BUF_ISERROR(bp)) {
2239 xfs_ioerror_alert("xlog_recover_do..(read#1)", log->l_mp, 2113 xfs_ioerror_alert("xlog_recover_do..(read#1)", mp,
2240 bp, blkno); 2114 bp, buf_f->blf_blkno);
2241 error = XFS_BUF_GETERROR(bp); 2115 error = XFS_BUF_GETERROR(bp);
2242 xfs_buf_relse(bp); 2116 xfs_buf_relse(bp);
2243 return error; 2117 return error;
2244 } 2118 }
2245 2119
2246 error = 0; 2120 error = 0;
2247 if (flags & XFS_BLF_INODE_BUF) { 2121 if (buf_f->blf_flags & XFS_BLF_INODE_BUF) {
2248 error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f); 2122 error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f);
2249 } else if (flags & 2123 } else if (buf_f->blf_flags &
2250 (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) { 2124 (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) {
2251 xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f); 2125 xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f);
2252 } else { 2126 } else {
@@ -2286,16 +2160,14 @@ xlog_recover_do_buffer_trans(
2286} 2160}
2287 2161
2288STATIC int 2162STATIC int
2289xlog_recover_do_inode_trans( 2163xlog_recover_inode_pass2(
2290 xlog_t *log, 2164 xlog_t *log,
2291 xlog_recover_item_t *item, 2165 xlog_recover_item_t *item)
2292 int pass)
2293{ 2166{
2294 xfs_inode_log_format_t *in_f; 2167 xfs_inode_log_format_t *in_f;
2295 xfs_mount_t *mp; 2168 xfs_mount_t *mp = log->l_mp;
2296 xfs_buf_t *bp; 2169 xfs_buf_t *bp;
2297 xfs_dinode_t *dip; 2170 xfs_dinode_t *dip;
2298 xfs_ino_t ino;
2299 int len; 2171 int len;
2300 xfs_caddr_t src; 2172 xfs_caddr_t src;
2301 xfs_caddr_t dest; 2173 xfs_caddr_t dest;
@@ -2305,10 +2177,6 @@ xlog_recover_do_inode_trans(
2305 xfs_icdinode_t *dicp; 2177 xfs_icdinode_t *dicp;
2306 int need_free = 0; 2178 int need_free = 0;
2307 2179
2308 if (pass == XLOG_RECOVER_PASS1) {
2309 return 0;
2310 }
2311
2312 if (item->ri_buf[0].i_len == sizeof(xfs_inode_log_format_t)) { 2180 if (item->ri_buf[0].i_len == sizeof(xfs_inode_log_format_t)) {
2313 in_f = item->ri_buf[0].i_addr; 2181 in_f = item->ri_buf[0].i_addr;
2314 } else { 2182 } else {
@@ -2318,8 +2186,6 @@ xlog_recover_do_inode_trans(
2318 if (error) 2186 if (error)
2319 goto error; 2187 goto error;
2320 } 2188 }
2321 ino = in_f->ilf_ino;
2322 mp = log->l_mp;
2323 2189
2324 /* 2190 /*
2325 * Inode buffers can be freed, look out for it, 2191 * Inode buffers can be freed, look out for it,
@@ -2354,8 +2220,8 @@ xlog_recover_do_inode_trans(
2354 xfs_buf_relse(bp); 2220 xfs_buf_relse(bp);
2355 xfs_fs_cmn_err(CE_ALERT, mp, 2221 xfs_fs_cmn_err(CE_ALERT, mp,
2356 "xfs_inode_recover: Bad inode magic number, dino ptr = 0x%p, dino bp = 0x%p, ino = %Ld", 2222 "xfs_inode_recover: Bad inode magic number, dino ptr = 0x%p, dino bp = 0x%p, ino = %Ld",
2357 dip, bp, ino); 2223 dip, bp, in_f->ilf_ino);
2358 XFS_ERROR_REPORT("xlog_recover_do_inode_trans(1)", 2224 XFS_ERROR_REPORT("xlog_recover_inode_pass2(1)",
2359 XFS_ERRLEVEL_LOW, mp); 2225 XFS_ERRLEVEL_LOW, mp);
2360 error = EFSCORRUPTED; 2226 error = EFSCORRUPTED;
2361 goto error; 2227 goto error;
@@ -2365,8 +2231,8 @@ xlog_recover_do_inode_trans(
2365 xfs_buf_relse(bp); 2231 xfs_buf_relse(bp);
2366 xfs_fs_cmn_err(CE_ALERT, mp, 2232 xfs_fs_cmn_err(CE_ALERT, mp,
2367 "xfs_inode_recover: Bad inode log record, rec ptr 0x%p, ino %Ld", 2233 "xfs_inode_recover: Bad inode log record, rec ptr 0x%p, ino %Ld",
2368 item, ino); 2234 item, in_f->ilf_ino);
2369 XFS_ERROR_REPORT("xlog_recover_do_inode_trans(2)", 2235 XFS_ERROR_REPORT("xlog_recover_inode_pass2(2)",
2370 XFS_ERRLEVEL_LOW, mp); 2236 XFS_ERRLEVEL_LOW, mp);
2371 error = EFSCORRUPTED; 2237 error = EFSCORRUPTED;
2372 goto error; 2238 goto error;
@@ -2394,12 +2260,12 @@ xlog_recover_do_inode_trans(
2394 if (unlikely((dicp->di_mode & S_IFMT) == S_IFREG)) { 2260 if (unlikely((dicp->di_mode & S_IFMT) == S_IFREG)) {
2395 if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) && 2261 if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) &&
2396 (dicp->di_format != XFS_DINODE_FMT_BTREE)) { 2262 (dicp->di_format != XFS_DINODE_FMT_BTREE)) {
2397 XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(3)", 2263 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(3)",
2398 XFS_ERRLEVEL_LOW, mp, dicp); 2264 XFS_ERRLEVEL_LOW, mp, dicp);
2399 xfs_buf_relse(bp); 2265 xfs_buf_relse(bp);
2400 xfs_fs_cmn_err(CE_ALERT, mp, 2266 xfs_fs_cmn_err(CE_ALERT, mp,
2401 "xfs_inode_recover: Bad regular inode log record, rec ptr 0x%p, ino ptr = 0x%p, ino bp = 0x%p, ino %Ld", 2267 "xfs_inode_recover: Bad regular inode log record, rec ptr 0x%p, ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
2402 item, dip, bp, ino); 2268 item, dip, bp, in_f->ilf_ino);
2403 error = EFSCORRUPTED; 2269 error = EFSCORRUPTED;
2404 goto error; 2270 goto error;
2405 } 2271 }
@@ -2407,40 +2273,40 @@ xlog_recover_do_inode_trans(
2407 if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) && 2273 if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) &&
2408 (dicp->di_format != XFS_DINODE_FMT_BTREE) && 2274 (dicp->di_format != XFS_DINODE_FMT_BTREE) &&
2409 (dicp->di_format != XFS_DINODE_FMT_LOCAL)) { 2275 (dicp->di_format != XFS_DINODE_FMT_LOCAL)) {
2410 XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(4)", 2276 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(4)",
2411 XFS_ERRLEVEL_LOW, mp, dicp); 2277 XFS_ERRLEVEL_LOW, mp, dicp);
2412 xfs_buf_relse(bp); 2278 xfs_buf_relse(bp);
2413 xfs_fs_cmn_err(CE_ALERT, mp, 2279 xfs_fs_cmn_err(CE_ALERT, mp,
2414 "xfs_inode_recover: Bad dir inode log record, rec ptr 0x%p, ino ptr = 0x%p, ino bp = 0x%p, ino %Ld", 2280 "xfs_inode_recover: Bad dir inode log record, rec ptr 0x%p, ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
2415 item, dip, bp, ino); 2281 item, dip, bp, in_f->ilf_ino);
2416 error = EFSCORRUPTED; 2282 error = EFSCORRUPTED;
2417 goto error; 2283 goto error;
2418 } 2284 }
2419 } 2285 }
2420 if (unlikely(dicp->di_nextents + dicp->di_anextents > dicp->di_nblocks)){ 2286 if (unlikely(dicp->di_nextents + dicp->di_anextents > dicp->di_nblocks)){
2421 XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(5)", 2287 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(5)",
2422 XFS_ERRLEVEL_LOW, mp, dicp); 2288 XFS_ERRLEVEL_LOW, mp, dicp);
2423 xfs_buf_relse(bp); 2289 xfs_buf_relse(bp);
2424 xfs_fs_cmn_err(CE_ALERT, mp, 2290 xfs_fs_cmn_err(CE_ALERT, mp,
2425 "xfs_inode_recover: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, dino bp 0x%p, ino %Ld, total extents = %d, nblocks = %Ld", 2291 "xfs_inode_recover: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, dino bp 0x%p, ino %Ld, total extents = %d, nblocks = %Ld",
2426 item, dip, bp, ino, 2292 item, dip, bp, in_f->ilf_ino,
2427 dicp->di_nextents + dicp->di_anextents, 2293 dicp->di_nextents + dicp->di_anextents,
2428 dicp->di_nblocks); 2294 dicp->di_nblocks);
2429 error = EFSCORRUPTED; 2295 error = EFSCORRUPTED;
2430 goto error; 2296 goto error;
2431 } 2297 }
2432 if (unlikely(dicp->di_forkoff > mp->m_sb.sb_inodesize)) { 2298 if (unlikely(dicp->di_forkoff > mp->m_sb.sb_inodesize)) {
2433 XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(6)", 2299 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(6)",
2434 XFS_ERRLEVEL_LOW, mp, dicp); 2300 XFS_ERRLEVEL_LOW, mp, dicp);
2435 xfs_buf_relse(bp); 2301 xfs_buf_relse(bp);
2436 xfs_fs_cmn_err(CE_ALERT, mp, 2302 xfs_fs_cmn_err(CE_ALERT, mp,
2437 "xfs_inode_recover: Bad inode log rec ptr 0x%p, dino ptr 0x%p, dino bp 0x%p, ino %Ld, forkoff 0x%x", 2303 "xfs_inode_recover: Bad inode log rec ptr 0x%p, dino ptr 0x%p, dino bp 0x%p, ino %Ld, forkoff 0x%x",
2438 item, dip, bp, ino, dicp->di_forkoff); 2304 item, dip, bp, in_f->ilf_ino, dicp->di_forkoff);
2439 error = EFSCORRUPTED; 2305 error = EFSCORRUPTED;
2440 goto error; 2306 goto error;
2441 } 2307 }
2442 if (unlikely(item->ri_buf[1].i_len > sizeof(struct xfs_icdinode))) { 2308 if (unlikely(item->ri_buf[1].i_len > sizeof(struct xfs_icdinode))) {
2443 XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(7)", 2309 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(7)",
2444 XFS_ERRLEVEL_LOW, mp, dicp); 2310 XFS_ERRLEVEL_LOW, mp, dicp);
2445 xfs_buf_relse(bp); 2311 xfs_buf_relse(bp);
2446 xfs_fs_cmn_err(CE_ALERT, mp, 2312 xfs_fs_cmn_err(CE_ALERT, mp,
@@ -2532,7 +2398,7 @@ xlog_recover_do_inode_trans(
2532 break; 2398 break;
2533 2399
2534 default: 2400 default:
2535 xlog_warn("XFS: xlog_recover_do_inode_trans: Invalid flag"); 2401 xlog_warn("XFS: xlog_recover_inode_pass2: Invalid flag");
2536 ASSERT(0); 2402 ASSERT(0);
2537 xfs_buf_relse(bp); 2403 xfs_buf_relse(bp);
2538 error = EIO; 2404 error = EIO;
@@ -2556,18 +2422,11 @@ error:
2556 * of that type. 2422 * of that type.
2557 */ 2423 */
2558STATIC int 2424STATIC int
2559xlog_recover_do_quotaoff_trans( 2425xlog_recover_quotaoff_pass1(
2560 xlog_t *log, 2426 xlog_t *log,
2561 xlog_recover_item_t *item, 2427 xlog_recover_item_t *item)
2562 int pass)
2563{ 2428{
2564 xfs_qoff_logformat_t *qoff_f; 2429 xfs_qoff_logformat_t *qoff_f = item->ri_buf[0].i_addr;
2565
2566 if (pass == XLOG_RECOVER_PASS2) {
2567 return (0);
2568 }
2569
2570 qoff_f = item->ri_buf[0].i_addr;
2571 ASSERT(qoff_f); 2430 ASSERT(qoff_f);
2572 2431
2573 /* 2432 /*
@@ -2588,22 +2447,17 @@ xlog_recover_do_quotaoff_trans(
2588 * Recover a dquot record 2447 * Recover a dquot record
2589 */ 2448 */
2590STATIC int 2449STATIC int
2591xlog_recover_do_dquot_trans( 2450xlog_recover_dquot_pass2(
2592 xlog_t *log, 2451 xlog_t *log,
2593 xlog_recover_item_t *item, 2452 xlog_recover_item_t *item)
2594 int pass)
2595{ 2453{
2596 xfs_mount_t *mp; 2454 xfs_mount_t *mp = log->l_mp;
2597 xfs_buf_t *bp; 2455 xfs_buf_t *bp;
2598 struct xfs_disk_dquot *ddq, *recddq; 2456 struct xfs_disk_dquot *ddq, *recddq;
2599 int error; 2457 int error;
2600 xfs_dq_logformat_t *dq_f; 2458 xfs_dq_logformat_t *dq_f;
2601 uint type; 2459 uint type;
2602 2460
2603 if (pass == XLOG_RECOVER_PASS1) {
2604 return 0;
2605 }
2606 mp = log->l_mp;
2607 2461
2608 /* 2462 /*
2609 * Filesystems are required to send in quota flags at mount time. 2463 * Filesystems are required to send in quota flags at mount time.
@@ -2647,7 +2501,7 @@ xlog_recover_do_dquot_trans(
2647 if ((error = xfs_qm_dqcheck(recddq, 2501 if ((error = xfs_qm_dqcheck(recddq,
2648 dq_f->qlf_id, 2502 dq_f->qlf_id,
2649 0, XFS_QMOPT_DOWARN, 2503 0, XFS_QMOPT_DOWARN,
2650 "xlog_recover_do_dquot_trans (log copy)"))) { 2504 "xlog_recover_dquot_pass2 (log copy)"))) {
2651 return XFS_ERROR(EIO); 2505 return XFS_ERROR(EIO);
2652 } 2506 }
2653 ASSERT(dq_f->qlf_len == 1); 2507 ASSERT(dq_f->qlf_len == 1);
@@ -2670,7 +2524,7 @@ xlog_recover_do_dquot_trans(
2670 * minimal initialization then. 2524 * minimal initialization then.
2671 */ 2525 */
2672 if (xfs_qm_dqcheck(ddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN, 2526 if (xfs_qm_dqcheck(ddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN,
2673 "xlog_recover_do_dquot_trans")) { 2527 "xlog_recover_dquot_pass2")) {
2674 xfs_buf_relse(bp); 2528 xfs_buf_relse(bp);
2675 return XFS_ERROR(EIO); 2529 return XFS_ERROR(EIO);
2676 } 2530 }
@@ -2693,38 +2547,31 @@ xlog_recover_do_dquot_trans(
2693 * LSN. 2547 * LSN.
2694 */ 2548 */
2695STATIC int 2549STATIC int
2696xlog_recover_do_efi_trans( 2550xlog_recover_efi_pass2(
2697 xlog_t *log, 2551 xlog_t *log,
2698 xlog_recover_item_t *item, 2552 xlog_recover_item_t *item,
2699 xfs_lsn_t lsn, 2553 xfs_lsn_t lsn)
2700 int pass)
2701{ 2554{
2702 int error; 2555 int error;
2703 xfs_mount_t *mp; 2556 xfs_mount_t *mp = log->l_mp;
2704 xfs_efi_log_item_t *efip; 2557 xfs_efi_log_item_t *efip;
2705 xfs_efi_log_format_t *efi_formatp; 2558 xfs_efi_log_format_t *efi_formatp;
2706 2559
2707 if (pass == XLOG_RECOVER_PASS1) {
2708 return 0;
2709 }
2710
2711 efi_formatp = item->ri_buf[0].i_addr; 2560 efi_formatp = item->ri_buf[0].i_addr;
2712 2561
2713 mp = log->l_mp;
2714 efip = xfs_efi_init(mp, efi_formatp->efi_nextents); 2562 efip = xfs_efi_init(mp, efi_formatp->efi_nextents);
2715 if ((error = xfs_efi_copy_format(&(item->ri_buf[0]), 2563 if ((error = xfs_efi_copy_format(&(item->ri_buf[0]),
2716 &(efip->efi_format)))) { 2564 &(efip->efi_format)))) {
2717 xfs_efi_item_free(efip); 2565 xfs_efi_item_free(efip);
2718 return error; 2566 return error;
2719 } 2567 }
2720 efip->efi_next_extent = efi_formatp->efi_nextents; 2568 atomic_set(&efip->efi_next_extent, efi_formatp->efi_nextents);
2721 efip->efi_flags |= XFS_EFI_COMMITTED;
2722 2569
2723 spin_lock(&log->l_ailp->xa_lock); 2570 spin_lock(&log->l_ailp->xa_lock);
2724 /* 2571 /*
2725 * xfs_trans_ail_update() drops the AIL lock. 2572 * xfs_trans_ail_update() drops the AIL lock.
2726 */ 2573 */
2727 xfs_trans_ail_update(log->l_ailp, (xfs_log_item_t *)efip, lsn); 2574 xfs_trans_ail_update(log->l_ailp, &efip->efi_item, lsn);
2728 return 0; 2575 return 0;
2729} 2576}
2730 2577
@@ -2737,11 +2584,10 @@ xlog_recover_do_efi_trans(
2737 * efd format structure. If we find it, we remove the efi from the 2584 * efd format structure. If we find it, we remove the efi from the
2738 * AIL and free it. 2585 * AIL and free it.
2739 */ 2586 */
2740STATIC void 2587STATIC int
2741xlog_recover_do_efd_trans( 2588xlog_recover_efd_pass2(
2742 xlog_t *log, 2589 xlog_t *log,
2743 xlog_recover_item_t *item, 2590 xlog_recover_item_t *item)
2744 int pass)
2745{ 2591{
2746 xfs_efd_log_format_t *efd_formatp; 2592 xfs_efd_log_format_t *efd_formatp;
2747 xfs_efi_log_item_t *efip = NULL; 2593 xfs_efi_log_item_t *efip = NULL;
@@ -2750,10 +2596,6 @@ xlog_recover_do_efd_trans(
2750 struct xfs_ail_cursor cur; 2596 struct xfs_ail_cursor cur;
2751 struct xfs_ail *ailp = log->l_ailp; 2597 struct xfs_ail *ailp = log->l_ailp;
2752 2598
2753 if (pass == XLOG_RECOVER_PASS1) {
2754 return;
2755 }
2756
2757 efd_formatp = item->ri_buf[0].i_addr; 2599 efd_formatp = item->ri_buf[0].i_addr;
2758 ASSERT((item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_32_t) + 2600 ASSERT((item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_32_t) +
2759 ((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_32_t)))) || 2601 ((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_32_t)))) ||
@@ -2785,62 +2627,6 @@ xlog_recover_do_efd_trans(
2785 } 2627 }
2786 xfs_trans_ail_cursor_done(ailp, &cur); 2628 xfs_trans_ail_cursor_done(ailp, &cur);
2787 spin_unlock(&ailp->xa_lock); 2629 spin_unlock(&ailp->xa_lock);
2788}
2789
2790/*
2791 * Perform the transaction
2792 *
2793 * If the transaction modifies a buffer or inode, do it now. Otherwise,
2794 * EFIs and EFDs get queued up by adding entries into the AIL for them.
2795 */
2796STATIC int
2797xlog_recover_do_trans(
2798 xlog_t *log,
2799 xlog_recover_t *trans,
2800 int pass)
2801{
2802 int error = 0;
2803 xlog_recover_item_t *item;
2804
2805 error = xlog_recover_reorder_trans(log, trans, pass);
2806 if (error)
2807 return error;
2808
2809 list_for_each_entry(item, &trans->r_itemq, ri_list) {
2810 trace_xfs_log_recover_item_recover(log, trans, item, pass);
2811 switch (ITEM_TYPE(item)) {
2812 case XFS_LI_BUF:
2813 error = xlog_recover_do_buffer_trans(log, item, pass);
2814 break;
2815 case XFS_LI_INODE:
2816 error = xlog_recover_do_inode_trans(log, item, pass);
2817 break;
2818 case XFS_LI_EFI:
2819 error = xlog_recover_do_efi_trans(log, item,
2820 trans->r_lsn, pass);
2821 break;
2822 case XFS_LI_EFD:
2823 xlog_recover_do_efd_trans(log, item, pass);
2824 error = 0;
2825 break;
2826 case XFS_LI_DQUOT:
2827 error = xlog_recover_do_dquot_trans(log, item, pass);
2828 break;
2829 case XFS_LI_QUOTAOFF:
2830 error = xlog_recover_do_quotaoff_trans(log, item,
2831 pass);
2832 break;
2833 default:
2834 xlog_warn(
2835 "XFS: invalid item type (%d) xlog_recover_do_trans", ITEM_TYPE(item));
2836 ASSERT(0);
2837 error = XFS_ERROR(EIO);
2838 break;
2839 }
2840
2841 if (error)
2842 return error;
2843 }
2844 2630
2845 return 0; 2631 return 0;
2846} 2632}
@@ -2852,7 +2638,7 @@ xlog_recover_do_trans(
2852 */ 2638 */
2853STATIC void 2639STATIC void
2854xlog_recover_free_trans( 2640xlog_recover_free_trans(
2855 xlog_recover_t *trans) 2641 struct xlog_recover *trans)
2856{ 2642{
2857 xlog_recover_item_t *item, *n; 2643 xlog_recover_item_t *item, *n;
2858 int i; 2644 int i;
@@ -2871,17 +2657,95 @@ xlog_recover_free_trans(
2871} 2657}
2872 2658
2873STATIC int 2659STATIC int
2660xlog_recover_commit_pass1(
2661 struct log *log,
2662 struct xlog_recover *trans,
2663 xlog_recover_item_t *item)
2664{
2665 trace_xfs_log_recover_item_recover(log, trans, item, XLOG_RECOVER_PASS1);
2666
2667 switch (ITEM_TYPE(item)) {
2668 case XFS_LI_BUF:
2669 return xlog_recover_buffer_pass1(log, item);
2670 case XFS_LI_QUOTAOFF:
2671 return xlog_recover_quotaoff_pass1(log, item);
2672 case XFS_LI_INODE:
2673 case XFS_LI_EFI:
2674 case XFS_LI_EFD:
2675 case XFS_LI_DQUOT:
2676 /* nothing to do in pass 1 */
2677 return 0;
2678 default:
2679 xlog_warn(
2680 "XFS: invalid item type (%d) xlog_recover_commit_pass1",
2681 ITEM_TYPE(item));
2682 ASSERT(0);
2683 return XFS_ERROR(EIO);
2684 }
2685}
2686
2687STATIC int
2688xlog_recover_commit_pass2(
2689 struct log *log,
2690 struct xlog_recover *trans,
2691 xlog_recover_item_t *item)
2692{
2693 trace_xfs_log_recover_item_recover(log, trans, item, XLOG_RECOVER_PASS2);
2694
2695 switch (ITEM_TYPE(item)) {
2696 case XFS_LI_BUF:
2697 return xlog_recover_buffer_pass2(log, item);
2698 case XFS_LI_INODE:
2699 return xlog_recover_inode_pass2(log, item);
2700 case XFS_LI_EFI:
2701 return xlog_recover_efi_pass2(log, item, trans->r_lsn);
2702 case XFS_LI_EFD:
2703 return xlog_recover_efd_pass2(log, item);
2704 case XFS_LI_DQUOT:
2705 return xlog_recover_dquot_pass2(log, item);
2706 case XFS_LI_QUOTAOFF:
2707 /* nothing to do in pass2 */
2708 return 0;
2709 default:
2710 xlog_warn(
2711 "XFS: invalid item type (%d) xlog_recover_commit_pass2",
2712 ITEM_TYPE(item));
2713 ASSERT(0);
2714 return XFS_ERROR(EIO);
2715 }
2716}
2717
2718/*
2719 * Perform the transaction.
2720 *
2721 * If the transaction modifies a buffer or inode, do it now. Otherwise,
2722 * EFIs and EFDs get queued up by adding entries into the AIL for them.
2723 */
2724STATIC int
2874xlog_recover_commit_trans( 2725xlog_recover_commit_trans(
2875 xlog_t *log, 2726 struct log *log,
2876 xlog_recover_t *trans, 2727 struct xlog_recover *trans,
2877 int pass) 2728 int pass)
2878{ 2729{
2879 int error; 2730 int error = 0;
2731 xlog_recover_item_t *item;
2880 2732
2881 hlist_del(&trans->r_list); 2733 hlist_del(&trans->r_list);
2882 if ((error = xlog_recover_do_trans(log, trans, pass))) 2734
2735 error = xlog_recover_reorder_trans(log, trans, pass);
2736 if (error)
2883 return error; 2737 return error;
2884 xlog_recover_free_trans(trans); /* no error */ 2738
2739 list_for_each_entry(item, &trans->r_itemq, ri_list) {
2740 if (pass == XLOG_RECOVER_PASS1)
2741 error = xlog_recover_commit_pass1(log, trans, item);
2742 else
2743 error = xlog_recover_commit_pass2(log, trans, item);
2744 if (error)
2745 return error;
2746 }
2747
2748 xlog_recover_free_trans(trans);
2885 return 0; 2749 return 0;
2886} 2750}
2887 2751
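The hunks above split each xlog_recover_do_*_trans() function, which used to take a pass argument and bail out early for the wrong pass, into dedicated *_pass1()/*_pass2() handlers behind two thin dispatchers. A minimal standalone sketch of the same refactoring pattern, using hypothetical item types and handlers rather than the XFS ones:

    #include <stdio.h>

    enum item_type { LI_BUF, LI_QUOTAOFF, LI_INODE, LI_DQUOT };

    struct item { enum item_type type; };

    /* pass 1 only handles the types that influence later recovery decisions */
    static int commit_pass1(struct item *it)
    {
        switch (it->type) {
        case LI_BUF:
        case LI_QUOTAOFF:
            printf("pass1: examine item type %d\n", it->type);
            return 0;
        default:
            return 0;   /* nothing to do in pass 1 */
        }
    }

    /* pass 2 replays everything */
    static int commit_pass2(struct item *it)
    {
        printf("pass2: replay item type %d\n", it->type);
        return 0;
    }

    static int commit_item(struct item *it, int pass)
    {
        return pass == 1 ? commit_pass1(it) : commit_pass2(it);
    }

    int main(void)
    {
        struct item it = { LI_BUF };
        return commit_item(&it, 1) || commit_item(&it, 2);
    }

Each per-pass handler now sees only the work it actually does, and the pass check lives in exactly one place instead of at the top of every function.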
@@ -3011,7 +2875,7 @@ xlog_recover_process_efi(
3011 xfs_extent_t *extp; 2875 xfs_extent_t *extp;
3012 xfs_fsblock_t startblock_fsb; 2876 xfs_fsblock_t startblock_fsb;
3013 2877
3014 ASSERT(!(efip->efi_flags & XFS_EFI_RECOVERED)); 2878 ASSERT(!test_bit(XFS_EFI_RECOVERED, &efip->efi_flags));
3015 2879
3016 /* 2880 /*
3017 * First check the validity of the extents described by the 2881 * First check the validity of the extents described by the
@@ -3050,7 +2914,7 @@ xlog_recover_process_efi(
3050 extp->ext_len); 2914 extp->ext_len);
3051 } 2915 }
3052 2916
3053 efip->efi_flags |= XFS_EFI_RECOVERED; 2917 set_bit(XFS_EFI_RECOVERED, &efip->efi_flags);
3054 error = xfs_trans_commit(tp, 0); 2918 error = xfs_trans_commit(tp, 0);
3055 return error; 2919 return error;
3056 2920
@@ -3107,7 +2971,7 @@ xlog_recover_process_efis(
3107 * Skip EFIs that we've already processed. 2971 * Skip EFIs that we've already processed.
3108 */ 2972 */
3109 efip = (xfs_efi_log_item_t *)lip; 2973 efip = (xfs_efi_log_item_t *)lip;
3110 if (efip->efi_flags & XFS_EFI_RECOVERED) { 2974 if (test_bit(XFS_EFI_RECOVERED, &efip->efi_flags)) {
3111 lip = xfs_trans_ail_cursor_next(ailp, &cur); 2975 lip = xfs_trans_ail_cursor_next(ailp, &cur);
3112 continue; 2976 continue;
3113 } 2977 }
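The EFI flag checks above also change representation: XFS_EFI_RECOVERED becomes a bit number used with the kernel's atomic bitops instead of a mask OR-ed into efi_flags. A sketch of the two styles side by side (kernel context assumed for set_bit()/test_bit(); the flag values here are illustrative, not the XFS definitions):

    #include <linux/bitops.h>

    #define EFI_RECOVERED_MASK  0x01    /* mask style: flag is a bit mask */
    #define EFI_RECOVERED_BIT   0       /* bitops style: flag is a bit number */

    static void flag_styles(void)
    {
        unsigned int mask_flags = 0;
        unsigned long bit_flags = 0;

        /* plain read-modify-write: not safe against concurrent updaters */
        mask_flags |= EFI_RECOVERED_MASK;
        if (mask_flags & EFI_RECOVERED_MASK)
            ;

        /* atomic bitops: safe without holding an external lock */
        set_bit(EFI_RECOVERED_BIT, &bit_flags);
        if (test_bit(EFI_RECOVERED_BIT, &bit_flags))
            ;
    }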
@@ -3724,7 +3588,7 @@ xlog_do_log_recovery(
3724 xfs_daddr_t head_blk, 3588 xfs_daddr_t head_blk,
3725 xfs_daddr_t tail_blk) 3589 xfs_daddr_t tail_blk)
3726{ 3590{
3727 int error; 3591 int error, i;
3728 3592
3729 ASSERT(head_blk != tail_blk); 3593 ASSERT(head_blk != tail_blk);
3730 3594
@@ -3732,10 +3596,12 @@ xlog_do_log_recovery(
3732 * First do a pass to find all of the cancelled buf log items. 3596 * First do a pass to find all of the cancelled buf log items.
3733 * Store them in the buf_cancel_table for use in the second pass. 3597 * Store them in the buf_cancel_table for use in the second pass.
3734 */ 3598 */
3735 log->l_buf_cancel_table = 3599 log->l_buf_cancel_table = kmem_zalloc(XLOG_BC_TABLE_SIZE *
3736 (xfs_buf_cancel_t **)kmem_zalloc(XLOG_BC_TABLE_SIZE * 3600 sizeof(struct list_head),
3737 sizeof(xfs_buf_cancel_t*),
3738 KM_SLEEP); 3601 KM_SLEEP);
3602 for (i = 0; i < XLOG_BC_TABLE_SIZE; i++)
3603 INIT_LIST_HEAD(&log->l_buf_cancel_table[i]);
3604
3739 error = xlog_do_recovery_pass(log, head_blk, tail_blk, 3605 error = xlog_do_recovery_pass(log, head_blk, tail_blk,
3740 XLOG_RECOVER_PASS1); 3606 XLOG_RECOVER_PASS1);
3741 if (error != 0) { 3607 if (error != 0) {
@@ -3754,7 +3620,7 @@ xlog_do_log_recovery(
3754 int i; 3620 int i;
3755 3621
3756 for (i = 0; i < XLOG_BC_TABLE_SIZE; i++) 3622 for (i = 0; i < XLOG_BC_TABLE_SIZE; i++)
3757 ASSERT(log->l_buf_cancel_table[i] == NULL); 3623 ASSERT(list_empty(&log->l_buf_cancel_table[i]));
3758 } 3624 }
3759#endif /* DEBUG */ 3625#endif /* DEBUG */
3760 3626
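xlog_do_log_recovery() now allocates the cancel table as an array of list_heads, and unlike the old NULL-pointer buckets, each head must be explicitly initialised to point at itself before use. A small userspace sketch of that bucket initialisation, with a hypothetical table size and a hand-rolled list_head:

    #include <stdlib.h>

    struct list_head { struct list_head *next, *prev; };

    static void INIT_LIST_HEAD(struct list_head *head)
    {
        head->next = head;
        head->prev = head;  /* an empty list head points at itself */
    }

    #define BC_TABLE_SIZE   64

    struct list_head *alloc_cancel_table(void)
    {
        struct list_head *table = calloc(BC_TABLE_SIZE, sizeof(*table));
        int i;

        if (!table)
            return NULL;
        for (i = 0; i < BC_TABLE_SIZE; i++)
            INIT_LIST_HEAD(&table[i]);  /* zeroed memory is NOT a valid list head */
        return table;
    }

This is also why the DEBUG check further down becomes list_empty() on every bucket rather than a NULL comparison.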
@@ -3934,7 +3800,7 @@ xlog_recover_finish(
3934 log->l_flags &= ~XLOG_RECOVERY_NEEDED; 3800 log->l_flags &= ~XLOG_RECOVERY_NEEDED;
3935 } else { 3801 } else {
3936 cmn_err(CE_DEBUG, 3802 cmn_err(CE_DEBUG,
3937 "!Ending clean XFS mount for filesystem: %s\n", 3803 "Ending clean XFS mount for filesystem: %s\n",
3938 log->l_mp->m_fsname); 3804 log->l_mp->m_fsname);
3939 } 3805 }
3940 return 0; 3806 return 0;
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 19e9dfa1c254..d447aef84bc3 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -472,7 +472,7 @@ xfs_initialize_perag(
472 goto out_unwind; 472 goto out_unwind;
473 pag->pag_agno = index; 473 pag->pag_agno = index;
474 pag->pag_mount = mp; 474 pag->pag_mount = mp;
475 rwlock_init(&pag->pag_ici_lock); 475 spin_lock_init(&pag->pag_ici_lock);
476 mutex_init(&pag->pag_ici_reclaim_lock); 476 mutex_init(&pag->pag_ici_reclaim_lock);
477 INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC); 477 INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC);
478 spin_lock_init(&pag->pag_buf_lock); 478 spin_lock_init(&pag->pag_buf_lock);
@@ -975,6 +975,24 @@ xfs_set_rw_sizes(xfs_mount_t *mp)
975} 975}
976 976
977/* 977/*
978 * precalculate the low space thresholds for dynamic speculative preallocation.
979 */
980void
981xfs_set_low_space_thresholds(
982 struct xfs_mount *mp)
983{
984 int i;
985
986 for (i = 0; i < XFS_LOWSP_MAX; i++) {
987 __uint64_t space = mp->m_sb.sb_dblocks;
988
989 do_div(space, 100);
990 mp->m_low_space[i] = space * (i + 1);
991 }
992}
993
994
995/*
978 * Set whether we're using inode alignment. 996 * Set whether we're using inode alignment.
979 */ 997 */
980STATIC void 998STATIC void
@@ -1196,6 +1214,9 @@ xfs_mountfs(
1196 */ 1214 */
1197 xfs_set_rw_sizes(mp); 1215 xfs_set_rw_sizes(mp);
1198 1216
1217 /* set the low space thresholds for dynamic preallocation */
1218 xfs_set_low_space_thresholds(mp);
1219
1199 /* 1220 /*
1200 * Set the inode cluster size. 1221 * Set the inode cluster size.
1201 * This may still be overridden by the file system 1222 * This may still be overridden by the file system
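The new xfs_set_low_space_thresholds() fills m_low_space[] with 1% through 5% of sb_dblocks, dividing first so the multiplication cannot overflow; do_div() is used because plain 64-bit division is unavailable in 32-bit kernel code. The same arithmetic in a standalone sketch (ordinary division stands in for do_div(), and the block count is made up):

    #include <stdint.h>
    #include <stdio.h>

    #define LOWSP_MAX   5

    int main(void)
    {
        uint64_t dblocks = 26214400;    /* e.g. a 100GB fs with 4k blocks */
        uint64_t low_space[LOWSP_MAX];
        int i;

        for (i = 0; i < LOWSP_MAX; i++) {
            uint64_t space = dblocks / 100;     /* 1% of the data blocks */
            low_space[i] = space * (i + 1);     /* thresholds at 1%..5% */
            printf("threshold %d%%: %llu blocks\n", i + 1,
                   (unsigned long long)low_space[i]);
        }
        return 0;
    }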
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 5861b4980740..a62e8971539d 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -103,6 +103,16 @@ extern int xfs_icsb_modify_counters(struct xfs_mount *, xfs_sb_field_t,
103 xfs_mod_incore_sb(mp, field, delta, rsvd) 103 xfs_mod_incore_sb(mp, field, delta, rsvd)
104#endif 104#endif
105 105
106/* dynamic preallocation free space thresholds, 5% down to 1% */
107enum {
108 XFS_LOWSP_1_PCNT = 0,
109 XFS_LOWSP_2_PCNT,
110 XFS_LOWSP_3_PCNT,
111 XFS_LOWSP_4_PCNT,
112 XFS_LOWSP_5_PCNT,
113 XFS_LOWSP_MAX,
114};
115
106typedef struct xfs_mount { 116typedef struct xfs_mount {
107 struct super_block *m_super; 117 struct super_block *m_super;
108 xfs_tid_t m_tid; /* next unused tid for fs */ 118 xfs_tid_t m_tid; /* next unused tid for fs */
@@ -202,6 +212,8 @@ typedef struct xfs_mount {
202 __int64_t m_update_flags; /* sb flags we need to update 212 __int64_t m_update_flags; /* sb flags we need to update
203 on the next remount,rw */ 213 on the next remount,rw */
204 struct shrinker m_inode_shrink; /* inode reclaim shrinker */ 214 struct shrinker m_inode_shrink; /* inode reclaim shrinker */
215 int64_t m_low_space[XFS_LOWSP_MAX];
216 /* low free space thresholds */
205} xfs_mount_t; 217} xfs_mount_t;
206 218
207/* 219/*
@@ -379,6 +391,8 @@ extern int xfs_sb_validate_fsb_count(struct xfs_sb *, __uint64_t);
379 391
380extern int xfs_dev_is_read_only(struct xfs_mount *, char *); 392extern int xfs_dev_is_read_only(struct xfs_mount *, char *);
381 393
394extern void xfs_set_low_space_thresholds(struct xfs_mount *);
395
382#endif /* __KERNEL__ */ 396#endif /* __KERNEL__ */
383 397
384extern void xfs_mod_sb(struct xfs_trans *, __int64_t); 398extern void xfs_mod_sb(struct xfs_trans *, __int64_t);
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index f6d956b7711e..33dbc4e0ad62 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -1137,7 +1137,7 @@ out_undo_fdblocks:
1137 if (blkdelta) 1137 if (blkdelta)
1138 xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, -blkdelta, rsvd); 1138 xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, -blkdelta, rsvd);
1139out: 1139out:
1140 ASSERT(error = 0); 1140 ASSERT(error == 0);
1141 return; 1141 return;
1142} 1142}
1143 1143
@@ -1350,7 +1350,7 @@ xfs_trans_fill_vecs(
1350 * they could be immediately flushed and we'd have to race with the flusher 1350 * they could be immediately flushed and we'd have to race with the flusher
1351 * trying to pull the item from the AIL as we add it. 1351 * trying to pull the item from the AIL as we add it.
1352 */ 1352 */
1353void 1353static void
1354xfs_trans_item_committed( 1354xfs_trans_item_committed(
1355 struct xfs_log_item *lip, 1355 struct xfs_log_item *lip,
1356 xfs_lsn_t commit_lsn, 1356 xfs_lsn_t commit_lsn,
@@ -1425,6 +1425,83 @@ xfs_trans_committed(
1425 xfs_trans_free(tp); 1425 xfs_trans_free(tp);
1426} 1426}
1427 1427
1428static inline void
1429xfs_log_item_batch_insert(
1430 struct xfs_ail *ailp,
1431 struct xfs_log_item **log_items,
1432 int nr_items,
1433 xfs_lsn_t commit_lsn)
1434{
1435 int i;
1436
1437 spin_lock(&ailp->xa_lock);
1438 /* xfs_trans_ail_update_bulk drops ailp->xa_lock */
1439 xfs_trans_ail_update_bulk(ailp, log_items, nr_items, commit_lsn);
1440
1441 for (i = 0; i < nr_items; i++)
1442 IOP_UNPIN(log_items[i], 0);
1443}
1444
1445/*
1446 * Bulk operation version of xfs_trans_committed that takes a log vector of
1447 * items to insert into the AIL. This uses bulk AIL insertion techniques to
1448 * minimise lock traffic.
1449 */
1450void
1451xfs_trans_committed_bulk(
1452 struct xfs_ail *ailp,
1453 struct xfs_log_vec *log_vector,
1454 xfs_lsn_t commit_lsn,
1455 int aborted)
1456{
1457#define LOG_ITEM_BATCH_SIZE 32
1458 struct xfs_log_item *log_items[LOG_ITEM_BATCH_SIZE];
1459 struct xfs_log_vec *lv;
1460 int i = 0;
1461
1462 /* unpin all the log items */
1463 for (lv = log_vector; lv; lv = lv->lv_next) {
1464 struct xfs_log_item *lip = lv->lv_item;
1465 xfs_lsn_t item_lsn;
1466
1467 if (aborted)
1468 lip->li_flags |= XFS_LI_ABORTED;
1469 item_lsn = IOP_COMMITTED(lip, commit_lsn);
1470
1471 /* item_lsn of -1 means the item was freed */
1472 if (XFS_LSN_CMP(item_lsn, (xfs_lsn_t)-1) == 0)
1473 continue;
1474
1475 if (item_lsn != commit_lsn) {
1476
1477 /*
1478 * Not a bulk update option due to unusual item_lsn.
1479 * Push into AIL immediately, rechecking the lsn once
1480 * we have the ail lock. Then unpin the item.
1481 */
1482 spin_lock(&ailp->xa_lock);
1483 if (XFS_LSN_CMP(item_lsn, lip->li_lsn) > 0)
1484 xfs_trans_ail_update(ailp, lip, item_lsn);
1485 else
1486 spin_unlock(&ailp->xa_lock);
1487 IOP_UNPIN(lip, 0);
1488 continue;
1489 }
1490
1491 /* Item is a candidate for bulk AIL insert. */
1492 log_items[i++] = lv->lv_item;
1493 if (i >= LOG_ITEM_BATCH_SIZE) {
1494 xfs_log_item_batch_insert(ailp, log_items,
1495 LOG_ITEM_BATCH_SIZE, commit_lsn);
1496 i = 0;
1497 }
1498 }
1499
1500 /* make sure we insert the remainder! */
1501 if (i)
1502 xfs_log_item_batch_insert(ailp, log_items, i, commit_lsn);
1503}
1504
1428/* 1505/*
1429 * Called from the trans_commit code when we notice that 1506 * Called from the trans_commit code when we notice that
1430 * the filesystem is in the middle of a forced shutdown. 1507 * the filesystem is in the middle of a forced shutdown.
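xfs_trans_committed_bulk() above gathers items into a fixed LOG_ITEM_BATCH_SIZE array and pushes each full batch into the AIL under a single lock hold, with a final flush for the remainder. The accumulate-and-flush skeleton on its own, with hypothetical types and a stubbed insert:

    #include <stdio.h>

    #define BATCH_SIZE 32

    static void batch_insert(int *items, int nr)
    {
        /* one lock round-trip covers the whole batch */
        printf("inserting %d items under one lock hold\n", nr);
    }

    static void commit_bulk(int *items, int nr_items)
    {
        int batch[BATCH_SIZE];
        int i = 0, j;

        for (j = 0; j < nr_items; j++) {
            batch[i++] = items[j];
            if (i >= BATCH_SIZE) {
                batch_insert(batch, BATCH_SIZE);
                i = 0;
            }
        }
        if (i)              /* don't forget the partial last batch */
            batch_insert(batch, i);
    }

Items with an unusual item_lsn bypass the batch and are inserted immediately, so the common case stays on the cheap bulk path.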
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 246286b77a86..c2042b736b81 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -294,8 +294,8 @@ struct xfs_log_item_desc {
294#define XFS_ALLOC_BTREE_REF 2 294#define XFS_ALLOC_BTREE_REF 2
295#define XFS_BMAP_BTREE_REF 2 295#define XFS_BMAP_BTREE_REF 2
296#define XFS_DIR_BTREE_REF 2 296#define XFS_DIR_BTREE_REF 2
297#define XFS_INO_REF 2
297#define XFS_ATTR_BTREE_REF 1 298#define XFS_ATTR_BTREE_REF 1
298#define XFS_INO_REF 1
299#define XFS_DQUOT_REF 1 299#define XFS_DQUOT_REF 1
300 300
301#ifdef __KERNEL__ 301#ifdef __KERNEL__
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index dc9069568ff7..c5bbbc45db91 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -28,8 +28,8 @@
28#include "xfs_trans_priv.h" 28#include "xfs_trans_priv.h"
29#include "xfs_error.h" 29#include "xfs_error.h"
30 30
31STATIC void xfs_ail_insert(struct xfs_ail *, xfs_log_item_t *); 31STATIC void xfs_ail_splice(struct xfs_ail *, struct list_head *, xfs_lsn_t);
32STATIC xfs_log_item_t * xfs_ail_delete(struct xfs_ail *, xfs_log_item_t *); 32STATIC void xfs_ail_delete(struct xfs_ail *, xfs_log_item_t *);
33STATIC xfs_log_item_t * xfs_ail_min(struct xfs_ail *); 33STATIC xfs_log_item_t * xfs_ail_min(struct xfs_ail *);
34STATIC xfs_log_item_t * xfs_ail_next(struct xfs_ail *, xfs_log_item_t *); 34STATIC xfs_log_item_t * xfs_ail_next(struct xfs_ail *, xfs_log_item_t *);
35 35
@@ -449,129 +449,152 @@ xfs_trans_unlocked_item(
449 xfs_log_move_tail(ailp->xa_mount, 1); 449 xfs_log_move_tail(ailp->xa_mount, 1);
450} /* xfs_trans_unlocked_item */ 450} /* xfs_trans_unlocked_item */
451 451
452
453/* 452/*
454 * Update the position of the item in the AIL with the new 453 * xfs_trans_ail_update_bulk - bulk AIL insertion operation.
455 * lsn. If it is not yet in the AIL, add it. Otherwise, move 454 *
456 * it to its new position by removing it and re-adding 455 * @xfs_trans_ail_update_bulk takes an array of log items that all need to be
456 * positioned at the same LSN in the AIL. If an item is not in the AIL, it will
457 * be added. Otherwise, it will be repositioned by removing it and re-adding
458 * it to the AIL. If we move the first item in the AIL, update the log tail to
459 * match the new minimum LSN in the AIL.
457 * 460 *
458 * Wakeup anyone with an lsn less than the item's lsn. If the item 461 * This function takes the AIL lock once to execute the update operations on
459 * we move in the AIL is the minimum one, update the tail lsn in the 462 * all the items in the array, so the AIL lock only needs to be taken once
460 * log manager. 463 * for the whole batch. Once we have the AIL lock, we need to check each log
464 * item LSN to confirm it needs to be moved forward in the AIL.
461 * 465 *
462 * This function must be called with the AIL lock held. The lock 466 * To optimise the insert operation, we delete all the items from the AIL in
463 * is dropped before returning. 467 * the first pass, moving them into a temporary list, then splice the temporary
468 * list into the correct position in the AIL. This avoids needing to do an
469 * insert operation on every item.
470 *
471 * This function must be called with the AIL lock held. The lock is dropped
472 * before returning.
464 */ 473 */
465void 474void
466xfs_trans_ail_update( 475xfs_trans_ail_update_bulk(
467 struct xfs_ail *ailp, 476 struct xfs_ail *ailp,
468 xfs_log_item_t *lip, 477 struct xfs_log_item **log_items,
469 xfs_lsn_t lsn) __releases(ailp->xa_lock) 478 int nr_items,
479 xfs_lsn_t lsn) __releases(ailp->xa_lock)
470{ 480{
471 xfs_log_item_t *dlip = NULL; 481 xfs_log_item_t *mlip;
472 xfs_log_item_t *mlip; /* ptr to minimum lip */
473 xfs_lsn_t tail_lsn; 482 xfs_lsn_t tail_lsn;
483 int mlip_changed = 0;
484 int i;
485 LIST_HEAD(tmp);
474 486
475 mlip = xfs_ail_min(ailp); 487 mlip = xfs_ail_min(ailp);
476 488
477 if (lip->li_flags & XFS_LI_IN_AIL) { 489 for (i = 0; i < nr_items; i++) {
478 dlip = xfs_ail_delete(ailp, lip); 490 struct xfs_log_item *lip = log_items[i];
479 ASSERT(dlip == lip); 491 if (lip->li_flags & XFS_LI_IN_AIL) {
480 xfs_trans_ail_cursor_clear(ailp, dlip); 492 /* check if we really need to move the item */
481 } else { 493 if (XFS_LSN_CMP(lsn, lip->li_lsn) <= 0)
482 lip->li_flags |= XFS_LI_IN_AIL; 494 continue;
495
496 xfs_ail_delete(ailp, lip);
497 if (mlip == lip)
498 mlip_changed = 1;
499 } else {
500 lip->li_flags |= XFS_LI_IN_AIL;
501 }
502 lip->li_lsn = lsn;
503 list_add(&lip->li_ail, &tmp);
483 } 504 }
484 505
485 lip->li_lsn = lsn; 506 xfs_ail_splice(ailp, &tmp, lsn);
486 xfs_ail_insert(ailp, lip);
487 507
488 if (mlip == dlip) { 508 if (!mlip_changed) {
489 mlip = xfs_ail_min(ailp);
490 /*
491 * It is not safe to access mlip after the AIL lock is
492 * dropped, so we must get a copy of li_lsn before we do
493 * so. This is especially important on 32-bit platforms
494 * where accessing and updating 64-bit values like li_lsn
495 * is not atomic.
496 */
497 tail_lsn = mlip->li_lsn;
498 spin_unlock(&ailp->xa_lock);
499 xfs_log_move_tail(ailp->xa_mount, tail_lsn);
500 } else {
501 spin_unlock(&ailp->xa_lock); 509 spin_unlock(&ailp->xa_lock);
510 return;
502 } 511 }
503 512
504 513 /*
505} /* xfs_trans_update_ail */ 514 * It is not safe to access mlip after the AIL lock is dropped, so we
515 * must get a copy of li_lsn before we do so. This is especially
516 * important on 32-bit platforms where accessing and updating 64-bit
517 * values like li_lsn is not atomic.
518 */
519 mlip = xfs_ail_min(ailp);
520 tail_lsn = mlip->li_lsn;
521 spin_unlock(&ailp->xa_lock);
522 xfs_log_move_tail(ailp->xa_mount, tail_lsn);
523}
506 524
507/* 525/*
508 * Delete the given item from the AIL. It must already be in 526 * xfs_trans_ail_delete_bulk - remove multiple log items from the AIL
509 * the AIL.
510 * 527 *
511 * Wakeup anyone with an lsn less than item's lsn. If the item 528 * @xfs_trans_ail_delete_bulk takes an array of log items that all need to be
512 * we delete in the AIL is the minimum one, update the tail lsn in the 529 * removed from the AIL. The caller is already holding the AIL lock, and has done
513 * log manager. 530 * all the checks necessary to ensure the items passed in via @log_items are
531 * ready for deletion. This includes checking that the items are in the AIL.
514 * 532 *
515 * Clear the IN_AIL flag from the item, reset its lsn to 0, and 533 * For each log item to be removed, unlink it from the AIL, clear the IN_AIL
516 * bump the AIL's generation count to indicate that the tree 534 * flag from the item and reset the item's lsn to 0. If we remove the first
517 * has changed. 535 * item in the AIL, update the log tail to match the new minimum LSN in the
536 * AIL.
518 * 537 *
519 * This function must be called with the AIL lock held. The lock 538 * This function will not drop the AIL lock until all items are removed from
520 * is dropped before returning. 539 * the AIL to minimise the amount of lock traffic on the AIL. This does not
540 * greatly increase the AIL hold time, but does significantly reduce the amount
541 * of traffic on the lock, especially during IO completion.
542 *
543 * This function must be called with the AIL lock held. The lock is dropped
544 * before returning.
521 */ 545 */
522void 546void
523xfs_trans_ail_delete( 547xfs_trans_ail_delete_bulk(
524 struct xfs_ail *ailp, 548 struct xfs_ail *ailp,
525 xfs_log_item_t *lip) __releases(ailp->xa_lock) 549 struct xfs_log_item **log_items,
550 int nr_items) __releases(ailp->xa_lock)
526{ 551{
527 xfs_log_item_t *dlip;
528 xfs_log_item_t *mlip; 552 xfs_log_item_t *mlip;
529 xfs_lsn_t tail_lsn; 553 xfs_lsn_t tail_lsn;
554 int mlip_changed = 0;
555 int i;
530 556
531 if (lip->li_flags & XFS_LI_IN_AIL) { 557 mlip = xfs_ail_min(ailp);
532 mlip = xfs_ail_min(ailp);
533 dlip = xfs_ail_delete(ailp, lip);
534 ASSERT(dlip == lip);
535 xfs_trans_ail_cursor_clear(ailp, dlip);
536
537 558
538 lip->li_flags &= ~XFS_LI_IN_AIL; 559 for (i = 0; i < nr_items; i++) {
539 lip->li_lsn = 0; 560 struct xfs_log_item *lip = log_items[i];
561 if (!(lip->li_flags & XFS_LI_IN_AIL)) {
562 struct xfs_mount *mp = ailp->xa_mount;
540 563
541 if (mlip == dlip) {
542 mlip = xfs_ail_min(ailp);
543 /*
544 * It is not safe to access mlip after the AIL lock
545 * is dropped, so we must get a copy of li_lsn
546 * before we do so. This is especially important
547 * on 32-bit platforms where accessing and updating
548 * 64-bit values like li_lsn is not atomic.
549 */
550 tail_lsn = mlip ? mlip->li_lsn : 0;
551 spin_unlock(&ailp->xa_lock);
552 xfs_log_move_tail(ailp->xa_mount, tail_lsn);
553 } else {
554 spin_unlock(&ailp->xa_lock); 564 spin_unlock(&ailp->xa_lock);
565 if (!XFS_FORCED_SHUTDOWN(mp)) {
566 xfs_cmn_err(XFS_PTAG_AILDELETE, CE_ALERT, mp,
567 "%s: attempting to delete a log item that is not in the AIL",
568 __func__);
569 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
570 }
571 return;
555 } 572 }
573
574 xfs_ail_delete(ailp, lip);
575 lip->li_flags &= ~XFS_LI_IN_AIL;
576 lip->li_lsn = 0;
577 if (mlip == lip)
578 mlip_changed = 1;
556 } 579 }
557 else {
558 /*
559 * If the file system is not being shutdown, we are in
560 * serious trouble if we get to this stage.
561 */
562 struct xfs_mount *mp = ailp->xa_mount;
563 580
581 if (!mlip_changed) {
564 spin_unlock(&ailp->xa_lock); 582 spin_unlock(&ailp->xa_lock);
565 if (!XFS_FORCED_SHUTDOWN(mp)) { 583 return;
566 xfs_cmn_err(XFS_PTAG_AILDELETE, CE_ALERT, mp,
567 "%s: attempting to delete a log item that is not in the AIL",
568 __func__);
569 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
570 }
571 } 584 }
572}
573
574 585
586 /*
587 * It is not safe to access mlip after the AIL lock is dropped, so we
588 * must get a copy of li_lsn before we do so. This is especially
589 * important on 32-bit platforms where accessing and updating 64-bit
590 * values like li_lsn is not atomic. It is possible we've emptied the
591 * AIL here, so if that is the case, pass an LSN of 0 to the tail move.
592 */
593 mlip = xfs_ail_min(ailp);
594 tail_lsn = mlip ? mlip->li_lsn : 0;
595 spin_unlock(&ailp->xa_lock);
596 xfs_log_move_tail(ailp->xa_mount, tail_lsn);
597}
575 598
576/* 599/*
577 * The active item list (AIL) is a doubly linked list of log 600 * The active item list (AIL) is a doubly linked list of log
@@ -623,16 +646,13 @@ xfs_trans_ail_destroy(
623} 646}
624 647
625/* 648/*
626 * Insert the given log item into the AIL. 649 * splice the log item list into the AIL at the given LSN.
627 * We almost always insert at the end of the list, so on inserts
628 * we search from the end of the list to find where the
629 * new item belongs.
630 */ 650 */
631STATIC void 651STATIC void
632xfs_ail_insert( 652xfs_ail_splice(
633 struct xfs_ail *ailp, 653 struct xfs_ail *ailp,
634 xfs_log_item_t *lip) 654 struct list_head *list,
635/* ARGSUSED */ 655 xfs_lsn_t lsn)
636{ 656{
637 xfs_log_item_t *next_lip; 657 xfs_log_item_t *next_lip;
638 658
@@ -640,39 +660,33 @@ xfs_ail_insert(
640 * If the list is empty, just insert the item. 660 * If the list is empty, just insert the item.
641 */ 661 */
642 if (list_empty(&ailp->xa_ail)) { 662 if (list_empty(&ailp->xa_ail)) {
643 list_add(&lip->li_ail, &ailp->xa_ail); 663 list_splice(list, &ailp->xa_ail);
644 return; 664 return;
645 } 665 }
646 666
647 list_for_each_entry_reverse(next_lip, &ailp->xa_ail, li_ail) { 667 list_for_each_entry_reverse(next_lip, &ailp->xa_ail, li_ail) {
648 if (XFS_LSN_CMP(next_lip->li_lsn, lip->li_lsn) <= 0) 668 if (XFS_LSN_CMP(next_lip->li_lsn, lsn) <= 0)
649 break; 669 break;
650 } 670 }
651 671
652 ASSERT((&next_lip->li_ail == &ailp->xa_ail) || 672 ASSERT((&next_lip->li_ail == &ailp->xa_ail) ||
653 (XFS_LSN_CMP(next_lip->li_lsn, lip->li_lsn) <= 0)); 673 (XFS_LSN_CMP(next_lip->li_lsn, lsn) <= 0));
654
655 list_add(&lip->li_ail, &next_lip->li_ail);
656 674
657 xfs_ail_check(ailp, lip); 675 list_splice_init(list, &next_lip->li_ail);
658 return; 676 return;
659} 677}
660 678
661/* 679/*
662 * Delete the given item from the AIL. Return a pointer to the item. 680 * Delete the given item from the AIL. Return a pointer to the item.
663 */ 681 */
664/*ARGSUSED*/ 682STATIC void
665STATIC xfs_log_item_t *
666xfs_ail_delete( 683xfs_ail_delete(
667 struct xfs_ail *ailp, 684 struct xfs_ail *ailp,
668 xfs_log_item_t *lip) 685 xfs_log_item_t *lip)
669/* ARGSUSED */
670{ 686{
671 xfs_ail_check(ailp, lip); 687 xfs_ail_check(ailp, lip);
672
673 list_del(&lip->li_ail); 688 list_del(&lip->li_ail);
674 689 xfs_trans_ail_cursor_clear(ailp, lip);
675 return lip;
676} 690}
677 691
678/* 692/*
@@ -682,7 +696,6 @@ xfs_ail_delete(
682STATIC xfs_log_item_t * 696STATIC xfs_log_item_t *
683xfs_ail_min( 697xfs_ail_min(
684 struct xfs_ail *ailp) 698 struct xfs_ail *ailp)
685/* ARGSUSED */
686{ 699{
687 if (list_empty(&ailp->xa_ail)) 700 if (list_empty(&ailp->xa_ail))
688 return NULL; 701 return NULL;
@@ -699,7 +712,6 @@ STATIC xfs_log_item_t *
699xfs_ail_next( 712xfs_ail_next(
700 struct xfs_ail *ailp, 713 struct xfs_ail *ailp,
701 xfs_log_item_t *lip) 714 xfs_log_item_t *lip)
702/* ARGSUSED */
703{ 715{
704 if (lip->li_ail.next == &ailp->xa_ail) 716 if (lip->li_ail.next == &ailp->xa_ail)
705 return NULL; 717 return NULL;
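The comment in xfs_trans_ail_update_bulk() describes the key trick: unlink every item onto a temporary list first, then splice that list into the AIL at the target LSN in one operation, instead of doing an ordered insert per item. A minimal sketch of the two list primitives involved, in the style of the kernel's list.h (hand-rolled here so it stands alone):

    struct list_head { struct list_head *next, *prev; };

    static void list_del_entry(struct list_head *e)
    {
        e->prev->next = e->next;
        e->next->prev = e->prev;
    }

    static void list_add_tail_entry(struct list_head *e, struct list_head *head)
    {
        e->prev = head->prev;
        e->next = head;
        head->prev->next = e;
        head->prev = e;
    }

    /* move every entry on @list to just after @pos, leaving @list empty */
    static void list_splice_after(struct list_head *list, struct list_head *pos)
    {
        if (list->next == list)
            return;         /* nothing to splice */
        list->next->prev = pos;
        list->prev->next = pos->next;
        pos->next->prev = list->prev;
        pos->next = list->next;
        list->next = list->prev = list;
    }

With these primitives the bulk update does one list_del_entry() plus one list_add_tail_entry() per item to build the temporary list, one reverse walk to find the splice point by LSN, and a single list_splice_after(), so the AIL itself is re-linked only once.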
diff --git a/fs/xfs/xfs_trans_extfree.c b/fs/xfs/xfs_trans_extfree.c
index f783d5e9fa70..f7590f5badea 100644
--- a/fs/xfs/xfs_trans_extfree.c
+++ b/fs/xfs/xfs_trans_extfree.c
@@ -69,12 +69,16 @@ xfs_trans_log_efi_extent(xfs_trans_t *tp,
69 tp->t_flags |= XFS_TRANS_DIRTY; 69 tp->t_flags |= XFS_TRANS_DIRTY;
70 efip->efi_item.li_desc->lid_flags |= XFS_LID_DIRTY; 70 efip->efi_item.li_desc->lid_flags |= XFS_LID_DIRTY;
71 71
72 next_extent = efip->efi_next_extent; 72 /*
73 * atomic_inc_return gives us the value after the increment;
74 * we want to use it as an array index so we need to subtract 1 from
75 * it.
76 */
77 next_extent = atomic_inc_return(&efip->efi_next_extent) - 1;
73 ASSERT(next_extent < efip->efi_format.efi_nextents); 78 ASSERT(next_extent < efip->efi_format.efi_nextents);
74 extp = &(efip->efi_format.efi_extents[next_extent]); 79 extp = &(efip->efi_format.efi_extents[next_extent]);
75 extp->ext_start = start_block; 80 extp->ext_start = start_block;
76 extp->ext_len = ext_len; 81 extp->ext_len = ext_len;
77 efip->efi_next_extent++;
78} 82}
79 83
80 84
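The extent-logging change above replaces a plain increment of efi_next_extent with atomic_inc_return(), which returns the post-increment value; subtracting 1 recovers the slot index that was just reserved. The same reservation pattern with C11 atomics (the array and sizes are made up for illustration):

    #include <stdatomic.h>

    #define MAX_EXTENTS 16

    static atomic_int next_extent;      /* count of reserved slots */
    static int extents[MAX_EXTENTS];

    static void log_extent(int value)
    {
        /* atomic_fetch_add returns the value *before* the increment,
         * which is exactly the index of the slot we just reserved;
         * the kernel's atomic_inc_return() returns the value *after*,
         * hence the "- 1" in the patch above. */
        int idx = atomic_fetch_add(&next_extent, 1);
        extents[idx] = value;
    }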
diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h
index 62da86c90de5..35162c238fa3 100644
--- a/fs/xfs/xfs_trans_priv.h
+++ b/fs/xfs/xfs_trans_priv.h
@@ -22,15 +22,17 @@ struct xfs_log_item;
22struct xfs_log_item_desc; 22struct xfs_log_item_desc;
23struct xfs_mount; 23struct xfs_mount;
24struct xfs_trans; 24struct xfs_trans;
25struct xfs_ail;
26struct xfs_log_vec;
25 27
26void xfs_trans_add_item(struct xfs_trans *, struct xfs_log_item *); 28void xfs_trans_add_item(struct xfs_trans *, struct xfs_log_item *);
27void xfs_trans_del_item(struct xfs_log_item *); 29void xfs_trans_del_item(struct xfs_log_item *);
28void xfs_trans_free_items(struct xfs_trans *tp, xfs_lsn_t commit_lsn, 30void xfs_trans_free_items(struct xfs_trans *tp, xfs_lsn_t commit_lsn,
29 int flags); 31 int flags);
30void xfs_trans_item_committed(struct xfs_log_item *lip,
31 xfs_lsn_t commit_lsn, int aborted);
32void xfs_trans_unreserve_and_mod_sb(struct xfs_trans *tp); 32void xfs_trans_unreserve_and_mod_sb(struct xfs_trans *tp);
33 33
34void xfs_trans_committed_bulk(struct xfs_ail *ailp, struct xfs_log_vec *lv,
35 xfs_lsn_t commit_lsn, int aborted);
34/* 36/*
35 * AIL traversal cursor. 37 * AIL traversal cursor.
36 * 38 *
@@ -73,12 +75,29 @@ struct xfs_ail {
73/* 75/*
74 * From xfs_trans_ail.c 76 * From xfs_trans_ail.c
75 */ 77 */
76void xfs_trans_ail_update(struct xfs_ail *ailp, 78void xfs_trans_ail_update_bulk(struct xfs_ail *ailp,
77 struct xfs_log_item *lip, xfs_lsn_t lsn) 79 struct xfs_log_item **log_items, int nr_items,
78 __releases(ailp->xa_lock); 80 xfs_lsn_t lsn) __releases(ailp->xa_lock);
79void xfs_trans_ail_delete(struct xfs_ail *ailp, 81static inline void
80 struct xfs_log_item *lip) 82xfs_trans_ail_update(
81 __releases(ailp->xa_lock); 83 struct xfs_ail *ailp,
84 struct xfs_log_item *lip,
85 xfs_lsn_t lsn) __releases(ailp->xa_lock)
86{
87 xfs_trans_ail_update_bulk(ailp, &lip, 1, lsn);
88}
89
90void xfs_trans_ail_delete_bulk(struct xfs_ail *ailp,
91 struct xfs_log_item **log_items, int nr_items)
92 __releases(ailp->xa_lock);
93static inline void
94xfs_trans_ail_delete(
95 struct xfs_ail *ailp,
96 xfs_log_item_t *lip) __releases(ailp->xa_lock)
97{
98 xfs_trans_ail_delete_bulk(ailp, &lip, 1);
99}
100
82void xfs_trans_ail_push(struct xfs_ail *, xfs_lsn_t); 101void xfs_trans_ail_push(struct xfs_ail *, xfs_lsn_t);
83void xfs_trans_unlocked_item(struct xfs_ail *, 102void xfs_trans_unlocked_item(struct xfs_ail *,
84 xfs_log_item_t *); 103 xfs_log_item_t *);
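The header now exposes only the bulk operations and recasts the old single-item calls as inline wrappers that pass a one-element array, so existing callers compile unchanged while new callers can batch. The idiom in general form, with hypothetical names:

    struct ail;
    struct item;

    /* the bulk primitive does the real work on an array of items ... */
    void update_bulk(struct ail *ailp, struct item **items, int nr, long lsn);

    /* ... and the legacy single-item API becomes a one-line shim */
    static inline void update_one(struct ail *ailp, struct item *item, long lsn)
    {
        update_bulk(ailp, &item, 1, lsn);   /* &item acts as a 1-element array */
    }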
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 8e4a63c4151a..d8e6f8cd6f0c 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -964,29 +964,48 @@ xfs_release(
964 xfs_flush_pages(ip, 0, -1, XBF_ASYNC, FI_NONE); 964 xfs_flush_pages(ip, 0, -1, XBF_ASYNC, FI_NONE);
965 } 965 }
966 966
967 if (ip->i_d.di_nlink != 0) { 967 if (ip->i_d.di_nlink == 0)
968 if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) && 968 return 0;
969 ((ip->i_size > 0) || (VN_CACHED(VFS_I(ip)) > 0 ||
970 ip->i_delayed_blks > 0)) &&
971 (ip->i_df.if_flags & XFS_IFEXTENTS)) &&
972 (!(ip->i_d.di_flags &
973 (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)))) {
974 969
975 /* 970 if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) &&
976 * If we can't get the iolock just skip truncating 971 ((ip->i_size > 0) || (VN_CACHED(VFS_I(ip)) > 0 ||
977 * the blocks past EOF because we could deadlock 972 ip->i_delayed_blks > 0)) &&
978 * with the mmap_sem otherwise. We'll get another 973 (ip->i_df.if_flags & XFS_IFEXTENTS)) &&
979 * chance to drop them once the last reference to 974 (!(ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)))) {
980 * the inode is dropped, so we'll never leak blocks
981 * permanently.
982 */
983 error = xfs_free_eofblocks(mp, ip,
984 XFS_FREE_EOF_TRYLOCK);
985 if (error)
986 return error;
987 }
988 }
989 975
976 /*
977 * If we can't get the iolock just skip truncating the blocks
978 * past EOF because we could deadlock with the mmap_sem
979 * otherwise. We'll get another chance to drop them once the
980 * last reference to the inode is dropped, so we'll never leak
981 * blocks permanently.
982 *
983 * Further, check if the inode is being opened, written and
984 * closed frequently and we have delayed allocation blocks
985 * outstanding (e.g. streaming writes from the NFS server),
986 * truncating the blocks past EOF will cause fragmentation to
987 * occur.
988 *
989 * In this case don't do the truncation, either, but we have to
990 * be careful how we detect this case. Blocks beyond EOF show
991 * up as i_delayed_blks even when the inode is clean, so we
992 * need to truncate them away first before checking for a dirty
993 * release. Hence on the first dirty close we will still remove
994 * the speculative allocation, but after that we will leave it
995 * in place.
996 */
997 if (xfs_iflags_test(ip, XFS_IDIRTY_RELEASE))
998 return 0;
999
1000 error = xfs_free_eofblocks(mp, ip,
1001 XFS_FREE_EOF_TRYLOCK);
1002 if (error)
1003 return error;
1004
1005 /* delalloc blocks after truncation means it really is dirty */
1006 if (ip->i_delayed_blks)
1007 xfs_iflags_set(ip, XFS_IDIRTY_RELEASE);
1008 }
990 return 0; 1009 return 0;
991} 1010}
992 1011
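The xfs_release() rework above adds a sticky per-inode flag: the first close of a dirty inode still trims speculative preallocation beyond EOF, but if delayed-allocation blocks reappear afterwards, the flag suppresses further truncation so that NFS-style open-write-close loops do not fragment the file. A condensed sketch of just the flag logic, with hypothetical types and a stubbed trim helper:

    #define IDIRTY_RELEASE  (1UL << 0)

    struct inode_state {
        unsigned long   flags;
        long            delayed_blks;   /* delayed-allocation blocks outstanding */
    };

    /* stand-in for the trylock-and-truncate helper; returns 0 on success */
    static int free_eofblocks_trylock(struct inode_state *ip)
    {
        (void)ip;
        return 0;
    }

    static int release_eofblocks(struct inode_state *ip)
    {
        /* a previous dirty close already trimmed once: leave the
         * speculative preallocation alone from now on */
        if (ip->flags & IDIRTY_RELEASE)
            return 0;

        if (free_eofblocks_trylock(ip))
            return -1;      /* lock contention: try again on a later close */

        /* delalloc blocks remaining after the trim mean the file is being
         * actively rewritten, so mark it and stop trimming on future closes */
        if (ip->delayed_blks)
            ip->flags |= IDIRTY_RELEASE;
        return 0;
    }

Trimming once before setting the flag matters, because blocks beyond EOF show up as delayed blocks even on a clean inode; only delalloc that survives the first trim indicates a genuinely dirty release pattern.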