aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
authorDmitry Torokhov <dmitry.torokhov@gmail.com>2011-03-19 02:38:50 -0400
committerDmitry Torokhov <dmitry.torokhov@gmail.com>2011-03-19 02:38:50 -0400
commit97eb3f24352ec6632c2127b35d8087d2a809a9b9 (patch)
tree722948059bbd325bbca232269490124231df80d4 /fs
parent439581ec07fa9cf3f519dd461a2cf41cfd3adcb4 (diff)
parentdef179c271ac9b5020deca798470521f14d11edd (diff)
Merge branch 'next' into for-linus
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/Kconfig5
-rw-r--r--fs/9p/Makefile1
-rw-r--r--fs/9p/acl.c9
-rw-r--r--fs/9p/acl.h2
-rw-r--r--fs/9p/v9fs.h42
-rw-r--r--fs/9p/v9fs_vfs.h1
-rw-r--r--fs/9p/vfs_dentry.c6
-rw-r--r--fs/9p/vfs_inode.c881
-rw-r--r--fs/9p/vfs_inode_dotl.c824
-rw-r--r--fs/9p/vfs_super.c8
-rw-r--r--fs/9p/xattr.c2
-rw-r--r--fs/Kconfig19
-rw-r--r--fs/adfs/dir.c12
-rw-r--r--fs/adfs/super.c13
-rw-r--r--fs/affs/affs.h1
-rw-r--r--fs/affs/amigaffs.c4
-rw-r--r--fs/affs/namei.c69
-rw-r--r--fs/affs/super.c15
-rw-r--r--fs/afs/cmservice.c12
-rw-r--r--fs/afs/dir.c13
-rw-r--r--fs/afs/inode.c3
-rw-r--r--fs/afs/internal.h5
-rw-r--r--fs/afs/main.c13
-rw-r--r--fs/afs/mntpt.c63
-rw-r--r--fs/afs/rxrpc.c2
-rw-r--r--fs/afs/security.c7
-rw-r--r--fs/afs/server.c13
-rw-r--r--fs/afs/super.c11
-rw-r--r--fs/afs/vlocation.c14
-rw-r--r--fs/aio.c31
-rw-r--r--fs/anon_inodes.c29
-rw-r--r--fs/autofs4/autofs_i.h134
-rw-r--r--fs/autofs4/dev-ioctl.c2
-rw-r--r--fs/autofs4/expire.c170
-rw-r--r--fs/autofs4/inode.c114
-rw-r--r--fs/autofs4/root.c788
-rw-r--r--fs/autofs4/symlink.c3
-rw-r--r--fs/autofs4/waitq.c40
-rw-r--r--fs/bad_inode.c5
-rw-r--r--fs/befs/endian.h16
-rw-r--r--fs/befs/linuxvfs.c12
-rw-r--r--fs/bfs/inode.c9
-rw-r--r--fs/binfmt_elf.c23
-rw-r--r--fs/bio-integrity.c7
-rw-r--r--fs/bio.c23
-rw-r--r--fs/block_dev.c778
-rw-r--r--fs/btrfs/Kconfig2
-rw-r--r--fs/btrfs/Makefile2
-rw-r--r--fs/btrfs/acl.c25
-rw-r--r--fs/btrfs/btrfs_inode.h2
-rw-r--r--fs/btrfs/compression.c344
-rw-r--r--fs/btrfs/compression.h72
-rw-r--r--fs/btrfs/ctree.c8
-rw-r--r--fs/btrfs/ctree.h57
-rw-r--r--fs/btrfs/disk-io.c453
-rw-r--r--fs/btrfs/disk-io.h1
-rw-r--r--fs/btrfs/export.c88
-rw-r--r--fs/btrfs/extent-tree.c167
-rw-r--r--fs/btrfs/extent_io.c84
-rw-r--r--fs/btrfs/extent_io.h20
-rw-r--r--fs/btrfs/extent_map.c2
-rw-r--r--fs/btrfs/extent_map.h3
-rw-r--r--fs/btrfs/file.c225
-rw-r--r--fs/btrfs/free-space-cache.c12
-rw-r--r--fs/btrfs/inode.c505
-rw-r--r--fs/btrfs/ioctl.c245
-rw-r--r--fs/btrfs/ioctl.h26
-rw-r--r--fs/btrfs/lzo.c420
-rw-r--r--fs/btrfs/ordered-data.c85
-rw-r--r--fs/btrfs/ordered-data.h11
-rw-r--r--fs/btrfs/orphan.c6
-rw-r--r--fs/btrfs/super.c325
-rw-r--r--fs/btrfs/transaction.c16
-rw-r--r--fs/btrfs/transaction.h1
-rw-r--r--fs/btrfs/tree-log.c21
-rw-r--r--fs/btrfs/volumes.c674
-rw-r--r--fs/btrfs/volumes.h31
-rw-r--r--fs/btrfs/xattr.c18
-rw-r--r--fs/btrfs/zlib.c369
-rw-r--r--fs/buffer.c37
-rw-r--r--fs/ceph/Makefile23
-rw-r--r--fs/ceph/addr.c6
-rw-r--r--fs/ceph/caps.c60
-rw-r--r--fs/ceph/debugfs.c9
-rw-r--r--fs/ceph/dir.c71
-rw-r--r--fs/ceph/export.c2
-rw-r--r--fs/ceph/file.c65
-rw-r--r--fs/ceph/inode.c102
-rw-r--r--fs/ceph/ioctl.h2
-rw-r--r--fs/ceph/locks.c94
-rw-r--r--fs/ceph/mds_client.c107
-rw-r--r--fs/ceph/mds_client.h35
-rw-r--r--fs/ceph/super.c15
-rw-r--r--fs/ceph/super.h8
-rw-r--r--fs/ceph/xattr.c3
-rw-r--r--fs/char_dev.c15
-rw-r--r--fs/cifs/Kconfig8
-rw-r--r--fs/cifs/Makefile6
-rw-r--r--fs/cifs/README14
-rw-r--r--fs/cifs/TODO2
-rw-r--r--fs/cifs/cache.c16
-rw-r--r--fs/cifs/cifs_debug.c32
-rw-r--r--fs/cifs/cifs_dfs_ref.c119
-rw-r--r--fs/cifs/cifs_fs_sb.h8
-rw-r--r--fs/cifs/cifs_spnego.c10
-rw-r--r--fs/cifs/cifs_unicode.c127
-rw-r--r--fs/cifs/cifsacl.c64
-rw-r--r--fs/cifs/cifsacl.h4
-rw-r--r--fs/cifs/cifsencrypt.c39
-rw-r--r--fs/cifs/cifsencrypt.h33
-rw-r--r--fs/cifs/cifsfs.c104
-rw-r--r--fs/cifs/cifsfs.h25
-rw-r--r--fs/cifs/cifsglob.h121
-rw-r--r--fs/cifs/cifspdu.h62
-rw-r--r--fs/cifs/cifsproto.h32
-rw-r--r--fs/cifs/cifssmb.c301
-rw-r--r--fs/cifs/connect.c911
-rw-r--r--fs/cifs/dir.c94
-rw-r--r--fs/cifs/dns_resolve.c2
-rw-r--r--fs/cifs/file.c800
-rw-r--r--fs/cifs/fscache.c12
-rw-r--r--fs/cifs/inode.c94
-rw-r--r--fs/cifs/ioctl.c16
-rw-r--r--fs/cifs/link.c62
-rw-r--r--fs/cifs/md4.c205
-rw-r--r--fs/cifs/md5.c366
-rw-r--r--fs/cifs/md5.h38
-rw-r--r--fs/cifs/misc.c96
-rw-r--r--fs/cifs/netmisc.c8
-rw-r--r--fs/cifs/readdir.c49
-rw-r--r--fs/cifs/sess.c150
-rw-r--r--fs/cifs/smbdes.c1
-rw-r--r--fs/cifs/smbencrypt.c91
-rw-r--r--fs/cifs/transport.c436
-rw-r--r--fs/cifs/xattr.c55
-rw-r--r--fs/coda/cache.c9
-rw-r--r--fs/coda/cnode.c3
-rw-r--r--fs/coda/coda_cache.h22
-rw-r--r--fs/coda/coda_fs_i.h58
-rw-r--r--fs/coda/coda_linux.c3
-rw-r--r--fs/coda/coda_linux.h101
-rw-r--r--fs/coda/dir.c27
-rw-r--r--fs/coda/file.c3
-rw-r--r--fs/coda/inode.c17
-rw-r--r--fs/coda/pioctl.c10
-rw-r--r--fs/coda/psdev.c4
-rw-r--r--fs/coda/symlink.c4
-rw-r--r--fs/coda/upcall.c5
-rw-r--r--fs/compat.c38
-rw-r--r--fs/compat_ioctl.c4
-rw-r--r--fs/configfs/Kconfig4
-rw-r--r--fs/configfs/configfs_internal.h5
-rw-r--r--fs/configfs/dir.c22
-rw-r--r--fs/configfs/inode.c8
-rw-r--r--fs/configfs/mount.c1
-rw-r--r--fs/cramfs/inode.c110
-rw-r--r--fs/dcache.c1389
-rw-r--r--fs/direct-io.c10
-rw-r--r--fs/dlm/Kconfig3
-rw-r--r--fs/dlm/lowcomms.c63
-rw-r--r--fs/ecryptfs/crypto.c30
-rw-r--r--fs/ecryptfs/dentry.c9
-rw-r--r--fs/ecryptfs/ecryptfs_kernel.h2
-rw-r--r--fs/ecryptfs/file.c28
-rw-r--r--fs/ecryptfs/inode.c43
-rw-r--r--fs/ecryptfs/keystore.c26
-rw-r--r--fs/ecryptfs/main.c165
-rw-r--r--fs/ecryptfs/mmap.c35
-rw-r--r--fs/ecryptfs/super.c13
-rw-r--r--fs/efs/super.c9
-rw-r--r--fs/eventpoll.c20
-rw-r--r--fs/exec.c41
-rw-r--r--fs/exofs/super.c9
-rw-r--r--fs/exportfs/expfs.c14
-rw-r--r--fs/ext2/acl.c11
-rw-r--r--fs/ext2/acl.h2
-rw-r--r--fs/ext2/dir.c19
-rw-r--r--fs/ext2/namei.c2
-rw-r--r--fs/ext2/super.c34
-rw-r--r--fs/ext2/xattr.c10
-rw-r--r--fs/ext3/acl.c11
-rw-r--r--fs/ext3/acl.h2
-rw-r--r--fs/ext3/balloc.c266
-rw-r--r--fs/ext3/dir.c15
-rw-r--r--fs/ext3/inode.c6
-rw-r--r--fs/ext3/ioctl.c22
-rw-r--r--fs/ext3/namei.c138
-rw-r--r--fs/ext3/resize.c65
-rw-r--r--fs/ext3/super.c111
-rw-r--r--fs/ext3/xattr.c2
-rw-r--r--fs/ext4/acl.c11
-rw-r--r--fs/ext4/acl.h2
-rw-r--r--fs/ext4/balloc.c3
-rw-r--r--fs/ext4/dir.c56
-rw-r--r--fs/ext4/ext4.h100
-rw-r--r--fs/ext4/ext4_extents.h8
-rw-r--r--fs/ext4/ext4_jbd2.h2
-rw-r--r--fs/ext4/extents.c105
-rw-r--r--fs/ext4/file.c24
-rw-r--r--fs/ext4/fsync.c4
-rw-r--r--fs/ext4/ialloc.c2
-rw-r--r--fs/ext4/inode.c91
-rw-r--r--fs/ext4/ioctl.c24
-rw-r--r--fs/ext4/mballoc.c57
-rw-r--r--fs/ext4/migrate.c2
-rw-r--r--fs/ext4/namei.c71
-rw-r--r--fs/ext4/page-io.c106
-rw-r--r--fs/ext4/resize.c69
-rw-r--r--fs/ext4/super.c455
-rw-r--r--fs/ext4/xattr.c28
-rw-r--r--fs/fat/fat.h3
-rw-r--r--fs/fat/inode.c22
-rw-r--r--fs/fat/namei_msdos.c44
-rw-r--r--fs/fat/namei_vfat.c74
-rw-r--r--fs/file_table.c2
-rw-r--r--fs/filesystems.c3
-rw-r--r--fs/freevxfs/vxfs_inode.c9
-rw-r--r--fs/fs-writeback.c105
-rw-r--r--fs/fs_struct.c49
-rw-r--r--fs/fscache/operation.c2
-rw-r--r--fs/fuse/dev.c156
-rw-r--r--fs/fuse/dir.c70
-rw-r--r--fs/fuse/file.c140
-rw-r--r--fs/fuse/fuse_i.h27
-rw-r--r--fs/fuse/inode.c49
-rw-r--r--fs/generic_acl.c20
-rw-r--r--fs/gfs2/acl.c5
-rw-r--r--fs/gfs2/acl.h2
-rw-r--r--fs/gfs2/bmap.c11
-rw-r--r--fs/gfs2/dentry.c22
-rw-r--r--fs/gfs2/export.c59
-rw-r--r--fs/gfs2/file.c260
-rw-r--r--fs/gfs2/glock.c92
-rw-r--r--fs/gfs2/glock.h28
-rw-r--r--fs/gfs2/glops.c1
-rw-r--r--fs/gfs2/incore.h13
-rw-r--r--fs/gfs2/inode.c225
-rw-r--r--fs/gfs2/inode.h7
-rw-r--r--fs/gfs2/lock_dlm.c15
-rw-r--r--fs/gfs2/ops_fstype.c10
-rw-r--r--fs/gfs2/ops_inode.c292
-rw-r--r--fs/gfs2/quota.c28
-rw-r--r--fs/gfs2/rgrp.c146
-rw-r--r--fs/gfs2/rgrp.h1
-rw-r--r--fs/gfs2/super.c10
-rw-r--r--fs/gfs2/xattr.c23
-rw-r--r--fs/hfs/dir.c2
-rw-r--r--fs/hfs/hfs_fs.h8
-rw-r--r--fs/hfs/string.c17
-rw-r--r--fs/hfs/super.c12
-rw-r--r--fs/hfs/sysdep.c7
-rw-r--r--fs/hfsplus/bfind.c6
-rw-r--r--fs/hfsplus/bitmap.c3
-rw-r--r--fs/hfsplus/bnode.c70
-rw-r--r--fs/hfsplus/brec.c28
-rw-r--r--fs/hfsplus/btree.c33
-rw-r--r--fs/hfsplus/catalog.c85
-rw-r--r--fs/hfsplus/dir.c38
-rw-r--r--fs/hfsplus/extents.c96
-rw-r--r--fs/hfsplus/hfsplus_fs.h130
-rw-r--r--fs/hfsplus/hfsplus_raw.h3
-rw-r--r--fs/hfsplus/inode.c89
-rw-r--r--fs/hfsplus/ioctl.c6
-rw-r--r--fs/hfsplus/options.c44
-rw-r--r--fs/hfsplus/part_tbl.c129
-rw-r--r--fs/hfsplus/super.c142
-rw-r--r--fs/hfsplus/unicode.c56
-rw-r--r--fs/hfsplus/wrapper.c178
-rw-r--r--fs/hostfs/hostfs_kern.c44
-rw-r--r--fs/hpfs/dentry.c32
-rw-r--r--fs/hpfs/dir.c1
-rw-r--r--fs/hpfs/hpfs_fn.h2
-rw-r--r--fs/hpfs/inode.c2
-rw-r--r--fs/hpfs/namei.c2
-rw-r--r--fs/hpfs/super.c11
-rw-r--r--fs/hppfs/hppfs.c9
-rw-r--r--fs/hugetlbfs/inode.c12
-rw-r--r--fs/inode.c50
-rw-r--r--fs/internal.h5
-rw-r--r--fs/ioctl.c50
-rw-r--r--fs/ioprio.c13
-rw-r--r--fs/isofs/inode.c142
-rw-r--r--fs/isofs/namei.c5
-rw-r--r--fs/jbd/transaction.c2
-rw-r--r--fs/jbd2/journal.c50
-rw-r--r--fs/jbd2/recovery.c2
-rw-r--r--fs/jbd2/transaction.c8
-rw-r--r--fs/jffs2/acl.c5
-rw-r--r--fs/jffs2/acl.h2
-rw-r--r--fs/jffs2/build.c5
-rw-r--r--fs/jffs2/jffs2_fs_sb.h2
-rw-r--r--fs/jffs2/super.c9
-rw-r--r--fs/jffs2/xattr.c12
-rw-r--r--fs/jfs/acl.c8
-rw-r--r--fs/jfs/jfs_acl.h2
-rw-r--r--fs/jfs/jfs_logmgr.c17
-rw-r--r--fs/jfs/namei.c69
-rw-r--r--fs/jfs/super.c16
-rw-r--r--fs/libfs.c67
-rw-r--r--fs/lockd/Makefile6
-rw-r--r--fs/lockd/clnt4xdr.c605
-rw-r--r--fs/lockd/clntlock.c5
-rw-r--r--fs/lockd/clntproc.c19
-rw-r--r--fs/lockd/clntxdr.c627
-rw-r--r--fs/lockd/host.c417
-rw-r--r--fs/lockd/mon.c110
-rw-r--r--fs/lockd/svc4proc.c21
-rw-r--r--fs/lockd/svclock.c35
-rw-r--r--fs/lockd/svcproc.c29
-rw-r--r--fs/lockd/xdr.c287
-rw-r--r--fs/lockd/xdr4.c255
-rw-r--r--fs/locks.c30
-rw-r--r--fs/logfs/dev_bdev.c7
-rw-r--r--fs/logfs/dir.c6
-rw-r--r--fs/logfs/inode.c9
-rw-r--r--fs/logfs/journal.c2
-rw-r--r--fs/logfs/logfs.h2
-rw-r--r--fs/logfs/readwrite.c3
-rw-r--r--fs/mbcache.c12
-rw-r--r--fs/minix/inode.c9
-rw-r--r--fs/minix/namei.c2
-rw-r--r--fs/mpage.c49
-rw-r--r--fs/namei.c1142
-rw-r--r--fs/namespace.c363
-rw-r--r--fs/ncpfs/dir.c106
-rw-r--r--fs/ncpfs/file.c4
-rw-r--r--fs/ncpfs/inode.c24
-rw-r--r--fs/ncpfs/ioctl.c5
-rw-r--r--fs/ncpfs/mmap.c4
-rw-r--r--fs/ncpfs/ncp_fs.h98
-rw-r--r--fs/ncpfs/ncp_fs_i.h29
-rw-r--r--fs/ncpfs/ncp_fs_sb.h176
-rw-r--r--fs/ncpfs/ncplib_kernel.c2
-rw-r--r--fs/ncpfs/ncplib_kernel.h18
-rw-r--r--fs/ncpfs/ncpsign_kernel.c1
-rw-r--r--fs/ncpfs/ncpsign_kernel.h2
-rw-r--r--fs/ncpfs/sock.c2
-rw-r--r--fs/ncpfs/symlink.c4
-rw-r--r--fs/nfs/callback.c93
-rw-r--r--fs/nfs/callback.h61
-rw-r--r--fs/nfs/callback_proc.c324
-rw-r--r--fs/nfs/callback_xdr.c142
-rw-r--r--fs/nfs/client.c297
-rw-r--r--fs/nfs/delegation.c369
-rw-r--r--fs/nfs/delegation.h1
-rw-r--r--fs/nfs/dir.c336
-rw-r--r--fs/nfs/direct.c36
-rw-r--r--fs/nfs/file.c2
-rw-r--r--fs/nfs/getroot.c12
-rw-r--r--fs/nfs/idmap.c2
-rw-r--r--fs/nfs/inode.c43
-rw-r--r--fs/nfs/internal.h28
-rw-r--r--fs/nfs/mount_clnt.c87
-rw-r--r--fs/nfs/namespace.c94
-rw-r--r--fs/nfs/nfs2xdr.c1296
-rw-r--r--fs/nfs/nfs3acl.c4
-rw-r--r--fs/nfs/nfs3xdr.c2814
-rw-r--r--fs/nfs/nfs4_fs.h13
-rw-r--r--fs/nfs/nfs4filelayout.c6
-rw-r--r--fs/nfs/nfs4filelayoutdev.c9
-rw-r--r--fs/nfs/nfs4proc.c229
-rw-r--r--fs/nfs/nfs4renewd.c11
-rw-r--r--fs/nfs/nfs4state.c287
-rw-r--r--fs/nfs/nfs4xdr.c1439
-rw-r--r--fs/nfs/pagelist.c11
-rw-r--r--fs/nfs/pnfs.c526
-rw-r--r--fs/nfs/pnfs.h76
-rw-r--r--fs/nfs/proc.c5
-rw-r--r--fs/nfs/read.c1
-rw-r--r--fs/nfs/super.c32
-rw-r--r--fs/nfs/unlink.c4
-rw-r--r--fs/nfs/write.c5
-rw-r--r--fs/nfs_common/nfsacl.c54
-rw-r--r--fs/nfsd/acl.h59
-rw-r--r--fs/nfsd/export.c4
-rw-r--r--fs/nfsd/idmap.h62
-rw-r--r--fs/nfsd/nfs3proc.c8
-rw-r--r--fs/nfsd/nfs3xdr.c6
-rw-r--r--fs/nfsd/nfs4acl.c2
-rw-r--r--fs/nfsd/nfs4callback.c841
-rw-r--r--fs/nfsd/nfs4idmap.c15
-rw-r--r--fs/nfsd/nfs4proc.c59
-rw-r--r--fs/nfsd/nfs4recover.c1
-rw-r--r--fs/nfsd/nfs4state.c265
-rw-r--r--fs/nfsd/nfs4xdr.c115
-rw-r--r--fs/nfsd/nfsctl.c4
-rw-r--r--fs/nfsd/nfsd.h1
-rw-r--r--fs/nfsd/nfsproc.c6
-rw-r--r--fs/nfsd/nfssvc.c2
-rw-r--r--fs/nfsd/state.h16
-rw-r--r--fs/nfsd/vfs.c96
-rw-r--r--fs/nfsd/xdr4.h30
-rw-r--r--fs/nilfs2/bmap.c47
-rw-r--r--fs/nilfs2/btnode.c3
-rw-r--r--fs/nilfs2/dat.c2
-rw-r--r--fs/nilfs2/dir.c3
-rw-r--r--fs/nilfs2/file.c1
-rw-r--r--fs/nilfs2/gcinode.c9
-rw-r--r--fs/nilfs2/ifile.c11
-rw-r--r--fs/nilfs2/inode.c190
-rw-r--r--fs/nilfs2/ioctl.c28
-rw-r--r--fs/nilfs2/mdt.c32
-rw-r--r--fs/nilfs2/namei.c1
-rw-r--r--fs/nilfs2/nilfs.h15
-rw-r--r--fs/nilfs2/page.c86
-rw-r--r--fs/nilfs2/page.h3
-rw-r--r--fs/nilfs2/recovery.c2
-rw-r--r--fs/nilfs2/sb.h8
-rw-r--r--fs/nilfs2/segment.c43
-rw-r--r--fs/nilfs2/super.c53
-rw-r--r--fs/nilfs2/the_nilfs.c6
-rw-r--r--fs/nilfs2/the_nilfs.h3
-rw-r--r--fs/notify/fanotify/Kconfig2
-rw-r--r--fs/notify/fanotify/fanotify.c6
-rw-r--r--fs/notify/fanotify/fanotify_user.c81
-rw-r--r--fs/notify/fsnotify.c8
-rw-r--r--fs/notify/inotify/inotify_user.c1
-rw-r--r--fs/ntfs/Makefile2
-rw-r--r--fs/ntfs/file.c35
-rw-r--r--fs/ntfs/inode.c9
-rw-r--r--fs/ntfs/mft.c11
-rw-r--r--fs/ntfs/super.c6
-rw-r--r--fs/ocfs2/Kconfig5
-rw-r--r--fs/ocfs2/acl.c8
-rw-r--r--fs/ocfs2/acl.h2
-rw-r--r--fs/ocfs2/alloc.c77
-rw-r--r--fs/ocfs2/alloc.h4
-rw-r--r--fs/ocfs2/aops.c66
-rw-r--r--fs/ocfs2/aops.h23
-rw-r--r--fs/ocfs2/cluster/heartbeat.c265
-rw-r--r--fs/ocfs2/cluster/masklog.c3
-rw-r--r--fs/ocfs2/cluster/masklog.h15
-rw-r--r--fs/ocfs2/cluster/netdebug.c286
-rw-r--r--fs/ocfs2/cluster/quorum.c4
-rw-r--r--fs/ocfs2/cluster/tcp.c145
-rw-r--r--fs/ocfs2/cluster/tcp_internal.h33
-rw-r--r--fs/ocfs2/dcache.c21
-rw-r--r--fs/ocfs2/dir.c4
-rw-r--r--fs/ocfs2/dlm/dlmast.c76
-rw-r--r--fs/ocfs2/dlm/dlmcommon.h86
-rw-r--r--fs/ocfs2/dlm/dlmdebug.c200
-rw-r--r--fs/ocfs2/dlm/dlmdebug.h5
-rw-r--r--fs/ocfs2/dlm/dlmdomain.c12
-rw-r--r--fs/ocfs2/dlm/dlmlock.c3
-rw-r--r--fs/ocfs2/dlm/dlmmaster.c40
-rw-r--r--fs/ocfs2/dlm/dlmthread.c132
-rw-r--r--fs/ocfs2/dlmfs/dlmfs.c9
-rw-r--r--fs/ocfs2/export.c6
-rw-r--r--fs/ocfs2/file.c40
-rw-r--r--fs/ocfs2/file.h2
-rw-r--r--fs/ocfs2/inode.c2
-rw-r--r--fs/ocfs2/namei.c10
-rw-r--r--fs/ocfs2/ocfs2.h11
-rw-r--r--fs/ocfs2/ocfs2_fs.h2
-rw-r--r--fs/ocfs2/stack_user.c2
-rw-r--r--fs/ocfs2/suballoc.c2
-rw-r--r--fs/ocfs2/super.c16
-rw-r--r--fs/open.c11
-rw-r--r--fs/openpromfs/inode.c11
-rw-r--r--fs/partitions/check.c106
-rw-r--r--fs/pipe.c34
-rw-r--r--fs/pnode.c4
-rw-r--r--fs/posix_acl.c17
-rw-r--r--fs/proc/Kconfig6
-rw-r--r--fs/proc/Makefile1
-rw-r--r--fs/proc/array.c28
-rw-r--r--fs/proc/base.c183
-rw-r--r--fs/proc/consoles.c114
-rw-r--r--fs/proc/devices.c4
-rw-r--r--fs/proc/generic.c21
-rw-r--r--fs/proc/inode.c17
-rw-r--r--fs/proc/internal.h5
-rw-r--r--fs/proc/kcore.c2
-rw-r--r--fs/proc/meminfo.c14
-rw-r--r--fs/proc/page.c16
-rw-r--r--fs/proc/proc_sysctl.c31
-rw-r--r--fs/proc/proc_tty.c26
-rw-r--r--fs/proc/softirqs.c6
-rw-r--r--fs/proc/stat.c2
-rw-r--r--fs/proc/task_mmu.c15
-rw-r--r--fs/proc/task_nommu.c7
-rw-r--r--fs/proc/vmcore.c2
-rw-r--r--fs/qnx4/inode.c9
-rw-r--r--fs/quota/dquot.c36
-rw-r--r--fs/quota/quota.c41
-rw-r--r--fs/quota/quota_tree.c9
-rw-r--r--fs/read_write.c28
-rw-r--r--fs/reiserfs/inode.c1
-rw-r--r--fs/reiserfs/ioctl.c8
-rw-r--r--fs/reiserfs/journal.c22
-rw-r--r--fs/reiserfs/prints.c4
-rw-r--r--fs/reiserfs/super.c27
-rw-r--r--fs/reiserfs/xattr.c18
-rw-r--r--fs/reiserfs/xattr_acl.c6
-rw-r--r--fs/romfs/super.c9
-rw-r--r--fs/select.c2
-rw-r--r--fs/splice.c67
-rw-r--r--fs/squashfs/Kconfig18
-rw-r--r--fs/squashfs/Makefile1
-rw-r--r--fs/squashfs/block.c9
-rw-r--r--fs/squashfs/cache.c1
-rw-r--r--fs/squashfs/decompressor.c16
-rw-r--r--fs/squashfs/decompressor.h9
-rw-r--r--fs/squashfs/fragment.c1
-rw-r--r--fs/squashfs/id.c1
-rw-r--r--fs/squashfs/lzo_wrapper.c1
-rw-r--r--fs/squashfs/squashfs.h8
-rw-r--r--fs/squashfs/squashfs_fs.h1
-rw-r--r--fs/squashfs/squashfs_fs_i.h6
-rw-r--r--fs/squashfs/super.c9
-rw-r--r--fs/squashfs/xattr_id.c1
-rw-r--r--fs/squashfs/xz_wrapper.c147
-rw-r--r--fs/squashfs/zlib_wrapper.c21
-rw-r--r--fs/stat.c4
-rw-r--r--fs/super.c22
-rw-r--r--fs/sysfs/Kconfig2
-rw-r--r--fs/sysfs/dir.c10
-rw-r--r--fs/sysfs/group.c10
-rw-r--r--fs/sysfs/inode.c12
-rw-r--r--fs/sysfs/sysfs.h3
-rw-r--r--fs/sysv/inode.c9
-rw-r--r--fs/sysv/namei.c4
-rw-r--r--fs/sysv/super.c8
-rw-r--r--fs/ubifs/super.c10
-rw-r--r--fs/udf/Kconfig1
-rw-r--r--fs/udf/balloc.c3
-rw-r--r--fs/udf/dir.c5
-rw-r--r--fs/udf/file.c11
-rw-r--r--fs/udf/ialloc.c21
-rw-r--r--fs/udf/inode.c51
-rw-r--r--fs/udf/namei.c107
-rw-r--r--fs/udf/partition.c27
-rw-r--r--fs/udf/super.c76
-rw-r--r--fs/udf/symlink.c12
-rw-r--r--fs/udf/udf_i.h13
-rw-r--r--fs/udf/udf_sb.h22
-rw-r--r--fs/udf/udfdecl.h4
-rw-r--r--fs/ufs/super.c9
-rw-r--r--fs/xfs/Makefile1
-rw-r--r--fs/xfs/linux-2.6/sv.h59
-rw-r--r--fs/xfs/linux-2.6/xfs_acl.c11
-rw-r--r--fs/xfs/linux-2.6/xfs_aops.c526
-rw-r--r--fs/xfs/linux-2.6/xfs_aops.h16
-rw-r--r--fs/xfs/linux-2.6/xfs_buf.c275
-rw-r--r--fs/xfs/linux-2.6/xfs_buf.h29
-rw-r--r--fs/xfs/linux-2.6/xfs_discard.c191
-rw-r--r--fs/xfs/linux-2.6/xfs_discard.h8
-rw-r--r--fs/xfs/linux-2.6/xfs_export.c12
-rw-r--r--fs/xfs/linux-2.6/xfs_file.c587
-rw-r--r--fs/xfs/linux-2.6/xfs_ioctl.c25
-rw-r--r--fs/xfs/linux-2.6/xfs_iops.c57
-rw-r--r--fs/xfs/linux-2.6/xfs_linux.h1
-rw-r--r--fs/xfs/linux-2.6/xfs_super.c34
-rw-r--r--fs/xfs/linux-2.6/xfs_sync.c104
-rw-r--r--fs/xfs/linux-2.6/xfs_sysctl.c23
-rw-r--r--fs/xfs/linux-2.6/xfs_trace.h92
-rw-r--r--fs/xfs/quota/xfs_dquot.c1
-rw-r--r--fs/xfs/quota/xfs_qm.c46
-rw-r--r--fs/xfs/support/debug.c112
-rw-r--r--fs/xfs/support/debug.h25
-rw-r--r--fs/xfs/xfs_acl.h2
-rw-r--r--fs/xfs/xfs_ag.h2
-rw-r--r--fs/xfs/xfs_alloc.c361
-rw-r--r--fs/xfs/xfs_alloc.h41
-rw-r--r--fs/xfs/xfs_attr_leaf.c4
-rw-r--r--fs/xfs/xfs_bmap.c146
-rw-r--r--fs/xfs/xfs_bmap.h5
-rw-r--r--fs/xfs/xfs_btree.c9
-rw-r--r--fs/xfs/xfs_buf_item.c191
-rw-r--r--fs/xfs/xfs_buf_item.h11
-rw-r--r--fs/xfs/xfs_dfrag.c13
-rw-r--r--fs/xfs/xfs_error.c34
-rw-r--r--fs/xfs/xfs_error.h23
-rw-r--r--fs/xfs/xfs_extfree_item.c96
-rw-r--r--fs/xfs/xfs_extfree_item.h11
-rw-r--r--fs/xfs/xfs_filestream.c8
-rw-r--r--fs/xfs/xfs_fsops.c11
-rw-r--r--fs/xfs/xfs_fsops.h2
-rw-r--r--fs/xfs/xfs_iget.c90
-rw-r--r--fs/xfs/xfs_inode.c54
-rw-r--r--fs/xfs/xfs_inode.h15
-rw-r--r--fs/xfs/xfs_inode_item.c121
-rw-r--r--fs/xfs/xfs_iomap.c238
-rw-r--r--fs/xfs/xfs_iomap.h27
-rw-r--r--fs/xfs/xfs_log.c741
-rw-r--r--fs/xfs/xfs_log.h2
-rw-r--r--fs/xfs/xfs_log_cil.c32
-rw-r--r--fs/xfs/xfs_log_priv.h127
-rw-r--r--fs/xfs/xfs_log_recover.c622
-rw-r--r--fs/xfs/xfs_mount.c24
-rw-r--r--fs/xfs/xfs_mount.h14
-rw-r--r--fs/xfs/xfs_mru_cache.c2
-rw-r--r--fs/xfs/xfs_quota.h20
-rw-r--r--fs/xfs/xfs_rename.c1
-rw-r--r--fs/xfs/xfs_trans.c122
-rw-r--r--fs/xfs/xfs_trans.h2
-rw-r--r--fs/xfs/xfs_trans_ail.c232
-rw-r--r--fs/xfs/xfs_trans_extfree.c8
-rw-r--r--fs/xfs/xfs_trans_priv.h35
-rw-r--r--fs/xfs/xfs_vnodeops.c61
600 files changed, 28864 insertions, 17806 deletions
diff --git a/fs/9p/Kconfig b/fs/9p/Kconfig
index 7e051147679..814ac4e213a 100644
--- a/fs/9p/Kconfig
+++ b/fs/9p/Kconfig
@@ -9,6 +9,8 @@ config 9P_FS
9 9
10 If unsure, say N. 10 If unsure, say N.
11 11
12if 9P_FS
13
12config 9P_FSCACHE 14config 9P_FSCACHE
13 bool "Enable 9P client caching support (EXPERIMENTAL)" 15 bool "Enable 9P client caching support (EXPERIMENTAL)"
14 depends on EXPERIMENTAL 16 depends on EXPERIMENTAL
@@ -20,7 +22,6 @@ config 9P_FSCACHE
20 22
21config 9P_FS_POSIX_ACL 23config 9P_FS_POSIX_ACL
22 bool "9P POSIX Access Control Lists" 24 bool "9P POSIX Access Control Lists"
23 depends on 9P_FS
24 select FS_POSIX_ACL 25 select FS_POSIX_ACL
25 help 26 help
26 POSIX Access Control Lists (ACLs) support permissions for users and 27 POSIX Access Control Lists (ACLs) support permissions for users and
@@ -30,3 +31,5 @@ config 9P_FS_POSIX_ACL
30 Linux website <http://acl.bestbits.at/>. 31 Linux website <http://acl.bestbits.at/>.
31 32
32 If you don't know what Access Control Lists are, say N 33 If you don't know what Access Control Lists are, say N
34
35endif
diff --git a/fs/9p/Makefile b/fs/9p/Makefile
index f8ba37effd1..ab8c1278063 100644
--- a/fs/9p/Makefile
+++ b/fs/9p/Makefile
@@ -3,6 +3,7 @@ obj-$(CONFIG_9P_FS) := 9p.o
39p-objs := \ 39p-objs := \
4 vfs_super.o \ 4 vfs_super.o \
5 vfs_inode.o \ 5 vfs_inode.o \
6 vfs_inode_dotl.o \
6 vfs_addr.o \ 7 vfs_addr.o \
7 vfs_file.o \ 8 vfs_file.o \
8 vfs_dir.o \ 9 vfs_dir.o \
diff --git a/fs/9p/acl.c b/fs/9p/acl.c
index 12d602351db..02a2cf61631 100644
--- a/fs/9p/acl.c
+++ b/fs/9p/acl.c
@@ -28,7 +28,7 @@ static struct posix_acl *__v9fs_get_acl(struct p9_fid *fid, char *name)
28{ 28{
29 ssize_t size; 29 ssize_t size;
30 void *value = NULL; 30 void *value = NULL;
31 struct posix_acl *acl = NULL;; 31 struct posix_acl *acl = NULL;
32 32
33 size = v9fs_fid_xattr_get(fid, name, NULL, 0); 33 size = v9fs_fid_xattr_get(fid, name, NULL, 0);
34 if (size > 0) { 34 if (size > 0) {
@@ -91,11 +91,14 @@ static struct posix_acl *v9fs_get_cached_acl(struct inode *inode, int type)
91 return acl; 91 return acl;
92} 92}
93 93
94int v9fs_check_acl(struct inode *inode, int mask) 94int v9fs_check_acl(struct inode *inode, int mask, unsigned int flags)
95{ 95{
96 struct posix_acl *acl; 96 struct posix_acl *acl;
97 struct v9fs_session_info *v9ses; 97 struct v9fs_session_info *v9ses;
98 98
99 if (flags & IPERM_FLAG_RCU)
100 return -ECHILD;
101
99 v9ses = v9fs_inode2v9ses(inode); 102 v9ses = v9fs_inode2v9ses(inode);
100 if ((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT) { 103 if ((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT) {
101 /* 104 /*
@@ -362,7 +365,7 @@ static int v9fs_xattr_set_acl(struct dentry *dentry, const char *name,
362 case ACL_TYPE_DEFAULT: 365 case ACL_TYPE_DEFAULT:
363 name = POSIX_ACL_XATTR_DEFAULT; 366 name = POSIX_ACL_XATTR_DEFAULT;
364 if (!S_ISDIR(inode->i_mode)) { 367 if (!S_ISDIR(inode->i_mode)) {
365 retval = -EINVAL; 368 retval = acl ? -EINVAL : 0;
366 goto err_out; 369 goto err_out;
367 } 370 }
368 break; 371 break;
diff --git a/fs/9p/acl.h b/fs/9p/acl.h
index 59e18c2e8c7..7ef3ac9f6d9 100644
--- a/fs/9p/acl.h
+++ b/fs/9p/acl.h
@@ -16,7 +16,7 @@
16 16
17#ifdef CONFIG_9P_FS_POSIX_ACL 17#ifdef CONFIG_9P_FS_POSIX_ACL
18extern int v9fs_get_acl(struct inode *, struct p9_fid *); 18extern int v9fs_get_acl(struct inode *, struct p9_fid *);
19extern int v9fs_check_acl(struct inode *inode, int mask); 19extern int v9fs_check_acl(struct inode *inode, int mask, unsigned int flags);
20extern int v9fs_acl_chmod(struct dentry *); 20extern int v9fs_acl_chmod(struct dentry *);
21extern int v9fs_set_create_acl(struct dentry *, 21extern int v9fs_set_create_acl(struct dentry *,
22 struct posix_acl *, struct posix_acl *); 22 struct posix_acl *, struct posix_acl *);
diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h
index cb6396855e2..c4b5d8864f0 100644
--- a/fs/9p/v9fs.h
+++ b/fs/9p/v9fs.h
@@ -113,9 +113,27 @@ struct v9fs_session_info {
113 113
114struct p9_fid *v9fs_session_init(struct v9fs_session_info *, const char *, 114struct p9_fid *v9fs_session_init(struct v9fs_session_info *, const char *,
115 char *); 115 char *);
116void v9fs_session_close(struct v9fs_session_info *v9ses); 116extern void v9fs_session_close(struct v9fs_session_info *v9ses);
117void v9fs_session_cancel(struct v9fs_session_info *v9ses); 117extern void v9fs_session_cancel(struct v9fs_session_info *v9ses);
118void v9fs_session_begin_cancel(struct v9fs_session_info *v9ses); 118extern void v9fs_session_begin_cancel(struct v9fs_session_info *v9ses);
119extern struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
120 struct nameidata *nameidata);
121extern int v9fs_vfs_unlink(struct inode *i, struct dentry *d);
122extern int v9fs_vfs_rmdir(struct inode *i, struct dentry *d);
123extern int v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
124 struct inode *new_dir, struct dentry *new_dentry);
125extern void v9fs_vfs_put_link(struct dentry *dentry, struct nameidata *nd,
126 void *p);
127extern struct inode *v9fs_inode(struct v9fs_session_info *v9ses,
128 struct p9_fid *fid,
129 struct super_block *sb);
130
131extern const struct inode_operations v9fs_dir_inode_operations_dotl;
132extern const struct inode_operations v9fs_file_inode_operations_dotl;
133extern const struct inode_operations v9fs_symlink_inode_operations_dotl;
134extern struct inode *v9fs_inode_dotl(struct v9fs_session_info *v9ses,
135 struct p9_fid *fid,
136 struct super_block *sb);
119 137
120/* other default globals */ 138/* other default globals */
121#define V9FS_PORT 564 139#define V9FS_PORT 564
@@ -138,3 +156,21 @@ static inline int v9fs_proto_dotl(struct v9fs_session_info *v9ses)
138{ 156{
139 return v9ses->flags & V9FS_PROTO_2000L; 157 return v9ses->flags & V9FS_PROTO_2000L;
140} 158}
159
160/**
161 * v9fs_inode_from_fid - Helper routine to populate an inode by
162 * issuing a attribute request
163 * @v9ses: session information
164 * @fid: fid to issue attribute request for
165 * @sb: superblock on which to create inode
166 *
167 */
168static inline struct inode *
169v9fs_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid,
170 struct super_block *sb)
171{
172 if (v9fs_proto_dotl(v9ses))
173 return v9fs_inode_dotl(v9ses, fid, sb);
174 else
175 return v9fs_inode(v9ses, fid, sb);
176}
diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h
index bab0eac873f..b789f8e597e 100644
--- a/fs/9p/v9fs_vfs.h
+++ b/fs/9p/v9fs_vfs.h
@@ -59,7 +59,6 @@ void v9fs_stat2inode_dotl(struct p9_stat_dotl *, struct inode *);
59int v9fs_dir_release(struct inode *inode, struct file *filp); 59int v9fs_dir_release(struct inode *inode, struct file *filp);
60int v9fs_file_open(struct inode *inode, struct file *file); 60int v9fs_file_open(struct inode *inode, struct file *file);
61void v9fs_inode2stat(struct inode *inode, struct p9_wstat *stat); 61void v9fs_inode2stat(struct inode *inode, struct p9_wstat *stat);
62void v9fs_dentry_release(struct dentry *);
63int v9fs_uflags2omode(int uflags, int extended); 62int v9fs_uflags2omode(int uflags, int extended);
64 63
65ssize_t v9fs_file_readn(struct file *, char *, char __user *, u32, u64); 64ssize_t v9fs_file_readn(struct file *, char *, char __user *, u32, u64);
diff --git a/fs/9p/vfs_dentry.c b/fs/9p/vfs_dentry.c
index cbf4e50f393..233b7d4ffe5 100644
--- a/fs/9p/vfs_dentry.c
+++ b/fs/9p/vfs_dentry.c
@@ -51,7 +51,7 @@
51 * 51 *
52 */ 52 */
53 53
54static int v9fs_dentry_delete(struct dentry *dentry) 54static int v9fs_dentry_delete(const struct dentry *dentry)
55{ 55{
56 P9_DPRINTK(P9_DEBUG_VFS, " dentry: %s (%p)\n", dentry->d_name.name, 56 P9_DPRINTK(P9_DEBUG_VFS, " dentry: %s (%p)\n", dentry->d_name.name,
57 dentry); 57 dentry);
@@ -68,7 +68,7 @@ static int v9fs_dentry_delete(struct dentry *dentry)
68 * 68 *
69 */ 69 */
70 70
71static int v9fs_cached_dentry_delete(struct dentry *dentry) 71static int v9fs_cached_dentry_delete(const struct dentry *dentry)
72{ 72{
73 struct inode *inode = dentry->d_inode; 73 struct inode *inode = dentry->d_inode;
74 P9_DPRINTK(P9_DEBUG_VFS, " dentry: %s (%p)\n", dentry->d_name.name, 74 P9_DPRINTK(P9_DEBUG_VFS, " dentry: %s (%p)\n", dentry->d_name.name,
@@ -86,7 +86,7 @@ static int v9fs_cached_dentry_delete(struct dentry *dentry)
86 * 86 *
87 */ 87 */
88 88
89void v9fs_dentry_release(struct dentry *dentry) 89static void v9fs_dentry_release(struct dentry *dentry)
90{ 90{
91 struct v9fs_dentry *dent; 91 struct v9fs_dentry *dent;
92 struct p9_fid *temp, *current_fid; 92 struct p9_fid *temp, *current_fid;
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 34bf71b5654..b76a40bdf4c 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -49,15 +49,8 @@
49 49
50static const struct inode_operations v9fs_dir_inode_operations; 50static const struct inode_operations v9fs_dir_inode_operations;
51static const struct inode_operations v9fs_dir_inode_operations_dotu; 51static const struct inode_operations v9fs_dir_inode_operations_dotu;
52static const struct inode_operations v9fs_dir_inode_operations_dotl;
53static const struct inode_operations v9fs_file_inode_operations; 52static const struct inode_operations v9fs_file_inode_operations;
54static const struct inode_operations v9fs_file_inode_operations_dotl;
55static const struct inode_operations v9fs_symlink_inode_operations; 53static const struct inode_operations v9fs_symlink_inode_operations;
56static const struct inode_operations v9fs_symlink_inode_operations_dotl;
57
58static int
59v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode,
60 dev_t rdev);
61 54
62/** 55/**
63 * unixmode2p9mode - convert unix mode bits to plan 9 56 * unixmode2p9mode - convert unix mode bits to plan 9
@@ -237,46 +230,18 @@ struct inode *v9fs_alloc_inode(struct super_block *sb)
237 * 230 *
238 */ 231 */
239 232
240void v9fs_destroy_inode(struct inode *inode) 233static void v9fs_i_callback(struct rcu_head *head)
241{ 234{
235 struct inode *inode = container_of(head, struct inode, i_rcu);
236 INIT_LIST_HEAD(&inode->i_dentry);
242 kmem_cache_free(vcookie_cache, v9fs_inode2cookie(inode)); 237 kmem_cache_free(vcookie_cache, v9fs_inode2cookie(inode));
243} 238}
244#endif
245 239
246/** 240void v9fs_destroy_inode(struct inode *inode)
247 * v9fs_get_fsgid_for_create - Helper function to get the gid for creating a
248 * new file system object. This checks the S_ISGID to determine the owning
249 * group of the new file system object.
250 */
251
252static gid_t v9fs_get_fsgid_for_create(struct inode *dir_inode)
253{
254 BUG_ON(dir_inode == NULL);
255
256 if (dir_inode->i_mode & S_ISGID) {
257 /* set_gid bit is set.*/
258 return dir_inode->i_gid;
259 }
260 return current_fsgid();
261}
262
263/**
264 * v9fs_dentry_from_dir_inode - helper function to get the dentry from
265 * dir inode.
266 *
267 */
268
269static struct dentry *v9fs_dentry_from_dir_inode(struct inode *inode)
270{ 241{
271 struct dentry *dentry; 242 call_rcu(&inode->i_rcu, v9fs_i_callback);
272
273 spin_lock(&dcache_lock);
274 /* Directory should have only one entry. */
275 BUG_ON(S_ISDIR(inode->i_mode) && !list_is_singular(&inode->i_dentry));
276 dentry = list_entry(inode->i_dentry.next, struct dentry, d_alias);
277 spin_unlock(&dcache_lock);
278 return dentry;
279} 243}
244#endif
280 245
281/** 246/**
282 * v9fs_get_inode - helper function to setup an inode 247 * v9fs_get_inode - helper function to setup an inode
@@ -447,7 +412,7 @@ void v9fs_evict_inode(struct inode *inode)
447#endif 412#endif
448} 413}
449 414
450static struct inode * 415struct inode *
451v9fs_inode(struct v9fs_session_info *v9ses, struct p9_fid *fid, 416v9fs_inode(struct v9fs_session_info *v9ses, struct p9_fid *fid,
452 struct super_block *sb) 417 struct super_block *sb)
453{ 418{
@@ -482,60 +447,6 @@ error:
482 return ERR_PTR(err); 447 return ERR_PTR(err);
483} 448}
484 449
485static struct inode *
486v9fs_inode_dotl(struct v9fs_session_info *v9ses, struct p9_fid *fid,
487 struct super_block *sb)
488{
489 struct inode *ret = NULL;
490 int err;
491 struct p9_stat_dotl *st;
492
493 st = p9_client_getattr_dotl(fid, P9_STATS_BASIC);
494 if (IS_ERR(st))
495 return ERR_CAST(st);
496
497 ret = v9fs_get_inode(sb, st->st_mode);
498 if (IS_ERR(ret)) {
499 err = PTR_ERR(ret);
500 goto error;
501 }
502
503 v9fs_stat2inode_dotl(st, ret);
504 ret->i_ino = v9fs_qid2ino(&st->qid);
505#ifdef CONFIG_9P_FSCACHE
506 v9fs_vcookie_set_qid(ret, &st->qid);
507 v9fs_cache_inode_get_cookie(ret);
508#endif
509 err = v9fs_get_acl(ret, fid);
510 if (err) {
511 iput(ret);
512 goto error;
513 }
514 kfree(st);
515 return ret;
516error:
517 kfree(st);
518 return ERR_PTR(err);
519}
520
521/**
522 * v9fs_inode_from_fid - Helper routine to populate an inode by
523 * issuing a attribute request
524 * @v9ses: session information
525 * @fid: fid to issue attribute request for
526 * @sb: superblock on which to create inode
527 *
528 */
529static inline struct inode *
530v9fs_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid,
531 struct super_block *sb)
532{
533 if (v9fs_proto_dotl(v9ses))
534 return v9fs_inode_dotl(v9ses, fid, sb);
535 else
536 return v9fs_inode(v9ses, fid, sb);
537}
538
539/** 450/**
540 * v9fs_remove - helper function to remove files and directories 451 * v9fs_remove - helper function to remove files and directories
541 * @dir: directory inode that is being deleted 452 * @dir: directory inode that is being deleted
@@ -626,12 +537,6 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir,
626 P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", err); 537 P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", err);
627 goto error; 538 goto error;
628 } 539 }
629
630 if (v9ses->cache)
631 dentry->d_op = &v9fs_cached_dentry_operations;
632 else
633 dentry->d_op = &v9fs_dentry_operations;
634
635 d_instantiate(dentry, inode); 540 d_instantiate(dentry, inode);
636 err = v9fs_fid_add(dentry, fid); 541 err = v9fs_fid_add(dentry, fid);
637 if (err < 0) 542 if (err < 0)
@@ -650,144 +555,6 @@ error:
650} 555}
651 556
652/** 557/**
653 * v9fs_vfs_create_dotl - VFS hook to create files for 9P2000.L protocol.
654 * @dir: directory inode that is being created
655 * @dentry: dentry that is being deleted
656 * @mode: create permissions
657 * @nd: path information
658 *
659 */
660
661static int
662v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode,
663 struct nameidata *nd)
664{
665 int err = 0;
666 char *name = NULL;
667 gid_t gid;
668 int flags;
669 mode_t mode;
670 struct v9fs_session_info *v9ses;
671 struct p9_fid *fid = NULL;
672 struct p9_fid *dfid, *ofid;
673 struct file *filp;
674 struct p9_qid qid;
675 struct inode *inode;
676 struct posix_acl *pacl = NULL, *dacl = NULL;
677
678 v9ses = v9fs_inode2v9ses(dir);
679 if (nd && nd->flags & LOOKUP_OPEN)
680 flags = nd->intent.open.flags - 1;
681 else {
682 /*
683 * create call without LOOKUP_OPEN is due
684 * to mknod of regular files. So use mknod
685 * operation.
686 */
687 return v9fs_vfs_mknod_dotl(dir, dentry, omode, 0);
688 }
689
690 name = (char *) dentry->d_name.name;
691 P9_DPRINTK(P9_DEBUG_VFS, "v9fs_vfs_create_dotl: name:%s flags:0x%x "
692 "mode:0x%x\n", name, flags, omode);
693
694 dfid = v9fs_fid_lookup(dentry->d_parent);
695 if (IS_ERR(dfid)) {
696 err = PTR_ERR(dfid);
697 P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
698 return err;
699 }
700
701 /* clone a fid to use for creation */
702 ofid = p9_client_walk(dfid, 0, NULL, 1);
703 if (IS_ERR(ofid)) {
704 err = PTR_ERR(ofid);
705 P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err);
706 return err;
707 }
708
709 gid = v9fs_get_fsgid_for_create(dir);
710
711 mode = omode;
712 /* Update mode based on ACL value */
713 err = v9fs_acl_mode(dir, &mode, &dacl, &pacl);
714 if (err) {
715 P9_DPRINTK(P9_DEBUG_VFS,
716 "Failed to get acl values in creat %d\n", err);
717 goto error;
718 }
719 err = p9_client_create_dotl(ofid, name, flags, mode, gid, &qid);
720 if (err < 0) {
721 P9_DPRINTK(P9_DEBUG_VFS,
722 "p9_client_open_dotl failed in creat %d\n",
723 err);
724 goto error;
725 }
726 /* instantiate inode and assign the unopened fid to the dentry */
727 if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE ||
728 (nd && nd->flags & LOOKUP_OPEN)) {
729 fid = p9_client_walk(dfid, 1, &name, 1);
730 if (IS_ERR(fid)) {
731 err = PTR_ERR(fid);
732 P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n",
733 err);
734 fid = NULL;
735 goto error;
736 }
737
738 inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb);
739 if (IS_ERR(inode)) {
740 err = PTR_ERR(inode);
741 P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n",
742 err);
743 goto error;
744 }
745 dentry->d_op = &v9fs_cached_dentry_operations;
746 d_instantiate(dentry, inode);
747 err = v9fs_fid_add(dentry, fid);
748 if (err < 0)
749 goto error;
750 /* The fid would get clunked via a dput */
751 fid = NULL;
752 } else {
753 /*
754 * Not in cached mode. No need to populate
755 * inode with stat. We need to get an inode
756 * so that we can set the acl with dentry
757 */
758 inode = v9fs_get_inode(dir->i_sb, mode);
759 if (IS_ERR(inode)) {
760 err = PTR_ERR(inode);
761 goto error;
762 }
763 dentry->d_op = &v9fs_dentry_operations;
764 d_instantiate(dentry, inode);
765 }
766 /* Now set the ACL based on the default value */
767 v9fs_set_create_acl(dentry, dacl, pacl);
768
769 /* if we are opening a file, assign the open fid to the file */
770 if (nd && nd->flags & LOOKUP_OPEN) {
771 filp = lookup_instantiate_filp(nd, dentry, generic_file_open);
772 if (IS_ERR(filp)) {
773 p9_client_clunk(ofid);
774 return PTR_ERR(filp);
775 }
776 filp->private_data = ofid;
777 } else
778 p9_client_clunk(ofid);
779
780 return 0;
781
782error:
783 if (ofid)
784 p9_client_clunk(ofid);
785 if (fid)
786 p9_client_clunk(fid);
787 return err;
788}
789
790/**
791 * v9fs_vfs_create - VFS hook to create files 558 * v9fs_vfs_create - VFS hook to create files
792 * @dir: directory inode that is being created 559 * @dir: directory inode that is being created
793 * @dentry: dentry that is being deleted 560 * @dentry: dentry that is being deleted
@@ -877,107 +644,6 @@ static int v9fs_vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
877 return err; 644 return err;
878} 645}
879 646
880
881/**
882 * v9fs_vfs_mkdir_dotl - VFS mkdir hook to create a directory
883 * @dir: inode that is being unlinked
884 * @dentry: dentry that is being unlinked
885 * @mode: mode for new directory
886 *
887 */
888
889static int v9fs_vfs_mkdir_dotl(struct inode *dir,
890 struct dentry *dentry, int omode)
891{
892 int err;
893 struct v9fs_session_info *v9ses;
894 struct p9_fid *fid = NULL, *dfid = NULL;
895 gid_t gid;
896 char *name;
897 mode_t mode;
898 struct inode *inode;
899 struct p9_qid qid;
900 struct dentry *dir_dentry;
901 struct posix_acl *dacl = NULL, *pacl = NULL;
902
903 P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", dentry->d_name.name);
904 err = 0;
905 v9ses = v9fs_inode2v9ses(dir);
906
907 omode |= S_IFDIR;
908 if (dir->i_mode & S_ISGID)
909 omode |= S_ISGID;
910
911 dir_dentry = v9fs_dentry_from_dir_inode(dir);
912 dfid = v9fs_fid_lookup(dir_dentry);
913 if (IS_ERR(dfid)) {
914 err = PTR_ERR(dfid);
915 P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
916 dfid = NULL;
917 goto error;
918 }
919
920 gid = v9fs_get_fsgid_for_create(dir);
921 mode = omode;
922 /* Update mode based on ACL value */
923 err = v9fs_acl_mode(dir, &mode, &dacl, &pacl);
924 if (err) {
925 P9_DPRINTK(P9_DEBUG_VFS,
926 "Failed to get acl values in mkdir %d\n", err);
927 goto error;
928 }
929 name = (char *) dentry->d_name.name;
930 err = p9_client_mkdir_dotl(dfid, name, mode, gid, &qid);
931 if (err < 0)
932 goto error;
933
934 /* instantiate inode and assign the unopened fid to the dentry */
935 if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
936 fid = p9_client_walk(dfid, 1, &name, 1);
937 if (IS_ERR(fid)) {
938 err = PTR_ERR(fid);
939 P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n",
940 err);
941 fid = NULL;
942 goto error;
943 }
944
945 inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb);
946 if (IS_ERR(inode)) {
947 err = PTR_ERR(inode);
948 P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n",
949 err);
950 goto error;
951 }
952 dentry->d_op = &v9fs_cached_dentry_operations;
953 d_instantiate(dentry, inode);
954 err = v9fs_fid_add(dentry, fid);
955 if (err < 0)
956 goto error;
957 fid = NULL;
958 } else {
959 /*
960 * Not in cached mode. No need to populate
961 * inode with stat. We need to get an inode
962 * so that we can set the acl with dentry
963 */
964 inode = v9fs_get_inode(dir->i_sb, mode);
965 if (IS_ERR(inode)) {
966 err = PTR_ERR(inode);
967 goto error;
968 }
969 dentry->d_op = &v9fs_dentry_operations;
970 d_instantiate(dentry, inode);
971 }
972 /* Now set the ACL based on the default value */
973 v9fs_set_create_acl(dentry, dacl, pacl);
974
975error:
976 if (fid)
977 p9_client_clunk(fid);
978 return err;
979}
980
981/** 647/**
982 * v9fs_vfs_lookup - VFS lookup hook to "walk" to a new inode 648 * v9fs_vfs_lookup - VFS lookup hook to "walk" to a new inode
983 * @dir: inode that is being walked from 649 * @dir: inode that is being walked from
@@ -986,7 +652,7 @@ error:
986 * 652 *
987 */ 653 */
988 654
989static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry, 655struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
990 struct nameidata *nameidata) 656 struct nameidata *nameidata)
991{ 657{
992 struct super_block *sb; 658 struct super_block *sb;
@@ -1033,11 +699,6 @@ static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
1033 goto error_iput; 699 goto error_iput;
1034 700
1035inst_out: 701inst_out:
1036 if (v9ses->cache)
1037 dentry->d_op = &v9fs_cached_dentry_operations;
1038 else
1039 dentry->d_op = &v9fs_dentry_operations;
1040
1041 d_add(dentry, inode); 702 d_add(dentry, inode);
1042 return NULL; 703 return NULL;
1043 704
@@ -1056,7 +717,7 @@ error:
1056 * 717 *
1057 */ 718 */
1058 719
1059static int v9fs_vfs_unlink(struct inode *i, struct dentry *d) 720int v9fs_vfs_unlink(struct inode *i, struct dentry *d)
1060{ 721{
1061 return v9fs_remove(i, d, 0); 722 return v9fs_remove(i, d, 0);
1062} 723}
@@ -1068,7 +729,7 @@ static int v9fs_vfs_unlink(struct inode *i, struct dentry *d)
1068 * 729 *
1069 */ 730 */
1070 731
1071static int v9fs_vfs_rmdir(struct inode *i, struct dentry *d) 732int v9fs_vfs_rmdir(struct inode *i, struct dentry *d)
1072{ 733{
1073 return v9fs_remove(i, d, 1); 734 return v9fs_remove(i, d, 1);
1074} 735}
@@ -1082,7 +743,7 @@ static int v9fs_vfs_rmdir(struct inode *i, struct dentry *d)
1082 * 743 *
1083 */ 744 */
1084 745
1085static int 746int
1086v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry, 747v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
1087 struct inode *new_dir, struct dentry *new_dentry) 748 struct inode *new_dir, struct dentry *new_dentry)
1088{ 749{
@@ -1189,42 +850,6 @@ v9fs_vfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
1189 return 0; 850 return 0;
1190} 851}
1191 852
1192static int
1193v9fs_vfs_getattr_dotl(struct vfsmount *mnt, struct dentry *dentry,
1194 struct kstat *stat)
1195{
1196 int err;
1197 struct v9fs_session_info *v9ses;
1198 struct p9_fid *fid;
1199 struct p9_stat_dotl *st;
1200
1201 P9_DPRINTK(P9_DEBUG_VFS, "dentry: %p\n", dentry);
1202 err = -EPERM;
1203 v9ses = v9fs_inode2v9ses(dentry->d_inode);
1204 if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE)
1205 return simple_getattr(mnt, dentry, stat);
1206
1207 fid = v9fs_fid_lookup(dentry);
1208 if (IS_ERR(fid))
1209 return PTR_ERR(fid);
1210
1211 /* Ask for all the fields in stat structure. Server will return
1212 * whatever it supports
1213 */
1214
1215 st = p9_client_getattr_dotl(fid, P9_STATS_ALL);
1216 if (IS_ERR(st))
1217 return PTR_ERR(st);
1218
1219 v9fs_stat2inode_dotl(st, dentry->d_inode);
1220 generic_fillattr(dentry->d_inode, stat);
1221 /* Change block size to what the server returned */
1222 stat->blksize = st->st_blksize;
1223
1224 kfree(st);
1225 return 0;
1226}
1227
1228/** 853/**
1229 * v9fs_vfs_setattr - set file metadata 854 * v9fs_vfs_setattr - set file metadata
1230 * @dentry: file whose metadata to set 855 * @dentry: file whose metadata to set
@@ -1284,64 +909,6 @@ static int v9fs_vfs_setattr(struct dentry *dentry, struct iattr *iattr)
1284} 909}
1285 910
1286/** 911/**
1287 * v9fs_vfs_setattr_dotl - set file metadata
1288 * @dentry: file whose metadata to set
1289 * @iattr: metadata assignment structure
1290 *
1291 */
1292
1293int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr)
1294{
1295 int retval;
1296 struct v9fs_session_info *v9ses;
1297 struct p9_fid *fid;
1298 struct p9_iattr_dotl p9attr;
1299
1300 P9_DPRINTK(P9_DEBUG_VFS, "\n");
1301
1302 retval = inode_change_ok(dentry->d_inode, iattr);
1303 if (retval)
1304 return retval;
1305
1306 p9attr.valid = iattr->ia_valid;
1307 p9attr.mode = iattr->ia_mode;
1308 p9attr.uid = iattr->ia_uid;
1309 p9attr.gid = iattr->ia_gid;
1310 p9attr.size = iattr->ia_size;
1311 p9attr.atime_sec = iattr->ia_atime.tv_sec;
1312 p9attr.atime_nsec = iattr->ia_atime.tv_nsec;
1313 p9attr.mtime_sec = iattr->ia_mtime.tv_sec;
1314 p9attr.mtime_nsec = iattr->ia_mtime.tv_nsec;
1315
1316 retval = -EPERM;
1317 v9ses = v9fs_inode2v9ses(dentry->d_inode);
1318 fid = v9fs_fid_lookup(dentry);
1319 if (IS_ERR(fid))
1320 return PTR_ERR(fid);
1321
1322 retval = p9_client_setattr(fid, &p9attr);
1323 if (retval < 0)
1324 return retval;
1325
1326 if ((iattr->ia_valid & ATTR_SIZE) &&
1327 iattr->ia_size != i_size_read(dentry->d_inode)) {
1328 retval = vmtruncate(dentry->d_inode, iattr->ia_size);
1329 if (retval)
1330 return retval;
1331 }
1332
1333 setattr_copy(dentry->d_inode, iattr);
1334 mark_inode_dirty(dentry->d_inode);
1335 if (iattr->ia_valid & ATTR_MODE) {
1336 /* We also want to update ACL when we update mode bits */
1337 retval = v9fs_acl_chmod(dentry);
1338 if (retval < 0)
1339 return retval;
1340 }
1341 return 0;
1342}
1343
1344/**
1345 * v9fs_stat2inode - populate an inode structure with mistat info 912 * v9fs_stat2inode - populate an inode structure with mistat info
1346 * @stat: Plan 9 metadata (mistat) structure 913 * @stat: Plan 9 metadata (mistat) structure
1347 * @inode: inode to populate 914 * @inode: inode to populate
@@ -1419,77 +986,6 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode,
1419} 986}
1420 987
1421/** 988/**
1422 * v9fs_stat2inode_dotl - populate an inode structure with stat info
1423 * @stat: stat structure
1424 * @inode: inode to populate
1425 * @sb: superblock of filesystem
1426 *
1427 */
1428
1429void
1430v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode)
1431{
1432
1433 if ((stat->st_result_mask & P9_STATS_BASIC) == P9_STATS_BASIC) {
1434 inode->i_atime.tv_sec = stat->st_atime_sec;
1435 inode->i_atime.tv_nsec = stat->st_atime_nsec;
1436 inode->i_mtime.tv_sec = stat->st_mtime_sec;
1437 inode->i_mtime.tv_nsec = stat->st_mtime_nsec;
1438 inode->i_ctime.tv_sec = stat->st_ctime_sec;
1439 inode->i_ctime.tv_nsec = stat->st_ctime_nsec;
1440 inode->i_uid = stat->st_uid;
1441 inode->i_gid = stat->st_gid;
1442 inode->i_nlink = stat->st_nlink;
1443 inode->i_mode = stat->st_mode;
1444 inode->i_rdev = new_decode_dev(stat->st_rdev);
1445
1446 if ((S_ISBLK(inode->i_mode)) || (S_ISCHR(inode->i_mode)))
1447 init_special_inode(inode, inode->i_mode, inode->i_rdev);
1448
1449 i_size_write(inode, stat->st_size);
1450 inode->i_blocks = stat->st_blocks;
1451 } else {
1452 if (stat->st_result_mask & P9_STATS_ATIME) {
1453 inode->i_atime.tv_sec = stat->st_atime_sec;
1454 inode->i_atime.tv_nsec = stat->st_atime_nsec;
1455 }
1456 if (stat->st_result_mask & P9_STATS_MTIME) {
1457 inode->i_mtime.tv_sec = stat->st_mtime_sec;
1458 inode->i_mtime.tv_nsec = stat->st_mtime_nsec;
1459 }
1460 if (stat->st_result_mask & P9_STATS_CTIME) {
1461 inode->i_ctime.tv_sec = stat->st_ctime_sec;
1462 inode->i_ctime.tv_nsec = stat->st_ctime_nsec;
1463 }
1464 if (stat->st_result_mask & P9_STATS_UID)
1465 inode->i_uid = stat->st_uid;
1466 if (stat->st_result_mask & P9_STATS_GID)
1467 inode->i_gid = stat->st_gid;
1468 if (stat->st_result_mask & P9_STATS_NLINK)
1469 inode->i_nlink = stat->st_nlink;
1470 if (stat->st_result_mask & P9_STATS_MODE) {
1471 inode->i_mode = stat->st_mode;
1472 if ((S_ISBLK(inode->i_mode)) ||
1473 (S_ISCHR(inode->i_mode)))
1474 init_special_inode(inode, inode->i_mode,
1475 inode->i_rdev);
1476 }
1477 if (stat->st_result_mask & P9_STATS_RDEV)
1478 inode->i_rdev = new_decode_dev(stat->st_rdev);
1479 if (stat->st_result_mask & P9_STATS_SIZE)
1480 i_size_write(inode, stat->st_size);
1481 if (stat->st_result_mask & P9_STATS_BLOCKS)
1482 inode->i_blocks = stat->st_blocks;
1483 }
1484 if (stat->st_result_mask & P9_STATS_GEN)
1485 inode->i_generation = stat->st_gen;
1486
1487 /* Currently we don't support P9_STATS_BTIME and P9_STATS_DATA_VERSION
1488 * because the inode structure does not have fields for them.
1489 */
1490}
1491
1492/**
1493 * v9fs_qid2ino - convert qid into inode number 989 * v9fs_qid2ino - convert qid into inode number
1494 * @qid: qid to hash 990 * @qid: qid to hash
1495 * 991 *
@@ -1595,7 +1091,7 @@ static void *v9fs_vfs_follow_link(struct dentry *dentry, struct nameidata *nd)
1595 * 1091 *
1596 */ 1092 */
1597 1093
1598static void 1094void
1599v9fs_vfs_put_link(struct dentry *dentry, struct nameidata *nd, void *p) 1095v9fs_vfs_put_link(struct dentry *dentry, struct nameidata *nd, void *p)
1600{ 1096{
1601 char *s = nd_get_link(nd); 1097 char *s = nd_get_link(nd);
@@ -1639,94 +1135,6 @@ static int v9fs_vfs_mkspecial(struct inode *dir, struct dentry *dentry,
1639} 1135}
1640 1136
1641/** 1137/**
1642 * v9fs_vfs_symlink_dotl - helper function to create symlinks
1643 * @dir: directory inode containing symlink
1644 * @dentry: dentry for symlink
1645 * @symname: symlink data
1646 *
1647 * See Also: 9P2000.L RFC for more information
1648 *
1649 */
1650
1651static int
1652v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry,
1653 const char *symname)
1654{
1655 struct v9fs_session_info *v9ses;
1656 struct p9_fid *dfid;
1657 struct p9_fid *fid = NULL;
1658 struct inode *inode;
1659 struct p9_qid qid;
1660 char *name;
1661 int err;
1662 gid_t gid;
1663
1664 name = (char *) dentry->d_name.name;
1665 P9_DPRINTK(P9_DEBUG_VFS, "v9fs_vfs_symlink_dotl : %lu,%s,%s\n",
1666 dir->i_ino, name, symname);
1667 v9ses = v9fs_inode2v9ses(dir);
1668
1669 dfid = v9fs_fid_lookup(dentry->d_parent);
1670 if (IS_ERR(dfid)) {
1671 err = PTR_ERR(dfid);
1672 P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
1673 return err;
1674 }
1675
1676 gid = v9fs_get_fsgid_for_create(dir);
1677
1678 /* Server doesn't alter fid on TSYMLINK. Hence no need to clone it. */
1679 err = p9_client_symlink(dfid, name, (char *)symname, gid, &qid);
1680
1681 if (err < 0) {
1682 P9_DPRINTK(P9_DEBUG_VFS, "p9_client_symlink failed %d\n", err);
1683 goto error;
1684 }
1685
1686 if (v9ses->cache) {
1687 /* Now walk from the parent so we can get an unopened fid. */
1688 fid = p9_client_walk(dfid, 1, &name, 1);
1689 if (IS_ERR(fid)) {
1690 err = PTR_ERR(fid);
1691 P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n",
1692 err);
1693 fid = NULL;
1694 goto error;
1695 }
1696
1697 /* instantiate inode and assign the unopened fid to dentry */
1698 inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb);
1699 if (IS_ERR(inode)) {
1700 err = PTR_ERR(inode);
1701 P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n",
1702 err);
1703 goto error;
1704 }
1705 dentry->d_op = &v9fs_cached_dentry_operations;
1706 d_instantiate(dentry, inode);
1707 err = v9fs_fid_add(dentry, fid);
1708 if (err < 0)
1709 goto error;
1710 fid = NULL;
1711 } else {
1712 /* Not in cached mode. No need to populate inode with stat */
1713 inode = v9fs_get_inode(dir->i_sb, S_IFLNK);
1714 if (IS_ERR(inode)) {
1715 err = PTR_ERR(inode);
1716 goto error;
1717 }
1718 dentry->d_op = &v9fs_dentry_operations;
1719 d_instantiate(dentry, inode);
1720 }
1721
1722error:
1723 if (fid)
1724 p9_client_clunk(fid);
1725
1726 return err;
1727}
1728
1729/**
1730 * v9fs_vfs_symlink - helper function to create symlinks 1138 * v9fs_vfs_symlink - helper function to create symlinks
1731 * @dir: directory inode containing symlink 1139 * @dir: directory inode containing symlink
1732 * @dentry: dentry for symlink 1140 * @dentry: dentry for symlink
@@ -1785,77 +1193,6 @@ clunk_fid:
1785} 1193}
1786 1194
1787/** 1195/**
1788 * v9fs_vfs_link_dotl - create a hardlink for dotl
1789 * @old_dentry: dentry for file to link to
1790 * @dir: inode destination for new link
1791 * @dentry: dentry for link
1792 *
1793 */
1794
1795static int
1796v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir,
1797 struct dentry *dentry)
1798{
1799 int err;
1800 struct p9_fid *dfid, *oldfid;
1801 char *name;
1802 struct v9fs_session_info *v9ses;
1803 struct dentry *dir_dentry;
1804
1805 P9_DPRINTK(P9_DEBUG_VFS, "dir ino: %lu, old_name: %s, new_name: %s\n",
1806 dir->i_ino, old_dentry->d_name.name,
1807 dentry->d_name.name);
1808
1809 v9ses = v9fs_inode2v9ses(dir);
1810 dir_dentry = v9fs_dentry_from_dir_inode(dir);
1811 dfid = v9fs_fid_lookup(dir_dentry);
1812 if (IS_ERR(dfid))
1813 return PTR_ERR(dfid);
1814
1815 oldfid = v9fs_fid_lookup(old_dentry);
1816 if (IS_ERR(oldfid))
1817 return PTR_ERR(oldfid);
1818
1819 name = (char *) dentry->d_name.name;
1820
1821 err = p9_client_link(dfid, oldfid, (char *)dentry->d_name.name);
1822
1823 if (err < 0) {
1824 P9_DPRINTK(P9_DEBUG_VFS, "p9_client_link failed %d\n", err);
1825 return err;
1826 }
1827
1828 if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
1829 /* Get the latest stat info from server. */
1830 struct p9_fid *fid;
1831 struct p9_stat_dotl *st;
1832
1833 fid = v9fs_fid_lookup(old_dentry);
1834 if (IS_ERR(fid))
1835 return PTR_ERR(fid);
1836
1837 st = p9_client_getattr_dotl(fid, P9_STATS_BASIC);
1838 if (IS_ERR(st))
1839 return PTR_ERR(st);
1840
1841 v9fs_stat2inode_dotl(st, old_dentry->d_inode);
1842
1843 kfree(st);
1844 } else {
1845 /* Caching disabled. No need to get upto date stat info.
1846 * This dentry will be released immediately. So, just hold the
1847 * inode
1848 */
1849 ihold(old_dentry->d_inode);
1850 }
1851
1852 dentry->d_op = old_dentry->d_op;
1853 d_instantiate(dentry, old_dentry->d_inode);
1854
1855 return err;
1856}
1857
1858/**
1859 * v9fs_vfs_mknod - create a special file 1196 * v9fs_vfs_mknod - create a special file
1860 * @dir: inode destination for new link 1197 * @dir: inode destination for new link
1861 * @dentry: dentry for file 1198 * @dentry: dentry for file
@@ -1900,160 +1237,6 @@ v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev)
1900 return retval; 1237 return retval;
1901} 1238}
1902 1239
1903/**
1904 * v9fs_vfs_mknod_dotl - create a special file
1905 * @dir: inode destination for new link
1906 * @dentry: dentry for file
1907 * @mode: mode for creation
1908 * @rdev: device associated with special file
1909 *
1910 */
1911static int
1912v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode,
1913 dev_t rdev)
1914{
1915 int err;
1916 char *name;
1917 mode_t mode;
1918 struct v9fs_session_info *v9ses;
1919 struct p9_fid *fid = NULL, *dfid = NULL;
1920 struct inode *inode;
1921 gid_t gid;
1922 struct p9_qid qid;
1923 struct dentry *dir_dentry;
1924 struct posix_acl *dacl = NULL, *pacl = NULL;
1925
1926 P9_DPRINTK(P9_DEBUG_VFS,
1927 " %lu,%s mode: %x MAJOR: %u MINOR: %u\n", dir->i_ino,
1928 dentry->d_name.name, omode, MAJOR(rdev), MINOR(rdev));
1929
1930 if (!new_valid_dev(rdev))
1931 return -EINVAL;
1932
1933 v9ses = v9fs_inode2v9ses(dir);
1934 dir_dentry = v9fs_dentry_from_dir_inode(dir);
1935 dfid = v9fs_fid_lookup(dir_dentry);
1936 if (IS_ERR(dfid)) {
1937 err = PTR_ERR(dfid);
1938 P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
1939 dfid = NULL;
1940 goto error;
1941 }
1942
1943 gid = v9fs_get_fsgid_for_create(dir);
1944 mode = omode;
1945 /* Update mode based on ACL value */
1946 err = v9fs_acl_mode(dir, &mode, &dacl, &pacl);
1947 if (err) {
1948 P9_DPRINTK(P9_DEBUG_VFS,
1949 "Failed to get acl values in mknod %d\n", err);
1950 goto error;
1951 }
1952 name = (char *) dentry->d_name.name;
1953
1954 err = p9_client_mknod_dotl(dfid, name, mode, rdev, gid, &qid);
1955 if (err < 0)
1956 goto error;
1957
1958 /* instantiate inode and assign the unopened fid to the dentry */
1959 if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
1960 fid = p9_client_walk(dfid, 1, &name, 1);
1961 if (IS_ERR(fid)) {
1962 err = PTR_ERR(fid);
1963 P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n",
1964 err);
1965 fid = NULL;
1966 goto error;
1967 }
1968
1969 inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb);
1970 if (IS_ERR(inode)) {
1971 err = PTR_ERR(inode);
1972 P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n",
1973 err);
1974 goto error;
1975 }
1976 dentry->d_op = &v9fs_cached_dentry_operations;
1977 d_instantiate(dentry, inode);
1978 err = v9fs_fid_add(dentry, fid);
1979 if (err < 0)
1980 goto error;
1981 fid = NULL;
1982 } else {
1983 /*
1984 * Not in cached mode. No need to populate inode with stat.
1985 * socket syscall returns a fd, so we need instantiate
1986 */
1987 inode = v9fs_get_inode(dir->i_sb, mode);
1988 if (IS_ERR(inode)) {
1989 err = PTR_ERR(inode);
1990 goto error;
1991 }
1992 dentry->d_op = &v9fs_dentry_operations;
1993 d_instantiate(dentry, inode);
1994 }
1995 /* Now set the ACL based on the default value */
1996 v9fs_set_create_acl(dentry, dacl, pacl);
1997error:
1998 if (fid)
1999 p9_client_clunk(fid);
2000 return err;
2001}
2002
2003static int
2004v9fs_vfs_readlink_dotl(struct dentry *dentry, char *buffer, int buflen)
2005{
2006 int retval;
2007 struct p9_fid *fid;
2008 char *target = NULL;
2009
2010 P9_DPRINTK(P9_DEBUG_VFS, " %s\n", dentry->d_name.name);
2011 retval = -EPERM;
2012 fid = v9fs_fid_lookup(dentry);
2013 if (IS_ERR(fid))
2014 return PTR_ERR(fid);
2015
2016 retval = p9_client_readlink(fid, &target);
2017 if (retval < 0)
2018 return retval;
2019
2020 strncpy(buffer, target, buflen);
2021 P9_DPRINTK(P9_DEBUG_VFS, "%s -> %s\n", dentry->d_name.name, buffer);
2022
2023 retval = strnlen(buffer, buflen);
2024 return retval;
2025}
2026
2027/**
2028 * v9fs_vfs_follow_link_dotl - follow a symlink path
2029 * @dentry: dentry for symlink
2030 * @nd: nameidata
2031 *
2032 */
2033
2034static void *
2035v9fs_vfs_follow_link_dotl(struct dentry *dentry, struct nameidata *nd)
2036{
2037 int len = 0;
2038 char *link = __getname();
2039
2040 P9_DPRINTK(P9_DEBUG_VFS, "%s n", dentry->d_name.name);
2041
2042 if (!link)
2043 link = ERR_PTR(-ENOMEM);
2044 else {
2045 len = v9fs_vfs_readlink_dotl(dentry, link, PATH_MAX);
2046 if (len < 0) {
2047 __putname(link);
2048 link = ERR_PTR(len);
2049 } else
2050 link[min(len, PATH_MAX-1)] = 0;
2051 }
2052 nd_set_link(nd, link);
2053
2054 return NULL;
2055}
2056
2057static const struct inode_operations v9fs_dir_inode_operations_dotu = { 1240static const struct inode_operations v9fs_dir_inode_operations_dotu = {
2058 .create = v9fs_vfs_create, 1241 .create = v9fs_vfs_create,
2059 .lookup = v9fs_vfs_lookup, 1242 .lookup = v9fs_vfs_lookup,
@@ -2068,25 +1251,6 @@ static const struct inode_operations v9fs_dir_inode_operations_dotu = {
2068 .setattr = v9fs_vfs_setattr, 1251 .setattr = v9fs_vfs_setattr,
2069}; 1252};
2070 1253
2071static const struct inode_operations v9fs_dir_inode_operations_dotl = {
2072 .create = v9fs_vfs_create_dotl,
2073 .lookup = v9fs_vfs_lookup,
2074 .link = v9fs_vfs_link_dotl,
2075 .symlink = v9fs_vfs_symlink_dotl,
2076 .unlink = v9fs_vfs_unlink,
2077 .mkdir = v9fs_vfs_mkdir_dotl,
2078 .rmdir = v9fs_vfs_rmdir,
2079 .mknod = v9fs_vfs_mknod_dotl,
2080 .rename = v9fs_vfs_rename,
2081 .getattr = v9fs_vfs_getattr_dotl,
2082 .setattr = v9fs_vfs_setattr_dotl,
2083 .setxattr = generic_setxattr,
2084 .getxattr = generic_getxattr,
2085 .removexattr = generic_removexattr,
2086 .listxattr = v9fs_listxattr,
2087 .check_acl = v9fs_check_acl,
2088};
2089
2090static const struct inode_operations v9fs_dir_inode_operations = { 1254static const struct inode_operations v9fs_dir_inode_operations = {
2091 .create = v9fs_vfs_create, 1255 .create = v9fs_vfs_create,
2092 .lookup = v9fs_vfs_lookup, 1256 .lookup = v9fs_vfs_lookup,
@@ -2104,16 +1268,6 @@ static const struct inode_operations v9fs_file_inode_operations = {
2104 .setattr = v9fs_vfs_setattr, 1268 .setattr = v9fs_vfs_setattr,
2105}; 1269};
2106 1270
2107static const struct inode_operations v9fs_file_inode_operations_dotl = {
2108 .getattr = v9fs_vfs_getattr_dotl,
2109 .setattr = v9fs_vfs_setattr_dotl,
2110 .setxattr = generic_setxattr,
2111 .getxattr = generic_getxattr,
2112 .removexattr = generic_removexattr,
2113 .listxattr = v9fs_listxattr,
2114 .check_acl = v9fs_check_acl,
2115};
2116
2117static const struct inode_operations v9fs_symlink_inode_operations = { 1271static const struct inode_operations v9fs_symlink_inode_operations = {
2118 .readlink = generic_readlink, 1272 .readlink = generic_readlink,
2119 .follow_link = v9fs_vfs_follow_link, 1273 .follow_link = v9fs_vfs_follow_link,
@@ -2122,14 +1276,3 @@ static const struct inode_operations v9fs_symlink_inode_operations = {
2122 .setattr = v9fs_vfs_setattr, 1276 .setattr = v9fs_vfs_setattr,
2123}; 1277};
2124 1278
2125static const struct inode_operations v9fs_symlink_inode_operations_dotl = {
2126 .readlink = v9fs_vfs_readlink_dotl,
2127 .follow_link = v9fs_vfs_follow_link_dotl,
2128 .put_link = v9fs_vfs_put_link,
2129 .getattr = v9fs_vfs_getattr_dotl,
2130 .setattr = v9fs_vfs_setattr_dotl,
2131 .setxattr = generic_setxattr,
2132 .getxattr = generic_getxattr,
2133 .removexattr = generic_removexattr,
2134 .listxattr = v9fs_listxattr,
2135};
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
new file mode 100644
index 00000000000..fe3ffa9aace
--- /dev/null
+++ b/fs/9p/vfs_inode_dotl.c
@@ -0,0 +1,824 @@
1/*
2 * linux/fs/9p/vfs_inode_dotl.c
3 *
4 * This file contains vfs inode ops for the 9P2000.L protocol.
5 *
6 * Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com>
7 * Copyright (C) 2002 by Ron Minnich <rminnich@lanl.gov>
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License version 2
11 * as published by the Free Software Foundation.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to:
20 * Free Software Foundation
21 * 51 Franklin Street, Fifth Floor
22 * Boston, MA 02111-1301 USA
23 *
24 */
25
26#include <linux/module.h>
27#include <linux/errno.h>
28#include <linux/fs.h>
29#include <linux/file.h>
30#include <linux/pagemap.h>
31#include <linux/stat.h>
32#include <linux/string.h>
33#include <linux/inet.h>
34#include <linux/namei.h>
35#include <linux/idr.h>
36#include <linux/sched.h>
37#include <linux/slab.h>
38#include <linux/xattr.h>
39#include <linux/posix_acl.h>
40#include <net/9p/9p.h>
41#include <net/9p/client.h>
42
43#include "v9fs.h"
44#include "v9fs_vfs.h"
45#include "fid.h"
46#include "cache.h"
47#include "xattr.h"
48#include "acl.h"
49
50static int
51v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode,
52 dev_t rdev);
53
54/**
55 * v9fs_get_fsgid_for_create - Helper function to get the gid for creating a
56 * new file system object. This checks the S_ISGID to determine the owning
57 * group of the new file system object.
58 */
59
60static gid_t v9fs_get_fsgid_for_create(struct inode *dir_inode)
61{
62 BUG_ON(dir_inode == NULL);
63
64 if (dir_inode->i_mode & S_ISGID) {
65 /* set_gid bit is set.*/
66 return dir_inode->i_gid;
67 }
68 return current_fsgid();
69}
70
71/**
72 * v9fs_dentry_from_dir_inode - helper function to get the dentry from
73 * dir inode.
74 *
75 */
76
77static struct dentry *v9fs_dentry_from_dir_inode(struct inode *inode)
78{
79 struct dentry *dentry;
80
81 spin_lock(&inode->i_lock);
82 /* Directory should have only one entry. */
83 BUG_ON(S_ISDIR(inode->i_mode) && !list_is_singular(&inode->i_dentry));
84 dentry = list_entry(inode->i_dentry.next, struct dentry, d_alias);
85 spin_unlock(&inode->i_lock);
86 return dentry;
87}
88
89struct inode *
90v9fs_inode_dotl(struct v9fs_session_info *v9ses, struct p9_fid *fid,
91 struct super_block *sb)
92{
93 struct inode *ret = NULL;
94 int err;
95 struct p9_stat_dotl *st;
96
97 st = p9_client_getattr_dotl(fid, P9_STATS_BASIC);
98 if (IS_ERR(st))
99 return ERR_CAST(st);
100
101 ret = v9fs_get_inode(sb, st->st_mode);
102 if (IS_ERR(ret)) {
103 err = PTR_ERR(ret);
104 goto error;
105 }
106
107 v9fs_stat2inode_dotl(st, ret);
108 ret->i_ino = v9fs_qid2ino(&st->qid);
109#ifdef CONFIG_9P_FSCACHE
110 v9fs_vcookie_set_qid(ret, &st->qid);
111 v9fs_cache_inode_get_cookie(ret);
112#endif
113 err = v9fs_get_acl(ret, fid);
114 if (err) {
115 iput(ret);
116 goto error;
117 }
118 kfree(st);
119 return ret;
120error:
121 kfree(st);
122 return ERR_PTR(err);
123}
124
125/**
126 * v9fs_vfs_create_dotl - VFS hook to create files for 9P2000.L protocol.
127 * @dir: directory inode that is being created
128 * @dentry: dentry that is being deleted
129 * @mode: create permissions
130 * @nd: path information
131 *
132 */
133
134static int
135v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode,
136 struct nameidata *nd)
137{
138 int err = 0;
139 char *name = NULL;
140 gid_t gid;
141 int flags;
142 mode_t mode;
143 struct v9fs_session_info *v9ses;
144 struct p9_fid *fid = NULL;
145 struct p9_fid *dfid, *ofid;
146 struct file *filp;
147 struct p9_qid qid;
148 struct inode *inode;
149 struct posix_acl *pacl = NULL, *dacl = NULL;
150
151 v9ses = v9fs_inode2v9ses(dir);
152 if (nd && nd->flags & LOOKUP_OPEN)
153 flags = nd->intent.open.flags - 1;
154 else {
155 /*
156 * create call without LOOKUP_OPEN is due
157 * to mknod of regular files. So use mknod
158 * operation.
159 */
160 return v9fs_vfs_mknod_dotl(dir, dentry, omode, 0);
161 }
162
163 name = (char *) dentry->d_name.name;
164 P9_DPRINTK(P9_DEBUG_VFS, "v9fs_vfs_create_dotl: name:%s flags:0x%x "
165 "mode:0x%x\n", name, flags, omode);
166
167 dfid = v9fs_fid_lookup(dentry->d_parent);
168 if (IS_ERR(dfid)) {
169 err = PTR_ERR(dfid);
170 P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
171 return err;
172 }
173
174 /* clone a fid to use for creation */
175 ofid = p9_client_walk(dfid, 0, NULL, 1);
176 if (IS_ERR(ofid)) {
177 err = PTR_ERR(ofid);
178 P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err);
179 return err;
180 }
181
182 gid = v9fs_get_fsgid_for_create(dir);
183
184 mode = omode;
185 /* Update mode based on ACL value */
186 err = v9fs_acl_mode(dir, &mode, &dacl, &pacl);
187 if (err) {
188 P9_DPRINTK(P9_DEBUG_VFS,
189 "Failed to get acl values in creat %d\n", err);
190 goto error;
191 }
192 err = p9_client_create_dotl(ofid, name, flags, mode, gid, &qid);
193 if (err < 0) {
194 P9_DPRINTK(P9_DEBUG_VFS,
195 "p9_client_open_dotl failed in creat %d\n",
196 err);
197 goto error;
198 }
199
200 /* instantiate inode and assign the unopened fid to the dentry */
201 fid = p9_client_walk(dfid, 1, &name, 1);
202 if (IS_ERR(fid)) {
203 err = PTR_ERR(fid);
204 P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err);
205 fid = NULL;
206 goto error;
207 }
208 inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb);
209 if (IS_ERR(inode)) {
210 err = PTR_ERR(inode);
211 P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", err);
212 goto error;
213 }
214 d_instantiate(dentry, inode);
215 err = v9fs_fid_add(dentry, fid);
216 if (err < 0)
217 goto error;
218
219 /* Now set the ACL based on the default value */
220 v9fs_set_create_acl(dentry, dacl, pacl);
221
222 /* Since we are opening a file, assign the open fid to the file */
223 filp = lookup_instantiate_filp(nd, dentry, generic_file_open);
224 if (IS_ERR(filp)) {
225 p9_client_clunk(ofid);
226 return PTR_ERR(filp);
227 }
228 filp->private_data = ofid;
229 return 0;
230
231error:
232 if (ofid)
233 p9_client_clunk(ofid);
234 if (fid)
235 p9_client_clunk(fid);
236 return err;
237}
238
239/**
240 * v9fs_vfs_mkdir_dotl - VFS mkdir hook to create a directory
241 * @dir: inode that is being unlinked
242 * @dentry: dentry that is being unlinked
243 * @mode: mode for new directory
244 *
245 */
246
247static int v9fs_vfs_mkdir_dotl(struct inode *dir,
248 struct dentry *dentry, int omode)
249{
250 int err;
251 struct v9fs_session_info *v9ses;
252 struct p9_fid *fid = NULL, *dfid = NULL;
253 gid_t gid;
254 char *name;
255 mode_t mode;
256 struct inode *inode;
257 struct p9_qid qid;
258 struct dentry *dir_dentry;
259 struct posix_acl *dacl = NULL, *pacl = NULL;
260
261 P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", dentry->d_name.name);
262 err = 0;
263 v9ses = v9fs_inode2v9ses(dir);
264
265 omode |= S_IFDIR;
266 if (dir->i_mode & S_ISGID)
267 omode |= S_ISGID;
268
269 dir_dentry = v9fs_dentry_from_dir_inode(dir);
270 dfid = v9fs_fid_lookup(dir_dentry);
271 if (IS_ERR(dfid)) {
272 err = PTR_ERR(dfid);
273 P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
274 dfid = NULL;
275 goto error;
276 }
277
278 gid = v9fs_get_fsgid_for_create(dir);
279 mode = omode;
280 /* Update mode based on ACL value */
281 err = v9fs_acl_mode(dir, &mode, &dacl, &pacl);
282 if (err) {
283 P9_DPRINTK(P9_DEBUG_VFS,
284 "Failed to get acl values in mkdir %d\n", err);
285 goto error;
286 }
287 name = (char *) dentry->d_name.name;
288 err = p9_client_mkdir_dotl(dfid, name, mode, gid, &qid);
289 if (err < 0)
290 goto error;
291
292 /* instantiate inode and assign the unopened fid to the dentry */
293 if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
294 fid = p9_client_walk(dfid, 1, &name, 1);
295 if (IS_ERR(fid)) {
296 err = PTR_ERR(fid);
297 P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n",
298 err);
299 fid = NULL;
300 goto error;
301 }
302
303 inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb);
304 if (IS_ERR(inode)) {
305 err = PTR_ERR(inode);
306 P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n",
307 err);
308 goto error;
309 }
310 d_instantiate(dentry, inode);
311 err = v9fs_fid_add(dentry, fid);
312 if (err < 0)
313 goto error;
314 fid = NULL;
315 } else {
316 /*
317 * Not in cached mode. No need to populate
318 * inode with stat. We need to get an inode
319 * so that we can set the acl with dentry
320 */
321 inode = v9fs_get_inode(dir->i_sb, mode);
322 if (IS_ERR(inode)) {
323 err = PTR_ERR(inode);
324 goto error;
325 }
326 d_instantiate(dentry, inode);
327 }
328 /* Now set the ACL based on the default value */
329 v9fs_set_create_acl(dentry, dacl, pacl);
330
331error:
332 if (fid)
333 p9_client_clunk(fid);
334 return err;
335}
336
337static int
338v9fs_vfs_getattr_dotl(struct vfsmount *mnt, struct dentry *dentry,
339 struct kstat *stat)
340{
341 int err;
342 struct v9fs_session_info *v9ses;
343 struct p9_fid *fid;
344 struct p9_stat_dotl *st;
345
346 P9_DPRINTK(P9_DEBUG_VFS, "dentry: %p\n", dentry);
347 err = -EPERM;
348 v9ses = v9fs_inode2v9ses(dentry->d_inode);
349 if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE)
350 return simple_getattr(mnt, dentry, stat);
351
352 fid = v9fs_fid_lookup(dentry);
353 if (IS_ERR(fid))
354 return PTR_ERR(fid);
355
356 /* Ask for all the fields in stat structure. Server will return
357 * whatever it supports
358 */
359
360 st = p9_client_getattr_dotl(fid, P9_STATS_ALL);
361 if (IS_ERR(st))
362 return PTR_ERR(st);
363
364 v9fs_stat2inode_dotl(st, dentry->d_inode);
365 generic_fillattr(dentry->d_inode, stat);
366 /* Change block size to what the server returned */
367 stat->blksize = st->st_blksize;
368
369 kfree(st);
370 return 0;
371}
372
373/**
374 * v9fs_vfs_setattr_dotl - set file metadata
375 * @dentry: file whose metadata to set
376 * @iattr: metadata assignment structure
377 *
378 */
379
380int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr)
381{
382 int retval;
383 struct v9fs_session_info *v9ses;
384 struct p9_fid *fid;
385 struct p9_iattr_dotl p9attr;
386
387 P9_DPRINTK(P9_DEBUG_VFS, "\n");
388
389 retval = inode_change_ok(dentry->d_inode, iattr);
390 if (retval)
391 return retval;
392
393 p9attr.valid = iattr->ia_valid;
394 p9attr.mode = iattr->ia_mode;
395 p9attr.uid = iattr->ia_uid;
396 p9attr.gid = iattr->ia_gid;
397 p9attr.size = iattr->ia_size;
398 p9attr.atime_sec = iattr->ia_atime.tv_sec;
399 p9attr.atime_nsec = iattr->ia_atime.tv_nsec;
400 p9attr.mtime_sec = iattr->ia_mtime.tv_sec;
401 p9attr.mtime_nsec = iattr->ia_mtime.tv_nsec;
402
403 retval = -EPERM;
404 v9ses = v9fs_inode2v9ses(dentry->d_inode);
405 fid = v9fs_fid_lookup(dentry);
406 if (IS_ERR(fid))
407 return PTR_ERR(fid);
408
409 retval = p9_client_setattr(fid, &p9attr);
410 if (retval < 0)
411 return retval;
412
413 if ((iattr->ia_valid & ATTR_SIZE) &&
414 iattr->ia_size != i_size_read(dentry->d_inode)) {
415 retval = vmtruncate(dentry->d_inode, iattr->ia_size);
416 if (retval)
417 return retval;
418 }
419
420 setattr_copy(dentry->d_inode, iattr);
421 mark_inode_dirty(dentry->d_inode);
422 if (iattr->ia_valid & ATTR_MODE) {
423 /* We also want to update ACL when we update mode bits */
424 retval = v9fs_acl_chmod(dentry);
425 if (retval < 0)
426 return retval;
427 }
428 return 0;
429}
430
431/**
432 * v9fs_stat2inode_dotl - populate an inode structure with stat info
433 * @stat: stat structure
434 * @inode: inode to populate
435 * @sb: superblock of filesystem
436 *
437 */
438
439void
440v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode)
441{
442
443 if ((stat->st_result_mask & P9_STATS_BASIC) == P9_STATS_BASIC) {
444 inode->i_atime.tv_sec = stat->st_atime_sec;
445 inode->i_atime.tv_nsec = stat->st_atime_nsec;
446 inode->i_mtime.tv_sec = stat->st_mtime_sec;
447 inode->i_mtime.tv_nsec = stat->st_mtime_nsec;
448 inode->i_ctime.tv_sec = stat->st_ctime_sec;
449 inode->i_ctime.tv_nsec = stat->st_ctime_nsec;
450 inode->i_uid = stat->st_uid;
451 inode->i_gid = stat->st_gid;
452 inode->i_nlink = stat->st_nlink;
453 inode->i_mode = stat->st_mode;
454 inode->i_rdev = new_decode_dev(stat->st_rdev);
455
456 if ((S_ISBLK(inode->i_mode)) || (S_ISCHR(inode->i_mode)))
457 init_special_inode(inode, inode->i_mode, inode->i_rdev);
458
459 i_size_write(inode, stat->st_size);
460 inode->i_blocks = stat->st_blocks;
461 } else {
462 if (stat->st_result_mask & P9_STATS_ATIME) {
463 inode->i_atime.tv_sec = stat->st_atime_sec;
464 inode->i_atime.tv_nsec = stat->st_atime_nsec;
465 }
466 if (stat->st_result_mask & P9_STATS_MTIME) {
467 inode->i_mtime.tv_sec = stat->st_mtime_sec;
468 inode->i_mtime.tv_nsec = stat->st_mtime_nsec;
469 }
470 if (stat->st_result_mask & P9_STATS_CTIME) {
471 inode->i_ctime.tv_sec = stat->st_ctime_sec;
472 inode->i_ctime.tv_nsec = stat->st_ctime_nsec;
473 }
474 if (stat->st_result_mask & P9_STATS_UID)
475 inode->i_uid = stat->st_uid;
476 if (stat->st_result_mask & P9_STATS_GID)
477 inode->i_gid = stat->st_gid;
478 if (stat->st_result_mask & P9_STATS_NLINK)
479 inode->i_nlink = stat->st_nlink;
480 if (stat->st_result_mask & P9_STATS_MODE) {
481 inode->i_mode = stat->st_mode;
482 if ((S_ISBLK(inode->i_mode)) ||
483 (S_ISCHR(inode->i_mode)))
484 init_special_inode(inode, inode->i_mode,
485 inode->i_rdev);
486 }
487 if (stat->st_result_mask & P9_STATS_RDEV)
488 inode->i_rdev = new_decode_dev(stat->st_rdev);
489 if (stat->st_result_mask & P9_STATS_SIZE)
490 i_size_write(inode, stat->st_size);
491 if (stat->st_result_mask & P9_STATS_BLOCKS)
492 inode->i_blocks = stat->st_blocks;
493 }
494 if (stat->st_result_mask & P9_STATS_GEN)
495 inode->i_generation = stat->st_gen;
496
497 /* Currently we don't support P9_STATS_BTIME and P9_STATS_DATA_VERSION
498 * because the inode structure does not have fields for them.
499 */
500}
501
502static int
503v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry,
504 const char *symname)
505{
506 struct v9fs_session_info *v9ses;
507 struct p9_fid *dfid;
508 struct p9_fid *fid = NULL;
509 struct inode *inode;
510 struct p9_qid qid;
511 char *name;
512 int err;
513 gid_t gid;
514
515 name = (char *) dentry->d_name.name;
516 P9_DPRINTK(P9_DEBUG_VFS, "v9fs_vfs_symlink_dotl : %lu,%s,%s\n",
517 dir->i_ino, name, symname);
518 v9ses = v9fs_inode2v9ses(dir);
519
520 dfid = v9fs_fid_lookup(dentry->d_parent);
521 if (IS_ERR(dfid)) {
522 err = PTR_ERR(dfid);
523 P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
524 return err;
525 }
526
527 gid = v9fs_get_fsgid_for_create(dir);
528
529 /* Server doesn't alter fid on TSYMLINK. Hence no need to clone it. */
530 err = p9_client_symlink(dfid, name, (char *)symname, gid, &qid);
531
532 if (err < 0) {
533 P9_DPRINTK(P9_DEBUG_VFS, "p9_client_symlink failed %d\n", err);
534 goto error;
535 }
536
537 if (v9ses->cache) {
538 /* Now walk from the parent so we can get an unopened fid. */
539 fid = p9_client_walk(dfid, 1, &name, 1);
540 if (IS_ERR(fid)) {
541 err = PTR_ERR(fid);
542 P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n",
543 err);
544 fid = NULL;
545 goto error;
546 }
547
548 /* instantiate inode and assign the unopened fid to dentry */
549 inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb);
550 if (IS_ERR(inode)) {
551 err = PTR_ERR(inode);
552 P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n",
553 err);
554 goto error;
555 }
556 d_instantiate(dentry, inode);
557 err = v9fs_fid_add(dentry, fid);
558 if (err < 0)
559 goto error;
560 fid = NULL;
561 } else {
562 /* Not in cached mode. No need to populate inode with stat */
563 inode = v9fs_get_inode(dir->i_sb, S_IFLNK);
564 if (IS_ERR(inode)) {
565 err = PTR_ERR(inode);
566 goto error;
567 }
568 d_instantiate(dentry, inode);
569 }
570
571error:
572 if (fid)
573 p9_client_clunk(fid);
574
575 return err;
576}
577
578/**
579 * v9fs_vfs_link_dotl - create a hardlink for dotl
580 * @old_dentry: dentry for file to link to
581 * @dir: inode destination for new link
582 * @dentry: dentry for link
583 *
584 */
585
586static int
587v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir,
588 struct dentry *dentry)
589{
590 int err;
591 struct p9_fid *dfid, *oldfid;
592 char *name;
593 struct v9fs_session_info *v9ses;
594 struct dentry *dir_dentry;
595
596 P9_DPRINTK(P9_DEBUG_VFS, "dir ino: %lu, old_name: %s, new_name: %s\n",
597 dir->i_ino, old_dentry->d_name.name,
598 dentry->d_name.name);
599
600 v9ses = v9fs_inode2v9ses(dir);
601 dir_dentry = v9fs_dentry_from_dir_inode(dir);
602 dfid = v9fs_fid_lookup(dir_dentry);
603 if (IS_ERR(dfid))
604 return PTR_ERR(dfid);
605
606 oldfid = v9fs_fid_lookup(old_dentry);
607 if (IS_ERR(oldfid))
608 return PTR_ERR(oldfid);
609
610 name = (char *) dentry->d_name.name;
611
612 err = p9_client_link(dfid, oldfid, (char *)dentry->d_name.name);
613
614 if (err < 0) {
615 P9_DPRINTK(P9_DEBUG_VFS, "p9_client_link failed %d\n", err);
616 return err;
617 }
618
619 if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
620 /* Get the latest stat info from server. */
621 struct p9_fid *fid;
622 struct p9_stat_dotl *st;
623
624 fid = v9fs_fid_lookup(old_dentry);
625 if (IS_ERR(fid))
626 return PTR_ERR(fid);
627
628 st = p9_client_getattr_dotl(fid, P9_STATS_BASIC);
629 if (IS_ERR(st))
630 return PTR_ERR(st);
631
632 v9fs_stat2inode_dotl(st, old_dentry->d_inode);
633
634 kfree(st);
635 } else {
636 /* Caching disabled. No need to get upto date stat info.
637 * This dentry will be released immediately. So, just hold the
638 * inode
639 */
640 ihold(old_dentry->d_inode);
641 }
642 d_instantiate(dentry, old_dentry->d_inode);
643
644 return err;
645}
646
647/**
648 * v9fs_vfs_mknod_dotl - create a special file
649 * @dir: inode destination for new link
650 * @dentry: dentry for file
651 * @mode: mode for creation
652 * @rdev: device associated with special file
653 *
654 */
655static int
656v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode,
657 dev_t rdev)
658{
659 int err;
660 char *name;
661 mode_t mode;
662 struct v9fs_session_info *v9ses;
663 struct p9_fid *fid = NULL, *dfid = NULL;
664 struct inode *inode;
665 gid_t gid;
666 struct p9_qid qid;
667 struct dentry *dir_dentry;
668 struct posix_acl *dacl = NULL, *pacl = NULL;
669
670 P9_DPRINTK(P9_DEBUG_VFS,
671 " %lu,%s mode: %x MAJOR: %u MINOR: %u\n", dir->i_ino,
672 dentry->d_name.name, omode, MAJOR(rdev), MINOR(rdev));
673
674 if (!new_valid_dev(rdev))
675 return -EINVAL;
676
677 v9ses = v9fs_inode2v9ses(dir);
678 dir_dentry = v9fs_dentry_from_dir_inode(dir);
679 dfid = v9fs_fid_lookup(dir_dentry);
680 if (IS_ERR(dfid)) {
681 err = PTR_ERR(dfid);
682 P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
683 dfid = NULL;
684 goto error;
685 }
686
687 gid = v9fs_get_fsgid_for_create(dir);
688 mode = omode;
689 /* Update mode based on ACL value */
690 err = v9fs_acl_mode(dir, &mode, &dacl, &pacl);
691 if (err) {
692 P9_DPRINTK(P9_DEBUG_VFS,
693 "Failed to get acl values in mknod %d\n", err);
694 goto error;
695 }
696 name = (char *) dentry->d_name.name;
697
698 err = p9_client_mknod_dotl(dfid, name, mode, rdev, gid, &qid);
699 if (err < 0)
700 goto error;
701
702 /* instantiate inode and assign the unopened fid to the dentry */
703 if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
704 fid = p9_client_walk(dfid, 1, &name, 1);
705 if (IS_ERR(fid)) {
706 err = PTR_ERR(fid);
707 P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n",
708 err);
709 fid = NULL;
710 goto error;
711 }
712
713 inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb);
714 if (IS_ERR(inode)) {
715 err = PTR_ERR(inode);
716 P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n",
717 err);
718 goto error;
719 }
720 d_instantiate(dentry, inode);
721 err = v9fs_fid_add(dentry, fid);
722 if (err < 0)
723 goto error;
724 fid = NULL;
725 } else {
726 /*
727 * Not in cached mode. No need to populate inode with stat.
728 * socket syscall returns a fd, so we need instantiate
729 */
730 inode = v9fs_get_inode(dir->i_sb, mode);
731 if (IS_ERR(inode)) {
732 err = PTR_ERR(inode);
733 goto error;
734 }
735 d_instantiate(dentry, inode);
736 }
737 /* Now set the ACL based on the default value */
738 v9fs_set_create_acl(dentry, dacl, pacl);
739error:
740 if (fid)
741 p9_client_clunk(fid);
742 return err;
743}
744
745/**
746 * v9fs_vfs_follow_link_dotl - follow a symlink path
747 * @dentry: dentry for symlink
748 * @nd: nameidata
749 *
750 */
751
752static void *
753v9fs_vfs_follow_link_dotl(struct dentry *dentry, struct nameidata *nd)
754{
755 int retval;
756 struct p9_fid *fid;
757 char *link = __getname();
758 char *target;
759
760 P9_DPRINTK(P9_DEBUG_VFS, "%s\n", dentry->d_name.name);
761
762 if (!link) {
763 link = ERR_PTR(-ENOMEM);
764 goto ndset;
765 }
766 fid = v9fs_fid_lookup(dentry);
767 if (IS_ERR(fid)) {
768 __putname(link);
769 link = ERR_PTR(PTR_ERR(fid));
770 goto ndset;
771 }
772 retval = p9_client_readlink(fid, &target);
773 if (!retval) {
774 strcpy(link, target);
775 kfree(target);
776 goto ndset;
777 }
778 __putname(link);
779 link = ERR_PTR(retval);
780ndset:
781 nd_set_link(nd, link);
782 return NULL;
783}
784
785const struct inode_operations v9fs_dir_inode_operations_dotl = {
786 .create = v9fs_vfs_create_dotl,
787 .lookup = v9fs_vfs_lookup,
788 .link = v9fs_vfs_link_dotl,
789 .symlink = v9fs_vfs_symlink_dotl,
790 .unlink = v9fs_vfs_unlink,
791 .mkdir = v9fs_vfs_mkdir_dotl,
792 .rmdir = v9fs_vfs_rmdir,
793 .mknod = v9fs_vfs_mknod_dotl,
794 .rename = v9fs_vfs_rename,
795 .getattr = v9fs_vfs_getattr_dotl,
796 .setattr = v9fs_vfs_setattr_dotl,
797 .setxattr = generic_setxattr,
798 .getxattr = generic_getxattr,
799 .removexattr = generic_removexattr,
800 .listxattr = v9fs_listxattr,
801 .check_acl = v9fs_check_acl,
802};
803
804const struct inode_operations v9fs_file_inode_operations_dotl = {
805 .getattr = v9fs_vfs_getattr_dotl,
806 .setattr = v9fs_vfs_setattr_dotl,
807 .setxattr = generic_setxattr,
808 .getxattr = generic_getxattr,
809 .removexattr = generic_removexattr,
810 .listxattr = v9fs_listxattr,
811 .check_acl = v9fs_check_acl,
812};
813
814const struct inode_operations v9fs_symlink_inode_operations_dotl = {
815 .readlink = generic_readlink,
816 .follow_link = v9fs_vfs_follow_link_dotl,
817 .put_link = v9fs_vfs_put_link,
818 .getattr = v9fs_vfs_getattr_dotl,
819 .setattr = v9fs_vfs_setattr_dotl,
820 .setxattr = generic_setxattr,
821 .getxattr = generic_getxattr,
822 .removexattr = generic_removexattr,
823 .listxattr = v9fs_listxattr,
824};
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index c55c614500a..dbaabe3b813 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -141,6 +141,11 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags,
141 } 141 }
142 v9fs_fill_super(sb, v9ses, flags, data); 142 v9fs_fill_super(sb, v9ses, flags, data);
143 143
144 if (v9ses->cache)
145 sb->s_d_op = &v9fs_cached_dentry_operations;
146 else
147 sb->s_d_op = &v9fs_dentry_operations;
148
144 inode = v9fs_get_inode(sb, S_IFDIR | mode); 149 inode = v9fs_get_inode(sb, S_IFDIR | mode);
145 if (IS_ERR(inode)) { 150 if (IS_ERR(inode)) {
146 retval = PTR_ERR(inode); 151 retval = PTR_ERR(inode);
@@ -217,9 +222,6 @@ static void v9fs_kill_super(struct super_block *s)
217 222
218 P9_DPRINTK(P9_DEBUG_VFS, " %p\n", s); 223 P9_DPRINTK(P9_DEBUG_VFS, " %p\n", s);
219 224
220 if (s->s_root)
221 v9fs_dentry_release(s->s_root); /* clunk root */
222
223 kill_anon_super(s); 225 kill_anon_super(s);
224 226
225 v9fs_session_cancel(v9ses); 227 v9fs_session_cancel(v9ses);
diff --git a/fs/9p/xattr.c b/fs/9p/xattr.c
index 43ec7df8433..d288773871b 100644
--- a/fs/9p/xattr.c
+++ b/fs/9p/xattr.c
@@ -133,7 +133,7 @@ int v9fs_xattr_set(struct dentry *dentry, const char *name,
133 "p9_client_xattrcreate failed %d\n", retval); 133 "p9_client_xattrcreate failed %d\n", retval);
134 goto error; 134 goto error;
135 } 135 }
136 msize = fid->clnt->msize;; 136 msize = fid->clnt->msize;
137 while (value_len) { 137 while (value_len) {
138 if (value_len > (msize - P9_IOHDRSZ)) 138 if (value_len > (msize - P9_IOHDRSZ))
139 write_count = msize - P9_IOHDRSZ; 139 write_count = msize - P9_IOHDRSZ;
diff --git a/fs/Kconfig b/fs/Kconfig
index 771f457402d..3db9caa57ed 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -30,15 +30,6 @@ config FS_MBCACHE
30source "fs/reiserfs/Kconfig" 30source "fs/reiserfs/Kconfig"
31source "fs/jfs/Kconfig" 31source "fs/jfs/Kconfig"
32 32
33config FS_POSIX_ACL
34# Posix ACL utility routines (for now, only ext2/ext3/jfs/reiserfs/nfs4)
35#
36# NOTE: you can implement Posix ACLs without these helpers (XFS does).
37# Never use this symbol for ifdefs.
38#
39 bool
40 default n
41
42source "fs/xfs/Kconfig" 33source "fs/xfs/Kconfig"
43source "fs/gfs2/Kconfig" 34source "fs/gfs2/Kconfig"
44source "fs/ocfs2/Kconfig" 35source "fs/ocfs2/Kconfig"
@@ -47,11 +38,19 @@ source "fs/nilfs2/Kconfig"
47 38
48endif # BLOCK 39endif # BLOCK
49 40
41# Posix ACL utility routines
42#
43# Note: Posix ACLs can be implemented without these helpers. Never use
44# this symbol for ifdefs in core code.
45#
46config FS_POSIX_ACL
47 def_bool n
48
50config EXPORTFS 49config EXPORTFS
51 tristate 50 tristate
52 51
53config FILE_LOCKING 52config FILE_LOCKING
54 bool "Enable POSIX file locking API" if EMBEDDED 53 bool "Enable POSIX file locking API" if EXPERT
55 default y 54 default y
56 help 55 help
57 This option enables standard file locking support, required 56 This option enables standard file locking support, required
diff --git a/fs/adfs/dir.c b/fs/adfs/dir.c
index f4287e4de74..3b4a764ed78 100644
--- a/fs/adfs/dir.c
+++ b/fs/adfs/dir.c
@@ -201,7 +201,8 @@ const struct file_operations adfs_dir_operations = {
201}; 201};
202 202
203static int 203static int
204adfs_hash(struct dentry *parent, struct qstr *qstr) 204adfs_hash(const struct dentry *parent, const struct inode *inode,
205 struct qstr *qstr)
205{ 206{
206 const unsigned int name_len = ADFS_SB(parent->d_sb)->s_namelen; 207 const unsigned int name_len = ADFS_SB(parent->d_sb)->s_namelen;
207 const unsigned char *name; 208 const unsigned char *name;
@@ -237,17 +238,19 @@ adfs_hash(struct dentry *parent, struct qstr *qstr)
237 * requirements of the underlying filesystem. 238 * requirements of the underlying filesystem.
238 */ 239 */
239static int 240static int
240adfs_compare(struct dentry *parent, struct qstr *entry, struct qstr *name) 241adfs_compare(const struct dentry *parent, const struct inode *pinode,
242 const struct dentry *dentry, const struct inode *inode,
243 unsigned int len, const char *str, const struct qstr *name)
241{ 244{
242 int i; 245 int i;
243 246
244 if (entry->len != name->len) 247 if (len != name->len)
245 return 1; 248 return 1;
246 249
247 for (i = 0; i < name->len; i++) { 250 for (i = 0; i < name->len; i++) {
248 char a, b; 251 char a, b;
249 252
250 a = entry->name[i]; 253 a = str[i];
251 b = name->name[i]; 254 b = name->name[i];
252 255
253 if (a >= 'A' && a <= 'Z') 256 if (a >= 'A' && a <= 'Z')
@@ -273,7 +276,6 @@ adfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
273 struct object_info obj; 276 struct object_info obj;
274 int error; 277 int error;
275 278
276 dentry->d_op = &adfs_dentry_operations;
277 lock_kernel(); 279 lock_kernel();
278 error = adfs_dir_lookup_byname(dir, &dentry->d_name, &obj); 280 error = adfs_dir_lookup_byname(dir, &dentry->d_name, &obj);
279 if (error == 0) { 281 if (error == 0) {
diff --git a/fs/adfs/super.c b/fs/adfs/super.c
index 959dbff2d42..2d7954049fb 100644
--- a/fs/adfs/super.c
+++ b/fs/adfs/super.c
@@ -240,11 +240,18 @@ static struct inode *adfs_alloc_inode(struct super_block *sb)
240 return &ei->vfs_inode; 240 return &ei->vfs_inode;
241} 241}
242 242
243static void adfs_destroy_inode(struct inode *inode) 243static void adfs_i_callback(struct rcu_head *head)
244{ 244{
245 struct inode *inode = container_of(head, struct inode, i_rcu);
246 INIT_LIST_HEAD(&inode->i_dentry);
245 kmem_cache_free(adfs_inode_cachep, ADFS_I(inode)); 247 kmem_cache_free(adfs_inode_cachep, ADFS_I(inode));
246} 248}
247 249
250static void adfs_destroy_inode(struct inode *inode)
251{
252 call_rcu(&inode->i_rcu, adfs_i_callback);
253}
254
248static void init_once(void *foo) 255static void init_once(void *foo)
249{ 256{
250 struct adfs_inode_info *ei = (struct adfs_inode_info *) foo; 257 struct adfs_inode_info *ei = (struct adfs_inode_info *) foo;
@@ -466,6 +473,7 @@ static int adfs_fill_super(struct super_block *sb, void *data, int silent)
466 asb->s_namelen = ADFS_F_NAME_LEN; 473 asb->s_namelen = ADFS_F_NAME_LEN;
467 } 474 }
468 475
476 sb->s_d_op = &adfs_dentry_operations;
469 root = adfs_iget(sb, &root_obj); 477 root = adfs_iget(sb, &root_obj);
470 sb->s_root = d_alloc_root(root); 478 sb->s_root = d_alloc_root(root);
471 if (!sb->s_root) { 479 if (!sb->s_root) {
@@ -476,8 +484,7 @@ static int adfs_fill_super(struct super_block *sb, void *data, int silent)
476 kfree(asb->s_map); 484 kfree(asb->s_map);
477 adfs_error(sb, "get root inode failed\n"); 485 adfs_error(sb, "get root inode failed\n");
478 goto error; 486 goto error;
479 } else 487 }
480 sb->s_root->d_op = &adfs_dentry_operations;
481 unlock_kernel(); 488 unlock_kernel();
482 return 0; 489 return 0;
483 490
diff --git a/fs/affs/affs.h b/fs/affs/affs.h
index a8cbdeb3402..0e95f73a702 100644
--- a/fs/affs/affs.h
+++ b/fs/affs/affs.h
@@ -201,6 +201,7 @@ extern const struct address_space_operations affs_aops;
201extern const struct address_space_operations affs_aops_ofs; 201extern const struct address_space_operations affs_aops_ofs;
202 202
203extern const struct dentry_operations affs_dentry_operations; 203extern const struct dentry_operations affs_dentry_operations;
204extern const struct dentry_operations affs_intl_dentry_operations;
204 205
205static inline void 206static inline void
206affs_set_blocksize(struct super_block *sb, int size) 207affs_set_blocksize(struct super_block *sb, int size)
diff --git a/fs/affs/amigaffs.c b/fs/affs/amigaffs.c
index 7d0f0a30f7a..3a4557e8325 100644
--- a/fs/affs/amigaffs.c
+++ b/fs/affs/amigaffs.c
@@ -128,7 +128,7 @@ affs_fix_dcache(struct dentry *dentry, u32 entry_ino)
128 void *data = dentry->d_fsdata; 128 void *data = dentry->d_fsdata;
129 struct list_head *head, *next; 129 struct list_head *head, *next;
130 130
131 spin_lock(&dcache_lock); 131 spin_lock(&inode->i_lock);
132 head = &inode->i_dentry; 132 head = &inode->i_dentry;
133 next = head->next; 133 next = head->next;
134 while (next != head) { 134 while (next != head) {
@@ -139,7 +139,7 @@ affs_fix_dcache(struct dentry *dentry, u32 entry_ino)
139 } 139 }
140 next = next->next; 140 next = next->next;
141 } 141 }
142 spin_unlock(&dcache_lock); 142 spin_unlock(&inode->i_lock);
143} 143}
144 144
145 145
diff --git a/fs/affs/namei.c b/fs/affs/namei.c
index 914d1c0bc07..e3e9efc1fdd 100644
--- a/fs/affs/namei.c
+++ b/fs/affs/namei.c
@@ -13,18 +13,26 @@
13typedef int (*toupper_t)(int); 13typedef int (*toupper_t)(int);
14 14
15static int affs_toupper(int ch); 15static int affs_toupper(int ch);
16static int affs_hash_dentry(struct dentry *, struct qstr *); 16static int affs_hash_dentry(const struct dentry *,
17static int affs_compare_dentry(struct dentry *, struct qstr *, struct qstr *); 17 const struct inode *, struct qstr *);
18static int affs_compare_dentry(const struct dentry *parent,
19 const struct inode *pinode,
20 const struct dentry *dentry, const struct inode *inode,
21 unsigned int len, const char *str, const struct qstr *name);
18static int affs_intl_toupper(int ch); 22static int affs_intl_toupper(int ch);
19static int affs_intl_hash_dentry(struct dentry *, struct qstr *); 23static int affs_intl_hash_dentry(const struct dentry *,
20static int affs_intl_compare_dentry(struct dentry *, struct qstr *, struct qstr *); 24 const struct inode *, struct qstr *);
25static int affs_intl_compare_dentry(const struct dentry *parent,
26 const struct inode *pinode,
27 const struct dentry *dentry, const struct inode *inode,
28 unsigned int len, const char *str, const struct qstr *name);
21 29
22const struct dentry_operations affs_dentry_operations = { 30const struct dentry_operations affs_dentry_operations = {
23 .d_hash = affs_hash_dentry, 31 .d_hash = affs_hash_dentry,
24 .d_compare = affs_compare_dentry, 32 .d_compare = affs_compare_dentry,
25}; 33};
26 34
27static const struct dentry_operations affs_intl_dentry_operations = { 35const struct dentry_operations affs_intl_dentry_operations = {
28 .d_hash = affs_intl_hash_dentry, 36 .d_hash = affs_intl_hash_dentry,
29 .d_compare = affs_intl_compare_dentry, 37 .d_compare = affs_intl_compare_dentry,
30}; 38};
@@ -58,13 +66,13 @@ affs_get_toupper(struct super_block *sb)
58 * Note: the dentry argument is the parent dentry. 66 * Note: the dentry argument is the parent dentry.
59 */ 67 */
60static inline int 68static inline int
61__affs_hash_dentry(struct dentry *dentry, struct qstr *qstr, toupper_t toupper) 69__affs_hash_dentry(struct qstr *qstr, toupper_t toupper)
62{ 70{
63 const u8 *name = qstr->name; 71 const u8 *name = qstr->name;
64 unsigned long hash; 72 unsigned long hash;
65 int i; 73 int i;
66 74
67 i = affs_check_name(qstr->name,qstr->len); 75 i = affs_check_name(qstr->name, qstr->len);
68 if (i) 76 if (i)
69 return i; 77 return i;
70 78
@@ -78,39 +86,41 @@ __affs_hash_dentry(struct dentry *dentry, struct qstr *qstr, toupper_t toupper)
78} 86}
79 87
80static int 88static int
81affs_hash_dentry(struct dentry *dentry, struct qstr *qstr) 89affs_hash_dentry(const struct dentry *dentry, const struct inode *inode,
90 struct qstr *qstr)
82{ 91{
83 return __affs_hash_dentry(dentry, qstr, affs_toupper); 92 return __affs_hash_dentry(qstr, affs_toupper);
84} 93}
85static int 94static int
86affs_intl_hash_dentry(struct dentry *dentry, struct qstr *qstr) 95affs_intl_hash_dentry(const struct dentry *dentry, const struct inode *inode,
96 struct qstr *qstr)
87{ 97{
88 return __affs_hash_dentry(dentry, qstr, affs_intl_toupper); 98 return __affs_hash_dentry(qstr, affs_intl_toupper);
89} 99}
90 100
91static inline int 101static inline int __affs_compare_dentry(unsigned int len,
92__affs_compare_dentry(struct dentry *dentry, struct qstr *a, struct qstr *b, toupper_t toupper) 102 const char *str, const struct qstr *name, toupper_t toupper)
93{ 103{
94 const u8 *aname = a->name; 104 const u8 *aname = str;
95 const u8 *bname = b->name; 105 const u8 *bname = name->name;
96 int len;
97 106
98 /* 'a' is the qstr of an already existing dentry, so the name 107 /*
99 * must be valid. 'b' must be validated first. 108 * 'str' is the name of an already existing dentry, so the name
109 * must be valid. 'name' must be validated first.
100 */ 110 */
101 111
102 if (affs_check_name(b->name,b->len)) 112 if (affs_check_name(name->name, name->len))
103 return 1; 113 return 1;
104 114
105 /* If the names are longer than the allowed 30 chars, 115 /*
116 * If the names are longer than the allowed 30 chars,
106 * the excess is ignored, so their length may differ. 117 * the excess is ignored, so their length may differ.
107 */ 118 */
108 len = a->len;
109 if (len >= 30) { 119 if (len >= 30) {
110 if (b->len < 30) 120 if (name->len < 30)
111 return 1; 121 return 1;
112 len = 30; 122 len = 30;
113 } else if (len != b->len) 123 } else if (len != name->len)
114 return 1; 124 return 1;
115 125
116 for (; len > 0; len--) 126 for (; len > 0; len--)
@@ -121,14 +131,18 @@ __affs_compare_dentry(struct dentry *dentry, struct qstr *a, struct qstr *b, tou
121} 131}
122 132
123static int 133static int
124affs_compare_dentry(struct dentry *dentry, struct qstr *a, struct qstr *b) 134affs_compare_dentry(const struct dentry *parent, const struct inode *pinode,
135 const struct dentry *dentry, const struct inode *inode,
136 unsigned int len, const char *str, const struct qstr *name)
125{ 137{
126 return __affs_compare_dentry(dentry, a, b, affs_toupper); 138 return __affs_compare_dentry(len, str, name, affs_toupper);
127} 139}
128static int 140static int
129affs_intl_compare_dentry(struct dentry *dentry, struct qstr *a, struct qstr *b) 141affs_intl_compare_dentry(const struct dentry *parent,const struct inode *pinode,
142 const struct dentry *dentry, const struct inode *inode,
143 unsigned int len, const char *str, const struct qstr *name)
130{ 144{
131 return __affs_compare_dentry(dentry, a, b, affs_intl_toupper); 145 return __affs_compare_dentry(len, str, name, affs_intl_toupper);
132} 146}
133 147
134/* 148/*
@@ -226,7 +240,6 @@ affs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
226 if (IS_ERR(inode)) 240 if (IS_ERR(inode))
227 return ERR_CAST(inode); 241 return ERR_CAST(inode);
228 } 242 }
229 dentry->d_op = AFFS_SB(sb)->s_flags & SF_INTL ? &affs_intl_dentry_operations : &affs_dentry_operations;
230 d_add(dentry, inode); 243 d_add(dentry, inode);
231 return NULL; 244 return NULL;
232} 245}
diff --git a/fs/affs/super.c b/fs/affs/super.c
index 0cf7f4384cb..b31507d0f9b 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -95,11 +95,18 @@ static struct inode *affs_alloc_inode(struct super_block *sb)
95 return &i->vfs_inode; 95 return &i->vfs_inode;
96} 96}
97 97
98static void affs_destroy_inode(struct inode *inode) 98static void affs_i_callback(struct rcu_head *head)
99{ 99{
100 struct inode *inode = container_of(head, struct inode, i_rcu);
101 INIT_LIST_HEAD(&inode->i_dentry);
100 kmem_cache_free(affs_inode_cachep, AFFS_I(inode)); 102 kmem_cache_free(affs_inode_cachep, AFFS_I(inode));
101} 103}
102 104
105static void affs_destroy_inode(struct inode *inode)
106{
107 call_rcu(&inode->i_rcu, affs_i_callback);
108}
109
103static void init_once(void *foo) 110static void init_once(void *foo)
104{ 111{
105 struct affs_inode_info *ei = (struct affs_inode_info *) foo; 112 struct affs_inode_info *ei = (struct affs_inode_info *) foo;
@@ -470,12 +477,16 @@ got_root:
470 goto out_error_noinode; 477 goto out_error_noinode;
471 } 478 }
472 479
480 if (AFFS_SB(sb)->s_flags & SF_INTL)
481 sb->s_d_op = &affs_intl_dentry_operations;
482 else
483 sb->s_d_op = &affs_dentry_operations;
484
473 sb->s_root = d_alloc_root(root_inode); 485 sb->s_root = d_alloc_root(root_inode);
474 if (!sb->s_root) { 486 if (!sb->s_root) {
475 printk(KERN_ERR "AFFS: Get root inode failed\n"); 487 printk(KERN_ERR "AFFS: Get root inode failed\n");
476 goto out_error; 488 goto out_error;
477 } 489 }
478 sb->s_root->d_op = &affs_dentry_operations;
479 490
480 pr_debug("AFFS: s_flags=%lX\n",sb->s_flags); 491 pr_debug("AFFS: s_flags=%lX\n",sb->s_flags);
481 return 0; 492 return 0;
diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c
index a3bcec75c54..1c8c6cc6de3 100644
--- a/fs/afs/cmservice.c
+++ b/fs/afs/cmservice.c
@@ -289,7 +289,7 @@ static int afs_deliver_cb_callback(struct afs_call *call, struct sk_buff *skb,
289 call->server = server; 289 call->server = server;
290 290
291 INIT_WORK(&call->work, SRXAFSCB_CallBack); 291 INIT_WORK(&call->work, SRXAFSCB_CallBack);
292 schedule_work(&call->work); 292 queue_work(afs_wq, &call->work);
293 return 0; 293 return 0;
294} 294}
295 295
@@ -336,7 +336,7 @@ static int afs_deliver_cb_init_call_back_state(struct afs_call *call,
336 call->server = server; 336 call->server = server;
337 337
338 INIT_WORK(&call->work, SRXAFSCB_InitCallBackState); 338 INIT_WORK(&call->work, SRXAFSCB_InitCallBackState);
339 schedule_work(&call->work); 339 queue_work(afs_wq, &call->work);
340 return 0; 340 return 0;
341} 341}
342 342
@@ -367,7 +367,7 @@ static int afs_deliver_cb_init_call_back_state3(struct afs_call *call,
367 call->server = server; 367 call->server = server;
368 368
369 INIT_WORK(&call->work, SRXAFSCB_InitCallBackState); 369 INIT_WORK(&call->work, SRXAFSCB_InitCallBackState);
370 schedule_work(&call->work); 370 queue_work(afs_wq, &call->work);
371 return 0; 371 return 0;
372} 372}
373 373
@@ -400,7 +400,7 @@ static int afs_deliver_cb_probe(struct afs_call *call, struct sk_buff *skb,
400 call->state = AFS_CALL_REPLYING; 400 call->state = AFS_CALL_REPLYING;
401 401
402 INIT_WORK(&call->work, SRXAFSCB_Probe); 402 INIT_WORK(&call->work, SRXAFSCB_Probe);
403 schedule_work(&call->work); 403 queue_work(afs_wq, &call->work);
404 return 0; 404 return 0;
405} 405}
406 406
@@ -496,7 +496,7 @@ static int afs_deliver_cb_probe_uuid(struct afs_call *call, struct sk_buff *skb,
496 call->state = AFS_CALL_REPLYING; 496 call->state = AFS_CALL_REPLYING;
497 497
498 INIT_WORK(&call->work, SRXAFSCB_ProbeUuid); 498 INIT_WORK(&call->work, SRXAFSCB_ProbeUuid);
499 schedule_work(&call->work); 499 queue_work(afs_wq, &call->work);
500 return 0; 500 return 0;
501} 501}
502 502
@@ -580,6 +580,6 @@ static int afs_deliver_cb_tell_me_about_yourself(struct afs_call *call,
580 call->state = AFS_CALL_REPLYING; 580 call->state = AFS_CALL_REPLYING;
581 581
582 INIT_WORK(&call->work, SRXAFSCB_TellMeAboutYourself); 582 INIT_WORK(&call->work, SRXAFSCB_TellMeAboutYourself);
583 schedule_work(&call->work); 583 queue_work(afs_wq, &call->work);
584 return 0; 584 return 0;
585} 585}
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 5439e1bc9a8..20c106f2492 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -13,6 +13,7 @@
13#include <linux/module.h> 13#include <linux/module.h>
14#include <linux/init.h> 14#include <linux/init.h>
15#include <linux/fs.h> 15#include <linux/fs.h>
16#include <linux/namei.h>
16#include <linux/pagemap.h> 17#include <linux/pagemap.h>
17#include <linux/ctype.h> 18#include <linux/ctype.h>
18#include <linux/sched.h> 19#include <linux/sched.h>
@@ -23,7 +24,7 @@ static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry,
23static int afs_dir_open(struct inode *inode, struct file *file); 24static int afs_dir_open(struct inode *inode, struct file *file);
24static int afs_readdir(struct file *file, void *dirent, filldir_t filldir); 25static int afs_readdir(struct file *file, void *dirent, filldir_t filldir);
25static int afs_d_revalidate(struct dentry *dentry, struct nameidata *nd); 26static int afs_d_revalidate(struct dentry *dentry, struct nameidata *nd);
26static int afs_d_delete(struct dentry *dentry); 27static int afs_d_delete(const struct dentry *dentry);
27static void afs_d_release(struct dentry *dentry); 28static void afs_d_release(struct dentry *dentry);
28static int afs_lookup_filldir(void *_cookie, const char *name, int nlen, 29static int afs_lookup_filldir(void *_cookie, const char *name, int nlen,
29 loff_t fpos, u64 ino, unsigned dtype); 30 loff_t fpos, u64 ino, unsigned dtype);
@@ -61,10 +62,11 @@ const struct inode_operations afs_dir_inode_operations = {
61 .setattr = afs_setattr, 62 .setattr = afs_setattr,
62}; 63};
63 64
64static const struct dentry_operations afs_fs_dentry_operations = { 65const struct dentry_operations afs_fs_dentry_operations = {
65 .d_revalidate = afs_d_revalidate, 66 .d_revalidate = afs_d_revalidate,
66 .d_delete = afs_d_delete, 67 .d_delete = afs_d_delete,
67 .d_release = afs_d_release, 68 .d_release = afs_d_release,
69 .d_automount = afs_d_automount,
68}; 70};
69 71
70#define AFS_DIR_HASHTBL_SIZE 128 72#define AFS_DIR_HASHTBL_SIZE 128
@@ -581,8 +583,6 @@ static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry,
581 } 583 }
582 584
583success: 585success:
584 dentry->d_op = &afs_fs_dentry_operations;
585
586 d_add(dentry, inode); 586 d_add(dentry, inode);
587 _leave(" = 0 { vn=%u u=%u } -> { ino=%lu v=%llu }", 587 _leave(" = 0 { vn=%u u=%u } -> { ino=%lu v=%llu }",
588 fid.vnode, 588 fid.vnode,
@@ -607,6 +607,9 @@ static int afs_d_revalidate(struct dentry *dentry, struct nameidata *nd)
607 void *dir_version; 607 void *dir_version;
608 int ret; 608 int ret;
609 609
610 if (nd->flags & LOOKUP_RCU)
611 return -ECHILD;
612
610 vnode = AFS_FS_I(dentry->d_inode); 613 vnode = AFS_FS_I(dentry->d_inode);
611 614
612 if (dentry->d_inode) 615 if (dentry->d_inode)
@@ -730,7 +733,7 @@ out_bad:
730 * - called from dput() when d_count is going to 0. 733 * - called from dput() when d_count is going to 0.
731 * - return 1 to request dentry be unhashed, 0 otherwise 734 * - return 1 to request dentry be unhashed, 0 otherwise
732 */ 735 */
733static int afs_d_delete(struct dentry *dentry) 736static int afs_d_delete(const struct dentry *dentry)
734{ 737{
735 _enter("%s", dentry->d_name.name); 738 _enter("%s", dentry->d_name.name);
736 739
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index 0747339011c..db66c520147 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -184,7 +184,8 @@ struct inode *afs_iget_autocell(struct inode *dir, const char *dev_name,
184 inode->i_generation = 0; 184 inode->i_generation = 0;
185 185
186 set_bit(AFS_VNODE_PSEUDODIR, &vnode->flags); 186 set_bit(AFS_VNODE_PSEUDODIR, &vnode->flags);
187 inode->i_flags |= S_NOATIME; 187 set_bit(AFS_VNODE_MOUNTPOINT, &vnode->flags);
188 inode->i_flags |= S_AUTOMOUNT | S_NOATIME;
188 unlock_new_inode(inode); 189 unlock_new_inode(inode);
189 _leave(" = %p", inode); 190 _leave(" = %p", inode);
190 return inode; 191 return inode;
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index cca8eef736f..5a9b6843bac 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -486,6 +486,7 @@ extern bool afs_cm_incoming_call(struct afs_call *);
486 * dir.c 486 * dir.c
487 */ 487 */
488extern const struct inode_operations afs_dir_inode_operations; 488extern const struct inode_operations afs_dir_inode_operations;
489extern const struct dentry_operations afs_fs_dentry_operations;
489extern const struct file_operations afs_dir_file_operations; 490extern const struct file_operations afs_dir_file_operations;
490 491
491/* 492/*
@@ -576,6 +577,7 @@ extern int afs_drop_inode(struct inode *);
576/* 577/*
577 * main.c 578 * main.c
578 */ 579 */
580extern struct workqueue_struct *afs_wq;
579extern struct afs_uuid afs_uuid; 581extern struct afs_uuid afs_uuid;
580 582
581/* 583/*
@@ -590,6 +592,7 @@ extern const struct inode_operations afs_mntpt_inode_operations;
590extern const struct inode_operations afs_autocell_inode_operations; 592extern const struct inode_operations afs_autocell_inode_operations;
591extern const struct file_operations afs_mntpt_file_operations; 593extern const struct file_operations afs_mntpt_file_operations;
592 594
595extern struct vfsmount *afs_d_automount(struct path *);
593extern int afs_mntpt_check_symlink(struct afs_vnode *, struct key *); 596extern int afs_mntpt_check_symlink(struct afs_vnode *, struct key *);
594extern void afs_mntpt_kill_timer(void); 597extern void afs_mntpt_kill_timer(void);
595 598
@@ -624,7 +627,7 @@ extern void afs_clear_permits(struct afs_vnode *);
624extern void afs_cache_permit(struct afs_vnode *, struct key *, long); 627extern void afs_cache_permit(struct afs_vnode *, struct key *, long);
625extern void afs_zap_permits(struct rcu_head *); 628extern void afs_zap_permits(struct rcu_head *);
626extern struct key *afs_request_key(struct afs_cell *); 629extern struct key *afs_request_key(struct afs_cell *);
627extern int afs_permission(struct inode *, int); 630extern int afs_permission(struct inode *, int, unsigned int);
628 631
629/* 632/*
630 * server.c 633 * server.c
diff --git a/fs/afs/main.c b/fs/afs/main.c
index cfd1cbe25b2..42dd2e499ed 100644
--- a/fs/afs/main.c
+++ b/fs/afs/main.c
@@ -30,6 +30,7 @@ module_param(rootcell, charp, 0);
30MODULE_PARM_DESC(rootcell, "root AFS cell name and VL server IP addr list"); 30MODULE_PARM_DESC(rootcell, "root AFS cell name and VL server IP addr list");
31 31
32struct afs_uuid afs_uuid; 32struct afs_uuid afs_uuid;
33struct workqueue_struct *afs_wq;
33 34
34/* 35/*
35 * get a client UUID 36 * get a client UUID
@@ -87,10 +88,16 @@ static int __init afs_init(void)
87 if (ret < 0) 88 if (ret < 0)
88 return ret; 89 return ret;
89 90
91 /* create workqueue */
92 ret = -ENOMEM;
93 afs_wq = alloc_workqueue("afs", 0, 0);
94 if (!afs_wq)
95 return ret;
96
90 /* register the /proc stuff */ 97 /* register the /proc stuff */
91 ret = afs_proc_init(); 98 ret = afs_proc_init();
92 if (ret < 0) 99 if (ret < 0)
93 return ret; 100 goto error_proc;
94 101
95#ifdef CONFIG_AFS_FSCACHE 102#ifdef CONFIG_AFS_FSCACHE
96 /* we want to be able to cache */ 103 /* we want to be able to cache */
@@ -140,6 +147,8 @@ error_cell_init:
140error_cache: 147error_cache:
141#endif 148#endif
142 afs_proc_cleanup(); 149 afs_proc_cleanup();
150error_proc:
151 destroy_workqueue(afs_wq);
143 rcu_barrier(); 152 rcu_barrier();
144 printk(KERN_ERR "kAFS: failed to register: %d\n", ret); 153 printk(KERN_ERR "kAFS: failed to register: %d\n", ret);
145 return ret; 154 return ret;
@@ -163,7 +172,7 @@ static void __exit afs_exit(void)
163 afs_purge_servers(); 172 afs_purge_servers();
164 afs_callback_update_kill(); 173 afs_callback_update_kill();
165 afs_vlocation_purge(); 174 afs_vlocation_purge();
166 flush_scheduled_work(); 175 destroy_workqueue(afs_wq);
167 afs_cell_purge(); 176 afs_cell_purge();
168#ifdef CONFIG_AFS_FSCACHE 177#ifdef CONFIG_AFS_FSCACHE
169 fscache_unregister_netfs(&afs_cache_netfs); 178 fscache_unregister_netfs(&afs_cache_netfs);
diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c
index 6153417caf5..aa59184151d 100644
--- a/fs/afs/mntpt.c
+++ b/fs/afs/mntpt.c
@@ -24,7 +24,6 @@ static struct dentry *afs_mntpt_lookup(struct inode *dir,
24 struct dentry *dentry, 24 struct dentry *dentry,
25 struct nameidata *nd); 25 struct nameidata *nd);
26static int afs_mntpt_open(struct inode *inode, struct file *file); 26static int afs_mntpt_open(struct inode *inode, struct file *file);
27static void *afs_mntpt_follow_link(struct dentry *dentry, struct nameidata *nd);
28static void afs_mntpt_expiry_timed_out(struct work_struct *work); 27static void afs_mntpt_expiry_timed_out(struct work_struct *work);
29 28
30const struct file_operations afs_mntpt_file_operations = { 29const struct file_operations afs_mntpt_file_operations = {
@@ -34,13 +33,11 @@ const struct file_operations afs_mntpt_file_operations = {
34 33
35const struct inode_operations afs_mntpt_inode_operations = { 34const struct inode_operations afs_mntpt_inode_operations = {
36 .lookup = afs_mntpt_lookup, 35 .lookup = afs_mntpt_lookup,
37 .follow_link = afs_mntpt_follow_link,
38 .readlink = page_readlink, 36 .readlink = page_readlink,
39 .getattr = afs_getattr, 37 .getattr = afs_getattr,
40}; 38};
41 39
42const struct inode_operations afs_autocell_inode_operations = { 40const struct inode_operations afs_autocell_inode_operations = {
43 .follow_link = afs_mntpt_follow_link,
44 .getattr = afs_getattr, 41 .getattr = afs_getattr,
45}; 42};
46 43
@@ -88,6 +85,7 @@ int afs_mntpt_check_symlink(struct afs_vnode *vnode, struct key *key)
88 _debug("symlink is a mountpoint"); 85 _debug("symlink is a mountpoint");
89 spin_lock(&vnode->lock); 86 spin_lock(&vnode->lock);
90 set_bit(AFS_VNODE_MOUNTPOINT, &vnode->flags); 87 set_bit(AFS_VNODE_MOUNTPOINT, &vnode->flags);
88 vnode->vfs_inode.i_flags |= S_AUTOMOUNT;
91 spin_unlock(&vnode->lock); 89 spin_unlock(&vnode->lock);
92 } 90 }
93 91
@@ -238,52 +236,24 @@ error_no_devname:
238} 236}
239 237
240/* 238/*
241 * follow a link from a mountpoint directory, thus causing it to be mounted 239 * handle an automount point
242 */ 240 */
243static void *afs_mntpt_follow_link(struct dentry *dentry, struct nameidata *nd) 241struct vfsmount *afs_d_automount(struct path *path)
244{ 242{
245 struct vfsmount *newmnt; 243 struct vfsmount *newmnt;
246 int err;
247 244
248 _enter("%p{%s},{%s:%p{%s},}", 245 _enter("{%s,%s}", path->mnt->mnt_devname, path->dentry->d_name.name);
249 dentry,
250 dentry->d_name.name,
251 nd->path.mnt->mnt_devname,
252 dentry,
253 nd->path.dentry->d_name.name);
254
255 dput(nd->path.dentry);
256 nd->path.dentry = dget(dentry);
257 246
258 newmnt = afs_mntpt_do_automount(nd->path.dentry); 247 newmnt = afs_mntpt_do_automount(path->dentry);
259 if (IS_ERR(newmnt)) { 248 if (IS_ERR(newmnt))
260 path_put(&nd->path); 249 return newmnt;
261 return (void *)newmnt;
262 }
263
264 mntget(newmnt);
265 err = do_add_mount(newmnt, &nd->path, MNT_SHRINKABLE, &afs_vfsmounts);
266 switch (err) {
267 case 0:
268 path_put(&nd->path);
269 nd->path.mnt = newmnt;
270 nd->path.dentry = dget(newmnt->mnt_root);
271 schedule_delayed_work(&afs_mntpt_expiry_timer,
272 afs_mntpt_expiry_timeout * HZ);
273 break;
274 case -EBUSY:
275 /* someone else made a mount here whilst we were busy */
276 while (d_mountpoint(nd->path.dentry) &&
277 follow_down(&nd->path))
278 ;
279 err = 0;
280 default:
281 mntput(newmnt);
282 break;
283 }
284 250
285 _leave(" = %d", err); 251 mntget(newmnt); /* prevent immediate expiration */
286 return ERR_PTR(err); 252 mnt_set_expiry(newmnt, &afs_vfsmounts);
253 queue_delayed_work(afs_wq, &afs_mntpt_expiry_timer,
254 afs_mntpt_expiry_timeout * HZ);
255 _leave(" = %p {%s}", newmnt, newmnt->mnt_devname);
256 return newmnt;
287} 257}
288 258
289/* 259/*
@@ -295,8 +265,8 @@ static void afs_mntpt_expiry_timed_out(struct work_struct *work)
295 265
296 if (!list_empty(&afs_vfsmounts)) { 266 if (!list_empty(&afs_vfsmounts)) {
297 mark_mounts_for_expiry(&afs_vfsmounts); 267 mark_mounts_for_expiry(&afs_vfsmounts);
298 schedule_delayed_work(&afs_mntpt_expiry_timer, 268 queue_delayed_work(afs_wq, &afs_mntpt_expiry_timer,
299 afs_mntpt_expiry_timeout * HZ); 269 afs_mntpt_expiry_timeout * HZ);
300 } 270 }
301 271
302 _leave(""); 272 _leave("");
@@ -310,6 +280,5 @@ void afs_mntpt_kill_timer(void)
310 _enter(""); 280 _enter("");
311 281
312 ASSERT(list_empty(&afs_vfsmounts)); 282 ASSERT(list_empty(&afs_vfsmounts));
313 cancel_delayed_work(&afs_mntpt_expiry_timer); 283 cancel_delayed_work_sync(&afs_mntpt_expiry_timer);
314 flush_scheduled_work();
315} 284}
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index 654d8fdbf01..e45a323aebb 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -410,7 +410,7 @@ static void afs_rx_interceptor(struct sock *sk, unsigned long user_call_ID,
410 if (!call) { 410 if (!call) {
411 /* its an incoming call for our callback service */ 411 /* its an incoming call for our callback service */
412 skb_queue_tail(&afs_incoming_calls, skb); 412 skb_queue_tail(&afs_incoming_calls, skb);
413 schedule_work(&afs_collect_incoming_call_work); 413 queue_work(afs_wq, &afs_collect_incoming_call_work);
414 } else { 414 } else {
415 /* route the messages directly to the appropriate call */ 415 /* route the messages directly to the appropriate call */
416 skb_queue_tail(&call->rx_queue, skb); 416 skb_queue_tail(&call->rx_queue, skb);
diff --git a/fs/afs/security.c b/fs/afs/security.c
index bb4ed144d0e..f44b9d35537 100644
--- a/fs/afs/security.c
+++ b/fs/afs/security.c
@@ -285,13 +285,16 @@ static int afs_check_permit(struct afs_vnode *vnode, struct key *key,
285 * - AFS ACLs are attached to directories only, and a file is controlled by its 285 * - AFS ACLs are attached to directories only, and a file is controlled by its
286 * parent directory's ACL 286 * parent directory's ACL
287 */ 287 */
288int afs_permission(struct inode *inode, int mask) 288int afs_permission(struct inode *inode, int mask, unsigned int flags)
289{ 289{
290 struct afs_vnode *vnode = AFS_FS_I(inode); 290 struct afs_vnode *vnode = AFS_FS_I(inode);
291 afs_access_t uninitialized_var(access); 291 afs_access_t uninitialized_var(access);
292 struct key *key; 292 struct key *key;
293 int ret; 293 int ret;
294 294
295 if (flags & IPERM_FLAG_RCU)
296 return -ECHILD;
297
295 _enter("{{%x:%u},%lx},%x,", 298 _enter("{{%x:%u},%lx},%x,",
296 vnode->fid.vid, vnode->fid.vnode, vnode->flags, mask); 299 vnode->fid.vid, vnode->fid.vnode, vnode->flags, mask);
297 300
@@ -347,7 +350,7 @@ int afs_permission(struct inode *inode, int mask)
347 } 350 }
348 351
349 key_put(key); 352 key_put(key);
350 ret = generic_permission(inode, mask, NULL); 353 ret = generic_permission(inode, mask, flags, NULL);
351 _leave(" = %d", ret); 354 _leave(" = %d", ret);
352 return ret; 355 return ret;
353 356
diff --git a/fs/afs/server.c b/fs/afs/server.c
index 9fdc7fe3a7b..d59b7516e94 100644
--- a/fs/afs/server.c
+++ b/fs/afs/server.c
@@ -238,8 +238,8 @@ void afs_put_server(struct afs_server *server)
238 if (atomic_read(&server->usage) == 0) { 238 if (atomic_read(&server->usage) == 0) {
239 list_move_tail(&server->grave, &afs_server_graveyard); 239 list_move_tail(&server->grave, &afs_server_graveyard);
240 server->time_of_death = get_seconds(); 240 server->time_of_death = get_seconds();
241 schedule_delayed_work(&afs_server_reaper, 241 queue_delayed_work(afs_wq, &afs_server_reaper,
242 afs_server_timeout * HZ); 242 afs_server_timeout * HZ);
243 } 243 }
244 spin_unlock(&afs_server_graveyard_lock); 244 spin_unlock(&afs_server_graveyard_lock);
245 _leave(" [dead]"); 245 _leave(" [dead]");
@@ -285,10 +285,11 @@ static void afs_reap_server(struct work_struct *work)
285 expiry = server->time_of_death + afs_server_timeout; 285 expiry = server->time_of_death + afs_server_timeout;
286 if (expiry > now) { 286 if (expiry > now) {
287 delay = (expiry - now) * HZ; 287 delay = (expiry - now) * HZ;
288 if (!schedule_delayed_work(&afs_server_reaper, delay)) { 288 if (!queue_delayed_work(afs_wq, &afs_server_reaper,
289 delay)) {
289 cancel_delayed_work(&afs_server_reaper); 290 cancel_delayed_work(&afs_server_reaper);
290 schedule_delayed_work(&afs_server_reaper, 291 queue_delayed_work(afs_wq, &afs_server_reaper,
291 delay); 292 delay);
292 } 293 }
293 break; 294 break;
294 } 295 }
@@ -323,5 +324,5 @@ void __exit afs_purge_servers(void)
323{ 324{
324 afs_server_timeout = 0; 325 afs_server_timeout = 0;
325 cancel_delayed_work(&afs_server_reaper); 326 cancel_delayed_work(&afs_server_reaper);
326 schedule_delayed_work(&afs_server_reaper, 0); 327 queue_delayed_work(afs_wq, &afs_server_reaper, 0);
327} 328}
diff --git a/fs/afs/super.c b/fs/afs/super.c
index 27201cffece..fb240e8766d 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -336,6 +336,7 @@ static int afs_fill_super(struct super_block *sb, void *data)
336 if (!root) 336 if (!root)
337 goto error; 337 goto error;
338 338
339 sb->s_d_op = &afs_fs_dentry_operations;
339 sb->s_root = root; 340 sb->s_root = root;
340 341
341 _leave(" = 0"); 342 _leave(" = 0");
@@ -498,6 +499,14 @@ static struct inode *afs_alloc_inode(struct super_block *sb)
498 return &vnode->vfs_inode; 499 return &vnode->vfs_inode;
499} 500}
500 501
502static void afs_i_callback(struct rcu_head *head)
503{
504 struct inode *inode = container_of(head, struct inode, i_rcu);
505 struct afs_vnode *vnode = AFS_FS_I(inode);
506 INIT_LIST_HEAD(&inode->i_dentry);
507 kmem_cache_free(afs_inode_cachep, vnode);
508}
509
501/* 510/*
502 * destroy an AFS inode struct 511 * destroy an AFS inode struct
503 */ 512 */
@@ -511,7 +520,7 @@ static void afs_destroy_inode(struct inode *inode)
511 520
512 ASSERTCMP(vnode->server, ==, NULL); 521 ASSERTCMP(vnode->server, ==, NULL);
513 522
514 kmem_cache_free(afs_inode_cachep, vnode); 523 call_rcu(&inode->i_rcu, afs_i_callback);
515 atomic_dec(&afs_count_active_inodes); 524 atomic_dec(&afs_count_active_inodes);
516} 525}
517 526
diff --git a/fs/afs/vlocation.c b/fs/afs/vlocation.c
index 9ac260d1361..431984d2e37 100644
--- a/fs/afs/vlocation.c
+++ b/fs/afs/vlocation.c
@@ -507,8 +507,8 @@ void afs_put_vlocation(struct afs_vlocation *vl)
507 _debug("buried"); 507 _debug("buried");
508 list_move_tail(&vl->grave, &afs_vlocation_graveyard); 508 list_move_tail(&vl->grave, &afs_vlocation_graveyard);
509 vl->time_of_death = get_seconds(); 509 vl->time_of_death = get_seconds();
510 schedule_delayed_work(&afs_vlocation_reap, 510 queue_delayed_work(afs_wq, &afs_vlocation_reap,
511 afs_vlocation_timeout * HZ); 511 afs_vlocation_timeout * HZ);
512 512
513 /* suspend updates on this record */ 513 /* suspend updates on this record */
514 if (!list_empty(&vl->update)) { 514 if (!list_empty(&vl->update)) {
@@ -561,11 +561,11 @@ static void afs_vlocation_reaper(struct work_struct *work)
561 if (expiry > now) { 561 if (expiry > now) {
562 delay = (expiry - now) * HZ; 562 delay = (expiry - now) * HZ;
563 _debug("delay %lu", delay); 563 _debug("delay %lu", delay);
564 if (!schedule_delayed_work(&afs_vlocation_reap, 564 if (!queue_delayed_work(afs_wq, &afs_vlocation_reap,
565 delay)) { 565 delay)) {
566 cancel_delayed_work(&afs_vlocation_reap); 566 cancel_delayed_work(&afs_vlocation_reap);
567 schedule_delayed_work(&afs_vlocation_reap, 567 queue_delayed_work(afs_wq, &afs_vlocation_reap,
568 delay); 568 delay);
569 } 569 }
570 break; 570 break;
571 } 571 }
@@ -620,7 +620,7 @@ void afs_vlocation_purge(void)
620 destroy_workqueue(afs_vlocation_update_worker); 620 destroy_workqueue(afs_vlocation_update_worker);
621 621
622 cancel_delayed_work(&afs_vlocation_reap); 622 cancel_delayed_work(&afs_vlocation_reap);
623 schedule_delayed_work(&afs_vlocation_reap, 0); 623 queue_delayed_work(afs_wq, &afs_vlocation_reap, 0);
624} 624}
625 625
626/* 626/*
diff --git a/fs/aio.c b/fs/aio.c
index 8c8f6c5b6d7..fc557a3be0a 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -87,7 +87,7 @@ static int __init aio_setup(void)
87 87
88 aio_wq = create_workqueue("aio"); 88 aio_wq = create_workqueue("aio");
89 abe_pool = mempool_create_kmalloc_pool(1, sizeof(struct aio_batch_entry)); 89 abe_pool = mempool_create_kmalloc_pool(1, sizeof(struct aio_batch_entry));
90 BUG_ON(!abe_pool); 90 BUG_ON(!aio_wq || !abe_pool);
91 91
92 pr_debug("aio_setup: sizeof(struct page) = %d\n", (int)sizeof(struct page)); 92 pr_debug("aio_setup: sizeof(struct page) = %d\n", (int)sizeof(struct page));
93 93
@@ -798,29 +798,12 @@ static void aio_queue_work(struct kioctx * ctx)
798 queue_delayed_work(aio_wq, &ctx->wq, timeout); 798 queue_delayed_work(aio_wq, &ctx->wq, timeout);
799} 799}
800 800
801
802/*
803 * aio_run_iocbs:
804 * Process all pending retries queued on the ioctx
805 * run list.
806 * Assumes it is operating within the aio issuer's mm
807 * context.
808 */
809static inline void aio_run_iocbs(struct kioctx *ctx)
810{
811 int requeue;
812
813 spin_lock_irq(&ctx->ctx_lock);
814
815 requeue = __aio_run_iocbs(ctx);
816 spin_unlock_irq(&ctx->ctx_lock);
817 if (requeue)
818 aio_queue_work(ctx);
819}
820
821/* 801/*
822 * just like aio_run_iocbs, but keeps running them until 802 * aio_run_all_iocbs:
823 * the list stays empty 803 * Process all pending retries queued on the ioctx
804 * run list, and keep running them until the list
805 * stays empty.
806 * Assumes it is operating within the aio issuer's mm context.
824 */ 807 */
825static inline void aio_run_all_iocbs(struct kioctx *ctx) 808static inline void aio_run_all_iocbs(struct kioctx *ctx)
826{ 809{
@@ -1839,7 +1822,7 @@ SYSCALL_DEFINE5(io_getevents, aio_context_t, ctx_id,
1839 long ret = -EINVAL; 1822 long ret = -EINVAL;
1840 1823
1841 if (likely(ioctx)) { 1824 if (likely(ioctx)) {
1842 if (likely(min_nr <= nr && min_nr >= 0 && nr >= 0)) 1825 if (likely(min_nr <= nr && min_nr >= 0))
1843 ret = read_events(ioctx, min_nr, nr, events, timeout); 1826 ret = read_events(ioctx, min_nr, nr, events, timeout);
1844 put_ioctx(ioctx); 1827 put_ioctx(ioctx);
1845 } 1828 }
diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index 57ce55b2564..c5567cb7843 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -26,12 +26,6 @@ static struct vfsmount *anon_inode_mnt __read_mostly;
26static struct inode *anon_inode_inode; 26static struct inode *anon_inode_inode;
27static const struct file_operations anon_inode_fops; 27static const struct file_operations anon_inode_fops;
28 28
29static struct dentry *anon_inodefs_mount(struct file_system_type *fs_type,
30 int flags, const char *dev_name, void *data)
31{
32 return mount_pseudo(fs_type, "anon_inode:", NULL, ANON_INODE_FS_MAGIC);
33}
34
35/* 29/*
36 * anon_inodefs_dname() is called from d_path(). 30 * anon_inodefs_dname() is called from d_path().
37 */ 31 */
@@ -41,14 +35,22 @@ static char *anon_inodefs_dname(struct dentry *dentry, char *buffer, int buflen)
41 dentry->d_name.name); 35 dentry->d_name.name);
42} 36}
43 37
38static const struct dentry_operations anon_inodefs_dentry_operations = {
39 .d_dname = anon_inodefs_dname,
40};
41
42static struct dentry *anon_inodefs_mount(struct file_system_type *fs_type,
43 int flags, const char *dev_name, void *data)
44{
45 return mount_pseudo(fs_type, "anon_inode:", NULL,
46 &anon_inodefs_dentry_operations, ANON_INODE_FS_MAGIC);
47}
48
44static struct file_system_type anon_inode_fs_type = { 49static struct file_system_type anon_inode_fs_type = {
45 .name = "anon_inodefs", 50 .name = "anon_inodefs",
46 .mount = anon_inodefs_mount, 51 .mount = anon_inodefs_mount,
47 .kill_sb = kill_anon_super, 52 .kill_sb = kill_anon_super,
48}; 53};
49static const struct dentry_operations anon_inodefs_dentry_operations = {
50 .d_dname = anon_inodefs_dname,
51};
52 54
53/* 55/*
54 * nop .set_page_dirty method so that people can use .page_mkwrite on 56 * nop .set_page_dirty method so that people can use .page_mkwrite on
@@ -64,9 +66,9 @@ static const struct address_space_operations anon_aops = {
64}; 66};
65 67
66/** 68/**
67 * anon_inode_getfd - creates a new file instance by hooking it up to an 69 * anon_inode_getfile - creates a new file instance by hooking it up to an
68 * anonymous inode, and a dentry that describe the "class" 70 * anonymous inode, and a dentry that describe the "class"
69 * of the file 71 * of the file
70 * 72 *
71 * @name: [in] name of the "class" of the new file 73 * @name: [in] name of the "class" of the new file
72 * @fops: [in] file operations for the new file 74 * @fops: [in] file operations for the new file
@@ -102,7 +104,7 @@ struct file *anon_inode_getfile(const char *name,
102 this.name = name; 104 this.name = name;
103 this.len = strlen(name); 105 this.len = strlen(name);
104 this.hash = 0; 106 this.hash = 0;
105 path.dentry = d_alloc(anon_inode_mnt->mnt_sb->s_root, &this); 107 path.dentry = d_alloc_pseudo(anon_inode_mnt->mnt_sb, &this);
106 if (!path.dentry) 108 if (!path.dentry)
107 goto err_module; 109 goto err_module;
108 110
@@ -113,7 +115,6 @@ struct file *anon_inode_getfile(const char *name,
113 */ 115 */
114 ihold(anon_inode_inode); 116 ihold(anon_inode_inode);
115 117
116 path.dentry->d_op = &anon_inodefs_dentry_operations;
117 d_instantiate(path.dentry, anon_inode_inode); 118 d_instantiate(path.dentry, anon_inode_inode);
118 119
119 error = -ENFILE; 120 error = -ENFILE;
diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index 3d283abf67d..54f92379272 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -16,6 +16,7 @@
16#include <linux/auto_fs4.h> 16#include <linux/auto_fs4.h>
17#include <linux/auto_dev-ioctl.h> 17#include <linux/auto_dev-ioctl.h>
18#include <linux/mutex.h> 18#include <linux/mutex.h>
19#include <linux/spinlock.h>
19#include <linux/list.h> 20#include <linux/list.h>
20 21
21/* This is the range of ioctl() numbers we claim as ours */ 22/* This is the range of ioctl() numbers we claim as ours */
@@ -60,6 +61,8 @@ do { \
60 current->pid, __func__, ##args); \ 61 current->pid, __func__, ##args); \
61} while (0) 62} while (0)
62 63
64extern spinlock_t autofs4_lock;
65
63/* Unified info structure. This is pointed to by both the dentry and 66/* Unified info structure. This is pointed to by both the dentry and
64 inode structures. Each file in the filesystem has an instance of this 67 inode structures. Each file in the filesystem has an instance of this
65 structure. It holds a reference to the dentry, so dentries are never 68 structure. It holds a reference to the dentry, so dentries are never
@@ -85,18 +88,9 @@ struct autofs_info {
85 88
86 uid_t uid; 89 uid_t uid;
87 gid_t gid; 90 gid_t gid;
88
89 mode_t mode;
90 size_t size;
91
92 void (*free)(struct autofs_info *);
93 union {
94 const char *symlink;
95 } u;
96}; 91};
97 92
98#define AUTOFS_INF_EXPIRING (1<<0) /* dentry is in the process of expiring */ 93#define AUTOFS_INF_EXPIRING (1<<0) /* dentry is in the process of expiring */
99#define AUTOFS_INF_MOUNTPOINT (1<<1) /* mountpoint status for direct expire */
100#define AUTOFS_INF_PENDING (1<<2) /* dentry pending mount */ 94#define AUTOFS_INF_PENDING (1<<2) /* dentry pending mount */
101 95
102struct autofs_wait_queue { 96struct autofs_wait_queue {
@@ -173,14 +167,7 @@ static inline int autofs4_ispending(struct dentry *dentry)
173 return 0; 167 return 0;
174} 168}
175 169
176static inline void autofs4_copy_atime(struct file *src, struct file *dst) 170struct inode *autofs4_get_inode(struct super_block *, mode_t);
177{
178 dst->f_path.dentry->d_inode->i_atime =
179 src->f_path.dentry->d_inode->i_atime;
180 return;
181}
182
183struct inode *autofs4_get_inode(struct super_block *, struct autofs_info *);
184void autofs4_free_ino(struct autofs_info *); 171void autofs4_free_ino(struct autofs_info *);
185 172
186/* Expiration */ 173/* Expiration */
@@ -209,16 +196,89 @@ void autofs_dev_ioctl_exit(void);
209 196
210extern const struct inode_operations autofs4_symlink_inode_operations; 197extern const struct inode_operations autofs4_symlink_inode_operations;
211extern const struct inode_operations autofs4_dir_inode_operations; 198extern const struct inode_operations autofs4_dir_inode_operations;
212extern const struct inode_operations autofs4_root_inode_operations;
213extern const struct inode_operations autofs4_indirect_root_inode_operations;
214extern const struct inode_operations autofs4_direct_root_inode_operations;
215extern const struct file_operations autofs4_dir_operations; 199extern const struct file_operations autofs4_dir_operations;
216extern const struct file_operations autofs4_root_operations; 200extern const struct file_operations autofs4_root_operations;
201extern const struct dentry_operations autofs4_dentry_operations;
202
203/* VFS automount flags management functions */
204
205static inline void __managed_dentry_set_automount(struct dentry *dentry)
206{
207 dentry->d_flags |= DCACHE_NEED_AUTOMOUNT;
208}
209
210static inline void managed_dentry_set_automount(struct dentry *dentry)
211{
212 spin_lock(&dentry->d_lock);
213 __managed_dentry_set_automount(dentry);
214 spin_unlock(&dentry->d_lock);
215}
216
217static inline void __managed_dentry_clear_automount(struct dentry *dentry)
218{
219 dentry->d_flags &= ~DCACHE_NEED_AUTOMOUNT;
220}
221
222static inline void managed_dentry_clear_automount(struct dentry *dentry)
223{
224 spin_lock(&dentry->d_lock);
225 __managed_dentry_clear_automount(dentry);
226 spin_unlock(&dentry->d_lock);
227}
228
229static inline void __managed_dentry_set_transit(struct dentry *dentry)
230{
231 dentry->d_flags |= DCACHE_MANAGE_TRANSIT;
232}
233
234static inline void managed_dentry_set_transit(struct dentry *dentry)
235{
236 spin_lock(&dentry->d_lock);
237 __managed_dentry_set_transit(dentry);
238 spin_unlock(&dentry->d_lock);
239}
240
241static inline void __managed_dentry_clear_transit(struct dentry *dentry)
242{
243 dentry->d_flags &= ~DCACHE_MANAGE_TRANSIT;
244}
245
246static inline void managed_dentry_clear_transit(struct dentry *dentry)
247{
248 spin_lock(&dentry->d_lock);
249 __managed_dentry_clear_transit(dentry);
250 spin_unlock(&dentry->d_lock);
251}
252
253static inline void __managed_dentry_set_managed(struct dentry *dentry)
254{
255 dentry->d_flags |= (DCACHE_NEED_AUTOMOUNT|DCACHE_MANAGE_TRANSIT);
256}
257
258static inline void managed_dentry_set_managed(struct dentry *dentry)
259{
260 spin_lock(&dentry->d_lock);
261 __managed_dentry_set_managed(dentry);
262 spin_unlock(&dentry->d_lock);
263}
264
265static inline void __managed_dentry_clear_managed(struct dentry *dentry)
266{
267 dentry->d_flags &= ~(DCACHE_NEED_AUTOMOUNT|DCACHE_MANAGE_TRANSIT);
268}
269
270static inline void managed_dentry_clear_managed(struct dentry *dentry)
271{
272 spin_lock(&dentry->d_lock);
273 __managed_dentry_clear_managed(dentry);
274 spin_unlock(&dentry->d_lock);
275}
217 276
218/* Initializing function */ 277/* Initializing function */
219 278
220int autofs4_fill_super(struct super_block *, void *, int); 279int autofs4_fill_super(struct super_block *, void *, int);
221struct autofs_info *autofs4_init_ino(struct autofs_info *, struct autofs_sb_info *sbi, mode_t mode); 280struct autofs_info *autofs4_new_ino(struct autofs_sb_info *);
281void autofs4_clean_ino(struct autofs_info *);
222 282
223/* Queue management functions */ 283/* Queue management functions */
224 284
@@ -226,19 +286,6 @@ int autofs4_wait(struct autofs_sb_info *,struct dentry *, enum autofs_notify);
226int autofs4_wait_release(struct autofs_sb_info *,autofs_wqt_t,int); 286int autofs4_wait_release(struct autofs_sb_info *,autofs_wqt_t,int);
227void autofs4_catatonic_mode(struct autofs_sb_info *); 287void autofs4_catatonic_mode(struct autofs_sb_info *);
228 288
229static inline int autofs4_follow_mount(struct path *path)
230{
231 int res = 0;
232
233 while (d_mountpoint(path->dentry)) {
234 int followed = follow_down(path);
235 if (!followed)
236 break;
237 res = 1;
238 }
239 return res;
240}
241
242static inline u32 autofs4_get_dev(struct autofs_sb_info *sbi) 289static inline u32 autofs4_get_dev(struct autofs_sb_info *sbi)
243{ 290{
244 return new_encode_dev(sbi->sb->s_dev); 291 return new_encode_dev(sbi->sb->s_dev);
@@ -254,17 +301,15 @@ static inline int simple_positive(struct dentry *dentry)
254 return dentry->d_inode && !d_unhashed(dentry); 301 return dentry->d_inode && !d_unhashed(dentry);
255} 302}
256 303
257static inline int __simple_empty(struct dentry *dentry) 304static inline void __autofs4_add_expiring(struct dentry *dentry)
258{ 305{
259 struct dentry *child; 306 struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
260 int ret = 0; 307 struct autofs_info *ino = autofs4_dentry_ino(dentry);
261 308 if (ino) {
262 list_for_each_entry(child, &dentry->d_subdirs, d_u.d_child) 309 if (list_empty(&ino->expiring))
263 if (simple_positive(child)) 310 list_add(&ino->expiring, &sbi->expiring_list);
264 goto out; 311 }
265 ret = 1; 312 return;
266out:
267 return ret;
268} 313}
269 314
270static inline void autofs4_add_expiring(struct dentry *dentry) 315static inline void autofs4_add_expiring(struct dentry *dentry)
@@ -293,5 +338,4 @@ static inline void autofs4_del_expiring(struct dentry *dentry)
293 return; 338 return;
294} 339}
295 340
296void autofs4_dentry_release(struct dentry *);
297extern void autofs4_kill_sb(struct super_block *); 341extern void autofs4_kill_sb(struct super_block *);
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index eff9a419469..1442da4860e 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -551,7 +551,7 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp,
551 551
552 err = have_submounts(path.dentry); 552 err = have_submounts(path.dentry);
553 553
554 if (follow_down(&path)) 554 if (follow_down_one(&path))
555 magic = path.mnt->mnt_sb->s_magic; 555 magic = path.mnt->mnt_sb->s_magic;
556 } 556 }
557 557
diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index a796c9417fb..f43100b9662 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -26,10 +26,6 @@ static inline int autofs4_can_expire(struct dentry *dentry,
26 if (ino == NULL) 26 if (ino == NULL)
27 return 0; 27 return 0;
28 28
29 /* No point expiring a pending mount */
30 if (ino->flags & AUTOFS_INF_PENDING)
31 return 0;
32
33 if (!do_now) { 29 if (!do_now) {
34 /* Too young to die */ 30 /* Too young to die */
35 if (!timeout || time_after(ino->last_used + timeout, now)) 31 if (!timeout || time_after(ino->last_used + timeout, now))
@@ -56,7 +52,7 @@ static int autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry)
56 52
57 path_get(&path); 53 path_get(&path);
58 54
59 if (!follow_down(&path)) 55 if (!follow_down_one(&path))
60 goto done; 56 goto done;
61 57
62 if (is_autofs4_dentry(path.dentry)) { 58 if (is_autofs4_dentry(path.dentry)) {
@@ -91,24 +87,64 @@ done:
91} 87}
92 88
93/* 89/*
94 * Calculate next entry in top down tree traversal. 90 * Calculate and dget next entry in top down tree traversal.
95 * From next_mnt in namespace.c - elegant.
96 */ 91 */
97static struct dentry *next_dentry(struct dentry *p, struct dentry *root) 92static struct dentry *get_next_positive_dentry(struct dentry *prev,
93 struct dentry *root)
98{ 94{
99 struct list_head *next = p->d_subdirs.next; 95 struct list_head *next;
96 struct dentry *p, *ret;
97
98 if (prev == NULL)
99 return dget(root);
100 100
101 spin_lock(&autofs4_lock);
102relock:
103 p = prev;
104 spin_lock(&p->d_lock);
105again:
106 next = p->d_subdirs.next;
101 if (next == &p->d_subdirs) { 107 if (next == &p->d_subdirs) {
102 while (1) { 108 while (1) {
103 if (p == root) 109 struct dentry *parent;
110
111 if (p == root) {
112 spin_unlock(&p->d_lock);
113 spin_unlock(&autofs4_lock);
114 dput(prev);
104 return NULL; 115 return NULL;
116 }
117
118 parent = p->d_parent;
119 if (!spin_trylock(&parent->d_lock)) {
120 spin_unlock(&p->d_lock);
121 cpu_relax();
122 goto relock;
123 }
124 spin_unlock(&p->d_lock);
105 next = p->d_u.d_child.next; 125 next = p->d_u.d_child.next;
106 if (next != &p->d_parent->d_subdirs) 126 p = parent;
127 if (next != &parent->d_subdirs)
107 break; 128 break;
108 p = p->d_parent;
109 } 129 }
110 } 130 }
111 return list_entry(next, struct dentry, d_u.d_child); 131 ret = list_entry(next, struct dentry, d_u.d_child);
132
133 spin_lock_nested(&ret->d_lock, DENTRY_D_LOCK_NESTED);
134 /* Negative dentry - try next */
135 if (!simple_positive(ret)) {
136 spin_unlock(&p->d_lock);
137 p = ret;
138 goto again;
139 }
140 dget_dlock(ret);
141 spin_unlock(&ret->d_lock);
142 spin_unlock(&p->d_lock);
143 spin_unlock(&autofs4_lock);
144
145 dput(prev);
146
147 return ret;
112} 148}
113 149
114/* 150/*
@@ -158,18 +194,11 @@ static int autofs4_tree_busy(struct vfsmount *mnt,
158 if (!simple_positive(top)) 194 if (!simple_positive(top))
159 return 1; 195 return 1;
160 196
161 spin_lock(&dcache_lock); 197 p = NULL;
162 for (p = top; p; p = next_dentry(p, top)) { 198 while ((p = get_next_positive_dentry(p, top))) {
163 /* Negative dentry - give up */
164 if (!simple_positive(p))
165 continue;
166
167 DPRINTK("dentry %p %.*s", 199 DPRINTK("dentry %p %.*s",
168 p, (int) p->d_name.len, p->d_name.name); 200 p, (int) p->d_name.len, p->d_name.name);
169 201
170 p = dget(p);
171 spin_unlock(&dcache_lock);
172
173 /* 202 /*
174 * Is someone visiting anywhere in the subtree ? 203 * Is someone visiting anywhere in the subtree ?
175 * If there's no mount we need to check the usage 204 * If there's no mount we need to check the usage
@@ -198,16 +227,13 @@ static int autofs4_tree_busy(struct vfsmount *mnt,
198 else 227 else
199 ino_count++; 228 ino_count++;
200 229
201 if (atomic_read(&p->d_count) > ino_count) { 230 if (p->d_count > ino_count) {
202 top_ino->last_used = jiffies; 231 top_ino->last_used = jiffies;
203 dput(p); 232 dput(p);
204 return 1; 233 return 1;
205 } 234 }
206 } 235 }
207 dput(p);
208 spin_lock(&dcache_lock);
209 } 236 }
210 spin_unlock(&dcache_lock);
211 237
212 /* Timeout of a tree mount is ultimately determined by its top dentry */ 238 /* Timeout of a tree mount is ultimately determined by its top dentry */
213 if (!autofs4_can_expire(top, timeout, do_now)) 239 if (!autofs4_can_expire(top, timeout, do_now))
@@ -226,32 +252,21 @@ static struct dentry *autofs4_check_leaves(struct vfsmount *mnt,
226 DPRINTK("parent %p %.*s", 252 DPRINTK("parent %p %.*s",
227 parent, (int)parent->d_name.len, parent->d_name.name); 253 parent, (int)parent->d_name.len, parent->d_name.name);
228 254
229 spin_lock(&dcache_lock); 255 p = NULL;
230 for (p = parent; p; p = next_dentry(p, parent)) { 256 while ((p = get_next_positive_dentry(p, parent))) {
231 /* Negative dentry - give up */
232 if (!simple_positive(p))
233 continue;
234
235 DPRINTK("dentry %p %.*s", 257 DPRINTK("dentry %p %.*s",
236 p, (int) p->d_name.len, p->d_name.name); 258 p, (int) p->d_name.len, p->d_name.name);
237 259
238 p = dget(p);
239 spin_unlock(&dcache_lock);
240
241 if (d_mountpoint(p)) { 260 if (d_mountpoint(p)) {
242 /* Can we umount this guy */ 261 /* Can we umount this guy */
243 if (autofs4_mount_busy(mnt, p)) 262 if (autofs4_mount_busy(mnt, p))
244 goto cont; 263 continue;
245 264
246 /* Can we expire this guy */ 265 /* Can we expire this guy */
247 if (autofs4_can_expire(p, timeout, do_now)) 266 if (autofs4_can_expire(p, timeout, do_now))
248 return p; 267 return p;
249 } 268 }
250cont:
251 dput(p);
252 spin_lock(&dcache_lock);
253 } 269 }
254 spin_unlock(&dcache_lock);
255 return NULL; 270 return NULL;
256} 271}
257 272
@@ -264,6 +279,7 @@ struct dentry *autofs4_expire_direct(struct super_block *sb,
264 unsigned long timeout; 279 unsigned long timeout;
265 struct dentry *root = dget(sb->s_root); 280 struct dentry *root = dget(sb->s_root);
266 int do_now = how & AUTOFS_EXP_IMMEDIATE; 281 int do_now = how & AUTOFS_EXP_IMMEDIATE;
282 struct autofs_info *ino;
267 283
268 if (!root) 284 if (!root)
269 return NULL; 285 return NULL;
@@ -272,17 +288,21 @@ struct dentry *autofs4_expire_direct(struct super_block *sb,
272 timeout = sbi->exp_timeout; 288 timeout = sbi->exp_timeout;
273 289
274 spin_lock(&sbi->fs_lock); 290 spin_lock(&sbi->fs_lock);
291 ino = autofs4_dentry_ino(root);
292 /* No point expiring a pending mount */
293 if (ino->flags & AUTOFS_INF_PENDING) {
294 spin_unlock(&sbi->fs_lock);
295 return NULL;
296 }
297 managed_dentry_set_transit(root);
275 if (!autofs4_direct_busy(mnt, root, timeout, do_now)) { 298 if (!autofs4_direct_busy(mnt, root, timeout, do_now)) {
276 struct autofs_info *ino = autofs4_dentry_ino(root); 299 struct autofs_info *ino = autofs4_dentry_ino(root);
277 if (d_mountpoint(root)) {
278 ino->flags |= AUTOFS_INF_MOUNTPOINT;
279 root->d_mounted--;
280 }
281 ino->flags |= AUTOFS_INF_EXPIRING; 300 ino->flags |= AUTOFS_INF_EXPIRING;
282 init_completion(&ino->expire_complete); 301 init_completion(&ino->expire_complete);
283 spin_unlock(&sbi->fs_lock); 302 spin_unlock(&sbi->fs_lock);
284 return root; 303 return root;
285 } 304 }
305 managed_dentry_clear_transit(root);
286 spin_unlock(&sbi->fs_lock); 306 spin_unlock(&sbi->fs_lock);
287 dput(root); 307 dput(root);
288 308
@@ -302,8 +322,8 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
302{ 322{
303 unsigned long timeout; 323 unsigned long timeout;
304 struct dentry *root = sb->s_root; 324 struct dentry *root = sb->s_root;
325 struct dentry *dentry;
305 struct dentry *expired = NULL; 326 struct dentry *expired = NULL;
306 struct list_head *next;
307 int do_now = how & AUTOFS_EXP_IMMEDIATE; 327 int do_now = how & AUTOFS_EXP_IMMEDIATE;
308 int exp_leaves = how & AUTOFS_EXP_LEAVES; 328 int exp_leaves = how & AUTOFS_EXP_LEAVES;
309 struct autofs_info *ino; 329 struct autofs_info *ino;
@@ -315,25 +335,14 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
315 now = jiffies; 335 now = jiffies;
316 timeout = sbi->exp_timeout; 336 timeout = sbi->exp_timeout;
317 337
318 spin_lock(&dcache_lock); 338 dentry = NULL;
319 next = root->d_subdirs.next; 339 while ((dentry = get_next_positive_dentry(dentry, root))) {
320
321 /* On exit from the loop expire is set to a dgot dentry
322 * to expire or it's NULL */
323 while ( next != &root->d_subdirs ) {
324 struct dentry *dentry = list_entry(next, struct dentry, d_u.d_child);
325
326 /* Negative dentry - give up */
327 if (!simple_positive(dentry)) {
328 next = next->next;
329 continue;
330 }
331
332 dentry = dget(dentry);
333 spin_unlock(&dcache_lock);
334
335 spin_lock(&sbi->fs_lock); 340 spin_lock(&sbi->fs_lock);
336 ino = autofs4_dentry_ino(dentry); 341 ino = autofs4_dentry_ino(dentry);
342 /* No point expiring a pending mount */
343 if (ino->flags & AUTOFS_INF_PENDING)
344 goto cont;
345 managed_dentry_set_transit(dentry);
337 346
338 /* 347 /*
339 * Case 1: (i) indirect mount or top level pseudo direct mount 348 * Case 1: (i) indirect mount or top level pseudo direct mount
@@ -347,7 +356,7 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
347 356
348 /* Path walk currently on this dentry? */ 357 /* Path walk currently on this dentry? */
349 ino_count = atomic_read(&ino->count) + 2; 358 ino_count = atomic_read(&ino->count) + 2;
350 if (atomic_read(&dentry->d_count) > ino_count) 359 if (dentry->d_count > ino_count)
351 goto next; 360 goto next;
352 361
353 /* Can we umount this guy */ 362 /* Can we umount this guy */
@@ -369,7 +378,7 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
369 if (!exp_leaves) { 378 if (!exp_leaves) {
370 /* Path walk currently on this dentry? */ 379 /* Path walk currently on this dentry? */
371 ino_count = atomic_read(&ino->count) + 1; 380 ino_count = atomic_read(&ino->count) + 1;
372 if (atomic_read(&dentry->d_count) > ino_count) 381 if (dentry->d_count > ino_count)
373 goto next; 382 goto next;
374 383
375 if (!autofs4_tree_busy(mnt, dentry, timeout, do_now)) { 384 if (!autofs4_tree_busy(mnt, dentry, timeout, do_now)) {
@@ -383,7 +392,7 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
383 } else { 392 } else {
384 /* Path walk currently on this dentry? */ 393 /* Path walk currently on this dentry? */
385 ino_count = atomic_read(&ino->count) + 1; 394 ino_count = atomic_read(&ino->count) + 1;
386 if (atomic_read(&dentry->d_count) > ino_count) 395 if (dentry->d_count > ino_count)
387 goto next; 396 goto next;
388 397
389 expired = autofs4_check_leaves(mnt, dentry, timeout, do_now); 398 expired = autofs4_check_leaves(mnt, dentry, timeout, do_now);
@@ -393,12 +402,10 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
393 } 402 }
394 } 403 }
395next: 404next:
405 managed_dentry_clear_transit(dentry);
406cont:
396 spin_unlock(&sbi->fs_lock); 407 spin_unlock(&sbi->fs_lock);
397 dput(dentry);
398 spin_lock(&dcache_lock);
399 next = next->next;
400 } 408 }
401 spin_unlock(&dcache_lock);
402 return NULL; 409 return NULL;
403 410
404found: 411found:
@@ -408,9 +415,13 @@ found:
408 ino->flags |= AUTOFS_INF_EXPIRING; 415 ino->flags |= AUTOFS_INF_EXPIRING;
409 init_completion(&ino->expire_complete); 416 init_completion(&ino->expire_complete);
410 spin_unlock(&sbi->fs_lock); 417 spin_unlock(&sbi->fs_lock);
411 spin_lock(&dcache_lock); 418 spin_lock(&autofs4_lock);
419 spin_lock(&expired->d_parent->d_lock);
420 spin_lock_nested(&expired->d_lock, DENTRY_D_LOCK_NESTED);
412 list_move(&expired->d_parent->d_subdirs, &expired->d_u.d_child); 421 list_move(&expired->d_parent->d_subdirs, &expired->d_u.d_child);
413 spin_unlock(&dcache_lock); 422 spin_unlock(&expired->d_lock);
423 spin_unlock(&expired->d_parent->d_lock);
424 spin_unlock(&autofs4_lock);
414 return expired; 425 return expired;
415} 426}
416 427
@@ -473,6 +484,8 @@ int autofs4_expire_run(struct super_block *sb,
473 spin_lock(&sbi->fs_lock); 484 spin_lock(&sbi->fs_lock);
474 ino = autofs4_dentry_ino(dentry); 485 ino = autofs4_dentry_ino(dentry);
475 ino->flags &= ~AUTOFS_INF_EXPIRING; 486 ino->flags &= ~AUTOFS_INF_EXPIRING;
487 if (!d_unhashed(dentry))
488 managed_dentry_clear_transit(dentry);
476 complete_all(&ino->expire_complete); 489 complete_all(&ino->expire_complete);
477 spin_unlock(&sbi->fs_lock); 490 spin_unlock(&sbi->fs_lock);
478 491
@@ -498,11 +511,18 @@ int autofs4_do_expire_multi(struct super_block *sb, struct vfsmount *mnt,
498 ret = autofs4_wait(sbi, dentry, NFY_EXPIRE); 511 ret = autofs4_wait(sbi, dentry, NFY_EXPIRE);
499 512
500 spin_lock(&sbi->fs_lock); 513 spin_lock(&sbi->fs_lock);
501 if (ino->flags & AUTOFS_INF_MOUNTPOINT) {
502 sb->s_root->d_mounted++;
503 ino->flags &= ~AUTOFS_INF_MOUNTPOINT;
504 }
505 ino->flags &= ~AUTOFS_INF_EXPIRING; 514 ino->flags &= ~AUTOFS_INF_EXPIRING;
515 spin_lock(&dentry->d_lock);
516 if (ret)
517 __managed_dentry_clear_transit(dentry);
518 else {
519 if ((IS_ROOT(dentry) ||
520 (autofs_type_indirect(sbi->type) &&
521 IS_ROOT(dentry->d_parent))) &&
522 !(dentry->d_flags & DCACHE_NEED_AUTOMOUNT))
523 __managed_dentry_set_automount(dentry);
524 }
525 spin_unlock(&dentry->d_lock);
506 complete_all(&ino->expire_complete); 526 complete_all(&ino->expire_complete);
507 spin_unlock(&sbi->fs_lock); 527 spin_unlock(&sbi->fs_lock);
508 dput(dentry); 528 dput(dentry);
diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c
index ac87e49fa70..180fa2425e4 100644
--- a/fs/autofs4/inode.c
+++ b/fs/autofs4/inode.c
@@ -22,77 +22,27 @@
22#include "autofs_i.h" 22#include "autofs_i.h"
23#include <linux/module.h> 23#include <linux/module.h>
24 24
25static void ino_lnkfree(struct autofs_info *ino) 25struct autofs_info *autofs4_new_ino(struct autofs_sb_info *sbi)
26{ 26{
27 if (ino->u.symlink) { 27 struct autofs_info *ino = kzalloc(sizeof(*ino), GFP_KERNEL);
28 kfree(ino->u.symlink); 28 if (ino) {
29 ino->u.symlink = NULL;
30 }
31}
32
33struct autofs_info *autofs4_init_ino(struct autofs_info *ino,
34 struct autofs_sb_info *sbi, mode_t mode)
35{
36 int reinit = 1;
37
38 if (ino == NULL) {
39 reinit = 0;
40 ino = kmalloc(sizeof(*ino), GFP_KERNEL);
41 }
42
43 if (ino == NULL)
44 return NULL;
45
46 if (!reinit) {
47 ino->flags = 0;
48 ino->inode = NULL;
49 ino->dentry = NULL;
50 ino->size = 0;
51 INIT_LIST_HEAD(&ino->active); 29 INIT_LIST_HEAD(&ino->active);
52 ino->active_count = 0;
53 INIT_LIST_HEAD(&ino->expiring); 30 INIT_LIST_HEAD(&ino->expiring);
54 atomic_set(&ino->count, 0); 31 ino->last_used = jiffies;
32 ino->sbi = sbi;
55 } 33 }
34 return ino;
35}
56 36
37void autofs4_clean_ino(struct autofs_info *ino)
38{
57 ino->uid = 0; 39 ino->uid = 0;
58 ino->gid = 0; 40 ino->gid = 0;
59 ino->mode = mode;
60 ino->last_used = jiffies; 41 ino->last_used = jiffies;
61
62 ino->sbi = sbi;
63
64 if (reinit && ino->free)
65 (ino->free)(ino);
66
67 memset(&ino->u, 0, sizeof(ino->u));
68
69 ino->free = NULL;
70
71 if (S_ISLNK(mode))
72 ino->free = ino_lnkfree;
73
74 return ino;
75} 42}
76 43
77void autofs4_free_ino(struct autofs_info *ino) 44void autofs4_free_ino(struct autofs_info *ino)
78{ 45{
79 struct autofs_info *p_ino;
80
81 if (ino->dentry) {
82 ino->dentry->d_fsdata = NULL;
83 if (ino->dentry->d_inode) {
84 struct dentry *parent = ino->dentry->d_parent;
85 if (atomic_dec_and_test(&ino->count)) {
86 p_ino = autofs4_dentry_ino(parent);
87 if (p_ino && parent != ino->dentry)
88 atomic_dec(&p_ino->count);
89 }
90 dput(ino->dentry);
91 }
92 ino->dentry = NULL;
93 }
94 if (ino->free)
95 (ino->free)(ino);
96 kfree(ino); 46 kfree(ino);
97} 47}
98 48
@@ -148,9 +98,16 @@ static int autofs4_show_options(struct seq_file *m, struct vfsmount *mnt)
148 return 0; 98 return 0;
149} 99}
150 100
101static void autofs4_evict_inode(struct inode *inode)
102{
103 end_writeback(inode);
104 kfree(inode->i_private);
105}
106
151static const struct super_operations autofs4_sops = { 107static const struct super_operations autofs4_sops = {
152 .statfs = simple_statfs, 108 .statfs = simple_statfs,
153 .show_options = autofs4_show_options, 109 .show_options = autofs4_show_options,
110 .evict_inode = autofs4_evict_inode,
154}; 111};
155 112
156enum {Opt_err, Opt_fd, Opt_uid, Opt_gid, Opt_pgrp, Opt_minproto, Opt_maxproto, 113enum {Opt_err, Opt_fd, Opt_uid, Opt_gid, Opt_pgrp, Opt_minproto, Opt_maxproto,
@@ -240,21 +197,6 @@ static int parse_options(char *options, int *pipefd, uid_t *uid, gid_t *gid,
240 return (*pipefd < 0); 197 return (*pipefd < 0);
241} 198}
242 199
243static struct autofs_info *autofs4_mkroot(struct autofs_sb_info *sbi)
244{
245 struct autofs_info *ino;
246
247 ino = autofs4_init_ino(NULL, sbi, S_IFDIR | 0755);
248 if (!ino)
249 return NULL;
250
251 return ino;
252}
253
254static const struct dentry_operations autofs4_sb_dentry_operations = {
255 .d_release = autofs4_dentry_release,
256};
257
258int autofs4_fill_super(struct super_block *s, void *data, int silent) 200int autofs4_fill_super(struct super_block *s, void *data, int silent)
259{ 201{
260 struct inode * root_inode; 202 struct inode * root_inode;
@@ -292,15 +234,16 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
292 s->s_blocksize_bits = 10; 234 s->s_blocksize_bits = 10;
293 s->s_magic = AUTOFS_SUPER_MAGIC; 235 s->s_magic = AUTOFS_SUPER_MAGIC;
294 s->s_op = &autofs4_sops; 236 s->s_op = &autofs4_sops;
237 s->s_d_op = &autofs4_dentry_operations;
295 s->s_time_gran = 1; 238 s->s_time_gran = 1;
296 239
297 /* 240 /*
298 * Get the root inode and dentry, but defer checking for errors. 241 * Get the root inode and dentry, but defer checking for errors.
299 */ 242 */
300 ino = autofs4_mkroot(sbi); 243 ino = autofs4_new_ino(sbi);
301 if (!ino) 244 if (!ino)
302 goto fail_free; 245 goto fail_free;
303 root_inode = autofs4_get_inode(s, ino); 246 root_inode = autofs4_get_inode(s, S_IFDIR | 0755);
304 if (!root_inode) 247 if (!root_inode)
305 goto fail_ino; 248 goto fail_ino;
306 249
@@ -309,7 +252,6 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
309 goto fail_iput; 252 goto fail_iput;
310 pipe = NULL; 253 pipe = NULL;
311 254
312 root->d_op = &autofs4_sb_dentry_operations;
313 root->d_fsdata = ino; 255 root->d_fsdata = ino;
314 256
315 /* Can this call block? */ 257 /* Can this call block? */
@@ -320,10 +262,11 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
320 goto fail_dput; 262 goto fail_dput;
321 } 263 }
322 264
265 if (autofs_type_trigger(sbi->type))
266 __managed_dentry_set_managed(root);
267
323 root_inode->i_fop = &autofs4_root_operations; 268 root_inode->i_fop = &autofs4_root_operations;
324 root_inode->i_op = autofs_type_trigger(sbi->type) ? 269 root_inode->i_op = &autofs4_dir_inode_operations;
325 &autofs4_direct_root_inode_operations :
326 &autofs4_indirect_root_inode_operations;
327 270
328 /* Couldn't this be tested earlier? */ 271 /* Couldn't this be tested earlier? */
329 if (sbi->max_proto < AUTOFS_MIN_PROTO_VERSION || 272 if (sbi->max_proto < AUTOFS_MIN_PROTO_VERSION ||
@@ -383,16 +326,14 @@ fail_unlock:
383 return -EINVAL; 326 return -EINVAL;
384} 327}
385 328
386struct inode *autofs4_get_inode(struct super_block *sb, 329struct inode *autofs4_get_inode(struct super_block *sb, mode_t mode)
387 struct autofs_info *inf)
388{ 330{
389 struct inode *inode = new_inode(sb); 331 struct inode *inode = new_inode(sb);
390 332
391 if (inode == NULL) 333 if (inode == NULL)
392 return NULL; 334 return NULL;
393 335
394 inf->inode = inode; 336 inode->i_mode = mode;
395 inode->i_mode = inf->mode;
396 if (sb->s_root) { 337 if (sb->s_root) {
397 inode->i_uid = sb->s_root->d_inode->i_uid; 338 inode->i_uid = sb->s_root->d_inode->i_uid;
398 inode->i_gid = sb->s_root->d_inode->i_gid; 339 inode->i_gid = sb->s_root->d_inode->i_gid;
@@ -400,12 +341,11 @@ struct inode *autofs4_get_inode(struct super_block *sb,
400 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 341 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
401 inode->i_ino = get_next_ino(); 342 inode->i_ino = get_next_ino();
402 343
403 if (S_ISDIR(inf->mode)) { 344 if (S_ISDIR(mode)) {
404 inode->i_nlink = 2; 345 inode->i_nlink = 2;
405 inode->i_op = &autofs4_dir_inode_operations; 346 inode->i_op = &autofs4_dir_inode_operations;
406 inode->i_fop = &autofs4_dir_operations; 347 inode->i_fop = &autofs4_dir_operations;
407 } else if (S_ISLNK(inf->mode)) { 348 } else if (S_ISLNK(mode)) {
408 inode->i_size = inf->size;
409 inode->i_op = &autofs4_symlink_inode_operations; 349 inode->i_op = &autofs4_symlink_inode_operations;
410 } 350 }
411 351
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index d5c1401f003..014e7aba3b0 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -23,6 +23,8 @@
23 23
24#include "autofs_i.h" 24#include "autofs_i.h"
25 25
26DEFINE_SPINLOCK(autofs4_lock);
27
26static int autofs4_dir_symlink(struct inode *,struct dentry *,const char *); 28static int autofs4_dir_symlink(struct inode *,struct dentry *,const char *);
27static int autofs4_dir_unlink(struct inode *,struct dentry *); 29static int autofs4_dir_unlink(struct inode *,struct dentry *);
28static int autofs4_dir_rmdir(struct inode *,struct dentry *); 30static int autofs4_dir_rmdir(struct inode *,struct dentry *);
@@ -33,10 +35,9 @@ static long autofs4_root_compat_ioctl(struct file *,unsigned int,unsigned long);
33#endif 35#endif
34static int autofs4_dir_open(struct inode *inode, struct file *file); 36static int autofs4_dir_open(struct inode *inode, struct file *file);
35static struct dentry *autofs4_lookup(struct inode *,struct dentry *, struct nameidata *); 37static struct dentry *autofs4_lookup(struct inode *,struct dentry *, struct nameidata *);
36static void *autofs4_follow_link(struct dentry *, struct nameidata *); 38static struct vfsmount *autofs4_d_automount(struct path *);
37 39static int autofs4_d_manage(struct dentry *, bool, bool);
38#define TRIGGER_FLAGS (LOOKUP_CONTINUE | LOOKUP_DIRECTORY) 40static void autofs4_dentry_release(struct dentry *);
39#define TRIGGER_INTENTS (LOOKUP_OPEN | LOOKUP_CREATE)
40 41
41const struct file_operations autofs4_root_operations = { 42const struct file_operations autofs4_root_operations = {
42 .open = dcache_dir_open, 43 .open = dcache_dir_open,
@@ -58,7 +59,7 @@ const struct file_operations autofs4_dir_operations = {
58 .llseek = dcache_dir_lseek, 59 .llseek = dcache_dir_lseek,
59}; 60};
60 61
61const struct inode_operations autofs4_indirect_root_inode_operations = { 62const struct inode_operations autofs4_dir_inode_operations = {
62 .lookup = autofs4_lookup, 63 .lookup = autofs4_lookup,
63 .unlink = autofs4_dir_unlink, 64 .unlink = autofs4_dir_unlink,
64 .symlink = autofs4_dir_symlink, 65 .symlink = autofs4_dir_symlink,
@@ -66,20 +67,10 @@ const struct inode_operations autofs4_indirect_root_inode_operations = {
66 .rmdir = autofs4_dir_rmdir, 67 .rmdir = autofs4_dir_rmdir,
67}; 68};
68 69
69const struct inode_operations autofs4_direct_root_inode_operations = { 70const struct dentry_operations autofs4_dentry_operations = {
70 .lookup = autofs4_lookup, 71 .d_automount = autofs4_d_automount,
71 .unlink = autofs4_dir_unlink, 72 .d_manage = autofs4_d_manage,
72 .mkdir = autofs4_dir_mkdir, 73 .d_release = autofs4_dentry_release,
73 .rmdir = autofs4_dir_rmdir,
74 .follow_link = autofs4_follow_link,
75};
76
77const struct inode_operations autofs4_dir_inode_operations = {
78 .lookup = autofs4_lookup,
79 .unlink = autofs4_dir_unlink,
80 .symlink = autofs4_dir_symlink,
81 .mkdir = autofs4_dir_mkdir,
82 .rmdir = autofs4_dir_rmdir,
83}; 74};
84 75
85static void autofs4_add_active(struct dentry *dentry) 76static void autofs4_add_active(struct dentry *dentry)
@@ -114,14 +105,6 @@ static void autofs4_del_active(struct dentry *dentry)
114 return; 105 return;
115} 106}
116 107
117static unsigned int autofs4_need_mount(unsigned int flags)
118{
119 unsigned int res = 0;
120 if (flags & (TRIGGER_FLAGS | TRIGGER_INTENTS))
121 res = 1;
122 return res;
123}
124
125static int autofs4_dir_open(struct inode *inode, struct file *file) 108static int autofs4_dir_open(struct inode *inode, struct file *file)
126{ 109{
127 struct dentry *dentry = file->f_path.dentry; 110 struct dentry *dentry = file->f_path.dentry;
@@ -142,275 +125,41 @@ static int autofs4_dir_open(struct inode *inode, struct file *file)
142 * autofs file system so just let the libfs routines handle 125 * autofs file system so just let the libfs routines handle
143 * it. 126 * it.
144 */ 127 */
145 spin_lock(&dcache_lock); 128 spin_lock(&autofs4_lock);
129 spin_lock(&dentry->d_lock);
146 if (!d_mountpoint(dentry) && list_empty(&dentry->d_subdirs)) { 130 if (!d_mountpoint(dentry) && list_empty(&dentry->d_subdirs)) {
147 spin_unlock(&dcache_lock); 131 spin_unlock(&dentry->d_lock);
132 spin_unlock(&autofs4_lock);
148 return -ENOENT; 133 return -ENOENT;
149 } 134 }
150 spin_unlock(&dcache_lock); 135 spin_unlock(&dentry->d_lock);
136 spin_unlock(&autofs4_lock);
151 137
152out: 138out:
153 return dcache_dir_open(inode, file); 139 return dcache_dir_open(inode, file);
154} 140}
155 141
156static int try_to_fill_dentry(struct dentry *dentry, int flags) 142static void autofs4_dentry_release(struct dentry *de)
157{
158 struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
159 struct autofs_info *ino = autofs4_dentry_ino(dentry);
160 int status;
161
162 DPRINTK("dentry=%p %.*s ino=%p",
163 dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode);
164
165 /*
166 * Wait for a pending mount, triggering one if there
167 * isn't one already
168 */
169 if (dentry->d_inode == NULL) {
170 DPRINTK("waiting for mount name=%.*s",
171 dentry->d_name.len, dentry->d_name.name);
172
173 status = autofs4_wait(sbi, dentry, NFY_MOUNT);
174
175 DPRINTK("mount done status=%d", status);
176
177 /* Turn this into a real negative dentry? */
178 if (status == -ENOENT) {
179 spin_lock(&sbi->fs_lock);
180 ino->flags &= ~AUTOFS_INF_PENDING;
181 spin_unlock(&sbi->fs_lock);
182 return status;
183 } else if (status) {
184 /* Return a negative dentry, but leave it "pending" */
185 return status;
186 }
187 /* Trigger mount for path component or follow link */
188 } else if (ino->flags & AUTOFS_INF_PENDING ||
189 autofs4_need_mount(flags)) {
190 DPRINTK("waiting for mount name=%.*s",
191 dentry->d_name.len, dentry->d_name.name);
192
193 spin_lock(&sbi->fs_lock);
194 ino->flags |= AUTOFS_INF_PENDING;
195 spin_unlock(&sbi->fs_lock);
196 status = autofs4_wait(sbi, dentry, NFY_MOUNT);
197
198 DPRINTK("mount done status=%d", status);
199
200 if (status) {
201 spin_lock(&sbi->fs_lock);
202 ino->flags &= ~AUTOFS_INF_PENDING;
203 spin_unlock(&sbi->fs_lock);
204 return status;
205 }
206 }
207
208 /* Initialize expiry counter after successful mount */
209 ino->last_used = jiffies;
210
211 spin_lock(&sbi->fs_lock);
212 ino->flags &= ~AUTOFS_INF_PENDING;
213 spin_unlock(&sbi->fs_lock);
214
215 return 0;
216}
217
218/* For autofs direct mounts the follow link triggers the mount */
219static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd)
220{
221 struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
222 struct autofs_info *ino = autofs4_dentry_ino(dentry);
223 int oz_mode = autofs4_oz_mode(sbi);
224 unsigned int lookup_type;
225 int status;
226
227 DPRINTK("dentry=%p %.*s oz_mode=%d nd->flags=%d",
228 dentry, dentry->d_name.len, dentry->d_name.name, oz_mode,
229 nd->flags);
230 /*
231 * For an expire of a covered direct or offset mount we need
232 * to break out of follow_down() at the autofs mount trigger
233 * (d_mounted--), so we can see the expiring flag, and manage
234 * the blocking and following here until the expire is completed.
235 */
236 if (oz_mode) {
237 spin_lock(&sbi->fs_lock);
238 if (ino->flags & AUTOFS_INF_EXPIRING) {
239 spin_unlock(&sbi->fs_lock);
240 /* Follow down to our covering mount. */
241 if (!follow_down(&nd->path))
242 goto done;
243 goto follow;
244 }
245 spin_unlock(&sbi->fs_lock);
246 goto done;
247 }
248
249 /* If an expire request is pending everyone must wait. */
250 autofs4_expire_wait(dentry);
251
252 /* We trigger a mount for almost all flags */
253 lookup_type = autofs4_need_mount(nd->flags);
254 spin_lock(&sbi->fs_lock);
255 spin_lock(&dcache_lock);
256 if (!(lookup_type || ino->flags & AUTOFS_INF_PENDING)) {
257 spin_unlock(&dcache_lock);
258 spin_unlock(&sbi->fs_lock);
259 goto follow;
260 }
261
262 /*
263 * If the dentry contains directories then it is an autofs
264 * multi-mount with no root mount offset. So don't try to
265 * mount it again.
266 */
267 if (ino->flags & AUTOFS_INF_PENDING ||
268 (!d_mountpoint(dentry) && list_empty(&dentry->d_subdirs))) {
269 spin_unlock(&dcache_lock);
270 spin_unlock(&sbi->fs_lock);
271
272 status = try_to_fill_dentry(dentry, nd->flags);
273 if (status)
274 goto out_error;
275
276 goto follow;
277 }
278 spin_unlock(&dcache_lock);
279 spin_unlock(&sbi->fs_lock);
280follow:
281 /*
282 * If there is no root mount it must be an autofs
283 * multi-mount with no root offset so we don't need
284 * to follow it.
285 */
286 if (d_mountpoint(dentry)) {
287 if (!autofs4_follow_mount(&nd->path)) {
288 status = -ENOENT;
289 goto out_error;
290 }
291 }
292
293done:
294 return NULL;
295
296out_error:
297 path_put(&nd->path);
298 return ERR_PTR(status);
299}
300
301/*
302 * Revalidate is called on every cache lookup. Some of those
303 * cache lookups may actually happen while the dentry is not
304 * yet completely filled in, and revalidate has to delay such
305 * lookups..
306 */
307static int autofs4_revalidate(struct dentry *dentry, struct nameidata *nd)
308{ 143{
309 struct inode *dir = dentry->d_parent->d_inode; 144 struct autofs_info *ino = autofs4_dentry_ino(de);
310 struct autofs_sb_info *sbi = autofs4_sbi(dir->i_sb); 145 struct autofs_sb_info *sbi = autofs4_sbi(de->d_sb);
311 int oz_mode = autofs4_oz_mode(sbi);
312 int flags = nd ? nd->flags : 0;
313 int status = 1;
314
315 /* Pending dentry */
316 spin_lock(&sbi->fs_lock);
317 if (autofs4_ispending(dentry)) {
318 /* The daemon never causes a mount to trigger */
319 spin_unlock(&sbi->fs_lock);
320
321 if (oz_mode)
322 return 1;
323
324 /*
325 * If the directory has gone away due to an expire
326 * we have been called as ->d_revalidate() and so
327 * we need to return false and proceed to ->lookup().
328 */
329 if (autofs4_expire_wait(dentry) == -EAGAIN)
330 return 0;
331
332 /*
333 * A zero status is success otherwise we have a
334 * negative error code.
335 */
336 status = try_to_fill_dentry(dentry, flags);
337 if (status == 0)
338 return 1;
339
340 return status;
341 }
342 spin_unlock(&sbi->fs_lock);
343
344 /* Negative dentry.. invalidate if "old" */
345 if (dentry->d_inode == NULL)
346 return 0;
347
348 /* Check for a non-mountpoint directory with no contents */
349 spin_lock(&dcache_lock);
350 if (S_ISDIR(dentry->d_inode->i_mode) &&
351 !d_mountpoint(dentry) && list_empty(&dentry->d_subdirs)) {
352 DPRINTK("dentry=%p %.*s, emptydir",
353 dentry, dentry->d_name.len, dentry->d_name.name);
354 spin_unlock(&dcache_lock);
355
356 /* The daemon never causes a mount to trigger */
357 if (oz_mode)
358 return 1;
359
360 /*
361 * A zero status is success otherwise we have a
362 * negative error code.
363 */
364 status = try_to_fill_dentry(dentry, flags);
365 if (status == 0)
366 return 1;
367
368 return status;
369 }
370 spin_unlock(&dcache_lock);
371
372 return 1;
373}
374
375void autofs4_dentry_release(struct dentry *de)
376{
377 struct autofs_info *inf;
378 146
379 DPRINTK("releasing %p", de); 147 DPRINTK("releasing %p", de);
380 148
381 inf = autofs4_dentry_ino(de); 149 if (!ino)
382 de->d_fsdata = NULL; 150 return;
383
384 if (inf) {
385 struct autofs_sb_info *sbi = autofs4_sbi(de->d_sb);
386
387 if (sbi) {
388 spin_lock(&sbi->lookup_lock);
389 if (!list_empty(&inf->active))
390 list_del(&inf->active);
391 if (!list_empty(&inf->expiring))
392 list_del(&inf->expiring);
393 spin_unlock(&sbi->lookup_lock);
394 }
395
396 inf->dentry = NULL;
397 inf->inode = NULL;
398 151
399 autofs4_free_ino(inf); 152 if (sbi) {
153 spin_lock(&sbi->lookup_lock);
154 if (!list_empty(&ino->active))
155 list_del(&ino->active);
156 if (!list_empty(&ino->expiring))
157 list_del(&ino->expiring);
158 spin_unlock(&sbi->lookup_lock);
400 } 159 }
401}
402 160
403/* For dentries of directories in the root dir */ 161 autofs4_free_ino(ino);
404static const struct dentry_operations autofs4_root_dentry_operations = { 162}
405 .d_revalidate = autofs4_revalidate,
406 .d_release = autofs4_dentry_release,
407};
408
409/* For other dentries */
410static const struct dentry_operations autofs4_dentry_operations = {
411 .d_revalidate = autofs4_revalidate,
412 .d_release = autofs4_dentry_release,
413};
414 163
415static struct dentry *autofs4_lookup_active(struct dentry *dentry) 164static struct dentry *autofs4_lookup_active(struct dentry *dentry)
416{ 165{
@@ -422,7 +171,7 @@ static struct dentry *autofs4_lookup_active(struct dentry *dentry)
422 const unsigned char *str = name->name; 171 const unsigned char *str = name->name;
423 struct list_head *p, *head; 172 struct list_head *p, *head;
424 173
425 spin_lock(&dcache_lock); 174 spin_lock(&autofs4_lock);
426 spin_lock(&sbi->lookup_lock); 175 spin_lock(&sbi->lookup_lock);
427 head = &sbi->active_list; 176 head = &sbi->active_list;
428 list_for_each(p, head) { 177 list_for_each(p, head) {
@@ -436,7 +185,7 @@ static struct dentry *autofs4_lookup_active(struct dentry *dentry)
436 spin_lock(&active->d_lock); 185 spin_lock(&active->d_lock);
437 186
438 /* Already gone? */ 187 /* Already gone? */
439 if (atomic_read(&active->d_count) == 0) 188 if (active->d_count == 0)
440 goto next; 189 goto next;
441 190
442 qstr = &active->d_name; 191 qstr = &active->d_name;
@@ -452,17 +201,17 @@ static struct dentry *autofs4_lookup_active(struct dentry *dentry)
452 goto next; 201 goto next;
453 202
454 if (d_unhashed(active)) { 203 if (d_unhashed(active)) {
455 dget(active); 204 dget_dlock(active);
456 spin_unlock(&active->d_lock); 205 spin_unlock(&active->d_lock);
457 spin_unlock(&sbi->lookup_lock); 206 spin_unlock(&sbi->lookup_lock);
458 spin_unlock(&dcache_lock); 207 spin_unlock(&autofs4_lock);
459 return active; 208 return active;
460 } 209 }
461next: 210next:
462 spin_unlock(&active->d_lock); 211 spin_unlock(&active->d_lock);
463 } 212 }
464 spin_unlock(&sbi->lookup_lock); 213 spin_unlock(&sbi->lookup_lock);
465 spin_unlock(&dcache_lock); 214 spin_unlock(&autofs4_lock);
466 215
467 return NULL; 216 return NULL;
468} 217}
@@ -477,7 +226,7 @@ static struct dentry *autofs4_lookup_expiring(struct dentry *dentry)
477 const unsigned char *str = name->name; 226 const unsigned char *str = name->name;
478 struct list_head *p, *head; 227 struct list_head *p, *head;
479 228
480 spin_lock(&dcache_lock); 229 spin_lock(&autofs4_lock);
481 spin_lock(&sbi->lookup_lock); 230 spin_lock(&sbi->lookup_lock);
482 head = &sbi->expiring_list; 231 head = &sbi->expiring_list;
483 list_for_each(p, head) { 232 list_for_each(p, head) {
@@ -507,66 +256,261 @@ static struct dentry *autofs4_lookup_expiring(struct dentry *dentry)
507 goto next; 256 goto next;
508 257
509 if (d_unhashed(expiring)) { 258 if (d_unhashed(expiring)) {
510 dget(expiring); 259 dget_dlock(expiring);
511 spin_unlock(&expiring->d_lock); 260 spin_unlock(&expiring->d_lock);
512 spin_unlock(&sbi->lookup_lock); 261 spin_unlock(&sbi->lookup_lock);
513 spin_unlock(&dcache_lock); 262 spin_unlock(&autofs4_lock);
514 return expiring; 263 return expiring;
515 } 264 }
516next: 265next:
517 spin_unlock(&expiring->d_lock); 266 spin_unlock(&expiring->d_lock);
518 } 267 }
519 spin_unlock(&sbi->lookup_lock); 268 spin_unlock(&sbi->lookup_lock);
520 spin_unlock(&dcache_lock); 269 spin_unlock(&autofs4_lock);
270
271 return NULL;
272}
273
274static int autofs4_mount_wait(struct dentry *dentry)
275{
276 struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
277 struct autofs_info *ino = autofs4_dentry_ino(dentry);
278 int status;
279
280 if (ino->flags & AUTOFS_INF_PENDING) {
281 DPRINTK("waiting for mount name=%.*s",
282 dentry->d_name.len, dentry->d_name.name);
283 status = autofs4_wait(sbi, dentry, NFY_MOUNT);
284 DPRINTK("mount wait done status=%d", status);
285 ino->last_used = jiffies;
286 return status;
287 }
288 return 0;
289}
290
291static int do_expire_wait(struct dentry *dentry)
292{
293 struct dentry *expiring;
294
295 expiring = autofs4_lookup_expiring(dentry);
296 if (!expiring)
297 return autofs4_expire_wait(dentry);
298 else {
299 /*
300 * If we are racing with expire the request might not
301 * be quite complete, but the directory has been removed
302 * so it must have been successful, just wait for it.
303 */
304 autofs4_expire_wait(expiring);
305 autofs4_del_expiring(expiring);
306 dput(expiring);
307 }
308 return 0;
309}
310
311static struct dentry *autofs4_mountpoint_changed(struct path *path)
312{
313 struct dentry *dentry = path->dentry;
314 struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
315
316 /*
317 * If this is an indirect mount the dentry could have gone away
318 * as a result of an expire and a new one created.
319 */
320 if (autofs_type_indirect(sbi->type) && d_unhashed(dentry)) {
321 struct dentry *parent = dentry->d_parent;
322 struct dentry *new = d_lookup(parent, &dentry->d_name);
323 if (!new)
324 return NULL;
325 dput(path->dentry);
326 path->dentry = new;
327 }
328 return path->dentry;
329}
330
331static struct vfsmount *autofs4_d_automount(struct path *path)
332{
333 struct dentry *dentry = path->dentry;
334 struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
335 struct autofs_info *ino = autofs4_dentry_ino(dentry);
336 int status;
337
338 DPRINTK("dentry=%p %.*s",
339 dentry, dentry->d_name.len, dentry->d_name.name);
340
341 /*
342 * Someone may have manually umounted this or it was a submount
343 * that has gone away.
344 */
345 spin_lock(&dentry->d_lock);
346 if (!d_mountpoint(dentry) && list_empty(&dentry->d_subdirs)) {
347 if (!(dentry->d_flags & DCACHE_MANAGE_TRANSIT) &&
348 (dentry->d_flags & DCACHE_NEED_AUTOMOUNT))
349 __managed_dentry_set_transit(path->dentry);
350 }
351 spin_unlock(&dentry->d_lock);
352
353 /* The daemon never triggers a mount. */
354 if (autofs4_oz_mode(sbi))
355 return NULL;
356
357 /*
358 * If an expire request is pending everyone must wait.
359 * If the expire fails we're still mounted so continue
360 * the follow and return. A return of -EAGAIN (which only
361 * happens with indirect mounts) means the expire completed
362 * and the directory was removed, so just go ahead and try
363 * the mount.
364 */
365 status = do_expire_wait(dentry);
366 if (status && status != -EAGAIN)
367 return NULL;
368
369 /* Callback to the daemon to perform the mount or wait */
370 spin_lock(&sbi->fs_lock);
371 if (ino->flags & AUTOFS_INF_PENDING) {
372 spin_unlock(&sbi->fs_lock);
373 status = autofs4_mount_wait(dentry);
374 if (status)
375 return ERR_PTR(status);
376 spin_lock(&sbi->fs_lock);
377 goto done;
378 }
379
380 /*
381 * If the dentry is a symlink it's equivalent to a directory
382 * having d_mountpoint() true, so there's no need to call back
383 * to the daemon.
384 */
385 if (dentry->d_inode && S_ISLNK(dentry->d_inode->i_mode))
386 goto done;
387 if (!d_mountpoint(dentry)) {
388 /*
389 * It's possible that user space hasn't removed directories
390 * after umounting a rootless multi-mount, although it
391 * should. For v5 have_submounts() is sufficient to handle
392 * this because the leaves of the directory tree under the
393 * mount never trigger mounts themselves (they have an autofs
394 * trigger mount mounted on them). But v4 pseudo direct mounts
395 * do need the leaves to trigger mounts. In this case we
396 * have no choice but to use the list_empty() check and
397 * require user space behave.
398 */
399 if (sbi->version > 4) {
400 if (have_submounts(dentry))
401 goto done;
402 } else {
403 spin_lock(&dentry->d_lock);
404 if (!list_empty(&dentry->d_subdirs)) {
405 spin_unlock(&dentry->d_lock);
406 goto done;
407 }
408 spin_unlock(&dentry->d_lock);
409 }
410 ino->flags |= AUTOFS_INF_PENDING;
411 spin_unlock(&sbi->fs_lock);
412 status = autofs4_mount_wait(dentry);
413 if (status)
414 return ERR_PTR(status);
415 spin_lock(&sbi->fs_lock);
416 ino->flags &= ~AUTOFS_INF_PENDING;
417 }
418done:
419 if (!(ino->flags & AUTOFS_INF_EXPIRING)) {
420 /*
421 * Any needed mounting has been completed and the path updated
422 * so turn this into a normal dentry so we don't continually
423 * call ->d_automount() and ->d_manage().
424 */
425 spin_lock(&dentry->d_lock);
426 __managed_dentry_clear_transit(dentry);
427 /*
428 * Only clear DMANAGED_AUTOMOUNT for rootless multi-mounts and
429 * symlinks as in all other cases the dentry will be covered by
430 * an actual mount so ->d_automount() won't be called during
431 * the follow.
432 */
433 if ((!d_mountpoint(dentry) &&
434 !list_empty(&dentry->d_subdirs)) ||
435 (dentry->d_inode && S_ISLNK(dentry->d_inode->i_mode)))
436 __managed_dentry_clear_automount(dentry);
437 spin_unlock(&dentry->d_lock);
438 }
439 spin_unlock(&sbi->fs_lock);
440
441 /* Mount succeeded, check if we ended up with a new dentry */
442 dentry = autofs4_mountpoint_changed(path);
443 if (!dentry)
444 return ERR_PTR(-ENOENT);
521 445
522 return NULL; 446 return NULL;
523} 447}
524 448
449int autofs4_d_manage(struct dentry *dentry, bool mounting_here, bool rcu_walk)
450{
451 struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
452
453 DPRINTK("dentry=%p %.*s",
454 dentry, dentry->d_name.len, dentry->d_name.name);
455
456 /* The daemon never waits. */
457 if (autofs4_oz_mode(sbi) || mounting_here) {
458 if (!d_mountpoint(dentry))
459 return -EISDIR;
460 return 0;
461 }
462
463 /* We need to sleep, so we need pathwalk to be in ref-mode */
464 if (rcu_walk)
465 return -ECHILD;
466
467 /* Wait for pending expires */
468 do_expire_wait(dentry);
469
470 /*
471 * This dentry may be under construction so wait on mount
472 * completion.
473 */
474 return autofs4_mount_wait(dentry);
475}
476
525/* Lookups in the root directory */ 477/* Lookups in the root directory */
526static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) 478static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
527{ 479{
528 struct autofs_sb_info *sbi; 480 struct autofs_sb_info *sbi;
529 struct autofs_info *ino; 481 struct autofs_info *ino;
530 struct dentry *expiring, *active; 482 struct dentry *active;
531 int oz_mode;
532 483
533 DPRINTK("name = %.*s", 484 DPRINTK("name = %.*s", dentry->d_name.len, dentry->d_name.name);
534 dentry->d_name.len, dentry->d_name.name);
535 485
536 /* File name too long to exist */ 486 /* File name too long to exist */
537 if (dentry->d_name.len > NAME_MAX) 487 if (dentry->d_name.len > NAME_MAX)
538 return ERR_PTR(-ENAMETOOLONG); 488 return ERR_PTR(-ENAMETOOLONG);
539 489
540 sbi = autofs4_sbi(dir->i_sb); 490 sbi = autofs4_sbi(dir->i_sb);
541 oz_mode = autofs4_oz_mode(sbi);
542 491
543 DPRINTK("pid = %u, pgrp = %u, catatonic = %d, oz_mode = %d", 492 DPRINTK("pid = %u, pgrp = %u, catatonic = %d, oz_mode = %d",
544 current->pid, task_pgrp_nr(current), sbi->catatonic, oz_mode); 493 current->pid, task_pgrp_nr(current), sbi->catatonic,
494 autofs4_oz_mode(sbi));
545 495
546 active = autofs4_lookup_active(dentry); 496 active = autofs4_lookup_active(dentry);
547 if (active) { 497 if (active) {
548 dentry = active; 498 return active;
549 ino = autofs4_dentry_ino(dentry);
550 } else { 499 } else {
551 /* 500 /*
552 * Mark the dentry incomplete but don't hash it. We do this 501 * A dentry that is not within the root can never trigger a
553 * to serialize our inode creation operations (symlink and 502 * mount operation, unless the directory already exists, so we
554 * mkdir) which prevents deadlock during the callback to 503 * can return fail immediately. The daemon however does need
555 * the daemon. Subsequent user space lookups for the same 504 * to create directories within the file system.
556 * dentry are placed on the wait queue while the daemon
557 * itself is allowed passage unresticted so the create
558 * operation itself can then hash the dentry. Finally,
559 * we check for the hashed dentry and return the newly
560 * hashed dentry.
561 */ 505 */
562 dentry->d_op = &autofs4_root_dentry_operations; 506 if (!autofs4_oz_mode(sbi) && !IS_ROOT(dentry->d_parent))
507 return ERR_PTR(-ENOENT);
563 508
564 /* 509 /* Mark entries in the root as mount triggers */
565 * And we need to ensure that the same dentry is used for 510 if (autofs_type_indirect(sbi->type) && IS_ROOT(dentry->d_parent))
566 * all following lookup calls until it is hashed so that 511 __managed_dentry_set_managed(dentry);
567 * the dentry flags are persistent throughout the request. 512
568 */ 513 ino = autofs4_new_ino(sbi);
569 ino = autofs4_init_ino(NULL, sbi, 0555);
570 if (!ino) 514 if (!ino)
571 return ERR_PTR(-ENOMEM); 515 return ERR_PTR(-ENOMEM);
572 516
@@ -577,82 +521,6 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, s
577 521
578 d_instantiate(dentry, NULL); 522 d_instantiate(dentry, NULL);
579 } 523 }
580
581 if (!oz_mode) {
582 mutex_unlock(&dir->i_mutex);
583 expiring = autofs4_lookup_expiring(dentry);
584 if (expiring) {
585 /*
586 * If we are racing with expire the request might not
587 * be quite complete but the directory has been removed
588 * so it must have been successful, so just wait for it.
589 */
590 autofs4_expire_wait(expiring);
591 autofs4_del_expiring(expiring);
592 dput(expiring);
593 }
594
595 spin_lock(&sbi->fs_lock);
596 ino->flags |= AUTOFS_INF_PENDING;
597 spin_unlock(&sbi->fs_lock);
598 if (dentry->d_op && dentry->d_op->d_revalidate)
599 (dentry->d_op->d_revalidate)(dentry, nd);
600 mutex_lock(&dir->i_mutex);
601 }
602
603 /*
604 * If we are still pending, check if we had to handle
605 * a signal. If so we can force a restart..
606 */
607 if (ino->flags & AUTOFS_INF_PENDING) {
608 /* See if we were interrupted */
609 if (signal_pending(current)) {
610 sigset_t *sigset = &current->pending.signal;
611 if (sigismember (sigset, SIGKILL) ||
612 sigismember (sigset, SIGQUIT) ||
613 sigismember (sigset, SIGINT)) {
614 if (active)
615 dput(active);
616 return ERR_PTR(-ERESTARTNOINTR);
617 }
618 }
619 if (!oz_mode) {
620 spin_lock(&sbi->fs_lock);
621 ino->flags &= ~AUTOFS_INF_PENDING;
622 spin_unlock(&sbi->fs_lock);
623 }
624 }
625
626 /*
627 * If this dentry is unhashed, then we shouldn't honour this
628 * lookup. Returning ENOENT here doesn't do the right thing
629 * for all system calls, but it should be OK for the operations
630 * we permit from an autofs.
631 */
632 if (!oz_mode && d_unhashed(dentry)) {
633 /*
634 * A user space application can (and has done in the past)
635 * remove and re-create this directory during the callback.
636 * This can leave us with an unhashed dentry, but a
637 * successful mount! So we need to perform another
638 * cached lookup in case the dentry now exists.
639 */
640 struct dentry *parent = dentry->d_parent;
641 struct dentry *new = d_lookup(parent, &dentry->d_name);
642 if (new != NULL)
643 dentry = new;
644 else
645 dentry = ERR_PTR(-ENOENT);
646
647 if (active)
648 dput(active);
649
650 return dentry;
651 }
652
653 if (active)
654 return active;
655
656 return NULL; 524 return NULL;
657} 525}
658 526
@@ -664,6 +532,7 @@ static int autofs4_dir_symlink(struct inode *dir,
664 struct autofs_info *ino = autofs4_dentry_ino(dentry); 532 struct autofs_info *ino = autofs4_dentry_ino(dentry);
665 struct autofs_info *p_ino; 533 struct autofs_info *p_ino;
666 struct inode *inode; 534 struct inode *inode;
535 size_t size = strlen(symname);
667 char *cp; 536 char *cp;
668 537
669 DPRINTK("%s <- %.*s", symname, 538 DPRINTK("%s <- %.*s", symname,
@@ -672,45 +541,35 @@ static int autofs4_dir_symlink(struct inode *dir,
672 if (!autofs4_oz_mode(sbi)) 541 if (!autofs4_oz_mode(sbi))
673 return -EACCES; 542 return -EACCES;
674 543
675 ino = autofs4_init_ino(ino, sbi, S_IFLNK | 0555); 544 BUG_ON(!ino);
676 if (!ino) 545
677 return -ENOMEM; 546 autofs4_clean_ino(ino);
678 547
679 autofs4_del_active(dentry); 548 autofs4_del_active(dentry);
680 549
681 ino->size = strlen(symname); 550 cp = kmalloc(size + 1, GFP_KERNEL);
682 cp = kmalloc(ino->size + 1, GFP_KERNEL); 551 if (!cp)
683 if (!cp) {
684 if (!dentry->d_fsdata)
685 kfree(ino);
686 return -ENOMEM; 552 return -ENOMEM;
687 }
688 553
689 strcpy(cp, symname); 554 strcpy(cp, symname);
690 555
691 inode = autofs4_get_inode(dir->i_sb, ino); 556 inode = autofs4_get_inode(dir->i_sb, S_IFLNK | 0555);
692 if (!inode) { 557 if (!inode) {
693 kfree(cp); 558 kfree(cp);
694 if (!dentry->d_fsdata) 559 if (!dentry->d_fsdata)
695 kfree(ino); 560 kfree(ino);
696 return -ENOMEM; 561 return -ENOMEM;
697 } 562 }
563 inode->i_private = cp;
564 inode->i_size = size;
698 d_add(dentry, inode); 565 d_add(dentry, inode);
699 566
700 if (dir == dir->i_sb->s_root->d_inode) 567 dget(dentry);
701 dentry->d_op = &autofs4_root_dentry_operations;
702 else
703 dentry->d_op = &autofs4_dentry_operations;
704
705 dentry->d_fsdata = ino;
706 ino->dentry = dget(dentry);
707 atomic_inc(&ino->count); 568 atomic_inc(&ino->count);
708 p_ino = autofs4_dentry_ino(dentry->d_parent); 569 p_ino = autofs4_dentry_ino(dentry->d_parent);
709 if (p_ino && dentry->d_parent != dentry) 570 if (p_ino && dentry->d_parent != dentry)
710 atomic_inc(&p_ino->count); 571 atomic_inc(&p_ino->count);
711 ino->inode = inode;
712 572
713 ino->u.symlink = cp;
714 dir->i_mtime = CURRENT_TIME; 573 dir->i_mtime = CURRENT_TIME;
715 574
716 return 0; 575 return 0;
@@ -753,16 +612,68 @@ static int autofs4_dir_unlink(struct inode *dir, struct dentry *dentry)
753 612
754 dir->i_mtime = CURRENT_TIME; 613 dir->i_mtime = CURRENT_TIME;
755 614
756 spin_lock(&dcache_lock); 615 spin_lock(&autofs4_lock);
757 autofs4_add_expiring(dentry); 616 autofs4_add_expiring(dentry);
758 spin_lock(&dentry->d_lock); 617 spin_lock(&dentry->d_lock);
759 __d_drop(dentry); 618 __d_drop(dentry);
760 spin_unlock(&dentry->d_lock); 619 spin_unlock(&dentry->d_lock);
761 spin_unlock(&dcache_lock); 620 spin_unlock(&autofs4_lock);
762 621
763 return 0; 622 return 0;
764} 623}
765 624
625/*
626 * Version 4 of autofs provides a pseudo direct mount implementation
627 * that relies on directories at the leaves of a directory tree under
628 * an indirect mount to trigger mounts. To allow for this we need to
629 * set the DMANAGED_AUTOMOUNT and DMANAGED_TRANSIT flags on the leaves
630 * of the directory tree. There is no need to clear the automount flag
631 * following a mount or restore it after an expire because these mounts
632 * are always covered. However, it is necessary to ensure that these
633 * flags are clear on non-empty directories to avoid unnecessary calls
634 * during path walks.
635 */
636static void autofs_set_leaf_automount_flags(struct dentry *dentry)
637{
638 struct dentry *parent;
639
640 /* root and dentrys in the root are already handled */
641 if (IS_ROOT(dentry->d_parent))
642 return;
643
644 managed_dentry_set_managed(dentry);
645
646 parent = dentry->d_parent;
647 /* only consider parents below dentrys in the root */
648 if (IS_ROOT(parent->d_parent))
649 return;
650 managed_dentry_clear_managed(parent);
651 return;
652}
653
654static void autofs_clear_leaf_automount_flags(struct dentry *dentry)
655{
656 struct list_head *d_child;
657 struct dentry *parent;
658
659 /* flags for dentrys in the root are handled elsewhere */
660 if (IS_ROOT(dentry->d_parent))
661 return;
662
663 managed_dentry_clear_managed(dentry);
664
665 parent = dentry->d_parent;
666 /* only consider parents below dentrys in the root */
667 if (IS_ROOT(parent->d_parent))
668 return;
669 d_child = &dentry->d_u.d_child;
670 /* Set parent managed if it's becoming empty */
671 if (d_child->next == &parent->d_subdirs &&
672 d_child->prev == &parent->d_subdirs)
673 managed_dentry_set_managed(parent);
674 return;
675}
676
766static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry) 677static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry)
767{ 678{
768 struct autofs_sb_info *sbi = autofs4_sbi(dir->i_sb); 679 struct autofs_sb_info *sbi = autofs4_sbi(dir->i_sb);
@@ -775,16 +686,23 @@ static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry)
775 if (!autofs4_oz_mode(sbi)) 686 if (!autofs4_oz_mode(sbi))
776 return -EACCES; 687 return -EACCES;
777 688
778 spin_lock(&dcache_lock); 689 spin_lock(&autofs4_lock);
690 spin_lock(&sbi->lookup_lock);
691 spin_lock(&dentry->d_lock);
779 if (!list_empty(&dentry->d_subdirs)) { 692 if (!list_empty(&dentry->d_subdirs)) {
780 spin_unlock(&dcache_lock); 693 spin_unlock(&dentry->d_lock);
694 spin_unlock(&sbi->lookup_lock);
695 spin_unlock(&autofs4_lock);
781 return -ENOTEMPTY; 696 return -ENOTEMPTY;
782 } 697 }
783 autofs4_add_expiring(dentry); 698 __autofs4_add_expiring(dentry);
784 spin_lock(&dentry->d_lock); 699 spin_unlock(&sbi->lookup_lock);
785 __d_drop(dentry); 700 __d_drop(dentry);
786 spin_unlock(&dentry->d_lock); 701 spin_unlock(&dentry->d_lock);
787 spin_unlock(&dcache_lock); 702 spin_unlock(&autofs4_lock);
703
704 if (sbi->version < 5)
705 autofs_clear_leaf_automount_flags(dentry);
788 706
789 if (atomic_dec_and_test(&ino->count)) { 707 if (atomic_dec_and_test(&ino->count)) {
790 p_ino = autofs4_dentry_ino(dentry->d_parent); 708 p_ino = autofs4_dentry_ino(dentry->d_parent);
@@ -814,32 +732,25 @@ static int autofs4_dir_mkdir(struct inode *dir, struct dentry *dentry, int mode)
814 DPRINTK("dentry %p, creating %.*s", 732 DPRINTK("dentry %p, creating %.*s",
815 dentry, dentry->d_name.len, dentry->d_name.name); 733 dentry, dentry->d_name.len, dentry->d_name.name);
816 734
817 ino = autofs4_init_ino(ino, sbi, S_IFDIR | 0555); 735 BUG_ON(!ino);
818 if (!ino) 736
819 return -ENOMEM; 737 autofs4_clean_ino(ino);
820 738
821 autofs4_del_active(dentry); 739 autofs4_del_active(dentry);
822 740
823 inode = autofs4_get_inode(dir->i_sb, ino); 741 inode = autofs4_get_inode(dir->i_sb, S_IFDIR | 0555);
824 if (!inode) { 742 if (!inode)
825 if (!dentry->d_fsdata)
826 kfree(ino);
827 return -ENOMEM; 743 return -ENOMEM;
828 }
829 d_add(dentry, inode); 744 d_add(dentry, inode);
830 745
831 if (dir == dir->i_sb->s_root->d_inode) 746 if (sbi->version < 5)
832 dentry->d_op = &autofs4_root_dentry_operations; 747 autofs_set_leaf_automount_flags(dentry);
833 else
834 dentry->d_op = &autofs4_dentry_operations;
835 748
836 dentry->d_fsdata = ino; 749 dget(dentry);
837 ino->dentry = dget(dentry);
838 atomic_inc(&ino->count); 750 atomic_inc(&ino->count);
839 p_ino = autofs4_dentry_ino(dentry->d_parent); 751 p_ino = autofs4_dentry_ino(dentry->d_parent);
840 if (p_ino && dentry->d_parent != dentry) 752 if (p_ino && dentry->d_parent != dentry)
841 atomic_inc(&p_ino->count); 753 atomic_inc(&p_ino->count);
842 ino->inode = inode;
843 inc_nlink(dir); 754 inc_nlink(dir);
844 dir->i_mtime = CURRENT_TIME; 755 dir->i_mtime = CURRENT_TIME;
845 756
@@ -921,8 +832,7 @@ static inline int autofs4_ask_umount(struct vfsmount *mnt, int __user *p)
921int is_autofs4_dentry(struct dentry *dentry) 832int is_autofs4_dentry(struct dentry *dentry)
922{ 833{
923 return dentry && dentry->d_inode && 834 return dentry && dentry->d_inode &&
924 (dentry->d_op == &autofs4_root_dentry_operations || 835 dentry->d_op == &autofs4_dentry_operations &&
925 dentry->d_op == &autofs4_dentry_operations) &&
926 dentry->d_fsdata != NULL; 836 dentry->d_fsdata != NULL;
927} 837}
928 838
@@ -980,19 +890,11 @@ static int autofs4_root_ioctl_unlocked(struct inode *inode, struct file *filp,
980 } 890 }
981} 891}
982 892
983static DEFINE_MUTEX(autofs4_ioctl_mutex);
984
985static long autofs4_root_ioctl(struct file *filp, 893static long autofs4_root_ioctl(struct file *filp,
986 unsigned int cmd, unsigned long arg) 894 unsigned int cmd, unsigned long arg)
987{ 895{
988 long ret;
989 struct inode *inode = filp->f_dentry->d_inode; 896 struct inode *inode = filp->f_dentry->d_inode;
990 897 return autofs4_root_ioctl_unlocked(inode, filp, cmd, arg);
991 mutex_lock(&autofs4_ioctl_mutex);
992 ret = autofs4_root_ioctl_unlocked(inode, filp, cmd, arg);
993 mutex_unlock(&autofs4_ioctl_mutex);
994
995 return ret;
996} 898}
997 899
998#ifdef CONFIG_COMPAT 900#ifdef CONFIG_COMPAT
@@ -1002,13 +904,11 @@ static long autofs4_root_compat_ioctl(struct file *filp,
1002 struct inode *inode = filp->f_path.dentry->d_inode; 904 struct inode *inode = filp->f_path.dentry->d_inode;
1003 int ret; 905 int ret;
1004 906
1005 mutex_lock(&autofs4_ioctl_mutex);
1006 if (cmd == AUTOFS_IOC_READY || cmd == AUTOFS_IOC_FAIL) 907 if (cmd == AUTOFS_IOC_READY || cmd == AUTOFS_IOC_FAIL)
1007 ret = autofs4_root_ioctl_unlocked(inode, filp, cmd, arg); 908 ret = autofs4_root_ioctl_unlocked(inode, filp, cmd, arg);
1008 else 909 else
1009 ret = autofs4_root_ioctl_unlocked(inode, filp, cmd, 910 ret = autofs4_root_ioctl_unlocked(inode, filp, cmd,
1010 (unsigned long)compat_ptr(arg)); 911 (unsigned long)compat_ptr(arg));
1011 mutex_unlock(&autofs4_ioctl_mutex);
1012 912
1013 return ret; 913 return ret;
1014} 914}
diff --git a/fs/autofs4/symlink.c b/fs/autofs4/symlink.c
index b4ea82934d2..f27c094a191 100644
--- a/fs/autofs4/symlink.c
+++ b/fs/autofs4/symlink.c
@@ -14,8 +14,7 @@
14 14
15static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd) 15static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd)
16{ 16{
17 struct autofs_info *ino = autofs4_dentry_ino(dentry); 17 nd_set_link(nd, dentry->d_inode->i_private);
18 nd_set_link(nd, (char *)ino->u.symlink);
19 return NULL; 18 return NULL;
20} 19}
21 20
diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c
index 2341375386f..56010056b2e 100644
--- a/fs/autofs4/waitq.c
+++ b/fs/autofs4/waitq.c
@@ -186,16 +186,26 @@ static int autofs4_getpath(struct autofs_sb_info *sbi,
186{ 186{
187 struct dentry *root = sbi->sb->s_root; 187 struct dentry *root = sbi->sb->s_root;
188 struct dentry *tmp; 188 struct dentry *tmp;
189 char *buf = *name; 189 char *buf;
190 char *p; 190 char *p;
191 int len = 0; 191 int len;
192 unsigned seq;
192 193
193 spin_lock(&dcache_lock); 194rename_retry:
195 buf = *name;
196 len = 0;
197
198 seq = read_seqbegin(&rename_lock);
199 rcu_read_lock();
200 spin_lock(&autofs4_lock);
194 for (tmp = dentry ; tmp != root ; tmp = tmp->d_parent) 201 for (tmp = dentry ; tmp != root ; tmp = tmp->d_parent)
195 len += tmp->d_name.len + 1; 202 len += tmp->d_name.len + 1;
196 203
197 if (!len || --len > NAME_MAX) { 204 if (!len || --len > NAME_MAX) {
198 spin_unlock(&dcache_lock); 205 spin_unlock(&autofs4_lock);
206 rcu_read_unlock();
207 if (read_seqretry(&rename_lock, seq))
208 goto rename_retry;
199 return 0; 209 return 0;
200 } 210 }
201 211
@@ -208,7 +218,10 @@ static int autofs4_getpath(struct autofs_sb_info *sbi,
208 p -= tmp->d_name.len; 218 p -= tmp->d_name.len;
209 strncpy(p, tmp->d_name.name, tmp->d_name.len); 219 strncpy(p, tmp->d_name.name, tmp->d_name.len);
210 } 220 }
211 spin_unlock(&dcache_lock); 221 spin_unlock(&autofs4_lock);
222 rcu_read_unlock();
223 if (read_seqretry(&rename_lock, seq))
224 goto rename_retry;
212 225
213 return len; 226 return len;
214} 227}
@@ -296,6 +309,9 @@ static int validate_request(struct autofs_wait_queue **wait,
296 * completed while we waited on the mutex ... 309 * completed while we waited on the mutex ...
297 */ 310 */
298 if (notify == NFY_MOUNT) { 311 if (notify == NFY_MOUNT) {
312 struct dentry *new = NULL;
313 int valid = 1;
314
299 /* 315 /*
300 * If the dentry was successfully mounted while we slept 316 * If the dentry was successfully mounted while we slept
301 * on the wait queue mutex we can return success. If it 317 * on the wait queue mutex we can return success. If it
@@ -303,8 +319,20 @@ static int validate_request(struct autofs_wait_queue **wait,
303 * a multi-mount with no mount at its base) we can 319 * a multi-mount with no mount at its base) we can
304 * continue on and create a new request. 320 * continue on and create a new request.
305 */ 321 */
322 if (!IS_ROOT(dentry)) {
323 if (dentry->d_inode && d_unhashed(dentry)) {
324 struct dentry *parent = dentry->d_parent;
325 new = d_lookup(parent, &dentry->d_name);
326 if (new)
327 dentry = new;
328 }
329 }
306 if (have_submounts(dentry)) 330 if (have_submounts(dentry))
307 return 0; 331 valid = 0;
332
333 if (new)
334 dput(new);
335 return valid;
308 } 336 }
309 337
310 return 1; 338 return 1;
diff --git a/fs/bad_inode.c b/fs/bad_inode.c
index f024d8aadde..9ad2369d9e3 100644
--- a/fs/bad_inode.c
+++ b/fs/bad_inode.c
@@ -229,8 +229,11 @@ static int bad_inode_readlink(struct dentry *dentry, char __user *buffer,
229 return -EIO; 229 return -EIO;
230} 230}
231 231
232static int bad_inode_permission(struct inode *inode, int mask) 232static int bad_inode_permission(struct inode *inode, int mask, unsigned int flags)
233{ 233{
234 if (flags & IPERM_FLAG_RCU)
235 return -ECHILD;
236
234 return -EIO; 237 return -EIO;
235} 238}
236 239
diff --git a/fs/befs/endian.h b/fs/befs/endian.h
index 6cb84d896d0..27223878ba9 100644
--- a/fs/befs/endian.h
+++ b/fs/befs/endian.h
@@ -102,22 +102,22 @@ cpu_to_fsrun(const struct super_block *sb, befs_block_run n)
102} 102}
103 103
104static inline befs_data_stream 104static inline befs_data_stream
105fsds_to_cpu(const struct super_block *sb, befs_disk_data_stream n) 105fsds_to_cpu(const struct super_block *sb, const befs_disk_data_stream *n)
106{ 106{
107 befs_data_stream data; 107 befs_data_stream data;
108 int i; 108 int i;
109 109
110 for (i = 0; i < BEFS_NUM_DIRECT_BLOCKS; ++i) 110 for (i = 0; i < BEFS_NUM_DIRECT_BLOCKS; ++i)
111 data.direct[i] = fsrun_to_cpu(sb, n.direct[i]); 111 data.direct[i] = fsrun_to_cpu(sb, n->direct[i]);
112 112
113 data.max_direct_range = fs64_to_cpu(sb, n.max_direct_range); 113 data.max_direct_range = fs64_to_cpu(sb, n->max_direct_range);
114 data.indirect = fsrun_to_cpu(sb, n.indirect); 114 data.indirect = fsrun_to_cpu(sb, n->indirect);
115 data.max_indirect_range = fs64_to_cpu(sb, n.max_indirect_range); 115 data.max_indirect_range = fs64_to_cpu(sb, n->max_indirect_range);
116 data.double_indirect = fsrun_to_cpu(sb, n.double_indirect); 116 data.double_indirect = fsrun_to_cpu(sb, n->double_indirect);
117 data.max_double_indirect_range = fs64_to_cpu(sb, 117 data.max_double_indirect_range = fs64_to_cpu(sb,
118 n. 118 n->
119 max_double_indirect_range); 119 max_double_indirect_range);
120 data.size = fs64_to_cpu(sb, n.size); 120 data.size = fs64_to_cpu(sb, n->size);
121 121
122 return data; 122 return data;
123} 123}
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index aa4e7c7ae3c..b1d0c794747 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -284,12 +284,18 @@ befs_alloc_inode(struct super_block *sb)
284 return &bi->vfs_inode; 284 return &bi->vfs_inode;
285} 285}
286 286
287static void 287static void befs_i_callback(struct rcu_head *head)
288befs_destroy_inode(struct inode *inode)
289{ 288{
289 struct inode *inode = container_of(head, struct inode, i_rcu);
290 INIT_LIST_HEAD(&inode->i_dentry);
290 kmem_cache_free(befs_inode_cachep, BEFS_I(inode)); 291 kmem_cache_free(befs_inode_cachep, BEFS_I(inode));
291} 292}
292 293
294static void befs_destroy_inode(struct inode *inode)
295{
296 call_rcu(&inode->i_rcu, befs_i_callback);
297}
298
293static void init_once(void *foo) 299static void init_once(void *foo)
294{ 300{
295 struct befs_inode_info *bi = (struct befs_inode_info *) foo; 301 struct befs_inode_info *bi = (struct befs_inode_info *) foo;
@@ -384,7 +390,7 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino)
384 int num_blks; 390 int num_blks;
385 391
386 befs_ino->i_data.ds = 392 befs_ino->i_data.ds =
387 fsds_to_cpu(sb, raw_inode->data.datastream); 393 fsds_to_cpu(sb, &raw_inode->data.datastream);
388 394
389 num_blks = befs_count_blocks(sb, &befs_ino->i_data.ds); 395 num_blks = befs_count_blocks(sb, &befs_ino->i_data.ds);
390 inode->i_blocks = 396 inode->i_blocks =
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index 76db6d7d49b..a8e37f81d09 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -248,11 +248,18 @@ static struct inode *bfs_alloc_inode(struct super_block *sb)
248 return &bi->vfs_inode; 248 return &bi->vfs_inode;
249} 249}
250 250
251static void bfs_destroy_inode(struct inode *inode) 251static void bfs_i_callback(struct rcu_head *head)
252{ 252{
253 struct inode *inode = container_of(head, struct inode, i_rcu);
254 INIT_LIST_HEAD(&inode->i_dentry);
253 kmem_cache_free(bfs_inode_cachep, BFS_I(inode)); 255 kmem_cache_free(bfs_inode_cachep, BFS_I(inode));
254} 256}
255 257
258static void bfs_destroy_inode(struct inode *inode)
259{
260 call_rcu(&inode->i_rcu, bfs_i_callback);
261}
262
256static void init_once(void *foo) 263static void init_once(void *foo)
257{ 264{
258 struct bfs_inode_info *bi = foo; 265 struct bfs_inode_info *bi = foo;
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 6884e198e0c..d5b640ba6cb 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -66,12 +66,11 @@ static int elf_core_dump(struct coredump_params *cprm);
66#define ELF_PAGEALIGN(_v) (((_v) + ELF_MIN_ALIGN - 1) & ~(ELF_MIN_ALIGN - 1)) 66#define ELF_PAGEALIGN(_v) (((_v) + ELF_MIN_ALIGN - 1) & ~(ELF_MIN_ALIGN - 1))
67 67
68static struct linux_binfmt elf_format = { 68static struct linux_binfmt elf_format = {
69 .module = THIS_MODULE, 69 .module = THIS_MODULE,
70 .load_binary = load_elf_binary, 70 .load_binary = load_elf_binary,
71 .load_shlib = load_elf_library, 71 .load_shlib = load_elf_library,
72 .core_dump = elf_core_dump, 72 .core_dump = elf_core_dump,
73 .min_coredump = ELF_EXEC_PAGESIZE, 73 .min_coredump = ELF_EXEC_PAGESIZE,
74 .hasvdso = 1
75}; 74};
76 75
77#define BAD_ADDR(x) ((unsigned long)(x) >= TASK_SIZE) 76#define BAD_ADDR(x) ((unsigned long)(x) >= TASK_SIZE)
@@ -316,8 +315,6 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec,
316 return 0; 315 return 0;
317} 316}
318 317
319#ifndef elf_map
320
321static unsigned long elf_map(struct file *filep, unsigned long addr, 318static unsigned long elf_map(struct file *filep, unsigned long addr,
322 struct elf_phdr *eppnt, int prot, int type, 319 struct elf_phdr *eppnt, int prot, int type,
323 unsigned long total_size) 320 unsigned long total_size)
@@ -354,8 +351,6 @@ static unsigned long elf_map(struct file *filep, unsigned long addr,
354 return(map_addr); 351 return(map_addr);
355} 352}
356 353
357#endif /* !elf_map */
358
359static unsigned long total_mapping_size(struct elf_phdr *cmds, int nr) 354static unsigned long total_mapping_size(struct elf_phdr *cmds, int nr)
360{ 355{
361 int i, first_idx = -1, last_idx = -1; 356 int i, first_idx = -1, last_idx = -1;
@@ -421,7 +416,7 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
421 goto out; 416 goto out;
422 417
423 retval = kernel_read(interpreter, interp_elf_ex->e_phoff, 418 retval = kernel_read(interpreter, interp_elf_ex->e_phoff,
424 (char *)elf_phdata,size); 419 (char *)elf_phdata, size);
425 error = -EIO; 420 error = -EIO;
426 if (retval != size) { 421 if (retval != size) {
427 if (retval < 0) 422 if (retval < 0)
@@ -601,7 +596,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
601 goto out; 596 goto out;
602 if (!elf_check_arch(&loc->elf_ex)) 597 if (!elf_check_arch(&loc->elf_ex))
603 goto out; 598 goto out;
604 if (!bprm->file->f_op||!bprm->file->f_op->mmap) 599 if (!bprm->file->f_op || !bprm->file->f_op->mmap)
605 goto out; 600 goto out;
606 601
607 /* Now read in all of the header information */ 602 /* Now read in all of the header information */
@@ -761,8 +756,8 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
761 /* There was a PT_LOAD segment with p_memsz > p_filesz 756 /* There was a PT_LOAD segment with p_memsz > p_filesz
762 before this one. Map anonymous pages, if needed, 757 before this one. Map anonymous pages, if needed,
763 and clear the area. */ 758 and clear the area. */
764 retval = set_brk (elf_bss + load_bias, 759 retval = set_brk(elf_bss + load_bias,
765 elf_brk + load_bias); 760 elf_brk + load_bias);
766 if (retval) { 761 if (retval) {
767 send_sig(SIGKILL, current, 0); 762 send_sig(SIGKILL, current, 0);
768 goto out_free_dentry; 763 goto out_free_dentry;
diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c
index 4d0ff5ee27b..e49cce234c6 100644
--- a/fs/bio-integrity.c
+++ b/fs/bio-integrity.c
@@ -782,7 +782,12 @@ void __init bio_integrity_init(void)
782{ 782{
783 unsigned int i; 783 unsigned int i;
784 784
785 kintegrityd_wq = create_workqueue("kintegrityd"); 785 /*
786 * kintegrityd won't block much but may burn a lot of CPU cycles.
787 * Make it highpri CPU intensive wq with max concurrency of 1.
788 */
789 kintegrityd_wq = alloc_workqueue("kintegrityd", WQ_MEM_RECLAIM |
790 WQ_HIGHPRI | WQ_CPU_INTENSIVE, 1);
786 if (!kintegrityd_wq) 791 if (!kintegrityd_wq)
787 panic("Failed to create kintegrityd\n"); 792 panic("Failed to create kintegrityd\n");
788 793
diff --git a/fs/bio.c b/fs/bio.c
index 8abb2dfb2e7..4bd454fa844 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -370,6 +370,9 @@ struct bio *bio_kmalloc(gfp_t gfp_mask, int nr_iovecs)
370{ 370{
371 struct bio *bio; 371 struct bio *bio;
372 372
373 if (nr_iovecs > UIO_MAXIOV)
374 return NULL;
375
373 bio = kmalloc(sizeof(struct bio) + nr_iovecs * sizeof(struct bio_vec), 376 bio = kmalloc(sizeof(struct bio) + nr_iovecs * sizeof(struct bio_vec),
374 gfp_mask); 377 gfp_mask);
375 if (unlikely(!bio)) 378 if (unlikely(!bio))
@@ -697,8 +700,12 @@ static void bio_free_map_data(struct bio_map_data *bmd)
697static struct bio_map_data *bio_alloc_map_data(int nr_segs, int iov_count, 700static struct bio_map_data *bio_alloc_map_data(int nr_segs, int iov_count,
698 gfp_t gfp_mask) 701 gfp_t gfp_mask)
699{ 702{
700 struct bio_map_data *bmd = kmalloc(sizeof(*bmd), gfp_mask); 703 struct bio_map_data *bmd;
701 704
705 if (iov_count > UIO_MAXIOV)
706 return NULL;
707
708 bmd = kmalloc(sizeof(*bmd), gfp_mask);
702 if (!bmd) 709 if (!bmd)
703 return NULL; 710 return NULL;
704 711
@@ -827,6 +834,12 @@ struct bio *bio_copy_user_iov(struct request_queue *q,
827 end = (uaddr + iov[i].iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT; 834 end = (uaddr + iov[i].iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT;
828 start = uaddr >> PAGE_SHIFT; 835 start = uaddr >> PAGE_SHIFT;
829 836
837 /*
838 * Overflow, abort
839 */
840 if (end < start)
841 return ERR_PTR(-EINVAL);
842
830 nr_pages += end - start; 843 nr_pages += end - start;
831 len += iov[i].iov_len; 844 len += iov[i].iov_len;
832 } 845 }
@@ -955,6 +968,12 @@ static struct bio *__bio_map_user_iov(struct request_queue *q,
955 unsigned long end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; 968 unsigned long end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
956 unsigned long start = uaddr >> PAGE_SHIFT; 969 unsigned long start = uaddr >> PAGE_SHIFT;
957 970
971 /*
972 * Overflow, abort
973 */
974 if (end < start)
975 return ERR_PTR(-EINVAL);
976
958 nr_pages += end - start; 977 nr_pages += end - start;
959 /* 978 /*
960 * buffer must be aligned to at least hardsector size for now 979 * buffer must be aligned to at least hardsector size for now
@@ -982,7 +1001,7 @@ static struct bio *__bio_map_user_iov(struct request_queue *q,
982 unsigned long start = uaddr >> PAGE_SHIFT; 1001 unsigned long start = uaddr >> PAGE_SHIFT;
983 const int local_nr_pages = end - start; 1002 const int local_nr_pages = end - start;
984 const int page_limit = cur_page + local_nr_pages; 1003 const int page_limit = cur_page + local_nr_pages;
985 1004
986 ret = get_user_pages_fast(uaddr, local_nr_pages, 1005 ret = get_user_pages_fast(uaddr, local_nr_pages,
987 write_to_vm, &pages[cur_page]); 1006 write_to_vm, &pages[cur_page]);
988 if (ret < local_nr_pages) { 1007 if (ret < local_nr_pages) {
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 06e8ff12b97..333a7bb4cb9 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -11,7 +11,6 @@
11#include <linux/slab.h> 11#include <linux/slab.h>
12#include <linux/kmod.h> 12#include <linux/kmod.h>
13#include <linux/major.h> 13#include <linux/major.h>
14#include <linux/smp_lock.h>
15#include <linux/device_cgroup.h> 14#include <linux/device_cgroup.h>
16#include <linux/highmem.h> 15#include <linux/highmem.h>
17#include <linux/blkdev.h> 16#include <linux/blkdev.h>
@@ -410,13 +409,20 @@ static struct inode *bdev_alloc_inode(struct super_block *sb)
410 return &ei->vfs_inode; 409 return &ei->vfs_inode;
411} 410}
412 411
413static void bdev_destroy_inode(struct inode *inode) 412static void bdev_i_callback(struct rcu_head *head)
414{ 413{
414 struct inode *inode = container_of(head, struct inode, i_rcu);
415 struct bdev_inode *bdi = BDEV_I(inode); 415 struct bdev_inode *bdi = BDEV_I(inode);
416 416
417 INIT_LIST_HEAD(&inode->i_dentry);
417 kmem_cache_free(bdev_cachep, bdi); 418 kmem_cache_free(bdev_cachep, bdi);
418} 419}
419 420
421static void bdev_destroy_inode(struct inode *inode)
422{
423 call_rcu(&inode->i_rcu, bdev_i_callback);
424}
425
420static void init_once(void *foo) 426static void init_once(void *foo)
421{ 427{
422 struct bdev_inode *ei = (struct bdev_inode *) foo; 428 struct bdev_inode *ei = (struct bdev_inode *) foo;
@@ -427,7 +433,7 @@ static void init_once(void *foo)
427 INIT_LIST_HEAD(&bdev->bd_inodes); 433 INIT_LIST_HEAD(&bdev->bd_inodes);
428 INIT_LIST_HEAD(&bdev->bd_list); 434 INIT_LIST_HEAD(&bdev->bd_list);
429#ifdef CONFIG_SYSFS 435#ifdef CONFIG_SYSFS
430 INIT_LIST_HEAD(&bdev->bd_holder_list); 436 INIT_LIST_HEAD(&bdev->bd_holder_disks);
431#endif 437#endif
432 inode_init_once(&ei->vfs_inode); 438 inode_init_once(&ei->vfs_inode);
433 /* Initialize mutex for freeze. */ 439 /* Initialize mutex for freeze. */
@@ -467,7 +473,7 @@ static const struct super_operations bdev_sops = {
467static struct dentry *bd_mount(struct file_system_type *fs_type, 473static struct dentry *bd_mount(struct file_system_type *fs_type,
468 int flags, const char *dev_name, void *data) 474 int flags, const char *dev_name, void *data)
469{ 475{
470 return mount_pseudo(fs_type, "bdev:", &bdev_sops, 0x62646576); 476 return mount_pseudo(fs_type, "bdev:", &bdev_sops, NULL, 0x62646576);
471} 477}
472 478
473static struct file_system_type bd_type = { 479static struct file_system_type bd_type = {
@@ -663,7 +669,7 @@ static bool bd_may_claim(struct block_device *bdev, struct block_device *whole,
663 else if (bdev->bd_contains == bdev) 669 else if (bdev->bd_contains == bdev)
664 return true; /* is a whole device which isn't held */ 670 return true; /* is a whole device which isn't held */
665 671
666 else if (whole->bd_holder == bd_claim) 672 else if (whole->bd_holder == bd_may_claim)
667 return true; /* is a partition of a device that is being partitioned */ 673 return true; /* is a partition of a device that is being partitioned */
668 else if (whole->bd_holder != NULL) 674 else if (whole->bd_holder != NULL)
669 return false; /* is a partition of a held device */ 675 return false; /* is a partition of a held device */
@@ -775,439 +781,142 @@ static struct block_device *bd_start_claiming(struct block_device *bdev,
775 } 781 }
776} 782}
777 783
778/* releases bdev_lock */ 784#ifdef CONFIG_SYSFS
779static void __bd_abort_claiming(struct block_device *whole, void *holder) 785struct bd_holder_disk {
780{ 786 struct list_head list;
781 BUG_ON(whole->bd_claiming != holder); 787 struct gendisk *disk;
782 whole->bd_claiming = NULL; 788 int refcnt;
783 wake_up_bit(&whole->bd_claiming, 0); 789};
784
785 spin_unlock(&bdev_lock);
786 bdput(whole);
787}
788
789/**
790 * bd_abort_claiming - abort claiming a block device
791 * @whole: whole block device returned by bd_start_claiming()
792 * @holder: holder trying to claim @bdev
793 *
794 * Abort a claiming block started by bd_start_claiming(). Note that
795 * @whole is not the block device to be claimed but the whole device
796 * returned by bd_start_claiming().
797 *
798 * CONTEXT:
799 * Grabs and releases bdev_lock.
800 */
801static void bd_abort_claiming(struct block_device *whole, void *holder)
802{
803 spin_lock(&bdev_lock);
804 __bd_abort_claiming(whole, holder); /* releases bdev_lock */
805}
806
807/* increment holders when we have a legitimate claim. requires bdev_lock */
808static void __bd_claim(struct block_device *bdev, struct block_device *whole,
809 void *holder)
810{
811 /* note that for a whole device bd_holders
812 * will be incremented twice, and bd_holder will
813 * be set to bd_claim before being set to holder
814 */
815 whole->bd_holders++;
816 whole->bd_holder = bd_claim;
817 bdev->bd_holders++;
818 bdev->bd_holder = holder;
819}
820
821/**
822 * bd_finish_claiming - finish claiming a block device
823 * @bdev: block device of interest (passed to bd_start_claiming())
824 * @whole: whole block device returned by bd_start_claiming()
825 * @holder: holder trying to claim @bdev
826 *
827 * Finish a claiming block started by bd_start_claiming().
828 *
829 * CONTEXT:
830 * Grabs and releases bdev_lock.
831 */
832static void bd_finish_claiming(struct block_device *bdev,
833 struct block_device *whole, void *holder)
834{
835 spin_lock(&bdev_lock);
836 BUG_ON(!bd_may_claim(bdev, whole, holder));
837 __bd_claim(bdev, whole, holder);
838 __bd_abort_claiming(whole, holder); /* not actually an abort */
839}
840 790
841/** 791static struct bd_holder_disk *bd_find_holder_disk(struct block_device *bdev,
842 * bd_claim - claim a block device 792 struct gendisk *disk)
843 * @bdev: block device to claim
844 * @holder: holder trying to claim @bdev
845 *
846 * Try to claim @bdev which must have been opened successfully.
847 *
848 * CONTEXT:
849 * Might sleep.
850 *
851 * RETURNS:
852 * 0 if successful, -EBUSY if @bdev is already claimed.
853 */
854int bd_claim(struct block_device *bdev, void *holder)
855{ 793{
856 struct block_device *whole = bdev->bd_contains; 794 struct bd_holder_disk *holder;
857 int res;
858 795
859 might_sleep(); 796 list_for_each_entry(holder, &bdev->bd_holder_disks, list)
860 797 if (holder->disk == disk)
861 spin_lock(&bdev_lock); 798 return holder;
862 res = bd_prepare_to_claim(bdev, whole, holder); 799 return NULL;
863 if (res == 0)
864 __bd_claim(bdev, whole, holder);
865 spin_unlock(&bdev_lock);
866
867 return res;
868}
869EXPORT_SYMBOL(bd_claim);
870
871void bd_release(struct block_device *bdev)
872{
873 spin_lock(&bdev_lock);
874 if (!--bdev->bd_contains->bd_holders)
875 bdev->bd_contains->bd_holder = NULL;
876 if (!--bdev->bd_holders)
877 bdev->bd_holder = NULL;
878 spin_unlock(&bdev_lock);
879} 800}
880 801
881EXPORT_SYMBOL(bd_release);
882
883#ifdef CONFIG_SYSFS
884/*
885 * Functions for bd_claim_by_kobject / bd_release_from_kobject
886 *
887 * If a kobject is passed to bd_claim_by_kobject()
888 * and the kobject has a parent directory,
889 * following symlinks are created:
890 * o from the kobject to the claimed bdev
891 * o from "holders" directory of the bdev to the parent of the kobject
892 * bd_release_from_kobject() removes these symlinks.
893 *
894 * Example:
895 * If /dev/dm-0 maps to /dev/sda, kobject corresponding to
896 * /sys/block/dm-0/slaves is passed to bd_claim_by_kobject(), then:
897 * /sys/block/dm-0/slaves/sda --> /sys/block/sda
898 * /sys/block/sda/holders/dm-0 --> /sys/block/dm-0
899 */
900
901static int add_symlink(struct kobject *from, struct kobject *to) 802static int add_symlink(struct kobject *from, struct kobject *to)
902{ 803{
903 if (!from || !to)
904 return 0;
905 return sysfs_create_link(from, to, kobject_name(to)); 804 return sysfs_create_link(from, to, kobject_name(to));
906} 805}
907 806
908static void del_symlink(struct kobject *from, struct kobject *to) 807static void del_symlink(struct kobject *from, struct kobject *to)
909{ 808{
910 if (!from || !to)
911 return;
912 sysfs_remove_link(from, kobject_name(to)); 809 sysfs_remove_link(from, kobject_name(to));
913} 810}
914 811
915/*
916 * 'struct bd_holder' contains pointers to kobjects symlinked by
917 * bd_claim_by_kobject.
918 * It's connected to bd_holder_list which is protected by bdev->bd_sem.
919 */
920struct bd_holder {
921 struct list_head list; /* chain of holders of the bdev */
922 int count; /* references from the holder */
923 struct kobject *sdir; /* holder object, e.g. "/block/dm-0/slaves" */
924 struct kobject *hdev; /* e.g. "/block/dm-0" */
925 struct kobject *hdir; /* e.g. "/block/sda/holders" */
926 struct kobject *sdev; /* e.g. "/block/sda" */
927};
928
929/*
930 * Get references of related kobjects at once.
931 * Returns 1 on success. 0 on failure.
932 *
933 * Should call bd_holder_release_dirs() after successful use.
934 */
935static int bd_holder_grab_dirs(struct block_device *bdev,
936 struct bd_holder *bo)
937{
938 if (!bdev || !bo)
939 return 0;
940
941 bo->sdir = kobject_get(bo->sdir);
942 if (!bo->sdir)
943 return 0;
944
945 bo->hdev = kobject_get(bo->sdir->parent);
946 if (!bo->hdev)
947 goto fail_put_sdir;
948
949 bo->sdev = kobject_get(&part_to_dev(bdev->bd_part)->kobj);
950 if (!bo->sdev)
951 goto fail_put_hdev;
952
953 bo->hdir = kobject_get(bdev->bd_part->holder_dir);
954 if (!bo->hdir)
955 goto fail_put_sdev;
956
957 return 1;
958
959fail_put_sdev:
960 kobject_put(bo->sdev);
961fail_put_hdev:
962 kobject_put(bo->hdev);
963fail_put_sdir:
964 kobject_put(bo->sdir);
965
966 return 0;
967}
968
969/* Put references of related kobjects at once. */
970static void bd_holder_release_dirs(struct bd_holder *bo)
971{
972 kobject_put(bo->hdir);
973 kobject_put(bo->sdev);
974 kobject_put(bo->hdev);
975 kobject_put(bo->sdir);
976}
977
978static struct bd_holder *alloc_bd_holder(struct kobject *kobj)
979{
980 struct bd_holder *bo;
981
982 bo = kzalloc(sizeof(*bo), GFP_KERNEL);
983 if (!bo)
984 return NULL;
985
986 bo->count = 1;
987 bo->sdir = kobj;
988
989 return bo;
990}
991
992static void free_bd_holder(struct bd_holder *bo)
993{
994 kfree(bo);
995}
996
997/** 812/**
998 * find_bd_holder - find matching struct bd_holder from the block device 813 * bd_link_disk_holder - create symlinks between holding disk and slave bdev
814 * @bdev: the claimed slave bdev
815 * @disk: the holding disk
999 * 816 *
1000 * @bdev: struct block device to be searched 817 * DON'T USE THIS UNLESS YOU'RE ALREADY USING IT.
1001 * @bo: target struct bd_holder
1002 * 818 *
1003 * Returns matching entry with @bo in @bdev->bd_holder_list. 819 * This function creates the following sysfs symlinks.
1004 * If found, increment the reference count and return the pointer. 820 *
1005 * If not found, returns NULL. 821 * - from "slaves" directory of the holder @disk to the claimed @bdev
1006 */ 822 * - from "holders" directory of the @bdev to the holder @disk
1007static struct bd_holder *find_bd_holder(struct block_device *bdev, 823 *
1008 struct bd_holder *bo) 824 * For example, if /dev/dm-0 maps to /dev/sda and disk for dm-0 is
1009{ 825 * passed to bd_link_disk_holder(), then:
1010 struct bd_holder *tmp; 826 *
1011 827 * /sys/block/dm-0/slaves/sda --> /sys/block/sda
1012 list_for_each_entry(tmp, &bdev->bd_holder_list, list) 828 * /sys/block/sda/holders/dm-0 --> /sys/block/dm-0
1013 if (tmp->sdir == bo->sdir) {
1014 tmp->count++;
1015 return tmp;
1016 }
1017
1018 return NULL;
1019}
1020
1021/**
1022 * add_bd_holder - create sysfs symlinks for bd_claim() relationship
1023 * 829 *
1024 * @bdev: block device to be bd_claimed 830 * The caller must have claimed @bdev before calling this function and
1025 * @bo: preallocated and initialized by alloc_bd_holder() 831 * ensure that both @bdev and @disk are valid during the creation and
832 * lifetime of these symlinks.
1026 * 833 *
1027 * Add @bo to @bdev->bd_holder_list, create symlinks. 834 * CONTEXT:
835 * Might sleep.
1028 * 836 *
1029 * Returns 0 if symlinks are created. 837 * RETURNS:
1030 * Returns -ve if something fails. 838 * 0 on success, -errno on failure.
1031 */ 839 */
1032static int add_bd_holder(struct block_device *bdev, struct bd_holder *bo) 840int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk)
1033{ 841{
1034 int err; 842 struct bd_holder_disk *holder;
843 int ret = 0;
1035 844
1036 if (!bo) 845 mutex_lock(&bdev->bd_mutex);
1037 return -EINVAL;
1038 846
1039 if (!bd_holder_grab_dirs(bdev, bo)) 847 WARN_ON_ONCE(!bdev->bd_holder);
1040 return -EBUSY;
1041 848
1042 err = add_symlink(bo->sdir, bo->sdev); 849 /* FIXME: remove the following once add_disk() handles errors */
1043 if (err) 850 if (WARN_ON(!disk->slave_dir || !bdev->bd_part->holder_dir))
1044 return err; 851 goto out_unlock;
1045 852
1046 err = add_symlink(bo->hdir, bo->hdev); 853 holder = bd_find_holder_disk(bdev, disk);
1047 if (err) { 854 if (holder) {
1048 del_symlink(bo->sdir, bo->sdev); 855 holder->refcnt++;
1049 return err; 856 goto out_unlock;
1050 } 857 }
1051 858
1052 list_add_tail(&bo->list, &bdev->bd_holder_list); 859 holder = kzalloc(sizeof(*holder), GFP_KERNEL);
1053 return 0; 860 if (!holder) {
1054} 861 ret = -ENOMEM;
1055 862 goto out_unlock;
1056/**
1057 * del_bd_holder - delete sysfs symlinks for bd_claim() relationship
1058 *
1059 * @bdev: block device to be bd_claimed
1060 * @kobj: holder's kobject
1061 *
1062 * If there is matching entry with @kobj in @bdev->bd_holder_list
1063 * and no other bd_claim() from the same kobject,
1064 * remove the struct bd_holder from the list, delete symlinks for it.
1065 *
1066 * Returns a pointer to the struct bd_holder when it's removed from the list
1067 * and ready to be freed.
1068 * Returns NULL if matching claim isn't found or there is other bd_claim()
1069 * by the same kobject.
1070 */
1071static struct bd_holder *del_bd_holder(struct block_device *bdev,
1072 struct kobject *kobj)
1073{
1074 struct bd_holder *bo;
1075
1076 list_for_each_entry(bo, &bdev->bd_holder_list, list) {
1077 if (bo->sdir == kobj) {
1078 bo->count--;
1079 BUG_ON(bo->count < 0);
1080 if (!bo->count) {
1081 list_del(&bo->list);
1082 del_symlink(bo->sdir, bo->sdev);
1083 del_symlink(bo->hdir, bo->hdev);
1084 bd_holder_release_dirs(bo);
1085 return bo;
1086 }
1087 break;
1088 }
1089 } 863 }
1090 864
1091 return NULL; 865 INIT_LIST_HEAD(&holder->list);
1092} 866 holder->disk = disk;
867 holder->refcnt = 1;
1093 868
1094/** 869 ret = add_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj);
1095 * bd_claim_by_kobject - bd_claim() with additional kobject signature 870 if (ret)
1096 * 871 goto out_free;
1097 * @bdev: block device to be claimed
1098 * @holder: holder's signature
1099 * @kobj: holder's kobject
1100 *
1101 * Do bd_claim() and if it succeeds, create sysfs symlinks between
1102 * the bdev and the holder's kobject.
1103 * Use bd_release_from_kobject() when releasing the claimed bdev.
1104 *
1105 * Returns 0 on success. (same as bd_claim())
1106 * Returns errno on failure.
1107 */
1108static int bd_claim_by_kobject(struct block_device *bdev, void *holder,
1109 struct kobject *kobj)
1110{
1111 int err;
1112 struct bd_holder *bo, *found;
1113 872
1114 if (!kobj) 873 ret = add_symlink(bdev->bd_part->holder_dir, &disk_to_dev(disk)->kobj);
1115 return -EINVAL; 874 if (ret)
875 goto out_del;
1116 876
1117 bo = alloc_bd_holder(kobj); 877 list_add(&holder->list, &bdev->bd_holder_disks);
1118 if (!bo) 878 goto out_unlock;
1119 return -ENOMEM;
1120 879
1121 mutex_lock(&bdev->bd_mutex); 880out_del:
1122 881 del_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj);
1123 err = bd_claim(bdev, holder); 882out_free:
1124 if (err) 883 kfree(holder);
1125 goto fail; 884out_unlock:
1126
1127 found = find_bd_holder(bdev, bo);
1128 if (found)
1129 goto fail;
1130
1131 err = add_bd_holder(bdev, bo);
1132 if (err)
1133 bd_release(bdev);
1134 else
1135 bo = NULL;
1136fail:
1137 mutex_unlock(&bdev->bd_mutex); 885 mutex_unlock(&bdev->bd_mutex);
1138 free_bd_holder(bo); 886 return ret;
1139 return err;
1140} 887}
888EXPORT_SYMBOL_GPL(bd_link_disk_holder);
1141 889
1142/** 890/**
1143 * bd_release_from_kobject - bd_release() with additional kobject signature 891 * bd_unlink_disk_holder - destroy symlinks created by bd_link_disk_holder()
892 * @bdev: the claimed slave bdev
893 * @disk: the holding disk
1144 * 894 *
1145 * @bdev: block device to be released 895 * DON'T USE THIS UNLESS YOU'RE ALREADY USING IT.
1146 * @kobj: holder's kobject
1147 * 896 *
1148 * Do bd_release() and remove sysfs symlinks created by bd_claim_by_kobject(). 897 * CONTEXT:
898 * Might sleep.
1149 */ 899 */
1150static void bd_release_from_kobject(struct block_device *bdev, 900void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk)
1151 struct kobject *kobj)
1152{ 901{
1153 if (!kobj) 902 struct bd_holder_disk *holder;
1154 return;
1155 903
1156 mutex_lock(&bdev->bd_mutex); 904 mutex_lock(&bdev->bd_mutex);
1157 bd_release(bdev);
1158 free_bd_holder(del_bd_holder(bdev, kobj));
1159 mutex_unlock(&bdev->bd_mutex);
1160}
1161 905
1162/** 906 holder = bd_find_holder_disk(bdev, disk);
1163 * bd_claim_by_disk - wrapper function for bd_claim_by_kobject()
1164 *
1165 * @bdev: block device to be claimed
1166 * @holder: holder's signature
1167 * @disk: holder's gendisk
1168 *
1169 * Call bd_claim_by_kobject() with getting @disk->slave_dir.
1170 */
1171int bd_claim_by_disk(struct block_device *bdev, void *holder,
1172 struct gendisk *disk)
1173{
1174 return bd_claim_by_kobject(bdev, holder, kobject_get(disk->slave_dir));
1175}
1176EXPORT_SYMBOL_GPL(bd_claim_by_disk);
1177 907
1178/** 908 if (!WARN_ON_ONCE(holder == NULL) && !--holder->refcnt) {
1179 * bd_release_from_disk - wrapper function for bd_release_from_kobject() 909 del_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj);
1180 * 910 del_symlink(bdev->bd_part->holder_dir,
1181 * @bdev: block device to be claimed 911 &disk_to_dev(disk)->kobj);
1182 * @disk: holder's gendisk 912 list_del_init(&holder->list);
1183 * 913 kfree(holder);
1184 * Call bd_release_from_kobject() and put @disk->slave_dir. 914 }
1185 */
1186void bd_release_from_disk(struct block_device *bdev, struct gendisk *disk)
1187{
1188 bd_release_from_kobject(bdev, disk->slave_dir);
1189 kobject_put(disk->slave_dir);
1190}
1191EXPORT_SYMBOL_GPL(bd_release_from_disk);
1192#endif
1193 915
1194/* 916 mutex_unlock(&bdev->bd_mutex);
1195 * Tries to open block device by device number. Use it ONLY if you
1196 * really do not have anything better - i.e. when you are behind a
1197 * truly sucky interface and all you are given is a device number. _Never_
1198 * to be used for internal purposes. If you ever need it - reconsider
1199 * your API.
1200 */
1201struct block_device *open_by_devnum(dev_t dev, fmode_t mode)
1202{
1203 struct block_device *bdev = bdget(dev);
1204 int err = -ENOMEM;
1205 if (bdev)
1206 err = blkdev_get(bdev, mode);
1207 return err ? ERR_PTR(err) : bdev;
1208} 917}
1209 918EXPORT_SYMBOL_GPL(bd_unlink_disk_holder);
1210EXPORT_SYMBOL(open_by_devnum); 919#endif
1211 920
1212/** 921/**
1213 * flush_disk - invalidates all buffer-cache entries on a disk 922 * flush_disk - invalidates all buffer-cache entries on a disk
@@ -1303,10 +1012,11 @@ int check_disk_change(struct block_device *bdev)
1303{ 1012{
1304 struct gendisk *disk = bdev->bd_disk; 1013 struct gendisk *disk = bdev->bd_disk;
1305 const struct block_device_operations *bdops = disk->fops; 1014 const struct block_device_operations *bdops = disk->fops;
1015 unsigned int events;
1306 1016
1307 if (!bdops->media_changed) 1017 events = disk_clear_events(disk, DISK_EVENT_MEDIA_CHANGE |
1308 return 0; 1018 DISK_EVENT_EJECT_REQUEST);
1309 if (!bdops->media_changed(bdev->bd_disk)) 1019 if (!(events & DISK_EVENT_MEDIA_CHANGE))
1310 return 0; 1020 return 0;
1311 1021
1312 flush_disk(bdev); 1022 flush_disk(bdev);
@@ -1469,17 +1179,171 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
1469 return ret; 1179 return ret;
1470} 1180}
1471 1181
1472int blkdev_get(struct block_device *bdev, fmode_t mode) 1182/**
1183 * blkdev_get - open a block device
1184 * @bdev: block_device to open
1185 * @mode: FMODE_* mask
1186 * @holder: exclusive holder identifier
1187 *
1188 * Open @bdev with @mode. If @mode includes %FMODE_EXCL, @bdev is
1189 * open with exclusive access. Specifying %FMODE_EXCL with %NULL
1190 * @holder is invalid. Exclusive opens may nest for the same @holder.
1191 *
1192 * On success, the reference count of @bdev is unchanged. On failure,
1193 * @bdev is put.
1194 *
1195 * CONTEXT:
1196 * Might sleep.
1197 *
1198 * RETURNS:
1199 * 0 on success, -errno on failure.
1200 */
1201int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder)
1473{ 1202{
1474 return __blkdev_get(bdev, mode, 0); 1203 struct block_device *whole = NULL;
1204 int res;
1205
1206 WARN_ON_ONCE((mode & FMODE_EXCL) && !holder);
1207
1208 if ((mode & FMODE_EXCL) && holder) {
1209 whole = bd_start_claiming(bdev, holder);
1210 if (IS_ERR(whole)) {
1211 bdput(bdev);
1212 return PTR_ERR(whole);
1213 }
1214 }
1215
1216 res = __blkdev_get(bdev, mode, 0);
1217
1218 /* __blkdev_get() may alter read only status, check it afterwards */
1219 if (!res && (mode & FMODE_WRITE) && bdev_read_only(bdev)) {
1220 __blkdev_put(bdev, mode, 0);
1221 res = -EACCES;
1222 }
1223
1224 if (whole) {
1225 /* finish claiming */
1226 mutex_lock(&bdev->bd_mutex);
1227 spin_lock(&bdev_lock);
1228
1229 if (!res) {
1230 BUG_ON(!bd_may_claim(bdev, whole, holder));
1231 /*
1232 * Note that for a whole device bd_holders
1233 * will be incremented twice, and bd_holder
1234 * will be set to bd_may_claim before being
1235 * set to holder
1236 */
1237 whole->bd_holders++;
1238 whole->bd_holder = bd_may_claim;
1239 bdev->bd_holders++;
1240 bdev->bd_holder = holder;
1241 }
1242
1243 /* tell others that we're done */
1244 BUG_ON(whole->bd_claiming != holder);
1245 whole->bd_claiming = NULL;
1246 wake_up_bit(&whole->bd_claiming, 0);
1247
1248 spin_unlock(&bdev_lock);
1249
1250 /*
1251 * Block event polling for write claims. Any write
1252 * holder makes the write_holder state stick until all
1253 * are released. This is good enough and tracking
1254 * individual writeable reference is too fragile given
1255 * the way @mode is used in blkdev_get/put().
1256 */
1257 if (!res && (mode & FMODE_WRITE) && !bdev->bd_write_holder) {
1258 bdev->bd_write_holder = true;
1259 disk_block_events(bdev->bd_disk);
1260 }
1261
1262 mutex_unlock(&bdev->bd_mutex);
1263 bdput(whole);
1264 }
1265
1266 return res;
1475} 1267}
1476EXPORT_SYMBOL(blkdev_get); 1268EXPORT_SYMBOL(blkdev_get);
1477 1269
1270/**
1271 * blkdev_get_by_path - open a block device by name
1272 * @path: path to the block device to open
1273 * @mode: FMODE_* mask
1274 * @holder: exclusive holder identifier
1275 *
1276 * Open the blockdevice described by the device file at @path. @mode
1277 * and @holder are identical to blkdev_get().
1278 *
1279 * On success, the returned block_device has reference count of one.
1280 *
1281 * CONTEXT:
1282 * Might sleep.
1283 *
1284 * RETURNS:
1285 * Pointer to block_device on success, ERR_PTR(-errno) on failure.
1286 */
1287struct block_device *blkdev_get_by_path(const char *path, fmode_t mode,
1288 void *holder)
1289{
1290 struct block_device *bdev;
1291 int err;
1292
1293 bdev = lookup_bdev(path);
1294 if (IS_ERR(bdev))
1295 return bdev;
1296
1297 err = blkdev_get(bdev, mode, holder);
1298 if (err)
1299 return ERR_PTR(err);
1300
1301 return bdev;
1302}
1303EXPORT_SYMBOL(blkdev_get_by_path);
1304
1305/**
1306 * blkdev_get_by_dev - open a block device by device number
1307 * @dev: device number of block device to open
1308 * @mode: FMODE_* mask
1309 * @holder: exclusive holder identifier
1310 *
1311 * Open the blockdevice described by device number @dev. @mode and
1312 * @holder are identical to blkdev_get().
1313 *
1314 * Use it ONLY if you really do not have anything better - i.e. when
1315 * you are behind a truly sucky interface and all you are given is a
1316 * device number. _Never_ to be used for internal purposes. If you
1317 * ever need it - reconsider your API.
1318 *
1319 * On success, the returned block_device has reference count of one.
1320 *
1321 * CONTEXT:
1322 * Might sleep.
1323 *
1324 * RETURNS:
1325 * Pointer to block_device on success, ERR_PTR(-errno) on failure.
1326 */
1327struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder)
1328{
1329 struct block_device *bdev;
1330 int err;
1331
1332 bdev = bdget(dev);
1333 if (!bdev)
1334 return ERR_PTR(-ENOMEM);
1335
1336 err = blkdev_get(bdev, mode, holder);
1337 if (err)
1338 return ERR_PTR(err);
1339
1340 return bdev;
1341}
1342EXPORT_SYMBOL(blkdev_get_by_dev);
1343
1478static int blkdev_open(struct inode * inode, struct file * filp) 1344static int blkdev_open(struct inode * inode, struct file * filp)
1479{ 1345{
1480 struct block_device *whole = NULL;
1481 struct block_device *bdev; 1346 struct block_device *bdev;
1482 int res;
1483 1347
1484 /* 1348 /*
1485 * Preserve backwards compatibility and allow large file access 1349 * Preserve backwards compatibility and allow large file access
@@ -1500,26 +1364,9 @@ static int blkdev_open(struct inode * inode, struct file * filp)
1500 if (bdev == NULL) 1364 if (bdev == NULL)
1501 return -ENOMEM; 1365 return -ENOMEM;
1502 1366
1503 if (filp->f_mode & FMODE_EXCL) {
1504 whole = bd_start_claiming(bdev, filp);
1505 if (IS_ERR(whole)) {
1506 bdput(bdev);
1507 return PTR_ERR(whole);
1508 }
1509 }
1510
1511 filp->f_mapping = bdev->bd_inode->i_mapping; 1367 filp->f_mapping = bdev->bd_inode->i_mapping;
1512 1368
1513 res = blkdev_get(bdev, filp->f_mode); 1369 return blkdev_get(bdev, filp->f_mode, filp);
1514
1515 if (whole) {
1516 if (res == 0)
1517 bd_finish_claiming(bdev, whole, filp);
1518 else
1519 bd_abort_claiming(whole, filp);
1520 }
1521
1522 return res;
1523} 1370}
1524 1371
1525static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part) 1372static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
@@ -1533,6 +1380,7 @@ static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
1533 bdev->bd_part_count--; 1380 bdev->bd_part_count--;
1534 1381
1535 if (!--bdev->bd_openers) { 1382 if (!--bdev->bd_openers) {
1383 WARN_ON_ONCE(bdev->bd_holders);
1536 sync_blockdev(bdev); 1384 sync_blockdev(bdev);
1537 kill_bdev(bdev); 1385 kill_bdev(bdev);
1538 } 1386 }
@@ -1563,6 +1411,44 @@ static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
1563 1411
1564int blkdev_put(struct block_device *bdev, fmode_t mode) 1412int blkdev_put(struct block_device *bdev, fmode_t mode)
1565{ 1413{
1414 if (mode & FMODE_EXCL) {
1415 bool bdev_free;
1416
1417 /*
1418 * Release a claim on the device. The holder fields
1419 * are protected with bdev_lock. bd_mutex is to
1420 * synchronize disk_holder unlinking.
1421 */
1422 mutex_lock(&bdev->bd_mutex);
1423 spin_lock(&bdev_lock);
1424
1425 WARN_ON_ONCE(--bdev->bd_holders < 0);
1426 WARN_ON_ONCE(--bdev->bd_contains->bd_holders < 0);
1427
1428 /* bd_contains might point to self, check in a separate step */
1429 if ((bdev_free = !bdev->bd_holders))
1430 bdev->bd_holder = NULL;
1431 if (!bdev->bd_contains->bd_holders)
1432 bdev->bd_contains->bd_holder = NULL;
1433
1434 spin_unlock(&bdev_lock);
1435
1436 /*
1437 * If this was the last claim, remove holder link and
1438 * unblock evpoll if it was a write holder.
1439 */
1440 if (bdev_free) {
1441 if (bdev->bd_write_holder) {
1442 disk_unblock_events(bdev->bd_disk);
1443 bdev->bd_write_holder = false;
1444 } else
1445 disk_check_events(bdev->bd_disk);
1446 }
1447
1448 mutex_unlock(&bdev->bd_mutex);
1449 } else
1450 disk_check_events(bdev->bd_disk);
1451
1566 return __blkdev_put(bdev, mode, 0); 1452 return __blkdev_put(bdev, mode, 0);
1567} 1453}
1568EXPORT_SYMBOL(blkdev_put); 1454EXPORT_SYMBOL(blkdev_put);
@@ -1570,8 +1456,7 @@ EXPORT_SYMBOL(blkdev_put);
1570static int blkdev_close(struct inode * inode, struct file * filp) 1456static int blkdev_close(struct inode * inode, struct file * filp)
1571{ 1457{
1572 struct block_device *bdev = I_BDEV(filp->f_mapping->host); 1458 struct block_device *bdev = I_BDEV(filp->f_mapping->host);
1573 if (bdev->bd_holder == filp) 1459
1574 bd_release(bdev);
1575 return blkdev_put(bdev, filp->f_mode); 1460 return blkdev_put(bdev, filp->f_mode);
1576} 1461}
1577 1462
@@ -1716,67 +1601,6 @@ fail:
1716} 1601}
1717EXPORT_SYMBOL(lookup_bdev); 1602EXPORT_SYMBOL(lookup_bdev);
1718 1603
1719/**
1720 * open_bdev_exclusive - open a block device by name and set it up for use
1721 *
1722 * @path: special file representing the block device
1723 * @mode: FMODE_... combination to pass be used
1724 * @holder: owner for exclusion
1725 *
1726 * Open the blockdevice described by the special file at @path, claim it
1727 * for the @holder.
1728 */
1729struct block_device *open_bdev_exclusive(const char *path, fmode_t mode, void *holder)
1730{
1731 struct block_device *bdev, *whole;
1732 int error;
1733
1734 bdev = lookup_bdev(path);
1735 if (IS_ERR(bdev))
1736 return bdev;
1737
1738 whole = bd_start_claiming(bdev, holder);
1739 if (IS_ERR(whole)) {
1740 bdput(bdev);
1741 return whole;
1742 }
1743
1744 error = blkdev_get(bdev, mode);
1745 if (error)
1746 goto out_abort_claiming;
1747
1748 error = -EACCES;
1749 if ((mode & FMODE_WRITE) && bdev_read_only(bdev))
1750 goto out_blkdev_put;
1751
1752 bd_finish_claiming(bdev, whole, holder);
1753 return bdev;
1754
1755out_blkdev_put:
1756 blkdev_put(bdev, mode);
1757out_abort_claiming:
1758 bd_abort_claiming(whole, holder);
1759 return ERR_PTR(error);
1760}
1761
1762EXPORT_SYMBOL(open_bdev_exclusive);
1763
1764/**
1765 * close_bdev_exclusive - close a blockdevice opened by open_bdev_exclusive()
1766 *
1767 * @bdev: blockdevice to close
1768 * @mode: mode, must match that used to open.
1769 *
1770 * This is the counterpart to open_bdev_exclusive().
1771 */
1772void close_bdev_exclusive(struct block_device *bdev, fmode_t mode)
1773{
1774 bd_release(bdev);
1775 blkdev_put(bdev, mode);
1776}
1777
1778EXPORT_SYMBOL(close_bdev_exclusive);
1779
1780int __invalidate_device(struct block_device *bdev) 1604int __invalidate_device(struct block_device *bdev)
1781{ 1605{
1782 struct super_block *sb = get_super(bdev); 1606 struct super_block *sb = get_super(bdev);
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
index 7bb3c020e57..ecb9fd3be14 100644
--- a/fs/btrfs/Kconfig
+++ b/fs/btrfs/Kconfig
@@ -4,6 +4,8 @@ config BTRFS_FS
4 select LIBCRC32C 4 select LIBCRC32C
5 select ZLIB_INFLATE 5 select ZLIB_INFLATE
6 select ZLIB_DEFLATE 6 select ZLIB_DEFLATE
7 select LZO_COMPRESS
8 select LZO_DECOMPRESS
7 help 9 help
8 Btrfs is a new filesystem with extents, writable snapshotting, 10 Btrfs is a new filesystem with extents, writable snapshotting,
9 support for multiple devices and many more features. 11 support for multiple devices and many more features.
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index a35eb36b32f..31610ea73ae 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -6,5 +6,5 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
6 transaction.o inode.o file.o tree-defrag.o \ 6 transaction.o inode.o file.o tree-defrag.o \
7 extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \ 7 extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
8 extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \ 8 extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
9 export.o tree-log.o acl.o free-space-cache.o zlib.o \ 9 export.o tree-log.o acl.o free-space-cache.o zlib.o lzo.o \
10 compression.o delayed-ref.o relocation.o 10 compression.o delayed-ref.o relocation.o
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 2222d161c7b..15b5ca2a260 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -60,8 +60,10 @@ static struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
60 size = __btrfs_getxattr(inode, name, value, size); 60 size = __btrfs_getxattr(inode, name, value, size);
61 if (size > 0) { 61 if (size > 0) {
62 acl = posix_acl_from_xattr(value, size); 62 acl = posix_acl_from_xattr(value, size);
63 if (IS_ERR(acl)) 63 if (IS_ERR(acl)) {
64 kfree(value);
64 return acl; 65 return acl;
66 }
65 set_cached_acl(inode, type, acl); 67 set_cached_acl(inode, type, acl);
66 } 68 }
67 kfree(value); 69 kfree(value);
@@ -185,18 +187,23 @@ static int btrfs_xattr_acl_set(struct dentry *dentry, const char *name,
185 return ret; 187 return ret;
186} 188}
187 189
188int btrfs_check_acl(struct inode *inode, int mask) 190int btrfs_check_acl(struct inode *inode, int mask, unsigned int flags)
189{ 191{
190 struct posix_acl *acl;
191 int error = -EAGAIN; 192 int error = -EAGAIN;
192 193
193 acl = btrfs_get_acl(inode, ACL_TYPE_ACCESS); 194 if (flags & IPERM_FLAG_RCU) {
195 if (!negative_cached_acl(inode, ACL_TYPE_ACCESS))
196 error = -ECHILD;
194 197
195 if (IS_ERR(acl)) 198 } else {
196 return PTR_ERR(acl); 199 struct posix_acl *acl;
197 if (acl) { 200 acl = btrfs_get_acl(inode, ACL_TYPE_ACCESS);
198 error = posix_acl_permission(inode, acl, mask); 201 if (IS_ERR(acl))
199 posix_acl_release(acl); 202 return PTR_ERR(acl);
203 if (acl) {
204 error = posix_acl_permission(inode, acl, mask);
205 posix_acl_release(acl);
206 }
200 } 207 }
201 208
202 return error; 209 return error;
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 6ad63f17eca..ccc991c542d 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -157,7 +157,7 @@ struct btrfs_inode {
157 /* 157 /*
158 * always compress this one file 158 * always compress this one file
159 */ 159 */
160 unsigned force_compress:1; 160 unsigned force_compress:4;
161 161
162 struct inode vfs_inode; 162 struct inode vfs_inode;
163}; 163};
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 7845d1f7d1d..f745287fbf2 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -62,6 +62,9 @@ struct compressed_bio {
62 /* number of bytes on disk */ 62 /* number of bytes on disk */
63 unsigned long compressed_len; 63 unsigned long compressed_len;
64 64
65 /* the compression algorithm for this bio */
66 int compress_type;
67
65 /* number of compressed pages in the array */ 68 /* number of compressed pages in the array */
66 unsigned long nr_pages; 69 unsigned long nr_pages;
67 70
@@ -91,23 +94,10 @@ static inline int compressed_bio_size(struct btrfs_root *root,
91static struct bio *compressed_bio_alloc(struct block_device *bdev, 94static struct bio *compressed_bio_alloc(struct block_device *bdev,
92 u64 first_byte, gfp_t gfp_flags) 95 u64 first_byte, gfp_t gfp_flags)
93{ 96{
94 struct bio *bio;
95 int nr_vecs; 97 int nr_vecs;
96 98
97 nr_vecs = bio_get_nr_vecs(bdev); 99 nr_vecs = bio_get_nr_vecs(bdev);
98 bio = bio_alloc(gfp_flags, nr_vecs); 100 return btrfs_bio_alloc(bdev, first_byte >> 9, nr_vecs, gfp_flags);
99
100 if (bio == NULL && (current->flags & PF_MEMALLOC)) {
101 while (!bio && (nr_vecs /= 2))
102 bio = bio_alloc(gfp_flags, nr_vecs);
103 }
104
105 if (bio) {
106 bio->bi_size = 0;
107 bio->bi_bdev = bdev;
108 bio->bi_sector = first_byte >> 9;
109 }
110 return bio;
111} 101}
112 102
113static int check_compressed_csum(struct inode *inode, 103static int check_compressed_csum(struct inode *inode,
@@ -186,11 +176,12 @@ static void end_compressed_bio_read(struct bio *bio, int err)
186 /* ok, we're the last bio for this extent, lets start 176 /* ok, we're the last bio for this extent, lets start
187 * the decompression. 177 * the decompression.
188 */ 178 */
189 ret = btrfs_zlib_decompress_biovec(cb->compressed_pages, 179 ret = btrfs_decompress_biovec(cb->compress_type,
190 cb->start, 180 cb->compressed_pages,
191 cb->orig_bio->bi_io_vec, 181 cb->start,
192 cb->orig_bio->bi_vcnt, 182 cb->orig_bio->bi_io_vec,
193 cb->compressed_len); 183 cb->orig_bio->bi_vcnt,
184 cb->compressed_len);
194csum_failed: 185csum_failed:
195 if (ret) 186 if (ret)
196 cb->errors = 1; 187 cb->errors = 1;
@@ -601,6 +592,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
601 592
602 cb->len = uncompressed_len; 593 cb->len = uncompressed_len;
603 cb->compressed_len = compressed_len; 594 cb->compressed_len = compressed_len;
595 cb->compress_type = extent_compress_type(bio_flags);
604 cb->orig_bio = bio; 596 cb->orig_bio = bio;
605 597
606 nr_pages = (compressed_len + PAGE_CACHE_SIZE - 1) / 598 nr_pages = (compressed_len + PAGE_CACHE_SIZE - 1) /
@@ -690,3 +682,317 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
690 bio_put(comp_bio); 682 bio_put(comp_bio);
691 return 0; 683 return 0;
692} 684}
685
686static struct list_head comp_idle_workspace[BTRFS_COMPRESS_TYPES];
687static spinlock_t comp_workspace_lock[BTRFS_COMPRESS_TYPES];
688static int comp_num_workspace[BTRFS_COMPRESS_TYPES];
689static atomic_t comp_alloc_workspace[BTRFS_COMPRESS_TYPES];
690static wait_queue_head_t comp_workspace_wait[BTRFS_COMPRESS_TYPES];
691
692struct btrfs_compress_op *btrfs_compress_op[] = {
693 &btrfs_zlib_compress,
694 &btrfs_lzo_compress,
695};
696
697int __init btrfs_init_compress(void)
698{
699 int i;
700
701 for (i = 0; i < BTRFS_COMPRESS_TYPES; i++) {
702 INIT_LIST_HEAD(&comp_idle_workspace[i]);
703 spin_lock_init(&comp_workspace_lock[i]);
704 atomic_set(&comp_alloc_workspace[i], 0);
705 init_waitqueue_head(&comp_workspace_wait[i]);
706 }
707 return 0;
708}
709
710/*
711 * this finds an available workspace or allocates a new one
712 * ERR_PTR is returned if things go bad.
713 */
714static struct list_head *find_workspace(int type)
715{
716 struct list_head *workspace;
717 int cpus = num_online_cpus();
718 int idx = type - 1;
719
720 struct list_head *idle_workspace = &comp_idle_workspace[idx];
721 spinlock_t *workspace_lock = &comp_workspace_lock[idx];
722 atomic_t *alloc_workspace = &comp_alloc_workspace[idx];
723 wait_queue_head_t *workspace_wait = &comp_workspace_wait[idx];
724 int *num_workspace = &comp_num_workspace[idx];
725again:
726 spin_lock(workspace_lock);
727 if (!list_empty(idle_workspace)) {
728 workspace = idle_workspace->next;
729 list_del(workspace);
730 (*num_workspace)--;
731 spin_unlock(workspace_lock);
732 return workspace;
733
734 }
735 if (atomic_read(alloc_workspace) > cpus) {
736 DEFINE_WAIT(wait);
737
738 spin_unlock(workspace_lock);
739 prepare_to_wait(workspace_wait, &wait, TASK_UNINTERRUPTIBLE);
740 if (atomic_read(alloc_workspace) > cpus && !*num_workspace)
741 schedule();
742 finish_wait(workspace_wait, &wait);
743 goto again;
744 }
745 atomic_inc(alloc_workspace);
746 spin_unlock(workspace_lock);
747
748 workspace = btrfs_compress_op[idx]->alloc_workspace();
749 if (IS_ERR(workspace)) {
750 atomic_dec(alloc_workspace);
751 wake_up(workspace_wait);
752 }
753 return workspace;
754}
755
756/*
757 * put a workspace struct back on the list or free it if we have enough
758 * idle ones sitting around
759 */
760static void free_workspace(int type, struct list_head *workspace)
761{
762 int idx = type - 1;
763 struct list_head *idle_workspace = &comp_idle_workspace[idx];
764 spinlock_t *workspace_lock = &comp_workspace_lock[idx];
765 atomic_t *alloc_workspace = &comp_alloc_workspace[idx];
766 wait_queue_head_t *workspace_wait = &comp_workspace_wait[idx];
767 int *num_workspace = &comp_num_workspace[idx];
768
769 spin_lock(workspace_lock);
770 if (*num_workspace < num_online_cpus()) {
771 list_add_tail(workspace, idle_workspace);
772 (*num_workspace)++;
773 spin_unlock(workspace_lock);
774 goto wake;
775 }
776 spin_unlock(workspace_lock);
777
778 btrfs_compress_op[idx]->free_workspace(workspace);
779 atomic_dec(alloc_workspace);
780wake:
781 if (waitqueue_active(workspace_wait))
782 wake_up(workspace_wait);
783}
784
785/*
786 * cleanup function for module exit
787 */
788static void free_workspaces(void)
789{
790 struct list_head *workspace;
791 int i;
792
793 for (i = 0; i < BTRFS_COMPRESS_TYPES; i++) {
794 while (!list_empty(&comp_idle_workspace[i])) {
795 workspace = comp_idle_workspace[i].next;
796 list_del(workspace);
797 btrfs_compress_op[i]->free_workspace(workspace);
798 atomic_dec(&comp_alloc_workspace[i]);
799 }
800 }
801}
802
803/*
804 * given an address space and start/len, compress the bytes.
805 *
806 * pages are allocated to hold the compressed result and stored
807 * in 'pages'
808 *
809 * out_pages is used to return the number of pages allocated. There
810 * may be pages allocated even if we return an error
811 *
812 * total_in is used to return the number of bytes actually read. It
813 * may be smaller then len if we had to exit early because we
814 * ran out of room in the pages array or because we cross the
815 * max_out threshold.
816 *
817 * total_out is used to return the total number of compressed bytes
818 *
819 * max_out tells us the max number of bytes that we're allowed to
820 * stuff into pages
821 */
822int btrfs_compress_pages(int type, struct address_space *mapping,
823 u64 start, unsigned long len,
824 struct page **pages,
825 unsigned long nr_dest_pages,
826 unsigned long *out_pages,
827 unsigned long *total_in,
828 unsigned long *total_out,
829 unsigned long max_out)
830{
831 struct list_head *workspace;
832 int ret;
833
834 workspace = find_workspace(type);
835 if (IS_ERR(workspace))
836 return -1;
837
838 ret = btrfs_compress_op[type-1]->compress_pages(workspace, mapping,
839 start, len, pages,
840 nr_dest_pages, out_pages,
841 total_in, total_out,
842 max_out);
843 free_workspace(type, workspace);
844 return ret;
845}
846
847/*
848 * pages_in is an array of pages with compressed data.
849 *
850 * disk_start is the starting logical offset of this array in the file
851 *
852 * bvec is a bio_vec of pages from the file that we want to decompress into
853 *
854 * vcnt is the count of pages in the biovec
855 *
856 * srclen is the number of bytes in pages_in
857 *
858 * The basic idea is that we have a bio that was created by readpages.
859 * The pages in the bio are for the uncompressed data, and they may not
860 * be contiguous. They all correspond to the range of bytes covered by
861 * the compressed extent.
862 */
863int btrfs_decompress_biovec(int type, struct page **pages_in, u64 disk_start,
864 struct bio_vec *bvec, int vcnt, size_t srclen)
865{
866 struct list_head *workspace;
867 int ret;
868
869 workspace = find_workspace(type);
870 if (IS_ERR(workspace))
871 return -ENOMEM;
872
873 ret = btrfs_compress_op[type-1]->decompress_biovec(workspace, pages_in,
874 disk_start,
875 bvec, vcnt, srclen);
876 free_workspace(type, workspace);
877 return ret;
878}
879
880/*
881 * a less complex decompression routine. Our compressed data fits in a
882 * single page, and we want to read a single page out of it.
883 * start_byte tells us the offset into the compressed data we're interested in
884 */
885int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page,
886 unsigned long start_byte, size_t srclen, size_t destlen)
887{
888 struct list_head *workspace;
889 int ret;
890
891 workspace = find_workspace(type);
892 if (IS_ERR(workspace))
893 return -ENOMEM;
894
895 ret = btrfs_compress_op[type-1]->decompress(workspace, data_in,
896 dest_page, start_byte,
897 srclen, destlen);
898
899 free_workspace(type, workspace);
900 return ret;
901}
902
903void __exit btrfs_exit_compress(void)
904{
905 free_workspaces();
906}
907
908/*
909 * Copy uncompressed data from working buffer to pages.
910 *
911 * buf_start is the byte offset we're of the start of our workspace buffer.
912 *
913 * total_out is the last byte of the buffer
914 */
915int btrfs_decompress_buf2page(char *buf, unsigned long buf_start,
916 unsigned long total_out, u64 disk_start,
917 struct bio_vec *bvec, int vcnt,
918 unsigned long *page_index,
919 unsigned long *pg_offset)
920{
921 unsigned long buf_offset;
922 unsigned long current_buf_start;
923 unsigned long start_byte;
924 unsigned long working_bytes = total_out - buf_start;
925 unsigned long bytes;
926 char *kaddr;
927 struct page *page_out = bvec[*page_index].bv_page;
928
929 /*
930 * start byte is the first byte of the page we're currently
931 * copying into relative to the start of the compressed data.
932 */
933 start_byte = page_offset(page_out) - disk_start;
934
935 /* we haven't yet hit data corresponding to this page */
936 if (total_out <= start_byte)
937 return 1;
938
939 /*
940 * the start of the data we care about is offset into
941 * the middle of our working buffer
942 */
943 if (total_out > start_byte && buf_start < start_byte) {
944 buf_offset = start_byte - buf_start;
945 working_bytes -= buf_offset;
946 } else {
947 buf_offset = 0;
948 }
949 current_buf_start = buf_start;
950
951 /* copy bytes from the working buffer into the pages */
952 while (working_bytes > 0) {
953 bytes = min(PAGE_CACHE_SIZE - *pg_offset,
954 PAGE_CACHE_SIZE - buf_offset);
955 bytes = min(bytes, working_bytes);
956 kaddr = kmap_atomic(page_out, KM_USER0);
957 memcpy(kaddr + *pg_offset, buf + buf_offset, bytes);
958 kunmap_atomic(kaddr, KM_USER0);
959 flush_dcache_page(page_out);
960
961 *pg_offset += bytes;
962 buf_offset += bytes;
963 working_bytes -= bytes;
964 current_buf_start += bytes;
965
966 /* check if we need to pick another page */
967 if (*pg_offset == PAGE_CACHE_SIZE) {
968 (*page_index)++;
969 if (*page_index >= vcnt)
970 return 0;
971
972 page_out = bvec[*page_index].bv_page;
973 *pg_offset = 0;
974 start_byte = page_offset(page_out) - disk_start;
975
976 /*
977 * make sure our new page is covered by this
978 * working buffer
979 */
980 if (total_out <= start_byte)
981 return 1;
982
983 /*
984 * the next page in the biovec might not be adjacent
985 * to the last page, but it might still be found
986 * inside this working buffer. bump our offset pointer
987 */
988 if (total_out > start_byte &&
989 current_buf_start < start_byte) {
990 buf_offset = start_byte - buf_start;
991 working_bytes = total_out - start_byte;
992 current_buf_start = buf_start + buf_offset;
993 }
994 }
995 }
996
997 return 1;
998}
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index 421f5b4aa71..51000174b9d 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -19,24 +19,27 @@
19#ifndef __BTRFS_COMPRESSION_ 19#ifndef __BTRFS_COMPRESSION_
20#define __BTRFS_COMPRESSION_ 20#define __BTRFS_COMPRESSION_
21 21
22int btrfs_zlib_decompress(unsigned char *data_in, 22int btrfs_init_compress(void);
23 struct page *dest_page, 23void btrfs_exit_compress(void);
24 unsigned long start_byte, 24
25 size_t srclen, size_t destlen); 25int btrfs_compress_pages(int type, struct address_space *mapping,
26int btrfs_zlib_compress_pages(struct address_space *mapping, 26 u64 start, unsigned long len,
27 u64 start, unsigned long len, 27 struct page **pages,
28 struct page **pages, 28 unsigned long nr_dest_pages,
29 unsigned long nr_dest_pages, 29 unsigned long *out_pages,
30 unsigned long *out_pages, 30 unsigned long *total_in,
31 unsigned long *total_in, 31 unsigned long *total_out,
32 unsigned long *total_out, 32 unsigned long max_out);
33 unsigned long max_out); 33int btrfs_decompress_biovec(int type, struct page **pages_in, u64 disk_start,
34int btrfs_zlib_decompress_biovec(struct page **pages_in, 34 struct bio_vec *bvec, int vcnt, size_t srclen);
35 u64 disk_start, 35int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page,
36 struct bio_vec *bvec, 36 unsigned long start_byte, size_t srclen, size_t destlen);
37 int vcnt, 37int btrfs_decompress_buf2page(char *buf, unsigned long buf_start,
38 size_t srclen); 38 unsigned long total_out, u64 disk_start,
39void btrfs_zlib_exit(void); 39 struct bio_vec *bvec, int vcnt,
40 unsigned long *page_index,
41 unsigned long *pg_offset);
42
40int btrfs_submit_compressed_write(struct inode *inode, u64 start, 43int btrfs_submit_compressed_write(struct inode *inode, u64 start,
41 unsigned long len, u64 disk_start, 44 unsigned long len, u64 disk_start,
42 unsigned long compressed_len, 45 unsigned long compressed_len,
@@ -44,4 +47,37 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
44 unsigned long nr_pages); 47 unsigned long nr_pages);
45int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, 48int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
46 int mirror_num, unsigned long bio_flags); 49 int mirror_num, unsigned long bio_flags);
50
51struct btrfs_compress_op {
52 struct list_head *(*alloc_workspace)(void);
53
54 void (*free_workspace)(struct list_head *workspace);
55
56 int (*compress_pages)(struct list_head *workspace,
57 struct address_space *mapping,
58 u64 start, unsigned long len,
59 struct page **pages,
60 unsigned long nr_dest_pages,
61 unsigned long *out_pages,
62 unsigned long *total_in,
63 unsigned long *total_out,
64 unsigned long max_out);
65
66 int (*decompress_biovec)(struct list_head *workspace,
67 struct page **pages_in,
68 u64 disk_start,
69 struct bio_vec *bvec,
70 int vcnt,
71 size_t srclen);
72
73 int (*decompress)(struct list_head *workspace,
74 unsigned char *data_in,
75 struct page *dest_page,
76 unsigned long start_byte,
77 size_t srclen, size_t destlen);
78};
79
80extern struct btrfs_compress_op btrfs_zlib_compress;
81extern struct btrfs_compress_op btrfs_lzo_compress;
82
47#endif 83#endif
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 9ac17159925..b5baff0dccf 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -105,6 +105,8 @@ noinline void btrfs_clear_path_blocking(struct btrfs_path *p,
105/* this also releases the path */ 105/* this also releases the path */
106void btrfs_free_path(struct btrfs_path *p) 106void btrfs_free_path(struct btrfs_path *p)
107{ 107{
108 if (!p)
109 return;
108 btrfs_release_path(NULL, p); 110 btrfs_release_path(NULL, p);
109 kmem_cache_free(btrfs_path_cachep, p); 111 kmem_cache_free(btrfs_path_cachep, p);
110} 112}
@@ -2514,6 +2516,9 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
2514 btrfs_assert_tree_locked(path->nodes[1]); 2516 btrfs_assert_tree_locked(path->nodes[1]);
2515 2517
2516 right = read_node_slot(root, upper, slot + 1); 2518 right = read_node_slot(root, upper, slot + 1);
2519 if (right == NULL)
2520 return 1;
2521
2517 btrfs_tree_lock(right); 2522 btrfs_tree_lock(right);
2518 btrfs_set_lock_blocking(right); 2523 btrfs_set_lock_blocking(right);
2519 2524
@@ -2764,6 +2769,9 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
2764 btrfs_assert_tree_locked(path->nodes[1]); 2769 btrfs_assert_tree_locked(path->nodes[1]);
2765 2770
2766 left = read_node_slot(root, path->nodes[1], slot - 1); 2771 left = read_node_slot(root, path->nodes[1], slot - 1);
2772 if (left == NULL)
2773 return 1;
2774
2767 btrfs_tree_lock(left); 2775 btrfs_tree_lock(left);
2768 btrfs_set_lock_blocking(left); 2776 btrfs_set_lock_blocking(left);
2769 2777
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 8db9234f6b4..2c98b3af605 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -27,6 +27,7 @@
27#include <linux/backing-dev.h> 27#include <linux/backing-dev.h>
28#include <linux/wait.h> 28#include <linux/wait.h>
29#include <linux/slab.h> 29#include <linux/slab.h>
30#include <linux/kobject.h>
30#include <asm/kmap_types.h> 31#include <asm/kmap_types.h>
31#include "extent_io.h" 32#include "extent_io.h"
32#include "extent_map.h" 33#include "extent_map.h"
@@ -294,6 +295,14 @@ static inline unsigned long btrfs_chunk_item_size(int num_stripes)
294#define BTRFS_FSID_SIZE 16 295#define BTRFS_FSID_SIZE 16
295#define BTRFS_HEADER_FLAG_WRITTEN (1ULL << 0) 296#define BTRFS_HEADER_FLAG_WRITTEN (1ULL << 0)
296#define BTRFS_HEADER_FLAG_RELOC (1ULL << 1) 297#define BTRFS_HEADER_FLAG_RELOC (1ULL << 1)
298
299/*
300 * File system states
301 */
302
303/* Errors detected */
304#define BTRFS_SUPER_FLAG_ERROR (1ULL << 2)
305
297#define BTRFS_SUPER_FLAG_SEEDING (1ULL << 32) 306#define BTRFS_SUPER_FLAG_SEEDING (1ULL << 32)
298#define BTRFS_SUPER_FLAG_METADUMP (1ULL << 33) 307#define BTRFS_SUPER_FLAG_METADUMP (1ULL << 33)
299 308
@@ -398,13 +407,15 @@ struct btrfs_super_block {
398#define BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF (1ULL << 0) 407#define BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF (1ULL << 0)
399#define BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL (1ULL << 1) 408#define BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL (1ULL << 1)
400#define BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS (1ULL << 2) 409#define BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS (1ULL << 2)
410#define BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO (1ULL << 3)
401 411
402#define BTRFS_FEATURE_COMPAT_SUPP 0ULL 412#define BTRFS_FEATURE_COMPAT_SUPP 0ULL
403#define BTRFS_FEATURE_COMPAT_RO_SUPP 0ULL 413#define BTRFS_FEATURE_COMPAT_RO_SUPP 0ULL
404#define BTRFS_FEATURE_INCOMPAT_SUPP \ 414#define BTRFS_FEATURE_INCOMPAT_SUPP \
405 (BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | \ 415 (BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | \
406 BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \ 416 BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \
407 BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) 417 BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS | \
418 BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO)
408 419
409/* 420/*
410 * A leaf is full of items. offset and size tell us where to find 421 * A leaf is full of items. offset and size tell us where to find
@@ -551,9 +562,11 @@ struct btrfs_timespec {
551} __attribute__ ((__packed__)); 562} __attribute__ ((__packed__));
552 563
553enum btrfs_compression_type { 564enum btrfs_compression_type {
554 BTRFS_COMPRESS_NONE = 0, 565 BTRFS_COMPRESS_NONE = 0,
555 BTRFS_COMPRESS_ZLIB = 1, 566 BTRFS_COMPRESS_ZLIB = 1,
556 BTRFS_COMPRESS_LAST = 2, 567 BTRFS_COMPRESS_LZO = 2,
568 BTRFS_COMPRESS_TYPES = 2,
569 BTRFS_COMPRESS_LAST = 3,
557}; 570};
558 571
559struct btrfs_inode_item { 572struct btrfs_inode_item {
@@ -597,6 +610,8 @@ struct btrfs_dir_item {
597 u8 type; 610 u8 type;
598} __attribute__ ((__packed__)); 611} __attribute__ ((__packed__));
599 612
613#define BTRFS_ROOT_SUBVOL_RDONLY (1ULL << 0)
614
600struct btrfs_root_item { 615struct btrfs_root_item {
601 struct btrfs_inode_item inode; 616 struct btrfs_inode_item inode;
602 __le64 generation; 617 __le64 generation;
@@ -808,9 +823,9 @@ struct btrfs_block_group_cache {
808 int extents_thresh; 823 int extents_thresh;
809 int free_extents; 824 int free_extents;
810 int total_bitmaps; 825 int total_bitmaps;
811 int ro:1; 826 unsigned int ro:1;
812 int dirty:1; 827 unsigned int dirty:1;
813 int iref:1; 828 unsigned int iref:1;
814 829
815 int disk_cache_state; 830 int disk_cache_state;
816 831
@@ -895,7 +910,8 @@ struct btrfs_fs_info {
895 */ 910 */
896 u64 last_trans_log_full_commit; 911 u64 last_trans_log_full_commit;
897 u64 open_ioctl_trans; 912 u64 open_ioctl_trans;
898 unsigned long mount_opt; 913 unsigned long mount_opt:20;
914 unsigned long compress_type:4;
899 u64 max_inline; 915 u64 max_inline;
900 u64 alloc_start; 916 u64 alloc_start;
901 struct btrfs_transaction *running_transaction; 917 struct btrfs_transaction *running_transaction;
@@ -1050,6 +1066,9 @@ struct btrfs_fs_info {
1050 unsigned metadata_ratio; 1066 unsigned metadata_ratio;
1051 1067
1052 void *bdev_holder; 1068 void *bdev_holder;
1069
1070 /* filesystem state */
1071 u64 fs_state;
1053}; 1072};
1054 1073
1055/* 1074/*
@@ -1893,6 +1912,11 @@ BTRFS_SETGET_STACK_FUNCS(root_limit, struct btrfs_root_item, byte_limit, 64);
1893BTRFS_SETGET_STACK_FUNCS(root_last_snapshot, struct btrfs_root_item, 1912BTRFS_SETGET_STACK_FUNCS(root_last_snapshot, struct btrfs_root_item,
1894 last_snapshot, 64); 1913 last_snapshot, 64);
1895 1914
1915static inline bool btrfs_root_readonly(struct btrfs_root *root)
1916{
1917 return root->root_item.flags & BTRFS_ROOT_SUBVOL_RDONLY;
1918}
1919
1896/* struct btrfs_super_block */ 1920/* struct btrfs_super_block */
1897 1921
1898BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64); 1922BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64);
@@ -2145,6 +2169,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
2145int btrfs_remove_block_group(struct btrfs_trans_handle *trans, 2169int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
2146 struct btrfs_root *root, u64 group_start); 2170 struct btrfs_root *root, u64 group_start);
2147u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags); 2171u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags);
2172u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data);
2148void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *ionde); 2173void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *ionde);
2149void btrfs_clear_space_info_full(struct btrfs_fs_info *info); 2174void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
2150int btrfs_check_data_free_space(struct inode *inode, u64 bytes); 2175int btrfs_check_data_free_space(struct inode *inode, u64 bytes);
@@ -2188,6 +2213,12 @@ int btrfs_set_block_group_ro(struct btrfs_root *root,
2188int btrfs_set_block_group_rw(struct btrfs_root *root, 2213int btrfs_set_block_group_rw(struct btrfs_root *root,
2189 struct btrfs_block_group_cache *cache); 2214 struct btrfs_block_group_cache *cache);
2190void btrfs_put_block_group_cache(struct btrfs_fs_info *info); 2215void btrfs_put_block_group_cache(struct btrfs_fs_info *info);
2216u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo);
2217int btrfs_error_unpin_extent_range(struct btrfs_root *root,
2218 u64 start, u64 end);
2219int btrfs_error_discard_extent(struct btrfs_root *root, u64 bytenr,
2220 u64 num_bytes);
2221
2191/* ctree.c */ 2222/* ctree.c */
2192int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, 2223int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
2193 int level, int *slot); 2224 int level, int *slot);
@@ -2541,10 +2572,18 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size);
2541/* super.c */ 2572/* super.c */
2542int btrfs_parse_options(struct btrfs_root *root, char *options); 2573int btrfs_parse_options(struct btrfs_root *root, char *options);
2543int btrfs_sync_fs(struct super_block *sb, int wait); 2574int btrfs_sync_fs(struct super_block *sb, int wait);
2575void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
2576 unsigned int line, int errno);
2577
2578#define btrfs_std_error(fs_info, errno) \
2579do { \
2580 if ((errno)) \
2581 __btrfs_std_error((fs_info), __func__, __LINE__, (errno));\
2582} while (0)
2544 2583
2545/* acl.c */ 2584/* acl.c */
2546#ifdef CONFIG_BTRFS_FS_POSIX_ACL 2585#ifdef CONFIG_BTRFS_FS_POSIX_ACL
2547int btrfs_check_acl(struct inode *inode, int mask); 2586int btrfs_check_acl(struct inode *inode, int mask, unsigned int flags);
2548#else 2587#else
2549#define btrfs_check_acl NULL 2588#define btrfs_check_acl NULL
2550#endif 2589#endif
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index fb827d0d718..b531c36455d 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -28,6 +28,7 @@
28#include <linux/freezer.h> 28#include <linux/freezer.h>
29#include <linux/crc32c.h> 29#include <linux/crc32c.h>
30#include <linux/slab.h> 30#include <linux/slab.h>
31#include <linux/migrate.h>
31#include "compat.h" 32#include "compat.h"
32#include "ctree.h" 33#include "ctree.h"
33#include "disk-io.h" 34#include "disk-io.h"
@@ -43,6 +44,20 @@
43static struct extent_io_ops btree_extent_io_ops; 44static struct extent_io_ops btree_extent_io_ops;
44static void end_workqueue_fn(struct btrfs_work *work); 45static void end_workqueue_fn(struct btrfs_work *work);
45static void free_fs_root(struct btrfs_root *root); 46static void free_fs_root(struct btrfs_root *root);
47static void btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
48 int read_only);
49static int btrfs_destroy_ordered_operations(struct btrfs_root *root);
50static int btrfs_destroy_ordered_extents(struct btrfs_root *root);
51static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
52 struct btrfs_root *root);
53static int btrfs_destroy_pending_snapshots(struct btrfs_transaction *t);
54static int btrfs_destroy_delalloc_inodes(struct btrfs_root *root);
55static int btrfs_destroy_marked_extents(struct btrfs_root *root,
56 struct extent_io_tree *dirty_pages,
57 int mark);
58static int btrfs_destroy_pinned_extent(struct btrfs_root *root,
59 struct extent_io_tree *pinned_extents);
60static int btrfs_cleanup_transaction(struct btrfs_root *root);
46 61
47/* 62/*
48 * end_io_wq structs are used to do processing in task context when an IO is 63 * end_io_wq structs are used to do processing in task context when an IO is
@@ -352,9 +367,15 @@ static int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
352 WARN_ON(len == 0); 367 WARN_ON(len == 0);
353 368
354 eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS); 369 eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS);
370 if (eb == NULL) {
371 WARN_ON(1);
372 goto out;
373 }
355 ret = btree_read_extent_buffer_pages(root, eb, start + PAGE_CACHE_SIZE, 374 ret = btree_read_extent_buffer_pages(root, eb, start + PAGE_CACHE_SIZE,
356 btrfs_header_generation(eb)); 375 btrfs_header_generation(eb));
357 BUG_ON(ret); 376 BUG_ON(ret);
377 WARN_ON(!btrfs_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN));
378
358 found_start = btrfs_header_bytenr(eb); 379 found_start = btrfs_header_bytenr(eb);
359 if (found_start != start) { 380 if (found_start != start) {
360 WARN_ON(1); 381 WARN_ON(1);
@@ -424,6 +445,10 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
424 WARN_ON(len == 0); 445 WARN_ON(len == 0);
425 446
426 eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS); 447 eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS);
448 if (eb == NULL) {
449 ret = -EIO;
450 goto out;
451 }
427 452
428 found_start = btrfs_header_bytenr(eb); 453 found_start = btrfs_header_bytenr(eb);
429 if (found_start != start) { 454 if (found_start != start) {
@@ -693,6 +718,27 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
693 __btree_submit_bio_done); 718 __btree_submit_bio_done);
694} 719}
695 720
721#ifdef CONFIG_MIGRATION
722static int btree_migratepage(struct address_space *mapping,
723 struct page *newpage, struct page *page)
724{
725 /*
726 * we can't safely write a btree page from here,
727 * we haven't done the locking hook
728 */
729 if (PageDirty(page))
730 return -EAGAIN;
731 /*
732 * Buffers may be managed in a filesystem specific way.
733 * We must have no buffers or drop them.
734 */
735 if (page_has_private(page) &&
736 !try_to_release_page(page, GFP_KERNEL))
737 return -EAGAIN;
738 return migrate_page(mapping, newpage, page);
739}
740#endif
741
696static int btree_writepage(struct page *page, struct writeback_control *wbc) 742static int btree_writepage(struct page *page, struct writeback_control *wbc)
697{ 743{
698 struct extent_io_tree *tree; 744 struct extent_io_tree *tree;
@@ -707,8 +753,7 @@ static int btree_writepage(struct page *page, struct writeback_control *wbc)
707 } 753 }
708 754
709 redirty_page_for_writepage(wbc, page); 755 redirty_page_for_writepage(wbc, page);
710 eb = btrfs_find_tree_block(root, page_offset(page), 756 eb = btrfs_find_tree_block(root, page_offset(page), PAGE_CACHE_SIZE);
711 PAGE_CACHE_SIZE);
712 WARN_ON(!eb); 757 WARN_ON(!eb);
713 758
714 was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags); 759 was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
@@ -799,6 +844,9 @@ static const struct address_space_operations btree_aops = {
799 .releasepage = btree_releasepage, 844 .releasepage = btree_releasepage,
800 .invalidatepage = btree_invalidatepage, 845 .invalidatepage = btree_invalidatepage,
801 .sync_page = block_sync_page, 846 .sync_page = block_sync_page,
847#ifdef CONFIG_MIGRATION
848 .migratepage = btree_migratepage,
849#endif
802}; 850};
803 851
804int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize, 852int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize,
@@ -981,7 +1029,10 @@ static int find_and_setup_root(struct btrfs_root *tree_root,
981 blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item)); 1029 blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
982 root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item), 1030 root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
983 blocksize, generation); 1031 blocksize, generation);
984 BUG_ON(!root->node); 1032 if (!root->node || !btrfs_buffer_uptodate(root->node, generation)) {
1033 free_extent_buffer(root->node);
1034 return -EIO;
1035 }
985 root->commit_root = btrfs_root_node(root); 1036 root->commit_root = btrfs_root_node(root);
986 return 0; 1037 return 0;
987} 1038}
@@ -1116,6 +1167,7 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
1116 } 1167 }
1117 btrfs_free_path(path); 1168 btrfs_free_path(path);
1118 if (ret) { 1169 if (ret) {
1170 kfree(root);
1119 if (ret > 0) 1171 if (ret > 0)
1120 ret = -ENOENT; 1172 ret = -ENOENT;
1121 return ERR_PTR(ret); 1173 return ERR_PTR(ret);
@@ -1538,10 +1590,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1538 GFP_NOFS); 1590 GFP_NOFS);
1539 struct btrfs_root *csum_root = kzalloc(sizeof(struct btrfs_root), 1591 struct btrfs_root *csum_root = kzalloc(sizeof(struct btrfs_root),
1540 GFP_NOFS); 1592 GFP_NOFS);
1541 struct btrfs_root *tree_root = kzalloc(sizeof(struct btrfs_root), 1593 struct btrfs_root *tree_root = btrfs_sb(sb);
1542 GFP_NOFS); 1594 struct btrfs_fs_info *fs_info = tree_root->fs_info;
1543 struct btrfs_fs_info *fs_info = kzalloc(sizeof(*fs_info),
1544 GFP_NOFS);
1545 struct btrfs_root *chunk_root = kzalloc(sizeof(struct btrfs_root), 1595 struct btrfs_root *chunk_root = kzalloc(sizeof(struct btrfs_root),
1546 GFP_NOFS); 1596 GFP_NOFS);
1547 struct btrfs_root *dev_root = kzalloc(sizeof(struct btrfs_root), 1597 struct btrfs_root *dev_root = kzalloc(sizeof(struct btrfs_root),
@@ -1686,8 +1736,10 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1686 fs_info, BTRFS_ROOT_TREE_OBJECTID); 1736 fs_info, BTRFS_ROOT_TREE_OBJECTID);
1687 1737
1688 bh = btrfs_read_dev_super(fs_devices->latest_bdev); 1738 bh = btrfs_read_dev_super(fs_devices->latest_bdev);
1689 if (!bh) 1739 if (!bh) {
1740 err = -EINVAL;
1690 goto fail_iput; 1741 goto fail_iput;
1742 }
1691 1743
1692 memcpy(&fs_info->super_copy, bh->b_data, sizeof(fs_info->super_copy)); 1744 memcpy(&fs_info->super_copy, bh->b_data, sizeof(fs_info->super_copy));
1693 memcpy(&fs_info->super_for_commit, &fs_info->super_copy, 1745 memcpy(&fs_info->super_for_commit, &fs_info->super_copy,
@@ -1700,6 +1752,11 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1700 if (!btrfs_super_root(disk_super)) 1752 if (!btrfs_super_root(disk_super))
1701 goto fail_iput; 1753 goto fail_iput;
1702 1754
1755 /* check FS state, whether FS is broken. */
1756 fs_info->fs_state |= btrfs_super_flags(disk_super);
1757
1758 btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY);
1759
1703 ret = btrfs_parse_options(tree_root, options); 1760 ret = btrfs_parse_options(tree_root, options);
1704 if (ret) { 1761 if (ret) {
1705 err = ret; 1762 err = ret;
@@ -1717,10 +1774,10 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1717 } 1774 }
1718 1775
1719 features = btrfs_super_incompat_flags(disk_super); 1776 features = btrfs_super_incompat_flags(disk_super);
1720 if (!(features & BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF)) { 1777 features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF;
1721 features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF; 1778 if (tree_root->fs_info->compress_type & BTRFS_COMPRESS_LZO)
1722 btrfs_set_super_incompat_flags(disk_super, features); 1779 features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO;
1723 } 1780 btrfs_set_super_incompat_flags(disk_super, features);
1724 1781
1725 features = btrfs_super_compat_ro_flags(disk_super) & 1782 features = btrfs_super_compat_ro_flags(disk_super) &
1726 ~BTRFS_FEATURE_COMPAT_RO_SUPP; 1783 ~BTRFS_FEATURE_COMPAT_RO_SUPP;
@@ -1930,7 +1987,9 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1930 btrfs_set_opt(fs_info->mount_opt, SSD); 1987 btrfs_set_opt(fs_info->mount_opt, SSD);
1931 } 1988 }
1932 1989
1933 if (btrfs_super_log_root(disk_super) != 0) { 1990 /* do not make disk changes in broken FS */
1991 if (btrfs_super_log_root(disk_super) != 0 &&
1992 !(fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)) {
1934 u64 bytenr = btrfs_super_log_root(disk_super); 1993 u64 bytenr = btrfs_super_log_root(disk_super);
1935 1994
1936 if (fs_devices->rw_devices == 0) { 1995 if (fs_devices->rw_devices == 0) {
@@ -2415,8 +2474,28 @@ int close_ctree(struct btrfs_root *root)
2415 smp_mb(); 2474 smp_mb();
2416 2475
2417 btrfs_put_block_group_cache(fs_info); 2476 btrfs_put_block_group_cache(fs_info);
2477
2478 /*
2479 * Here come 2 situations when btrfs is broken to flip readonly:
2480 *
2481 * 1. when btrfs flips readonly somewhere else before
2482 * btrfs_commit_super, sb->s_flags has MS_RDONLY flag,
2483 * and btrfs will skip to write sb directly to keep
2484 * ERROR state on disk.
2485 *
2486 * 2. when btrfs flips readonly just in btrfs_commit_super,
2487 * and in such case, btrfs cannnot write sb via btrfs_commit_super,
2488 * and since fs_state has been set BTRFS_SUPER_FLAG_ERROR flag,
2489 * btrfs will cleanup all FS resources first and write sb then.
2490 */
2418 if (!(fs_info->sb->s_flags & MS_RDONLY)) { 2491 if (!(fs_info->sb->s_flags & MS_RDONLY)) {
2419 ret = btrfs_commit_super(root); 2492 ret = btrfs_commit_super(root);
2493 if (ret)
2494 printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
2495 }
2496
2497 if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
2498 ret = btrfs_error_commit_super(root);
2420 if (ret) 2499 if (ret)
2421 printk(KERN_ERR "btrfs: commit super ret %d\n", ret); 2500 printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
2422 } 2501 }
@@ -2592,6 +2671,352 @@ out:
2592 return 0; 2671 return 0;
2593} 2672}
2594 2673
2674static void btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
2675 int read_only)
2676{
2677 if (read_only)
2678 return;
2679
2680 if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)
2681 printk(KERN_WARNING "warning: mount fs with errors, "
2682 "running btrfsck is recommended\n");
2683}
2684
2685int btrfs_error_commit_super(struct btrfs_root *root)
2686{
2687 int ret;
2688
2689 mutex_lock(&root->fs_info->cleaner_mutex);
2690 btrfs_run_delayed_iputs(root);
2691 mutex_unlock(&root->fs_info->cleaner_mutex);
2692
2693 down_write(&root->fs_info->cleanup_work_sem);
2694 up_write(&root->fs_info->cleanup_work_sem);
2695
2696 /* cleanup FS via transaction */
2697 btrfs_cleanup_transaction(root);
2698
2699 ret = write_ctree_super(NULL, root, 0);
2700
2701 return ret;
2702}
2703
2704static int btrfs_destroy_ordered_operations(struct btrfs_root *root)
2705{
2706 struct btrfs_inode *btrfs_inode;
2707 struct list_head splice;
2708
2709 INIT_LIST_HEAD(&splice);
2710
2711 mutex_lock(&root->fs_info->ordered_operations_mutex);
2712 spin_lock(&root->fs_info->ordered_extent_lock);
2713
2714 list_splice_init(&root->fs_info->ordered_operations, &splice);
2715 while (!list_empty(&splice)) {
2716 btrfs_inode = list_entry(splice.next, struct btrfs_inode,
2717 ordered_operations);
2718
2719 list_del_init(&btrfs_inode->ordered_operations);
2720
2721 btrfs_invalidate_inodes(btrfs_inode->root);
2722 }
2723
2724 spin_unlock(&root->fs_info->ordered_extent_lock);
2725 mutex_unlock(&root->fs_info->ordered_operations_mutex);
2726
2727 return 0;
2728}
2729
2730static int btrfs_destroy_ordered_extents(struct btrfs_root *root)
2731{
2732 struct list_head splice;
2733 struct btrfs_ordered_extent *ordered;
2734 struct inode *inode;
2735
2736 INIT_LIST_HEAD(&splice);
2737
2738 spin_lock(&root->fs_info->ordered_extent_lock);
2739
2740 list_splice_init(&root->fs_info->ordered_extents, &splice);
2741 while (!list_empty(&splice)) {
2742 ordered = list_entry(splice.next, struct btrfs_ordered_extent,
2743 root_extent_list);
2744
2745 list_del_init(&ordered->root_extent_list);
2746 atomic_inc(&ordered->refs);
2747
2748 /* the inode may be getting freed (in sys_unlink path). */
2749 inode = igrab(ordered->inode);
2750
2751 spin_unlock(&root->fs_info->ordered_extent_lock);
2752 if (inode)
2753 iput(inode);
2754
2755 atomic_set(&ordered->refs, 1);
2756 btrfs_put_ordered_extent(ordered);
2757
2758 spin_lock(&root->fs_info->ordered_extent_lock);
2759 }
2760
2761 spin_unlock(&root->fs_info->ordered_extent_lock);
2762
2763 return 0;
2764}
2765
2766static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
2767 struct btrfs_root *root)
2768{
2769 struct rb_node *node;
2770 struct btrfs_delayed_ref_root *delayed_refs;
2771 struct btrfs_delayed_ref_node *ref;
2772 int ret = 0;
2773
2774 delayed_refs = &trans->delayed_refs;
2775
2776 spin_lock(&delayed_refs->lock);
2777 if (delayed_refs->num_entries == 0) {
2778 printk(KERN_INFO "delayed_refs has NO entry\n");
2779 return ret;
2780 }
2781
2782 node = rb_first(&delayed_refs->root);
2783 while (node) {
2784 ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
2785 node = rb_next(node);
2786
2787 ref->in_tree = 0;
2788 rb_erase(&ref->rb_node, &delayed_refs->root);
2789 delayed_refs->num_entries--;
2790
2791 atomic_set(&ref->refs, 1);
2792 if (btrfs_delayed_ref_is_head(ref)) {
2793 struct btrfs_delayed_ref_head *head;
2794
2795 head = btrfs_delayed_node_to_head(ref);
2796 mutex_lock(&head->mutex);
2797 kfree(head->extent_op);
2798 delayed_refs->num_heads--;
2799 if (list_empty(&head->cluster))
2800 delayed_refs->num_heads_ready--;
2801 list_del_init(&head->cluster);
2802 mutex_unlock(&head->mutex);
2803 }
2804
2805 spin_unlock(&delayed_refs->lock);
2806 btrfs_put_delayed_ref(ref);
2807
2808 cond_resched();
2809 spin_lock(&delayed_refs->lock);
2810 }
2811
2812 spin_unlock(&delayed_refs->lock);
2813
2814 return ret;
2815}
2816
2817static int btrfs_destroy_pending_snapshots(struct btrfs_transaction *t)
2818{
2819 struct btrfs_pending_snapshot *snapshot;
2820 struct list_head splice;
2821
2822 INIT_LIST_HEAD(&splice);
2823
2824 list_splice_init(&t->pending_snapshots, &splice);
2825
2826 while (!list_empty(&splice)) {
2827 snapshot = list_entry(splice.next,
2828 struct btrfs_pending_snapshot,
2829 list);
2830
2831 list_del_init(&snapshot->list);
2832
2833 kfree(snapshot);
2834 }
2835
2836 return 0;
2837}
2838
2839static int btrfs_destroy_delalloc_inodes(struct btrfs_root *root)
2840{
2841 struct btrfs_inode *btrfs_inode;
2842 struct list_head splice;
2843
2844 INIT_LIST_HEAD(&splice);
2845
2846 list_splice_init(&root->fs_info->delalloc_inodes, &splice);
2847
2848 spin_lock(&root->fs_info->delalloc_lock);
2849
2850 while (!list_empty(&splice)) {
2851 btrfs_inode = list_entry(splice.next, struct btrfs_inode,
2852 delalloc_inodes);
2853
2854 list_del_init(&btrfs_inode->delalloc_inodes);
2855
2856 btrfs_invalidate_inodes(btrfs_inode->root);
2857 }
2858
2859 spin_unlock(&root->fs_info->delalloc_lock);
2860
2861 return 0;
2862}
2863
2864static int btrfs_destroy_marked_extents(struct btrfs_root *root,
2865 struct extent_io_tree *dirty_pages,
2866 int mark)
2867{
2868 int ret;
2869 struct page *page;
2870 struct inode *btree_inode = root->fs_info->btree_inode;
2871 struct extent_buffer *eb;
2872 u64 start = 0;
2873 u64 end;
2874 u64 offset;
2875 unsigned long index;
2876
2877 while (1) {
2878 ret = find_first_extent_bit(dirty_pages, start, &start, &end,
2879 mark);
2880 if (ret)
2881 break;
2882
2883 clear_extent_bits(dirty_pages, start, end, mark, GFP_NOFS);
2884 while (start <= end) {
2885 index = start >> PAGE_CACHE_SHIFT;
2886 start = (u64)(index + 1) << PAGE_CACHE_SHIFT;
2887 page = find_get_page(btree_inode->i_mapping, index);
2888 if (!page)
2889 continue;
2890 offset = page_offset(page);
2891
2892 spin_lock(&dirty_pages->buffer_lock);
2893 eb = radix_tree_lookup(
2894 &(&BTRFS_I(page->mapping->host)->io_tree)->buffer,
2895 offset >> PAGE_CACHE_SHIFT);
2896 spin_unlock(&dirty_pages->buffer_lock);
2897 if (eb) {
2898 ret = test_and_clear_bit(EXTENT_BUFFER_DIRTY,
2899 &eb->bflags);
2900 atomic_set(&eb->refs, 1);
2901 }
2902 if (PageWriteback(page))
2903 end_page_writeback(page);
2904
2905 lock_page(page);
2906 if (PageDirty(page)) {
2907 clear_page_dirty_for_io(page);
2908 spin_lock_irq(&page->mapping->tree_lock);
2909 radix_tree_tag_clear(&page->mapping->page_tree,
2910 page_index(page),
2911 PAGECACHE_TAG_DIRTY);
2912 spin_unlock_irq(&page->mapping->tree_lock);
2913 }
2914
2915 page->mapping->a_ops->invalidatepage(page, 0);
2916 unlock_page(page);
2917 }
2918 }
2919
2920 return ret;
2921}
2922
2923static int btrfs_destroy_pinned_extent(struct btrfs_root *root,
2924 struct extent_io_tree *pinned_extents)
2925{
2926 struct extent_io_tree *unpin;
2927 u64 start;
2928 u64 end;
2929 int ret;
2930
2931 unpin = pinned_extents;
2932 while (1) {
2933 ret = find_first_extent_bit(unpin, 0, &start, &end,
2934 EXTENT_DIRTY);
2935 if (ret)
2936 break;
2937
2938 /* opt_discard */
2939 ret = btrfs_error_discard_extent(root, start, end + 1 - start);
2940
2941 clear_extent_dirty(unpin, start, end, GFP_NOFS);
2942 btrfs_error_unpin_extent_range(root, start, end);
2943 cond_resched();
2944 }
2945
2946 return 0;
2947}
2948
2949static int btrfs_cleanup_transaction(struct btrfs_root *root)
2950{
2951 struct btrfs_transaction *t;
2952 LIST_HEAD(list);
2953
2954 WARN_ON(1);
2955
2956 mutex_lock(&root->fs_info->trans_mutex);
2957 mutex_lock(&root->fs_info->transaction_kthread_mutex);
2958
2959 list_splice_init(&root->fs_info->trans_list, &list);
2960 while (!list_empty(&list)) {
2961 t = list_entry(list.next, struct btrfs_transaction, list);
2962 if (!t)
2963 break;
2964
2965 btrfs_destroy_ordered_operations(root);
2966
2967 btrfs_destroy_ordered_extents(root);
2968
2969 btrfs_destroy_delayed_refs(t, root);
2970
2971 btrfs_block_rsv_release(root,
2972 &root->fs_info->trans_block_rsv,
2973 t->dirty_pages.dirty_bytes);
2974
2975 /* FIXME: cleanup wait for commit */
2976 t->in_commit = 1;
2977 t->blocked = 1;
2978 if (waitqueue_active(&root->fs_info->transaction_blocked_wait))
2979 wake_up(&root->fs_info->transaction_blocked_wait);
2980
2981 t->blocked = 0;
2982 if (waitqueue_active(&root->fs_info->transaction_wait))
2983 wake_up(&root->fs_info->transaction_wait);
2984 mutex_unlock(&root->fs_info->trans_mutex);
2985
2986 mutex_lock(&root->fs_info->trans_mutex);
2987 t->commit_done = 1;
2988 if (waitqueue_active(&t->commit_wait))
2989 wake_up(&t->commit_wait);
2990 mutex_unlock(&root->fs_info->trans_mutex);
2991
2992 mutex_lock(&root->fs_info->trans_mutex);
2993
2994 btrfs_destroy_pending_snapshots(t);
2995
2996 btrfs_destroy_delalloc_inodes(root);
2997
2998 spin_lock(&root->fs_info->new_trans_lock);
2999 root->fs_info->running_transaction = NULL;
3000 spin_unlock(&root->fs_info->new_trans_lock);
3001
3002 btrfs_destroy_marked_extents(root, &t->dirty_pages,
3003 EXTENT_DIRTY);
3004
3005 btrfs_destroy_pinned_extent(root,
3006 root->fs_info->pinned_extents);
3007
3008 t->use_count = 0;
3009 list_del_init(&t->list);
3010 memset(t, 0, sizeof(*t));
3011 kmem_cache_free(btrfs_transaction_cachep, t);
3012 }
3013
3014 mutex_unlock(&root->fs_info->transaction_kthread_mutex);
3015 mutex_unlock(&root->fs_info->trans_mutex);
3016
3017 return 0;
3018}
3019
2595static struct extent_io_ops btree_extent_io_ops = { 3020static struct extent_io_ops btree_extent_io_ops = {
2596 .write_cache_pages_lock_hook = btree_lock_page_hook, 3021 .write_cache_pages_lock_hook = btree_lock_page_hook,
2597 .readpage_end_io_hook = btree_readpage_end_io_hook, 3022 .readpage_end_io_hook = btree_readpage_end_io_hook,
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 88e825a0bf2..07b20dc2fd9 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -52,6 +52,7 @@ int write_ctree_super(struct btrfs_trans_handle *trans,
52 struct btrfs_root *root, int max_mirrors); 52 struct btrfs_root *root, int max_mirrors);
53struct buffer_head *btrfs_read_dev_super(struct block_device *bdev); 53struct buffer_head *btrfs_read_dev_super(struct block_device *bdev);
54int btrfs_commit_super(struct btrfs_root *root); 54int btrfs_commit_super(struct btrfs_root *root);
55int btrfs_error_commit_super(struct btrfs_root *root);
55struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root, 56struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
56 u64 bytenr, u32 blocksize); 57 u64 bytenr, u32 blocksize);
57struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info, 58struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index 951ef09b82f..9786963b07e 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -65,7 +65,6 @@ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
65{ 65{
66 struct btrfs_fs_info *fs_info = btrfs_sb(sb)->fs_info; 66 struct btrfs_fs_info *fs_info = btrfs_sb(sb)->fs_info;
67 struct btrfs_root *root; 67 struct btrfs_root *root;
68 struct dentry *dentry;
69 struct inode *inode; 68 struct inode *inode;
70 struct btrfs_key key; 69 struct btrfs_key key;
71 int index; 70 int index;
@@ -108,10 +107,7 @@ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
108 return ERR_PTR(-ESTALE); 107 return ERR_PTR(-ESTALE);
109 } 108 }
110 109
111 dentry = d_obtain_alias(inode); 110 return d_obtain_alias(inode);
112 if (!IS_ERR(dentry))
113 dentry->d_op = &btrfs_dentry_operations;
114 return dentry;
115fail: 111fail:
116 srcu_read_unlock(&fs_info->subvol_srcu, index); 112 srcu_read_unlock(&fs_info->subvol_srcu, index);
117 return ERR_PTR(err); 113 return ERR_PTR(err);
@@ -166,7 +162,6 @@ static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh,
166static struct dentry *btrfs_get_parent(struct dentry *child) 162static struct dentry *btrfs_get_parent(struct dentry *child)
167{ 163{
168 struct inode *dir = child->d_inode; 164 struct inode *dir = child->d_inode;
169 static struct dentry *dentry;
170 struct btrfs_root *root = BTRFS_I(dir)->root; 165 struct btrfs_root *root = BTRFS_I(dir)->root;
171 struct btrfs_path *path; 166 struct btrfs_path *path;
172 struct extent_buffer *leaf; 167 struct extent_buffer *leaf;
@@ -223,18 +218,91 @@ static struct dentry *btrfs_get_parent(struct dentry *child)
223 218
224 key.type = BTRFS_INODE_ITEM_KEY; 219 key.type = BTRFS_INODE_ITEM_KEY;
225 key.offset = 0; 220 key.offset = 0;
226 dentry = d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root, NULL)); 221 return d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root, NULL));
227 if (!IS_ERR(dentry))
228 dentry->d_op = &btrfs_dentry_operations;
229 return dentry;
230fail: 222fail:
231 btrfs_free_path(path); 223 btrfs_free_path(path);
232 return ERR_PTR(ret); 224 return ERR_PTR(ret);
233} 225}
234 226
227static int btrfs_get_name(struct dentry *parent, char *name,
228 struct dentry *child)
229{
230 struct inode *inode = child->d_inode;
231 struct inode *dir = parent->d_inode;
232 struct btrfs_path *path;
233 struct btrfs_root *root = BTRFS_I(dir)->root;
234 struct btrfs_inode_ref *iref;
235 struct btrfs_root_ref *rref;
236 struct extent_buffer *leaf;
237 unsigned long name_ptr;
238 struct btrfs_key key;
239 int name_len;
240 int ret;
241
242 if (!dir || !inode)
243 return -EINVAL;
244
245 if (!S_ISDIR(dir->i_mode))
246 return -EINVAL;
247
248 path = btrfs_alloc_path();
249 if (!path)
250 return -ENOMEM;
251 path->leave_spinning = 1;
252
253 if (inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) {
254 key.objectid = BTRFS_I(inode)->root->root_key.objectid;
255 key.type = BTRFS_ROOT_BACKREF_KEY;
256 key.offset = (u64)-1;
257 root = root->fs_info->tree_root;
258 } else {
259 key.objectid = inode->i_ino;
260 key.offset = dir->i_ino;
261 key.type = BTRFS_INODE_REF_KEY;
262 }
263
264 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
265 if (ret < 0) {
266 btrfs_free_path(path);
267 return ret;
268 } else if (ret > 0) {
269 if (inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) {
270 path->slots[0]--;
271 } else {
272 btrfs_free_path(path);
273 return -ENOENT;
274 }
275 }
276 leaf = path->nodes[0];
277
278 if (inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) {
279 rref = btrfs_item_ptr(leaf, path->slots[0],
280 struct btrfs_root_ref);
281 name_ptr = (unsigned long)(rref + 1);
282 name_len = btrfs_root_ref_name_len(leaf, rref);
283 } else {
284 iref = btrfs_item_ptr(leaf, path->slots[0],
285 struct btrfs_inode_ref);
286 name_ptr = (unsigned long)(iref + 1);
287 name_len = btrfs_inode_ref_name_len(leaf, iref);
288 }
289
290 read_extent_buffer(leaf, name, name_ptr, name_len);
291 btrfs_free_path(path);
292
293 /*
294 * have to add the null termination to make sure that reconnect_path
295 * gets the right len for strlen
296 */
297 name[name_len] = '\0';
298
299 return 0;
300}
301
235const struct export_operations btrfs_export_ops = { 302const struct export_operations btrfs_export_ops = {
236 .encode_fh = btrfs_encode_fh, 303 .encode_fh = btrfs_encode_fh,
237 .fh_to_dentry = btrfs_fh_to_dentry, 304 .fh_to_dentry = btrfs_fh_to_dentry,
238 .fh_to_parent = btrfs_fh_to_parent, 305 .fh_to_parent = btrfs_fh_to_parent,
239 .get_parent = btrfs_get_parent, 306 .get_parent = btrfs_get_parent,
307 .get_name = btrfs_get_name,
240}; 308};
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 0c097f3aec4..b55269340ce 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -429,6 +429,7 @@ err:
429 429
430static int cache_block_group(struct btrfs_block_group_cache *cache, 430static int cache_block_group(struct btrfs_block_group_cache *cache,
431 struct btrfs_trans_handle *trans, 431 struct btrfs_trans_handle *trans,
432 struct btrfs_root *root,
432 int load_cache_only) 433 int load_cache_only)
433{ 434{
434 struct btrfs_fs_info *fs_info = cache->fs_info; 435 struct btrfs_fs_info *fs_info = cache->fs_info;
@@ -442,9 +443,12 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
442 443
443 /* 444 /*
444 * We can't do the read from on-disk cache during a commit since we need 445 * We can't do the read from on-disk cache during a commit since we need
445 * to have the normal tree locking. 446 * to have the normal tree locking. Also if we are currently trying to
447 * allocate blocks for the tree root we can't do the fast caching since
448 * we likely hold important locks.
446 */ 449 */
447 if (!trans->transaction->in_commit) { 450 if (!trans->transaction->in_commit &&
451 (root && root != root->fs_info->tree_root)) {
448 spin_lock(&cache->lock); 452 spin_lock(&cache->lock);
449 if (cache->cached != BTRFS_CACHE_NO) { 453 if (cache->cached != BTRFS_CACHE_NO) {
450 spin_unlock(&cache->lock); 454 spin_unlock(&cache->lock);
@@ -2741,6 +2745,7 @@ static int cache_save_setup(struct btrfs_block_group_cache *block_group,
2741 struct btrfs_root *root = block_group->fs_info->tree_root; 2745 struct btrfs_root *root = block_group->fs_info->tree_root;
2742 struct inode *inode = NULL; 2746 struct inode *inode = NULL;
2743 u64 alloc_hint = 0; 2747 u64 alloc_hint = 0;
2748 int dcs = BTRFS_DC_ERROR;
2744 int num_pages = 0; 2749 int num_pages = 0;
2745 int retries = 0; 2750 int retries = 0;
2746 int ret = 0; 2751 int ret = 0;
@@ -2795,6 +2800,8 @@ again:
2795 2800
2796 spin_lock(&block_group->lock); 2801 spin_lock(&block_group->lock);
2797 if (block_group->cached != BTRFS_CACHE_FINISHED) { 2802 if (block_group->cached != BTRFS_CACHE_FINISHED) {
2803 /* We're not cached, don't bother trying to write stuff out */
2804 dcs = BTRFS_DC_WRITTEN;
2798 spin_unlock(&block_group->lock); 2805 spin_unlock(&block_group->lock);
2799 goto out_put; 2806 goto out_put;
2800 } 2807 }
@@ -2821,6 +2828,8 @@ again:
2821 ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages, 2828 ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages,
2822 num_pages, num_pages, 2829 num_pages, num_pages,
2823 &alloc_hint); 2830 &alloc_hint);
2831 if (!ret)
2832 dcs = BTRFS_DC_SETUP;
2824 btrfs_free_reserved_data_space(inode, num_pages); 2833 btrfs_free_reserved_data_space(inode, num_pages);
2825out_put: 2834out_put:
2826 iput(inode); 2835 iput(inode);
@@ -2828,10 +2837,7 @@ out_free:
2828 btrfs_release_path(root, path); 2837 btrfs_release_path(root, path);
2829out: 2838out:
2830 spin_lock(&block_group->lock); 2839 spin_lock(&block_group->lock);
2831 if (ret) 2840 block_group->disk_cache_state = dcs;
2832 block_group->disk_cache_state = BTRFS_DC_ERROR;
2833 else
2834 block_group->disk_cache_state = BTRFS_DC_SETUP;
2835 spin_unlock(&block_group->lock); 2841 spin_unlock(&block_group->lock);
2836 2842
2837 return ret; 2843 return ret;
@@ -3037,7 +3043,13 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
3037 3043
3038u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) 3044u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
3039{ 3045{
3040 u64 num_devices = root->fs_info->fs_devices->rw_devices; 3046 /*
3047 * we add in the count of missing devices because we want
3048 * to make sure that any RAID levels on a degraded FS
3049 * continue to be honored.
3050 */
3051 u64 num_devices = root->fs_info->fs_devices->rw_devices +
3052 root->fs_info->fs_devices->missing_devices;
3041 3053
3042 if (num_devices == 1) 3054 if (num_devices == 1)
3043 flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0); 3055 flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0);
@@ -3077,7 +3089,7 @@ static u64 get_alloc_profile(struct btrfs_root *root, u64 flags)
3077 return btrfs_reduce_alloc_profile(root, flags); 3089 return btrfs_reduce_alloc_profile(root, flags);
3078} 3090}
3079 3091
3080static u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data) 3092u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
3081{ 3093{
3082 u64 flags; 3094 u64 flags;
3083 3095
@@ -3149,8 +3161,12 @@ alloc:
3149 bytes + 2 * 1024 * 1024, 3161 bytes + 2 * 1024 * 1024,
3150 alloc_target, 0); 3162 alloc_target, 0);
3151 btrfs_end_transaction(trans, root); 3163 btrfs_end_transaction(trans, root);
3152 if (ret < 0) 3164 if (ret < 0) {
3153 return ret; 3165 if (ret != -ENOSPC)
3166 return ret;
3167 else
3168 goto commit_trans;
3169 }
3154 3170
3155 if (!data_sinfo) { 3171 if (!data_sinfo) {
3156 btrfs_set_inode_space_info(root, inode); 3172 btrfs_set_inode_space_info(root, inode);
@@ -3161,6 +3177,7 @@ alloc:
3161 spin_unlock(&data_sinfo->lock); 3177 spin_unlock(&data_sinfo->lock);
3162 3178
3163 /* commit the current transaction and try again */ 3179 /* commit the current transaction and try again */
3180commit_trans:
3164 if (!committed && !root->fs_info->open_ioctl_trans) { 3181 if (!committed && !root->fs_info->open_ioctl_trans) {
3165 committed = 1; 3182 committed = 1;
3166 trans = btrfs_join_transaction(root, 1); 3183 trans = btrfs_join_transaction(root, 1);
@@ -3412,7 +3429,7 @@ again:
3412 * our reservation. 3429 * our reservation.
3413 */ 3430 */
3414 if (unused <= space_info->total_bytes) { 3431 if (unused <= space_info->total_bytes) {
3415 unused -= space_info->total_bytes; 3432 unused = space_info->total_bytes - unused;
3416 if (unused >= num_bytes) { 3433 if (unused >= num_bytes) {
3417 if (!reserved) 3434 if (!reserved)
3418 space_info->bytes_reserved += orig_bytes; 3435 space_info->bytes_reserved += orig_bytes;
@@ -3709,11 +3726,6 @@ int btrfs_block_rsv_check(struct btrfs_trans_handle *trans,
3709 return 0; 3726 return 0;
3710 } 3727 }
3711 3728
3712 WARN_ON(1);
3713 printk(KERN_INFO"block_rsv size %llu reserved %llu freed %llu %llu\n",
3714 block_rsv->size, block_rsv->reserved,
3715 block_rsv->freed[0], block_rsv->freed[1]);
3716
3717 return -ENOSPC; 3729 return -ENOSPC;
3718} 3730}
3719 3731
@@ -4080,7 +4092,7 @@ static int update_block_group(struct btrfs_trans_handle *trans,
4080 * space back to the block group, otherwise we will leak space. 4092 * space back to the block group, otherwise we will leak space.
4081 */ 4093 */
4082 if (!alloc && cache->cached == BTRFS_CACHE_NO) 4094 if (!alloc && cache->cached == BTRFS_CACHE_NO)
4083 cache_block_group(cache, trans, 1); 4095 cache_block_group(cache, trans, NULL, 1);
4084 4096
4085 byte_in_group = bytenr - cache->key.objectid; 4097 byte_in_group = bytenr - cache->key.objectid;
4086 WARN_ON(byte_in_group > cache->key.offset); 4098 WARN_ON(byte_in_group > cache->key.offset);
@@ -4930,11 +4942,31 @@ search:
4930 btrfs_get_block_group(block_group); 4942 btrfs_get_block_group(block_group);
4931 search_start = block_group->key.objectid; 4943 search_start = block_group->key.objectid;
4932 4944
4945 /*
4946 * this can happen if we end up cycling through all the
4947 * raid types, but we want to make sure we only allocate
4948 * for the proper type.
4949 */
4950 if (!block_group_bits(block_group, data)) {
4951 u64 extra = BTRFS_BLOCK_GROUP_DUP |
4952 BTRFS_BLOCK_GROUP_RAID1 |
4953 BTRFS_BLOCK_GROUP_RAID10;
4954
4955 /*
4956 * if they asked for extra copies and this block group
4957 * doesn't provide them, bail. This does allow us to
4958 * fill raid0 from raid1.
4959 */
4960 if ((data & extra) && !(block_group->flags & extra))
4961 goto loop;
4962 }
4963
4933have_block_group: 4964have_block_group:
4934 if (unlikely(block_group->cached == BTRFS_CACHE_NO)) { 4965 if (unlikely(block_group->cached == BTRFS_CACHE_NO)) {
4935 u64 free_percent; 4966 u64 free_percent;
4936 4967
4937 ret = cache_block_group(block_group, trans, 1); 4968 ret = cache_block_group(block_group, trans,
4969 orig_root, 1);
4938 if (block_group->cached == BTRFS_CACHE_FINISHED) 4970 if (block_group->cached == BTRFS_CACHE_FINISHED)
4939 goto have_block_group; 4971 goto have_block_group;
4940 4972
@@ -4958,7 +4990,8 @@ have_block_group:
4958 if (loop > LOOP_CACHING_NOWAIT || 4990 if (loop > LOOP_CACHING_NOWAIT ||
4959 (loop > LOOP_FIND_IDEAL && 4991 (loop > LOOP_FIND_IDEAL &&
4960 atomic_read(&space_info->caching_threads) < 2)) { 4992 atomic_read(&space_info->caching_threads) < 2)) {
4961 ret = cache_block_group(block_group, trans, 0); 4993 ret = cache_block_group(block_group, trans,
4994 orig_root, 0);
4962 BUG_ON(ret); 4995 BUG_ON(ret);
4963 } 4996 }
4964 found_uncached_bg = true; 4997 found_uncached_bg = true;
@@ -5515,7 +5548,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
5515 u64 num_bytes = ins->offset; 5548 u64 num_bytes = ins->offset;
5516 5549
5517 block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid); 5550 block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
5518 cache_block_group(block_group, trans, 0); 5551 cache_block_group(block_group, trans, NULL, 0);
5519 caching_ctl = get_caching_control(block_group); 5552 caching_ctl = get_caching_control(block_group);
5520 5553
5521 if (!caching_ctl) { 5554 if (!caching_ctl) {
@@ -6300,9 +6333,13 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
6300 NULL, NULL); 6333 NULL, NULL);
6301 BUG_ON(ret < 0); 6334 BUG_ON(ret < 0);
6302 if (ret > 0) { 6335 if (ret > 0) {
6303 ret = btrfs_del_orphan_item(trans, tree_root, 6336 /* if we fail to delete the orphan item this time
6304 root->root_key.objectid); 6337 * around, it'll get picked up the next time.
6305 BUG_ON(ret); 6338 *
6339 * The most common failure here is just -ENOENT.
6340 */
6341 btrfs_del_orphan_item(trans, tree_root,
6342 root->root_key.objectid);
6306 } 6343 }
6307 } 6344 }
6308 6345
@@ -7878,7 +7915,14 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
7878 u64 stripped = BTRFS_BLOCK_GROUP_RAID0 | 7915 u64 stripped = BTRFS_BLOCK_GROUP_RAID0 |
7879 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10; 7916 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
7880 7917
7881 num_devices = root->fs_info->fs_devices->rw_devices; 7918 /*
7919 * we add in the count of missing devices because we want
7920 * to make sure that any RAID levels on a degraded FS
7921 * continue to be honored.
7922 */
7923 num_devices = root->fs_info->fs_devices->rw_devices +
7924 root->fs_info->fs_devices->missing_devices;
7925
7882 if (num_devices == 1) { 7926 if (num_devices == 1) {
7883 stripped |= BTRFS_BLOCK_GROUP_DUP; 7927 stripped |= BTRFS_BLOCK_GROUP_DUP;
7884 stripped = flags & ~stripped; 7928 stripped = flags & ~stripped;
@@ -7926,13 +7970,14 @@ static int set_block_group_ro(struct btrfs_block_group_cache *cache)
7926 7970
7927 if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned + 7971 if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned +
7928 sinfo->bytes_may_use + sinfo->bytes_readonly + 7972 sinfo->bytes_may_use + sinfo->bytes_readonly +
7929 cache->reserved_pinned + num_bytes < sinfo->total_bytes) { 7973 cache->reserved_pinned + num_bytes <= sinfo->total_bytes) {
7930 sinfo->bytes_readonly += num_bytes; 7974 sinfo->bytes_readonly += num_bytes;
7931 sinfo->bytes_reserved += cache->reserved_pinned; 7975 sinfo->bytes_reserved += cache->reserved_pinned;
7932 cache->reserved_pinned = 0; 7976 cache->reserved_pinned = 0;
7933 cache->ro = 1; 7977 cache->ro = 1;
7934 ret = 0; 7978 ret = 0;
7935 } 7979 }
7980
7936 spin_unlock(&cache->lock); 7981 spin_unlock(&cache->lock);
7937 spin_unlock(&sinfo->lock); 7982 spin_unlock(&sinfo->lock);
7938 return ret; 7983 return ret;
@@ -7968,6 +8013,62 @@ out:
7968 return ret; 8013 return ret;
7969} 8014}
7970 8015
8016/*
8017 * helper to account the unused space of all the readonly block group in the
8018 * list. takes mirrors into account.
8019 */
8020static u64 __btrfs_get_ro_block_group_free_space(struct list_head *groups_list)
8021{
8022 struct btrfs_block_group_cache *block_group;
8023 u64 free_bytes = 0;
8024 int factor;
8025
8026 list_for_each_entry(block_group, groups_list, list) {
8027 spin_lock(&block_group->lock);
8028
8029 if (!block_group->ro) {
8030 spin_unlock(&block_group->lock);
8031 continue;
8032 }
8033
8034 if (block_group->flags & (BTRFS_BLOCK_GROUP_RAID1 |
8035 BTRFS_BLOCK_GROUP_RAID10 |
8036 BTRFS_BLOCK_GROUP_DUP))
8037 factor = 2;
8038 else
8039 factor = 1;
8040
8041 free_bytes += (block_group->key.offset -
8042 btrfs_block_group_used(&block_group->item)) *
8043 factor;
8044
8045 spin_unlock(&block_group->lock);
8046 }
8047
8048 return free_bytes;
8049}
8050
8051/*
8052 * helper to account the unused space of all the readonly block group in the
8053 * space_info. takes mirrors into account.
8054 */
8055u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
8056{
8057 int i;
8058 u64 free_bytes = 0;
8059
8060 spin_lock(&sinfo->lock);
8061
8062 for(i = 0; i < BTRFS_NR_RAID_TYPES; i++)
8063 if (!list_empty(&sinfo->block_groups[i]))
8064 free_bytes += __btrfs_get_ro_block_group_free_space(
8065 &sinfo->block_groups[i]);
8066
8067 spin_unlock(&sinfo->lock);
8068
8069 return free_bytes;
8070}
8071
7971int btrfs_set_block_group_rw(struct btrfs_root *root, 8072int btrfs_set_block_group_rw(struct btrfs_root *root,
7972 struct btrfs_block_group_cache *cache) 8073 struct btrfs_block_group_cache *cache)
7973{ 8074{
@@ -8048,7 +8149,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
8048 mutex_lock(&root->fs_info->chunk_mutex); 8149 mutex_lock(&root->fs_info->chunk_mutex);
8049 list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { 8150 list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
8050 u64 min_free = btrfs_block_group_used(&block_group->item); 8151 u64 min_free = btrfs_block_group_used(&block_group->item);
8051 u64 dev_offset, max_avail; 8152 u64 dev_offset;
8052 8153
8053 /* 8154 /*
8054 * check to make sure we can actually find a chunk with enough 8155 * check to make sure we can actually find a chunk with enough
@@ -8056,7 +8157,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
8056 */ 8157 */
8057 if (device->total_bytes > device->bytes_used + min_free) { 8158 if (device->total_bytes > device->bytes_used + min_free) {
8058 ret = find_free_dev_extent(NULL, device, min_free, 8159 ret = find_free_dev_extent(NULL, device, min_free,
8059 &dev_offset, &max_avail); 8160 &dev_offset, NULL);
8060 if (!ret) 8161 if (!ret)
8061 break; 8162 break;
8062 ret = -1; 8163 ret = -1;
@@ -8247,7 +8348,6 @@ int btrfs_read_block_groups(struct btrfs_root *root)
8247 break; 8348 break;
8248 if (ret != 0) 8349 if (ret != 0)
8249 goto error; 8350 goto error;
8250
8251 leaf = path->nodes[0]; 8351 leaf = path->nodes[0];
8252 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 8352 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
8253 cache = kzalloc(sizeof(*cache), GFP_NOFS); 8353 cache = kzalloc(sizeof(*cache), GFP_NOFS);
@@ -8541,3 +8641,14 @@ out:
8541 btrfs_free_path(path); 8641 btrfs_free_path(path);
8542 return ret; 8642 return ret;
8543} 8643}
8644
8645int btrfs_error_unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
8646{
8647 return unpin_extent_range(root, start, end);
8648}
8649
8650int btrfs_error_discard_extent(struct btrfs_root *root, u64 bytenr,
8651 u64 num_bytes)
8652{
8653 return btrfs_discard_extent(root, bytenr, num_bytes);
8654}
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index eac10e3260a..2e993cf1766 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1828,9 +1828,9 @@ static void end_bio_extent_preparewrite(struct bio *bio, int err)
1828 bio_put(bio); 1828 bio_put(bio);
1829} 1829}
1830 1830
1831static struct bio * 1831struct bio *
1832extent_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, 1832btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
1833 gfp_t gfp_flags) 1833 gfp_t gfp_flags)
1834{ 1834{
1835 struct bio *bio; 1835 struct bio *bio;
1836 1836
@@ -1919,7 +1919,7 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree,
1919 else 1919 else
1920 nr = bio_get_nr_vecs(bdev); 1920 nr = bio_get_nr_vecs(bdev);
1921 1921
1922 bio = extent_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH); 1922 bio = btrfs_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH);
1923 1923
1924 bio_add_page(bio, page, page_size, offset); 1924 bio_add_page(bio, page, page_size, offset);
1925 bio->bi_end_io = end_io_func; 1925 bio->bi_end_io = end_io_func;
@@ -2028,8 +2028,11 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
2028 BUG_ON(extent_map_end(em) <= cur); 2028 BUG_ON(extent_map_end(em) <= cur);
2029 BUG_ON(end < cur); 2029 BUG_ON(end < cur);
2030 2030
2031 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) 2031 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
2032 this_bio_flag = EXTENT_BIO_COMPRESSED; 2032 this_bio_flag = EXTENT_BIO_COMPRESSED;
2033 extent_set_compress_type(&this_bio_flag,
2034 em->compress_type);
2035 }
2033 2036
2034 iosize = min(extent_map_end(em) - cur, end - cur + 1); 2037 iosize = min(extent_map_end(em) - cur, end - cur + 1);
2035 cur_end = min(extent_map_end(em) - 1, end); 2038 cur_end = min(extent_map_end(em) - 1, end);
@@ -2901,21 +2904,53 @@ out:
2901int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 2904int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2902 __u64 start, __u64 len, get_extent_t *get_extent) 2905 __u64 start, __u64 len, get_extent_t *get_extent)
2903{ 2906{
2904 int ret; 2907 int ret = 0;
2905 u64 off = start; 2908 u64 off = start;
2906 u64 max = start + len; 2909 u64 max = start + len;
2907 u32 flags = 0; 2910 u32 flags = 0;
2911 u32 found_type;
2912 u64 last;
2908 u64 disko = 0; 2913 u64 disko = 0;
2914 struct btrfs_key found_key;
2909 struct extent_map *em = NULL; 2915 struct extent_map *em = NULL;
2910 struct extent_state *cached_state = NULL; 2916 struct extent_state *cached_state = NULL;
2917 struct btrfs_path *path;
2918 struct btrfs_file_extent_item *item;
2911 int end = 0; 2919 int end = 0;
2912 u64 em_start = 0, em_len = 0; 2920 u64 em_start = 0, em_len = 0;
2913 unsigned long emflags; 2921 unsigned long emflags;
2914 ret = 0; 2922 int hole = 0;
2915 2923
2916 if (len == 0) 2924 if (len == 0)
2917 return -EINVAL; 2925 return -EINVAL;
2918 2926
2927 path = btrfs_alloc_path();
2928 if (!path)
2929 return -ENOMEM;
2930 path->leave_spinning = 1;
2931
2932 ret = btrfs_lookup_file_extent(NULL, BTRFS_I(inode)->root,
2933 path, inode->i_ino, -1, 0);
2934 if (ret < 0) {
2935 btrfs_free_path(path);
2936 return ret;
2937 }
2938 WARN_ON(!ret);
2939 path->slots[0]--;
2940 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
2941 struct btrfs_file_extent_item);
2942 btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
2943 found_type = btrfs_key_type(&found_key);
2944
2945 /* No extents, just return */
2946 if (found_key.objectid != inode->i_ino ||
2947 found_type != BTRFS_EXTENT_DATA_KEY) {
2948 btrfs_free_path(path);
2949 return 0;
2950 }
2951 last = found_key.offset;
2952 btrfs_free_path(path);
2953
2919 lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0, 2954 lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0,
2920 &cached_state, GFP_NOFS); 2955 &cached_state, GFP_NOFS);
2921 em = get_extent(inode, NULL, 0, off, max - off, 0); 2956 em = get_extent(inode, NULL, 0, off, max - off, 0);
@@ -2925,11 +2960,18 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2925 ret = PTR_ERR(em); 2960 ret = PTR_ERR(em);
2926 goto out; 2961 goto out;
2927 } 2962 }
2963
2928 while (!end) { 2964 while (!end) {
2965 hole = 0;
2929 off = em->start + em->len; 2966 off = em->start + em->len;
2930 if (off >= max) 2967 if (off >= max)
2931 end = 1; 2968 end = 1;
2932 2969
2970 if (em->block_start == EXTENT_MAP_HOLE) {
2971 hole = 1;
2972 goto next;
2973 }
2974
2933 em_start = em->start; 2975 em_start = em->start;
2934 em_len = em->len; 2976 em_len = em->len;
2935 2977
@@ -2939,8 +2981,6 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2939 if (em->block_start == EXTENT_MAP_LAST_BYTE) { 2981 if (em->block_start == EXTENT_MAP_LAST_BYTE) {
2940 end = 1; 2982 end = 1;
2941 flags |= FIEMAP_EXTENT_LAST; 2983 flags |= FIEMAP_EXTENT_LAST;
2942 } else if (em->block_start == EXTENT_MAP_HOLE) {
2943 flags |= FIEMAP_EXTENT_UNWRITTEN;
2944 } else if (em->block_start == EXTENT_MAP_INLINE) { 2984 } else if (em->block_start == EXTENT_MAP_INLINE) {
2945 flags |= (FIEMAP_EXTENT_DATA_INLINE | 2985 flags |= (FIEMAP_EXTENT_DATA_INLINE |
2946 FIEMAP_EXTENT_NOT_ALIGNED); 2986 FIEMAP_EXTENT_NOT_ALIGNED);
@@ -2953,10 +2993,10 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2953 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) 2993 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
2954 flags |= FIEMAP_EXTENT_ENCODED; 2994 flags |= FIEMAP_EXTENT_ENCODED;
2955 2995
2996next:
2956 emflags = em->flags; 2997 emflags = em->flags;
2957 free_extent_map(em); 2998 free_extent_map(em);
2958 em = NULL; 2999 em = NULL;
2959
2960 if (!end) { 3000 if (!end) {
2961 em = get_extent(inode, NULL, 0, off, max - off, 0); 3001 em = get_extent(inode, NULL, 0, off, max - off, 0);
2962 if (!em) 3002 if (!em)
@@ -2967,15 +3007,23 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2967 } 3007 }
2968 emflags = em->flags; 3008 emflags = em->flags;
2969 } 3009 }
3010
2970 if (test_bit(EXTENT_FLAG_VACANCY, &emflags)) { 3011 if (test_bit(EXTENT_FLAG_VACANCY, &emflags)) {
2971 flags |= FIEMAP_EXTENT_LAST; 3012 flags |= FIEMAP_EXTENT_LAST;
2972 end = 1; 3013 end = 1;
2973 } 3014 }
2974 3015
2975 ret = fiemap_fill_next_extent(fieinfo, em_start, disko, 3016 if (em_start == last) {
2976 em_len, flags); 3017 flags |= FIEMAP_EXTENT_LAST;
2977 if (ret) 3018 end = 1;
2978 goto out_free; 3019 }
3020
3021 if (!hole) {
3022 ret = fiemap_fill_next_extent(fieinfo, em_start, disko,
3023 em_len, flags);
3024 if (ret)
3025 goto out_free;
3026 }
2979 } 3027 }
2980out_free: 3028out_free:
2981 free_extent_map(em); 3029 free_extent_map(em);
@@ -3027,6 +3075,8 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
3027#endif 3075#endif
3028 3076
3029 eb = kmem_cache_zalloc(extent_buffer_cache, mask); 3077 eb = kmem_cache_zalloc(extent_buffer_cache, mask);
3078 if (eb == NULL)
3079 return NULL;
3030 eb->start = start; 3080 eb->start = start;
3031 eb->len = len; 3081 eb->len = len;
3032 spin_lock_init(&eb->lock); 3082 spin_lock_init(&eb->lock);
@@ -3836,8 +3886,10 @@ int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page)
3836 3886
3837 spin_lock(&tree->buffer_lock); 3887 spin_lock(&tree->buffer_lock);
3838 eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT); 3888 eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
3839 if (!eb) 3889 if (!eb) {
3840 goto out; 3890 spin_unlock(&tree->buffer_lock);
3891 return ret;
3892 }
3841 3893
3842 if (test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { 3894 if (test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
3843 ret = 0; 3895 ret = 0;
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 1c6d4f342ef..7083cfafd06 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -20,8 +20,12 @@
20#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK) 20#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
21#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC) 21#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC)
22 22
23/* flags for bio submission */ 23/*
24 * flags for bio submission. The high bits indicate the compression
25 * type for this bio
26 */
24#define EXTENT_BIO_COMPRESSED 1 27#define EXTENT_BIO_COMPRESSED 1
28#define EXTENT_BIO_FLAG_SHIFT 16
25 29
26/* these are bit numbers for test/set bit */ 30/* these are bit numbers for test/set bit */
27#define EXTENT_BUFFER_UPTODATE 0 31#define EXTENT_BUFFER_UPTODATE 0
@@ -135,6 +139,17 @@ struct extent_buffer {
135 wait_queue_head_t lock_wq; 139 wait_queue_head_t lock_wq;
136}; 140};
137 141
142static inline void extent_set_compress_type(unsigned long *bio_flags,
143 int compress_type)
144{
145 *bio_flags |= compress_type << EXTENT_BIO_FLAG_SHIFT;
146}
147
148static inline int extent_compress_type(unsigned long bio_flags)
149{
150 return bio_flags >> EXTENT_BIO_FLAG_SHIFT;
151}
152
138struct extent_map_tree; 153struct extent_map_tree;
139 154
140static inline struct extent_state *extent_state_next(struct extent_state *state) 155static inline struct extent_state *extent_state_next(struct extent_state *state)
@@ -310,4 +325,7 @@ int extent_clear_unlock_delalloc(struct inode *inode,
310 struct extent_io_tree *tree, 325 struct extent_io_tree *tree,
311 u64 start, u64 end, struct page *locked_page, 326 u64 start, u64 end, struct page *locked_page,
312 unsigned long op); 327 unsigned long op);
328struct bio *
329btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
330 gfp_t gfp_flags);
313#endif 331#endif
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 23cb8da3ff6..b0e1fce1253 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -3,6 +3,7 @@
3#include <linux/module.h> 3#include <linux/module.h>
4#include <linux/spinlock.h> 4#include <linux/spinlock.h>
5#include <linux/hardirq.h> 5#include <linux/hardirq.h>
6#include "ctree.h"
6#include "extent_map.h" 7#include "extent_map.h"
7 8
8 9
@@ -54,6 +55,7 @@ struct extent_map *alloc_extent_map(gfp_t mask)
54 return em; 55 return em;
55 em->in_tree = 0; 56 em->in_tree = 0;
56 em->flags = 0; 57 em->flags = 0;
58 em->compress_type = BTRFS_COMPRESS_NONE;
57 atomic_set(&em->refs, 1); 59 atomic_set(&em->refs, 1);
58 return em; 60 return em;
59} 61}
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index ab6d74b6e64..28b44dbd1e3 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -26,7 +26,8 @@ struct extent_map {
26 unsigned long flags; 26 unsigned long flags;
27 struct block_device *bdev; 27 struct block_device *bdev;
28 atomic_t refs; 28 atomic_t refs;
29 int in_tree; 29 unsigned int in_tree:1;
30 unsigned int compress_type:4;
30}; 31};
31 32
32struct extent_map_tree { 33struct extent_map_tree {
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index e354c33df08..c800d58f301 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -24,6 +24,7 @@
24#include <linux/string.h> 24#include <linux/string.h>
25#include <linux/backing-dev.h> 25#include <linux/backing-dev.h>
26#include <linux/mpage.h> 26#include <linux/mpage.h>
27#include <linux/falloc.h>
27#include <linux/swap.h> 28#include <linux/swap.h>
28#include <linux/writeback.h> 29#include <linux/writeback.h>
29#include <linux/statfs.h> 30#include <linux/statfs.h>
@@ -48,30 +49,34 @@ static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
48 struct page **prepared_pages, 49 struct page **prepared_pages,
49 struct iov_iter *i) 50 struct iov_iter *i)
50{ 51{
51 size_t copied; 52 size_t copied = 0;
52 int pg = 0; 53 int pg = 0;
53 int offset = pos & (PAGE_CACHE_SIZE - 1); 54 int offset = pos & (PAGE_CACHE_SIZE - 1);
55 int total_copied = 0;
54 56
55 while (write_bytes > 0) { 57 while (write_bytes > 0) {
56 size_t count = min_t(size_t, 58 size_t count = min_t(size_t,
57 PAGE_CACHE_SIZE - offset, write_bytes); 59 PAGE_CACHE_SIZE - offset, write_bytes);
58 struct page *page = prepared_pages[pg]; 60 struct page *page = prepared_pages[pg];
59again: 61 /*
60 if (unlikely(iov_iter_fault_in_readable(i, count))) 62 * Copy data from userspace to the current page
61 return -EFAULT; 63 *
62 64 * Disable pagefault to avoid recursive lock since
63 /* Copy data from userspace to the current page */ 65 * the pages are already locked
64 copied = iov_iter_copy_from_user(page, i, offset, count); 66 */
67 pagefault_disable();
68 copied = iov_iter_copy_from_user_atomic(page, i, offset, count);
69 pagefault_enable();
65 70
66 /* Flush processor's dcache for this page */ 71 /* Flush processor's dcache for this page */
67 flush_dcache_page(page); 72 flush_dcache_page(page);
68 iov_iter_advance(i, copied); 73 iov_iter_advance(i, copied);
69 write_bytes -= copied; 74 write_bytes -= copied;
75 total_copied += copied;
70 76
77 /* Return to btrfs_file_aio_write to fault page */
71 if (unlikely(copied == 0)) { 78 if (unlikely(copied == 0)) {
72 count = min_t(size_t, PAGE_CACHE_SIZE - offset, 79 break;
73 iov_iter_single_seg_count(i));
74 goto again;
75 } 80 }
76 81
77 if (unlikely(copied < PAGE_CACHE_SIZE - offset)) { 82 if (unlikely(copied < PAGE_CACHE_SIZE - offset)) {
@@ -81,7 +86,7 @@ again:
81 offset = 0; 86 offset = 0;
82 } 87 }
83 } 88 }
84 return 0; 89 return total_copied;
85} 90}
86 91
87/* 92/*
@@ -220,6 +225,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
220 225
221 split->bdev = em->bdev; 226 split->bdev = em->bdev;
222 split->flags = flags; 227 split->flags = flags;
228 split->compress_type = em->compress_type;
223 ret = add_extent_mapping(em_tree, split); 229 ret = add_extent_mapping(em_tree, split);
224 BUG_ON(ret); 230 BUG_ON(ret);
225 free_extent_map(split); 231 free_extent_map(split);
@@ -234,6 +240,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
234 split->len = em->start + em->len - (start + len); 240 split->len = em->start + em->len - (start + len);
235 split->bdev = em->bdev; 241 split->bdev = em->bdev;
236 split->flags = flags; 242 split->flags = flags;
243 split->compress_type = em->compress_type;
237 244
238 if (compressed) { 245 if (compressed) {
239 split->block_len = em->block_len; 246 split->block_len = em->block_len;
@@ -854,6 +861,8 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
854 unsigned long last_index; 861 unsigned long last_index;
855 int will_write; 862 int will_write;
856 int buffered = 0; 863 int buffered = 0;
864 int copied = 0;
865 int dirty_pages = 0;
857 866
858 will_write = ((file->f_flags & O_DSYNC) || IS_SYNC(inode) || 867 will_write = ((file->f_flags & O_DSYNC) || IS_SYNC(inode) ||
859 (file->f_flags & O_DIRECT)); 868 (file->f_flags & O_DIRECT));
@@ -884,6 +893,17 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
884 if (err) 893 if (err)
885 goto out; 894 goto out;
886 895
896 /*
897 * If BTRFS flips readonly due to some impossible error
898 * (fs_info->fs_state now has BTRFS_SUPER_FLAG_ERROR),
899 * although we have opened a file as writable, we have
900 * to stop this write operation to ensure FS consistency.
901 */
902 if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
903 err = -EROFS;
904 goto out;
905 }
906
887 file_update_time(file); 907 file_update_time(file);
888 BTRFS_I(inode)->sequence++; 908 BTRFS_I(inode)->sequence++;
889 909
@@ -970,7 +990,17 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
970 WARN_ON(num_pages > nrptrs); 990 WARN_ON(num_pages > nrptrs);
971 memset(pages, 0, sizeof(struct page *) * nrptrs); 991 memset(pages, 0, sizeof(struct page *) * nrptrs);
972 992
973 ret = btrfs_delalloc_reserve_space(inode, write_bytes); 993 /*
994 * Fault pages before locking them in prepare_pages
995 * to avoid recursive lock
996 */
997 if (unlikely(iov_iter_fault_in_readable(&i, write_bytes))) {
998 ret = -EFAULT;
999 goto out;
1000 }
1001
1002 ret = btrfs_delalloc_reserve_space(inode,
1003 num_pages << PAGE_CACHE_SHIFT);
974 if (ret) 1004 if (ret)
975 goto out; 1005 goto out;
976 1006
@@ -978,37 +1008,49 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
978 pos, first_index, last_index, 1008 pos, first_index, last_index,
979 write_bytes); 1009 write_bytes);
980 if (ret) { 1010 if (ret) {
981 btrfs_delalloc_release_space(inode, write_bytes); 1011 btrfs_delalloc_release_space(inode,
1012 num_pages << PAGE_CACHE_SHIFT);
982 goto out; 1013 goto out;
983 } 1014 }
984 1015
985 ret = btrfs_copy_from_user(pos, num_pages, 1016 copied = btrfs_copy_from_user(pos, num_pages,
986 write_bytes, pages, &i); 1017 write_bytes, pages, &i);
987 if (ret == 0) { 1018 dirty_pages = (copied + PAGE_CACHE_SIZE - 1) >>
1019 PAGE_CACHE_SHIFT;
1020
1021 if (num_pages > dirty_pages) {
1022 if (copied > 0)
1023 atomic_inc(
1024 &BTRFS_I(inode)->outstanding_extents);
1025 btrfs_delalloc_release_space(inode,
1026 (num_pages - dirty_pages) <<
1027 PAGE_CACHE_SHIFT);
1028 }
1029
1030 if (copied > 0) {
988 dirty_and_release_pages(NULL, root, file, pages, 1031 dirty_and_release_pages(NULL, root, file, pages,
989 num_pages, pos, write_bytes); 1032 dirty_pages, pos, copied);
990 } 1033 }
991 1034
992 btrfs_drop_pages(pages, num_pages); 1035 btrfs_drop_pages(pages, num_pages);
993 if (ret) {
994 btrfs_delalloc_release_space(inode, write_bytes);
995 goto out;
996 }
997 1036
998 if (will_write) { 1037 if (copied > 0) {
999 filemap_fdatawrite_range(inode->i_mapping, pos, 1038 if (will_write) {
1000 pos + write_bytes - 1); 1039 filemap_fdatawrite_range(inode->i_mapping, pos,
1001 } else { 1040 pos + copied - 1);
1002 balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1041 } else {
1003 num_pages); 1042 balance_dirty_pages_ratelimited_nr(
1004 if (num_pages < 1043 inode->i_mapping,
1005 (root->leafsize >> PAGE_CACHE_SHIFT) + 1) 1044 dirty_pages);
1006 btrfs_btree_balance_dirty(root, 1); 1045 if (dirty_pages <
1007 btrfs_throttle(root); 1046 (root->leafsize >> PAGE_CACHE_SHIFT) + 1)
1047 btrfs_btree_balance_dirty(root, 1);
1048 btrfs_throttle(root);
1049 }
1008 } 1050 }
1009 1051
1010 pos += write_bytes; 1052 pos += copied;
1011 num_written += write_bytes; 1053 num_written += copied;
1012 1054
1013 cond_resched(); 1055 cond_resched();
1014 } 1056 }
@@ -1047,8 +1089,14 @@ out:
1047 1089
1048 if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) { 1090 if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) {
1049 trans = btrfs_start_transaction(root, 0); 1091 trans = btrfs_start_transaction(root, 0);
1092 if (IS_ERR(trans)) {
1093 num_written = PTR_ERR(trans);
1094 goto done;
1095 }
1096 mutex_lock(&inode->i_mutex);
1050 ret = btrfs_log_dentry_safe(trans, root, 1097 ret = btrfs_log_dentry_safe(trans, root,
1051 file->f_dentry); 1098 file->f_dentry);
1099 mutex_unlock(&inode->i_mutex);
1052 if (ret == 0) { 1100 if (ret == 0) {
1053 ret = btrfs_sync_log(trans, root); 1101 ret = btrfs_sync_log(trans, root);
1054 if (ret == 0) 1102 if (ret == 0)
@@ -1067,6 +1115,7 @@ out:
1067 (start_pos + num_written - 1) >> PAGE_CACHE_SHIFT); 1115 (start_pos + num_written - 1) >> PAGE_CACHE_SHIFT);
1068 } 1116 }
1069 } 1117 }
1118done:
1070 current->backing_dev_info = NULL; 1119 current->backing_dev_info = NULL;
1071 return num_written ? num_written : err; 1120 return num_written ? num_written : err;
1072} 1121}
@@ -1202,6 +1251,117 @@ static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma)
1202 return 0; 1251 return 0;
1203} 1252}
1204 1253
1254static long btrfs_fallocate(struct file *file, int mode,
1255 loff_t offset, loff_t len)
1256{
1257 struct inode *inode = file->f_path.dentry->d_inode;
1258 struct extent_state *cached_state = NULL;
1259 u64 cur_offset;
1260 u64 last_byte;
1261 u64 alloc_start;
1262 u64 alloc_end;
1263 u64 alloc_hint = 0;
1264 u64 locked_end;
1265 u64 mask = BTRFS_I(inode)->root->sectorsize - 1;
1266 struct extent_map *em;
1267 int ret;
1268
1269 alloc_start = offset & ~mask;
1270 alloc_end = (offset + len + mask) & ~mask;
1271
1272 /* We only support the FALLOC_FL_KEEP_SIZE mode */
1273 if (mode & ~FALLOC_FL_KEEP_SIZE)
1274 return -EOPNOTSUPP;
1275
1276 /*
1277 * wait for ordered IO before we have any locks. We'll loop again
1278 * below with the locks held.
1279 */
1280 btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start);
1281
1282 mutex_lock(&inode->i_mutex);
1283 ret = inode_newsize_ok(inode, alloc_end);
1284 if (ret)
1285 goto out;
1286
1287 if (alloc_start > inode->i_size) {
1288 ret = btrfs_cont_expand(inode, alloc_start);
1289 if (ret)
1290 goto out;
1291 }
1292
1293 ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start);
1294 if (ret)
1295 goto out;
1296
1297 locked_end = alloc_end - 1;
1298 while (1) {
1299 struct btrfs_ordered_extent *ordered;
1300
1301 /* the extent lock is ordered inside the running
1302 * transaction
1303 */
1304 lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start,
1305 locked_end, 0, &cached_state, GFP_NOFS);
1306 ordered = btrfs_lookup_first_ordered_extent(inode,
1307 alloc_end - 1);
1308 if (ordered &&
1309 ordered->file_offset + ordered->len > alloc_start &&
1310 ordered->file_offset < alloc_end) {
1311 btrfs_put_ordered_extent(ordered);
1312 unlock_extent_cached(&BTRFS_I(inode)->io_tree,
1313 alloc_start, locked_end,
1314 &cached_state, GFP_NOFS);
1315 /*
1316 * we can't wait on the range with the transaction
1317 * running or with the extent lock held
1318 */
1319 btrfs_wait_ordered_range(inode, alloc_start,
1320 alloc_end - alloc_start);
1321 } else {
1322 if (ordered)
1323 btrfs_put_ordered_extent(ordered);
1324 break;
1325 }
1326 }
1327
1328 cur_offset = alloc_start;
1329 while (1) {
1330 em = btrfs_get_extent(inode, NULL, 0, cur_offset,
1331 alloc_end - cur_offset, 0);
1332 BUG_ON(IS_ERR(em) || !em);
1333 last_byte = min(extent_map_end(em), alloc_end);
1334 last_byte = (last_byte + mask) & ~mask;
1335 if (em->block_start == EXTENT_MAP_HOLE ||
1336 (cur_offset >= inode->i_size &&
1337 !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
1338 ret = btrfs_prealloc_file_range(inode, mode, cur_offset,
1339 last_byte - cur_offset,
1340 1 << inode->i_blkbits,
1341 offset + len,
1342 &alloc_hint);
1343 if (ret < 0) {
1344 free_extent_map(em);
1345 break;
1346 }
1347 }
1348 free_extent_map(em);
1349
1350 cur_offset = last_byte;
1351 if (cur_offset >= alloc_end) {
1352 ret = 0;
1353 break;
1354 }
1355 }
1356 unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
1357 &cached_state, GFP_NOFS);
1358
1359 btrfs_free_reserved_data_space(inode, alloc_end - alloc_start);
1360out:
1361 mutex_unlock(&inode->i_mutex);
1362 return ret;
1363}
1364
1205const struct file_operations btrfs_file_operations = { 1365const struct file_operations btrfs_file_operations = {
1206 .llseek = generic_file_llseek, 1366 .llseek = generic_file_llseek,
1207 .read = do_sync_read, 1367 .read = do_sync_read,
@@ -1213,6 +1373,7 @@ const struct file_operations btrfs_file_operations = {
1213 .open = generic_file_open, 1373 .open = generic_file_open,
1214 .release = btrfs_release_file, 1374 .release = btrfs_release_file,
1215 .fsync = btrfs_sync_file, 1375 .fsync = btrfs_sync_file,
1376 .fallocate = btrfs_fallocate,
1216 .unlocked_ioctl = btrfs_ioctl, 1377 .unlocked_ioctl = btrfs_ioctl,
1217#ifdef CONFIG_COMPAT 1378#ifdef CONFIG_COMPAT
1218 .compat_ioctl = btrfs_ioctl, 1379 .compat_ioctl = btrfs_ioctl,
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 22ee0dc2e6b..60d68426695 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -290,7 +290,7 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info,
290 (unsigned long long)BTRFS_I(inode)->generation, 290 (unsigned long long)BTRFS_I(inode)->generation,
291 (unsigned long long)generation, 291 (unsigned long long)generation,
292 (unsigned long long)block_group->key.objectid); 292 (unsigned long long)block_group->key.objectid);
293 goto out; 293 goto free_cache;
294 } 294 }
295 295
296 if (!num_entries) 296 if (!num_entries)
@@ -524,6 +524,12 @@ int btrfs_write_out_cache(struct btrfs_root *root,
524 return 0; 524 return 0;
525 } 525 }
526 526
527 node = rb_first(&block_group->free_space_offset);
528 if (!node) {
529 iput(inode);
530 return 0;
531 }
532
527 last_index = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT; 533 last_index = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT;
528 filemap_write_and_wait(inode->i_mapping); 534 filemap_write_and_wait(inode->i_mapping);
529 btrfs_wait_ordered_range(inode, inode->i_size & 535 btrfs_wait_ordered_range(inode, inode->i_size &
@@ -543,10 +549,6 @@ int btrfs_write_out_cache(struct btrfs_root *root,
543 */ 549 */
544 first_page_offset = (sizeof(u32) * num_checksums) + sizeof(u64); 550 first_page_offset = (sizeof(u32) * num_checksums) + sizeof(u64);
545 551
546 node = rb_first(&block_group->free_space_offset);
547 if (!node)
548 goto out_free;
549
550 /* 552 /*
551 * Lock all pages first so we can lock the extent safely. 553 * Lock all pages first so we can lock the extent safely.
552 * 554 *
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 558cac2dfa5..160b55b3e13 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -122,10 +122,10 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
122 size_t cur_size = size; 122 size_t cur_size = size;
123 size_t datasize; 123 size_t datasize;
124 unsigned long offset; 124 unsigned long offset;
125 int use_compress = 0; 125 int compress_type = BTRFS_COMPRESS_NONE;
126 126
127 if (compressed_size && compressed_pages) { 127 if (compressed_size && compressed_pages) {
128 use_compress = 1; 128 compress_type = root->fs_info->compress_type;
129 cur_size = compressed_size; 129 cur_size = compressed_size;
130 } 130 }
131 131
@@ -159,7 +159,7 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
159 btrfs_set_file_extent_ram_bytes(leaf, ei, size); 159 btrfs_set_file_extent_ram_bytes(leaf, ei, size);
160 ptr = btrfs_file_extent_inline_start(ei); 160 ptr = btrfs_file_extent_inline_start(ei);
161 161
162 if (use_compress) { 162 if (compress_type != BTRFS_COMPRESS_NONE) {
163 struct page *cpage; 163 struct page *cpage;
164 int i = 0; 164 int i = 0;
165 while (compressed_size > 0) { 165 while (compressed_size > 0) {
@@ -176,7 +176,7 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
176 compressed_size -= cur_size; 176 compressed_size -= cur_size;
177 } 177 }
178 btrfs_set_file_extent_compression(leaf, ei, 178 btrfs_set_file_extent_compression(leaf, ei,
179 BTRFS_COMPRESS_ZLIB); 179 compress_type);
180 } else { 180 } else {
181 page = find_get_page(inode->i_mapping, 181 page = find_get_page(inode->i_mapping,
182 start >> PAGE_CACHE_SHIFT); 182 start >> PAGE_CACHE_SHIFT);
@@ -263,6 +263,7 @@ struct async_extent {
263 u64 compressed_size; 263 u64 compressed_size;
264 struct page **pages; 264 struct page **pages;
265 unsigned long nr_pages; 265 unsigned long nr_pages;
266 int compress_type;
266 struct list_head list; 267 struct list_head list;
267}; 268};
268 269
@@ -280,7 +281,8 @@ static noinline int add_async_extent(struct async_cow *cow,
280 u64 start, u64 ram_size, 281 u64 start, u64 ram_size,
281 u64 compressed_size, 282 u64 compressed_size,
282 struct page **pages, 283 struct page **pages,
283 unsigned long nr_pages) 284 unsigned long nr_pages,
285 int compress_type)
284{ 286{
285 struct async_extent *async_extent; 287 struct async_extent *async_extent;
286 288
@@ -290,6 +292,7 @@ static noinline int add_async_extent(struct async_cow *cow,
290 async_extent->compressed_size = compressed_size; 292 async_extent->compressed_size = compressed_size;
291 async_extent->pages = pages; 293 async_extent->pages = pages;
292 async_extent->nr_pages = nr_pages; 294 async_extent->nr_pages = nr_pages;
295 async_extent->compress_type = compress_type;
293 list_add_tail(&async_extent->list, &cow->extents); 296 list_add_tail(&async_extent->list, &cow->extents);
294 return 0; 297 return 0;
295} 298}
@@ -332,6 +335,7 @@ static noinline int compress_file_range(struct inode *inode,
332 unsigned long max_uncompressed = 128 * 1024; 335 unsigned long max_uncompressed = 128 * 1024;
333 int i; 336 int i;
334 int will_compress; 337 int will_compress;
338 int compress_type = root->fs_info->compress_type;
335 339
336 actual_end = min_t(u64, isize, end + 1); 340 actual_end = min_t(u64, isize, end + 1);
337again: 341again:
@@ -381,12 +385,16 @@ again:
381 WARN_ON(pages); 385 WARN_ON(pages);
382 pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS); 386 pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS);
383 387
384 ret = btrfs_zlib_compress_pages(inode->i_mapping, start, 388 if (BTRFS_I(inode)->force_compress)
385 total_compressed, pages, 389 compress_type = BTRFS_I(inode)->force_compress;
386 nr_pages, &nr_pages_ret, 390
387 &total_in, 391 ret = btrfs_compress_pages(compress_type,
388 &total_compressed, 392 inode->i_mapping, start,
389 max_compressed); 393 total_compressed, pages,
394 nr_pages, &nr_pages_ret,
395 &total_in,
396 &total_compressed,
397 max_compressed);
390 398
391 if (!ret) { 399 if (!ret) {
392 unsigned long offset = total_compressed & 400 unsigned long offset = total_compressed &
@@ -493,9 +501,10 @@ again:
493 * and will submit them to the elevator. 501 * and will submit them to the elevator.
494 */ 502 */
495 add_async_extent(async_cow, start, num_bytes, 503 add_async_extent(async_cow, start, num_bytes,
496 total_compressed, pages, nr_pages_ret); 504 total_compressed, pages, nr_pages_ret,
505 compress_type);
497 506
498 if (start + num_bytes < end && start + num_bytes < actual_end) { 507 if (start + num_bytes < end) {
499 start += num_bytes; 508 start += num_bytes;
500 pages = NULL; 509 pages = NULL;
501 cond_resched(); 510 cond_resched();
@@ -515,7 +524,8 @@ cleanup_and_bail_uncompressed:
515 __set_page_dirty_nobuffers(locked_page); 524 __set_page_dirty_nobuffers(locked_page);
516 /* unlocked later on in the async handlers */ 525 /* unlocked later on in the async handlers */
517 } 526 }
518 add_async_extent(async_cow, start, end - start + 1, 0, NULL, 0); 527 add_async_extent(async_cow, start, end - start + 1,
528 0, NULL, 0, BTRFS_COMPRESS_NONE);
519 *num_added += 1; 529 *num_added += 1;
520 } 530 }
521 531
@@ -640,6 +650,7 @@ retry:
640 em->block_start = ins.objectid; 650 em->block_start = ins.objectid;
641 em->block_len = ins.offset; 651 em->block_len = ins.offset;
642 em->bdev = root->fs_info->fs_devices->latest_bdev; 652 em->bdev = root->fs_info->fs_devices->latest_bdev;
653 em->compress_type = async_extent->compress_type;
643 set_bit(EXTENT_FLAG_PINNED, &em->flags); 654 set_bit(EXTENT_FLAG_PINNED, &em->flags);
644 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); 655 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
645 656
@@ -656,11 +667,13 @@ retry:
656 async_extent->ram_size - 1, 0); 667 async_extent->ram_size - 1, 0);
657 } 668 }
658 669
659 ret = btrfs_add_ordered_extent(inode, async_extent->start, 670 ret = btrfs_add_ordered_extent_compress(inode,
660 ins.objectid, 671 async_extent->start,
661 async_extent->ram_size, 672 ins.objectid,
662 ins.offset, 673 async_extent->ram_size,
663 BTRFS_ORDERED_COMPRESSED); 674 ins.offset,
675 BTRFS_ORDERED_COMPRESSED,
676 async_extent->compress_type);
664 BUG_ON(ret); 677 BUG_ON(ret);
665 678
666 /* 679 /*
@@ -1670,7 +1683,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1670 struct btrfs_ordered_extent *ordered_extent = NULL; 1683 struct btrfs_ordered_extent *ordered_extent = NULL;
1671 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 1684 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
1672 struct extent_state *cached_state = NULL; 1685 struct extent_state *cached_state = NULL;
1673 int compressed = 0; 1686 int compress_type = 0;
1674 int ret; 1687 int ret;
1675 bool nolock = false; 1688 bool nolock = false;
1676 1689
@@ -1711,9 +1724,9 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1711 trans->block_rsv = &root->fs_info->delalloc_block_rsv; 1724 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
1712 1725
1713 if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags)) 1726 if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
1714 compressed = 1; 1727 compress_type = ordered_extent->compress_type;
1715 if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) { 1728 if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
1716 BUG_ON(compressed); 1729 BUG_ON(compress_type);
1717 ret = btrfs_mark_extent_written(trans, inode, 1730 ret = btrfs_mark_extent_written(trans, inode,
1718 ordered_extent->file_offset, 1731 ordered_extent->file_offset,
1719 ordered_extent->file_offset + 1732 ordered_extent->file_offset +
@@ -1727,7 +1740,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1727 ordered_extent->disk_len, 1740 ordered_extent->disk_len,
1728 ordered_extent->len, 1741 ordered_extent->len,
1729 ordered_extent->len, 1742 ordered_extent->len,
1730 compressed, 0, 0, 1743 compress_type, 0, 0,
1731 BTRFS_FILE_EXTENT_REG); 1744 BTRFS_FILE_EXTENT_REG);
1732 unpin_extent_cache(&BTRFS_I(inode)->extent_tree, 1745 unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
1733 ordered_extent->file_offset, 1746 ordered_extent->file_offset,
@@ -1829,6 +1842,8 @@ static int btrfs_io_failed_hook(struct bio *failed_bio,
1829 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { 1842 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
1830 logical = em->block_start; 1843 logical = em->block_start;
1831 failrec->bio_flags = EXTENT_BIO_COMPRESSED; 1844 failrec->bio_flags = EXTENT_BIO_COMPRESSED;
1845 extent_set_compress_type(&failrec->bio_flags,
1846 em->compress_type);
1832 } 1847 }
1833 failrec->logical = logical; 1848 failrec->logical = logical;
1834 free_extent_map(em); 1849 free_extent_map(em);
@@ -3671,8 +3686,12 @@ static int btrfs_setattr_size(struct inode *inode, struct iattr *attr)
3671static int btrfs_setattr(struct dentry *dentry, struct iattr *attr) 3686static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
3672{ 3687{
3673 struct inode *inode = dentry->d_inode; 3688 struct inode *inode = dentry->d_inode;
3689 struct btrfs_root *root = BTRFS_I(inode)->root;
3674 int err; 3690 int err;
3675 3691
3692 if (btrfs_root_readonly(root))
3693 return -EROFS;
3694
3676 err = inode_change_ok(inode, attr); 3695 err = inode_change_ok(inode, attr);
3677 if (err) 3696 if (err)
3678 return err; 3697 return err;
@@ -4084,8 +4103,6 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
4084 int index; 4103 int index;
4085 int ret; 4104 int ret;
4086 4105
4087 dentry->d_op = &btrfs_dentry_operations;
4088
4089 if (dentry->d_name.len > BTRFS_NAME_LEN) 4106 if (dentry->d_name.len > BTRFS_NAME_LEN)
4090 return ERR_PTR(-ENAMETOOLONG); 4107 return ERR_PTR(-ENAMETOOLONG);
4091 4108
@@ -4127,7 +4144,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
4127 return inode; 4144 return inode;
4128} 4145}
4129 4146
4130static int btrfs_dentry_delete(struct dentry *dentry) 4147static int btrfs_dentry_delete(const struct dentry *dentry)
4131{ 4148{
4132 struct btrfs_root *root; 4149 struct btrfs_root *root;
4133 4150
@@ -4501,6 +4518,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
4501 BTRFS_I(inode)->index_cnt = 2; 4518 BTRFS_I(inode)->index_cnt = 2;
4502 BTRFS_I(inode)->root = root; 4519 BTRFS_I(inode)->root = root;
4503 BTRFS_I(inode)->generation = trans->transid; 4520 BTRFS_I(inode)->generation = trans->transid;
4521 inode->i_generation = BTRFS_I(inode)->generation;
4504 btrfs_set_inode_space_info(root, inode); 4522 btrfs_set_inode_space_info(root, inode);
4505 4523
4506 if (mode & S_IFDIR) 4524 if (mode & S_IFDIR)
@@ -4622,12 +4640,12 @@ int btrfs_add_link(struct btrfs_trans_handle *trans,
4622} 4640}
4623 4641
4624static int btrfs_add_nondir(struct btrfs_trans_handle *trans, 4642static int btrfs_add_nondir(struct btrfs_trans_handle *trans,
4625 struct dentry *dentry, struct inode *inode, 4643 struct inode *dir, struct dentry *dentry,
4626 int backref, u64 index) 4644 struct inode *inode, int backref, u64 index)
4627{ 4645{
4628 int err = btrfs_add_link(trans, dentry->d_parent->d_inode, 4646 int err = btrfs_add_link(trans, dir, inode,
4629 inode, dentry->d_name.name, 4647 dentry->d_name.name, dentry->d_name.len,
4630 dentry->d_name.len, backref, index); 4648 backref, index);
4631 if (!err) { 4649 if (!err) {
4632 d_instantiate(dentry, inode); 4650 d_instantiate(dentry, inode);
4633 return 0; 4651 return 0;
@@ -4668,8 +4686,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
4668 btrfs_set_trans_block_group(trans, dir); 4686 btrfs_set_trans_block_group(trans, dir);
4669 4687
4670 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, 4688 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
4671 dentry->d_name.len, 4689 dentry->d_name.len, dir->i_ino, objectid,
4672 dentry->d_parent->d_inode->i_ino, objectid,
4673 BTRFS_I(dir)->block_group, mode, &index); 4690 BTRFS_I(dir)->block_group, mode, &index);
4674 err = PTR_ERR(inode); 4691 err = PTR_ERR(inode);
4675 if (IS_ERR(inode)) 4692 if (IS_ERR(inode))
@@ -4682,7 +4699,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
4682 } 4699 }
4683 4700
4684 btrfs_set_trans_block_group(trans, inode); 4701 btrfs_set_trans_block_group(trans, inode);
4685 err = btrfs_add_nondir(trans, dentry, inode, 0, index); 4702 err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
4686 if (err) 4703 if (err)
4687 drop_inode = 1; 4704 drop_inode = 1;
4688 else { 4705 else {
@@ -4730,10 +4747,8 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
4730 btrfs_set_trans_block_group(trans, dir); 4747 btrfs_set_trans_block_group(trans, dir);
4731 4748
4732 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, 4749 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
4733 dentry->d_name.len, 4750 dentry->d_name.len, dir->i_ino, objectid,
4734 dentry->d_parent->d_inode->i_ino, 4751 BTRFS_I(dir)->block_group, mode, &index);
4735 objectid, BTRFS_I(dir)->block_group, mode,
4736 &index);
4737 err = PTR_ERR(inode); 4752 err = PTR_ERR(inode);
4738 if (IS_ERR(inode)) 4753 if (IS_ERR(inode))
4739 goto out_unlock; 4754 goto out_unlock;
@@ -4745,7 +4760,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
4745 } 4760 }
4746 4761
4747 btrfs_set_trans_block_group(trans, inode); 4762 btrfs_set_trans_block_group(trans, inode);
4748 err = btrfs_add_nondir(trans, dentry, inode, 0, index); 4763 err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
4749 if (err) 4764 if (err)
4750 drop_inode = 1; 4765 drop_inode = 1;
4751 else { 4766 else {
@@ -4787,6 +4802,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
4787 return -EPERM; 4802 return -EPERM;
4788 4803
4789 btrfs_inc_nlink(inode); 4804 btrfs_inc_nlink(inode);
4805 inode->i_ctime = CURRENT_TIME;
4790 4806
4791 err = btrfs_set_inode_index(dir, &index); 4807 err = btrfs_set_inode_index(dir, &index);
4792 if (err) 4808 if (err)
@@ -4805,15 +4821,17 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
4805 btrfs_set_trans_block_group(trans, dir); 4821 btrfs_set_trans_block_group(trans, dir);
4806 ihold(inode); 4822 ihold(inode);
4807 4823
4808 err = btrfs_add_nondir(trans, dentry, inode, 1, index); 4824 err = btrfs_add_nondir(trans, dir, dentry, inode, 1, index);
4809 4825
4810 if (err) { 4826 if (err) {
4811 drop_inode = 1; 4827 drop_inode = 1;
4812 } else { 4828 } else {
4829 struct dentry *parent = dget_parent(dentry);
4813 btrfs_update_inode_block_group(trans, dir); 4830 btrfs_update_inode_block_group(trans, dir);
4814 err = btrfs_update_inode(trans, root, inode); 4831 err = btrfs_update_inode(trans, root, inode);
4815 BUG_ON(err); 4832 BUG_ON(err);
4816 btrfs_log_new_name(trans, inode, NULL, dentry->d_parent); 4833 btrfs_log_new_name(trans, inode, NULL, parent);
4834 dput(parent);
4817 } 4835 }
4818 4836
4819 nr = trans->blocks_used; 4837 nr = trans->blocks_used;
@@ -4853,8 +4871,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
4853 btrfs_set_trans_block_group(trans, dir); 4871 btrfs_set_trans_block_group(trans, dir);
4854 4872
4855 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, 4873 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
4856 dentry->d_name.len, 4874 dentry->d_name.len, dir->i_ino, objectid,
4857 dentry->d_parent->d_inode->i_ino, objectid,
4858 BTRFS_I(dir)->block_group, S_IFDIR | mode, 4875 BTRFS_I(dir)->block_group, S_IFDIR | mode,
4859 &index); 4876 &index);
4860 if (IS_ERR(inode)) { 4877 if (IS_ERR(inode)) {
@@ -4877,9 +4894,8 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
4877 if (err) 4894 if (err)
4878 goto out_fail; 4895 goto out_fail;
4879 4896
4880 err = btrfs_add_link(trans, dentry->d_parent->d_inode, 4897 err = btrfs_add_link(trans, dir, inode, dentry->d_name.name,
4881 inode, dentry->d_name.name, 4898 dentry->d_name.len, 0, index);
4882 dentry->d_name.len, 0, index);
4883 if (err) 4899 if (err)
4884 goto out_fail; 4900 goto out_fail;
4885 4901
@@ -4931,8 +4947,10 @@ static noinline int uncompress_inline(struct btrfs_path *path,
4931 size_t max_size; 4947 size_t max_size;
4932 unsigned long inline_size; 4948 unsigned long inline_size;
4933 unsigned long ptr; 4949 unsigned long ptr;
4950 int compress_type;
4934 4951
4935 WARN_ON(pg_offset != 0); 4952 WARN_ON(pg_offset != 0);
4953 compress_type = btrfs_file_extent_compression(leaf, item);
4936 max_size = btrfs_file_extent_ram_bytes(leaf, item); 4954 max_size = btrfs_file_extent_ram_bytes(leaf, item);
4937 inline_size = btrfs_file_extent_inline_item_len(leaf, 4955 inline_size = btrfs_file_extent_inline_item_len(leaf,
4938 btrfs_item_nr(leaf, path->slots[0])); 4956 btrfs_item_nr(leaf, path->slots[0]));
@@ -4942,8 +4960,8 @@ static noinline int uncompress_inline(struct btrfs_path *path,
4942 read_extent_buffer(leaf, tmp, ptr, inline_size); 4960 read_extent_buffer(leaf, tmp, ptr, inline_size);
4943 4961
4944 max_size = min_t(unsigned long, PAGE_CACHE_SIZE, max_size); 4962 max_size = min_t(unsigned long, PAGE_CACHE_SIZE, max_size);
4945 ret = btrfs_zlib_decompress(tmp, page, extent_offset, 4963 ret = btrfs_decompress(compress_type, tmp, page,
4946 inline_size, max_size); 4964 extent_offset, inline_size, max_size);
4947 if (ret) { 4965 if (ret) {
4948 char *kaddr = kmap_atomic(page, KM_USER0); 4966 char *kaddr = kmap_atomic(page, KM_USER0);
4949 unsigned long copy_size = min_t(u64, 4967 unsigned long copy_size = min_t(u64,
@@ -4985,7 +5003,7 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
4985 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 5003 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
4986 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 5004 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
4987 struct btrfs_trans_handle *trans = NULL; 5005 struct btrfs_trans_handle *trans = NULL;
4988 int compressed; 5006 int compress_type;
4989 5007
4990again: 5008again:
4991 read_lock(&em_tree->lock); 5009 read_lock(&em_tree->lock);
@@ -5044,7 +5062,7 @@ again:
5044 5062
5045 found_type = btrfs_file_extent_type(leaf, item); 5063 found_type = btrfs_file_extent_type(leaf, item);
5046 extent_start = found_key.offset; 5064 extent_start = found_key.offset;
5047 compressed = btrfs_file_extent_compression(leaf, item); 5065 compress_type = btrfs_file_extent_compression(leaf, item);
5048 if (found_type == BTRFS_FILE_EXTENT_REG || 5066 if (found_type == BTRFS_FILE_EXTENT_REG ||
5049 found_type == BTRFS_FILE_EXTENT_PREALLOC) { 5067 found_type == BTRFS_FILE_EXTENT_PREALLOC) {
5050 extent_end = extent_start + 5068 extent_end = extent_start +
@@ -5090,8 +5108,9 @@ again:
5090 em->block_start = EXTENT_MAP_HOLE; 5108 em->block_start = EXTENT_MAP_HOLE;
5091 goto insert; 5109 goto insert;
5092 } 5110 }
5093 if (compressed) { 5111 if (compress_type != BTRFS_COMPRESS_NONE) {
5094 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); 5112 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
5113 em->compress_type = compress_type;
5095 em->block_start = bytenr; 5114 em->block_start = bytenr;
5096 em->block_len = btrfs_file_extent_disk_num_bytes(leaf, 5115 em->block_len = btrfs_file_extent_disk_num_bytes(leaf,
5097 item); 5116 item);
@@ -5125,12 +5144,14 @@ again:
5125 em->len = (copy_size + root->sectorsize - 1) & 5144 em->len = (copy_size + root->sectorsize - 1) &
5126 ~((u64)root->sectorsize - 1); 5145 ~((u64)root->sectorsize - 1);
5127 em->orig_start = EXTENT_MAP_INLINE; 5146 em->orig_start = EXTENT_MAP_INLINE;
5128 if (compressed) 5147 if (compress_type) {
5129 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); 5148 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
5149 em->compress_type = compress_type;
5150 }
5130 ptr = btrfs_file_extent_inline_start(item) + extent_offset; 5151 ptr = btrfs_file_extent_inline_start(item) + extent_offset;
5131 if (create == 0 && !PageUptodate(page)) { 5152 if (create == 0 && !PageUptodate(page)) {
5132 if (btrfs_file_extent_compression(leaf, item) == 5153 if (btrfs_file_extent_compression(leaf, item) !=
5133 BTRFS_COMPRESS_ZLIB) { 5154 BTRFS_COMPRESS_NONE) {
5134 ret = uncompress_inline(path, inode, page, 5155 ret = uncompress_inline(path, inode, page,
5135 pg_offset, 5156 pg_offset,
5136 extent_offset, item); 5157 extent_offset, item);
@@ -5535,13 +5556,21 @@ struct btrfs_dio_private {
5535 u64 bytes; 5556 u64 bytes;
5536 u32 *csums; 5557 u32 *csums;
5537 void *private; 5558 void *private;
5559
5560 /* number of bios pending for this dio */
5561 atomic_t pending_bios;
5562
5563 /* IO errors */
5564 int errors;
5565
5566 struct bio *orig_bio;
5538}; 5567};
5539 5568
5540static void btrfs_endio_direct_read(struct bio *bio, int err) 5569static void btrfs_endio_direct_read(struct bio *bio, int err)
5541{ 5570{
5571 struct btrfs_dio_private *dip = bio->bi_private;
5542 struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1; 5572 struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1;
5543 struct bio_vec *bvec = bio->bi_io_vec; 5573 struct bio_vec *bvec = bio->bi_io_vec;
5544 struct btrfs_dio_private *dip = bio->bi_private;
5545 struct inode *inode = dip->inode; 5574 struct inode *inode = dip->inode;
5546 struct btrfs_root *root = BTRFS_I(inode)->root; 5575 struct btrfs_root *root = BTRFS_I(inode)->root;
5547 u64 start; 5576 u64 start;
@@ -5595,15 +5624,18 @@ static void btrfs_endio_direct_write(struct bio *bio, int err)
5595 struct btrfs_trans_handle *trans; 5624 struct btrfs_trans_handle *trans;
5596 struct btrfs_ordered_extent *ordered = NULL; 5625 struct btrfs_ordered_extent *ordered = NULL;
5597 struct extent_state *cached_state = NULL; 5626 struct extent_state *cached_state = NULL;
5627 u64 ordered_offset = dip->logical_offset;
5628 u64 ordered_bytes = dip->bytes;
5598 int ret; 5629 int ret;
5599 5630
5600 if (err) 5631 if (err)
5601 goto out_done; 5632 goto out_done;
5602 5633again:
5603 ret = btrfs_dec_test_ordered_pending(inode, &ordered, 5634 ret = btrfs_dec_test_first_ordered_pending(inode, &ordered,
5604 dip->logical_offset, dip->bytes); 5635 &ordered_offset,
5636 ordered_bytes);
5605 if (!ret) 5637 if (!ret)
5606 goto out_done; 5638 goto out_test;
5607 5639
5608 BUG_ON(!ordered); 5640 BUG_ON(!ordered);
5609 5641
@@ -5663,8 +5695,20 @@ out_unlock:
5663out: 5695out:
5664 btrfs_delalloc_release_metadata(inode, ordered->len); 5696 btrfs_delalloc_release_metadata(inode, ordered->len);
5665 btrfs_end_transaction(trans, root); 5697 btrfs_end_transaction(trans, root);
5698 ordered_offset = ordered->file_offset + ordered->len;
5666 btrfs_put_ordered_extent(ordered); 5699 btrfs_put_ordered_extent(ordered);
5667 btrfs_put_ordered_extent(ordered); 5700 btrfs_put_ordered_extent(ordered);
5701
5702out_test:
5703 /*
5704 * our bio might span multiple ordered extents. If we haven't
5705 * completed the accounting for the whole dio, go back and try again
5706 */
5707 if (ordered_offset < dip->logical_offset + dip->bytes) {
5708 ordered_bytes = dip->logical_offset + dip->bytes -
5709 ordered_offset;
5710 goto again;
5711 }
5668out_done: 5712out_done:
5669 bio->bi_private = dip->private; 5713 bio->bi_private = dip->private;
5670 5714
@@ -5684,6 +5728,176 @@ static int __btrfs_submit_bio_start_direct_io(struct inode *inode, int rw,
5684 return 0; 5728 return 0;
5685} 5729}
5686 5730
5731static void btrfs_end_dio_bio(struct bio *bio, int err)
5732{
5733 struct btrfs_dio_private *dip = bio->bi_private;
5734
5735 if (err) {
5736 printk(KERN_ERR "btrfs direct IO failed ino %lu rw %lu "
5737 "sector %#Lx len %u err no %d\n",
5738 dip->inode->i_ino, bio->bi_rw,
5739 (unsigned long long)bio->bi_sector, bio->bi_size, err);
5740 dip->errors = 1;
5741
5742 /*
5743 * before atomic variable goto zero, we must make sure
5744 * dip->errors is perceived to be set.
5745 */
5746 smp_mb__before_atomic_dec();
5747 }
5748
5749 /* if there are more bios still pending for this dio, just exit */
5750 if (!atomic_dec_and_test(&dip->pending_bios))
5751 goto out;
5752
5753 if (dip->errors)
5754 bio_io_error(dip->orig_bio);
5755 else {
5756 set_bit(BIO_UPTODATE, &dip->orig_bio->bi_flags);
5757 bio_endio(dip->orig_bio, 0);
5758 }
5759out:
5760 bio_put(bio);
5761}
5762
5763static struct bio *btrfs_dio_bio_alloc(struct block_device *bdev,
5764 u64 first_sector, gfp_t gfp_flags)
5765{
5766 int nr_vecs = bio_get_nr_vecs(bdev);
5767 return btrfs_bio_alloc(bdev, first_sector, nr_vecs, gfp_flags);
5768}
5769
5770static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
5771 int rw, u64 file_offset, int skip_sum,
5772 u32 *csums)
5773{
5774 int write = rw & REQ_WRITE;
5775 struct btrfs_root *root = BTRFS_I(inode)->root;
5776 int ret;
5777
5778 bio_get(bio);
5779 ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
5780 if (ret)
5781 goto err;
5782
5783 if (write && !skip_sum) {
5784 ret = btrfs_wq_submit_bio(root->fs_info,
5785 inode, rw, bio, 0, 0,
5786 file_offset,
5787 __btrfs_submit_bio_start_direct_io,
5788 __btrfs_submit_bio_done);
5789 goto err;
5790 } else if (!skip_sum)
5791 btrfs_lookup_bio_sums_dio(root, inode, bio,
5792 file_offset, csums);
5793
5794 ret = btrfs_map_bio(root, rw, bio, 0, 1);
5795err:
5796 bio_put(bio);
5797 return ret;
5798}
5799
5800static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
5801 int skip_sum)
5802{
5803 struct inode *inode = dip->inode;
5804 struct btrfs_root *root = BTRFS_I(inode)->root;
5805 struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
5806 struct bio *bio;
5807 struct bio *orig_bio = dip->orig_bio;
5808 struct bio_vec *bvec = orig_bio->bi_io_vec;
5809 u64 start_sector = orig_bio->bi_sector;
5810 u64 file_offset = dip->logical_offset;
5811 u64 submit_len = 0;
5812 u64 map_length;
5813 int nr_pages = 0;
5814 u32 *csums = dip->csums;
5815 int ret = 0;
5816
5817 bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, start_sector, GFP_NOFS);
5818 if (!bio)
5819 return -ENOMEM;
5820 bio->bi_private = dip;
5821 bio->bi_end_io = btrfs_end_dio_bio;
5822 atomic_inc(&dip->pending_bios);
5823
5824 map_length = orig_bio->bi_size;
5825 ret = btrfs_map_block(map_tree, READ, start_sector << 9,
5826 &map_length, NULL, 0);
5827 if (ret) {
5828 bio_put(bio);
5829 return -EIO;
5830 }
5831
5832 while (bvec <= (orig_bio->bi_io_vec + orig_bio->bi_vcnt - 1)) {
5833 if (unlikely(map_length < submit_len + bvec->bv_len ||
5834 bio_add_page(bio, bvec->bv_page, bvec->bv_len,
5835 bvec->bv_offset) < bvec->bv_len)) {
5836 /*
5837 * inc the count before we submit the bio so
5838 * we know the end IO handler won't happen before
5839 * we inc the count. Otherwise, the dip might get freed
5840 * before we're done setting it up
5841 */
5842 atomic_inc(&dip->pending_bios);
5843 ret = __btrfs_submit_dio_bio(bio, inode, rw,
5844 file_offset, skip_sum,
5845 csums);
5846 if (ret) {
5847 bio_put(bio);
5848 atomic_dec(&dip->pending_bios);
5849 goto out_err;
5850 }
5851
5852 if (!skip_sum)
5853 csums = csums + nr_pages;
5854 start_sector += submit_len >> 9;
5855 file_offset += submit_len;
5856
5857 submit_len = 0;
5858 nr_pages = 0;
5859
5860 bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev,
5861 start_sector, GFP_NOFS);
5862 if (!bio)
5863 goto out_err;
5864 bio->bi_private = dip;
5865 bio->bi_end_io = btrfs_end_dio_bio;
5866
5867 map_length = orig_bio->bi_size;
5868 ret = btrfs_map_block(map_tree, READ, start_sector << 9,
5869 &map_length, NULL, 0);
5870 if (ret) {
5871 bio_put(bio);
5872 goto out_err;
5873 }
5874 } else {
5875 submit_len += bvec->bv_len;
5876 nr_pages ++;
5877 bvec++;
5878 }
5879 }
5880
5881 ret = __btrfs_submit_dio_bio(bio, inode, rw, file_offset, skip_sum,
5882 csums);
5883 if (!ret)
5884 return 0;
5885
5886 bio_put(bio);
5887out_err:
5888 dip->errors = 1;
5889 /*
5890 * before atomic variable goto zero, we must
5891 * make sure dip->errors is perceived to be set.
5892 */
5893 smp_mb__before_atomic_dec();
5894 if (atomic_dec_and_test(&dip->pending_bios))
5895 bio_io_error(dip->orig_bio);
5896
5897 /* bio_end_io() will handle error, so we needn't return it */
5898 return 0;
5899}
5900
5687static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode, 5901static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode,
5688 loff_t file_offset) 5902 loff_t file_offset)
5689{ 5903{
@@ -5723,36 +5937,18 @@ static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode,
5723 5937
5724 dip->disk_bytenr = (u64)bio->bi_sector << 9; 5938 dip->disk_bytenr = (u64)bio->bi_sector << 9;
5725 bio->bi_private = dip; 5939 bio->bi_private = dip;
5940 dip->errors = 0;
5941 dip->orig_bio = bio;
5942 atomic_set(&dip->pending_bios, 0);
5726 5943
5727 if (write) 5944 if (write)
5728 bio->bi_end_io = btrfs_endio_direct_write; 5945 bio->bi_end_io = btrfs_endio_direct_write;
5729 else 5946 else
5730 bio->bi_end_io = btrfs_endio_direct_read; 5947 bio->bi_end_io = btrfs_endio_direct_read;
5731 5948
5732 ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); 5949 ret = btrfs_submit_direct_hook(rw, dip, skip_sum);
5733 if (ret) 5950 if (!ret)
5734 goto out_err;
5735
5736 if (write && !skip_sum) {
5737 ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
5738 inode, rw, bio, 0, 0,
5739 dip->logical_offset,
5740 __btrfs_submit_bio_start_direct_io,
5741 __btrfs_submit_bio_done);
5742 if (ret)
5743 goto out_err;
5744 return; 5951 return;
5745 } else if (!skip_sum)
5746 btrfs_lookup_bio_sums_dio(root, inode, bio,
5747 dip->logical_offset, dip->csums);
5748
5749 ret = btrfs_map_bio(root, rw, bio, 0, 1);
5750 if (ret)
5751 goto out_err;
5752 return;
5753out_err:
5754 kfree(dip->csums);
5755 kfree(dip);
5756free_ordered: 5952free_ordered:
5757 /* 5953 /*
5758 * If this is a write, we need to clean up the reserved space and kill 5954 * If this is a write, we need to clean up the reserved space and kill
@@ -5760,8 +5956,7 @@ free_ordered:
5760 */ 5956 */
5761 if (write) { 5957 if (write) {
5762 struct btrfs_ordered_extent *ordered; 5958 struct btrfs_ordered_extent *ordered;
5763 ordered = btrfs_lookup_ordered_extent(inode, 5959 ordered = btrfs_lookup_ordered_extent(inode, file_offset);
5764 dip->logical_offset);
5765 if (!test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags) && 5960 if (!test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags) &&
5766 !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) 5961 !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags))
5767 btrfs_free_reserved_extent(root, ordered->start, 5962 btrfs_free_reserved_extent(root, ordered->start,
@@ -6306,7 +6501,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
6306 ei->ordered_data_close = 0; 6501 ei->ordered_data_close = 0;
6307 ei->orphan_meta_reserved = 0; 6502 ei->orphan_meta_reserved = 0;
6308 ei->dummy_inode = 0; 6503 ei->dummy_inode = 0;
6309 ei->force_compress = 0; 6504 ei->force_compress = BTRFS_COMPRESS_NONE;
6310 6505
6311 inode = &ei->vfs_inode; 6506 inode = &ei->vfs_inode;
6312 extent_map_tree_init(&ei->extent_tree, GFP_NOFS); 6507 extent_map_tree_init(&ei->extent_tree, GFP_NOFS);
@@ -6322,6 +6517,13 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
6322 return inode; 6517 return inode;
6323} 6518}
6324 6519
6520static void btrfs_i_callback(struct rcu_head *head)
6521{
6522 struct inode *inode = container_of(head, struct inode, i_rcu);
6523 INIT_LIST_HEAD(&inode->i_dentry);
6524 kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
6525}
6526
6325void btrfs_destroy_inode(struct inode *inode) 6527void btrfs_destroy_inode(struct inode *inode)
6326{ 6528{
6327 struct btrfs_ordered_extent *ordered; 6529 struct btrfs_ordered_extent *ordered;
@@ -6391,7 +6593,7 @@ void btrfs_destroy_inode(struct inode *inode)
6391 inode_tree_del(inode); 6593 inode_tree_del(inode);
6392 btrfs_drop_extent_cache(inode, 0, (u64)-1, 0); 6594 btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
6393free: 6595free:
6394 kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode)); 6596 call_rcu(&inode->i_rcu, btrfs_i_callback);
6395} 6597}
6396 6598
6397int btrfs_drop_inode(struct inode *inode) 6599int btrfs_drop_inode(struct inode *inode)
@@ -6607,8 +6809,9 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
6607 BUG_ON(ret); 6809 BUG_ON(ret);
6608 6810
6609 if (old_inode->i_ino != BTRFS_FIRST_FREE_OBJECTID) { 6811 if (old_inode->i_ino != BTRFS_FIRST_FREE_OBJECTID) {
6610 btrfs_log_new_name(trans, old_inode, old_dir, 6812 struct dentry *parent = dget_parent(new_dentry);
6611 new_dentry->d_parent); 6813 btrfs_log_new_name(trans, old_inode, old_dir, parent);
6814 dput(parent);
6612 btrfs_end_log_trans(root); 6815 btrfs_end_log_trans(root);
6613 } 6816 }
6614out_fail: 6817out_fail:
@@ -6758,8 +6961,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
6758 btrfs_set_trans_block_group(trans, dir); 6961 btrfs_set_trans_block_group(trans, dir);
6759 6962
6760 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, 6963 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
6761 dentry->d_name.len, 6964 dentry->d_name.len, dir->i_ino, objectid,
6762 dentry->d_parent->d_inode->i_ino, objectid,
6763 BTRFS_I(dir)->block_group, S_IFLNK|S_IRWXUGO, 6965 BTRFS_I(dir)->block_group, S_IFLNK|S_IRWXUGO,
6764 &index); 6966 &index);
6765 err = PTR_ERR(inode); 6967 err = PTR_ERR(inode);
@@ -6773,7 +6975,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
6773 } 6975 }
6774 6976
6775 btrfs_set_trans_block_group(trans, inode); 6977 btrfs_set_trans_block_group(trans, inode);
6776 err = btrfs_add_nondir(trans, dentry, inode, 0, index); 6978 err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
6777 if (err) 6979 if (err)
6778 drop_inode = 1; 6980 drop_inode = 1;
6779 else { 6981 else {
@@ -6844,6 +7046,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
6844 struct btrfs_root *root = BTRFS_I(inode)->root; 7046 struct btrfs_root *root = BTRFS_I(inode)->root;
6845 struct btrfs_key ins; 7047 struct btrfs_key ins;
6846 u64 cur_offset = start; 7048 u64 cur_offset = start;
7049 u64 i_size;
6847 int ret = 0; 7050 int ret = 0;
6848 bool own_trans = true; 7051 bool own_trans = true;
6849 7052
@@ -6885,11 +7088,11 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
6885 (actual_len > inode->i_size) && 7088 (actual_len > inode->i_size) &&
6886 (cur_offset > inode->i_size)) { 7089 (cur_offset > inode->i_size)) {
6887 if (cur_offset > actual_len) 7090 if (cur_offset > actual_len)
6888 i_size_write(inode, actual_len); 7091 i_size = actual_len;
6889 else 7092 else
6890 i_size_write(inode, cur_offset); 7093 i_size = cur_offset;
6891 i_size_write(inode, cur_offset); 7094 i_size_write(inode, i_size);
6892 btrfs_ordered_update_i_size(inode, cur_offset, NULL); 7095 btrfs_ordered_update_i_size(inode, i_size, NULL);
6893 } 7096 }
6894 7097
6895 ret = btrfs_update_inode(trans, root, inode); 7098 ret = btrfs_update_inode(trans, root, inode);
@@ -6919,118 +7122,20 @@ int btrfs_prealloc_file_range_trans(struct inode *inode,
6919 min_size, actual_len, alloc_hint, trans); 7122 min_size, actual_len, alloc_hint, trans);
6920} 7123}
6921 7124
6922static long btrfs_fallocate(struct inode *inode, int mode,
6923 loff_t offset, loff_t len)
6924{
6925 struct extent_state *cached_state = NULL;
6926 u64 cur_offset;
6927 u64 last_byte;
6928 u64 alloc_start;
6929 u64 alloc_end;
6930 u64 alloc_hint = 0;
6931 u64 locked_end;
6932 u64 mask = BTRFS_I(inode)->root->sectorsize - 1;
6933 struct extent_map *em;
6934 int ret;
6935
6936 alloc_start = offset & ~mask;
6937 alloc_end = (offset + len + mask) & ~mask;
6938
6939 /*
6940 * wait for ordered IO before we have any locks. We'll loop again
6941 * below with the locks held.
6942 */
6943 btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start);
6944
6945 mutex_lock(&inode->i_mutex);
6946 if (alloc_start > inode->i_size) {
6947 ret = btrfs_cont_expand(inode, alloc_start);
6948 if (ret)
6949 goto out;
6950 }
6951
6952 ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start);
6953 if (ret)
6954 goto out;
6955
6956 locked_end = alloc_end - 1;
6957 while (1) {
6958 struct btrfs_ordered_extent *ordered;
6959
6960 /* the extent lock is ordered inside the running
6961 * transaction
6962 */
6963 lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start,
6964 locked_end, 0, &cached_state, GFP_NOFS);
6965 ordered = btrfs_lookup_first_ordered_extent(inode,
6966 alloc_end - 1);
6967 if (ordered &&
6968 ordered->file_offset + ordered->len > alloc_start &&
6969 ordered->file_offset < alloc_end) {
6970 btrfs_put_ordered_extent(ordered);
6971 unlock_extent_cached(&BTRFS_I(inode)->io_tree,
6972 alloc_start, locked_end,
6973 &cached_state, GFP_NOFS);
6974 /*
6975 * we can't wait on the range with the transaction
6976 * running or with the extent lock held
6977 */
6978 btrfs_wait_ordered_range(inode, alloc_start,
6979 alloc_end - alloc_start);
6980 } else {
6981 if (ordered)
6982 btrfs_put_ordered_extent(ordered);
6983 break;
6984 }
6985 }
6986
6987 cur_offset = alloc_start;
6988 while (1) {
6989 em = btrfs_get_extent(inode, NULL, 0, cur_offset,
6990 alloc_end - cur_offset, 0);
6991 BUG_ON(IS_ERR(em) || !em);
6992 last_byte = min(extent_map_end(em), alloc_end);
6993 last_byte = (last_byte + mask) & ~mask;
6994 if (em->block_start == EXTENT_MAP_HOLE ||
6995 (cur_offset >= inode->i_size &&
6996 !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
6997 ret = btrfs_prealloc_file_range(inode, mode, cur_offset,
6998 last_byte - cur_offset,
6999 1 << inode->i_blkbits,
7000 offset + len,
7001 &alloc_hint);
7002 if (ret < 0) {
7003 free_extent_map(em);
7004 break;
7005 }
7006 }
7007 free_extent_map(em);
7008
7009 cur_offset = last_byte;
7010 if (cur_offset >= alloc_end) {
7011 ret = 0;
7012 break;
7013 }
7014 }
7015 unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
7016 &cached_state, GFP_NOFS);
7017
7018 btrfs_free_reserved_data_space(inode, alloc_end - alloc_start);
7019out:
7020 mutex_unlock(&inode->i_mutex);
7021 return ret;
7022}
7023
7024static int btrfs_set_page_dirty(struct page *page) 7125static int btrfs_set_page_dirty(struct page *page)
7025{ 7126{
7026 return __set_page_dirty_nobuffers(page); 7127 return __set_page_dirty_nobuffers(page);
7027} 7128}
7028 7129
7029static int btrfs_permission(struct inode *inode, int mask) 7130static int btrfs_permission(struct inode *inode, int mask, unsigned int flags)
7030{ 7131{
7132 struct btrfs_root *root = BTRFS_I(inode)->root;
7133
7134 if (btrfs_root_readonly(root) && (mask & MAY_WRITE))
7135 return -EROFS;
7031 if ((BTRFS_I(inode)->flags & BTRFS_INODE_READONLY) && (mask & MAY_WRITE)) 7136 if ((BTRFS_I(inode)->flags & BTRFS_INODE_READONLY) && (mask & MAY_WRITE))
7032 return -EACCES; 7137 return -EACCES;
7033 return generic_permission(inode, mask, btrfs_check_acl); 7138 return generic_permission(inode, mask, flags, btrfs_check_acl);
7034} 7139}
7035 7140
7036static const struct inode_operations btrfs_dir_inode_operations = { 7141static const struct inode_operations btrfs_dir_inode_operations = {
@@ -7123,7 +7228,6 @@ static const struct inode_operations btrfs_file_inode_operations = {
7123 .listxattr = btrfs_listxattr, 7228 .listxattr = btrfs_listxattr,
7124 .removexattr = btrfs_removexattr, 7229 .removexattr = btrfs_removexattr,
7125 .permission = btrfs_permission, 7230 .permission = btrfs_permission,
7126 .fallocate = btrfs_fallocate,
7127 .fiemap = btrfs_fiemap, 7231 .fiemap = btrfs_fiemap,
7128}; 7232};
7129static const struct inode_operations btrfs_special_inode_operations = { 7233static const struct inode_operations btrfs_special_inode_operations = {
@@ -7139,6 +7243,7 @@ static const struct inode_operations btrfs_symlink_inode_operations = {
7139 .readlink = generic_readlink, 7243 .readlink = generic_readlink,
7140 .follow_link = page_follow_link_light, 7244 .follow_link = page_follow_link_light,
7141 .put_link = page_put_link, 7245 .put_link = page_put_link,
7246 .getattr = btrfs_getattr,
7142 .permission = btrfs_permission, 7247 .permission = btrfs_permission,
7143 .setxattr = btrfs_setxattr, 7248 .setxattr = btrfs_setxattr,
7144 .getxattr = btrfs_getxattr, 7249 .getxattr = btrfs_getxattr,
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 463d91b4dd3..a506a22b522 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -147,6 +147,9 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
147 unsigned int flags, oldflags; 147 unsigned int flags, oldflags;
148 int ret; 148 int ret;
149 149
150 if (btrfs_root_readonly(root))
151 return -EROFS;
152
150 if (copy_from_user(&flags, arg, sizeof(flags))) 153 if (copy_from_user(&flags, arg, sizeof(flags)))
151 return -EFAULT; 154 return -EFAULT;
152 155
@@ -233,7 +236,8 @@ static noinline int create_subvol(struct btrfs_root *root,
233 struct btrfs_inode_item *inode_item; 236 struct btrfs_inode_item *inode_item;
234 struct extent_buffer *leaf; 237 struct extent_buffer *leaf;
235 struct btrfs_root *new_root; 238 struct btrfs_root *new_root;
236 struct inode *dir = dentry->d_parent->d_inode; 239 struct dentry *parent = dget_parent(dentry);
240 struct inode *dir;
237 int ret; 241 int ret;
238 int err; 242 int err;
239 u64 objectid; 243 u64 objectid;
@@ -242,8 +246,13 @@ static noinline int create_subvol(struct btrfs_root *root,
242 246
243 ret = btrfs_find_free_objectid(NULL, root->fs_info->tree_root, 247 ret = btrfs_find_free_objectid(NULL, root->fs_info->tree_root,
244 0, &objectid); 248 0, &objectid);
245 if (ret) 249 if (ret) {
250 dput(parent);
246 return ret; 251 return ret;
252 }
253
254 dir = parent->d_inode;
255
247 /* 256 /*
248 * 1 - inode item 257 * 1 - inode item
249 * 2 - refs 258 * 2 - refs
@@ -251,8 +260,10 @@ static noinline int create_subvol(struct btrfs_root *root,
251 * 2 - dir items 260 * 2 - dir items
252 */ 261 */
253 trans = btrfs_start_transaction(root, 6); 262 trans = btrfs_start_transaction(root, 6);
254 if (IS_ERR(trans)) 263 if (IS_ERR(trans)) {
264 dput(parent);
255 return PTR_ERR(trans); 265 return PTR_ERR(trans);
266 }
256 267
257 leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 268 leaf = btrfs_alloc_free_block(trans, root, root->leafsize,
258 0, objectid, NULL, 0, 0, 0); 269 0, objectid, NULL, 0, 0, 0);
@@ -339,6 +350,7 @@ static noinline int create_subvol(struct btrfs_root *root,
339 350
340 d_instantiate(dentry, btrfs_lookup_dentry(dir, dentry)); 351 d_instantiate(dentry, btrfs_lookup_dentry(dir, dentry));
341fail: 352fail:
353 dput(parent);
342 if (async_transid) { 354 if (async_transid) {
343 *async_transid = trans->transid; 355 *async_transid = trans->transid;
344 err = btrfs_commit_transaction_async(trans, root, 1); 356 err = btrfs_commit_transaction_async(trans, root, 1);
@@ -351,9 +363,11 @@ fail:
351} 363}
352 364
353static int create_snapshot(struct btrfs_root *root, struct dentry *dentry, 365static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
354 char *name, int namelen, u64 *async_transid) 366 char *name, int namelen, u64 *async_transid,
367 bool readonly)
355{ 368{
356 struct inode *inode; 369 struct inode *inode;
370 struct dentry *parent;
357 struct btrfs_pending_snapshot *pending_snapshot; 371 struct btrfs_pending_snapshot *pending_snapshot;
358 struct btrfs_trans_handle *trans; 372 struct btrfs_trans_handle *trans;
359 int ret; 373 int ret;
@@ -368,6 +382,7 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
368 btrfs_init_block_rsv(&pending_snapshot->block_rsv); 382 btrfs_init_block_rsv(&pending_snapshot->block_rsv);
369 pending_snapshot->dentry = dentry; 383 pending_snapshot->dentry = dentry;
370 pending_snapshot->root = root; 384 pending_snapshot->root = root;
385 pending_snapshot->readonly = readonly;
371 386
372 trans = btrfs_start_transaction(root->fs_info->extent_root, 5); 387 trans = btrfs_start_transaction(root->fs_info->extent_root, 5);
373 if (IS_ERR(trans)) { 388 if (IS_ERR(trans)) {
@@ -396,7 +411,9 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
396 411
397 btrfs_orphan_cleanup(pending_snapshot->snap); 412 btrfs_orphan_cleanup(pending_snapshot->snap);
398 413
399 inode = btrfs_lookup_dentry(dentry->d_parent->d_inode, dentry); 414 parent = dget_parent(dentry);
415 inode = btrfs_lookup_dentry(parent->d_inode, dentry);
416 dput(parent);
400 if (IS_ERR(inode)) { 417 if (IS_ERR(inode)) {
401 ret = PTR_ERR(inode); 418 ret = PTR_ERR(inode);
402 goto fail; 419 goto fail;
@@ -497,7 +514,7 @@ static inline int btrfs_may_create(struct inode *dir, struct dentry *child)
497static noinline int btrfs_mksubvol(struct path *parent, 514static noinline int btrfs_mksubvol(struct path *parent,
498 char *name, int namelen, 515 char *name, int namelen,
499 struct btrfs_root *snap_src, 516 struct btrfs_root *snap_src,
500 u64 *async_transid) 517 u64 *async_transid, bool readonly)
501{ 518{
502 struct inode *dir = parent->dentry->d_inode; 519 struct inode *dir = parent->dentry->d_inode;
503 struct dentry *dentry; 520 struct dentry *dentry;
@@ -529,7 +546,7 @@ static noinline int btrfs_mksubvol(struct path *parent,
529 546
530 if (snap_src) { 547 if (snap_src) {
531 error = create_snapshot(snap_src, dentry, 548 error = create_snapshot(snap_src, dentry,
532 name, namelen, async_transid); 549 name, namelen, async_transid, readonly);
533 } else { 550 } else {
534 error = create_subvol(BTRFS_I(dir)->root, dentry, 551 error = create_subvol(BTRFS_I(dir)->root, dentry,
535 name, namelen, async_transid); 552 name, namelen, async_transid);
@@ -626,9 +643,11 @@ static int btrfs_defrag_file(struct file *file,
626 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 643 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
627 struct btrfs_ordered_extent *ordered; 644 struct btrfs_ordered_extent *ordered;
628 struct page *page; 645 struct page *page;
646 struct btrfs_super_block *disk_super;
629 unsigned long last_index; 647 unsigned long last_index;
630 unsigned long ra_pages = root->fs_info->bdi.ra_pages; 648 unsigned long ra_pages = root->fs_info->bdi.ra_pages;
631 unsigned long total_read = 0; 649 unsigned long total_read = 0;
650 u64 features;
632 u64 page_start; 651 u64 page_start;
633 u64 page_end; 652 u64 page_end;
634 u64 last_len = 0; 653 u64 last_len = 0;
@@ -636,6 +655,14 @@ static int btrfs_defrag_file(struct file *file,
636 u64 defrag_end = 0; 655 u64 defrag_end = 0;
637 unsigned long i; 656 unsigned long i;
638 int ret; 657 int ret;
658 int compress_type = BTRFS_COMPRESS_ZLIB;
659
660 if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) {
661 if (range->compress_type > BTRFS_COMPRESS_TYPES)
662 return -EINVAL;
663 if (range->compress_type)
664 compress_type = range->compress_type;
665 }
639 666
640 if (inode->i_size == 0) 667 if (inode->i_size == 0)
641 return 0; 668 return 0;
@@ -671,7 +698,7 @@ static int btrfs_defrag_file(struct file *file,
671 total_read++; 698 total_read++;
672 mutex_lock(&inode->i_mutex); 699 mutex_lock(&inode->i_mutex);
673 if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) 700 if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)
674 BTRFS_I(inode)->force_compress = 1; 701 BTRFS_I(inode)->force_compress = compress_type;
675 702
676 ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE); 703 ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
677 if (ret) 704 if (ret)
@@ -769,10 +796,17 @@ loop_unlock:
769 atomic_dec(&root->fs_info->async_submit_draining); 796 atomic_dec(&root->fs_info->async_submit_draining);
770 797
771 mutex_lock(&inode->i_mutex); 798 mutex_lock(&inode->i_mutex);
772 BTRFS_I(inode)->force_compress = 0; 799 BTRFS_I(inode)->force_compress = BTRFS_COMPRESS_NONE;
773 mutex_unlock(&inode->i_mutex); 800 mutex_unlock(&inode->i_mutex);
774 } 801 }
775 802
803 disk_super = &root->fs_info->super_copy;
804 features = btrfs_super_incompat_flags(disk_super);
805 if (range->compress_type == BTRFS_COMPRESS_LZO) {
806 features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO;
807 btrfs_set_super_incompat_flags(disk_super, features);
808 }
809
776 return 0; 810 return 0;
777 811
778err_reservations: 812err_reservations:
@@ -889,7 +923,8 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file,
889 char *name, 923 char *name,
890 unsigned long fd, 924 unsigned long fd,
891 int subvol, 925 int subvol,
892 u64 *transid) 926 u64 *transid,
927 bool readonly)
893{ 928{
894 struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; 929 struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
895 struct file *src_file; 930 struct file *src_file;
@@ -907,7 +942,7 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file,
907 942
908 if (subvol) { 943 if (subvol) {
909 ret = btrfs_mksubvol(&file->f_path, name, namelen, 944 ret = btrfs_mksubvol(&file->f_path, name, namelen,
910 NULL, transid); 945 NULL, transid, readonly);
911 } else { 946 } else {
912 struct inode *src_inode; 947 struct inode *src_inode;
913 src_file = fget(fd); 948 src_file = fget(fd);
@@ -926,7 +961,7 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file,
926 } 961 }
927 ret = btrfs_mksubvol(&file->f_path, name, namelen, 962 ret = btrfs_mksubvol(&file->f_path, name, namelen,
928 BTRFS_I(src_inode)->root, 963 BTRFS_I(src_inode)->root,
929 transid); 964 transid, readonly);
930 fput(src_file); 965 fput(src_file);
931 } 966 }
932out: 967out:
@@ -934,49 +969,142 @@ out:
934} 969}
935 970
936static noinline int btrfs_ioctl_snap_create(struct file *file, 971static noinline int btrfs_ioctl_snap_create(struct file *file,
937 void __user *arg, int subvol, 972 void __user *arg, int subvol)
938 int async)
939{ 973{
940 struct btrfs_ioctl_vol_args *vol_args = NULL; 974 struct btrfs_ioctl_vol_args *vol_args;
941 struct btrfs_ioctl_async_vol_args *async_vol_args = NULL;
942 char *name;
943 u64 fd;
944 u64 transid = 0;
945 int ret; 975 int ret;
946 976
947 if (async) { 977 vol_args = memdup_user(arg, sizeof(*vol_args));
948 async_vol_args = memdup_user(arg, sizeof(*async_vol_args)); 978 if (IS_ERR(vol_args))
949 if (IS_ERR(async_vol_args)) 979 return PTR_ERR(vol_args);
950 return PTR_ERR(async_vol_args); 980 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
951 981
952 name = async_vol_args->name; 982 ret = btrfs_ioctl_snap_create_transid(file, vol_args->name,
953 fd = async_vol_args->fd; 983 vol_args->fd, subvol,
954 async_vol_args->name[BTRFS_SNAPSHOT_NAME_MAX] = '\0'; 984 NULL, false);
955 } else {
956 vol_args = memdup_user(arg, sizeof(*vol_args));
957 if (IS_ERR(vol_args))
958 return PTR_ERR(vol_args);
959 name = vol_args->name;
960 fd = vol_args->fd;
961 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
962 }
963 985
964 ret = btrfs_ioctl_snap_create_transid(file, name, fd, 986 kfree(vol_args);
965 subvol, &transid); 987 return ret;
988}
966 989
967 if (!ret && async) { 990static noinline int btrfs_ioctl_snap_create_v2(struct file *file,
968 if (copy_to_user(arg + 991 void __user *arg, int subvol)
969 offsetof(struct btrfs_ioctl_async_vol_args, 992{
970 transid), &transid, sizeof(transid))) 993 struct btrfs_ioctl_vol_args_v2 *vol_args;
971 return -EFAULT; 994 int ret;
995 u64 transid = 0;
996 u64 *ptr = NULL;
997 bool readonly = false;
998
999 vol_args = memdup_user(arg, sizeof(*vol_args));
1000 if (IS_ERR(vol_args))
1001 return PTR_ERR(vol_args);
1002 vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0';
1003
1004 if (vol_args->flags &
1005 ~(BTRFS_SUBVOL_CREATE_ASYNC | BTRFS_SUBVOL_RDONLY)) {
1006 ret = -EOPNOTSUPP;
1007 goto out;
972 } 1008 }
973 1009
1010 if (vol_args->flags & BTRFS_SUBVOL_CREATE_ASYNC)
1011 ptr = &transid;
1012 if (vol_args->flags & BTRFS_SUBVOL_RDONLY)
1013 readonly = true;
1014
1015 ret = btrfs_ioctl_snap_create_transid(file, vol_args->name,
1016 vol_args->fd, subvol,
1017 ptr, readonly);
1018
1019 if (ret == 0 && ptr &&
1020 copy_to_user(arg +
1021 offsetof(struct btrfs_ioctl_vol_args_v2,
1022 transid), ptr, sizeof(*ptr)))
1023 ret = -EFAULT;
1024out:
974 kfree(vol_args); 1025 kfree(vol_args);
975 kfree(async_vol_args); 1026 return ret;
1027}
1028
1029static noinline int btrfs_ioctl_subvol_getflags(struct file *file,
1030 void __user *arg)
1031{
1032 struct inode *inode = fdentry(file)->d_inode;
1033 struct btrfs_root *root = BTRFS_I(inode)->root;
1034 int ret = 0;
1035 u64 flags = 0;
1036
1037 if (inode->i_ino != BTRFS_FIRST_FREE_OBJECTID)
1038 return -EINVAL;
1039
1040 down_read(&root->fs_info->subvol_sem);
1041 if (btrfs_root_readonly(root))
1042 flags |= BTRFS_SUBVOL_RDONLY;
1043 up_read(&root->fs_info->subvol_sem);
1044
1045 if (copy_to_user(arg, &flags, sizeof(flags)))
1046 ret = -EFAULT;
976 1047
977 return ret; 1048 return ret;
978} 1049}
979 1050
1051static noinline int btrfs_ioctl_subvol_setflags(struct file *file,
1052 void __user *arg)
1053{
1054 struct inode *inode = fdentry(file)->d_inode;
1055 struct btrfs_root *root = BTRFS_I(inode)->root;
1056 struct btrfs_trans_handle *trans;
1057 u64 root_flags;
1058 u64 flags;
1059 int ret = 0;
1060
1061 if (root->fs_info->sb->s_flags & MS_RDONLY)
1062 return -EROFS;
1063
1064 if (inode->i_ino != BTRFS_FIRST_FREE_OBJECTID)
1065 return -EINVAL;
1066
1067 if (copy_from_user(&flags, arg, sizeof(flags)))
1068 return -EFAULT;
1069
1070 if (flags & ~BTRFS_SUBVOL_CREATE_ASYNC)
1071 return -EINVAL;
1072
1073 if (flags & ~BTRFS_SUBVOL_RDONLY)
1074 return -EOPNOTSUPP;
1075
1076 down_write(&root->fs_info->subvol_sem);
1077
1078 /* nothing to do */
1079 if (!!(flags & BTRFS_SUBVOL_RDONLY) == btrfs_root_readonly(root))
1080 goto out;
1081
1082 root_flags = btrfs_root_flags(&root->root_item);
1083 if (flags & BTRFS_SUBVOL_RDONLY)
1084 btrfs_set_root_flags(&root->root_item,
1085 root_flags | BTRFS_ROOT_SUBVOL_RDONLY);
1086 else
1087 btrfs_set_root_flags(&root->root_item,
1088 root_flags & ~BTRFS_ROOT_SUBVOL_RDONLY);
1089
1090 trans = btrfs_start_transaction(root, 1);
1091 if (IS_ERR(trans)) {
1092 ret = PTR_ERR(trans);
1093 goto out_reset;
1094 }
1095
1096 ret = btrfs_update_root(trans, root,
1097 &root->root_key, &root->root_item);
1098
1099 btrfs_commit_transaction(trans, root);
1100out_reset:
1101 if (ret)
1102 btrfs_set_root_flags(&root->root_item, root_flags);
1103out:
1104 up_write(&root->fs_info->subvol_sem);
1105 return ret;
1106}
1107
980/* 1108/*
981 * helper to check if the subvolume references other subvolumes 1109 * helper to check if the subvolume references other subvolumes
982 */ 1110 */
@@ -1485,6 +1613,9 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
1485 struct btrfs_ioctl_defrag_range_args *range; 1613 struct btrfs_ioctl_defrag_range_args *range;
1486 int ret; 1614 int ret;
1487 1615
1616 if (btrfs_root_readonly(root))
1617 return -EROFS;
1618
1488 ret = mnt_want_write(file->f_path.mnt); 1619 ret = mnt_want_write(file->f_path.mnt);
1489 if (ret) 1620 if (ret)
1490 return ret; 1621 return ret;
@@ -1613,6 +1744,9 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1613 if (!(file->f_mode & FMODE_WRITE) || (file->f_flags & O_APPEND)) 1744 if (!(file->f_mode & FMODE_WRITE) || (file->f_flags & O_APPEND))
1614 return -EINVAL; 1745 return -EINVAL;
1615 1746
1747 if (btrfs_root_readonly(root))
1748 return -EROFS;
1749
1616 ret = mnt_want_write(file->f_path.mnt); 1750 ret = mnt_want_write(file->f_path.mnt);
1617 if (ret) 1751 if (ret)
1618 return ret; 1752 return ret;
@@ -1669,12 +1803,11 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1669 olen = len = src->i_size - off; 1803 olen = len = src->i_size - off;
1670 /* if we extend to eof, continue to block boundary */ 1804 /* if we extend to eof, continue to block boundary */
1671 if (off + len == src->i_size) 1805 if (off + len == src->i_size)
1672 len = ((src->i_size + bs-1) & ~(bs-1)) 1806 len = ALIGN(src->i_size, bs) - off;
1673 - off;
1674 1807
1675 /* verify the end result is block aligned */ 1808 /* verify the end result is block aligned */
1676 if ((off & (bs-1)) || 1809 if (!IS_ALIGNED(off, bs) || !IS_ALIGNED(off + len, bs) ||
1677 ((off + len) & (bs-1))) 1810 !IS_ALIGNED(destoff, bs))
1678 goto out_unlock; 1811 goto out_unlock;
1679 1812
1680 /* do any pending delalloc/csum calc on src, one way or 1813 /* do any pending delalloc/csum calc on src, one way or
@@ -1874,8 +2007,8 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1874 * but shouldn't round up the file size 2007 * but shouldn't round up the file size
1875 */ 2008 */
1876 endoff = new_key.offset + datal; 2009 endoff = new_key.offset + datal;
1877 if (endoff > off+olen) 2010 if (endoff > destoff+olen)
1878 endoff = off+olen; 2011 endoff = destoff+olen;
1879 if (endoff > inode->i_size) 2012 if (endoff > inode->i_size)
1880 btrfs_i_size_write(inode, endoff); 2013 btrfs_i_size_write(inode, endoff);
1881 2014
@@ -1935,6 +2068,10 @@ static long btrfs_ioctl_trans_start(struct file *file)
1935 if (file->private_data) 2068 if (file->private_data)
1936 goto out; 2069 goto out;
1937 2070
2071 ret = -EROFS;
2072 if (btrfs_root_readonly(root))
2073 goto out;
2074
1938 ret = mnt_want_write(file->f_path.mnt); 2075 ret = mnt_want_write(file->f_path.mnt);
1939 if (ret) 2076 if (ret)
1940 goto out; 2077 goto out;
@@ -2234,13 +2371,17 @@ long btrfs_ioctl(struct file *file, unsigned int
2234 case FS_IOC_GETVERSION: 2371 case FS_IOC_GETVERSION:
2235 return btrfs_ioctl_getversion(file, argp); 2372 return btrfs_ioctl_getversion(file, argp);
2236 case BTRFS_IOC_SNAP_CREATE: 2373 case BTRFS_IOC_SNAP_CREATE:
2237 return btrfs_ioctl_snap_create(file, argp, 0, 0); 2374 return btrfs_ioctl_snap_create(file, argp, 0);
2238 case BTRFS_IOC_SNAP_CREATE_ASYNC: 2375 case BTRFS_IOC_SNAP_CREATE_V2:
2239 return btrfs_ioctl_snap_create(file, argp, 0, 1); 2376 return btrfs_ioctl_snap_create_v2(file, argp, 0);
2240 case BTRFS_IOC_SUBVOL_CREATE: 2377 case BTRFS_IOC_SUBVOL_CREATE:
2241 return btrfs_ioctl_snap_create(file, argp, 1, 0); 2378 return btrfs_ioctl_snap_create(file, argp, 1);
2242 case BTRFS_IOC_SNAP_DESTROY: 2379 case BTRFS_IOC_SNAP_DESTROY:
2243 return btrfs_ioctl_snap_destroy(file, argp); 2380 return btrfs_ioctl_snap_destroy(file, argp);
2381 case BTRFS_IOC_SUBVOL_GETFLAGS:
2382 return btrfs_ioctl_subvol_getflags(file, argp);
2383 case BTRFS_IOC_SUBVOL_SETFLAGS:
2384 return btrfs_ioctl_subvol_setflags(file, argp);
2244 case BTRFS_IOC_DEFAULT_SUBVOL: 2385 case BTRFS_IOC_DEFAULT_SUBVOL:
2245 return btrfs_ioctl_default_subvol(file, argp); 2386 return btrfs_ioctl_default_subvol(file, argp);
2246 case BTRFS_IOC_DEFRAG: 2387 case BTRFS_IOC_DEFRAG:
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index 17c99ebdf96..8fb382167b1 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -30,11 +30,16 @@ struct btrfs_ioctl_vol_args {
30 char name[BTRFS_PATH_NAME_MAX + 1]; 30 char name[BTRFS_PATH_NAME_MAX + 1];
31}; 31};
32 32
33#define BTRFS_SNAPSHOT_NAME_MAX 4079 33#define BTRFS_SUBVOL_CREATE_ASYNC (1ULL << 0)
34struct btrfs_ioctl_async_vol_args { 34#define BTRFS_SUBVOL_RDONLY (1ULL << 1)
35
36#define BTRFS_SUBVOL_NAME_MAX 4039
37struct btrfs_ioctl_vol_args_v2 {
35 __s64 fd; 38 __s64 fd;
36 __u64 transid; 39 __u64 transid;
37 char name[BTRFS_SNAPSHOT_NAME_MAX + 1]; 40 __u64 flags;
41 __u64 unused[4];
42 char name[BTRFS_SUBVOL_NAME_MAX + 1];
38}; 43};
39 44
40#define BTRFS_INO_LOOKUP_PATH_MAX 4080 45#define BTRFS_INO_LOOKUP_PATH_MAX 4080
@@ -129,8 +134,15 @@ struct btrfs_ioctl_defrag_range_args {
129 */ 134 */
130 __u32 extent_thresh; 135 __u32 extent_thresh;
131 136
137 /*
138 * which compression method to use if turning on compression
139 * for this defrag operation. If unspecified, zlib will
140 * be used
141 */
142 __u32 compress_type;
143
132 /* spare for later */ 144 /* spare for later */
133 __u32 unused[5]; 145 __u32 unused[4];
134}; 146};
135 147
136struct btrfs_ioctl_space_info { 148struct btrfs_ioctl_space_info {
@@ -187,6 +199,8 @@ struct btrfs_ioctl_space_args {
187 struct btrfs_ioctl_space_args) 199 struct btrfs_ioctl_space_args)
188#define BTRFS_IOC_START_SYNC _IOR(BTRFS_IOCTL_MAGIC, 24, __u64) 200#define BTRFS_IOC_START_SYNC _IOR(BTRFS_IOCTL_MAGIC, 24, __u64)
189#define BTRFS_IOC_WAIT_SYNC _IOW(BTRFS_IOCTL_MAGIC, 22, __u64) 201#define BTRFS_IOC_WAIT_SYNC _IOW(BTRFS_IOCTL_MAGIC, 22, __u64)
190#define BTRFS_IOC_SNAP_CREATE_ASYNC _IOW(BTRFS_IOCTL_MAGIC, 23, \ 202#define BTRFS_IOC_SNAP_CREATE_V2 _IOW(BTRFS_IOCTL_MAGIC, 23, \
191 struct btrfs_ioctl_async_vol_args) 203 struct btrfs_ioctl_vol_args_v2)
204#define BTRFS_IOC_SUBVOL_GETFLAGS _IOW(BTRFS_IOCTL_MAGIC, 25, __u64)
205#define BTRFS_IOC_SUBVOL_SETFLAGS _IOW(BTRFS_IOCTL_MAGIC, 26, __u64)
192#endif 206#endif
diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
new file mode 100644
index 00000000000..cc9b450399d
--- /dev/null
+++ b/fs/btrfs/lzo.c
@@ -0,0 +1,420 @@
1/*
2 * Copyright (C) 2008 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#include <linux/kernel.h>
20#include <linux/slab.h>
21#include <linux/vmalloc.h>
22#include <linux/init.h>
23#include <linux/err.h>
24#include <linux/sched.h>
25#include <linux/pagemap.h>
26#include <linux/bio.h>
27#include <linux/lzo.h>
28#include "compression.h"
29
30#define LZO_LEN 4
31
32struct workspace {
33 void *mem;
34 void *buf; /* where decompressed data goes */
35 void *cbuf; /* where compressed data goes */
36 struct list_head list;
37};
38
39static void lzo_free_workspace(struct list_head *ws)
40{
41 struct workspace *workspace = list_entry(ws, struct workspace, list);
42
43 vfree(workspace->buf);
44 vfree(workspace->cbuf);
45 vfree(workspace->mem);
46 kfree(workspace);
47}
48
49static struct list_head *lzo_alloc_workspace(void)
50{
51 struct workspace *workspace;
52
53 workspace = kzalloc(sizeof(*workspace), GFP_NOFS);
54 if (!workspace)
55 return ERR_PTR(-ENOMEM);
56
57 workspace->mem = vmalloc(LZO1X_MEM_COMPRESS);
58 workspace->buf = vmalloc(lzo1x_worst_compress(PAGE_CACHE_SIZE));
59 workspace->cbuf = vmalloc(lzo1x_worst_compress(PAGE_CACHE_SIZE));
60 if (!workspace->mem || !workspace->buf || !workspace->cbuf)
61 goto fail;
62
63 INIT_LIST_HEAD(&workspace->list);
64
65 return &workspace->list;
66fail:
67 lzo_free_workspace(&workspace->list);
68 return ERR_PTR(-ENOMEM);
69}
70
71static inline void write_compress_length(char *buf, size_t len)
72{
73 __le32 dlen;
74
75 dlen = cpu_to_le32(len);
76 memcpy(buf, &dlen, LZO_LEN);
77}
78
79static inline size_t read_compress_length(char *buf)
80{
81 __le32 dlen;
82
83 memcpy(&dlen, buf, LZO_LEN);
84 return le32_to_cpu(dlen);
85}
86
87static int lzo_compress_pages(struct list_head *ws,
88 struct address_space *mapping,
89 u64 start, unsigned long len,
90 struct page **pages,
91 unsigned long nr_dest_pages,
92 unsigned long *out_pages,
93 unsigned long *total_in,
94 unsigned long *total_out,
95 unsigned long max_out)
96{
97 struct workspace *workspace = list_entry(ws, struct workspace, list);
98 int ret = 0;
99 char *data_in;
100 char *cpage_out;
101 int nr_pages = 0;
102 struct page *in_page = NULL;
103 struct page *out_page = NULL;
104 unsigned long bytes_left;
105
106 size_t in_len;
107 size_t out_len;
108 char *buf;
109 unsigned long tot_in = 0;
110 unsigned long tot_out = 0;
111 unsigned long pg_bytes_left;
112 unsigned long out_offset;
113 unsigned long bytes;
114
115 *out_pages = 0;
116 *total_out = 0;
117 *total_in = 0;
118
119 in_page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT);
120 data_in = kmap(in_page);
121
122 /*
123 * store the size of all chunks of compressed data in
124 * the first 4 bytes
125 */
126 out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
127 if (out_page == NULL) {
128 ret = -ENOMEM;
129 goto out;
130 }
131 cpage_out = kmap(out_page);
132 out_offset = LZO_LEN;
133 tot_out = LZO_LEN;
134 pages[0] = out_page;
135 nr_pages = 1;
136 pg_bytes_left = PAGE_CACHE_SIZE - LZO_LEN;
137
138 /* compress at most one page of data each time */
139 in_len = min(len, PAGE_CACHE_SIZE);
140 while (tot_in < len) {
141 ret = lzo1x_1_compress(data_in, in_len, workspace->cbuf,
142 &out_len, workspace->mem);
143 if (ret != LZO_E_OK) {
144 printk(KERN_DEBUG "btrfs deflate in loop returned %d\n",
145 ret);
146 ret = -1;
147 goto out;
148 }
149
150 /* store the size of this chunk of compressed data */
151 write_compress_length(cpage_out + out_offset, out_len);
152 tot_out += LZO_LEN;
153 out_offset += LZO_LEN;
154 pg_bytes_left -= LZO_LEN;
155
156 tot_in += in_len;
157 tot_out += out_len;
158
159 /* copy bytes from the working buffer into the pages */
160 buf = workspace->cbuf;
161 while (out_len) {
162 bytes = min_t(unsigned long, pg_bytes_left, out_len);
163
164 memcpy(cpage_out + out_offset, buf, bytes);
165
166 out_len -= bytes;
167 pg_bytes_left -= bytes;
168 buf += bytes;
169 out_offset += bytes;
170
171 /*
172 * we need another page for writing out.
173 *
174 * Note if there's less than 4 bytes left, we just
175 * skip to a new page.
176 */
177 if ((out_len == 0 && pg_bytes_left < LZO_LEN) ||
178 pg_bytes_left == 0) {
179 if (pg_bytes_left) {
180 memset(cpage_out + out_offset, 0,
181 pg_bytes_left);
182 tot_out += pg_bytes_left;
183 }
184
185 /* we're done, don't allocate new page */
186 if (out_len == 0 && tot_in >= len)
187 break;
188
189 kunmap(out_page);
190 if (nr_pages == nr_dest_pages) {
191 out_page = NULL;
192 ret = -1;
193 goto out;
194 }
195
196 out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
197 if (out_page == NULL) {
198 ret = -ENOMEM;
199 goto out;
200 }
201 cpage_out = kmap(out_page);
202 pages[nr_pages++] = out_page;
203
204 pg_bytes_left = PAGE_CACHE_SIZE;
205 out_offset = 0;
206 }
207 }
208
209 /* we're making it bigger, give up */
210 if (tot_in > 8192 && tot_in < tot_out)
211 goto out;
212
213 /* we're all done */
214 if (tot_in >= len)
215 break;
216
217 if (tot_out > max_out)
218 break;
219
220 bytes_left = len - tot_in;
221 kunmap(in_page);
222 page_cache_release(in_page);
223
224 start += PAGE_CACHE_SIZE;
225 in_page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT);
226 data_in = kmap(in_page);
227 in_len = min(bytes_left, PAGE_CACHE_SIZE);
228 }
229
230 if (tot_out > tot_in)
231 goto out;
232
233 /* store the size of all chunks of compressed data */
234 cpage_out = kmap(pages[0]);
235 write_compress_length(cpage_out, tot_out);
236
237 kunmap(pages[0]);
238
239 ret = 0;
240 *total_out = tot_out;
241 *total_in = tot_in;
242out:
243 *out_pages = nr_pages;
244 if (out_page)
245 kunmap(out_page);
246
247 if (in_page) {
248 kunmap(in_page);
249 page_cache_release(in_page);
250 }
251
252 return ret;
253}
254
255static int lzo_decompress_biovec(struct list_head *ws,
256 struct page **pages_in,
257 u64 disk_start,
258 struct bio_vec *bvec,
259 int vcnt,
260 size_t srclen)
261{
262 struct workspace *workspace = list_entry(ws, struct workspace, list);
263 int ret = 0, ret2;
264 char *data_in;
265 unsigned long page_in_index = 0;
266 unsigned long page_out_index = 0;
267 unsigned long total_pages_in = (srclen + PAGE_CACHE_SIZE - 1) /
268 PAGE_CACHE_SIZE;
269 unsigned long buf_start;
270 unsigned long buf_offset = 0;
271 unsigned long bytes;
272 unsigned long working_bytes;
273 unsigned long pg_offset;
274
275 size_t in_len;
276 size_t out_len;
277 unsigned long in_offset;
278 unsigned long in_page_bytes_left;
279 unsigned long tot_in;
280 unsigned long tot_out;
281 unsigned long tot_len;
282 char *buf;
283
284 data_in = kmap(pages_in[0]);
285 tot_len = read_compress_length(data_in);
286
287 tot_in = LZO_LEN;
288 in_offset = LZO_LEN;
289 tot_len = min_t(size_t, srclen, tot_len);
290 in_page_bytes_left = PAGE_CACHE_SIZE - LZO_LEN;
291
292 tot_out = 0;
293 pg_offset = 0;
294
295 while (tot_in < tot_len) {
296 in_len = read_compress_length(data_in + in_offset);
297 in_page_bytes_left -= LZO_LEN;
298 in_offset += LZO_LEN;
299 tot_in += LZO_LEN;
300
301 tot_in += in_len;
302 working_bytes = in_len;
303
304 /* fast path: avoid using the working buffer */
305 if (in_page_bytes_left >= in_len) {
306 buf = data_in + in_offset;
307 bytes = in_len;
308 goto cont;
309 }
310
311 /* copy bytes from the pages into the working buffer */
312 buf = workspace->cbuf;
313 buf_offset = 0;
314 while (working_bytes) {
315 bytes = min(working_bytes, in_page_bytes_left);
316
317 memcpy(buf + buf_offset, data_in + in_offset, bytes);
318 buf_offset += bytes;
319cont:
320 working_bytes -= bytes;
321 in_page_bytes_left -= bytes;
322 in_offset += bytes;
323
324 /* check if we need to pick another page */
325 if ((working_bytes == 0 && in_page_bytes_left < LZO_LEN)
326 || in_page_bytes_left == 0) {
327 tot_in += in_page_bytes_left;
328
329 if (working_bytes == 0 && tot_in >= tot_len)
330 break;
331
332 kunmap(pages_in[page_in_index]);
333 page_in_index++;
334 if (page_in_index >= total_pages_in) {
335 ret = -1;
336 data_in = NULL;
337 goto done;
338 }
339 data_in = kmap(pages_in[page_in_index]);
340
341 in_page_bytes_left = PAGE_CACHE_SIZE;
342 in_offset = 0;
343 }
344 }
345
346 out_len = lzo1x_worst_compress(PAGE_CACHE_SIZE);
347 ret = lzo1x_decompress_safe(buf, in_len, workspace->buf,
348 &out_len);
349 if (ret != LZO_E_OK) {
350 printk(KERN_WARNING "btrfs decompress failed\n");
351 ret = -1;
352 break;
353 }
354
355 buf_start = tot_out;
356 tot_out += out_len;
357
358 ret2 = btrfs_decompress_buf2page(workspace->buf, buf_start,
359 tot_out, disk_start,
360 bvec, vcnt,
361 &page_out_index, &pg_offset);
362 if (ret2 == 0)
363 break;
364 }
365done:
366 if (data_in)
367 kunmap(pages_in[page_in_index]);
368 return ret;
369}
370
371static int lzo_decompress(struct list_head *ws, unsigned char *data_in,
372 struct page *dest_page,
373 unsigned long start_byte,
374 size_t srclen, size_t destlen)
375{
376 struct workspace *workspace = list_entry(ws, struct workspace, list);
377 size_t in_len;
378 size_t out_len;
379 size_t tot_len;
380 int ret = 0;
381 char *kaddr;
382 unsigned long bytes;
383
384 BUG_ON(srclen < LZO_LEN);
385
386 tot_len = read_compress_length(data_in);
387 data_in += LZO_LEN;
388
389 in_len = read_compress_length(data_in);
390 data_in += LZO_LEN;
391
392 out_len = PAGE_CACHE_SIZE;
393 ret = lzo1x_decompress_safe(data_in, in_len, workspace->buf, &out_len);
394 if (ret != LZO_E_OK) {
395 printk(KERN_WARNING "btrfs decompress failed!\n");
396 ret = -1;
397 goto out;
398 }
399
400 if (out_len < start_byte) {
401 ret = -1;
402 goto out;
403 }
404
405 bytes = min_t(unsigned long, destlen, out_len - start_byte);
406
407 kaddr = kmap_atomic(dest_page, KM_USER0);
408 memcpy(kaddr, workspace->buf + start_byte, bytes);
409 kunmap_atomic(kaddr, KM_USER0);
410out:
411 return ret;
412}
413
414struct btrfs_compress_op btrfs_lzo_compress = {
415 .alloc_workspace = lzo_alloc_workspace,
416 .free_workspace = lzo_free_workspace,
417 .compress_pages = lzo_compress_pages,
418 .decompress_biovec = lzo_decompress_biovec,
419 .decompress = lzo_decompress,
420};
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index f4621f6deca..2b61e1ddcd9 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -172,7 +172,7 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
172 */ 172 */
173static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, 173static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
174 u64 start, u64 len, u64 disk_len, 174 u64 start, u64 len, u64 disk_len,
175 int type, int dio) 175 int type, int dio, int compress_type)
176{ 176{
177 struct btrfs_ordered_inode_tree *tree; 177 struct btrfs_ordered_inode_tree *tree;
178 struct rb_node *node; 178 struct rb_node *node;
@@ -189,6 +189,7 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
189 entry->disk_len = disk_len; 189 entry->disk_len = disk_len;
190 entry->bytes_left = len; 190 entry->bytes_left = len;
191 entry->inode = inode; 191 entry->inode = inode;
192 entry->compress_type = compress_type;
192 if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE) 193 if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE)
193 set_bit(type, &entry->flags); 194 set_bit(type, &entry->flags);
194 195
@@ -220,14 +221,25 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
220 u64 start, u64 len, u64 disk_len, int type) 221 u64 start, u64 len, u64 disk_len, int type)
221{ 222{
222 return __btrfs_add_ordered_extent(inode, file_offset, start, len, 223 return __btrfs_add_ordered_extent(inode, file_offset, start, len,
223 disk_len, type, 0); 224 disk_len, type, 0,
225 BTRFS_COMPRESS_NONE);
224} 226}
225 227
226int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset, 228int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset,
227 u64 start, u64 len, u64 disk_len, int type) 229 u64 start, u64 len, u64 disk_len, int type)
228{ 230{
229 return __btrfs_add_ordered_extent(inode, file_offset, start, len, 231 return __btrfs_add_ordered_extent(inode, file_offset, start, len,
230 disk_len, type, 1); 232 disk_len, type, 1,
233 BTRFS_COMPRESS_NONE);
234}
235
236int btrfs_add_ordered_extent_compress(struct inode *inode, u64 file_offset,
237 u64 start, u64 len, u64 disk_len,
238 int type, int compress_type)
239{
240 return __btrfs_add_ordered_extent(inode, file_offset, start, len,
241 disk_len, type, 0,
242 compress_type);
231} 243}
232 244
233/* 245/*
@@ -250,6 +262,73 @@ int btrfs_add_ordered_sum(struct inode *inode,
250 262
251/* 263/*
252 * this is used to account for finished IO across a given range 264 * this is used to account for finished IO across a given range
265 * of the file. The IO may span ordered extents. If
266 * a given ordered_extent is completely done, 1 is returned, otherwise
267 * 0.
268 *
269 * test_and_set_bit on a flag in the struct btrfs_ordered_extent is used
270 * to make sure this function only returns 1 once for a given ordered extent.
271 *
272 * file_offset is updated to one byte past the range that is recorded as
273 * complete. This allows you to walk forward in the file.
274 */
275int btrfs_dec_test_first_ordered_pending(struct inode *inode,
276 struct btrfs_ordered_extent **cached,
277 u64 *file_offset, u64 io_size)
278{
279 struct btrfs_ordered_inode_tree *tree;
280 struct rb_node *node;
281 struct btrfs_ordered_extent *entry = NULL;
282 int ret;
283 u64 dec_end;
284 u64 dec_start;
285 u64 to_dec;
286
287 tree = &BTRFS_I(inode)->ordered_tree;
288 spin_lock(&tree->lock);
289 node = tree_search(tree, *file_offset);
290 if (!node) {
291 ret = 1;
292 goto out;
293 }
294
295 entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
296 if (!offset_in_entry(entry, *file_offset)) {
297 ret = 1;
298 goto out;
299 }
300
301 dec_start = max(*file_offset, entry->file_offset);
302 dec_end = min(*file_offset + io_size, entry->file_offset +
303 entry->len);
304 *file_offset = dec_end;
305 if (dec_start > dec_end) {
306 printk(KERN_CRIT "bad ordering dec_start %llu end %llu\n",
307 (unsigned long long)dec_start,
308 (unsigned long long)dec_end);
309 }
310 to_dec = dec_end - dec_start;
311 if (to_dec > entry->bytes_left) {
312 printk(KERN_CRIT "bad ordered accounting left %llu size %llu\n",
313 (unsigned long long)entry->bytes_left,
314 (unsigned long long)to_dec);
315 }
316 entry->bytes_left -= to_dec;
317 if (entry->bytes_left == 0)
318 ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
319 else
320 ret = 1;
321out:
322 if (!ret && cached && entry) {
323 *cached = entry;
324 atomic_inc(&entry->refs);
325 }
326 spin_unlock(&tree->lock);
327 return ret == 0;
328}
329
330/*
331 * this is used to account for finished IO across a given range
253 * of the file. The IO should not span ordered extents. If 332 * of the file. The IO should not span ordered extents. If
254 * a given ordered_extent is completely done, 1 is returned, otherwise 333 * a given ordered_extent is completely done, 1 is returned, otherwise
255 * 0. 334 * 0.
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 8ac365492a3..ff1f69aa188 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -68,7 +68,7 @@ struct btrfs_ordered_sum {
68 68
69#define BTRFS_ORDERED_NOCOW 2 /* set when we want to write in place */ 69#define BTRFS_ORDERED_NOCOW 2 /* set when we want to write in place */
70 70
71#define BTRFS_ORDERED_COMPRESSED 3 /* writing a compressed extent */ 71#define BTRFS_ORDERED_COMPRESSED 3 /* writing a zlib compressed extent */
72 72
73#define BTRFS_ORDERED_PREALLOC 4 /* set when writing to prealloced extent */ 73#define BTRFS_ORDERED_PREALLOC 4 /* set when writing to prealloced extent */
74 74
@@ -93,6 +93,9 @@ struct btrfs_ordered_extent {
93 /* flags (described above) */ 93 /* flags (described above) */
94 unsigned long flags; 94 unsigned long flags;
95 95
96 /* compression algorithm */
97 int compress_type;
98
96 /* reference count */ 99 /* reference count */
97 atomic_t refs; 100 atomic_t refs;
98 101
@@ -141,10 +144,16 @@ int btrfs_remove_ordered_extent(struct inode *inode,
141int btrfs_dec_test_ordered_pending(struct inode *inode, 144int btrfs_dec_test_ordered_pending(struct inode *inode,
142 struct btrfs_ordered_extent **cached, 145 struct btrfs_ordered_extent **cached,
143 u64 file_offset, u64 io_size); 146 u64 file_offset, u64 io_size);
147int btrfs_dec_test_first_ordered_pending(struct inode *inode,
148 struct btrfs_ordered_extent **cached,
149 u64 *file_offset, u64 io_size);
144int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, 150int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
145 u64 start, u64 len, u64 disk_len, int type); 151 u64 start, u64 len, u64 disk_len, int type);
146int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset, 152int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset,
147 u64 start, u64 len, u64 disk_len, int type); 153 u64 start, u64 len, u64 disk_len, int type);
154int btrfs_add_ordered_extent_compress(struct inode *inode, u64 file_offset,
155 u64 start, u64 len, u64 disk_len,
156 int type, int compress_type);
148int btrfs_add_ordered_sum(struct inode *inode, 157int btrfs_add_ordered_sum(struct inode *inode,
149 struct btrfs_ordered_extent *entry, 158 struct btrfs_ordered_extent *entry,
150 struct btrfs_ordered_sum *sum); 159 struct btrfs_ordered_sum *sum);
diff --git a/fs/btrfs/orphan.c b/fs/btrfs/orphan.c
index 79cba5fbc28..f8be250963a 100644
--- a/fs/btrfs/orphan.c
+++ b/fs/btrfs/orphan.c
@@ -56,8 +56,12 @@ int btrfs_del_orphan_item(struct btrfs_trans_handle *trans,
56 return -ENOMEM; 56 return -ENOMEM;
57 57
58 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 58 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
59 if (ret) 59 if (ret < 0)
60 goto out; 60 goto out;
61 if (ret) {
62 ret = -ENOENT;
63 goto out;
64 }
61 65
62 ret = btrfs_del_item(trans, root, path); 66 ret = btrfs_del_item(trans, root, path);
63 67
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 8299a25ffc8..b2130c46fdb 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -54,6 +54,90 @@
54 54
55static const struct super_operations btrfs_super_ops; 55static const struct super_operations btrfs_super_ops;
56 56
57static const char *btrfs_decode_error(struct btrfs_fs_info *fs_info, int errno,
58 char nbuf[16])
59{
60 char *errstr = NULL;
61
62 switch (errno) {
63 case -EIO:
64 errstr = "IO failure";
65 break;
66 case -ENOMEM:
67 errstr = "Out of memory";
68 break;
69 case -EROFS:
70 errstr = "Readonly filesystem";
71 break;
72 default:
73 if (nbuf) {
74 if (snprintf(nbuf, 16, "error %d", -errno) >= 0)
75 errstr = nbuf;
76 }
77 break;
78 }
79
80 return errstr;
81}
82
83static void __save_error_info(struct btrfs_fs_info *fs_info)
84{
85 /*
86 * today we only save the error info into ram. Long term we'll
87 * also send it down to the disk
88 */
89 fs_info->fs_state = BTRFS_SUPER_FLAG_ERROR;
90}
91
92/* NOTE:
93 * We moved the write_super work to umount time in order to avoid a
94 * deadlock, because umount holds all the locks.
95 */
96static void save_error_info(struct btrfs_fs_info *fs_info)
97{
98 __save_error_info(fs_info);
99}
100
101/* btrfs handles errors by forcing the filesystem readonly */
102static void btrfs_handle_error(struct btrfs_fs_info *fs_info)
103{
104 struct super_block *sb = fs_info->sb;
105
106 if (sb->s_flags & MS_RDONLY)
107 return;
108
109 if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
110 sb->s_flags |= MS_RDONLY;
111 printk(KERN_INFO "btrfs is forced readonly\n");
112 }
113}
114
115/*
116 * __btrfs_std_error decodes expected errors from the caller and
117 * invokes the approciate error response.
118 */
119void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
120 unsigned int line, int errno)
121{
122 struct super_block *sb = fs_info->sb;
123 char nbuf[16];
124 const char *errstr;
125
126 /*
127 * Special case: if the error is EROFS, and we're already
128 * under MS_RDONLY, then it is safe here.
129 */
130 if (errno == -EROFS && (sb->s_flags & MS_RDONLY))
131 return;
132
133 errstr = btrfs_decode_error(fs_info, errno, nbuf);
134 printk(KERN_CRIT "BTRFS error (device %s) in %s:%d: %s\n",
135 sb->s_id, function, line, errstr);
136 save_error_info(fs_info);
137
138 btrfs_handle_error(fs_info);
139}
140
57static void btrfs_put_super(struct super_block *sb) 141static void btrfs_put_super(struct super_block *sb)
58{ 142{
59 struct btrfs_root *root = btrfs_sb(sb); 143 struct btrfs_root *root = btrfs_sb(sb);
@@ -69,9 +153,9 @@ enum {
69 Opt_degraded, Opt_subvol, Opt_subvolid, Opt_device, Opt_nodatasum, 153 Opt_degraded, Opt_subvol, Opt_subvolid, Opt_device, Opt_nodatasum,
70 Opt_nodatacow, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, Opt_ssd, 154 Opt_nodatacow, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, Opt_ssd,
71 Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl, Opt_compress, 155 Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl, Opt_compress,
72 Opt_compress_force, Opt_notreelog, Opt_ratio, Opt_flushoncommit, 156 Opt_compress_type, Opt_compress_force, Opt_compress_force_type,
73 Opt_discard, Opt_space_cache, Opt_clear_cache, Opt_err, 157 Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard,
74 Opt_user_subvol_rm_allowed, 158 Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed, Opt_err,
75}; 159};
76 160
77static match_table_t tokens = { 161static match_table_t tokens = {
@@ -86,7 +170,9 @@ static match_table_t tokens = {
86 {Opt_alloc_start, "alloc_start=%s"}, 170 {Opt_alloc_start, "alloc_start=%s"},
87 {Opt_thread_pool, "thread_pool=%d"}, 171 {Opt_thread_pool, "thread_pool=%d"},
88 {Opt_compress, "compress"}, 172 {Opt_compress, "compress"},
173 {Opt_compress_type, "compress=%s"},
89 {Opt_compress_force, "compress-force"}, 174 {Opt_compress_force, "compress-force"},
175 {Opt_compress_force_type, "compress-force=%s"},
90 {Opt_ssd, "ssd"}, 176 {Opt_ssd, "ssd"},
91 {Opt_ssd_spread, "ssd_spread"}, 177 {Opt_ssd_spread, "ssd_spread"},
92 {Opt_nossd, "nossd"}, 178 {Opt_nossd, "nossd"},
@@ -112,6 +198,8 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
112 char *p, *num, *orig; 198 char *p, *num, *orig;
113 int intarg; 199 int intarg;
114 int ret = 0; 200 int ret = 0;
201 char *compress_type;
202 bool compress_force = false;
115 203
116 if (!options) 204 if (!options)
117 return 0; 205 return 0;
@@ -154,14 +242,32 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
154 btrfs_set_opt(info->mount_opt, NODATACOW); 242 btrfs_set_opt(info->mount_opt, NODATACOW);
155 btrfs_set_opt(info->mount_opt, NODATASUM); 243 btrfs_set_opt(info->mount_opt, NODATASUM);
156 break; 244 break;
157 case Opt_compress:
158 printk(KERN_INFO "btrfs: use compression\n");
159 btrfs_set_opt(info->mount_opt, COMPRESS);
160 break;
161 case Opt_compress_force: 245 case Opt_compress_force:
162 printk(KERN_INFO "btrfs: forcing compression\n"); 246 case Opt_compress_force_type:
163 btrfs_set_opt(info->mount_opt, FORCE_COMPRESS); 247 compress_force = true;
248 case Opt_compress:
249 case Opt_compress_type:
250 if (token == Opt_compress ||
251 token == Opt_compress_force ||
252 strcmp(args[0].from, "zlib") == 0) {
253 compress_type = "zlib";
254 info->compress_type = BTRFS_COMPRESS_ZLIB;
255 } else if (strcmp(args[0].from, "lzo") == 0) {
256 compress_type = "lzo";
257 info->compress_type = BTRFS_COMPRESS_LZO;
258 } else {
259 ret = -EINVAL;
260 goto out;
261 }
262
164 btrfs_set_opt(info->mount_opt, COMPRESS); 263 btrfs_set_opt(info->mount_opt, COMPRESS);
264 if (compress_force) {
265 btrfs_set_opt(info->mount_opt, FORCE_COMPRESS);
266 pr_info("btrfs: force %s compression\n",
267 compress_type);
268 } else
269 pr_info("btrfs: use %s compression\n",
270 compress_type);
165 break; 271 break;
166 case Opt_ssd: 272 case Opt_ssd:
167 printk(KERN_INFO "btrfs: use ssd allocation scheme\n"); 273 printk(KERN_INFO "btrfs: use ssd allocation scheme\n");
@@ -244,6 +350,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
244 case Opt_space_cache: 350 case Opt_space_cache:
245 printk(KERN_INFO "btrfs: enabling disk space caching\n"); 351 printk(KERN_INFO "btrfs: enabling disk space caching\n");
246 btrfs_set_opt(info->mount_opt, SPACE_CACHE); 352 btrfs_set_opt(info->mount_opt, SPACE_CACHE);
353 break;
247 case Opt_clear_cache: 354 case Opt_clear_cache:
248 printk(KERN_INFO "btrfs: force clearing of disk cache\n"); 355 printk(KERN_INFO "btrfs: force clearing of disk cache\n");
249 btrfs_set_opt(info->mount_opt, CLEAR_CACHE); 356 btrfs_set_opt(info->mount_opt, CLEAR_CACHE);
@@ -459,6 +566,7 @@ static int btrfs_fill_super(struct super_block *sb,
459 sb->s_maxbytes = MAX_LFS_FILESIZE; 566 sb->s_maxbytes = MAX_LFS_FILESIZE;
460 sb->s_magic = BTRFS_SUPER_MAGIC; 567 sb->s_magic = BTRFS_SUPER_MAGIC;
461 sb->s_op = &btrfs_super_ops; 568 sb->s_op = &btrfs_super_ops;
569 sb->s_d_op = &btrfs_dentry_operations;
462 sb->s_export_op = &btrfs_export_ops; 570 sb->s_export_op = &btrfs_export_ops;
463 sb->s_xattr = btrfs_xattr_handlers; 571 sb->s_xattr = btrfs_xattr_handlers;
464 sb->s_time_gran = 1; 572 sb->s_time_gran = 1;
@@ -562,12 +670,26 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
562 670
563static int btrfs_test_super(struct super_block *s, void *data) 671static int btrfs_test_super(struct super_block *s, void *data)
564{ 672{
565 struct btrfs_fs_devices *test_fs_devices = data; 673 struct btrfs_root *test_root = data;
566 struct btrfs_root *root = btrfs_sb(s); 674 struct btrfs_root *root = btrfs_sb(s);
567 675
568 return root->fs_info->fs_devices == test_fs_devices; 676 /*
677 * If this super block is going away, return false as it
678 * can't match as an existing super block.
679 */
680 if (!atomic_read(&s->s_active))
681 return 0;
682 return root->fs_info->fs_devices == test_root->fs_info->fs_devices;
569} 683}
570 684
685static int btrfs_set_super(struct super_block *s, void *data)
686{
687 s->s_fs_info = data;
688
689 return set_anon_super(s, data);
690}
691
692
571/* 693/*
572 * Find a superblock for the given device / mount point. 694 * Find a superblock for the given device / mount point.
573 * 695 *
@@ -581,6 +703,8 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
581 struct super_block *s; 703 struct super_block *s;
582 struct dentry *root; 704 struct dentry *root;
583 struct btrfs_fs_devices *fs_devices = NULL; 705 struct btrfs_fs_devices *fs_devices = NULL;
706 struct btrfs_root *tree_root = NULL;
707 struct btrfs_fs_info *fs_info = NULL;
584 fmode_t mode = FMODE_READ; 708 fmode_t mode = FMODE_READ;
585 char *subvol_name = NULL; 709 char *subvol_name = NULL;
586 u64 subvol_objectid = 0; 710 u64 subvol_objectid = 0;
@@ -608,8 +732,24 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
608 goto error_close_devices; 732 goto error_close_devices;
609 } 733 }
610 734
735 /*
736 * Setup a dummy root and fs_info for test/set super. This is because
737 * we don't actually fill this stuff out until open_ctree, but we need
738 * it for searching for existing supers, so this lets us do that and
739 * then open_ctree will properly initialize everything later.
740 */
741 fs_info = kzalloc(sizeof(struct btrfs_fs_info), GFP_NOFS);
742 tree_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
743 if (!fs_info || !tree_root) {
744 error = -ENOMEM;
745 goto error_close_devices;
746 }
747 fs_info->tree_root = tree_root;
748 fs_info->fs_devices = fs_devices;
749 tree_root->fs_info = fs_info;
750
611 bdev = fs_devices->latest_bdev; 751 bdev = fs_devices->latest_bdev;
612 s = sget(fs_type, btrfs_test_super, set_anon_super, fs_devices); 752 s = sget(fs_type, btrfs_test_super, btrfs_set_super, tree_root);
613 if (IS_ERR(s)) 753 if (IS_ERR(s))
614 goto error_s; 754 goto error_s;
615 755
@@ -652,9 +792,9 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
652 mutex_unlock(&root->d_inode->i_mutex); 792 mutex_unlock(&root->d_inode->i_mutex);
653 793
654 if (IS_ERR(new_root)) { 794 if (IS_ERR(new_root)) {
795 dput(root);
655 deactivate_locked_super(s); 796 deactivate_locked_super(s);
656 error = PTR_ERR(new_root); 797 error = PTR_ERR(new_root);
657 dput(root);
658 goto error_free_subvol_name; 798 goto error_free_subvol_name;
659 } 799 }
660 if (!new_root->d_inode) { 800 if (!new_root->d_inode) {
@@ -675,6 +815,8 @@ error_s:
675 error = PTR_ERR(s); 815 error = PTR_ERR(s);
676error_close_devices: 816error_close_devices:
677 btrfs_close_devices(fs_devices); 817 btrfs_close_devices(fs_devices);
818 kfree(fs_info);
819 kfree(tree_root);
678error_free_subvol_name: 820error_free_subvol_name:
679 kfree(subvol_name); 821 kfree(subvol_name);
680 return ERR_PTR(error); 822 return ERR_PTR(error);
@@ -717,6 +859,127 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
717 return 0; 859 return 0;
718} 860}
719 861
862/*
863 * The helper to calc the free space on the devices that can be used to store
864 * file data.
865 */
866static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
867{
868 struct btrfs_fs_info *fs_info = root->fs_info;
869 struct btrfs_device_info *devices_info;
870 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
871 struct btrfs_device *device;
872 u64 skip_space;
873 u64 type;
874 u64 avail_space;
875 u64 used_space;
876 u64 min_stripe_size;
877 int min_stripes = 1;
878 int i = 0, nr_devices;
879 int ret;
880
881 nr_devices = fs_info->fs_devices->rw_devices;
882 BUG_ON(!nr_devices);
883
884 devices_info = kmalloc(sizeof(*devices_info) * nr_devices,
885 GFP_NOFS);
886 if (!devices_info)
887 return -ENOMEM;
888
889 /* calc min stripe number for data space allocation */
890 type = btrfs_get_alloc_profile(root, 1);
891 if (type & BTRFS_BLOCK_GROUP_RAID0)
892 min_stripes = 2;
893 else if (type & BTRFS_BLOCK_GROUP_RAID1)
894 min_stripes = 2;
895 else if (type & BTRFS_BLOCK_GROUP_RAID10)
896 min_stripes = 4;
897
898 if (type & BTRFS_BLOCK_GROUP_DUP)
899 min_stripe_size = 2 * BTRFS_STRIPE_LEN;
900 else
901 min_stripe_size = BTRFS_STRIPE_LEN;
902
903 list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
904 if (!device->in_fs_metadata)
905 continue;
906
907 avail_space = device->total_bytes - device->bytes_used;
908
909 /* align with stripe_len */
910 do_div(avail_space, BTRFS_STRIPE_LEN);
911 avail_space *= BTRFS_STRIPE_LEN;
912
913 /*
914 * In order to avoid overwriting the superblock on the drive,
915 * btrfs starts at an offset of at least 1MB when doing chunk
916 * allocation.
917 */
918 skip_space = 1024 * 1024;
919
920 /* user can set the offset in fs_info->alloc_start. */
921 if (fs_info->alloc_start + BTRFS_STRIPE_LEN <=
922 device->total_bytes)
923 skip_space = max(fs_info->alloc_start, skip_space);
924
925 /*
926 * btrfs can not use the free space in [0, skip_space - 1],
927 * we must subtract it from the total. In order to implement
928 * it, we account the used space in this range first.
929 */
930 ret = btrfs_account_dev_extents_size(device, 0, skip_space - 1,
931 &used_space);
932 if (ret) {
933 kfree(devices_info);
934 return ret;
935 }
936
937 /* calc the free space in [0, skip_space - 1] */
938 skip_space -= used_space;
939
940 /*
941 * we can not use the free space in [0, skip_space - 1], subtract
942 * it from the total.
943 */
944 if (avail_space && avail_space >= skip_space)
945 avail_space -= skip_space;
946 else
947 avail_space = 0;
948
949 if (avail_space < min_stripe_size)
950 continue;
951
952 devices_info[i].dev = device;
953 devices_info[i].max_avail = avail_space;
954
955 i++;
956 }
957
958 nr_devices = i;
959
960 btrfs_descending_sort_devices(devices_info, nr_devices);
961
962 i = nr_devices - 1;
963 avail_space = 0;
964 while (nr_devices >= min_stripes) {
965 if (devices_info[i].max_avail >= min_stripe_size) {
966 int j;
967 u64 alloc_size;
968
969 avail_space += devices_info[i].max_avail * min_stripes;
970 alloc_size = devices_info[i].max_avail;
971 for (j = i + 1 - min_stripes; j <= i; j++)
972 devices_info[j].max_avail -= alloc_size;
973 }
974 i--;
975 nr_devices--;
976 }
977
978 kfree(devices_info);
979 *free_bytes = avail_space;
980 return 0;
981}
982
720static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) 983static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
721{ 984{
722 struct btrfs_root *root = btrfs_sb(dentry->d_sb); 985 struct btrfs_root *root = btrfs_sb(dentry->d_sb);
@@ -724,17 +987,21 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
724 struct list_head *head = &root->fs_info->space_info; 987 struct list_head *head = &root->fs_info->space_info;
725 struct btrfs_space_info *found; 988 struct btrfs_space_info *found;
726 u64 total_used = 0; 989 u64 total_used = 0;
727 u64 total_used_data = 0; 990 u64 total_free_data = 0;
728 int bits = dentry->d_sb->s_blocksize_bits; 991 int bits = dentry->d_sb->s_blocksize_bits;
729 __be32 *fsid = (__be32 *)root->fs_info->fsid; 992 __be32 *fsid = (__be32 *)root->fs_info->fsid;
993 int ret;
730 994
995 /* holding chunk_mutex to avoid allocating new chunks */
996 mutex_lock(&root->fs_info->chunk_mutex);
731 rcu_read_lock(); 997 rcu_read_lock();
732 list_for_each_entry_rcu(found, head, list) { 998 list_for_each_entry_rcu(found, head, list) {
733 if (found->flags & (BTRFS_BLOCK_GROUP_METADATA | 999 if (found->flags & BTRFS_BLOCK_GROUP_DATA) {
734 BTRFS_BLOCK_GROUP_SYSTEM)) 1000 total_free_data += found->disk_total - found->disk_used;
735 total_used_data += found->disk_total; 1001 total_free_data -=
736 else 1002 btrfs_account_ro_block_groups_free_space(found);
737 total_used_data += found->disk_used; 1003 }
1004
738 total_used += found->disk_used; 1005 total_used += found->disk_used;
739 } 1006 }
740 rcu_read_unlock(); 1007 rcu_read_unlock();
@@ -742,9 +1009,17 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
742 buf->f_namelen = BTRFS_NAME_LEN; 1009 buf->f_namelen = BTRFS_NAME_LEN;
743 buf->f_blocks = btrfs_super_total_bytes(disk_super) >> bits; 1010 buf->f_blocks = btrfs_super_total_bytes(disk_super) >> bits;
744 buf->f_bfree = buf->f_blocks - (total_used >> bits); 1011 buf->f_bfree = buf->f_blocks - (total_used >> bits);
745 buf->f_bavail = buf->f_blocks - (total_used_data >> bits);
746 buf->f_bsize = dentry->d_sb->s_blocksize; 1012 buf->f_bsize = dentry->d_sb->s_blocksize;
747 buf->f_type = BTRFS_SUPER_MAGIC; 1013 buf->f_type = BTRFS_SUPER_MAGIC;
1014 buf->f_bavail = total_free_data;
1015 ret = btrfs_calc_avail_data_space(root, &total_free_data);
1016 if (ret) {
1017 mutex_unlock(&root->fs_info->chunk_mutex);
1018 return ret;
1019 }
1020 buf->f_bavail += total_free_data;
1021 buf->f_bavail = buf->f_bavail >> bits;
1022 mutex_unlock(&root->fs_info->chunk_mutex);
748 1023
749 /* We treat it as constant endianness (it doesn't matter _which_) 1024 /* We treat it as constant endianness (it doesn't matter _which_)
750 because we want the fsid to come out the same whether mounted 1025 because we want the fsid to come out the same whether mounted
@@ -861,10 +1136,14 @@ static int __init init_btrfs_fs(void)
861 if (err) 1136 if (err)
862 return err; 1137 return err;
863 1138
864 err = btrfs_init_cachep(); 1139 err = btrfs_init_compress();
865 if (err) 1140 if (err)
866 goto free_sysfs; 1141 goto free_sysfs;
867 1142
1143 err = btrfs_init_cachep();
1144 if (err)
1145 goto free_compress;
1146
868 err = extent_io_init(); 1147 err = extent_io_init();
869 if (err) 1148 if (err)
870 goto free_cachep; 1149 goto free_cachep;
@@ -892,6 +1171,8 @@ free_extent_io:
892 extent_io_exit(); 1171 extent_io_exit();
893free_cachep: 1172free_cachep:
894 btrfs_destroy_cachep(); 1173 btrfs_destroy_cachep();
1174free_compress:
1175 btrfs_exit_compress();
895free_sysfs: 1176free_sysfs:
896 btrfs_exit_sysfs(); 1177 btrfs_exit_sysfs();
897 return err; 1178 return err;
@@ -906,7 +1187,7 @@ static void __exit exit_btrfs_fs(void)
906 unregister_filesystem(&btrfs_fs_type); 1187 unregister_filesystem(&btrfs_fs_type);
907 btrfs_exit_sysfs(); 1188 btrfs_exit_sysfs();
908 btrfs_cleanup_fs_uuids(); 1189 btrfs_cleanup_fs_uuids();
909 btrfs_zlib_exit(); 1190 btrfs_exit_compress();
910} 1191}
911 1192
912module_init(init_btrfs_fs) 1193module_init(init_btrfs_fs)
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 1fffbc017bd..bae5c7b8bbe 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -181,6 +181,9 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
181 struct btrfs_trans_handle *h; 181 struct btrfs_trans_handle *h;
182 struct btrfs_transaction *cur_trans; 182 struct btrfs_transaction *cur_trans;
183 int ret; 183 int ret;
184
185 if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)
186 return ERR_PTR(-EROFS);
184again: 187again:
185 h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS); 188 h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS);
186 if (!h) 189 if (!h)
@@ -902,6 +905,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
902 struct btrfs_root *root = pending->root; 905 struct btrfs_root *root = pending->root;
903 struct btrfs_root *parent_root; 906 struct btrfs_root *parent_root;
904 struct inode *parent_inode; 907 struct inode *parent_inode;
908 struct dentry *parent;
905 struct dentry *dentry; 909 struct dentry *dentry;
906 struct extent_buffer *tmp; 910 struct extent_buffer *tmp;
907 struct extent_buffer *old; 911 struct extent_buffer *old;
@@ -909,6 +913,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
909 u64 to_reserve = 0; 913 u64 to_reserve = 0;
910 u64 index = 0; 914 u64 index = 0;
911 u64 objectid; 915 u64 objectid;
916 u64 root_flags;
912 917
913 new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS); 918 new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS);
914 if (!new_root_item) { 919 if (!new_root_item) {
@@ -941,7 +946,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
941 trans->block_rsv = &pending->block_rsv; 946 trans->block_rsv = &pending->block_rsv;
942 947
943 dentry = pending->dentry; 948 dentry = pending->dentry;
944 parent_inode = dentry->d_parent->d_inode; 949 parent = dget_parent(dentry);
950 parent_inode = parent->d_inode;
945 parent_root = BTRFS_I(parent_inode)->root; 951 parent_root = BTRFS_I(parent_inode)->root;
946 record_root_in_trans(trans, parent_root); 952 record_root_in_trans(trans, parent_root);
947 953
@@ -965,6 +971,13 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
965 btrfs_set_root_last_snapshot(&root->root_item, trans->transid); 971 btrfs_set_root_last_snapshot(&root->root_item, trans->transid);
966 memcpy(new_root_item, &root->root_item, sizeof(*new_root_item)); 972 memcpy(new_root_item, &root->root_item, sizeof(*new_root_item));
967 973
974 root_flags = btrfs_root_flags(new_root_item);
975 if (pending->readonly)
976 root_flags |= BTRFS_ROOT_SUBVOL_RDONLY;
977 else
978 root_flags &= ~BTRFS_ROOT_SUBVOL_RDONLY;
979 btrfs_set_root_flags(new_root_item, root_flags);
980
968 old = btrfs_lock_root_node(root); 981 old = btrfs_lock_root_node(root);
969 btrfs_cow_block(trans, root, old, NULL, 0, &old); 982 btrfs_cow_block(trans, root, old, NULL, 0, &old);
970 btrfs_set_lock_blocking(old); 983 btrfs_set_lock_blocking(old);
@@ -989,6 +1002,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
989 parent_inode->i_ino, index, 1002 parent_inode->i_ino, index,
990 dentry->d_name.name, dentry->d_name.len); 1003 dentry->d_name.name, dentry->d_name.len);
991 BUG_ON(ret); 1004 BUG_ON(ret);
1005 dput(parent);
992 1006
993 key.offset = (u64)-1; 1007 key.offset = (u64)-1;
994 pending->snap = btrfs_read_fs_root_no_name(root->fs_info, &key); 1008 pending->snap = btrfs_read_fs_root_no_name(root->fs_info, &key);
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index f104b57ad4e..229a594cacd 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -62,6 +62,7 @@ struct btrfs_pending_snapshot {
62 struct btrfs_block_rsv block_rsv; 62 struct btrfs_block_rsv block_rsv;
63 /* extra metadata reservation for relocation */ 63 /* extra metadata reservation for relocation */
64 int error; 64 int error;
65 bool readonly;
65 struct list_head list; 66 struct list_head list;
66}; 67};
67 68
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index a29f19384a2..054744ac571 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -2869,6 +2869,7 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
2869{ 2869{
2870 int ret = 0; 2870 int ret = 0;
2871 struct btrfs_root *root; 2871 struct btrfs_root *root;
2872 struct dentry *old_parent = NULL;
2872 2873
2873 /* 2874 /*
2874 * for regular files, if its inode is already on disk, we don't 2875 * for regular files, if its inode is already on disk, we don't
@@ -2910,10 +2911,13 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
2910 if (IS_ROOT(parent)) 2911 if (IS_ROOT(parent))
2911 break; 2912 break;
2912 2913
2913 parent = parent->d_parent; 2914 parent = dget_parent(parent);
2915 dput(old_parent);
2916 old_parent = parent;
2914 inode = parent->d_inode; 2917 inode = parent->d_inode;
2915 2918
2916 } 2919 }
2920 dput(old_parent);
2917out: 2921out:
2918 return ret; 2922 return ret;
2919} 2923}
@@ -2945,6 +2949,7 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
2945{ 2949{
2946 int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL; 2950 int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL;
2947 struct super_block *sb; 2951 struct super_block *sb;
2952 struct dentry *old_parent = NULL;
2948 int ret = 0; 2953 int ret = 0;
2949 u64 last_committed = root->fs_info->last_trans_committed; 2954 u64 last_committed = root->fs_info->last_trans_committed;
2950 2955
@@ -3016,10 +3021,13 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
3016 if (IS_ROOT(parent)) 3021 if (IS_ROOT(parent))
3017 break; 3022 break;
3018 3023
3019 parent = parent->d_parent; 3024 parent = dget_parent(parent);
3025 dput(old_parent);
3026 old_parent = parent;
3020 } 3027 }
3021 ret = 0; 3028 ret = 0;
3022end_trans: 3029end_trans:
3030 dput(old_parent);
3023 if (ret < 0) { 3031 if (ret < 0) {
3024 BUG_ON(ret != -ENOSPC); 3032 BUG_ON(ret != -ENOSPC);
3025 root->fs_info->last_trans_log_full_commit = trans->transid; 3033 root->fs_info->last_trans_log_full_commit = trans->transid;
@@ -3039,8 +3047,13 @@ end_no_trans:
3039int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, 3047int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
3040 struct btrfs_root *root, struct dentry *dentry) 3048 struct btrfs_root *root, struct dentry *dentry)
3041{ 3049{
3042 return btrfs_log_inode_parent(trans, root, dentry->d_inode, 3050 struct dentry *parent = dget_parent(dentry);
3043 dentry->d_parent, 0); 3051 int ret;
3052
3053 ret = btrfs_log_inode_parent(trans, root, dentry->d_inode, parent, 0);
3054 dput(parent);
3055
3056 return ret;
3044} 3057}
3045 3058
3046/* 3059/*
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index cc04dc1445d..d158530233b 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -22,6 +22,7 @@
22#include <linux/blkdev.h> 22#include <linux/blkdev.h>
23#include <linux/random.h> 23#include <linux/random.h>
24#include <linux/iocontext.h> 24#include <linux/iocontext.h>
25#include <linux/capability.h>
25#include <asm/div64.h> 26#include <asm/div64.h>
26#include "compat.h" 27#include "compat.h"
27#include "ctree.h" 28#include "ctree.h"
@@ -412,12 +413,16 @@ static noinline int device_list_add(const char *path,
412 413
413 device->fs_devices = fs_devices; 414 device->fs_devices = fs_devices;
414 fs_devices->num_devices++; 415 fs_devices->num_devices++;
415 } else if (strcmp(device->name, path)) { 416 } else if (!device->name || strcmp(device->name, path)) {
416 name = kstrdup(path, GFP_NOFS); 417 name = kstrdup(path, GFP_NOFS);
417 if (!name) 418 if (!name)
418 return -ENOMEM; 419 return -ENOMEM;
419 kfree(device->name); 420 kfree(device->name);
420 device->name = name; 421 device->name = name;
422 if (device->missing) {
423 fs_devices->missing_devices--;
424 device->missing = 0;
425 }
421 } 426 }
422 427
423 if (found_transid > fs_devices->latest_trans) { 428 if (found_transid > fs_devices->latest_trans) {
@@ -489,7 +494,7 @@ again:
489 continue; 494 continue;
490 495
491 if (device->bdev) { 496 if (device->bdev) {
492 close_bdev_exclusive(device->bdev, device->mode); 497 blkdev_put(device->bdev, device->mode);
493 device->bdev = NULL; 498 device->bdev = NULL;
494 fs_devices->open_devices--; 499 fs_devices->open_devices--;
495 } 500 }
@@ -523,7 +528,7 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
523 528
524 list_for_each_entry(device, &fs_devices->devices, dev_list) { 529 list_for_each_entry(device, &fs_devices->devices, dev_list) {
525 if (device->bdev) { 530 if (device->bdev) {
526 close_bdev_exclusive(device->bdev, device->mode); 531 blkdev_put(device->bdev, device->mode);
527 fs_devices->open_devices--; 532 fs_devices->open_devices--;
528 } 533 }
529 if (device->writeable) { 534 if (device->writeable) {
@@ -580,13 +585,15 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
580 int seeding = 1; 585 int seeding = 1;
581 int ret = 0; 586 int ret = 0;
582 587
588 flags |= FMODE_EXCL;
589
583 list_for_each_entry(device, head, dev_list) { 590 list_for_each_entry(device, head, dev_list) {
584 if (device->bdev) 591 if (device->bdev)
585 continue; 592 continue;
586 if (!device->name) 593 if (!device->name)
587 continue; 594 continue;
588 595
589 bdev = open_bdev_exclusive(device->name, flags, holder); 596 bdev = blkdev_get_by_path(device->name, flags, holder);
590 if (IS_ERR(bdev)) { 597 if (IS_ERR(bdev)) {
591 printk(KERN_INFO "open %s failed\n", device->name); 598 printk(KERN_INFO "open %s failed\n", device->name);
592 goto error; 599 goto error;
@@ -594,8 +601,10 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
594 set_blocksize(bdev, 4096); 601 set_blocksize(bdev, 4096);
595 602
596 bh = btrfs_read_dev_super(bdev); 603 bh = btrfs_read_dev_super(bdev);
597 if (!bh) 604 if (!bh) {
605 ret = -EINVAL;
598 goto error_close; 606 goto error_close;
607 }
599 608
600 disk_super = (struct btrfs_super_block *)bh->b_data; 609 disk_super = (struct btrfs_super_block *)bh->b_data;
601 devid = btrfs_stack_device_id(&disk_super->dev_item); 610 devid = btrfs_stack_device_id(&disk_super->dev_item);
@@ -638,7 +647,7 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
638error_brelse: 647error_brelse:
639 brelse(bh); 648 brelse(bh);
640error_close: 649error_close:
641 close_bdev_exclusive(bdev, FMODE_READ); 650 blkdev_put(bdev, flags);
642error: 651error:
643 continue; 652 continue;
644 } 653 }
@@ -684,7 +693,8 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
684 693
685 mutex_lock(&uuid_mutex); 694 mutex_lock(&uuid_mutex);
686 695
687 bdev = open_bdev_exclusive(path, flags, holder); 696 flags |= FMODE_EXCL;
697 bdev = blkdev_get_by_path(path, flags, holder);
688 698
689 if (IS_ERR(bdev)) { 699 if (IS_ERR(bdev)) {
690 ret = PTR_ERR(bdev); 700 ret = PTR_ERR(bdev);
@@ -696,7 +706,7 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
696 goto error_close; 706 goto error_close;
697 bh = btrfs_read_dev_super(bdev); 707 bh = btrfs_read_dev_super(bdev);
698 if (!bh) { 708 if (!bh) {
699 ret = -EIO; 709 ret = -EINVAL;
700 goto error_close; 710 goto error_close;
701 } 711 }
702 disk_super = (struct btrfs_super_block *)bh->b_data; 712 disk_super = (struct btrfs_super_block *)bh->b_data;
@@ -716,65 +726,173 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
716 726
717 brelse(bh); 727 brelse(bh);
718error_close: 728error_close:
719 close_bdev_exclusive(bdev, flags); 729 blkdev_put(bdev, flags);
720error: 730error:
721 mutex_unlock(&uuid_mutex); 731 mutex_unlock(&uuid_mutex);
722 return ret; 732 return ret;
723} 733}
724 734
735/* helper to account the used device space in the range */
736int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
737 u64 end, u64 *length)
738{
739 struct btrfs_key key;
740 struct btrfs_root *root = device->dev_root;
741 struct btrfs_dev_extent *dev_extent;
742 struct btrfs_path *path;
743 u64 extent_end;
744 int ret;
745 int slot;
746 struct extent_buffer *l;
747
748 *length = 0;
749
750 if (start >= device->total_bytes)
751 return 0;
752
753 path = btrfs_alloc_path();
754 if (!path)
755 return -ENOMEM;
756 path->reada = 2;
757
758 key.objectid = device->devid;
759 key.offset = start;
760 key.type = BTRFS_DEV_EXTENT_KEY;
761
762 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
763 if (ret < 0)
764 goto out;
765 if (ret > 0) {
766 ret = btrfs_previous_item(root, path, key.objectid, key.type);
767 if (ret < 0)
768 goto out;
769 }
770
771 while (1) {
772 l = path->nodes[0];
773 slot = path->slots[0];
774 if (slot >= btrfs_header_nritems(l)) {
775 ret = btrfs_next_leaf(root, path);
776 if (ret == 0)
777 continue;
778 if (ret < 0)
779 goto out;
780
781 break;
782 }
783 btrfs_item_key_to_cpu(l, &key, slot);
784
785 if (key.objectid < device->devid)
786 goto next;
787
788 if (key.objectid > device->devid)
789 break;
790
791 if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
792 goto next;
793
794 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
795 extent_end = key.offset + btrfs_dev_extent_length(l,
796 dev_extent);
797 if (key.offset <= start && extent_end > end) {
798 *length = end - start + 1;
799 break;
800 } else if (key.offset <= start && extent_end > start)
801 *length += extent_end - start;
802 else if (key.offset > start && extent_end <= end)
803 *length += extent_end - key.offset;
804 else if (key.offset > start && key.offset <= end) {
805 *length += end - key.offset + 1;
806 break;
807 } else if (key.offset > end)
808 break;
809
810next:
811 path->slots[0]++;
812 }
813 ret = 0;
814out:
815 btrfs_free_path(path);
816 return ret;
817}
818
725/* 819/*
820 * find_free_dev_extent - find free space in the specified device
821 * @trans: transaction handler
822 * @device: the device which we search the free space in
823 * @num_bytes: the size of the free space that we need
824 * @start: store the start of the free space.
825 * @len: the size of the free space that we find, or the size of the max
826 * free space if we don't find suitable free space
827 *
726 * this uses a pretty simple search, the expectation is that it is 828 * this uses a pretty simple search, the expectation is that it is
727 * called very infrequently and that a given device has a small number 829 * called very infrequently and that a given device has a small number
728 * of extents 830 * of extents
831 *
832 * @start is used to store the start of the free space if we find it. But if we
833 * don't find suitable free space, it will be used to store the start position
834 * of the max free space.
835 *
836 * @len is used to store the size of the free space that we find.
837 * But if we don't find suitable free space, it is used to store the size of
838 * the max free space.
729 */ 839 */
730int find_free_dev_extent(struct btrfs_trans_handle *trans, 840int find_free_dev_extent(struct btrfs_trans_handle *trans,
731 struct btrfs_device *device, u64 num_bytes, 841 struct btrfs_device *device, u64 num_bytes,
732 u64 *start, u64 *max_avail) 842 u64 *start, u64 *len)
733{ 843{
734 struct btrfs_key key; 844 struct btrfs_key key;
735 struct btrfs_root *root = device->dev_root; 845 struct btrfs_root *root = device->dev_root;
736 struct btrfs_dev_extent *dev_extent = NULL; 846 struct btrfs_dev_extent *dev_extent;
737 struct btrfs_path *path; 847 struct btrfs_path *path;
738 u64 hole_size = 0; 848 u64 hole_size;
739 u64 last_byte = 0; 849 u64 max_hole_start;
740 u64 search_start = 0; 850 u64 max_hole_size;
851 u64 extent_end;
852 u64 search_start;
741 u64 search_end = device->total_bytes; 853 u64 search_end = device->total_bytes;
742 int ret; 854 int ret;
743 int slot = 0; 855 int slot;
744 int start_found;
745 struct extent_buffer *l; 856 struct extent_buffer *l;
746 857
747 path = btrfs_alloc_path();
748 if (!path)
749 return -ENOMEM;
750 path->reada = 2;
751 start_found = 0;
752
753 /* FIXME use last free of some kind */ 858 /* FIXME use last free of some kind */
754 859
755 /* we don't want to overwrite the superblock on the drive, 860 /* we don't want to overwrite the superblock on the drive,
756 * so we make sure to start at an offset of at least 1MB 861 * so we make sure to start at an offset of at least 1MB
757 */ 862 */
758 search_start = max((u64)1024 * 1024, search_start); 863 search_start = 1024 * 1024;
759 864
760 if (root->fs_info->alloc_start + num_bytes <= device->total_bytes) 865 if (root->fs_info->alloc_start + num_bytes <= search_end)
761 search_start = max(root->fs_info->alloc_start, search_start); 866 search_start = max(root->fs_info->alloc_start, search_start);
762 867
868 max_hole_start = search_start;
869 max_hole_size = 0;
870
871 if (search_start >= search_end) {
872 ret = -ENOSPC;
873 goto error;
874 }
875
876 path = btrfs_alloc_path();
877 if (!path) {
878 ret = -ENOMEM;
879 goto error;
880 }
881 path->reada = 2;
882
763 key.objectid = device->devid; 883 key.objectid = device->devid;
764 key.offset = search_start; 884 key.offset = search_start;
765 key.type = BTRFS_DEV_EXTENT_KEY; 885 key.type = BTRFS_DEV_EXTENT_KEY;
886
766 ret = btrfs_search_slot(trans, root, &key, path, 0, 0); 887 ret = btrfs_search_slot(trans, root, &key, path, 0, 0);
767 if (ret < 0) 888 if (ret < 0)
768 goto error; 889 goto out;
769 if (ret > 0) { 890 if (ret > 0) {
770 ret = btrfs_previous_item(root, path, key.objectid, key.type); 891 ret = btrfs_previous_item(root, path, key.objectid, key.type);
771 if (ret < 0) 892 if (ret < 0)
772 goto error; 893 goto out;
773 if (ret > 0)
774 start_found = 1;
775 } 894 }
776 l = path->nodes[0]; 895
777 btrfs_item_key_to_cpu(l, &key, path->slots[0]);
778 while (1) { 896 while (1) {
779 l = path->nodes[0]; 897 l = path->nodes[0];
780 slot = path->slots[0]; 898 slot = path->slots[0];
@@ -783,24 +901,9 @@ int find_free_dev_extent(struct btrfs_trans_handle *trans,
783 if (ret == 0) 901 if (ret == 0)
784 continue; 902 continue;
785 if (ret < 0) 903 if (ret < 0)
786 goto error; 904 goto out;
787no_more_items: 905
788 if (!start_found) { 906 break;
789 if (search_start >= search_end) {
790 ret = -ENOSPC;
791 goto error;
792 }
793 *start = search_start;
794 start_found = 1;
795 goto check_pending;
796 }
797 *start = last_byte > search_start ?
798 last_byte : search_start;
799 if (search_end <= *start) {
800 ret = -ENOSPC;
801 goto error;
802 }
803 goto check_pending;
804 } 907 }
805 btrfs_item_key_to_cpu(l, &key, slot); 908 btrfs_item_key_to_cpu(l, &key, slot);
806 909
@@ -808,48 +911,62 @@ no_more_items:
808 goto next; 911 goto next;
809 912
810 if (key.objectid > device->devid) 913 if (key.objectid > device->devid)
811 goto no_more_items; 914 break;
812 915
813 if (key.offset >= search_start && key.offset > last_byte && 916 if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
814 start_found) { 917 goto next;
815 if (last_byte < search_start)
816 last_byte = search_start;
817 hole_size = key.offset - last_byte;
818 918
819 if (hole_size > *max_avail) 919 if (key.offset > search_start) {
820 *max_avail = hole_size; 920 hole_size = key.offset - search_start;
921
922 if (hole_size > max_hole_size) {
923 max_hole_start = search_start;
924 max_hole_size = hole_size;
925 }
821 926
822 if (key.offset > last_byte && 927 /*
823 hole_size >= num_bytes) { 928 * If this free space is greater than which we need,
824 *start = last_byte; 929 * it must be the max free space that we have found
825 goto check_pending; 930 * until now, so max_hole_start must point to the start
931 * of this free space and the length of this free space
932 * is stored in max_hole_size. Thus, we return
933 * max_hole_start and max_hole_size and go back to the
934 * caller.
935 */
936 if (hole_size >= num_bytes) {
937 ret = 0;
938 goto out;
826 } 939 }
827 } 940 }
828 if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
829 goto next;
830 941
831 start_found = 1;
832 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); 942 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
833 last_byte = key.offset + btrfs_dev_extent_length(l, dev_extent); 943 extent_end = key.offset + btrfs_dev_extent_length(l,
944 dev_extent);
945 if (extent_end > search_start)
946 search_start = extent_end;
834next: 947next:
835 path->slots[0]++; 948 path->slots[0]++;
836 cond_resched(); 949 cond_resched();
837 } 950 }
838check_pending:
839 /* we have to make sure we didn't find an extent that has already
840 * been allocated by the map tree or the original allocation
841 */
842 BUG_ON(*start < search_start);
843 951
844 if (*start + num_bytes > search_end) { 952 hole_size = search_end- search_start;
845 ret = -ENOSPC; 953 if (hole_size > max_hole_size) {
846 goto error; 954 max_hole_start = search_start;
955 max_hole_size = hole_size;
847 } 956 }
848 /* check for pending inserts here */
849 ret = 0;
850 957
851error: 958 /* See above. */
959 if (hole_size < num_bytes)
960 ret = -ENOSPC;
961 else
962 ret = 0;
963
964out:
852 btrfs_free_path(path); 965 btrfs_free_path(path);
966error:
967 *start = max_hole_start;
968 if (len)
969 *len = max_hole_size;
853 return ret; 970 return ret;
854} 971}
855 972
@@ -1179,8 +1296,8 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1179 goto out; 1296 goto out;
1180 } 1297 }
1181 } else { 1298 } else {
1182 bdev = open_bdev_exclusive(device_path, FMODE_READ, 1299 bdev = blkdev_get_by_path(device_path, FMODE_READ | FMODE_EXCL,
1183 root->fs_info->bdev_holder); 1300 root->fs_info->bdev_holder);
1184 if (IS_ERR(bdev)) { 1301 if (IS_ERR(bdev)) {
1185 ret = PTR_ERR(bdev); 1302 ret = PTR_ERR(bdev);
1186 goto out; 1303 goto out;
@@ -1189,7 +1306,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1189 set_blocksize(bdev, 4096); 1306 set_blocksize(bdev, 4096);
1190 bh = btrfs_read_dev_super(bdev); 1307 bh = btrfs_read_dev_super(bdev);
1191 if (!bh) { 1308 if (!bh) {
1192 ret = -EIO; 1309 ret = -EINVAL;
1193 goto error_close; 1310 goto error_close;
1194 } 1311 }
1195 disk_super = (struct btrfs_super_block *)bh->b_data; 1312 disk_super = (struct btrfs_super_block *)bh->b_data;
@@ -1236,6 +1353,9 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1236 1353
1237 device->fs_devices->num_devices--; 1354 device->fs_devices->num_devices--;
1238 1355
1356 if (device->missing)
1357 root->fs_info->fs_devices->missing_devices--;
1358
1239 next_device = list_entry(root->fs_info->fs_devices->devices.next, 1359 next_device = list_entry(root->fs_info->fs_devices->devices.next,
1240 struct btrfs_device, dev_list); 1360 struct btrfs_device, dev_list);
1241 if (device->bdev == root->fs_info->sb->s_bdev) 1361 if (device->bdev == root->fs_info->sb->s_bdev)
@@ -1244,7 +1364,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1244 root->fs_info->fs_devices->latest_bdev = next_device->bdev; 1364 root->fs_info->fs_devices->latest_bdev = next_device->bdev;
1245 1365
1246 if (device->bdev) { 1366 if (device->bdev) {
1247 close_bdev_exclusive(device->bdev, device->mode); 1367 blkdev_put(device->bdev, device->mode);
1248 device->bdev = NULL; 1368 device->bdev = NULL;
1249 device->fs_devices->open_devices--; 1369 device->fs_devices->open_devices--;
1250 } 1370 }
@@ -1287,7 +1407,7 @@ error_brelse:
1287 brelse(bh); 1407 brelse(bh);
1288error_close: 1408error_close:
1289 if (bdev) 1409 if (bdev)
1290 close_bdev_exclusive(bdev, FMODE_READ); 1410 blkdev_put(bdev, FMODE_READ | FMODE_EXCL);
1291out: 1411out:
1292 mutex_unlock(&root->fs_info->volume_mutex); 1412 mutex_unlock(&root->fs_info->volume_mutex);
1293 mutex_unlock(&uuid_mutex); 1413 mutex_unlock(&uuid_mutex);
@@ -1439,7 +1559,8 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1439 if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding) 1559 if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding)
1440 return -EINVAL; 1560 return -EINVAL;
1441 1561
1442 bdev = open_bdev_exclusive(device_path, 0, root->fs_info->bdev_holder); 1562 bdev = blkdev_get_by_path(device_path, FMODE_EXCL,
1563 root->fs_info->bdev_holder);
1443 if (IS_ERR(bdev)) 1564 if (IS_ERR(bdev))
1444 return PTR_ERR(bdev); 1565 return PTR_ERR(bdev);
1445 1566
@@ -1565,7 +1686,7 @@ out:
1565 mutex_unlock(&root->fs_info->volume_mutex); 1686 mutex_unlock(&root->fs_info->volume_mutex);
1566 return ret; 1687 return ret;
1567error: 1688error:
1568 close_bdev_exclusive(bdev, 0); 1689 blkdev_put(bdev, FMODE_EXCL);
1569 if (seeding_dev) { 1690 if (seeding_dev) {
1570 mutex_unlock(&uuid_mutex); 1691 mutex_unlock(&uuid_mutex);
1571 up_write(&sb->s_umount); 1692 up_write(&sb->s_umount);
@@ -1905,6 +2026,9 @@ int btrfs_balance(struct btrfs_root *dev_root)
1905 if (dev_root->fs_info->sb->s_flags & MS_RDONLY) 2026 if (dev_root->fs_info->sb->s_flags & MS_RDONLY)
1906 return -EROFS; 2027 return -EROFS;
1907 2028
2029 if (!capable(CAP_SYS_ADMIN))
2030 return -EPERM;
2031
1908 mutex_lock(&dev_root->fs_info->volume_mutex); 2032 mutex_lock(&dev_root->fs_info->volume_mutex);
1909 dev_root = dev_root->fs_info->dev_root; 2033 dev_root = dev_root->fs_info->dev_root;
1910 2034
@@ -2143,66 +2267,67 @@ static noinline u64 chunk_bytes_by_type(u64 type, u64 calc_size,
2143 return calc_size * num_stripes; 2267 return calc_size * num_stripes;
2144} 2268}
2145 2269
2146static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, 2270/* Used to sort the devices by max_avail(descending sort) */
2147 struct btrfs_root *extent_root, 2271int btrfs_cmp_device_free_bytes(const void *dev_info1, const void *dev_info2)
2148 struct map_lookup **map_ret,
2149 u64 *num_bytes, u64 *stripe_size,
2150 u64 start, u64 type)
2151{ 2272{
2152 struct btrfs_fs_info *info = extent_root->fs_info; 2273 if (((struct btrfs_device_info *)dev_info1)->max_avail >
2153 struct btrfs_device *device = NULL; 2274 ((struct btrfs_device_info *)dev_info2)->max_avail)
2154 struct btrfs_fs_devices *fs_devices = info->fs_devices; 2275 return -1;
2155 struct list_head *cur; 2276 else if (((struct btrfs_device_info *)dev_info1)->max_avail <
2156 struct map_lookup *map = NULL; 2277 ((struct btrfs_device_info *)dev_info2)->max_avail)
2157 struct extent_map_tree *em_tree; 2278 return 1;
2158 struct extent_map *em; 2279 else
2159 struct list_head private_devs; 2280 return 0;
2160 int min_stripe_size = 1 * 1024 * 1024; 2281}
2161 u64 calc_size = 1024 * 1024 * 1024;
2162 u64 max_chunk_size = calc_size;
2163 u64 min_free;
2164 u64 avail;
2165 u64 max_avail = 0;
2166 u64 dev_offset;
2167 int num_stripes = 1;
2168 int min_stripes = 1;
2169 int sub_stripes = 0;
2170 int looped = 0;
2171 int ret;
2172 int index;
2173 int stripe_len = 64 * 1024;
2174 2282
2175 if ((type & BTRFS_BLOCK_GROUP_RAID1) && 2283static int __btrfs_calc_nstripes(struct btrfs_fs_devices *fs_devices, u64 type,
2176 (type & BTRFS_BLOCK_GROUP_DUP)) { 2284 int *num_stripes, int *min_stripes,
2177 WARN_ON(1); 2285 int *sub_stripes)
2178 type &= ~BTRFS_BLOCK_GROUP_DUP; 2286{
2179 } 2287 *num_stripes = 1;
2180 if (list_empty(&fs_devices->alloc_list)) 2288 *min_stripes = 1;
2181 return -ENOSPC; 2289 *sub_stripes = 0;
2182 2290
2183 if (type & (BTRFS_BLOCK_GROUP_RAID0)) { 2291 if (type & (BTRFS_BLOCK_GROUP_RAID0)) {
2184 num_stripes = fs_devices->rw_devices; 2292 *num_stripes = fs_devices->rw_devices;
2185 min_stripes = 2; 2293 *min_stripes = 2;
2186 } 2294 }
2187 if (type & (BTRFS_BLOCK_GROUP_DUP)) { 2295 if (type & (BTRFS_BLOCK_GROUP_DUP)) {
2188 num_stripes = 2; 2296 *num_stripes = 2;
2189 min_stripes = 2; 2297 *min_stripes = 2;
2190 } 2298 }
2191 if (type & (BTRFS_BLOCK_GROUP_RAID1)) { 2299 if (type & (BTRFS_BLOCK_GROUP_RAID1)) {
2192 if (fs_devices->rw_devices < 2) 2300 if (fs_devices->rw_devices < 2)
2193 return -ENOSPC; 2301 return -ENOSPC;
2194 num_stripes = 2; 2302 *num_stripes = 2;
2195 min_stripes = 2; 2303 *min_stripes = 2;
2196 } 2304 }
2197 if (type & (BTRFS_BLOCK_GROUP_RAID10)) { 2305 if (type & (BTRFS_BLOCK_GROUP_RAID10)) {
2198 num_stripes = fs_devices->rw_devices; 2306 *num_stripes = fs_devices->rw_devices;
2199 if (num_stripes < 4) 2307 if (*num_stripes < 4)
2200 return -ENOSPC; 2308 return -ENOSPC;
2201 num_stripes &= ~(u32)1; 2309 *num_stripes &= ~(u32)1;
2202 sub_stripes = 2; 2310 *sub_stripes = 2;
2203 min_stripes = 4; 2311 *min_stripes = 4;
2204 } 2312 }
2205 2313
2314 return 0;
2315}
2316
2317static u64 __btrfs_calc_stripe_size(struct btrfs_fs_devices *fs_devices,
2318 u64 proposed_size, u64 type,
2319 int num_stripes, int small_stripe)
2320{
2321 int min_stripe_size = 1 * 1024 * 1024;
2322 u64 calc_size = proposed_size;
2323 u64 max_chunk_size = calc_size;
2324 int ncopies = 1;
2325
2326 if (type & (BTRFS_BLOCK_GROUP_RAID1 |
2327 BTRFS_BLOCK_GROUP_DUP |
2328 BTRFS_BLOCK_GROUP_RAID10))
2329 ncopies = 2;
2330
2206 if (type & BTRFS_BLOCK_GROUP_DATA) { 2331 if (type & BTRFS_BLOCK_GROUP_DATA) {
2207 max_chunk_size = 10 * calc_size; 2332 max_chunk_size = 10 * calc_size;
2208 min_stripe_size = 64 * 1024 * 1024; 2333 min_stripe_size = 64 * 1024 * 1024;
@@ -2219,51 +2344,209 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
2219 max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1), 2344 max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
2220 max_chunk_size); 2345 max_chunk_size);
2221 2346
2222again: 2347 if (calc_size * num_stripes > max_chunk_size * ncopies) {
2223 max_avail = 0; 2348 calc_size = max_chunk_size * ncopies;
2224 if (!map || map->num_stripes != num_stripes) {
2225 kfree(map);
2226 map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
2227 if (!map)
2228 return -ENOMEM;
2229 map->num_stripes = num_stripes;
2230 }
2231
2232 if (calc_size * num_stripes > max_chunk_size) {
2233 calc_size = max_chunk_size;
2234 do_div(calc_size, num_stripes); 2349 do_div(calc_size, num_stripes);
2235 do_div(calc_size, stripe_len); 2350 do_div(calc_size, BTRFS_STRIPE_LEN);
2236 calc_size *= stripe_len; 2351 calc_size *= BTRFS_STRIPE_LEN;
2237 } 2352 }
2238 2353
2239 /* we don't want tiny stripes */ 2354 /* we don't want tiny stripes */
2240 if (!looped) 2355 if (!small_stripe)
2241 calc_size = max_t(u64, min_stripe_size, calc_size); 2356 calc_size = max_t(u64, min_stripe_size, calc_size);
2242 2357
2243 /* 2358 /*
2244 * we're about to do_div by the stripe_len so lets make sure 2359 * we're about to do_div by the BTRFS_STRIPE_LEN so lets make sure
2245 * we end up with something bigger than a stripe 2360 * we end up with something bigger than a stripe
2246 */ 2361 */
2247 calc_size = max_t(u64, calc_size, stripe_len * 4); 2362 calc_size = max_t(u64, calc_size, BTRFS_STRIPE_LEN);
2363
2364 do_div(calc_size, BTRFS_STRIPE_LEN);
2365 calc_size *= BTRFS_STRIPE_LEN;
2366
2367 return calc_size;
2368}
2248 2369
2249 do_div(calc_size, stripe_len); 2370static struct map_lookup *__shrink_map_lookup_stripes(struct map_lookup *map,
2250 calc_size *= stripe_len; 2371 int num_stripes)
2372{
2373 struct map_lookup *new;
2374 size_t len = map_lookup_size(num_stripes);
2375
2376 BUG_ON(map->num_stripes < num_stripes);
2377
2378 if (map->num_stripes == num_stripes)
2379 return map;
2380
2381 new = kmalloc(len, GFP_NOFS);
2382 if (!new) {
2383 /* just change map->num_stripes */
2384 map->num_stripes = num_stripes;
2385 return map;
2386 }
2387
2388 memcpy(new, map, len);
2389 new->num_stripes = num_stripes;
2390 kfree(map);
2391 return new;
2392}
2393
2394/*
2395 * helper to allocate device space from btrfs_device_info, in which we stored
2396 * max free space information of every device. It is used when we can not
2397 * allocate chunks by default size.
2398 *
2399 * By this helper, we can allocate a new chunk as larger as possible.
2400 */
2401static int __btrfs_alloc_tiny_space(struct btrfs_trans_handle *trans,
2402 struct btrfs_fs_devices *fs_devices,
2403 struct btrfs_device_info *devices,
2404 int nr_device, u64 type,
2405 struct map_lookup **map_lookup,
2406 int min_stripes, u64 *stripe_size)
2407{
2408 int i, index, sort_again = 0;
2409 int min_devices = min_stripes;
2410 u64 max_avail, min_free;
2411 struct map_lookup *map = *map_lookup;
2412 int ret;
2413
2414 if (nr_device < min_stripes)
2415 return -ENOSPC;
2416
2417 btrfs_descending_sort_devices(devices, nr_device);
2418
2419 max_avail = devices[0].max_avail;
2420 if (!max_avail)
2421 return -ENOSPC;
2422
2423 for (i = 0; i < nr_device; i++) {
2424 /*
2425 * if dev_offset = 0, it means the free space of this device
2426 * is less than what we need, and we didn't search max avail
2427 * extent on this device, so do it now.
2428 */
2429 if (!devices[i].dev_offset) {
2430 ret = find_free_dev_extent(trans, devices[i].dev,
2431 max_avail,
2432 &devices[i].dev_offset,
2433 &devices[i].max_avail);
2434 if (ret != 0 && ret != -ENOSPC)
2435 return ret;
2436 sort_again = 1;
2437 }
2438 }
2439
2440 /* we update the max avail free extent of each devices, sort again */
2441 if (sort_again)
2442 btrfs_descending_sort_devices(devices, nr_device);
2443
2444 if (type & BTRFS_BLOCK_GROUP_DUP)
2445 min_devices = 1;
2446
2447 if (!devices[min_devices - 1].max_avail)
2448 return -ENOSPC;
2449
2450 max_avail = devices[min_devices - 1].max_avail;
2451 if (type & BTRFS_BLOCK_GROUP_DUP)
2452 do_div(max_avail, 2);
2453
2454 max_avail = __btrfs_calc_stripe_size(fs_devices, max_avail, type,
2455 min_stripes, 1);
2456 if (type & BTRFS_BLOCK_GROUP_DUP)
2457 min_free = max_avail * 2;
2458 else
2459 min_free = max_avail;
2460
2461 if (min_free > devices[min_devices - 1].max_avail)
2462 return -ENOSPC;
2463
2464 map = __shrink_map_lookup_stripes(map, min_stripes);
2465 *stripe_size = max_avail;
2466
2467 index = 0;
2468 for (i = 0; i < min_stripes; i++) {
2469 map->stripes[i].dev = devices[index].dev;
2470 map->stripes[i].physical = devices[index].dev_offset;
2471 if (type & BTRFS_BLOCK_GROUP_DUP) {
2472 i++;
2473 map->stripes[i].dev = devices[index].dev;
2474 map->stripes[i].physical = devices[index].dev_offset +
2475 max_avail;
2476 }
2477 index++;
2478 }
2479 *map_lookup = map;
2480
2481 return 0;
2482}
2483
2484static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
2485 struct btrfs_root *extent_root,
2486 struct map_lookup **map_ret,
2487 u64 *num_bytes, u64 *stripe_size,
2488 u64 start, u64 type)
2489{
2490 struct btrfs_fs_info *info = extent_root->fs_info;
2491 struct btrfs_device *device = NULL;
2492 struct btrfs_fs_devices *fs_devices = info->fs_devices;
2493 struct list_head *cur;
2494 struct map_lookup *map;
2495 struct extent_map_tree *em_tree;
2496 struct extent_map *em;
2497 struct btrfs_device_info *devices_info;
2498 struct list_head private_devs;
2499 u64 calc_size = 1024 * 1024 * 1024;
2500 u64 min_free;
2501 u64 avail;
2502 u64 dev_offset;
2503 int num_stripes;
2504 int min_stripes;
2505 int sub_stripes;
2506 int min_devices; /* the min number of devices we need */
2507 int i;
2508 int ret;
2509 int index;
2510
2511 if ((type & BTRFS_BLOCK_GROUP_RAID1) &&
2512 (type & BTRFS_BLOCK_GROUP_DUP)) {
2513 WARN_ON(1);
2514 type &= ~BTRFS_BLOCK_GROUP_DUP;
2515 }
2516 if (list_empty(&fs_devices->alloc_list))
2517 return -ENOSPC;
2518
2519 ret = __btrfs_calc_nstripes(fs_devices, type, &num_stripes,
2520 &min_stripes, &sub_stripes);
2521 if (ret)
2522 return ret;
2523
2524 devices_info = kzalloc(sizeof(*devices_info) * fs_devices->rw_devices,
2525 GFP_NOFS);
2526 if (!devices_info)
2527 return -ENOMEM;
2528
2529 map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
2530 if (!map) {
2531 ret = -ENOMEM;
2532 goto error;
2533 }
2534 map->num_stripes = num_stripes;
2251 2535
2252 cur = fs_devices->alloc_list.next; 2536 cur = fs_devices->alloc_list.next;
2253 index = 0; 2537 index = 0;
2538 i = 0;
2254 2539
2255 if (type & BTRFS_BLOCK_GROUP_DUP) 2540 calc_size = __btrfs_calc_stripe_size(fs_devices, calc_size, type,
2541 num_stripes, 0);
2542
2543 if (type & BTRFS_BLOCK_GROUP_DUP) {
2256 min_free = calc_size * 2; 2544 min_free = calc_size * 2;
2257 else 2545 min_devices = 1;
2546 } else {
2258 min_free = calc_size; 2547 min_free = calc_size;
2259 2548 min_devices = min_stripes;
2260 /* 2549 }
2261 * we add 1MB because we never use the first 1MB of the device, unless
2262 * we've looped, then we are likely allocating the maximum amount of
2263 * space left already
2264 */
2265 if (!looped)
2266 min_free += 1024 * 1024;
2267 2550
2268 INIT_LIST_HEAD(&private_devs); 2551 INIT_LIST_HEAD(&private_devs);
2269 while (index < num_stripes) { 2552 while (index < num_stripes) {
@@ -2276,27 +2559,39 @@ again:
2276 cur = cur->next; 2559 cur = cur->next;
2277 2560
2278 if (device->in_fs_metadata && avail >= min_free) { 2561 if (device->in_fs_metadata && avail >= min_free) {
2279 ret = find_free_dev_extent(trans, device, 2562 ret = find_free_dev_extent(trans, device, min_free,
2280 min_free, &dev_offset, 2563 &devices_info[i].dev_offset,
2281 &max_avail); 2564 &devices_info[i].max_avail);
2282 if (ret == 0) { 2565 if (ret == 0) {
2283 list_move_tail(&device->dev_alloc_list, 2566 list_move_tail(&device->dev_alloc_list,
2284 &private_devs); 2567 &private_devs);
2285 map->stripes[index].dev = device; 2568 map->stripes[index].dev = device;
2286 map->stripes[index].physical = dev_offset; 2569 map->stripes[index].physical =
2570 devices_info[i].dev_offset;
2287 index++; 2571 index++;
2288 if (type & BTRFS_BLOCK_GROUP_DUP) { 2572 if (type & BTRFS_BLOCK_GROUP_DUP) {
2289 map->stripes[index].dev = device; 2573 map->stripes[index].dev = device;
2290 map->stripes[index].physical = 2574 map->stripes[index].physical =
2291 dev_offset + calc_size; 2575 devices_info[i].dev_offset +
2576 calc_size;
2292 index++; 2577 index++;
2293 } 2578 }
2294 } 2579 } else if (ret != -ENOSPC)
2295 } else if (device->in_fs_metadata && avail > max_avail) 2580 goto error;
2296 max_avail = avail; 2581
2582 devices_info[i].dev = device;
2583 i++;
2584 } else if (device->in_fs_metadata &&
2585 avail >= BTRFS_STRIPE_LEN) {
2586 devices_info[i].dev = device;
2587 devices_info[i].max_avail = avail;
2588 i++;
2589 }
2590
2297 if (cur == &fs_devices->alloc_list) 2591 if (cur == &fs_devices->alloc_list)
2298 break; 2592 break;
2299 } 2593 }
2594
2300 list_splice(&private_devs, &fs_devices->alloc_list); 2595 list_splice(&private_devs, &fs_devices->alloc_list);
2301 if (index < num_stripes) { 2596 if (index < num_stripes) {
2302 if (index >= min_stripes) { 2597 if (index >= min_stripes) {
@@ -2305,34 +2600,36 @@ again:
2305 num_stripes /= sub_stripes; 2600 num_stripes /= sub_stripes;
2306 num_stripes *= sub_stripes; 2601 num_stripes *= sub_stripes;
2307 } 2602 }
2308 looped = 1; 2603
2309 goto again; 2604 map = __shrink_map_lookup_stripes(map, num_stripes);
2310 } 2605 } else if (i >= min_devices) {
2311 if (!looped && max_avail > 0) { 2606 ret = __btrfs_alloc_tiny_space(trans, fs_devices,
2312 looped = 1; 2607 devices_info, i, type,
2313 calc_size = max_avail; 2608 &map, min_stripes,
2314 goto again; 2609 &calc_size);
2610 if (ret)
2611 goto error;
2612 } else {
2613 ret = -ENOSPC;
2614 goto error;
2315 } 2615 }
2316 kfree(map);
2317 return -ENOSPC;
2318 } 2616 }
2319 map->sector_size = extent_root->sectorsize; 2617 map->sector_size = extent_root->sectorsize;
2320 map->stripe_len = stripe_len; 2618 map->stripe_len = BTRFS_STRIPE_LEN;
2321 map->io_align = stripe_len; 2619 map->io_align = BTRFS_STRIPE_LEN;
2322 map->io_width = stripe_len; 2620 map->io_width = BTRFS_STRIPE_LEN;
2323 map->type = type; 2621 map->type = type;
2324 map->num_stripes = num_stripes;
2325 map->sub_stripes = sub_stripes; 2622 map->sub_stripes = sub_stripes;
2326 2623
2327 *map_ret = map; 2624 *map_ret = map;
2328 *stripe_size = calc_size; 2625 *stripe_size = calc_size;
2329 *num_bytes = chunk_bytes_by_type(type, calc_size, 2626 *num_bytes = chunk_bytes_by_type(type, calc_size,
2330 num_stripes, sub_stripes); 2627 map->num_stripes, sub_stripes);
2331 2628
2332 em = alloc_extent_map(GFP_NOFS); 2629 em = alloc_extent_map(GFP_NOFS);
2333 if (!em) { 2630 if (!em) {
2334 kfree(map); 2631 ret = -ENOMEM;
2335 return -ENOMEM; 2632 goto error;
2336 } 2633 }
2337 em->bdev = (struct block_device *)map; 2634 em->bdev = (struct block_device *)map;
2338 em->start = start; 2635 em->start = start;
@@ -2365,7 +2662,13 @@ again:
2365 index++; 2662 index++;
2366 } 2663 }
2367 2664
2665 kfree(devices_info);
2368 return 0; 2666 return 0;
2667
2668error:
2669 kfree(map);
2670 kfree(devices_info);
2671 return ret;
2369} 2672}
2370 2673
2371static int __finish_chunk_alloc(struct btrfs_trans_handle *trans, 2674static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
@@ -3080,7 +3383,9 @@ static struct btrfs_device *add_missing_dev(struct btrfs_root *root,
3080 device->devid = devid; 3383 device->devid = devid;
3081 device->work.func = pending_bios_fn; 3384 device->work.func = pending_bios_fn;
3082 device->fs_devices = fs_devices; 3385 device->fs_devices = fs_devices;
3386 device->missing = 1;
3083 fs_devices->num_devices++; 3387 fs_devices->num_devices++;
3388 fs_devices->missing_devices++;
3084 spin_lock_init(&device->io_lock); 3389 spin_lock_init(&device->io_lock);
3085 INIT_LIST_HEAD(&device->dev_alloc_list); 3390 INIT_LIST_HEAD(&device->dev_alloc_list);
3086 memcpy(device->uuid, dev_uuid, BTRFS_UUID_SIZE); 3391 memcpy(device->uuid, dev_uuid, BTRFS_UUID_SIZE);
@@ -3278,6 +3583,15 @@ static int read_one_dev(struct btrfs_root *root,
3278 device = add_missing_dev(root, devid, dev_uuid); 3583 device = add_missing_dev(root, devid, dev_uuid);
3279 if (!device) 3584 if (!device)
3280 return -ENOMEM; 3585 return -ENOMEM;
3586 } else if (!device->missing) {
3587 /*
3588 * this happens when a device that was properly setup
3589 * in the device info lists suddenly goes bad.
3590 * device->bdev is NULL, and so we have to set
3591 * device->missing to one here
3592 */
3593 root->fs_info->fs_devices->missing_devices++;
3594 device->missing = 1;
3281 } 3595 }
3282 } 3596 }
3283 3597
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 2b638b6e4ee..7fb59d45fe8 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -20,8 +20,11 @@
20#define __BTRFS_VOLUMES_ 20#define __BTRFS_VOLUMES_
21 21
22#include <linux/bio.h> 22#include <linux/bio.h>
23#include <linux/sort.h>
23#include "async-thread.h" 24#include "async-thread.h"
24 25
26#define BTRFS_STRIPE_LEN (64 * 1024)
27
25struct buffer_head; 28struct buffer_head;
26struct btrfs_pending_bios { 29struct btrfs_pending_bios {
27 struct bio *head; 30 struct bio *head;
@@ -44,12 +47,13 @@ struct btrfs_device {
44 47
45 int writeable; 48 int writeable;
46 int in_fs_metadata; 49 int in_fs_metadata;
50 int missing;
47 51
48 spinlock_t io_lock; 52 spinlock_t io_lock;
49 53
50 struct block_device *bdev; 54 struct block_device *bdev;
51 55
52 /* the mode sent to open_bdev_exclusive */ 56 /* the mode sent to blkdev_get */
53 fmode_t mode; 57 fmode_t mode;
54 58
55 char *name; 59 char *name;
@@ -93,6 +97,7 @@ struct btrfs_fs_devices {
93 u64 num_devices; 97 u64 num_devices;
94 u64 open_devices; 98 u64 open_devices;
95 u64 rw_devices; 99 u64 rw_devices;
100 u64 missing_devices;
96 u64 total_rw_bytes; 101 u64 total_rw_bytes;
97 struct block_device *latest_bdev; 102 struct block_device *latest_bdev;
98 103
@@ -134,6 +139,30 @@ struct btrfs_multi_bio {
134 struct btrfs_bio_stripe stripes[]; 139 struct btrfs_bio_stripe stripes[];
135}; 140};
136 141
142struct btrfs_device_info {
143 struct btrfs_device *dev;
144 u64 dev_offset;
145 u64 max_avail;
146};
147
148/* Used to sort the devices by max_avail(descending sort) */
149int btrfs_cmp_device_free_bytes(const void *dev_info1, const void *dev_info2);
150
151/*
152 * sort the devices by max_avail, in which max free extent size of each device
153 * is stored.(Descending Sort)
154 */
155static inline void btrfs_descending_sort_devices(
156 struct btrfs_device_info *devices,
157 size_t nr_devices)
158{
159 sort(devices, nr_devices, sizeof(struct btrfs_device_info),
160 btrfs_cmp_device_free_bytes, NULL);
161}
162
163int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
164 u64 end, u64 *length);
165
137#define btrfs_multi_bio_size(n) (sizeof(struct btrfs_multi_bio) + \ 166#define btrfs_multi_bio_size(n) (sizeof(struct btrfs_multi_bio) + \
138 (sizeof(struct btrfs_bio_stripe) * (n))) 167 (sizeof(struct btrfs_bio_stripe) * (n)))
139 168
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 698fdd2c739..a5776531dc2 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -316,6 +316,15 @@ ssize_t btrfs_getxattr(struct dentry *dentry, const char *name,
316int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value, 316int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value,
317 size_t size, int flags) 317 size_t size, int flags)
318{ 318{
319 struct btrfs_root *root = BTRFS_I(dentry->d_inode)->root;
320
321 /*
322 * The permission on security.* and system.* is not checked
323 * in permission().
324 */
325 if (btrfs_root_readonly(root))
326 return -EROFS;
327
319 /* 328 /*
320 * If this is a request for a synthetic attribute in the system.* 329 * If this is a request for a synthetic attribute in the system.*
321 * namespace use the generic infrastructure to resolve a handler 330 * namespace use the generic infrastructure to resolve a handler
@@ -336,6 +345,15 @@ int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value,
336 345
337int btrfs_removexattr(struct dentry *dentry, const char *name) 346int btrfs_removexattr(struct dentry *dentry, const char *name)
338{ 347{
348 struct btrfs_root *root = BTRFS_I(dentry->d_inode)->root;
349
350 /*
351 * The permission on security.* and system.* is not checked
352 * in permission().
353 */
354 if (btrfs_root_readonly(root))
355 return -EROFS;
356
339 /* 357 /*
340 * If this is a request for a synthetic attribute in the system.* 358 * If this is a request for a synthetic attribute in the system.*
341 * namespace use the generic infrastructure to resolve a handler 359 * namespace use the generic infrastructure to resolve a handler
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index b9cd5445f71..f5ec2d44150 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -32,15 +32,6 @@
32#include <linux/bio.h> 32#include <linux/bio.h>
33#include "compression.h" 33#include "compression.h"
34 34
35/* Plan: call deflate() with avail_in == *sourcelen,
36 avail_out = *dstlen - 12 and flush == Z_FINISH.
37 If it doesn't manage to finish, call it again with
38 avail_in == 0 and avail_out set to the remaining 12
39 bytes for it to clean up.
40 Q: Is 12 bytes sufficient?
41*/
42#define STREAM_END_SPACE 12
43
44struct workspace { 35struct workspace {
45 z_stream inf_strm; 36 z_stream inf_strm;
46 z_stream def_strm; 37 z_stream def_strm;
@@ -48,152 +39,51 @@ struct workspace {
48 struct list_head list; 39 struct list_head list;
49}; 40};
50 41
51static LIST_HEAD(idle_workspace); 42static void zlib_free_workspace(struct list_head *ws)
52static DEFINE_SPINLOCK(workspace_lock); 43{
53static unsigned long num_workspace; 44 struct workspace *workspace = list_entry(ws, struct workspace, list);
54static atomic_t alloc_workspace = ATOMIC_INIT(0);
55static DECLARE_WAIT_QUEUE_HEAD(workspace_wait);
56 45
57/* 46 vfree(workspace->def_strm.workspace);
58 * this finds an available zlib workspace or allocates a new one 47 vfree(workspace->inf_strm.workspace);
59 * NULL or an ERR_PTR is returned if things go bad. 48 kfree(workspace->buf);
60 */ 49 kfree(workspace);
61static struct workspace *find_zlib_workspace(void) 50}
51
52static struct list_head *zlib_alloc_workspace(void)
62{ 53{
63 struct workspace *workspace; 54 struct workspace *workspace;
64 int ret;
65 int cpus = num_online_cpus();
66
67again:
68 spin_lock(&workspace_lock);
69 if (!list_empty(&idle_workspace)) {
70 workspace = list_entry(idle_workspace.next, struct workspace,
71 list);
72 list_del(&workspace->list);
73 num_workspace--;
74 spin_unlock(&workspace_lock);
75 return workspace;
76 55
77 }
78 spin_unlock(&workspace_lock);
79 if (atomic_read(&alloc_workspace) > cpus) {
80 DEFINE_WAIT(wait);
81 prepare_to_wait(&workspace_wait, &wait, TASK_UNINTERRUPTIBLE);
82 if (atomic_read(&alloc_workspace) > cpus)
83 schedule();
84 finish_wait(&workspace_wait, &wait);
85 goto again;
86 }
87 atomic_inc(&alloc_workspace);
88 workspace = kzalloc(sizeof(*workspace), GFP_NOFS); 56 workspace = kzalloc(sizeof(*workspace), GFP_NOFS);
89 if (!workspace) { 57 if (!workspace)
90 ret = -ENOMEM; 58 return ERR_PTR(-ENOMEM);
91 goto fail;
92 }
93 59
94 workspace->def_strm.workspace = vmalloc(zlib_deflate_workspacesize()); 60 workspace->def_strm.workspace = vmalloc(zlib_deflate_workspacesize());
95 if (!workspace->def_strm.workspace) {
96 ret = -ENOMEM;
97 goto fail;
98 }
99 workspace->inf_strm.workspace = vmalloc(zlib_inflate_workspacesize()); 61 workspace->inf_strm.workspace = vmalloc(zlib_inflate_workspacesize());
100 if (!workspace->inf_strm.workspace) {
101 ret = -ENOMEM;
102 goto fail_inflate;
103 }
104 workspace->buf = kmalloc(PAGE_CACHE_SIZE, GFP_NOFS); 62 workspace->buf = kmalloc(PAGE_CACHE_SIZE, GFP_NOFS);
105 if (!workspace->buf) { 63 if (!workspace->def_strm.workspace ||
106 ret = -ENOMEM; 64 !workspace->inf_strm.workspace || !workspace->buf)
107 goto fail_kmalloc; 65 goto fail;
108 }
109 return workspace;
110
111fail_kmalloc:
112 vfree(workspace->inf_strm.workspace);
113fail_inflate:
114 vfree(workspace->def_strm.workspace);
115fail:
116 kfree(workspace);
117 atomic_dec(&alloc_workspace);
118 wake_up(&workspace_wait);
119 return ERR_PTR(ret);
120}
121
122/*
123 * put a workspace struct back on the list or free it if we have enough
124 * idle ones sitting around
125 */
126static int free_workspace(struct workspace *workspace)
127{
128 spin_lock(&workspace_lock);
129 if (num_workspace < num_online_cpus()) {
130 list_add_tail(&workspace->list, &idle_workspace);
131 num_workspace++;
132 spin_unlock(&workspace_lock);
133 if (waitqueue_active(&workspace_wait))
134 wake_up(&workspace_wait);
135 return 0;
136 }
137 spin_unlock(&workspace_lock);
138 vfree(workspace->def_strm.workspace);
139 vfree(workspace->inf_strm.workspace);
140 kfree(workspace->buf);
141 kfree(workspace);
142 66
143 atomic_dec(&alloc_workspace); 67 INIT_LIST_HEAD(&workspace->list);
144 if (waitqueue_active(&workspace_wait))
145 wake_up(&workspace_wait);
146 return 0;
147}
148 68
149/* 69 return &workspace->list;
150 * cleanup function for module exit 70fail:
151 */ 71 zlib_free_workspace(&workspace->list);
152static void free_workspaces(void) 72 return ERR_PTR(-ENOMEM);
153{
154 struct workspace *workspace;
155 while (!list_empty(&idle_workspace)) {
156 workspace = list_entry(idle_workspace.next, struct workspace,
157 list);
158 list_del(&workspace->list);
159 vfree(workspace->def_strm.workspace);
160 vfree(workspace->inf_strm.workspace);
161 kfree(workspace->buf);
162 kfree(workspace);
163 atomic_dec(&alloc_workspace);
164 }
165} 73}
166 74
167/* 75static int zlib_compress_pages(struct list_head *ws,
168 * given an address space and start/len, compress the bytes. 76 struct address_space *mapping,
169 * 77 u64 start, unsigned long len,
170 * pages are allocated to hold the compressed result and stored 78 struct page **pages,
171 * in 'pages' 79 unsigned long nr_dest_pages,
172 * 80 unsigned long *out_pages,
173 * out_pages is used to return the number of pages allocated. There 81 unsigned long *total_in,
174 * may be pages allocated even if we return an error 82 unsigned long *total_out,
175 * 83 unsigned long max_out)
176 * total_in is used to return the number of bytes actually read. It
177 * may be smaller then len if we had to exit early because we
178 * ran out of room in the pages array or because we cross the
179 * max_out threshold.
180 *
181 * total_out is used to return the total number of compressed bytes
182 *
183 * max_out tells us the max number of bytes that we're allowed to
184 * stuff into pages
185 */
186int btrfs_zlib_compress_pages(struct address_space *mapping,
187 u64 start, unsigned long len,
188 struct page **pages,
189 unsigned long nr_dest_pages,
190 unsigned long *out_pages,
191 unsigned long *total_in,
192 unsigned long *total_out,
193 unsigned long max_out)
194{ 84{
85 struct workspace *workspace = list_entry(ws, struct workspace, list);
195 int ret; 86 int ret;
196 struct workspace *workspace;
197 char *data_in; 87 char *data_in;
198 char *cpage_out; 88 char *cpage_out;
199 int nr_pages = 0; 89 int nr_pages = 0;
@@ -205,10 +95,6 @@ int btrfs_zlib_compress_pages(struct address_space *mapping,
205 *total_out = 0; 95 *total_out = 0;
206 *total_in = 0; 96 *total_in = 0;
207 97
208 workspace = find_zlib_workspace();
209 if (IS_ERR(workspace))
210 return -1;
211
212 if (Z_OK != zlib_deflateInit(&workspace->def_strm, 3)) { 98 if (Z_OK != zlib_deflateInit(&workspace->def_strm, 3)) {
213 printk(KERN_WARNING "deflateInit failed\n"); 99 printk(KERN_WARNING "deflateInit failed\n");
214 ret = -1; 100 ret = -1;
@@ -222,6 +108,10 @@ int btrfs_zlib_compress_pages(struct address_space *mapping,
222 data_in = kmap(in_page); 108 data_in = kmap(in_page);
223 109
224 out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); 110 out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
111 if (out_page == NULL) {
112 ret = -1;
113 goto out;
114 }
225 cpage_out = kmap(out_page); 115 cpage_out = kmap(out_page);
226 pages[0] = out_page; 116 pages[0] = out_page;
227 nr_pages = 1; 117 nr_pages = 1;
@@ -260,6 +150,10 @@ int btrfs_zlib_compress_pages(struct address_space *mapping,
260 goto out; 150 goto out;
261 } 151 }
262 out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); 152 out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
153 if (out_page == NULL) {
154 ret = -1;
155 goto out;
156 }
263 cpage_out = kmap(out_page); 157 cpage_out = kmap(out_page);
264 pages[nr_pages] = out_page; 158 pages[nr_pages] = out_page;
265 nr_pages++; 159 nr_pages++;
@@ -314,55 +208,26 @@ out:
314 kunmap(in_page); 208 kunmap(in_page);
315 page_cache_release(in_page); 209 page_cache_release(in_page);
316 } 210 }
317 free_workspace(workspace);
318 return ret; 211 return ret;
319} 212}
320 213
321/* 214static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in,
322 * pages_in is an array of pages with compressed data. 215 u64 disk_start,
323 * 216 struct bio_vec *bvec,
324 * disk_start is the starting logical offset of this array in the file 217 int vcnt,
325 * 218 size_t srclen)
326 * bvec is a bio_vec of pages from the file that we want to decompress into
327 *
328 * vcnt is the count of pages in the biovec
329 *
330 * srclen is the number of bytes in pages_in
331 *
332 * The basic idea is that we have a bio that was created by readpages.
333 * The pages in the bio are for the uncompressed data, and they may not
334 * be contiguous. They all correspond to the range of bytes covered by
335 * the compressed extent.
336 */
337int btrfs_zlib_decompress_biovec(struct page **pages_in,
338 u64 disk_start,
339 struct bio_vec *bvec,
340 int vcnt,
341 size_t srclen)
342{ 219{
343 int ret = 0; 220 struct workspace *workspace = list_entry(ws, struct workspace, list);
221 int ret = 0, ret2;
344 int wbits = MAX_WBITS; 222 int wbits = MAX_WBITS;
345 struct workspace *workspace;
346 char *data_in; 223 char *data_in;
347 size_t total_out = 0; 224 size_t total_out = 0;
348 unsigned long page_bytes_left;
349 unsigned long page_in_index = 0; 225 unsigned long page_in_index = 0;
350 unsigned long page_out_index = 0; 226 unsigned long page_out_index = 0;
351 struct page *page_out;
352 unsigned long total_pages_in = (srclen + PAGE_CACHE_SIZE - 1) / 227 unsigned long total_pages_in = (srclen + PAGE_CACHE_SIZE - 1) /
353 PAGE_CACHE_SIZE; 228 PAGE_CACHE_SIZE;
354 unsigned long buf_start; 229 unsigned long buf_start;
355 unsigned long buf_offset;
356 unsigned long bytes;
357 unsigned long working_bytes;
358 unsigned long pg_offset; 230 unsigned long pg_offset;
359 unsigned long start_byte;
360 unsigned long current_buf_start;
361 char *kaddr;
362
363 workspace = find_zlib_workspace();
364 if (IS_ERR(workspace))
365 return -ENOMEM;
366 231
367 data_in = kmap(pages_in[page_in_index]); 232 data_in = kmap(pages_in[page_in_index]);
368 workspace->inf_strm.next_in = data_in; 233 workspace->inf_strm.next_in = data_in;
@@ -372,8 +237,6 @@ int btrfs_zlib_decompress_biovec(struct page **pages_in,
372 workspace->inf_strm.total_out = 0; 237 workspace->inf_strm.total_out = 0;
373 workspace->inf_strm.next_out = workspace->buf; 238 workspace->inf_strm.next_out = workspace->buf;
374 workspace->inf_strm.avail_out = PAGE_CACHE_SIZE; 239 workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
375 page_out = bvec[page_out_index].bv_page;
376 page_bytes_left = PAGE_CACHE_SIZE;
377 pg_offset = 0; 240 pg_offset = 0;
378 241
379 /* If it's deflate, and it's got no preset dictionary, then 242 /* If it's deflate, and it's got no preset dictionary, then
@@ -389,107 +252,29 @@ int btrfs_zlib_decompress_biovec(struct page **pages_in,
389 252
390 if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) { 253 if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) {
391 printk(KERN_WARNING "inflateInit failed\n"); 254 printk(KERN_WARNING "inflateInit failed\n");
392 ret = -1; 255 return -1;
393 goto out;
394 } 256 }
395 while (workspace->inf_strm.total_in < srclen) { 257 while (workspace->inf_strm.total_in < srclen) {
396 ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH); 258 ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH);
397 if (ret != Z_OK && ret != Z_STREAM_END) 259 if (ret != Z_OK && ret != Z_STREAM_END)
398 break; 260 break;
399 /*
400 * buf start is the byte offset we're of the start of
401 * our workspace buffer
402 */
403 buf_start = total_out;
404 261
405 /* total_out is the last byte of the workspace buffer */ 262 buf_start = total_out;
406 total_out = workspace->inf_strm.total_out; 263 total_out = workspace->inf_strm.total_out;
407 264
408 working_bytes = total_out - buf_start; 265 /* we didn't make progress in this inflate call, we're done */
409 266 if (buf_start == total_out)
410 /*
411 * start byte is the first byte of the page we're currently
412 * copying into relative to the start of the compressed data.
413 */
414 start_byte = page_offset(page_out) - disk_start;
415
416 if (working_bytes == 0) {
417 /* we didn't make progress in this inflate
418 * call, we're done
419 */
420 if (ret != Z_STREAM_END)
421 ret = -1;
422 break; 267 break;
423 }
424 268
425 /* we haven't yet hit data corresponding to this page */ 269 ret2 = btrfs_decompress_buf2page(workspace->buf, buf_start,
426 if (total_out <= start_byte) 270 total_out, disk_start,
427 goto next; 271 bvec, vcnt,
428 272 &page_out_index, &pg_offset);
429 /* 273 if (ret2 == 0) {
430 * the start of the data we care about is offset into 274 ret = 0;
431 * the middle of our working buffer 275 goto done;
432 */
433 if (total_out > start_byte && buf_start < start_byte) {
434 buf_offset = start_byte - buf_start;
435 working_bytes -= buf_offset;
436 } else {
437 buf_offset = 0;
438 }
439 current_buf_start = buf_start;
440
441 /* copy bytes from the working buffer into the pages */
442 while (working_bytes > 0) {
443 bytes = min(PAGE_CACHE_SIZE - pg_offset,
444 PAGE_CACHE_SIZE - buf_offset);
445 bytes = min(bytes, working_bytes);
446 kaddr = kmap_atomic(page_out, KM_USER0);
447 memcpy(kaddr + pg_offset, workspace->buf + buf_offset,
448 bytes);
449 kunmap_atomic(kaddr, KM_USER0);
450 flush_dcache_page(page_out);
451
452 pg_offset += bytes;
453 page_bytes_left -= bytes;
454 buf_offset += bytes;
455 working_bytes -= bytes;
456 current_buf_start += bytes;
457
458 /* check if we need to pick another page */
459 if (page_bytes_left == 0) {
460 page_out_index++;
461 if (page_out_index >= vcnt) {
462 ret = 0;
463 goto done;
464 }
465
466 page_out = bvec[page_out_index].bv_page;
467 pg_offset = 0;
468 page_bytes_left = PAGE_CACHE_SIZE;
469 start_byte = page_offset(page_out) - disk_start;
470
471 /*
472 * make sure our new page is covered by this
473 * working buffer
474 */
475 if (total_out <= start_byte)
476 goto next;
477
478 /* the next page in the biovec might not
479 * be adjacent to the last page, but it
480 * might still be found inside this working
481 * buffer. bump our offset pointer
482 */
483 if (total_out > start_byte &&
484 current_buf_start < start_byte) {
485 buf_offset = start_byte - buf_start;
486 working_bytes = total_out - start_byte;
487 current_buf_start = buf_start +
488 buf_offset;
489 }
490 }
491 } 276 }
492next: 277
493 workspace->inf_strm.next_out = workspace->buf; 278 workspace->inf_strm.next_out = workspace->buf;
494 workspace->inf_strm.avail_out = PAGE_CACHE_SIZE; 279 workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
495 280
@@ -516,35 +301,21 @@ done:
516 zlib_inflateEnd(&workspace->inf_strm); 301 zlib_inflateEnd(&workspace->inf_strm);
517 if (data_in) 302 if (data_in)
518 kunmap(pages_in[page_in_index]); 303 kunmap(pages_in[page_in_index]);
519out:
520 free_workspace(workspace);
521 return ret; 304 return ret;
522} 305}
523 306
524/* 307static int zlib_decompress(struct list_head *ws, unsigned char *data_in,
525 * a less complex decompression routine. Our compressed data fits in a 308 struct page *dest_page,
526 * single page, and we want to read a single page out of it. 309 unsigned long start_byte,
527 * start_byte tells us the offset into the compressed data we're interested in 310 size_t srclen, size_t destlen)
528 */
529int btrfs_zlib_decompress(unsigned char *data_in,
530 struct page *dest_page,
531 unsigned long start_byte,
532 size_t srclen, size_t destlen)
533{ 311{
312 struct workspace *workspace = list_entry(ws, struct workspace, list);
534 int ret = 0; 313 int ret = 0;
535 int wbits = MAX_WBITS; 314 int wbits = MAX_WBITS;
536 struct workspace *workspace;
537 unsigned long bytes_left = destlen; 315 unsigned long bytes_left = destlen;
538 unsigned long total_out = 0; 316 unsigned long total_out = 0;
539 char *kaddr; 317 char *kaddr;
540 318
541 if (destlen > PAGE_CACHE_SIZE)
542 return -ENOMEM;
543
544 workspace = find_zlib_workspace();
545 if (IS_ERR(workspace))
546 return -ENOMEM;
547
548 workspace->inf_strm.next_in = data_in; 319 workspace->inf_strm.next_in = data_in;
549 workspace->inf_strm.avail_in = srclen; 320 workspace->inf_strm.avail_in = srclen;
550 workspace->inf_strm.total_in = 0; 321 workspace->inf_strm.total_in = 0;
@@ -565,8 +336,7 @@ int btrfs_zlib_decompress(unsigned char *data_in,
565 336
566 if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) { 337 if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) {
567 printk(KERN_WARNING "inflateInit failed\n"); 338 printk(KERN_WARNING "inflateInit failed\n");
568 ret = -1; 339 return -1;
569 goto out;
570 } 340 }
571 341
572 while (bytes_left > 0) { 342 while (bytes_left > 0) {
@@ -616,12 +386,13 @@ next:
616 ret = 0; 386 ret = 0;
617 387
618 zlib_inflateEnd(&workspace->inf_strm); 388 zlib_inflateEnd(&workspace->inf_strm);
619out:
620 free_workspace(workspace);
621 return ret; 389 return ret;
622} 390}
623 391
624void btrfs_zlib_exit(void) 392struct btrfs_compress_op btrfs_zlib_compress = {
625{ 393 .alloc_workspace = zlib_alloc_workspace,
626 free_workspaces(); 394 .free_workspace = zlib_free_workspace,
627} 395 .compress_pages = zlib_compress_pages,
396 .decompress_biovec = zlib_decompress_biovec,
397 .decompress = zlib_decompress,
398};
diff --git a/fs/buffer.c b/fs/buffer.c
index 5930e382959..2219a76e2ca 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1270,12 +1270,10 @@ static inline void check_irqs_on(void)
1270static void bh_lru_install(struct buffer_head *bh) 1270static void bh_lru_install(struct buffer_head *bh)
1271{ 1271{
1272 struct buffer_head *evictee = NULL; 1272 struct buffer_head *evictee = NULL;
1273 struct bh_lru *lru;
1274 1273
1275 check_irqs_on(); 1274 check_irqs_on();
1276 bh_lru_lock(); 1275 bh_lru_lock();
1277 lru = &__get_cpu_var(bh_lrus); 1276 if (__this_cpu_read(bh_lrus.bhs[0]) != bh) {
1278 if (lru->bhs[0] != bh) {
1279 struct buffer_head *bhs[BH_LRU_SIZE]; 1277 struct buffer_head *bhs[BH_LRU_SIZE];
1280 int in; 1278 int in;
1281 int out = 0; 1279 int out = 0;
@@ -1283,7 +1281,8 @@ static void bh_lru_install(struct buffer_head *bh)
1283 get_bh(bh); 1281 get_bh(bh);
1284 bhs[out++] = bh; 1282 bhs[out++] = bh;
1285 for (in = 0; in < BH_LRU_SIZE; in++) { 1283 for (in = 0; in < BH_LRU_SIZE; in++) {
1286 struct buffer_head *bh2 = lru->bhs[in]; 1284 struct buffer_head *bh2 =
1285 __this_cpu_read(bh_lrus.bhs[in]);
1287 1286
1288 if (bh2 == bh) { 1287 if (bh2 == bh) {
1289 __brelse(bh2); 1288 __brelse(bh2);
@@ -1298,7 +1297,7 @@ static void bh_lru_install(struct buffer_head *bh)
1298 } 1297 }
1299 while (out < BH_LRU_SIZE) 1298 while (out < BH_LRU_SIZE)
1300 bhs[out++] = NULL; 1299 bhs[out++] = NULL;
1301 memcpy(lru->bhs, bhs, sizeof(bhs)); 1300 memcpy(__this_cpu_ptr(&bh_lrus.bhs), bhs, sizeof(bhs));
1302 } 1301 }
1303 bh_lru_unlock(); 1302 bh_lru_unlock();
1304 1303
@@ -1313,23 +1312,22 @@ static struct buffer_head *
1313lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size) 1312lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size)
1314{ 1313{
1315 struct buffer_head *ret = NULL; 1314 struct buffer_head *ret = NULL;
1316 struct bh_lru *lru;
1317 unsigned int i; 1315 unsigned int i;
1318 1316
1319 check_irqs_on(); 1317 check_irqs_on();
1320 bh_lru_lock(); 1318 bh_lru_lock();
1321 lru = &__get_cpu_var(bh_lrus);
1322 for (i = 0; i < BH_LRU_SIZE; i++) { 1319 for (i = 0; i < BH_LRU_SIZE; i++) {
1323 struct buffer_head *bh = lru->bhs[i]; 1320 struct buffer_head *bh = __this_cpu_read(bh_lrus.bhs[i]);
1324 1321
1325 if (bh && bh->b_bdev == bdev && 1322 if (bh && bh->b_bdev == bdev &&
1326 bh->b_blocknr == block && bh->b_size == size) { 1323 bh->b_blocknr == block && bh->b_size == size) {
1327 if (i) { 1324 if (i) {
1328 while (i) { 1325 while (i) {
1329 lru->bhs[i] = lru->bhs[i - 1]; 1326 __this_cpu_write(bh_lrus.bhs[i],
1327 __this_cpu_read(bh_lrus.bhs[i - 1]));
1330 i--; 1328 i--;
1331 } 1329 }
1332 lru->bhs[0] = bh; 1330 __this_cpu_write(bh_lrus.bhs[0], bh);
1333 } 1331 }
1334 get_bh(bh); 1332 get_bh(bh);
1335 ret = bh; 1333 ret = bh;
@@ -3203,22 +3201,23 @@ static void recalc_bh_state(void)
3203 int i; 3201 int i;
3204 int tot = 0; 3202 int tot = 0;
3205 3203
3206 if (__get_cpu_var(bh_accounting).ratelimit++ < 4096) 3204 if (__this_cpu_inc_return(bh_accounting.ratelimit) - 1 < 4096)
3207 return; 3205 return;
3208 __get_cpu_var(bh_accounting).ratelimit = 0; 3206 __this_cpu_write(bh_accounting.ratelimit, 0);
3209 for_each_online_cpu(i) 3207 for_each_online_cpu(i)
3210 tot += per_cpu(bh_accounting, i).nr; 3208 tot += per_cpu(bh_accounting, i).nr;
3211 buffer_heads_over_limit = (tot > max_buffer_heads); 3209 buffer_heads_over_limit = (tot > max_buffer_heads);
3212} 3210}
3213 3211
3214struct buffer_head *alloc_buffer_head(gfp_t gfp_flags) 3212struct buffer_head *alloc_buffer_head(gfp_t gfp_flags)
3215{ 3213{
3216 struct buffer_head *ret = kmem_cache_zalloc(bh_cachep, gfp_flags); 3214 struct buffer_head *ret = kmem_cache_zalloc(bh_cachep, gfp_flags);
3217 if (ret) { 3215 if (ret) {
3218 INIT_LIST_HEAD(&ret->b_assoc_buffers); 3216 INIT_LIST_HEAD(&ret->b_assoc_buffers);
3219 get_cpu_var(bh_accounting).nr++; 3217 preempt_disable();
3218 __this_cpu_inc(bh_accounting.nr);
3220 recalc_bh_state(); 3219 recalc_bh_state();
3221 put_cpu_var(bh_accounting); 3220 preempt_enable();
3222 } 3221 }
3223 return ret; 3222 return ret;
3224} 3223}
@@ -3228,9 +3227,10 @@ void free_buffer_head(struct buffer_head *bh)
3228{ 3227{
3229 BUG_ON(!list_empty(&bh->b_assoc_buffers)); 3228 BUG_ON(!list_empty(&bh->b_assoc_buffers));
3230 kmem_cache_free(bh_cachep, bh); 3229 kmem_cache_free(bh_cachep, bh);
3231 get_cpu_var(bh_accounting).nr--; 3230 preempt_disable();
3231 __this_cpu_dec(bh_accounting.nr);
3232 recalc_bh_state(); 3232 recalc_bh_state();
3233 put_cpu_var(bh_accounting); 3233 preempt_enable();
3234} 3234}
3235EXPORT_SYMBOL(free_buffer_head); 3235EXPORT_SYMBOL(free_buffer_head);
3236 3236
@@ -3243,9 +3243,8 @@ static void buffer_exit_cpu(int cpu)
3243 brelse(b->bhs[i]); 3243 brelse(b->bhs[i]);
3244 b->bhs[i] = NULL; 3244 b->bhs[i] = NULL;
3245 } 3245 }
3246 get_cpu_var(bh_accounting).nr += per_cpu(bh_accounting, cpu).nr; 3246 this_cpu_add(bh_accounting.nr, per_cpu(bh_accounting, cpu).nr);
3247 per_cpu(bh_accounting, cpu).nr = 0; 3247 per_cpu(bh_accounting, cpu).nr = 0;
3248 put_cpu_var(bh_accounting);
3249} 3248}
3250 3249
3251static int buffer_cpu_notify(struct notifier_block *self, 3250static int buffer_cpu_notify(struct notifier_block *self,
diff --git a/fs/ceph/Makefile b/fs/ceph/Makefile
index 9e6c4f2e8ff..bd352125e82 100644
--- a/fs/ceph/Makefile
+++ b/fs/ceph/Makefile
@@ -2,31 +2,10 @@
2# Makefile for CEPH filesystem. 2# Makefile for CEPH filesystem.
3# 3#
4 4
5ifneq ($(KERNELRELEASE),)
6
7obj-$(CONFIG_CEPH_FS) += ceph.o 5obj-$(CONFIG_CEPH_FS) += ceph.o
8 6
9ceph-objs := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \ 7ceph-y := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \
10 export.o caps.o snap.o xattr.o \ 8 export.o caps.o snap.o xattr.o \
11 mds_client.o mdsmap.o strings.o ceph_frag.o \ 9 mds_client.o mdsmap.o strings.o ceph_frag.o \
12 debugfs.o 10 debugfs.o
13 11
14else
15#Otherwise we were called directly from the command
16# line; invoke the kernel build system.
17
18KERNELDIR ?= /lib/modules/$(shell uname -r)/build
19PWD := $(shell pwd)
20
21default: all
22
23all:
24 $(MAKE) -C $(KERNELDIR) M=$(PWD) CONFIG_CEPH_FS=m modules
25
26modules_install:
27 $(MAKE) -C $(KERNELDIR) M=$(PWD) CONFIG_CEPH_FS=m modules_install
28
29clean:
30 $(MAKE) -C $(KERNELDIR) M=$(PWD) clean
31
32endif
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index e9c874abc9e..561438b6a50 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -204,7 +204,7 @@ static int readpage_nounlock(struct file *filp, struct page *page)
204 err = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout, 204 err = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout,
205 page->index << PAGE_CACHE_SHIFT, &len, 205 page->index << PAGE_CACHE_SHIFT, &len,
206 ci->i_truncate_seq, ci->i_truncate_size, 206 ci->i_truncate_seq, ci->i_truncate_size,
207 &page, 1); 207 &page, 1, 0);
208 if (err == -ENOENT) 208 if (err == -ENOENT)
209 err = 0; 209 err = 0;
210 if (err < 0) { 210 if (err < 0) {
@@ -287,7 +287,7 @@ static int ceph_readpages(struct file *file, struct address_space *mapping,
287 rc = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout, 287 rc = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout,
288 offset, &len, 288 offset, &len,
289 ci->i_truncate_seq, ci->i_truncate_size, 289 ci->i_truncate_seq, ci->i_truncate_size,
290 pages, nr_pages); 290 pages, nr_pages, 0);
291 if (rc == -ENOENT) 291 if (rc == -ENOENT)
292 rc = 0; 292 rc = 0;
293 if (rc < 0) 293 if (rc < 0)
@@ -774,7 +774,7 @@ get_more_pages:
774 snapc, do_sync, 774 snapc, do_sync,
775 ci->i_truncate_seq, 775 ci->i_truncate_seq,
776 ci->i_truncate_size, 776 ci->i_truncate_size,
777 &inode->i_mtime, true, 1); 777 &inode->i_mtime, true, 1, 0);
778 max_pages = req->r_num_pages; 778 max_pages = req->r_num_pages;
779 779
780 alloc_page_vec(fsc, req); 780 alloc_page_vec(fsc, req);
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 98ab13e2b71..6b61ded701e 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1430,8 +1430,8 @@ static int try_nonblocking_invalidate(struct inode *inode)
1430 invalidating_gen == ci->i_rdcache_gen) { 1430 invalidating_gen == ci->i_rdcache_gen) {
1431 /* success. */ 1431 /* success. */
1432 dout("try_nonblocking_invalidate %p success\n", inode); 1432 dout("try_nonblocking_invalidate %p success\n", inode);
1433 ci->i_rdcache_gen = 0; 1433 /* save any racing async invalidate some trouble */
1434 ci->i_rdcache_revoking = 0; 1434 ci->i_rdcache_revoking = ci->i_rdcache_gen - 1;
1435 return 0; 1435 return 0;
1436 } 1436 }
1437 dout("try_nonblocking_invalidate %p failed\n", inode); 1437 dout("try_nonblocking_invalidate %p failed\n", inode);
@@ -1560,9 +1560,10 @@ retry_locked:
1560 /* NOTE: no side-effects allowed, until we take s_mutex */ 1560 /* NOTE: no side-effects allowed, until we take s_mutex */
1561 1561
1562 revoking = cap->implemented & ~cap->issued; 1562 revoking = cap->implemented & ~cap->issued;
1563 if (revoking) 1563 dout(" mds%d cap %p issued %s implemented %s revoking %s\n",
1564 dout(" mds%d revoking %s\n", cap->mds, 1564 cap->mds, cap, ceph_cap_string(cap->issued),
1565 ceph_cap_string(revoking)); 1565 ceph_cap_string(cap->implemented),
1566 ceph_cap_string(revoking));
1566 1567
1567 if (cap == ci->i_auth_cap && 1568 if (cap == ci->i_auth_cap &&
1568 (cap->issued & CEPH_CAP_FILE_WR)) { 1569 (cap->issued & CEPH_CAP_FILE_WR)) {
@@ -1658,6 +1659,8 @@ ack:
1658 1659
1659 if (cap == ci->i_auth_cap && ci->i_dirty_caps) 1660 if (cap == ci->i_auth_cap && ci->i_dirty_caps)
1660 flushing = __mark_caps_flushing(inode, session); 1661 flushing = __mark_caps_flushing(inode, session);
1662 else
1663 flushing = 0;
1661 1664
1662 mds = cap->mds; /* remember mds, so we don't repeat */ 1665 mds = cap->mds; /* remember mds, so we don't repeat */
1663 sent++; 1666 sent++;
@@ -1940,6 +1943,35 @@ void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
1940 } 1943 }
1941} 1944}
1942 1945
1946static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc,
1947 struct ceph_mds_session *session,
1948 struct inode *inode)
1949{
1950 struct ceph_inode_info *ci = ceph_inode(inode);
1951 struct ceph_cap *cap;
1952 int delayed = 0;
1953
1954 spin_lock(&inode->i_lock);
1955 cap = ci->i_auth_cap;
1956 dout("kick_flushing_inode_caps %p flushing %s flush_seq %lld\n", inode,
1957 ceph_cap_string(ci->i_flushing_caps), ci->i_cap_flush_seq);
1958 __ceph_flush_snaps(ci, &session, 1);
1959 if (ci->i_flushing_caps) {
1960 delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
1961 __ceph_caps_used(ci),
1962 __ceph_caps_wanted(ci),
1963 cap->issued | cap->implemented,
1964 ci->i_flushing_caps, NULL);
1965 if (delayed) {
1966 spin_lock(&inode->i_lock);
1967 __cap_delay_requeue(mdsc, ci);
1968 spin_unlock(&inode->i_lock);
1969 }
1970 } else {
1971 spin_unlock(&inode->i_lock);
1972 }
1973}
1974
1943 1975
1944/* 1976/*
1945 * Take references to capabilities we hold, so that we don't release 1977 * Take references to capabilities we hold, so that we don't release
@@ -2273,8 +2305,7 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
2273{ 2305{
2274 struct ceph_inode_info *ci = ceph_inode(inode); 2306 struct ceph_inode_info *ci = ceph_inode(inode);
2275 int mds = session->s_mds; 2307 int mds = session->s_mds;
2276 unsigned seq = le32_to_cpu(grant->seq); 2308 int seq = le32_to_cpu(grant->seq);
2277 unsigned issue_seq = le32_to_cpu(grant->issue_seq);
2278 int newcaps = le32_to_cpu(grant->caps); 2309 int newcaps = le32_to_cpu(grant->caps);
2279 int issued, implemented, used, wanted, dirty; 2310 int issued, implemented, used, wanted, dirty;
2280 u64 size = le64_to_cpu(grant->size); 2311 u64 size = le64_to_cpu(grant->size);
@@ -2286,8 +2317,8 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
2286 int revoked_rdcache = 0; 2317 int revoked_rdcache = 0;
2287 int queue_invalidate = 0; 2318 int queue_invalidate = 0;
2288 2319
2289 dout("handle_cap_grant inode %p cap %p mds%d seq %u/%u %s\n", 2320 dout("handle_cap_grant inode %p cap %p mds%d seq %d %s\n",
2290 inode, cap, mds, seq, issue_seq, ceph_cap_string(newcaps)); 2321 inode, cap, mds, seq, ceph_cap_string(newcaps));
2291 dout(" size %llu max_size %llu, i_size %llu\n", size, max_size, 2322 dout(" size %llu max_size %llu, i_size %llu\n", size, max_size,
2292 inode->i_size); 2323 inode->i_size);
2293 2324
@@ -2383,7 +2414,6 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
2383 } 2414 }
2384 2415
2385 cap->seq = seq; 2416 cap->seq = seq;
2386 cap->issue_seq = issue_seq;
2387 2417
2388 /* file layout may have changed */ 2418 /* file layout may have changed */
2389 ci->i_layout = grant->layout; 2419 ci->i_layout = grant->layout;
@@ -2689,8 +2719,13 @@ static void handle_cap_import(struct ceph_mds_client *mdsc,
2689 ceph_add_cap(inode, session, cap_id, -1, 2719 ceph_add_cap(inode, session, cap_id, -1,
2690 issued, wanted, seq, mseq, realmino, CEPH_CAP_FLAG_AUTH, 2720 issued, wanted, seq, mseq, realmino, CEPH_CAP_FLAG_AUTH,
2691 NULL /* no caps context */); 2721 NULL /* no caps context */);
2692 try_flush_caps(inode, session, NULL); 2722 kick_flushing_inode_caps(mdsc, session, inode);
2693 up_read(&mdsc->snap_rwsem); 2723 up_read(&mdsc->snap_rwsem);
2724
2725 /* make sure we re-request max_size, if necessary */
2726 spin_lock(&inode->i_lock);
2727 ci->i_requested_max_size = 0;
2728 spin_unlock(&inode->i_lock);
2694} 2729}
2695 2730
2696/* 2731/*
@@ -2782,8 +2817,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
2782 case CEPH_CAP_OP_IMPORT: 2817 case CEPH_CAP_OP_IMPORT:
2783 handle_cap_import(mdsc, inode, h, session, 2818 handle_cap_import(mdsc, inode, h, session,
2784 snaptrace, snaptrace_len); 2819 snaptrace, snaptrace_len);
2785 ceph_check_caps(ceph_inode(inode), CHECK_CAPS_NODELAY, 2820 ceph_check_caps(ceph_inode(inode), 0, session);
2786 session);
2787 goto done_unlocked; 2821 goto done_unlocked;
2788 } 2822 }
2789 2823
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index 7ae1b3d55b5..08f65faac11 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -60,10 +60,13 @@ static int mdsc_show(struct seq_file *s, void *p)
60 for (rp = rb_first(&mdsc->request_tree); rp; rp = rb_next(rp)) { 60 for (rp = rb_first(&mdsc->request_tree); rp; rp = rb_next(rp)) {
61 req = rb_entry(rp, struct ceph_mds_request, r_node); 61 req = rb_entry(rp, struct ceph_mds_request, r_node);
62 62
63 if (req->r_request) 63 if (req->r_request && req->r_session)
64 seq_printf(s, "%lld\tmds%d\t", req->r_tid, req->r_mds); 64 seq_printf(s, "%lld\tmds%d\t", req->r_tid,
65 else 65 req->r_session->s_mds);
66 else if (!req->r_request)
66 seq_printf(s, "%lld\t(no request)\t", req->r_tid); 67 seq_printf(s, "%lld\t(no request)\t", req->r_tid);
68 else
69 seq_printf(s, "%lld\t(no session)\t", req->r_tid);
67 70
68 seq_printf(s, "%s", ceph_mds_op_name(req->r_op)); 71 seq_printf(s, "%s", ceph_mds_op_name(req->r_op));
69 72
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index e0a2dc6fcaf..0bc68de8edd 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -40,12 +40,13 @@ int ceph_init_dentry(struct dentry *dentry)
40 if (dentry->d_fsdata) 40 if (dentry->d_fsdata)
41 return 0; 41 return 0;
42 42
43 if (ceph_snap(dentry->d_parent->d_inode) == CEPH_NOSNAP) 43 if (dentry->d_parent == NULL || /* nfs fh_to_dentry */
44 dentry->d_op = &ceph_dentry_ops; 44 ceph_snap(dentry->d_parent->d_inode) == CEPH_NOSNAP)
45 d_set_d_op(dentry, &ceph_dentry_ops);
45 else if (ceph_snap(dentry->d_parent->d_inode) == CEPH_SNAPDIR) 46 else if (ceph_snap(dentry->d_parent->d_inode) == CEPH_SNAPDIR)
46 dentry->d_op = &ceph_snapdir_dentry_ops; 47 d_set_d_op(dentry, &ceph_snapdir_dentry_ops);
47 else 48 else
48 dentry->d_op = &ceph_snap_dentry_ops; 49 d_set_d_op(dentry, &ceph_snap_dentry_ops);
49 50
50 di = kmem_cache_alloc(ceph_dentry_cachep, GFP_NOFS | __GFP_ZERO); 51 di = kmem_cache_alloc(ceph_dentry_cachep, GFP_NOFS | __GFP_ZERO);
51 if (!di) 52 if (!di)
@@ -111,11 +112,11 @@ static int __dcache_readdir(struct file *filp,
111 dout("__dcache_readdir %p at %llu (last %p)\n", dir, filp->f_pos, 112 dout("__dcache_readdir %p at %llu (last %p)\n", dir, filp->f_pos,
112 last); 113 last);
113 114
114 spin_lock(&dcache_lock); 115 spin_lock(&parent->d_lock);
115 116
116 /* start at beginning? */ 117 /* start at beginning? */
117 if (filp->f_pos == 2 || (last && 118 if (filp->f_pos == 2 || last == NULL ||
118 filp->f_pos < ceph_dentry(last)->offset)) { 119 filp->f_pos < ceph_dentry(last)->offset) {
119 if (list_empty(&parent->d_subdirs)) 120 if (list_empty(&parent->d_subdirs))
120 goto out_unlock; 121 goto out_unlock;
121 p = parent->d_subdirs.prev; 122 p = parent->d_subdirs.prev;
@@ -135,6 +136,7 @@ more:
135 fi->at_end = 1; 136 fi->at_end = 1;
136 goto out_unlock; 137 goto out_unlock;
137 } 138 }
139 spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
138 if (!d_unhashed(dentry) && dentry->d_inode && 140 if (!d_unhashed(dentry) && dentry->d_inode &&
139 ceph_snap(dentry->d_inode) != CEPH_SNAPDIR && 141 ceph_snap(dentry->d_inode) != CEPH_SNAPDIR &&
140 ceph_ino(dentry->d_inode) != CEPH_INO_CEPH && 142 ceph_ino(dentry->d_inode) != CEPH_INO_CEPH &&
@@ -144,13 +146,15 @@ more:
144 dentry->d_name.len, dentry->d_name.name, di->offset, 146 dentry->d_name.len, dentry->d_name.name, di->offset,
145 filp->f_pos, d_unhashed(dentry) ? " unhashed" : "", 147 filp->f_pos, d_unhashed(dentry) ? " unhashed" : "",
146 !dentry->d_inode ? " null" : ""); 148 !dentry->d_inode ? " null" : "");
149 spin_unlock(&dentry->d_lock);
147 p = p->prev; 150 p = p->prev;
148 dentry = list_entry(p, struct dentry, d_u.d_child); 151 dentry = list_entry(p, struct dentry, d_u.d_child);
149 di = ceph_dentry(dentry); 152 di = ceph_dentry(dentry);
150 } 153 }
151 154
152 atomic_inc(&dentry->d_count); 155 dget_dlock(dentry);
153 spin_unlock(&dcache_lock); 156 spin_unlock(&dentry->d_lock);
157 spin_unlock(&parent->d_lock);
154 158
155 dout(" %llu (%llu) dentry %p %.*s %p\n", di->offset, filp->f_pos, 159 dout(" %llu (%llu) dentry %p %.*s %p\n", di->offset, filp->f_pos,
156 dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode); 160 dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode);
@@ -176,19 +180,19 @@ more:
176 180
177 filp->f_pos++; 181 filp->f_pos++;
178 182
179 /* make sure a dentry wasn't dropped while we didn't have dcache_lock */ 183 /* make sure a dentry wasn't dropped while we didn't have parent lock */
180 if (!ceph_i_test(dir, CEPH_I_COMPLETE)) { 184 if (!ceph_i_test(dir, CEPH_I_COMPLETE)) {
181 dout(" lost I_COMPLETE on %p; falling back to mds\n", dir); 185 dout(" lost I_COMPLETE on %p; falling back to mds\n", dir);
182 err = -EAGAIN; 186 err = -EAGAIN;
183 goto out; 187 goto out;
184 } 188 }
185 189
186 spin_lock(&dcache_lock); 190 spin_lock(&parent->d_lock);
187 p = p->prev; /* advance to next dentry */ 191 p = p->prev; /* advance to next dentry */
188 goto more; 192 goto more;
189 193
190out_unlock: 194out_unlock:
191 spin_unlock(&dcache_lock); 195 spin_unlock(&parent->d_lock);
192out: 196out:
193 if (last) 197 if (last)
194 dput(last); 198 dput(last);
@@ -336,7 +340,10 @@ more:
336 if (req->r_reply_info.dir_end) { 340 if (req->r_reply_info.dir_end) {
337 kfree(fi->last_name); 341 kfree(fi->last_name);
338 fi->last_name = NULL; 342 fi->last_name = NULL;
339 fi->next_offset = 2; 343 if (ceph_frag_is_rightmost(frag))
344 fi->next_offset = 2;
345 else
346 fi->next_offset = 0;
340 } else { 347 } else {
341 rinfo = &req->r_reply_info; 348 rinfo = &req->r_reply_info;
342 err = note_last_dentry(fi, 349 err = note_last_dentry(fi,
@@ -355,18 +362,22 @@ more:
355 u64 pos = ceph_make_fpos(frag, off); 362 u64 pos = ceph_make_fpos(frag, off);
356 struct ceph_mds_reply_inode *in = 363 struct ceph_mds_reply_inode *in =
357 rinfo->dir_in[off - fi->offset].in; 364 rinfo->dir_in[off - fi->offset].in;
365 struct ceph_vino vino;
366 ino_t ino;
367
358 dout("readdir off %d (%d/%d) -> %lld '%.*s' %p\n", 368 dout("readdir off %d (%d/%d) -> %lld '%.*s' %p\n",
359 off, off - fi->offset, rinfo->dir_nr, pos, 369 off, off - fi->offset, rinfo->dir_nr, pos,
360 rinfo->dir_dname_len[off - fi->offset], 370 rinfo->dir_dname_len[off - fi->offset],
361 rinfo->dir_dname[off - fi->offset], in); 371 rinfo->dir_dname[off - fi->offset], in);
362 BUG_ON(!in); 372 BUG_ON(!in);
363 ftype = le32_to_cpu(in->mode) >> 12; 373 ftype = le32_to_cpu(in->mode) >> 12;
374 vino.ino = le64_to_cpu(in->ino);
375 vino.snap = le64_to_cpu(in->snapid);
376 ino = ceph_vino_to_ino(vino);
364 if (filldir(dirent, 377 if (filldir(dirent,
365 rinfo->dir_dname[off - fi->offset], 378 rinfo->dir_dname[off - fi->offset],
366 rinfo->dir_dname_len[off - fi->offset], 379 rinfo->dir_dname_len[off - fi->offset],
367 pos, 380 pos, ino, ftype) < 0) {
368 le64_to_cpu(in->ino),
369 ftype) < 0) {
370 dout("filldir stopping us...\n"); 381 dout("filldir stopping us...\n");
371 return 0; 382 return 0;
372 } 383 }
@@ -414,6 +425,7 @@ static void reset_readdir(struct ceph_file_info *fi)
414 fi->last_readdir = NULL; 425 fi->last_readdir = NULL;
415 } 426 }
416 kfree(fi->last_name); 427 kfree(fi->last_name);
428 fi->last_name = NULL;
417 fi->next_offset = 2; /* compensate for . and .. */ 429 fi->next_offset = 2; /* compensate for . and .. */
418 if (fi->dentry) { 430 if (fi->dentry) {
419 dput(fi->dentry); 431 dput(fi->dentry);
@@ -978,7 +990,12 @@ static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry)
978 */ 990 */
979static int ceph_d_revalidate(struct dentry *dentry, struct nameidata *nd) 991static int ceph_d_revalidate(struct dentry *dentry, struct nameidata *nd)
980{ 992{
981 struct inode *dir = dentry->d_parent->d_inode; 993 struct inode *dir;
994
995 if (nd->flags & LOOKUP_RCU)
996 return -ECHILD;
997
998 dir = dentry->d_parent->d_inode;
982 999
983 dout("d_revalidate %p '%.*s' inode %p offset %lld\n", dentry, 1000 dout("d_revalidate %p '%.*s' inode %p offset %lld\n", dentry,
984 dentry->d_name.len, dentry->d_name.name, dentry->d_inode, 1001 dentry->d_name.len, dentry->d_name.name, dentry->d_inode,
@@ -1207,6 +1224,26 @@ void ceph_dentry_lru_del(struct dentry *dn)
1207 } 1224 }
1208} 1225}
1209 1226
1227/*
1228 * Return name hash for a given dentry. This is dependent on
1229 * the parent directory's hash function.
1230 */
1231unsigned ceph_dentry_hash(struct dentry *dn)
1232{
1233 struct inode *dir = dn->d_parent->d_inode;
1234 struct ceph_inode_info *dci = ceph_inode(dir);
1235
1236 switch (dci->i_dir_layout.dl_dir_hash) {
1237 case 0: /* for backward compat */
1238 case CEPH_STR_HASH_LINUX:
1239 return dn->d_name.hash;
1240
1241 default:
1242 return ceph_str_hash(dci->i_dir_layout.dl_dir_hash,
1243 dn->d_name.name, dn->d_name.len);
1244 }
1245}
1246
1210const struct file_operations ceph_dir_fops = { 1247const struct file_operations ceph_dir_fops = {
1211 .read = ceph_read_dir, 1248 .read = ceph_read_dir,
1212 .readdir = ceph_readdir, 1249 .readdir = ceph_readdir,
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
index 2297d942699..e41056174bf 100644
--- a/fs/ceph/export.c
+++ b/fs/ceph/export.c
@@ -59,7 +59,7 @@ static int ceph_encode_fh(struct dentry *dentry, u32 *rawfh, int *max_len,
59 dout("encode_fh %p connectable\n", dentry); 59 dout("encode_fh %p connectable\n", dentry);
60 cfh->ino = ceph_ino(dentry->d_inode); 60 cfh->ino = ceph_ino(dentry->d_inode);
61 cfh->parent_ino = ceph_ino(parent->d_inode); 61 cfh->parent_ino = ceph_ino(parent->d_inode);
62 cfh->parent_name_hash = parent->d_name.hash; 62 cfh->parent_name_hash = ceph_dentry_hash(parent);
63 *max_len = connected_handle_length; 63 *max_len = connected_handle_length;
64 type = 2; 64 type = 2;
65 } else if (*max_len >= handle_length) { 65 } else if (*max_len >= handle_length) {
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index e77c28cf369..7d0e4a82d89 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -154,11 +154,13 @@ int ceph_open(struct inode *inode, struct file *file)
154 } 154 }
155 155
156 /* 156 /*
157 * No need to block if we have any caps. Update wanted set 157 * No need to block if we have caps on the auth MDS (for
158 * write) or any MDS (for read). Update wanted set
158 * asynchronously. 159 * asynchronously.
159 */ 160 */
160 spin_lock(&inode->i_lock); 161 spin_lock(&inode->i_lock);
161 if (__ceph_is_any_real_caps(ci)) { 162 if (__ceph_is_any_real_caps(ci) &&
163 (((fmode & CEPH_FILE_MODE_WR) == 0) || ci->i_auth_cap)) {
162 int mds_wanted = __ceph_caps_mds_wanted(ci); 164 int mds_wanted = __ceph_caps_mds_wanted(ci);
163 int issued = __ceph_caps_issued(ci, NULL); 165 int issued = __ceph_caps_issued(ci, NULL);
164 166
@@ -280,11 +282,13 @@ int ceph_release(struct inode *inode, struct file *file)
280static int striped_read(struct inode *inode, 282static int striped_read(struct inode *inode,
281 u64 off, u64 len, 283 u64 off, u64 len,
282 struct page **pages, int num_pages, 284 struct page **pages, int num_pages,
283 int *checkeof) 285 int *checkeof, bool align_to_pages,
286 unsigned long buf_align)
284{ 287{
285 struct ceph_fs_client *fsc = ceph_inode_to_client(inode); 288 struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
286 struct ceph_inode_info *ci = ceph_inode(inode); 289 struct ceph_inode_info *ci = ceph_inode(inode);
287 u64 pos, this_len; 290 u64 pos, this_len;
291 int io_align, page_align;
288 int page_off = off & ~PAGE_CACHE_MASK; /* first byte's offset in page */ 292 int page_off = off & ~PAGE_CACHE_MASK; /* first byte's offset in page */
289 int left, pages_left; 293 int left, pages_left;
290 int read; 294 int read;
@@ -300,14 +304,19 @@ static int striped_read(struct inode *inode,
300 page_pos = pages; 304 page_pos = pages;
301 pages_left = num_pages; 305 pages_left = num_pages;
302 read = 0; 306 read = 0;
307 io_align = off & ~PAGE_MASK;
303 308
304more: 309more:
310 if (align_to_pages)
311 page_align = (pos - io_align + buf_align) & ~PAGE_MASK;
312 else
313 page_align = pos & ~PAGE_MASK;
305 this_len = left; 314 this_len = left;
306 ret = ceph_osdc_readpages(&fsc->client->osdc, ceph_vino(inode), 315 ret = ceph_osdc_readpages(&fsc->client->osdc, ceph_vino(inode),
307 &ci->i_layout, pos, &this_len, 316 &ci->i_layout, pos, &this_len,
308 ci->i_truncate_seq, 317 ci->i_truncate_seq,
309 ci->i_truncate_size, 318 ci->i_truncate_size,
310 page_pos, pages_left); 319 page_pos, pages_left, page_align);
311 hit_stripe = this_len < left; 320 hit_stripe = this_len < left;
312 was_short = ret >= 0 && ret < this_len; 321 was_short = ret >= 0 && ret < this_len;
313 if (ret == -ENOENT) 322 if (ret == -ENOENT)
@@ -368,32 +377,34 @@ static ssize_t ceph_sync_read(struct file *file, char __user *data,
368 struct inode *inode = file->f_dentry->d_inode; 377 struct inode *inode = file->f_dentry->d_inode;
369 struct page **pages; 378 struct page **pages;
370 u64 off = *poff; 379 u64 off = *poff;
371 int num_pages = calc_pages_for(off, len); 380 int num_pages, ret;
372 int ret;
373 381
374 dout("sync_read on file %p %llu~%u %s\n", file, off, len, 382 dout("sync_read on file %p %llu~%u %s\n", file, off, len,
375 (file->f_flags & O_DIRECT) ? "O_DIRECT" : ""); 383 (file->f_flags & O_DIRECT) ? "O_DIRECT" : "");
376 384
377 if (file->f_flags & O_DIRECT) { 385 if (file->f_flags & O_DIRECT) {
378 pages = ceph_get_direct_page_vector(data, num_pages, off, len); 386 num_pages = calc_pages_for((unsigned long)data, len);
379 387 pages = ceph_get_direct_page_vector(data, num_pages, true);
380 /*
381 * flush any page cache pages in this range. this
382 * will make concurrent normal and O_DIRECT io slow,
383 * but it will at least behave sensibly when they are
384 * in sequence.
385 */
386 } else { 388 } else {
389 num_pages = calc_pages_for(off, len);
387 pages = ceph_alloc_page_vector(num_pages, GFP_NOFS); 390 pages = ceph_alloc_page_vector(num_pages, GFP_NOFS);
388 } 391 }
389 if (IS_ERR(pages)) 392 if (IS_ERR(pages))
390 return PTR_ERR(pages); 393 return PTR_ERR(pages);
391 394
395 /*
396 * flush any page cache pages in this range. this
397 * will make concurrent normal and sync io slow,
398 * but it will at least behave sensibly when they are
399 * in sequence.
400 */
392 ret = filemap_write_and_wait(inode->i_mapping); 401 ret = filemap_write_and_wait(inode->i_mapping);
393 if (ret < 0) 402 if (ret < 0)
394 goto done; 403 goto done;
395 404
396 ret = striped_read(inode, off, len, pages, num_pages, checkeof); 405 ret = striped_read(inode, off, len, pages, num_pages, checkeof,
406 file->f_flags & O_DIRECT,
407 (unsigned long)data & ~PAGE_MASK);
397 408
398 if (ret >= 0 && (file->f_flags & O_DIRECT) == 0) 409 if (ret >= 0 && (file->f_flags & O_DIRECT) == 0)
399 ret = ceph_copy_page_vector_to_user(pages, data, off, ret); 410 ret = ceph_copy_page_vector_to_user(pages, data, off, ret);
@@ -402,7 +413,7 @@ static ssize_t ceph_sync_read(struct file *file, char __user *data,
402 413
403done: 414done:
404 if (file->f_flags & O_DIRECT) 415 if (file->f_flags & O_DIRECT)
405 ceph_put_page_vector(pages, num_pages); 416 ceph_put_page_vector(pages, num_pages, true);
406 else 417 else
407 ceph_release_page_vector(pages, num_pages); 418 ceph_release_page_vector(pages, num_pages);
408 dout("sync_read result %d\n", ret); 419 dout("sync_read result %d\n", ret);
@@ -448,6 +459,8 @@ static ssize_t ceph_sync_write(struct file *file, const char __user *data,
448 int flags; 459 int flags;
449 int do_sync = 0; 460 int do_sync = 0;
450 int check_caps = 0; 461 int check_caps = 0;
462 int page_align, io_align;
463 unsigned long buf_align;
451 int ret; 464 int ret;
452 struct timespec mtime = CURRENT_TIME; 465 struct timespec mtime = CURRENT_TIME;
453 466
@@ -462,6 +475,9 @@ static ssize_t ceph_sync_write(struct file *file, const char __user *data,
462 else 475 else
463 pos = *offset; 476 pos = *offset;
464 477
478 io_align = pos & ~PAGE_MASK;
479 buf_align = (unsigned long)data & ~PAGE_MASK;
480
465 ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + left); 481 ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + left);
466 if (ret < 0) 482 if (ret < 0)
467 return ret; 483 return ret;
@@ -486,20 +502,27 @@ static ssize_t ceph_sync_write(struct file *file, const char __user *data,
486 */ 502 */
487more: 503more:
488 len = left; 504 len = left;
505 if (file->f_flags & O_DIRECT) {
506 /* write from beginning of first page, regardless of
507 io alignment */
508 page_align = (pos - io_align + buf_align) & ~PAGE_MASK;
509 num_pages = calc_pages_for((unsigned long)data, len);
510 } else {
511 page_align = pos & ~PAGE_MASK;
512 num_pages = calc_pages_for(pos, len);
513 }
489 req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, 514 req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
490 ceph_vino(inode), pos, &len, 515 ceph_vino(inode), pos, &len,
491 CEPH_OSD_OP_WRITE, flags, 516 CEPH_OSD_OP_WRITE, flags,
492 ci->i_snap_realm->cached_context, 517 ci->i_snap_realm->cached_context,
493 do_sync, 518 do_sync,
494 ci->i_truncate_seq, ci->i_truncate_size, 519 ci->i_truncate_seq, ci->i_truncate_size,
495 &mtime, false, 2); 520 &mtime, false, 2, page_align);
496 if (!req) 521 if (!req)
497 return -ENOMEM; 522 return -ENOMEM;
498 523
499 num_pages = calc_pages_for(pos, len);
500
501 if (file->f_flags & O_DIRECT) { 524 if (file->f_flags & O_DIRECT) {
502 pages = ceph_get_direct_page_vector(data, num_pages, pos, len); 525 pages = ceph_get_direct_page_vector(data, num_pages, false);
503 if (IS_ERR(pages)) { 526 if (IS_ERR(pages)) {
504 ret = PTR_ERR(pages); 527 ret = PTR_ERR(pages);
505 goto out; 528 goto out;
@@ -549,7 +572,7 @@ more:
549 } 572 }
550 573
551 if (file->f_flags & O_DIRECT) 574 if (file->f_flags & O_DIRECT)
552 ceph_put_page_vector(pages, num_pages); 575 ceph_put_page_vector(pages, num_pages, false);
553 else if (file->f_flags & O_SYNC) 576 else if (file->f_flags & O_SYNC)
554 ceph_release_page_vector(pages, num_pages); 577 ceph_release_page_vector(pages, num_pages);
555 578
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 1d6a45b5a04..5625463aa47 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -2,7 +2,6 @@
2 2
3#include <linux/module.h> 3#include <linux/module.h>
4#include <linux/fs.h> 4#include <linux/fs.h>
5#include <linux/smp_lock.h>
6#include <linux/slab.h> 5#include <linux/slab.h>
7#include <linux/string.h> 6#include <linux/string.h>
8#include <linux/uaccess.h> 7#include <linux/uaccess.h>
@@ -298,6 +297,8 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
298 ci->i_release_count = 0; 297 ci->i_release_count = 0;
299 ci->i_symlink = NULL; 298 ci->i_symlink = NULL;
300 299
300 memset(&ci->i_dir_layout, 0, sizeof(ci->i_dir_layout));
301
301 ci->i_fragtree = RB_ROOT; 302 ci->i_fragtree = RB_ROOT;
302 mutex_init(&ci->i_fragtree_mutex); 303 mutex_init(&ci->i_fragtree_mutex);
303 304
@@ -369,6 +370,15 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
369 return &ci->vfs_inode; 370 return &ci->vfs_inode;
370} 371}
371 372
373static void ceph_i_callback(struct rcu_head *head)
374{
375 struct inode *inode = container_of(head, struct inode, i_rcu);
376 struct ceph_inode_info *ci = ceph_inode(inode);
377
378 INIT_LIST_HEAD(&inode->i_dentry);
379 kmem_cache_free(ceph_inode_cachep, ci);
380}
381
372void ceph_destroy_inode(struct inode *inode) 382void ceph_destroy_inode(struct inode *inode)
373{ 383{
374 struct ceph_inode_info *ci = ceph_inode(inode); 384 struct ceph_inode_info *ci = ceph_inode(inode);
@@ -408,7 +418,7 @@ void ceph_destroy_inode(struct inode *inode)
408 if (ci->i_xattrs.prealloc_blob) 418 if (ci->i_xattrs.prealloc_blob)
409 ceph_buffer_put(ci->i_xattrs.prealloc_blob); 419 ceph_buffer_put(ci->i_xattrs.prealloc_blob);
410 420
411 kmem_cache_free(ceph_inode_cachep, ci); 421 call_rcu(&inode->i_rcu, ceph_i_callback);
412} 422}
413 423
414 424
@@ -471,7 +481,9 @@ void ceph_fill_file_time(struct inode *inode, int issued,
471 481
472 if (issued & (CEPH_CAP_FILE_EXCL| 482 if (issued & (CEPH_CAP_FILE_EXCL|
473 CEPH_CAP_FILE_WR| 483 CEPH_CAP_FILE_WR|
474 CEPH_CAP_FILE_BUFFER)) { 484 CEPH_CAP_FILE_BUFFER|
485 CEPH_CAP_AUTH_EXCL|
486 CEPH_CAP_XATTR_EXCL)) {
475 if (timespec_compare(ctime, &inode->i_ctime) > 0) { 487 if (timespec_compare(ctime, &inode->i_ctime) > 0) {
476 dout("ctime %ld.%09ld -> %ld.%09ld inc w/ cap\n", 488 dout("ctime %ld.%09ld -> %ld.%09ld inc w/ cap\n",
477 inode->i_ctime.tv_sec, inode->i_ctime.tv_nsec, 489 inode->i_ctime.tv_sec, inode->i_ctime.tv_nsec,
@@ -511,7 +523,7 @@ void ceph_fill_file_time(struct inode *inode, int issued,
511 warn = 1; 523 warn = 1;
512 } 524 }
513 } else { 525 } else {
514 /* we have no write caps; whatever the MDS says is true */ 526 /* we have no write|excl caps; whatever the MDS says is true */
515 if (ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) >= 0) { 527 if (ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) >= 0) {
516 inode->i_ctime = *ctime; 528 inode->i_ctime = *ctime;
517 inode->i_mtime = *mtime; 529 inode->i_mtime = *mtime;
@@ -567,12 +579,17 @@ static int fill_inode(struct inode *inode,
567 579
568 /* 580 /*
569 * provided version will be odd if inode value is projected, 581 * provided version will be odd if inode value is projected,
570 * even if stable. skip the update if we have a newer info 582 * even if stable. skip the update if we have newer stable
571 * (e.g., due to inode info racing form multiple MDSs), or if 583 * info (ours>=theirs, e.g. due to racing mds replies), unless
572 * we are getting projected (unstable) inode info. 584 * we are getting projected (unstable) info (in which case the
585 * version is odd, and we want ours>theirs).
586 * us them
587 * 2 2 skip
588 * 3 2 skip
589 * 3 3 update
573 */ 590 */
574 if (le64_to_cpu(info->version) > 0 && 591 if (le64_to_cpu(info->version) > 0 &&
575 (ci->i_version & ~1) > le64_to_cpu(info->version)) 592 (ci->i_version & ~1) >= le64_to_cpu(info->version))
576 goto no_change; 593 goto no_change;
577 594
578 issued = __ceph_caps_issued(ci, &implemented); 595 issued = __ceph_caps_issued(ci, &implemented);
@@ -606,7 +623,14 @@ static int fill_inode(struct inode *inode,
606 le32_to_cpu(info->time_warp_seq), 623 le32_to_cpu(info->time_warp_seq),
607 &ctime, &mtime, &atime); 624 &ctime, &mtime, &atime);
608 625
609 ci->i_max_size = le64_to_cpu(info->max_size); 626 /* only update max_size on auth cap */
627 if ((info->cap.flags & CEPH_CAP_FLAG_AUTH) &&
628 ci->i_max_size != le64_to_cpu(info->max_size)) {
629 dout("max_size %lld -> %llu\n", ci->i_max_size,
630 le64_to_cpu(info->max_size));
631 ci->i_max_size = le64_to_cpu(info->max_size);
632 }
633
610 ci->i_layout = info->layout; 634 ci->i_layout = info->layout;
611 inode->i_blkbits = fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1; 635 inode->i_blkbits = fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1;
612 636
@@ -667,6 +691,8 @@ static int fill_inode(struct inode *inode,
667 inode->i_op = &ceph_dir_iops; 691 inode->i_op = &ceph_dir_iops;
668 inode->i_fop = &ceph_dir_fops; 692 inode->i_fop = &ceph_dir_fops;
669 693
694 ci->i_dir_layout = iinfo->dir_layout;
695
670 ci->i_files = le64_to_cpu(info->files); 696 ci->i_files = le64_to_cpu(info->files);
671 ci->i_subdirs = le64_to_cpu(info->subdirs); 697 ci->i_subdirs = le64_to_cpu(info->subdirs);
672 ci->i_rbytes = le64_to_cpu(info->rbytes); 698 ci->i_rbytes = le64_to_cpu(info->rbytes);
@@ -684,10 +710,6 @@ static int fill_inode(struct inode *inode,
684 ci->i_ceph_flags |= CEPH_I_COMPLETE; 710 ci->i_ceph_flags |= CEPH_I_COMPLETE;
685 ci->i_max_offset = 2; 711 ci->i_max_offset = 2;
686 } 712 }
687
688 /* it may be better to set st_size in getattr instead? */
689 if (ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb), RBYTES))
690 inode->i_size = ci->i_rbytes;
691 break; 713 break;
692 default: 714 default:
693 pr_err("fill_inode %llx.%llx BAD mode 0%o\n", 715 pr_err("fill_inode %llx.%llx BAD mode 0%o\n",
@@ -828,13 +850,13 @@ static void ceph_set_dentry_offset(struct dentry *dn)
828 di->offset = ceph_inode(inode)->i_max_offset++; 850 di->offset = ceph_inode(inode)->i_max_offset++;
829 spin_unlock(&inode->i_lock); 851 spin_unlock(&inode->i_lock);
830 852
831 spin_lock(&dcache_lock); 853 spin_lock(&dir->d_lock);
832 spin_lock(&dn->d_lock); 854 spin_lock_nested(&dn->d_lock, DENTRY_D_LOCK_NESTED);
833 list_move(&dn->d_u.d_child, &dir->d_subdirs); 855 list_move(&dn->d_u.d_child, &dir->d_subdirs);
834 dout("set_dentry_offset %p %lld (%p %p)\n", dn, di->offset, 856 dout("set_dentry_offset %p %lld (%p %p)\n", dn, di->offset,
835 dn->d_u.d_child.prev, dn->d_u.d_child.next); 857 dn->d_u.d_child.prev, dn->d_u.d_child.next);
836 spin_unlock(&dn->d_lock); 858 spin_unlock(&dn->d_lock);
837 spin_unlock(&dcache_lock); 859 spin_unlock(&dir->d_lock);
838} 860}
839 861
840/* 862/*
@@ -866,8 +888,8 @@ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in,
866 } else if (realdn) { 888 } else if (realdn) {
867 dout("dn %p (%d) spliced with %p (%d) " 889 dout("dn %p (%d) spliced with %p (%d) "
868 "inode %p ino %llx.%llx\n", 890 "inode %p ino %llx.%llx\n",
869 dn, atomic_read(&dn->d_count), 891 dn, dn->d_count,
870 realdn, atomic_read(&realdn->d_count), 892 realdn, realdn->d_count,
871 realdn->d_inode, ceph_vinop(realdn->d_inode)); 893 realdn->d_inode, ceph_vinop(realdn->d_inode));
872 dput(dn); 894 dput(dn);
873 dn = realdn; 895 dn = realdn;
@@ -1055,7 +1077,8 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
1055 ininfo = rinfo->targeti.in; 1077 ininfo = rinfo->targeti.in;
1056 vino.ino = le64_to_cpu(ininfo->ino); 1078 vino.ino = le64_to_cpu(ininfo->ino);
1057 vino.snap = le64_to_cpu(ininfo->snapid); 1079 vino.snap = le64_to_cpu(ininfo->snapid);
1058 if (!dn->d_inode) { 1080 in = dn->d_inode;
1081 if (!in) {
1059 in = ceph_get_inode(sb, vino); 1082 in = ceph_get_inode(sb, vino);
1060 if (IS_ERR(in)) { 1083 if (IS_ERR(in)) {
1061 pr_err("fill_trace bad get_inode " 1084 pr_err("fill_trace bad get_inode "
@@ -1217,11 +1240,11 @@ retry_lookup:
1217 goto retry_lookup; 1240 goto retry_lookup;
1218 } else { 1241 } else {
1219 /* reorder parent's d_subdirs */ 1242 /* reorder parent's d_subdirs */
1220 spin_lock(&dcache_lock); 1243 spin_lock(&parent->d_lock);
1221 spin_lock(&dn->d_lock); 1244 spin_lock_nested(&dn->d_lock, DENTRY_D_LOCK_NESTED);
1222 list_move(&dn->d_u.d_child, &parent->d_subdirs); 1245 list_move(&dn->d_u.d_child, &parent->d_subdirs);
1223 spin_unlock(&dn->d_lock); 1246 spin_unlock(&dn->d_lock);
1224 spin_unlock(&dcache_lock); 1247 spin_unlock(&parent->d_lock);
1225 } 1248 }
1226 1249
1227 di = dn->d_fsdata; 1250 di = dn->d_fsdata;
@@ -1386,11 +1409,8 @@ static void ceph_invalidate_work(struct work_struct *work)
1386 spin_lock(&inode->i_lock); 1409 spin_lock(&inode->i_lock);
1387 dout("invalidate_pages %p gen %d revoking %d\n", inode, 1410 dout("invalidate_pages %p gen %d revoking %d\n", inode,
1388 ci->i_rdcache_gen, ci->i_rdcache_revoking); 1411 ci->i_rdcache_gen, ci->i_rdcache_revoking);
1389 if (ci->i_rdcache_gen == 0 || 1412 if (ci->i_rdcache_revoking != ci->i_rdcache_gen) {
1390 ci->i_rdcache_revoking != ci->i_rdcache_gen) {
1391 BUG_ON(ci->i_rdcache_revoking > ci->i_rdcache_gen);
1392 /* nevermind! */ 1413 /* nevermind! */
1393 ci->i_rdcache_revoking = 0;
1394 spin_unlock(&inode->i_lock); 1414 spin_unlock(&inode->i_lock);
1395 goto out; 1415 goto out;
1396 } 1416 }
@@ -1400,15 +1420,16 @@ static void ceph_invalidate_work(struct work_struct *work)
1400 ceph_invalidate_nondirty_pages(inode->i_mapping); 1420 ceph_invalidate_nondirty_pages(inode->i_mapping);
1401 1421
1402 spin_lock(&inode->i_lock); 1422 spin_lock(&inode->i_lock);
1403 if (orig_gen == ci->i_rdcache_gen) { 1423 if (orig_gen == ci->i_rdcache_gen &&
1424 orig_gen == ci->i_rdcache_revoking) {
1404 dout("invalidate_pages %p gen %d successful\n", inode, 1425 dout("invalidate_pages %p gen %d successful\n", inode,
1405 ci->i_rdcache_gen); 1426 ci->i_rdcache_gen);
1406 ci->i_rdcache_gen = 0; 1427 ci->i_rdcache_revoking--;
1407 ci->i_rdcache_revoking = 0;
1408 check = 1; 1428 check = 1;
1409 } else { 1429 } else {
1410 dout("invalidate_pages %p gen %d raced, gen now %d\n", 1430 dout("invalidate_pages %p gen %d raced, now %d revoking %d\n",
1411 inode, orig_gen, ci->i_rdcache_gen); 1431 inode, orig_gen, ci->i_rdcache_gen,
1432 ci->i_rdcache_revoking);
1412 } 1433 }
1413 spin_unlock(&inode->i_lock); 1434 spin_unlock(&inode->i_lock);
1414 1435
@@ -1739,7 +1760,7 @@ int ceph_do_getattr(struct inode *inode, int mask)
1739 return 0; 1760 return 0;
1740 } 1761 }
1741 1762
1742 dout("do_getattr inode %p mask %s\n", inode, ceph_cap_string(mask)); 1763 dout("do_getattr inode %p mask %s mode 0%o\n", inode, ceph_cap_string(mask), inode->i_mode);
1743 if (ceph_caps_issued_mask(ceph_inode(inode), mask, 1)) 1764 if (ceph_caps_issued_mask(ceph_inode(inode), mask, 1))
1744 return 0; 1765 return 0;
1745 1766
@@ -1760,12 +1781,17 @@ int ceph_do_getattr(struct inode *inode, int mask)
1760 * Check inode permissions. We verify we have a valid value for 1781 * Check inode permissions. We verify we have a valid value for
1761 * the AUTH cap, then call the generic handler. 1782 * the AUTH cap, then call the generic handler.
1762 */ 1783 */
1763int ceph_permission(struct inode *inode, int mask) 1784int ceph_permission(struct inode *inode, int mask, unsigned int flags)
1764{ 1785{
1765 int err = ceph_do_getattr(inode, CEPH_CAP_AUTH_SHARED); 1786 int err;
1787
1788 if (flags & IPERM_FLAG_RCU)
1789 return -ECHILD;
1790
1791 err = ceph_do_getattr(inode, CEPH_CAP_AUTH_SHARED);
1766 1792
1767 if (!err) 1793 if (!err)
1768 err = generic_permission(inode, mask, NULL); 1794 err = generic_permission(inode, mask, flags, NULL);
1769 return err; 1795 return err;
1770} 1796}
1771 1797
@@ -1789,7 +1815,11 @@ int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry,
1789 else 1815 else
1790 stat->dev = 0; 1816 stat->dev = 0;
1791 if (S_ISDIR(inode->i_mode)) { 1817 if (S_ISDIR(inode->i_mode)) {
1792 stat->size = ci->i_rbytes; 1818 if (ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb),
1819 RBYTES))
1820 stat->size = ci->i_rbytes;
1821 else
1822 stat->size = ci->i_files + ci->i_subdirs;
1793 stat->blocks = 0; 1823 stat->blocks = 0;
1794 stat->blksize = 65536; 1824 stat->blksize = 65536;
1795 } 1825 }
diff --git a/fs/ceph/ioctl.h b/fs/ceph/ioctl.h
index a6ce54e94eb..52e8fd74d45 100644
--- a/fs/ceph/ioctl.h
+++ b/fs/ceph/ioctl.h
@@ -4,7 +4,7 @@
4#include <linux/ioctl.h> 4#include <linux/ioctl.h>
5#include <linux/types.h> 5#include <linux/types.h>
6 6
7#define CEPH_IOCTL_MAGIC 0x98 7#define CEPH_IOCTL_MAGIC 0x97
8 8
9/* just use u64 to align sanely on all archs */ 9/* just use u64 to align sanely on all archs */
10struct ceph_ioctl_layout { 10struct ceph_ioctl_layout {
diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c
index 40abde93c34..476b329867d 100644
--- a/fs/ceph/locks.c
+++ b/fs/ceph/locks.c
@@ -11,40 +11,68 @@
11 * Implement fcntl and flock locking functions. 11 * Implement fcntl and flock locking functions.
12 */ 12 */
13static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file, 13static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file,
14 u64 pid, u64 pid_ns, 14 int cmd, u8 wait, struct file_lock *fl)
15 int cmd, u64 start, u64 length, u8 wait)
16{ 15{
17 struct inode *inode = file->f_dentry->d_inode; 16 struct inode *inode = file->f_dentry->d_inode;
18 struct ceph_mds_client *mdsc = 17 struct ceph_mds_client *mdsc =
19 ceph_sb_to_client(inode->i_sb)->mdsc; 18 ceph_sb_to_client(inode->i_sb)->mdsc;
20 struct ceph_mds_request *req; 19 struct ceph_mds_request *req;
21 int err; 20 int err;
21 u64 length = 0;
22 22
23 req = ceph_mdsc_create_request(mdsc, operation, USE_AUTH_MDS); 23 req = ceph_mdsc_create_request(mdsc, operation, USE_AUTH_MDS);
24 if (IS_ERR(req)) 24 if (IS_ERR(req))
25 return PTR_ERR(req); 25 return PTR_ERR(req);
26 req->r_inode = igrab(inode); 26 req->r_inode = igrab(inode);
27 27
28 /* mds requires start and length rather than start and end */
29 if (LLONG_MAX == fl->fl_end)
30 length = 0;
31 else
32 length = fl->fl_end - fl->fl_start + 1;
33
28 dout("ceph_lock_message: rule: %d, op: %d, pid: %llu, start: %llu, " 34 dout("ceph_lock_message: rule: %d, op: %d, pid: %llu, start: %llu, "
29 "length: %llu, wait: %d, type`: %d", (int)lock_type, 35 "length: %llu, wait: %d, type`: %d", (int)lock_type,
30 (int)operation, pid, start, length, wait, cmd); 36 (int)operation, (u64)fl->fl_pid, fl->fl_start,
37 length, wait, fl->fl_type);
38
31 39
32 req->r_args.filelock_change.rule = lock_type; 40 req->r_args.filelock_change.rule = lock_type;
33 req->r_args.filelock_change.type = cmd; 41 req->r_args.filelock_change.type = cmd;
34 req->r_args.filelock_change.pid = cpu_to_le64(pid); 42 req->r_args.filelock_change.pid = cpu_to_le64((u64)fl->fl_pid);
35 /* This should be adjusted, but I'm not sure if 43 /* This should be adjusted, but I'm not sure if
36 namespaces actually get id numbers*/ 44 namespaces actually get id numbers*/
37 req->r_args.filelock_change.pid_namespace = 45 req->r_args.filelock_change.pid_namespace =
38 cpu_to_le64((u64)pid_ns); 46 cpu_to_le64((u64)(unsigned long)fl->fl_nspid);
39 req->r_args.filelock_change.start = cpu_to_le64(start); 47 req->r_args.filelock_change.start = cpu_to_le64(fl->fl_start);
40 req->r_args.filelock_change.length = cpu_to_le64(length); 48 req->r_args.filelock_change.length = cpu_to_le64(length);
41 req->r_args.filelock_change.wait = wait; 49 req->r_args.filelock_change.wait = wait;
42 50
43 err = ceph_mdsc_do_request(mdsc, inode, req); 51 err = ceph_mdsc_do_request(mdsc, inode, req);
52
53 if ( operation == CEPH_MDS_OP_GETFILELOCK){
54 fl->fl_pid = le64_to_cpu(req->r_reply_info.filelock_reply->pid);
55 if (CEPH_LOCK_SHARED == req->r_reply_info.filelock_reply->type)
56 fl->fl_type = F_RDLCK;
57 else if (CEPH_LOCK_EXCL == req->r_reply_info.filelock_reply->type)
58 fl->fl_type = F_WRLCK;
59 else
60 fl->fl_type = F_UNLCK;
61
62 fl->fl_start = le64_to_cpu(req->r_reply_info.filelock_reply->start);
63 length = le64_to_cpu(req->r_reply_info.filelock_reply->start) +
64 le64_to_cpu(req->r_reply_info.filelock_reply->length);
65 if (length >= 1)
66 fl->fl_end = length -1;
67 else
68 fl->fl_end = 0;
69
70 }
44 ceph_mdsc_put_request(req); 71 ceph_mdsc_put_request(req);
45 dout("ceph_lock_message: rule: %d, op: %d, pid: %llu, start: %llu, " 72 dout("ceph_lock_message: rule: %d, op: %d, pid: %llu, start: %llu, "
46 "length: %llu, wait: %d, type`: %d err code %d", (int)lock_type, 73 "length: %llu, wait: %d, type`: %d, err code %d", (int)lock_type,
47 (int)operation, pid, start, length, wait, cmd, err); 74 (int)operation, (u64)fl->fl_pid, fl->fl_start,
75 length, wait, fl->fl_type, err);
48 return err; 76 return err;
49} 77}
50 78
@@ -54,7 +82,6 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file,
54 */ 82 */
55int ceph_lock(struct file *file, int cmd, struct file_lock *fl) 83int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
56{ 84{
57 u64 length;
58 u8 lock_cmd; 85 u8 lock_cmd;
59 int err; 86 int err;
60 u8 wait = 0; 87 u8 wait = 0;
@@ -76,29 +103,20 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
76 else 103 else
77 lock_cmd = CEPH_LOCK_UNLOCK; 104 lock_cmd = CEPH_LOCK_UNLOCK;
78 105
79 if (LLONG_MAX == fl->fl_end) 106 err = ceph_lock_message(CEPH_LOCK_FCNTL, op, file, lock_cmd, wait, fl);
80 length = 0;
81 else
82 length = fl->fl_end - fl->fl_start + 1;
83
84 err = ceph_lock_message(CEPH_LOCK_FCNTL, op, file,
85 (u64)fl->fl_pid,
86 (u64)(unsigned long)fl->fl_nspid,
87 lock_cmd, fl->fl_start,
88 length, wait);
89 if (!err) { 107 if (!err) {
90 dout("mds locked, locking locally"); 108 if ( op != CEPH_MDS_OP_GETFILELOCK ){
91 err = posix_lock_file(file, fl, NULL); 109 dout("mds locked, locking locally");
92 if (err && (CEPH_MDS_OP_SETFILELOCK == op)) { 110 err = posix_lock_file(file, fl, NULL);
93 /* undo! This should only happen if the kernel detects 111 if (err && (CEPH_MDS_OP_SETFILELOCK == op)) {
94 * local deadlock. */ 112 /* undo! This should only happen if the kernel detects
95 ceph_lock_message(CEPH_LOCK_FCNTL, op, file, 113 * local deadlock. */
96 (u64)fl->fl_pid, 114 ceph_lock_message(CEPH_LOCK_FCNTL, op, file,
97 (u64)(unsigned long)fl->fl_nspid, 115 CEPH_LOCK_UNLOCK, 0, fl);
98 CEPH_LOCK_UNLOCK, fl->fl_start, 116 dout("got %d on posix_lock_file, undid lock", err);
99 length, 0); 117 }
100 dout("got %d on posix_lock_file, undid lock", err);
101 } 118 }
119
102 } else { 120 } else {
103 dout("mds returned error code %d", err); 121 dout("mds returned error code %d", err);
104 } 122 }
@@ -107,7 +125,6 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
107 125
108int ceph_flock(struct file *file, int cmd, struct file_lock *fl) 126int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
109{ 127{
110 u64 length;
111 u8 lock_cmd; 128 u8 lock_cmd;
112 int err; 129 int err;
113 u8 wait = 1; 130 u8 wait = 1;
@@ -127,26 +144,15 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
127 lock_cmd = CEPH_LOCK_EXCL; 144 lock_cmd = CEPH_LOCK_EXCL;
128 else 145 else
129 lock_cmd = CEPH_LOCK_UNLOCK; 146 lock_cmd = CEPH_LOCK_UNLOCK;
130 /* mds requires start and length rather than start and end */
131 if (LLONG_MAX == fl->fl_end)
132 length = 0;
133 else
134 length = fl->fl_end - fl->fl_start + 1;
135 147
136 err = ceph_lock_message(CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK, 148 err = ceph_lock_message(CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK,
137 file, (u64)fl->fl_pid, 149 file, lock_cmd, wait, fl);
138 (u64)(unsigned long)fl->fl_nspid,
139 lock_cmd, fl->fl_start,
140 length, wait);
141 if (!err) { 150 if (!err) {
142 err = flock_lock_file_wait(file, fl); 151 err = flock_lock_file_wait(file, fl);
143 if (err) { 152 if (err) {
144 ceph_lock_message(CEPH_LOCK_FLOCK, 153 ceph_lock_message(CEPH_LOCK_FLOCK,
145 CEPH_MDS_OP_SETFILELOCK, 154 CEPH_MDS_OP_SETFILELOCK,
146 file, (u64)fl->fl_pid, 155 file, CEPH_LOCK_UNLOCK, 0, fl);
147 (u64)(unsigned long)fl->fl_nspid,
148 CEPH_LOCK_UNLOCK, fl->fl_start,
149 length, 0);
150 dout("got %d on flock_lock_file_wait, undid lock", err); 156 dout("got %d on flock_lock_file_wait, undid lock", err);
151 } 157 }
152 } else { 158 } else {
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 3142b15940c..a1ee8fa3a8e 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -6,7 +6,6 @@
6#include <linux/sched.h> 6#include <linux/sched.h>
7#include <linux/debugfs.h> 7#include <linux/debugfs.h>
8#include <linux/seq_file.h> 8#include <linux/seq_file.h>
9#include <linux/smp_lock.h>
10 9
11#include "super.h" 10#include "super.h"
12#include "mds_client.h" 11#include "mds_client.h"
@@ -61,7 +60,8 @@ static const struct ceph_connection_operations mds_con_ops;
61 * parse individual inode info 60 * parse individual inode info
62 */ 61 */
63static int parse_reply_info_in(void **p, void *end, 62static int parse_reply_info_in(void **p, void *end,
64 struct ceph_mds_reply_info_in *info) 63 struct ceph_mds_reply_info_in *info,
64 int features)
65{ 65{
66 int err = -EIO; 66 int err = -EIO;
67 67
@@ -75,6 +75,12 @@ static int parse_reply_info_in(void **p, void *end,
75 info->symlink = *p; 75 info->symlink = *p;
76 *p += info->symlink_len; 76 *p += info->symlink_len;
77 77
78 if (features & CEPH_FEATURE_DIRLAYOUTHASH)
79 ceph_decode_copy_safe(p, end, &info->dir_layout,
80 sizeof(info->dir_layout), bad);
81 else
82 memset(&info->dir_layout, 0, sizeof(info->dir_layout));
83
78 ceph_decode_32_safe(p, end, info->xattr_len, bad); 84 ceph_decode_32_safe(p, end, info->xattr_len, bad);
79 ceph_decode_need(p, end, info->xattr_len, bad); 85 ceph_decode_need(p, end, info->xattr_len, bad);
80 info->xattr_data = *p; 86 info->xattr_data = *p;
@@ -89,12 +95,13 @@ bad:
89 * target inode. 95 * target inode.
90 */ 96 */
91static int parse_reply_info_trace(void **p, void *end, 97static int parse_reply_info_trace(void **p, void *end,
92 struct ceph_mds_reply_info_parsed *info) 98 struct ceph_mds_reply_info_parsed *info,
99 int features)
93{ 100{
94 int err; 101 int err;
95 102
96 if (info->head->is_dentry) { 103 if (info->head->is_dentry) {
97 err = parse_reply_info_in(p, end, &info->diri); 104 err = parse_reply_info_in(p, end, &info->diri, features);
98 if (err < 0) 105 if (err < 0)
99 goto out_bad; 106 goto out_bad;
100 107
@@ -115,7 +122,7 @@ static int parse_reply_info_trace(void **p, void *end,
115 } 122 }
116 123
117 if (info->head->is_target) { 124 if (info->head->is_target) {
118 err = parse_reply_info_in(p, end, &info->targeti); 125 err = parse_reply_info_in(p, end, &info->targeti, features);
119 if (err < 0) 126 if (err < 0)
120 goto out_bad; 127 goto out_bad;
121 } 128 }
@@ -135,7 +142,8 @@ out_bad:
135 * parse readdir results 142 * parse readdir results
136 */ 143 */
137static int parse_reply_info_dir(void **p, void *end, 144static int parse_reply_info_dir(void **p, void *end,
138 struct ceph_mds_reply_info_parsed *info) 145 struct ceph_mds_reply_info_parsed *info,
146 int features)
139{ 147{
140 u32 num, i = 0; 148 u32 num, i = 0;
141 int err; 149 int err;
@@ -183,7 +191,7 @@ static int parse_reply_info_dir(void **p, void *end,
183 *p += sizeof(struct ceph_mds_reply_lease); 191 *p += sizeof(struct ceph_mds_reply_lease);
184 192
185 /* inode */ 193 /* inode */
186 err = parse_reply_info_in(p, end, &info->dir_in[i]); 194 err = parse_reply_info_in(p, end, &info->dir_in[i], features);
187 if (err < 0) 195 if (err < 0)
188 goto out_bad; 196 goto out_bad;
189 i++; 197 i++;
@@ -203,10 +211,45 @@ out_bad:
203} 211}
204 212
205/* 213/*
214 * parse fcntl F_GETLK results
215 */
216static int parse_reply_info_filelock(void **p, void *end,
217 struct ceph_mds_reply_info_parsed *info,
218 int features)
219{
220 if (*p + sizeof(*info->filelock_reply) > end)
221 goto bad;
222
223 info->filelock_reply = *p;
224 *p += sizeof(*info->filelock_reply);
225
226 if (unlikely(*p != end))
227 goto bad;
228 return 0;
229
230bad:
231 return -EIO;
232}
233
234/*
235 * parse extra results
236 */
237static int parse_reply_info_extra(void **p, void *end,
238 struct ceph_mds_reply_info_parsed *info,
239 int features)
240{
241 if (info->head->op == CEPH_MDS_OP_GETFILELOCK)
242 return parse_reply_info_filelock(p, end, info, features);
243 else
244 return parse_reply_info_dir(p, end, info, features);
245}
246
247/*
206 * parse entire mds reply 248 * parse entire mds reply
207 */ 249 */
208static int parse_reply_info(struct ceph_msg *msg, 250static int parse_reply_info(struct ceph_msg *msg,
209 struct ceph_mds_reply_info_parsed *info) 251 struct ceph_mds_reply_info_parsed *info,
252 int features)
210{ 253{
211 void *p, *end; 254 void *p, *end;
212 u32 len; 255 u32 len;
@@ -219,15 +262,15 @@ static int parse_reply_info(struct ceph_msg *msg,
219 /* trace */ 262 /* trace */
220 ceph_decode_32_safe(&p, end, len, bad); 263 ceph_decode_32_safe(&p, end, len, bad);
221 if (len > 0) { 264 if (len > 0) {
222 err = parse_reply_info_trace(&p, p+len, info); 265 err = parse_reply_info_trace(&p, p+len, info, features);
223 if (err < 0) 266 if (err < 0)
224 goto out_bad; 267 goto out_bad;
225 } 268 }
226 269
227 /* dir content */ 270 /* extra */
228 ceph_decode_32_safe(&p, end, len, bad); 271 ceph_decode_32_safe(&p, end, len, bad);
229 if (len > 0) { 272 if (len > 0) {
230 err = parse_reply_info_dir(&p, p+len, info); 273 err = parse_reply_info_extra(&p, p+len, info, features);
231 if (err < 0) 274 if (err < 0)
232 goto out_bad; 275 goto out_bad;
233 } 276 }
@@ -529,6 +572,9 @@ static void __register_request(struct ceph_mds_client *mdsc,
529 ceph_mdsc_get_request(req); 572 ceph_mdsc_get_request(req);
530 __insert_request(mdsc, req); 573 __insert_request(mdsc, req);
531 574
575 req->r_uid = current_fsuid();
576 req->r_gid = current_fsgid();
577
532 if (dir) { 578 if (dir) {
533 struct ceph_inode_info *ci = ceph_inode(dir); 579 struct ceph_inode_info *ci = ceph_inode(dir);
534 580
@@ -620,7 +666,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
620 } else { 666 } else {
621 /* dir + name */ 667 /* dir + name */
622 inode = dir; 668 inode = dir;
623 hash = req->r_dentry->d_name.hash; 669 hash = ceph_dentry_hash(req->r_dentry);
624 is_hash = true; 670 is_hash = true;
625 } 671 }
626 } 672 }
@@ -647,9 +693,11 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
647 dout("choose_mds %p %llx.%llx " 693 dout("choose_mds %p %llx.%llx "
648 "frag %u mds%d (%d/%d)\n", 694 "frag %u mds%d (%d/%d)\n",
649 inode, ceph_vinop(inode), 695 inode, ceph_vinop(inode),
650 frag.frag, frag.mds, 696 frag.frag, mds,
651 (int)r, frag.ndist); 697 (int)r, frag.ndist);
652 return mds; 698 if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >=
699 CEPH_MDS_STATE_ACTIVE)
700 return mds;
653 } 701 }
654 702
655 /* since this file/dir wasn't known to be 703 /* since this file/dir wasn't known to be
@@ -662,7 +710,9 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
662 dout("choose_mds %p %llx.%llx " 710 dout("choose_mds %p %llx.%llx "
663 "frag %u mds%d (auth)\n", 711 "frag %u mds%d (auth)\n",
664 inode, ceph_vinop(inode), frag.frag, mds); 712 inode, ceph_vinop(inode), frag.frag, mds);
665 return mds; 713 if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >=
714 CEPH_MDS_STATE_ACTIVE)
715 return mds;
666 } 716 }
667 } 717 }
668 } 718 }
@@ -1452,7 +1502,7 @@ retry:
1452 *base = ceph_ino(temp->d_inode); 1502 *base = ceph_ino(temp->d_inode);
1453 *plen = len; 1503 *plen = len;
1454 dout("build_path on %p %d built %llx '%.*s'\n", 1504 dout("build_path on %p %d built %llx '%.*s'\n",
1455 dentry, atomic_read(&dentry->d_count), *base, len, path); 1505 dentry, dentry->d_count, *base, len, path);
1456 return path; 1506 return path;
1457} 1507}
1458 1508
@@ -1588,8 +1638,8 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
1588 1638
1589 head->mdsmap_epoch = cpu_to_le32(mdsc->mdsmap->m_epoch); 1639 head->mdsmap_epoch = cpu_to_le32(mdsc->mdsmap->m_epoch);
1590 head->op = cpu_to_le32(req->r_op); 1640 head->op = cpu_to_le32(req->r_op);
1591 head->caller_uid = cpu_to_le32(current_fsuid()); 1641 head->caller_uid = cpu_to_le32(req->r_uid);
1592 head->caller_gid = cpu_to_le32(current_fsgid()); 1642 head->caller_gid = cpu_to_le32(req->r_gid);
1593 head->args = req->r_args; 1643 head->args = req->r_args;
1594 1644
1595 ceph_encode_filepath(&p, end, ino1, path1); 1645 ceph_encode_filepath(&p, end, ino1, path1);
@@ -1659,7 +1709,6 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc,
1659 struct ceph_msg *msg; 1709 struct ceph_msg *msg;
1660 int flags = 0; 1710 int flags = 0;
1661 1711
1662 req->r_mds = mds;
1663 req->r_attempts++; 1712 req->r_attempts++;
1664 if (req->r_inode) { 1713 if (req->r_inode) {
1665 struct ceph_cap *cap = 1714 struct ceph_cap *cap =
@@ -1746,6 +1795,8 @@ static int __do_request(struct ceph_mds_client *mdsc,
1746 goto finish; 1795 goto finish;
1747 } 1796 }
1748 1797
1798 put_request_session(req);
1799
1749 mds = __choose_mds(mdsc, req); 1800 mds = __choose_mds(mdsc, req);
1750 if (mds < 0 || 1801 if (mds < 0 ||
1751 ceph_mdsmap_get_state(mdsc->mdsmap, mds) < CEPH_MDS_STATE_ACTIVE) { 1802 ceph_mdsmap_get_state(mdsc->mdsmap, mds) < CEPH_MDS_STATE_ACTIVE) {
@@ -1763,6 +1814,8 @@ static int __do_request(struct ceph_mds_client *mdsc,
1763 goto finish; 1814 goto finish;
1764 } 1815 }
1765 } 1816 }
1817 req->r_session = get_session(session);
1818
1766 dout("do_request mds%d session %p state %s\n", mds, session, 1819 dout("do_request mds%d session %p state %s\n", mds, session,
1767 session_state_name(session->s_state)); 1820 session_state_name(session->s_state));
1768 if (session->s_state != CEPH_MDS_SESSION_OPEN && 1821 if (session->s_state != CEPH_MDS_SESSION_OPEN &&
@@ -1775,7 +1828,6 @@ static int __do_request(struct ceph_mds_client *mdsc,
1775 } 1828 }
1776 1829
1777 /* send request */ 1830 /* send request */
1778 req->r_session = get_session(session);
1779 req->r_resend_mds = -1; /* forget any previous mds hint */ 1831 req->r_resend_mds = -1; /* forget any previous mds hint */
1780 1832
1781 if (req->r_request_started == 0) /* note request start time */ 1833 if (req->r_request_started == 0) /* note request start time */
@@ -1829,7 +1881,6 @@ static void kick_requests(struct ceph_mds_client *mdsc, int mds)
1829 if (req->r_session && 1881 if (req->r_session &&
1830 req->r_session->s_mds == mds) { 1882 req->r_session->s_mds == mds) {
1831 dout(" kicking tid %llu\n", req->r_tid); 1883 dout(" kicking tid %llu\n", req->r_tid);
1832 put_request_session(req);
1833 __do_request(mdsc, req); 1884 __do_request(mdsc, req);
1834 } 1885 }
1835 } 1886 }
@@ -2022,8 +2073,11 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
2022 goto out; 2073 goto out;
2023 } else { 2074 } else {
2024 struct ceph_inode_info *ci = ceph_inode(req->r_inode); 2075 struct ceph_inode_info *ci = ceph_inode(req->r_inode);
2025 struct ceph_cap *cap = 2076 struct ceph_cap *cap = NULL;
2026 ceph_get_cap_for_mds(ci, req->r_mds);; 2077
2078 if (req->r_session)
2079 cap = ceph_get_cap_for_mds(ci,
2080 req->r_session->s_mds);
2027 2081
2028 dout("already using auth"); 2082 dout("already using auth");
2029 if ((!cap || cap != ci->i_auth_cap) || 2083 if ((!cap || cap != ci->i_auth_cap) ||
@@ -2067,12 +2121,12 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
2067 2121
2068 dout("handle_reply tid %lld result %d\n", tid, result); 2122 dout("handle_reply tid %lld result %d\n", tid, result);
2069 rinfo = &req->r_reply_info; 2123 rinfo = &req->r_reply_info;
2070 err = parse_reply_info(msg, rinfo); 2124 err = parse_reply_info(msg, rinfo, session->s_con.peer_features);
2071 mutex_unlock(&mdsc->mutex); 2125 mutex_unlock(&mdsc->mutex);
2072 2126
2073 mutex_lock(&session->s_mutex); 2127 mutex_lock(&session->s_mutex);
2074 if (err < 0) { 2128 if (err < 0) {
2075 pr_err("mdsc_handle_reply got corrupt reply mds%d\n", mds); 2129 pr_err("mdsc_handle_reply got corrupt reply mds%d(tid:%lld)\n", mds, tid);
2076 ceph_msg_dump(msg); 2130 ceph_msg_dump(msg);
2077 goto out_err; 2131 goto out_err;
2078 } 2132 }
@@ -2092,7 +2146,8 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
2092 mutex_lock(&req->r_fill_mutex); 2146 mutex_lock(&req->r_fill_mutex);
2093 err = ceph_fill_trace(mdsc->fsc->sb, req, req->r_session); 2147 err = ceph_fill_trace(mdsc->fsc->sb, req, req->r_session);
2094 if (err == 0) { 2148 if (err == 0) {
2095 if (result == 0 && rinfo->dir_nr) 2149 if (result == 0 && req->r_op != CEPH_MDS_OP_GETFILELOCK &&
2150 rinfo->dir_nr)
2096 ceph_readdir_prepopulate(req, req->r_session); 2151 ceph_readdir_prepopulate(req, req->r_session);
2097 ceph_unreserve_caps(mdsc, &req->r_caps_reservation); 2152 ceph_unreserve_caps(mdsc, &req->r_caps_reservation);
2098 } 2153 }
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index d66d63c7235..4e3a9cc0bba 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -35,6 +35,7 @@ struct ceph_cap;
35 */ 35 */
36struct ceph_mds_reply_info_in { 36struct ceph_mds_reply_info_in {
37 struct ceph_mds_reply_inode *in; 37 struct ceph_mds_reply_inode *in;
38 struct ceph_dir_layout dir_layout;
38 u32 symlink_len; 39 u32 symlink_len;
39 char *symlink; 40 char *symlink;
40 u32 xattr_len; 41 u32 xattr_len;
@@ -42,26 +43,37 @@ struct ceph_mds_reply_info_in {
42}; 43};
43 44
44/* 45/*
45 * parsed info about an mds reply, including information about the 46 * parsed info about an mds reply, including information about
46 * target inode and/or its parent directory and dentry, and directory 47 * either: 1) the target inode and/or its parent directory and dentry,
47 * contents (for readdir results). 48 * and directory contents (for readdir results), or
49 * 2) the file range lock info (for fcntl F_GETLK results).
48 */ 50 */
49struct ceph_mds_reply_info_parsed { 51struct ceph_mds_reply_info_parsed {
50 struct ceph_mds_reply_head *head; 52 struct ceph_mds_reply_head *head;
51 53
54 /* trace */
52 struct ceph_mds_reply_info_in diri, targeti; 55 struct ceph_mds_reply_info_in diri, targeti;
53 struct ceph_mds_reply_dirfrag *dirfrag; 56 struct ceph_mds_reply_dirfrag *dirfrag;
54 char *dname; 57 char *dname;
55 u32 dname_len; 58 u32 dname_len;
56 struct ceph_mds_reply_lease *dlease; 59 struct ceph_mds_reply_lease *dlease;
57 60
58 struct ceph_mds_reply_dirfrag *dir_dir; 61 /* extra */
59 int dir_nr; 62 union {
60 char **dir_dname; 63 /* for fcntl F_GETLK results */
61 u32 *dir_dname_len; 64 struct ceph_filelock *filelock_reply;
62 struct ceph_mds_reply_lease **dir_dlease; 65
63 struct ceph_mds_reply_info_in *dir_in; 66 /* for readdir results */
64 u8 dir_complete, dir_end; 67 struct {
68 struct ceph_mds_reply_dirfrag *dir_dir;
69 int dir_nr;
70 char **dir_dname;
71 u32 *dir_dname_len;
72 struct ceph_mds_reply_lease **dir_dlease;
73 struct ceph_mds_reply_info_in *dir_in;
74 u8 dir_complete, dir_end;
75 };
76 };
65 77
66 /* encoded blob describing snapshot contexts for certain 78 /* encoded blob describing snapshot contexts for certain
67 operations (e.g., open) */ 79 operations (e.g., open) */
@@ -154,7 +166,6 @@ struct ceph_mds_request {
154 struct ceph_mds_client *r_mdsc; 166 struct ceph_mds_client *r_mdsc;
155 167
156 int r_op; /* mds op code */ 168 int r_op; /* mds op code */
157 int r_mds;
158 169
159 /* operation on what? */ 170 /* operation on what? */
160 struct inode *r_inode; /* arg1 */ 171 struct inode *r_inode; /* arg1 */
@@ -170,6 +181,8 @@ struct ceph_mds_request {
170 181
171 union ceph_mds_request_args r_args; 182 union ceph_mds_request_args r_args;
172 int r_fmode; /* file mode, if expecting cap */ 183 int r_fmode; /* file mode, if expecting cap */
184 uid_t r_uid;
185 gid_t r_gid;
173 186
174 /* for choosing which mds to send this request to */ 187 /* for choosing which mds to send this request to */
175 int r_direct_mode; 188 int r_direct_mode;
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 08b460ae053..9c5085465a6 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -290,6 +290,8 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt,
290 290
291 fsopt->rsize = CEPH_MOUNT_RSIZE_DEFAULT; 291 fsopt->rsize = CEPH_MOUNT_RSIZE_DEFAULT;
292 fsopt->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL); 292 fsopt->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL);
293 fsopt->caps_wanted_delay_min = CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT;
294 fsopt->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT;
293 fsopt->cap_release_safety = CEPH_CAP_RELEASE_SAFETY_DEFAULT; 295 fsopt->cap_release_safety = CEPH_CAP_RELEASE_SAFETY_DEFAULT;
294 fsopt->max_readdir = CEPH_MAX_READDIR_DEFAULT; 296 fsopt->max_readdir = CEPH_MAX_READDIR_DEFAULT;
295 fsopt->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT; 297 fsopt->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT;
@@ -428,7 +430,8 @@ struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
428 goto fail; 430 goto fail;
429 } 431 }
430 fsc->client->extra_mon_dispatch = extra_mon_dispatch; 432 fsc->client->extra_mon_dispatch = extra_mon_dispatch;
431 fsc->client->supported_features |= CEPH_FEATURE_FLOCK; 433 fsc->client->supported_features |= CEPH_FEATURE_FLOCK |
434 CEPH_FEATURE_DIRLAYOUTHASH;
432 fsc->client->monc.want_mdsmap = 1; 435 fsc->client->monc.want_mdsmap = 1;
433 436
434 fsc->mount_options = fsopt; 437 fsc->mount_options = fsopt;
@@ -443,13 +446,17 @@ struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
443 goto fail_client; 446 goto fail_client;
444 447
445 err = -ENOMEM; 448 err = -ENOMEM;
446 fsc->wb_wq = create_workqueue("ceph-writeback"); 449 /*
450 * The number of concurrent works can be high but they don't need
451 * to be processed in parallel, limit concurrency.
452 */
453 fsc->wb_wq = alloc_workqueue("ceph-writeback", 0, 1);
447 if (fsc->wb_wq == NULL) 454 if (fsc->wb_wq == NULL)
448 goto fail_bdi; 455 goto fail_bdi;
449 fsc->pg_inv_wq = create_singlethread_workqueue("ceph-pg-invalid"); 456 fsc->pg_inv_wq = alloc_workqueue("ceph-pg-invalid", 0, 1);
450 if (fsc->pg_inv_wq == NULL) 457 if (fsc->pg_inv_wq == NULL)
451 goto fail_wb_wq; 458 goto fail_wb_wq;
452 fsc->trunc_wq = create_singlethread_workqueue("ceph-trunc"); 459 fsc->trunc_wq = alloc_workqueue("ceph-trunc", 0, 1);
453 if (fsc->trunc_wq == NULL) 460 if (fsc->trunc_wq == NULL)
454 goto fail_pg_inv_wq; 461 goto fail_pg_inv_wq;
455 462
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 1886294e12f..20b907d76ae 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -239,6 +239,7 @@ struct ceph_inode_info {
239 unsigned i_ceph_flags; 239 unsigned i_ceph_flags;
240 unsigned long i_release_count; 240 unsigned long i_release_count;
241 241
242 struct ceph_dir_layout i_dir_layout;
242 struct ceph_file_layout i_layout; 243 struct ceph_file_layout i_layout;
243 char *i_symlink; 244 char *i_symlink;
244 245
@@ -293,9 +294,7 @@ struct ceph_inode_info {
293 int i_rd_ref, i_rdcache_ref, i_wr_ref; 294 int i_rd_ref, i_rdcache_ref, i_wr_ref;
294 int i_wrbuffer_ref, i_wrbuffer_ref_head; 295 int i_wrbuffer_ref, i_wrbuffer_ref_head;
295 u32 i_shared_gen; /* increment each time we get FILE_SHARED */ 296 u32 i_shared_gen; /* increment each time we get FILE_SHARED */
296 u32 i_rdcache_gen; /* we increment this each time we get 297 u32 i_rdcache_gen; /* incremented each time we get FILE_CACHE. */
297 FILE_CACHE. If it's non-zero, we
298 _may_ have cached pages. */
299 u32 i_rdcache_revoking; /* RDCACHE gen to async invalidate, if any */ 298 u32 i_rdcache_revoking; /* RDCACHE gen to async invalidate, if any */
300 299
301 struct list_head i_unsafe_writes; /* uncommitted sync writes */ 300 struct list_head i_unsafe_writes; /* uncommitted sync writes */
@@ -667,7 +666,7 @@ extern void ceph_queue_invalidate(struct inode *inode);
667extern void ceph_queue_writeback(struct inode *inode); 666extern void ceph_queue_writeback(struct inode *inode);
668 667
669extern int ceph_do_getattr(struct inode *inode, int mask); 668extern int ceph_do_getattr(struct inode *inode, int mask);
670extern int ceph_permission(struct inode *inode, int mask); 669extern int ceph_permission(struct inode *inode, int mask, unsigned int flags);
671extern int ceph_setattr(struct dentry *dentry, struct iattr *attr); 670extern int ceph_setattr(struct dentry *dentry, struct iattr *attr);
672extern int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry, 671extern int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry,
673 struct kstat *stat); 672 struct kstat *stat);
@@ -770,6 +769,7 @@ extern void ceph_dentry_lru_add(struct dentry *dn);
770extern void ceph_dentry_lru_touch(struct dentry *dn); 769extern void ceph_dentry_lru_touch(struct dentry *dn);
771extern void ceph_dentry_lru_del(struct dentry *dn); 770extern void ceph_dentry_lru_del(struct dentry *dn);
772extern void ceph_invalidate_dentry_lease(struct dentry *dentry); 771extern void ceph_invalidate_dentry_lease(struct dentry *dentry);
772extern unsigned ceph_dentry_hash(struct dentry *dn);
773 773
774/* 774/*
775 * our d_ops vary depending on whether the inode is live, 775 * our d_ops vary depending on whether the inode is live,
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 6e12a6ba5f7..8c9eba6ef9d 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -219,6 +219,7 @@ static struct ceph_inode_xattr *__get_xattr(struct ceph_inode_info *ci,
219 struct rb_node **p; 219 struct rb_node **p;
220 struct rb_node *parent = NULL; 220 struct rb_node *parent = NULL;
221 struct ceph_inode_xattr *xattr = NULL; 221 struct ceph_inode_xattr *xattr = NULL;
222 int name_len = strlen(name);
222 int c; 223 int c;
223 224
224 p = &ci->i_xattrs.index.rb_node; 225 p = &ci->i_xattrs.index.rb_node;
@@ -226,6 +227,8 @@ static struct ceph_inode_xattr *__get_xattr(struct ceph_inode_info *ci,
226 parent = *p; 227 parent = *p;
227 xattr = rb_entry(parent, struct ceph_inode_xattr, node); 228 xattr = rb_entry(parent, struct ceph_inode_xattr, node);
228 c = strncmp(name, xattr->name, xattr->name_len); 229 c = strncmp(name, xattr->name, xattr->name_len);
230 if (c == 0 && name_len > xattr->name_len)
231 c = 1;
229 if (c < 0) 232 if (c < 0)
230 p = &(*p)->rb_left; 233 p = &(*p)->rb_left;
231 else if (c > 0) 234 else if (c > 0)
diff --git a/fs/char_dev.c b/fs/char_dev.c
index e5b9df993b9..dca9e5e0f73 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -59,7 +59,7 @@ static struct char_device_struct {
59} *chrdevs[CHRDEV_MAJOR_HASH_SIZE]; 59} *chrdevs[CHRDEV_MAJOR_HASH_SIZE];
60 60
61/* index in the above */ 61/* index in the above */
62static inline int major_to_index(int major) 62static inline int major_to_index(unsigned major)
63{ 63{
64 return major % CHRDEV_MAJOR_HASH_SIZE; 64 return major % CHRDEV_MAJOR_HASH_SIZE;
65} 65}
@@ -417,18 +417,6 @@ static int chrdev_open(struct inode *inode, struct file *filp)
417 return ret; 417 return ret;
418} 418}
419 419
420int cdev_index(struct inode *inode)
421{
422 int idx;
423 struct kobject *kobj;
424
425 kobj = kobj_lookup(cdev_map, inode->i_rdev, &idx);
426 if (!kobj)
427 return -1;
428 kobject_put(kobj);
429 return idx;
430}
431
432void cd_forget(struct inode *inode) 420void cd_forget(struct inode *inode)
433{ 421{
434 spin_lock(&cdev_lock); 422 spin_lock(&cdev_lock);
@@ -582,7 +570,6 @@ EXPORT_SYMBOL(cdev_init);
582EXPORT_SYMBOL(cdev_alloc); 570EXPORT_SYMBOL(cdev_alloc);
583EXPORT_SYMBOL(cdev_del); 571EXPORT_SYMBOL(cdev_del);
584EXPORT_SYMBOL(cdev_add); 572EXPORT_SYMBOL(cdev_add);
585EXPORT_SYMBOL(cdev_index);
586EXPORT_SYMBOL(__register_chrdev); 573EXPORT_SYMBOL(__register_chrdev);
587EXPORT_SYMBOL(__unregister_chrdev); 574EXPORT_SYMBOL(__unregister_chrdev);
588EXPORT_SYMBOL(directly_mappable_cdev_bdi); 575EXPORT_SYMBOL(directly_mappable_cdev_bdi);
diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig
index 0ed213970ce..ee45648b0d1 100644
--- a/fs/cifs/Kconfig
+++ b/fs/cifs/Kconfig
@@ -4,6 +4,7 @@ config CIFS
4 select NLS 4 select NLS
5 select CRYPTO 5 select CRYPTO
6 select CRYPTO_MD5 6 select CRYPTO_MD5
7 select CRYPTO_HMAC
7 select CRYPTO_ARC4 8 select CRYPTO_ARC4
8 help 9 help
9 This is the client VFS module for the Common Internet File System 10 This is the client VFS module for the Common Internet File System
@@ -143,6 +144,13 @@ config CIFS_FSCACHE
143 to be cached locally on disk through the general filesystem cache 144 to be cached locally on disk through the general filesystem cache
144 manager. If unsure, say N. 145 manager. If unsure, say N.
145 146
147config CIFS_ACL
148 bool "Provide CIFS ACL support (EXPERIMENTAL)"
149 depends on EXPERIMENTAL && CIFS_XATTR
150 help
151 Allows fetching CIFS/NTFS ACLs from the server. The DACL blob
152 is handed over to the application/caller.
153
146config CIFS_EXPERIMENTAL 154config CIFS_EXPERIMENTAL
147 bool "CIFS Experimental Features (EXPERIMENTAL)" 155 bool "CIFS Experimental Features (EXPERIMENTAL)"
148 depends on CIFS && EXPERIMENTAL 156 depends on CIFS && EXPERIMENTAL
diff --git a/fs/cifs/Makefile b/fs/cifs/Makefile
index adefa60a9bd..d87558448e3 100644
--- a/fs/cifs/Makefile
+++ b/fs/cifs/Makefile
@@ -5,8 +5,10 @@ obj-$(CONFIG_CIFS) += cifs.o
5 5
6cifs-y := cifsfs.o cifssmb.o cifs_debug.o connect.o dir.o file.o inode.o \ 6cifs-y := cifsfs.o cifssmb.o cifs_debug.o connect.o dir.o file.o inode.o \
7 link.o misc.o netmisc.o smbdes.o smbencrypt.o transport.o asn1.o \ 7 link.o misc.o netmisc.o smbdes.o smbencrypt.o transport.o asn1.o \
8 md4.o md5.o cifs_unicode.o nterr.o xattr.o cifsencrypt.o \ 8 cifs_unicode.o nterr.o xattr.o cifsencrypt.o \
9 readdir.o ioctl.o sess.o export.o cifsacl.o 9 readdir.o ioctl.o sess.o export.o
10
11cifs-$(CONFIG_CIFS_ACL) += cifsacl.o
10 12
11cifs-$(CONFIG_CIFS_UPCALL) += cifs_spnego.o 13cifs-$(CONFIG_CIFS_UPCALL) += cifs_spnego.o
12 14
diff --git a/fs/cifs/README b/fs/cifs/README
index ee68d103654..fe168359082 100644
--- a/fs/cifs/README
+++ b/fs/cifs/README
@@ -337,6 +337,15 @@ A partial list of the supported mount options follows:
337 wsize default write size (default 57344) 337 wsize default write size (default 57344)
338 maximum wsize currently allowed by CIFS is 57344 (fourteen 338 maximum wsize currently allowed by CIFS is 57344 (fourteen
339 4096 byte pages) 339 4096 byte pages)
340 actimeo=n attribute cache timeout in seconds (default 1 second).
341 After this timeout, the cifs client requests fresh attribute
342 information from the server. This option allows tuning the
343 attribute cache timeout to suit the workload's needs. Shorter
344 timeouts mean better cache coherency, but an increased number
345 of calls to the server. Longer timeouts mean a reduced number
346 of calls to the server at the expense of less strict cache
347 coherency checks (i.e. incorrect attribute cache for a short
348 period of time).
340 rw mount the network share read-write (note that the 349 rw mount the network share read-write (note that the
341 server may still consider the share read-only) 350 server may still consider the share read-only)
342 ro mount network share read-only 351 ro mount network share read-only
@@ -443,6 +452,11 @@ A partial list of the supported mount options follows:
443 if oplock (caching token) is granted and held. Note that 452 if oplock (caching token) is granted and held. Note that
444 direct allows write operations larger than page size 453 direct allows write operations larger than page size
445 to be sent to the server. 454 to be sent to the server.
455 strictcache Use for switching on strict cache mode. In this mode the
456 client reads from the cache as long as it holds a Level II oplock;
457 otherwise it reads from the server. All written data are stored
458 in the cache, but if the client doesn't hold an Exclusive oplock,
459 it writes the data to the server.
446 acl Allow setfacl and getfacl to manage posix ACLs if server 460 acl Allow setfacl and getfacl to manage posix ACLs if server
447 supports them. (default) 461 supports them. (default)
448 noacl Do not allow setfacl and getfacl calls on this mount 462 noacl Do not allow setfacl and getfacl calls on this mount
diff --git a/fs/cifs/TODO b/fs/cifs/TODO
index 5aff46c61e5..355abcdcda9 100644
--- a/fs/cifs/TODO
+++ b/fs/cifs/TODO
@@ -81,7 +81,7 @@ u) DOS attrs - returned as pseudo-xattr in Samba format (check VFAT and NTFS for
81 81
82v) mount check for unmatched uids 82v) mount check for unmatched uids
83 83
84w) Add support for new vfs entry points for setlease and fallocate 84w) Add support for new vfs entry point for fallocate
85 85
86x) Fix Samba 3 server to handle Linux kernel aio so dbench with lots of 86x) Fix Samba 3 server to handle Linux kernel aio so dbench with lots of
87processes can proceed better in parallel (on the server) 87processes can proceed better in parallel (on the server)
diff --git a/fs/cifs/cache.c b/fs/cifs/cache.c
index 224d7bbd1fc..e654dfd092c 100644
--- a/fs/cifs/cache.c
+++ b/fs/cifs/cache.c
@@ -64,7 +64,9 @@ static uint16_t cifs_server_get_key(const void *cookie_netfs_data,
64 void *buffer, uint16_t maxbuf) 64 void *buffer, uint16_t maxbuf)
65{ 65{
66 const struct TCP_Server_Info *server = cookie_netfs_data; 66 const struct TCP_Server_Info *server = cookie_netfs_data;
67 const struct sockaddr *sa = (struct sockaddr *) &server->addr.sockAddr; 67 const struct sockaddr *sa = (struct sockaddr *) &server->dstaddr;
68 const struct sockaddr_in *addr = (struct sockaddr_in *) sa;
69 const struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *) sa;
68 struct cifs_server_key *key = buffer; 70 struct cifs_server_key *key = buffer;
69 uint16_t key_len = sizeof(struct cifs_server_key); 71 uint16_t key_len = sizeof(struct cifs_server_key);
70 72
@@ -76,16 +78,16 @@ static uint16_t cifs_server_get_key(const void *cookie_netfs_data,
76 */ 78 */
77 switch (sa->sa_family) { 79 switch (sa->sa_family) {
78 case AF_INET: 80 case AF_INET:
79 key->family = server->addr.sockAddr.sin_family; 81 key->family = sa->sa_family;
80 key->port = server->addr.sockAddr.sin_port; 82 key->port = addr->sin_port;
81 key->addr[0].ipv4_addr = server->addr.sockAddr.sin_addr; 83 key->addr[0].ipv4_addr = addr->sin_addr;
82 key_len += sizeof(key->addr[0].ipv4_addr); 84 key_len += sizeof(key->addr[0].ipv4_addr);
83 break; 85 break;
84 86
85 case AF_INET6: 87 case AF_INET6:
86 key->family = server->addr.sockAddr6.sin6_family; 88 key->family = sa->sa_family;
87 key->port = server->addr.sockAddr6.sin6_port; 89 key->port = addr6->sin6_port;
88 key->addr[0].ipv6_addr = server->addr.sockAddr6.sin6_addr; 90 key->addr[0].ipv6_addr = addr6->sin6_addr;
89 key_len += sizeof(key->addr[0].ipv6_addr); 91 key_len += sizeof(key->addr[0].ipv6_addr);
90 break; 92 break;
91 93
diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index 103ab8b605b..65829d32128 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -79,11 +79,11 @@ void cifs_dump_mids(struct TCP_Server_Info *server)
79 spin_lock(&GlobalMid_Lock); 79 spin_lock(&GlobalMid_Lock);
80 list_for_each(tmp, &server->pending_mid_q) { 80 list_for_each(tmp, &server->pending_mid_q) {
81 mid_entry = list_entry(tmp, struct mid_q_entry, qhead); 81 mid_entry = list_entry(tmp, struct mid_q_entry, qhead);
82 cERROR(1, "State: %d Cmd: %d Pid: %d Tsk: %p Mid %d", 82 cERROR(1, "State: %d Cmd: %d Pid: %d Cbdata: %p Mid %d",
83 mid_entry->midState, 83 mid_entry->midState,
84 (int)mid_entry->command, 84 (int)mid_entry->command,
85 mid_entry->pid, 85 mid_entry->pid,
86 mid_entry->tsk, 86 mid_entry->callback_data,
87 mid_entry->mid); 87 mid_entry->mid);
88#ifdef CONFIG_CIFS_STATS2 88#ifdef CONFIG_CIFS_STATS2
89 cERROR(1, "IsLarge: %d buf: %p time rcv: %ld now: %ld", 89 cERROR(1, "IsLarge: %d buf: %p time rcv: %ld now: %ld",
@@ -119,29 +119,27 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
119 "Display Internal CIFS Data Structures for Debugging\n" 119 "Display Internal CIFS Data Structures for Debugging\n"
120 "---------------------------------------------------\n"); 120 "---------------------------------------------------\n");
121 seq_printf(m, "CIFS Version %s\n", CIFS_VERSION); 121 seq_printf(m, "CIFS Version %s\n", CIFS_VERSION);
122 seq_printf(m, "Features: "); 122 seq_printf(m, "Features:");
123#ifdef CONFIG_CIFS_DFS_UPCALL 123#ifdef CONFIG_CIFS_DFS_UPCALL
124 seq_printf(m, "dfs"); 124 seq_printf(m, " dfs");
125 seq_putc(m, ' ');
126#endif 125#endif
127#ifdef CONFIG_CIFS_FSCACHE 126#ifdef CONFIG_CIFS_FSCACHE
128 seq_printf(m, "fscache"); 127 seq_printf(m, " fscache");
129 seq_putc(m, ' ');
130#endif 128#endif
131#ifdef CONFIG_CIFS_WEAK_PW_HASH 129#ifdef CONFIG_CIFS_WEAK_PW_HASH
132 seq_printf(m, "lanman"); 130 seq_printf(m, " lanman");
133 seq_putc(m, ' ');
134#endif 131#endif
135#ifdef CONFIG_CIFS_POSIX 132#ifdef CONFIG_CIFS_POSIX
136 seq_printf(m, "posix"); 133 seq_printf(m, " posix");
137 seq_putc(m, ' ');
138#endif 134#endif
139#ifdef CONFIG_CIFS_UPCALL 135#ifdef CONFIG_CIFS_UPCALL
140 seq_printf(m, "spnego"); 136 seq_printf(m, " spnego");
141 seq_putc(m, ' ');
142#endif 137#endif
143#ifdef CONFIG_CIFS_XATTR 138#ifdef CONFIG_CIFS_XATTR
144 seq_printf(m, "xattr"); 139 seq_printf(m, " xattr");
140#endif
141#ifdef CONFIG_CIFS_ACL
142 seq_printf(m, " acl");
145#endif 143#endif
146 seq_putc(m, '\n'); 144 seq_putc(m, '\n');
147 seq_printf(m, "Active VFS Requests: %d\n", GlobalTotalActiveXid); 145 seq_printf(m, "Active VFS Requests: %d\n", GlobalTotalActiveXid);
@@ -220,11 +218,11 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
220 mid_entry = list_entry(tmp3, struct mid_q_entry, 218 mid_entry = list_entry(tmp3, struct mid_q_entry,
221 qhead); 219 qhead);
222 seq_printf(m, "\tState: %d com: %d pid:" 220 seq_printf(m, "\tState: %d com: %d pid:"
223 " %d tsk: %p mid %d\n", 221 " %d cbdata: %p mid %d\n",
224 mid_entry->midState, 222 mid_entry->midState,
225 (int)mid_entry->command, 223 (int)mid_entry->command,
226 mid_entry->pid, 224 mid_entry->pid,
227 mid_entry->tsk, 225 mid_entry->callback_data,
228 mid_entry->mid); 226 mid_entry->mid);
229 } 227 }
230 spin_unlock(&GlobalMid_Lock); 228 spin_unlock(&GlobalMid_Lock);
@@ -333,7 +331,7 @@ static int cifs_stats_proc_show(struct seq_file *m, void *v)
333 atomic_read(&totSmBufAllocCount)); 331 atomic_read(&totSmBufAllocCount));
334#endif /* CONFIG_CIFS_STATS2 */ 332#endif /* CONFIG_CIFS_STATS2 */
335 333
336 seq_printf(m, "Operations (MIDs): %d\n", midCount.counter); 334 seq_printf(m, "Operations (MIDs): %d\n", atomic_read(&midCount));
337 seq_printf(m, 335 seq_printf(m,
338 "\n%d session %d share reconnects\n", 336 "\n%d session %d share reconnects\n",
339 tcpSesReconnectCount.counter, tconInfoReconnectCount.counter); 337 tcpSesReconnectCount.counter, tconInfoReconnectCount.counter);
diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index c68a056f27f..f1c68629f27 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -255,35 +255,6 @@ static struct vfsmount *cifs_dfs_do_refmount(struct cifs_sb_info *cifs_sb,
255 255
256} 256}
257 257
258static int add_mount_helper(struct vfsmount *newmnt, struct nameidata *nd,
259 struct list_head *mntlist)
260{
261 /* stolen from afs code */
262 int err;
263
264 mntget(newmnt);
265 err = do_add_mount(newmnt, &nd->path, nd->path.mnt->mnt_flags | MNT_SHRINKABLE, mntlist);
266 switch (err) {
267 case 0:
268 path_put(&nd->path);
269 nd->path.mnt = newmnt;
270 nd->path.dentry = dget(newmnt->mnt_root);
271 schedule_delayed_work(&cifs_dfs_automount_task,
272 cifs_dfs_mountpoint_expiry_timeout);
273 break;
274 case -EBUSY:
275 /* someone else made a mount here whilst we were busy */
276 while (d_mountpoint(nd->path.dentry) &&
277 follow_down(&nd->path))
278 ;
279 err = 0;
280 default:
281 mntput(newmnt);
282 break;
283 }
284 return err;
285}
286
287static void dump_referral(const struct dfs_info3_param *ref) 258static void dump_referral(const struct dfs_info3_param *ref)
288{ 259{
289 cFYI(1, "DFS: ref path: %s", ref->path_name); 260 cFYI(1, "DFS: ref path: %s", ref->path_name);
@@ -293,45 +264,42 @@ static void dump_referral(const struct dfs_info3_param *ref)
293 ref->path_consumed); 264 ref->path_consumed);
294} 265}
295 266
296 267/*
297static void* 268 * Create a vfsmount that we can automount
298cifs_dfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd) 269 */
270static struct vfsmount *cifs_dfs_do_automount(struct dentry *mntpt)
299{ 271{
300 struct dfs_info3_param *referrals = NULL; 272 struct dfs_info3_param *referrals = NULL;
301 unsigned int num_referrals = 0; 273 unsigned int num_referrals = 0;
302 struct cifs_sb_info *cifs_sb; 274 struct cifs_sb_info *cifs_sb;
303 struct cifsSesInfo *ses; 275 struct cifsSesInfo *ses;
304 char *full_path = NULL; 276 char *full_path;
305 int xid, i; 277 int xid, i;
306 int rc = 0; 278 int rc;
307 struct vfsmount *mnt = ERR_PTR(-ENOENT); 279 struct vfsmount *mnt;
308 struct tcon_link *tlink; 280 struct tcon_link *tlink;
309 281
310 cFYI(1, "in %s", __func__); 282 cFYI(1, "in %s", __func__);
311 BUG_ON(IS_ROOT(dentry)); 283 BUG_ON(IS_ROOT(mntpt));
312 284
313 xid = GetXid(); 285 xid = GetXid();
314 286
315 dput(nd->path.dentry);
316 nd->path.dentry = dget(dentry);
317
318 /* 287 /*
319 * The MSDFS spec states that paths in DFS referral requests and 288 * The MSDFS spec states that paths in DFS referral requests and
320 * responses must be prefixed by a single '\' character instead of 289 * responses must be prefixed by a single '\' character instead of
321 * the double backslashes usually used in the UNC. This function 290 * the double backslashes usually used in the UNC. This function
322 * gives us the latter, so we must adjust the result. 291 * gives us the latter, so we must adjust the result.
323 */ 292 */
324 full_path = build_path_from_dentry(dentry); 293 mnt = ERR_PTR(-ENOMEM);
325 if (full_path == NULL) { 294 full_path = build_path_from_dentry(mntpt);
326 rc = -ENOMEM; 295 if (full_path == NULL)
327 goto out_err; 296 goto free_xid;
328 }
329 297
330 cifs_sb = CIFS_SB(dentry->d_inode->i_sb); 298 cifs_sb = CIFS_SB(mntpt->d_inode->i_sb);
331 tlink = cifs_sb_tlink(cifs_sb); 299 tlink = cifs_sb_tlink(cifs_sb);
332 if (IS_ERR(tlink)) { 300 if (IS_ERR(tlink)) {
333 rc = PTR_ERR(tlink); 301 mnt = ERR_CAST(tlink);
334 goto out_err; 302 goto free_full_path;
335 } 303 }
336 ses = tlink_tcon(tlink)->ses; 304 ses = tlink_tcon(tlink)->ses;
337 305
@@ -341,46 +309,63 @@ cifs_dfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
341 309
342 cifs_put_tlink(tlink); 310 cifs_put_tlink(tlink);
343 311
312 mnt = ERR_PTR(-ENOENT);
344 for (i = 0; i < num_referrals; i++) { 313 for (i = 0; i < num_referrals; i++) {
345 int len; 314 int len;
346 dump_referral(referrals+i); 315 dump_referral(referrals + i);
347 /* connect to a node */ 316 /* connect to a node */
348 len = strlen(referrals[i].node_name); 317 len = strlen(referrals[i].node_name);
349 if (len < 2) { 318 if (len < 2) {
350 cERROR(1, "%s: Net Address path too short: %s", 319 cERROR(1, "%s: Net Address path too short: %s",
351 __func__, referrals[i].node_name); 320 __func__, referrals[i].node_name);
352 rc = -EINVAL; 321 mnt = ERR_PTR(-EINVAL);
353 goto out_err; 322 break;
354 } 323 }
355 mnt = cifs_dfs_do_refmount(cifs_sb, 324 mnt = cifs_dfs_do_refmount(cifs_sb,
356 full_path, referrals + i); 325 full_path, referrals + i);
357 cFYI(1, "%s: cifs_dfs_do_refmount:%s , mnt:%p", __func__, 326 cFYI(1, "%s: cifs_dfs_do_refmount:%s , mnt:%p", __func__,
358 referrals[i].node_name, mnt); 327 referrals[i].node_name, mnt);
359
360 /* complete mount procedure if we accured submount */
361 if (!IS_ERR(mnt)) 328 if (!IS_ERR(mnt))
362 break; 329 goto success;
363 } 330 }
364 331
365 /* we need it cause for() above could exit without valid submount */ 332 /* no valid submounts were found; return error from get_dfs_path() by
366 rc = PTR_ERR(mnt); 333 * preference */
367 if (IS_ERR(mnt)) 334 if (rc != 0)
368 goto out_err; 335 mnt = ERR_PTR(rc);
369
370 rc = add_mount_helper(mnt, nd, &cifs_dfs_automount_list);
371 336
372out: 337success:
373 FreeXid(xid);
374 free_dfs_info_array(referrals, num_referrals); 338 free_dfs_info_array(referrals, num_referrals);
339free_full_path:
375 kfree(full_path); 340 kfree(full_path);
341free_xid:
342 FreeXid(xid);
376 cFYI(1, "leaving %s" , __func__); 343 cFYI(1, "leaving %s" , __func__);
377 return ERR_PTR(rc); 344 return mnt;
378out_err: 345}
379 path_put(&nd->path); 346
380 goto out; 347/*
348 * Attempt to automount the referral
349 */
350struct vfsmount *cifs_dfs_d_automount(struct path *path)
351{
352 struct vfsmount *newmnt;
353
354 cFYI(1, "in %s", __func__);
355
356 newmnt = cifs_dfs_do_automount(path->dentry);
357 if (IS_ERR(newmnt)) {
358 cFYI(1, "leaving %s [automount failed]" , __func__);
359 return newmnt;
360 }
361
362 mntget(newmnt); /* prevent immediate expiration */
363 mnt_set_expiry(newmnt, &cifs_dfs_automount_list);
364 schedule_delayed_work(&cifs_dfs_automount_task,
365 cifs_dfs_mountpoint_expiry_timeout);
366 cFYI(1, "leaving %s [ok]" , __func__);
367 return newmnt;
381} 368}
382 369
383const struct inode_operations cifs_dfs_referral_inode_operations = { 370const struct inode_operations cifs_dfs_referral_inode_operations = {
384 .follow_link = cifs_dfs_follow_mountpoint,
385}; 371};
386
diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h
index 525ba59a410..ac51cd2d33a 100644
--- a/fs/cifs/cifs_fs_sb.h
+++ b/fs/cifs/cifs_fs_sb.h
@@ -15,7 +15,7 @@
15 * the GNU Lesser General Public License for more details. 15 * the GNU Lesser General Public License for more details.
16 * 16 *
17 */ 17 */
18#include <linux/radix-tree.h> 18#include <linux/rbtree.h>
19 19
20#ifndef _CIFS_FS_SB_H 20#ifndef _CIFS_FS_SB_H
21#define _CIFS_FS_SB_H 21#define _CIFS_FS_SB_H
@@ -40,14 +40,16 @@
40#define CIFS_MOUNT_FSCACHE 0x8000 /* local caching enabled */ 40#define CIFS_MOUNT_FSCACHE 0x8000 /* local caching enabled */
41#define CIFS_MOUNT_MF_SYMLINKS 0x10000 /* Minshall+French Symlinks enabled */ 41#define CIFS_MOUNT_MF_SYMLINKS 0x10000 /* Minshall+French Symlinks enabled */
42#define CIFS_MOUNT_MULTIUSER 0x20000 /* multiuser mount */ 42#define CIFS_MOUNT_MULTIUSER 0x20000 /* multiuser mount */
43#define CIFS_MOUNT_STRICT_IO 0x40000 /* strict cache mode */
43 44
44struct cifs_sb_info { 45struct cifs_sb_info {
45 struct radix_tree_root tlink_tree; 46 struct rb_root tlink_tree;
46#define CIFS_TLINK_MASTER_TAG 0 /* is "master" (mount) tcon */
47 spinlock_t tlink_tree_lock; 47 spinlock_t tlink_tree_lock;
48 struct tcon_link *master_tlink;
48 struct nls_table *local_nls; 49 struct nls_table *local_nls;
49 unsigned int rsize; 50 unsigned int rsize;
50 unsigned int wsize; 51 unsigned int wsize;
52 unsigned long actimeo; /* attribute cache timeout (jiffies) */
51 atomic_t active; 53 atomic_t active;
52 uid_t mnt_uid; 54 uid_t mnt_uid;
53 gid_t mnt_gid; 55 gid_t mnt_gid;
diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c
index 87044906cd1..4dfba828316 100644
--- a/fs/cifs/cifs_spnego.c
+++ b/fs/cifs/cifs_spnego.c
@@ -98,6 +98,8 @@ struct key *
98cifs_get_spnego_key(struct cifsSesInfo *sesInfo) 98cifs_get_spnego_key(struct cifsSesInfo *sesInfo)
99{ 99{
100 struct TCP_Server_Info *server = sesInfo->server; 100 struct TCP_Server_Info *server = sesInfo->server;
101 struct sockaddr_in *sa = (struct sockaddr_in *) &server->dstaddr;
102 struct sockaddr_in6 *sa6 = (struct sockaddr_in6 *) &server->dstaddr;
101 char *description, *dp; 103 char *description, *dp;
102 size_t desc_len; 104 size_t desc_len;
103 struct key *spnego_key; 105 struct key *spnego_key;
@@ -127,10 +129,10 @@ cifs_get_spnego_key(struct cifsSesInfo *sesInfo)
127 dp = description + strlen(description); 129 dp = description + strlen(description);
128 130
129 /* add the server address */ 131 /* add the server address */
130 if (server->addr.sockAddr.sin_family == AF_INET) 132 if (server->dstaddr.ss_family == AF_INET)
131 sprintf(dp, "ip4=%pI4", &server->addr.sockAddr.sin_addr); 133 sprintf(dp, "ip4=%pI4", &sa->sin_addr);
132 else if (server->addr.sockAddr.sin_family == AF_INET6) 134 else if (server->dstaddr.ss_family == AF_INET6)
133 sprintf(dp, "ip6=%pI6", &server->addr.sockAddr6.sin6_addr); 135 sprintf(dp, "ip6=%pI6", &sa6->sin6_addr);
134 else 136 else
135 goto out; 137 goto out;
136 138
diff --git a/fs/cifs/cifs_unicode.c b/fs/cifs/cifs_unicode.c
index 430f510a172..fc0fd4fde30 100644
--- a/fs/cifs/cifs_unicode.c
+++ b/fs/cifs/cifs_unicode.c
@@ -44,10 +44,14 @@ cifs_ucs2_bytes(const __le16 *from, int maxbytes,
44 int charlen, outlen = 0; 44 int charlen, outlen = 0;
45 int maxwords = maxbytes / 2; 45 int maxwords = maxbytes / 2;
46 char tmp[NLS_MAX_CHARSET_SIZE]; 46 char tmp[NLS_MAX_CHARSET_SIZE];
47 __u16 ftmp;
47 48
48 for (i = 0; i < maxwords && from[i]; i++) { 49 for (i = 0; i < maxwords; i++) {
49 charlen = codepage->uni2char(le16_to_cpu(from[i]), tmp, 50 ftmp = get_unaligned_le16(&from[i]);
50 NLS_MAX_CHARSET_SIZE); 51 if (ftmp == 0)
52 break;
53
54 charlen = codepage->uni2char(ftmp, tmp, NLS_MAX_CHARSET_SIZE);
51 if (charlen > 0) 55 if (charlen > 0)
52 outlen += charlen; 56 outlen += charlen;
53 else 57 else
@@ -58,9 +62,9 @@ cifs_ucs2_bytes(const __le16 *from, int maxbytes,
58} 62}
59 63
60/* 64/*
61 * cifs_mapchar - convert a little-endian char to proper char in codepage 65 * cifs_mapchar - convert a host-endian char to proper char in codepage
62 * @target - where converted character should be copied 66 * @target - where converted character should be copied
63 * @src_char - 2 byte little-endian source character 67 * @src_char - 2 byte host-endian source character
64 * @cp - codepage to which character should be converted 68 * @cp - codepage to which character should be converted
65 * @mapchar - should character be mapped according to mapchars mount option? 69 * @mapchar - should character be mapped according to mapchars mount option?
66 * 70 *
@@ -69,7 +73,7 @@ cifs_ucs2_bytes(const __le16 *from, int maxbytes,
69 * enough to hold the result of the conversion (at least NLS_MAX_CHARSET_SIZE). 73 * enough to hold the result of the conversion (at least NLS_MAX_CHARSET_SIZE).
70 */ 74 */
71static int 75static int
72cifs_mapchar(char *target, const __le16 src_char, const struct nls_table *cp, 76cifs_mapchar(char *target, const __u16 src_char, const struct nls_table *cp,
73 bool mapchar) 77 bool mapchar)
74{ 78{
75 int len = 1; 79 int len = 1;
@@ -82,7 +86,7 @@ cifs_mapchar(char *target, const __le16 src_char, const struct nls_table *cp,
82 * build_path_from_dentry are modified, as they use slash as 86 * build_path_from_dentry are modified, as they use slash as
83 * separator. 87 * separator.
84 */ 88 */
85 switch (le16_to_cpu(src_char)) { 89 switch (src_char) {
86 case UNI_COLON: 90 case UNI_COLON:
87 *target = ':'; 91 *target = ':';
88 break; 92 break;
@@ -109,8 +113,7 @@ out:
109 return len; 113 return len;
110 114
111cp_convert: 115cp_convert:
112 len = cp->uni2char(le16_to_cpu(src_char), target, 116 len = cp->uni2char(src_char, target, NLS_MAX_CHARSET_SIZE);
113 NLS_MAX_CHARSET_SIZE);
114 if (len <= 0) { 117 if (len <= 0) {
115 *target = '?'; 118 *target = '?';
116 len = 1; 119 len = 1;
@@ -149,6 +152,7 @@ cifs_from_ucs2(char *to, const __le16 *from, int tolen, int fromlen,
149 int nullsize = nls_nullsize(codepage); 152 int nullsize = nls_nullsize(codepage);
150 int fromwords = fromlen / 2; 153 int fromwords = fromlen / 2;
151 char tmp[NLS_MAX_CHARSET_SIZE]; 154 char tmp[NLS_MAX_CHARSET_SIZE];
155 __u16 ftmp;
152 156
153 /* 157 /*
154 * because the chars can be of varying widths, we need to take care 158 * because the chars can be of varying widths, we need to take care
@@ -158,19 +162,23 @@ cifs_from_ucs2(char *to, const __le16 *from, int tolen, int fromlen,
158 */ 162 */
159 safelen = tolen - (NLS_MAX_CHARSET_SIZE + nullsize); 163 safelen = tolen - (NLS_MAX_CHARSET_SIZE + nullsize);
160 164
161 for (i = 0; i < fromwords && from[i]; i++) { 165 for (i = 0; i < fromwords; i++) {
166 ftmp = get_unaligned_le16(&from[i]);
167 if (ftmp == 0)
168 break;
169
162 /* 170 /*
163 * check to see if converting this character might make the 171 * check to see if converting this character might make the
164 * conversion bleed into the null terminator 172 * conversion bleed into the null terminator
165 */ 173 */
166 if (outlen >= safelen) { 174 if (outlen >= safelen) {
167 charlen = cifs_mapchar(tmp, from[i], codepage, mapchar); 175 charlen = cifs_mapchar(tmp, ftmp, codepage, mapchar);
168 if ((outlen + charlen) > (tolen - nullsize)) 176 if ((outlen + charlen) > (tolen - nullsize))
169 break; 177 break;
170 } 178 }
171 179
172 /* put converted char into 'to' buffer */ 180 /* put converted char into 'to' buffer */
173 charlen = cifs_mapchar(&to[outlen], from[i], codepage, mapchar); 181 charlen = cifs_mapchar(&to[outlen], ftmp, codepage, mapchar);
174 outlen += charlen; 182 outlen += charlen;
175 } 183 }
176 184
@@ -193,24 +201,21 @@ cifs_strtoUCS(__le16 *to, const char *from, int len,
193{ 201{
194 int charlen; 202 int charlen;
195 int i; 203 int i;
196 wchar_t *wchar_to = (wchar_t *)to; /* needed to quiet sparse */ 204 wchar_t wchar_to; /* needed to quiet sparse */
197 205
198 for (i = 0; len && *from; i++, from += charlen, len -= charlen) { 206 for (i = 0; len && *from; i++, from += charlen, len -= charlen) {
199 207 charlen = codepage->char2uni(from, len, &wchar_to);
200 /* works for 2.4.0 kernel or later */
201 charlen = codepage->char2uni(from, len, &wchar_to[i]);
202 if (charlen < 1) { 208 if (charlen < 1) {
203 cERROR(1, "strtoUCS: char2uni of %d returned %d", 209 cERROR(1, "strtoUCS: char2uni of 0x%x returned %d",
204 (int)*from, charlen); 210 *from, charlen);
205 /* A question mark */ 211 /* A question mark */
206 to[i] = cpu_to_le16(0x003f); 212 wchar_to = 0x003f;
207 charlen = 1; 213 charlen = 1;
208 } else 214 }
209 to[i] = cpu_to_le16(wchar_to[i]); 215 put_unaligned_le16(wchar_to, &to[i]);
210
211 } 216 }
212 217
213 to[i] = 0; 218 put_unaligned_le16(0, &to[i]);
214 return i; 219 return i;
215} 220}
216 221
@@ -252,3 +257,79 @@ cifs_strndup_from_ucs(const char *src, const int maxlen, const bool is_unicode,
252 return dst; 257 return dst;
253} 258}
254 259
260/*
261 * Convert 16 bit Unicode pathname to wire format from string in current code
262 * page. Conversion may involve remapping up the six characters that are
263 * only legal in POSIX-like OS (if they are present in the string). Path
264 * names are little endian 16 bit Unicode on the wire
265 */
266int
267cifsConvertToUCS(__le16 *target, const char *source, int maxlen,
268 const struct nls_table *cp, int mapChars)
269{
270 int i, j, charlen;
271 int len_remaining = maxlen;
272 char src_char;
273 __u16 temp;
274
275 if (!mapChars)
276 return cifs_strtoUCS(target, source, PATH_MAX, cp);
277
278 for (i = 0, j = 0; i < maxlen; j++) {
279 src_char = source[i];
280 switch (src_char) {
281 case 0:
282 put_unaligned_le16(0, &target[j]);
283 goto ctoUCS_out;
284 case ':':
285 temp = UNI_COLON;
286 break;
287 case '*':
288 temp = UNI_ASTERIK;
289 break;
290 case '?':
291 temp = UNI_QUESTION;
292 break;
293 case '<':
294 temp = UNI_LESSTHAN;
295 break;
296 case '>':
297 temp = UNI_GRTRTHAN;
298 break;
299 case '|':
300 temp = UNI_PIPE;
301 break;
302 /*
303 * FIXME: We can not handle remapping backslash (UNI_SLASH)
304 * until all the calls to build_path_from_dentry are modified,
305 * as they use backslash as separator.
306 */
307 default:
308 charlen = cp->char2uni(source+i, len_remaining,
309 &temp);
310 /*
311 * if no match, use question mark, which at least in
312 * some cases serves as wild card
313 */
314 if (charlen < 1) {
315 temp = 0x003f;
316 charlen = 1;
317 }
318 len_remaining -= charlen;
319 /*
320 * character may take more than one byte in the source
321 * string, but will take exactly two bytes in the
322 * target string
323 */
324 i += charlen;
325 continue;
326 }
327 put_unaligned_le16(temp, &target[j]);
328 i++; /* move to next char in source string */
329 len_remaining--;
330 }
331
332ctoUCS_out:
333 return i;
334}
335
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index c9b4792ae82..1e7636b145a 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -30,8 +30,6 @@
30#include "cifs_debug.h" 30#include "cifs_debug.h"
31 31
32 32
33#ifdef CONFIG_CIFS_EXPERIMENTAL
34
35static struct cifs_wksid wksidarr[NUM_WK_SIDS] = { 33static struct cifs_wksid wksidarr[NUM_WK_SIDS] = {
36 {{1, 0, {0, 0, 0, 0, 0, 0}, {0, 0, 0, 0, 0} }, "null user"}, 34 {{1, 0, {0, 0, 0, 0, 0, 0}, {0, 0, 0, 0, 0} }, "null user"},
37 {{1, 1, {0, 0, 0, 0, 0, 1}, {0, 0, 0, 0, 0} }, "nobody"}, 35 {{1, 1, {0, 0, 0, 0, 0, 1}, {0, 0, 0, 0, 0} }, "nobody"},
@@ -43,9 +41,12 @@ static struct cifs_wksid wksidarr[NUM_WK_SIDS] = {
43; 41;
44 42
45 43
46/* security id for everyone */ 44/* security id for everyone/world system group */
47static const struct cifs_sid sid_everyone = { 45static const struct cifs_sid sid_everyone = {
48 1, 1, {0, 0, 0, 0, 0, 1}, {0} }; 46 1, 1, {0, 0, 0, 0, 0, 1}, {0} };
47/* security id for Authenticated Users system group */
48static const struct cifs_sid sid_authusers = {
49 1, 1, {0, 0, 0, 0, 0, 5}, {11} };
49/* group users */ 50/* group users */
50static const struct cifs_sid sid_user = {1, 2 , {0, 0, 0, 0, 0, 5}, {} }; 51static const struct cifs_sid sid_user = {1, 2 , {0, 0, 0, 0, 0, 5}, {} };
51 52
@@ -367,7 +368,7 @@ static void parse_dacl(struct cifs_acl *pdacl, char *end_of_acl,
367 if (num_aces > 0) { 368 if (num_aces > 0) {
368 umode_t user_mask = S_IRWXU; 369 umode_t user_mask = S_IRWXU;
369 umode_t group_mask = S_IRWXG; 370 umode_t group_mask = S_IRWXG;
370 umode_t other_mask = S_IRWXO; 371 umode_t other_mask = S_IRWXU | S_IRWXG | S_IRWXO;
371 372
372 ppace = kmalloc(num_aces * sizeof(struct cifs_ace *), 373 ppace = kmalloc(num_aces * sizeof(struct cifs_ace *),
373 GFP_KERNEL); 374 GFP_KERNEL);
@@ -392,6 +393,12 @@ static void parse_dacl(struct cifs_acl *pdacl, char *end_of_acl,
392 ppace[i]->type, 393 ppace[i]->type,
393 &fattr->cf_mode, 394 &fattr->cf_mode,
394 &other_mask); 395 &other_mask);
396 if (compare_sids(&(ppace[i]->sid), &sid_authusers))
397 access_flags_to_mode(ppace[i]->access_req,
398 ppace[i]->type,
399 &fattr->cf_mode,
400 &other_mask);
401
395 402
396/* memcpy((void *)(&(cifscred->aces[i])), 403/* memcpy((void *)(&(cifscred->aces[i])),
397 (void *)ppace[i], 404 (void *)ppace[i],
@@ -560,7 +567,7 @@ static struct cifs_ntsd *get_cifs_acl_by_fid(struct cifs_sb_info *cifs_sb,
560 struct tcon_link *tlink = cifs_sb_tlink(cifs_sb); 567 struct tcon_link *tlink = cifs_sb_tlink(cifs_sb);
561 568
562 if (IS_ERR(tlink)) 569 if (IS_ERR(tlink))
563 return NULL; 570 return ERR_CAST(tlink);
564 571
565 xid = GetXid(); 572 xid = GetXid();
566 rc = CIFSSMBGetCIFSACL(xid, tlink_tcon(tlink), fid, &pntsd, pacllen); 573 rc = CIFSSMBGetCIFSACL(xid, tlink_tcon(tlink), fid, &pntsd, pacllen);
@@ -568,7 +575,9 @@ static struct cifs_ntsd *get_cifs_acl_by_fid(struct cifs_sb_info *cifs_sb,
568 575
569 cifs_put_tlink(tlink); 576 cifs_put_tlink(tlink);
570 577
571 cFYI(1, "GetCIFSACL rc = %d ACL len %d", rc, *pacllen); 578 cFYI(1, "%s: rc = %d ACL len %d", __func__, rc, *pacllen);
579 if (rc)
580 return ERR_PTR(rc);
572 return pntsd; 581 return pntsd;
573} 582}
574 583
@@ -583,7 +592,7 @@ static struct cifs_ntsd *get_cifs_acl_by_path(struct cifs_sb_info *cifs_sb,
583 struct tcon_link *tlink = cifs_sb_tlink(cifs_sb); 592 struct tcon_link *tlink = cifs_sb_tlink(cifs_sb);
584 593
585 if (IS_ERR(tlink)) 594 if (IS_ERR(tlink))
586 return NULL; 595 return ERR_CAST(tlink);
587 596
588 tcon = tlink_tcon(tlink); 597 tcon = tlink_tcon(tlink);
589 xid = GetXid(); 598 xid = GetXid();
@@ -591,23 +600,22 @@ static struct cifs_ntsd *get_cifs_acl_by_path(struct cifs_sb_info *cifs_sb,
591 rc = CIFSSMBOpen(xid, tcon, path, FILE_OPEN, READ_CONTROL, 0, 600 rc = CIFSSMBOpen(xid, tcon, path, FILE_OPEN, READ_CONTROL, 0,
592 &fid, &oplock, NULL, cifs_sb->local_nls, 601 &fid, &oplock, NULL, cifs_sb->local_nls,
593 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); 602 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
594 if (rc) { 603 if (!rc) {
595 cERROR(1, "Unable to open file to get ACL"); 604 rc = CIFSSMBGetCIFSACL(xid, tcon, fid, &pntsd, pacllen);
596 goto out; 605 CIFSSMBClose(xid, tcon, fid);
597 } 606 }
598 607
599 rc = CIFSSMBGetCIFSACL(xid, tcon, fid, &pntsd, pacllen);
600 cFYI(1, "GetCIFSACL rc = %d ACL len %d", rc, *pacllen);
601
602 CIFSSMBClose(xid, tcon, fid);
603 out:
604 cifs_put_tlink(tlink); 608 cifs_put_tlink(tlink);
605 FreeXid(xid); 609 FreeXid(xid);
610
611 cFYI(1, "%s: rc = %d ACL len %d", __func__, rc, *pacllen);
612 if (rc)
613 return ERR_PTR(rc);
606 return pntsd; 614 return pntsd;
607} 615}
608 616
609/* Retrieve an ACL from the server */ 617/* Retrieve an ACL from the server */
610static struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *cifs_sb, 618struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *cifs_sb,
611 struct inode *inode, const char *path, 619 struct inode *inode, const char *path,
612 u32 *pacllen) 620 u32 *pacllen)
613{ 621{
@@ -695,7 +703,7 @@ static int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen,
695} 703}
696 704
697/* Translate the CIFS ACL (simlar to NTFS ACL) for a file into mode bits */ 705/* Translate the CIFS ACL (simlar to NTFS ACL) for a file into mode bits */
698void 706int
699cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr, 707cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr,
700 struct inode *inode, const char *path, const __u16 *pfid) 708 struct inode *inode, const char *path, const __u16 *pfid)
701{ 709{
@@ -711,17 +719,21 @@ cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr,
711 pntsd = get_cifs_acl(cifs_sb, inode, path, &acllen); 719 pntsd = get_cifs_acl(cifs_sb, inode, path, &acllen);
712 720
713 /* if we can retrieve the ACL, now parse Access Control Entries, ACEs */ 721 /* if we can retrieve the ACL, now parse Access Control Entries, ACEs */
714 if (pntsd) 722 if (IS_ERR(pntsd)) {
723 rc = PTR_ERR(pntsd);
724 cERROR(1, "%s: error %d getting sec desc", __func__, rc);
725 } else {
715 rc = parse_sec_desc(pntsd, acllen, fattr); 726 rc = parse_sec_desc(pntsd, acllen, fattr);
716 if (rc) 727 kfree(pntsd);
717 cFYI(1, "parse sec desc failed rc = %d", rc); 728 if (rc)
729 cERROR(1, "parse sec desc failed rc = %d", rc);
730 }
718 731
719 kfree(pntsd); 732 return rc;
720 return;
721} 733}
722 734
723/* Convert mode bits to an ACL so we can update the ACL on the server */ 735/* Convert mode bits to an ACL so we can update the ACL on the server */
724int mode_to_acl(struct inode *inode, const char *path, __u64 nmode) 736int mode_to_cifs_acl(struct inode *inode, const char *path, __u64 nmode)
725{ 737{
726 int rc = 0; 738 int rc = 0;
727 __u32 secdesclen = 0; 739 __u32 secdesclen = 0;
@@ -736,7 +748,10 @@ int mode_to_acl(struct inode *inode, const char *path, __u64 nmode)
736 /* Add three ACEs for owner, group, everyone getting rid of 748 /* Add three ACEs for owner, group, everyone getting rid of
737 other ACEs as chmod disables ACEs and set the security descriptor */ 749 other ACEs as chmod disables ACEs and set the security descriptor */
738 750
739 if (pntsd) { 751 if (IS_ERR(pntsd)) {
752 rc = PTR_ERR(pntsd);
753 cERROR(1, "%s: error %d getting sec desc", __func__, rc);
754 } else {
740 /* allocate memory for the smb header, 755 /* allocate memory for the smb header,
741 set security descriptor request security descriptor 756 set security descriptor request security descriptor
742 parameters, and secuirty descriptor itself */ 757 parameters, and secuirty descriptor itself */
@@ -766,4 +781,3 @@ int mode_to_acl(struct inode *inode, const char *path, __u64 nmode)
766 781
767 return rc; 782 return rc;
768} 783}
769#endif /* CONFIG_CIFS_EXPERIMENTAL */
diff --git a/fs/cifs/cifsacl.h b/fs/cifs/cifsacl.h
index 6c8096cf515..c4ae7d03656 100644
--- a/fs/cifs/cifsacl.h
+++ b/fs/cifs/cifsacl.h
@@ -74,11 +74,7 @@ struct cifs_wksid {
74 char sidname[SIDNAMELENGTH]; 74 char sidname[SIDNAMELENGTH];
75} __attribute__((packed)); 75} __attribute__((packed));
76 76
77#ifdef CONFIG_CIFS_EXPERIMENTAL
78
79extern int match_sid(struct cifs_sid *); 77extern int match_sid(struct cifs_sid *);
80extern int compare_sids(const struct cifs_sid *, const struct cifs_sid *); 78extern int compare_sids(const struct cifs_sid *, const struct cifs_sid *);
81 79
82#endif /* CONFIG_CIFS_EXPERIMENTAL */
83
84#endif /* _CIFSACL_H */ 80#endif /* _CIFSACL_H */
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index f856732161a..0db5f1de022 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -24,7 +24,6 @@
24#include "cifspdu.h" 24#include "cifspdu.h"
25#include "cifsglob.h" 25#include "cifsglob.h"
26#include "cifs_debug.h" 26#include "cifs_debug.h"
27#include "md5.h"
28#include "cifs_unicode.h" 27#include "cifs_unicode.h"
29#include "cifsproto.h" 28#include "cifsproto.h"
30#include "ntlmssp.h" 29#include "ntlmssp.h"
@@ -37,11 +36,6 @@
37/* Note that the smb header signature field on input contains the 36/* Note that the smb header signature field on input contains the
38 sequence number before this function is called */ 37 sequence number before this function is called */
39 38
40extern void mdfour(unsigned char *out, unsigned char *in, int n);
41extern void E_md4hash(const unsigned char *passwd, unsigned char *p16);
42extern void SMBencrypt(unsigned char *passwd, const unsigned char *c8,
43 unsigned char *p24);
44
45static int cifs_calculate_signature(const struct smb_hdr *cifs_pdu, 39static int cifs_calculate_signature(const struct smb_hdr *cifs_pdu,
46 struct TCP_Server_Info *server, char *signature) 40 struct TCP_Server_Info *server, char *signature)
47{ 41{
@@ -72,6 +66,7 @@ static int cifs_calculate_signature(const struct smb_hdr *cifs_pdu,
72 return 0; 66 return 0;
73} 67}
74 68
69/* must be called with server->srv_mutex held */
75int cifs_sign_smb(struct smb_hdr *cifs_pdu, struct TCP_Server_Info *server, 70int cifs_sign_smb(struct smb_hdr *cifs_pdu, struct TCP_Server_Info *server,
76 __u32 *pexpected_response_sequence_number) 71 __u32 *pexpected_response_sequence_number)
77{ 72{
@@ -84,14 +79,12 @@ int cifs_sign_smb(struct smb_hdr *cifs_pdu, struct TCP_Server_Info *server,
84 if ((cifs_pdu->Flags2 & SMBFLG2_SECURITY_SIGNATURE) == 0) 79 if ((cifs_pdu->Flags2 & SMBFLG2_SECURITY_SIGNATURE) == 0)
85 return rc; 80 return rc;
86 81
87 spin_lock(&GlobalMid_Lock);
88 cifs_pdu->Signature.Sequence.SequenceNumber = 82 cifs_pdu->Signature.Sequence.SequenceNumber =
89 cpu_to_le32(server->sequence_number); 83 cpu_to_le32(server->sequence_number);
90 cifs_pdu->Signature.Sequence.Reserved = 0; 84 cifs_pdu->Signature.Sequence.Reserved = 0;
91 85
92 *pexpected_response_sequence_number = server->sequence_number++; 86 *pexpected_response_sequence_number = server->sequence_number++;
93 server->sequence_number++; 87 server->sequence_number++;
94 spin_unlock(&GlobalMid_Lock);
95 88
96 rc = cifs_calculate_signature(cifs_pdu, server, smb_signature); 89 rc = cifs_calculate_signature(cifs_pdu, server, smb_signature);
97 if (rc) 90 if (rc)
@@ -149,6 +142,7 @@ static int cifs_calc_signature2(const struct kvec *iov, int n_vec,
149 return rc; 142 return rc;
150} 143}
151 144
145/* must be called with server->srv_mutex held */
152int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *server, 146int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *server,
153 __u32 *pexpected_response_sequence_number) 147 __u32 *pexpected_response_sequence_number)
154{ 148{
@@ -162,14 +156,12 @@ int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *server,
162 if ((cifs_pdu->Flags2 & SMBFLG2_SECURITY_SIGNATURE) == 0) 156 if ((cifs_pdu->Flags2 & SMBFLG2_SECURITY_SIGNATURE) == 0)
163 return rc; 157 return rc;
164 158
165 spin_lock(&GlobalMid_Lock);
166 cifs_pdu->Signature.Sequence.SequenceNumber = 159 cifs_pdu->Signature.Sequence.SequenceNumber =
167 cpu_to_le32(server->sequence_number); 160 cpu_to_le32(server->sequence_number);
168 cifs_pdu->Signature.Sequence.Reserved = 0; 161 cifs_pdu->Signature.Sequence.Reserved = 0;
169 162
170 *pexpected_response_sequence_number = server->sequence_number++; 163 *pexpected_response_sequence_number = server->sequence_number++;
171 server->sequence_number++; 164 server->sequence_number++;
172 spin_unlock(&GlobalMid_Lock);
173 165
174 rc = cifs_calc_signature2(iov, n_vec, server, smb_signature); 166 rc = cifs_calc_signature2(iov, n_vec, server, smb_signature);
175 if (rc) 167 if (rc)
@@ -236,6 +228,7 @@ int cifs_verify_signature(struct smb_hdr *cifs_pdu,
236/* first calculate 24 bytes ntlm response and then 16 byte session key */ 228/* first calculate 24 bytes ntlm response and then 16 byte session key */
237int setup_ntlm_response(struct cifsSesInfo *ses) 229int setup_ntlm_response(struct cifsSesInfo *ses)
238{ 230{
231 int rc = 0;
239 unsigned int temp_len = CIFS_SESS_KEY_SIZE + CIFS_AUTH_RESP_SIZE; 232 unsigned int temp_len = CIFS_SESS_KEY_SIZE + CIFS_AUTH_RESP_SIZE;
240 char temp_key[CIFS_SESS_KEY_SIZE]; 233 char temp_key[CIFS_SESS_KEY_SIZE];
241 234
@@ -249,13 +242,26 @@ int setup_ntlm_response(struct cifsSesInfo *ses)
249 } 242 }
250 ses->auth_key.len = temp_len; 243 ses->auth_key.len = temp_len;
251 244
252 SMBNTencrypt(ses->password, ses->server->cryptkey, 245 rc = SMBNTencrypt(ses->password, ses->server->cryptkey,
253 ses->auth_key.response + CIFS_SESS_KEY_SIZE); 246 ses->auth_key.response + CIFS_SESS_KEY_SIZE);
247 if (rc) {
248 cFYI(1, "%s Can't generate NTLM response, error: %d",
249 __func__, rc);
250 return rc;
251 }
252
253 rc = E_md4hash(ses->password, temp_key);
254 if (rc) {
255 cFYI(1, "%s Can't generate NT hash, error: %d", __func__, rc);
256 return rc;
257 }
254 258
255 E_md4hash(ses->password, temp_key); 259 rc = mdfour(ses->auth_key.response, temp_key, CIFS_SESS_KEY_SIZE);
256 mdfour(ses->auth_key.response, temp_key, CIFS_SESS_KEY_SIZE); 260 if (rc)
261 cFYI(1, "%s Can't generate NTLM session key, error: %d",
262 __func__, rc);
257 263
258 return 0; 264 return rc;
259} 265}
260 266
261#ifdef CONFIG_CIFS_WEAK_PW_HASH 267#ifdef CONFIG_CIFS_WEAK_PW_HASH
@@ -702,14 +708,13 @@ cifs_crypto_shash_allocate(struct TCP_Server_Info *server)
702 unsigned int size; 708 unsigned int size;
703 709
704 server->secmech.hmacmd5 = crypto_alloc_shash("hmac(md5)", 0, 0); 710 server->secmech.hmacmd5 = crypto_alloc_shash("hmac(md5)", 0, 0);
705 if (!server->secmech.hmacmd5 || 711 if (IS_ERR(server->secmech.hmacmd5)) {
706 IS_ERR(server->secmech.hmacmd5)) {
707 cERROR(1, "could not allocate crypto hmacmd5\n"); 712 cERROR(1, "could not allocate crypto hmacmd5\n");
708 return PTR_ERR(server->secmech.hmacmd5); 713 return PTR_ERR(server->secmech.hmacmd5);
709 } 714 }
710 715
711 server->secmech.md5 = crypto_alloc_shash("md5", 0, 0); 716 server->secmech.md5 = crypto_alloc_shash("md5", 0, 0);
712 if (!server->secmech.md5 || IS_ERR(server->secmech.md5)) { 717 if (IS_ERR(server->secmech.md5)) {
713 cERROR(1, "could not allocate crypto md5\n"); 718 cERROR(1, "could not allocate crypto md5\n");
714 rc = PTR_ERR(server->secmech.md5); 719 rc = PTR_ERR(server->secmech.md5);
715 goto crypto_allocate_md5_fail; 720 goto crypto_allocate_md5_fail;
diff --git a/fs/cifs/cifsencrypt.h b/fs/cifs/cifsencrypt.h
deleted file mode 100644
index 15d2ec00647..00000000000
--- a/fs/cifs/cifsencrypt.h
+++ /dev/null
@@ -1,33 +0,0 @@
1/*
2 * fs/cifs/cifsencrypt.h
3 *
4 * Copyright (c) International Business Machines Corp., 2005
5 * Author(s): Steve French (sfrench@us.ibm.com)
6 *
7 * Externs for misc. small encryption routines
8 * so we do not have to put them in cifsproto.h
9 *
10 * This library is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU Lesser General Public License as published
12 * by the Free Software Foundation; either version 2.1 of the License, or
13 * (at your option) any later version.
14 *
15 * This library is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
18 * the GNU Lesser General Public License for more details.
19 *
20 * You should have received a copy of the GNU Lesser General Public License
21 * along with this library; if not, write to the Free Software
22 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
23 */
24
25/* md4.c */
26extern void mdfour(unsigned char *out, unsigned char *in, int n);
27/* smbdes.c */
28extern void E_P16(unsigned char *p14, unsigned char *p16);
29extern void E_P24(unsigned char *p21, const unsigned char *c8,
30 unsigned char *p24);
31
32
33
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 75c4eaa7958..f2970136d17 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -77,7 +77,11 @@ unsigned int cifs_max_pending = CIFS_MAX_REQ;
77module_param(cifs_max_pending, int, 0); 77module_param(cifs_max_pending, int, 0);
78MODULE_PARM_DESC(cifs_max_pending, "Simultaneous requests to server. " 78MODULE_PARM_DESC(cifs_max_pending, "Simultaneous requests to server. "
79 "Default: 50 Range: 2 to 256"); 79 "Default: 50 Range: 2 to 256");
80 80unsigned short echo_retries = 5;
81module_param(echo_retries, ushort, 0644);
82MODULE_PARM_DESC(echo_retries, "Number of echo attempts before giving up and "
83 "reconnecting server. Default: 5. 0 means "
84 "never reconnect.");
81extern mempool_t *cifs_sm_req_poolp; 85extern mempool_t *cifs_sm_req_poolp;
82extern mempool_t *cifs_req_poolp; 86extern mempool_t *cifs_req_poolp;
83extern mempool_t *cifs_mid_poolp; 87extern mempool_t *cifs_mid_poolp;
@@ -116,7 +120,7 @@ cifs_read_super(struct super_block *sb, void *data,
116 return -ENOMEM; 120 return -ENOMEM;
117 121
118 spin_lock_init(&cifs_sb->tlink_tree_lock); 122 spin_lock_init(&cifs_sb->tlink_tree_lock);
119 INIT_RADIX_TREE(&cifs_sb->tlink_tree, GFP_KERNEL); 123 cifs_sb->tlink_tree = RB_ROOT;
120 124
121 rc = bdi_setup_and_register(&cifs_sb->bdi, "cifs", BDI_CAP_MAP_COPY); 125 rc = bdi_setup_and_register(&cifs_sb->bdi, "cifs", BDI_CAP_MAP_COPY);
122 if (rc) { 126 if (rc) {
@@ -174,6 +178,12 @@ cifs_read_super(struct super_block *sb, void *data,
174 goto out_no_root; 178 goto out_no_root;
175 } 179 }
176 180
181 /* do that *after* d_alloc_root() - we want NULL ->d_op for root here */
182 if (cifs_sb_master_tcon(cifs_sb)->nocase)
183 sb->s_d_op = &cifs_ci_dentry_ops;
184 else
185 sb->s_d_op = &cifs_dentry_ops;
186
177#ifdef CONFIG_CIFS_EXPERIMENTAL 187#ifdef CONFIG_CIFS_EXPERIMENTAL
178 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) { 188 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) {
179 cFYI(1, "export ops supported"); 189 cFYI(1, "export ops supported");
@@ -283,10 +293,13 @@ cifs_statfs(struct dentry *dentry, struct kstatfs *buf)
283 return 0; 293 return 0;
284} 294}
285 295
286static int cifs_permission(struct inode *inode, int mask) 296static int cifs_permission(struct inode *inode, int mask, unsigned int flags)
287{ 297{
288 struct cifs_sb_info *cifs_sb; 298 struct cifs_sb_info *cifs_sb;
289 299
300 if (flags & IPERM_FLAG_RCU)
301 return -ECHILD;
302
290 cifs_sb = CIFS_SB(inode->i_sb); 303 cifs_sb = CIFS_SB(inode->i_sb);
291 304
292 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM) { 305 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM) {
@@ -298,7 +311,7 @@ static int cifs_permission(struct inode *inode, int mask)
298 on the client (above and beyond ACL on servers) for 311 on the client (above and beyond ACL on servers) for
299 servers which do not support setting and viewing mode bits, 312 servers which do not support setting and viewing mode bits,
300 so allowing client to check permissions is useful */ 313 so allowing client to check permissions is useful */
301 return generic_permission(inode, mask, NULL); 314 return generic_permission(inode, mask, flags, NULL);
302} 315}
303 316
304static struct kmem_cache *cifs_inode_cachep; 317static struct kmem_cache *cifs_inode_cachep;
@@ -321,12 +334,13 @@ cifs_alloc_inode(struct super_block *sb)
321 /* Until the file is open and we have gotten oplock 334 /* Until the file is open and we have gotten oplock
322 info back from the server, can not assume caching of 335 info back from the server, can not assume caching of
323 file data or metadata */ 336 file data or metadata */
324 cifs_inode->clientCanCacheRead = false; 337 cifs_set_oplock_level(cifs_inode, 0);
325 cifs_inode->clientCanCacheAll = false;
326 cifs_inode->delete_pending = false; 338 cifs_inode->delete_pending = false;
327 cifs_inode->invalid_mapping = false; 339 cifs_inode->invalid_mapping = false;
328 cifs_inode->vfs_inode.i_blkbits = 14; /* 2**14 = CIFS_MAX_MSGSIZE */ 340 cifs_inode->vfs_inode.i_blkbits = 14; /* 2**14 = CIFS_MAX_MSGSIZE */
329 cifs_inode->server_eof = 0; 341 cifs_inode->server_eof = 0;
342 cifs_inode->uniqueid = 0;
343 cifs_inode->createtime = 0;
330 344
331 /* Can not set i_flags here - they get immediately overwritten 345 /* Can not set i_flags here - they get immediately overwritten
332 to zero by the VFS */ 346 to zero by the VFS */
@@ -335,10 +349,17 @@ cifs_alloc_inode(struct super_block *sb)
335 return &cifs_inode->vfs_inode; 349 return &cifs_inode->vfs_inode;
336} 350}
337 351
352static void cifs_i_callback(struct rcu_head *head)
353{
354 struct inode *inode = container_of(head, struct inode, i_rcu);
355 INIT_LIST_HEAD(&inode->i_dentry);
356 kmem_cache_free(cifs_inode_cachep, CIFS_I(inode));
357}
358
338static void 359static void
339cifs_destroy_inode(struct inode *inode) 360cifs_destroy_inode(struct inode *inode)
340{ 361{
341 kmem_cache_free(cifs_inode_cachep, CIFS_I(inode)); 362 call_rcu(&inode->i_rcu, cifs_i_callback);
342} 363}
343 364
344static void 365static void
@@ -352,18 +373,19 @@ cifs_evict_inode(struct inode *inode)
352static void 373static void
353cifs_show_address(struct seq_file *s, struct TCP_Server_Info *server) 374cifs_show_address(struct seq_file *s, struct TCP_Server_Info *server)
354{ 375{
376 struct sockaddr_in *sa = (struct sockaddr_in *) &server->dstaddr;
377 struct sockaddr_in6 *sa6 = (struct sockaddr_in6 *) &server->dstaddr;
378
355 seq_printf(s, ",addr="); 379 seq_printf(s, ",addr=");
356 380
357 switch (server->addr.sockAddr.sin_family) { 381 switch (server->dstaddr.ss_family) {
358 case AF_INET: 382 case AF_INET:
359 seq_printf(s, "%pI4", &server->addr.sockAddr.sin_addr.s_addr); 383 seq_printf(s, "%pI4", &sa->sin_addr.s_addr);
360 break; 384 break;
361 case AF_INET6: 385 case AF_INET6:
362 seq_printf(s, "%pI6", 386 seq_printf(s, "%pI6", &sa6->sin6_addr.s6_addr);
363 &server->addr.sockAddr6.sin6_addr.s6_addr); 387 if (sa6->sin6_scope_id)
364 if (server->addr.sockAddr6.sin6_scope_id) 388 seq_printf(s, "%%%u", sa6->sin6_scope_id);
365 seq_printf(s, "%%%u",
366 server->addr.sockAddr6.sin6_scope_id);
367 break; 389 break;
368 default: 390 default:
369 seq_printf(s, "(unknown)"); 391 seq_printf(s, "(unknown)");
@@ -459,9 +481,13 @@ cifs_show_options(struct seq_file *s, struct vfsmount *m)
459 seq_printf(s, ",acl"); 481 seq_printf(s, ",acl");
460 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) 482 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS)
461 seq_printf(s, ",mfsymlinks"); 483 seq_printf(s, ",mfsymlinks");
484 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_FSCACHE)
485 seq_printf(s, ",fsc");
462 486
463 seq_printf(s, ",rsize=%d", cifs_sb->rsize); 487 seq_printf(s, ",rsize=%d", cifs_sb->rsize);
464 seq_printf(s, ",wsize=%d", cifs_sb->wsize); 488 seq_printf(s, ",wsize=%d", cifs_sb->wsize);
489 /* convert actimeo and display it in seconds */
490 seq_printf(s, ",actimeo=%lu", cifs_sb->actimeo / HZ);
465 491
466 return 0; 492 return 0;
467} 493}
@@ -574,10 +600,17 @@ static ssize_t cifs_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
574{ 600{
575 struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode; 601 struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
576 ssize_t written; 602 ssize_t written;
603 int rc;
577 604
578 written = generic_file_aio_write(iocb, iov, nr_segs, pos); 605 written = generic_file_aio_write(iocb, iov, nr_segs, pos);
579 if (!CIFS_I(inode)->clientCanCacheAll) 606
580 filemap_fdatawrite(inode->i_mapping); 607 if (CIFS_I(inode)->clientCanCacheAll)
608 return written;
609
610 rc = filemap_fdatawrite(inode->i_mapping);
611 if (rc)
612 cFYI(1, "cifs_file_aio_write: %d rc on %p inode", rc, inode);
613
581 return written; 614 return written;
582} 615}
583 616
@@ -707,6 +740,25 @@ const struct file_operations cifs_file_ops = {
707 .setlease = cifs_setlease, 740 .setlease = cifs_setlease,
708}; 741};
709 742
743const struct file_operations cifs_file_strict_ops = {
744 .read = do_sync_read,
745 .write = do_sync_write,
746 .aio_read = cifs_strict_readv,
747 .aio_write = cifs_strict_writev,
748 .open = cifs_open,
749 .release = cifs_close,
750 .lock = cifs_lock,
751 .fsync = cifs_strict_fsync,
752 .flush = cifs_flush,
753 .mmap = cifs_file_strict_mmap,
754 .splice_read = generic_file_splice_read,
755 .llseek = cifs_llseek,
756#ifdef CONFIG_CIFS_POSIX
757 .unlocked_ioctl = cifs_ioctl,
758#endif /* CONFIG_CIFS_POSIX */
759 .setlease = cifs_setlease,
760};
761
710const struct file_operations cifs_file_direct_ops = { 762const struct file_operations cifs_file_direct_ops = {
711 /* no aio, no readv - 763 /* no aio, no readv -
712 BB reevaluate whether they can be done with directio, no cache */ 764 BB reevaluate whether they can be done with directio, no cache */
@@ -725,6 +777,7 @@ const struct file_operations cifs_file_direct_ops = {
725 .llseek = cifs_llseek, 777 .llseek = cifs_llseek,
726 .setlease = cifs_setlease, 778 .setlease = cifs_setlease,
727}; 779};
780
728const struct file_operations cifs_file_nobrl_ops = { 781const struct file_operations cifs_file_nobrl_ops = {
729 .read = do_sync_read, 782 .read = do_sync_read,
730 .write = do_sync_write, 783 .write = do_sync_write,
@@ -743,6 +796,24 @@ const struct file_operations cifs_file_nobrl_ops = {
743 .setlease = cifs_setlease, 796 .setlease = cifs_setlease,
744}; 797};
745 798
799const struct file_operations cifs_file_strict_nobrl_ops = {
800 .read = do_sync_read,
801 .write = do_sync_write,
802 .aio_read = cifs_strict_readv,
803 .aio_write = cifs_strict_writev,
804 .open = cifs_open,
805 .release = cifs_close,
806 .fsync = cifs_strict_fsync,
807 .flush = cifs_flush,
808 .mmap = cifs_file_strict_mmap,
809 .splice_read = generic_file_splice_read,
810 .llseek = cifs_llseek,
811#ifdef CONFIG_CIFS_POSIX
812 .unlocked_ioctl = cifs_ioctl,
813#endif /* CONFIG_CIFS_POSIX */
814 .setlease = cifs_setlease,
815};
816
746const struct file_operations cifs_file_direct_nobrl_ops = { 817const struct file_operations cifs_file_direct_nobrl_ops = {
747 /* no mmap, no aio, no readv - 818 /* no mmap, no aio, no readv -
748 BB reevaluate whether they can be done with directio, no cache */ 819 BB reevaluate whether they can be done with directio, no cache */
@@ -934,7 +1005,6 @@ init_cifs(void)
934 GlobalCurrentXid = 0; 1005 GlobalCurrentXid = 0;
935 GlobalTotalActiveXid = 0; 1006 GlobalTotalActiveXid = 0;
936 GlobalMaxActiveXid = 0; 1007 GlobalMaxActiveXid = 0;
937 memset(Local_System_Name, 0, 15);
938 spin_lock_init(&cifs_tcp_ses_lock); 1008 spin_lock_init(&cifs_tcp_ses_lock);
939 spin_lock_init(&cifs_file_list_lock); 1009 spin_lock_init(&cifs_file_list_lock);
940 spin_lock_init(&GlobalMid_Lock); 1010 spin_lock_init(&GlobalMid_Lock);
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 897b2b2b28b..14789a97304 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -61,6 +61,7 @@ extern int cifs_rename(struct inode *, struct dentry *, struct inode *,
61 struct dentry *); 61 struct dentry *);
62extern int cifs_revalidate_file(struct file *filp); 62extern int cifs_revalidate_file(struct file *filp);
63extern int cifs_revalidate_dentry(struct dentry *); 63extern int cifs_revalidate_dentry(struct dentry *);
64extern void cifs_invalidate_mapping(struct inode *inode);
64extern int cifs_getattr(struct vfsmount *, struct dentry *, struct kstat *); 65extern int cifs_getattr(struct vfsmount *, struct dentry *, struct kstat *);
65extern int cifs_setattr(struct dentry *, struct iattr *); 66extern int cifs_setattr(struct dentry *, struct iattr *);
66 67
@@ -72,19 +73,27 @@ extern const struct inode_operations cifs_dfs_referral_inode_operations;
72/* Functions related to files and directories */ 73/* Functions related to files and directories */
73extern const struct file_operations cifs_file_ops; 74extern const struct file_operations cifs_file_ops;
74extern const struct file_operations cifs_file_direct_ops; /* if directio mnt */ 75extern const struct file_operations cifs_file_direct_ops; /* if directio mnt */
75extern const struct file_operations cifs_file_nobrl_ops; 76extern const struct file_operations cifs_file_strict_ops; /* if strictio mnt */
76extern const struct file_operations cifs_file_direct_nobrl_ops; /* no brlocks */ 77extern const struct file_operations cifs_file_nobrl_ops; /* no brlocks */
78extern const struct file_operations cifs_file_direct_nobrl_ops;
79extern const struct file_operations cifs_file_strict_nobrl_ops;
77extern int cifs_open(struct inode *inode, struct file *file); 80extern int cifs_open(struct inode *inode, struct file *file);
78extern int cifs_close(struct inode *inode, struct file *file); 81extern int cifs_close(struct inode *inode, struct file *file);
79extern int cifs_closedir(struct inode *inode, struct file *file); 82extern int cifs_closedir(struct inode *inode, struct file *file);
80extern ssize_t cifs_user_read(struct file *file, char __user *read_data, 83extern ssize_t cifs_user_read(struct file *file, char __user *read_data,
81 size_t read_size, loff_t *poffset); 84 size_t read_size, loff_t *poffset);
85extern ssize_t cifs_strict_readv(struct kiocb *iocb, const struct iovec *iov,
86 unsigned long nr_segs, loff_t pos);
82extern ssize_t cifs_user_write(struct file *file, const char __user *write_data, 87extern ssize_t cifs_user_write(struct file *file, const char __user *write_data,
83 size_t write_size, loff_t *poffset); 88 size_t write_size, loff_t *poffset);
89extern ssize_t cifs_strict_writev(struct kiocb *iocb, const struct iovec *iov,
90 unsigned long nr_segs, loff_t pos);
84extern int cifs_lock(struct file *, int, struct file_lock *); 91extern int cifs_lock(struct file *, int, struct file_lock *);
85extern int cifs_fsync(struct file *, int); 92extern int cifs_fsync(struct file *, int);
93extern int cifs_strict_fsync(struct file *, int);
86extern int cifs_flush(struct file *, fl_owner_t id); 94extern int cifs_flush(struct file *, fl_owner_t id);
87extern int cifs_file_mmap(struct file * , struct vm_area_struct *); 95extern int cifs_file_mmap(struct file * , struct vm_area_struct *);
96extern int cifs_file_strict_mmap(struct file * , struct vm_area_struct *);
88extern const struct file_operations cifs_dir_ops; 97extern const struct file_operations cifs_dir_ops;
89extern int cifs_dir_open(struct inode *inode, struct file *file); 98extern int cifs_dir_open(struct inode *inode, struct file *file);
90extern int cifs_readdir(struct file *file, void *direntry, filldir_t filldir); 99extern int cifs_readdir(struct file *file, void *direntry, filldir_t filldir);
@@ -93,6 +102,12 @@ extern int cifs_readdir(struct file *file, void *direntry, filldir_t filldir);
93extern const struct dentry_operations cifs_dentry_ops; 102extern const struct dentry_operations cifs_dentry_ops;
94extern const struct dentry_operations cifs_ci_dentry_ops; 103extern const struct dentry_operations cifs_ci_dentry_ops;
95 104
105#ifdef CONFIG_CIFS_DFS_UPCALL
106extern struct vfsmount *cifs_dfs_d_automount(struct path *path);
107#else
108#define cifs_dfs_d_automount NULL
109#endif
110
96/* Functions related to symlinks */ 111/* Functions related to symlinks */
97extern void *cifs_follow_link(struct dentry *direntry, struct nameidata *nd); 112extern void *cifs_follow_link(struct dentry *direntry, struct nameidata *nd);
98extern void cifs_put_link(struct dentry *direntry, 113extern void cifs_put_link(struct dentry *direntry,
@@ -112,5 +127,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
112extern const struct export_operations cifs_export_ops; 127extern const struct export_operations cifs_export_ops;
113#endif /* EXPERIMENTAL */ 128#endif /* EXPERIMENTAL */
114 129
115#define CIFS_VERSION "1.68" 130#define CIFS_VERSION "1.69"
116#endif /* _CIFSFS_H */ 131#endif /* _CIFSFS_H */
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index f259e4d7612..edd5b29b53c 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -45,6 +45,16 @@
45#define CIFS_MIN_RCV_POOL 4 45#define CIFS_MIN_RCV_POOL 4
46 46
47/* 47/*
48 * default attribute cache timeout (jiffies)
49 */
50#define CIFS_DEF_ACTIMEO (1 * HZ)
51
52/*
53 * max attribute cache timeout (jiffies) - 2^30
54 */
55#define CIFS_MAX_ACTIMEO (1 << 30)
56
57/*
48 * MAX_REQ is the maximum number of requests that WE will send 58 * MAX_REQ is the maximum number of requests that WE will send
49 * on one socket concurrently. It also matches the most common 59 * on one socket concurrently. It also matches the most common
50 * value of max multiplex returned by servers. We may 60 * value of max multiplex returned by servers. We may
@@ -151,35 +161,27 @@ struct TCP_Server_Info {
151 int srv_count; /* reference counter */ 161 int srv_count; /* reference counter */
152 /* 15 character server name + 0x20 16th byte indicating type = srv */ 162 /* 15 character server name + 0x20 16th byte indicating type = srv */
153 char server_RFC1001_name[RFC1001_NAME_LEN_WITH_NULL]; 163 char server_RFC1001_name[RFC1001_NAME_LEN_WITH_NULL];
164 enum statusEnum tcpStatus; /* what we think the status is */
154 char *hostname; /* hostname portion of UNC string */ 165 char *hostname; /* hostname portion of UNC string */
155 struct socket *ssocket; 166 struct socket *ssocket;
156 union { 167 struct sockaddr_storage dstaddr;
157 struct sockaddr_in sockAddr;
158 struct sockaddr_in6 sockAddr6;
159 } addr;
160 struct sockaddr_storage srcaddr; /* locally bind to this IP */ 168 struct sockaddr_storage srcaddr; /* locally bind to this IP */
169#ifdef CONFIG_NET_NS
170 struct net *net;
171#endif
161 wait_queue_head_t response_q; 172 wait_queue_head_t response_q;
162 wait_queue_head_t request_q; /* if more than maxmpx to srvr must block*/ 173 wait_queue_head_t request_q; /* if more than maxmpx to srvr must block*/
163 struct list_head pending_mid_q; 174 struct list_head pending_mid_q;
164 void *Server_NlsInfo; /* BB - placeholder for future NLS info */
165 unsigned short server_codepage; /* codepage for the server */
166 enum protocolEnum protocolType;
167 char versionMajor;
168 char versionMinor;
169 bool svlocal:1; /* local server or remote */
170 bool noblocksnd; /* use blocking sendmsg */ 175 bool noblocksnd; /* use blocking sendmsg */
171 bool noautotune; /* do not autotune send buf sizes */ 176 bool noautotune; /* do not autotune send buf sizes */
172 bool tcp_nodelay; 177 bool tcp_nodelay;
173 atomic_t inFlight; /* number of requests on the wire to server */ 178 atomic_t inFlight; /* number of requests on the wire to server */
174#ifdef CONFIG_CIFS_STATS2
175 atomic_t inSend; /* requests trying to send */
176 atomic_t num_waiters; /* blocked waiting to get in sendrecv */
177#endif
178 enum statusEnum tcpStatus; /* what we think the status is */
179 struct mutex srv_mutex; 179 struct mutex srv_mutex;
180 struct task_struct *tsk; 180 struct task_struct *tsk;
181 char server_GUID[16]; 181 char server_GUID[16];
182 char secMode; 182 char secMode;
183 bool session_estab; /* mark when very first sess is established */
184 u16 dialect; /* dialect index that server chose */
183 enum securityEnum secType; 185 enum securityEnum secType;
184 unsigned int maxReq; /* Clients should submit no more */ 186 unsigned int maxReq; /* Clients should submit no more */
185 /* than maxReq distinct unanswered SMBs to the server when using */ 187 /* than maxReq distinct unanswered SMBs to the server when using */
@@ -192,31 +194,62 @@ struct TCP_Server_Info {
192 unsigned int max_vcs; /* maximum number of smb sessions, at least 194 unsigned int max_vcs; /* maximum number of smb sessions, at least
193 those that can be specified uniquely with 195 those that can be specified uniquely with
194 vcnumbers */ 196 vcnumbers */
195 char sessid[4]; /* unique token id for this session */
196 /* (returned on Negotiate */
197 int capabilities; /* allow selective disabling of caps by smb sess */ 197 int capabilities; /* allow selective disabling of caps by smb sess */
198 int timeAdj; /* Adjust for difference in server time zone in sec */ 198 int timeAdj; /* Adjust for difference in server time zone in sec */
199 __u16 CurrentMid; /* multiplex id - rotating counter */ 199 __u16 CurrentMid; /* multiplex id - rotating counter */
200 char cryptkey[CIFS_CRYPTO_KEY_SIZE]; /* used by ntlm, ntlmv2 etc */ 200 char cryptkey[CIFS_CRYPTO_KEY_SIZE]; /* used by ntlm, ntlmv2 etc */
201 /* 16th byte of RFC1001 workstation name is always null */ 201 /* 16th byte of RFC1001 workstation name is always null */
202 char workstation_RFC1001_name[RFC1001_NAME_LEN_WITH_NULL]; 202 char workstation_RFC1001_name[RFC1001_NAME_LEN_WITH_NULL];
203 __u32 sequence_number; /* needed for CIFS PDU signature */ 203 __u32 sequence_number; /* for signing, protected by srv_mutex */
204 struct session_key session_key; 204 struct session_key session_key;
205 unsigned long lstrp; /* when we got last response from this server */ 205 unsigned long lstrp; /* when we got last response from this server */
206 u16 dialect; /* dialect index that server chose */
207 struct cifs_secmech secmech; /* crypto sec mech functs, descriptors */ 206 struct cifs_secmech secmech; /* crypto sec mech functs, descriptors */
208 /* extended security flavors that server supports */ 207 /* extended security flavors that server supports */
208 bool sec_ntlmssp; /* supports NTLMSSP */
209 bool sec_kerberosu2u; /* supports U2U Kerberos */
209 bool sec_kerberos; /* supports plain Kerberos */ 210 bool sec_kerberos; /* supports plain Kerberos */
210 bool sec_mskerberos; /* supports legacy MS Kerberos */ 211 bool sec_mskerberos; /* supports legacy MS Kerberos */
211 bool sec_kerberosu2u; /* supports U2U Kerberos */ 212 struct delayed_work echo; /* echo ping workqueue job */
212 bool sec_ntlmssp; /* supports NTLMSSP */
213 bool session_estab; /* mark when very first sess is established */
214#ifdef CONFIG_CIFS_FSCACHE 213#ifdef CONFIG_CIFS_FSCACHE
215 struct fscache_cookie *fscache; /* client index cache cookie */ 214 struct fscache_cookie *fscache; /* client index cache cookie */
216#endif 215#endif
216#ifdef CONFIG_CIFS_STATS2
217 atomic_t inSend; /* requests trying to send */
218 atomic_t num_waiters; /* blocked waiting to get in sendrecv */
219#endif
217}; 220};
218 221
219/* 222/*
223 * Macros to allow the TCP_Server_Info->net field and related code to drop out
224 * when CONFIG_NET_NS isn't set.
225 */
226
227#ifdef CONFIG_NET_NS
228
229static inline struct net *cifs_net_ns(struct TCP_Server_Info *srv)
230{
231 return srv->net;
232}
233
234static inline void cifs_set_net_ns(struct TCP_Server_Info *srv, struct net *net)
235{
236 srv->net = net;
237}
238
239#else
240
241static inline struct net *cifs_net_ns(struct TCP_Server_Info *srv)
242{
243 return &init_net;
244}
245
246static inline void cifs_set_net_ns(struct TCP_Server_Info *srv, struct net *net)
247{
248}
249
250#endif
251
252/*
220 * Session structure. One of these for each uid session with a particular host 253 * Session structure. One of these for each uid session with a particular host
221 */ 254 */
222struct cifsSesInfo { 255struct cifsSesInfo {
@@ -336,7 +369,8 @@ struct cifsTconInfo {
336 * "get" on the container. 369 * "get" on the container.
337 */ 370 */
338struct tcon_link { 371struct tcon_link {
339 unsigned long tl_index; 372 struct rb_node tl_rbnode;
373 uid_t tl_uid;
340 unsigned long tl_flags; 374 unsigned long tl_flags;
341#define TCON_LINK_MASTER 0 375#define TCON_LINK_MASTER 0
342#define TCON_LINK_PENDING 1 376#define TCON_LINK_PENDING 1
@@ -438,13 +472,14 @@ struct cifsInodeInfo {
438 /* BB add in lists for dirty pages i.e. write caching info for oplock */ 472 /* BB add in lists for dirty pages i.e. write caching info for oplock */
439 struct list_head openFileList; 473 struct list_head openFileList;
440 __u32 cifsAttrs; /* e.g. DOS archive bit, sparse, compressed, system */ 474 __u32 cifsAttrs; /* e.g. DOS archive bit, sparse, compressed, system */
441 unsigned long time; /* jiffies of last update/check of inode */ 475 bool clientCanCacheRead; /* read oplock */
442 bool clientCanCacheRead:1; /* read oplock */ 476 bool clientCanCacheAll; /* read and writebehind oplock */
443 bool clientCanCacheAll:1; /* read and writebehind oplock */ 477 bool delete_pending; /* DELETE_ON_CLOSE is set */
444 bool delete_pending:1; /* DELETE_ON_CLOSE is set */ 478 bool invalid_mapping; /* pagecache is invalid */
445 bool invalid_mapping:1; /* pagecache is invalid */ 479 unsigned long time; /* jiffies of last update of inode */
446 u64 server_eof; /* current file size on server */ 480 u64 server_eof; /* current file size on server */
447 u64 uniqueid; /* server inode number */ 481 u64 uniqueid; /* server inode number */
482 u64 createtime; /* creation time on server */
448#ifdef CONFIG_CIFS_FSCACHE 483#ifdef CONFIG_CIFS_FSCACHE
449 struct fscache_cookie *fscache; 484 struct fscache_cookie *fscache;
450#endif 485#endif
@@ -499,6 +534,18 @@ static inline void cifs_stats_bytes_read(struct cifsTconInfo *tcon,
499 534
500#endif 535#endif
501 536
537struct mid_q_entry;
538
539/*
540 * This is the prototype for the mid callback function. When creating one,
541 * take special care to avoid deadlocks. Things to bear in mind:
542 *
543 * - it will be called by cifsd
544 * - the GlobalMid_Lock will be held
545 * - the mid will be removed from the pending_mid_q list
546 */
547typedef void (mid_callback_t)(struct mid_q_entry *mid);
548
502/* one of these for every pending CIFS request to the server */ 549/* one of these for every pending CIFS request to the server */
503struct mid_q_entry { 550struct mid_q_entry {
504 struct list_head qhead; /* mids waiting on reply from this server */ 551 struct list_head qhead; /* mids waiting on reply from this server */
@@ -510,7 +557,8 @@ struct mid_q_entry {
510 unsigned long when_sent; /* time when smb send finished */ 557 unsigned long when_sent; /* time when smb send finished */
511 unsigned long when_received; /* when demux complete (taken off wire) */ 558 unsigned long when_received; /* when demux complete (taken off wire) */
512#endif 559#endif
513 struct task_struct *tsk; /* task waiting for response */ 560 mid_callback_t *callback; /* call completion callback */
561 void *callback_data; /* general purpose pointer for callback */
514 struct smb_hdr *resp_buf; /* response buffer */ 562 struct smb_hdr *resp_buf; /* response buffer */
515 int midState; /* wish this were enum but can not pass to wait_event */ 563 int midState; /* wish this were enum but can not pass to wait_event */
516 __u8 command; /* smb command code */ 564 __u8 command; /* smb command code */
@@ -565,6 +613,7 @@ struct cifs_fattr {
565 u64 cf_uniqueid; 613 u64 cf_uniqueid;
566 u64 cf_eof; 614 u64 cf_eof;
567 u64 cf_bytes; 615 u64 cf_bytes;
616 u64 cf_createtime;
568 uid_t cf_uid; 617 uid_t cf_uid;
569 gid_t cf_gid; 618 gid_t cf_gid;
570 umode_t cf_mode; 619 umode_t cf_mode;
@@ -612,12 +661,9 @@ static inline void free_dfs_info_array(struct dfs_info3_param *param,
612#define CIFS_IOVEC 4 /* array of response buffers */ 661#define CIFS_IOVEC 4 /* array of response buffers */
613 662
614/* Type of Request to SendReceive2 */ 663/* Type of Request to SendReceive2 */
615#define CIFS_STD_OP 0 /* normal request timeout */ 664#define CIFS_BLOCKING_OP 1 /* operation can block */
616#define CIFS_LONG_OP 1 /* long op (up to 45 sec, oplock time) */ 665#define CIFS_ASYNC_OP 2 /* do not wait for response */
617#define CIFS_VLONG_OP 2 /* sloow op - can take up to 180 seconds */ 666#define CIFS_TIMEOUT_MASK 0x003 /* only one of above set in req */
618#define CIFS_BLOCKING_OP 4 /* operation can block */
619#define CIFS_ASYNC_OP 8 /* do not wait for response */
620#define CIFS_TIMEOUT_MASK 0x00F /* only one of 5 above set in req */
621#define CIFS_LOG_ERROR 0x010 /* log NT STATUS if non-zero */ 667#define CIFS_LOG_ERROR 0x010 /* log NT STATUS if non-zero */
622#define CIFS_LARGE_BUF_OP 0x020 /* large request buffer */ 668#define CIFS_LARGE_BUF_OP 0x020 /* large request buffer */
623#define CIFS_NO_RESP 0x040 /* no response buffer required */ 669#define CIFS_NO_RESP 0x040 /* no response buffer required */
@@ -745,8 +791,6 @@ GLOBAL_EXTERN unsigned int GlobalTotalActiveXid; /* prot by GlobalMid_Sem */
745GLOBAL_EXTERN unsigned int GlobalMaxActiveXid; /* prot by GlobalMid_Sem */ 791GLOBAL_EXTERN unsigned int GlobalMaxActiveXid; /* prot by GlobalMid_Sem */
746GLOBAL_EXTERN spinlock_t GlobalMid_Lock; /* protects above & list operations */ 792GLOBAL_EXTERN spinlock_t GlobalMid_Lock; /* protects above & list operations */
747 /* on midQ entries */ 793 /* on midQ entries */
748GLOBAL_EXTERN char Local_System_Name[15];
749
750/* 794/*
751 * Global counters, updated atomically 795 * Global counters, updated atomically
752 */ 796 */
@@ -782,6 +826,9 @@ GLOBAL_EXTERN unsigned int cifs_min_rcv; /* min size of big ntwrk buf pool */
782GLOBAL_EXTERN unsigned int cifs_min_small; /* min size of small buf pool */ 826GLOBAL_EXTERN unsigned int cifs_min_small; /* min size of small buf pool */
783GLOBAL_EXTERN unsigned int cifs_max_pending; /* MAX requests at once to server*/ 827GLOBAL_EXTERN unsigned int cifs_max_pending; /* MAX requests at once to server*/
784 828
829/* reconnect after this many failed echo attempts */
830GLOBAL_EXTERN unsigned short echo_retries;
831
785void cifs_oplock_break(struct work_struct *work); 832void cifs_oplock_break(struct work_struct *work);
786void cifs_oplock_break_get(struct cifsFileInfo *cfile); 833void cifs_oplock_break_get(struct cifsFileInfo *cfile);
787void cifs_oplock_break_put(struct cifsFileInfo *cfile); 834void cifs_oplock_break_put(struct cifsFileInfo *cfile);
diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h
index de36b09763a..b5c8cc5d7a7 100644
--- a/fs/cifs/cifspdu.h
+++ b/fs/cifs/cifspdu.h
@@ -23,6 +23,7 @@
23#define _CIFSPDU_H 23#define _CIFSPDU_H
24 24
25#include <net/sock.h> 25#include <net/sock.h>
26#include <asm/unaligned.h>
26#include "smbfsctl.h" 27#include "smbfsctl.h"
27 28
28#ifdef CONFIG_CIFS_WEAK_PW_HASH 29#ifdef CONFIG_CIFS_WEAK_PW_HASH
@@ -50,6 +51,7 @@
50#define SMB_COM_SETATTR 0x09 /* trivial response */ 51#define SMB_COM_SETATTR 0x09 /* trivial response */
51#define SMB_COM_LOCKING_ANDX 0x24 /* trivial response */ 52#define SMB_COM_LOCKING_ANDX 0x24 /* trivial response */
52#define SMB_COM_COPY 0x29 /* trivial rsp, fail filename ignrd*/ 53#define SMB_COM_COPY 0x29 /* trivial rsp, fail filename ignrd*/
54#define SMB_COM_ECHO 0x2B /* echo request */
53#define SMB_COM_OPEN_ANDX 0x2D /* Legacy open for old servers */ 55#define SMB_COM_OPEN_ANDX 0x2D /* Legacy open for old servers */
54#define SMB_COM_READ_ANDX 0x2E 56#define SMB_COM_READ_ANDX 0x2E
55#define SMB_COM_WRITE_ANDX 0x2F 57#define SMB_COM_WRITE_ANDX 0x2F
@@ -425,11 +427,49 @@ struct smb_hdr {
425 __u16 Mid; 427 __u16 Mid;
426 __u8 WordCount; 428 __u8 WordCount;
427} __attribute__((packed)); 429} __attribute__((packed));
428/* given a pointer to an smb_hdr retrieve the value of byte count */ 430
429#define BCC(smb_var) (*(__u16 *)((char *)(smb_var) + sizeof(struct smb_hdr) + (2 * (smb_var)->WordCount))) 431/* given a pointer to an smb_hdr retrieve a char pointer to the byte count */
430#define BCC_LE(smb_var) (*(__le16 *)((char *)(smb_var) + sizeof(struct smb_hdr) + (2 * (smb_var)->WordCount))) 432#define BCC(smb_var) ((unsigned char *)(smb_var) + sizeof(struct smb_hdr) + \
433 (2 * (smb_var)->WordCount))
434
431/* given a pointer to an smb_hdr retrieve the pointer to the byte area */ 435/* given a pointer to an smb_hdr retrieve the pointer to the byte area */
432#define pByteArea(smb_var) ((unsigned char *)(smb_var) + sizeof(struct smb_hdr) + (2 * (smb_var)->WordCount) + 2) 436#define pByteArea(smb_var) (BCC(smb_var) + 2)
437
438/* get the converted ByteCount for a SMB packet and return it */
439static inline __u16
440get_bcc(struct smb_hdr *hdr)
441{
442 __u16 *bc_ptr = (__u16 *)BCC(hdr);
443
444 return get_unaligned(bc_ptr);
445}
446
447/* get the unconverted ByteCount for a SMB packet and return it */
448static inline __u16
449get_bcc_le(struct smb_hdr *hdr)
450{
451 __le16 *bc_ptr = (__le16 *)BCC(hdr);
452
453 return get_unaligned_le16(bc_ptr);
454}
455
456/* set the ByteCount for a SMB packet in host-byte order */
457static inline void
458put_bcc(__u16 count, struct smb_hdr *hdr)
459{
460 __u16 *bc_ptr = (__u16 *)BCC(hdr);
461
462 put_unaligned(count, bc_ptr);
463}
464
465/* set the ByteCount for a SMB packet in little-endian */
466static inline void
467put_bcc_le(__u16 count, struct smb_hdr *hdr)
468{
469 __le16 *bc_ptr = (__le16 *)BCC(hdr);
470
471 put_unaligned_le16(count, bc_ptr);
472}
433 473
434/* 474/*
435 * Computer Name Length (since Netbios name was length 16 with last byte 0x20) 475 * Computer Name Length (since Netbios name was length 16 with last byte 0x20)
@@ -760,6 +800,20 @@ typedef struct smb_com_tconx_rsp_ext {
760 * 800 *
761 */ 801 */
762 802
803typedef struct smb_com_echo_req {
804 struct smb_hdr hdr;
805 __le16 EchoCount;
806 __le16 ByteCount;
807 char Data[1];
808} __attribute__((packed)) ECHO_REQ;
809
810typedef struct smb_com_echo_rsp {
811 struct smb_hdr hdr;
812 __le16 SequenceNumber;
813 __le16 ByteCount;
814 char Data[1];
815} __attribute__((packed)) ECHO_RSP;
816
763typedef struct smb_com_logoff_andx_req { 817typedef struct smb_com_logoff_andx_req {
764 struct smb_hdr hdr; /* wct = 2 */ 818 struct smb_hdr hdr; /* wct = 2 */
765 __u8 AndXCommand; 819 __u8 AndXCommand;
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index edb6d90efdf..8096f27ad9a 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -54,12 +54,19 @@ do { \
54 __func__, curr_xid, (int)rc); \ 54 __func__, curr_xid, (int)rc); \
55} while (0) 55} while (0)
56extern char *build_path_from_dentry(struct dentry *); 56extern char *build_path_from_dentry(struct dentry *);
57extern char *cifs_build_path_to_root(struct cifs_sb_info *cifs_sb); 57extern char *cifs_build_path_to_root(struct cifs_sb_info *cifs_sb,
58 struct cifsTconInfo *tcon);
58extern char *build_wildcard_path_from_dentry(struct dentry *direntry); 59extern char *build_wildcard_path_from_dentry(struct dentry *direntry);
59extern char *cifs_compose_mount_options(const char *sb_mountdata, 60extern char *cifs_compose_mount_options(const char *sb_mountdata,
60 const char *fullpath, const struct dfs_info3_param *ref, 61 const char *fullpath, const struct dfs_info3_param *ref,
61 char **devname); 62 char **devname);
62/* extern void renew_parental_timestamps(struct dentry *direntry);*/ 63/* extern void renew_parental_timestamps(struct dentry *direntry);*/
64extern struct mid_q_entry *AllocMidQEntry(const struct smb_hdr *smb_buffer,
65 struct TCP_Server_Info *server);
66extern void DeleteMidQEntry(struct mid_q_entry *midEntry);
67extern int cifs_call_async(struct TCP_Server_Info *server,
68 struct smb_hdr *in_buf, mid_callback_t *callback,
69 void *cbdata);
63extern int SendReceive(const unsigned int /* xid */ , struct cifsSesInfo *, 70extern int SendReceive(const unsigned int /* xid */ , struct cifsSesInfo *,
64 struct smb_hdr * /* input */ , 71 struct smb_hdr * /* input */ ,
65 struct smb_hdr * /* out */ , 72 struct smb_hdr * /* out */ ,
@@ -78,10 +85,10 @@ extern int checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length);
78extern bool is_valid_oplock_break(struct smb_hdr *smb, 85extern bool is_valid_oplock_break(struct smb_hdr *smb,
79 struct TCP_Server_Info *); 86 struct TCP_Server_Info *);
80extern bool is_size_safe_to_change(struct cifsInodeInfo *, __u64 eof); 87extern bool is_size_safe_to_change(struct cifsInodeInfo *, __u64 eof);
88extern void cifs_update_eof(struct cifsInodeInfo *cifsi, loff_t offset,
89 unsigned int bytes_written);
81extern struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *, bool); 90extern struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *, bool);
82#ifdef CONFIG_CIFS_EXPERIMENTAL
83extern struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *, bool); 91extern struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *, bool);
84#endif
85extern unsigned int smbCalcSize(struct smb_hdr *ptr); 92extern unsigned int smbCalcSize(struct smb_hdr *ptr);
86extern unsigned int smbCalcSize_LE(struct smb_hdr *ptr); 93extern unsigned int smbCalcSize_LE(struct smb_hdr *ptr);
87extern int decode_negTokenInit(unsigned char *security_blob, int length, 94extern int decode_negTokenInit(unsigned char *security_blob, int length,
@@ -104,6 +111,7 @@ extern struct timespec cifs_NTtimeToUnix(__le64 utc_nanoseconds_since_1601);
104extern u64 cifs_UnixTimeToNT(struct timespec); 111extern u64 cifs_UnixTimeToNT(struct timespec);
105extern struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time, 112extern struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time,
106 int offset); 113 int offset);
114extern void cifs_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock);
107 115
108extern struct cifsFileInfo *cifs_new_fileinfo(__u16 fileHandle, 116extern struct cifsFileInfo *cifs_new_fileinfo(__u16 fileHandle,
109 struct file *file, struct tcon_link *tlink, 117 struct file *file, struct tcon_link *tlink,
@@ -129,10 +137,12 @@ extern int cifs_get_file_info_unix(struct file *filp);
129extern int cifs_get_inode_info_unix(struct inode **pinode, 137extern int cifs_get_inode_info_unix(struct inode **pinode,
130 const unsigned char *search_path, 138 const unsigned char *search_path,
131 struct super_block *sb, int xid); 139 struct super_block *sb, int xid);
132extern void cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, 140extern int cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb,
133 struct cifs_fattr *fattr, struct inode *inode, 141 struct cifs_fattr *fattr, struct inode *inode,
134 const char *path, const __u16 *pfid); 142 const char *path, const __u16 *pfid);
135extern int mode_to_acl(struct inode *inode, const char *path, __u64); 143extern int mode_to_cifs_acl(struct inode *inode, const char *path, __u64);
144extern struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *, struct inode *,
145 const char *, u32 *);
136 146
137extern int cifs_mount(struct super_block *, struct cifs_sb_info *, char *, 147extern int cifs_mount(struct super_block *, struct cifs_sb_info *, char *,
138 const char *); 148 const char *);
@@ -345,12 +355,13 @@ extern int CIFSSMBLock(const int xid, struct cifsTconInfo *tcon,
345 const __u16 netfid, const __u64 len, 355 const __u16 netfid, const __u64 len,
346 const __u64 offset, const __u32 numUnlock, 356 const __u64 offset, const __u32 numUnlock,
347 const __u32 numLock, const __u8 lockType, 357 const __u32 numLock, const __u8 lockType,
348 const bool waitFlag); 358 const bool waitFlag, const __u8 oplock_level);
349extern int CIFSSMBPosixLock(const int xid, struct cifsTconInfo *tcon, 359extern int CIFSSMBPosixLock(const int xid, struct cifsTconInfo *tcon,
350 const __u16 smb_file_id, const int get_flag, 360 const __u16 smb_file_id, const int get_flag,
351 const __u64 len, struct file_lock *, 361 const __u64 len, struct file_lock *,
352 const __u16 lock_type, const bool waitFlag); 362 const __u16 lock_type, const bool waitFlag);
353extern int CIFSSMBTDis(const int xid, struct cifsTconInfo *tcon); 363extern int CIFSSMBTDis(const int xid, struct cifsTconInfo *tcon);
364extern int CIFSSMBEcho(struct TCP_Server_Info *server);
354extern int CIFSSMBLogoff(const int xid, struct cifsSesInfo *ses); 365extern int CIFSSMBLogoff(const int xid, struct cifsSesInfo *ses);
355 366
356extern struct cifsSesInfo *sesInfoAlloc(void); 367extern struct cifsSesInfo *sesInfoAlloc(void);
@@ -364,7 +375,7 @@ extern int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *,
364extern int cifs_verify_signature(struct smb_hdr *, 375extern int cifs_verify_signature(struct smb_hdr *,
365 struct TCP_Server_Info *server, 376 struct TCP_Server_Info *server,
366 __u32 expected_sequence_number); 377 __u32 expected_sequence_number);
367extern void SMBNTencrypt(unsigned char *, unsigned char *, unsigned char *); 378extern int SMBNTencrypt(unsigned char *, unsigned char *, unsigned char *);
368extern int setup_ntlm_response(struct cifsSesInfo *); 379extern int setup_ntlm_response(struct cifsSesInfo *);
369extern int setup_ntlmv2_rsp(struct cifsSesInfo *, const struct nls_table *); 380extern int setup_ntlmv2_rsp(struct cifsSesInfo *, const struct nls_table *);
370extern int cifs_crypto_shash_allocate(struct TCP_Server_Info *); 381extern int cifs_crypto_shash_allocate(struct TCP_Server_Info *);
@@ -414,4 +425,11 @@ extern bool CIFSCouldBeMFSymlink(const struct cifs_fattr *fattr);
414extern int CIFSCheckMFSymlink(struct cifs_fattr *fattr, 425extern int CIFSCheckMFSymlink(struct cifs_fattr *fattr,
415 const unsigned char *path, 426 const unsigned char *path,
416 struct cifs_sb_info *cifs_sb, int xid); 427 struct cifs_sb_info *cifs_sb, int xid);
428extern int mdfour(unsigned char *, unsigned char *, int);
429extern int E_md4hash(const unsigned char *passwd, unsigned char *p16);
430extern void SMBencrypt(unsigned char *passwd, const unsigned char *c8,
431 unsigned char *p24);
432extern void E_P16(unsigned char *p14, unsigned char *p16);
433extern void E_P24(unsigned char *p21, const unsigned char *c8,
434 unsigned char *p24);
417#endif /* _CIFSPROTO_H */ 435#endif /* _CIFSPROTO_H */
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 2f2632b6df5..3106f5e5c63 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -331,37 +331,35 @@ smb_init_no_reconnect(int smb_command, int wct, struct cifsTconInfo *tcon,
331 331
332static int validate_t2(struct smb_t2_rsp *pSMB) 332static int validate_t2(struct smb_t2_rsp *pSMB)
333{ 333{
334 int rc = -EINVAL; 334 unsigned int total_size;
335 int total_size; 335
336 char *pBCC; 336 /* check for plausible wct */
337 if (pSMB->hdr.WordCount < 10)
338 goto vt2_err;
337 339
338 /* check for plausible wct, bcc and t2 data and parm sizes */
339 /* check for parm and data offset going beyond end of smb */ 340 /* check for parm and data offset going beyond end of smb */
340 if (pSMB->hdr.WordCount >= 10) { 341 if (get_unaligned_le16(&pSMB->t2_rsp.ParameterOffset) > 1024 ||
341 if ((le16_to_cpu(pSMB->t2_rsp.ParameterOffset) <= 1024) && 342 get_unaligned_le16(&pSMB->t2_rsp.DataOffset) > 1024)
342 (le16_to_cpu(pSMB->t2_rsp.DataOffset) <= 1024)) { 343 goto vt2_err;
343 /* check that bcc is at least as big as parms + data */ 344
344 /* check that bcc is less than negotiated smb buffer */ 345 /* check that bcc is at least as big as parms + data */
345 total_size = le16_to_cpu(pSMB->t2_rsp.ParameterCount); 346 /* check that bcc is less than negotiated smb buffer */
346 if (total_size < 512) { 347 total_size = get_unaligned_le16(&pSMB->t2_rsp.ParameterCount);
347 total_size += 348 if (total_size >= 512)
348 le16_to_cpu(pSMB->t2_rsp.DataCount); 349 goto vt2_err;
349 /* BCC le converted in SendReceive */ 350
350 pBCC = (pSMB->hdr.WordCount * 2) + 351 total_size += get_unaligned_le16(&pSMB->t2_rsp.DataCount);
351 sizeof(struct smb_hdr) + 352 if (total_size > get_bcc(&pSMB->hdr) ||
352 (char *)pSMB; 353 total_size >= CIFSMaxBufSize + MAX_CIFS_HDR_SIZE)
353 if ((total_size <= (*(u16 *)pBCC)) && 354 goto vt2_err;
354 (total_size < 355
355 CIFSMaxBufSize+MAX_CIFS_HDR_SIZE)) { 356 return 0;
356 return 0; 357vt2_err:
357 }
358 }
359 }
360 }
361 cifs_dump_mem("Invalid transact2 SMB: ", (char *)pSMB, 358 cifs_dump_mem("Invalid transact2 SMB: ", (char *)pSMB,
362 sizeof(struct smb_t2_rsp) + 16); 359 sizeof(struct smb_t2_rsp) + 16);
363 return rc; 360 return -EINVAL;
364} 361}
362
365int 363int
366CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses) 364CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
367{ 365{
@@ -401,15 +399,12 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
401 else if ((secFlags & CIFSSEC_AUTH_MASK) == CIFSSEC_MAY_KRB5) { 399 else if ((secFlags & CIFSSEC_AUTH_MASK) == CIFSSEC_MAY_KRB5) {
402 cFYI(1, "Kerberos only mechanism, enable extended security"); 400 cFYI(1, "Kerberos only mechanism, enable extended security");
403 pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC; 401 pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC;
404 } 402 } else if ((secFlags & CIFSSEC_MUST_NTLMSSP) == CIFSSEC_MUST_NTLMSSP)
405#ifdef CONFIG_CIFS_EXPERIMENTAL
406 else if ((secFlags & CIFSSEC_MUST_NTLMSSP) == CIFSSEC_MUST_NTLMSSP)
407 pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC; 403 pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC;
408 else if ((secFlags & CIFSSEC_AUTH_MASK) == CIFSSEC_MAY_NTLMSSP) { 404 else if ((secFlags & CIFSSEC_AUTH_MASK) == CIFSSEC_MAY_NTLMSSP) {
409 cFYI(1, "NTLMSSP only mechanism, enable extended security"); 405 cFYI(1, "NTLMSSP only mechanism, enable extended security");
410 pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC; 406 pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC;
411 } 407 }
412#endif
413 408
414 count = 0; 409 count = 0;
415 for (i = 0; i < CIFS_NUM_PROT; i++) { 410 for (i = 0; i < CIFS_NUM_PROT; i++) {
@@ -455,7 +450,6 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
455 server->maxBuf = min((__u32)le16_to_cpu(rsp->MaxBufSize), 450 server->maxBuf = min((__u32)le16_to_cpu(rsp->MaxBufSize),
456 (__u32)CIFSMaxBufSize + MAX_CIFS_HDR_SIZE); 451 (__u32)CIFSMaxBufSize + MAX_CIFS_HDR_SIZE);
457 server->max_vcs = le16_to_cpu(rsp->MaxNumberVcs); 452 server->max_vcs = le16_to_cpu(rsp->MaxNumberVcs);
458 GETU32(server->sessid) = le32_to_cpu(rsp->SessionKey);
459 /* even though we do not use raw we might as well set this 453 /* even though we do not use raw we might as well set this
460 accurately, in case we ever find a need for it */ 454 accurately, in case we ever find a need for it */
461 if ((le16_to_cpu(rsp->RawMode) & RAW_ENABLE) == RAW_ENABLE) { 455 if ((le16_to_cpu(rsp->RawMode) & RAW_ENABLE) == RAW_ENABLE) {
@@ -569,7 +563,6 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
569 (__u32) CIFSMaxBufSize + MAX_CIFS_HDR_SIZE); 563 (__u32) CIFSMaxBufSize + MAX_CIFS_HDR_SIZE);
570 server->max_rw = le32_to_cpu(pSMBr->MaxRawSize); 564 server->max_rw = le32_to_cpu(pSMBr->MaxRawSize);
571 cFYI(DBG2, "Max buf = %d", ses->server->maxBuf); 565 cFYI(DBG2, "Max buf = %d", ses->server->maxBuf);
572 GETU32(ses->server->sessid) = le32_to_cpu(pSMBr->SessionKey);
573 server->capabilities = le32_to_cpu(pSMBr->Capabilities); 566 server->capabilities = le32_to_cpu(pSMBr->Capabilities);
574 server->timeAdj = (int)(__s16)le16_to_cpu(pSMBr->ServerTimeZone); 567 server->timeAdj = (int)(__s16)le16_to_cpu(pSMBr->ServerTimeZone);
575 server->timeAdj *= 60; 568 server->timeAdj *= 60;
@@ -709,6 +702,53 @@ CIFSSMBTDis(const int xid, struct cifsTconInfo *tcon)
709 return rc; 702 return rc;
710} 703}
711 704
705/*
706 * This is a no-op for now. We're not really interested in the reply, but
707 * rather in the fact that the server sent one and that server->lstrp
708 * gets updated.
709 *
710 * FIXME: maybe we should consider checking that the reply matches request?
711 */
712static void
713cifs_echo_callback(struct mid_q_entry *mid)
714{
715 struct TCP_Server_Info *server = mid->callback_data;
716
717 DeleteMidQEntry(mid);
718 atomic_dec(&server->inFlight);
719 wake_up(&server->request_q);
720}
721
722int
723CIFSSMBEcho(struct TCP_Server_Info *server)
724{
725 ECHO_REQ *smb;
726 int rc = 0;
727
728 cFYI(1, "In echo request");
729
730 rc = small_smb_init(SMB_COM_ECHO, 0, NULL, (void **)&smb);
731 if (rc)
732 return rc;
733
734 /* set up echo request */
735 smb->hdr.Tid = cpu_to_le16(0xffff);
736 smb->hdr.WordCount = 1;
737 put_unaligned_le16(1, &smb->EchoCount);
738 put_bcc_le(1, &smb->hdr);
739 smb->Data[0] = 'a';
740 smb->hdr.smb_buf_length += 3;
741
742 rc = cifs_call_async(server, (struct smb_hdr *)smb,
743 cifs_echo_callback, server);
744 if (rc)
745 cFYI(1, "Echo request failed: %d", rc);
746
747 cifs_small_buf_release(smb);
748
749 return rc;
750}
751
712int 752int
713CIFSSMBLogoff(const int xid, struct cifsSesInfo *ses) 753CIFSSMBLogoff(const int xid, struct cifsSesInfo *ses)
714{ 754{
@@ -1196,7 +1236,7 @@ OldOpenRetry:
1196 pSMB->ByteCount = cpu_to_le16(count); 1236 pSMB->ByteCount = cpu_to_le16(count);
1197 /* long_op set to 1 to allow for oplock break timeouts */ 1237 /* long_op set to 1 to allow for oplock break timeouts */
1198 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 1238 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
1199 (struct smb_hdr *)pSMBr, &bytes_returned, CIFS_LONG_OP); 1239 (struct smb_hdr *)pSMBr, &bytes_returned, 0);
1200 cifs_stats_inc(&tcon->num_opens); 1240 cifs_stats_inc(&tcon->num_opens);
1201 if (rc) { 1241 if (rc) {
1202 cFYI(1, "Error in Open = %d", rc); 1242 cFYI(1, "Error in Open = %d", rc);
@@ -1309,7 +1349,7 @@ openRetry:
1309 pSMB->ByteCount = cpu_to_le16(count); 1349 pSMB->ByteCount = cpu_to_le16(count);
1310 /* long_op set to 1 to allow for oplock break timeouts */ 1350 /* long_op set to 1 to allow for oplock break timeouts */
1311 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 1351 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
1312 (struct smb_hdr *)pSMBr, &bytes_returned, CIFS_LONG_OP); 1352 (struct smb_hdr *)pSMBr, &bytes_returned, 0);
1313 cifs_stats_inc(&tcon->num_opens); 1353 cifs_stats_inc(&tcon->num_opens);
1314 if (rc) { 1354 if (rc) {
1315 cFYI(1, "Error in Open = %d", rc); 1355 cFYI(1, "Error in Open = %d", rc);
@@ -1391,7 +1431,7 @@ CIFSSMBRead(const int xid, struct cifsTconInfo *tcon, const int netfid,
1391 iov[0].iov_base = (char *)pSMB; 1431 iov[0].iov_base = (char *)pSMB;
1392 iov[0].iov_len = pSMB->hdr.smb_buf_length + 4; 1432 iov[0].iov_len = pSMB->hdr.smb_buf_length + 4;
1393 rc = SendReceive2(xid, tcon->ses, iov, 1 /* num iovecs */, 1433 rc = SendReceive2(xid, tcon->ses, iov, 1 /* num iovecs */,
1394 &resp_buf_type, CIFS_STD_OP | CIFS_LOG_ERROR); 1434 &resp_buf_type, CIFS_LOG_ERROR);
1395 cifs_stats_inc(&tcon->num_reads); 1435 cifs_stats_inc(&tcon->num_reads);
1396 pSMBr = (READ_RSP *)iov[0].iov_base; 1436 pSMBr = (READ_RSP *)iov[0].iov_base;
1397 if (rc) { 1437 if (rc) {
@@ -1666,7 +1706,8 @@ int
1666CIFSSMBLock(const int xid, struct cifsTconInfo *tcon, 1706CIFSSMBLock(const int xid, struct cifsTconInfo *tcon,
1667 const __u16 smb_file_id, const __u64 len, 1707 const __u16 smb_file_id, const __u64 len,
1668 const __u64 offset, const __u32 numUnlock, 1708 const __u64 offset, const __u32 numUnlock,
1669 const __u32 numLock, const __u8 lockType, const bool waitFlag) 1709 const __u32 numLock, const __u8 lockType,
1710 const bool waitFlag, const __u8 oplock_level)
1670{ 1711{
1671 int rc = 0; 1712 int rc = 0;
1672 LOCK_REQ *pSMB = NULL; 1713 LOCK_REQ *pSMB = NULL;
@@ -1694,6 +1735,7 @@ CIFSSMBLock(const int xid, struct cifsTconInfo *tcon,
1694 pSMB->NumberOfLocks = cpu_to_le16(numLock); 1735 pSMB->NumberOfLocks = cpu_to_le16(numLock);
1695 pSMB->NumberOfUnlocks = cpu_to_le16(numUnlock); 1736 pSMB->NumberOfUnlocks = cpu_to_le16(numUnlock);
1696 pSMB->LockType = lockType; 1737 pSMB->LockType = lockType;
1738 pSMB->OplockLevel = oplock_level;
1697 pSMB->AndXCommand = 0xFF; /* none */ 1739 pSMB->AndXCommand = 0xFF; /* none */
1698 pSMB->Fid = smb_file_id; /* netfid stays le */ 1740 pSMB->Fid = smb_file_id; /* netfid stays le */
1699 1741
@@ -2478,95 +2520,6 @@ querySymLinkRetry:
2478} 2520}
2479 2521
2480#ifdef CONFIG_CIFS_EXPERIMENTAL 2522#ifdef CONFIG_CIFS_EXPERIMENTAL
2481/* Initialize NT TRANSACT SMB into small smb request buffer.
2482 This assumes that all NT TRANSACTS that we init here have
2483 total parm and data under about 400 bytes (to fit in small cifs
2484 buffer size), which is the case so far, it easily fits. NB:
2485 Setup words themselves and ByteCount
2486 MaxSetupCount (size of returned setup area) and
2487 MaxParameterCount (returned parms size) must be set by caller */
2488static int
2489smb_init_nttransact(const __u16 sub_command, const int setup_count,
2490 const int parm_len, struct cifsTconInfo *tcon,
2491 void **ret_buf)
2492{
2493 int rc;
2494 __u32 temp_offset;
2495 struct smb_com_ntransact_req *pSMB;
2496
2497 rc = small_smb_init(SMB_COM_NT_TRANSACT, 19 + setup_count, tcon,
2498 (void **)&pSMB);
2499 if (rc)
2500 return rc;
2501 *ret_buf = (void *)pSMB;
2502 pSMB->Reserved = 0;
2503 pSMB->TotalParameterCount = cpu_to_le32(parm_len);
2504 pSMB->TotalDataCount = 0;
2505 pSMB->MaxDataCount = cpu_to_le32((tcon->ses->server->maxBuf -
2506 MAX_CIFS_HDR_SIZE) & 0xFFFFFF00);
2507 pSMB->ParameterCount = pSMB->TotalParameterCount;
2508 pSMB->DataCount = pSMB->TotalDataCount;
2509 temp_offset = offsetof(struct smb_com_ntransact_req, Parms) +
2510 (setup_count * 2) - 4 /* for rfc1001 length itself */;
2511 pSMB->ParameterOffset = cpu_to_le32(temp_offset);
2512 pSMB->DataOffset = cpu_to_le32(temp_offset + parm_len);
2513 pSMB->SetupCount = setup_count; /* no need to le convert byte fields */
2514 pSMB->SubCommand = cpu_to_le16(sub_command);
2515 return 0;
2516}
2517
2518static int
2519validate_ntransact(char *buf, char **ppparm, char **ppdata,
2520 __u32 *pparmlen, __u32 *pdatalen)
2521{
2522 char *end_of_smb;
2523 __u32 data_count, data_offset, parm_count, parm_offset;
2524 struct smb_com_ntransact_rsp *pSMBr;
2525
2526 *pdatalen = 0;
2527 *pparmlen = 0;
2528
2529 if (buf == NULL)
2530 return -EINVAL;
2531
2532 pSMBr = (struct smb_com_ntransact_rsp *)buf;
2533
2534 /* ByteCount was converted from little endian in SendReceive */
2535 end_of_smb = 2 /* sizeof byte count */ + pSMBr->ByteCount +
2536 (char *)&pSMBr->ByteCount;
2537
2538 data_offset = le32_to_cpu(pSMBr->DataOffset);
2539 data_count = le32_to_cpu(pSMBr->DataCount);
2540 parm_offset = le32_to_cpu(pSMBr->ParameterOffset);
2541 parm_count = le32_to_cpu(pSMBr->ParameterCount);
2542
2543 *ppparm = (char *)&pSMBr->hdr.Protocol + parm_offset;
2544 *ppdata = (char *)&pSMBr->hdr.Protocol + data_offset;
2545
2546 /* should we also check that parm and data areas do not overlap? */
2547 if (*ppparm > end_of_smb) {
2548 cFYI(1, "parms start after end of smb");
2549 return -EINVAL;
2550 } else if (parm_count + *ppparm > end_of_smb) {
2551 cFYI(1, "parm end after end of smb");
2552 return -EINVAL;
2553 } else if (*ppdata > end_of_smb) {
2554 cFYI(1, "data starts after end of smb");
2555 return -EINVAL;
2556 } else if (data_count + *ppdata > end_of_smb) {
2557 cFYI(1, "data %p + count %d (%p) past smb end %p start %p",
2558 *ppdata, data_count, (data_count + *ppdata),
2559 end_of_smb, pSMBr);
2560 return -EINVAL;
2561 } else if (parm_count + data_count > pSMBr->ByteCount) {
2562 cFYI(1, "parm count and data count larger than SMB");
2563 return -EINVAL;
2564 }
2565 *pdatalen = data_count;
2566 *pparmlen = parm_count;
2567 return 0;
2568}
2569
2570int 2523int
2571CIFSSMBQueryReparseLinkInfo(const int xid, struct cifsTconInfo *tcon, 2524CIFSSMBQueryReparseLinkInfo(const int xid, struct cifsTconInfo *tcon,
2572 const unsigned char *searchName, 2525 const unsigned char *searchName,
@@ -3056,7 +3009,97 @@ GetExtAttrOut:
3056 3009
3057#endif /* CONFIG_POSIX */ 3010#endif /* CONFIG_POSIX */
3058 3011
3059#ifdef CONFIG_CIFS_EXPERIMENTAL 3012#ifdef CONFIG_CIFS_ACL
3013/*
3014 * Initialize NT TRANSACT SMB into small smb request buffer. This assumes that
3015 * all NT TRANSACTS that we init here have total parm and data under about 400
3016 * bytes (to fit in small cifs buffer size), which is the case so far, it
3017 * easily fits. NB: Setup words themselves and ByteCount MaxSetupCount (size of
3018 * returned setup area) and MaxParameterCount (returned parms size) must be set
3019 * by caller
3020 */
3021static int
3022smb_init_nttransact(const __u16 sub_command, const int setup_count,
3023 const int parm_len, struct cifsTconInfo *tcon,
3024 void **ret_buf)
3025{
3026 int rc;
3027 __u32 temp_offset;
3028 struct smb_com_ntransact_req *pSMB;
3029
3030 rc = small_smb_init(SMB_COM_NT_TRANSACT, 19 + setup_count, tcon,
3031 (void **)&pSMB);
3032 if (rc)
3033 return rc;
3034 *ret_buf = (void *)pSMB;
3035 pSMB->Reserved = 0;
3036 pSMB->TotalParameterCount = cpu_to_le32(parm_len);
3037 pSMB->TotalDataCount = 0;
3038 pSMB->MaxDataCount = cpu_to_le32((tcon->ses->server->maxBuf -
3039 MAX_CIFS_HDR_SIZE) & 0xFFFFFF00);
3040 pSMB->ParameterCount = pSMB->TotalParameterCount;
3041 pSMB->DataCount = pSMB->TotalDataCount;
3042 temp_offset = offsetof(struct smb_com_ntransact_req, Parms) +
3043 (setup_count * 2) - 4 /* for rfc1001 length itself */;
3044 pSMB->ParameterOffset = cpu_to_le32(temp_offset);
3045 pSMB->DataOffset = cpu_to_le32(temp_offset + parm_len);
3046 pSMB->SetupCount = setup_count; /* no need to le convert byte fields */
3047 pSMB->SubCommand = cpu_to_le16(sub_command);
3048 return 0;
3049}
3050
3051static int
3052validate_ntransact(char *buf, char **ppparm, char **ppdata,
3053 __u32 *pparmlen, __u32 *pdatalen)
3054{
3055 char *end_of_smb;
3056 __u32 data_count, data_offset, parm_count, parm_offset;
3057 struct smb_com_ntransact_rsp *pSMBr;
3058
3059 *pdatalen = 0;
3060 *pparmlen = 0;
3061
3062 if (buf == NULL)
3063 return -EINVAL;
3064
3065 pSMBr = (struct smb_com_ntransact_rsp *)buf;
3066
3067 /* ByteCount was converted from little endian in SendReceive */
3068 end_of_smb = 2 /* sizeof byte count */ + pSMBr->ByteCount +
3069 (char *)&pSMBr->ByteCount;
3070
3071 data_offset = le32_to_cpu(pSMBr->DataOffset);
3072 data_count = le32_to_cpu(pSMBr->DataCount);
3073 parm_offset = le32_to_cpu(pSMBr->ParameterOffset);
3074 parm_count = le32_to_cpu(pSMBr->ParameterCount);
3075
3076 *ppparm = (char *)&pSMBr->hdr.Protocol + parm_offset;
3077 *ppdata = (char *)&pSMBr->hdr.Protocol + data_offset;
3078
3079 /* should we also check that parm and data areas do not overlap? */
3080 if (*ppparm > end_of_smb) {
3081 cFYI(1, "parms start after end of smb");
3082 return -EINVAL;
3083 } else if (parm_count + *ppparm > end_of_smb) {
3084 cFYI(1, "parm end after end of smb");
3085 return -EINVAL;
3086 } else if (*ppdata > end_of_smb) {
3087 cFYI(1, "data starts after end of smb");
3088 return -EINVAL;
3089 } else if (data_count + *ppdata > end_of_smb) {
3090 cFYI(1, "data %p + count %d (%p) past smb end %p start %p",
3091 *ppdata, data_count, (data_count + *ppdata),
3092 end_of_smb, pSMBr);
3093 return -EINVAL;
3094 } else if (parm_count + data_count > pSMBr->ByteCount) {
3095 cFYI(1, "parm count and data count larger than SMB");
3096 return -EINVAL;
3097 }
3098 *pdatalen = data_count;
3099 *pparmlen = parm_count;
3100 return 0;
3101}
3102
3060/* Get Security Descriptor (by handle) from remote server for a file or dir */ 3103/* Get Security Descriptor (by handle) from remote server for a file or dir */
3061int 3104int
3062CIFSSMBGetCIFSACL(const int xid, struct cifsTconInfo *tcon, __u16 fid, 3105CIFSSMBGetCIFSACL(const int xid, struct cifsTconInfo *tcon, __u16 fid,
@@ -3089,7 +3132,7 @@ CIFSSMBGetCIFSACL(const int xid, struct cifsTconInfo *tcon, __u16 fid,
3089 iov[0].iov_len = pSMB->hdr.smb_buf_length + 4; 3132 iov[0].iov_len = pSMB->hdr.smb_buf_length + 4;
3090 3133
3091 rc = SendReceive2(xid, tcon->ses, iov, 1 /* num iovec */, &buf_type, 3134 rc = SendReceive2(xid, tcon->ses, iov, 1 /* num iovec */, &buf_type,
3092 CIFS_STD_OP); 3135 0);
3093 cifs_stats_inc(&tcon->num_acl_get); 3136 cifs_stats_inc(&tcon->num_acl_get);
3094 if (rc) { 3137 if (rc) {
3095 cFYI(1, "Send error in QuerySecDesc = %d", rc); 3138 cFYI(1, "Send error in QuerySecDesc = %d", rc);
@@ -3214,7 +3257,7 @@ setCifsAclRetry:
3214 return (rc); 3257 return (rc);
3215} 3258}
3216 3259
3217#endif /* CONFIG_CIFS_EXPERIMENTAL */ 3260#endif /* CONFIG_CIFS_ACL */
3218 3261
3219/* Legacy Query Path Information call for lookup to old servers such 3262/* Legacy Query Path Information call for lookup to old servers such
3220 as Win9x/WinME */ 3263 as Win9x/WinME */
@@ -5564,7 +5607,7 @@ QAllEAsRetry:
5564 } 5607 }
5565 5608
5566 /* make sure list_len doesn't go past end of SMB */ 5609 /* make sure list_len doesn't go past end of SMB */
5567 end_of_smb = (char *)pByteArea(&pSMBr->hdr) + BCC(&pSMBr->hdr); 5610 end_of_smb = (char *)pByteArea(&pSMBr->hdr) + get_bcc(&pSMBr->hdr);
5568 if ((char *)ea_response_data + list_len > end_of_smb) { 5611 if ((char *)ea_response_data + list_len > end_of_smb) {
5569 cFYI(1, "EA list appears to go beyond SMB"); 5612 cFYI(1, "EA list appears to go beyond SMB");
5570 rc = -EIO; 5613 rc = -EIO;
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 9eb327defa1..47d8ff62368 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -52,8 +52,8 @@
52#define CIFS_PORT 445 52#define CIFS_PORT 445
53#define RFC1001_PORT 139 53#define RFC1001_PORT 139
54 54
55extern void SMBNTencrypt(unsigned char *passwd, unsigned char *c8, 55/* SMB echo "timeout" -- FIXME: tunable? */
56 unsigned char *p24); 56#define SMB_ECHO_INTERVAL (60 * HZ)
57 57
58extern mempool_t *cifs_req_poolp; 58extern mempool_t *cifs_req_poolp;
59 59
@@ -64,8 +64,8 @@ struct smb_vol {
64 char *UNC; 64 char *UNC;
65 char *UNCip; 65 char *UNCip;
66 char *iocharset; /* local code page for mapping to and from Unicode */ 66 char *iocharset; /* local code page for mapping to and from Unicode */
67 char source_rfc1001_name[16]; /* netbios name of client */ 67 char source_rfc1001_name[RFC1001_NAME_LEN_WITH_NULL]; /* clnt nb name */
68 char target_rfc1001_name[16]; /* netbios name of server for Win9x/ME */ 68 char target_rfc1001_name[RFC1001_NAME_LEN_WITH_NULL]; /* srvr nb name */
69 uid_t cred_uid; 69 uid_t cred_uid;
70 uid_t linux_uid; 70 uid_t linux_uid;
71 gid_t linux_gid; 71 gid_t linux_gid;
@@ -84,6 +84,7 @@ struct smb_vol {
84 bool no_xattr:1; /* set if xattr (EA) support should be disabled*/ 84 bool no_xattr:1; /* set if xattr (EA) support should be disabled*/
85 bool server_ino:1; /* use inode numbers from server ie UniqueId */ 85 bool server_ino:1; /* use inode numbers from server ie UniqueId */
86 bool direct_io:1; 86 bool direct_io:1;
87 bool strict_io:1; /* strict cache behavior */
87 bool remap:1; /* set to remap seven reserved chars in filenames */ 88 bool remap:1; /* set to remap seven reserved chars in filenames */
88 bool posix_paths:1; /* unset to not ask for posix pathnames. */ 89 bool posix_paths:1; /* unset to not ask for posix pathnames. */
89 bool no_linux_ext:1; 90 bool no_linux_ext:1;
@@ -105,6 +106,7 @@ struct smb_vol {
105 unsigned int wsize; 106 unsigned int wsize;
106 bool sockopt_tcp_nodelay:1; 107 bool sockopt_tcp_nodelay:1;
107 unsigned short int port; 108 unsigned short int port;
109 unsigned long actimeo; /* attribute cache timeout (jiffies) */
108 char *prepath; 110 char *prepath;
109 struct sockaddr_storage srcaddr; /* allow binding to a local IP */ 111 struct sockaddr_storage srcaddr; /* allow binding to a local IP */
110 struct nls_table *local_nls; 112 struct nls_table *local_nls;
@@ -114,8 +116,9 @@ struct smb_vol {
114#define TLINK_ERROR_EXPIRE (1 * HZ) 116#define TLINK_ERROR_EXPIRE (1 * HZ)
115#define TLINK_IDLE_EXPIRE (600 * HZ) 117#define TLINK_IDLE_EXPIRE (600 * HZ)
116 118
117static int ipv4_connect(struct TCP_Server_Info *server); 119static int ip_connect(struct TCP_Server_Info *server);
118static int ipv6_connect(struct TCP_Server_Info *server); 120static int generic_ip_connect(struct TCP_Server_Info *server);
121static void tlink_rb_insert(struct rb_root *root, struct tcon_link *new_tlink);
119static void cifs_prune_tlinks(struct work_struct *work); 122static void cifs_prune_tlinks(struct work_struct *work);
120 123
121/* 124/*
@@ -150,6 +153,7 @@ cifs_reconnect(struct TCP_Server_Info *server)
150 153
151 /* before reconnecting the tcp session, mark the smb session (uid) 154 /* before reconnecting the tcp session, mark the smb session (uid)
152 and the tid bad so they are not used until reconnected */ 155 and the tid bad so they are not used until reconnected */
156 cFYI(1, "%s: marking sessions and tcons for reconnect", __func__);
153 spin_lock(&cifs_tcp_ses_lock); 157 spin_lock(&cifs_tcp_ses_lock);
154 list_for_each(tmp, &server->smb_ses_list) { 158 list_for_each(tmp, &server->smb_ses_list) {
155 ses = list_entry(tmp, struct cifsSesInfo, smb_ses_list); 159 ses = list_entry(tmp, struct cifsSesInfo, smb_ses_list);
@@ -161,7 +165,9 @@ cifs_reconnect(struct TCP_Server_Info *server)
161 } 165 }
162 } 166 }
163 spin_unlock(&cifs_tcp_ses_lock); 167 spin_unlock(&cifs_tcp_ses_lock);
168
164 /* do not want to be sending data on a socket we are freeing */ 169 /* do not want to be sending data on a socket we are freeing */
170 cFYI(1, "%s: tearing down socket", __func__);
165 mutex_lock(&server->srv_mutex); 171 mutex_lock(&server->srv_mutex);
166 if (server->ssocket) { 172 if (server->ssocket) {
167 cFYI(1, "State: 0x%x Flags: 0x%lx", server->ssocket->state, 173 cFYI(1, "State: 0x%x Flags: 0x%lx", server->ssocket->state,
@@ -178,30 +184,27 @@ cifs_reconnect(struct TCP_Server_Info *server)
178 kfree(server->session_key.response); 184 kfree(server->session_key.response);
179 server->session_key.response = NULL; 185 server->session_key.response = NULL;
180 server->session_key.len = 0; 186 server->session_key.len = 0;
187 server->lstrp = jiffies;
188 mutex_unlock(&server->srv_mutex);
181 189
190 /* mark submitted MIDs for retry and issue callback */
191 cFYI(1, "%s: issuing mid callbacks", __func__);
182 spin_lock(&GlobalMid_Lock); 192 spin_lock(&GlobalMid_Lock);
183 list_for_each(tmp, &server->pending_mid_q) { 193 list_for_each_safe(tmp, tmp2, &server->pending_mid_q) {
184 mid_entry = list_entry(tmp, struct 194 mid_entry = list_entry(tmp, struct mid_q_entry, qhead);
185 mid_q_entry, 195 if (mid_entry->midState == MID_REQUEST_SUBMITTED)
186 qhead);
187 if (mid_entry->midState == MID_REQUEST_SUBMITTED) {
188 /* Mark other intransit requests as needing
189 retry so we do not immediately mark the
190 session bad again (ie after we reconnect
191 below) as they timeout too */
192 mid_entry->midState = MID_RETRY_NEEDED; 196 mid_entry->midState = MID_RETRY_NEEDED;
193 } 197 list_del_init(&mid_entry->qhead);
198 mid_entry->callback(mid_entry);
194 } 199 }
195 spin_unlock(&GlobalMid_Lock); 200 spin_unlock(&GlobalMid_Lock);
196 mutex_unlock(&server->srv_mutex);
197 201
198 while ((server->tcpStatus != CifsExiting) && 202 while ((server->tcpStatus != CifsExiting) &&
199 (server->tcpStatus != CifsGood)) { 203 (server->tcpStatus != CifsGood)) {
200 try_to_freeze(); 204 try_to_freeze();
201 if (server->addr.sockAddr6.sin6_family == AF_INET6) 205
202 rc = ipv6_connect(server); 206 /* we should try only the port we connected to before */
203 else 207 rc = generic_ip_connect(server);
204 rc = ipv4_connect(server);
205 if (rc) { 208 if (rc) {
206 cFYI(1, "reconnect error %d", rc); 209 cFYI(1, "reconnect error %d", rc);
207 msleep(3000); 210 msleep(3000);
@@ -211,10 +214,9 @@ cifs_reconnect(struct TCP_Server_Info *server)
211 if (server->tcpStatus != CifsExiting) 214 if (server->tcpStatus != CifsExiting)
212 server->tcpStatus = CifsGood; 215 server->tcpStatus = CifsGood;
213 spin_unlock(&GlobalMid_Lock); 216 spin_unlock(&GlobalMid_Lock);
214 /* atomic_set(&server->inFlight,0);*/
215 wake_up(&server->response_q);
216 } 217 }
217 } 218 }
219
218 return rc; 220 return rc;
219} 221}
220 222
@@ -228,9 +230,8 @@ cifs_reconnect(struct TCP_Server_Info *server)
228static int check2ndT2(struct smb_hdr *pSMB, unsigned int maxBufSize) 230static int check2ndT2(struct smb_hdr *pSMB, unsigned int maxBufSize)
229{ 231{
230 struct smb_t2_rsp *pSMBt; 232 struct smb_t2_rsp *pSMBt;
231 int total_data_size;
232 int data_in_this_rsp;
233 int remaining; 233 int remaining;
234 __u16 total_data_size, data_in_this_rsp;
234 235
235 if (pSMB->Command != SMB_COM_TRANSACTION2) 236 if (pSMB->Command != SMB_COM_TRANSACTION2)
236 return 0; 237 return 0;
@@ -244,8 +245,8 @@ static int check2ndT2(struct smb_hdr *pSMB, unsigned int maxBufSize)
244 245
245 pSMBt = (struct smb_t2_rsp *)pSMB; 246 pSMBt = (struct smb_t2_rsp *)pSMB;
246 247
247 total_data_size = le16_to_cpu(pSMBt->t2_rsp.TotalDataCount); 248 total_data_size = get_unaligned_le16(&pSMBt->t2_rsp.TotalDataCount);
248 data_in_this_rsp = le16_to_cpu(pSMBt->t2_rsp.DataCount); 249 data_in_this_rsp = get_unaligned_le16(&pSMBt->t2_rsp.DataCount);
249 250
250 remaining = total_data_size - data_in_this_rsp; 251 remaining = total_data_size - data_in_this_rsp;
251 252
@@ -271,21 +272,18 @@ static int coalesce_t2(struct smb_hdr *psecond, struct smb_hdr *pTargetSMB)
271{ 272{
272 struct smb_t2_rsp *pSMB2 = (struct smb_t2_rsp *)psecond; 273 struct smb_t2_rsp *pSMB2 = (struct smb_t2_rsp *)psecond;
273 struct smb_t2_rsp *pSMBt = (struct smb_t2_rsp *)pTargetSMB; 274 struct smb_t2_rsp *pSMBt = (struct smb_t2_rsp *)pTargetSMB;
274 int total_data_size;
275 int total_in_buf;
276 int remaining;
277 int total_in_buf2;
278 char *data_area_of_target; 275 char *data_area_of_target;
279 char *data_area_of_buf2; 276 char *data_area_of_buf2;
280 __u16 byte_count; 277 int remaining;
278 __u16 byte_count, total_data_size, total_in_buf, total_in_buf2;
281 279
282 total_data_size = le16_to_cpu(pSMBt->t2_rsp.TotalDataCount); 280 total_data_size = get_unaligned_le16(&pSMBt->t2_rsp.TotalDataCount);
283 281
284 if (total_data_size != le16_to_cpu(pSMB2->t2_rsp.TotalDataCount)) { 282 if (total_data_size !=
283 get_unaligned_le16(&pSMB2->t2_rsp.TotalDataCount))
285 cFYI(1, "total data size of primary and secondary t2 differ"); 284 cFYI(1, "total data size of primary and secondary t2 differ");
286 }
287 285
288 total_in_buf = le16_to_cpu(pSMBt->t2_rsp.DataCount); 286 total_in_buf = get_unaligned_le16(&pSMBt->t2_rsp.DataCount);
289 287
290 remaining = total_data_size - total_in_buf; 288 remaining = total_data_size - total_in_buf;
291 289
@@ -295,28 +293,28 @@ static int coalesce_t2(struct smb_hdr *psecond, struct smb_hdr *pTargetSMB)
295 if (remaining == 0) /* nothing to do, ignore */ 293 if (remaining == 0) /* nothing to do, ignore */
296 return 0; 294 return 0;
297 295
298 total_in_buf2 = le16_to_cpu(pSMB2->t2_rsp.DataCount); 296 total_in_buf2 = get_unaligned_le16(&pSMB2->t2_rsp.DataCount);
299 if (remaining < total_in_buf2) { 297 if (remaining < total_in_buf2) {
300 cFYI(1, "transact2 2nd response contains too much data"); 298 cFYI(1, "transact2 2nd response contains too much data");
301 } 299 }
302 300
303 /* find end of first SMB data area */ 301 /* find end of first SMB data area */
304 data_area_of_target = (char *)&pSMBt->hdr.Protocol + 302 data_area_of_target = (char *)&pSMBt->hdr.Protocol +
305 le16_to_cpu(pSMBt->t2_rsp.DataOffset); 303 get_unaligned_le16(&pSMBt->t2_rsp.DataOffset);
306 /* validate target area */ 304 /* validate target area */
307 305
308 data_area_of_buf2 = (char *) &pSMB2->hdr.Protocol + 306 data_area_of_buf2 = (char *)&pSMB2->hdr.Protocol +
309 le16_to_cpu(pSMB2->t2_rsp.DataOffset); 307 get_unaligned_le16(&pSMB2->t2_rsp.DataOffset);
310 308
311 data_area_of_target += total_in_buf; 309 data_area_of_target += total_in_buf;
312 310
313 /* copy second buffer into end of first buffer */ 311 /* copy second buffer into end of first buffer */
314 memcpy(data_area_of_target, data_area_of_buf2, total_in_buf2); 312 memcpy(data_area_of_target, data_area_of_buf2, total_in_buf2);
315 total_in_buf += total_in_buf2; 313 total_in_buf += total_in_buf2;
316 pSMBt->t2_rsp.DataCount = cpu_to_le16(total_in_buf); 314 put_unaligned_le16(total_in_buf, &pSMBt->t2_rsp.DataCount);
317 byte_count = le16_to_cpu(BCC_LE(pTargetSMB)); 315 byte_count = get_bcc_le(pTargetSMB);
318 byte_count += total_in_buf2; 316 byte_count += total_in_buf2;
319 BCC_LE(pTargetSMB) = cpu_to_le16(byte_count); 317 put_bcc_le(byte_count, pTargetSMB);
320 318
321 byte_count = pTargetSMB->smb_buf_length; 319 byte_count = pTargetSMB->smb_buf_length;
322 byte_count += total_in_buf2; 320 byte_count += total_in_buf2;
@@ -330,7 +328,26 @@ static int coalesce_t2(struct smb_hdr *psecond, struct smb_hdr *pTargetSMB)
330 return 0; /* we are done */ 328 return 0; /* we are done */
331 } else /* more responses to go */ 329 } else /* more responses to go */
332 return 1; 330 return 1;
331}
332
333static void
334cifs_echo_request(struct work_struct *work)
335{
336 int rc;
337 struct TCP_Server_Info *server = container_of(work,
338 struct TCP_Server_Info, echo.work);
333 339
340 /* no need to ping if we got a response recently */
341 if (time_before(jiffies, server->lstrp + SMB_ECHO_INTERVAL - HZ))
342 goto requeue_echo;
343
344 rc = CIFSSMBEcho(server);
345 if (rc)
346 cFYI(1, "Unable to send echo request to server: %s",
347 server->hostname);
348
349requeue_echo:
350 queue_delayed_work(system_nrt_wq, &server->echo, SMB_ECHO_INTERVAL);
334} 351}
335 352
336static int 353static int
@@ -344,8 +361,7 @@ cifs_demultiplex_thread(struct TCP_Server_Info *server)
344 struct msghdr smb_msg; 361 struct msghdr smb_msg;
345 struct kvec iov; 362 struct kvec iov;
346 struct socket *csocket = server->ssocket; 363 struct socket *csocket = server->ssocket;
347 struct list_head *tmp; 364 struct list_head *tmp, *tmp2;
348 struct cifsSesInfo *ses;
349 struct task_struct *task_to_wake = NULL; 365 struct task_struct *task_to_wake = NULL;
350 struct mid_q_entry *mid_entry; 366 struct mid_q_entry *mid_entry;
351 char temp; 367 char temp;
@@ -398,7 +414,20 @@ cifs_demultiplex_thread(struct TCP_Server_Info *server)
398 smb_msg.msg_control = NULL; 414 smb_msg.msg_control = NULL;
399 smb_msg.msg_controllen = 0; 415 smb_msg.msg_controllen = 0;
400 pdu_length = 4; /* enough to get RFC1001 header */ 416 pdu_length = 4; /* enough to get RFC1001 header */
417
401incomplete_rcv: 418incomplete_rcv:
419 if (echo_retries > 0 &&
420 time_after(jiffies, server->lstrp +
421 (echo_retries * SMB_ECHO_INTERVAL))) {
422 cERROR(1, "Server %s has not responded in %d seconds. "
423 "Reconnecting...", server->hostname,
424 (echo_retries * SMB_ECHO_INTERVAL / HZ));
425 cifs_reconnect(server);
426 csocket = server->ssocket;
427 wake_up(&server->response_q);
428 continue;
429 }
430
402 length = 431 length =
403 kernel_recvmsg(csocket, &smb_msg, 432 kernel_recvmsg(csocket, &smb_msg,
404 &iov, 1, pdu_length, 0 /* BB other flags? */); 433 &iov, 1, pdu_length, 0 /* BB other flags? */);
@@ -475,7 +504,7 @@ incomplete_rcv:
475 * initialize frame) 504 * initialize frame)
476 */ 505 */
477 cifs_set_port((struct sockaddr *) 506 cifs_set_port((struct sockaddr *)
478 &server->addr.sockAddr, CIFS_PORT); 507 &server->dstaddr, CIFS_PORT);
479 cifs_reconnect(server); 508 cifs_reconnect(server);
480 csocket = server->ssocket; 509 csocket = server->ssocket;
481 wake_up(&server->response_q); 510 wake_up(&server->response_q);
@@ -558,10 +587,11 @@ incomplete_rcv:
558 continue; 587 continue;
559 } 588 }
560 589
590 mid_entry = NULL;
591 server->lstrp = jiffies;
561 592
562 task_to_wake = NULL;
563 spin_lock(&GlobalMid_Lock); 593 spin_lock(&GlobalMid_Lock);
564 list_for_each(tmp, &server->pending_mid_q) { 594 list_for_each_safe(tmp, tmp2, &server->pending_mid_q) {
565 mid_entry = list_entry(tmp, struct mid_q_entry, qhead); 595 mid_entry = list_entry(tmp, struct mid_q_entry, qhead);
566 596
567 if ((mid_entry->mid == smb_buffer->Mid) && 597 if ((mid_entry->mid == smb_buffer->Mid) &&
@@ -602,20 +632,19 @@ incomplete_rcv:
602 mid_entry->resp_buf = smb_buffer; 632 mid_entry->resp_buf = smb_buffer;
603 mid_entry->largeBuf = isLargeBuf; 633 mid_entry->largeBuf = isLargeBuf;
604multi_t2_fnd: 634multi_t2_fnd:
605 task_to_wake = mid_entry->tsk;
606 mid_entry->midState = MID_RESPONSE_RECEIVED; 635 mid_entry->midState = MID_RESPONSE_RECEIVED;
636 list_del_init(&mid_entry->qhead);
637 mid_entry->callback(mid_entry);
607#ifdef CONFIG_CIFS_STATS2 638#ifdef CONFIG_CIFS_STATS2
608 mid_entry->when_received = jiffies; 639 mid_entry->when_received = jiffies;
609#endif 640#endif
610 /* so we do not time out requests to server
611 which is still responding (since server could
612 be busy but not dead) */
613 server->lstrp = jiffies;
614 break; 641 break;
615 } 642 }
643 mid_entry = NULL;
616 } 644 }
617 spin_unlock(&GlobalMid_Lock); 645 spin_unlock(&GlobalMid_Lock);
618 if (task_to_wake) { 646
647 if (mid_entry != NULL) {
619 /* Was previous buf put in mpx struct for multi-rsp? */ 648 /* Was previous buf put in mpx struct for multi-rsp? */
620 if (!isMultiRsp) { 649 if (!isMultiRsp) {
621 /* smb buffer will be freed by user thread */ 650 /* smb buffer will be freed by user thread */
@@ -624,11 +653,10 @@ multi_t2_fnd:
624 else 653 else
625 smallbuf = NULL; 654 smallbuf = NULL;
626 } 655 }
627 wake_up_process(task_to_wake);
628 } else if (!is_valid_oplock_break(smb_buffer, server) && 656 } else if (!is_valid_oplock_break(smb_buffer, server) &&
629 !isMultiRsp) { 657 !isMultiRsp) {
630 cERROR(1, "No task to wake, unknown frame received! " 658 cERROR(1, "No task to wake, unknown frame received! "
631 "NumMids %d", midCount.counter); 659 "NumMids %d", atomic_read(&midCount));
632 cifs_dump_mem("Received Data is: ", (char *)smb_buffer, 660 cifs_dump_mem("Received Data is: ", (char *)smb_buffer,
633 sizeof(struct smb_hdr)); 661 sizeof(struct smb_hdr));
634#ifdef CONFIG_CIFS_DEBUG2 662#ifdef CONFIG_CIFS_DEBUG2
@@ -676,44 +704,16 @@ multi_t2_fnd:
676 if (smallbuf) /* no sense logging a debug message if NULL */ 704 if (smallbuf) /* no sense logging a debug message if NULL */
677 cifs_small_buf_release(smallbuf); 705 cifs_small_buf_release(smallbuf);
678 706
679 /* 707 if (!list_empty(&server->pending_mid_q)) {
680 * BB: we shouldn't have to do any of this. It shouldn't be
681 * possible to exit from the thread with active SMB sessions
682 */
683 spin_lock(&cifs_tcp_ses_lock);
684 if (list_empty(&server->pending_mid_q)) {
685 /* loop through server session structures attached to this and
686 mark them dead */
687 list_for_each(tmp, &server->smb_ses_list) {
688 ses = list_entry(tmp, struct cifsSesInfo,
689 smb_ses_list);
690 ses->status = CifsExiting;
691 ses->server = NULL;
692 }
693 spin_unlock(&cifs_tcp_ses_lock);
694 } else {
695 /* although we can not zero the server struct pointer yet,
696 since there are active requests which may depnd on them,
697 mark the corresponding SMB sessions as exiting too */
698 list_for_each(tmp, &server->smb_ses_list) {
699 ses = list_entry(tmp, struct cifsSesInfo,
700 smb_ses_list);
701 ses->status = CifsExiting;
702 }
703
704 spin_lock(&GlobalMid_Lock); 708 spin_lock(&GlobalMid_Lock);
705 list_for_each(tmp, &server->pending_mid_q) { 709 list_for_each_safe(tmp, tmp2, &server->pending_mid_q) {
706 mid_entry = list_entry(tmp, struct mid_q_entry, qhead); 710 mid_entry = list_entry(tmp, struct mid_q_entry, qhead);
707 if (mid_entry->midState == MID_REQUEST_SUBMITTED) { 711 cFYI(1, "Clearing Mid 0x%x - issuing callback",
708 cFYI(1, "Clearing Mid 0x%x - waking up ",
709 mid_entry->mid); 712 mid_entry->mid);
710 task_to_wake = mid_entry->tsk; 713 list_del_init(&mid_entry->qhead);
711 if (task_to_wake) 714 mid_entry->callback(mid_entry);
712 wake_up_process(task_to_wake);
713 }
714 } 715 }
715 spin_unlock(&GlobalMid_Lock); 716 spin_unlock(&GlobalMid_Lock);
716 spin_unlock(&cifs_tcp_ses_lock);
717 /* 1/8th of sec is more than enough time for them to exit */ 717 /* 1/8th of sec is more than enough time for them to exit */
718 msleep(125); 718 msleep(125);
719 } 719 }
@@ -731,18 +731,6 @@ multi_t2_fnd:
731 coming home not much else we can do but free the memory */ 731 coming home not much else we can do but free the memory */
732 } 732 }
733 733
734 /* last chance to mark ses pointers invalid
735 if there are any pointing to this (e.g
736 if a crazy root user tried to kill cifsd
737 kernel thread explicitly this might happen) */
738 /* BB: This shouldn't be necessary, see above */
739 spin_lock(&cifs_tcp_ses_lock);
740 list_for_each(tmp, &server->smb_ses_list) {
741 ses = list_entry(tmp, struct cifsSesInfo, smb_ses_list);
742 ses->server = NULL;
743 }
744 spin_unlock(&cifs_tcp_ses_lock);
745
746 kfree(server->hostname); 734 kfree(server->hostname);
747 task_to_wake = xchg(&server->tsk, NULL); 735 task_to_wake = xchg(&server->tsk, NULL);
748 kfree(server); 736 kfree(server);
@@ -805,24 +793,21 @@ cifs_parse_mount_options(char *options, const char *devname,
805 short int override_gid = -1; 793 short int override_gid = -1;
806 bool uid_specified = false; 794 bool uid_specified = false;
807 bool gid_specified = false; 795 bool gid_specified = false;
796 char *nodename = utsname()->nodename;
808 797
809 separator[0] = ','; 798 separator[0] = ',';
810 separator[1] = 0; 799 separator[1] = 0;
811 800
812 if (Local_System_Name[0] != 0) 801 /*
813 memcpy(vol->source_rfc1001_name, Local_System_Name, 15); 802 * does not have to be perfect mapping since field is
814 else { 803 * informational, only used for servers that do not support
815 char *nodename = utsname()->nodename; 804 * port 445 and it can be overridden at mount time
816 int n = strnlen(nodename, 15); 805 */
817 memset(vol->source_rfc1001_name, 0x20, 15); 806 memset(vol->source_rfc1001_name, 0x20, RFC1001_NAME_LEN);
818 for (i = 0; i < n; i++) { 807 for (i = 0; i < strnlen(nodename, RFC1001_NAME_LEN); i++)
819 /* does not have to be perfect mapping since field is 808 vol->source_rfc1001_name[i] = toupper(nodename[i]);
820 informational, only used for servers that do not support 809
821 port 445 and it can be overridden at mount time */ 810 vol->source_rfc1001_name[RFC1001_NAME_LEN] = 0;
822 vol->source_rfc1001_name[i] = toupper(nodename[i]);
823 }
824 }
825 vol->source_rfc1001_name[15] = 0;
826 /* null target name indicates to use *SMBSERVR default called name 811 /* null target name indicates to use *SMBSERVR default called name
827 if we end up sending RFC1001 session initialize */ 812 if we end up sending RFC1001 session initialize */
828 vol->target_rfc1001_name[0] = 0; 813 vol->target_rfc1001_name[0] = 0;
@@ -839,6 +824,8 @@ cifs_parse_mount_options(char *options, const char *devname,
839 /* default to using server inode numbers where available */ 824 /* default to using server inode numbers where available */
840 vol->server_ino = 1; 825 vol->server_ino = 1;
841 826
827 vol->actimeo = CIFS_DEF_ACTIMEO;
828
842 if (!options) 829 if (!options)
843 return 1; 830 return 1;
844 831
@@ -984,13 +971,11 @@ cifs_parse_mount_options(char *options, const char *devname,
984 return 1; 971 return 1;
985 } else if (strnicmp(value, "krb5", 4) == 0) { 972 } else if (strnicmp(value, "krb5", 4) == 0) {
986 vol->secFlg |= CIFSSEC_MAY_KRB5; 973 vol->secFlg |= CIFSSEC_MAY_KRB5;
987#ifdef CONFIG_CIFS_EXPERIMENTAL
988 } else if (strnicmp(value, "ntlmsspi", 8) == 0) { 974 } else if (strnicmp(value, "ntlmsspi", 8) == 0) {
989 vol->secFlg |= CIFSSEC_MAY_NTLMSSP | 975 vol->secFlg |= CIFSSEC_MAY_NTLMSSP |
990 CIFSSEC_MUST_SIGN; 976 CIFSSEC_MUST_SIGN;
991 } else if (strnicmp(value, "ntlmssp", 7) == 0) { 977 } else if (strnicmp(value, "ntlmssp", 7) == 0) {
992 vol->secFlg |= CIFSSEC_MAY_NTLMSSP; 978 vol->secFlg |= CIFSSEC_MAY_NTLMSSP;
993#endif
994 } else if (strnicmp(value, "ntlmv2i", 7) == 0) { 979 } else if (strnicmp(value, "ntlmv2i", 7) == 0) {
995 vol->secFlg |= CIFSSEC_MAY_NTLMV2 | 980 vol->secFlg |= CIFSSEC_MAY_NTLMV2 |
996 CIFSSEC_MUST_SIGN; 981 CIFSSEC_MUST_SIGN;
@@ -1115,6 +1100,8 @@ cifs_parse_mount_options(char *options, const char *devname,
1115 } else if (!strnicmp(data, "uid", 3) && value && *value) { 1100 } else if (!strnicmp(data, "uid", 3) && value && *value) {
1116 vol->linux_uid = simple_strtoul(value, &value, 0); 1101 vol->linux_uid = simple_strtoul(value, &value, 0);
1117 uid_specified = true; 1102 uid_specified = true;
1103 } else if (!strnicmp(data, "cruid", 5) && value && *value) {
1104 vol->cred_uid = simple_strtoul(value, &value, 0);
1118 } else if (!strnicmp(data, "forceuid", 8)) { 1105 } else if (!strnicmp(data, "forceuid", 8)) {
1119 override_uid = 1; 1106 override_uid = 1;
1120 } else if (!strnicmp(data, "noforceuid", 10)) { 1107 } else if (!strnicmp(data, "noforceuid", 10)) {
@@ -1167,22 +1154,22 @@ cifs_parse_mount_options(char *options, const char *devname,
1167 if (!value || !*value || (*value == ' ')) { 1154 if (!value || !*value || (*value == ' ')) {
1168 cFYI(1, "invalid (empty) netbiosname"); 1155 cFYI(1, "invalid (empty) netbiosname");
1169 } else { 1156 } else {
1170 memset(vol->source_rfc1001_name, 0x20, 15); 1157 memset(vol->source_rfc1001_name, 0x20,
1171 for (i = 0; i < 15; i++) { 1158 RFC1001_NAME_LEN);
1172 /* BB are there cases in which a comma can be 1159 /*
1173 valid in this workstation netbios name (and need 1160 * FIXME: are there cases in which a comma can
1174 special handling)? */ 1161 * be valid in workstation netbios name (and
1175 1162 * need special handling)?
1176 /* We do not uppercase netbiosname for user */ 1163 */
1164 for (i = 0; i < RFC1001_NAME_LEN; i++) {
1165 /* don't ucase netbiosname for user */
1177 if (value[i] == 0) 1166 if (value[i] == 0)
1178 break; 1167 break;
1179 else 1168 vol->source_rfc1001_name[i] = value[i];
1180 vol->source_rfc1001_name[i] =
1181 value[i];
1182 } 1169 }
1183 /* The string has 16th byte zero still from 1170 /* The string has 16th byte zero still from
1184 set at top of the function */ 1171 set at top of the function */
1185 if ((i == 15) && (value[i] != 0)) 1172 if (i == RFC1001_NAME_LEN && value[i] != 0)
1186 printk(KERN_WARNING "CIFS: netbiosname" 1173 printk(KERN_WARNING "CIFS: netbiosname"
1187 " longer than 15 truncated.\n"); 1174 " longer than 15 truncated.\n");
1188 } 1175 }
@@ -1192,7 +1179,8 @@ cifs_parse_mount_options(char *options, const char *devname,
1192 cFYI(1, "empty server netbiosname specified"); 1179 cFYI(1, "empty server netbiosname specified");
1193 } else { 1180 } else {
1194 /* last byte, type, is 0x20 for servr type */ 1181 /* last byte, type, is 0x20 for servr type */
1195 memset(vol->target_rfc1001_name, 0x20, 16); 1182 memset(vol->target_rfc1001_name, 0x20,
1183 RFC1001_NAME_LEN_WITH_NULL);
1196 1184
1197 for (i = 0; i < 15; i++) { 1185 for (i = 0; i < 15; i++) {
1198 /* BB are there cases in which a comma can be 1186 /* BB are there cases in which a comma can be
@@ -1209,10 +1197,20 @@ cifs_parse_mount_options(char *options, const char *devname,
1209 } 1197 }
1210 /* The string has 16th byte zero still from 1198 /* The string has 16th byte zero still from
1211 set at top of the function */ 1199 set at top of the function */
1212 if ((i == 15) && (value[i] != 0)) 1200 if (i == RFC1001_NAME_LEN && value[i] != 0)
1213 printk(KERN_WARNING "CIFS: server net" 1201 printk(KERN_WARNING "CIFS: server net"
1214 "biosname longer than 15 truncated.\n"); 1202 "biosname longer than 15 truncated.\n");
1215 } 1203 }
1204 } else if (strnicmp(data, "actimeo", 7) == 0) {
1205 if (value && *value) {
1206 vol->actimeo = HZ * simple_strtoul(value,
1207 &value, 0);
1208 if (vol->actimeo > CIFS_MAX_ACTIMEO) {
1209 cERROR(1, "CIFS: attribute cache"
1210 "timeout too large");
1211 return 1;
1212 }
1213 }
1216 } else if (strnicmp(data, "credentials", 4) == 0) { 1214 } else if (strnicmp(data, "credentials", 4) == 0) {
1217 /* ignore */ 1215 /* ignore */
1218 } else if (strnicmp(data, "version", 3) == 0) { 1216 } else if (strnicmp(data, "version", 3) == 0) {
@@ -1330,10 +1328,8 @@ cifs_parse_mount_options(char *options, const char *devname,
1330 vol->no_psx_acl = 0; 1328 vol->no_psx_acl = 0;
1331 } else if (strnicmp(data, "noacl", 5) == 0) { 1329 } else if (strnicmp(data, "noacl", 5) == 0) {
1332 vol->no_psx_acl = 1; 1330 vol->no_psx_acl = 1;
1333#ifdef CONFIG_CIFS_EXPERIMENTAL
1334 } else if (strnicmp(data, "locallease", 6) == 0) { 1331 } else if (strnicmp(data, "locallease", 6) == 0) {
1335 vol->local_lease = 1; 1332 vol->local_lease = 1;
1336#endif
1337 } else if (strnicmp(data, "sign", 4) == 0) { 1333 } else if (strnicmp(data, "sign", 4) == 0) {
1338 vol->secFlg |= CIFSSEC_MUST_SIGN; 1334 vol->secFlg |= CIFSSEC_MUST_SIGN;
1339 } else if (strnicmp(data, "seal", 4) == 0) { 1335 } else if (strnicmp(data, "seal", 4) == 0) {
@@ -1346,11 +1342,18 @@ cifs_parse_mount_options(char *options, const char *devname,
1346 vol->direct_io = 1; 1342 vol->direct_io = 1;
1347 } else if (strnicmp(data, "forcedirectio", 13) == 0) { 1343 } else if (strnicmp(data, "forcedirectio", 13) == 0) {
1348 vol->direct_io = 1; 1344 vol->direct_io = 1;
1345 } else if (strnicmp(data, "strictcache", 11) == 0) {
1346 vol->strict_io = 1;
1349 } else if (strnicmp(data, "noac", 4) == 0) { 1347 } else if (strnicmp(data, "noac", 4) == 0) {
1350 printk(KERN_WARNING "CIFS: Mount option noac not " 1348 printk(KERN_WARNING "CIFS: Mount option noac not "
1351 "supported. Instead set " 1349 "supported. Instead set "
1352 "/proc/fs/cifs/LookupCacheEnabled to 0\n"); 1350 "/proc/fs/cifs/LookupCacheEnabled to 0\n");
1353 } else if (strnicmp(data, "fsc", 3) == 0) { 1351 } else if (strnicmp(data, "fsc", 3) == 0) {
1352#ifndef CONFIG_CIFS_FSCACHE
1353 cERROR(1, "FS-Cache support needs CONFIG_CIFS_FSCACHE"
1354 "kernel config option set");
1355 return 1;
1356#endif
1354 vol->fsc = true; 1357 vol->fsc = true;
1355 } else if (strnicmp(data, "mfsymlinks", 10) == 0) { 1358 } else if (strnicmp(data, "mfsymlinks", 10) == 0) {
1356 vol->mfsymlinks = true; 1359 vol->mfsymlinks = true;
@@ -1438,35 +1441,71 @@ srcip_matches(struct sockaddr *srcaddr, struct sockaddr *rhs)
1438 } 1441 }
1439} 1442}
1440 1443
1444/*
1445 * If no port is specified in addr structure, we try to match with 445 port
1446 * and if it fails - with 139 ports. It should be called only if address
1447 * families of server and addr are equal.
1448 */
1449static bool
1450match_port(struct TCP_Server_Info *server, struct sockaddr *addr)
1451{
1452 unsigned short int port, *sport;
1453
1454 switch (addr->sa_family) {
1455 case AF_INET:
1456 sport = &((struct sockaddr_in *) &server->dstaddr)->sin_port;
1457 port = ((struct sockaddr_in *) addr)->sin_port;
1458 break;
1459 case AF_INET6:
1460 sport = &((struct sockaddr_in6 *) &server->dstaddr)->sin6_port;
1461 port = ((struct sockaddr_in6 *) addr)->sin6_port;
1462 break;
1463 default:
1464 WARN_ON(1);
1465 return false;
1466 }
1467
1468 if (!port) {
1469 port = htons(CIFS_PORT);
1470 if (port == *sport)
1471 return true;
1472
1473 port = htons(RFC1001_PORT);
1474 }
1475
1476 return port == *sport;
1477}
1441 1478
1442static bool 1479static bool
1443match_address(struct TCP_Server_Info *server, struct sockaddr *addr, 1480match_address(struct TCP_Server_Info *server, struct sockaddr *addr,
1444 struct sockaddr *srcaddr) 1481 struct sockaddr *srcaddr)
1445{ 1482{
1446 struct sockaddr_in *addr4 = (struct sockaddr_in *)addr;
1447 struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)addr;
1448
1449 switch (addr->sa_family) { 1483 switch (addr->sa_family) {
1450 case AF_INET: 1484 case AF_INET: {
1451 if (addr4->sin_addr.s_addr != 1485 struct sockaddr_in *addr4 = (struct sockaddr_in *)addr;
1452 server->addr.sockAddr.sin_addr.s_addr) 1486 struct sockaddr_in *srv_addr4 =
1453 return false; 1487 (struct sockaddr_in *)&server->dstaddr;
1454 if (addr4->sin_port && 1488
1455 addr4->sin_port != server->addr.sockAddr.sin_port) 1489 if (addr4->sin_addr.s_addr != srv_addr4->sin_addr.s_addr)
1456 return false; 1490 return false;
1457 break; 1491 break;
1458 case AF_INET6: 1492 }
1493 case AF_INET6: {
1494 struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)addr;
1495 struct sockaddr_in6 *srv_addr6 =
1496 (struct sockaddr_in6 *)&server->dstaddr;
1497
1459 if (!ipv6_addr_equal(&addr6->sin6_addr, 1498 if (!ipv6_addr_equal(&addr6->sin6_addr,
1460 &server->addr.sockAddr6.sin6_addr)) 1499 &srv_addr6->sin6_addr))
1461 return false;
1462 if (addr6->sin6_scope_id !=
1463 server->addr.sockAddr6.sin6_scope_id)
1464 return false; 1500 return false;
1465 if (addr6->sin6_port && 1501 if (addr6->sin6_scope_id != srv_addr6->sin6_scope_id)
1466 addr6->sin6_port != server->addr.sockAddr6.sin6_port)
1467 return false; 1502 return false;
1468 break; 1503 break;
1469 } 1504 }
1505 default:
1506 WARN_ON(1);
1507 return false; /* don't expect to be here */
1508 }
1470 1509
1471 if (!srcip_matches(srcaddr, (struct sockaddr *)&server->srcaddr)) 1510 if (!srcip_matches(srcaddr, (struct sockaddr *)&server->srcaddr))
1472 return false; 1511 return false;
@@ -1529,10 +1568,16 @@ cifs_find_tcp_session(struct sockaddr *addr, struct smb_vol *vol)
1529 1568
1530 spin_lock(&cifs_tcp_ses_lock); 1569 spin_lock(&cifs_tcp_ses_lock);
1531 list_for_each_entry(server, &cifs_tcp_ses_list, tcp_ses_list) { 1570 list_for_each_entry(server, &cifs_tcp_ses_list, tcp_ses_list) {
1571 if (!net_eq(cifs_net_ns(server), current->nsproxy->net_ns))
1572 continue;
1573
1532 if (!match_address(server, addr, 1574 if (!match_address(server, addr,
1533 (struct sockaddr *)&vol->srcaddr)) 1575 (struct sockaddr *)&vol->srcaddr))
1534 continue; 1576 continue;
1535 1577
1578 if (!match_port(server, addr))
1579 continue;
1580
1536 if (!match_security(server, vol)) 1581 if (!match_security(server, vol))
1537 continue; 1582 continue;
1538 1583
@@ -1556,9 +1601,13 @@ cifs_put_tcp_session(struct TCP_Server_Info *server)
1556 return; 1601 return;
1557 } 1602 }
1558 1603
1604 put_net(cifs_net_ns(server));
1605
1559 list_del_init(&server->tcp_ses_list); 1606 list_del_init(&server->tcp_ses_list);
1560 spin_unlock(&cifs_tcp_ses_lock); 1607 spin_unlock(&cifs_tcp_ses_lock);
1561 1608
1609 cancel_delayed_work_sync(&server->echo);
1610
1562 spin_lock(&GlobalMid_Lock); 1611 spin_lock(&GlobalMid_Lock);
1563 server->tcpStatus = CifsExiting; 1612 server->tcpStatus = CifsExiting;
1564 spin_unlock(&GlobalMid_Lock); 1613 spin_unlock(&GlobalMid_Lock);
@@ -1628,6 +1677,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
1628 goto out_err; 1677 goto out_err;
1629 } 1678 }
1630 1679
1680 cifs_set_net_ns(tcp_ses, get_net(current->nsproxy->net_ns));
1631 tcp_ses->hostname = extract_hostname(volume_info->UNC); 1681 tcp_ses->hostname = extract_hostname(volume_info->UNC);
1632 if (IS_ERR(tcp_ses->hostname)) { 1682 if (IS_ERR(tcp_ses->hostname)) {
1633 rc = PTR_ERR(tcp_ses->hostname); 1683 rc = PTR_ERR(tcp_ses->hostname);
@@ -1648,8 +1698,10 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
1648 volume_info->target_rfc1001_name, RFC1001_NAME_LEN_WITH_NULL); 1698 volume_info->target_rfc1001_name, RFC1001_NAME_LEN_WITH_NULL);
1649 tcp_ses->session_estab = false; 1699 tcp_ses->session_estab = false;
1650 tcp_ses->sequence_number = 0; 1700 tcp_ses->sequence_number = 0;
1701 tcp_ses->lstrp = jiffies;
1651 INIT_LIST_HEAD(&tcp_ses->tcp_ses_list); 1702 INIT_LIST_HEAD(&tcp_ses->tcp_ses_list);
1652 INIT_LIST_HEAD(&tcp_ses->smb_ses_list); 1703 INIT_LIST_HEAD(&tcp_ses->smb_ses_list);
1704 INIT_DELAYED_WORK(&tcp_ses->echo, cifs_echo_request);
1653 1705
1654 /* 1706 /*
1655 * at this point we are the only ones with the pointer 1707 * at this point we are the only ones with the pointer
@@ -1665,14 +1717,13 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
1665 cFYI(1, "attempting ipv6 connect"); 1717 cFYI(1, "attempting ipv6 connect");
1666 /* BB should we allow ipv6 on port 139? */ 1718 /* BB should we allow ipv6 on port 139? */
1667 /* other OS never observed in Wild doing 139 with v6 */ 1719 /* other OS never observed in Wild doing 139 with v6 */
1668 memcpy(&tcp_ses->addr.sockAddr6, sin_server6, 1720 memcpy(&tcp_ses->dstaddr, sin_server6,
1669 sizeof(struct sockaddr_in6)); 1721 sizeof(struct sockaddr_in6));
1670 rc = ipv6_connect(tcp_ses); 1722 } else
1671 } else { 1723 memcpy(&tcp_ses->dstaddr, sin_server,
1672 memcpy(&tcp_ses->addr.sockAddr, sin_server, 1724 sizeof(struct sockaddr_in));
1673 sizeof(struct sockaddr_in)); 1725
1674 rc = ipv4_connect(tcp_ses); 1726 rc = ip_connect(tcp_ses);
1675 }
1676 if (rc < 0) { 1727 if (rc < 0) {
1677 cERROR(1, "Error connecting to socket. Aborting operation"); 1728 cERROR(1, "Error connecting to socket. Aborting operation");
1678 goto out_err_crypto_release; 1729 goto out_err_crypto_release;
@@ -1699,11 +1750,16 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
1699 1750
1700 cifs_fscache_get_client_cookie(tcp_ses); 1751 cifs_fscache_get_client_cookie(tcp_ses);
1701 1752
1753 /* queue echo request delayed work */
1754 queue_delayed_work(system_nrt_wq, &tcp_ses->echo, SMB_ECHO_INTERVAL);
1755
1702 return tcp_ses; 1756 return tcp_ses;
1703 1757
1704out_err_crypto_release: 1758out_err_crypto_release:
1705 cifs_crypto_shash_release(tcp_ses); 1759 cifs_crypto_shash_release(tcp_ses);
1706 1760
1761 put_net(cifs_net_ns(tcp_ses));
1762
1707out_err: 1763out_err:
1708 if (tcp_ses) { 1764 if (tcp_ses) {
1709 if (!IS_ERR(tcp_ses->hostname)) 1765 if (!IS_ERR(tcp_ses->hostname))
@@ -1777,6 +1833,8 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info)
1777{ 1833{
1778 int rc = -ENOMEM, xid; 1834 int rc = -ENOMEM, xid;
1779 struct cifsSesInfo *ses; 1835 struct cifsSesInfo *ses;
1836 struct sockaddr_in *addr = (struct sockaddr_in *)&server->dstaddr;
1837 struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)&server->dstaddr;
1780 1838
1781 xid = GetXid(); 1839 xid = GetXid();
1782 1840
@@ -1820,12 +1878,10 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info)
1820 1878
1821 /* new SMB session uses our server ref */ 1879 /* new SMB session uses our server ref */
1822 ses->server = server; 1880 ses->server = server;
1823 if (server->addr.sockAddr6.sin6_family == AF_INET6) 1881 if (server->dstaddr.ss_family == AF_INET6)
1824 sprintf(ses->serverName, "%pI6", 1882 sprintf(ses->serverName, "%pI6", &addr6->sin6_addr);
1825 &server->addr.sockAddr6.sin6_addr);
1826 else 1883 else
1827 sprintf(ses->serverName, "%pI4", 1884 sprintf(ses->serverName, "%pI4", &addr->sin_addr);
1828 &server->addr.sockAddr.sin_addr.s_addr);
1829 1885
1830 if (volume_info->username) 1886 if (volume_info->username)
1831 strncpy(ses->userName, volume_info->username, 1887 strncpy(ses->userName, volume_info->username,
@@ -2120,19 +2176,106 @@ bind_socket(struct TCP_Server_Info *server)
2120} 2176}
2121 2177
2122static int 2178static int
2123ipv4_connect(struct TCP_Server_Info *server) 2179ip_rfc1001_connect(struct TCP_Server_Info *server)
2124{ 2180{
2125 int rc = 0; 2181 int rc = 0;
2126 int val; 2182 /*
2127 bool connected = false; 2183 * some servers require RFC1001 sessinit before sending
2128 __be16 orig_port = 0; 2184 * negprot - BB check reconnection in case where second
2185 * sessinit is sent but no second negprot
2186 */
2187 struct rfc1002_session_packet *ses_init_buf;
2188 struct smb_hdr *smb_buf;
2189 ses_init_buf = kzalloc(sizeof(struct rfc1002_session_packet),
2190 GFP_KERNEL);
2191 if (ses_init_buf) {
2192 ses_init_buf->trailer.session_req.called_len = 32;
2193
2194 if (server->server_RFC1001_name &&
2195 server->server_RFC1001_name[0] != 0)
2196 rfc1002mangle(ses_init_buf->trailer.
2197 session_req.called_name,
2198 server->server_RFC1001_name,
2199 RFC1001_NAME_LEN_WITH_NULL);
2200 else
2201 rfc1002mangle(ses_init_buf->trailer.
2202 session_req.called_name,
2203 DEFAULT_CIFS_CALLED_NAME,
2204 RFC1001_NAME_LEN_WITH_NULL);
2205
2206 ses_init_buf->trailer.session_req.calling_len = 32;
2207
2208 /*
2209 * calling name ends in null (byte 16) from old smb
2210 * convention.
2211 */
2212 if (server->workstation_RFC1001_name &&
2213 server->workstation_RFC1001_name[0] != 0)
2214 rfc1002mangle(ses_init_buf->trailer.
2215 session_req.calling_name,
2216 server->workstation_RFC1001_name,
2217 RFC1001_NAME_LEN_WITH_NULL);
2218 else
2219 rfc1002mangle(ses_init_buf->trailer.
2220 session_req.calling_name,
2221 "LINUX_CIFS_CLNT",
2222 RFC1001_NAME_LEN_WITH_NULL);
2223
2224 ses_init_buf->trailer.session_req.scope1 = 0;
2225 ses_init_buf->trailer.session_req.scope2 = 0;
2226 smb_buf = (struct smb_hdr *)ses_init_buf;
2227
2228 /* sizeof RFC1002_SESSION_REQUEST with no scope */
2229 smb_buf->smb_buf_length = 0x81000044;
2230 rc = smb_send(server, smb_buf, 0x44);
2231 kfree(ses_init_buf);
2232 /*
2233 * RFC1001 layer in at least one server
2234 * requires very short break before negprot
2235 * presumably because not expecting negprot
2236 * to follow so fast. This is a simple
2237 * solution that works without
2238 * complicating the code and causes no
2239 * significant slowing down on mount
2240 * for everyone else
2241 */
2242 usleep_range(1000, 2000);
2243 }
2244 /*
2245 * else the negprot may still work without this
2246 * even though malloc failed
2247 */
2248
2249 return rc;
2250}
2251
2252static int
2253generic_ip_connect(struct TCP_Server_Info *server)
2254{
2255 int rc = 0;
2256 unsigned short int sport;
2257 int slen, sfamily;
2129 struct socket *socket = server->ssocket; 2258 struct socket *socket = server->ssocket;
2259 struct sockaddr *saddr;
2260
2261 saddr = (struct sockaddr *) &server->dstaddr;
2262
2263 if (server->dstaddr.ss_family == AF_INET6) {
2264 sport = ((struct sockaddr_in6 *) saddr)->sin6_port;
2265 slen = sizeof(struct sockaddr_in6);
2266 sfamily = AF_INET6;
2267 } else {
2268 sport = ((struct sockaddr_in *) saddr)->sin_port;
2269 slen = sizeof(struct sockaddr_in);
2270 sfamily = AF_INET;
2271 }
2130 2272
2131 if (socket == NULL) { 2273 if (socket == NULL) {
2132 rc = sock_create_kern(PF_INET, SOCK_STREAM, 2274 rc = __sock_create(cifs_net_ns(server), sfamily, SOCK_STREAM,
2133 IPPROTO_TCP, &socket); 2275 IPPROTO_TCP, &socket, 1);
2134 if (rc < 0) { 2276 if (rc < 0) {
2135 cERROR(1, "Error %d creating socket", rc); 2277 cERROR(1, "Error %d creating socket", rc);
2278 server->ssocket = NULL;
2136 return rc; 2279 return rc;
2137 } 2280 }
2138 2281
@@ -2140,63 +2283,28 @@ ipv4_connect(struct TCP_Server_Info *server)
2140 cFYI(1, "Socket created"); 2283 cFYI(1, "Socket created");
2141 server->ssocket = socket; 2284 server->ssocket = socket;
2142 socket->sk->sk_allocation = GFP_NOFS; 2285 socket->sk->sk_allocation = GFP_NOFS;
2143 cifs_reclassify_socket4(socket); 2286 if (sfamily == AF_INET6)
2287 cifs_reclassify_socket6(socket);
2288 else
2289 cifs_reclassify_socket4(socket);
2144 } 2290 }
2145 2291
2146 rc = bind_socket(server); 2292 rc = bind_socket(server);
2147 if (rc < 0) 2293 if (rc < 0)
2148 return rc; 2294 return rc;
2149 2295
2150 /* user overrode default port */ 2296 rc = socket->ops->connect(socket, saddr, slen, 0);
2151 if (server->addr.sockAddr.sin_port) { 2297 if (rc < 0) {
2152 rc = socket->ops->connect(socket, (struct sockaddr *) 2298 cFYI(1, "Error %d connecting to server", rc);
2153 &server->addr.sockAddr,
2154 sizeof(struct sockaddr_in), 0);
2155 if (rc >= 0)
2156 connected = true;
2157 }
2158
2159 if (!connected) {
2160 /* save original port so we can retry user specified port
2161 later if fall back ports fail this time */
2162 orig_port = server->addr.sockAddr.sin_port;
2163
2164 /* do not retry on the same port we just failed on */
2165 if (server->addr.sockAddr.sin_port != htons(CIFS_PORT)) {
2166 server->addr.sockAddr.sin_port = htons(CIFS_PORT);
2167 rc = socket->ops->connect(socket,
2168 (struct sockaddr *)
2169 &server->addr.sockAddr,
2170 sizeof(struct sockaddr_in), 0);
2171 if (rc >= 0)
2172 connected = true;
2173 }
2174 }
2175 if (!connected) {
2176 server->addr.sockAddr.sin_port = htons(RFC1001_PORT);
2177 rc = socket->ops->connect(socket, (struct sockaddr *)
2178 &server->addr.sockAddr,
2179 sizeof(struct sockaddr_in), 0);
2180 if (rc >= 0)
2181 connected = true;
2182 }
2183
2184 /* give up here - unless we want to retry on different
2185 protocol families some day */
2186 if (!connected) {
2187 if (orig_port)
2188 server->addr.sockAddr.sin_port = orig_port;
2189 cFYI(1, "Error %d connecting to server via ipv4", rc);
2190 sock_release(socket); 2299 sock_release(socket);
2191 server->ssocket = NULL; 2300 server->ssocket = NULL;
2192 return rc; 2301 return rc;
2193 } 2302 }
2194 2303
2195
2196 /* 2304 /*
2197 * Eventually check for other socket options to change from 2305 * Eventually check for other socket options to change from
2198 * the default. sock_setsockopt not used because it expects 2306 * the default. sock_setsockopt not used because it expects
2199 * user space buffer 2307 * user space buffer
2200 */ 2308 */
2201 socket->sk->sk_rcvtimeo = 7 * HZ; 2309 socket->sk->sk_rcvtimeo = 7 * HZ;
2202 socket->sk->sk_sndtimeo = 5 * HZ; 2310 socket->sk->sk_sndtimeo = 5 * HZ;
@@ -2210,7 +2318,7 @@ ipv4_connect(struct TCP_Server_Info *server)
2210 } 2318 }
2211 2319
2212 if (server->tcp_nodelay) { 2320 if (server->tcp_nodelay) {
2213 val = 1; 2321 int val = 1;
2214 rc = kernel_setsockopt(socket, SOL_TCP, TCP_NODELAY, 2322 rc = kernel_setsockopt(socket, SOL_TCP, TCP_NODELAY,
2215 (char *)&val, sizeof(val)); 2323 (char *)&val, sizeof(val));
2216 if (rc) 2324 if (rc)
@@ -2221,161 +2329,39 @@ ipv4_connect(struct TCP_Server_Info *server)
2221 socket->sk->sk_sndbuf, 2329 socket->sk->sk_sndbuf,
2222 socket->sk->sk_rcvbuf, socket->sk->sk_rcvtimeo); 2330 socket->sk->sk_rcvbuf, socket->sk->sk_rcvtimeo);
2223 2331
2224 /* send RFC1001 sessinit */ 2332 if (sport == htons(RFC1001_PORT))
2225 if (server->addr.sockAddr.sin_port == htons(RFC1001_PORT)) { 2333 rc = ip_rfc1001_connect(server);
2226 /* some servers require RFC1001 sessinit before sending
2227 negprot - BB check reconnection in case where second
2228 sessinit is sent but no second negprot */
2229 struct rfc1002_session_packet *ses_init_buf;
2230 struct smb_hdr *smb_buf;
2231 ses_init_buf = kzalloc(sizeof(struct rfc1002_session_packet),
2232 GFP_KERNEL);
2233 if (ses_init_buf) {
2234 ses_init_buf->trailer.session_req.called_len = 32;
2235 if (server->server_RFC1001_name &&
2236 server->server_RFC1001_name[0] != 0)
2237 rfc1002mangle(ses_init_buf->trailer.
2238 session_req.called_name,
2239 server->server_RFC1001_name,
2240 RFC1001_NAME_LEN_WITH_NULL);
2241 else
2242 rfc1002mangle(ses_init_buf->trailer.
2243 session_req.called_name,
2244 DEFAULT_CIFS_CALLED_NAME,
2245 RFC1001_NAME_LEN_WITH_NULL);
2246
2247 ses_init_buf->trailer.session_req.calling_len = 32;
2248
2249 /* calling name ends in null (byte 16) from old smb
2250 convention. */
2251 if (server->workstation_RFC1001_name &&
2252 server->workstation_RFC1001_name[0] != 0)
2253 rfc1002mangle(ses_init_buf->trailer.
2254 session_req.calling_name,
2255 server->workstation_RFC1001_name,
2256 RFC1001_NAME_LEN_WITH_NULL);
2257 else
2258 rfc1002mangle(ses_init_buf->trailer.
2259 session_req.calling_name,
2260 "LINUX_CIFS_CLNT",
2261 RFC1001_NAME_LEN_WITH_NULL);
2262
2263 ses_init_buf->trailer.session_req.scope1 = 0;
2264 ses_init_buf->trailer.session_req.scope2 = 0;
2265 smb_buf = (struct smb_hdr *)ses_init_buf;
2266 /* sizeof RFC1002_SESSION_REQUEST with no scope */
2267 smb_buf->smb_buf_length = 0x81000044;
2268 rc = smb_send(server, smb_buf, 0x44);
2269 kfree(ses_init_buf);
2270 msleep(1); /* RFC1001 layer in at least one server
2271 requires very short break before negprot
2272 presumably because not expecting negprot
2273 to follow so fast. This is a simple
2274 solution that works without
2275 complicating the code and causes no
2276 significant slowing down on mount
2277 for everyone else */
2278 }
2279 /* else the negprot may still work without this
2280 even though malloc failed */
2281
2282 }
2283 2334
2284 return rc; 2335 return rc;
2285} 2336}
2286 2337
2287static int 2338static int
2288ipv6_connect(struct TCP_Server_Info *server) 2339ip_connect(struct TCP_Server_Info *server)
2289{ 2340{
2290 int rc = 0; 2341 unsigned short int *sport;
2291 int val; 2342 struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)&server->dstaddr;
2292 bool connected = false; 2343 struct sockaddr_in *addr = (struct sockaddr_in *)&server->dstaddr;
2293 __be16 orig_port = 0;
2294 struct socket *socket = server->ssocket;
2295 2344
2296 if (socket == NULL) { 2345 if (server->dstaddr.ss_family == AF_INET6)
2297 rc = sock_create_kern(PF_INET6, SOCK_STREAM, 2346 sport = &addr6->sin6_port;
2298 IPPROTO_TCP, &socket); 2347 else
2299 if (rc < 0) { 2348 sport = &addr->sin_port;
2300 cERROR(1, "Error %d creating ipv6 socket", rc);
2301 socket = NULL;
2302 return rc;
2303 }
2304 2349
2305 /* BB other socket options to set KEEPALIVE, NODELAY? */ 2350 if (*sport == 0) {
2306 cFYI(1, "ipv6 Socket created"); 2351 int rc;
2307 server->ssocket = socket;
2308 socket->sk->sk_allocation = GFP_NOFS;
2309 cifs_reclassify_socket6(socket);
2310 }
2311 2352
2312 rc = bind_socket(server); 2353 /* try with 445 port at first */
2313 if (rc < 0) 2354 *sport = htons(CIFS_PORT);
2314 return rc;
2315 2355
2316 /* user overrode default port */ 2356 rc = generic_ip_connect(server);
2317 if (server->addr.sockAddr6.sin6_port) {
2318 rc = socket->ops->connect(socket,
2319 (struct sockaddr *) &server->addr.sockAddr6,
2320 sizeof(struct sockaddr_in6), 0);
2321 if (rc >= 0) 2357 if (rc >= 0)
2322 connected = true; 2358 return rc;
2323 }
2324
2325 if (!connected) {
2326 /* save original port so we can retry user specified port
2327 later if fall back ports fail this time */
2328
2329 orig_port = server->addr.sockAddr6.sin6_port;
2330 /* do not retry on the same port we just failed on */
2331 if (server->addr.sockAddr6.sin6_port != htons(CIFS_PORT)) {
2332 server->addr.sockAddr6.sin6_port = htons(CIFS_PORT);
2333 rc = socket->ops->connect(socket, (struct sockaddr *)
2334 &server->addr.sockAddr6,
2335 sizeof(struct sockaddr_in6), 0);
2336 if (rc >= 0)
2337 connected = true;
2338 }
2339 }
2340 if (!connected) {
2341 server->addr.sockAddr6.sin6_port = htons(RFC1001_PORT);
2342 rc = socket->ops->connect(socket, (struct sockaddr *)
2343 &server->addr.sockAddr6,
2344 sizeof(struct sockaddr_in6), 0);
2345 if (rc >= 0)
2346 connected = true;
2347 }
2348
2349 /* give up here - unless we want to retry on different
2350 protocol families some day */
2351 if (!connected) {
2352 if (orig_port)
2353 server->addr.sockAddr6.sin6_port = orig_port;
2354 cFYI(1, "Error %d connecting to server via ipv6", rc);
2355 sock_release(socket);
2356 server->ssocket = NULL;
2357 return rc;
2358 }
2359
2360 /*
2361 * Eventually check for other socket options to change from
2362 * the default. sock_setsockopt not used because it expects
2363 * user space buffer
2364 */
2365 socket->sk->sk_rcvtimeo = 7 * HZ;
2366 socket->sk->sk_sndtimeo = 5 * HZ;
2367 2359
2368 if (server->tcp_nodelay) { 2360 /* if it failed, try with 139 port */
2369 val = 1; 2361 *sport = htons(RFC1001_PORT);
2370 rc = kernel_setsockopt(socket, SOL_TCP, TCP_NODELAY,
2371 (char *)&val, sizeof(val));
2372 if (rc)
2373 cFYI(1, "set TCP_NODELAY socket option error %d", rc);
2374 } 2362 }
2375 2363
2376 server->ssocket = socket; 2364 return generic_ip_connect(server);
2377
2378 return rc;
2379} 2365}
2380 2366
2381void reset_cifs_unix_caps(int xid, struct cifsTconInfo *tcon, 2367void reset_cifs_unix_caps(int xid, struct cifsTconInfo *tcon,
@@ -2565,6 +2551,8 @@ static void setup_cifs_sb(struct smb_vol *pvolume_info,
2565 cFYI(1, "file mode: 0x%x dir mode: 0x%x", 2551 cFYI(1, "file mode: 0x%x dir mode: 0x%x",
2566 cifs_sb->mnt_file_mode, cifs_sb->mnt_dir_mode); 2552 cifs_sb->mnt_file_mode, cifs_sb->mnt_dir_mode);
2567 2553
2554 cifs_sb->actimeo = pvolume_info->actimeo;
2555
2568 if (pvolume_info->noperm) 2556 if (pvolume_info->noperm)
2569 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_PERM; 2557 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_PERM;
2570 if (pvolume_info->setuids) 2558 if (pvolume_info->setuids)
@@ -2596,6 +2584,8 @@ static void setup_cifs_sb(struct smb_vol *pvolume_info,
2596 if (pvolume_info->multiuser) 2584 if (pvolume_info->multiuser)
2597 cifs_sb->mnt_cifs_flags |= (CIFS_MOUNT_MULTIUSER | 2585 cifs_sb->mnt_cifs_flags |= (CIFS_MOUNT_MULTIUSER |
2598 CIFS_MOUNT_NO_PERM); 2586 CIFS_MOUNT_NO_PERM);
2587 if (pvolume_info->strict_io)
2588 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_STRICT_IO;
2599 if (pvolume_info->direct_io) { 2589 if (pvolume_info->direct_io) {
2600 cFYI(1, "mounting share using direct i/o"); 2590 cFYI(1, "mounting share using direct i/o");
2601 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_DIRECT_IO; 2591 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_DIRECT_IO;
@@ -2815,13 +2805,13 @@ remote_path_check:
2815 /* check if a whole path (including prepath) is not remote */ 2805 /* check if a whole path (including prepath) is not remote */
2816 if (!rc && cifs_sb->prepathlen && tcon) { 2806 if (!rc && cifs_sb->prepathlen && tcon) {
2817 /* build_path_to_root works only when we have a valid tcon */ 2807 /* build_path_to_root works only when we have a valid tcon */
2818 full_path = cifs_build_path_to_root(cifs_sb); 2808 full_path = cifs_build_path_to_root(cifs_sb, tcon);
2819 if (full_path == NULL) { 2809 if (full_path == NULL) {
2820 rc = -ENOMEM; 2810 rc = -ENOMEM;
2821 goto mount_fail_check; 2811 goto mount_fail_check;
2822 } 2812 }
2823 rc = is_path_accessible(xid, tcon, cifs_sb, full_path); 2813 rc = is_path_accessible(xid, tcon, cifs_sb, full_path);
2824 if (rc != -EREMOTE) { 2814 if (rc != 0 && rc != -EREMOTE) {
2825 kfree(full_path); 2815 kfree(full_path);
2826 goto mount_fail_check; 2816 goto mount_fail_check;
2827 } 2817 }
@@ -2900,24 +2890,16 @@ remote_path_check:
2900 goto mount_fail_check; 2890 goto mount_fail_check;
2901 } 2891 }
2902 2892
2903 tlink->tl_index = pSesInfo->linux_uid; 2893 tlink->tl_uid = pSesInfo->linux_uid;
2904 tlink->tl_tcon = tcon; 2894 tlink->tl_tcon = tcon;
2905 tlink->tl_time = jiffies; 2895 tlink->tl_time = jiffies;
2906 set_bit(TCON_LINK_MASTER, &tlink->tl_flags); 2896 set_bit(TCON_LINK_MASTER, &tlink->tl_flags);
2907 set_bit(TCON_LINK_IN_TREE, &tlink->tl_flags); 2897 set_bit(TCON_LINK_IN_TREE, &tlink->tl_flags);
2908 2898
2909 rc = radix_tree_preload(GFP_KERNEL); 2899 cifs_sb->master_tlink = tlink;
2910 if (rc == -ENOMEM) {
2911 kfree(tlink);
2912 goto mount_fail_check;
2913 }
2914
2915 spin_lock(&cifs_sb->tlink_tree_lock); 2900 spin_lock(&cifs_sb->tlink_tree_lock);
2916 radix_tree_insert(&cifs_sb->tlink_tree, pSesInfo->linux_uid, tlink); 2901 tlink_rb_insert(&cifs_sb->tlink_tree, tlink);
2917 radix_tree_tag_set(&cifs_sb->tlink_tree, pSesInfo->linux_uid,
2918 CIFS_TLINK_MASTER_TAG);
2919 spin_unlock(&cifs_sb->tlink_tree_lock); 2902 spin_unlock(&cifs_sb->tlink_tree_lock);
2920 radix_tree_preload_end();
2921 2903
2922 queue_delayed_work(system_nrt_wq, &cifs_sb->prune_tlinks, 2904 queue_delayed_work(system_nrt_wq, &cifs_sb->prune_tlinks,
2923 TLINK_IDLE_EXPIRE); 2905 TLINK_IDLE_EXPIRE);
@@ -2960,8 +2942,8 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
2960 TCONX_RSP *pSMBr; 2942 TCONX_RSP *pSMBr;
2961 unsigned char *bcc_ptr; 2943 unsigned char *bcc_ptr;
2962 int rc = 0; 2944 int rc = 0;
2963 int length, bytes_left; 2945 int length;
2964 __u16 count; 2946 __u16 bytes_left, count;
2965 2947
2966 if (ses == NULL) 2948 if (ses == NULL)
2967 return -EIO; 2949 return -EIO;
@@ -2989,7 +2971,7 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
2989 bcc_ptr++; /* skip password */ 2971 bcc_ptr++; /* skip password */
2990 /* already aligned so no need to do it below */ 2972 /* already aligned so no need to do it below */
2991 } else { 2973 } else {
2992 pSMB->PasswordLength = cpu_to_le16(CIFS_SESS_KEY_SIZE); 2974 pSMB->PasswordLength = cpu_to_le16(CIFS_AUTH_RESP_SIZE);
2993 /* BB FIXME add code to fail this if NTLMv2 or Kerberos 2975 /* BB FIXME add code to fail this if NTLMv2 or Kerberos
2994 specified as required (when that support is added to 2976 specified as required (when that support is added to
2995 the vfs in the future) as only NTLM or the much 2977 the vfs in the future) as only NTLM or the much
@@ -3005,9 +2987,10 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
3005 bcc_ptr); 2987 bcc_ptr);
3006 else 2988 else
3007#endif /* CIFS_WEAK_PW_HASH */ 2989#endif /* CIFS_WEAK_PW_HASH */
3008 SMBNTencrypt(tcon->password, ses->server->cryptkey, bcc_ptr); 2990 rc = SMBNTencrypt(tcon->password, ses->server->cryptkey,
2991 bcc_ptr);
3009 2992
3010 bcc_ptr += CIFS_SESS_KEY_SIZE; 2993 bcc_ptr += CIFS_AUTH_RESP_SIZE;
3011 if (ses->capabilities & CAP_UNICODE) { 2994 if (ses->capabilities & CAP_UNICODE) {
3012 /* must align unicode strings */ 2995 /* must align unicode strings */
3013 *bcc_ptr = 0; /* null byte password */ 2996 *bcc_ptr = 0; /* null byte password */
@@ -3045,7 +3028,7 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
3045 pSMB->ByteCount = cpu_to_le16(count); 3028 pSMB->ByteCount = cpu_to_le16(count);
3046 3029
3047 rc = SendReceive(xid, ses, smb_buffer, smb_buffer_response, &length, 3030 rc = SendReceive(xid, ses, smb_buffer, smb_buffer_response, &length,
3048 CIFS_STD_OP); 3031 0);
3049 3032
3050 /* above now done in SendReceive */ 3033 /* above now done in SendReceive */
3051 if ((rc == 0) && (tcon != NULL)) { 3034 if ((rc == 0) && (tcon != NULL)) {
@@ -3055,7 +3038,7 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
3055 tcon->need_reconnect = false; 3038 tcon->need_reconnect = false;
3056 tcon->tid = smb_buffer_response->Tid; 3039 tcon->tid = smb_buffer_response->Tid;
3057 bcc_ptr = pByteArea(smb_buffer_response); 3040 bcc_ptr = pByteArea(smb_buffer_response);
3058 bytes_left = BCC(smb_buffer_response); 3041 bytes_left = get_bcc(smb_buffer_response);
3059 length = strnlen(bcc_ptr, bytes_left - 2); 3042 length = strnlen(bcc_ptr, bytes_left - 2);
3060 if (smb_buffer->Flags2 & SMBFLG2_UNICODE) 3043 if (smb_buffer->Flags2 & SMBFLG2_UNICODE)
3061 is_unicode = true; 3044 is_unicode = true;
@@ -3107,32 +3090,25 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
3107int 3090int
3108cifs_umount(struct super_block *sb, struct cifs_sb_info *cifs_sb) 3091cifs_umount(struct super_block *sb, struct cifs_sb_info *cifs_sb)
3109{ 3092{
3110 int i, ret; 3093 struct rb_root *root = &cifs_sb->tlink_tree;
3094 struct rb_node *node;
3095 struct tcon_link *tlink;
3111 char *tmp; 3096 char *tmp;
3112 struct tcon_link *tlink[8];
3113 unsigned long index = 0;
3114 3097
3115 cancel_delayed_work_sync(&cifs_sb->prune_tlinks); 3098 cancel_delayed_work_sync(&cifs_sb->prune_tlinks);
3116 3099
3117 do { 3100 spin_lock(&cifs_sb->tlink_tree_lock);
3118 spin_lock(&cifs_sb->tlink_tree_lock); 3101 while ((node = rb_first(root))) {
3119 ret = radix_tree_gang_lookup(&cifs_sb->tlink_tree, 3102 tlink = rb_entry(node, struct tcon_link, tl_rbnode);
3120 (void **)tlink, index, 3103 cifs_get_tlink(tlink);
3121 ARRAY_SIZE(tlink)); 3104 clear_bit(TCON_LINK_IN_TREE, &tlink->tl_flags);
3122 /* increment index for next pass */ 3105 rb_erase(node, root);
3123 if (ret > 0)
3124 index = tlink[ret - 1]->tl_index + 1;
3125 for (i = 0; i < ret; i++) {
3126 cifs_get_tlink(tlink[i]);
3127 clear_bit(TCON_LINK_IN_TREE, &tlink[i]->tl_flags);
3128 radix_tree_delete(&cifs_sb->tlink_tree,
3129 tlink[i]->tl_index);
3130 }
3131 spin_unlock(&cifs_sb->tlink_tree_lock);
3132 3106
3133 for (i = 0; i < ret; i++) 3107 spin_unlock(&cifs_sb->tlink_tree_lock);
3134 cifs_put_tlink(tlink[i]); 3108 cifs_put_tlink(tlink);
3135 } while (ret != 0); 3109 spin_lock(&cifs_sb->tlink_tree_lock);
3110 }
3111 spin_unlock(&cifs_sb->tlink_tree_lock);
3136 3112
3137 tmp = cifs_sb->prepath; 3113 tmp = cifs_sb->prepath;
3138 cifs_sb->prepathlen = 0; 3114 cifs_sb->prepathlen = 0;
@@ -3271,22 +3247,10 @@ out:
3271 return tcon; 3247 return tcon;
3272} 3248}
3273 3249
3274static struct tcon_link * 3250static inline struct tcon_link *
3275cifs_sb_master_tlink(struct cifs_sb_info *cifs_sb) 3251cifs_sb_master_tlink(struct cifs_sb_info *cifs_sb)
3276{ 3252{
3277 struct tcon_link *tlink; 3253 return cifs_sb->master_tlink;
3278 unsigned int ret;
3279
3280 spin_lock(&cifs_sb->tlink_tree_lock);
3281 ret = radix_tree_gang_lookup_tag(&cifs_sb->tlink_tree, (void **)&tlink,
3282 0, 1, CIFS_TLINK_MASTER_TAG);
3283 spin_unlock(&cifs_sb->tlink_tree_lock);
3284
3285 /* the master tcon should always be present */
3286 if (ret == 0)
3287 BUG();
3288
3289 return tlink;
3290} 3254}
3291 3255
3292struct cifsTconInfo * 3256struct cifsTconInfo *
@@ -3302,6 +3266,47 @@ cifs_sb_tcon_pending_wait(void *unused)
3302 return signal_pending(current) ? -ERESTARTSYS : 0; 3266 return signal_pending(current) ? -ERESTARTSYS : 0;
3303} 3267}
3304 3268
3269/* find and return a tlink with given uid */
3270static struct tcon_link *
3271tlink_rb_search(struct rb_root *root, uid_t uid)
3272{
3273 struct rb_node *node = root->rb_node;
3274 struct tcon_link *tlink;
3275
3276 while (node) {
3277 tlink = rb_entry(node, struct tcon_link, tl_rbnode);
3278
3279 if (tlink->tl_uid > uid)
3280 node = node->rb_left;
3281 else if (tlink->tl_uid < uid)
3282 node = node->rb_right;
3283 else
3284 return tlink;
3285 }
3286 return NULL;
3287}
3288
3289/* insert a tcon_link into the tree */
3290static void
3291tlink_rb_insert(struct rb_root *root, struct tcon_link *new_tlink)
3292{
3293 struct rb_node **new = &(root->rb_node), *parent = NULL;
3294 struct tcon_link *tlink;
3295
3296 while (*new) {
3297 tlink = rb_entry(*new, struct tcon_link, tl_rbnode);
3298 parent = *new;
3299
3300 if (tlink->tl_uid > new_tlink->tl_uid)
3301 new = &((*new)->rb_left);
3302 else
3303 new = &((*new)->rb_right);
3304 }
3305
3306 rb_link_node(&new_tlink->tl_rbnode, parent, new);
3307 rb_insert_color(&new_tlink->tl_rbnode, root);
3308}
3309
3305/* 3310/*
3306 * Find or construct an appropriate tcon given a cifs_sb and the fsuid of the 3311 * Find or construct an appropriate tcon given a cifs_sb and the fsuid of the
3307 * current task. 3312 * current task.
@@ -3309,7 +3314,7 @@ cifs_sb_tcon_pending_wait(void *unused)
3309 * If the superblock doesn't refer to a multiuser mount, then just return 3314 * If the superblock doesn't refer to a multiuser mount, then just return
3310 * the master tcon for the mount. 3315 * the master tcon for the mount.
3311 * 3316 *
3312 * First, search the radix tree for an existing tcon for this fsuid. If one 3317 * First, search the rbtree for an existing tcon for this fsuid. If one
3313 * exists, then check to see if it's pending construction. If it is then wait 3318 * exists, then check to see if it's pending construction. If it is then wait
3314 * for construction to complete. Once it's no longer pending, check to see if 3319 * for construction to complete. Once it's no longer pending, check to see if
3315 * it failed and either return an error or retry construction, depending on 3320 * it failed and either return an error or retry construction, depending on
@@ -3322,14 +3327,14 @@ struct tcon_link *
3322cifs_sb_tlink(struct cifs_sb_info *cifs_sb) 3327cifs_sb_tlink(struct cifs_sb_info *cifs_sb)
3323{ 3328{
3324 int ret; 3329 int ret;
3325 unsigned long fsuid = (unsigned long) current_fsuid(); 3330 uid_t fsuid = current_fsuid();
3326 struct tcon_link *tlink, *newtlink; 3331 struct tcon_link *tlink, *newtlink;
3327 3332
3328 if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER)) 3333 if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER))
3329 return cifs_get_tlink(cifs_sb_master_tlink(cifs_sb)); 3334 return cifs_get_tlink(cifs_sb_master_tlink(cifs_sb));
3330 3335
3331 spin_lock(&cifs_sb->tlink_tree_lock); 3336 spin_lock(&cifs_sb->tlink_tree_lock);
3332 tlink = radix_tree_lookup(&cifs_sb->tlink_tree, fsuid); 3337 tlink = tlink_rb_search(&cifs_sb->tlink_tree, fsuid);
3333 if (tlink) 3338 if (tlink)
3334 cifs_get_tlink(tlink); 3339 cifs_get_tlink(tlink);
3335 spin_unlock(&cifs_sb->tlink_tree_lock); 3340 spin_unlock(&cifs_sb->tlink_tree_lock);
@@ -3338,36 +3343,24 @@ cifs_sb_tlink(struct cifs_sb_info *cifs_sb)
3338 newtlink = kzalloc(sizeof(*tlink), GFP_KERNEL); 3343 newtlink = kzalloc(sizeof(*tlink), GFP_KERNEL);
3339 if (newtlink == NULL) 3344 if (newtlink == NULL)
3340 return ERR_PTR(-ENOMEM); 3345 return ERR_PTR(-ENOMEM);
3341 newtlink->tl_index = fsuid; 3346 newtlink->tl_uid = fsuid;
3342 newtlink->tl_tcon = ERR_PTR(-EACCES); 3347 newtlink->tl_tcon = ERR_PTR(-EACCES);
3343 set_bit(TCON_LINK_PENDING, &newtlink->tl_flags); 3348 set_bit(TCON_LINK_PENDING, &newtlink->tl_flags);
3344 set_bit(TCON_LINK_IN_TREE, &newtlink->tl_flags); 3349 set_bit(TCON_LINK_IN_TREE, &newtlink->tl_flags);
3345 cifs_get_tlink(newtlink); 3350 cifs_get_tlink(newtlink);
3346 3351
3347 ret = radix_tree_preload(GFP_KERNEL);
3348 if (ret != 0) {
3349 kfree(newtlink);
3350 return ERR_PTR(ret);
3351 }
3352
3353 spin_lock(&cifs_sb->tlink_tree_lock); 3352 spin_lock(&cifs_sb->tlink_tree_lock);
3354 /* was one inserted after previous search? */ 3353 /* was one inserted after previous search? */
3355 tlink = radix_tree_lookup(&cifs_sb->tlink_tree, fsuid); 3354 tlink = tlink_rb_search(&cifs_sb->tlink_tree, fsuid);
3356 if (tlink) { 3355 if (tlink) {
3357 cifs_get_tlink(tlink); 3356 cifs_get_tlink(tlink);
3358 spin_unlock(&cifs_sb->tlink_tree_lock); 3357 spin_unlock(&cifs_sb->tlink_tree_lock);
3359 radix_tree_preload_end();
3360 kfree(newtlink); 3358 kfree(newtlink);
3361 goto wait_for_construction; 3359 goto wait_for_construction;
3362 } 3360 }
3363 ret = radix_tree_insert(&cifs_sb->tlink_tree, fsuid, newtlink);
3364 spin_unlock(&cifs_sb->tlink_tree_lock);
3365 radix_tree_preload_end();
3366 if (ret) {
3367 kfree(newtlink);
3368 return ERR_PTR(ret);
3369 }
3370 tlink = newtlink; 3361 tlink = newtlink;
3362 tlink_rb_insert(&cifs_sb->tlink_tree, tlink);
3363 spin_unlock(&cifs_sb->tlink_tree_lock);
3371 } else { 3364 } else {
3372wait_for_construction: 3365wait_for_construction:
3373 ret = wait_on_bit(&tlink->tl_flags, TCON_LINK_PENDING, 3366 ret = wait_on_bit(&tlink->tl_flags, TCON_LINK_PENDING,
@@ -3413,39 +3406,39 @@ cifs_prune_tlinks(struct work_struct *work)
3413{ 3406{
3414 struct cifs_sb_info *cifs_sb = container_of(work, struct cifs_sb_info, 3407 struct cifs_sb_info *cifs_sb = container_of(work, struct cifs_sb_info,
3415 prune_tlinks.work); 3408 prune_tlinks.work);
3416 struct tcon_link *tlink[8]; 3409 struct rb_root *root = &cifs_sb->tlink_tree;
3417 unsigned long now = jiffies; 3410 struct rb_node *node = rb_first(root);
3418 unsigned long index = 0; 3411 struct rb_node *tmp;
3419 int i, ret; 3412 struct tcon_link *tlink;
3420 3413
3421 do { 3414 /*
3422 spin_lock(&cifs_sb->tlink_tree_lock); 3415 * Because we drop the spinlock in the loop in order to put the tlink
3423 ret = radix_tree_gang_lookup(&cifs_sb->tlink_tree, 3416 * it's not guarded against removal of links from the tree. The only
3424 (void **)tlink, index, 3417 * places that remove entries from the tree are this function and
3425 ARRAY_SIZE(tlink)); 3418 * umounts. Because this function is non-reentrant and is canceled
3426 /* increment index for next pass */ 3419 * before umount can proceed, this is safe.
3427 if (ret > 0) 3420 */
3428 index = tlink[ret - 1]->tl_index + 1; 3421 spin_lock(&cifs_sb->tlink_tree_lock);
3429 for (i = 0; i < ret; i++) { 3422 node = rb_first(root);
3430 if (test_bit(TCON_LINK_MASTER, &tlink[i]->tl_flags) || 3423 while (node != NULL) {
3431 atomic_read(&tlink[i]->tl_count) != 0 || 3424 tmp = node;
3432 time_after(tlink[i]->tl_time + TLINK_IDLE_EXPIRE, 3425 node = rb_next(tmp);
3433 now)) { 3426 tlink = rb_entry(tmp, struct tcon_link, tl_rbnode);
3434 tlink[i] = NULL; 3427
3435 continue; 3428 if (test_bit(TCON_LINK_MASTER, &tlink->tl_flags) ||
3436 } 3429 atomic_read(&tlink->tl_count) != 0 ||
3437 cifs_get_tlink(tlink[i]); 3430 time_after(tlink->tl_time + TLINK_IDLE_EXPIRE, jiffies))
3438 clear_bit(TCON_LINK_IN_TREE, &tlink[i]->tl_flags); 3431 continue;
3439 radix_tree_delete(&cifs_sb->tlink_tree,
3440 tlink[i]->tl_index);
3441 }
3442 spin_unlock(&cifs_sb->tlink_tree_lock);
3443 3432
3444 for (i = 0; i < ret; i++) { 3433 cifs_get_tlink(tlink);
3445 if (tlink[i] != NULL) 3434 clear_bit(TCON_LINK_IN_TREE, &tlink->tl_flags);
3446 cifs_put_tlink(tlink[i]); 3435 rb_erase(tmp, root);
3447 } 3436
3448 } while (ret != 0); 3437 spin_unlock(&cifs_sb->tlink_tree_lock);
3438 cifs_put_tlink(tlink);
3439 spin_lock(&cifs_sb->tlink_tree_lock);
3440 }
3441 spin_unlock(&cifs_sb->tlink_tree_lock);
3449 3442
3450 queue_delayed_work(system_nrt_wq, &cifs_sb->prune_tlinks, 3443 queue_delayed_work(system_nrt_wq, &cifs_sb->prune_tlinks,
3451 TLINK_IDLE_EXPIRE); 3444 TLINK_IDLE_EXPIRE);
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 3840eddbfb7..dd5f22918c3 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -130,17 +130,6 @@ cifs_bp_rename_retry:
130 return full_path; 130 return full_path;
131} 131}
132 132
133static void setup_cifs_dentry(struct cifsTconInfo *tcon,
134 struct dentry *direntry,
135 struct inode *newinode)
136{
137 if (tcon->nocase)
138 direntry->d_op = &cifs_ci_dentry_ops;
139 else
140 direntry->d_op = &cifs_dentry_ops;
141 d_instantiate(direntry, newinode);
142}
143
144/* Inode operations in similar order to how they appear in Linux file fs.h */ 133/* Inode operations in similar order to how they appear in Linux file fs.h */
145 134
146int 135int
@@ -293,10 +282,8 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
293 args.uid = NO_CHANGE_64; 282 args.uid = NO_CHANGE_64;
294 args.gid = NO_CHANGE_64; 283 args.gid = NO_CHANGE_64;
295 } 284 }
296 CIFSSMBUnixSetPathInfo(xid, tcon, full_path, &args, 285 CIFSSMBUnixSetFileInfo(xid, tcon, &args, fileHandle,
297 cifs_sb->local_nls, 286 current->tgid);
298 cifs_sb->mnt_cifs_flags &
299 CIFS_MOUNT_MAP_SPECIAL_CHR);
300 } else { 287 } else {
301 /* BB implement mode setting via Windows security 288 /* BB implement mode setting via Windows security
302 descriptors e.g. */ 289 descriptors e.g. */
@@ -329,7 +316,7 @@ cifs_create_get_file_info:
329 316
330cifs_create_set_dentry: 317cifs_create_set_dentry:
331 if (rc == 0) 318 if (rc == 0)
332 setup_cifs_dentry(tcon, direntry, newinode); 319 d_instantiate(direntry, newinode);
333 else 320 else
334 cFYI(1, "Create worked, get_inode_info failed rc = %d", rc); 321 cFYI(1, "Create worked, get_inode_info failed rc = %d", rc);
335 322
@@ -420,10 +407,6 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode,
420 407
421 rc = cifs_get_inode_info_unix(&newinode, full_path, 408 rc = cifs_get_inode_info_unix(&newinode, full_path,
422 inode->i_sb, xid); 409 inode->i_sb, xid);
423 if (pTcon->nocase)
424 direntry->d_op = &cifs_ci_dentry_ops;
425 else
426 direntry->d_op = &cifs_dentry_ops;
427 410
428 if (rc == 0) 411 if (rc == 0)
429 d_instantiate(direntry, newinode); 412 d_instantiate(direntry, newinode);
@@ -603,10 +586,6 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
603 parent_dir_inode->i_sb, xid, NULL); 586 parent_dir_inode->i_sb, xid, NULL);
604 587
605 if ((rc == 0) && (newInode != NULL)) { 588 if ((rc == 0) && (newInode != NULL)) {
606 if (pTcon->nocase)
607 direntry->d_op = &cifs_ci_dentry_ops;
608 else
609 direntry->d_op = &cifs_dentry_ops;
610 d_add(direntry, newInode); 589 d_add(direntry, newInode);
611 if (posix_open) { 590 if (posix_open) {
612 filp = lookup_instantiate_filp(nd, direntry, 591 filp = lookup_instantiate_filp(nd, direntry,
@@ -633,10 +612,6 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
633 } else if (rc == -ENOENT) { 612 } else if (rc == -ENOENT) {
634 rc = 0; 613 rc = 0;
635 direntry->d_time = jiffies; 614 direntry->d_time = jiffies;
636 if (pTcon->nocase)
637 direntry->d_op = &cifs_ci_dentry_ops;
638 else
639 direntry->d_op = &cifs_dentry_ops;
640 d_add(direntry, NULL); 615 d_add(direntry, NULL);
641 /* if it was once a directory (but how can we tell?) we could do 616 /* if it was once a directory (but how can we tell?) we could do
642 shrink_dcache_parent(direntry); */ 617 shrink_dcache_parent(direntry); */
@@ -656,22 +631,37 @@ lookup_out:
656static int 631static int
657cifs_d_revalidate(struct dentry *direntry, struct nameidata *nd) 632cifs_d_revalidate(struct dentry *direntry, struct nameidata *nd)
658{ 633{
659 int isValid = 1; 634 if (nd->flags & LOOKUP_RCU)
635 return -ECHILD;
660 636
661 if (direntry->d_inode) { 637 if (direntry->d_inode) {
662 if (cifs_revalidate_dentry(direntry)) 638 if (cifs_revalidate_dentry(direntry))
663 return 0; 639 return 0;
664 } else { 640 else
665 cFYI(1, "neg dentry 0x%p name = %s", 641 return 1;
666 direntry, direntry->d_name.name);
667 if (time_after(jiffies, direntry->d_time + HZ) ||
668 !lookupCacheEnabled) {
669 d_drop(direntry);
670 isValid = 0;
671 }
672 } 642 }
673 643
674 return isValid; 644 /*
645 * This may be nfsd (or something), anyway, we can't see the
646 * intent of this. So, since this can be for creation, drop it.
647 */
648 if (!nd)
649 return 0;
650
651 /*
652 * Drop the negative dentry, in order to make sure to use the
653 * case sensitive name which is specified by user if this is
654 * for creation.
655 */
656 if (!(nd->flags & (LOOKUP_CONTINUE | LOOKUP_PARENT))) {
657 if (nd->flags & (LOOKUP_CREATE | LOOKUP_RENAME_TARGET))
658 return 0;
659 }
660
661 if (time_after(jiffies, direntry->d_time + HZ) || !lookupCacheEnabled)
662 return 0;
663
664 return 1;
675} 665}
676 666
677/* static int cifs_d_delete(struct dentry *direntry) 667/* static int cifs_d_delete(struct dentry *direntry)
@@ -685,12 +675,14 @@ cifs_d_revalidate(struct dentry *direntry, struct nameidata *nd)
685 675
686const struct dentry_operations cifs_dentry_ops = { 676const struct dentry_operations cifs_dentry_ops = {
687 .d_revalidate = cifs_d_revalidate, 677 .d_revalidate = cifs_d_revalidate,
678 .d_automount = cifs_dfs_d_automount,
688/* d_delete: cifs_d_delete, */ /* not needed except for debugging */ 679/* d_delete: cifs_d_delete, */ /* not needed except for debugging */
689}; 680};
690 681
691static int cifs_ci_hash(struct dentry *dentry, struct qstr *q) 682static int cifs_ci_hash(const struct dentry *dentry, const struct inode *inode,
683 struct qstr *q)
692{ 684{
693 struct nls_table *codepage = CIFS_SB(dentry->d_inode->i_sb)->local_nls; 685 struct nls_table *codepage = CIFS_SB(dentry->d_sb)->local_nls;
694 unsigned long hash; 686 unsigned long hash;
695 int i; 687 int i;
696 688
@@ -703,21 +695,16 @@ static int cifs_ci_hash(struct dentry *dentry, struct qstr *q)
703 return 0; 695 return 0;
704} 696}
705 697
706static int cifs_ci_compare(struct dentry *dentry, struct qstr *a, 698static int cifs_ci_compare(const struct dentry *parent,
707 struct qstr *b) 699 const struct inode *pinode,
700 const struct dentry *dentry, const struct inode *inode,
701 unsigned int len, const char *str, const struct qstr *name)
708{ 702{
709 struct nls_table *codepage = CIFS_SB(dentry->d_inode->i_sb)->local_nls; 703 struct nls_table *codepage = CIFS_SB(pinode->i_sb)->local_nls;
710 704
711 if ((a->len == b->len) && 705 if ((name->len == len) &&
712 (nls_strnicmp(codepage, a->name, b->name, a->len) == 0)) { 706 (nls_strnicmp(codepage, name->name, str, len) == 0))
713 /*
714 * To preserve case, don't let an existing negative dentry's
715 * case take precedence. If a is not a negative dentry, this
716 * should have no side effects
717 */
718 memcpy((void *)a->name, b->name, a->len);
719 return 0; 707 return 0;
720 }
721 return 1; 708 return 1;
722} 709}
723 710
@@ -725,4 +712,5 @@ const struct dentry_operations cifs_ci_dentry_ops = {
725 .d_revalidate = cifs_d_revalidate, 712 .d_revalidate = cifs_d_revalidate,
726 .d_hash = cifs_ci_hash, 713 .d_hash = cifs_ci_hash,
727 .d_compare = cifs_ci_compare, 714 .d_compare = cifs_ci_compare,
715 .d_automount = cifs_dfs_d_automount,
728}; 716};
diff --git a/fs/cifs/dns_resolve.c b/fs/cifs/dns_resolve.c
index 0eb87026cad..548f06230a6 100644
--- a/fs/cifs/dns_resolve.c
+++ b/fs/cifs/dns_resolve.c
@@ -66,7 +66,7 @@ dns_resolve_server_name_to_ip(const char *unc, char **ip_addr)
66 /* Search for server name delimiter */ 66 /* Search for server name delimiter */
67 sep = memchr(hostname, '\\', len); 67 sep = memchr(hostname, '\\', len);
68 if (sep) 68 if (sep)
69 len = sep - unc; 69 len = sep - hostname;
70 else 70 else
71 cFYI(1, "%s: probably server name is whole unc: %s", 71 cFYI(1, "%s: probably server name is whole unc: %s",
72 __func__, unc); 72 __func__, unc);
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index ae82159cf7f..0de17c1db60 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -104,58 +104,6 @@ static inline int cifs_get_disposition(unsigned int flags)
104 return FILE_OPEN; 104 return FILE_OPEN;
105} 105}
106 106
107static inline int cifs_open_inode_helper(struct inode *inode,
108 struct cifsTconInfo *pTcon, __u32 oplock, FILE_ALL_INFO *buf,
109 char *full_path, int xid)
110{
111 struct cifsInodeInfo *pCifsInode = CIFS_I(inode);
112 struct timespec temp;
113 int rc;
114
115 if (pCifsInode->clientCanCacheRead) {
116 /* we have the inode open somewhere else
117 no need to discard cache data */
118 goto client_can_cache;
119 }
120
121 /* BB need same check in cifs_create too? */
122 /* if not oplocked, invalidate inode pages if mtime or file
123 size changed */
124 temp = cifs_NTtimeToUnix(buf->LastWriteTime);
125 if (timespec_equal(&inode->i_mtime, &temp) &&
126 (inode->i_size ==
127 (loff_t)le64_to_cpu(buf->EndOfFile))) {
128 cFYI(1, "inode unchanged on server");
129 } else {
130 if (inode->i_mapping) {
131 /* BB no need to lock inode until after invalidate
132 since namei code should already have it locked? */
133 rc = filemap_write_and_wait(inode->i_mapping);
134 mapping_set_error(inode->i_mapping, rc);
135 }
136 cFYI(1, "invalidating remote inode since open detected it "
137 "changed");
138 invalidate_remote_inode(inode);
139 }
140
141client_can_cache:
142 if (pTcon->unix_ext)
143 rc = cifs_get_inode_info_unix(&inode, full_path, inode->i_sb,
144 xid);
145 else
146 rc = cifs_get_inode_info(&inode, full_path, buf, inode->i_sb,
147 xid, NULL);
148
149 if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) {
150 pCifsInode->clientCanCacheAll = true;
151 pCifsInode->clientCanCacheRead = true;
152 cFYI(1, "Exclusive Oplock granted on inode %p", inode);
153 } else if ((oplock & 0xF) == OPLOCK_READ)
154 pCifsInode->clientCanCacheRead = true;
155
156 return rc;
157}
158
159int cifs_posix_open(char *full_path, struct inode **pinode, 107int cifs_posix_open(char *full_path, struct inode **pinode,
160 struct super_block *sb, int mode, unsigned int f_flags, 108 struct super_block *sb, int mode, unsigned int f_flags,
161 __u32 *poplock, __u16 *pnetfid, int xid) 109 __u32 *poplock, __u16 *pnetfid, int xid)
@@ -218,6 +166,76 @@ posix_open_ret:
218 return rc; 166 return rc;
219} 167}
220 168
169static int
170cifs_nt_open(char *full_path, struct inode *inode, struct cifs_sb_info *cifs_sb,
171 struct cifsTconInfo *tcon, unsigned int f_flags, __u32 *poplock,
172 __u16 *pnetfid, int xid)
173{
174 int rc;
175 int desiredAccess;
176 int disposition;
177 FILE_ALL_INFO *buf;
178
179 desiredAccess = cifs_convert_flags(f_flags);
180
181/*********************************************************************
182 * open flag mapping table:
183 *
184 * POSIX Flag CIFS Disposition
185 * ---------- ----------------
186 * O_CREAT FILE_OPEN_IF
187 * O_CREAT | O_EXCL FILE_CREATE
188 * O_CREAT | O_TRUNC FILE_OVERWRITE_IF
189 * O_TRUNC FILE_OVERWRITE
190 * none of the above FILE_OPEN
191 *
192 * Note that there is not a direct match between disposition
193 * FILE_SUPERSEDE (ie create whether or not file exists although
194 * O_CREAT | O_TRUNC is similar but truncates the existing
195 * file rather than creating a new file as FILE_SUPERSEDE does
196 * (which uses the attributes / metadata passed in on open call)
197 *?
198 *? O_SYNC is a reasonable match to CIFS writethrough flag
199 *? and the read write flags match reasonably. O_LARGEFILE
200 *? is irrelevant because largefile support is always used
201 *? by this client. Flags O_APPEND, O_DIRECT, O_DIRECTORY,
202 * O_FASYNC, O_NOFOLLOW, O_NONBLOCK need further investigation
203 *********************************************************************/
204
205 disposition = cifs_get_disposition(f_flags);
206
207 /* BB pass O_SYNC flag through on file attributes .. BB */
208
209 buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL);
210 if (!buf)
211 return -ENOMEM;
212
213 if (tcon->ses->capabilities & CAP_NT_SMBS)
214 rc = CIFSSMBOpen(xid, tcon, full_path, disposition,
215 desiredAccess, CREATE_NOT_DIR, pnetfid, poplock, buf,
216 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags
217 & CIFS_MOUNT_MAP_SPECIAL_CHR);
218 else
219 rc = SMBLegacyOpen(xid, tcon, full_path, disposition,
220 desiredAccess, CREATE_NOT_DIR, pnetfid, poplock, buf,
221 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags
222 & CIFS_MOUNT_MAP_SPECIAL_CHR);
223
224 if (rc)
225 goto out;
226
227 if (tcon->unix_ext)
228 rc = cifs_get_inode_info_unix(&inode, full_path, inode->i_sb,
229 xid);
230 else
231 rc = cifs_get_inode_info(&inode, full_path, buf, inode->i_sb,
232 xid, pnetfid);
233
234out:
235 kfree(buf);
236 return rc;
237}
238
221struct cifsFileInfo * 239struct cifsFileInfo *
222cifs_new_fileinfo(__u16 fileHandle, struct file *file, 240cifs_new_fileinfo(__u16 fileHandle, struct file *file,
223 struct tcon_link *tlink, __u32 oplock) 241 struct tcon_link *tlink, __u32 oplock)
@@ -253,12 +271,7 @@ cifs_new_fileinfo(__u16 fileHandle, struct file *file,
253 list_add_tail(&pCifsFile->flist, &pCifsInode->openFileList); 271 list_add_tail(&pCifsFile->flist, &pCifsInode->openFileList);
254 spin_unlock(&cifs_file_list_lock); 272 spin_unlock(&cifs_file_list_lock);
255 273
256 if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) { 274 cifs_set_oplock_level(pCifsInode, oplock);
257 pCifsInode->clientCanCacheAll = true;
258 pCifsInode->clientCanCacheRead = true;
259 cFYI(1, "Exclusive Oplock inode %p", inode);
260 } else if ((oplock & 0xF) == OPLOCK_READ)
261 pCifsInode->clientCanCacheRead = true;
262 275
263 file->private_data = pCifsFile; 276 file->private_data = pCifsFile;
264 return pCifsFile; 277 return pCifsFile;
@@ -271,8 +284,10 @@ cifs_new_fileinfo(__u16 fileHandle, struct file *file,
271 */ 284 */
272void cifsFileInfo_put(struct cifsFileInfo *cifs_file) 285void cifsFileInfo_put(struct cifsFileInfo *cifs_file)
273{ 286{
287 struct inode *inode = cifs_file->dentry->d_inode;
274 struct cifsTconInfo *tcon = tlink_tcon(cifs_file->tlink); 288 struct cifsTconInfo *tcon = tlink_tcon(cifs_file->tlink);
275 struct cifsInodeInfo *cifsi = CIFS_I(cifs_file->dentry->d_inode); 289 struct cifsInodeInfo *cifsi = CIFS_I(inode);
290 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
276 struct cifsLockInfo *li, *tmp; 291 struct cifsLockInfo *li, *tmp;
277 292
278 spin_lock(&cifs_file_list_lock); 293 spin_lock(&cifs_file_list_lock);
@@ -288,8 +303,14 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file)
288 if (list_empty(&cifsi->openFileList)) { 303 if (list_empty(&cifsi->openFileList)) {
289 cFYI(1, "closing last open instance for inode %p", 304 cFYI(1, "closing last open instance for inode %p",
290 cifs_file->dentry->d_inode); 305 cifs_file->dentry->d_inode);
291 cifsi->clientCanCacheRead = false; 306
292 cifsi->clientCanCacheAll = false; 307 /* in strict cache mode we need invalidate mapping on the last
308 close because it may cause a error when we open this file
309 again and get at least level II oplock */
310 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_STRICT_IO)
311 CIFS_I(inode)->invalid_mapping = true;
312
313 cifs_set_oplock_level(cifsi, 0);
293 } 314 }
294 spin_unlock(&cifs_file_list_lock); 315 spin_unlock(&cifs_file_list_lock);
295 316
@@ -327,10 +348,8 @@ int cifs_open(struct inode *inode, struct file *file)
327 struct cifsFileInfo *pCifsFile = NULL; 348 struct cifsFileInfo *pCifsFile = NULL;
328 struct cifsInodeInfo *pCifsInode; 349 struct cifsInodeInfo *pCifsInode;
329 char *full_path = NULL; 350 char *full_path = NULL;
330 int desiredAccess; 351 bool posix_open_ok = false;
331 int disposition;
332 __u16 netfid; 352 __u16 netfid;
333 FILE_ALL_INFO *buf = NULL;
334 353
335 xid = GetXid(); 354 xid = GetXid();
336 355
@@ -368,17 +387,7 @@ int cifs_open(struct inode *inode, struct file *file)
368 file->f_flags, &oplock, &netfid, xid); 387 file->f_flags, &oplock, &netfid, xid);
369 if (rc == 0) { 388 if (rc == 0) {
370 cFYI(1, "posix open succeeded"); 389 cFYI(1, "posix open succeeded");
371 390 posix_open_ok = true;
372 pCifsFile = cifs_new_fileinfo(netfid, file, tlink,
373 oplock);
374 if (pCifsFile == NULL) {
375 CIFSSMBClose(xid, tcon, netfid);
376 rc = -ENOMEM;
377 }
378
379 cifs_fscache_set_inode_cookie(inode, file);
380
381 goto out;
382 } else if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) { 391 } else if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) {
383 if (tcon->ses->serverNOS) 392 if (tcon->ses->serverNOS)
384 cERROR(1, "server %s of type %s returned" 393 cERROR(1, "server %s of type %s returned"
@@ -395,103 +404,39 @@ int cifs_open(struct inode *inode, struct file *file)
395 or DFS errors */ 404 or DFS errors */
396 } 405 }
397 406
398 desiredAccess = cifs_convert_flags(file->f_flags); 407 if (!posix_open_ok) {
399 408 rc = cifs_nt_open(full_path, inode, cifs_sb, tcon,
400/********************************************************************* 409 file->f_flags, &oplock, &netfid, xid);
401 * open flag mapping table: 410 if (rc)
402 * 411 goto out;
403 * POSIX Flag CIFS Disposition
404 * ---------- ----------------
405 * O_CREAT FILE_OPEN_IF
406 * O_CREAT | O_EXCL FILE_CREATE
407 * O_CREAT | O_TRUNC FILE_OVERWRITE_IF
408 * O_TRUNC FILE_OVERWRITE
409 * none of the above FILE_OPEN
410 *
411 * Note that there is not a direct match between disposition
412 * FILE_SUPERSEDE (ie create whether or not file exists although
413 * O_CREAT | O_TRUNC is similar but truncates the existing
414 * file rather than creating a new file as FILE_SUPERSEDE does
415 * (which uses the attributes / metadata passed in on open call)
416 *?
417 *? O_SYNC is a reasonable match to CIFS writethrough flag
418 *? and the read write flags match reasonably. O_LARGEFILE
419 *? is irrelevant because largefile support is always used
420 *? by this client. Flags O_APPEND, O_DIRECT, O_DIRECTORY,
421 * O_FASYNC, O_NOFOLLOW, O_NONBLOCK need further investigation
422 *********************************************************************/
423
424 disposition = cifs_get_disposition(file->f_flags);
425
426 /* BB pass O_SYNC flag through on file attributes .. BB */
427
428 /* Also refresh inode by passing in file_info buf returned by SMBOpen
429 and calling get_inode_info with returned buf (at least helps
430 non-Unix server case) */
431
432 /* BB we can not do this if this is the second open of a file
433 and the first handle has writebehind data, we might be
434 able to simply do a filemap_fdatawrite/filemap_fdatawait first */
435 buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL);
436 if (!buf) {
437 rc = -ENOMEM;
438 goto out;
439 }
440
441 if (tcon->ses->capabilities & CAP_NT_SMBS)
442 rc = CIFSSMBOpen(xid, tcon, full_path, disposition,
443 desiredAccess, CREATE_NOT_DIR, &netfid, &oplock, buf,
444 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags
445 & CIFS_MOUNT_MAP_SPECIAL_CHR);
446 else
447 rc = -EIO; /* no NT SMB support fall into legacy open below */
448
449 if (rc == -EIO) {
450 /* Old server, try legacy style OpenX */
451 rc = SMBLegacyOpen(xid, tcon, full_path, disposition,
452 desiredAccess, CREATE_NOT_DIR, &netfid, &oplock, buf,
453 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags
454 & CIFS_MOUNT_MAP_SPECIAL_CHR);
455 }
456 if (rc) {
457 cFYI(1, "cifs_open returned 0x%x", rc);
458 goto out;
459 } 412 }
460 413
461 rc = cifs_open_inode_helper(inode, tcon, oplock, buf, full_path, xid);
462 if (rc != 0)
463 goto out;
464
465 pCifsFile = cifs_new_fileinfo(netfid, file, tlink, oplock); 414 pCifsFile = cifs_new_fileinfo(netfid, file, tlink, oplock);
466 if (pCifsFile == NULL) { 415 if (pCifsFile == NULL) {
416 CIFSSMBClose(xid, tcon, netfid);
467 rc = -ENOMEM; 417 rc = -ENOMEM;
468 goto out; 418 goto out;
469 } 419 }
470 420
471 cifs_fscache_set_inode_cookie(inode, file); 421 cifs_fscache_set_inode_cookie(inode, file);
472 422
473 if (oplock & CIFS_CREATE_ACTION) { 423 if ((oplock & CIFS_CREATE_ACTION) && !posix_open_ok && tcon->unix_ext) {
474 /* time to set mode which we can not set earlier due to 424 /* time to set mode which we can not set earlier due to
475 problems creating new read-only files */ 425 problems creating new read-only files */
476 if (tcon->unix_ext) { 426 struct cifs_unix_set_info_args args = {
477 struct cifs_unix_set_info_args args = { 427 .mode = inode->i_mode,
478 .mode = inode->i_mode, 428 .uid = NO_CHANGE_64,
479 .uid = NO_CHANGE_64, 429 .gid = NO_CHANGE_64,
480 .gid = NO_CHANGE_64, 430 .ctime = NO_CHANGE_64,
481 .ctime = NO_CHANGE_64, 431 .atime = NO_CHANGE_64,
482 .atime = NO_CHANGE_64, 432 .mtime = NO_CHANGE_64,
483 .mtime = NO_CHANGE_64, 433 .device = 0,
484 .device = 0, 434 };
485 }; 435 CIFSSMBUnixSetFileInfo(xid, tcon, &args, netfid,
486 CIFSSMBUnixSetPathInfo(xid, tcon, full_path, &args, 436 pCifsFile->pid);
487 cifs_sb->local_nls,
488 cifs_sb->mnt_cifs_flags &
489 CIFS_MOUNT_MAP_SPECIAL_CHR);
490 }
491 } 437 }
492 438
493out: 439out:
494 kfree(buf);
495 kfree(full_path); 440 kfree(full_path);
496 FreeXid(xid); 441 FreeXid(xid);
497 cifs_put_tlink(tlink); 442 cifs_put_tlink(tlink);
@@ -607,8 +552,6 @@ reopen_success:
607 rc = filemap_write_and_wait(inode->i_mapping); 552 rc = filemap_write_and_wait(inode->i_mapping);
608 mapping_set_error(inode->i_mapping, rc); 553 mapping_set_error(inode->i_mapping, rc);
609 554
610 pCifsInode->clientCanCacheAll = false;
611 pCifsInode->clientCanCacheRead = false;
612 if (tcon->unix_ext) 555 if (tcon->unix_ext)
613 rc = cifs_get_inode_info_unix(&inode, 556 rc = cifs_get_inode_info_unix(&inode,
614 full_path, inode->i_sb, xid); 557 full_path, inode->i_sb, xid);
@@ -622,18 +565,9 @@ reopen_success:
622 invalidate the current end of file on the server 565 invalidate the current end of file on the server
623 we can not go to the server to get the new inod 566 we can not go to the server to get the new inod
624 info */ 567 info */
625 if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) { 568
626 pCifsInode->clientCanCacheAll = true; 569 cifs_set_oplock_level(pCifsInode, oplock);
627 pCifsInode->clientCanCacheRead = true; 570
628 cFYI(1, "Exclusive Oplock granted on inode %p",
629 pCifsFile->dentry->d_inode);
630 } else if ((oplock & 0xF) == OPLOCK_READ) {
631 pCifsInode->clientCanCacheRead = true;
632 pCifsInode->clientCanCacheAll = false;
633 } else {
634 pCifsInode->clientCanCacheRead = false;
635 pCifsInode->clientCanCacheAll = false;
636 }
637 cifs_relock_file(pCifsFile); 571 cifs_relock_file(pCifsFile);
638 572
639reopen_error_exit: 573reopen_error_exit:
@@ -775,12 +709,6 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
775 709
776 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); 710 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
777 tcon = tlink_tcon(((struct cifsFileInfo *)file->private_data)->tlink); 711 tcon = tlink_tcon(((struct cifsFileInfo *)file->private_data)->tlink);
778
779 if (file->private_data == NULL) {
780 rc = -EBADF;
781 FreeXid(xid);
782 return rc;
783 }
784 netfid = ((struct cifsFileInfo *)file->private_data)->netfid; 712 netfid = ((struct cifsFileInfo *)file->private_data)->netfid;
785 713
786 if ((tcon->ses->capabilities & CAP_UNIX) && 714 if ((tcon->ses->capabilities & CAP_UNIX) &&
@@ -806,12 +734,12 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
806 734
807 /* BB we could chain these into one lock request BB */ 735 /* BB we could chain these into one lock request BB */
808 rc = CIFSSMBLock(xid, tcon, netfid, length, pfLock->fl_start, 736 rc = CIFSSMBLock(xid, tcon, netfid, length, pfLock->fl_start,
809 0, 1, lockType, 0 /* wait flag */ ); 737 0, 1, lockType, 0 /* wait flag */, 0);
810 if (rc == 0) { 738 if (rc == 0) {
811 rc = CIFSSMBLock(xid, tcon, netfid, length, 739 rc = CIFSSMBLock(xid, tcon, netfid, length,
812 pfLock->fl_start, 1 /* numUnlock */ , 740 pfLock->fl_start, 1 /* numUnlock */ ,
813 0 /* numLock */ , lockType, 741 0 /* numLock */ , lockType,
814 0 /* wait flag */ ); 742 0 /* wait flag */, 0);
815 pfLock->fl_type = F_UNLCK; 743 pfLock->fl_type = F_UNLCK;
816 if (rc != 0) 744 if (rc != 0)
817 cERROR(1, "Error unlocking previously locked " 745 cERROR(1, "Error unlocking previously locked "
@@ -828,13 +756,13 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
828 rc = CIFSSMBLock(xid, tcon, netfid, length, 756 rc = CIFSSMBLock(xid, tcon, netfid, length,
829 pfLock->fl_start, 0, 1, 757 pfLock->fl_start, 0, 1,
830 lockType | LOCKING_ANDX_SHARED_LOCK, 758 lockType | LOCKING_ANDX_SHARED_LOCK,
831 0 /* wait flag */); 759 0 /* wait flag */, 0);
832 if (rc == 0) { 760 if (rc == 0) {
833 rc = CIFSSMBLock(xid, tcon, netfid, 761 rc = CIFSSMBLock(xid, tcon, netfid,
834 length, pfLock->fl_start, 1, 0, 762 length, pfLock->fl_start, 1, 0,
835 lockType | 763 lockType |
836 LOCKING_ANDX_SHARED_LOCK, 764 LOCKING_ANDX_SHARED_LOCK,
837 0 /* wait flag */); 765 0 /* wait flag */, 0);
838 pfLock->fl_type = F_RDLCK; 766 pfLock->fl_type = F_RDLCK;
839 if (rc != 0) 767 if (rc != 0)
840 cERROR(1, "Error unlocking " 768 cERROR(1, "Error unlocking "
@@ -877,8 +805,8 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
877 805
878 if (numLock) { 806 if (numLock) {
879 rc = CIFSSMBLock(xid, tcon, netfid, length, 807 rc = CIFSSMBLock(xid, tcon, netfid, length,
880 pfLock->fl_start, 808 pfLock->fl_start, 0, numLock, lockType,
881 0, numLock, lockType, wait_flag); 809 wait_flag, 0);
882 810
883 if (rc == 0) { 811 if (rc == 0) {
884 /* For Windows locks we must store them. */ 812 /* For Windows locks we must store them. */
@@ -898,9 +826,9 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
898 (pfLock->fl_start + length) >= 826 (pfLock->fl_start + length) >=
899 (li->offset + li->length)) { 827 (li->offset + li->length)) {
900 stored_rc = CIFSSMBLock(xid, tcon, 828 stored_rc = CIFSSMBLock(xid, tcon,
901 netfid, 829 netfid, li->length,
902 li->length, li->offset, 830 li->offset, 1, 0,
903 1, 0, li->type, false); 831 li->type, false, 0);
904 if (stored_rc) 832 if (stored_rc)
905 rc = stored_rc; 833 rc = stored_rc;
906 else { 834 else {
@@ -919,31 +847,8 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
919 return rc; 847 return rc;
920} 848}
921 849
922/*
923 * Set the timeout on write requests past EOF. For some servers (Windows)
924 * these calls can be very long.
925 *
926 * If we're writing >10M past the EOF we give a 180s timeout. Anything less
927 * than that gets a 45s timeout. Writes not past EOF get 15s timeouts.
928 * The 10M cutoff is totally arbitrary. A better scheme for this would be
929 * welcome if someone wants to suggest one.
930 *
931 * We may be able to do a better job with this if there were some way to
932 * declare that a file should be sparse.
933 */
934static int
935cifs_write_timeout(struct cifsInodeInfo *cifsi, loff_t offset)
936{
937 if (offset <= cifsi->server_eof)
938 return CIFS_STD_OP;
939 else if (offset > (cifsi->server_eof + (10 * 1024 * 1024)))
940 return CIFS_VLONG_OP;
941 else
942 return CIFS_LONG_OP;
943}
944
945/* update the file size (if needed) after a write */ 850/* update the file size (if needed) after a write */
946static void 851void
947cifs_update_eof(struct cifsInodeInfo *cifsi, loff_t offset, 852cifs_update_eof(struct cifsInodeInfo *cifsi, loff_t offset,
948 unsigned int bytes_written) 853 unsigned int bytes_written)
949{ 854{
@@ -956,14 +861,15 @@ cifs_update_eof(struct cifsInodeInfo *cifsi, loff_t offset,
956ssize_t cifs_user_write(struct file *file, const char __user *write_data, 861ssize_t cifs_user_write(struct file *file, const char __user *write_data,
957 size_t write_size, loff_t *poffset) 862 size_t write_size, loff_t *poffset)
958{ 863{
864 struct inode *inode = file->f_path.dentry->d_inode;
959 int rc = 0; 865 int rc = 0;
960 unsigned int bytes_written = 0; 866 unsigned int bytes_written = 0;
961 unsigned int total_written; 867 unsigned int total_written;
962 struct cifs_sb_info *cifs_sb; 868 struct cifs_sb_info *cifs_sb;
963 struct cifsTconInfo *pTcon; 869 struct cifsTconInfo *pTcon;
964 int xid, long_op; 870 int xid;
965 struct cifsFileInfo *open_file; 871 struct cifsFileInfo *open_file;
966 struct cifsInodeInfo *cifsi = CIFS_I(file->f_path.dentry->d_inode); 872 struct cifsInodeInfo *cifsi = CIFS_I(inode);
967 873
968 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); 874 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
969 875
@@ -982,7 +888,6 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data,
982 888
983 xid = GetXid(); 889 xid = GetXid();
984 890
985 long_op = cifs_write_timeout(cifsi, *poffset);
986 for (total_written = 0; write_size > total_written; 891 for (total_written = 0; write_size > total_written;
987 total_written += bytes_written) { 892 total_written += bytes_written) {
988 rc = -EAGAIN; 893 rc = -EAGAIN;
@@ -1010,7 +915,7 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data,
1010 min_t(const int, cifs_sb->wsize, 915 min_t(const int, cifs_sb->wsize,
1011 write_size - total_written), 916 write_size - total_written),
1012 *poffset, &bytes_written, 917 *poffset, &bytes_written,
1013 NULL, write_data + total_written, long_op); 918 NULL, write_data + total_written, 0);
1014 } 919 }
1015 if (rc || (bytes_written == 0)) { 920 if (rc || (bytes_written == 0)) {
1016 if (total_written) 921 if (total_written)
@@ -1023,27 +928,21 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data,
1023 cifs_update_eof(cifsi, *poffset, bytes_written); 928 cifs_update_eof(cifsi, *poffset, bytes_written);
1024 *poffset += bytes_written; 929 *poffset += bytes_written;
1025 } 930 }
1026 long_op = CIFS_STD_OP; /* subsequent writes fast -
1027 15 seconds is plenty */
1028 } 931 }
1029 932
1030 cifs_stats_bytes_written(pTcon, total_written); 933 cifs_stats_bytes_written(pTcon, total_written);
1031 934
1032 /* since the write may have blocked check these pointers again */
1033 if ((file->f_path.dentry) && (file->f_path.dentry->d_inode)) {
1034 struct inode *inode = file->f_path.dentry->d_inode;
1035/* Do not update local mtime - server will set its actual value on write 935/* Do not update local mtime - server will set its actual value on write
1036 * inode->i_ctime = inode->i_mtime = 936 * inode->i_ctime = inode->i_mtime =
1037 * current_fs_time(inode->i_sb);*/ 937 * current_fs_time(inode->i_sb);*/
1038 if (total_written > 0) { 938 if (total_written > 0) {
1039 spin_lock(&inode->i_lock); 939 spin_lock(&inode->i_lock);
1040 if (*poffset > file->f_path.dentry->d_inode->i_size) 940 if (*poffset > inode->i_size)
1041 i_size_write(file->f_path.dentry->d_inode, 941 i_size_write(inode, *poffset);
1042 *poffset); 942 spin_unlock(&inode->i_lock);
1043 spin_unlock(&inode->i_lock);
1044 }
1045 mark_inode_dirty_sync(file->f_path.dentry->d_inode);
1046 } 943 }
944 mark_inode_dirty_sync(inode);
945
1047 FreeXid(xid); 946 FreeXid(xid);
1048 return total_written; 947 return total_written;
1049} 948}
@@ -1057,7 +956,7 @@ static ssize_t cifs_write(struct cifsFileInfo *open_file,
1057 unsigned int total_written; 956 unsigned int total_written;
1058 struct cifs_sb_info *cifs_sb; 957 struct cifs_sb_info *cifs_sb;
1059 struct cifsTconInfo *pTcon; 958 struct cifsTconInfo *pTcon;
1060 int xid, long_op; 959 int xid;
1061 struct dentry *dentry = open_file->dentry; 960 struct dentry *dentry = open_file->dentry;
1062 struct cifsInodeInfo *cifsi = CIFS_I(dentry->d_inode); 961 struct cifsInodeInfo *cifsi = CIFS_I(dentry->d_inode);
1063 962
@@ -1070,7 +969,6 @@ static ssize_t cifs_write(struct cifsFileInfo *open_file,
1070 969
1071 xid = GetXid(); 970 xid = GetXid();
1072 971
1073 long_op = cifs_write_timeout(cifsi, *poffset);
1074 for (total_written = 0; write_size > total_written; 972 for (total_written = 0; write_size > total_written;
1075 total_written += bytes_written) { 973 total_written += bytes_written) {
1076 rc = -EAGAIN; 974 rc = -EAGAIN;
@@ -1100,7 +998,7 @@ static ssize_t cifs_write(struct cifsFileInfo *open_file,
1100 rc = CIFSSMBWrite2(xid, pTcon, 998 rc = CIFSSMBWrite2(xid, pTcon,
1101 open_file->netfid, len, 999 open_file->netfid, len,
1102 *poffset, &bytes_written, 1000 *poffset, &bytes_written,
1103 iov, 1, long_op); 1001 iov, 1, 0);
1104 } else 1002 } else
1105 rc = CIFSSMBWrite(xid, pTcon, 1003 rc = CIFSSMBWrite(xid, pTcon,
1106 open_file->netfid, 1004 open_file->netfid,
@@ -1108,7 +1006,7 @@ static ssize_t cifs_write(struct cifsFileInfo *open_file,
1108 write_size - total_written), 1006 write_size - total_written),
1109 *poffset, &bytes_written, 1007 *poffset, &bytes_written,
1110 write_data + total_written, 1008 write_data + total_written,
1111 NULL, long_op); 1009 NULL, 0);
1112 } 1010 }
1113 if (rc || (bytes_written == 0)) { 1011 if (rc || (bytes_written == 0)) {
1114 if (total_written) 1012 if (total_written)
@@ -1121,8 +1019,6 @@ static ssize_t cifs_write(struct cifsFileInfo *open_file,
1121 cifs_update_eof(cifsi, *poffset, bytes_written); 1019 cifs_update_eof(cifsi, *poffset, bytes_written);
1122 *poffset += bytes_written; 1020 *poffset += bytes_written;
1123 } 1021 }
1124 long_op = CIFS_STD_OP; /* subsequent writes fast -
1125 15 seconds is plenty */
1126 } 1022 }
1127 1023
1128 cifs_stats_bytes_written(pTcon, total_written); 1024 cifs_stats_bytes_written(pTcon, total_written);
@@ -1138,7 +1034,6 @@ static ssize_t cifs_write(struct cifsFileInfo *open_file,
1138 return total_written; 1034 return total_written;
1139} 1035}
1140 1036
1141#ifdef CONFIG_CIFS_EXPERIMENTAL
1142struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode, 1037struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode,
1143 bool fsuid_only) 1038 bool fsuid_only)
1144{ 1039{
@@ -1172,13 +1067,12 @@ struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode,
1172 spin_unlock(&cifs_file_list_lock); 1067 spin_unlock(&cifs_file_list_lock);
1173 return NULL; 1068 return NULL;
1174} 1069}
1175#endif
1176 1070
1177struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *cifs_inode, 1071struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *cifs_inode,
1178 bool fsuid_only) 1072 bool fsuid_only)
1179{ 1073{
1180 struct cifsFileInfo *open_file; 1074 struct cifsFileInfo *open_file;
1181 struct cifs_sb_info *cifs_sb = CIFS_SB(cifs_inode->vfs_inode.i_sb); 1075 struct cifs_sb_info *cifs_sb;
1182 bool any_available = false; 1076 bool any_available = false;
1183 int rc; 1077 int rc;
1184 1078
@@ -1192,6 +1086,8 @@ struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *cifs_inode,
1192 return NULL; 1086 return NULL;
1193 } 1087 }
1194 1088
1089 cifs_sb = CIFS_SB(cifs_inode->vfs_inode.i_sb);
1090
1195 /* only filter by fsuid on multiuser mounts */ 1091 /* only filter by fsuid on multiuser mounts */
1196 if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER)) 1092 if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER))
1197 fsuid_only = false; 1093 fsuid_only = false;
@@ -1322,7 +1218,7 @@ static int cifs_writepages(struct address_space *mapping,
1322 struct pagevec pvec; 1218 struct pagevec pvec;
1323 int rc = 0; 1219 int rc = 0;
1324 int scanned = 0; 1220 int scanned = 0;
1325 int xid, long_op; 1221 int xid;
1326 1222
1327 cifs_sb = CIFS_SB(mapping->host->i_sb); 1223 cifs_sb = CIFS_SB(mapping->host->i_sb);
1328 1224
@@ -1460,43 +1356,67 @@ retry:
1460 break; 1356 break;
1461 } 1357 }
1462 if (n_iov) { 1358 if (n_iov) {
1359retry_write:
1463 open_file = find_writable_file(CIFS_I(mapping->host), 1360 open_file = find_writable_file(CIFS_I(mapping->host),
1464 false); 1361 false);
1465 if (!open_file) { 1362 if (!open_file) {
1466 cERROR(1, "No writable handles for inode"); 1363 cERROR(1, "No writable handles for inode");
1467 rc = -EBADF; 1364 rc = -EBADF;
1468 } else { 1365 } else {
1469 long_op = cifs_write_timeout(cifsi, offset);
1470 rc = CIFSSMBWrite2(xid, tcon, open_file->netfid, 1366 rc = CIFSSMBWrite2(xid, tcon, open_file->netfid,
1471 bytes_to_write, offset, 1367 bytes_to_write, offset,
1472 &bytes_written, iov, n_iov, 1368 &bytes_written, iov, n_iov,
1473 long_op); 1369 0);
1474 cifsFileInfo_put(open_file); 1370 cifsFileInfo_put(open_file);
1475 cifs_update_eof(cifsi, offset, bytes_written);
1476 } 1371 }
1477 1372
1478 if (rc || bytes_written < bytes_to_write) { 1373 cFYI(1, "Write2 rc=%d, wrote=%u", rc, bytes_written);
1479 cERROR(1, "Write2 ret %d, wrote %d", 1374
1480 rc, bytes_written); 1375 /*
1481 mapping_set_error(mapping, rc); 1376 * For now, treat a short write as if nothing got
1482 } else { 1377 * written. A zero length write however indicates
1378 * ENOSPC or EFBIG. We have no way to know which
1379 * though, so call it ENOSPC for now. EFBIG would
1380 * get translated to AS_EIO anyway.
1381 *
1382 * FIXME: make it take into account the data that did
1383 * get written
1384 */
1385 if (rc == 0) {
1386 if (bytes_written == 0)
1387 rc = -ENOSPC;
1388 else if (bytes_written < bytes_to_write)
1389 rc = -EAGAIN;
1390 }
1391
1392 /* retry on data-integrity flush */
1393 if (wbc->sync_mode == WB_SYNC_ALL && rc == -EAGAIN)
1394 goto retry_write;
1395
1396 /* fix the stats and EOF */
1397 if (bytes_written > 0) {
1483 cifs_stats_bytes_written(tcon, bytes_written); 1398 cifs_stats_bytes_written(tcon, bytes_written);
1399 cifs_update_eof(cifsi, offset, bytes_written);
1484 } 1400 }
1485 1401
1486 for (i = 0; i < n_iov; i++) { 1402 for (i = 0; i < n_iov; i++) {
1487 page = pvec.pages[first + i]; 1403 page = pvec.pages[first + i];
1488 /* Should we also set page error on 1404 /* on retryable write error, redirty page */
1489 success rc but too little data written? */ 1405 if (rc == -EAGAIN)
1490 /* BB investigate retry logic on temporary 1406 redirty_page_for_writepage(wbc, page);
1491 server crash cases and how recovery works 1407 else if (rc != 0)
1492 when page marked as error */
1493 if (rc)
1494 SetPageError(page); 1408 SetPageError(page);
1495 kunmap(page); 1409 kunmap(page);
1496 unlock_page(page); 1410 unlock_page(page);
1497 end_page_writeback(page); 1411 end_page_writeback(page);
1498 page_cache_release(page); 1412 page_cache_release(page);
1499 } 1413 }
1414
1415 if (rc != -EAGAIN)
1416 mapping_set_error(mapping, rc);
1417 else
1418 rc = 0;
1419
1500 if ((wbc->nr_to_write -= n_iov) <= 0) 1420 if ((wbc->nr_to_write -= n_iov) <= 0)
1501 done = 1; 1421 done = 1;
1502 index = next; 1422 index = next;
@@ -1608,27 +1528,47 @@ static int cifs_write_end(struct file *file, struct address_space *mapping,
1608 return rc; 1528 return rc;
1609} 1529}
1610 1530
1611int cifs_fsync(struct file *file, int datasync) 1531int cifs_strict_fsync(struct file *file, int datasync)
1612{ 1532{
1613 int xid; 1533 int xid;
1614 int rc = 0; 1534 int rc = 0;
1615 struct cifsTconInfo *tcon; 1535 struct cifsTconInfo *tcon;
1616 struct cifsFileInfo *smbfile = file->private_data; 1536 struct cifsFileInfo *smbfile = file->private_data;
1617 struct inode *inode = file->f_path.dentry->d_inode; 1537 struct inode *inode = file->f_path.dentry->d_inode;
1538 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
1618 1539
1619 xid = GetXid(); 1540 xid = GetXid();
1620 1541
1621 cFYI(1, "Sync file - name: %s datasync: 0x%x", 1542 cFYI(1, "Sync file - name: %s datasync: 0x%x",
1622 file->f_path.dentry->d_name.name, datasync); 1543 file->f_path.dentry->d_name.name, datasync);
1623 1544
1624 rc = filemap_write_and_wait(inode->i_mapping); 1545 if (!CIFS_I(inode)->clientCanCacheRead)
1625 if (rc == 0) { 1546 cifs_invalidate_mapping(inode);
1626 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
1627 1547
1628 tcon = tlink_tcon(smbfile->tlink); 1548 tcon = tlink_tcon(smbfile->tlink);
1629 if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOSSYNC)) 1549 if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOSSYNC))
1630 rc = CIFSSMBFlush(xid, tcon, smbfile->netfid); 1550 rc = CIFSSMBFlush(xid, tcon, smbfile->netfid);
1631 } 1551
1552 FreeXid(xid);
1553 return rc;
1554}
1555
1556int cifs_fsync(struct file *file, int datasync)
1557{
1558 int xid;
1559 int rc = 0;
1560 struct cifsTconInfo *tcon;
1561 struct cifsFileInfo *smbfile = file->private_data;
1562 struct cifs_sb_info *cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
1563
1564 xid = GetXid();
1565
1566 cFYI(1, "Sync file - name: %s datasync: 0x%x",
1567 file->f_path.dentry->d_name.name, datasync);
1568
1569 tcon = tlink_tcon(smbfile->tlink);
1570 if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOSSYNC))
1571 rc = CIFSSMBFlush(xid, tcon, smbfile->netfid);
1632 1572
1633 FreeXid(xid); 1573 FreeXid(xid);
1634 return rc; 1574 return rc;
@@ -1679,42 +1619,242 @@ int cifs_flush(struct file *file, fl_owner_t id)
1679 return rc; 1619 return rc;
1680} 1620}
1681 1621
1682ssize_t cifs_user_read(struct file *file, char __user *read_data, 1622static int
1683 size_t read_size, loff_t *poffset) 1623cifs_write_allocate_pages(struct page **pages, unsigned long num_pages)
1684{ 1624{
1685 int rc = -EACCES; 1625 int rc = 0;
1686 unsigned int bytes_read = 0; 1626 unsigned long i;
1687 unsigned int total_read = 0; 1627
1688 unsigned int current_read_size; 1628 for (i = 0; i < num_pages; i++) {
1689 struct cifs_sb_info *cifs_sb; 1629 pages[i] = alloc_page(__GFP_HIGHMEM);
1630 if (!pages[i]) {
1631 /*
1632 * save number of pages we have already allocated and
1633 * return with ENOMEM error
1634 */
1635 num_pages = i;
1636 rc = -ENOMEM;
1637 goto error;
1638 }
1639 }
1640
1641 return rc;
1642
1643error:
1644 for (i = 0; i < num_pages; i++)
1645 put_page(pages[i]);
1646 return rc;
1647}
1648
1649static inline
1650size_t get_numpages(const size_t wsize, const size_t len, size_t *cur_len)
1651{
1652 size_t num_pages;
1653 size_t clen;
1654
1655 clen = min_t(const size_t, len, wsize);
1656 num_pages = clen / PAGE_CACHE_SIZE;
1657 if (clen % PAGE_CACHE_SIZE)
1658 num_pages++;
1659
1660 if (cur_len)
1661 *cur_len = clen;
1662
1663 return num_pages;
1664}
1665
1666static ssize_t
1667cifs_iovec_write(struct file *file, const struct iovec *iov,
1668 unsigned long nr_segs, loff_t *poffset)
1669{
1670 size_t total_written = 0, written = 0;
1671 unsigned long num_pages, npages;
1672 size_t copied, len, cur_len, i;
1673 struct kvec *to_send;
1674 struct page **pages;
1675 struct iov_iter it;
1676 struct inode *inode;
1677 struct cifsFileInfo *open_file;
1690 struct cifsTconInfo *pTcon; 1678 struct cifsTconInfo *pTcon;
1679 struct cifs_sb_info *cifs_sb;
1680 int xid, rc;
1681
1682 len = iov_length(iov, nr_segs);
1683 if (!len)
1684 return 0;
1685
1686 rc = generic_write_checks(file, poffset, &len, 0);
1687 if (rc)
1688 return rc;
1689
1690 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
1691 num_pages = get_numpages(cifs_sb->wsize, len, &cur_len);
1692
1693 pages = kmalloc(sizeof(struct pages *)*num_pages, GFP_KERNEL);
1694 if (!pages)
1695 return -ENOMEM;
1696
1697 to_send = kmalloc(sizeof(struct kvec)*(num_pages + 1), GFP_KERNEL);
1698 if (!to_send) {
1699 kfree(pages);
1700 return -ENOMEM;
1701 }
1702
1703 rc = cifs_write_allocate_pages(pages, num_pages);
1704 if (rc) {
1705 kfree(pages);
1706 kfree(to_send);
1707 return rc;
1708 }
1709
1710 xid = GetXid();
1711 open_file = file->private_data;
1712 pTcon = tlink_tcon(open_file->tlink);
1713 inode = file->f_path.dentry->d_inode;
1714
1715 iov_iter_init(&it, iov, nr_segs, len, 0);
1716 npages = num_pages;
1717
1718 do {
1719 size_t save_len = cur_len;
1720 for (i = 0; i < npages; i++) {
1721 copied = min_t(const size_t, cur_len, PAGE_CACHE_SIZE);
1722 copied = iov_iter_copy_from_user(pages[i], &it, 0,
1723 copied);
1724 cur_len -= copied;
1725 iov_iter_advance(&it, copied);
1726 to_send[i+1].iov_base = kmap(pages[i]);
1727 to_send[i+1].iov_len = copied;
1728 }
1729
1730 cur_len = save_len - cur_len;
1731
1732 do {
1733 if (open_file->invalidHandle) {
1734 rc = cifs_reopen_file(open_file, false);
1735 if (rc != 0)
1736 break;
1737 }
1738 rc = CIFSSMBWrite2(xid, pTcon, open_file->netfid,
1739 cur_len, *poffset, &written,
1740 to_send, npages, 0);
1741 } while (rc == -EAGAIN);
1742
1743 for (i = 0; i < npages; i++)
1744 kunmap(pages[i]);
1745
1746 if (written) {
1747 len -= written;
1748 total_written += written;
1749 cifs_update_eof(CIFS_I(inode), *poffset, written);
1750 *poffset += written;
1751 } else if (rc < 0) {
1752 if (!total_written)
1753 total_written = rc;
1754 break;
1755 }
1756
1757 /* get length and number of kvecs of the next write */
1758 npages = get_numpages(cifs_sb->wsize, len, &cur_len);
1759 } while (len > 0);
1760
1761 if (total_written > 0) {
1762 spin_lock(&inode->i_lock);
1763 if (*poffset > inode->i_size)
1764 i_size_write(inode, *poffset);
1765 spin_unlock(&inode->i_lock);
1766 }
1767
1768 cifs_stats_bytes_written(pTcon, total_written);
1769 mark_inode_dirty_sync(inode);
1770
1771 for (i = 0; i < num_pages; i++)
1772 put_page(pages[i]);
1773 kfree(to_send);
1774 kfree(pages);
1775 FreeXid(xid);
1776 return total_written;
1777}
1778
1779static ssize_t cifs_user_writev(struct kiocb *iocb, const struct iovec *iov,
1780 unsigned long nr_segs, loff_t pos)
1781{
1782 ssize_t written;
1783 struct inode *inode;
1784
1785 inode = iocb->ki_filp->f_path.dentry->d_inode;
1786
1787 /*
1788 * BB - optimize the way when signing is disabled. We can drop this
1789 * extra memory-to-memory copying and use iovec buffers for constructing
1790 * write request.
1791 */
1792
1793 written = cifs_iovec_write(iocb->ki_filp, iov, nr_segs, &pos);
1794 if (written > 0) {
1795 CIFS_I(inode)->invalid_mapping = true;
1796 iocb->ki_pos = pos;
1797 }
1798
1799 return written;
1800}
1801
1802ssize_t cifs_strict_writev(struct kiocb *iocb, const struct iovec *iov,
1803 unsigned long nr_segs, loff_t pos)
1804{
1805 struct inode *inode;
1806
1807 inode = iocb->ki_filp->f_path.dentry->d_inode;
1808
1809 if (CIFS_I(inode)->clientCanCacheAll)
1810 return generic_file_aio_write(iocb, iov, nr_segs, pos);
1811
1812 /*
1813 * In strict cache mode we need to write the data to the server exactly
1814 * from the pos to pos+len-1 rather than flush all affected pages
1815 * because it may cause a error with mandatory locks on these pages but
1816 * not on the region from pos to ppos+len-1.
1817 */
1818
1819 return cifs_user_writev(iocb, iov, nr_segs, pos);
1820}
1821
1822static ssize_t
1823cifs_iovec_read(struct file *file, const struct iovec *iov,
1824 unsigned long nr_segs, loff_t *poffset)
1825{
1826 int rc;
1691 int xid; 1827 int xid;
1828 unsigned int total_read, bytes_read = 0;
1829 size_t len, cur_len;
1830 int iov_offset = 0;
1831 struct cifs_sb_info *cifs_sb;
1832 struct cifsTconInfo *pTcon;
1692 struct cifsFileInfo *open_file; 1833 struct cifsFileInfo *open_file;
1693 char *smb_read_data;
1694 char __user *current_offset;
1695 struct smb_com_read_rsp *pSMBr; 1834 struct smb_com_read_rsp *pSMBr;
1835 char *read_data;
1836
1837 if (!nr_segs)
1838 return 0;
1839
1840 len = iov_length(iov, nr_segs);
1841 if (!len)
1842 return 0;
1696 1843
1697 xid = GetXid(); 1844 xid = GetXid();
1698 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); 1845 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
1699 1846
1700 if (file->private_data == NULL) {
1701 rc = -EBADF;
1702 FreeXid(xid);
1703 return rc;
1704 }
1705 open_file = file->private_data; 1847 open_file = file->private_data;
1706 pTcon = tlink_tcon(open_file->tlink); 1848 pTcon = tlink_tcon(open_file->tlink);
1707 1849
1708 if ((file->f_flags & O_ACCMODE) == O_WRONLY) 1850 if ((file->f_flags & O_ACCMODE) == O_WRONLY)
1709 cFYI(1, "attempting read on write only file instance"); 1851 cFYI(1, "attempting read on write only file instance");
1710 1852
1711 for (total_read = 0, current_offset = read_data; 1853 for (total_read = 0; total_read < len; total_read += bytes_read) {
1712 read_size > total_read; 1854 cur_len = min_t(const size_t, len - total_read, cifs_sb->rsize);
1713 total_read += bytes_read, current_offset += bytes_read) {
1714 current_read_size = min_t(const int, read_size - total_read,
1715 cifs_sb->rsize);
1716 rc = -EAGAIN; 1855 rc = -EAGAIN;
1717 smb_read_data = NULL; 1856 read_data = NULL;
1857
1718 while (rc == -EAGAIN) { 1858 while (rc == -EAGAIN) {
1719 int buf_type = CIFS_NO_BUFFER; 1859 int buf_type = CIFS_NO_BUFFER;
1720 if (open_file->invalidHandle) { 1860 if (open_file->invalidHandle) {
@@ -1722,27 +1862,25 @@ ssize_t cifs_user_read(struct file *file, char __user *read_data,
1722 if (rc != 0) 1862 if (rc != 0)
1723 break; 1863 break;
1724 } 1864 }
1725 rc = CIFSSMBRead(xid, pTcon, 1865 rc = CIFSSMBRead(xid, pTcon, open_file->netfid,
1726 open_file->netfid, 1866 cur_len, *poffset, &bytes_read,
1727 current_read_size, *poffset, 1867 &read_data, &buf_type);
1728 &bytes_read, &smb_read_data, 1868 pSMBr = (struct smb_com_read_rsp *)read_data;
1729 &buf_type); 1869 if (read_data) {
1730 pSMBr = (struct smb_com_read_rsp *)smb_read_data; 1870 char *data_offset = read_data + 4 +
1731 if (smb_read_data) { 1871 le16_to_cpu(pSMBr->DataOffset);
1732 if (copy_to_user(current_offset, 1872 if (memcpy_toiovecend(iov, data_offset,
1733 smb_read_data + 1873 iov_offset, bytes_read))
1734 4 /* RFC1001 length field */ +
1735 le16_to_cpu(pSMBr->DataOffset),
1736 bytes_read))
1737 rc = -EFAULT; 1874 rc = -EFAULT;
1738
1739 if (buf_type == CIFS_SMALL_BUFFER) 1875 if (buf_type == CIFS_SMALL_BUFFER)
1740 cifs_small_buf_release(smb_read_data); 1876 cifs_small_buf_release(read_data);
1741 else if (buf_type == CIFS_LARGE_BUFFER) 1877 else if (buf_type == CIFS_LARGE_BUFFER)
1742 cifs_buf_release(smb_read_data); 1878 cifs_buf_release(read_data);
1743 smb_read_data = NULL; 1879 read_data = NULL;
1880 iov_offset += bytes_read;
1744 } 1881 }
1745 } 1882 }
1883
1746 if (rc || (bytes_read == 0)) { 1884 if (rc || (bytes_read == 0)) {
1747 if (total_read) { 1885 if (total_read) {
1748 break; 1886 break;
@@ -1755,13 +1893,57 @@ ssize_t cifs_user_read(struct file *file, char __user *read_data,
1755 *poffset += bytes_read; 1893 *poffset += bytes_read;
1756 } 1894 }
1757 } 1895 }
1896
1758 FreeXid(xid); 1897 FreeXid(xid);
1759 return total_read; 1898 return total_read;
1760} 1899}
1761 1900
1901ssize_t cifs_user_read(struct file *file, char __user *read_data,
1902 size_t read_size, loff_t *poffset)
1903{
1904 struct iovec iov;
1905 iov.iov_base = read_data;
1906 iov.iov_len = read_size;
1907
1908 return cifs_iovec_read(file, &iov, 1, poffset);
1909}
1910
1911static ssize_t cifs_user_readv(struct kiocb *iocb, const struct iovec *iov,
1912 unsigned long nr_segs, loff_t pos)
1913{
1914 ssize_t read;
1915
1916 read = cifs_iovec_read(iocb->ki_filp, iov, nr_segs, &pos);
1917 if (read > 0)
1918 iocb->ki_pos = pos;
1919
1920 return read;
1921}
1922
1923ssize_t cifs_strict_readv(struct kiocb *iocb, const struct iovec *iov,
1924 unsigned long nr_segs, loff_t pos)
1925{
1926 struct inode *inode;
1927
1928 inode = iocb->ki_filp->f_path.dentry->d_inode;
1929
1930 if (CIFS_I(inode)->clientCanCacheRead)
1931 return generic_file_aio_read(iocb, iov, nr_segs, pos);
1932
1933 /*
1934 * In strict cache mode we need to read from the server all the time
1935 * if we don't have level II oplock because the server can delay mtime
1936 * change - so we can't make a decision about inode invalidating.
1937 * And we can also fail with pagereading if there are mandatory locks
1938 * on pages affected by this read but not on the region from pos to
1939 * pos+len-1.
1940 */
1941
1942 return cifs_user_readv(iocb, iov, nr_segs, pos);
1943}
1762 1944
1763static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size, 1945static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size,
1764 loff_t *poffset) 1946 loff_t *poffset)
1765{ 1947{
1766 int rc = -EACCES; 1948 int rc = -EACCES;
1767 unsigned int bytes_read = 0; 1949 unsigned int bytes_read = 0;
@@ -1829,6 +2011,21 @@ static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size,
1829 return total_read; 2011 return total_read;
1830} 2012}
1831 2013
2014int cifs_file_strict_mmap(struct file *file, struct vm_area_struct *vma)
2015{
2016 int rc, xid;
2017 struct inode *inode = file->f_path.dentry->d_inode;
2018
2019 xid = GetXid();
2020
2021 if (!CIFS_I(inode)->clientCanCacheRead)
2022 cifs_invalidate_mapping(inode);
2023
2024 rc = generic_file_mmap(file, vma);
2025 FreeXid(xid);
2026 return rc;
2027}
2028
1832int cifs_file_mmap(struct file *file, struct vm_area_struct *vma) 2029int cifs_file_mmap(struct file *file, struct vm_area_struct *vma)
1833{ 2030{
1834 int rc, xid; 2031 int rc, xid;
@@ -2275,7 +2472,8 @@ void cifs_oplock_break(struct work_struct *work)
2275 */ 2472 */
2276 if (!cfile->oplock_break_cancelled) { 2473 if (!cfile->oplock_break_cancelled) {
2277 rc = CIFSSMBLock(0, tlink_tcon(cfile->tlink), cfile->netfid, 0, 2474 rc = CIFSSMBLock(0, tlink_tcon(cfile->tlink), cfile->netfid, 0,
2278 0, 0, 0, LOCKING_ANDX_OPLOCK_RELEASE, false); 2475 0, 0, 0, LOCKING_ANDX_OPLOCK_RELEASE, false,
2476 cinode->clientCanCacheRead ? 1 : 0);
2279 cFYI(1, "Oplock release rc = %d", rc); 2477 cFYI(1, "Oplock release rc = %d", rc);
2280 } 2478 }
2281 2479
@@ -2299,8 +2497,10 @@ void cifs_oplock_break_get(struct cifsFileInfo *cfile)
2299 2497
2300void cifs_oplock_break_put(struct cifsFileInfo *cfile) 2498void cifs_oplock_break_put(struct cifsFileInfo *cfile)
2301{ 2499{
2500 struct super_block *sb = cfile->dentry->d_sb;
2501
2302 cifsFileInfo_put(cfile); 2502 cifsFileInfo_put(cfile);
2303 cifs_sb_deactive(cfile->dentry->d_sb); 2503 cifs_sb_deactive(sb);
2304} 2504}
2305 2505
2306const struct address_space_operations cifs_addr_ops = { 2506const struct address_space_operations cifs_addr_ops = {
diff --git a/fs/cifs/fscache.c b/fs/cifs/fscache.c
index a2ad94efcfe..297a43d0ff7 100644
--- a/fs/cifs/fscache.c
+++ b/fs/cifs/fscache.c
@@ -2,7 +2,7 @@
2 * fs/cifs/fscache.c - CIFS filesystem cache interface 2 * fs/cifs/fscache.c - CIFS filesystem cache interface
3 * 3 *
4 * Copyright (c) 2010 Novell, Inc. 4 * Copyright (c) 2010 Novell, Inc.
5 * Author(s): Suresh Jayaraman (sjayaraman@suse.de> 5 * Author(s): Suresh Jayaraman <sjayaraman@suse.de>
6 * 6 *
7 * This library is free software; you can redistribute it and/or modify 7 * This library is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU Lesser General Public License as published 8 * it under the terms of the GNU Lesser General Public License as published
@@ -67,10 +67,12 @@ static void cifs_fscache_enable_inode_cookie(struct inode *inode)
67 if (cifsi->fscache) 67 if (cifsi->fscache)
68 return; 68 return;
69 69
70 cifsi->fscache = fscache_acquire_cookie(tcon->fscache, 70 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_FSCACHE) {
71 cifsi->fscache = fscache_acquire_cookie(tcon->fscache,
71 &cifs_fscache_inode_object_def, cifsi); 72 &cifs_fscache_inode_object_def, cifsi);
72 cFYI(1, "CIFS: got FH cookie (0x%p/0x%p)", tcon->fscache, 73 cFYI(1, "CIFS: got FH cookie (0x%p/0x%p)", tcon->fscache,
73 cifsi->fscache); 74 cifsi->fscache);
75 }
74} 76}
75 77
76void cifs_fscache_release_inode_cookie(struct inode *inode) 78void cifs_fscache_release_inode_cookie(struct inode *inode)
@@ -101,10 +103,8 @@ void cifs_fscache_set_inode_cookie(struct inode *inode, struct file *filp)
101{ 103{
102 if ((filp->f_flags & O_ACCMODE) != O_RDONLY) 104 if ((filp->f_flags & O_ACCMODE) != O_RDONLY)
103 cifs_fscache_disable_inode_cookie(inode); 105 cifs_fscache_disable_inode_cookie(inode);
104 else { 106 else
105 cifs_fscache_enable_inode_cookie(inode); 107 cifs_fscache_enable_inode_cookie(inode);
106 cFYI(1, "CIFS: fscache inode cookie set");
107 }
108} 108}
109 109
110void cifs_fscache_reset_inode_cookie(struct inode *inode) 110void cifs_fscache_reset_inode_cookie(struct inode *inode)
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 39869c3c3ef..8852470b4fb 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -32,7 +32,7 @@
32#include "fscache.h" 32#include "fscache.h"
33 33
34 34
35static void cifs_set_ops(struct inode *inode, const bool is_dfs_referral) 35static void cifs_set_ops(struct inode *inode)
36{ 36{
37 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); 37 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
38 38
@@ -44,13 +44,17 @@ static void cifs_set_ops(struct inode *inode, const bool is_dfs_referral)
44 inode->i_fop = &cifs_file_direct_nobrl_ops; 44 inode->i_fop = &cifs_file_direct_nobrl_ops;
45 else 45 else
46 inode->i_fop = &cifs_file_direct_ops; 46 inode->i_fop = &cifs_file_direct_ops;
47 } else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_STRICT_IO) {
48 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL)
49 inode->i_fop = &cifs_file_strict_nobrl_ops;
50 else
51 inode->i_fop = &cifs_file_strict_ops;
47 } else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL) 52 } else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL)
48 inode->i_fop = &cifs_file_nobrl_ops; 53 inode->i_fop = &cifs_file_nobrl_ops;
49 else { /* not direct, send byte range locks */ 54 else { /* not direct, send byte range locks */
50 inode->i_fop = &cifs_file_ops; 55 inode->i_fop = &cifs_file_ops;
51 } 56 }
52 57
53
54 /* check if server can support readpages */ 58 /* check if server can support readpages */
55 if (cifs_sb_master_tcon(cifs_sb)->ses->server->maxBuf < 59 if (cifs_sb_master_tcon(cifs_sb)->ses->server->maxBuf <
56 PAGE_CACHE_SIZE + MAX_CIFS_HDR_SIZE) 60 PAGE_CACHE_SIZE + MAX_CIFS_HDR_SIZE)
@@ -60,7 +64,7 @@ static void cifs_set_ops(struct inode *inode, const bool is_dfs_referral)
60 break; 64 break;
61 case S_IFDIR: 65 case S_IFDIR:
62#ifdef CONFIG_CIFS_DFS_UPCALL 66#ifdef CONFIG_CIFS_DFS_UPCALL
63 if (is_dfs_referral) { 67 if (IS_AUTOMOUNT(inode)) {
64 inode->i_op = &cifs_dfs_referral_inode_operations; 68 inode->i_op = &cifs_dfs_referral_inode_operations;
65 } else { 69 } else {
66#else /* NO DFS support, treat as a directory */ 70#else /* NO DFS support, treat as a directory */
@@ -167,7 +171,9 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr)
167 } 171 }
168 spin_unlock(&inode->i_lock); 172 spin_unlock(&inode->i_lock);
169 173
170 cifs_set_ops(inode, fattr->cf_flags & CIFS_FATTR_DFS_REFERRAL); 174 if (fattr->cf_flags & CIFS_FATTR_DFS_REFERRAL)
175 inode->i_flags |= S_AUTOMOUNT;
176 cifs_set_ops(inode);
171} 177}
172 178
173void 179void
@@ -518,6 +524,7 @@ cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info,
518 524
519 fattr->cf_eof = le64_to_cpu(info->EndOfFile); 525 fattr->cf_eof = le64_to_cpu(info->EndOfFile);
520 fattr->cf_bytes = le64_to_cpu(info->AllocationSize); 526 fattr->cf_bytes = le64_to_cpu(info->AllocationSize);
527 fattr->cf_createtime = le64_to_cpu(info->CreationTime);
521 528
522 if (fattr->cf_cifsattrs & ATTR_DIRECTORY) { 529 if (fattr->cf_cifsattrs & ATTR_DIRECTORY) {
523 fattr->cf_mode = S_IFDIR | cifs_sb->mnt_dir_mode; 530 fattr->cf_mode = S_IFDIR | cifs_sb->mnt_dir_mode;
@@ -686,13 +693,18 @@ int cifs_get_inode_info(struct inode **pinode,
686 cFYI(1, "cifs_sfu_type failed: %d", tmprc); 693 cFYI(1, "cifs_sfu_type failed: %d", tmprc);
687 } 694 }
688 695
689#ifdef CONFIG_CIFS_EXPERIMENTAL 696#ifdef CONFIG_CIFS_ACL
690 /* fill in 0777 bits from ACL */ 697 /* fill in 0777 bits from ACL */
691 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) { 698 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) {
692 cFYI(1, "Getting mode bits from ACL"); 699 rc = cifs_acl_to_fattr(cifs_sb, &fattr, *pinode, full_path,
693 cifs_acl_to_fattr(cifs_sb, &fattr, *pinode, full_path, pfid); 700 pfid);
701 if (rc) {
702 cFYI(1, "%s: Getting ACL failed with error: %d",
703 __func__, rc);
704 goto cgii_exit;
705 }
694 } 706 }
695#endif 707#endif /* CONFIG_CIFS_ACL */
696 708
697 /* fill in remaining high mode bits e.g. SUID, VTX */ 709 /* fill in remaining high mode bits e.g. SUID, VTX */
698 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) 710 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL)
@@ -723,12 +735,12 @@ static const struct inode_operations cifs_ipc_inode_ops = {
723 .lookup = cifs_lookup, 735 .lookup = cifs_lookup,
724}; 736};
725 737
726char *cifs_build_path_to_root(struct cifs_sb_info *cifs_sb) 738char *cifs_build_path_to_root(struct cifs_sb_info *cifs_sb,
739 struct cifsTconInfo *tcon)
727{ 740{
728 int pplen = cifs_sb->prepathlen; 741 int pplen = cifs_sb->prepathlen;
729 int dfsplen; 742 int dfsplen;
730 char *full_path = NULL; 743 char *full_path = NULL;
731 struct cifsTconInfo *tcon = cifs_sb_master_tcon(cifs_sb);
732 744
733 /* if no prefix path, simply set path to the root of share to "" */ 745 /* if no prefix path, simply set path to the root of share to "" */
734 if (pplen == 0) { 746 if (pplen == 0) {
@@ -774,6 +786,10 @@ cifs_find_inode(struct inode *inode, void *opaque)
774 if (CIFS_I(inode)->uniqueid != fattr->cf_uniqueid) 786 if (CIFS_I(inode)->uniqueid != fattr->cf_uniqueid)
775 return 0; 787 return 0;
776 788
789 /* use createtime like an i_generation field */
790 if (CIFS_I(inode)->createtime != fattr->cf_createtime)
791 return 0;
792
777 /* don't match inode of different type */ 793 /* don't match inode of different type */
778 if ((inode->i_mode & S_IFMT) != (fattr->cf_mode & S_IFMT)) 794 if ((inode->i_mode & S_IFMT) != (fattr->cf_mode & S_IFMT))
779 return 0; 795 return 0;
@@ -791,6 +807,7 @@ cifs_init_inode(struct inode *inode, void *opaque)
791 struct cifs_fattr *fattr = (struct cifs_fattr *) opaque; 807 struct cifs_fattr *fattr = (struct cifs_fattr *) opaque;
792 808
793 CIFS_I(inode)->uniqueid = fattr->cf_uniqueid; 809 CIFS_I(inode)->uniqueid = fattr->cf_uniqueid;
810 CIFS_I(inode)->createtime = fattr->cf_createtime;
794 return 0; 811 return 0;
795} 812}
796 813
@@ -804,14 +821,14 @@ inode_has_hashed_dentries(struct inode *inode)
804{ 821{
805 struct dentry *dentry; 822 struct dentry *dentry;
806 823
807 spin_lock(&dcache_lock); 824 spin_lock(&inode->i_lock);
808 list_for_each_entry(dentry, &inode->i_dentry, d_alias) { 825 list_for_each_entry(dentry, &inode->i_dentry, d_alias) {
809 if (!d_unhashed(dentry) || IS_ROOT(dentry)) { 826 if (!d_unhashed(dentry) || IS_ROOT(dentry)) {
810 spin_unlock(&dcache_lock); 827 spin_unlock(&inode->i_lock);
811 return true; 828 return true;
812 } 829 }
813 } 830 }
814 spin_unlock(&dcache_lock); 831 spin_unlock(&inode->i_lock);
815 return false; 832 return false;
816} 833}
817 834
@@ -870,7 +887,7 @@ struct inode *cifs_root_iget(struct super_block *sb, unsigned long ino)
870 char *full_path; 887 char *full_path;
871 struct cifsTconInfo *tcon = cifs_sb_master_tcon(cifs_sb); 888 struct cifsTconInfo *tcon = cifs_sb_master_tcon(cifs_sb);
872 889
873 full_path = cifs_build_path_to_root(cifs_sb); 890 full_path = cifs_build_path_to_root(cifs_sb, tcon);
874 if (full_path == NULL) 891 if (full_path == NULL)
875 return ERR_PTR(-ENOMEM); 892 return ERR_PTR(-ENOMEM);
876 893
@@ -881,8 +898,10 @@ struct inode *cifs_root_iget(struct super_block *sb, unsigned long ino)
881 rc = cifs_get_inode_info(&inode, full_path, NULL, sb, 898 rc = cifs_get_inode_info(&inode, full_path, NULL, sb,
882 xid, NULL); 899 xid, NULL);
883 900
884 if (!inode) 901 if (!inode) {
885 return ERR_PTR(rc); 902 inode = ERR_PTR(rc);
903 goto out;
904 }
886 905
887#ifdef CONFIG_CIFS_FSCACHE 906#ifdef CONFIG_CIFS_FSCACHE
888 /* populate tcon->resource_id */ 907 /* populate tcon->resource_id */
@@ -898,13 +917,11 @@ struct inode *cifs_root_iget(struct super_block *sb, unsigned long ino)
898 inode->i_uid = cifs_sb->mnt_uid; 917 inode->i_uid = cifs_sb->mnt_uid;
899 inode->i_gid = cifs_sb->mnt_gid; 918 inode->i_gid = cifs_sb->mnt_gid;
900 } else if (rc) { 919 } else if (rc) {
901 kfree(full_path);
902 _FreeXid(xid);
903 iget_failed(inode); 920 iget_failed(inode);
904 return ERR_PTR(rc); 921 inode = ERR_PTR(rc);
905 } 922 }
906 923
907 924out:
908 kfree(full_path); 925 kfree(full_path);
909 /* can not call macro FreeXid here since in a void func 926 /* can not call macro FreeXid here since in a void func
910 * TODO: This is no longer true 927 * TODO: This is no longer true
@@ -1313,10 +1330,6 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
1313/*BB check (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID ) to see if need 1330/*BB check (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID ) to see if need
1314 to set uid/gid */ 1331 to set uid/gid */
1315 inc_nlink(inode); 1332 inc_nlink(inode);
1316 if (pTcon->nocase)
1317 direntry->d_op = &cifs_ci_dentry_ops;
1318 else
1319 direntry->d_op = &cifs_dentry_ops;
1320 1333
1321 cifs_unix_basic_to_fattr(&fattr, pInfo, cifs_sb); 1334 cifs_unix_basic_to_fattr(&fattr, pInfo, cifs_sb);
1322 cifs_fill_uniqueid(inode->i_sb, &fattr); 1335 cifs_fill_uniqueid(inode->i_sb, &fattr);
@@ -1357,10 +1370,6 @@ mkdir_get_info:
1357 rc = cifs_get_inode_info(&newinode, full_path, NULL, 1370 rc = cifs_get_inode_info(&newinode, full_path, NULL,
1358 inode->i_sb, xid, NULL); 1371 inode->i_sb, xid, NULL);
1359 1372
1360 if (pTcon->nocase)
1361 direntry->d_op = &cifs_ci_dentry_ops;
1362 else
1363 direntry->d_op = &cifs_dentry_ops;
1364 d_instantiate(direntry, newinode); 1373 d_instantiate(direntry, newinode);
1365 /* setting nlink not necessary except in cases where we 1374 /* setting nlink not necessary except in cases where we
1366 * failed to get it from the server or was set bogus */ 1375 * failed to get it from the server or was set bogus */
@@ -1648,6 +1657,7 @@ static bool
1648cifs_inode_needs_reval(struct inode *inode) 1657cifs_inode_needs_reval(struct inode *inode)
1649{ 1658{
1650 struct cifsInodeInfo *cifs_i = CIFS_I(inode); 1659 struct cifsInodeInfo *cifs_i = CIFS_I(inode);
1660 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
1651 1661
1652 if (cifs_i->clientCanCacheRead) 1662 if (cifs_i->clientCanCacheRead)
1653 return false; 1663 return false;
@@ -1658,20 +1668,22 @@ cifs_inode_needs_reval(struct inode *inode)
1658 if (cifs_i->time == 0) 1668 if (cifs_i->time == 0)
1659 return true; 1669 return true;
1660 1670
1661 /* FIXME: the actimeo should be tunable */ 1671 if (!time_in_range(jiffies, cifs_i->time,
1662 if (time_after_eq(jiffies, cifs_i->time + HZ)) 1672 cifs_i->time + cifs_sb->actimeo))
1663 return true; 1673 return true;
1664 1674
1665 /* hardlinked files w/ noserverino get "special" treatment */ 1675 /* hardlinked files w/ noserverino get "special" treatment */
1666 if (!(CIFS_SB(inode->i_sb)->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) && 1676 if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) &&
1667 S_ISREG(inode->i_mode) && inode->i_nlink != 1) 1677 S_ISREG(inode->i_mode) && inode->i_nlink != 1)
1668 return true; 1678 return true;
1669 1679
1670 return false; 1680 return false;
1671} 1681}
1672 1682
1673/* check invalid_mapping flag and zap the cache if it's set */ 1683/*
1674static void 1684 * Zap the cache. Called when invalid_mapping flag is set.
1685 */
1686void
1675cifs_invalidate_mapping(struct inode *inode) 1687cifs_invalidate_mapping(struct inode *inode)
1676{ 1688{
1677 int rc; 1689 int rc;
@@ -2114,11 +2126,16 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs)
2114 2126
2115 if (attrs->ia_valid & ATTR_MODE) { 2127 if (attrs->ia_valid & ATTR_MODE) {
2116 rc = 0; 2128 rc = 0;
2117#ifdef CONFIG_CIFS_EXPERIMENTAL 2129#ifdef CONFIG_CIFS_ACL
2118 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) 2130 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) {
2119 rc = mode_to_acl(inode, full_path, mode); 2131 rc = mode_to_cifs_acl(inode, full_path, mode);
2120 else 2132 if (rc) {
2121#endif 2133 cFYI(1, "%s: Setting ACL failed with error: %d",
2134 __func__, rc);
2135 goto cifs_setattr_exit;
2136 }
2137 } else
2138#endif /* CONFIG_CIFS_ACL */
2122 if (((mode & S_IWUGO) == 0) && 2139 if (((mode & S_IWUGO) == 0) &&
2123 (cifsInode->cifsAttrs & ATTR_READONLY) == 0) { 2140 (cifsInode->cifsAttrs & ATTR_READONLY) == 0) {
2124 2141
@@ -2177,7 +2194,6 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs)
2177 2194
2178 setattr_copy(inode, attrs); 2195 setattr_copy(inode, attrs);
2179 mark_inode_dirty(inode); 2196 mark_inode_dirty(inode);
2180 return 0;
2181 2197
2182cifs_setattr_exit: 2198cifs_setattr_exit:
2183 kfree(full_path); 2199 kfree(full_path);
diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c
index 077bf756f34..0c98672d012 100644
--- a/fs/cifs/ioctl.c
+++ b/fs/cifs/ioctl.c
@@ -38,10 +38,10 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
38 struct cifs_sb_info *cifs_sb; 38 struct cifs_sb_info *cifs_sb;
39#ifdef CONFIG_CIFS_POSIX 39#ifdef CONFIG_CIFS_POSIX
40 struct cifsFileInfo *pSMBFile = filep->private_data; 40 struct cifsFileInfo *pSMBFile = filep->private_data;
41 struct cifsTconInfo *tcon = tlink_tcon(pSMBFile->tlink); 41 struct cifsTconInfo *tcon;
42 __u64 ExtAttrBits = 0; 42 __u64 ExtAttrBits = 0;
43 __u64 ExtAttrMask = 0; 43 __u64 ExtAttrMask = 0;
44 __u64 caps = le64_to_cpu(tcon->fsUnixInfo.Capability); 44 __u64 caps;
45#endif /* CONFIG_CIFS_POSIX */ 45#endif /* CONFIG_CIFS_POSIX */
46 46
47 xid = GetXid(); 47 xid = GetXid();
@@ -62,9 +62,11 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
62 break; 62 break;
63#ifdef CONFIG_CIFS_POSIX 63#ifdef CONFIG_CIFS_POSIX
64 case FS_IOC_GETFLAGS: 64 case FS_IOC_GETFLAGS:
65 if (pSMBFile == NULL)
66 break;
67 tcon = tlink_tcon(pSMBFile->tlink);
68 caps = le64_to_cpu(tcon->fsUnixInfo.Capability);
65 if (CIFS_UNIX_EXTATTR_CAP & caps) { 69 if (CIFS_UNIX_EXTATTR_CAP & caps) {
66 if (pSMBFile == NULL)
67 break;
68 rc = CIFSGetExtAttr(xid, tcon, pSMBFile->netfid, 70 rc = CIFSGetExtAttr(xid, tcon, pSMBFile->netfid,
69 &ExtAttrBits, &ExtAttrMask); 71 &ExtAttrBits, &ExtAttrMask);
70 if (rc == 0) 72 if (rc == 0)
@@ -75,13 +77,15 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
75 break; 77 break;
76 78
77 case FS_IOC_SETFLAGS: 79 case FS_IOC_SETFLAGS:
80 if (pSMBFile == NULL)
81 break;
82 tcon = tlink_tcon(pSMBFile->tlink);
83 caps = le64_to_cpu(tcon->fsUnixInfo.Capability);
78 if (CIFS_UNIX_EXTATTR_CAP & caps) { 84 if (CIFS_UNIX_EXTATTR_CAP & caps) {
79 if (get_user(ExtAttrBits, (int __user *)arg)) { 85 if (get_user(ExtAttrBits, (int __user *)arg)) {
80 rc = -EFAULT; 86 rc = -EFAULT;
81 break; 87 break;
82 } 88 }
83 if (pSMBFile == NULL)
84 break;
85 /* rc= CIFSGetExtAttr(xid,tcon,pSMBFile->netfid, 89 /* rc= CIFSGetExtAttr(xid,tcon,pSMBFile->netfid,
86 extAttrBits, &ExtAttrMask);*/ 90 extAttrBits, &ExtAttrMask);*/
87 } 91 }
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index 85cdbf831e7..02cd60aefbf 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -28,7 +28,6 @@
28#include "cifsproto.h" 28#include "cifsproto.h"
29#include "cifs_debug.h" 29#include "cifs_debug.h"
30#include "cifs_fs_sb.h" 30#include "cifs_fs_sb.h"
31#include "md5.h"
32 31
33#define CIFS_MF_SYMLINK_LEN_OFFSET (4+1) 32#define CIFS_MF_SYMLINK_LEN_OFFSET (4+1)
34#define CIFS_MF_SYMLINK_MD5_OFFSET (CIFS_MF_SYMLINK_LEN_OFFSET+(4+1)) 33#define CIFS_MF_SYMLINK_MD5_OFFSET (CIFS_MF_SYMLINK_LEN_OFFSET+(4+1))
@@ -47,6 +46,44 @@
47 md5_hash[12], md5_hash[13], md5_hash[14], md5_hash[15] 46 md5_hash[12], md5_hash[13], md5_hash[14], md5_hash[15]
48 47
49static int 48static int
49symlink_hash(unsigned int link_len, const char *link_str, u8 *md5_hash)
50{
51 int rc;
52 unsigned int size;
53 struct crypto_shash *md5;
54 struct sdesc *sdescmd5;
55
56 md5 = crypto_alloc_shash("md5", 0, 0);
57 if (IS_ERR(md5)) {
58 cERROR(1, "%s: Crypto md5 allocation error %d\n", __func__, rc);
59 return PTR_ERR(md5);
60 }
61 size = sizeof(struct shash_desc) + crypto_shash_descsize(md5);
62 sdescmd5 = kmalloc(size, GFP_KERNEL);
63 if (!sdescmd5) {
64 rc = -ENOMEM;
65 cERROR(1, "%s: Memory allocation failure\n", __func__);
66 goto symlink_hash_err;
67 }
68 sdescmd5->shash.tfm = md5;
69 sdescmd5->shash.flags = 0x0;
70
71 rc = crypto_shash_init(&sdescmd5->shash);
72 if (rc) {
73 cERROR(1, "%s: Could not init md5 shash\n", __func__);
74 goto symlink_hash_err;
75 }
76 crypto_shash_update(&sdescmd5->shash, link_str, link_len);
77 rc = crypto_shash_final(&sdescmd5->shash, md5_hash);
78
79symlink_hash_err:
80 crypto_free_shash(md5);
81 kfree(sdescmd5);
82
83 return rc;
84}
85
86static int
50CIFSParseMFSymlink(const u8 *buf, 87CIFSParseMFSymlink(const u8 *buf,
51 unsigned int buf_len, 88 unsigned int buf_len,
52 unsigned int *_link_len, 89 unsigned int *_link_len,
@@ -56,7 +93,6 @@ CIFSParseMFSymlink(const u8 *buf,
56 unsigned int link_len; 93 unsigned int link_len;
57 const char *md5_str1; 94 const char *md5_str1;
58 const char *link_str; 95 const char *link_str;
59 struct MD5Context md5_ctx;
60 u8 md5_hash[16]; 96 u8 md5_hash[16];
61 char md5_str2[34]; 97 char md5_str2[34];
62 98
@@ -70,9 +106,11 @@ CIFSParseMFSymlink(const u8 *buf,
70 if (rc != 1) 106 if (rc != 1)
71 return -EINVAL; 107 return -EINVAL;
72 108
73 cifs_MD5_init(&md5_ctx); 109 rc = symlink_hash(link_len, link_str, md5_hash);
74 cifs_MD5_update(&md5_ctx, (const u8 *)link_str, link_len); 110 if (rc) {
75 cifs_MD5_final(md5_hash, &md5_ctx); 111 cFYI(1, "%s: MD5 hash failure: %d\n", __func__, rc);
112 return rc;
113 }
76 114
77 snprintf(md5_str2, sizeof(md5_str2), 115 snprintf(md5_str2, sizeof(md5_str2),
78 CIFS_MF_SYMLINK_MD5_FORMAT, 116 CIFS_MF_SYMLINK_MD5_FORMAT,
@@ -94,9 +132,9 @@ CIFSParseMFSymlink(const u8 *buf,
94static int 132static int
95CIFSFormatMFSymlink(u8 *buf, unsigned int buf_len, const char *link_str) 133CIFSFormatMFSymlink(u8 *buf, unsigned int buf_len, const char *link_str)
96{ 134{
135 int rc;
97 unsigned int link_len; 136 unsigned int link_len;
98 unsigned int ofs; 137 unsigned int ofs;
99 struct MD5Context md5_ctx;
100 u8 md5_hash[16]; 138 u8 md5_hash[16];
101 139
102 if (buf_len != CIFS_MF_SYMLINK_FILE_SIZE) 140 if (buf_len != CIFS_MF_SYMLINK_FILE_SIZE)
@@ -107,9 +145,11 @@ CIFSFormatMFSymlink(u8 *buf, unsigned int buf_len, const char *link_str)
107 if (link_len > CIFS_MF_SYMLINK_LINK_MAXLEN) 145 if (link_len > CIFS_MF_SYMLINK_LINK_MAXLEN)
108 return -ENAMETOOLONG; 146 return -ENAMETOOLONG;
109 147
110 cifs_MD5_init(&md5_ctx); 148 rc = symlink_hash(link_len, link_str, md5_hash);
111 cifs_MD5_update(&md5_ctx, (const u8 *)link_str, link_len); 149 if (rc) {
112 cifs_MD5_final(md5_hash, &md5_ctx); 150 cFYI(1, "%s: MD5 hash failure: %d\n", __func__, rc);
151 return rc;
152 }
113 153
114 snprintf(buf, buf_len, 154 snprintf(buf, buf_len,
115 CIFS_MF_SYMLINK_LEN_FORMAT CIFS_MF_SYMLINK_MD5_FORMAT, 155 CIFS_MF_SYMLINK_LEN_FORMAT CIFS_MF_SYMLINK_MD5_FORMAT,
@@ -524,10 +564,6 @@ cifs_symlink(struct inode *inode, struct dentry *direntry, const char *symname)
524 cFYI(1, "Create symlink ok, getinodeinfo fail rc = %d", 564 cFYI(1, "Create symlink ok, getinodeinfo fail rc = %d",
525 rc); 565 rc);
526 } else { 566 } else {
527 if (pTcon->nocase)
528 direntry->d_op = &cifs_ci_dentry_ops;
529 else
530 direntry->d_op = &cifs_dentry_ops;
531 d_instantiate(direntry, newinode); 567 d_instantiate(direntry, newinode);
532 } 568 }
533 } 569 }
diff --git a/fs/cifs/md4.c b/fs/cifs/md4.c
deleted file mode 100644
index a725c2609d6..00000000000
--- a/fs/cifs/md4.c
+++ /dev/null
@@ -1,205 +0,0 @@
1/*
2 Unix SMB/Netbios implementation.
3 Version 1.9.
4 a implementation of MD4 designed for use in the SMB authentication protocol
5 Copyright (C) Andrew Tridgell 1997-1998.
6 Modified by Steve French (sfrench@us.ibm.com) 2002-2003
7
8 This program is free software; you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 2 of the License, or
11 (at your option) any later version.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program; if not, write to the Free Software
20 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
21*/
22#include <linux/module.h>
23#include <linux/fs.h>
24#include "cifsencrypt.h"
25
26/* NOTE: This code makes no attempt to be fast! */
27
28static __u32
29F(__u32 X, __u32 Y, __u32 Z)
30{
31 return (X & Y) | ((~X) & Z);
32}
33
34static __u32
35G(__u32 X, __u32 Y, __u32 Z)
36{
37 return (X & Y) | (X & Z) | (Y & Z);
38}
39
40static __u32
41H(__u32 X, __u32 Y, __u32 Z)
42{
43 return X ^ Y ^ Z;
44}
45
46static __u32
47lshift(__u32 x, int s)
48{
49 x &= 0xFFFFFFFF;
50 return ((x << s) & 0xFFFFFFFF) | (x >> (32 - s));
51}
52
53#define ROUND1(a,b,c,d,k,s) (*a) = lshift((*a) + F(*b,*c,*d) + X[k], s)
54#define ROUND2(a,b,c,d,k,s) (*a) = lshift((*a) + G(*b,*c,*d) + X[k] + (__u32)0x5A827999,s)
55#define ROUND3(a,b,c,d,k,s) (*a) = lshift((*a) + H(*b,*c,*d) + X[k] + (__u32)0x6ED9EBA1,s)
56
57/* this applies md4 to 64 byte chunks */
58static void
59mdfour64(__u32 *M, __u32 *A, __u32 *B, __u32 *C, __u32 *D)
60{
61 int j;
62 __u32 AA, BB, CC, DD;
63 __u32 X[16];
64
65
66 for (j = 0; j < 16; j++)
67 X[j] = M[j];
68
69 AA = *A;
70 BB = *B;
71 CC = *C;
72 DD = *D;
73
74 ROUND1(A, B, C, D, 0, 3);
75 ROUND1(D, A, B, C, 1, 7);
76 ROUND1(C, D, A, B, 2, 11);
77 ROUND1(B, C, D, A, 3, 19);
78 ROUND1(A, B, C, D, 4, 3);
79 ROUND1(D, A, B, C, 5, 7);
80 ROUND1(C, D, A, B, 6, 11);
81 ROUND1(B, C, D, A, 7, 19);
82 ROUND1(A, B, C, D, 8, 3);
83 ROUND1(D, A, B, C, 9, 7);
84 ROUND1(C, D, A, B, 10, 11);
85 ROUND1(B, C, D, A, 11, 19);
86 ROUND1(A, B, C, D, 12, 3);
87 ROUND1(D, A, B, C, 13, 7);
88 ROUND1(C, D, A, B, 14, 11);
89 ROUND1(B, C, D, A, 15, 19);
90
91 ROUND2(A, B, C, D, 0, 3);
92 ROUND2(D, A, B, C, 4, 5);
93 ROUND2(C, D, A, B, 8, 9);
94 ROUND2(B, C, D, A, 12, 13);
95 ROUND2(A, B, C, D, 1, 3);
96 ROUND2(D, A, B, C, 5, 5);
97 ROUND2(C, D, A, B, 9, 9);
98 ROUND2(B, C, D, A, 13, 13);
99 ROUND2(A, B, C, D, 2, 3);
100 ROUND2(D, A, B, C, 6, 5);
101 ROUND2(C, D, A, B, 10, 9);
102 ROUND2(B, C, D, A, 14, 13);
103 ROUND2(A, B, C, D, 3, 3);
104 ROUND2(D, A, B, C, 7, 5);
105 ROUND2(C, D, A, B, 11, 9);
106 ROUND2(B, C, D, A, 15, 13);
107
108 ROUND3(A, B, C, D, 0, 3);
109 ROUND3(D, A, B, C, 8, 9);
110 ROUND3(C, D, A, B, 4, 11);
111 ROUND3(B, C, D, A, 12, 15);
112 ROUND3(A, B, C, D, 2, 3);
113 ROUND3(D, A, B, C, 10, 9);
114 ROUND3(C, D, A, B, 6, 11);
115 ROUND3(B, C, D, A, 14, 15);
116 ROUND3(A, B, C, D, 1, 3);
117 ROUND3(D, A, B, C, 9, 9);
118 ROUND3(C, D, A, B, 5, 11);
119 ROUND3(B, C, D, A, 13, 15);
120 ROUND3(A, B, C, D, 3, 3);
121 ROUND3(D, A, B, C, 11, 9);
122 ROUND3(C, D, A, B, 7, 11);
123 ROUND3(B, C, D, A, 15, 15);
124
125 *A += AA;
126 *B += BB;
127 *C += CC;
128 *D += DD;
129
130 *A &= 0xFFFFFFFF;
131 *B &= 0xFFFFFFFF;
132 *C &= 0xFFFFFFFF;
133 *D &= 0xFFFFFFFF;
134
135 for (j = 0; j < 16; j++)
136 X[j] = 0;
137}
138
139static void
140copy64(__u32 *M, unsigned char *in)
141{
142 int i;
143
144 for (i = 0; i < 16; i++)
145 M[i] = (in[i * 4 + 3] << 24) | (in[i * 4 + 2] << 16) |
146 (in[i * 4 + 1] << 8) | (in[i * 4 + 0] << 0);
147}
148
149static void
150copy4(unsigned char *out, __u32 x)
151{
152 out[0] = x & 0xFF;
153 out[1] = (x >> 8) & 0xFF;
154 out[2] = (x >> 16) & 0xFF;
155 out[3] = (x >> 24) & 0xFF;
156}
157
158/* produce a md4 message digest from data of length n bytes */
159void
160mdfour(unsigned char *out, unsigned char *in, int n)
161{
162 unsigned char buf[128];
163 __u32 M[16];
164 __u32 b = n * 8;
165 int i;
166 __u32 A = 0x67452301;
167 __u32 B = 0xefcdab89;
168 __u32 C = 0x98badcfe;
169 __u32 D = 0x10325476;
170
171 while (n > 64) {
172 copy64(M, in);
173 mdfour64(M, &A, &B, &C, &D);
174 in += 64;
175 n -= 64;
176 }
177
178 for (i = 0; i < 128; i++)
179 buf[i] = 0;
180 memcpy(buf, in, n);
181 buf[n] = 0x80;
182
183 if (n <= 55) {
184 copy4(buf + 56, b);
185 copy64(M, buf);
186 mdfour64(M, &A, &B, &C, &D);
187 } else {
188 copy4(buf + 120, b);
189 copy64(M, buf);
190 mdfour64(M, &A, &B, &C, &D);
191 copy64(M, buf + 64);
192 mdfour64(M, &A, &B, &C, &D);
193 }
194
195 for (i = 0; i < 128; i++)
196 buf[i] = 0;
197 copy64(M, buf);
198
199 copy4(out, A);
200 copy4(out + 4, B);
201 copy4(out + 8, C);
202 copy4(out + 12, D);
203
204 A = B = C = D = 0;
205}
diff --git a/fs/cifs/md5.c b/fs/cifs/md5.c
deleted file mode 100644
index 98b66a54c31..00000000000
--- a/fs/cifs/md5.c
+++ /dev/null
@@ -1,366 +0,0 @@
1/*
2 * This code implements the MD5 message-digest algorithm.
3 * The algorithm is due to Ron Rivest. This code was
4 * written by Colin Plumb in 1993, no copyright is claimed.
5 * This code is in the public domain; do with it what you wish.
6 *
7 * Equivalent code is available from RSA Data Security, Inc.
8 * This code has been tested against that, and is equivalent,
9 * except that you don't need to include two pages of legalese
10 * with every copy.
11 *
12 * To compute the message digest of a chunk of bytes, declare an
13 * MD5Context structure, pass it to cifs_MD5_init, call cifs_MD5_update as
14 * needed on buffers full of bytes, and then call cifs_MD5_final, which
15 * will fill a supplied 16-byte array with the digest.
16 */
17
18/* This code slightly modified to fit into Samba by
19 abartlet@samba.org Jun 2001
20 and to fit the cifs vfs by
21 Steve French sfrench@us.ibm.com */
22
23#include <linux/string.h>
24#include "md5.h"
25
26static void MD5Transform(__u32 buf[4], __u32 const in[16]);
27
28/*
29 * Note: this code is harmless on little-endian machines.
30 */
31static void
32byteReverse(unsigned char *buf, unsigned longs)
33{
34 __u32 t;
35 do {
36 t = (__u32) ((unsigned) buf[3] << 8 | buf[2]) << 16 |
37 ((unsigned) buf[1] << 8 | buf[0]);
38 *(__u32 *) buf = t;
39 buf += 4;
40 } while (--longs);
41}
42
43/*
44 * Start MD5 accumulation. Set bit count to 0 and buffer to mysterious
45 * initialization constants.
46 */
47void
48cifs_MD5_init(struct MD5Context *ctx)
49{
50 ctx->buf[0] = 0x67452301;
51 ctx->buf[1] = 0xefcdab89;
52 ctx->buf[2] = 0x98badcfe;
53 ctx->buf[3] = 0x10325476;
54
55 ctx->bits[0] = 0;
56 ctx->bits[1] = 0;
57}
58
59/*
60 * Update context to reflect the concatenation of another buffer full
61 * of bytes.
62 */
63void
64cifs_MD5_update(struct MD5Context *ctx, unsigned char const *buf, unsigned len)
65{
66 register __u32 t;
67
68 /* Update bitcount */
69
70 t = ctx->bits[0];
71 if ((ctx->bits[0] = t + ((__u32) len << 3)) < t)
72 ctx->bits[1]++; /* Carry from low to high */
73 ctx->bits[1] += len >> 29;
74
75 t = (t >> 3) & 0x3f; /* Bytes already in shsInfo->data */
76
77 /* Handle any leading odd-sized chunks */
78
79 if (t) {
80 unsigned char *p = (unsigned char *) ctx->in + t;
81
82 t = 64 - t;
83 if (len < t) {
84 memmove(p, buf, len);
85 return;
86 }
87 memmove(p, buf, t);
88 byteReverse(ctx->in, 16);
89 MD5Transform(ctx->buf, (__u32 *) ctx->in);
90 buf += t;
91 len -= t;
92 }
93 /* Process data in 64-byte chunks */
94
95 while (len >= 64) {
96 memmove(ctx->in, buf, 64);
97 byteReverse(ctx->in, 16);
98 MD5Transform(ctx->buf, (__u32 *) ctx->in);
99 buf += 64;
100 len -= 64;
101 }
102
103 /* Handle any remaining bytes of data. */
104
105 memmove(ctx->in, buf, len);
106}
107
108/*
109 * Final wrapup - pad to 64-byte boundary with the bit pattern
110 * 1 0* (64-bit count of bits processed, MSB-first)
111 */
112void
113cifs_MD5_final(unsigned char digest[16], struct MD5Context *ctx)
114{
115 unsigned int count;
116 unsigned char *p;
117
118 /* Compute number of bytes mod 64 */
119 count = (ctx->bits[0] >> 3) & 0x3F;
120
121 /* Set the first char of padding to 0x80. This is safe since there is
122 always at least one byte free */
123 p = ctx->in + count;
124 *p++ = 0x80;
125
126 /* Bytes of padding needed to make 64 bytes */
127 count = 64 - 1 - count;
128
129 /* Pad out to 56 mod 64 */
130 if (count < 8) {
131 /* Two lots of padding: Pad the first block to 64 bytes */
132 memset(p, 0, count);
133 byteReverse(ctx->in, 16);
134 MD5Transform(ctx->buf, (__u32 *) ctx->in);
135
136 /* Now fill the next block with 56 bytes */
137 memset(ctx->in, 0, 56);
138 } else {
139 /* Pad block to 56 bytes */
140 memset(p, 0, count - 8);
141 }
142 byteReverse(ctx->in, 14);
143
144 /* Append length in bits and transform */
145 ((__u32 *) ctx->in)[14] = ctx->bits[0];
146 ((__u32 *) ctx->in)[15] = ctx->bits[1];
147
148 MD5Transform(ctx->buf, (__u32 *) ctx->in);
149 byteReverse((unsigned char *) ctx->buf, 4);
150 memmove(digest, ctx->buf, 16);
151 memset(ctx, 0, sizeof(*ctx)); /* In case it's sensitive */
152}
153
154/* The four core functions - F1 is optimized somewhat */
155
156/* #define F1(x, y, z) (x & y | ~x & z) */
157#define F1(x, y, z) (z ^ (x & (y ^ z)))
158#define F2(x, y, z) F1(z, x, y)
159#define F3(x, y, z) (x ^ y ^ z)
160#define F4(x, y, z) (y ^ (x | ~z))
161
162/* This is the central step in the MD5 algorithm. */
163#define MD5STEP(f, w, x, y, z, data, s) \
164 (w += f(x, y, z) + data, w = w<<s | w>>(32-s), w += x)
165
166/*
167 * The core of the MD5 algorithm, this alters an existing MD5 hash to
168 * reflect the addition of 16 longwords of new data. cifs_MD5_update blocks
169 * the data and converts bytes into longwords for this routine.
170 */
171static void
172MD5Transform(__u32 buf[4], __u32 const in[16])
173{
174 register __u32 a, b, c, d;
175
176 a = buf[0];
177 b = buf[1];
178 c = buf[2];
179 d = buf[3];
180
181 MD5STEP(F1, a, b, c, d, in[0] + 0xd76aa478, 7);
182 MD5STEP(F1, d, a, b, c, in[1] + 0xe8c7b756, 12);
183 MD5STEP(F1, c, d, a, b, in[2] + 0x242070db, 17);
184 MD5STEP(F1, b, c, d, a, in[3] + 0xc1bdceee, 22);
185 MD5STEP(F1, a, b, c, d, in[4] + 0xf57c0faf, 7);
186 MD5STEP(F1, d, a, b, c, in[5] + 0x4787c62a, 12);
187 MD5STEP(F1, c, d, a, b, in[6] + 0xa8304613, 17);
188 MD5STEP(F1, b, c, d, a, in[7] + 0xfd469501, 22);
189 MD5STEP(F1, a, b, c, d, in[8] + 0x698098d8, 7);
190 MD5STEP(F1, d, a, b, c, in[9] + 0x8b44f7af, 12);
191 MD5STEP(F1, c, d, a, b, in[10] + 0xffff5bb1, 17);
192 MD5STEP(F1, b, c, d, a, in[11] + 0x895cd7be, 22);
193 MD5STEP(F1, a, b, c, d, in[12] + 0x6b901122, 7);
194 MD5STEP(F1, d, a, b, c, in[13] + 0xfd987193, 12);
195 MD5STEP(F1, c, d, a, b, in[14] + 0xa679438e, 17);
196 MD5STEP(F1, b, c, d, a, in[15] + 0x49b40821, 22);
197
198 MD5STEP(F2, a, b, c, d, in[1] + 0xf61e2562, 5);
199 MD5STEP(F2, d, a, b, c, in[6] + 0xc040b340, 9);
200 MD5STEP(F2, c, d, a, b, in[11] + 0x265e5a51, 14);
201 MD5STEP(F2, b, c, d, a, in[0] + 0xe9b6c7aa, 20);
202 MD5STEP(F2, a, b, c, d, in[5] + 0xd62f105d, 5);
203 MD5STEP(F2, d, a, b, c, in[10] + 0x02441453, 9);
204 MD5STEP(F2, c, d, a, b, in[15] + 0xd8a1e681, 14);
205 MD5STEP(F2, b, c, d, a, in[4] + 0xe7d3fbc8, 20);
206 MD5STEP(F2, a, b, c, d, in[9] + 0x21e1cde6, 5);
207 MD5STEP(F2, d, a, b, c, in[14] + 0xc33707d6, 9);
208 MD5STEP(F2, c, d, a, b, in[3] + 0xf4d50d87, 14);
209 MD5STEP(F2, b, c, d, a, in[8] + 0x455a14ed, 20);
210 MD5STEP(F2, a, b, c, d, in[13] + 0xa9e3e905, 5);
211 MD5STEP(F2, d, a, b, c, in[2] + 0xfcefa3f8, 9);
212 MD5STEP(F2, c, d, a, b, in[7] + 0x676f02d9, 14);
213 MD5STEP(F2, b, c, d, a, in[12] + 0x8d2a4c8a, 20);
214
215 MD5STEP(F3, a, b, c, d, in[5] + 0xfffa3942, 4);
216 MD5STEP(F3, d, a, b, c, in[8] + 0x8771f681, 11);
217 MD5STEP(F3, c, d, a, b, in[11] + 0x6d9d6122, 16);
218 MD5STEP(F3, b, c, d, a, in[14] + 0xfde5380c, 23);
219 MD5STEP(F3, a, b, c, d, in[1] + 0xa4beea44, 4);
220 MD5STEP(F3, d, a, b, c, in[4] + 0x4bdecfa9, 11);
221 MD5STEP(F3, c, d, a, b, in[7] + 0xf6bb4b60, 16);
222 MD5STEP(F3, b, c, d, a, in[10] + 0xbebfbc70, 23);
223 MD5STEP(F3, a, b, c, d, in[13] + 0x289b7ec6, 4);
224 MD5STEP(F3, d, a, b, c, in[0] + 0xeaa127fa, 11);
225 MD5STEP(F3, c, d, a, b, in[3] + 0xd4ef3085, 16);
226 MD5STEP(F3, b, c, d, a, in[6] + 0x04881d05, 23);
227 MD5STEP(F3, a, b, c, d, in[9] + 0xd9d4d039, 4);
228 MD5STEP(F3, d, a, b, c, in[12] + 0xe6db99e5, 11);
229 MD5STEP(F3, c, d, a, b, in[15] + 0x1fa27cf8, 16);
230 MD5STEP(F3, b, c, d, a, in[2] + 0xc4ac5665, 23);
231
232 MD5STEP(F4, a, b, c, d, in[0] + 0xf4292244, 6);
233 MD5STEP(F4, d, a, b, c, in[7] + 0x432aff97, 10);
234 MD5STEP(F4, c, d, a, b, in[14] + 0xab9423a7, 15);
235 MD5STEP(F4, b, c, d, a, in[5] + 0xfc93a039, 21);
236 MD5STEP(F4, a, b, c, d, in[12] + 0x655b59c3, 6);
237 MD5STEP(F4, d, a, b, c, in[3] + 0x8f0ccc92, 10);
238 MD5STEP(F4, c, d, a, b, in[10] + 0xffeff47d, 15);
239 MD5STEP(F4, b, c, d, a, in[1] + 0x85845dd1, 21);
240 MD5STEP(F4, a, b, c, d, in[8] + 0x6fa87e4f, 6);
241 MD5STEP(F4, d, a, b, c, in[15] + 0xfe2ce6e0, 10);
242 MD5STEP(F4, c, d, a, b, in[6] + 0xa3014314, 15);
243 MD5STEP(F4, b, c, d, a, in[13] + 0x4e0811a1, 21);
244 MD5STEP(F4, a, b, c, d, in[4] + 0xf7537e82, 6);
245 MD5STEP(F4, d, a, b, c, in[11] + 0xbd3af235, 10);
246 MD5STEP(F4, c, d, a, b, in[2] + 0x2ad7d2bb, 15);
247 MD5STEP(F4, b, c, d, a, in[9] + 0xeb86d391, 21);
248
249 buf[0] += a;
250 buf[1] += b;
251 buf[2] += c;
252 buf[3] += d;
253}
254
255#if 0 /* currently unused */
256/***********************************************************************
257 the rfc 2104 version of hmac_md5 initialisation.
258***********************************************************************/
259static void
260hmac_md5_init_rfc2104(unsigned char *key, int key_len,
261 struct HMACMD5Context *ctx)
262{
263 int i;
264
265 /* if key is longer than 64 bytes reset it to key=MD5(key) */
266 if (key_len > 64) {
267 unsigned char tk[16];
268 struct MD5Context tctx;
269
270 cifs_MD5_init(&tctx);
271 cifs_MD5_update(&tctx, key, key_len);
272 cifs_MD5_final(tk, &tctx);
273
274 key = tk;
275 key_len = 16;
276 }
277
278 /* start out by storing key in pads */
279 memset(ctx->k_ipad, 0, sizeof(ctx->k_ipad));
280 memset(ctx->k_opad, 0, sizeof(ctx->k_opad));
281 memcpy(ctx->k_ipad, key, key_len);
282 memcpy(ctx->k_opad, key, key_len);
283
284 /* XOR key with ipad and opad values */
285 for (i = 0; i < 64; i++) {
286 ctx->k_ipad[i] ^= 0x36;
287 ctx->k_opad[i] ^= 0x5c;
288 }
289
290 cifs_MD5_init(&ctx->ctx);
291 cifs_MD5_update(&ctx->ctx, ctx->k_ipad, 64);
292}
293#endif
294
295/***********************************************************************
296 the microsoft version of hmac_md5 initialisation.
297***********************************************************************/
298void
299hmac_md5_init_limK_to_64(const unsigned char *key, int key_len,
300 struct HMACMD5Context *ctx)
301{
302 int i;
303
304 /* if key is longer than 64 bytes truncate it */
305 if (key_len > 64)
306 key_len = 64;
307
308 /* start out by storing key in pads */
309 memset(ctx->k_ipad, 0, sizeof(ctx->k_ipad));
310 memset(ctx->k_opad, 0, sizeof(ctx->k_opad));
311 memcpy(ctx->k_ipad, key, key_len);
312 memcpy(ctx->k_opad, key, key_len);
313
314 /* XOR key with ipad and opad values */
315 for (i = 0; i < 64; i++) {
316 ctx->k_ipad[i] ^= 0x36;
317 ctx->k_opad[i] ^= 0x5c;
318 }
319
320 cifs_MD5_init(&ctx->ctx);
321 cifs_MD5_update(&ctx->ctx, ctx->k_ipad, 64);
322}
323
324/***********************************************************************
325 update hmac_md5 "inner" buffer
326***********************************************************************/
327void
328hmac_md5_update(const unsigned char *text, int text_len,
329 struct HMACMD5Context *ctx)
330{
331 cifs_MD5_update(&ctx->ctx, text, text_len); /* then text of datagram */
332}
333
334/***********************************************************************
335 finish off hmac_md5 "inner" buffer and generate outer one.
336***********************************************************************/
337void
338hmac_md5_final(unsigned char *digest, struct HMACMD5Context *ctx)
339{
340 struct MD5Context ctx_o;
341
342 cifs_MD5_final(digest, &ctx->ctx);
343
344 cifs_MD5_init(&ctx_o);
345 cifs_MD5_update(&ctx_o, ctx->k_opad, 64);
346 cifs_MD5_update(&ctx_o, digest, 16);
347 cifs_MD5_final(digest, &ctx_o);
348}
349
350/***********************************************************
351 single function to calculate an HMAC MD5 digest from data.
352 use the microsoft hmacmd5 init method because the key is 16 bytes.
353************************************************************/
354#if 0 /* currently unused */
355static void
356hmac_md5(unsigned char key[16], unsigned char *data, int data_len,
357 unsigned char *digest)
358{
359 struct HMACMD5Context ctx;
360 hmac_md5_init_limK_to_64(key, 16, &ctx);
361 if (data_len != 0)
362 hmac_md5_update(data, data_len, &ctx);
363
364 hmac_md5_final(digest, &ctx);
365}
366#endif
diff --git a/fs/cifs/md5.h b/fs/cifs/md5.h
deleted file mode 100644
index 6fba8cb402f..00000000000
--- a/fs/cifs/md5.h
+++ /dev/null
@@ -1,38 +0,0 @@
1#ifndef MD5_H
2#define MD5_H
3#ifndef HEADER_MD5_H
4/* Try to avoid clashes with OpenSSL */
5#define HEADER_MD5_H
6#endif
7
8struct MD5Context {
9 __u32 buf[4];
10 __u32 bits[2];
11 unsigned char in[64];
12};
13#endif /* !MD5_H */
14
15#ifndef _HMAC_MD5_H
16struct HMACMD5Context {
17 struct MD5Context ctx;
18 unsigned char k_ipad[65];
19 unsigned char k_opad[65];
20};
21#endif /* _HMAC_MD5_H */
22
23void cifs_MD5_init(struct MD5Context *context);
24void cifs_MD5_update(struct MD5Context *context, unsigned char const *buf,
25 unsigned len);
26void cifs_MD5_final(unsigned char digest[16], struct MD5Context *context);
27
28/* The following definitions come from lib/hmacmd5.c */
29
30/* void hmac_md5_init_rfc2104(unsigned char *key, int key_len,
31 struct HMACMD5Context *ctx);*/
32void hmac_md5_init_limK_to_64(const unsigned char *key, int key_len,
33 struct HMACMD5Context *ctx);
34void hmac_md5_update(const unsigned char *text, int text_len,
35 struct HMACMD5Context *ctx);
36void hmac_md5_final(unsigned char *digest, struct HMACMD5Context *ctx);
37/* void hmac_md5(unsigned char key[16], unsigned char *data, int data_len,
38 unsigned char *digest);*/
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index c4e296fe351..a09e077ba92 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -569,10 +569,9 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
569 569
570 cFYI(1, "file id match, oplock break"); 570 cFYI(1, "file id match, oplock break");
571 pCifsInode = CIFS_I(netfile->dentry->d_inode); 571 pCifsInode = CIFS_I(netfile->dentry->d_inode);
572 pCifsInode->clientCanCacheAll = false;
573 if (pSMB->OplockLevel == 0)
574 pCifsInode->clientCanCacheRead = false;
575 572
573 cifs_set_oplock_level(pCifsInode,
574 pSMB->OplockLevel ? OPLOCK_READ : 0);
576 /* 575 /*
577 * cifs_oplock_break_put() can't be called 576 * cifs_oplock_break_put() can't be called
578 * from here. Get reference after queueing 577 * from here. Get reference after queueing
@@ -638,77 +637,6 @@ dump_smb(struct smb_hdr *smb_buf, int smb_buf_length)
638 return; 637 return;
639} 638}
640 639
641/* Convert 16 bit Unicode pathname to wire format from string in current code
642 page. Conversion may involve remapping up the seven characters that are
643 only legal in POSIX-like OS (if they are present in the string). Path
644 names are little endian 16 bit Unicode on the wire */
645int
646cifsConvertToUCS(__le16 *target, const char *source, int maxlen,
647 const struct nls_table *cp, int mapChars)
648{
649 int i, j, charlen;
650 int len_remaining = maxlen;
651 char src_char;
652 __u16 temp;
653
654 if (!mapChars)
655 return cifs_strtoUCS(target, source, PATH_MAX, cp);
656
657 for (i = 0, j = 0; i < maxlen; j++) {
658 src_char = source[i];
659 switch (src_char) {
660 case 0:
661 target[j] = 0;
662 goto ctoUCS_out;
663 case ':':
664 target[j] = cpu_to_le16(UNI_COLON);
665 break;
666 case '*':
667 target[j] = cpu_to_le16(UNI_ASTERIK);
668 break;
669 case '?':
670 target[j] = cpu_to_le16(UNI_QUESTION);
671 break;
672 case '<':
673 target[j] = cpu_to_le16(UNI_LESSTHAN);
674 break;
675 case '>':
676 target[j] = cpu_to_le16(UNI_GRTRTHAN);
677 break;
678 case '|':
679 target[j] = cpu_to_le16(UNI_PIPE);
680 break;
681 /* BB We can not handle remapping slash until
682 all the calls to build_path_from_dentry
683 are modified, as they use slash as separator BB */
684 /* case '\\':
685 target[j] = cpu_to_le16(UNI_SLASH);
686 break;*/
687 default:
688 charlen = cp->char2uni(source+i,
689 len_remaining, &temp);
690 /* if no match, use question mark, which
691 at least in some cases servers as wild card */
692 if (charlen < 1) {
693 target[j] = cpu_to_le16(0x003f);
694 charlen = 1;
695 } else
696 target[j] = cpu_to_le16(temp);
697 len_remaining -= charlen;
698 /* character may take more than one byte in the
699 the source string, but will take exactly two
700 bytes in the target string */
701 i += charlen;
702 continue;
703 }
704 i++; /* move to next char in source string */
705 len_remaining--;
706 }
707
708ctoUCS_out:
709 return i;
710}
711
712void 640void
713cifs_autodisable_serverino(struct cifs_sb_info *cifs_sb) 641cifs_autodisable_serverino(struct cifs_sb_info *cifs_sb)
714{ 642{
@@ -722,3 +650,23 @@ cifs_autodisable_serverino(struct cifs_sb_info *cifs_sb)
722 cifs_sb_master_tcon(cifs_sb)->treeName); 650 cifs_sb_master_tcon(cifs_sb)->treeName);
723 } 651 }
724} 652}
653
654void cifs_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock)
655{
656 oplock &= 0xF;
657
658 if (oplock == OPLOCK_EXCLUSIVE) {
659 cinode->clientCanCacheAll = true;
660 cinode->clientCanCacheRead = true;
661 cFYI(1, "Exclusive Oplock granted on inode %p",
662 &cinode->vfs_inode);
663 } else if (oplock == OPLOCK_READ) {
664 cinode->clientCanCacheAll = false;
665 cinode->clientCanCacheRead = true;
666 cFYI(1, "Level II Oplock granted on inode %p",
667 &cinode->vfs_inode);
668 } else {
669 cinode->clientCanCacheAll = false;
670 cinode->clientCanCacheRead = false;
671 }
672}
diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c
index 9aad47a2d62..8d9189f6447 100644
--- a/fs/cifs/netmisc.c
+++ b/fs/cifs/netmisc.c
@@ -899,8 +899,8 @@ map_smb_to_linux_error(struct smb_hdr *smb, int logErr)
899 } 899 }
900 /* else ERRHRD class errors or junk - return EIO */ 900 /* else ERRHRD class errors or junk - return EIO */
901 901
902 cFYI(1, "Mapping smb error code %d to POSIX err %d", 902 cFYI(1, "Mapping smb error code 0x%x to POSIX err %d",
903 smberrcode, rc); 903 le32_to_cpu(smb->Status.CifsError), rc);
904 904
905 /* generic corrective action e.g. reconnect SMB session on 905 /* generic corrective action e.g. reconnect SMB session on
906 * ERRbaduid could be added */ 906 * ERRbaduid could be added */
@@ -916,14 +916,14 @@ unsigned int
916smbCalcSize(struct smb_hdr *ptr) 916smbCalcSize(struct smb_hdr *ptr)
917{ 917{
918 return (sizeof(struct smb_hdr) + (2 * ptr->WordCount) + 918 return (sizeof(struct smb_hdr) + (2 * ptr->WordCount) +
919 2 /* size of the bcc field */ + BCC(ptr)); 919 2 /* size of the bcc field */ + get_bcc(ptr));
920} 920}
921 921
922unsigned int 922unsigned int
923smbCalcSize_LE(struct smb_hdr *ptr) 923smbCalcSize_LE(struct smb_hdr *ptr)
924{ 924{
925 return (sizeof(struct smb_hdr) + (2 * ptr->WordCount) + 925 return (sizeof(struct smb_hdr) + (2 * ptr->WordCount) +
926 2 /* size of the bcc field */ + le16_to_cpu(BCC_LE(ptr))); 926 2 /* size of the bcc field */ + get_bcc_le(ptr));
927} 927}
928 928
929/* The following are taken from fs/ntfs/util.c */ 929/* The following are taken from fs/ntfs/util.c */
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index ef7bb7b50f5..7f25cc3d225 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -79,7 +79,7 @@ cifs_readdir_lookup(struct dentry *parent, struct qstr *name,
79 cFYI(1, "For %s", name->name); 79 cFYI(1, "For %s", name->name);
80 80
81 if (parent->d_op && parent->d_op->d_hash) 81 if (parent->d_op && parent->d_op->d_hash)
82 parent->d_op->d_hash(parent, name); 82 parent->d_op->d_hash(parent, parent->d_inode, name);
83 else 83 else
84 name->hash = full_name_hash(name->name, name->len); 84 name->hash = full_name_hash(name->name, name->len);
85 85
@@ -102,11 +102,6 @@ cifs_readdir_lookup(struct dentry *parent, struct qstr *name,
102 return NULL; 102 return NULL;
103 } 103 }
104 104
105 if (cifs_sb_master_tcon(CIFS_SB(sb))->nocase)
106 dentry->d_op = &cifs_ci_dentry_ops;
107 else
108 dentry->d_op = &cifs_dentry_ops;
109
110 alias = d_materialise_unique(dentry, inode); 105 alias = d_materialise_unique(dentry, inode);
111 if (alias != NULL) { 106 if (alias != NULL) {
112 dput(dentry); 107 dput(dentry);
@@ -160,6 +155,7 @@ cifs_dir_info_to_fattr(struct cifs_fattr *fattr, FILE_DIRECTORY_INFO *info,
160 fattr->cf_cifsattrs = le32_to_cpu(info->ExtFileAttributes); 155 fattr->cf_cifsattrs = le32_to_cpu(info->ExtFileAttributes);
161 fattr->cf_eof = le64_to_cpu(info->EndOfFile); 156 fattr->cf_eof = le64_to_cpu(info->EndOfFile);
162 fattr->cf_bytes = le64_to_cpu(info->AllocationSize); 157 fattr->cf_bytes = le64_to_cpu(info->AllocationSize);
158 fattr->cf_createtime = le64_to_cpu(info->CreationTime);
163 fattr->cf_atime = cifs_NTtimeToUnix(info->LastAccessTime); 159 fattr->cf_atime = cifs_NTtimeToUnix(info->LastAccessTime);
164 fattr->cf_ctime = cifs_NTtimeToUnix(info->ChangeTime); 160 fattr->cf_ctime = cifs_NTtimeToUnix(info->ChangeTime);
165 fattr->cf_mtime = cifs_NTtimeToUnix(info->LastWriteTime); 161 fattr->cf_mtime = cifs_NTtimeToUnix(info->LastWriteTime);
@@ -226,26 +222,29 @@ static int initiate_cifs_search(const int xid, struct file *file)
226 char *full_path = NULL; 222 char *full_path = NULL;
227 struct cifsFileInfo *cifsFile; 223 struct cifsFileInfo *cifsFile;
228 struct cifs_sb_info *cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); 224 struct cifs_sb_info *cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
229 struct tcon_link *tlink; 225 struct tcon_link *tlink = NULL;
230 struct cifsTconInfo *pTcon; 226 struct cifsTconInfo *pTcon;
231 227
232 tlink = cifs_sb_tlink(cifs_sb);
233 if (IS_ERR(tlink))
234 return PTR_ERR(tlink);
235 pTcon = tlink_tcon(tlink);
236
237 if (file->private_data == NULL)
238 file->private_data =
239 kzalloc(sizeof(struct cifsFileInfo), GFP_KERNEL);
240 if (file->private_data == NULL) { 228 if (file->private_data == NULL) {
241 rc = -ENOMEM; 229 tlink = cifs_sb_tlink(cifs_sb);
242 goto error_exit; 230 if (IS_ERR(tlink))
231 return PTR_ERR(tlink);
232
233 cifsFile = kzalloc(sizeof(struct cifsFileInfo), GFP_KERNEL);
234 if (cifsFile == NULL) {
235 rc = -ENOMEM;
236 goto error_exit;
237 }
238 file->private_data = cifsFile;
239 cifsFile->tlink = cifs_get_tlink(tlink);
240 pTcon = tlink_tcon(tlink);
241 } else {
242 cifsFile = file->private_data;
243 pTcon = tlink_tcon(cifsFile->tlink);
243 } 244 }
244 245
245 cifsFile = file->private_data;
246 cifsFile->invalidHandle = true; 246 cifsFile->invalidHandle = true;
247 cifsFile->srch_inf.endOfSearch = false; 247 cifsFile->srch_inf.endOfSearch = false;
248 cifsFile->tlink = cifs_get_tlink(tlink);
249 248
250 full_path = build_path_from_dentry(file->f_path.dentry); 249 full_path = build_path_from_dentry(file->f_path.dentry);
251 if (full_path == NULL) { 250 if (full_path == NULL) {
@@ -756,18 +755,6 @@ static int cifs_filldir(char *pfindEntry, struct file *file, filldir_t filldir,
756 rc = filldir(direntry, qstring.name, qstring.len, file->f_pos, 755 rc = filldir(direntry, qstring.name, qstring.len, file->f_pos,
757 ino, fattr.cf_dtype); 756 ino, fattr.cf_dtype);
758 757
759 /*
760 * we can not return filldir errors to the caller since they are
761 * "normal" when the stat blocksize is too small - we return remapped
762 * error instead
763 *
764 * FIXME: This looks bogus. filldir returns -EOVERFLOW in the above
765 * case already. Why should we be clobbering other errors from it?
766 */
767 if (rc) {
768 cFYI(1, "filldir rc = %d", rc);
769 rc = -EOVERFLOW;
770 }
771 dput(tmp_dentry); 758 dput(tmp_dentry);
772 return rc; 759 return rc;
773} 760}
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index 7b01d3f6eed..1adc9625a34 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -277,7 +277,7 @@ static void ascii_ssetup_strings(char **pbcc_area, struct cifsSesInfo *ses,
277} 277}
278 278
279static void 279static void
280decode_unicode_ssetup(char **pbcc_area, int bleft, struct cifsSesInfo *ses, 280decode_unicode_ssetup(char **pbcc_area, __u16 bleft, struct cifsSesInfo *ses,
281 const struct nls_table *nls_cp) 281 const struct nls_table *nls_cp)
282{ 282{
283 int len; 283 int len;
@@ -323,7 +323,7 @@ decode_unicode_ssetup(char **pbcc_area, int bleft, struct cifsSesInfo *ses,
323 return; 323 return;
324} 324}
325 325
326static int decode_ascii_ssetup(char **pbcc_area, int bleft, 326static int decode_ascii_ssetup(char **pbcc_area, __u16 bleft,
327 struct cifsSesInfo *ses, 327 struct cifsSesInfo *ses,
328 const struct nls_table *nls_cp) 328 const struct nls_table *nls_cp)
329{ 329{
@@ -420,7 +420,6 @@ static int decode_ntlmssp_challenge(char *bcc_ptr, int blob_len,
420 return 0; 420 return 0;
421} 421}
422 422
423#ifdef CONFIG_CIFS_EXPERIMENTAL
424/* BB Move to ntlmssp.c eventually */ 423/* BB Move to ntlmssp.c eventually */
425 424
426/* We do not malloc the blob, it is passed in pbuffer, because 425/* We do not malloc the blob, it is passed in pbuffer, because
@@ -431,13 +430,14 @@ static void build_ntlmssp_negotiate_blob(unsigned char *pbuffer,
431 NEGOTIATE_MESSAGE *sec_blob = (NEGOTIATE_MESSAGE *)pbuffer; 430 NEGOTIATE_MESSAGE *sec_blob = (NEGOTIATE_MESSAGE *)pbuffer;
432 __u32 flags; 431 __u32 flags;
433 432
433 memset(pbuffer, 0, sizeof(NEGOTIATE_MESSAGE));
434 memcpy(sec_blob->Signature, NTLMSSP_SIGNATURE, 8); 434 memcpy(sec_blob->Signature, NTLMSSP_SIGNATURE, 8);
435 sec_blob->MessageType = NtLmNegotiate; 435 sec_blob->MessageType = NtLmNegotiate;
436 436
437 /* BB is NTLMV2 session security format easier to use here? */ 437 /* BB is NTLMV2 session security format easier to use here? */
438 flags = NTLMSSP_NEGOTIATE_56 | NTLMSSP_REQUEST_TARGET | 438 flags = NTLMSSP_NEGOTIATE_56 | NTLMSSP_REQUEST_TARGET |
439 NTLMSSP_NEGOTIATE_128 | NTLMSSP_NEGOTIATE_UNICODE | 439 NTLMSSP_NEGOTIATE_128 | NTLMSSP_NEGOTIATE_UNICODE |
440 NTLMSSP_NEGOTIATE_NTLM; 440 NTLMSSP_NEGOTIATE_NTLM | NTLMSSP_NEGOTIATE_EXTENDED_SEC;
441 if (ses->server->secMode & 441 if (ses->server->secMode &
442 (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) { 442 (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) {
443 flags |= NTLMSSP_NEGOTIATE_SIGN; 443 flags |= NTLMSSP_NEGOTIATE_SIGN;
@@ -446,7 +446,7 @@ static void build_ntlmssp_negotiate_blob(unsigned char *pbuffer,
446 NTLMSSP_NEGOTIATE_EXTENDED_SEC; 446 NTLMSSP_NEGOTIATE_EXTENDED_SEC;
447 } 447 }
448 448
449 sec_blob->NegotiateFlags |= cpu_to_le32(flags); 449 sec_blob->NegotiateFlags = cpu_to_le32(flags);
450 450
451 sec_blob->WorkstationName.BufferOffset = 0; 451 sec_blob->WorkstationName.BufferOffset = 0;
452 sec_blob->WorkstationName.Length = 0; 452 sec_blob->WorkstationName.Length = 0;
@@ -477,7 +477,7 @@ static int build_ntlmssp_auth_blob(unsigned char *pbuffer,
477 flags = NTLMSSP_NEGOTIATE_56 | 477 flags = NTLMSSP_NEGOTIATE_56 |
478 NTLMSSP_REQUEST_TARGET | NTLMSSP_NEGOTIATE_TARGET_INFO | 478 NTLMSSP_REQUEST_TARGET | NTLMSSP_NEGOTIATE_TARGET_INFO |
479 NTLMSSP_NEGOTIATE_128 | NTLMSSP_NEGOTIATE_UNICODE | 479 NTLMSSP_NEGOTIATE_128 | NTLMSSP_NEGOTIATE_UNICODE |
480 NTLMSSP_NEGOTIATE_NTLM; 480 NTLMSSP_NEGOTIATE_NTLM | NTLMSSP_NEGOTIATE_EXTENDED_SEC;
481 if (ses->server->secMode & 481 if (ses->server->secMode &
482 (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) 482 (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
483 flags |= NTLMSSP_NEGOTIATE_SIGN; 483 flags |= NTLMSSP_NEGOTIATE_SIGN;
@@ -485,7 +485,7 @@ static int build_ntlmssp_auth_blob(unsigned char *pbuffer,
485 flags |= NTLMSSP_NEGOTIATE_ALWAYS_SIGN; 485 flags |= NTLMSSP_NEGOTIATE_ALWAYS_SIGN;
486 486
487 tmp = pbuffer + sizeof(AUTHENTICATE_MESSAGE); 487 tmp = pbuffer + sizeof(AUTHENTICATE_MESSAGE);
488 sec_blob->NegotiateFlags |= cpu_to_le32(flags); 488 sec_blob->NegotiateFlags = cpu_to_le32(flags);
489 489
490 sec_blob->LmChallengeResponse.BufferOffset = 490 sec_blob->LmChallengeResponse.BufferOffset =
491 cpu_to_le32(sizeof(AUTHENTICATE_MESSAGE)); 491 cpu_to_le32(sizeof(AUTHENTICATE_MESSAGE));
@@ -544,8 +544,9 @@ static int build_ntlmssp_auth_blob(unsigned char *pbuffer,
544 sec_blob->WorkstationName.MaximumLength = 0; 544 sec_blob->WorkstationName.MaximumLength = 0;
545 tmp += 2; 545 tmp += 2;
546 546
547 if ((ses->ntlmssp->server_flags & NTLMSSP_NEGOTIATE_KEY_XCH) && 547 if (((ses->ntlmssp->server_flags & NTLMSSP_NEGOTIATE_KEY_XCH) ||
548 !calc_seckey(ses)) { 548 (ses->ntlmssp->server_flags & NTLMSSP_NEGOTIATE_EXTENDED_SEC))
549 && !calc_seckey(ses)) {
549 memcpy(tmp, ses->ntlmssp->ciphertext, CIFS_CPHTXT_SIZE); 550 memcpy(tmp, ses->ntlmssp->ciphertext, CIFS_CPHTXT_SIZE);
550 sec_blob->SessionKey.BufferOffset = cpu_to_le32(tmp - pbuffer); 551 sec_blob->SessionKey.BufferOffset = cpu_to_le32(tmp - pbuffer);
551 sec_blob->SessionKey.Length = cpu_to_le16(CIFS_CPHTXT_SIZE); 552 sec_blob->SessionKey.Length = cpu_to_le16(CIFS_CPHTXT_SIZE);
@@ -563,17 +564,6 @@ setup_ntlmv2_ret:
563 return rc; 564 return rc;
564} 565}
565 566
566
567static void setup_ntlmssp_neg_req(SESSION_SETUP_ANDX *pSMB,
568 struct cifsSesInfo *ses)
569{
570 build_ntlmssp_negotiate_blob(&pSMB->req.SecurityBlob[0], ses);
571 pSMB->req.SecurityBlobLength = cpu_to_le16(sizeof(NEGOTIATE_MESSAGE));
572
573 return;
574}
575#endif
576
577int 567int
578CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, 568CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses,
579 const struct nls_table *nls_cp) 569 const struct nls_table *nls_cp)
@@ -585,12 +575,11 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses,
585 char *str_area; 575 char *str_area;
586 SESSION_SETUP_ANDX *pSMB; 576 SESSION_SETUP_ANDX *pSMB;
587 __u32 capabilities; 577 __u32 capabilities;
588 int count; 578 __u16 count;
589 int resp_buf_type; 579 int resp_buf_type;
590 struct kvec iov[3]; 580 struct kvec iov[3];
591 enum securityEnum type; 581 enum securityEnum type;
592 __u16 action; 582 __u16 action, bytes_remaining;
593 int bytes_remaining;
594 struct key *spnego_key = NULL; 583 struct key *spnego_key = NULL;
595 __le32 phase = NtLmNegotiate; /* NTLMSSP, if needed, is multistage */ 584 __le32 phase = NtLmNegotiate; /* NTLMSSP, if needed, is multistage */
596 u16 blob_len; 585 u16 blob_len;
@@ -814,71 +803,70 @@ ssetup_ntlmssp_authenticate:
814 rc = -ENOSYS; 803 rc = -ENOSYS;
815 goto ssetup_exit; 804 goto ssetup_exit;
816#endif /* CONFIG_CIFS_UPCALL */ 805#endif /* CONFIG_CIFS_UPCALL */
817 } else { 806 } else if (type == RawNTLMSSP) {
818#ifdef CONFIG_CIFS_EXPERIMENTAL 807 if ((pSMB->req.hdr.Flags2 & SMBFLG2_UNICODE) == 0) {
819 if (type == RawNTLMSSP) { 808 cERROR(1, "NTLMSSP requires Unicode support");
820 if ((pSMB->req.hdr.Flags2 & SMBFLG2_UNICODE) == 0) { 809 rc = -ENOSYS;
821 cERROR(1, "NTLMSSP requires Unicode support"); 810 goto ssetup_exit;
822 rc = -ENOSYS; 811 }
812
813 cFYI(1, "ntlmssp session setup phase %d", phase);
814 pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC;
815 capabilities |= CAP_EXTENDED_SECURITY;
816 pSMB->req.Capabilities |= cpu_to_le32(capabilities);
817 switch(phase) {
818 case NtLmNegotiate:
819 build_ntlmssp_negotiate_blob(
820 pSMB->req.SecurityBlob, ses);
821 iov[1].iov_len = sizeof(NEGOTIATE_MESSAGE);
822 iov[1].iov_base = pSMB->req.SecurityBlob;
823 pSMB->req.SecurityBlobLength =
824 cpu_to_le16(sizeof(NEGOTIATE_MESSAGE));
825 break;
826 case NtLmAuthenticate:
827 /*
828 * 5 is an empirical value, large enough to hold
829 * authenticate message plus max 10 of av pairs,
830 * domain, user, workstation names, flags, etc.
831 */
832 ntlmsspblob = kzalloc(
833 5*sizeof(struct _AUTHENTICATE_MESSAGE),
834 GFP_KERNEL);
835 if (!ntlmsspblob) {
836 cERROR(1, "Can't allocate NTLMSSP blob");
837 rc = -ENOMEM;
823 goto ssetup_exit; 838 goto ssetup_exit;
824 } 839 }
825 840
826 cFYI(1, "ntlmssp session setup phase %d", phase); 841 rc = build_ntlmssp_auth_blob(ntlmsspblob,
827 pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC; 842 &blob_len, ses, nls_cp);
828 capabilities |= CAP_EXTENDED_SECURITY; 843 if (rc)
829 pSMB->req.Capabilities |= cpu_to_le32(capabilities);
830 if (phase == NtLmNegotiate) {
831 setup_ntlmssp_neg_req(pSMB, ses);
832 iov[1].iov_len = sizeof(NEGOTIATE_MESSAGE);
833 iov[1].iov_base = &pSMB->req.SecurityBlob[0];
834 } else if (phase == NtLmAuthenticate) {
835 /* 5 is an empirical value, large enought to
836 * hold authenticate message, max 10 of
837 * av paris, doamin,user,workstation mames,
838 * flags etc..
839 */
840 ntlmsspblob = kmalloc(
841 5*sizeof(struct _AUTHENTICATE_MESSAGE),
842 GFP_KERNEL);
843 if (!ntlmsspblob) {
844 cERROR(1, "Can't allocate NTLMSSP");
845 rc = -ENOMEM;
846 goto ssetup_exit;
847 }
848
849 rc = build_ntlmssp_auth_blob(ntlmsspblob,
850 &blob_len, ses, nls_cp);
851 if (rc)
852 goto ssetup_exit;
853 iov[1].iov_len = blob_len;
854 iov[1].iov_base = ntlmsspblob;
855 pSMB->req.SecurityBlobLength =
856 cpu_to_le16(blob_len);
857 /* Make sure that we tell the server that we
858 are using the uid that it just gave us back
859 on the response (challenge) */
860 smb_buf->Uid = ses->Suid;
861 } else {
862 cERROR(1, "invalid phase %d", phase);
863 rc = -ENOSYS;
864 goto ssetup_exit; 844 goto ssetup_exit;
865 } 845 iov[1].iov_len = blob_len;
866 /* unicode strings must be word aligned */ 846 iov[1].iov_base = ntlmsspblob;
867 if ((iov[0].iov_len + iov[1].iov_len) % 2) { 847 pSMB->req.SecurityBlobLength = cpu_to_le16(blob_len);
868 *bcc_ptr = 0; 848 /*
869 bcc_ptr++; 849 * Make sure that we tell the server that we are using
870 } 850 * the uid that it just gave us back on the response
871 unicode_oslm_strings(&bcc_ptr, nls_cp); 851 * (challenge)
872 } else { 852 */
873 cERROR(1, "secType %d not supported!", type); 853 smb_buf->Uid = ses->Suid;
854 break;
855 default:
856 cERROR(1, "invalid phase %d", phase);
874 rc = -ENOSYS; 857 rc = -ENOSYS;
875 goto ssetup_exit; 858 goto ssetup_exit;
876 } 859 }
877#else 860 /* unicode strings must be word aligned */
861 if ((iov[0].iov_len + iov[1].iov_len) % 2) {
862 *bcc_ptr = 0;
863 bcc_ptr++;
864 }
865 unicode_oslm_strings(&bcc_ptr, nls_cp);
866 } else {
878 cERROR(1, "secType %d not supported!", type); 867 cERROR(1, "secType %d not supported!", type);
879 rc = -ENOSYS; 868 rc = -ENOSYS;
880 goto ssetup_exit; 869 goto ssetup_exit;
881#endif
882 } 870 }
883 871
884 iov[2].iov_base = str_area; 872 iov[2].iov_base = str_area;
@@ -887,10 +875,10 @@ ssetup_ntlmssp_authenticate:
887 count = iov[1].iov_len + iov[2].iov_len; 875 count = iov[1].iov_len + iov[2].iov_len;
888 smb_buf->smb_buf_length += count; 876 smb_buf->smb_buf_length += count;
889 877
890 BCC_LE(smb_buf) = cpu_to_le16(count); 878 put_bcc_le(count, smb_buf);
891 879
892 rc = SendReceive2(xid, ses, iov, 3 /* num_iovecs */, &resp_buf_type, 880 rc = SendReceive2(xid, ses, iov, 3 /* num_iovecs */, &resp_buf_type,
893 CIFS_STD_OP /* not long */ | CIFS_LOG_ERROR); 881 CIFS_LOG_ERROR);
894 /* SMB request buf freed in SendReceive2 */ 882 /* SMB request buf freed in SendReceive2 */
895 883
896 pSMB = (SESSION_SETUP_ANDX *)iov[0].iov_base; 884 pSMB = (SESSION_SETUP_ANDX *)iov[0].iov_base;
@@ -921,7 +909,7 @@ ssetup_ntlmssp_authenticate:
921 cFYI(1, "UID = %d ", ses->Suid); 909 cFYI(1, "UID = %d ", ses->Suid);
922 /* response can have either 3 or 4 word count - Samba sends 3 */ 910 /* response can have either 3 or 4 word count - Samba sends 3 */
923 /* and lanman response is 3 */ 911 /* and lanman response is 3 */
924 bytes_remaining = BCC(smb_buf); 912 bytes_remaining = get_bcc(smb_buf);
925 bcc_ptr = pByteArea(smb_buf); 913 bcc_ptr = pByteArea(smb_buf);
926 914
927 if (smb_buf->WordCount == 4) { 915 if (smb_buf->WordCount == 4) {
diff --git a/fs/cifs/smbdes.c b/fs/cifs/smbdes.c
index b6b6dcb500b..04721485925 100644
--- a/fs/cifs/smbdes.c
+++ b/fs/cifs/smbdes.c
@@ -45,7 +45,6 @@
45 up with a different answer to the one above) 45 up with a different answer to the one above)
46*/ 46*/
47#include <linux/slab.h> 47#include <linux/slab.h>
48#include "cifsencrypt.h"
49#define uchar unsigned char 48#define uchar unsigned char
50 49
51static uchar perm1[56] = { 57, 49, 41, 33, 25, 17, 9, 50static uchar perm1[56] = { 57, 49, 41, 33, 25, 17, 9,
diff --git a/fs/cifs/smbencrypt.c b/fs/cifs/smbencrypt.c
index 192ea51af20..b5450e9f40c 100644
--- a/fs/cifs/smbencrypt.c
+++ b/fs/cifs/smbencrypt.c
@@ -32,9 +32,8 @@
32#include "cifs_unicode.h" 32#include "cifs_unicode.h"
33#include "cifspdu.h" 33#include "cifspdu.h"
34#include "cifsglob.h" 34#include "cifsglob.h"
35#include "md5.h"
36#include "cifs_debug.h" 35#include "cifs_debug.h"
37#include "cifsencrypt.h" 36#include "cifsproto.h"
38 37
39#ifndef false 38#ifndef false
40#define false 0 39#define false 0
@@ -48,14 +47,57 @@
48#define SSVALX(buf,pos,val) (CVAL(buf,pos)=(val)&0xFF,CVAL(buf,pos+1)=(val)>>8) 47#define SSVALX(buf,pos,val) (CVAL(buf,pos)=(val)&0xFF,CVAL(buf,pos+1)=(val)>>8)
49#define SSVAL(buf,pos,val) SSVALX((buf),(pos),((__u16)(val))) 48#define SSVAL(buf,pos,val) SSVALX((buf),(pos),((__u16)(val)))
50 49
51/*The following definitions come from libsmb/smbencrypt.c */ 50/* produce a md4 message digest from data of length n bytes */
51int
52mdfour(unsigned char *md4_hash, unsigned char *link_str, int link_len)
53{
54 int rc;
55 unsigned int size;
56 struct crypto_shash *md4;
57 struct sdesc *sdescmd4;
58
59 md4 = crypto_alloc_shash("md4", 0, 0);
60 if (IS_ERR(md4)) {
61 cERROR(1, "%s: Crypto md4 allocation error %d\n", __func__, rc);
62 return PTR_ERR(md4);
63 }
64 size = sizeof(struct shash_desc) + crypto_shash_descsize(md4);
65 sdescmd4 = kmalloc(size, GFP_KERNEL);
66 if (!sdescmd4) {
67 rc = -ENOMEM;
68 cERROR(1, "%s: Memory allocation failure\n", __func__);
69 goto mdfour_err;
70 }
71 sdescmd4->shash.tfm = md4;
72 sdescmd4->shash.flags = 0x0;
73
74 rc = crypto_shash_init(&sdescmd4->shash);
75 if (rc) {
76 cERROR(1, "%s: Could not init md4 shash\n", __func__);
77 goto mdfour_err;
78 }
79 crypto_shash_update(&sdescmd4->shash, link_str, link_len);
80 rc = crypto_shash_final(&sdescmd4->shash, md4_hash);
52 81
53void SMBencrypt(unsigned char *passwd, const unsigned char *c8, 82mdfour_err:
54 unsigned char *p24); 83 crypto_free_shash(md4);
55void E_md4hash(const unsigned char *passwd, unsigned char *p16); 84 kfree(sdescmd4);
56static void SMBOWFencrypt(unsigned char passwd[16], const unsigned char *c8, 85
57 unsigned char p24[24]); 86 return rc;
58void SMBNTencrypt(unsigned char *passwd, unsigned char *c8, unsigned char *p24); 87}
88
/*
 * DES-encrypt the challenge c8 with the NT or LM MD4 hash in passwd,
 * writing the result to p24.
 */
static void
SMBOWFencrypt(unsigned char passwd[16], const unsigned char *c8,
	      unsigned char p24[24])
{
	/* the 16 byte hash followed by zero padding up to 21 bytes */
	unsigned char p21[21] = { 0 };

	memcpy(p21, passwd, 16);
	E_P24(p21, c8, p24);
}
59 101
60/* 102/*
61 This implements the X/Open SMB password encryption 103 This implements the X/Open SMB password encryption
@@ -118,9 +160,10 @@ _my_mbstowcs(__u16 *dst, const unsigned char *src, int len)
118 * Creates the MD4 Hash of the users password in NT UNICODE. 160 * Creates the MD4 Hash of the users password in NT UNICODE.
119 */ 161 */
120 162
121void 163int
122E_md4hash(const unsigned char *passwd, unsigned char *p16) 164E_md4hash(const unsigned char *passwd, unsigned char *p16)
123{ 165{
166 int rc;
124 int len; 167 int len;
125 __u16 wpwd[129]; 168 __u16 wpwd[129];
126 169
@@ -139,8 +182,10 @@ E_md4hash(const unsigned char *passwd, unsigned char *p16)
139 /* Calculate length in bytes */ 182 /* Calculate length in bytes */
140 len = _my_wcslen(wpwd) * sizeof(__u16); 183 len = _my_wcslen(wpwd) * sizeof(__u16);
141 184
142 mdfour(p16, (unsigned char *) wpwd, len); 185 rc = mdfour(p16, (unsigned char *) wpwd, len);
143 memset(wpwd, 0, 129 * 2); 186 memset(wpwd, 0, 129 * 2);
187
188 return rc;
144} 189}
145 190
146#if 0 /* currently unused */ 191#if 0 /* currently unused */
@@ -212,19 +257,6 @@ ntv2_owf_gen(const unsigned char owf[16], const char *user_n,
212} 257}
213#endif 258#endif
214 259
215/* Does the des encryption from the NT or LM MD4 hash. */
216static void
217SMBOWFencrypt(unsigned char passwd[16], const unsigned char *c8,
218 unsigned char p24[24])
219{
220 unsigned char p21[21];
221
222 memset(p21, '\0', 21);
223
224 memcpy(p21, passwd, 16);
225 E_P24(p21, c8, p24);
226}
227
228/* Does the des encryption from the FIRST 8 BYTES of the NT or LM MD4 hash. */ 260/* Does the des encryption from the FIRST 8 BYTES of the NT or LM MD4 hash. */
229#if 0 /* currently unused */ 261#if 0 /* currently unused */
230static void 262static void
@@ -242,16 +274,21 @@ NTLMSSPOWFencrypt(unsigned char passwd[8],
242#endif 274#endif
243 275
244/* Does the NT MD4 hash then des encryption. */ 276/* Does the NT MD4 hash then des encryption. */
245 277int
246void
247SMBNTencrypt(unsigned char *passwd, unsigned char *c8, unsigned char *p24) 278SMBNTencrypt(unsigned char *passwd, unsigned char *c8, unsigned char *p24)
248{ 279{
280 int rc;
249 unsigned char p21[21]; 281 unsigned char p21[21];
250 282
251 memset(p21, '\0', 21); 283 memset(p21, '\0', 21);
252 284
253 E_md4hash(passwd, p21); 285 rc = E_md4hash(passwd, p21);
286 if (rc) {
287 cFYI(1, "%s Can't generate NT hash, error: %d", __func__, rc);
288 return rc;
289 }
254 SMBOWFencrypt(p21, c8, p24); 290 SMBOWFencrypt(p21, c8, p24);
291 return rc;
255} 292}
256 293
257 294
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index e0588cdf4cc..c1ccca1a933 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -36,7 +36,13 @@
36 36
37extern mempool_t *cifs_mid_poolp; 37extern mempool_t *cifs_mid_poolp;
38 38
39static struct mid_q_entry * 39static void
40wake_up_task(struct mid_q_entry *mid)
41{
42 wake_up_process(mid->callback_data);
43}
44
45struct mid_q_entry *
40AllocMidQEntry(const struct smb_hdr *smb_buffer, struct TCP_Server_Info *server) 46AllocMidQEntry(const struct smb_hdr *smb_buffer, struct TCP_Server_Info *server)
41{ 47{
42 struct mid_q_entry *temp; 48 struct mid_q_entry *temp;
@@ -58,28 +64,28 @@ AllocMidQEntry(const struct smb_hdr *smb_buffer, struct TCP_Server_Info *server)
58 /* do_gettimeofday(&temp->when_sent);*/ /* easier to use jiffies */ 64 /* do_gettimeofday(&temp->when_sent);*/ /* easier to use jiffies */
59 /* when mid allocated can be before when sent */ 65 /* when mid allocated can be before when sent */
60 temp->when_alloc = jiffies; 66 temp->when_alloc = jiffies;
61 temp->tsk = current; 67
68 /*
69 * The default is for the mid to be synchronous, so the
70 * default callback just wakes up the current task.
71 */
72 temp->callback = wake_up_task;
73 temp->callback_data = current;
62 } 74 }
63 75
64 spin_lock(&GlobalMid_Lock);
65 list_add_tail(&temp->qhead, &server->pending_mid_q);
66 atomic_inc(&midCount); 76 atomic_inc(&midCount);
67 temp->midState = MID_REQUEST_ALLOCATED; 77 temp->midState = MID_REQUEST_ALLOCATED;
68 spin_unlock(&GlobalMid_Lock);
69 return temp; 78 return temp;
70} 79}
71 80
72static void 81void
73DeleteMidQEntry(struct mid_q_entry *midEntry) 82DeleteMidQEntry(struct mid_q_entry *midEntry)
74{ 83{
75#ifdef CONFIG_CIFS_STATS2 84#ifdef CONFIG_CIFS_STATS2
76 unsigned long now; 85 unsigned long now;
77#endif 86#endif
78 spin_lock(&GlobalMid_Lock);
79 midEntry->midState = MID_FREE; 87 midEntry->midState = MID_FREE;
80 list_del(&midEntry->qhead);
81 atomic_dec(&midCount); 88 atomic_dec(&midCount);
82 spin_unlock(&GlobalMid_Lock);
83 if (midEntry->largeBuf) 89 if (midEntry->largeBuf)
84 cifs_buf_release(midEntry->resp_buf); 90 cifs_buf_release(midEntry->resp_buf);
85 else 91 else
@@ -103,6 +109,16 @@ DeleteMidQEntry(struct mid_q_entry *midEntry)
103 mempool_free(midEntry, cifs_mid_poolp); 109 mempool_free(midEntry, cifs_mid_poolp);
104} 110}
105 111
/*
 * Unlink a mid from the list it is queued on and then release it.
 *
 * The list removal is done under GlobalMid_Lock; the lock is dropped
 * before the entry is handed to DeleteMidQEntry().
 */
112static void
113delete_mid(struct mid_q_entry *mid)
114{
115 spin_lock(&GlobalMid_Lock);
116 list_del(&mid->qhead);
117 spin_unlock(&GlobalMid_Lock);
118
119 DeleteMidQEntry(mid);
120}
121
106static int 122static int
107smb_sendv(struct TCP_Server_Info *server, struct kvec *iov, int n_vec) 123smb_sendv(struct TCP_Server_Info *server, struct kvec *iov, int n_vec)
108{ 124{
@@ -119,7 +135,7 @@ smb_sendv(struct TCP_Server_Info *server, struct kvec *iov, int n_vec)
119 if (ssocket == NULL) 135 if (ssocket == NULL)
120 return -ENOTSOCK; /* BB eventually add reconnect code here */ 136 return -ENOTSOCK; /* BB eventually add reconnect code here */
121 137
122 smb_msg.msg_name = (struct sockaddr *) &server->addr.sockAddr; 138 smb_msg.msg_name = (struct sockaddr *) &server->dstaddr;
123 smb_msg.msg_namelen = sizeof(struct sockaddr); 139 smb_msg.msg_namelen = sizeof(struct sockaddr);
124 smb_msg.msg_control = NULL; 140 smb_msg.msg_control = NULL;
125 smb_msg.msg_controllen = 0; 141 smb_msg.msg_controllen = 0;
@@ -244,31 +260,31 @@ smb_send(struct TCP_Server_Info *server, struct smb_hdr *smb_buffer,
244 return smb_sendv(server, &iov, 1); 260 return smb_sendv(server, &iov, 1);
245} 261}
246 262
247static int wait_for_free_request(struct cifsSesInfo *ses, const int long_op) 263static int wait_for_free_request(struct TCP_Server_Info *server,
264 const int long_op)
248{ 265{
249 if (long_op == CIFS_ASYNC_OP) { 266 if (long_op == CIFS_ASYNC_OP) {
250 /* oplock breaks must not be held up */ 267 /* oplock breaks must not be held up */
251 atomic_inc(&ses->server->inFlight); 268 atomic_inc(&server->inFlight);
252 return 0; 269 return 0;
253 } 270 }
254 271
255 spin_lock(&GlobalMid_Lock); 272 spin_lock(&GlobalMid_Lock);
256 while (1) { 273 while (1) {
257 if (atomic_read(&ses->server->inFlight) >= 274 if (atomic_read(&server->inFlight) >= cifs_max_pending) {
258 cifs_max_pending){
259 spin_unlock(&GlobalMid_Lock); 275 spin_unlock(&GlobalMid_Lock);
260#ifdef CONFIG_CIFS_STATS2 276#ifdef CONFIG_CIFS_STATS2
261 atomic_inc(&ses->server->num_waiters); 277 atomic_inc(&server->num_waiters);
262#endif 278#endif
263 wait_event(ses->server->request_q, 279 wait_event(server->request_q,
264 atomic_read(&ses->server->inFlight) 280 atomic_read(&server->inFlight)
265 < cifs_max_pending); 281 < cifs_max_pending);
266#ifdef CONFIG_CIFS_STATS2 282#ifdef CONFIG_CIFS_STATS2
267 atomic_dec(&ses->server->num_waiters); 283 atomic_dec(&server->num_waiters);
268#endif 284#endif
269 spin_lock(&GlobalMid_Lock); 285 spin_lock(&GlobalMid_Lock);
270 } else { 286 } else {
271 if (ses->server->tcpStatus == CifsExiting) { 287 if (server->tcpStatus == CifsExiting) {
272 spin_unlock(&GlobalMid_Lock); 288 spin_unlock(&GlobalMid_Lock);
273 return -ENOENT; 289 return -ENOENT;
274 } 290 }
@@ -278,7 +294,7 @@ static int wait_for_free_request(struct cifsSesInfo *ses, const int long_op)
278 294
279 /* update # of requests on the wire to server */ 295 /* update # of requests on the wire to server */
280 if (long_op != CIFS_BLOCKING_OP) 296 if (long_op != CIFS_BLOCKING_OP)
281 atomic_inc(&ses->server->inFlight); 297 atomic_inc(&server->inFlight);
282 spin_unlock(&GlobalMid_Lock); 298 spin_unlock(&GlobalMid_Lock);
283 break; 299 break;
284 } 300 }
@@ -308,53 +324,81 @@ static int allocate_mid(struct cifsSesInfo *ses, struct smb_hdr *in_buf,
308 *ppmidQ = AllocMidQEntry(in_buf, ses->server); 324 *ppmidQ = AllocMidQEntry(in_buf, ses->server);
309 if (*ppmidQ == NULL) 325 if (*ppmidQ == NULL)
310 return -ENOMEM; 326 return -ENOMEM;
327 spin_lock(&GlobalMid_Lock);
328 list_add_tail(&(*ppmidQ)->qhead, &ses->server->pending_mid_q);
329 spin_unlock(&GlobalMid_Lock);
311 return 0; 330 return 0;
312} 331}
313 332
314static int wait_for_response(struct cifsSesInfo *ses, 333static int
315 struct mid_q_entry *midQ, 334wait_for_response(struct TCP_Server_Info *server, struct mid_q_entry *midQ)
316 unsigned long timeout,
317 unsigned long time_to_wait)
318{ 335{
319 unsigned long curr_timeout; 336 int error;
320 337
321 for (;;) { 338 error = wait_event_killable(server->response_q,
322 curr_timeout = timeout + jiffies; 339 midQ->midState != MID_REQUEST_SUBMITTED);
323 wait_event_timeout(ses->server->response_q, 340 if (error < 0)
324 midQ->midState != MID_REQUEST_SUBMITTED, timeout); 341 return -ERESTARTSYS;
325 342
326 if (time_after(jiffies, curr_timeout) && 343 return 0;
327 (midQ->midState == MID_REQUEST_SUBMITTED) && 344}
328 ((ses->server->tcpStatus == CifsGood) ||
329 (ses->server->tcpStatus == CifsNew))) {
330 345
331 unsigned long lrt;
332 346
333 /* We timed out. Is the server still 347/*
334 sending replies ? */ 348 * Send a SMB request and set the callback function in the mid to handle
335 spin_lock(&GlobalMid_Lock); 349 * the result. Caller is responsible for dealing with timeouts.
336 lrt = ses->server->lstrp; 350 */
337 spin_unlock(&GlobalMid_Lock); 351int
352cifs_call_async(struct TCP_Server_Info *server, struct smb_hdr *in_buf,
353 mid_callback_t *callback, void *cbdata)
354{
355 int rc;
356 struct mid_q_entry *mid;
338 357
339 /* Calculate time_to_wait past last receive time. 358 rc = wait_for_free_request(server, CIFS_ASYNC_OP);
340 Although we prefer not to time out if the 359 if (rc)
341 server is still responding - we will time 360 return rc;
342 out if the server takes more than 15 (or 45 361
343 or 180) seconds to respond to this request 362 mutex_lock(&server->srv_mutex);
344 and has not responded to any request from 363 mid = AllocMidQEntry(in_buf, server);
345 other threads on the client within 10 seconds */ 364 if (mid == NULL) {
346 lrt += time_to_wait; 365 mutex_unlock(&server->srv_mutex);
347 if (time_after(jiffies, lrt)) { 366 return -ENOMEM;
348 /* No replies for time_to_wait. */
349 cERROR(1, "server not responding");
350 return -1;
351 }
352 } else {
353 return 0;
354 }
355 } 367 }
356}
357 368
369 /* put it on the pending_mid_q */
370 spin_lock(&GlobalMid_Lock);
371 list_add_tail(&mid->qhead, &server->pending_mid_q);
372 spin_unlock(&GlobalMid_Lock);
373
374 rc = cifs_sign_smb(in_buf, server, &mid->sequence_number);
375 if (rc) {
376 mutex_unlock(&server->srv_mutex);
377 goto out_err;
378 }
379
380 mid->callback = callback;
381 mid->callback_data = cbdata;
382 mid->midState = MID_REQUEST_SUBMITTED;
383#ifdef CONFIG_CIFS_STATS2
384 atomic_inc(&server->inSend);
385#endif
386 rc = smb_send(server, in_buf, in_buf->smb_buf_length);
387#ifdef CONFIG_CIFS_STATS2
388 atomic_dec(&server->inSend);
389 mid->when_sent = jiffies;
390#endif
391 mutex_unlock(&server->srv_mutex);
392 if (rc)
393 goto out_err;
394
395 return rc;
396out_err:
397 delete_mid(mid);
398 atomic_dec(&server->inFlight);
399 wake_up(&server->request_q);
400 return rc;
401}
358 402
359/* 403/*
360 * 404 *
@@ -382,6 +426,81 @@ SendReceiveNoRsp(const unsigned int xid, struct cifsSesInfo *ses,
382 return rc; 426 return rc;
383} 427}
384 428
429static int
430sync_mid_result(struct mid_q_entry *mid, struct TCP_Server_Info *server)
431{
432 int rc = 0;
433
434 cFYI(1, "%s: cmd=%d mid=%d state=%d", __func__, mid->command,
435 mid->mid, mid->midState);
436
437 spin_lock(&GlobalMid_Lock);
438 /* ensure that it's no longer on the pending_mid_q */
439 list_del_init(&mid->qhead);
440
441 switch (mid->midState) {
442 case MID_RESPONSE_RECEIVED:
443 spin_unlock(&GlobalMid_Lock);
444 return rc;
445 case MID_REQUEST_SUBMITTED:
446 /* socket is going down, reject all calls */
447 if (server->tcpStatus == CifsExiting) {
448 cERROR(1, "%s: canceling mid=%d cmd=0x%x state=%d",
449 __func__, mid->mid, mid->command, mid->midState);
450 rc = -EHOSTDOWN;
451 break;
452 }
453 case MID_RETRY_NEEDED:
454 rc = -EAGAIN;
455 break;
456 default:
457 cERROR(1, "%s: invalid mid state mid=%d state=%d", __func__,
458 mid->mid, mid->midState);
459 rc = -EIO;
460 }
461 spin_unlock(&GlobalMid_Lock);
462
463 DeleteMidQEntry(mid);
464 return rc;
465}
466
467/*
468 * An NT cancel request header looks just like the original request except:
469 *
470 * The Command is SMB_COM_NT_CANCEL
471 * The WordCount is zeroed out
472 * The ByteCount is zeroed out
473 *
474 * This function mangles an existing request buffer into a
475 * SMB_COM_NT_CANCEL request and then sends it.
476 */
477static int
478send_nt_cancel(struct TCP_Server_Info *server, struct smb_hdr *in_buf,
479 struct mid_q_entry *mid)
480{
481 int rc = 0;
482
483 /* -4 for RFC1001 length and +2 for BCC field */
484 in_buf->smb_buf_length = sizeof(struct smb_hdr) - 4 + 2;
485 in_buf->Command = SMB_COM_NT_CANCEL;
486 in_buf->WordCount = 0;
487 put_bcc_le(0, in_buf);
488
489 mutex_lock(&server->srv_mutex);
490 rc = cifs_sign_smb(in_buf, server, &mid->sequence_number);
491 if (rc) {
492 mutex_unlock(&server->srv_mutex);
493 return rc;
494 }
495 rc = smb_send(server, in_buf, in_buf->smb_buf_length);
496 mutex_unlock(&server->srv_mutex);
497
498 cFYI(1, "issued NT_CANCEL for mid %u, rc = %d",
499 in_buf->Mid, rc);
500
501 return rc;
502}
503
385int 504int
386SendReceive2(const unsigned int xid, struct cifsSesInfo *ses, 505SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
387 struct kvec *iov, int n_vec, int *pRespBufType /* ret */, 506 struct kvec *iov, int n_vec, int *pRespBufType /* ret */,
@@ -390,7 +509,6 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
390 int rc = 0; 509 int rc = 0;
391 int long_op; 510 int long_op;
392 unsigned int receive_len; 511 unsigned int receive_len;
393 unsigned long timeout;
394 struct mid_q_entry *midQ; 512 struct mid_q_entry *midQ;
395 struct smb_hdr *in_buf = iov[0].iov_base; 513 struct smb_hdr *in_buf = iov[0].iov_base;
396 514
@@ -413,7 +531,7 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
413 to the same server. We may make this configurable later or 531 to the same server. We may make this configurable later or
414 use ses->maxReq */ 532 use ses->maxReq */
415 533
416 rc = wait_for_free_request(ses, long_op); 534 rc = wait_for_free_request(ses->server, long_op);
417 if (rc) { 535 if (rc) {
418 cifs_small_buf_release(in_buf); 536 cifs_small_buf_release(in_buf);
419 return rc; 537 return rc;
@@ -457,65 +575,20 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
457 if (rc < 0) 575 if (rc < 0)
458 goto out; 576 goto out;
459 577
460 if (long_op == CIFS_STD_OP) 578 if (long_op == CIFS_ASYNC_OP)
461 timeout = 15 * HZ;
462 else if (long_op == CIFS_VLONG_OP) /* e.g. slow writes past EOF */
463 timeout = 180 * HZ;
464 else if (long_op == CIFS_LONG_OP)
465 timeout = 45 * HZ; /* should be greater than
466 servers oplock break timeout (about 43 seconds) */
467 else if (long_op == CIFS_ASYNC_OP)
468 goto out; 579 goto out;
469 else if (long_op == CIFS_BLOCKING_OP)
470 timeout = 0x7FFFFFFF; /* large, but not so large as to wrap */
471 else {
472 cERROR(1, "unknown timeout flag %d", long_op);
473 rc = -EIO;
474 goto out;
475 }
476
477 /* wait for 15 seconds or until woken up due to response arriving or
478 due to last connection to this server being unmounted */
479 if (signal_pending(current)) {
480 /* if signal pending do not hold up user for full smb timeout
481 but we still give response a chance to complete */
482 timeout = 2 * HZ;
483 }
484
485 /* No user interrupts in wait - wreaks havoc with performance */
486 wait_for_response(ses, midQ, timeout, 10 * HZ);
487
488 spin_lock(&GlobalMid_Lock);
489 580
490 if (midQ->resp_buf == NULL) { 581 rc = wait_for_response(ses->server, midQ);
491 cERROR(1, "No response to cmd %d mid %d", 582 if (rc != 0)
492 midQ->command, midQ->mid); 583 goto out;
493 if (midQ->midState == MID_REQUEST_SUBMITTED) {
494 if (ses->server->tcpStatus == CifsExiting)
495 rc = -EHOSTDOWN;
496 else {
497 ses->server->tcpStatus = CifsNeedReconnect;
498 midQ->midState = MID_RETRY_NEEDED;
499 }
500 }
501 584
502 if (rc != -EHOSTDOWN) { 585 rc = sync_mid_result(midQ, ses->server);
503 if (midQ->midState == MID_RETRY_NEEDED) { 586 if (rc != 0) {
504 rc = -EAGAIN;
505 cFYI(1, "marking request for retry");
506 } else {
507 rc = -EIO;
508 }
509 }
510 spin_unlock(&GlobalMid_Lock);
511 DeleteMidQEntry(midQ);
512 /* Update # of requests on wire to server */
513 atomic_dec(&ses->server->inFlight); 587 atomic_dec(&ses->server->inFlight);
514 wake_up(&ses->server->request_q); 588 wake_up(&ses->server->request_q);
515 return rc; 589 return rc;
516 } 590 }
517 591
518 spin_unlock(&GlobalMid_Lock);
519 receive_len = midQ->resp_buf->smb_buf_length; 592 receive_len = midQ->resp_buf->smb_buf_length;
520 593
521 if (receive_len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE) { 594 if (receive_len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE) {
@@ -559,19 +632,18 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
559 if (receive_len >= sizeof(struct smb_hdr) - 4 632 if (receive_len >= sizeof(struct smb_hdr) - 4
560 /* do not count RFC1001 header */ + 633 /* do not count RFC1001 header */ +
561 (2 * midQ->resp_buf->WordCount) + 2 /* bcc */ ) 634 (2 * midQ->resp_buf->WordCount) + 2 /* bcc */ )
562 BCC(midQ->resp_buf) = 635 put_bcc(get_bcc_le(midQ->resp_buf), midQ->resp_buf);
563 le16_to_cpu(BCC_LE(midQ->resp_buf));
564 if ((flags & CIFS_NO_RESP) == 0) 636 if ((flags & CIFS_NO_RESP) == 0)
565 midQ->resp_buf = NULL; /* mark it so buf will 637 midQ->resp_buf = NULL; /* mark it so buf will
566 not be freed by 638 not be freed by
567 DeleteMidQEntry */ 639 delete_mid */
568 } else { 640 } else {
569 rc = -EIO; 641 rc = -EIO;
570 cFYI(1, "Bad MID state?"); 642 cFYI(1, "Bad MID state?");
571 } 643 }
572 644
573out: 645out:
574 DeleteMidQEntry(midQ); 646 delete_mid(midQ);
575 atomic_dec(&ses->server->inFlight); 647 atomic_dec(&ses->server->inFlight);
576 wake_up(&ses->server->request_q); 648 wake_up(&ses->server->request_q);
577 649
@@ -585,7 +657,6 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
585{ 657{
586 int rc = 0; 658 int rc = 0;
587 unsigned int receive_len; 659 unsigned int receive_len;
588 unsigned long timeout;
589 struct mid_q_entry *midQ; 660 struct mid_q_entry *midQ;
590 661
591 if (ses == NULL) { 662 if (ses == NULL) {
@@ -610,7 +681,7 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
610 return -EIO; 681 return -EIO;
611 } 682 }
612 683
613 rc = wait_for_free_request(ses, long_op); 684 rc = wait_for_free_request(ses->server, long_op);
614 if (rc) 685 if (rc)
615 return rc; 686 return rc;
616 687
@@ -649,64 +720,20 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
649 if (rc < 0) 720 if (rc < 0)
650 goto out; 721 goto out;
651 722
652 if (long_op == CIFS_STD_OP) 723 if (long_op == CIFS_ASYNC_OP)
653 timeout = 15 * HZ;
654 /* wait for 15 seconds or until woken up due to response arriving or
655 due to last connection to this server being unmounted */
656 else if (long_op == CIFS_ASYNC_OP)
657 goto out; 724 goto out;
658 else if (long_op == CIFS_VLONG_OP) /* writes past EOF can be slow */
659 timeout = 180 * HZ;
660 else if (long_op == CIFS_LONG_OP)
661 timeout = 45 * HZ; /* should be greater than
662 servers oplock break timeout (about 43 seconds) */
663 else if (long_op == CIFS_BLOCKING_OP)
664 timeout = 0x7FFFFFFF; /* large but no so large as to wrap */
665 else {
666 cERROR(1, "unknown timeout flag %d", long_op);
667 rc = -EIO;
668 goto out;
669 }
670 725
671 if (signal_pending(current)) { 726 rc = wait_for_response(ses->server, midQ);
672 /* if signal pending do not hold up user for full smb timeout 727 if (rc != 0)
673 but we still give response a chance to complete */ 728 goto out;
674 timeout = 2 * HZ;
675 }
676
677 /* No user interrupts in wait - wreaks havoc with performance */
678 wait_for_response(ses, midQ, timeout, 10 * HZ);
679
680 spin_lock(&GlobalMid_Lock);
681 if (midQ->resp_buf == NULL) {
682 cERROR(1, "No response for cmd %d mid %d",
683 midQ->command, midQ->mid);
684 if (midQ->midState == MID_REQUEST_SUBMITTED) {
685 if (ses->server->tcpStatus == CifsExiting)
686 rc = -EHOSTDOWN;
687 else {
688 ses->server->tcpStatus = CifsNeedReconnect;
689 midQ->midState = MID_RETRY_NEEDED;
690 }
691 }
692 729
693 if (rc != -EHOSTDOWN) { 730 rc = sync_mid_result(midQ, ses->server);
694 if (midQ->midState == MID_RETRY_NEEDED) { 731 if (rc != 0) {
695 rc = -EAGAIN;
696 cFYI(1, "marking request for retry");
697 } else {
698 rc = -EIO;
699 }
700 }
701 spin_unlock(&GlobalMid_Lock);
702 DeleteMidQEntry(midQ);
703 /* Update # of requests on wire to server */
704 atomic_dec(&ses->server->inFlight); 732 atomic_dec(&ses->server->inFlight);
705 wake_up(&ses->server->request_q); 733 wake_up(&ses->server->request_q);
706 return rc; 734 return rc;
707 } 735 }
708 736
709 spin_unlock(&GlobalMid_Lock);
710 receive_len = midQ->resp_buf->smb_buf_length; 737 receive_len = midQ->resp_buf->smb_buf_length;
711 738
712 if (receive_len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE) { 739 if (receive_len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE) {
@@ -748,43 +775,20 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
748 if (receive_len >= sizeof(struct smb_hdr) - 4 775 if (receive_len >= sizeof(struct smb_hdr) - 4
749 /* do not count RFC1001 header */ + 776 /* do not count RFC1001 header */ +
750 (2 * out_buf->WordCount) + 2 /* bcc */ ) 777 (2 * out_buf->WordCount) + 2 /* bcc */ )
751 BCC(out_buf) = le16_to_cpu(BCC_LE(out_buf)); 778 put_bcc(get_bcc_le(midQ->resp_buf), midQ->resp_buf);
752 } else { 779 } else {
753 rc = -EIO; 780 rc = -EIO;
754 cERROR(1, "Bad MID state?"); 781 cERROR(1, "Bad MID state?");
755 } 782 }
756 783
757out: 784out:
758 DeleteMidQEntry(midQ); 785 delete_mid(midQ);
759 atomic_dec(&ses->server->inFlight); 786 atomic_dec(&ses->server->inFlight);
760 wake_up(&ses->server->request_q); 787 wake_up(&ses->server->request_q);
761 788
762 return rc; 789 return rc;
763} 790}
764 791
765/* Send an NT_CANCEL SMB to cause the POSIX blocking lock to return. */
766
767static int
768send_nt_cancel(struct cifsTconInfo *tcon, struct smb_hdr *in_buf,
769 struct mid_q_entry *midQ)
770{
771 int rc = 0;
772 struct cifsSesInfo *ses = tcon->ses;
773 __u16 mid = in_buf->Mid;
774
775 header_assemble(in_buf, SMB_COM_NT_CANCEL, tcon, 0);
776 in_buf->Mid = mid;
777 mutex_lock(&ses->server->srv_mutex);
778 rc = cifs_sign_smb(in_buf, ses->server, &midQ->sequence_number);
779 if (rc) {
780 mutex_unlock(&ses->server->srv_mutex);
781 return rc;
782 }
783 rc = smb_send(ses->server, in_buf, in_buf->smb_buf_length);
784 mutex_unlock(&ses->server->srv_mutex);
785 return rc;
786}
787
788/* We send a LOCKINGX_CANCEL_LOCK to cause the Windows 792/* We send a LOCKINGX_CANCEL_LOCK to cause the Windows
789 blocking lock to return. */ 793 blocking lock to return. */
790 794
@@ -807,7 +811,7 @@ send_lock_cancel(const unsigned int xid, struct cifsTconInfo *tcon,
807 pSMB->hdr.Mid = GetNextMid(ses->server); 811 pSMB->hdr.Mid = GetNextMid(ses->server);
808 812
809 return SendReceive(xid, ses, in_buf, out_buf, 813 return SendReceive(xid, ses, in_buf, out_buf,
810 &bytes_returned, CIFS_STD_OP); 814 &bytes_returned, 0);
811} 815}
812 816
813int 817int
@@ -845,7 +849,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
845 return -EIO; 849 return -EIO;
846 } 850 }
847 851
848 rc = wait_for_free_request(ses, CIFS_BLOCKING_OP); 852 rc = wait_for_free_request(ses->server, CIFS_BLOCKING_OP);
849 if (rc) 853 if (rc)
850 return rc; 854 return rc;
851 855
@@ -863,7 +867,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
863 867
864 rc = cifs_sign_smb(in_buf, ses->server, &midQ->sequence_number); 868 rc = cifs_sign_smb(in_buf, ses->server, &midQ->sequence_number);
865 if (rc) { 869 if (rc) {
866 DeleteMidQEntry(midQ); 870 delete_mid(midQ);
867 mutex_unlock(&ses->server->srv_mutex); 871 mutex_unlock(&ses->server->srv_mutex);
868 return rc; 872 return rc;
869 } 873 }
@@ -880,7 +884,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
880 mutex_unlock(&ses->server->srv_mutex); 884 mutex_unlock(&ses->server->srv_mutex);
881 885
882 if (rc < 0) { 886 if (rc < 0) {
883 DeleteMidQEntry(midQ); 887 delete_mid(midQ);
884 return rc; 888 return rc;
885 } 889 }
886 890
@@ -899,10 +903,9 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
899 if (in_buf->Command == SMB_COM_TRANSACTION2) { 903 if (in_buf->Command == SMB_COM_TRANSACTION2) {
900 /* POSIX lock. We send a NT_CANCEL SMB to cause the 904 /* POSIX lock. We send a NT_CANCEL SMB to cause the
901 blocking lock to return. */ 905 blocking lock to return. */
902 906 rc = send_nt_cancel(ses->server, in_buf, midQ);
903 rc = send_nt_cancel(tcon, in_buf, midQ);
904 if (rc) { 907 if (rc) {
905 DeleteMidQEntry(midQ); 908 delete_mid(midQ);
906 return rc; 909 return rc;
907 } 910 }
908 } else { 911 } else {
@@ -914,47 +917,22 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
914 /* If we get -ENOLCK back the lock may have 917 /* If we get -ENOLCK back the lock may have
915 already been removed. Don't exit in this case. */ 918 already been removed. Don't exit in this case. */
916 if (rc && rc != -ENOLCK) { 919 if (rc && rc != -ENOLCK) {
917 DeleteMidQEntry(midQ); 920 delete_mid(midQ);
918 return rc; 921 return rc;
919 } 922 }
920 } 923 }
921 924
922 /* Wait 5 seconds for the response. */ 925 if (wait_for_response(ses->server, midQ) == 0) {
923 if (wait_for_response(ses, midQ, 5 * HZ, 5 * HZ) == 0) {
924 /* We got the response - restart system call. */ 926 /* We got the response - restart system call. */
925 rstart = 1; 927 rstart = 1;
926 } 928 }
927 } 929 }
928 930
929 spin_lock(&GlobalMid_Lock); 931 rc = sync_mid_result(midQ, ses->server);
930 if (midQ->resp_buf) { 932 if (rc != 0)
931 spin_unlock(&GlobalMid_Lock);
932 receive_len = midQ->resp_buf->smb_buf_length;
933 } else {
934 cERROR(1, "No response for cmd %d mid %d",
935 midQ->command, midQ->mid);
936 if (midQ->midState == MID_REQUEST_SUBMITTED) {
937 if (ses->server->tcpStatus == CifsExiting)
938 rc = -EHOSTDOWN;
939 else {
940 ses->server->tcpStatus = CifsNeedReconnect;
941 midQ->midState = MID_RETRY_NEEDED;
942 }
943 }
944
945 if (rc != -EHOSTDOWN) {
946 if (midQ->midState == MID_RETRY_NEEDED) {
947 rc = -EAGAIN;
948 cFYI(1, "marking request for retry");
949 } else {
950 rc = -EIO;
951 }
952 }
953 spin_unlock(&GlobalMid_Lock);
954 DeleteMidQEntry(midQ);
955 return rc; 933 return rc;
956 }
957 934
935 receive_len = midQ->resp_buf->smb_buf_length;
958 if (receive_len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE) { 936 if (receive_len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE) {
959 cERROR(1, "Frame too large received. Length: %d Xid: %d", 937 cERROR(1, "Frame too large received. Length: %d Xid: %d",
960 receive_len, xid); 938 receive_len, xid);
@@ -998,10 +976,10 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
998 if (receive_len >= sizeof(struct smb_hdr) - 4 976 if (receive_len >= sizeof(struct smb_hdr) - 4
999 /* do not count RFC1001 header */ + 977 /* do not count RFC1001 header */ +
1000 (2 * out_buf->WordCount) + 2 /* bcc */ ) 978 (2 * out_buf->WordCount) + 2 /* bcc */ )
1001 BCC(out_buf) = le16_to_cpu(BCC_LE(out_buf)); 979 put_bcc(get_bcc_le(out_buf), out_buf);
1002 980
1003out: 981out:
1004 DeleteMidQEntry(midQ); 982 delete_mid(midQ);
1005 if (rstart && rc == -EACCES) 983 if (rstart && rc == -EACCES)
1006 return -ERESTARTSYS; 984 return -ERESTARTSYS;
1007 return rc; 985 return rc;
diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c
index a264b744bb4..eae2a149160 100644
--- a/fs/cifs/xattr.c
+++ b/fs/cifs/xattr.c
@@ -30,10 +30,11 @@
30 30
31#define MAX_EA_VALUE_SIZE 65535 31#define MAX_EA_VALUE_SIZE 65535
32#define CIFS_XATTR_DOS_ATTRIB "user.DosAttrib" 32#define CIFS_XATTR_DOS_ATTRIB "user.DosAttrib"
33#define CIFS_XATTR_CIFS_ACL "system.cifs_acl"
33#define CIFS_XATTR_USER_PREFIX "user." 34#define CIFS_XATTR_USER_PREFIX "user."
34#define CIFS_XATTR_SYSTEM_PREFIX "system." 35#define CIFS_XATTR_SYSTEM_PREFIX "system."
35#define CIFS_XATTR_OS2_PREFIX "os2." 36#define CIFS_XATTR_OS2_PREFIX "os2."
36#define CIFS_XATTR_SECURITY_PREFIX ".security" 37#define CIFS_XATTR_SECURITY_PREFIX "security."
37#define CIFS_XATTR_TRUSTED_PREFIX "trusted." 38#define CIFS_XATTR_TRUSTED_PREFIX "trusted."
38#define XATTR_TRUSTED_PREFIX_LEN 8 39#define XATTR_TRUSTED_PREFIX_LEN 8
39#define XATTR_SECURITY_PREFIX_LEN 9 40#define XATTR_SECURITY_PREFIX_LEN 9
@@ -277,29 +278,8 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
277 cifs_sb->local_nls, 278 cifs_sb->local_nls,
278 cifs_sb->mnt_cifs_flags & 279 cifs_sb->mnt_cifs_flags &
279 CIFS_MOUNT_MAP_SPECIAL_CHR); 280 CIFS_MOUNT_MAP_SPECIAL_CHR);
280#ifdef CONFIG_CIFS_EXPERIMENTAL
281 else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) {
282 __u16 fid;
283 int oplock = 0;
284 struct cifs_ntsd *pacl = NULL;
285 __u32 buflen = 0;
286 if (experimEnabled)
287 rc = CIFSSMBOpen(xid, pTcon, full_path,
288 FILE_OPEN, GENERIC_READ, 0, &fid,
289 &oplock, NULL, cifs_sb->local_nls,
290 cifs_sb->mnt_cifs_flags &
291 CIFS_MOUNT_MAP_SPECIAL_CHR);
292 /* else rc is EOPNOTSUPP from above */
293
294 if (rc == 0) {
295 rc = CIFSSMBGetCIFSACL(xid, pTcon, fid, &pacl,
296 &buflen);
297 CIFSSMBClose(xid, pTcon, fid);
298 }
299 }
300#endif /* EXPERIMENTAL */
301#else 281#else
302 cFYI(1, "query POSIX ACL not supported yet"); 282 cFYI(1, "Query POSIX ACL not supported yet");
303#endif /* CONFIG_CIFS_POSIX */ 283#endif /* CONFIG_CIFS_POSIX */
304 } else if (strncmp(ea_name, POSIX_ACL_XATTR_DEFAULT, 284 } else if (strncmp(ea_name, POSIX_ACL_XATTR_DEFAULT,
305 strlen(POSIX_ACL_XATTR_DEFAULT)) == 0) { 285 strlen(POSIX_ACL_XATTR_DEFAULT)) == 0) {
@@ -311,8 +291,33 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
311 cifs_sb->mnt_cifs_flags & 291 cifs_sb->mnt_cifs_flags &
312 CIFS_MOUNT_MAP_SPECIAL_CHR); 292 CIFS_MOUNT_MAP_SPECIAL_CHR);
313#else 293#else
314 cFYI(1, "query POSIX default ACL not supported yet"); 294 cFYI(1, "Query POSIX default ACL not supported yet");
315#endif 295#endif /* CONFIG_CIFS_POSIX */
296 } else if (strncmp(ea_name, CIFS_XATTR_CIFS_ACL,
297 strlen(CIFS_XATTR_CIFS_ACL)) == 0) {
298#ifdef CONFIG_CIFS_ACL
299 u32 acllen;
300 struct cifs_ntsd *pacl;
301
302 pacl = get_cifs_acl(cifs_sb, direntry->d_inode,
303 full_path, &acllen);
304 if (IS_ERR(pacl)) {
305 rc = PTR_ERR(pacl);
306 cERROR(1, "%s: error %zd getting sec desc",
307 __func__, rc);
308 } else {
309 if (ea_value) {
310 if (acllen > buf_size)
311 acllen = -ERANGE;
312 else
313 memcpy(ea_value, pacl, acllen);
314 }
315 rc = acllen;
316 kfree(pacl);
317 }
318#else
319 cFYI(1, "Query CIFS ACL not supported yet");
320#endif /* CONFIG_CIFS_ACL */
316 } else if (strncmp(ea_name, 321 } else if (strncmp(ea_name,
317 CIFS_XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) == 0) { 322 CIFS_XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) == 0) {
318 cFYI(1, "Trusted xattr namespace not supported yet"); 323 cFYI(1, "Trusted xattr namespace not supported yet");
diff --git a/fs/coda/cache.c b/fs/coda/cache.c
index 9060f08e70c..69015787618 100644
--- a/fs/coda/cache.c
+++ b/fs/coda/cache.c
@@ -20,10 +20,9 @@
20#include <linux/spinlock.h> 20#include <linux/spinlock.h>
21 21
22#include <linux/coda.h> 22#include <linux/coda.h>
23#include <linux/coda_linux.h>
24#include <linux/coda_psdev.h> 23#include <linux/coda_psdev.h>
25#include <linux/coda_fs_i.h> 24#include "coda_linux.h"
26#include <linux/coda_cache.h> 25#include "coda_cache.h"
27 26
28static atomic_t permission_epoch = ATOMIC_INIT(0); 27static atomic_t permission_epoch = ATOMIC_INIT(0);
29 28
@@ -93,7 +92,7 @@ static void coda_flag_children(struct dentry *parent, int flag)
93 struct list_head *child; 92 struct list_head *child;
94 struct dentry *de; 93 struct dentry *de;
95 94
96 spin_lock(&dcache_lock); 95 spin_lock(&parent->d_lock);
97 list_for_each(child, &parent->d_subdirs) 96 list_for_each(child, &parent->d_subdirs)
98 { 97 {
99 de = list_entry(child, struct dentry, d_u.d_child); 98 de = list_entry(child, struct dentry, d_u.d_child);
@@ -102,7 +101,7 @@ static void coda_flag_children(struct dentry *parent, int flag)
102 continue; 101 continue;
103 coda_flag_inode(de->d_inode, flag); 102 coda_flag_inode(de->d_inode, flag);
104 } 103 }
105 spin_unlock(&dcache_lock); 104 spin_unlock(&parent->d_lock);
106 return; 105 return;
107} 106}
108 107
diff --git a/fs/coda/cnode.c b/fs/coda/cnode.c
index 602240569c8..6475877b076 100644
--- a/fs/coda/cnode.c
+++ b/fs/coda/cnode.c
@@ -7,9 +7,8 @@
7#include <linux/time.h> 7#include <linux/time.h>
8 8
9#include <linux/coda.h> 9#include <linux/coda.h>
10#include <linux/coda_linux.h>
11#include <linux/coda_fs_i.h>
12#include <linux/coda_psdev.h> 10#include <linux/coda_psdev.h>
11#include "coda_linux.h"
13 12
14static inline int coda_fideq(struct CodaFid *fid1, struct CodaFid *fid2) 13static inline int coda_fideq(struct CodaFid *fid1, struct CodaFid *fid2)
15{ 14{
diff --git a/fs/coda/coda_cache.h b/fs/coda/coda_cache.h
new file mode 100644
index 00000000000..c910b5eb1ce
--- /dev/null
+++ b/fs/coda/coda_cache.h
@@ -0,0 +1,22 @@
1/* Coda filesystem -- Linux Minicache
2 *
3 * Copyright (C) 1989 - 1997 Carnegie Mellon University
4 *
5 * Carnegie Mellon University encourages users of this software to
6 * contribute improvements to the Coda project. Contact Peter Braam
7 * <coda@cs.cmu.edu>
8 */
9
10#ifndef _CFSNC_HEADER_
11#define _CFSNC_HEADER_
12
13/* credential cache */
14void coda_cache_enter(struct inode *inode, int mask);
15void coda_cache_clear_inode(struct inode *);
16void coda_cache_clear_all(struct super_block *sb);
17int coda_cache_check(struct inode *inode, int mask);
18
19/* for downcalls and attributes and lookups */
20void coda_flag_inode_children(struct inode *inode, int flag);
21
22#endif /* _CFSNC_HEADER_ */
diff --git a/fs/coda/coda_fs_i.h b/fs/coda/coda_fs_i.h
new file mode 100644
index 00000000000..e35071b1de0
--- /dev/null
+++ b/fs/coda/coda_fs_i.h
@@ -0,0 +1,58 @@
1/*
2 * coda_fs_i.h
3 *
4 * Copyright (C) 1998 Carnegie Mellon University
5 *
6 */
7
8#ifndef _LINUX_CODA_FS_I
9#define _LINUX_CODA_FS_I
10
11#include <linux/types.h>
12#include <linux/list.h>
13#include <linux/spinlock.h>
14#include <linux/coda.h>
15
16/*
17 * coda fs inode data
18 * c_lock protects accesses to c_flags, c_mapcount, c_cached_epoch, c_uid and
19 * c_cached_perm.
20 * vfs_inode is set only when the inode is created and never changes.
21 * c_fid is set when the inode is created and should be considered immutable.
22 */
23struct coda_inode_info {
24 struct CodaFid c_fid; /* Coda identifier */
25 u_short c_flags; /* flags (see below) */
26 unsigned int c_mapcount; /* nr of times this inode is mapped */
27 unsigned int c_cached_epoch; /* epoch for cached permissions */
28 vuid_t c_uid; /* fsuid for cached permissions */
29 unsigned int c_cached_perm; /* cached access permissions */
30 spinlock_t c_lock;
31 struct inode vfs_inode;
32};
33
34/*
35 * coda fs file private data
36 */
37#define CODA_MAGIC 0xC0DAC0DA
38struct coda_file_info {
39 int cfi_magic; /* magic number */
40 struct file *cfi_container; /* container file for this cnode */
41 unsigned int cfi_mapcount; /* nr of times this file is mapped */
42};
43
44#define CODA_FTOC(file) ((struct coda_file_info *)((file)->private_data))
45
46/* flags */
47#define C_VATTR 0x1 /* Validity of vattr in inode */
48#define C_FLUSH 0x2 /* used after a flush */
49#define C_DYING 0x4 /* from venus (which died) */
50#define C_PURGE 0x8
51
52int coda_cnode_make(struct inode **, struct CodaFid *, struct super_block *);
53struct inode *coda_iget(struct super_block *sb, struct CodaFid *fid, struct coda_vattr *attr);
54int coda_cnode_makectl(struct inode **inode, struct super_block *sb);
55struct inode *coda_fid_to_inode(struct CodaFid *fid, struct super_block *sb);
56void coda_replace_fid(struct inode *, struct CodaFid *, struct CodaFid *);
57
58#endif
diff --git a/fs/coda/coda_linux.c b/fs/coda/coda_linux.c
index bf4a3fd3c8e..2bdbcc11b37 100644
--- a/fs/coda/coda_linux.c
+++ b/fs/coda/coda_linux.c
@@ -17,9 +17,8 @@
17#include <linux/string.h> 17#include <linux/string.h>
18 18
19#include <linux/coda.h> 19#include <linux/coda.h>
20#include <linux/coda_linux.h>
21#include <linux/coda_psdev.h> 20#include <linux/coda_psdev.h>
22#include <linux/coda_fs_i.h> 21#include "coda_linux.h"
23 22
24/* initialize the debugging variables */ 23/* initialize the debugging variables */
25int coda_fake_statfs; 24int coda_fake_statfs;
diff --git a/fs/coda/coda_linux.h b/fs/coda/coda_linux.h
new file mode 100644
index 00000000000..9b0c5323890
--- /dev/null
+++ b/fs/coda/coda_linux.h
@@ -0,0 +1,101 @@
1/*
2 * Coda File System, Linux Kernel module
3 *
4 * Original version, adapted from cfs_mach.c, (C) Carnegie Mellon University
5 * Linux modifications (C) 1996, Peter J. Braam
6 * Rewritten for Linux 2.1 (C) 1997 Carnegie Mellon University
7 *
8 * Carnegie Mellon University encourages users of this software to
9 * contribute improvements to the Coda project.
10 */
11
12#ifndef _LINUX_CODA_FS
13#define _LINUX_CODA_FS
14
15#include <linux/kernel.h>
16#include <linux/param.h>
17#include <linux/mm.h>
18#include <linux/vmalloc.h>
19#include <linux/slab.h>
20#include <linux/wait.h>
21#include <linux/types.h>
22#include <linux/fs.h>
23#include "coda_fs_i.h"
24
25/* operations */
26extern const struct inode_operations coda_dir_inode_operations;
27extern const struct inode_operations coda_file_inode_operations;
28extern const struct inode_operations coda_ioctl_inode_operations;
29
30extern const struct dentry_operations coda_dentry_operations;
31
32extern const struct address_space_operations coda_file_aops;
33extern const struct address_space_operations coda_symlink_aops;
34
35extern const struct file_operations coda_dir_operations;
36extern const struct file_operations coda_file_operations;
37extern const struct file_operations coda_ioctl_operations;
38
39/* operations shared over more than one file */
40int coda_open(struct inode *i, struct file *f);
41int coda_release(struct inode *i, struct file *f);
42int coda_permission(struct inode *inode, int mask, unsigned int flags);
43int coda_revalidate_inode(struct dentry *);
44int coda_getattr(struct vfsmount *, struct dentry *, struct kstat *);
45int coda_setattr(struct dentry *, struct iattr *);
46
47/* this file: heloers */
48char *coda_f2s(struct CodaFid *f);
49int coda_isroot(struct inode *i);
50int coda_iscontrol(const char *name, size_t length);
51
52void coda_vattr_to_iattr(struct inode *, struct coda_vattr *);
53void coda_iattr_to_vattr(struct iattr *, struct coda_vattr *);
54unsigned short coda_flags_to_cflags(unsigned short);
55
56/* sysctl.h */
57void coda_sysctl_init(void);
58void coda_sysctl_clean(void);
59
60#define CODA_ALLOC(ptr, cast, size) do { \
61 if (size < PAGE_SIZE) \
62 ptr = kmalloc((unsigned long) size, GFP_KERNEL); \
63 else \
64 ptr = (cast)vmalloc((unsigned long) size); \
65 if (!ptr) \
66 printk("kernel malloc returns 0 at %s:%d\n", __FILE__, __LINE__); \
67 else memset( ptr, 0, size ); \
68} while (0)
69
70
71#define CODA_FREE(ptr,size) \
72 do { if (size < PAGE_SIZE) kfree((ptr)); else vfree((ptr)); } while (0)
73
74/* inode to cnode access functions */
75
76static inline struct coda_inode_info *ITOC(struct inode *inode)
77{
78 return list_entry(inode, struct coda_inode_info, vfs_inode);
79}
80
81static __inline__ struct CodaFid *coda_i2f(struct inode *inode)
82{
83 return &(ITOC(inode)->c_fid);
84}
85
86static __inline__ char *coda_i2s(struct inode *inode)
87{
88 return coda_f2s(&(ITOC(inode)->c_fid));
89}
90
91/* this will not zap the inode away */
92static __inline__ void coda_flag_inode(struct inode *inode, int flag)
93{
94 struct coda_inode_info *cii = ITOC(inode);
95
96 spin_lock(&cii->c_lock);
97 cii->c_flags |= flag;
98 spin_unlock(&cii->c_lock);
99}
100
101#endif
diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index 5d8b3553960..2b8dae4d121 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -18,14 +18,14 @@
18#include <linux/errno.h> 18#include <linux/errno.h>
19#include <linux/string.h> 19#include <linux/string.h>
20#include <linux/spinlock.h> 20#include <linux/spinlock.h>
21#include <linux/namei.h>
21 22
22#include <asm/uaccess.h> 23#include <asm/uaccess.h>
23 24
24#include <linux/coda.h> 25#include <linux/coda.h>
25#include <linux/coda_linux.h>
26#include <linux/coda_psdev.h> 26#include <linux/coda_psdev.h>
27#include <linux/coda_fs_i.h> 27#include "coda_linux.h"
28#include <linux/coda_cache.h> 28#include "coda_cache.h"
29 29
30#include "coda_int.h" 30#include "coda_int.h"
31 31
@@ -47,7 +47,7 @@ static int coda_readdir(struct file *file, void *buf, filldir_t filldir);
47 47
48/* dentry ops */ 48/* dentry ops */
49static int coda_dentry_revalidate(struct dentry *de, struct nameidata *nd); 49static int coda_dentry_revalidate(struct dentry *de, struct nameidata *nd);
50static int coda_dentry_delete(struct dentry *); 50static int coda_dentry_delete(const struct dentry *);
51 51
52/* support routines */ 52/* support routines */
53static int coda_venus_readdir(struct file *coda_file, void *buf, 53static int coda_venus_readdir(struct file *coda_file, void *buf,
@@ -60,7 +60,7 @@ static int coda_return_EIO(void)
60} 60}
61#define CODA_EIO_ERROR ((void *) (coda_return_EIO)) 61#define CODA_EIO_ERROR ((void *) (coda_return_EIO))
62 62
63static const struct dentry_operations coda_dentry_operations = 63const struct dentry_operations coda_dentry_operations =
64{ 64{
65 .d_revalidate = coda_dentry_revalidate, 65 .d_revalidate = coda_dentry_revalidate,
66 .d_delete = coda_dentry_delete, 66 .d_delete = coda_dentry_delete,
@@ -125,8 +125,6 @@ static struct dentry *coda_lookup(struct inode *dir, struct dentry *entry, struc
125 return ERR_PTR(error); 125 return ERR_PTR(error);
126 126
127exit: 127exit:
128 entry->d_op = &coda_dentry_operations;
129
130 if (inode && (type & CODA_NOCACHE)) 128 if (inode && (type & CODA_NOCACHE))
131 coda_flag_inode(inode, C_VATTR | C_PURGE); 129 coda_flag_inode(inode, C_VATTR | C_PURGE);
132 130
@@ -134,10 +132,13 @@ exit:
134} 132}
135 133
136 134
137int coda_permission(struct inode *inode, int mask) 135int coda_permission(struct inode *inode, int mask, unsigned int flags)
138{ 136{
139 int error; 137 int error;
140 138
139 if (flags & IPERM_FLAG_RCU)
140 return -ECHILD;
141
141 mask &= MAY_READ | MAY_WRITE | MAY_EXEC; 142 mask &= MAY_READ | MAY_WRITE | MAY_EXEC;
142 143
143 if (!mask) 144 if (!mask)
@@ -541,9 +542,13 @@ out:
541/* called when a cache lookup succeeds */ 542/* called when a cache lookup succeeds */
542static int coda_dentry_revalidate(struct dentry *de, struct nameidata *nd) 543static int coda_dentry_revalidate(struct dentry *de, struct nameidata *nd)
543{ 544{
544 struct inode *inode = de->d_inode; 545 struct inode *inode;
545 struct coda_inode_info *cii; 546 struct coda_inode_info *cii;
546 547
548 if (nd->flags & LOOKUP_RCU)
549 return -ECHILD;
550
551 inode = de->d_inode;
547 if (!inode || coda_isroot(inode)) 552 if (!inode || coda_isroot(inode))
548 goto out; 553 goto out;
549 if (is_bad_inode(inode)) 554 if (is_bad_inode(inode))
@@ -559,7 +564,7 @@ static int coda_dentry_revalidate(struct dentry *de, struct nameidata *nd)
559 if (cii->c_flags & C_FLUSH) 564 if (cii->c_flags & C_FLUSH)
560 coda_flag_inode_children(inode, C_FLUSH); 565 coda_flag_inode_children(inode, C_FLUSH);
561 566
562 if (atomic_read(&de->d_count) > 1) 567 if (de->d_count > 1)
563 /* pretend it's valid, but don't change the flags */ 568 /* pretend it's valid, but don't change the flags */
564 goto out; 569 goto out;
565 570
@@ -577,7 +582,7 @@ out:
577 * This is the callback from dput() when d_count is going to 0. 582 * This is the callback from dput() when d_count is going to 0.
578 * We use this to unhash dentries with bad inodes. 583 * We use this to unhash dentries with bad inodes.
579 */ 584 */
580static int coda_dentry_delete(struct dentry * dentry) 585static int coda_dentry_delete(const struct dentry * dentry)
581{ 586{
582 int flags; 587 int flags;
583 588
diff --git a/fs/coda/file.c b/fs/coda/file.c
index c8b50ba4366..0433057be33 100644
--- a/fs/coda/file.c
+++ b/fs/coda/file.c
@@ -21,10 +21,9 @@
21#include <asm/uaccess.h> 21#include <asm/uaccess.h>
22 22
23#include <linux/coda.h> 23#include <linux/coda.h>
24#include <linux/coda_linux.h>
25#include <linux/coda_fs_i.h>
26#include <linux/coda_psdev.h> 24#include <linux/coda_psdev.h>
27 25
26#include "coda_linux.h"
28#include "coda_int.h" 27#include "coda_int.h"
29 28
30static ssize_t 29static ssize_t
diff --git a/fs/coda/inode.c b/fs/coda/inode.c
index 5ea57c8c7f9..871b2771546 100644
--- a/fs/coda/inode.c
+++ b/fs/coda/inode.c
@@ -28,10 +28,9 @@
28#include <linux/vmalloc.h> 28#include <linux/vmalloc.h>
29 29
30#include <linux/coda.h> 30#include <linux/coda.h>
31#include <linux/coda_linux.h>
32#include <linux/coda_psdev.h> 31#include <linux/coda_psdev.h>
33#include <linux/coda_fs_i.h> 32#include "coda_linux.h"
34#include <linux/coda_cache.h> 33#include "coda_cache.h"
35 34
36#include "coda_int.h" 35#include "coda_int.h"
37 36
@@ -45,7 +44,7 @@ static struct kmem_cache * coda_inode_cachep;
45static struct inode *coda_alloc_inode(struct super_block *sb) 44static struct inode *coda_alloc_inode(struct super_block *sb)
46{ 45{
47 struct coda_inode_info *ei; 46 struct coda_inode_info *ei;
48 ei = (struct coda_inode_info *)kmem_cache_alloc(coda_inode_cachep, GFP_KERNEL); 47 ei = kmem_cache_alloc(coda_inode_cachep, GFP_KERNEL);
49 if (!ei) 48 if (!ei)
50 return NULL; 49 return NULL;
51 memset(&ei->c_fid, 0, sizeof(struct CodaFid)); 50 memset(&ei->c_fid, 0, sizeof(struct CodaFid));
@@ -56,11 +55,18 @@ static struct inode *coda_alloc_inode(struct super_block *sb)
56 return &ei->vfs_inode; 55 return &ei->vfs_inode;
57} 56}
58 57
59static void coda_destroy_inode(struct inode *inode) 58static void coda_i_callback(struct rcu_head *head)
60{ 59{
60 struct inode *inode = container_of(head, struct inode, i_rcu);
61 INIT_LIST_HEAD(&inode->i_dentry);
61 kmem_cache_free(coda_inode_cachep, ITOC(inode)); 62 kmem_cache_free(coda_inode_cachep, ITOC(inode));
62} 63}
63 64
65static void coda_destroy_inode(struct inode *inode)
66{
67 call_rcu(&inode->i_rcu, coda_i_callback);
68}
69
64static void init_once(void *foo) 70static void init_once(void *foo)
65{ 71{
66 struct coda_inode_info *ei = (struct coda_inode_info *) foo; 72 struct coda_inode_info *ei = (struct coda_inode_info *) foo;
@@ -186,6 +192,7 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent)
186 sb->s_blocksize_bits = 12; 192 sb->s_blocksize_bits = 12;
187 sb->s_magic = CODA_SUPER_MAGIC; 193 sb->s_magic = CODA_SUPER_MAGIC;
188 sb->s_op = &coda_super_operations; 194 sb->s_op = &coda_super_operations;
195 sb->s_d_op = &coda_dentry_operations;
189 sb->s_bdi = &vc->bdi; 196 sb->s_bdi = &vc->bdi;
190 197
191 /* get root fid from Venus: this needs the root inode */ 198 /* get root fid from Venus: this needs the root inode */
diff --git a/fs/coda/pioctl.c b/fs/coda/pioctl.c
index 2fd89b5c5c7..6cbb3afb36d 100644
--- a/fs/coda/pioctl.c
+++ b/fs/coda/pioctl.c
@@ -19,12 +19,12 @@
19#include <asm/uaccess.h> 19#include <asm/uaccess.h>
20 20
21#include <linux/coda.h> 21#include <linux/coda.h>
22#include <linux/coda_linux.h>
23#include <linux/coda_fs_i.h>
24#include <linux/coda_psdev.h> 22#include <linux/coda_psdev.h>
25 23
24#include "coda_linux.h"
25
26/* pioctl ops */ 26/* pioctl ops */
27static int coda_ioctl_permission(struct inode *inode, int mask); 27static int coda_ioctl_permission(struct inode *inode, int mask, unsigned int flags);
28static long coda_pioctl(struct file *filp, unsigned int cmd, 28static long coda_pioctl(struct file *filp, unsigned int cmd,
29 unsigned long user_data); 29 unsigned long user_data);
30 30
@@ -41,8 +41,10 @@ const struct file_operations coda_ioctl_operations = {
41}; 41};
42 42
43/* the coda pioctl inode ops */ 43/* the coda pioctl inode ops */
44static int coda_ioctl_permission(struct inode *inode, int mask) 44static int coda_ioctl_permission(struct inode *inode, int mask, unsigned int flags)
45{ 45{
46 if (flags & IPERM_FLAG_RCU)
47 return -ECHILD;
46 return (mask & MAY_EXEC) ? -EACCES : 0; 48 return (mask & MAY_EXEC) ? -EACCES : 0;
47} 49}
48 50
diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c
index 62647a8595e..8f616e0e252 100644
--- a/fs/coda/psdev.c
+++ b/fs/coda/psdev.c
@@ -43,10 +43,10 @@
43#include <asm/uaccess.h> 43#include <asm/uaccess.h>
44 44
45#include <linux/coda.h> 45#include <linux/coda.h>
46#include <linux/coda_linux.h>
47#include <linux/coda_fs_i.h>
48#include <linux/coda_psdev.h> 46#include <linux/coda_psdev.h>
49 47
48#include "coda_linux.h"
49
50#include "coda_int.h" 50#include "coda_int.h"
51 51
52/* statistics */ 52/* statistics */
diff --git a/fs/coda/symlink.c b/fs/coda/symlink.c
index af78f007a2b..ab94ef63cae 100644
--- a/fs/coda/symlink.c
+++ b/fs/coda/symlink.c
@@ -16,9 +16,9 @@
16#include <linux/pagemap.h> 16#include <linux/pagemap.h>
17 17
18#include <linux/coda.h> 18#include <linux/coda.h>
19#include <linux/coda_linux.h>
20#include <linux/coda_psdev.h> 19#include <linux/coda_psdev.h>
21#include <linux/coda_fs_i.h> 20
21#include "coda_linux.h"
22 22
23static int coda_symlink_filler(struct file *file, struct page *page) 23static int coda_symlink_filler(struct file *file, struct page *page)
24{ 24{
diff --git a/fs/coda/upcall.c b/fs/coda/upcall.c
index c3563cab975..9727e0c5257 100644
--- a/fs/coda/upcall.c
+++ b/fs/coda/upcall.c
@@ -33,10 +33,9 @@
33#include <linux/vfs.h> 33#include <linux/vfs.h>
34 34
35#include <linux/coda.h> 35#include <linux/coda.h>
36#include <linux/coda_linux.h>
37#include <linux/coda_psdev.h> 36#include <linux/coda_psdev.h>
38#include <linux/coda_fs_i.h> 37#include "coda_linux.h"
39#include <linux/coda_cache.h> 38#include "coda_cache.h"
40 39
41#include "coda_int.h" 40#include "coda_int.h"
42 41
diff --git a/fs/compat.c b/fs/compat.c
index c580c322fa6..f6fd0a00e6c 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -257,7 +257,7 @@ static int put_compat_statfs(struct compat_statfs __user *ubuf, struct kstatfs *
257} 257}
258 258
259/* 259/*
260 * The following statfs calls are copies of code from fs/open.c and 260 * The following statfs calls are copies of code from fs/statfs.c and
261 * should be checked against those from time to time 261 * should be checked against those from time to time
262 */ 262 */
263asmlinkage long compat_sys_statfs(const char __user *pathname, struct compat_statfs __user *buf) 263asmlinkage long compat_sys_statfs(const char __user *pathname, struct compat_statfs __user *buf)
@@ -320,7 +320,9 @@ static int put_compat_statfs64(struct compat_statfs64 __user *ubuf, struct kstat
320 __put_user(kbuf->f_namelen, &ubuf->f_namelen) || 320 __put_user(kbuf->f_namelen, &ubuf->f_namelen) ||
321 __put_user(kbuf->f_fsid.val[0], &ubuf->f_fsid.val[0]) || 321 __put_user(kbuf->f_fsid.val[0], &ubuf->f_fsid.val[0]) ||
322 __put_user(kbuf->f_fsid.val[1], &ubuf->f_fsid.val[1]) || 322 __put_user(kbuf->f_fsid.val[1], &ubuf->f_fsid.val[1]) ||
323 __put_user(kbuf->f_frsize, &ubuf->f_frsize)) 323 __put_user(kbuf->f_frsize, &ubuf->f_frsize) ||
324 __put_user(kbuf->f_flags, &ubuf->f_flags) ||
325 __clear_user(ubuf->f_spare, sizeof(ubuf->f_spare)))
324 return -EFAULT; 326 return -EFAULT;
325 return 0; 327 return 0;
326} 328}
@@ -597,10 +599,8 @@ ssize_t compat_rw_copy_check_uvector(int type,
597 if (nr_segs > fast_segs) { 599 if (nr_segs > fast_segs) {
598 ret = -ENOMEM; 600 ret = -ENOMEM;
599 iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL); 601 iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL);
600 if (iov == NULL) { 602 if (iov == NULL)
601 *ret_pointer = fast_pointer;
602 goto out; 603 goto out;
603 }
604 } 604 }
605 *ret_pointer = iov; 605 *ret_pointer = iov;
606 606
@@ -1350,6 +1350,10 @@ static int compat_count(compat_uptr_t __user *argv, int max)
1350 argv++; 1350 argv++;
1351 if (i++ >= max) 1351 if (i++ >= max)
1352 return -E2BIG; 1352 return -E2BIG;
1353
1354 if (fatal_signal_pending(current))
1355 return -ERESTARTNOHAND;
1356 cond_resched();
1353 } 1357 }
1354 } 1358 }
1355 return i; 1359 return i;
@@ -1391,6 +1395,12 @@ static int compat_copy_strings(int argc, compat_uptr_t __user *argv,
1391 while (len > 0) { 1395 while (len > 0) {
1392 int offset, bytes_to_copy; 1396 int offset, bytes_to_copy;
1393 1397
1398 if (fatal_signal_pending(current)) {
1399 ret = -ERESTARTNOHAND;
1400 goto out;
1401 }
1402 cond_resched();
1403
1394 offset = pos % PAGE_SIZE; 1404 offset = pos % PAGE_SIZE;
1395 if (offset == 0) 1405 if (offset == 0)
1396 offset = PAGE_SIZE; 1406 offset = PAGE_SIZE;
@@ -1407,18 +1417,8 @@ static int compat_copy_strings(int argc, compat_uptr_t __user *argv,
1407 if (!kmapped_page || kpos != (pos & PAGE_MASK)) { 1417 if (!kmapped_page || kpos != (pos & PAGE_MASK)) {
1408 struct page *page; 1418 struct page *page;
1409 1419
1410#ifdef CONFIG_STACK_GROWSUP 1420 page = get_arg_page(bprm, pos, 1);
1411 ret = expand_stack_downwards(bprm->vma, pos); 1421 if (!page) {
1412 if (ret < 0) {
1413 /* We've exceed the stack rlimit. */
1414 ret = -E2BIG;
1415 goto out;
1416 }
1417#endif
1418 ret = get_user_pages(current, bprm->mm, pos,
1419 1, 1, 1, &page, NULL);
1420 if (ret <= 0) {
1421 /* We've exceed the stack rlimit. */
1422 ret = -E2BIG; 1422 ret = -E2BIG;
1423 goto out; 1423 goto out;
1424 } 1424 }
@@ -1539,8 +1539,10 @@ int compat_do_execve(char * filename,
1539 return retval; 1539 return retval;
1540 1540
1541out: 1541out:
1542 if (bprm->mm) 1542 if (bprm->mm) {
1543 acct_arg_size(bprm, 0);
1543 mmput(bprm->mm); 1544 mmput(bprm->mm);
1545 }
1544 1546
1545out_file: 1547out_file:
1546 if (bprm->file) { 1548 if (bprm->file) {
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 410ed188faa..61abb638b4b 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -19,7 +19,6 @@
19#include <linux/compiler.h> 19#include <linux/compiler.h>
20#include <linux/sched.h> 20#include <linux/sched.h>
21#include <linux/smp.h> 21#include <linux/smp.h>
22#include <linux/smp_lock.h>
23#include <linux/ioctl.h> 22#include <linux/ioctl.h>
24#include <linux/if.h> 23#include <linux/if.h>
25#include <linux/if_bridge.h> 24#include <linux/if_bridge.h>
@@ -43,7 +42,7 @@
43#include <linux/tty.h> 42#include <linux/tty.h>
44#include <linux/vt_kern.h> 43#include <linux/vt_kern.h>
45#include <linux/fb.h> 44#include <linux/fb.h>
46#include <linux/videodev.h> 45#include <linux/videodev2.h>
47#include <linux/netdevice.h> 46#include <linux/netdevice.h>
48#include <linux/raw.h> 47#include <linux/raw.h>
49#include <linux/blkdev.h> 48#include <linux/blkdev.h>
@@ -837,6 +836,7 @@ COMPATIBLE_IOCTL(TCSETSW)
837COMPATIBLE_IOCTL(TCSETSF) 836COMPATIBLE_IOCTL(TCSETSF)
838COMPATIBLE_IOCTL(TIOCLINUX) 837COMPATIBLE_IOCTL(TIOCLINUX)
839COMPATIBLE_IOCTL(TIOCSBRK) 838COMPATIBLE_IOCTL(TIOCSBRK)
839COMPATIBLE_IOCTL(TIOCGDEV)
840COMPATIBLE_IOCTL(TIOCCBRK) 840COMPATIBLE_IOCTL(TIOCCBRK)
841COMPATIBLE_IOCTL(TIOCGSID) 841COMPATIBLE_IOCTL(TIOCGSID)
842COMPATIBLE_IOCTL(TIOCGICOUNT) 842COMPATIBLE_IOCTL(TIOCGICOUNT)
diff --git a/fs/configfs/Kconfig b/fs/configfs/Kconfig
index 13587cc97a0..9febcdefdfd 100644
--- a/fs/configfs/Kconfig
+++ b/fs/configfs/Kconfig
@@ -1,8 +1,8 @@
1config CONFIGFS_FS 1config CONFIGFS_FS
2 tristate "Userspace-driven configuration filesystem" 2 tristate "Userspace-driven configuration filesystem"
3 depends on SYSFS 3 select SYSFS
4 help 4 help
5 configfs is a ram-based filesystem that provides the converse 5 configfs is a RAM-based filesystem that provides the converse
6 of sysfs's functionality. Where sysfs is a filesystem-based 6 of sysfs's functionality. Where sysfs is a filesystem-based
7 view of kernel objects, configfs is a filesystem-based manager 7 view of kernel objects, configfs is a filesystem-based manager
8 of kernel objects, or config_items. 8 of kernel objects, or config_items.
diff --git a/fs/configfs/configfs_internal.h b/fs/configfs/configfs_internal.h
index da6061a6df4..82bda8fdfc1 100644
--- a/fs/configfs/configfs_internal.h
+++ b/fs/configfs/configfs_internal.h
@@ -90,6 +90,7 @@ extern const struct file_operations configfs_file_operations;
90extern const struct file_operations bin_fops; 90extern const struct file_operations bin_fops;
91extern const struct inode_operations configfs_dir_inode_operations; 91extern const struct inode_operations configfs_dir_inode_operations;
92extern const struct inode_operations configfs_symlink_inode_operations; 92extern const struct inode_operations configfs_symlink_inode_operations;
93extern const struct dentry_operations configfs_dentry_ops;
93 94
94extern int configfs_symlink(struct inode *dir, struct dentry *dentry, 95extern int configfs_symlink(struct inode *dir, struct dentry *dentry,
95 const char *symname); 96 const char *symname);
@@ -120,7 +121,7 @@ static inline struct config_item *configfs_get_config_item(struct dentry *dentry
120{ 121{
121 struct config_item * item = NULL; 122 struct config_item * item = NULL;
122 123
123 spin_lock(&dcache_lock); 124 spin_lock(&dentry->d_lock);
124 if (!d_unhashed(dentry)) { 125 if (!d_unhashed(dentry)) {
125 struct configfs_dirent * sd = dentry->d_fsdata; 126 struct configfs_dirent * sd = dentry->d_fsdata;
126 if (sd->s_type & CONFIGFS_ITEM_LINK) { 127 if (sd->s_type & CONFIGFS_ITEM_LINK) {
@@ -129,7 +130,7 @@ static inline struct config_item *configfs_get_config_item(struct dentry *dentry
129 } else 130 } else
130 item = config_item_get(sd->s_element); 131 item = config_item_get(sd->s_element);
131 } 132 }
132 spin_unlock(&dcache_lock); 133 spin_unlock(&dentry->d_lock);
133 134
134 return item; 135 return item;
135} 136}
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index 0b502f80c69..90ff3cb10de 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -67,12 +67,12 @@ static void configfs_d_iput(struct dentry * dentry,
67 * We _must_ delete our dentries on last dput, as the chain-to-parent 67 * We _must_ delete our dentries on last dput, as the chain-to-parent
68 * behavior is required to clear the parents of default_groups. 68 * behavior is required to clear the parents of default_groups.
69 */ 69 */
70static int configfs_d_delete(struct dentry *dentry) 70static int configfs_d_delete(const struct dentry *dentry)
71{ 71{
72 return 1; 72 return 1;
73} 73}
74 74
75static const struct dentry_operations configfs_dentry_ops = { 75const struct dentry_operations configfs_dentry_ops = {
76 .d_iput = configfs_d_iput, 76 .d_iput = configfs_d_iput,
77 /* simple_delete_dentry() isn't exported */ 77 /* simple_delete_dentry() isn't exported */
78 .d_delete = configfs_d_delete, 78 .d_delete = configfs_d_delete,
@@ -232,10 +232,8 @@ int configfs_make_dirent(struct configfs_dirent * parent_sd,
232 232
233 sd->s_mode = mode; 233 sd->s_mode = mode;
234 sd->s_dentry = dentry; 234 sd->s_dentry = dentry;
235 if (dentry) { 235 if (dentry)
236 dentry->d_fsdata = configfs_get(sd); 236 dentry->d_fsdata = configfs_get(sd);
237 dentry->d_op = &configfs_dentry_ops;
238 }
239 237
240 return 0; 238 return 0;
241} 239}
@@ -278,7 +276,6 @@ static int create_dir(struct config_item * k, struct dentry * p,
278 error = configfs_create(d, mode, init_dir); 276 error = configfs_create(d, mode, init_dir);
279 if (!error) { 277 if (!error) {
280 inc_nlink(p->d_inode); 278 inc_nlink(p->d_inode);
281 (d)->d_op = &configfs_dentry_ops;
282 } else { 279 } else {
283 struct configfs_dirent *sd = d->d_fsdata; 280 struct configfs_dirent *sd = d->d_fsdata;
284 if (sd) { 281 if (sd) {
@@ -371,9 +368,7 @@ int configfs_create_link(struct configfs_symlink *sl,
371 CONFIGFS_ITEM_LINK); 368 CONFIGFS_ITEM_LINK);
372 if (!err) { 369 if (!err) {
373 err = configfs_create(dentry, mode, init_symlink); 370 err = configfs_create(dentry, mode, init_symlink);
374 if (!err) 371 if (err) {
375 dentry->d_op = &configfs_dentry_ops;
376 else {
377 struct configfs_dirent *sd = dentry->d_fsdata; 372 struct configfs_dirent *sd = dentry->d_fsdata;
378 if (sd) { 373 if (sd) {
379 spin_lock(&configfs_dirent_lock); 374 spin_lock(&configfs_dirent_lock);
@@ -399,8 +394,7 @@ static void remove_dir(struct dentry * d)
399 if (d->d_inode) 394 if (d->d_inode)
400 simple_rmdir(parent->d_inode,d); 395 simple_rmdir(parent->d_inode,d);
401 396
402 pr_debug(" o %s removing done (%d)\n",d->d_name.name, 397 pr_debug(" o %s removing done (%d)\n",d->d_name.name, d->d_count);
403 atomic_read(&d->d_count));
404 398
405 dput(parent); 399 dput(parent);
406} 400}
@@ -448,7 +442,6 @@ static int configfs_attach_attr(struct configfs_dirent * sd, struct dentry * den
448 return error; 442 return error;
449 } 443 }
450 444
451 dentry->d_op = &configfs_dentry_ops;
452 d_rehash(dentry); 445 d_rehash(dentry);
453 446
454 return 0; 447 return 0;
@@ -493,7 +486,10 @@ static struct dentry * configfs_lookup(struct inode *dir,
493 * If it doesn't exist and it isn't a NOT_PINNED item, 486 * If it doesn't exist and it isn't a NOT_PINNED item,
494 * it must be negative. 487 * it must be negative.
495 */ 488 */
496 return simple_lookup(dir, dentry, nd); 489 if (dentry->d_name.len > NAME_MAX)
490 return ERR_PTR(-ENAMETOOLONG);
491 d_add(dentry, NULL);
492 return NULL;
497 } 493 }
498 494
499out: 495out:
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index 253476d78ed..c83f4768eea 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -250,18 +250,14 @@ void configfs_drop_dentry(struct configfs_dirent * sd, struct dentry * parent)
250 struct dentry * dentry = sd->s_dentry; 250 struct dentry * dentry = sd->s_dentry;
251 251
252 if (dentry) { 252 if (dentry) {
253 spin_lock(&dcache_lock);
254 spin_lock(&dentry->d_lock); 253 spin_lock(&dentry->d_lock);
255 if (!(d_unhashed(dentry) && dentry->d_inode)) { 254 if (!(d_unhashed(dentry) && dentry->d_inode)) {
256 dget_locked(dentry); 255 dget_dlock(dentry);
257 __d_drop(dentry); 256 __d_drop(dentry);
258 spin_unlock(&dentry->d_lock); 257 spin_unlock(&dentry->d_lock);
259 spin_unlock(&dcache_lock);
260 simple_unlink(parent->d_inode, dentry); 258 simple_unlink(parent->d_inode, dentry);
261 } else { 259 } else
262 spin_unlock(&dentry->d_lock); 260 spin_unlock(&dentry->d_lock);
263 spin_unlock(&dcache_lock);
264 }
265 } 261 }
266} 262}
267 263
diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c
index 7d3607febe1..ecc62178bed 100644
--- a/fs/configfs/mount.c
+++ b/fs/configfs/mount.c
@@ -101,6 +101,7 @@ static int configfs_fill_super(struct super_block *sb, void *data, int silent)
101 configfs_root_group.cg_item.ci_dentry = root; 101 configfs_root_group.cg_item.ci_dentry = root;
102 root->d_fsdata = &configfs_root; 102 root->d_fsdata = &configfs_root;
103 sb->s_root = root; 103 sb->s_root = root;
104 sb->s_d_op = &configfs_dentry_ops; /* the rest get that */
104 return 0; 105 return 0;
105} 106}
106 107
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index 32fd5fe9ca0..e141939080f 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -34,57 +34,81 @@ static const struct address_space_operations cramfs_aops;
34static DEFINE_MUTEX(read_mutex); 34static DEFINE_MUTEX(read_mutex);
35 35
36 36
37/* These two macros may change in future, to provide better st_ino 37/* These macros may change in future, to provide better st_ino semantics. */
38 semantics. */
39#define CRAMINO(x) (((x)->offset && (x)->size)?(x)->offset<<2:1)
40#define OFFSET(x) ((x)->i_ino) 38#define OFFSET(x) ((x)->i_ino)
41 39
42static void setup_inode(struct inode *inode, struct cramfs_inode * cramfs_inode) 40static unsigned long cramino(struct cramfs_inode *cino, unsigned int offset)
43{ 41{
42 if (!cino->offset)
43 return offset + 1;
44 if (!cino->size)
45 return offset + 1;
46
47 /*
48 * The file mode test fixes buggy mkcramfs implementations where
49 * cramfs_inode->offset is set to a non zero value for entries
50 * which did not contain data, like device nodes and fifos.
51 */
52 switch (cino->mode & S_IFMT) {
53 case S_IFREG:
54 case S_IFDIR:
55 case S_IFLNK:
56 return cino->offset << 2;
57 default:
58 break;
59 }
60 return offset + 1;
61}
62
63static struct inode *get_cramfs_inode(struct super_block *sb,
64 struct cramfs_inode *cramfs_inode, unsigned int offset)
65{
66 struct inode *inode;
44 static struct timespec zerotime; 67 static struct timespec zerotime;
68
69 inode = iget_locked(sb, cramino(cramfs_inode, offset));
70 if (!inode)
71 return ERR_PTR(-ENOMEM);
72 if (!(inode->i_state & I_NEW))
73 return inode;
74
75 switch (cramfs_inode->mode & S_IFMT) {
76 case S_IFREG:
77 inode->i_fop = &generic_ro_fops;
78 inode->i_data.a_ops = &cramfs_aops;
79 break;
80 case S_IFDIR:
81 inode->i_op = &cramfs_dir_inode_operations;
82 inode->i_fop = &cramfs_directory_operations;
83 break;
84 case S_IFLNK:
85 inode->i_op = &page_symlink_inode_operations;
86 inode->i_data.a_ops = &cramfs_aops;
87 break;
88 default:
89 init_special_inode(inode, cramfs_inode->mode,
90 old_decode_dev(cramfs_inode->size));
91 }
92
45 inode->i_mode = cramfs_inode->mode; 93 inode->i_mode = cramfs_inode->mode;
46 inode->i_uid = cramfs_inode->uid; 94 inode->i_uid = cramfs_inode->uid;
47 inode->i_size = cramfs_inode->size;
48 inode->i_blocks = (cramfs_inode->size - 1) / 512 + 1;
49 inode->i_gid = cramfs_inode->gid; 95 inode->i_gid = cramfs_inode->gid;
96
97 /* if the lower 2 bits are zero, the inode contains data */
98 if (!(inode->i_ino & 3)) {
99 inode->i_size = cramfs_inode->size;
100 inode->i_blocks = (cramfs_inode->size - 1) / 512 + 1;
101 }
102
50 /* Struct copy intentional */ 103 /* Struct copy intentional */
51 inode->i_mtime = inode->i_atime = inode->i_ctime = zerotime; 104 inode->i_mtime = inode->i_atime = inode->i_ctime = zerotime;
52 /* inode->i_nlink is left 1 - arguably wrong for directories, 105 /* inode->i_nlink is left 1 - arguably wrong for directories,
53 but it's the best we can do without reading the directory 106 but it's the best we can do without reading the directory
54 contents. 1 yields the right result in GNU find, even 107 contents. 1 yields the right result in GNU find, even
55 without -noleaf option. */ 108 without -noleaf option. */
56 if (S_ISREG(inode->i_mode)) {
57 inode->i_fop = &generic_ro_fops;
58 inode->i_data.a_ops = &cramfs_aops;
59 } else if (S_ISDIR(inode->i_mode)) {
60 inode->i_op = &cramfs_dir_inode_operations;
61 inode->i_fop = &cramfs_directory_operations;
62 } else if (S_ISLNK(inode->i_mode)) {
63 inode->i_op = &page_symlink_inode_operations;
64 inode->i_data.a_ops = &cramfs_aops;
65 } else {
66 init_special_inode(inode, inode->i_mode,
67 old_decode_dev(cramfs_inode->size));
68 }
69}
70 109
71static struct inode *get_cramfs_inode(struct super_block *sb, 110 unlock_new_inode(inode);
72 struct cramfs_inode * cramfs_inode) 111
73{
74 struct inode *inode;
75 if (CRAMINO(cramfs_inode) == 1) {
76 inode = new_inode(sb);
77 if (inode) {
78 inode->i_ino = 1;
79 setup_inode(inode, cramfs_inode);
80 }
81 } else {
82 inode = iget_locked(sb, CRAMINO(cramfs_inode));
83 if (inode && (inode->i_state & I_NEW)) {
84 setup_inode(inode, cramfs_inode);
85 unlock_new_inode(inode);
86 }
87 }
88 return inode; 112 return inode;
89} 113}
90 114
@@ -265,6 +289,9 @@ static int cramfs_fill_super(struct super_block *sb, void *data, int silent)
265 printk(KERN_ERR "cramfs: root is not a directory\n"); 289 printk(KERN_ERR "cramfs: root is not a directory\n");
266 goto out; 290 goto out;
267 } 291 }
292 /* correct strange, hard-coded permissions of mkcramfs */
293 super.root.mode |= (S_IRUSR | S_IXUSR | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH);
294
268 root_offset = super.root.offset << 2; 295 root_offset = super.root.offset << 2;
269 if (super.flags & CRAMFS_FLAG_FSID_VERSION_2) { 296 if (super.flags & CRAMFS_FLAG_FSID_VERSION_2) {
270 sbi->size=super.size; 297 sbi->size=super.size;
@@ -289,7 +316,7 @@ static int cramfs_fill_super(struct super_block *sb, void *data, int silent)
289 316
290 /* Set it all up.. */ 317 /* Set it all up.. */
291 sb->s_op = &cramfs_ops; 318 sb->s_op = &cramfs_ops;
292 root = get_cramfs_inode(sb, &super.root); 319 root = get_cramfs_inode(sb, &super.root, 0);
293 if (!root) 320 if (!root)
294 goto out; 321 goto out;
295 sb->s_root = d_alloc_root(root); 322 sb->s_root = d_alloc_root(root);
@@ -365,7 +392,7 @@ static int cramfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
365 */ 392 */
366 namelen = de->namelen << 2; 393 namelen = de->namelen << 2;
367 memcpy(buf, name, namelen); 394 memcpy(buf, name, namelen);
368 ino = CRAMINO(de); 395 ino = cramino(de, OFFSET(inode) + offset);
369 mode = de->mode; 396 mode = de->mode;
370 mutex_unlock(&read_mutex); 397 mutex_unlock(&read_mutex);
371 nextoffset = offset + sizeof(*de) + namelen; 398 nextoffset = offset + sizeof(*de) + namelen;
@@ -404,8 +431,9 @@ static struct dentry * cramfs_lookup(struct inode *dir, struct dentry *dentry, s
404 struct cramfs_inode *de; 431 struct cramfs_inode *de;
405 char *name; 432 char *name;
406 int namelen, retval; 433 int namelen, retval;
434 int dir_off = OFFSET(dir) + offset;
407 435
408 de = cramfs_read(dir->i_sb, OFFSET(dir) + offset, sizeof(*de)+CRAMFS_MAXPATHLEN); 436 de = cramfs_read(dir->i_sb, dir_off, sizeof(*de)+CRAMFS_MAXPATHLEN);
409 name = (char *)(de+1); 437 name = (char *)(de+1);
410 438
411 /* Try to take advantage of sorted directories */ 439 /* Try to take advantage of sorted directories */
@@ -436,7 +464,7 @@ static struct dentry * cramfs_lookup(struct inode *dir, struct dentry *dentry, s
436 if (!retval) { 464 if (!retval) {
437 struct cramfs_inode entry = *de; 465 struct cramfs_inode entry = *de;
438 mutex_unlock(&read_mutex); 466 mutex_unlock(&read_mutex);
439 d_add(dentry, get_cramfs_inode(dir->i_sb, &entry)); 467 d_add(dentry, get_cramfs_inode(dir->i_sb, &entry, dir_off));
440 return NULL; 468 return NULL;
441 } 469 }
442 /* else (retval < 0) */ 470 /* else (retval < 0) */
diff --git a/fs/dcache.c b/fs/dcache.c
index 23702a9d4e6..2a6bd9a4ae9 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -33,20 +33,58 @@
33#include <linux/bootmem.h> 33#include <linux/bootmem.h>
34#include <linux/fs_struct.h> 34#include <linux/fs_struct.h>
35#include <linux/hardirq.h> 35#include <linux/hardirq.h>
36#include <linux/bit_spinlock.h>
37#include <linux/rculist_bl.h>
36#include "internal.h" 38#include "internal.h"
37 39
40/*
41 * Usage:
42 * dentry->d_inode->i_lock protects:
43 * - i_dentry, d_alias, d_inode of aliases
44 * dcache_hash_bucket lock protects:
45 * - the dcache hash table
46 * s_anon bl list spinlock protects:
47 * - the s_anon list (see __d_drop)
48 * dcache_lru_lock protects:
49 * - the dcache lru lists and counters
50 * d_lock protects:
51 * - d_flags
52 * - d_name
53 * - d_lru
54 * - d_count
55 * - d_unhashed()
56 * - d_parent and d_subdirs
57 * - children's d_child and d_parent
58 * - d_alias, d_inode
59 *
60 * Ordering:
61 * dentry->d_inode->i_lock
62 * dentry->d_lock
63 * dcache_lru_lock
64 * dcache_hash_bucket lock
65 * s_anon lock
66 *
67 * If there is an ancestor relationship:
68 * dentry->d_parent->...->d_parent->d_lock
69 * ...
70 * dentry->d_parent->d_lock
71 * dentry->d_lock
72 *
73 * If no ancestor relationship:
74 * if (dentry1 < dentry2)
75 * dentry1->d_lock
76 * dentry2->d_lock
77 */
38int sysctl_vfs_cache_pressure __read_mostly = 100; 78int sysctl_vfs_cache_pressure __read_mostly = 100;
39EXPORT_SYMBOL_GPL(sysctl_vfs_cache_pressure); 79EXPORT_SYMBOL_GPL(sysctl_vfs_cache_pressure);
40 80
41 __cacheline_aligned_in_smp DEFINE_SPINLOCK(dcache_lock); 81static __cacheline_aligned_in_smp DEFINE_SPINLOCK(dcache_lru_lock);
42__cacheline_aligned_in_smp DEFINE_SEQLOCK(rename_lock); 82__cacheline_aligned_in_smp DEFINE_SEQLOCK(rename_lock);
43 83
44EXPORT_SYMBOL(dcache_lock); 84EXPORT_SYMBOL(rename_lock);
45 85
46static struct kmem_cache *dentry_cache __read_mostly; 86static struct kmem_cache *dentry_cache __read_mostly;
47 87
48#define DNAME_INLINE_LEN (sizeof(struct dentry)-offsetof(struct dentry,d_iname))
49
50/* 88/*
51 * This is the single most critical data structure when it comes 89 * This is the single most critical data structure when it comes
52 * to the dcache: the hashtable for lookups. Somebody should try 90 * to the dcache: the hashtable for lookups. Somebody should try
@@ -60,22 +98,51 @@ static struct kmem_cache *dentry_cache __read_mostly;
60 98
61static unsigned int d_hash_mask __read_mostly; 99static unsigned int d_hash_mask __read_mostly;
62static unsigned int d_hash_shift __read_mostly; 100static unsigned int d_hash_shift __read_mostly;
63static struct hlist_head *dentry_hashtable __read_mostly; 101
102struct dcache_hash_bucket {
103 struct hlist_bl_head head;
104};
105static struct dcache_hash_bucket *dentry_hashtable __read_mostly;
106
107static inline struct dcache_hash_bucket *d_hash(struct dentry *parent,
108 unsigned long hash)
109{
110 hash += ((unsigned long) parent ^ GOLDEN_RATIO_PRIME) / L1_CACHE_BYTES;
111 hash = hash ^ ((hash ^ GOLDEN_RATIO_PRIME) >> D_HASHBITS);
112 return dentry_hashtable + (hash & D_HASHMASK);
113}
114
115static inline void spin_lock_bucket(struct dcache_hash_bucket *b)
116{
117 bit_spin_lock(0, (unsigned long *)&b->head.first);
118}
119
120static inline void spin_unlock_bucket(struct dcache_hash_bucket *b)
121{
122 __bit_spin_unlock(0, (unsigned long *)&b->head.first);
123}
64 124
65/* Statistics gathering. */ 125/* Statistics gathering. */
66struct dentry_stat_t dentry_stat = { 126struct dentry_stat_t dentry_stat = {
67 .age_limit = 45, 127 .age_limit = 45,
68}; 128};
69 129
70static struct percpu_counter nr_dentry __cacheline_aligned_in_smp; 130static DEFINE_PER_CPU(unsigned int, nr_dentry);
71static struct percpu_counter nr_dentry_unused __cacheline_aligned_in_smp;
72 131
73#if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS) 132#if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS)
133static int get_nr_dentry(void)
134{
135 int i;
136 int sum = 0;
137 for_each_possible_cpu(i)
138 sum += per_cpu(nr_dentry, i);
139 return sum < 0 ? 0 : sum;
140}
141
74int proc_nr_dentry(ctl_table *table, int write, void __user *buffer, 142int proc_nr_dentry(ctl_table *table, int write, void __user *buffer,
75 size_t *lenp, loff_t *ppos) 143 size_t *lenp, loff_t *ppos)
76{ 144{
77 dentry_stat.nr_dentry = percpu_counter_sum_positive(&nr_dentry); 145 dentry_stat.nr_dentry = get_nr_dentry();
78 dentry_stat.nr_unused = percpu_counter_sum_positive(&nr_dentry_unused);
79 return proc_dointvec(table, write, buffer, lenp, ppos); 146 return proc_dointvec(table, write, buffer, lenp, ppos);
80} 147}
81#endif 148#endif
@@ -91,35 +158,51 @@ static void __d_free(struct rcu_head *head)
91} 158}
92 159
93/* 160/*
94 * no dcache_lock, please. 161 * no locks, please.
95 */ 162 */
96static void d_free(struct dentry *dentry) 163static void d_free(struct dentry *dentry)
97{ 164{
98 percpu_counter_dec(&nr_dentry); 165 BUG_ON(dentry->d_count);
166 this_cpu_dec(nr_dentry);
99 if (dentry->d_op && dentry->d_op->d_release) 167 if (dentry->d_op && dentry->d_op->d_release)
100 dentry->d_op->d_release(dentry); 168 dentry->d_op->d_release(dentry);
101 169
102 /* if dentry was never inserted into hash, immediate free is OK */ 170 /* if dentry was never inserted into hash, immediate free is OK */
103 if (hlist_unhashed(&dentry->d_hash)) 171 if (hlist_bl_unhashed(&dentry->d_hash))
104 __d_free(&dentry->d_u.d_rcu); 172 __d_free(&dentry->d_u.d_rcu);
105 else 173 else
106 call_rcu(&dentry->d_u.d_rcu, __d_free); 174 call_rcu(&dentry->d_u.d_rcu, __d_free);
107} 175}
108 176
177/**
178 * dentry_rcuwalk_barrier - invalidate in-progress rcu-walk lookups
179 * @dentry: the target dentry
180 * After this call, in-progress rcu-walk path lookup will fail. This
181 * should be called after unhashing, and after changing d_inode (if
182 * the dentry has not already been unhashed).
183 */
184static inline void dentry_rcuwalk_barrier(struct dentry *dentry)
185{
186 assert_spin_locked(&dentry->d_lock);
187 /* Go through a barrier */
188 write_seqcount_barrier(&dentry->d_seq);
189}
190
109/* 191/*
110 * Release the dentry's inode, using the filesystem 192 * Release the dentry's inode, using the filesystem
111 * d_iput() operation if defined. 193 * d_iput() operation if defined. Dentry has no refcount
194 * and is unhashed.
112 */ 195 */
113static void dentry_iput(struct dentry * dentry) 196static void dentry_iput(struct dentry * dentry)
114 __releases(dentry->d_lock) 197 __releases(dentry->d_lock)
115 __releases(dcache_lock) 198 __releases(dentry->d_inode->i_lock)
116{ 199{
117 struct inode *inode = dentry->d_inode; 200 struct inode *inode = dentry->d_inode;
118 if (inode) { 201 if (inode) {
119 dentry->d_inode = NULL; 202 dentry->d_inode = NULL;
120 list_del_init(&dentry->d_alias); 203 list_del_init(&dentry->d_alias);
121 spin_unlock(&dentry->d_lock); 204 spin_unlock(&dentry->d_lock);
122 spin_unlock(&dcache_lock); 205 spin_unlock(&inode->i_lock);
123 if (!inode->i_nlink) 206 if (!inode->i_nlink)
124 fsnotify_inoderemove(inode); 207 fsnotify_inoderemove(inode);
125 if (dentry->d_op && dentry->d_op->d_iput) 208 if (dentry->d_op && dentry->d_op->d_iput)
@@ -128,65 +211,191 @@ static void dentry_iput(struct dentry * dentry)
128 iput(inode); 211 iput(inode);
129 } else { 212 } else {
130 spin_unlock(&dentry->d_lock); 213 spin_unlock(&dentry->d_lock);
131 spin_unlock(&dcache_lock);
132 } 214 }
133} 215}
134 216
135/* 217/*
136 * dentry_lru_(add|del|move_tail) must be called with dcache_lock held. 218 * Release the dentry's inode, using the filesystem
219 * d_iput() operation if defined. dentry remains in-use.
220 */
221static void dentry_unlink_inode(struct dentry * dentry)
222 __releases(dentry->d_lock)
223 __releases(dentry->d_inode->i_lock)
224{
225 struct inode *inode = dentry->d_inode;
226 dentry->d_inode = NULL;
227 list_del_init(&dentry->d_alias);
228 dentry_rcuwalk_barrier(dentry);
229 spin_unlock(&dentry->d_lock);
230 spin_unlock(&inode->i_lock);
231 if (!inode->i_nlink)
232 fsnotify_inoderemove(inode);
233 if (dentry->d_op && dentry->d_op->d_iput)
234 dentry->d_op->d_iput(dentry, inode);
235 else
236 iput(inode);
237}
238
239/*
240 * dentry_lru_(add|del|move_tail) must be called with d_lock held.
137 */ 241 */
138static void dentry_lru_add(struct dentry *dentry) 242static void dentry_lru_add(struct dentry *dentry)
139{ 243{
140 if (list_empty(&dentry->d_lru)) { 244 if (list_empty(&dentry->d_lru)) {
245 spin_lock(&dcache_lru_lock);
141 list_add(&dentry->d_lru, &dentry->d_sb->s_dentry_lru); 246 list_add(&dentry->d_lru, &dentry->d_sb->s_dentry_lru);
142 dentry->d_sb->s_nr_dentry_unused++; 247 dentry->d_sb->s_nr_dentry_unused++;
143 percpu_counter_inc(&nr_dentry_unused); 248 dentry_stat.nr_unused++;
249 spin_unlock(&dcache_lru_lock);
144 } 250 }
145} 251}
146 252
253static void __dentry_lru_del(struct dentry *dentry)
254{
255 list_del_init(&dentry->d_lru);
256 dentry->d_sb->s_nr_dentry_unused--;
257 dentry_stat.nr_unused--;
258}
259
147static void dentry_lru_del(struct dentry *dentry) 260static void dentry_lru_del(struct dentry *dentry)
148{ 261{
149 if (!list_empty(&dentry->d_lru)) { 262 if (!list_empty(&dentry->d_lru)) {
150 list_del_init(&dentry->d_lru); 263 spin_lock(&dcache_lru_lock);
151 dentry->d_sb->s_nr_dentry_unused--; 264 __dentry_lru_del(dentry);
152 percpu_counter_dec(&nr_dentry_unused); 265 spin_unlock(&dcache_lru_lock);
153 } 266 }
154} 267}
155 268
156static void dentry_lru_move_tail(struct dentry *dentry) 269static void dentry_lru_move_tail(struct dentry *dentry)
157{ 270{
271 spin_lock(&dcache_lru_lock);
158 if (list_empty(&dentry->d_lru)) { 272 if (list_empty(&dentry->d_lru)) {
159 list_add_tail(&dentry->d_lru, &dentry->d_sb->s_dentry_lru); 273 list_add_tail(&dentry->d_lru, &dentry->d_sb->s_dentry_lru);
160 dentry->d_sb->s_nr_dentry_unused++; 274 dentry->d_sb->s_nr_dentry_unused++;
161 percpu_counter_inc(&nr_dentry_unused); 275 dentry_stat.nr_unused++;
162 } else { 276 } else {
163 list_move_tail(&dentry->d_lru, &dentry->d_sb->s_dentry_lru); 277 list_move_tail(&dentry->d_lru, &dentry->d_sb->s_dentry_lru);
164 } 278 }
279 spin_unlock(&dcache_lru_lock);
165} 280}
166 281
167/** 282/**
168 * d_kill - kill dentry and return parent 283 * d_kill - kill dentry and return parent
169 * @dentry: dentry to kill 284 * @dentry: dentry to kill
285 * @parent: parent dentry
170 * 286 *
171 * The dentry must already be unhashed and removed from the LRU. 287 * The dentry must already be unhashed and removed from the LRU.
172 * 288 *
173 * If this is the root of the dentry tree, return NULL. 289 * If this is the root of the dentry tree, return NULL.
290 *
291 * dentry->d_lock and parent->d_lock must be held by caller, and are dropped by
292 * d_kill.
174 */ 293 */
175static struct dentry *d_kill(struct dentry *dentry) 294static struct dentry *d_kill(struct dentry *dentry, struct dentry *parent)
176 __releases(dentry->d_lock) 295 __releases(dentry->d_lock)
177 __releases(dcache_lock) 296 __releases(parent->d_lock)
297 __releases(dentry->d_inode->i_lock)
178{ 298{
179 struct dentry *parent; 299 dentry->d_parent = NULL;
180
181 list_del(&dentry->d_u.d_child); 300 list_del(&dentry->d_u.d_child);
182 /*drops the locks, at that point nobody can reach this dentry */ 301 if (parent)
302 spin_unlock(&parent->d_lock);
183 dentry_iput(dentry); 303 dentry_iput(dentry);
304 /*
305 * dentry_iput drops the locks, at which point nobody (except
306 * transient RCU lookups) can reach this dentry.
307 */
308 d_free(dentry);
309 return parent;
310}
311
312/**
313 * d_drop - drop a dentry
314 * @dentry: dentry to drop
315 *
316 * d_drop() unhashes the entry from the parent dentry hashes, so that it won't
317 * be found through a VFS lookup any more. Note that this is different from
318 * deleting the dentry - d_delete will try to mark the dentry negative if
319 * possible, giving a successful _negative_ lookup, while d_drop will
320 * just make the cache lookup fail.
321 *
322 * d_drop() is used mainly for stuff that wants to invalidate a dentry for some
323 * reason (NFS timeouts or autofs deletes).
324 *
325 * __d_drop requires dentry->d_lock.
326 */
327void __d_drop(struct dentry *dentry)
328{
329 if (!(dentry->d_flags & DCACHE_UNHASHED)) {
330 if (unlikely(dentry->d_flags & DCACHE_DISCONNECTED)) {
331 bit_spin_lock(0,
332 (unsigned long *)&dentry->d_sb->s_anon.first);
333 dentry->d_flags |= DCACHE_UNHASHED;
334 hlist_bl_del_init(&dentry->d_hash);
335 __bit_spin_unlock(0,
336 (unsigned long *)&dentry->d_sb->s_anon.first);
337 } else {
338 struct dcache_hash_bucket *b;
339 b = d_hash(dentry->d_parent, dentry->d_name.hash);
340 spin_lock_bucket(b);
341 /*
342 * We may not actually need to put DCACHE_UNHASHED
343 * manipulations under the hash lock, but follow
344 * the principle of least surprise.
345 */
346 dentry->d_flags |= DCACHE_UNHASHED;
347 hlist_bl_del_rcu(&dentry->d_hash);
348 spin_unlock_bucket(b);
349 dentry_rcuwalk_barrier(dentry);
350 }
351 }
352}
353EXPORT_SYMBOL(__d_drop);
354
355void d_drop(struct dentry *dentry)
356{
357 spin_lock(&dentry->d_lock);
358 __d_drop(dentry);
359 spin_unlock(&dentry->d_lock);
360}
361EXPORT_SYMBOL(d_drop);
362
363/*
364 * Finish off a dentry we've decided to kill.
365 * dentry->d_lock must be held, returns with it unlocked.
366 * If ref is non-zero, then decrement the refcount too.
367 * Returns dentry requiring refcount drop, or NULL if we're done.
368 */
369static inline struct dentry *dentry_kill(struct dentry *dentry, int ref)
370 __releases(dentry->d_lock)
371{
372 struct inode *inode;
373 struct dentry *parent;
374
375 inode = dentry->d_inode;
376 if (inode && !spin_trylock(&inode->i_lock)) {
377relock:
378 spin_unlock(&dentry->d_lock);
379 cpu_relax();
380 return dentry; /* try again with same dentry */
381 }
184 if (IS_ROOT(dentry)) 382 if (IS_ROOT(dentry))
185 parent = NULL; 383 parent = NULL;
186 else 384 else
187 parent = dentry->d_parent; 385 parent = dentry->d_parent;
188 d_free(dentry); 386 if (parent && !spin_trylock(&parent->d_lock)) {
189 return parent; 387 if (inode)
388 spin_unlock(&inode->i_lock);
389 goto relock;
390 }
391
392 if (ref)
393 dentry->d_count--;
394 /* if dentry was on the d_lru list delete it from there */
395 dentry_lru_del(dentry);
396 /* if it was on the hash then remove it */
397 __d_drop(dentry);
398 return d_kill(dentry, parent);
190} 399}
191 400
192/* 401/*
@@ -214,34 +423,26 @@ static struct dentry *d_kill(struct dentry *dentry)
214 * call the dentry unlink method as well as removing it from the queues and 423 * call the dentry unlink method as well as removing it from the queues and
215 * releasing its resources. If the parent dentries were scheduled for release 424 * releasing its resources. If the parent dentries were scheduled for release
216 * they too may now get deleted. 425 * they too may now get deleted.
217 *
218 * no dcache lock, please.
219 */ 426 */
220
221void dput(struct dentry *dentry) 427void dput(struct dentry *dentry)
222{ 428{
223 if (!dentry) 429 if (!dentry)
224 return; 430 return;
225 431
226repeat: 432repeat:
227 if (atomic_read(&dentry->d_count) == 1) 433 if (dentry->d_count == 1)
228 might_sleep(); 434 might_sleep();
229 if (!atomic_dec_and_lock(&dentry->d_count, &dcache_lock))
230 return;
231
232 spin_lock(&dentry->d_lock); 435 spin_lock(&dentry->d_lock);
233 if (atomic_read(&dentry->d_count)) { 436 BUG_ON(!dentry->d_count);
437 if (dentry->d_count > 1) {
438 dentry->d_count--;
234 spin_unlock(&dentry->d_lock); 439 spin_unlock(&dentry->d_lock);
235 spin_unlock(&dcache_lock);
236 return; 440 return;
237 } 441 }
238 442
239 /* 443 if (dentry->d_flags & DCACHE_OP_DELETE) {
240 * AV: ->d_delete() is _NOT_ allowed to block now.
241 */
242 if (dentry->d_op && dentry->d_op->d_delete) {
243 if (dentry->d_op->d_delete(dentry)) 444 if (dentry->d_op->d_delete(dentry))
244 goto unhash_it; 445 goto kill_it;
245 } 446 }
246 447
247 /* Unreachable? Get rid of it */ 448 /* Unreachable? Get rid of it */
@@ -252,16 +453,12 @@ repeat:
252 dentry->d_flags |= DCACHE_REFERENCED; 453 dentry->d_flags |= DCACHE_REFERENCED;
253 dentry_lru_add(dentry); 454 dentry_lru_add(dentry);
254 455
255 spin_unlock(&dentry->d_lock); 456 dentry->d_count--;
256 spin_unlock(&dcache_lock); 457 spin_unlock(&dentry->d_lock);
257 return; 458 return;
258 459
259unhash_it:
260 __d_drop(dentry);
261kill_it: 460kill_it:
262 /* if dentry was on the d_lru list delete it from there */ 461 dentry = dentry_kill(dentry, 1);
263 dentry_lru_del(dentry);
264 dentry = d_kill(dentry);
265 if (dentry) 462 if (dentry)
266 goto repeat; 463 goto repeat;
267} 464}
@@ -284,9 +481,9 @@ int d_invalidate(struct dentry * dentry)
284 /* 481 /*
285 * If it's already been dropped, return OK. 482 * If it's already been dropped, return OK.
286 */ 483 */
287 spin_lock(&dcache_lock); 484 spin_lock(&dentry->d_lock);
288 if (d_unhashed(dentry)) { 485 if (d_unhashed(dentry)) {
289 spin_unlock(&dcache_lock); 486 spin_unlock(&dentry->d_lock);
290 return 0; 487 return 0;
291 } 488 }
292 /* 489 /*
@@ -294,9 +491,9 @@ int d_invalidate(struct dentry * dentry)
294 * to get rid of unused child entries. 491 * to get rid of unused child entries.
295 */ 492 */
296 if (!list_empty(&dentry->d_subdirs)) { 493 if (!list_empty(&dentry->d_subdirs)) {
297 spin_unlock(&dcache_lock); 494 spin_unlock(&dentry->d_lock);
298 shrink_dcache_parent(dentry); 495 shrink_dcache_parent(dentry);
299 spin_lock(&dcache_lock); 496 spin_lock(&dentry->d_lock);
300 } 497 }
301 498
302 /* 499 /*
@@ -309,35 +506,61 @@ int d_invalidate(struct dentry * dentry)
309 * we might still populate it if it was a 506 * we might still populate it if it was a
310 * working directory or similar). 507 * working directory or similar).
311 */ 508 */
312 spin_lock(&dentry->d_lock); 509 if (dentry->d_count > 1) {
313 if (atomic_read(&dentry->d_count) > 1) {
314 if (dentry->d_inode && S_ISDIR(dentry->d_inode->i_mode)) { 510 if (dentry->d_inode && S_ISDIR(dentry->d_inode->i_mode)) {
315 spin_unlock(&dentry->d_lock); 511 spin_unlock(&dentry->d_lock);
316 spin_unlock(&dcache_lock);
317 return -EBUSY; 512 return -EBUSY;
318 } 513 }
319 } 514 }
320 515
321 __d_drop(dentry); 516 __d_drop(dentry);
322 spin_unlock(&dentry->d_lock); 517 spin_unlock(&dentry->d_lock);
323 spin_unlock(&dcache_lock);
324 return 0; 518 return 0;
325} 519}
326EXPORT_SYMBOL(d_invalidate); 520EXPORT_SYMBOL(d_invalidate);
327 521
328/* This should be called _only_ with dcache_lock held */ 522/* This must be called with d_lock held */
329static inline struct dentry * __dget_locked(struct dentry *dentry) 523static inline void __dget_dlock(struct dentry *dentry)
330{ 524{
331 atomic_inc(&dentry->d_count); 525 dentry->d_count++;
332 dentry_lru_del(dentry);
333 return dentry;
334} 526}
335 527
336struct dentry * dget_locked(struct dentry *dentry) 528static inline void __dget(struct dentry *dentry)
337{ 529{
338 return __dget_locked(dentry); 530 spin_lock(&dentry->d_lock);
531 __dget_dlock(dentry);
532 spin_unlock(&dentry->d_lock);
533}
534
535struct dentry *dget_parent(struct dentry *dentry)
536{
537 struct dentry *ret;
538
539repeat:
540 /*
541 * Don't need rcu_dereference because we re-check it was correct under
542 * the lock.
543 */
544 rcu_read_lock();
545 ret = dentry->d_parent;
546 if (!ret) {
547 rcu_read_unlock();
548 goto out;
549 }
550 spin_lock(&ret->d_lock);
551 if (unlikely(ret != dentry->d_parent)) {
552 spin_unlock(&ret->d_lock);
553 rcu_read_unlock();
554 goto repeat;
555 }
556 rcu_read_unlock();
557 BUG_ON(!ret->d_count);
558 ret->d_count++;
559 spin_unlock(&ret->d_lock);
560out:
561 return ret;
339} 562}
340EXPORT_SYMBOL(dget_locked); 563EXPORT_SYMBOL(dget_parent);
341 564
342/** 565/**
343 * d_find_alias - grab a hashed alias of inode 566 * d_find_alias - grab a hashed alias of inode
@@ -355,42 +578,51 @@ EXPORT_SYMBOL(dget_locked);
355 * any other hashed alias over that one unless @want_discon is set, 578 * any other hashed alias over that one unless @want_discon is set,
356 * in which case only return an IS_ROOT, DCACHE_DISCONNECTED alias. 579 * in which case only return an IS_ROOT, DCACHE_DISCONNECTED alias.
357 */ 580 */
358 581static struct dentry *__d_find_alias(struct inode *inode, int want_discon)
359static struct dentry * __d_find_alias(struct inode *inode, int want_discon)
360{ 582{
361 struct list_head *head, *next, *tmp; 583 struct dentry *alias, *discon_alias;
362 struct dentry *alias, *discon_alias=NULL;
363 584
364 head = &inode->i_dentry; 585again:
365 next = inode->i_dentry.next; 586 discon_alias = NULL;
366 while (next != head) { 587 list_for_each_entry(alias, &inode->i_dentry, d_alias) {
367 tmp = next; 588 spin_lock(&alias->d_lock);
368 next = tmp->next;
369 prefetch(next);
370 alias = list_entry(tmp, struct dentry, d_alias);
371 if (S_ISDIR(inode->i_mode) || !d_unhashed(alias)) { 589 if (S_ISDIR(inode->i_mode) || !d_unhashed(alias)) {
372 if (IS_ROOT(alias) && 590 if (IS_ROOT(alias) &&
373 (alias->d_flags & DCACHE_DISCONNECTED)) 591 (alias->d_flags & DCACHE_DISCONNECTED)) {
374 discon_alias = alias; 592 discon_alias = alias;
375 else if (!want_discon) { 593 } else if (!want_discon) {
376 __dget_locked(alias); 594 __dget_dlock(alias);
595 spin_unlock(&alias->d_lock);
596 return alias;
597 }
598 }
599 spin_unlock(&alias->d_lock);
600 }
601 if (discon_alias) {
602 alias = discon_alias;
603 spin_lock(&alias->d_lock);
604 if (S_ISDIR(inode->i_mode) || !d_unhashed(alias)) {
605 if (IS_ROOT(alias) &&
606 (alias->d_flags & DCACHE_DISCONNECTED)) {
607 __dget_dlock(alias);
608 spin_unlock(&alias->d_lock);
377 return alias; 609 return alias;
378 } 610 }
379 } 611 }
612 spin_unlock(&alias->d_lock);
613 goto again;
380 } 614 }
381 if (discon_alias) 615 return NULL;
382 __dget_locked(discon_alias);
383 return discon_alias;
384} 616}
385 617
386struct dentry * d_find_alias(struct inode *inode) 618struct dentry *d_find_alias(struct inode *inode)
387{ 619{
388 struct dentry *de = NULL; 620 struct dentry *de = NULL;
389 621
390 if (!list_empty(&inode->i_dentry)) { 622 if (!list_empty(&inode->i_dentry)) {
391 spin_lock(&dcache_lock); 623 spin_lock(&inode->i_lock);
392 de = __d_find_alias(inode, 0); 624 de = __d_find_alias(inode, 0);
393 spin_unlock(&dcache_lock); 625 spin_unlock(&inode->i_lock);
394 } 626 }
395 return de; 627 return de;
396} 628}
@@ -404,54 +636,61 @@ void d_prune_aliases(struct inode *inode)
404{ 636{
405 struct dentry *dentry; 637 struct dentry *dentry;
406restart: 638restart:
407 spin_lock(&dcache_lock); 639 spin_lock(&inode->i_lock);
408 list_for_each_entry(dentry, &inode->i_dentry, d_alias) { 640 list_for_each_entry(dentry, &inode->i_dentry, d_alias) {
409 spin_lock(&dentry->d_lock); 641 spin_lock(&dentry->d_lock);
410 if (!atomic_read(&dentry->d_count)) { 642 if (!dentry->d_count) {
411 __dget_locked(dentry); 643 __dget_dlock(dentry);
412 __d_drop(dentry); 644 __d_drop(dentry);
413 spin_unlock(&dentry->d_lock); 645 spin_unlock(&dentry->d_lock);
414 spin_unlock(&dcache_lock); 646 spin_unlock(&inode->i_lock);
415 dput(dentry); 647 dput(dentry);
416 goto restart; 648 goto restart;
417 } 649 }
418 spin_unlock(&dentry->d_lock); 650 spin_unlock(&dentry->d_lock);
419 } 651 }
420 spin_unlock(&dcache_lock); 652 spin_unlock(&inode->i_lock);
421} 653}
422EXPORT_SYMBOL(d_prune_aliases); 654EXPORT_SYMBOL(d_prune_aliases);
423 655
424/* 656/*
425 * Throw away a dentry - free the inode, dput the parent. This requires that 657 * Try to throw away a dentry - free the inode, dput the parent.
426 * the LRU list has already been removed. 658 * Requires dentry->d_lock is held, and dentry->d_count == 0.
659 * Releases dentry->d_lock.
427 * 660 *
428 * Try to prune ancestors as well. This is necessary to prevent 661 * This may fail if locks cannot be acquired no problem, just try again.
429 * quadratic behavior of shrink_dcache_parent(), but is also expected
430 * to be beneficial in reducing dentry cache fragmentation.
431 */ 662 */
432static void prune_one_dentry(struct dentry * dentry) 663static void try_prune_one_dentry(struct dentry *dentry)
433 __releases(dentry->d_lock) 664 __releases(dentry->d_lock)
434 __releases(dcache_lock)
435 __acquires(dcache_lock)
436{ 665{
437 __d_drop(dentry); 666 struct dentry *parent;
438 dentry = d_kill(dentry);
439 667
668 parent = dentry_kill(dentry, 0);
440 /* 669 /*
441 * Prune ancestors. Locking is simpler than in dput(), 670 * If dentry_kill returns NULL, we have nothing more to do.
442 * because dcache_lock needs to be taken anyway. 671 * if it returns the same dentry, trylocks failed. In either
672 * case, just loop again.
673 *
674 * Otherwise, we need to prune ancestors too. This is necessary
675 * to prevent quadratic behavior of shrink_dcache_parent(), but
676 * is also expected to be beneficial in reducing dentry cache
677 * fragmentation.
443 */ 678 */
444 spin_lock(&dcache_lock); 679 if (!parent)
680 return;
681 if (parent == dentry)
682 return;
683
684 /* Prune ancestors. */
685 dentry = parent;
445 while (dentry) { 686 while (dentry) {
446 if (!atomic_dec_and_lock(&dentry->d_count, &dentry->d_lock)) 687 spin_lock(&dentry->d_lock);
688 if (dentry->d_count > 1) {
689 dentry->d_count--;
690 spin_unlock(&dentry->d_lock);
447 return; 691 return;
448 692 }
449 if (dentry->d_op && dentry->d_op->d_delete) 693 dentry = dentry_kill(dentry, 1);
450 dentry->d_op->d_delete(dentry);
451 dentry_lru_del(dentry);
452 __d_drop(dentry);
453 dentry = d_kill(dentry);
454 spin_lock(&dcache_lock);
455 } 694 }
456} 695}
457 696
@@ -459,24 +698,35 @@ static void shrink_dentry_list(struct list_head *list)
459{ 698{
460 struct dentry *dentry; 699 struct dentry *dentry;
461 700
462 while (!list_empty(list)) { 701 rcu_read_lock();
463 dentry = list_entry(list->prev, struct dentry, d_lru); 702 for (;;) {
464 dentry_lru_del(dentry); 703 dentry = list_entry_rcu(list->prev, struct dentry, d_lru);
704 if (&dentry->d_lru == list)
705 break; /* empty */
706 spin_lock(&dentry->d_lock);
707 if (dentry != list_entry(list->prev, struct dentry, d_lru)) {
708 spin_unlock(&dentry->d_lock);
709 continue;
710 }
465 711
466 /* 712 /*
467 * We found an inuse dentry which was not removed from 713 * We found an inuse dentry which was not removed from
468 * the LRU because of laziness during lookup. Do not free 714 * the LRU because of laziness during lookup. Do not free
469 * it - just keep it off the LRU list. 715 * it - just keep it off the LRU list.
470 */ 716 */
471 spin_lock(&dentry->d_lock); 717 if (dentry->d_count) {
472 if (atomic_read(&dentry->d_count)) { 718 dentry_lru_del(dentry);
473 spin_unlock(&dentry->d_lock); 719 spin_unlock(&dentry->d_lock);
474 continue; 720 continue;
475 } 721 }
476 prune_one_dentry(dentry); 722
477 /* dentry->d_lock was dropped in prune_one_dentry() */ 723 rcu_read_unlock();
478 cond_resched_lock(&dcache_lock); 724
725 try_prune_one_dentry(dentry);
726
727 rcu_read_lock();
479 } 728 }
729 rcu_read_unlock();
480} 730}
481 731
482/** 732/**
@@ -495,42 +745,44 @@ static void __shrink_dcache_sb(struct super_block *sb, int *count, int flags)
495 LIST_HEAD(tmp); 745 LIST_HEAD(tmp);
496 int cnt = *count; 746 int cnt = *count;
497 747
498 spin_lock(&dcache_lock); 748relock:
749 spin_lock(&dcache_lru_lock);
499 while (!list_empty(&sb->s_dentry_lru)) { 750 while (!list_empty(&sb->s_dentry_lru)) {
500 dentry = list_entry(sb->s_dentry_lru.prev, 751 dentry = list_entry(sb->s_dentry_lru.prev,
501 struct dentry, d_lru); 752 struct dentry, d_lru);
502 BUG_ON(dentry->d_sb != sb); 753 BUG_ON(dentry->d_sb != sb);
503 754
755 if (!spin_trylock(&dentry->d_lock)) {
756 spin_unlock(&dcache_lru_lock);
757 cpu_relax();
758 goto relock;
759 }
760
504 /* 761 /*
505 * If we are honouring the DCACHE_REFERENCED flag and the 762 * If we are honouring the DCACHE_REFERENCED flag and the
506 * dentry has this flag set, don't free it. Clear the flag 763 * dentry has this flag set, don't free it. Clear the flag
507 * and put it back on the LRU. 764 * and put it back on the LRU.
508 */ 765 */
509 if (flags & DCACHE_REFERENCED) { 766 if (flags & DCACHE_REFERENCED &&
510 spin_lock(&dentry->d_lock); 767 dentry->d_flags & DCACHE_REFERENCED) {
511 if (dentry->d_flags & DCACHE_REFERENCED) { 768 dentry->d_flags &= ~DCACHE_REFERENCED;
512 dentry->d_flags &= ~DCACHE_REFERENCED; 769 list_move(&dentry->d_lru, &referenced);
513 list_move(&dentry->d_lru, &referenced);
514 spin_unlock(&dentry->d_lock);
515 cond_resched_lock(&dcache_lock);
516 continue;
517 }
518 spin_unlock(&dentry->d_lock); 770 spin_unlock(&dentry->d_lock);
771 } else {
772 list_move_tail(&dentry->d_lru, &tmp);
773 spin_unlock(&dentry->d_lock);
774 if (!--cnt)
775 break;
519 } 776 }
520 777 cond_resched_lock(&dcache_lru_lock);
521 list_move_tail(&dentry->d_lru, &tmp);
522 if (!--cnt)
523 break;
524 cond_resched_lock(&dcache_lock);
525 } 778 }
526
527 *count = cnt;
528 shrink_dentry_list(&tmp);
529
530 if (!list_empty(&referenced)) 779 if (!list_empty(&referenced))
531 list_splice(&referenced, &sb->s_dentry_lru); 780 list_splice(&referenced, &sb->s_dentry_lru);
532 spin_unlock(&dcache_lock); 781 spin_unlock(&dcache_lru_lock);
533 782
783 shrink_dentry_list(&tmp);
784
785 *count = cnt;
534} 786}
535 787
536/** 788/**
@@ -546,13 +798,12 @@ static void prune_dcache(int count)
546{ 798{
547 struct super_block *sb, *p = NULL; 799 struct super_block *sb, *p = NULL;
548 int w_count; 800 int w_count;
549 int unused = percpu_counter_sum_positive(&nr_dentry_unused); 801 int unused = dentry_stat.nr_unused;
550 int prune_ratio; 802 int prune_ratio;
551 int pruned; 803 int pruned;
552 804
553 if (unused == 0 || count == 0) 805 if (unused == 0 || count == 0)
554 return; 806 return;
555 spin_lock(&dcache_lock);
556 if (count >= unused) 807 if (count >= unused)
557 prune_ratio = 1; 808 prune_ratio = 1;
558 else 809 else
@@ -589,11 +840,9 @@ static void prune_dcache(int count)
589 if (down_read_trylock(&sb->s_umount)) { 840 if (down_read_trylock(&sb->s_umount)) {
590 if ((sb->s_root != NULL) && 841 if ((sb->s_root != NULL) &&
591 (!list_empty(&sb->s_dentry_lru))) { 842 (!list_empty(&sb->s_dentry_lru))) {
592 spin_unlock(&dcache_lock);
593 __shrink_dcache_sb(sb, &w_count, 843 __shrink_dcache_sb(sb, &w_count,
594 DCACHE_REFERENCED); 844 DCACHE_REFERENCED);
595 pruned -= w_count; 845 pruned -= w_count;
596 spin_lock(&dcache_lock);
597 } 846 }
598 up_read(&sb->s_umount); 847 up_read(&sb->s_umount);
599 } 848 }
@@ -609,7 +858,6 @@ static void prune_dcache(int count)
609 if (p) 858 if (p)
610 __put_super(p); 859 __put_super(p);
611 spin_unlock(&sb_lock); 860 spin_unlock(&sb_lock);
612 spin_unlock(&dcache_lock);
613} 861}
614 862
615/** 863/**
@@ -623,12 +871,14 @@ void shrink_dcache_sb(struct super_block *sb)
623{ 871{
624 LIST_HEAD(tmp); 872 LIST_HEAD(tmp);
625 873
626 spin_lock(&dcache_lock); 874 spin_lock(&dcache_lru_lock);
627 while (!list_empty(&sb->s_dentry_lru)) { 875 while (!list_empty(&sb->s_dentry_lru)) {
628 list_splice_init(&sb->s_dentry_lru, &tmp); 876 list_splice_init(&sb->s_dentry_lru, &tmp);
877 spin_unlock(&dcache_lru_lock);
629 shrink_dentry_list(&tmp); 878 shrink_dentry_list(&tmp);
879 spin_lock(&dcache_lru_lock);
630 } 880 }
631 spin_unlock(&dcache_lock); 881 spin_unlock(&dcache_lru_lock);
632} 882}
633EXPORT_SYMBOL(shrink_dcache_sb); 883EXPORT_SYMBOL(shrink_dcache_sb);
634 884
@@ -645,10 +895,10 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry)
645 BUG_ON(!IS_ROOT(dentry)); 895 BUG_ON(!IS_ROOT(dentry));
646 896
647 /* detach this root from the system */ 897 /* detach this root from the system */
648 spin_lock(&dcache_lock); 898 spin_lock(&dentry->d_lock);
649 dentry_lru_del(dentry); 899 dentry_lru_del(dentry);
650 __d_drop(dentry); 900 __d_drop(dentry);
651 spin_unlock(&dcache_lock); 901 spin_unlock(&dentry->d_lock);
652 902
653 for (;;) { 903 for (;;) {
654 /* descend to the first leaf in the current subtree */ 904 /* descend to the first leaf in the current subtree */
@@ -657,14 +907,16 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry)
657 907
658 /* this is a branch with children - detach all of them 908 /* this is a branch with children - detach all of them
659 * from the system in one go */ 909 * from the system in one go */
660 spin_lock(&dcache_lock); 910 spin_lock(&dentry->d_lock);
661 list_for_each_entry(loop, &dentry->d_subdirs, 911 list_for_each_entry(loop, &dentry->d_subdirs,
662 d_u.d_child) { 912 d_u.d_child) {
913 spin_lock_nested(&loop->d_lock,
914 DENTRY_D_LOCK_NESTED);
663 dentry_lru_del(loop); 915 dentry_lru_del(loop);
664 __d_drop(loop); 916 __d_drop(loop);
665 cond_resched_lock(&dcache_lock); 917 spin_unlock(&loop->d_lock);
666 } 918 }
667 spin_unlock(&dcache_lock); 919 spin_unlock(&dentry->d_lock);
668 920
669 /* move to the first child */ 921 /* move to the first child */
670 dentry = list_entry(dentry->d_subdirs.next, 922 dentry = list_entry(dentry->d_subdirs.next,
@@ -676,7 +928,7 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry)
676 do { 928 do {
677 struct inode *inode; 929 struct inode *inode;
678 930
679 if (atomic_read(&dentry->d_count) != 0) { 931 if (dentry->d_count != 0) {
680 printk(KERN_ERR 932 printk(KERN_ERR
681 "BUG: Dentry %p{i=%lx,n=%s}" 933 "BUG: Dentry %p{i=%lx,n=%s}"
682 " still in use (%d)" 934 " still in use (%d)"
@@ -685,20 +937,23 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry)
685 dentry->d_inode ? 937 dentry->d_inode ?
686 dentry->d_inode->i_ino : 0UL, 938 dentry->d_inode->i_ino : 0UL,
687 dentry->d_name.name, 939 dentry->d_name.name,
688 atomic_read(&dentry->d_count), 940 dentry->d_count,
689 dentry->d_sb->s_type->name, 941 dentry->d_sb->s_type->name,
690 dentry->d_sb->s_id); 942 dentry->d_sb->s_id);
691 BUG(); 943 BUG();
692 } 944 }
693 945
694 if (IS_ROOT(dentry)) 946 if (IS_ROOT(dentry)) {
695 parent = NULL; 947 parent = NULL;
696 else { 948 list_del(&dentry->d_u.d_child);
949 } else {
697 parent = dentry->d_parent; 950 parent = dentry->d_parent;
698 atomic_dec(&parent->d_count); 951 spin_lock(&parent->d_lock);
952 parent->d_count--;
953 list_del(&dentry->d_u.d_child);
954 spin_unlock(&parent->d_lock);
699 } 955 }
700 956
701 list_del(&dentry->d_u.d_child);
702 detached++; 957 detached++;
703 958
704 inode = dentry->d_inode; 959 inode = dentry->d_inode;
@@ -728,8 +983,7 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry)
728 983
729/* 984/*
730 * destroy the dentries attached to a superblock on unmounting 985 * destroy the dentries attached to a superblock on unmounting
731 * - we don't need to use dentry->d_lock, and only need dcache_lock when 986 * - we don't need to use dentry->d_lock because:
732 * removing the dentry from the system lists and hashes because:
733 * - the superblock is detached from all mountings and open files, so the 987 * - the superblock is detached from all mountings and open files, so the
734 * dentry trees will not be rearranged by the VFS 988 * dentry trees will not be rearranged by the VFS
735 * - s_umount is write-locked, so the memory pressure shrinker will ignore 989 * - s_umount is write-locked, so the memory pressure shrinker will ignore
@@ -746,11 +1000,13 @@ void shrink_dcache_for_umount(struct super_block *sb)
746 1000
747 dentry = sb->s_root; 1001 dentry = sb->s_root;
748 sb->s_root = NULL; 1002 sb->s_root = NULL;
749 atomic_dec(&dentry->d_count); 1003 spin_lock(&dentry->d_lock);
1004 dentry->d_count--;
1005 spin_unlock(&dentry->d_lock);
750 shrink_dcache_for_umount_subtree(dentry); 1006 shrink_dcache_for_umount_subtree(dentry);
751 1007
752 while (!hlist_empty(&sb->s_anon)) { 1008 while (!hlist_bl_empty(&sb->s_anon)) {
753 dentry = hlist_entry(sb->s_anon.first, struct dentry, d_hash); 1009 dentry = hlist_bl_entry(hlist_bl_first(&sb->s_anon), struct dentry, d_hash);
754 shrink_dcache_for_umount_subtree(dentry); 1010 shrink_dcache_for_umount_subtree(dentry);
755 } 1011 }
756} 1012}
@@ -768,15 +1024,20 @@ void shrink_dcache_for_umount(struct super_block *sb)
768 * Return true if the parent or its subdirectories contain 1024 * Return true if the parent or its subdirectories contain
769 * a mount point 1025 * a mount point
770 */ 1026 */
771
772int have_submounts(struct dentry *parent) 1027int have_submounts(struct dentry *parent)
773{ 1028{
774 struct dentry *this_parent = parent; 1029 struct dentry *this_parent;
775 struct list_head *next; 1030 struct list_head *next;
1031 unsigned seq;
1032 int locked = 0;
1033
1034 seq = read_seqbegin(&rename_lock);
1035again:
1036 this_parent = parent;
776 1037
777 spin_lock(&dcache_lock);
778 if (d_mountpoint(parent)) 1038 if (d_mountpoint(parent))
779 goto positive; 1039 goto positive;
1040 spin_lock(&this_parent->d_lock);
780repeat: 1041repeat:
781 next = this_parent->d_subdirs.next; 1042 next = this_parent->d_subdirs.next;
782resume: 1043resume:
@@ -784,27 +1045,65 @@ resume:
784 struct list_head *tmp = next; 1045 struct list_head *tmp = next;
785 struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child); 1046 struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child);
786 next = tmp->next; 1047 next = tmp->next;
1048
1049 spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
787 /* Have we found a mount point ? */ 1050 /* Have we found a mount point ? */
788 if (d_mountpoint(dentry)) 1051 if (d_mountpoint(dentry)) {
1052 spin_unlock(&dentry->d_lock);
1053 spin_unlock(&this_parent->d_lock);
789 goto positive; 1054 goto positive;
1055 }
790 if (!list_empty(&dentry->d_subdirs)) { 1056 if (!list_empty(&dentry->d_subdirs)) {
1057 spin_unlock(&this_parent->d_lock);
1058 spin_release(&dentry->d_lock.dep_map, 1, _RET_IP_);
791 this_parent = dentry; 1059 this_parent = dentry;
1060 spin_acquire(&this_parent->d_lock.dep_map, 0, 1, _RET_IP_);
792 goto repeat; 1061 goto repeat;
793 } 1062 }
1063 spin_unlock(&dentry->d_lock);
794 } 1064 }
795 /* 1065 /*
796 * All done at this level ... ascend and resume the search. 1066 * All done at this level ... ascend and resume the search.
797 */ 1067 */
798 if (this_parent != parent) { 1068 if (this_parent != parent) {
799 next = this_parent->d_u.d_child.next; 1069 struct dentry *tmp;
800 this_parent = this_parent->d_parent; 1070 struct dentry *child;
1071
1072 tmp = this_parent->d_parent;
1073 rcu_read_lock();
1074 spin_unlock(&this_parent->d_lock);
1075 child = this_parent;
1076 this_parent = tmp;
1077 spin_lock(&this_parent->d_lock);
1078 /* might go back up the wrong parent if we have had a rename
1079 * or deletion */
1080 if (this_parent != child->d_parent ||
1081 (!locked && read_seqretry(&rename_lock, seq))) {
1082 spin_unlock(&this_parent->d_lock);
1083 rcu_read_unlock();
1084 goto rename_retry;
1085 }
1086 rcu_read_unlock();
1087 next = child->d_u.d_child.next;
801 goto resume; 1088 goto resume;
802 } 1089 }
803 spin_unlock(&dcache_lock); 1090 spin_unlock(&this_parent->d_lock);
1091 if (!locked && read_seqretry(&rename_lock, seq))
1092 goto rename_retry;
1093 if (locked)
1094 write_sequnlock(&rename_lock);
804 return 0; /* No mount points found in tree */ 1095 return 0; /* No mount points found in tree */
805positive: 1096positive:
806 spin_unlock(&dcache_lock); 1097 if (!locked && read_seqretry(&rename_lock, seq))
1098 goto rename_retry;
1099 if (locked)
1100 write_sequnlock(&rename_lock);
807 return 1; 1101 return 1;
1102
1103rename_retry:
1104 locked = 1;
1105 write_seqlock(&rename_lock);
1106 goto again;
808} 1107}
809EXPORT_SYMBOL(have_submounts); 1108EXPORT_SYMBOL(have_submounts);
810 1109
@@ -824,11 +1123,16 @@ EXPORT_SYMBOL(have_submounts);
824 */ 1123 */
825static int select_parent(struct dentry * parent) 1124static int select_parent(struct dentry * parent)
826{ 1125{
827 struct dentry *this_parent = parent; 1126 struct dentry *this_parent;
828 struct list_head *next; 1127 struct list_head *next;
1128 unsigned seq;
829 int found = 0; 1129 int found = 0;
1130 int locked = 0;
830 1131
831 spin_lock(&dcache_lock); 1132 seq = read_seqbegin(&rename_lock);
1133again:
1134 this_parent = parent;
1135 spin_lock(&this_parent->d_lock);
832repeat: 1136repeat:
833 next = this_parent->d_subdirs.next; 1137 next = this_parent->d_subdirs.next;
834resume: 1138resume:
@@ -837,11 +1141,13 @@ resume:
837 struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child); 1141 struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child);
838 next = tmp->next; 1142 next = tmp->next;
839 1143
1144 spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
1145
840 /* 1146 /*
841 * move only zero ref count dentries to the end 1147 * move only zero ref count dentries to the end
842 * of the unused list for prune_dcache 1148 * of the unused list for prune_dcache
843 */ 1149 */
844 if (!atomic_read(&dentry->d_count)) { 1150 if (!dentry->d_count) {
845 dentry_lru_move_tail(dentry); 1151 dentry_lru_move_tail(dentry);
846 found++; 1152 found++;
847 } else { 1153 } else {
@@ -853,28 +1159,63 @@ resume:
853 * ensures forward progress). We'll be coming back to find 1159 * ensures forward progress). We'll be coming back to find
854 * the rest. 1160 * the rest.
855 */ 1161 */
856 if (found && need_resched()) 1162 if (found && need_resched()) {
1163 spin_unlock(&dentry->d_lock);
857 goto out; 1164 goto out;
1165 }
858 1166
859 /* 1167 /*
860 * Descend a level if the d_subdirs list is non-empty. 1168 * Descend a level if the d_subdirs list is non-empty.
861 */ 1169 */
862 if (!list_empty(&dentry->d_subdirs)) { 1170 if (!list_empty(&dentry->d_subdirs)) {
1171 spin_unlock(&this_parent->d_lock);
1172 spin_release(&dentry->d_lock.dep_map, 1, _RET_IP_);
863 this_parent = dentry; 1173 this_parent = dentry;
1174 spin_acquire(&this_parent->d_lock.dep_map, 0, 1, _RET_IP_);
864 goto repeat; 1175 goto repeat;
865 } 1176 }
1177
1178 spin_unlock(&dentry->d_lock);
866 } 1179 }
867 /* 1180 /*
868 * All done at this level ... ascend and resume the search. 1181 * All done at this level ... ascend and resume the search.
869 */ 1182 */
870 if (this_parent != parent) { 1183 if (this_parent != parent) {
871 next = this_parent->d_u.d_child.next; 1184 struct dentry *tmp;
872 this_parent = this_parent->d_parent; 1185 struct dentry *child;
1186
1187 tmp = this_parent->d_parent;
1188 rcu_read_lock();
1189 spin_unlock(&this_parent->d_lock);
1190 child = this_parent;
1191 this_parent = tmp;
1192 spin_lock(&this_parent->d_lock);
1193 /* might go back up the wrong parent if we have had a rename
1194 * or deletion */
1195 if (this_parent != child->d_parent ||
1196 (!locked && read_seqretry(&rename_lock, seq))) {
1197 spin_unlock(&this_parent->d_lock);
1198 rcu_read_unlock();
1199 goto rename_retry;
1200 }
1201 rcu_read_unlock();
1202 next = child->d_u.d_child.next;
873 goto resume; 1203 goto resume;
874 } 1204 }
875out: 1205out:
876 spin_unlock(&dcache_lock); 1206 spin_unlock(&this_parent->d_lock);
1207 if (!locked && read_seqretry(&rename_lock, seq))
1208 goto rename_retry;
1209 if (locked)
1210 write_sequnlock(&rename_lock);
877 return found; 1211 return found;
1212
1213rename_retry:
1214 if (found)
1215 return found;
1216 locked = 1;
1217 write_seqlock(&rename_lock);
1218 goto again;
878} 1219}
879 1220
880/** 1221/**
@@ -908,16 +1249,13 @@ EXPORT_SYMBOL(shrink_dcache_parent);
908 */ 1249 */
909static int shrink_dcache_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask) 1250static int shrink_dcache_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask)
910{ 1251{
911 int nr_unused;
912
913 if (nr) { 1252 if (nr) {
914 if (!(gfp_mask & __GFP_FS)) 1253 if (!(gfp_mask & __GFP_FS))
915 return -1; 1254 return -1;
916 prune_dcache(nr); 1255 prune_dcache(nr);
917 } 1256 }
918 1257
919 nr_unused = percpu_counter_sum_positive(&nr_dentry_unused); 1258 return (dentry_stat.nr_unused / 100) * sysctl_vfs_cache_pressure;
920 return (nr_unused / 100) * sysctl_vfs_cache_pressure;
921} 1259}
922 1260
923static struct shrinker dcache_shrinker = { 1261static struct shrinker dcache_shrinker = {
@@ -960,38 +1298,54 @@ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name)
960 memcpy(dname, name->name, name->len); 1298 memcpy(dname, name->name, name->len);
961 dname[name->len] = 0; 1299 dname[name->len] = 0;
962 1300
963 atomic_set(&dentry->d_count, 1); 1301 dentry->d_count = 1;
964 dentry->d_flags = DCACHE_UNHASHED; 1302 dentry->d_flags = DCACHE_UNHASHED;
965 spin_lock_init(&dentry->d_lock); 1303 spin_lock_init(&dentry->d_lock);
1304 seqcount_init(&dentry->d_seq);
966 dentry->d_inode = NULL; 1305 dentry->d_inode = NULL;
967 dentry->d_parent = NULL; 1306 dentry->d_parent = NULL;
968 dentry->d_sb = NULL; 1307 dentry->d_sb = NULL;
969 dentry->d_op = NULL; 1308 dentry->d_op = NULL;
970 dentry->d_fsdata = NULL; 1309 dentry->d_fsdata = NULL;
971 dentry->d_mounted = 0; 1310 INIT_HLIST_BL_NODE(&dentry->d_hash);
972 INIT_HLIST_NODE(&dentry->d_hash);
973 INIT_LIST_HEAD(&dentry->d_lru); 1311 INIT_LIST_HEAD(&dentry->d_lru);
974 INIT_LIST_HEAD(&dentry->d_subdirs); 1312 INIT_LIST_HEAD(&dentry->d_subdirs);
975 INIT_LIST_HEAD(&dentry->d_alias); 1313 INIT_LIST_HEAD(&dentry->d_alias);
1314 INIT_LIST_HEAD(&dentry->d_u.d_child);
976 1315
977 if (parent) { 1316 if (parent) {
978 dentry->d_parent = dget(parent); 1317 spin_lock(&parent->d_lock);
1318 /*
1319 * don't need child lock because it is not subject
1320 * to concurrency here
1321 */
1322 __dget_dlock(parent);
1323 dentry->d_parent = parent;
979 dentry->d_sb = parent->d_sb; 1324 dentry->d_sb = parent->d_sb;
980 } else { 1325 d_set_d_op(dentry, dentry->d_sb->s_d_op);
981 INIT_LIST_HEAD(&dentry->d_u.d_child);
982 }
983
984 spin_lock(&dcache_lock);
985 if (parent)
986 list_add(&dentry->d_u.d_child, &parent->d_subdirs); 1326 list_add(&dentry->d_u.d_child, &parent->d_subdirs);
987 spin_unlock(&dcache_lock); 1327 spin_unlock(&parent->d_lock);
1328 }
988 1329
989 percpu_counter_inc(&nr_dentry); 1330 this_cpu_inc(nr_dentry);
990 1331
991 return dentry; 1332 return dentry;
992} 1333}
993EXPORT_SYMBOL(d_alloc); 1334EXPORT_SYMBOL(d_alloc);
994 1335
1336struct dentry *d_alloc_pseudo(struct super_block *sb, const struct qstr *name)
1337{
1338 struct dentry *dentry = d_alloc(NULL, name);
1339 if (dentry) {
1340 dentry->d_sb = sb;
1341 d_set_d_op(dentry, dentry->d_sb->s_d_op);
1342 dentry->d_parent = dentry;
1343 dentry->d_flags |= DCACHE_DISCONNECTED;
1344 }
1345 return dentry;
1346}
1347EXPORT_SYMBOL(d_alloc_pseudo);
1348
995struct dentry *d_alloc_name(struct dentry *parent, const char *name) 1349struct dentry *d_alloc_name(struct dentry *parent, const char *name)
996{ 1350{
997 struct qstr q; 1351 struct qstr q;
@@ -1003,12 +1357,39 @@ struct dentry *d_alloc_name(struct dentry *parent, const char *name)
1003} 1357}
1004EXPORT_SYMBOL(d_alloc_name); 1358EXPORT_SYMBOL(d_alloc_name);
1005 1359
1006/* the caller must hold dcache_lock */ 1360void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op)
1361{
1362 WARN_ON_ONCE(dentry->d_op);
1363 WARN_ON_ONCE(dentry->d_flags & (DCACHE_OP_HASH |
1364 DCACHE_OP_COMPARE |
1365 DCACHE_OP_REVALIDATE |
1366 DCACHE_OP_DELETE ));
1367 dentry->d_op = op;
1368 if (!op)
1369 return;
1370 if (op->d_hash)
1371 dentry->d_flags |= DCACHE_OP_HASH;
1372 if (op->d_compare)
1373 dentry->d_flags |= DCACHE_OP_COMPARE;
1374 if (op->d_revalidate)
1375 dentry->d_flags |= DCACHE_OP_REVALIDATE;
1376 if (op->d_delete)
1377 dentry->d_flags |= DCACHE_OP_DELETE;
1378
1379}
1380EXPORT_SYMBOL(d_set_d_op);
1381
1007static void __d_instantiate(struct dentry *dentry, struct inode *inode) 1382static void __d_instantiate(struct dentry *dentry, struct inode *inode)
1008{ 1383{
1009 if (inode) 1384 spin_lock(&dentry->d_lock);
1385 if (inode) {
1386 if (unlikely(IS_AUTOMOUNT(inode)))
1387 dentry->d_flags |= DCACHE_NEED_AUTOMOUNT;
1010 list_add(&dentry->d_alias, &inode->i_dentry); 1388 list_add(&dentry->d_alias, &inode->i_dentry);
1389 }
1011 dentry->d_inode = inode; 1390 dentry->d_inode = inode;
1391 dentry_rcuwalk_barrier(dentry);
1392 spin_unlock(&dentry->d_lock);
1012 fsnotify_d_instantiate(dentry, inode); 1393 fsnotify_d_instantiate(dentry, inode);
1013} 1394}
1014 1395
@@ -1030,9 +1411,11 @@ static void __d_instantiate(struct dentry *dentry, struct inode *inode)
1030void d_instantiate(struct dentry *entry, struct inode * inode) 1411void d_instantiate(struct dentry *entry, struct inode * inode)
1031{ 1412{
1032 BUG_ON(!list_empty(&entry->d_alias)); 1413 BUG_ON(!list_empty(&entry->d_alias));
1033 spin_lock(&dcache_lock); 1414 if (inode)
1415 spin_lock(&inode->i_lock);
1034 __d_instantiate(entry, inode); 1416 __d_instantiate(entry, inode);
1035 spin_unlock(&dcache_lock); 1417 if (inode)
1418 spin_unlock(&inode->i_lock);
1036 security_d_instantiate(entry, inode); 1419 security_d_instantiate(entry, inode);
1037} 1420}
1038EXPORT_SYMBOL(d_instantiate); 1421EXPORT_SYMBOL(d_instantiate);
@@ -1069,15 +1452,18 @@ static struct dentry *__d_instantiate_unique(struct dentry *entry,
1069 list_for_each_entry(alias, &inode->i_dentry, d_alias) { 1452 list_for_each_entry(alias, &inode->i_dentry, d_alias) {
1070 struct qstr *qstr = &alias->d_name; 1453 struct qstr *qstr = &alias->d_name;
1071 1454
1455 /*
1456 * Don't need alias->d_lock here, because aliases with
1457 * d_parent == entry->d_parent are not subject to name or
1458 * parent changes, because the parent inode i_mutex is held.
1459 */
1072 if (qstr->hash != hash) 1460 if (qstr->hash != hash)
1073 continue; 1461 continue;
1074 if (alias->d_parent != entry->d_parent) 1462 if (alias->d_parent != entry->d_parent)
1075 continue; 1463 continue;
1076 if (qstr->len != len) 1464 if (dentry_cmp(qstr->name, qstr->len, name, len))
1077 continue; 1465 continue;
1078 if (memcmp(qstr->name, name, len)) 1466 __dget(alias);
1079 continue;
1080 dget_locked(alias);
1081 return alias; 1467 return alias;
1082 } 1468 }
1083 1469
@@ -1091,9 +1477,11 @@ struct dentry *d_instantiate_unique(struct dentry *entry, struct inode *inode)
1091 1477
1092 BUG_ON(!list_empty(&entry->d_alias)); 1478 BUG_ON(!list_empty(&entry->d_alias));
1093 1479
1094 spin_lock(&dcache_lock); 1480 if (inode)
1481 spin_lock(&inode->i_lock);
1095 result = __d_instantiate_unique(entry, inode); 1482 result = __d_instantiate_unique(entry, inode);
1096 spin_unlock(&dcache_lock); 1483 if (inode)
1484 spin_unlock(&inode->i_lock);
1097 1485
1098 if (!result) { 1486 if (!result) {
1099 security_d_instantiate(entry, inode); 1487 security_d_instantiate(entry, inode);
@@ -1126,6 +1514,7 @@ struct dentry * d_alloc_root(struct inode * root_inode)
1126 res = d_alloc(NULL, &name); 1514 res = d_alloc(NULL, &name);
1127 if (res) { 1515 if (res) {
1128 res->d_sb = root_inode->i_sb; 1516 res->d_sb = root_inode->i_sb;
1517 d_set_d_op(res, res->d_sb->s_d_op);
1129 res->d_parent = res; 1518 res->d_parent = res;
1130 d_instantiate(res, root_inode); 1519 d_instantiate(res, root_inode);
1131 } 1520 }
@@ -1134,14 +1523,6 @@ struct dentry * d_alloc_root(struct inode * root_inode)
1134} 1523}
1135EXPORT_SYMBOL(d_alloc_root); 1524EXPORT_SYMBOL(d_alloc_root);
1136 1525
1137static inline struct hlist_head *d_hash(struct dentry *parent,
1138 unsigned long hash)
1139{
1140 hash += ((unsigned long) parent ^ GOLDEN_RATIO_PRIME) / L1_CACHE_BYTES;
1141 hash = hash ^ ((hash ^ GOLDEN_RATIO_PRIME) >> D_HASHBITS);
1142 return dentry_hashtable + (hash & D_HASHMASK);
1143}
1144
1145/** 1526/**
1146 * d_obtain_alias - find or allocate a dentry for a given inode 1527 * d_obtain_alias - find or allocate a dentry for a given inode
1147 * @inode: inode to allocate the dentry for 1528 * @inode: inode to allocate the dentry for
@@ -1182,10 +1563,11 @@ struct dentry *d_obtain_alias(struct inode *inode)
1182 } 1563 }
1183 tmp->d_parent = tmp; /* make sure dput doesn't croak */ 1564 tmp->d_parent = tmp; /* make sure dput doesn't croak */
1184 1565
1185 spin_lock(&dcache_lock); 1566
1567 spin_lock(&inode->i_lock);
1186 res = __d_find_alias(inode, 0); 1568 res = __d_find_alias(inode, 0);
1187 if (res) { 1569 if (res) {
1188 spin_unlock(&dcache_lock); 1570 spin_unlock(&inode->i_lock);
1189 dput(tmp); 1571 dput(tmp);
1190 goto out_iput; 1572 goto out_iput;
1191 } 1573 }
@@ -1193,14 +1575,17 @@ struct dentry *d_obtain_alias(struct inode *inode)
1193 /* attach a disconnected dentry */ 1575 /* attach a disconnected dentry */
1194 spin_lock(&tmp->d_lock); 1576 spin_lock(&tmp->d_lock);
1195 tmp->d_sb = inode->i_sb; 1577 tmp->d_sb = inode->i_sb;
1578 d_set_d_op(tmp, tmp->d_sb->s_d_op);
1196 tmp->d_inode = inode; 1579 tmp->d_inode = inode;
1197 tmp->d_flags |= DCACHE_DISCONNECTED; 1580 tmp->d_flags |= DCACHE_DISCONNECTED;
1198 tmp->d_flags &= ~DCACHE_UNHASHED;
1199 list_add(&tmp->d_alias, &inode->i_dentry); 1581 list_add(&tmp->d_alias, &inode->i_dentry);
1200 hlist_add_head(&tmp->d_hash, &inode->i_sb->s_anon); 1582 bit_spin_lock(0, (unsigned long *)&tmp->d_sb->s_anon.first);
1583 tmp->d_flags &= ~DCACHE_UNHASHED;
1584 hlist_bl_add_head(&tmp->d_hash, &tmp->d_sb->s_anon);
1585 __bit_spin_unlock(0, (unsigned long *)&tmp->d_sb->s_anon.first);
1201 spin_unlock(&tmp->d_lock); 1586 spin_unlock(&tmp->d_lock);
1587 spin_unlock(&inode->i_lock);
1202 1588
1203 spin_unlock(&dcache_lock);
1204 return tmp; 1589 return tmp;
1205 1590
1206 out_iput: 1591 out_iput:
@@ -1230,18 +1615,18 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
1230 struct dentry *new = NULL; 1615 struct dentry *new = NULL;
1231 1616
1232 if (inode && S_ISDIR(inode->i_mode)) { 1617 if (inode && S_ISDIR(inode->i_mode)) {
1233 spin_lock(&dcache_lock); 1618 spin_lock(&inode->i_lock);
1234 new = __d_find_alias(inode, 1); 1619 new = __d_find_alias(inode, 1);
1235 if (new) { 1620 if (new) {
1236 BUG_ON(!(new->d_flags & DCACHE_DISCONNECTED)); 1621 BUG_ON(!(new->d_flags & DCACHE_DISCONNECTED));
1237 spin_unlock(&dcache_lock); 1622 spin_unlock(&inode->i_lock);
1238 security_d_instantiate(new, inode); 1623 security_d_instantiate(new, inode);
1239 d_move(new, dentry); 1624 d_move(new, dentry);
1240 iput(inode); 1625 iput(inode);
1241 } else { 1626 } else {
1242 /* already taking dcache_lock, so d_add() by hand */ 1627 /* already taking inode->i_lock, so d_add() by hand */
1243 __d_instantiate(dentry, inode); 1628 __d_instantiate(dentry, inode);
1244 spin_unlock(&dcache_lock); 1629 spin_unlock(&inode->i_lock);
1245 security_d_instantiate(dentry, inode); 1630 security_d_instantiate(dentry, inode);
1246 d_rehash(dentry); 1631 d_rehash(dentry);
1247 } 1632 }
@@ -1314,10 +1699,10 @@ struct dentry *d_add_ci(struct dentry *dentry, struct inode *inode,
1314 * Negative dentry: instantiate it unless the inode is a directory and 1699 * Negative dentry: instantiate it unless the inode is a directory and
1315 * already has a dentry. 1700 * already has a dentry.
1316 */ 1701 */
1317 spin_lock(&dcache_lock); 1702 spin_lock(&inode->i_lock);
1318 if (!S_ISDIR(inode->i_mode) || list_empty(&inode->i_dentry)) { 1703 if (!S_ISDIR(inode->i_mode) || list_empty(&inode->i_dentry)) {
1319 __d_instantiate(found, inode); 1704 __d_instantiate(found, inode);
1320 spin_unlock(&dcache_lock); 1705 spin_unlock(&inode->i_lock);
1321 security_d_instantiate(found, inode); 1706 security_d_instantiate(found, inode);
1322 return found; 1707 return found;
1323 } 1708 }
@@ -1327,8 +1712,8 @@ struct dentry *d_add_ci(struct dentry *dentry, struct inode *inode,
1327 * reference to it, move it in place and use it. 1712 * reference to it, move it in place and use it.
1328 */ 1713 */
1329 new = list_entry(inode->i_dentry.next, struct dentry, d_alias); 1714 new = list_entry(inode->i_dentry.next, struct dentry, d_alias);
1330 dget_locked(new); 1715 __dget(new);
1331 spin_unlock(&dcache_lock); 1716 spin_unlock(&inode->i_lock);
1332 security_d_instantiate(found, inode); 1717 security_d_instantiate(found, inode);
1333 d_move(new, found); 1718 d_move(new, found);
1334 iput(inode); 1719 iput(inode);
@@ -1342,6 +1727,112 @@ err_out:
1342EXPORT_SYMBOL(d_add_ci); 1727EXPORT_SYMBOL(d_add_ci);
1343 1728
1344/** 1729/**
1730 * __d_lookup_rcu - search for a dentry (racy, store-free)
1731 * @parent: parent dentry
1732 * @name: qstr of name we wish to find
1733 * @seq: returns d_seq value at the point where the dentry was found
1734 * @inode: returns dentry->d_inode when the inode was found valid.
1735 * Returns: dentry, or NULL
1736 *
1737 * __d_lookup_rcu is the dcache lookup function for rcu-walk name
1738 * resolution (store-free path walking) design described in
1739 * Documentation/filesystems/path-lookup.txt.
1740 *
1741 * This is not to be used outside core vfs.
1742 *
1743 * __d_lookup_rcu must only be used in rcu-walk mode, ie. with vfsmount lock
1744 * held, and rcu_read_lock held. The returned dentry must not be stored into
1745 * without taking d_lock and checking d_seq sequence count against @seq
1746 * returned here.
1747 *
1748 * A refcount may be taken on the found dentry with the __d_rcu_to_refcount
1749 * function.
1750 *
1751 * Alternatively, __d_lookup_rcu may be called again to look up the child of
1752 * the returned dentry, so long as its parent's seqlock is checked after the
1753 * child is looked up. Thus, an interlocking stepping of sequence lock checks
1754 * is formed, giving integrity down the path walk.
1755 */
1756struct dentry *__d_lookup_rcu(struct dentry *parent, struct qstr *name,
1757 unsigned *seq, struct inode **inode)
1758{
1759 unsigned int len = name->len;
1760 unsigned int hash = name->hash;
1761 const unsigned char *str = name->name;
1762 struct dcache_hash_bucket *b = d_hash(parent, hash);
1763 struct hlist_bl_node *node;
1764 struct dentry *dentry;
1765
1766 /*
1767 * Note: There is significant duplication with __d_lookup which is
1768 * required to prevent single threaded performance regressions
1769 * especially on architectures where smp_rmb (in seqcounts) are costly.
1770 * Keep the two functions in sync.
1771 */
1772
1773 /*
1774 * The hash list is protected using RCU.
1775 *
1776 * Carefully use d_seq when comparing a candidate dentry, to avoid
1777 * races with d_move().
1778 *
1779 * It is possible that concurrent renames can mess up our list
1780 * walk here and result in missing our dentry, resulting in the
1781 * false-negative result. d_lookup() protects against concurrent
1782 * renames using rename_lock seqlock.
1783 *
1784 * See Documentation/vfs/dcache-locking.txt for more details.
1785 */
1786 hlist_bl_for_each_entry_rcu(dentry, node, &b->head, d_hash) {
1787 struct inode *i;
1788 const char *tname;
1789 int tlen;
1790
1791 if (dentry->d_name.hash != hash)
1792 continue;
1793
1794seqretry:
1795 *seq = read_seqcount_begin(&dentry->d_seq);
1796 if (dentry->d_parent != parent)
1797 continue;
1798 if (d_unhashed(dentry))
1799 continue;
1800 tlen = dentry->d_name.len;
1801 tname = dentry->d_name.name;
1802 i = dentry->d_inode;
1803 prefetch(tname);
1804 if (i)
1805 prefetch(i);
1806 /*
1807 * This seqcount check is required to ensure name and
1808 * len are loaded atomically, so as not to walk off the
1809 * edge of memory when walking. If we could load this
1810 * atomically some other way, we could drop this check.
1811 */
1812 if (read_seqcount_retry(&dentry->d_seq, *seq))
1813 goto seqretry;
1814 if (parent->d_flags & DCACHE_OP_COMPARE) {
1815 if (parent->d_op->d_compare(parent, *inode,
1816 dentry, i,
1817 tlen, tname, name))
1818 continue;
1819 } else {
1820 if (dentry_cmp(tname, tlen, str, len))
1821 continue;
1822 }
1823 /*
1824 * No extra seqcount check is required after the name
1825 * compare. The caller must perform a seqcount check in
1826 * order to do anything useful with the returned dentry
1827 * anyway.
1828 */
1829 *inode = i;
1830 return dentry;
1831 }
1832 return NULL;
1833}
1834
1835/**
1345 * d_lookup - search for a dentry 1836 * d_lookup - search for a dentry
1346 * @parent: parent dentry 1837 * @parent: parent dentry
1347 * @name: qstr of name we wish to find 1838 * @name: qstr of name we wish to find
@@ -1352,10 +1843,10 @@ EXPORT_SYMBOL(d_add_ci);
1352 * dentry is returned. The caller must use dput to free the entry when it has 1843 * dentry is returned. The caller must use dput to free the entry when it has
1353 * finished using it. %NULL is returned if the dentry does not exist. 1844 * finished using it. %NULL is returned if the dentry does not exist.
1354 */ 1845 */
1355struct dentry * d_lookup(struct dentry * parent, struct qstr * name) 1846struct dentry *d_lookup(struct dentry *parent, struct qstr *name)
1356{ 1847{
1357 struct dentry * dentry = NULL; 1848 struct dentry *dentry;
1358 unsigned long seq; 1849 unsigned seq;
1359 1850
1360 do { 1851 do {
1361 seq = read_seqbegin(&rename_lock); 1852 seq = read_seqbegin(&rename_lock);
@@ -1367,7 +1858,7 @@ struct dentry * d_lookup(struct dentry * parent, struct qstr * name)
1367} 1858}
1368EXPORT_SYMBOL(d_lookup); 1859EXPORT_SYMBOL(d_lookup);
1369 1860
1370/* 1861/**
1371 * __d_lookup - search for a dentry (racy) 1862 * __d_lookup - search for a dentry (racy)
1372 * @parent: parent dentry 1863 * @parent: parent dentry
1373 * @name: qstr of name we wish to find 1864 * @name: qstr of name we wish to find
@@ -1382,17 +1873,24 @@ EXPORT_SYMBOL(d_lookup);
1382 * 1873 *
1383 * __d_lookup callers must be commented. 1874 * __d_lookup callers must be commented.
1384 */ 1875 */
1385struct dentry * __d_lookup(struct dentry * parent, struct qstr * name) 1876struct dentry *__d_lookup(struct dentry *parent, struct qstr *name)
1386{ 1877{
1387 unsigned int len = name->len; 1878 unsigned int len = name->len;
1388 unsigned int hash = name->hash; 1879 unsigned int hash = name->hash;
1389 const unsigned char *str = name->name; 1880 const unsigned char *str = name->name;
1390 struct hlist_head *head = d_hash(parent,hash); 1881 struct dcache_hash_bucket *b = d_hash(parent, hash);
1882 struct hlist_bl_node *node;
1391 struct dentry *found = NULL; 1883 struct dentry *found = NULL;
1392 struct hlist_node *node;
1393 struct dentry *dentry; 1884 struct dentry *dentry;
1394 1885
1395 /* 1886 /*
1887 * Note: There is significant duplication with __d_lookup_rcu which is
1888 * required to prevent single threaded performance regressions
1889 * especially on architectures where smp_rmb (in seqcounts) are costly.
1890 * Keep the two functions in sync.
1891 */
1892
1893 /*
1396 * The hash list is protected using RCU. 1894 * The hash list is protected using RCU.
1397 * 1895 *
1398 * Take d_lock when comparing a candidate dentry, to avoid races 1896 * Take d_lock when comparing a candidate dentry, to avoid races
@@ -1407,25 +1905,16 @@ struct dentry * __d_lookup(struct dentry * parent, struct qstr * name)
1407 */ 1905 */
1408 rcu_read_lock(); 1906 rcu_read_lock();
1409 1907
1410 hlist_for_each_entry_rcu(dentry, node, head, d_hash) { 1908 hlist_bl_for_each_entry_rcu(dentry, node, &b->head, d_hash) {
1411 struct qstr *qstr; 1909 const char *tname;
1910 int tlen;
1412 1911
1413 if (dentry->d_name.hash != hash) 1912 if (dentry->d_name.hash != hash)
1414 continue; 1913 continue;
1415 if (dentry->d_parent != parent)
1416 continue;
1417 1914
1418 spin_lock(&dentry->d_lock); 1915 spin_lock(&dentry->d_lock);
1419
1420 /*
1421 * Recheck the dentry after taking the lock - d_move may have
1422 * changed things. Don't bother checking the hash because
1423 * we're about to compare the whole name anyway.
1424 */
1425 if (dentry->d_parent != parent) 1916 if (dentry->d_parent != parent)
1426 goto next; 1917 goto next;
1427
1428 /* non-existing due to RCU? */
1429 if (d_unhashed(dentry)) 1918 if (d_unhashed(dentry))
1430 goto next; 1919 goto next;
1431 1920
@@ -1433,18 +1922,19 @@ struct dentry * __d_lookup(struct dentry * parent, struct qstr * name)
1433 * It is safe to compare names since d_move() cannot 1922 * It is safe to compare names since d_move() cannot
1434 * change the qstr (protected by d_lock). 1923 * change the qstr (protected by d_lock).
1435 */ 1924 */
1436 qstr = &dentry->d_name; 1925 tlen = dentry->d_name.len;
1437 if (parent->d_op && parent->d_op->d_compare) { 1926 tname = dentry->d_name.name;
1438 if (parent->d_op->d_compare(parent, qstr, name)) 1927 if (parent->d_flags & DCACHE_OP_COMPARE) {
1928 if (parent->d_op->d_compare(parent, parent->d_inode,
1929 dentry, dentry->d_inode,
1930 tlen, tname, name))
1439 goto next; 1931 goto next;
1440 } else { 1932 } else {
1441 if (qstr->len != len) 1933 if (dentry_cmp(tname, tlen, str, len))
1442 goto next;
1443 if (memcmp(qstr->name, str, len))
1444 goto next; 1934 goto next;
1445 } 1935 }
1446 1936
1447 atomic_inc(&dentry->d_count); 1937 dentry->d_count++;
1448 found = dentry; 1938 found = dentry;
1449 spin_unlock(&dentry->d_lock); 1939 spin_unlock(&dentry->d_lock);
1450 break; 1940 break;
@@ -1473,8 +1963,8 @@ struct dentry *d_hash_and_lookup(struct dentry *dir, struct qstr *name)
1473 * routine may choose to leave the hash value unchanged. 1963 * routine may choose to leave the hash value unchanged.
1474 */ 1964 */
1475 name->hash = full_name_hash(name->name, name->len); 1965 name->hash = full_name_hash(name->name, name->len);
1476 if (dir->d_op && dir->d_op->d_hash) { 1966 if (dir->d_flags & DCACHE_OP_HASH) {
1477 if (dir->d_op->d_hash(dir, name) < 0) 1967 if (dir->d_op->d_hash(dir, dir->d_inode, name) < 0)
1478 goto out; 1968 goto out;
1479 } 1969 }
1480 dentry = d_lookup(dir, name); 1970 dentry = d_lookup(dir, name);
@@ -1483,34 +1973,32 @@ out:
1483} 1973}
1484 1974
1485/** 1975/**
1486 * d_validate - verify dentry provided from insecure source 1976 * d_validate - verify dentry provided from insecure source (deprecated)
1487 * @dentry: The dentry alleged to be valid child of @dparent 1977 * @dentry: The dentry alleged to be valid child of @dparent
1488 * @dparent: The parent dentry (known to be valid) 1978 * @dparent: The parent dentry (known to be valid)
1489 * 1979 *
1490 * An insecure source has sent us a dentry, here we verify it and dget() it. 1980 * An insecure source has sent us a dentry, here we verify it and dget() it.
1491 * This is used by ncpfs in its readdir implementation. 1981 * This is used by ncpfs in its readdir implementation.
1492 * Zero is returned in the dentry is invalid. 1982 * Zero is returned in the dentry is invalid.
1983 *
1984 * This function is slow for big directories, and deprecated, do not use it.
1493 */ 1985 */
1494int d_validate(struct dentry *dentry, struct dentry *parent) 1986int d_validate(struct dentry *dentry, struct dentry *dparent)
1495{ 1987{
1496 struct hlist_head *head = d_hash(parent, dentry->d_name.hash); 1988 struct dentry *child;
1497 struct hlist_node *node;
1498 struct dentry *d;
1499 1989
1500 /* Check whether the ptr might be valid at all.. */ 1990 spin_lock(&dparent->d_lock);
1501 if (!kmem_ptr_validate(dentry_cache, dentry)) 1991 list_for_each_entry(child, &dparent->d_subdirs, d_u.d_child) {
1502 return 0; 1992 if (dentry == child) {
1503 if (dentry->d_parent != parent) 1993 spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
1504 return 0; 1994 __dget_dlock(dentry);
1505 1995 spin_unlock(&dentry->d_lock);
1506 rcu_read_lock(); 1996 spin_unlock(&dparent->d_lock);
1507 hlist_for_each_entry_rcu(d, node, head, d_hash) {
1508 if (d == dentry) {
1509 dget(dentry);
1510 return 1; 1997 return 1;
1511 } 1998 }
1512 } 1999 }
1513 rcu_read_unlock(); 2000 spin_unlock(&dparent->d_lock);
2001
1514 return 0; 2002 return 0;
1515} 2003}
1516EXPORT_SYMBOL(d_validate); 2004EXPORT_SYMBOL(d_validate);
@@ -1538,16 +2026,23 @@ EXPORT_SYMBOL(d_validate);
1538 2026
1539void d_delete(struct dentry * dentry) 2027void d_delete(struct dentry * dentry)
1540{ 2028{
2029 struct inode *inode;
1541 int isdir = 0; 2030 int isdir = 0;
1542 /* 2031 /*
1543 * Are we the only user? 2032 * Are we the only user?
1544 */ 2033 */
1545 spin_lock(&dcache_lock); 2034again:
1546 spin_lock(&dentry->d_lock); 2035 spin_lock(&dentry->d_lock);
1547 isdir = S_ISDIR(dentry->d_inode->i_mode); 2036 inode = dentry->d_inode;
1548 if (atomic_read(&dentry->d_count) == 1) { 2037 isdir = S_ISDIR(inode->i_mode);
2038 if (dentry->d_count == 1) {
2039 if (inode && !spin_trylock(&inode->i_lock)) {
2040 spin_unlock(&dentry->d_lock);
2041 cpu_relax();
2042 goto again;
2043 }
1549 dentry->d_flags &= ~DCACHE_CANT_MOUNT; 2044 dentry->d_flags &= ~DCACHE_CANT_MOUNT;
1550 dentry_iput(dentry); 2045 dentry_unlink_inode(dentry);
1551 fsnotify_nameremove(dentry, isdir); 2046 fsnotify_nameremove(dentry, isdir);
1552 return; 2047 return;
1553 } 2048 }
@@ -1556,17 +2051,18 @@ void d_delete(struct dentry * dentry)
1556 __d_drop(dentry); 2051 __d_drop(dentry);
1557 2052
1558 spin_unlock(&dentry->d_lock); 2053 spin_unlock(&dentry->d_lock);
1559 spin_unlock(&dcache_lock);
1560 2054
1561 fsnotify_nameremove(dentry, isdir); 2055 fsnotify_nameremove(dentry, isdir);
1562} 2056}
1563EXPORT_SYMBOL(d_delete); 2057EXPORT_SYMBOL(d_delete);
1564 2058
1565static void __d_rehash(struct dentry * entry, struct hlist_head *list) 2059static void __d_rehash(struct dentry * entry, struct dcache_hash_bucket *b)
1566{ 2060{
1567 2061 BUG_ON(!d_unhashed(entry));
2062 spin_lock_bucket(b);
1568 entry->d_flags &= ~DCACHE_UNHASHED; 2063 entry->d_flags &= ~DCACHE_UNHASHED;
1569 hlist_add_head_rcu(&entry->d_hash, list); 2064 hlist_bl_add_head_rcu(&entry->d_hash, &b->head);
2065 spin_unlock_bucket(b);
1570} 2066}
1571 2067
1572static void _d_rehash(struct dentry * entry) 2068static void _d_rehash(struct dentry * entry)
@@ -1583,25 +2079,39 @@ static void _d_rehash(struct dentry * entry)
1583 2079
1584void d_rehash(struct dentry * entry) 2080void d_rehash(struct dentry * entry)
1585{ 2081{
1586 spin_lock(&dcache_lock);
1587 spin_lock(&entry->d_lock); 2082 spin_lock(&entry->d_lock);
1588 _d_rehash(entry); 2083 _d_rehash(entry);
1589 spin_unlock(&entry->d_lock); 2084 spin_unlock(&entry->d_lock);
1590 spin_unlock(&dcache_lock);
1591} 2085}
1592EXPORT_SYMBOL(d_rehash); 2086EXPORT_SYMBOL(d_rehash);
1593 2087
1594/* 2088/**
1595 * When switching names, the actual string doesn't strictly have to 2089 * dentry_update_name_case - update case insensitive dentry with a new name
1596 * be preserved in the target - because we're dropping the target 2090 * @dentry: dentry to be updated
1597 * anyway. As such, we can just do a simple memcpy() to copy over 2091 * @name: new name
1598 * the new name before we switch.
1599 * 2092 *
1600 * Note that we have to be a lot more careful about getting the hash 2093 * Update a case insensitive dentry with new case of name.
1601 * switched - we have to switch the hash value properly even if it 2094 *
1602 * then no longer matches the actual (corrupted) string of the target. 2095 * dentry must have been returned by d_lookup with name @name. Old and new
1603 * The hash value has to match the hash queue that the dentry is on.. 2096 * name lengths must match (ie. no d_compare which allows mismatched name
2097 * lengths).
2098 *
2099 * Parent inode i_mutex must be held over d_lookup and into this call (to
2100 * keep renames and concurrent inserts, and readdir(2) away).
1604 */ 2101 */
2102void dentry_update_name_case(struct dentry *dentry, struct qstr *name)
2103{
2104 BUG_ON(!mutex_is_locked(&dentry->d_inode->i_mutex));
2105 BUG_ON(dentry->d_name.len != name->len); /* d_lookup gives this */
2106
2107 spin_lock(&dentry->d_lock);
2108 write_seqcount_begin(&dentry->d_seq);
2109 memcpy((unsigned char *)dentry->d_name.name, name->name, name->len);
2110 write_seqcount_end(&dentry->d_seq);
2111 spin_unlock(&dentry->d_lock);
2112}
2113EXPORT_SYMBOL(dentry_update_name_case);
2114
1605static void switch_names(struct dentry *dentry, struct dentry *target) 2115static void switch_names(struct dentry *dentry, struct dentry *target)
1606{ 2116{
1607 if (dname_external(target)) { 2117 if (dname_external(target)) {
@@ -1643,54 +2153,84 @@ static void switch_names(struct dentry *dentry, struct dentry *target)
1643 swap(dentry->d_name.len, target->d_name.len); 2153 swap(dentry->d_name.len, target->d_name.len);
1644} 2154}
1645 2155
2156static void dentry_lock_for_move(struct dentry *dentry, struct dentry *target)
2157{
2158 /*
2159 * XXXX: do we really need to take target->d_lock?
2160 */
2161 if (IS_ROOT(dentry) || dentry->d_parent == target->d_parent)
2162 spin_lock(&target->d_parent->d_lock);
2163 else {
2164 if (d_ancestor(dentry->d_parent, target->d_parent)) {
2165 spin_lock(&dentry->d_parent->d_lock);
2166 spin_lock_nested(&target->d_parent->d_lock,
2167 DENTRY_D_LOCK_NESTED);
2168 } else {
2169 spin_lock(&target->d_parent->d_lock);
2170 spin_lock_nested(&dentry->d_parent->d_lock,
2171 DENTRY_D_LOCK_NESTED);
2172 }
2173 }
2174 if (target < dentry) {
2175 spin_lock_nested(&target->d_lock, 2);
2176 spin_lock_nested(&dentry->d_lock, 3);
2177 } else {
2178 spin_lock_nested(&dentry->d_lock, 2);
2179 spin_lock_nested(&target->d_lock, 3);
2180 }
2181}
2182
2183static void dentry_unlock_parents_for_move(struct dentry *dentry,
2184 struct dentry *target)
2185{
2186 if (target->d_parent != dentry->d_parent)
2187 spin_unlock(&dentry->d_parent->d_lock);
2188 if (target->d_parent != target)
2189 spin_unlock(&target->d_parent->d_lock);
2190}
2191
1646/* 2192/*
1647 * We cannibalize "target" when moving dentry on top of it, 2193 * When switching names, the actual string doesn't strictly have to
1648 * because it's going to be thrown away anyway. We could be more 2194 * be preserved in the target - because we're dropping the target
1649 * polite about it, though. 2195 * anyway. As such, we can just do a simple memcpy() to copy over
1650 * 2196 * the new name before we switch.
1651 * This forceful removal will result in ugly /proc output if 2197 *
1652 * somebody holds a file open that got deleted due to a rename. 2198 * Note that we have to be a lot more careful about getting the hash
1653 * We could be nicer about the deleted file, and let it show 2199 * switched - we have to switch the hash value properly even if it
1654 * up under the name it had before it was deleted rather than 2200 * then no longer matches the actual (corrupted) string of the target.
1655 * under the original name of the file that was moved on top of it. 2201 * The hash value has to match the hash queue that the dentry is on..
1656 */ 2202 */
1657
1658/* 2203/*
1659 * d_move_locked - move a dentry 2204 * d_move - move a dentry
1660 * @dentry: entry to move 2205 * @dentry: entry to move
1661 * @target: new dentry 2206 * @target: new dentry
1662 * 2207 *
1663 * Update the dcache to reflect the move of a file name. Negative 2208 * Update the dcache to reflect the move of a file name. Negative
1664 * dcache entries should not be moved in this way. 2209 * dcache entries should not be moved in this way.
1665 */ 2210 */
1666static void d_move_locked(struct dentry * dentry, struct dentry * target) 2211void d_move(struct dentry * dentry, struct dentry * target)
1667{ 2212{
1668 struct hlist_head *list;
1669
1670 if (!dentry->d_inode) 2213 if (!dentry->d_inode)
1671 printk(KERN_WARNING "VFS: moving negative dcache entry\n"); 2214 printk(KERN_WARNING "VFS: moving negative dcache entry\n");
1672 2215
2216 BUG_ON(d_ancestor(dentry, target));
2217 BUG_ON(d_ancestor(target, dentry));
2218
1673 write_seqlock(&rename_lock); 2219 write_seqlock(&rename_lock);
1674 /*
1675 * XXXX: do we really need to take target->d_lock?
1676 */
1677 if (target < dentry) {
1678 spin_lock(&target->d_lock);
1679 spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
1680 } else {
1681 spin_lock(&dentry->d_lock);
1682 spin_lock_nested(&target->d_lock, DENTRY_D_LOCK_NESTED);
1683 }
1684 2220
1685 /* Move the dentry to the target hash queue, if on different bucket */ 2221 dentry_lock_for_move(dentry, target);
1686 if (d_unhashed(dentry)) 2222
1687 goto already_unhashed; 2223 write_seqcount_begin(&dentry->d_seq);
2224 write_seqcount_begin(&target->d_seq);
1688 2225
1689 hlist_del_rcu(&dentry->d_hash); 2226 /* __d_drop does write_seqcount_barrier, but they're OK to nest. */
1690 2227
1691already_unhashed: 2228 /*
1692 list = d_hash(target->d_parent, target->d_name.hash); 2229 * Move the dentry to the target hash queue. Don't bother checking
1693 __d_rehash(dentry, list); 2230 * for the same hash queue because of how unlikely it is.
2231 */
2232 __d_drop(dentry);
2233 __d_rehash(dentry, d_hash(target->d_parent, target->d_name.hash));
1694 2234
1695 /* Unhash the target: dput() will then get rid of it */ 2235 /* Unhash the target: dput() will then get rid of it */
1696 __d_drop(target); 2236 __d_drop(target);
@@ -1715,27 +2255,16 @@ already_unhashed:
1715 } 2255 }
1716 2256
1717 list_add(&dentry->d_u.d_child, &dentry->d_parent->d_subdirs); 2257 list_add(&dentry->d_u.d_child, &dentry->d_parent->d_subdirs);
2258
2259 write_seqcount_end(&target->d_seq);
2260 write_seqcount_end(&dentry->d_seq);
2261
2262 dentry_unlock_parents_for_move(dentry, target);
1718 spin_unlock(&target->d_lock); 2263 spin_unlock(&target->d_lock);
1719 fsnotify_d_move(dentry); 2264 fsnotify_d_move(dentry);
1720 spin_unlock(&dentry->d_lock); 2265 spin_unlock(&dentry->d_lock);
1721 write_sequnlock(&rename_lock); 2266 write_sequnlock(&rename_lock);
1722} 2267}
1723
1724/**
1725 * d_move - move a dentry
1726 * @dentry: entry to move
1727 * @target: new dentry
1728 *
1729 * Update the dcache to reflect the move of a file name. Negative
1730 * dcache entries should not be moved in this way.
1731 */
1732
1733void d_move(struct dentry * dentry, struct dentry * target)
1734{
1735 spin_lock(&dcache_lock);
1736 d_move_locked(dentry, target);
1737 spin_unlock(&dcache_lock);
1738}
1739EXPORT_SYMBOL(d_move); 2268EXPORT_SYMBOL(d_move);
1740 2269
1741/** 2270/**
@@ -1761,13 +2290,13 @@ struct dentry *d_ancestor(struct dentry *p1, struct dentry *p2)
1761 * This helper attempts to cope with remotely renamed directories 2290 * This helper attempts to cope with remotely renamed directories
1762 * 2291 *
1763 * It assumes that the caller is already holding 2292 * It assumes that the caller is already holding
1764 * dentry->d_parent->d_inode->i_mutex and the dcache_lock 2293 * dentry->d_parent->d_inode->i_mutex and the inode->i_lock
1765 * 2294 *
1766 * Note: If ever the locking in lock_rename() changes, then please 2295 * Note: If ever the locking in lock_rename() changes, then please
1767 * remember to update this too... 2296 * remember to update this too...
1768 */ 2297 */
1769static struct dentry *__d_unalias(struct dentry *dentry, struct dentry *alias) 2298static struct dentry *__d_unalias(struct inode *inode,
1770 __releases(dcache_lock) 2299 struct dentry *dentry, struct dentry *alias)
1771{ 2300{
1772 struct mutex *m1 = NULL, *m2 = NULL; 2301 struct mutex *m1 = NULL, *m2 = NULL;
1773 struct dentry *ret; 2302 struct dentry *ret;
@@ -1790,10 +2319,10 @@ static struct dentry *__d_unalias(struct dentry *dentry, struct dentry *alias)
1790 goto out_err; 2319 goto out_err;
1791 m2 = &alias->d_parent->d_inode->i_mutex; 2320 m2 = &alias->d_parent->d_inode->i_mutex;
1792out_unalias: 2321out_unalias:
1793 d_move_locked(alias, dentry); 2322 d_move(alias, dentry);
1794 ret = alias; 2323 ret = alias;
1795out_err: 2324out_err:
1796 spin_unlock(&dcache_lock); 2325 spin_unlock(&inode->i_lock);
1797 if (m2) 2326 if (m2)
1798 mutex_unlock(m2); 2327 mutex_unlock(m2);
1799 if (m1) 2328 if (m1)
@@ -1804,17 +2333,23 @@ out_err:
1804/* 2333/*
1805 * Prepare an anonymous dentry for life in the superblock's dentry tree as a 2334 * Prepare an anonymous dentry for life in the superblock's dentry tree as a
1806 * named dentry in place of the dentry to be replaced. 2335 * named dentry in place of the dentry to be replaced.
2336 * returns with anon->d_lock held!
1807 */ 2337 */
1808static void __d_materialise_dentry(struct dentry *dentry, struct dentry *anon) 2338static void __d_materialise_dentry(struct dentry *dentry, struct dentry *anon)
1809{ 2339{
1810 struct dentry *dparent, *aparent; 2340 struct dentry *dparent, *aparent;
1811 2341
1812 switch_names(dentry, anon); 2342 dentry_lock_for_move(anon, dentry);
1813 swap(dentry->d_name.hash, anon->d_name.hash); 2343
2344 write_seqcount_begin(&dentry->d_seq);
2345 write_seqcount_begin(&anon->d_seq);
1814 2346
1815 dparent = dentry->d_parent; 2347 dparent = dentry->d_parent;
1816 aparent = anon->d_parent; 2348 aparent = anon->d_parent;
1817 2349
2350 switch_names(dentry, anon);
2351 swap(dentry->d_name.hash, anon->d_name.hash);
2352
1818 dentry->d_parent = (aparent == anon) ? dentry : aparent; 2353 dentry->d_parent = (aparent == anon) ? dentry : aparent;
1819 list_del(&dentry->d_u.d_child); 2354 list_del(&dentry->d_u.d_child);
1820 if (!IS_ROOT(dentry)) 2355 if (!IS_ROOT(dentry))
@@ -1829,6 +2364,13 @@ static void __d_materialise_dentry(struct dentry *dentry, struct dentry *anon)
1829 else 2364 else
1830 INIT_LIST_HEAD(&anon->d_u.d_child); 2365 INIT_LIST_HEAD(&anon->d_u.d_child);
1831 2366
2367 write_seqcount_end(&dentry->d_seq);
2368 write_seqcount_end(&anon->d_seq);
2369
2370 dentry_unlock_parents_for_move(anon, dentry);
2371 spin_unlock(&dentry->d_lock);
2372
2373 /* anon->d_lock still locked, returns locked */
1832 anon->d_flags &= ~DCACHE_DISCONNECTED; 2374 anon->d_flags &= ~DCACHE_DISCONNECTED;
1833} 2375}
1834 2376
@@ -1846,14 +2388,15 @@ struct dentry *d_materialise_unique(struct dentry *dentry, struct inode *inode)
1846 2388
1847 BUG_ON(!d_unhashed(dentry)); 2389 BUG_ON(!d_unhashed(dentry));
1848 2390
1849 spin_lock(&dcache_lock);
1850
1851 if (!inode) { 2391 if (!inode) {
1852 actual = dentry; 2392 actual = dentry;
1853 __d_instantiate(dentry, NULL); 2393 __d_instantiate(dentry, NULL);
1854 goto found_lock; 2394 d_rehash(actual);
2395 goto out_nolock;
1855 } 2396 }
1856 2397
2398 spin_lock(&inode->i_lock);
2399
1857 if (S_ISDIR(inode->i_mode)) { 2400 if (S_ISDIR(inode->i_mode)) {
1858 struct dentry *alias; 2401 struct dentry *alias;
1859 2402
@@ -1864,13 +2407,12 @@ struct dentry *d_materialise_unique(struct dentry *dentry, struct inode *inode)
1864 /* Is this an anonymous mountpoint that we could splice 2407 /* Is this an anonymous mountpoint that we could splice
1865 * into our tree? */ 2408 * into our tree? */
1866 if (IS_ROOT(alias)) { 2409 if (IS_ROOT(alias)) {
1867 spin_lock(&alias->d_lock);
1868 __d_materialise_dentry(dentry, alias); 2410 __d_materialise_dentry(dentry, alias);
1869 __d_drop(alias); 2411 __d_drop(alias);
1870 goto found; 2412 goto found;
1871 } 2413 }
1872 /* Nope, but we must(!) avoid directory aliasing */ 2414 /* Nope, but we must(!) avoid directory aliasing */
1873 actual = __d_unalias(dentry, alias); 2415 actual = __d_unalias(inode, dentry, alias);
1874 if (IS_ERR(actual)) 2416 if (IS_ERR(actual))
1875 dput(alias); 2417 dput(alias);
1876 goto out_nolock; 2418 goto out_nolock;
@@ -1881,15 +2423,14 @@ struct dentry *d_materialise_unique(struct dentry *dentry, struct inode *inode)
1881 actual = __d_instantiate_unique(dentry, inode); 2423 actual = __d_instantiate_unique(dentry, inode);
1882 if (!actual) 2424 if (!actual)
1883 actual = dentry; 2425 actual = dentry;
1884 else if (unlikely(!d_unhashed(actual))) 2426 else
1885 goto shouldnt_be_hashed; 2427 BUG_ON(!d_unhashed(actual));
1886 2428
1887found_lock:
1888 spin_lock(&actual->d_lock); 2429 spin_lock(&actual->d_lock);
1889found: 2430found:
1890 _d_rehash(actual); 2431 _d_rehash(actual);
1891 spin_unlock(&actual->d_lock); 2432 spin_unlock(&actual->d_lock);
1892 spin_unlock(&dcache_lock); 2433 spin_unlock(&inode->i_lock);
1893out_nolock: 2434out_nolock:
1894 if (actual == dentry) { 2435 if (actual == dentry) {
1895 security_d_instantiate(dentry, inode); 2436 security_d_instantiate(dentry, inode);
@@ -1898,10 +2439,6 @@ out_nolock:
1898 2439
1899 iput(inode); 2440 iput(inode);
1900 return actual; 2441 return actual;
1901
1902shouldnt_be_hashed:
1903 spin_unlock(&dcache_lock);
1904 BUG();
1905} 2442}
1906EXPORT_SYMBOL_GPL(d_materialise_unique); 2443EXPORT_SYMBOL_GPL(d_materialise_unique);
1907 2444
@@ -1921,14 +2458,13 @@ static int prepend_name(char **buffer, int *buflen, struct qstr *name)
1921} 2458}
1922 2459
1923/** 2460/**
1924 * Prepend path string to a buffer 2461 * prepend_path - Prepend path string to a buffer
1925 *
1926 * @path: the dentry/vfsmount to report 2462 * @path: the dentry/vfsmount to report
1927 * @root: root vfsmnt/dentry (may be modified by this function) 2463 * @root: root vfsmnt/dentry (may be modified by this function)
1928 * @buffer: pointer to the end of the buffer 2464 * @buffer: pointer to the end of the buffer
1929 * @buflen: pointer to buffer length 2465 * @buflen: pointer to buffer length
1930 * 2466 *
1931 * Caller holds the dcache_lock. 2467 * Caller holds the rename_lock.
1932 * 2468 *
1933 * If path is not reachable from the supplied root, then the value of 2469 * If path is not reachable from the supplied root, then the value of
1934 * root is changed (without modifying refcounts). 2470 * root is changed (without modifying refcounts).
@@ -1956,7 +2492,9 @@ static int prepend_path(const struct path *path, struct path *root,
1956 } 2492 }
1957 parent = dentry->d_parent; 2493 parent = dentry->d_parent;
1958 prefetch(parent); 2494 prefetch(parent);
2495 spin_lock(&dentry->d_lock);
1959 error = prepend_name(buffer, buflen, &dentry->d_name); 2496 error = prepend_name(buffer, buflen, &dentry->d_name);
2497 spin_unlock(&dentry->d_lock);
1960 if (!error) 2498 if (!error)
1961 error = prepend(buffer, buflen, "/", 1); 2499 error = prepend(buffer, buflen, "/", 1);
1962 if (error) 2500 if (error)
@@ -2012,9 +2550,9 @@ char *__d_path(const struct path *path, struct path *root,
2012 int error; 2550 int error;
2013 2551
2014 prepend(&res, &buflen, "\0", 1); 2552 prepend(&res, &buflen, "\0", 1);
2015 spin_lock(&dcache_lock); 2553 write_seqlock(&rename_lock);
2016 error = prepend_path(path, root, &res, &buflen); 2554 error = prepend_path(path, root, &res, &buflen);
2017 spin_unlock(&dcache_lock); 2555 write_sequnlock(&rename_lock);
2018 2556
2019 if (error) 2557 if (error)
2020 return ERR_PTR(error); 2558 return ERR_PTR(error);
@@ -2076,12 +2614,12 @@ char *d_path(const struct path *path, char *buf, int buflen)
2076 return path->dentry->d_op->d_dname(path->dentry, buf, buflen); 2614 return path->dentry->d_op->d_dname(path->dentry, buf, buflen);
2077 2615
2078 get_fs_root(current->fs, &root); 2616 get_fs_root(current->fs, &root);
2079 spin_lock(&dcache_lock); 2617 write_seqlock(&rename_lock);
2080 tmp = root; 2618 tmp = root;
2081 error = path_with_deleted(path, &tmp, &res, &buflen); 2619 error = path_with_deleted(path, &tmp, &res, &buflen);
2082 if (error) 2620 if (error)
2083 res = ERR_PTR(error); 2621 res = ERR_PTR(error);
2084 spin_unlock(&dcache_lock); 2622 write_sequnlock(&rename_lock);
2085 path_put(&root); 2623 path_put(&root);
2086 return res; 2624 return res;
2087} 2625}
@@ -2107,12 +2645,12 @@ char *d_path_with_unreachable(const struct path *path, char *buf, int buflen)
2107 return path->dentry->d_op->d_dname(path->dentry, buf, buflen); 2645 return path->dentry->d_op->d_dname(path->dentry, buf, buflen);
2108 2646
2109 get_fs_root(current->fs, &root); 2647 get_fs_root(current->fs, &root);
2110 spin_lock(&dcache_lock); 2648 write_seqlock(&rename_lock);
2111 tmp = root; 2649 tmp = root;
2112 error = path_with_deleted(path, &tmp, &res, &buflen); 2650 error = path_with_deleted(path, &tmp, &res, &buflen);
2113 if (!error && !path_equal(&tmp, &root)) 2651 if (!error && !path_equal(&tmp, &root))
2114 error = prepend_unreachable(&res, &buflen); 2652 error = prepend_unreachable(&res, &buflen);
2115 spin_unlock(&dcache_lock); 2653 write_sequnlock(&rename_lock);
2116 path_put(&root); 2654 path_put(&root);
2117 if (error) 2655 if (error)
2118 res = ERR_PTR(error); 2656 res = ERR_PTR(error);
@@ -2144,7 +2682,7 @@ char *dynamic_dname(struct dentry *dentry, char *buffer, int buflen,
2144/* 2682/*
2145 * Write full pathname from the root of the filesystem into the buffer. 2683 * Write full pathname from the root of the filesystem into the buffer.
2146 */ 2684 */
2147char *__dentry_path(struct dentry *dentry, char *buf, int buflen) 2685static char *__dentry_path(struct dentry *dentry, char *buf, int buflen)
2148{ 2686{
2149 char *end = buf + buflen; 2687 char *end = buf + buflen;
2150 char *retval; 2688 char *retval;
@@ -2158,10 +2696,13 @@ char *__dentry_path(struct dentry *dentry, char *buf, int buflen)
2158 2696
2159 while (!IS_ROOT(dentry)) { 2697 while (!IS_ROOT(dentry)) {
2160 struct dentry *parent = dentry->d_parent; 2698 struct dentry *parent = dentry->d_parent;
2699 int error;
2161 2700
2162 prefetch(parent); 2701 prefetch(parent);
2163 if ((prepend_name(&end, &buflen, &dentry->d_name) != 0) || 2702 spin_lock(&dentry->d_lock);
2164 (prepend(&end, &buflen, "/", 1) != 0)) 2703 error = prepend_name(&end, &buflen, &dentry->d_name);
2704 spin_unlock(&dentry->d_lock);
2705 if (error != 0 || prepend(&end, &buflen, "/", 1) != 0)
2165 goto Elong; 2706 goto Elong;
2166 2707
2167 retval = end; 2708 retval = end;
@@ -2171,14 +2712,25 @@ char *__dentry_path(struct dentry *dentry, char *buf, int buflen)
2171Elong: 2712Elong:
2172 return ERR_PTR(-ENAMETOOLONG); 2713 return ERR_PTR(-ENAMETOOLONG);
2173} 2714}
2174EXPORT_SYMBOL(__dentry_path); 2715
2716char *dentry_path_raw(struct dentry *dentry, char *buf, int buflen)
2717{
2718 char *retval;
2719
2720 write_seqlock(&rename_lock);
2721 retval = __dentry_path(dentry, buf, buflen);
2722 write_sequnlock(&rename_lock);
2723
2724 return retval;
2725}
2726EXPORT_SYMBOL(dentry_path_raw);
2175 2727
2176char *dentry_path(struct dentry *dentry, char *buf, int buflen) 2728char *dentry_path(struct dentry *dentry, char *buf, int buflen)
2177{ 2729{
2178 char *p = NULL; 2730 char *p = NULL;
2179 char *retval; 2731 char *retval;
2180 2732
2181 spin_lock(&dcache_lock); 2733 write_seqlock(&rename_lock);
2182 if (d_unlinked(dentry)) { 2734 if (d_unlinked(dentry)) {
2183 p = buf + buflen; 2735 p = buf + buflen;
2184 if (prepend(&p, &buflen, "//deleted", 10) != 0) 2736 if (prepend(&p, &buflen, "//deleted", 10) != 0)
@@ -2186,12 +2738,11 @@ char *dentry_path(struct dentry *dentry, char *buf, int buflen)
2186 buflen++; 2738 buflen++;
2187 } 2739 }
2188 retval = __dentry_path(dentry, buf, buflen); 2740 retval = __dentry_path(dentry, buf, buflen);
2189 spin_unlock(&dcache_lock); 2741 write_sequnlock(&rename_lock);
2190 if (!IS_ERR(retval) && p) 2742 if (!IS_ERR(retval) && p)
2191 *p = '/'; /* restore '/' overriden with '\0' */ 2743 *p = '/'; /* restore '/' overriden with '\0' */
2192 return retval; 2744 return retval;
2193Elong: 2745Elong:
2194 spin_unlock(&dcache_lock);
2195 return ERR_PTR(-ENAMETOOLONG); 2746 return ERR_PTR(-ENAMETOOLONG);
2196} 2747}
2197 2748
@@ -2225,7 +2776,7 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size)
2225 get_fs_root_and_pwd(current->fs, &root, &pwd); 2776 get_fs_root_and_pwd(current->fs, &root, &pwd);
2226 2777
2227 error = -ENOENT; 2778 error = -ENOENT;
2228 spin_lock(&dcache_lock); 2779 write_seqlock(&rename_lock);
2229 if (!d_unlinked(pwd.dentry)) { 2780 if (!d_unlinked(pwd.dentry)) {
2230 unsigned long len; 2781 unsigned long len;
2231 struct path tmp = root; 2782 struct path tmp = root;
@@ -2234,7 +2785,7 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size)
2234 2785
2235 prepend(&cwd, &buflen, "\0", 1); 2786 prepend(&cwd, &buflen, "\0", 1);
2236 error = prepend_path(&pwd, &tmp, &cwd, &buflen); 2787 error = prepend_path(&pwd, &tmp, &cwd, &buflen);
2237 spin_unlock(&dcache_lock); 2788 write_sequnlock(&rename_lock);
2238 2789
2239 if (error) 2790 if (error)
2240 goto out; 2791 goto out;
@@ -2253,8 +2804,9 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size)
2253 if (copy_to_user(buf, cwd, len)) 2804 if (copy_to_user(buf, cwd, len))
2254 error = -EFAULT; 2805 error = -EFAULT;
2255 } 2806 }
2256 } else 2807 } else {
2257 spin_unlock(&dcache_lock); 2808 write_sequnlock(&rename_lock);
2809 }
2258 2810
2259out: 2811out:
2260 path_put(&pwd); 2812 path_put(&pwd);
@@ -2282,25 +2834,25 @@ out:
2282int is_subdir(struct dentry *new_dentry, struct dentry *old_dentry) 2834int is_subdir(struct dentry *new_dentry, struct dentry *old_dentry)
2283{ 2835{
2284 int result; 2836 int result;
2285 unsigned long seq; 2837 unsigned seq;
2286 2838
2287 if (new_dentry == old_dentry) 2839 if (new_dentry == old_dentry)
2288 return 1; 2840 return 1;
2289 2841
2290 /*
2291 * Need rcu_readlock to protect against the d_parent trashing
2292 * due to d_move
2293 */
2294 rcu_read_lock();
2295 do { 2842 do {
2296 /* for restarting inner loop in case of seq retry */ 2843 /* for restarting inner loop in case of seq retry */
2297 seq = read_seqbegin(&rename_lock); 2844 seq = read_seqbegin(&rename_lock);
2845 /*
2846 * Need rcu_readlock to protect against the d_parent trashing
2847 * due to d_move
2848 */
2849 rcu_read_lock();
2298 if (d_ancestor(old_dentry, new_dentry)) 2850 if (d_ancestor(old_dentry, new_dentry))
2299 result = 1; 2851 result = 1;
2300 else 2852 else
2301 result = 0; 2853 result = 0;
2854 rcu_read_unlock();
2302 } while (read_seqretry(&rename_lock, seq)); 2855 } while (read_seqretry(&rename_lock, seq));
2303 rcu_read_unlock();
2304 2856
2305 return result; 2857 return result;
2306} 2858}
@@ -2332,10 +2884,15 @@ EXPORT_SYMBOL(path_is_under);
2332 2884
2333void d_genocide(struct dentry *root) 2885void d_genocide(struct dentry *root)
2334{ 2886{
2335 struct dentry *this_parent = root; 2887 struct dentry *this_parent;
2336 struct list_head *next; 2888 struct list_head *next;
2889 unsigned seq;
2890 int locked = 0;
2337 2891
2338 spin_lock(&dcache_lock); 2892 seq = read_seqbegin(&rename_lock);
2893again:
2894 this_parent = root;
2895 spin_lock(&this_parent->d_lock);
2339repeat: 2896repeat:
2340 next = this_parent->d_subdirs.next; 2897 next = this_parent->d_subdirs.next;
2341resume: 2898resume:
@@ -2343,21 +2900,62 @@ resume:
2343 struct list_head *tmp = next; 2900 struct list_head *tmp = next;
2344 struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child); 2901 struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child);
2345 next = tmp->next; 2902 next = tmp->next;
2346 if (d_unhashed(dentry)||!dentry->d_inode) 2903
2904 spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
2905 if (d_unhashed(dentry) || !dentry->d_inode) {
2906 spin_unlock(&dentry->d_lock);
2347 continue; 2907 continue;
2908 }
2348 if (!list_empty(&dentry->d_subdirs)) { 2909 if (!list_empty(&dentry->d_subdirs)) {
2910 spin_unlock(&this_parent->d_lock);
2911 spin_release(&dentry->d_lock.dep_map, 1, _RET_IP_);
2349 this_parent = dentry; 2912 this_parent = dentry;
2913 spin_acquire(&this_parent->d_lock.dep_map, 0, 1, _RET_IP_);
2350 goto repeat; 2914 goto repeat;
2351 } 2915 }
2352 atomic_dec(&dentry->d_count); 2916 if (!(dentry->d_flags & DCACHE_GENOCIDE)) {
2917 dentry->d_flags |= DCACHE_GENOCIDE;
2918 dentry->d_count--;
2919 }
2920 spin_unlock(&dentry->d_lock);
2353 } 2921 }
2354 if (this_parent != root) { 2922 if (this_parent != root) {
2355 next = this_parent->d_u.d_child.next; 2923 struct dentry *tmp;
2356 atomic_dec(&this_parent->d_count); 2924 struct dentry *child;
2357 this_parent = this_parent->d_parent; 2925
2926 tmp = this_parent->d_parent;
2927 if (!(this_parent->d_flags & DCACHE_GENOCIDE)) {
2928 this_parent->d_flags |= DCACHE_GENOCIDE;
2929 this_parent->d_count--;
2930 }
2931 rcu_read_lock();
2932 spin_unlock(&this_parent->d_lock);
2933 child = this_parent;
2934 this_parent = tmp;
2935 spin_lock(&this_parent->d_lock);
2936 /* might go back up the wrong parent if we have had a rename
2937 * or deletion */
2938 if (this_parent != child->d_parent ||
2939 (!locked && read_seqretry(&rename_lock, seq))) {
2940 spin_unlock(&this_parent->d_lock);
2941 rcu_read_unlock();
2942 goto rename_retry;
2943 }
2944 rcu_read_unlock();
2945 next = child->d_u.d_child.next;
2358 goto resume; 2946 goto resume;
2359 } 2947 }
2360 spin_unlock(&dcache_lock); 2948 spin_unlock(&this_parent->d_lock);
2949 if (!locked && read_seqretry(&rename_lock, seq))
2950 goto rename_retry;
2951 if (locked)
2952 write_sequnlock(&rename_lock);
2953 return;
2954
2955rename_retry:
2956 locked = 1;
2957 write_seqlock(&rename_lock);
2958 goto again;
2361} 2959}
2362 2960
2363/** 2961/**
@@ -2411,7 +3009,7 @@ static void __init dcache_init_early(void)
2411 3009
2412 dentry_hashtable = 3010 dentry_hashtable =
2413 alloc_large_system_hash("Dentry cache", 3011 alloc_large_system_hash("Dentry cache",
2414 sizeof(struct hlist_head), 3012 sizeof(struct dcache_hash_bucket),
2415 dhash_entries, 3013 dhash_entries,
2416 13, 3014 13,
2417 HASH_EARLY, 3015 HASH_EARLY,
@@ -2420,16 +3018,13 @@ static void __init dcache_init_early(void)
2420 0); 3018 0);
2421 3019
2422 for (loop = 0; loop < (1 << d_hash_shift); loop++) 3020 for (loop = 0; loop < (1 << d_hash_shift); loop++)
2423 INIT_HLIST_HEAD(&dentry_hashtable[loop]); 3021 INIT_HLIST_BL_HEAD(&dentry_hashtable[loop].head);
2424} 3022}
2425 3023
2426static void __init dcache_init(void) 3024static void __init dcache_init(void)
2427{ 3025{
2428 int loop; 3026 int loop;
2429 3027
2430 percpu_counter_init(&nr_dentry, 0);
2431 percpu_counter_init(&nr_dentry_unused, 0);
2432
2433 /* 3028 /*
2434 * A constructor could be added for stable state like the lists, 3029 * A constructor could be added for stable state like the lists,
2435 * but it is probably not worth it because of the cache nature 3030 * but it is probably not worth it because of the cache nature
@@ -2446,7 +3041,7 @@ static void __init dcache_init(void)
2446 3041
2447 dentry_hashtable = 3042 dentry_hashtable =
2448 alloc_large_system_hash("Dentry cache", 3043 alloc_large_system_hash("Dentry cache",
2449 sizeof(struct hlist_head), 3044 sizeof(struct dcache_hash_bucket),
2450 dhash_entries, 3045 dhash_entries,
2451 13, 3046 13,
2452 0, 3047 0,
@@ -2455,7 +3050,7 @@ static void __init dcache_init(void)
2455 0); 3050 0);
2456 3051
2457 for (loop = 0; loop < (1 << d_hash_shift); loop++) 3052 for (loop = 0; loop < (1 << d_hash_shift); loop++)
2458 INIT_HLIST_HEAD(&dentry_hashtable[loop]); 3053 INIT_HLIST_BL_HEAD(&dentry_hashtable[loop].head);
2459} 3054}
2460 3055
2461/* SLAB cache for __getname() consumers */ 3056/* SLAB cache for __getname() consumers */
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 85882f6ba5f..b044705eedd 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -325,12 +325,16 @@ void dio_end_io(struct bio *bio, int error)
325} 325}
326EXPORT_SYMBOL_GPL(dio_end_io); 326EXPORT_SYMBOL_GPL(dio_end_io);
327 327
328static int 328static void
329dio_bio_alloc(struct dio *dio, struct block_device *bdev, 329dio_bio_alloc(struct dio *dio, struct block_device *bdev,
330 sector_t first_sector, int nr_vecs) 330 sector_t first_sector, int nr_vecs)
331{ 331{
332 struct bio *bio; 332 struct bio *bio;
333 333
334 /*
335 * bio_alloc() is guaranteed to return a bio when called with
336 * __GFP_WAIT and we request a valid number of vectors.
337 */
334 bio = bio_alloc(GFP_KERNEL, nr_vecs); 338 bio = bio_alloc(GFP_KERNEL, nr_vecs);
335 339
336 bio->bi_bdev = bdev; 340 bio->bi_bdev = bdev;
@@ -342,7 +346,6 @@ dio_bio_alloc(struct dio *dio, struct block_device *bdev,
342 346
343 dio->bio = bio; 347 dio->bio = bio;
344 dio->logical_offset_in_bio = dio->cur_page_fs_offset; 348 dio->logical_offset_in_bio = dio->cur_page_fs_offset;
345 return 0;
346} 349}
347 350
348/* 351/*
@@ -583,8 +586,9 @@ static int dio_new_bio(struct dio *dio, sector_t start_sector)
583 goto out; 586 goto out;
584 sector = start_sector << (dio->blkbits - 9); 587 sector = start_sector << (dio->blkbits - 9);
585 nr_pages = min(dio->pages_in_io, bio_get_nr_vecs(dio->map_bh.b_bdev)); 588 nr_pages = min(dio->pages_in_io, bio_get_nr_vecs(dio->map_bh.b_bdev));
589 nr_pages = min(nr_pages, BIO_MAX_PAGES);
586 BUG_ON(nr_pages <= 0); 590 BUG_ON(nr_pages <= 0);
587 ret = dio_bio_alloc(dio, dio->map_bh.b_bdev, sector, nr_pages); 591 dio_bio_alloc(dio, dio->map_bh.b_bdev, sector, nr_pages);
588 dio->boundary = 0; 592 dio->boundary = 0;
589out: 593out:
590 return ret; 594 return ret;
diff --git a/fs/dlm/Kconfig b/fs/dlm/Kconfig
index 2dbb422e811..1897eb1b4b6 100644
--- a/fs/dlm/Kconfig
+++ b/fs/dlm/Kconfig
@@ -1,8 +1,7 @@
1menuconfig DLM 1menuconfig DLM
2 tristate "Distributed Lock Manager (DLM)" 2 tristate "Distributed Lock Manager (DLM)"
3 depends on EXPERIMENTAL && INET 3 depends on EXPERIMENTAL && INET
4 depends on SYSFS && (IPV6 || IPV6=n) 4 depends on SYSFS && CONFIGFS_FS && (IPV6 || IPV6=n)
5 select CONFIGFS_FS
6 select IP_SCTP 5 select IP_SCTP
7 help 6 help
8 A general purpose distributed lock manager for kernel or userspace 7 A general purpose distributed lock manager for kernel or userspace
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 37a34c2c622..9c64ae9e4c1 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -63,6 +63,9 @@
63#define NEEDED_RMEM (4*1024*1024) 63#define NEEDED_RMEM (4*1024*1024)
64#define CONN_HASH_SIZE 32 64#define CONN_HASH_SIZE 32
65 65
66/* Number of messages to send before rescheduling */
67#define MAX_SEND_MSG_COUNT 25
68
66struct cbuf { 69struct cbuf {
67 unsigned int base; 70 unsigned int base;
68 unsigned int len; 71 unsigned int len;
@@ -108,6 +111,7 @@ struct connection {
108#define CF_INIT_PENDING 4 111#define CF_INIT_PENDING 4
109#define CF_IS_OTHERCON 5 112#define CF_IS_OTHERCON 5
110#define CF_CLOSE 6 113#define CF_CLOSE 6
114#define CF_APP_LIMITED 7
111 struct list_head writequeue; /* List of outgoing writequeue_entries */ 115 struct list_head writequeue; /* List of outgoing writequeue_entries */
112 spinlock_t writequeue_lock; 116 spinlock_t writequeue_lock;
113 int (*rx_action) (struct connection *); /* What to do when active */ 117 int (*rx_action) (struct connection *); /* What to do when active */
@@ -295,7 +299,17 @@ static void lowcomms_write_space(struct sock *sk)
295{ 299{
296 struct connection *con = sock2con(sk); 300 struct connection *con = sock2con(sk);
297 301
298 if (con && !test_and_set_bit(CF_WRITE_PENDING, &con->flags)) 302 if (!con)
303 return;
304
305 clear_bit(SOCK_NOSPACE, &con->sock->flags);
306
307 if (test_and_clear_bit(CF_APP_LIMITED, &con->flags)) {
308 con->sock->sk->sk_write_pending--;
309 clear_bit(SOCK_ASYNC_NOSPACE, &con->sock->flags);
310 }
311
312 if (!test_and_set_bit(CF_WRITE_PENDING, &con->flags))
299 queue_work(send_workqueue, &con->swork); 313 queue_work(send_workqueue, &con->swork);
300} 314}
301 315
@@ -915,6 +929,7 @@ static void tcp_connect_to_sock(struct connection *con)
915 struct sockaddr_storage saddr, src_addr; 929 struct sockaddr_storage saddr, src_addr;
916 int addr_len; 930 int addr_len;
917 struct socket *sock = NULL; 931 struct socket *sock = NULL;
932 int one = 1;
918 933
919 if (con->nodeid == 0) { 934 if (con->nodeid == 0) {
920 log_print("attempt to connect sock 0 foiled"); 935 log_print("attempt to connect sock 0 foiled");
@@ -960,6 +975,11 @@ static void tcp_connect_to_sock(struct connection *con)
960 make_sockaddr(&saddr, dlm_config.ci_tcp_port, &addr_len); 975 make_sockaddr(&saddr, dlm_config.ci_tcp_port, &addr_len);
961 976
962 log_print("connecting to %d", con->nodeid); 977 log_print("connecting to %d", con->nodeid);
978
979 /* Turn off Nagle's algorithm */
980 kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY, (char *)&one,
981 sizeof(one));
982
963 result = 983 result =
964 sock->ops->connect(sock, (struct sockaddr *)&saddr, addr_len, 984 sock->ops->connect(sock, (struct sockaddr *)&saddr, addr_len,
965 O_NONBLOCK); 985 O_NONBLOCK);
@@ -1011,6 +1031,10 @@ static struct socket *tcp_create_listen_sock(struct connection *con,
1011 goto create_out; 1031 goto create_out;
1012 } 1032 }
1013 1033
1034 /* Turn off Nagle's algorithm */
1035 kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY, (char *)&one,
1036 sizeof(one));
1037
1014 result = kernel_setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, 1038 result = kernel_setsockopt(sock, SOL_SOCKET, SO_REUSEADDR,
1015 (char *)&one, sizeof(one)); 1039 (char *)&one, sizeof(one));
1016 1040
@@ -1297,6 +1321,7 @@ static void send_to_sock(struct connection *con)
1297 const int msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL; 1321 const int msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL;
1298 struct writequeue_entry *e; 1322 struct writequeue_entry *e;
1299 int len, offset; 1323 int len, offset;
1324 int count = 0;
1300 1325
1301 mutex_lock(&con->sock_mutex); 1326 mutex_lock(&con->sock_mutex);
1302 if (con->sock == NULL) 1327 if (con->sock == NULL)
@@ -1319,14 +1344,27 @@ static void send_to_sock(struct connection *con)
1319 ret = kernel_sendpage(con->sock, e->page, offset, len, 1344 ret = kernel_sendpage(con->sock, e->page, offset, len,
1320 msg_flags); 1345 msg_flags);
1321 if (ret == -EAGAIN || ret == 0) { 1346 if (ret == -EAGAIN || ret == 0) {
1347 if (ret == -EAGAIN &&
1348 test_bit(SOCK_ASYNC_NOSPACE, &con->sock->flags) &&
1349 !test_and_set_bit(CF_APP_LIMITED, &con->flags)) {
1350 /* Notify TCP that we're limited by the
1351 * application window size.
1352 */
1353 set_bit(SOCK_NOSPACE, &con->sock->flags);
1354 con->sock->sk->sk_write_pending++;
1355 }
1322 cond_resched(); 1356 cond_resched();
1323 goto out; 1357 goto out;
1324 } 1358 }
1325 if (ret <= 0) 1359 if (ret <= 0)
1326 goto send_error; 1360 goto send_error;
1327 } 1361 }
1328 /* Don't starve people filling buffers */ 1362
1363 /* Don't starve people filling buffers */
1364 if (++count >= MAX_SEND_MSG_COUNT) {
1329 cond_resched(); 1365 cond_resched();
1366 count = 0;
1367 }
1330 1368
1331 spin_lock(&con->writequeue_lock); 1369 spin_lock(&con->writequeue_lock);
1332 e->offset += ret; 1370 e->offset += ret;
@@ -1430,20 +1468,19 @@ static void work_stop(void)
1430 1468
1431static int work_start(void) 1469static int work_start(void)
1432{ 1470{
1433 int error; 1471 recv_workqueue = alloc_workqueue("dlm_recv", WQ_MEM_RECLAIM |
1434 recv_workqueue = create_workqueue("dlm_recv"); 1472 WQ_HIGHPRI | WQ_FREEZEABLE, 0);
1435 error = IS_ERR(recv_workqueue); 1473 if (!recv_workqueue) {
1436 if (error) { 1474 log_print("can't start dlm_recv");
1437 log_print("can't start dlm_recv %d", error); 1475 return -ENOMEM;
1438 return error;
1439 } 1476 }
1440 1477
1441 send_workqueue = create_singlethread_workqueue("dlm_send"); 1478 send_workqueue = alloc_workqueue("dlm_send", WQ_MEM_RECLAIM |
1442 error = IS_ERR(send_workqueue); 1479 WQ_HIGHPRI | WQ_FREEZEABLE, 0);
1443 if (error) { 1480 if (!send_workqueue) {
1444 log_print("can't start dlm_send %d", error); 1481 log_print("can't start dlm_send");
1445 destroy_workqueue(recv_workqueue); 1482 destroy_workqueue(recv_workqueue);
1446 return error; 1483 return -ENOMEM;
1447 } 1484 }
1448 1485
1449 return 0; 1486 return 0;
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index cbadc1bee6e..bfd8b680e64 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -348,7 +348,7 @@ static int encrypt_scatterlist(struct ecryptfs_crypt_stat *crypt_stat,
348 BUG_ON(!crypt_stat || !crypt_stat->tfm 348 BUG_ON(!crypt_stat || !crypt_stat->tfm
349 || !(crypt_stat->flags & ECRYPTFS_STRUCT_INITIALIZED)); 349 || !(crypt_stat->flags & ECRYPTFS_STRUCT_INITIALIZED));
350 if (unlikely(ecryptfs_verbosity > 0)) { 350 if (unlikely(ecryptfs_verbosity > 0)) {
351 ecryptfs_printk(KERN_DEBUG, "Key size [%d]; key:\n", 351 ecryptfs_printk(KERN_DEBUG, "Key size [%zd]; key:\n",
352 crypt_stat->key_size); 352 crypt_stat->key_size);
353 ecryptfs_dump_hex(crypt_stat->key, 353 ecryptfs_dump_hex(crypt_stat->key,
354 crypt_stat->key_size); 354 crypt_stat->key_size);
@@ -413,10 +413,9 @@ static int ecryptfs_encrypt_extent(struct page *enc_extent_page,
413 rc = ecryptfs_derive_iv(extent_iv, crypt_stat, 413 rc = ecryptfs_derive_iv(extent_iv, crypt_stat,
414 (extent_base + extent_offset)); 414 (extent_base + extent_offset));
415 if (rc) { 415 if (rc) {
416 ecryptfs_printk(KERN_ERR, "Error attempting to " 416 ecryptfs_printk(KERN_ERR, "Error attempting to derive IV for "
417 "derive IV for extent [0x%.16x]; " 417 "extent [0x%.16llx]; rc = [%d]\n",
418 "rc = [%d]\n", (extent_base + extent_offset), 418 (unsigned long long)(extent_base + extent_offset), rc);
419 rc);
420 goto out; 419 goto out;
421 } 420 }
422 if (unlikely(ecryptfs_verbosity > 0)) { 421 if (unlikely(ecryptfs_verbosity > 0)) {
@@ -443,9 +442,9 @@ static int ecryptfs_encrypt_extent(struct page *enc_extent_page,
443 } 442 }
444 rc = 0; 443 rc = 0;
445 if (unlikely(ecryptfs_verbosity > 0)) { 444 if (unlikely(ecryptfs_verbosity > 0)) {
446 ecryptfs_printk(KERN_DEBUG, "Encrypt extent [0x%.16x]; " 445 ecryptfs_printk(KERN_DEBUG, "Encrypt extent [0x%.16llx]; "
447 "rc = [%d]\n", (extent_base + extent_offset), 446 "rc = [%d]\n",
448 rc); 447 (unsigned long long)(extent_base + extent_offset), rc);
449 ecryptfs_printk(KERN_DEBUG, "First 8 bytes after " 448 ecryptfs_printk(KERN_DEBUG, "First 8 bytes after "
450 "encryption:\n"); 449 "encryption:\n");
451 ecryptfs_dump_hex((char *)(page_address(enc_extent_page)), 8); 450 ecryptfs_dump_hex((char *)(page_address(enc_extent_page)), 8);
@@ -540,10 +539,9 @@ static int ecryptfs_decrypt_extent(struct page *page,
540 rc = ecryptfs_derive_iv(extent_iv, crypt_stat, 539 rc = ecryptfs_derive_iv(extent_iv, crypt_stat,
541 (extent_base + extent_offset)); 540 (extent_base + extent_offset));
542 if (rc) { 541 if (rc) {
543 ecryptfs_printk(KERN_ERR, "Error attempting to " 542 ecryptfs_printk(KERN_ERR, "Error attempting to derive IV for "
544 "derive IV for extent [0x%.16x]; " 543 "extent [0x%.16llx]; rc = [%d]\n",
545 "rc = [%d]\n", (extent_base + extent_offset), 544 (unsigned long long)(extent_base + extent_offset), rc);
546 rc);
547 goto out; 545 goto out;
548 } 546 }
549 if (unlikely(ecryptfs_verbosity > 0)) { 547 if (unlikely(ecryptfs_verbosity > 0)) {
@@ -571,9 +569,9 @@ static int ecryptfs_decrypt_extent(struct page *page,
571 } 569 }
572 rc = 0; 570 rc = 0;
573 if (unlikely(ecryptfs_verbosity > 0)) { 571 if (unlikely(ecryptfs_verbosity > 0)) {
574 ecryptfs_printk(KERN_DEBUG, "Decrypt extent [0x%.16x]; " 572 ecryptfs_printk(KERN_DEBUG, "Decrypt extent [0x%.16llx]; "
575 "rc = [%d]\n", (extent_base + extent_offset), 573 "rc = [%d]\n",
576 rc); 574 (unsigned long long)(extent_base + extent_offset), rc);
577 ecryptfs_printk(KERN_DEBUG, "First 8 bytes after " 575 ecryptfs_printk(KERN_DEBUG, "First 8 bytes after "
578 "decryption:\n"); 576 "decryption:\n");
579 ecryptfs_dump_hex((char *)(page_address(page) 577 ecryptfs_dump_hex((char *)(page_address(page)
@@ -780,7 +778,7 @@ int ecryptfs_init_crypt_ctx(struct ecryptfs_crypt_stat *crypt_stat)
780 } 778 }
781 ecryptfs_printk(KERN_DEBUG, 779 ecryptfs_printk(KERN_DEBUG,
782 "Initializing cipher [%s]; strlen = [%d]; " 780 "Initializing cipher [%s]; strlen = [%d]; "
783 "key_size_bits = [%d]\n", 781 "key_size_bits = [%zd]\n",
784 crypt_stat->cipher, (int)strlen(crypt_stat->cipher), 782 crypt_stat->cipher, (int)strlen(crypt_stat->cipher),
785 crypt_stat->key_size << 3); 783 crypt_stat->key_size << 3);
786 if (crypt_stat->tfm) { 784 if (crypt_stat->tfm) {
diff --git a/fs/ecryptfs/dentry.c b/fs/ecryptfs/dentry.c
index 906e803f7f7..6fc4f319b55 100644
--- a/fs/ecryptfs/dentry.c
+++ b/fs/ecryptfs/dentry.c
@@ -44,12 +44,17 @@
44 */ 44 */
45static int ecryptfs_d_revalidate(struct dentry *dentry, struct nameidata *nd) 45static int ecryptfs_d_revalidate(struct dentry *dentry, struct nameidata *nd)
46{ 46{
47 struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry); 47 struct dentry *lower_dentry;
48 struct vfsmount *lower_mnt = ecryptfs_dentry_to_lower_mnt(dentry); 48 struct vfsmount *lower_mnt;
49 struct dentry *dentry_save; 49 struct dentry *dentry_save;
50 struct vfsmount *vfsmount_save; 50 struct vfsmount *vfsmount_save;
51 int rc = 1; 51 int rc = 1;
52 52
53 if (nd->flags & LOOKUP_RCU)
54 return -ECHILD;
55
56 lower_dentry = ecryptfs_dentry_to_lower(dentry);
57 lower_mnt = ecryptfs_dentry_to_lower_mnt(dentry);
53 if (!lower_dentry->d_op || !lower_dentry->d_op->d_revalidate) 58 if (!lower_dentry->d_op || !lower_dentry->d_op->d_revalidate)
54 goto out; 59 goto out;
55 dentry_save = nd->path.dentry; 60 dentry_save = nd->path.dentry;
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index 413a3c48f0b..dbc84ed9633 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -192,7 +192,6 @@ ecryptfs_get_key_payload_data(struct key *key)
192 (((struct user_key_payload*)key->payload.data)->data); 192 (((struct user_key_payload*)key->payload.data)->data);
193} 193}
194 194
195#define ECRYPTFS_SUPER_MAGIC 0xf15f
196#define ECRYPTFS_MAX_KEYSET_SIZE 1024 195#define ECRYPTFS_MAX_KEYSET_SIZE 1024
197#define ECRYPTFS_MAX_CIPHER_NAME_SIZE 32 196#define ECRYPTFS_MAX_CIPHER_NAME_SIZE 32
198#define ECRYPTFS_MAX_NUM_ENC_KEYS 64 197#define ECRYPTFS_MAX_NUM_ENC_KEYS 64
@@ -584,6 +583,7 @@ ecryptfs_set_dentry_lower_mnt(struct dentry *dentry, struct vfsmount *lower_mnt)
584 583
585#define ecryptfs_printk(type, fmt, arg...) \ 584#define ecryptfs_printk(type, fmt, arg...) \
586 __ecryptfs_printk(type "%s: " fmt, __func__, ## arg); 585 __ecryptfs_printk(type "%s: " fmt, __func__, ## arg);
586__attribute__ ((format(printf, 1, 2)))
587void __ecryptfs_printk(const char *fmt, ...); 587void __ecryptfs_printk(const char *fmt, ...);
588 588
589extern const struct file_operations ecryptfs_main_fops; 589extern const struct file_operations ecryptfs_main_fops;
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index 91da02987bf..81e10e6a944 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -47,7 +47,7 @@ static ssize_t ecryptfs_read_update_atime(struct kiocb *iocb,
47 const struct iovec *iov, 47 const struct iovec *iov,
48 unsigned long nr_segs, loff_t pos) 48 unsigned long nr_segs, loff_t pos)
49{ 49{
50 int rc; 50 ssize_t rc;
51 struct dentry *lower_dentry; 51 struct dentry *lower_dentry;
52 struct vfsmount *lower_vfsmount; 52 struct vfsmount *lower_vfsmount;
53 struct file *file = iocb->ki_filp; 53 struct file *file = iocb->ki_filp;
@@ -191,18 +191,16 @@ static int ecryptfs_open(struct inode *inode, struct file *file)
191 | ECRYPTFS_ENCRYPTED); 191 | ECRYPTFS_ENCRYPTED);
192 } 192 }
193 mutex_unlock(&crypt_stat->cs_mutex); 193 mutex_unlock(&crypt_stat->cs_mutex);
194 if (!ecryptfs_inode_to_private(inode)->lower_file) { 194 rc = ecryptfs_init_persistent_file(ecryptfs_dentry);
195 rc = ecryptfs_init_persistent_file(ecryptfs_dentry); 195 if (rc) {
196 if (rc) { 196 printk(KERN_ERR "%s: Error attempting to initialize "
197 printk(KERN_ERR "%s: Error attempting to initialize " 197 "the persistent file for the dentry with name "
198 "the persistent file for the dentry with name " 198 "[%s]; rc = [%d]\n", __func__,
199 "[%s]; rc = [%d]\n", __func__, 199 ecryptfs_dentry->d_name.name, rc);
200 ecryptfs_dentry->d_name.name, rc); 200 goto out_free;
201 goto out_free;
202 }
203 } 201 }
204 if ((ecryptfs_inode_to_private(inode)->lower_file->f_flags & O_RDONLY) 202 if ((ecryptfs_inode_to_private(inode)->lower_file->f_flags & O_ACCMODE)
205 && !(file->f_flags & O_RDONLY)) { 203 == O_RDONLY && (file->f_flags & O_ACCMODE) != O_RDONLY) {
206 rc = -EPERM; 204 rc = -EPERM;
207 printk(KERN_WARNING "%s: Lower persistent file is RO; eCryptfs " 205 printk(KERN_WARNING "%s: Lower persistent file is RO; eCryptfs "
208 "file must hence be opened RO\n", __func__); 206 "file must hence be opened RO\n", __func__);
@@ -243,9 +241,9 @@ static int ecryptfs_open(struct inode *inode, struct file *file)
243 } 241 }
244 } 242 }
245 mutex_unlock(&crypt_stat->cs_mutex); 243 mutex_unlock(&crypt_stat->cs_mutex);
246 ecryptfs_printk(KERN_DEBUG, "inode w/ addr = [0x%p], i_ino = [0x%.16x] " 244 ecryptfs_printk(KERN_DEBUG, "inode w/ addr = [0x%p], i_ino = "
247 "size: [0x%.16x]\n", inode, inode->i_ino, 245 "[0x%.16lx] size: [0x%.16llx]\n", inode, inode->i_ino,
248 i_size_read(inode)); 246 (unsigned long long)i_size_read(inode));
249 goto out; 247 goto out;
250out_free: 248out_free:
251 kmem_cache_free(ecryptfs_file_info_cache, 249 kmem_cache_free(ecryptfs_file_info_cache,
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 9d1a22d6276..bd33f87a190 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -185,15 +185,13 @@ static int ecryptfs_initialize_file(struct dentry *ecryptfs_dentry)
185 "context; rc = [%d]\n", rc); 185 "context; rc = [%d]\n", rc);
186 goto out; 186 goto out;
187 } 187 }
188 if (!ecryptfs_inode_to_private(ecryptfs_dentry->d_inode)->lower_file) { 188 rc = ecryptfs_init_persistent_file(ecryptfs_dentry);
189 rc = ecryptfs_init_persistent_file(ecryptfs_dentry); 189 if (rc) {
190 if (rc) { 190 printk(KERN_ERR "%s: Error attempting to initialize "
191 printk(KERN_ERR "%s: Error attempting to initialize " 191 "the persistent file for the dentry with name "
192 "the persistent file for the dentry with name " 192 "[%s]; rc = [%d]\n", __func__,
193 "[%s]; rc = [%d]\n", __func__, 193 ecryptfs_dentry->d_name.name, rc);
194 ecryptfs_dentry->d_name.name, rc); 194 goto out;
195 goto out;
196 }
197 } 195 }
198 rc = ecryptfs_write_metadata(ecryptfs_dentry); 196 rc = ecryptfs_write_metadata(ecryptfs_dentry);
199 if (rc) { 197 if (rc) {
@@ -260,7 +258,7 @@ int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry,
260 ecryptfs_dentry->d_parent)); 258 ecryptfs_dentry->d_parent));
261 lower_inode = lower_dentry->d_inode; 259 lower_inode = lower_dentry->d_inode;
262 fsstack_copy_attr_atime(ecryptfs_dir_inode, lower_dir_dentry->d_inode); 260 fsstack_copy_attr_atime(ecryptfs_dir_inode, lower_dir_dentry->d_inode);
263 BUG_ON(!atomic_read(&lower_dentry->d_count)); 261 BUG_ON(!lower_dentry->d_count);
264 ecryptfs_set_dentry_private(ecryptfs_dentry, 262 ecryptfs_set_dentry_private(ecryptfs_dentry,
265 kmem_cache_alloc(ecryptfs_dentry_info_cache, 263 kmem_cache_alloc(ecryptfs_dentry_info_cache,
266 GFP_KERNEL)); 264 GFP_KERNEL));
@@ -302,15 +300,13 @@ int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry,
302 rc = -ENOMEM; 300 rc = -ENOMEM;
303 goto out; 301 goto out;
304 } 302 }
305 if (!ecryptfs_inode_to_private(ecryptfs_dentry->d_inode)->lower_file) { 303 rc = ecryptfs_init_persistent_file(ecryptfs_dentry);
306 rc = ecryptfs_init_persistent_file(ecryptfs_dentry); 304 if (rc) {
307 if (rc) { 305 printk(KERN_ERR "%s: Error attempting to initialize "
308 printk(KERN_ERR "%s: Error attempting to initialize " 306 "the persistent file for the dentry with name "
309 "the persistent file for the dentry with name " 307 "[%s]; rc = [%d]\n", __func__,
310 "[%s]; rc = [%d]\n", __func__, 308 ecryptfs_dentry->d_name.name, rc);
311 ecryptfs_dentry->d_name.name, rc); 309 goto out_free_kmem;
312 goto out_free_kmem;
313 }
314 } 310 }
315 crypt_stat = &ecryptfs_inode_to_private( 311 crypt_stat = &ecryptfs_inode_to_private(
316 ecryptfs_dentry->d_inode)->crypt_stat; 312 ecryptfs_dentry->d_inode)->crypt_stat;
@@ -441,7 +437,6 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
441 struct qstr lower_name; 437 struct qstr lower_name;
442 int rc = 0; 438 int rc = 0;
443 439
444 ecryptfs_dentry->d_op = &ecryptfs_dops;
445 if ((ecryptfs_dentry->d_name.len == 1 440 if ((ecryptfs_dentry->d_name.len == 1
446 && !strcmp(ecryptfs_dentry->d_name.name, ".")) 441 && !strcmp(ecryptfs_dentry->d_name.name, "."))
447 || (ecryptfs_dentry->d_name.len == 2 442 || (ecryptfs_dentry->d_name.len == 2
@@ -454,7 +449,7 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
454 lower_name.hash = ecryptfs_dentry->d_name.hash; 449 lower_name.hash = ecryptfs_dentry->d_name.hash;
455 if (lower_dir_dentry->d_op && lower_dir_dentry->d_op->d_hash) { 450 if (lower_dir_dentry->d_op && lower_dir_dentry->d_op->d_hash) {
456 rc = lower_dir_dentry->d_op->d_hash(lower_dir_dentry, 451 rc = lower_dir_dentry->d_op->d_hash(lower_dir_dentry,
457 &lower_name); 452 lower_dir_dentry->d_inode, &lower_name);
458 if (rc < 0) 453 if (rc < 0)
459 goto out_d_drop; 454 goto out_d_drop;
460 } 455 }
@@ -489,7 +484,7 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
489 lower_name.hash = full_name_hash(lower_name.name, lower_name.len); 484 lower_name.hash = full_name_hash(lower_name.name, lower_name.len);
490 if (lower_dir_dentry->d_op && lower_dir_dentry->d_op->d_hash) { 485 if (lower_dir_dentry->d_op && lower_dir_dentry->d_op->d_hash) {
491 rc = lower_dir_dentry->d_op->d_hash(lower_dir_dentry, 486 rc = lower_dir_dentry->d_op->d_hash(lower_dir_dentry,
492 &lower_name); 487 lower_dir_dentry->d_inode, &lower_name);
493 if (rc < 0) 488 if (rc < 0)
494 goto out_d_drop; 489 goto out_d_drop;
495 } 490 }
@@ -980,8 +975,10 @@ int ecryptfs_truncate(struct dentry *dentry, loff_t new_length)
980} 975}
981 976
982static int 977static int
983ecryptfs_permission(struct inode *inode, int mask) 978ecryptfs_permission(struct inode *inode, int mask, unsigned int flags)
984{ 979{
980 if (flags & IPERM_FLAG_RCU)
981 return -ECHILD;
985 return inode_permission(ecryptfs_inode_to_lower(inode), mask); 982 return inode_permission(ecryptfs_inode_to_lower(inode), mask);
986} 983}
987 984
diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index b1f6858a522..c1436cff6f2 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -59,7 +59,7 @@ static int process_request_key_err(long err_code)
59 break; 59 break;
60 default: 60 default:
61 ecryptfs_printk(KERN_WARNING, "Unknown error code: " 61 ecryptfs_printk(KERN_WARNING, "Unknown error code: "
62 "[0x%.16x]\n", err_code); 62 "[0x%.16lx]\n", err_code);
63 rc = -EINVAL; 63 rc = -EINVAL;
64 } 64 }
65 return rc; 65 return rc;
@@ -130,7 +130,7 @@ int ecryptfs_write_packet_length(char *dest, size_t size,
130 } else { 130 } else {
131 rc = -EINVAL; 131 rc = -EINVAL;
132 ecryptfs_printk(KERN_WARNING, 132 ecryptfs_printk(KERN_WARNING,
133 "Unsupported packet size: [%d]\n", size); 133 "Unsupported packet size: [%zd]\n", size);
134 } 134 }
135 return rc; 135 return rc;
136} 136}
@@ -1672,7 +1672,7 @@ decrypt_passphrase_encrypted_session_key(struct ecryptfs_auth_tok *auth_tok,
1672 auth_tok->session_key.decrypted_key_size); 1672 auth_tok->session_key.decrypted_key_size);
1673 crypt_stat->flags |= ECRYPTFS_KEY_VALID; 1673 crypt_stat->flags |= ECRYPTFS_KEY_VALID;
1674 if (unlikely(ecryptfs_verbosity > 0)) { 1674 if (unlikely(ecryptfs_verbosity > 0)) {
1675 ecryptfs_printk(KERN_DEBUG, "FEK of size [%d]:\n", 1675 ecryptfs_printk(KERN_DEBUG, "FEK of size [%zd]:\n",
1676 crypt_stat->key_size); 1676 crypt_stat->key_size);
1677 ecryptfs_dump_hex(crypt_stat->key, 1677 ecryptfs_dump_hex(crypt_stat->key,
1678 crypt_stat->key_size); 1678 crypt_stat->key_size);
@@ -1754,7 +1754,7 @@ int ecryptfs_parse_packet_set(struct ecryptfs_crypt_stat *crypt_stat,
1754 if (ECRYPTFS_SIG_SIZE != tag_11_contents_size) { 1754 if (ECRYPTFS_SIG_SIZE != tag_11_contents_size) {
1755 ecryptfs_printk(KERN_ERR, "Expected " 1755 ecryptfs_printk(KERN_ERR, "Expected "
1756 "signature of size [%d]; " 1756 "signature of size [%d]; "
1757 "read size [%d]\n", 1757 "read size [%zd]\n",
1758 ECRYPTFS_SIG_SIZE, 1758 ECRYPTFS_SIG_SIZE,
1759 tag_11_contents_size); 1759 tag_11_contents_size);
1760 rc = -EIO; 1760 rc = -EIO;
@@ -1787,8 +1787,8 @@ int ecryptfs_parse_packet_set(struct ecryptfs_crypt_stat *crypt_stat,
1787 goto out_wipe_list; 1787 goto out_wipe_list;
1788 break; 1788 break;
1789 default: 1789 default:
1790 ecryptfs_printk(KERN_DEBUG, "No packet at offset " 1790 ecryptfs_printk(KERN_DEBUG, "No packet at offset [%zd] "
1791 "[%d] of the file header; hex value of " 1791 "of the file header; hex value of "
1792 "character is [0x%.2x]\n", i, src[i]); 1792 "character is [0x%.2x]\n", i, src[i]);
1793 next_packet_is_auth_tok_packet = 0; 1793 next_packet_is_auth_tok_packet = 0;
1794 } 1794 }
@@ -1864,8 +1864,8 @@ found_matching_auth_tok:
1864 "session key for authentication token with sig " 1864 "session key for authentication token with sig "
1865 "[%.*s]; rc = [%d]. Removing auth tok " 1865 "[%.*s]; rc = [%d]. Removing auth tok "
1866 "candidate from the list and searching for " 1866 "candidate from the list and searching for "
1867 "the next match.\n", candidate_auth_tok_sig, 1867 "the next match.\n", ECRYPTFS_SIG_SIZE_HEX,
1868 ECRYPTFS_SIG_SIZE_HEX, rc); 1868 candidate_auth_tok_sig, rc);
1869 list_for_each_entry_safe(auth_tok_list_item, 1869 list_for_each_entry_safe(auth_tok_list_item,
1870 auth_tok_list_item_tmp, 1870 auth_tok_list_item_tmp,
1871 &auth_tok_list, list) { 1871 &auth_tok_list, list) {
@@ -2168,7 +2168,7 @@ write_tag_3_packet(char *dest, size_t *remaining_bytes,
2168 if (encrypted_session_key_valid) { 2168 if (encrypted_session_key_valid) {
2169 ecryptfs_printk(KERN_DEBUG, "encrypted_session_key_valid != 0; " 2169 ecryptfs_printk(KERN_DEBUG, "encrypted_session_key_valid != 0; "
2170 "using auth_tok->session_key.encrypted_key, " 2170 "using auth_tok->session_key.encrypted_key, "
2171 "where key_rec->enc_key_size = [%d]\n", 2171 "where key_rec->enc_key_size = [%zd]\n",
2172 key_rec->enc_key_size); 2172 key_rec->enc_key_size);
2173 memcpy(key_rec->enc_key, 2173 memcpy(key_rec->enc_key,
2174 auth_tok->session_key.encrypted_key, 2174 auth_tok->session_key.encrypted_key,
@@ -2198,7 +2198,7 @@ write_tag_3_packet(char *dest, size_t *remaining_bytes,
2198 if (rc < 1 || rc > 2) { 2198 if (rc < 1 || rc > 2) {
2199 ecryptfs_printk(KERN_ERR, "Error generating scatterlist " 2199 ecryptfs_printk(KERN_ERR, "Error generating scatterlist "
2200 "for crypt_stat session key; expected rc = 1; " 2200 "for crypt_stat session key; expected rc = 1; "
2201 "got rc = [%d]. key_rec->enc_key_size = [%d]\n", 2201 "got rc = [%d]. key_rec->enc_key_size = [%zd]\n",
2202 rc, key_rec->enc_key_size); 2202 rc, key_rec->enc_key_size);
2203 rc = -ENOMEM; 2203 rc = -ENOMEM;
2204 goto out; 2204 goto out;
@@ -2209,7 +2209,7 @@ write_tag_3_packet(char *dest, size_t *remaining_bytes,
2209 ecryptfs_printk(KERN_ERR, "Error generating scatterlist " 2209 ecryptfs_printk(KERN_ERR, "Error generating scatterlist "
2210 "for crypt_stat encrypted session key; " 2210 "for crypt_stat encrypted session key; "
2211 "expected rc = 1; got rc = [%d]. " 2211 "expected rc = 1; got rc = [%d]. "
2212 "key_rec->enc_key_size = [%d]\n", rc, 2212 "key_rec->enc_key_size = [%zd]\n", rc,
2213 key_rec->enc_key_size); 2213 key_rec->enc_key_size);
2214 rc = -ENOMEM; 2214 rc = -ENOMEM;
2215 goto out; 2215 goto out;
@@ -2224,7 +2224,7 @@ write_tag_3_packet(char *dest, size_t *remaining_bytes,
2224 goto out; 2224 goto out;
2225 } 2225 }
2226 rc = 0; 2226 rc = 0;
2227 ecryptfs_printk(KERN_DEBUG, "Encrypting [%d] bytes of the key\n", 2227 ecryptfs_printk(KERN_DEBUG, "Encrypting [%zd] bytes of the key\n",
2228 crypt_stat->key_size); 2228 crypt_stat->key_size);
2229 rc = crypto_blkcipher_encrypt(&desc, dst_sg, src_sg, 2229 rc = crypto_blkcipher_encrypt(&desc, dst_sg, src_sg,
2230 (*key_rec).enc_key_size); 2230 (*key_rec).enc_key_size);
@@ -2235,7 +2235,7 @@ write_tag_3_packet(char *dest, size_t *remaining_bytes,
2235 } 2235 }
2236 ecryptfs_printk(KERN_DEBUG, "This should be the encrypted key:\n"); 2236 ecryptfs_printk(KERN_DEBUG, "This should be the encrypted key:\n");
2237 if (ecryptfs_verbosity > 0) { 2237 if (ecryptfs_verbosity > 0) {
2238 ecryptfs_printk(KERN_DEBUG, "EFEK of size [%d]:\n", 2238 ecryptfs_printk(KERN_DEBUG, "EFEK of size [%zd]:\n",
2239 key_rec->enc_key_size); 2239 key_rec->enc_key_size);
2240 ecryptfs_dump_hex(key_rec->enc_key, 2240 ecryptfs_dump_hex(key_rec->enc_key,
2241 key_rec->enc_key_size); 2241 key_rec->enc_key_size);
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index a9dbd62518e..758323a0f09 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -36,6 +36,7 @@
36#include <linux/parser.h> 36#include <linux/parser.h>
37#include <linux/fs_stack.h> 37#include <linux/fs_stack.h>
38#include <linux/slab.h> 38#include <linux/slab.h>
39#include <linux/magic.h>
39#include "ecryptfs_kernel.h" 40#include "ecryptfs_kernel.h"
40 41
41/** 42/**
@@ -141,25 +142,12 @@ int ecryptfs_init_persistent_file(struct dentry *ecryptfs_dentry)
141 return rc; 142 return rc;
142} 143}
143 144
144/** 145static struct inode *ecryptfs_get_inode(struct inode *lower_inode,
145 * ecryptfs_interpose 146 struct super_block *sb)
146 * @lower_dentry: Existing dentry in the lower filesystem
147 * @dentry: ecryptfs' dentry
148 * @sb: ecryptfs's super_block
149 * @flags: flags to govern behavior of interpose procedure
150 *
151 * Interposes upper and lower dentries.
152 *
153 * Returns zero on success; non-zero otherwise
154 */
155int ecryptfs_interpose(struct dentry *lower_dentry, struct dentry *dentry,
156 struct super_block *sb, u32 flags)
157{ 147{
158 struct inode *lower_inode;
159 struct inode *inode; 148 struct inode *inode;
160 int rc = 0; 149 int rc = 0;
161 150
162 lower_inode = lower_dentry->d_inode;
163 if (lower_inode->i_sb != ecryptfs_superblock_to_lower(sb)) { 151 if (lower_inode->i_sb != ecryptfs_superblock_to_lower(sb)) {
164 rc = -EXDEV; 152 rc = -EXDEV;
165 goto out; 153 goto out;
@@ -189,17 +177,38 @@ int ecryptfs_interpose(struct dentry *lower_dentry, struct dentry *dentry,
189 if (special_file(lower_inode->i_mode)) 177 if (special_file(lower_inode->i_mode))
190 init_special_inode(inode, lower_inode->i_mode, 178 init_special_inode(inode, lower_inode->i_mode,
191 lower_inode->i_rdev); 179 lower_inode->i_rdev);
192 dentry->d_op = &ecryptfs_dops;
193 fsstack_copy_attr_all(inode, lower_inode); 180 fsstack_copy_attr_all(inode, lower_inode);
194 /* This size will be overwritten for real files w/ headers and 181 /* This size will be overwritten for real files w/ headers and
195 * other metadata */ 182 * other metadata */
196 fsstack_copy_inode_size(inode, lower_inode); 183 fsstack_copy_inode_size(inode, lower_inode);
184 return inode;
185out:
186 return ERR_PTR(rc);
187}
188
189/**
190 * ecryptfs_interpose
191 * @lower_dentry: Existing dentry in the lower filesystem
192 * @dentry: ecryptfs' dentry
193 * @sb: ecryptfs's super_block
194 * @flags: flags to govern behavior of interpose procedure
195 *
196 * Interposes upper and lower dentries.
197 *
198 * Returns zero on success; non-zero otherwise
199 */
200int ecryptfs_interpose(struct dentry *lower_dentry, struct dentry *dentry,
201 struct super_block *sb, u32 flags)
202{
203 struct inode *lower_inode = lower_dentry->d_inode;
204 struct inode *inode = ecryptfs_get_inode(lower_inode, sb);
205 if (IS_ERR(inode))
206 return PTR_ERR(inode);
197 if (flags & ECRYPTFS_INTERPOSE_FLAG_D_ADD) 207 if (flags & ECRYPTFS_INTERPOSE_FLAG_D_ADD)
198 d_add(dentry, inode); 208 d_add(dentry, inode);
199 else 209 else
200 d_instantiate(dentry, inode); 210 d_instantiate(dentry, inode);
201out: 211 return 0;
202 return rc;
203} 212}
204 213
205enum { ecryptfs_opt_sig, ecryptfs_opt_ecryptfs_sig, 214enum { ecryptfs_opt_sig, ecryptfs_opt_ecryptfs_sig,
@@ -492,59 +501,11 @@ struct kmem_cache *ecryptfs_sb_info_cache;
492static struct file_system_type ecryptfs_fs_type; 501static struct file_system_type ecryptfs_fs_type;
493 502
494/** 503/**
495 * ecryptfs_read_super
496 * @sb: The ecryptfs super block
497 * @dev_name: The path to mount over
498 *
499 * Read the super block of the lower filesystem, and use
500 * ecryptfs_interpose to create our initial inode and super block
501 * struct.
502 */
503static int ecryptfs_read_super(struct super_block *sb, const char *dev_name)
504{
505 struct path path;
506 int rc;
507
508 rc = kern_path(dev_name, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &path);
509 if (rc) {
510 ecryptfs_printk(KERN_WARNING, "path_lookup() failed\n");
511 goto out;
512 }
513 if (path.dentry->d_sb->s_type == &ecryptfs_fs_type) {
514 rc = -EINVAL;
515 printk(KERN_ERR "Mount on filesystem of type "
516 "eCryptfs explicitly disallowed due to "
517 "known incompatibilities\n");
518 goto out_free;
519 }
520 ecryptfs_set_superblock_lower(sb, path.dentry->d_sb);
521 sb->s_maxbytes = path.dentry->d_sb->s_maxbytes;
522 sb->s_blocksize = path.dentry->d_sb->s_blocksize;
523 ecryptfs_set_dentry_lower(sb->s_root, path.dentry);
524 ecryptfs_set_dentry_lower_mnt(sb->s_root, path.mnt);
525 rc = ecryptfs_interpose(path.dentry, sb->s_root, sb, 0);
526 if (rc)
527 goto out_free;
528 rc = 0;
529 goto out;
530out_free:
531 path_put(&path);
532out:
533 return rc;
534}
535
536/**
537 * ecryptfs_get_sb 504 * ecryptfs_get_sb
538 * @fs_type 505 * @fs_type
539 * @flags 506 * @flags
540 * @dev_name: The path to mount over 507 * @dev_name: The path to mount over
541 * @raw_data: The options passed into the kernel 508 * @raw_data: The options passed into the kernel
542 *
543 * The whole ecryptfs_get_sb process is broken into 3 functions:
544 * ecryptfs_parse_options(): handle options passed to ecryptfs, if any
545 * ecryptfs_read_super(): this accesses the lower filesystem and uses
546 * ecryptfs_interpose to perform most of the linking
547 * ecryptfs_interpose(): links the lower filesystem into ecryptfs (inode.c)
548 */ 509 */
549static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags, 510static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags,
550 const char *dev_name, void *raw_data) 511 const char *dev_name, void *raw_data)
@@ -553,6 +514,8 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags
553 struct ecryptfs_sb_info *sbi; 514 struct ecryptfs_sb_info *sbi;
554 struct ecryptfs_dentry_info *root_info; 515 struct ecryptfs_dentry_info *root_info;
555 const char *err = "Getting sb failed"; 516 const char *err = "Getting sb failed";
517 struct inode *inode;
518 struct path path;
556 int rc; 519 int rc;
557 520
558 sbi = kmem_cache_zalloc(ecryptfs_sb_info_cache, GFP_KERNEL); 521 sbi = kmem_cache_zalloc(ecryptfs_sb_info_cache, GFP_KERNEL);
@@ -575,10 +538,8 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags
575 538
576 s->s_flags = flags; 539 s->s_flags = flags;
577 rc = bdi_setup_and_register(&sbi->bdi, "ecryptfs", BDI_CAP_MAP_COPY); 540 rc = bdi_setup_and_register(&sbi->bdi, "ecryptfs", BDI_CAP_MAP_COPY);
578 if (rc) { 541 if (rc)
579 deactivate_locked_super(s); 542 goto out1;
580 goto out;
581 }
582 543
583 ecryptfs_set_superblock_private(s, sbi); 544 ecryptfs_set_superblock_private(s, sbi);
584 s->s_bdi = &sbi->bdi; 545 s->s_bdi = &sbi->bdi;
@@ -586,34 +547,55 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags
586 /* ->kill_sb() will take care of sbi after that point */ 547 /* ->kill_sb() will take care of sbi after that point */
587 sbi = NULL; 548 sbi = NULL;
588 s->s_op = &ecryptfs_sops; 549 s->s_op = &ecryptfs_sops;
550 s->s_d_op = &ecryptfs_dops;
589 551
590 rc = -ENOMEM; 552 err = "Reading sb failed";
591 s->s_root = d_alloc(NULL, &(const struct qstr) { 553 rc = kern_path(dev_name, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &path);
592 .hash = 0,.name = "/",.len = 1}); 554 if (rc) {
555 ecryptfs_printk(KERN_WARNING, "kern_path() failed\n");
556 goto out1;
557 }
558 if (path.dentry->d_sb->s_type == &ecryptfs_fs_type) {
559 rc = -EINVAL;
560 printk(KERN_ERR "Mount on filesystem of type "
561 "eCryptfs explicitly disallowed due to "
562 "known incompatibilities\n");
563 goto out_free;
564 }
565 ecryptfs_set_superblock_lower(s, path.dentry->d_sb);
566 s->s_maxbytes = path.dentry->d_sb->s_maxbytes;
567 s->s_blocksize = path.dentry->d_sb->s_blocksize;
568 s->s_magic = ECRYPTFS_SUPER_MAGIC;
569
570 inode = ecryptfs_get_inode(path.dentry->d_inode, s);
571 rc = PTR_ERR(inode);
572 if (IS_ERR(inode))
573 goto out_free;
574
575 s->s_root = d_alloc_root(inode);
593 if (!s->s_root) { 576 if (!s->s_root) {
594 deactivate_locked_super(s); 577 iput(inode);
595 goto out; 578 rc = -ENOMEM;
579 goto out_free;
596 } 580 }
597 s->s_root->d_op = &ecryptfs_dops;
598 s->s_root->d_sb = s;
599 s->s_root->d_parent = s->s_root;
600 581
582 rc = -ENOMEM;
601 root_info = kmem_cache_zalloc(ecryptfs_dentry_info_cache, GFP_KERNEL); 583 root_info = kmem_cache_zalloc(ecryptfs_dentry_info_cache, GFP_KERNEL);
602 if (!root_info) { 584 if (!root_info)
603 deactivate_locked_super(s); 585 goto out_free;
604 goto out; 586
605 }
606 /* ->kill_sb() will take care of root_info */ 587 /* ->kill_sb() will take care of root_info */
607 ecryptfs_set_dentry_private(s->s_root, root_info); 588 ecryptfs_set_dentry_private(s->s_root, root_info);
589 ecryptfs_set_dentry_lower(s->s_root, path.dentry);
590 ecryptfs_set_dentry_lower_mnt(s->s_root, path.mnt);
591
608 s->s_flags |= MS_ACTIVE; 592 s->s_flags |= MS_ACTIVE;
609 rc = ecryptfs_read_super(s, dev_name);
610 if (rc) {
611 deactivate_locked_super(s);
612 err = "Reading sb failed";
613 goto out;
614 }
615 return dget(s->s_root); 593 return dget(s->s_root);
616 594
595out_free:
596 path_put(&path);
597out1:
598 deactivate_locked_super(s);
617out: 599out:
618 if (sbi) { 600 if (sbi) {
619 ecryptfs_destroy_mount_crypt_stat(&sbi->mount_crypt_stat); 601 ecryptfs_destroy_mount_crypt_stat(&sbi->mount_crypt_stat);
@@ -828,9 +810,10 @@ static int __init ecryptfs_init(void)
828 ecryptfs_printk(KERN_ERR, "The eCryptfs extent size is " 810 ecryptfs_printk(KERN_ERR, "The eCryptfs extent size is "
829 "larger than the host's page size, and so " 811 "larger than the host's page size, and so "
830 "eCryptfs cannot run on this system. The " 812 "eCryptfs cannot run on this system. The "
831 "default eCryptfs extent size is [%d] bytes; " 813 "default eCryptfs extent size is [%u] bytes; "
832 "the page size is [%d] bytes.\n", 814 "the page size is [%lu] bytes.\n",
833 ECRYPTFS_DEFAULT_EXTENT_SIZE, PAGE_CACHE_SIZE); 815 ECRYPTFS_DEFAULT_EXTENT_SIZE,
816 (unsigned long)PAGE_CACHE_SIZE);
834 goto out; 817 goto out;
835 } 818 }
836 rc = ecryptfs_init_kmem_caches(); 819 rc = ecryptfs_init_kmem_caches();
diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c
index b1d82756544..cc64fca89f8 100644
--- a/fs/ecryptfs/mmap.c
+++ b/fs/ecryptfs/mmap.c
@@ -65,7 +65,7 @@ static int ecryptfs_writepage(struct page *page, struct writeback_control *wbc)
65 rc = ecryptfs_encrypt_page(page); 65 rc = ecryptfs_encrypt_page(page);
66 if (rc) { 66 if (rc) {
67 ecryptfs_printk(KERN_WARNING, "Error encrypting " 67 ecryptfs_printk(KERN_WARNING, "Error encrypting "
68 "page (upper index [0x%.16x])\n", page->index); 68 "page (upper index [0x%.16lx])\n", page->index);
69 ClearPageUptodate(page); 69 ClearPageUptodate(page);
70 goto out; 70 goto out;
71 } 71 }
@@ -237,7 +237,7 @@ out:
237 ClearPageUptodate(page); 237 ClearPageUptodate(page);
238 else 238 else
239 SetPageUptodate(page); 239 SetPageUptodate(page);
240 ecryptfs_printk(KERN_DEBUG, "Unlocking page with index = [0x%.16x]\n", 240 ecryptfs_printk(KERN_DEBUG, "Unlocking page with index = [0x%.16lx]\n",
241 page->index); 241 page->index);
242 unlock_page(page); 242 unlock_page(page);
243 return rc; 243 return rc;
@@ -290,6 +290,7 @@ static int ecryptfs_write_begin(struct file *file,
290 return -ENOMEM; 290 return -ENOMEM;
291 *pagep = page; 291 *pagep = page;
292 292
293 prev_page_end_size = ((loff_t)index << PAGE_CACHE_SHIFT);
293 if (!PageUptodate(page)) { 294 if (!PageUptodate(page)) {
294 struct ecryptfs_crypt_stat *crypt_stat = 295 struct ecryptfs_crypt_stat *crypt_stat =
295 &ecryptfs_inode_to_private(mapping->host)->crypt_stat; 296 &ecryptfs_inode_to_private(mapping->host)->crypt_stat;
@@ -335,18 +336,23 @@ static int ecryptfs_write_begin(struct file *file,
335 SetPageUptodate(page); 336 SetPageUptodate(page);
336 } 337 }
337 } else { 338 } else {
338 rc = ecryptfs_decrypt_page(page); 339 if (prev_page_end_size
339 if (rc) { 340 >= i_size_read(page->mapping->host)) {
340 printk(KERN_ERR "%s: Error decrypting page " 341 zero_user(page, 0, PAGE_CACHE_SIZE);
341 "at index [%ld]; rc = [%d]\n", 342 } else {
342 __func__, page->index, rc); 343 rc = ecryptfs_decrypt_page(page);
343 ClearPageUptodate(page); 344 if (rc) {
344 goto out; 345 printk(KERN_ERR "%s: Error decrypting "
346 "page at index [%ld]; "
347 "rc = [%d]\n",
348 __func__, page->index, rc);
349 ClearPageUptodate(page);
350 goto out;
351 }
345 } 352 }
346 SetPageUptodate(page); 353 SetPageUptodate(page);
347 } 354 }
348 } 355 }
349 prev_page_end_size = ((loff_t)index << PAGE_CACHE_SHIFT);
350 /* If creating a page or more of holes, zero them out via truncate. 356 /* If creating a page or more of holes, zero them out via truncate.
351 * Note, this will increase i_size. */ 357 * Note, this will increase i_size. */
352 if (index != 0) { 358 if (index != 0) {
@@ -488,7 +494,7 @@ static int ecryptfs_write_end(struct file *file,
488 } else 494 } else
489 ecryptfs_printk(KERN_DEBUG, "Not a new file\n"); 495 ecryptfs_printk(KERN_DEBUG, "Not a new file\n");
490 ecryptfs_printk(KERN_DEBUG, "Calling fill_zeros_to_end_of_page" 496 ecryptfs_printk(KERN_DEBUG, "Calling fill_zeros_to_end_of_page"
491 "(page w/ index = [0x%.16x], to = [%d])\n", index, to); 497 "(page w/ index = [0x%.16lx], to = [%d])\n", index, to);
492 if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) { 498 if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) {
493 rc = ecryptfs_write_lower_page_segment(ecryptfs_inode, page, 0, 499 rc = ecryptfs_write_lower_page_segment(ecryptfs_inode, page, 0,
494 to); 500 to);
@@ -503,19 +509,20 @@ static int ecryptfs_write_end(struct file *file,
503 rc = fill_zeros_to_end_of_page(page, to); 509 rc = fill_zeros_to_end_of_page(page, to);
504 if (rc) { 510 if (rc) {
505 ecryptfs_printk(KERN_WARNING, "Error attempting to fill " 511 ecryptfs_printk(KERN_WARNING, "Error attempting to fill "
506 "zeros in page with index = [0x%.16x]\n", index); 512 "zeros in page with index = [0x%.16lx]\n", index);
507 goto out; 513 goto out;
508 } 514 }
509 rc = ecryptfs_encrypt_page(page); 515 rc = ecryptfs_encrypt_page(page);
510 if (rc) { 516 if (rc) {
511 ecryptfs_printk(KERN_WARNING, "Error encrypting page (upper " 517 ecryptfs_printk(KERN_WARNING, "Error encrypting page (upper "
512 "index [0x%.16x])\n", index); 518 "index [0x%.16lx])\n", index);
513 goto out; 519 goto out;
514 } 520 }
515 if (pos + copied > i_size_read(ecryptfs_inode)) { 521 if (pos + copied > i_size_read(ecryptfs_inode)) {
516 i_size_write(ecryptfs_inode, pos + copied); 522 i_size_write(ecryptfs_inode, pos + copied);
517 ecryptfs_printk(KERN_DEBUG, "Expanded file size to " 523 ecryptfs_printk(KERN_DEBUG, "Expanded file size to "
518 "[0x%.16x]\n", i_size_read(ecryptfs_inode)); 524 "[0x%.16llx]\n",
525 (unsigned long long)i_size_read(ecryptfs_inode));
519 } 526 }
520 rc = ecryptfs_write_inode_size_to_metadata(ecryptfs_inode); 527 rc = ecryptfs_write_inode_size_to_metadata(ecryptfs_inode);
521 if (rc) 528 if (rc)
diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c
index 253732382d3..3042fe123a3 100644
--- a/fs/ecryptfs/super.c
+++ b/fs/ecryptfs/super.c
@@ -28,7 +28,6 @@
28#include <linux/key.h> 28#include <linux/key.h>
29#include <linux/slab.h> 29#include <linux/slab.h>
30#include <linux/seq_file.h> 30#include <linux/seq_file.h>
31#include <linux/smp_lock.h>
32#include <linux/file.h> 31#include <linux/file.h>
33#include <linux/crypto.h> 32#include <linux/crypto.h>
34#include "ecryptfs_kernel.h" 33#include "ecryptfs_kernel.h"
@@ -63,6 +62,16 @@ out:
63 return inode; 62 return inode;
64} 63}
65 64
65static void ecryptfs_i_callback(struct rcu_head *head)
66{
67 struct inode *inode = container_of(head, struct inode, i_rcu);
68 struct ecryptfs_inode_info *inode_info;
69 inode_info = ecryptfs_inode_to_private(inode);
70
71 INIT_LIST_HEAD(&inode->i_dentry);
72 kmem_cache_free(ecryptfs_inode_info_cache, inode_info);
73}
74
66/** 75/**
67 * ecryptfs_destroy_inode 76 * ecryptfs_destroy_inode
68 * @inode: The ecryptfs inode 77 * @inode: The ecryptfs inode
@@ -89,7 +98,7 @@ static void ecryptfs_destroy_inode(struct inode *inode)
89 } 98 }
90 } 99 }
91 ecryptfs_destroy_crypt_stat(&inode_info->crypt_stat); 100 ecryptfs_destroy_crypt_stat(&inode_info->crypt_stat);
92 kmem_cache_free(ecryptfs_inode_info_cache, inode_info); 101 call_rcu(&inode->i_rcu, ecryptfs_i_callback);
93} 102}
94 103
95/** 104/**
diff --git a/fs/efs/super.c b/fs/efs/super.c
index 5073a07652c..0f31acb0131 100644
--- a/fs/efs/super.c
+++ b/fs/efs/super.c
@@ -65,11 +65,18 @@ static struct inode *efs_alloc_inode(struct super_block *sb)
65 return &ei->vfs_inode; 65 return &ei->vfs_inode;
66} 66}
67 67
68static void efs_destroy_inode(struct inode *inode) 68static void efs_i_callback(struct rcu_head *head)
69{ 69{
70 struct inode *inode = container_of(head, struct inode, i_rcu);
71 INIT_LIST_HEAD(&inode->i_dentry);
70 kmem_cache_free(efs_inode_cachep, INODE_INFO(inode)); 72 kmem_cache_free(efs_inode_cachep, INODE_INFO(inode));
71} 73}
72 74
75static void efs_destroy_inode(struct inode *inode)
76{
77 call_rcu(&inode->i_rcu, efs_i_callback);
78}
79
73static void init_once(void *foo) 80static void init_once(void *foo)
74{ 81{
75 struct efs_inode_info *ei = (struct efs_inode_info *) foo; 82 struct efs_inode_info *ei = (struct efs_inode_info *) foo;
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 8cf07242067..cc8a9b7d606 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -217,7 +217,7 @@ struct ep_send_events_data {
217 * Configuration options available inside /proc/sys/fs/epoll/ 217 * Configuration options available inside /proc/sys/fs/epoll/
218 */ 218 */
219/* Maximum number of epoll watched descriptors, per user */ 219/* Maximum number of epoll watched descriptors, per user */
220static int max_user_watches __read_mostly; 220static long max_user_watches __read_mostly;
221 221
222/* 222/*
223 * This mutex is used to serialize ep_free() and eventpoll_release_file(). 223 * This mutex is used to serialize ep_free() and eventpoll_release_file().
@@ -240,16 +240,18 @@ static struct kmem_cache *pwq_cache __read_mostly;
240 240
241#include <linux/sysctl.h> 241#include <linux/sysctl.h>
242 242
243static int zero; 243static long zero;
244static long long_max = LONG_MAX;
244 245
245ctl_table epoll_table[] = { 246ctl_table epoll_table[] = {
246 { 247 {
247 .procname = "max_user_watches", 248 .procname = "max_user_watches",
248 .data = &max_user_watches, 249 .data = &max_user_watches,
249 .maxlen = sizeof(int), 250 .maxlen = sizeof(max_user_watches),
250 .mode = 0644, 251 .mode = 0644,
251 .proc_handler = proc_dointvec_minmax, 252 .proc_handler = proc_doulongvec_minmax,
252 .extra1 = &zero, 253 .extra1 = &zero,
254 .extra2 = &long_max,
253 }, 255 },
254 { } 256 { }
255}; 257};
@@ -561,7 +563,7 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi)
561 /* At this point it is safe to free the eventpoll item */ 563 /* At this point it is safe to free the eventpoll item */
562 kmem_cache_free(epi_cache, epi); 564 kmem_cache_free(epi_cache, epi);
563 565
564 atomic_dec(&ep->user->epoll_watches); 566 atomic_long_dec(&ep->user->epoll_watches);
565 567
566 return 0; 568 return 0;
567} 569}
@@ -898,11 +900,12 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
898{ 900{
899 int error, revents, pwake = 0; 901 int error, revents, pwake = 0;
900 unsigned long flags; 902 unsigned long flags;
903 long user_watches;
901 struct epitem *epi; 904 struct epitem *epi;
902 struct ep_pqueue epq; 905 struct ep_pqueue epq;
903 906
904 if (unlikely(atomic_read(&ep->user->epoll_watches) >= 907 user_watches = atomic_long_read(&ep->user->epoll_watches);
905 max_user_watches)) 908 if (unlikely(user_watches >= max_user_watches))
906 return -ENOSPC; 909 return -ENOSPC;
907 if (!(epi = kmem_cache_alloc(epi_cache, GFP_KERNEL))) 910 if (!(epi = kmem_cache_alloc(epi_cache, GFP_KERNEL)))
908 return -ENOMEM; 911 return -ENOMEM;
@@ -966,7 +969,7 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
966 969
967 spin_unlock_irqrestore(&ep->lock, flags); 970 spin_unlock_irqrestore(&ep->lock, flags);
968 971
969 atomic_inc(&ep->user->epoll_watches); 972 atomic_long_inc(&ep->user->epoll_watches);
970 973
971 /* We have to call this outside the lock */ 974 /* We have to call this outside the lock */
972 if (pwake) 975 if (pwake)
@@ -1426,6 +1429,7 @@ static int __init eventpoll_init(void)
1426 */ 1429 */
1427 max_user_watches = (((si.totalram - si.totalhigh) / 25) << PAGE_SHIFT) / 1430 max_user_watches = (((si.totalram - si.totalhigh) / 25) << PAGE_SHIFT) /
1428 EP_ITEM_COST; 1431 EP_ITEM_COST;
1432 BUG_ON(max_user_watches < 0);
1429 1433
1430 /* Initialize the structure used to perform safe poll wait head wake ups */ 1434 /* Initialize the structure used to perform safe poll wait head wake ups */
1431 ep_nested_calls_init(&poll_safewake_ncalls); 1435 ep_nested_calls_init(&poll_safewake_ncalls);
diff --git a/fs/exec.c b/fs/exec.c
index 99d33a1371e..c62efcb959c 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -164,7 +164,26 @@ out:
164 164
165#ifdef CONFIG_MMU 165#ifdef CONFIG_MMU
166 166
167static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, 167void acct_arg_size(struct linux_binprm *bprm, unsigned long pages)
168{
169 struct mm_struct *mm = current->mm;
170 long diff = (long)(pages - bprm->vma_pages);
171
172 if (!mm || !diff)
173 return;
174
175 bprm->vma_pages = pages;
176
177#ifdef SPLIT_RSS_COUNTING
178 add_mm_counter(mm, MM_ANONPAGES, diff);
179#else
180 spin_lock(&mm->page_table_lock);
181 add_mm_counter(mm, MM_ANONPAGES, diff);
182 spin_unlock(&mm->page_table_lock);
183#endif
184}
185
186struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
168 int write) 187 int write)
169{ 188{
170 struct page *page; 189 struct page *page;
@@ -186,6 +205,8 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
186 unsigned long size = bprm->vma->vm_end - bprm->vma->vm_start; 205 unsigned long size = bprm->vma->vm_end - bprm->vma->vm_start;
187 struct rlimit *rlim; 206 struct rlimit *rlim;
188 207
208 acct_arg_size(bprm, size / PAGE_SIZE);
209
189 /* 210 /*
190 * We've historically supported up to 32 pages (ARG_MAX) 211 * We've historically supported up to 32 pages (ARG_MAX)
191 * of argument strings even with small stacks 212 * of argument strings even with small stacks
@@ -254,6 +275,11 @@ static int __bprm_mm_init(struct linux_binprm *bprm)
254 vma->vm_flags = VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP; 275 vma->vm_flags = VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP;
255 vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); 276 vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
256 INIT_LIST_HEAD(&vma->anon_vma_chain); 277 INIT_LIST_HEAD(&vma->anon_vma_chain);
278
279 err = security_file_mmap(NULL, 0, 0, 0, vma->vm_start, 1);
280 if (err)
281 goto err;
282
257 err = insert_vm_struct(mm, vma); 283 err = insert_vm_struct(mm, vma);
258 if (err) 284 if (err)
259 goto err; 285 goto err;
@@ -276,7 +302,11 @@ static bool valid_arg_len(struct linux_binprm *bprm, long len)
276 302
277#else 303#else
278 304
279static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, 305void acct_arg_size(struct linux_binprm *bprm, unsigned long pages)
306{
307}
308
309struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
280 int write) 310 int write)
281{ 311{
282 struct page *page; 312 struct page *page;
@@ -1003,6 +1033,7 @@ int flush_old_exec(struct linux_binprm * bprm)
1003 /* 1033 /*
1004 * Release all of the old mmap stuff 1034 * Release all of the old mmap stuff
1005 */ 1035 */
1036 acct_arg_size(bprm, 0);
1006 retval = exec_mmap(bprm->mm); 1037 retval = exec_mmap(bprm->mm);
1007 if (retval) 1038 if (retval)
1008 goto out; 1039 goto out;
@@ -1426,8 +1457,10 @@ int do_execve(const char * filename,
1426 return retval; 1457 return retval;
1427 1458
1428out: 1459out:
1429 if (bprm->mm) 1460 if (bprm->mm) {
1430 mmput (bprm->mm); 1461 acct_arg_size(bprm, 0);
1462 mmput(bprm->mm);
1463 }
1431 1464
1432out_file: 1465out_file:
1433 if (bprm->file) { 1466 if (bprm->file) {
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index 79c3ae6e045..8c6c4669b38 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -150,12 +150,19 @@ static struct inode *exofs_alloc_inode(struct super_block *sb)
150 return &oi->vfs_inode; 150 return &oi->vfs_inode;
151} 151}
152 152
153static void exofs_i_callback(struct rcu_head *head)
154{
155 struct inode *inode = container_of(head, struct inode, i_rcu);
156 INIT_LIST_HEAD(&inode->i_dentry);
157 kmem_cache_free(exofs_inode_cachep, exofs_i(inode));
158}
159
153/* 160/*
154 * Remove an inode from the cache 161 * Remove an inode from the cache
155 */ 162 */
156static void exofs_destroy_inode(struct inode *inode) 163static void exofs_destroy_inode(struct inode *inode)
157{ 164{
158 kmem_cache_free(exofs_inode_cachep, exofs_i(inode)); 165 call_rcu(&inode->i_rcu, exofs_i_callback);
159} 166}
160 167
161/* 168/*
diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index 51b304056f1..4b6825740dd 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -43,24 +43,26 @@ find_acceptable_alias(struct dentry *result,
43 void *context) 43 void *context)
44{ 44{
45 struct dentry *dentry, *toput = NULL; 45 struct dentry *dentry, *toput = NULL;
46 struct inode *inode;
46 47
47 if (acceptable(context, result)) 48 if (acceptable(context, result))
48 return result; 49 return result;
49 50
50 spin_lock(&dcache_lock); 51 inode = result->d_inode;
51 list_for_each_entry(dentry, &result->d_inode->i_dentry, d_alias) { 52 spin_lock(&inode->i_lock);
52 dget_locked(dentry); 53 list_for_each_entry(dentry, &inode->i_dentry, d_alias) {
53 spin_unlock(&dcache_lock); 54 dget(dentry);
55 spin_unlock(&inode->i_lock);
54 if (toput) 56 if (toput)
55 dput(toput); 57 dput(toput);
56 if (dentry != result && acceptable(context, dentry)) { 58 if (dentry != result && acceptable(context, dentry)) {
57 dput(result); 59 dput(result);
58 return dentry; 60 return dentry;
59 } 61 }
60 spin_lock(&dcache_lock); 62 spin_lock(&inode->i_lock);
61 toput = dentry; 63 toput = dentry;
62 } 64 }
63 spin_unlock(&dcache_lock); 65 spin_unlock(&inode->i_lock);
64 66
65 if (toput) 67 if (toput)
66 dput(toput); 68 dput(toput);
diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c
index 2bcc0431bad..7b4180554a6 100644
--- a/fs/ext2/acl.c
+++ b/fs/ext2/acl.c
@@ -232,10 +232,17 @@ ext2_set_acl(struct inode *inode, int type, struct posix_acl *acl)
232} 232}
233 233
234int 234int
235ext2_check_acl(struct inode *inode, int mask) 235ext2_check_acl(struct inode *inode, int mask, unsigned int flags)
236{ 236{
237 struct posix_acl *acl = ext2_get_acl(inode, ACL_TYPE_ACCESS); 237 struct posix_acl *acl;
238
239 if (flags & IPERM_FLAG_RCU) {
240 if (!negative_cached_acl(inode, ACL_TYPE_ACCESS))
241 return -ECHILD;
242 return -EAGAIN;
243 }
238 244
245 acl = ext2_get_acl(inode, ACL_TYPE_ACCESS);
239 if (IS_ERR(acl)) 246 if (IS_ERR(acl))
240 return PTR_ERR(acl); 247 return PTR_ERR(acl);
241 if (acl) { 248 if (acl) {
diff --git a/fs/ext2/acl.h b/fs/ext2/acl.h
index 3ff6cbb9ac4..c939b7b1209 100644
--- a/fs/ext2/acl.h
+++ b/fs/ext2/acl.h
@@ -54,7 +54,7 @@ static inline int ext2_acl_count(size_t size)
54#ifdef CONFIG_EXT2_FS_POSIX_ACL 54#ifdef CONFIG_EXT2_FS_POSIX_ACL
55 55
56/* acl.c */ 56/* acl.c */
57extern int ext2_check_acl (struct inode *, int); 57extern int ext2_check_acl (struct inode *, int, unsigned int);
58extern int ext2_acl_chmod (struct inode *); 58extern int ext2_acl_chmod (struct inode *);
59extern int ext2_init_acl (struct inode *, struct inode *); 59extern int ext2_init_acl (struct inode *, struct inode *);
60 60
diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c
index 2709b34206a..47cda410b54 100644
--- a/fs/ext2/dir.c
+++ b/fs/ext2/dir.c
@@ -28,21 +28,30 @@
28 28
29typedef struct ext2_dir_entry_2 ext2_dirent; 29typedef struct ext2_dir_entry_2 ext2_dirent;
30 30
31/*
32 * Tests against MAX_REC_LEN etc were put in place for 64k block
33 * sizes; if that is not possible on this arch, we can skip
34 * those tests and speed things up.
35 */
31static inline unsigned ext2_rec_len_from_disk(__le16 dlen) 36static inline unsigned ext2_rec_len_from_disk(__le16 dlen)
32{ 37{
33 unsigned len = le16_to_cpu(dlen); 38 unsigned len = le16_to_cpu(dlen);
34 39
40#if (PAGE_CACHE_SIZE >= 65536)
35 if (len == EXT2_MAX_REC_LEN) 41 if (len == EXT2_MAX_REC_LEN)
36 return 1 << 16; 42 return 1 << 16;
43#endif
37 return len; 44 return len;
38} 45}
39 46
40static inline __le16 ext2_rec_len_to_disk(unsigned len) 47static inline __le16 ext2_rec_len_to_disk(unsigned len)
41{ 48{
49#if (PAGE_CACHE_SIZE >= 65536)
42 if (len == (1 << 16)) 50 if (len == (1 << 16))
43 return cpu_to_le16(EXT2_MAX_REC_LEN); 51 return cpu_to_le16(EXT2_MAX_REC_LEN);
44 else 52 else
45 BUG_ON(len > (1 << 16)); 53 BUG_ON(len > (1 << 16));
54#endif
46 return cpu_to_le16(len); 55 return cpu_to_le16(len);
47} 56}
48 57
@@ -129,15 +138,15 @@ static void ext2_check_page(struct page *page, int quiet)
129 p = (ext2_dirent *)(kaddr + offs); 138 p = (ext2_dirent *)(kaddr + offs);
130 rec_len = ext2_rec_len_from_disk(p->rec_len); 139 rec_len = ext2_rec_len_from_disk(p->rec_len);
131 140
132 if (rec_len < EXT2_DIR_REC_LEN(1)) 141 if (unlikely(rec_len < EXT2_DIR_REC_LEN(1)))
133 goto Eshort; 142 goto Eshort;
134 if (rec_len & 3) 143 if (unlikely(rec_len & 3))
135 goto Ealign; 144 goto Ealign;
136 if (rec_len < EXT2_DIR_REC_LEN(p->name_len)) 145 if (unlikely(rec_len < EXT2_DIR_REC_LEN(p->name_len)))
137 goto Enamelen; 146 goto Enamelen;
138 if (((offs + rec_len - 1) ^ offs) & ~(chunk_size-1)) 147 if (unlikely(((offs + rec_len - 1) ^ offs) & ~(chunk_size-1)))
139 goto Espan; 148 goto Espan;
140 if (le32_to_cpu(p->inode) > max_inumber) 149 if (unlikely(le32_to_cpu(p->inode) > max_inumber))
141 goto Einumber; 150 goto Einumber;
142 } 151 }
143 if (offs != limit) 152 if (offs != limit)
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index f8aecd2e329..2e1d8341d82 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -67,7 +67,7 @@ static struct dentry *ext2_lookup(struct inode * dir, struct dentry *dentry, str
67 inode = NULL; 67 inode = NULL;
68 if (ino) { 68 if (ino) {
69 inode = ext2_iget(dir->i_sb, ino); 69 inode = ext2_iget(dir->i_sb, ino);
70 if (unlikely(IS_ERR(inode))) { 70 if (IS_ERR(inode)) {
71 if (PTR_ERR(inode) == -ESTALE) { 71 if (PTR_ERR(inode) == -ESTALE) {
72 ext2_error(dir->i_sb, __func__, 72 ext2_error(dir->i_sb, __func__,
73 "deleted inode referenced: %lu", 73 "deleted inode referenced: %lu",
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index d89e0b6a2d7..7731695e65d 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -43,9 +43,10 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data);
43static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf); 43static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf);
44static int ext2_sync_fs(struct super_block *sb, int wait); 44static int ext2_sync_fs(struct super_block *sb, int wait);
45 45
46void ext2_error (struct super_block * sb, const char * function, 46void ext2_error(struct super_block *sb, const char *function,
47 const char * fmt, ...) 47 const char *fmt, ...)
48{ 48{
49 struct va_format vaf;
49 va_list args; 50 va_list args;
50 struct ext2_sb_info *sbi = EXT2_SB(sb); 51 struct ext2_sb_info *sbi = EXT2_SB(sb);
51 struct ext2_super_block *es = sbi->s_es; 52 struct ext2_super_block *es = sbi->s_es;
@@ -59,9 +60,13 @@ void ext2_error (struct super_block * sb, const char * function,
59 } 60 }
60 61
61 va_start(args, fmt); 62 va_start(args, fmt);
62 printk(KERN_CRIT "EXT2-fs (%s): error: %s: ", sb->s_id, function); 63
63 vprintk(fmt, args); 64 vaf.fmt = fmt;
64 printk("\n"); 65 vaf.va = &args;
66
67 printk(KERN_CRIT "EXT2-fs (%s): error: %s: %pV\n",
68 sb->s_id, function, &vaf);
69
65 va_end(args); 70 va_end(args);
66 71
67 if (test_opt(sb, ERRORS_PANIC)) 72 if (test_opt(sb, ERRORS_PANIC))
@@ -76,12 +81,16 @@ void ext2_error (struct super_block * sb, const char * function,
76void ext2_msg(struct super_block *sb, const char *prefix, 81void ext2_msg(struct super_block *sb, const char *prefix,
77 const char *fmt, ...) 82 const char *fmt, ...)
78{ 83{
84 struct va_format vaf;
79 va_list args; 85 va_list args;
80 86
81 va_start(args, fmt); 87 va_start(args, fmt);
82 printk("%sEXT2-fs (%s): ", prefix, sb->s_id); 88
83 vprintk(fmt, args); 89 vaf.fmt = fmt;
84 printk("\n"); 90 vaf.va = &args;
91
92 printk("%sEXT2-fs (%s): %pV\n", prefix, sb->s_id, &vaf);
93
85 va_end(args); 94 va_end(args);
86} 95}
87 96
@@ -161,11 +170,18 @@ static struct inode *ext2_alloc_inode(struct super_block *sb)
161 return &ei->vfs_inode; 170 return &ei->vfs_inode;
162} 171}
163 172
164static void ext2_destroy_inode(struct inode *inode) 173static void ext2_i_callback(struct rcu_head *head)
165{ 174{
175 struct inode *inode = container_of(head, struct inode, i_rcu);
176 INIT_LIST_HEAD(&inode->i_dentry);
166 kmem_cache_free(ext2_inode_cachep, EXT2_I(inode)); 177 kmem_cache_free(ext2_inode_cachep, EXT2_I(inode));
167} 178}
168 179
180static void ext2_destroy_inode(struct inode *inode)
181{
182 call_rcu(&inode->i_rcu, ext2_i_callback);
183}
184
169static void init_once(void *foo) 185static void init_once(void *foo)
170{ 186{
171 struct ext2_inode_info *ei = (struct ext2_inode_info *) foo; 187 struct ext2_inode_info *ei = (struct ext2_inode_info *) foo;
diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c
index f84700be327..c2e4dce984d 100644
--- a/fs/ext2/xattr.c
+++ b/fs/ext2/xattr.c
@@ -199,14 +199,6 @@ bad_block: ext2_error(inode->i_sb, "ext2_xattr_get",
199 goto found; 199 goto found;
200 entry = next; 200 entry = next;
201 } 201 }
202 /* Check the remaining name entries */
203 while (!IS_LAST_ENTRY(entry)) {
204 struct ext2_xattr_entry *next =
205 EXT2_XATTR_NEXT(entry);
206 if ((char *)next >= end)
207 goto bad_block;
208 entry = next;
209 }
210 if (ext2_xattr_cache_insert(bh)) 202 if (ext2_xattr_cache_insert(bh))
211 ea_idebug(inode, "cache insert failed"); 203 ea_idebug(inode, "cache insert failed");
212 error = -ENODATA; 204 error = -ENODATA;
@@ -355,7 +347,7 @@ static void ext2_xattr_update_super_block(struct super_block *sb)
355/* 347/*
356 * ext2_xattr_set() 348 * ext2_xattr_set()
357 * 349 *
358 * Create, replace or remove an extended attribute for this inode. Buffer 350 * Create, replace or remove an extended attribute for this inode. Value
359 * is NULL to remove an existing extended attribute, and non-NULL to 351 * is NULL to remove an existing extended attribute, and non-NULL to
360 * either replace an existing extended attribute, or create a new extended 352 * either replace an existing extended attribute, or create a new extended
361 * attribute. The flags XATTR_REPLACE and XATTR_CREATE 353 * attribute. The flags XATTR_REPLACE and XATTR_CREATE
diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c
index 8a11fe21218..e4fa49e6c53 100644
--- a/fs/ext3/acl.c
+++ b/fs/ext3/acl.c
@@ -240,10 +240,17 @@ ext3_set_acl(handle_t *handle, struct inode *inode, int type,
240} 240}
241 241
242int 242int
243ext3_check_acl(struct inode *inode, int mask) 243ext3_check_acl(struct inode *inode, int mask, unsigned int flags)
244{ 244{
245 struct posix_acl *acl = ext3_get_acl(inode, ACL_TYPE_ACCESS); 245 struct posix_acl *acl;
246
247 if (flags & IPERM_FLAG_RCU) {
248 if (!negative_cached_acl(inode, ACL_TYPE_ACCESS))
249 return -ECHILD;
250 return -EAGAIN;
251 }
246 252
253 acl = ext3_get_acl(inode, ACL_TYPE_ACCESS);
247 if (IS_ERR(acl)) 254 if (IS_ERR(acl))
248 return PTR_ERR(acl); 255 return PTR_ERR(acl);
249 if (acl) { 256 if (acl) {
diff --git a/fs/ext3/acl.h b/fs/ext3/acl.h
index 597334626de..5faf8048e90 100644
--- a/fs/ext3/acl.h
+++ b/fs/ext3/acl.h
@@ -54,7 +54,7 @@ static inline int ext3_acl_count(size_t size)
54#ifdef CONFIG_EXT3_FS_POSIX_ACL 54#ifdef CONFIG_EXT3_FS_POSIX_ACL
55 55
56/* acl.c */ 56/* acl.c */
57extern int ext3_check_acl (struct inode *, int); 57extern int ext3_check_acl (struct inode *, int, unsigned int);
58extern int ext3_acl_chmod (struct inode *); 58extern int ext3_acl_chmod (struct inode *);
59extern int ext3_init_acl (handle_t *, struct inode *, struct inode *); 59extern int ext3_init_acl (handle_t *, struct inode *, struct inode *);
60 60
diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c
index b3db2264942..045995c8ce5 100644
--- a/fs/ext3/balloc.c
+++ b/fs/ext3/balloc.c
@@ -20,6 +20,7 @@
20#include <linux/ext3_jbd.h> 20#include <linux/ext3_jbd.h>
21#include <linux/quotaops.h> 21#include <linux/quotaops.h>
22#include <linux/buffer_head.h> 22#include <linux/buffer_head.h>
23#include <linux/blkdev.h>
23 24
24/* 25/*
25 * balloc.c contains the blocks allocation and deallocation routines 26 * balloc.c contains the blocks allocation and deallocation routines
@@ -39,6 +40,21 @@
39 40
40#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) 41#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)
41 42
43/*
44 * Calculate the block group number and offset, given a block number
45 */
46static void ext3_get_group_no_and_offset(struct super_block *sb,
47 ext3_fsblk_t blocknr, unsigned long *blockgrpp, ext3_grpblk_t *offsetp)
48{
49 struct ext3_super_block *es = EXT3_SB(sb)->s_es;
50
51 blocknr = blocknr - le32_to_cpu(es->s_first_data_block);
52 if (offsetp)
53 *offsetp = blocknr % EXT3_BLOCKS_PER_GROUP(sb);
54 if (blockgrpp)
55 *blockgrpp = blocknr / EXT3_BLOCKS_PER_GROUP(sb);
56}
57
42/** 58/**
43 * ext3_get_group_desc() -- load group descriptor from disk 59 * ext3_get_group_desc() -- load group descriptor from disk
44 * @sb: super block 60 * @sb: super block
@@ -1885,3 +1901,253 @@ unsigned long ext3_bg_num_gdb(struct super_block *sb, int group)
1885 return ext3_bg_num_gdb_meta(sb,group); 1901 return ext3_bg_num_gdb_meta(sb,group);
1886 1902
1887} 1903}
1904
1905/**
1906 * ext3_trim_all_free -- function to trim all free space in alloc. group
1907 * @sb: super block for file system
1908 * @group: allocation group to trim
1909 * @start: first group block to examine
1910 * @max: last group block to examine
1911 * @gdp: allocation group description structure
1912 * @minblocks: minimum extent block count
1913 *
1914 * ext3_trim_all_free walks through the group's block bitmap searching for free
1915 * blocks. When a free block is found, it tries to claim this block and the
1916 * consecutive free blocks after it to get the biggest free extent possible, until
1917 * it reaches a used block. It then issues a TRIM command on this extent and frees
1918 * the extent in the block bitmap. This is done until the whole group is scanned.
1919 */
1920ext3_grpblk_t ext3_trim_all_free(struct super_block *sb, unsigned int group,
1921 ext3_grpblk_t start, ext3_grpblk_t max,
1922 ext3_grpblk_t minblocks)
1923{
1924 handle_t *handle;
1925 ext3_grpblk_t next, free_blocks, bit, freed, count = 0;
1926 ext3_fsblk_t discard_block;
1927 struct ext3_sb_info *sbi;
1928 struct buffer_head *gdp_bh, *bitmap_bh = NULL;
1929 struct ext3_group_desc *gdp;
1930 int err = 0, ret = 0;
1931
1932 /*
1933 * We will update one block bitmap, and one group descriptor
1934 */
1935 handle = ext3_journal_start_sb(sb, 2);
1936 if (IS_ERR(handle))
1937 return PTR_ERR(handle);
1938
1939 bitmap_bh = read_block_bitmap(sb, group);
1940 if (!bitmap_bh) {
1941 err = -EIO;
1942 goto err_out;
1943 }
1944
1945 BUFFER_TRACE(bitmap_bh, "getting undo access");
1946 err = ext3_journal_get_undo_access(handle, bitmap_bh);
1947 if (err)
1948 goto err_out;
1949
1950 gdp = ext3_get_group_desc(sb, group, &gdp_bh);
1951 if (!gdp) {
1952 err = -EIO;
1953 goto err_out;
1954 }
1955
1956 BUFFER_TRACE(gdp_bh, "get_write_access");
1957 err = ext3_journal_get_write_access(handle, gdp_bh);
1958 if (err)
1959 goto err_out;
1960
1961 free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
1962 sbi = EXT3_SB(sb);
1963
1964 /* Walk through the whole group */
1965 while (start < max) {
1966 start = bitmap_search_next_usable_block(start, bitmap_bh, max);
1967 if (start < 0)
1968 break;
1969 next = start;
1970
1971 /*
1972 * Allocate contiguous free extents by setting bits in the
1973 * block bitmap
1974 */
1975 while (next < max
1976 && claim_block(sb_bgl_lock(sbi, group),
1977 next, bitmap_bh)) {
1978 next++;
1979 }
1980
1981 /* We did not claim any blocks */
1982 if (next == start)
1983 continue;
1984
1985 discard_block = (ext3_fsblk_t)start +
1986 ext3_group_first_block_no(sb, group);
1987
1988 /* Update counters */
1989 spin_lock(sb_bgl_lock(sbi, group));
1990 le16_add_cpu(&gdp->bg_free_blocks_count, start - next);
1991 spin_unlock(sb_bgl_lock(sbi, group));
1992 percpu_counter_sub(&sbi->s_freeblocks_counter, next - start);
1993
1994 /* Do not issue a TRIM on extents smaller than minblocks */
1995 if ((next - start) < minblocks)
1996 goto free_extent;
1997
1998 /* Send the TRIM command down to the device */
1999 err = sb_issue_discard(sb, discard_block, next - start,
2000 GFP_NOFS, 0);
2001 count += (next - start);
2002free_extent:
2003 freed = 0;
2004
2005 /*
2006 * Clear bits in the bitmap
2007 */
2008 for (bit = start; bit < next; bit++) {
2009 BUFFER_TRACE(bitmap_bh, "clear bit");
2010 if (!ext3_clear_bit_atomic(sb_bgl_lock(sbi, group),
2011 bit, bitmap_bh->b_data)) {
2012 ext3_error(sb, __func__,
2013 "bit already cleared for block "E3FSBLK,
2014 (unsigned long)bit);
2015 BUFFER_TRACE(bitmap_bh, "bit already cleared");
2016 } else {
2017 freed++;
2018 }
2019 }
2020
2021 /* Update counters */
2022 spin_lock(sb_bgl_lock(sbi, group));
2023 le16_add_cpu(&gdp->bg_free_blocks_count, freed);
2024 spin_unlock(sb_bgl_lock(sbi, group));
2025 percpu_counter_add(&sbi->s_freeblocks_counter, freed);
2026
2027 start = next;
2028 if (err < 0) {
2029 if (err != -EOPNOTSUPP)
2030 ext3_warning(sb, __func__, "Discard command "
2031 "returned error %d\n", err);
2032 break;
2033 }
2034
2035 if (fatal_signal_pending(current)) {
2036 err = -ERESTARTSYS;
2037 break;
2038 }
2039
2040 cond_resched();
2041
2042 /* No more suitable extents */
2043 if ((free_blocks - count) < minblocks)
2044 break;
2045 }
2046
2047 /* We dirtied the bitmap block */
2048 BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
2049 ret = ext3_journal_dirty_metadata(handle, bitmap_bh);
2050 if (!err)
2051 err = ret;
2052
2053 /* And the group descriptor block */
2054 BUFFER_TRACE(gdp_bh, "dirtied group descriptor block");
2055 ret = ext3_journal_dirty_metadata(handle, gdp_bh);
2056 if (!err)
2057 err = ret;
2058
2059 ext3_debug("trimmed %d blocks in the group %d\n",
2060 count, group);
2061
2062err_out:
2063 if (err)
2064 count = err;
2065 ext3_journal_stop(handle);
2066 brelse(bitmap_bh);
2067
2068 return count;
2069}
2070
2071/**
2072 * ext3_trim_fs() -- trim ioctl handle function
2073 * @sb: superblock for filesystem
2074 * @range: fstrim_range structure; range->start is the first Byte to trim,
2075 * range->len is the number of Bytes to trim from start,
2076 * range->minlen is the minimum extent length in Bytes
2077 *
2078 * ext3_trim_fs goes through all allocation groups containing Bytes from
2079 * start to start+len. For each such a group ext3_trim_all_free function
2080 * is invoked to trim all free space.
2081 */
2082int ext3_trim_fs(struct super_block *sb, struct fstrim_range *range)
2083{
2084 ext3_grpblk_t last_block, first_block, free_blocks;
2085 unsigned long first_group, last_group;
2086 unsigned long group, ngroups;
2087 struct ext3_group_desc *gdp;
2088 struct ext3_super_block *es = EXT3_SB(sb)->s_es;
2089 uint64_t start, len, minlen, trimmed;
2090 ext3_fsblk_t max_blks = le32_to_cpu(es->s_blocks_count);
2091 int ret = 0;
2092
2093 start = range->start >> sb->s_blocksize_bits;
2094 len = range->len >> sb->s_blocksize_bits;
2095 minlen = range->minlen >> sb->s_blocksize_bits;
2096 trimmed = 0;
2097
2098 if (unlikely(minlen > EXT3_BLOCKS_PER_GROUP(sb)))
2099 return -EINVAL;
2100 if (start >= max_blks)
2101 goto out;
2102 if (start < le32_to_cpu(es->s_first_data_block)) {
2103 len -= le32_to_cpu(es->s_first_data_block) - start;
2104 start = le32_to_cpu(es->s_first_data_block);
2105 }
2106 if (start + len > max_blks)
2107 len = max_blks - start;
2108
2109 ngroups = EXT3_SB(sb)->s_groups_count;
2110 smp_rmb();
2111
2112 /* Determine first and last group to examine based on start and len */
2113 ext3_get_group_no_and_offset(sb, (ext3_fsblk_t) start,
2114 &first_group, &first_block);
2115 ext3_get_group_no_and_offset(sb, (ext3_fsblk_t) (start + len),
2116 &last_group, &last_block);
2117 last_group = (last_group > ngroups - 1) ? ngroups - 1 : last_group;
2118 last_block = EXT3_BLOCKS_PER_GROUP(sb);
2119
2120 if (first_group > last_group)
2121 return -EINVAL;
2122
2123 for (group = first_group; group <= last_group; group++) {
2124 gdp = ext3_get_group_desc(sb, group, NULL);
2125 if (!gdp)
2126 break;
2127
2128 free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
2129 if (free_blocks < minlen)
2130 continue;
2131
2132 if (len >= EXT3_BLOCKS_PER_GROUP(sb))
2133 len -= (EXT3_BLOCKS_PER_GROUP(sb) - first_block);
2134 else
2135 last_block = first_block + len;
2136
2137 ret = ext3_trim_all_free(sb, group, first_block,
2138 last_block, minlen);
2139 if (ret < 0)
2140 break;
2141
2142 trimmed += ret;
2143 first_block = 0;
2144 }
2145
2146 if (ret >= 0)
2147 ret = 0;
2148
2149out:
2150 range->len = trimmed * sb->s_blocksize;
2151
2152 return ret;
2153}
diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c
index e2e72c367cf..34f0a072b93 100644
--- a/fs/ext3/dir.c
+++ b/fs/ext3/dir.c
@@ -69,25 +69,26 @@ int ext3_check_dir_entry (const char * function, struct inode * dir,
69 const char * error_msg = NULL; 69 const char * error_msg = NULL;
70 const int rlen = ext3_rec_len_from_disk(de->rec_len); 70 const int rlen = ext3_rec_len_from_disk(de->rec_len);
71 71
72 if (rlen < EXT3_DIR_REC_LEN(1)) 72 if (unlikely(rlen < EXT3_DIR_REC_LEN(1)))
73 error_msg = "rec_len is smaller than minimal"; 73 error_msg = "rec_len is smaller than minimal";
74 else if (rlen % 4 != 0) 74 else if (unlikely(rlen % 4 != 0))
75 error_msg = "rec_len % 4 != 0"; 75 error_msg = "rec_len % 4 != 0";
76 else if (rlen < EXT3_DIR_REC_LEN(de->name_len)) 76 else if (unlikely(rlen < EXT3_DIR_REC_LEN(de->name_len)))
77 error_msg = "rec_len is too small for name_len"; 77 error_msg = "rec_len is too small for name_len";
78 else if (((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize) 78 else if (unlikely((((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize)))
79 error_msg = "directory entry across blocks"; 79 error_msg = "directory entry across blocks";
80 else if (le32_to_cpu(de->inode) > 80 else if (unlikely(le32_to_cpu(de->inode) >
81 le32_to_cpu(EXT3_SB(dir->i_sb)->s_es->s_inodes_count)) 81 le32_to_cpu(EXT3_SB(dir->i_sb)->s_es->s_inodes_count)))
82 error_msg = "inode out of bounds"; 82 error_msg = "inode out of bounds";
83 83
84 if (error_msg != NULL) 84 if (unlikely(error_msg != NULL))
85 ext3_error (dir->i_sb, function, 85 ext3_error (dir->i_sb, function,
86 "bad entry in directory #%lu: %s - " 86 "bad entry in directory #%lu: %s - "
87 "offset=%lu, inode=%lu, rec_len=%d, name_len=%d", 87 "offset=%lu, inode=%lu, rec_len=%d, name_len=%d",
88 dir->i_ino, error_msg, offset, 88 dir->i_ino, error_msg, offset,
89 (unsigned long) le32_to_cpu(de->inode), 89 (unsigned long) le32_to_cpu(de->inode),
90 rlen, de->name_len); 90 rlen, de->name_len);
91
91 return error_msg == NULL ? 1 : 0; 92 return error_msg == NULL ? 1 : 0;
92} 93}
93 94
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index a9580617edd..ae94f6d949f 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -2145,13 +2145,15 @@ static void ext3_clear_blocks(handle_t *handle, struct inode *inode,
2145 if (try_to_extend_transaction(handle, inode)) { 2145 if (try_to_extend_transaction(handle, inode)) {
2146 if (bh) { 2146 if (bh) {
2147 BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); 2147 BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
2148 ext3_journal_dirty_metadata(handle, bh); 2148 if (ext3_journal_dirty_metadata(handle, bh))
2149 return;
2149 } 2150 }
2150 ext3_mark_inode_dirty(handle, inode); 2151 ext3_mark_inode_dirty(handle, inode);
2151 truncate_restart_transaction(handle, inode); 2152 truncate_restart_transaction(handle, inode);
2152 if (bh) { 2153 if (bh) {
2153 BUFFER_TRACE(bh, "retaking write access"); 2154 BUFFER_TRACE(bh, "retaking write access");
2154 ext3_journal_get_write_access(handle, bh); 2155 if (ext3_journal_get_write_access(handle, bh))
2156 return;
2155 } 2157 }
2156 } 2158 }
2157 2159
diff --git a/fs/ext3/ioctl.c b/fs/ext3/ioctl.c
index 88974814783..fc080dd561f 100644
--- a/fs/ext3/ioctl.c
+++ b/fs/ext3/ioctl.c
@@ -276,7 +276,29 @@ group_add_out:
276 mnt_drop_write(filp->f_path.mnt); 276 mnt_drop_write(filp->f_path.mnt);
277 return err; 277 return err;
278 } 278 }
279 case FITRIM: {
279 280
281 struct super_block *sb = inode->i_sb;
282 struct fstrim_range range;
283 int ret = 0;
284
285 if (!capable(CAP_SYS_ADMIN))
286 return -EPERM;
287
288 if (copy_from_user(&range, (struct fstrim_range *)arg,
289 sizeof(range)))
290 return -EFAULT;
291
292 ret = ext3_trim_fs(sb, &range);
293 if (ret < 0)
294 return ret;
295
296 if (copy_to_user((struct fstrim_range *)arg, &range,
297 sizeof(range)))
298 return -EFAULT;
299
300 return 0;
301 }
280 302
281 default: 303 default:
282 return -ENOTTY; 304 return -ENOTTY;
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index bce9dce639b..b27ba71810e 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -858,6 +858,7 @@ static struct buffer_head *ext3_find_entry(struct inode *dir,
858 struct buffer_head * bh_use[NAMEI_RA_SIZE]; 858 struct buffer_head * bh_use[NAMEI_RA_SIZE];
859 struct buffer_head * bh, *ret = NULL; 859 struct buffer_head * bh, *ret = NULL;
860 unsigned long start, block, b; 860 unsigned long start, block, b;
861 const u8 *name = entry->name;
861 int ra_max = 0; /* Number of bh's in the readahead 862 int ra_max = 0; /* Number of bh's in the readahead
862 buffer, bh_use[] */ 863 buffer, bh_use[] */
863 int ra_ptr = 0; /* Current index into readahead 864 int ra_ptr = 0; /* Current index into readahead
@@ -871,6 +872,16 @@ static struct buffer_head *ext3_find_entry(struct inode *dir,
871 namelen = entry->len; 872 namelen = entry->len;
872 if (namelen > EXT3_NAME_LEN) 873 if (namelen > EXT3_NAME_LEN)
873 return NULL; 874 return NULL;
875 if ((namelen <= 2) && (name[0] == '.') &&
876 (name[1] == '.' || name[1] == 0)) {
877 /*
878 * "." or ".." will only be in the first block
879 * NFS may look up ".."; "." should be handled by the VFS
880 */
881 block = start = 0;
882 nblocks = 1;
883 goto restart;
884 }
874 if (is_dx(dir)) { 885 if (is_dx(dir)) {
875 bh = ext3_dx_find_entry(dir, entry, res_dir, &err); 886 bh = ext3_dx_find_entry(dir, entry, res_dir, &err);
876 /* 887 /*
@@ -961,55 +972,35 @@ static struct buffer_head * ext3_dx_find_entry(struct inode *dir,
961 struct qstr *entry, struct ext3_dir_entry_2 **res_dir, 972 struct qstr *entry, struct ext3_dir_entry_2 **res_dir,
962 int *err) 973 int *err)
963{ 974{
964 struct super_block * sb; 975 struct super_block *sb = dir->i_sb;
965 struct dx_hash_info hinfo; 976 struct dx_hash_info hinfo;
966 u32 hash;
967 struct dx_frame frames[2], *frame; 977 struct dx_frame frames[2], *frame;
968 struct ext3_dir_entry_2 *de, *top;
969 struct buffer_head *bh; 978 struct buffer_head *bh;
970 unsigned long block; 979 unsigned long block;
971 int retval; 980 int retval;
972 int namelen = entry->len;
973 const u8 *name = entry->name;
974 981
975 sb = dir->i_sb; 982 if (!(frame = dx_probe(entry, dir, &hinfo, frames, err)))
976 /* NFS may look up ".." - look at dx_root directory block */ 983 return NULL;
977 if (namelen > 2 || name[0] != '.'|| (namelen == 2 && name[1] != '.')) {
978 if (!(frame = dx_probe(entry, dir, &hinfo, frames, err)))
979 return NULL;
980 } else {
981 frame = frames;
982 frame->bh = NULL; /* for dx_release() */
983 frame->at = (struct dx_entry *)frames; /* hack for zero entry*/
984 dx_set_block(frame->at, 0); /* dx_root block is 0 */
985 }
986 hash = hinfo.hash;
987 do { 984 do {
988 block = dx_get_block(frame->at); 985 block = dx_get_block(frame->at);
989 if (!(bh = ext3_bread (NULL,dir, block, 0, err))) 986 if (!(bh = ext3_bread (NULL,dir, block, 0, err)))
990 goto errout; 987 goto errout;
991 de = (struct ext3_dir_entry_2 *) bh->b_data;
992 top = (struct ext3_dir_entry_2 *) ((char *) de + sb->s_blocksize -
993 EXT3_DIR_REC_LEN(0));
994 for (; de < top; de = ext3_next_entry(de)) {
995 int off = (block << EXT3_BLOCK_SIZE_BITS(sb))
996 + ((char *) de - bh->b_data);
997
998 if (!ext3_check_dir_entry(__func__, dir, de, bh, off)) {
999 brelse(bh);
1000 *err = ERR_BAD_DX_DIR;
1001 goto errout;
1002 }
1003 988
1004 if (ext3_match(namelen, name, de)) { 989 retval = search_dirblock(bh, dir, entry,
1005 *res_dir = de; 990 block << EXT3_BLOCK_SIZE_BITS(sb),
1006 dx_release(frames); 991 res_dir);
1007 return bh; 992 if (retval == 1) {
1008 } 993 dx_release(frames);
994 return bh;
1009 } 995 }
1010 brelse (bh); 996 brelse(bh);
997 if (retval == -1) {
998 *err = ERR_BAD_DX_DIR;
999 goto errout;
1000 }
1001
1011 /* Check to see if we should continue to search */ 1002 /* Check to see if we should continue to search */
1012 retval = ext3_htree_next_block(dir, hash, frame, 1003 retval = ext3_htree_next_block(dir, hinfo.hash, frame,
1013 frames, NULL); 1004 frames, NULL);
1014 if (retval < 0) { 1005 if (retval < 0) {
1015 ext3_warning(sb, __func__, 1006 ext3_warning(sb, __func__,
@@ -1047,7 +1038,7 @@ static struct dentry *ext3_lookup(struct inode * dir, struct dentry *dentry, str
1047 return ERR_PTR(-EIO); 1038 return ERR_PTR(-EIO);
1048 } 1039 }
1049 inode = ext3_iget(dir->i_sb, ino); 1040 inode = ext3_iget(dir->i_sb, ino);
1050 if (unlikely(IS_ERR(inode))) { 1041 if (IS_ERR(inode)) {
1051 if (PTR_ERR(inode) == -ESTALE) { 1042 if (PTR_ERR(inode) == -ESTALE) {
1052 ext3_error(dir->i_sb, __func__, 1043 ext3_error(dir->i_sb, __func__,
1053 "deleted inode referenced: %lu", 1044 "deleted inode referenced: %lu",
@@ -1607,7 +1598,9 @@ static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry,
1607 if (err) 1598 if (err)
1608 goto journal_error; 1599 goto journal_error;
1609 } 1600 }
1610 ext3_journal_dirty_metadata(handle, frames[0].bh); 1601 err = ext3_journal_dirty_metadata(handle, frames[0].bh);
1602 if (err)
1603 goto journal_error;
1611 } 1604 }
1612 de = do_split(handle, dir, &bh, frame, &hinfo, &err); 1605 de = do_split(handle, dir, &bh, frame, &hinfo, &err);
1613 if (!de) 1606 if (!de)
@@ -1644,8 +1637,13 @@ static int ext3_delete_entry (handle_t *handle,
1644 if (!ext3_check_dir_entry("ext3_delete_entry", dir, de, bh, i)) 1637 if (!ext3_check_dir_entry("ext3_delete_entry", dir, de, bh, i))
1645 return -EIO; 1638 return -EIO;
1646 if (de == de_del) { 1639 if (de == de_del) {
1640 int err;
1641
1647 BUFFER_TRACE(bh, "get_write_access"); 1642 BUFFER_TRACE(bh, "get_write_access");
1648 ext3_journal_get_write_access(handle, bh); 1643 err = ext3_journal_get_write_access(handle, bh);
1644 if (err)
1645 goto journal_error;
1646
1649 if (pde) 1647 if (pde)
1650 pde->rec_len = ext3_rec_len_to_disk( 1648 pde->rec_len = ext3_rec_len_to_disk(
1651 ext3_rec_len_from_disk(pde->rec_len) + 1649 ext3_rec_len_from_disk(pde->rec_len) +
@@ -1654,7 +1652,12 @@ static int ext3_delete_entry (handle_t *handle,
1654 de->inode = 0; 1652 de->inode = 0;
1655 dir->i_version++; 1653 dir->i_version++;
1656 BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); 1654 BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
1657 ext3_journal_dirty_metadata(handle, bh); 1655 err = ext3_journal_dirty_metadata(handle, bh);
1656 if (err) {
1657journal_error:
1658 ext3_std_error(dir->i_sb, err);
1659 return err;
1660 }
1658 return 0; 1661 return 0;
1659 } 1662 }
1660 i += ext3_rec_len_from_disk(de->rec_len); 1663 i += ext3_rec_len_from_disk(de->rec_len);
@@ -1762,7 +1765,7 @@ static int ext3_mkdir(struct inode * dir, struct dentry * dentry, int mode)
1762{ 1765{
1763 handle_t *handle; 1766 handle_t *handle;
1764 struct inode * inode; 1767 struct inode * inode;
1765 struct buffer_head * dir_block; 1768 struct buffer_head * dir_block = NULL;
1766 struct ext3_dir_entry_2 * de; 1769 struct ext3_dir_entry_2 * de;
1767 int err, retries = 0; 1770 int err, retries = 0;
1768 1771
@@ -1790,15 +1793,14 @@ retry:
1790 inode->i_fop = &ext3_dir_operations; 1793 inode->i_fop = &ext3_dir_operations;
1791 inode->i_size = EXT3_I(inode)->i_disksize = inode->i_sb->s_blocksize; 1794 inode->i_size = EXT3_I(inode)->i_disksize = inode->i_sb->s_blocksize;
1792 dir_block = ext3_bread (handle, inode, 0, 1, &err); 1795 dir_block = ext3_bread (handle, inode, 0, 1, &err);
1793 if (!dir_block) { 1796 if (!dir_block)
1794 drop_nlink(inode); /* is this nlink == 0? */ 1797 goto out_clear_inode;
1795 unlock_new_inode(inode); 1798
1796 ext3_mark_inode_dirty(handle, inode);
1797 iput (inode);
1798 goto out_stop;
1799 }
1800 BUFFER_TRACE(dir_block, "get_write_access"); 1799 BUFFER_TRACE(dir_block, "get_write_access");
1801 ext3_journal_get_write_access(handle, dir_block); 1800 err = ext3_journal_get_write_access(handle, dir_block);
1801 if (err)
1802 goto out_clear_inode;
1803
1802 de = (struct ext3_dir_entry_2 *) dir_block->b_data; 1804 de = (struct ext3_dir_entry_2 *) dir_block->b_data;
1803 de->inode = cpu_to_le32(inode->i_ino); 1805 de->inode = cpu_to_le32(inode->i_ino);
1804 de->name_len = 1; 1806 de->name_len = 1;
@@ -1814,11 +1816,16 @@ retry:
1814 ext3_set_de_type(dir->i_sb, de, S_IFDIR); 1816 ext3_set_de_type(dir->i_sb, de, S_IFDIR);
1815 inode->i_nlink = 2; 1817 inode->i_nlink = 2;
1816 BUFFER_TRACE(dir_block, "call ext3_journal_dirty_metadata"); 1818 BUFFER_TRACE(dir_block, "call ext3_journal_dirty_metadata");
1817 ext3_journal_dirty_metadata(handle, dir_block); 1819 err = ext3_journal_dirty_metadata(handle, dir_block);
1818 brelse (dir_block); 1820 if (err)
1819 ext3_mark_inode_dirty(handle, inode); 1821 goto out_clear_inode;
1820 err = ext3_add_entry (handle, dentry, inode); 1822
1823 err = ext3_mark_inode_dirty(handle, inode);
1824 if (!err)
1825 err = ext3_add_entry (handle, dentry, inode);
1826
1821 if (err) { 1827 if (err) {
1828out_clear_inode:
1822 inode->i_nlink = 0; 1829 inode->i_nlink = 0;
1823 unlock_new_inode(inode); 1830 unlock_new_inode(inode);
1824 ext3_mark_inode_dirty(handle, inode); 1831 ext3_mark_inode_dirty(handle, inode);
@@ -1827,10 +1834,14 @@ retry:
1827 } 1834 }
1828 inc_nlink(dir); 1835 inc_nlink(dir);
1829 ext3_update_dx_flag(dir); 1836 ext3_update_dx_flag(dir);
1830 ext3_mark_inode_dirty(handle, dir); 1837 err = ext3_mark_inode_dirty(handle, dir);
1838 if (err)
1839 goto out_clear_inode;
1840
1831 d_instantiate(dentry, inode); 1841 d_instantiate(dentry, inode);
1832 unlock_new_inode(inode); 1842 unlock_new_inode(inode);
1833out_stop: 1843out_stop:
1844 brelse(dir_block);
1834 ext3_journal_stop(handle); 1845 ext3_journal_stop(handle);
1835 if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries)) 1846 if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries))
1836 goto retry; 1847 goto retry;
@@ -2353,7 +2364,9 @@ static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry,
2353 goto end_rename; 2364 goto end_rename;
2354 } else { 2365 } else {
2355 BUFFER_TRACE(new_bh, "get write access"); 2366 BUFFER_TRACE(new_bh, "get write access");
2356 ext3_journal_get_write_access(handle, new_bh); 2367 retval = ext3_journal_get_write_access(handle, new_bh);
2368 if (retval)
2369 goto journal_error;
2357 new_de->inode = cpu_to_le32(old_inode->i_ino); 2370 new_de->inode = cpu_to_le32(old_inode->i_ino);
2358 if (EXT3_HAS_INCOMPAT_FEATURE(new_dir->i_sb, 2371 if (EXT3_HAS_INCOMPAT_FEATURE(new_dir->i_sb,
2359 EXT3_FEATURE_INCOMPAT_FILETYPE)) 2372 EXT3_FEATURE_INCOMPAT_FILETYPE))
@@ -2362,7 +2375,9 @@ static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry,
2362 new_dir->i_ctime = new_dir->i_mtime = CURRENT_TIME_SEC; 2375 new_dir->i_ctime = new_dir->i_mtime = CURRENT_TIME_SEC;
2363 ext3_mark_inode_dirty(handle, new_dir); 2376 ext3_mark_inode_dirty(handle, new_dir);
2364 BUFFER_TRACE(new_bh, "call ext3_journal_dirty_metadata"); 2377 BUFFER_TRACE(new_bh, "call ext3_journal_dirty_metadata");
2365 ext3_journal_dirty_metadata(handle, new_bh); 2378 retval = ext3_journal_dirty_metadata(handle, new_bh);
2379 if (retval)
2380 goto journal_error;
2366 brelse(new_bh); 2381 brelse(new_bh);
2367 new_bh = NULL; 2382 new_bh = NULL;
2368 } 2383 }
@@ -2411,10 +2426,17 @@ static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry,
2411 ext3_update_dx_flag(old_dir); 2426 ext3_update_dx_flag(old_dir);
2412 if (dir_bh) { 2427 if (dir_bh) {
2413 BUFFER_TRACE(dir_bh, "get_write_access"); 2428 BUFFER_TRACE(dir_bh, "get_write_access");
2414 ext3_journal_get_write_access(handle, dir_bh); 2429 retval = ext3_journal_get_write_access(handle, dir_bh);
2430 if (retval)
2431 goto journal_error;
2415 PARENT_INO(dir_bh->b_data) = cpu_to_le32(new_dir->i_ino); 2432 PARENT_INO(dir_bh->b_data) = cpu_to_le32(new_dir->i_ino);
2416 BUFFER_TRACE(dir_bh, "call ext3_journal_dirty_metadata"); 2433 BUFFER_TRACE(dir_bh, "call ext3_journal_dirty_metadata");
2417 ext3_journal_dirty_metadata(handle, dir_bh); 2434 retval = ext3_journal_dirty_metadata(handle, dir_bh);
2435 if (retval) {
2436journal_error:
2437 ext3_std_error(new_dir->i_sb, retval);
2438 goto end_rename;
2439 }
2418 drop_nlink(old_dir); 2440 drop_nlink(old_dir);
2419 if (new_inode) { 2441 if (new_inode) {
2420 drop_nlink(new_inode); 2442 drop_nlink(new_inode);
diff --git a/fs/ext3/resize.c b/fs/ext3/resize.c
index e746d30b123..108b142e11e 100644
--- a/fs/ext3/resize.c
+++ b/fs/ext3/resize.c
@@ -249,7 +249,11 @@ static int setup_new_group_blocks(struct super_block *sb,
249 memcpy(gdb->b_data, sbi->s_group_desc[i]->b_data, gdb->b_size); 249 memcpy(gdb->b_data, sbi->s_group_desc[i]->b_data, gdb->b_size);
250 set_buffer_uptodate(gdb); 250 set_buffer_uptodate(gdb);
251 unlock_buffer(gdb); 251 unlock_buffer(gdb);
252 ext3_journal_dirty_metadata(handle, gdb); 252 err = ext3_journal_dirty_metadata(handle, gdb);
253 if (err) {
254 brelse(gdb);
255 goto exit_bh;
256 }
253 ext3_set_bit(bit, bh->b_data); 257 ext3_set_bit(bit, bh->b_data);
254 brelse(gdb); 258 brelse(gdb);
255 } 259 }
@@ -269,7 +273,11 @@ static int setup_new_group_blocks(struct super_block *sb,
269 err = PTR_ERR(gdb); 273 err = PTR_ERR(gdb);
270 goto exit_bh; 274 goto exit_bh;
271 } 275 }
272 ext3_journal_dirty_metadata(handle, gdb); 276 err = ext3_journal_dirty_metadata(handle, gdb);
277 if (err) {
278 brelse(gdb);
279 goto exit_bh;
280 }
273 ext3_set_bit(bit, bh->b_data); 281 ext3_set_bit(bit, bh->b_data);
274 brelse(gdb); 282 brelse(gdb);
275 } 283 }
@@ -295,7 +303,11 @@ static int setup_new_group_blocks(struct super_block *sb,
295 err = PTR_ERR(it); 303 err = PTR_ERR(it);
296 goto exit_bh; 304 goto exit_bh;
297 } 305 }
298 ext3_journal_dirty_metadata(handle, it); 306 err = ext3_journal_dirty_metadata(handle, it);
307 if (err) {
308 brelse(it);
309 goto exit_bh;
310 }
299 brelse(it); 311 brelse(it);
300 ext3_set_bit(bit, bh->b_data); 312 ext3_set_bit(bit, bh->b_data);
301 } 313 }
@@ -306,7 +318,9 @@ static int setup_new_group_blocks(struct super_block *sb,
306 318
307 mark_bitmap_end(input->blocks_count, EXT3_BLOCKS_PER_GROUP(sb), 319 mark_bitmap_end(input->blocks_count, EXT3_BLOCKS_PER_GROUP(sb),
308 bh->b_data); 320 bh->b_data);
309 ext3_journal_dirty_metadata(handle, bh); 321 err = ext3_journal_dirty_metadata(handle, bh);
322 if (err)
323 goto exit_bh;
310 brelse(bh); 324 brelse(bh);
311 325
312 /* Mark unused entries in inode bitmap used */ 326 /* Mark unused entries in inode bitmap used */
@@ -319,7 +333,7 @@ static int setup_new_group_blocks(struct super_block *sb,
319 333
320 mark_bitmap_end(EXT3_INODES_PER_GROUP(sb), EXT3_BLOCKS_PER_GROUP(sb), 334 mark_bitmap_end(EXT3_INODES_PER_GROUP(sb), EXT3_BLOCKS_PER_GROUP(sb),
321 bh->b_data); 335 bh->b_data);
322 ext3_journal_dirty_metadata(handle, bh); 336 err = ext3_journal_dirty_metadata(handle, bh);
323exit_bh: 337exit_bh:
324 brelse(bh); 338 brelse(bh);
325 339
@@ -503,12 +517,19 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
503 * reserved inode, and will become GDT blocks (primary and backup). 517 * reserved inode, and will become GDT blocks (primary and backup).
504 */ 518 */
505 data[gdb_num % EXT3_ADDR_PER_BLOCK(sb)] = 0; 519 data[gdb_num % EXT3_ADDR_PER_BLOCK(sb)] = 0;
506 ext3_journal_dirty_metadata(handle, dind); 520 err = ext3_journal_dirty_metadata(handle, dind);
521 if (err)
522 goto exit_group_desc;
507 brelse(dind); 523 brelse(dind);
524 dind = NULL;
508 inode->i_blocks -= (gdbackups + 1) * sb->s_blocksize >> 9; 525 inode->i_blocks -= (gdbackups + 1) * sb->s_blocksize >> 9;
509 ext3_mark_iloc_dirty(handle, inode, &iloc); 526 err = ext3_mark_iloc_dirty(handle, inode, &iloc);
527 if (err)
528 goto exit_group_desc;
510 memset((*primary)->b_data, 0, sb->s_blocksize); 529 memset((*primary)->b_data, 0, sb->s_blocksize);
511 ext3_journal_dirty_metadata(handle, *primary); 530 err = ext3_journal_dirty_metadata(handle, *primary);
531 if (err)
532 goto exit_group_desc;
512 533
513 o_group_desc = EXT3_SB(sb)->s_group_desc; 534 o_group_desc = EXT3_SB(sb)->s_group_desc;
514 memcpy(n_group_desc, o_group_desc, 535 memcpy(n_group_desc, o_group_desc,
@@ -519,10 +540,14 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
519 kfree(o_group_desc); 540 kfree(o_group_desc);
520 541
521 le16_add_cpu(&es->s_reserved_gdt_blocks, -1); 542 le16_add_cpu(&es->s_reserved_gdt_blocks, -1);
522 ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh); 543 err = ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
544 if (err)
545 goto exit_inode;
523 546
524 return 0; 547 return 0;
525 548
549exit_group_desc:
550 kfree(n_group_desc);
526exit_inode: 551exit_inode:
527 //ext3_journal_release_buffer(handle, iloc.bh); 552 //ext3_journal_release_buffer(handle, iloc.bh);
528 brelse(iloc.bh); 553 brelse(iloc.bh);
@@ -706,16 +731,20 @@ static void update_backups(struct super_block *sb,
706 } 731 }
707 ext3_debug("update metadata backup %#04lx\n", 732 ext3_debug("update metadata backup %#04lx\n",
708 (unsigned long)bh->b_blocknr); 733 (unsigned long)bh->b_blocknr);
709 if ((err = ext3_journal_get_write_access(handle, bh))) 734 if ((err = ext3_journal_get_write_access(handle, bh))) {
735 brelse(bh);
710 break; 736 break;
737 }
711 lock_buffer(bh); 738 lock_buffer(bh);
712 memcpy(bh->b_data, data, size); 739 memcpy(bh->b_data, data, size);
713 if (rest) 740 if (rest)
714 memset(bh->b_data + size, 0, rest); 741 memset(bh->b_data + size, 0, rest);
715 set_buffer_uptodate(bh); 742 set_buffer_uptodate(bh);
716 unlock_buffer(bh); 743 unlock_buffer(bh);
717 ext3_journal_dirty_metadata(handle, bh); 744 err = ext3_journal_dirty_metadata(handle, bh);
718 brelse(bh); 745 brelse(bh);
746 if (err)
747 break;
719 } 748 }
720 if ((err2 = ext3_journal_stop(handle)) && !err) 749 if ((err2 = ext3_journal_stop(handle)) && !err)
721 err = err2; 750 err = err2;
@@ -922,7 +951,9 @@ int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input)
922 /* Update the global fs size fields */ 951 /* Update the global fs size fields */
923 sbi->s_groups_count++; 952 sbi->s_groups_count++;
924 953
925 ext3_journal_dirty_metadata(handle, primary); 954 err = ext3_journal_dirty_metadata(handle, primary);
955 if (err)
956 goto exit_journal;
926 957
927 /* Update the reserved block counts only once the new group is 958 /* Update the reserved block counts only once the new group is
928 * active. */ 959 * active. */
@@ -934,7 +965,7 @@ int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input)
934 percpu_counter_add(&sbi->s_freeinodes_counter, 965 percpu_counter_add(&sbi->s_freeinodes_counter,
935 EXT3_INODES_PER_GROUP(sb)); 966 EXT3_INODES_PER_GROUP(sb));
936 967
937 ext3_journal_dirty_metadata(handle, sbi->s_sbh); 968 err = ext3_journal_dirty_metadata(handle, sbi->s_sbh);
938 969
939exit_journal: 970exit_journal:
940 mutex_unlock(&sbi->s_resize_lock); 971 mutex_unlock(&sbi->s_resize_lock);
@@ -1064,8 +1095,14 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es,
1064 goto exit_put; 1095 goto exit_put;
1065 } 1096 }
1066 es->s_blocks_count = cpu_to_le32(o_blocks_count + add); 1097 es->s_blocks_count = cpu_to_le32(o_blocks_count + add);
1067 ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh); 1098 err = ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
1068 mutex_unlock(&EXT3_SB(sb)->s_resize_lock); 1099 mutex_unlock(&EXT3_SB(sb)->s_resize_lock);
1100 if (err) {
1101 ext3_warning(sb, __func__,
1102 "error %d on journal dirty metadata", err);
1103 ext3_journal_stop(handle);
1104 goto exit_put;
1105 }
1069 ext3_debug("freeing blocks "E3FSBLK" through "E3FSBLK"\n", 1106 ext3_debug("freeing blocks "E3FSBLK" through "E3FSBLK"\n",
1070 o_blocks_count, o_blocks_count + add); 1107 o_blocks_count, o_blocks_count + add);
1071 ext3_free_blocks_sb(handle, sb, o_blocks_count, add, &freed_blocks); 1108 ext3_free_blocks_sb(handle, sb, o_blocks_count, add, &freed_blocks);
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 2fedaf8b501..85c8cc8f247 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -27,7 +27,6 @@
27#include <linux/init.h> 27#include <linux/init.h>
28#include <linux/blkdev.h> 28#include <linux/blkdev.h>
29#include <linux/parser.h> 29#include <linux/parser.h>
30#include <linux/smp_lock.h>
31#include <linux/buffer_head.h> 30#include <linux/buffer_head.h>
32#include <linux/exportfs.h> 31#include <linux/exportfs.h>
33#include <linux/vfs.h> 32#include <linux/vfs.h>
@@ -144,12 +143,16 @@ void ext3_journal_abort_handle(const char *caller, const char *err_fn,
144void ext3_msg(struct super_block *sb, const char *prefix, 143void ext3_msg(struct super_block *sb, const char *prefix,
145 const char *fmt, ...) 144 const char *fmt, ...)
146{ 145{
146 struct va_format vaf;
147 va_list args; 147 va_list args;
148 148
149 va_start(args, fmt); 149 va_start(args, fmt);
150 printk("%sEXT3-fs (%s): ", prefix, sb->s_id); 150
151 vprintk(fmt, args); 151 vaf.fmt = fmt;
152 printk("\n"); 152 vaf.va = &args;
153
154 printk("%sEXT3-fs (%s): %pV\n", prefix, sb->s_id, &vaf);
155
153 va_end(args); 156 va_end(args);
154} 157}
155 158
@@ -196,15 +199,20 @@ static void ext3_handle_error(struct super_block *sb)
196 sb->s_id); 199 sb->s_id);
197} 200}
198 201
199void ext3_error (struct super_block * sb, const char * function, 202void ext3_error(struct super_block *sb, const char *function,
200 const char * fmt, ...) 203 const char *fmt, ...)
201{ 204{
205 struct va_format vaf;
202 va_list args; 206 va_list args;
203 207
204 va_start(args, fmt); 208 va_start(args, fmt);
205 printk(KERN_CRIT "EXT3-fs error (device %s): %s: ",sb->s_id, function); 209
206 vprintk(fmt, args); 210 vaf.fmt = fmt;
207 printk("\n"); 211 vaf.va = &args;
212
213 printk(KERN_CRIT "EXT3-fs error (device %s): %s: %pV\n",
214 sb->s_id, function, &vaf);
215
208 va_end(args); 216 va_end(args);
209 217
210 ext3_handle_error(sb); 218 ext3_handle_error(sb);
@@ -275,15 +283,20 @@ void __ext3_std_error (struct super_block * sb, const char * function,
275 * case we take the easy way out and panic immediately. 283 * case we take the easy way out and panic immediately.
276 */ 284 */
277 285
278void ext3_abort (struct super_block * sb, const char * function, 286void ext3_abort(struct super_block *sb, const char *function,
279 const char * fmt, ...) 287 const char *fmt, ...)
280{ 288{
289 struct va_format vaf;
281 va_list args; 290 va_list args;
282 291
283 va_start(args, fmt); 292 va_start(args, fmt);
284 printk(KERN_CRIT "EXT3-fs (%s): error: %s: ", sb->s_id, function); 293
285 vprintk(fmt, args); 294 vaf.fmt = fmt;
286 printk("\n"); 295 vaf.va = &args;
296
297 printk(KERN_CRIT "EXT3-fs (%s): error: %s: %pV\n",
298 sb->s_id, function, &vaf);
299
287 va_end(args); 300 va_end(args);
288 301
289 if (test_opt(sb, ERRORS_PANIC)) 302 if (test_opt(sb, ERRORS_PANIC))
@@ -301,16 +314,20 @@ void ext3_abort (struct super_block * sb, const char * function,
301 journal_abort(EXT3_SB(sb)->s_journal, -EIO); 314 journal_abort(EXT3_SB(sb)->s_journal, -EIO);
302} 315}
303 316
304void ext3_warning (struct super_block * sb, const char * function, 317void ext3_warning(struct super_block *sb, const char *function,
305 const char * fmt, ...) 318 const char *fmt, ...)
306{ 319{
320 struct va_format vaf;
307 va_list args; 321 va_list args;
308 322
309 va_start(args, fmt); 323 va_start(args, fmt);
310 printk(KERN_WARNING "EXT3-fs (%s): warning: %s: ", 324
311 sb->s_id, function); 325 vaf.fmt = fmt;
312 vprintk(fmt, args); 326 vaf.va = &args;
313 printk("\n"); 327
328 printk(KERN_WARNING "EXT3-fs (%s): warning: %s: %pV\n",
329 sb->s_id, function, &vaf);
330
314 va_end(args); 331 va_end(args);
315} 332}
316 333
@@ -347,7 +364,7 @@ static struct block_device *ext3_blkdev_get(dev_t dev, struct super_block *sb)
347 struct block_device *bdev; 364 struct block_device *bdev;
348 char b[BDEVNAME_SIZE]; 365 char b[BDEVNAME_SIZE];
349 366
350 bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE); 367 bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, sb);
351 if (IS_ERR(bdev)) 368 if (IS_ERR(bdev))
352 goto fail; 369 goto fail;
353 return bdev; 370 return bdev;
@@ -364,8 +381,7 @@ fail:
364 */ 381 */
365static int ext3_blkdev_put(struct block_device *bdev) 382static int ext3_blkdev_put(struct block_device *bdev)
366{ 383{
367 bd_release(bdev); 384 return blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
368 return blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
369} 385}
370 386
371static int ext3_blkdev_remove(struct ext3_sb_info *sbi) 387static int ext3_blkdev_remove(struct ext3_sb_info *sbi)
@@ -480,6 +496,13 @@ static struct inode *ext3_alloc_inode(struct super_block *sb)
480 return &ei->vfs_inode; 496 return &ei->vfs_inode;
481} 497}
482 498
499static void ext3_i_callback(struct rcu_head *head)
500{
501 struct inode *inode = container_of(head, struct inode, i_rcu);
502 INIT_LIST_HEAD(&inode->i_dentry);
503 kmem_cache_free(ext3_inode_cachep, EXT3_I(inode));
504}
505
483static void ext3_destroy_inode(struct inode *inode) 506static void ext3_destroy_inode(struct inode *inode)
484{ 507{
485 if (!list_empty(&(EXT3_I(inode)->i_orphan))) { 508 if (!list_empty(&(EXT3_I(inode)->i_orphan))) {
@@ -490,7 +513,7 @@ static void ext3_destroy_inode(struct inode *inode)
490 false); 513 false);
491 dump_stack(); 514 dump_stack();
492 } 515 }
493 kmem_cache_free(ext3_inode_cachep, EXT3_I(inode)); 516 call_rcu(&inode->i_rcu, ext3_i_callback);
494} 517}
495 518
496static void init_once(void *foo) 519static void init_once(void *foo)
@@ -731,7 +754,7 @@ static int ext3_release_dquot(struct dquot *dquot);
731static int ext3_mark_dquot_dirty(struct dquot *dquot); 754static int ext3_mark_dquot_dirty(struct dquot *dquot);
732static int ext3_write_info(struct super_block *sb, int type); 755static int ext3_write_info(struct super_block *sb, int type);
733static int ext3_quota_on(struct super_block *sb, int type, int format_id, 756static int ext3_quota_on(struct super_block *sb, int type, int format_id,
734 char *path); 757 struct path *path);
735static int ext3_quota_on_mount(struct super_block *sb, int type); 758static int ext3_quota_on_mount(struct super_block *sb, int type);
736static ssize_t ext3_quota_read(struct super_block *sb, int type, char *data, 759static ssize_t ext3_quota_read(struct super_block *sb, int type, char *data,
737 size_t len, loff_t off); 760 size_t len, loff_t off);
@@ -1842,13 +1865,15 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
1842 goto failed_mount; 1865 goto failed_mount;
1843 } 1866 }
1844 1867
1845 if (generic_check_addressable(sb->s_blocksize_bits, 1868 err = generic_check_addressable(sb->s_blocksize_bits,
1846 le32_to_cpu(es->s_blocks_count))) { 1869 le32_to_cpu(es->s_blocks_count));
1870 if (err) {
1847 ext3_msg(sb, KERN_ERR, 1871 ext3_msg(sb, KERN_ERR,
1848 "error: filesystem is too large to mount safely"); 1872 "error: filesystem is too large to mount safely");
1849 if (sizeof(sector_t) < 8) 1873 if (sizeof(sector_t) < 8)
1850 ext3_msg(sb, KERN_ERR, 1874 ext3_msg(sb, KERN_ERR,
1851 "error: CONFIG_LBDAF not enabled"); 1875 "error: CONFIG_LBDAF not enabled");
1876 ret = err;
1852 goto failed_mount; 1877 goto failed_mount;
1853 } 1878 }
1854 1879
@@ -2136,13 +2161,6 @@ static journal_t *ext3_get_dev_journal(struct super_block *sb,
2136 if (bdev == NULL) 2161 if (bdev == NULL)
2137 return NULL; 2162 return NULL;
2138 2163
2139 if (bd_claim(bdev, sb)) {
2140 ext3_msg(sb, KERN_ERR,
2141 "error: failed to claim external journal device");
2142 blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
2143 return NULL;
2144 }
2145
2146 blocksize = sb->s_blocksize; 2164 blocksize = sb->s_blocksize;
2147 hblock = bdev_logical_block_size(bdev); 2165 hblock = bdev_logical_block_size(bdev);
2148 if (blocksize < hblock) { 2166 if (blocksize < hblock) {
@@ -2291,7 +2309,7 @@ static int ext3_load_journal(struct super_block *sb,
2291 EXT3_SB(sb)->s_journal = journal; 2309 EXT3_SB(sb)->s_journal = journal;
2292 ext3_clear_journal_err(sb, es); 2310 ext3_clear_journal_err(sb, es);
2293 2311
2294 if (journal_devnum && 2312 if (!really_read_only && journal_devnum &&
2295 journal_devnum != le32_to_cpu(es->s_journal_dev)) { 2313 journal_devnum != le32_to_cpu(es->s_journal_dev)) {
2296 es->s_journal_dev = cpu_to_le32(journal_devnum); 2314 es->s_journal_dev = cpu_to_le32(journal_devnum);
2297 2315
@@ -2859,27 +2877,20 @@ static int ext3_quota_on_mount(struct super_block *sb, int type)
2859 * Standard function to be called on quota_on 2877 * Standard function to be called on quota_on
2860 */ 2878 */
2861static int ext3_quota_on(struct super_block *sb, int type, int format_id, 2879static int ext3_quota_on(struct super_block *sb, int type, int format_id,
2862 char *name) 2880 struct path *path)
2863{ 2881{
2864 int err; 2882 int err;
2865 struct path path;
2866 2883
2867 if (!test_opt(sb, QUOTA)) 2884 if (!test_opt(sb, QUOTA))
2868 return -EINVAL; 2885 return -EINVAL;
2869 2886
2870 err = kern_path(name, LOOKUP_FOLLOW, &path);
2871 if (err)
2872 return err;
2873
2874 /* Quotafile not on the same filesystem? */ 2887 /* Quotafile not on the same filesystem? */
2875 if (path.mnt->mnt_sb != sb) { 2888 if (path->mnt->mnt_sb != sb)
2876 path_put(&path);
2877 return -EXDEV; 2889 return -EXDEV;
2878 }
2879 /* Journaling quota? */ 2890 /* Journaling quota? */
2880 if (EXT3_SB(sb)->s_qf_names[type]) { 2891 if (EXT3_SB(sb)->s_qf_names[type]) {
2881 /* Quotafile not of fs root? */ 2892 /* Quotafile not of fs root? */
2882 if (path.dentry->d_parent != sb->s_root) 2893 if (path->dentry->d_parent != sb->s_root)
2883 ext3_msg(sb, KERN_WARNING, 2894 ext3_msg(sb, KERN_WARNING,
2884 "warning: Quota file not on filesystem root. " 2895 "warning: Quota file not on filesystem root. "
2885 "Journaled quota will not work."); 2896 "Journaled quota will not work.");
@@ -2889,7 +2900,7 @@ static int ext3_quota_on(struct super_block *sb, int type, int format_id,
2889 * When we journal data on quota file, we have to flush journal to see 2900 * When we journal data on quota file, we have to flush journal to see
2890 * all updates to the file when we bypass pagecache... 2901 * all updates to the file when we bypass pagecache...
2891 */ 2902 */
2892 if (ext3_should_journal_data(path.dentry->d_inode)) { 2903 if (ext3_should_journal_data(path->dentry->d_inode)) {
2893 /* 2904 /*
2894 * We don't need to lock updates but journal_flush() could 2905 * We don't need to lock updates but journal_flush() could
2895 * otherwise be livelocked... 2906 * otherwise be livelocked...
@@ -2897,15 +2908,11 @@ static int ext3_quota_on(struct super_block *sb, int type, int format_id,
2897 journal_lock_updates(EXT3_SB(sb)->s_journal); 2908 journal_lock_updates(EXT3_SB(sb)->s_journal);
2898 err = journal_flush(EXT3_SB(sb)->s_journal); 2909 err = journal_flush(EXT3_SB(sb)->s_journal);
2899 journal_unlock_updates(EXT3_SB(sb)->s_journal); 2910 journal_unlock_updates(EXT3_SB(sb)->s_journal);
2900 if (err) { 2911 if (err)
2901 path_put(&path);
2902 return err; 2912 return err;
2903 }
2904 } 2913 }
2905 2914
2906 err = dquot_quota_on_path(sb, type, format_id, &path); 2915 return dquot_quota_on(sb, type, format_id, path);
2907 path_put(&path);
2908 return err;
2909} 2916}
2910 2917
2911/* Read data from quotafile - avoid pagecache and such because we cannot afford 2918/* Read data from quotafile - avoid pagecache and such because we cannot afford
diff --git a/fs/ext3/xattr.c b/fs/ext3/xattr.c
index e69dc6dfaa8..32e6cc23bd9 100644
--- a/fs/ext3/xattr.c
+++ b/fs/ext3/xattr.c
@@ -925,7 +925,7 @@ ext3_xattr_ibody_set(handle_t *handle, struct inode *inode,
925/* 925/*
926 * ext3_xattr_set_handle() 926 * ext3_xattr_set_handle()
927 * 927 *
928 * Create, replace or remove an extended attribute for this inode. Buffer 928 * Create, replace or remove an extended attribute for this inode. Value
929 * is NULL to remove an existing extended attribute, and non-NULL to 929 * is NULL to remove an existing extended attribute, and non-NULL to
930 * either replace an existing extended attribute, or create a new extended 930 * either replace an existing extended attribute, or create a new extended
931 * attribute. The flags XATTR_REPLACE and XATTR_CREATE 931 * attribute. The flags XATTR_REPLACE and XATTR_CREATE
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
index 5e2ed4504ea..e0270d1f8d8 100644
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -238,10 +238,17 @@ ext4_set_acl(handle_t *handle, struct inode *inode, int type,
238} 238}
239 239
240int 240int
241ext4_check_acl(struct inode *inode, int mask) 241ext4_check_acl(struct inode *inode, int mask, unsigned int flags)
242{ 242{
243 struct posix_acl *acl = ext4_get_acl(inode, ACL_TYPE_ACCESS); 243 struct posix_acl *acl;
244
245 if (flags & IPERM_FLAG_RCU) {
246 if (!negative_cached_acl(inode, ACL_TYPE_ACCESS))
247 return -ECHILD;
248 return -EAGAIN;
249 }
244 250
251 acl = ext4_get_acl(inode, ACL_TYPE_ACCESS);
245 if (IS_ERR(acl)) 252 if (IS_ERR(acl))
246 return PTR_ERR(acl); 253 return PTR_ERR(acl);
247 if (acl) { 254 if (acl) {
diff --git a/fs/ext4/acl.h b/fs/ext4/acl.h
index 9d843d5deac..dec821168fd 100644
--- a/fs/ext4/acl.h
+++ b/fs/ext4/acl.h
@@ -54,7 +54,7 @@ static inline int ext4_acl_count(size_t size)
54#ifdef CONFIG_EXT4_FS_POSIX_ACL 54#ifdef CONFIG_EXT4_FS_POSIX_ACL
55 55
56/* acl.c */ 56/* acl.c */
57extern int ext4_check_acl(struct inode *, int); 57extern int ext4_check_acl(struct inode *, int, unsigned int);
58extern int ext4_acl_chmod(struct inode *); 58extern int ext4_acl_chmod(struct inode *);
59extern int ext4_init_acl(handle_t *, struct inode *, struct inode *); 59extern int ext4_init_acl(handle_t *, struct inode *, struct inode *);
60 60
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 14c3af26c67..adf96b82278 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -592,7 +592,8 @@ ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
592 * Account for the allocated meta blocks. We will never 592 * Account for the allocated meta blocks. We will never
593 * fail EDQUOT for metdata, but we do account for it. 593 * fail EDQUOT for metdata, but we do account for it.
594 */ 594 */
595 if (!(*errp) && EXT4_I(inode)->i_delalloc_reserved_flag) { 595 if (!(*errp) &&
596 ext4_test_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED)) {
596 spin_lock(&EXT4_I(inode)->i_block_reservation_lock); 597 spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
597 EXT4_I(inode)->i_allocated_meta_blocks += ar.len; 598 EXT4_I(inode)->i_allocated_meta_blocks += ar.len;
598 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 599 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index ece76fb6a40..164c56092e5 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -60,9 +60,13 @@ static unsigned char get_dtype(struct super_block *sb, int filetype)
60 return (ext4_filetype_table[filetype]); 60 return (ext4_filetype_table[filetype]);
61} 61}
62 62
63 63/*
64 * Return 0 if the directory entry is OK, and 1 if there is a problem
65 *
66 * Note: this is the opposite of what ext2 and ext3 historically returned...
67 */
64int __ext4_check_dir_entry(const char *function, unsigned int line, 68int __ext4_check_dir_entry(const char *function, unsigned int line,
65 struct inode *dir, 69 struct inode *dir, struct file *filp,
66 struct ext4_dir_entry_2 *de, 70 struct ext4_dir_entry_2 *de,
67 struct buffer_head *bh, 71 struct buffer_head *bh,
68 unsigned int offset) 72 unsigned int offset)
@@ -71,26 +75,37 @@ int __ext4_check_dir_entry(const char *function, unsigned int line,
71 const int rlen = ext4_rec_len_from_disk(de->rec_len, 75 const int rlen = ext4_rec_len_from_disk(de->rec_len,
72 dir->i_sb->s_blocksize); 76 dir->i_sb->s_blocksize);
73 77
74 if (rlen < EXT4_DIR_REC_LEN(1)) 78 if (unlikely(rlen < EXT4_DIR_REC_LEN(1)))
75 error_msg = "rec_len is smaller than minimal"; 79 error_msg = "rec_len is smaller than minimal";
76 else if (rlen % 4 != 0) 80 else if (unlikely(rlen % 4 != 0))
77 error_msg = "rec_len % 4 != 0"; 81 error_msg = "rec_len % 4 != 0";
78 else if (rlen < EXT4_DIR_REC_LEN(de->name_len)) 82 else if (unlikely(rlen < EXT4_DIR_REC_LEN(de->name_len)))
79 error_msg = "rec_len is too small for name_len"; 83 error_msg = "rec_len is too small for name_len";
80 else if (((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize) 84 else if (unlikely(((char *) de - bh->b_data) + rlen >
85 dir->i_sb->s_blocksize))
81 error_msg = "directory entry across blocks"; 86 error_msg = "directory entry across blocks";
82 else if (le32_to_cpu(de->inode) > 87 else if (unlikely(le32_to_cpu(de->inode) >
83 le32_to_cpu(EXT4_SB(dir->i_sb)->s_es->s_inodes_count)) 88 le32_to_cpu(EXT4_SB(dir->i_sb)->s_es->s_inodes_count)))
84 error_msg = "inode out of bounds"; 89 error_msg = "inode out of bounds";
90 else
91 return 0;
85 92
86 if (error_msg != NULL) 93 if (filp)
87 ext4_error_inode(dir, function, line, bh->b_blocknr, 94 ext4_error_file(filp, function, line, bh ? bh->b_blocknr : 0,
88 "bad entry in directory: %s - " 95 "bad entry in directory: %s - offset=%u(%u), "
89 "offset=%u(%u), inode=%u, rec_len=%d, name_len=%d", 96 "inode=%u, rec_len=%d, name_len=%d",
90 error_msg, (unsigned) (offset%bh->b_size), offset, 97 error_msg, (unsigned) (offset%bh->b_size),
91 le32_to_cpu(de->inode), 98 offset, le32_to_cpu(de->inode),
92 rlen, de->name_len); 99 rlen, de->name_len);
93 return error_msg == NULL ? 1 : 0; 100 else
101 ext4_error_inode(dir, function, line, bh ? bh->b_blocknr : 0,
102 "bad entry in directory: %s - offset=%u(%u), "
103 "inode=%u, rec_len=%d, name_len=%d",
104 error_msg, (unsigned) (offset%bh->b_size),
105 offset, le32_to_cpu(de->inode),
106 rlen, de->name_len);
107
108 return 1;
94} 109}
95 110
96static int ext4_readdir(struct file *filp, 111static int ext4_readdir(struct file *filp,
@@ -152,8 +167,9 @@ static int ext4_readdir(struct file *filp,
152 */ 167 */
153 if (!bh) { 168 if (!bh) {
154 if (!dir_has_error) { 169 if (!dir_has_error) {
155 EXT4_ERROR_INODE(inode, "directory " 170 EXT4_ERROR_FILE(filp, 0,
156 "contains a hole at offset %Lu", 171 "directory contains a "
172 "hole at offset %llu",
157 (unsigned long long) filp->f_pos); 173 (unsigned long long) filp->f_pos);
158 dir_has_error = 1; 174 dir_has_error = 1;
159 } 175 }
@@ -194,8 +210,8 @@ revalidate:
194 while (!error && filp->f_pos < inode->i_size 210 while (!error && filp->f_pos < inode->i_size
195 && offset < sb->s_blocksize) { 211 && offset < sb->s_blocksize) {
196 de = (struct ext4_dir_entry_2 *) (bh->b_data + offset); 212 de = (struct ext4_dir_entry_2 *) (bh->b_data + offset);
197 if (!ext4_check_dir_entry(inode, de, 213 if (ext4_check_dir_entry(inode, filp, de,
198 bh, offset)) { 214 bh, offset)) {
199 /* 215 /*
200 * On error, skip the f_pos to the next block 216 * On error, skip the f_pos to the next block
201 */ 217 */
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 8b5dd6369f8..0c8d97b56f3 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -62,8 +62,8 @@
62#define EXT4_ERROR_INODE_BLOCK(inode, block, fmt, a...) \ 62#define EXT4_ERROR_INODE_BLOCK(inode, block, fmt, a...) \
63 ext4_error_inode((inode), __func__, __LINE__, (block), (fmt), ## a) 63 ext4_error_inode((inode), __func__, __LINE__, (block), (fmt), ## a)
64 64
65#define EXT4_ERROR_FILE(file, fmt, a...) \ 65#define EXT4_ERROR_FILE(file, block, fmt, a...) \
66 ext4_error_file(__func__, __LINE__, (file), (fmt), ## a) 66 ext4_error_file((file), __func__, __LINE__, (block), (fmt), ## a)
67 67
68/* data type for block offset of block group */ 68/* data type for block offset of block group */
69typedef int ext4_grpblk_t; 69typedef int ext4_grpblk_t;
@@ -177,7 +177,7 @@ struct mpage_da_data {
177 177
178struct ext4_io_page { 178struct ext4_io_page {
179 struct page *p_page; 179 struct page *p_page;
180 int p_count; 180 atomic_t p_count;
181}; 181};
182 182
183#define MAX_IO_PAGES 128 183#define MAX_IO_PAGES 128
@@ -561,23 +561,7 @@ struct ext4_new_group_data {
561#define EXT4_IOC32_SETVERSION_OLD FS_IOC32_SETVERSION 561#define EXT4_IOC32_SETVERSION_OLD FS_IOC32_SETVERSION
562#endif 562#endif
563 563
564 564/* Max physical block we can address w/o extents */
565/*
566 * Mount options
567 */
568struct ext4_mount_options {
569 unsigned long s_mount_opt;
570 uid_t s_resuid;
571 gid_t s_resgid;
572 unsigned long s_commit_interval;
573 u32 s_min_batch_time, s_max_batch_time;
574#ifdef CONFIG_QUOTA
575 int s_jquota_fmt;
576 char *s_qf_names[MAXQUOTAS];
577#endif
578};
579
580/* Max physical block we can addres w/o extents */
581#define EXT4_MAX_BLOCK_FILE_PHYS 0xFFFFFFFF 565#define EXT4_MAX_BLOCK_FILE_PHYS 0xFFFFFFFF
582 566
583/* 567/*
@@ -709,6 +693,8 @@ do { \
709 if (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra)) \ 693 if (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra)) \
710 ext4_decode_extra_time(&(inode)->xtime, \ 694 ext4_decode_extra_time(&(inode)->xtime, \
711 raw_inode->xtime ## _extra); \ 695 raw_inode->xtime ## _extra); \
696 else \
697 (inode)->xtime.tv_nsec = 0; \
712} while (0) 698} while (0)
713 699
714#define EXT4_EINODE_GET_XTIME(xtime, einode, raw_inode) \ 700#define EXT4_EINODE_GET_XTIME(xtime, einode, raw_inode) \
@@ -719,6 +705,8 @@ do { \
719 if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime ## _extra)) \ 705 if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime ## _extra)) \
720 ext4_decode_extra_time(&(einode)->xtime, \ 706 ext4_decode_extra_time(&(einode)->xtime, \
721 raw_inode->xtime ## _extra); \ 707 raw_inode->xtime ## _extra); \
708 else \
709 (einode)->xtime.tv_nsec = 0; \
722} while (0) 710} while (0)
723 711
724#define i_disk_version osd1.linux1.l_i_version 712#define i_disk_version osd1.linux1.l_i_version
@@ -750,12 +738,13 @@ do { \
750 738
751/* 739/*
752 * storage for cached extent 740 * storage for cached extent
741 * If ec_len == 0, then the cache is invalid.
742 * If ec_start == 0, then the cache represents a gap (null mapping)
753 */ 743 */
754struct ext4_ext_cache { 744struct ext4_ext_cache {
755 ext4_fsblk_t ec_start; 745 ext4_fsblk_t ec_start;
756 ext4_lblk_t ec_block; 746 ext4_lblk_t ec_block;
757 __u32 ec_len; /* must be 32bit to return holes */ 747 __u32 ec_len; /* must be 32bit to return holes */
758 __u32 ec_type;
759}; 748};
760 749
761/* 750/*
@@ -774,10 +763,12 @@ struct ext4_inode_info {
774 * near to their parent directory's inode. 763 * near to their parent directory's inode.
775 */ 764 */
776 ext4_group_t i_block_group; 765 ext4_group_t i_block_group;
766 ext4_lblk_t i_dir_start_lookup;
767#if (BITS_PER_LONG < 64)
777 unsigned long i_state_flags; /* Dynamic state flags */ 768 unsigned long i_state_flags; /* Dynamic state flags */
769#endif
778 unsigned long i_flags; 770 unsigned long i_flags;
779 771
780 ext4_lblk_t i_dir_start_lookup;
781#ifdef CONFIG_EXT4_FS_XATTR 772#ifdef CONFIG_EXT4_FS_XATTR
782 /* 773 /*
783 * Extended attributes can be read independently of the main file 774 * Extended attributes can be read independently of the main file
@@ -820,7 +811,7 @@ struct ext4_inode_info {
820 */ 811 */
821 struct rw_semaphore i_data_sem; 812 struct rw_semaphore i_data_sem;
822 struct inode vfs_inode; 813 struct inode vfs_inode;
823 struct jbd2_inode jinode; 814 struct jbd2_inode *jinode;
824 815
825 struct ext4_ext_cache i_cached_extent; 816 struct ext4_ext_cache i_cached_extent;
826 /* 817 /*
@@ -840,14 +831,12 @@ struct ext4_inode_info {
840 unsigned int i_reserved_data_blocks; 831 unsigned int i_reserved_data_blocks;
841 unsigned int i_reserved_meta_blocks; 832 unsigned int i_reserved_meta_blocks;
842 unsigned int i_allocated_meta_blocks; 833 unsigned int i_allocated_meta_blocks;
843 unsigned short i_delalloc_reserved_flag; 834 ext4_lblk_t i_da_metadata_calc_last_lblock;
844 sector_t i_da_metadata_calc_last_lblock;
845 int i_da_metadata_calc_len; 835 int i_da_metadata_calc_len;
846 836
847 /* on-disk additional length */ 837 /* on-disk additional length */
848 __u16 i_extra_isize; 838 __u16 i_extra_isize;
849 839
850 spinlock_t i_block_reservation_lock;
851#ifdef CONFIG_QUOTA 840#ifdef CONFIG_QUOTA
852 /* quota space reservation, managed internally by quota code */ 841 /* quota space reservation, managed internally by quota code */
853 qsize_t i_reserved_quota; 842 qsize_t i_reserved_quota;
@@ -856,9 +845,12 @@ struct ext4_inode_info {
856 /* completed IOs that might need unwritten extents handling */ 845 /* completed IOs that might need unwritten extents handling */
857 struct list_head i_completed_io_list; 846 struct list_head i_completed_io_list;
858 spinlock_t i_completed_io_lock; 847 spinlock_t i_completed_io_lock;
848 atomic_t i_ioend_count; /* Number of outstanding io_end structs */
859 /* current io_end structure for async DIO write*/ 849 /* current io_end structure for async DIO write*/
860 ext4_io_end_t *cur_aio_dio; 850 ext4_io_end_t *cur_aio_dio;
861 851
852 spinlock_t i_block_reservation_lock;
853
862 /* 854 /*
863 * Transactions that contain inode's metadata needed to complete 855 * Transactions that contain inode's metadata needed to complete
864 * fsync and fdatasync, respectively. 856 * fsync and fdatasync, respectively.
@@ -909,17 +901,27 @@ struct ext4_inode_info {
909#define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */ 901#define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */
910#define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */ 902#define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */
911#define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */ 903#define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */
904#define EXT4_MOUNT_MBLK_IO_SUBMIT 0x4000000 /* multi-block io submits */
912#define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */ 905#define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */
913#define EXT4_MOUNT_DATA_ERR_ABORT 0x10000000 /* Abort on file data write */ 906#define EXT4_MOUNT_DATA_ERR_ABORT 0x10000000 /* Abort on file data write */
914#define EXT4_MOUNT_BLOCK_VALIDITY 0x20000000 /* Block validity checking */ 907#define EXT4_MOUNT_BLOCK_VALIDITY 0x20000000 /* Block validity checking */
915#define EXT4_MOUNT_DISCARD 0x40000000 /* Issue DISCARD requests */ 908#define EXT4_MOUNT_DISCARD 0x40000000 /* Issue DISCARD requests */
916#define EXT4_MOUNT_INIT_INODE_TABLE 0x80000000 /* Initialize uninitialized itables */ 909#define EXT4_MOUNT_INIT_INODE_TABLE 0x80000000 /* Initialize uninitialized itables */
917 910
918#define clear_opt(o, opt) o &= ~EXT4_MOUNT_##opt 911#define clear_opt(sb, opt) EXT4_SB(sb)->s_mount_opt &= \
919#define set_opt(o, opt) o |= EXT4_MOUNT_##opt 912 ~EXT4_MOUNT_##opt
913#define set_opt(sb, opt) EXT4_SB(sb)->s_mount_opt |= \
914 EXT4_MOUNT_##opt
920#define test_opt(sb, opt) (EXT4_SB(sb)->s_mount_opt & \ 915#define test_opt(sb, opt) (EXT4_SB(sb)->s_mount_opt & \
921 EXT4_MOUNT_##opt) 916 EXT4_MOUNT_##opt)
922 917
918#define clear_opt2(sb, opt) EXT4_SB(sb)->s_mount_opt2 &= \
919 ~EXT4_MOUNT2_##opt
920#define set_opt2(sb, opt) EXT4_SB(sb)->s_mount_opt2 |= \
921 EXT4_MOUNT2_##opt
922#define test_opt2(sb, opt) (EXT4_SB(sb)->s_mount_opt2 & \
923 EXT4_MOUNT2_##opt)
924
923#define ext4_set_bit ext2_set_bit 925#define ext4_set_bit ext2_set_bit
924#define ext4_set_bit_atomic ext2_set_bit_atomic 926#define ext4_set_bit_atomic ext2_set_bit_atomic
925#define ext4_clear_bit ext2_clear_bit 927#define ext4_clear_bit ext2_clear_bit
@@ -1085,6 +1087,7 @@ struct ext4_sb_info {
1085 struct ext4_super_block *s_es; /* Pointer to the super block in the buffer */ 1087 struct ext4_super_block *s_es; /* Pointer to the super block in the buffer */
1086 struct buffer_head **s_group_desc; 1088 struct buffer_head **s_group_desc;
1087 unsigned int s_mount_opt; 1089 unsigned int s_mount_opt;
1090 unsigned int s_mount_opt2;
1088 unsigned int s_mount_flags; 1091 unsigned int s_mount_flags;
1089 ext4_fsblk_t s_sb_block; 1092 ext4_fsblk_t s_sb_block;
1090 uid_t s_resuid; 1093 uid_t s_resuid;
@@ -1235,24 +1238,39 @@ enum {
1235 EXT4_STATE_EXT_MIGRATE, /* Inode is migrating */ 1238 EXT4_STATE_EXT_MIGRATE, /* Inode is migrating */
1236 EXT4_STATE_DIO_UNWRITTEN, /* need convert on dio done*/ 1239 EXT4_STATE_DIO_UNWRITTEN, /* need convert on dio done*/
1237 EXT4_STATE_NEWENTRY, /* File just added to dir */ 1240 EXT4_STATE_NEWENTRY, /* File just added to dir */
1241 EXT4_STATE_DELALLOC_RESERVED, /* blks already reserved for delalloc */
1238}; 1242};
1239 1243
1240#define EXT4_INODE_BIT_FNS(name, field) \ 1244#define EXT4_INODE_BIT_FNS(name, field, offset) \
1241static inline int ext4_test_inode_##name(struct inode *inode, int bit) \ 1245static inline int ext4_test_inode_##name(struct inode *inode, int bit) \
1242{ \ 1246{ \
1243 return test_bit(bit, &EXT4_I(inode)->i_##field); \ 1247 return test_bit(bit + (offset), &EXT4_I(inode)->i_##field); \
1244} \ 1248} \
1245static inline void ext4_set_inode_##name(struct inode *inode, int bit) \ 1249static inline void ext4_set_inode_##name(struct inode *inode, int bit) \
1246{ \ 1250{ \
1247 set_bit(bit, &EXT4_I(inode)->i_##field); \ 1251 set_bit(bit + (offset), &EXT4_I(inode)->i_##field); \
1248} \ 1252} \
1249static inline void ext4_clear_inode_##name(struct inode *inode, int bit) \ 1253static inline void ext4_clear_inode_##name(struct inode *inode, int bit) \
1250{ \ 1254{ \
1251 clear_bit(bit, &EXT4_I(inode)->i_##field); \ 1255 clear_bit(bit + (offset), &EXT4_I(inode)->i_##field); \
1256}
1257
1258EXT4_INODE_BIT_FNS(flag, flags, 0)
1259#if (BITS_PER_LONG < 64)
1260EXT4_INODE_BIT_FNS(state, state_flags, 0)
1261
1262static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
1263{
1264 (ei)->i_state_flags = 0;
1252} 1265}
1266#else
1267EXT4_INODE_BIT_FNS(state, flags, 32)
1253 1268
1254EXT4_INODE_BIT_FNS(flag, flags) 1269static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
1255EXT4_INODE_BIT_FNS(state, state_flags) 1270{
1271 /* We depend on the fact that callers will set i_flags */
1272}
1273#endif
1256#else 1274#else
1257/* Assume that user mode programs are passing in an ext4fs superblock, not 1275/* Assume that user mode programs are passing in an ext4fs superblock, not
1258 * a kernel struct super_block. This will allow us to call the feature-test 1276 * a kernel struct super_block. This will allow us to call the feature-test
@@ -1640,10 +1658,12 @@ extern unsigned ext4_init_block_bitmap(struct super_block *sb,
1640 1658
1641/* dir.c */ 1659/* dir.c */
1642extern int __ext4_check_dir_entry(const char *, unsigned int, struct inode *, 1660extern int __ext4_check_dir_entry(const char *, unsigned int, struct inode *,
1661 struct file *,
1643 struct ext4_dir_entry_2 *, 1662 struct ext4_dir_entry_2 *,
1644 struct buffer_head *, unsigned int); 1663 struct buffer_head *, unsigned int);
1645#define ext4_check_dir_entry(dir, de, bh, offset) \ 1664#define ext4_check_dir_entry(dir, filp, de, bh, offset) \
1646 __ext4_check_dir_entry(__func__, __LINE__, (dir), (de), (bh), (offset)) 1665 unlikely(__ext4_check_dir_entry(__func__, __LINE__, (dir), (filp), \
1666 (de), (bh), (offset)))
1647extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, 1667extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
1648 __u32 minor_hash, 1668 __u32 minor_hash,
1649 struct ext4_dir_entry_2 *dirent); 1669 struct ext4_dir_entry_2 *dirent);
@@ -1651,6 +1671,7 @@ extern void ext4_htree_free_dir_info(struct dir_private_info *p);
1651 1671
1652/* fsync.c */ 1672/* fsync.c */
1653extern int ext4_sync_file(struct file *, int); 1673extern int ext4_sync_file(struct file *, int);
1674extern int ext4_flush_completed_IO(struct inode *);
1654 1675
1655/* hash.c */ 1676/* hash.c */
1656extern int ext4fs_dirhash(const char *name, int len, struct 1677extern int ext4fs_dirhash(const char *name, int len, struct
@@ -1750,8 +1771,8 @@ extern void ext4_error_inode(struct inode *, const char *, unsigned int,
1750 ext4_fsblk_t, const char *, ...) 1771 ext4_fsblk_t, const char *, ...)
1751 __attribute__ ((format (printf, 5, 6))); 1772 __attribute__ ((format (printf, 5, 6)));
1752extern void ext4_error_file(struct file *, const char *, unsigned int, 1773extern void ext4_error_file(struct file *, const char *, unsigned int,
1753 const char *, ...) 1774 ext4_fsblk_t, const char *, ...)
1754 __attribute__ ((format (printf, 4, 5))); 1775 __attribute__ ((format (printf, 5, 6)));
1755extern void __ext4_std_error(struct super_block *, const char *, 1776extern void __ext4_std_error(struct super_block *, const char *,
1756 unsigned int, int); 1777 unsigned int, int);
1757extern void __ext4_abort(struct super_block *, const char *, unsigned int, 1778extern void __ext4_abort(struct super_block *, const char *, unsigned int,
@@ -2044,7 +2065,7 @@ extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
2044extern void ext4_ext_truncate(struct inode *); 2065extern void ext4_ext_truncate(struct inode *);
2045extern void ext4_ext_init(struct super_block *); 2066extern void ext4_ext_init(struct super_block *);
2046extern void ext4_ext_release(struct super_block *); 2067extern void ext4_ext_release(struct super_block *);
2047extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset, 2068extern long ext4_fallocate(struct file *file, int mode, loff_t offset,
2048 loff_t len); 2069 loff_t len);
2049extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset, 2070extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
2050 ssize_t len); 2071 ssize_t len);
@@ -2060,6 +2081,7 @@ extern int ext4_move_extents(struct file *o_filp, struct file *d_filp,
2060/* page-io.c */ 2081/* page-io.c */
2061extern int __init ext4_init_pageio(void); 2082extern int __init ext4_init_pageio(void);
2062extern void ext4_exit_pageio(void); 2083extern void ext4_exit_pageio(void);
2084extern void ext4_ioend_wait(struct inode *);
2063extern void ext4_free_io_end(ext4_io_end_t *io); 2085extern void ext4_free_io_end(ext4_io_end_t *io);
2064extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags); 2086extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags);
2065extern int ext4_end_io_nolock(ext4_io_end_t *io); 2087extern int ext4_end_io_nolock(ext4_io_end_t *io);
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index 28ce70fd9cd..2e29abb30f7 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -119,10 +119,6 @@ struct ext4_ext_path {
119 * structure for external API 119 * structure for external API
120 */ 120 */
121 121
122#define EXT4_EXT_CACHE_NO 0
123#define EXT4_EXT_CACHE_GAP 1
124#define EXT4_EXT_CACHE_EXTENT 2
125
126/* 122/*
127 * to be called by ext4_ext_walk_space() 123 * to be called by ext4_ext_walk_space()
128 * negative retcode - error 124 * negative retcode - error
@@ -197,7 +193,7 @@ static inline unsigned short ext_depth(struct inode *inode)
197static inline void 193static inline void
198ext4_ext_invalidate_cache(struct inode *inode) 194ext4_ext_invalidate_cache(struct inode *inode)
199{ 195{
200 EXT4_I(inode)->i_cached_extent.ec_type = EXT4_EXT_CACHE_NO; 196 EXT4_I(inode)->i_cached_extent.ec_len = 0;
201} 197}
202 198
203static inline void ext4_ext_mark_uninitialized(struct ext4_extent *ext) 199static inline void ext4_ext_mark_uninitialized(struct ext4_extent *ext)
@@ -278,7 +274,7 @@ static inline void ext4_idx_store_pblock(struct ext4_extent_idx *ix,
278} 274}
279 275
280extern int ext4_ext_calc_metadata_amount(struct inode *inode, 276extern int ext4_ext_calc_metadata_amount(struct inode *inode,
281 sector_t lblocks); 277 ext4_lblk_t lblocks);
282extern int ext4_extent_tree_init(handle_t *, struct inode *); 278extern int ext4_extent_tree_init(handle_t *, struct inode *);
283extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode, 279extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode,
284 int num, 280 int num,
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index b0bd792c58c..d8b992e658c 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -253,7 +253,7 @@ static inline int ext4_journal_force_commit(journal_t *journal)
253static inline int ext4_jbd2_file_inode(handle_t *handle, struct inode *inode) 253static inline int ext4_jbd2_file_inode(handle_t *handle, struct inode *inode)
254{ 254{
255 if (ext4_handle_valid(handle)) 255 if (ext4_handle_valid(handle))
256 return jbd2_journal_file_inode(handle, &EXT4_I(inode)->jinode); 256 return jbd2_journal_file_inode(handle, EXT4_I(inode)->jinode);
257 return 0; 257 return 0;
258} 258}
259 259
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 0554c48cb1f..63a75810b7c 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -117,11 +117,33 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
117 struct ext4_extent *ex; 117 struct ext4_extent *ex;
118 depth = path->p_depth; 118 depth = path->p_depth;
119 119
120 /* try to predict block placement */ 120 /*
121 * Try to predict block placement assuming that we are
122 * filling in a file which will eventually be
123 * non-sparse --- i.e., in the case of libbfd writing
124 * an ELF object sections out-of-order but in a way
125 * the eventually results in a contiguous object or
126 * executable file, or some database extending a table
127 * space file. However, this is actually somewhat
128 * non-ideal if we are writing a sparse file such as
129 * qemu or KVM writing a raw image file that is going
130 * to stay fairly sparse, since it will end up
131 * fragmenting the file system's free space. Maybe we
132 * should have some hueristics or some way to allow
133 * userspace to pass a hint to file system,
134 * especiially if the latter case turns out to be
135 * common.
136 */
121 ex = path[depth].p_ext; 137 ex = path[depth].p_ext;
122 if (ex) 138 if (ex) {
123 return (ext4_ext_pblock(ex) + 139 ext4_fsblk_t ext_pblk = ext4_ext_pblock(ex);
124 (block - le32_to_cpu(ex->ee_block))); 140 ext4_lblk_t ext_block = le32_to_cpu(ex->ee_block);
141
142 if (block > ext_block)
143 return ext_pblk + (block - ext_block);
144 else
145 return ext_pblk - (ext_block - block);
146 }
125 147
126 /* it looks like index is empty; 148 /* it looks like index is empty;
127 * try to find starting block from index itself */ 149 * try to find starting block from index itself */
@@ -244,7 +266,7 @@ static inline int ext4_ext_space_root_idx(struct inode *inode, int check)
244 * to allocate @blocks 266 * to allocate @blocks
245 * Worse case is one block per extent 267 * Worse case is one block per extent
246 */ 268 */
247int ext4_ext_calc_metadata_amount(struct inode *inode, sector_t lblock) 269int ext4_ext_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock)
248{ 270{
249 struct ext4_inode_info *ei = EXT4_I(inode); 271 struct ext4_inode_info *ei = EXT4_I(inode);
250 int idxs, num = 0; 272 int idxs, num = 0;
@@ -1872,12 +1894,10 @@ static int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
1872 cbex.ec_block = start; 1894 cbex.ec_block = start;
1873 cbex.ec_len = end - start; 1895 cbex.ec_len = end - start;
1874 cbex.ec_start = 0; 1896 cbex.ec_start = 0;
1875 cbex.ec_type = EXT4_EXT_CACHE_GAP;
1876 } else { 1897 } else {
1877 cbex.ec_block = le32_to_cpu(ex->ee_block); 1898 cbex.ec_block = le32_to_cpu(ex->ee_block);
1878 cbex.ec_len = ext4_ext_get_actual_len(ex); 1899 cbex.ec_len = ext4_ext_get_actual_len(ex);
1879 cbex.ec_start = ext4_ext_pblock(ex); 1900 cbex.ec_start = ext4_ext_pblock(ex);
1880 cbex.ec_type = EXT4_EXT_CACHE_EXTENT;
1881 } 1901 }
1882 1902
1883 if (unlikely(cbex.ec_len == 0)) { 1903 if (unlikely(cbex.ec_len == 0)) {
@@ -1917,13 +1937,12 @@ static int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
1917 1937
1918static void 1938static void
1919ext4_ext_put_in_cache(struct inode *inode, ext4_lblk_t block, 1939ext4_ext_put_in_cache(struct inode *inode, ext4_lblk_t block,
1920 __u32 len, ext4_fsblk_t start, int type) 1940 __u32 len, ext4_fsblk_t start)
1921{ 1941{
1922 struct ext4_ext_cache *cex; 1942 struct ext4_ext_cache *cex;
1923 BUG_ON(len == 0); 1943 BUG_ON(len == 0);
1924 spin_lock(&EXT4_I(inode)->i_block_reservation_lock); 1944 spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
1925 cex = &EXT4_I(inode)->i_cached_extent; 1945 cex = &EXT4_I(inode)->i_cached_extent;
1926 cex->ec_type = type;
1927 cex->ec_block = block; 1946 cex->ec_block = block;
1928 cex->ec_len = len; 1947 cex->ec_len = len;
1929 cex->ec_start = start; 1948 cex->ec_start = start;
@@ -1976,15 +1995,18 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
1976 } 1995 }
1977 1996
1978 ext_debug(" -> %u:%lu\n", lblock, len); 1997 ext_debug(" -> %u:%lu\n", lblock, len);
1979 ext4_ext_put_in_cache(inode, lblock, len, 0, EXT4_EXT_CACHE_GAP); 1998 ext4_ext_put_in_cache(inode, lblock, len, 0);
1980} 1999}
1981 2000
2001/*
2002 * Return 0 if cache is invalid; 1 if the cache is valid
2003 */
1982static int 2004static int
1983ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block, 2005ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
1984 struct ext4_extent *ex) 2006 struct ext4_extent *ex)
1985{ 2007{
1986 struct ext4_ext_cache *cex; 2008 struct ext4_ext_cache *cex;
1987 int ret = EXT4_EXT_CACHE_NO; 2009 int ret = 0;
1988 2010
1989 /* 2011 /*
1990 * We borrow i_block_reservation_lock to protect i_cached_extent 2012 * We borrow i_block_reservation_lock to protect i_cached_extent
@@ -1993,11 +2015,9 @@ ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
1993 cex = &EXT4_I(inode)->i_cached_extent; 2015 cex = &EXT4_I(inode)->i_cached_extent;
1994 2016
1995 /* has cache valid data? */ 2017 /* has cache valid data? */
1996 if (cex->ec_type == EXT4_EXT_CACHE_NO) 2018 if (cex->ec_len == 0)
1997 goto errout; 2019 goto errout;
1998 2020
1999 BUG_ON(cex->ec_type != EXT4_EXT_CACHE_GAP &&
2000 cex->ec_type != EXT4_EXT_CACHE_EXTENT);
2001 if (in_range(block, cex->ec_block, cex->ec_len)) { 2021 if (in_range(block, cex->ec_block, cex->ec_len)) {
2002 ex->ee_block = cpu_to_le32(cex->ec_block); 2022 ex->ee_block = cpu_to_le32(cex->ec_block);
2003 ext4_ext_store_pblock(ex, cex->ec_start); 2023 ext4_ext_store_pblock(ex, cex->ec_start);
@@ -2005,7 +2025,7 @@ ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
2005 ext_debug("%u cached by %u:%u:%llu\n", 2025 ext_debug("%u cached by %u:%u:%llu\n",
2006 block, 2026 block,
2007 cex->ec_block, cex->ec_len, cex->ec_start); 2027 cex->ec_block, cex->ec_len, cex->ec_start);
2008 ret = cex->ec_type; 2028 ret = 1;
2009 } 2029 }
2010errout: 2030errout:
2011 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 2031 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
@@ -2825,14 +2845,14 @@ fix_extent_len:
2825 * to an uninitialized extent. 2845 * to an uninitialized extent.
2826 * 2846 *
2827 * Writing to an uninitized extent may result in splitting the uninitialized 2847 * Writing to an uninitized extent may result in splitting the uninitialized
2828 * extent into multiple /intialized unintialized extents (up to three) 2848 * extent into multiple /initialized uninitialized extents (up to three)
2829 * There are three possibilities: 2849 * There are three possibilities:
2830 * a> There is no split required: Entire extent should be uninitialized 2850 * a> There is no split required: Entire extent should be uninitialized
2831 * b> Splits in two extents: Write is happening at either end of the extent 2851 * b> Splits in two extents: Write is happening at either end of the extent
2832 * c> Splits in three extents: Somone is writing in middle of the extent 2852 * c> Splits in three extents: Somone is writing in middle of the extent
2833 * 2853 *
2834 * One of more index blocks maybe needed if the extent tree grow after 2854 * One of more index blocks maybe needed if the extent tree grow after
2835 * the unintialized extent split. To prevent ENOSPC occur at the IO 2855 * the uninitialized extent split. To prevent ENOSPC occur at the IO
2836 * complete, we need to split the uninitialized extent before DIO submit 2856 * complete, we need to split the uninitialized extent before DIO submit
2837 * the IO. The uninitialized extent called at this time will be split 2857 * the IO. The uninitialized extent called at this time will be split
2838 * into three uninitialized extent(at most). After IO complete, the part 2858 * into three uninitialized extent(at most). After IO complete, the part
@@ -3082,7 +3102,7 @@ static void unmap_underlying_metadata_blocks(struct block_device *bdev,
3082 * Handle EOFBLOCKS_FL flag, clearing it if necessary 3102 * Handle EOFBLOCKS_FL flag, clearing it if necessary
3083 */ 3103 */
3084static int check_eofblocks_fl(handle_t *handle, struct inode *inode, 3104static int check_eofblocks_fl(handle_t *handle, struct inode *inode,
3085 struct ext4_map_blocks *map, 3105 ext4_lblk_t lblk,
3086 struct ext4_ext_path *path, 3106 struct ext4_ext_path *path,
3087 unsigned int len) 3107 unsigned int len)
3088{ 3108{
@@ -3112,7 +3132,7 @@ static int check_eofblocks_fl(handle_t *handle, struct inode *inode,
3112 * this turns out to be false, we can bail out from this 3132 * this turns out to be false, we can bail out from this
3113 * function immediately. 3133 * function immediately.
3114 */ 3134 */
3115 if (map->m_lblk + len < le32_to_cpu(last_ex->ee_block) + 3135 if (lblk + len < le32_to_cpu(last_ex->ee_block) +
3116 ext4_ext_get_actual_len(last_ex)) 3136 ext4_ext_get_actual_len(last_ex))
3117 return 0; 3137 return 0;
3118 /* 3138 /*
@@ -3168,8 +3188,8 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
3168 path); 3188 path);
3169 if (ret >= 0) { 3189 if (ret >= 0) {
3170 ext4_update_inode_fsync_trans(handle, inode, 1); 3190 ext4_update_inode_fsync_trans(handle, inode, 1);
3171 err = check_eofblocks_fl(handle, inode, map, path, 3191 err = check_eofblocks_fl(handle, inode, map->m_lblk,
3172 map->m_len); 3192 path, map->m_len);
3173 } else 3193 } else
3174 err = ret; 3194 err = ret;
3175 goto out2; 3195 goto out2;
@@ -3199,7 +3219,8 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
3199 ret = ext4_ext_convert_to_initialized(handle, inode, map, path); 3219 ret = ext4_ext_convert_to_initialized(handle, inode, map, path);
3200 if (ret >= 0) { 3220 if (ret >= 0) {
3201 ext4_update_inode_fsync_trans(handle, inode, 1); 3221 ext4_update_inode_fsync_trans(handle, inode, 1);
3202 err = check_eofblocks_fl(handle, inode, map, path, map->m_len); 3222 err = check_eofblocks_fl(handle, inode, map->m_lblk, path,
3223 map->m_len);
3203 if (err < 0) 3224 if (err < 0)
3204 goto out2; 3225 goto out2;
3205 } 3226 }
@@ -3276,7 +3297,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3276 struct ext4_extent_header *eh; 3297 struct ext4_extent_header *eh;
3277 struct ext4_extent newex, *ex; 3298 struct ext4_extent newex, *ex;
3278 ext4_fsblk_t newblock; 3299 ext4_fsblk_t newblock;
3279 int err = 0, depth, ret, cache_type; 3300 int err = 0, depth, ret;
3280 unsigned int allocated = 0; 3301 unsigned int allocated = 0;
3281 struct ext4_allocation_request ar; 3302 struct ext4_allocation_request ar;
3282 ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio; 3303 ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
@@ -3285,9 +3306,8 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3285 map->m_lblk, map->m_len, inode->i_ino); 3306 map->m_lblk, map->m_len, inode->i_ino);
3286 3307
3287 /* check in cache */ 3308 /* check in cache */
3288 cache_type = ext4_ext_in_cache(inode, map->m_lblk, &newex); 3309 if (ext4_ext_in_cache(inode, map->m_lblk, &newex)) {
3289 if (cache_type) { 3310 if (!newex.ee_start_lo && !newex.ee_start_hi) {
3290 if (cache_type == EXT4_EXT_CACHE_GAP) {
3291 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { 3311 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
3292 /* 3312 /*
3293 * block isn't allocated yet and 3313 * block isn't allocated yet and
@@ -3296,7 +3316,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3296 goto out2; 3316 goto out2;
3297 } 3317 }
3298 /* we should allocate requested block */ 3318 /* we should allocate requested block */
3299 } else if (cache_type == EXT4_EXT_CACHE_EXTENT) { 3319 } else {
3300 /* block is already allocated */ 3320 /* block is already allocated */
3301 newblock = map->m_lblk 3321 newblock = map->m_lblk
3302 - le32_to_cpu(newex.ee_block) 3322 - le32_to_cpu(newex.ee_block)
@@ -3305,8 +3325,6 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3305 allocated = ext4_ext_get_actual_len(&newex) - 3325 allocated = ext4_ext_get_actual_len(&newex) -
3306 (map->m_lblk - le32_to_cpu(newex.ee_block)); 3326 (map->m_lblk - le32_to_cpu(newex.ee_block));
3307 goto out; 3327 goto out;
3308 } else {
3309 BUG();
3310 } 3328 }
3311 } 3329 }
3312 3330
@@ -3357,8 +3375,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3357 /* Do not put uninitialized extent in the cache */ 3375 /* Do not put uninitialized extent in the cache */
3358 if (!ext4_ext_is_uninitialized(ex)) { 3376 if (!ext4_ext_is_uninitialized(ex)) {
3359 ext4_ext_put_in_cache(inode, ee_block, 3377 ext4_ext_put_in_cache(inode, ee_block,
3360 ee_len, ee_start, 3378 ee_len, ee_start);
3361 EXT4_EXT_CACHE_EXTENT);
3362 goto out; 3379 goto out;
3363 } 3380 }
3364 ret = ext4_ext_handle_uninitialized_extents(handle, 3381 ret = ext4_ext_handle_uninitialized_extents(handle,
@@ -3456,7 +3473,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3456 map->m_flags |= EXT4_MAP_UNINIT; 3473 map->m_flags |= EXT4_MAP_UNINIT;
3457 } 3474 }
3458 3475
3459 err = check_eofblocks_fl(handle, inode, map, path, ar.len); 3476 err = check_eofblocks_fl(handle, inode, map->m_lblk, path, ar.len);
3460 if (err) 3477 if (err)
3461 goto out2; 3478 goto out2;
3462 3479
@@ -3490,8 +3507,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3490 * when it is _not_ an uninitialized extent. 3507 * when it is _not_ an uninitialized extent.
3491 */ 3508 */
3492 if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) { 3509 if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) {
3493 ext4_ext_put_in_cache(inode, map->m_lblk, allocated, newblock, 3510 ext4_ext_put_in_cache(inode, map->m_lblk, allocated, newblock);
3494 EXT4_EXT_CACHE_EXTENT);
3495 ext4_update_inode_fsync_trans(handle, inode, 1); 3511 ext4_update_inode_fsync_trans(handle, inode, 1);
3496 } else 3512 } else
3497 ext4_update_inode_fsync_trans(handle, inode, 0); 3513 ext4_update_inode_fsync_trans(handle, inode, 0);
@@ -3519,6 +3535,12 @@ void ext4_ext_truncate(struct inode *inode)
3519 int err = 0; 3535 int err = 0;
3520 3536
3521 /* 3537 /*
3538 * finish any pending end_io work so we won't run the risk of
3539 * converting any truncated blocks to initialized later
3540 */
3541 ext4_flush_completed_IO(inode);
3542
3543 /*
3522 * probably first extent we're gonna free will be last in block 3544 * probably first extent we're gonna free will be last in block
3523 */ 3545 */
3524 err = ext4_writepage_trans_blocks(inode); 3546 err = ext4_writepage_trans_blocks(inode);
@@ -3605,14 +3627,15 @@ static void ext4_falloc_update_inode(struct inode *inode,
3605} 3627}
3606 3628
3607/* 3629/*
3608 * preallocate space for a file. This implements ext4's fallocate inode 3630 * preallocate space for a file. This implements ext4's fallocate file
3609 * operation, which gets called from sys_fallocate system call. 3631 * operation, which gets called from sys_fallocate system call.
3610 * For block-mapped files, posix_fallocate should fall back to the method 3632 * For block-mapped files, posix_fallocate should fall back to the method
3611 * of writing zeroes to the required new blocks (the same behavior which is 3633 * of writing zeroes to the required new blocks (the same behavior which is
3612 * expected for file systems which do not support fallocate() system call). 3634 * expected for file systems which do not support fallocate() system call).
3613 */ 3635 */
3614long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len) 3636long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
3615{ 3637{
3638 struct inode *inode = file->f_path.dentry->d_inode;
3616 handle_t *handle; 3639 handle_t *handle;
3617 loff_t new_size; 3640 loff_t new_size;
3618 unsigned int max_blocks; 3641 unsigned int max_blocks;
@@ -3622,6 +3645,10 @@ long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len)
3622 struct ext4_map_blocks map; 3645 struct ext4_map_blocks map;
3623 unsigned int credits, blkbits = inode->i_blkbits; 3646 unsigned int credits, blkbits = inode->i_blkbits;
3624 3647
3648 /* We only support the FALLOC_FL_KEEP_SIZE mode */
3649 if (mode & ~FALLOC_FL_KEEP_SIZE)
3650 return -EOPNOTSUPP;
3651
3625 /* 3652 /*
3626 * currently supporting (pre)allocate mode for extent-based 3653 * currently supporting (pre)allocate mode for extent-based
3627 * files _only_ 3654 * files _only_
@@ -3629,10 +3656,6 @@ long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len)
3629 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) 3656 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
3630 return -EOPNOTSUPP; 3657 return -EOPNOTSUPP;
3631 3658
3632 /* preallocation to directories is currently not supported */
3633 if (S_ISDIR(inode->i_mode))
3634 return -ENODEV;
3635
3636 map.m_lblk = offset >> blkbits; 3659 map.m_lblk = offset >> blkbits;
3637 /* 3660 /*
3638 * We can't just convert len to max_blocks because 3661 * We can't just convert len to max_blocks because
@@ -3767,7 +3790,7 @@ static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path,
3767 3790
3768 logical = (__u64)newex->ec_block << blksize_bits; 3791 logical = (__u64)newex->ec_block << blksize_bits;
3769 3792
3770 if (newex->ec_type == EXT4_EXT_CACHE_GAP) { 3793 if (newex->ec_start == 0) {
3771 pgoff_t offset; 3794 pgoff_t offset;
3772 struct page *page; 3795 struct page *page;
3773 struct buffer_head *bh = NULL; 3796 struct buffer_head *bh = NULL;
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 5a5c55ddcee..2e8322c8aa8 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -104,6 +104,7 @@ static int ext4_file_open(struct inode * inode, struct file * filp)
104{ 104{
105 struct super_block *sb = inode->i_sb; 105 struct super_block *sb = inode->i_sb;
106 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 106 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
107 struct ext4_inode_info *ei = EXT4_I(inode);
107 struct vfsmount *mnt = filp->f_path.mnt; 108 struct vfsmount *mnt = filp->f_path.mnt;
108 struct path path; 109 struct path path;
109 char buf[64], *cp; 110 char buf[64], *cp;
@@ -127,6 +128,27 @@ static int ext4_file_open(struct inode * inode, struct file * filp)
127 ext4_mark_super_dirty(sb); 128 ext4_mark_super_dirty(sb);
128 } 129 }
129 } 130 }
131 /*
132 * Set up the jbd2_inode if we are opening the inode for
133 * writing and the journal is present
134 */
135 if (sbi->s_journal && !ei->jinode && (filp->f_mode & FMODE_WRITE)) {
136 struct jbd2_inode *jinode = jbd2_alloc_inode(GFP_KERNEL);
137
138 spin_lock(&inode->i_lock);
139 if (!ei->jinode) {
140 if (!jinode) {
141 spin_unlock(&inode->i_lock);
142 return -ENOMEM;
143 }
144 ei->jinode = jinode;
145 jbd2_journal_init_jbd_inode(ei->jinode, inode);
146 jinode = NULL;
147 }
148 spin_unlock(&inode->i_lock);
149 if (unlikely(jinode != NULL))
150 jbd2_free_inode(jinode);
151 }
130 return dquot_file_open(inode, filp); 152 return dquot_file_open(inode, filp);
131} 153}
132 154
@@ -188,6 +210,7 @@ const struct file_operations ext4_file_operations = {
188 .fsync = ext4_sync_file, 210 .fsync = ext4_sync_file,
189 .splice_read = generic_file_splice_read, 211 .splice_read = generic_file_splice_read,
190 .splice_write = generic_file_splice_write, 212 .splice_write = generic_file_splice_write,
213 .fallocate = ext4_fallocate,
191}; 214};
192 215
193const struct inode_operations ext4_file_inode_operations = { 216const struct inode_operations ext4_file_inode_operations = {
@@ -201,7 +224,6 @@ const struct inode_operations ext4_file_inode_operations = {
201 .removexattr = generic_removexattr, 224 .removexattr = generic_removexattr,
202#endif 225#endif
203 .check_acl = ext4_check_acl, 226 .check_acl = ext4_check_acl,
204 .fallocate = ext4_fallocate,
205 .fiemap = ext4_fiemap, 227 .fiemap = ext4_fiemap,
206}; 228};
207 229
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index c1a7bc923cf..7829b287822 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -75,7 +75,7 @@ static void dump_completed_IO(struct inode * inode)
75 * to written. 75 * to written.
76 * The function return the number of pending IOs on success. 76 * The function return the number of pending IOs on success.
77 */ 77 */
78static int flush_completed_IO(struct inode *inode) 78extern int ext4_flush_completed_IO(struct inode *inode)
79{ 79{
80 ext4_io_end_t *io; 80 ext4_io_end_t *io;
81 struct ext4_inode_info *ei = EXT4_I(inode); 81 struct ext4_inode_info *ei = EXT4_I(inode);
@@ -169,7 +169,7 @@ int ext4_sync_file(struct file *file, int datasync)
169 if (inode->i_sb->s_flags & MS_RDONLY) 169 if (inode->i_sb->s_flags & MS_RDONLY)
170 return 0; 170 return 0;
171 171
172 ret = flush_completed_IO(inode); 172 ret = ext4_flush_completed_IO(inode);
173 if (ret < 0) 173 if (ret < 0)
174 return ret; 174 return ret;
175 175
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 1ce240a23eb..eb9097aec6f 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -1027,7 +1027,7 @@ got:
1027 inode->i_generation = sbi->s_next_generation++; 1027 inode->i_generation = sbi->s_next_generation++;
1028 spin_unlock(&sbi->s_next_gen_lock); 1028 spin_unlock(&sbi->s_next_gen_lock);
1029 1029
1030 ei->i_state_flags = 0; 1030 ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */
1031 ext4_set_inode_state(inode, EXT4_STATE_NEW); 1031 ext4_set_inode_state(inode, EXT4_STATE_NEW);
1032 1032
1033 ei->i_extra_isize = EXT4_SB(sb)->s_want_extra_isize; 1033 ei->i_extra_isize = EXT4_SB(sb)->s_want_extra_isize;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 19161647046..9f7f9e49914 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -39,7 +39,9 @@
39#include <linux/bio.h> 39#include <linux/bio.h>
40#include <linux/workqueue.h> 40#include <linux/workqueue.h>
41#include <linux/kernel.h> 41#include <linux/kernel.h>
42#include <linux/printk.h>
42#include <linux/slab.h> 43#include <linux/slab.h>
44#include <linux/ratelimit.h>
43 45
44#include "ext4_jbd2.h" 46#include "ext4_jbd2.h"
45#include "xattr.h" 47#include "xattr.h"
@@ -53,10 +55,18 @@
53static inline int ext4_begin_ordered_truncate(struct inode *inode, 55static inline int ext4_begin_ordered_truncate(struct inode *inode,
54 loff_t new_size) 56 loff_t new_size)
55{ 57{
56 return jbd2_journal_begin_ordered_truncate( 58 trace_ext4_begin_ordered_truncate(inode, new_size);
57 EXT4_SB(inode->i_sb)->s_journal, 59 /*
58 &EXT4_I(inode)->jinode, 60 * If jinode is zero, then we never opened the file for
59 new_size); 61 * writing, so there's no need to call
62 * jbd2_journal_begin_ordered_truncate() since there's no
63 * outstanding writes we need to flush.
64 */
65 if (!EXT4_I(inode)->jinode)
66 return 0;
67 return jbd2_journal_begin_ordered_truncate(EXT4_JOURNAL(inode),
68 EXT4_I(inode)->jinode,
69 new_size);
60} 70}
61 71
62static void ext4_invalidatepage(struct page *page, unsigned long offset); 72static void ext4_invalidatepage(struct page *page, unsigned long offset);
@@ -178,6 +188,7 @@ void ext4_evict_inode(struct inode *inode)
178 handle_t *handle; 188 handle_t *handle;
179 int err; 189 int err;
180 190
191 trace_ext4_evict_inode(inode);
181 if (inode->i_nlink) { 192 if (inode->i_nlink) {
182 truncate_inode_pages(&inode->i_data, 0); 193 truncate_inode_pages(&inode->i_data, 0);
183 goto no_delete; 194 goto no_delete;
@@ -550,7 +561,7 @@ static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block,
550} 561}
551 562
552/** 563/**
553 * ext4_blks_to_allocate: Look up the block map and count the number 564 * ext4_blks_to_allocate - Look up the block map and count the number
554 * of direct blocks need to be allocated for the given branch. 565 * of direct blocks need to be allocated for the given branch.
555 * 566 *
556 * @branch: chain of indirect blocks 567 * @branch: chain of indirect blocks
@@ -589,13 +600,19 @@ static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks,
589 600
590/** 601/**
591 * ext4_alloc_blocks: multiple allocate blocks needed for a branch 602 * ext4_alloc_blocks: multiple allocate blocks needed for a branch
603 * @handle: handle for this transaction
604 * @inode: inode which needs allocated blocks
605 * @iblock: the logical block to start allocated at
606 * @goal: preferred physical block of allocation
592 * @indirect_blks: the number of blocks need to allocate for indirect 607 * @indirect_blks: the number of blocks need to allocate for indirect
593 * blocks 608 * blocks
594 * 609 * @blks: number of desired blocks
595 * @new_blocks: on return it will store the new block numbers for 610 * @new_blocks: on return it will store the new block numbers for
596 * the indirect blocks(if needed) and the first direct block, 611 * the indirect blocks(if needed) and the first direct block,
597 * @blks: on return it will store the total number of allocated 612 * @err: on return it will store the error code
598 * direct blocks 613 *
614 * This function will return the number of blocks allocated as
615 * requested by the passed-in parameters.
599 */ 616 */
600static int ext4_alloc_blocks(handle_t *handle, struct inode *inode, 617static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
601 ext4_lblk_t iblock, ext4_fsblk_t goal, 618 ext4_lblk_t iblock, ext4_fsblk_t goal,
@@ -709,9 +726,11 @@ failed_out:
709 726
710/** 727/**
711 * ext4_alloc_branch - allocate and set up a chain of blocks. 728 * ext4_alloc_branch - allocate and set up a chain of blocks.
729 * @handle: handle for this transaction
712 * @inode: owner 730 * @inode: owner
713 * @indirect_blks: number of allocated indirect blocks 731 * @indirect_blks: number of allocated indirect blocks
714 * @blks: number of allocated direct blocks 732 * @blks: number of allocated direct blocks
733 * @goal: preferred place for allocation
715 * @offsets: offsets (in the blocks) to store the pointers to next. 734 * @offsets: offsets (in the blocks) to store the pointers to next.
716 * @branch: place to store the chain in. 735 * @branch: place to store the chain in.
717 * 736 *
@@ -824,6 +843,7 @@ failed:
824 843
825/** 844/**
826 * ext4_splice_branch - splice the allocated branch onto inode. 845 * ext4_splice_branch - splice the allocated branch onto inode.
846 * @handle: handle for this transaction
827 * @inode: owner 847 * @inode: owner
828 * @block: (logical) number of block we are adding 848 * @block: (logical) number of block we are adding
829 * @chain: chain of indirect blocks (with a missing link - see 849 * @chain: chain of indirect blocks (with a missing link - see
@@ -1079,7 +1099,7 @@ static int ext4_indirect_calc_metadata_amount(struct inode *inode,
1079 * Calculate the number of metadata blocks need to reserve 1099 * Calculate the number of metadata blocks need to reserve
1080 * to allocate a block located at @lblock 1100 * to allocate a block located at @lblock
1081 */ 1101 */
1082static int ext4_calc_metadata_amount(struct inode *inode, sector_t lblock) 1102static int ext4_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock)
1083{ 1103{
1084 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) 1104 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
1085 return ext4_ext_calc_metadata_amount(inode, lblock); 1105 return ext4_ext_calc_metadata_amount(inode, lblock);
@@ -1318,7 +1338,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
1318 * avoid double accounting 1338 * avoid double accounting
1319 */ 1339 */
1320 if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) 1340 if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
1321 EXT4_I(inode)->i_delalloc_reserved_flag = 1; 1341 ext4_set_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED);
1322 /* 1342 /*
1323 * We need to check for EXT4 here because migrate 1343 * We need to check for EXT4 here because migrate
1324 * could have changed the inode type in between 1344 * could have changed the inode type in between
@@ -1348,7 +1368,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
1348 ext4_da_update_reserve_space(inode, retval, 1); 1368 ext4_da_update_reserve_space(inode, retval, 1);
1349 } 1369 }
1350 if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) 1370 if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
1351 EXT4_I(inode)->i_delalloc_reserved_flag = 0; 1371 ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED);
1352 1372
1353 up_write((&EXT4_I(inode)->i_data_sem)); 1373 up_write((&EXT4_I(inode)->i_data_sem));
1354 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { 1374 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
@@ -1876,7 +1896,7 @@ static int ext4_journalled_write_end(struct file *file,
1876/* 1896/*
1877 * Reserve a single block located at lblock 1897 * Reserve a single block located at lblock
1878 */ 1898 */
1879static int ext4_da_reserve_space(struct inode *inode, sector_t lblock) 1899static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock)
1880{ 1900{
1881 int retries = 0; 1901 int retries = 0;
1882 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 1902 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
@@ -2123,9 +2143,12 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd,
2123 */ 2143 */
2124 if (unlikely(journal_data && PageChecked(page))) 2144 if (unlikely(journal_data && PageChecked(page)))
2125 err = __ext4_journalled_writepage(page, len); 2145 err = __ext4_journalled_writepage(page, len);
2126 else 2146 else if (test_opt(inode->i_sb, MBLK_IO_SUBMIT))
2127 err = ext4_bio_write_page(&io_submit, page, 2147 err = ext4_bio_write_page(&io_submit, page,
2128 len, mpd->wbc); 2148 len, mpd->wbc);
2149 else
2150 err = block_write_full_page(page,
2151 noalloc_get_block_write, mpd->wbc);
2129 2152
2130 if (!err) 2153 if (!err)
2131 mpd->pages_written++; 2154 mpd->pages_written++;
@@ -2234,7 +2257,7 @@ static void mpage_da_map_and_submit(struct mpage_da_data *mpd)
2234 * affects functions in many different parts of the allocation 2257 * affects functions in many different parts of the allocation
2235 * call path. This flag exists primarily because we don't 2258 * call path. This flag exists primarily because we don't
2236 * want to change *many* call functions, so ext4_map_blocks() 2259 * want to change *many* call functions, so ext4_map_blocks()
2237 * will set the magic i_delalloc_reserved_flag once the 2260 * will set the EXT4_STATE_DELALLOC_RESERVED flag once the
2238 * inode's allocation semaphore is taken. 2261 * inode's allocation semaphore is taken.
2239 * 2262 *
2240 * If the blocks in questions were delalloc blocks, set 2263 * If the blocks in questions were delalloc blocks, set
@@ -3357,7 +3380,7 @@ int ext4_alloc_da_blocks(struct inode *inode)
3357 * doing I/O at all. 3380 * doing I/O at all.
3358 * 3381 *
3359 * We could call write_cache_pages(), and then redirty all of 3382 * We could call write_cache_pages(), and then redirty all of
3360 * the pages by calling redirty_page_for_writeback() but that 3383 * the pages by calling redirty_page_for_writepage() but that
3361 * would be ugly in the extreme. So instead we would need to 3384 * would be ugly in the extreme. So instead we would need to
3362 * replicate parts of the code in the above functions, 3385 * replicate parts of the code in the above functions,
3363 * simplifying them becuase we wouldn't actually intend to 3386 * simplifying them becuase we wouldn't actually intend to
@@ -3715,8 +3738,7 @@ static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode)
3715retry: 3738retry:
3716 io_end = ext4_init_io_end(inode, GFP_ATOMIC); 3739 io_end = ext4_init_io_end(inode, GFP_ATOMIC);
3717 if (!io_end) { 3740 if (!io_end) {
3718 if (printk_ratelimit()) 3741 pr_warn_ratelimited("%s: allocation fail\n", __func__);
3719 printk(KERN_WARNING "%s: allocation fail\n", __func__);
3720 schedule(); 3742 schedule();
3721 goto retry; 3743 goto retry;
3722 } 3744 }
@@ -3740,9 +3762,9 @@ retry:
3740 * preallocated extents, and those write extend the file, no need to 3762 * preallocated extents, and those write extend the file, no need to
3741 * fall back to buffered IO. 3763 * fall back to buffered IO.
3742 * 3764 *
3743 * For holes, we fallocate those blocks, mark them as unintialized 3765 * For holes, we fallocate those blocks, mark them as uninitialized
3744 * If those blocks were preallocated, we mark sure they are splited, but 3766 * If those blocks were preallocated, we mark sure they are splited, but
3745 * still keep the range to write as unintialized. 3767 * still keep the range to write as uninitialized.
3746 * 3768 *
3747 * The unwrritten extents will be converted to written when DIO is completed. 3769 * The unwrritten extents will be converted to written when DIO is completed.
3748 * For async direct IO, since the IO may still pending when return, we 3770 * For async direct IO, since the IO may still pending when return, we
@@ -4040,7 +4062,7 @@ int ext4_block_truncate_page(handle_t *handle,
4040 if (ext4_should_journal_data(inode)) { 4062 if (ext4_should_journal_data(inode)) {
4041 err = ext4_handle_dirty_metadata(handle, inode, bh); 4063 err = ext4_handle_dirty_metadata(handle, inode, bh);
4042 } else { 4064 } else {
4043 if (ext4_should_order_data(inode)) 4065 if (ext4_should_order_data(inode) && EXT4_I(inode)->jinode)
4044 err = ext4_jbd2_file_inode(handle, inode); 4066 err = ext4_jbd2_file_inode(handle, inode);
4045 mark_buffer_dirty(bh); 4067 mark_buffer_dirty(bh);
4046 } 4068 }
@@ -4164,6 +4186,7 @@ static int ext4_clear_blocks(handle_t *handle, struct inode *inode,
4164{ 4186{
4165 __le32 *p; 4187 __le32 *p;
4166 int flags = EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_VALIDATED; 4188 int flags = EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_VALIDATED;
4189 int err;
4167 4190
4168 if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) 4191 if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
4169 flags |= EXT4_FREE_BLOCKS_METADATA; 4192 flags |= EXT4_FREE_BLOCKS_METADATA;
@@ -4179,11 +4202,23 @@ static int ext4_clear_blocks(handle_t *handle, struct inode *inode,
4179 if (try_to_extend_transaction(handle, inode)) { 4202 if (try_to_extend_transaction(handle, inode)) {
4180 if (bh) { 4203 if (bh) {
4181 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); 4204 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
4182 ext4_handle_dirty_metadata(handle, inode, bh); 4205 err = ext4_handle_dirty_metadata(handle, inode, bh);
4206 if (unlikely(err)) {
4207 ext4_std_error(inode->i_sb, err);
4208 return 1;
4209 }
4210 }
4211 err = ext4_mark_inode_dirty(handle, inode);
4212 if (unlikely(err)) {
4213 ext4_std_error(inode->i_sb, err);
4214 return 1;
4215 }
4216 err = ext4_truncate_restart_trans(handle, inode,
4217 blocks_for_truncate(inode));
4218 if (unlikely(err)) {
4219 ext4_std_error(inode->i_sb, err);
4220 return 1;
4183 } 4221 }
4184 ext4_mark_inode_dirty(handle, inode);
4185 ext4_truncate_restart_trans(handle, inode,
4186 blocks_for_truncate(inode));
4187 if (bh) { 4222 if (bh) {
4188 BUFFER_TRACE(bh, "retaking write access"); 4223 BUFFER_TRACE(bh, "retaking write access");
4189 ext4_journal_get_write_access(handle, bh); 4224 ext4_journal_get_write_access(handle, bh);
@@ -4344,6 +4379,7 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
4344 (__le32 *) bh->b_data, 4379 (__le32 *) bh->b_data,
4345 (__le32 *) bh->b_data + addr_per_block, 4380 (__le32 *) bh->b_data + addr_per_block,
4346 depth); 4381 depth);
4382 brelse(bh);
4347 4383
4348 /* 4384 /*
4349 * Everything below this this pointer has been 4385 * Everything below this this pointer has been
@@ -4854,7 +4890,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
4854 } 4890 }
4855 inode->i_nlink = le16_to_cpu(raw_inode->i_links_count); 4891 inode->i_nlink = le16_to_cpu(raw_inode->i_links_count);
4856 4892
4857 ei->i_state_flags = 0; 4893 ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */
4858 ei->i_dir_start_lookup = 0; 4894 ei->i_dir_start_lookup = 0;
4859 ei->i_dtime = le32_to_cpu(raw_inode->i_dtime); 4895 ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
4860 /* We now have enough fields to check if the inode was active or not. 4896 /* We now have enough fields to check if the inode was active or not.
@@ -5113,7 +5149,7 @@ static int ext4_do_update_inode(handle_t *handle,
5113 if (ext4_inode_blocks_set(handle, raw_inode, ei)) 5149 if (ext4_inode_blocks_set(handle, raw_inode, ei))
5114 goto out_brelse; 5150 goto out_brelse;
5115 raw_inode->i_dtime = cpu_to_le32(ei->i_dtime); 5151 raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
5116 raw_inode->i_flags = cpu_to_le32(ei->i_flags); 5152 raw_inode->i_flags = cpu_to_le32(ei->i_flags & 0xFFFFFFFF);
5117 if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != 5153 if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
5118 cpu_to_le32(EXT4_OS_HURD)) 5154 cpu_to_le32(EXT4_OS_HURD))
5119 raw_inode->i_file_acl_high = 5155 raw_inode->i_file_acl_high =
@@ -5410,9 +5446,7 @@ int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
5410 * will return the blocks that include the delayed allocation 5446 * will return the blocks that include the delayed allocation
5411 * blocks for this file. 5447 * blocks for this file.
5412 */ 5448 */
5413 spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
5414 delalloc_blocks = EXT4_I(inode)->i_reserved_data_blocks; 5449 delalloc_blocks = EXT4_I(inode)->i_reserved_data_blocks;
5415 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
5416 5450
5417 stat->blocks += (delalloc_blocks << inode->i_sb->s_blocksize_bits)>>9; 5451 stat->blocks += (delalloc_blocks << inode->i_sb->s_blocksize_bits)>>9;
5418 return 0; 5452 return 0;
@@ -5649,6 +5683,7 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
5649 int err, ret; 5683 int err, ret;
5650 5684
5651 might_sleep(); 5685 might_sleep();
5686 trace_ext4_mark_inode_dirty(inode, _RET_IP_);
5652 err = ext4_reserve_inode_write(handle, inode, &iloc); 5687 err = ext4_reserve_inode_write(handle, inode, &iloc);
5653 if (ext4_handle_valid(handle) && 5688 if (ext4_handle_valid(handle) &&
5654 EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize && 5689 EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize &&
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index bf5ae883b1b..eb3bc2fe647 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -331,6 +331,30 @@ mext_out:
331 return err; 331 return err;
332 } 332 }
333 333
334 case FITRIM:
335 {
336 struct super_block *sb = inode->i_sb;
337 struct fstrim_range range;
338 int ret = 0;
339
340 if (!capable(CAP_SYS_ADMIN))
341 return -EPERM;
342
343 if (copy_from_user(&range, (struct fstrim_range *)arg,
344 sizeof(range)))
345 return -EFAULT;
346
347 ret = ext4_trim_fs(sb, &range);
348 if (ret < 0)
349 return ret;
350
351 if (copy_to_user((struct fstrim_range *)arg, &range,
352 sizeof(range)))
353 return -EFAULT;
354
355 return 0;
356 }
357
334 default: 358 default:
335 return -ENOTTY; 359 return -ENOTTY;
336 } 360 }
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index c58eba34724..851f49b2f9d 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2608,18 +2608,12 @@ int ext4_mb_release(struct super_block *sb)
2608static inline int ext4_issue_discard(struct super_block *sb, 2608static inline int ext4_issue_discard(struct super_block *sb,
2609 ext4_group_t block_group, ext4_grpblk_t block, int count) 2609 ext4_group_t block_group, ext4_grpblk_t block, int count)
2610{ 2610{
2611 int ret;
2612 ext4_fsblk_t discard_block; 2611 ext4_fsblk_t discard_block;
2613 2612
2614 discard_block = block + ext4_group_first_block_no(sb, block_group); 2613 discard_block = block + ext4_group_first_block_no(sb, block_group);
2615 trace_ext4_discard_blocks(sb, 2614 trace_ext4_discard_blocks(sb,
2616 (unsigned long long) discard_block, count); 2615 (unsigned long long) discard_block, count);
2617 ret = sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0); 2616 return sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0);
2618 if (ret == -EOPNOTSUPP) {
2619 ext4_warning(sb, "discard not supported, disabling");
2620 clear_opt(EXT4_SB(sb)->s_mount_opt, DISCARD);
2621 }
2622 return ret;
2623} 2617}
2624 2618
2625/* 2619/*
@@ -2631,7 +2625,7 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
2631 struct super_block *sb = journal->j_private; 2625 struct super_block *sb = journal->j_private;
2632 struct ext4_buddy e4b; 2626 struct ext4_buddy e4b;
2633 struct ext4_group_info *db; 2627 struct ext4_group_info *db;
2634 int err, count = 0, count2 = 0; 2628 int err, ret, count = 0, count2 = 0;
2635 struct ext4_free_data *entry; 2629 struct ext4_free_data *entry;
2636 struct list_head *l, *ltmp; 2630 struct list_head *l, *ltmp;
2637 2631
@@ -2641,9 +2635,15 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
2641 mb_debug(1, "gonna free %u blocks in group %u (0x%p):", 2635 mb_debug(1, "gonna free %u blocks in group %u (0x%p):",
2642 entry->count, entry->group, entry); 2636 entry->count, entry->group, entry);
2643 2637
2644 if (test_opt(sb, DISCARD)) 2638 if (test_opt(sb, DISCARD)) {
2645 ext4_issue_discard(sb, entry->group, 2639 ret = ext4_issue_discard(sb, entry->group,
2646 entry->start_blk, entry->count); 2640 entry->start_blk, entry->count);
2641 if (unlikely(ret == -EOPNOTSUPP)) {
2642 ext4_warning(sb, "discard not supported, "
2643 "disabling");
2644 clear_opt(sb, DISCARD);
2645 }
2646 }
2647 2647
2648 err = ext4_mb_load_buddy(sb, entry->group, &e4b); 2648 err = ext4_mb_load_buddy(sb, entry->group, &e4b);
2649 /* we expect to find existing buddy because it's pinned */ 2649 /* we expect to find existing buddy because it's pinned */
@@ -3881,19 +3881,6 @@ repeat:
3881 } 3881 }
3882} 3882}
3883 3883
3884/*
3885 * finds all preallocated spaces and return blocks being freed to them
3886 * if preallocated space becomes full (no block is used from the space)
3887 * then the function frees space in buddy
3888 * XXX: at the moment, truncate (which is the only way to free blocks)
3889 * discards all preallocations
3890 */
3891static void ext4_mb_return_to_preallocation(struct inode *inode,
3892 struct ext4_buddy *e4b,
3893 sector_t block, int count)
3894{
3895 BUG_ON(!list_empty(&EXT4_I(inode)->i_prealloc_list));
3896}
3897#ifdef CONFIG_EXT4_DEBUG 3884#ifdef CONFIG_EXT4_DEBUG
3898static void ext4_mb_show_ac(struct ext4_allocation_context *ac) 3885static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
3899{ 3886{
@@ -4283,7 +4270,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
4283 * EDQUOT check, as blocks and quotas have been already 4270 * EDQUOT check, as blocks and quotas have been already
4284 * reserved when data being copied into pagecache. 4271 * reserved when data being copied into pagecache.
4285 */ 4272 */
4286 if (EXT4_I(ar->inode)->i_delalloc_reserved_flag) 4273 if (ext4_test_inode_state(ar->inode, EXT4_STATE_DELALLOC_RESERVED))
4287 ar->flags |= EXT4_MB_DELALLOC_RESERVED; 4274 ar->flags |= EXT4_MB_DELALLOC_RESERVED;
4288 else { 4275 else {
4289 /* Without delayed allocation we need to verify 4276 /* Without delayed allocation we need to verify
@@ -4380,7 +4367,8 @@ out:
4380 if (inquota && ar->len < inquota) 4367 if (inquota && ar->len < inquota)
4381 dquot_free_block(ar->inode, inquota - ar->len); 4368 dquot_free_block(ar->inode, inquota - ar->len);
4382 if (!ar->len) { 4369 if (!ar->len) {
4383 if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag) 4370 if (!ext4_test_inode_state(ar->inode,
4371 EXT4_STATE_DELALLOC_RESERVED))
4384 /* release all the reserved blocks if non delalloc */ 4372 /* release all the reserved blocks if non delalloc */
4385 percpu_counter_sub(&sbi->s_dirtyblocks_counter, 4373 percpu_counter_sub(&sbi->s_dirtyblocks_counter,
4386 reserv_blks); 4374 reserv_blks);
@@ -4626,7 +4614,11 @@ do_more:
4626 * blocks being freed are metadata. these blocks shouldn't 4614 * blocks being freed are metadata. these blocks shouldn't
4627 * be used until this transaction is committed 4615 * be used until this transaction is committed
4628 */ 4616 */
4629 new_entry = kmem_cache_alloc(ext4_free_ext_cachep, GFP_NOFS); 4617 new_entry = kmem_cache_alloc(ext4_free_ext_cachep, GFP_NOFS);
4618 if (!new_entry) {
4619 err = -ENOMEM;
4620 goto error_return;
4621 }
4630 new_entry->start_blk = bit; 4622 new_entry->start_blk = bit;
4631 new_entry->group = block_group; 4623 new_entry->group = block_group;
4632 new_entry->count = count; 4624 new_entry->count = count;
@@ -4640,12 +4632,9 @@ do_more:
4640 * with group lock held. generate_buddy look at 4632 * with group lock held. generate_buddy look at
4641 * them with group lock_held 4633 * them with group lock_held
4642 */ 4634 */
4643 if (test_opt(sb, DISCARD))
4644 ext4_issue_discard(sb, block_group, bit, count);
4645 ext4_lock_group(sb, block_group); 4635 ext4_lock_group(sb, block_group);
4646 mb_clear_bits(bitmap_bh->b_data, bit, count); 4636 mb_clear_bits(bitmap_bh->b_data, bit, count);
4647 mb_free_blocks(inode, &e4b, bit, count); 4637 mb_free_blocks(inode, &e4b, bit, count);
4648 ext4_mb_return_to_preallocation(inode, &e4b, block, count);
4649 } 4638 }
4650 4639
4651 ret = ext4_free_blks_count(sb, gdp) + count; 4640 ret = ext4_free_blks_count(sb, gdp) + count;
@@ -4720,8 +4709,6 @@ static int ext4_trim_extent(struct super_block *sb, int start, int count,
4720 ext4_unlock_group(sb, group); 4709 ext4_unlock_group(sb, group);
4721 4710
4722 ret = ext4_issue_discard(sb, group, start, count); 4711 ret = ext4_issue_discard(sb, group, start, count);
4723 if (ret)
4724 ext4_std_error(sb, ret);
4725 4712
4726 ext4_lock_group(sb, group); 4713 ext4_lock_group(sb, group);
4727 mb_free_blocks(NULL, e4b, start, ex.fe_len); 4714 mb_free_blocks(NULL, e4b, start, ex.fe_len);
@@ -4821,6 +4808,8 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
4821 ext4_group_t group, ngroups = ext4_get_groups_count(sb); 4808 ext4_group_t group, ngroups = ext4_get_groups_count(sb);
4822 ext4_grpblk_t cnt = 0, first_block, last_block; 4809 ext4_grpblk_t cnt = 0, first_block, last_block;
4823 uint64_t start, len, minlen, trimmed; 4810 uint64_t start, len, minlen, trimmed;
4811 ext4_fsblk_t first_data_blk =
4812 le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
4824 int ret = 0; 4813 int ret = 0;
4825 4814
4826 start = range->start >> sb->s_blocksize_bits; 4815 start = range->start >> sb->s_blocksize_bits;
@@ -4830,6 +4819,10 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
4830 4819
4831 if (unlikely(minlen > EXT4_BLOCKS_PER_GROUP(sb))) 4820 if (unlikely(minlen > EXT4_BLOCKS_PER_GROUP(sb)))
4832 return -EINVAL; 4821 return -EINVAL;
4822 if (start < first_data_blk) {
4823 len -= first_data_blk - start;
4824 start = first_data_blk;
4825 }
4833 4826
4834 /* Determine first and last group to examine based on start and len */ 4827 /* Determine first and last group to examine based on start and len */
4835 ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) start, 4828 ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) start,
@@ -4853,7 +4846,7 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
4853 if (len >= EXT4_BLOCKS_PER_GROUP(sb)) 4846 if (len >= EXT4_BLOCKS_PER_GROUP(sb))
4854 len -= (EXT4_BLOCKS_PER_GROUP(sb) - first_block); 4847 len -= (EXT4_BLOCKS_PER_GROUP(sb) - first_block);
4855 else 4848 else
4856 last_block = len; 4849 last_block = first_block + len;
4857 4850
4858 if (e4b.bd_info->bb_free >= minlen) { 4851 if (e4b.bd_info->bb_free >= minlen) {
4859 cnt = ext4_trim_all_free(sb, &e4b, first_block, 4852 cnt = ext4_trim_all_free(sb, &e4b, first_block,
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index 25f3a974b72..b0a126f23c2 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -496,7 +496,7 @@ int ext4_ext_migrate(struct inode *inode)
496 goal = (((inode->i_ino - 1) / EXT4_INODES_PER_GROUP(inode->i_sb)) * 496 goal = (((inode->i_ino - 1) / EXT4_INODES_PER_GROUP(inode->i_sb)) *
497 EXT4_INODES_PER_GROUP(inode->i_sb)) + 1; 497 EXT4_INODES_PER_GROUP(inode->i_sb)) + 1;
498 tmp_inode = ext4_new_inode(handle, inode->i_sb->s_root->d_inode, 498 tmp_inode = ext4_new_inode(handle, inode->i_sb->s_root->d_inode,
499 S_IFREG, 0, goal); 499 S_IFREG, NULL, goal);
500 if (IS_ERR(tmp_inode)) { 500 if (IS_ERR(tmp_inode)) {
501 retval = -ENOMEM; 501 retval = -ENOMEM;
502 ext4_journal_stop(handle); 502 ext4_journal_stop(handle);
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 92203b8a099..5485390d32c 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -581,9 +581,9 @@ static int htree_dirblock_to_tree(struct file *dir_file,
581 dir->i_sb->s_blocksize - 581 dir->i_sb->s_blocksize -
582 EXT4_DIR_REC_LEN(0)); 582 EXT4_DIR_REC_LEN(0));
583 for (; de < top; de = ext4_next_entry(de, dir->i_sb->s_blocksize)) { 583 for (; de < top; de = ext4_next_entry(de, dir->i_sb->s_blocksize)) {
584 if (!ext4_check_dir_entry(dir, de, bh, 584 if (ext4_check_dir_entry(dir, NULL, de, bh,
585 (block<<EXT4_BLOCK_SIZE_BITS(dir->i_sb)) 585 (block<<EXT4_BLOCK_SIZE_BITS(dir->i_sb))
586 +((char *)de - bh->b_data))) { 586 + ((char *)de - bh->b_data))) {
587 /* On error, skip the f_pos to the next block. */ 587 /* On error, skip the f_pos to the next block. */
588 dir_file->f_pos = (dir_file->f_pos | 588 dir_file->f_pos = (dir_file->f_pos |
589 (dir->i_sb->s_blocksize - 1)) + 1; 589 (dir->i_sb->s_blocksize - 1)) + 1;
@@ -820,7 +820,7 @@ static inline int search_dirblock(struct buffer_head *bh,
820 if ((char *) de + namelen <= dlimit && 820 if ((char *) de + namelen <= dlimit &&
821 ext4_match (namelen, name, de)) { 821 ext4_match (namelen, name, de)) {
822 /* found a match - just to be sure, do a full check */ 822 /* found a match - just to be sure, do a full check */
823 if (!ext4_check_dir_entry(dir, de, bh, offset)) 823 if (ext4_check_dir_entry(dir, NULL, de, bh, offset))
824 return -1; 824 return -1;
825 *res_dir = de; 825 *res_dir = de;
826 return 1; 826 return 1;
@@ -872,7 +872,7 @@ static struct buffer_head * ext4_find_entry (struct inode *dir,
872 if (namelen > EXT4_NAME_LEN) 872 if (namelen > EXT4_NAME_LEN)
873 return NULL; 873 return NULL;
874 if ((namelen <= 2) && (name[0] == '.') && 874 if ((namelen <= 2) && (name[0] == '.') &&
875 (name[1] == '.' || name[1] == '0')) { 875 (name[1] == '.' || name[1] == '\0')) {
876 /* 876 /*
877 * "." or ".." will only be in the first block 877 * "." or ".." will only be in the first block
878 * NFS may look up ".."; "." should be handled by the VFS 878 * NFS may look up ".."; "." should be handled by the VFS
@@ -1036,7 +1036,7 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, stru
1036 return ERR_PTR(-EIO); 1036 return ERR_PTR(-EIO);
1037 } 1037 }
1038 inode = ext4_iget(dir->i_sb, ino); 1038 inode = ext4_iget(dir->i_sb, ino);
1039 if (unlikely(IS_ERR(inode))) { 1039 if (IS_ERR(inode)) {
1040 if (PTR_ERR(inode) == -ESTALE) { 1040 if (PTR_ERR(inode) == -ESTALE) {
1041 EXT4_ERROR_INODE(dir, 1041 EXT4_ERROR_INODE(dir,
1042 "deleted inode referenced: %u", 1042 "deleted inode referenced: %u",
@@ -1269,7 +1269,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
1269 de = (struct ext4_dir_entry_2 *)bh->b_data; 1269 de = (struct ext4_dir_entry_2 *)bh->b_data;
1270 top = bh->b_data + blocksize - reclen; 1270 top = bh->b_data + blocksize - reclen;
1271 while ((char *) de <= top) { 1271 while ((char *) de <= top) {
1272 if (!ext4_check_dir_entry(dir, de, bh, offset)) 1272 if (ext4_check_dir_entry(dir, NULL, de, bh, offset))
1273 return -EIO; 1273 return -EIO;
1274 if (ext4_match(namelen, name, de)) 1274 if (ext4_match(namelen, name, de))
1275 return -EEXIST; 1275 return -EEXIST;
@@ -1602,7 +1602,11 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
1602 if (err) 1602 if (err)
1603 goto journal_error; 1603 goto journal_error;
1604 } 1604 }
1605 ext4_handle_dirty_metadata(handle, inode, frames[0].bh); 1605 err = ext4_handle_dirty_metadata(handle, inode, frames[0].bh);
1606 if (err) {
1607 ext4_std_error(inode->i_sb, err);
1608 goto cleanup;
1609 }
1606 } 1610 }
1607 de = do_split(handle, dir, &bh, frame, &hinfo, &err); 1611 de = do_split(handle, dir, &bh, frame, &hinfo, &err);
1608 if (!de) 1612 if (!de)
@@ -1630,17 +1634,21 @@ static int ext4_delete_entry(handle_t *handle,
1630{ 1634{
1631 struct ext4_dir_entry_2 *de, *pde; 1635 struct ext4_dir_entry_2 *de, *pde;
1632 unsigned int blocksize = dir->i_sb->s_blocksize; 1636 unsigned int blocksize = dir->i_sb->s_blocksize;
1633 int i; 1637 int i, err;
1634 1638
1635 i = 0; 1639 i = 0;
1636 pde = NULL; 1640 pde = NULL;
1637 de = (struct ext4_dir_entry_2 *) bh->b_data; 1641 de = (struct ext4_dir_entry_2 *) bh->b_data;
1638 while (i < bh->b_size) { 1642 while (i < bh->b_size) {
1639 if (!ext4_check_dir_entry(dir, de, bh, i)) 1643 if (ext4_check_dir_entry(dir, NULL, de, bh, i))
1640 return -EIO; 1644 return -EIO;
1641 if (de == de_del) { 1645 if (de == de_del) {
1642 BUFFER_TRACE(bh, "get_write_access"); 1646 BUFFER_TRACE(bh, "get_write_access");
1643 ext4_journal_get_write_access(handle, bh); 1647 err = ext4_journal_get_write_access(handle, bh);
1648 if (unlikely(err)) {
1649 ext4_std_error(dir->i_sb, err);
1650 return err;
1651 }
1644 if (pde) 1652 if (pde)
1645 pde->rec_len = ext4_rec_len_to_disk( 1653 pde->rec_len = ext4_rec_len_to_disk(
1646 ext4_rec_len_from_disk(pde->rec_len, 1654 ext4_rec_len_from_disk(pde->rec_len,
@@ -1652,7 +1660,11 @@ static int ext4_delete_entry(handle_t *handle,
1652 de->inode = 0; 1660 de->inode = 0;
1653 dir->i_version++; 1661 dir->i_version++;
1654 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); 1662 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
1655 ext4_handle_dirty_metadata(handle, dir, bh); 1663 err = ext4_handle_dirty_metadata(handle, dir, bh);
1664 if (unlikely(err)) {
1665 ext4_std_error(dir->i_sb, err);
1666 return err;
1667 }
1656 return 0; 1668 return 0;
1657 } 1669 }
1658 i += ext4_rec_len_from_disk(de->rec_len, blocksize); 1670 i += ext4_rec_len_from_disk(de->rec_len, blocksize);
@@ -1789,7 +1801,7 @@ static int ext4_mkdir(struct inode *dir, struct dentry *dentry, int mode)
1789{ 1801{
1790 handle_t *handle; 1802 handle_t *handle;
1791 struct inode *inode; 1803 struct inode *inode;
1792 struct buffer_head *dir_block; 1804 struct buffer_head *dir_block = NULL;
1793 struct ext4_dir_entry_2 *de; 1805 struct ext4_dir_entry_2 *de;
1794 unsigned int blocksize = dir->i_sb->s_blocksize; 1806 unsigned int blocksize = dir->i_sb->s_blocksize;
1795 int err, retries = 0; 1807 int err, retries = 0;
@@ -1822,7 +1834,9 @@ retry:
1822 if (!dir_block) 1834 if (!dir_block)
1823 goto out_clear_inode; 1835 goto out_clear_inode;
1824 BUFFER_TRACE(dir_block, "get_write_access"); 1836 BUFFER_TRACE(dir_block, "get_write_access");
1825 ext4_journal_get_write_access(handle, dir_block); 1837 err = ext4_journal_get_write_access(handle, dir_block);
1838 if (err)
1839 goto out_clear_inode;
1826 de = (struct ext4_dir_entry_2 *) dir_block->b_data; 1840 de = (struct ext4_dir_entry_2 *) dir_block->b_data;
1827 de->inode = cpu_to_le32(inode->i_ino); 1841 de->inode = cpu_to_le32(inode->i_ino);
1828 de->name_len = 1; 1842 de->name_len = 1;
@@ -1839,10 +1853,12 @@ retry:
1839 ext4_set_de_type(dir->i_sb, de, S_IFDIR); 1853 ext4_set_de_type(dir->i_sb, de, S_IFDIR);
1840 inode->i_nlink = 2; 1854 inode->i_nlink = 2;
1841 BUFFER_TRACE(dir_block, "call ext4_handle_dirty_metadata"); 1855 BUFFER_TRACE(dir_block, "call ext4_handle_dirty_metadata");
1842 ext4_handle_dirty_metadata(handle, dir, dir_block); 1856 err = ext4_handle_dirty_metadata(handle, dir, dir_block);
1843 brelse(dir_block); 1857 if (err)
1844 ext4_mark_inode_dirty(handle, inode); 1858 goto out_clear_inode;
1845 err = ext4_add_entry(handle, dentry, inode); 1859 err = ext4_mark_inode_dirty(handle, inode);
1860 if (!err)
1861 err = ext4_add_entry(handle, dentry, inode);
1846 if (err) { 1862 if (err) {
1847out_clear_inode: 1863out_clear_inode:
1848 clear_nlink(inode); 1864 clear_nlink(inode);
@@ -1853,10 +1869,13 @@ out_clear_inode:
1853 } 1869 }
1854 ext4_inc_count(handle, dir); 1870 ext4_inc_count(handle, dir);
1855 ext4_update_dx_flag(dir); 1871 ext4_update_dx_flag(dir);
1856 ext4_mark_inode_dirty(handle, dir); 1872 err = ext4_mark_inode_dirty(handle, dir);
1873 if (err)
1874 goto out_clear_inode;
1857 d_instantiate(dentry, inode); 1875 d_instantiate(dentry, inode);
1858 unlock_new_inode(inode); 1876 unlock_new_inode(inode);
1859out_stop: 1877out_stop:
1878 brelse(dir_block);
1860 ext4_journal_stop(handle); 1879 ext4_journal_stop(handle);
1861 if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) 1880 if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
1862 goto retry; 1881 goto retry;
@@ -1919,7 +1938,7 @@ static int empty_dir(struct inode *inode)
1919 } 1938 }
1920 de = (struct ext4_dir_entry_2 *) bh->b_data; 1939 de = (struct ext4_dir_entry_2 *) bh->b_data;
1921 } 1940 }
1922 if (!ext4_check_dir_entry(inode, de, bh, offset)) { 1941 if (ext4_check_dir_entry(inode, NULL, de, bh, offset)) {
1923 de = (struct ext4_dir_entry_2 *)(bh->b_data + 1942 de = (struct ext4_dir_entry_2 *)(bh->b_data +
1924 sb->s_blocksize); 1943 sb->s_blocksize);
1925 offset = (offset | (sb->s_blocksize - 1)) + 1; 1944 offset = (offset | (sb->s_blocksize - 1)) + 1;
@@ -2407,7 +2426,11 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
2407 ext4_current_time(new_dir); 2426 ext4_current_time(new_dir);
2408 ext4_mark_inode_dirty(handle, new_dir); 2427 ext4_mark_inode_dirty(handle, new_dir);
2409 BUFFER_TRACE(new_bh, "call ext4_handle_dirty_metadata"); 2428 BUFFER_TRACE(new_bh, "call ext4_handle_dirty_metadata");
2410 ext4_handle_dirty_metadata(handle, new_dir, new_bh); 2429 retval = ext4_handle_dirty_metadata(handle, new_dir, new_bh);
2430 if (unlikely(retval)) {
2431 ext4_std_error(new_dir->i_sb, retval);
2432 goto end_rename;
2433 }
2411 brelse(new_bh); 2434 brelse(new_bh);
2412 new_bh = NULL; 2435 new_bh = NULL;
2413 } 2436 }
@@ -2459,7 +2482,11 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
2459 PARENT_INO(dir_bh->b_data, new_dir->i_sb->s_blocksize) = 2482 PARENT_INO(dir_bh->b_data, new_dir->i_sb->s_blocksize) =
2460 cpu_to_le32(new_dir->i_ino); 2483 cpu_to_le32(new_dir->i_ino);
2461 BUFFER_TRACE(dir_bh, "call ext4_handle_dirty_metadata"); 2484 BUFFER_TRACE(dir_bh, "call ext4_handle_dirty_metadata");
2462 ext4_handle_dirty_metadata(handle, old_dir, dir_bh); 2485 retval = ext4_handle_dirty_metadata(handle, old_dir, dir_bh);
2486 if (retval) {
2487 ext4_std_error(old_dir->i_sb, retval);
2488 goto end_rename;
2489 }
2463 ext4_dec_count(handle, old_dir); 2490 ext4_dec_count(handle, old_dir);
2464 if (new_inode) { 2491 if (new_inode) {
2465 /* checked empty_dir above, can't have another parent, 2492 /* checked empty_dir above, can't have another parent,
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 46a7d6a9d97..7270dcfca92 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -32,16 +32,24 @@
32 32
33static struct kmem_cache *io_page_cachep, *io_end_cachep; 33static struct kmem_cache *io_page_cachep, *io_end_cachep;
34 34
35#define WQ_HASH_SZ 37
36#define to_ioend_wq(v) (&ioend_wq[((unsigned long)v) % WQ_HASH_SZ])
37static wait_queue_head_t ioend_wq[WQ_HASH_SZ];
38
35int __init ext4_init_pageio(void) 39int __init ext4_init_pageio(void)
36{ 40{
41 int i;
42
37 io_page_cachep = KMEM_CACHE(ext4_io_page, SLAB_RECLAIM_ACCOUNT); 43 io_page_cachep = KMEM_CACHE(ext4_io_page, SLAB_RECLAIM_ACCOUNT);
38 if (io_page_cachep == NULL) 44 if (io_page_cachep == NULL)
39 return -ENOMEM; 45 return -ENOMEM;
40 io_end_cachep = KMEM_CACHE(ext4_io_end, SLAB_RECLAIM_ACCOUNT); 46 io_end_cachep = KMEM_CACHE(ext4_io_end, SLAB_RECLAIM_ACCOUNT);
41 if (io_page_cachep == NULL) { 47 if (io_end_cachep == NULL) {
42 kmem_cache_destroy(io_page_cachep); 48 kmem_cache_destroy(io_page_cachep);
43 return -ENOMEM; 49 return -ENOMEM;
44 } 50 }
51 for (i = 0; i < WQ_HASH_SZ; i++)
52 init_waitqueue_head(&ioend_wq[i]);
45 53
46 return 0; 54 return 0;
47} 55}
@@ -52,24 +60,37 @@ void ext4_exit_pageio(void)
52 kmem_cache_destroy(io_page_cachep); 60 kmem_cache_destroy(io_page_cachep);
53} 61}
54 62
63void ext4_ioend_wait(struct inode *inode)
64{
65 wait_queue_head_t *wq = to_ioend_wq(inode);
66
67 wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_ioend_count) == 0));
68}
69
70static void put_io_page(struct ext4_io_page *io_page)
71{
72 if (atomic_dec_and_test(&io_page->p_count)) {
73 end_page_writeback(io_page->p_page);
74 put_page(io_page->p_page);
75 kmem_cache_free(io_page_cachep, io_page);
76 }
77}
78
55void ext4_free_io_end(ext4_io_end_t *io) 79void ext4_free_io_end(ext4_io_end_t *io)
56{ 80{
57 int i; 81 int i;
82 wait_queue_head_t *wq;
58 83
59 BUG_ON(!io); 84 BUG_ON(!io);
60 if (io->page) 85 if (io->page)
61 put_page(io->page); 86 put_page(io->page);
62 for (i = 0; i < io->num_io_pages; i++) { 87 for (i = 0; i < io->num_io_pages; i++)
63 if (--io->pages[i]->p_count == 0) { 88 put_io_page(io->pages[i]);
64 struct page *page = io->pages[i]->p_page;
65
66 end_page_writeback(page);
67 put_page(page);
68 kmem_cache_free(io_page_cachep, io->pages[i]);
69 }
70 }
71 io->num_io_pages = 0; 89 io->num_io_pages = 0;
72 iput(io->inode); 90 wq = to_ioend_wq(io->inode);
91 if (atomic_dec_and_test(&EXT4_I(io->inode)->i_ioend_count) &&
92 waitqueue_active(wq))
93 wake_up_all(wq);
73 kmem_cache_free(io_end_cachep, io); 94 kmem_cache_free(io_end_cachep, io);
74} 95}
75 96
@@ -137,13 +158,10 @@ static void ext4_end_io_work(struct work_struct *work)
137 158
138ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags) 159ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags)
139{ 160{
140 ext4_io_end_t *io = NULL; 161 ext4_io_end_t *io = kmem_cache_zalloc(io_end_cachep, flags);
141
142 io = kmem_cache_alloc(io_end_cachep, flags);
143 if (io) { 162 if (io) {
144 memset(io, 0, sizeof(*io)); 163 atomic_inc(&EXT4_I(inode)->i_ioend_count);
145 io->inode = igrab(inode); 164 io->inode = inode;
146 BUG_ON(!io->inode);
147 INIT_WORK(&io->work, ext4_end_io_work); 165 INIT_WORK(&io->work, ext4_end_io_work);
148 INIT_LIST_HEAD(&io->list); 166 INIT_LIST_HEAD(&io->list);
149 } 167 }
@@ -171,35 +189,15 @@ static void ext4_end_bio(struct bio *bio, int error)
171 struct workqueue_struct *wq; 189 struct workqueue_struct *wq;
172 struct inode *inode; 190 struct inode *inode;
173 unsigned long flags; 191 unsigned long flags;
174 ext4_fsblk_t err_block;
175 int i; 192 int i;
176 193
177 BUG_ON(!io_end); 194 BUG_ON(!io_end);
178 inode = io_end->inode;
179 bio->bi_private = NULL; 195 bio->bi_private = NULL;
180 bio->bi_end_io = NULL; 196 bio->bi_end_io = NULL;
181 if (test_bit(BIO_UPTODATE, &bio->bi_flags)) 197 if (test_bit(BIO_UPTODATE, &bio->bi_flags))
182 error = 0; 198 error = 0;
183 err_block = bio->bi_sector >> (inode->i_blkbits - 9);
184 bio_put(bio); 199 bio_put(bio);
185 200
186 if (!(inode->i_sb->s_flags & MS_ACTIVE)) {
187 pr_err("sb umounted, discard end_io request for inode %lu\n",
188 io_end->inode->i_ino);
189 ext4_free_io_end(io_end);
190 return;
191 }
192
193 if (error) {
194 io_end->flag |= EXT4_IO_END_ERROR;
195 ext4_warning(inode->i_sb, "I/O error writing to inode %lu "
196 "(offset %llu size %ld starting block %llu)",
197 inode->i_ino,
198 (unsigned long long) io_end->offset,
199 (long) io_end->size,
200 (unsigned long long) err_block);
201 }
202
203 for (i = 0; i < io_end->num_io_pages; i++) { 201 for (i = 0; i < io_end->num_io_pages; i++) {
204 struct page *page = io_end->pages[i]->p_page; 202 struct page *page = io_end->pages[i]->p_page;
205 struct buffer_head *bh, *head; 203 struct buffer_head *bh, *head;
@@ -236,14 +234,6 @@ static void ext4_end_bio(struct bio *bio, int error)
236 } while (bh != head); 234 } while (bh != head);
237 } 235 }
238 236
239 if (--io_end->pages[i]->p_count == 0) {
240 struct page *page = io_end->pages[i]->p_page;
241
242 end_page_writeback(page);
243 put_page(page);
244 kmem_cache_free(io_page_cachep, io_end->pages[i]);
245 }
246
247 /* 237 /*
248 * If this is a partial write which happened to make 238 * If this is a partial write which happened to make
249 * all buffers uptodate then we can optimize away a 239 * all buffers uptodate then we can optimize away a
@@ -253,9 +243,22 @@ static void ext4_end_bio(struct bio *bio, int error)
253 */ 243 */
254 if (!partial_write) 244 if (!partial_write)
255 SetPageUptodate(page); 245 SetPageUptodate(page);
256 }
257 246
247 put_io_page(io_end->pages[i]);
248 }
258 io_end->num_io_pages = 0; 249 io_end->num_io_pages = 0;
250 inode = io_end->inode;
251
252 if (error) {
253 io_end->flag |= EXT4_IO_END_ERROR;
254 ext4_warning(inode->i_sb, "I/O error writing to inode %lu "
255 "(offset %llu size %ld starting block %llu)",
256 inode->i_ino,
257 (unsigned long long) io_end->offset,
258 (long) io_end->size,
259 (unsigned long long)
260 bio->bi_sector >> (inode->i_blkbits - 9));
261 }
259 262
260 /* Add the io_end to per-inode completed io list*/ 263 /* Add the io_end to per-inode completed io list*/
261 spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags); 264 spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
@@ -305,7 +308,6 @@ static int io_submit_init(struct ext4_io_submit *io,
305 bio->bi_private = io->io_end = io_end; 308 bio->bi_private = io->io_end = io_end;
306 bio->bi_end_io = ext4_end_bio; 309 bio->bi_end_io = ext4_end_bio;
307 310
308 io_end->inode = inode;
309 io_end->offset = (page->index << PAGE_CACHE_SHIFT) + bh_offset(bh); 311 io_end->offset = (page->index << PAGE_CACHE_SHIFT) + bh_offset(bh);
310 312
311 io->io_bio = bio; 313 io->io_bio = bio;
@@ -360,7 +362,7 @@ submit_and_retry:
360 if ((io_end->num_io_pages == 0) || 362 if ((io_end->num_io_pages == 0) ||
361 (io_end->pages[io_end->num_io_pages-1] != io_page)) { 363 (io_end->pages[io_end->num_io_pages-1] != io_page)) {
362 io_end->pages[io_end->num_io_pages++] = io_page; 364 io_end->pages[io_end->num_io_pages++] = io_page;
363 io_page->p_count++; 365 atomic_inc(&io_page->p_count);
364 } 366 }
365 return 0; 367 return 0;
366} 368}
@@ -389,7 +391,7 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
389 return -ENOMEM; 391 return -ENOMEM;
390 } 392 }
391 io_page->p_page = page; 393 io_page->p_page = page;
392 io_page->p_count = 0; 394 atomic_set(&io_page->p_count, 1);
393 get_page(page); 395 get_page(page);
394 396
395 for (bh = head = page_buffers(page), block_start = 0; 397 for (bh = head = page_buffers(page), block_start = 0;
@@ -421,10 +423,6 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
421 * PageWriteback bit from the page to prevent the system from 423 * PageWriteback bit from the page to prevent the system from
422 * wedging later on. 424 * wedging later on.
423 */ 425 */
424 if (io_page->p_count == 0) { 426 put_io_page(io_page);
425 put_page(page);
426 end_page_writeback(page);
427 kmem_cache_free(io_page_cachep, io_page);
428 }
429 return ret; 427 return ret;
430} 428}
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index dc963929de6..3ecc6e45d2f 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -220,7 +220,11 @@ static int setup_new_group_blocks(struct super_block *sb,
220 memcpy(gdb->b_data, sbi->s_group_desc[i]->b_data, gdb->b_size); 220 memcpy(gdb->b_data, sbi->s_group_desc[i]->b_data, gdb->b_size);
221 set_buffer_uptodate(gdb); 221 set_buffer_uptodate(gdb);
222 unlock_buffer(gdb); 222 unlock_buffer(gdb);
223 ext4_handle_dirty_metadata(handle, NULL, gdb); 223 err = ext4_handle_dirty_metadata(handle, NULL, gdb);
224 if (unlikely(err)) {
225 brelse(gdb);
226 goto exit_bh;
227 }
224 ext4_set_bit(bit, bh->b_data); 228 ext4_set_bit(bit, bh->b_data);
225 brelse(gdb); 229 brelse(gdb);
226 } 230 }
@@ -232,6 +236,8 @@ static int setup_new_group_blocks(struct super_block *sb,
232 GFP_NOFS); 236 GFP_NOFS);
233 if (err) 237 if (err)
234 goto exit_bh; 238 goto exit_bh;
239 for (i = 0, bit = gdblocks + 1; i < reserved_gdb; i++, bit++)
240 ext4_set_bit(bit, bh->b_data);
235 241
236 ext4_debug("mark block bitmap %#04llx (+%llu)\n", input->block_bitmap, 242 ext4_debug("mark block bitmap %#04llx (+%llu)\n", input->block_bitmap,
237 input->block_bitmap - start); 243 input->block_bitmap - start);
@@ -247,13 +253,20 @@ static int setup_new_group_blocks(struct super_block *sb,
247 err = sb_issue_zeroout(sb, block, sbi->s_itb_per_group, GFP_NOFS); 253 err = sb_issue_zeroout(sb, block, sbi->s_itb_per_group, GFP_NOFS);
248 if (err) 254 if (err)
249 goto exit_bh; 255 goto exit_bh;
256 for (i = 0, bit = input->inode_table - start;
257 i < sbi->s_itb_per_group; i++, bit++)
258 ext4_set_bit(bit, bh->b_data);
250 259
251 if ((err = extend_or_restart_transaction(handle, 2, bh))) 260 if ((err = extend_or_restart_transaction(handle, 2, bh)))
252 goto exit_bh; 261 goto exit_bh;
253 262
254 ext4_mark_bitmap_end(input->blocks_count, sb->s_blocksize * 8, 263 ext4_mark_bitmap_end(input->blocks_count, sb->s_blocksize * 8,
255 bh->b_data); 264 bh->b_data);
256 ext4_handle_dirty_metadata(handle, NULL, bh); 265 err = ext4_handle_dirty_metadata(handle, NULL, bh);
266 if (unlikely(err)) {
267 ext4_std_error(sb, err);
268 goto exit_bh;
269 }
257 brelse(bh); 270 brelse(bh);
258 /* Mark unused entries in inode bitmap used */ 271 /* Mark unused entries in inode bitmap used */
259 ext4_debug("clear inode bitmap %#04llx (+%llu)\n", 272 ext4_debug("clear inode bitmap %#04llx (+%llu)\n",
@@ -265,7 +278,9 @@ static int setup_new_group_blocks(struct super_block *sb,
265 278
266 ext4_mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8, 279 ext4_mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8,
267 bh->b_data); 280 bh->b_data);
268 ext4_handle_dirty_metadata(handle, NULL, bh); 281 err = ext4_handle_dirty_metadata(handle, NULL, bh);
282 if (unlikely(err))
283 ext4_std_error(sb, err);
269exit_bh: 284exit_bh:
270 brelse(bh); 285 brelse(bh);
271 286
@@ -417,17 +432,21 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
417 goto exit_dind; 432 goto exit_dind;
418 } 433 }
419 434
420 if ((err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh))) 435 err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh);
436 if (unlikely(err))
421 goto exit_dind; 437 goto exit_dind;
422 438
423 if ((err = ext4_journal_get_write_access(handle, *primary))) 439 err = ext4_journal_get_write_access(handle, *primary);
440 if (unlikely(err))
424 goto exit_sbh; 441 goto exit_sbh;
425 442
426 if ((err = ext4_journal_get_write_access(handle, dind))) 443 err = ext4_journal_get_write_access(handle, dind);
427 goto exit_primary; 444 if (unlikely(err))
445 ext4_std_error(sb, err);
428 446
429 /* ext4_reserve_inode_write() gets a reference on the iloc */ 447 /* ext4_reserve_inode_write() gets a reference on the iloc */
430 if ((err = ext4_reserve_inode_write(handle, inode, &iloc))) 448 err = ext4_reserve_inode_write(handle, inode, &iloc);
449 if (unlikely(err))
431 goto exit_dindj; 450 goto exit_dindj;
432 451
433 n_group_desc = kmalloc((gdb_num + 1) * sizeof(struct buffer_head *), 452 n_group_desc = kmalloc((gdb_num + 1) * sizeof(struct buffer_head *),
@@ -449,12 +468,20 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
449 * reserved inode, and will become GDT blocks (primary and backup). 468 * reserved inode, and will become GDT blocks (primary and backup).
450 */ 469 */
451 data[gdb_num % EXT4_ADDR_PER_BLOCK(sb)] = 0; 470 data[gdb_num % EXT4_ADDR_PER_BLOCK(sb)] = 0;
452 ext4_handle_dirty_metadata(handle, NULL, dind); 471 err = ext4_handle_dirty_metadata(handle, NULL, dind);
453 brelse(dind); 472 if (unlikely(err)) {
473 ext4_std_error(sb, err);
474 goto exit_inode;
475 }
454 inode->i_blocks -= (gdbackups + 1) * sb->s_blocksize >> 9; 476 inode->i_blocks -= (gdbackups + 1) * sb->s_blocksize >> 9;
455 ext4_mark_iloc_dirty(handle, inode, &iloc); 477 ext4_mark_iloc_dirty(handle, inode, &iloc);
456 memset((*primary)->b_data, 0, sb->s_blocksize); 478 memset((*primary)->b_data, 0, sb->s_blocksize);
457 ext4_handle_dirty_metadata(handle, NULL, *primary); 479 err = ext4_handle_dirty_metadata(handle, NULL, *primary);
480 if (unlikely(err)) {
481 ext4_std_error(sb, err);
482 goto exit_inode;
483 }
484 brelse(dind);
458 485
459 o_group_desc = EXT4_SB(sb)->s_group_desc; 486 o_group_desc = EXT4_SB(sb)->s_group_desc;
460 memcpy(n_group_desc, o_group_desc, 487 memcpy(n_group_desc, o_group_desc,
@@ -465,19 +492,19 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
465 kfree(o_group_desc); 492 kfree(o_group_desc);
466 493
467 le16_add_cpu(&es->s_reserved_gdt_blocks, -1); 494 le16_add_cpu(&es->s_reserved_gdt_blocks, -1);
468 ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh); 495 err = ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh);
496 if (err)
497 ext4_std_error(sb, err);
469 498
470 return 0; 499 return err;
471 500
472exit_inode: 501exit_inode:
473 /* ext4_journal_release_buffer(handle, iloc.bh); */ 502 /* ext4_journal_release_buffer(handle, iloc.bh); */
474 brelse(iloc.bh); 503 brelse(iloc.bh);
475exit_dindj: 504exit_dindj:
476 /* ext4_journal_release_buffer(handle, dind); */ 505 /* ext4_journal_release_buffer(handle, dind); */
477exit_primary:
478 /* ext4_journal_release_buffer(handle, *primary); */
479exit_sbh: 506exit_sbh:
480 /* ext4_journal_release_buffer(handle, *primary); */ 507 /* ext4_journal_release_buffer(handle, EXT4_SB(sb)->s_sbh); */
481exit_dind: 508exit_dind:
482 brelse(dind); 509 brelse(dind);
483exit_bh: 510exit_bh:
@@ -660,7 +687,9 @@ static void update_backups(struct super_block *sb,
660 memset(bh->b_data + size, 0, rest); 687 memset(bh->b_data + size, 0, rest);
661 set_buffer_uptodate(bh); 688 set_buffer_uptodate(bh);
662 unlock_buffer(bh); 689 unlock_buffer(bh);
663 ext4_handle_dirty_metadata(handle, NULL, bh); 690 err = ext4_handle_dirty_metadata(handle, NULL, bh);
691 if (unlikely(err))
692 ext4_std_error(sb, err);
664 brelse(bh); 693 brelse(bh);
665 } 694 }
666 if ((err2 = ext4_journal_stop(handle)) && !err) 695 if ((err2 = ext4_journal_stop(handle)) && !err)
@@ -878,7 +907,11 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
878 /* Update the global fs size fields */ 907 /* Update the global fs size fields */
879 sbi->s_groups_count++; 908 sbi->s_groups_count++;
880 909
881 ext4_handle_dirty_metadata(handle, NULL, primary); 910 err = ext4_handle_dirty_metadata(handle, NULL, primary);
911 if (unlikely(err)) {
912 ext4_std_error(sb, err);
913 goto exit_journal;
914 }
882 915
883 /* Update the reserved block counts only once the new group is 916 /* Update the reserved block counts only once the new group is
884 * active. */ 917 * active. */
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 40131b777af..48ce561fafa 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -388,13 +388,14 @@ static void ext4_handle_error(struct super_block *sb)
388void __ext4_error(struct super_block *sb, const char *function, 388void __ext4_error(struct super_block *sb, const char *function,
389 unsigned int line, const char *fmt, ...) 389 unsigned int line, const char *fmt, ...)
390{ 390{
391 struct va_format vaf;
391 va_list args; 392 va_list args;
392 393
393 va_start(args, fmt); 394 va_start(args, fmt);
394 printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: comm %s: ", 395 vaf.fmt = fmt;
395 sb->s_id, function, line, current->comm); 396 vaf.va = &args;
396 vprintk(fmt, args); 397 printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: comm %s: %pV\n",
397 printk("\n"); 398 sb->s_id, function, line, current->comm, &vaf);
398 va_end(args); 399 va_end(args);
399 400
400 ext4_handle_error(sb); 401 ext4_handle_error(sb);
@@ -405,28 +406,31 @@ void ext4_error_inode(struct inode *inode, const char *function,
405 const char *fmt, ...) 406 const char *fmt, ...)
406{ 407{
407 va_list args; 408 va_list args;
409 struct va_format vaf;
408 struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es; 410 struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
409 411
410 es->s_last_error_ino = cpu_to_le32(inode->i_ino); 412 es->s_last_error_ino = cpu_to_le32(inode->i_ino);
411 es->s_last_error_block = cpu_to_le64(block); 413 es->s_last_error_block = cpu_to_le64(block);
412 save_error_info(inode->i_sb, function, line); 414 save_error_info(inode->i_sb, function, line);
413 va_start(args, fmt); 415 va_start(args, fmt);
416 vaf.fmt = fmt;
417 vaf.va = &args;
414 printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: inode #%lu: ", 418 printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: inode #%lu: ",
415 inode->i_sb->s_id, function, line, inode->i_ino); 419 inode->i_sb->s_id, function, line, inode->i_ino);
416 if (block) 420 if (block)
417 printk("block %llu: ", block); 421 printk(KERN_CONT "block %llu: ", block);
418 printk("comm %s: ", current->comm); 422 printk(KERN_CONT "comm %s: %pV\n", current->comm, &vaf);
419 vprintk(fmt, args);
420 printk("\n");
421 va_end(args); 423 va_end(args);
422 424
423 ext4_handle_error(inode->i_sb); 425 ext4_handle_error(inode->i_sb);
424} 426}
425 427
426void ext4_error_file(struct file *file, const char *function, 428void ext4_error_file(struct file *file, const char *function,
427 unsigned int line, const char *fmt, ...) 429 unsigned int line, ext4_fsblk_t block,
430 const char *fmt, ...)
428{ 431{
429 va_list args; 432 va_list args;
433 struct va_format vaf;
430 struct ext4_super_block *es; 434 struct ext4_super_block *es;
431 struct inode *inode = file->f_dentry->d_inode; 435 struct inode *inode = file->f_dentry->d_inode;
432 char pathname[80], *path; 436 char pathname[80], *path;
@@ -434,17 +438,18 @@ void ext4_error_file(struct file *file, const char *function,
434 es = EXT4_SB(inode->i_sb)->s_es; 438 es = EXT4_SB(inode->i_sb)->s_es;
435 es->s_last_error_ino = cpu_to_le32(inode->i_ino); 439 es->s_last_error_ino = cpu_to_le32(inode->i_ino);
436 save_error_info(inode->i_sb, function, line); 440 save_error_info(inode->i_sb, function, line);
437 va_start(args, fmt);
438 path = d_path(&(file->f_path), pathname, sizeof(pathname)); 441 path = d_path(&(file->f_path), pathname, sizeof(pathname));
439 if (!path) 442 if (IS_ERR(path))
440 path = "(unknown)"; 443 path = "(unknown)";
441 printk(KERN_CRIT 444 printk(KERN_CRIT
442 "EXT4-fs error (device %s): %s:%d: inode #%lu " 445 "EXT4-fs error (device %s): %s:%d: inode #%lu: ",
443 "(comm %s path %s): ", 446 inode->i_sb->s_id, function, line, inode->i_ino);
444 inode->i_sb->s_id, function, line, inode->i_ino, 447 if (block)
445 current->comm, path); 448 printk(KERN_CONT "block %llu: ", block);
446 vprintk(fmt, args); 449 va_start(args, fmt);
447 printk("\n"); 450 vaf.fmt = fmt;
451 vaf.va = &args;
452 printk(KERN_CONT "comm %s: path %s: %pV\n", current->comm, path, &vaf);
448 va_end(args); 453 va_end(args);
449 454
450 ext4_handle_error(inode->i_sb); 455 ext4_handle_error(inode->i_sb);
@@ -543,28 +548,29 @@ void __ext4_abort(struct super_block *sb, const char *function,
543 panic("EXT4-fs panic from previous error\n"); 548 panic("EXT4-fs panic from previous error\n");
544} 549}
545 550
546void ext4_msg (struct super_block * sb, const char *prefix, 551void ext4_msg(struct super_block *sb, const char *prefix, const char *fmt, ...)
547 const char *fmt, ...)
548{ 552{
553 struct va_format vaf;
549 va_list args; 554 va_list args;
550 555
551 va_start(args, fmt); 556 va_start(args, fmt);
552 printk("%sEXT4-fs (%s): ", prefix, sb->s_id); 557 vaf.fmt = fmt;
553 vprintk(fmt, args); 558 vaf.va = &args;
554 printk("\n"); 559 printk("%sEXT4-fs (%s): %pV\n", prefix, sb->s_id, &vaf);
555 va_end(args); 560 va_end(args);
556} 561}
557 562
558void __ext4_warning(struct super_block *sb, const char *function, 563void __ext4_warning(struct super_block *sb, const char *function,
559 unsigned int line, const char *fmt, ...) 564 unsigned int line, const char *fmt, ...)
560{ 565{
566 struct va_format vaf;
561 va_list args; 567 va_list args;
562 568
563 va_start(args, fmt); 569 va_start(args, fmt);
564 printk(KERN_WARNING "EXT4-fs warning (device %s): %s:%d: ", 570 vaf.fmt = fmt;
565 sb->s_id, function, line); 571 vaf.va = &args;
566 vprintk(fmt, args); 572 printk(KERN_WARNING "EXT4-fs warning (device %s): %s:%d: %pV\n",
567 printk("\n"); 573 sb->s_id, function, line, &vaf);
568 va_end(args); 574 va_end(args);
569} 575}
570 576
@@ -575,21 +581,25 @@ void __ext4_grp_locked_error(const char *function, unsigned int line,
575__releases(bitlock) 581__releases(bitlock)
576__acquires(bitlock) 582__acquires(bitlock)
577{ 583{
584 struct va_format vaf;
578 va_list args; 585 va_list args;
579 struct ext4_super_block *es = EXT4_SB(sb)->s_es; 586 struct ext4_super_block *es = EXT4_SB(sb)->s_es;
580 587
581 es->s_last_error_ino = cpu_to_le32(ino); 588 es->s_last_error_ino = cpu_to_le32(ino);
582 es->s_last_error_block = cpu_to_le64(block); 589 es->s_last_error_block = cpu_to_le64(block);
583 __save_error_info(sb, function, line); 590 __save_error_info(sb, function, line);
591
584 va_start(args, fmt); 592 va_start(args, fmt);
593
594 vaf.fmt = fmt;
595 vaf.va = &args;
585 printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: group %u", 596 printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: group %u",
586 sb->s_id, function, line, grp); 597 sb->s_id, function, line, grp);
587 if (ino) 598 if (ino)
588 printk("inode %lu: ", ino); 599 printk(KERN_CONT "inode %lu: ", ino);
589 if (block) 600 if (block)
590 printk("block %llu:", (unsigned long long) block); 601 printk(KERN_CONT "block %llu:", (unsigned long long) block);
591 vprintk(fmt, args); 602 printk(KERN_CONT "%pV\n", &vaf);
592 printk("\n");
593 va_end(args); 603 va_end(args);
594 604
595 if (test_opt(sb, ERRORS_CONT)) { 605 if (test_opt(sb, ERRORS_CONT)) {
@@ -647,7 +657,7 @@ static struct block_device *ext4_blkdev_get(dev_t dev, struct super_block *sb)
647 struct block_device *bdev; 657 struct block_device *bdev;
648 char b[BDEVNAME_SIZE]; 658 char b[BDEVNAME_SIZE];
649 659
650 bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE); 660 bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, sb);
651 if (IS_ERR(bdev)) 661 if (IS_ERR(bdev))
652 goto fail; 662 goto fail;
653 return bdev; 663 return bdev;
@@ -663,8 +673,7 @@ fail:
663 */ 673 */
664static int ext4_blkdev_put(struct block_device *bdev) 674static int ext4_blkdev_put(struct block_device *bdev)
665{ 675{
666 bd_release(bdev); 676 return blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
667 return blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
668} 677}
669 678
670static int ext4_blkdev_remove(struct ext4_sb_info *sbi) 679static int ext4_blkdev_remove(struct ext4_sb_info *sbi)
@@ -808,32 +817,43 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
808 memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache)); 817 memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache));
809 INIT_LIST_HEAD(&ei->i_prealloc_list); 818 INIT_LIST_HEAD(&ei->i_prealloc_list);
810 spin_lock_init(&ei->i_prealloc_lock); 819 spin_lock_init(&ei->i_prealloc_lock);
811 /*
812 * Note: We can be called before EXT4_SB(sb)->s_journal is set,
813 * therefore it can be null here. Don't check it, just initialize
814 * jinode.
815 */
816 jbd2_journal_init_jbd_inode(&ei->jinode, &ei->vfs_inode);
817 ei->i_reserved_data_blocks = 0; 820 ei->i_reserved_data_blocks = 0;
818 ei->i_reserved_meta_blocks = 0; 821 ei->i_reserved_meta_blocks = 0;
819 ei->i_allocated_meta_blocks = 0; 822 ei->i_allocated_meta_blocks = 0;
820 ei->i_da_metadata_calc_len = 0; 823 ei->i_da_metadata_calc_len = 0;
821 ei->i_delalloc_reserved_flag = 0;
822 spin_lock_init(&(ei->i_block_reservation_lock)); 824 spin_lock_init(&(ei->i_block_reservation_lock));
823#ifdef CONFIG_QUOTA 825#ifdef CONFIG_QUOTA
824 ei->i_reserved_quota = 0; 826 ei->i_reserved_quota = 0;
825#endif 827#endif
828 ei->jinode = NULL;
826 INIT_LIST_HEAD(&ei->i_completed_io_list); 829 INIT_LIST_HEAD(&ei->i_completed_io_list);
827 spin_lock_init(&ei->i_completed_io_lock); 830 spin_lock_init(&ei->i_completed_io_lock);
828 ei->cur_aio_dio = NULL; 831 ei->cur_aio_dio = NULL;
829 ei->i_sync_tid = 0; 832 ei->i_sync_tid = 0;
830 ei->i_datasync_tid = 0; 833 ei->i_datasync_tid = 0;
834 atomic_set(&ei->i_ioend_count, 0);
831 835
832 return &ei->vfs_inode; 836 return &ei->vfs_inode;
833} 837}
834 838
839static int ext4_drop_inode(struct inode *inode)
840{
841 int drop = generic_drop_inode(inode);
842
843 trace_ext4_drop_inode(inode, drop);
844 return drop;
845}
846
847static void ext4_i_callback(struct rcu_head *head)
848{
849 struct inode *inode = container_of(head, struct inode, i_rcu);
850 INIT_LIST_HEAD(&inode->i_dentry);
851 kmem_cache_free(ext4_inode_cachep, EXT4_I(inode));
852}
853
835static void ext4_destroy_inode(struct inode *inode) 854static void ext4_destroy_inode(struct inode *inode)
836{ 855{
856 ext4_ioend_wait(inode);
837 if (!list_empty(&(EXT4_I(inode)->i_orphan))) { 857 if (!list_empty(&(EXT4_I(inode)->i_orphan))) {
838 ext4_msg(inode->i_sb, KERN_ERR, 858 ext4_msg(inode->i_sb, KERN_ERR,
839 "Inode %lu (%p): orphan list check failed!", 859 "Inode %lu (%p): orphan list check failed!",
@@ -843,7 +863,7 @@ static void ext4_destroy_inode(struct inode *inode)
843 true); 863 true);
844 dump_stack(); 864 dump_stack();
845 } 865 }
846 kmem_cache_free(ext4_inode_cachep, EXT4_I(inode)); 866 call_rcu(&inode->i_rcu, ext4_i_callback);
847} 867}
848 868
849static void init_once(void *foo) 869static void init_once(void *foo)
@@ -881,9 +901,12 @@ void ext4_clear_inode(struct inode *inode)
881 end_writeback(inode); 901 end_writeback(inode);
882 dquot_drop(inode); 902 dquot_drop(inode);
883 ext4_discard_preallocations(inode); 903 ext4_discard_preallocations(inode);
884 if (EXT4_JOURNAL(inode)) 904 if (EXT4_I(inode)->jinode) {
885 jbd2_journal_release_jbd_inode(EXT4_SB(inode->i_sb)->s_journal, 905 jbd2_journal_release_jbd_inode(EXT4_JOURNAL(inode),
886 &EXT4_I(inode)->jinode); 906 EXT4_I(inode)->jinode);
907 jbd2_free_inode(EXT4_I(inode)->jinode);
908 EXT4_I(inode)->jinode = NULL;
909 }
887} 910}
888 911
889static inline void ext4_show_quota_options(struct seq_file *seq, 912static inline void ext4_show_quota_options(struct seq_file *seq,
@@ -1016,6 +1039,8 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
1016 !(def_mount_opts & EXT4_DEFM_NODELALLOC)) 1039 !(def_mount_opts & EXT4_DEFM_NODELALLOC))
1017 seq_puts(seq, ",nodelalloc"); 1040 seq_puts(seq, ",nodelalloc");
1018 1041
1042 if (test_opt(sb, MBLK_IO_SUBMIT))
1043 seq_puts(seq, ",mblk_io_submit");
1019 if (sbi->s_stripe) 1044 if (sbi->s_stripe)
1020 seq_printf(seq, ",stripe=%lu", sbi->s_stripe); 1045 seq_printf(seq, ",stripe=%lu", sbi->s_stripe);
1021 /* 1046 /*
@@ -1136,7 +1161,7 @@ static int ext4_release_dquot(struct dquot *dquot);
1136static int ext4_mark_dquot_dirty(struct dquot *dquot); 1161static int ext4_mark_dquot_dirty(struct dquot *dquot);
1137static int ext4_write_info(struct super_block *sb, int type); 1162static int ext4_write_info(struct super_block *sb, int type);
1138static int ext4_quota_on(struct super_block *sb, int type, int format_id, 1163static int ext4_quota_on(struct super_block *sb, int type, int format_id,
1139 char *path); 1164 struct path *path);
1140static int ext4_quota_off(struct super_block *sb, int type); 1165static int ext4_quota_off(struct super_block *sb, int type);
1141static int ext4_quota_on_mount(struct super_block *sb, int type); 1166static int ext4_quota_on_mount(struct super_block *sb, int type);
1142static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data, 1167static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data,
@@ -1173,6 +1198,7 @@ static const struct super_operations ext4_sops = {
1173 .destroy_inode = ext4_destroy_inode, 1198 .destroy_inode = ext4_destroy_inode,
1174 .write_inode = ext4_write_inode, 1199 .write_inode = ext4_write_inode,
1175 .dirty_inode = ext4_dirty_inode, 1200 .dirty_inode = ext4_dirty_inode,
1201 .drop_inode = ext4_drop_inode,
1176 .evict_inode = ext4_evict_inode, 1202 .evict_inode = ext4_evict_inode,
1177 .put_super = ext4_put_super, 1203 .put_super = ext4_put_super,
1178 .sync_fs = ext4_sync_fs, 1204 .sync_fs = ext4_sync_fs,
@@ -1186,7 +1212,6 @@ static const struct super_operations ext4_sops = {
1186 .quota_write = ext4_quota_write, 1212 .quota_write = ext4_quota_write,
1187#endif 1213#endif
1188 .bdev_try_to_free_page = bdev_try_to_free_page, 1214 .bdev_try_to_free_page = bdev_try_to_free_page,
1189 .trim_fs = ext4_trim_fs
1190}; 1215};
1191 1216
1192static const struct super_operations ext4_nojournal_sops = { 1217static const struct super_operations ext4_nojournal_sops = {
@@ -1194,6 +1219,7 @@ static const struct super_operations ext4_nojournal_sops = {
1194 .destroy_inode = ext4_destroy_inode, 1219 .destroy_inode = ext4_destroy_inode,
1195 .write_inode = ext4_write_inode, 1220 .write_inode = ext4_write_inode,
1196 .dirty_inode = ext4_dirty_inode, 1221 .dirty_inode = ext4_dirty_inode,
1222 .drop_inode = ext4_drop_inode,
1197 .evict_inode = ext4_evict_inode, 1223 .evict_inode = ext4_evict_inode,
1198 .write_super = ext4_write_super, 1224 .write_super = ext4_write_super,
1199 .put_super = ext4_put_super, 1225 .put_super = ext4_put_super,
@@ -1228,8 +1254,8 @@ enum {
1228 Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota, 1254 Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota,
1229 Opt_noquota, Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err, 1255 Opt_noquota, Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err,
1230 Opt_resize, Opt_usrquota, Opt_grpquota, Opt_i_version, 1256 Opt_resize, Opt_usrquota, Opt_grpquota, Opt_i_version,
1231 Opt_stripe, Opt_delalloc, Opt_nodelalloc, 1257 Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_mblk_io_submit,
1232 Opt_block_validity, Opt_noblock_validity, 1258 Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity,
1233 Opt_inode_readahead_blks, Opt_journal_ioprio, 1259 Opt_inode_readahead_blks, Opt_journal_ioprio,
1234 Opt_dioread_nolock, Opt_dioread_lock, 1260 Opt_dioread_nolock, Opt_dioread_lock,
1235 Opt_discard, Opt_nodiscard, 1261 Opt_discard, Opt_nodiscard,
@@ -1293,6 +1319,8 @@ static const match_table_t tokens = {
1293 {Opt_resize, "resize"}, 1319 {Opt_resize, "resize"},
1294 {Opt_delalloc, "delalloc"}, 1320 {Opt_delalloc, "delalloc"},
1295 {Opt_nodelalloc, "nodelalloc"}, 1321 {Opt_nodelalloc, "nodelalloc"},
1322 {Opt_mblk_io_submit, "mblk_io_submit"},
1323 {Opt_nomblk_io_submit, "nomblk_io_submit"},
1296 {Opt_block_validity, "block_validity"}, 1324 {Opt_block_validity, "block_validity"},
1297 {Opt_noblock_validity, "noblock_validity"}, 1325 {Opt_noblock_validity, "noblock_validity"},
1298 {Opt_inode_readahead_blks, "inode_readahead_blks=%u"}, 1326 {Opt_inode_readahead_blks, "inode_readahead_blks=%u"},
@@ -1371,7 +1399,7 @@ static int set_qf_name(struct super_block *sb, int qtype, substring_t *args)
1371 sbi->s_qf_names[qtype] = NULL; 1399 sbi->s_qf_names[qtype] = NULL;
1372 return 0; 1400 return 0;
1373 } 1401 }
1374 set_opt(sbi->s_mount_opt, QUOTA); 1402 set_opt(sb, QUOTA);
1375 return 1; 1403 return 1;
1376} 1404}
1377 1405
@@ -1426,21 +1454,21 @@ static int parse_options(char *options, struct super_block *sb,
1426 switch (token) { 1454 switch (token) {
1427 case Opt_bsd_df: 1455 case Opt_bsd_df:
1428 ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38"); 1456 ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38");
1429 clear_opt(sbi->s_mount_opt, MINIX_DF); 1457 clear_opt(sb, MINIX_DF);
1430 break; 1458 break;
1431 case Opt_minix_df: 1459 case Opt_minix_df:
1432 ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38"); 1460 ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38");
1433 set_opt(sbi->s_mount_opt, MINIX_DF); 1461 set_opt(sb, MINIX_DF);
1434 1462
1435 break; 1463 break;
1436 case Opt_grpid: 1464 case Opt_grpid:
1437 ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38"); 1465 ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38");
1438 set_opt(sbi->s_mount_opt, GRPID); 1466 set_opt(sb, GRPID);
1439 1467
1440 break; 1468 break;
1441 case Opt_nogrpid: 1469 case Opt_nogrpid:
1442 ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38"); 1470 ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38");
1443 clear_opt(sbi->s_mount_opt, GRPID); 1471 clear_opt(sb, GRPID);
1444 1472
1445 break; 1473 break;
1446 case Opt_resuid: 1474 case Opt_resuid:
@@ -1458,38 +1486,38 @@ static int parse_options(char *options, struct super_block *sb,
1458 /* *sb_block = match_int(&args[0]); */ 1486 /* *sb_block = match_int(&args[0]); */
1459 break; 1487 break;
1460 case Opt_err_panic: 1488 case Opt_err_panic:
1461 clear_opt(sbi->s_mount_opt, ERRORS_CONT); 1489 clear_opt(sb, ERRORS_CONT);
1462 clear_opt(sbi->s_mount_opt, ERRORS_RO); 1490 clear_opt(sb, ERRORS_RO);
1463 set_opt(sbi->s_mount_opt, ERRORS_PANIC); 1491 set_opt(sb, ERRORS_PANIC);
1464 break; 1492 break;
1465 case Opt_err_ro: 1493 case Opt_err_ro:
1466 clear_opt(sbi->s_mount_opt, ERRORS_CONT); 1494 clear_opt(sb, ERRORS_CONT);
1467 clear_opt(sbi->s_mount_opt, ERRORS_PANIC); 1495 clear_opt(sb, ERRORS_PANIC);
1468 set_opt(sbi->s_mount_opt, ERRORS_RO); 1496 set_opt(sb, ERRORS_RO);
1469 break; 1497 break;
1470 case Opt_err_cont: 1498 case Opt_err_cont:
1471 clear_opt(sbi->s_mount_opt, ERRORS_RO); 1499 clear_opt(sb, ERRORS_RO);
1472 clear_opt(sbi->s_mount_opt, ERRORS_PANIC); 1500 clear_opt(sb, ERRORS_PANIC);
1473 set_opt(sbi->s_mount_opt, ERRORS_CONT); 1501 set_opt(sb, ERRORS_CONT);
1474 break; 1502 break;
1475 case Opt_nouid32: 1503 case Opt_nouid32:
1476 set_opt(sbi->s_mount_opt, NO_UID32); 1504 set_opt(sb, NO_UID32);
1477 break; 1505 break;
1478 case Opt_debug: 1506 case Opt_debug:
1479 set_opt(sbi->s_mount_opt, DEBUG); 1507 set_opt(sb, DEBUG);
1480 break; 1508 break;
1481 case Opt_oldalloc: 1509 case Opt_oldalloc:
1482 set_opt(sbi->s_mount_opt, OLDALLOC); 1510 set_opt(sb, OLDALLOC);
1483 break; 1511 break;
1484 case Opt_orlov: 1512 case Opt_orlov:
1485 clear_opt(sbi->s_mount_opt, OLDALLOC); 1513 clear_opt(sb, OLDALLOC);
1486 break; 1514 break;
1487#ifdef CONFIG_EXT4_FS_XATTR 1515#ifdef CONFIG_EXT4_FS_XATTR
1488 case Opt_user_xattr: 1516 case Opt_user_xattr:
1489 set_opt(sbi->s_mount_opt, XATTR_USER); 1517 set_opt(sb, XATTR_USER);
1490 break; 1518 break;
1491 case Opt_nouser_xattr: 1519 case Opt_nouser_xattr:
1492 clear_opt(sbi->s_mount_opt, XATTR_USER); 1520 clear_opt(sb, XATTR_USER);
1493 break; 1521 break;
1494#else 1522#else
1495 case Opt_user_xattr: 1523 case Opt_user_xattr:
@@ -1499,10 +1527,10 @@ static int parse_options(char *options, struct super_block *sb,
1499#endif 1527#endif
1500#ifdef CONFIG_EXT4_FS_POSIX_ACL 1528#ifdef CONFIG_EXT4_FS_POSIX_ACL
1501 case Opt_acl: 1529 case Opt_acl:
1502 set_opt(sbi->s_mount_opt, POSIX_ACL); 1530 set_opt(sb, POSIX_ACL);
1503 break; 1531 break;
1504 case Opt_noacl: 1532 case Opt_noacl:
1505 clear_opt(sbi->s_mount_opt, POSIX_ACL); 1533 clear_opt(sb, POSIX_ACL);
1506 break; 1534 break;
1507#else 1535#else
1508 case Opt_acl: 1536 case Opt_acl:
@@ -1521,7 +1549,7 @@ static int parse_options(char *options, struct super_block *sb,
1521 "Cannot specify journal on remount"); 1549 "Cannot specify journal on remount");
1522 return 0; 1550 return 0;
1523 } 1551 }
1524 set_opt(sbi->s_mount_opt, UPDATE_JOURNAL); 1552 set_opt(sb, UPDATE_JOURNAL);
1525 break; 1553 break;
1526 case Opt_journal_dev: 1554 case Opt_journal_dev:
1527 if (is_remount) { 1555 if (is_remount) {
@@ -1534,14 +1562,14 @@ static int parse_options(char *options, struct super_block *sb,
1534 *journal_devnum = option; 1562 *journal_devnum = option;
1535 break; 1563 break;
1536 case Opt_journal_checksum: 1564 case Opt_journal_checksum:
1537 set_opt(sbi->s_mount_opt, JOURNAL_CHECKSUM); 1565 set_opt(sb, JOURNAL_CHECKSUM);
1538 break; 1566 break;
1539 case Opt_journal_async_commit: 1567 case Opt_journal_async_commit:
1540 set_opt(sbi->s_mount_opt, JOURNAL_ASYNC_COMMIT); 1568 set_opt(sb, JOURNAL_ASYNC_COMMIT);
1541 set_opt(sbi->s_mount_opt, JOURNAL_CHECKSUM); 1569 set_opt(sb, JOURNAL_CHECKSUM);
1542 break; 1570 break;
1543 case Opt_noload: 1571 case Opt_noload:
1544 set_opt(sbi->s_mount_opt, NOLOAD); 1572 set_opt(sb, NOLOAD);
1545 break; 1573 break;
1546 case Opt_commit: 1574 case Opt_commit:
1547 if (match_int(&args[0], &option)) 1575 if (match_int(&args[0], &option))
@@ -1584,15 +1612,15 @@ static int parse_options(char *options, struct super_block *sb,
1584 return 0; 1612 return 0;
1585 } 1613 }
1586 } else { 1614 } else {
1587 clear_opt(sbi->s_mount_opt, DATA_FLAGS); 1615 clear_opt(sb, DATA_FLAGS);
1588 sbi->s_mount_opt |= data_opt; 1616 sbi->s_mount_opt |= data_opt;
1589 } 1617 }
1590 break; 1618 break;
1591 case Opt_data_err_abort: 1619 case Opt_data_err_abort:
1592 set_opt(sbi->s_mount_opt, DATA_ERR_ABORT); 1620 set_opt(sb, DATA_ERR_ABORT);
1593 break; 1621 break;
1594 case Opt_data_err_ignore: 1622 case Opt_data_err_ignore:
1595 clear_opt(sbi->s_mount_opt, DATA_ERR_ABORT); 1623 clear_opt(sb, DATA_ERR_ABORT);
1596 break; 1624 break;
1597#ifdef CONFIG_QUOTA 1625#ifdef CONFIG_QUOTA
1598 case Opt_usrjquota: 1626 case Opt_usrjquota:
@@ -1632,12 +1660,12 @@ set_qf_format:
1632 break; 1660 break;
1633 case Opt_quota: 1661 case Opt_quota:
1634 case Opt_usrquota: 1662 case Opt_usrquota:
1635 set_opt(sbi->s_mount_opt, QUOTA); 1663 set_opt(sb, QUOTA);
1636 set_opt(sbi->s_mount_opt, USRQUOTA); 1664 set_opt(sb, USRQUOTA);
1637 break; 1665 break;
1638 case Opt_grpquota: 1666 case Opt_grpquota:
1639 set_opt(sbi->s_mount_opt, QUOTA); 1667 set_opt(sb, QUOTA);
1640 set_opt(sbi->s_mount_opt, GRPQUOTA); 1668 set_opt(sb, GRPQUOTA);
1641 break; 1669 break;
1642 case Opt_noquota: 1670 case Opt_noquota:
1643 if (sb_any_quota_loaded(sb)) { 1671 if (sb_any_quota_loaded(sb)) {
@@ -1645,9 +1673,9 @@ set_qf_format:
1645 "options when quota turned on"); 1673 "options when quota turned on");
1646 return 0; 1674 return 0;
1647 } 1675 }
1648 clear_opt(sbi->s_mount_opt, QUOTA); 1676 clear_opt(sb, QUOTA);
1649 clear_opt(sbi->s_mount_opt, USRQUOTA); 1677 clear_opt(sb, USRQUOTA);
1650 clear_opt(sbi->s_mount_opt, GRPQUOTA); 1678 clear_opt(sb, GRPQUOTA);
1651 break; 1679 break;
1652#else 1680#else
1653 case Opt_quota: 1681 case Opt_quota:
@@ -1673,7 +1701,7 @@ set_qf_format:
1673 sbi->s_mount_flags |= EXT4_MF_FS_ABORTED; 1701 sbi->s_mount_flags |= EXT4_MF_FS_ABORTED;
1674 break; 1702 break;
1675 case Opt_nobarrier: 1703 case Opt_nobarrier:
1676 clear_opt(sbi->s_mount_opt, BARRIER); 1704 clear_opt(sb, BARRIER);
1677 break; 1705 break;
1678 case Opt_barrier: 1706 case Opt_barrier:
1679 if (args[0].from) { 1707 if (args[0].from) {
@@ -1682,9 +1710,9 @@ set_qf_format:
1682 } else 1710 } else
1683 option = 1; /* No argument, default to 1 */ 1711 option = 1; /* No argument, default to 1 */
1684 if (option) 1712 if (option)
1685 set_opt(sbi->s_mount_opt, BARRIER); 1713 set_opt(sb, BARRIER);
1686 else 1714 else
1687 clear_opt(sbi->s_mount_opt, BARRIER); 1715 clear_opt(sb, BARRIER);
1688 break; 1716 break;
1689 case Opt_ignore: 1717 case Opt_ignore:
1690 break; 1718 break;
@@ -1708,11 +1736,17 @@ set_qf_format:
1708 "Ignoring deprecated bh option"); 1736 "Ignoring deprecated bh option");
1709 break; 1737 break;
1710 case Opt_i_version: 1738 case Opt_i_version:
1711 set_opt(sbi->s_mount_opt, I_VERSION); 1739 set_opt(sb, I_VERSION);
1712 sb->s_flags |= MS_I_VERSION; 1740 sb->s_flags |= MS_I_VERSION;
1713 break; 1741 break;
1714 case Opt_nodelalloc: 1742 case Opt_nodelalloc:
1715 clear_opt(sbi->s_mount_opt, DELALLOC); 1743 clear_opt(sb, DELALLOC);
1744 break;
1745 case Opt_mblk_io_submit:
1746 set_opt(sb, MBLK_IO_SUBMIT);
1747 break;
1748 case Opt_nomblk_io_submit:
1749 clear_opt(sb, MBLK_IO_SUBMIT);
1716 break; 1750 break;
1717 case Opt_stripe: 1751 case Opt_stripe:
1718 if (match_int(&args[0], &option)) 1752 if (match_int(&args[0], &option))
@@ -1722,13 +1756,13 @@ set_qf_format:
1722 sbi->s_stripe = option; 1756 sbi->s_stripe = option;
1723 break; 1757 break;
1724 case Opt_delalloc: 1758 case Opt_delalloc:
1725 set_opt(sbi->s_mount_opt, DELALLOC); 1759 set_opt(sb, DELALLOC);
1726 break; 1760 break;
1727 case Opt_block_validity: 1761 case Opt_block_validity:
1728 set_opt(sbi->s_mount_opt, BLOCK_VALIDITY); 1762 set_opt(sb, BLOCK_VALIDITY);
1729 break; 1763 break;
1730 case Opt_noblock_validity: 1764 case Opt_noblock_validity:
1731 clear_opt(sbi->s_mount_opt, BLOCK_VALIDITY); 1765 clear_opt(sb, BLOCK_VALIDITY);
1732 break; 1766 break;
1733 case Opt_inode_readahead_blks: 1767 case Opt_inode_readahead_blks:
1734 if (match_int(&args[0], &option)) 1768 if (match_int(&args[0], &option))
@@ -1752,7 +1786,7 @@ set_qf_format:
1752 option); 1786 option);
1753 break; 1787 break;
1754 case Opt_noauto_da_alloc: 1788 case Opt_noauto_da_alloc:
1755 set_opt(sbi->s_mount_opt,NO_AUTO_DA_ALLOC); 1789 set_opt(sb, NO_AUTO_DA_ALLOC);
1756 break; 1790 break;
1757 case Opt_auto_da_alloc: 1791 case Opt_auto_da_alloc:
1758 if (args[0].from) { 1792 if (args[0].from) {
@@ -1761,24 +1795,24 @@ set_qf_format:
1761 } else 1795 } else
1762 option = 1; /* No argument, default to 1 */ 1796 option = 1; /* No argument, default to 1 */
1763 if (option) 1797 if (option)
1764 clear_opt(sbi->s_mount_opt, NO_AUTO_DA_ALLOC); 1798 clear_opt(sb, NO_AUTO_DA_ALLOC);
1765 else 1799 else
1766 set_opt(sbi->s_mount_opt,NO_AUTO_DA_ALLOC); 1800 set_opt(sb,NO_AUTO_DA_ALLOC);
1767 break; 1801 break;
1768 case Opt_discard: 1802 case Opt_discard:
1769 set_opt(sbi->s_mount_opt, DISCARD); 1803 set_opt(sb, DISCARD);
1770 break; 1804 break;
1771 case Opt_nodiscard: 1805 case Opt_nodiscard:
1772 clear_opt(sbi->s_mount_opt, DISCARD); 1806 clear_opt(sb, DISCARD);
1773 break; 1807 break;
1774 case Opt_dioread_nolock: 1808 case Opt_dioread_nolock:
1775 set_opt(sbi->s_mount_opt, DIOREAD_NOLOCK); 1809 set_opt(sb, DIOREAD_NOLOCK);
1776 break; 1810 break;
1777 case Opt_dioread_lock: 1811 case Opt_dioread_lock:
1778 clear_opt(sbi->s_mount_opt, DIOREAD_NOLOCK); 1812 clear_opt(sb, DIOREAD_NOLOCK);
1779 break; 1813 break;
1780 case Opt_init_inode_table: 1814 case Opt_init_inode_table:
1781 set_opt(sbi->s_mount_opt, INIT_INODE_TABLE); 1815 set_opt(sb, INIT_INODE_TABLE);
1782 if (args[0].from) { 1816 if (args[0].from) {
1783 if (match_int(&args[0], &option)) 1817 if (match_int(&args[0], &option))
1784 return 0; 1818 return 0;
@@ -1789,7 +1823,7 @@ set_qf_format:
1789 sbi->s_li_wait_mult = option; 1823 sbi->s_li_wait_mult = option;
1790 break; 1824 break;
1791 case Opt_noinit_inode_table: 1825 case Opt_noinit_inode_table:
1792 clear_opt(sbi->s_mount_opt, INIT_INODE_TABLE); 1826 clear_opt(sb, INIT_INODE_TABLE);
1793 break; 1827 break;
1794 default: 1828 default:
1795 ext4_msg(sb, KERN_ERR, 1829 ext4_msg(sb, KERN_ERR,
@@ -1801,10 +1835,10 @@ set_qf_format:
1801#ifdef CONFIG_QUOTA 1835#ifdef CONFIG_QUOTA
1802 if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) { 1836 if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) {
1803 if (test_opt(sb, USRQUOTA) && sbi->s_qf_names[USRQUOTA]) 1837 if (test_opt(sb, USRQUOTA) && sbi->s_qf_names[USRQUOTA])
1804 clear_opt(sbi->s_mount_opt, USRQUOTA); 1838 clear_opt(sb, USRQUOTA);
1805 1839
1806 if (test_opt(sb, GRPQUOTA) && sbi->s_qf_names[GRPQUOTA]) 1840 if (test_opt(sb, GRPQUOTA) && sbi->s_qf_names[GRPQUOTA])
1807 clear_opt(sbi->s_mount_opt, GRPQUOTA); 1841 clear_opt(sb, GRPQUOTA);
1808 1842
1809 if (test_opt(sb, GRPQUOTA) || test_opt(sb, USRQUOTA)) { 1843 if (test_opt(sb, GRPQUOTA) || test_opt(sb, USRQUOTA)) {
1810 ext4_msg(sb, KERN_ERR, "old and new quota " 1844 ext4_msg(sb, KERN_ERR, "old and new quota "
@@ -1874,12 +1908,12 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
1874 ext4_commit_super(sb, 1); 1908 ext4_commit_super(sb, 1);
1875 if (test_opt(sb, DEBUG)) 1909 if (test_opt(sb, DEBUG))
1876 printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%u, " 1910 printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%u, "
1877 "bpg=%lu, ipg=%lu, mo=%04x]\n", 1911 "bpg=%lu, ipg=%lu, mo=%04x, mo2=%04x]\n",
1878 sb->s_blocksize, 1912 sb->s_blocksize,
1879 sbi->s_groups_count, 1913 sbi->s_groups_count,
1880 EXT4_BLOCKS_PER_GROUP(sb), 1914 EXT4_BLOCKS_PER_GROUP(sb),
1881 EXT4_INODES_PER_GROUP(sb), 1915 EXT4_INODES_PER_GROUP(sb),
1882 sbi->s_mount_opt); 1916 sbi->s_mount_opt, sbi->s_mount_opt2);
1883 1917
1884 return res; 1918 return res;
1885} 1919}
@@ -1909,14 +1943,13 @@ static int ext4_fill_flex_info(struct super_block *sb)
1909 size = flex_group_count * sizeof(struct flex_groups); 1943 size = flex_group_count * sizeof(struct flex_groups);
1910 sbi->s_flex_groups = kzalloc(size, GFP_KERNEL); 1944 sbi->s_flex_groups = kzalloc(size, GFP_KERNEL);
1911 if (sbi->s_flex_groups == NULL) { 1945 if (sbi->s_flex_groups == NULL) {
1912 sbi->s_flex_groups = vmalloc(size); 1946 sbi->s_flex_groups = vzalloc(size);
1913 if (sbi->s_flex_groups) 1947 if (sbi->s_flex_groups == NULL) {
1914 memset(sbi->s_flex_groups, 0, size); 1948 ext4_msg(sb, KERN_ERR,
1915 } 1949 "not enough memory for %u flex groups",
1916 if (sbi->s_flex_groups == NULL) { 1950 flex_group_count);
1917 ext4_msg(sb, KERN_ERR, "not enough memory for " 1951 goto failed;
1918 "%u flex groups", flex_group_count); 1952 }
1919 goto failed;
1920 } 1953 }
1921 1954
1922 for (i = 0; i < sbi->s_groups_count; i++) { 1955 for (i = 0; i < sbi->s_groups_count; i++) {
@@ -2699,7 +2732,6 @@ static int ext4_lazyinit_thread(void *arg)
2699 struct ext4_li_request *elr; 2732 struct ext4_li_request *elr;
2700 unsigned long next_wakeup; 2733 unsigned long next_wakeup;
2701 DEFINE_WAIT(wait); 2734 DEFINE_WAIT(wait);
2702 int ret;
2703 2735
2704 BUG_ON(NULL == eli); 2736 BUG_ON(NULL == eli);
2705 2737
@@ -2723,13 +2755,12 @@ cont_thread:
2723 elr = list_entry(pos, struct ext4_li_request, 2755 elr = list_entry(pos, struct ext4_li_request,
2724 lr_request); 2756 lr_request);
2725 2757
2726 if (time_after_eq(jiffies, elr->lr_next_sched)) 2758 if (time_after_eq(jiffies, elr->lr_next_sched)) {
2727 ret = ext4_run_li_request(elr); 2759 if (ext4_run_li_request(elr) != 0) {
2728 2760 /* error, remove the lazy_init job */
2729 if (ret) { 2761 ext4_remove_li_request(elr);
2730 ret = 0; 2762 continue;
2731 ext4_remove_li_request(elr); 2763 }
2732 continue;
2733 } 2764 }
2734 2765
2735 if (time_before(elr->lr_next_sched, next_wakeup)) 2766 if (time_before(elr->lr_next_sched, next_wakeup))
@@ -2740,7 +2771,8 @@ cont_thread:
2740 if (freezing(current)) 2771 if (freezing(current))
2741 refrigerator(); 2772 refrigerator();
2742 2773
2743 if (time_after_eq(jiffies, next_wakeup)) { 2774 if ((time_after_eq(jiffies, next_wakeup)) ||
2775 (MAX_JIFFY_OFFSET == next_wakeup)) {
2744 cond_resched(); 2776 cond_resched();
2745 continue; 2777 continue;
2746 } 2778 }
@@ -2788,9 +2820,6 @@ static void ext4_clear_request_list(void)
2788 struct ext4_li_request *elr; 2820 struct ext4_li_request *elr;
2789 2821
2790 mutex_lock(&ext4_li_info->li_list_mtx); 2822 mutex_lock(&ext4_li_info->li_list_mtx);
2791 if (list_empty(&ext4_li_info->li_request_list))
2792 return;
2793
2794 list_for_each_safe(pos, n, &ext4_li_info->li_request_list) { 2823 list_for_each_safe(pos, n, &ext4_li_info->li_request_list) {
2795 elr = list_entry(pos, struct ext4_li_request, 2824 elr = list_entry(pos, struct ext4_li_request,
2796 lr_request); 2825 lr_request);
@@ -2899,7 +2928,7 @@ static int ext4_register_li_request(struct super_block *sb,
2899 struct ext4_sb_info *sbi = EXT4_SB(sb); 2928 struct ext4_sb_info *sbi = EXT4_SB(sb);
2900 struct ext4_li_request *elr; 2929 struct ext4_li_request *elr;
2901 ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count; 2930 ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
2902 int ret; 2931 int ret = 0;
2903 2932
2904 if (sbi->s_li_request != NULL) 2933 if (sbi->s_li_request != NULL)
2905 return 0; 2934 return 0;
@@ -3054,41 +3083,41 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3054 3083
3055 /* Set defaults before we parse the mount options */ 3084 /* Set defaults before we parse the mount options */
3056 def_mount_opts = le32_to_cpu(es->s_default_mount_opts); 3085 def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
3057 set_opt(sbi->s_mount_opt, INIT_INODE_TABLE); 3086 set_opt(sb, INIT_INODE_TABLE);
3058 if (def_mount_opts & EXT4_DEFM_DEBUG) 3087 if (def_mount_opts & EXT4_DEFM_DEBUG)
3059 set_opt(sbi->s_mount_opt, DEBUG); 3088 set_opt(sb, DEBUG);
3060 if (def_mount_opts & EXT4_DEFM_BSDGROUPS) { 3089 if (def_mount_opts & EXT4_DEFM_BSDGROUPS) {
3061 ext4_msg(sb, KERN_WARNING, deprecated_msg, "bsdgroups", 3090 ext4_msg(sb, KERN_WARNING, deprecated_msg, "bsdgroups",
3062 "2.6.38"); 3091 "2.6.38");
3063 set_opt(sbi->s_mount_opt, GRPID); 3092 set_opt(sb, GRPID);
3064 } 3093 }
3065 if (def_mount_opts & EXT4_DEFM_UID16) 3094 if (def_mount_opts & EXT4_DEFM_UID16)
3066 set_opt(sbi->s_mount_opt, NO_UID32); 3095 set_opt(sb, NO_UID32);
3067#ifdef CONFIG_EXT4_FS_XATTR 3096#ifdef CONFIG_EXT4_FS_XATTR
3068 if (def_mount_opts & EXT4_DEFM_XATTR_USER) 3097 if (def_mount_opts & EXT4_DEFM_XATTR_USER)
3069 set_opt(sbi->s_mount_opt, XATTR_USER); 3098 set_opt(sb, XATTR_USER);
3070#endif 3099#endif
3071#ifdef CONFIG_EXT4_FS_POSIX_ACL 3100#ifdef CONFIG_EXT4_FS_POSIX_ACL
3072 if (def_mount_opts & EXT4_DEFM_ACL) 3101 if (def_mount_opts & EXT4_DEFM_ACL)
3073 set_opt(sbi->s_mount_opt, POSIX_ACL); 3102 set_opt(sb, POSIX_ACL);
3074#endif 3103#endif
3075 if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_DATA) 3104 if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_DATA)
3076 set_opt(sbi->s_mount_opt, JOURNAL_DATA); 3105 set_opt(sb, JOURNAL_DATA);
3077 else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_ORDERED) 3106 else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_ORDERED)
3078 set_opt(sbi->s_mount_opt, ORDERED_DATA); 3107 set_opt(sb, ORDERED_DATA);
3079 else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_WBACK) 3108 else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_WBACK)
3080 set_opt(sbi->s_mount_opt, WRITEBACK_DATA); 3109 set_opt(sb, WRITEBACK_DATA);
3081 3110
3082 if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_PANIC) 3111 if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_PANIC)
3083 set_opt(sbi->s_mount_opt, ERRORS_PANIC); 3112 set_opt(sb, ERRORS_PANIC);
3084 else if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_CONTINUE) 3113 else if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_CONTINUE)
3085 set_opt(sbi->s_mount_opt, ERRORS_CONT); 3114 set_opt(sb, ERRORS_CONT);
3086 else 3115 else
3087 set_opt(sbi->s_mount_opt, ERRORS_RO); 3116 set_opt(sb, ERRORS_RO);
3088 if (def_mount_opts & EXT4_DEFM_BLOCK_VALIDITY) 3117 if (def_mount_opts & EXT4_DEFM_BLOCK_VALIDITY)
3089 set_opt(sbi->s_mount_opt, BLOCK_VALIDITY); 3118 set_opt(sb, BLOCK_VALIDITY);
3090 if (def_mount_opts & EXT4_DEFM_DISCARD) 3119 if (def_mount_opts & EXT4_DEFM_DISCARD)
3091 set_opt(sbi->s_mount_opt, DISCARD); 3120 set_opt(sb, DISCARD);
3092 3121
3093 sbi->s_resuid = le16_to_cpu(es->s_def_resuid); 3122 sbi->s_resuid = le16_to_cpu(es->s_def_resuid);
3094 sbi->s_resgid = le16_to_cpu(es->s_def_resgid); 3123 sbi->s_resgid = le16_to_cpu(es->s_def_resgid);
@@ -3097,7 +3126,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3097 sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME; 3126 sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME;
3098 3127
3099 if ((def_mount_opts & EXT4_DEFM_NOBARRIER) == 0) 3128 if ((def_mount_opts & EXT4_DEFM_NOBARRIER) == 0)
3100 set_opt(sbi->s_mount_opt, BARRIER); 3129 set_opt(sb, BARRIER);
3101 3130
3102 /* 3131 /*
3103 * enable delayed allocation by default 3132 * enable delayed allocation by default
@@ -3105,7 +3134,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3105 */ 3134 */
3106 if (!IS_EXT3_SB(sb) && 3135 if (!IS_EXT3_SB(sb) &&
3107 ((def_mount_opts & EXT4_DEFM_NODELALLOC) == 0)) 3136 ((def_mount_opts & EXT4_DEFM_NODELALLOC) == 0))
3108 set_opt(sbi->s_mount_opt, DELALLOC); 3137 set_opt(sb, DELALLOC);
3109 3138
3110 if (!parse_options((char *) sbi->s_es->s_mount_opts, sb, 3139 if (!parse_options((char *) sbi->s_es->s_mount_opts, sb,
3111 &journal_devnum, &journal_ioprio, NULL, 0)) { 3140 &journal_devnum, &journal_ioprio, NULL, 0)) {
@@ -3257,13 +3286,14 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3257 * Test whether we have more sectors than will fit in sector_t, 3286 * Test whether we have more sectors than will fit in sector_t,
3258 * and whether the max offset is addressable by the page cache. 3287 * and whether the max offset is addressable by the page cache.
3259 */ 3288 */
3260 ret = generic_check_addressable(sb->s_blocksize_bits, 3289 err = generic_check_addressable(sb->s_blocksize_bits,
3261 ext4_blocks_count(es)); 3290 ext4_blocks_count(es));
3262 if (ret) { 3291 if (err) {
3263 ext4_msg(sb, KERN_ERR, "filesystem" 3292 ext4_msg(sb, KERN_ERR, "filesystem"
3264 " too large to mount safely on this system"); 3293 " too large to mount safely on this system");
3265 if (sizeof(sector_t) < 8) 3294 if (sizeof(sector_t) < 8)
3266 ext4_msg(sb, KERN_WARNING, "CONFIG_LBDAF not enabled"); 3295 ext4_msg(sb, KERN_WARNING, "CONFIG_LBDAF not enabled");
3296 ret = err;
3267 goto failed_mount; 3297 goto failed_mount;
3268 } 3298 }
3269 3299
@@ -3348,6 +3378,24 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3348 get_random_bytes(&sbi->s_next_generation, sizeof(u32)); 3378 get_random_bytes(&sbi->s_next_generation, sizeof(u32));
3349 spin_lock_init(&sbi->s_next_gen_lock); 3379 spin_lock_init(&sbi->s_next_gen_lock);
3350 3380
3381 err = percpu_counter_init(&sbi->s_freeblocks_counter,
3382 ext4_count_free_blocks(sb));
3383 if (!err) {
3384 err = percpu_counter_init(&sbi->s_freeinodes_counter,
3385 ext4_count_free_inodes(sb));
3386 }
3387 if (!err) {
3388 err = percpu_counter_init(&sbi->s_dirs_counter,
3389 ext4_count_dirs(sb));
3390 }
3391 if (!err) {
3392 err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0);
3393 }
3394 if (err) {
3395 ext4_msg(sb, KERN_ERR, "insufficient memory");
3396 goto failed_mount3;
3397 }
3398
3351 sbi->s_stripe = ext4_get_stripe_size(sbi); 3399 sbi->s_stripe = ext4_get_stripe_size(sbi);
3352 sbi->s_max_writeback_mb_bump = 128; 3400 sbi->s_max_writeback_mb_bump = 128;
3353 3401
@@ -3389,8 +3437,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3389 "suppressed and not mounted read-only"); 3437 "suppressed and not mounted read-only");
3390 goto failed_mount_wq; 3438 goto failed_mount_wq;
3391 } else { 3439 } else {
3392 clear_opt(sbi->s_mount_opt, DATA_FLAGS); 3440 clear_opt(sb, DATA_FLAGS);
3393 set_opt(sbi->s_mount_opt, WRITEBACK_DATA); 3441 set_opt(sb, WRITEBACK_DATA);
3394 sbi->s_journal = NULL; 3442 sbi->s_journal = NULL;
3395 needs_recovery = 0; 3443 needs_recovery = 0;
3396 goto no_journal; 3444 goto no_journal;
@@ -3428,9 +3476,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3428 */ 3476 */
3429 if (jbd2_journal_check_available_features 3477 if (jbd2_journal_check_available_features
3430 (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) 3478 (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE))
3431 set_opt(sbi->s_mount_opt, ORDERED_DATA); 3479 set_opt(sb, ORDERED_DATA);
3432 else 3480 else
3433 set_opt(sbi->s_mount_opt, JOURNAL_DATA); 3481 set_opt(sb, JOURNAL_DATA);
3434 break; 3482 break;
3435 3483
3436 case EXT4_MOUNT_ORDERED_DATA: 3484 case EXT4_MOUNT_ORDERED_DATA:
@@ -3446,22 +3494,19 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3446 } 3494 }
3447 set_task_ioprio(sbi->s_journal->j_task, journal_ioprio); 3495 set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);
3448 3496
3449no_journal: 3497 /*
3450 err = percpu_counter_init(&sbi->s_freeblocks_counter, 3498 * The journal may have updated the bg summary counts, so we
3451 ext4_count_free_blocks(sb)); 3499 * need to update the global counters.
3452 if (!err) 3500 */
3453 err = percpu_counter_init(&sbi->s_freeinodes_counter, 3501 percpu_counter_set(&sbi->s_freeblocks_counter,
3454 ext4_count_free_inodes(sb)); 3502 ext4_count_free_blocks(sb));
3455 if (!err) 3503 percpu_counter_set(&sbi->s_freeinodes_counter,
3456 err = percpu_counter_init(&sbi->s_dirs_counter, 3504 ext4_count_free_inodes(sb));
3457 ext4_count_dirs(sb)); 3505 percpu_counter_set(&sbi->s_dirs_counter,
3458 if (!err) 3506 ext4_count_dirs(sb));
3459 err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0); 3507 percpu_counter_set(&sbi->s_dirtyblocks_counter, 0);
3460 if (err) {
3461 ext4_msg(sb, KERN_ERR, "insufficient memory");
3462 goto failed_mount_wq;
3463 }
3464 3508
3509no_journal:
3465 EXT4_SB(sb)->dio_unwritten_wq = create_workqueue("ext4-dio-unwritten"); 3510 EXT4_SB(sb)->dio_unwritten_wq = create_workqueue("ext4-dio-unwritten");
3466 if (!EXT4_SB(sb)->dio_unwritten_wq) { 3511 if (!EXT4_SB(sb)->dio_unwritten_wq) {
3467 printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n"); 3512 printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n");
@@ -3523,18 +3568,18 @@ no_journal:
3523 (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)) { 3568 (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)) {
3524 ext4_msg(sb, KERN_WARNING, "Ignoring delalloc option - " 3569 ext4_msg(sb, KERN_WARNING, "Ignoring delalloc option - "
3525 "requested data journaling mode"); 3570 "requested data journaling mode");
3526 clear_opt(sbi->s_mount_opt, DELALLOC); 3571 clear_opt(sb, DELALLOC);
3527 } 3572 }
3528 if (test_opt(sb, DIOREAD_NOLOCK)) { 3573 if (test_opt(sb, DIOREAD_NOLOCK)) {
3529 if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { 3574 if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
3530 ext4_msg(sb, KERN_WARNING, "Ignoring dioread_nolock " 3575 ext4_msg(sb, KERN_WARNING, "Ignoring dioread_nolock "
3531 "option - requested data journaling mode"); 3576 "option - requested data journaling mode");
3532 clear_opt(sbi->s_mount_opt, DIOREAD_NOLOCK); 3577 clear_opt(sb, DIOREAD_NOLOCK);
3533 } 3578 }
3534 if (sb->s_blocksize < PAGE_SIZE) { 3579 if (sb->s_blocksize < PAGE_SIZE) {
3535 ext4_msg(sb, KERN_WARNING, "Ignoring dioread_nolock " 3580 ext4_msg(sb, KERN_WARNING, "Ignoring dioread_nolock "
3536 "option - block size is too small"); 3581 "option - block size is too small");
3537 clear_opt(sbi->s_mount_opt, DIOREAD_NOLOCK); 3582 clear_opt(sb, DIOREAD_NOLOCK);
3538 } 3583 }
3539 } 3584 }
3540 3585
@@ -3611,10 +3656,6 @@ failed_mount_wq:
3611 jbd2_journal_destroy(sbi->s_journal); 3656 jbd2_journal_destroy(sbi->s_journal);
3612 sbi->s_journal = NULL; 3657 sbi->s_journal = NULL;
3613 } 3658 }
3614 percpu_counter_destroy(&sbi->s_freeblocks_counter);
3615 percpu_counter_destroy(&sbi->s_freeinodes_counter);
3616 percpu_counter_destroy(&sbi->s_dirs_counter);
3617 percpu_counter_destroy(&sbi->s_dirtyblocks_counter);
3618failed_mount3: 3659failed_mount3:
3619 if (sbi->s_flex_groups) { 3660 if (sbi->s_flex_groups) {
3620 if (is_vmalloc_addr(sbi->s_flex_groups)) 3661 if (is_vmalloc_addr(sbi->s_flex_groups))
@@ -3622,6 +3663,10 @@ failed_mount3:
3622 else 3663 else
3623 kfree(sbi->s_flex_groups); 3664 kfree(sbi->s_flex_groups);
3624 } 3665 }
3666 percpu_counter_destroy(&sbi->s_freeblocks_counter);
3667 percpu_counter_destroy(&sbi->s_freeinodes_counter);
3668 percpu_counter_destroy(&sbi->s_dirs_counter);
3669 percpu_counter_destroy(&sbi->s_dirtyblocks_counter);
3625failed_mount2: 3670failed_mount2:
3626 for (i = 0; i < db_count; i++) 3671 for (i = 0; i < db_count; i++)
3627 brelse(sbi->s_group_desc[i]); 3672 brelse(sbi->s_group_desc[i]);
@@ -3732,13 +3777,6 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb,
3732 if (bdev == NULL) 3777 if (bdev == NULL)
3733 return NULL; 3778 return NULL;
3734 3779
3735 if (bd_claim(bdev, sb)) {
3736 ext4_msg(sb, KERN_ERR,
3737 "failed to claim external journal device");
3738 blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
3739 return NULL;
3740 }
3741
3742 blocksize = sb->s_blocksize; 3780 blocksize = sb->s_blocksize;
3743 hblock = bdev_logical_block_size(bdev); 3781 hblock = bdev_logical_block_size(bdev);
3744 if (blocksize < hblock) { 3782 if (blocksize < hblock) {
@@ -3949,13 +3987,11 @@ static int ext4_commit_super(struct super_block *sb, int sync)
3949 else 3987 else
3950 es->s_kbytes_written = 3988 es->s_kbytes_written =
3951 cpu_to_le64(EXT4_SB(sb)->s_kbytes_written); 3989 cpu_to_le64(EXT4_SB(sb)->s_kbytes_written);
3952 if (percpu_counter_initialized(&EXT4_SB(sb)->s_freeblocks_counter)) 3990 ext4_free_blocks_count_set(es, percpu_counter_sum_positive(
3953 ext4_free_blocks_count_set(es, percpu_counter_sum_positive( 3991 &EXT4_SB(sb)->s_freeblocks_counter));
3954 &EXT4_SB(sb)->s_freeblocks_counter)); 3992 es->s_free_inodes_count =
3955 if (percpu_counter_initialized(&EXT4_SB(sb)->s_freeinodes_counter)) 3993 cpu_to_le32(percpu_counter_sum_positive(
3956 es->s_free_inodes_count = 3994 &EXT4_SB(sb)->s_freeinodes_counter));
3957 cpu_to_le32(percpu_counter_sum_positive(
3958 &EXT4_SB(sb)->s_freeinodes_counter));
3959 sb->s_dirt = 0; 3995 sb->s_dirt = 0;
3960 BUFFER_TRACE(sbh, "marking dirty"); 3996 BUFFER_TRACE(sbh, "marking dirty");
3961 mark_buffer_dirty(sbh); 3997 mark_buffer_dirty(sbh);
@@ -4135,6 +4171,22 @@ static int ext4_unfreeze(struct super_block *sb)
4135 return 0; 4171 return 0;
4136} 4172}
4137 4173
4174/*
4175 * Structure to save mount options for ext4_remount's benefit
4176 */
4177struct ext4_mount_options {
4178 unsigned long s_mount_opt;
4179 unsigned long s_mount_opt2;
4180 uid_t s_resuid;
4181 gid_t s_resgid;
4182 unsigned long s_commit_interval;
4183 u32 s_min_batch_time, s_max_batch_time;
4184#ifdef CONFIG_QUOTA
4185 int s_jquota_fmt;
4186 char *s_qf_names[MAXQUOTAS];
4187#endif
4188};
4189
4138static int ext4_remount(struct super_block *sb, int *flags, char *data) 4190static int ext4_remount(struct super_block *sb, int *flags, char *data)
4139{ 4191{
4140 struct ext4_super_block *es; 4192 struct ext4_super_block *es;
@@ -4155,6 +4207,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
4155 lock_super(sb); 4207 lock_super(sb);
4156 old_sb_flags = sb->s_flags; 4208 old_sb_flags = sb->s_flags;
4157 old_opts.s_mount_opt = sbi->s_mount_opt; 4209 old_opts.s_mount_opt = sbi->s_mount_opt;
4210 old_opts.s_mount_opt2 = sbi->s_mount_opt2;
4158 old_opts.s_resuid = sbi->s_resuid; 4211 old_opts.s_resuid = sbi->s_resuid;
4159 old_opts.s_resgid = sbi->s_resgid; 4212 old_opts.s_resgid = sbi->s_resgid;
4160 old_opts.s_commit_interval = sbi->s_commit_interval; 4213 old_opts.s_commit_interval = sbi->s_commit_interval;
@@ -4308,6 +4361,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
4308restore_opts: 4361restore_opts:
4309 sb->s_flags = old_sb_flags; 4362 sb->s_flags = old_sb_flags;
4310 sbi->s_mount_opt = old_opts.s_mount_opt; 4363 sbi->s_mount_opt = old_opts.s_mount_opt;
4364 sbi->s_mount_opt2 = old_opts.s_mount_opt2;
4311 sbi->s_resuid = old_opts.s_resuid; 4365 sbi->s_resuid = old_opts.s_resuid;
4312 sbi->s_resgid = old_opts.s_resgid; 4366 sbi->s_resgid = old_opts.s_resgid;
4313 sbi->s_commit_interval = old_opts.s_commit_interval; 4367 sbi->s_commit_interval = old_opts.s_commit_interval;
@@ -4504,27 +4558,20 @@ static int ext4_quota_on_mount(struct super_block *sb, int type)
4504 * Standard function to be called on quota_on 4558 * Standard function to be called on quota_on
4505 */ 4559 */
4506static int ext4_quota_on(struct super_block *sb, int type, int format_id, 4560static int ext4_quota_on(struct super_block *sb, int type, int format_id,
4507 char *name) 4561 struct path *path)
4508{ 4562{
4509 int err; 4563 int err;
4510 struct path path;
4511 4564
4512 if (!test_opt(sb, QUOTA)) 4565 if (!test_opt(sb, QUOTA))
4513 return -EINVAL; 4566 return -EINVAL;
4514 4567
4515 err = kern_path(name, LOOKUP_FOLLOW, &path);
4516 if (err)
4517 return err;
4518
4519 /* Quotafile not on the same filesystem? */ 4568 /* Quotafile not on the same filesystem? */
4520 if (path.mnt->mnt_sb != sb) { 4569 if (path->mnt->mnt_sb != sb)
4521 path_put(&path);
4522 return -EXDEV; 4570 return -EXDEV;
4523 }
4524 /* Journaling quota? */ 4571 /* Journaling quota? */
4525 if (EXT4_SB(sb)->s_qf_names[type]) { 4572 if (EXT4_SB(sb)->s_qf_names[type]) {
4526 /* Quotafile not in fs root? */ 4573 /* Quotafile not in fs root? */
4527 if (path.dentry->d_parent != sb->s_root) 4574 if (path->dentry->d_parent != sb->s_root)
4528 ext4_msg(sb, KERN_WARNING, 4575 ext4_msg(sb, KERN_WARNING,
4529 "Quota file not on filesystem root. " 4576 "Quota file not on filesystem root. "
4530 "Journaled quota will not work"); 4577 "Journaled quota will not work");
@@ -4535,7 +4582,7 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id,
4535 * all updates to the file when we bypass pagecache... 4582 * all updates to the file when we bypass pagecache...
4536 */ 4583 */
4537 if (EXT4_SB(sb)->s_journal && 4584 if (EXT4_SB(sb)->s_journal &&
4538 ext4_should_journal_data(path.dentry->d_inode)) { 4585 ext4_should_journal_data(path->dentry->d_inode)) {
4539 /* 4586 /*
4540 * We don't need to lock updates but journal_flush() could 4587 * We don't need to lock updates but journal_flush() could
4541 * otherwise be livelocked... 4588 * otherwise be livelocked...
@@ -4543,25 +4590,19 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id,
4543 jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); 4590 jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
4544 err = jbd2_journal_flush(EXT4_SB(sb)->s_journal); 4591 err = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
4545 jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); 4592 jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
4546 if (err) { 4593 if (err)
4547 path_put(&path);
4548 return err; 4594 return err;
4549 }
4550 } 4595 }
4551 4596
4552 err = dquot_quota_on_path(sb, type, format_id, &path); 4597 return dquot_quota_on(sb, type, format_id, path);
4553 path_put(&path);
4554 return err;
4555} 4598}
4556 4599
4557static int ext4_quota_off(struct super_block *sb, int type) 4600static int ext4_quota_off(struct super_block *sb, int type)
4558{ 4601{
4559 /* Force all delayed allocation blocks to be allocated */ 4602 /* Force all delayed allocation blocks to be allocated.
4560 if (test_opt(sb, DELALLOC)) { 4603 * Caller already holds s_umount sem */
4561 down_read(&sb->s_umount); 4604 if (test_opt(sb, DELALLOC))
4562 sync_filesystem(sb); 4605 sync_filesystem(sb);
4563 up_read(&sb->s_umount);
4564 }
4565 4606
4566 return dquot_quota_off(sb, type); 4607 return dquot_quota_off(sb, type);
4567} 4608}
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index fa4b899da4b..fc32176eee3 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -427,23 +427,23 @@ cleanup:
427static int 427static int
428ext4_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) 428ext4_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size)
429{ 429{
430 int i_error, b_error; 430 int ret, ret2;
431 431
432 down_read(&EXT4_I(dentry->d_inode)->xattr_sem); 432 down_read(&EXT4_I(dentry->d_inode)->xattr_sem);
433 i_error = ext4_xattr_ibody_list(dentry, buffer, buffer_size); 433 ret = ret2 = ext4_xattr_ibody_list(dentry, buffer, buffer_size);
434 if (i_error < 0) { 434 if (ret < 0)
435 b_error = 0; 435 goto errout;
436 } else { 436 if (buffer) {
437 if (buffer) { 437 buffer += ret;
438 buffer += i_error; 438 buffer_size -= ret;
439 buffer_size -= i_error;
440 }
441 b_error = ext4_xattr_block_list(dentry, buffer, buffer_size);
442 if (b_error < 0)
443 i_error = 0;
444 } 439 }
440 ret = ext4_xattr_block_list(dentry, buffer, buffer_size);
441 if (ret < 0)
442 goto errout;
443 ret += ret2;
444errout:
445 up_read(&EXT4_I(dentry->d_inode)->xattr_sem); 445 up_read(&EXT4_I(dentry->d_inode)->xattr_sem);
446 return i_error + b_error; 446 return ret;
447} 447}
448 448
449/* 449/*
@@ -947,7 +947,7 @@ ext4_xattr_ibody_set(handle_t *handle, struct inode *inode,
947/* 947/*
948 * ext4_xattr_set_handle() 948 * ext4_xattr_set_handle()
949 * 949 *
950 * Create, replace or remove an extended attribute for this inode. Buffer 950 * Create, replace or remove an extended attribute for this inode. Value
951 * is NULL to remove an existing extended attribute, and non-NULL to 951 * is NULL to remove an existing extended attribute, and non-NULL to
952 * either replace an existing extended attribute, or create a new extended 952 * either replace an existing extended attribute, or create a new extended
953 * attribute. The flags XATTR_REPLACE and XATTR_CREATE 953 * attribute. The flags XATTR_REPLACE and XATTR_CREATE
diff --git a/fs/fat/fat.h b/fs/fat/fat.h
index d75a77f85c2..f50408901f7 100644
--- a/fs/fat/fat.h
+++ b/fs/fat/fat.h
@@ -319,7 +319,8 @@ extern struct inode *fat_build_inode(struct super_block *sb,
319 struct msdos_dir_entry *de, loff_t i_pos); 319 struct msdos_dir_entry *de, loff_t i_pos);
320extern int fat_sync_inode(struct inode *inode); 320extern int fat_sync_inode(struct inode *inode);
321extern int fat_fill_super(struct super_block *sb, void *data, int silent, 321extern int fat_fill_super(struct super_block *sb, void *data, int silent,
322 const struct inode_operations *fs_dir_inode_ops, int isvfat); 322 const struct inode_operations *fs_dir_inode_ops,
323 int isvfat, void (*setup)(struct super_block *));
323 324
324extern int fat_flush_inodes(struct super_block *sb, struct inode *i1, 325extern int fat_flush_inodes(struct super_block *sb, struct inode *i1,
325 struct inode *i2); 326 struct inode *i2);
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index ad6998a92c3..86753fe10bd 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -514,11 +514,18 @@ static struct inode *fat_alloc_inode(struct super_block *sb)
514 return &ei->vfs_inode; 514 return &ei->vfs_inode;
515} 515}
516 516
517static void fat_destroy_inode(struct inode *inode) 517static void fat_i_callback(struct rcu_head *head)
518{ 518{
519 struct inode *inode = container_of(head, struct inode, i_rcu);
520 INIT_LIST_HEAD(&inode->i_dentry);
519 kmem_cache_free(fat_inode_cachep, MSDOS_I(inode)); 521 kmem_cache_free(fat_inode_cachep, MSDOS_I(inode));
520} 522}
521 523
524static void fat_destroy_inode(struct inode *inode)
525{
526 call_rcu(&inode->i_rcu, fat_i_callback);
527}
528
522static void init_once(void *foo) 529static void init_once(void *foo)
523{ 530{
524 struct msdos_inode_info *ei = (struct msdos_inode_info *)foo; 531 struct msdos_inode_info *ei = (struct msdos_inode_info *)foo;
@@ -696,7 +703,6 @@ static struct dentry *fat_fh_to_dentry(struct super_block *sb,
696 struct fid *fid, int fh_len, int fh_type) 703 struct fid *fid, int fh_len, int fh_type)
697{ 704{
698 struct inode *inode = NULL; 705 struct inode *inode = NULL;
699 struct dentry *result;
700 u32 *fh = fid->raw; 706 u32 *fh = fid->raw;
701 707
702 if (fh_len < 5 || fh_type != 3) 708 if (fh_len < 5 || fh_type != 3)
@@ -741,10 +747,7 @@ static struct dentry *fat_fh_to_dentry(struct super_block *sb,
741 * the fat_iget lookup again. If that fails, then we are totally out 747 * the fat_iget lookup again. If that fails, then we are totally out
742 * of luck. But all that is for another day 748 * of luck. But all that is for another day
743 */ 749 */
744 result = d_obtain_alias(inode); 750 return d_obtain_alias(inode);
745 if (!IS_ERR(result))
746 result->d_op = sb->s_root->d_op;
747 return result;
748} 751}
749 752
750static int 753static int
@@ -792,8 +795,6 @@ static struct dentry *fat_get_parent(struct dentry *child)
792 brelse(bh); 795 brelse(bh);
793 796
794 parent = d_obtain_alias(inode); 797 parent = d_obtain_alias(inode);
795 if (!IS_ERR(parent))
796 parent->d_op = sb->s_root->d_op;
797out: 798out:
798 unlock_super(sb); 799 unlock_super(sb);
799 800
@@ -1237,7 +1238,8 @@ static int fat_read_root(struct inode *inode)
1237 * Read the super block of an MS-DOS FS. 1238 * Read the super block of an MS-DOS FS.
1238 */ 1239 */
1239int fat_fill_super(struct super_block *sb, void *data, int silent, 1240int fat_fill_super(struct super_block *sb, void *data, int silent,
1240 const struct inode_operations *fs_dir_inode_ops, int isvfat) 1241 const struct inode_operations *fs_dir_inode_ops, int isvfat,
1242 void (*setup)(struct super_block *))
1241{ 1243{
1242 struct inode *root_inode = NULL, *fat_inode = NULL; 1244 struct inode *root_inode = NULL, *fat_inode = NULL;
1243 struct buffer_head *bh; 1245 struct buffer_head *bh;
@@ -1273,6 +1275,8 @@ int fat_fill_super(struct super_block *sb, void *data, int silent,
1273 if (error) 1275 if (error)
1274 goto out_fail; 1276 goto out_fail;
1275 1277
1278 setup(sb); /* flavour-specific stuff that needs options */
1279
1276 error = -EIO; 1280 error = -EIO;
1277 sb_min_blocksize(sb, 512); 1281 sb_min_blocksize(sb, 512);
1278 bh = sb_bread(sb, 0); 1282 bh = sb_bread(sb, 0);
diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c
index 3345aabd1dd..711499040eb 100644
--- a/fs/fat/namei_msdos.c
+++ b/fs/fat/namei_msdos.c
@@ -148,7 +148,8 @@ static int msdos_find(struct inode *dir, const unsigned char *name, int len,
148 * that the existing dentry can be used. The msdos fs routines will 148 * that the existing dentry can be used. The msdos fs routines will
149 * return ENOENT or EINVAL as appropriate. 149 * return ENOENT or EINVAL as appropriate.
150 */ 150 */
151static int msdos_hash(struct dentry *dentry, struct qstr *qstr) 151static int msdos_hash(const struct dentry *dentry, const struct inode *inode,
152 struct qstr *qstr)
152{ 153{
153 struct fat_mount_options *options = &MSDOS_SB(dentry->d_sb)->options; 154 struct fat_mount_options *options = &MSDOS_SB(dentry->d_sb)->options;
154 unsigned char msdos_name[MSDOS_NAME]; 155 unsigned char msdos_name[MSDOS_NAME];
@@ -164,16 +165,18 @@ static int msdos_hash(struct dentry *dentry, struct qstr *qstr)
164 * Compare two msdos names. If either of the names are invalid, 165 * Compare two msdos names. If either of the names are invalid,
165 * we fall back to doing the standard name comparison. 166 * we fall back to doing the standard name comparison.
166 */ 167 */
167static int msdos_cmp(struct dentry *dentry, struct qstr *a, struct qstr *b) 168static int msdos_cmp(const struct dentry *parent, const struct inode *pinode,
169 const struct dentry *dentry, const struct inode *inode,
170 unsigned int len, const char *str, const struct qstr *name)
168{ 171{
169 struct fat_mount_options *options = &MSDOS_SB(dentry->d_sb)->options; 172 struct fat_mount_options *options = &MSDOS_SB(parent->d_sb)->options;
170 unsigned char a_msdos_name[MSDOS_NAME], b_msdos_name[MSDOS_NAME]; 173 unsigned char a_msdos_name[MSDOS_NAME], b_msdos_name[MSDOS_NAME];
171 int error; 174 int error;
172 175
173 error = msdos_format_name(a->name, a->len, a_msdos_name, options); 176 error = msdos_format_name(name->name, name->len, a_msdos_name, options);
174 if (error) 177 if (error)
175 goto old_compare; 178 goto old_compare;
176 error = msdos_format_name(b->name, b->len, b_msdos_name, options); 179 error = msdos_format_name(str, len, b_msdos_name, options);
177 if (error) 180 if (error)
178 goto old_compare; 181 goto old_compare;
179 error = memcmp(a_msdos_name, b_msdos_name, MSDOS_NAME); 182 error = memcmp(a_msdos_name, b_msdos_name, MSDOS_NAME);
@@ -182,8 +185,8 @@ out:
182 185
183old_compare: 186old_compare:
184 error = 1; 187 error = 1;
185 if (a->len == b->len) 188 if (name->len == len)
186 error = memcmp(a->name, b->name, a->len); 189 error = memcmp(name->name, str, len);
187 goto out; 190 goto out;
188} 191}
189 192
@@ -224,11 +227,7 @@ static struct dentry *msdos_lookup(struct inode *dir, struct dentry *dentry,
224 } 227 }
225out: 228out:
226 unlock_super(sb); 229 unlock_super(sb);
227 dentry->d_op = &msdos_dentry_operations; 230 return d_splice_alias(inode, dentry);
228 dentry = d_splice_alias(inode, dentry);
229 if (dentry)
230 dentry->d_op = &msdos_dentry_operations;
231 return dentry;
232 231
233error: 232error:
234 unlock_super(sb); 233 unlock_super(sb);
@@ -658,21 +657,16 @@ static const struct inode_operations msdos_dir_inode_operations = {
658 .getattr = fat_getattr, 657 .getattr = fat_getattr,
659}; 658};
660 659
661static int msdos_fill_super(struct super_block *sb, void *data, int silent) 660static void setup(struct super_block *sb)
662{ 661{
663 int res; 662 sb->s_d_op = &msdos_dentry_operations;
664
665 lock_super(sb);
666 res = fat_fill_super(sb, data, silent, &msdos_dir_inode_operations, 0);
667 if (res) {
668 unlock_super(sb);
669 return res;
670 }
671
672 sb->s_flags |= MS_NOATIME; 663 sb->s_flags |= MS_NOATIME;
673 sb->s_root->d_op = &msdos_dentry_operations; 664}
674 unlock_super(sb); 665
675 return 0; 666static int msdos_fill_super(struct super_block *sb, void *data, int silent)
667{
668 return fat_fill_super(sb, data, silent, &msdos_dir_inode_operations,
669 0, setup);
676} 670}
677 671
678static struct dentry *msdos_mount(struct file_system_type *fs_type, 672static struct dentry *msdos_mount(struct file_system_type *fs_type,
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index b936703b892..f88f752babd 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -43,6 +43,9 @@ static int vfat_revalidate_shortname(struct dentry *dentry)
43 43
44static int vfat_revalidate(struct dentry *dentry, struct nameidata *nd) 44static int vfat_revalidate(struct dentry *dentry, struct nameidata *nd)
45{ 45{
46 if (nd->flags & LOOKUP_RCU)
47 return -ECHILD;
48
46 /* This is not negative dentry. Always valid. */ 49 /* This is not negative dentry. Always valid. */
47 if (dentry->d_inode) 50 if (dentry->d_inode)
48 return 1; 51 return 1;
@@ -51,6 +54,9 @@ static int vfat_revalidate(struct dentry *dentry, struct nameidata *nd)
51 54
52static int vfat_revalidate_ci(struct dentry *dentry, struct nameidata *nd) 55static int vfat_revalidate_ci(struct dentry *dentry, struct nameidata *nd)
53{ 56{
57 if (nd->flags & LOOKUP_RCU)
58 return -ECHILD;
59
54 /* 60 /*
55 * This is not negative dentry. Always valid. 61 * This is not negative dentry. Always valid.
56 * 62 *
@@ -85,22 +91,26 @@ static int vfat_revalidate_ci(struct dentry *dentry, struct nameidata *nd)
85} 91}
86 92
87/* returns the length of a struct qstr, ignoring trailing dots */ 93/* returns the length of a struct qstr, ignoring trailing dots */
88static unsigned int vfat_striptail_len(struct qstr *qstr) 94static unsigned int __vfat_striptail_len(unsigned int len, const char *name)
89{ 95{
90 unsigned int len = qstr->len; 96 while (len && name[len - 1] == '.')
91
92 while (len && qstr->name[len - 1] == '.')
93 len--; 97 len--;
94 return len; 98 return len;
95} 99}
96 100
101static unsigned int vfat_striptail_len(const struct qstr *qstr)
102{
103 return __vfat_striptail_len(qstr->len, qstr->name);
104}
105
97/* 106/*
98 * Compute the hash for the vfat name corresponding to the dentry. 107 * Compute the hash for the vfat name corresponding to the dentry.
99 * Note: if the name is invalid, we leave the hash code unchanged so 108 * Note: if the name is invalid, we leave the hash code unchanged so
100 * that the existing dentry can be used. The vfat fs routines will 109 * that the existing dentry can be used. The vfat fs routines will
101 * return ENOENT or EINVAL as appropriate. 110 * return ENOENT or EINVAL as appropriate.
102 */ 111 */
103static int vfat_hash(struct dentry *dentry, struct qstr *qstr) 112static int vfat_hash(const struct dentry *dentry, const struct inode *inode,
113 struct qstr *qstr)
104{ 114{
105 qstr->hash = full_name_hash(qstr->name, vfat_striptail_len(qstr)); 115 qstr->hash = full_name_hash(qstr->name, vfat_striptail_len(qstr));
106 return 0; 116 return 0;
@@ -112,9 +122,10 @@ static int vfat_hash(struct dentry *dentry, struct qstr *qstr)
112 * that the existing dentry can be used. The vfat fs routines will 122 * that the existing dentry can be used. The vfat fs routines will
113 * return ENOENT or EINVAL as appropriate. 123 * return ENOENT or EINVAL as appropriate.
114 */ 124 */
115static int vfat_hashi(struct dentry *dentry, struct qstr *qstr) 125static int vfat_hashi(const struct dentry *dentry, const struct inode *inode,
126 struct qstr *qstr)
116{ 127{
117 struct nls_table *t = MSDOS_SB(dentry->d_inode->i_sb)->nls_io; 128 struct nls_table *t = MSDOS_SB(dentry->d_sb)->nls_io;
118 const unsigned char *name; 129 const unsigned char *name;
119 unsigned int len; 130 unsigned int len;
120 unsigned long hash; 131 unsigned long hash;
@@ -133,16 +144,18 @@ static int vfat_hashi(struct dentry *dentry, struct qstr *qstr)
133/* 144/*
134 * Case insensitive compare of two vfat names. 145 * Case insensitive compare of two vfat names.
135 */ 146 */
136static int vfat_cmpi(struct dentry *dentry, struct qstr *a, struct qstr *b) 147static int vfat_cmpi(const struct dentry *parent, const struct inode *pinode,
148 const struct dentry *dentry, const struct inode *inode,
149 unsigned int len, const char *str, const struct qstr *name)
137{ 150{
138 struct nls_table *t = MSDOS_SB(dentry->d_inode->i_sb)->nls_io; 151 struct nls_table *t = MSDOS_SB(parent->d_sb)->nls_io;
139 unsigned int alen, blen; 152 unsigned int alen, blen;
140 153
141 /* A filename cannot end in '.' or we treat it like it has none */ 154 /* A filename cannot end in '.' or we treat it like it has none */
142 alen = vfat_striptail_len(a); 155 alen = vfat_striptail_len(name);
143 blen = vfat_striptail_len(b); 156 blen = __vfat_striptail_len(len, str);
144 if (alen == blen) { 157 if (alen == blen) {
145 if (nls_strnicmp(t, a->name, b->name, alen) == 0) 158 if (nls_strnicmp(t, name->name, str, alen) == 0)
146 return 0; 159 return 0;
147 } 160 }
148 return 1; 161 return 1;
@@ -151,15 +164,17 @@ static int vfat_cmpi(struct dentry *dentry, struct qstr *a, struct qstr *b)
151/* 164/*
152 * Case sensitive compare of two vfat names. 165 * Case sensitive compare of two vfat names.
153 */ 166 */
154static int vfat_cmp(struct dentry *dentry, struct qstr *a, struct qstr *b) 167static int vfat_cmp(const struct dentry *parent, const struct inode *pinode,
168 const struct dentry *dentry, const struct inode *inode,
169 unsigned int len, const char *str, const struct qstr *name)
155{ 170{
156 unsigned int alen, blen; 171 unsigned int alen, blen;
157 172
158 /* A filename cannot end in '.' or we treat it like it has none */ 173 /* A filename cannot end in '.' or we treat it like it has none */
159 alen = vfat_striptail_len(a); 174 alen = vfat_striptail_len(name);
160 blen = vfat_striptail_len(b); 175 blen = __vfat_striptail_len(len, str);
161 if (alen == blen) { 176 if (alen == blen) {
162 if (strncmp(a->name, b->name, alen) == 0) 177 if (strncmp(name->name, str, alen) == 0)
163 return 0; 178 return 0;
164 } 179 }
165 return 1; 180 return 1;
@@ -757,13 +772,10 @@ static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry,
757 772
758out: 773out:
759 unlock_super(sb); 774 unlock_super(sb);
760 dentry->d_op = sb->s_root->d_op;
761 dentry->d_time = dentry->d_parent->d_inode->i_version; 775 dentry->d_time = dentry->d_parent->d_inode->i_version;
762 dentry = d_splice_alias(inode, dentry); 776 dentry = d_splice_alias(inode, dentry);
763 if (dentry) { 777 if (dentry)
764 dentry->d_op = sb->s_root->d_op;
765 dentry->d_time = dentry->d_parent->d_inode->i_version; 778 dentry->d_time = dentry->d_parent->d_inode->i_version;
766 }
767 return dentry; 779 return dentry;
768 780
769error: 781error:
@@ -1051,24 +1063,18 @@ static const struct inode_operations vfat_dir_inode_operations = {
1051 .getattr = fat_getattr, 1063 .getattr = fat_getattr,
1052}; 1064};
1053 1065
1054static int vfat_fill_super(struct super_block *sb, void *data, int silent) 1066static void setup(struct super_block *sb)
1055{ 1067{
1056 int res;
1057
1058 lock_super(sb);
1059 res = fat_fill_super(sb, data, silent, &vfat_dir_inode_operations, 1);
1060 if (res) {
1061 unlock_super(sb);
1062 return res;
1063 }
1064
1065 if (MSDOS_SB(sb)->options.name_check != 's') 1068 if (MSDOS_SB(sb)->options.name_check != 's')
1066 sb->s_root->d_op = &vfat_ci_dentry_ops; 1069 sb->s_d_op = &vfat_ci_dentry_ops;
1067 else 1070 else
1068 sb->s_root->d_op = &vfat_dentry_ops; 1071 sb->s_d_op = &vfat_dentry_ops;
1072}
1069 1073
1070 unlock_super(sb); 1074static int vfat_fill_super(struct super_block *sb, void *data, int silent)
1071 return 0; 1075{
1076 return fat_fill_super(sb, data, silent, &vfat_dir_inode_operations,
1077 1, setup);
1072} 1078}
1073 1079
1074static struct dentry *vfat_mount(struct file_system_type *fs_type, 1080static struct dentry *vfat_mount(struct file_system_type *fs_type,
diff --git a/fs/file_table.c b/fs/file_table.c
index c3dee381f1b..c3e89adf53c 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -311,7 +311,7 @@ struct file *fget_light(unsigned int fd, int *fput_needed)
311 struct files_struct *files = current->files; 311 struct files_struct *files = current->files;
312 312
313 *fput_needed = 0; 313 *fput_needed = 0;
314 if (likely((atomic_read(&files->count) == 1))) { 314 if (atomic_read(&files->count) == 1) {
315 file = fcheck_files(files, fd); 315 file = fcheck_files(files, fd);
316 } else { 316 } else {
317 rcu_read_lock(); 317 rcu_read_lock();
diff --git a/fs/filesystems.c b/fs/filesystems.c
index 68ba492d8ee..751d6b255a1 100644
--- a/fs/filesystems.c
+++ b/fs/filesystems.c
@@ -115,6 +115,9 @@ int unregister_filesystem(struct file_system_type * fs)
115 tmp = &(*tmp)->next; 115 tmp = &(*tmp)->next;
116 } 116 }
117 write_unlock(&file_systems_lock); 117 write_unlock(&file_systems_lock);
118
119 synchronize_rcu();
120
118 return -EINVAL; 121 return -EINVAL;
119} 122}
120 123
diff --git a/fs/freevxfs/vxfs_inode.c b/fs/freevxfs/vxfs_inode.c
index 8c04eac5079..2ba6719ac61 100644
--- a/fs/freevxfs/vxfs_inode.c
+++ b/fs/freevxfs/vxfs_inode.c
@@ -337,6 +337,13 @@ vxfs_iget(struct super_block *sbp, ino_t ino)
337 return ip; 337 return ip;
338} 338}
339 339
340static void vxfs_i_callback(struct rcu_head *head)
341{
342 struct inode *inode = container_of(head, struct inode, i_rcu);
343 INIT_LIST_HEAD(&inode->i_dentry);
344 kmem_cache_free(vxfs_inode_cachep, inode->i_private);
345}
346
340/** 347/**
341 * vxfs_evict_inode - remove inode from main memory 348 * vxfs_evict_inode - remove inode from main memory
342 * @ip: inode to discard. 349 * @ip: inode to discard.
@@ -350,5 +357,5 @@ vxfs_evict_inode(struct inode *ip)
350{ 357{
351 truncate_inode_pages(&ip->i_data, 0); 358 truncate_inode_pages(&ip->i_data, 0);
352 end_writeback(ip); 359 end_writeback(ip);
353 kmem_cache_free(vxfs_inode_cachep, ip->i_private); 360 call_rcu(&ip->i_rcu, vxfs_i_callback);
354} 361}
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 3d06ccc953a..59c6e495678 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -84,13 +84,9 @@ static inline struct inode *wb_inode(struct list_head *head)
84 return list_entry(head, struct inode, i_wb_list); 84 return list_entry(head, struct inode, i_wb_list);
85} 85}
86 86
87static void bdi_queue_work(struct backing_dev_info *bdi, 87/* Wakeup flusher thread or forker thread to fork it. Requires bdi->wb_lock. */
88 struct wb_writeback_work *work) 88static void bdi_wakeup_flusher(struct backing_dev_info *bdi)
89{ 89{
90 trace_writeback_queue(bdi, work);
91
92 spin_lock_bh(&bdi->wb_lock);
93 list_add_tail(&work->list, &bdi->work_list);
94 if (bdi->wb.task) { 90 if (bdi->wb.task) {
95 wake_up_process(bdi->wb.task); 91 wake_up_process(bdi->wb.task);
96 } else { 92 } else {
@@ -98,15 +94,26 @@ static void bdi_queue_work(struct backing_dev_info *bdi,
98 * The bdi thread isn't there, wake up the forker thread which 94 * The bdi thread isn't there, wake up the forker thread which
99 * will create and run it. 95 * will create and run it.
100 */ 96 */
101 trace_writeback_nothread(bdi, work);
102 wake_up_process(default_backing_dev_info.wb.task); 97 wake_up_process(default_backing_dev_info.wb.task);
103 } 98 }
99}
100
101static void bdi_queue_work(struct backing_dev_info *bdi,
102 struct wb_writeback_work *work)
103{
104 trace_writeback_queue(bdi, work);
105
106 spin_lock_bh(&bdi->wb_lock);
107 list_add_tail(&work->list, &bdi->work_list);
108 if (!bdi->wb.task)
109 trace_writeback_nothread(bdi, work);
110 bdi_wakeup_flusher(bdi);
104 spin_unlock_bh(&bdi->wb_lock); 111 spin_unlock_bh(&bdi->wb_lock);
105} 112}
106 113
107static void 114static void
108__bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, 115__bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
109 bool range_cyclic, bool for_background) 116 bool range_cyclic)
110{ 117{
111 struct wb_writeback_work *work; 118 struct wb_writeback_work *work;
112 119
@@ -126,7 +133,6 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
126 work->sync_mode = WB_SYNC_NONE; 133 work->sync_mode = WB_SYNC_NONE;
127 work->nr_pages = nr_pages; 134 work->nr_pages = nr_pages;
128 work->range_cyclic = range_cyclic; 135 work->range_cyclic = range_cyclic;
129 work->for_background = for_background;
130 136
131 bdi_queue_work(bdi, work); 137 bdi_queue_work(bdi, work);
132} 138}
@@ -144,7 +150,7 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
144 */ 150 */
145void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages) 151void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages)
146{ 152{
147 __bdi_start_writeback(bdi, nr_pages, true, false); 153 __bdi_start_writeback(bdi, nr_pages, true);
148} 154}
149 155
150/** 156/**
@@ -152,13 +158,21 @@ void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages)
152 * @bdi: the backing device to write from 158 * @bdi: the backing device to write from
153 * 159 *
154 * Description: 160 * Description:
155 * This does WB_SYNC_NONE background writeback. The IO is only 161 * This makes sure WB_SYNC_NONE background writeback happens. When
156 * started when this function returns, we make no guarentees on 162 * this function returns, it is only guaranteed that for given BDI
157 * completion. Caller need not hold sb s_umount semaphore. 163 * some IO is happening if we are over background dirty threshold.
164 * Caller need not hold sb s_umount semaphore.
158 */ 165 */
159void bdi_start_background_writeback(struct backing_dev_info *bdi) 166void bdi_start_background_writeback(struct backing_dev_info *bdi)
160{ 167{
161 __bdi_start_writeback(bdi, LONG_MAX, true, true); 168 /*
169 * We just wake up the flusher thread. It will perform background
170 * writeback as soon as there is no other work to do.
171 */
172 trace_writeback_wake_background(bdi);
173 spin_lock_bh(&bdi->wb_lock);
174 bdi_wakeup_flusher(bdi);
175 spin_unlock_bh(&bdi->wb_lock);
162} 176}
163 177
164/* 178/*
@@ -616,6 +630,7 @@ static long wb_writeback(struct bdi_writeback *wb,
616 }; 630 };
617 unsigned long oldest_jif; 631 unsigned long oldest_jif;
618 long wrote = 0; 632 long wrote = 0;
633 long write_chunk;
619 struct inode *inode; 634 struct inode *inode;
620 635
621 if (wbc.for_kupdate) { 636 if (wbc.for_kupdate) {
@@ -628,6 +643,24 @@ static long wb_writeback(struct bdi_writeback *wb,
628 wbc.range_end = LLONG_MAX; 643 wbc.range_end = LLONG_MAX;
629 } 644 }
630 645
646 /*
647 * WB_SYNC_ALL mode does livelock avoidance by syncing dirty
648 * inodes/pages in one big loop. Setting wbc.nr_to_write=LONG_MAX
649 * here avoids calling into writeback_inodes_wb() more than once.
650 *
651 * The intended call sequence for WB_SYNC_ALL writeback is:
652 *
653 * wb_writeback()
654 * __writeback_inodes_sb() <== called only once
655 * write_cache_pages() <== called once for each inode
656 * (quickly) tag currently dirty pages
657 * (maybe slowly) sync all tagged pages
658 */
659 if (wbc.sync_mode == WB_SYNC_NONE)
660 write_chunk = MAX_WRITEBACK_PAGES;
661 else
662 write_chunk = LONG_MAX;
663
631 wbc.wb_start = jiffies; /* livelock avoidance */ 664 wbc.wb_start = jiffies; /* livelock avoidance */
632 for (;;) { 665 for (;;) {
633 /* 666 /*
@@ -637,6 +670,16 @@ static long wb_writeback(struct bdi_writeback *wb,
637 break; 670 break;
638 671
639 /* 672 /*
673 * Background writeout and kupdate-style writeback may
674 * run forever. Stop them if there is other work to do
675 * so that e.g. sync can proceed. They'll be restarted
676 * after the other works are all done.
677 */
678 if ((work->for_background || work->for_kupdate) &&
679 !list_empty(&wb->bdi->work_list))
680 break;
681
682 /*
640 * For background writeout, stop when we are below the 683 * For background writeout, stop when we are below the
641 * background dirty threshold 684 * background dirty threshold
642 */ 685 */
@@ -644,7 +687,7 @@ static long wb_writeback(struct bdi_writeback *wb,
644 break; 687 break;
645 688
646 wbc.more_io = 0; 689 wbc.more_io = 0;
647 wbc.nr_to_write = MAX_WRITEBACK_PAGES; 690 wbc.nr_to_write = write_chunk;
648 wbc.pages_skipped = 0; 691 wbc.pages_skipped = 0;
649 692
650 trace_wbc_writeback_start(&wbc, wb->bdi); 693 trace_wbc_writeback_start(&wbc, wb->bdi);
@@ -654,8 +697,8 @@ static long wb_writeback(struct bdi_writeback *wb,
654 writeback_inodes_wb(wb, &wbc); 697 writeback_inodes_wb(wb, &wbc);
655 trace_wbc_writeback_written(&wbc, wb->bdi); 698 trace_wbc_writeback_written(&wbc, wb->bdi);
656 699
657 work->nr_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write; 700 work->nr_pages -= write_chunk - wbc.nr_to_write;
658 wrote += MAX_WRITEBACK_PAGES - wbc.nr_to_write; 701 wrote += write_chunk - wbc.nr_to_write;
659 702
660 /* 703 /*
661 * If we consumed everything, see if we have more 704 * If we consumed everything, see if we have more
@@ -670,7 +713,7 @@ static long wb_writeback(struct bdi_writeback *wb,
670 /* 713 /*
671 * Did we write something? Try for more 714 * Did we write something? Try for more
672 */ 715 */
673 if (wbc.nr_to_write < MAX_WRITEBACK_PAGES) 716 if (wbc.nr_to_write < write_chunk)
674 continue; 717 continue;
675 /* 718 /*
676 * Nothing written. Wait for some inode to 719 * Nothing written. Wait for some inode to
@@ -718,6 +761,23 @@ static unsigned long get_nr_dirty_pages(void)
718 get_nr_dirty_inodes(); 761 get_nr_dirty_inodes();
719} 762}
720 763
764static long wb_check_background_flush(struct bdi_writeback *wb)
765{
766 if (over_bground_thresh()) {
767
768 struct wb_writeback_work work = {
769 .nr_pages = LONG_MAX,
770 .sync_mode = WB_SYNC_NONE,
771 .for_background = 1,
772 .range_cyclic = 1,
773 };
774
775 return wb_writeback(wb, &work);
776 }
777
778 return 0;
779}
780
721static long wb_check_old_data_flush(struct bdi_writeback *wb) 781static long wb_check_old_data_flush(struct bdi_writeback *wb)
722{ 782{
723 unsigned long expired; 783 unsigned long expired;
@@ -787,6 +847,7 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
787 * Check for periodic writeback, kupdated() style 847 * Check for periodic writeback, kupdated() style
788 */ 848 */
789 wrote += wb_check_old_data_flush(wb); 849 wrote += wb_check_old_data_flush(wb);
850 wrote += wb_check_background_flush(wb);
790 clear_bit(BDI_writeback_running, &wb->bdi->state); 851 clear_bit(BDI_writeback_running, &wb->bdi->state);
791 852
792 return wrote; 853 return wrote;
@@ -873,7 +934,7 @@ void wakeup_flusher_threads(long nr_pages)
873 list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) { 934 list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
874 if (!bdi_has_dirty_io(bdi)) 935 if (!bdi_has_dirty_io(bdi))
875 continue; 936 continue;
876 __bdi_start_writeback(bdi, nr_pages, false, false); 937 __bdi_start_writeback(bdi, nr_pages, false);
877 } 938 }
878 rcu_read_unlock(); 939 rcu_read_unlock();
879} 940}
@@ -1164,7 +1225,7 @@ EXPORT_SYMBOL(writeback_inodes_sb_nr_if_idle);
1164 * @sb: the superblock 1225 * @sb: the superblock
1165 * 1226 *
1166 * This function writes and waits on any dirty inode belonging to this 1227 * This function writes and waits on any dirty inode belonging to this
1167 * super_block. The number of pages synced is returned. 1228 * super_block.
1168 */ 1229 */
1169void sync_inodes_sb(struct super_block *sb) 1230void sync_inodes_sb(struct super_block *sb)
1170{ 1231{
@@ -1242,11 +1303,11 @@ int sync_inode(struct inode *inode, struct writeback_control *wbc)
1242EXPORT_SYMBOL(sync_inode); 1303EXPORT_SYMBOL(sync_inode);
1243 1304
1244/** 1305/**
1245 * sync_inode - write an inode to disk 1306 * sync_inode_metadata - write an inode to disk
1246 * @inode: the inode to sync 1307 * @inode: the inode to sync
1247 * @wait: wait for I/O to complete. 1308 * @wait: wait for I/O to complete.
1248 * 1309 *
1249 * Write an inode to disk and adjust it's dirty state after completion. 1310 * Write an inode to disk and adjust its dirty state after completion.
1250 * 1311 *
1251 * Note: only writes the actual inode, no associated data or other metadata. 1312 * Note: only writes the actual inode, no associated data or other metadata.
1252 */ 1313 */
diff --git a/fs/fs_struct.c b/fs/fs_struct.c
index ed45a9cf5f3..78b519c1353 100644
--- a/fs/fs_struct.c
+++ b/fs/fs_struct.c
@@ -4,6 +4,19 @@
4#include <linux/path.h> 4#include <linux/path.h>
5#include <linux/slab.h> 5#include <linux/slab.h>
6#include <linux/fs_struct.h> 6#include <linux/fs_struct.h>
7#include "internal.h"
8
9static inline void path_get_longterm(struct path *path)
10{
11 path_get(path);
12 mnt_make_longterm(path->mnt);
13}
14
15static inline void path_put_longterm(struct path *path)
16{
17 mnt_make_shortterm(path->mnt);
18 path_put(path);
19}
7 20
8/* 21/*
9 * Replace the fs->{rootmnt,root} with {mnt,dentry}. Put the old values. 22 * Replace the fs->{rootmnt,root} with {mnt,dentry}. Put the old values.
@@ -14,12 +27,14 @@ void set_fs_root(struct fs_struct *fs, struct path *path)
14 struct path old_root; 27 struct path old_root;
15 28
16 spin_lock(&fs->lock); 29 spin_lock(&fs->lock);
30 write_seqcount_begin(&fs->seq);
17 old_root = fs->root; 31 old_root = fs->root;
18 fs->root = *path; 32 fs->root = *path;
19 path_get(path); 33 path_get_longterm(path);
34 write_seqcount_end(&fs->seq);
20 spin_unlock(&fs->lock); 35 spin_unlock(&fs->lock);
21 if (old_root.dentry) 36 if (old_root.dentry)
22 path_put(&old_root); 37 path_put_longterm(&old_root);
23} 38}
24 39
25/* 40/*
@@ -31,13 +46,15 @@ void set_fs_pwd(struct fs_struct *fs, struct path *path)
31 struct path old_pwd; 46 struct path old_pwd;
32 47
33 spin_lock(&fs->lock); 48 spin_lock(&fs->lock);
49 write_seqcount_begin(&fs->seq);
34 old_pwd = fs->pwd; 50 old_pwd = fs->pwd;
35 fs->pwd = *path; 51 fs->pwd = *path;
36 path_get(path); 52 path_get_longterm(path);
53 write_seqcount_end(&fs->seq);
37 spin_unlock(&fs->lock); 54 spin_unlock(&fs->lock);
38 55
39 if (old_pwd.dentry) 56 if (old_pwd.dentry)
40 path_put(&old_pwd); 57 path_put_longterm(&old_pwd);
41} 58}
42 59
43void chroot_fs_refs(struct path *old_root, struct path *new_root) 60void chroot_fs_refs(struct path *old_root, struct path *new_root)
@@ -52,31 +69,33 @@ void chroot_fs_refs(struct path *old_root, struct path *new_root)
52 fs = p->fs; 69 fs = p->fs;
53 if (fs) { 70 if (fs) {
54 spin_lock(&fs->lock); 71 spin_lock(&fs->lock);
72 write_seqcount_begin(&fs->seq);
55 if (fs->root.dentry == old_root->dentry 73 if (fs->root.dentry == old_root->dentry
56 && fs->root.mnt == old_root->mnt) { 74 && fs->root.mnt == old_root->mnt) {
57 path_get(new_root); 75 path_get_longterm(new_root);
58 fs->root = *new_root; 76 fs->root = *new_root;
59 count++; 77 count++;
60 } 78 }
61 if (fs->pwd.dentry == old_root->dentry 79 if (fs->pwd.dentry == old_root->dentry
62 && fs->pwd.mnt == old_root->mnt) { 80 && fs->pwd.mnt == old_root->mnt) {
63 path_get(new_root); 81 path_get_longterm(new_root);
64 fs->pwd = *new_root; 82 fs->pwd = *new_root;
65 count++; 83 count++;
66 } 84 }
85 write_seqcount_end(&fs->seq);
67 spin_unlock(&fs->lock); 86 spin_unlock(&fs->lock);
68 } 87 }
69 task_unlock(p); 88 task_unlock(p);
70 } while_each_thread(g, p); 89 } while_each_thread(g, p);
71 read_unlock(&tasklist_lock); 90 read_unlock(&tasklist_lock);
72 while (count--) 91 while (count--)
73 path_put(old_root); 92 path_put_longterm(old_root);
74} 93}
75 94
76void free_fs_struct(struct fs_struct *fs) 95void free_fs_struct(struct fs_struct *fs)
77{ 96{
78 path_put(&fs->root); 97 path_put_longterm(&fs->root);
79 path_put(&fs->pwd); 98 path_put_longterm(&fs->pwd);
80 kmem_cache_free(fs_cachep, fs); 99 kmem_cache_free(fs_cachep, fs);
81} 100}
82 101
@@ -88,8 +107,10 @@ void exit_fs(struct task_struct *tsk)
88 int kill; 107 int kill;
89 task_lock(tsk); 108 task_lock(tsk);
90 spin_lock(&fs->lock); 109 spin_lock(&fs->lock);
110 write_seqcount_begin(&fs->seq);
91 tsk->fs = NULL; 111 tsk->fs = NULL;
92 kill = !--fs->users; 112 kill = !--fs->users;
113 write_seqcount_end(&fs->seq);
93 spin_unlock(&fs->lock); 114 spin_unlock(&fs->lock);
94 task_unlock(tsk); 115 task_unlock(tsk);
95 if (kill) 116 if (kill)
@@ -105,8 +126,15 @@ struct fs_struct *copy_fs_struct(struct fs_struct *old)
105 fs->users = 1; 126 fs->users = 1;
106 fs->in_exec = 0; 127 fs->in_exec = 0;
107 spin_lock_init(&fs->lock); 128 spin_lock_init(&fs->lock);
129 seqcount_init(&fs->seq);
108 fs->umask = old->umask; 130 fs->umask = old->umask;
109 get_fs_root_and_pwd(old, &fs->root, &fs->pwd); 131
132 spin_lock(&old->lock);
133 fs->root = old->root;
134 path_get_longterm(&fs->root);
135 fs->pwd = old->pwd;
136 path_get_longterm(&fs->pwd);
137 spin_unlock(&old->lock);
110 } 138 }
111 return fs; 139 return fs;
112} 140}
@@ -144,6 +172,7 @@ EXPORT_SYMBOL(current_umask);
144struct fs_struct init_fs = { 172struct fs_struct init_fs = {
145 .users = 1, 173 .users = 1,
146 .lock = __SPIN_LOCK_UNLOCKED(init_fs.lock), 174 .lock = __SPIN_LOCK_UNLOCKED(init_fs.lock),
175 .seq = SEQCNT_ZERO,
147 .umask = 0022, 176 .umask = 0022,
148}; 177};
149 178
diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c
index b9f34eaede0..48a18f184d5 100644
--- a/fs/fscache/operation.c
+++ b/fs/fscache/operation.c
@@ -101,7 +101,7 @@ int fscache_submit_exclusive_op(struct fscache_object *object,
101 object->n_ops++; 101 object->n_ops++;
102 object->n_exclusive++; /* reads and writes must wait */ 102 object->n_exclusive++; /* reads and writes must wait */
103 103
104 if (object->n_ops > 0) { 104 if (object->n_ops > 1) {
105 atomic_inc(&op->usage); 105 atomic_inc(&op->usage);
106 list_add_tail(&op->pend_link, &object->pending_ops); 106 list_add_tail(&op->pend_link, &object->pending_ops);
107 fscache_stat(&fscache_n_op_pend); 107 fscache_stat(&fscache_n_op_pend);
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 6e07696308d..cf8d28d1fba 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -251,6 +251,20 @@ static void queue_request(struct fuse_conn *fc, struct fuse_req *req)
251 kill_fasync(&fc->fasync, SIGIO, POLL_IN); 251 kill_fasync(&fc->fasync, SIGIO, POLL_IN);
252} 252}
253 253
254void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget,
255 u64 nodeid, u64 nlookup)
256{
257 forget->forget_one.nodeid = nodeid;
258 forget->forget_one.nlookup = nlookup;
259
260 spin_lock(&fc->lock);
261 fc->forget_list_tail->next = forget;
262 fc->forget_list_tail = forget;
263 wake_up(&fc->waitq);
264 kill_fasync(&fc->fasync, SIGIO, POLL_IN);
265 spin_unlock(&fc->lock);
266}
267
254static void flush_bg_queue(struct fuse_conn *fc) 268static void flush_bg_queue(struct fuse_conn *fc)
255{ 269{
256 while (fc->active_background < fc->max_background && 270 while (fc->active_background < fc->max_background &&
@@ -438,12 +452,6 @@ static void fuse_request_send_nowait(struct fuse_conn *fc, struct fuse_req *req)
438 } 452 }
439} 453}
440 454
441void fuse_request_send_noreply(struct fuse_conn *fc, struct fuse_req *req)
442{
443 req->isreply = 0;
444 fuse_request_send_nowait(fc, req);
445}
446
447void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req) 455void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req)
448{ 456{
449 req->isreply = 1; 457 req->isreply = 1;
@@ -896,9 +904,15 @@ static int fuse_copy_args(struct fuse_copy_state *cs, unsigned numargs,
896 return err; 904 return err;
897} 905}
898 906
907static int forget_pending(struct fuse_conn *fc)
908{
909 return fc->forget_list_head.next != NULL;
910}
911
899static int request_pending(struct fuse_conn *fc) 912static int request_pending(struct fuse_conn *fc)
900{ 913{
901 return !list_empty(&fc->pending) || !list_empty(&fc->interrupts); 914 return !list_empty(&fc->pending) || !list_empty(&fc->interrupts) ||
915 forget_pending(fc);
902} 916}
903 917
904/* Wait until a request is available on the pending list */ 918/* Wait until a request is available on the pending list */
@@ -960,6 +974,120 @@ __releases(fc->lock)
960 return err ? err : reqsize; 974 return err ? err : reqsize;
961} 975}
962 976
977static struct fuse_forget_link *dequeue_forget(struct fuse_conn *fc,
978 unsigned max,
979 unsigned *countp)
980{
981 struct fuse_forget_link *head = fc->forget_list_head.next;
982 struct fuse_forget_link **newhead = &head;
983 unsigned count;
984
985 for (count = 0; *newhead != NULL && count < max; count++)
986 newhead = &(*newhead)->next;
987
988 fc->forget_list_head.next = *newhead;
989 *newhead = NULL;
990 if (fc->forget_list_head.next == NULL)
991 fc->forget_list_tail = &fc->forget_list_head;
992
993 if (countp != NULL)
994 *countp = count;
995
996 return head;
997}
998
999static int fuse_read_single_forget(struct fuse_conn *fc,
1000 struct fuse_copy_state *cs,
1001 size_t nbytes)
1002__releases(fc->lock)
1003{
1004 int err;
1005 struct fuse_forget_link *forget = dequeue_forget(fc, 1, NULL);
1006 struct fuse_forget_in arg = {
1007 .nlookup = forget->forget_one.nlookup,
1008 };
1009 struct fuse_in_header ih = {
1010 .opcode = FUSE_FORGET,
1011 .nodeid = forget->forget_one.nodeid,
1012 .unique = fuse_get_unique(fc),
1013 .len = sizeof(ih) + sizeof(arg),
1014 };
1015
1016 spin_unlock(&fc->lock);
1017 kfree(forget);
1018 if (nbytes < ih.len)
1019 return -EINVAL;
1020
1021 err = fuse_copy_one(cs, &ih, sizeof(ih));
1022 if (!err)
1023 err = fuse_copy_one(cs, &arg, sizeof(arg));
1024 fuse_copy_finish(cs);
1025
1026 if (err)
1027 return err;
1028
1029 return ih.len;
1030}
1031
1032static int fuse_read_batch_forget(struct fuse_conn *fc,
1033 struct fuse_copy_state *cs, size_t nbytes)
1034__releases(fc->lock)
1035{
1036 int err;
1037 unsigned max_forgets;
1038 unsigned count;
1039 struct fuse_forget_link *head;
1040 struct fuse_batch_forget_in arg = { .count = 0 };
1041 struct fuse_in_header ih = {
1042 .opcode = FUSE_BATCH_FORGET,
1043 .unique = fuse_get_unique(fc),
1044 .len = sizeof(ih) + sizeof(arg),
1045 };
1046
1047 if (nbytes < ih.len) {
1048 spin_unlock(&fc->lock);
1049 return -EINVAL;
1050 }
1051
1052 max_forgets = (nbytes - ih.len) / sizeof(struct fuse_forget_one);
1053 head = dequeue_forget(fc, max_forgets, &count);
1054 spin_unlock(&fc->lock);
1055
1056 arg.count = count;
1057 ih.len += count * sizeof(struct fuse_forget_one);
1058 err = fuse_copy_one(cs, &ih, sizeof(ih));
1059 if (!err)
1060 err = fuse_copy_one(cs, &arg, sizeof(arg));
1061
1062 while (head) {
1063 struct fuse_forget_link *forget = head;
1064
1065 if (!err) {
1066 err = fuse_copy_one(cs, &forget->forget_one,
1067 sizeof(forget->forget_one));
1068 }
1069 head = forget->next;
1070 kfree(forget);
1071 }
1072
1073 fuse_copy_finish(cs);
1074
1075 if (err)
1076 return err;
1077
1078 return ih.len;
1079}
1080
1081static int fuse_read_forget(struct fuse_conn *fc, struct fuse_copy_state *cs,
1082 size_t nbytes)
1083__releases(fc->lock)
1084{
1085 if (fc->minor < 16 || fc->forget_list_head.next->next == NULL)
1086 return fuse_read_single_forget(fc, cs, nbytes);
1087 else
1088 return fuse_read_batch_forget(fc, cs, nbytes);
1089}
1090
963/* 1091/*
964 * Read a single request into the userspace filesystem's buffer. This 1092 * Read a single request into the userspace filesystem's buffer. This
965 * function waits until a request is available, then removes it from 1093 * function waits until a request is available, then removes it from
@@ -998,6 +1126,14 @@ static ssize_t fuse_dev_do_read(struct fuse_conn *fc, struct file *file,
998 return fuse_read_interrupt(fc, cs, nbytes, req); 1126 return fuse_read_interrupt(fc, cs, nbytes, req);
999 } 1127 }
1000 1128
1129 if (forget_pending(fc)) {
1130 if (list_empty(&fc->pending) || fc->forget_batch-- > 0)
1131 return fuse_read_forget(fc, cs, nbytes);
1132
1133 if (fc->forget_batch <= -8)
1134 fc->forget_batch = 16;
1135 }
1136
1001 req = list_entry(fc->pending.next, struct fuse_req, list); 1137 req = list_entry(fc->pending.next, struct fuse_req, list);
1002 req->state = FUSE_REQ_READING; 1138 req->state = FUSE_REQ_READING;
1003 list_move(&req->list, &fc->io); 1139 list_move(&req->list, &fc->io);
@@ -1090,7 +1226,7 @@ static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos,
1090 if (!fc) 1226 if (!fc)
1091 return -EPERM; 1227 return -EPERM;
1092 1228
1093 bufs = kmalloc(pipe->buffers * sizeof (struct pipe_buffer), GFP_KERNEL); 1229 bufs = kmalloc(pipe->buffers * sizeof(struct pipe_buffer), GFP_KERNEL);
1094 if (!bufs) 1230 if (!bufs)
1095 return -ENOMEM; 1231 return -ENOMEM;
1096 1232
@@ -1626,7 +1762,7 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe,
1626 if (!fc) 1762 if (!fc)
1627 return -EPERM; 1763 return -EPERM;
1628 1764
1629 bufs = kmalloc(pipe->buffers * sizeof (struct pipe_buffer), GFP_KERNEL); 1765 bufs = kmalloc(pipe->buffers * sizeof(struct pipe_buffer), GFP_KERNEL);
1630 if (!bufs) 1766 if (!bufs)
1631 return -ENOMEM; 1767 return -ENOMEM;
1632 1768
@@ -1770,6 +1906,8 @@ __acquires(fc->lock)
1770 flush_bg_queue(fc); 1906 flush_bg_queue(fc);
1771 end_requests(fc, &fc->pending); 1907 end_requests(fc, &fc->pending);
1772 end_requests(fc, &fc->processing); 1908 end_requests(fc, &fc->processing);
1909 while (forget_pending(fc))
1910 kfree(dequeue_forget(fc, 1, NULL));
1773} 1911}
1774 1912
1775/* 1913/*
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index c9627c95482..bfed8447ed8 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -10,9 +10,9 @@
10 10
11#include <linux/pagemap.h> 11#include <linux/pagemap.h>
12#include <linux/file.h> 12#include <linux/file.h>
13#include <linux/gfp.h>
14#include <linux/sched.h> 13#include <linux/sched.h>
15#include <linux/namei.h> 14#include <linux/namei.h>
15#include <linux/slab.h>
16 16
17#if BITS_PER_LONG >= 64 17#if BITS_PER_LONG >= 64
18static inline void fuse_dentry_settime(struct dentry *entry, u64 time) 18static inline void fuse_dentry_settime(struct dentry *entry, u64 time)
@@ -156,8 +156,12 @@ u64 fuse_get_attr_version(struct fuse_conn *fc)
156 */ 156 */
157static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd) 157static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd)
158{ 158{
159 struct inode *inode = entry->d_inode; 159 struct inode *inode;
160 160
161 if (nd->flags & LOOKUP_RCU)
162 return -ECHILD;
163
164 inode = entry->d_inode;
161 if (inode && is_bad_inode(inode)) 165 if (inode && is_bad_inode(inode))
162 return 0; 166 return 0;
163 else if (fuse_dentry_time(entry) < get_jiffies_64()) { 167 else if (fuse_dentry_time(entry) < get_jiffies_64()) {
@@ -165,7 +169,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd)
165 struct fuse_entry_out outarg; 169 struct fuse_entry_out outarg;
166 struct fuse_conn *fc; 170 struct fuse_conn *fc;
167 struct fuse_req *req; 171 struct fuse_req *req;
168 struct fuse_req *forget_req; 172 struct fuse_forget_link *forget;
169 struct dentry *parent; 173 struct dentry *parent;
170 u64 attr_version; 174 u64 attr_version;
171 175
@@ -178,8 +182,8 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd)
178 if (IS_ERR(req)) 182 if (IS_ERR(req))
179 return 0; 183 return 0;
180 184
181 forget_req = fuse_get_req(fc); 185 forget = fuse_alloc_forget();
182 if (IS_ERR(forget_req)) { 186 if (!forget) {
183 fuse_put_request(fc, req); 187 fuse_put_request(fc, req);
184 return 0; 188 return 0;
185 } 189 }
@@ -199,15 +203,14 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd)
199 if (!err) { 203 if (!err) {
200 struct fuse_inode *fi = get_fuse_inode(inode); 204 struct fuse_inode *fi = get_fuse_inode(inode);
201 if (outarg.nodeid != get_node_id(inode)) { 205 if (outarg.nodeid != get_node_id(inode)) {
202 fuse_send_forget(fc, forget_req, 206 fuse_queue_forget(fc, forget, outarg.nodeid, 1);
203 outarg.nodeid, 1);
204 return 0; 207 return 0;
205 } 208 }
206 spin_lock(&fc->lock); 209 spin_lock(&fc->lock);
207 fi->nlookup++; 210 fi->nlookup++;
208 spin_unlock(&fc->lock); 211 spin_unlock(&fc->lock);
209 } 212 }
210 fuse_put_request(fc, forget_req); 213 kfree(forget);
211 if (err || (outarg.attr.mode ^ inode->i_mode) & S_IFMT) 214 if (err || (outarg.attr.mode ^ inode->i_mode) & S_IFMT)
212 return 0; 215 return 0;
213 216
@@ -259,7 +262,7 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, struct qstr *name,
259{ 262{
260 struct fuse_conn *fc = get_fuse_conn_super(sb); 263 struct fuse_conn *fc = get_fuse_conn_super(sb);
261 struct fuse_req *req; 264 struct fuse_req *req;
262 struct fuse_req *forget_req; 265 struct fuse_forget_link *forget;
263 u64 attr_version; 266 u64 attr_version;
264 int err; 267 int err;
265 268
@@ -273,9 +276,9 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, struct qstr *name,
273 if (IS_ERR(req)) 276 if (IS_ERR(req))
274 goto out; 277 goto out;
275 278
276 forget_req = fuse_get_req(fc); 279 forget = fuse_alloc_forget();
277 err = PTR_ERR(forget_req); 280 err = -ENOMEM;
278 if (IS_ERR(forget_req)) { 281 if (!forget) {
279 fuse_put_request(fc, req); 282 fuse_put_request(fc, req);
280 goto out; 283 goto out;
281 } 284 }
@@ -301,13 +304,13 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, struct qstr *name,
301 attr_version); 304 attr_version);
302 err = -ENOMEM; 305 err = -ENOMEM;
303 if (!*inode) { 306 if (!*inode) {
304 fuse_send_forget(fc, forget_req, outarg->nodeid, 1); 307 fuse_queue_forget(fc, forget, outarg->nodeid, 1);
305 goto out; 308 goto out;
306 } 309 }
307 err = 0; 310 err = 0;
308 311
309 out_put_forget: 312 out_put_forget:
310 fuse_put_request(fc, forget_req); 313 kfree(forget);
311 out: 314 out:
312 return err; 315 return err;
313} 316}
@@ -347,7 +350,6 @@ static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry,
347 } 350 }
348 351
349 entry = newent ? newent : entry; 352 entry = newent ? newent : entry;
350 entry->d_op = &fuse_dentry_operations;
351 if (outarg_valid) 353 if (outarg_valid)
352 fuse_change_entry_timeout(entry, &outarg); 354 fuse_change_entry_timeout(entry, &outarg);
353 else 355 else
@@ -374,7 +376,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,
374 struct inode *inode; 376 struct inode *inode;
375 struct fuse_conn *fc = get_fuse_conn(dir); 377 struct fuse_conn *fc = get_fuse_conn(dir);
376 struct fuse_req *req; 378 struct fuse_req *req;
377 struct fuse_req *forget_req; 379 struct fuse_forget_link *forget;
378 struct fuse_create_in inarg; 380 struct fuse_create_in inarg;
379 struct fuse_open_out outopen; 381 struct fuse_open_out outopen;
380 struct fuse_entry_out outentry; 382 struct fuse_entry_out outentry;
@@ -388,9 +390,9 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,
388 if (flags & O_DIRECT) 390 if (flags & O_DIRECT)
389 return -EINVAL; 391 return -EINVAL;
390 392
391 forget_req = fuse_get_req(fc); 393 forget = fuse_alloc_forget();
392 if (IS_ERR(forget_req)) 394 if (!forget)
393 return PTR_ERR(forget_req); 395 return -ENOMEM;
394 396
395 req = fuse_get_req(fc); 397 req = fuse_get_req(fc);
396 err = PTR_ERR(req); 398 err = PTR_ERR(req);
@@ -448,10 +450,10 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,
448 if (!inode) { 450 if (!inode) {
449 flags &= ~(O_CREAT | O_EXCL | O_TRUNC); 451 flags &= ~(O_CREAT | O_EXCL | O_TRUNC);
450 fuse_sync_release(ff, flags); 452 fuse_sync_release(ff, flags);
451 fuse_send_forget(fc, forget_req, outentry.nodeid, 1); 453 fuse_queue_forget(fc, forget, outentry.nodeid, 1);
452 return -ENOMEM; 454 return -ENOMEM;
453 } 455 }
454 fuse_put_request(fc, forget_req); 456 kfree(forget);
455 d_instantiate(entry, inode); 457 d_instantiate(entry, inode);
456 fuse_change_entry_timeout(entry, &outentry); 458 fuse_change_entry_timeout(entry, &outentry);
457 fuse_invalidate_attr(dir); 459 fuse_invalidate_attr(dir);
@@ -469,7 +471,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,
469 out_put_request: 471 out_put_request:
470 fuse_put_request(fc, req); 472 fuse_put_request(fc, req);
471 out_put_forget_req: 473 out_put_forget_req:
472 fuse_put_request(fc, forget_req); 474 kfree(forget);
473 return err; 475 return err;
474} 476}
475 477
@@ -483,12 +485,12 @@ static int create_new_entry(struct fuse_conn *fc, struct fuse_req *req,
483 struct fuse_entry_out outarg; 485 struct fuse_entry_out outarg;
484 struct inode *inode; 486 struct inode *inode;
485 int err; 487 int err;
486 struct fuse_req *forget_req; 488 struct fuse_forget_link *forget;
487 489
488 forget_req = fuse_get_req(fc); 490 forget = fuse_alloc_forget();
489 if (IS_ERR(forget_req)) { 491 if (!forget) {
490 fuse_put_request(fc, req); 492 fuse_put_request(fc, req);
491 return PTR_ERR(forget_req); 493 return -ENOMEM;
492 } 494 }
493 495
494 memset(&outarg, 0, sizeof(outarg)); 496 memset(&outarg, 0, sizeof(outarg));
@@ -515,10 +517,10 @@ static int create_new_entry(struct fuse_conn *fc, struct fuse_req *req,
515 inode = fuse_iget(dir->i_sb, outarg.nodeid, outarg.generation, 517 inode = fuse_iget(dir->i_sb, outarg.nodeid, outarg.generation,
516 &outarg.attr, entry_attr_timeout(&outarg), 0); 518 &outarg.attr, entry_attr_timeout(&outarg), 0);
517 if (!inode) { 519 if (!inode) {
518 fuse_send_forget(fc, forget_req, outarg.nodeid, 1); 520 fuse_queue_forget(fc, forget, outarg.nodeid, 1);
519 return -ENOMEM; 521 return -ENOMEM;
520 } 522 }
521 fuse_put_request(fc, forget_req); 523 kfree(forget);
522 524
523 if (S_ISDIR(inode->i_mode)) { 525 if (S_ISDIR(inode->i_mode)) {
524 struct dentry *alias; 526 struct dentry *alias;
@@ -541,7 +543,7 @@ static int create_new_entry(struct fuse_conn *fc, struct fuse_req *req,
541 return 0; 543 return 0;
542 544
543 out_put_forget_req: 545 out_put_forget_req:
544 fuse_put_request(fc, forget_req); 546 kfree(forget);
545 return err; 547 return err;
546} 548}
547 549
@@ -981,12 +983,15 @@ static int fuse_access(struct inode *inode, int mask)
981 * access request is sent. Execute permission is still checked 983 * access request is sent. Execute permission is still checked
982 * locally based on file mode. 984 * locally based on file mode.
983 */ 985 */
984static int fuse_permission(struct inode *inode, int mask) 986static int fuse_permission(struct inode *inode, int mask, unsigned int flags)
985{ 987{
986 struct fuse_conn *fc = get_fuse_conn(inode); 988 struct fuse_conn *fc = get_fuse_conn(inode);
987 bool refreshed = false; 989 bool refreshed = false;
988 int err = 0; 990 int err = 0;
989 991
992 if (flags & IPERM_FLAG_RCU)
993 return -ECHILD;
994
990 if (!fuse_allow_task(fc, current)) 995 if (!fuse_allow_task(fc, current))
991 return -EACCES; 996 return -EACCES;
992 997
@@ -1001,7 +1006,7 @@ static int fuse_permission(struct inode *inode, int mask)
1001 } 1006 }
1002 1007
1003 if (fc->flags & FUSE_DEFAULT_PERMISSIONS) { 1008 if (fc->flags & FUSE_DEFAULT_PERMISSIONS) {
1004 err = generic_permission(inode, mask, NULL); 1009 err = generic_permission(inode, mask, flags, NULL);
1005 1010
1006 /* If permission is denied, try to refresh file 1011 /* If permission is denied, try to refresh file
1007 attributes. This is also needed, because the root 1012 attributes. This is also needed, because the root
@@ -1009,7 +1014,8 @@ static int fuse_permission(struct inode *inode, int mask)
1009 if (err == -EACCES && !refreshed) { 1014 if (err == -EACCES && !refreshed) {
1010 err = fuse_do_getattr(inode, NULL, NULL); 1015 err = fuse_do_getattr(inode, NULL, NULL);
1011 if (!err) 1016 if (!err)
1012 err = generic_permission(inode, mask, NULL); 1017 err = generic_permission(inode, mask,
1018 flags, NULL);
1013 } 1019 }
1014 1020
1015 /* Note: the opposite of the above test does not 1021 /* Note: the opposite of the above test does not
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index c8224587123..95da1bc1c82 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -13,6 +13,7 @@
13#include <linux/kernel.h> 13#include <linux/kernel.h>
14#include <linux/sched.h> 14#include <linux/sched.h>
15#include <linux/module.h> 15#include <linux/module.h>
16#include <linux/compat.h>
16 17
17static const struct file_operations fuse_direct_io_file_operations; 18static const struct file_operations fuse_direct_io_file_operations;
18 19
@@ -134,6 +135,7 @@ EXPORT_SYMBOL_GPL(fuse_do_open);
134void fuse_finish_open(struct inode *inode, struct file *file) 135void fuse_finish_open(struct inode *inode, struct file *file)
135{ 136{
136 struct fuse_file *ff = file->private_data; 137 struct fuse_file *ff = file->private_data;
138 struct fuse_conn *fc = get_fuse_conn(inode);
137 139
138 if (ff->open_flags & FOPEN_DIRECT_IO) 140 if (ff->open_flags & FOPEN_DIRECT_IO)
139 file->f_op = &fuse_direct_io_file_operations; 141 file->f_op = &fuse_direct_io_file_operations;
@@ -141,6 +143,15 @@ void fuse_finish_open(struct inode *inode, struct file *file)
141 invalidate_inode_pages2(inode->i_mapping); 143 invalidate_inode_pages2(inode->i_mapping);
142 if (ff->open_flags & FOPEN_NONSEEKABLE) 144 if (ff->open_flags & FOPEN_NONSEEKABLE)
143 nonseekable_open(inode, file); 145 nonseekable_open(inode, file);
146 if (fc->atomic_o_trunc && (file->f_flags & O_TRUNC)) {
147 struct fuse_inode *fi = get_fuse_inode(inode);
148
149 spin_lock(&fc->lock);
150 fi->attr_version = ++fc->attr_version;
151 i_size_write(inode, 0);
152 spin_unlock(&fc->lock);
153 fuse_invalidate_attr(inode);
154 }
144} 155}
145 156
146int fuse_open_common(struct inode *inode, struct file *file, bool isdir) 157int fuse_open_common(struct inode *inode, struct file *file, bool isdir)
@@ -1618,6 +1629,94 @@ static int fuse_ioctl_copy_user(struct page **pages, struct iovec *iov,
1618} 1629}
1619 1630
1620/* 1631/*
1632 * CUSE servers compiled on 32bit broke on 64bit kernels because the
1633 * ABI was defined to be 'struct iovec' which is different on 32bit
1634 * and 64bit. Fortunately we can determine which structure the server
1635 * used from the size of the reply.
1636 */
static int fuse_copy_ioctl_iovec_old(struct iovec *dst, void *src,
				     size_t transferred, unsigned count,
				     bool is_compat)
{
	/*
	 * The layout used by the server is recognised from @transferred
	 * alone.  Returns 0 on success, -EINVAL for a compat-sized reply
	 * to a non-compat request, and -EIO when the size matches neither
	 * the compat nor the native struct iovec layout.
	 */
#ifdef CONFIG_COMPAT
	if (transferred == count * sizeof(struct compat_iovec)) {
		struct compat_iovec *from = src;
		unsigned idx = 0;

		/*
		 * With this interface a 32bit server cannot support
		 * non-compat (i.e. ones coming from 64bit apps) ioctl
		 * requests
		 */
		if (!is_compat)
			return -EINVAL;

		/* Widen each compat entry into a native one. */
		while (idx < count) {
			dst[idx].iov_base = compat_ptr(from[idx].iov_base);
			dst[idx].iov_len = from[idx].iov_len;
			idx++;
		}
		return 0;
	}
#endif

	if (transferred != count * sizeof(struct iovec))
		return -EIO;

	/* Native layout: the reply already is an array of struct iovec. */
	memcpy(dst, src, transferred);
	return 0;
}
1668
1669/* Make sure iov_length() won't overflow */
1670static int fuse_verify_ioctl_iov(struct iovec *iov, size_t count)
1671{
1672 size_t n;
1673 u32 max = FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT;
1674
1675 for (n = 0; n < count; n++) {
1676 if (iov->iov_len > (size_t) max)
1677 return -ENOMEM;
1678 max -= iov->iov_len;
1679 }
1680 return 0;
1681}
1682
1683static int fuse_copy_ioctl_iovec(struct fuse_conn *fc, struct iovec *dst,
1684 void *src, size_t transferred, unsigned count,
1685 bool is_compat)
1686{
1687 unsigned i;
1688 struct fuse_ioctl_iovec *fiov = src;
1689
1690 if (fc->minor < 16) {
1691 return fuse_copy_ioctl_iovec_old(dst, src, transferred,
1692 count, is_compat);
1693 }
1694
1695 if (count * sizeof(struct fuse_ioctl_iovec) != transferred)
1696 return -EIO;
1697
1698 for (i = 0; i < count; i++) {
1699 /* Did the server supply an inappropriate value? */
1700 if (fiov[i].base != (unsigned long) fiov[i].base ||
1701 fiov[i].len != (unsigned long) fiov[i].len)
1702 return -EIO;
1703
1704 dst[i].iov_base = (void __user *) (unsigned long) fiov[i].base;
1705 dst[i].iov_len = (size_t) fiov[i].len;
1706
1707#ifdef CONFIG_COMPAT
1708 if (is_compat &&
1709 (ptr_to_compat(dst[i].iov_base) != fiov[i].base ||
1710 (compat_size_t) dst[i].iov_len != fiov[i].len))
1711 return -EIO;
1712#endif
1713 }
1714
1715 return 0;
1716}
1717
1718
1719/*
1621 * For ioctls, there is no generic way to determine how much memory 1720 * For ioctls, there is no generic way to determine how much memory
1622 * needs to be read and/or written. Furthermore, ioctls are allowed 1721 * needs to be read and/or written. Furthermore, ioctls are allowed
1623 * to dereference the passed pointer, so the parameter requires deep 1722 * to dereference the passed pointer, so the parameter requires deep
@@ -1677,18 +1776,25 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
1677 struct fuse_ioctl_out outarg; 1776 struct fuse_ioctl_out outarg;
1678 struct fuse_req *req = NULL; 1777 struct fuse_req *req = NULL;
1679 struct page **pages = NULL; 1778 struct page **pages = NULL;
1680 struct page *iov_page = NULL; 1779 struct iovec *iov_page = NULL;
1681 struct iovec *in_iov = NULL, *out_iov = NULL; 1780 struct iovec *in_iov = NULL, *out_iov = NULL;
1682 unsigned int in_iovs = 0, out_iovs = 0, num_pages = 0, max_pages; 1781 unsigned int in_iovs = 0, out_iovs = 0, num_pages = 0, max_pages;
1683 size_t in_size, out_size, transferred; 1782 size_t in_size, out_size, transferred;
1684 int err; 1783 int err;
1685 1784
1785#if BITS_PER_LONG == 32
1786 inarg.flags |= FUSE_IOCTL_32BIT;
1787#else
1788 if (flags & FUSE_IOCTL_COMPAT)
1789 inarg.flags |= FUSE_IOCTL_32BIT;
1790#endif
1791
1686 /* assume all the iovs returned by client always fits in a page */ 1792 /* assume all the iovs returned by client always fits in a page */
1687 BUILD_BUG_ON(sizeof(struct iovec) * FUSE_IOCTL_MAX_IOV > PAGE_SIZE); 1793 BUILD_BUG_ON(sizeof(struct fuse_ioctl_iovec) * FUSE_IOCTL_MAX_IOV > PAGE_SIZE);
1688 1794
1689 err = -ENOMEM; 1795 err = -ENOMEM;
1690 pages = kzalloc(sizeof(pages[0]) * FUSE_MAX_PAGES_PER_REQ, GFP_KERNEL); 1796 pages = kzalloc(sizeof(pages[0]) * FUSE_MAX_PAGES_PER_REQ, GFP_KERNEL);
1691 iov_page = alloc_page(GFP_KERNEL); 1797 iov_page = (struct iovec *) __get_free_page(GFP_KERNEL);
1692 if (!pages || !iov_page) 1798 if (!pages || !iov_page)
1693 goto out; 1799 goto out;
1694 1800
@@ -1697,7 +1803,7 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
1697 * RETRY from server is not allowed. 1803 * RETRY from server is not allowed.
1698 */ 1804 */
1699 if (!(flags & FUSE_IOCTL_UNRESTRICTED)) { 1805 if (!(flags & FUSE_IOCTL_UNRESTRICTED)) {
1700 struct iovec *iov = page_address(iov_page); 1806 struct iovec *iov = iov_page;
1701 1807
1702 iov->iov_base = (void __user *)arg; 1808 iov->iov_base = (void __user *)arg;
1703 iov->iov_len = _IOC_SIZE(cmd); 1809 iov->iov_len = _IOC_SIZE(cmd);
@@ -1778,7 +1884,7 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
1778 1884
1779 /* did it ask for retry? */ 1885 /* did it ask for retry? */
1780 if (outarg.flags & FUSE_IOCTL_RETRY) { 1886 if (outarg.flags & FUSE_IOCTL_RETRY) {
1781 char *vaddr; 1887 void *vaddr;
1782 1888
1783 /* no retry if in restricted mode */ 1889 /* no retry if in restricted mode */
1784 err = -EIO; 1890 err = -EIO;
@@ -1798,18 +1904,25 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
1798 in_iovs + out_iovs > FUSE_IOCTL_MAX_IOV) 1904 in_iovs + out_iovs > FUSE_IOCTL_MAX_IOV)
1799 goto out; 1905 goto out;
1800 1906
1801 err = -EIO;
1802 if ((in_iovs + out_iovs) * sizeof(struct iovec) != transferred)
1803 goto out;
1804
1805 /* okay, copy in iovs and retry */
1806 vaddr = kmap_atomic(pages[0], KM_USER0); 1907 vaddr = kmap_atomic(pages[0], KM_USER0);
1807 memcpy(page_address(iov_page), vaddr, transferred); 1908 err = fuse_copy_ioctl_iovec(fc, iov_page, vaddr,
1909 transferred, in_iovs + out_iovs,
1910 (flags & FUSE_IOCTL_COMPAT) != 0);
1808 kunmap_atomic(vaddr, KM_USER0); 1911 kunmap_atomic(vaddr, KM_USER0);
1912 if (err)
1913 goto out;
1809 1914
1810 in_iov = page_address(iov_page); 1915 in_iov = iov_page;
1811 out_iov = in_iov + in_iovs; 1916 out_iov = in_iov + in_iovs;
1812 1917
1918 err = fuse_verify_ioctl_iov(in_iov, in_iovs);
1919 if (err)
1920 goto out;
1921
1922 err = fuse_verify_ioctl_iov(out_iov, out_iovs);
1923 if (err)
1924 goto out;
1925
1813 goto retry; 1926 goto retry;
1814 } 1927 }
1815 1928
@@ -1821,8 +1934,7 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
1821 out: 1934 out:
1822 if (req) 1935 if (req)
1823 fuse_put_request(fc, req); 1936 fuse_put_request(fc, req);
1824 if (iov_page) 1937 free_page((unsigned long) iov_page);
1825 __free_page(iov_page);
1826 while (num_pages) 1938 while (num_pages)
1827 __free_page(pages[--num_pages]); 1939 __free_page(pages[--num_pages]);
1828 kfree(pages); 1940 kfree(pages);
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 57d4a3a0f10..ae5744a2f9e 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -53,6 +53,12 @@ extern struct mutex fuse_mutex;
53extern unsigned max_user_bgreq; 53extern unsigned max_user_bgreq;
54extern unsigned max_user_congthresh; 54extern unsigned max_user_congthresh;
55 55
56/* One forget request: a single entry in a chain of pending forgets */
57struct fuse_forget_link {
/* the forget data itself; NOTE(review): presumably the nodeid/nlookup pair handed to fuse_queue_forget() - confirm */
58 struct fuse_forget_one forget_one;
/* following entry in the chain (only a forward pointer is kept) */
59 struct fuse_forget_link *next;
60};
61
56/** FUSE inode */ 62/** FUSE inode */
57struct fuse_inode { 63struct fuse_inode {
58 /** Inode data */ 64 /** Inode data */
@@ -66,7 +72,7 @@ struct fuse_inode {
66 u64 nlookup; 72 u64 nlookup;
67 73
68 /** The request used for sending the FORGET message */ 74 /** The request used for sending the FORGET message */
69 struct fuse_req *forget_req; 75 struct fuse_forget_link *forget;
70 76
71 /** Time in jiffies until the file attributes are valid */ 77 /** Time in jiffies until the file attributes are valid */
72 u64 i_time; 78 u64 i_time;
@@ -255,7 +261,6 @@ struct fuse_req {
255 261
256 /** Data for asynchronous requests */ 262 /** Data for asynchronous requests */
257 union { 263 union {
258 struct fuse_forget_in forget_in;
259 struct { 264 struct {
260 struct fuse_release_in in; 265 struct fuse_release_in in;
261 struct path path; 266 struct path path;
@@ -369,6 +374,13 @@ struct fuse_conn {
369 /** Pending interrupts */ 374 /** Pending interrupts */
370 struct list_head interrupts; 375 struct list_head interrupts;
371 376
377 /** Queue of pending forgets */
378 struct fuse_forget_link forget_list_head;
379 struct fuse_forget_link *forget_list_tail;
380
381 /** Batching of FORGET requests (positive indicates FORGET batch) */
382 int forget_batch;
383
372 /** Flag indicating if connection is blocked. This will be 384 /** Flag indicating if connection is blocked. This will be
373 the case before the INIT reply is received, and if there 385 the case before the INIT reply is received, and if there
374 are too many outstading backgrounds requests */ 386 are too many outstading backgrounds requests */
@@ -543,8 +555,10 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, struct qstr *name,
543/** 555/**
544 * Send FORGET command 556 * Send FORGET command
545 */ 557 */
546void fuse_send_forget(struct fuse_conn *fc, struct fuse_req *req, 558void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget,
547 u64 nodeid, u64 nlookup); 559 u64 nodeid, u64 nlookup);
560
561struct fuse_forget_link *fuse_alloc_forget(void);
548 562
549/** 563/**
550 * Initialize READ or READDIR request 564 * Initialize READ or READDIR request
@@ -656,11 +670,6 @@ void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req);
656void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req); 670void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req);
657 671
658/** 672/**
659 * Send a request with no reply
660 */
661void fuse_request_send_noreply(struct fuse_conn *fc, struct fuse_req *req);
662
663/**
664 * Send a request in the background 673 * Send a request in the background
665 */ 674 */
666void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req); 675void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req);
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index cfce3ad86a9..9e3f68cc1bd 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -71,6 +71,11 @@ struct fuse_mount_data {
71 unsigned blksize; 71 unsigned blksize;
72}; 72};
73 73
74struct fuse_forget_link *fuse_alloc_forget()
75{
76 return kzalloc(sizeof(struct fuse_forget_link), GFP_KERNEL);
77}
78
74static struct inode *fuse_alloc_inode(struct super_block *sb) 79static struct inode *fuse_alloc_inode(struct super_block *sb)
75{ 80{
76 struct inode *inode; 81 struct inode *inode;
@@ -90,8 +95,8 @@ static struct inode *fuse_alloc_inode(struct super_block *sb)
90 INIT_LIST_HEAD(&fi->queued_writes); 95 INIT_LIST_HEAD(&fi->queued_writes);
91 INIT_LIST_HEAD(&fi->writepages); 96 INIT_LIST_HEAD(&fi->writepages);
92 init_waitqueue_head(&fi->page_waitq); 97 init_waitqueue_head(&fi->page_waitq);
93 fi->forget_req = fuse_request_alloc(); 98 fi->forget = fuse_alloc_forget();
94 if (!fi->forget_req) { 99 if (!fi->forget) {
95 kmem_cache_free(fuse_inode_cachep, inode); 100 kmem_cache_free(fuse_inode_cachep, inode);
96 return NULL; 101 return NULL;
97 } 102 }
@@ -99,27 +104,20 @@ static struct inode *fuse_alloc_inode(struct super_block *sb)
99 return inode; 104 return inode;
100} 105}
101 106
102static void fuse_destroy_inode(struct inode *inode) 107static void fuse_i_callback(struct rcu_head *head)
103{ 108{
104 struct fuse_inode *fi = get_fuse_inode(inode); 109 struct inode *inode = container_of(head, struct inode, i_rcu);
105 BUG_ON(!list_empty(&fi->write_files)); 110 INIT_LIST_HEAD(&inode->i_dentry);
106 BUG_ON(!list_empty(&fi->queued_writes));
107 if (fi->forget_req)
108 fuse_request_free(fi->forget_req);
109 kmem_cache_free(fuse_inode_cachep, inode); 111 kmem_cache_free(fuse_inode_cachep, inode);
110} 112}
111 113
112void fuse_send_forget(struct fuse_conn *fc, struct fuse_req *req, 114static void fuse_destroy_inode(struct inode *inode)
113 u64 nodeid, u64 nlookup)
114{ 115{
115 struct fuse_forget_in *inarg = &req->misc.forget_in; 116 struct fuse_inode *fi = get_fuse_inode(inode);
116 inarg->nlookup = nlookup; 117 BUG_ON(!list_empty(&fi->write_files));
117 req->in.h.opcode = FUSE_FORGET; 118 BUG_ON(!list_empty(&fi->queued_writes));
118 req->in.h.nodeid = nodeid; 119 kfree(fi->forget);
119 req->in.numargs = 1; 120 call_rcu(&inode->i_rcu, fuse_i_callback);
120 req->in.args[0].size = sizeof(struct fuse_forget_in);
121 req->in.args[0].value = inarg;
122 fuse_request_send_noreply(fc, req);
123} 121}
124 122
125static void fuse_evict_inode(struct inode *inode) 123static void fuse_evict_inode(struct inode *inode)
@@ -129,8 +127,8 @@ static void fuse_evict_inode(struct inode *inode)
129 if (inode->i_sb->s_flags & MS_ACTIVE) { 127 if (inode->i_sb->s_flags & MS_ACTIVE) {
130 struct fuse_conn *fc = get_fuse_conn(inode); 128 struct fuse_conn *fc = get_fuse_conn(inode);
131 struct fuse_inode *fi = get_fuse_inode(inode); 129 struct fuse_inode *fi = get_fuse_inode(inode);
132 fuse_send_forget(fc, fi->forget_req, fi->nodeid, fi->nlookup); 130 fuse_queue_forget(fc, fi->forget, fi->nodeid, fi->nlookup);
133 fi->forget_req = NULL; 131 fi->forget = NULL;
134 } 132 }
135} 133}
136 134
@@ -534,6 +532,7 @@ void fuse_conn_init(struct fuse_conn *fc)
534 INIT_LIST_HEAD(&fc->interrupts); 532 INIT_LIST_HEAD(&fc->interrupts);
535 INIT_LIST_HEAD(&fc->bg_queue); 533 INIT_LIST_HEAD(&fc->bg_queue);
536 INIT_LIST_HEAD(&fc->entry); 534 INIT_LIST_HEAD(&fc->entry);
535 fc->forget_list_tail = &fc->forget_list_head;
537 atomic_set(&fc->num_waiting, 0); 536 atomic_set(&fc->num_waiting, 0);
538 fc->max_background = FUSE_DEFAULT_MAX_BACKGROUND; 537 fc->max_background = FUSE_DEFAULT_MAX_BACKGROUND;
539 fc->congestion_threshold = FUSE_DEFAULT_CONGESTION_THRESHOLD; 538 fc->congestion_threshold = FUSE_DEFAULT_CONGESTION_THRESHOLD;
@@ -618,10 +617,8 @@ static struct dentry *fuse_get_dentry(struct super_block *sb,
618 goto out_iput; 617 goto out_iput;
619 618
620 entry = d_obtain_alias(inode); 619 entry = d_obtain_alias(inode);
621 if (!IS_ERR(entry) && get_node_id(inode) != FUSE_ROOT_ID) { 620 if (!IS_ERR(entry) && get_node_id(inode) != FUSE_ROOT_ID)
622 entry->d_op = &fuse_dentry_operations;
623 fuse_invalidate_entry_cache(entry); 621 fuse_invalidate_entry_cache(entry);
624 }
625 622
626 return entry; 623 return entry;
627 624
@@ -720,10 +717,8 @@ static struct dentry *fuse_get_parent(struct dentry *child)
720 } 717 }
721 718
722 parent = d_obtain_alias(inode); 719 parent = d_obtain_alias(inode);
723 if (!IS_ERR(parent) && get_node_id(inode) != FUSE_ROOT_ID) { 720 if (!IS_ERR(parent) && get_node_id(inode) != FUSE_ROOT_ID)
724 parent->d_op = &fuse_dentry_operations;
725 fuse_invalidate_entry_cache(parent); 721 fuse_invalidate_entry_cache(parent);
726 }
727 722
728 return parent; 723 return parent;
729} 724}
@@ -990,6 +985,8 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
990 iput(root); 985 iput(root);
991 goto err_put_conn; 986 goto err_put_conn;
992 } 987 }
988 /* only now - we want root dentry with NULL ->d_op */
989 sb->s_d_op = &fuse_dentry_operations;
993 990
994 init_req = fuse_request_alloc(); 991 init_req = fuse_request_alloc();
995 if (!init_req) 992 if (!init_req)
diff --git a/fs/generic_acl.c b/fs/generic_acl.c
index 6bc9e3a5a69..06c48a89183 100644
--- a/fs/generic_acl.c
+++ b/fs/generic_acl.c
@@ -190,14 +190,20 @@ generic_acl_chmod(struct inode *inode)
190} 190}
191 191
192int 192int
193generic_check_acl(struct inode *inode, int mask) 193generic_check_acl(struct inode *inode, int mask, unsigned int flags)
194{ 194{
195 struct posix_acl *acl = get_cached_acl(inode, ACL_TYPE_ACCESS); 195 if (flags & IPERM_FLAG_RCU) {
196 196 if (!negative_cached_acl(inode, ACL_TYPE_ACCESS))
197 if (acl) { 197 return -ECHILD;
198 int error = posix_acl_permission(inode, acl, mask); 198 } else {
199 posix_acl_release(acl); 199 struct posix_acl *acl;
200 return error; 200
201 acl = get_cached_acl(inode, ACL_TYPE_ACCESS);
202 if (acl) {
203 int error = posix_acl_permission(inode, acl, mask);
204 posix_acl_release(acl);
205 return error;
206 }
201 } 207 }
202 return -EAGAIN; 208 return -EAGAIN;
203} 209}
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
index 48171f4c943..7118f1a780a 100644
--- a/fs/gfs2/acl.c
+++ b/fs/gfs2/acl.c
@@ -75,11 +75,14 @@ static struct posix_acl *gfs2_acl_get(struct gfs2_inode *ip, int type)
75 * Returns: errno 75 * Returns: errno
76 */ 76 */
77 77
78int gfs2_check_acl(struct inode *inode, int mask) 78int gfs2_check_acl(struct inode *inode, int mask, unsigned int flags)
79{ 79{
80 struct posix_acl *acl; 80 struct posix_acl *acl;
81 int error; 81 int error;
82 82
83 if (flags & IPERM_FLAG_RCU)
84 return -ECHILD;
85
83 acl = gfs2_acl_get(GFS2_I(inode), ACL_TYPE_ACCESS); 86 acl = gfs2_acl_get(GFS2_I(inode), ACL_TYPE_ACCESS);
84 if (IS_ERR(acl)) 87 if (IS_ERR(acl))
85 return PTR_ERR(acl); 88 return PTR_ERR(acl);
diff --git a/fs/gfs2/acl.h b/fs/gfs2/acl.h
index b522b0cb39e..a93907c8159 100644
--- a/fs/gfs2/acl.h
+++ b/fs/gfs2/acl.h
@@ -16,7 +16,7 @@
16#define GFS2_POSIX_ACL_DEFAULT "posix_acl_default" 16#define GFS2_POSIX_ACL_DEFAULT "posix_acl_default"
17#define GFS2_ACL_MAX_ENTRIES 25 17#define GFS2_ACL_MAX_ENTRIES 25
18 18
19extern int gfs2_check_acl(struct inode *inode, int mask); 19extern int gfs2_check_acl(struct inode *inode, int mask, unsigned int);
20extern int gfs2_acl_create(struct gfs2_inode *dip, struct inode *inode); 20extern int gfs2_acl_create(struct gfs2_inode *dip, struct inode *inode);
21extern int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr); 21extern int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr);
22extern const struct xattr_handler gfs2_xattr_system_handler; 22extern const struct xattr_handler gfs2_xattr_system_handler;
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 5476c066d4e..3c4039d5eef 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -763,7 +763,7 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
763 int metadata; 763 int metadata;
764 unsigned int revokes = 0; 764 unsigned int revokes = 0;
765 int x; 765 int x;
766 int error; 766 int error = 0;
767 767
768 if (!*top) 768 if (!*top)
769 sm->sm_first = 0; 769 sm->sm_first = 0;
@@ -780,7 +780,11 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
780 if (metadata) 780 if (metadata)
781 revokes = (height) ? sdp->sd_inptrs : sdp->sd_diptrs; 781 revokes = (height) ? sdp->sd_inptrs : sdp->sd_diptrs;
782 782
783 error = gfs2_rindex_hold(sdp, &ip->i_alloc->al_ri_gh); 783 if (ip != GFS2_I(sdp->sd_rindex))
784 error = gfs2_rindex_hold(sdp, &ip->i_alloc->al_ri_gh);
785 else if (!sdp->sd_rgrps)
786 error = gfs2_ri_update(ip);
787
784 if (error) 788 if (error)
785 return error; 789 return error;
786 790
@@ -879,7 +883,8 @@ out_rg_gunlock:
879out_rlist: 883out_rlist:
880 gfs2_rlist_free(&rlist); 884 gfs2_rlist_free(&rlist);
881out: 885out:
882 gfs2_glock_dq_uninit(&ip->i_alloc->al_ri_gh); 886 if (ip != GFS2_I(sdp->sd_rindex))
887 gfs2_glock_dq_uninit(&ip->i_alloc->al_ri_gh);
883 return error; 888 return error;
884} 889}
885 890
diff --git a/fs/gfs2/dentry.c b/fs/gfs2/dentry.c
index 6798755b385..4a456338b87 100644
--- a/fs/gfs2/dentry.c
+++ b/fs/gfs2/dentry.c
@@ -11,6 +11,7 @@
11#include <linux/completion.h> 11#include <linux/completion.h>
12#include <linux/buffer_head.h> 12#include <linux/buffer_head.h>
13#include <linux/gfs2_ondisk.h> 13#include <linux/gfs2_ondisk.h>
14#include <linux/namei.h>
14#include <linux/crc32.h> 15#include <linux/crc32.h>
15 16
16#include "gfs2.h" 17#include "gfs2.h"
@@ -34,15 +35,23 @@
34 35
35static int gfs2_drevalidate(struct dentry *dentry, struct nameidata *nd) 36static int gfs2_drevalidate(struct dentry *dentry, struct nameidata *nd)
36{ 37{
37 struct dentry *parent = dget_parent(dentry); 38 struct dentry *parent;
38 struct gfs2_sbd *sdp = GFS2_SB(parent->d_inode); 39 struct gfs2_sbd *sdp;
39 struct gfs2_inode *dip = GFS2_I(parent->d_inode); 40 struct gfs2_inode *dip;
40 struct inode *inode = dentry->d_inode; 41 struct inode *inode;
41 struct gfs2_holder d_gh; 42 struct gfs2_holder d_gh;
42 struct gfs2_inode *ip = NULL; 43 struct gfs2_inode *ip = NULL;
43 int error; 44 int error;
44 int had_lock = 0; 45 int had_lock = 0;
45 46
47 if (nd->flags & LOOKUP_RCU)
48 return -ECHILD;
49
50 parent = dget_parent(dentry);
51 sdp = GFS2_SB(parent->d_inode);
52 dip = GFS2_I(parent->d_inode);
53 inode = dentry->d_inode;
54
46 if (inode) { 55 if (inode) {
47 if (is_bad_inode(inode)) 56 if (is_bad_inode(inode))
48 goto invalid; 57 goto invalid;
@@ -100,13 +109,14 @@ fail:
100 return 0; 109 return 0;
101} 110}
102 111
103static int gfs2_dhash(struct dentry *dentry, struct qstr *str) 112static int gfs2_dhash(const struct dentry *dentry, const struct inode *inode,
113 struct qstr *str)
104{ 114{
105 str->hash = gfs2_disk_hash(str->name, str->len); 115 str->hash = gfs2_disk_hash(str->name, str->len);
106 return 0; 116 return 0;
107} 117}
108 118
109static int gfs2_dentry_delete(struct dentry *dentry) 119static int gfs2_dentry_delete(const struct dentry *dentry)
110{ 120{
111 struct gfs2_inode *ginode; 121 struct gfs2_inode *ginode;
112 122
diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c
index 06d582732d3..9023db8184f 100644
--- a/fs/gfs2/export.c
+++ b/fs/gfs2/export.c
@@ -126,22 +126,14 @@ static int gfs2_get_name(struct dentry *parent, char *name,
126 126
127static struct dentry *gfs2_get_parent(struct dentry *child) 127static struct dentry *gfs2_get_parent(struct dentry *child)
128{ 128{
129 struct dentry *dentry; 129 return d_obtain_alias(gfs2_lookupi(child->d_inode, &gfs2_qdotdot, 1));
130
131 dentry = d_obtain_alias(gfs2_lookupi(child->d_inode, &gfs2_qdotdot, 1));
132 if (!IS_ERR(dentry))
133 dentry->d_op = &gfs2_dops;
134 return dentry;
135} 130}
136 131
137static struct dentry *gfs2_get_dentry(struct super_block *sb, 132static struct dentry *gfs2_get_dentry(struct super_block *sb,
138 struct gfs2_inum_host *inum) 133 struct gfs2_inum_host *inum)
139{ 134{
140 struct gfs2_sbd *sdp = sb->s_fs_info; 135 struct gfs2_sbd *sdp = sb->s_fs_info;
141 struct gfs2_holder i_gh;
142 struct inode *inode; 136 struct inode *inode;
143 struct dentry *dentry;
144 int error;
145 137
146 inode = gfs2_ilookup(sb, inum->no_addr); 138 inode = gfs2_ilookup(sb, inum->no_addr);
147 if (inode) { 139 if (inode) {
@@ -152,52 +144,13 @@ static struct dentry *gfs2_get_dentry(struct super_block *sb,
152 goto out_inode; 144 goto out_inode;
153 } 145 }
154 146
155 error = gfs2_glock_nq_num(sdp, inum->no_addr, &gfs2_inode_glops, 147 inode = gfs2_lookup_by_inum(sdp, inum->no_addr, &inum->no_formal_ino,
156 LM_ST_SHARED, LM_FLAG_ANY, &i_gh); 148 GFS2_BLKST_DINODE);
157 if (error) 149 if (IS_ERR(inode))
158 return ERR_PTR(error); 150 return ERR_CAST(inode);
159
160 error = gfs2_check_blk_type(sdp, inum->no_addr, GFS2_BLKST_DINODE);
161 if (error)
162 goto fail;
163
164 inode = gfs2_inode_lookup(sb, DT_UNKNOWN, inum->no_addr, 0);
165 if (IS_ERR(inode)) {
166 error = PTR_ERR(inode);
167 goto fail;
168 }
169
170 error = gfs2_inode_refresh(GFS2_I(inode));
171 if (error) {
172 iput(inode);
173 goto fail;
174 }
175
176 /* Pick up the works we bypass in gfs2_inode_lookup */
177 if (inode->i_state & I_NEW)
178 gfs2_set_iop(inode);
179
180 if (GFS2_I(inode)->i_no_formal_ino != inum->no_formal_ino) {
181 iput(inode);
182 goto fail;
183 }
184
185 error = -EIO;
186 if (GFS2_I(inode)->i_diskflags & GFS2_DIF_SYSTEM) {
187 iput(inode);
188 goto fail;
189 }
190
191 gfs2_glock_dq_uninit(&i_gh);
192 151
193out_inode: 152out_inode:
194 dentry = d_obtain_alias(inode); 153 return d_obtain_alias(inode);
195 if (!IS_ERR(dentry))
196 dentry->d_op = &gfs2_dops;
197 return dentry;
198fail:
199 gfs2_glock_dq_uninit(&i_gh);
200 return ERR_PTR(error);
201} 154}
202 155
203static struct dentry *gfs2_fh_to_dentry(struct super_block *sb, struct fid *fid, 156static struct dentry *gfs2_fh_to_dentry(struct super_block *sb, struct fid *fid,
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index aa996471ec5..7cfdcb91336 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -19,6 +19,8 @@
19#include <linux/fs.h> 19#include <linux/fs.h>
20#include <linux/gfs2_ondisk.h> 20#include <linux/gfs2_ondisk.h>
21#include <linux/ext2_fs.h> 21#include <linux/ext2_fs.h>
22#include <linux/falloc.h>
23#include <linux/swap.h>
22#include <linux/crc32.h> 24#include <linux/crc32.h>
23#include <linux/writeback.h> 25#include <linux/writeback.h>
24#include <asm/uaccess.h> 26#include <asm/uaccess.h>
@@ -241,7 +243,7 @@ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask)
241 !capable(CAP_LINUX_IMMUTABLE)) 243 !capable(CAP_LINUX_IMMUTABLE))
242 goto out; 244 goto out;
243 if (!IS_IMMUTABLE(inode)) { 245 if (!IS_IMMUTABLE(inode)) {
244 error = gfs2_permission(inode, MAY_WRITE); 246 error = gfs2_permission(inode, MAY_WRITE, 0);
245 if (error) 247 if (error)
246 goto out; 248 goto out;
247 } 249 }
@@ -610,6 +612,260 @@ static ssize_t gfs2_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
610 return generic_file_aio_write(iocb, iov, nr_segs, pos); 612 return generic_file_aio_write(iocb, iov, nr_segs, pos);
611} 613}
612 614
615static void empty_write_end(struct page *page, unsigned from,
616 unsigned to)
617{
618 struct gfs2_inode *ip = GFS2_I(page->mapping->host);
619
620 page_zero_new_buffers(page, from, to);
621 flush_dcache_page(page);
622 mark_page_accessed(page);
623
624 if (!gfs2_is_writeback(ip))
625 gfs2_page_add_databufs(ip, page, from, to);
626
627 block_commit_write(page, from, to);
628}
629
630static int write_empty_blocks(struct page *page, unsigned from, unsigned to)
631{
632 unsigned start, end, next;
633 struct buffer_head *bh, *head;
634 int error;
635
636 if (!page_has_buffers(page)) {
637 error = __block_write_begin(page, from, to - from, gfs2_block_map);
638 if (unlikely(error))
639 return error;
640
641 empty_write_end(page, from, to);
642 return 0;
643 }
644
645 bh = head = page_buffers(page);
646 next = end = 0;
647 while (next < from) {
648 next += bh->b_size;
649 bh = bh->b_this_page;
650 }
651 start = next;
652 do {
653 next += bh->b_size;
654 if (buffer_mapped(bh)) {
655 if (end) {
656 error = __block_write_begin(page, start, end - start,
657 gfs2_block_map);
658 if (unlikely(error))
659 return error;
660 empty_write_end(page, start, end);
661 end = 0;
662 }
663 start = next;
664 }
665 else
666 end = next;
667 bh = bh->b_this_page;
668 } while (next < to);
669
670 if (end) {
671 error = __block_write_begin(page, start, end - start, gfs2_block_map);
672 if (unlikely(error))
673 return error;
674 empty_write_end(page, start, end);
675 }
676
677 return 0;
678}
679
680static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len,
681 int mode)
682{
683 struct gfs2_inode *ip = GFS2_I(inode);
684 struct buffer_head *dibh;
685 int error;
686 u64 start = offset >> PAGE_CACHE_SHIFT;
687 unsigned int start_offset = offset & ~PAGE_CACHE_MASK;
688 u64 end = (offset + len - 1) >> PAGE_CACHE_SHIFT;
689 pgoff_t curr;
690 struct page *page;
691 unsigned int end_offset = (offset + len) & ~PAGE_CACHE_MASK;
692 unsigned int from, to;
693
694 if (!end_offset)
695 end_offset = PAGE_CACHE_SIZE;
696
697 error = gfs2_meta_inode_buffer(ip, &dibh);
698 if (unlikely(error))
699 goto out;
700
701 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
702
703 if (gfs2_is_stuffed(ip)) {
704 error = gfs2_unstuff_dinode(ip, NULL);
705 if (unlikely(error))
706 goto out;
707 }
708
709 curr = start;
710 offset = start << PAGE_CACHE_SHIFT;
711 from = start_offset;
712 to = PAGE_CACHE_SIZE;
713 while (curr <= end) {
714 page = grab_cache_page_write_begin(inode->i_mapping, curr,
715 AOP_FLAG_NOFS);
716 if (unlikely(!page)) {
717 error = -ENOMEM;
718 goto out;
719 }
720
721 if (curr == end)
722 to = end_offset;
723 error = write_empty_blocks(page, from, to);
724 if (!error && offset + to > inode->i_size &&
725 !(mode & FALLOC_FL_KEEP_SIZE)) {
726 i_size_write(inode, offset + to);
727 }
728 unlock_page(page);
729 page_cache_release(page);
730 if (error)
731 goto out;
732 curr++;
733 offset += PAGE_CACHE_SIZE;
734 from = 0;
735 }
736
737 gfs2_dinode_out(ip, dibh->b_data);
738 mark_inode_dirty(inode);
739
740 brelse(dibh);
741
742out:
743 return error;
744}
745
746static void calc_max_reserv(struct gfs2_inode *ip, loff_t max, loff_t *len,
747 unsigned int *data_blocks, unsigned int *ind_blocks)
748{
749 const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
750 unsigned int max_blocks = ip->i_alloc->al_rgd->rd_free_clone;
751 unsigned int tmp, max_data = max_blocks - 3 * (sdp->sd_max_height - 1);
752
753 for (tmp = max_data; tmp > sdp->sd_diptrs;) {
754 tmp = DIV_ROUND_UP(tmp, sdp->sd_inptrs);
755 max_data -= tmp;
756 }
757 /* This calculation isn't the exact reverse of gfs2_write_calc_reserve,
758 so it might end up with fewer data blocks */
759 if (max_data <= *data_blocks)
760 return;
761 *data_blocks = max_data;
762 *ind_blocks = max_blocks - max_data;
763 *len = ((loff_t)max_data - 3) << sdp->sd_sb.sb_bsize_shift;
764 if (*len > max) {
765 *len = max;
766 gfs2_write_calc_reserv(ip, max, data_blocks, ind_blocks);
767 }
768}
769
770static long gfs2_fallocate(struct file *file, int mode, loff_t offset,
771 loff_t len)
772{
773 struct inode *inode = file->f_path.dentry->d_inode;
774 struct gfs2_sbd *sdp = GFS2_SB(inode);
775 struct gfs2_inode *ip = GFS2_I(inode);
776 unsigned int data_blocks = 0, ind_blocks = 0, rblocks;
777 loff_t bytes, max_bytes;
778 struct gfs2_alloc *al;
779 int error;
780 loff_t next = (offset + len - 1) >> sdp->sd_sb.sb_bsize_shift;
781 next = (next + 1) << sdp->sd_sb.sb_bsize_shift;
782
783 /* We only support the FALLOC_FL_KEEP_SIZE mode */
784 if (mode & ~FALLOC_FL_KEEP_SIZE)
785 return -EOPNOTSUPP;
786
787 offset = (offset >> sdp->sd_sb.sb_bsize_shift) <<
788 sdp->sd_sb.sb_bsize_shift;
789
790 len = next - offset;
791 bytes = sdp->sd_max_rg_data * sdp->sd_sb.sb_bsize / 2;
792 if (!bytes)
793 bytes = UINT_MAX;
794
795 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh);
796 error = gfs2_glock_nq(&ip->i_gh);
797 if (unlikely(error))
798 goto out_uninit;
799
800 if (!gfs2_write_alloc_required(ip, offset, len))
801 goto out_unlock;
802
803 while (len > 0) {
804 if (len < bytes)
805 bytes = len;
806 al = gfs2_alloc_get(ip);
807 if (!al) {
808 error = -ENOMEM;
809 goto out_unlock;
810 }
811
812 error = gfs2_quota_lock_check(ip);
813 if (error)
814 goto out_alloc_put;
815
816retry:
817 gfs2_write_calc_reserv(ip, bytes, &data_blocks, &ind_blocks);
818
819 al->al_requested = data_blocks + ind_blocks;
820 error = gfs2_inplace_reserve(ip);
821 if (error) {
822 if (error == -ENOSPC && bytes > sdp->sd_sb.sb_bsize) {
823 bytes >>= 1;
824 goto retry;
825 }
826 goto out_qunlock;
827 }
828 max_bytes = bytes;
829 calc_max_reserv(ip, len, &max_bytes, &data_blocks, &ind_blocks);
830 al->al_requested = data_blocks + ind_blocks;
831
832 rblocks = RES_DINODE + ind_blocks + RES_STATFS + RES_QUOTA +
833 RES_RG_HDR + gfs2_rg_blocks(al);
834 if (gfs2_is_jdata(ip))
835 rblocks += data_blocks ? data_blocks : 1;
836
837 error = gfs2_trans_begin(sdp, rblocks,
838 PAGE_CACHE_SIZE/sdp->sd_sb.sb_bsize);
839 if (error)
840 goto out_trans_fail;
841
842 error = fallocate_chunk(inode, offset, max_bytes, mode);
843 gfs2_trans_end(sdp);
844
845 if (error)
846 goto out_trans_fail;
847
848 len -= max_bytes;
849 offset += max_bytes;
850 gfs2_inplace_release(ip);
851 gfs2_quota_unlock(ip);
852 gfs2_alloc_put(ip);
853 }
854 goto out_unlock;
855
856out_trans_fail:
857 gfs2_inplace_release(ip);
858out_qunlock:
859 gfs2_quota_unlock(ip);
860out_alloc_put:
861 gfs2_alloc_put(ip);
862out_unlock:
863 gfs2_glock_dq(&ip->i_gh);
864out_uninit:
865 gfs2_holder_uninit(&ip->i_gh);
866 return error;
867}
868
613#ifdef CONFIG_GFS2_FS_LOCKING_DLM 869#ifdef CONFIG_GFS2_FS_LOCKING_DLM
614 870
615/** 871/**
@@ -765,6 +1021,7 @@ const struct file_operations gfs2_file_fops = {
765 .splice_read = generic_file_splice_read, 1021 .splice_read = generic_file_splice_read,
766 .splice_write = generic_file_splice_write, 1022 .splice_write = generic_file_splice_write,
767 .setlease = gfs2_setlease, 1023 .setlease = gfs2_setlease,
1024 .fallocate = gfs2_fallocate,
768}; 1025};
769 1026
770const struct file_operations gfs2_dir_fops = { 1027const struct file_operations gfs2_dir_fops = {
@@ -794,6 +1051,7 @@ const struct file_operations gfs2_file_fops_nolock = {
794 .splice_read = generic_file_splice_read, 1051 .splice_read = generic_file_splice_read,
795 .splice_write = generic_file_splice_write, 1052 .splice_write = generic_file_splice_write,
796 .setlease = generic_setlease, 1053 .setlease = generic_setlease,
1054 .fallocate = gfs2_fallocate,
797}; 1055};
798 1056
799const struct file_operations gfs2_dir_fops_nolock = { 1057const struct file_operations gfs2_dir_fops_nolock = {
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 87778857f09..08a8beb152e 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -541,21 +541,6 @@ out_locked:
541 spin_unlock(&gl->gl_spin); 541 spin_unlock(&gl->gl_spin);
542} 542}
543 543
544static unsigned int gfs2_lm_lock(struct gfs2_sbd *sdp, void *lock,
545 unsigned int req_state,
546 unsigned int flags)
547{
548 int ret = LM_OUT_ERROR;
549
550 if (!sdp->sd_lockstruct.ls_ops->lm_lock)
551 return req_state == LM_ST_UNLOCKED ? 0 : req_state;
552
553 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
554 ret = sdp->sd_lockstruct.ls_ops->lm_lock(lock,
555 req_state, flags);
556 return ret;
557}
558
559/** 544/**
560 * do_xmote - Calls the DLM to change the state of a lock 545 * do_xmote - Calls the DLM to change the state of a lock
561 * @gl: The lock state 546 * @gl: The lock state
@@ -575,13 +560,14 @@ __acquires(&gl->gl_spin)
575 560
576 lck_flags &= (LM_FLAG_TRY | LM_FLAG_TRY_1CB | LM_FLAG_NOEXP | 561 lck_flags &= (LM_FLAG_TRY | LM_FLAG_TRY_1CB | LM_FLAG_NOEXP |
577 LM_FLAG_PRIORITY); 562 LM_FLAG_PRIORITY);
578 BUG_ON(gl->gl_state == target); 563 GLOCK_BUG_ON(gl, gl->gl_state == target);
579 BUG_ON(gl->gl_state == gl->gl_target); 564 GLOCK_BUG_ON(gl, gl->gl_state == gl->gl_target);
580 if ((target == LM_ST_UNLOCKED || target == LM_ST_DEFERRED) && 565 if ((target == LM_ST_UNLOCKED || target == LM_ST_DEFERRED) &&
581 glops->go_inval) { 566 glops->go_inval) {
582 set_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags); 567 set_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags);
583 do_error(gl, 0); /* Fail queued try locks */ 568 do_error(gl, 0); /* Fail queued try locks */
584 } 569 }
570 gl->gl_req = target;
585 spin_unlock(&gl->gl_spin); 571 spin_unlock(&gl->gl_spin);
586 if (glops->go_xmote_th) 572 if (glops->go_xmote_th)
587 glops->go_xmote_th(gl); 573 glops->go_xmote_th(gl);
@@ -594,15 +580,17 @@ __acquires(&gl->gl_spin)
594 gl->gl_state == LM_ST_DEFERRED) && 580 gl->gl_state == LM_ST_DEFERRED) &&
595 !(lck_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))) 581 !(lck_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)))
596 lck_flags |= LM_FLAG_TRY_1CB; 582 lck_flags |= LM_FLAG_TRY_1CB;
597 ret = gfs2_lm_lock(sdp, gl, target, lck_flags);
598 583
599 if (!(ret & LM_OUT_ASYNC)) { 584 if (sdp->sd_lockstruct.ls_ops->lm_lock) {
600 finish_xmote(gl, ret); 585 /* lock_dlm */
586 ret = sdp->sd_lockstruct.ls_ops->lm_lock(gl, target, lck_flags);
587 GLOCK_BUG_ON(gl, ret);
588 } else { /* lock_nolock */
589 finish_xmote(gl, target);
601 if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) 590 if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
602 gfs2_glock_put(gl); 591 gfs2_glock_put(gl);
603 } else {
604 GLOCK_BUG_ON(gl, ret != LM_OUT_ASYNC);
605 } 592 }
593
606 spin_lock(&gl->gl_spin); 594 spin_lock(&gl->gl_spin);
607} 595}
608 596
@@ -686,21 +674,20 @@ static void delete_work_func(struct work_struct *work)
686{ 674{
687 struct gfs2_glock *gl = container_of(work, struct gfs2_glock, gl_delete); 675 struct gfs2_glock *gl = container_of(work, struct gfs2_glock, gl_delete);
688 struct gfs2_sbd *sdp = gl->gl_sbd; 676 struct gfs2_sbd *sdp = gl->gl_sbd;
689 struct gfs2_inode *ip = NULL; 677 struct gfs2_inode *ip;
690 struct inode *inode; 678 struct inode *inode;
691 u64 no_addr = 0; 679 u64 no_addr = gl->gl_name.ln_number;
680
681 ip = gl->gl_object;
682 /* Note: Unsafe to dereference ip as we don't hold right refs/locks */
692 683
693 spin_lock(&gl->gl_spin);
694 ip = (struct gfs2_inode *)gl->gl_object;
695 if (ip) 684 if (ip)
696 no_addr = ip->i_no_addr;
697 spin_unlock(&gl->gl_spin);
698 if (ip) {
699 inode = gfs2_ilookup(sdp->sd_vfs, no_addr); 685 inode = gfs2_ilookup(sdp->sd_vfs, no_addr);
700 if (inode) { 686 else
701 d_prune_aliases(inode); 687 inode = gfs2_lookup_by_inum(sdp, no_addr, NULL, GFS2_BLKST_UNLINKED);
702 iput(inode); 688 if (inode && !IS_ERR(inode)) {
703 } 689 d_prune_aliases(inode);
690 iput(inode);
704 } 691 }
705 gfs2_glock_put(gl); 692 gfs2_glock_put(gl);
706} 693}
@@ -952,17 +939,22 @@ int gfs2_glock_wait(struct gfs2_holder *gh)
952 939
953void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...) 940void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...)
954{ 941{
942 struct va_format vaf;
955 va_list args; 943 va_list args;
956 944
957 va_start(args, fmt); 945 va_start(args, fmt);
946
958 if (seq) { 947 if (seq) {
959 struct gfs2_glock_iter *gi = seq->private; 948 struct gfs2_glock_iter *gi = seq->private;
960 vsprintf(gi->string, fmt, args); 949 vsprintf(gi->string, fmt, args);
961 seq_printf(seq, gi->string); 950 seq_printf(seq, gi->string);
962 } else { 951 } else {
963 printk(KERN_ERR " "); 952 vaf.fmt = fmt;
964 vprintk(fmt, args); 953 vaf.va = &args;
954
955 printk(KERN_ERR " %pV", &vaf);
965 } 956 }
957
966 va_end(args); 958 va_end(args);
967} 959}
968 960
@@ -1362,24 +1354,28 @@ static int gfs2_should_freeze(const struct gfs2_glock *gl)
1362 * @gl: Pointer to the glock 1354 * @gl: Pointer to the glock
1363 * @ret: The return value from the dlm 1355 * @ret: The return value from the dlm
1364 * 1356 *
1357 * The gl_reply field is under the gl_spin lock so that it is ok
1358 * to use a bitfield shared with other glock state fields.
1365 */ 1359 */
1366 1360
1367void gfs2_glock_complete(struct gfs2_glock *gl, int ret) 1361void gfs2_glock_complete(struct gfs2_glock *gl, int ret)
1368{ 1362{
1369 struct lm_lockstruct *ls = &gl->gl_sbd->sd_lockstruct; 1363 struct lm_lockstruct *ls = &gl->gl_sbd->sd_lockstruct;
1370 1364
1365 spin_lock(&gl->gl_spin);
1371 gl->gl_reply = ret; 1366 gl->gl_reply = ret;
1372 1367
1373 if (unlikely(test_bit(DFL_BLOCK_LOCKS, &ls->ls_flags))) { 1368 if (unlikely(test_bit(DFL_BLOCK_LOCKS, &ls->ls_flags))) {
1374 spin_lock(&gl->gl_spin);
1375 if (gfs2_should_freeze(gl)) { 1369 if (gfs2_should_freeze(gl)) {
1376 set_bit(GLF_FROZEN, &gl->gl_flags); 1370 set_bit(GLF_FROZEN, &gl->gl_flags);
1377 spin_unlock(&gl->gl_spin); 1371 spin_unlock(&gl->gl_spin);
1378 return; 1372 return;
1379 } 1373 }
1380 spin_unlock(&gl->gl_spin);
1381 } 1374 }
1375
1376 spin_unlock(&gl->gl_spin);
1382 set_bit(GLF_REPLY_PENDING, &gl->gl_flags); 1377 set_bit(GLF_REPLY_PENDING, &gl->gl_flags);
1378 smp_wmb();
1383 gfs2_glock_hold(gl); 1379 gfs2_glock_hold(gl);
1384 if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) 1380 if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
1385 gfs2_glock_put(gl); 1381 gfs2_glock_put(gl);
@@ -1627,18 +1623,17 @@ static const char *hflags2str(char *buf, unsigned flags, unsigned long iflags)
1627static int dump_holder(struct seq_file *seq, const struct gfs2_holder *gh) 1623static int dump_holder(struct seq_file *seq, const struct gfs2_holder *gh)
1628{ 1624{
1629 struct task_struct *gh_owner = NULL; 1625 struct task_struct *gh_owner = NULL;
1630 char buffer[KSYM_SYMBOL_LEN];
1631 char flags_buf[32]; 1626 char flags_buf[32];
1632 1627
1633 sprint_symbol(buffer, gh->gh_ip);
1634 if (gh->gh_owner_pid) 1628 if (gh->gh_owner_pid)
1635 gh_owner = pid_task(gh->gh_owner_pid, PIDTYPE_PID); 1629 gh_owner = pid_task(gh->gh_owner_pid, PIDTYPE_PID);
1636 gfs2_print_dbg(seq, " H: s:%s f:%s e:%d p:%ld [%s] %s\n", 1630 gfs2_print_dbg(seq, " H: s:%s f:%s e:%d p:%ld [%s] %pS\n",
1637 state2str(gh->gh_state), 1631 state2str(gh->gh_state),
1638 hflags2str(flags_buf, gh->gh_flags, gh->gh_iflags), 1632 hflags2str(flags_buf, gh->gh_flags, gh->gh_iflags),
1639 gh->gh_error, 1633 gh->gh_error,
1640 gh->gh_owner_pid ? (long)pid_nr(gh->gh_owner_pid) : -1, 1634 gh->gh_owner_pid ? (long)pid_nr(gh->gh_owner_pid) : -1,
1641 gh_owner ? gh_owner->comm : "(ended)", buffer); 1635 gh_owner ? gh_owner->comm : "(ended)",
1636 (void *)gh->gh_ip);
1642 return 0; 1637 return 0;
1643} 1638}
1644 1639
@@ -1783,12 +1778,13 @@ int __init gfs2_glock_init(void)
1783 } 1778 }
1784#endif 1779#endif
1785 1780
1786 glock_workqueue = alloc_workqueue("glock_workqueue", WQ_RESCUER | 1781 glock_workqueue = alloc_workqueue("glock_workqueue", WQ_MEM_RECLAIM |
1787 WQ_HIGHPRI | WQ_FREEZEABLE, 0); 1782 WQ_HIGHPRI | WQ_FREEZEABLE, 0);
1788 if (IS_ERR(glock_workqueue)) 1783 if (IS_ERR(glock_workqueue))
1789 return PTR_ERR(glock_workqueue); 1784 return PTR_ERR(glock_workqueue);
1790 gfs2_delete_workqueue = alloc_workqueue("delete_workqueue", WQ_RESCUER | 1785 gfs2_delete_workqueue = alloc_workqueue("delete_workqueue",
1791 WQ_FREEZEABLE, 0); 1786 WQ_MEM_RECLAIM | WQ_FREEZEABLE,
1787 0);
1792 if (IS_ERR(gfs2_delete_workqueue)) { 1788 if (IS_ERR(gfs2_delete_workqueue)) {
1793 destroy_workqueue(glock_workqueue); 1789 destroy_workqueue(glock_workqueue);
1794 return PTR_ERR(gfs2_delete_workqueue); 1790 return PTR_ERR(gfs2_delete_workqueue);
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index db1c26d6d22..691851ceb61 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -87,11 +87,10 @@ enum {
87#define GL_ASYNC 0x00000040 87#define GL_ASYNC 0x00000040
88#define GL_EXACT 0x00000080 88#define GL_EXACT 0x00000080
89#define GL_SKIP 0x00000100 89#define GL_SKIP 0x00000100
90#define GL_ATIME 0x00000200
91#define GL_NOCACHE 0x00000400 90#define GL_NOCACHE 0x00000400
92 91
93/* 92/*
94 * lm_lock() and lm_async_cb return flags 93 * lm_async_cb return flags
95 * 94 *
96 * LM_OUT_ST_MASK 95 * LM_OUT_ST_MASK
97 * Masks the lower two bits of lock state in the returned value. 96 * Masks the lower two bits of lock state in the returned value.
@@ -99,15 +98,11 @@ enum {
99 * LM_OUT_CANCELED 98 * LM_OUT_CANCELED
100 * The lock request was canceled. 99 * The lock request was canceled.
101 * 100 *
102 * LM_OUT_ASYNC
103 * The result of the request will be returned in an LM_CB_ASYNC callback.
104 *
105 */ 101 */
106 102
107#define LM_OUT_ST_MASK 0x00000003 103#define LM_OUT_ST_MASK 0x00000003
108#define LM_OUT_CANCELED 0x00000008 104#define LM_OUT_CANCELED 0x00000008
109#define LM_OUT_ASYNC 0x00000080 105#define LM_OUT_ERROR 0x00000004
110#define LM_OUT_ERROR 0x00000100
111 106
112/* 107/*
113 * lm_recovery_done() messages 108 * lm_recovery_done() messages
@@ -124,25 +119,12 @@ struct lm_lockops {
124 void (*lm_unmount) (struct gfs2_sbd *sdp); 119 void (*lm_unmount) (struct gfs2_sbd *sdp);
125 void (*lm_withdraw) (struct gfs2_sbd *sdp); 120 void (*lm_withdraw) (struct gfs2_sbd *sdp);
126 void (*lm_put_lock) (struct kmem_cache *cachep, struct gfs2_glock *gl); 121 void (*lm_put_lock) (struct kmem_cache *cachep, struct gfs2_glock *gl);
127 unsigned int (*lm_lock) (struct gfs2_glock *gl, 122 int (*lm_lock) (struct gfs2_glock *gl, unsigned int req_state,
128 unsigned int req_state, unsigned int flags); 123 unsigned int flags);
129 void (*lm_cancel) (struct gfs2_glock *gl); 124 void (*lm_cancel) (struct gfs2_glock *gl);
130 const match_table_t *lm_tokens; 125 const match_table_t *lm_tokens;
131}; 126};
132 127
133#define LM_FLAG_TRY 0x00000001
134#define LM_FLAG_TRY_1CB 0x00000002
135#define LM_FLAG_NOEXP 0x00000004
136#define LM_FLAG_ANY 0x00000008
137#define LM_FLAG_PRIORITY 0x00000010
138
139#define GL_ASYNC 0x00000040
140#define GL_EXACT 0x00000080
141#define GL_SKIP 0x00000100
142#define GL_NOCACHE 0x00000400
143
144#define GLR_TRYFAILED 13
145
146extern struct workqueue_struct *gfs2_delete_workqueue; 128extern struct workqueue_struct *gfs2_delete_workqueue;
147static inline struct gfs2_holder *gfs2_glock_is_locked_by_me(struct gfs2_glock *gl) 129static inline struct gfs2_holder *gfs2_glock_is_locked_by_me(struct gfs2_glock *gl)
148{ 130{
@@ -212,6 +194,8 @@ int gfs2_glock_nq_num(struct gfs2_sbd *sdp,
212int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs); 194int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs);
213void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs); 195void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs);
214void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs); 196void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs);
197
198__attribute__ ((format(printf, 2, 3)))
215void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...); 199void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...);
216 200
217/** 201/**
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 0d149dcc04e..263561bf1a5 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -325,7 +325,6 @@ static void trans_go_sync(struct gfs2_glock *gl)
325 325
326 if (gl->gl_state != LM_ST_UNLOCKED && 326 if (gl->gl_state != LM_ST_UNLOCKED &&
327 test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) { 327 test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) {
328 flush_workqueue(gfs2_delete_workqueue);
329 gfs2_meta_syncfs(sdp); 328 gfs2_meta_syncfs(sdp);
330 gfs2_log_shutdown(sdp); 329 gfs2_log_shutdown(sdp);
331 } 330 }
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 764fbb49efc..a79790c0627 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -11,6 +11,7 @@
11#define __INCORE_DOT_H__ 11#define __INCORE_DOT_H__
12 12
13#include <linux/fs.h> 13#include <linux/fs.h>
14#include <linux/kobject.h>
14#include <linux/workqueue.h> 15#include <linux/workqueue.h>
15#include <linux/dlm.h> 16#include <linux/dlm.h>
16#include <linux/buffer_head.h> 17#include <linux/buffer_head.h>
@@ -207,12 +208,14 @@ struct gfs2_glock {
207 208
208 spinlock_t gl_spin; 209 spinlock_t gl_spin;
209 210
210 unsigned int gl_state; 211 /* State fields protected by gl_spin */
211 unsigned int gl_target; 212 unsigned int gl_state:2, /* Current state */
212 unsigned int gl_reply; 213 gl_target:2, /* Target state */
214 gl_demote_state:2, /* State requested by remote node */
215 gl_req:2, /* State in last dlm request */
216 gl_reply:8; /* Last reply from the dlm */
217
213 unsigned int gl_hash; 218 unsigned int gl_hash;
214 unsigned int gl_req;
215 unsigned int gl_demote_state; /* state requested by remote node */
216 unsigned long gl_demote_time; /* time of first demote request */ 219 unsigned long gl_demote_time; /* time of first demote request */
217 struct list_head gl_holders; 220 struct list_head gl_holders;
218 221
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 06370f8bd8c..7aa7d4f8984 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -73,60 +73,15 @@ static struct inode *gfs2_iget(struct super_block *sb, u64 no_addr)
73 return iget5_locked(sb, hash, iget_test, iget_set, &no_addr); 73 return iget5_locked(sb, hash, iget_test, iget_set, &no_addr);
74} 74}
75 75
76struct gfs2_skip_data {
77 u64 no_addr;
78 int skipped;
79};
80
81static int iget_skip_test(struct inode *inode, void *opaque)
82{
83 struct gfs2_inode *ip = GFS2_I(inode);
84 struct gfs2_skip_data *data = opaque;
85
86 if (ip->i_no_addr == data->no_addr) {
87 if (inode->i_state & (I_FREEING|I_WILL_FREE)){
88 data->skipped = 1;
89 return 0;
90 }
91 return 1;
92 }
93 return 0;
94}
95
96static int iget_skip_set(struct inode *inode, void *opaque)
97{
98 struct gfs2_inode *ip = GFS2_I(inode);
99 struct gfs2_skip_data *data = opaque;
100
101 if (data->skipped)
102 return 1;
103 inode->i_ino = (unsigned long)(data->no_addr);
104 ip->i_no_addr = data->no_addr;
105 return 0;
106}
107
108static struct inode *gfs2_iget_skip(struct super_block *sb,
109 u64 no_addr)
110{
111 struct gfs2_skip_data data;
112 unsigned long hash = (unsigned long)no_addr;
113
114 data.no_addr = no_addr;
115 data.skipped = 0;
116 return iget5_locked(sb, hash, iget_skip_test, iget_skip_set, &data);
117}
118
119/** 76/**
120 * GFS2 lookup code fills in vfs inode contents based on info obtained 77 * gfs2_set_iop - Sets inode operations
121 * from directory entry inside gfs2_inode_lookup(). This has caused issues 78 * @inode: The inode with correct i_mode filled in
122 * with NFS code path since its get_dentry routine doesn't have the relevant
123 * directory entry when gfs2_inode_lookup() is invoked. Part of the code
124 * segment inside gfs2_inode_lookup code needs to get moved around.
125 * 79 *
126 * Clears I_NEW as well. 80 * GFS2 lookup code fills in vfs inode contents based on info obtained
127 **/ 81 * from directory entry inside gfs2_inode_lookup().
82 */
128 83
129void gfs2_set_iop(struct inode *inode) 84static void gfs2_set_iop(struct inode *inode)
130{ 85{
131 struct gfs2_sbd *sdp = GFS2_SB(inode); 86 struct gfs2_sbd *sdp = GFS2_SB(inode);
132 umode_t mode = inode->i_mode; 87 umode_t mode = inode->i_mode;
@@ -149,8 +104,6 @@ void gfs2_set_iop(struct inode *inode)
149 inode->i_op = &gfs2_file_iops; 104 inode->i_op = &gfs2_file_iops;
150 init_special_inode(inode, inode->i_mode, inode->i_rdev); 105 init_special_inode(inode, inode->i_mode, inode->i_rdev);
151 } 106 }
152
153 unlock_new_inode(inode);
154} 107}
155 108
156/** 109/**
@@ -162,10 +115,8 @@ void gfs2_set_iop(struct inode *inode)
162 * Returns: A VFS inode, or an error 115 * Returns: A VFS inode, or an error
163 */ 116 */
164 117
165struct inode *gfs2_inode_lookup(struct super_block *sb, 118struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type,
166 unsigned int type, 119 u64 no_addr, u64 no_formal_ino)
167 u64 no_addr,
168 u64 no_formal_ino)
169{ 120{
170 struct inode *inode; 121 struct inode *inode;
171 struct gfs2_inode *ip; 122 struct gfs2_inode *ip;
@@ -195,141 +146,80 @@ struct inode *gfs2_inode_lookup(struct super_block *sb,
195 error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, GL_EXACT, &ip->i_iopen_gh); 146 error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, GL_EXACT, &ip->i_iopen_gh);
196 if (unlikely(error)) 147 if (unlikely(error))
197 goto fail_iopen; 148 goto fail_iopen;
198 ip->i_iopen_gh.gh_gl->gl_object = ip;
199 149
150 ip->i_iopen_gh.gh_gl->gl_object = ip;
200 gfs2_glock_put(io_gl); 151 gfs2_glock_put(io_gl);
201 io_gl = NULL; 152 io_gl = NULL;
202 153
203 if ((type == DT_UNKNOWN) && (no_formal_ino == 0))
204 goto gfs2_nfsbypass;
205
206 inode->i_mode = DT2IF(type);
207
208 /*
209 * We must read the inode in order to work out its type in
210 * this case. Note that this doesn't happen often as we normally
211 * know the type beforehand. This code path only occurs during
212 * unlinked inode recovery (where it is safe to do this glock,
213 * which is not true in the general case).
214 */
215 if (type == DT_UNKNOWN) { 154 if (type == DT_UNKNOWN) {
216 struct gfs2_holder gh; 155 /* Inode glock must be locked already */
217 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); 156 error = gfs2_inode_refresh(GFS2_I(inode));
218 if (unlikely(error)) 157 if (error)
219 goto fail_glock; 158 goto fail_refresh;
220 /* Inode is now uptodate */ 159 } else {
221 gfs2_glock_dq_uninit(&gh); 160 inode->i_mode = DT2IF(type);
222 } 161 }
223 162
224 gfs2_set_iop(inode); 163 gfs2_set_iop(inode);
164 unlock_new_inode(inode);
225 } 165 }
226 166
227gfs2_nfsbypass:
228 return inode; 167 return inode;
229fail_glock: 168
230 gfs2_glock_dq(&ip->i_iopen_gh); 169fail_refresh:
170 ip->i_iopen_gh.gh_gl->gl_object = NULL;
171 gfs2_glock_dq_uninit(&ip->i_iopen_gh);
231fail_iopen: 172fail_iopen:
232 if (io_gl) 173 if (io_gl)
233 gfs2_glock_put(io_gl); 174 gfs2_glock_put(io_gl);
234fail_put: 175fail_put:
235 if (inode->i_state & I_NEW) 176 ip->i_gl->gl_object = NULL;
236 ip->i_gl->gl_object = NULL;
237 gfs2_glock_put(ip->i_gl); 177 gfs2_glock_put(ip->i_gl);
238fail: 178fail:
239 if (inode->i_state & I_NEW) 179 iget_failed(inode);
240 iget_failed(inode);
241 else
242 iput(inode);
243 return ERR_PTR(error); 180 return ERR_PTR(error);
244} 181}
245 182
246/** 183struct inode *gfs2_lookup_by_inum(struct gfs2_sbd *sdp, u64 no_addr,
247 * gfs2_process_unlinked_inode - Lookup an unlinked inode for reclamation 184 u64 *no_formal_ino, unsigned int blktype)
248 * and try to reclaim it by doing iput.
249 *
250 * This function assumes no rgrp locks are currently held.
251 *
252 * @sb: The super block
253 * no_addr: The inode number
254 *
255 */
256
257void gfs2_process_unlinked_inode(struct super_block *sb, u64 no_addr)
258{ 185{
259 struct gfs2_sbd *sdp; 186 struct super_block *sb = sdp->sd_vfs;
260 struct gfs2_inode *ip; 187 struct gfs2_holder i_gh;
261 struct gfs2_glock *io_gl = NULL;
262 int error;
263 struct gfs2_holder gh;
264 struct inode *inode; 188 struct inode *inode;
189 int error;
265 190
266 inode = gfs2_iget_skip(sb, no_addr); 191 error = gfs2_glock_nq_num(sdp, no_addr, &gfs2_inode_glops,
267 192 LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
268 if (!inode) 193 if (error)
269 return; 194 return ERR_PTR(error);
270
271 /* If it's not a new inode, someone's using it, so leave it alone. */
272 if (!(inode->i_state & I_NEW)) {
273 iput(inode);
274 return;
275 }
276
277 ip = GFS2_I(inode);
278 sdp = GFS2_SB(inode);
279 ip->i_no_formal_ino = -1;
280 195
281 error = gfs2_glock_get(sdp, no_addr, &gfs2_inode_glops, CREATE, &ip->i_gl); 196 error = gfs2_check_blk_type(sdp, no_addr, blktype);
282 if (unlikely(error)) 197 if (error)
283 goto fail; 198 goto fail;
284 ip->i_gl->gl_object = ip;
285
286 error = gfs2_glock_get(sdp, no_addr, &gfs2_iopen_glops, CREATE, &io_gl);
287 if (unlikely(error))
288 goto fail_put;
289
290 set_bit(GIF_INVALID, &ip->i_flags);
291 error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, LM_FLAG_TRY | GL_EXACT,
292 &ip->i_iopen_gh);
293 if (unlikely(error))
294 goto fail_iopen;
295
296 ip->i_iopen_gh.gh_gl->gl_object = ip;
297 gfs2_glock_put(io_gl);
298 io_gl = NULL;
299 199
300 inode->i_mode = DT2IF(DT_UNKNOWN); 200 inode = gfs2_inode_lookup(sb, DT_UNKNOWN, no_addr, 0);
201 if (IS_ERR(inode))
202 goto fail;
301 203
302 /* 204 /* Two extra checks for NFS only */
303 * We must read the inode in order to work out its type in 205 if (no_formal_ino) {
304 * this case. Note that this doesn't happen often as we normally 206 error = -ESTALE;
305 * know the type beforehand. This code path only occurs during 207 if (GFS2_I(inode)->i_no_formal_ino != *no_formal_ino)
306 * unlinked inode recovery (where it is safe to do this glock, 208 goto fail_iput;
307 * which is not true in the general case).
308 */
309 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, LM_FLAG_TRY,
310 &gh);
311 if (unlikely(error))
312 goto fail_glock;
313 209
314 /* Inode is now uptodate */ 210 error = -EIO;
315 gfs2_glock_dq_uninit(&gh); 211 if (GFS2_I(inode)->i_diskflags & GFS2_DIF_SYSTEM)
316 gfs2_set_iop(inode); 212 goto fail_iput;
317 213
318 /* The iput will cause it to be deleted. */ 214 error = 0;
319 iput(inode); 215 }
320 return;
321 216
322fail_glock:
323 gfs2_glock_dq(&ip->i_iopen_gh);
324fail_iopen:
325 if (io_gl)
326 gfs2_glock_put(io_gl);
327fail_put:
328 ip->i_gl->gl_object = NULL;
329 gfs2_glock_put(ip->i_gl);
330fail: 217fail:
331 iget_failed(inode); 218 gfs2_glock_dq_uninit(&i_gh);
332 return; 219 return error ? ERR_PTR(error) : inode;
220fail_iput:
221 iput(inode);
222 goto fail;
333} 223}
334 224
335static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf) 225static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
@@ -591,7 +481,7 @@ struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name,
591 } 481 }
592 482
593 if (!is_root) { 483 if (!is_root) {
594 error = gfs2_permission(dir, MAY_EXEC); 484 error = gfs2_permission(dir, MAY_EXEC, 0);
595 if (error) 485 if (error)
596 goto out; 486 goto out;
597 } 487 }
@@ -621,7 +511,7 @@ static int create_ok(struct gfs2_inode *dip, const struct qstr *name,
621{ 511{
622 int error; 512 int error;
623 513
624 error = gfs2_permission(&dip->i_inode, MAY_WRITE | MAY_EXEC); 514 error = gfs2_permission(&dip->i_inode, MAY_WRITE | MAY_EXEC, 0);
625 if (error) 515 if (error)
626 return error; 516 return error;
627 517
@@ -998,17 +888,8 @@ static int __gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr)
998 if (error) 888 if (error)
999 return error; 889 return error;
1000 890
1001 if ((attr->ia_valid & ATTR_SIZE) &&
1002 attr->ia_size != i_size_read(inode)) {
1003 error = vmtruncate(inode, attr->ia_size);
1004 if (error)
1005 return error;
1006 }
1007
1008 setattr_copy(inode, attr); 891 setattr_copy(inode, attr);
1009 mark_inode_dirty(inode); 892 mark_inode_dirty(inode);
1010
1011 gfs2_assert_warn(GFS2_SB(inode), !error);
1012 gfs2_trans_add_bh(ip->i_gl, dibh, 1); 893 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1013 gfs2_dinode_out(ip, dibh->b_data); 894 gfs2_dinode_out(ip, dibh->b_data);
1014 brelse(dibh); 895 brelse(dibh);
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index 6720d7d5fbc..3e00a66e7cb 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -96,10 +96,11 @@ err:
96 return -EIO; 96 return -EIO;
97} 97}
98 98
99extern void gfs2_set_iop(struct inode *inode);
100extern struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type, 99extern struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type,
101 u64 no_addr, u64 no_formal_ino); 100 u64 no_addr, u64 no_formal_ino);
102extern void gfs2_process_unlinked_inode(struct super_block *sb, u64 no_addr); 101extern struct inode *gfs2_lookup_by_inum(struct gfs2_sbd *sdp, u64 no_addr,
102 u64 *no_formal_ino,
103 unsigned int blktype);
103extern struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr); 104extern struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr);
104 105
105extern int gfs2_inode_refresh(struct gfs2_inode *ip); 106extern int gfs2_inode_refresh(struct gfs2_inode *ip);
@@ -111,7 +112,7 @@ extern struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name,
111extern struct inode *gfs2_createi(struct gfs2_holder *ghs, 112extern struct inode *gfs2_createi(struct gfs2_holder *ghs,
112 const struct qstr *name, 113 const struct qstr *name,
113 unsigned int mode, dev_t dev); 114 unsigned int mode, dev_t dev);
114extern int gfs2_permission(struct inode *inode, int mask); 115extern int gfs2_permission(struct inode *inode, int mask, unsigned int flags);
115extern int gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr); 116extern int gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr);
116extern struct inode *gfs2_lookup_simple(struct inode *dip, const char *name); 117extern struct inode *gfs2_lookup_simple(struct inode *dip, const char *name);
117extern void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf); 118extern void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf);
diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c
index 1c09425b45f..6e493aee28f 100644
--- a/fs/gfs2/lock_dlm.c
+++ b/fs/gfs2/lock_dlm.c
@@ -146,15 +146,13 @@ static u32 make_flags(const u32 lkid, const unsigned int gfs_flags,
146 return lkf; 146 return lkf;
147} 147}
148 148
149static unsigned int gdlm_lock(struct gfs2_glock *gl, 149static int gdlm_lock(struct gfs2_glock *gl, unsigned int req_state,
150 unsigned int req_state, unsigned int flags) 150 unsigned int flags)
151{ 151{
152 struct lm_lockstruct *ls = &gl->gl_sbd->sd_lockstruct; 152 struct lm_lockstruct *ls = &gl->gl_sbd->sd_lockstruct;
153 int error;
154 int req; 153 int req;
155 u32 lkf; 154 u32 lkf;
156 155
157 gl->gl_req = req_state;
158 req = make_mode(req_state); 156 req = make_mode(req_state);
159 lkf = make_flags(gl->gl_lksb.sb_lkid, flags, req); 157 lkf = make_flags(gl->gl_lksb.sb_lkid, flags, req);
160 158
@@ -162,13 +160,8 @@ static unsigned int gdlm_lock(struct gfs2_glock *gl,
162 * Submit the actual lock request. 160 * Submit the actual lock request.
163 */ 161 */
164 162
165 error = dlm_lock(ls->ls_dlm, req, &gl->gl_lksb, lkf, gl->gl_strname, 163 return dlm_lock(ls->ls_dlm, req, &gl->gl_lksb, lkf, gl->gl_strname,
166 GDLM_STRNAME_BYTES - 1, 0, gdlm_ast, gl, gdlm_bast); 164 GDLM_STRNAME_BYTES - 1, 0, gdlm_ast, gl, gdlm_bast);
167 if (error == -EAGAIN)
168 return 0;
169 if (error)
170 return LM_OUT_ERROR;
171 return LM_OUT_ASYNC;
172} 165}
173 166
174static void gdlm_put_lock(struct kmem_cache *cachep, struct gfs2_glock *gl) 167static void gdlm_put_lock(struct kmem_cache *cachep, struct gfs2_glock *gl)
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 3eb1393f7b8..777927ce6f7 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -440,7 +440,6 @@ static int gfs2_lookup_root(struct super_block *sb, struct dentry **dptr,
440 iput(inode); 440 iput(inode);
441 return -ENOMEM; 441 return -ENOMEM;
442 } 442 }
443 dentry->d_op = &gfs2_dops;
444 *dptr = dentry; 443 *dptr = dentry;
445 return 0; 444 return 0;
446} 445}
@@ -1106,6 +1105,7 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent
1106 1105
1107 sb->s_magic = GFS2_MAGIC; 1106 sb->s_magic = GFS2_MAGIC;
1108 sb->s_op = &gfs2_super_ops; 1107 sb->s_op = &gfs2_super_ops;
1108 sb->s_d_op = &gfs2_dops;
1109 sb->s_export_op = &gfs2_export_ops; 1109 sb->s_export_op = &gfs2_export_ops;
1110 sb->s_xattr = gfs2_xattr_handlers; 1110 sb->s_xattr = gfs2_xattr_handlers;
1111 sb->s_qcop = &gfs2_quotactl_ops; 1111 sb->s_qcop = &gfs2_quotactl_ops;
@@ -1268,7 +1268,7 @@ static struct dentry *gfs2_mount(struct file_system_type *fs_type, int flags,
1268{ 1268{
1269 struct block_device *bdev; 1269 struct block_device *bdev;
1270 struct super_block *s; 1270 struct super_block *s;
1271 fmode_t mode = FMODE_READ; 1271 fmode_t mode = FMODE_READ | FMODE_EXCL;
1272 int error; 1272 int error;
1273 struct gfs2_args args; 1273 struct gfs2_args args;
1274 struct gfs2_sbd *sdp; 1274 struct gfs2_sbd *sdp;
@@ -1276,7 +1276,7 @@ static struct dentry *gfs2_mount(struct file_system_type *fs_type, int flags,
1276 if (!(flags & MS_RDONLY)) 1276 if (!(flags & MS_RDONLY))
1277 mode |= FMODE_WRITE; 1277 mode |= FMODE_WRITE;
1278 1278
1279 bdev = open_bdev_exclusive(dev_name, mode, fs_type); 1279 bdev = blkdev_get_by_path(dev_name, mode, fs_type);
1280 if (IS_ERR(bdev)) 1280 if (IS_ERR(bdev))
1281 return ERR_CAST(bdev); 1281 return ERR_CAST(bdev);
1282 1282
@@ -1298,7 +1298,7 @@ static struct dentry *gfs2_mount(struct file_system_type *fs_type, int flags,
1298 goto error_bdev; 1298 goto error_bdev;
1299 1299
1300 if (s->s_root) 1300 if (s->s_root)
1301 close_bdev_exclusive(bdev, mode); 1301 blkdev_put(bdev, mode);
1302 1302
1303 memset(&args, 0, sizeof(args)); 1303 memset(&args, 0, sizeof(args));
1304 args.ar_quota = GFS2_QUOTA_DEFAULT; 1304 args.ar_quota = GFS2_QUOTA_DEFAULT;
@@ -1342,7 +1342,7 @@ error_super:
1342 deactivate_locked_super(s); 1342 deactivate_locked_super(s);
1343 return ERR_PTR(error); 1343 return ERR_PTR(error);
1344error_bdev: 1344error_bdev:
1345 close_bdev_exclusive(bdev, mode); 1345 blkdev_put(bdev, mode);
1346 return ERR_PTR(error); 1346 return ERR_PTR(error);
1347} 1347}
1348 1348
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index 12cbea7502c..d8b26ac2e20 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -18,8 +18,6 @@
18#include <linux/gfs2_ondisk.h> 18#include <linux/gfs2_ondisk.h>
19#include <linux/crc32.h> 19#include <linux/crc32.h>
20#include <linux/fiemap.h> 20#include <linux/fiemap.h>
21#include <linux/swap.h>
22#include <linux/falloc.h>
23#include <asm/uaccess.h> 21#include <asm/uaccess.h>
24 22
25#include "gfs2.h" 23#include "gfs2.h"
@@ -106,8 +104,6 @@ static struct dentry *gfs2_lookup(struct inode *dir, struct dentry *dentry,
106{ 104{
107 struct inode *inode = NULL; 105 struct inode *inode = NULL;
108 106
109 dentry->d_op = &gfs2_dops;
110
111 inode = gfs2_lookupi(dir, &dentry->d_name, 0); 107 inode = gfs2_lookupi(dir, &dentry->d_name, 0);
112 if (inode && IS_ERR(inode)) 108 if (inode && IS_ERR(inode))
113 return ERR_CAST(inode); 109 return ERR_CAST(inode);
@@ -166,7 +162,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
166 if (error) 162 if (error)
167 goto out_child; 163 goto out_child;
168 164
169 error = gfs2_permission(dir, MAY_WRITE | MAY_EXEC); 165 error = gfs2_permission(dir, MAY_WRITE | MAY_EXEC, 0);
170 if (error) 166 if (error)
171 goto out_gunlock; 167 goto out_gunlock;
172 168
@@ -289,7 +285,7 @@ static int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name,
289 if (IS_APPEND(&dip->i_inode)) 285 if (IS_APPEND(&dip->i_inode))
290 return -EPERM; 286 return -EPERM;
291 287
292 error = gfs2_permission(&dip->i_inode, MAY_WRITE | MAY_EXEC); 288 error = gfs2_permission(&dip->i_inode, MAY_WRITE | MAY_EXEC, 0);
293 if (error) 289 if (error)
294 return error; 290 return error;
295 291
@@ -822,7 +818,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
822 } 818 }
823 } 819 }
824 } else { 820 } else {
825 error = gfs2_permission(ndir, MAY_WRITE | MAY_EXEC); 821 error = gfs2_permission(ndir, MAY_WRITE | MAY_EXEC, 0);
826 if (error) 822 if (error)
827 goto out_gunlock; 823 goto out_gunlock;
828 824
@@ -857,7 +853,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
857 /* Check out the dir to be renamed */ 853 /* Check out the dir to be renamed */
858 854
859 if (dir_rename) { 855 if (dir_rename) {
860 error = gfs2_permission(odentry->d_inode, MAY_WRITE); 856 error = gfs2_permission(odentry->d_inode, MAY_WRITE, 0);
861 if (error) 857 if (error)
862 goto out_gunlock; 858 goto out_gunlock;
863 } 859 }
@@ -1041,13 +1037,17 @@ static void gfs2_put_link(struct dentry *dentry, struct nameidata *nd, void *p)
1041 * Returns: errno 1037 * Returns: errno
1042 */ 1038 */
1043 1039
1044int gfs2_permission(struct inode *inode, int mask) 1040int gfs2_permission(struct inode *inode, int mask, unsigned int flags)
1045{ 1041{
1046 struct gfs2_inode *ip = GFS2_I(inode); 1042 struct gfs2_inode *ip;
1047 struct gfs2_holder i_gh; 1043 struct gfs2_holder i_gh;
1048 int error; 1044 int error;
1049 int unlock = 0; 1045 int unlock = 0;
1050 1046
1047 if (flags & IPERM_FLAG_RCU)
1048 return -ECHILD;
1049
1050 ip = GFS2_I(inode);
1051 if (gfs2_glock_is_locked_by_me(ip->i_gl) == NULL) { 1051 if (gfs2_glock_is_locked_by_me(ip->i_gl) == NULL) {
1052 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh); 1052 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
1053 if (error) 1053 if (error)
@@ -1058,7 +1058,7 @@ int gfs2_permission(struct inode *inode, int mask)
1058 if ((mask & MAY_WRITE) && IS_IMMUTABLE(inode)) 1058 if ((mask & MAY_WRITE) && IS_IMMUTABLE(inode))
1059 error = -EACCES; 1059 error = -EACCES;
1060 else 1060 else
1061 error = generic_permission(inode, mask, gfs2_check_acl); 1061 error = generic_permission(inode, mask, flags, gfs2_check_acl);
1062 if (unlock) 1062 if (unlock)
1063 gfs2_glock_dq_uninit(&i_gh); 1063 gfs2_glock_dq_uninit(&i_gh);
1064 1064
@@ -1069,7 +1069,6 @@ static int setattr_chown(struct inode *inode, struct iattr *attr)
1069{ 1069{
1070 struct gfs2_inode *ip = GFS2_I(inode); 1070 struct gfs2_inode *ip = GFS2_I(inode);
1071 struct gfs2_sbd *sdp = GFS2_SB(inode); 1071 struct gfs2_sbd *sdp = GFS2_SB(inode);
1072 struct buffer_head *dibh;
1073 u32 ouid, ogid, nuid, ngid; 1072 u32 ouid, ogid, nuid, ngid;
1074 int error; 1073 int error;
1075 1074
@@ -1100,25 +1099,10 @@ static int setattr_chown(struct inode *inode, struct iattr *attr)
1100 if (error) 1099 if (error)
1101 goto out_gunlock_q; 1100 goto out_gunlock_q;
1102 1101
1103 error = gfs2_meta_inode_buffer(ip, &dibh); 1102 error = gfs2_setattr_simple(ip, attr);
1104 if (error) 1103 if (error)
1105 goto out_end_trans; 1104 goto out_end_trans;
1106 1105
1107 if ((attr->ia_valid & ATTR_SIZE) &&
1108 attr->ia_size != i_size_read(inode)) {
1109 int error;
1110
1111 error = vmtruncate(inode, attr->ia_size);
1112 gfs2_assert_warn(sdp, !error);
1113 }
1114
1115 setattr_copy(inode, attr);
1116 mark_inode_dirty(inode);
1117
1118 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1119 gfs2_dinode_out(ip, dibh->b_data);
1120 brelse(dibh);
1121
1122 if (ouid != NO_QUOTA_CHANGE || ogid != NO_QUOTA_CHANGE) { 1106 if (ouid != NO_QUOTA_CHANGE || ogid != NO_QUOTA_CHANGE) {
1123 u64 blocks = gfs2_get_inode_blocks(&ip->i_inode); 1107 u64 blocks = gfs2_get_inode_blocks(&ip->i_inode);
1124 gfs2_quota_change(ip, -blocks, ouid, ogid); 1108 gfs2_quota_change(ip, -blocks, ouid, ogid);
@@ -1271,257 +1255,6 @@ static int gfs2_removexattr(struct dentry *dentry, const char *name)
1271 return ret; 1255 return ret;
1272} 1256}
1273 1257
1274static void empty_write_end(struct page *page, unsigned from,
1275 unsigned to)
1276{
1277 struct gfs2_inode *ip = GFS2_I(page->mapping->host);
1278
1279 page_zero_new_buffers(page, from, to);
1280 flush_dcache_page(page);
1281 mark_page_accessed(page);
1282
1283 if (!gfs2_is_writeback(ip))
1284 gfs2_page_add_databufs(ip, page, from, to);
1285
1286 block_commit_write(page, from, to);
1287}
1288
1289
1290static int write_empty_blocks(struct page *page, unsigned from, unsigned to)
1291{
1292 unsigned start, end, next;
1293 struct buffer_head *bh, *head;
1294 int error;
1295
1296 if (!page_has_buffers(page)) {
1297 error = __block_write_begin(page, from, to - from, gfs2_block_map);
1298 if (unlikely(error))
1299 return error;
1300
1301 empty_write_end(page, from, to);
1302 return 0;
1303 }
1304
1305 bh = head = page_buffers(page);
1306 next = end = 0;
1307 while (next < from) {
1308 next += bh->b_size;
1309 bh = bh->b_this_page;
1310 }
1311 start = next;
1312 do {
1313 next += bh->b_size;
1314 if (buffer_mapped(bh)) {
1315 if (end) {
1316 error = __block_write_begin(page, start, end - start,
1317 gfs2_block_map);
1318 if (unlikely(error))
1319 return error;
1320 empty_write_end(page, start, end);
1321 end = 0;
1322 }
1323 start = next;
1324 }
1325 else
1326 end = next;
1327 bh = bh->b_this_page;
1328 } while (next < to);
1329
1330 if (end) {
1331 error = __block_write_begin(page, start, end - start, gfs2_block_map);
1332 if (unlikely(error))
1333 return error;
1334 empty_write_end(page, start, end);
1335 }
1336
1337 return 0;
1338}
1339
1340static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len,
1341 int mode)
1342{
1343 struct gfs2_inode *ip = GFS2_I(inode);
1344 struct buffer_head *dibh;
1345 int error;
1346 u64 start = offset >> PAGE_CACHE_SHIFT;
1347 unsigned int start_offset = offset & ~PAGE_CACHE_MASK;
1348 u64 end = (offset + len - 1) >> PAGE_CACHE_SHIFT;
1349 pgoff_t curr;
1350 struct page *page;
1351 unsigned int end_offset = (offset + len) & ~PAGE_CACHE_MASK;
1352 unsigned int from, to;
1353
1354 if (!end_offset)
1355 end_offset = PAGE_CACHE_SIZE;
1356
1357 error = gfs2_meta_inode_buffer(ip, &dibh);
1358 if (unlikely(error))
1359 goto out;
1360
1361 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1362
1363 if (gfs2_is_stuffed(ip)) {
1364 error = gfs2_unstuff_dinode(ip, NULL);
1365 if (unlikely(error))
1366 goto out;
1367 }
1368
1369 curr = start;
1370 offset = start << PAGE_CACHE_SHIFT;
1371 from = start_offset;
1372 to = PAGE_CACHE_SIZE;
1373 while (curr <= end) {
1374 page = grab_cache_page_write_begin(inode->i_mapping, curr,
1375 AOP_FLAG_NOFS);
1376 if (unlikely(!page)) {
1377 error = -ENOMEM;
1378 goto out;
1379 }
1380
1381 if (curr == end)
1382 to = end_offset;
1383 error = write_empty_blocks(page, from, to);
1384 if (!error && offset + to > inode->i_size &&
1385 !(mode & FALLOC_FL_KEEP_SIZE)) {
1386 i_size_write(inode, offset + to);
1387 }
1388 unlock_page(page);
1389 page_cache_release(page);
1390 if (error)
1391 goto out;
1392 curr++;
1393 offset += PAGE_CACHE_SIZE;
1394 from = 0;
1395 }
1396
1397 gfs2_dinode_out(ip, dibh->b_data);
1398 mark_inode_dirty(inode);
1399
1400 brelse(dibh);
1401
1402out:
1403 return error;
1404}
1405
1406static void calc_max_reserv(struct gfs2_inode *ip, loff_t max, loff_t *len,
1407 unsigned int *data_blocks, unsigned int *ind_blocks)
1408{
1409 const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1410 unsigned int max_blocks = ip->i_alloc->al_rgd->rd_free_clone;
1411 unsigned int tmp, max_data = max_blocks - 3 * (sdp->sd_max_height - 1);
1412
1413 for (tmp = max_data; tmp > sdp->sd_diptrs;) {
1414 tmp = DIV_ROUND_UP(tmp, sdp->sd_inptrs);
1415 max_data -= tmp;
1416 }
1417 /* This calculation isn't the exact reverse of gfs2_write_calc_reserve,
1418 so it might end up with fewer data blocks */
1419 if (max_data <= *data_blocks)
1420 return;
1421 *data_blocks = max_data;
1422 *ind_blocks = max_blocks - max_data;
1423 *len = ((loff_t)max_data - 3) << sdp->sd_sb.sb_bsize_shift;
1424 if (*len > max) {
1425 *len = max;
1426 gfs2_write_calc_reserv(ip, max, data_blocks, ind_blocks);
1427 }
1428}
1429
1430static long gfs2_fallocate(struct inode *inode, int mode, loff_t offset,
1431 loff_t len)
1432{
1433 struct gfs2_sbd *sdp = GFS2_SB(inode);
1434 struct gfs2_inode *ip = GFS2_I(inode);
1435 unsigned int data_blocks = 0, ind_blocks = 0, rblocks;
1436 loff_t bytes, max_bytes;
1437 struct gfs2_alloc *al;
1438 int error;
1439 loff_t next = (offset + len - 1) >> sdp->sd_sb.sb_bsize_shift;
1440 next = (next + 1) << sdp->sd_sb.sb_bsize_shift;
1441
1442 offset = (offset >> sdp->sd_sb.sb_bsize_shift) <<
1443 sdp->sd_sb.sb_bsize_shift;
1444
1445 len = next - offset;
1446 bytes = sdp->sd_max_rg_data * sdp->sd_sb.sb_bsize / 2;
1447 if (!bytes)
1448 bytes = UINT_MAX;
1449
1450 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh);
1451 error = gfs2_glock_nq(&ip->i_gh);
1452 if (unlikely(error))
1453 goto out_uninit;
1454
1455 if (!gfs2_write_alloc_required(ip, offset, len))
1456 goto out_unlock;
1457
1458 while (len > 0) {
1459 if (len < bytes)
1460 bytes = len;
1461 al = gfs2_alloc_get(ip);
1462 if (!al) {
1463 error = -ENOMEM;
1464 goto out_unlock;
1465 }
1466
1467 error = gfs2_quota_lock_check(ip);
1468 if (error)
1469 goto out_alloc_put;
1470
1471retry:
1472 gfs2_write_calc_reserv(ip, bytes, &data_blocks, &ind_blocks);
1473
1474 al->al_requested = data_blocks + ind_blocks;
1475 error = gfs2_inplace_reserve(ip);
1476 if (error) {
1477 if (error == -ENOSPC && bytes > sdp->sd_sb.sb_bsize) {
1478 bytes >>= 1;
1479 goto retry;
1480 }
1481 goto out_qunlock;
1482 }
1483 max_bytes = bytes;
1484 calc_max_reserv(ip, len, &max_bytes, &data_blocks, &ind_blocks);
1485 al->al_requested = data_blocks + ind_blocks;
1486
1487 rblocks = RES_DINODE + ind_blocks + RES_STATFS + RES_QUOTA +
1488 RES_RG_HDR + gfs2_rg_blocks(al);
1489 if (gfs2_is_jdata(ip))
1490 rblocks += data_blocks ? data_blocks : 1;
1491
1492 error = gfs2_trans_begin(sdp, rblocks,
1493 PAGE_CACHE_SIZE/sdp->sd_sb.sb_bsize);
1494 if (error)
1495 goto out_trans_fail;
1496
1497 error = fallocate_chunk(inode, offset, max_bytes, mode);
1498 gfs2_trans_end(sdp);
1499
1500 if (error)
1501 goto out_trans_fail;
1502
1503 len -= max_bytes;
1504 offset += max_bytes;
1505 gfs2_inplace_release(ip);
1506 gfs2_quota_unlock(ip);
1507 gfs2_alloc_put(ip);
1508 }
1509 goto out_unlock;
1510
1511out_trans_fail:
1512 gfs2_inplace_release(ip);
1513out_qunlock:
1514 gfs2_quota_unlock(ip);
1515out_alloc_put:
1516 gfs2_alloc_put(ip);
1517out_unlock:
1518 gfs2_glock_dq(&ip->i_gh);
1519out_uninit:
1520 gfs2_holder_uninit(&ip->i_gh);
1521 return error;
1522}
1523
1524
1525static int gfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 1258static int gfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
1526 u64 start, u64 len) 1259 u64 start, u64 len)
1527{ 1260{
@@ -1572,7 +1305,6 @@ const struct inode_operations gfs2_file_iops = {
1572 .getxattr = gfs2_getxattr, 1305 .getxattr = gfs2_getxattr,
1573 .listxattr = gfs2_listxattr, 1306 .listxattr = gfs2_listxattr,
1574 .removexattr = gfs2_removexattr, 1307 .removexattr = gfs2_removexattr,
1575 .fallocate = gfs2_fallocate,
1576 .fiemap = gfs2_fiemap, 1308 .fiemap = gfs2_fiemap,
1577}; 1309};
1578 1310
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 58a9b9998b4..a689901963d 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -631,6 +631,7 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
631 struct fs_disk_quota *fdq) 631 struct fs_disk_quota *fdq)
632{ 632{
633 struct inode *inode = &ip->i_inode; 633 struct inode *inode = &ip->i_inode;
634 struct gfs2_sbd *sdp = GFS2_SB(inode);
634 struct address_space *mapping = inode->i_mapping; 635 struct address_space *mapping = inode->i_mapping;
635 unsigned long index = loc >> PAGE_CACHE_SHIFT; 636 unsigned long index = loc >> PAGE_CACHE_SHIFT;
636 unsigned offset = loc & (PAGE_CACHE_SIZE - 1); 637 unsigned offset = loc & (PAGE_CACHE_SIZE - 1);
@@ -658,13 +659,17 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
658 qd->qd_qb.qb_value = qp->qu_value; 659 qd->qd_qb.qb_value = qp->qu_value;
659 if (fdq) { 660 if (fdq) {
660 if (fdq->d_fieldmask & FS_DQ_BSOFT) { 661 if (fdq->d_fieldmask & FS_DQ_BSOFT) {
661 qp->qu_warn = cpu_to_be64(fdq->d_blk_softlimit); 662 qp->qu_warn = cpu_to_be64(fdq->d_blk_softlimit >> sdp->sd_fsb2bb_shift);
662 qd->qd_qb.qb_warn = qp->qu_warn; 663 qd->qd_qb.qb_warn = qp->qu_warn;
663 } 664 }
664 if (fdq->d_fieldmask & FS_DQ_BHARD) { 665 if (fdq->d_fieldmask & FS_DQ_BHARD) {
665 qp->qu_limit = cpu_to_be64(fdq->d_blk_hardlimit); 666 qp->qu_limit = cpu_to_be64(fdq->d_blk_hardlimit >> sdp->sd_fsb2bb_shift);
666 qd->qd_qb.qb_limit = qp->qu_limit; 667 qd->qd_qb.qb_limit = qp->qu_limit;
667 } 668 }
669 if (fdq->d_fieldmask & FS_DQ_BCOUNT) {
670 qp->qu_value = cpu_to_be64(fdq->d_bcount >> sdp->sd_fsb2bb_shift);
671 qd->qd_qb.qb_value = qp->qu_value;
672 }
668 } 673 }
669 674
670 /* Write the quota into the quota file on disk */ 675 /* Write the quota into the quota file on disk */
@@ -1497,9 +1502,9 @@ static int gfs2_get_dqblk(struct super_block *sb, int type, qid_t id,
1497 fdq->d_version = FS_DQUOT_VERSION; 1502 fdq->d_version = FS_DQUOT_VERSION;
1498 fdq->d_flags = (type == QUOTA_USER) ? FS_USER_QUOTA : FS_GROUP_QUOTA; 1503 fdq->d_flags = (type == QUOTA_USER) ? FS_USER_QUOTA : FS_GROUP_QUOTA;
1499 fdq->d_id = id; 1504 fdq->d_id = id;
1500 fdq->d_blk_hardlimit = be64_to_cpu(qlvb->qb_limit); 1505 fdq->d_blk_hardlimit = be64_to_cpu(qlvb->qb_limit) << sdp->sd_fsb2bb_shift;
1501 fdq->d_blk_softlimit = be64_to_cpu(qlvb->qb_warn); 1506 fdq->d_blk_softlimit = be64_to_cpu(qlvb->qb_warn) << sdp->sd_fsb2bb_shift;
1502 fdq->d_bcount = be64_to_cpu(qlvb->qb_value); 1507 fdq->d_bcount = be64_to_cpu(qlvb->qb_value) << sdp->sd_fsb2bb_shift;
1503 1508
1504 gfs2_glock_dq_uninit(&q_gh); 1509 gfs2_glock_dq_uninit(&q_gh);
1505out: 1510out:
@@ -1508,7 +1513,7 @@ out:
1508} 1513}
1509 1514
1510/* GFS2 only supports a subset of the XFS fields */ 1515/* GFS2 only supports a subset of the XFS fields */
1511#define GFS2_FIELDMASK (FS_DQ_BSOFT|FS_DQ_BHARD) 1516#define GFS2_FIELDMASK (FS_DQ_BSOFT|FS_DQ_BHARD|FS_DQ_BCOUNT)
1512 1517
1513static int gfs2_set_dqblk(struct super_block *sb, int type, qid_t id, 1518static int gfs2_set_dqblk(struct super_block *sb, int type, qid_t id,
1514 struct fs_disk_quota *fdq) 1519 struct fs_disk_quota *fdq)
@@ -1566,11 +1571,17 @@ static int gfs2_set_dqblk(struct super_block *sb, int type, qid_t id,
1566 1571
1567 /* If nothing has changed, this is a no-op */ 1572 /* If nothing has changed, this is a no-op */
1568 if ((fdq->d_fieldmask & FS_DQ_BSOFT) && 1573 if ((fdq->d_fieldmask & FS_DQ_BSOFT) &&
1569 (fdq->d_blk_softlimit == be64_to_cpu(qd->qd_qb.qb_warn))) 1574 ((fdq->d_blk_softlimit >> sdp->sd_fsb2bb_shift) == be64_to_cpu(qd->qd_qb.qb_warn)))
1570 fdq->d_fieldmask ^= FS_DQ_BSOFT; 1575 fdq->d_fieldmask ^= FS_DQ_BSOFT;
1576
1571 if ((fdq->d_fieldmask & FS_DQ_BHARD) && 1577 if ((fdq->d_fieldmask & FS_DQ_BHARD) &&
1572 (fdq->d_blk_hardlimit == be64_to_cpu(qd->qd_qb.qb_limit))) 1578 ((fdq->d_blk_hardlimit >> sdp->sd_fsb2bb_shift) == be64_to_cpu(qd->qd_qb.qb_limit)))
1573 fdq->d_fieldmask ^= FS_DQ_BHARD; 1579 fdq->d_fieldmask ^= FS_DQ_BHARD;
1580
1581 if ((fdq->d_fieldmask & FS_DQ_BCOUNT) &&
1582 ((fdq->d_bcount >> sdp->sd_fsb2bb_shift) == be64_to_cpu(qd->qd_qb.qb_value)))
1583 fdq->d_fieldmask ^= FS_DQ_BCOUNT;
1584
1574 if (fdq->d_fieldmask == 0) 1585 if (fdq->d_fieldmask == 0)
1575 goto out_i; 1586 goto out_i;
1576 1587
@@ -1619,4 +1630,3 @@ const struct quotactl_ops gfs2_quotactl_ops = {
1619 .get_dqblk = gfs2_get_dqblk, 1630 .get_dqblk = gfs2_get_dqblk,
1620 .set_dqblk = gfs2_set_dqblk, 1631 .set_dqblk = gfs2_set_dqblk,
1621}; 1632};
1622
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index bef3ab6cf5c..7293ea27020 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -500,7 +500,7 @@ u64 gfs2_ri_total(struct gfs2_sbd *sdp)
500 for (rgrps = 0;; rgrps++) { 500 for (rgrps = 0;; rgrps++) {
501 loff_t pos = rgrps * sizeof(struct gfs2_rindex); 501 loff_t pos = rgrps * sizeof(struct gfs2_rindex);
502 502
503 if (pos + sizeof(struct gfs2_rindex) >= i_size_read(inode)) 503 if (pos + sizeof(struct gfs2_rindex) > i_size_read(inode))
504 break; 504 break;
505 error = gfs2_internal_read(ip, &ra_state, buf, &pos, 505 error = gfs2_internal_read(ip, &ra_state, buf, &pos,
506 sizeof(struct gfs2_rindex)); 506 sizeof(struct gfs2_rindex));
@@ -583,7 +583,7 @@ static int read_rindex_entry(struct gfs2_inode *ip,
583 * Returns: 0 on successful update, error code otherwise 583 * Returns: 0 on successful update, error code otherwise
584 */ 584 */
585 585
586static int gfs2_ri_update(struct gfs2_inode *ip) 586int gfs2_ri_update(struct gfs2_inode *ip)
587{ 587{
588 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 588 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
589 struct inode *inode = &ip->i_inode; 589 struct inode *inode = &ip->i_inode;
@@ -614,46 +614,6 @@ static int gfs2_ri_update(struct gfs2_inode *ip)
614} 614}
615 615
616/** 616/**
617 * gfs2_ri_update_special - Pull in a new resource index from the disk
618 *
619 * This is a special version that's safe to call from gfs2_inplace_reserve_i.
620 * In this case we know that we don't have any resource groups in memory yet.
621 *
622 * @ip: pointer to the rindex inode
623 *
624 * Returns: 0 on successful update, error code otherwise
625 */
626static int gfs2_ri_update_special(struct gfs2_inode *ip)
627{
628 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
629 struct inode *inode = &ip->i_inode;
630 struct file_ra_state ra_state;
631 struct gfs2_rgrpd *rgd;
632 unsigned int max_data = 0;
633 int error;
634
635 file_ra_state_init(&ra_state, inode->i_mapping);
636 for (sdp->sd_rgrps = 0;; sdp->sd_rgrps++) {
637 /* Ignore partials */
638 if ((sdp->sd_rgrps + 1) * sizeof(struct gfs2_rindex) >
639 i_size_read(inode))
640 break;
641 error = read_rindex_entry(ip, &ra_state);
642 if (error) {
643 clear_rgrpdi(sdp);
644 return error;
645 }
646 }
647 list_for_each_entry(rgd, &sdp->sd_rindex_list, rd_list)
648 if (rgd->rd_data > max_data)
649 max_data = rgd->rd_data;
650 sdp->sd_max_rg_data = max_data;
651
652 sdp->sd_rindex_uptodate = 1;
653 return 0;
654}
655
656/**
657 * gfs2_rindex_hold - Grab a lock on the rindex 617 * gfs2_rindex_hold - Grab a lock on the rindex
658 * @sdp: The GFS2 superblock 618 * @sdp: The GFS2 superblock
659 * @ri_gh: the glock holder 619 * @ri_gh: the glock holder
@@ -963,17 +923,18 @@ static int try_rgrp_fit(struct gfs2_rgrpd *rgd, struct gfs2_alloc *al)
963 * The inode, if one has been found, in inode. 923 * The inode, if one has been found, in inode.
964 */ 924 */
965 925
966static u64 try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, 926static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip)
967 u64 skip)
968{ 927{
969 u32 goal = 0, block; 928 u32 goal = 0, block;
970 u64 no_addr; 929 u64 no_addr;
971 struct gfs2_sbd *sdp = rgd->rd_sbd; 930 struct gfs2_sbd *sdp = rgd->rd_sbd;
972 unsigned int n; 931 unsigned int n;
932 struct gfs2_glock *gl;
933 struct gfs2_inode *ip;
934 int error;
935 int found = 0;
973 936
974 for(;;) { 937 while (goal < rgd->rd_data) {
975 if (goal >= rgd->rd_data)
976 break;
977 down_write(&sdp->sd_log_flush_lock); 938 down_write(&sdp->sd_log_flush_lock);
978 n = 1; 939 n = 1;
979 block = rgblk_search(rgd, goal, GFS2_BLKST_UNLINKED, 940 block = rgblk_search(rgd, goal, GFS2_BLKST_UNLINKED,
@@ -990,11 +951,32 @@ static u64 try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked,
990 if (no_addr == skip) 951 if (no_addr == skip)
991 continue; 952 continue;
992 *last_unlinked = no_addr; 953 *last_unlinked = no_addr;
993 return no_addr; 954
955 error = gfs2_glock_get(sdp, no_addr, &gfs2_inode_glops, CREATE, &gl);
956 if (error)
957 continue;
958
959 /* If the inode is already in cache, we can ignore it here
960 * because the existing inode disposal code will deal with
961 * it when all refs have gone away. Accessing gl_object like
962 * this is not safe in general. Here it is ok because we do
963 * not dereference the pointer, and we only need an approx
964 * answer to whether it is NULL or not.
965 */
966 ip = gl->gl_object;
967
968 if (ip || queue_work(gfs2_delete_workqueue, &gl->gl_delete) == 0)
969 gfs2_glock_put(gl);
970 else
971 found++;
972
973 /* Limit reclaim to sensible number of tasks */
974 if (found > 2*NR_CPUS)
975 return;
994 } 976 }
995 977
996 rgd->rd_flags &= ~GFS2_RDF_CHECK; 978 rgd->rd_flags &= ~GFS2_RDF_CHECK;
997 return 0; 979 return;
998} 980}
999 981
1000/** 982/**
@@ -1075,11 +1057,9 @@ static void forward_rgrp_set(struct gfs2_sbd *sdp, struct gfs2_rgrpd *rgd)
1075 * Try to acquire rgrp in way which avoids contending with others. 1057 * Try to acquire rgrp in way which avoids contending with others.
1076 * 1058 *
1077 * Returns: errno 1059 * Returns: errno
1078 * unlinked: the block address of an unlinked block to be reclaimed
1079 */ 1060 */
1080 1061
1081static int get_local_rgrp(struct gfs2_inode *ip, u64 *unlinked, 1062static int get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
1082 u64 *last_unlinked)
1083{ 1063{
1084 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 1064 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1085 struct gfs2_rgrpd *rgd, *begin = NULL; 1065 struct gfs2_rgrpd *rgd, *begin = NULL;
@@ -1089,7 +1069,6 @@ static int get_local_rgrp(struct gfs2_inode *ip, u64 *unlinked,
1089 int loops = 0; 1069 int loops = 0;
1090 int error, rg_locked; 1070 int error, rg_locked;
1091 1071
1092 *unlinked = 0;
1093 rgd = gfs2_blk2rgrpd(sdp, ip->i_goal); 1072 rgd = gfs2_blk2rgrpd(sdp, ip->i_goal);
1094 1073
1095 while (rgd) { 1074 while (rgd) {
@@ -1106,17 +1085,10 @@ static int get_local_rgrp(struct gfs2_inode *ip, u64 *unlinked,
1106 case 0: 1085 case 0:
1107 if (try_rgrp_fit(rgd, al)) 1086 if (try_rgrp_fit(rgd, al))
1108 goto out; 1087 goto out;
1109 /* If the rg came in already locked, there's no 1088 if (rgd->rd_flags & GFS2_RDF_CHECK)
1110 way we can recover from a failed try_rgrp_unlink 1089 try_rgrp_unlink(rgd, last_unlinked, ip->i_no_addr);
1111 because that would require an iput which can only
1112 happen after the rgrp is unlocked. */
1113 if (!rg_locked && rgd->rd_flags & GFS2_RDF_CHECK)
1114 *unlinked = try_rgrp_unlink(rgd, last_unlinked,
1115 ip->i_no_addr);
1116 if (!rg_locked) 1090 if (!rg_locked)
1117 gfs2_glock_dq_uninit(&al->al_rgd_gh); 1091 gfs2_glock_dq_uninit(&al->al_rgd_gh);
1118 if (*unlinked)
1119 return -EAGAIN;
1120 /* fall through */ 1092 /* fall through */
1121 case GLR_TRYFAILED: 1093 case GLR_TRYFAILED:
1122 rgd = recent_rgrp_next(rgd); 1094 rgd = recent_rgrp_next(rgd);
@@ -1145,13 +1117,10 @@ static int get_local_rgrp(struct gfs2_inode *ip, u64 *unlinked,
1145 case 0: 1117 case 0:
1146 if (try_rgrp_fit(rgd, al)) 1118 if (try_rgrp_fit(rgd, al))
1147 goto out; 1119 goto out;
1148 if (!rg_locked && rgd->rd_flags & GFS2_RDF_CHECK) 1120 if (rgd->rd_flags & GFS2_RDF_CHECK)
1149 *unlinked = try_rgrp_unlink(rgd, last_unlinked, 1121 try_rgrp_unlink(rgd, last_unlinked, ip->i_no_addr);
1150 ip->i_no_addr);
1151 if (!rg_locked) 1122 if (!rg_locked)
1152 gfs2_glock_dq_uninit(&al->al_rgd_gh); 1123 gfs2_glock_dq_uninit(&al->al_rgd_gh);
1153 if (*unlinked)
1154 return -EAGAIN;
1155 break; 1124 break;
1156 1125
1157 case GLR_TRYFAILED: 1126 case GLR_TRYFAILED:
@@ -1204,12 +1173,12 @@ int gfs2_inplace_reserve_i(struct gfs2_inode *ip, int hold_rindex,
1204 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 1173 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1205 struct gfs2_alloc *al = ip->i_alloc; 1174 struct gfs2_alloc *al = ip->i_alloc;
1206 int error = 0; 1175 int error = 0;
1207 u64 last_unlinked = NO_BLOCK, unlinked; 1176 u64 last_unlinked = NO_BLOCK;
1177 int tries = 0;
1208 1178
1209 if (gfs2_assert_warn(sdp, al->al_requested)) 1179 if (gfs2_assert_warn(sdp, al->al_requested))
1210 return -EINVAL; 1180 return -EINVAL;
1211 1181
1212try_again:
1213 if (hold_rindex) { 1182 if (hold_rindex) {
1214 /* We need to hold the rindex unless the inode we're using is 1183 /* We need to hold the rindex unless the inode we're using is
1215 the rindex itself, in which case it's already held. */ 1184 the rindex itself, in which case it's already held. */
@@ -1217,32 +1186,33 @@ try_again:
1217 error = gfs2_rindex_hold(sdp, &al->al_ri_gh); 1186 error = gfs2_rindex_hold(sdp, &al->al_ri_gh);
1218 else if (!sdp->sd_rgrps) /* We may not have the rindex read 1187 else if (!sdp->sd_rgrps) /* We may not have the rindex read
1219 in, so: */ 1188 in, so: */
1220 error = gfs2_ri_update_special(ip); 1189 error = gfs2_ri_update(ip);
1190 if (error)
1191 return error;
1221 } 1192 }
1222 1193
1223 if (error) 1194try_again:
1224 return error; 1195 do {
1196 error = get_local_rgrp(ip, &last_unlinked);
1197 /* If there is no space, flushing the log may release some */
1198 if (error) {
1199 if (ip == GFS2_I(sdp->sd_rindex) &&
1200 !sdp->sd_rindex_uptodate) {
1201 error = gfs2_ri_update(ip);
1202 if (error)
1203 return error;
1204 goto try_again;
1205 }
1206 gfs2_log_flush(sdp, NULL);
1207 }
1208 } while (error && tries++ < 3);
1225 1209
1226 /* Find an rgrp suitable for allocation. If it encounters any unlinked
1227 dinodes along the way, error will equal -EAGAIN and unlinked will
1228 contains it block address. We then need to look up that inode and
1229 try to free it, and try the allocation again. */
1230 error = get_local_rgrp(ip, &unlinked, &last_unlinked);
1231 if (error) { 1210 if (error) {
1232 if (hold_rindex && ip != GFS2_I(sdp->sd_rindex)) 1211 if (hold_rindex && ip != GFS2_I(sdp->sd_rindex))
1233 gfs2_glock_dq_uninit(&al->al_ri_gh); 1212 gfs2_glock_dq_uninit(&al->al_ri_gh);
1234 if (error != -EAGAIN) 1213 return error;
1235 return error;
1236
1237 gfs2_process_unlinked_inode(ip->i_inode.i_sb, unlinked);
1238 /* regardless of whether or not gfs2_process_unlinked_inode
1239 was successful, we don't want to repeat it again. */
1240 last_unlinked = unlinked;
1241 gfs2_log_flush(sdp, NULL);
1242 error = 0;
1243
1244 goto try_again;
1245 } 1214 }
1215
1246 /* no error, so we have the rgrp set in the inode's allocation. */ 1216 /* no error, so we have the rgrp set in the inode's allocation. */
1247 al->al_file = file; 1217 al->al_file = file;
1248 al->al_line = line; 1218 al->al_line = line;
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index 0e35c0466f9..50c2bb04369 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -48,6 +48,7 @@ extern int gfs2_inplace_reserve_i(struct gfs2_inode *ip, int hold_rindex,
48 48
49extern void gfs2_inplace_release(struct gfs2_inode *ip); 49extern void gfs2_inplace_release(struct gfs2_inode *ip);
50 50
51extern int gfs2_ri_update(struct gfs2_inode *ip);
51extern int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n); 52extern int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n);
52extern int gfs2_alloc_di(struct gfs2_inode *ip, u64 *bn, u64 *generation); 53extern int gfs2_alloc_di(struct gfs2_inode *ip, u64 *bn, u64 *generation);
53 54
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 2b2c4997430..ec73ed70bae 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -1336,6 +1336,7 @@ static void gfs2_evict_inode(struct inode *inode)
1336 if (error) 1336 if (error)
1337 goto out_truncate; 1337 goto out_truncate;
1338 1338
1339 ip->i_iopen_gh.gh_flags |= GL_NOCACHE;
1339 gfs2_glock_dq_wait(&ip->i_iopen_gh); 1340 gfs2_glock_dq_wait(&ip->i_iopen_gh);
1340 gfs2_holder_reinit(LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB | GL_NOCACHE, &ip->i_iopen_gh); 1341 gfs2_holder_reinit(LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB | GL_NOCACHE, &ip->i_iopen_gh);
1341 error = gfs2_glock_nq(&ip->i_iopen_gh); 1342 error = gfs2_glock_nq(&ip->i_iopen_gh);
@@ -1405,11 +1406,18 @@ static struct inode *gfs2_alloc_inode(struct super_block *sb)
1405 return &ip->i_inode; 1406 return &ip->i_inode;
1406} 1407}
1407 1408
1408static void gfs2_destroy_inode(struct inode *inode) 1409static void gfs2_i_callback(struct rcu_head *head)
1409{ 1410{
1411 struct inode *inode = container_of(head, struct inode, i_rcu);
1412 INIT_LIST_HEAD(&inode->i_dentry);
1410 kmem_cache_free(gfs2_inode_cachep, inode); 1413 kmem_cache_free(gfs2_inode_cachep, inode);
1411} 1414}
1412 1415
1416static void gfs2_destroy_inode(struct inode *inode)
1417{
1418 call_rcu(&inode->i_rcu, gfs2_i_callback);
1419}
1420
1413const struct super_operations gfs2_super_ops = { 1421const struct super_operations gfs2_super_ops = {
1414 .alloc_inode = gfs2_alloc_inode, 1422 .alloc_inode = gfs2_alloc_inode,
1415 .destroy_inode = gfs2_destroy_inode, 1423 .destroy_inode = gfs2_destroy_inode,
diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c
index 30b58f07c8a..439b61c0326 100644
--- a/fs/gfs2/xattr.c
+++ b/fs/gfs2/xattr.c
@@ -1296,10 +1296,8 @@ fail:
1296 1296
1297int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data) 1297int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data)
1298{ 1298{
1299 struct inode *inode = &ip->i_inode;
1300 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 1299 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1301 struct gfs2_ea_location el; 1300 struct gfs2_ea_location el;
1302 struct buffer_head *dibh;
1303 int error; 1301 int error;
1304 1302
1305 error = gfs2_ea_find(ip, GFS2_EATYPE_SYS, GFS2_POSIX_ACL_ACCESS, &el); 1303 error = gfs2_ea_find(ip, GFS2_EATYPE_SYS, GFS2_POSIX_ACL_ACCESS, &el);
@@ -1321,26 +1319,7 @@ int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data)
1321 if (error) 1319 if (error)
1322 return error; 1320 return error;
1323 1321
1324 error = gfs2_meta_inode_buffer(ip, &dibh); 1322 error = gfs2_setattr_simple(ip, attr);
1325 if (error)
1326 goto out_trans_end;
1327
1328 if ((attr->ia_valid & ATTR_SIZE) &&
1329 attr->ia_size != i_size_read(inode)) {
1330 int error;
1331
1332 error = vmtruncate(inode, attr->ia_size);
1333 gfs2_assert_warn(GFS2_SB(inode), !error);
1334 }
1335
1336 setattr_copy(inode, attr);
1337 mark_inode_dirty(inode);
1338
1339 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1340 gfs2_dinode_out(ip, dibh->b_data);
1341 brelse(dibh);
1342
1343out_trans_end:
1344 gfs2_trans_end(sdp); 1323 gfs2_trans_end(sdp);
1345 return error; 1324 return error;
1346} 1325}
diff --git a/fs/hfs/dir.c b/fs/hfs/dir.c
index 2b3b8611b41..afa66aaa223 100644
--- a/fs/hfs/dir.c
+++ b/fs/hfs/dir.c
@@ -25,8 +25,6 @@ static struct dentry *hfs_lookup(struct inode *dir, struct dentry *dentry,
25 struct inode *inode = NULL; 25 struct inode *inode = NULL;
26 int res; 26 int res;
27 27
28 dentry->d_op = &hfs_dentry_operations;
29
30 hfs_find_init(HFS_SB(dir->i_sb)->cat_tree, &fd); 28 hfs_find_init(HFS_SB(dir->i_sb)->cat_tree, &fd);
31 hfs_cat_build_key(dir->i_sb, fd.search_key, dir->i_ino, &dentry->d_name); 29 hfs_cat_build_key(dir->i_sb, fd.search_key, dir->i_ino, &dentry->d_name);
32 res = hfs_brec_read(&fd, &rec, sizeof(rec)); 30 res = hfs_brec_read(&fd, &rec, sizeof(rec));
diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h
index c8cffb81e84..ad97c2d5828 100644
--- a/fs/hfs/hfs_fs.h
+++ b/fs/hfs/hfs_fs.h
@@ -213,10 +213,14 @@ extern int hfs_part_find(struct super_block *, sector_t *, sector_t *);
213/* string.c */ 213/* string.c */
214extern const struct dentry_operations hfs_dentry_operations; 214extern const struct dentry_operations hfs_dentry_operations;
215 215
216extern int hfs_hash_dentry(struct dentry *, struct qstr *); 216extern int hfs_hash_dentry(const struct dentry *, const struct inode *,
217 struct qstr *);
217extern int hfs_strcmp(const unsigned char *, unsigned int, 218extern int hfs_strcmp(const unsigned char *, unsigned int,
218 const unsigned char *, unsigned int); 219 const unsigned char *, unsigned int);
219extern int hfs_compare_dentry(struct dentry *, struct qstr *, struct qstr *); 220extern int hfs_compare_dentry(const struct dentry *parent,
221 const struct inode *pinode,
222 const struct dentry *dentry, const struct inode *inode,
223 unsigned int len, const char *str, const struct qstr *name);
220 224
221/* trans.c */ 225/* trans.c */
222extern void hfs_asc2mac(struct super_block *, struct hfs_name *, struct qstr *); 226extern void hfs_asc2mac(struct super_block *, struct hfs_name *, struct qstr *);
diff --git a/fs/hfs/string.c b/fs/hfs/string.c
index 927a5af7942..495a976a3cc 100644
--- a/fs/hfs/string.c
+++ b/fs/hfs/string.c
@@ -51,7 +51,8 @@ static unsigned char caseorder[256] = {
51/* 51/*
52 * Hash a string to an integer in a case-independent way 52 * Hash a string to an integer in a case-independent way
53 */ 53 */
54int hfs_hash_dentry(struct dentry *dentry, struct qstr *this) 54int hfs_hash_dentry(const struct dentry *dentry, const struct inode *inode,
55 struct qstr *this)
55{ 56{
56 const unsigned char *name = this->name; 57 const unsigned char *name = this->name;
57 unsigned int hash, len = this->len; 58 unsigned int hash, len = this->len;
@@ -92,21 +93,21 @@ int hfs_strcmp(const unsigned char *s1, unsigned int len1,
92 * Test for equality of two strings in the HFS filename character ordering. 93 * Test for equality of two strings in the HFS filename character ordering.
93 * return 1 on failure and 0 on success 94 * return 1 on failure and 0 on success
94 */ 95 */
95int hfs_compare_dentry(struct dentry *dentry, struct qstr *s1, struct qstr *s2) 96int hfs_compare_dentry(const struct dentry *parent, const struct inode *pinode,
97 const struct dentry *dentry, const struct inode *inode,
98 unsigned int len, const char *str, const struct qstr *name)
96{ 99{
97 const unsigned char *n1, *n2; 100 const unsigned char *n1, *n2;
98 int len;
99 101
100 len = s1->len;
101 if (len >= HFS_NAMELEN) { 102 if (len >= HFS_NAMELEN) {
102 if (s2->len < HFS_NAMELEN) 103 if (name->len < HFS_NAMELEN)
103 return 1; 104 return 1;
104 len = HFS_NAMELEN; 105 len = HFS_NAMELEN;
105 } else if (len != s2->len) 106 } else if (len != name->len)
106 return 1; 107 return 1;
107 108
108 n1 = s1->name; 109 n1 = str;
109 n2 = s2->name; 110 n2 = name->name;
110 while (len--) { 111 while (len--) {
111 if (caseorder[*n1++] != caseorder[*n2++]) 112 if (caseorder[*n1++] != caseorder[*n2++])
112 return 1; 113 return 1;
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index 4824c27cebb..1b55f704fb2 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -167,11 +167,18 @@ static struct inode *hfs_alloc_inode(struct super_block *sb)
167 return i ? &i->vfs_inode : NULL; 167 return i ? &i->vfs_inode : NULL;
168} 168}
169 169
170static void hfs_destroy_inode(struct inode *inode) 170static void hfs_i_callback(struct rcu_head *head)
171{ 171{
172 struct inode *inode = container_of(head, struct inode, i_rcu);
173 INIT_LIST_HEAD(&inode->i_dentry);
172 kmem_cache_free(hfs_inode_cachep, HFS_I(inode)); 174 kmem_cache_free(hfs_inode_cachep, HFS_I(inode));
173} 175}
174 176
177static void hfs_destroy_inode(struct inode *inode)
178{
179 call_rcu(&inode->i_rcu, hfs_i_callback);
180}
181
175static const struct super_operations hfs_super_operations = { 182static const struct super_operations hfs_super_operations = {
176 .alloc_inode = hfs_alloc_inode, 183 .alloc_inode = hfs_alloc_inode,
177 .destroy_inode = hfs_destroy_inode, 184 .destroy_inode = hfs_destroy_inode,
@@ -422,13 +429,12 @@ static int hfs_fill_super(struct super_block *sb, void *data, int silent)
422 if (!root_inode) 429 if (!root_inode)
423 goto bail_no_root; 430 goto bail_no_root;
424 431
432 sb->s_d_op = &hfs_dentry_operations;
425 res = -ENOMEM; 433 res = -ENOMEM;
426 sb->s_root = d_alloc_root(root_inode); 434 sb->s_root = d_alloc_root(root_inode);
427 if (!sb->s_root) 435 if (!sb->s_root)
428 goto bail_iput; 436 goto bail_iput;
429 437
430 sb->s_root->d_op = &hfs_dentry_operations;
431
432 /* everything's okay */ 438 /* everything's okay */
433 return 0; 439 return 0;
434 440
diff --git a/fs/hfs/sysdep.c b/fs/hfs/sysdep.c
index 7478f5c219a..19cf291eb91 100644
--- a/fs/hfs/sysdep.c
+++ b/fs/hfs/sysdep.c
@@ -8,15 +8,20 @@
8 * This file contains the code to do various system dependent things. 8 * This file contains the code to do various system dependent things.
9 */ 9 */
10 10
11#include <linux/namei.h>
11#include "hfs_fs.h" 12#include "hfs_fs.h"
12 13
13/* dentry case-handling: just lowercase everything */ 14/* dentry case-handling: just lowercase everything */
14 15
15static int hfs_revalidate_dentry(struct dentry *dentry, struct nameidata *nd) 16static int hfs_revalidate_dentry(struct dentry *dentry, struct nameidata *nd)
16{ 17{
17 struct inode *inode = dentry->d_inode; 18 struct inode *inode;
18 int diff; 19 int diff;
19 20
21 if (nd->flags & LOOKUP_RCU)
22 return -ECHILD;
23
24 inode = dentry->d_inode;
20 if(!inode) 25 if(!inode)
21 return 1; 26 return 1;
22 27
diff --git a/fs/hfsplus/bfind.c b/fs/hfsplus/bfind.c
index d182438c7ae..5d799c13205 100644
--- a/fs/hfsplus/bfind.c
+++ b/fs/hfsplus/bfind.c
@@ -22,7 +22,8 @@ int hfs_find_init(struct hfs_btree *tree, struct hfs_find_data *fd)
22 return -ENOMEM; 22 return -ENOMEM;
23 fd->search_key = ptr; 23 fd->search_key = ptr;
24 fd->key = ptr + tree->max_key_len + 2; 24 fd->key = ptr + tree->max_key_len + 2;
25 dprint(DBG_BNODE_REFS, "find_init: %d (%p)\n", tree->cnid, __builtin_return_address(0)); 25 dprint(DBG_BNODE_REFS, "find_init: %d (%p)\n",
26 tree->cnid, __builtin_return_address(0));
26 mutex_lock(&tree->tree_lock); 27 mutex_lock(&tree->tree_lock);
27 return 0; 28 return 0;
28} 29}
@@ -31,7 +32,8 @@ void hfs_find_exit(struct hfs_find_data *fd)
31{ 32{
32 hfs_bnode_put(fd->bnode); 33 hfs_bnode_put(fd->bnode);
33 kfree(fd->search_key); 34 kfree(fd->search_key);
34 dprint(DBG_BNODE_REFS, "find_exit: %d (%p)\n", fd->tree->cnid, __builtin_return_address(0)); 35 dprint(DBG_BNODE_REFS, "find_exit: %d (%p)\n",
36 fd->tree->cnid, __builtin_return_address(0));
35 mutex_unlock(&fd->tree->tree_lock); 37 mutex_unlock(&fd->tree->tree_lock);
36 fd->tree = NULL; 38 fd->tree = NULL;
37} 39}
diff --git a/fs/hfsplus/bitmap.c b/fs/hfsplus/bitmap.c
index ad57f5991eb..1cad80c789c 100644
--- a/fs/hfsplus/bitmap.c
+++ b/fs/hfsplus/bitmap.c
@@ -15,7 +15,8 @@
15 15
16#define PAGE_CACHE_BITS (PAGE_CACHE_SIZE * 8) 16#define PAGE_CACHE_BITS (PAGE_CACHE_SIZE * 8)
17 17
18int hfsplus_block_allocate(struct super_block *sb, u32 size, u32 offset, u32 *max) 18int hfsplus_block_allocate(struct super_block *sb, u32 size,
19 u32 offset, u32 *max)
19{ 20{
20 struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); 21 struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
21 struct page *page; 22 struct page *page;
diff --git a/fs/hfsplus/bnode.c b/fs/hfsplus/bnode.c
index 29da6574ba7..1c42cc5b899 100644
--- a/fs/hfsplus/bnode.c
+++ b/fs/hfsplus/bnode.c
@@ -42,7 +42,7 @@ void hfs_bnode_read(struct hfs_bnode *node, void *buf, int off, int len)
42u16 hfs_bnode_read_u16(struct hfs_bnode *node, int off) 42u16 hfs_bnode_read_u16(struct hfs_bnode *node, int off)
43{ 43{
44 __be16 data; 44 __be16 data;
45 // optimize later... 45 /* TODO: optimize later... */
46 hfs_bnode_read(node, &data, off, 2); 46 hfs_bnode_read(node, &data, off, 2);
47 return be16_to_cpu(data); 47 return be16_to_cpu(data);
48} 48}
@@ -50,7 +50,7 @@ u16 hfs_bnode_read_u16(struct hfs_bnode *node, int off)
50u8 hfs_bnode_read_u8(struct hfs_bnode *node, int off) 50u8 hfs_bnode_read_u8(struct hfs_bnode *node, int off)
51{ 51{
52 u8 data; 52 u8 data;
53 // optimize later... 53 /* TODO: optimize later... */
54 hfs_bnode_read(node, &data, off, 1); 54 hfs_bnode_read(node, &data, off, 1);
55 return data; 55 return data;
56} 56}
@@ -96,7 +96,7 @@ void hfs_bnode_write(struct hfs_bnode *node, void *buf, int off, int len)
96void hfs_bnode_write_u16(struct hfs_bnode *node, int off, u16 data) 96void hfs_bnode_write_u16(struct hfs_bnode *node, int off, u16 data)
97{ 97{
98 __be16 v = cpu_to_be16(data); 98 __be16 v = cpu_to_be16(data);
99 // optimize later... 99 /* TODO: optimize later... */
100 hfs_bnode_write(node, &v, off, 2); 100 hfs_bnode_write(node, &v, off, 2);
101} 101}
102 102
@@ -212,7 +212,8 @@ void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len)
212 dst_page--; 212 dst_page--;
213 } 213 }
214 src -= len; 214 src -= len;
215 memmove(kmap(*dst_page) + src, kmap(*src_page) + src, len); 215 memmove(kmap(*dst_page) + src,
216 kmap(*src_page) + src, len);
216 kunmap(*src_page); 217 kunmap(*src_page);
217 set_page_dirty(*dst_page); 218 set_page_dirty(*dst_page);
218 kunmap(*dst_page); 219 kunmap(*dst_page);
@@ -250,14 +251,16 @@ void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len)
250 251
251 if (src == dst) { 252 if (src == dst) {
252 l = min(len, (int)PAGE_CACHE_SIZE - src); 253 l = min(len, (int)PAGE_CACHE_SIZE - src);
253 memmove(kmap(*dst_page) + src, kmap(*src_page) + src, l); 254 memmove(kmap(*dst_page) + src,
255 kmap(*src_page) + src, l);
254 kunmap(*src_page); 256 kunmap(*src_page);
255 set_page_dirty(*dst_page); 257 set_page_dirty(*dst_page);
256 kunmap(*dst_page); 258 kunmap(*dst_page);
257 259
258 while ((len -= l) != 0) { 260 while ((len -= l) != 0) {
259 l = min(len, (int)PAGE_CACHE_SIZE); 261 l = min(len, (int)PAGE_CACHE_SIZE);
260 memmove(kmap(*++dst_page), kmap(*++src_page), l); 262 memmove(kmap(*++dst_page),
263 kmap(*++src_page), l);
261 kunmap(*src_page); 264 kunmap(*src_page);
262 set_page_dirty(*dst_page); 265 set_page_dirty(*dst_page);
263 kunmap(*dst_page); 266 kunmap(*dst_page);
@@ -268,7 +271,8 @@ void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len)
268 do { 271 do {
269 src_ptr = kmap(*src_page) + src; 272 src_ptr = kmap(*src_page) + src;
270 dst_ptr = kmap(*dst_page) + dst; 273 dst_ptr = kmap(*dst_page) + dst;
271 if (PAGE_CACHE_SIZE - src < PAGE_CACHE_SIZE - dst) { 274 if (PAGE_CACHE_SIZE - src <
275 PAGE_CACHE_SIZE - dst) {
272 l = PAGE_CACHE_SIZE - src; 276 l = PAGE_CACHE_SIZE - src;
273 src = 0; 277 src = 0;
274 dst += l; 278 dst += l;
@@ -340,7 +344,8 @@ void hfs_bnode_unlink(struct hfs_bnode *node)
340 return; 344 return;
341 tmp->next = node->next; 345 tmp->next = node->next;
342 cnid = cpu_to_be32(tmp->next); 346 cnid = cpu_to_be32(tmp->next);
343 hfs_bnode_write(tmp, &cnid, offsetof(struct hfs_bnode_desc, next), 4); 347 hfs_bnode_write(tmp, &cnid,
348 offsetof(struct hfs_bnode_desc, next), 4);
344 hfs_bnode_put(tmp); 349 hfs_bnode_put(tmp);
345 } else if (node->type == HFS_NODE_LEAF) 350 } else if (node->type == HFS_NODE_LEAF)
346 tree->leaf_head = node->next; 351 tree->leaf_head = node->next;
@@ -351,15 +356,15 @@ void hfs_bnode_unlink(struct hfs_bnode *node)
351 return; 356 return;
352 tmp->prev = node->prev; 357 tmp->prev = node->prev;
353 cnid = cpu_to_be32(tmp->prev); 358 cnid = cpu_to_be32(tmp->prev);
354 hfs_bnode_write(tmp, &cnid, offsetof(struct hfs_bnode_desc, prev), 4); 359 hfs_bnode_write(tmp, &cnid,
360 offsetof(struct hfs_bnode_desc, prev), 4);
355 hfs_bnode_put(tmp); 361 hfs_bnode_put(tmp);
356 } else if (node->type == HFS_NODE_LEAF) 362 } else if (node->type == HFS_NODE_LEAF)
357 tree->leaf_tail = node->prev; 363 tree->leaf_tail = node->prev;
358 364
359 // move down? 365 /* move down? */
360 if (!node->prev && !node->next) { 366 if (!node->prev && !node->next)
361 printk(KERN_DEBUG "hfs_btree_del_level\n"); 367 dprint(DBG_BNODE_MOD, "hfs_btree_del_level\n");
362 }
363 if (!node->parent) { 368 if (!node->parent) {
364 tree->root = 0; 369 tree->root = 0;
365 tree->depth = 0; 370 tree->depth = 0;
@@ -379,16 +384,16 @@ struct hfs_bnode *hfs_bnode_findhash(struct hfs_btree *tree, u32 cnid)
379 struct hfs_bnode *node; 384 struct hfs_bnode *node;
380 385
381 if (cnid >= tree->node_count) { 386 if (cnid >= tree->node_count) {
382 printk(KERN_ERR "hfs: request for non-existent node %d in B*Tree\n", cnid); 387 printk(KERN_ERR "hfs: request for non-existent node "
388 "%d in B*Tree\n",
389 cnid);
383 return NULL; 390 return NULL;
384 } 391 }
385 392
386 for (node = tree->node_hash[hfs_bnode_hash(cnid)]; 393 for (node = tree->node_hash[hfs_bnode_hash(cnid)];
387 node; node = node->next_hash) { 394 node; node = node->next_hash)
388 if (node->this == cnid) { 395 if (node->this == cnid)
389 return node; 396 return node;
390 }
391 }
392 return NULL; 397 return NULL;
393} 398}
394 399
@@ -402,7 +407,9 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid)
402 loff_t off; 407 loff_t off;
403 408
404 if (cnid >= tree->node_count) { 409 if (cnid >= tree->node_count) {
405 printk(KERN_ERR "hfs: request for non-existent node %d in B*Tree\n", cnid); 410 printk(KERN_ERR "hfs: request for non-existent node "
411 "%d in B*Tree\n",
412 cnid);
406 return NULL; 413 return NULL;
407 } 414 }
408 415
@@ -429,7 +436,8 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid)
429 } else { 436 } else {
430 spin_unlock(&tree->hash_lock); 437 spin_unlock(&tree->hash_lock);
431 kfree(node); 438 kfree(node);
432 wait_event(node2->lock_wq, !test_bit(HFS_BNODE_NEW, &node2->flags)); 439 wait_event(node2->lock_wq,
440 !test_bit(HFS_BNODE_NEW, &node2->flags));
433 return node2; 441 return node2;
434 } 442 }
435 spin_unlock(&tree->hash_lock); 443 spin_unlock(&tree->hash_lock);
@@ -483,7 +491,8 @@ struct hfs_bnode *hfs_bnode_find(struct hfs_btree *tree, u32 num)
483 if (node) { 491 if (node) {
484 hfs_bnode_get(node); 492 hfs_bnode_get(node);
485 spin_unlock(&tree->hash_lock); 493 spin_unlock(&tree->hash_lock);
486 wait_event(node->lock_wq, !test_bit(HFS_BNODE_NEW, &node->flags)); 494 wait_event(node->lock_wq,
495 !test_bit(HFS_BNODE_NEW, &node->flags));
487 if (test_bit(HFS_BNODE_ERROR, &node->flags)) 496 if (test_bit(HFS_BNODE_ERROR, &node->flags))
488 goto node_error; 497 goto node_error;
489 return node; 498 return node;
@@ -497,7 +506,8 @@ struct hfs_bnode *hfs_bnode_find(struct hfs_btree *tree, u32 num)
497 if (!test_bit(HFS_BNODE_NEW, &node->flags)) 506 if (!test_bit(HFS_BNODE_NEW, &node->flags))
498 return node; 507 return node;
499 508
500 desc = (struct hfs_bnode_desc *)(kmap(node->page[0]) + node->page_offset); 509 desc = (struct hfs_bnode_desc *)(kmap(node->page[0]) +
510 node->page_offset);
501 node->prev = be32_to_cpu(desc->prev); 511 node->prev = be32_to_cpu(desc->prev);
502 node->next = be32_to_cpu(desc->next); 512 node->next = be32_to_cpu(desc->next);
503 node->num_recs = be16_to_cpu(desc->num_recs); 513 node->num_recs = be16_to_cpu(desc->num_recs);
@@ -556,11 +566,13 @@ node_error:
556 566
557void hfs_bnode_free(struct hfs_bnode *node) 567void hfs_bnode_free(struct hfs_bnode *node)
558{ 568{
559 //int i; 569#if 0
570 int i;
560 571
561 //for (i = 0; i < node->tree->pages_per_bnode; i++) 572 for (i = 0; i < node->tree->pages_per_bnode; i++)
562 // if (node->page[i]) 573 if (node->page[i])
563 // page_cache_release(node->page[i]); 574 page_cache_release(node->page[i]);
575#endif
564 kfree(node); 576 kfree(node);
565} 577}
566 578
@@ -607,7 +619,8 @@ void hfs_bnode_get(struct hfs_bnode *node)
607 if (node) { 619 if (node) {
608 atomic_inc(&node->refcnt); 620 atomic_inc(&node->refcnt);
609 dprint(DBG_BNODE_REFS, "get_node(%d:%d): %d\n", 621 dprint(DBG_BNODE_REFS, "get_node(%d:%d): %d\n",
610 node->tree->cnid, node->this, atomic_read(&node->refcnt)); 622 node->tree->cnid, node->this,
623 atomic_read(&node->refcnt));
611 } 624 }
612} 625}
613 626
@@ -619,7 +632,8 @@ void hfs_bnode_put(struct hfs_bnode *node)
619 int i; 632 int i;
620 633
621 dprint(DBG_BNODE_REFS, "put_node(%d:%d): %d\n", 634 dprint(DBG_BNODE_REFS, "put_node(%d:%d): %d\n",
622 node->tree->cnid, node->this, atomic_read(&node->refcnt)); 635 node->tree->cnid, node->this,
636 atomic_read(&node->refcnt));
623 BUG_ON(!atomic_read(&node->refcnt)); 637 BUG_ON(!atomic_read(&node->refcnt));
624 if (!atomic_dec_and_lock(&node->refcnt, &tree->hash_lock)) 638 if (!atomic_dec_and_lock(&node->refcnt, &tree->hash_lock))
625 return; 639 return;
diff --git a/fs/hfsplus/brec.c b/fs/hfsplus/brec.c
index 2f39d05443e..2312de34bd4 100644
--- a/fs/hfsplus/brec.c
+++ b/fs/hfsplus/brec.c
@@ -39,7 +39,8 @@ u16 hfs_brec_keylen(struct hfs_bnode *node, u16 rec)
39 !(node->tree->attributes & HFS_TREE_VARIDXKEYS)) { 39 !(node->tree->attributes & HFS_TREE_VARIDXKEYS)) {
40 retval = node->tree->max_key_len + 2; 40 retval = node->tree->max_key_len + 2;
41 } else { 41 } else {
42 recoff = hfs_bnode_read_u16(node, node->tree->node_size - (rec + 1) * 2); 42 recoff = hfs_bnode_read_u16(node,
43 node->tree->node_size - (rec + 1) * 2);
43 if (!recoff) 44 if (!recoff)
44 return 0; 45 return 0;
45 46
@@ -84,7 +85,8 @@ again:
84 end_rec_off = tree->node_size - (node->num_recs + 1) * 2; 85 end_rec_off = tree->node_size - (node->num_recs + 1) * 2;
85 end_off = hfs_bnode_read_u16(node, end_rec_off); 86 end_off = hfs_bnode_read_u16(node, end_rec_off);
86 end_rec_off -= 2; 87 end_rec_off -= 2;
87 dprint(DBG_BNODE_MOD, "insert_rec: %d, %d, %d, %d\n", rec, size, end_off, end_rec_off); 88 dprint(DBG_BNODE_MOD, "insert_rec: %d, %d, %d, %d\n",
89 rec, size, end_off, end_rec_off);
88 if (size > end_rec_off - end_off) { 90 if (size > end_rec_off - end_off) {
89 if (new_node) 91 if (new_node)
90 panic("not enough room!\n"); 92 panic("not enough room!\n");
@@ -99,7 +101,9 @@ again:
99 } 101 }
100 node->num_recs++; 102 node->num_recs++;
101 /* write new last offset */ 103 /* write new last offset */
102 hfs_bnode_write_u16(node, offsetof(struct hfs_bnode_desc, num_recs), node->num_recs); 104 hfs_bnode_write_u16(node,
105 offsetof(struct hfs_bnode_desc, num_recs),
106 node->num_recs);
103 hfs_bnode_write_u16(node, end_rec_off, end_off + size); 107 hfs_bnode_write_u16(node, end_rec_off, end_off + size);
104 data_off = end_off; 108 data_off = end_off;
105 data_rec_off = end_rec_off + 2; 109 data_rec_off = end_rec_off + 2;
@@ -151,7 +155,8 @@ skip:
151 if (tree->attributes & HFS_TREE_VARIDXKEYS) 155 if (tree->attributes & HFS_TREE_VARIDXKEYS)
152 key_len = be16_to_cpu(fd->search_key->key_len) + 2; 156 key_len = be16_to_cpu(fd->search_key->key_len) + 2;
153 else { 157 else {
154 fd->search_key->key_len = cpu_to_be16(tree->max_key_len); 158 fd->search_key->key_len =
159 cpu_to_be16(tree->max_key_len);
155 key_len = tree->max_key_len + 2; 160 key_len = tree->max_key_len + 2;
156 } 161 }
157 goto again; 162 goto again;
@@ -180,7 +185,8 @@ again:
180 mark_inode_dirty(tree->inode); 185 mark_inode_dirty(tree->inode);
181 } 186 }
182 hfs_bnode_dump(node); 187 hfs_bnode_dump(node);
183 dprint(DBG_BNODE_MOD, "remove_rec: %d, %d\n", fd->record, fd->keylength + fd->entrylength); 188 dprint(DBG_BNODE_MOD, "remove_rec: %d, %d\n",
189 fd->record, fd->keylength + fd->entrylength);
184 if (!--node->num_recs) { 190 if (!--node->num_recs) {
185 hfs_bnode_unlink(node); 191 hfs_bnode_unlink(node);
186 if (!node->parent) 192 if (!node->parent)
@@ -194,7 +200,9 @@ again:
194 __hfs_brec_find(node, fd); 200 __hfs_brec_find(node, fd);
195 goto again; 201 goto again;
196 } 202 }
197 hfs_bnode_write_u16(node, offsetof(struct hfs_bnode_desc, num_recs), node->num_recs); 203 hfs_bnode_write_u16(node,
204 offsetof(struct hfs_bnode_desc, num_recs),
205 node->num_recs);
198 206
199 if (rec_off == end_off) 207 if (rec_off == end_off)
200 goto skip; 208 goto skip;
@@ -364,7 +372,8 @@ again:
364 newkeylen = hfs_bnode_read_u16(node, 14) + 2; 372 newkeylen = hfs_bnode_read_u16(node, 14) + 2;
365 else 373 else
366 fd->keylength = newkeylen = tree->max_key_len + 2; 374 fd->keylength = newkeylen = tree->max_key_len + 2;
367 dprint(DBG_BNODE_MOD, "update_rec: %d, %d, %d\n", rec, fd->keylength, newkeylen); 375 dprint(DBG_BNODE_MOD, "update_rec: %d, %d, %d\n",
376 rec, fd->keylength, newkeylen);
368 377
369 rec_off = tree->node_size - (rec + 2) * 2; 378 rec_off = tree->node_size - (rec + 2) * 2;
370 end_rec_off = tree->node_size - (parent->num_recs + 1) * 2; 379 end_rec_off = tree->node_size - (parent->num_recs + 1) * 2;
@@ -375,7 +384,7 @@ again:
375 end_off = hfs_bnode_read_u16(parent, end_rec_off); 384 end_off = hfs_bnode_read_u16(parent, end_rec_off);
376 if (end_rec_off - end_off < diff) { 385 if (end_rec_off - end_off < diff) {
377 386
378 printk(KERN_DEBUG "hfs: splitting index node...\n"); 387 dprint(DBG_BNODE_MOD, "hfs: splitting index node.\n");
379 fd->bnode = parent; 388 fd->bnode = parent;
380 new_node = hfs_bnode_split(fd); 389 new_node = hfs_bnode_split(fd);
381 if (IS_ERR(new_node)) 390 if (IS_ERR(new_node))
@@ -383,7 +392,8 @@ again:
383 parent = fd->bnode; 392 parent = fd->bnode;
384 rec = fd->record; 393 rec = fd->record;
385 rec_off = tree->node_size - (rec + 2) * 2; 394 rec_off = tree->node_size - (rec + 2) * 2;
386 end_rec_off = tree->node_size - (parent->num_recs + 1) * 2; 395 end_rec_off = tree->node_size -
396 (parent->num_recs + 1) * 2;
387 } 397 }
388 } 398 }
389 399
diff --git a/fs/hfsplus/btree.c b/fs/hfsplus/btree.c
index 22e4d4e3299..21023d9f8ff 100644
--- a/fs/hfsplus/btree.c
+++ b/fs/hfsplus/btree.c
@@ -51,7 +51,8 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id)
51 goto free_inode; 51 goto free_inode;
52 52
53 /* Load the header */ 53 /* Load the header */
54 head = (struct hfs_btree_header_rec *)(kmap(page) + sizeof(struct hfs_bnode_desc)); 54 head = (struct hfs_btree_header_rec *)(kmap(page) +
55 sizeof(struct hfs_bnode_desc));
55 tree->root = be32_to_cpu(head->root); 56 tree->root = be32_to_cpu(head->root);
56 tree->leaf_count = be32_to_cpu(head->leaf_count); 57 tree->leaf_count = be32_to_cpu(head->leaf_count);
57 tree->leaf_head = be32_to_cpu(head->leaf_head); 58 tree->leaf_head = be32_to_cpu(head->leaf_head);
@@ -115,7 +116,9 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id)
115 116
116 tree->node_size_shift = ffs(size) - 1; 117 tree->node_size_shift = ffs(size) - 1;
117 118
118 tree->pages_per_bnode = (tree->node_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 119 tree->pages_per_bnode =
120 (tree->node_size + PAGE_CACHE_SIZE - 1) >>
121 PAGE_CACHE_SHIFT;
119 122
120 kunmap(page); 123 kunmap(page);
121 page_cache_release(page); 124 page_cache_release(page);
@@ -144,8 +147,10 @@ void hfs_btree_close(struct hfs_btree *tree)
144 while ((node = tree->node_hash[i])) { 147 while ((node = tree->node_hash[i])) {
145 tree->node_hash[i] = node->next_hash; 148 tree->node_hash[i] = node->next_hash;
146 if (atomic_read(&node->refcnt)) 149 if (atomic_read(&node->refcnt))
147 printk(KERN_CRIT "hfs: node %d:%d still has %d user(s)!\n", 150 printk(KERN_CRIT "hfs: node %d:%d "
148 node->tree->cnid, node->this, atomic_read(&node->refcnt)); 151 "still has %d user(s)!\n",
152 node->tree->cnid, node->this,
153 atomic_read(&node->refcnt));
149 hfs_bnode_free(node); 154 hfs_bnode_free(node);
150 tree->node_hash_cnt--; 155 tree->node_hash_cnt--;
151 } 156 }
@@ -166,7 +171,8 @@ void hfs_btree_write(struct hfs_btree *tree)
166 return; 171 return;
167 /* Load the header */ 172 /* Load the header */
168 page = node->page[0]; 173 page = node->page[0];
169 head = (struct hfs_btree_header_rec *)(kmap(page) + sizeof(struct hfs_bnode_desc)); 174 head = (struct hfs_btree_header_rec *)(kmap(page) +
175 sizeof(struct hfs_bnode_desc));
170 176
171 head->root = cpu_to_be32(tree->root); 177 head->root = cpu_to_be32(tree->root);
172 head->leaf_count = cpu_to_be32(tree->leaf_count); 178 head->leaf_count = cpu_to_be32(tree->leaf_count);
@@ -272,7 +278,8 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree)
272 tree->free_nodes--; 278 tree->free_nodes--;
273 mark_inode_dirty(tree->inode); 279 mark_inode_dirty(tree->inode);
274 hfs_bnode_put(node); 280 hfs_bnode_put(node);
275 return hfs_bnode_create(tree, idx); 281 return hfs_bnode_create(tree,
282 idx);
276 } 283 }
277 } 284 }
278 } 285 }
@@ -287,7 +294,7 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree)
287 kunmap(*pagep); 294 kunmap(*pagep);
288 nidx = node->next; 295 nidx = node->next;
289 if (!nidx) { 296 if (!nidx) {
290 printk(KERN_DEBUG "hfs: create new bmap node...\n"); 297 dprint(DBG_BNODE_MOD, "hfs: create new bmap node.\n");
291 next_node = hfs_bmap_new_bmap(node, idx); 298 next_node = hfs_bmap_new_bmap(node, idx);
292 } else 299 } else
293 next_node = hfs_bnode_find(tree, nidx); 300 next_node = hfs_bnode_find(tree, nidx);
@@ -329,7 +336,9 @@ void hfs_bmap_free(struct hfs_bnode *node)
329 hfs_bnode_put(node); 336 hfs_bnode_put(node);
330 if (!i) { 337 if (!i) {
331 /* panic */; 338 /* panic */;
332 printk(KERN_CRIT "hfs: unable to free bnode %u. bmap not found!\n", node->this); 339 printk(KERN_CRIT "hfs: unable to free bnode %u. "
340 "bmap not found!\n",
341 node->this);
333 return; 342 return;
334 } 343 }
335 node = hfs_bnode_find(tree, i); 344 node = hfs_bnode_find(tree, i);
@@ -337,7 +346,9 @@ void hfs_bmap_free(struct hfs_bnode *node)
337 return; 346 return;
338 if (node->type != HFS_NODE_MAP) { 347 if (node->type != HFS_NODE_MAP) {
339 /* panic */; 348 /* panic */;
340 printk(KERN_CRIT "hfs: invalid bmap found! (%u,%d)\n", node->this, node->type); 349 printk(KERN_CRIT "hfs: invalid bmap found! "
350 "(%u,%d)\n",
351 node->this, node->type);
341 hfs_bnode_put(node); 352 hfs_bnode_put(node);
342 return; 353 return;
343 } 354 }
@@ -350,7 +361,9 @@ void hfs_bmap_free(struct hfs_bnode *node)
350 m = 1 << (~nidx & 7); 361 m = 1 << (~nidx & 7);
351 byte = data[off]; 362 byte = data[off];
352 if (!(byte & m)) { 363 if (!(byte & m)) {
353 printk(KERN_CRIT "hfs: trying to free free bnode %u(%d)\n", node->this, node->type); 364 printk(KERN_CRIT "hfs: trying to free free bnode "
365 "%u(%d)\n",
366 node->this, node->type);
354 kunmap(page); 367 kunmap(page);
355 hfs_bnode_put(node); 368 hfs_bnode_put(node);
356 return; 369 return;
diff --git a/fs/hfsplus/catalog.c b/fs/hfsplus/catalog.c
index 8af45fc5b05..b4ba1b31933 100644
--- a/fs/hfsplus/catalog.c
+++ b/fs/hfsplus/catalog.c
@@ -91,7 +91,8 @@ void hfsplus_cat_set_perms(struct inode *inode, struct hfsplus_perm *perms)
91 perms->dev = 0; 91 perms->dev = 0;
92} 92}
93 93
94static int hfsplus_cat_build_record(hfsplus_cat_entry *entry, u32 cnid, struct inode *inode) 94static int hfsplus_cat_build_record(hfsplus_cat_entry *entry,
95 u32 cnid, struct inode *inode)
95{ 96{
96 struct hfsplus_sb_info *sbi = HFSPLUS_SB(inode->i_sb); 97 struct hfsplus_sb_info *sbi = HFSPLUS_SB(inode->i_sb);
97 98
@@ -128,20 +129,32 @@ static int hfsplus_cat_build_record(hfsplus_cat_entry *entry, u32 cnid, struct i
128 if (cnid == inode->i_ino) { 129 if (cnid == inode->i_ino) {
129 hfsplus_cat_set_perms(inode, &file->permissions); 130 hfsplus_cat_set_perms(inode, &file->permissions);
130 if (S_ISLNK(inode->i_mode)) { 131 if (S_ISLNK(inode->i_mode)) {
131 file->user_info.fdType = cpu_to_be32(HFSP_SYMLINK_TYPE); 132 file->user_info.fdType =
132 file->user_info.fdCreator = cpu_to_be32(HFSP_SYMLINK_CREATOR); 133 cpu_to_be32(HFSP_SYMLINK_TYPE);
134 file->user_info.fdCreator =
135 cpu_to_be32(HFSP_SYMLINK_CREATOR);
133 } else { 136 } else {
134 file->user_info.fdType = cpu_to_be32(sbi->type); 137 file->user_info.fdType =
135 file->user_info.fdCreator = cpu_to_be32(sbi->creator); 138 cpu_to_be32(sbi->type);
139 file->user_info.fdCreator =
140 cpu_to_be32(sbi->creator);
136 } 141 }
137 if ((file->permissions.rootflags | file->permissions.userflags) & HFSPLUS_FLG_IMMUTABLE) 142 if (HFSPLUS_FLG_IMMUTABLE &
138 file->flags |= cpu_to_be16(HFSPLUS_FILE_LOCKED); 143 (file->permissions.rootflags |
144 file->permissions.userflags))
145 file->flags |=
146 cpu_to_be16(HFSPLUS_FILE_LOCKED);
139 } else { 147 } else {
140 file->user_info.fdType = cpu_to_be32(HFSP_HARDLINK_TYPE); 148 file->user_info.fdType =
141 file->user_info.fdCreator = cpu_to_be32(HFSP_HFSPLUS_CREATOR); 149 cpu_to_be32(HFSP_HARDLINK_TYPE);
142 file->user_info.fdFlags = cpu_to_be16(0x100); 150 file->user_info.fdCreator =
143 file->create_date = HFSPLUS_I(sbi->hidden_dir)->create_date; 151 cpu_to_be32(HFSP_HFSPLUS_CREATOR);
144 file->permissions.dev = cpu_to_be32(HFSPLUS_I(inode)->linkid); 152 file->user_info.fdFlags =
153 cpu_to_be16(0x100);
154 file->create_date =
155 HFSPLUS_I(sbi->hidden_dir)->create_date;
156 file->permissions.dev =
157 cpu_to_be32(HFSPLUS_I(inode)->linkid);
145 } 158 }
146 return sizeof(*file); 159 return sizeof(*file);
147 } 160 }
@@ -182,12 +195,14 @@ int hfsplus_find_cat(struct super_block *sb, u32 cnid,
182 return -EIO; 195 return -EIO;
183 } 196 }
184 197
185 hfsplus_cat_build_key_uni(fd->search_key, be32_to_cpu(tmp.thread.parentID), 198 hfsplus_cat_build_key_uni(fd->search_key,
186 &tmp.thread.nodeName); 199 be32_to_cpu(tmp.thread.parentID),
200 &tmp.thread.nodeName);
187 return hfs_brec_find(fd); 201 return hfs_brec_find(fd);
188} 202}
189 203
190int hfsplus_create_cat(u32 cnid, struct inode *dir, struct qstr *str, struct inode *inode) 204int hfsplus_create_cat(u32 cnid, struct inode *dir,
205 struct qstr *str, struct inode *inode)
191{ 206{
192 struct super_block *sb = dir->i_sb; 207 struct super_block *sb = dir->i_sb;
193 struct hfs_find_data fd; 208 struct hfs_find_data fd;
@@ -195,13 +210,15 @@ int hfsplus_create_cat(u32 cnid, struct inode *dir, struct qstr *str, struct ino
195 int entry_size; 210 int entry_size;
196 int err; 211 int err;
197 212
198 dprint(DBG_CAT_MOD, "create_cat: %s,%u(%d)\n", str->name, cnid, inode->i_nlink); 213 dprint(DBG_CAT_MOD, "create_cat: %s,%u(%d)\n",
214 str->name, cnid, inode->i_nlink);
199 hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd); 215 hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd);
200 216
201 hfsplus_cat_build_key(sb, fd.search_key, cnid, NULL); 217 hfsplus_cat_build_key(sb, fd.search_key, cnid, NULL);
202 entry_size = hfsplus_fill_cat_thread(sb, &entry, S_ISDIR(inode->i_mode) ? 218 entry_size = hfsplus_fill_cat_thread(sb, &entry,
219 S_ISDIR(inode->i_mode) ?
203 HFSPLUS_FOLDER_THREAD : HFSPLUS_FILE_THREAD, 220 HFSPLUS_FOLDER_THREAD : HFSPLUS_FILE_THREAD,
204 dir->i_ino, str); 221 dir->i_ino, str);
205 err = hfs_brec_find(&fd); 222 err = hfs_brec_find(&fd);
206 if (err != -ENOENT) { 223 if (err != -ENOENT) {
207 if (!err) 224 if (!err)
@@ -227,7 +244,8 @@ int hfsplus_create_cat(u32 cnid, struct inode *dir, struct qstr *str, struct ino
227 244
228 dir->i_size++; 245 dir->i_size++;
229 dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC; 246 dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC;
230 mark_inode_dirty(dir); 247 hfsplus_mark_inode_dirty(dir, HFSPLUS_I_CAT_DIRTY);
248
231 hfs_find_exit(&fd); 249 hfs_find_exit(&fd);
232 return 0; 250 return 0;
233 251
@@ -249,7 +267,8 @@ int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str)
249 int err, off; 267 int err, off;
250 u16 type; 268 u16 type;
251 269
252 dprint(DBG_CAT_MOD, "delete_cat: %s,%u\n", str ? str->name : NULL, cnid); 270 dprint(DBG_CAT_MOD, "delete_cat: %s,%u\n",
271 str ? str->name : NULL, cnid);
253 hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd); 272 hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd);
254 273
255 if (!str) { 274 if (!str) {
@@ -260,11 +279,15 @@ int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str)
260 if (err) 279 if (err)
261 goto out; 280 goto out;
262 281
263 off = fd.entryoffset + offsetof(struct hfsplus_cat_thread, nodeName); 282 off = fd.entryoffset +
283 offsetof(struct hfsplus_cat_thread, nodeName);
264 fd.search_key->cat.parent = cpu_to_be32(dir->i_ino); 284 fd.search_key->cat.parent = cpu_to_be32(dir->i_ino);
265 hfs_bnode_read(fd.bnode, &fd.search_key->cat.name.length, off, 2); 285 hfs_bnode_read(fd.bnode,
286 &fd.search_key->cat.name.length, off, 2);
266 len = be16_to_cpu(fd.search_key->cat.name.length) * 2; 287 len = be16_to_cpu(fd.search_key->cat.name.length) * 2;
267 hfs_bnode_read(fd.bnode, &fd.search_key->cat.name.unicode, off + 2, len); 288 hfs_bnode_read(fd.bnode,
289 &fd.search_key->cat.name.unicode,
290 off + 2, len);
268 fd.search_key->key_len = cpu_to_be16(6 + len); 291 fd.search_key->key_len = cpu_to_be16(6 + len);
269 } else 292 } else
270 hfsplus_cat_build_key(sb, fd.search_key, dir->i_ino, str); 293 hfsplus_cat_build_key(sb, fd.search_key, dir->i_ino, str);
@@ -281,7 +304,8 @@ int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str)
281 hfsplus_free_fork(sb, cnid, &fork, HFSPLUS_TYPE_DATA); 304 hfsplus_free_fork(sb, cnid, &fork, HFSPLUS_TYPE_DATA);
282#endif 305#endif
283 306
284 off = fd.entryoffset + offsetof(struct hfsplus_cat_file, rsrc_fork); 307 off = fd.entryoffset +
308 offsetof(struct hfsplus_cat_file, rsrc_fork);
285 hfs_bnode_read(fd.bnode, &fork, off, sizeof(fork)); 309 hfs_bnode_read(fd.bnode, &fork, off, sizeof(fork));
286 hfsplus_free_fork(sb, cnid, &fork, HFSPLUS_TYPE_RSRC); 310 hfsplus_free_fork(sb, cnid, &fork, HFSPLUS_TYPE_RSRC);
287 } 311 }
@@ -308,7 +332,7 @@ int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str)
308 332
309 dir->i_size--; 333 dir->i_size--;
310 dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC; 334 dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC;
311 mark_inode_dirty(dir); 335 hfsplus_mark_inode_dirty(dir, HFSPLUS_I_CAT_DIRTY);
312out: 336out:
313 hfs_find_exit(&fd); 337 hfs_find_exit(&fd);
314 338
@@ -325,7 +349,8 @@ int hfsplus_rename_cat(u32 cnid,
325 int entry_size, type; 349 int entry_size, type;
326 int err = 0; 350 int err = 0;
327 351
328 dprint(DBG_CAT_MOD, "rename_cat: %u - %lu,%s - %lu,%s\n", cnid, src_dir->i_ino, src_name->name, 352 dprint(DBG_CAT_MOD, "rename_cat: %u - %lu,%s - %lu,%s\n",
353 cnid, src_dir->i_ino, src_name->name,
329 dst_dir->i_ino, dst_name->name); 354 dst_dir->i_ino, dst_name->name);
330 hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &src_fd); 355 hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &src_fd);
331 dst_fd = src_fd; 356 dst_fd = src_fd;
@@ -353,7 +378,6 @@ int hfsplus_rename_cat(u32 cnid,
353 goto out; 378 goto out;
354 dst_dir->i_size++; 379 dst_dir->i_size++;
355 dst_dir->i_mtime = dst_dir->i_ctime = CURRENT_TIME_SEC; 380 dst_dir->i_mtime = dst_dir->i_ctime = CURRENT_TIME_SEC;
356 mark_inode_dirty(dst_dir);
357 381
358 /* finally remove the old entry */ 382 /* finally remove the old entry */
359 hfsplus_cat_build_key(sb, src_fd.search_key, src_dir->i_ino, src_name); 383 hfsplus_cat_build_key(sb, src_fd.search_key, src_dir->i_ino, src_name);
@@ -365,7 +389,6 @@ int hfsplus_rename_cat(u32 cnid,
365 goto out; 389 goto out;
366 src_dir->i_size--; 390 src_dir->i_size--;
367 src_dir->i_mtime = src_dir->i_ctime = CURRENT_TIME_SEC; 391 src_dir->i_mtime = src_dir->i_ctime = CURRENT_TIME_SEC;
368 mark_inode_dirty(src_dir);
369 392
370 /* remove old thread entry */ 393 /* remove old thread entry */
371 hfsplus_cat_build_key(sb, src_fd.search_key, cnid, NULL); 394 hfsplus_cat_build_key(sb, src_fd.search_key, cnid, NULL);
@@ -379,7 +402,8 @@ int hfsplus_rename_cat(u32 cnid,
379 402
380 /* create new thread entry */ 403 /* create new thread entry */
381 hfsplus_cat_build_key(sb, dst_fd.search_key, cnid, NULL); 404 hfsplus_cat_build_key(sb, dst_fd.search_key, cnid, NULL);
382 entry_size = hfsplus_fill_cat_thread(sb, &entry, type, dst_dir->i_ino, dst_name); 405 entry_size = hfsplus_fill_cat_thread(sb, &entry, type,
406 dst_dir->i_ino, dst_name);
383 err = hfs_brec_find(&dst_fd); 407 err = hfs_brec_find(&dst_fd);
384 if (err != -ENOENT) { 408 if (err != -ENOENT) {
385 if (!err) 409 if (!err)
@@ -387,6 +411,9 @@ int hfsplus_rename_cat(u32 cnid,
387 goto out; 411 goto out;
388 } 412 }
389 err = hfs_brec_insert(&dst_fd, &entry, entry_size); 413 err = hfs_brec_insert(&dst_fd, &entry, entry_size);
414
415 hfsplus_mark_inode_dirty(dst_dir, HFSPLUS_I_CAT_DIRTY);
416 hfsplus_mark_inode_dirty(src_dir, HFSPLUS_I_CAT_DIRTY);
390out: 417out:
391 hfs_bnode_put(dst_fd.bnode); 418 hfs_bnode_put(dst_fd.bnode);
392 hfs_find_exit(&src_fd); 419 hfs_find_exit(&src_fd);
diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c
index 9d59c0571f5..4df5059c25d 100644
--- a/fs/hfsplus/dir.c
+++ b/fs/hfsplus/dir.c
@@ -37,7 +37,6 @@ static struct dentry *hfsplus_lookup(struct inode *dir, struct dentry *dentry,
37 37
38 sb = dir->i_sb; 38 sb = dir->i_sb;
39 39
40 dentry->d_op = &hfsplus_dentry_operations;
41 dentry->d_fsdata = NULL; 40 dentry->d_fsdata = NULL;
42 hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd); 41 hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd);
43 hfsplus_cat_build_key(sb, fd.search_key, dir->i_ino, &dentry->d_name); 42 hfsplus_cat_build_key(sb, fd.search_key, dir->i_ino, &dentry->d_name);
@@ -66,11 +65,17 @@ again:
66 goto fail; 65 goto fail;
67 } 66 }
68 cnid = be32_to_cpu(entry.file.id); 67 cnid = be32_to_cpu(entry.file.id);
69 if (entry.file.user_info.fdType == cpu_to_be32(HFSP_HARDLINK_TYPE) && 68 if (entry.file.user_info.fdType ==
70 entry.file.user_info.fdCreator == cpu_to_be32(HFSP_HFSPLUS_CREATOR) && 69 cpu_to_be32(HFSP_HARDLINK_TYPE) &&
71 (entry.file.create_date == HFSPLUS_I(HFSPLUS_SB(sb)->hidden_dir)->create_date || 70 entry.file.user_info.fdCreator ==
72 entry.file.create_date == HFSPLUS_I(sb->s_root->d_inode)->create_date) && 71 cpu_to_be32(HFSP_HFSPLUS_CREATOR) &&
73 HFSPLUS_SB(sb)->hidden_dir) { 72 (entry.file.create_date ==
73 HFSPLUS_I(HFSPLUS_SB(sb)->hidden_dir)->
74 create_date ||
75 entry.file.create_date ==
76 HFSPLUS_I(sb->s_root->d_inode)->
77 create_date) &&
78 HFSPLUS_SB(sb)->hidden_dir) {
74 struct qstr str; 79 struct qstr str;
75 char name[32]; 80 char name[32];
76 81
@@ -83,11 +88,13 @@ again:
83 linkid = 0; 88 linkid = 0;
84 } else { 89 } else {
85 dentry->d_fsdata = (void *)(unsigned long)cnid; 90 dentry->d_fsdata = (void *)(unsigned long)cnid;
86 linkid = be32_to_cpu(entry.file.permissions.dev); 91 linkid =
92 be32_to_cpu(entry.file.permissions.dev);
87 str.len = sprintf(name, "iNode%d", linkid); 93 str.len = sprintf(name, "iNode%d", linkid);
88 str.name = name; 94 str.name = name;
89 hfsplus_cat_build_key(sb, fd.search_key, 95 hfsplus_cat_build_key(sb, fd.search_key,
90 HFSPLUS_SB(sb)->hidden_dir->i_ino, &str); 96 HFSPLUS_SB(sb)->hidden_dir->i_ino,
97 &str);
91 goto again; 98 goto again;
92 } 99 }
93 } else if (!dentry->d_fsdata) 100 } else if (!dentry->d_fsdata)
@@ -139,7 +146,8 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir)
139 filp->f_pos++; 146 filp->f_pos++;
140 /* fall through */ 147 /* fall through */
141 case 1: 148 case 1:
142 hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, fd.entrylength); 149 hfs_bnode_read(fd.bnode, &entry, fd.entryoffset,
150 fd.entrylength);
143 if (be16_to_cpu(entry.type) != HFSPLUS_FOLDER_THREAD) { 151 if (be16_to_cpu(entry.type) != HFSPLUS_FOLDER_THREAD) {
144 printk(KERN_ERR "hfs: bad catalog folder thread\n"); 152 printk(KERN_ERR "hfs: bad catalog folder thread\n");
145 err = -EIO; 153 err = -EIO;
@@ -169,14 +177,16 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir)
169 err = -EIO; 177 err = -EIO;
170 goto out; 178 goto out;
171 } 179 }
172 hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, fd.entrylength); 180 hfs_bnode_read(fd.bnode, &entry, fd.entryoffset,
181 fd.entrylength);
173 type = be16_to_cpu(entry.type); 182 type = be16_to_cpu(entry.type);
174 len = HFSPLUS_MAX_STRLEN; 183 len = HFSPLUS_MAX_STRLEN;
175 err = hfsplus_uni2asc(sb, &fd.key->cat.name, strbuf, &len); 184 err = hfsplus_uni2asc(sb, &fd.key->cat.name, strbuf, &len);
176 if (err) 185 if (err)
177 goto out; 186 goto out;
178 if (type == HFSPLUS_FOLDER) { 187 if (type == HFSPLUS_FOLDER) {
179 if (fd.entrylength < sizeof(struct hfsplus_cat_folder)) { 188 if (fd.entrylength <
189 sizeof(struct hfsplus_cat_folder)) {
180 printk(KERN_ERR "hfs: small dir entry\n"); 190 printk(KERN_ERR "hfs: small dir entry\n");
181 err = -EIO; 191 err = -EIO;
182 goto out; 192 goto out;
@@ -202,7 +212,7 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir)
202 err = -EIO; 212 err = -EIO;
203 goto out; 213 goto out;
204 } 214 }
205 next: 215next:
206 filp->f_pos++; 216 filp->f_pos++;
207 if (filp->f_pos >= inode->i_size) 217 if (filp->f_pos >= inode->i_size)
208 goto out; 218 goto out;
@@ -273,7 +283,8 @@ static int hfsplus_link(struct dentry *src_dentry, struct inode *dst_dir,
273 HFSPLUS_I(inode)->linkid = id; 283 HFSPLUS_I(inode)->linkid = id;
274 cnid = sbi->next_cnid++; 284 cnid = sbi->next_cnid++;
275 src_dentry->d_fsdata = (void *)(unsigned long)cnid; 285 src_dentry->d_fsdata = (void *)(unsigned long)cnid;
276 res = hfsplus_create_cat(cnid, src_dir, &src_dentry->d_name, inode); 286 res = hfsplus_create_cat(cnid, src_dir,
287 &src_dentry->d_name, inode);
277 if (res) 288 if (res)
278 /* panic? */ 289 /* panic? */
279 goto out; 290 goto out;
@@ -485,6 +496,7 @@ const struct inode_operations hfsplus_dir_inode_operations = {
485}; 496};
486 497
487const struct file_operations hfsplus_dir_operations = { 498const struct file_operations hfsplus_dir_operations = {
499 .fsync = hfsplus_file_fsync,
488 .read = generic_read_dir, 500 .read = generic_read_dir,
489 .readdir = hfsplus_readdir, 501 .readdir = hfsplus_readdir,
490 .unlocked_ioctl = hfsplus_ioctl, 502 .unlocked_ioctl = hfsplus_ioctl,
diff --git a/fs/hfsplus/extents.c b/fs/hfsplus/extents.c
index 0c9cb1820a5..52a0bcaa7b6 100644
--- a/fs/hfsplus/extents.c
+++ b/fs/hfsplus/extents.c
@@ -83,7 +83,8 @@ static u32 hfsplus_ext_lastblock(struct hfsplus_extent *ext)
83 return be32_to_cpu(ext->start_block) + be32_to_cpu(ext->block_count); 83 return be32_to_cpu(ext->start_block) + be32_to_cpu(ext->block_count);
84} 84}
85 85
86static void __hfsplus_ext_write_extent(struct inode *inode, struct hfs_find_data *fd) 86static void __hfsplus_ext_write_extent(struct inode *inode,
87 struct hfs_find_data *fd)
87{ 88{
88 struct hfsplus_inode_info *hip = HFSPLUS_I(inode); 89 struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
89 int res; 90 int res;
@@ -95,24 +96,32 @@ static void __hfsplus_ext_write_extent(struct inode *inode, struct hfs_find_data
95 HFSPLUS_TYPE_RSRC : HFSPLUS_TYPE_DATA); 96 HFSPLUS_TYPE_RSRC : HFSPLUS_TYPE_DATA);
96 97
97 res = hfs_brec_find(fd); 98 res = hfs_brec_find(fd);
98 if (hip->flags & HFSPLUS_FLG_EXT_NEW) { 99 if (hip->extent_state & HFSPLUS_EXT_NEW) {
99 if (res != -ENOENT) 100 if (res != -ENOENT)
100 return; 101 return;
101 hfs_brec_insert(fd, hip->cached_extents, 102 hfs_brec_insert(fd, hip->cached_extents,
102 sizeof(hfsplus_extent_rec)); 103 sizeof(hfsplus_extent_rec));
103 hip->flags &= ~(HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW); 104 hip->extent_state &= ~(HFSPLUS_EXT_DIRTY | HFSPLUS_EXT_NEW);
104 } else { 105 } else {
105 if (res) 106 if (res)
106 return; 107 return;
107 hfs_bnode_write(fd->bnode, hip->cached_extents, 108 hfs_bnode_write(fd->bnode, hip->cached_extents,
108 fd->entryoffset, fd->entrylength); 109 fd->entryoffset, fd->entrylength);
109 hip->flags &= ~HFSPLUS_FLG_EXT_DIRTY; 110 hip->extent_state &= ~HFSPLUS_EXT_DIRTY;
110 } 111 }
112
113 /*
114 * We can't just use hfsplus_mark_inode_dirty here, because we
115 * also get called from hfsplus_write_inode, which should not
116 * redirty the inode. Instead the callers have to be careful
117 * to explicitly mark the inode dirty, too.
118 */
119 set_bit(HFSPLUS_I_EXT_DIRTY, &hip->flags);
111} 120}
112 121
113static void hfsplus_ext_write_extent_locked(struct inode *inode) 122static void hfsplus_ext_write_extent_locked(struct inode *inode)
114{ 123{
115 if (HFSPLUS_I(inode)->flags & HFSPLUS_FLG_EXT_DIRTY) { 124 if (HFSPLUS_I(inode)->extent_state & HFSPLUS_EXT_DIRTY) {
116 struct hfs_find_data fd; 125 struct hfs_find_data fd;
117 126
118 hfs_find_init(HFSPLUS_SB(inode->i_sb)->ext_tree, &fd); 127 hfs_find_init(HFSPLUS_SB(inode->i_sb)->ext_tree, &fd);
@@ -144,18 +153,20 @@ static inline int __hfsplus_ext_read_extent(struct hfs_find_data *fd,
144 return -ENOENT; 153 return -ENOENT;
145 if (fd->entrylength != sizeof(hfsplus_extent_rec)) 154 if (fd->entrylength != sizeof(hfsplus_extent_rec))
146 return -EIO; 155 return -EIO;
147 hfs_bnode_read(fd->bnode, extent, fd->entryoffset, sizeof(hfsplus_extent_rec)); 156 hfs_bnode_read(fd->bnode, extent, fd->entryoffset,
157 sizeof(hfsplus_extent_rec));
148 return 0; 158 return 0;
149} 159}
150 160
151static inline int __hfsplus_ext_cache_extent(struct hfs_find_data *fd, struct inode *inode, u32 block) 161static inline int __hfsplus_ext_cache_extent(struct hfs_find_data *fd,
162 struct inode *inode, u32 block)
152{ 163{
153 struct hfsplus_inode_info *hip = HFSPLUS_I(inode); 164 struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
154 int res; 165 int res;
155 166
156 WARN_ON(!mutex_is_locked(&hip->extents_lock)); 167 WARN_ON(!mutex_is_locked(&hip->extents_lock));
157 168
158 if (hip->flags & HFSPLUS_FLG_EXT_DIRTY) 169 if (hip->extent_state & HFSPLUS_EXT_DIRTY)
159 __hfsplus_ext_write_extent(inode, fd); 170 __hfsplus_ext_write_extent(inode, fd);
160 171
161 res = __hfsplus_ext_read_extent(fd, hip->cached_extents, inode->i_ino, 172 res = __hfsplus_ext_read_extent(fd, hip->cached_extents, inode->i_ino,
@@ -164,10 +175,11 @@ static inline int __hfsplus_ext_cache_extent(struct hfs_find_data *fd, struct in
164 HFSPLUS_TYPE_DATA); 175 HFSPLUS_TYPE_DATA);
165 if (!res) { 176 if (!res) {
166 hip->cached_start = be32_to_cpu(fd->key->ext.start_block); 177 hip->cached_start = be32_to_cpu(fd->key->ext.start_block);
167 hip->cached_blocks = hfsplus_ext_block_count(hip->cached_extents); 178 hip->cached_blocks =
179 hfsplus_ext_block_count(hip->cached_extents);
168 } else { 180 } else {
169 hip->cached_start = hip->cached_blocks = 0; 181 hip->cached_start = hip->cached_blocks = 0;
170 hip->flags &= ~(HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW); 182 hip->extent_state &= ~(HFSPLUS_EXT_DIRTY | HFSPLUS_EXT_NEW);
171 } 183 }
172 return res; 184 return res;
173} 185}
@@ -197,6 +209,7 @@ int hfsplus_get_block(struct inode *inode, sector_t iblock,
197 struct hfsplus_inode_info *hip = HFSPLUS_I(inode); 209 struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
198 int res = -EIO; 210 int res = -EIO;
199 u32 ablock, dblock, mask; 211 u32 ablock, dblock, mask;
212 int was_dirty = 0;
200 int shift; 213 int shift;
201 214
202 /* Convert inode block to disk allocation block */ 215 /* Convert inode block to disk allocation block */
@@ -223,27 +236,37 @@ int hfsplus_get_block(struct inode *inode, sector_t iblock,
223 return -EIO; 236 return -EIO;
224 237
225 mutex_lock(&hip->extents_lock); 238 mutex_lock(&hip->extents_lock);
239
240 /*
241 * hfsplus_ext_read_extent will write out a cached extent into
242 * the extents btree. In that case we may have to mark the inode
243 * dirty even for a pure read of an extent here.
244 */
245 was_dirty = (hip->extent_state & HFSPLUS_EXT_DIRTY);
226 res = hfsplus_ext_read_extent(inode, ablock); 246 res = hfsplus_ext_read_extent(inode, ablock);
227 if (!res) { 247 if (res) {
228 dblock = hfsplus_ext_find_block(hip->cached_extents,
229 ablock - hip->cached_start);
230 } else {
231 mutex_unlock(&hip->extents_lock); 248 mutex_unlock(&hip->extents_lock);
232 return -EIO; 249 return -EIO;
233 } 250 }
251 dblock = hfsplus_ext_find_block(hip->cached_extents,
252 ablock - hip->cached_start);
234 mutex_unlock(&hip->extents_lock); 253 mutex_unlock(&hip->extents_lock);
235 254
236done: 255done:
237 dprint(DBG_EXTENT, "get_block(%lu): %llu - %u\n", inode->i_ino, (long long)iblock, dblock); 256 dprint(DBG_EXTENT, "get_block(%lu): %llu - %u\n",
257 inode->i_ino, (long long)iblock, dblock);
238 mask = (1 << sbi->fs_shift) - 1; 258 mask = (1 << sbi->fs_shift) - 1;
239 map_bh(bh_result, sb, (dblock << sbi->fs_shift) + sbi->blockoffset + (iblock & mask)); 259 map_bh(bh_result, sb,
260 (dblock << sbi->fs_shift) + sbi->blockoffset +
261 (iblock & mask));
240 if (create) { 262 if (create) {
241 set_buffer_new(bh_result); 263 set_buffer_new(bh_result);
242 hip->phys_size += sb->s_blocksize; 264 hip->phys_size += sb->s_blocksize;
243 hip->fs_blocks++; 265 hip->fs_blocks++;
244 inode_add_bytes(inode, sb->s_blocksize); 266 inode_add_bytes(inode, sb->s_blocksize);
245 mark_inode_dirty(inode);
246 } 267 }
268 if (create || was_dirty)
269 mark_inode_dirty(inode);
247 return 0; 270 return 0;
248} 271}
249 272
@@ -326,7 +349,8 @@ found:
326 } 349 }
327} 350}
328 351
329int hfsplus_free_fork(struct super_block *sb, u32 cnid, struct hfsplus_fork_raw *fork, int type) 352int hfsplus_free_fork(struct super_block *sb, u32 cnid,
353 struct hfsplus_fork_raw *fork, int type)
330{ 354{
331 struct hfs_find_data fd; 355 struct hfs_find_data fd;
332 hfsplus_extent_rec ext_entry; 356 hfsplus_extent_rec ext_entry;
@@ -373,12 +397,13 @@ int hfsplus_file_extend(struct inode *inode)
373 u32 start, len, goal; 397 u32 start, len, goal;
374 int res; 398 int res;
375 399
376 if (sbi->alloc_file->i_size * 8 < 400 if (sbi->total_blocks - sbi->free_blocks + 8 >
377 sbi->total_blocks - sbi->free_blocks + 8) { 401 sbi->alloc_file->i_size * 8) {
378 // extend alloc file 402 /* extend alloc file */
379 printk(KERN_ERR "hfs: extend alloc file! (%Lu,%u,%u)\n", 403 printk(KERN_ERR "hfs: extend alloc file! "
380 sbi->alloc_file->i_size * 8, 404 "(%llu,%u,%u)\n",
381 sbi->total_blocks, sbi->free_blocks); 405 sbi->alloc_file->i_size * 8,
406 sbi->total_blocks, sbi->free_blocks);
382 return -ENOSPC; 407 return -ENOSPC;
383 } 408 }
384 409
@@ -429,7 +454,7 @@ int hfsplus_file_extend(struct inode *inode)
429 start, len); 454 start, len);
430 if (!res) { 455 if (!res) {
431 hfsplus_dump_extent(hip->cached_extents); 456 hfsplus_dump_extent(hip->cached_extents);
432 hip->flags |= HFSPLUS_FLG_EXT_DIRTY; 457 hip->extent_state |= HFSPLUS_EXT_DIRTY;
433 hip->cached_blocks += len; 458 hip->cached_blocks += len;
434 } else if (res == -ENOSPC) 459 } else if (res == -ENOSPC)
435 goto insert_extent; 460 goto insert_extent;
@@ -438,7 +463,7 @@ out:
438 mutex_unlock(&hip->extents_lock); 463 mutex_unlock(&hip->extents_lock);
439 if (!res) { 464 if (!res) {
440 hip->alloc_blocks += len; 465 hip->alloc_blocks += len;
441 mark_inode_dirty(inode); 466 hfsplus_mark_inode_dirty(inode, HFSPLUS_I_ALLOC_DIRTY);
442 } 467 }
443 return res; 468 return res;
444 469
@@ -450,7 +475,7 @@ insert_extent:
450 hip->cached_extents[0].start_block = cpu_to_be32(start); 475 hip->cached_extents[0].start_block = cpu_to_be32(start);
451 hip->cached_extents[0].block_count = cpu_to_be32(len); 476 hip->cached_extents[0].block_count = cpu_to_be32(len);
452 hfsplus_dump_extent(hip->cached_extents); 477 hfsplus_dump_extent(hip->cached_extents);
453 hip->flags |= HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW; 478 hip->extent_state |= HFSPLUS_EXT_DIRTY | HFSPLUS_EXT_NEW;
454 hip->cached_start = hip->alloc_blocks; 479 hip->cached_start = hip->alloc_blocks;
455 hip->cached_blocks = len; 480 hip->cached_blocks = len;
456 481
@@ -466,8 +491,9 @@ void hfsplus_file_truncate(struct inode *inode)
466 u32 alloc_cnt, blk_cnt, start; 491 u32 alloc_cnt, blk_cnt, start;
467 int res; 492 int res;
468 493
469 dprint(DBG_INODE, "truncate: %lu, %Lu -> %Lu\n", 494 dprint(DBG_INODE, "truncate: %lu, %llu -> %llu\n",
470 inode->i_ino, (long long)hip->phys_size, inode->i_size); 495 inode->i_ino, (long long)hip->phys_size,
496 inode->i_size);
471 497
472 if (inode->i_size > hip->phys_size) { 498 if (inode->i_size > hip->phys_size) {
473 struct address_space *mapping = inode->i_mapping; 499 struct address_space *mapping = inode->i_mapping;
@@ -481,7 +507,8 @@ void hfsplus_file_truncate(struct inode *inode)
481 &page, &fsdata); 507 &page, &fsdata);
482 if (res) 508 if (res)
483 return; 509 return;
484 res = pagecache_write_end(NULL, mapping, size, 0, 0, page, fsdata); 510 res = pagecache_write_end(NULL, mapping, size,
511 0, 0, page, fsdata);
485 if (res < 0) 512 if (res < 0)
486 return; 513 return;
487 mark_inode_dirty(inode); 514 mark_inode_dirty(inode);
@@ -513,12 +540,12 @@ void hfsplus_file_truncate(struct inode *inode)
513 alloc_cnt - start, alloc_cnt - blk_cnt); 540 alloc_cnt - start, alloc_cnt - blk_cnt);
514 hfsplus_dump_extent(hip->cached_extents); 541 hfsplus_dump_extent(hip->cached_extents);
515 if (blk_cnt > start) { 542 if (blk_cnt > start) {
516 hip->flags |= HFSPLUS_FLG_EXT_DIRTY; 543 hip->extent_state |= HFSPLUS_EXT_DIRTY;
517 break; 544 break;
518 } 545 }
519 alloc_cnt = start; 546 alloc_cnt = start;
520 hip->cached_start = hip->cached_blocks = 0; 547 hip->cached_start = hip->cached_blocks = 0;
521 hip->flags &= ~(HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW); 548 hip->extent_state &= ~(HFSPLUS_EXT_DIRTY | HFSPLUS_EXT_NEW);
522 hfs_brec_remove(&fd); 549 hfs_brec_remove(&fd);
523 } 550 }
524 hfs_find_exit(&fd); 551 hfs_find_exit(&fd);
@@ -527,7 +554,8 @@ void hfsplus_file_truncate(struct inode *inode)
527 hip->alloc_blocks = blk_cnt; 554 hip->alloc_blocks = blk_cnt;
528out: 555out:
529 hip->phys_size = inode->i_size; 556 hip->phys_size = inode->i_size;
530 hip->fs_blocks = (inode->i_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits; 557 hip->fs_blocks = (inode->i_size + sb->s_blocksize - 1) >>
558 sb->s_blocksize_bits;
531 inode_set_bytes(inode, hip->fs_blocks << sb->s_blocksize_bits); 559 inode_set_bytes(inode, hip->fs_blocks << sb->s_blocksize_bits);
532 mark_inode_dirty(inode); 560 hfsplus_mark_inode_dirty(inode, HFSPLUS_I_ALLOC_DIRTY);
533} 561}
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index cb3653efb57..d6857523336 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -23,13 +23,16 @@
23#define DBG_EXTENT 0x00000020 23#define DBG_EXTENT 0x00000020
24#define DBG_BITMAP 0x00000040 24#define DBG_BITMAP 0x00000040
25 25
26//#define DBG_MASK (DBG_EXTENT|DBG_INODE|DBG_BNODE_MOD) 26#if 0
27//#define DBG_MASK (DBG_BNODE_MOD|DBG_CAT_MOD|DBG_INODE) 27#define DBG_MASK (DBG_EXTENT|DBG_INODE|DBG_BNODE_MOD)
28//#define DBG_MASK (DBG_CAT_MOD|DBG_BNODE_REFS|DBG_INODE|DBG_EXTENT) 28#define DBG_MASK (DBG_BNODE_MOD|DBG_CAT_MOD|DBG_INODE)
29#define DBG_MASK (DBG_CAT_MOD|DBG_BNODE_REFS|DBG_INODE|DBG_EXTENT)
30#endif
29#define DBG_MASK (0) 31#define DBG_MASK (0)
30 32
31#define dprint(flg, fmt, args...) \ 33#define dprint(flg, fmt, args...) \
32 if (flg & DBG_MASK) printk(fmt , ## args) 34 if (flg & DBG_MASK) \
35 printk(fmt , ## args)
33 36
34/* Runtime config options */ 37/* Runtime config options */
35#define HFSPLUS_DEF_CR_TYPE 0x3F3F3F3F /* '????' */ 38#define HFSPLUS_DEF_CR_TYPE 0x3F3F3F3F /* '????' */
@@ -37,7 +40,8 @@
37#define HFSPLUS_TYPE_DATA 0x00 40#define HFSPLUS_TYPE_DATA 0x00
38#define HFSPLUS_TYPE_RSRC 0xFF 41#define HFSPLUS_TYPE_RSRC 0xFF
39 42
40typedef int (*btree_keycmp)(const hfsplus_btree_key *, const hfsplus_btree_key *); 43typedef int (*btree_keycmp)(const hfsplus_btree_key *,
44 const hfsplus_btree_key *);
41 45
42#define NODE_HASH_SIZE 256 46#define NODE_HASH_SIZE 256
43 47
@@ -61,7 +65,6 @@ struct hfs_btree {
61 unsigned int max_key_len; 65 unsigned int max_key_len;
62 unsigned int depth; 66 unsigned int depth;
63 67
64 //unsigned int map1_size, map_size;
65 struct mutex tree_lock; 68 struct mutex tree_lock;
66 69
67 unsigned int pages_per_bnode; 70 unsigned int pages_per_bnode;
@@ -107,8 +110,8 @@ struct hfsplus_vh;
107struct hfs_btree; 110struct hfs_btree;
108 111
109struct hfsplus_sb_info { 112struct hfsplus_sb_info {
110 struct buffer_head *s_vhbh;
111 struct hfsplus_vh *s_vhdr; 113 struct hfsplus_vh *s_vhdr;
114 struct hfsplus_vh *s_backup_vhdr;
112 struct hfs_btree *ext_tree; 115 struct hfs_btree *ext_tree;
113 struct hfs_btree *cat_tree; 116 struct hfs_btree *cat_tree;
114 struct hfs_btree *attr_tree; 117 struct hfs_btree *attr_tree;
@@ -118,7 +121,8 @@ struct hfsplus_sb_info {
118 121
119 /* Runtime variables */ 122 /* Runtime variables */
120 u32 blockoffset; 123 u32 blockoffset;
121 u32 sect_count; 124 sector_t part_start;
125 sector_t sect_count;
122 int fs_shift; 126 int fs_shift;
123 127
124 /* immutable data from the volume header */ 128 /* immutable data from the volume header */
@@ -155,6 +159,12 @@ struct hfsplus_sb_info {
155#define HFSPLUS_SB_FORCE 2 159#define HFSPLUS_SB_FORCE 2
156#define HFSPLUS_SB_HFSX 3 160#define HFSPLUS_SB_HFSX 3
157#define HFSPLUS_SB_CASEFOLD 4 161#define HFSPLUS_SB_CASEFOLD 4
162#define HFSPLUS_SB_NOBARRIER 5
163
164static inline struct hfsplus_sb_info *HFSPLUS_SB(struct super_block *sb)
165{
166 return sb->s_fs_info;
167}
158 168
159 169
160struct hfsplus_inode_info { 170struct hfsplus_inode_info {
@@ -170,7 +180,7 @@ struct hfsplus_inode_info {
170 u32 cached_blocks; 180 u32 cached_blocks;
171 hfsplus_extent_rec first_extents; 181 hfsplus_extent_rec first_extents;
172 hfsplus_extent_rec cached_extents; 182 hfsplus_extent_rec cached_extents;
173 unsigned long flags; 183 unsigned int extent_state;
174 struct mutex extents_lock; 184 struct mutex extents_lock;
175 185
176 /* 186 /*
@@ -185,6 +195,11 @@ struct hfsplus_inode_info {
185 u32 linkid; 195 u32 linkid;
186 196
187 /* 197 /*
198 * Accessed using atomic bitops.
199 */
200 unsigned long flags;
201
202 /*
188 * Protected by i_mutex. 203 * Protected by i_mutex.
189 */ 204 */
190 sector_t fs_blocks; 205 sector_t fs_blocks;
@@ -195,12 +210,34 @@ struct hfsplus_inode_info {
195 struct inode vfs_inode; 210 struct inode vfs_inode;
196}; 211};
197 212
198#define HFSPLUS_FLG_RSRC 0x0001 213#define HFSPLUS_EXT_DIRTY 0x0001
199#define HFSPLUS_FLG_EXT_DIRTY 0x0002 214#define HFSPLUS_EXT_NEW 0x0002
200#define HFSPLUS_FLG_EXT_NEW 0x0004 215
216#define HFSPLUS_I_RSRC 0 /* represents a resource fork */
217#define HFSPLUS_I_CAT_DIRTY 1 /* has changes in the catalog tree */
218#define HFSPLUS_I_EXT_DIRTY 2 /* has changes in the extent tree */
219#define HFSPLUS_I_ALLOC_DIRTY 3 /* has changes in the allocation file */
220
221#define HFSPLUS_IS_RSRC(inode) \
222 test_bit(HFSPLUS_I_RSRC, &HFSPLUS_I(inode)->flags)
223
224static inline struct hfsplus_inode_info *HFSPLUS_I(struct inode *inode)
225{
226 return list_entry(inode, struct hfsplus_inode_info, vfs_inode);
227}
201 228
202#define HFSPLUS_IS_DATA(inode) (!(HFSPLUS_I(inode)->flags & HFSPLUS_FLG_RSRC)) 229/*
203#define HFSPLUS_IS_RSRC(inode) (HFSPLUS_I(inode)->flags & HFSPLUS_FLG_RSRC) 230 * Mark an inode dirty, and also mark the btree in which the
231 * specific type of metadata is stored.
232 * For data or metadata that gets written back into the catalog btree
233 * by hfsplus_write_inode a plain mark_inode_dirty call is enough.
234 */
235static inline void hfsplus_mark_inode_dirty(struct inode *inode,
236 unsigned int flag)
237{
238 set_bit(flag, &HFSPLUS_I(inode)->flags);
239 mark_inode_dirty(inode);
240}
204 241
205struct hfs_find_data { 242struct hfs_find_data {
206 /* filled by caller */ 243 /* filled by caller */
@@ -318,9 +355,12 @@ int hfs_brec_read(struct hfs_find_data *, void *, int);
318int hfs_brec_goto(struct hfs_find_data *, int); 355int hfs_brec_goto(struct hfs_find_data *, int);
319 356
320/* catalog.c */ 357/* catalog.c */
321int hfsplus_cat_case_cmp_key(const hfsplus_btree_key *, const hfsplus_btree_key *); 358int hfsplus_cat_case_cmp_key(const hfsplus_btree_key *,
322int hfsplus_cat_bin_cmp_key(const hfsplus_btree_key *, const hfsplus_btree_key *); 359 const hfsplus_btree_key *);
323void hfsplus_cat_build_key(struct super_block *sb, hfsplus_btree_key *, u32, struct qstr *); 360int hfsplus_cat_bin_cmp_key(const hfsplus_btree_key *,
361 const hfsplus_btree_key *);
362void hfsplus_cat_build_key(struct super_block *sb,
363 hfsplus_btree_key *, u32, struct qstr *);
324int hfsplus_find_cat(struct super_block *, u32, struct hfs_find_data *); 364int hfsplus_find_cat(struct super_block *, u32, struct hfs_find_data *);
325int hfsplus_create_cat(u32, struct inode *, struct qstr *, struct inode *); 365int hfsplus_create_cat(u32, struct inode *, struct qstr *, struct inode *);
326int hfsplus_delete_cat(u32, struct inode *, struct qstr *); 366int hfsplus_delete_cat(u32, struct inode *, struct qstr *);
@@ -336,7 +376,8 @@ extern const struct file_operations hfsplus_dir_operations;
336int hfsplus_ext_cmp_key(const hfsplus_btree_key *, const hfsplus_btree_key *); 376int hfsplus_ext_cmp_key(const hfsplus_btree_key *, const hfsplus_btree_key *);
337void hfsplus_ext_write_extent(struct inode *); 377void hfsplus_ext_write_extent(struct inode *);
338int hfsplus_get_block(struct inode *, sector_t, struct buffer_head *, int); 378int hfsplus_get_block(struct inode *, sector_t, struct buffer_head *, int);
339int hfsplus_free_fork(struct super_block *, u32, struct hfsplus_fork_raw *, int); 379int hfsplus_free_fork(struct super_block *, u32,
380 struct hfsplus_fork_raw *, int);
340int hfsplus_file_extend(struct inode *); 381int hfsplus_file_extend(struct inode *);
341void hfsplus_file_truncate(struct inode *); 382void hfsplus_file_truncate(struct inode *);
342 383
@@ -351,6 +392,7 @@ int hfsplus_cat_read_inode(struct inode *, struct hfs_find_data *);
351int hfsplus_cat_write_inode(struct inode *); 392int hfsplus_cat_write_inode(struct inode *);
352struct inode *hfsplus_new_inode(struct super_block *, int); 393struct inode *hfsplus_new_inode(struct super_block *, int);
353void hfsplus_delete_inode(struct inode *); 394void hfsplus_delete_inode(struct inode *);
395int hfsplus_file_fsync(struct file *file, int datasync);
354 396
355/* ioctl.c */ 397/* ioctl.c */
356long hfsplus_ioctl(struct file *filp, unsigned int cmd, unsigned long arg); 398long hfsplus_ioctl(struct file *filp, unsigned int cmd, unsigned long arg);
@@ -362,6 +404,7 @@ ssize_t hfsplus_listxattr(struct dentry *dentry, char *buffer, size_t size);
362 404
363/* options.c */ 405/* options.c */
364int hfsplus_parse_options(char *, struct hfsplus_sb_info *); 406int hfsplus_parse_options(char *, struct hfsplus_sb_info *);
407int hfsplus_parse_options_remount(char *input, int *force);
365void hfsplus_fill_defaults(struct hfsplus_sb_info *); 408void hfsplus_fill_defaults(struct hfsplus_sb_info *);
366int hfsplus_show_options(struct seq_file *, struct vfsmount *); 409int hfsplus_show_options(struct seq_file *, struct vfsmount *);
367 410
@@ -375,45 +418,26 @@ extern u16 hfsplus_decompose_table[];
375extern u16 hfsplus_compose_table[]; 418extern u16 hfsplus_compose_table[];
376 419
377/* unicode.c */ 420/* unicode.c */
378int hfsplus_strcasecmp(const struct hfsplus_unistr *, const struct hfsplus_unistr *); 421int hfsplus_strcasecmp(const struct hfsplus_unistr *,
379int hfsplus_strcmp(const struct hfsplus_unistr *, const struct hfsplus_unistr *); 422 const struct hfsplus_unistr *);
380int hfsplus_uni2asc(struct super_block *, const struct hfsplus_unistr *, char *, int *); 423int hfsplus_strcmp(const struct hfsplus_unistr *,
381int hfsplus_asc2uni(struct super_block *, struct hfsplus_unistr *, const char *, int); 424 const struct hfsplus_unistr *);
382int hfsplus_hash_dentry(struct dentry *dentry, struct qstr *str); 425int hfsplus_uni2asc(struct super_block *,
383int hfsplus_compare_dentry(struct dentry *dentry, struct qstr *s1, struct qstr *s2); 426 const struct hfsplus_unistr *, char *, int *);
427int hfsplus_asc2uni(struct super_block *,
428 struct hfsplus_unistr *, const char *, int);
429int hfsplus_hash_dentry(const struct dentry *dentry,
430 const struct inode *inode, struct qstr *str);
431int hfsplus_compare_dentry(const struct dentry *parent,
432 const struct inode *pinode,
433 const struct dentry *dentry, const struct inode *inode,
434 unsigned int len, const char *str, const struct qstr *name);
384 435
385/* wrapper.c */ 436/* wrapper.c */
386int hfsplus_read_wrapper(struct super_block *); 437int hfsplus_read_wrapper(struct super_block *);
387
388int hfs_part_find(struct super_block *, sector_t *, sector_t *); 438int hfs_part_find(struct super_block *, sector_t *, sector_t *);
389 439int hfsplus_submit_bio(struct block_device *bdev, sector_t sector,
390/* access macros */ 440 void *data, int rw);
391static inline struct hfsplus_sb_info *HFSPLUS_SB(struct super_block *sb)
392{
393 return sb->s_fs_info;
394}
395
396static inline struct hfsplus_inode_info *HFSPLUS_I(struct inode *inode)
397{
398 return list_entry(inode, struct hfsplus_inode_info, vfs_inode);
399}
400
401#define sb_bread512(sb, sec, data) ({ \
402 struct buffer_head *__bh; \
403 sector_t __block; \
404 loff_t __start; \
405 int __offset; \
406 \
407 __start = (loff_t)(sec) << HFSPLUS_SECTOR_SHIFT;\
408 __block = __start >> (sb)->s_blocksize_bits; \
409 __offset = __start & ((sb)->s_blocksize - 1); \
410 __bh = sb_bread((sb), __block); \
411 if (likely(__bh != NULL)) \
412 data = (void *)(__bh->b_data + __offset);\
413 else \
414 data = NULL; \
415 __bh; \
416})
417 441
418/* time macros */ 442/* time macros */
419#define __hfsp_mt2ut(t) (be32_to_cpu(t) - 2082844800U) 443#define __hfsp_mt2ut(t) (be32_to_cpu(t) - 2082844800U)
diff --git a/fs/hfsplus/hfsplus_raw.h b/fs/hfsplus/hfsplus_raw.h
index 6892899fd6f..927cdd6d5bf 100644
--- a/fs/hfsplus/hfsplus_raw.h
+++ b/fs/hfsplus/hfsplus_raw.h
@@ -36,7 +36,8 @@
36#define HFSP_WRAPOFF_EMBEDSIG 0x7C 36#define HFSP_WRAPOFF_EMBEDSIG 0x7C
37#define HFSP_WRAPOFF_EMBEDEXT 0x7E 37#define HFSP_WRAPOFF_EMBEDEXT 0x7E
38 38
39#define HFSP_HIDDENDIR_NAME "\xe2\x90\x80\xe2\x90\x80\xe2\x90\x80\xe2\x90\x80HFS+ Private Data" 39#define HFSP_HIDDENDIR_NAME \
40 "\xe2\x90\x80\xe2\x90\x80\xe2\x90\x80\xe2\x90\x80HFS+ Private Data"
40 41
41#define HFSP_HARDLINK_TYPE 0x686c6e6b /* 'hlnk' */ 42#define HFSP_HARDLINK_TYPE 0x686c6e6b /* 'hlnk' */
42#define HFSP_HFSPLUS_CREATOR 0x6866732b /* 'hfs+' */ 43#define HFSP_HFSPLUS_CREATOR 0x6866732b /* 'hfs+' */
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index 8afd7e84f98..a8df651747f 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -8,6 +8,7 @@
8 * Inode handling routines 8 * Inode handling routines
9 */ 9 */
10 10
11#include <linux/blkdev.h>
11#include <linux/mm.h> 12#include <linux/mm.h>
12#include <linux/fs.h> 13#include <linux/fs.h>
13#include <linux/pagemap.h> 14#include <linux/pagemap.h>
@@ -77,7 +78,8 @@ static int hfsplus_releasepage(struct page *page, gfp_t mask)
77 if (!tree) 78 if (!tree)
78 return 0; 79 return 0;
79 if (tree->node_size >= PAGE_CACHE_SIZE) { 80 if (tree->node_size >= PAGE_CACHE_SIZE) {
80 nidx = page->index >> (tree->node_size_shift - PAGE_CACHE_SHIFT); 81 nidx = page->index >>
82 (tree->node_size_shift - PAGE_CACHE_SHIFT);
81 spin_lock(&tree->hash_lock); 83 spin_lock(&tree->hash_lock);
82 node = hfs_bnode_findhash(tree, nidx); 84 node = hfs_bnode_findhash(tree, nidx);
83 if (!node) 85 if (!node)
@@ -90,7 +92,8 @@ static int hfsplus_releasepage(struct page *page, gfp_t mask)
90 } 92 }
91 spin_unlock(&tree->hash_lock); 93 spin_unlock(&tree->hash_lock);
92 } else { 94 } else {
93 nidx = page->index << (PAGE_CACHE_SHIFT - tree->node_size_shift); 95 nidx = page->index <<
96 (PAGE_CACHE_SHIFT - tree->node_size_shift);
94 i = 1 << (PAGE_CACHE_SHIFT - tree->node_size_shift); 97 i = 1 << (PAGE_CACHE_SHIFT - tree->node_size_shift);
95 spin_lock(&tree->hash_lock); 98 spin_lock(&tree->hash_lock);
96 do { 99 do {
@@ -166,8 +169,8 @@ const struct dentry_operations hfsplus_dentry_operations = {
166 .d_compare = hfsplus_compare_dentry, 169 .d_compare = hfsplus_compare_dentry,
167}; 170};
168 171
169static struct dentry *hfsplus_file_lookup(struct inode *dir, struct dentry *dentry, 172static struct dentry *hfsplus_file_lookup(struct inode *dir,
170 struct nameidata *nd) 173 struct dentry *dentry, struct nameidata *nd)
171{ 174{
172 struct hfs_find_data fd; 175 struct hfs_find_data fd;
173 struct super_block *sb = dir->i_sb; 176 struct super_block *sb = dir->i_sb;
@@ -190,7 +193,9 @@ static struct dentry *hfsplus_file_lookup(struct inode *dir, struct dentry *dent
190 inode->i_ino = dir->i_ino; 193 inode->i_ino = dir->i_ino;
191 INIT_LIST_HEAD(&hip->open_dir_list); 194 INIT_LIST_HEAD(&hip->open_dir_list);
192 mutex_init(&hip->extents_lock); 195 mutex_init(&hip->extents_lock);
193 hip->flags = HFSPLUS_FLG_RSRC; 196 hip->extent_state = 0;
197 hip->flags = 0;
198 set_bit(HFSPLUS_I_RSRC, &hip->flags);
194 199
195 hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd); 200 hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd);
196 err = hfsplus_find_cat(sb, dir->i_ino, &fd); 201 err = hfsplus_find_cat(sb, dir->i_ino, &fd);
@@ -219,7 +224,8 @@ out:
219 return NULL; 224 return NULL;
220} 225}
221 226
222static void hfsplus_get_perms(struct inode *inode, struct hfsplus_perm *perms, int dir) 227static void hfsplus_get_perms(struct inode *inode,
228 struct hfsplus_perm *perms, int dir)
223{ 229{
224 struct hfsplus_sb_info *sbi = HFSPLUS_SB(inode->i_sb); 230 struct hfsplus_sb_info *sbi = HFSPLUS_SB(inode->i_sb);
225 u16 mode; 231 u16 mode;
@@ -302,29 +308,41 @@ static int hfsplus_setattr(struct dentry *dentry, struct iattr *attr)
302 return 0; 308 return 0;
303} 309}
304 310
305static int hfsplus_file_fsync(struct file *filp, int datasync) 311int hfsplus_file_fsync(struct file *file, int datasync)
306{ 312{
307 struct inode *inode = filp->f_mapping->host; 313 struct inode *inode = file->f_mapping->host;
308 struct super_block * sb; 314 struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
309 int ret, err; 315 struct hfsplus_sb_info *sbi = HFSPLUS_SB(inode->i_sb);
310 316 int error = 0, error2;
311 /* sync the inode to buffers */ 317
312 ret = write_inode_now(inode, 0); 318 /*
313 319 * Sync inode metadata into the catalog and extent trees.
314 /* sync the superblock to buffers */ 320 */
315 sb = inode->i_sb; 321 sync_inode_metadata(inode, 1);
316 if (sb->s_dirt) { 322
317 if (!(sb->s_flags & MS_RDONLY)) 323 /*
318 hfsplus_sync_fs(sb, 1); 324 * And explicitly write out the btrees.
319 else 325 */
320 sb->s_dirt = 0; 326 if (test_and_clear_bit(HFSPLUS_I_CAT_DIRTY, &hip->flags))
327 error = filemap_write_and_wait(sbi->cat_tree->inode->i_mapping);
328
329 if (test_and_clear_bit(HFSPLUS_I_EXT_DIRTY, &hip->flags)) {
330 error2 =
331 filemap_write_and_wait(sbi->ext_tree->inode->i_mapping);
332 if (!error)
333 error = error2;
321 } 334 }
322 335
323 /* .. finally sync the buffers to disk */ 336 if (test_and_clear_bit(HFSPLUS_I_ALLOC_DIRTY, &hip->flags)) {
324 err = sync_blockdev(sb->s_bdev); 337 error2 = filemap_write_and_wait(sbi->alloc_file->i_mapping);
325 if (!ret) 338 if (!error)
326 ret = err; 339 error = error2;
327 return ret; 340 }
341
342 if (!test_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags))
343 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
344
345 return error;
328} 346}
329 347
330static const struct inode_operations hfsplus_file_inode_operations = { 348static const struct inode_operations hfsplus_file_inode_operations = {
@@ -337,7 +355,7 @@ static const struct inode_operations hfsplus_file_inode_operations = {
337}; 355};
338 356
339static const struct file_operations hfsplus_file_operations = { 357static const struct file_operations hfsplus_file_operations = {
340 .llseek = generic_file_llseek, 358 .llseek = generic_file_llseek,
341 .read = do_sync_read, 359 .read = do_sync_read,
342 .aio_read = generic_file_aio_read, 360 .aio_read = generic_file_aio_read,
343 .write = do_sync_write, 361 .write = do_sync_write,
@@ -370,6 +388,7 @@ struct inode *hfsplus_new_inode(struct super_block *sb, int mode)
370 INIT_LIST_HEAD(&hip->open_dir_list); 388 INIT_LIST_HEAD(&hip->open_dir_list);
371 mutex_init(&hip->extents_lock); 389 mutex_init(&hip->extents_lock);
372 atomic_set(&hip->opencnt, 0); 390 atomic_set(&hip->opencnt, 0);
391 hip->extent_state = 0;
373 hip->flags = 0; 392 hip->flags = 0;
374 memset(hip->first_extents, 0, sizeof(hfsplus_extent_rec)); 393 memset(hip->first_extents, 0, sizeof(hfsplus_extent_rec));
375 memset(hip->cached_extents, 0, sizeof(hfsplus_extent_rec)); 394 memset(hip->cached_extents, 0, sizeof(hfsplus_extent_rec));
@@ -457,7 +476,8 @@ void hfsplus_inode_read_fork(struct inode *inode, struct hfsplus_fork_raw *fork)
457 } 476 }
458} 477}
459 478
460void hfsplus_inode_write_fork(struct inode *inode, struct hfsplus_fork_raw *fork) 479void hfsplus_inode_write_fork(struct inode *inode,
480 struct hfsplus_fork_raw *fork)
461{ 481{
462 memcpy(&fork->extents, &HFSPLUS_I(inode)->first_extents, 482 memcpy(&fork->extents, &HFSPLUS_I(inode)->first_extents,
463 sizeof(hfsplus_extent_rec)); 483 sizeof(hfsplus_extent_rec));
@@ -499,13 +519,14 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd)
499 hfs_bnode_read(fd->bnode, &entry, fd->entryoffset, 519 hfs_bnode_read(fd->bnode, &entry, fd->entryoffset,
500 sizeof(struct hfsplus_cat_file)); 520 sizeof(struct hfsplus_cat_file));
501 521
502 hfsplus_inode_read_fork(inode, HFSPLUS_IS_DATA(inode) ? 522 hfsplus_inode_read_fork(inode, HFSPLUS_IS_RSRC(inode) ?
503 &file->data_fork : &file->rsrc_fork); 523 &file->rsrc_fork : &file->data_fork);
504 hfsplus_get_perms(inode, &file->permissions, 0); 524 hfsplus_get_perms(inode, &file->permissions, 0);
505 inode->i_nlink = 1; 525 inode->i_nlink = 1;
506 if (S_ISREG(inode->i_mode)) { 526 if (S_ISREG(inode->i_mode)) {
507 if (file->permissions.dev) 527 if (file->permissions.dev)
508 inode->i_nlink = be32_to_cpu(file->permissions.dev); 528 inode->i_nlink =
529 be32_to_cpu(file->permissions.dev);
509 inode->i_op = &hfsplus_file_inode_operations; 530 inode->i_op = &hfsplus_file_inode_operations;
510 inode->i_fop = &hfsplus_file_operations; 531 inode->i_fop = &hfsplus_file_operations;
511 inode->i_mapping->a_ops = &hfsplus_aops; 532 inode->i_mapping->a_ops = &hfsplus_aops;
@@ -578,7 +599,9 @@ int hfsplus_cat_write_inode(struct inode *inode)
578 sizeof(struct hfsplus_cat_file)); 599 sizeof(struct hfsplus_cat_file));
579 hfsplus_inode_write_fork(inode, &file->data_fork); 600 hfsplus_inode_write_fork(inode, &file->data_fork);
580 hfsplus_cat_set_perms(inode, &file->permissions); 601 hfsplus_cat_set_perms(inode, &file->permissions);
581 if ((file->permissions.rootflags | file->permissions.userflags) & HFSPLUS_FLG_IMMUTABLE) 602 if (HFSPLUS_FLG_IMMUTABLE &
603 (file->permissions.rootflags |
604 file->permissions.userflags))
582 file->flags |= cpu_to_be16(HFSPLUS_FILE_LOCKED); 605 file->flags |= cpu_to_be16(HFSPLUS_FILE_LOCKED);
583 else 606 else
584 file->flags &= cpu_to_be16(~HFSPLUS_FILE_LOCKED); 607 file->flags &= cpu_to_be16(~HFSPLUS_FILE_LOCKED);
@@ -588,6 +611,8 @@ int hfsplus_cat_write_inode(struct inode *inode)
588 hfs_bnode_write(fd.bnode, &entry, fd.entryoffset, 611 hfs_bnode_write(fd.bnode, &entry, fd.entryoffset,
589 sizeof(struct hfsplus_cat_file)); 612 sizeof(struct hfsplus_cat_file));
590 } 613 }
614
615 set_bit(HFSPLUS_I_CAT_DIRTY, &HFSPLUS_I(inode)->flags);
591out: 616out:
592 hfs_find_exit(&fd); 617 hfs_find_exit(&fd);
593 return 0; 618 return 0;
diff --git a/fs/hfsplus/ioctl.c b/fs/hfsplus/ioctl.c
index 40a85a3ded6..508ce662ce1 100644
--- a/fs/hfsplus/ioctl.c
+++ b/fs/hfsplus/ioctl.c
@@ -28,7 +28,7 @@ static int hfsplus_ioctl_getflags(struct file *file, int __user *user_flags)
28 28
29 if (inode->i_flags & S_IMMUTABLE) 29 if (inode->i_flags & S_IMMUTABLE)
30 flags |= FS_IMMUTABLE_FL; 30 flags |= FS_IMMUTABLE_FL;
31 if (inode->i_flags |= S_APPEND) 31 if (inode->i_flags & S_APPEND)
32 flags |= FS_APPEND_FL; 32 flags |= FS_APPEND_FL;
33 if (hip->userflags & HFSPLUS_FLG_NODUMP) 33 if (hip->userflags & HFSPLUS_FLG_NODUMP)
34 flags |= FS_NODUMP_FL; 34 flags |= FS_NODUMP_FL;
@@ -147,9 +147,11 @@ int hfsplus_setxattr(struct dentry *dentry, const char *name,
147 res = -ERANGE; 147 res = -ERANGE;
148 } else 148 } else
149 res = -EOPNOTSUPP; 149 res = -EOPNOTSUPP;
150 if (!res) 150 if (!res) {
151 hfs_bnode_write(fd.bnode, &entry, fd.entryoffset, 151 hfs_bnode_write(fd.bnode, &entry, fd.entryoffset,
152 sizeof(struct hfsplus_cat_file)); 152 sizeof(struct hfsplus_cat_file));
153 hfsplus_mark_inode_dirty(inode, HFSPLUS_I_CAT_DIRTY);
154 }
153out: 155out:
154 hfs_find_exit(&fd); 156 hfs_find_exit(&fd);
155 return res; 157 return res;
diff --git a/fs/hfsplus/options.c b/fs/hfsplus/options.c
index f9ab276a4d8..bb62a588214 100644
--- a/fs/hfsplus/options.c
+++ b/fs/hfsplus/options.c
@@ -23,6 +23,7 @@ enum {
23 opt_umask, opt_uid, opt_gid, 23 opt_umask, opt_uid, opt_gid,
24 opt_part, opt_session, opt_nls, 24 opt_part, opt_session, opt_nls,
25 opt_nodecompose, opt_decompose, 25 opt_nodecompose, opt_decompose,
26 opt_barrier, opt_nobarrier,
26 opt_force, opt_err 27 opt_force, opt_err
27}; 28};
28 29
@@ -37,6 +38,8 @@ static const match_table_t tokens = {
37 { opt_nls, "nls=%s" }, 38 { opt_nls, "nls=%s" },
38 { opt_decompose, "decompose" }, 39 { opt_decompose, "decompose" },
39 { opt_nodecompose, "nodecompose" }, 40 { opt_nodecompose, "nodecompose" },
41 { opt_barrier, "barrier" },
42 { opt_nobarrier, "nobarrier" },
40 { opt_force, "force" }, 43 { opt_force, "force" },
41 { opt_err, NULL } 44 { opt_err, NULL }
42}; 45};
@@ -65,6 +68,32 @@ static inline int match_fourchar(substring_t *arg, u32 *result)
65 return 0; 68 return 0;
66} 69}
67 70
71int hfsplus_parse_options_remount(char *input, int *force)
72{
73 char *p;
74 substring_t args[MAX_OPT_ARGS];
75 int token;
76
77 if (!input)
78 return 0;
79
80 while ((p = strsep(&input, ",")) != NULL) {
81 if (!*p)
82 continue;
83
84 token = match_token(p, tokens, args);
85 switch (token) {
86 case opt_force:
87 *force = 1;
88 break;
89 default:
90 break;
91 }
92 }
93
94 return 1;
95}
96
68/* Parse options from mount. Returns 0 on failure */ 97/* Parse options from mount. Returns 0 on failure */
69/* input is the options passed to mount() as a string */ 98/* input is the options passed to mount() as a string */
70int hfsplus_parse_options(char *input, struct hfsplus_sb_info *sbi) 99int hfsplus_parse_options(char *input, struct hfsplus_sb_info *sbi)
@@ -136,7 +165,9 @@ int hfsplus_parse_options(char *input, struct hfsplus_sb_info *sbi)
136 if (p) 165 if (p)
137 sbi->nls = load_nls(p); 166 sbi->nls = load_nls(p);
138 if (!sbi->nls) { 167 if (!sbi->nls) {
139 printk(KERN_ERR "hfs: unable to load nls mapping \"%s\"\n", p); 168 printk(KERN_ERR "hfs: unable to load "
169 "nls mapping \"%s\"\n",
170 p);
140 kfree(p); 171 kfree(p);
141 return 0; 172 return 0;
142 } 173 }
@@ -148,6 +179,12 @@ int hfsplus_parse_options(char *input, struct hfsplus_sb_info *sbi)
148 case opt_nodecompose: 179 case opt_nodecompose:
149 set_bit(HFSPLUS_SB_NODECOMPOSE, &sbi->flags); 180 set_bit(HFSPLUS_SB_NODECOMPOSE, &sbi->flags);
150 break; 181 break;
182 case opt_barrier:
183 clear_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags);
184 break;
185 case opt_nobarrier:
186 set_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags);
187 break;
151 case opt_force: 188 case opt_force:
152 set_bit(HFSPLUS_SB_FORCE, &sbi->flags); 189 set_bit(HFSPLUS_SB_FORCE, &sbi->flags);
153 break; 190 break;
@@ -177,7 +214,8 @@ int hfsplus_show_options(struct seq_file *seq, struct vfsmount *mnt)
177 seq_printf(seq, ",creator=%.4s", (char *)&sbi->creator); 214 seq_printf(seq, ",creator=%.4s", (char *)&sbi->creator);
178 if (sbi->type != HFSPLUS_DEF_CR_TYPE) 215 if (sbi->type != HFSPLUS_DEF_CR_TYPE)
179 seq_printf(seq, ",type=%.4s", (char *)&sbi->type); 216 seq_printf(seq, ",type=%.4s", (char *)&sbi->type);
180 seq_printf(seq, ",umask=%o,uid=%u,gid=%u", sbi->umask, sbi->uid, sbi->gid); 217 seq_printf(seq, ",umask=%o,uid=%u,gid=%u", sbi->umask,
218 sbi->uid, sbi->gid);
181 if (sbi->part >= 0) 219 if (sbi->part >= 0)
182 seq_printf(seq, ",part=%u", sbi->part); 220 seq_printf(seq, ",part=%u", sbi->part);
183 if (sbi->session >= 0) 221 if (sbi->session >= 0)
@@ -186,5 +224,7 @@ int hfsplus_show_options(struct seq_file *seq, struct vfsmount *mnt)
186 seq_printf(seq, ",nls=%s", sbi->nls->charset); 224 seq_printf(seq, ",nls=%s", sbi->nls->charset);
187 if (test_bit(HFSPLUS_SB_NODECOMPOSE, &sbi->flags)) 225 if (test_bit(HFSPLUS_SB_NODECOMPOSE, &sbi->flags))
188 seq_printf(seq, ",nodecompose"); 226 seq_printf(seq, ",nodecompose");
227 if (test_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags))
228 seq_printf(seq, ",nobarrier");
189 return 0; 229 return 0;
190} 230}
diff --git a/fs/hfsplus/part_tbl.c b/fs/hfsplus/part_tbl.c
index 208b16c645c..d66ad113b1c 100644
--- a/fs/hfsplus/part_tbl.c
+++ b/fs/hfsplus/part_tbl.c
@@ -2,7 +2,8 @@
2 * linux/fs/hfsplus/part_tbl.c 2 * linux/fs/hfsplus/part_tbl.c
3 * 3 *
4 * Copyright (C) 1996-1997 Paul H. Hargrove 4 * Copyright (C) 1996-1997 Paul H. Hargrove
5 * This file may be distributed under the terms of the GNU General Public License. 5 * This file may be distributed under the terms of
6 * the GNU General Public License.
6 * 7 *
7 * Original code to handle the new style Mac partition table based on 8 * Original code to handle the new style Mac partition table based on
8 * a patch contributed by Holger Schemel (aeglos@valinor.owl.de). 9 * a patch contributed by Holger Schemel (aeglos@valinor.owl.de).
@@ -13,6 +14,7 @@
13 * 14 *
14 */ 15 */
15 16
17#include <linux/slab.h>
16#include "hfsplus_fs.h" 18#include "hfsplus_fs.h"
17 19
18/* offsets to various blocks */ 20/* offsets to various blocks */
@@ -58,77 +60,94 @@ struct new_pmap {
58 */ 60 */
59struct old_pmap { 61struct old_pmap {
60 __be16 pdSig; /* Signature bytes */ 62 __be16 pdSig; /* Signature bytes */
61 struct old_pmap_entry { 63 struct old_pmap_entry {
62 __be32 pdStart; 64 __be32 pdStart;
63 __be32 pdSize; 65 __be32 pdSize;
64 __be32 pdFSID; 66 __be32 pdFSID;
65 } pdEntry[42]; 67 } pdEntry[42];
66} __packed; 68} __packed;
67 69
70static int hfs_parse_old_pmap(struct super_block *sb, struct old_pmap *pm,
71 sector_t *part_start, sector_t *part_size)
72{
73 struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
74 int i;
75
76 for (i = 0; i < 42; i++) {
77 struct old_pmap_entry *p = &pm->pdEntry[i];
78
79 if (p->pdStart && p->pdSize &&
80 p->pdFSID == cpu_to_be32(0x54465331)/*"TFS1"*/ &&
81 (sbi->part < 0 || sbi->part == i)) {
82 *part_start += be32_to_cpu(p->pdStart);
83 *part_size = be32_to_cpu(p->pdSize);
84 return 0;
85 }
86 }
87
88 return -ENOENT;
89}
90
91static int hfs_parse_new_pmap(struct super_block *sb, struct new_pmap *pm,
92 sector_t *part_start, sector_t *part_size)
93{
94 struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
95 int size = be32_to_cpu(pm->pmMapBlkCnt);
96 int res;
97 int i = 0;
98
99 do {
100 if (!memcmp(pm->pmPartType, "Apple_HFS", 9) &&
101 (sbi->part < 0 || sbi->part == i)) {
102 *part_start += be32_to_cpu(pm->pmPyPartStart);
103 *part_size = be32_to_cpu(pm->pmPartBlkCnt);
104 return 0;
105 }
106
107 if (++i >= size)
108 return -ENOENT;
109
110 res = hfsplus_submit_bio(sb->s_bdev,
111 *part_start + HFS_PMAP_BLK + i,
112 pm, READ);
113 if (res)
114 return res;
115 } while (pm->pmSig == cpu_to_be16(HFS_NEW_PMAP_MAGIC));
116
117 return -ENOENT;
118}
119
68/* 120/*
69 * hfs_part_find() 121 * Parse the partition map looking for the start and length of a
70 * 122 * HFS/HFS+ partition.
71 * Parse the partition map looking for the
72 * start and length of the 'part'th HFS partition.
73 */ 123 */
74int hfs_part_find(struct super_block *sb, 124int hfs_part_find(struct super_block *sb,
75 sector_t *part_start, sector_t *part_size) 125 sector_t *part_start, sector_t *part_size)
76{ 126{
77 struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); 127 void *data;
78 struct buffer_head *bh; 128 int res;
79 __be16 *data; 129
80 int i, size, res; 130 data = kmalloc(HFSPLUS_SECTOR_SIZE, GFP_KERNEL);
131 if (!data)
132 return -ENOMEM;
81 133
82 res = -ENOENT; 134 res = hfsplus_submit_bio(sb->s_bdev, *part_start + HFS_PMAP_BLK,
83 bh = sb_bread512(sb, *part_start + HFS_PMAP_BLK, data); 135 data, READ);
84 if (!bh) 136 if (res)
85 return -EIO; 137 return res;
86 138
87 switch (be16_to_cpu(*data)) { 139 switch (be16_to_cpu(*((__be16 *)data))) {
88 case HFS_OLD_PMAP_MAGIC: 140 case HFS_OLD_PMAP_MAGIC:
89 { 141 res = hfs_parse_old_pmap(sb, data, part_start, part_size);
90 struct old_pmap *pm;
91 struct old_pmap_entry *p;
92
93 pm = (struct old_pmap *)bh->b_data;
94 p = pm->pdEntry;
95 size = 42;
96 for (i = 0; i < size; p++, i++) {
97 if (p->pdStart && p->pdSize &&
98 p->pdFSID == cpu_to_be32(0x54465331)/*"TFS1"*/ &&
99 (sbi->part < 0 || sbi->part == i)) {
100 *part_start += be32_to_cpu(p->pdStart);
101 *part_size = be32_to_cpu(p->pdSize);
102 res = 0;
103 }
104 }
105 break; 142 break;
106 }
107 case HFS_NEW_PMAP_MAGIC: 143 case HFS_NEW_PMAP_MAGIC:
108 { 144 res = hfs_parse_new_pmap(sb, data, part_start, part_size);
109 struct new_pmap *pm; 145 break;
110 146 default:
111 pm = (struct new_pmap *)bh->b_data; 147 res = -ENOENT;
112 size = be32_to_cpu(pm->pmMapBlkCnt);
113 for (i = 0; i < size;) {
114 if (!memcmp(pm->pmPartType,"Apple_HFS", 9) &&
115 (sbi->part < 0 || sbi->part == i)) {
116 *part_start += be32_to_cpu(pm->pmPyPartStart);
117 *part_size = be32_to_cpu(pm->pmPartBlkCnt);
118 res = 0;
119 break;
120 }
121 brelse(bh);
122 bh = sb_bread512(sb, *part_start + HFS_PMAP_BLK + ++i, pm);
123 if (!bh)
124 return -EIO;
125 if (pm->pmSig != cpu_to_be16(HFS_NEW_PMAP_MAGIC))
126 break;
127 }
128 break; 148 break;
129 }
130 } 149 }
131 brelse(bh);
132 150
151 kfree(data);
133 return res; 152 return res;
134} 153}
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 52cc746d3ba..9a3b4795f43 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -10,6 +10,7 @@
10#include <linux/module.h> 10#include <linux/module.h>
11#include <linux/init.h> 11#include <linux/init.h>
12#include <linux/pagemap.h> 12#include <linux/pagemap.h>
13#include <linux/blkdev.h>
13#include <linux/fs.h> 14#include <linux/fs.h>
14#include <linux/slab.h> 15#include <linux/slab.h>
15#include <linux/vfs.h> 16#include <linux/vfs.h>
@@ -66,6 +67,7 @@ struct inode *hfsplus_iget(struct super_block *sb, unsigned long ino)
66 INIT_LIST_HEAD(&HFSPLUS_I(inode)->open_dir_list); 67 INIT_LIST_HEAD(&HFSPLUS_I(inode)->open_dir_list);
67 mutex_init(&HFSPLUS_I(inode)->extents_lock); 68 mutex_init(&HFSPLUS_I(inode)->extents_lock);
68 HFSPLUS_I(inode)->flags = 0; 69 HFSPLUS_I(inode)->flags = 0;
70 HFSPLUS_I(inode)->extent_state = 0;
69 HFSPLUS_I(inode)->rsrc_inode = NULL; 71 HFSPLUS_I(inode)->rsrc_inode = NULL;
70 atomic_set(&HFSPLUS_I(inode)->opencnt, 0); 72 atomic_set(&HFSPLUS_I(inode)->opencnt, 0);
71 73
@@ -157,45 +159,65 @@ int hfsplus_sync_fs(struct super_block *sb, int wait)
157{ 159{
158 struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); 160 struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
159 struct hfsplus_vh *vhdr = sbi->s_vhdr; 161 struct hfsplus_vh *vhdr = sbi->s_vhdr;
162 int write_backup = 0;
163 int error, error2;
164
165 if (!wait)
166 return 0;
160 167
161 dprint(DBG_SUPER, "hfsplus_write_super\n"); 168 dprint(DBG_SUPER, "hfsplus_write_super\n");
162 169
163 mutex_lock(&sbi->vh_mutex);
164 mutex_lock(&sbi->alloc_mutex);
165 sb->s_dirt = 0; 170 sb->s_dirt = 0;
166 171
172 /*
173 * Explicitly write out the special metadata inodes.
174 *
175 * While these special inodes are marked as hashed and written
176 * out peridocically by the flusher threads we redirty them
177 * during writeout of normal inodes, and thus the life lock
178 * prevents us from getting the latest state to disk.
179 */
180 error = filemap_write_and_wait(sbi->cat_tree->inode->i_mapping);
181 error2 = filemap_write_and_wait(sbi->ext_tree->inode->i_mapping);
182 if (!error)
183 error = error2;
184 error2 = filemap_write_and_wait(sbi->alloc_file->i_mapping);
185 if (!error)
186 error = error2;
187
188 mutex_lock(&sbi->vh_mutex);
189 mutex_lock(&sbi->alloc_mutex);
167 vhdr->free_blocks = cpu_to_be32(sbi->free_blocks); 190 vhdr->free_blocks = cpu_to_be32(sbi->free_blocks);
168 vhdr->next_cnid = cpu_to_be32(sbi->next_cnid); 191 vhdr->next_cnid = cpu_to_be32(sbi->next_cnid);
169 vhdr->folder_count = cpu_to_be32(sbi->folder_count); 192 vhdr->folder_count = cpu_to_be32(sbi->folder_count);
170 vhdr->file_count = cpu_to_be32(sbi->file_count); 193 vhdr->file_count = cpu_to_be32(sbi->file_count);
171 194
172 mark_buffer_dirty(sbi->s_vhbh);
173 if (test_and_clear_bit(HFSPLUS_SB_WRITEBACKUP, &sbi->flags)) { 195 if (test_and_clear_bit(HFSPLUS_SB_WRITEBACKUP, &sbi->flags)) {
174 if (sbi->sect_count) { 196 memcpy(sbi->s_backup_vhdr, sbi->s_vhdr, sizeof(*sbi->s_vhdr));
175 struct buffer_head *bh; 197 write_backup = 1;
176 u32 block, offset;
177
178 block = sbi->blockoffset;
179 block += (sbi->sect_count - 2) >> (sb->s_blocksize_bits - 9);
180 offset = ((sbi->sect_count - 2) << 9) & (sb->s_blocksize - 1);
181 printk(KERN_DEBUG "hfs: backup: %u,%u,%u,%u\n",
182 sbi->blockoffset, sbi->sect_count,
183 block, offset);
184 bh = sb_bread(sb, block);
185 if (bh) {
186 vhdr = (struct hfsplus_vh *)(bh->b_data + offset);
187 if (be16_to_cpu(vhdr->signature) == HFSPLUS_VOLHEAD_SIG) {
188 memcpy(vhdr, sbi->s_vhdr, sizeof(*vhdr));
189 mark_buffer_dirty(bh);
190 brelse(bh);
191 } else
192 printk(KERN_WARNING "hfs: backup not found!\n");
193 }
194 }
195 } 198 }
199
200 error2 = hfsplus_submit_bio(sb->s_bdev,
201 sbi->part_start + HFSPLUS_VOLHEAD_SECTOR,
202 sbi->s_vhdr, WRITE_SYNC);
203 if (!error)
204 error = error2;
205 if (!write_backup)
206 goto out;
207
208 error2 = hfsplus_submit_bio(sb->s_bdev,
209 sbi->part_start + sbi->sect_count - 2,
210 sbi->s_backup_vhdr, WRITE_SYNC);
211 if (!error)
212 error2 = error;
213out:
196 mutex_unlock(&sbi->alloc_mutex); 214 mutex_unlock(&sbi->alloc_mutex);
197 mutex_unlock(&sbi->vh_mutex); 215 mutex_unlock(&sbi->vh_mutex);
198 return 0; 216
217 if (!test_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags))
218 blkdev_issue_flush(sb->s_bdev, GFP_KERNEL, NULL);
219
220 return error;
199} 221}
200 222
201static void hfsplus_write_super(struct super_block *sb) 223static void hfsplus_write_super(struct super_block *sb)
@@ -215,23 +237,22 @@ static void hfsplus_put_super(struct super_block *sb)
215 if (!sb->s_fs_info) 237 if (!sb->s_fs_info)
216 return; 238 return;
217 239
218 if (sb->s_dirt)
219 hfsplus_write_super(sb);
220 if (!(sb->s_flags & MS_RDONLY) && sbi->s_vhdr) { 240 if (!(sb->s_flags & MS_RDONLY) && sbi->s_vhdr) {
221 struct hfsplus_vh *vhdr = sbi->s_vhdr; 241 struct hfsplus_vh *vhdr = sbi->s_vhdr;
222 242
223 vhdr->modify_date = hfsp_now2mt(); 243 vhdr->modify_date = hfsp_now2mt();
224 vhdr->attributes |= cpu_to_be32(HFSPLUS_VOL_UNMNT); 244 vhdr->attributes |= cpu_to_be32(HFSPLUS_VOL_UNMNT);
225 vhdr->attributes &= cpu_to_be32(~HFSPLUS_VOL_INCNSTNT); 245 vhdr->attributes &= cpu_to_be32(~HFSPLUS_VOL_INCNSTNT);
226 mark_buffer_dirty(sbi->s_vhbh); 246
227 sync_dirty_buffer(sbi->s_vhbh); 247 hfsplus_sync_fs(sb, 1);
228 } 248 }
229 249
230 hfs_btree_close(sbi->cat_tree); 250 hfs_btree_close(sbi->cat_tree);
231 hfs_btree_close(sbi->ext_tree); 251 hfs_btree_close(sbi->ext_tree);
232 iput(sbi->alloc_file); 252 iput(sbi->alloc_file);
233 iput(sbi->hidden_dir); 253 iput(sbi->hidden_dir);
234 brelse(sbi->s_vhbh); 254 kfree(sbi->s_vhdr);
255 kfree(sbi->s_backup_vhdr);
235 unload_nls(sbi->nls); 256 unload_nls(sbi->nls);
236 kfree(sb->s_fs_info); 257 kfree(sb->s_fs_info);
237 sb->s_fs_info = NULL; 258 sb->s_fs_info = NULL;
@@ -263,26 +284,31 @@ static int hfsplus_remount(struct super_block *sb, int *flags, char *data)
263 return 0; 284 return 0;
264 if (!(*flags & MS_RDONLY)) { 285 if (!(*flags & MS_RDONLY)) {
265 struct hfsplus_vh *vhdr = HFSPLUS_SB(sb)->s_vhdr; 286 struct hfsplus_vh *vhdr = HFSPLUS_SB(sb)->s_vhdr;
266 struct hfsplus_sb_info sbi; 287 int force = 0;
267 288
268 memset(&sbi, 0, sizeof(struct hfsplus_sb_info)); 289 if (!hfsplus_parse_options_remount(data, &force))
269 sbi.nls = HFSPLUS_SB(sb)->nls;
270 if (!hfsplus_parse_options(data, &sbi))
271 return -EINVAL; 290 return -EINVAL;
272 291
273 if (!(vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_UNMNT))) { 292 if (!(vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_UNMNT))) {
274 printk(KERN_WARNING "hfs: filesystem was not cleanly unmounted, " 293 printk(KERN_WARNING "hfs: filesystem was "
275 "running fsck.hfsplus is recommended. leaving read-only.\n"); 294 "not cleanly unmounted, "
295 "running fsck.hfsplus is recommended. "
296 "leaving read-only.\n");
276 sb->s_flags |= MS_RDONLY; 297 sb->s_flags |= MS_RDONLY;
277 *flags |= MS_RDONLY; 298 *flags |= MS_RDONLY;
278 } else if (test_bit(HFSPLUS_SB_FORCE, &sbi.flags)) { 299 } else if (force) {
279 /* nothing */ 300 /* nothing */
280 } else if (vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_SOFTLOCK)) { 301 } else if (vhdr->attributes &
281 printk(KERN_WARNING "hfs: filesystem is marked locked, leaving read-only.\n"); 302 cpu_to_be32(HFSPLUS_VOL_SOFTLOCK)) {
303 printk(KERN_WARNING "hfs: filesystem is marked locked, "
304 "leaving read-only.\n");
282 sb->s_flags |= MS_RDONLY; 305 sb->s_flags |= MS_RDONLY;
283 *flags |= MS_RDONLY; 306 *flags |= MS_RDONLY;
284 } else if (vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_JOURNALED)) { 307 } else if (vhdr->attributes &
285 printk(KERN_WARNING "hfs: filesystem is marked journaled, leaving read-only.\n"); 308 cpu_to_be32(HFSPLUS_VOL_JOURNALED)) {
309 printk(KERN_WARNING "hfs: filesystem is "
310 "marked journaled, "
311 "leaving read-only.\n");
286 sb->s_flags |= MS_RDONLY; 312 sb->s_flags |= MS_RDONLY;
287 *flags |= MS_RDONLY; 313 *flags |= MS_RDONLY;
288 } 314 }
@@ -372,17 +398,22 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
372 sb->s_maxbytes = MAX_LFS_FILESIZE; 398 sb->s_maxbytes = MAX_LFS_FILESIZE;
373 399
374 if (!(vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_UNMNT))) { 400 if (!(vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_UNMNT))) {
375 printk(KERN_WARNING "hfs: Filesystem was not cleanly unmounted, " 401 printk(KERN_WARNING "hfs: Filesystem was "
376 "running fsck.hfsplus is recommended. mounting read-only.\n"); 402 "not cleanly unmounted, "
403 "running fsck.hfsplus is recommended. "
404 "mounting read-only.\n");
377 sb->s_flags |= MS_RDONLY; 405 sb->s_flags |= MS_RDONLY;
378 } else if (test_and_clear_bit(HFSPLUS_SB_FORCE, &sbi->flags)) { 406 } else if (test_and_clear_bit(HFSPLUS_SB_FORCE, &sbi->flags)) {
379 /* nothing */ 407 /* nothing */
380 } else if (vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_SOFTLOCK)) { 408 } else if (vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_SOFTLOCK)) {
381 printk(KERN_WARNING "hfs: Filesystem is marked locked, mounting read-only.\n"); 409 printk(KERN_WARNING "hfs: Filesystem is marked locked, mounting read-only.\n");
382 sb->s_flags |= MS_RDONLY; 410 sb->s_flags |= MS_RDONLY;
383 } else if ((vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_JOURNALED)) && !(sb->s_flags & MS_RDONLY)) { 411 } else if ((vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_JOURNALED)) &&
384 printk(KERN_WARNING "hfs: write access to a journaled filesystem is not supported, " 412 !(sb->s_flags & MS_RDONLY)) {
385 "use the force option at your own risk, mounting read-only.\n"); 413 printk(KERN_WARNING "hfs: write access to "
414 "a journaled filesystem is not supported, "
415 "use the force option at your own risk, "
416 "mounting read-only.\n");
386 sb->s_flags |= MS_RDONLY; 417 sb->s_flags |= MS_RDONLY;
387 } 418 }
388 419
@@ -413,13 +444,13 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
413 err = PTR_ERR(root); 444 err = PTR_ERR(root);
414 goto cleanup; 445 goto cleanup;
415 } 446 }
447 sb->s_d_op = &hfsplus_dentry_operations;
416 sb->s_root = d_alloc_root(root); 448 sb->s_root = d_alloc_root(root);
417 if (!sb->s_root) { 449 if (!sb->s_root) {
418 iput(root); 450 iput(root);
419 err = -ENOMEM; 451 err = -ENOMEM;
420 goto cleanup; 452 goto cleanup;
421 } 453 }
422 sb->s_root->d_op = &hfsplus_dentry_operations;
423 454
424 str.len = sizeof(HFSP_HIDDENDIR_NAME) - 1; 455 str.len = sizeof(HFSP_HIDDENDIR_NAME) - 1;
425 str.name = HFSP_HIDDENDIR_NAME; 456 str.name = HFSP_HIDDENDIR_NAME;
@@ -449,19 +480,16 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
449 be32_add_cpu(&vhdr->write_count, 1); 480 be32_add_cpu(&vhdr->write_count, 1);
450 vhdr->attributes &= cpu_to_be32(~HFSPLUS_VOL_UNMNT); 481 vhdr->attributes &= cpu_to_be32(~HFSPLUS_VOL_UNMNT);
451 vhdr->attributes |= cpu_to_be32(HFSPLUS_VOL_INCNSTNT); 482 vhdr->attributes |= cpu_to_be32(HFSPLUS_VOL_INCNSTNT);
452 mark_buffer_dirty(sbi->s_vhbh); 483 hfsplus_sync_fs(sb, 1);
453 sync_dirty_buffer(sbi->s_vhbh);
454 484
455 if (!sbi->hidden_dir) { 485 if (!sbi->hidden_dir) {
456 printk(KERN_DEBUG "hfs: create hidden dir...\n");
457
458 mutex_lock(&sbi->vh_mutex); 486 mutex_lock(&sbi->vh_mutex);
459 sbi->hidden_dir = hfsplus_new_inode(sb, S_IFDIR); 487 sbi->hidden_dir = hfsplus_new_inode(sb, S_IFDIR);
460 hfsplus_create_cat(sbi->hidden_dir->i_ino, sb->s_root->d_inode, 488 hfsplus_create_cat(sbi->hidden_dir->i_ino, sb->s_root->d_inode,
461 &str, sbi->hidden_dir); 489 &str, sbi->hidden_dir);
462 mutex_unlock(&sbi->vh_mutex); 490 mutex_unlock(&sbi->vh_mutex);
463 491
464 mark_inode_dirty(sbi->hidden_dir); 492 hfsplus_mark_inode_dirty(sbi->hidden_dir, HFSPLUS_I_CAT_DIRTY);
465 } 493 }
466out: 494out:
467 unload_nls(sbi->nls); 495 unload_nls(sbi->nls);
@@ -488,11 +516,19 @@ static struct inode *hfsplus_alloc_inode(struct super_block *sb)
488 return i ? &i->vfs_inode : NULL; 516 return i ? &i->vfs_inode : NULL;
489} 517}
490 518
491static void hfsplus_destroy_inode(struct inode *inode) 519static void hfsplus_i_callback(struct rcu_head *head)
492{ 520{
521 struct inode *inode = container_of(head, struct inode, i_rcu);
522
523 INIT_LIST_HEAD(&inode->i_dentry);
493 kmem_cache_free(hfsplus_inode_cachep, HFSPLUS_I(inode)); 524 kmem_cache_free(hfsplus_inode_cachep, HFSPLUS_I(inode));
494} 525}
495 526
527static void hfsplus_destroy_inode(struct inode *inode)
528{
529 call_rcu(&inode->i_rcu, hfsplus_i_callback);
530}
531
496#define HFSPLUS_INODE_SIZE sizeof(struct hfsplus_inode_info) 532#define HFSPLUS_INODE_SIZE sizeof(struct hfsplus_inode_info)
497 533
498static struct dentry *hfsplus_mount(struct file_system_type *fs_type, 534static struct dentry *hfsplus_mount(struct file_system_type *fs_type,
diff --git a/fs/hfsplus/unicode.c b/fs/hfsplus/unicode.c
index b66d67de882..a3f0bfcc881 100644
--- a/fs/hfsplus/unicode.c
+++ b/fs/hfsplus/unicode.c
@@ -17,14 +17,14 @@
17/* Returns folded char, or 0 if ignorable */ 17/* Returns folded char, or 0 if ignorable */
18static inline u16 case_fold(u16 c) 18static inline u16 case_fold(u16 c)
19{ 19{
20 u16 tmp; 20 u16 tmp;
21 21
22 tmp = hfsplus_case_fold_table[c >> 8]; 22 tmp = hfsplus_case_fold_table[c >> 8];
23 if (tmp) 23 if (tmp)
24 tmp = hfsplus_case_fold_table[tmp + (c & 0xff)]; 24 tmp = hfsplus_case_fold_table[tmp + (c & 0xff)];
25 else 25 else
26 tmp = c; 26 tmp = c;
27 return tmp; 27 return tmp;
28} 28}
29 29
30/* Compare unicode strings, return values like normal strcmp */ 30/* Compare unicode strings, return values like normal strcmp */
@@ -118,7 +118,9 @@ static u16 *hfsplus_compose_lookup(u16 *p, u16 cc)
118 return NULL; 118 return NULL;
119} 119}
120 120
121int hfsplus_uni2asc(struct super_block *sb, const struct hfsplus_unistr *ustr, char *astr, int *len_p) 121int hfsplus_uni2asc(struct super_block *sb,
122 const struct hfsplus_unistr *ustr,
123 char *astr, int *len_p)
122{ 124{
123 const hfsplus_unichr *ip; 125 const hfsplus_unichr *ip;
124 struct nls_table *nls = HFSPLUS_SB(sb)->nls; 126 struct nls_table *nls = HFSPLUS_SB(sb)->nls;
@@ -171,7 +173,8 @@ int hfsplus_uni2asc(struct super_block *sb, const struct hfsplus_unistr *ustr, c
171 goto same; 173 goto same;
172 c1 = be16_to_cpu(*ip); 174 c1 = be16_to_cpu(*ip);
173 if (likely(compose)) 175 if (likely(compose))
174 ce1 = hfsplus_compose_lookup(hfsplus_compose_table, c1); 176 ce1 = hfsplus_compose_lookup(
177 hfsplus_compose_table, c1);
175 if (ce1) 178 if (ce1)
176 break; 179 break;
177 switch (c0) { 180 switch (c0) {
@@ -199,7 +202,8 @@ int hfsplus_uni2asc(struct super_block *sb, const struct hfsplus_unistr *ustr, c
199 if (ce2) { 202 if (ce2) {
200 i = 1; 203 i = 1;
201 while (i < ustrlen) { 204 while (i < ustrlen) {
202 ce1 = hfsplus_compose_lookup(ce2, be16_to_cpu(ip[i])); 205 ce1 = hfsplus_compose_lookup(ce2,
206 be16_to_cpu(ip[i]));
203 if (!ce1) 207 if (!ce1)
204 break; 208 break;
205 i++; 209 i++;
@@ -211,7 +215,7 @@ int hfsplus_uni2asc(struct super_block *sb, const struct hfsplus_unistr *ustr, c
211 goto done; 215 goto done;
212 } 216 }
213 } 217 }
214 same: 218same:
215 switch (c0) { 219 switch (c0) {
216 case 0: 220 case 0:
217 cc = 0x2400; 221 cc = 0x2400;
@@ -222,7 +226,7 @@ int hfsplus_uni2asc(struct super_block *sb, const struct hfsplus_unistr *ustr, c
222 default: 226 default:
223 cc = c0; 227 cc = c0;
224 } 228 }
225 done: 229done:
226 res = nls->uni2char(cc, op, len); 230 res = nls->uni2char(cc, op, len);
227 if (res < 0) { 231 if (res < 0) {
228 if (res == -ENAMETOOLONG) 232 if (res == -ENAMETOOLONG)
@@ -320,7 +324,8 @@ int hfsplus_asc2uni(struct super_block *sb, struct hfsplus_unistr *ustr,
320 * Composed unicode characters are decomposed and case-folding is performed 324 * Composed unicode characters are decomposed and case-folding is performed
321 * if the appropriate bits are (un)set on the superblock. 325 * if the appropriate bits are (un)set on the superblock.
322 */ 326 */
323int hfsplus_hash_dentry(struct dentry *dentry, struct qstr *str) 327int hfsplus_hash_dentry(const struct dentry *dentry, const struct inode *inode,
328 struct qstr *str)
324{ 329{
325 struct super_block *sb = dentry->d_sb; 330 struct super_block *sb = dentry->d_sb;
326 const char *astr; 331 const char *astr;
@@ -363,9 +368,12 @@ int hfsplus_hash_dentry(struct dentry *dentry, struct qstr *str)
363 * Composed unicode characters are decomposed and case-folding is performed 368 * Composed unicode characters are decomposed and case-folding is performed
364 * if the appropriate bits are (un)set on the superblock. 369 * if the appropriate bits are (un)set on the superblock.
365 */ 370 */
366int hfsplus_compare_dentry(struct dentry *dentry, struct qstr *s1, struct qstr *s2) 371int hfsplus_compare_dentry(const struct dentry *parent,
372 const struct inode *pinode,
373 const struct dentry *dentry, const struct inode *inode,
374 unsigned int len, const char *str, const struct qstr *name)
367{ 375{
368 struct super_block *sb = dentry->d_sb; 376 struct super_block *sb = parent->d_sb;
369 int casefold, decompose, size; 377 int casefold, decompose, size;
370 int dsize1, dsize2, len1, len2; 378 int dsize1, dsize2, len1, len2;
371 const u16 *dstr1, *dstr2; 379 const u16 *dstr1, *dstr2;
@@ -375,10 +383,10 @@ int hfsplus_compare_dentry(struct dentry *dentry, struct qstr *s1, struct qstr *
375 383
376 casefold = test_bit(HFSPLUS_SB_CASEFOLD, &HFSPLUS_SB(sb)->flags); 384 casefold = test_bit(HFSPLUS_SB_CASEFOLD, &HFSPLUS_SB(sb)->flags);
377 decompose = !test_bit(HFSPLUS_SB_NODECOMPOSE, &HFSPLUS_SB(sb)->flags); 385 decompose = !test_bit(HFSPLUS_SB_NODECOMPOSE, &HFSPLUS_SB(sb)->flags);
378 astr1 = s1->name; 386 astr1 = str;
379 len1 = s1->len; 387 len1 = len;
380 astr2 = s2->name; 388 astr2 = name->name;
381 len2 = s2->len; 389 len2 = name->len;
382 dsize1 = dsize2 = 0; 390 dsize1 = dsize2 = 0;
383 dstr1 = dstr2 = NULL; 391 dstr1 = dstr2 = NULL;
384 392
@@ -388,7 +396,9 @@ int hfsplus_compare_dentry(struct dentry *dentry, struct qstr *s1, struct qstr *
388 astr1 += size; 396 astr1 += size;
389 len1 -= size; 397 len1 -= size;
390 398
391 if (!decompose || !(dstr1 = decompose_unichar(c, &dsize1))) { 399 if (decompose)
400 dstr1 = decompose_unichar(c, &dsize1);
401 if (!decompose || !dstr1) {
392 c1 = c; 402 c1 = c;
393 dstr1 = &c1; 403 dstr1 = &c1;
394 dsize1 = 1; 404 dsize1 = 1;
@@ -400,7 +410,9 @@ int hfsplus_compare_dentry(struct dentry *dentry, struct qstr *s1, struct qstr *
400 astr2 += size; 410 astr2 += size;
401 len2 -= size; 411 len2 -= size;
402 412
403 if (!decompose || !(dstr2 = decompose_unichar(c, &dsize2))) { 413 if (decompose)
414 dstr2 = decompose_unichar(c, &dsize2);
415 if (!decompose || !dstr2) {
404 c2 = c; 416 c2 = c;
405 dstr2 = &c2; 417 dstr2 = &c2;
406 dsize2 = 1; 418 dsize2 = 1;
diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c
index 8972c20b321..196231794f6 100644
--- a/fs/hfsplus/wrapper.c
+++ b/fs/hfsplus/wrapper.c
@@ -24,6 +24,40 @@ struct hfsplus_wd {
24 u16 embed_count; 24 u16 embed_count;
25}; 25};
26 26
27static void hfsplus_end_io_sync(struct bio *bio, int err)
28{
29 if (err)
30 clear_bit(BIO_UPTODATE, &bio->bi_flags);
31 complete(bio->bi_private);
32}
33
34int hfsplus_submit_bio(struct block_device *bdev, sector_t sector,
35 void *data, int rw)
36{
37 DECLARE_COMPLETION_ONSTACK(wait);
38 struct bio *bio;
39
40 bio = bio_alloc(GFP_NOIO, 1);
41 bio->bi_sector = sector;
42 bio->bi_bdev = bdev;
43 bio->bi_end_io = hfsplus_end_io_sync;
44 bio->bi_private = &wait;
45
46 /*
47 * We always submit one sector at a time, so bio_add_page must not fail.
48 */
49 if (bio_add_page(bio, virt_to_page(data), HFSPLUS_SECTOR_SIZE,
50 offset_in_page(data)) != HFSPLUS_SECTOR_SIZE)
51 BUG();
52
53 submit_bio(rw, bio);
54 wait_for_completion(&wait);
55
56 if (!bio_flagged(bio, BIO_UPTODATE))
57 return -EIO;
58 return 0;
59}
60
27static int hfsplus_read_mdb(void *bufptr, struct hfsplus_wd *wd) 61static int hfsplus_read_mdb(void *bufptr, struct hfsplus_wd *wd)
28{ 62{
29 u32 extent; 63 u32 extent;
@@ -40,12 +74,14 @@ static int hfsplus_read_mdb(void *bufptr, struct hfsplus_wd *wd)
40 !(attrib & HFSP_WRAP_ATTRIB_SPARED)) 74 !(attrib & HFSP_WRAP_ATTRIB_SPARED))
41 return 0; 75 return 0;
42 76
43 wd->ablk_size = be32_to_cpu(*(__be32 *)(bufptr + HFSP_WRAPOFF_ABLKSIZE)); 77 wd->ablk_size =
78 be32_to_cpu(*(__be32 *)(bufptr + HFSP_WRAPOFF_ABLKSIZE));
44 if (wd->ablk_size < HFSPLUS_SECTOR_SIZE) 79 if (wd->ablk_size < HFSPLUS_SECTOR_SIZE)
45 return 0; 80 return 0;
46 if (wd->ablk_size % HFSPLUS_SECTOR_SIZE) 81 if (wd->ablk_size % HFSPLUS_SECTOR_SIZE)
47 return 0; 82 return 0;
48 wd->ablk_start = be16_to_cpu(*(__be16 *)(bufptr + HFSP_WRAPOFF_ABLKSTART)); 83 wd->ablk_start =
84 be16_to_cpu(*(__be16 *)(bufptr + HFSP_WRAPOFF_ABLKSTART));
49 85
50 extent = get_unaligned_be32(bufptr + HFSP_WRAPOFF_EMBEDEXT); 86 extent = get_unaligned_be32(bufptr + HFSP_WRAPOFF_EMBEDEXT);
51 wd->embed_start = (extent >> 16) & 0xFFFF; 87 wd->embed_start = (extent >> 16) & 0xFFFF;
@@ -68,7 +104,8 @@ static int hfsplus_get_last_session(struct super_block *sb,
68 if (HFSPLUS_SB(sb)->session >= 0) { 104 if (HFSPLUS_SB(sb)->session >= 0) {
69 te.cdte_track = HFSPLUS_SB(sb)->session; 105 te.cdte_track = HFSPLUS_SB(sb)->session;
70 te.cdte_format = CDROM_LBA; 106 te.cdte_format = CDROM_LBA;
71 res = ioctl_by_bdev(sb->s_bdev, CDROMREADTOCENTRY, (unsigned long)&te); 107 res = ioctl_by_bdev(sb->s_bdev,
108 CDROMREADTOCENTRY, (unsigned long)&te);
72 if (!res && (te.cdte_ctrl & CDROM_DATA_TRACK) == 4) { 109 if (!res && (te.cdte_ctrl & CDROM_DATA_TRACK) == 4) {
73 *start = (sector_t)te.cdte_addr.lba << 2; 110 *start = (sector_t)te.cdte_addr.lba << 2;
74 return 0; 111 return 0;
@@ -77,7 +114,8 @@ static int hfsplus_get_last_session(struct super_block *sb,
77 return -EINVAL; 114 return -EINVAL;
78 } 115 }
79 ms_info.addr_format = CDROM_LBA; 116 ms_info.addr_format = CDROM_LBA;
80 res = ioctl_by_bdev(sb->s_bdev, CDROMMULTISESSION, (unsigned long)&ms_info); 117 res = ioctl_by_bdev(sb->s_bdev, CDROMMULTISESSION,
118 (unsigned long)&ms_info);
81 if (!res && ms_info.xa_flag) 119 if (!res && ms_info.xa_flag)
82 *start = (sector_t)ms_info.addr.lba << 2; 120 *start = (sector_t)ms_info.addr.lba << 2;
83 return 0; 121 return 0;
@@ -88,100 +126,112 @@ static int hfsplus_get_last_session(struct super_block *sb,
88int hfsplus_read_wrapper(struct super_block *sb) 126int hfsplus_read_wrapper(struct super_block *sb)
89{ 127{
90 struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); 128 struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
91 struct buffer_head *bh;
92 struct hfsplus_vh *vhdr;
93 struct hfsplus_wd wd; 129 struct hfsplus_wd wd;
94 sector_t part_start, part_size; 130 sector_t part_start, part_size;
95 u32 blocksize; 131 u32 blocksize;
132 int error = 0;
96 133
134 error = -EINVAL;
97 blocksize = sb_min_blocksize(sb, HFSPLUS_SECTOR_SIZE); 135 blocksize = sb_min_blocksize(sb, HFSPLUS_SECTOR_SIZE);
98 if (!blocksize) 136 if (!blocksize)
99 return -EINVAL; 137 goto out;
100 138
101 if (hfsplus_get_last_session(sb, &part_start, &part_size)) 139 if (hfsplus_get_last_session(sb, &part_start, &part_size))
102 return -EINVAL; 140 goto out;
103 if ((u64)part_start + part_size > 0x100000000ULL) { 141 if ((u64)part_start + part_size > 0x100000000ULL) {
104 pr_err("hfs: volumes larger than 2TB are not supported yet\n"); 142 pr_err("hfs: volumes larger than 2TB are not supported yet\n");
105 return -EINVAL; 143 goto out;
106 } 144 }
107 while (1) {
108 bh = sb_bread512(sb, part_start + HFSPLUS_VOLHEAD_SECTOR, vhdr);
109 if (!bh)
110 return -EIO;
111
112 if (vhdr->signature == cpu_to_be16(HFSP_WRAP_MAGIC)) {
113 if (!hfsplus_read_mdb(vhdr, &wd))
114 goto error;
115 wd.ablk_size >>= HFSPLUS_SECTOR_SHIFT;
116 part_start += wd.ablk_start + wd.embed_start * wd.ablk_size;
117 part_size = wd.embed_count * wd.ablk_size;
118 brelse(bh);
119 bh = sb_bread512(sb, part_start + HFSPLUS_VOLHEAD_SECTOR, vhdr);
120 if (!bh)
121 return -EIO;
122 }
123 if (vhdr->signature == cpu_to_be16(HFSPLUS_VOLHEAD_SIG))
124 break;
125 if (vhdr->signature == cpu_to_be16(HFSPLUS_VOLHEAD_SIGX)) {
126 set_bit(HFSPLUS_SB_HFSX, &sbi->flags);
127 break;
128 }
129 brelse(bh);
130 145
131 /* check for a partition block 146 error = -ENOMEM;
147 sbi->s_vhdr = kmalloc(HFSPLUS_SECTOR_SIZE, GFP_KERNEL);
148 if (!sbi->s_vhdr)
149 goto out;
150 sbi->s_backup_vhdr = kmalloc(HFSPLUS_SECTOR_SIZE, GFP_KERNEL);
151 if (!sbi->s_backup_vhdr)
152 goto out_free_vhdr;
153
154reread:
155 error = hfsplus_submit_bio(sb->s_bdev,
156 part_start + HFSPLUS_VOLHEAD_SECTOR,
157 sbi->s_vhdr, READ);
158 if (error)
159 goto out_free_backup_vhdr;
160
161 error = -EINVAL;
162 switch (sbi->s_vhdr->signature) {
163 case cpu_to_be16(HFSPLUS_VOLHEAD_SIGX):
164 set_bit(HFSPLUS_SB_HFSX, &sbi->flags);
165 /*FALLTHRU*/
166 case cpu_to_be16(HFSPLUS_VOLHEAD_SIG):
167 break;
168 case cpu_to_be16(HFSP_WRAP_MAGIC):
169 if (!hfsplus_read_mdb(sbi->s_vhdr, &wd))
170 goto out;
171 wd.ablk_size >>= HFSPLUS_SECTOR_SHIFT;
172 part_start += wd.ablk_start + wd.embed_start * wd.ablk_size;
173 part_size = wd.embed_count * wd.ablk_size;
174 goto reread;
175 default:
176 /*
177 * Check for a partition block.
178 *
132 * (should do this only for cdrom/loop though) 179 * (should do this only for cdrom/loop though)
133 */ 180 */
134 if (hfs_part_find(sb, &part_start, &part_size)) 181 if (hfs_part_find(sb, &part_start, &part_size))
135 return -EINVAL; 182 goto out;
183 goto reread;
184 }
185
186 error = hfsplus_submit_bio(sb->s_bdev,
187 part_start + part_size - 2,
188 sbi->s_backup_vhdr, READ);
189 if (error)
190 goto out_free_backup_vhdr;
191
192 error = -EINVAL;
193 if (sbi->s_backup_vhdr->signature != sbi->s_vhdr->signature) {
194 printk(KERN_WARNING
195 "hfs: invalid secondary volume header\n");
196 goto out_free_backup_vhdr;
136 } 197 }
137 198
138 blocksize = be32_to_cpu(vhdr->blocksize); 199 blocksize = be32_to_cpu(sbi->s_vhdr->blocksize);
139 brelse(bh);
140 200
141 /* block size must be at least as large as a sector 201 /*
142 * and a multiple of 2 202 * Block size must be at least as large as a sector and a multiple of 2.
143 */ 203 */
144 if (blocksize < HFSPLUS_SECTOR_SIZE || 204 if (blocksize < HFSPLUS_SECTOR_SIZE || ((blocksize - 1) & blocksize))
145 ((blocksize - 1) & blocksize)) 205 goto out_free_backup_vhdr;
146 return -EINVAL;
147 sbi->alloc_blksz = blocksize; 206 sbi->alloc_blksz = blocksize;
148 sbi->alloc_blksz_shift = 0; 207 sbi->alloc_blksz_shift = 0;
149 while ((blocksize >>= 1) != 0) 208 while ((blocksize >>= 1) != 0)
150 sbi->alloc_blksz_shift++; 209 sbi->alloc_blksz_shift++;
151 blocksize = min(sbi->alloc_blksz, (u32)PAGE_SIZE); 210 blocksize = min(sbi->alloc_blksz, (u32)PAGE_SIZE);
152 211
153 /* align block size to block offset */ 212 /*
213 * Align block size to block offset.
214 */
154 while (part_start & ((blocksize >> HFSPLUS_SECTOR_SHIFT) - 1)) 215 while (part_start & ((blocksize >> HFSPLUS_SECTOR_SHIFT) - 1))
155 blocksize >>= 1; 216 blocksize >>= 1;
156 217
157 if (sb_set_blocksize(sb, blocksize) != blocksize) { 218 if (sb_set_blocksize(sb, blocksize) != blocksize) {
158 printk(KERN_ERR "hfs: unable to set blocksize to %u!\n", blocksize); 219 printk(KERN_ERR "hfs: unable to set blocksize to %u!\n",
159 return -EINVAL; 220 blocksize);
221 goto out_free_backup_vhdr;
160 } 222 }
161 223
162 sbi->blockoffset = 224 sbi->blockoffset =
163 part_start >> (sb->s_blocksize_bits - HFSPLUS_SECTOR_SHIFT); 225 part_start >> (sb->s_blocksize_bits - HFSPLUS_SECTOR_SHIFT);
226 sbi->part_start = part_start;
164 sbi->sect_count = part_size; 227 sbi->sect_count = part_size;
165 sbi->fs_shift = sbi->alloc_blksz_shift - sb->s_blocksize_bits; 228 sbi->fs_shift = sbi->alloc_blksz_shift - sb->s_blocksize_bits;
166
167 bh = sb_bread512(sb, part_start + HFSPLUS_VOLHEAD_SECTOR, vhdr);
168 if (!bh)
169 return -EIO;
170
171 /* should still be the same... */
172 if (test_bit(HFSPLUS_SB_HFSX, &sbi->flags)) {
173 if (vhdr->signature != cpu_to_be16(HFSPLUS_VOLHEAD_SIGX))
174 goto error;
175 } else {
176 if (vhdr->signature != cpu_to_be16(HFSPLUS_VOLHEAD_SIG))
177 goto error;
178 }
179
180 sbi->s_vhbh = bh;
181 sbi->s_vhdr = vhdr;
182
183 return 0; 229 return 0;
184 error: 230
185 brelse(bh); 231out_free_backup_vhdr:
186 return -EINVAL; 232 kfree(sbi->s_backup_vhdr);
233out_free_vhdr:
234 kfree(sbi->s_vhdr);
235out:
236 return error;
187} 237}
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 2c0f148a49e..2638c834ed2 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -32,7 +32,7 @@ static inline struct hostfs_inode_info *HOSTFS_I(struct inode *inode)
32 32
33#define FILE_HOSTFS_I(file) HOSTFS_I((file)->f_path.dentry->d_inode) 33#define FILE_HOSTFS_I(file) HOSTFS_I((file)->f_path.dentry->d_inode)
34 34
35static int hostfs_d_delete(struct dentry *dentry) 35static int hostfs_d_delete(const struct dentry *dentry)
36{ 36{
37 return 1; 37 return 1;
38} 38}
@@ -92,12 +92,10 @@ __uml_setup("hostfs=", hostfs_args,
92 92
93static char *__dentry_name(struct dentry *dentry, char *name) 93static char *__dentry_name(struct dentry *dentry, char *name)
94{ 94{
95 char *p = __dentry_path(dentry, name, PATH_MAX); 95 char *p = dentry_path_raw(dentry, name, PATH_MAX);
96 char *root; 96 char *root;
97 size_t len; 97 size_t len;
98 98
99 spin_unlock(&dcache_lock);
100
101 root = dentry->d_sb->s_fs_info; 99 root = dentry->d_sb->s_fs_info;
102 len = strlen(root); 100 len = strlen(root);
103 if (IS_ERR(p)) { 101 if (IS_ERR(p)) {
@@ -123,25 +121,23 @@ static char *dentry_name(struct dentry *dentry)
123 if (!name) 121 if (!name)
124 return NULL; 122 return NULL;
125 123
126 spin_lock(&dcache_lock);
127 return __dentry_name(dentry, name); /* will unlock */ 124 return __dentry_name(dentry, name); /* will unlock */
128} 125}
129 126
130static char *inode_name(struct inode *ino) 127static char *inode_name(struct inode *ino)
131{ 128{
132 struct dentry *dentry; 129 struct dentry *dentry;
133 char *name = __getname(); 130 char *name;
134 if (!name)
135 return NULL;
136 131
137 spin_lock(&dcache_lock); 132 dentry = d_find_alias(ino);
138 if (list_empty(&ino->i_dentry)) { 133 if (!dentry)
139 spin_unlock(&dcache_lock);
140 __putname(name);
141 return NULL; 134 return NULL;
142 } 135
143 dentry = list_first_entry(&ino->i_dentry, struct dentry, d_alias); 136 name = dentry_name(dentry);
144 return __dentry_name(dentry, name); /* will unlock */ 137
138 dput(dentry);
139
140 return name;
145} 141}
146 142
147static char *follow_link(char *link) 143static char *follow_link(char *link)
@@ -251,11 +247,18 @@ static void hostfs_evict_inode(struct inode *inode)
251 } 247 }
252} 248}
253 249
254static void hostfs_destroy_inode(struct inode *inode) 250static void hostfs_i_callback(struct rcu_head *head)
255{ 251{
252 struct inode *inode = container_of(head, struct inode, i_rcu);
253 INIT_LIST_HEAD(&inode->i_dentry);
256 kfree(HOSTFS_I(inode)); 254 kfree(HOSTFS_I(inode));
257} 255}
258 256
257static void hostfs_destroy_inode(struct inode *inode)
258{
259 call_rcu(&inode->i_rcu, hostfs_i_callback);
260}
261
259static int hostfs_show_options(struct seq_file *seq, struct vfsmount *vfs) 262static int hostfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
260{ 263{
261 const char *root_path = vfs->mnt_sb->s_fs_info; 264 const char *root_path = vfs->mnt_sb->s_fs_info;
@@ -609,7 +612,6 @@ struct dentry *hostfs_lookup(struct inode *ino, struct dentry *dentry,
609 goto out_put; 612 goto out_put;
610 613
611 d_add(dentry, inode); 614 d_add(dentry, inode);
612 dentry->d_op = &hostfs_dentry_ops;
613 return NULL; 615 return NULL;
614 616
615 out_put: 617 out_put:
@@ -746,11 +748,14 @@ int hostfs_rename(struct inode *from_ino, struct dentry *from,
746 return err; 748 return err;
747} 749}
748 750
749int hostfs_permission(struct inode *ino, int desired) 751int hostfs_permission(struct inode *ino, int desired, unsigned int flags)
750{ 752{
751 char *name; 753 char *name;
752 int r = 0, w = 0, x = 0, err; 754 int r = 0, w = 0, x = 0, err;
753 755
756 if (flags & IPERM_FLAG_RCU)
757 return -ECHILD;
758
754 if (desired & MAY_READ) r = 1; 759 if (desired & MAY_READ) r = 1;
755 if (desired & MAY_WRITE) w = 1; 760 if (desired & MAY_WRITE) w = 1;
756 if (desired & MAY_EXEC) x = 1; 761 if (desired & MAY_EXEC) x = 1;
@@ -765,7 +770,7 @@ int hostfs_permission(struct inode *ino, int desired)
765 err = access_file(name, r, w, x); 770 err = access_file(name, r, w, x);
766 __putname(name); 771 __putname(name);
767 if (!err) 772 if (!err)
768 err = generic_permission(ino, desired, NULL); 773 err = generic_permission(ino, desired, flags, NULL);
769 return err; 774 return err;
770} 775}
771 776
@@ -916,6 +921,7 @@ static int hostfs_fill_sb_common(struct super_block *sb, void *d, int silent)
916 sb->s_blocksize_bits = 10; 921 sb->s_blocksize_bits = 10;
917 sb->s_magic = HOSTFS_SUPER_MAGIC; 922 sb->s_magic = HOSTFS_SUPER_MAGIC;
918 sb->s_op = &hostfs_sbops; 923 sb->s_op = &hostfs_sbops;
924 sb->s_d_op = &hostfs_dentry_ops;
919 sb->s_maxbytes = MAX_LFS_FILESIZE; 925 sb->s_maxbytes = MAX_LFS_FILESIZE;
920 926
921 /* NULL is printed as <NULL> by sprintf: avoid that. */ 927 /* NULL is printed as <NULL> by sprintf: avoid that. */
diff --git a/fs/hpfs/dentry.c b/fs/hpfs/dentry.c
index 67d9d36b3d5..05d4816e4e7 100644
--- a/fs/hpfs/dentry.c
+++ b/fs/hpfs/dentry.c
@@ -12,7 +12,8 @@
12 * Note: the dentry argument is the parent dentry. 12 * Note: the dentry argument is the parent dentry.
13 */ 13 */
14 14
15static int hpfs_hash_dentry(struct dentry *dentry, struct qstr *qstr) 15static int hpfs_hash_dentry(const struct dentry *dentry, const struct inode *inode,
16 struct qstr *qstr)
16{ 17{
17 unsigned long hash; 18 unsigned long hash;
18 int i; 19 int i;
@@ -34,29 +35,30 @@ static int hpfs_hash_dentry(struct dentry *dentry, struct qstr *qstr)
34 return 0; 35 return 0;
35} 36}
36 37
37static int hpfs_compare_dentry(struct dentry *dentry, struct qstr *a, struct qstr *b) 38static int hpfs_compare_dentry(const struct dentry *parent,
39 const struct inode *pinode,
40 const struct dentry *dentry, const struct inode *inode,
41 unsigned int len, const char *str, const struct qstr *name)
38{ 42{
39 unsigned al=a->len; 43 unsigned al = len;
40 unsigned bl=b->len; 44 unsigned bl = name->len;
41 hpfs_adjust_length(a->name, &al); 45
46 hpfs_adjust_length(str, &al);
42 /*hpfs_adjust_length(b->name, &bl);*/ 47 /*hpfs_adjust_length(b->name, &bl);*/
43 /* 'a' is the qstr of an already existing dentry, so the name 48
44 * must be valid. 'b' must be validated first. 49 /*
50 * 'str' is the name of an already existing dentry, so the name
51 * must be valid. 'name' must be validated first.
45 */ 52 */
46 53
47 if (hpfs_chk_name(b->name, &bl)) 54 if (hpfs_chk_name(name->name, &bl))
48 return 1; 55 return 1;
49 if (hpfs_compare_names(dentry->d_sb, a->name, al, b->name, bl, 0)) 56 if (hpfs_compare_names(parent->d_sb, str, al, name->name, bl, 0))
50 return 1; 57 return 1;
51 return 0; 58 return 0;
52} 59}
53 60
54static const struct dentry_operations hpfs_dentry_operations = { 61const struct dentry_operations hpfs_dentry_operations = {
55 .d_hash = hpfs_hash_dentry, 62 .d_hash = hpfs_hash_dentry,
56 .d_compare = hpfs_compare_dentry, 63 .d_compare = hpfs_compare_dentry,
57}; 64};
58
59void hpfs_set_dentry_operations(struct dentry *dentry)
60{
61 dentry->d_op = &hpfs_dentry_operations;
62}
diff --git a/fs/hpfs/dir.c b/fs/hpfs/dir.c
index 2338130cceb..d32f63a569f 100644
--- a/fs/hpfs/dir.c
+++ b/fs/hpfs/dir.c
@@ -298,7 +298,6 @@ struct dentry *hpfs_lookup(struct inode *dir, struct dentry *dentry, struct name
298 298
299 end: 299 end:
300 end_add: 300 end_add:
301 hpfs_set_dentry_operations(dentry);
302 unlock_kernel(); 301 unlock_kernel();
303 d_add(dentry, result); 302 d_add(dentry, result);
304 return NULL; 303 return NULL;
diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h
index 2fee17d0d9a..1c43dbea55e 100644
--- a/fs/hpfs/hpfs_fn.h
+++ b/fs/hpfs/hpfs_fn.h
@@ -233,7 +233,7 @@ void hpfs_mark_4buffers_dirty(struct quad_buffer_head *);
233 233
234/* dentry.c */ 234/* dentry.c */
235 235
236void hpfs_set_dentry_operations(struct dentry *); 236extern const struct dentry_operations hpfs_dentry_operations;
237 237
238/* dir.c */ 238/* dir.c */
239 239
diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c
index 56f0da1cfd1..1ae35baa539 100644
--- a/fs/hpfs/inode.c
+++ b/fs/hpfs/inode.c
@@ -281,7 +281,7 @@ int hpfs_setattr(struct dentry *dentry, struct iattr *attr)
281 attr->ia_size != i_size_read(inode)) { 281 attr->ia_size != i_size_read(inode)) {
282 error = vmtruncate(inode, attr->ia_size); 282 error = vmtruncate(inode, attr->ia_size);
283 if (error) 283 if (error)
284 return error; 284 goto out_unlock;
285 } 285 }
286 286
287 setattr_copy(inode, attr); 287 setattr_copy(inode, attr);
diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c
index 11c2b4080f6..f4ad9e31ddc 100644
--- a/fs/hpfs/namei.c
+++ b/fs/hpfs/namei.c
@@ -419,7 +419,7 @@ again:
419 unlock_kernel(); 419 unlock_kernel();
420 return -ENOSPC; 420 return -ENOSPC;
421 } 421 }
422 if (generic_permission(inode, MAY_WRITE, NULL) || 422 if (generic_permission(inode, MAY_WRITE, 0, NULL) ||
423 !S_ISREG(inode->i_mode) || 423 !S_ISREG(inode->i_mode) ||
424 get_write_access(inode)) { 424 get_write_access(inode)) {
425 d_rehash(dentry); 425 d_rehash(dentry);
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index 6c5f01597c3..b30426b1fc9 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -177,11 +177,18 @@ static struct inode *hpfs_alloc_inode(struct super_block *sb)
177 return &ei->vfs_inode; 177 return &ei->vfs_inode;
178} 178}
179 179
180static void hpfs_destroy_inode(struct inode *inode) 180static void hpfs_i_callback(struct rcu_head *head)
181{ 181{
182 struct inode *inode = container_of(head, struct inode, i_rcu);
183 INIT_LIST_HEAD(&inode->i_dentry);
182 kmem_cache_free(hpfs_inode_cachep, hpfs_i(inode)); 184 kmem_cache_free(hpfs_inode_cachep, hpfs_i(inode));
183} 185}
184 186
187static void hpfs_destroy_inode(struct inode *inode)
188{
189 call_rcu(&inode->i_rcu, hpfs_i_callback);
190}
191
185static void init_once(void *foo) 192static void init_once(void *foo)
186{ 193{
187 struct hpfs_inode_info *ei = (struct hpfs_inode_info *) foo; 194 struct hpfs_inode_info *ei = (struct hpfs_inode_info *) foo;
@@ -543,6 +550,7 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent)
543 /* Fill superblock stuff */ 550 /* Fill superblock stuff */
544 s->s_magic = HPFS_SUPER_MAGIC; 551 s->s_magic = HPFS_SUPER_MAGIC;
545 s->s_op = &hpfs_sops; 552 s->s_op = &hpfs_sops;
553 s->s_d_op = &hpfs_dentry_operations;
546 554
547 sbi->sb_root = superblock->root; 555 sbi->sb_root = superblock->root;
548 sbi->sb_fs_size = superblock->n_sectors; 556 sbi->sb_fs_size = superblock->n_sectors;
@@ -644,7 +652,6 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent)
644 iput(root); 652 iput(root);
645 goto bail0; 653 goto bail0;
646 } 654 }
647 hpfs_set_dentry_operations(s->s_root);
648 655
649 /* 656 /*
650 * find the root directory's . pointer & finish filling in the inode 657 * find the root directory's . pointer & finish filling in the inode
diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c
index f702b5f713f..87ed48e0343 100644
--- a/fs/hppfs/hppfs.c
+++ b/fs/hppfs/hppfs.c
@@ -632,11 +632,18 @@ void hppfs_evict_inode(struct inode *ino)
632 mntput(ino->i_sb->s_fs_info); 632 mntput(ino->i_sb->s_fs_info);
633} 633}
634 634
635static void hppfs_destroy_inode(struct inode *inode) 635static void hppfs_i_callback(struct rcu_head *head)
636{ 636{
637 struct inode *inode = container_of(head, struct inode, i_rcu);
638 INIT_LIST_HEAD(&inode->i_dentry);
637 kfree(HPPFS_I(inode)); 639 kfree(HPPFS_I(inode));
638} 640}
639 641
642static void hppfs_destroy_inode(struct inode *inode)
643{
644 call_rcu(&inode->i_rcu, hppfs_i_callback);
645}
646
640static const struct super_operations hppfs_sbops = { 647static const struct super_operations hppfs_sbops = {
641 .alloc_inode = hppfs_alloc_inode, 648 .alloc_inode = hppfs_alloc_inode,
642 .destroy_inode = hppfs_destroy_inode, 649 .destroy_inode = hppfs_destroy_inode,
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index d6cfac1f0a4..9885082b470 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -663,11 +663,18 @@ static struct inode *hugetlbfs_alloc_inode(struct super_block *sb)
663 return &p->vfs_inode; 663 return &p->vfs_inode;
664} 664}
665 665
666static void hugetlbfs_i_callback(struct rcu_head *head)
667{
668 struct inode *inode = container_of(head, struct inode, i_rcu);
669 INIT_LIST_HEAD(&inode->i_dentry);
670 kmem_cache_free(hugetlbfs_inode_cachep, HUGETLBFS_I(inode));
671}
672
666static void hugetlbfs_destroy_inode(struct inode *inode) 673static void hugetlbfs_destroy_inode(struct inode *inode)
667{ 674{
668 hugetlbfs_inc_free_inodes(HUGETLBFS_SB(inode->i_sb)); 675 hugetlbfs_inc_free_inodes(HUGETLBFS_SB(inode->i_sb));
669 mpol_free_shared_policy(&HUGETLBFS_I(inode)->policy); 676 mpol_free_shared_policy(&HUGETLBFS_I(inode)->policy);
670 kmem_cache_free(hugetlbfs_inode_cachep, HUGETLBFS_I(inode)); 677 call_rcu(&inode->i_rcu, hugetlbfs_i_callback);
671} 678}
672 679
673static const struct address_space_operations hugetlbfs_aops = { 680static const struct address_space_operations hugetlbfs_aops = {
@@ -932,8 +939,7 @@ struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag,
932 if (creat_flags == HUGETLB_SHMFS_INODE && !can_do_hugetlb_shm()) { 939 if (creat_flags == HUGETLB_SHMFS_INODE && !can_do_hugetlb_shm()) {
933 *user = current_user(); 940 *user = current_user();
934 if (user_shm_lock(size, *user)) { 941 if (user_shm_lock(size, *user)) {
935 WARN_ONCE(1, 942 printk_once(KERN_WARNING "Using mlock ulimits for SHM_HUGETLB is deprecated\n");
936 "Using mlock ulimits for SHM_HUGETLB deprecated\n");
937 } else { 943 } else {
938 *user = NULL; 944 *user = NULL;
939 return ERR_PTR(-EPERM); 945 return ERR_PTR(-EPERM);
diff --git a/fs/inode.c b/fs/inode.c
index ae2727ab0c3..da85e56378f 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -102,26 +102,29 @@ static DECLARE_RWSEM(iprune_sem);
102 */ 102 */
103struct inodes_stat_t inodes_stat; 103struct inodes_stat_t inodes_stat;
104 104
105static struct percpu_counter nr_inodes __cacheline_aligned_in_smp; 105static DEFINE_PER_CPU(unsigned int, nr_inodes);
106static struct percpu_counter nr_inodes_unused __cacheline_aligned_in_smp;
107 106
108static struct kmem_cache *inode_cachep __read_mostly; 107static struct kmem_cache *inode_cachep __read_mostly;
109 108
110static inline int get_nr_inodes(void) 109static int get_nr_inodes(void)
111{ 110{
112 return percpu_counter_sum_positive(&nr_inodes); 111 int i;
112 int sum = 0;
113 for_each_possible_cpu(i)
114 sum += per_cpu(nr_inodes, i);
115 return sum < 0 ? 0 : sum;
113} 116}
114 117
115static inline int get_nr_inodes_unused(void) 118static inline int get_nr_inodes_unused(void)
116{ 119{
117 return percpu_counter_sum_positive(&nr_inodes_unused); 120 return inodes_stat.nr_unused;
118} 121}
119 122
120int get_nr_dirty_inodes(void) 123int get_nr_dirty_inodes(void)
121{ 124{
125 /* not actually dirty inodes, but a wild approximation */
122 int nr_dirty = get_nr_inodes() - get_nr_inodes_unused(); 126 int nr_dirty = get_nr_inodes() - get_nr_inodes_unused();
123 return nr_dirty > 0 ? nr_dirty : 0; 127 return nr_dirty > 0 ? nr_dirty : 0;
124
125} 128}
126 129
127/* 130/*
@@ -132,7 +135,6 @@ int proc_nr_inodes(ctl_table *table, int write,
132 void __user *buffer, size_t *lenp, loff_t *ppos) 135 void __user *buffer, size_t *lenp, loff_t *ppos)
133{ 136{
134 inodes_stat.nr_inodes = get_nr_inodes(); 137 inodes_stat.nr_inodes = get_nr_inodes();
135 inodes_stat.nr_unused = get_nr_inodes_unused();
136 return proc_dointvec(table, write, buffer, lenp, ppos); 138 return proc_dointvec(table, write, buffer, lenp, ppos);
137} 139}
138#endif 140#endif
@@ -224,7 +226,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
224 inode->i_fsnotify_mask = 0; 226 inode->i_fsnotify_mask = 0;
225#endif 227#endif
226 228
227 percpu_counter_inc(&nr_inodes); 229 this_cpu_inc(nr_inodes);
228 230
229 return 0; 231 return 0;
230out: 232out:
@@ -255,6 +257,12 @@ static struct inode *alloc_inode(struct super_block *sb)
255 return inode; 257 return inode;
256} 258}
257 259
260void free_inode_nonrcu(struct inode *inode)
261{
262 kmem_cache_free(inode_cachep, inode);
263}
264EXPORT_SYMBOL(free_inode_nonrcu);
265
258void __destroy_inode(struct inode *inode) 266void __destroy_inode(struct inode *inode)
259{ 267{
260 BUG_ON(inode_has_buffers(inode)); 268 BUG_ON(inode_has_buffers(inode));
@@ -266,10 +274,17 @@ void __destroy_inode(struct inode *inode)
266 if (inode->i_default_acl && inode->i_default_acl != ACL_NOT_CACHED) 274 if (inode->i_default_acl && inode->i_default_acl != ACL_NOT_CACHED)
267 posix_acl_release(inode->i_default_acl); 275 posix_acl_release(inode->i_default_acl);
268#endif 276#endif
269 percpu_counter_dec(&nr_inodes); 277 this_cpu_dec(nr_inodes);
270} 278}
271EXPORT_SYMBOL(__destroy_inode); 279EXPORT_SYMBOL(__destroy_inode);
272 280
281static void i_callback(struct rcu_head *head)
282{
283 struct inode *inode = container_of(head, struct inode, i_rcu);
284 INIT_LIST_HEAD(&inode->i_dentry);
285 kmem_cache_free(inode_cachep, inode);
286}
287
273static void destroy_inode(struct inode *inode) 288static void destroy_inode(struct inode *inode)
274{ 289{
275 BUG_ON(!list_empty(&inode->i_lru)); 290 BUG_ON(!list_empty(&inode->i_lru));
@@ -277,7 +292,7 @@ static void destroy_inode(struct inode *inode)
277 if (inode->i_sb->s_op->destroy_inode) 292 if (inode->i_sb->s_op->destroy_inode)
278 inode->i_sb->s_op->destroy_inode(inode); 293 inode->i_sb->s_op->destroy_inode(inode);
279 else 294 else
280 kmem_cache_free(inode_cachep, (inode)); 295 call_rcu(&inode->i_rcu, i_callback);
281} 296}
282 297
283/* 298/*
@@ -335,7 +350,7 @@ static void inode_lru_list_add(struct inode *inode)
335{ 350{
336 if (list_empty(&inode->i_lru)) { 351 if (list_empty(&inode->i_lru)) {
337 list_add(&inode->i_lru, &inode_lru); 352 list_add(&inode->i_lru, &inode_lru);
338 percpu_counter_inc(&nr_inodes_unused); 353 inodes_stat.nr_unused++;
339 } 354 }
340} 355}
341 356
@@ -343,7 +358,7 @@ static void inode_lru_list_del(struct inode *inode)
343{ 358{
344 if (!list_empty(&inode->i_lru)) { 359 if (!list_empty(&inode->i_lru)) {
345 list_del_init(&inode->i_lru); 360 list_del_init(&inode->i_lru);
346 percpu_counter_dec(&nr_inodes_unused); 361 inodes_stat.nr_unused--;
347 } 362 }
348} 363}
349 364
@@ -430,6 +445,7 @@ void end_writeback(struct inode *inode)
430 BUG_ON(!(inode->i_state & I_FREEING)); 445 BUG_ON(!(inode->i_state & I_FREEING));
431 BUG_ON(inode->i_state & I_CLEAR); 446 BUG_ON(inode->i_state & I_CLEAR);
432 inode_sync_wait(inode); 447 inode_sync_wait(inode);
448 /* don't need i_lock here, no concurrent mods to i_state */
433 inode->i_state = I_FREEING | I_CLEAR; 449 inode->i_state = I_FREEING | I_CLEAR;
434} 450}
435EXPORT_SYMBOL(end_writeback); 451EXPORT_SYMBOL(end_writeback);
@@ -513,7 +529,7 @@ void evict_inodes(struct super_block *sb)
513 list_move(&inode->i_lru, &dispose); 529 list_move(&inode->i_lru, &dispose);
514 list_del_init(&inode->i_wb_list); 530 list_del_init(&inode->i_wb_list);
515 if (!(inode->i_state & (I_DIRTY | I_SYNC))) 531 if (!(inode->i_state & (I_DIRTY | I_SYNC)))
516 percpu_counter_dec(&nr_inodes_unused); 532 inodes_stat.nr_unused--;
517 } 533 }
518 spin_unlock(&inode_lock); 534 spin_unlock(&inode_lock);
519 535
@@ -554,7 +570,7 @@ int invalidate_inodes(struct super_block *sb)
554 list_move(&inode->i_lru, &dispose); 570 list_move(&inode->i_lru, &dispose);
555 list_del_init(&inode->i_wb_list); 571 list_del_init(&inode->i_wb_list);
556 if (!(inode->i_state & (I_DIRTY | I_SYNC))) 572 if (!(inode->i_state & (I_DIRTY | I_SYNC)))
557 percpu_counter_dec(&nr_inodes_unused); 573 inodes_stat.nr_unused--;
558 } 574 }
559 spin_unlock(&inode_lock); 575 spin_unlock(&inode_lock);
560 576
@@ -616,7 +632,7 @@ static void prune_icache(int nr_to_scan)
616 if (atomic_read(&inode->i_count) || 632 if (atomic_read(&inode->i_count) ||
617 (inode->i_state & ~I_REFERENCED)) { 633 (inode->i_state & ~I_REFERENCED)) {
618 list_del_init(&inode->i_lru); 634 list_del_init(&inode->i_lru);
619 percpu_counter_dec(&nr_inodes_unused); 635 inodes_stat.nr_unused--;
620 continue; 636 continue;
621 } 637 }
622 638
@@ -650,7 +666,7 @@ static void prune_icache(int nr_to_scan)
650 */ 666 */
651 list_move(&inode->i_lru, &freeable); 667 list_move(&inode->i_lru, &freeable);
652 list_del_init(&inode->i_wb_list); 668 list_del_init(&inode->i_wb_list);
653 percpu_counter_dec(&nr_inodes_unused); 669 inodes_stat.nr_unused--;
654 } 670 }
655 if (current_is_kswapd()) 671 if (current_is_kswapd())
656 __count_vm_events(KSWAPD_INODESTEAL, reap); 672 __count_vm_events(KSWAPD_INODESTEAL, reap);
@@ -1648,8 +1664,6 @@ void __init inode_init(void)
1648 SLAB_MEM_SPREAD), 1664 SLAB_MEM_SPREAD),
1649 init_once); 1665 init_once);
1650 register_shrinker(&icache_shrinker); 1666 register_shrinker(&icache_shrinker);
1651 percpu_counter_init(&nr_inodes, 0);
1652 percpu_counter_init(&nr_inodes_unused, 0);
1653 1667
1654 /* Hash may have been set up in inode_init_early */ 1668 /* Hash may have been set up in inode_init_early */
1655 if (!hashdist) 1669 if (!hashdist)
diff --git a/fs/internal.h b/fs/internal.h
index e43b9a4dbf4..0663568b124 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -63,12 +63,17 @@ extern int copy_mount_string(const void __user *, char **);
63 63
64extern void free_vfsmnt(struct vfsmount *); 64extern void free_vfsmnt(struct vfsmount *);
65extern struct vfsmount *alloc_vfsmnt(const char *); 65extern struct vfsmount *alloc_vfsmnt(const char *);
66extern unsigned int mnt_get_count(struct vfsmount *mnt);
66extern struct vfsmount *__lookup_mnt(struct vfsmount *, struct dentry *, int); 67extern struct vfsmount *__lookup_mnt(struct vfsmount *, struct dentry *, int);
67extern void mnt_set_mountpoint(struct vfsmount *, struct dentry *, 68extern void mnt_set_mountpoint(struct vfsmount *, struct dentry *,
68 struct vfsmount *); 69 struct vfsmount *);
69extern void release_mounts(struct list_head *); 70extern void release_mounts(struct list_head *);
70extern void umount_tree(struct vfsmount *, int, struct list_head *); 71extern void umount_tree(struct vfsmount *, int, struct list_head *);
71extern struct vfsmount *copy_tree(struct vfsmount *, struct dentry *, int); 72extern struct vfsmount *copy_tree(struct vfsmount *, struct dentry *, int);
73extern int finish_automount(struct vfsmount *, struct path *);
74
75extern void mnt_make_longterm(struct vfsmount *);
76extern void mnt_make_shortterm(struct vfsmount *);
72 77
73extern void __init mnt_init(void); 78extern void __init mnt_init(void);
74 79
diff --git a/fs/ioctl.c b/fs/ioctl.c
index e92fdbb3bc3..a59635e295f 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -6,7 +6,6 @@
6 6
7#include <linux/syscalls.h> 7#include <linux/syscalls.h>
8#include <linux/mm.h> 8#include <linux/mm.h>
9#include <linux/smp_lock.h>
10#include <linux/capability.h> 9#include <linux/capability.h>
11#include <linux/file.h> 10#include <linux/file.h>
12#include <linux/fs.h> 11#include <linux/fs.h>
@@ -87,7 +86,7 @@ int fiemap_fill_next_extent(struct fiemap_extent_info *fieinfo, u64 logical,
87 u64 phys, u64 len, u32 flags) 86 u64 phys, u64 len, u32 flags)
88{ 87{
89 struct fiemap_extent extent; 88 struct fiemap_extent extent;
90 struct fiemap_extent *dest = fieinfo->fi_extents_start; 89 struct fiemap_extent __user *dest = fieinfo->fi_extents_start;
91 90
92 /* only count the extents */ 91 /* only count the extents */
93 if (fieinfo->fi_extents_max == 0) { 92 if (fieinfo->fi_extents_max == 0) {
@@ -174,6 +173,7 @@ static int fiemap_check_ranges(struct super_block *sb,
174static int ioctl_fiemap(struct file *filp, unsigned long arg) 173static int ioctl_fiemap(struct file *filp, unsigned long arg)
175{ 174{
176 struct fiemap fiemap; 175 struct fiemap fiemap;
176 struct fiemap __user *ufiemap = (struct fiemap __user *) arg;
177 struct fiemap_extent_info fieinfo = { 0, }; 177 struct fiemap_extent_info fieinfo = { 0, };
178 struct inode *inode = filp->f_path.dentry->d_inode; 178 struct inode *inode = filp->f_path.dentry->d_inode;
179 struct super_block *sb = inode->i_sb; 179 struct super_block *sb = inode->i_sb;
@@ -183,8 +183,7 @@ static int ioctl_fiemap(struct file *filp, unsigned long arg)
183 if (!inode->i_op->fiemap) 183 if (!inode->i_op->fiemap)
184 return -EOPNOTSUPP; 184 return -EOPNOTSUPP;
185 185
186 if (copy_from_user(&fiemap, (struct fiemap __user *)arg, 186 if (copy_from_user(&fiemap, ufiemap, sizeof(fiemap)))
187 sizeof(struct fiemap)))
188 return -EFAULT; 187 return -EFAULT;
189 188
190 if (fiemap.fm_extent_count > FIEMAP_MAX_EXTENTS) 189 if (fiemap.fm_extent_count > FIEMAP_MAX_EXTENTS)
@@ -197,7 +196,7 @@ static int ioctl_fiemap(struct file *filp, unsigned long arg)
197 196
198 fieinfo.fi_flags = fiemap.fm_flags; 197 fieinfo.fi_flags = fiemap.fm_flags;
199 fieinfo.fi_extents_max = fiemap.fm_extent_count; 198 fieinfo.fi_extents_max = fiemap.fm_extent_count;
200 fieinfo.fi_extents_start = (struct fiemap_extent *)(arg + sizeof(fiemap)); 199 fieinfo.fi_extents_start = ufiemap->fm_extents;
201 200
202 if (fiemap.fm_extent_count != 0 && 201 if (fiemap.fm_extent_count != 0 &&
203 !access_ok(VERIFY_WRITE, fieinfo.fi_extents_start, 202 !access_ok(VERIFY_WRITE, fieinfo.fi_extents_start,
@@ -210,7 +209,7 @@ static int ioctl_fiemap(struct file *filp, unsigned long arg)
210 error = inode->i_op->fiemap(inode, &fieinfo, fiemap.fm_start, len); 209 error = inode->i_op->fiemap(inode, &fieinfo, fiemap.fm_start, len);
211 fiemap.fm_flags = fieinfo.fi_flags; 210 fiemap.fm_flags = fieinfo.fi_flags;
212 fiemap.fm_mapped_extents = fieinfo.fi_extents_mapped; 211 fiemap.fm_mapped_extents = fieinfo.fi_extents_mapped;
213 if (copy_to_user((char *)arg, &fiemap, sizeof(fiemap))) 212 if (copy_to_user(ufiemap, &fiemap, sizeof(fiemap)))
214 error = -EFAULT; 213 error = -EFAULT;
215 214
216 return error; 215 return error;
@@ -530,41 +529,6 @@ static int ioctl_fsthaw(struct file *filp)
530 return thaw_super(sb); 529 return thaw_super(sb);
531} 530}
532 531
533static int ioctl_fstrim(struct file *filp, void __user *argp)
534{
535 struct super_block *sb = filp->f_path.dentry->d_inode->i_sb;
536 struct fstrim_range range;
537 int ret = 0;
538
539 if (!capable(CAP_SYS_ADMIN))
540 return -EPERM;
541
542 /* If filesystem doesn't support trim feature, return. */
543 if (sb->s_op->trim_fs == NULL)
544 return -EOPNOTSUPP;
545
546 /* If a blockdevice-backed filesystem isn't specified, return EINVAL. */
547 if (sb->s_bdev == NULL)
548 return -EINVAL;
549
550 if (argp == NULL) {
551 range.start = 0;
552 range.len = ULLONG_MAX;
553 range.minlen = 0;
554 } else if (copy_from_user(&range, argp, sizeof(range)))
555 return -EFAULT;
556
557 ret = sb->s_op->trim_fs(sb, &range);
558 if (ret < 0)
559 return ret;
560
561 if ((argp != NULL) &&
562 (copy_to_user(argp, &range, sizeof(range))))
563 return -EFAULT;
564
565 return 0;
566}
567
568/* 532/*
569 * When you add any new common ioctls to the switches above and below 533 * When you add any new common ioctls to the switches above and below
570 * please update compat_sys_ioctl() too. 534 * please update compat_sys_ioctl() too.
@@ -615,10 +579,6 @@ int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd,
615 error = ioctl_fsthaw(filp); 579 error = ioctl_fsthaw(filp);
616 break; 580 break;
617 581
618 case FITRIM:
619 error = ioctl_fstrim(filp, argp);
620 break;
621
622 case FS_IOC_FIEMAP: 582 case FS_IOC_FIEMAP:
623 return ioctl_fiemap(filp, arg); 583 return ioctl_fiemap(filp, arg);
624 584
diff --git a/fs/ioprio.c b/fs/ioprio.c
index 748cfb92dcc..7da2a06508e 100644
--- a/fs/ioprio.c
+++ b/fs/ioprio.c
@@ -103,12 +103,7 @@ SYSCALL_DEFINE3(ioprio_set, int, which, int, who, int, ioprio)
103 } 103 }
104 104
105 ret = -ESRCH; 105 ret = -ESRCH;
106 /* 106 rcu_read_lock();
107 * We want IOPRIO_WHO_PGRP/IOPRIO_WHO_USER to be "atomic",
108 * so we can't use rcu_read_lock(). See re-copy of ->ioprio
109 * in copy_process().
110 */
111 read_lock(&tasklist_lock);
112 switch (which) { 107 switch (which) {
113 case IOPRIO_WHO_PROCESS: 108 case IOPRIO_WHO_PROCESS:
114 if (!who) 109 if (!who)
@@ -153,7 +148,7 @@ free_uid:
153 ret = -EINVAL; 148 ret = -EINVAL;
154 } 149 }
155 150
156 read_unlock(&tasklist_lock); 151 rcu_read_unlock();
157 return ret; 152 return ret;
158} 153}
159 154
@@ -197,7 +192,7 @@ SYSCALL_DEFINE2(ioprio_get, int, which, int, who)
197 int ret = -ESRCH; 192 int ret = -ESRCH;
198 int tmpio; 193 int tmpio;
199 194
200 read_lock(&tasklist_lock); 195 rcu_read_lock();
201 switch (which) { 196 switch (which) {
202 case IOPRIO_WHO_PROCESS: 197 case IOPRIO_WHO_PROCESS:
203 if (!who) 198 if (!who)
@@ -250,6 +245,6 @@ SYSCALL_DEFINE2(ioprio_get, int, which, int, who)
250 ret = -EINVAL; 245 ret = -EINVAL;
251 } 246 }
252 247
253 read_unlock(&tasklist_lock); 248 rcu_read_unlock();
254 return ret; 249 return ret;
255} 250}
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index bfdeb82a53b..a0f3833c0db 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -26,16 +26,32 @@
26 26
27#define BEQUIET 27#define BEQUIET
28 28
29static int isofs_hashi(struct dentry *parent, struct qstr *qstr); 29static int isofs_hashi(const struct dentry *parent, const struct inode *inode,
30static int isofs_hash(struct dentry *parent, struct qstr *qstr); 30 struct qstr *qstr);
31static int isofs_dentry_cmpi(struct dentry *dentry, struct qstr *a, struct qstr *b); 31static int isofs_hash(const struct dentry *parent, const struct inode *inode,
32static int isofs_dentry_cmp(struct dentry *dentry, struct qstr *a, struct qstr *b); 32 struct qstr *qstr);
33static int isofs_dentry_cmpi(const struct dentry *parent,
34 const struct inode *pinode,
35 const struct dentry *dentry, const struct inode *inode,
36 unsigned int len, const char *str, const struct qstr *name);
37static int isofs_dentry_cmp(const struct dentry *parent,
38 const struct inode *pinode,
39 const struct dentry *dentry, const struct inode *inode,
40 unsigned int len, const char *str, const struct qstr *name);
33 41
34#ifdef CONFIG_JOLIET 42#ifdef CONFIG_JOLIET
35static int isofs_hashi_ms(struct dentry *parent, struct qstr *qstr); 43static int isofs_hashi_ms(const struct dentry *parent, const struct inode *inode,
36static int isofs_hash_ms(struct dentry *parent, struct qstr *qstr); 44 struct qstr *qstr);
37static int isofs_dentry_cmpi_ms(struct dentry *dentry, struct qstr *a, struct qstr *b); 45static int isofs_hash_ms(const struct dentry *parent, const struct inode *inode,
38static int isofs_dentry_cmp_ms(struct dentry *dentry, struct qstr *a, struct qstr *b); 46 struct qstr *qstr);
47static int isofs_dentry_cmpi_ms(const struct dentry *parent,
48 const struct inode *pinode,
49 const struct dentry *dentry, const struct inode *inode,
50 unsigned int len, const char *str, const struct qstr *name);
51static int isofs_dentry_cmp_ms(const struct dentry *parent,
52 const struct inode *pinode,
53 const struct dentry *dentry, const struct inode *inode,
54 unsigned int len, const char *str, const struct qstr *name);
39#endif 55#endif
40 56
41static void isofs_put_super(struct super_block *sb) 57static void isofs_put_super(struct super_block *sb)
@@ -65,11 +81,18 @@ static struct inode *isofs_alloc_inode(struct super_block *sb)
65 return &ei->vfs_inode; 81 return &ei->vfs_inode;
66} 82}
67 83
68static void isofs_destroy_inode(struct inode *inode) 84static void isofs_i_callback(struct rcu_head *head)
69{ 85{
86 struct inode *inode = container_of(head, struct inode, i_rcu);
87 INIT_LIST_HEAD(&inode->i_dentry);
70 kmem_cache_free(isofs_inode_cachep, ISOFS_I(inode)); 88 kmem_cache_free(isofs_inode_cachep, ISOFS_I(inode));
71} 89}
72 90
91static void isofs_destroy_inode(struct inode *inode)
92{
93 call_rcu(&inode->i_rcu, isofs_i_callback);
94}
95
73static void init_once(void *foo) 96static void init_once(void *foo)
74{ 97{
75 struct iso_inode_info *ei = foo; 98 struct iso_inode_info *ei = foo;
@@ -160,7 +183,7 @@ struct iso9660_options{
160 * Compute the hash for the isofs name corresponding to the dentry. 183 * Compute the hash for the isofs name corresponding to the dentry.
161 */ 184 */
162static int 185static int
163isofs_hash_common(struct dentry *dentry, struct qstr *qstr, int ms) 186isofs_hash_common(const struct dentry *dentry, struct qstr *qstr, int ms)
164{ 187{
165 const char *name; 188 const char *name;
166 int len; 189 int len;
@@ -181,7 +204,7 @@ isofs_hash_common(struct dentry *dentry, struct qstr *qstr, int ms)
181 * Compute the hash for the isofs name corresponding to the dentry. 204 * Compute the hash for the isofs name corresponding to the dentry.
182 */ 205 */
183static int 206static int
184isofs_hashi_common(struct dentry *dentry, struct qstr *qstr, int ms) 207isofs_hashi_common(const struct dentry *dentry, struct qstr *qstr, int ms)
185{ 208{
186 const char *name; 209 const char *name;
187 int len; 210 int len;
@@ -206,100 +229,94 @@ isofs_hashi_common(struct dentry *dentry, struct qstr *qstr, int ms)
206} 229}
207 230
208/* 231/*
209 * Case insensitive compare of two isofs names. 232 * Compare of two isofs names.
210 */ 233 */
211static int isofs_dentry_cmpi_common(struct dentry *dentry, struct qstr *a, 234static int isofs_dentry_cmp_common(
212 struct qstr *b, int ms) 235 unsigned int len, const char *str,
236 const struct qstr *name, int ms, int ci)
213{ 237{
214 int alen, blen; 238 int alen, blen;
215 239
216 /* A filename cannot end in '.' or we treat it like it has none */ 240 /* A filename cannot end in '.' or we treat it like it has none */
217 alen = a->len; 241 alen = name->len;
218 blen = b->len; 242 blen = len;
219 if (ms) { 243 if (ms) {
220 while (alen && a->name[alen-1] == '.') 244 while (alen && name->name[alen-1] == '.')
221 alen--; 245 alen--;
222 while (blen && b->name[blen-1] == '.') 246 while (blen && str[blen-1] == '.')
223 blen--; 247 blen--;
224 } 248 }
225 if (alen == blen) { 249 if (alen == blen) {
226 if (strnicmp(a->name, b->name, alen) == 0) 250 if (ci) {
227 return 0; 251 if (strnicmp(name->name, str, alen) == 0)
228 } 252 return 0;
229 return 1; 253 } else {
230} 254 if (strncmp(name->name, str, alen) == 0)
231 255 return 0;
232/* 256 }
233 * Case sensitive compare of two isofs names.
234 */
235static int isofs_dentry_cmp_common(struct dentry *dentry, struct qstr *a,
236 struct qstr *b, int ms)
237{
238 int alen, blen;
239
240 /* A filename cannot end in '.' or we treat it like it has none */
241 alen = a->len;
242 blen = b->len;
243 if (ms) {
244 while (alen && a->name[alen-1] == '.')
245 alen--;
246 while (blen && b->name[blen-1] == '.')
247 blen--;
248 }
249 if (alen == blen) {
250 if (strncmp(a->name, b->name, alen) == 0)
251 return 0;
252 } 257 }
253 return 1; 258 return 1;
254} 259}
255 260
256static int 261static int
257isofs_hash(struct dentry *dentry, struct qstr *qstr) 262isofs_hash(const struct dentry *dentry, const struct inode *inode,
263 struct qstr *qstr)
258{ 264{
259 return isofs_hash_common(dentry, qstr, 0); 265 return isofs_hash_common(dentry, qstr, 0);
260} 266}
261 267
262static int 268static int
263isofs_hashi(struct dentry *dentry, struct qstr *qstr) 269isofs_hashi(const struct dentry *dentry, const struct inode *inode,
270 struct qstr *qstr)
264{ 271{
265 return isofs_hashi_common(dentry, qstr, 0); 272 return isofs_hashi_common(dentry, qstr, 0);
266} 273}
267 274
268static int 275static int
269isofs_dentry_cmp(struct dentry *dentry,struct qstr *a,struct qstr *b) 276isofs_dentry_cmp(const struct dentry *parent, const struct inode *pinode,
277 const struct dentry *dentry, const struct inode *inode,
278 unsigned int len, const char *str, const struct qstr *name)
270{ 279{
271 return isofs_dentry_cmp_common(dentry, a, b, 0); 280 return isofs_dentry_cmp_common(len, str, name, 0, 0);
272} 281}
273 282
274static int 283static int
275isofs_dentry_cmpi(struct dentry *dentry,struct qstr *a,struct qstr *b) 284isofs_dentry_cmpi(const struct dentry *parent, const struct inode *pinode,
285 const struct dentry *dentry, const struct inode *inode,
286 unsigned int len, const char *str, const struct qstr *name)
276{ 287{
277 return isofs_dentry_cmpi_common(dentry, a, b, 0); 288 return isofs_dentry_cmp_common(len, str, name, 0, 1);
278} 289}
279 290
280#ifdef CONFIG_JOLIET 291#ifdef CONFIG_JOLIET
281static int 292static int
282isofs_hash_ms(struct dentry *dentry, struct qstr *qstr) 293isofs_hash_ms(const struct dentry *dentry, const struct inode *inode,
294 struct qstr *qstr)
283{ 295{
284 return isofs_hash_common(dentry, qstr, 1); 296 return isofs_hash_common(dentry, qstr, 1);
285} 297}
286 298
287static int 299static int
288isofs_hashi_ms(struct dentry *dentry, struct qstr *qstr) 300isofs_hashi_ms(const struct dentry *dentry, const struct inode *inode,
301 struct qstr *qstr)
289{ 302{
290 return isofs_hashi_common(dentry, qstr, 1); 303 return isofs_hashi_common(dentry, qstr, 1);
291} 304}
292 305
293static int 306static int
294isofs_dentry_cmp_ms(struct dentry *dentry,struct qstr *a,struct qstr *b) 307isofs_dentry_cmp_ms(const struct dentry *parent, const struct inode *pinode,
308 const struct dentry *dentry, const struct inode *inode,
309 unsigned int len, const char *str, const struct qstr *name)
295{ 310{
296 return isofs_dentry_cmp_common(dentry, a, b, 1); 311 return isofs_dentry_cmp_common(len, str, name, 1, 0);
297} 312}
298 313
299static int 314static int
300isofs_dentry_cmpi_ms(struct dentry *dentry,struct qstr *a,struct qstr *b) 315isofs_dentry_cmpi_ms(const struct dentry *parent, const struct inode *pinode,
316 const struct dentry *dentry, const struct inode *inode,
317 unsigned int len, const char *str, const struct qstr *name)
301{ 318{
302 return isofs_dentry_cmpi_common(dentry, a, b, 1); 319 return isofs_dentry_cmp_common(len, str, name, 1, 1);
303} 320}
304#endif 321#endif
305 322
@@ -922,17 +939,18 @@ root_found:
922 goto out_iput; 939 goto out_iput;
923 } 940 }
924 941
925 /* get the root dentry */
926 s->s_root = d_alloc_root(inode);
927 if (!(s->s_root))
928 goto out_no_root;
929
930 table = 0; 942 table = 0;
931 if (joliet_level) 943 if (joliet_level)
932 table += 2; 944 table += 2;
933 if (opt.check == 'r') 945 if (opt.check == 'r')
934 table++; 946 table++;
935 s->s_root->d_op = &isofs_dentry_ops[table]; 947
948 s->s_d_op = &isofs_dentry_ops[table];
949
950 /* get the root dentry */
951 s->s_root = d_alloc_root(inode);
952 if (!(s->s_root))
953 goto out_no_root;
936 954
937 kfree(opt.iocharset); 955 kfree(opt.iocharset);
938 956
diff --git a/fs/isofs/namei.c b/fs/isofs/namei.c
index 0d23abfd428..4fb3e8074fd 100644
--- a/fs/isofs/namei.c
+++ b/fs/isofs/namei.c
@@ -37,7 +37,8 @@ isofs_cmp(struct dentry *dentry, const char *compare, int dlen)
37 37
38 qstr.name = compare; 38 qstr.name = compare;
39 qstr.len = dlen; 39 qstr.len = dlen;
40 return dentry->d_op->d_compare(dentry, &dentry->d_name, &qstr); 40 return dentry->d_op->d_compare(NULL, NULL, NULL, NULL,
41 dentry->d_name.len, dentry->d_name.name, &qstr);
41} 42}
42 43
43/* 44/*
@@ -171,8 +172,6 @@ struct dentry *isofs_lookup(struct inode *dir, struct dentry *dentry, struct nam
171 struct inode *inode; 172 struct inode *inode;
172 struct page *page; 173 struct page *page;
173 174
174 dentry->d_op = dir->i_sb->s_root->d_op;
175
176 page = alloc_page(GFP_USER); 175 page = alloc_page(GFP_USER);
177 if (!page) 176 if (!page)
178 return ERR_PTR(-ENOMEM); 177 return ERR_PTR(-ENOMEM);
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index 846a3f31411..5b2e4c30a2a 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -207,7 +207,7 @@ repeat_locked:
207 * the committing transaction. Really, we only need to give it 207 * the committing transaction. Really, we only need to give it
208 * committing_transaction->t_outstanding_credits plus "enough" for 208 * committing_transaction->t_outstanding_credits plus "enough" for
209 * the log control blocks. 209 * the log control blocks.
210 * Also, this test is inconsitent with the matching one in 210 * Also, this test is inconsistent with the matching one in
211 * journal_extend(). 211 * journal_extend().
212 */ 212 */
213 if (__log_space_left(journal) < jbd_space_needed(journal)) { 213 if (__log_space_left(journal) < jbd_space_needed(journal)) {
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index c590d155c09..9e4686900f1 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -43,6 +43,7 @@
43#include <linux/vmalloc.h> 43#include <linux/vmalloc.h>
44#include <linux/backing-dev.h> 44#include <linux/backing-dev.h>
45#include <linux/bitops.h> 45#include <linux/bitops.h>
46#include <linux/ratelimit.h>
46 47
47#define CREATE_TRACE_POINTS 48#define CREATE_TRACE_POINTS
48#include <trace/events/jbd2.h> 49#include <trace/events/jbd2.h>
@@ -93,6 +94,7 @@ EXPORT_SYMBOL(jbd2_journal_file_inode);
93EXPORT_SYMBOL(jbd2_journal_init_jbd_inode); 94EXPORT_SYMBOL(jbd2_journal_init_jbd_inode);
94EXPORT_SYMBOL(jbd2_journal_release_jbd_inode); 95EXPORT_SYMBOL(jbd2_journal_release_jbd_inode);
95EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate); 96EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate);
97EXPORT_SYMBOL(jbd2_inode_cache);
96 98
97static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *); 99static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *);
98static void __journal_abort_soft (journal_t *journal, int errno); 100static void __journal_abort_soft (journal_t *journal, int errno);
@@ -827,7 +829,7 @@ static journal_t * journal_init_common (void)
827 829
828 journal = kzalloc(sizeof(*journal), GFP_KERNEL); 830 journal = kzalloc(sizeof(*journal), GFP_KERNEL);
829 if (!journal) 831 if (!journal)
830 goto fail; 832 return NULL;
831 833
832 init_waitqueue_head(&journal->j_wait_transaction_locked); 834 init_waitqueue_head(&journal->j_wait_transaction_locked);
833 init_waitqueue_head(&journal->j_wait_logspace); 835 init_waitqueue_head(&journal->j_wait_logspace);
@@ -852,14 +854,12 @@ static journal_t * journal_init_common (void)
852 err = jbd2_journal_init_revoke(journal, JOURNAL_REVOKE_DEFAULT_HASH); 854 err = jbd2_journal_init_revoke(journal, JOURNAL_REVOKE_DEFAULT_HASH);
853 if (err) { 855 if (err) {
854 kfree(journal); 856 kfree(journal);
855 goto fail; 857 return NULL;
856 } 858 }
857 859
858 spin_lock_init(&journal->j_history_lock); 860 spin_lock_init(&journal->j_history_lock);
859 861
860 return journal; 862 return journal;
861fail:
862 return NULL;
863} 863}
864 864
865/* jbd2_journal_init_dev and jbd2_journal_init_inode: 865/* jbd2_journal_init_dev and jbd2_journal_init_inode:
@@ -899,6 +899,14 @@ journal_t * jbd2_journal_init_dev(struct block_device *bdev,
899 899
900 /* journal descriptor can store up to n blocks -bzzz */ 900 /* journal descriptor can store up to n blocks -bzzz */
901 journal->j_blocksize = blocksize; 901 journal->j_blocksize = blocksize;
902 journal->j_dev = bdev;
903 journal->j_fs_dev = fs_dev;
904 journal->j_blk_offset = start;
905 journal->j_maxlen = len;
906 bdevname(journal->j_dev, journal->j_devname);
907 p = journal->j_devname;
908 while ((p = strchr(p, '/')))
909 *p = '!';
902 jbd2_stats_proc_init(journal); 910 jbd2_stats_proc_init(journal);
903 n = journal->j_blocksize / sizeof(journal_block_tag_t); 911 n = journal->j_blocksize / sizeof(journal_block_tag_t);
904 journal->j_wbufsize = n; 912 journal->j_wbufsize = n;
@@ -908,14 +916,6 @@ journal_t * jbd2_journal_init_dev(struct block_device *bdev,
908 __func__); 916 __func__);
909 goto out_err; 917 goto out_err;
910 } 918 }
911 journal->j_dev = bdev;
912 journal->j_fs_dev = fs_dev;
913 journal->j_blk_offset = start;
914 journal->j_maxlen = len;
915 bdevname(journal->j_dev, journal->j_devname);
916 p = journal->j_devname;
917 while ((p = strchr(p, '/')))
918 *p = '!';
919 919
920 bh = __getblk(journal->j_dev, start, journal->j_blocksize); 920 bh = __getblk(journal->j_dev, start, journal->j_blocksize);
921 if (!bh) { 921 if (!bh) {
@@ -1982,7 +1982,6 @@ static void jbd2_journal_destroy_jbd2_journal_head_cache(void)
1982static struct journal_head *journal_alloc_journal_head(void) 1982static struct journal_head *journal_alloc_journal_head(void)
1983{ 1983{
1984 struct journal_head *ret; 1984 struct journal_head *ret;
1985 static unsigned long last_warning;
1986 1985
1987#ifdef CONFIG_JBD2_DEBUG 1986#ifdef CONFIG_JBD2_DEBUG
1988 atomic_inc(&nr_journal_heads); 1987 atomic_inc(&nr_journal_heads);
@@ -1990,11 +1989,7 @@ static struct journal_head *journal_alloc_journal_head(void)
1990 ret = kmem_cache_alloc(jbd2_journal_head_cache, GFP_NOFS); 1989 ret = kmem_cache_alloc(jbd2_journal_head_cache, GFP_NOFS);
1991 if (!ret) { 1990 if (!ret) {
1992 jbd_debug(1, "out of memory for journal_head\n"); 1991 jbd_debug(1, "out of memory for journal_head\n");
1993 if (time_after(jiffies, last_warning + 5*HZ)) { 1992 pr_notice_ratelimited("ENOMEM in %s, retrying.\n", __func__);
1994 printk(KERN_NOTICE "ENOMEM in %s, retrying.\n",
1995 __func__);
1996 last_warning = jiffies;
1997 }
1998 while (!ret) { 1993 while (!ret) {
1999 yield(); 1994 yield();
2000 ret = kmem_cache_alloc(jbd2_journal_head_cache, GFP_NOFS); 1995 ret = kmem_cache_alloc(jbd2_journal_head_cache, GFP_NOFS);
@@ -2292,17 +2287,19 @@ static void __exit jbd2_remove_jbd_stats_proc_entry(void)
2292 2287
2293#endif 2288#endif
2294 2289
2295struct kmem_cache *jbd2_handle_cache; 2290struct kmem_cache *jbd2_handle_cache, *jbd2_inode_cache;
2296 2291
2297static int __init journal_init_handle_cache(void) 2292static int __init journal_init_handle_cache(void)
2298{ 2293{
2299 jbd2_handle_cache = kmem_cache_create("jbd2_journal_handle", 2294 jbd2_handle_cache = KMEM_CACHE(jbd2_journal_handle, SLAB_TEMPORARY);
2300 sizeof(handle_t),
2301 0, /* offset */
2302 SLAB_TEMPORARY, /* flags */
2303 NULL); /* ctor */
2304 if (jbd2_handle_cache == NULL) { 2295 if (jbd2_handle_cache == NULL) {
2305 printk(KERN_EMERG "JBD: failed to create handle cache\n"); 2296 printk(KERN_EMERG "JBD2: failed to create handle cache\n");
2297 return -ENOMEM;
2298 }
2299 jbd2_inode_cache = KMEM_CACHE(jbd2_inode, 0);
2300 if (jbd2_inode_cache == NULL) {
2301 printk(KERN_EMERG "JBD2: failed to create inode cache\n");
2302 kmem_cache_destroy(jbd2_handle_cache);
2306 return -ENOMEM; 2303 return -ENOMEM;
2307 } 2304 }
2308 return 0; 2305 return 0;
@@ -2312,6 +2309,9 @@ static void jbd2_journal_destroy_handle_cache(void)
2312{ 2309{
2313 if (jbd2_handle_cache) 2310 if (jbd2_handle_cache)
2314 kmem_cache_destroy(jbd2_handle_cache); 2311 kmem_cache_destroy(jbd2_handle_cache);
2312 if (jbd2_inode_cache)
2313 kmem_cache_destroy(jbd2_inode_cache);
2314
2315} 2315}
2316 2316
2317/* 2317/*
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index 2bc4d5f116f..1cad869494f 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -299,10 +299,10 @@ int jbd2_journal_skip_recovery(journal_t *journal)
299#ifdef CONFIG_JBD2_DEBUG 299#ifdef CONFIG_JBD2_DEBUG
300 int dropped = info.end_transaction - 300 int dropped = info.end_transaction -
301 be32_to_cpu(journal->j_superblock->s_sequence); 301 be32_to_cpu(journal->j_superblock->s_sequence);
302#endif
303 jbd_debug(1, 302 jbd_debug(1,
304 "JBD: ignoring %d transaction%s from the journal.\n", 303 "JBD: ignoring %d transaction%s from the journal.\n",
305 dropped, (dropped == 1) ? "" : "s"); 304 dropped, (dropped == 1) ? "" : "s");
305#endif
306 journal->j_transaction_sequence = ++info.end_transaction; 306 journal->j_transaction_sequence = ++info.end_transaction;
307 } 307 }
308 308
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 6bf0a242613..faad2bd787c 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -251,7 +251,7 @@ repeat:
251 * the committing transaction. Really, we only need to give it 251 * the committing transaction. Really, we only need to give it
252 * committing_transaction->t_outstanding_credits plus "enough" for 252 * committing_transaction->t_outstanding_credits plus "enough" for
253 * the log control blocks. 253 * the log control blocks.
254 * Also, this test is inconsitent with the matching one in 254 * Also, this test is inconsistent with the matching one in
255 * jbd2_journal_extend(). 255 * jbd2_journal_extend().
256 */ 256 */
257 if (__jbd2_log_space_left(journal) < jbd_space_needed(journal)) { 257 if (__jbd2_log_space_left(journal) < jbd_space_needed(journal)) {
@@ -340,9 +340,7 @@ handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int gfp_mask)
340 jbd2_free_handle(handle); 340 jbd2_free_handle(handle);
341 current->journal_info = NULL; 341 current->journal_info = NULL;
342 handle = ERR_PTR(err); 342 handle = ERR_PTR(err);
343 goto out;
344 } 343 }
345out:
346 return handle; 344 return handle;
347} 345}
348EXPORT_SYMBOL(jbd2__journal_start); 346EXPORT_SYMBOL(jbd2__journal_start);
@@ -589,7 +587,7 @@ do_get_write_access(handle_t *handle, struct journal_head *jh,
589 transaction = handle->h_transaction; 587 transaction = handle->h_transaction;
590 journal = transaction->t_journal; 588 journal = transaction->t_journal;
591 589
592 jbd_debug(5, "buffer_head %p, force_copy %d\n", jh, force_copy); 590 jbd_debug(5, "journal_head %p, force_copy %d\n", jh, force_copy);
593 591
594 JBUFFER_TRACE(jh, "entry"); 592 JBUFFER_TRACE(jh, "entry");
595repeat: 593repeat:
@@ -774,7 +772,7 @@ done:
774 J_EXPECT_JH(jh, buffer_uptodate(jh2bh(jh)), 772 J_EXPECT_JH(jh, buffer_uptodate(jh2bh(jh)),
775 "Possible IO failure.\n"); 773 "Possible IO failure.\n");
776 page = jh2bh(jh)->b_page; 774 page = jh2bh(jh)->b_page;
777 offset = ((unsigned long) jh2bh(jh)->b_data) & ~PAGE_MASK; 775 offset = offset_in_page(jh2bh(jh)->b_data);
778 source = kmap_atomic(page, KM_USER0); 776 source = kmap_atomic(page, KM_USER0);
779 /* Fire data frozen trigger just before we copy the data */ 777 /* Fire data frozen trigger just before we copy the data */
780 jbd2_buffer_frozen_trigger(jh, source + offset, 778 jbd2_buffer_frozen_trigger(jh, source + offset,
diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c
index 54a92fd02bb..95b79672150 100644
--- a/fs/jffs2/acl.c
+++ b/fs/jffs2/acl.c
@@ -259,11 +259,14 @@ static int jffs2_set_acl(struct inode *inode, int type, struct posix_acl *acl)
259 return rc; 259 return rc;
260} 260}
261 261
262int jffs2_check_acl(struct inode *inode, int mask) 262int jffs2_check_acl(struct inode *inode, int mask, unsigned int flags)
263{ 263{
264 struct posix_acl *acl; 264 struct posix_acl *acl;
265 int rc; 265 int rc;
266 266
267 if (flags & IPERM_FLAG_RCU)
268 return -ECHILD;
269
267 acl = jffs2_get_acl(inode, ACL_TYPE_ACCESS); 270 acl = jffs2_get_acl(inode, ACL_TYPE_ACCESS);
268 if (IS_ERR(acl)) 271 if (IS_ERR(acl))
269 return PTR_ERR(acl); 272 return PTR_ERR(acl);
diff --git a/fs/jffs2/acl.h b/fs/jffs2/acl.h
index 5e42de8d954..3119f59253d 100644
--- a/fs/jffs2/acl.h
+++ b/fs/jffs2/acl.h
@@ -26,7 +26,7 @@ struct jffs2_acl_header {
26 26
27#ifdef CONFIG_JFFS2_FS_POSIX_ACL 27#ifdef CONFIG_JFFS2_FS_POSIX_ACL
28 28
29extern int jffs2_check_acl(struct inode *, int); 29extern int jffs2_check_acl(struct inode *, int, unsigned int);
30extern int jffs2_acl_chmod(struct inode *); 30extern int jffs2_acl_chmod(struct inode *);
31extern int jffs2_init_acl_pre(struct inode *, struct inode *, int *); 31extern int jffs2_init_acl_pre(struct inode *, struct inode *, int *);
32extern int jffs2_init_acl_post(struct inode *); 32extern int jffs2_init_acl_post(struct inode *);
diff --git a/fs/jffs2/build.c b/fs/jffs2/build.c
index 85c6be2db02..3005ec4520a 100644
--- a/fs/jffs2/build.c
+++ b/fs/jffs2/build.c
@@ -336,14 +336,13 @@ int jffs2_do_mount_fs(struct jffs2_sb_info *c)
336 size = sizeof(struct jffs2_eraseblock) * c->nr_blocks; 336 size = sizeof(struct jffs2_eraseblock) * c->nr_blocks;
337#ifndef __ECOS 337#ifndef __ECOS
338 if (jffs2_blocks_use_vmalloc(c)) 338 if (jffs2_blocks_use_vmalloc(c))
339 c->blocks = vmalloc(size); 339 c->blocks = vzalloc(size);
340 else 340 else
341#endif 341#endif
342 c->blocks = kmalloc(size, GFP_KERNEL); 342 c->blocks = kzalloc(size, GFP_KERNEL);
343 if (!c->blocks) 343 if (!c->blocks)
344 return -ENOMEM; 344 return -ENOMEM;
345 345
346 memset(c->blocks, 0, size);
347 for (i=0; i<c->nr_blocks; i++) { 346 for (i=0; i<c->nr_blocks; i++) {
348 INIT_LIST_HEAD(&c->blocks[i].list); 347 INIT_LIST_HEAD(&c->blocks[i].list);
349 c->blocks[i].offset = i * c->sector_size; 348 c->blocks[i].offset = i * c->sector_size;
diff --git a/fs/jffs2/jffs2_fs_sb.h b/fs/jffs2/jffs2_fs_sb.h
index f864005de64..0bc6a6c80a5 100644
--- a/fs/jffs2/jffs2_fs_sb.h
+++ b/fs/jffs2/jffs2_fs_sb.h
@@ -144,4 +144,4 @@ struct jffs2_sb_info {
144 void *os_priv; 144 void *os_priv;
145}; 145};
146 146
147#endif /* _JFFS2_FB_SB */ 147#endif /* _JFFS2_FS_SB */
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index c86041b866a..853b8e30008 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -40,11 +40,18 @@ static struct inode *jffs2_alloc_inode(struct super_block *sb)
40 return &f->vfs_inode; 40 return &f->vfs_inode;
41} 41}
42 42
43static void jffs2_destroy_inode(struct inode *inode) 43static void jffs2_i_callback(struct rcu_head *head)
44{ 44{
45 struct inode *inode = container_of(head, struct inode, i_rcu);
46 INIT_LIST_HEAD(&inode->i_dentry);
45 kmem_cache_free(jffs2_inode_cachep, JFFS2_INODE_INFO(inode)); 47 kmem_cache_free(jffs2_inode_cachep, JFFS2_INODE_INFO(inode));
46} 48}
47 49
50static void jffs2_destroy_inode(struct inode *inode)
51{
52 call_rcu(&inode->i_rcu, jffs2_i_callback);
53}
54
48static void jffs2_i_init_once(void *foo) 55static void jffs2_i_init_once(void *foo)
49{ 56{
50 struct jffs2_inode_info *f = foo; 57 struct jffs2_inode_info *f = foo;
diff --git a/fs/jffs2/xattr.c b/fs/jffs2/xattr.c
index 9b572ca40a4..4f9cc048294 100644
--- a/fs/jffs2/xattr.c
+++ b/fs/jffs2/xattr.c
@@ -151,7 +151,7 @@ static int do_verify_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_dat
151 JFFS2_ERROR("node CRC failed at %#08x, read=%#08x, calc=%#08x\n", 151 JFFS2_ERROR("node CRC failed at %#08x, read=%#08x, calc=%#08x\n",
152 offset, je32_to_cpu(rx.hdr_crc), crc); 152 offset, je32_to_cpu(rx.hdr_crc), crc);
153 xd->flags |= JFFS2_XFLAGS_INVALID; 153 xd->flags |= JFFS2_XFLAGS_INVALID;
154 return EIO; 154 return -EIO;
155 } 155 }
156 totlen = PAD(sizeof(rx) + rx.name_len + 1 + je16_to_cpu(rx.value_len)); 156 totlen = PAD(sizeof(rx) + rx.name_len + 1 + je16_to_cpu(rx.value_len));
157 if (je16_to_cpu(rx.magic) != JFFS2_MAGIC_BITMASK 157 if (je16_to_cpu(rx.magic) != JFFS2_MAGIC_BITMASK
@@ -167,7 +167,7 @@ static int do_verify_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_dat
167 je32_to_cpu(rx.xid), xd->xid, 167 je32_to_cpu(rx.xid), xd->xid,
168 je32_to_cpu(rx.version), xd->version); 168 je32_to_cpu(rx.version), xd->version);
169 xd->flags |= JFFS2_XFLAGS_INVALID; 169 xd->flags |= JFFS2_XFLAGS_INVALID;
170 return EIO; 170 return -EIO;
171 } 171 }
172 xd->xprefix = rx.xprefix; 172 xd->xprefix = rx.xprefix;
173 xd->name_len = rx.name_len; 173 xd->name_len = rx.name_len;
@@ -230,7 +230,7 @@ static int do_load_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum
230 ref_offset(xd->node), xd->data_crc, crc); 230 ref_offset(xd->node), xd->data_crc, crc);
231 kfree(data); 231 kfree(data);
232 xd->flags |= JFFS2_XFLAGS_INVALID; 232 xd->flags |= JFFS2_XFLAGS_INVALID;
233 return EIO; 233 return -EIO;
234 } 234 }
235 235
236 xd->flags |= JFFS2_XFLAGS_HOT; 236 xd->flags |= JFFS2_XFLAGS_HOT;
@@ -268,7 +268,7 @@ static int load_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *x
268 if (xd->xname) 268 if (xd->xname)
269 return 0; 269 return 0;
270 if (xd->flags & JFFS2_XFLAGS_INVALID) 270 if (xd->flags & JFFS2_XFLAGS_INVALID)
271 return EIO; 271 return -EIO;
272 if (unlikely(is_xattr_datum_unchecked(c, xd))) 272 if (unlikely(is_xattr_datum_unchecked(c, xd)))
273 rc = do_verify_xattr_datum(c, xd); 273 rc = do_verify_xattr_datum(c, xd);
274 if (!rc) 274 if (!rc)
@@ -460,7 +460,7 @@ static int verify_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref
460 if (crc != je32_to_cpu(rr.node_crc)) { 460 if (crc != je32_to_cpu(rr.node_crc)) {
461 JFFS2_ERROR("node CRC failed at %#08x, read=%#08x, calc=%#08x\n", 461 JFFS2_ERROR("node CRC failed at %#08x, read=%#08x, calc=%#08x\n",
462 offset, je32_to_cpu(rr.node_crc), crc); 462 offset, je32_to_cpu(rr.node_crc), crc);
463 return EIO; 463 return -EIO;
464 } 464 }
465 if (je16_to_cpu(rr.magic) != JFFS2_MAGIC_BITMASK 465 if (je16_to_cpu(rr.magic) != JFFS2_MAGIC_BITMASK
466 || je16_to_cpu(rr.nodetype) != JFFS2_NODETYPE_XREF 466 || je16_to_cpu(rr.nodetype) != JFFS2_NODETYPE_XREF
@@ -470,7 +470,7 @@ static int verify_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref
470 offset, je16_to_cpu(rr.magic), JFFS2_MAGIC_BITMASK, 470 offset, je16_to_cpu(rr.magic), JFFS2_MAGIC_BITMASK,
471 je16_to_cpu(rr.nodetype), JFFS2_NODETYPE_XREF, 471 je16_to_cpu(rr.nodetype), JFFS2_NODETYPE_XREF,
472 je32_to_cpu(rr.totlen), PAD(sizeof(rr))); 472 je32_to_cpu(rr.totlen), PAD(sizeof(rr)));
473 return EIO; 473 return -EIO;
474 } 474 }
475 ref->ino = je32_to_cpu(rr.ino); 475 ref->ino = je32_to_cpu(rr.ino);
476 ref->xid = je32_to_cpu(rr.xid); 476 ref->xid = je32_to_cpu(rr.xid);
diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c
index 1057a4998e4..e5de9422fa3 100644
--- a/fs/jfs/acl.c
+++ b/fs/jfs/acl.c
@@ -114,10 +114,14 @@ out:
114 return rc; 114 return rc;
115} 115}
116 116
117int jfs_check_acl(struct inode *inode, int mask) 117int jfs_check_acl(struct inode *inode, int mask, unsigned int flags)
118{ 118{
119 struct posix_acl *acl = jfs_get_acl(inode, ACL_TYPE_ACCESS); 119 struct posix_acl *acl;
120
121 if (flags & IPERM_FLAG_RCU)
122 return -ECHILD;
120 123
124 acl = jfs_get_acl(inode, ACL_TYPE_ACCESS);
121 if (IS_ERR(acl)) 125 if (IS_ERR(acl))
122 return PTR_ERR(acl); 126 return PTR_ERR(acl);
123 if (acl) { 127 if (acl) {
diff --git a/fs/jfs/jfs_acl.h b/fs/jfs/jfs_acl.h
index 54e07559878..f9285c4900f 100644
--- a/fs/jfs/jfs_acl.h
+++ b/fs/jfs/jfs_acl.h
@@ -20,7 +20,7 @@
20 20
21#ifdef CONFIG_JFS_POSIX_ACL 21#ifdef CONFIG_JFS_POSIX_ACL
22 22
23int jfs_check_acl(struct inode *, int); 23int jfs_check_acl(struct inode *, int, unsigned int flags);
24int jfs_init_acl(tid_t, struct inode *, struct inode *); 24int jfs_init_acl(tid_t, struct inode *, struct inode *);
25int jfs_acl_chmod(struct inode *inode); 25int jfs_acl_chmod(struct inode *inode);
26 26
diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c
index e1b8493b9aa..278e3fb40b7 100644
--- a/fs/jfs/jfs_logmgr.c
+++ b/fs/jfs/jfs_logmgr.c
@@ -1120,16 +1120,13 @@ int lmLogOpen(struct super_block *sb)
1120 * file systems to log may have n-to-1 relationship; 1120 * file systems to log may have n-to-1 relationship;
1121 */ 1121 */
1122 1122
1123 bdev = open_by_devnum(sbi->logdev, FMODE_READ|FMODE_WRITE); 1123 bdev = blkdev_get_by_dev(sbi->logdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL,
1124 log);
1124 if (IS_ERR(bdev)) { 1125 if (IS_ERR(bdev)) {
1125 rc = -PTR_ERR(bdev); 1126 rc = -PTR_ERR(bdev);
1126 goto free; 1127 goto free;
1127 } 1128 }
1128 1129
1129 if ((rc = bd_claim(bdev, log))) {
1130 goto close;
1131 }
1132
1133 log->bdev = bdev; 1130 log->bdev = bdev;
1134 memcpy(log->uuid, sbi->loguuid, sizeof(log->uuid)); 1131 memcpy(log->uuid, sbi->loguuid, sizeof(log->uuid));
1135 1132
@@ -1137,7 +1134,7 @@ int lmLogOpen(struct super_block *sb)
1137 * initialize log: 1134 * initialize log:
1138 */ 1135 */
1139 if ((rc = lmLogInit(log))) 1136 if ((rc = lmLogInit(log)))
1140 goto unclaim; 1137 goto close;
1141 1138
1142 list_add(&log->journal_list, &jfs_external_logs); 1139 list_add(&log->journal_list, &jfs_external_logs);
1143 1140
@@ -1163,11 +1160,8 @@ journal_found:
1163 list_del(&log->journal_list); 1160 list_del(&log->journal_list);
1164 lbmLogShutdown(log); 1161 lbmLogShutdown(log);
1165 1162
1166 unclaim:
1167 bd_release(bdev);
1168
1169 close: /* close external log device */ 1163 close: /* close external log device */
1170 blkdev_put(bdev, FMODE_READ|FMODE_WRITE); 1164 blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
1171 1165
1172 free: /* free log descriptor */ 1166 free: /* free log descriptor */
1173 mutex_unlock(&jfs_log_mutex); 1167 mutex_unlock(&jfs_log_mutex);
@@ -1512,8 +1506,7 @@ int lmLogClose(struct super_block *sb)
1512 bdev = log->bdev; 1506 bdev = log->bdev;
1513 rc = lmLogShutdown(log); 1507 rc = lmLogShutdown(log);
1514 1508
1515 bd_release(bdev); 1509 blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
1516 blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
1517 1510
1518 kfree(log); 1511 kfree(log);
1519 1512
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index 231ca4af9bc..81ead850ddb 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -18,6 +18,7 @@
18 */ 18 */
19 19
20#include <linux/fs.h> 20#include <linux/fs.h>
21#include <linux/namei.h>
21#include <linux/ctype.h> 22#include <linux/ctype.h>
22#include <linux/quotaops.h> 23#include <linux/quotaops.h>
23#include <linux/exportfs.h> 24#include <linux/exportfs.h>
@@ -1464,9 +1465,6 @@ static struct dentry *jfs_lookup(struct inode *dip, struct dentry *dentry, struc
1464 1465
1465 jfs_info("jfs_lookup: name = %s", name); 1466 jfs_info("jfs_lookup: name = %s", name);
1466 1467
1467 if (JFS_SBI(dip->i_sb)->mntflag & JFS_OS2)
1468 dentry->d_op = &jfs_ci_dentry_operations;
1469
1470 if ((name[0] == '.') && (len == 1)) 1468 if ((name[0] == '.') && (len == 1))
1471 inum = dip->i_ino; 1469 inum = dip->i_ino;
1472 else if (strcmp(name, "..") == 0) 1470 else if (strcmp(name, "..") == 0)
@@ -1491,12 +1489,7 @@ static struct dentry *jfs_lookup(struct inode *dip, struct dentry *dentry, struc
1491 return ERR_CAST(ip); 1489 return ERR_CAST(ip);
1492 } 1490 }
1493 1491
1494 dentry = d_splice_alias(ip, dentry); 1492 return d_splice_alias(ip, dentry);
1495
1496 if (dentry && (JFS_SBI(dip->i_sb)->mntflag & JFS_OS2))
1497 dentry->d_op = &jfs_ci_dentry_operations;
1498
1499 return dentry;
1500} 1493}
1501 1494
1502static struct inode *jfs_nfs_get_inode(struct super_block *sb, 1495static struct inode *jfs_nfs_get_inode(struct super_block *sb,
@@ -1573,7 +1566,8 @@ const struct file_operations jfs_dir_operations = {
1573 .llseek = generic_file_llseek, 1566 .llseek = generic_file_llseek,
1574}; 1567};
1575 1568
1576static int jfs_ci_hash(struct dentry *dir, struct qstr *this) 1569static int jfs_ci_hash(const struct dentry *dir, const struct inode *inode,
1570 struct qstr *this)
1577{ 1571{
1578 unsigned long hash; 1572 unsigned long hash;
1579 int i; 1573 int i;
@@ -1586,32 +1580,63 @@ static int jfs_ci_hash(struct dentry *dir, struct qstr *this)
1586 return 0; 1580 return 0;
1587} 1581}
1588 1582
1589static int jfs_ci_compare(struct dentry *dir, struct qstr *a, struct qstr *b) 1583static int jfs_ci_compare(const struct dentry *parent,
1584 const struct inode *pinode,
1585 const struct dentry *dentry, const struct inode *inode,
1586 unsigned int len, const char *str, const struct qstr *name)
1590{ 1587{
1591 int i, result = 1; 1588 int i, result = 1;
1592 1589
1593 if (a->len != b->len) 1590 if (len != name->len)
1594 goto out; 1591 goto out;
1595 for (i=0; i < a->len; i++) { 1592 for (i=0; i < len; i++) {
1596 if (tolower(a->name[i]) != tolower(b->name[i])) 1593 if (tolower(str[i]) != tolower(name->name[i]))
1597 goto out; 1594 goto out;
1598 } 1595 }
1599 result = 0; 1596 result = 0;
1597out:
1598 return result;
1599}
1600 1600
1601static int jfs_ci_revalidate(struct dentry *dentry, struct nameidata *nd)
1602{
1603 if (nd->flags & LOOKUP_RCU)
1604 return -ECHILD;
1601 /* 1605 /*
1602 * We want creates to preserve case. A negative dentry, a, that 1606 * This is not negative dentry. Always valid.
1603 * has a different case than b may cause a new entry to be created 1607 *
1604 * with the wrong case. Since we can't tell if a comes from a negative 1608 * Note, rename() to existing directory entry will have ->d_inode,
1605 * dentry, we blindly replace it with b. This should be harmless if 1609 * and will use existing name which isn't specified name by user.
1606 * a is not a negative dentry. 1610 *
1611 * We may be able to drop this positive dentry here. But dropping
1612 * positive dentry isn't good idea. So it's unsupported like
1613 * rename("filename", "FILENAME") for now.
1607 */ 1614 */
1608 memcpy((unsigned char *)a->name, b->name, a->len); 1615 if (dentry->d_inode)
1609out: 1616 return 1;
1610 return result; 1617
1618 /*
1619 * This may be nfsd (or something), anyway, we can't see the
1620 * intent of this. So, since this can be for creation, drop it.
1621 */
1622 if (!nd)
1623 return 0;
1624
1625 /*
1626 * Drop the negative dentry, in order to make sure to use the
1627 * case sensitive name which is specified by user if this is
1628 * for creation.
1629 */
1630 if (!(nd->flags & (LOOKUP_CONTINUE | LOOKUP_PARENT))) {
1631 if (nd->flags & (LOOKUP_CREATE | LOOKUP_RENAME_TARGET))
1632 return 0;
1633 }
1634 return 1;
1611} 1635}
1612 1636
1613const struct dentry_operations jfs_ci_dentry_operations = 1637const struct dentry_operations jfs_ci_dentry_operations =
1614{ 1638{
1615 .d_hash = jfs_ci_hash, 1639 .d_hash = jfs_ci_hash,
1616 .d_compare = jfs_ci_compare, 1640 .d_compare = jfs_ci_compare,
1641 .d_revalidate = jfs_ci_revalidate,
1617}; 1642};
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 0669fc1cc3b..eeca48a031a 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -115,6 +115,14 @@ static struct inode *jfs_alloc_inode(struct super_block *sb)
115 return &jfs_inode->vfs_inode; 115 return &jfs_inode->vfs_inode;
116} 116}
117 117
118static void jfs_i_callback(struct rcu_head *head)
119{
120 struct inode *inode = container_of(head, struct inode, i_rcu);
121 struct jfs_inode_info *ji = JFS_IP(inode);
122 INIT_LIST_HEAD(&inode->i_dentry);
123 kmem_cache_free(jfs_inode_cachep, ji);
124}
125
118static void jfs_destroy_inode(struct inode *inode) 126static void jfs_destroy_inode(struct inode *inode)
119{ 127{
120 struct jfs_inode_info *ji = JFS_IP(inode); 128 struct jfs_inode_info *ji = JFS_IP(inode);
@@ -128,7 +136,7 @@ static void jfs_destroy_inode(struct inode *inode)
128 ji->active_ag = -1; 136 ji->active_ag = -1;
129 } 137 }
130 spin_unlock_irq(&ji->ag_lock); 138 spin_unlock_irq(&ji->ag_lock);
131 kmem_cache_free(jfs_inode_cachep, ji); 139 call_rcu(&inode->i_rcu, jfs_i_callback);
132} 140}
133 141
134static int jfs_statfs(struct dentry *dentry, struct kstatfs *buf) 142static int jfs_statfs(struct dentry *dentry, struct kstatfs *buf)
@@ -507,6 +515,9 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent)
507 515
508 sb->s_magic = JFS_SUPER_MAGIC; 516 sb->s_magic = JFS_SUPER_MAGIC;
509 517
518 if (sbi->mntflag & JFS_OS2)
519 sb->s_d_op = &jfs_ci_dentry_operations;
520
510 inode = jfs_iget(sb, ROOT_I); 521 inode = jfs_iget(sb, ROOT_I);
511 if (IS_ERR(inode)) { 522 if (IS_ERR(inode)) {
512 ret = PTR_ERR(inode); 523 ret = PTR_ERR(inode);
@@ -516,9 +527,6 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent)
516 if (!sb->s_root) 527 if (!sb->s_root)
517 goto out_no_root; 528 goto out_no_root;
518 529
519 if (sbi->mntflag & JFS_OS2)
520 sb->s_root->d_op = &jfs_ci_dentry_operations;
521
522 /* logical blocks are represented by 40 bits in pxd_t, etc. */ 530 /* logical blocks are represented by 40 bits in pxd_t, etc. */
523 sb->s_maxbytes = ((u64) sb->s_blocksize) << 40; 531 sb->s_maxbytes = ((u64) sb->s_blocksize) << 40;
524#if BITS_PER_LONG == 32 532#if BITS_PER_LONG == 32
diff --git a/fs/libfs.c b/fs/libfs.c
index a3accdf528a..c88eab55aec 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -16,6 +16,11 @@
16 16
17#include <asm/uaccess.h> 17#include <asm/uaccess.h>
18 18
19static inline int simple_positive(struct dentry *dentry)
20{
21 return dentry->d_inode && !d_unhashed(dentry);
22}
23
19int simple_getattr(struct vfsmount *mnt, struct dentry *dentry, 24int simple_getattr(struct vfsmount *mnt, struct dentry *dentry,
20 struct kstat *stat) 25 struct kstat *stat)
21{ 26{
@@ -37,7 +42,7 @@ int simple_statfs(struct dentry *dentry, struct kstatfs *buf)
37 * Retaining negative dentries for an in-memory filesystem just wastes 42 * Retaining negative dentries for an in-memory filesystem just wastes
38 * memory and lookup time: arrange for them to be deleted immediately. 43 * memory and lookup time: arrange for them to be deleted immediately.
39 */ 44 */
40static int simple_delete_dentry(struct dentry *dentry) 45static int simple_delete_dentry(const struct dentry *dentry)
41{ 46{
42 return 1; 47 return 1;
43} 48}
@@ -54,7 +59,7 @@ struct dentry *simple_lookup(struct inode *dir, struct dentry *dentry, struct na
54 59
55 if (dentry->d_name.len > NAME_MAX) 60 if (dentry->d_name.len > NAME_MAX)
56 return ERR_PTR(-ENAMETOOLONG); 61 return ERR_PTR(-ENAMETOOLONG);
57 dentry->d_op = &simple_dentry_operations; 62 d_set_d_op(dentry, &simple_dentry_operations);
58 d_add(dentry, NULL); 63 d_add(dentry, NULL);
59 return NULL; 64 return NULL;
60} 65}
@@ -76,7 +81,8 @@ int dcache_dir_close(struct inode *inode, struct file *file)
76 81
77loff_t dcache_dir_lseek(struct file *file, loff_t offset, int origin) 82loff_t dcache_dir_lseek(struct file *file, loff_t offset, int origin)
78{ 83{
79 mutex_lock(&file->f_path.dentry->d_inode->i_mutex); 84 struct dentry *dentry = file->f_path.dentry;
85 mutex_lock(&dentry->d_inode->i_mutex);
80 switch (origin) { 86 switch (origin) {
81 case 1: 87 case 1:
82 offset += file->f_pos; 88 offset += file->f_pos;
@@ -84,7 +90,7 @@ loff_t dcache_dir_lseek(struct file *file, loff_t offset, int origin)
84 if (offset >= 0) 90 if (offset >= 0)
85 break; 91 break;
86 default: 92 default:
87 mutex_unlock(&file->f_path.dentry->d_inode->i_mutex); 93 mutex_unlock(&dentry->d_inode->i_mutex);
88 return -EINVAL; 94 return -EINVAL;
89 } 95 }
90 if (offset != file->f_pos) { 96 if (offset != file->f_pos) {
@@ -94,21 +100,24 @@ loff_t dcache_dir_lseek(struct file *file, loff_t offset, int origin)
94 struct dentry *cursor = file->private_data; 100 struct dentry *cursor = file->private_data;
95 loff_t n = file->f_pos - 2; 101 loff_t n = file->f_pos - 2;
96 102
97 spin_lock(&dcache_lock); 103 spin_lock(&dentry->d_lock);
104 /* d_lock not required for cursor */
98 list_del(&cursor->d_u.d_child); 105 list_del(&cursor->d_u.d_child);
99 p = file->f_path.dentry->d_subdirs.next; 106 p = dentry->d_subdirs.next;
100 while (n && p != &file->f_path.dentry->d_subdirs) { 107 while (n && p != &dentry->d_subdirs) {
101 struct dentry *next; 108 struct dentry *next;
102 next = list_entry(p, struct dentry, d_u.d_child); 109 next = list_entry(p, struct dentry, d_u.d_child);
103 if (!d_unhashed(next) && next->d_inode) 110 spin_lock_nested(&next->d_lock, DENTRY_D_LOCK_NESTED);
111 if (simple_positive(next))
104 n--; 112 n--;
113 spin_unlock(&next->d_lock);
105 p = p->next; 114 p = p->next;
106 } 115 }
107 list_add_tail(&cursor->d_u.d_child, p); 116 list_add_tail(&cursor->d_u.d_child, p);
108 spin_unlock(&dcache_lock); 117 spin_unlock(&dentry->d_lock);
109 } 118 }
110 } 119 }
111 mutex_unlock(&file->f_path.dentry->d_inode->i_mutex); 120 mutex_unlock(&dentry->d_inode->i_mutex);
112 return offset; 121 return offset;
113} 122}
114 123
@@ -148,29 +157,35 @@ int dcache_readdir(struct file * filp, void * dirent, filldir_t filldir)
148 i++; 157 i++;
149 /* fallthrough */ 158 /* fallthrough */
150 default: 159 default:
151 spin_lock(&dcache_lock); 160 spin_lock(&dentry->d_lock);
152 if (filp->f_pos == 2) 161 if (filp->f_pos == 2)
153 list_move(q, &dentry->d_subdirs); 162 list_move(q, &dentry->d_subdirs);
154 163
155 for (p=q->next; p != &dentry->d_subdirs; p=p->next) { 164 for (p=q->next; p != &dentry->d_subdirs; p=p->next) {
156 struct dentry *next; 165 struct dentry *next;
157 next = list_entry(p, struct dentry, d_u.d_child); 166 next = list_entry(p, struct dentry, d_u.d_child);
158 if (d_unhashed(next) || !next->d_inode) 167 spin_lock_nested(&next->d_lock, DENTRY_D_LOCK_NESTED);
168 if (!simple_positive(next)) {
169 spin_unlock(&next->d_lock);
159 continue; 170 continue;
171 }
160 172
161 spin_unlock(&dcache_lock); 173 spin_unlock(&next->d_lock);
174 spin_unlock(&dentry->d_lock);
162 if (filldir(dirent, next->d_name.name, 175 if (filldir(dirent, next->d_name.name,
163 next->d_name.len, filp->f_pos, 176 next->d_name.len, filp->f_pos,
164 next->d_inode->i_ino, 177 next->d_inode->i_ino,
165 dt_type(next->d_inode)) < 0) 178 dt_type(next->d_inode)) < 0)
166 return 0; 179 return 0;
167 spin_lock(&dcache_lock); 180 spin_lock(&dentry->d_lock);
181 spin_lock_nested(&next->d_lock, DENTRY_D_LOCK_NESTED);
168 /* next is still alive */ 182 /* next is still alive */
169 list_move(q, p); 183 list_move(q, p);
184 spin_unlock(&next->d_lock);
170 p = q; 185 p = q;
171 filp->f_pos++; 186 filp->f_pos++;
172 } 187 }
173 spin_unlock(&dcache_lock); 188 spin_unlock(&dentry->d_lock);
174 } 189 }
175 return 0; 190 return 0;
176} 191}
@@ -202,7 +217,8 @@ static const struct super_operations simple_super_operations = {
202 * will never be mountable) 217 * will never be mountable)
203 */ 218 */
204struct dentry *mount_pseudo(struct file_system_type *fs_type, char *name, 219struct dentry *mount_pseudo(struct file_system_type *fs_type, char *name,
205 const struct super_operations *ops, unsigned long magic) 220 const struct super_operations *ops,
221 const struct dentry_operations *dops, unsigned long magic)
206{ 222{
207 struct super_block *s = sget(fs_type, NULL, set_anon_super, NULL); 223 struct super_block *s = sget(fs_type, NULL, set_anon_super, NULL);
208 struct dentry *dentry; 224 struct dentry *dentry;
@@ -239,6 +255,7 @@ struct dentry *mount_pseudo(struct file_system_type *fs_type, char *name,
239 dentry->d_parent = dentry; 255 dentry->d_parent = dentry;
240 d_instantiate(dentry, root); 256 d_instantiate(dentry, root);
241 s->s_root = dentry; 257 s->s_root = dentry;
258 s->s_d_op = dops;
242 s->s_flags |= MS_ACTIVE; 259 s->s_flags |= MS_ACTIVE;
243 return dget(s->s_root); 260 return dget(s->s_root);
244 261
@@ -259,23 +276,23 @@ int simple_link(struct dentry *old_dentry, struct inode *dir, struct dentry *den
259 return 0; 276 return 0;
260} 277}
261 278
262static inline int simple_positive(struct dentry *dentry)
263{
264 return dentry->d_inode && !d_unhashed(dentry);
265}
266
267int simple_empty(struct dentry *dentry) 279int simple_empty(struct dentry *dentry)
268{ 280{
269 struct dentry *child; 281 struct dentry *child;
270 int ret = 0; 282 int ret = 0;
271 283
272 spin_lock(&dcache_lock); 284 spin_lock(&dentry->d_lock);
273 list_for_each_entry(child, &dentry->d_subdirs, d_u.d_child) 285 list_for_each_entry(child, &dentry->d_subdirs, d_u.d_child) {
274 if (simple_positive(child)) 286 spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_NESTED);
287 if (simple_positive(child)) {
288 spin_unlock(&child->d_lock);
275 goto out; 289 goto out;
290 }
291 spin_unlock(&child->d_lock);
292 }
276 ret = 1; 293 ret = 1;
277out: 294out:
278 spin_unlock(&dcache_lock); 295 spin_unlock(&dentry->d_lock);
279 return ret; 296 return ret;
280} 297}
281 298
diff --git a/fs/lockd/Makefile b/fs/lockd/Makefile
index 97f6073ab33..ca58d64374c 100644
--- a/fs/lockd/Makefile
+++ b/fs/lockd/Makefile
@@ -4,7 +4,7 @@
4 4
5obj-$(CONFIG_LOCKD) += lockd.o 5obj-$(CONFIG_LOCKD) += lockd.o
6 6
7lockd-objs-y := clntlock.o clntproc.o host.o svc.o svclock.o svcshare.o \ 7lockd-objs-y := clntlock.o clntproc.o clntxdr.o host.o svc.o svclock.o \
8 svcproc.o svcsubs.o mon.o xdr.o grace.o 8 svcshare.o svcproc.o svcsubs.o mon.o xdr.o grace.o
9lockd-objs-$(CONFIG_LOCKD_V4) += xdr4.o svc4proc.o 9lockd-objs-$(CONFIG_LOCKD_V4) += clnt4xdr.o xdr4.o svc4proc.o
10lockd-objs := $(lockd-objs-y) 10lockd-objs := $(lockd-objs-y)
diff --git a/fs/lockd/clnt4xdr.c b/fs/lockd/clnt4xdr.c
new file mode 100644
index 00000000000..f848b52c67b
--- /dev/null
+++ b/fs/lockd/clnt4xdr.c
@@ -0,0 +1,605 @@
1/*
2 * linux/fs/lockd/clnt4xdr.c
3 *
4 * XDR functions to encode/decode NLM version 4 RPC arguments and results.
5 *
6 * NLM client-side only.
7 *
8 * Copyright (C) 2010, Oracle. All rights reserved.
9 */
10
11#include <linux/types.h>
12#include <linux/sunrpc/xdr.h>
13#include <linux/sunrpc/clnt.h>
14#include <linux/sunrpc/stats.h>
15#include <linux/lockd/lockd.h>
16
17#define NLMDBG_FACILITY NLMDBG_XDR
18
19#if (NLMCLNT_OHSIZE > XDR_MAX_NETOBJ)
20# error "NLM host name cannot be larger than XDR_MAX_NETOBJ!"
21#endif
22
23#if (NLMCLNT_OHSIZE > NLM_MAXSTRLEN)
24# error "NLM host name cannot be larger than NLM's maximum string length!"
25#endif
26
27/*
28 * Declare the space requirements for NLM arguments and replies as
29 * number of 32bit-words
30 */
31#define NLM4_void_sz (0)
32#define NLM4_cookie_sz (1+(NLM_MAXCOOKIELEN>>2))
33#define NLM4_caller_sz (1+(NLMCLNT_OHSIZE>>2))
34#define NLM4_owner_sz (1+(NLMCLNT_OHSIZE>>2))
35#define NLM4_fhandle_sz (1+(NFS3_FHSIZE>>2))
36#define NLM4_lock_sz (5+NLM4_caller_sz+NLM4_owner_sz+NLM4_fhandle_sz)
37#define NLM4_holder_sz (6+NLM4_owner_sz)
38
39#define NLM4_testargs_sz (NLM4_cookie_sz+1+NLM4_lock_sz)
40#define NLM4_lockargs_sz (NLM4_cookie_sz+4+NLM4_lock_sz)
41#define NLM4_cancargs_sz (NLM4_cookie_sz+2+NLM4_lock_sz)
42#define NLM4_unlockargs_sz (NLM4_cookie_sz+NLM4_lock_sz)
43
44#define NLM4_testres_sz (NLM4_cookie_sz+1+NLM4_holder_sz)
45#define NLM4_res_sz (NLM4_cookie_sz+1)
46#define NLM4_norep_sz (0)
47
48
49static s64 loff_t_to_s64(loff_t offset)
50{
51 s64 res;
52
53 if (offset >= NLM4_OFFSET_MAX)
54 res = NLM4_OFFSET_MAX;
55 else if (offset <= -NLM4_OFFSET_MAX)
56 res = -NLM4_OFFSET_MAX;
57 else
58 res = offset;
59 return res;
60}
61
62static void nlm4_compute_offsets(const struct nlm_lock *lock,
63 u64 *l_offset, u64 *l_len)
64{
65 const struct file_lock *fl = &lock->fl;
66
67 BUG_ON(fl->fl_start > NLM4_OFFSET_MAX);
68 BUG_ON(fl->fl_end > NLM4_OFFSET_MAX &&
69 fl->fl_end != OFFSET_MAX);
70
71 *l_offset = loff_t_to_s64(fl->fl_start);
72 if (fl->fl_end == OFFSET_MAX)
73 *l_len = 0;
74 else
75 *l_len = loff_t_to_s64(fl->fl_end - fl->fl_start + 1);
76}
77
78/*
79 * Handle decode buffer overflows out-of-line.
80 */
81static void print_overflow_msg(const char *func, const struct xdr_stream *xdr)
82{
83 dprintk("lockd: %s prematurely hit the end of our receive buffer. "
84 "Remaining buffer length is %tu words.\n",
85 func, xdr->end - xdr->p);
86}
87
88
89/*
90 * Encode/decode NLMv4 basic data types
91 *
92 * Basic NLMv4 data types are defined in Appendix II, section 6.1.4
93 * of RFC 1813: "NFS Version 3 Protocol Specification" and in Chapter
94 * 10 of X/Open's "Protocols for Interworking: XNFS, Version 3W".
95 *
96 * Not all basic data types have their own encoding and decoding
97 * functions. For run-time efficiency, some data types are encoded
98 * or decoded inline.
99 */
100
101static void encode_bool(struct xdr_stream *xdr, const int value)
102{
103 __be32 *p;
104
105 p = xdr_reserve_space(xdr, 4);
106 *p = value ? xdr_one : xdr_zero;
107}
108
109static void encode_int32(struct xdr_stream *xdr, const s32 value)
110{
111 __be32 *p;
112
113 p = xdr_reserve_space(xdr, 4);
114 *p = cpu_to_be32(value);
115}
116
117/*
118 * typedef opaque netobj<MAXNETOBJ_SZ>
119 */
120static void encode_netobj(struct xdr_stream *xdr,
121 const u8 *data, const unsigned int length)
122{
123 __be32 *p;
124
125 BUG_ON(length > XDR_MAX_NETOBJ);
126 p = xdr_reserve_space(xdr, 4 + length);
127 xdr_encode_opaque(p, data, length);
128}
129
130static int decode_netobj(struct xdr_stream *xdr,
131 struct xdr_netobj *obj)
132{
133 u32 length;
134 __be32 *p;
135
136 p = xdr_inline_decode(xdr, 4);
137 if (unlikely(p == NULL))
138 goto out_overflow;
139 length = be32_to_cpup(p++);
140 if (unlikely(length > XDR_MAX_NETOBJ))
141 goto out_size;
142 obj->len = length;
143 obj->data = (u8 *)p;
144 return 0;
145out_size:
146 dprintk("NFS: returned netobj was too long: %u\n", length);
147 return -EIO;
148out_overflow:
149 print_overflow_msg(__func__, xdr);
150 return -EIO;
151}
152
153/*
154 * netobj cookie;
155 */
156static void encode_cookie(struct xdr_stream *xdr,
157 const struct nlm_cookie *cookie)
158{
159 BUG_ON(cookie->len > NLM_MAXCOOKIELEN);
160 encode_netobj(xdr, (u8 *)&cookie->data, cookie->len);
161}
162
163static int decode_cookie(struct xdr_stream *xdr,
164 struct nlm_cookie *cookie)
165{
166 u32 length;
167 __be32 *p;
168
169 p = xdr_inline_decode(xdr, 4);
170 if (unlikely(p == NULL))
171 goto out_overflow;
172 length = be32_to_cpup(p++);
173 /* apparently HPUX can return empty cookies */
174 if (length == 0)
175 goto out_hpux;
176 if (length > NLM_MAXCOOKIELEN)
177 goto out_size;
178 p = xdr_inline_decode(xdr, length);
179 if (unlikely(p == NULL))
180 goto out_overflow;
181 cookie->len = length;
182 memcpy(cookie->data, p, length);
183 return 0;
184out_hpux:
185 cookie->len = 4;
186 memset(cookie->data, 0, 4);
187 return 0;
188out_size:
189 dprintk("NFS: returned cookie was too long: %u\n", length);
190 return -EIO;
191out_overflow:
192 print_overflow_msg(__func__, xdr);
193 return -EIO;
194}
195
196/*
197 * netobj fh;
198 */
199static void encode_fh(struct xdr_stream *xdr, const struct nfs_fh *fh)
200{
201 BUG_ON(fh->size > NFS3_FHSIZE);
202 encode_netobj(xdr, (u8 *)&fh->data, fh->size);
203}
204
205/*
206 * enum nlm4_stats {
207 * NLM4_GRANTED = 0,
208 * NLM4_DENIED = 1,
209 * NLM4_DENIED_NOLOCKS = 2,
210 * NLM4_BLOCKED = 3,
211 * NLM4_DENIED_GRACE_PERIOD = 4,
212 * NLM4_DEADLCK = 5,
213 * NLM4_ROFS = 6,
214 * NLM4_STALE_FH = 7,
215 * NLM4_FBIG = 8,
216 * NLM4_FAILED = 9
217 * };
218 *
219 * struct nlm4_stat {
220 * nlm4_stats stat;
221 * };
222 *
223 * NB: we don't swap bytes for the NLM status values. The upper
224 * layers deal directly with the status value in network byte
225 * order.
226 */
227static void encode_nlm4_stat(struct xdr_stream *xdr,
228 const __be32 stat)
229{
230 __be32 *p;
231
232 BUG_ON(be32_to_cpu(stat) > NLM_FAILED);
233 p = xdr_reserve_space(xdr, 4);
234 *p = stat;
235}
236
237static int decode_nlm4_stat(struct xdr_stream *xdr, __be32 *stat)
238{
239 __be32 *p;
240
241 p = xdr_inline_decode(xdr, 4);
242 if (unlikely(p == NULL))
243 goto out_overflow;
244 if (unlikely(*p > nlm4_failed))
245 goto out_bad_xdr;
246 *stat = *p;
247 return 0;
248out_bad_xdr:
249 dprintk("%s: server returned invalid nlm4_stats value: %u\n",
250 __func__, be32_to_cpup(p));
251 return -EIO;
252out_overflow:
253 print_overflow_msg(__func__, xdr);
254 return -EIO;
255}
256
257/*
258 * struct nlm4_holder {
259 * bool exclusive;
260 * int32 svid;
261 * netobj oh;
262 * uint64 l_offset;
263 * uint64 l_len;
264 * };
265 */
266static void encode_nlm4_holder(struct xdr_stream *xdr,
267 const struct nlm_res *result)
268{
269 const struct nlm_lock *lock = &result->lock;
270 u64 l_offset, l_len;
271 __be32 *p;
272
273 encode_bool(xdr, lock->fl.fl_type == F_RDLCK);
274 encode_int32(xdr, lock->svid);
275 encode_netobj(xdr, lock->oh.data, lock->oh.len);
276
277 p = xdr_reserve_space(xdr, 4 + 4);
278 nlm4_compute_offsets(lock, &l_offset, &l_len);
279 p = xdr_encode_hyper(p, l_offset);
280 xdr_encode_hyper(p, l_len);
281}
282
283static int decode_nlm4_holder(struct xdr_stream *xdr, struct nlm_res *result)
284{
285 struct nlm_lock *lock = &result->lock;
286 struct file_lock *fl = &lock->fl;
287 u64 l_offset, l_len;
288 u32 exclusive;
289 int error;
290 __be32 *p;
291 s32 end;
292
293 memset(lock, 0, sizeof(*lock));
294 locks_init_lock(fl);
295
296 p = xdr_inline_decode(xdr, 4 + 4);
297 if (unlikely(p == NULL))
298 goto out_overflow;
299 exclusive = be32_to_cpup(p++);
300 lock->svid = be32_to_cpup(p);
301 fl->fl_pid = (pid_t)lock->svid;
302
303 error = decode_netobj(xdr, &lock->oh);
304 if (unlikely(error))
305 goto out;
306
307 p = xdr_inline_decode(xdr, 8 + 8);
308 if (unlikely(p == NULL))
309 goto out_overflow;
310
311 fl->fl_flags = FL_POSIX;
312 fl->fl_type = exclusive != 0 ? F_WRLCK : F_RDLCK;
313 p = xdr_decode_hyper(p, &l_offset);
314 xdr_decode_hyper(p, &l_len);
315 end = l_offset + l_len - 1;
316
317 fl->fl_start = (loff_t)l_offset;
318 if (l_len == 0 || end < 0)
319 fl->fl_end = OFFSET_MAX;
320 else
321 fl->fl_end = (loff_t)end;
322 error = 0;
323out:
324 return error;
325out_overflow:
326 print_overflow_msg(__func__, xdr);
327 return -EIO;
328}
329
330/*
331 * string caller_name<LM_MAXSTRLEN>;
332 */
333static void encode_caller_name(struct xdr_stream *xdr, const char *name)
334{
335 /* NB: client-side does not set lock->len */
336 u32 length = strlen(name);
337 __be32 *p;
338
339 BUG_ON(length > NLM_MAXSTRLEN);
340 p = xdr_reserve_space(xdr, 4 + length);
341 xdr_encode_opaque(p, name, length);
342}
343
344/*
345 * struct nlm4_lock {
346 * string caller_name<LM_MAXSTRLEN>;
347 * netobj fh;
348 * netobj oh;
349 * int32 svid;
350 * uint64 l_offset;
351 * uint64 l_len;
352 * };
353 */
354static void encode_nlm4_lock(struct xdr_stream *xdr,
355 const struct nlm_lock *lock)
356{
357 u64 l_offset, l_len;
358 __be32 *p;
359
360 encode_caller_name(xdr, lock->caller);
361 encode_fh(xdr, &lock->fh);
362 encode_netobj(xdr, lock->oh.data, lock->oh.len);
363
364 p = xdr_reserve_space(xdr, 4 + 8 + 8);
365 *p++ = cpu_to_be32(lock->svid);
366
367 nlm4_compute_offsets(lock, &l_offset, &l_len);
368 p = xdr_encode_hyper(p, l_offset);
369 xdr_encode_hyper(p, l_len);
370}
371
372
373/*
374 * NLMv4 XDR encode functions
375 *
376 * NLMv4 argument types are defined in Appendix II of RFC 1813:
377 * "NFS Version 3 Protocol Specification" and Chapter 10 of X/Open's
378 * "Protocols for Interworking: XNFS, Version 3W".
379 */
380
381/*
382 * struct nlm4_testargs {
383 * netobj cookie;
384 * bool exclusive;
385 * struct nlm4_lock alock;
386 * };
387 */
388static void nlm4_xdr_enc_testargs(struct rpc_rqst *req,
389 struct xdr_stream *xdr,
390 const struct nlm_args *args)
391{
392 const struct nlm_lock *lock = &args->lock;
393
394 encode_cookie(xdr, &args->cookie);
395 encode_bool(xdr, lock->fl.fl_type == F_WRLCK);
396 encode_nlm4_lock(xdr, lock);
397}
398
399/*
400 * struct nlm4_lockargs {
401 * netobj cookie;
402 * bool block;
403 * bool exclusive;
404 * struct nlm4_lock alock;
405 * bool reclaim;
406 * int state;
407 * };
408 */
409static void nlm4_xdr_enc_lockargs(struct rpc_rqst *req,
410 struct xdr_stream *xdr,
411 const struct nlm_args *args)
412{
413 const struct nlm_lock *lock = &args->lock;
414
415 encode_cookie(xdr, &args->cookie);
416 encode_bool(xdr, args->block);
417 encode_bool(xdr, lock->fl.fl_type == F_WRLCK);
418 encode_nlm4_lock(xdr, lock);
419 encode_bool(xdr, args->reclaim);
420 encode_int32(xdr, args->state);
421}
422
423/*
424 * struct nlm4_cancargs {
425 * netobj cookie;
426 * bool block;
427 * bool exclusive;
428 * struct nlm4_lock alock;
429 * };
430 */
431static void nlm4_xdr_enc_cancargs(struct rpc_rqst *req,
432 struct xdr_stream *xdr,
433 const struct nlm_args *args)
434{
435 const struct nlm_lock *lock = &args->lock;
436
437 encode_cookie(xdr, &args->cookie);
438 encode_bool(xdr, args->block);
439 encode_bool(xdr, lock->fl.fl_type == F_WRLCK);
440 encode_nlm4_lock(xdr, lock);
441}
442
443/*
444 * struct nlm4_unlockargs {
445 * netobj cookie;
446 * struct nlm4_lock alock;
447 * };
448 */
449static void nlm4_xdr_enc_unlockargs(struct rpc_rqst *req,
450 struct xdr_stream *xdr,
451 const struct nlm_args *args)
452{
453 const struct nlm_lock *lock = &args->lock;
454
455 encode_cookie(xdr, &args->cookie);
456 encode_nlm4_lock(xdr, lock);
457}
458
459/*
460 * struct nlm4_res {
461 * netobj cookie;
462 * nlm4_stat stat;
463 * };
464 */
465static void nlm4_xdr_enc_res(struct rpc_rqst *req,
466 struct xdr_stream *xdr,
467 const struct nlm_res *result)
468{
469 encode_cookie(xdr, &result->cookie);
470 encode_nlm4_stat(xdr, result->status);
471}
472
473/*
474 * union nlm4_testrply switch (nlm4_stats stat) {
475 * case NLM4_DENIED:
476 * struct nlm4_holder holder;
477 * default:
478 * void;
479 * };
480 *
481 * struct nlm4_testres {
482 * netobj cookie;
483 * nlm4_testrply test_stat;
484 * };
485 */
486static void nlm4_xdr_enc_testres(struct rpc_rqst *req,
487 struct xdr_stream *xdr,
488 const struct nlm_res *result)
489{
490 encode_cookie(xdr, &result->cookie);
491 encode_nlm4_stat(xdr, result->status);
492 if (result->status == nlm_lck_denied)
493 encode_nlm4_holder(xdr, result);
494}
495
496
497/*
498 * NLMv4 XDR decode functions
499 *
500 * NLMv4 argument types are defined in Appendix II of RFC 1813:
501 * "NFS Version 3 Protocol Specification" and Chapter 10 of X/Open's
502 * "Protocols for Interworking: XNFS, Version 3W".
503 */
504
505/*
506 * union nlm4_testrply switch (nlm4_stats stat) {
507 * case NLM4_DENIED:
508 * struct nlm4_holder holder;
509 * default:
510 * void;
511 * };
512 *
513 * struct nlm4_testres {
514 * netobj cookie;
515 * nlm4_testrply test_stat;
516 * };
517 */
518static int decode_nlm4_testrply(struct xdr_stream *xdr,
519 struct nlm_res *result)
520{
521 int error;
522
523 error = decode_nlm4_stat(xdr, &result->status);
524 if (unlikely(error))
525 goto out;
526 if (result->status == nlm_lck_denied)
527 error = decode_nlm4_holder(xdr, result);
528out:
529 return error;
530}
531
532static int nlm4_xdr_dec_testres(struct rpc_rqst *req,
533 struct xdr_stream *xdr,
534 struct nlm_res *result)
535{
536 int error;
537
538 error = decode_cookie(xdr, &result->cookie);
539 if (unlikely(error))
540 goto out;
541 error = decode_nlm4_testrply(xdr, result);
542out:
543 return error;
544}
545
546/*
547 * struct nlm4_res {
548 * netobj cookie;
549 * nlm4_stat stat;
550 * };
551 */
552static int nlm4_xdr_dec_res(struct rpc_rqst *req,
553 struct xdr_stream *xdr,
554 struct nlm_res *result)
555{
556 int error;
557
558 error = decode_cookie(xdr, &result->cookie);
559 if (unlikely(error))
560 goto out;
561 error = decode_nlm4_stat(xdr, &result->status);
562out:
563 return error;
564}
565
566
567/*
568 * For NLM, a void procedure really returns nothing
569 */
570#define nlm4_xdr_dec_norep NULL
571
/*
 * Build one nlm4_procedures[] entry, placed at index NLMPROC_<proc>.
 * The encoder, decoder and buffer sizes are selected by token pasting:
 * nlm4_xdr_enc_<argtype>, nlm4_xdr_dec_<restype>, NLM4_<argtype>_sz and
 * NLM4_<restype>_sz.  With restype "norep" the decoder expands to
 * nlm4_xdr_dec_norep, i.e. NULL.
 */
572#define PROC(proc, argtype, restype) \
573[NLMPROC_##proc] = { \
574 .p_proc = NLMPROC_##proc, \
575 .p_encode = (kxdreproc_t)nlm4_xdr_enc_##argtype, \
576 .p_decode = (kxdrdproc_t)nlm4_xdr_dec_##restype, \
577 .p_arglen = NLM4_##argtype##_sz, \
578 .p_replen = NLM4_##restype##_sz, \
579 .p_statidx = NLMPROC_##proc, \
580 .p_name = #proc, \
581 }
582
/*
 * NLMv4 client procedure table, indexed by NLMPROC_* number.
 * TEST through GRANTED decode a reply (testres or res).  The *_MSG and
 * *_RES entries use "norep", so no reply decoder is installed for them.
 * The *_RES entries reuse the result encoders as their argument encoders.
 */
583static struct rpc_procinfo nlm4_procedures[] = {
584 PROC(TEST, testargs, testres),
585 PROC(LOCK, lockargs, res),
586 PROC(CANCEL, cancargs, res),
587 PROC(UNLOCK, unlockargs, res),
588 PROC(GRANTED, testargs, res),
589 PROC(TEST_MSG, testargs, norep),
590 PROC(LOCK_MSG, lockargs, norep),
591 PROC(CANCEL_MSG, cancargs, norep),
592 PROC(UNLOCK_MSG, unlockargs, norep),
593 PROC(GRANTED_MSG, testargs, norep),
594 PROC(TEST_RES, testres, norep),
595 PROC(LOCK_RES, res, norep),
596 PROC(CANCEL_RES, res, norep),
597 PROC(UNLOCK_RES, res, norep),
598 PROC(GRANTED_RES, res, norep),
599};
600
/*
 * RPC version descriptor for NLM version 4, backed by nlm4_procedures[].
 * Not static: it is referenced from outside this file.
 */
601struct rpc_version nlm_version4 = {
602 .number = 4,
603 .nrprocs = ARRAY_SIZE(nlm4_procedures),
604 .procs = nlm4_procedures,
605};
diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c
index d5bb86866e6..8d4ea8351e3 100644
--- a/fs/lockd/clntlock.c
+++ b/fs/lockd/clntlock.c
@@ -14,7 +14,6 @@
14#include <linux/sunrpc/clnt.h> 14#include <linux/sunrpc/clnt.h>
15#include <linux/sunrpc/svc.h> 15#include <linux/sunrpc/svc.h>
16#include <linux/lockd/lockd.h> 16#include <linux/lockd/lockd.h>
17#include <linux/smp_lock.h>
18#include <linux/kthread.h> 17#include <linux/kthread.h>
19 18
20#define NLMDBG_FACILITY NLMDBG_CLIENT 19#define NLMDBG_FACILITY NLMDBG_CLIENT
@@ -80,7 +79,7 @@ EXPORT_SYMBOL_GPL(nlmclnt_init);
80 */ 79 */
81void nlmclnt_done(struct nlm_host *host) 80void nlmclnt_done(struct nlm_host *host)
82{ 81{
83 nlm_release_host(host); 82 nlmclnt_release_host(host);
84 lockd_down(); 83 lockd_down();
85} 84}
86EXPORT_SYMBOL_GPL(nlmclnt_done); 85EXPORT_SYMBOL_GPL(nlmclnt_done);
@@ -274,7 +273,7 @@ restart:
274 spin_unlock(&nlm_blocked_lock); 273 spin_unlock(&nlm_blocked_lock);
275 274
276 /* Release host handle after use */ 275 /* Release host handle after use */
277 nlm_release_host(host); 276 nlmclnt_release_host(host);
278 lockd_down(); 277 lockd_down();
279 return 0; 278 return 0;
280} 279}
diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index 47ea1e1925b..adb45ec9038 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -7,7 +7,6 @@
7 */ 7 */
8 8
9#include <linux/module.h> 9#include <linux/module.h>
10#include <linux/smp_lock.h>
11#include <linux/slab.h> 10#include <linux/slab.h>
12#include <linux/types.h> 11#include <linux/types.h>
13#include <linux/errno.h> 12#include <linux/errno.h>
@@ -59,7 +58,7 @@ static void nlm_put_lockowner(struct nlm_lockowner *lockowner)
59 return; 58 return;
60 list_del(&lockowner->list); 59 list_del(&lockowner->list);
61 spin_unlock(&lockowner->host->h_lock); 60 spin_unlock(&lockowner->host->h_lock);
62 nlm_release_host(lockowner->host); 61 nlmclnt_release_host(lockowner->host);
63 kfree(lockowner); 62 kfree(lockowner);
64} 63}
65 64
@@ -208,22 +207,22 @@ struct nlm_rqst *nlm_alloc_call(struct nlm_host *host)
208 printk("nlm_alloc_call: failed, waiting for memory\n"); 207 printk("nlm_alloc_call: failed, waiting for memory\n");
209 schedule_timeout_interruptible(5*HZ); 208 schedule_timeout_interruptible(5*HZ);
210 } 209 }
211 nlm_release_host(host); 210 nlmclnt_release_host(host);
212 return NULL; 211 return NULL;
213} 212}
214 213
215void nlm_release_call(struct nlm_rqst *call) 214void nlmclnt_release_call(struct nlm_rqst *call)
216{ 215{
217 if (!atomic_dec_and_test(&call->a_count)) 216 if (!atomic_dec_and_test(&call->a_count))
218 return; 217 return;
219 nlm_release_host(call->a_host); 218 nlmclnt_release_host(call->a_host);
220 nlmclnt_release_lockargs(call); 219 nlmclnt_release_lockargs(call);
221 kfree(call); 220 kfree(call);
222} 221}
223 222
224static void nlmclnt_rpc_release(void *data) 223static void nlmclnt_rpc_release(void *data)
225{ 224{
226 nlm_release_call(data); 225 nlmclnt_release_call(data);
227} 226}
228 227
229static int nlm_wait_on_grace(wait_queue_head_t *queue) 228static int nlm_wait_on_grace(wait_queue_head_t *queue)
@@ -437,7 +436,7 @@ nlmclnt_test(struct nlm_rqst *req, struct file_lock *fl)
437 status = nlm_stat_to_errno(req->a_res.status); 436 status = nlm_stat_to_errno(req->a_res.status);
438 } 437 }
439out: 438out:
440 nlm_release_call(req); 439 nlmclnt_release_call(req);
441 return status; 440 return status;
442} 441}
443 442
@@ -594,7 +593,7 @@ again:
594out_unblock: 593out_unblock:
595 nlmclnt_finish_block(block); 594 nlmclnt_finish_block(block);
596out: 595out:
597 nlm_release_call(req); 596 nlmclnt_release_call(req);
598 return status; 597 return status;
599out_unlock: 598out_unlock:
600 /* Fatal error: ensure that we remove the lock altogether */ 599 /* Fatal error: ensure that we remove the lock altogether */
@@ -695,7 +694,7 @@ nlmclnt_unlock(struct nlm_rqst *req, struct file_lock *fl)
695 /* What to do now? I'm out of my depth... */ 694 /* What to do now? I'm out of my depth... */
696 status = -ENOLCK; 695 status = -ENOLCK;
697out: 696out:
698 nlm_release_call(req); 697 nlmclnt_release_call(req);
699 return status; 698 return status;
700} 699}
701 700
@@ -756,7 +755,7 @@ static int nlmclnt_cancel(struct nlm_host *host, int block, struct file_lock *fl
756 NLMPROC_CANCEL, &nlmclnt_cancel_ops); 755 NLMPROC_CANCEL, &nlmclnt_cancel_ops);
757 if (status == 0 && req->a_res.status == nlm_lck_denied) 756 if (status == 0 && req->a_res.status == nlm_lck_denied)
758 status = -ENOLCK; 757 status = -ENOLCK;
759 nlm_release_call(req); 758 nlmclnt_release_call(req);
760 return status; 759 return status;
761} 760}
762 761
diff --git a/fs/lockd/clntxdr.c b/fs/lockd/clntxdr.c
new file mode 100644
index 00000000000..180ac34feb9
--- /dev/null
+++ b/fs/lockd/clntxdr.c
@@ -0,0 +1,627 @@
1/*
2 * linux/fs/lockd/clntxdr.c
3 *
4 * XDR functions to encode/decode NLM version 3 RPC arguments and results.
5 * NLM version 3 is backwards compatible with NLM versions 1 and 2.
6 *
7 * NLM client-side only.
8 *
9 * Copyright (C) 2010, Oracle. All rights reserved.
10 */
11
12#include <linux/types.h>
13#include <linux/sunrpc/xdr.h>
14#include <linux/sunrpc/clnt.h>
15#include <linux/sunrpc/stats.h>
16#include <linux/lockd/lockd.h>
17
18#define NLMDBG_FACILITY NLMDBG_XDR
19
20#if (NLMCLNT_OHSIZE > XDR_MAX_NETOBJ)
21# error "NLM host name cannot be larger than XDR_MAX_NETOBJ!"
22#endif
23
24/*
25 * Declare the space requirements for NLM arguments and replies as
26 * number of 32bit-words
27 */
/* Variable-length items: one length word plus the maximum payload in words */
28#define NLM_cookie_sz (1+(NLM_MAXCOOKIELEN>>2))
29#define NLM_caller_sz (1+(NLMCLNT_OHSIZE>>2))
30#define NLM_owner_sz (1+(NLMCLNT_OHSIZE>>2))
31#define NLM_fhandle_sz (1+(NFS2_FHSIZE>>2))
/* nlm_lock: 3 fixed words (uppid, l_offset, l_len) plus caller, oh and fh */
32#define NLM_lock_sz (3+NLM_caller_sz+NLM_owner_sz+NLM_fhandle_sz)
/* nlm_holder: 4 fixed words (exclusive, uppid, l_offset, l_len) plus oh */
33#define NLM_holder_sz (4+NLM_owner_sz)
34
/*
 * Argument sizes.  The constant added to cookie+lock counts the extra
 * fixed words: testargs has exclusive; lockargs has block, exclusive,
 * reclaim and state; cancargs has block and exclusive.
 */
35#define NLM_testargs_sz (NLM_cookie_sz+1+NLM_lock_sz)
36#define NLM_lockargs_sz (NLM_cookie_sz+4+NLM_lock_sz)
37#define NLM_cancargs_sz (NLM_cookie_sz+2+NLM_lock_sz)
38#define NLM_unlockargs_sz (NLM_cookie_sz+NLM_lock_sz)
39
/* Reply sizes: the "+1" is the status word; a void reply takes no space */
40#define NLM_testres_sz (NLM_cookie_sz+1+NLM_holder_sz)
41#define NLM_res_sz (NLM_cookie_sz+1)
42#define NLM_norep_sz (0)
43
44
45static s32 loff_t_to_s32(loff_t offset)
46{
47 s32 res;
48
49 if (offset >= NLM_OFFSET_MAX)
50 res = NLM_OFFSET_MAX;
51 else if (offset <= -NLM_OFFSET_MAX)
52 res = -NLM_OFFSET_MAX;
53 else
54 res = offset;
55 return res;
56}
57
58static void nlm_compute_offsets(const struct nlm_lock *lock,
59 u32 *l_offset, u32 *l_len)
60{
61 const struct file_lock *fl = &lock->fl;
62
63 BUG_ON(fl->fl_start > NLM_OFFSET_MAX);
64 BUG_ON(fl->fl_end > NLM_OFFSET_MAX &&
65 fl->fl_end != OFFSET_MAX);
66
67 *l_offset = loff_t_to_s32(fl->fl_start);
68 if (fl->fl_end == OFFSET_MAX)
69 *l_len = 0;
70 else
71 *l_len = loff_t_to_s32(fl->fl_end - fl->fl_start + 1);
72}
73
74/*
75 * Handle decode buffer overflows out-of-line.
76 */
77static void print_overflow_msg(const char *func, const struct xdr_stream *xdr)
78{
79 dprintk("lockd: %s prematurely hit the end of our receive buffer. "
80 "Remaining buffer length is %tu words.\n",
81 func, xdr->end - xdr->p);
82}
83
84
85/*
86 * Encode/decode NLMv3 basic data types
87 *
88 * Basic NLMv3 data types are not defined in an IETF standards
89 * document. X/Open has a description of these data types that
90 * is useful. See Chapter 10 of "Protocols for Interworking:
91 * XNFS, Version 3W".
92 *
93 * Not all basic data types have their own encoding and decoding
94 * functions. For run-time efficiency, some data types are encoded
95 * or decoded inline.
96 */
97
98static void encode_bool(struct xdr_stream *xdr, const int value)
99{
100 __be32 *p;
101
102 p = xdr_reserve_space(xdr, 4);
103 *p = value ? xdr_one : xdr_zero;
104}
105
106static void encode_int32(struct xdr_stream *xdr, const s32 value)
107{
108 __be32 *p;
109
110 p = xdr_reserve_space(xdr, 4);
111 *p = cpu_to_be32(value);
112}
113
114/*
115 * typedef opaque netobj<MAXNETOBJ_SZ>
116 */
117static void encode_netobj(struct xdr_stream *xdr,
118 const u8 *data, const unsigned int length)
119{
120 __be32 *p;
121
122 BUG_ON(length > XDR_MAX_NETOBJ);
123 p = xdr_reserve_space(xdr, 4 + length);
124 xdr_encode_opaque(p, data, length);
125}
126
127static int decode_netobj(struct xdr_stream *xdr,
128 struct xdr_netobj *obj)
129{
130 u32 length;
131 __be32 *p;
132
133 p = xdr_inline_decode(xdr, 4);
134 if (unlikely(p == NULL))
135 goto out_overflow;
136 length = be32_to_cpup(p++);
137 if (unlikely(length > XDR_MAX_NETOBJ))
138 goto out_size;
139 obj->len = length;
140 obj->data = (u8 *)p;
141 return 0;
142out_size:
143 dprintk("NFS: returned netobj was too long: %u\n", length);
144 return -EIO;
145out_overflow:
146 print_overflow_msg(__func__, xdr);
147 return -EIO;
148}
149
150/*
151 * netobj cookie;
152 */
153static void encode_cookie(struct xdr_stream *xdr,
154 const struct nlm_cookie *cookie)
155{
156 BUG_ON(cookie->len > NLM_MAXCOOKIELEN);
157 encode_netobj(xdr, (u8 *)&cookie->data, cookie->len);
158}
159
160static int decode_cookie(struct xdr_stream *xdr,
161 struct nlm_cookie *cookie)
162{
163 u32 length;
164 __be32 *p;
165
166 p = xdr_inline_decode(xdr, 4);
167 if (unlikely(p == NULL))
168 goto out_overflow;
169 length = be32_to_cpup(p++);
170 /* apparently HPUX can return empty cookies */
171 if (length == 0)
172 goto out_hpux;
173 if (length > NLM_MAXCOOKIELEN)
174 goto out_size;
175 p = xdr_inline_decode(xdr, length);
176 if (unlikely(p == NULL))
177 goto out_overflow;
178 cookie->len = length;
179 memcpy(cookie->data, p, length);
180 return 0;
181out_hpux:
182 cookie->len = 4;
183 memset(cookie->data, 0, 4);
184 return 0;
185out_size:
186 dprintk("NFS: returned cookie was too long: %u\n", length);
187 return -EIO;
188out_overflow:
189 print_overflow_msg(__func__, xdr);
190 return -EIO;
191}
192
193/*
194 * netobj fh;
195 */
196static void encode_fh(struct xdr_stream *xdr, const struct nfs_fh *fh)
197{
198 BUG_ON(fh->size != NFS2_FHSIZE);
199 encode_netobj(xdr, (u8 *)&fh->data, NFS2_FHSIZE);
200}
201
202/*
203 * enum nlm_stats {
204 * LCK_GRANTED = 0,
205 * LCK_DENIED = 1,
206 * LCK_DENIED_NOLOCKS = 2,
207 * LCK_BLOCKED = 3,
208 * LCK_DENIED_GRACE_PERIOD = 4
209 * };
210 *
211 *
212 * struct nlm_stat {
213 * nlm_stats stat;
214 * };
215 *
216 * NB: we don't swap bytes for the NLM status values. The upper
217 * layers deal directly with the status value in network byte
218 * order.
219 */
220
221static void encode_nlm_stat(struct xdr_stream *xdr,
222 const __be32 stat)
223{
224 __be32 *p;
225
226 BUG_ON(be32_to_cpu(stat) > NLM_LCK_DENIED_GRACE_PERIOD);
227 p = xdr_reserve_space(xdr, 4);
228 *p = stat;
229}
230
231static int decode_nlm_stat(struct xdr_stream *xdr,
232 __be32 *stat)
233{
234 __be32 *p;
235
236 p = xdr_inline_decode(xdr, 4);
237 if (unlikely(p == NULL))
238 goto out_overflow;
239 if (unlikely(*p > nlm_lck_denied_grace_period))
240 goto out_enum;
241 *stat = *p;
242 return 0;
243out_enum:
244 dprintk("%s: server returned invalid nlm_stats value: %u\n",
245 __func__, be32_to_cpup(p));
246 return -EIO;
247out_overflow:
248 print_overflow_msg(__func__, xdr);
249 return -EIO;
250}
251
252/*
253 * struct nlm_holder {
254 * bool exclusive;
255 * int uppid;
256 * netobj oh;
257 * unsigned l_offset;
258 * unsigned l_len;
259 * };
260 */
261static void encode_nlm_holder(struct xdr_stream *xdr,
262 const struct nlm_res *result)
263{
264 const struct nlm_lock *lock = &result->lock;
265 u32 l_offset, l_len;
266 __be32 *p;
267
268 encode_bool(xdr, lock->fl.fl_type == F_RDLCK);
269 encode_int32(xdr, lock->svid);
270 encode_netobj(xdr, lock->oh.data, lock->oh.len);
271
272 p = xdr_reserve_space(xdr, 4 + 4);
273 nlm_compute_offsets(lock, &l_offset, &l_len);
274 *p++ = cpu_to_be32(l_offset);
275 *p = cpu_to_be32(l_len);
276}
277
278static int decode_nlm_holder(struct xdr_stream *xdr, struct nlm_res *result)
279{
280 struct nlm_lock *lock = &result->lock;
281 struct file_lock *fl = &lock->fl;
282 u32 exclusive, l_offset, l_len;
283 int error;
284 __be32 *p;
285 s32 end;
286
287 memset(lock, 0, sizeof(*lock));
288 locks_init_lock(fl);
289
290 p = xdr_inline_decode(xdr, 4 + 4);
291 if (unlikely(p == NULL))
292 goto out_overflow;
293 exclusive = be32_to_cpup(p++);
294 lock->svid = be32_to_cpup(p);
295 fl->fl_pid = (pid_t)lock->svid;
296
297 error = decode_netobj(xdr, &lock->oh);
298 if (unlikely(error))
299 goto out;
300
301 p = xdr_inline_decode(xdr, 4 + 4);
302 if (unlikely(p == NULL))
303 goto out_overflow;
304
305 fl->fl_flags = FL_POSIX;
306 fl->fl_type = exclusive != 0 ? F_WRLCK : F_RDLCK;
307 l_offset = be32_to_cpup(p++);
308 l_len = be32_to_cpup(p);
309 end = l_offset + l_len - 1;
310
311 fl->fl_start = (loff_t)l_offset;
312 if (l_len == 0 || end < 0)
313 fl->fl_end = OFFSET_MAX;
314 else
315 fl->fl_end = (loff_t)end;
316 error = 0;
317out:
318 return error;
319out_overflow:
320 print_overflow_msg(__func__, xdr);
321 return -EIO;
322}
323
324/*
325 * string caller_name<LM_MAXSTRLEN>;
326 */
327static void encode_caller_name(struct xdr_stream *xdr, const char *name)
328{
329 /* NB: client-side does not set lock->len */
330 u32 length = strlen(name);
331 __be32 *p;
332
333 BUG_ON(length > NLM_MAXSTRLEN);
334 p = xdr_reserve_space(xdr, 4 + length);
335 xdr_encode_opaque(p, name, length);
336}
337
338/*
339 * struct nlm_lock {
340 * string caller_name<LM_MAXSTRLEN>;
341 * netobj fh;
342 * netobj oh;
343 * int uppid;
344 * unsigned l_offset;
345 * unsigned l_len;
346 * };
347 */
348static void encode_nlm_lock(struct xdr_stream *xdr,
349 const struct nlm_lock *lock)
350{
351 u32 l_offset, l_len;
352 __be32 *p;
353
354 encode_caller_name(xdr, lock->caller);
355 encode_fh(xdr, &lock->fh);
356 encode_netobj(xdr, lock->oh.data, lock->oh.len);
357
358 p = xdr_reserve_space(xdr, 4 + 4 + 4);
359 *p++ = cpu_to_be32(lock->svid);
360
361 nlm_compute_offsets(lock, &l_offset, &l_len);
362 *p++ = cpu_to_be32(l_offset);
363 *p = cpu_to_be32(l_len);
364}
365
366
367/*
368 * NLMv3 XDR encode functions
369 *
370 * NLMv3 argument types are defined in Chapter 10 of The Open Group's
371 * "Protocols for Interworking: XNFS, Version 3W".
372 */
373
374/*
375 * struct nlm_testargs {
376 * netobj cookie;
377 * bool exclusive;
378 * struct nlm_lock alock;
379 * };
380 */
381static void nlm_xdr_enc_testargs(struct rpc_rqst *req,
382 struct xdr_stream *xdr,
383 const struct nlm_args *args)
384{
385 const struct nlm_lock *lock = &args->lock;
386
387 encode_cookie(xdr, &args->cookie);
388 encode_bool(xdr, lock->fl.fl_type == F_WRLCK);
389 encode_nlm_lock(xdr, lock);
390}
391
392/*
393 * struct nlm_lockargs {
394 * netobj cookie;
395 * bool block;
396 * bool exclusive;
397 * struct nlm_lock alock;
398 * bool reclaim;
399 * int state;
400 * };
401 */
402static void nlm_xdr_enc_lockargs(struct rpc_rqst *req,
403 struct xdr_stream *xdr,
404 const struct nlm_args *args)
405{
406 const struct nlm_lock *lock = &args->lock;
407
408 encode_cookie(xdr, &args->cookie);
409 encode_bool(xdr, args->block);
410 encode_bool(xdr, lock->fl.fl_type == F_WRLCK);
411 encode_nlm_lock(xdr, lock);
412 encode_bool(xdr, args->reclaim);
413 encode_int32(xdr, args->state);
414}
415
416/*
417 * struct nlm_cancargs {
418 * netobj cookie;
419 * bool block;
420 * bool exclusive;
421 * struct nlm_lock alock;
422 * };
423 */
424static void nlm_xdr_enc_cancargs(struct rpc_rqst *req,
425 struct xdr_stream *xdr,
426 const struct nlm_args *args)
427{
428 const struct nlm_lock *lock = &args->lock;
429
430 encode_cookie(xdr, &args->cookie);
431 encode_bool(xdr, args->block);
432 encode_bool(xdr, lock->fl.fl_type == F_WRLCK);
433 encode_nlm_lock(xdr, lock);
434}
435
436/*
437 * struct nlm_unlockargs {
438 * netobj cookie;
439 * struct nlm_lock alock;
440 * };
441 */
442static void nlm_xdr_enc_unlockargs(struct rpc_rqst *req,
443 struct xdr_stream *xdr,
444 const struct nlm_args *args)
445{
446 const struct nlm_lock *lock = &args->lock;
447
448 encode_cookie(xdr, &args->cookie);
449 encode_nlm_lock(xdr, lock);
450}
451
452/*
453 * struct nlm_res {
454 * netobj cookie;
455 * nlm_stat stat;
456 * };
457 */
458static void nlm_xdr_enc_res(struct rpc_rqst *req,
459 struct xdr_stream *xdr,
460 const struct nlm_res *result)
461{
462 encode_cookie(xdr, &result->cookie);
463 encode_nlm_stat(xdr, result->status);
464}
465
466/*
467 * union nlm_testrply switch (nlm_stats stat) {
468 * case LCK_DENIED:
469 * struct nlm_holder holder;
470 * default:
471 * void;
472 * };
473 *
474 * struct nlm_testres {
475 * netobj cookie;
476 * nlm_testrply test_stat;
477 * };
478 */
479static void encode_nlm_testrply(struct xdr_stream *xdr,
480 const struct nlm_res *result)
481{
482 if (result->status == nlm_lck_denied)
483 encode_nlm_holder(xdr, result);
484}
485
486static void nlm_xdr_enc_testres(struct rpc_rqst *req,
487 struct xdr_stream *xdr,
488 const struct nlm_res *result)
489{
490 encode_cookie(xdr, &result->cookie);
491 encode_nlm_stat(xdr, result->status);
492 encode_nlm_testrply(xdr, result);
493}
494
495
496/*
497 * NLMv3 XDR decode functions
498 *
499 * NLMv3 result types are defined in Chapter 10 of The Open Group's
500 * "Protocols for Interworking: XNFS, Version 3W".
501 */
502
503/*
504 * union nlm_testrply switch (nlm_stats stat) {
505 * case LCK_DENIED:
506 * struct nlm_holder holder;
507 * default:
508 * void;
509 * };
510 *
511 * struct nlm_testres {
512 * netobj cookie;
513 * nlm_testrply test_stat;
514 * };
515 */
516static int decode_nlm_testrply(struct xdr_stream *xdr,
517 struct nlm_res *result)
518{
519 int error;
520
521 error = decode_nlm_stat(xdr, &result->status);
522 if (unlikely(error))
523 goto out;
524 if (result->status == nlm_lck_denied)
525 error = decode_nlm_holder(xdr, result);
526out:
527 return error;
528}
529
530static int nlm_xdr_dec_testres(struct rpc_rqst *req,
531 struct xdr_stream *xdr,
532 struct nlm_res *result)
533{
534 int error;
535
536 error = decode_cookie(xdr, &result->cookie);
537 if (unlikely(error))
538 goto out;
539 error = decode_nlm_testrply(xdr, result);
540out:
541 return error;
542}
543
544/*
545 * struct nlm_res {
546 * netobj cookie;
547 * nlm_stat stat;
548 * };
549 */
550static int nlm_xdr_dec_res(struct rpc_rqst *req,
551 struct xdr_stream *xdr,
552 struct nlm_res *result)
553{
554 int error;
555
556 error = decode_cookie(xdr, &result->cookie);
557 if (unlikely(error))
558 goto out;
559 error = decode_nlm_stat(xdr, &result->status);
560out:
561 return error;
562}
563
564
565/*
566 * For NLM, a void procedure really returns nothing
567 */
568#define nlm_xdr_dec_norep NULL
569
/*
 * Build one nlm_procedures[] entry, placed at index NLMPROC_<proc>.
 * The encoder, decoder and buffer sizes are selected by token pasting:
 * nlm_xdr_enc_<argtype>, nlm_xdr_dec_<restype>, NLM_<argtype>_sz and
 * NLM_<restype>_sz.  With restype "norep" the decoder expands to
 * nlm_xdr_dec_norep, i.e. NULL.
 */
570#define PROC(proc, argtype, restype) \
571[NLMPROC_##proc] = { \
572 .p_proc = NLMPROC_##proc, \
573 .p_encode = (kxdreproc_t)nlm_xdr_enc_##argtype, \
574 .p_decode = (kxdrdproc_t)nlm_xdr_dec_##restype, \
575 .p_arglen = NLM_##argtype##_sz, \
576 .p_replen = NLM_##restype##_sz, \
577 .p_statidx = NLMPROC_##proc, \
578 .p_name = #proc, \
579 }
580
/*
 * NLM client procedure table, indexed by NLMPROC_* number and shared by
 * nlm_version1 and nlm_version3.
 * TEST through GRANTED decode a reply (testres or res).  The *_MSG and
 * *_RES entries use "norep", so no reply decoder is installed for them.
 * The *_RES entries reuse the result encoders as their argument encoders.
 */
581static struct rpc_procinfo nlm_procedures[] = {
582 PROC(TEST, testargs, testres),
583 PROC(LOCK, lockargs, res),
584 PROC(CANCEL, cancargs, res),
585 PROC(UNLOCK, unlockargs, res),
586 PROC(GRANTED, testargs, res),
587 PROC(TEST_MSG, testargs, norep),
588 PROC(LOCK_MSG, lockargs, norep),
589 PROC(CANCEL_MSG, cancargs, norep),
590 PROC(UNLOCK_MSG, unlockargs, norep),
591 PROC(GRANTED_MSG, testargs, norep),
592 PROC(TEST_RES, testres, norep),
593 PROC(LOCK_RES, res, norep),
594 PROC(CANCEL_RES, res, norep),
595 PROC(UNLOCK_RES, res, norep),
596 PROC(GRANTED_RES, res, norep),
597};
598
/* RPC version descriptor for NLM version 1; uses the nlm_procedures[] table. */
599static struct rpc_version nlm_version1 = {
600 .number = 1,
601 .nrprocs = ARRAY_SIZE(nlm_procedures),
602 .procs = nlm_procedures,
603};
604
/* RPC version descriptor for NLM version 3; same procedure table as version 1. */
605static struct rpc_version nlm_version3 = {
606 .number = 3,
607 .nrprocs = ARRAY_SIZE(nlm_procedures),
608 .procs = nlm_procedures,
609};
610
/*
 * Version table indexed by NLM version number.  Only slots 1 and 3 are
 * always populated; slot 4 exists only when CONFIG_LOCKD_V4 is set.
 * Unlisted slots are left as NULL by the designated initializers.
 */
611static struct rpc_version *nlm_versions[] = {
612 [1] = &nlm_version1,
613 [3] = &nlm_version3,
614#ifdef CONFIG_LOCKD_V4
615 [4] = &nlm_version4,
616#endif
617};
618
/* RPC statistics block handed to nlm_program through its .stats field. */
619static struct rpc_stat nlm_rpc_stats;
620
/*
 * RPC program descriptor for the "lockd" client: program number
 * NLM_PROGRAM, the nlm_versions[] table and its statistics block.
 * Not static, so it is visible outside this file.
 */
621struct rpc_program nlm_program = {
622 .name = "lockd",
623 .number = NLM_PROGRAM,
624 .nrvers = ARRAY_SIZE(nlm_versions),
625 .version = nlm_versions,
626 .stats = &nlm_rpc_stats,
627};
diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index 25e21e4023b..b7c99bfb3da 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -25,9 +25,22 @@
25#define NLM_HOST_EXPIRE (300 * HZ) 25#define NLM_HOST_EXPIRE (300 * HZ)
26#define NLM_HOST_COLLECT (120 * HZ) 26#define NLM_HOST_COLLECT (120 * HZ)
27 27
28static struct hlist_head nlm_hosts[NLM_HOST_NRHASH]; 28static struct hlist_head nlm_server_hosts[NLM_HOST_NRHASH];
29static struct hlist_head nlm_client_hosts[NLM_HOST_NRHASH];
30
31#define for_each_host(host, pos, chain, table) \
32 for ((chain) = (table); \
33 (chain) < (table) + NLM_HOST_NRHASH; ++(chain)) \
34 hlist_for_each_entry((host), (pos), (chain), h_hash)
35
36#define for_each_host_safe(host, pos, next, chain, table) \
37 for ((chain) = (table); \
38 (chain) < (table) + NLM_HOST_NRHASH; ++(chain)) \
39 hlist_for_each_entry_safe((host), (pos), (next), \
40 (chain), h_hash)
41
29static unsigned long next_gc; 42static unsigned long next_gc;
30static int nrhosts; 43static unsigned long nrhosts;
31static DEFINE_MUTEX(nlm_host_mutex); 44static DEFINE_MUTEX(nlm_host_mutex);
32 45
33static void nlm_gc_hosts(void); 46static void nlm_gc_hosts(void);
@@ -40,8 +53,6 @@ struct nlm_lookup_host_info {
40 const u32 version; /* NLM version to search for */ 53 const u32 version; /* NLM version to search for */
41 const char *hostname; /* remote's hostname */ 54 const char *hostname; /* remote's hostname */
42 const size_t hostname_len; /* it's length */ 55 const size_t hostname_len; /* it's length */
43 const struct sockaddr *src_sap; /* our address (optional) */
44 const size_t src_len; /* it's length */
45 const int noresvport; /* use non-priv port */ 56 const int noresvport; /* use non-priv port */
46}; 57};
47 58
@@ -88,126 +99,83 @@ static unsigned int nlm_hash_address(const struct sockaddr *sap)
88} 99}
89 100
90/* 101/*
91 * Common host lookup routine for server & client 102 * Allocate and initialize an nlm_host. Common to both client and server.
92 */ 103 */
93static struct nlm_host *nlm_lookup_host(struct nlm_lookup_host_info *ni) 104static struct nlm_host *nlm_alloc_host(struct nlm_lookup_host_info *ni,
105 struct nsm_handle *nsm)
94{ 106{
95 struct hlist_head *chain; 107 struct nlm_host *host = NULL;
96 struct hlist_node *pos; 108 unsigned long now = jiffies;
97 struct nlm_host *host;
98 struct nsm_handle *nsm = NULL;
99
100 mutex_lock(&nlm_host_mutex);
101
102 if (time_after_eq(jiffies, next_gc))
103 nlm_gc_hosts();
104
105 /* We may keep several nlm_host objects for a peer, because each
106 * nlm_host is identified by
107 * (address, protocol, version, server/client)
108 * We could probably simplify this a little by putting all those
109 * different NLM rpc_clients into one single nlm_host object.
110 * This would allow us to have one nlm_host per address.
111 */
112 chain = &nlm_hosts[nlm_hash_address(ni->sap)];
113 hlist_for_each_entry(host, pos, chain, h_hash) {
114 if (!rpc_cmp_addr(nlm_addr(host), ni->sap))
115 continue;
116
117 /* See if we have an NSM handle for this client */
118 if (!nsm)
119 nsm = host->h_nsmhandle;
120
121 if (host->h_proto != ni->protocol)
122 continue;
123 if (host->h_version != ni->version)
124 continue;
125 if (host->h_server != ni->server)
126 continue;
127 if (ni->server &&
128 !rpc_cmp_addr(nlm_srcaddr(host), ni->src_sap))
129 continue;
130
131 /* Move to head of hash chain. */
132 hlist_del(&host->h_hash);
133 hlist_add_head(&host->h_hash, chain);
134
135 nlm_get_host(host);
136 dprintk("lockd: nlm_lookup_host found host %s (%s)\n",
137 host->h_name, host->h_addrbuf);
138 goto out;
139 }
140 109
141 /* 110 if (nsm != NULL)
142 * The host wasn't in our hash table. If we don't
143 * have an NSM handle for it yet, create one.
144 */
145 if (nsm)
146 atomic_inc(&nsm->sm_count); 111 atomic_inc(&nsm->sm_count);
147 else { 112 else {
148 host = NULL; 113 host = NULL;
149 nsm = nsm_get_handle(ni->sap, ni->salen, 114 nsm = nsm_get_handle(ni->sap, ni->salen,
150 ni->hostname, ni->hostname_len); 115 ni->hostname, ni->hostname_len);
151 if (!nsm) { 116 if (unlikely(nsm == NULL)) {
152 dprintk("lockd: nlm_lookup_host failed; " 117 dprintk("lockd: %s failed; no nsm handle\n",
153 "no nsm handle\n"); 118 __func__);
154 goto out; 119 goto out;
155 } 120 }
156 } 121 }
157 122
158 host = kzalloc(sizeof(*host), GFP_KERNEL); 123 host = kmalloc(sizeof(*host), GFP_KERNEL);
159 if (!host) { 124 if (unlikely(host == NULL)) {
125 dprintk("lockd: %s failed; no memory\n", __func__);
160 nsm_release(nsm); 126 nsm_release(nsm);
161 dprintk("lockd: nlm_lookup_host failed; no memory\n");
162 goto out; 127 goto out;
163 } 128 }
164 host->h_name = nsm->sm_name; 129
165 host->h_addrbuf = nsm->sm_addrbuf;
166 memcpy(nlm_addr(host), ni->sap, ni->salen); 130 memcpy(nlm_addr(host), ni->sap, ni->salen);
167 host->h_addrlen = ni->salen; 131 host->h_addrlen = ni->salen;
168 rpc_set_port(nlm_addr(host), 0); 132 rpc_set_port(nlm_addr(host), 0);
169 memcpy(nlm_srcaddr(host), ni->src_sap, ni->src_len); 133 host->h_srcaddrlen = 0;
134
135 host->h_rpcclnt = NULL;
136 host->h_name = nsm->sm_name;
170 host->h_version = ni->version; 137 host->h_version = ni->version;
171 host->h_proto = ni->protocol; 138 host->h_proto = ni->protocol;
172 host->h_rpcclnt = NULL; 139 host->h_reclaiming = 0;
173 mutex_init(&host->h_mutex); 140 host->h_server = ni->server;
174 host->h_nextrebind = jiffies + NLM_HOST_REBIND; 141 host->h_noresvport = ni->noresvport;
175 host->h_expires = jiffies + NLM_HOST_EXPIRE; 142 host->h_inuse = 0;
176 atomic_set(&host->h_count, 1);
177 init_waitqueue_head(&host->h_gracewait); 143 init_waitqueue_head(&host->h_gracewait);
178 init_rwsem(&host->h_rwsem); 144 init_rwsem(&host->h_rwsem);
179 host->h_state = 0; /* pseudo NSM state */ 145 host->h_state = 0;
180 host->h_nsmstate = 0; /* real NSM state */ 146 host->h_nsmstate = 0;
181 host->h_nsmhandle = nsm; 147 host->h_pidcount = 0;
182 host->h_server = ni->server; 148 atomic_set(&host->h_count, 1);
183 host->h_noresvport = ni->noresvport; 149 mutex_init(&host->h_mutex);
184 hlist_add_head(&host->h_hash, chain); 150 host->h_nextrebind = now + NLM_HOST_REBIND;
151 host->h_expires = now + NLM_HOST_EXPIRE;
185 INIT_LIST_HEAD(&host->h_lockowners); 152 INIT_LIST_HEAD(&host->h_lockowners);
186 spin_lock_init(&host->h_lock); 153 spin_lock_init(&host->h_lock);
187 INIT_LIST_HEAD(&host->h_granted); 154 INIT_LIST_HEAD(&host->h_granted);
188 INIT_LIST_HEAD(&host->h_reclaim); 155 INIT_LIST_HEAD(&host->h_reclaim);
189 156 host->h_nsmhandle = nsm;
190 nrhosts++; 157 host->h_addrbuf = nsm->sm_addrbuf;
191
192 dprintk("lockd: nlm_lookup_host created host %s\n",
193 host->h_name);
194 158
195out: 159out:
196 mutex_unlock(&nlm_host_mutex);
197 return host; 160 return host;
198} 161}
199 162
200/* 163/*
201 * Destroy a host 164 * Destroy an nlm_host and free associated resources
165 *
166 * Caller must hold nlm_host_mutex.
202 */ 167 */
203static void 168static void nlm_destroy_host_locked(struct nlm_host *host)
204nlm_destroy_host(struct nlm_host *host)
205{ 169{
206 struct rpc_clnt *clnt; 170 struct rpc_clnt *clnt;
207 171
172 dprintk("lockd: destroy host %s\n", host->h_name);
173
208 BUG_ON(!list_empty(&host->h_lockowners)); 174 BUG_ON(!list_empty(&host->h_lockowners));
209 BUG_ON(atomic_read(&host->h_count)); 175 BUG_ON(atomic_read(&host->h_count));
210 176
177 hlist_del_init(&host->h_hash);
178
211 nsm_unmonitor(host); 179 nsm_unmonitor(host);
212 nsm_release(host->h_nsmhandle); 180 nsm_release(host->h_nsmhandle);
213 181
@@ -215,6 +183,8 @@ nlm_destroy_host(struct nlm_host *host)
215 if (clnt != NULL) 183 if (clnt != NULL)
216 rpc_shutdown_client(clnt); 184 rpc_shutdown_client(clnt);
217 kfree(host); 185 kfree(host);
186
187 nrhosts--;
218} 188}
219 189
220/** 190/**
@@ -238,9 +208,6 @@ struct nlm_host *nlmclnt_lookup_host(const struct sockaddr *sap,
238 const char *hostname, 208 const char *hostname,
239 int noresvport) 209 int noresvport)
240{ 210{
241 const struct sockaddr source = {
242 .sa_family = AF_UNSPEC,
243 };
244 struct nlm_lookup_host_info ni = { 211 struct nlm_lookup_host_info ni = {
245 .server = 0, 212 .server = 0,
246 .sap = sap, 213 .sap = sap,
@@ -249,16 +216,78 @@ struct nlm_host *nlmclnt_lookup_host(const struct sockaddr *sap,
249 .version = version, 216 .version = version,
250 .hostname = hostname, 217 .hostname = hostname,
251 .hostname_len = strlen(hostname), 218 .hostname_len = strlen(hostname),
252 .src_sap = &source,
253 .src_len = sizeof(source),
254 .noresvport = noresvport, 219 .noresvport = noresvport,
255 }; 220 };
221 struct hlist_head *chain;
222 struct hlist_node *pos;
223 struct nlm_host *host;
224 struct nsm_handle *nsm = NULL;
256 225
257 dprintk("lockd: %s(host='%s', vers=%u, proto=%s)\n", __func__, 226 dprintk("lockd: %s(host='%s', vers=%u, proto=%s)\n", __func__,
258 (hostname ? hostname : "<none>"), version, 227 (hostname ? hostname : "<none>"), version,
259 (protocol == IPPROTO_UDP ? "udp" : "tcp")); 228 (protocol == IPPROTO_UDP ? "udp" : "tcp"));
260 229
261 return nlm_lookup_host(&ni); 230 mutex_lock(&nlm_host_mutex);
231
232 chain = &nlm_client_hosts[nlm_hash_address(sap)];
233 hlist_for_each_entry(host, pos, chain, h_hash) {
234 if (!rpc_cmp_addr(nlm_addr(host), sap))
235 continue;
236
237 /* Same address. Share an NSM handle if we already have one */
238 if (nsm == NULL)
239 nsm = host->h_nsmhandle;
240
241 if (host->h_proto != protocol)
242 continue;
243 if (host->h_version != version)
244 continue;
245
246 nlm_get_host(host);
247 dprintk("lockd: %s found host %s (%s)\n", __func__,
248 host->h_name, host->h_addrbuf);
249 goto out;
250 }
251
252 host = nlm_alloc_host(&ni, nsm);
253 if (unlikely(host == NULL))
254 goto out;
255
256 hlist_add_head(&host->h_hash, chain);
257 nrhosts++;
258
259 dprintk("lockd: %s created host %s (%s)\n", __func__,
260 host->h_name, host->h_addrbuf);
261
262out:
263 mutex_unlock(&nlm_host_mutex);
264 return host;
265}
266
267/**
268 * nlmclnt_release_host - release client nlm_host
269 * @host: nlm_host to release
270 *
271 */
272void nlmclnt_release_host(struct nlm_host *host)
273{
274 if (host == NULL)
275 return;
276
277 dprintk("lockd: release client host %s\n", host->h_name);
278
279 BUG_ON(atomic_read(&host->h_count) < 0);
280 BUG_ON(host->h_server);
281
282 if (atomic_dec_and_test(&host->h_count)) {
283 BUG_ON(!list_empty(&host->h_lockowners));
284 BUG_ON(!list_empty(&host->h_granted));
285 BUG_ON(!list_empty(&host->h_reclaim));
286
287 mutex_lock(&nlm_host_mutex);
288 nlm_destroy_host_locked(host);
289 mutex_unlock(&nlm_host_mutex);
290 }
262} 291}
263 292
264/** 293/**
@@ -283,12 +312,18 @@ struct nlm_host *nlmsvc_lookup_host(const struct svc_rqst *rqstp,
283 const char *hostname, 312 const char *hostname,
284 const size_t hostname_len) 313 const size_t hostname_len)
285{ 314{
315 struct hlist_head *chain;
316 struct hlist_node *pos;
317 struct nlm_host *host = NULL;
318 struct nsm_handle *nsm = NULL;
286 struct sockaddr_in sin = { 319 struct sockaddr_in sin = {
287 .sin_family = AF_INET, 320 .sin_family = AF_INET,
288 }; 321 };
289 struct sockaddr_in6 sin6 = { 322 struct sockaddr_in6 sin6 = {
290 .sin6_family = AF_INET6, 323 .sin6_family = AF_INET6,
291 }; 324 };
325 struct sockaddr *src_sap;
326 size_t src_len = rqstp->rq_addrlen;
292 struct nlm_lookup_host_info ni = { 327 struct nlm_lookup_host_info ni = {
293 .server = 1, 328 .server = 1,
294 .sap = svc_addr(rqstp), 329 .sap = svc_addr(rqstp),
@@ -297,27 +332,91 @@ struct nlm_host *nlmsvc_lookup_host(const struct svc_rqst *rqstp,
297 .version = rqstp->rq_vers, 332 .version = rqstp->rq_vers,
298 .hostname = hostname, 333 .hostname = hostname,
299 .hostname_len = hostname_len, 334 .hostname_len = hostname_len,
300 .src_len = rqstp->rq_addrlen,
301 }; 335 };
302 336
303 dprintk("lockd: %s(host='%*s', vers=%u, proto=%s)\n", __func__, 337 dprintk("lockd: %s(host='%*s', vers=%u, proto=%s)\n", __func__,
304 (int)hostname_len, hostname, rqstp->rq_vers, 338 (int)hostname_len, hostname, rqstp->rq_vers,
305 (rqstp->rq_prot == IPPROTO_UDP ? "udp" : "tcp")); 339 (rqstp->rq_prot == IPPROTO_UDP ? "udp" : "tcp"));
306 340
341 mutex_lock(&nlm_host_mutex);
342
307 switch (ni.sap->sa_family) { 343 switch (ni.sap->sa_family) {
308 case AF_INET: 344 case AF_INET:
309 sin.sin_addr.s_addr = rqstp->rq_daddr.addr.s_addr; 345 sin.sin_addr.s_addr = rqstp->rq_daddr.addr.s_addr;
310 ni.src_sap = (struct sockaddr *)&sin; 346 src_sap = (struct sockaddr *)&sin;
311 break; 347 break;
312 case AF_INET6: 348 case AF_INET6:
313 ipv6_addr_copy(&sin6.sin6_addr, &rqstp->rq_daddr.addr6); 349 ipv6_addr_copy(&sin6.sin6_addr, &rqstp->rq_daddr.addr6);
314 ni.src_sap = (struct sockaddr *)&sin6; 350 src_sap = (struct sockaddr *)&sin6;
315 break; 351 break;
316 default: 352 default:
317 return NULL; 353 dprintk("lockd: %s failed; unrecognized address family\n",
354 __func__);
355 goto out;
318 } 356 }
319 357
320 return nlm_lookup_host(&ni); 358 if (time_after_eq(jiffies, next_gc))
359 nlm_gc_hosts();
360
361 chain = &nlm_server_hosts[nlm_hash_address(ni.sap)];
362 hlist_for_each_entry(host, pos, chain, h_hash) {
363 if (!rpc_cmp_addr(nlm_addr(host), ni.sap))
364 continue;
365
366 /* Same address. Share an NSM handle if we already have one */
367 if (nsm == NULL)
368 nsm = host->h_nsmhandle;
369
370 if (host->h_proto != ni.protocol)
371 continue;
372 if (host->h_version != ni.version)
373 continue;
374 if (!rpc_cmp_addr(nlm_srcaddr(host), src_sap))
375 continue;
376
377 /* Move to head of hash chain. */
378 hlist_del(&host->h_hash);
379 hlist_add_head(&host->h_hash, chain);
380
381 nlm_get_host(host);
382 dprintk("lockd: %s found host %s (%s)\n",
383 __func__, host->h_name, host->h_addrbuf);
384 goto out;
385 }
386
387 host = nlm_alloc_host(&ni, nsm);
388 if (unlikely(host == NULL))
389 goto out;
390
391 memcpy(nlm_srcaddr(host), src_sap, src_len);
392 host->h_srcaddrlen = src_len;
393 hlist_add_head(&host->h_hash, chain);
394 nrhosts++;
395
396 dprintk("lockd: %s created host %s (%s)\n",
397 __func__, host->h_name, host->h_addrbuf);
398
399out:
400 mutex_unlock(&nlm_host_mutex);
401 return host;
402}
403
404/**
405 * nlmsvc_release_host - release server nlm_host
406 * @host: nlm_host to release
407 *
408 * Host is destroyed later in nlm_gc_host().
409 */
410void nlmsvc_release_host(struct nlm_host *host)
411{
412 if (host == NULL)
413 return;
414
415 dprintk("lockd: release server host %s\n", host->h_name);
416
417 BUG_ON(atomic_read(&host->h_count) < 0);
418 BUG_ON(!host->h_server);
419 atomic_dec(&host->h_count);
321} 420}
322 421
323/* 422/*
@@ -357,7 +456,6 @@ nlm_bind_host(struct nlm_host *host)
357 .protocol = host->h_proto, 456 .protocol = host->h_proto,
358 .address = nlm_addr(host), 457 .address = nlm_addr(host),
359 .addrsize = host->h_addrlen, 458 .addrsize = host->h_addrlen,
360 .saddress = nlm_srcaddr(host),
361 .timeout = &timeparms, 459 .timeout = &timeparms,
362 .servername = host->h_name, 460 .servername = host->h_name,
363 .program = &nlm_program, 461 .program = &nlm_program,
@@ -376,6 +474,8 @@ nlm_bind_host(struct nlm_host *host)
376 args.flags |= RPC_CLNT_CREATE_HARDRTRY; 474 args.flags |= RPC_CLNT_CREATE_HARDRTRY;
377 if (host->h_noresvport) 475 if (host->h_noresvport)
378 args.flags |= RPC_CLNT_CREATE_NONPRIVPORT; 476 args.flags |= RPC_CLNT_CREATE_NONPRIVPORT;
477 if (host->h_srcaddrlen)
478 args.saddress = nlm_srcaddr(host);
379 479
380 clnt = rpc_create(&args); 480 clnt = rpc_create(&args);
381 if (!IS_ERR(clnt)) 481 if (!IS_ERR(clnt))
@@ -416,20 +516,29 @@ struct nlm_host * nlm_get_host(struct nlm_host *host)
416 return host; 516 return host;
417} 517}
418 518
419/* 519static struct nlm_host *next_host_state(struct hlist_head *cache,
420 * Release NLM host after use 520 struct nsm_handle *nsm,
421 */ 521 const struct nlm_reboot *info)
422void nlm_release_host(struct nlm_host *host)
423{ 522{
424 if (host != NULL) { 523 struct nlm_host *host;
425 dprintk("lockd: release host %s\n", host->h_name); 524 struct hlist_head *chain;
426 BUG_ON(atomic_read(&host->h_count) < 0); 525 struct hlist_node *pos;
427 if (atomic_dec_and_test(&host->h_count)) { 526
428 BUG_ON(!list_empty(&host->h_lockowners)); 527 mutex_lock(&nlm_host_mutex);
429 BUG_ON(!list_empty(&host->h_granted)); 528 for_each_host(host, pos, chain, cache) {
430 BUG_ON(!list_empty(&host->h_reclaim)); 529 if (host->h_nsmhandle == nsm
530 && host->h_nsmstate != info->state) {
531 host->h_nsmstate = info->state;
532 host->h_state++;
533
534 nlm_get_host(host);
535 mutex_unlock(&nlm_host_mutex);
536 return host;
431 } 537 }
432 } 538 }
539
540 mutex_unlock(&nlm_host_mutex);
541 return NULL;
433} 542}
434 543
435/** 544/**
@@ -441,8 +550,6 @@ void nlm_release_host(struct nlm_host *host)
441 */ 550 */
442void nlm_host_rebooted(const struct nlm_reboot *info) 551void nlm_host_rebooted(const struct nlm_reboot *info)
443{ 552{
444 struct hlist_head *chain;
445 struct hlist_node *pos;
446 struct nsm_handle *nsm; 553 struct nsm_handle *nsm;
447 struct nlm_host *host; 554 struct nlm_host *host;
448 555
@@ -455,32 +562,15 @@ void nlm_host_rebooted(const struct nlm_reboot *info)
455 * lock for this. 562 * lock for this.
456 * To avoid processing a host several times, we match the nsmstate. 563 * To avoid processing a host several times, we match the nsmstate.
457 */ 564 */
458again: mutex_lock(&nlm_host_mutex); 565 while ((host = next_host_state(nlm_server_hosts, nsm, info)) != NULL) {
459 for (chain = nlm_hosts; chain < nlm_hosts + NLM_HOST_NRHASH; ++chain) { 566 nlmsvc_free_host_resources(host);
460 hlist_for_each_entry(host, pos, chain, h_hash) { 567 nlmsvc_release_host(host);
461 if (host->h_nsmhandle == nsm
462 && host->h_nsmstate != info->state) {
463 host->h_nsmstate = info->state;
464 host->h_state++;
465
466 nlm_get_host(host);
467 mutex_unlock(&nlm_host_mutex);
468
469 if (host->h_server) {
470 /* We're server for this guy, just ditch
471 * all the locks he held. */
472 nlmsvc_free_host_resources(host);
473 } else {
474 /* He's the server, initiate lock recovery. */
475 nlmclnt_recovery(host);
476 }
477
478 nlm_release_host(host);
479 goto again;
480 }
481 }
482 } 568 }
483 mutex_unlock(&nlm_host_mutex); 569 while ((host = next_host_state(nlm_client_hosts, nsm, info)) != NULL) {
570 nlmclnt_recovery(host);
571 nlmclnt_release_host(host);
572 }
573
484 nsm_release(nsm); 574 nsm_release(nsm);
485} 575}
486 576
@@ -500,13 +590,11 @@ nlm_shutdown_hosts(void)
500 590
501 /* First, make all hosts eligible for gc */ 591 /* First, make all hosts eligible for gc */
502 dprintk("lockd: nuking all hosts...\n"); 592 dprintk("lockd: nuking all hosts...\n");
503 for (chain = nlm_hosts; chain < nlm_hosts + NLM_HOST_NRHASH; ++chain) { 593 for_each_host(host, pos, chain, nlm_server_hosts) {
504 hlist_for_each_entry(host, pos, chain, h_hash) { 594 host->h_expires = jiffies - 1;
505 host->h_expires = jiffies - 1; 595 if (host->h_rpcclnt) {
506 if (host->h_rpcclnt) { 596 rpc_shutdown_client(host->h_rpcclnt);
507 rpc_shutdown_client(host->h_rpcclnt); 597 host->h_rpcclnt = NULL;
508 host->h_rpcclnt = NULL;
509 }
510 } 598 }
511 } 599 }
512 600
@@ -515,15 +603,13 @@ nlm_shutdown_hosts(void)
515 mutex_unlock(&nlm_host_mutex); 603 mutex_unlock(&nlm_host_mutex);
516 604
517 /* complain if any hosts are left */ 605 /* complain if any hosts are left */
518 if (nrhosts) { 606 if (nrhosts != 0) {
519 printk(KERN_WARNING "lockd: couldn't shutdown host module!\n"); 607 printk(KERN_WARNING "lockd: couldn't shutdown host module!\n");
520 dprintk("lockd: %d hosts left:\n", nrhosts); 608 dprintk("lockd: %lu hosts left:\n", nrhosts);
521 for (chain = nlm_hosts; chain < nlm_hosts + NLM_HOST_NRHASH; ++chain) { 609 for_each_host(host, pos, chain, nlm_server_hosts) {
522 hlist_for_each_entry(host, pos, chain, h_hash) { 610 dprintk(" %s (cnt %d use %d exp %ld)\n",
523 dprintk(" %s (cnt %d use %d exp %ld)\n", 611 host->h_name, atomic_read(&host->h_count),
524 host->h_name, atomic_read(&host->h_count), 612 host->h_inuse, host->h_expires);
525 host->h_inuse, host->h_expires);
526 }
527 } 613 }
528 } 614 }
529} 615}
@@ -541,29 +627,22 @@ nlm_gc_hosts(void)
541 struct nlm_host *host; 627 struct nlm_host *host;
542 628
543 dprintk("lockd: host garbage collection\n"); 629 dprintk("lockd: host garbage collection\n");
544 for (chain = nlm_hosts; chain < nlm_hosts + NLM_HOST_NRHASH; ++chain) { 630 for_each_host(host, pos, chain, nlm_server_hosts)
545 hlist_for_each_entry(host, pos, chain, h_hash) 631 host->h_inuse = 0;
546 host->h_inuse = 0;
547 }
548 632
549 /* Mark all hosts that hold locks, blocks or shares */ 633 /* Mark all hosts that hold locks, blocks or shares */
550 nlmsvc_mark_resources(); 634 nlmsvc_mark_resources();
551 635
552 for (chain = nlm_hosts; chain < nlm_hosts + NLM_HOST_NRHASH; ++chain) { 636 for_each_host_safe(host, pos, next, chain, nlm_server_hosts) {
553 hlist_for_each_entry_safe(host, pos, next, chain, h_hash) { 637 if (atomic_read(&host->h_count) || host->h_inuse
554 if (atomic_read(&host->h_count) || host->h_inuse 638 || time_before(jiffies, host->h_expires)) {
555 || time_before(jiffies, host->h_expires)) { 639 dprintk("nlm_gc_hosts skipping %s "
556 dprintk("nlm_gc_hosts skipping %s (cnt %d use %d exp %ld)\n", 640 "(cnt %d use %d exp %ld)\n",
557 host->h_name, atomic_read(&host->h_count), 641 host->h_name, atomic_read(&host->h_count),
558 host->h_inuse, host->h_expires); 642 host->h_inuse, host->h_expires);
559 continue; 643 continue;
560 }
561 dprintk("lockd: delete host %s\n", host->h_name);
562 hlist_del_init(&host->h_hash);
563
564 nlm_destroy_host(host);
565 nrhosts--;
566 } 644 }
645 nlm_destroy_host_locked(host);
567 } 646 }
568 647
569 next_gc = jiffies + NLM_HOST_COLLECT; 648 next_gc = jiffies + NLM_HOST_COLLECT;
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index e0c91894964..23d7451b293 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -401,26 +401,22 @@ void nsm_release(struct nsm_handle *nsm)
401 * Status Monitor wire protocol. 401 * Status Monitor wire protocol.
402 */ 402 */
403 403
404static int encode_nsm_string(struct xdr_stream *xdr, const char *string) 404static void encode_nsm_string(struct xdr_stream *xdr, const char *string)
405{ 405{
406 const u32 len = strlen(string); 406 const u32 len = strlen(string);
407 __be32 *p; 407 __be32 *p;
408 408
409 if (unlikely(len > SM_MAXSTRLEN)) 409 BUG_ON(len > SM_MAXSTRLEN);
410 return -EIO; 410 p = xdr_reserve_space(xdr, 4 + len);
411 p = xdr_reserve_space(xdr, sizeof(u32) + len);
412 if (unlikely(p == NULL))
413 return -EIO;
414 xdr_encode_opaque(p, string, len); 411 xdr_encode_opaque(p, string, len);
415 return 0;
416} 412}
417 413
418/* 414/*
419 * "mon_name" specifies the host to be monitored. 415 * "mon_name" specifies the host to be monitored.
420 */ 416 */
421static int encode_mon_name(struct xdr_stream *xdr, const struct nsm_args *argp) 417static void encode_mon_name(struct xdr_stream *xdr, const struct nsm_args *argp)
422{ 418{
423 return encode_nsm_string(xdr, argp->mon_name); 419 encode_nsm_string(xdr, argp->mon_name);
424} 420}
425 421
426/* 422/*
@@ -429,35 +425,25 @@ static int encode_mon_name(struct xdr_stream *xdr, const struct nsm_args *argp)
429 * (via the NLMPROC_SM_NOTIFY call) that the state of host "mon_name" 425 * (via the NLMPROC_SM_NOTIFY call) that the state of host "mon_name"
430 * has changed. 426 * has changed.
431 */ 427 */
432static int encode_my_id(struct xdr_stream *xdr, const struct nsm_args *argp) 428static void encode_my_id(struct xdr_stream *xdr, const struct nsm_args *argp)
433{ 429{
434 int status;
435 __be32 *p; 430 __be32 *p;
436 431
437 status = encode_nsm_string(xdr, utsname()->nodename); 432 encode_nsm_string(xdr, utsname()->nodename);
438 if (unlikely(status != 0)) 433 p = xdr_reserve_space(xdr, 4 + 4 + 4);
439 return status; 434 *p++ = cpu_to_be32(argp->prog);
440 p = xdr_reserve_space(xdr, 3 * sizeof(u32)); 435 *p++ = cpu_to_be32(argp->vers);
441 if (unlikely(p == NULL)) 436 *p = cpu_to_be32(argp->proc);
442 return -EIO;
443 *p++ = htonl(argp->prog);
444 *p++ = htonl(argp->vers);
445 *p++ = htonl(argp->proc);
446 return 0;
447} 437}
448 438
449/* 439/*
450 * The "mon_id" argument specifies the non-private arguments 440 * The "mon_id" argument specifies the non-private arguments
451 * of an NSMPROC_MON or NSMPROC_UNMON call. 441 * of an NSMPROC_MON or NSMPROC_UNMON call.
452 */ 442 */
453static int encode_mon_id(struct xdr_stream *xdr, const struct nsm_args *argp) 443static void encode_mon_id(struct xdr_stream *xdr, const struct nsm_args *argp)
454{ 444{
455 int status; 445 encode_mon_name(xdr, argp);
456 446 encode_my_id(xdr, argp);
457 status = encode_mon_name(xdr, argp);
458 if (unlikely(status != 0))
459 return status;
460 return encode_my_id(xdr, argp);
461} 447}
462 448
463/* 449/*
@@ -465,68 +451,56 @@ static int encode_mon_id(struct xdr_stream *xdr, const struct nsm_args *argp)
465 * by the NSMPROC_MON call. This information will be supplied in the 451 * by the NSMPROC_MON call. This information will be supplied in the
466 * NLMPROC_SM_NOTIFY call. 452 * NLMPROC_SM_NOTIFY call.
467 */ 453 */
468static int encode_priv(struct xdr_stream *xdr, const struct nsm_args *argp) 454static void encode_priv(struct xdr_stream *xdr, const struct nsm_args *argp)
469{ 455{
470 __be32 *p; 456 __be32 *p;
471 457
472 p = xdr_reserve_space(xdr, SM_PRIV_SIZE); 458 p = xdr_reserve_space(xdr, SM_PRIV_SIZE);
473 if (unlikely(p == NULL))
474 return -EIO;
475 xdr_encode_opaque_fixed(p, argp->priv->data, SM_PRIV_SIZE); 459 xdr_encode_opaque_fixed(p, argp->priv->data, SM_PRIV_SIZE);
476 return 0;
477} 460}
478 461
479static int xdr_enc_mon(struct rpc_rqst *req, __be32 *p, 462static void nsm_xdr_enc_mon(struct rpc_rqst *req, struct xdr_stream *xdr,
480 const struct nsm_args *argp) 463 const struct nsm_args *argp)
481{ 464{
482 struct xdr_stream xdr; 465 encode_mon_id(xdr, argp);
483 int status; 466 encode_priv(xdr, argp);
484
485 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
486 status = encode_mon_id(&xdr, argp);
487 if (unlikely(status))
488 return status;
489 return encode_priv(&xdr, argp);
490} 467}
491 468
492static int xdr_enc_unmon(struct rpc_rqst *req, __be32 *p, 469static void nsm_xdr_enc_unmon(struct rpc_rqst *req, struct xdr_stream *xdr,
493 const struct nsm_args *argp) 470 const struct nsm_args *argp)
494{ 471{
495 struct xdr_stream xdr; 472 encode_mon_id(xdr, argp);
496
497 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
498 return encode_mon_id(&xdr, argp);
499} 473}
500 474
501static int xdr_dec_stat_res(struct rpc_rqst *rqstp, __be32 *p, 475static int nsm_xdr_dec_stat_res(struct rpc_rqst *rqstp,
502 struct nsm_res *resp) 476 struct xdr_stream *xdr,
477 struct nsm_res *resp)
503{ 478{
504 struct xdr_stream xdr; 479 __be32 *p;
505 480
506 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 481 p = xdr_inline_decode(xdr, 4 + 4);
507 p = xdr_inline_decode(&xdr, 2 * sizeof(u32));
508 if (unlikely(p == NULL)) 482 if (unlikely(p == NULL))
509 return -EIO; 483 return -EIO;
510 resp->status = ntohl(*p++); 484 resp->status = be32_to_cpup(p++);
511 resp->state = ntohl(*p); 485 resp->state = be32_to_cpup(p);
512 486
513 dprintk("lockd: xdr_dec_stat_res status %d state %d\n", 487 dprintk("lockd: %s status %d state %d\n",
514 resp->status, resp->state); 488 __func__, resp->status, resp->state);
515 return 0; 489 return 0;
516} 490}
517 491
518static int xdr_dec_stat(struct rpc_rqst *rqstp, __be32 *p, 492static int nsm_xdr_dec_stat(struct rpc_rqst *rqstp,
519 struct nsm_res *resp) 493 struct xdr_stream *xdr,
494 struct nsm_res *resp)
520{ 495{
521 struct xdr_stream xdr; 496 __be32 *p;
522 497
523 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 498 p = xdr_inline_decode(xdr, 4);
524 p = xdr_inline_decode(&xdr, sizeof(u32));
525 if (unlikely(p == NULL)) 499 if (unlikely(p == NULL))
526 return -EIO; 500 return -EIO;
527 resp->state = ntohl(*p); 501 resp->state = be32_to_cpup(p);
528 502
529 dprintk("lockd: xdr_dec_stat state %d\n", resp->state); 503 dprintk("lockd: %s state %d\n", __func__, resp->state);
530 return 0; 504 return 0;
531} 505}
532 506
@@ -542,8 +516,8 @@ static int xdr_dec_stat(struct rpc_rqst *rqstp, __be32 *p,
542static struct rpc_procinfo nsm_procedures[] = { 516static struct rpc_procinfo nsm_procedures[] = {
543[NSMPROC_MON] = { 517[NSMPROC_MON] = {
544 .p_proc = NSMPROC_MON, 518 .p_proc = NSMPROC_MON,
545 .p_encode = (kxdrproc_t)xdr_enc_mon, 519 .p_encode = (kxdreproc_t)nsm_xdr_enc_mon,
546 .p_decode = (kxdrproc_t)xdr_dec_stat_res, 520 .p_decode = (kxdrdproc_t)nsm_xdr_dec_stat_res,
547 .p_arglen = SM_mon_sz, 521 .p_arglen = SM_mon_sz,
548 .p_replen = SM_monres_sz, 522 .p_replen = SM_monres_sz,
549 .p_statidx = NSMPROC_MON, 523 .p_statidx = NSMPROC_MON,
@@ -551,8 +525,8 @@ static struct rpc_procinfo nsm_procedures[] = {
551 }, 525 },
552[NSMPROC_UNMON] = { 526[NSMPROC_UNMON] = {
553 .p_proc = NSMPROC_UNMON, 527 .p_proc = NSMPROC_UNMON,
554 .p_encode = (kxdrproc_t)xdr_enc_unmon, 528 .p_encode = (kxdreproc_t)nsm_xdr_enc_unmon,
555 .p_decode = (kxdrproc_t)xdr_dec_stat, 529 .p_decode = (kxdrdproc_t)nsm_xdr_dec_stat,
556 .p_arglen = SM_mon_id_sz, 530 .p_arglen = SM_mon_id_sz,
557 .p_replen = SM_unmonres_sz, 531 .p_replen = SM_unmonres_sz,
558 .p_statidx = NSMPROC_UNMON, 532 .p_statidx = NSMPROC_UNMON,
diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c
index a336e832475..9a41fdc1951 100644
--- a/fs/lockd/svc4proc.c
+++ b/fs/lockd/svc4proc.c
@@ -9,7 +9,6 @@
9 9
10#include <linux/types.h> 10#include <linux/types.h>
11#include <linux/time.h> 11#include <linux/time.h>
12#include <linux/smp_lock.h>
13#include <linux/lockd/lockd.h> 12#include <linux/lockd/lockd.h>
14#include <linux/lockd/share.h> 13#include <linux/lockd/share.h>
15 14
@@ -52,7 +51,7 @@ nlm4svc_retrieve_args(struct svc_rqst *rqstp, struct nlm_args *argp,
52 return 0; 51 return 0;
53 52
54no_locks: 53no_locks:
55 nlm_release_host(host); 54 nlmsvc_release_host(host);
56 if (error) 55 if (error)
57 return error; 56 return error;
58 return nlm_lck_denied_nolocks; 57 return nlm_lck_denied_nolocks;
@@ -93,7 +92,7 @@ nlm4svc_proc_test(struct svc_rqst *rqstp, struct nlm_args *argp,
93 else 92 else
94 dprintk("lockd: TEST4 status %d\n", ntohl(resp->status)); 93 dprintk("lockd: TEST4 status %d\n", ntohl(resp->status));
95 94
96 nlm_release_host(host); 95 nlmsvc_release_host(host);
97 nlm_release_file(file); 96 nlm_release_file(file);
98 return rc; 97 return rc;
99} 98}
@@ -135,7 +134,7 @@ nlm4svc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp,
135 else 134 else
136 dprintk("lockd: LOCK status %d\n", ntohl(resp->status)); 135 dprintk("lockd: LOCK status %d\n", ntohl(resp->status));
137 136
138 nlm_release_host(host); 137 nlmsvc_release_host(host);
139 nlm_release_file(file); 138 nlm_release_file(file);
140 return rc; 139 return rc;
141} 140}
@@ -165,7 +164,7 @@ nlm4svc_proc_cancel(struct svc_rqst *rqstp, struct nlm_args *argp,
165 resp->status = nlmsvc_cancel_blocked(file, &argp->lock); 164 resp->status = nlmsvc_cancel_blocked(file, &argp->lock);
166 165
167 dprintk("lockd: CANCEL status %d\n", ntohl(resp->status)); 166 dprintk("lockd: CANCEL status %d\n", ntohl(resp->status));
168 nlm_release_host(host); 167 nlmsvc_release_host(host);
169 nlm_release_file(file); 168 nlm_release_file(file);
170 return rpc_success; 169 return rpc_success;
171} 170}
@@ -198,7 +197,7 @@ nlm4svc_proc_unlock(struct svc_rqst *rqstp, struct nlm_args *argp,
198 resp->status = nlmsvc_unlock(file, &argp->lock); 197 resp->status = nlmsvc_unlock(file, &argp->lock);
199 198
200 dprintk("lockd: UNLOCK status %d\n", ntohl(resp->status)); 199 dprintk("lockd: UNLOCK status %d\n", ntohl(resp->status));
201 nlm_release_host(host); 200 nlmsvc_release_host(host);
202 nlm_release_file(file); 201 nlm_release_file(file);
203 return rpc_success; 202 return rpc_success;
204} 203}
@@ -230,7 +229,7 @@ static void nlm4svc_callback_exit(struct rpc_task *task, void *data)
230 229
231static void nlm4svc_callback_release(void *data) 230static void nlm4svc_callback_release(void *data)
232{ 231{
233 nlm_release_call(data); 232 nlmsvc_release_call(data);
234} 233}
235 234
236static const struct rpc_call_ops nlm4svc_callback_ops = { 235static const struct rpc_call_ops nlm4svc_callback_ops = {
@@ -262,7 +261,7 @@ static __be32 nlm4svc_callback(struct svc_rqst *rqstp, u32 proc, struct nlm_args
262 261
263 stat = func(rqstp, argp, &call->a_res); 262 stat = func(rqstp, argp, &call->a_res);
264 if (stat != 0) { 263 if (stat != 0) {
265 nlm_release_call(call); 264 nlmsvc_release_call(call);
266 return stat; 265 return stat;
267 } 266 }
268 267
@@ -335,7 +334,7 @@ nlm4svc_proc_share(struct svc_rqst *rqstp, struct nlm_args *argp,
335 resp->status = nlmsvc_share_file(host, file, argp); 334 resp->status = nlmsvc_share_file(host, file, argp);
336 335
337 dprintk("lockd: SHARE status %d\n", ntohl(resp->status)); 336 dprintk("lockd: SHARE status %d\n", ntohl(resp->status));
338 nlm_release_host(host); 337 nlmsvc_release_host(host);
339 nlm_release_file(file); 338 nlm_release_file(file);
340 return rpc_success; 339 return rpc_success;
341} 340}
@@ -368,7 +367,7 @@ nlm4svc_proc_unshare(struct svc_rqst *rqstp, struct nlm_args *argp,
368 resp->status = nlmsvc_unshare_file(host, file, argp); 367 resp->status = nlmsvc_unshare_file(host, file, argp);
369 368
370 dprintk("lockd: UNSHARE status %d\n", ntohl(resp->status)); 369 dprintk("lockd: UNSHARE status %d\n", ntohl(resp->status));
371 nlm_release_host(host); 370 nlmsvc_release_host(host);
372 nlm_release_file(file); 371 nlm_release_file(file);
373 return rpc_success; 372 return rpc_success;
374} 373}
@@ -400,7 +399,7 @@ nlm4svc_proc_free_all(struct svc_rqst *rqstp, struct nlm_args *argp,
400 return rpc_success; 399 return rpc_success;
401 400
402 nlmsvc_free_host_resources(host); 401 nlmsvc_free_host_resources(host);
403 nlm_release_host(host); 402 nlmsvc_release_host(host);
404 return rpc_success; 403 return rpc_success;
405} 404}
406 405
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index c462d346acb..6e31695d046 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -25,7 +25,6 @@
25#include <linux/errno.h> 25#include <linux/errno.h>
26#include <linux/kernel.h> 26#include <linux/kernel.h>
27#include <linux/sched.h> 27#include <linux/sched.h>
28#include <linux/smp_lock.h>
29#include <linux/sunrpc/clnt.h> 28#include <linux/sunrpc/clnt.h>
30#include <linux/sunrpc/svc.h> 29#include <linux/sunrpc/svc.h>
31#include <linux/lockd/nlm.h> 30#include <linux/lockd/nlm.h>
@@ -47,6 +46,7 @@ static void nlmsvc_remove_block(struct nlm_block *block);
47static int nlmsvc_setgrantargs(struct nlm_rqst *call, struct nlm_lock *lock); 46static int nlmsvc_setgrantargs(struct nlm_rqst *call, struct nlm_lock *lock);
48static void nlmsvc_freegrantargs(struct nlm_rqst *call); 47static void nlmsvc_freegrantargs(struct nlm_rqst *call);
49static const struct rpc_call_ops nlmsvc_grant_ops; 48static const struct rpc_call_ops nlmsvc_grant_ops;
49static const char *nlmdbg_cookie2a(const struct nlm_cookie *cookie);
50 50
51/* 51/*
52 * The list of blocked locks to retry 52 * The list of blocked locks to retry
@@ -234,7 +234,7 @@ nlmsvc_create_block(struct svc_rqst *rqstp, struct nlm_host *host,
234failed_free: 234failed_free:
235 kfree(block); 235 kfree(block);
236failed: 236failed:
237 nlm_release_call(call); 237 nlmsvc_release_call(call);
238 return NULL; 238 return NULL;
239} 239}
240 240
@@ -267,7 +267,7 @@ static void nlmsvc_free_block(struct kref *kref)
267 mutex_unlock(&file->f_mutex); 267 mutex_unlock(&file->f_mutex);
268 268
269 nlmsvc_freegrantargs(block->b_call); 269 nlmsvc_freegrantargs(block->b_call);
270 nlm_release_call(block->b_call); 270 nlmsvc_release_call(block->b_call);
271 nlm_release_file(block->b_file); 271 nlm_release_file(block->b_file);
272 kfree(block->b_fl); 272 kfree(block->b_fl);
273 kfree(block); 273 kfree(block);
@@ -935,3 +935,32 @@ nlmsvc_retry_blocked(void)
935 935
936 return timeout; 936 return timeout;
937} 937}
938
939#ifdef RPC_DEBUG
940static const char *nlmdbg_cookie2a(const struct nlm_cookie *cookie)
941{
942 /*
943 * We can get away with a static buffer because we're only
944 * called with BKL held.
945 */
946 static char buf[2*NLM_MAXCOOKIELEN+1];
947 unsigned int i, len = sizeof(buf);
948 char *p = buf;
949
950 len--; /* allow for trailing \0 */
951 if (len < 3)
952 return "???";
953 for (i = 0 ; i < cookie->len ; i++) {
954 if (len < 2) {
955 strcpy(p-3, "...");
956 break;
957 }
958 sprintf(p, "%02x", cookie->data[i]);
959 p += 2;
960 len -= 2;
961 }
962 *p = '\0';
963
964 return buf;
965}
966#endif
diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c
index c3069f38d60..d27aab11f32 100644
--- a/fs/lockd/svcproc.c
+++ b/fs/lockd/svcproc.c
@@ -9,7 +9,6 @@
9 9
10#include <linux/types.h> 10#include <linux/types.h>
11#include <linux/time.h> 11#include <linux/time.h>
12#include <linux/smp_lock.h>
13#include <linux/lockd/lockd.h> 12#include <linux/lockd/lockd.h>
14#include <linux/lockd/share.h> 13#include <linux/lockd/share.h>
15 14
@@ -81,7 +80,7 @@ nlmsvc_retrieve_args(struct svc_rqst *rqstp, struct nlm_args *argp,
81 return 0; 80 return 0;
82 81
83no_locks: 82no_locks:
84 nlm_release_host(host); 83 nlmsvc_release_host(host);
85 if (error) 84 if (error)
86 return error; 85 return error;
87 return nlm_lck_denied_nolocks; 86 return nlm_lck_denied_nolocks;
@@ -123,7 +122,7 @@ nlmsvc_proc_test(struct svc_rqst *rqstp, struct nlm_args *argp,
123 dprintk("lockd: TEST status %d vers %d\n", 122 dprintk("lockd: TEST status %d vers %d\n",
124 ntohl(resp->status), rqstp->rq_vers); 123 ntohl(resp->status), rqstp->rq_vers);
125 124
126 nlm_release_host(host); 125 nlmsvc_release_host(host);
127 nlm_release_file(file); 126 nlm_release_file(file);
128 return rc; 127 return rc;
129} 128}
@@ -165,7 +164,7 @@ nlmsvc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp,
165 else 164 else
166 dprintk("lockd: LOCK status %d\n", ntohl(resp->status)); 165 dprintk("lockd: LOCK status %d\n", ntohl(resp->status));
167 166
168 nlm_release_host(host); 167 nlmsvc_release_host(host);
169 nlm_release_file(file); 168 nlm_release_file(file);
170 return rc; 169 return rc;
171} 170}
@@ -195,7 +194,7 @@ nlmsvc_proc_cancel(struct svc_rqst *rqstp, struct nlm_args *argp,
195 resp->status = cast_status(nlmsvc_cancel_blocked(file, &argp->lock)); 194 resp->status = cast_status(nlmsvc_cancel_blocked(file, &argp->lock));
196 195
197 dprintk("lockd: CANCEL status %d\n", ntohl(resp->status)); 196 dprintk("lockd: CANCEL status %d\n", ntohl(resp->status));
198 nlm_release_host(host); 197 nlmsvc_release_host(host);
199 nlm_release_file(file); 198 nlm_release_file(file);
200 return rpc_success; 199 return rpc_success;
201} 200}
@@ -228,7 +227,7 @@ nlmsvc_proc_unlock(struct svc_rqst *rqstp, struct nlm_args *argp,
228 resp->status = cast_status(nlmsvc_unlock(file, &argp->lock)); 227 resp->status = cast_status(nlmsvc_unlock(file, &argp->lock));
229 228
230 dprintk("lockd: UNLOCK status %d\n", ntohl(resp->status)); 229 dprintk("lockd: UNLOCK status %d\n", ntohl(resp->status));
231 nlm_release_host(host); 230 nlmsvc_release_host(host);
232 nlm_release_file(file); 231 nlm_release_file(file);
233 return rpc_success; 232 return rpc_success;
234} 233}
@@ -258,9 +257,17 @@ static void nlmsvc_callback_exit(struct rpc_task *task, void *data)
258 -task->tk_status); 257 -task->tk_status);
259} 258}
260 259
260void nlmsvc_release_call(struct nlm_rqst *call)
261{
262 if (!atomic_dec_and_test(&call->a_count))
263 return;
264 nlmsvc_release_host(call->a_host);
265 kfree(call);
266}
267
261static void nlmsvc_callback_release(void *data) 268static void nlmsvc_callback_release(void *data)
262{ 269{
263 nlm_release_call(data); 270 nlmsvc_release_call(data);
264} 271}
265 272
266static const struct rpc_call_ops nlmsvc_callback_ops = { 273static const struct rpc_call_ops nlmsvc_callback_ops = {
@@ -292,7 +299,7 @@ static __be32 nlmsvc_callback(struct svc_rqst *rqstp, u32 proc, struct nlm_args
292 299
293 stat = func(rqstp, argp, &call->a_res); 300 stat = func(rqstp, argp, &call->a_res);
294 if (stat != 0) { 301 if (stat != 0) {
295 nlm_release_call(call); 302 nlmsvc_release_call(call);
296 return stat; 303 return stat;
297 } 304 }
298 305
@@ -367,7 +374,7 @@ nlmsvc_proc_share(struct svc_rqst *rqstp, struct nlm_args *argp,
367 resp->status = cast_status(nlmsvc_share_file(host, file, argp)); 374 resp->status = cast_status(nlmsvc_share_file(host, file, argp));
368 375
369 dprintk("lockd: SHARE status %d\n", ntohl(resp->status)); 376 dprintk("lockd: SHARE status %d\n", ntohl(resp->status));
370 nlm_release_host(host); 377 nlmsvc_release_host(host);
371 nlm_release_file(file); 378 nlm_release_file(file);
372 return rpc_success; 379 return rpc_success;
373} 380}
@@ -400,7 +407,7 @@ nlmsvc_proc_unshare(struct svc_rqst *rqstp, struct nlm_args *argp,
400 resp->status = cast_status(nlmsvc_unshare_file(host, file, argp)); 407 resp->status = cast_status(nlmsvc_unshare_file(host, file, argp));
401 408
402 dprintk("lockd: UNSHARE status %d\n", ntohl(resp->status)); 409 dprintk("lockd: UNSHARE status %d\n", ntohl(resp->status));
403 nlm_release_host(host); 410 nlmsvc_release_host(host);
404 nlm_release_file(file); 411 nlm_release_file(file);
405 return rpc_success; 412 return rpc_success;
406} 413}
@@ -432,7 +439,7 @@ nlmsvc_proc_free_all(struct svc_rqst *rqstp, struct nlm_args *argp,
432 return rpc_success; 439 return rpc_success;
433 440
434 nlmsvc_free_host_resources(host); 441 nlmsvc_free_host_resources(host);
435 nlm_release_host(host); 442 nlmsvc_release_host(host);
436 return rpc_success; 443 return rpc_success;
437} 444}
438 445
diff --git a/fs/lockd/xdr.c b/fs/lockd/xdr.c
index b583ab0a4cb..964666c68a8 100644
--- a/fs/lockd/xdr.c
+++ b/fs/lockd/xdr.c
@@ -149,37 +149,6 @@ nlm_decode_lock(__be32 *p, struct nlm_lock *lock)
149} 149}
150 150
151/* 151/*
152 * Encode a lock as part of an NLM call
153 */
154static __be32 *
155nlm_encode_lock(__be32 *p, struct nlm_lock *lock)
156{
157 struct file_lock *fl = &lock->fl;
158 __s32 start, len;
159
160 if (!(p = xdr_encode_string(p, lock->caller))
161 || !(p = nlm_encode_fh(p, &lock->fh))
162 || !(p = nlm_encode_oh(p, &lock->oh)))
163 return NULL;
164
165 if (fl->fl_start > NLM_OFFSET_MAX
166 || (fl->fl_end > NLM_OFFSET_MAX && fl->fl_end != OFFSET_MAX))
167 return NULL;
168
169 start = loff_t_to_s32(fl->fl_start);
170 if (fl->fl_end == OFFSET_MAX)
171 len = 0;
172 else
173 len = loff_t_to_s32(fl->fl_end - fl->fl_start + 1);
174
175 *p++ = htonl(lock->svid);
176 *p++ = htonl(start);
177 *p++ = htonl(len);
178
179 return p;
180}
181
182/*
183 * Encode result of a TEST/TEST_MSG call 152 * Encode result of a TEST/TEST_MSG call
184 */ 153 */
185static __be32 * 154static __be32 *
@@ -372,259 +341,3 @@ nlmsvc_encode_void(struct svc_rqst *rqstp, __be32 *p, void *dummy)
372{ 341{
373 return xdr_ressize_check(rqstp, p); 342 return xdr_ressize_check(rqstp, p);
374} 343}
375
376/*
377 * Now, the client side XDR functions
378 */
379#ifdef NLMCLNT_SUPPORT_SHARES
380static int
381nlmclt_decode_void(struct rpc_rqst *req, u32 *p, void *ptr)
382{
383 return 0;
384}
385#endif
386
387static int
388nlmclt_encode_testargs(struct rpc_rqst *req, __be32 *p, nlm_args *argp)
389{
390 struct nlm_lock *lock = &argp->lock;
391
392 if (!(p = nlm_encode_cookie(p, &argp->cookie)))
393 return -EIO;
394 *p++ = (lock->fl.fl_type == F_WRLCK)? xdr_one : xdr_zero;
395 if (!(p = nlm_encode_lock(p, lock)))
396 return -EIO;
397 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
398 return 0;
399}
400
401static int
402nlmclt_decode_testres(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp)
403{
404 if (!(p = nlm_decode_cookie(p, &resp->cookie)))
405 return -EIO;
406 resp->status = *p++;
407 if (resp->status == nlm_lck_denied) {
408 struct file_lock *fl = &resp->lock.fl;
409 u32 excl;
410 s32 start, len, end;
411
412 memset(&resp->lock, 0, sizeof(resp->lock));
413 locks_init_lock(fl);
414 excl = ntohl(*p++);
415 resp->lock.svid = ntohl(*p++);
416 fl->fl_pid = (pid_t)resp->lock.svid;
417 if (!(p = nlm_decode_oh(p, &resp->lock.oh)))
418 return -EIO;
419
420 fl->fl_flags = FL_POSIX;
421 fl->fl_type = excl? F_WRLCK : F_RDLCK;
422 start = ntohl(*p++);
423 len = ntohl(*p++);
424 end = start + len - 1;
425
426 fl->fl_start = s32_to_loff_t(start);
427 if (len == 0 || end < 0)
428 fl->fl_end = OFFSET_MAX;
429 else
430 fl->fl_end = s32_to_loff_t(end);
431 }
432 return 0;
433}
434
435
436static int
437nlmclt_encode_lockargs(struct rpc_rqst *req, __be32 *p, nlm_args *argp)
438{
439 struct nlm_lock *lock = &argp->lock;
440
441 if (!(p = nlm_encode_cookie(p, &argp->cookie)))
442 return -EIO;
443 *p++ = argp->block? xdr_one : xdr_zero;
444 *p++ = (lock->fl.fl_type == F_WRLCK)? xdr_one : xdr_zero;
445 if (!(p = nlm_encode_lock(p, lock)))
446 return -EIO;
447 *p++ = argp->reclaim? xdr_one : xdr_zero;
448 *p++ = htonl(argp->state);
449 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
450 return 0;
451}
452
453static int
454nlmclt_encode_cancargs(struct rpc_rqst *req, __be32 *p, nlm_args *argp)
455{
456 struct nlm_lock *lock = &argp->lock;
457
458 if (!(p = nlm_encode_cookie(p, &argp->cookie)))
459 return -EIO;
460 *p++ = argp->block? xdr_one : xdr_zero;
461 *p++ = (lock->fl.fl_type == F_WRLCK)? xdr_one : xdr_zero;
462 if (!(p = nlm_encode_lock(p, lock)))
463 return -EIO;
464 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
465 return 0;
466}
467
468static int
469nlmclt_encode_unlockargs(struct rpc_rqst *req, __be32 *p, nlm_args *argp)
470{
471 struct nlm_lock *lock = &argp->lock;
472
473 if (!(p = nlm_encode_cookie(p, &argp->cookie)))
474 return -EIO;
475 if (!(p = nlm_encode_lock(p, lock)))
476 return -EIO;
477 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
478 return 0;
479}
480
481static int
482nlmclt_encode_res(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp)
483{
484 if (!(p = nlm_encode_cookie(p, &resp->cookie)))
485 return -EIO;
486 *p++ = resp->status;
487 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
488 return 0;
489}
490
491static int
492nlmclt_encode_testres(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp)
493{
494 if (!(p = nlm_encode_testres(p, resp)))
495 return -EIO;
496 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
497 return 0;
498}
499
500static int
501nlmclt_decode_res(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp)
502{
503 if (!(p = nlm_decode_cookie(p, &resp->cookie)))
504 return -EIO;
505 resp->status = *p++;
506 return 0;
507}
508
509#if (NLMCLNT_OHSIZE > XDR_MAX_NETOBJ)
510# error "NLM host name cannot be larger than XDR_MAX_NETOBJ!"
511#endif
512
513/*
514 * Buffer requirements for NLM
515 */
516#define NLM_void_sz 0
517#define NLM_cookie_sz 1+XDR_QUADLEN(NLM_MAXCOOKIELEN)
518#define NLM_caller_sz 1+XDR_QUADLEN(NLMCLNT_OHSIZE)
519#define NLM_owner_sz 1+XDR_QUADLEN(NLMCLNT_OHSIZE)
520#define NLM_fhandle_sz 1+XDR_QUADLEN(NFS2_FHSIZE)
521#define NLM_lock_sz 3+NLM_caller_sz+NLM_owner_sz+NLM_fhandle_sz
522#define NLM_holder_sz 4+NLM_owner_sz
523
524#define NLM_testargs_sz NLM_cookie_sz+1+NLM_lock_sz
525#define NLM_lockargs_sz NLM_cookie_sz+4+NLM_lock_sz
526#define NLM_cancargs_sz NLM_cookie_sz+2+NLM_lock_sz
527#define NLM_unlockargs_sz NLM_cookie_sz+NLM_lock_sz
528
529#define NLM_testres_sz NLM_cookie_sz+1+NLM_holder_sz
530#define NLM_res_sz NLM_cookie_sz+1
531#define NLM_norep_sz 0
532
533/*
534 * For NLM, a void procedure really returns nothing
535 */
536#define nlmclt_decode_norep NULL
537
538#define PROC(proc, argtype, restype) \
539[NLMPROC_##proc] = { \
540 .p_proc = NLMPROC_##proc, \
541 .p_encode = (kxdrproc_t) nlmclt_encode_##argtype, \
542 .p_decode = (kxdrproc_t) nlmclt_decode_##restype, \
543 .p_arglen = NLM_##argtype##_sz, \
544 .p_replen = NLM_##restype##_sz, \
545 .p_statidx = NLMPROC_##proc, \
546 .p_name = #proc, \
547 }
548
549static struct rpc_procinfo nlm_procedures[] = {
550 PROC(TEST, testargs, testres),
551 PROC(LOCK, lockargs, res),
552 PROC(CANCEL, cancargs, res),
553 PROC(UNLOCK, unlockargs, res),
554 PROC(GRANTED, testargs, res),
555 PROC(TEST_MSG, testargs, norep),
556 PROC(LOCK_MSG, lockargs, norep),
557 PROC(CANCEL_MSG, cancargs, norep),
558 PROC(UNLOCK_MSG, unlockargs, norep),
559 PROC(GRANTED_MSG, testargs, norep),
560 PROC(TEST_RES, testres, norep),
561 PROC(LOCK_RES, res, norep),
562 PROC(CANCEL_RES, res, norep),
563 PROC(UNLOCK_RES, res, norep),
564 PROC(GRANTED_RES, res, norep),
565#ifdef NLMCLNT_SUPPORT_SHARES
566 PROC(SHARE, shareargs, shareres),
567 PROC(UNSHARE, shareargs, shareres),
568 PROC(NM_LOCK, lockargs, res),
569 PROC(FREE_ALL, notify, void),
570#endif
571};
572
573static struct rpc_version nlm_version1 = {
574 .number = 1,
575 .nrprocs = 16,
576 .procs = nlm_procedures,
577};
578
579static struct rpc_version nlm_version3 = {
580 .number = 3,
581 .nrprocs = 24,
582 .procs = nlm_procedures,
583};
584
585static struct rpc_version * nlm_versions[] = {
586 [1] = &nlm_version1,
587 [3] = &nlm_version3,
588#ifdef CONFIG_LOCKD_V4
589 [4] = &nlm_version4,
590#endif
591};
592
593static struct rpc_stat nlm_stats;
594
595struct rpc_program nlm_program = {
596 .name = "lockd",
597 .number = NLM_PROGRAM,
598 .nrvers = ARRAY_SIZE(nlm_versions),
599 .version = nlm_versions,
600 .stats = &nlm_stats,
601};
602
603#ifdef RPC_DEBUG
604const char *nlmdbg_cookie2a(const struct nlm_cookie *cookie)
605{
606 /*
607 * We can get away with a static buffer because we're only
608 * called with BKL held.
609 */
610 static char buf[2*NLM_MAXCOOKIELEN+1];
611 unsigned int i, len = sizeof(buf);
612 char *p = buf;
613
614 len--; /* allow for trailing \0 */
615 if (len < 3)
616 return "???";
617 for (i = 0 ; i < cookie->len ; i++) {
618 if (len < 2) {
619 strcpy(p-3, "...");
620 break;
621 }
622 sprintf(p, "%02x", cookie->data[i]);
623 p += 2;
624 len -= 2;
625 }
626 *p = '\0';
627
628 return buf;
629}
630#endif
diff --git a/fs/lockd/xdr4.c b/fs/lockd/xdr4.c
index ad9dbbc9145..dfa4789cd46 100644
--- a/fs/lockd/xdr4.c
+++ b/fs/lockd/xdr4.c
@@ -93,15 +93,6 @@ nlm4_decode_fh(__be32 *p, struct nfs_fh *f)
93 return p + XDR_QUADLEN(f->size); 93 return p + XDR_QUADLEN(f->size);
94} 94}
95 95
96static __be32 *
97nlm4_encode_fh(__be32 *p, struct nfs_fh *f)
98{
99 *p++ = htonl(f->size);
100 if (f->size) p[XDR_QUADLEN(f->size)-1] = 0; /* don't leak anything */
101 memcpy(p, f->data, f->size);
102 return p + XDR_QUADLEN(f->size);
103}
104
105/* 96/*
106 * Encode and decode owner handle 97 * Encode and decode owner handle
107 */ 98 */
@@ -112,12 +103,6 @@ nlm4_decode_oh(__be32 *p, struct xdr_netobj *oh)
112} 103}
113 104
114static __be32 * 105static __be32 *
115nlm4_encode_oh(__be32 *p, struct xdr_netobj *oh)
116{
117 return xdr_encode_netobj(p, oh);
118}
119
120static __be32 *
121nlm4_decode_lock(__be32 *p, struct nlm_lock *lock) 106nlm4_decode_lock(__be32 *p, struct nlm_lock *lock)
122{ 107{
123 struct file_lock *fl = &lock->fl; 108 struct file_lock *fl = &lock->fl;
@@ -150,38 +135,6 @@ nlm4_decode_lock(__be32 *p, struct nlm_lock *lock)
150} 135}
151 136
152/* 137/*
153 * Encode a lock as part of an NLM call
154 */
155static __be32 *
156nlm4_encode_lock(__be32 *p, struct nlm_lock *lock)
157{
158 struct file_lock *fl = &lock->fl;
159 __s64 start, len;
160
161 if (!(p = xdr_encode_string(p, lock->caller))
162 || !(p = nlm4_encode_fh(p, &lock->fh))
163 || !(p = nlm4_encode_oh(p, &lock->oh)))
164 return NULL;
165
166 if (fl->fl_start > NLM4_OFFSET_MAX
167 || (fl->fl_end > NLM4_OFFSET_MAX && fl->fl_end != OFFSET_MAX))
168 return NULL;
169
170 *p++ = htonl(lock->svid);
171
172 start = loff_t_to_s64(fl->fl_start);
173 if (fl->fl_end == OFFSET_MAX)
174 len = 0;
175 else
176 len = loff_t_to_s64(fl->fl_end - fl->fl_start + 1);
177
178 p = xdr_encode_hyper(p, start);
179 p = xdr_encode_hyper(p, len);
180
181 return p;
182}
183
184/*
185 * Encode result of a TEST/TEST_MSG call 138 * Encode result of a TEST/TEST_MSG call
186 */ 139 */
187static __be32 * 140static __be32 *
@@ -379,211 +332,3 @@ nlm4svc_encode_void(struct svc_rqst *rqstp, __be32 *p, void *dummy)
379{ 332{
380 return xdr_ressize_check(rqstp, p); 333 return xdr_ressize_check(rqstp, p);
381} 334}
382
383/*
384 * Now, the client side XDR functions
385 */
386#ifdef NLMCLNT_SUPPORT_SHARES
387static int
388nlm4clt_decode_void(struct rpc_rqst *req, __be32 *p, void *ptr)
389{
390 return 0;
391}
392#endif
393
394static int
395nlm4clt_encode_testargs(struct rpc_rqst *req, __be32 *p, nlm_args *argp)
396{
397 struct nlm_lock *lock = &argp->lock;
398
399 if (!(p = nlm4_encode_cookie(p, &argp->cookie)))
400 return -EIO;
401 *p++ = (lock->fl.fl_type == F_WRLCK)? xdr_one : xdr_zero;
402 if (!(p = nlm4_encode_lock(p, lock)))
403 return -EIO;
404 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
405 return 0;
406}
407
408static int
409nlm4clt_decode_testres(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp)
410{
411 if (!(p = nlm4_decode_cookie(p, &resp->cookie)))
412 return -EIO;
413 resp->status = *p++;
414 if (resp->status == nlm_lck_denied) {
415 struct file_lock *fl = &resp->lock.fl;
416 u32 excl;
417 __u64 start, len;
418 __s64 end;
419
420 memset(&resp->lock, 0, sizeof(resp->lock));
421 locks_init_lock(fl);
422 excl = ntohl(*p++);
423 resp->lock.svid = ntohl(*p++);
424 fl->fl_pid = (pid_t)resp->lock.svid;
425 if (!(p = nlm4_decode_oh(p, &resp->lock.oh)))
426 return -EIO;
427
428 fl->fl_flags = FL_POSIX;
429 fl->fl_type = excl? F_WRLCK : F_RDLCK;
430 p = xdr_decode_hyper(p, &start);
431 p = xdr_decode_hyper(p, &len);
432 end = start + len - 1;
433
434 fl->fl_start = s64_to_loff_t(start);
435 if (len == 0 || end < 0)
436 fl->fl_end = OFFSET_MAX;
437 else
438 fl->fl_end = s64_to_loff_t(end);
439 }
440 return 0;
441}
442
443
444static int
445nlm4clt_encode_lockargs(struct rpc_rqst *req, __be32 *p, nlm_args *argp)
446{
447 struct nlm_lock *lock = &argp->lock;
448
449 if (!(p = nlm4_encode_cookie(p, &argp->cookie)))
450 return -EIO;
451 *p++ = argp->block? xdr_one : xdr_zero;
452 *p++ = (lock->fl.fl_type == F_WRLCK)? xdr_one : xdr_zero;
453 if (!(p = nlm4_encode_lock(p, lock)))
454 return -EIO;
455 *p++ = argp->reclaim? xdr_one : xdr_zero;
456 *p++ = htonl(argp->state);
457 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
458 return 0;
459}
460
461static int
462nlm4clt_encode_cancargs(struct rpc_rqst *req, __be32 *p, nlm_args *argp)
463{
464 struct nlm_lock *lock = &argp->lock;
465
466 if (!(p = nlm4_encode_cookie(p, &argp->cookie)))
467 return -EIO;
468 *p++ = argp->block? xdr_one : xdr_zero;
469 *p++ = (lock->fl.fl_type == F_WRLCK)? xdr_one : xdr_zero;
470 if (!(p = nlm4_encode_lock(p, lock)))
471 return -EIO;
472 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
473 return 0;
474}
475
476static int
477nlm4clt_encode_unlockargs(struct rpc_rqst *req, __be32 *p, nlm_args *argp)
478{
479 struct nlm_lock *lock = &argp->lock;
480
481 if (!(p = nlm4_encode_cookie(p, &argp->cookie)))
482 return -EIO;
483 if (!(p = nlm4_encode_lock(p, lock)))
484 return -EIO;
485 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
486 return 0;
487}
488
489static int
490nlm4clt_encode_res(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp)
491{
492 if (!(p = nlm4_encode_cookie(p, &resp->cookie)))
493 return -EIO;
494 *p++ = resp->status;
495 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
496 return 0;
497}
498
499static int
500nlm4clt_encode_testres(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp)
501{
502 if (!(p = nlm4_encode_testres(p, resp)))
503 return -EIO;
504 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
505 return 0;
506}
507
508static int
509nlm4clt_decode_res(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp)
510{
511 if (!(p = nlm4_decode_cookie(p, &resp->cookie)))
512 return -EIO;
513 resp->status = *p++;
514 return 0;
515}
516
517#if (NLMCLNT_OHSIZE > XDR_MAX_NETOBJ)
518# error "NLM host name cannot be larger than XDR_MAX_NETOBJ!"
519#endif
520
521#if (NLMCLNT_OHSIZE > NLM_MAXSTRLEN)
522# error "NLM host name cannot be larger than NLM's maximum string length!"
523#endif
524
525/*
526 * Buffer requirements for NLM
527 */
528#define NLM4_void_sz 0
529#define NLM4_cookie_sz 1+XDR_QUADLEN(NLM_MAXCOOKIELEN)
530#define NLM4_caller_sz 1+XDR_QUADLEN(NLMCLNT_OHSIZE)
531#define NLM4_owner_sz 1+XDR_QUADLEN(NLMCLNT_OHSIZE)
532#define NLM4_fhandle_sz 1+XDR_QUADLEN(NFS3_FHSIZE)
533#define NLM4_lock_sz 5+NLM4_caller_sz+NLM4_owner_sz+NLM4_fhandle_sz
534#define NLM4_holder_sz 6+NLM4_owner_sz
535
536#define NLM4_testargs_sz NLM4_cookie_sz+1+NLM4_lock_sz
537#define NLM4_lockargs_sz NLM4_cookie_sz+4+NLM4_lock_sz
538#define NLM4_cancargs_sz NLM4_cookie_sz+2+NLM4_lock_sz
539#define NLM4_unlockargs_sz NLM4_cookie_sz+NLM4_lock_sz
540
541#define NLM4_testres_sz NLM4_cookie_sz+1+NLM4_holder_sz
542#define NLM4_res_sz NLM4_cookie_sz+1
543#define NLM4_norep_sz 0
544
545/*
546 * For NLM, a void procedure really returns nothing
547 */
548#define nlm4clt_decode_norep NULL
549
550#define PROC(proc, argtype, restype) \
551[NLMPROC_##proc] = { \
552 .p_proc = NLMPROC_##proc, \
553 .p_encode = (kxdrproc_t) nlm4clt_encode_##argtype, \
554 .p_decode = (kxdrproc_t) nlm4clt_decode_##restype, \
555 .p_arglen = NLM4_##argtype##_sz, \
556 .p_replen = NLM4_##restype##_sz, \
557 .p_statidx = NLMPROC_##proc, \
558 .p_name = #proc, \
559 }
560
561static struct rpc_procinfo nlm4_procedures[] = {
562 PROC(TEST, testargs, testres),
563 PROC(LOCK, lockargs, res),
564 PROC(CANCEL, cancargs, res),
565 PROC(UNLOCK, unlockargs, res),
566 PROC(GRANTED, testargs, res),
567 PROC(TEST_MSG, testargs, norep),
568 PROC(LOCK_MSG, lockargs, norep),
569 PROC(CANCEL_MSG, cancargs, norep),
570 PROC(UNLOCK_MSG, unlockargs, norep),
571 PROC(GRANTED_MSG, testargs, norep),
572 PROC(TEST_RES, testres, norep),
573 PROC(LOCK_RES, res, norep),
574 PROC(CANCEL_RES, res, norep),
575 PROC(UNLOCK_RES, res, norep),
576 PROC(GRANTED_RES, res, norep),
577#ifdef NLMCLNT_SUPPORT_SHARES
578 PROC(SHARE, shareargs, shareres),
579 PROC(UNSHARE, shareargs, shareres),
580 PROC(NM_LOCK, lockargs, res),
581 PROC(FREE_ALL, notify, void),
582#endif
583};
584
585struct rpc_version nlm_version4 = {
586 .number = 4,
587 .nrprocs = 24,
588 .procs = nlm4_procedures,
589};
diff --git a/fs/locks.c b/fs/locks.c
index 65765cb6afe..0f3998291f7 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -122,7 +122,6 @@
122#include <linux/module.h> 122#include <linux/module.h>
123#include <linux/security.h> 123#include <linux/security.h>
124#include <linux/slab.h> 124#include <linux/slab.h>
125#include <linux/smp_lock.h>
126#include <linux/syscalls.h> 125#include <linux/syscalls.h>
127#include <linux/time.h> 126#include <linux/time.h>
128#include <linux/rcupdate.h> 127#include <linux/rcupdate.h>
@@ -445,15 +444,9 @@ static void lease_release_private_callback(struct file_lock *fl)
445 fl->fl_file->f_owner.signum = 0; 444 fl->fl_file->f_owner.signum = 0;
446} 445}
447 446
448static int lease_mylease_callback(struct file_lock *fl, struct file_lock *try)
449{
450 return fl->fl_file == try->fl_file;
451}
452
453static const struct lock_manager_operations lease_manager_ops = { 447static const struct lock_manager_operations lease_manager_ops = {
454 .fl_break = lease_break_callback, 448 .fl_break = lease_break_callback,
455 .fl_release_private = lease_release_private_callback, 449 .fl_release_private = lease_release_private_callback,
456 .fl_mylease = lease_mylease_callback,
457 .fl_change = lease_modify, 450 .fl_change = lease_modify,
458}; 451};
459 452
@@ -1390,7 +1383,7 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp)
1390 if ((arg == F_RDLCK) && (atomic_read(&inode->i_writecount) > 0)) 1383 if ((arg == F_RDLCK) && (atomic_read(&inode->i_writecount) > 0))
1391 goto out; 1384 goto out;
1392 if ((arg == F_WRLCK) 1385 if ((arg == F_WRLCK)
1393 && ((atomic_read(&dentry->d_count) > 1) 1386 && ((dentry->d_count > 1)
1394 || (atomic_read(&inode->i_count) > 1))) 1387 || (atomic_read(&inode->i_count) > 1)))
1395 goto out; 1388 goto out;
1396 } 1389 }
@@ -1406,7 +1399,7 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp)
1406 for (before = &inode->i_flock; 1399 for (before = &inode->i_flock;
1407 ((fl = *before) != NULL) && IS_LEASE(fl); 1400 ((fl = *before) != NULL) && IS_LEASE(fl);
1408 before = &fl->fl_next) { 1401 before = &fl->fl_next) {
1409 if (lease->fl_lmops->fl_mylease(fl, lease)) 1402 if (fl->fl_file == filp)
1410 my_before = before; 1403 my_before = before;
1411 else if (fl->fl_type == (F_INPROGRESS | F_UNLCK)) 1404 else if (fl->fl_type == (F_INPROGRESS | F_UNLCK))
1412 /* 1405 /*
@@ -1504,9 +1497,8 @@ static int do_fcntl_delete_lease(struct file *filp)
1504 1497
1505static int do_fcntl_add_lease(unsigned int fd, struct file *filp, long arg) 1498static int do_fcntl_add_lease(unsigned int fd, struct file *filp, long arg)
1506{ 1499{
1507 struct file_lock *fl; 1500 struct file_lock *fl, *ret;
1508 struct fasync_struct *new; 1501 struct fasync_struct *new;
1509 struct inode *inode = filp->f_path.dentry->d_inode;
1510 int error; 1502 int error;
1511 1503
1512 fl = lease_alloc(filp, arg); 1504 fl = lease_alloc(filp, arg);
@@ -1518,13 +1510,16 @@ static int do_fcntl_add_lease(unsigned int fd, struct file *filp, long arg)
1518 locks_free_lock(fl); 1510 locks_free_lock(fl);
1519 return -ENOMEM; 1511 return -ENOMEM;
1520 } 1512 }
1513 ret = fl;
1521 lock_flocks(); 1514 lock_flocks();
1522 error = __vfs_setlease(filp, arg, &fl); 1515 error = __vfs_setlease(filp, arg, &ret);
1523 if (error) { 1516 if (error) {
1524 unlock_flocks(); 1517 unlock_flocks();
1525 locks_free_lock(fl); 1518 locks_free_lock(fl);
1526 goto out_free_fasync; 1519 goto out_free_fasync;
1527 } 1520 }
1521 if (ret != fl)
1522 locks_free_lock(fl);
1528 1523
1529 /* 1524 /*
1530 * fasync_insert_entry() returns the old entry if any. 1525 * fasync_insert_entry() returns the old entry if any.
@@ -1532,17 +1527,10 @@ static int do_fcntl_add_lease(unsigned int fd, struct file *filp, long arg)
1532 * inserted it into the fasync list. Clear new so that 1527 * inserted it into the fasync list. Clear new so that
1533 * we don't release it here. 1528 * we don't release it here.
1534 */ 1529 */
1535 if (!fasync_insert_entry(fd, filp, &fl->fl_fasync, new)) 1530 if (!fasync_insert_entry(fd, filp, &ret->fl_fasync, new))
1536 new = NULL; 1531 new = NULL;
1537 1532
1538 if (error < 0) { 1533 error = __f_setown(filp, task_pid(current), PIDTYPE_PID, 0);
1539 /* remove lease just inserted by setlease */
1540 fl->fl_type = F_UNLCK | F_INPROGRESS;
1541 fl->fl_break_time = jiffies - 10;
1542 time_out_leases(inode);
1543 } else {
1544 error = __f_setown(filp, task_pid(current), PIDTYPE_PID, 0);
1545 }
1546 unlock_flocks(); 1534 unlock_flocks();
1547 1535
1548out_free_fasync: 1536out_free_fasync:
diff --git a/fs/logfs/dev_bdev.c b/fs/logfs/dev_bdev.c
index 92ca6fbe09b..723bc5bca09 100644
--- a/fs/logfs/dev_bdev.c
+++ b/fs/logfs/dev_bdev.c
@@ -300,7 +300,7 @@ static int bdev_write_sb(struct super_block *sb, struct page *page)
300 300
301static void bdev_put_device(struct logfs_super *s) 301static void bdev_put_device(struct logfs_super *s)
302{ 302{
303 close_bdev_exclusive(s->s_bdev, FMODE_READ|FMODE_WRITE); 303 blkdev_put(s->s_bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
304} 304}
305 305
306static int bdev_can_write_buf(struct super_block *sb, u64 ofs) 306static int bdev_can_write_buf(struct super_block *sb, u64 ofs)
@@ -325,13 +325,14 @@ int logfs_get_sb_bdev(struct logfs_super *p, struct file_system_type *type,
325{ 325{
326 struct block_device *bdev; 326 struct block_device *bdev;
327 327
328 bdev = open_bdev_exclusive(devname, FMODE_READ|FMODE_WRITE, type); 328 bdev = blkdev_get_by_path(devname, FMODE_READ|FMODE_WRITE|FMODE_EXCL,
329 type);
329 if (IS_ERR(bdev)) 330 if (IS_ERR(bdev))
330 return PTR_ERR(bdev); 331 return PTR_ERR(bdev);
331 332
332 if (MAJOR(bdev->bd_dev) == MTD_BLOCK_MAJOR) { 333 if (MAJOR(bdev->bd_dev) == MTD_BLOCK_MAJOR) {
333 int mtdnr = MINOR(bdev->bd_dev); 334 int mtdnr = MINOR(bdev->bd_dev);
334 close_bdev_exclusive(bdev, FMODE_READ|FMODE_WRITE); 335 blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
335 return logfs_get_sb_mtd(p, mtdnr); 336 return logfs_get_sb_mtd(p, mtdnr);
336 } 337 }
337 338
diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c
index 409dfd65e9a..f9ddf0c388c 100644
--- a/fs/logfs/dir.c
+++ b/fs/logfs/dir.c
@@ -555,9 +555,11 @@ static int logfs_symlink(struct inode *dir, struct dentry *dentry,
555 return __logfs_create(dir, dentry, inode, target, destlen); 555 return __logfs_create(dir, dentry, inode, target, destlen);
556} 556}
557 557
558static int logfs_permission(struct inode *inode, int mask) 558static int logfs_permission(struct inode *inode, int mask, unsigned int flags)
559{ 559{
560 return generic_permission(inode, mask, NULL); 560 if (flags & IPERM_FLAG_RCU)
561 return -ECHILD;
562 return generic_permission(inode, mask, flags, NULL);
561} 563}
562 564
563static int logfs_link(struct dentry *old_dentry, struct inode *dir, 565static int logfs_link(struct dentry *old_dentry, struct inode *dir,
diff --git a/fs/logfs/inode.c b/fs/logfs/inode.c
index d8c71ece098..03b8c240aed 100644
--- a/fs/logfs/inode.c
+++ b/fs/logfs/inode.c
@@ -141,13 +141,20 @@ struct inode *logfs_safe_iget(struct super_block *sb, ino_t ino, int *is_cached)
141 return __logfs_iget(sb, ino); 141 return __logfs_iget(sb, ino);
142} 142}
143 143
144static void logfs_i_callback(struct rcu_head *head)
145{
146 struct inode *inode = container_of(head, struct inode, i_rcu);
147 INIT_LIST_HEAD(&inode->i_dentry);
148 kmem_cache_free(logfs_inode_cache, logfs_inode(inode));
149}
150
144static void __logfs_destroy_inode(struct inode *inode) 151static void __logfs_destroy_inode(struct inode *inode)
145{ 152{
146 struct logfs_inode *li = logfs_inode(inode); 153 struct logfs_inode *li = logfs_inode(inode);
147 154
148 BUG_ON(li->li_block); 155 BUG_ON(li->li_block);
149 list_del(&li->li_freeing_list); 156 list_del(&li->li_freeing_list);
150 kmem_cache_free(logfs_inode_cache, li); 157 call_rcu(&inode->i_rcu, logfs_i_callback);
151} 158}
152 159
153static void logfs_destroy_inode(struct inode *inode) 160static void logfs_destroy_inode(struct inode *inode)
diff --git a/fs/logfs/journal.c b/fs/logfs/journal.c
index f46ee8b0e13..9da29706f91 100644
--- a/fs/logfs/journal.c
+++ b/fs/logfs/journal.c
@@ -828,7 +828,7 @@ void do_logfs_journal_wl_pass(struct super_block *sb)
828 super->s_journal_seg[i] = segno; 828 super->s_journal_seg[i] = segno;
829 super->s_journal_ec[i] = ec; 829 super->s_journal_ec[i] = ec;
830 logfs_set_segment_reserved(sb, segno); 830 logfs_set_segment_reserved(sb, segno);
831 err = btree_insert32(head, segno, (void *)1, GFP_KERNEL); 831 err = btree_insert32(head, segno, (void *)1, GFP_NOFS);
832 BUG_ON(err); /* mempool should prevent this */ 832 BUG_ON(err); /* mempool should prevent this */
833 err = logfs_erase_segment(sb, segno, 1); 833 err = logfs_erase_segment(sb, segno, 1);
834 BUG_ON(err); /* FIXME: remount-ro would be nicer */ 834 BUG_ON(err); /* FIXME: remount-ro would be nicer */
diff --git a/fs/logfs/logfs.h b/fs/logfs/logfs.h
index cd51a36b37f..57afd4a6fab 100644
--- a/fs/logfs/logfs.h
+++ b/fs/logfs/logfs.h
@@ -486,7 +486,7 @@ static inline int logfs_get_sb_bdev(struct logfs_super *s,
486 486
487/* dev_mtd.c */ 487/* dev_mtd.c */
488#ifdef CONFIG_MTD 488#ifdef CONFIG_MTD
489int logfs_get_sb_mtd(struct logfs_super *s, int mtdnr) 489int logfs_get_sb_mtd(struct logfs_super *s, int mtdnr);
490#else 490#else
491static inline int logfs_get_sb_mtd(struct logfs_super *s, int mtdnr) 491static inline int logfs_get_sb_mtd(struct logfs_super *s, int mtdnr)
492{ 492{
diff --git a/fs/logfs/readwrite.c b/fs/logfs/readwrite.c
index 6127baf0e18..ee99a9f5dfd 100644
--- a/fs/logfs/readwrite.c
+++ b/fs/logfs/readwrite.c
@@ -1994,6 +1994,9 @@ static int do_write_inode(struct inode *inode)
1994 1994
1995 /* FIXME: transaction is part of logfs_block now. Is that enough? */ 1995 /* FIXME: transaction is part of logfs_block now. Is that enough? */
1996 err = logfs_write_buf(master_inode, page, 0); 1996 err = logfs_write_buf(master_inode, page, 0);
1997 if (err)
1998 move_page_to_inode(inode, page);
1999
1997 logfs_put_write_page(page); 2000 logfs_put_write_page(page);
1998 return err; 2001 return err;
1999} 2002}
diff --git a/fs/mbcache.c b/fs/mbcache.c
index 93444747237..a25444ab2ba 100644
--- a/fs/mbcache.c
+++ b/fs/mbcache.c
@@ -76,18 +76,6 @@ EXPORT_SYMBOL(mb_cache_entry_find_first);
76EXPORT_SYMBOL(mb_cache_entry_find_next); 76EXPORT_SYMBOL(mb_cache_entry_find_next);
77#endif 77#endif
78 78
79struct mb_cache {
80 struct list_head c_cache_list;
81 const char *c_name;
82 atomic_t c_entry_count;
83 int c_max_entries;
84 int c_bucket_bits;
85 struct kmem_cache *c_entry_cache;
86 struct list_head *c_block_hash;
87 struct list_head *c_index_hash;
88};
89
90
91/* 79/*
92 * Global data: list of all mbcache's, lru list, and a spinlock for 80 * Global data: list of all mbcache's, lru list, and a spinlock for
93 * accessing cache data structures on SMP machines. The lru list is 81 * accessing cache data structures on SMP machines. The lru list is
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index fb2020858a3..ae0b83f476a 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -68,11 +68,18 @@ static struct inode *minix_alloc_inode(struct super_block *sb)
68 return &ei->vfs_inode; 68 return &ei->vfs_inode;
69} 69}
70 70
71static void minix_destroy_inode(struct inode *inode) 71static void minix_i_callback(struct rcu_head *head)
72{ 72{
73 struct inode *inode = container_of(head, struct inode, i_rcu);
74 INIT_LIST_HEAD(&inode->i_dentry);
73 kmem_cache_free(minix_inode_cachep, minix_i(inode)); 75 kmem_cache_free(minix_inode_cachep, minix_i(inode));
74} 76}
75 77
78static void minix_destroy_inode(struct inode *inode)
79{
80 call_rcu(&inode->i_rcu, minix_i_callback);
81}
82
76static void init_once(void *foo) 83static void init_once(void *foo)
77{ 84{
78 struct minix_inode_info *ei = (struct minix_inode_info *) foo; 85 struct minix_inode_info *ei = (struct minix_inode_info *) foo;
diff --git a/fs/minix/namei.c b/fs/minix/namei.c
index c0d35a3acce..ce7337ddfdb 100644
--- a/fs/minix/namei.c
+++ b/fs/minix/namei.c
@@ -23,8 +23,6 @@ static struct dentry *minix_lookup(struct inode * dir, struct dentry *dentry, st
23 struct inode * inode = NULL; 23 struct inode * inode = NULL;
24 ino_t ino; 24 ino_t ino;
25 25
26 dentry->d_op = dir->i_sb->s_root->d_op;
27
28 if (dentry->d_name.len > minix_sb(dir->i_sb)->s_namelen) 26 if (dentry->d_name.len > minix_sb(dir->i_sb)->s_namelen)
29 return ERR_PTR(-ENAMETOOLONG); 27 return ERR_PTR(-ENAMETOOLONG);
30 28
diff --git a/fs/mpage.c b/fs/mpage.c
index fd56ca2ea55..d78455a81ec 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -40,7 +40,7 @@
40 * status of that page is hard. See end_buffer_async_read() for the details. 40 * status of that page is hard. See end_buffer_async_read() for the details.
41 * There is no point in duplicating all that complexity. 41 * There is no point in duplicating all that complexity.
42 */ 42 */
43static void mpage_end_io_read(struct bio *bio, int err) 43static void mpage_end_io(struct bio *bio, int err)
44{ 44{
45 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 45 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
46 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; 46 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
@@ -50,44 +50,29 @@ static void mpage_end_io_read(struct bio *bio, int err)
50 50
51 if (--bvec >= bio->bi_io_vec) 51 if (--bvec >= bio->bi_io_vec)
52 prefetchw(&bvec->bv_page->flags); 52 prefetchw(&bvec->bv_page->flags);
53 53 if (bio_data_dir(bio) == READ) {
54 if (uptodate) { 54 if (uptodate) {
55 SetPageUptodate(page); 55 SetPageUptodate(page);
56 } else { 56 } else {
57 ClearPageUptodate(page); 57 ClearPageUptodate(page);
58 SetPageError(page); 58 SetPageError(page);
59 } 59 }
60 unlock_page(page); 60 unlock_page(page);
61 } while (bvec >= bio->bi_io_vec); 61 } else { /* bio_data_dir(bio) == WRITE */
62 bio_put(bio); 62 if (!uptodate) {
63} 63 SetPageError(page);
64 64 if (page->mapping)
65static void mpage_end_io_write(struct bio *bio, int err) 65 set_bit(AS_EIO, &page->mapping->flags);
66{ 66 }
67 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 67 end_page_writeback(page);
68 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
69
70 do {
71 struct page *page = bvec->bv_page;
72
73 if (--bvec >= bio->bi_io_vec)
74 prefetchw(&bvec->bv_page->flags);
75
76 if (!uptodate){
77 SetPageError(page);
78 if (page->mapping)
79 set_bit(AS_EIO, &page->mapping->flags);
80 } 68 }
81 end_page_writeback(page);
82 } while (bvec >= bio->bi_io_vec); 69 } while (bvec >= bio->bi_io_vec);
83 bio_put(bio); 70 bio_put(bio);
84} 71}
85 72
86static struct bio *mpage_bio_submit(int rw, struct bio *bio) 73static struct bio *mpage_bio_submit(int rw, struct bio *bio)
87{ 74{
88 bio->bi_end_io = mpage_end_io_read; 75 bio->bi_end_io = mpage_end_io;
89 if (rw == WRITE)
90 bio->bi_end_io = mpage_end_io_write;
91 submit_bio(rw, bio); 76 submit_bio(rw, bio);
92 return NULL; 77 return NULL;
93} 78}
diff --git a/fs/namei.c b/fs/namei.c
index 5362af9b737..7d77f24d32a 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -169,8 +169,8 @@ EXPORT_SYMBOL(putname);
169/* 169/*
170 * This does basic POSIX ACL permission checking 170 * This does basic POSIX ACL permission checking
171 */ 171 */
172static int acl_permission_check(struct inode *inode, int mask, 172static int acl_permission_check(struct inode *inode, int mask, unsigned int flags,
173 int (*check_acl)(struct inode *inode, int mask)) 173 int (*check_acl)(struct inode *inode, int mask, unsigned int flags))
174{ 174{
175 umode_t mode = inode->i_mode; 175 umode_t mode = inode->i_mode;
176 176
@@ -180,7 +180,7 @@ static int acl_permission_check(struct inode *inode, int mask,
180 mode >>= 6; 180 mode >>= 6;
181 else { 181 else {
182 if (IS_POSIXACL(inode) && (mode & S_IRWXG) && check_acl) { 182 if (IS_POSIXACL(inode) && (mode & S_IRWXG) && check_acl) {
183 int error = check_acl(inode, mask); 183 int error = check_acl(inode, mask, flags);
184 if (error != -EAGAIN) 184 if (error != -EAGAIN)
185 return error; 185 return error;
186 } 186 }
@@ -198,25 +198,30 @@ static int acl_permission_check(struct inode *inode, int mask,
198} 198}
199 199
200/** 200/**
201 * generic_permission - check for access rights on a Posix-like filesystem 201 * generic_permission - check for access rights on a Posix-like filesystem
202 * @inode: inode to check access rights for 202 * @inode: inode to check access rights for
203 * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC) 203 * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
204 * @check_acl: optional callback to check for Posix ACLs 204 * @check_acl: optional callback to check for Posix ACLs
205 * @flags: IPERM_FLAG_ flags.
205 * 206 *
206 * Used to check for read/write/execute permissions on a file. 207 * Used to check for read/write/execute permissions on a file.
207 * We use "fsuid" for this, letting us set arbitrary permissions 208 * We use "fsuid" for this, letting us set arbitrary permissions
208 * for filesystem access without changing the "normal" uids which 209 * for filesystem access without changing the "normal" uids which
209 * are used for other things.. 210 * are used for other things.
211 *
212 * generic_permission is rcu-walk aware. It returns -ECHILD in case an rcu-walk
213 * request cannot be satisfied (eg. requires blocking or too much complexity).
214 * It would then be called again in ref-walk mode.
210 */ 215 */
211int generic_permission(struct inode *inode, int mask, 216int generic_permission(struct inode *inode, int mask, unsigned int flags,
212 int (*check_acl)(struct inode *inode, int mask)) 217 int (*check_acl)(struct inode *inode, int mask, unsigned int flags))
213{ 218{
214 int ret; 219 int ret;
215 220
216 /* 221 /*
217 * Do the basic POSIX ACL permission checks. 222 * Do the basic POSIX ACL permission checks.
218 */ 223 */
219 ret = acl_permission_check(inode, mask, check_acl); 224 ret = acl_permission_check(inode, mask, flags, check_acl);
220 if (ret != -EACCES) 225 if (ret != -EACCES)
221 return ret; 226 return ret;
222 227
@@ -271,9 +276,10 @@ int inode_permission(struct inode *inode, int mask)
271 } 276 }
272 277
273 if (inode->i_op->permission) 278 if (inode->i_op->permission)
274 retval = inode->i_op->permission(inode, mask); 279 retval = inode->i_op->permission(inode, mask, 0);
275 else 280 else
276 retval = generic_permission(inode, mask, inode->i_op->check_acl); 281 retval = generic_permission(inode, mask, 0,
282 inode->i_op->check_acl);
277 283
278 if (retval) 284 if (retval)
279 return retval; 285 return retval;
@@ -375,6 +381,181 @@ void path_put(struct path *path)
375EXPORT_SYMBOL(path_put); 381EXPORT_SYMBOL(path_put);
376 382
377/** 383/**
384 * nameidata_drop_rcu - drop this nameidata out of rcu-walk
385 * @nd: nameidata pathwalk data to drop
386 * Returns: 0 on success, -ECHILD on failure
387 *
388 * Path walking has 2 modes, rcu-walk and ref-walk (see
389 * Documentation/filesystems/path-lookup.txt). __drop_rcu* functions attempt
390 * to drop out of rcu-walk mode and take normal reference counts on dentries
391 * and vfsmounts to transition to ref-walk mode. __drop_rcu* functions take
392 * refcounts at the last known good point before rcu-walk got stuck, so
393 * ref-walk may continue from there. If this is not successful (eg. a seqcount
394 * has changed), then failure is returned and path walk restarts from the
395 * beginning in ref-walk mode.
396 *
397 * nameidata_drop_rcu attempts to drop the current nd->path and nd->root into
398 * ref-walk. Must be called from rcu-walk context.
399 */
400static int nameidata_drop_rcu(struct nameidata *nd)
401{
402 struct fs_struct *fs = current->fs;
403 struct dentry *dentry = nd->path.dentry;
404
405 BUG_ON(!(nd->flags & LOOKUP_RCU));
406 if (nd->root.mnt) {
407 spin_lock(&fs->lock);
408 if (nd->root.mnt != fs->root.mnt ||
409 nd->root.dentry != fs->root.dentry)
410 goto err_root;
411 }
412 spin_lock(&dentry->d_lock);
413 if (!__d_rcu_to_refcount(dentry, nd->seq))
414 goto err;
415 BUG_ON(nd->inode != dentry->d_inode);
416 spin_unlock(&dentry->d_lock);
417 if (nd->root.mnt) {
418 path_get(&nd->root);
419 spin_unlock(&fs->lock);
420 }
421 mntget(nd->path.mnt);
422
423 rcu_read_unlock();
424 br_read_unlock(vfsmount_lock);
425 nd->flags &= ~LOOKUP_RCU;
426 return 0;
427err:
428 spin_unlock(&dentry->d_lock);
429err_root:
430 if (nd->root.mnt)
431 spin_unlock(&fs->lock);
432 return -ECHILD;
433}
434
435/* Try to drop out of rcu-walk mode if we were in it, otherwise do nothing. */
436static inline int nameidata_drop_rcu_maybe(struct nameidata *nd)
437{
438 if (nd->flags & LOOKUP_RCU)
439 return nameidata_drop_rcu(nd);
440 return 0;
441}
442
443/**
444 * nameidata_dentry_drop_rcu - drop nameidata and dentry out of rcu-walk
445 * @nd: nameidata pathwalk data to drop
446 * @dentry: dentry to drop
447 * Returns: 0 on success, -ECHILD on failure
448 *
449 * nameidata_dentry_drop_rcu attempts to drop the current nd->path and nd->root,
450 * and dentry into ref-walk. @dentry must be a path found by a do_lookup call on
451 * @nd. Must be called from rcu-walk context.
452 */
453static int nameidata_dentry_drop_rcu(struct nameidata *nd, struct dentry *dentry)
454{
455 struct fs_struct *fs = current->fs;
456 struct dentry *parent = nd->path.dentry;
457
458 /*
459 * It can be possible to revalidate the dentry that we started
460 * the path walk with. force_reval_path may also revalidate the
461 * dentry already committed to the nameidata.
462 */
463 if (unlikely(parent == dentry))
464 return nameidata_drop_rcu(nd);
465
466 BUG_ON(!(nd->flags & LOOKUP_RCU));
467 if (nd->root.mnt) {
468 spin_lock(&fs->lock);
469 if (nd->root.mnt != fs->root.mnt ||
470 nd->root.dentry != fs->root.dentry)
471 goto err_root;
472 }
473 spin_lock(&parent->d_lock);
474 spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
475 if (!__d_rcu_to_refcount(dentry, nd->seq))
476 goto err;
477 /*
478 * If the sequence check on the child dentry passed, then the child has
479 * not been removed from its parent. This means the parent dentry must
480 * be valid and able to take a reference at this point.
481 */
482 BUG_ON(!IS_ROOT(dentry) && dentry->d_parent != parent);
483 BUG_ON(!parent->d_count);
484 parent->d_count++;
485 spin_unlock(&dentry->d_lock);
486 spin_unlock(&parent->d_lock);
487 if (nd->root.mnt) {
488 path_get(&nd->root);
489 spin_unlock(&fs->lock);
490 }
491 mntget(nd->path.mnt);
492
493 rcu_read_unlock();
494 br_read_unlock(vfsmount_lock);
495 nd->flags &= ~LOOKUP_RCU;
496 return 0;
497err:
498 spin_unlock(&dentry->d_lock);
499 spin_unlock(&parent->d_lock);
500err_root:
501 if (nd->root.mnt)
502 spin_unlock(&fs->lock);
503 return -ECHILD;
504}
505
506/* Try to drop out of rcu-walk mode if we were in it, otherwise do nothing. */
507static inline int nameidata_dentry_drop_rcu_maybe(struct nameidata *nd, struct dentry *dentry)
508{
509 if (nd->flags & LOOKUP_RCU)
510 return nameidata_dentry_drop_rcu(nd, dentry);
511 return 0;
512}
513
514/**
515 * nameidata_drop_rcu_last - drop nameidata ending path walk out of rcu-walk
516 * @nd: nameidata pathwalk data to drop
517 * Returns: 0 on success, -ECHILD on failure
518 *
519 * nameidata_drop_rcu_last attempts to drop the current nd->path into ref-walk.
520 * nd->path should be the final element of the lookup, so nd->root is discarded.
521 * Must be called from rcu-walk context.
522 */
523static int nameidata_drop_rcu_last(struct nameidata *nd)
524{
525 struct dentry *dentry = nd->path.dentry;
526
527 BUG_ON(!(nd->flags & LOOKUP_RCU));
528 nd->flags &= ~LOOKUP_RCU;
529 nd->root.mnt = NULL;
530 spin_lock(&dentry->d_lock);
531 if (!__d_rcu_to_refcount(dentry, nd->seq))
532 goto err_unlock;
533 BUG_ON(nd->inode != dentry->d_inode);
534 spin_unlock(&dentry->d_lock);
535
536 mntget(nd->path.mnt);
537
538 rcu_read_unlock();
539 br_read_unlock(vfsmount_lock);
540
541 return 0;
542
543err_unlock:
544 spin_unlock(&dentry->d_lock);
545 rcu_read_unlock();
546 br_read_unlock(vfsmount_lock);
547 return -ECHILD;
548}
549
550/* Try to drop out of rcu-walk mode if we were in it, otherwise do nothing. */
551static inline int nameidata_drop_rcu_last_maybe(struct nameidata *nd)
552{
553 if (likely(nd->flags & LOOKUP_RCU))
554 return nameidata_drop_rcu_last(nd);
555 return 0;
556}
557
558/**
378 * release_open_intent - free up open intent resources 559 * release_open_intent - free up open intent resources
379 * @nd: pointer to nameidata 560 * @nd: pointer to nameidata
380 */ 561 */
@@ -386,10 +567,33 @@ void release_open_intent(struct nameidata *nd)
386 fput(nd->intent.open.file); 567 fput(nd->intent.open.file);
387} 568}
388 569
570/*
571 * Call d_revalidate and handle filesystems that request rcu-walk
572 * to be dropped. This may be called and return in rcu-walk mode,
573 * regardless of success or error. If -ECHILD is returned, the caller
574 * must return -ECHILD back up the path walk stack so path walk may
575 * be restarted in ref-walk mode.
576 */
577static int d_revalidate(struct dentry *dentry, struct nameidata *nd)
578{
579 int status;
580
581 status = dentry->d_op->d_revalidate(dentry, nd);
582 if (status == -ECHILD) {
583 if (nameidata_dentry_drop_rcu(nd, dentry))
584 return status;
585 status = dentry->d_op->d_revalidate(dentry, nd);
586 }
587
588 return status;
589}
590
389static inline struct dentry * 591static inline struct dentry *
390do_revalidate(struct dentry *dentry, struct nameidata *nd) 592do_revalidate(struct dentry *dentry, struct nameidata *nd)
391{ 593{
392 int status = dentry->d_op->d_revalidate(dentry, nd); 594 int status;
595
596 status = d_revalidate(dentry, nd);
393 if (unlikely(status <= 0)) { 597 if (unlikely(status <= 0)) {
394 /* 598 /*
395 * The dentry failed validation. 599 * The dentry failed validation.
@@ -397,19 +601,36 @@ do_revalidate(struct dentry *dentry, struct nameidata *nd)
397 * the dentry otherwise d_revalidate is asking us 601 * the dentry otherwise d_revalidate is asking us
398 * to return a fail status. 602 * to return a fail status.
399 */ 603 */
400 if (!status) { 604 if (status < 0) {
605 /* If we're in rcu-walk, we don't have a ref */
606 if (!(nd->flags & LOOKUP_RCU))
607 dput(dentry);
608 dentry = ERR_PTR(status);
609
610 } else {
611 /* Don't d_invalidate in rcu-walk mode */
612 if (nameidata_dentry_drop_rcu_maybe(nd, dentry))
613 return ERR_PTR(-ECHILD);
401 if (!d_invalidate(dentry)) { 614 if (!d_invalidate(dentry)) {
402 dput(dentry); 615 dput(dentry);
403 dentry = NULL; 616 dentry = NULL;
404 } 617 }
405 } else {
406 dput(dentry);
407 dentry = ERR_PTR(status);
408 } 618 }
409 } 619 }
410 return dentry; 620 return dentry;
411} 621}
412 622
623static inline int need_reval_dot(struct dentry *dentry)
624{
625 if (likely(!(dentry->d_flags & DCACHE_OP_REVALIDATE)))
626 return 0;
627
628 if (likely(!(dentry->d_sb->s_type->fs_flags & FS_REVAL_DOT)))
629 return 0;
630
631 return 1;
632}
633
413/* 634/*
414 * force_reval_path - force revalidation of a dentry 635 * force_reval_path - force revalidation of a dentry
415 * 636 *
@@ -433,17 +654,19 @@ force_reval_path(struct path *path, struct nameidata *nd)
433 654
434 /* 655 /*
435 * only check on filesystems where it's possible for the dentry to 656 * only check on filesystems where it's possible for the dentry to
436 * become stale. It's assumed that if this flag is set then the 657 * become stale.
437 * d_revalidate op will also be defined.
438 */ 658 */
439 if (!(dentry->d_sb->s_type->fs_flags & FS_REVAL_DOT)) 659 if (!need_reval_dot(dentry))
440 return 0; 660 return 0;
441 661
442 status = dentry->d_op->d_revalidate(dentry, nd); 662 status = d_revalidate(dentry, nd);
443 if (status > 0) 663 if (status > 0)
444 return 0; 664 return 0;
445 665
446 if (!status) { 666 if (!status) {
667 /* Don't d_invalidate in rcu-walk mode */
668 if (nameidata_drop_rcu(nd))
669 return -ECHILD;
447 d_invalidate(dentry); 670 d_invalidate(dentry);
448 status = -ESTALE; 671 status = -ESTALE;
449 } 672 }
@@ -459,26 +682,27 @@ force_reval_path(struct path *path, struct nameidata *nd)
459 * short-cut DAC fails, then call ->permission() to do more 682 * short-cut DAC fails, then call ->permission() to do more
460 * complete permission check. 683 * complete permission check.
461 */ 684 */
462static int exec_permission(struct inode *inode) 685static inline int exec_permission(struct inode *inode, unsigned int flags)
463{ 686{
464 int ret; 687 int ret;
465 688
466 if (inode->i_op->permission) { 689 if (inode->i_op->permission) {
467 ret = inode->i_op->permission(inode, MAY_EXEC); 690 ret = inode->i_op->permission(inode, MAY_EXEC, flags);
468 if (!ret) 691 } else {
469 goto ok; 692 ret = acl_permission_check(inode, MAY_EXEC, flags,
470 return ret; 693 inode->i_op->check_acl);
471 } 694 }
472 ret = acl_permission_check(inode, MAY_EXEC, inode->i_op->check_acl); 695 if (likely(!ret))
473 if (!ret)
474 goto ok; 696 goto ok;
697 if (ret == -ECHILD)
698 return ret;
475 699
476 if (capable(CAP_DAC_OVERRIDE) || capable(CAP_DAC_READ_SEARCH)) 700 if (capable(CAP_DAC_OVERRIDE) || capable(CAP_DAC_READ_SEARCH))
477 goto ok; 701 goto ok;
478 702
479 return ret; 703 return ret;
480ok: 704ok:
481 return security_inode_permission(inode, MAY_EXEC); 705 return security_inode_exec_permission(inode, flags);
482} 706}
483 707
484static __always_inline void set_root(struct nameidata *nd) 708static __always_inline void set_root(struct nameidata *nd)
@@ -489,8 +713,23 @@ static __always_inline void set_root(struct nameidata *nd)
489 713
490static int link_path_walk(const char *, struct nameidata *); 714static int link_path_walk(const char *, struct nameidata *);
491 715
716static __always_inline void set_root_rcu(struct nameidata *nd)
717{
718 if (!nd->root.mnt) {
719 struct fs_struct *fs = current->fs;
720 unsigned seq;
721
722 do {
723 seq = read_seqcount_begin(&fs->seq);
724 nd->root = fs->root;
725 } while (read_seqcount_retry(&fs->seq, seq));
726 }
727}
728
492static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *link) 729static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *link)
493{ 730{
731 int ret;
732
494 if (IS_ERR(link)) 733 if (IS_ERR(link))
495 goto fail; 734 goto fail;
496 735
@@ -500,8 +739,10 @@ static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *l
500 nd->path = nd->root; 739 nd->path = nd->root;
501 path_get(&nd->root); 740 path_get(&nd->root);
502 } 741 }
742 nd->inode = nd->path.dentry->d_inode;
503 743
504 return link_path_walk(link, nd); 744 ret = link_path_walk(link, nd);
745 return ret;
505fail: 746fail:
506 path_put(&nd->path); 747 path_put(&nd->path);
507 return PTR_ERR(link); 748 return PTR_ERR(link);
@@ -514,30 +755,30 @@ static void path_put_conditional(struct path *path, struct nameidata *nd)
514 mntput(path->mnt); 755 mntput(path->mnt);
515} 756}
516 757
517static inline void path_to_nameidata(struct path *path, struct nameidata *nd) 758static inline void path_to_nameidata(const struct path *path,
759 struct nameidata *nd)
518{ 760{
519 dput(nd->path.dentry); 761 if (!(nd->flags & LOOKUP_RCU)) {
520 if (nd->path.mnt != path->mnt) { 762 dput(nd->path.dentry);
521 mntput(nd->path.mnt); 763 if (nd->path.mnt != path->mnt)
522 nd->path.mnt = path->mnt; 764 mntput(nd->path.mnt);
523 } 765 }
766 nd->path.mnt = path->mnt;
524 nd->path.dentry = path->dentry; 767 nd->path.dentry = path->dentry;
525} 768}
526 769
527static __always_inline int 770static __always_inline int
528__do_follow_link(struct path *path, struct nameidata *nd, void **p) 771__do_follow_link(const struct path *link, struct nameidata *nd, void **p)
529{ 772{
530 int error; 773 int error;
531 struct dentry *dentry = path->dentry; 774 struct dentry *dentry = link->dentry;
532 775
533 touch_atime(path->mnt, dentry); 776 touch_atime(link->mnt, dentry);
534 nd_set_link(nd, NULL); 777 nd_set_link(nd, NULL);
535 778
536 if (path->mnt != nd->path.mnt) { 779 if (link->mnt == nd->path.mnt)
537 path_to_nameidata(path, nd); 780 mntget(link->mnt);
538 dget(dentry); 781
539 }
540 mntget(path->mnt);
541 nd->last_type = LAST_BIND; 782 nd->last_type = LAST_BIND;
542 *p = dentry->d_inode->i_op->follow_link(dentry, nd); 783 *p = dentry->d_inode->i_op->follow_link(dentry, nd);
543 error = PTR_ERR(*p); 784 error = PTR_ERR(*p);
@@ -591,6 +832,20 @@ loop:
591 return err; 832 return err;
592} 833}
593 834
835static int follow_up_rcu(struct path *path)
836{
837 struct vfsmount *parent;
838 struct dentry *mountpoint;
839
840 parent = path->mnt->mnt_parent;
841 if (parent == path->mnt)
842 return 0;
843 mountpoint = path->mnt->mnt_mountpoint;
844 path->dentry = mountpoint;
845 path->mnt = parent;
846 return 1;
847}
848
594int follow_up(struct path *path) 849int follow_up(struct path *path)
595{ 850{
596 struct vfsmount *parent; 851 struct vfsmount *parent;
@@ -612,58 +867,295 @@ int follow_up(struct path *path)
612 return 1; 867 return 1;
613} 868}
614 869
615/* no need for dcache_lock, as serialization is taken care in 870/*
616 * namespace.c 871 * Perform an automount
872 * - return -EISDIR to tell follow_managed() to stop and return the path we
873 * were called with.
617 */ 874 */
618static int __follow_mount(struct path *path) 875static int follow_automount(struct path *path, unsigned flags,
876 bool *need_mntput)
619{ 877{
620 int res = 0; 878 struct vfsmount *mnt;
621 while (d_mountpoint(path->dentry)) { 879 int err;
622 struct vfsmount *mounted = lookup_mnt(path); 880
623 if (!mounted) 881 if (!path->dentry->d_op || !path->dentry->d_op->d_automount)
624 break; 882 return -EREMOTE;
883
884 /* We don't want to mount if someone supplied AT_NO_AUTOMOUNT
885 * and this is the terminal part of the path.
886 */
887 if ((flags & LOOKUP_NO_AUTOMOUNT) && !(flags & LOOKUP_CONTINUE))
888 return -EISDIR; /* we actually want to stop here */
889
890 /* We want to mount if someone is trying to open/create a file of any
891 * type under the mountpoint, wants to traverse through the mountpoint
892 * or wants to open the mounted directory.
893 *
894 * We don't want to mount if someone's just doing a stat and they've
895 * set AT_SYMLINK_NOFOLLOW - unless they're stat'ing a directory and
896 * appended a '/' to the name.
897 */
898 if (!(flags & LOOKUP_FOLLOW) &&
899 !(flags & (LOOKUP_CONTINUE | LOOKUP_DIRECTORY |
900 LOOKUP_OPEN | LOOKUP_CREATE)))
901 return -EISDIR;
902
903 current->total_link_count++;
904 if (current->total_link_count >= 40)
905 return -ELOOP;
906
907 mnt = path->dentry->d_op->d_automount(path);
908 if (IS_ERR(mnt)) {
909 /*
910 * The filesystem is allowed to return -EISDIR here to indicate
911 * it doesn't want to automount. For instance, autofs would do
912 * this so that its userspace daemon can mount on this dentry.
913 *
914 * However, we can only permit this if it's a terminal point in
915 * the path being looked up; if it wasn't then the remainder of
916 * the path is inaccessible and we should say so.
917 */
918 if (PTR_ERR(mnt) == -EISDIR && (flags & LOOKUP_CONTINUE))
919 return -EREMOTE;
920 return PTR_ERR(mnt);
921 }
922
923 if (!mnt) /* mount collision */
924 return 0;
925
926 err = finish_automount(mnt, path);
927
928 switch (err) {
929 case -EBUSY:
930 /* Someone else made a mount here whilst we were busy */
931 return 0;
932 case 0:
625 dput(path->dentry); 933 dput(path->dentry);
626 if (res) 934 if (*need_mntput)
627 mntput(path->mnt); 935 mntput(path->mnt);
936 path->mnt = mnt;
937 path->dentry = dget(mnt->mnt_root);
938 *need_mntput = true;
939 return 0;
940 default:
941 return err;
942 }
943
944}
945
946/*
947 * Handle a dentry that is managed in some way.
948 * - Flagged for transit management (autofs)
949 * - Flagged as mountpoint
950 * - Flagged as automount point
951 *
952 * This may only be called in refwalk mode.
953 *
954 * Serialization is taken care of in namespace.c
955 */
956static int follow_managed(struct path *path, unsigned flags)
957{
958 unsigned managed;
959 bool need_mntput = false;
960 int ret;
961
962 /* Given that we're not holding a lock here, we retain the value in a
963 * local variable for each dentry as we look at it so that we don't see
964 * the components of that value change under us */
965 while (managed = ACCESS_ONCE(path->dentry->d_flags),
966 managed &= DCACHE_MANAGED_DENTRY,
967 unlikely(managed != 0)) {
968 /* Allow the filesystem to manage the transit without i_mutex
969 * being held. */
970 if (managed & DCACHE_MANAGE_TRANSIT) {
971 BUG_ON(!path->dentry->d_op);
972 BUG_ON(!path->dentry->d_op->d_manage);
973 ret = path->dentry->d_op->d_manage(path->dentry,
974 false, false);
975 if (ret < 0)
976 return ret == -EISDIR ? 0 : ret;
977 }
978
979 /* Transit to a mounted filesystem. */
980 if (managed & DCACHE_MOUNTED) {
981 struct vfsmount *mounted = lookup_mnt(path);
982 if (mounted) {
983 dput(path->dentry);
984 if (need_mntput)
985 mntput(path->mnt);
986 path->mnt = mounted;
987 path->dentry = dget(mounted->mnt_root);
988 need_mntput = true;
989 continue;
990 }
991
992 /* Something is mounted on this dentry in another
993 * namespace and/or whatever was mounted there in this
994 * namespace got unmounted before we managed to get the
995 * vfsmount_lock */
996 }
997
998 /* Handle an automount point */
999 if (managed & DCACHE_NEED_AUTOMOUNT) {
1000 ret = follow_automount(path, flags, &need_mntput);
1001 if (ret < 0)
1002 return ret == -EISDIR ? 0 : ret;
1003 continue;
1004 }
1005
1006 /* We didn't change the current path point */
1007 break;
1008 }
1009 return 0;
1010}
1011
1012int follow_down_one(struct path *path)
1013{
1014 struct vfsmount *mounted;
1015
1016 mounted = lookup_mnt(path);
1017 if (mounted) {
1018 dput(path->dentry);
1019 mntput(path->mnt);
628 path->mnt = mounted; 1020 path->mnt = mounted;
629 path->dentry = dget(mounted->mnt_root); 1021 path->dentry = dget(mounted->mnt_root);
630 res = 1; 1022 return 1;
631 } 1023 }
632 return res; 1024 return 0;
633} 1025}
634 1026
635static void follow_mount(struct path *path) 1027/*
1028 * Skip to top of mountpoint pile in rcuwalk mode. We abort the rcu-walk if we
1029 * meet a managed dentry and we're not walking to "..". True is returned to
1030 * continue, false to abort.
1031 */
1032static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
1033 struct inode **inode, bool reverse_transit)
636{ 1034{
637 while (d_mountpoint(path->dentry)) { 1035 while (d_mountpoint(path->dentry)) {
638 struct vfsmount *mounted = lookup_mnt(path); 1036 struct vfsmount *mounted;
1037 if (unlikely(path->dentry->d_flags & DCACHE_MANAGE_TRANSIT) &&
1038 !reverse_transit &&
1039 path->dentry->d_op->d_manage(path->dentry, false, true) < 0)
1040 return false;
1041 mounted = __lookup_mnt(path->mnt, path->dentry, 1);
639 if (!mounted) 1042 if (!mounted)
640 break; 1043 break;
641 dput(path->dentry);
642 mntput(path->mnt);
643 path->mnt = mounted; 1044 path->mnt = mounted;
644 path->dentry = dget(mounted->mnt_root); 1045 path->dentry = mounted->mnt_root;
1046 nd->seq = read_seqcount_begin(&path->dentry->d_seq);
1047 *inode = path->dentry->d_inode;
1048 }
1049
1050 if (unlikely(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT))
1051 return reverse_transit;
1052 return true;
1053}
1054
1055static int follow_dotdot_rcu(struct nameidata *nd)
1056{
1057 struct inode *inode = nd->inode;
1058
1059 set_root_rcu(nd);
1060
1061 while (1) {
1062 if (nd->path.dentry == nd->root.dentry &&
1063 nd->path.mnt == nd->root.mnt) {
1064 break;
1065 }
1066 if (nd->path.dentry != nd->path.mnt->mnt_root) {
1067 struct dentry *old = nd->path.dentry;
1068 struct dentry *parent = old->d_parent;
1069 unsigned seq;
1070
1071 seq = read_seqcount_begin(&parent->d_seq);
1072 if (read_seqcount_retry(&old->d_seq, nd->seq))
1073 return -ECHILD;
1074 inode = parent->d_inode;
1075 nd->path.dentry = parent;
1076 nd->seq = seq;
1077 break;
1078 }
1079 if (!follow_up_rcu(&nd->path))
1080 break;
1081 nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
1082 inode = nd->path.dentry->d_inode;
645 } 1083 }
1084 __follow_mount_rcu(nd, &nd->path, &inode, true);
1085 nd->inode = inode;
1086
1087 return 0;
646} 1088}
647 1089
648/* no need for dcache_lock, as serialization is taken care in 1090/*
649 * namespace.c 1091 * Follow down to the covering mount currently visible to userspace. At each
1092 * point, the filesystem owning that dentry may be queried as to whether the
1093 * caller is permitted to proceed or not.
1094 *
1095 * Care must be taken as namespace_sem may be held (indicated by mounting_here
1096 * being true).
650 */ 1097 */
651int follow_down(struct path *path) 1098int follow_down(struct path *path, bool mounting_here)
652{ 1099{
653 struct vfsmount *mounted; 1100 unsigned managed;
1101 int ret;
654 1102
655 mounted = lookup_mnt(path); 1103 while (managed = ACCESS_ONCE(path->dentry->d_flags),
656 if (mounted) { 1104 unlikely(managed & DCACHE_MANAGED_DENTRY)) {
1105 /* Allow the filesystem to manage the transit without i_mutex
1106 * being held.
1107 *
1108 * We indicate to the filesystem if someone is trying to mount
1109 * something here. This gives autofs the chance to deny anyone
1110 * other than its daemon the right to mount on its
1111 * superstructure.
1112 *
1113 * The filesystem may sleep at this point.
1114 */
1115 if (managed & DCACHE_MANAGE_TRANSIT) {
1116 BUG_ON(!path->dentry->d_op);
1117 BUG_ON(!path->dentry->d_op->d_manage);
1118 ret = path->dentry->d_op->d_manage(
1119 path->dentry, mounting_here, false);
1120 if (ret < 0)
1121 return ret == -EISDIR ? 0 : ret;
1122 }
1123
1124 /* Transit to a mounted filesystem. */
1125 if (managed & DCACHE_MOUNTED) {
1126 struct vfsmount *mounted = lookup_mnt(path);
1127 if (!mounted)
1128 break;
1129 dput(path->dentry);
1130 mntput(path->mnt);
1131 path->mnt = mounted;
1132 path->dentry = dget(mounted->mnt_root);
1133 continue;
1134 }
1135
1136 /* Don't handle automount points here */
1137 break;
1138 }
1139 return 0;
1140}
1141
1142/*
1143 * Skip to top of mountpoint pile in refwalk mode for follow_dotdot()
1144 */
1145static void follow_mount(struct path *path)
1146{
1147 while (d_mountpoint(path->dentry)) {
1148 struct vfsmount *mounted = lookup_mnt(path);
1149 if (!mounted)
1150 break;
657 dput(path->dentry); 1151 dput(path->dentry);
658 mntput(path->mnt); 1152 mntput(path->mnt);
659 path->mnt = mounted; 1153 path->mnt = mounted;
660 path->dentry = dget(mounted->mnt_root); 1154 path->dentry = dget(mounted->mnt_root);
661 return 1;
662 } 1155 }
663 return 0;
664} 1156}
665 1157
666static __always_inline void follow_dotdot(struct nameidata *nd) 1158static void follow_dotdot(struct nameidata *nd)
667{ 1159{
668 set_root(nd); 1160 set_root(nd);
669 1161
@@ -684,6 +1176,7 @@ static __always_inline void follow_dotdot(struct nameidata *nd)
684 break; 1176 break;
685 } 1177 }
686 follow_mount(&nd->path); 1178 follow_mount(&nd->path);
1179 nd->inode = nd->path.dentry->d_inode;
687} 1180}
688 1181
689/* 1182/*
@@ -721,17 +1214,19 @@ static struct dentry *d_alloc_and_lookup(struct dentry *parent,
721 * It _is_ time-critical. 1214 * It _is_ time-critical.
722 */ 1215 */
723static int do_lookup(struct nameidata *nd, struct qstr *name, 1216static int do_lookup(struct nameidata *nd, struct qstr *name,
724 struct path *path) 1217 struct path *path, struct inode **inode)
725{ 1218{
726 struct vfsmount *mnt = nd->path.mnt; 1219 struct vfsmount *mnt = nd->path.mnt;
727 struct dentry *dentry, *parent; 1220 struct dentry *dentry, *parent = nd->path.dentry;
728 struct inode *dir; 1221 struct inode *dir;
1222 int err;
1223
729 /* 1224 /*
730 * See if the low-level filesystem might want 1225 * See if the low-level filesystem might want
731 * to use its own hash.. 1226 * to use its own hash..
732 */ 1227 */
733 if (nd->path.dentry->d_op && nd->path.dentry->d_op->d_hash) { 1228 if (unlikely(parent->d_flags & DCACHE_OP_HASH)) {
734 int err = nd->path.dentry->d_op->d_hash(nd->path.dentry, name); 1229 err = parent->d_op->d_hash(parent, nd->inode, name);
735 if (err < 0) 1230 if (err < 0)
736 return err; 1231 return err;
737 } 1232 }
@@ -741,21 +1236,52 @@ static int do_lookup(struct nameidata *nd, struct qstr *name,
741 * of a false negative due to a concurrent rename, we're going to 1236 * of a false negative due to a concurrent rename, we're going to
742 * do the non-racy lookup, below. 1237 * do the non-racy lookup, below.
743 */ 1238 */
744 dentry = __d_lookup(nd->path.dentry, name); 1239 if (nd->flags & LOOKUP_RCU) {
1240 unsigned seq;
1241
1242 *inode = nd->inode;
1243 dentry = __d_lookup_rcu(parent, name, &seq, inode);
1244 if (!dentry) {
1245 if (nameidata_drop_rcu(nd))
1246 return -ECHILD;
1247 goto need_lookup;
1248 }
1249 /* Memory barrier in read_seqcount_begin of child is enough */
1250 if (__read_seqcount_retry(&parent->d_seq, nd->seq))
1251 return -ECHILD;
1252
1253 nd->seq = seq;
1254 if (dentry->d_flags & DCACHE_OP_REVALIDATE)
1255 goto need_revalidate;
1256done2:
1257 path->mnt = mnt;
1258 path->dentry = dentry;
1259 if (likely(__follow_mount_rcu(nd, path, inode, false)))
1260 return 0;
1261 if (nameidata_drop_rcu(nd))
1262 return -ECHILD;
1263 /* fallthru */
1264 }
1265 dentry = __d_lookup(parent, name);
745 if (!dentry) 1266 if (!dentry)
746 goto need_lookup; 1267 goto need_lookup;
747found: 1268found:
748 if (dentry->d_op && dentry->d_op->d_revalidate) 1269 if (dentry->d_flags & DCACHE_OP_REVALIDATE)
749 goto need_revalidate; 1270 goto need_revalidate;
750done: 1271done:
751 path->mnt = mnt; 1272 path->mnt = mnt;
752 path->dentry = dentry; 1273 path->dentry = dentry;
753 __follow_mount(path); 1274 err = follow_managed(path, nd->flags);
1275 if (unlikely(err < 0)) {
1276 path_put_conditional(path, nd);
1277 return err;
1278 }
1279 *inode = path->dentry->d_inode;
754 return 0; 1280 return 0;
755 1281
756need_lookup: 1282need_lookup:
757 parent = nd->path.dentry;
758 dir = parent->d_inode; 1283 dir = parent->d_inode;
1284 BUG_ON(nd->inode != dir);
759 1285
760 mutex_lock(&dir->i_mutex); 1286 mutex_lock(&dir->i_mutex);
761 /* 1287 /*
@@ -789,6 +1315,8 @@ need_revalidate:
789 goto need_lookup; 1315 goto need_lookup;
790 if (IS_ERR(dentry)) 1316 if (IS_ERR(dentry))
791 goto fail; 1317 goto fail;
1318 if (nd->flags & LOOKUP_RCU)
1319 goto done2;
792 goto done; 1320 goto done;
793 1321
794fail: 1322fail:
@@ -796,17 +1324,6 @@ fail:
796} 1324}
797 1325
798/* 1326/*
799 * This is a temporary kludge to deal with "automount" symlinks; proper
800 * solution is to trigger them on follow_mount(), so that do_lookup()
801 * would DTRT. To be killed before 2.6.34-final.
802 */
803static inline int follow_on_final(struct inode *inode, unsigned lookup_flags)
804{
805 return inode && unlikely(inode->i_op->follow_link) &&
806 ((lookup_flags & LOOKUP_FOLLOW) || S_ISDIR(inode->i_mode));
807}
808
809/*
810 * Name resolution. 1327 * Name resolution.
811 * This is the basic name resolution function, turning a pathname into 1328 * This is the basic name resolution function, turning a pathname into
812 * the final dentry. We expect 'base' to be positive and a directory. 1329 * the final dentry. We expect 'base' to be positive and a directory.
@@ -817,7 +1334,6 @@ static inline int follow_on_final(struct inode *inode, unsigned lookup_flags)
817static int link_path_walk(const char *name, struct nameidata *nd) 1334static int link_path_walk(const char *name, struct nameidata *nd)
818{ 1335{
819 struct path next; 1336 struct path next;
820 struct inode *inode;
821 int err; 1337 int err;
822 unsigned int lookup_flags = nd->flags; 1338 unsigned int lookup_flags = nd->flags;
823 1339
@@ -826,18 +1342,28 @@ static int link_path_walk(const char *name, struct nameidata *nd)
826 if (!*name) 1342 if (!*name)
827 goto return_reval; 1343 goto return_reval;
828 1344
829 inode = nd->path.dentry->d_inode;
830 if (nd->depth) 1345 if (nd->depth)
831 lookup_flags = LOOKUP_FOLLOW | (nd->flags & LOOKUP_CONTINUE); 1346 lookup_flags = LOOKUP_FOLLOW | (nd->flags & LOOKUP_CONTINUE);
832 1347
833 /* At this point we know we have a real path component. */ 1348 /* At this point we know we have a real path component. */
834 for(;;) { 1349 for(;;) {
1350 struct inode *inode;
835 unsigned long hash; 1351 unsigned long hash;
836 struct qstr this; 1352 struct qstr this;
837 unsigned int c; 1353 unsigned int c;
838 1354
839 nd->flags |= LOOKUP_CONTINUE; 1355 nd->flags |= LOOKUP_CONTINUE;
840 err = exec_permission(inode); 1356 if (nd->flags & LOOKUP_RCU) {
1357 err = exec_permission(nd->inode, IPERM_FLAG_RCU);
1358 if (err == -ECHILD) {
1359 if (nameidata_drop_rcu(nd))
1360 return -ECHILD;
1361 goto exec_again;
1362 }
1363 } else {
1364exec_again:
1365 err = exec_permission(nd->inode, 0);
1366 }
841 if (err) 1367 if (err)
842 break; 1368 break;
843 1369
@@ -868,37 +1394,44 @@ static int link_path_walk(const char *name, struct nameidata *nd)
868 if (this.name[0] == '.') switch (this.len) { 1394 if (this.name[0] == '.') switch (this.len) {
869 default: 1395 default:
870 break; 1396 break;
871 case 2: 1397 case 2:
872 if (this.name[1] != '.') 1398 if (this.name[1] != '.')
873 break; 1399 break;
874 follow_dotdot(nd); 1400 if (nd->flags & LOOKUP_RCU) {
875 inode = nd->path.dentry->d_inode; 1401 if (follow_dotdot_rcu(nd))
1402 return -ECHILD;
1403 } else
1404 follow_dotdot(nd);
876 /* fallthrough */ 1405 /* fallthrough */
877 case 1: 1406 case 1:
878 continue; 1407 continue;
879 } 1408 }
880 /* This does the actual lookups.. */ 1409 /* This does the actual lookups.. */
881 err = do_lookup(nd, &this, &next); 1410 err = do_lookup(nd, &this, &next, &inode);
882 if (err) 1411 if (err)
883 break; 1412 break;
884
885 err = -ENOENT; 1413 err = -ENOENT;
886 inode = next.dentry->d_inode;
887 if (!inode) 1414 if (!inode)
888 goto out_dput; 1415 goto out_dput;
889 1416
890 if (inode->i_op->follow_link) { 1417 if (inode->i_op->follow_link) {
1418 /* We commonly drop rcu-walk here */
1419 if (nameidata_dentry_drop_rcu_maybe(nd, next.dentry))
1420 return -ECHILD;
1421 BUG_ON(inode != next.dentry->d_inode);
891 err = do_follow_link(&next, nd); 1422 err = do_follow_link(&next, nd);
892 if (err) 1423 if (err)
893 goto return_err; 1424 goto return_err;
1425 nd->inode = nd->path.dentry->d_inode;
894 err = -ENOENT; 1426 err = -ENOENT;
895 inode = nd->path.dentry->d_inode; 1427 if (!nd->inode)
896 if (!inode)
897 break; 1428 break;
898 } else 1429 } else {
899 path_to_nameidata(&next, nd); 1430 path_to_nameidata(&next, nd);
1431 nd->inode = inode;
1432 }
900 err = -ENOTDIR; 1433 err = -ENOTDIR;
901 if (!inode->i_op->lookup) 1434 if (!nd->inode->i_op->lookup)
902 break; 1435 break;
903 continue; 1436 continue;
904 /* here ends the main loop */ 1437 /* here ends the main loop */
@@ -913,32 +1446,40 @@ last_component:
913 if (this.name[0] == '.') switch (this.len) { 1446 if (this.name[0] == '.') switch (this.len) {
914 default: 1447 default:
915 break; 1448 break;
916 case 2: 1449 case 2:
917 if (this.name[1] != '.') 1450 if (this.name[1] != '.')
918 break; 1451 break;
919 follow_dotdot(nd); 1452 if (nd->flags & LOOKUP_RCU) {
920 inode = nd->path.dentry->d_inode; 1453 if (follow_dotdot_rcu(nd))
1454 return -ECHILD;
1455 } else
1456 follow_dotdot(nd);
921 /* fallthrough */ 1457 /* fallthrough */
922 case 1: 1458 case 1:
923 goto return_reval; 1459 goto return_reval;
924 } 1460 }
925 err = do_lookup(nd, &this, &next); 1461 err = do_lookup(nd, &this, &next, &inode);
926 if (err) 1462 if (err)
927 break; 1463 break;
928 inode = next.dentry->d_inode; 1464 if (inode && unlikely(inode->i_op->follow_link) &&
929 if (follow_on_final(inode, lookup_flags)) { 1465 (lookup_flags & LOOKUP_FOLLOW)) {
1466 if (nameidata_dentry_drop_rcu_maybe(nd, next.dentry))
1467 return -ECHILD;
1468 BUG_ON(inode != next.dentry->d_inode);
930 err = do_follow_link(&next, nd); 1469 err = do_follow_link(&next, nd);
931 if (err) 1470 if (err)
932 goto return_err; 1471 goto return_err;
933 inode = nd->path.dentry->d_inode; 1472 nd->inode = nd->path.dentry->d_inode;
934 } else 1473 } else {
935 path_to_nameidata(&next, nd); 1474 path_to_nameidata(&next, nd);
1475 nd->inode = inode;
1476 }
936 err = -ENOENT; 1477 err = -ENOENT;
937 if (!inode) 1478 if (!nd->inode)
938 break; 1479 break;
939 if (lookup_flags & LOOKUP_DIRECTORY) { 1480 if (lookup_flags & LOOKUP_DIRECTORY) {
940 err = -ENOTDIR; 1481 err = -ENOTDIR;
941 if (!inode->i_op->lookup) 1482 if (!nd->inode->i_op->lookup)
942 break; 1483 break;
943 } 1484 }
944 goto return_base; 1485 goto return_base;
@@ -958,25 +1499,43 @@ return_reval:
958 * We bypassed the ordinary revalidation routines. 1499 * We bypassed the ordinary revalidation routines.
959 * We may need to check the cached dentry for staleness. 1500 * We may need to check the cached dentry for staleness.
960 */ 1501 */
961 if (nd->path.dentry && nd->path.dentry->d_sb && 1502 if (need_reval_dot(nd->path.dentry)) {
962 (nd->path.dentry->d_sb->s_type->fs_flags & FS_REVAL_DOT)) {
963 err = -ESTALE;
964 /* Note: we do not d_invalidate() */ 1503 /* Note: we do not d_invalidate() */
965 if (!nd->path.dentry->d_op->d_revalidate( 1504 err = d_revalidate(nd->path.dentry, nd);
966 nd->path.dentry, nd)) 1505 if (!err)
1506 err = -ESTALE;
1507 if (err < 0)
967 break; 1508 break;
968 } 1509 }
969return_base: 1510return_base:
1511 if (nameidata_drop_rcu_last_maybe(nd))
1512 return -ECHILD;
970 return 0; 1513 return 0;
971out_dput: 1514out_dput:
972 path_put_conditional(&next, nd); 1515 if (!(nd->flags & LOOKUP_RCU))
1516 path_put_conditional(&next, nd);
973 break; 1517 break;
974 } 1518 }
975 path_put(&nd->path); 1519 if (!(nd->flags & LOOKUP_RCU))
1520 path_put(&nd->path);
976return_err: 1521return_err:
977 return err; 1522 return err;
978} 1523}
979 1524
1525static inline int path_walk_rcu(const char *name, struct nameidata *nd)
1526{
1527 current->total_link_count = 0;
1528
1529 return link_path_walk(name, nd);
1530}
1531
1532static inline int path_walk_simple(const char *name, struct nameidata *nd)
1533{
1534 current->total_link_count = 0;
1535
1536 return link_path_walk(name, nd);
1537}
1538
980static int path_walk(const char *name, struct nameidata *nd) 1539static int path_walk(const char *name, struct nameidata *nd)
981{ 1540{
982 struct path save = nd->path; 1541 struct path save = nd->path;
@@ -1002,6 +1561,93 @@ static int path_walk(const char *name, struct nameidata *nd)
1002 return result; 1561 return result;
1003} 1562}
1004 1563
1564static void path_finish_rcu(struct nameidata *nd)
1565{
1566 if (nd->flags & LOOKUP_RCU) {
1567 /* RCU dangling. Cancel it. */
1568 nd->flags &= ~LOOKUP_RCU;
1569 nd->root.mnt = NULL;
1570 rcu_read_unlock();
1571 br_read_unlock(vfsmount_lock);
1572 }
1573 if (nd->file)
1574 fput(nd->file);
1575}
1576
1577static int path_init_rcu(int dfd, const char *name, unsigned int flags, struct nameidata *nd)
1578{
1579 int retval = 0;
1580 int fput_needed;
1581 struct file *file;
1582
1583 nd->last_type = LAST_ROOT; /* if there are only slashes... */
1584 nd->flags = flags | LOOKUP_RCU;
1585 nd->depth = 0;
1586 nd->root.mnt = NULL;
1587 nd->file = NULL;
1588
1589 if (*name=='/') {
1590 struct fs_struct *fs = current->fs;
1591 unsigned seq;
1592
1593 br_read_lock(vfsmount_lock);
1594 rcu_read_lock();
1595
1596 do {
1597 seq = read_seqcount_begin(&fs->seq);
1598 nd->root = fs->root;
1599 nd->path = nd->root;
1600 nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
1601 } while (read_seqcount_retry(&fs->seq, seq));
1602
1603 } else if (dfd == AT_FDCWD) {
1604 struct fs_struct *fs = current->fs;
1605 unsigned seq;
1606
1607 br_read_lock(vfsmount_lock);
1608 rcu_read_lock();
1609
1610 do {
1611 seq = read_seqcount_begin(&fs->seq);
1612 nd->path = fs->pwd;
1613 nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
1614 } while (read_seqcount_retry(&fs->seq, seq));
1615
1616 } else {
1617 struct dentry *dentry;
1618
1619 file = fget_light(dfd, &fput_needed);
1620 retval = -EBADF;
1621 if (!file)
1622 goto out_fail;
1623
1624 dentry = file->f_path.dentry;
1625
1626 retval = -ENOTDIR;
1627 if (!S_ISDIR(dentry->d_inode->i_mode))
1628 goto fput_fail;
1629
1630 retval = file_permission(file, MAY_EXEC);
1631 if (retval)
1632 goto fput_fail;
1633
1634 nd->path = file->f_path;
1635 if (fput_needed)
1636 nd->file = file;
1637
1638 nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
1639 br_read_lock(vfsmount_lock);
1640 rcu_read_lock();
1641 }
1642 nd->inode = nd->path.dentry->d_inode;
1643 return 0;
1644
1645fput_fail:
1646 fput_light(file, fput_needed);
1647out_fail:
1648 return retval;
1649}
1650
1005static int path_init(int dfd, const char *name, unsigned int flags, struct nameidata *nd) 1651static int path_init(int dfd, const char *name, unsigned int flags, struct nameidata *nd)
1006{ 1652{
1007 int retval = 0; 1653 int retval = 0;
@@ -1042,6 +1688,7 @@ static int path_init(int dfd, const char *name, unsigned int flags, struct namei
1042 1688
1043 fput_light(file, fput_needed); 1689 fput_light(file, fput_needed);
1044 } 1690 }
1691 nd->inode = nd->path.dentry->d_inode;
1045 return 0; 1692 return 0;
1046 1693
1047fput_fail: 1694fput_fail:
@@ -1054,16 +1701,53 @@ out_fail:
1054static int do_path_lookup(int dfd, const char *name, 1701static int do_path_lookup(int dfd, const char *name,
1055 unsigned int flags, struct nameidata *nd) 1702 unsigned int flags, struct nameidata *nd)
1056{ 1703{
1057 int retval = path_init(dfd, name, flags, nd); 1704 int retval;
1058 if (!retval) 1705
1059 retval = path_walk(name, nd); 1706 /*
1060 if (unlikely(!retval && !audit_dummy_context() && nd->path.dentry && 1707 * Path walking is largely split up into 2 different synchronisation
1061 nd->path.dentry->d_inode)) 1708 * schemes, rcu-walk and ref-walk (explained in
1062 audit_inode(name, nd->path.dentry); 1709 * Documentation/filesystems/path-lookup.txt). These share much of the
1710 * path walk code, but some things particularly setup, cleanup, and
1711 * following mounts are sufficiently divergent that functions are
1712 * duplicated. Typically there is a function foo(), and its RCU
1713 * analogue, foo_rcu().
1714 *
1715 * -ECHILD is the error number of choice (just to avoid clashes) that
1716 * is returned if some aspect of an rcu-walk fails. Such an error must
1717 * be handled by restarting a traditional ref-walk (which will always
1718 * be able to complete).
1719 */
1720 retval = path_init_rcu(dfd, name, flags, nd);
1721 if (unlikely(retval))
1722 return retval;
1723 retval = path_walk_rcu(name, nd);
1724 path_finish_rcu(nd);
1063 if (nd->root.mnt) { 1725 if (nd->root.mnt) {
1064 path_put(&nd->root); 1726 path_put(&nd->root);
1065 nd->root.mnt = NULL; 1727 nd->root.mnt = NULL;
1066 } 1728 }
1729
1730 if (unlikely(retval == -ECHILD || retval == -ESTALE)) {
1731 /* slower, locked walk */
1732 if (retval == -ESTALE)
1733 flags |= LOOKUP_REVAL;
1734 retval = path_init(dfd, name, flags, nd);
1735 if (unlikely(retval))
1736 return retval;
1737 retval = path_walk(name, nd);
1738 if (nd->root.mnt) {
1739 path_put(&nd->root);
1740 nd->root.mnt = NULL;
1741 }
1742 }
1743
1744 if (likely(!retval)) {
1745 if (unlikely(!audit_dummy_context())) {
1746 if (nd->path.dentry && nd->inode)
1747 audit_inode(name, nd->path.dentry);
1748 }
1749 }
1750
1067 return retval; 1751 return retval;
1068} 1752}
1069 1753
@@ -1106,10 +1790,11 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
1106 path_get(&nd->path); 1790 path_get(&nd->path);
1107 nd->root = nd->path; 1791 nd->root = nd->path;
1108 path_get(&nd->root); 1792 path_get(&nd->root);
1793 nd->inode = nd->path.dentry->d_inode;
1109 1794
1110 retval = path_walk(name, nd); 1795 retval = path_walk(name, nd);
1111 if (unlikely(!retval && !audit_dummy_context() && nd->path.dentry && 1796 if (unlikely(!retval && !audit_dummy_context() && nd->path.dentry &&
1112 nd->path.dentry->d_inode)) 1797 nd->inode))
1113 audit_inode(name, nd->path.dentry); 1798 audit_inode(name, nd->path.dentry);
1114 1799
1115 path_put(&nd->root); 1800 path_put(&nd->root);
@@ -1125,7 +1810,7 @@ static struct dentry *__lookup_hash(struct qstr *name,
1125 struct dentry *dentry; 1810 struct dentry *dentry;
1126 int err; 1811 int err;
1127 1812
1128 err = exec_permission(inode); 1813 err = exec_permission(inode, 0);
1129 if (err) 1814 if (err)
1130 return ERR_PTR(err); 1815 return ERR_PTR(err);
1131 1816
@@ -1133,8 +1818,8 @@ static struct dentry *__lookup_hash(struct qstr *name,
1133 * See if the low-level filesystem might want 1818 * See if the low-level filesystem might want
1134 * to use its own hash.. 1819 * to use its own hash..
1135 */ 1820 */
1136 if (base->d_op && base->d_op->d_hash) { 1821 if (base->d_flags & DCACHE_OP_HASH) {
1137 err = base->d_op->d_hash(base, name); 1822 err = base->d_op->d_hash(base, inode, name);
1138 dentry = ERR_PTR(err); 1823 dentry = ERR_PTR(err);
1139 if (err < 0) 1824 if (err < 0)
1140 goto out; 1825 goto out;
@@ -1147,7 +1832,7 @@ static struct dentry *__lookup_hash(struct qstr *name,
1147 */ 1832 */
1148 dentry = d_lookup(base, name); 1833 dentry = d_lookup(base, name);
1149 1834
1150 if (dentry && dentry->d_op && dentry->d_op->d_revalidate) 1835 if (dentry && (dentry->d_flags & DCACHE_OP_REVALIDATE))
1151 dentry = do_revalidate(dentry, nd); 1836 dentry = do_revalidate(dentry, nd);
1152 1837
1153 if (!dentry) 1838 if (!dentry)
@@ -1448,8 +2133,9 @@ int may_open(struct path *path, int acc_mode, int flag)
1448 return break_lease(inode, flag); 2133 return break_lease(inode, flag);
1449} 2134}
1450 2135
1451static int handle_truncate(struct path *path) 2136static int handle_truncate(struct file *filp)
1452{ 2137{
2138 struct path *path = &filp->f_path;
1453 struct inode *inode = path->dentry->d_inode; 2139 struct inode *inode = path->dentry->d_inode;
1454 int error = get_write_access(inode); 2140 int error = get_write_access(inode);
1455 if (error) 2141 if (error)
@@ -1463,7 +2149,7 @@ static int handle_truncate(struct path *path)
1463 if (!error) { 2149 if (!error) {
1464 error = do_truncate(path->dentry, 0, 2150 error = do_truncate(path->dentry, 0,
1465 ATTR_MTIME|ATTR_CTIME|ATTR_OPEN, 2151 ATTR_MTIME|ATTR_CTIME|ATTR_OPEN,
1466 NULL); 2152 filp);
1467 } 2153 }
1468 put_write_access(inode); 2154 put_write_access(inode);
1469 return error; 2155 return error;
@@ -1490,6 +2176,7 @@ out_unlock:
1490 mutex_unlock(&dir->d_inode->i_mutex); 2176 mutex_unlock(&dir->d_inode->i_mutex);
1491 dput(nd->path.dentry); 2177 dput(nd->path.dentry);
1492 nd->path.dentry = path->dentry; 2178 nd->path.dentry = path->dentry;
2179
1493 if (error) 2180 if (error)
1494 return error; 2181 return error;
1495 /* Don't check for write permission, don't truncate */ 2182 /* Don't check for write permission, don't truncate */
@@ -1560,7 +2247,7 @@ static struct file *finish_open(struct nameidata *nd,
1560 } 2247 }
1561 if (!IS_ERR(filp)) { 2248 if (!IS_ERR(filp)) {
1562 if (will_truncate) { 2249 if (will_truncate) {
1563 error = handle_truncate(&nd->path); 2250 error = handle_truncate(filp);
1564 if (error) { 2251 if (error) {
1565 fput(filp); 2252 fput(filp);
1566 filp = ERR_PTR(error); 2253 filp = ERR_PTR(error);
@@ -1584,6 +2271,9 @@ exit:
1584 return ERR_PTR(error); 2271 return ERR_PTR(error);
1585} 2272}
1586 2273
2274/*
2275 * Handle O_CREAT case for do_filp_open
2276 */
1587static struct file *do_last(struct nameidata *nd, struct path *path, 2277static struct file *do_last(struct nameidata *nd, struct path *path,
1588 int open_flag, int acc_mode, 2278 int open_flag, int acc_mode,
1589 int mode, const char *pathname) 2279 int mode, const char *pathname)
@@ -1597,50 +2287,27 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
1597 follow_dotdot(nd); 2287 follow_dotdot(nd);
1598 dir = nd->path.dentry; 2288 dir = nd->path.dentry;
1599 case LAST_DOT: 2289 case LAST_DOT:
1600 if (nd->path.mnt->mnt_sb->s_type->fs_flags & FS_REVAL_DOT) { 2290 if (need_reval_dot(dir)) {
1601 if (!dir->d_op->d_revalidate(dir, nd)) { 2291 int status = d_revalidate(nd->path.dentry, nd);
1602 error = -ESTALE; 2292 if (!status)
2293 status = -ESTALE;
2294 if (status < 0) {
2295 error = status;
1603 goto exit; 2296 goto exit;
1604 } 2297 }
1605 } 2298 }
1606 /* fallthrough */ 2299 /* fallthrough */
1607 case LAST_ROOT: 2300 case LAST_ROOT:
1608 if (open_flag & O_CREAT) 2301 goto exit;
1609 goto exit;
1610 /* fallthrough */
1611 case LAST_BIND: 2302 case LAST_BIND:
1612 audit_inode(pathname, dir); 2303 audit_inode(pathname, dir);
1613 goto ok; 2304 goto ok;
1614 } 2305 }
1615 2306
1616 /* trailing slashes? */ 2307 /* trailing slashes? */
1617 if (nd->last.name[nd->last.len]) { 2308 if (nd->last.name[nd->last.len])
1618 if (open_flag & O_CREAT) 2309 goto exit;
1619 goto exit;
1620 nd->flags |= LOOKUP_DIRECTORY | LOOKUP_FOLLOW;
1621 }
1622
1623 /* just plain open? */
1624 if (!(open_flag & O_CREAT)) {
1625 error = do_lookup(nd, &nd->last, path);
1626 if (error)
1627 goto exit;
1628 error = -ENOENT;
1629 if (!path->dentry->d_inode)
1630 goto exit_dput;
1631 if (path->dentry->d_inode->i_op->follow_link)
1632 return NULL;
1633 error = -ENOTDIR;
1634 if (nd->flags & LOOKUP_DIRECTORY) {
1635 if (!path->dentry->d_inode->i_op->lookup)
1636 goto exit_dput;
1637 }
1638 path_to_nameidata(path, nd);
1639 audit_inode(pathname, nd->path.dentry);
1640 goto ok;
1641 }
1642 2310
1643 /* OK, it's O_CREAT */
1644 mutex_lock(&dir->d_inode->i_mutex); 2311 mutex_lock(&dir->d_inode->i_mutex);
1645 2312
1646 path->dentry = lookup_hash(nd); 2313 path->dentry = lookup_hash(nd);
@@ -1697,11 +2364,9 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
1697 if (open_flag & O_EXCL) 2364 if (open_flag & O_EXCL)
1698 goto exit_dput; 2365 goto exit_dput;
1699 2366
1700 if (__follow_mount(path)) { 2367 error = follow_managed(path, nd->flags);
1701 error = -ELOOP; 2368 if (error < 0)
1702 if (open_flag & O_NOFOLLOW) 2369 goto exit_dput;
1703 goto exit_dput;
1704 }
1705 2370
1706 error = -ENOENT; 2371 error = -ENOENT;
1707 if (!path->dentry->d_inode) 2372 if (!path->dentry->d_inode)
@@ -1711,8 +2376,9 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
1711 return NULL; 2376 return NULL;
1712 2377
1713 path_to_nameidata(path, nd); 2378 path_to_nameidata(path, nd);
2379 nd->inode = path->dentry->d_inode;
1714 error = -EISDIR; 2380 error = -EISDIR;
1715 if (S_ISDIR(path->dentry->d_inode->i_mode)) 2381 if (S_ISDIR(nd->inode->i_mode))
1716 goto exit; 2382 goto exit;
1717ok: 2383ok:
1718 filp = finish_open(nd, open_flag, acc_mode); 2384 filp = finish_open(nd, open_flag, acc_mode);
@@ -1743,11 +2409,14 @@ struct file *do_filp_open(int dfd, const char *pathname,
1743 struct path path; 2409 struct path path;
1744 int count = 0; 2410 int count = 0;
1745 int flag = open_to_namei_flags(open_flag); 2411 int flag = open_to_namei_flags(open_flag);
1746 int force_reval = 0; 2412 int flags;
1747 2413
1748 if (!(open_flag & O_CREAT)) 2414 if (!(open_flag & O_CREAT))
1749 mode = 0; 2415 mode = 0;
1750 2416
2417 /* Must never be set by userspace */
2418 open_flag &= ~FMODE_NONOTIFY;
2419
1751 /* 2420 /*
1752 * O_SYNC is implemented as __O_SYNC|O_DSYNC. As many places only 2421 * O_SYNC is implemented as __O_SYNC|O_DSYNC. As many places only
1753 * check for O_DSYNC if the need any syncing at all we enforce it's 2422 * check for O_DSYNC if the need any syncing at all we enforce it's
@@ -1769,54 +2438,84 @@ struct file *do_filp_open(int dfd, const char *pathname,
1769 if (open_flag & O_APPEND) 2438 if (open_flag & O_APPEND)
1770 acc_mode |= MAY_APPEND; 2439 acc_mode |= MAY_APPEND;
1771 2440
1772 /* find the parent */ 2441 flags = LOOKUP_OPEN;
1773reval: 2442 if (open_flag & O_CREAT) {
1774 error = path_init(dfd, pathname, LOOKUP_PARENT, &nd); 2443 flags |= LOOKUP_CREATE;
2444 if (open_flag & O_EXCL)
2445 flags |= LOOKUP_EXCL;
2446 }
2447 if (open_flag & O_DIRECTORY)
2448 flags |= LOOKUP_DIRECTORY;
2449 if (!(open_flag & O_NOFOLLOW))
2450 flags |= LOOKUP_FOLLOW;
2451
2452 filp = get_empty_filp();
2453 if (!filp)
2454 return ERR_PTR(-ENFILE);
2455
2456 filp->f_flags = open_flag;
2457 nd.intent.open.file = filp;
2458 nd.intent.open.flags = flag;
2459 nd.intent.open.create_mode = mode;
2460
2461 if (open_flag & O_CREAT)
2462 goto creat;
2463
2464 /* !O_CREAT, simple open */
2465 error = do_path_lookup(dfd, pathname, flags, &nd);
2466 if (unlikely(error))
2467 goto out_filp;
2468 error = -ELOOP;
2469 if (!(nd.flags & LOOKUP_FOLLOW)) {
2470 if (nd.inode->i_op->follow_link)
2471 goto out_path;
2472 }
2473 error = -ENOTDIR;
2474 if (nd.flags & LOOKUP_DIRECTORY) {
2475 if (!nd.inode->i_op->lookup)
2476 goto out_path;
2477 }
2478 audit_inode(pathname, nd.path.dentry);
2479 filp = finish_open(&nd, open_flag, acc_mode);
2480 return filp;
2481
2482creat:
2483 /* OK, have to create the file. Find the parent. */
2484 error = path_init_rcu(dfd, pathname,
2485 LOOKUP_PARENT | (flags & LOOKUP_REVAL), &nd);
1775 if (error) 2486 if (error)
1776 return ERR_PTR(error); 2487 goto out_filp;
1777 if (force_reval) 2488 error = path_walk_rcu(pathname, &nd);
1778 nd.flags |= LOOKUP_REVAL; 2489 path_finish_rcu(&nd);
2490 if (unlikely(error == -ECHILD || error == -ESTALE)) {
2491 /* slower, locked walk */
2492 if (error == -ESTALE) {
2493reval:
2494 flags |= LOOKUP_REVAL;
2495 }
2496 error = path_init(dfd, pathname,
2497 LOOKUP_PARENT | (flags & LOOKUP_REVAL), &nd);
2498 if (error)
2499 goto out_filp;
1779 2500
1780 current->total_link_count = 0; 2501 error = path_walk_simple(pathname, &nd);
1781 error = link_path_walk(pathname, &nd);
1782 if (error) {
1783 filp = ERR_PTR(error);
1784 goto out;
1785 } 2502 }
1786 if (unlikely(!audit_dummy_context()) && (open_flag & O_CREAT)) 2503 if (unlikely(error))
2504 goto out_filp;
2505 if (unlikely(!audit_dummy_context()))
1787 audit_inode(pathname, nd.path.dentry); 2506 audit_inode(pathname, nd.path.dentry);
1788 2507
1789 /* 2508 /*
1790 * We have the parent and last component. 2509 * We have the parent and last component.
1791 */ 2510 */
1792 2511 nd.flags = flags;
1793 error = -ENFILE;
1794 filp = get_empty_filp();
1795 if (filp == NULL)
1796 goto exit_parent;
1797 nd.intent.open.file = filp;
1798 filp->f_flags = open_flag;
1799 nd.intent.open.flags = flag;
1800 nd.intent.open.create_mode = mode;
1801 nd.flags &= ~LOOKUP_PARENT;
1802 nd.flags |= LOOKUP_OPEN;
1803 if (open_flag & O_CREAT) {
1804 nd.flags |= LOOKUP_CREATE;
1805 if (open_flag & O_EXCL)
1806 nd.flags |= LOOKUP_EXCL;
1807 }
1808 if (open_flag & O_DIRECTORY)
1809 nd.flags |= LOOKUP_DIRECTORY;
1810 if (!(open_flag & O_NOFOLLOW))
1811 nd.flags |= LOOKUP_FOLLOW;
1812 filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname); 2512 filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname);
1813 while (unlikely(!filp)) { /* trailing symlink */ 2513 while (unlikely(!filp)) { /* trailing symlink */
1814 struct path holder; 2514 struct path link = path;
1815 struct inode *inode = path.dentry->d_inode; 2515 struct inode *linki = link.dentry->d_inode;
1816 void *cookie; 2516 void *cookie;
1817 error = -ELOOP; 2517 error = -ELOOP;
1818 /* S_ISDIR part is a temporary automount kludge */ 2518 if (!(nd.flags & LOOKUP_FOLLOW))
1819 if (!(nd.flags & LOOKUP_FOLLOW) && !S_ISDIR(inode->i_mode))
1820 goto exit_dput; 2519 goto exit_dput;
1821 if (count++ == 32) 2520 if (count++ == 32)
1822 goto exit_dput; 2521 goto exit_dput;
@@ -1832,41 +2531,37 @@ reval:
1832 * just set LAST_BIND. 2531 * just set LAST_BIND.
1833 */ 2532 */
1834 nd.flags |= LOOKUP_PARENT; 2533 nd.flags |= LOOKUP_PARENT;
1835 error = security_inode_follow_link(path.dentry, &nd); 2534 error = security_inode_follow_link(link.dentry, &nd);
1836 if (error) 2535 if (error)
1837 goto exit_dput; 2536 goto exit_dput;
1838 error = __do_follow_link(&path, &nd, &cookie); 2537 error = __do_follow_link(&link, &nd, &cookie);
1839 if (unlikely(error)) { 2538 if (unlikely(error)) {
2539 if (!IS_ERR(cookie) && linki->i_op->put_link)
2540 linki->i_op->put_link(link.dentry, &nd, cookie);
1840 /* nd.path had been dropped */ 2541 /* nd.path had been dropped */
1841 if (!IS_ERR(cookie) && inode->i_op->put_link) 2542 nd.path = link;
1842 inode->i_op->put_link(path.dentry, &nd, cookie); 2543 goto out_path;
1843 path_put(&path);
1844 release_open_intent(&nd);
1845 filp = ERR_PTR(error);
1846 goto out;
1847 } 2544 }
1848 holder = path;
1849 nd.flags &= ~LOOKUP_PARENT; 2545 nd.flags &= ~LOOKUP_PARENT;
1850 filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname); 2546 filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname);
1851 if (inode->i_op->put_link) 2547 if (linki->i_op->put_link)
1852 inode->i_op->put_link(holder.dentry, &nd, cookie); 2548 linki->i_op->put_link(link.dentry, &nd, cookie);
1853 path_put(&holder); 2549 path_put(&link);
1854 } 2550 }
1855out: 2551out:
1856 if (nd.root.mnt) 2552 if (nd.root.mnt)
1857 path_put(&nd.root); 2553 path_put(&nd.root);
1858 if (filp == ERR_PTR(-ESTALE) && !force_reval) { 2554 if (filp == ERR_PTR(-ESTALE) && !(flags & LOOKUP_REVAL))
1859 force_reval = 1;
1860 goto reval; 2555 goto reval;
1861 }
1862 return filp; 2556 return filp;
1863 2557
1864exit_dput: 2558exit_dput:
1865 path_put_conditional(&path, &nd); 2559 path_put_conditional(&path, &nd);
2560out_path:
2561 path_put(&nd.path);
2562out_filp:
1866 if (!IS_ERR(nd.intent.open.file)) 2563 if (!IS_ERR(nd.intent.open.file))
1867 release_open_intent(&nd); 2564 release_open_intent(&nd);
1868exit_parent:
1869 path_put(&nd.path);
1870 filp = ERR_PTR(error); 2565 filp = ERR_PTR(error);
1871 goto out; 2566 goto out;
1872} 2567}
@@ -2127,12 +2822,10 @@ void dentry_unhash(struct dentry *dentry)
2127{ 2822{
2128 dget(dentry); 2823 dget(dentry);
2129 shrink_dcache_parent(dentry); 2824 shrink_dcache_parent(dentry);
2130 spin_lock(&dcache_lock);
2131 spin_lock(&dentry->d_lock); 2825 spin_lock(&dentry->d_lock);
2132 if (atomic_read(&dentry->d_count) == 2) 2826 if (dentry->d_count == 2)
2133 __d_drop(dentry); 2827 __d_drop(dentry);
2134 spin_unlock(&dentry->d_lock); 2828 spin_unlock(&dentry->d_lock);
2135 spin_unlock(&dcache_lock);
2136} 2829}
2137 2830
2138int vfs_rmdir(struct inode *dir, struct dentry *dentry) 2831int vfs_rmdir(struct inode *dir, struct dentry *dentry)
@@ -2881,6 +3574,7 @@ const struct inode_operations page_symlink_inode_operations = {
2881}; 3574};
2882 3575
2883EXPORT_SYMBOL(user_path_at); 3576EXPORT_SYMBOL(user_path_at);
3577EXPORT_SYMBOL(follow_down_one);
2884EXPORT_SYMBOL(follow_down); 3578EXPORT_SYMBOL(follow_down);
2885EXPORT_SYMBOL(follow_up); 3579EXPORT_SYMBOL(follow_up);
2886EXPORT_SYMBOL(get_write_access); /* binfmt_aout */ 3580EXPORT_SYMBOL(get_write_access); /* binfmt_aout */
diff --git a/fs/namespace.c b/fs/namespace.c
index 8a415c9c5e5..7b0b9537169 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -13,7 +13,6 @@
13#include <linux/sched.h> 13#include <linux/sched.h>
14#include <linux/spinlock.h> 14#include <linux/spinlock.h>
15#include <linux/percpu.h> 15#include <linux/percpu.h>
16#include <linux/smp_lock.h>
17#include <linux/init.h> 16#include <linux/init.h>
18#include <linux/kernel.h> 17#include <linux/kernel.h>
19#include <linux/acct.h> 18#include <linux/acct.h>
@@ -139,6 +138,64 @@ void mnt_release_group_id(struct vfsmount *mnt)
139 mnt->mnt_group_id = 0; 138 mnt->mnt_group_id = 0;
140} 139}
141 140
141/*
142 * vfsmount lock must be held for read
143 */
144static inline void mnt_add_count(struct vfsmount *mnt, int n)
145{
146#ifdef CONFIG_SMP
147 this_cpu_add(mnt->mnt_pcp->mnt_count, n);
148#else
149 preempt_disable();
150 mnt->mnt_count += n;
151 preempt_enable();
152#endif
153}
154
155static inline void mnt_set_count(struct vfsmount *mnt, int n)
156{
157#ifdef CONFIG_SMP
158 this_cpu_write(mnt->mnt_pcp->mnt_count, n);
159#else
160 mnt->mnt_count = n;
161#endif
162}
163
164/*
165 * vfsmount lock must be held for read
166 */
167static inline void mnt_inc_count(struct vfsmount *mnt)
168{
169 mnt_add_count(mnt, 1);
170}
171
172/*
173 * vfsmount lock must be held for read
174 */
175static inline void mnt_dec_count(struct vfsmount *mnt)
176{
177 mnt_add_count(mnt, -1);
178}
179
180/*
181 * vfsmount lock must be held for write
182 */
183unsigned int mnt_get_count(struct vfsmount *mnt)
184{
185#ifdef CONFIG_SMP
186 unsigned int count = 0;
187 int cpu;
188
189 for_each_possible_cpu(cpu) {
190 count += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_count;
191 }
192
193 return count;
194#else
195 return mnt->mnt_count;
196#endif
197}
198
142struct vfsmount *alloc_vfsmnt(const char *name) 199struct vfsmount *alloc_vfsmnt(const char *name)
143{ 200{
144 struct vfsmount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL); 201 struct vfsmount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
@@ -155,7 +212,17 @@ struct vfsmount *alloc_vfsmnt(const char *name)
155 goto out_free_id; 212 goto out_free_id;
156 } 213 }
157 214
158 atomic_set(&mnt->mnt_count, 1); 215#ifdef CONFIG_SMP
216 mnt->mnt_pcp = alloc_percpu(struct mnt_pcp);
217 if (!mnt->mnt_pcp)
218 goto out_free_devname;
219
220 this_cpu_add(mnt->mnt_pcp->mnt_count, 1);
221#else
222 mnt->mnt_count = 1;
223 mnt->mnt_writers = 0;
224#endif
225
159 INIT_LIST_HEAD(&mnt->mnt_hash); 226 INIT_LIST_HEAD(&mnt->mnt_hash);
160 INIT_LIST_HEAD(&mnt->mnt_child); 227 INIT_LIST_HEAD(&mnt->mnt_child);
161 INIT_LIST_HEAD(&mnt->mnt_mounts); 228 INIT_LIST_HEAD(&mnt->mnt_mounts);
@@ -167,13 +234,6 @@ struct vfsmount *alloc_vfsmnt(const char *name)
167#ifdef CONFIG_FSNOTIFY 234#ifdef CONFIG_FSNOTIFY
168 INIT_HLIST_HEAD(&mnt->mnt_fsnotify_marks); 235 INIT_HLIST_HEAD(&mnt->mnt_fsnotify_marks);
169#endif 236#endif
170#ifdef CONFIG_SMP
171 mnt->mnt_writers = alloc_percpu(int);
172 if (!mnt->mnt_writers)
173 goto out_free_devname;
174#else
175 mnt->mnt_writers = 0;
176#endif
177 } 237 }
178 return mnt; 238 return mnt;
179 239
@@ -217,32 +277,32 @@ int __mnt_is_readonly(struct vfsmount *mnt)
217} 277}
218EXPORT_SYMBOL_GPL(__mnt_is_readonly); 278EXPORT_SYMBOL_GPL(__mnt_is_readonly);
219 279
220static inline void inc_mnt_writers(struct vfsmount *mnt) 280static inline void mnt_inc_writers(struct vfsmount *mnt)
221{ 281{
222#ifdef CONFIG_SMP 282#ifdef CONFIG_SMP
223 (*per_cpu_ptr(mnt->mnt_writers, smp_processor_id()))++; 283 this_cpu_inc(mnt->mnt_pcp->mnt_writers);
224#else 284#else
225 mnt->mnt_writers++; 285 mnt->mnt_writers++;
226#endif 286#endif
227} 287}
228 288
229static inline void dec_mnt_writers(struct vfsmount *mnt) 289static inline void mnt_dec_writers(struct vfsmount *mnt)
230{ 290{
231#ifdef CONFIG_SMP 291#ifdef CONFIG_SMP
232 (*per_cpu_ptr(mnt->mnt_writers, smp_processor_id()))--; 292 this_cpu_dec(mnt->mnt_pcp->mnt_writers);
233#else 293#else
234 mnt->mnt_writers--; 294 mnt->mnt_writers--;
235#endif 295#endif
236} 296}
237 297
238static unsigned int count_mnt_writers(struct vfsmount *mnt) 298static unsigned int mnt_get_writers(struct vfsmount *mnt)
239{ 299{
240#ifdef CONFIG_SMP 300#ifdef CONFIG_SMP
241 unsigned int count = 0; 301 unsigned int count = 0;
242 int cpu; 302 int cpu;
243 303
244 for_each_possible_cpu(cpu) { 304 for_each_possible_cpu(cpu) {
245 count += *per_cpu_ptr(mnt->mnt_writers, cpu); 305 count += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_writers;
246 } 306 }
247 307
248 return count; 308 return count;
@@ -274,9 +334,9 @@ int mnt_want_write(struct vfsmount *mnt)
274 int ret = 0; 334 int ret = 0;
275 335
276 preempt_disable(); 336 preempt_disable();
277 inc_mnt_writers(mnt); 337 mnt_inc_writers(mnt);
278 /* 338 /*
279 * The store to inc_mnt_writers must be visible before we pass 339 * The store to mnt_inc_writers must be visible before we pass
280 * MNT_WRITE_HOLD loop below, so that the slowpath can see our 340 * MNT_WRITE_HOLD loop below, so that the slowpath can see our
281 * incremented count after it has set MNT_WRITE_HOLD. 341 * incremented count after it has set MNT_WRITE_HOLD.
282 */ 342 */
@@ -290,7 +350,7 @@ int mnt_want_write(struct vfsmount *mnt)
290 */ 350 */
291 smp_rmb(); 351 smp_rmb();
292 if (__mnt_is_readonly(mnt)) { 352 if (__mnt_is_readonly(mnt)) {
293 dec_mnt_writers(mnt); 353 mnt_dec_writers(mnt);
294 ret = -EROFS; 354 ret = -EROFS;
295 goto out; 355 goto out;
296 } 356 }
@@ -318,7 +378,7 @@ int mnt_clone_write(struct vfsmount *mnt)
318 if (__mnt_is_readonly(mnt)) 378 if (__mnt_is_readonly(mnt))
319 return -EROFS; 379 return -EROFS;
320 preempt_disable(); 380 preempt_disable();
321 inc_mnt_writers(mnt); 381 mnt_inc_writers(mnt);
322 preempt_enable(); 382 preempt_enable();
323 return 0; 383 return 0;
324} 384}
@@ -352,7 +412,7 @@ EXPORT_SYMBOL_GPL(mnt_want_write_file);
352void mnt_drop_write(struct vfsmount *mnt) 412void mnt_drop_write(struct vfsmount *mnt)
353{ 413{
354 preempt_disable(); 414 preempt_disable();
355 dec_mnt_writers(mnt); 415 mnt_dec_writers(mnt);
356 preempt_enable(); 416 preempt_enable();
357} 417}
358EXPORT_SYMBOL_GPL(mnt_drop_write); 418EXPORT_SYMBOL_GPL(mnt_drop_write);
@@ -385,7 +445,7 @@ static int mnt_make_readonly(struct vfsmount *mnt)
385 * MNT_WRITE_HOLD, so it can't be decremented by another CPU while 445 * MNT_WRITE_HOLD, so it can't be decremented by another CPU while
386 * we're counting up here. 446 * we're counting up here.
387 */ 447 */
388 if (count_mnt_writers(mnt) > 0) 448 if (mnt_get_writers(mnt) > 0)
389 ret = -EBUSY; 449 ret = -EBUSY;
390 else 450 else
391 mnt->mnt_flags |= MNT_READONLY; 451 mnt->mnt_flags |= MNT_READONLY;
@@ -419,7 +479,7 @@ void free_vfsmnt(struct vfsmount *mnt)
419 kfree(mnt->mnt_devname); 479 kfree(mnt->mnt_devname);
420 mnt_free_id(mnt); 480 mnt_free_id(mnt);
421#ifdef CONFIG_SMP 481#ifdef CONFIG_SMP
422 free_percpu(mnt->mnt_writers); 482 free_percpu(mnt->mnt_pcp);
423#endif 483#endif
424 kmem_cache_free(mnt_cache, mnt); 484 kmem_cache_free(mnt_cache, mnt);
425} 485}
@@ -493,6 +553,27 @@ static void __touch_mnt_namespace(struct mnt_namespace *ns)
493} 553}
494 554
495/* 555/*
556 * Clear dentry's mounted state if it has no remaining mounts.
557 * vfsmount_lock must be held for write.
558 */
559static void dentry_reset_mounted(struct vfsmount *mnt, struct dentry *dentry)
560{
561 unsigned u;
562
563 for (u = 0; u < HASH_SIZE; u++) {
564 struct vfsmount *p;
565
566 list_for_each_entry(p, &mount_hashtable[u], mnt_hash) {
567 if (p->mnt_mountpoint == dentry)
568 return;
569 }
570 }
571 spin_lock(&dentry->d_lock);
572 dentry->d_flags &= ~DCACHE_MOUNTED;
573 spin_unlock(&dentry->d_lock);
574}
575
576/*
496 * vfsmount lock must be held for write 577 * vfsmount lock must be held for write
497 */ 578 */
498static void detach_mnt(struct vfsmount *mnt, struct path *old_path) 579static void detach_mnt(struct vfsmount *mnt, struct path *old_path)
@@ -503,7 +584,7 @@ static void detach_mnt(struct vfsmount *mnt, struct path *old_path)
503 mnt->mnt_mountpoint = mnt->mnt_root; 584 mnt->mnt_mountpoint = mnt->mnt_root;
504 list_del_init(&mnt->mnt_child); 585 list_del_init(&mnt->mnt_child);
505 list_del_init(&mnt->mnt_hash); 586 list_del_init(&mnt->mnt_hash);
506 old_path->dentry->d_mounted--; 587 dentry_reset_mounted(old_path->mnt, old_path->dentry);
507} 588}
508 589
509/* 590/*
@@ -514,7 +595,9 @@ void mnt_set_mountpoint(struct vfsmount *mnt, struct dentry *dentry,
514{ 595{
515 child_mnt->mnt_parent = mntget(mnt); 596 child_mnt->mnt_parent = mntget(mnt);
516 child_mnt->mnt_mountpoint = dget(dentry); 597 child_mnt->mnt_mountpoint = dget(dentry);
517 dentry->d_mounted++; 598 spin_lock(&dentry->d_lock);
599 dentry->d_flags |= DCACHE_MOUNTED;
600 spin_unlock(&dentry->d_lock);
518} 601}
519 602
520/* 603/*
@@ -528,6 +611,21 @@ static void attach_mnt(struct vfsmount *mnt, struct path *path)
528 list_add_tail(&mnt->mnt_child, &path->mnt->mnt_mounts); 611 list_add_tail(&mnt->mnt_child, &path->mnt->mnt_mounts);
529} 612}
530 613
614static inline void __mnt_make_longterm(struct vfsmount *mnt)
615{
616#ifdef CONFIG_SMP
617 atomic_inc(&mnt->mnt_longterm);
618#endif
619}
620
621/* needs vfsmount lock for write */
622static inline void __mnt_make_shortterm(struct vfsmount *mnt)
623{
624#ifdef CONFIG_SMP
625 atomic_dec(&mnt->mnt_longterm);
626#endif
627}
628
531/* 629/*
532 * vfsmount lock must be held for write 630 * vfsmount lock must be held for write
533 */ 631 */
@@ -541,8 +639,11 @@ static void commit_tree(struct vfsmount *mnt)
541 BUG_ON(parent == mnt); 639 BUG_ON(parent == mnt);
542 640
543 list_add_tail(&head, &mnt->mnt_list); 641 list_add_tail(&head, &mnt->mnt_list);
544 list_for_each_entry(m, &head, mnt_list) 642 list_for_each_entry(m, &head, mnt_list) {
545 m->mnt_ns = n; 643 m->mnt_ns = n;
644 __mnt_make_longterm(m);
645 }
646
546 list_splice(&head, n->list.prev); 647 list_splice(&head, n->list.prev);
547 648
548 list_add_tail(&mnt->mnt_hash, mount_hashtable + 649 list_add_tail(&mnt->mnt_hash, mount_hashtable +
@@ -630,9 +731,10 @@ static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root,
630 return NULL; 731 return NULL;
631} 732}
632 733
633static inline void __mntput(struct vfsmount *mnt) 734static inline void mntfree(struct vfsmount *mnt)
634{ 735{
635 struct super_block *sb = mnt->mnt_sb; 736 struct super_block *sb = mnt->mnt_sb;
737
636 /* 738 /*
637 * This probably indicates that somebody messed 739 * This probably indicates that somebody messed
638 * up a mnt_want/drop_write() pair. If this 740 * up a mnt_want/drop_write() pair. If this
@@ -640,38 +742,69 @@ static inline void __mntput(struct vfsmount *mnt)
640 * to make r/w->r/o transitions. 742 * to make r/w->r/o transitions.
641 */ 743 */
642 /* 744 /*
643 * atomic_dec_and_lock() used to deal with ->mnt_count decrements 745 * The locking used to deal with mnt_count decrement provides barriers,
644 * provides barriers, so count_mnt_writers() below is safe. AV 746 * so mnt_get_writers() below is safe.
645 */ 747 */
646 WARN_ON(count_mnt_writers(mnt)); 748 WARN_ON(mnt_get_writers(mnt));
647 fsnotify_vfsmount_delete(mnt); 749 fsnotify_vfsmount_delete(mnt);
648 dput(mnt->mnt_root); 750 dput(mnt->mnt_root);
649 free_vfsmnt(mnt); 751 free_vfsmnt(mnt);
650 deactivate_super(sb); 752 deactivate_super(sb);
651} 753}
652 754
653void mntput_no_expire(struct vfsmount *mnt) 755static void mntput_no_expire(struct vfsmount *mnt)
654{ 756{
655repeat: 757put_again:
656 if (atomic_add_unless(&mnt->mnt_count, -1, 1)) 758#ifdef CONFIG_SMP
759 br_read_lock(vfsmount_lock);
760 if (likely(atomic_read(&mnt->mnt_longterm))) {
761 mnt_dec_count(mnt);
762 br_read_unlock(vfsmount_lock);
657 return; 763 return;
764 }
765 br_read_unlock(vfsmount_lock);
766
658 br_write_lock(vfsmount_lock); 767 br_write_lock(vfsmount_lock);
659 if (!atomic_dec_and_test(&mnt->mnt_count)) { 768 mnt_dec_count(mnt);
769 if (mnt_get_count(mnt)) {
660 br_write_unlock(vfsmount_lock); 770 br_write_unlock(vfsmount_lock);
661 return; 771 return;
662 } 772 }
663 if (likely(!mnt->mnt_pinned)) { 773#else
664 br_write_unlock(vfsmount_lock); 774 mnt_dec_count(mnt);
665 __mntput(mnt); 775 if (likely(mnt_get_count(mnt)))
666 return; 776 return;
777 br_write_lock(vfsmount_lock);
778#endif
779 if (unlikely(mnt->mnt_pinned)) {
780 mnt_add_count(mnt, mnt->mnt_pinned + 1);
781 mnt->mnt_pinned = 0;
782 br_write_unlock(vfsmount_lock);
783 acct_auto_close_mnt(mnt);
784 goto put_again;
667 } 785 }
668 atomic_add(mnt->mnt_pinned + 1, &mnt->mnt_count);
669 mnt->mnt_pinned = 0;
670 br_write_unlock(vfsmount_lock); 786 br_write_unlock(vfsmount_lock);
671 acct_auto_close_mnt(mnt); 787 mntfree(mnt);
672 goto repeat;
673} 788}
674EXPORT_SYMBOL(mntput_no_expire); 789
790void mntput(struct vfsmount *mnt)
791{
792 if (mnt) {
793 /* avoid cacheline pingpong, hope gcc doesn't get "smart" */
794 if (unlikely(mnt->mnt_expiry_mark))
795 mnt->mnt_expiry_mark = 0;
796 mntput_no_expire(mnt);
797 }
798}
799EXPORT_SYMBOL(mntput);
800
801struct vfsmount *mntget(struct vfsmount *mnt)
802{
803 if (mnt)
804 mnt_inc_count(mnt);
805 return mnt;
806}
807EXPORT_SYMBOL(mntget);
675 808
676void mnt_pin(struct vfsmount *mnt) 809void mnt_pin(struct vfsmount *mnt)
677{ 810{
@@ -679,19 +812,17 @@ void mnt_pin(struct vfsmount *mnt)
679 mnt->mnt_pinned++; 812 mnt->mnt_pinned++;
680 br_write_unlock(vfsmount_lock); 813 br_write_unlock(vfsmount_lock);
681} 814}
682
683EXPORT_SYMBOL(mnt_pin); 815EXPORT_SYMBOL(mnt_pin);
684 816
685void mnt_unpin(struct vfsmount *mnt) 817void mnt_unpin(struct vfsmount *mnt)
686{ 818{
687 br_write_lock(vfsmount_lock); 819 br_write_lock(vfsmount_lock);
688 if (mnt->mnt_pinned) { 820 if (mnt->mnt_pinned) {
689 atomic_inc(&mnt->mnt_count); 821 mnt_inc_count(mnt);
690 mnt->mnt_pinned--; 822 mnt->mnt_pinned--;
691 } 823 }
692 br_write_unlock(vfsmount_lock); 824 br_write_unlock(vfsmount_lock);
693} 825}
694
695EXPORT_SYMBOL(mnt_unpin); 826EXPORT_SYMBOL(mnt_unpin);
696 827
697static inline void mangle(struct seq_file *m, const char *s) 828static inline void mangle(struct seq_file *m, const char *s)
@@ -986,12 +1117,13 @@ int may_umount_tree(struct vfsmount *mnt)
986 int minimum_refs = 0; 1117 int minimum_refs = 0;
987 struct vfsmount *p; 1118 struct vfsmount *p;
988 1119
989 br_read_lock(vfsmount_lock); 1120 /* write lock needed for mnt_get_count */
1121 br_write_lock(vfsmount_lock);
990 for (p = mnt; p; p = next_mnt(p, mnt)) { 1122 for (p = mnt; p; p = next_mnt(p, mnt)) {
991 actual_refs += atomic_read(&p->mnt_count); 1123 actual_refs += mnt_get_count(p);
992 minimum_refs += 2; 1124 minimum_refs += 2;
993 } 1125 }
994 br_read_unlock(vfsmount_lock); 1126 br_write_unlock(vfsmount_lock);
995 1127
996 if (actual_refs > minimum_refs) 1128 if (actual_refs > minimum_refs)
997 return 0; 1129 return 0;
@@ -1018,10 +1150,10 @@ int may_umount(struct vfsmount *mnt)
1018{ 1150{
1019 int ret = 1; 1151 int ret = 1;
1020 down_read(&namespace_sem); 1152 down_read(&namespace_sem);
1021 br_read_lock(vfsmount_lock); 1153 br_write_lock(vfsmount_lock);
1022 if (propagate_mount_busy(mnt, 2)) 1154 if (propagate_mount_busy(mnt, 2))
1023 ret = 0; 1155 ret = 0;
1024 br_read_unlock(vfsmount_lock); 1156 br_write_unlock(vfsmount_lock);
1025 up_read(&namespace_sem); 1157 up_read(&namespace_sem);
1026 return ret; 1158 return ret;
1027} 1159}
@@ -1058,26 +1190,29 @@ void release_mounts(struct list_head *head)
1058 */ 1190 */
1059void umount_tree(struct vfsmount *mnt, int propagate, struct list_head *kill) 1191void umount_tree(struct vfsmount *mnt, int propagate, struct list_head *kill)
1060{ 1192{
1193 LIST_HEAD(tmp_list);
1061 struct vfsmount *p; 1194 struct vfsmount *p;
1062 1195
1063 for (p = mnt; p; p = next_mnt(p, mnt)) 1196 for (p = mnt; p; p = next_mnt(p, mnt))
1064 list_move(&p->mnt_hash, kill); 1197 list_move(&p->mnt_hash, &tmp_list);
1065 1198
1066 if (propagate) 1199 if (propagate)
1067 propagate_umount(kill); 1200 propagate_umount(&tmp_list);
1068 1201
1069 list_for_each_entry(p, kill, mnt_hash) { 1202 list_for_each_entry(p, &tmp_list, mnt_hash) {
1070 list_del_init(&p->mnt_expire); 1203 list_del_init(&p->mnt_expire);
1071 list_del_init(&p->mnt_list); 1204 list_del_init(&p->mnt_list);
1072 __touch_mnt_namespace(p->mnt_ns); 1205 __touch_mnt_namespace(p->mnt_ns);
1073 p->mnt_ns = NULL; 1206 p->mnt_ns = NULL;
1207 __mnt_make_shortterm(p);
1074 list_del_init(&p->mnt_child); 1208 list_del_init(&p->mnt_child);
1075 if (p->mnt_parent != p) { 1209 if (p->mnt_parent != p) {
1076 p->mnt_parent->mnt_ghosts++; 1210 p->mnt_parent->mnt_ghosts++;
1077 p->mnt_mountpoint->d_mounted--; 1211 dentry_reset_mounted(p->mnt_parent, p->mnt_mountpoint);
1078 } 1212 }
1079 change_mnt_propagation(p, MS_PRIVATE); 1213 change_mnt_propagation(p, MS_PRIVATE);
1080 } 1214 }
1215 list_splice(&tmp_list, kill);
1081} 1216}
1082 1217
1083static void shrink_submounts(struct vfsmount *mnt, struct list_head *umounts); 1218static void shrink_submounts(struct vfsmount *mnt, struct list_head *umounts);
@@ -1103,8 +1238,16 @@ static int do_umount(struct vfsmount *mnt, int flags)
1103 flags & (MNT_FORCE | MNT_DETACH)) 1238 flags & (MNT_FORCE | MNT_DETACH))
1104 return -EINVAL; 1239 return -EINVAL;
1105 1240
1106 if (atomic_read(&mnt->mnt_count) != 2) 1241 /*
1242 * probably don't strictly need the lock here if we examined
1243 * all race cases, but it's a slowpath.
1244 */
1245 br_write_lock(vfsmount_lock);
1246 if (mnt_get_count(mnt) != 2) {
1247 br_write_unlock(vfsmount_lock);
1107 return -EBUSY; 1248 return -EBUSY;
1249 }
1250 br_write_unlock(vfsmount_lock);
1108 1251
1109 if (!xchg(&mnt->mnt_expiry_mark, 1)) 1252 if (!xchg(&mnt->mnt_expiry_mark, 1))
1110 return -EAGAIN; 1253 return -EAGAIN;
@@ -1668,9 +1811,10 @@ static int do_move_mount(struct path *path, char *old_name)
1668 return err; 1811 return err;
1669 1812
1670 down_write(&namespace_sem); 1813 down_write(&namespace_sem);
1671 while (d_mountpoint(path->dentry) && 1814 err = follow_down(path, true);
1672 follow_down(path)) 1815 if (err < 0)
1673 ; 1816 goto out;
1817
1674 err = -EINVAL; 1818 err = -EINVAL;
1675 if (!check_mnt(path->mnt) || !check_mnt(old_path.mnt)) 1819 if (!check_mnt(path->mnt) || !check_mnt(old_path.mnt))
1676 goto out; 1820 goto out;
@@ -1728,6 +1872,8 @@ out:
1728 return err; 1872 return err;
1729} 1873}
1730 1874
1875static int do_add_mount(struct vfsmount *, struct path *, int);
1876
1731/* 1877/*
1732 * create a new mount for userspace and request it to be added into the 1878 * create a new mount for userspace and request it to be added into the
1733 * namespace's tree 1879 * namespace's tree
@@ -1736,6 +1882,7 @@ static int do_new_mount(struct path *path, char *type, int flags,
1736 int mnt_flags, char *name, void *data) 1882 int mnt_flags, char *name, void *data)
1737{ 1883{
1738 struct vfsmount *mnt; 1884 struct vfsmount *mnt;
1885 int err;
1739 1886
1740 if (!type) 1887 if (!type)
1741 return -EINVAL; 1888 return -EINVAL;
@@ -1748,15 +1895,47 @@ static int do_new_mount(struct path *path, char *type, int flags,
1748 if (IS_ERR(mnt)) 1895 if (IS_ERR(mnt))
1749 return PTR_ERR(mnt); 1896 return PTR_ERR(mnt);
1750 1897
1751 return do_add_mount(mnt, path, mnt_flags, NULL); 1898 err = do_add_mount(mnt, path, mnt_flags);
1899 if (err)
1900 mntput(mnt);
1901 return err;
1902}
1903
1904int finish_automount(struct vfsmount *m, struct path *path)
1905{
1906 int err;
1907 /* The new mount record should have at least 2 refs to prevent it being
1908 * expired before we get a chance to add it
1909 */
1910 BUG_ON(mnt_get_count(m) < 2);
1911
1912 if (m->mnt_sb == path->mnt->mnt_sb &&
1913 m->mnt_root == path->dentry) {
1914 err = -ELOOP;
1915 goto fail;
1916 }
1917
1918 err = do_add_mount(m, path, path->mnt->mnt_flags | MNT_SHRINKABLE);
1919 if (!err)
1920 return 0;
1921fail:
1922 /* remove m from any expiration list it may be on */
1923 if (!list_empty(&m->mnt_expire)) {
1924 down_write(&namespace_sem);
1925 br_write_lock(vfsmount_lock);
1926 list_del_init(&m->mnt_expire);
1927 br_write_unlock(vfsmount_lock);
1928 up_write(&namespace_sem);
1929 }
1930 mntput(m);
1931 mntput(m);
1932 return err;
1752} 1933}
1753 1934
1754/* 1935/*
1755 * add a mount into a namespace's mount tree 1936 * add a mount into a namespace's mount tree
1756 * - provide the option of adding the new mount to an expiration list
1757 */ 1937 */
1758int do_add_mount(struct vfsmount *newmnt, struct path *path, 1938static int do_add_mount(struct vfsmount *newmnt, struct path *path, int mnt_flags)
1759 int mnt_flags, struct list_head *fslist)
1760{ 1939{
1761 int err; 1940 int err;
1762 1941
@@ -1764,9 +1943,10 @@ int do_add_mount(struct vfsmount *newmnt, struct path *path,
1764 1943
1765 down_write(&namespace_sem); 1944 down_write(&namespace_sem);
1766 /* Something was mounted here while we slept */ 1945 /* Something was mounted here while we slept */
1767 while (d_mountpoint(path->dentry) && 1946 err = follow_down(path, true);
1768 follow_down(path)) 1947 if (err < 0)
1769 ; 1948 goto unlock;
1949
1770 err = -EINVAL; 1950 err = -EINVAL;
1771 if (!(mnt_flags & MNT_SHRINKABLE) && !check_mnt(path->mnt)) 1951 if (!(mnt_flags & MNT_SHRINKABLE) && !check_mnt(path->mnt))
1772 goto unlock; 1952 goto unlock;
@@ -1782,22 +1962,29 @@ int do_add_mount(struct vfsmount *newmnt, struct path *path,
1782 goto unlock; 1962 goto unlock;
1783 1963
1784 newmnt->mnt_flags = mnt_flags; 1964 newmnt->mnt_flags = mnt_flags;
1785 if ((err = graft_tree(newmnt, path))) 1965 err = graft_tree(newmnt, path);
1786 goto unlock;
1787
1788 if (fslist) /* add to the specified expiration list */
1789 list_add_tail(&newmnt->mnt_expire, fslist);
1790
1791 up_write(&namespace_sem);
1792 return 0;
1793 1966
1794unlock: 1967unlock:
1795 up_write(&namespace_sem); 1968 up_write(&namespace_sem);
1796 mntput(newmnt);
1797 return err; 1969 return err;
1798} 1970}
1799 1971
1800EXPORT_SYMBOL_GPL(do_add_mount); 1972/**
1973 * mnt_set_expiry - Put a mount on an expiration list
1974 * @mnt: The mount to list.
1975 * @expiry_list: The list to add the mount to.
1976 */
1977void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list)
1978{
1979 down_write(&namespace_sem);
1980 br_write_lock(vfsmount_lock);
1981
1982 list_add_tail(&mnt->mnt_expire, expiry_list);
1983
1984 br_write_unlock(vfsmount_lock);
1985 up_write(&namespace_sem);
1986}
1987EXPORT_SYMBOL(mnt_set_expiry);
1801 1988
1802/* 1989/*
1803 * process a list of expirable mountpoints with the intent of discarding any 1990 * process a list of expirable mountpoints with the intent of discarding any
@@ -2086,6 +2273,22 @@ static struct mnt_namespace *alloc_mnt_ns(void)
2086 return new_ns; 2273 return new_ns;
2087} 2274}
2088 2275
2276void mnt_make_longterm(struct vfsmount *mnt)
2277{
2278 __mnt_make_longterm(mnt);
2279}
2280
2281void mnt_make_shortterm(struct vfsmount *mnt)
2282{
2283#ifdef CONFIG_SMP
2284 if (atomic_add_unless(&mnt->mnt_longterm, -1, 1))
2285 return;
2286 br_write_lock(vfsmount_lock);
2287 atomic_dec(&mnt->mnt_longterm);
2288 br_write_unlock(vfsmount_lock);
2289#endif
2290}
2291
2089/* 2292/*
2090 * Allocate a new namespace structure and populate it with contents 2293 * Allocate a new namespace structure and populate it with contents
2091 * copied from the namespace of the passed in task structure. 2294 * copied from the namespace of the passed in task structure.
@@ -2123,14 +2326,19 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns,
2123 q = new_ns->root; 2326 q = new_ns->root;
2124 while (p) { 2327 while (p) {
2125 q->mnt_ns = new_ns; 2328 q->mnt_ns = new_ns;
2329 __mnt_make_longterm(q);
2126 if (fs) { 2330 if (fs) {
2127 if (p == fs->root.mnt) { 2331 if (p == fs->root.mnt) {
2128 rootmnt = p;
2129 fs->root.mnt = mntget(q); 2332 fs->root.mnt = mntget(q);
2333 __mnt_make_longterm(q);
2334 mnt_make_shortterm(p);
2335 rootmnt = p;
2130 } 2336 }
2131 if (p == fs->pwd.mnt) { 2337 if (p == fs->pwd.mnt) {
2132 pwdmnt = p;
2133 fs->pwd.mnt = mntget(q); 2338 fs->pwd.mnt = mntget(q);
2339 __mnt_make_longterm(q);
2340 mnt_make_shortterm(p);
2341 pwdmnt = p;
2134 } 2342 }
2135 } 2343 }
2136 p = next_mnt(p, mnt_ns->root); 2344 p = next_mnt(p, mnt_ns->root);
@@ -2174,6 +2382,7 @@ struct mnt_namespace *create_mnt_ns(struct vfsmount *mnt)
2174 new_ns = alloc_mnt_ns(); 2382 new_ns = alloc_mnt_ns();
2175 if (!IS_ERR(new_ns)) { 2383 if (!IS_ERR(new_ns)) {
2176 mnt->mnt_ns = new_ns; 2384 mnt->mnt_ns = new_ns;
2385 __mnt_make_longterm(mnt);
2177 new_ns->root = mnt; 2386 new_ns->root = mnt;
2178 list_add(&new_ns->list, &new_ns->root->mnt_list); 2387 list_add(&new_ns->list, &new_ns->root->mnt_list);
2179 } 2388 }
@@ -2328,6 +2537,7 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
2328 touch_mnt_namespace(current->nsproxy->mnt_ns); 2537 touch_mnt_namespace(current->nsproxy->mnt_ns);
2329 br_write_unlock(vfsmount_lock); 2538 br_write_unlock(vfsmount_lock);
2330 chroot_fs_refs(&root, &new); 2539 chroot_fs_refs(&root, &new);
2540
2331 error = 0; 2541 error = 0;
2332 path_put(&root_parent); 2542 path_put(&root_parent);
2333 path_put(&parent_path); 2543 path_put(&parent_path);
@@ -2354,6 +2564,7 @@ static void __init init_mount_tree(void)
2354 mnt = do_kern_mount("rootfs", 0, "rootfs", NULL); 2564 mnt = do_kern_mount("rootfs", 0, "rootfs", NULL);
2355 if (IS_ERR(mnt)) 2565 if (IS_ERR(mnt))
2356 panic("Can't create rootfs"); 2566 panic("Can't create rootfs");
2567
2357 ns = create_mnt_ns(mnt); 2568 ns = create_mnt_ns(mnt);
2358 if (IS_ERR(ns)) 2569 if (IS_ERR(ns))
2359 panic("Can't allocate initial namespace"); 2570 panic("Can't allocate initial namespace");
diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c
index aac8832e919..f6946bb5cb5 100644
--- a/fs/ncpfs/dir.c
+++ b/fs/ncpfs/dir.c
@@ -17,13 +17,11 @@
17#include <linux/kernel.h> 17#include <linux/kernel.h>
18#include <linux/vmalloc.h> 18#include <linux/vmalloc.h>
19#include <linux/mm.h> 19#include <linux/mm.h>
20#include <linux/namei.h>
20#include <asm/uaccess.h> 21#include <asm/uaccess.h>
21#include <asm/byteorder.h> 22#include <asm/byteorder.h>
22#include <linux/smp_lock.h>
23 23
24#include <linux/ncp_fs.h> 24#include "ncp_fs.h"
25
26#include "ncplib_kernel.h"
27 25
28static void ncp_read_volume_list(struct file *, void *, filldir_t, 26static void ncp_read_volume_list(struct file *, void *, filldir_t,
29 struct ncp_cache_control *); 27 struct ncp_cache_control *);
@@ -75,11 +73,14 @@ const struct inode_operations ncp_dir_inode_operations =
75 * Dentry operations routines 73 * Dentry operations routines
76 */ 74 */
77static int ncp_lookup_validate(struct dentry *, struct nameidata *); 75static int ncp_lookup_validate(struct dentry *, struct nameidata *);
78static int ncp_hash_dentry(struct dentry *, struct qstr *); 76static int ncp_hash_dentry(const struct dentry *, const struct inode *,
79static int ncp_compare_dentry (struct dentry *, struct qstr *, struct qstr *); 77 struct qstr *);
80static int ncp_delete_dentry(struct dentry *); 78static int ncp_compare_dentry(const struct dentry *, const struct inode *,
81 79 const struct dentry *, const struct inode *,
82static const struct dentry_operations ncp_dentry_operations = 80 unsigned int, const char *, const struct qstr *);
81static int ncp_delete_dentry(const struct dentry *);
82
83const struct dentry_operations ncp_dentry_operations =
83{ 84{
84 .d_revalidate = ncp_lookup_validate, 85 .d_revalidate = ncp_lookup_validate,
85 .d_hash = ncp_hash_dentry, 86 .d_hash = ncp_hash_dentry,
@@ -87,14 +88,6 @@ static const struct dentry_operations ncp_dentry_operations =
87 .d_delete = ncp_delete_dentry, 88 .d_delete = ncp_delete_dentry,
88}; 89};
89 90
90const struct dentry_operations ncp_root_dentry_operations =
91{
92 .d_hash = ncp_hash_dentry,
93 .d_compare = ncp_compare_dentry,
94 .d_delete = ncp_delete_dentry,
95};
96
97
98#define ncp_namespace(i) (NCP_SERVER(i)->name_space[NCP_FINFO(i)->volNumber]) 91#define ncp_namespace(i) (NCP_SERVER(i)->name_space[NCP_FINFO(i)->volNumber])
99 92
100static inline int ncp_preserve_entry_case(struct inode *i, __u32 nscreator) 93static inline int ncp_preserve_entry_case(struct inode *i, __u32 nscreator)
@@ -114,10 +107,10 @@ static inline int ncp_preserve_entry_case(struct inode *i, __u32 nscreator)
114 107
115#define ncp_preserve_case(i) (ncp_namespace(i) != NW_NS_DOS) 108#define ncp_preserve_case(i) (ncp_namespace(i) != NW_NS_DOS)
116 109
117static inline int ncp_case_sensitive(struct dentry *dentry) 110static inline int ncp_case_sensitive(const struct inode *i)
118{ 111{
119#ifdef CONFIG_NCPFS_NFS_NS 112#ifdef CONFIG_NCPFS_NFS_NS
120 return ncp_namespace(dentry->d_inode) == NW_NS_NFS; 113 return ncp_namespace(i) == NW_NS_NFS;
121#else 114#else
122 return 0; 115 return 0;
123#endif /* CONFIG_NCPFS_NFS_NS */ 116#endif /* CONFIG_NCPFS_NFS_NS */
@@ -128,14 +121,16 @@ static inline int ncp_case_sensitive(struct dentry *dentry)
128 * is case-sensitive. 121 * is case-sensitive.
129 */ 122 */
130static int 123static int
131ncp_hash_dentry(struct dentry *dentry, struct qstr *this) 124ncp_hash_dentry(const struct dentry *dentry, const struct inode *inode,
125 struct qstr *this)
132{ 126{
133 if (!ncp_case_sensitive(dentry)) { 127 if (!ncp_case_sensitive(inode)) {
128 struct super_block *sb = dentry->d_sb;
134 struct nls_table *t; 129 struct nls_table *t;
135 unsigned long hash; 130 unsigned long hash;
136 int i; 131 int i;
137 132
138 t = NCP_IO_TABLE(dentry); 133 t = NCP_IO_TABLE(sb);
139 hash = init_name_hash(); 134 hash = init_name_hash();
140 for (i=0; i<this->len ; i++) 135 for (i=0; i<this->len ; i++)
141 hash = partial_name_hash(ncp_tolower(t, this->name[i]), 136 hash = partial_name_hash(ncp_tolower(t, this->name[i]),
@@ -146,15 +141,17 @@ ncp_hash_dentry(struct dentry *dentry, struct qstr *this)
146} 141}
147 142
148static int 143static int
149ncp_compare_dentry(struct dentry *dentry, struct qstr *a, struct qstr *b) 144ncp_compare_dentry(const struct dentry *parent, const struct inode *pinode,
145 const struct dentry *dentry, const struct inode *inode,
146 unsigned int len, const char *str, const struct qstr *name)
150{ 147{
151 if (a->len != b->len) 148 if (len != name->len)
152 return 1; 149 return 1;
153 150
154 if (ncp_case_sensitive(dentry)) 151 if (ncp_case_sensitive(pinode))
155 return strncmp(a->name, b->name, a->len); 152 return strncmp(str, name->name, len);
156 153
157 return ncp_strnicmp(NCP_IO_TABLE(dentry), a->name, b->name, a->len); 154 return ncp_strnicmp(NCP_IO_TABLE(pinode->i_sb), str, name->name, len);
158} 155}
159 156
160/* 157/*
@@ -163,7 +160,7 @@ ncp_compare_dentry(struct dentry *dentry, struct qstr *a, struct qstr *b)
163 * Closing files can be safely postponed until iput() - it's done there anyway. 160 * Closing files can be safely postponed until iput() - it's done there anyway.
164 */ 161 */
165static int 162static int
166ncp_delete_dentry(struct dentry * dentry) 163ncp_delete_dentry(const struct dentry * dentry)
167{ 164{
168 struct inode *inode = dentry->d_inode; 165 struct inode *inode = dentry->d_inode;
169 166
@@ -302,6 +299,12 @@ ncp_lookup_validate(struct dentry *dentry, struct nameidata *nd)
302 int res, val = 0, len; 299 int res, val = 0, len;
303 __u8 __name[NCP_MAXPATHLEN + 1]; 300 __u8 __name[NCP_MAXPATHLEN + 1];
304 301
302 if (dentry == dentry->d_sb->s_root)
303 return 1;
304
305 if (nd->flags & LOOKUP_RCU)
306 return -ECHILD;
307
305 parent = dget_parent(dentry); 308 parent = dget_parent(dentry);
306 dir = parent->d_inode; 309 dir = parent->d_inode;
307 310
@@ -385,21 +388,21 @@ ncp_dget_fpos(struct dentry *dentry, struct dentry *parent, unsigned long fpos)
385 } 388 }
386 389
387 /* If a pointer is invalid, we search the dentry. */ 390 /* If a pointer is invalid, we search the dentry. */
388 spin_lock(&dcache_lock); 391 spin_lock(&parent->d_lock);
389 next = parent->d_subdirs.next; 392 next = parent->d_subdirs.next;
390 while (next != &parent->d_subdirs) { 393 while (next != &parent->d_subdirs) {
391 dent = list_entry(next, struct dentry, d_u.d_child); 394 dent = list_entry(next, struct dentry, d_u.d_child);
392 if ((unsigned long)dent->d_fsdata == fpos) { 395 if ((unsigned long)dent->d_fsdata == fpos) {
393 if (dent->d_inode) 396 if (dent->d_inode)
394 dget_locked(dent); 397 dget(dent);
395 else 398 else
396 dent = NULL; 399 dent = NULL;
397 spin_unlock(&dcache_lock); 400 spin_unlock(&parent->d_lock);
398 goto out; 401 goto out;
399 } 402 }
400 next = next->next; 403 next = next->next;
401 } 404 }
402 spin_unlock(&dcache_lock); 405 spin_unlock(&parent->d_lock);
403 return NULL; 406 return NULL;
404 407
405out: 408out:
@@ -593,7 +596,7 @@ ncp_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
593 qname.hash = full_name_hash(qname.name, qname.len); 596 qname.hash = full_name_hash(qname.name, qname.len);
594 597
595 if (dentry->d_op && dentry->d_op->d_hash) 598 if (dentry->d_op && dentry->d_op->d_hash)
596 if (dentry->d_op->d_hash(dentry, &qname) != 0) 599 if (dentry->d_op->d_hash(dentry, dentry->d_inode, &qname) != 0)
597 goto end_advance; 600 goto end_advance;
598 601
599 newdent = d_lookup(dentry, &qname); 602 newdent = d_lookup(dentry, &qname);
@@ -612,35 +615,12 @@ ncp_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
612 shrink_dcache_parent(newdent); 615 shrink_dcache_parent(newdent);
613 616
614 /* 617 /*
615 * It is not as dangerous as it looks. NetWare's OS2 namespace is 618 * NetWare's OS2 namespace is case preserving yet case
616 * case preserving yet case insensitive. So we update dentry's name 619 * insensitive. So we update dentry's name as received from
617 * as received from server. We found dentry via d_lookup with our 620 * server. Parent dir's i_mutex is locked because we're in
618 * hash, so we know that hash does not change, and so replacing name 621 * readdir.
619 * should be reasonably safe.
620 */ 622 */
621 if (qname.len == newdent->d_name.len && 623 dentry_update_name_case(newdent, &qname);
622 memcmp(newdent->d_name.name, qname.name, newdent->d_name.len)) {
623 struct inode *inode = newdent->d_inode;
624
625 /*
626 * Inside ncpfs all uses of d_name are either for debugging,
627 * or on functions which acquire inode mutex (mknod, creat,
628 * lookup). So grab i_mutex here, to be sure. d_path
629 * uses dcache_lock when generating path, so we should too.
630 * And finally d_compare is protected by dentry's d_lock, so
631 * here we go.
632 */
633 if (inode)
634 mutex_lock(&inode->i_mutex);
635 spin_lock(&dcache_lock);
636 spin_lock(&newdent->d_lock);
637 memcpy((char *) newdent->d_name.name, qname.name,
638 newdent->d_name.len);
639 spin_unlock(&newdent->d_lock);
640 spin_unlock(&dcache_lock);
641 if (inode)
642 mutex_unlock(&inode->i_mutex);
643 }
644 } 624 }
645 625
646 if (!newdent->d_inode) { 626 if (!newdent->d_inode) {
@@ -650,7 +630,6 @@ ncp_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
650 entry->ino = iunique(dir->i_sb, 2); 630 entry->ino = iunique(dir->i_sb, 2);
651 inode = ncp_iget(dir->i_sb, entry); 631 inode = ncp_iget(dir->i_sb, entry);
652 if (inode) { 632 if (inode) {
653 newdent->d_op = &ncp_dentry_operations;
654 d_instantiate(newdent, inode); 633 d_instantiate(newdent, inode);
655 if (!hashed) 634 if (!hashed)
656 d_rehash(newdent); 635 d_rehash(newdent);
@@ -658,7 +637,7 @@ ncp_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
658 } else { 637 } else {
659 struct inode *inode = newdent->d_inode; 638 struct inode *inode = newdent->d_inode;
660 639
661 mutex_lock(&inode->i_mutex); 640 mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
662 ncp_update_inode2(inode, entry); 641 ncp_update_inode2(inode, entry);
663 mutex_unlock(&inode->i_mutex); 642 mutex_unlock(&inode->i_mutex);
664 } 643 }
@@ -906,7 +885,6 @@ static struct dentry *ncp_lookup(struct inode *dir, struct dentry *dentry, struc
906 if (inode) { 885 if (inode) {
907 ncp_new_dentry(dentry); 886 ncp_new_dentry(dentry);
908add_entry: 887add_entry:
909 dentry->d_op = &ncp_dentry_operations;
910 d_add(dentry, inode); 888 d_add(dentry, inode);
911 error = 0; 889 error = 0;
912 } 890 }
diff --git a/fs/ncpfs/file.c b/fs/ncpfs/file.c
index 6c754f70c52..0ed65e0c3df 100644
--- a/fs/ncpfs/file.c
+++ b/fs/ncpfs/file.c
@@ -17,10 +17,8 @@
17#include <linux/mm.h> 17#include <linux/mm.h>
18#include <linux/vmalloc.h> 18#include <linux/vmalloc.h>
19#include <linux/sched.h> 19#include <linux/sched.h>
20#include <linux/smp_lock.h>
21 20
22#include <linux/ncp_fs.h> 21#include "ncp_fs.h"
23#include "ncplib_kernel.h"
24 22
25static int ncp_fsync(struct file *file, int datasync) 23static int ncp_fsync(struct file *file, int datasync)
26{ 24{
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index d290545aa0c..00a1d1c3d3a 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -26,16 +26,14 @@
26#include <linux/slab.h> 26#include <linux/slab.h>
27#include <linux/vmalloc.h> 27#include <linux/vmalloc.h>
28#include <linux/init.h> 28#include <linux/init.h>
29#include <linux/smp_lock.h>
30#include <linux/vfs.h> 29#include <linux/vfs.h>
31#include <linux/mount.h> 30#include <linux/mount.h>
32#include <linux/seq_file.h> 31#include <linux/seq_file.h>
33 32#include <linux/namei.h>
34#include <linux/ncp_fs.h>
35 33
36#include <net/sock.h> 34#include <net/sock.h>
37 35
38#include "ncplib_kernel.h" 36#include "ncp_fs.h"
39#include "getopt.h" 37#include "getopt.h"
40 38
41#define NCP_DEFAULT_FILE_MODE 0600 39#define NCP_DEFAULT_FILE_MODE 0600
@@ -59,11 +57,18 @@ static struct inode *ncp_alloc_inode(struct super_block *sb)
59 return &ei->vfs_inode; 57 return &ei->vfs_inode;
60} 58}
61 59
62static void ncp_destroy_inode(struct inode *inode) 60static void ncp_i_callback(struct rcu_head *head)
63{ 61{
62 struct inode *inode = container_of(head, struct inode, i_rcu);
63 INIT_LIST_HEAD(&inode->i_dentry);
64 kmem_cache_free(ncp_inode_cachep, NCP_FINFO(inode)); 64 kmem_cache_free(ncp_inode_cachep, NCP_FINFO(inode));
65} 65}
66 66
67static void ncp_destroy_inode(struct inode *inode)
68{
69 call_rcu(&inode->i_rcu, ncp_i_callback);
70}
71
67static void init_once(void *foo) 72static void init_once(void *foo)
68{ 73{
69 struct ncp_inode_info *ei = (struct ncp_inode_info *) foo; 74 struct ncp_inode_info *ei = (struct ncp_inode_info *) foo;
@@ -310,7 +315,12 @@ static void ncp_stop_tasks(struct ncp_server *server) {
310 sk->sk_write_space = server->write_space; 315 sk->sk_write_space = server->write_space;
311 release_sock(sk); 316 release_sock(sk);
312 del_timer_sync(&server->timeout_tm); 317 del_timer_sync(&server->timeout_tm);
313 flush_scheduled_work(); 318
319 flush_work_sync(&server->rcv.tq);
320 if (sk->sk_socket->type == SOCK_STREAM)
321 flush_work_sync(&server->tx.tq);
322 else
323 flush_work_sync(&server->timeout_tq);
314} 324}
315 325
316static int ncp_show_options(struct seq_file *seq, struct vfsmount *mnt) 326static int ncp_show_options(struct seq_file *seq, struct vfsmount *mnt)
@@ -532,6 +542,7 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
532 sb->s_blocksize_bits = 10; 542 sb->s_blocksize_bits = 10;
533 sb->s_magic = NCP_SUPER_MAGIC; 543 sb->s_magic = NCP_SUPER_MAGIC;
534 sb->s_op = &ncp_sops; 544 sb->s_op = &ncp_sops;
545 sb->s_d_op = &ncp_dentry_operations;
535 sb->s_bdi = &server->bdi; 546 sb->s_bdi = &server->bdi;
536 547
537 server = NCP_SBP(sb); 548 server = NCP_SBP(sb);
@@ -711,7 +722,6 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
711 sb->s_root = d_alloc_root(root_inode); 722 sb->s_root = d_alloc_root(root_inode);
712 if (!sb->s_root) 723 if (!sb->s_root)
713 goto out_no_root; 724 goto out_no_root;
714 sb->s_root->d_op = &ncp_root_dentry_operations;
715 return 0; 725 return 0;
716 726
717out_no_root: 727out_no_root:
diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c
index c2a1f9a155c..790e92a9ec6 100644
--- a/fs/ncpfs/ioctl.c
+++ b/fs/ncpfs/ioctl.c
@@ -17,15 +17,12 @@
17#include <linux/mount.h> 17#include <linux/mount.h>
18#include <linux/slab.h> 18#include <linux/slab.h>
19#include <linux/highuid.h> 19#include <linux/highuid.h>
20#include <linux/smp_lock.h>
21#include <linux/vmalloc.h> 20#include <linux/vmalloc.h>
22#include <linux/sched.h> 21#include <linux/sched.h>
23 22
24#include <linux/ncp_fs.h>
25
26#include <asm/uaccess.h> 23#include <asm/uaccess.h>
27 24
28#include "ncplib_kernel.h" 25#include "ncp_fs.h"
29 26
30/* maximum limit for ncp_objectname_ioctl */ 27/* maximum limit for ncp_objectname_ioctl */
31#define NCP_OBJECT_NAME_MAX_LEN 4096 28#define NCP_OBJECT_NAME_MAX_LEN 4096
diff --git a/fs/ncpfs/mmap.c b/fs/ncpfs/mmap.c
index 56f5b3a0e1e..a7c07b44b10 100644
--- a/fs/ncpfs/mmap.c
+++ b/fs/ncpfs/mmap.c
@@ -16,12 +16,12 @@
16#include <linux/mman.h> 16#include <linux/mman.h>
17#include <linux/string.h> 17#include <linux/string.h>
18#include <linux/fcntl.h> 18#include <linux/fcntl.h>
19#include <linux/ncp_fs.h>
20 19
21#include "ncplib_kernel.h"
22#include <asm/uaccess.h> 20#include <asm/uaccess.h>
23#include <asm/system.h> 21#include <asm/system.h>
24 22
23#include "ncp_fs.h"
24
25/* 25/*
26 * Fill in the supplied page for mmap 26 * Fill in the supplied page for mmap
27 * XXX: how are we excluding truncate/invalidate here? Maybe need to lock 27 * XXX: how are we excluding truncate/invalidate here? Maybe need to lock
diff --git a/fs/ncpfs/ncp_fs.h b/fs/ncpfs/ncp_fs.h
new file mode 100644
index 00000000000..31831afe1c3
--- /dev/null
+++ b/fs/ncpfs/ncp_fs.h
@@ -0,0 +1,98 @@
1#include <linux/ncp_fs.h>
2#include "ncp_fs_i.h"
3#include "ncp_fs_sb.h"
4
5/* define because it is easy to change PRINTK to {*}PRINTK */
6#define PRINTK(format, args...) printk(KERN_DEBUG format , ## args)
7
8#undef NCPFS_PARANOIA
9#ifdef NCPFS_PARANOIA
10#define PPRINTK(format, args...) PRINTK(format , ## args)
11#else
12#define PPRINTK(format, args...)
13#endif
14
15#ifndef DEBUG_NCP
16#define DEBUG_NCP 0
17#endif
18#if DEBUG_NCP > 0
19#define DPRINTK(format, args...) PRINTK(format , ## args)
20#else
21#define DPRINTK(format, args...)
22#endif
23#if DEBUG_NCP > 1
24#define DDPRINTK(format, args...) PRINTK(format , ## args)
25#else
26#define DDPRINTK(format, args...)
27#endif
28
29#define NCP_MAX_RPC_TIMEOUT (6*HZ)
30
31
32struct ncp_entry_info {
33 struct nw_info_struct i;
34 ino_t ino;
35 int opened;
36 int access;
37 unsigned int volume;
38 __u8 file_handle[6];
39};
40
41static inline struct ncp_server *NCP_SBP(const struct super_block *sb)
42{
43 return sb->s_fs_info;
44}
45
46#define NCP_SERVER(inode) NCP_SBP((inode)->i_sb)
47static inline struct ncp_inode_info *NCP_FINFO(const struct inode *inode)
48{
49 return container_of(inode, struct ncp_inode_info, vfs_inode);
50}
51
52/* linux/fs/ncpfs/inode.c */
53int ncp_notify_change(struct dentry *, struct iattr *);
54struct inode *ncp_iget(struct super_block *, struct ncp_entry_info *);
55void ncp_update_inode(struct inode *, struct ncp_entry_info *);
56void ncp_update_inode2(struct inode *, struct ncp_entry_info *);
57
58/* linux/fs/ncpfs/dir.c */
59extern const struct inode_operations ncp_dir_inode_operations;
60extern const struct file_operations ncp_dir_operations;
61extern const struct dentry_operations ncp_dentry_operations;
62int ncp_conn_logged_in(struct super_block *);
63int ncp_date_dos2unix(__le16 time, __le16 date);
64void ncp_date_unix2dos(int unix_date, __le16 * time, __le16 * date);
65
66/* linux/fs/ncpfs/ioctl.c */
67long ncp_ioctl(struct file *, unsigned int, unsigned long);
68long ncp_compat_ioctl(struct file *, unsigned int, unsigned long);
69
70/* linux/fs/ncpfs/sock.c */
71int ncp_request2(struct ncp_server *server, int function,
72 void* reply, int max_reply_size);
73static inline int ncp_request(struct ncp_server *server, int function) {
74 return ncp_request2(server, function, server->packet, server->packet_size);
75}
76int ncp_connect(struct ncp_server *server);
77int ncp_disconnect(struct ncp_server *server);
78void ncp_lock_server(struct ncp_server *server);
79void ncp_unlock_server(struct ncp_server *server);
80
81/* linux/fs/ncpfs/symlink.c */
82#if defined(CONFIG_NCPFS_EXTRAS) || defined(CONFIG_NCPFS_NFS_NS)
83extern const struct address_space_operations ncp_symlink_aops;
84int ncp_symlink(struct inode*, struct dentry*, const char*);
85#endif
86
87/* linux/fs/ncpfs/file.c */
88extern const struct inode_operations ncp_file_inode_operations;
89extern const struct file_operations ncp_file_operations;
90int ncp_make_open(struct inode *, int);
91
92/* linux/fs/ncpfs/mmap.c */
93int ncp_mmap(struct file *, struct vm_area_struct *);
94
95/* linux/fs/ncpfs/ncplib_kernel.c */
96int ncp_make_closed(struct inode *);
97
98#include "ncplib_kernel.h"
diff --git a/fs/ncpfs/ncp_fs_i.h b/fs/ncpfs/ncp_fs_i.h
new file mode 100644
index 00000000000..4b0bec47784
--- /dev/null
+++ b/fs/ncpfs/ncp_fs_i.h
@@ -0,0 +1,29 @@
1/*
2 * ncp_fs_i.h
3 *
4 * Copyright (C) 1995 Volker Lendecke
5 *
6 */
7
8#ifndef _LINUX_NCP_FS_I
9#define _LINUX_NCP_FS_I
10
11/*
12 * This is the ncpfs part of the inode structure. This must contain
13 * all the information we need to work with an inode after creation.
14 */
15struct ncp_inode_info {
16 __le32 dirEntNum;
17 __le32 DosDirNum;
18 __u8 volNumber;
19 __le32 nwattr;
20 struct mutex open_mutex;
21 atomic_t opened;
22 int access;
23 int flags;
24#define NCPI_KLUDGE_SYMLINK 0x0001
25 __u8 file_handle[6];
26 struct inode vfs_inode;
27};
28
29#endif /* _LINUX_NCP_FS_I */
diff --git a/fs/ncpfs/ncp_fs_sb.h b/fs/ncpfs/ncp_fs_sb.h
new file mode 100644
index 00000000000..4af803f1351
--- /dev/null
+++ b/fs/ncpfs/ncp_fs_sb.h
@@ -0,0 +1,176 @@
1/*
2 * ncp_fs_sb.h
3 *
4 * Copyright (C) 1995, 1996 by Volker Lendecke
5 *
6 */
7
8#ifndef _NCP_FS_SB
9#define _NCP_FS_SB
10
11#include <linux/types.h>
12#include <linux/ncp_mount.h>
13#include <linux/net.h>
14#include <linux/mutex.h>
15#include <linux/backing-dev.h>
16#include <linux/workqueue.h>
17
18#define NCP_DEFAULT_OPTIONS 0 /* 2 for packet signatures */
19
20struct sock;
21
22struct ncp_mount_data_kernel {
23 unsigned long flags; /* NCP_MOUNT_* flags */
24 unsigned int int_flags; /* internal flags */
25#define NCP_IMOUNT_LOGGEDIN_POSSIBLE 0x0001
26 __kernel_uid32_t mounted_uid; /* Who may umount() this filesystem? */
27 struct pid *wdog_pid; /* Who cares for our watchdog packets? */
28 unsigned int ncp_fd; /* The socket to the ncp port */
29 unsigned int time_out; /* How long should I wait after
30 sending a NCP request? */
31 unsigned int retry_count; /* And how often should I retry? */
32 unsigned char mounted_vol[NCP_VOLNAME_LEN + 1];
33 __kernel_uid32_t uid;
34 __kernel_gid32_t gid;
35 __kernel_mode_t file_mode;
36 __kernel_mode_t dir_mode;
37 int info_fd;
38};
39
40struct ncp_server {
41
42 struct ncp_mount_data_kernel m; /* Nearly all of the mount data is of
43 interest for us later, so we store
44 it completely. */
45
46 __u8 name_space[NCP_NUMBER_OF_VOLUMES + 2];
47
48 struct file *ncp_filp; /* File pointer to ncp socket */
49 struct socket *ncp_sock;/* ncp socket */
50 struct file *info_filp;
51 struct socket *info_sock;
52
53 u8 sequence;
54 u8 task;
55 u16 connection; /* Remote connection number */
56
57 u8 completion; /* Status message from server */
58 u8 conn_status; /* Bit 4 = 1 ==> Server going down, no
59 requests allowed anymore.
60 Bit 0 = 1 ==> Server is down. */
61
62 int buffer_size; /* Negotiated bufsize */
63
64 int reply_size; /* Size of last reply */
65
66 int packet_size;
67 unsigned char *packet; /* Here we prepare requests and
68 receive replies */
69 unsigned char *txbuf; /* Storage for current request */
70 unsigned char *rxbuf; /* Storage for reply to current request */
71
72 int lock; /* To prevent mismatch in protocols. */
73 struct mutex mutex;
74
75 int current_size; /* for packet preparation */
76 int has_subfunction;
77 int ncp_reply_size;
78
79 int root_setuped;
80 struct mutex root_setup_lock;
81
82 /* info for packet signing */
83 int sign_wanted; /* 1=Server needs signed packets */
84 int sign_active; /* 0=don't do signing, 1=do */
85 char sign_root[8]; /* generated from password and encr. key */
86 char sign_last[16];
87
88 /* Authentication info: NDS or BINDERY, username */
89 struct {
90 int auth_type;
91 size_t object_name_len;
92 void* object_name;
93 int object_type;
94 } auth;
95 /* Password info */
96 struct {
97 size_t len;
98 void* data;
99 } priv;
100 struct rw_semaphore auth_rwsem;
101
102 /* nls info: codepage for volume and charset for I/O */
103 struct nls_table *nls_vol;
104 struct nls_table *nls_io;
105
106 /* maximum age in jiffies */
107 atomic_t dentry_ttl;
108
109 /* miscellaneous */
110 unsigned int flags;
111
112 spinlock_t requests_lock; /* Lock accesses to tx.requests, tx.creq and rcv.creq when STREAM mode */
113
114 void (*data_ready)(struct sock* sk, int len);
115 void (*error_report)(struct sock* sk);
116 void (*write_space)(struct sock* sk); /* STREAM mode only */
117 struct {
118 struct work_struct tq; /* STREAM/DGRAM: data/error ready */
119 struct ncp_request_reply* creq; /* STREAM/DGRAM: awaiting reply from this request */
120 struct mutex creq_mutex; /* DGRAM only: lock accesses to rcv.creq */
121
122 unsigned int state; /* STREAM only: receiver state */
123 struct {
124 __u32 magic __packed;
125 __u32 len __packed;
126 __u16 type __packed;
127 __u16 p1 __packed;
128 __u16 p2 __packed;
129 __u16 p3 __packed;
130 __u16 type2 __packed;
131 } buf; /* STREAM only: temporary buffer */
132 unsigned char* ptr; /* STREAM only: pointer to data */
133 size_t len; /* STREAM only: length of data to receive */
134 } rcv;
135 struct {
136 struct list_head requests; /* STREAM only: queued requests */
137 struct work_struct tq; /* STREAM only: transmitter ready */
138 struct ncp_request_reply* creq; /* STREAM only: currently transmitted entry */
139 } tx;
140 struct timer_list timeout_tm; /* DGRAM only: timeout timer */
141 struct work_struct timeout_tq; /* DGRAM only: associated queue, we run timers from process context */
142 int timeout_last; /* DGRAM only: current timeout length */
143 int timeout_retries; /* DGRAM only: retries left */
144 struct {
145 size_t len;
146 __u8 data[128];
147 } unexpected_packet;
148 struct backing_dev_info bdi;
149};
150
151extern void ncp_tcp_rcv_proc(struct work_struct *work);
152extern void ncp_tcp_tx_proc(struct work_struct *work);
153extern void ncpdgram_rcv_proc(struct work_struct *work);
154extern void ncpdgram_timeout_proc(struct work_struct *work);
155extern void ncpdgram_timeout_call(unsigned long server);
156extern void ncp_tcp_data_ready(struct sock* sk, int len);
157extern void ncp_tcp_write_space(struct sock* sk);
158extern void ncp_tcp_error_report(struct sock* sk);
159
160#define NCP_FLAG_UTF8 1
161
162#define NCP_CLR_FLAG(server, flag) ((server)->flags &= ~(flag))
163#define NCP_SET_FLAG(server, flag) ((server)->flags |= (flag))
164#define NCP_IS_FLAG(server, flag) ((server)->flags & (flag))
165
166static inline int ncp_conn_valid(struct ncp_server *server)
167{
168 return ((server->conn_status & 0x11) == 0);
169}
170
171static inline void ncp_invalidate_conn(struct ncp_server *server)
172{
173 server->conn_status |= 0x01;
174}
175
176#endif
diff --git a/fs/ncpfs/ncplib_kernel.c b/fs/ncpfs/ncplib_kernel.c
index a95615a0b6a..981a95617fc 100644
--- a/fs/ncpfs/ncplib_kernel.c
+++ b/fs/ncpfs/ncplib_kernel.c
@@ -11,7 +11,7 @@
11 11
12 12
13 13
14#include "ncplib_kernel.h" 14#include "ncp_fs.h"
15 15
16static inline void assert_server_locked(struct ncp_server *server) 16static inline void assert_server_locked(struct ncp_server *server)
17{ 17{
diff --git a/fs/ncpfs/ncplib_kernel.h b/fs/ncpfs/ncplib_kernel.h
index 3c57eca634c..09881e6aa5a 100644
--- a/fs/ncpfs/ncplib_kernel.h
+++ b/fs/ncpfs/ncplib_kernel.h
@@ -32,8 +32,6 @@
32#include <linux/ctype.h> 32#include <linux/ctype.h>
33#endif /* CONFIG_NCPFS_NLS */ 33#endif /* CONFIG_NCPFS_NLS */
34 34
35#include <linux/ncp_fs.h>
36
37#define NCP_MIN_SYMLINK_SIZE 8 35#define NCP_MIN_SYMLINK_SIZE 8
38#define NCP_MAX_SYMLINK_SIZE 512 36#define NCP_MAX_SYMLINK_SIZE 512
39 37
@@ -135,7 +133,7 @@ int ncp__vol2io(struct ncp_server *, unsigned char *, unsigned int *,
135 const unsigned char *, unsigned int, int); 133 const unsigned char *, unsigned int, int);
136 134
137#define NCP_ESC ':' 135#define NCP_ESC ':'
138#define NCP_IO_TABLE(dentry) (NCP_SERVER((dentry)->d_inode)->nls_io) 136#define NCP_IO_TABLE(sb) (NCP_SBP(sb)->nls_io)
139#define ncp_tolower(t, c) nls_tolower(t, c) 137#define ncp_tolower(t, c) nls_tolower(t, c)
140#define ncp_toupper(t, c) nls_toupper(t, c) 138#define ncp_toupper(t, c) nls_toupper(t, c)
141#define ncp_strnicmp(t, s1, s2, len) \ 139#define ncp_strnicmp(t, s1, s2, len) \
@@ -150,15 +148,15 @@ int ncp__io2vol(unsigned char *, unsigned int *,
150int ncp__vol2io(unsigned char *, unsigned int *, 148int ncp__vol2io(unsigned char *, unsigned int *,
151 const unsigned char *, unsigned int, int); 149 const unsigned char *, unsigned int, int);
152 150
153#define NCP_IO_TABLE(dentry) NULL 151#define NCP_IO_TABLE(sb) NULL
154#define ncp_tolower(t, c) tolower(c) 152#define ncp_tolower(t, c) tolower(c)
155#define ncp_toupper(t, c) toupper(c) 153#define ncp_toupper(t, c) toupper(c)
156#define ncp_io2vol(S,m,i,n,k,U) ncp__io2vol(m,i,n,k,U) 154#define ncp_io2vol(S,m,i,n,k,U) ncp__io2vol(m,i,n,k,U)
157#define ncp_vol2io(S,m,i,n,k,U) ncp__vol2io(m,i,n,k,U) 155#define ncp_vol2io(S,m,i,n,k,U) ncp__vol2io(m,i,n,k,U)
158 156
159 157
160static inline int ncp_strnicmp(struct nls_table *t, const unsigned char *s1, 158static inline int ncp_strnicmp(const struct nls_table *t,
161 const unsigned char *s2, int len) 159 const unsigned char *s1, const unsigned char *s2, int len)
162{ 160{
163 while (len--) { 161 while (len--) {
164 if (tolower(*s1++) != tolower(*s2++)) 162 if (tolower(*s1++) != tolower(*s2++))
@@ -193,7 +191,7 @@ ncp_renew_dentries(struct dentry *parent)
193 struct list_head *next; 191 struct list_head *next;
194 struct dentry *dentry; 192 struct dentry *dentry;
195 193
196 spin_lock(&dcache_lock); 194 spin_lock(&parent->d_lock);
197 next = parent->d_subdirs.next; 195 next = parent->d_subdirs.next;
198 while (next != &parent->d_subdirs) { 196 while (next != &parent->d_subdirs) {
199 dentry = list_entry(next, struct dentry, d_u.d_child); 197 dentry = list_entry(next, struct dentry, d_u.d_child);
@@ -205,7 +203,7 @@ ncp_renew_dentries(struct dentry *parent)
205 203
206 next = next->next; 204 next = next->next;
207 } 205 }
208 spin_unlock(&dcache_lock); 206 spin_unlock(&parent->d_lock);
209} 207}
210 208
211static inline void 209static inline void
@@ -215,7 +213,7 @@ ncp_invalidate_dircache_entries(struct dentry *parent)
215 struct list_head *next; 213 struct list_head *next;
216 struct dentry *dentry; 214 struct dentry *dentry;
217 215
218 spin_lock(&dcache_lock); 216 spin_lock(&parent->d_lock);
219 next = parent->d_subdirs.next; 217 next = parent->d_subdirs.next;
220 while (next != &parent->d_subdirs) { 218 while (next != &parent->d_subdirs) {
221 dentry = list_entry(next, struct dentry, d_u.d_child); 219 dentry = list_entry(next, struct dentry, d_u.d_child);
@@ -223,7 +221,7 @@ ncp_invalidate_dircache_entries(struct dentry *parent)
223 ncp_age_dentry(server, dentry); 221 ncp_age_dentry(server, dentry);
224 next = next->next; 222 next = next->next;
225 } 223 }
226 spin_unlock(&dcache_lock); 224 spin_unlock(&parent->d_lock);
227} 225}
228 226
229struct ncp_cache_head { 227struct ncp_cache_head {
diff --git a/fs/ncpfs/ncpsign_kernel.c b/fs/ncpfs/ncpsign_kernel.c
index d8b2d7e6910..08907599dcd 100644
--- a/fs/ncpfs/ncpsign_kernel.c
+++ b/fs/ncpfs/ncpsign_kernel.c
@@ -11,6 +11,7 @@
11#include <linux/string.h> 11#include <linux/string.h>
12#include <linux/ncp.h> 12#include <linux/ncp.h>
13#include <linux/bitops.h> 13#include <linux/bitops.h>
14#include "ncp_fs.h"
14#include "ncpsign_kernel.h" 15#include "ncpsign_kernel.h"
15 16
16/* i386: 32-bit, little endian, handles mis-alignment */ 17/* i386: 32-bit, little endian, handles mis-alignment */
diff --git a/fs/ncpfs/ncpsign_kernel.h b/fs/ncpfs/ncpsign_kernel.h
index 6451a68381c..d9a1438bb1f 100644
--- a/fs/ncpfs/ncpsign_kernel.h
+++ b/fs/ncpfs/ncpsign_kernel.h
@@ -8,8 +8,6 @@
8#ifndef _NCPSIGN_KERNEL_H 8#ifndef _NCPSIGN_KERNEL_H
9#define _NCPSIGN_KERNEL_H 9#define _NCPSIGN_KERNEL_H
10 10
11#include <linux/ncp_fs.h>
12
13#ifdef CONFIG_NCPFS_PACKET_SIGNING 11#ifdef CONFIG_NCPFS_PACKET_SIGNING
14void __sign_packet(struct ncp_server *server, const char *data, size_t size, __u32 totalsize, void *sign_buff); 12void __sign_packet(struct ncp_server *server, const char *data, size_t size, __u32 totalsize, void *sign_buff);
15int sign_verify_reply(struct ncp_server *server, const char *data, size_t size, __u32 totalsize, const void *sign_buff); 13int sign_verify_reply(struct ncp_server *server, const char *data, size_t size, __u32 totalsize, const void *sign_buff);
diff --git a/fs/ncpfs/sock.c b/fs/ncpfs/sock.c
index 668bd267346..3a1587222c8 100644
--- a/fs/ncpfs/sock.c
+++ b/fs/ncpfs/sock.c
@@ -28,7 +28,7 @@
28#include <linux/poll.h> 28#include <linux/poll.h>
29#include <linux/file.h> 29#include <linux/file.h>
30 30
31#include <linux/ncp_fs.h> 31#include "ncp_fs.h"
32 32
33#include "ncpsign_kernel.h" 33#include "ncpsign_kernel.h"
34 34
diff --git a/fs/ncpfs/symlink.c b/fs/ncpfs/symlink.c
index c634fd17b33..661f861d80c 100644
--- a/fs/ncpfs/symlink.c
+++ b/fs/ncpfs/symlink.c
@@ -25,13 +25,11 @@
25 25
26#include <linux/errno.h> 26#include <linux/errno.h>
27#include <linux/fs.h> 27#include <linux/fs.h>
28#include <linux/ncp_fs.h>
29#include <linux/time.h> 28#include <linux/time.h>
30#include <linux/slab.h> 29#include <linux/slab.h>
31#include <linux/mm.h> 30#include <linux/mm.h>
32#include <linux/stat.h> 31#include <linux/stat.h>
33#include "ncplib_kernel.h" 32#include "ncp_fs.h"
34
35 33
36/* these magic numbers must appear in the symlink file -- this makes it a bit 34/* these magic numbers must appear in the symlink file -- this makes it a bit
37 more resilient against the magic attributes being set on random files. */ 35 more resilient against the magic attributes being set on random files. */
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index aeec017fe81..e3d29426905 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -9,7 +9,6 @@
9#include <linux/completion.h> 9#include <linux/completion.h>
10#include <linux/ip.h> 10#include <linux/ip.h>
11#include <linux/module.h> 11#include <linux/module.h>
12#include <linux/smp_lock.h>
13#include <linux/sunrpc/svc.h> 12#include <linux/sunrpc/svc.h>
14#include <linux/sunrpc/svcsock.h> 13#include <linux/sunrpc/svcsock.h>
15#include <linux/nfs_fs.h> 14#include <linux/nfs_fs.h>
@@ -17,9 +16,7 @@
17#include <linux/freezer.h> 16#include <linux/freezer.h>
18#include <linux/kthread.h> 17#include <linux/kthread.h>
19#include <linux/sunrpc/svcauth_gss.h> 18#include <linux/sunrpc/svcauth_gss.h>
20#if defined(CONFIG_NFS_V4_1)
21#include <linux/sunrpc/bc_xprt.h> 19#include <linux/sunrpc/bc_xprt.h>
22#endif
23 20
24#include <net/inet_sock.h> 21#include <net/inet_sock.h>
25 22
@@ -178,30 +175,38 @@ nfs41_callback_svc(void *vrqstp)
178struct svc_rqst * 175struct svc_rqst *
179nfs41_callback_up(struct svc_serv *serv, struct rpc_xprt *xprt) 176nfs41_callback_up(struct svc_serv *serv, struct rpc_xprt *xprt)
180{ 177{
181 struct svc_xprt *bc_xprt; 178 struct svc_rqst *rqstp;
182 struct svc_rqst *rqstp = ERR_PTR(-ENOMEM); 179 int ret;
183 180
184 dprintk("--> %s\n", __func__); 181 /*
185 /* Create a svc_sock for the service */ 182 * Create an svc_sock for the back channel service that shares the
186 bc_xprt = svc_sock_create(serv, xprt->prot); 183 * fore channel connection.
187 if (!bc_xprt) 184 * Returns the input port (0) and sets the svc_serv bc_xprt on success
185 */
186 ret = svc_create_xprt(serv, "tcp-bc", &init_net, PF_INET, 0,
187 SVC_SOCK_ANONYMOUS);
188 if (ret < 0) {
189 rqstp = ERR_PTR(ret);
188 goto out; 190 goto out;
191 }
189 192
190 /* 193 /*
191 * Save the svc_serv in the transport so that it can 194 * Save the svc_serv in the transport so that it can
192 * be referenced when the session backchannel is initialized 195 * be referenced when the session backchannel is initialized
193 */ 196 */
194 serv->bc_xprt = bc_xprt;
195 xprt->bc_serv = serv; 197 xprt->bc_serv = serv;
196 198
197 INIT_LIST_HEAD(&serv->sv_cb_list); 199 INIT_LIST_HEAD(&serv->sv_cb_list);
198 spin_lock_init(&serv->sv_cb_lock); 200 spin_lock_init(&serv->sv_cb_lock);
199 init_waitqueue_head(&serv->sv_cb_waitq); 201 init_waitqueue_head(&serv->sv_cb_waitq);
200 rqstp = svc_prepare_thread(serv, &serv->sv_pools[0]); 202 rqstp = svc_prepare_thread(serv, &serv->sv_pools[0]);
201 if (IS_ERR(rqstp)) 203 if (IS_ERR(rqstp)) {
202 svc_sock_destroy(bc_xprt); 204 svc_xprt_put(serv->sv_bc_xprt);
205 serv->sv_bc_xprt = NULL;
206 }
203out: 207out:
204 dprintk("--> %s return %p\n", __func__, rqstp); 208 dprintk("--> %s return %ld\n", __func__,
209 IS_ERR(rqstp) ? PTR_ERR(rqstp) : 0);
205 return rqstp; 210 return rqstp;
206} 211}
207 212
@@ -323,58 +328,58 @@ void nfs_callback_down(int minorversion)
323 mutex_unlock(&nfs_callback_mutex); 328 mutex_unlock(&nfs_callback_mutex);
324} 329}
325 330
326static int check_gss_callback_principal(struct nfs_client *clp, 331/* Boolean check of RPC_AUTH_GSS principal */
327 struct svc_rqst *rqstp) 332int
333check_gss_callback_principal(struct nfs_client *clp, struct svc_rqst *rqstp)
328{ 334{
329 struct rpc_clnt *r = clp->cl_rpcclient; 335 struct rpc_clnt *r = clp->cl_rpcclient;
330 char *p = svc_gss_principal(rqstp); 336 char *p = svc_gss_principal(rqstp);
331 337
338 if (rqstp->rq_authop->flavour != RPC_AUTH_GSS)
339 return 1;
340
341 /* No RPC_AUTH_GSS on NFSv4.1 back channel yet */
342 if (clp->cl_minorversion != 0)
343 return 0;
332 /* 344 /*
333 * It might just be a normal user principal, in which case 345 * It might just be a normal user principal, in which case
334 * userspace won't bother to tell us the name at all. 346 * userspace won't bother to tell us the name at all.
335 */ 347 */
336 if (p == NULL) 348 if (p == NULL)
337 return SVC_DENIED; 349 return 0;
338 350
339 /* Expect a GSS_C_NT_HOSTBASED_NAME like "nfs@serverhostname" */ 351 /* Expect a GSS_C_NT_HOSTBASED_NAME like "nfs@serverhostname" */
340 352
341 if (memcmp(p, "nfs@", 4) != 0) 353 if (memcmp(p, "nfs@", 4) != 0)
342 return SVC_DENIED; 354 return 0;
343 p += 4; 355 p += 4;
344 if (strcmp(p, r->cl_server) != 0) 356 if (strcmp(p, r->cl_server) != 0)
345 return SVC_DENIED; 357 return 0;
346 return SVC_OK; 358 return 1;
347} 359}
348 360
361/*
362 * pg_authenticate method for nfsv4 callback threads.
363 *
364 * The authflavor has been negotiated, so an incorrect flavor is a server
365 * bug. Drop packets with incorrect authflavor.
366 *
367 * All other checking done after NFS decoding where the nfs_client can be
368 * found in nfs4_callback_compound
369 */
349static int nfs_callback_authenticate(struct svc_rqst *rqstp) 370static int nfs_callback_authenticate(struct svc_rqst *rqstp)
350{ 371{
351 struct nfs_client *clp;
352 RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]);
353 int ret = SVC_OK;
354
355 /* Don't talk to strangers */
356 clp = nfs_find_client(svc_addr(rqstp), 4);
357 if (clp == NULL)
358 return SVC_DROP;
359
360 dprintk("%s: %s NFSv4 callback!\n", __func__,
361 svc_print_addr(rqstp, buf, sizeof(buf)));
362
363 switch (rqstp->rq_authop->flavour) { 372 switch (rqstp->rq_authop->flavour) {
364 case RPC_AUTH_NULL: 373 case RPC_AUTH_NULL:
365 if (rqstp->rq_proc != CB_NULL) 374 if (rqstp->rq_proc != CB_NULL)
366 ret = SVC_DENIED; 375 return SVC_DROP;
367 break; 376 break;
368 case RPC_AUTH_UNIX: 377 case RPC_AUTH_GSS:
369 break; 378 /* No RPC_AUTH_GSS support yet in NFSv4.1 */
370 case RPC_AUTH_GSS: 379 if (svc_is_backchannel(rqstp))
371 ret = check_gss_callback_principal(clp, rqstp); 380 return SVC_DROP;
372 break;
373 default:
374 ret = SVC_DENIED;
375 } 381 }
376 nfs_put_client(clp); 382 return SVC_OK;
377 return ret;
378} 383}
379 384
380/* 385/*
diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h
index 85a7cfd1b8d..46d93ce7311 100644
--- a/fs/nfs/callback.h
+++ b/fs/nfs/callback.h
@@ -7,6 +7,7 @@
7 */ 7 */
8#ifndef __LINUX_FS_NFS_CALLBACK_H 8#ifndef __LINUX_FS_NFS_CALLBACK_H
9#define __LINUX_FS_NFS_CALLBACK_H 9#define __LINUX_FS_NFS_CALLBACK_H
10#include <linux/sunrpc/svc.h>
10 11
11#define NFS4_CALLBACK 0x40000000 12#define NFS4_CALLBACK 0x40000000
12#define NFS4_CALLBACK_XDRSIZE 2048 13#define NFS4_CALLBACK_XDRSIZE 2048
@@ -34,10 +35,16 @@ enum nfs4_callback_opnum {
34 OP_CB_ILLEGAL = 10044, 35 OP_CB_ILLEGAL = 10044,
35}; 36};
36 37
38struct cb_process_state {
39 __be32 drc_status;
40 struct nfs_client *clp;
41};
42
37struct cb_compound_hdr_arg { 43struct cb_compound_hdr_arg {
38 unsigned int taglen; 44 unsigned int taglen;
39 const char *tag; 45 const char *tag;
40 unsigned int minorversion; 46 unsigned int minorversion;
47 unsigned int cb_ident; /* v4.0 callback identifier */
41 unsigned nops; 48 unsigned nops;
42}; 49};
43 50
@@ -103,14 +110,23 @@ struct cb_sequenceres {
103 uint32_t csr_target_highestslotid; 110 uint32_t csr_target_highestslotid;
104}; 111};
105 112
106extern unsigned nfs4_callback_sequence(struct cb_sequenceargs *args, 113extern __be32 nfs4_callback_sequence(struct cb_sequenceargs *args,
107 struct cb_sequenceres *res); 114 struct cb_sequenceres *res,
115 struct cb_process_state *cps);
108 116
109extern int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation, 117extern int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation,
110 const nfs4_stateid *stateid); 118 const nfs4_stateid *stateid);
111 119
112#define RCA4_TYPE_MASK_RDATA_DLG 0 120#define RCA4_TYPE_MASK_RDATA_DLG 0
113#define RCA4_TYPE_MASK_WDATA_DLG 1 121#define RCA4_TYPE_MASK_WDATA_DLG 1
122#define RCA4_TYPE_MASK_DIR_DLG 2
123#define RCA4_TYPE_MASK_FILE_LAYOUT 3
124#define RCA4_TYPE_MASK_BLK_LAYOUT 4
125#define RCA4_TYPE_MASK_OBJ_LAYOUT_MIN 8
126#define RCA4_TYPE_MASK_OBJ_LAYOUT_MAX 9
127#define RCA4_TYPE_MASK_OTHER_LAYOUT_MIN 12
128#define RCA4_TYPE_MASK_OTHER_LAYOUT_MAX 15
129#define RCA4_TYPE_MASK_ALL 0xf31f
114 130
115struct cb_recallanyargs { 131struct cb_recallanyargs {
116 struct sockaddr *craa_addr; 132 struct sockaddr *craa_addr;
@@ -118,25 +134,52 @@ struct cb_recallanyargs {
118 uint32_t craa_type_mask; 134 uint32_t craa_type_mask;
119}; 135};
120 136
121extern unsigned nfs4_callback_recallany(struct cb_recallanyargs *args, void *dummy); 137extern __be32 nfs4_callback_recallany(struct cb_recallanyargs *args,
138 void *dummy,
139 struct cb_process_state *cps);
122 140
123struct cb_recallslotargs { 141struct cb_recallslotargs {
124 struct sockaddr *crsa_addr; 142 struct sockaddr *crsa_addr;
125 uint32_t crsa_target_max_slots; 143 uint32_t crsa_target_max_slots;
126}; 144};
127extern unsigned nfs4_callback_recallslot(struct cb_recallslotargs *args, 145extern __be32 nfs4_callback_recallslot(struct cb_recallslotargs *args,
128 void *dummy); 146 void *dummy,
129 147 struct cb_process_state *cps);
130#endif /* CONFIG_NFS_V4_1 */ 148
149struct cb_layoutrecallargs {
150 struct sockaddr *cbl_addr;
151 uint32_t cbl_recall_type;
152 uint32_t cbl_layout_type;
153 uint32_t cbl_layoutchanged;
154 union {
155 struct {
156 struct nfs_fh cbl_fh;
157 struct pnfs_layout_range cbl_range;
158 nfs4_stateid cbl_stateid;
159 };
160 struct nfs_fsid cbl_fsid;
161 };
162};
131 163
132extern __be32 nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres *res); 164extern unsigned nfs4_callback_layoutrecall(
133extern __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy); 165 struct cb_layoutrecallargs *args,
166 void *dummy, struct cb_process_state *cps);
134 167
168extern void nfs4_check_drain_bc_complete(struct nfs4_session *ses);
169extern void nfs4_cb_take_slot(struct nfs_client *clp);
170#endif /* CONFIG_NFS_V4_1 */
171extern int check_gss_callback_principal(struct nfs_client *, struct svc_rqst *);
172extern __be32 nfs4_callback_getattr(struct cb_getattrargs *args,
173 struct cb_getattrres *res,
174 struct cb_process_state *cps);
175extern __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy,
176 struct cb_process_state *cps);
135#ifdef CONFIG_NFS_V4 177#ifdef CONFIG_NFS_V4
136extern int nfs_callback_up(u32 minorversion, struct rpc_xprt *xprt); 178extern int nfs_callback_up(u32 minorversion, struct rpc_xprt *xprt);
137extern void nfs_callback_down(int minorversion); 179extern void nfs_callback_down(int minorversion);
138extern int nfs4_validate_delegation_stateid(struct nfs_delegation *delegation, 180extern int nfs4_validate_delegation_stateid(struct nfs_delegation *delegation,
139 const nfs4_stateid *stateid); 181 const nfs4_stateid *stateid);
182extern int nfs4_set_callback_sessionid(struct nfs_client *clp);
140#endif /* CONFIG_NFS_V4 */ 183#endif /* CONFIG_NFS_V4 */
141/* 184/*
142 * nfs41: Callbacks are expected to not cause substantial latency, 185 * nfs41: Callbacks are expected to not cause substantial latency,
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 2950fca0c61..89587573fe5 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -12,30 +12,33 @@
12#include "callback.h" 12#include "callback.h"
13#include "delegation.h" 13#include "delegation.h"
14#include "internal.h" 14#include "internal.h"
15#include "pnfs.h"
15 16
16#ifdef NFS_DEBUG 17#ifdef NFS_DEBUG
17#define NFSDBG_FACILITY NFSDBG_CALLBACK 18#define NFSDBG_FACILITY NFSDBG_CALLBACK
18#endif 19#endif
19 20
20__be32 nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres *res) 21__be32 nfs4_callback_getattr(struct cb_getattrargs *args,
22 struct cb_getattrres *res,
23 struct cb_process_state *cps)
21{ 24{
22 struct nfs_client *clp;
23 struct nfs_delegation *delegation; 25 struct nfs_delegation *delegation;
24 struct nfs_inode *nfsi; 26 struct nfs_inode *nfsi;
25 struct inode *inode; 27 struct inode *inode;
26 28
29 res->status = htonl(NFS4ERR_OP_NOT_IN_SESSION);
30 if (!cps->clp) /* Always set for v4.0. Set in cb_sequence for v4.1 */
31 goto out;
32
27 res->bitmap[0] = res->bitmap[1] = 0; 33 res->bitmap[0] = res->bitmap[1] = 0;
28 res->status = htonl(NFS4ERR_BADHANDLE); 34 res->status = htonl(NFS4ERR_BADHANDLE);
29 clp = nfs_find_client(args->addr, 4);
30 if (clp == NULL)
31 goto out;
32 35
33 dprintk("NFS: GETATTR callback request from %s\n", 36 dprintk("NFS: GETATTR callback request from %s\n",
34 rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR)); 37 rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR));
35 38
36 inode = nfs_delegation_find_inode(clp, &args->fh); 39 inode = nfs_delegation_find_inode(cps->clp, &args->fh);
37 if (inode == NULL) 40 if (inode == NULL)
38 goto out_putclient; 41 goto out;
39 nfsi = NFS_I(inode); 42 nfsi = NFS_I(inode);
40 rcu_read_lock(); 43 rcu_read_lock();
41 delegation = rcu_dereference(nfsi->delegation); 44 delegation = rcu_dereference(nfsi->delegation);
@@ -55,49 +58,41 @@ __be32 nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres *
55out_iput: 58out_iput:
56 rcu_read_unlock(); 59 rcu_read_unlock();
57 iput(inode); 60 iput(inode);
58out_putclient:
59 nfs_put_client(clp);
60out: 61out:
61 dprintk("%s: exit with status = %d\n", __func__, ntohl(res->status)); 62 dprintk("%s: exit with status = %d\n", __func__, ntohl(res->status));
62 return res->status; 63 return res->status;
63} 64}
64 65
65__be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy) 66__be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy,
67 struct cb_process_state *cps)
66{ 68{
67 struct nfs_client *clp;
68 struct inode *inode; 69 struct inode *inode;
69 __be32 res; 70 __be32 res;
70 71
71 res = htonl(NFS4ERR_BADHANDLE); 72 res = htonl(NFS4ERR_OP_NOT_IN_SESSION);
72 clp = nfs_find_client(args->addr, 4); 73 if (!cps->clp) /* Always set for v4.0. Set in cb_sequence for v4.1 */
73 if (clp == NULL)
74 goto out; 74 goto out;
75 75
76 dprintk("NFS: RECALL callback request from %s\n", 76 dprintk("NFS: RECALL callback request from %s\n",
77 rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR)); 77 rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR));
78 78
79 do { 79 res = htonl(NFS4ERR_BADHANDLE);
80 struct nfs_client *prev = clp; 80 inode = nfs_delegation_find_inode(cps->clp, &args->fh);
81 81 if (inode == NULL)
82 inode = nfs_delegation_find_inode(clp, &args->fh); 82 goto out;
83 if (inode != NULL) { 83 /* Set up a helper thread to actually return the delegation */
84 /* Set up a helper thread to actually return the delegation */ 84 switch (nfs_async_inode_return_delegation(inode, &args->stateid)) {
85 switch (nfs_async_inode_return_delegation(inode, &args->stateid)) { 85 case 0:
86 case 0: 86 res = 0;
87 res = 0; 87 break;
88 break; 88 case -ENOENT:
89 case -ENOENT: 89 if (res != 0)
90 if (res != 0) 90 res = htonl(NFS4ERR_BAD_STATEID);
91 res = htonl(NFS4ERR_BAD_STATEID); 91 break;
92 break; 92 default:
93 default: 93 res = htonl(NFS4ERR_RESOURCE);
94 res = htonl(NFS4ERR_RESOURCE); 94 }
95 } 95 iput(inode);
96 iput(inode);
97 }
98 clp = nfs_find_client_next(prev);
99 nfs_put_client(prev);
100 } while (clp != NULL);
101out: 96out:
102 dprintk("%s: exit with status = %d\n", __func__, ntohl(res)); 97 dprintk("%s: exit with status = %d\n", __func__, ntohl(res));
103 return res; 98 return res;
@@ -113,6 +108,139 @@ int nfs4_validate_delegation_stateid(struct nfs_delegation *delegation, const nf
113 108
114#if defined(CONFIG_NFS_V4_1) 109#if defined(CONFIG_NFS_V4_1)
115 110
111static u32 initiate_file_draining(struct nfs_client *clp,
112 struct cb_layoutrecallargs *args)
113{
114 struct pnfs_layout_hdr *lo;
115 struct inode *ino;
116 bool found = false;
117 u32 rv = NFS4ERR_NOMATCHING_LAYOUT;
118 LIST_HEAD(free_me_list);
119
120 spin_lock(&clp->cl_lock);
121 list_for_each_entry(lo, &clp->cl_layouts, plh_layouts) {
122 if (nfs_compare_fh(&args->cbl_fh,
123 &NFS_I(lo->plh_inode)->fh))
124 continue;
125 ino = igrab(lo->plh_inode);
126 if (!ino)
127 continue;
128 found = true;
129 /* Without this, layout can be freed as soon
130 * as we release cl_lock.
131 */
132 get_layout_hdr(lo);
133 break;
134 }
135 spin_unlock(&clp->cl_lock);
136 if (!found)
137 return NFS4ERR_NOMATCHING_LAYOUT;
138
139 spin_lock(&ino->i_lock);
140 if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) ||
141 mark_matching_lsegs_invalid(lo, &free_me_list,
142 args->cbl_range.iomode))
143 rv = NFS4ERR_DELAY;
144 else
145 rv = NFS4ERR_NOMATCHING_LAYOUT;
146 pnfs_set_layout_stateid(lo, &args->cbl_stateid, true);
147 spin_unlock(&ino->i_lock);
148 pnfs_free_lseg_list(&free_me_list);
149 put_layout_hdr(lo);
150 iput(ino);
151 return rv;
152}
153
154static u32 initiate_bulk_draining(struct nfs_client *clp,
155 struct cb_layoutrecallargs *args)
156{
157 struct pnfs_layout_hdr *lo;
158 struct inode *ino;
159 u32 rv = NFS4ERR_NOMATCHING_LAYOUT;
160 struct pnfs_layout_hdr *tmp;
161 LIST_HEAD(recall_list);
162 LIST_HEAD(free_me_list);
163 struct pnfs_layout_range range = {
164 .iomode = IOMODE_ANY,
165 .offset = 0,
166 .length = NFS4_MAX_UINT64,
167 };
168
169 spin_lock(&clp->cl_lock);
170 list_for_each_entry(lo, &clp->cl_layouts, plh_layouts) {
171 if ((args->cbl_recall_type == RETURN_FSID) &&
172 memcmp(&NFS_SERVER(lo->plh_inode)->fsid,
173 &args->cbl_fsid, sizeof(struct nfs_fsid)))
174 continue;
175 if (!igrab(lo->plh_inode))
176 continue;
177 get_layout_hdr(lo);
178 BUG_ON(!list_empty(&lo->plh_bulk_recall));
179 list_add(&lo->plh_bulk_recall, &recall_list);
180 }
181 spin_unlock(&clp->cl_lock);
182 list_for_each_entry_safe(lo, tmp,
183 &recall_list, plh_bulk_recall) {
184 ino = lo->plh_inode;
185 spin_lock(&ino->i_lock);
186 set_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
187 if (mark_matching_lsegs_invalid(lo, &free_me_list, range.iomode))
188 rv = NFS4ERR_DELAY;
189 list_del_init(&lo->plh_bulk_recall);
190 spin_unlock(&ino->i_lock);
191 put_layout_hdr(lo);
192 iput(ino);
193 }
194 pnfs_free_lseg_list(&free_me_list);
195 return rv;
196}
197
198static u32 do_callback_layoutrecall(struct nfs_client *clp,
199 struct cb_layoutrecallargs *args)
200{
201 u32 res = NFS4ERR_DELAY;
202
203 dprintk("%s enter, type=%i\n", __func__, args->cbl_recall_type);
204 if (test_and_set_bit(NFS4CLNT_LAYOUTRECALL, &clp->cl_state))
205 goto out;
206 if (args->cbl_recall_type == RETURN_FILE)
207 res = initiate_file_draining(clp, args);
208 else
209 res = initiate_bulk_draining(clp, args);
210 clear_bit(NFS4CLNT_LAYOUTRECALL, &clp->cl_state);
211out:
212 dprintk("%s returning %i\n", __func__, res);
213 return res;
214
215}
216
217__be32 nfs4_callback_layoutrecall(struct cb_layoutrecallargs *args,
218 void *dummy, struct cb_process_state *cps)
219{
220 u32 res;
221
222 dprintk("%s: -->\n", __func__);
223
224 if (cps->clp)
225 res = do_callback_layoutrecall(cps->clp, args);
226 else
227 res = NFS4ERR_OP_NOT_IN_SESSION;
228
229 dprintk("%s: exit with status = %d\n", __func__, res);
230 return cpu_to_be32(res);
231}
232
233static void pnfs_recall_all_layouts(struct nfs_client *clp)
234{
235 struct cb_layoutrecallargs args;
236
237 /* Pretend we got a CB_LAYOUTRECALL(ALL) */
238 memset(&args, 0, sizeof(args));
239 args.cbl_recall_type = RETURN_ALL;
240 /* FIXME we ignore errors, what should we do? */
241 do_callback_layoutrecall(clp, &args);
242}
243
116int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation, const nfs4_stateid *stateid) 244int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation, const nfs4_stateid *stateid)
117{ 245{
118 if (delegation == NULL) 246 if (delegation == NULL)
@@ -185,42 +313,6 @@ validate_seqid(struct nfs4_slot_table *tbl, struct cb_sequenceargs * args)
185} 313}
186 314
187/* 315/*
188 * Returns a pointer to a held 'struct nfs_client' that matches the server's
189 * address, major version number, and session ID. It is the caller's
190 * responsibility to release the returned reference.
191 *
192 * Returns NULL if there are no connections with sessions, or if no session
193 * matches the one of interest.
194 */
195 static struct nfs_client *find_client_with_session(
196 const struct sockaddr *addr, u32 nfsversion,
197 struct nfs4_sessionid *sessionid)
198{
199 struct nfs_client *clp;
200
201 clp = nfs_find_client(addr, 4);
202 if (clp == NULL)
203 return NULL;
204
205 do {
206 struct nfs_client *prev = clp;
207
208 if (clp->cl_session != NULL) {
209 if (memcmp(clp->cl_session->sess_id.data,
210 sessionid->data,
211 NFS4_MAX_SESSIONID_LEN) == 0) {
212 /* Returns a held reference to clp */
213 return clp;
214 }
215 }
216 clp = nfs_find_client_next(prev);
217 nfs_put_client(prev);
218 } while (clp != NULL);
219
220 return NULL;
221}
222
223/*
224 * For each referring call triple, check the session's slot table for 316 * For each referring call triple, check the session's slot table for
225 * a match. If the slot is in use and the sequence numbers match, the 317 * a match. If the slot is in use and the sequence numbers match, the
226 * client is still waiting for a response to the original request. 318 * client is still waiting for a response to the original request.
@@ -276,20 +368,28 @@ out:
276} 368}
277 369
278__be32 nfs4_callback_sequence(struct cb_sequenceargs *args, 370__be32 nfs4_callback_sequence(struct cb_sequenceargs *args,
279 struct cb_sequenceres *res) 371 struct cb_sequenceres *res,
372 struct cb_process_state *cps)
280{ 373{
281 struct nfs_client *clp; 374 struct nfs_client *clp;
282 int i; 375 int i;
283 __be32 status; 376 __be32 status = htonl(NFS4ERR_BADSESSION);
284 377
285 status = htonl(NFS4ERR_BADSESSION); 378 cps->clp = NULL;
286 clp = find_client_with_session(args->csa_addr, 4, &args->csa_sessionid); 379
380 clp = nfs4_find_client_sessionid(args->csa_addr, &args->csa_sessionid);
287 if (clp == NULL) 381 if (clp == NULL)
288 goto out; 382 goto out;
289 383
384 /* state manager is resetting the session */
385 if (test_bit(NFS4_SESSION_DRAINING, &clp->cl_session->session_state)) {
386 status = NFS4ERR_DELAY;
387 goto out;
388 }
389
290 status = validate_seqid(&clp->cl_session->bc_slot_table, args); 390 status = validate_seqid(&clp->cl_session->bc_slot_table, args);
291 if (status) 391 if (status)
292 goto out_putclient; 392 goto out;
293 393
294 /* 394 /*
295 * Check for pending referring calls. If a match is found, a 395 * Check for pending referring calls. If a match is found, a
@@ -298,7 +398,7 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args,
298 */ 398 */
299 if (referring_call_exists(clp, args->csa_nrclists, args->csa_rclists)) { 399 if (referring_call_exists(clp, args->csa_nrclists, args->csa_rclists)) {
300 status = htonl(NFS4ERR_DELAY); 400 status = htonl(NFS4ERR_DELAY);
301 goto out_putclient; 401 goto out;
302 } 402 }
303 403
304 memcpy(&res->csr_sessionid, &args->csa_sessionid, 404 memcpy(&res->csr_sessionid, &args->csa_sessionid,
@@ -307,83 +407,93 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args,
307 res->csr_slotid = args->csa_slotid; 407 res->csr_slotid = args->csa_slotid;
308 res->csr_highestslotid = NFS41_BC_MAX_CALLBACKS - 1; 408 res->csr_highestslotid = NFS41_BC_MAX_CALLBACKS - 1;
309 res->csr_target_highestslotid = NFS41_BC_MAX_CALLBACKS - 1; 409 res->csr_target_highestslotid = NFS41_BC_MAX_CALLBACKS - 1;
410 nfs4_cb_take_slot(clp);
310 411
311out_putclient:
312 nfs_put_client(clp);
313out: 412out:
413 cps->clp = clp; /* put in nfs4_callback_compound */
314 for (i = 0; i < args->csa_nrclists; i++) 414 for (i = 0; i < args->csa_nrclists; i++)
315 kfree(args->csa_rclists[i].rcl_refcalls); 415 kfree(args->csa_rclists[i].rcl_refcalls);
316 kfree(args->csa_rclists); 416 kfree(args->csa_rclists);
317 417
318 if (status == htonl(NFS4ERR_RETRY_UNCACHED_REP)) 418 if (status == htonl(NFS4ERR_RETRY_UNCACHED_REP)) {
319 res->csr_status = 0; 419 cps->drc_status = status;
320 else 420 status = 0;
421 } else
321 res->csr_status = status; 422 res->csr_status = status;
423
322 dprintk("%s: exit with status = %d res->csr_status %d\n", __func__, 424 dprintk("%s: exit with status = %d res->csr_status %d\n", __func__,
323 ntohl(status), ntohl(res->csr_status)); 425 ntohl(status), ntohl(res->csr_status));
324 return status; 426 return status;
325} 427}
326 428
327__be32 nfs4_callback_recallany(struct cb_recallanyargs *args, void *dummy) 429static bool
430validate_bitmap_values(unsigned long mask)
431{
432 return (mask & ~RCA4_TYPE_MASK_ALL) == 0;
433}
434
435__be32 nfs4_callback_recallany(struct cb_recallanyargs *args, void *dummy,
436 struct cb_process_state *cps)
328{ 437{
329 struct nfs_client *clp;
330 __be32 status; 438 __be32 status;
331 fmode_t flags = 0; 439 fmode_t flags = 0;
332 440
333 status = htonl(NFS4ERR_OP_NOT_IN_SESSION); 441 status = cpu_to_be32(NFS4ERR_OP_NOT_IN_SESSION);
334 clp = nfs_find_client(args->craa_addr, 4); 442 if (!cps->clp) /* set in cb_sequence */
335 if (clp == NULL)
336 goto out; 443 goto out;
337 444
338 dprintk("NFS: RECALL_ANY callback request from %s\n", 445 dprintk("NFS: RECALL_ANY callback request from %s\n",
339 rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR)); 446 rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR));
447
448 status = cpu_to_be32(NFS4ERR_INVAL);
449 if (!validate_bitmap_values(args->craa_type_mask))
450 goto out;
340 451
452 status = cpu_to_be32(NFS4_OK);
341 if (test_bit(RCA4_TYPE_MASK_RDATA_DLG, (const unsigned long *) 453 if (test_bit(RCA4_TYPE_MASK_RDATA_DLG, (const unsigned long *)
342 &args->craa_type_mask)) 454 &args->craa_type_mask))
343 flags = FMODE_READ; 455 flags = FMODE_READ;
344 if (test_bit(RCA4_TYPE_MASK_WDATA_DLG, (const unsigned long *) 456 if (test_bit(RCA4_TYPE_MASK_WDATA_DLG, (const unsigned long *)
345 &args->craa_type_mask)) 457 &args->craa_type_mask))
346 flags |= FMODE_WRITE; 458 flags |= FMODE_WRITE;
347 459 if (test_bit(RCA4_TYPE_MASK_FILE_LAYOUT, (const unsigned long *)
460 &args->craa_type_mask))
461 pnfs_recall_all_layouts(cps->clp);
348 if (flags) 462 if (flags)
349 nfs_expire_all_delegation_types(clp, flags); 463 nfs_expire_all_delegation_types(cps->clp, flags);
350 status = htonl(NFS4_OK);
351out: 464out:
352 dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); 465 dprintk("%s: exit with status = %d\n", __func__, ntohl(status));
353 return status; 466 return status;
354} 467}
355 468
356/* Reduce the fore channel's max_slots to the target value */ 469/* Reduce the fore channel's max_slots to the target value */
357__be32 nfs4_callback_recallslot(struct cb_recallslotargs *args, void *dummy) 470__be32 nfs4_callback_recallslot(struct cb_recallslotargs *args, void *dummy,
471 struct cb_process_state *cps)
358{ 472{
359 struct nfs_client *clp;
360 struct nfs4_slot_table *fc_tbl; 473 struct nfs4_slot_table *fc_tbl;
361 __be32 status; 474 __be32 status;
362 475
363 status = htonl(NFS4ERR_OP_NOT_IN_SESSION); 476 status = htonl(NFS4ERR_OP_NOT_IN_SESSION);
364 clp = nfs_find_client(args->crsa_addr, 4); 477 if (!cps->clp) /* set in cb_sequence */
365 if (clp == NULL)
366 goto out; 478 goto out;
367 479
368 dprintk("NFS: CB_RECALL_SLOT request from %s target max slots %d\n", 480 dprintk("NFS: CB_RECALL_SLOT request from %s target max slots %d\n",
369 rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR), 481 rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR),
370 args->crsa_target_max_slots); 482 args->crsa_target_max_slots);
371 483
372 fc_tbl = &clp->cl_session->fc_slot_table; 484 fc_tbl = &cps->clp->cl_session->fc_slot_table;
373 485
374 status = htonl(NFS4ERR_BAD_HIGH_SLOT); 486 status = htonl(NFS4ERR_BAD_HIGH_SLOT);
375 if (args->crsa_target_max_slots > fc_tbl->max_slots || 487 if (args->crsa_target_max_slots > fc_tbl->max_slots ||
376 args->crsa_target_max_slots < 1) 488 args->crsa_target_max_slots < 1)
377 goto out_putclient; 489 goto out;
378 490
379 status = htonl(NFS4_OK); 491 status = htonl(NFS4_OK);
380 if (args->crsa_target_max_slots == fc_tbl->max_slots) 492 if (args->crsa_target_max_slots == fc_tbl->max_slots)
381 goto out_putclient; 493 goto out;
382 494
383 fc_tbl->target_max_slots = args->crsa_target_max_slots; 495 fc_tbl->target_max_slots = args->crsa_target_max_slots;
384 nfs41_handle_recall_slot(clp); 496 nfs41_handle_recall_slot(cps->clp);
385out_putclient:
386 nfs_put_client(clp); /* balance nfs_find_client */
387out: 497out:
388 dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); 498 dprintk("%s: exit with status = %d\n", __func__, ntohl(status));
389 return status; 499 return status;
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index 05af212f0ed..14e0f9371d1 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -10,8 +10,10 @@
10#include <linux/nfs4.h> 10#include <linux/nfs4.h>
11#include <linux/nfs_fs.h> 11#include <linux/nfs_fs.h>
12#include <linux/slab.h> 12#include <linux/slab.h>
13#include <linux/sunrpc/bc_xprt.h>
13#include "nfs4_fs.h" 14#include "nfs4_fs.h"
14#include "callback.h" 15#include "callback.h"
16#include "internal.h"
15 17
16#define CB_OP_TAGLEN_MAXSZ (512) 18#define CB_OP_TAGLEN_MAXSZ (512)
17#define CB_OP_HDR_RES_MAXSZ (2 + CB_OP_TAGLEN_MAXSZ) 19#define CB_OP_HDR_RES_MAXSZ (2 + CB_OP_TAGLEN_MAXSZ)
@@ -22,6 +24,7 @@
22#define CB_OP_RECALL_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ) 24#define CB_OP_RECALL_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ)
23 25
24#if defined(CONFIG_NFS_V4_1) 26#if defined(CONFIG_NFS_V4_1)
27#define CB_OP_LAYOUTRECALL_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ)
25#define CB_OP_SEQUENCE_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ + \ 28#define CB_OP_SEQUENCE_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ + \
26 4 + 1 + 3) 29 4 + 1 + 3)
27#define CB_OP_RECALLANY_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ) 30#define CB_OP_RECALLANY_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ)
@@ -33,7 +36,8 @@
33/* Internal error code */ 36/* Internal error code */
34#define NFS4ERR_RESOURCE_HDR 11050 37#define NFS4ERR_RESOURCE_HDR 11050
35 38
36typedef __be32 (*callback_process_op_t)(void *, void *); 39typedef __be32 (*callback_process_op_t)(void *, void *,
40 struct cb_process_state *);
37typedef __be32 (*callback_decode_arg_t)(struct svc_rqst *, struct xdr_stream *, void *); 41typedef __be32 (*callback_decode_arg_t)(struct svc_rqst *, struct xdr_stream *, void *);
38typedef __be32 (*callback_encode_res_t)(struct svc_rqst *, struct xdr_stream *, void *); 42typedef __be32 (*callback_encode_res_t)(struct svc_rqst *, struct xdr_stream *, void *);
39 43
@@ -160,7 +164,7 @@ static __be32 decode_compound_hdr_arg(struct xdr_stream *xdr, struct cb_compound
160 hdr->minorversion = ntohl(*p++); 164 hdr->minorversion = ntohl(*p++);
161 /* Check minor version is zero or one. */ 165 /* Check minor version is zero or one. */
162 if (hdr->minorversion <= 1) { 166 if (hdr->minorversion <= 1) {
163 p++; /* skip callback_ident */ 167 hdr->cb_ident = ntohl(*p++); /* ignored by v4.1 */
164 } else { 168 } else {
165 printk(KERN_WARNING "%s: NFSv4 server callback with " 169 printk(KERN_WARNING "%s: NFSv4 server callback with "
166 "illegal minor version %u!\n", 170 "illegal minor version %u!\n",
@@ -220,6 +224,66 @@ out:
220 224
221#if defined(CONFIG_NFS_V4_1) 225#if defined(CONFIG_NFS_V4_1)
222 226
227static __be32 decode_layoutrecall_args(struct svc_rqst *rqstp,
228 struct xdr_stream *xdr,
229 struct cb_layoutrecallargs *args)
230{
231 __be32 *p;
232 __be32 status = 0;
233 uint32_t iomode;
234
235 args->cbl_addr = svc_addr(rqstp);
236 p = read_buf(xdr, 4 * sizeof(uint32_t));
237 if (unlikely(p == NULL)) {
238 status = htonl(NFS4ERR_BADXDR);
239 goto out;
240 }
241
242 args->cbl_layout_type = ntohl(*p++);
243 /* Despite the spec's xdr, iomode really belongs in the FILE switch,
244 * as it is unusable and ignored with the other types.
245 */
246 iomode = ntohl(*p++);
247 args->cbl_layoutchanged = ntohl(*p++);
248 args->cbl_recall_type = ntohl(*p++);
249
250 if (args->cbl_recall_type == RETURN_FILE) {
251 args->cbl_range.iomode = iomode;
252 status = decode_fh(xdr, &args->cbl_fh);
253 if (unlikely(status != 0))
254 goto out;
255
256 p = read_buf(xdr, 2 * sizeof(uint64_t));
257 if (unlikely(p == NULL)) {
258 status = htonl(NFS4ERR_BADXDR);
259 goto out;
260 }
261 p = xdr_decode_hyper(p, &args->cbl_range.offset);
262 p = xdr_decode_hyper(p, &args->cbl_range.length);
263 status = decode_stateid(xdr, &args->cbl_stateid);
264 if (unlikely(status != 0))
265 goto out;
266 } else if (args->cbl_recall_type == RETURN_FSID) {
267 p = read_buf(xdr, 2 * sizeof(uint64_t));
268 if (unlikely(p == NULL)) {
269 status = htonl(NFS4ERR_BADXDR);
270 goto out;
271 }
272 p = xdr_decode_hyper(p, &args->cbl_fsid.major);
273 p = xdr_decode_hyper(p, &args->cbl_fsid.minor);
274 } else if (args->cbl_recall_type != RETURN_ALL) {
275 status = htonl(NFS4ERR_BADXDR);
276 goto out;
277 }
278 dprintk("%s: ltype 0x%x iomode %d changed %d recall_type %d\n",
279 __func__,
280 args->cbl_layout_type, iomode,
281 args->cbl_layoutchanged, args->cbl_recall_type);
282out:
283 dprintk("%s: exit with status = %d\n", __func__, ntohl(status));
284 return status;
285}
286
223static __be32 decode_sessionid(struct xdr_stream *xdr, 287static __be32 decode_sessionid(struct xdr_stream *xdr,
224 struct nfs4_sessionid *sid) 288 struct nfs4_sessionid *sid)
225{ 289{
@@ -574,10 +638,10 @@ preprocess_nfs41_op(int nop, unsigned int op_nr, struct callback_op **op)
574 case OP_CB_SEQUENCE: 638 case OP_CB_SEQUENCE:
575 case OP_CB_RECALL_ANY: 639 case OP_CB_RECALL_ANY:
576 case OP_CB_RECALL_SLOT: 640 case OP_CB_RECALL_SLOT:
641 case OP_CB_LAYOUTRECALL:
577 *op = &callback_ops[op_nr]; 642 *op = &callback_ops[op_nr];
578 break; 643 break;
579 644
580 case OP_CB_LAYOUTRECALL:
581 case OP_CB_NOTIFY_DEVICEID: 645 case OP_CB_NOTIFY_DEVICEID:
582 case OP_CB_NOTIFY: 646 case OP_CB_NOTIFY:
583 case OP_CB_PUSH_DELEG: 647 case OP_CB_PUSH_DELEG:
@@ -593,6 +657,37 @@ preprocess_nfs41_op(int nop, unsigned int op_nr, struct callback_op **op)
593 return htonl(NFS_OK); 657 return htonl(NFS_OK);
594} 658}
595 659
660static void nfs4_callback_free_slot(struct nfs4_session *session)
661{
662 struct nfs4_slot_table *tbl = &session->bc_slot_table;
663
664 spin_lock(&tbl->slot_tbl_lock);
665 /*
666 * Let the state manager know callback processing done.
667 * A single slot, so highest used slotid is either 0 or -1
668 */
669 tbl->highest_used_slotid--;
670 nfs4_check_drain_bc_complete(session);
671 spin_unlock(&tbl->slot_tbl_lock);
672}
673
674static void nfs4_cb_free_slot(struct nfs_client *clp)
675{
676 if (clp && clp->cl_session)
677 nfs4_callback_free_slot(clp->cl_session);
678}
679
680/* A single slot, so highest used slotid is either 0 or -1 */
681void nfs4_cb_take_slot(struct nfs_client *clp)
682{
683 struct nfs4_slot_table *tbl = &clp->cl_session->bc_slot_table;
684
685 spin_lock(&tbl->slot_tbl_lock);
686 tbl->highest_used_slotid++;
687 BUG_ON(tbl->highest_used_slotid != 0);
688 spin_unlock(&tbl->slot_tbl_lock);
689}
690
596#else /* CONFIG_NFS_V4_1 */ 691#else /* CONFIG_NFS_V4_1 */
597 692
598static __be32 693static __be32
@@ -601,6 +696,9 @@ preprocess_nfs41_op(int nop, unsigned int op_nr, struct callback_op **op)
601 return htonl(NFS4ERR_MINOR_VERS_MISMATCH); 696 return htonl(NFS4ERR_MINOR_VERS_MISMATCH);
602} 697}
603 698
699static void nfs4_cb_free_slot(struct nfs_client *clp)
700{
701}
604#endif /* CONFIG_NFS_V4_1 */ 702#endif /* CONFIG_NFS_V4_1 */
605 703
606static __be32 704static __be32
@@ -621,7 +719,8 @@ preprocess_nfs4_op(unsigned int op_nr, struct callback_op **op)
621static __be32 process_op(uint32_t minorversion, int nop, 719static __be32 process_op(uint32_t minorversion, int nop,
622 struct svc_rqst *rqstp, 720 struct svc_rqst *rqstp,
623 struct xdr_stream *xdr_in, void *argp, 721 struct xdr_stream *xdr_in, void *argp,
624 struct xdr_stream *xdr_out, void *resp, int* drc_status) 722 struct xdr_stream *xdr_out, void *resp,
723 struct cb_process_state *cps)
625{ 724{
626 struct callback_op *op = &callback_ops[0]; 725 struct callback_op *op = &callback_ops[0];
627 unsigned int op_nr; 726 unsigned int op_nr;
@@ -644,8 +743,8 @@ static __be32 process_op(uint32_t minorversion, int nop,
644 if (status) 743 if (status)
645 goto encode_hdr; 744 goto encode_hdr;
646 745
647 if (*drc_status) { 746 if (cps->drc_status) {
648 status = *drc_status; 747 status = cps->drc_status;
649 goto encode_hdr; 748 goto encode_hdr;
650 } 749 }
651 750
@@ -653,16 +752,10 @@ static __be32 process_op(uint32_t minorversion, int nop,
653 if (maxlen > 0 && maxlen < PAGE_SIZE) { 752 if (maxlen > 0 && maxlen < PAGE_SIZE) {
654 status = op->decode_args(rqstp, xdr_in, argp); 753 status = op->decode_args(rqstp, xdr_in, argp);
655 if (likely(status == 0)) 754 if (likely(status == 0))
656 status = op->process_op(argp, resp); 755 status = op->process_op(argp, resp, cps);
657 } else 756 } else
658 status = htonl(NFS4ERR_RESOURCE); 757 status = htonl(NFS4ERR_RESOURCE);
659 758
660 /* Only set by OP_CB_SEQUENCE processing */
661 if (status == htonl(NFS4ERR_RETRY_UNCACHED_REP)) {
662 *drc_status = status;
663 status = 0;
664 }
665
666encode_hdr: 759encode_hdr:
667 res = encode_op_hdr(xdr_out, op_nr, status); 760 res = encode_op_hdr(xdr_out, op_nr, status);
668 if (unlikely(res)) 761 if (unlikely(res))
@@ -681,8 +774,11 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r
681 struct cb_compound_hdr_arg hdr_arg = { 0 }; 774 struct cb_compound_hdr_arg hdr_arg = { 0 };
682 struct cb_compound_hdr_res hdr_res = { NULL }; 775 struct cb_compound_hdr_res hdr_res = { NULL };
683 struct xdr_stream xdr_in, xdr_out; 776 struct xdr_stream xdr_in, xdr_out;
684 __be32 *p; 777 __be32 *p, status;
685 __be32 status, drc_status = 0; 778 struct cb_process_state cps = {
779 .drc_status = 0,
780 .clp = NULL,
781 };
686 unsigned int nops = 0; 782 unsigned int nops = 0;
687 783
688 dprintk("%s: start\n", __func__); 784 dprintk("%s: start\n", __func__);
@@ -696,6 +792,12 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r
696 if (status == __constant_htonl(NFS4ERR_RESOURCE)) 792 if (status == __constant_htonl(NFS4ERR_RESOURCE))
697 return rpc_garbage_args; 793 return rpc_garbage_args;
698 794
795 if (hdr_arg.minorversion == 0) {
796 cps.clp = nfs4_find_client_ident(hdr_arg.cb_ident);
797 if (!cps.clp || !check_gss_callback_principal(cps.clp, rqstp))
798 return rpc_drop_reply;
799 }
800
699 hdr_res.taglen = hdr_arg.taglen; 801 hdr_res.taglen = hdr_arg.taglen;
700 hdr_res.tag = hdr_arg.tag; 802 hdr_res.tag = hdr_arg.tag;
701 if (encode_compound_hdr_res(&xdr_out, &hdr_res) != 0) 803 if (encode_compound_hdr_res(&xdr_out, &hdr_res) != 0)
@@ -703,7 +805,7 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r
703 805
704 while (status == 0 && nops != hdr_arg.nops) { 806 while (status == 0 && nops != hdr_arg.nops) {
705 status = process_op(hdr_arg.minorversion, nops, rqstp, 807 status = process_op(hdr_arg.minorversion, nops, rqstp,
706 &xdr_in, argp, &xdr_out, resp, &drc_status); 808 &xdr_in, argp, &xdr_out, resp, &cps);
707 nops++; 809 nops++;
708 } 810 }
709 811
@@ -716,6 +818,8 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r
716 818
717 *hdr_res.status = status; 819 *hdr_res.status = status;
718 *hdr_res.nops = htonl(nops); 820 *hdr_res.nops = htonl(nops);
821 nfs4_cb_free_slot(cps.clp);
822 nfs_put_client(cps.clp);
719 dprintk("%s: done, status = %u\n", __func__, ntohl(status)); 823 dprintk("%s: done, status = %u\n", __func__, ntohl(status));
720 return rpc_success; 824 return rpc_success;
721} 825}
@@ -739,6 +843,12 @@ static struct callback_op callback_ops[] = {
739 .res_maxsize = CB_OP_RECALL_RES_MAXSZ, 843 .res_maxsize = CB_OP_RECALL_RES_MAXSZ,
740 }, 844 },
741#if defined(CONFIG_NFS_V4_1) 845#if defined(CONFIG_NFS_V4_1)
846 [OP_CB_LAYOUTRECALL] = {
847 .process_op = (callback_process_op_t)nfs4_callback_layoutrecall,
848 .decode_args =
849 (callback_decode_arg_t)decode_layoutrecall_args,
850 .res_maxsize = CB_OP_LAYOUTRECALL_RES_MAXSZ,
851 },
742 [OP_CB_SEQUENCE] = { 852 [OP_CB_SEQUENCE] = {
743 .process_op = (callback_process_op_t)nfs4_callback_sequence, 853 .process_op = (callback_process_op_t)nfs4_callback_sequence,
744 .decode_args = (callback_decode_arg_t)decode_cb_sequence_args, 854 .decode_args = (callback_decode_arg_t)decode_cb_sequence_args,
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 0870d0d4efc..bd3ca32879e 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -56,6 +56,30 @@ static DEFINE_SPINLOCK(nfs_client_lock);
56static LIST_HEAD(nfs_client_list); 56static LIST_HEAD(nfs_client_list);
57static LIST_HEAD(nfs_volume_list); 57static LIST_HEAD(nfs_volume_list);
58static DECLARE_WAIT_QUEUE_HEAD(nfs_client_active_wq); 58static DECLARE_WAIT_QUEUE_HEAD(nfs_client_active_wq);
59#ifdef CONFIG_NFS_V4
60static DEFINE_IDR(cb_ident_idr); /* Protected by nfs_client_lock */
61
62/*
63 * Get a unique NFSv4.0 callback identifier which will be used
64 * by the V4.0 callback service to lookup the nfs_client struct
65 */
66static int nfs_get_cb_ident_idr(struct nfs_client *clp, int minorversion)
67{
68 int ret = 0;
69
70 if (clp->rpc_ops->version != 4 || minorversion != 0)
71 return ret;
72retry:
73 if (!idr_pre_get(&cb_ident_idr, GFP_KERNEL))
74 return -ENOMEM;
75 spin_lock(&nfs_client_lock);
76 ret = idr_get_new(&cb_ident_idr, clp, &clp->cl_cb_ident);
77 spin_unlock(&nfs_client_lock);
78 if (ret == -EAGAIN)
79 goto retry;
80 return ret;
81}
82#endif /* CONFIG_NFS_V4 */
59 83
60/* 84/*
61 * RPC cruft for NFS 85 * RPC cruft for NFS
@@ -144,7 +168,10 @@ static struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_
144 clp->cl_proto = cl_init->proto; 168 clp->cl_proto = cl_init->proto;
145 169
146#ifdef CONFIG_NFS_V4 170#ifdef CONFIG_NFS_V4
147 INIT_LIST_HEAD(&clp->cl_delegations); 171 err = nfs_get_cb_ident_idr(clp, cl_init->minorversion);
172 if (err)
173 goto error_cleanup;
174
148 spin_lock_init(&clp->cl_lock); 175 spin_lock_init(&clp->cl_lock);
149 INIT_DELAYED_WORK(&clp->cl_renewd, nfs4_renew_state); 176 INIT_DELAYED_WORK(&clp->cl_renewd, nfs4_renew_state);
150 rpc_init_wait_queue(&clp->cl_rpcwaitq, "NFS client"); 177 rpc_init_wait_queue(&clp->cl_rpcwaitq, "NFS client");
@@ -170,21 +197,17 @@ error_0:
170} 197}
171 198
172#ifdef CONFIG_NFS_V4 199#ifdef CONFIG_NFS_V4
173/*
174 * Clears/puts all minor version specific parts from an nfs_client struct
175 * reverting it to minorversion 0.
176 */
177static void nfs4_clear_client_minor_version(struct nfs_client *clp)
178{
179#ifdef CONFIG_NFS_V4_1 200#ifdef CONFIG_NFS_V4_1
180 if (nfs4_has_session(clp)) { 201static void nfs4_shutdown_session(struct nfs_client *clp)
202{
203 if (nfs4_has_session(clp))
181 nfs4_destroy_session(clp->cl_session); 204 nfs4_destroy_session(clp->cl_session);
182 clp->cl_session = NULL;
183 }
184
185 clp->cl_mvops = nfs_v4_minor_ops[0];
186#endif /* CONFIG_NFS_V4_1 */
187} 205}
206#else /* CONFIG_NFS_V4_1 */
207static void nfs4_shutdown_session(struct nfs_client *clp)
208{
209}
210#endif /* CONFIG_NFS_V4_1 */
188 211
189/* 212/*
190 * Destroy the NFS4 callback service 213 * Destroy the NFS4 callback service
@@ -199,17 +222,49 @@ static void nfs4_shutdown_client(struct nfs_client *clp)
199{ 222{
200 if (__test_and_clear_bit(NFS_CS_RENEWD, &clp->cl_res_state)) 223 if (__test_and_clear_bit(NFS_CS_RENEWD, &clp->cl_res_state))
201 nfs4_kill_renewd(clp); 224 nfs4_kill_renewd(clp);
202 nfs4_clear_client_minor_version(clp); 225 nfs4_shutdown_session(clp);
203 nfs4_destroy_callback(clp); 226 nfs4_destroy_callback(clp);
204 if (__test_and_clear_bit(NFS_CS_IDMAP, &clp->cl_res_state)) 227 if (__test_and_clear_bit(NFS_CS_IDMAP, &clp->cl_res_state))
205 nfs_idmap_delete(clp); 228 nfs_idmap_delete(clp);
206 229
207 rpc_destroy_wait_queue(&clp->cl_rpcwaitq); 230 rpc_destroy_wait_queue(&clp->cl_rpcwaitq);
208} 231}
232
233/* idr_remove_all is not needed as all id's are removed by nfs_put_client */
234void nfs_cleanup_cb_ident_idr(void)
235{
236 idr_destroy(&cb_ident_idr);
237}
238
239/* nfs_client_lock held */
240static void nfs_cb_idr_remove_locked(struct nfs_client *clp)
241{
242 if (clp->cl_cb_ident)
243 idr_remove(&cb_ident_idr, clp->cl_cb_ident);
244}
245
246static void pnfs_init_server(struct nfs_server *server)
247{
248 rpc_init_wait_queue(&server->roc_rpcwaitq, "pNFS ROC");
249}
250
209#else 251#else
210static void nfs4_shutdown_client(struct nfs_client *clp) 252static void nfs4_shutdown_client(struct nfs_client *clp)
211{ 253{
212} 254}
255
256void nfs_cleanup_cb_ident_idr(void)
257{
258}
259
260static void nfs_cb_idr_remove_locked(struct nfs_client *clp)
261{
262}
263
264static void pnfs_init_server(struct nfs_server *server)
265{
266}
267
213#endif /* CONFIG_NFS_V4 */ 268#endif /* CONFIG_NFS_V4 */
214 269
215/* 270/*
@@ -248,6 +303,7 @@ void nfs_put_client(struct nfs_client *clp)
248 303
249 if (atomic_dec_and_lock(&clp->cl_count, &nfs_client_lock)) { 304 if (atomic_dec_and_lock(&clp->cl_count, &nfs_client_lock)) {
250 list_del(&clp->cl_share_link); 305 list_del(&clp->cl_share_link);
306 nfs_cb_idr_remove_locked(clp);
251 spin_unlock(&nfs_client_lock); 307 spin_unlock(&nfs_client_lock);
252 308
253 BUG_ON(!list_empty(&clp->cl_superblocks)); 309 BUG_ON(!list_empty(&clp->cl_superblocks));
@@ -363,70 +419,28 @@ static int nfs_sockaddr_cmp(const struct sockaddr *sa1,
363 return 0; 419 return 0;
364} 420}
365 421
366/* 422/* Common match routine for v4.0 and v4.1 callback services */
367 * Find a client by IP address and protocol version 423bool
368 * - returns NULL if no such client 424nfs4_cb_match_client(const struct sockaddr *addr, struct nfs_client *clp,
369 */ 425 u32 minorversion)
370struct nfs_client *nfs_find_client(const struct sockaddr *addr, u32 nfsversion)
371{ 426{
372 struct nfs_client *clp; 427 struct sockaddr *clap = (struct sockaddr *)&clp->cl_addr;
373 428
374 spin_lock(&nfs_client_lock); 429 /* Don't match clients that failed to initialise */
375 list_for_each_entry(clp, &nfs_client_list, cl_share_link) { 430 if (!(clp->cl_cons_state == NFS_CS_READY ||
376 struct sockaddr *clap = (struct sockaddr *)&clp->cl_addr; 431 clp->cl_cons_state == NFS_CS_SESSION_INITING))
432 return false;
377 433
378 /* Don't match clients that failed to initialise properly */ 434 /* Match the version and minorversion */
379 if (!(clp->cl_cons_state == NFS_CS_READY || 435 if (clp->rpc_ops->version != 4 ||
380 clp->cl_cons_state == NFS_CS_SESSION_INITING)) 436 clp->cl_minorversion != minorversion)
381 continue; 437 return false;
382 438
383 /* Different NFS versions cannot share the same nfs_client */ 439 /* Match only the IP address, not the port number */
384 if (clp->rpc_ops->version != nfsversion) 440 if (!nfs_sockaddr_match_ipaddr(addr, clap))
385 continue; 441 return false;
386 442
387 /* Match only the IP address, not the port number */ 443 return true;
388 if (!nfs_sockaddr_match_ipaddr(addr, clap))
389 continue;
390
391 atomic_inc(&clp->cl_count);
392 spin_unlock(&nfs_client_lock);
393 return clp;
394 }
395 spin_unlock(&nfs_client_lock);
396 return NULL;
397}
398
399/*
400 * Find a client by IP address and protocol version
401 * - returns NULL if no such client
402 */
403struct nfs_client *nfs_find_client_next(struct nfs_client *clp)
404{
405 struct sockaddr *sap = (struct sockaddr *)&clp->cl_addr;
406 u32 nfsvers = clp->rpc_ops->version;
407
408 spin_lock(&nfs_client_lock);
409 list_for_each_entry_continue(clp, &nfs_client_list, cl_share_link) {
410 struct sockaddr *clap = (struct sockaddr *)&clp->cl_addr;
411
412 /* Don't match clients that failed to initialise properly */
413 if (clp->cl_cons_state != NFS_CS_READY)
414 continue;
415
416 /* Different NFS versions cannot share the same nfs_client */
417 if (clp->rpc_ops->version != nfsvers)
418 continue;
419
420 /* Match only the IP address, not the port number */
421 if (!nfs_sockaddr_match_ipaddr(sap, clap))
422 continue;
423
424 atomic_inc(&clp->cl_count);
425 spin_unlock(&nfs_client_lock);
426 return clp;
427 }
428 spin_unlock(&nfs_client_lock);
429 return NULL;
430} 444}
431 445
432/* 446/*
@@ -988,6 +1002,27 @@ static void nfs_server_copy_userdata(struct nfs_server *target, struct nfs_serve
988 target->options = source->options; 1002 target->options = source->options;
989} 1003}
990 1004
1005static void nfs_server_insert_lists(struct nfs_server *server)
1006{
1007 struct nfs_client *clp = server->nfs_client;
1008
1009 spin_lock(&nfs_client_lock);
1010 list_add_tail_rcu(&server->client_link, &clp->cl_superblocks);
1011 list_add_tail(&server->master_link, &nfs_volume_list);
1012 spin_unlock(&nfs_client_lock);
1013
1014}
1015
1016static void nfs_server_remove_lists(struct nfs_server *server)
1017{
1018 spin_lock(&nfs_client_lock);
1019 list_del_rcu(&server->client_link);
1020 list_del(&server->master_link);
1021 spin_unlock(&nfs_client_lock);
1022
1023 synchronize_rcu();
1024}
1025
991/* 1026/*
992 * Allocate and initialise a server record 1027 * Allocate and initialise a server record
993 */ 1028 */
@@ -1004,6 +1039,7 @@ static struct nfs_server *nfs_alloc_server(void)
1004 /* Zero out the NFS state stuff */ 1039 /* Zero out the NFS state stuff */
1005 INIT_LIST_HEAD(&server->client_link); 1040 INIT_LIST_HEAD(&server->client_link);
1006 INIT_LIST_HEAD(&server->master_link); 1041 INIT_LIST_HEAD(&server->master_link);
1042 INIT_LIST_HEAD(&server->delegations);
1007 1043
1008 atomic_set(&server->active, 0); 1044 atomic_set(&server->active, 0);
1009 1045
@@ -1019,6 +1055,8 @@ static struct nfs_server *nfs_alloc_server(void)
1019 return NULL; 1055 return NULL;
1020 } 1056 }
1021 1057
1058 pnfs_init_server(server);
1059
1022 return server; 1060 return server;
1023} 1061}
1024 1062
@@ -1029,11 +1067,8 @@ void nfs_free_server(struct nfs_server *server)
1029{ 1067{
1030 dprintk("--> nfs_free_server()\n"); 1068 dprintk("--> nfs_free_server()\n");
1031 1069
1070 nfs_server_remove_lists(server);
1032 unset_pnfs_layoutdriver(server); 1071 unset_pnfs_layoutdriver(server);
1033 spin_lock(&nfs_client_lock);
1034 list_del(&server->client_link);
1035 list_del(&server->master_link);
1036 spin_unlock(&nfs_client_lock);
1037 1072
1038 if (server->destroy != NULL) 1073 if (server->destroy != NULL)
1039 server->destroy(server); 1074 server->destroy(server);
@@ -1108,11 +1143,7 @@ struct nfs_server *nfs_create_server(const struct nfs_parsed_mount_data *data,
1108 (unsigned long long) server->fsid.major, 1143 (unsigned long long) server->fsid.major,
1109 (unsigned long long) server->fsid.minor); 1144 (unsigned long long) server->fsid.minor);
1110 1145
1111 spin_lock(&nfs_client_lock); 1146 nfs_server_insert_lists(server);
1112 list_add_tail(&server->client_link, &server->nfs_client->cl_superblocks);
1113 list_add_tail(&server->master_link, &nfs_volume_list);
1114 spin_unlock(&nfs_client_lock);
1115
1116 server->mount_time = jiffies; 1147 server->mount_time = jiffies;
1117 nfs_free_fattr(fattr); 1148 nfs_free_fattr(fattr);
1118 return server; 1149 return server;
@@ -1125,6 +1156,96 @@ error:
1125 1156
1126#ifdef CONFIG_NFS_V4 1157#ifdef CONFIG_NFS_V4
1127/* 1158/*
1159 * NFSv4.0 callback thread helper
1160 *
1161 * Find a client by IP address, protocol version, and minorversion
1162 *
1163 * Called from the pg_authenticate method. The callback identifier
1164 * is not used as it has not been decoded.
1165 *
1166 * Returns NULL if no such client
1167 */
1168struct nfs_client *
1169nfs4_find_client_no_ident(const struct sockaddr *addr)
1170{
1171 struct nfs_client *clp;
1172
1173 spin_lock(&nfs_client_lock);
1174 list_for_each_entry(clp, &nfs_client_list, cl_share_link) {
1175 if (nfs4_cb_match_client(addr, clp, 0) == false)
1176 continue;
1177 atomic_inc(&clp->cl_count);
1178 spin_unlock(&nfs_client_lock);
1179 return clp;
1180 }
1181 spin_unlock(&nfs_client_lock);
1182 return NULL;
1183}
1184
1185/*
1186 * NFSv4.0 callback thread helper
1187 *
1188 * Find a client by callback identifier
1189 */
1190struct nfs_client *
1191nfs4_find_client_ident(int cb_ident)
1192{
1193 struct nfs_client *clp;
1194
1195 spin_lock(&nfs_client_lock);
1196 clp = idr_find(&cb_ident_idr, cb_ident);
1197 if (clp)
1198 atomic_inc(&clp->cl_count);
1199 spin_unlock(&nfs_client_lock);
1200 return clp;
1201}
1202
1203#if defined(CONFIG_NFS_V4_1)
1204/*
1205 * NFSv4.1 callback thread helper
1206 * For CB_COMPOUND calls, find a client by IP address, protocol version,
1207 * minorversion, and sessionID
1208 *
1209 * Returns NULL if no such client
1210 */
1211struct nfs_client *
1212nfs4_find_client_sessionid(const struct sockaddr *addr,
1213 struct nfs4_sessionid *sid)
1214{
1215 struct nfs_client *clp;
1216
1217 spin_lock(&nfs_client_lock);
1218 list_for_each_entry(clp, &nfs_client_list, cl_share_link) {
1219 if (nfs4_cb_match_client(addr, clp, 1) == false)
1220 continue;
1221
1222 if (!nfs4_has_session(clp))
1223 continue;
1224
1225 /* Match sessionid*/
1226 if (memcmp(clp->cl_session->sess_id.data,
1227 sid->data, NFS4_MAX_SESSIONID_LEN) != 0)
1228 continue;
1229
1230 atomic_inc(&clp->cl_count);
1231 spin_unlock(&nfs_client_lock);
1232 return clp;
1233 }
1234 spin_unlock(&nfs_client_lock);
1235 return NULL;
1236}
1237
1238#else /* CONFIG_NFS_V4_1 */
1239
1240struct nfs_client *
1241nfs4_find_client_sessionid(const struct sockaddr *addr,
1242 struct nfs4_sessionid *sid)
1243{
1244 return NULL;
1245}
1246#endif /* CONFIG_NFS_V4_1 */
1247
1248/*
1128 * Initialize the NFS4 callback service 1249 * Initialize the NFS4 callback service
1129 */ 1250 */
1130static int nfs4_init_callback(struct nfs_client *clp) 1251static int nfs4_init_callback(struct nfs_client *clp)
@@ -1342,11 +1463,7 @@ static int nfs4_server_common_setup(struct nfs_server *server,
1342 if (server->namelen == 0 || server->namelen > NFS4_MAXNAMLEN) 1463 if (server->namelen == 0 || server->namelen > NFS4_MAXNAMLEN)
1343 server->namelen = NFS4_MAXNAMLEN; 1464 server->namelen = NFS4_MAXNAMLEN;
1344 1465
1345 spin_lock(&nfs_client_lock); 1466 nfs_server_insert_lists(server);
1346 list_add_tail(&server->client_link, &server->nfs_client->cl_superblocks);
1347 list_add_tail(&server->master_link, &nfs_volume_list);
1348 spin_unlock(&nfs_client_lock);
1349
1350 server->mount_time = jiffies; 1467 server->mount_time = jiffies;
1351out: 1468out:
1352 nfs_free_fattr(fattr); 1469 nfs_free_fattr(fattr);
@@ -1551,11 +1668,7 @@ struct nfs_server *nfs_clone_server(struct nfs_server *source,
1551 if (error < 0) 1668 if (error < 0)
1552 goto out_free_server; 1669 goto out_free_server;
1553 1670
1554 spin_lock(&nfs_client_lock); 1671 nfs_server_insert_lists(server);
1555 list_add_tail(&server->client_link, &server->nfs_client->cl_superblocks);
1556 list_add_tail(&server->master_link, &nfs_volume_list);
1557 spin_unlock(&nfs_client_lock);
1558
1559 server->mount_time = jiffies; 1672 server->mount_time = jiffies;
1560 1673
1561 nfs_free_fattr(fattr_fsinfo); 1674 nfs_free_fattr(fattr_fsinfo);
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 232a7eead33..bbbc6bf5cb2 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -11,7 +11,6 @@
11#include <linux/module.h> 11#include <linux/module.h>
12#include <linux/sched.h> 12#include <linux/sched.h>
13#include <linux/slab.h> 13#include <linux/slab.h>
14#include <linux/smp_lock.h>
15#include <linux/spinlock.h> 14#include <linux/spinlock.h>
16 15
17#include <linux/nfs4.h> 16#include <linux/nfs4.h>
@@ -24,8 +23,6 @@
24 23
25static void nfs_do_free_delegation(struct nfs_delegation *delegation) 24static void nfs_do_free_delegation(struct nfs_delegation *delegation)
26{ 25{
27 if (delegation->cred)
28 put_rpccred(delegation->cred);
29 kfree(delegation); 26 kfree(delegation);
30} 27}
31 28
@@ -38,14 +35,30 @@ static void nfs_free_delegation_callback(struct rcu_head *head)
38 35
39static void nfs_free_delegation(struct nfs_delegation *delegation) 36static void nfs_free_delegation(struct nfs_delegation *delegation)
40{ 37{
38 if (delegation->cred) {
39 put_rpccred(delegation->cred);
40 delegation->cred = NULL;
41 }
41 call_rcu(&delegation->rcu, nfs_free_delegation_callback); 42 call_rcu(&delegation->rcu, nfs_free_delegation_callback);
42} 43}
43 44
45/**
46 * nfs_mark_delegation_referenced - set delegation's REFERENCED flag
47 * @delegation: delegation to process
48 *
49 */
44void nfs_mark_delegation_referenced(struct nfs_delegation *delegation) 50void nfs_mark_delegation_referenced(struct nfs_delegation *delegation)
45{ 51{
46 set_bit(NFS_DELEGATION_REFERENCED, &delegation->flags); 52 set_bit(NFS_DELEGATION_REFERENCED, &delegation->flags);
47} 53}
48 54
55/**
56 * nfs_have_delegation - check if inode has a delegation
57 * @inode: inode to check
58 * @flags: delegation types to check for
59 *
60 * Returns one if inode has the indicated delegation, otherwise zero.
61 */
49int nfs_have_delegation(struct inode *inode, fmode_t flags) 62int nfs_have_delegation(struct inode *inode, fmode_t flags)
50{ 63{
51 struct nfs_delegation *delegation; 64 struct nfs_delegation *delegation;
@@ -120,10 +133,15 @@ again:
120 return 0; 133 return 0;
121} 134}
122 135
123/* 136/**
124 * Set up a delegation on an inode 137 * nfs_inode_reclaim_delegation - process a delegation reclaim request
138 * @inode: inode to process
139 * @cred: credential to use for request
140 * @res: new delegation state from server
141 *
125 */ 142 */
126void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res) 143void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred,
144 struct nfs_openres *res)
127{ 145{
128 struct nfs_delegation *delegation; 146 struct nfs_delegation *delegation;
129 struct rpc_cred *oldcred = NULL; 147 struct rpc_cred *oldcred = NULL;
@@ -176,38 +194,52 @@ static struct inode *nfs_delegation_grab_inode(struct nfs_delegation *delegation
176 return inode; 194 return inode;
177} 195}
178 196
179static struct nfs_delegation *nfs_detach_delegation_locked(struct nfs_inode *nfsi, 197static struct nfs_delegation *
180 const nfs4_stateid *stateid, 198nfs_detach_delegation_locked(struct nfs_inode *nfsi,
181 struct nfs_client *clp) 199 struct nfs_server *server)
182{ 200{
183 struct nfs_delegation *delegation = 201 struct nfs_delegation *delegation =
184 rcu_dereference_protected(nfsi->delegation, 202 rcu_dereference_protected(nfsi->delegation,
185 lockdep_is_held(&clp->cl_lock)); 203 lockdep_is_held(&server->nfs_client->cl_lock));
186 204
187 if (delegation == NULL) 205 if (delegation == NULL)
188 goto nomatch; 206 goto nomatch;
207
189 spin_lock(&delegation->lock); 208 spin_lock(&delegation->lock);
190 if (stateid != NULL && memcmp(delegation->stateid.data, stateid->data,
191 sizeof(delegation->stateid.data)) != 0)
192 goto nomatch_unlock;
193 list_del_rcu(&delegation->super_list); 209 list_del_rcu(&delegation->super_list);
194 delegation->inode = NULL; 210 delegation->inode = NULL;
195 nfsi->delegation_state = 0; 211 nfsi->delegation_state = 0;
196 rcu_assign_pointer(nfsi->delegation, NULL); 212 rcu_assign_pointer(nfsi->delegation, NULL);
197 spin_unlock(&delegation->lock); 213 spin_unlock(&delegation->lock);
198 return delegation; 214 return delegation;
199nomatch_unlock:
200 spin_unlock(&delegation->lock);
201nomatch: 215nomatch:
202 return NULL; 216 return NULL;
203} 217}
204 218
205/* 219static struct nfs_delegation *nfs_detach_delegation(struct nfs_inode *nfsi,
206 * Set up a delegation on an inode 220 struct nfs_server *server)
221{
222 struct nfs_client *clp = server->nfs_client;
223 struct nfs_delegation *delegation;
224
225 spin_lock(&clp->cl_lock);
226 delegation = nfs_detach_delegation_locked(nfsi, server);
227 spin_unlock(&clp->cl_lock);
228 return delegation;
229}
230
231/**
232 * nfs_inode_set_delegation - set up a delegation on an inode
233 * @inode: inode to which delegation applies
234 * @cred: cred to use for subsequent delegation processing
235 * @res: new delegation state from server
236 *
237 * Returns zero on success, or a negative errno value.
207 */ 238 */
208int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res) 239int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res)
209{ 240{
210 struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; 241 struct nfs_server *server = NFS_SERVER(inode);
242 struct nfs_client *clp = server->nfs_client;
211 struct nfs_inode *nfsi = NFS_I(inode); 243 struct nfs_inode *nfsi = NFS_I(inode);
212 struct nfs_delegation *delegation, *old_delegation; 244 struct nfs_delegation *delegation, *old_delegation;
213 struct nfs_delegation *freeme = NULL; 245 struct nfs_delegation *freeme = NULL;
@@ -228,7 +260,7 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct
228 260
229 spin_lock(&clp->cl_lock); 261 spin_lock(&clp->cl_lock);
230 old_delegation = rcu_dereference_protected(nfsi->delegation, 262 old_delegation = rcu_dereference_protected(nfsi->delegation,
231 lockdep_is_held(&clp->cl_lock)); 263 lockdep_is_held(&clp->cl_lock));
232 if (old_delegation != NULL) { 264 if (old_delegation != NULL) {
233 if (memcmp(&delegation->stateid, &old_delegation->stateid, 265 if (memcmp(&delegation->stateid, &old_delegation->stateid,
234 sizeof(old_delegation->stateid)) == 0 && 266 sizeof(old_delegation->stateid)) == 0 &&
@@ -247,9 +279,9 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct
247 delegation = NULL; 279 delegation = NULL;
248 goto out; 280 goto out;
249 } 281 }
250 freeme = nfs_detach_delegation_locked(nfsi, NULL, clp); 282 freeme = nfs_detach_delegation_locked(nfsi, server);
251 } 283 }
252 list_add_rcu(&delegation->super_list, &clp->cl_delegations); 284 list_add_rcu(&delegation->super_list, &server->delegations);
253 nfsi->delegation_state = delegation->type; 285 nfsi->delegation_state = delegation->type;
254 rcu_assign_pointer(nfsi->delegation, delegation); 286 rcu_assign_pointer(nfsi->delegation, delegation);
255 delegation = NULL; 287 delegation = NULL;
@@ -291,73 +323,85 @@ out:
291 return err; 323 return err;
292} 324}
293 325
294/* 326/**
295 * Return all delegations that have been marked for return 327 * nfs_client_return_marked_delegations - return previously marked delegations
328 * @clp: nfs_client to process
329 *
330 * Returns zero on success, or a negative errno value.
296 */ 331 */
297int nfs_client_return_marked_delegations(struct nfs_client *clp) 332int nfs_client_return_marked_delegations(struct nfs_client *clp)
298{ 333{
299 struct nfs_delegation *delegation; 334 struct nfs_delegation *delegation;
335 struct nfs_server *server;
300 struct inode *inode; 336 struct inode *inode;
301 int err = 0; 337 int err = 0;
302 338
303restart: 339restart:
304 rcu_read_lock(); 340 rcu_read_lock();
305 list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) { 341 list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
306 if (!test_and_clear_bit(NFS_DELEGATION_RETURN, &delegation->flags)) 342 list_for_each_entry_rcu(delegation, &server->delegations,
307 continue; 343 super_list) {
308 inode = nfs_delegation_grab_inode(delegation); 344 if (!test_and_clear_bit(NFS_DELEGATION_RETURN,
309 if (inode == NULL) 345 &delegation->flags))
310 continue; 346 continue;
311 spin_lock(&clp->cl_lock); 347 inode = nfs_delegation_grab_inode(delegation);
312 delegation = nfs_detach_delegation_locked(NFS_I(inode), NULL, clp); 348 if (inode == NULL)
313 spin_unlock(&clp->cl_lock); 349 continue;
314 rcu_read_unlock(); 350 delegation = nfs_detach_delegation(NFS_I(inode),
315 if (delegation != NULL) { 351 server);
316 filemap_flush(inode->i_mapping); 352 rcu_read_unlock();
317 err = __nfs_inode_return_delegation(inode, delegation, 0); 353
354 if (delegation != NULL) {
355 filemap_flush(inode->i_mapping);
356 err = __nfs_inode_return_delegation(inode,
357 delegation, 0);
358 }
359 iput(inode);
360 if (!err)
361 goto restart;
362 set_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state);
363 return err;
318 } 364 }
319 iput(inode);
320 if (!err)
321 goto restart;
322 set_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state);
323 return err;
324 } 365 }
325 rcu_read_unlock(); 366 rcu_read_unlock();
326 return 0; 367 return 0;
327} 368}
328 369
329/* 370/**
330 * This function returns the delegation without reclaiming opens 371 * nfs_inode_return_delegation_noreclaim - return delegation, don't reclaim opens
331 * or protecting against delegation reclaims. 372 * @inode: inode to process
332 * It is therefore really only safe to be called from 373 *
333 * nfs4_clear_inode() 374 * Does not protect against delegation reclaims, therefore really only safe
375 * to be called from nfs4_clear_inode().
334 */ 376 */
335void nfs_inode_return_delegation_noreclaim(struct inode *inode) 377void nfs_inode_return_delegation_noreclaim(struct inode *inode)
336{ 378{
337 struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; 379 struct nfs_server *server = NFS_SERVER(inode);
338 struct nfs_inode *nfsi = NFS_I(inode); 380 struct nfs_inode *nfsi = NFS_I(inode);
339 struct nfs_delegation *delegation; 381 struct nfs_delegation *delegation;
340 382
341 if (rcu_access_pointer(nfsi->delegation) != NULL) { 383 if (rcu_access_pointer(nfsi->delegation) != NULL) {
342 spin_lock(&clp->cl_lock); 384 delegation = nfs_detach_delegation(nfsi, server);
343 delegation = nfs_detach_delegation_locked(nfsi, NULL, clp);
344 spin_unlock(&clp->cl_lock);
345 if (delegation != NULL) 385 if (delegation != NULL)
346 nfs_do_return_delegation(inode, delegation, 0); 386 nfs_do_return_delegation(inode, delegation, 0);
347 } 387 }
348} 388}
349 389
390/**
391 * nfs_inode_return_delegation - synchronously return a delegation
392 * @inode: inode to process
393 *
394 * Returns zero on success, or a negative errno value.
395 */
350int nfs_inode_return_delegation(struct inode *inode) 396int nfs_inode_return_delegation(struct inode *inode)
351{ 397{
352 struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; 398 struct nfs_server *server = NFS_SERVER(inode);
353 struct nfs_inode *nfsi = NFS_I(inode); 399 struct nfs_inode *nfsi = NFS_I(inode);
354 struct nfs_delegation *delegation; 400 struct nfs_delegation *delegation;
355 int err = 0; 401 int err = 0;
356 402
357 if (rcu_access_pointer(nfsi->delegation) != NULL) { 403 if (rcu_access_pointer(nfsi->delegation) != NULL) {
358 spin_lock(&clp->cl_lock); 404 delegation = nfs_detach_delegation(nfsi, server);
359 delegation = nfs_detach_delegation_locked(nfsi, NULL, clp);
360 spin_unlock(&clp->cl_lock);
361 if (delegation != NULL) { 405 if (delegation != NULL) {
362 nfs_wb_all(inode); 406 nfs_wb_all(inode);
363 err = __nfs_inode_return_delegation(inode, delegation, 1); 407 err = __nfs_inode_return_delegation(inode, delegation, 1);
@@ -366,46 +410,61 @@ int nfs_inode_return_delegation(struct inode *inode)
366 return err; 410 return err;
367} 411}
368 412
369static void nfs_mark_return_delegation(struct nfs_client *clp, struct nfs_delegation *delegation) 413static void nfs_mark_return_delegation(struct nfs_delegation *delegation)
370{ 414{
415 struct nfs_client *clp = NFS_SERVER(delegation->inode)->nfs_client;
416
371 set_bit(NFS_DELEGATION_RETURN, &delegation->flags); 417 set_bit(NFS_DELEGATION_RETURN, &delegation->flags);
372 set_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state); 418 set_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state);
373} 419}
374 420
375/* 421/**
376 * Return all delegations associated to a super block 422 * nfs_super_return_all_delegations - return delegations for one superblock
423 * @sb: sb to process
424 *
377 */ 425 */
378void nfs_super_return_all_delegations(struct super_block *sb) 426void nfs_super_return_all_delegations(struct super_block *sb)
379{ 427{
380 struct nfs_client *clp = NFS_SB(sb)->nfs_client; 428 struct nfs_server *server = NFS_SB(sb);
429 struct nfs_client *clp = server->nfs_client;
381 struct nfs_delegation *delegation; 430 struct nfs_delegation *delegation;
382 431
383 if (clp == NULL) 432 if (clp == NULL)
384 return; 433 return;
434
385 rcu_read_lock(); 435 rcu_read_lock();
386 list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) { 436 list_for_each_entry_rcu(delegation, &server->delegations, super_list) {
387 spin_lock(&delegation->lock); 437 spin_lock(&delegation->lock);
388 if (delegation->inode != NULL && delegation->inode->i_sb == sb) 438 set_bit(NFS_DELEGATION_RETURN, &delegation->flags);
389 set_bit(NFS_DELEGATION_RETURN, &delegation->flags);
390 spin_unlock(&delegation->lock); 439 spin_unlock(&delegation->lock);
391 } 440 }
392 rcu_read_unlock(); 441 rcu_read_unlock();
442
393 if (nfs_client_return_marked_delegations(clp) != 0) 443 if (nfs_client_return_marked_delegations(clp) != 0)
394 nfs4_schedule_state_manager(clp); 444 nfs4_schedule_state_manager(clp);
395} 445}
396 446
397static 447static void nfs_mark_return_all_delegation_types(struct nfs_server *server,
398void nfs_client_mark_return_all_delegation_types(struct nfs_client *clp, fmode_t flags) 448 fmode_t flags)
399{ 449{
400 struct nfs_delegation *delegation; 450 struct nfs_delegation *delegation;
401 451
402 rcu_read_lock(); 452 list_for_each_entry_rcu(delegation, &server->delegations, super_list) {
403 list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) {
404 if ((delegation->type == (FMODE_READ|FMODE_WRITE)) && !(flags & FMODE_WRITE)) 453 if ((delegation->type == (FMODE_READ|FMODE_WRITE)) && !(flags & FMODE_WRITE))
405 continue; 454 continue;
406 if (delegation->type & flags) 455 if (delegation->type & flags)
407 nfs_mark_return_delegation(clp, delegation); 456 nfs_mark_return_delegation(delegation);
408 } 457 }
458}
459
460static void nfs_client_mark_return_all_delegation_types(struct nfs_client *clp,
461 fmode_t flags)
462{
463 struct nfs_server *server;
464
465 rcu_read_lock();
466 list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link)
467 nfs_mark_return_all_delegation_types(server, flags);
409 rcu_read_unlock(); 468 rcu_read_unlock();
410} 469}
411 470
@@ -420,19 +479,32 @@ static void nfs_delegation_run_state_manager(struct nfs_client *clp)
420 nfs4_schedule_state_manager(clp); 479 nfs4_schedule_state_manager(clp);
421} 480}
422 481
482/**
483 * nfs_expire_all_delegation_types
484 * @clp: client to process
485 * @flags: delegation types to expire
486 *
487 */
423void nfs_expire_all_delegation_types(struct nfs_client *clp, fmode_t flags) 488void nfs_expire_all_delegation_types(struct nfs_client *clp, fmode_t flags)
424{ 489{
425 nfs_client_mark_return_all_delegation_types(clp, flags); 490 nfs_client_mark_return_all_delegation_types(clp, flags);
426 nfs_delegation_run_state_manager(clp); 491 nfs_delegation_run_state_manager(clp);
427} 492}
428 493
494/**
495 * nfs_expire_all_delegations
496 * @clp: client to process
497 *
498 */
429void nfs_expire_all_delegations(struct nfs_client *clp) 499void nfs_expire_all_delegations(struct nfs_client *clp)
430{ 500{
431 nfs_expire_all_delegation_types(clp, FMODE_READ|FMODE_WRITE); 501 nfs_expire_all_delegation_types(clp, FMODE_READ|FMODE_WRITE);
432} 502}
433 503
434/* 504/**
435 * Return all delegations following an NFS4ERR_CB_PATH_DOWN error. 505 * nfs_handle_cb_pathdown - return all delegations after NFS4ERR_CB_PATH_DOWN
506 * @clp: client to process
507 *
436 */ 508 */
437void nfs_handle_cb_pathdown(struct nfs_client *clp) 509void nfs_handle_cb_pathdown(struct nfs_client *clp)
438{ 510{
@@ -441,29 +513,43 @@ void nfs_handle_cb_pathdown(struct nfs_client *clp)
441 nfs_client_mark_return_all_delegations(clp); 513 nfs_client_mark_return_all_delegations(clp);
442} 514}
443 515
444static void nfs_client_mark_return_unreferenced_delegations(struct nfs_client *clp) 516static void nfs_mark_return_unreferenced_delegations(struct nfs_server *server)
445{ 517{
446 struct nfs_delegation *delegation; 518 struct nfs_delegation *delegation;
447 519
448 rcu_read_lock(); 520 list_for_each_entry_rcu(delegation, &server->delegations, super_list) {
449 list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) {
450 if (test_and_clear_bit(NFS_DELEGATION_REFERENCED, &delegation->flags)) 521 if (test_and_clear_bit(NFS_DELEGATION_REFERENCED, &delegation->flags))
451 continue; 522 continue;
452 nfs_mark_return_delegation(clp, delegation); 523 nfs_mark_return_delegation(delegation);
453 } 524 }
454 rcu_read_unlock();
455} 525}
456 526
527/**
528 * nfs_expire_unreferenced_delegations - Eliminate unused delegations
529 * @clp: nfs_client to process
530 *
531 */
457void nfs_expire_unreferenced_delegations(struct nfs_client *clp) 532void nfs_expire_unreferenced_delegations(struct nfs_client *clp)
458{ 533{
459 nfs_client_mark_return_unreferenced_delegations(clp); 534 struct nfs_server *server;
535
536 rcu_read_lock();
537 list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link)
538 nfs_mark_return_unreferenced_delegations(server);
539 rcu_read_unlock();
540
460 nfs_delegation_run_state_manager(clp); 541 nfs_delegation_run_state_manager(clp);
461} 542}
462 543
463/* 544/**
464 * Asynchronous delegation recall! 545 * nfs_async_inode_return_delegation - asynchronously return a delegation
546 * @inode: inode to process
547 * @stateid: state ID information from CB_RECALL arguments
548 *
549 * Returns zero on success, or a negative errno value.
465 */ 550 */
466int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid) 551int nfs_async_inode_return_delegation(struct inode *inode,
552 const nfs4_stateid *stateid)
467{ 553{
468 struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; 554 struct nfs_client *clp = NFS_SERVER(inode)->nfs_client;
469 struct nfs_delegation *delegation; 555 struct nfs_delegation *delegation;
@@ -475,22 +561,21 @@ int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *s
475 rcu_read_unlock(); 561 rcu_read_unlock();
476 return -ENOENT; 562 return -ENOENT;
477 } 563 }
478 564 nfs_mark_return_delegation(delegation);
479 nfs_mark_return_delegation(clp, delegation);
480 rcu_read_unlock(); 565 rcu_read_unlock();
566
481 nfs_delegation_run_state_manager(clp); 567 nfs_delegation_run_state_manager(clp);
482 return 0; 568 return 0;
483} 569}
484 570
485/* 571static struct inode *
486 * Retrieve the inode associated with a delegation 572nfs_delegation_find_inode_server(struct nfs_server *server,
487 */ 573 const struct nfs_fh *fhandle)
488struct inode *nfs_delegation_find_inode(struct nfs_client *clp, const struct nfs_fh *fhandle)
489{ 574{
490 struct nfs_delegation *delegation; 575 struct nfs_delegation *delegation;
491 struct inode *res = NULL; 576 struct inode *res = NULL;
492 rcu_read_lock(); 577
493 list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) { 578 list_for_each_entry_rcu(delegation, &server->delegations, super_list) {
494 spin_lock(&delegation->lock); 579 spin_lock(&delegation->lock);
495 if (delegation->inode != NULL && 580 if (delegation->inode != NULL &&
496 nfs_compare_fh(fhandle, &NFS_I(delegation->inode)->fh) == 0) { 581 nfs_compare_fh(fhandle, &NFS_I(delegation->inode)->fh) == 0) {
@@ -500,49 +585,121 @@ struct inode *nfs_delegation_find_inode(struct nfs_client *clp, const struct nfs
500 if (res != NULL) 585 if (res != NULL)
501 break; 586 break;
502 } 587 }
588 return res;
589}
590
591/**
592 * nfs_delegation_find_inode - retrieve the inode associated with a delegation
593 * @clp: client state handle
594 * @fhandle: filehandle from a delegation recall
595 *
596 * Returns pointer to inode matching "fhandle," or NULL if a matching inode
597 * cannot be found.
598 */
599struct inode *nfs_delegation_find_inode(struct nfs_client *clp,
600 const struct nfs_fh *fhandle)
601{
602 struct nfs_server *server;
603 struct inode *res = NULL;
604
605 rcu_read_lock();
606 list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
607 res = nfs_delegation_find_inode_server(server, fhandle);
608 if (res != NULL)
609 break;
610 }
503 rcu_read_unlock(); 611 rcu_read_unlock();
504 return res; 612 return res;
505} 613}
506 614
507/* 615static void nfs_delegation_mark_reclaim_server(struct nfs_server *server)
508 * Mark all delegations as needing to be reclaimed 616{
617 struct nfs_delegation *delegation;
618
619 list_for_each_entry_rcu(delegation, &server->delegations, super_list)
620 set_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags);
621}
622
623/**
624 * nfs_delegation_mark_reclaim - mark all delegations as needing to be reclaimed
625 * @clp: nfs_client to process
626 *
509 */ 627 */
510void nfs_delegation_mark_reclaim(struct nfs_client *clp) 628void nfs_delegation_mark_reclaim(struct nfs_client *clp)
511{ 629{
512 struct nfs_delegation *delegation; 630 struct nfs_server *server;
631
513 rcu_read_lock(); 632 rcu_read_lock();
514 list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) 633 list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link)
515 set_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags); 634 nfs_delegation_mark_reclaim_server(server);
516 rcu_read_unlock(); 635 rcu_read_unlock();
517} 636}
518 637
519/* 638/**
520 * Reap all unclaimed delegations after reboot recovery is done 639 * nfs_delegation_reap_unclaimed - reap unclaimed delegations after reboot recovery is done
640 * @clp: nfs_client to process
641 *
521 */ 642 */
522void nfs_delegation_reap_unclaimed(struct nfs_client *clp) 643void nfs_delegation_reap_unclaimed(struct nfs_client *clp)
523{ 644{
524 struct nfs_delegation *delegation; 645 struct nfs_delegation *delegation;
646 struct nfs_server *server;
525 struct inode *inode; 647 struct inode *inode;
648
526restart: 649restart:
527 rcu_read_lock(); 650 rcu_read_lock();
528 list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) { 651 list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
529 if (test_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags) == 0) 652 list_for_each_entry_rcu(delegation, &server->delegations,
530 continue; 653 super_list) {
531 inode = nfs_delegation_grab_inode(delegation); 654 if (test_bit(NFS_DELEGATION_NEED_RECLAIM,
532 if (inode == NULL) 655 &delegation->flags) == 0)
533 continue; 656 continue;
534 spin_lock(&clp->cl_lock); 657 inode = nfs_delegation_grab_inode(delegation);
535 delegation = nfs_detach_delegation_locked(NFS_I(inode), NULL, clp); 658 if (inode == NULL)
536 spin_unlock(&clp->cl_lock); 659 continue;
537 rcu_read_unlock(); 660 delegation = nfs_detach_delegation(NFS_I(inode),
538 if (delegation != NULL) 661 server);
539 nfs_free_delegation(delegation); 662 rcu_read_unlock();
540 iput(inode); 663
541 goto restart; 664 if (delegation != NULL)
665 nfs_free_delegation(delegation);
666 iput(inode);
667 goto restart;
668 }
542 } 669 }
543 rcu_read_unlock(); 670 rcu_read_unlock();
544} 671}
545 672
673/**
674 * nfs_delegations_present - check for existence of delegations
675 * @clp: client state handle
676 *
677 * Returns one if there are any nfs_delegation structures attached
678 * to this nfs_client.
679 */
680int nfs_delegations_present(struct nfs_client *clp)
681{
682 struct nfs_server *server;
683 int ret = 0;
684
685 rcu_read_lock();
686 list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link)
687 if (!list_empty(&server->delegations)) {
688 ret = 1;
689 break;
690 }
691 rcu_read_unlock();
692 return ret;
693}
694
695/**
696 * nfs4_copy_delegation_stateid - Copy inode's state ID information
697 * @dst: stateid data structure to fill in
698 * @inode: inode to check
699 *
700 * Returns one and fills in "dst->data" * if inode had a delegation,
701 * otherwise zero is returned.
702 */
546int nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode) 703int nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode)
547{ 704{
548 struct nfs_inode *nfsi = NFS_I(inode); 705 struct nfs_inode *nfsi = NFS_I(inode);
diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h
index 2026304bda1..d9322e490c5 100644
--- a/fs/nfs/delegation.h
+++ b/fs/nfs/delegation.h
@@ -44,6 +44,7 @@ void nfs_expire_all_delegation_types(struct nfs_client *clp, fmode_t flags);
44void nfs_expire_unreferenced_delegations(struct nfs_client *clp); 44void nfs_expire_unreferenced_delegations(struct nfs_client *clp);
45void nfs_handle_cb_pathdown(struct nfs_client *clp); 45void nfs_handle_cb_pathdown(struct nfs_client *clp);
46int nfs_client_return_marked_delegations(struct nfs_client *clp); 46int nfs_client_return_marked_delegations(struct nfs_client *clp);
47int nfs_delegations_present(struct nfs_client *clp);
47 48
48void nfs_delegation_mark_reclaim(struct nfs_client *clp); 49void nfs_delegation_mark_reclaim(struct nfs_client *clp);
49void nfs_delegation_reap_unclaimed(struct nfs_client *clp); 50void nfs_delegation_reap_unclaimed(struct nfs_client *clp);
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 07ac3847e56..2c3eb33b904 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -33,7 +33,8 @@
33#include <linux/namei.h> 33#include <linux/namei.h>
34#include <linux/mount.h> 34#include <linux/mount.h>
35#include <linux/sched.h> 35#include <linux/sched.h>
36#include <linux/vmalloc.h> 36#include <linux/kmemleak.h>
37#include <linux/xattr.h>
37 38
38#include "delegation.h" 39#include "delegation.h"
39#include "iostat.h" 40#include "iostat.h"
@@ -56,7 +57,7 @@ static int nfs_rename(struct inode *, struct dentry *,
56 struct inode *, struct dentry *); 57 struct inode *, struct dentry *);
57static int nfs_fsync_dir(struct file *, int); 58static int nfs_fsync_dir(struct file *, int);
58static loff_t nfs_llseek_dir(struct file *, loff_t, int); 59static loff_t nfs_llseek_dir(struct file *, loff_t, int);
59static int nfs_readdir_clear_array(struct page*, gfp_t); 60static void nfs_readdir_clear_array(struct page*);
60 61
61const struct file_operations nfs_dir_operations = { 62const struct file_operations nfs_dir_operations = {
62 .llseek = nfs_llseek_dir, 63 .llseek = nfs_llseek_dir,
@@ -82,8 +83,8 @@ const struct inode_operations nfs_dir_inode_operations = {
82 .setattr = nfs_setattr, 83 .setattr = nfs_setattr,
83}; 84};
84 85
85const struct address_space_operations nfs_dir_addr_space_ops = { 86const struct address_space_operations nfs_dir_aops = {
86 .releasepage = nfs_readdir_clear_array, 87 .freepage = nfs_readdir_clear_array,
87}; 88};
88 89
89#ifdef CONFIG_NFS_V3 90#ifdef CONFIG_NFS_V3
@@ -124,9 +125,10 @@ const struct inode_operations nfs4_dir_inode_operations = {
124 .permission = nfs_permission, 125 .permission = nfs_permission,
125 .getattr = nfs_getattr, 126 .getattr = nfs_getattr,
126 .setattr = nfs_setattr, 127 .setattr = nfs_setattr,
127 .getxattr = nfs4_getxattr, 128 .getxattr = generic_getxattr,
128 .setxattr = nfs4_setxattr, 129 .setxattr = generic_setxattr,
129 .listxattr = nfs4_listxattr, 130 .listxattr = generic_listxattr,
131 .removexattr = generic_removexattr,
130}; 132};
131 133
132#endif /* CONFIG_NFS_V4 */ 134#endif /* CONFIG_NFS_V4 */
@@ -161,6 +163,7 @@ struct nfs_cache_array_entry {
161 u64 cookie; 163 u64 cookie;
162 u64 ino; 164 u64 ino;
163 struct qstr string; 165 struct qstr string;
166 unsigned char d_type;
164}; 167};
165 168
166struct nfs_cache_array { 169struct nfs_cache_array {
@@ -170,14 +173,13 @@ struct nfs_cache_array {
170 struct nfs_cache_array_entry array[0]; 173 struct nfs_cache_array_entry array[0];
171}; 174};
172 175
173#define MAX_READDIR_ARRAY ((PAGE_SIZE - sizeof(struct nfs_cache_array)) / sizeof(struct nfs_cache_array_entry)) 176typedef int (*decode_dirent_t)(struct xdr_stream *, struct nfs_entry *, int);
174
175typedef __be32 * (*decode_dirent_t)(struct xdr_stream *, struct nfs_entry *, struct nfs_server *, int);
176typedef struct { 177typedef struct {
177 struct file *file; 178 struct file *file;
178 struct page *page; 179 struct page *page;
179 unsigned long page_index; 180 unsigned long page_index;
180 u64 *dir_cookie; 181 u64 *dir_cookie;
182 u64 last_cookie;
181 loff_t current_index; 183 loff_t current_index;
182 decode_dirent_t decode; 184 decode_dirent_t decode;
183 185
@@ -194,9 +196,13 @@ typedef struct {
194static 196static
195struct nfs_cache_array *nfs_readdir_get_array(struct page *page) 197struct nfs_cache_array *nfs_readdir_get_array(struct page *page)
196{ 198{
199 void *ptr;
197 if (page == NULL) 200 if (page == NULL)
198 return ERR_PTR(-EIO); 201 return ERR_PTR(-EIO);
199 return (struct nfs_cache_array *)kmap(page); 202 ptr = kmap(page);
203 if (ptr == NULL)
204 return ERR_PTR(-ENOMEM);
205 return ptr;
200} 206}
201 207
202static 208static
@@ -209,14 +215,15 @@ void nfs_readdir_release_array(struct page *page)
209 * we are freeing strings created by nfs_add_to_readdir_array() 215 * we are freeing strings created by nfs_add_to_readdir_array()
210 */ 216 */
211static 217static
212int nfs_readdir_clear_array(struct page *page, gfp_t mask) 218void nfs_readdir_clear_array(struct page *page)
213{ 219{
214 struct nfs_cache_array *array = nfs_readdir_get_array(page); 220 struct nfs_cache_array *array;
215 int i; 221 int i;
222
223 array = kmap_atomic(page, KM_USER0);
216 for (i = 0; i < array->size; i++) 224 for (i = 0; i < array->size; i++)
217 kfree(array->array[i].string.name); 225 kfree(array->array[i].string.name);
218 nfs_readdir_release_array(page); 226 kunmap_atomic(array, KM_USER0);
219 return 0;
220} 227}
221 228
222/* 229/*
@@ -231,6 +238,11 @@ int nfs_readdir_make_qstr(struct qstr *string, const char *name, unsigned int le
231 string->name = kmemdup(name, len, GFP_KERNEL); 238 string->name = kmemdup(name, len, GFP_KERNEL);
232 if (string->name == NULL) 239 if (string->name == NULL)
233 return -ENOMEM; 240 return -ENOMEM;
241 /*
242 * Avoid a kmemleak false positive. The pointer to the name is stored
243 * in a page cache page which kmemleak does not scan.
244 */
245 kmemleak_not_leak(string->name);
234 string->hash = full_name_hash(name, len); 246 string->hash = full_name_hash(name, len);
235 return 0; 247 return 0;
236} 248}
@@ -244,20 +256,24 @@ int nfs_readdir_add_to_array(struct nfs_entry *entry, struct page *page)
244 256
245 if (IS_ERR(array)) 257 if (IS_ERR(array))
246 return PTR_ERR(array); 258 return PTR_ERR(array);
247 ret = -EIO;
248 if (array->size >= MAX_READDIR_ARRAY)
249 goto out;
250 259
251 cache_entry = &array->array[array->size]; 260 cache_entry = &array->array[array->size];
261
262 /* Check that this entry lies within the page bounds */
263 ret = -ENOSPC;
264 if ((char *)&cache_entry[1] - (char *)page_address(page) > PAGE_SIZE)
265 goto out;
266
252 cache_entry->cookie = entry->prev_cookie; 267 cache_entry->cookie = entry->prev_cookie;
253 cache_entry->ino = entry->ino; 268 cache_entry->ino = entry->ino;
269 cache_entry->d_type = entry->d_type;
254 ret = nfs_readdir_make_qstr(&cache_entry->string, entry->name, entry->len); 270 ret = nfs_readdir_make_qstr(&cache_entry->string, entry->name, entry->len);
255 if (ret) 271 if (ret)
256 goto out; 272 goto out;
257 array->last_cookie = entry->cookie; 273 array->last_cookie = entry->cookie;
258 if (entry->eof == 1)
259 array->eof_index = array->size;
260 array->size++; 274 array->size++;
275 if (entry->eof != 0)
276 array->eof_index = array->size;
261out: 277out:
262 nfs_readdir_release_array(page); 278 nfs_readdir_release_array(page);
263 return ret; 279 return ret;
@@ -272,7 +288,7 @@ int nfs_readdir_search_for_pos(struct nfs_cache_array *array, nfs_readdir_descri
272 if (diff < 0) 288 if (diff < 0)
273 goto out_eof; 289 goto out_eof;
274 if (diff >= array->size) { 290 if (diff >= array->size) {
275 if (array->eof_index > 0) 291 if (array->eof_index >= 0)
276 goto out_eof; 292 goto out_eof;
277 desc->current_index += array->size; 293 desc->current_index += array->size;
278 return -EAGAIN; 294 return -EAGAIN;
@@ -281,8 +297,6 @@ int nfs_readdir_search_for_pos(struct nfs_cache_array *array, nfs_readdir_descri
281 index = (unsigned int)diff; 297 index = (unsigned int)diff;
282 *desc->dir_cookie = array->array[index].cookie; 298 *desc->dir_cookie = array->array[index].cookie;
283 desc->cache_entry_index = index; 299 desc->cache_entry_index = index;
284 if (index == array->eof_index)
285 desc->eof = 1;
286 return 0; 300 return 0;
287out_eof: 301out_eof:
288 desc->eof = 1; 302 desc->eof = 1;
@@ -296,17 +310,16 @@ int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_des
296 int status = -EAGAIN; 310 int status = -EAGAIN;
297 311
298 for (i = 0; i < array->size; i++) { 312 for (i = 0; i < array->size; i++) {
299 if (i == array->eof_index) {
300 desc->eof = 1;
301 status = -EBADCOOKIE;
302 }
303 if (array->array[i].cookie == *desc->dir_cookie) { 313 if (array->array[i].cookie == *desc->dir_cookie) {
304 desc->cache_entry_index = i; 314 desc->cache_entry_index = i;
305 status = 0; 315 return 0;
306 break;
307 } 316 }
308 } 317 }
309 318 if (array->eof_index >= 0) {
319 status = -EBADCOOKIE;
320 if (*desc->dir_cookie == array->last_cookie)
321 desc->eof = 1;
322 }
310 return status; 323 return status;
311} 324}
312 325
@@ -314,10 +327,7 @@ static
314int nfs_readdir_search_array(nfs_readdir_descriptor_t *desc) 327int nfs_readdir_search_array(nfs_readdir_descriptor_t *desc)
315{ 328{
316 struct nfs_cache_array *array; 329 struct nfs_cache_array *array;
317 int status = -EBADCOOKIE; 330 int status;
318
319 if (desc->dir_cookie == NULL)
320 goto out;
321 331
322 array = nfs_readdir_get_array(desc->page); 332 array = nfs_readdir_get_array(desc->page);
323 if (IS_ERR(array)) { 333 if (IS_ERR(array)) {
@@ -330,6 +340,10 @@ int nfs_readdir_search_array(nfs_readdir_descriptor_t *desc)
330 else 340 else
331 status = nfs_readdir_search_for_cookie(array, desc); 341 status = nfs_readdir_search_for_cookie(array, desc);
332 342
343 if (status == -EAGAIN) {
344 desc->last_cookie = array->last_cookie;
345 desc->page_index++;
346 }
333 nfs_readdir_release_array(desc->page); 347 nfs_readdir_release_array(desc->page);
334out: 348out:
335 return status; 349 return status;
@@ -365,14 +379,14 @@ error:
365 return error; 379 return error;
366} 380}
367 381
368/* Fill in an entry based on the xdr code stored in desc->page */ 382static int xdr_decode(nfs_readdir_descriptor_t *desc,
369static 383 struct nfs_entry *entry, struct xdr_stream *xdr)
370int xdr_decode(nfs_readdir_descriptor_t *desc, struct nfs_entry *entry, struct xdr_stream *stream)
371{ 384{
372 __be32 *p = desc->decode(stream, entry, NFS_SERVER(desc->file->f_path.dentry->d_inode), desc->plus); 385 int error;
373 if (IS_ERR(p))
374 return PTR_ERR(p);
375 386
387 error = desc->decode(xdr, entry, desc->plus);
388 if (error)
389 return error;
376 entry->fattr->time_start = desc->timestamp; 390 entry->fattr->time_start = desc->timestamp;
377 entry->fattr->gencount = desc->gencount; 391 entry->fattr->gencount = desc->gencount;
378 return 0; 392 return 0;
@@ -381,13 +395,9 @@ int xdr_decode(nfs_readdir_descriptor_t *desc, struct nfs_entry *entry, struct x
381static 395static
382int nfs_same_file(struct dentry *dentry, struct nfs_entry *entry) 396int nfs_same_file(struct dentry *dentry, struct nfs_entry *entry)
383{ 397{
384 struct nfs_inode *node;
385 if (dentry->d_inode == NULL) 398 if (dentry->d_inode == NULL)
386 goto different; 399 goto different;
387 node = NFS_I(dentry->d_inode); 400 if (nfs_compare_fh(entry->fh, NFS_FH(dentry->d_inode)) != 0)
388 if (node->fh.size != entry->fh->size)
389 goto different;
390 if (strncmp(node->fh.data, entry->fh->data, node->fh.size) != 0)
391 goto different; 401 goto different;
392 return 1; 402 return 1;
393different: 403different:
@@ -429,7 +439,6 @@ void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry)
429 if (dentry == NULL) 439 if (dentry == NULL)
430 return; 440 return;
431 441
432 dentry->d_op = NFS_PROTO(dir)->dentry_ops;
433 inode = nfs_fhget(dentry->d_sb, entry->fh, entry->fattr); 442 inode = nfs_fhget(dentry->d_sb, entry->fh, entry->fattr);
434 if (IS_ERR(inode)) 443 if (IS_ERR(inode))
435 goto out; 444 goto out;
@@ -449,43 +458,58 @@ out:
449 458
450/* Perform conversion from xdr to cache array */ 459/* Perform conversion from xdr to cache array */
451static 460static
452void nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *entry, 461int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *entry,
453 void *xdr_page, struct page *page, unsigned int buflen) 462 struct page **xdr_pages, struct page *page, unsigned int buflen)
454{ 463{
455 struct xdr_stream stream; 464 struct xdr_stream stream;
456 struct xdr_buf buf; 465 struct xdr_buf buf = {
457 __be32 *ptr = xdr_page; 466 .pages = xdr_pages,
458 int status; 467 .page_len = buflen,
468 .buflen = buflen,
469 .len = buflen,
470 };
471 struct page *scratch;
459 struct nfs_cache_array *array; 472 struct nfs_cache_array *array;
473 unsigned int count = 0;
474 int status;
460 475
461 buf.head->iov_base = xdr_page; 476 scratch = alloc_page(GFP_KERNEL);
462 buf.head->iov_len = buflen; 477 if (scratch == NULL)
463 buf.tail->iov_len = 0; 478 return -ENOMEM;
464 buf.page_base = 0;
465 buf.page_len = 0;
466 buf.buflen = buf.head->iov_len;
467 buf.len = buf.head->iov_len;
468
469 xdr_init_decode(&stream, &buf, ptr);
470 479
480 xdr_init_decode(&stream, &buf, NULL);
481 xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
471 482
472 do { 483 do {
473 status = xdr_decode(desc, entry, &stream); 484 status = xdr_decode(desc, entry, &stream);
474 if (status != 0) 485 if (status != 0) {
486 if (status == -EAGAIN)
487 status = 0;
475 break; 488 break;
489 }
476 490
477 if (nfs_readdir_add_to_array(entry, page) == -1) 491 count++;
478 break; 492
479 if (desc->plus == 1) 493 if (desc->plus != 0)
480 nfs_prime_dcache(desc->file->f_path.dentry, entry); 494 nfs_prime_dcache(desc->file->f_path.dentry, entry);
495
496 status = nfs_readdir_add_to_array(entry, page);
497 if (status != 0)
498 break;
481 } while (!entry->eof); 499 } while (!entry->eof);
482 500
483 if (status == -EBADCOOKIE && entry->eof) { 501 if (count == 0 || (status == -EBADCOOKIE && entry->eof != 0)) {
484 array = nfs_readdir_get_array(page); 502 array = nfs_readdir_get_array(page);
485 array->eof_index = array->size - 1; 503 if (!IS_ERR(array)) {
486 status = 0; 504 array->eof_index = array->size;
487 nfs_readdir_release_array(page); 505 status = 0;
506 nfs_readdir_release_array(page);
507 } else
508 status = PTR_ERR(array);
488 } 509 }
510
511 put_page(scratch);
512 return status;
489} 513}
490 514
491static 515static
@@ -500,7 +524,6 @@ static
500void nfs_readdir_free_large_page(void *ptr, struct page **pages, 524void nfs_readdir_free_large_page(void *ptr, struct page **pages,
501 unsigned int npages) 525 unsigned int npages)
502{ 526{
503 vm_unmap_ram(ptr, npages);
504 nfs_readdir_free_pagearray(pages, npages); 527 nfs_readdir_free_pagearray(pages, npages);
505} 528}
506 529
@@ -509,9 +532,8 @@ void nfs_readdir_free_large_page(void *ptr, struct page **pages,
509 * to nfs_readdir_free_large_page 532 * to nfs_readdir_free_large_page
510 */ 533 */
511static 534static
512void *nfs_readdir_large_page(struct page **pages, unsigned int npages) 535int nfs_readdir_large_page(struct page **pages, unsigned int npages)
513{ 536{
514 void *ptr;
515 unsigned int i; 537 unsigned int i;
516 538
517 for (i = 0; i < npages; i++) { 539 for (i = 0; i < npages; i++) {
@@ -520,13 +542,11 @@ void *nfs_readdir_large_page(struct page **pages, unsigned int npages)
520 goto out_freepages; 542 goto out_freepages;
521 pages[i] = page; 543 pages[i] = page;
522 } 544 }
545 return 0;
523 546
524 ptr = vm_map_ram(pages, npages, 0, PAGE_KERNEL);
525 if (!IS_ERR_OR_NULL(ptr))
526 return ptr;
527out_freepages: 547out_freepages:
528 nfs_readdir_free_pagearray(pages, i); 548 nfs_readdir_free_pagearray(pages, i);
529 return NULL; 549 return -ENOMEM;
530} 550}
531 551
532static 552static
@@ -537,31 +557,43 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page,
537 struct nfs_entry entry; 557 struct nfs_entry entry;
538 struct file *file = desc->file; 558 struct file *file = desc->file;
539 struct nfs_cache_array *array; 559 struct nfs_cache_array *array;
540 int status = 0; 560 int status = -ENOMEM;
541 unsigned int array_size = ARRAY_SIZE(pages); 561 unsigned int array_size = ARRAY_SIZE(pages);
542 562
543 entry.prev_cookie = 0; 563 entry.prev_cookie = 0;
544 entry.cookie = *desc->dir_cookie; 564 entry.cookie = desc->last_cookie;
545 entry.eof = 0; 565 entry.eof = 0;
546 entry.fh = nfs_alloc_fhandle(); 566 entry.fh = nfs_alloc_fhandle();
547 entry.fattr = nfs_alloc_fattr(); 567 entry.fattr = nfs_alloc_fattr();
568 entry.server = NFS_SERVER(inode);
548 if (entry.fh == NULL || entry.fattr == NULL) 569 if (entry.fh == NULL || entry.fattr == NULL)
549 goto out; 570 goto out;
550 571
551 array = nfs_readdir_get_array(page); 572 array = nfs_readdir_get_array(page);
573 if (IS_ERR(array)) {
574 status = PTR_ERR(array);
575 goto out;
576 }
552 memset(array, 0, sizeof(struct nfs_cache_array)); 577 memset(array, 0, sizeof(struct nfs_cache_array));
553 array->eof_index = -1; 578 array->eof_index = -1;
554 579
555 pages_ptr = nfs_readdir_large_page(pages, array_size); 580 status = nfs_readdir_large_page(pages, array_size);
556 if (!pages_ptr) 581 if (status < 0)
557 goto out_release_array; 582 goto out_release_array;
558 do { 583 do {
584 unsigned int pglen;
559 status = nfs_readdir_xdr_filler(pages, desc, &entry, file, inode); 585 status = nfs_readdir_xdr_filler(pages, desc, &entry, file, inode);
560 586
561 if (status < 0) 587 if (status < 0)
562 break; 588 break;
563 nfs_readdir_page_filler(desc, &entry, pages_ptr, page, array_size * PAGE_SIZE); 589 pglen = status;
564 } while (array->eof_index < 0 && array->size < MAX_READDIR_ARRAY); 590 status = nfs_readdir_page_filler(desc, &entry, pages, page, pglen);
591 if (status < 0) {
592 if (status == -ENOSPC)
593 status = 0;
594 break;
595 }
596 } while (array->eof_index < 0);
565 597
566 nfs_readdir_free_large_page(pages_ptr, pages, array_size); 598 nfs_readdir_free_large_page(pages_ptr, pages, array_size);
567out_release_array: 599out_release_array:
@@ -582,8 +614,10 @@ static
582int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page* page) 614int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page* page)
583{ 615{
584 struct inode *inode = desc->file->f_path.dentry->d_inode; 616 struct inode *inode = desc->file->f_path.dentry->d_inode;
617 int ret;
585 618
586 if (nfs_readdir_xdr_to_array(desc, page, inode) < 0) 619 ret = nfs_readdir_xdr_to_array(desc, page, inode);
620 if (ret < 0)
587 goto error; 621 goto error;
588 SetPageUptodate(page); 622 SetPageUptodate(page);
589 623
@@ -595,12 +629,14 @@ int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page* page)
595 return 0; 629 return 0;
596 error: 630 error:
597 unlock_page(page); 631 unlock_page(page);
598 return -EIO; 632 return ret;
599} 633}
600 634
601static 635static
602void cache_page_release(nfs_readdir_descriptor_t *desc) 636void cache_page_release(nfs_readdir_descriptor_t *desc)
603{ 637{
638 if (!desc->page->mapping)
639 nfs_readdir_clear_array(desc->page);
604 page_cache_release(desc->page); 640 page_cache_release(desc->page);
605 desc->page = NULL; 641 desc->page = NULL;
606} 642}
@@ -608,12 +644,8 @@ void cache_page_release(nfs_readdir_descriptor_t *desc)
608static 644static
609struct page *get_cache_page(nfs_readdir_descriptor_t *desc) 645struct page *get_cache_page(nfs_readdir_descriptor_t *desc)
610{ 646{
611 struct page *page; 647 return read_cache_page(desc->file->f_path.dentry->d_inode->i_mapping,
612 page = read_cache_page(desc->file->f_path.dentry->d_inode->i_mapping,
613 desc->page_index, (filler_t *)nfs_readdir_filler, desc); 648 desc->page_index, (filler_t *)nfs_readdir_filler, desc);
614 if (IS_ERR(page))
615 desc->eof = 1;
616 return page;
617} 649}
618 650
619/* 651/*
@@ -629,9 +661,8 @@ int find_cache_page(nfs_readdir_descriptor_t *desc)
629 return PTR_ERR(desc->page); 661 return PTR_ERR(desc->page);
630 662
631 res = nfs_readdir_search_array(desc); 663 res = nfs_readdir_search_array(desc);
632 if (res == 0) 664 if (res != 0)
633 return 0; 665 cache_page_release(desc);
634 cache_page_release(desc);
635 return res; 666 return res;
636} 667}
637 668
@@ -639,22 +670,18 @@ int find_cache_page(nfs_readdir_descriptor_t *desc)
639static inline 670static inline
640int readdir_search_pagecache(nfs_readdir_descriptor_t *desc) 671int readdir_search_pagecache(nfs_readdir_descriptor_t *desc)
641{ 672{
642 int res = -EAGAIN; 673 int res;
643 674
644 while (1) { 675 if (desc->page_index == 0) {
645 res = find_cache_page(desc); 676 desc->current_index = 0;
646 if (res != -EAGAIN) 677 desc->last_cookie = 0;
647 break;
648 desc->page_index++;
649 } 678 }
679 do {
680 res = find_cache_page(desc);
681 } while (res == -EAGAIN);
650 return res; 682 return res;
651} 683}
652 684
653static inline unsigned int dt_type(struct inode *inode)
654{
655 return (inode->i_mode >> 12) & 15;
656}
657
658/* 685/*
659 * Once we've found the start of the dirent within a page: fill 'er up... 686 * Once we've found the start of the dirent within a page: fill 'er up...
660 */ 687 */
@@ -666,35 +693,35 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc, void *dirent,
666 int i = 0; 693 int i = 0;
667 int res = 0; 694 int res = 0;
668 struct nfs_cache_array *array = NULL; 695 struct nfs_cache_array *array = NULL;
669 unsigned int d_type = DT_UNKNOWN;
670 struct dentry *dentry = NULL;
671 696
672 array = nfs_readdir_get_array(desc->page); 697 array = nfs_readdir_get_array(desc->page);
698 if (IS_ERR(array)) {
699 res = PTR_ERR(array);
700 goto out;
701 }
673 702
674 for (i = desc->cache_entry_index; i < array->size; i++) { 703 for (i = desc->cache_entry_index; i < array->size; i++) {
675 d_type = DT_UNKNOWN; 704 struct nfs_cache_array_entry *ent;
676 705
677 res = filldir(dirent, array->array[i].string.name, 706 ent = &array->array[i];
678 array->array[i].string.len, file->f_pos, 707 if (filldir(dirent, ent->string.name, ent->string.len,
679 nfs_compat_user_ino64(array->array[i].ino), d_type); 708 file->f_pos, nfs_compat_user_ino64(ent->ino),
680 if (res < 0) 709 ent->d_type) < 0) {
710 desc->eof = 1;
681 break; 711 break;
712 }
682 file->f_pos++; 713 file->f_pos++;
683 desc->cache_entry_index = i;
684 if (i < (array->size-1)) 714 if (i < (array->size-1))
685 *desc->dir_cookie = array->array[i+1].cookie; 715 *desc->dir_cookie = array->array[i+1].cookie;
686 else 716 else
687 *desc->dir_cookie = array->last_cookie; 717 *desc->dir_cookie = array->last_cookie;
688 if (i == array->eof_index) {
689 desc->eof = 1;
690 break;
691 }
692 } 718 }
719 if (array->eof_index >= 0)
720 desc->eof = 1;
693 721
694 nfs_readdir_release_array(desc->page); 722 nfs_readdir_release_array(desc->page);
723out:
695 cache_page_release(desc); 724 cache_page_release(desc);
696 if (dentry != NULL)
697 dput(dentry);
698 dfprintk(DIRCACHE, "NFS: nfs_do_filldir() filling ended @ cookie %Lu; returning = %d\n", 725 dfprintk(DIRCACHE, "NFS: nfs_do_filldir() filling ended @ cookie %Lu; returning = %d\n",
699 (unsigned long long)*desc->dir_cookie, res); 726 (unsigned long long)*desc->dir_cookie, res);
700 return res; 727 return res;
@@ -729,13 +756,14 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent,
729 goto out; 756 goto out;
730 } 757 }
731 758
732 if (nfs_readdir_xdr_to_array(desc, page, inode) == -1) {
733 status = -EIO;
734 goto out_release;
735 }
736
737 desc->page_index = 0; 759 desc->page_index = 0;
760 desc->last_cookie = *desc->dir_cookie;
738 desc->page = page; 761 desc->page = page;
762
763 status = nfs_readdir_xdr_to_array(desc, page, inode);
764 if (status < 0)
765 goto out_release;
766
739 status = nfs_do_filldir(desc, dirent, filldir); 767 status = nfs_do_filldir(desc, dirent, filldir);
740 768
741 out: 769 out:
@@ -757,7 +785,7 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
757 struct inode *inode = dentry->d_inode; 785 struct inode *inode = dentry->d_inode;
758 nfs_readdir_descriptor_t my_desc, 786 nfs_readdir_descriptor_t my_desc,
759 *desc = &my_desc; 787 *desc = &my_desc;
760 int res = -ENOMEM; 788 int res;
761 789
762 dfprintk(FILE, "NFS: readdir(%s/%s) starting at cookie %llu\n", 790 dfprintk(FILE, "NFS: readdir(%s/%s) starting at cookie %llu\n",
763 dentry->d_parent->d_name.name, dentry->d_name.name, 791 dentry->d_parent->d_name.name, dentry->d_name.name,
@@ -782,18 +810,18 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
782 if (res < 0) 810 if (res < 0)
783 goto out; 811 goto out;
784 812
785 while (desc->eof != 1) { 813 do {
786 res = readdir_search_pagecache(desc); 814 res = readdir_search_pagecache(desc);
787 815
788 if (res == -EBADCOOKIE) { 816 if (res == -EBADCOOKIE) {
817 res = 0;
789 /* This means either end of directory */ 818 /* This means either end of directory */
790 if (*desc->dir_cookie && desc->eof == 0) { 819 if (*desc->dir_cookie && desc->eof == 0) {
791 /* Or that the server has 'lost' a cookie */ 820 /* Or that the server has 'lost' a cookie */
792 res = uncached_readdir(desc, dirent, filldir); 821 res = uncached_readdir(desc, dirent, filldir);
793 if (res >= 0) 822 if (res == 0)
794 continue; 823 continue;
795 } 824 }
796 res = 0;
797 break; 825 break;
798 } 826 }
799 if (res == -ETOOSMALL && desc->plus) { 827 if (res == -ETOOSMALL && desc->plus) {
@@ -808,11 +836,9 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
808 break; 836 break;
809 837
810 res = nfs_do_filldir(desc, dirent, filldir); 838 res = nfs_do_filldir(desc, dirent, filldir);
811 if (res < 0) { 839 if (res < 0)
812 res = 0;
813 break; 840 break;
814 } 841 } while (!desc->eof);
815 }
816out: 842out:
817 nfs_unblock_sillyrename(dentry); 843 nfs_unblock_sillyrename(dentry);
818 if (res > 0) 844 if (res > 0)
@@ -912,7 +938,8 @@ static int nfs_check_verifier(struct inode *dir, struct dentry *dentry)
912 * component of the path. 938 * component of the path.
913 * We check for this using LOOKUP_CONTINUE and LOOKUP_PARENT. 939 * We check for this using LOOKUP_CONTINUE and LOOKUP_PARENT.
914 */ 940 */
915static inline unsigned int nfs_lookup_check_intent(struct nameidata *nd, unsigned int mask) 941static inline unsigned int nfs_lookup_check_intent(struct nameidata *nd,
942 unsigned int mask)
916{ 943{
917 if (nd->flags & (LOOKUP_CONTINUE|LOOKUP_PARENT)) 944 if (nd->flags & (LOOKUP_CONTINUE|LOOKUP_PARENT))
918 return 0; 945 return 0;
@@ -943,7 +970,7 @@ int nfs_lookup_verify_inode(struct inode *inode, struct nameidata *nd)
943{ 970{
944 struct nfs_server *server = NFS_SERVER(inode); 971 struct nfs_server *server = NFS_SERVER(inode);
945 972
946 if (test_bit(NFS_INO_MOUNTPOINT, &NFS_I(inode)->flags)) 973 if (IS_AUTOMOUNT(inode))
947 return 0; 974 return 0;
948 if (nd != NULL) { 975 if (nd != NULL) {
949 /* VFS wants an on-the-wire revalidation */ 976 /* VFS wants an on-the-wire revalidation */
@@ -992,7 +1019,7 @@ int nfs_neg_need_reval(struct inode *dir, struct dentry *dentry,
992 * If the parent directory is seen to have changed, we throw out the 1019 * If the parent directory is seen to have changed, we throw out the
993 * cached dentry and do a new lookup. 1020 * cached dentry and do a new lookup.
994 */ 1021 */
995static int nfs_lookup_revalidate(struct dentry * dentry, struct nameidata *nd) 1022static int nfs_lookup_revalidate(struct dentry *dentry, struct nameidata *nd)
996{ 1023{
997 struct inode *dir; 1024 struct inode *dir;
998 struct inode *inode; 1025 struct inode *inode;
@@ -1001,6 +1028,9 @@ static int nfs_lookup_revalidate(struct dentry * dentry, struct nameidata *nd)
1001 struct nfs_fattr *fattr = NULL; 1028 struct nfs_fattr *fattr = NULL;
1002 int error; 1029 int error;
1003 1030
1031 if (nd->flags & LOOKUP_RCU)
1032 return -ECHILD;
1033
1004 parent = dget_parent(dentry); 1034 parent = dget_parent(dentry);
1005 dir = parent->d_inode; 1035 dir = parent->d_inode;
1006 nfs_inc_stats(dir, NFSIOS_DENTRYREVALIDATE); 1036 nfs_inc_stats(dir, NFSIOS_DENTRYREVALIDATE);
@@ -1091,7 +1121,7 @@ out_error:
1091/* 1121/*
1092 * This is called from dput() when d_count is going to 0. 1122 * This is called from dput() when d_count is going to 0.
1093 */ 1123 */
1094static int nfs_dentry_delete(struct dentry *dentry) 1124static int nfs_dentry_delete(const struct dentry *dentry)
1095{ 1125{
1096 dfprintk(VFS, "NFS: dentry_delete(%s/%s, %x)\n", 1126 dfprintk(VFS, "NFS: dentry_delete(%s/%s, %x)\n",
1097 dentry->d_parent->d_name.name, dentry->d_name.name, 1127 dentry->d_parent->d_name.name, dentry->d_name.name,
@@ -1143,6 +1173,7 @@ const struct dentry_operations nfs_dentry_operations = {
1143 .d_revalidate = nfs_lookup_revalidate, 1173 .d_revalidate = nfs_lookup_revalidate,
1144 .d_delete = nfs_dentry_delete, 1174 .d_delete = nfs_dentry_delete,
1145 .d_iput = nfs_dentry_iput, 1175 .d_iput = nfs_dentry_iput,
1176 .d_automount = nfs_d_automount,
1146}; 1177};
1147 1178
1148static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd) 1179static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd)
@@ -1162,8 +1193,6 @@ static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, stru
1162 if (dentry->d_name.len > NFS_SERVER(dir)->namelen) 1193 if (dentry->d_name.len > NFS_SERVER(dir)->namelen)
1163 goto out; 1194 goto out;
1164 1195
1165 dentry->d_op = NFS_PROTO(dir)->dentry_ops;
1166
1167 /* 1196 /*
1168 * If we're doing an exclusive create, optimize away the lookup 1197 * If we're doing an exclusive create, optimize away the lookup
1169 * but don't hash the dentry. 1198 * but don't hash the dentry.
@@ -1191,7 +1220,7 @@ static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, stru
1191 goto out_unblock_sillyrename; 1220 goto out_unblock_sillyrename;
1192 } 1221 }
1193 inode = nfs_fhget(dentry->d_sb, fhandle, fattr); 1222 inode = nfs_fhget(dentry->d_sb, fhandle, fattr);
1194 res = (struct dentry *)inode; 1223 res = ERR_CAST(inode);
1195 if (IS_ERR(res)) 1224 if (IS_ERR(res))
1196 goto out_unblock_sillyrename; 1225 goto out_unblock_sillyrename;
1197 1226
@@ -1218,6 +1247,7 @@ const struct dentry_operations nfs4_dentry_operations = {
1218 .d_revalidate = nfs_open_revalidate, 1247 .d_revalidate = nfs_open_revalidate,
1219 .d_delete = nfs_dentry_delete, 1248 .d_delete = nfs_dentry_delete,
1220 .d_iput = nfs_dentry_iput, 1249 .d_iput = nfs_dentry_iput,
1250 .d_automount = nfs_d_automount,
1221}; 1251};
1222 1252
1223/* 1253/*
@@ -1307,7 +1337,6 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry
1307 res = ERR_PTR(-ENAMETOOLONG); 1337 res = ERR_PTR(-ENAMETOOLONG);
1308 goto out; 1338 goto out;
1309 } 1339 }
1310 dentry->d_op = NFS_PROTO(dir)->dentry_ops;
1311 1340
1312 /* Let vfs_create() deal with O_EXCL. Instantiate, but don't hash 1341 /* Let vfs_create() deal with O_EXCL. Instantiate, but don't hash
1313 * the dentry. */ 1342 * the dentry. */
@@ -1325,8 +1354,7 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry
1325 if (nd->flags & LOOKUP_CREATE) { 1354 if (nd->flags & LOOKUP_CREATE) {
1326 attr.ia_mode = nd->intent.open.create_mode; 1355 attr.ia_mode = nd->intent.open.create_mode;
1327 attr.ia_valid = ATTR_MODE; 1356 attr.ia_valid = ATTR_MODE;
1328 if (!IS_POSIXACL(dir)) 1357 attr.ia_mode &= ~current_umask();
1329 attr.ia_mode &= ~current_umask();
1330 } else { 1358 } else {
1331 open_flags &= ~(O_EXCL | O_CREAT); 1359 open_flags &= ~(O_EXCL | O_CREAT);
1332 attr.ia_valid = 0; 1360 attr.ia_valid = 0;
@@ -1345,12 +1373,12 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry
1345 res = NULL; 1373 res = NULL;
1346 goto out; 1374 goto out;
1347 /* This turned out not to be a regular file */ 1375 /* This turned out not to be a regular file */
1348 case -EISDIR:
1349 case -ENOTDIR: 1376 case -ENOTDIR:
1350 goto no_open; 1377 goto no_open;
1351 case -ELOOP: 1378 case -ELOOP:
1352 if (!(nd->intent.open.flags & O_NOFOLLOW)) 1379 if (!(nd->intent.open.flags & O_NOFOLLOW))
1353 goto no_open; 1380 goto no_open;
1381 /* case -EISDIR: */
1354 /* case -EINVAL: */ 1382 /* case -EINVAL: */
1355 default: 1383 default:
1356 res = ERR_CAST(inode); 1384 res = ERR_CAST(inode);
@@ -1380,11 +1408,15 @@ no_open:
1380static int nfs_open_revalidate(struct dentry *dentry, struct nameidata *nd) 1408static int nfs_open_revalidate(struct dentry *dentry, struct nameidata *nd)
1381{ 1409{
1382 struct dentry *parent = NULL; 1410 struct dentry *parent = NULL;
1383 struct inode *inode = dentry->d_inode; 1411 struct inode *inode;
1384 struct inode *dir; 1412 struct inode *dir;
1385 struct nfs_open_context *ctx; 1413 struct nfs_open_context *ctx;
1386 int openflags, ret = 0; 1414 int openflags, ret = 0;
1387 1415
1416 if (nd->flags & LOOKUP_RCU)
1417 return -ECHILD;
1418
1419 inode = dentry->d_inode;
1388 if (!is_atomic_open(nd) || d_mountpoint(dentry)) 1420 if (!is_atomic_open(nd) || d_mountpoint(dentry))
1389 goto no_open; 1421 goto no_open;
1390 1422
@@ -1553,6 +1585,7 @@ static int nfs_create(struct inode *dir, struct dentry *dentry, int mode,
1553{ 1585{
1554 struct iattr attr; 1586 struct iattr attr;
1555 int error; 1587 int error;
1588 int open_flags = 0;
1556 1589
1557 dfprintk(VFS, "NFS: create(%s/%ld), %s\n", 1590 dfprintk(VFS, "NFS: create(%s/%ld), %s\n",
1558 dir->i_sb->s_id, dir->i_ino, dentry->d_name.name); 1591 dir->i_sb->s_id, dir->i_ino, dentry->d_name.name);
@@ -1560,7 +1593,10 @@ static int nfs_create(struct inode *dir, struct dentry *dentry, int mode,
1560 attr.ia_mode = mode; 1593 attr.ia_mode = mode;
1561 attr.ia_valid = ATTR_MODE; 1594 attr.ia_valid = ATTR_MODE;
1562 1595
1563 error = NFS_PROTO(dir)->create(dir, dentry, &attr, 0, NULL); 1596 if ((nd->flags & LOOKUP_CREATE) != 0)
1597 open_flags = nd->intent.open.flags;
1598
1599 error = NFS_PROTO(dir)->create(dir, dentry, &attr, open_flags, NULL);
1564 if (error != 0) 1600 if (error != 0)
1565 goto out_err; 1601 goto out_err;
1566 return 0; 1602 return 0;
@@ -1692,11 +1728,9 @@ static int nfs_unlink(struct inode *dir, struct dentry *dentry)
1692 dfprintk(VFS, "NFS: unlink(%s/%ld, %s)\n", dir->i_sb->s_id, 1728 dfprintk(VFS, "NFS: unlink(%s/%ld, %s)\n", dir->i_sb->s_id,
1693 dir->i_ino, dentry->d_name.name); 1729 dir->i_ino, dentry->d_name.name);
1694 1730
1695 spin_lock(&dcache_lock);
1696 spin_lock(&dentry->d_lock); 1731 spin_lock(&dentry->d_lock);
1697 if (atomic_read(&dentry->d_count) > 1) { 1732 if (dentry->d_count > 1) {
1698 spin_unlock(&dentry->d_lock); 1733 spin_unlock(&dentry->d_lock);
1699 spin_unlock(&dcache_lock);
1700 /* Start asynchronous writeout of the inode */ 1734 /* Start asynchronous writeout of the inode */
1701 write_inode_now(dentry->d_inode, 0); 1735 write_inode_now(dentry->d_inode, 0);
1702 error = nfs_sillyrename(dir, dentry); 1736 error = nfs_sillyrename(dir, dentry);
@@ -1707,7 +1741,6 @@ static int nfs_unlink(struct inode *dir, struct dentry *dentry)
1707 need_rehash = 1; 1741 need_rehash = 1;
1708 } 1742 }
1709 spin_unlock(&dentry->d_lock); 1743 spin_unlock(&dentry->d_lock);
1710 spin_unlock(&dcache_lock);
1711 error = nfs_safe_remove(dentry); 1744 error = nfs_safe_remove(dentry);
1712 if (!error || error == -ENOENT) { 1745 if (!error || error == -ENOENT) {
1713 nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); 1746 nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
@@ -1842,7 +1875,7 @@ static int nfs_rename(struct inode *old_dir, struct dentry *old_dentry,
1842 dfprintk(VFS, "NFS: rename(%s/%s -> %s/%s, ct=%d)\n", 1875 dfprintk(VFS, "NFS: rename(%s/%s -> %s/%s, ct=%d)\n",
1843 old_dentry->d_parent->d_name.name, old_dentry->d_name.name, 1876 old_dentry->d_parent->d_name.name, old_dentry->d_name.name,
1844 new_dentry->d_parent->d_name.name, new_dentry->d_name.name, 1877 new_dentry->d_parent->d_name.name, new_dentry->d_name.name,
1845 atomic_read(&new_dentry->d_count)); 1878 new_dentry->d_count);
1846 1879
1847 /* 1880 /*
1848 * For non-directories, check whether the target is busy and if so, 1881 * For non-directories, check whether the target is busy and if so,
@@ -1860,7 +1893,7 @@ static int nfs_rename(struct inode *old_dir, struct dentry *old_dentry,
1860 rehash = new_dentry; 1893 rehash = new_dentry;
1861 } 1894 }
1862 1895
1863 if (atomic_read(&new_dentry->d_count) > 2) { 1896 if (new_dentry->d_count > 2) {
1864 int err; 1897 int err;
1865 1898
1866 /* copy the target dentry's name */ 1899 /* copy the target dentry's name */
@@ -2162,11 +2195,14 @@ int nfs_may_open(struct inode *inode, struct rpc_cred *cred, int openflags)
2162 return nfs_do_access(inode, cred, nfs_open_permission_mask(openflags)); 2195 return nfs_do_access(inode, cred, nfs_open_permission_mask(openflags));
2163} 2196}
2164 2197
2165int nfs_permission(struct inode *inode, int mask) 2198int nfs_permission(struct inode *inode, int mask, unsigned int flags)
2166{ 2199{
2167 struct rpc_cred *cred; 2200 struct rpc_cred *cred;
2168 int res = 0; 2201 int res = 0;
2169 2202
2203 if (flags & IPERM_FLAG_RCU)
2204 return -ECHILD;
2205
2170 nfs_inc_stats(inode, NFSIOS_VFSACCESS); 2206 nfs_inc_stats(inode, NFSIOS_VFSACCESS);
2171 2207
2172 if ((mask & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0) 2208 if ((mask & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0)
@@ -2214,7 +2250,7 @@ out:
2214out_notsup: 2250out_notsup:
2215 res = nfs_revalidate_inode(NFS_SERVER(inode), inode); 2251 res = nfs_revalidate_inode(NFS_SERVER(inode), inode);
2216 if (res == 0) 2252 if (res == 0)
2217 res = generic_permission(inode, mask, NULL); 2253 res = generic_permission(inode, mask, flags, NULL);
2218 goto out; 2254 goto out;
2219} 2255}
2220 2256
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 84d3c8b9020..9943a75bb6d 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -407,15 +407,18 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
407 pos += vec->iov_len; 407 pos += vec->iov_len;
408 } 408 }
409 409
410 /*
411 * If no bytes were started, return the error, and let the
412 * generic layer handle the completion.
413 */
414 if (requested_bytes == 0) {
415 nfs_direct_req_release(dreq);
416 return result < 0 ? result : -EIO;
417 }
418
410 if (put_dreq(dreq)) 419 if (put_dreq(dreq))
411 nfs_direct_complete(dreq); 420 nfs_direct_complete(dreq);
412 421 return 0;
413 if (requested_bytes != 0)
414 return 0;
415
416 if (result < 0)
417 return result;
418 return -EIO;
419} 422}
420 423
421static ssize_t nfs_direct_read(struct kiocb *iocb, const struct iovec *iov, 424static ssize_t nfs_direct_read(struct kiocb *iocb, const struct iovec *iov,
@@ -841,15 +844,18 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
841 pos += vec->iov_len; 844 pos += vec->iov_len;
842 } 845 }
843 846
847 /*
848 * If no bytes were started, return the error, and let the
849 * generic layer handle the completion.
850 */
851 if (requested_bytes == 0) {
852 nfs_direct_req_release(dreq);
853 return result < 0 ? result : -EIO;
854 }
855
844 if (put_dreq(dreq)) 856 if (put_dreq(dreq))
845 nfs_direct_write_complete(dreq, dreq->inode); 857 nfs_direct_write_complete(dreq, dreq->inode);
846 858 return 0;
847 if (requested_bytes != 0)
848 return 0;
849
850 if (result < 0)
851 return result;
852 return -EIO;
853} 859}
854 860
855static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov, 861static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov,
@@ -867,7 +873,7 @@ static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov,
867 goto out; 873 goto out;
868 nfs_alloc_commit_data(dreq); 874 nfs_alloc_commit_data(dreq);
869 875
870 if (dreq->commit_data == NULL || count < wsize) 876 if (dreq->commit_data == NULL || count <= wsize)
871 sync = NFS_FILE_SYNC; 877 sync = NFS_FILE_SYNC;
872 878
873 dreq->inode = inode; 879 dreq->inode = inode;
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 60677f9f131..7bf029ef408 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -693,6 +693,7 @@ do_getlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)
693{ 693{
694 struct inode *inode = filp->f_mapping->host; 694 struct inode *inode = filp->f_mapping->host;
695 int status = 0; 695 int status = 0;
696 unsigned int saved_type = fl->fl_type;
696 697
697 /* Try local locking first */ 698 /* Try local locking first */
698 posix_test_lock(filp, fl); 699 posix_test_lock(filp, fl);
@@ -700,6 +701,7 @@ do_getlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)
700 /* found a conflict */ 701 /* found a conflict */
701 goto out; 702 goto out;
702 } 703 }
704 fl->fl_type = saved_type;
703 705
704 if (nfs_have_delegation(inode, FMODE_READ)) 706 if (nfs_have_delegation(inode, FMODE_READ))
705 goto out_noconflict; 707 goto out_noconflict;
diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c
index ac7b814ce16..b5ffe8fa291 100644
--- a/fs/nfs/getroot.c
+++ b/fs/nfs/getroot.c
@@ -63,9 +63,11 @@ static int nfs_superblock_set_dummy_root(struct super_block *sb, struct inode *i
63 * This again causes shrink_dcache_for_umount_subtree() to 63 * This again causes shrink_dcache_for_umount_subtree() to
64 * Oops, since the test for IS_ROOT() will fail. 64 * Oops, since the test for IS_ROOT() will fail.
65 */ 65 */
66 spin_lock(&dcache_lock); 66 spin_lock(&sb->s_root->d_inode->i_lock);
67 spin_lock(&sb->s_root->d_lock);
67 list_del_init(&sb->s_root->d_alias); 68 list_del_init(&sb->s_root->d_alias);
68 spin_unlock(&dcache_lock); 69 spin_unlock(&sb->s_root->d_lock);
70 spin_unlock(&sb->s_root->d_inode->i_lock);
69 } 71 }
70 return 0; 72 return 0;
71} 73}
@@ -117,9 +119,6 @@ struct dentry *nfs_get_root(struct super_block *sb, struct nfs_fh *mntfh)
117 } 119 }
118 120
119 security_d_instantiate(ret, inode); 121 security_d_instantiate(ret, inode);
120
121 if (ret->d_op == NULL)
122 ret->d_op = server->nfs_client->rpc_ops->dentry_ops;
123out: 122out:
124 nfs_free_fattr(fsinfo.fattr); 123 nfs_free_fattr(fsinfo.fattr);
125 return ret; 124 return ret;
@@ -225,9 +224,6 @@ struct dentry *nfs4_get_root(struct super_block *sb, struct nfs_fh *mntfh)
225 224
226 security_d_instantiate(ret, inode); 225 security_d_instantiate(ret, inode);
227 226
228 if (ret->d_op == NULL)
229 ret->d_op = server->nfs_client->rpc_ops->dentry_ops;
230
231out: 227out:
232 nfs_free_fattr(fattr); 228 nfs_free_fattr(fattr);
233 dprintk("<-- nfs4_get_root()\n"); 229 dprintk("<-- nfs4_get_root()\n");
diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c
index 4e2d9b6b138..18696882f1c 100644
--- a/fs/nfs/idmap.c
+++ b/fs/nfs/idmap.c
@@ -238,7 +238,7 @@ int nfs_map_gid_to_group(struct nfs_client *clp, __u32 gid, char *buf, size_t bu
238 return nfs_idmap_lookup_name(gid, "group", buf, buflen); 238 return nfs_idmap_lookup_name(gid, "group", buf, buflen);
239} 239}
240 240
241#else /* CONFIG_NFS_USE_IDMAPPER not defined */ 241#else /* CONFIG_NFS_USE_NEW_IDMAPPER not defined */
242 242
243#include <linux/module.h> 243#include <linux/module.h>
244#include <linux/mutex.h> 244#include <linux/mutex.h>
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 314f5716460..1cc600e77bb 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -289,6 +289,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
289 } else if (S_ISDIR(inode->i_mode)) { 289 } else if (S_ISDIR(inode->i_mode)) {
290 inode->i_op = NFS_SB(sb)->nfs_client->rpc_ops->dir_inode_ops; 290 inode->i_op = NFS_SB(sb)->nfs_client->rpc_ops->dir_inode_ops;
291 inode->i_fop = &nfs_dir_operations; 291 inode->i_fop = &nfs_dir_operations;
292 inode->i_data.a_ops = &nfs_dir_aops;
292 if (nfs_server_capable(inode, NFS_CAP_READDIRPLUS)) 293 if (nfs_server_capable(inode, NFS_CAP_READDIRPLUS))
293 set_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags); 294 set_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags);
294 /* Deal with crossing mountpoints */ 295 /* Deal with crossing mountpoints */
@@ -299,7 +300,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
299 else 300 else
300 inode->i_op = &nfs_mountpoint_inode_operations; 301 inode->i_op = &nfs_mountpoint_inode_operations;
301 inode->i_fop = NULL; 302 inode->i_fop = NULL;
302 set_bit(NFS_INO_MOUNTPOINT, &nfsi->flags); 303 inode->i_flags |= S_AUTOMOUNT;
303 } 304 }
304 } else if (S_ISLNK(inode->i_mode)) 305 } else if (S_ISLNK(inode->i_mode))
305 inode->i_op = &nfs_symlink_inode_operations; 306 inode->i_op = &nfs_symlink_inode_operations;
@@ -880,9 +881,10 @@ out:
880 return ret; 881 return ret;
881} 882}
882 883
883static void nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr) 884static unsigned long nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr)
884{ 885{
885 struct nfs_inode *nfsi = NFS_I(inode); 886 struct nfs_inode *nfsi = NFS_I(inode);
887 unsigned long ret = 0;
886 888
887 if ((fattr->valid & NFS_ATTR_FATTR_PRECHANGE) 889 if ((fattr->valid & NFS_ATTR_FATTR_PRECHANGE)
888 && (fattr->valid & NFS_ATTR_FATTR_CHANGE) 890 && (fattr->valid & NFS_ATTR_FATTR_CHANGE)
@@ -890,25 +892,32 @@ static void nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr)
890 nfsi->change_attr = fattr->change_attr; 892 nfsi->change_attr = fattr->change_attr;
891 if (S_ISDIR(inode->i_mode)) 893 if (S_ISDIR(inode->i_mode))
892 nfsi->cache_validity |= NFS_INO_INVALID_DATA; 894 nfsi->cache_validity |= NFS_INO_INVALID_DATA;
895 ret |= NFS_INO_INVALID_ATTR;
893 } 896 }
894 /* If we have atomic WCC data, we may update some attributes */ 897 /* If we have atomic WCC data, we may update some attributes */
895 if ((fattr->valid & NFS_ATTR_FATTR_PRECTIME) 898 if ((fattr->valid & NFS_ATTR_FATTR_PRECTIME)
896 && (fattr->valid & NFS_ATTR_FATTR_CTIME) 899 && (fattr->valid & NFS_ATTR_FATTR_CTIME)
897 && timespec_equal(&inode->i_ctime, &fattr->pre_ctime)) 900 && timespec_equal(&inode->i_ctime, &fattr->pre_ctime)) {
898 memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime)); 901 memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime));
902 ret |= NFS_INO_INVALID_ATTR;
903 }
899 904
900 if ((fattr->valid & NFS_ATTR_FATTR_PREMTIME) 905 if ((fattr->valid & NFS_ATTR_FATTR_PREMTIME)
901 && (fattr->valid & NFS_ATTR_FATTR_MTIME) 906 && (fattr->valid & NFS_ATTR_FATTR_MTIME)
902 && timespec_equal(&inode->i_mtime, &fattr->pre_mtime)) { 907 && timespec_equal(&inode->i_mtime, &fattr->pre_mtime)) {
903 memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime)); 908 memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime));
904 if (S_ISDIR(inode->i_mode)) 909 if (S_ISDIR(inode->i_mode))
905 nfsi->cache_validity |= NFS_INO_INVALID_DATA; 910 nfsi->cache_validity |= NFS_INO_INVALID_DATA;
911 ret |= NFS_INO_INVALID_ATTR;
906 } 912 }
907 if ((fattr->valid & NFS_ATTR_FATTR_PRESIZE) 913 if ((fattr->valid & NFS_ATTR_FATTR_PRESIZE)
908 && (fattr->valid & NFS_ATTR_FATTR_SIZE) 914 && (fattr->valid & NFS_ATTR_FATTR_SIZE)
909 && i_size_read(inode) == nfs_size_to_loff_t(fattr->pre_size) 915 && i_size_read(inode) == nfs_size_to_loff_t(fattr->pre_size)
910 && nfsi->npages == 0) 916 && nfsi->npages == 0) {
911 i_size_write(inode, nfs_size_to_loff_t(fattr->size)); 917 i_size_write(inode, nfs_size_to_loff_t(fattr->size));
918 ret |= NFS_INO_INVALID_ATTR;
919 }
920 return ret;
912} 921}
913 922
914/** 923/**
@@ -1207,7 +1216,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
1207 /* Update the fsid? */ 1216 /* Update the fsid? */
1208 if (S_ISDIR(inode->i_mode) && (fattr->valid & NFS_ATTR_FATTR_FSID) && 1217 if (S_ISDIR(inode->i_mode) && (fattr->valid & NFS_ATTR_FATTR_FSID) &&
1209 !nfs_fsid_equal(&server->fsid, &fattr->fsid) && 1218 !nfs_fsid_equal(&server->fsid, &fattr->fsid) &&
1210 !test_bit(NFS_INO_MOUNTPOINT, &nfsi->flags)) 1219 !IS_AUTOMOUNT(inode))
1211 server->fsid = fattr->fsid; 1220 server->fsid = fattr->fsid;
1212 1221
1213 /* 1222 /*
@@ -1222,7 +1231,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
1222 | NFS_INO_REVAL_PAGECACHE); 1231 | NFS_INO_REVAL_PAGECACHE);
1223 1232
1224 /* Do atomic weak cache consistency updates */ 1233 /* Do atomic weak cache consistency updates */
1225 nfs_wcc_update_inode(inode, fattr); 1234 invalid |= nfs_wcc_update_inode(inode, fattr);
1226 1235
1227 /* More cache consistency checks */ 1236 /* More cache consistency checks */
1228 if (fattr->valid & NFS_ATTR_FATTR_CHANGE) { 1237 if (fattr->valid & NFS_ATTR_FATTR_CHANGE) {
@@ -1409,9 +1418,9 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
1409 */ 1418 */
1410void nfs4_evict_inode(struct inode *inode) 1419void nfs4_evict_inode(struct inode *inode)
1411{ 1420{
1421 pnfs_destroy_layout(NFS_I(inode));
1412 truncate_inode_pages(&inode->i_data, 0); 1422 truncate_inode_pages(&inode->i_data, 0);
1413 end_writeback(inode); 1423 end_writeback(inode);
1414 pnfs_destroy_layout(NFS_I(inode));
1415 /* If we are holding a delegation, return it! */ 1424 /* If we are holding a delegation, return it! */
1416 nfs_inode_return_delegation_noreclaim(inode); 1425 nfs_inode_return_delegation_noreclaim(inode);
1417 /* First call standard NFS clear_inode() code */ 1426 /* First call standard NFS clear_inode() code */
@@ -1437,11 +1446,18 @@ struct inode *nfs_alloc_inode(struct super_block *sb)
1437 return &nfsi->vfs_inode; 1446 return &nfsi->vfs_inode;
1438} 1447}
1439 1448
1440void nfs_destroy_inode(struct inode *inode) 1449static void nfs_i_callback(struct rcu_head *head)
1441{ 1450{
1451 struct inode *inode = container_of(head, struct inode, i_rcu);
1452 INIT_LIST_HEAD(&inode->i_dentry);
1442 kmem_cache_free(nfs_inode_cachep, NFS_I(inode)); 1453 kmem_cache_free(nfs_inode_cachep, NFS_I(inode));
1443} 1454}
1444 1455
1456void nfs_destroy_inode(struct inode *inode)
1457{
1458 call_rcu(&inode->i_rcu, nfs_i_callback);
1459}
1460
1445static inline void nfs4_init_once(struct nfs_inode *nfsi) 1461static inline void nfs4_init_once(struct nfs_inode *nfsi)
1446{ 1462{
1447#ifdef CONFIG_NFS_V4 1463#ifdef CONFIG_NFS_V4
@@ -1611,6 +1627,7 @@ static void __exit exit_nfs_fs(void)
1611#ifdef CONFIG_PROC_FS 1627#ifdef CONFIG_PROC_FS
1612 rpc_proc_unregister("nfs"); 1628 rpc_proc_unregister("nfs");
1613#endif 1629#endif
1630 nfs_cleanup_cb_ident_idr();
1614 unregister_nfs_fs(); 1631 unregister_nfs_fs();
1615 nfs_fs_proc_exit(); 1632 nfs_fs_proc_exit();
1616 nfsiod_stop(); 1633 nfsiod_stop();
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index db08ff3ff45..cf9fdbdabc6 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -128,9 +128,12 @@ extern void nfs_umount(const struct nfs_mount_request *info);
128/* client.c */ 128/* client.c */
129extern struct rpc_program nfs_program; 129extern struct rpc_program nfs_program;
130 130
131extern void nfs_cleanup_cb_ident_idr(void);
131extern void nfs_put_client(struct nfs_client *); 132extern void nfs_put_client(struct nfs_client *);
132extern struct nfs_client *nfs_find_client(const struct sockaddr *, u32); 133extern struct nfs_client *nfs4_find_client_no_ident(const struct sockaddr *);
133extern struct nfs_client *nfs_find_client_next(struct nfs_client *); 134extern struct nfs_client *nfs4_find_client_ident(int);
135extern struct nfs_client *
136nfs4_find_client_sessionid(const struct sockaddr *, struct nfs4_sessionid *);
134extern struct nfs_server *nfs_create_server( 137extern struct nfs_server *nfs_create_server(
135 const struct nfs_parsed_mount_data *, 138 const struct nfs_parsed_mount_data *,
136 struct nfs_fh *); 139 struct nfs_fh *);
@@ -185,17 +188,20 @@ extern int __init nfs_init_directcache(void);
185extern void nfs_destroy_directcache(void); 188extern void nfs_destroy_directcache(void);
186 189
187/* nfs2xdr.c */ 190/* nfs2xdr.c */
188extern int nfs_stat_to_errno(int); 191extern int nfs_stat_to_errno(enum nfs_stat);
189extern struct rpc_procinfo nfs_procedures[]; 192extern struct rpc_procinfo nfs_procedures[];
190extern __be32 *nfs_decode_dirent(struct xdr_stream *, struct nfs_entry *, struct nfs_server *, int); 193extern int nfs2_decode_dirent(struct xdr_stream *,
194 struct nfs_entry *, int);
191 195
192/* nfs3xdr.c */ 196/* nfs3xdr.c */
193extern struct rpc_procinfo nfs3_procedures[]; 197extern struct rpc_procinfo nfs3_procedures[];
194extern __be32 *nfs3_decode_dirent(struct xdr_stream *, struct nfs_entry *, struct nfs_server *, int); 198extern int nfs3_decode_dirent(struct xdr_stream *,
199 struct nfs_entry *, int);
195 200
196/* nfs4xdr.c */ 201/* nfs4xdr.c */
197#ifdef CONFIG_NFS_V4 202#ifdef CONFIG_NFS_V4
198extern __be32 *nfs4_decode_dirent(struct xdr_stream *, struct nfs_entry *, struct nfs_server *, int); 203extern int nfs4_decode_dirent(struct xdr_stream *,
204 struct nfs_entry *, int);
199#endif 205#endif
200#ifdef CONFIG_NFS_V4_1 206#ifdef CONFIG_NFS_V4_1
201extern const u32 nfs41_maxread_overhead; 207extern const u32 nfs41_maxread_overhead;
@@ -245,6 +251,7 @@ extern char *nfs_path(const char *base,
245 const struct dentry *droot, 251 const struct dentry *droot,
246 const struct dentry *dentry, 252 const struct dentry *dentry,
247 char *buffer, ssize_t buflen); 253 char *buffer, ssize_t buflen);
254extern struct vfsmount *nfs_d_automount(struct path *path);
248 255
249/* getroot.c */ 256/* getroot.c */
250extern struct dentry *nfs_get_root(struct super_block *, struct nfs_fh *); 257extern struct dentry *nfs_get_root(struct super_block *, struct nfs_fh *);
@@ -362,6 +369,15 @@ unsigned int nfs_page_length(struct page *page)
362} 369}
363 370
364/* 371/*
372 * Convert a umode to a dirent->d_type
373 */
374static inline
375unsigned char nfs_umode_to_dtype(umode_t mode)
376{
377 return (mode >> 12) & 15;
378}
379
380/*
365 * Determine the number of pages in an array of length 'len' and 381 * Determine the number of pages in an array of length 'len' and
366 * with a base offset of 'base' 382 * with a base offset of 'base'
367 */ 383 */
diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c
index eceafe74f47..d4c2d6b7507 100644
--- a/fs/nfs/mount_clnt.c
+++ b/fs/nfs/mount_clnt.c
@@ -236,10 +236,8 @@ void nfs_umount(const struct nfs_mount_request *info)
236 .authflavor = RPC_AUTH_UNIX, 236 .authflavor = RPC_AUTH_UNIX,
237 .flags = RPC_CLNT_CREATE_NOPING, 237 .flags = RPC_CLNT_CREATE_NOPING,
238 }; 238 };
239 struct mountres result;
240 struct rpc_message msg = { 239 struct rpc_message msg = {
241 .rpc_argp = info->dirpath, 240 .rpc_argp = info->dirpath,
242 .rpc_resp = &result,
243 }; 241 };
244 struct rpc_clnt *clnt; 242 struct rpc_clnt *clnt;
245 int status; 243 int status;
@@ -248,7 +246,7 @@ void nfs_umount(const struct nfs_mount_request *info)
248 args.flags |= RPC_CLNT_CREATE_NONPRIVPORT; 246 args.flags |= RPC_CLNT_CREATE_NONPRIVPORT;
249 247
250 clnt = rpc_create(&args); 248 clnt = rpc_create(&args);
251 if (unlikely(IS_ERR(clnt))) 249 if (IS_ERR(clnt))
252 goto out_clnt_err; 250 goto out_clnt_err;
253 251
254 dprintk("NFS: sending UMNT request for %s:%s\n", 252 dprintk("NFS: sending UMNT request for %s:%s\n",
@@ -280,29 +278,20 @@ out_call_err:
280 * XDR encode/decode functions for MOUNT 278 * XDR encode/decode functions for MOUNT
281 */ 279 */
282 280
283static int encode_mntdirpath(struct xdr_stream *xdr, const char *pathname) 281static void encode_mntdirpath(struct xdr_stream *xdr, const char *pathname)
284{ 282{
285 const u32 pathname_len = strlen(pathname); 283 const u32 pathname_len = strlen(pathname);
286 __be32 *p; 284 __be32 *p;
287 285
288 if (unlikely(pathname_len > MNTPATHLEN)) 286 BUG_ON(pathname_len > MNTPATHLEN);
289 return -EIO; 287 p = xdr_reserve_space(xdr, 4 + pathname_len);
290
291 p = xdr_reserve_space(xdr, sizeof(u32) + pathname_len);
292 if (unlikely(p == NULL))
293 return -EIO;
294 xdr_encode_opaque(p, pathname, pathname_len); 288 xdr_encode_opaque(p, pathname, pathname_len);
295
296 return 0;
297} 289}
298 290
299static int mnt_enc_dirpath(struct rpc_rqst *req, __be32 *p, 291static void mnt_xdr_enc_dirpath(struct rpc_rqst *req, struct xdr_stream *xdr,
300 const char *dirpath) 292 const char *dirpath)
301{ 293{
302 struct xdr_stream xdr; 294 encode_mntdirpath(xdr, dirpath);
303
304 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
305 return encode_mntdirpath(&xdr, dirpath);
306} 295}
307 296
308/* 297/*
@@ -320,10 +309,10 @@ static int decode_status(struct xdr_stream *xdr, struct mountres *res)
320 u32 status; 309 u32 status;
321 __be32 *p; 310 __be32 *p;
322 311
323 p = xdr_inline_decode(xdr, sizeof(status)); 312 p = xdr_inline_decode(xdr, 4);
324 if (unlikely(p == NULL)) 313 if (unlikely(p == NULL))
325 return -EIO; 314 return -EIO;
326 status = ntohl(*p); 315 status = be32_to_cpup(p);
327 316
328 for (i = 0; i < ARRAY_SIZE(mnt_errtbl); i++) { 317 for (i = 0; i < ARRAY_SIZE(mnt_errtbl); i++) {
329 if (mnt_errtbl[i].status == status) { 318 if (mnt_errtbl[i].status == status) {
@@ -351,18 +340,16 @@ static int decode_fhandle(struct xdr_stream *xdr, struct mountres *res)
351 return 0; 340 return 0;
352} 341}
353 342
354static int mnt_dec_mountres(struct rpc_rqst *req, __be32 *p, 343static int mnt_xdr_dec_mountres(struct rpc_rqst *req,
355 struct mountres *res) 344 struct xdr_stream *xdr,
345 struct mountres *res)
356{ 346{
357 struct xdr_stream xdr;
358 int status; 347 int status;
359 348
360 xdr_init_decode(&xdr, &req->rq_rcv_buf, p); 349 status = decode_status(xdr, res);
361
362 status = decode_status(&xdr, res);
363 if (unlikely(status != 0 || res->errno != 0)) 350 if (unlikely(status != 0 || res->errno != 0))
364 return status; 351 return status;
365 return decode_fhandle(&xdr, res); 352 return decode_fhandle(xdr, res);
366} 353}
367 354
368static int decode_fhs_status(struct xdr_stream *xdr, struct mountres *res) 355static int decode_fhs_status(struct xdr_stream *xdr, struct mountres *res)
@@ -371,10 +358,10 @@ static int decode_fhs_status(struct xdr_stream *xdr, struct mountres *res)
371 u32 status; 358 u32 status;
372 __be32 *p; 359 __be32 *p;
373 360
374 p = xdr_inline_decode(xdr, sizeof(status)); 361 p = xdr_inline_decode(xdr, 4);
375 if (unlikely(p == NULL)) 362 if (unlikely(p == NULL))
376 return -EIO; 363 return -EIO;
377 status = ntohl(*p); 364 status = be32_to_cpup(p);
378 365
379 for (i = 0; i < ARRAY_SIZE(mnt3_errtbl); i++) { 366 for (i = 0; i < ARRAY_SIZE(mnt3_errtbl); i++) {
380 if (mnt3_errtbl[i].status == status) { 367 if (mnt3_errtbl[i].status == status) {
@@ -394,11 +381,11 @@ static int decode_fhandle3(struct xdr_stream *xdr, struct mountres *res)
394 u32 size; 381 u32 size;
395 __be32 *p; 382 __be32 *p;
396 383
397 p = xdr_inline_decode(xdr, sizeof(size)); 384 p = xdr_inline_decode(xdr, 4);
398 if (unlikely(p == NULL)) 385 if (unlikely(p == NULL))
399 return -EIO; 386 return -EIO;
400 387
401 size = ntohl(*p++); 388 size = be32_to_cpup(p);
402 if (size > NFS3_FHSIZE || size == 0) 389 if (size > NFS3_FHSIZE || size == 0)
403 return -EIO; 390 return -EIO;
404 391
@@ -421,15 +408,15 @@ static int decode_auth_flavors(struct xdr_stream *xdr, struct mountres *res)
421 if (*count == 0) 408 if (*count == 0)
422 return 0; 409 return 0;
423 410
424 p = xdr_inline_decode(xdr, sizeof(entries)); 411 p = xdr_inline_decode(xdr, 4);
425 if (unlikely(p == NULL)) 412 if (unlikely(p == NULL))
426 return -EIO; 413 return -EIO;
427 entries = ntohl(*p); 414 entries = be32_to_cpup(p);
428 dprintk("NFS: received %u auth flavors\n", entries); 415 dprintk("NFS: received %u auth flavors\n", entries);
429 if (entries > NFS_MAX_SECFLAVORS) 416 if (entries > NFS_MAX_SECFLAVORS)
430 entries = NFS_MAX_SECFLAVORS; 417 entries = NFS_MAX_SECFLAVORS;
431 418
432 p = xdr_inline_decode(xdr, sizeof(u32) * entries); 419 p = xdr_inline_decode(xdr, 4 * entries);
433 if (unlikely(p == NULL)) 420 if (unlikely(p == NULL))
434 return -EIO; 421 return -EIO;
435 422
@@ -437,7 +424,7 @@ static int decode_auth_flavors(struct xdr_stream *xdr, struct mountres *res)
437 entries = *count; 424 entries = *count;
438 425
439 for (i = 0; i < entries; i++) { 426 for (i = 0; i < entries; i++) {
440 flavors[i] = ntohl(*p++); 427 flavors[i] = be32_to_cpup(p++);
441 dprintk("NFS: auth flavor[%u]: %d\n", i, flavors[i]); 428 dprintk("NFS: auth flavor[%u]: %d\n", i, flavors[i]);
442 } 429 }
443 *count = i; 430 *count = i;
@@ -445,30 +432,28 @@ static int decode_auth_flavors(struct xdr_stream *xdr, struct mountres *res)
445 return 0; 432 return 0;
446} 433}
447 434
448static int mnt_dec_mountres3(struct rpc_rqst *req, __be32 *p, 435static int mnt_xdr_dec_mountres3(struct rpc_rqst *req,
449 struct mountres *res) 436 struct xdr_stream *xdr,
437 struct mountres *res)
450{ 438{
451 struct xdr_stream xdr;
452 int status; 439 int status;
453 440
454 xdr_init_decode(&xdr, &req->rq_rcv_buf, p); 441 status = decode_fhs_status(xdr, res);
455
456 status = decode_fhs_status(&xdr, res);
457 if (unlikely(status != 0 || res->errno != 0)) 442 if (unlikely(status != 0 || res->errno != 0))
458 return status; 443 return status;
459 status = decode_fhandle3(&xdr, res); 444 status = decode_fhandle3(xdr, res);
460 if (unlikely(status != 0)) { 445 if (unlikely(status != 0)) {
461 res->errno = -EBADHANDLE; 446 res->errno = -EBADHANDLE;
462 return 0; 447 return 0;
463 } 448 }
464 return decode_auth_flavors(&xdr, res); 449 return decode_auth_flavors(xdr, res);
465} 450}
466 451
467static struct rpc_procinfo mnt_procedures[] = { 452static struct rpc_procinfo mnt_procedures[] = {
468 [MOUNTPROC_MNT] = { 453 [MOUNTPROC_MNT] = {
469 .p_proc = MOUNTPROC_MNT, 454 .p_proc = MOUNTPROC_MNT,
470 .p_encode = (kxdrproc_t)mnt_enc_dirpath, 455 .p_encode = (kxdreproc_t)mnt_xdr_enc_dirpath,
471 .p_decode = (kxdrproc_t)mnt_dec_mountres, 456 .p_decode = (kxdrdproc_t)mnt_xdr_dec_mountres,
472 .p_arglen = MNT_enc_dirpath_sz, 457 .p_arglen = MNT_enc_dirpath_sz,
473 .p_replen = MNT_dec_mountres_sz, 458 .p_replen = MNT_dec_mountres_sz,
474 .p_statidx = MOUNTPROC_MNT, 459 .p_statidx = MOUNTPROC_MNT,
@@ -476,7 +461,7 @@ static struct rpc_procinfo mnt_procedures[] = {
476 }, 461 },
477 [MOUNTPROC_UMNT] = { 462 [MOUNTPROC_UMNT] = {
478 .p_proc = MOUNTPROC_UMNT, 463 .p_proc = MOUNTPROC_UMNT,
479 .p_encode = (kxdrproc_t)mnt_enc_dirpath, 464 .p_encode = (kxdreproc_t)mnt_xdr_enc_dirpath,
480 .p_arglen = MNT_enc_dirpath_sz, 465 .p_arglen = MNT_enc_dirpath_sz,
481 .p_statidx = MOUNTPROC_UMNT, 466 .p_statidx = MOUNTPROC_UMNT,
482 .p_name = "UMOUNT", 467 .p_name = "UMOUNT",
@@ -486,8 +471,8 @@ static struct rpc_procinfo mnt_procedures[] = {
486static struct rpc_procinfo mnt3_procedures[] = { 471static struct rpc_procinfo mnt3_procedures[] = {
487 [MOUNTPROC3_MNT] = { 472 [MOUNTPROC3_MNT] = {
488 .p_proc = MOUNTPROC3_MNT, 473 .p_proc = MOUNTPROC3_MNT,
489 .p_encode = (kxdrproc_t)mnt_enc_dirpath, 474 .p_encode = (kxdreproc_t)mnt_xdr_enc_dirpath,
490 .p_decode = (kxdrproc_t)mnt_dec_mountres3, 475 .p_decode = (kxdrdproc_t)mnt_xdr_dec_mountres3,
491 .p_arglen = MNT_enc_dirpath_sz, 476 .p_arglen = MNT_enc_dirpath_sz,
492 .p_replen = MNT_dec_mountres3_sz, 477 .p_replen = MNT_dec_mountres3_sz,
493 .p_statidx = MOUNTPROC3_MNT, 478 .p_statidx = MOUNTPROC3_MNT,
@@ -495,7 +480,7 @@ static struct rpc_procinfo mnt3_procedures[] = {
495 }, 480 },
496 [MOUNTPROC3_UMNT] = { 481 [MOUNTPROC3_UMNT] = {
497 .p_proc = MOUNTPROC3_UMNT, 482 .p_proc = MOUNTPROC3_UMNT,
498 .p_encode = (kxdrproc_t)mnt_enc_dirpath, 483 .p_encode = (kxdreproc_t)mnt_xdr_enc_dirpath,
499 .p_arglen = MNT_enc_dirpath_sz, 484 .p_arglen = MNT_enc_dirpath_sz,
500 .p_statidx = MOUNTPROC3_UMNT, 485 .p_statidx = MOUNTPROC3_UMNT,
501 .p_name = "UMOUNT", 486 .p_name = "UMOUNT",
@@ -505,13 +490,13 @@ static struct rpc_procinfo mnt3_procedures[] = {
505 490
506static struct rpc_version mnt_version1 = { 491static struct rpc_version mnt_version1 = {
507 .number = 1, 492 .number = 1,
508 .nrprocs = 2, 493 .nrprocs = ARRAY_SIZE(mnt_procedures),
509 .procs = mnt_procedures, 494 .procs = mnt_procedures,
510}; 495};
511 496
512static struct rpc_version mnt_version3 = { 497static struct rpc_version mnt_version3 = {
513 .number = 3, 498 .number = 3,
514 .nrprocs = 2, 499 .nrprocs = ARRAY_SIZE(mnt3_procedures),
515 .procs = mnt3_procedures, 500 .procs = mnt3_procedures,
516}; 501};
517 502
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index db6aa3673cf..f32b8603dca 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -49,12 +49,17 @@ char *nfs_path(const char *base,
49 const struct dentry *dentry, 49 const struct dentry *dentry,
50 char *buffer, ssize_t buflen) 50 char *buffer, ssize_t buflen)
51{ 51{
52 char *end = buffer+buflen; 52 char *end;
53 int namelen; 53 int namelen;
54 unsigned seq;
54 55
56rename_retry:
57 end = buffer+buflen;
55 *--end = '\0'; 58 *--end = '\0';
56 buflen--; 59 buflen--;
57 spin_lock(&dcache_lock); 60
61 seq = read_seqbegin(&rename_lock);
62 rcu_read_lock();
58 while (!IS_ROOT(dentry) && dentry != droot) { 63 while (!IS_ROOT(dentry) && dentry != droot) {
59 namelen = dentry->d_name.len; 64 namelen = dentry->d_name.len;
60 buflen -= namelen + 1; 65 buflen -= namelen + 1;
@@ -65,7 +70,9 @@ char *nfs_path(const char *base,
65 *--end = '/'; 70 *--end = '/';
66 dentry = dentry->d_parent; 71 dentry = dentry->d_parent;
67 } 72 }
68 spin_unlock(&dcache_lock); 73 rcu_read_unlock();
74 if (read_seqretry(&rename_lock, seq))
75 goto rename_retry;
69 if (*end != '/') { 76 if (*end != '/') {
70 if (--buflen < 0) 77 if (--buflen < 0)
71 goto Elong; 78 goto Elong;
@@ -82,15 +89,16 @@ char *nfs_path(const char *base,
82 memcpy(end, base, namelen); 89 memcpy(end, base, namelen);
83 return end; 90 return end;
84Elong_unlock: 91Elong_unlock:
85 spin_unlock(&dcache_lock); 92 rcu_read_unlock();
93 if (read_seqretry(&rename_lock, seq))
94 goto rename_retry;
86Elong: 95Elong:
87 return ERR_PTR(-ENAMETOOLONG); 96 return ERR_PTR(-ENAMETOOLONG);
88} 97}
89 98
90/* 99/*
91 * nfs_follow_mountpoint - handle crossing a mountpoint on the server 100 * nfs_d_automount - Handle crossing a mountpoint on the server
92 * @dentry - dentry of mountpoint 101 * @path - The mountpoint
93 * @nd - nameidata info
94 * 102 *
95 * When we encounter a mountpoint on the server, we want to set up 103 * When we encounter a mountpoint on the server, we want to set up
96 * a mountpoint on the client too, to prevent inode numbers from 104 * a mountpoint on the client too, to prevent inode numbers from
@@ -100,87 +108,65 @@ Elong:
100 * situation, and that different filesystems may want to use 108 * situation, and that different filesystems may want to use
101 * different security flavours. 109 * different security flavours.
102 */ 110 */
103static void * nfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd) 111struct vfsmount *nfs_d_automount(struct path *path)
104{ 112{
105 struct vfsmount *mnt; 113 struct vfsmount *mnt;
106 struct nfs_server *server = NFS_SERVER(dentry->d_inode); 114 struct nfs_server *server = NFS_SERVER(path->dentry->d_inode);
107 struct dentry *parent; 115 struct dentry *parent;
108 struct nfs_fh *fh = NULL; 116 struct nfs_fh *fh = NULL;
109 struct nfs_fattr *fattr = NULL; 117 struct nfs_fattr *fattr = NULL;
110 int err; 118 int err;
111 119
112 dprintk("--> nfs_follow_mountpoint()\n"); 120 dprintk("--> nfs_d_automount()\n");
113 121
114 err = -ESTALE; 122 mnt = ERR_PTR(-ESTALE);
115 if (IS_ROOT(dentry)) 123 if (IS_ROOT(path->dentry))
116 goto out_err; 124 goto out_nofree;
117 125
118 err = -ENOMEM; 126 mnt = ERR_PTR(-ENOMEM);
119 fh = nfs_alloc_fhandle(); 127 fh = nfs_alloc_fhandle();
120 fattr = nfs_alloc_fattr(); 128 fattr = nfs_alloc_fattr();
121 if (fh == NULL || fattr == NULL) 129 if (fh == NULL || fattr == NULL)
122 goto out_err; 130 goto out;
123 131
124 dprintk("%s: enter\n", __func__); 132 dprintk("%s: enter\n", __func__);
125 dput(nd->path.dentry);
126 nd->path.dentry = dget(dentry);
127 133
128 /* Look it up again */ 134 /* Look it up again to get its attributes */
129 parent = dget_parent(nd->path.dentry); 135 parent = dget_parent(path->dentry);
130 err = server->nfs_client->rpc_ops->lookup(parent->d_inode, 136 err = server->nfs_client->rpc_ops->lookup(parent->d_inode,
131 &nd->path.dentry->d_name, 137 &path->dentry->d_name,
132 fh, fattr); 138 fh, fattr);
133 dput(parent); 139 dput(parent);
134 if (err != 0) 140 if (err != 0) {
135 goto out_err; 141 mnt = ERR_PTR(err);
142 goto out;
143 }
136 144
137 if (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) 145 if (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL)
138 mnt = nfs_do_refmount(nd->path.mnt, nd->path.dentry); 146 mnt = nfs_do_refmount(path->mnt, path->dentry);
139 else 147 else
140 mnt = nfs_do_submount(nd->path.mnt, nd->path.dentry, fh, 148 mnt = nfs_do_submount(path->mnt, path->dentry, fh, fattr);
141 fattr);
142 err = PTR_ERR(mnt);
143 if (IS_ERR(mnt)) 149 if (IS_ERR(mnt))
144 goto out_err; 150 goto out;
145 151
146 mntget(mnt); 152 dprintk("%s: done, success\n", __func__);
147 err = do_add_mount(mnt, &nd->path, nd->path.mnt->mnt_flags|MNT_SHRINKABLE, 153 mntget(mnt); /* prevent immediate expiration */
148 &nfs_automount_list); 154 mnt_set_expiry(mnt, &nfs_automount_list);
149 if (err < 0) {
150 mntput(mnt);
151 if (err == -EBUSY)
152 goto out_follow;
153 goto out_err;
154 }
155 path_put(&nd->path);
156 nd->path.mnt = mnt;
157 nd->path.dentry = dget(mnt->mnt_root);
158 schedule_delayed_work(&nfs_automount_task, nfs_mountpoint_expiry_timeout); 155 schedule_delayed_work(&nfs_automount_task, nfs_mountpoint_expiry_timeout);
156
159out: 157out:
160 nfs_free_fattr(fattr); 158 nfs_free_fattr(fattr);
161 nfs_free_fhandle(fh); 159 nfs_free_fhandle(fh);
162 dprintk("%s: done, returned %d\n", __func__, err); 160out_nofree:
163 161 dprintk("<-- nfs_follow_mountpoint() = %p\n", mnt);
164 dprintk("<-- nfs_follow_mountpoint() = %d\n", err); 162 return mnt;
165 return ERR_PTR(err);
166out_err:
167 path_put(&nd->path);
168 goto out;
169out_follow:
170 while (d_mountpoint(nd->path.dentry) &&
171 follow_down(&nd->path))
172 ;
173 err = 0;
174 goto out;
175} 163}
176 164
177const struct inode_operations nfs_mountpoint_inode_operations = { 165const struct inode_operations nfs_mountpoint_inode_operations = {
178 .follow_link = nfs_follow_mountpoint,
179 .getattr = nfs_getattr, 166 .getattr = nfs_getattr,
180}; 167};
181 168
182const struct inode_operations nfs_referral_inode_operations = { 169const struct inode_operations nfs_referral_inode_operations = {
183 .follow_link = nfs_follow_mountpoint,
184}; 170};
185 171
186static void nfs_expire_automounts(struct work_struct *work) 172static void nfs_expire_automounts(struct work_struct *work)
diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index e6bf45710cc..792cb13a430 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -61,582 +61,1008 @@
61#define NFS_readdirres_sz (1) 61#define NFS_readdirres_sz (1)
62#define NFS_statfsres_sz (1+NFS_info_sz) 62#define NFS_statfsres_sz (1+NFS_info_sz)
63 63
64
64/* 65/*
65 * Common NFS XDR functions as inlines 66 * While encoding arguments, set up the reply buffer in advance to
67 * receive reply data directly into the page cache.
66 */ 68 */
67static inline __be32 * 69static void prepare_reply_buffer(struct rpc_rqst *req, struct page **pages,
68xdr_encode_fhandle(__be32 *p, const struct nfs_fh *fhandle) 70 unsigned int base, unsigned int len,
71 unsigned int bufsize)
69{ 72{
70 memcpy(p, fhandle->data, NFS2_FHSIZE); 73 struct rpc_auth *auth = req->rq_cred->cr_auth;
71 return p + XDR_QUADLEN(NFS2_FHSIZE); 74 unsigned int replen;
75
76 replen = RPC_REPHDRSIZE + auth->au_rslack + bufsize;
77 xdr_inline_pages(&req->rq_rcv_buf, replen << 2, pages, base, len);
72} 78}
73 79
74static inline __be32 * 80/*
75xdr_decode_fhandle(__be32 *p, struct nfs_fh *fhandle) 81 * Handle decode buffer overflows out-of-line.
82 */
83static void print_overflow_msg(const char *func, const struct xdr_stream *xdr)
76{ 84{
77 /* NFSv2 handles have a fixed length */ 85 dprintk("NFS: %s prematurely hit the end of our receive buffer. "
78 fhandle->size = NFS2_FHSIZE; 86 "Remaining buffer length is %tu words.\n",
79 memcpy(fhandle->data, p, NFS2_FHSIZE); 87 func, xdr->end - xdr->p);
80 return p + XDR_QUADLEN(NFS2_FHSIZE);
81} 88}
82 89
83static inline __be32* 90
84xdr_encode_time(__be32 *p, struct timespec *timep) 91/*
92 * Encode/decode NFSv2 basic data types
93 *
94 * Basic NFSv2 data types are defined in section 2.3 of RFC 1094:
95 * "NFS: Network File System Protocol Specification".
96 *
97 * Not all basic data types have their own encoding and decoding
98 * functions. For run-time efficiency, some data types are encoded
99 * or decoded inline.
100 */
101
102/*
103 * typedef opaque nfsdata<>;
104 */
105static int decode_nfsdata(struct xdr_stream *xdr, struct nfs_readres *result)
85{ 106{
86 *p++ = htonl(timep->tv_sec); 107 u32 recvd, count;
87 /* Convert nanoseconds into microseconds */ 108 size_t hdrlen;
88 *p++ = htonl(timep->tv_nsec ? timep->tv_nsec / 1000 : 0); 109 __be32 *p;
110
111 p = xdr_inline_decode(xdr, 4);
112 if (unlikely(p == NULL))
113 goto out_overflow;
114 count = be32_to_cpup(p);
115 hdrlen = (u8 *)xdr->p - (u8 *)xdr->iov->iov_base;
116 recvd = xdr->buf->len - hdrlen;
117 if (unlikely(count > recvd))
118 goto out_cheating;
119out:
120 xdr_read_pages(xdr, count);
121 result->eof = 0; /* NFSv2 does not pass EOF flag on the wire. */
122 result->count = count;
123 return count;
124out_cheating:
125 dprintk("NFS: server cheating in read result: "
126 "count %u > recvd %u\n", count, recvd);
127 count = recvd;
128 goto out;
129out_overflow:
130 print_overflow_msg(__func__, xdr);
131 return -EIO;
132}
133
134/*
135 * enum stat {
136 * NFS_OK = 0,
137 * NFSERR_PERM = 1,
138 * NFSERR_NOENT = 2,
139 * NFSERR_IO = 5,
140 * NFSERR_NXIO = 6,
141 * NFSERR_ACCES = 13,
142 * NFSERR_EXIST = 17,
143 * NFSERR_NODEV = 19,
144 * NFSERR_NOTDIR = 20,
145 * NFSERR_ISDIR = 21,
146 * NFSERR_FBIG = 27,
147 * NFSERR_NOSPC = 28,
148 * NFSERR_ROFS = 30,
149 * NFSERR_NAMETOOLONG = 63,
150 * NFSERR_NOTEMPTY = 66,
151 * NFSERR_DQUOT = 69,
152 * NFSERR_STALE = 70,
153 * NFSERR_WFLUSH = 99
154 * };
155 */
156static int decode_stat(struct xdr_stream *xdr, enum nfs_stat *status)
157{
158 __be32 *p;
159
160 p = xdr_inline_decode(xdr, 4);
161 if (unlikely(p == NULL))
162 goto out_overflow;
163 *status = be32_to_cpup(p);
164 return 0;
165out_overflow:
166 print_overflow_msg(__func__, xdr);
167 return -EIO;
168}
169
170/*
171 * 2.3.2. ftype
172 *
173 * enum ftype {
174 * NFNON = 0,
175 * NFREG = 1,
176 * NFDIR = 2,
177 * NFBLK = 3,
178 * NFCHR = 4,
179 * NFLNK = 5
180 * };
181 *
182 */
183static __be32 *xdr_decode_ftype(__be32 *p, u32 *type)
184{
185 *type = be32_to_cpup(p++);
186 if (unlikely(*type > NF2FIFO))
187 *type = NFBAD;
89 return p; 188 return p;
90} 189}
91 190
92static inline __be32* 191/*
93xdr_encode_current_server_time(__be32 *p, struct timespec *timep) 192 * 2.3.3. fhandle
193 *
194 * typedef opaque fhandle[FHSIZE];
195 */
196static void encode_fhandle(struct xdr_stream *xdr, const struct nfs_fh *fh)
197{
198 __be32 *p;
199
200 BUG_ON(fh->size != NFS2_FHSIZE);
201 p = xdr_reserve_space(xdr, NFS2_FHSIZE);
202 memcpy(p, fh->data, NFS2_FHSIZE);
203}
204
205static int decode_fhandle(struct xdr_stream *xdr, struct nfs_fh *fh)
94{ 206{
95 /* 207 __be32 *p;
96 * Passing the invalid value useconds=1000000 is a 208
97 * Sun convention for "set to current server time". 209 p = xdr_inline_decode(xdr, NFS2_FHSIZE);
98 * It's needed to make permissions checks for the 210 if (unlikely(p == NULL))
99 * "touch" program across v2 mounts to Solaris and 211 goto out_overflow;
100 * Irix boxes work correctly. See description of 212 fh->size = NFS2_FHSIZE;
101 * sattr in section 6.1 of "NFS Illustrated" by 213 memcpy(fh->data, p, NFS2_FHSIZE);
102 * Brent Callaghan, Addison-Wesley, ISBN 0-201-32750-5 214 return 0;
103 */ 215out_overflow:
104 *p++ = htonl(timep->tv_sec); 216 print_overflow_msg(__func__, xdr);
105 *p++ = htonl(1000000); 217 return -EIO;
218}
219
220/*
221 * 2.3.4. timeval
222 *
223 * struct timeval {
224 * unsigned int seconds;
225 * unsigned int useconds;
226 * };
227 */
228static __be32 *xdr_encode_time(__be32 *p, const struct timespec *timep)
229{
230 *p++ = cpu_to_be32(timep->tv_sec);
231 if (timep->tv_nsec != 0)
232 *p++ = cpu_to_be32(timep->tv_nsec / NSEC_PER_USEC);
233 else
234 *p++ = cpu_to_be32(0);
235 return p;
236}
237
238/*
239 * Passing the invalid value useconds=1000000 is a Sun convention for
240 * "set to current server time". It's needed to make permissions checks
241 * for the "touch" program across v2 mounts to Solaris and Irix servers
242 * work correctly. See description of sattr in section 6.1 of "NFS
243 * Illustrated" by Brent Callaghan, Addison-Wesley, ISBN 0-201-32750-5.
244 */
245static __be32 *xdr_encode_current_server_time(__be32 *p,
246 const struct timespec *timep)
247{
248 *p++ = cpu_to_be32(timep->tv_sec);
249 *p++ = cpu_to_be32(1000000);
106 return p; 250 return p;
107} 251}
108 252
109static inline __be32* 253static __be32 *xdr_decode_time(__be32 *p, struct timespec *timep)
110xdr_decode_time(__be32 *p, struct timespec *timep)
111{ 254{
112 timep->tv_sec = ntohl(*p++); 255 timep->tv_sec = be32_to_cpup(p++);
113 /* Convert microseconds into nanoseconds */ 256 timep->tv_nsec = be32_to_cpup(p++) * NSEC_PER_USEC;
114 timep->tv_nsec = ntohl(*p++) * 1000;
115 return p; 257 return p;
116} 258}
117 259
118static __be32 * 260/*
119xdr_decode_fattr(__be32 *p, struct nfs_fattr *fattr) 261 * 2.3.5. fattr
262 *
263 * struct fattr {
264 * ftype type;
265 * unsigned int mode;
266 * unsigned int nlink;
267 * unsigned int uid;
268 * unsigned int gid;
269 * unsigned int size;
270 * unsigned int blocksize;
271 * unsigned int rdev;
272 * unsigned int blocks;
273 * unsigned int fsid;
274 * unsigned int fileid;
275 * timeval atime;
276 * timeval mtime;
277 * timeval ctime;
278 * };
279 *
280 */
281static int decode_fattr(struct xdr_stream *xdr, struct nfs_fattr *fattr)
120{ 282{
121 u32 rdev, type; 283 u32 rdev, type;
122 type = ntohl(*p++); 284 __be32 *p;
123 fattr->mode = ntohl(*p++); 285
124 fattr->nlink = ntohl(*p++); 286 p = xdr_inline_decode(xdr, NFS_fattr_sz << 2);
125 fattr->uid = ntohl(*p++); 287 if (unlikely(p == NULL))
126 fattr->gid = ntohl(*p++); 288 goto out_overflow;
127 fattr->size = ntohl(*p++); 289
128 fattr->du.nfs2.blocksize = ntohl(*p++);
129 rdev = ntohl(*p++);
130 fattr->du.nfs2.blocks = ntohl(*p++);
131 fattr->fsid.major = ntohl(*p++);
132 fattr->fsid.minor = 0;
133 fattr->fileid = ntohl(*p++);
134 p = xdr_decode_time(p, &fattr->atime);
135 p = xdr_decode_time(p, &fattr->mtime);
136 p = xdr_decode_time(p, &fattr->ctime);
137 fattr->valid |= NFS_ATTR_FATTR_V2; 290 fattr->valid |= NFS_ATTR_FATTR_V2;
291
292 p = xdr_decode_ftype(p, &type);
293
294 fattr->mode = be32_to_cpup(p++);
295 fattr->nlink = be32_to_cpup(p++);
296 fattr->uid = be32_to_cpup(p++);
297 fattr->gid = be32_to_cpup(p++);
298 fattr->size = be32_to_cpup(p++);
299 fattr->du.nfs2.blocksize = be32_to_cpup(p++);
300
301 rdev = be32_to_cpup(p++);
138 fattr->rdev = new_decode_dev(rdev); 302 fattr->rdev = new_decode_dev(rdev);
139 if (type == NFCHR && rdev == NFS2_FIFO_DEV) { 303 if (type == (u32)NFCHR && rdev == (u32)NFS2_FIFO_DEV) {
140 fattr->mode = (fattr->mode & ~S_IFMT) | S_IFIFO; 304 fattr->mode = (fattr->mode & ~S_IFMT) | S_IFIFO;
141 fattr->rdev = 0; 305 fattr->rdev = 0;
142 } 306 }
307
308 fattr->du.nfs2.blocks = be32_to_cpup(p++);
309 fattr->fsid.major = be32_to_cpup(p++);
310 fattr->fsid.minor = 0;
311 fattr->fileid = be32_to_cpup(p++);
312
313 p = xdr_decode_time(p, &fattr->atime);
314 p = xdr_decode_time(p, &fattr->mtime);
315 xdr_decode_time(p, &fattr->ctime);
316 return 0;
317out_overflow:
318 print_overflow_msg(__func__, xdr);
319 return -EIO;
320}
321
322/*
323 * 2.3.6. sattr
324 *
325 * struct sattr {
326 * unsigned int mode;
327 * unsigned int uid;
328 * unsigned int gid;
329 * unsigned int size;
330 * timeval atime;
331 * timeval mtime;
332 * };
333 */
334
335#define NFS2_SATTR_NOT_SET (0xffffffff)
336
337static __be32 *xdr_time_not_set(__be32 *p)
338{
339 *p++ = cpu_to_be32(NFS2_SATTR_NOT_SET);
340 *p++ = cpu_to_be32(NFS2_SATTR_NOT_SET);
143 return p; 341 return p;
144} 342}
145 343
146static inline __be32 * 344static void encode_sattr(struct xdr_stream *xdr, const struct iattr *attr)
147xdr_encode_sattr(__be32 *p, struct iattr *attr)
148{ 345{
149 const __be32 not_set = __constant_htonl(0xFFFFFFFF); 346 __be32 *p;
150 347
151 *p++ = (attr->ia_valid & ATTR_MODE) ? htonl(attr->ia_mode) : not_set; 348 p = xdr_reserve_space(xdr, NFS_sattr_sz << 2);
152 *p++ = (attr->ia_valid & ATTR_UID) ? htonl(attr->ia_uid) : not_set;
153 *p++ = (attr->ia_valid & ATTR_GID) ? htonl(attr->ia_gid) : not_set;
154 *p++ = (attr->ia_valid & ATTR_SIZE) ? htonl(attr->ia_size) : not_set;
155 349
156 if (attr->ia_valid & ATTR_ATIME_SET) { 350 if (attr->ia_valid & ATTR_MODE)
351 *p++ = cpu_to_be32(attr->ia_mode);
352 else
353 *p++ = cpu_to_be32(NFS2_SATTR_NOT_SET);
354 if (attr->ia_valid & ATTR_UID)
355 *p++ = cpu_to_be32(attr->ia_uid);
356 else
357 *p++ = cpu_to_be32(NFS2_SATTR_NOT_SET);
358 if (attr->ia_valid & ATTR_GID)
359 *p++ = cpu_to_be32(attr->ia_gid);
360 else
361 *p++ = cpu_to_be32(NFS2_SATTR_NOT_SET);
362 if (attr->ia_valid & ATTR_SIZE)
363 *p++ = cpu_to_be32((u32)attr->ia_size);
364 else
365 *p++ = cpu_to_be32(NFS2_SATTR_NOT_SET);
366
367 if (attr->ia_valid & ATTR_ATIME_SET)
157 p = xdr_encode_time(p, &attr->ia_atime); 368 p = xdr_encode_time(p, &attr->ia_atime);
158 } else if (attr->ia_valid & ATTR_ATIME) { 369 else if (attr->ia_valid & ATTR_ATIME)
159 p = xdr_encode_current_server_time(p, &attr->ia_atime); 370 p = xdr_encode_current_server_time(p, &attr->ia_atime);
160 } else { 371 else
161 *p++ = not_set; 372 p = xdr_time_not_set(p);
162 *p++ = not_set; 373 if (attr->ia_valid & ATTR_MTIME_SET)
163 } 374 xdr_encode_time(p, &attr->ia_mtime);
164 375 else if (attr->ia_valid & ATTR_MTIME)
165 if (attr->ia_valid & ATTR_MTIME_SET) { 376 xdr_encode_current_server_time(p, &attr->ia_mtime);
166 p = xdr_encode_time(p, &attr->ia_mtime); 377 else
167 } else if (attr->ia_valid & ATTR_MTIME) { 378 xdr_time_not_set(p);
168 p = xdr_encode_current_server_time(p, &attr->ia_mtime);
169 } else {
170 *p++ = not_set;
171 *p++ = not_set;
172 }
173 return p;
174} 379}
175 380
176/* 381/*
177 * NFS encode functions 382 * 2.3.7. filename
383 *
384 * typedef string filename<MAXNAMLEN>;
178 */ 385 */
386static void encode_filename(struct xdr_stream *xdr,
387 const char *name, u32 length)
388{
389 __be32 *p;
390
391 BUG_ON(length > NFS2_MAXNAMLEN);
392 p = xdr_reserve_space(xdr, 4 + length);
393 xdr_encode_opaque(p, name, length);
394}
395
396static int decode_filename_inline(struct xdr_stream *xdr,
397 const char **name, u32 *length)
398{
399 __be32 *p;
400 u32 count;
401
402 p = xdr_inline_decode(xdr, 4);
403 if (unlikely(p == NULL))
404 goto out_overflow;
405 count = be32_to_cpup(p);
406 if (count > NFS3_MAXNAMLEN)
407 goto out_nametoolong;
408 p = xdr_inline_decode(xdr, count);
409 if (unlikely(p == NULL))
410 goto out_overflow;
411 *name = (const char *)p;
412 *length = count;
413 return 0;
414out_nametoolong:
415 dprintk("NFS: returned filename too long: %u\n", count);
416 return -ENAMETOOLONG;
417out_overflow:
418 print_overflow_msg(__func__, xdr);
419 return -EIO;
420}
421
179/* 422/*
180 * Encode file handle argument 423 * 2.3.8. path
181 * GETATTR, READLINK, STATFS 424 *
425 * typedef string path<MAXPATHLEN>;
182 */ 426 */
183static int 427static void encode_path(struct xdr_stream *xdr, struct page **pages, u32 length)
184nfs_xdr_fhandle(struct rpc_rqst *req, __be32 *p, struct nfs_fh *fh) 428{
429 __be32 *p;
430
431 BUG_ON(length > NFS2_MAXPATHLEN);
432 p = xdr_reserve_space(xdr, 4);
433 *p = cpu_to_be32(length);
434 xdr_write_pages(xdr, pages, 0, length);
435}
436
437static int decode_path(struct xdr_stream *xdr)
185{ 438{
186 p = xdr_encode_fhandle(p, fh); 439 u32 length, recvd;
187 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); 440 size_t hdrlen;
441 __be32 *p;
442
443 p = xdr_inline_decode(xdr, 4);
444 if (unlikely(p == NULL))
445 goto out_overflow;
446 length = be32_to_cpup(p);
447 if (unlikely(length >= xdr->buf->page_len || length > NFS_MAXPATHLEN))
448 goto out_size;
449 hdrlen = (u8 *)xdr->p - (u8 *)xdr->iov->iov_base;
450 recvd = xdr->buf->len - hdrlen;
451 if (unlikely(length > recvd))
452 goto out_cheating;
453
454 xdr_read_pages(xdr, length);
455 xdr_terminate_string(xdr->buf, length);
188 return 0; 456 return 0;
457out_size:
458 dprintk("NFS: returned pathname too long: %u\n", length);
459 return -ENAMETOOLONG;
460out_cheating:
461 dprintk("NFS: server cheating in pathname result: "
462 "length %u > received %u\n", length, recvd);
463 return -EIO;
464out_overflow:
465 print_overflow_msg(__func__, xdr);
466 return -EIO;
189} 467}
190 468
191/* 469/*
192 * Encode SETATTR arguments 470 * 2.3.9. attrstat
471 *
472 * union attrstat switch (stat status) {
473 * case NFS_OK:
474 * fattr attributes;
475 * default:
476 * void;
477 * };
193 */ 478 */
194static int 479static int decode_attrstat(struct xdr_stream *xdr, struct nfs_fattr *result)
195nfs_xdr_sattrargs(struct rpc_rqst *req, __be32 *p, struct nfs_sattrargs *args)
196{ 480{
197 p = xdr_encode_fhandle(p, args->fh); 481 enum nfs_stat status;
198 p = xdr_encode_sattr(p, args->sattr); 482 int error;
199 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); 483
200 return 0; 484 error = decode_stat(xdr, &status);
485 if (unlikely(error))
486 goto out;
487 if (status != NFS_OK)
488 goto out_default;
489 error = decode_fattr(xdr, result);
490out:
491 return error;
492out_default:
493 return nfs_stat_to_errno(status);
201} 494}
202 495
203/* 496/*
204 * Encode directory ops argument 497 * 2.3.10. diropargs
205 * LOOKUP, RMDIR 498 *
499 * struct diropargs {
500 * fhandle dir;
501 * filename name;
502 * };
206 */ 503 */
207static int 504static void encode_diropargs(struct xdr_stream *xdr, const struct nfs_fh *fh,
208nfs_xdr_diropargs(struct rpc_rqst *req, __be32 *p, struct nfs_diropargs *args) 505 const char *name, u32 length)
209{ 506{
210 p = xdr_encode_fhandle(p, args->fh); 507 encode_fhandle(xdr, fh);
211 p = xdr_encode_array(p, args->name, args->len); 508 encode_filename(xdr, name, length);
212 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
213 return 0;
214} 509}
215 510
216/* 511/*
217 * Encode REMOVE argument 512 * 2.3.11. diropres
513 *
514 * union diropres switch (stat status) {
515 * case NFS_OK:
516 * struct {
517 * fhandle file;
518 * fattr attributes;
519 * } diropok;
520 * default:
521 * void;
522 * };
218 */ 523 */
219static int 524static int decode_diropok(struct xdr_stream *xdr, struct nfs_diropok *result)
220nfs_xdr_removeargs(struct rpc_rqst *req, __be32 *p, const struct nfs_removeargs *args)
221{ 525{
222 p = xdr_encode_fhandle(p, args->fh); 526 int error;
223 p = xdr_encode_array(p, args->name.name, args->name.len); 527
224 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); 528 error = decode_fhandle(xdr, result->fh);
225 return 0; 529 if (unlikely(error))
530 goto out;
531 error = decode_fattr(xdr, result->fattr);
532out:
533 return error;
534}
535
536static int decode_diropres(struct xdr_stream *xdr, struct nfs_diropok *result)
537{
538 enum nfs_stat status;
539 int error;
540
541 error = decode_stat(xdr, &status);
542 if (unlikely(error))
543 goto out;
544 if (status != NFS_OK)
545 goto out_default;
546 error = decode_diropok(xdr, result);
547out:
548 return error;
549out_default:
550 return nfs_stat_to_errno(status);
226} 551}
227 552
553
228/* 554/*
229 * Arguments to a READ call. Since we read data directly into the page 555 * NFSv2 XDR encode functions
230 * cache, we also set up the reply iovec here so that iov[1] points 556 *
231 * exactly to the page we want to fetch. 557 * NFSv2 argument types are defined in section 2.2 of RFC 1094:
558 * "NFS: Network File System Protocol Specification".
232 */ 559 */
233static int 560
234nfs_xdr_readargs(struct rpc_rqst *req, __be32 *p, struct nfs_readargs *args) 561static void nfs2_xdr_enc_fhandle(struct rpc_rqst *req,
562 struct xdr_stream *xdr,
563 const struct nfs_fh *fh)
235{ 564{
236 struct rpc_auth *auth = req->rq_cred->cr_auth; 565 encode_fhandle(xdr, fh);
237 unsigned int replen; 566}
238 u32 offset = (u32)args->offset; 567
568/*
569 * 2.2.3. sattrargs
570 *
571 * struct sattrargs {
572 * fhandle file;
573 * sattr attributes;
574 * };
575 */
576static void nfs2_xdr_enc_sattrargs(struct rpc_rqst *req,
577 struct xdr_stream *xdr,
578 const struct nfs_sattrargs *args)
579{
580 encode_fhandle(xdr, args->fh);
581 encode_sattr(xdr, args->sattr);
582}
583
584static void nfs2_xdr_enc_diropargs(struct rpc_rqst *req,
585 struct xdr_stream *xdr,
586 const struct nfs_diropargs *args)
587{
588 encode_diropargs(xdr, args->fh, args->name, args->len);
589}
590
591static void nfs2_xdr_enc_readlinkargs(struct rpc_rqst *req,
592 struct xdr_stream *xdr,
593 const struct nfs_readlinkargs *args)
594{
595 encode_fhandle(xdr, args->fh);
596 prepare_reply_buffer(req, args->pages, args->pgbase,
597 args->pglen, NFS_readlinkres_sz);
598}
599
600/*
601 * 2.2.7. readargs
602 *
603 * struct readargs {
604 * fhandle file;
605 * unsigned offset;
606 * unsigned count;
607 * unsigned totalcount;
608 * };
609 */
610static void encode_readargs(struct xdr_stream *xdr,
611 const struct nfs_readargs *args)
612{
613 u32 offset = args->offset;
239 u32 count = args->count; 614 u32 count = args->count;
615 __be32 *p;
240 616
241 p = xdr_encode_fhandle(p, args->fh); 617 encode_fhandle(xdr, args->fh);
242 *p++ = htonl(offset); 618
243 *p++ = htonl(count); 619 p = xdr_reserve_space(xdr, 4 + 4 + 4);
244 *p++ = htonl(count); 620 *p++ = cpu_to_be32(offset);
245 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); 621 *p++ = cpu_to_be32(count);
622 *p = cpu_to_be32(count);
623}
246 624
247 /* Inline the page array */ 625static void nfs2_xdr_enc_readargs(struct rpc_rqst *req,
248 replen = (RPC_REPHDRSIZE + auth->au_rslack + NFS_readres_sz) << 2; 626 struct xdr_stream *xdr,
249 xdr_inline_pages(&req->rq_rcv_buf, replen, 627 const struct nfs_readargs *args)
250 args->pages, args->pgbase, count); 628{
629 encode_readargs(xdr, args);
630 prepare_reply_buffer(req, args->pages, args->pgbase,
631 args->count, NFS_readres_sz);
251 req->rq_rcv_buf.flags |= XDRBUF_READ; 632 req->rq_rcv_buf.flags |= XDRBUF_READ;
252 return 0;
253} 633}
254 634
255/* 635/*
256 * Decode READ reply 636 * 2.2.9. writeargs
637 *
638 * struct writeargs {
639 * fhandle file;
640 * unsigned beginoffset;
641 * unsigned offset;
642 * unsigned totalcount;
643 * nfsdata data;
644 * };
257 */ 645 */
258static int 646static void encode_writeargs(struct xdr_stream *xdr,
259nfs_xdr_readres(struct rpc_rqst *req, __be32 *p, struct nfs_readres *res) 647 const struct nfs_writeargs *args)
260{ 648{
261 struct kvec *iov = req->rq_rcv_buf.head; 649 u32 offset = args->offset;
262 size_t hdrlen; 650 u32 count = args->count;
263 u32 count, recvd; 651 __be32 *p;
264 int status;
265
266 if ((status = ntohl(*p++)))
267 return nfs_stat_to_errno(status);
268 p = xdr_decode_fattr(p, res->fattr);
269
270 count = ntohl(*p++);
271 res->eof = 0;
272 hdrlen = (u8 *) p - (u8 *) iov->iov_base;
273 if (iov->iov_len < hdrlen) {
274 dprintk("NFS: READ reply header overflowed:"
275 "length %Zu > %Zu\n", hdrlen, iov->iov_len);
276 return -errno_NFSERR_IO;
277 } else if (iov->iov_len != hdrlen) {
278 dprintk("NFS: READ header is short. iovec will be shifted.\n");
279 xdr_shift_buf(&req->rq_rcv_buf, iov->iov_len - hdrlen);
280 }
281 652
282 recvd = req->rq_rcv_buf.len - hdrlen; 653 encode_fhandle(xdr, args->fh);
283 if (count > recvd) {
284 dprintk("NFS: server cheating in read reply: "
285 "count %u > recvd %u\n", count, recvd);
286 count = recvd;
287 }
288 654
289 dprintk("RPC: readres OK count %u\n", count); 655 p = xdr_reserve_space(xdr, 4 + 4 + 4 + 4);
290 if (count < res->count) 656 *p++ = cpu_to_be32(offset);
291 res->count = count; 657 *p++ = cpu_to_be32(offset);
658 *p++ = cpu_to_be32(count);
292 659
293 return count; 660 /* nfsdata */
661 *p = cpu_to_be32(count);
662 xdr_write_pages(xdr, args->pages, args->pgbase, count);
294} 663}
295 664
665static void nfs2_xdr_enc_writeargs(struct rpc_rqst *req,
666 struct xdr_stream *xdr,
667 const struct nfs_writeargs *args)
668{
669 encode_writeargs(xdr, args);
670 xdr->buf->flags |= XDRBUF_WRITE;
671}
296 672
297/* 673/*
298 * Write arguments. Splice the buffer to be written into the iovec. 674 * 2.2.10. createargs
675 *
676 * struct createargs {
677 * diropargs where;
678 * sattr attributes;
679 * };
299 */ 680 */
300static int 681static void nfs2_xdr_enc_createargs(struct rpc_rqst *req,
301nfs_xdr_writeargs(struct rpc_rqst *req, __be32 *p, struct nfs_writeargs *args) 682 struct xdr_stream *xdr,
683 const struct nfs_createargs *args)
302{ 684{
303 struct xdr_buf *sndbuf = &req->rq_snd_buf; 685 encode_diropargs(xdr, args->fh, args->name, args->len);
304 u32 offset = (u32)args->offset; 686 encode_sattr(xdr, args->sattr);
305 u32 count = args->count; 687}
306
307 p = xdr_encode_fhandle(p, args->fh);
308 *p++ = htonl(offset);
309 *p++ = htonl(offset);
310 *p++ = htonl(count);
311 *p++ = htonl(count);
312 sndbuf->len = xdr_adjust_iovec(sndbuf->head, p);
313 688
314 /* Copy the page array */ 689static void nfs2_xdr_enc_removeargs(struct rpc_rqst *req,
315 xdr_encode_pages(sndbuf, args->pages, args->pgbase, count); 690 struct xdr_stream *xdr,
316 sndbuf->flags |= XDRBUF_WRITE; 691 const struct nfs_removeargs *args)
317 return 0; 692{
693 encode_diropargs(xdr, args->fh, args->name.name, args->name.len);
318} 694}
319 695
320/* 696/*
321 * Encode create arguments 697 * 2.2.12. renameargs
322 * CREATE, MKDIR 698 *
699 * struct renameargs {
700 * diropargs from;
701 * diropargs to;
702 * };
323 */ 703 */
324static int 704static void nfs2_xdr_enc_renameargs(struct rpc_rqst *req,
325nfs_xdr_createargs(struct rpc_rqst *req, __be32 *p, struct nfs_createargs *args) 705 struct xdr_stream *xdr,
706 const struct nfs_renameargs *args)
326{ 707{
327 p = xdr_encode_fhandle(p, args->fh); 708 const struct qstr *old = args->old_name;
328 p = xdr_encode_array(p, args->name, args->len); 709 const struct qstr *new = args->new_name;
329 p = xdr_encode_sattr(p, args->sattr); 710
330 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); 711 encode_diropargs(xdr, args->old_dir, old->name, old->len);
331 return 0; 712 encode_diropargs(xdr, args->new_dir, new->name, new->len);
332} 713}
333 714
334/* 715/*
335 * Encode RENAME arguments 716 * 2.2.13. linkargs
717 *
718 * struct linkargs {
719 * fhandle from;
720 * diropargs to;
721 * };
336 */ 722 */
337static int 723static void nfs2_xdr_enc_linkargs(struct rpc_rqst *req,
338nfs_xdr_renameargs(struct rpc_rqst *req, __be32 *p, struct nfs_renameargs *args) 724 struct xdr_stream *xdr,
725 const struct nfs_linkargs *args)
339{ 726{
340 p = xdr_encode_fhandle(p, args->old_dir); 727 encode_fhandle(xdr, args->fromfh);
341 p = xdr_encode_array(p, args->old_name->name, args->old_name->len); 728 encode_diropargs(xdr, args->tofh, args->toname, args->tolen);
342 p = xdr_encode_fhandle(p, args->new_dir);
343 p = xdr_encode_array(p, args->new_name->name, args->new_name->len);
344 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
345 return 0;
346} 729}
347 730
348/* 731/*
349 * Encode LINK arguments 732 * 2.2.14. symlinkargs
733 *
734 * struct symlinkargs {
735 * diropargs from;
736 * path to;
737 * sattr attributes;
738 * };
350 */ 739 */
351static int 740static void nfs2_xdr_enc_symlinkargs(struct rpc_rqst *req,
352nfs_xdr_linkargs(struct rpc_rqst *req, __be32 *p, struct nfs_linkargs *args) 741 struct xdr_stream *xdr,
742 const struct nfs_symlinkargs *args)
353{ 743{
354 p = xdr_encode_fhandle(p, args->fromfh); 744 encode_diropargs(xdr, args->fromfh, args->fromname, args->fromlen);
355 p = xdr_encode_fhandle(p, args->tofh); 745 encode_path(xdr, args->pages, args->pathlen);
356 p = xdr_encode_array(p, args->toname, args->tolen); 746 encode_sattr(xdr, args->sattr);
357 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
358 return 0;
359} 747}
360 748
361/* 749/*
362 * Encode SYMLINK arguments 750 * 2.2.17. readdirargs
751 *
752 * struct readdirargs {
753 * fhandle dir;
754 * nfscookie cookie;
755 * unsigned count;
756 * };
363 */ 757 */
364static int 758static void encode_readdirargs(struct xdr_stream *xdr,
365nfs_xdr_symlinkargs(struct rpc_rqst *req, __be32 *p, struct nfs_symlinkargs *args) 759 const struct nfs_readdirargs *args)
366{ 760{
367 struct xdr_buf *sndbuf = &req->rq_snd_buf; 761 __be32 *p;
368 size_t pad;
369 762
370 p = xdr_encode_fhandle(p, args->fromfh); 763 encode_fhandle(xdr, args->fh);
371 p = xdr_encode_array(p, args->fromname, args->fromlen);
372 *p++ = htonl(args->pathlen);
373 sndbuf->len = xdr_adjust_iovec(sndbuf->head, p);
374 764
375 xdr_encode_pages(sndbuf, args->pages, 0, args->pathlen); 765 p = xdr_reserve_space(xdr, 4 + 4);
766 *p++ = cpu_to_be32(args->cookie);
767 *p = cpu_to_be32(args->count);
768}
376 769
377 /* 770static void nfs2_xdr_enc_readdirargs(struct rpc_rqst *req,
378 * xdr_encode_pages may have added a few bytes to ensure the 771 struct xdr_stream *xdr,
379 * pathname ends on a 4-byte boundary. Start encoding the 772 const struct nfs_readdirargs *args)
380 * attributes after the pad bytes. 773{
381 */ 774 encode_readdirargs(xdr, args);
382 pad = sndbuf->tail->iov_len; 775 prepare_reply_buffer(req, args->pages, 0,
383 if (pad > 0) 776 args->count, NFS_readdirres_sz);
384 p++;
385 p = xdr_encode_sattr(p, args->sattr);
386 sndbuf->len += xdr_adjust_iovec(sndbuf->tail, p) - pad;
387 return 0;
388} 777}
389 778
390/* 779/*
391 * Encode arguments to readdir call 780 * NFSv2 XDR decode functions
781 *
782 * NFSv2 result types are defined in section 2.2 of RFC 1094:
783 * "NFS: Network File System Protocol Specification".
392 */ 784 */
393static int 785
394nfs_xdr_readdirargs(struct rpc_rqst *req, __be32 *p, struct nfs_readdirargs *args) 786static int nfs2_xdr_dec_stat(struct rpc_rqst *req, struct xdr_stream *xdr,
787 void *__unused)
395{ 788{
396 struct rpc_auth *auth = req->rq_cred->cr_auth; 789 enum nfs_stat status;
397 unsigned int replen; 790 int error;
398 u32 count = args->count; 791
792 error = decode_stat(xdr, &status);
793 if (unlikely(error))
794 goto out;
795 if (status != NFS_OK)
796 goto out_default;
797out:
798 return error;
799out_default:
800 return nfs_stat_to_errno(status);
801}
399 802
400 p = xdr_encode_fhandle(p, args->fh); 803static int nfs2_xdr_dec_attrstat(struct rpc_rqst *req, struct xdr_stream *xdr,
401 *p++ = htonl(args->cookie); 804 struct nfs_fattr *result)
402 *p++ = htonl(count); /* see above */ 805{
403 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); 806 return decode_attrstat(xdr, result);
807}
404 808
405 /* Inline the page array */ 809static int nfs2_xdr_dec_diropres(struct rpc_rqst *req, struct xdr_stream *xdr,
406 replen = (RPC_REPHDRSIZE + auth->au_rslack + NFS_readdirres_sz) << 2; 810 struct nfs_diropok *result)
407 xdr_inline_pages(&req->rq_rcv_buf, replen, args->pages, 0, count); 811{
408 return 0; 812 return decode_diropres(xdr, result);
409} 813}
410 814
411/* 815/*
412 * Decode the result of a readdir call. 816 * 2.2.6. readlinkres
413 * We're not really decoding anymore, we just leave the buffer untouched 817 *
414 * and only check that it is syntactically correct. 818 * union readlinkres switch (stat status) {
415 * The real decoding happens in nfs_decode_entry below, called directly 819 * case NFS_OK:
416 * from nfs_readdir for each entry. 820 * path data;
821 * default:
822 * void;
823 * };
417 */ 824 */
418static int 825static int nfs2_xdr_dec_readlinkres(struct rpc_rqst *req,
419nfs_xdr_readdirres(struct rpc_rqst *req, __be32 *p, void *dummy) 826 struct xdr_stream *xdr, void *__unused)
420{ 827{
421 struct xdr_buf *rcvbuf = &req->rq_rcv_buf; 828 enum nfs_stat status;
422 struct kvec *iov = rcvbuf->head; 829 int error;
423 struct page **page; 830
424 size_t hdrlen; 831 error = decode_stat(xdr, &status);
425 unsigned int pglen, recvd; 832 if (unlikely(error))
426 int status, nr = 0; 833 goto out;
427 834 if (status != NFS_OK)
428 if ((status = ntohl(*p++))) 835 goto out_default;
429 return nfs_stat_to_errno(status); 836 error = decode_path(xdr);
430 837out:
431 hdrlen = (u8 *) p - (u8 *) iov->iov_base; 838 return error;
432 if (iov->iov_len < hdrlen) { 839out_default:
433 dprintk("NFS: READDIR reply header overflowed:" 840 return nfs_stat_to_errno(status);
434 "length %Zu > %Zu\n", hdrlen, iov->iov_len); 841}
435 return -errno_NFSERR_IO;
436 } else if (iov->iov_len != hdrlen) {
437 dprintk("NFS: READDIR header is short. iovec will be shifted.\n");
438 xdr_shift_buf(rcvbuf, iov->iov_len - hdrlen);
439 }
440 842
441 pglen = rcvbuf->page_len; 843/*
442 recvd = rcvbuf->len - hdrlen; 844 * 2.2.7. readres
443 if (pglen > recvd) 845 *
444 pglen = recvd; 846 * union readres switch (stat status) {
445 page = rcvbuf->pages; 847 * case NFS_OK:
446 return nr; 848 * fattr attributes;
849 * nfsdata data;
850 * default:
851 * void;
852 * };
853 */
854static int nfs2_xdr_dec_readres(struct rpc_rqst *req, struct xdr_stream *xdr,
855 struct nfs_readres *result)
856{
857 enum nfs_stat status;
858 int error;
859
860 error = decode_stat(xdr, &status);
861 if (unlikely(error))
862 goto out;
863 if (status != NFS_OK)
864 goto out_default;
865 error = decode_fattr(xdr, result->fattr);
866 if (unlikely(error))
867 goto out;
868 error = decode_nfsdata(xdr, result);
869out:
870 return error;
871out_default:
872 return nfs_stat_to_errno(status);
447} 873}
448 874
449static void print_overflow_msg(const char *func, const struct xdr_stream *xdr) 875static int nfs2_xdr_dec_writeres(struct rpc_rqst *req, struct xdr_stream *xdr,
876 struct nfs_writeres *result)
450{ 877{
451 dprintk("nfs: %s: prematurely hit end of receive buffer. " 878 /* All NFSv2 writes are "file sync" writes */
452 "Remaining buffer length is %tu words.\n", 879 result->verf->committed = NFS_FILE_SYNC;
453 func, xdr->end - xdr->p); 880 return decode_attrstat(xdr, result->fattr);
454} 881}
455 882
456__be32 * 883/**
457nfs_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, struct nfs_server *server, int plus) 884 * nfs2_decode_dirent - Decode a single NFSv2 directory entry stored in
885 * the local page cache.
886 * @xdr: XDR stream where entry resides
887 * @entry: buffer to fill in with entry data
888 * @plus: boolean indicating whether this should be a readdirplus entry
889 *
890 * Returns zero if successful, otherwise a negative errno value is
891 * returned.
892 *
893 * This function is not invoked during READDIR reply decoding, but
894 * rather whenever an application invokes the getdents(2) system call
895 * on a directory already in our cache.
896 *
897 * 2.2.17. entry
898 *
899 * struct entry {
900 * unsigned fileid;
901 * filename name;
902 * nfscookie cookie;
903 * entry *nextentry;
904 * };
905 */
906int nfs2_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
907 int plus)
458{ 908{
459 __be32 *p; 909 __be32 *p;
910 int error;
911
460 p = xdr_inline_decode(xdr, 4); 912 p = xdr_inline_decode(xdr, 4);
461 if (unlikely(!p)) 913 if (unlikely(p == NULL))
462 goto out_overflow; 914 goto out_overflow;
463 if (!ntohl(*p++)) { 915 if (*p++ == xdr_zero) {
464 p = xdr_inline_decode(xdr, 4); 916 p = xdr_inline_decode(xdr, 4);
465 if (unlikely(!p)) 917 if (unlikely(p == NULL))
466 goto out_overflow; 918 goto out_overflow;
467 if (!ntohl(*p++)) 919 if (*p++ == xdr_zero)
468 return ERR_PTR(-EAGAIN); 920 return -EAGAIN;
469 entry->eof = 1; 921 entry->eof = 1;
470 return ERR_PTR(-EBADCOOKIE); 922 return -EBADCOOKIE;
471 } 923 }
472 924
473 p = xdr_inline_decode(xdr, 8); 925 p = xdr_inline_decode(xdr, 4);
474 if (unlikely(!p)) 926 if (unlikely(p == NULL))
475 goto out_overflow; 927 goto out_overflow;
928 entry->ino = be32_to_cpup(p);
476 929
477 entry->ino = ntohl(*p++); 930 error = decode_filename_inline(xdr, &entry->name, &entry->len);
478 entry->len = ntohl(*p++); 931 if (unlikely(error))
932 return error;
479 933
480 p = xdr_inline_decode(xdr, entry->len + 4); 934 /*
481 if (unlikely(!p)) 935 * The type (size and byte order) of nfscookie isn't defined in
936 * RFC 1094. This implementation assumes that it's an XDR uint32.
937 */
938 entry->prev_cookie = entry->cookie;
939 p = xdr_inline_decode(xdr, 4);
940 if (unlikely(p == NULL))
482 goto out_overflow; 941 goto out_overflow;
483 entry->name = (const char *) p; 942 entry->cookie = be32_to_cpup(p);
484 p += XDR_QUADLEN(entry->len);
485 entry->prev_cookie = entry->cookie;
486 entry->cookie = ntohl(*p++);
487
488 p = xdr_inline_peek(xdr, 8);
489 if (p != NULL)
490 entry->eof = !p[0] && p[1];
491 else
492 entry->eof = 0;
493 943
494 return p; 944 entry->d_type = DT_UNKNOWN;
945
946 return 0;
495 947
496out_overflow: 948out_overflow:
497 print_overflow_msg(__func__, xdr); 949 print_overflow_msg(__func__, xdr);
498 return ERR_PTR(-EIO); 950 return -EAGAIN;
499}
500
501/*
502 * NFS XDR decode functions
503 */
504/*
505 * Decode simple status reply
506 */
507static int
508nfs_xdr_stat(struct rpc_rqst *req, __be32 *p, void *dummy)
509{
510 int status;
511
512 if ((status = ntohl(*p++)) != 0)
513 status = nfs_stat_to_errno(status);
514 return status;
515}
516
517/*
518 * Decode attrstat reply
519 * GETATTR, SETATTR, WRITE
520 */
521static int
522nfs_xdr_attrstat(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr)
523{
524 int status;
525
526 if ((status = ntohl(*p++)))
527 return nfs_stat_to_errno(status);
528 xdr_decode_fattr(p, fattr);
529 return 0;
530} 951}
531 952
532/* 953/*
533 * Decode diropres reply 954 * 2.2.17. readdirres
534 * LOOKUP, CREATE, MKDIR 955 *
956 * union readdirres switch (stat status) {
957 * case NFS_OK:
958 * struct {
959 * entry *entries;
960 * bool eof;
961 * } readdirok;
962 * default:
963 * void;
964 * };
965 *
966 * Read the directory contents into the page cache, but don't
967 * touch them. The actual decoding is done by nfs2_decode_dirent()
968 * during subsequent nfs_readdir() calls.
535 */ 969 */
536static int 970static int decode_readdirok(struct xdr_stream *xdr)
537nfs_xdr_diropres(struct rpc_rqst *req, __be32 *p, struct nfs_diropok *res)
538{ 971{
539 int status; 972 u32 recvd, pglen;
973 size_t hdrlen;
540 974
541 if ((status = ntohl(*p++))) 975 pglen = xdr->buf->page_len;
542 return nfs_stat_to_errno(status); 976 hdrlen = (u8 *)xdr->p - (u8 *)xdr->iov->iov_base;
543 p = xdr_decode_fhandle(p, res->fh); 977 recvd = xdr->buf->len - hdrlen;
544 xdr_decode_fattr(p, res->fattr); 978 if (unlikely(pglen > recvd))
545 return 0; 979 goto out_cheating;
980out:
981 xdr_read_pages(xdr, pglen);
982 return pglen;
983out_cheating:
984 dprintk("NFS: server cheating in readdir result: "
985 "pglen %u > recvd %u\n", pglen, recvd);
986 pglen = recvd;
987 goto out;
546} 988}
547 989
548/* 990static int nfs2_xdr_dec_readdirres(struct rpc_rqst *req,
549 * Encode READLINK args 991 struct xdr_stream *xdr, void *__unused)
550 */
551static int
552nfs_xdr_readlinkargs(struct rpc_rqst *req, __be32 *p, struct nfs_readlinkargs *args)
553{ 992{
554 struct rpc_auth *auth = req->rq_cred->cr_auth; 993 enum nfs_stat status;
555 unsigned int replen; 994 int error;
556 995
557 p = xdr_encode_fhandle(p, args->fh); 996 error = decode_stat(xdr, &status);
558 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); 997 if (unlikely(error))
559 998 goto out;
560 /* Inline the page array */ 999 if (status != NFS_OK)
561 replen = (RPC_REPHDRSIZE + auth->au_rslack + NFS_readlinkres_sz) << 2; 1000 goto out_default;
562 xdr_inline_pages(&req->rq_rcv_buf, replen, args->pages, args->pgbase, args->pglen); 1001 error = decode_readdirok(xdr);
563 return 0; 1002out:
1003 return error;
1004out_default:
1005 return nfs_stat_to_errno(status);
564} 1006}
565 1007
566/* 1008/*
567 * Decode READLINK reply 1009 * 2.2.18. statfsres
1010 *
1011 * union statfsres (stat status) {
1012 * case NFS_OK:
1013 * struct {
1014 * unsigned tsize;
1015 * unsigned bsize;
1016 * unsigned blocks;
1017 * unsigned bfree;
1018 * unsigned bavail;
1019 * } info;
1020 * default:
1021 * void;
1022 * };
568 */ 1023 */
569static int 1024static int decode_info(struct xdr_stream *xdr, struct nfs2_fsstat *result)
570nfs_xdr_readlinkres(struct rpc_rqst *req, __be32 *p, void *dummy)
571{ 1025{
572 struct xdr_buf *rcvbuf = &req->rq_rcv_buf; 1026 __be32 *p;
573 struct kvec *iov = rcvbuf->head;
574 size_t hdrlen;
575 u32 len, recvd;
576 int status;
577
578 if ((status = ntohl(*p++)))
579 return nfs_stat_to_errno(status);
580 /* Convert length of symlink */
581 len = ntohl(*p++);
582 if (len >= rcvbuf->page_len) {
583 dprintk("nfs: server returned giant symlink!\n");
584 return -ENAMETOOLONG;
585 }
586 hdrlen = (u8 *) p - (u8 *) iov->iov_base;
587 if (iov->iov_len < hdrlen) {
588 dprintk("NFS: READLINK reply header overflowed:"
589 "length %Zu > %Zu\n", hdrlen, iov->iov_len);
590 return -errno_NFSERR_IO;
591 } else if (iov->iov_len != hdrlen) {
592 dprintk("NFS: READLINK header is short. iovec will be shifted.\n");
593 xdr_shift_buf(rcvbuf, iov->iov_len - hdrlen);
594 }
595 recvd = req->rq_rcv_buf.len - hdrlen;
596 if (recvd < len) {
597 dprintk("NFS: server cheating in readlink reply: "
598 "count %u > recvd %u\n", len, recvd);
599 return -EIO;
600 }
601 1027
602 xdr_terminate_string(rcvbuf, len); 1028 p = xdr_inline_decode(xdr, NFS_info_sz << 2);
1029 if (unlikely(p == NULL))
1030 goto out_overflow;
1031 result->tsize = be32_to_cpup(p++);
1032 result->bsize = be32_to_cpup(p++);
1033 result->blocks = be32_to_cpup(p++);
1034 result->bfree = be32_to_cpup(p++);
1035 result->bavail = be32_to_cpup(p);
603 return 0; 1036 return 0;
1037out_overflow:
1038 print_overflow_msg(__func__, xdr);
1039 return -EIO;
604} 1040}
605 1041
606/* 1042static int nfs2_xdr_dec_statfsres(struct rpc_rqst *req, struct xdr_stream *xdr,
607 * Decode WRITE reply 1043 struct nfs2_fsstat *result)
608 */
609static int
610nfs_xdr_writeres(struct rpc_rqst *req, __be32 *p, struct nfs_writeres *res)
611{ 1044{
612 res->verf->committed = NFS_FILE_SYNC; 1045 enum nfs_stat status;
613 return nfs_xdr_attrstat(req, p, res->fattr); 1046 int error;
1047
1048 error = decode_stat(xdr, &status);
1049 if (unlikely(error))
1050 goto out;
1051 if (status != NFS_OK)
1052 goto out_default;
1053 error = decode_info(xdr, result);
1054out:
1055 return error;
1056out_default:
1057 return nfs_stat_to_errno(status);
614} 1058}
615 1059
616/*
617 * Decode STATFS reply
618 */
619static int
620nfs_xdr_statfsres(struct rpc_rqst *req, __be32 *p, struct nfs2_fsstat *res)
621{
622 int status;
623
624 if ((status = ntohl(*p++)))
625 return nfs_stat_to_errno(status);
626
627 res->tsize = ntohl(*p++);
628 res->bsize = ntohl(*p++);
629 res->blocks = ntohl(*p++);
630 res->bfree = ntohl(*p++);
631 res->bavail = ntohl(*p++);
632 return 0;
633}
634 1060
635/* 1061/*
636 * We need to translate between nfs status return values and 1062 * We need to translate between nfs status return values and
637 * the local errno values which may not be the same. 1063 * the local errno values which may not be the same.
638 */ 1064 */
639static struct { 1065static const struct {
640 int stat; 1066 int stat;
641 int errno; 1067 int errno;
642} nfs_errtbl[] = { 1068} nfs_errtbl[] = {
@@ -676,28 +1102,30 @@ static struct {
676 { -1, -EIO } 1102 { -1, -EIO }
677}; 1103};
678 1104
679/* 1105/**
680 * Convert an NFS error code to a local one. 1106 * nfs_stat_to_errno - convert an NFS status code to a local errno
681 * This one is used jointly by NFSv2 and NFSv3. 1107 * @status: NFS status code to convert
1108 *
1109 * Returns a local errno value, or -EIO if the NFS status code is
1110 * not recognized. This function is used jointly by NFSv2 and NFSv3.
682 */ 1111 */
683int 1112int nfs_stat_to_errno(enum nfs_stat status)
684nfs_stat_to_errno(int stat)
685{ 1113{
686 int i; 1114 int i;
687 1115
688 for (i = 0; nfs_errtbl[i].stat != -1; i++) { 1116 for (i = 0; nfs_errtbl[i].stat != -1; i++) {
689 if (nfs_errtbl[i].stat == stat) 1117 if (nfs_errtbl[i].stat == (int)status)
690 return nfs_errtbl[i].errno; 1118 return nfs_errtbl[i].errno;
691 } 1119 }
692 dprintk("nfs_stat_to_errno: bad nfs status return value: %d\n", stat); 1120 dprintk("NFS: Unrecognized nfs status value: %u\n", status);
693 return nfs_errtbl[i].errno; 1121 return nfs_errtbl[i].errno;
694} 1122}
695 1123
696#define PROC(proc, argtype, restype, timer) \ 1124#define PROC(proc, argtype, restype, timer) \
697[NFSPROC_##proc] = { \ 1125[NFSPROC_##proc] = { \
698 .p_proc = NFSPROC_##proc, \ 1126 .p_proc = NFSPROC_##proc, \
699 .p_encode = (kxdrproc_t) nfs_xdr_##argtype, \ 1127 .p_encode = (kxdreproc_t)nfs2_xdr_enc_##argtype, \
700 .p_decode = (kxdrproc_t) nfs_xdr_##restype, \ 1128 .p_decode = (kxdrdproc_t)nfs2_xdr_dec_##restype, \
701 .p_arglen = NFS_##argtype##_sz, \ 1129 .p_arglen = NFS_##argtype##_sz, \
702 .p_replen = NFS_##restype##_sz, \ 1130 .p_replen = NFS_##restype##_sz, \
703 .p_timer = timer, \ 1131 .p_timer = timer, \
@@ -705,21 +1133,21 @@ nfs_stat_to_errno(int stat)
705 .p_name = #proc, \ 1133 .p_name = #proc, \
706 } 1134 }
707struct rpc_procinfo nfs_procedures[] = { 1135struct rpc_procinfo nfs_procedures[] = {
708 PROC(GETATTR, fhandle, attrstat, 1), 1136 PROC(GETATTR, fhandle, attrstat, 1),
709 PROC(SETATTR, sattrargs, attrstat, 0), 1137 PROC(SETATTR, sattrargs, attrstat, 0),
710 PROC(LOOKUP, diropargs, diropres, 2), 1138 PROC(LOOKUP, diropargs, diropres, 2),
711 PROC(READLINK, readlinkargs, readlinkres, 3), 1139 PROC(READLINK, readlinkargs, readlinkres, 3),
712 PROC(READ, readargs, readres, 3), 1140 PROC(READ, readargs, readres, 3),
713 PROC(WRITE, writeargs, writeres, 4), 1141 PROC(WRITE, writeargs, writeres, 4),
714 PROC(CREATE, createargs, diropres, 0), 1142 PROC(CREATE, createargs, diropres, 0),
715 PROC(REMOVE, removeargs, stat, 0), 1143 PROC(REMOVE, removeargs, stat, 0),
716 PROC(RENAME, renameargs, stat, 0), 1144 PROC(RENAME, renameargs, stat, 0),
717 PROC(LINK, linkargs, stat, 0), 1145 PROC(LINK, linkargs, stat, 0),
718 PROC(SYMLINK, symlinkargs, stat, 0), 1146 PROC(SYMLINK, symlinkargs, stat, 0),
719 PROC(MKDIR, createargs, diropres, 0), 1147 PROC(MKDIR, createargs, diropres, 0),
720 PROC(RMDIR, diropargs, stat, 0), 1148 PROC(RMDIR, diropargs, stat, 0),
721 PROC(READDIR, readdirargs, readdirres, 3), 1149 PROC(READDIR, readdirargs, readdirres, 3),
722 PROC(STATFS, fhandle, statfsres, 0), 1150 PROC(STATFS, fhandle, statfsres, 0),
723}; 1151};
724 1152
725struct rpc_version nfs_version2 = { 1153struct rpc_version nfs_version2 = {
diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c
index 9f88c5f4c7e..27434277165 100644
--- a/fs/nfs/nfs3acl.c
+++ b/fs/nfs/nfs3acl.c
@@ -311,8 +311,8 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
311 if (!nfs_server_capable(inode, NFS_CAP_ACLS)) 311 if (!nfs_server_capable(inode, NFS_CAP_ACLS))
312 goto out; 312 goto out;
313 313
314 /* We are doing this here, because XDR marshalling can only 314 /* We are doing this here because XDR marshalling does not
315 return -ENOMEM. */ 315 * return any results, it BUGs. */
316 status = -ENOSPC; 316 status = -ENOSPC;
317 if (acl != NULL && acl->a_count > NFS_ACL_MAX_ENTRIES) 317 if (acl != NULL && acl->a_count > NFS_ACL_MAX_ENTRIES)
318 goto out; 318 goto out;
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index d9a5e832c25..183c6b123d0 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -37,18 +37,16 @@
37#define NFS3_filename_sz (1+(NFS3_MAXNAMLEN>>2)) 37#define NFS3_filename_sz (1+(NFS3_MAXNAMLEN>>2))
38#define NFS3_path_sz (1+(NFS3_MAXPATHLEN>>2)) 38#define NFS3_path_sz (1+(NFS3_MAXPATHLEN>>2))
39#define NFS3_fattr_sz (21) 39#define NFS3_fattr_sz (21)
40#define NFS3_wcc_attr_sz (6) 40#define NFS3_cookieverf_sz (NFS3_COOKIEVERFSIZE>>2)
41#define NFS3_wcc_attr_sz (6)
41#define NFS3_pre_op_attr_sz (1+NFS3_wcc_attr_sz) 42#define NFS3_pre_op_attr_sz (1+NFS3_wcc_attr_sz)
42#define NFS3_post_op_attr_sz (1+NFS3_fattr_sz) 43#define NFS3_post_op_attr_sz (1+NFS3_fattr_sz)
43#define NFS3_wcc_data_sz (NFS3_pre_op_attr_sz+NFS3_post_op_attr_sz) 44#define NFS3_wcc_data_sz (NFS3_pre_op_attr_sz+NFS3_post_op_attr_sz)
44#define NFS3_fsstat_sz
45#define NFS3_fsinfo_sz
46#define NFS3_pathconf_sz
47#define NFS3_entry_sz (NFS3_filename_sz+3)
48
49#define NFS3_sattrargs_sz (NFS3_fh_sz+NFS3_sattr_sz+3)
50#define NFS3_diropargs_sz (NFS3_fh_sz+NFS3_filename_sz) 45#define NFS3_diropargs_sz (NFS3_fh_sz+NFS3_filename_sz)
51#define NFS3_removeargs_sz (NFS3_fh_sz+NFS3_filename_sz) 46
47#define NFS3_getattrargs_sz (NFS3_fh_sz)
48#define NFS3_setattrargs_sz (NFS3_fh_sz+NFS3_sattr_sz+3)
49#define NFS3_lookupargs_sz (NFS3_fh_sz+NFS3_filename_sz)
52#define NFS3_accessargs_sz (NFS3_fh_sz+1) 50#define NFS3_accessargs_sz (NFS3_fh_sz+1)
53#define NFS3_readlinkargs_sz (NFS3_fh_sz) 51#define NFS3_readlinkargs_sz (NFS3_fh_sz)
54#define NFS3_readargs_sz (NFS3_fh_sz+3) 52#define NFS3_readargs_sz (NFS3_fh_sz+3)
@@ -57,14 +55,16 @@
57#define NFS3_mkdirargs_sz (NFS3_diropargs_sz+NFS3_sattr_sz) 55#define NFS3_mkdirargs_sz (NFS3_diropargs_sz+NFS3_sattr_sz)
58#define NFS3_symlinkargs_sz (NFS3_diropargs_sz+1+NFS3_sattr_sz) 56#define NFS3_symlinkargs_sz (NFS3_diropargs_sz+1+NFS3_sattr_sz)
59#define NFS3_mknodargs_sz (NFS3_diropargs_sz+2+NFS3_sattr_sz) 57#define NFS3_mknodargs_sz (NFS3_diropargs_sz+2+NFS3_sattr_sz)
58#define NFS3_removeargs_sz (NFS3_fh_sz+NFS3_filename_sz)
60#define NFS3_renameargs_sz (NFS3_diropargs_sz+NFS3_diropargs_sz) 59#define NFS3_renameargs_sz (NFS3_diropargs_sz+NFS3_diropargs_sz)
61#define NFS3_linkargs_sz (NFS3_fh_sz+NFS3_diropargs_sz) 60#define NFS3_linkargs_sz (NFS3_fh_sz+NFS3_diropargs_sz)
62#define NFS3_readdirargs_sz (NFS3_fh_sz+2) 61#define NFS3_readdirargs_sz (NFS3_fh_sz+NFS3_cookieverf_sz+3)
62#define NFS3_readdirplusargs_sz (NFS3_fh_sz+NFS3_cookieverf_sz+4)
63#define NFS3_commitargs_sz (NFS3_fh_sz+3) 63#define NFS3_commitargs_sz (NFS3_fh_sz+3)
64 64
65#define NFS3_attrstat_sz (1+NFS3_fattr_sz) 65#define NFS3_getattrres_sz (1+NFS3_fattr_sz)
66#define NFS3_wccstat_sz (1+NFS3_wcc_data_sz) 66#define NFS3_setattrres_sz (1+NFS3_wcc_data_sz)
67#define NFS3_removeres_sz (NFS3_wccstat_sz) 67#define NFS3_removeres_sz (NFS3_setattrres_sz)
68#define NFS3_lookupres_sz (1+NFS3_fh_sz+(2 * NFS3_post_op_attr_sz)) 68#define NFS3_lookupres_sz (1+NFS3_fh_sz+(2 * NFS3_post_op_attr_sz))
69#define NFS3_accessres_sz (1+NFS3_post_op_attr_sz+1) 69#define NFS3_accessres_sz (1+NFS3_post_op_attr_sz+1)
70#define NFS3_readlinkres_sz (1+NFS3_post_op_attr_sz+1) 70#define NFS3_readlinkres_sz (1+NFS3_post_op_attr_sz+1)
@@ -100,1077 +100,2365 @@ static const umode_t nfs_type2fmt[] = {
100 [NF3FIFO] = S_IFIFO, 100 [NF3FIFO] = S_IFIFO,
101}; 101};
102 102
103/*
104 * While encoding arguments, set up the reply buffer in advance to
105 * receive reply data directly into the page cache.
106 */
107static void prepare_reply_buffer(struct rpc_rqst *req, struct page **pages,
108 unsigned int base, unsigned int len,
109 unsigned int bufsize)
110{
111 struct rpc_auth *auth = req->rq_cred->cr_auth;
112 unsigned int replen;
113
114 replen = RPC_REPHDRSIZE + auth->au_rslack + bufsize;
115 xdr_inline_pages(&req->rq_rcv_buf, replen << 2, pages, base, len);
116}
117
118/*
119 * Handle decode buffer overflows out-of-line.
120 */
103static void print_overflow_msg(const char *func, const struct xdr_stream *xdr) 121static void print_overflow_msg(const char *func, const struct xdr_stream *xdr)
104{ 122{
105 dprintk("nfs: %s: prematurely hit end of receive buffer. " 123 dprintk("NFS: %s prematurely hit the end of our receive buffer. "
106 "Remaining buffer length is %tu words.\n", 124 "Remaining buffer length is %tu words.\n",
107 func, xdr->end - xdr->p); 125 func, xdr->end - xdr->p);
108} 126}
109 127
128
110/* 129/*
111 * Common NFS XDR functions as inlines 130 * Encode/decode NFSv3 basic data types
131 *
132 * Basic NFSv3 data types are defined in section 2.5 of RFC 1813:
133 * "NFS Version 3 Protocol Specification".
134 *
135 * Not all basic data types have their own encoding and decoding
136 * functions. For run-time efficiency, some data types are encoded
137 * or decoded inline.
112 */ 138 */
113static inline __be32 * 139
114xdr_encode_fhandle(__be32 *p, const struct nfs_fh *fh) 140static void encode_uint32(struct xdr_stream *xdr, u32 value)
115{ 141{
116 return xdr_encode_array(p, fh->data, fh->size); 142 __be32 *p = xdr_reserve_space(xdr, 4);
143 *p = cpu_to_be32(value);
117} 144}
118 145
119static inline __be32 * 146static int decode_uint32(struct xdr_stream *xdr, u32 *value)
120xdr_decode_fhandle(__be32 *p, struct nfs_fh *fh)
121{ 147{
122 if ((fh->size = ntohl(*p++)) <= NFS3_FHSIZE) { 148 __be32 *p;
123 memcpy(fh->data, p, fh->size); 149
124 return p + XDR_QUADLEN(fh->size); 150 p = xdr_inline_decode(xdr, 4);
125 } 151 if (unlikely(p == NULL))
126 return NULL; 152 goto out_overflow;
153 *value = be32_to_cpup(p);
154 return 0;
155out_overflow:
156 print_overflow_msg(__func__, xdr);
157 return -EIO;
158}
159
160static int decode_uint64(struct xdr_stream *xdr, u64 *value)
161{
162 __be32 *p;
163
164 p = xdr_inline_decode(xdr, 8);
165 if (unlikely(p == NULL))
166 goto out_overflow;
167 xdr_decode_hyper(p, value);
168 return 0;
169out_overflow:
170 print_overflow_msg(__func__, xdr);
171 return -EIO;
172}
173
174/*
175 * fileid3
176 *
177 * typedef uint64 fileid3;
178 */
179static __be32 *xdr_decode_fileid3(__be32 *p, u64 *fileid)
180{
181 return xdr_decode_hyper(p, fileid);
182}
183
184static int decode_fileid3(struct xdr_stream *xdr, u64 *fileid)
185{
186 return decode_uint64(xdr, fileid);
187}
188
189/*
190 * filename3
191 *
192 * typedef string filename3<>;
193 */
194static void encode_filename3(struct xdr_stream *xdr,
195 const char *name, u32 length)
196{
197 __be32 *p;
198
199 BUG_ON(length > NFS3_MAXNAMLEN);
200 p = xdr_reserve_space(xdr, 4 + length);
201 xdr_encode_opaque(p, name, length);
127} 202}
128 203
129static inline __be32 * 204static int decode_inline_filename3(struct xdr_stream *xdr,
130xdr_decode_fhandle_stream(struct xdr_stream *xdr, struct nfs_fh *fh) 205 const char **name, u32 *length)
131{ 206{
132 __be32 *p; 207 __be32 *p;
208 u32 count;
209
133 p = xdr_inline_decode(xdr, 4); 210 p = xdr_inline_decode(xdr, 4);
134 if (unlikely(!p)) 211 if (unlikely(p == NULL))
212 goto out_overflow;
213 count = be32_to_cpup(p);
214 if (count > NFS3_MAXNAMLEN)
215 goto out_nametoolong;
216 p = xdr_inline_decode(xdr, count);
217 if (unlikely(p == NULL))
135 goto out_overflow; 218 goto out_overflow;
136 fh->size = ntohl(*p++); 219 *name = (const char *)p;
220 *length = count;
221 return 0;
137 222
138 if (fh->size <= NFS3_FHSIZE) { 223out_nametoolong:
139 p = xdr_inline_decode(xdr, fh->size); 224 dprintk("NFS: returned filename too long: %u\n", count);
140 if (unlikely(!p)) 225 return -ENAMETOOLONG;
141 goto out_overflow; 226out_overflow:
142 memcpy(fh->data, p, fh->size); 227 print_overflow_msg(__func__, xdr);
143 return p + XDR_QUADLEN(fh->size); 228 return -EIO;
144 } 229}
145 return NULL; 230
231/*
232 * nfspath3
233 *
234 * typedef string nfspath3<>;
235 */
236static void encode_nfspath3(struct xdr_stream *xdr, struct page **pages,
237 const u32 length)
238{
239 BUG_ON(length > NFS3_MAXPATHLEN);
240 encode_uint32(xdr, length);
241 xdr_write_pages(xdr, pages, 0, length);
242}
146 243
244static int decode_nfspath3(struct xdr_stream *xdr)
245{
246 u32 recvd, count;
247 size_t hdrlen;
248 __be32 *p;
249
250 p = xdr_inline_decode(xdr, 4);
251 if (unlikely(p == NULL))
252 goto out_overflow;
253 count = be32_to_cpup(p);
254 if (unlikely(count >= xdr->buf->page_len || count > NFS3_MAXPATHLEN))
255 goto out_nametoolong;
256 hdrlen = (u8 *)xdr->p - (u8 *)xdr->iov->iov_base;
257 recvd = xdr->buf->len - hdrlen;
258 if (unlikely(count > recvd))
259 goto out_cheating;
260
261 xdr_read_pages(xdr, count);
262 xdr_terminate_string(xdr->buf, count);
263 return 0;
264
265out_nametoolong:
266 dprintk("NFS: returned pathname too long: %u\n", count);
267 return -ENAMETOOLONG;
268out_cheating:
269 dprintk("NFS: server cheating in pathname result: "
270 "count %u > recvd %u\n", count, recvd);
271 return -EIO;
147out_overflow: 272out_overflow:
148 print_overflow_msg(__func__, xdr); 273 print_overflow_msg(__func__, xdr);
149 return ERR_PTR(-EIO); 274 return -EIO;
150} 275}
151 276
152/* 277/*
153 * Encode/decode time. 278 * cookie3
279 *
280 * typedef uint64 cookie3
154 */ 281 */
155static inline __be32 * 282static __be32 *xdr_encode_cookie3(__be32 *p, u64 cookie)
156xdr_encode_time3(__be32 *p, struct timespec *timep)
157{ 283{
158 *p++ = htonl(timep->tv_sec); 284 return xdr_encode_hyper(p, cookie);
159 *p++ = htonl(timep->tv_nsec);
160 return p;
161} 285}
162 286
163static inline __be32 * 287static int decode_cookie3(struct xdr_stream *xdr, u64 *cookie)
164xdr_decode_time3(__be32 *p, struct timespec *timep)
165{ 288{
166 timep->tv_sec = ntohl(*p++); 289 return decode_uint64(xdr, cookie);
167 timep->tv_nsec = ntohl(*p++);
168 return p;
169} 290}
170 291
171static __be32 * 292/*
172xdr_decode_fattr(__be32 *p, struct nfs_fattr *fattr) 293 * cookieverf3
294 *
295 * typedef opaque cookieverf3[NFS3_COOKIEVERFSIZE];
296 */
297static __be32 *xdr_encode_cookieverf3(__be32 *p, const __be32 *verifier)
298{
299 memcpy(p, verifier, NFS3_COOKIEVERFSIZE);
300 return p + XDR_QUADLEN(NFS3_COOKIEVERFSIZE);
301}
302
303static int decode_cookieverf3(struct xdr_stream *xdr, __be32 *verifier)
304{
305 __be32 *p;
306
307 p = xdr_inline_decode(xdr, NFS3_COOKIEVERFSIZE);
308 if (unlikely(p == NULL))
309 goto out_overflow;
310 memcpy(verifier, p, NFS3_COOKIEVERFSIZE);
311 return 0;
312out_overflow:
313 print_overflow_msg(__func__, xdr);
314 return -EIO;
315}
316
317/*
318 * createverf3
319 *
320 * typedef opaque createverf3[NFS3_CREATEVERFSIZE];
321 */
322static void encode_createverf3(struct xdr_stream *xdr, const __be32 *verifier)
173{ 323{
174 unsigned int type, major, minor; 324 __be32 *p;
175 umode_t fmode; 325
326 p = xdr_reserve_space(xdr, NFS3_CREATEVERFSIZE);
327 memcpy(p, verifier, NFS3_CREATEVERFSIZE);
328}
329
330static int decode_writeverf3(struct xdr_stream *xdr, __be32 *verifier)
331{
332 __be32 *p;
333
334 p = xdr_inline_decode(xdr, NFS3_WRITEVERFSIZE);
335 if (unlikely(p == NULL))
336 goto out_overflow;
337 memcpy(verifier, p, NFS3_WRITEVERFSIZE);
338 return 0;
339out_overflow:
340 print_overflow_msg(__func__, xdr);
341 return -EIO;
342}
343
344/*
345 * size3
346 *
347 * typedef uint64 size3;
348 */
349static __be32 *xdr_decode_size3(__be32 *p, u64 *size)
350{
351 return xdr_decode_hyper(p, size);
352}
353
354/*
355 * nfsstat3
356 *
357 * enum nfsstat3 {
358 * NFS3_OK = 0,
359 * ...
360 * }
361 */
362#define NFS3_OK NFS_OK
176 363
177 type = ntohl(*p++); 364static int decode_nfsstat3(struct xdr_stream *xdr, enum nfs_stat *status)
365{
366 __be32 *p;
367
368 p = xdr_inline_decode(xdr, 4);
369 if (unlikely(p == NULL))
370 goto out_overflow;
371 *status = be32_to_cpup(p);
372 return 0;
373out_overflow:
374 print_overflow_msg(__func__, xdr);
375 return -EIO;
376}
377
378/*
379 * ftype3
380 *
381 * enum ftype3 {
382 * NF3REG = 1,
383 * NF3DIR = 2,
384 * NF3BLK = 3,
385 * NF3CHR = 4,
386 * NF3LNK = 5,
387 * NF3SOCK = 6,
388 * NF3FIFO = 7
389 * };
390 */
391static void encode_ftype3(struct xdr_stream *xdr, const u32 type)
392{
393 BUG_ON(type > NF3FIFO);
394 encode_uint32(xdr, type);
395}
396
397static __be32 *xdr_decode_ftype3(__be32 *p, umode_t *mode)
398{
399 u32 type;
400
401 type = be32_to_cpup(p++);
178 if (type > NF3FIFO) 402 if (type > NF3FIFO)
179 type = NF3NON; 403 type = NF3NON;
180 fmode = nfs_type2fmt[type]; 404 *mode = nfs_type2fmt[type];
181 fattr->mode = (ntohl(*p++) & ~S_IFMT) | fmode; 405 return p;
182 fattr->nlink = ntohl(*p++); 406}
183 fattr->uid = ntohl(*p++);
184 fattr->gid = ntohl(*p++);
185 p = xdr_decode_hyper(p, &fattr->size);
186 p = xdr_decode_hyper(p, &fattr->du.nfs3.used);
187
188 /* Turn remote device info into Linux-specific dev_t */
189 major = ntohl(*p++);
190 minor = ntohl(*p++);
191 fattr->rdev = MKDEV(major, minor);
192 if (MAJOR(fattr->rdev) != major || MINOR(fattr->rdev) != minor)
193 fattr->rdev = 0;
194 407
195 p = xdr_decode_hyper(p, &fattr->fsid.major); 408/*
196 fattr->fsid.minor = 0; 409 * specdata3
197 p = xdr_decode_hyper(p, &fattr->fileid); 410 *
198 p = xdr_decode_time3(p, &fattr->atime); 411 * struct specdata3 {
199 p = xdr_decode_time3(p, &fattr->mtime); 412 * uint32 specdata1;
200 p = xdr_decode_time3(p, &fattr->ctime); 413 * uint32 specdata2;
414 * };
415 */
416static void encode_specdata3(struct xdr_stream *xdr, const dev_t rdev)
417{
418 __be32 *p;
201 419
202 /* Update the mode bits */ 420 p = xdr_reserve_space(xdr, 8);
203 fattr->valid |= NFS_ATTR_FATTR_V3; 421 *p++ = cpu_to_be32(MAJOR(rdev));
422 *p = cpu_to_be32(MINOR(rdev));
423}
424
425static __be32 *xdr_decode_specdata3(__be32 *p, dev_t *rdev)
426{
427 unsigned int major, minor;
428
429 major = be32_to_cpup(p++);
430 minor = be32_to_cpup(p++);
431 *rdev = MKDEV(major, minor);
432 if (MAJOR(*rdev) != major || MINOR(*rdev) != minor)
433 *rdev = 0;
434 return p;
435}
436
437/*
438 * nfs_fh3
439 *
440 * struct nfs_fh3 {
441 * opaque data<NFS3_FHSIZE>;
442 * };
443 */
444static void encode_nfs_fh3(struct xdr_stream *xdr, const struct nfs_fh *fh)
445{
446 __be32 *p;
447
448 BUG_ON(fh->size > NFS3_FHSIZE);
449 p = xdr_reserve_space(xdr, 4 + fh->size);
450 xdr_encode_opaque(p, fh->data, fh->size);
451}
452
453static int decode_nfs_fh3(struct xdr_stream *xdr, struct nfs_fh *fh)
454{
455 u32 length;
456 __be32 *p;
457
458 p = xdr_inline_decode(xdr, 4);
459 if (unlikely(p == NULL))
460 goto out_overflow;
461 length = be32_to_cpup(p++);
462 if (unlikely(length > NFS3_FHSIZE))
463 goto out_toobig;
464 p = xdr_inline_decode(xdr, length);
465 if (unlikely(p == NULL))
466 goto out_overflow;
467 fh->size = length;
468 memcpy(fh->data, p, length);
469 return 0;
470out_toobig:
471 dprintk("NFS: file handle size (%u) too big\n", length);
472 return -E2BIG;
473out_overflow:
474 print_overflow_msg(__func__, xdr);
475 return -EIO;
476}
477
478static void zero_nfs_fh3(struct nfs_fh *fh)
479{
480 memset(fh, 0, sizeof(*fh));
481}
482
483/*
484 * nfstime3
485 *
486 * struct nfstime3 {
487 * uint32 seconds;
488 * uint32 nseconds;
489 * };
490 */
491static __be32 *xdr_encode_nfstime3(__be32 *p, const struct timespec *timep)
492{
493 *p++ = cpu_to_be32(timep->tv_sec);
494 *p++ = cpu_to_be32(timep->tv_nsec);
204 return p; 495 return p;
205} 496}
206 497
207static inline __be32 * 498static __be32 *xdr_decode_nfstime3(__be32 *p, struct timespec *timep)
208xdr_encode_sattr(__be32 *p, struct iattr *attr)
209{ 499{
500 timep->tv_sec = be32_to_cpup(p++);
501 timep->tv_nsec = be32_to_cpup(p++);
502 return p;
503}
504
505/*
506 * sattr3
507 *
508 * enum time_how {
509 * DONT_CHANGE = 0,
510 * SET_TO_SERVER_TIME = 1,
511 * SET_TO_CLIENT_TIME = 2
512 * };
513 *
514 * union set_mode3 switch (bool set_it) {
515 * case TRUE:
516 * mode3 mode;
517 * default:
518 * void;
519 * };
520 *
521 * union set_uid3 switch (bool set_it) {
522 * case TRUE:
523 * uid3 uid;
524 * default:
525 * void;
526 * };
527 *
528 * union set_gid3 switch (bool set_it) {
529 * case TRUE:
530 * gid3 gid;
531 * default:
532 * void;
533 * };
534 *
535 * union set_size3 switch (bool set_it) {
536 * case TRUE:
537 * size3 size;
538 * default:
539 * void;
540 * };
541 *
542 * union set_atime switch (time_how set_it) {
543 * case SET_TO_CLIENT_TIME:
544 * nfstime3 atime;
545 * default:
546 * void;
547 * };
548 *
549 * union set_mtime switch (time_how set_it) {
550 * case SET_TO_CLIENT_TIME:
551 * nfstime3 mtime;
552 * default:
553 * void;
554 * };
555 *
556 * struct sattr3 {
557 * set_mode3 mode;
558 * set_uid3 uid;
559 * set_gid3 gid;
560 * set_size3 size;
561 * set_atime atime;
562 * set_mtime mtime;
563 * };
564 */
565static void encode_sattr3(struct xdr_stream *xdr, const struct iattr *attr)
566{
567 u32 nbytes;
568 __be32 *p;
569
570 /*
571 * In order to make only a single xdr_reserve_space() call,
572 * pre-compute the total number of bytes to be reserved.
573 * Six boolean values, one for each set_foo field, are always
574 * present in the encoded result, so start there.
575 */
576 nbytes = 6 * 4;
577 if (attr->ia_valid & ATTR_MODE)
578 nbytes += 4;
579 if (attr->ia_valid & ATTR_UID)
580 nbytes += 4;
581 if (attr->ia_valid & ATTR_GID)
582 nbytes += 4;
583 if (attr->ia_valid & ATTR_SIZE)
584 nbytes += 8;
585 if (attr->ia_valid & ATTR_ATIME_SET)
586 nbytes += 8;
587 if (attr->ia_valid & ATTR_MTIME_SET)
588 nbytes += 8;
589 p = xdr_reserve_space(xdr, nbytes);
590
210 if (attr->ia_valid & ATTR_MODE) { 591 if (attr->ia_valid & ATTR_MODE) {
211 *p++ = xdr_one; 592 *p++ = xdr_one;
212 *p++ = htonl(attr->ia_mode & S_IALLUGO); 593 *p++ = cpu_to_be32(attr->ia_mode & S_IALLUGO);
213 } else { 594 } else
214 *p++ = xdr_zero; 595 *p++ = xdr_zero;
215 } 596
216 if (attr->ia_valid & ATTR_UID) { 597 if (attr->ia_valid & ATTR_UID) {
217 *p++ = xdr_one; 598 *p++ = xdr_one;
218 *p++ = htonl(attr->ia_uid); 599 *p++ = cpu_to_be32(attr->ia_uid);
219 } else { 600 } else
220 *p++ = xdr_zero; 601 *p++ = xdr_zero;
221 } 602
222 if (attr->ia_valid & ATTR_GID) { 603 if (attr->ia_valid & ATTR_GID) {
223 *p++ = xdr_one; 604 *p++ = xdr_one;
224 *p++ = htonl(attr->ia_gid); 605 *p++ = cpu_to_be32(attr->ia_gid);
225 } else { 606 } else
226 *p++ = xdr_zero; 607 *p++ = xdr_zero;
227 } 608
228 if (attr->ia_valid & ATTR_SIZE) { 609 if (attr->ia_valid & ATTR_SIZE) {
229 *p++ = xdr_one; 610 *p++ = xdr_one;
230 p = xdr_encode_hyper(p, (__u64) attr->ia_size); 611 p = xdr_encode_hyper(p, (u64)attr->ia_size);
231 } else { 612 } else
232 *p++ = xdr_zero; 613 *p++ = xdr_zero;
233 } 614
234 if (attr->ia_valid & ATTR_ATIME_SET) { 615 if (attr->ia_valid & ATTR_ATIME_SET) {
235 *p++ = xdr_two; 616 *p++ = xdr_two;
236 p = xdr_encode_time3(p, &attr->ia_atime); 617 p = xdr_encode_nfstime3(p, &attr->ia_atime);
237 } else if (attr->ia_valid & ATTR_ATIME) { 618 } else if (attr->ia_valid & ATTR_ATIME) {
238 *p++ = xdr_one; 619 *p++ = xdr_one;
239 } else { 620 } else
240 *p++ = xdr_zero; 621 *p++ = xdr_zero;
241 } 622
242 if (attr->ia_valid & ATTR_MTIME_SET) { 623 if (attr->ia_valid & ATTR_MTIME_SET) {
243 *p++ = xdr_two; 624 *p++ = xdr_two;
244 p = xdr_encode_time3(p, &attr->ia_mtime); 625 xdr_encode_nfstime3(p, &attr->ia_mtime);
245 } else if (attr->ia_valid & ATTR_MTIME) { 626 } else if (attr->ia_valid & ATTR_MTIME) {
246 *p++ = xdr_one; 627 *p = xdr_one;
247 } else { 628 } else
248 *p++ = xdr_zero; 629 *p = xdr_zero;
249 }
250 return p;
251} 630}
252 631
253static inline __be32 * 632/*
254xdr_decode_wcc_attr(__be32 *p, struct nfs_fattr *fattr) 633 * fattr3
634 *
635 * struct fattr3 {
636 * ftype3 type;
637 * mode3 mode;
638 * uint32 nlink;
639 * uid3 uid;
640 * gid3 gid;
641 * size3 size;
642 * size3 used;
643 * specdata3 rdev;
644 * uint64 fsid;
645 * fileid3 fileid;
646 * nfstime3 atime;
647 * nfstime3 mtime;
648 * nfstime3 ctime;
649 * };
650 */
651static int decode_fattr3(struct xdr_stream *xdr, struct nfs_fattr *fattr)
255{ 652{
256 p = xdr_decode_hyper(p, &fattr->pre_size); 653 umode_t fmode;
257 p = xdr_decode_time3(p, &fattr->pre_mtime); 654 __be32 *p;
258 p = xdr_decode_time3(p, &fattr->pre_ctime); 655
656 p = xdr_inline_decode(xdr, NFS3_fattr_sz << 2);
657 if (unlikely(p == NULL))
658 goto out_overflow;
659
660 p = xdr_decode_ftype3(p, &fmode);
661
662 fattr->mode = (be32_to_cpup(p++) & ~S_IFMT) | fmode;
663 fattr->nlink = be32_to_cpup(p++);
664 fattr->uid = be32_to_cpup(p++);
665 fattr->gid = be32_to_cpup(p++);
666
667 p = xdr_decode_size3(p, &fattr->size);
668 p = xdr_decode_size3(p, &fattr->du.nfs3.used);
669 p = xdr_decode_specdata3(p, &fattr->rdev);
670
671 p = xdr_decode_hyper(p, &fattr->fsid.major);
672 fattr->fsid.minor = 0;
673
674 p = xdr_decode_fileid3(p, &fattr->fileid);
675 p = xdr_decode_nfstime3(p, &fattr->atime);
676 p = xdr_decode_nfstime3(p, &fattr->mtime);
677 xdr_decode_nfstime3(p, &fattr->ctime);
678
679 fattr->valid |= NFS_ATTR_FATTR_V3;
680 return 0;
681out_overflow:
682 print_overflow_msg(__func__, xdr);
683 return -EIO;
684}
685
686/*
687 * post_op_attr
688 *
689 * union post_op_attr switch (bool attributes_follow) {
690 * case TRUE:
691 * fattr3 attributes;
692 * case FALSE:
693 * void;
694 * };
695 */
696static int decode_post_op_attr(struct xdr_stream *xdr, struct nfs_fattr *fattr)
697{
698 __be32 *p;
699
700 p = xdr_inline_decode(xdr, 4);
701 if (unlikely(p == NULL))
702 goto out_overflow;
703 if (*p != xdr_zero)
704 return decode_fattr3(xdr, fattr);
705 return 0;
706out_overflow:
707 print_overflow_msg(__func__, xdr);
708 return -EIO;
709}
710
711/*
712 * wcc_attr
713 * struct wcc_attr {
714 * size3 size;
715 * nfstime3 mtime;
716 * nfstime3 ctime;
717 * };
718 */
719static int decode_wcc_attr(struct xdr_stream *xdr, struct nfs_fattr *fattr)
720{
721 __be32 *p;
722
723 p = xdr_inline_decode(xdr, NFS3_wcc_attr_sz << 2);
724 if (unlikely(p == NULL))
725 goto out_overflow;
726
259 fattr->valid |= NFS_ATTR_FATTR_PRESIZE 727 fattr->valid |= NFS_ATTR_FATTR_PRESIZE
260 | NFS_ATTR_FATTR_PREMTIME 728 | NFS_ATTR_FATTR_PREMTIME
261 | NFS_ATTR_FATTR_PRECTIME; 729 | NFS_ATTR_FATTR_PRECTIME;
262 return p;
263}
264 730
265static inline __be32 * 731 p = xdr_decode_size3(p, &fattr->pre_size);
266xdr_decode_post_op_attr(__be32 *p, struct nfs_fattr *fattr) 732 p = xdr_decode_nfstime3(p, &fattr->pre_mtime);
267{ 733 xdr_decode_nfstime3(p, &fattr->pre_ctime);
268 if (*p++) 734
269 p = xdr_decode_fattr(p, fattr); 735 return 0;
270 return p; 736out_overflow:
737 print_overflow_msg(__func__, xdr);
738 return -EIO;
271} 739}
272 740
273static inline __be32 * 741/*
274xdr_decode_post_op_attr_stream(struct xdr_stream *xdr, struct nfs_fattr *fattr) 742 * pre_op_attr
743 * union pre_op_attr switch (bool attributes_follow) {
744 * case TRUE:
745 * wcc_attr attributes;
746 * case FALSE:
747 * void;
748 * };
749 *
750 * wcc_data
751 *
752 * struct wcc_data {
753 * pre_op_attr before;
754 * post_op_attr after;
755 * };
756 */
757static int decode_pre_op_attr(struct xdr_stream *xdr, struct nfs_fattr *fattr)
275{ 758{
276 __be32 *p; 759 __be32 *p;
277 760
278 p = xdr_inline_decode(xdr, 4); 761 p = xdr_inline_decode(xdr, 4);
279 if (unlikely(!p)) 762 if (unlikely(p == NULL))
280 goto out_overflow; 763 goto out_overflow;
281 if (ntohl(*p++)) { 764 if (*p != xdr_zero)
282 p = xdr_inline_decode(xdr, 84); 765 return decode_wcc_attr(xdr, fattr);
283 if (unlikely(!p)) 766 return 0;
284 goto out_overflow;
285 p = xdr_decode_fattr(p, fattr);
286 }
287 return p;
288out_overflow: 767out_overflow:
289 print_overflow_msg(__func__, xdr); 768 print_overflow_msg(__func__, xdr);
290 return ERR_PTR(-EIO); 769 return -EIO;
291} 770}
292 771
293static inline __be32 * 772static int decode_wcc_data(struct xdr_stream *xdr, struct nfs_fattr *fattr)
294xdr_decode_pre_op_attr(__be32 *p, struct nfs_fattr *fattr)
295{ 773{
296 if (*p++) 774 int error;
297 return xdr_decode_wcc_attr(p, fattr); 775
298 return p; 776 error = decode_pre_op_attr(xdr, fattr);
777 if (unlikely(error))
778 goto out;
779 error = decode_post_op_attr(xdr, fattr);
780out:
781 return error;
299} 782}
300 783
784/*
785 * post_op_fh3
786 *
787 * union post_op_fh3 switch (bool handle_follows) {
788 * case TRUE:
789 * nfs_fh3 handle;
790 * case FALSE:
791 * void;
792 * };
793 */
794static int decode_post_op_fh3(struct xdr_stream *xdr, struct nfs_fh *fh)
795{
796 __be32 *p = xdr_inline_decode(xdr, 4);
797 if (unlikely(p == NULL))
798 goto out_overflow;
799 if (*p != xdr_zero)
800 return decode_nfs_fh3(xdr, fh);
801 zero_nfs_fh3(fh);
802 return 0;
803out_overflow:
804 print_overflow_msg(__func__, xdr);
805 return -EIO;
806}
301 807
302static inline __be32 * 808/*
303xdr_decode_wcc_data(__be32 *p, struct nfs_fattr *fattr) 809 * diropargs3
810 *
811 * struct diropargs3 {
812 * nfs_fh3 dir;
813 * filename3 name;
814 * };
815 */
816static void encode_diropargs3(struct xdr_stream *xdr, const struct nfs_fh *fh,
817 const char *name, u32 length)
304{ 818{
305 p = xdr_decode_pre_op_attr(p, fattr); 819 encode_nfs_fh3(xdr, fh);
306 return xdr_decode_post_op_attr(p, fattr); 820 encode_filename3(xdr, name, length);
307} 821}
308 822
823
309/* 824/*
310 * NFS encode functions 825 * NFSv3 XDR encode functions
826 *
827 * NFSv3 argument types are defined in section 3.3 of RFC 1813:
828 * "NFS Version 3 Protocol Specification".
311 */ 829 */
312 830
313/* 831/*
314 * Encode file handle argument 832 * 3.3.1 GETATTR3args
833 *
834 * struct GETATTR3args {
835 * nfs_fh3 object;
836 * };
315 */ 837 */
316static int 838static void nfs3_xdr_enc_getattr3args(struct rpc_rqst *req,
317nfs3_xdr_fhandle(struct rpc_rqst *req, __be32 *p, struct nfs_fh *fh) 839 struct xdr_stream *xdr,
840 const struct nfs_fh *fh)
318{ 841{
319 p = xdr_encode_fhandle(p, fh); 842 encode_nfs_fh3(xdr, fh);
320 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
321 return 0;
322} 843}
323 844
324/* 845/*
325 * Encode SETATTR arguments 846 * 3.3.2 SETATTR3args
847 *
848 * union sattrguard3 switch (bool check) {
849 * case TRUE:
850 * nfstime3 obj_ctime;
851 * case FALSE:
852 * void;
853 * };
854 *
855 * struct SETATTR3args {
856 * nfs_fh3 object;
857 * sattr3 new_attributes;
858 * sattrguard3 guard;
859 * };
326 */ 860 */
327static int 861static void encode_sattrguard3(struct xdr_stream *xdr,
328nfs3_xdr_sattrargs(struct rpc_rqst *req, __be32 *p, struct nfs3_sattrargs *args) 862 const struct nfs3_sattrargs *args)
329{ 863{
330 p = xdr_encode_fhandle(p, args->fh); 864 __be32 *p;
331 p = xdr_encode_sattr(p, args->sattr); 865
332 *p++ = htonl(args->guard); 866 if (args->guard) {
333 if (args->guard) 867 p = xdr_reserve_space(xdr, 4 + 8);
334 p = xdr_encode_time3(p, &args->guardtime); 868 *p++ = xdr_one;
335 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); 869 xdr_encode_nfstime3(p, &args->guardtime);
336 return 0; 870 } else {
871 p = xdr_reserve_space(xdr, 4);
872 *p = xdr_zero;
873 }
874}
875
876static void nfs3_xdr_enc_setattr3args(struct rpc_rqst *req,
877 struct xdr_stream *xdr,
878 const struct nfs3_sattrargs *args)
879{
880 encode_nfs_fh3(xdr, args->fh);
881 encode_sattr3(xdr, args->sattr);
882 encode_sattrguard3(xdr, args);
337} 883}
338 884
339/* 885/*
340 * Encode directory ops argument 886 * 3.3.3 LOOKUP3args
887 *
888 * struct LOOKUP3args {
889 * diropargs3 what;
890 * };
341 */ 891 */
342static int 892static void nfs3_xdr_enc_lookup3args(struct rpc_rqst *req,
343nfs3_xdr_diropargs(struct rpc_rqst *req, __be32 *p, struct nfs3_diropargs *args) 893 struct xdr_stream *xdr,
894 const struct nfs3_diropargs *args)
344{ 895{
345 p = xdr_encode_fhandle(p, args->fh); 896 encode_diropargs3(xdr, args->fh, args->name, args->len);
346 p = xdr_encode_array(p, args->name, args->len);
347 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
348 return 0;
349} 897}
350 898
351/* 899/*
352 * Encode REMOVE argument 900 * 3.3.4 ACCESS3args
901 *
902 * struct ACCESS3args {
903 * nfs_fh3 object;
904 * uint32 access;
905 * };
353 */ 906 */
354static int 907static void encode_access3args(struct xdr_stream *xdr,
355nfs3_xdr_removeargs(struct rpc_rqst *req, __be32 *p, const struct nfs_removeargs *args) 908 const struct nfs3_accessargs *args)
356{ 909{
357 p = xdr_encode_fhandle(p, args->fh); 910 encode_nfs_fh3(xdr, args->fh);
358 p = xdr_encode_array(p, args->name.name, args->name.len); 911 encode_uint32(xdr, args->access);
359 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); 912}
360 return 0; 913
914static void nfs3_xdr_enc_access3args(struct rpc_rqst *req,
915 struct xdr_stream *xdr,
916 const struct nfs3_accessargs *args)
917{
918 encode_access3args(xdr, args);
361} 919}
362 920
363/* 921/*
364 * Encode access() argument 922 * 3.3.5 READLINK3args
923 *
924 * struct READLINK3args {
925 * nfs_fh3 symlink;
926 * };
365 */ 927 */
366static int 928static void nfs3_xdr_enc_readlink3args(struct rpc_rqst *req,
367nfs3_xdr_accessargs(struct rpc_rqst *req, __be32 *p, struct nfs3_accessargs *args) 929 struct xdr_stream *xdr,
930 const struct nfs3_readlinkargs *args)
368{ 931{
369 p = xdr_encode_fhandle(p, args->fh); 932 encode_nfs_fh3(xdr, args->fh);
370 *p++ = htonl(args->access); 933 prepare_reply_buffer(req, args->pages, args->pgbase,
371 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); 934 args->pglen, NFS3_readlinkres_sz);
372 return 0;
373} 935}
374 936
375/* 937/*
376 * Arguments to a READ call. Since we read data directly into the page 938 * 3.3.6 READ3args
377 * cache, we also set up the reply iovec here so that iov[1] points 939 *
378 * exactly to the page we want to fetch. 940 * struct READ3args {
941 * nfs_fh3 file;
942 * offset3 offset;
943 * count3 count;
944 * };
379 */ 945 */
380static int 946static void encode_read3args(struct xdr_stream *xdr,
381nfs3_xdr_readargs(struct rpc_rqst *req, __be32 *p, struct nfs_readargs *args) 947 const struct nfs_readargs *args)
382{ 948{
383 struct rpc_auth *auth = req->rq_cred->cr_auth; 949 __be32 *p;
384 unsigned int replen; 950
385 u32 count = args->count; 951 encode_nfs_fh3(xdr, args->fh);
386 952
387 p = xdr_encode_fhandle(p, args->fh); 953 p = xdr_reserve_space(xdr, 8 + 4);
388 p = xdr_encode_hyper(p, args->offset); 954 p = xdr_encode_hyper(p, args->offset);
389 *p++ = htonl(count); 955 *p = cpu_to_be32(args->count);
390 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); 956}
391 957
392 /* Inline the page array */ 958static void nfs3_xdr_enc_read3args(struct rpc_rqst *req,
393 replen = (RPC_REPHDRSIZE + auth->au_rslack + NFS3_readres_sz) << 2; 959 struct xdr_stream *xdr,
394 xdr_inline_pages(&req->rq_rcv_buf, replen, 960 const struct nfs_readargs *args)
395 args->pages, args->pgbase, count); 961{
962 encode_read3args(xdr, args);
963 prepare_reply_buffer(req, args->pages, args->pgbase,
964 args->count, NFS3_readres_sz);
396 req->rq_rcv_buf.flags |= XDRBUF_READ; 965 req->rq_rcv_buf.flags |= XDRBUF_READ;
397 return 0;
398} 966}
399 967
400/* 968/*
401 * Write arguments. Splice the buffer to be written into the iovec. 969 * 3.3.7 WRITE3args
970 *
971 * enum stable_how {
972 * UNSTABLE = 0,
973 * DATA_SYNC = 1,
974 * FILE_SYNC = 2
975 * };
976 *
977 * struct WRITE3args {
978 * nfs_fh3 file;
979 * offset3 offset;
980 * count3 count;
981 * stable_how stable;
982 * opaque data<>;
983 * };
402 */ 984 */
403static int 985static void encode_write3args(struct xdr_stream *xdr,
404nfs3_xdr_writeargs(struct rpc_rqst *req, __be32 *p, struct nfs_writeargs *args) 986 const struct nfs_writeargs *args)
405{ 987{
406 struct xdr_buf *sndbuf = &req->rq_snd_buf; 988 __be32 *p;
407 u32 count = args->count; 989
990 encode_nfs_fh3(xdr, args->fh);
408 991
409 p = xdr_encode_fhandle(p, args->fh); 992 p = xdr_reserve_space(xdr, 8 + 4 + 4 + 4);
410 p = xdr_encode_hyper(p, args->offset); 993 p = xdr_encode_hyper(p, args->offset);
411 *p++ = htonl(count); 994 *p++ = cpu_to_be32(args->count);
412 *p++ = htonl(args->stable); 995 *p++ = cpu_to_be32(args->stable);
413 *p++ = htonl(count); 996 *p = cpu_to_be32(args->count);
414 sndbuf->len = xdr_adjust_iovec(sndbuf->head, p); 997 xdr_write_pages(xdr, args->pages, args->pgbase, args->count);
415 998}
416 /* Copy the page array */ 999
417 xdr_encode_pages(sndbuf, args->pages, args->pgbase, count); 1000static void nfs3_xdr_enc_write3args(struct rpc_rqst *req,
418 sndbuf->flags |= XDRBUF_WRITE; 1001 struct xdr_stream *xdr,
419 return 0; 1002 const struct nfs_writeargs *args)
1003{
1004 encode_write3args(xdr, args);
1005 xdr->buf->flags |= XDRBUF_WRITE;
420} 1006}
421 1007
422/* 1008/*
423 * Encode CREATE arguments 1009 * 3.3.8 CREATE3args
1010 *
1011 * enum createmode3 {
1012 * UNCHECKED = 0,
1013 * GUARDED = 1,
1014 * EXCLUSIVE = 2
1015 * };
1016 *
1017 * union createhow3 switch (createmode3 mode) {
1018 * case UNCHECKED:
1019 * case GUARDED:
1020 * sattr3 obj_attributes;
1021 * case EXCLUSIVE:
1022 * createverf3 verf;
1023 * };
1024 *
1025 * struct CREATE3args {
1026 * diropargs3 where;
1027 * createhow3 how;
1028 * };
424 */ 1029 */
425static int 1030static void encode_createhow3(struct xdr_stream *xdr,
426nfs3_xdr_createargs(struct rpc_rqst *req, __be32 *p, struct nfs3_createargs *args) 1031 const struct nfs3_createargs *args)
427{ 1032{
428 p = xdr_encode_fhandle(p, args->fh); 1033 encode_uint32(xdr, args->createmode);
429 p = xdr_encode_array(p, args->name, args->len); 1034 switch (args->createmode) {
430 1035 case NFS3_CREATE_UNCHECKED:
431 *p++ = htonl(args->createmode); 1036 case NFS3_CREATE_GUARDED:
432 if (args->createmode == NFS3_CREATE_EXCLUSIVE) { 1037 encode_sattr3(xdr, args->sattr);
433 *p++ = args->verifier[0]; 1038 break;
434 *p++ = args->verifier[1]; 1039 case NFS3_CREATE_EXCLUSIVE:
435 } else 1040 encode_createverf3(xdr, args->verifier);
436 p = xdr_encode_sattr(p, args->sattr); 1041 break;
1042 default:
1043 BUG();
1044 }
1045}
437 1046
438 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); 1047static void nfs3_xdr_enc_create3args(struct rpc_rqst *req,
439 return 0; 1048 struct xdr_stream *xdr,
1049 const struct nfs3_createargs *args)
1050{
1051 encode_diropargs3(xdr, args->fh, args->name, args->len);
1052 encode_createhow3(xdr, args);
440} 1053}
441 1054
442/* 1055/*
443 * Encode MKDIR arguments 1056 * 3.3.9 MKDIR3args
1057 *
1058 * struct MKDIR3args {
1059 * diropargs3 where;
1060 * sattr3 attributes;
1061 * };
444 */ 1062 */
445static int 1063static void nfs3_xdr_enc_mkdir3args(struct rpc_rqst *req,
446nfs3_xdr_mkdirargs(struct rpc_rqst *req, __be32 *p, struct nfs3_mkdirargs *args) 1064 struct xdr_stream *xdr,
1065 const struct nfs3_mkdirargs *args)
447{ 1066{
448 p = xdr_encode_fhandle(p, args->fh); 1067 encode_diropargs3(xdr, args->fh, args->name, args->len);
449 p = xdr_encode_array(p, args->name, args->len); 1068 encode_sattr3(xdr, args->sattr);
450 p = xdr_encode_sattr(p, args->sattr);
451 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
452 return 0;
453} 1069}
454 1070
455/* 1071/*
456 * Encode SYMLINK arguments 1072 * 3.3.10 SYMLINK3args
1073 *
1074 * struct symlinkdata3 {
1075 * sattr3 symlink_attributes;
1076 * nfspath3 symlink_data;
1077 * };
1078 *
1079 * struct SYMLINK3args {
1080 * diropargs3 where;
1081 * symlinkdata3 symlink;
1082 * };
457 */ 1083 */
458static int 1084static void encode_symlinkdata3(struct xdr_stream *xdr,
459nfs3_xdr_symlinkargs(struct rpc_rqst *req, __be32 *p, struct nfs3_symlinkargs *args) 1085 const struct nfs3_symlinkargs *args)
460{ 1086{
461 p = xdr_encode_fhandle(p, args->fromfh); 1087 encode_sattr3(xdr, args->sattr);
462 p = xdr_encode_array(p, args->fromname, args->fromlen); 1088 encode_nfspath3(xdr, args->pages, args->pathlen);
463 p = xdr_encode_sattr(p, args->sattr); 1089}
464 *p++ = htonl(args->pathlen);
465 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
466 1090
467 /* Copy the page */ 1091static void nfs3_xdr_enc_symlink3args(struct rpc_rqst *req,
468 xdr_encode_pages(&req->rq_snd_buf, args->pages, 0, args->pathlen); 1092 struct xdr_stream *xdr,
469 return 0; 1093 const struct nfs3_symlinkargs *args)
1094{
1095 encode_diropargs3(xdr, args->fromfh, args->fromname, args->fromlen);
1096 encode_symlinkdata3(xdr, args);
470} 1097}
471 1098
472/* 1099/*
473 * Encode MKNOD arguments 1100 * 3.3.11 MKNOD3args
1101 *
1102 * struct devicedata3 {
1103 * sattr3 dev_attributes;
1104 * specdata3 spec;
1105 * };
1106 *
1107 * union mknoddata3 switch (ftype3 type) {
1108 * case NF3CHR:
1109 * case NF3BLK:
1110 * devicedata3 device;
1111 * case NF3SOCK:
1112 * case NF3FIFO:
1113 * sattr3 pipe_attributes;
1114 * default:
1115 * void;
1116 * };
1117 *
1118 * struct MKNOD3args {
1119 * diropargs3 where;
1120 * mknoddata3 what;
1121 * };
474 */ 1122 */
475static int 1123static void encode_devicedata3(struct xdr_stream *xdr,
476nfs3_xdr_mknodargs(struct rpc_rqst *req, __be32 *p, struct nfs3_mknodargs *args) 1124 const struct nfs3_mknodargs *args)
477{ 1125{
478 p = xdr_encode_fhandle(p, args->fh); 1126 encode_sattr3(xdr, args->sattr);
479 p = xdr_encode_array(p, args->name, args->len); 1127 encode_specdata3(xdr, args->rdev);
480 *p++ = htonl(args->type); 1128}
481 p = xdr_encode_sattr(p, args->sattr); 1129
482 if (args->type == NF3CHR || args->type == NF3BLK) { 1130static void encode_mknoddata3(struct xdr_stream *xdr,
483 *p++ = htonl(MAJOR(args->rdev)); 1131 const struct nfs3_mknodargs *args)
484 *p++ = htonl(MINOR(args->rdev)); 1132{
1133 encode_ftype3(xdr, args->type);
1134 switch (args->type) {
1135 case NF3CHR:
1136 case NF3BLK:
1137 encode_devicedata3(xdr, args);
1138 break;
1139 case NF3SOCK:
1140 case NF3FIFO:
1141 encode_sattr3(xdr, args->sattr);
1142 break;
1143 case NF3REG:
1144 case NF3DIR:
1145 break;
1146 default:
1147 BUG();
485 } 1148 }
1149}
486 1150
487 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); 1151static void nfs3_xdr_enc_mknod3args(struct rpc_rqst *req,
488 return 0; 1152 struct xdr_stream *xdr,
1153 const struct nfs3_mknodargs *args)
1154{
1155 encode_diropargs3(xdr, args->fh, args->name, args->len);
1156 encode_mknoddata3(xdr, args);
489} 1157}
490 1158
491/* 1159/*
492 * Encode RENAME arguments 1160 * 3.3.12 REMOVE3args
1161 *
1162 * struct REMOVE3args {
1163 * diropargs3 object;
1164 * };
493 */ 1165 */
494static int 1166static void nfs3_xdr_enc_remove3args(struct rpc_rqst *req,
495nfs3_xdr_renameargs(struct rpc_rqst *req, __be32 *p, struct nfs_renameargs *args) 1167 struct xdr_stream *xdr,
496{ 1168 const struct nfs_removeargs *args)
497 p = xdr_encode_fhandle(p, args->old_dir); 1169{
498 p = xdr_encode_array(p, args->old_name->name, args->old_name->len); 1170 encode_diropargs3(xdr, args->fh, args->name.name, args->name.len);
499 p = xdr_encode_fhandle(p, args->new_dir);
500 p = xdr_encode_array(p, args->new_name->name, args->new_name->len);
501 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
502 return 0;
503} 1171}
504 1172
505/* 1173/*
506 * Encode LINK arguments 1174 * 3.3.14 RENAME3args
1175 *
1176 * struct RENAME3args {
1177 * diropargs3 from;
1178 * diropargs3 to;
1179 * };
507 */ 1180 */
508static int 1181static void nfs3_xdr_enc_rename3args(struct rpc_rqst *req,
509nfs3_xdr_linkargs(struct rpc_rqst *req, __be32 *p, struct nfs3_linkargs *args) 1182 struct xdr_stream *xdr,
1183 const struct nfs_renameargs *args)
510{ 1184{
511 p = xdr_encode_fhandle(p, args->fromfh); 1185 const struct qstr *old = args->old_name;
512 p = xdr_encode_fhandle(p, args->tofh); 1186 const struct qstr *new = args->new_name;
513 p = xdr_encode_array(p, args->toname, args->tolen); 1187
514 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); 1188 encode_diropargs3(xdr, args->old_dir, old->name, old->len);
515 return 0; 1189 encode_diropargs3(xdr, args->new_dir, new->name, new->len);
516} 1190}
517 1191
518/* 1192/*
519 * Encode arguments to readdir call 1193 * 3.3.15 LINK3args
1194 *
1195 * struct LINK3args {
1196 * nfs_fh3 file;
1197 * diropargs3 link;
1198 * };
520 */ 1199 */
521static int 1200static void nfs3_xdr_enc_link3args(struct rpc_rqst *req,
522nfs3_xdr_readdirargs(struct rpc_rqst *req, __be32 *p, struct nfs3_readdirargs *args) 1201 struct xdr_stream *xdr,
1202 const struct nfs3_linkargs *args)
523{ 1203{
524 struct rpc_auth *auth = req->rq_cred->cr_auth; 1204 encode_nfs_fh3(xdr, args->fromfh);
525 unsigned int replen; 1205 encode_diropargs3(xdr, args->tofh, args->toname, args->tolen);
526 u32 count = args->count;
527
528 p = xdr_encode_fhandle(p, args->fh);
529 p = xdr_encode_hyper(p, args->cookie);
530 *p++ = args->verf[0];
531 *p++ = args->verf[1];
532 if (args->plus) {
533 /* readdirplus: need dircount + buffer size.
534 * We just make sure we make dircount big enough */
535 *p++ = htonl(count >> 3);
536 }
537 *p++ = htonl(count);
538 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
539
540 /* Inline the page array */
541 replen = (RPC_REPHDRSIZE + auth->au_rslack + NFS3_readdirres_sz) << 2;
542 xdr_inline_pages(&req->rq_rcv_buf, replen, args->pages, 0, count);
543 return 0;
544} 1206}
545 1207
546/* 1208/*
547 * Decode the result of a readdir call. 1209 * 3.3.16 READDIR3args
548 * We just check for syntactical correctness. 1210 *
1211 * struct READDIR3args {
1212 * nfs_fh3 dir;
1213 * cookie3 cookie;
1214 * cookieverf3 cookieverf;
1215 * count3 count;
1216 * };
549 */ 1217 */
550static int 1218static void encode_readdir3args(struct xdr_stream *xdr,
551nfs3_xdr_readdirres(struct rpc_rqst *req, __be32 *p, struct nfs3_readdirres *res) 1219 const struct nfs3_readdirargs *args)
552{ 1220{
553 struct xdr_buf *rcvbuf = &req->rq_rcv_buf; 1221 __be32 *p;
554 struct kvec *iov = rcvbuf->head;
555 struct page **page;
556 size_t hdrlen;
557 u32 recvd, pglen;
558 int status, nr = 0;
559
560 status = ntohl(*p++);
561 /* Decode post_op_attrs */
562 p = xdr_decode_post_op_attr(p, res->dir_attr);
563 if (status)
564 return nfs_stat_to_errno(status);
565 /* Decode verifier cookie */
566 if (res->verf) {
567 res->verf[0] = *p++;
568 res->verf[1] = *p++;
569 } else {
570 p += 2;
571 }
572 1222
573 hdrlen = (u8 *) p - (u8 *) iov->iov_base; 1223 encode_nfs_fh3(xdr, args->fh);
574 if (iov->iov_len < hdrlen) {
575 dprintk("NFS: READDIR reply header overflowed:"
576 "length %Zu > %Zu\n", hdrlen, iov->iov_len);
577 return -errno_NFSERR_IO;
578 } else if (iov->iov_len != hdrlen) {
579 dprintk("NFS: READDIR header is short. iovec will be shifted.\n");
580 xdr_shift_buf(rcvbuf, iov->iov_len - hdrlen);
581 }
582 1224
583 pglen = rcvbuf->page_len; 1225 p = xdr_reserve_space(xdr, 8 + NFS3_COOKIEVERFSIZE + 4);
584 recvd = rcvbuf->len - hdrlen; 1226 p = xdr_encode_cookie3(p, args->cookie);
585 if (pglen > recvd) 1227 p = xdr_encode_cookieverf3(p, args->verf);
586 pglen = recvd; 1228 *p = cpu_to_be32(args->count);
587 page = rcvbuf->pages; 1229}
588 1230
589 return nr; 1231static void nfs3_xdr_enc_readdir3args(struct rpc_rqst *req,
1232 struct xdr_stream *xdr,
1233 const struct nfs3_readdirargs *args)
1234{
1235 encode_readdir3args(xdr, args);
1236 prepare_reply_buffer(req, args->pages, 0,
1237 args->count, NFS3_readdirres_sz);
590} 1238}
591 1239
592__be32 * 1240/*
593nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, struct nfs_server *server, int plus) 1241 * 3.3.17 READDIRPLUS3args
1242 *
1243 * struct READDIRPLUS3args {
1244 * nfs_fh3 dir;
1245 * cookie3 cookie;
1246 * cookieverf3 cookieverf;
1247 * count3 dircount;
1248 * count3 maxcount;
1249 * };
1250 */
1251static void encode_readdirplus3args(struct xdr_stream *xdr,
1252 const struct nfs3_readdirargs *args)
594{ 1253{
595 __be32 *p; 1254 __be32 *p;
596 struct nfs_entry old = *entry;
597 1255
598 p = xdr_inline_decode(xdr, 4); 1256 encode_nfs_fh3(xdr, args->fh);
599 if (unlikely(!p))
600 goto out_overflow;
601 if (!ntohl(*p++)) {
602 p = xdr_inline_decode(xdr, 4);
603 if (unlikely(!p))
604 goto out_overflow;
605 if (!ntohl(*p++))
606 return ERR_PTR(-EAGAIN);
607 entry->eof = 1;
608 return ERR_PTR(-EBADCOOKIE);
609 }
610 1257
611 p = xdr_inline_decode(xdr, 12); 1258 p = xdr_reserve_space(xdr, 8 + NFS3_COOKIEVERFSIZE + 4 + 4);
612 if (unlikely(!p)) 1259 p = xdr_encode_cookie3(p, args->cookie);
613 goto out_overflow; 1260 p = xdr_encode_cookieverf3(p, args->verf);
614 p = xdr_decode_hyper(p, &entry->ino);
615 entry->len = ntohl(*p++);
616
617 p = xdr_inline_decode(xdr, entry->len + 8);
618 if (unlikely(!p))
619 goto out_overflow;
620 entry->name = (const char *) p;
621 p += XDR_QUADLEN(entry->len);
622 entry->prev_cookie = entry->cookie;
623 p = xdr_decode_hyper(p, &entry->cookie);
624
625 if (plus) {
626 entry->fattr->valid = 0;
627 p = xdr_decode_post_op_attr_stream(xdr, entry->fattr);
628 if (IS_ERR(p))
629 goto out_overflow_exit;
630 /* In fact, a post_op_fh3: */
631 p = xdr_inline_decode(xdr, 4);
632 if (unlikely(!p))
633 goto out_overflow;
634 if (*p++) {
635 p = xdr_decode_fhandle_stream(xdr, entry->fh);
636 if (IS_ERR(p))
637 goto out_overflow_exit;
638 /* Ugh -- server reply was truncated */
639 if (p == NULL) {
640 dprintk("NFS: FH truncated\n");
641 *entry = old;
642 return ERR_PTR(-EAGAIN);
643 }
644 } else
645 memset((u8*)(entry->fh), 0, sizeof(*entry->fh));
646 }
647 1261
648 p = xdr_inline_peek(xdr, 8); 1262 /*
649 if (p != NULL) 1263 * readdirplus: need dircount + buffer size.
650 entry->eof = !p[0] && p[1]; 1264 * We just make sure we make dircount big enough
651 else 1265 */
652 entry->eof = 0; 1266 *p++ = cpu_to_be32(args->count >> 3);
653 1267
654 return p; 1268 *p = cpu_to_be32(args->count);
1269}
655 1270
656out_overflow: 1271static void nfs3_xdr_enc_readdirplus3args(struct rpc_rqst *req,
657 print_overflow_msg(__func__, xdr); 1272 struct xdr_stream *xdr,
658out_overflow_exit: 1273 const struct nfs3_readdirargs *args)
659 return ERR_PTR(-EIO); 1274{
1275 encode_readdirplus3args(xdr, args);
1276 prepare_reply_buffer(req, args->pages, 0,
1277 args->count, NFS3_readdirres_sz);
660} 1278}
661 1279
662/* 1280/*
663 * Encode COMMIT arguments 1281 * 3.3.21 COMMIT3args
1282 *
1283 * struct COMMIT3args {
1284 * nfs_fh3 file;
1285 * offset3 offset;
1286 * count3 count;
1287 * };
664 */ 1288 */
665static int 1289static void encode_commit3args(struct xdr_stream *xdr,
666nfs3_xdr_commitargs(struct rpc_rqst *req, __be32 *p, struct nfs_writeargs *args) 1290 const struct nfs_writeargs *args)
667{ 1291{
668 p = xdr_encode_fhandle(p, args->fh); 1292 __be32 *p;
1293
1294 encode_nfs_fh3(xdr, args->fh);
1295
1296 p = xdr_reserve_space(xdr, 8 + 4);
669 p = xdr_encode_hyper(p, args->offset); 1297 p = xdr_encode_hyper(p, args->offset);
670 *p++ = htonl(args->count); 1298 *p = cpu_to_be32(args->count);
671 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
672 return 0;
673} 1299}
674 1300
675#ifdef CONFIG_NFS_V3_ACL 1301static void nfs3_xdr_enc_commit3args(struct rpc_rqst *req,
676/* 1302 struct xdr_stream *xdr,
677 * Encode GETACL arguments 1303 const struct nfs_writeargs *args)
678 */
679static int
680nfs3_xdr_getaclargs(struct rpc_rqst *req, __be32 *p,
681 struct nfs3_getaclargs *args)
682{ 1304{
683 struct rpc_auth *auth = req->rq_cred->cr_auth; 1305 encode_commit3args(xdr, args);
684 unsigned int replen; 1306}
685 1307
686 p = xdr_encode_fhandle(p, args->fh); 1308#ifdef CONFIG_NFS_V3_ACL
687 *p++ = htonl(args->mask);
688 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
689 1309
690 if (args->mask & (NFS_ACL | NFS_DFACL)) { 1310static void nfs3_xdr_enc_getacl3args(struct rpc_rqst *req,
691 /* Inline the page array */ 1311 struct xdr_stream *xdr,
692 replen = (RPC_REPHDRSIZE + auth->au_rslack + 1312 const struct nfs3_getaclargs *args)
693 ACL3_getaclres_sz) << 2; 1313{
694 xdr_inline_pages(&req->rq_rcv_buf, replen, args->pages, 0, 1314 encode_nfs_fh3(xdr, args->fh);
695 NFSACL_MAXPAGES << PAGE_SHIFT); 1315 encode_uint32(xdr, args->mask);
696 } 1316 if (args->mask & (NFS_ACL | NFS_DFACL))
697 return 0; 1317 prepare_reply_buffer(req, args->pages, 0,
1318 NFSACL_MAXPAGES << PAGE_SHIFT,
1319 ACL3_getaclres_sz);
698} 1320}
699 1321
700/* 1322static void nfs3_xdr_enc_setacl3args(struct rpc_rqst *req,
701 * Encode SETACL arguments 1323 struct xdr_stream *xdr,
702 */ 1324 const struct nfs3_setaclargs *args)
703static int
704nfs3_xdr_setaclargs(struct rpc_rqst *req, __be32 *p,
705 struct nfs3_setaclargs *args)
706{ 1325{
707 struct xdr_buf *buf = &req->rq_snd_buf;
708 unsigned int base; 1326 unsigned int base;
709 int err; 1327 int error;
710 1328
711 p = xdr_encode_fhandle(p, NFS_FH(args->inode)); 1329 encode_nfs_fh3(xdr, NFS_FH(args->inode));
712 *p++ = htonl(args->mask); 1330 encode_uint32(xdr, args->mask);
713 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
714 base = req->rq_slen;
715 1331
1332 base = req->rq_slen;
716 if (args->npages != 0) 1333 if (args->npages != 0)
717 xdr_encode_pages(buf, args->pages, 0, args->len); 1334 xdr_write_pages(xdr, args->pages, 0, args->len);
718 else 1335 else
719 req->rq_slen = xdr_adjust_iovec(req->rq_svec, 1336 xdr_reserve_space(xdr, NFS_ACL_INLINE_BUFSIZE);
720 p + XDR_QUADLEN(args->len));
721 1337
722 err = nfsacl_encode(buf, base, args->inode, 1338 error = nfsacl_encode(xdr->buf, base, args->inode,
723 (args->mask & NFS_ACL) ? 1339 (args->mask & NFS_ACL) ?
724 args->acl_access : NULL, 1, 0); 1340 args->acl_access : NULL, 1, 0);
725 if (err > 0) 1341 BUG_ON(error < 0);
726 err = nfsacl_encode(buf, base + err, args->inode, 1342 error = nfsacl_encode(xdr->buf, base + error, args->inode,
727 (args->mask & NFS_DFACL) ? 1343 (args->mask & NFS_DFACL) ?
728 args->acl_default : NULL, 1, 1344 args->acl_default : NULL, 1,
729 NFS_ACL_DEFAULT); 1345 NFS_ACL_DEFAULT);
730 return (err > 0) ? 0 : err; 1346 BUG_ON(error < 0);
731} 1347}
1348
732#endif /* CONFIG_NFS_V3_ACL */ 1349#endif /* CONFIG_NFS_V3_ACL */
733 1350
734/* 1351/*
735 * NFS XDR decode functions 1352 * NFSv3 XDR decode functions
1353 *
1354 * NFSv3 result types are defined in section 3.3 of RFC 1813:
1355 * "NFS Version 3 Protocol Specification".
736 */ 1356 */
737 1357
738/* 1358/*
739 * Decode attrstat reply. 1359 * 3.3.1 GETATTR3res
1360 *
1361 * struct GETATTR3resok {
1362 * fattr3 obj_attributes;
1363 * };
1364 *
1365 * union GETATTR3res switch (nfsstat3 status) {
1366 * case NFS3_OK:
1367 * GETATTR3resok resok;
1368 * default:
1369 * void;
1370 * };
740 */ 1371 */
741static int 1372static int nfs3_xdr_dec_getattr3res(struct rpc_rqst *req,
742nfs3_xdr_attrstat(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr) 1373 struct xdr_stream *xdr,
1374 struct nfs_fattr *result)
743{ 1375{
744 int status; 1376 enum nfs_stat status;
745 1377 int error;
746 if ((status = ntohl(*p++))) 1378
747 return nfs_stat_to_errno(status); 1379 error = decode_nfsstat3(xdr, &status);
748 xdr_decode_fattr(p, fattr); 1380 if (unlikely(error))
749 return 0; 1381 goto out;
1382 if (status != NFS3_OK)
1383 goto out_default;
1384 error = decode_fattr3(xdr, result);
1385out:
1386 return error;
1387out_default:
1388 return nfs_stat_to_errno(status);
750} 1389}
751 1390
752/* 1391/*
753 * Decode status+wcc_data reply 1392 * 3.3.2 SETATTR3res
754 * SATTR, REMOVE, RMDIR 1393 *
1394 * struct SETATTR3resok {
1395 * wcc_data obj_wcc;
1396 * };
1397 *
1398 * struct SETATTR3resfail {
1399 * wcc_data obj_wcc;
1400 * };
1401 *
1402 * union SETATTR3res switch (nfsstat3 status) {
1403 * case NFS3_OK:
1404 * SETATTR3resok resok;
1405 * default:
1406 * SETATTR3resfail resfail;
1407 * };
755 */ 1408 */
756static int 1409static int nfs3_xdr_dec_setattr3res(struct rpc_rqst *req,
757nfs3_xdr_wccstat(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr) 1410 struct xdr_stream *xdr,
1411 struct nfs_fattr *result)
758{ 1412{
759 int status; 1413 enum nfs_stat status;
760 1414 int error;
761 if ((status = ntohl(*p++))) 1415
762 status = nfs_stat_to_errno(status); 1416 error = decode_nfsstat3(xdr, &status);
763 xdr_decode_wcc_data(p, fattr); 1417 if (unlikely(error))
764 return status; 1418 goto out;
1419 error = decode_wcc_data(xdr, result);
1420 if (unlikely(error))
1421 goto out;
1422 if (status != NFS3_OK)
1423 goto out_status;
1424out:
1425 return error;
1426out_status:
1427 return nfs_stat_to_errno(status);
765} 1428}
766 1429
767static int 1430/*
768nfs3_xdr_removeres(struct rpc_rqst *req, __be32 *p, struct nfs_removeres *res) 1431 * 3.3.3 LOOKUP3res
1432 *
1433 * struct LOOKUP3resok {
1434 * nfs_fh3 object;
1435 * post_op_attr obj_attributes;
1436 * post_op_attr dir_attributes;
1437 * };
1438 *
1439 * struct LOOKUP3resfail {
1440 * post_op_attr dir_attributes;
1441 * };
1442 *
1443 * union LOOKUP3res switch (nfsstat3 status) {
1444 * case NFS3_OK:
1445 * LOOKUP3resok resok;
1446 * default:
1447 * LOOKUP3resfail resfail;
1448 * };
1449 */
1450static int nfs3_xdr_dec_lookup3res(struct rpc_rqst *req,
1451 struct xdr_stream *xdr,
1452 struct nfs3_diropres *result)
769{ 1453{
770 return nfs3_xdr_wccstat(req, p, res->dir_attr); 1454 enum nfs_stat status;
1455 int error;
1456
1457 error = decode_nfsstat3(xdr, &status);
1458 if (unlikely(error))
1459 goto out;
1460 if (status != NFS3_OK)
1461 goto out_default;
1462 error = decode_nfs_fh3(xdr, result->fh);
1463 if (unlikely(error))
1464 goto out;
1465 error = decode_post_op_attr(xdr, result->fattr);
1466 if (unlikely(error))
1467 goto out;
1468 error = decode_post_op_attr(xdr, result->dir_attr);
1469out:
1470 return error;
1471out_default:
1472 error = decode_post_op_attr(xdr, result->dir_attr);
1473 if (unlikely(error))
1474 goto out;
1475 return nfs_stat_to_errno(status);
771} 1476}
772 1477
773/* 1478/*
774 * Decode LOOKUP reply 1479 * 3.3.4 ACCESS3res
1480 *
1481 * struct ACCESS3resok {
1482 * post_op_attr obj_attributes;
1483 * uint32 access;
1484 * };
1485 *
1486 * struct ACCESS3resfail {
1487 * post_op_attr obj_attributes;
1488 * };
1489 *
1490 * union ACCESS3res switch (nfsstat3 status) {
1491 * case NFS3_OK:
1492 * ACCESS3resok resok;
1493 * default:
1494 * ACCESS3resfail resfail;
1495 * };
775 */ 1496 */
776static int 1497static int nfs3_xdr_dec_access3res(struct rpc_rqst *req,
777nfs3_xdr_lookupres(struct rpc_rqst *req, __be32 *p, struct nfs3_diropres *res) 1498 struct xdr_stream *xdr,
1499 struct nfs3_accessres *result)
778{ 1500{
779 int status; 1501 enum nfs_stat status;
780 1502 int error;
781 if ((status = ntohl(*p++))) { 1503
782 status = nfs_stat_to_errno(status); 1504 error = decode_nfsstat3(xdr, &status);
783 } else { 1505 if (unlikely(error))
784 if (!(p = xdr_decode_fhandle(p, res->fh))) 1506 goto out;
785 return -errno_NFSERR_IO; 1507 error = decode_post_op_attr(xdr, result->fattr);
786 p = xdr_decode_post_op_attr(p, res->fattr); 1508 if (unlikely(error))
787 } 1509 goto out;
788 xdr_decode_post_op_attr(p, res->dir_attr); 1510 if (status != NFS3_OK)
789 return status; 1511 goto out_default;
1512 error = decode_uint32(xdr, &result->access);
1513out:
1514 return error;
1515out_default:
1516 return nfs_stat_to_errno(status);
790} 1517}
791 1518
792/* 1519/*
793 * Decode ACCESS reply 1520 * 3.3.5 READLINK3res
1521 *
1522 * struct READLINK3resok {
1523 * post_op_attr symlink_attributes;
1524 * nfspath3 data;
1525 * };
1526 *
1527 * struct READLINK3resfail {
1528 * post_op_attr symlink_attributes;
1529 * };
1530 *
1531 * union READLINK3res switch (nfsstat3 status) {
1532 * case NFS3_OK:
1533 * READLINK3resok resok;
1534 * default:
1535 * READLINK3resfail resfail;
1536 * };
794 */ 1537 */
795static int 1538static int nfs3_xdr_dec_readlink3res(struct rpc_rqst *req,
796nfs3_xdr_accessres(struct rpc_rqst *req, __be32 *p, struct nfs3_accessres *res) 1539 struct xdr_stream *xdr,
1540 struct nfs_fattr *result)
797{ 1541{
798 int status = ntohl(*p++); 1542 enum nfs_stat status;
799 1543 int error;
800 p = xdr_decode_post_op_attr(p, res->fattr); 1544
801 if (status) 1545 error = decode_nfsstat3(xdr, &status);
802 return nfs_stat_to_errno(status); 1546 if (unlikely(error))
803 res->access = ntohl(*p++); 1547 goto out;
804 return 0; 1548 error = decode_post_op_attr(xdr, result);
1549 if (unlikely(error))
1550 goto out;
1551 if (status != NFS3_OK)
1552 goto out_default;
1553 error = decode_nfspath3(xdr);
1554out:
1555 return error;
1556out_default:
1557 return nfs_stat_to_errno(status);
805} 1558}
806 1559
807static int 1560/*
808nfs3_xdr_readlinkargs(struct rpc_rqst *req, __be32 *p, struct nfs3_readlinkargs *args) 1561 * 3.3.6 READ3res
1562 *
1563 * struct READ3resok {
1564 * post_op_attr file_attributes;
1565 * count3 count;
1566 * bool eof;
1567 * opaque data<>;
1568 * };
1569 *
1570 * struct READ3resfail {
1571 * post_op_attr file_attributes;
1572 * };
1573 *
1574 * union READ3res switch (nfsstat3 status) {
1575 * case NFS3_OK:
1576 * READ3resok resok;
1577 * default:
1578 * READ3resfail resfail;
1579 * };
1580 */
1581static int decode_read3resok(struct xdr_stream *xdr,
1582 struct nfs_readres *result)
809{ 1583{
810 struct rpc_auth *auth = req->rq_cred->cr_auth; 1584 u32 eof, count, ocount, recvd;
811 unsigned int replen; 1585 size_t hdrlen;
1586 __be32 *p;
812 1587
813 p = xdr_encode_fhandle(p, args->fh); 1588 p = xdr_inline_decode(xdr, 4 + 4 + 4);
814 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); 1589 if (unlikely(p == NULL))
1590 goto out_overflow;
1591 count = be32_to_cpup(p++);
1592 eof = be32_to_cpup(p++);
1593 ocount = be32_to_cpup(p++);
1594 if (unlikely(ocount != count))
1595 goto out_mismatch;
1596 hdrlen = (u8 *)xdr->p - (u8 *)xdr->iov->iov_base;
1597 recvd = xdr->buf->len - hdrlen;
1598 if (unlikely(count > recvd))
1599 goto out_cheating;
1600
1601out:
1602 xdr_read_pages(xdr, count);
1603 result->eof = eof;
1604 result->count = count;
1605 return count;
1606out_mismatch:
1607 dprintk("NFS: READ count doesn't match length of opaque: "
1608 "count %u != ocount %u\n", count, ocount);
1609 return -EIO;
1610out_cheating:
1611 dprintk("NFS: server cheating in read result: "
1612 "count %u > recvd %u\n", count, recvd);
1613 count = recvd;
1614 eof = 0;
1615 goto out;
1616out_overflow:
1617 print_overflow_msg(__func__, xdr);
1618 return -EIO;
1619}
815 1620
816 /* Inline the page array */ 1621static int nfs3_xdr_dec_read3res(struct rpc_rqst *req, struct xdr_stream *xdr,
817 replen = (RPC_REPHDRSIZE + auth->au_rslack + NFS3_readlinkres_sz) << 2; 1622 struct nfs_readres *result)
818 xdr_inline_pages(&req->rq_rcv_buf, replen, args->pages, args->pgbase, args->pglen); 1623{
819 return 0; 1624 enum nfs_stat status;
1625 int error;
1626
1627 error = decode_nfsstat3(xdr, &status);
1628 if (unlikely(error))
1629 goto out;
1630 error = decode_post_op_attr(xdr, result->fattr);
1631 if (unlikely(error))
1632 goto out;
1633 if (status != NFS3_OK)
1634 goto out_status;
1635 error = decode_read3resok(xdr, result);
1636out:
1637 return error;
1638out_status:
1639 return nfs_stat_to_errno(status);
820} 1640}
821 1641
822/* 1642/*
823 * Decode READLINK reply 1643 * 3.3.7 WRITE3res
1644 *
1645 * enum stable_how {
1646 * UNSTABLE = 0,
1647 * DATA_SYNC = 1,
1648 * FILE_SYNC = 2
1649 * };
1650 *
1651 * struct WRITE3resok {
1652 * wcc_data file_wcc;
1653 * count3 count;
1654 * stable_how committed;
1655 * writeverf3 verf;
1656 * };
1657 *
1658 * struct WRITE3resfail {
1659 * wcc_data file_wcc;
1660 * };
1661 *
1662 * union WRITE3res switch (nfsstat3 status) {
1663 * case NFS3_OK:
1664 * WRITE3resok resok;
1665 * default:
1666 * WRITE3resfail resfail;
1667 * };
824 */ 1668 */
825static int 1669static int decode_write3resok(struct xdr_stream *xdr,
826nfs3_xdr_readlinkres(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr) 1670 struct nfs_writeres *result)
827{ 1671{
828 struct xdr_buf *rcvbuf = &req->rq_rcv_buf; 1672 __be32 *p;
829 struct kvec *iov = rcvbuf->head;
830 size_t hdrlen;
831 u32 len, recvd;
832 int status;
833
834 status = ntohl(*p++);
835 p = xdr_decode_post_op_attr(p, fattr);
836
837 if (status != 0)
838 return nfs_stat_to_errno(status);
839
840 /* Convert length of symlink */
841 len = ntohl(*p++);
842 if (len >= rcvbuf->page_len) {
843 dprintk("nfs: server returned giant symlink!\n");
844 return -ENAMETOOLONG;
845 }
846 1673
847 hdrlen = (u8 *) p - (u8 *) iov->iov_base; 1674 p = xdr_inline_decode(xdr, 4 + 4 + NFS3_WRITEVERFSIZE);
848 if (iov->iov_len < hdrlen) { 1675 if (unlikely(p == NULL))
849 dprintk("NFS: READLINK reply header overflowed:" 1676 goto out_overflow;
850 "length %Zu > %Zu\n", hdrlen, iov->iov_len); 1677 result->count = be32_to_cpup(p++);
851 return -errno_NFSERR_IO; 1678 result->verf->committed = be32_to_cpup(p++);
852 } else if (iov->iov_len != hdrlen) { 1679 if (unlikely(result->verf->committed > NFS_FILE_SYNC))
853 dprintk("NFS: READLINK header is short. " 1680 goto out_badvalue;
854 "iovec will be shifted.\n"); 1681 memcpy(result->verf->verifier, p, NFS3_WRITEVERFSIZE);
855 xdr_shift_buf(rcvbuf, iov->iov_len - hdrlen); 1682 return result->count;
856 } 1683out_badvalue:
857 recvd = req->rq_rcv_buf.len - hdrlen; 1684 dprintk("NFS: bad stable_how value: %u\n", result->verf->committed);
858 if (recvd < len) { 1685 return -EIO;
859 dprintk("NFS: server cheating in readlink reply: " 1686out_overflow:
860 "count %u > recvd %u\n", len, recvd); 1687 print_overflow_msg(__func__, xdr);
861 return -EIO; 1688 return -EIO;
862 } 1689}
863 1690
864 xdr_terminate_string(rcvbuf, len); 1691static int nfs3_xdr_dec_write3res(struct rpc_rqst *req, struct xdr_stream *xdr,
865 return 0; 1692 struct nfs_writeres *result)
1693{
1694 enum nfs_stat status;
1695 int error;
1696
1697 error = decode_nfsstat3(xdr, &status);
1698 if (unlikely(error))
1699 goto out;
1700 error = decode_wcc_data(xdr, result->fattr);
1701 if (unlikely(error))
1702 goto out;
1703 if (status != NFS3_OK)
1704 goto out_status;
1705 error = decode_write3resok(xdr, result);
1706out:
1707 return error;
1708out_status:
1709 return nfs_stat_to_errno(status);
866} 1710}
867 1711
868/* 1712/*
869 * Decode READ reply 1713 * 3.3.8 CREATE3res
1714 *
1715 * struct CREATE3resok {
1716 * post_op_fh3 obj;
1717 * post_op_attr obj_attributes;
1718 * wcc_data dir_wcc;
1719 * };
1720 *
1721 * struct CREATE3resfail {
1722 * wcc_data dir_wcc;
1723 * };
1724 *
1725 * union CREATE3res switch (nfsstat3 status) {
1726 * case NFS3_OK:
1727 * CREATE3resok resok;
1728 * default:
1729 * CREATE3resfail resfail;
1730 * };
870 */ 1731 */
871static int 1732static int decode_create3resok(struct xdr_stream *xdr,
872nfs3_xdr_readres(struct rpc_rqst *req, __be32 *p, struct nfs_readres *res) 1733 struct nfs3_diropres *result)
873{ 1734{
874 struct kvec *iov = req->rq_rcv_buf.head; 1735 int error;
875 size_t hdrlen; 1736
876 u32 count, ocount, recvd; 1737 error = decode_post_op_fh3(xdr, result->fh);
877 int status; 1738 if (unlikely(error))
1739 goto out;
1740 error = decode_post_op_attr(xdr, result->fattr);
1741 if (unlikely(error))
1742 goto out;
1743 /* The server isn't required to return a file handle.
1744 * If it didn't, force the client to perform a LOOKUP
1745 * to determine the correct file handle and attribute
1746 * values for the new object. */
1747 if (result->fh->size == 0)
1748 result->fattr->valid = 0;
1749 error = decode_wcc_data(xdr, result->dir_attr);
1750out:
1751 return error;
1752}
878 1753
879 status = ntohl(*p++); 1754static int nfs3_xdr_dec_create3res(struct rpc_rqst *req,
880 p = xdr_decode_post_op_attr(p, res->fattr); 1755 struct xdr_stream *xdr,
1756 struct nfs3_diropres *result)
1757{
1758 enum nfs_stat status;
1759 int error;
1760
1761 error = decode_nfsstat3(xdr, &status);
1762 if (unlikely(error))
1763 goto out;
1764 if (status != NFS3_OK)
1765 goto out_default;
1766 error = decode_create3resok(xdr, result);
1767out:
1768 return error;
1769out_default:
1770 error = decode_wcc_data(xdr, result->dir_attr);
1771 if (unlikely(error))
1772 goto out;
1773 return nfs_stat_to_errno(status);
1774}
881 1775
882 if (status != 0) 1776/*
883 return nfs_stat_to_errno(status); 1777 * 3.3.12 REMOVE3res
1778 *
1779 * struct REMOVE3resok {
1780 * wcc_data dir_wcc;
1781 * };
1782 *
1783 * struct REMOVE3resfail {
1784 * wcc_data dir_wcc;
1785 * };
1786 *
1787 * union REMOVE3res switch (nfsstat3 status) {
1788 * case NFS3_OK:
1789 * REMOVE3resok resok;
1790 * default:
1791 * REMOVE3resfail resfail;
1792 * };
1793 */
1794static int nfs3_xdr_dec_remove3res(struct rpc_rqst *req,
1795 struct xdr_stream *xdr,
1796 struct nfs_removeres *result)
1797{
1798 enum nfs_stat status;
1799 int error;
1800
1801 error = decode_nfsstat3(xdr, &status);
1802 if (unlikely(error))
1803 goto out;
1804 error = decode_wcc_data(xdr, result->dir_attr);
1805 if (unlikely(error))
1806 goto out;
1807 if (status != NFS3_OK)
1808 goto out_status;
1809out:
1810 return error;
1811out_status:
1812 return nfs_stat_to_errno(status);
1813}
884 1814
885 /* Decode reply count and EOF flag. NFSv3 is somewhat redundant 1815/*
886 * in that it puts the count both in the res struct and in the 1816 * 3.3.14 RENAME3res
887 * opaque data count. */ 1817 *
888 count = ntohl(*p++); 1818 * struct RENAME3resok {
889 res->eof = ntohl(*p++); 1819 * wcc_data fromdir_wcc;
890 ocount = ntohl(*p++); 1820 * wcc_data todir_wcc;
1821 * };
1822 *
1823 * struct RENAME3resfail {
1824 * wcc_data fromdir_wcc;
1825 * wcc_data todir_wcc;
1826 * };
1827 *
1828 * union RENAME3res switch (nfsstat3 status) {
1829 * case NFS3_OK:
1830 * RENAME3resok resok;
1831 * default:
1832 * RENAME3resfail resfail;
1833 * };
1834 */
1835static int nfs3_xdr_dec_rename3res(struct rpc_rqst *req,
1836 struct xdr_stream *xdr,
1837 struct nfs_renameres *result)
1838{
1839 enum nfs_stat status;
1840 int error;
1841
1842 error = decode_nfsstat3(xdr, &status);
1843 if (unlikely(error))
1844 goto out;
1845 error = decode_wcc_data(xdr, result->old_fattr);
1846 if (unlikely(error))
1847 goto out;
1848 error = decode_wcc_data(xdr, result->new_fattr);
1849 if (unlikely(error))
1850 goto out;
1851 if (status != NFS3_OK)
1852 goto out_status;
1853out:
1854 return error;
1855out_status:
1856 return nfs_stat_to_errno(status);
1857}
891 1858
892 if (ocount != count) { 1859/*
893 dprintk("NFS: READ count doesn't match RPC opaque count.\n"); 1860 * 3.3.15 LINK3res
894 return -errno_NFSERR_IO; 1861 *
895 } 1862 * struct LINK3resok {
1863 * post_op_attr file_attributes;
1864 * wcc_data linkdir_wcc;
1865 * };
1866 *
1867 * struct LINK3resfail {
1868 * post_op_attr file_attributes;
1869 * wcc_data linkdir_wcc;
1870 * };
1871 *
1872 * union LINK3res switch (nfsstat3 status) {
1873 * case NFS3_OK:
1874 * LINK3resok resok;
1875 * default:
1876 * LINK3resfail resfail;
1877 * };
1878 */
1879static int nfs3_xdr_dec_link3res(struct rpc_rqst *req, struct xdr_stream *xdr,
1880 struct nfs3_linkres *result)
1881{
1882 enum nfs_stat status;
1883 int error;
1884
1885 error = decode_nfsstat3(xdr, &status);
1886 if (unlikely(error))
1887 goto out;
1888 error = decode_post_op_attr(xdr, result->fattr);
1889 if (unlikely(error))
1890 goto out;
1891 error = decode_wcc_data(xdr, result->dir_attr);
1892 if (unlikely(error))
1893 goto out;
1894 if (status != NFS3_OK)
1895 goto out_status;
1896out:
1897 return error;
1898out_status:
1899 return nfs_stat_to_errno(status);
1900}
896 1901
897 hdrlen = (u8 *) p - (u8 *) iov->iov_base; 1902/**
898 if (iov->iov_len < hdrlen) { 1903 * nfs3_decode_dirent - Decode a single NFSv3 directory entry stored in
899 dprintk("NFS: READ reply header overflowed:" 1904 * the local page cache
900 "length %Zu > %Zu\n", hdrlen, iov->iov_len); 1905 * @xdr: XDR stream where entry resides
901 return -errno_NFSERR_IO; 1906 * @entry: buffer to fill in with entry data
902 } else if (iov->iov_len != hdrlen) { 1907 * @plus: boolean indicating whether this should be a readdirplus entry
903 dprintk("NFS: READ header is short. iovec will be shifted.\n"); 1908 *
904 xdr_shift_buf(&req->rq_rcv_buf, iov->iov_len - hdrlen); 1909 * Returns zero if successful, otherwise a negative errno value is
905 } 1910 * returned.
1911 *
1912 * This function is not invoked during READDIR reply decoding, but
1913 * rather whenever an application invokes the getdents(2) system call
1914 * on a directory already in our cache.
1915 *
1916 * 3.3.16 entry3
1917 *
1918 * struct entry3 {
1919 * fileid3 fileid;
1920 * filename3 name;
1921 * cookie3 cookie;
1922 * fhandle3 filehandle;
1923 * post_op_attr3 attributes;
1924 * entry3 *nextentry;
1925 * };
1926 *
1927 * 3.3.17 entryplus3
1928 * struct entryplus3 {
1929 * fileid3 fileid;
1930 * filename3 name;
1931 * cookie3 cookie;
1932 * post_op_attr name_attributes;
1933 * post_op_fh3 name_handle;
1934 * entryplus3 *nextentry;
1935 * };
1936 */
1937int nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
1938 int plus)
1939{
1940 struct nfs_entry old = *entry;
1941 __be32 *p;
1942 int error;
906 1943
907 recvd = req->rq_rcv_buf.len - hdrlen; 1944 p = xdr_inline_decode(xdr, 4);
908 if (count > recvd) { 1945 if (unlikely(p == NULL))
909 dprintk("NFS: server cheating in read reply: " 1946 goto out_overflow;
910 "count %u > recvd %u\n", count, recvd); 1947 if (*p == xdr_zero) {
911 count = recvd; 1948 p = xdr_inline_decode(xdr, 4);
912 res->eof = 0; 1949 if (unlikely(p == NULL))
1950 goto out_overflow;
1951 if (*p == xdr_zero)
1952 return -EAGAIN;
1953 entry->eof = 1;
1954 return -EBADCOOKIE;
913 } 1955 }
914 1956
915 if (count < res->count) 1957 error = decode_fileid3(xdr, &entry->ino);
916 res->count = count; 1958 if (unlikely(error))
1959 return error;
917 1960
918 return count; 1961 error = decode_inline_filename3(xdr, &entry->name, &entry->len);
919} 1962 if (unlikely(error))
1963 return error;
920 1964
921/* 1965 entry->prev_cookie = entry->cookie;
922 * Decode WRITE response 1966 error = decode_cookie3(xdr, &entry->cookie);
923 */ 1967 if (unlikely(error))
924static int 1968 return error;
925nfs3_xdr_writeres(struct rpc_rqst *req, __be32 *p, struct nfs_writeres *res)
926{
927 int status;
928 1969
929 status = ntohl(*p++); 1970 entry->d_type = DT_UNKNOWN;
930 p = xdr_decode_wcc_data(p, res->fattr);
931 1971
932 if (status != 0) 1972 if (plus) {
933 return nfs_stat_to_errno(status); 1973 entry->fattr->valid = 0;
1974 error = decode_post_op_attr(xdr, entry->fattr);
1975 if (unlikely(error))
1976 return error;
1977 if (entry->fattr->valid & NFS_ATTR_FATTR_V3)
1978 entry->d_type = nfs_umode_to_dtype(entry->fattr->mode);
934 1979
935 res->count = ntohl(*p++); 1980 /* In fact, a post_op_fh3: */
936 res->verf->committed = (enum nfs3_stable_how)ntohl(*p++); 1981 p = xdr_inline_decode(xdr, 4);
937 res->verf->verifier[0] = *p++; 1982 if (unlikely(p == NULL))
938 res->verf->verifier[1] = *p++; 1983 goto out_overflow;
1984 if (*p != xdr_zero) {
1985 error = decode_nfs_fh3(xdr, entry->fh);
1986 if (unlikely(error)) {
1987 if (error == -E2BIG)
1988 goto out_truncated;
1989 return error;
1990 }
1991 } else
1992 zero_nfs_fh3(entry->fh);
1993 }
939 1994
940 return res->count; 1995 return 0;
941}
942 1996
943/* 1997out_overflow:
944 * Decode a CREATE response 1998 print_overflow_msg(__func__, xdr);
945 */ 1999 return -EAGAIN;
946static int 2000out_truncated:
947nfs3_xdr_createres(struct rpc_rqst *req, __be32 *p, struct nfs3_diropres *res) 2001 dprintk("NFS: directory entry contains invalid file handle\n");
948{ 2002 *entry = old;
949 int status; 2003 return -EAGAIN;
950
951 status = ntohl(*p++);
952 if (status == 0) {
953 if (*p++) {
954 if (!(p = xdr_decode_fhandle(p, res->fh)))
955 return -errno_NFSERR_IO;
956 p = xdr_decode_post_op_attr(p, res->fattr);
957 } else {
958 memset(res->fh, 0, sizeof(*res->fh));
959 /* Do decode post_op_attr but set it to NULL */
960 p = xdr_decode_post_op_attr(p, res->fattr);
961 res->fattr->valid = 0;
962 }
963 } else {
964 status = nfs_stat_to_errno(status);
965 }
966 p = xdr_decode_wcc_data(p, res->dir_attr);
967 return status;
968} 2004}
969 2005
970/* 2006/*
971 * Decode RENAME reply 2007 * 3.3.16 READDIR3res
2008 *
2009 * struct dirlist3 {
2010 * entry3 *entries;
2011 * bool eof;
2012 * };
2013 *
2014 * struct READDIR3resok {
2015 * post_op_attr dir_attributes;
2016 * cookieverf3 cookieverf;
2017 * dirlist3 reply;
2018 * };
2019 *
2020 * struct READDIR3resfail {
2021 * post_op_attr dir_attributes;
2022 * };
2023 *
2024 * union READDIR3res switch (nfsstat3 status) {
2025 * case NFS3_OK:
2026 * READDIR3resok resok;
2027 * default:
2028 * READDIR3resfail resfail;
2029 * };
2030 *
2031 * Read the directory contents into the page cache, but otherwise
2032 * don't touch them. The actual decoding is done by nfs3_decode_entry()
2033 * during subsequent nfs_readdir() calls.
972 */ 2034 */
973static int 2035static int decode_dirlist3(struct xdr_stream *xdr)
974nfs3_xdr_renameres(struct rpc_rqst *req, __be32 *p, struct nfs_renameres *res)
975{ 2036{
976 int status; 2037 u32 recvd, pglen;
2038 size_t hdrlen;
977 2039
978 if ((status = ntohl(*p++)) != 0) 2040 pglen = xdr->buf->page_len;
979 status = nfs_stat_to_errno(status); 2041 hdrlen = (u8 *)xdr->p - (u8 *)xdr->iov->iov_base;
980 p = xdr_decode_wcc_data(p, res->old_fattr); 2042 recvd = xdr->buf->len - hdrlen;
981 p = xdr_decode_wcc_data(p, res->new_fattr); 2043 if (unlikely(pglen > recvd))
982 return status; 2044 goto out_cheating;
2045out:
2046 xdr_read_pages(xdr, pglen);
2047 return pglen;
2048out_cheating:
2049 dprintk("NFS: server cheating in readdir result: "
2050 "pglen %u > recvd %u\n", pglen, recvd);
2051 pglen = recvd;
2052 goto out;
983} 2053}
984 2054
985/* 2055static int decode_readdir3resok(struct xdr_stream *xdr,
986 * Decode LINK reply 2056 struct nfs3_readdirres *result)
987 */
988static int
989nfs3_xdr_linkres(struct rpc_rqst *req, __be32 *p, struct nfs3_linkres *res)
990{ 2057{
991 int status; 2058 int error;
2059
2060 error = decode_post_op_attr(xdr, result->dir_attr);
2061 if (unlikely(error))
2062 goto out;
2063 /* XXX: do we need to check if result->verf != NULL ? */
2064 error = decode_cookieverf3(xdr, result->verf);
2065 if (unlikely(error))
2066 goto out;
2067 error = decode_dirlist3(xdr);
2068out:
2069 return error;
2070}
992 2071
993 if ((status = ntohl(*p++)) != 0) 2072static int nfs3_xdr_dec_readdir3res(struct rpc_rqst *req,
994 status = nfs_stat_to_errno(status); 2073 struct xdr_stream *xdr,
995 p = xdr_decode_post_op_attr(p, res->fattr); 2074 struct nfs3_readdirres *result)
996 p = xdr_decode_wcc_data(p, res->dir_attr); 2075{
997 return status; 2076 enum nfs_stat status;
2077 int error;
2078
2079 error = decode_nfsstat3(xdr, &status);
2080 if (unlikely(error))
2081 goto out;
2082 if (status != NFS3_OK)
2083 goto out_default;
2084 error = decode_readdir3resok(xdr, result);
2085out:
2086 return error;
2087out_default:
2088 error = decode_post_op_attr(xdr, result->dir_attr);
2089 if (unlikely(error))
2090 goto out;
2091 return nfs_stat_to_errno(status);
998} 2092}
999 2093
1000/* 2094/*
1001 * Decode FSSTAT reply 2095 * 3.3.18 FSSTAT3res
2096 *
2097 * struct FSSTAT3resok {
2098 * post_op_attr obj_attributes;
2099 * size3 tbytes;
2100 * size3 fbytes;
2101 * size3 abytes;
2102 * size3 tfiles;
2103 * size3 ffiles;
2104 * size3 afiles;
2105 * uint32 invarsec;
2106 * };
2107 *
2108 * struct FSSTAT3resfail {
2109 * post_op_attr obj_attributes;
2110 * };
2111 *
2112 * union FSSTAT3res switch (nfsstat3 status) {
2113 * case NFS3_OK:
2114 * FSSTAT3resok resok;
2115 * default:
2116 * FSSTAT3resfail resfail;
2117 * };
1002 */ 2118 */
1003static int 2119static int decode_fsstat3resok(struct xdr_stream *xdr,
1004nfs3_xdr_fsstatres(struct rpc_rqst *req, __be32 *p, struct nfs_fsstat *res) 2120 struct nfs_fsstat *result)
1005{ 2121{
1006 int status; 2122 __be32 *p;
1007
1008 status = ntohl(*p++);
1009
1010 p = xdr_decode_post_op_attr(p, res->fattr);
1011 if (status != 0)
1012 return nfs_stat_to_errno(status);
1013
1014 p = xdr_decode_hyper(p, &res->tbytes);
1015 p = xdr_decode_hyper(p, &res->fbytes);
1016 p = xdr_decode_hyper(p, &res->abytes);
1017 p = xdr_decode_hyper(p, &res->tfiles);
1018 p = xdr_decode_hyper(p, &res->ffiles);
1019 p = xdr_decode_hyper(p, &res->afiles);
1020 2123
2124 p = xdr_inline_decode(xdr, 8 * 6 + 4);
2125 if (unlikely(p == NULL))
2126 goto out_overflow;
2127 p = xdr_decode_size3(p, &result->tbytes);
2128 p = xdr_decode_size3(p, &result->fbytes);
2129 p = xdr_decode_size3(p, &result->abytes);
2130 p = xdr_decode_size3(p, &result->tfiles);
2131 p = xdr_decode_size3(p, &result->ffiles);
2132 xdr_decode_size3(p, &result->afiles);
1021 /* ignore invarsec */ 2133 /* ignore invarsec */
1022 return 0; 2134 return 0;
2135out_overflow:
2136 print_overflow_msg(__func__, xdr);
2137 return -EIO;
2138}
2139
2140static int nfs3_xdr_dec_fsstat3res(struct rpc_rqst *req,
2141 struct xdr_stream *xdr,
2142 struct nfs_fsstat *result)
2143{
2144 enum nfs_stat status;
2145 int error;
2146
2147 error = decode_nfsstat3(xdr, &status);
2148 if (unlikely(error))
2149 goto out;
2150 error = decode_post_op_attr(xdr, result->fattr);
2151 if (unlikely(error))
2152 goto out;
2153 if (status != NFS3_OK)
2154 goto out_status;
2155 error = decode_fsstat3resok(xdr, result);
2156out:
2157 return error;
2158out_status:
2159 return nfs_stat_to_errno(status);
1023} 2160}
1024 2161
1025/* 2162/*
1026 * Decode FSINFO reply 2163 * 3.3.19 FSINFO3res
2164 *
2165 * struct FSINFO3resok {
2166 * post_op_attr obj_attributes;
2167 * uint32 rtmax;
2168 * uint32 rtpref;
2169 * uint32 rtmult;
2170 * uint32 wtmax;
2171 * uint32 wtpref;
2172 * uint32 wtmult;
2173 * uint32 dtpref;
2174 * size3 maxfilesize;
2175 * nfstime3 time_delta;
2176 * uint32 properties;
2177 * };
2178 *
2179 * struct FSINFO3resfail {
2180 * post_op_attr obj_attributes;
2181 * };
2182 *
2183 * union FSINFO3res switch (nfsstat3 status) {
2184 * case NFS3_OK:
2185 * FSINFO3resok resok;
2186 * default:
2187 * FSINFO3resfail resfail;
2188 * };
1027 */ 2189 */
1028static int 2190static int decode_fsinfo3resok(struct xdr_stream *xdr,
1029nfs3_xdr_fsinfores(struct rpc_rqst *req, __be32 *p, struct nfs_fsinfo *res) 2191 struct nfs_fsinfo *result)
1030{ 2192{
1031 int status; 2193 __be32 *p;
1032
1033 status = ntohl(*p++);
1034
1035 p = xdr_decode_post_op_attr(p, res->fattr);
1036 if (status != 0)
1037 return nfs_stat_to_errno(status);
1038 2194
1039 res->rtmax = ntohl(*p++); 2195 p = xdr_inline_decode(xdr, 4 * 7 + 8 + 8 + 4);
1040 res->rtpref = ntohl(*p++); 2196 if (unlikely(p == NULL))
1041 res->rtmult = ntohl(*p++); 2197 goto out_overflow;
1042 res->wtmax = ntohl(*p++); 2198 result->rtmax = be32_to_cpup(p++);
1043 res->wtpref = ntohl(*p++); 2199 result->rtpref = be32_to_cpup(p++);
1044 res->wtmult = ntohl(*p++); 2200 result->rtmult = be32_to_cpup(p++);
1045 res->dtpref = ntohl(*p++); 2201 result->wtmax = be32_to_cpup(p++);
1046 p = xdr_decode_hyper(p, &res->maxfilesize); 2202 result->wtpref = be32_to_cpup(p++);
1047 p = xdr_decode_time3(p, &res->time_delta); 2203 result->wtmult = be32_to_cpup(p++);
2204 result->dtpref = be32_to_cpup(p++);
2205 p = xdr_decode_size3(p, &result->maxfilesize);
2206 xdr_decode_nfstime3(p, &result->time_delta);
1048 2207
1049 /* ignore properties */ 2208 /* ignore properties */
1050 res->lease_time = 0; 2209 result->lease_time = 0;
1051 return 0; 2210 return 0;
2211out_overflow:
2212 print_overflow_msg(__func__, xdr);
2213 return -EIO;
2214}
2215
2216static int nfs3_xdr_dec_fsinfo3res(struct rpc_rqst *req,
2217 struct xdr_stream *xdr,
2218 struct nfs_fsinfo *result)
2219{
2220 enum nfs_stat status;
2221 int error;
2222
2223 error = decode_nfsstat3(xdr, &status);
2224 if (unlikely(error))
2225 goto out;
2226 error = decode_post_op_attr(xdr, result->fattr);
2227 if (unlikely(error))
2228 goto out;
2229 if (status != NFS3_OK)
2230 goto out_status;
2231 error = decode_fsinfo3resok(xdr, result);
2232out:
2233 return error;
2234out_status:
2235 return nfs_stat_to_errno(status);
1052} 2236}
1053 2237
1054/* 2238/*
1055 * Decode PATHCONF reply 2239 * 3.3.20 PATHCONF3res
2240 *
2241 * struct PATHCONF3resok {
2242 * post_op_attr obj_attributes;
2243 * uint32 linkmax;
2244 * uint32 name_max;
2245 * bool no_trunc;
2246 * bool chown_restricted;
2247 * bool case_insensitive;
2248 * bool case_preserving;
2249 * };
2250 *
2251 * struct PATHCONF3resfail {
2252 * post_op_attr obj_attributes;
2253 * };
2254 *
2255 * union PATHCONF3res switch (nfsstat3 status) {
2256 * case NFS3_OK:
2257 * PATHCONF3resok resok;
2258 * default:
2259 * PATHCONF3resfail resfail;
2260 * };
1056 */ 2261 */
1057static int 2262static int decode_pathconf3resok(struct xdr_stream *xdr,
1058nfs3_xdr_pathconfres(struct rpc_rqst *req, __be32 *p, struct nfs_pathconf *res) 2263 struct nfs_pathconf *result)
1059{ 2264{
1060 int status; 2265 __be32 *p;
1061
1062 status = ntohl(*p++);
1063
1064 p = xdr_decode_post_op_attr(p, res->fattr);
1065 if (status != 0)
1066 return nfs_stat_to_errno(status);
1067 res->max_link = ntohl(*p++);
1068 res->max_namelen = ntohl(*p++);
1069 2266
2267 p = xdr_inline_decode(xdr, 4 * 6);
2268 if (unlikely(p == NULL))
2269 goto out_overflow;
2270 result->max_link = be32_to_cpup(p++);
2271 result->max_namelen = be32_to_cpup(p);
1070 /* ignore remaining fields */ 2272 /* ignore remaining fields */
1071 return 0; 2273 return 0;
2274out_overflow:
2275 print_overflow_msg(__func__, xdr);
2276 return -EIO;
2277}
2278
2279static int nfs3_xdr_dec_pathconf3res(struct rpc_rqst *req,
2280 struct xdr_stream *xdr,
2281 struct nfs_pathconf *result)
2282{
2283 enum nfs_stat status;
2284 int error;
2285
2286 error = decode_nfsstat3(xdr, &status);
2287 if (unlikely(error))
2288 goto out;
2289 error = decode_post_op_attr(xdr, result->fattr);
2290 if (unlikely(error))
2291 goto out;
2292 if (status != NFS3_OK)
2293 goto out_status;
2294 error = decode_pathconf3resok(xdr, result);
2295out:
2296 return error;
2297out_status:
2298 return nfs_stat_to_errno(status);
1072} 2299}
1073 2300
1074/* 2301/*
1075 * Decode COMMIT reply 2302 * 3.3.21 COMMIT3res
2303 *
2304 * struct COMMIT3resok {
2305 * wcc_data file_wcc;
2306 * writeverf3 verf;
2307 * };
2308 *
2309 * struct COMMIT3resfail {
2310 * wcc_data file_wcc;
2311 * };
2312 *
2313 * union COMMIT3res switch (nfsstat3 status) {
2314 * case NFS3_OK:
2315 * COMMIT3resok resok;
2316 * default:
2317 * COMMIT3resfail resfail;
2318 * };
1076 */ 2319 */
1077static int 2320static int nfs3_xdr_dec_commit3res(struct rpc_rqst *req,
1078nfs3_xdr_commitres(struct rpc_rqst *req, __be32 *p, struct nfs_writeres *res) 2321 struct xdr_stream *xdr,
2322 struct nfs_writeres *result)
1079{ 2323{
1080 int status; 2324 enum nfs_stat status;
1081 2325 int error;
1082 status = ntohl(*p++); 2326
1083 p = xdr_decode_wcc_data(p, res->fattr); 2327 error = decode_nfsstat3(xdr, &status);
1084 if (status != 0) 2328 if (unlikely(error))
1085 return nfs_stat_to_errno(status); 2329 goto out;
1086 2330 error = decode_wcc_data(xdr, result->fattr);
1087 res->verf->verifier[0] = *p++; 2331 if (unlikely(error))
1088 res->verf->verifier[1] = *p++; 2332 goto out;
1089 return 0; 2333 if (status != NFS3_OK)
2334 goto out_status;
2335 error = decode_writeverf3(xdr, result->verf->verifier);
2336out:
2337 return error;
2338out_status:
2339 return nfs_stat_to_errno(status);
1090} 2340}
1091 2341
1092#ifdef CONFIG_NFS_V3_ACL 2342#ifdef CONFIG_NFS_V3_ACL
1093/* 2343
1094 * Decode GETACL reply 2344static inline int decode_getacl3resok(struct xdr_stream *xdr,
1095 */ 2345 struct nfs3_getaclres *result)
1096static int
1097nfs3_xdr_getaclres(struct rpc_rqst *req, __be32 *p,
1098 struct nfs3_getaclres *res)
1099{ 2346{
1100 struct xdr_buf *buf = &req->rq_rcv_buf;
1101 int status = ntohl(*p++);
1102 struct posix_acl **acl; 2347 struct posix_acl **acl;
1103 unsigned int *aclcnt; 2348 unsigned int *aclcnt;
1104 int err, base; 2349 size_t hdrlen;
1105 2350 int error;
1106 if (status != 0) 2351
1107 return nfs_stat_to_errno(status); 2352 error = decode_post_op_attr(xdr, result->fattr);
1108 p = xdr_decode_post_op_attr(p, res->fattr); 2353 if (unlikely(error))
1109 res->mask = ntohl(*p++); 2354 goto out;
1110 if (res->mask & ~(NFS_ACL|NFS_ACLCNT|NFS_DFACL|NFS_DFACLCNT)) 2355 error = decode_uint32(xdr, &result->mask);
1111 return -EINVAL; 2356 if (unlikely(error))
1112 base = (char *)p - (char *)req->rq_rcv_buf.head->iov_base; 2357 goto out;
1113 2358 error = -EINVAL;
1114 acl = (res->mask & NFS_ACL) ? &res->acl_access : NULL; 2359 if (result->mask & ~(NFS_ACL|NFS_ACLCNT|NFS_DFACL|NFS_DFACLCNT))
1115 aclcnt = (res->mask & NFS_ACLCNT) ? &res->acl_access_count : NULL; 2360 goto out;
1116 err = nfsacl_decode(buf, base, aclcnt, acl); 2361
1117 2362 hdrlen = (u8 *)xdr->p - (u8 *)xdr->iov->iov_base;
1118 acl = (res->mask & NFS_DFACL) ? &res->acl_default : NULL; 2363
1119 aclcnt = (res->mask & NFS_DFACLCNT) ? &res->acl_default_count : NULL; 2364 acl = NULL;
1120 if (err > 0) 2365 if (result->mask & NFS_ACL)
1121 err = nfsacl_decode(buf, base + err, aclcnt, acl); 2366 acl = &result->acl_access;
1122 return (err > 0) ? 0 : err; 2367 aclcnt = NULL;
2368 if (result->mask & NFS_ACLCNT)
2369 aclcnt = &result->acl_access_count;
2370 error = nfsacl_decode(xdr->buf, hdrlen, aclcnt, acl);
2371 if (unlikely(error <= 0))
2372 goto out;
2373
2374 acl = NULL;
2375 if (result->mask & NFS_DFACL)
2376 acl = &result->acl_default;
2377 aclcnt = NULL;
2378 if (result->mask & NFS_DFACLCNT)
2379 aclcnt = &result->acl_default_count;
2380 error = nfsacl_decode(xdr->buf, hdrlen + error, aclcnt, acl);
2381 if (unlikely(error <= 0))
2382 return error;
2383 error = 0;
2384out:
2385 return error;
1123} 2386}
1124 2387
1125/* 2388static int nfs3_xdr_dec_getacl3res(struct rpc_rqst *req,
1126 * Decode setacl reply. 2389 struct xdr_stream *xdr,
1127 */ 2390 struct nfs3_getaclres *result)
1128static int
1129nfs3_xdr_setaclres(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr)
1130{ 2391{
1131 int status = ntohl(*p++); 2392 enum nfs_stat status;
2393 int error;
2394
2395 error = decode_nfsstat3(xdr, &status);
2396 if (unlikely(error))
2397 goto out;
2398 if (status != NFS3_OK)
2399 goto out_default;
2400 error = decode_getacl3resok(xdr, result);
2401out:
2402 return error;
2403out_default:
2404 return nfs_stat_to_errno(status);
2405}
1132 2406
1133 if (status) 2407static int nfs3_xdr_dec_setacl3res(struct rpc_rqst *req,
1134 return nfs_stat_to_errno(status); 2408 struct xdr_stream *xdr,
1135 xdr_decode_post_op_attr(p, fattr); 2409 struct nfs_fattr *result)
1136 return 0; 2410{
2411 enum nfs_stat status;
2412 int error;
2413
2414 error = decode_nfsstat3(xdr, &status);
2415 if (unlikely(error))
2416 goto out;
2417 if (status != NFS3_OK)
2418 goto out_default;
2419 error = decode_post_op_attr(xdr, result);
2420out:
2421 return error;
2422out_default:
2423 return nfs_stat_to_errno(status);
1137} 2424}
2425
1138#endif /* CONFIG_NFS_V3_ACL */ 2426#endif /* CONFIG_NFS_V3_ACL */
1139 2427
1140#define PROC(proc, argtype, restype, timer) \ 2428#define PROC(proc, argtype, restype, timer) \
1141[NFS3PROC_##proc] = { \ 2429[NFS3PROC_##proc] = { \
1142 .p_proc = NFS3PROC_##proc, \ 2430 .p_proc = NFS3PROC_##proc, \
1143 .p_encode = (kxdrproc_t) nfs3_xdr_##argtype, \ 2431 .p_encode = (kxdreproc_t)nfs3_xdr_enc_##argtype##3args, \
1144 .p_decode = (kxdrproc_t) nfs3_xdr_##restype, \ 2432 .p_decode = (kxdrdproc_t)nfs3_xdr_dec_##restype##3res, \
1145 .p_arglen = NFS3_##argtype##_sz, \ 2433 .p_arglen = NFS3_##argtype##args_sz, \
1146 .p_replen = NFS3_##restype##_sz, \ 2434 .p_replen = NFS3_##restype##res_sz, \
1147 .p_timer = timer, \ 2435 .p_timer = timer, \
1148 .p_statidx = NFS3PROC_##proc, \ 2436 .p_statidx = NFS3PROC_##proc, \
1149 .p_name = #proc, \ 2437 .p_name = #proc, \
1150 } 2438 }
1151 2439
1152struct rpc_procinfo nfs3_procedures[] = { 2440struct rpc_procinfo nfs3_procedures[] = {
1153 PROC(GETATTR, fhandle, attrstat, 1), 2441 PROC(GETATTR, getattr, getattr, 1),
1154 PROC(SETATTR, sattrargs, wccstat, 0), 2442 PROC(SETATTR, setattr, setattr, 0),
1155 PROC(LOOKUP, diropargs, lookupres, 2), 2443 PROC(LOOKUP, lookup, lookup, 2),
1156 PROC(ACCESS, accessargs, accessres, 1), 2444 PROC(ACCESS, access, access, 1),
1157 PROC(READLINK, readlinkargs, readlinkres, 3), 2445 PROC(READLINK, readlink, readlink, 3),
1158 PROC(READ, readargs, readres, 3), 2446 PROC(READ, read, read, 3),
1159 PROC(WRITE, writeargs, writeres, 4), 2447 PROC(WRITE, write, write, 4),
1160 PROC(CREATE, createargs, createres, 0), 2448 PROC(CREATE, create, create, 0),
1161 PROC(MKDIR, mkdirargs, createres, 0), 2449 PROC(MKDIR, mkdir, create, 0),
1162 PROC(SYMLINK, symlinkargs, createres, 0), 2450 PROC(SYMLINK, symlink, create, 0),
1163 PROC(MKNOD, mknodargs, createres, 0), 2451 PROC(MKNOD, mknod, create, 0),
1164 PROC(REMOVE, removeargs, removeres, 0), 2452 PROC(REMOVE, remove, remove, 0),
1165 PROC(RMDIR, diropargs, wccstat, 0), 2453 PROC(RMDIR, lookup, setattr, 0),
1166 PROC(RENAME, renameargs, renameres, 0), 2454 PROC(RENAME, rename, rename, 0),
1167 PROC(LINK, linkargs, linkres, 0), 2455 PROC(LINK, link, link, 0),
1168 PROC(READDIR, readdirargs, readdirres, 3), 2456 PROC(READDIR, readdir, readdir, 3),
1169 PROC(READDIRPLUS, readdirargs, readdirres, 3), 2457 PROC(READDIRPLUS, readdirplus, readdir, 3),
1170 PROC(FSSTAT, fhandle, fsstatres, 0), 2458 PROC(FSSTAT, getattr, fsstat, 0),
1171 PROC(FSINFO, fhandle, fsinfores, 0), 2459 PROC(FSINFO, getattr, fsinfo, 0),
1172 PROC(PATHCONF, fhandle, pathconfres, 0), 2460 PROC(PATHCONF, getattr, pathconf, 0),
1173 PROC(COMMIT, commitargs, commitres, 5), 2461 PROC(COMMIT, commit, commit, 5),
1174}; 2462};
1175 2463
1176struct rpc_version nfs_version3 = { 2464struct rpc_version nfs_version3 = {
@@ -1183,8 +2471,8 @@ struct rpc_version nfs_version3 = {
1183static struct rpc_procinfo nfs3_acl_procedures[] = { 2471static struct rpc_procinfo nfs3_acl_procedures[] = {
1184 [ACLPROC3_GETACL] = { 2472 [ACLPROC3_GETACL] = {
1185 .p_proc = ACLPROC3_GETACL, 2473 .p_proc = ACLPROC3_GETACL,
1186 .p_encode = (kxdrproc_t) nfs3_xdr_getaclargs, 2474 .p_encode = (kxdreproc_t)nfs3_xdr_enc_getacl3args,
1187 .p_decode = (kxdrproc_t) nfs3_xdr_getaclres, 2475 .p_decode = (kxdrdproc_t)nfs3_xdr_dec_getacl3res,
1188 .p_arglen = ACL3_getaclargs_sz, 2476 .p_arglen = ACL3_getaclargs_sz,
1189 .p_replen = ACL3_getaclres_sz, 2477 .p_replen = ACL3_getaclres_sz,
1190 .p_timer = 1, 2478 .p_timer = 1,
@@ -1192,8 +2480,8 @@ static struct rpc_procinfo nfs3_acl_procedures[] = {
1192 }, 2480 },
1193 [ACLPROC3_SETACL] = { 2481 [ACLPROC3_SETACL] = {
1194 .p_proc = ACLPROC3_SETACL, 2482 .p_proc = ACLPROC3_SETACL,
1195 .p_encode = (kxdrproc_t) nfs3_xdr_setaclargs, 2483 .p_encode = (kxdreproc_t)nfs3_xdr_enc_setacl3args,
1196 .p_decode = (kxdrproc_t) nfs3_xdr_setaclres, 2484 .p_decode = (kxdrdproc_t)nfs3_xdr_dec_setacl3res,
1197 .p_arglen = ACL3_setaclargs_sz, 2485 .p_arglen = ACL3_setaclargs_sz,
1198 .p_replen = ACL3_setaclres_sz, 2486 .p_replen = ACL3_setaclres_sz,
1199 .p_timer = 0, 2487 .p_timer = 0,
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 9fa496387fd..7a747407314 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -44,6 +44,7 @@ enum nfs4_client_state {
44 NFS4CLNT_RECLAIM_REBOOT, 44 NFS4CLNT_RECLAIM_REBOOT,
45 NFS4CLNT_RECLAIM_NOGRACE, 45 NFS4CLNT_RECLAIM_NOGRACE,
46 NFS4CLNT_DELEGRETURN, 46 NFS4CLNT_DELEGRETURN,
47 NFS4CLNT_LAYOUTRECALL,
47 NFS4CLNT_SESSION_RESET, 48 NFS4CLNT_SESSION_RESET,
48 NFS4CLNT_RECALL_SLOT, 49 NFS4CLNT_RECALL_SLOT,
49}; 50};
@@ -109,7 +110,7 @@ struct nfs_unique_id {
109struct nfs4_state_owner { 110struct nfs4_state_owner {
110 struct nfs_unique_id so_owner_id; 111 struct nfs_unique_id so_owner_id;
111 struct nfs_server *so_server; 112 struct nfs_server *so_server;
112 struct rb_node so_client_node; 113 struct rb_node so_server_node;
113 114
114 struct rpc_cred *so_cred; /* Associated cred */ 115 struct rpc_cred *so_cred; /* Associated cred */
115 116
@@ -227,12 +228,6 @@ struct nfs4_state_maintenance_ops {
227extern const struct dentry_operations nfs4_dentry_operations; 228extern const struct dentry_operations nfs4_dentry_operations;
228extern const struct inode_operations nfs4_dir_inode_operations; 229extern const struct inode_operations nfs4_dir_inode_operations;
229 230
230/* inode.c */
231extern ssize_t nfs4_getxattr(struct dentry *, const char *, void *, size_t);
232extern int nfs4_setxattr(struct dentry *, const char *, const void *, size_t, int);
233extern ssize_t nfs4_listxattr(struct dentry *, char *, size_t);
234
235
236/* nfs4proc.c */ 231/* nfs4proc.c */
237extern int nfs4_proc_setclientid(struct nfs_client *, u32, unsigned short, struct rpc_cred *, struct nfs4_setclientid_res *); 232extern int nfs4_proc_setclientid(struct nfs_client *, u32, unsigned short, struct rpc_cred *, struct nfs4_setclientid_res *);
238extern int nfs4_proc_setclientid_confirm(struct nfs_client *, struct nfs4_setclientid_res *arg, struct rpc_cred *); 233extern int nfs4_proc_setclientid_confirm(struct nfs_client *, struct nfs4_setclientid_res *arg, struct rpc_cred *);
@@ -241,11 +236,12 @@ extern int nfs4_proc_async_renew(struct nfs_client *, struct rpc_cred *);
241extern int nfs4_proc_renew(struct nfs_client *, struct rpc_cred *); 236extern int nfs4_proc_renew(struct nfs_client *, struct rpc_cred *);
242extern int nfs4_init_clientid(struct nfs_client *, struct rpc_cred *); 237extern int nfs4_init_clientid(struct nfs_client *, struct rpc_cred *);
243extern int nfs41_init_clientid(struct nfs_client *, struct rpc_cred *); 238extern int nfs41_init_clientid(struct nfs_client *, struct rpc_cred *);
244extern int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, int wait); 239extern int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, int wait, bool roc);
245extern int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle); 240extern int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle);
246extern int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name, 241extern int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name,
247 struct nfs4_fs_locations *fs_locations, struct page *page); 242 struct nfs4_fs_locations *fs_locations, struct page *page);
248extern void nfs4_release_lockowner(const struct nfs4_lock_state *); 243extern void nfs4_release_lockowner(const struct nfs4_lock_state *);
244extern const struct xattr_handler *nfs4_xattr_handlers[];
249 245
250#if defined(CONFIG_NFS_V4_1) 246#if defined(CONFIG_NFS_V4_1)
251static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *server) 247static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *server)
@@ -331,7 +327,6 @@ extern void nfs_free_seqid(struct nfs_seqid *seqid);
331extern const nfs4_stateid zero_stateid; 327extern const nfs4_stateid zero_stateid;
332 328
333/* nfs4xdr.c */ 329/* nfs4xdr.c */
334extern __be32 *nfs4_decode_dirent(struct xdr_stream *, struct nfs_entry *, struct nfs_server *, int);
335extern struct rpc_procinfo nfs4_procedures[]; 330extern struct rpc_procinfo nfs4_procedures[];
336 331
337struct nfs4_mount_data; 332struct nfs4_mount_data;
diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index 2e92f0d8d65..23f930caf1e 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -82,7 +82,7 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo,
82{ 82{
83 struct nfs4_file_layout_dsaddr *dsaddr; 83 struct nfs4_file_layout_dsaddr *dsaddr;
84 int status = -EINVAL; 84 int status = -EINVAL;
85 struct nfs_server *nfss = NFS_SERVER(lo->inode); 85 struct nfs_server *nfss = NFS_SERVER(lo->plh_inode);
86 86
87 dprintk("--> %s\n", __func__); 87 dprintk("--> %s\n", __func__);
88 88
@@ -101,7 +101,7 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo,
101 /* find and reference the deviceid */ 101 /* find and reference the deviceid */
102 dsaddr = nfs4_fl_find_get_deviceid(nfss->nfs_client, id); 102 dsaddr = nfs4_fl_find_get_deviceid(nfss->nfs_client, id);
103 if (dsaddr == NULL) { 103 if (dsaddr == NULL) {
104 dsaddr = get_device_info(lo->inode, id); 104 dsaddr = get_device_info(lo->plh_inode, id);
105 if (dsaddr == NULL) 105 if (dsaddr == NULL)
106 goto out; 106 goto out;
107 } 107 }
@@ -243,7 +243,7 @@ filelayout_alloc_lseg(struct pnfs_layout_hdr *layoutid,
243static void 243static void
244filelayout_free_lseg(struct pnfs_layout_segment *lseg) 244filelayout_free_lseg(struct pnfs_layout_segment *lseg)
245{ 245{
246 struct nfs_server *nfss = NFS_SERVER(lseg->layout->inode); 246 struct nfs_server *nfss = NFS_SERVER(lseg->pls_layout->plh_inode);
247 struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg); 247 struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg);
248 248
249 dprintk("--> %s\n", __func__); 249 dprintk("--> %s\n", __func__);
diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c
index 51fe64ace55..f5c9b125e8c 100644
--- a/fs/nfs/nfs4filelayoutdev.c
+++ b/fs/nfs/nfs4filelayoutdev.c
@@ -214,7 +214,7 @@ decode_and_add_ds(__be32 **pp, struct inode *inode)
214 214
215 /* ipv6 length plus port is legal */ 215 /* ipv6 length plus port is legal */
216 if (rlen > INET6_ADDRSTRLEN + 8) { 216 if (rlen > INET6_ADDRSTRLEN + 8) {
217 dprintk("%s Invalid address, length %d\n", __func__, 217 dprintk("%s: Invalid address, length %d\n", __func__,
218 rlen); 218 rlen);
219 goto out_err; 219 goto out_err;
220 } 220 }
@@ -225,6 +225,11 @@ decode_and_add_ds(__be32 **pp, struct inode *inode)
225 /* replace the port dots with dashes for the in4_pton() delimiter*/ 225 /* replace the port dots with dashes for the in4_pton() delimiter*/
226 for (i = 0; i < 2; i++) { 226 for (i = 0; i < 2; i++) {
227 char *res = strrchr(buf, '.'); 227 char *res = strrchr(buf, '.');
228 if (!res) {
229 dprintk("%s: Failed finding expected dots in port\n",
230 __func__);
231 goto out_free;
232 }
228 *res = '-'; 233 *res = '-';
229 } 234 }
230 235
@@ -240,7 +245,7 @@ decode_and_add_ds(__be32 **pp, struct inode *inode)
240 port = htons((tmp[0] << 8) | (tmp[1])); 245 port = htons((tmp[0] << 8) | (tmp[1]));
241 246
242 ds = nfs4_pnfs_ds_add(inode, ip_addr, port); 247 ds = nfs4_pnfs_ds_add(inode, ip_addr, port);
243 dprintk("%s Decoded address and port %s\n", __func__, buf); 248 dprintk("%s: Decoded address and port %s\n", __func__, buf);
244out_free: 249out_free:
245 kfree(buf); 250 kfree(buf);
246out_err: 251out_err:
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 0f24cdf2cb1..78936a8f40a 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -49,6 +49,8 @@
49#include <linux/mount.h> 49#include <linux/mount.h>
50#include <linux/module.h> 50#include <linux/module.h>
51#include <linux/sunrpc/bc_xprt.h> 51#include <linux/sunrpc/bc_xprt.h>
52#include <linux/xattr.h>
53#include <linux/utsname.h>
52 54
53#include "nfs4_fs.h" 55#include "nfs4_fs.h"
54#include "delegation.h" 56#include "delegation.h"
@@ -355,9 +357,9 @@ nfs4_free_slot(struct nfs4_slot_table *tbl, struct nfs4_slot *free_slot)
355} 357}
356 358
357/* 359/*
358 * Signal state manager thread if session is drained 360 * Signal state manager thread if session fore channel is drained
359 */ 361 */
360static void nfs41_check_drain_session_complete(struct nfs4_session *ses) 362static void nfs4_check_drain_fc_complete(struct nfs4_session *ses)
361{ 363{
362 struct rpc_task *task; 364 struct rpc_task *task;
363 365
@@ -371,8 +373,20 @@ static void nfs41_check_drain_session_complete(struct nfs4_session *ses)
371 if (ses->fc_slot_table.highest_used_slotid != -1) 373 if (ses->fc_slot_table.highest_used_slotid != -1)
372 return; 374 return;
373 375
374 dprintk("%s COMPLETE: Session Drained\n", __func__); 376 dprintk("%s COMPLETE: Session Fore Channel Drained\n", __func__);
375 complete(&ses->complete); 377 complete(&ses->fc_slot_table.complete);
378}
379
380/*
381 * Signal state manager thread if session back channel is drained
382 */
383void nfs4_check_drain_bc_complete(struct nfs4_session *ses)
384{
385 if (!test_bit(NFS4_SESSION_DRAINING, &ses->session_state) ||
386 ses->bc_slot_table.highest_used_slotid != -1)
387 return;
388 dprintk("%s COMPLETE: Session Back Channel Drained\n", __func__);
389 complete(&ses->bc_slot_table.complete);
376} 390}
377 391
378static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res) 392static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res)
@@ -389,7 +403,7 @@ static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res)
389 403
390 spin_lock(&tbl->slot_tbl_lock); 404 spin_lock(&tbl->slot_tbl_lock);
391 nfs4_free_slot(tbl, res->sr_slot); 405 nfs4_free_slot(tbl, res->sr_slot);
392 nfs41_check_drain_session_complete(res->sr_session); 406 nfs4_check_drain_fc_complete(res->sr_session);
393 spin_unlock(&tbl->slot_tbl_lock); 407 spin_unlock(&tbl->slot_tbl_lock);
394 res->sr_slot = NULL; 408 res->sr_slot = NULL;
395} 409}
@@ -1826,6 +1840,8 @@ struct nfs4_closedata {
1826 struct nfs_closeres res; 1840 struct nfs_closeres res;
1827 struct nfs_fattr fattr; 1841 struct nfs_fattr fattr;
1828 unsigned long timestamp; 1842 unsigned long timestamp;
1843 bool roc;
1844 u32 roc_barrier;
1829}; 1845};
1830 1846
1831static void nfs4_free_closedata(void *data) 1847static void nfs4_free_closedata(void *data)
@@ -1833,6 +1849,8 @@ static void nfs4_free_closedata(void *data)
1833 struct nfs4_closedata *calldata = data; 1849 struct nfs4_closedata *calldata = data;
1834 struct nfs4_state_owner *sp = calldata->state->owner; 1850 struct nfs4_state_owner *sp = calldata->state->owner;
1835 1851
1852 if (calldata->roc)
1853 pnfs_roc_release(calldata->state->inode);
1836 nfs4_put_open_state(calldata->state); 1854 nfs4_put_open_state(calldata->state);
1837 nfs_free_seqid(calldata->arg.seqid); 1855 nfs_free_seqid(calldata->arg.seqid);
1838 nfs4_put_state_owner(sp); 1856 nfs4_put_state_owner(sp);
@@ -1865,6 +1883,9 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
1865 */ 1883 */
1866 switch (task->tk_status) { 1884 switch (task->tk_status) {
1867 case 0: 1885 case 0:
1886 if (calldata->roc)
1887 pnfs_roc_set_barrier(state->inode,
1888 calldata->roc_barrier);
1868 nfs_set_open_stateid(state, &calldata->res.stateid, 0); 1889 nfs_set_open_stateid(state, &calldata->res.stateid, 0);
1869 renew_lease(server, calldata->timestamp); 1890 renew_lease(server, calldata->timestamp);
1870 nfs4_close_clear_stateid_flags(state, 1891 nfs4_close_clear_stateid_flags(state,
@@ -1917,8 +1938,15 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
1917 return; 1938 return;
1918 } 1939 }
1919 1940
1920 if (calldata->arg.fmode == 0) 1941 if (calldata->arg.fmode == 0) {
1921 task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CLOSE]; 1942 task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CLOSE];
1943 if (calldata->roc &&
1944 pnfs_roc_drain(calldata->inode, &calldata->roc_barrier)) {
1945 rpc_sleep_on(&NFS_SERVER(calldata->inode)->roc_rpcwaitq,
1946 task, NULL);
1947 return;
1948 }
1949 }
1922 1950
1923 nfs_fattr_init(calldata->res.fattr); 1951 nfs_fattr_init(calldata->res.fattr);
1924 calldata->timestamp = jiffies; 1952 calldata->timestamp = jiffies;
@@ -1946,7 +1974,7 @@ static const struct rpc_call_ops nfs4_close_ops = {
1946 * 1974 *
1947 * NOTE: Caller must be holding the sp->so_owner semaphore! 1975 * NOTE: Caller must be holding the sp->so_owner semaphore!
1948 */ 1976 */
1949int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, int wait) 1977int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, int wait, bool roc)
1950{ 1978{
1951 struct nfs_server *server = NFS_SERVER(state->inode); 1979 struct nfs_server *server = NFS_SERVER(state->inode);
1952 struct nfs4_closedata *calldata; 1980 struct nfs4_closedata *calldata;
@@ -1981,11 +2009,12 @@ int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, i
1981 calldata->res.fattr = &calldata->fattr; 2009 calldata->res.fattr = &calldata->fattr;
1982 calldata->res.seqid = calldata->arg.seqid; 2010 calldata->res.seqid = calldata->arg.seqid;
1983 calldata->res.server = server; 2011 calldata->res.server = server;
2012 calldata->roc = roc;
1984 path_get(path); 2013 path_get(path);
1985 calldata->path = *path; 2014 calldata->path = *path;
1986 2015
1987 msg.rpc_argp = &calldata->arg, 2016 msg.rpc_argp = &calldata->arg;
1988 msg.rpc_resp = &calldata->res, 2017 msg.rpc_resp = &calldata->res;
1989 task_setup_data.callback_data = calldata; 2018 task_setup_data.callback_data = calldata;
1990 task = rpc_run_task(&task_setup_data); 2019 task = rpc_run_task(&task_setup_data);
1991 if (IS_ERR(task)) 2020 if (IS_ERR(task))
@@ -1998,6 +2027,8 @@ int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, i
1998out_free_calldata: 2027out_free_calldata:
1999 kfree(calldata); 2028 kfree(calldata);
2000out: 2029out:
2030 if (roc)
2031 pnfs_roc_release(state->inode);
2001 nfs4_put_open_state(state); 2032 nfs4_put_open_state(state);
2002 nfs4_put_state_owner(sp); 2033 nfs4_put_state_owner(sp);
2003 return status; 2034 return status;
@@ -2486,6 +2517,7 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
2486 path = &ctx->path; 2517 path = &ctx->path;
2487 fmode = ctx->mode; 2518 fmode = ctx->mode;
2488 } 2519 }
2520 sattr->ia_mode &= ~current_umask();
2489 state = nfs4_do_open(dir, path, fmode, flags, sattr, cred); 2521 state = nfs4_do_open(dir, path, fmode, flags, sattr, cred);
2490 d_drop(dentry); 2522 d_drop(dentry);
2491 if (IS_ERR(state)) { 2523 if (IS_ERR(state)) {
@@ -2816,6 +2848,8 @@ static int nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry,
2816{ 2848{
2817 struct nfs4_exception exception = { }; 2849 struct nfs4_exception exception = { };
2818 int err; 2850 int err;
2851
2852 sattr->ia_mode &= ~current_umask();
2819 do { 2853 do {
2820 err = nfs4_handle_exception(NFS_SERVER(dir), 2854 err = nfs4_handle_exception(NFS_SERVER(dir),
2821 _nfs4_proc_mkdir(dir, dentry, sattr), 2855 _nfs4_proc_mkdir(dir, dentry, sattr),
@@ -2852,8 +2886,10 @@ static int _nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
2852 nfs4_setup_readdir(cookie, NFS_COOKIEVERF(dir), dentry, &args); 2886 nfs4_setup_readdir(cookie, NFS_COOKIEVERF(dir), dentry, &args);
2853 res.pgbase = args.pgbase; 2887 res.pgbase = args.pgbase;
2854 status = nfs4_call_sync(NFS_SERVER(dir), &msg, &args, &res, 0); 2888 status = nfs4_call_sync(NFS_SERVER(dir), &msg, &args, &res, 0);
2855 if (status == 0) 2889 if (status >= 0) {
2856 memcpy(NFS_COOKIEVERF(dir), res.verifier.data, NFS4_VERIFIER_SIZE); 2890 memcpy(NFS_COOKIEVERF(dir), res.verifier.data, NFS4_VERIFIER_SIZE);
2891 status += args.pgbase;
2892 }
2857 2893
2858 nfs_invalidate_atime(dir); 2894 nfs_invalidate_atime(dir);
2859 2895
@@ -2914,6 +2950,8 @@ static int nfs4_proc_mknod(struct inode *dir, struct dentry *dentry,
2914{ 2950{
2915 struct nfs4_exception exception = { }; 2951 struct nfs4_exception exception = { };
2916 int err; 2952 int err;
2953
2954 sattr->ia_mode &= ~current_umask();
2917 do { 2955 do {
2918 err = nfs4_handle_exception(NFS_SERVER(dir), 2956 err = nfs4_handle_exception(NFS_SERVER(dir),
2919 _nfs4_proc_mknod(dir, dentry, sattr, rdev), 2957 _nfs4_proc_mknod(dir, dentry, sattr, rdev),
@@ -3359,6 +3397,8 @@ static ssize_t nfs4_proc_get_acl(struct inode *inode, void *buf, size_t buflen)
3359 ret = nfs_revalidate_inode(server, inode); 3397 ret = nfs_revalidate_inode(server, inode);
3360 if (ret < 0) 3398 if (ret < 0)
3361 return ret; 3399 return ret;
3400 if (NFS_I(inode)->cache_validity & NFS_INO_INVALID_ACL)
3401 nfs_zap_acl_cache(inode);
3362 ret = nfs4_read_cached_acl(inode, buf, buflen); 3402 ret = nfs4_read_cached_acl(inode, buf, buflen);
3363 if (ret != -ENOENT) 3403 if (ret != -ENOENT)
3364 return ret; 3404 return ret;
@@ -3387,6 +3427,13 @@ static int __nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t bufl
3387 nfs_inode_return_delegation(inode); 3427 nfs_inode_return_delegation(inode);
3388 buf_to_pages(buf, buflen, arg.acl_pages, &arg.acl_pgbase); 3428 buf_to_pages(buf, buflen, arg.acl_pages, &arg.acl_pgbase);
3389 ret = nfs4_call_sync(server, &msg, &arg, &res, 1); 3429 ret = nfs4_call_sync(server, &msg, &arg, &res, 1);
3430 /*
3431 * Acl update can result in inode attribute update.
3432 * so mark the attribute cache invalid.
3433 */
3434 spin_lock(&inode->i_lock);
3435 NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ATTR;
3436 spin_unlock(&inode->i_lock);
3390 nfs_access_zap_cache(inode); 3437 nfs_access_zap_cache(inode);
3391 nfs_zap_acl_cache(inode); 3438 nfs_zap_acl_cache(inode);
3392 return ret; 3439 return ret;
@@ -3467,6 +3514,7 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program,
3467 struct nfs4_setclientid setclientid = { 3514 struct nfs4_setclientid setclientid = {
3468 .sc_verifier = &sc_verifier, 3515 .sc_verifier = &sc_verifier,
3469 .sc_prog = program, 3516 .sc_prog = program,
3517 .sc_cb_ident = clp->cl_cb_ident,
3470 }; 3518 };
3471 struct rpc_message msg = { 3519 struct rpc_message msg = {
3472 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETCLIENTID], 3520 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETCLIENTID],
@@ -3506,7 +3554,7 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program,
3506 if (signalled()) 3554 if (signalled())
3507 break; 3555 break;
3508 if (loop++ & 1) 3556 if (loop++ & 1)
3509 ssleep(clp->cl_lease_time + 1); 3557 ssleep(clp->cl_lease_time / HZ + 1);
3510 else 3558 else
3511 if (++clp->cl_id_uniquifier == 0) 3559 if (++clp->cl_id_uniquifier == 0)
3512 break; 3560 break;
@@ -3652,8 +3700,8 @@ static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, co
3652 data->rpc_status = 0; 3700 data->rpc_status = 0;
3653 3701
3654 task_setup_data.callback_data = data; 3702 task_setup_data.callback_data = data;
3655 msg.rpc_argp = &data->args, 3703 msg.rpc_argp = &data->args;
3656 msg.rpc_resp = &data->res, 3704 msg.rpc_resp = &data->res;
3657 task = rpc_run_task(&task_setup_data); 3705 task = rpc_run_task(&task_setup_data);
3658 if (IS_ERR(task)) 3706 if (IS_ERR(task))
3659 return PTR_ERR(task); 3707 return PTR_ERR(task);
@@ -3732,6 +3780,7 @@ static int _nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock
3732 goto out; 3780 goto out;
3733 lsp = request->fl_u.nfs4_fl.owner; 3781 lsp = request->fl_u.nfs4_fl.owner;
3734 arg.lock_owner.id = lsp->ls_id.id; 3782 arg.lock_owner.id = lsp->ls_id.id;
3783 arg.lock_owner.s_dev = server->s_dev;
3735 status = nfs4_call_sync(server, &msg, &arg, &res, 1); 3784 status = nfs4_call_sync(server, &msg, &arg, &res, 1);
3736 switch (status) { 3785 switch (status) {
3737 case 0: 3786 case 0:
@@ -3897,8 +3946,8 @@ static struct rpc_task *nfs4_do_unlck(struct file_lock *fl,
3897 return ERR_PTR(-ENOMEM); 3946 return ERR_PTR(-ENOMEM);
3898 } 3947 }
3899 3948
3900 msg.rpc_argp = &data->arg, 3949 msg.rpc_argp = &data->arg;
3901 msg.rpc_resp = &data->res, 3950 msg.rpc_resp = &data->res;
3902 task_setup_data.callback_data = data; 3951 task_setup_data.callback_data = data;
3903 return rpc_run_task(&task_setup_data); 3952 return rpc_run_task(&task_setup_data);
3904} 3953}
@@ -3977,6 +4026,7 @@ static struct nfs4_lockdata *nfs4_alloc_lockdata(struct file_lock *fl,
3977 p->arg.lock_stateid = &lsp->ls_stateid; 4026 p->arg.lock_stateid = &lsp->ls_stateid;
3978 p->arg.lock_owner.clientid = server->nfs_client->cl_clientid; 4027 p->arg.lock_owner.clientid = server->nfs_client->cl_clientid;
3979 p->arg.lock_owner.id = lsp->ls_id.id; 4028 p->arg.lock_owner.id = lsp->ls_id.id;
4029 p->arg.lock_owner.s_dev = server->s_dev;
3980 p->res.lock_seqid = p->arg.lock_seqid; 4030 p->res.lock_seqid = p->arg.lock_seqid;
3981 p->lsp = lsp; 4031 p->lsp = lsp;
3982 p->server = server; 4032 p->server = server;
@@ -4134,8 +4184,8 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *f
4134 data->arg.reclaim = NFS_LOCK_RECLAIM; 4184 data->arg.reclaim = NFS_LOCK_RECLAIM;
4135 task_setup_data.callback_ops = &nfs4_recover_lock_ops; 4185 task_setup_data.callback_ops = &nfs4_recover_lock_ops;
4136 } 4186 }
4137 msg.rpc_argp = &data->arg, 4187 msg.rpc_argp = &data->arg;
4138 msg.rpc_resp = &data->res, 4188 msg.rpc_resp = &data->res;
4139 task_setup_data.callback_data = data; 4189 task_setup_data.callback_data = data;
4140 task = rpc_run_task(&task_setup_data); 4190 task = rpc_run_task(&task_setup_data);
4141 if (IS_ERR(task)) 4191 if (IS_ERR(task))
@@ -4381,48 +4431,43 @@ void nfs4_release_lockowner(const struct nfs4_lock_state *lsp)
4381 return; 4431 return;
4382 args->lock_owner.clientid = server->nfs_client->cl_clientid; 4432 args->lock_owner.clientid = server->nfs_client->cl_clientid;
4383 args->lock_owner.id = lsp->ls_id.id; 4433 args->lock_owner.id = lsp->ls_id.id;
4434 args->lock_owner.s_dev = server->s_dev;
4384 msg.rpc_argp = args; 4435 msg.rpc_argp = args;
4385 rpc_call_async(server->client, &msg, 0, &nfs4_release_lockowner_ops, args); 4436 rpc_call_async(server->client, &msg, 0, &nfs4_release_lockowner_ops, args);
4386} 4437}
4387 4438
4388#define XATTR_NAME_NFSV4_ACL "system.nfs4_acl" 4439#define XATTR_NAME_NFSV4_ACL "system.nfs4_acl"
4389 4440
4390int nfs4_setxattr(struct dentry *dentry, const char *key, const void *buf, 4441static int nfs4_xattr_set_nfs4_acl(struct dentry *dentry, const char *key,
4391 size_t buflen, int flags) 4442 const void *buf, size_t buflen,
4443 int flags, int type)
4392{ 4444{
4393 struct inode *inode = dentry->d_inode; 4445 if (strcmp(key, "") != 0)
4394 4446 return -EINVAL;
4395 if (strcmp(key, XATTR_NAME_NFSV4_ACL) != 0)
4396 return -EOPNOTSUPP;
4397 4447
4398 return nfs4_proc_set_acl(inode, buf, buflen); 4448 return nfs4_proc_set_acl(dentry->d_inode, buf, buflen);
4399} 4449}
4400 4450
4401/* The getxattr man page suggests returning -ENODATA for unknown attributes, 4451static int nfs4_xattr_get_nfs4_acl(struct dentry *dentry, const char *key,
4402 * and that's what we'll do for e.g. user attributes that haven't been set. 4452 void *buf, size_t buflen, int type)
4403 * But we'll follow ext2/ext3's lead by returning -EOPNOTSUPP for unsupported
4404 * attributes in kernel-managed attribute namespaces. */
4405ssize_t nfs4_getxattr(struct dentry *dentry, const char *key, void *buf,
4406 size_t buflen)
4407{ 4453{
4408 struct inode *inode = dentry->d_inode; 4454 if (strcmp(key, "") != 0)
4409 4455 return -EINVAL;
4410 if (strcmp(key, XATTR_NAME_NFSV4_ACL) != 0)
4411 return -EOPNOTSUPP;
4412 4456
4413 return nfs4_proc_get_acl(inode, buf, buflen); 4457 return nfs4_proc_get_acl(dentry->d_inode, buf, buflen);
4414} 4458}
4415 4459
4416ssize_t nfs4_listxattr(struct dentry *dentry, char *buf, size_t buflen) 4460static size_t nfs4_xattr_list_nfs4_acl(struct dentry *dentry, char *list,
4461 size_t list_len, const char *name,
4462 size_t name_len, int type)
4417{ 4463{
4418 size_t len = strlen(XATTR_NAME_NFSV4_ACL) + 1; 4464 size_t len = sizeof(XATTR_NAME_NFSV4_ACL);
4419 4465
4420 if (!nfs4_server_supports_acls(NFS_SERVER(dentry->d_inode))) 4466 if (!nfs4_server_supports_acls(NFS_SERVER(dentry->d_inode)))
4421 return 0; 4467 return 0;
4422 if (buf && buflen < len) 4468
4423 return -ERANGE; 4469 if (list && len <= list_len)
4424 if (buf) 4470 memcpy(list, XATTR_NAME_NFSV4_ACL, len);
4425 memcpy(buf, XATTR_NAME_NFSV4_ACL, len);
4426 return len; 4471 return len;
4427} 4472}
4428 4473
@@ -4475,6 +4520,25 @@ int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name,
4475 4520
4476#ifdef CONFIG_NFS_V4_1 4521#ifdef CONFIG_NFS_V4_1
4477/* 4522/*
4523 * Check the exchange flags returned by the server for invalid flags, having
4524 * both PNFS and NON_PNFS flags set, and not having one of NON_PNFS, PNFS, or
4525 * DS flags set.
4526 */
4527static int nfs4_check_cl_exchange_flags(u32 flags)
4528{
4529 if (flags & ~EXCHGID4_FLAG_MASK_R)
4530 goto out_inval;
4531 if ((flags & EXCHGID4_FLAG_USE_PNFS_MDS) &&
4532 (flags & EXCHGID4_FLAG_USE_NON_PNFS))
4533 goto out_inval;
4534 if (!(flags & (EXCHGID4_FLAG_MASK_PNFS)))
4535 goto out_inval;
4536 return NFS_OK;
4537out_inval:
4538 return -NFS4ERR_INVAL;
4539}
4540
4541/*
4478 * nfs4_proc_exchange_id() 4542 * nfs4_proc_exchange_id()
4479 * 4543 *
4480 * Since the clientid has expired, all compounds using sessions 4544 * Since the clientid has expired, all compounds using sessions
@@ -4487,7 +4551,7 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred)
4487 nfs4_verifier verifier; 4551 nfs4_verifier verifier;
4488 struct nfs41_exchange_id_args args = { 4552 struct nfs41_exchange_id_args args = {
4489 .client = clp, 4553 .client = clp,
4490 .flags = clp->cl_exchange_flags, 4554 .flags = EXCHGID4_FLAG_SUPP_MOVED_REFER,
4491 }; 4555 };
4492 struct nfs41_exchange_id_res res = { 4556 struct nfs41_exchange_id_res res = {
4493 .client = clp, 4557 .client = clp,
@@ -4504,34 +4568,21 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred)
4504 dprintk("--> %s\n", __func__); 4568 dprintk("--> %s\n", __func__);
4505 BUG_ON(clp == NULL); 4569 BUG_ON(clp == NULL);
4506 4570
4507 /* Remove server-only flags */
4508 args.flags &= ~EXCHGID4_FLAG_CONFIRMED_R;
4509
4510 p = (u32 *)verifier.data; 4571 p = (u32 *)verifier.data;
4511 *p++ = htonl((u32)clp->cl_boot_time.tv_sec); 4572 *p++ = htonl((u32)clp->cl_boot_time.tv_sec);
4512 *p = htonl((u32)clp->cl_boot_time.tv_nsec); 4573 *p = htonl((u32)clp->cl_boot_time.tv_nsec);
4513 args.verifier = &verifier; 4574 args.verifier = &verifier;
4514 4575
4515 while (1) { 4576 args.id_len = scnprintf(args.id, sizeof(args.id),
4516 args.id_len = scnprintf(args.id, sizeof(args.id), 4577 "%s/%s.%s/%u",
4517 "%s/%s %u", 4578 clp->cl_ipaddr,
4518 clp->cl_ipaddr, 4579 init_utsname()->nodename,
4519 rpc_peeraddr2str(clp->cl_rpcclient, 4580 init_utsname()->domainname,
4520 RPC_DISPLAY_ADDR), 4581 clp->cl_rpcclient->cl_auth->au_flavor);
4521 clp->cl_id_uniquifier);
4522
4523 status = rpc_call_sync(clp->cl_rpcclient, &msg, 0);
4524
4525 if (status != -NFS4ERR_CLID_INUSE)
4526 break;
4527
4528 if (signalled())
4529 break;
4530
4531 if (++clp->cl_id_uniquifier == 0)
4532 break;
4533 }
4534 4582
4583 status = rpc_call_sync(clp->cl_rpcclient, &msg, 0);
4584 if (!status)
4585 status = nfs4_check_cl_exchange_flags(clp->cl_exchange_flags);
4535 dprintk("<-- %s status= %d\n", __func__, status); 4586 dprintk("<-- %s status= %d\n", __func__, status);
4536 return status; 4587 return status;
4537} 4588}
@@ -4765,17 +4816,17 @@ struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp)
4765 if (!session) 4816 if (!session)
4766 return NULL; 4817 return NULL;
4767 4818
4768 init_completion(&session->complete);
4769
4770 tbl = &session->fc_slot_table; 4819 tbl = &session->fc_slot_table;
4771 tbl->highest_used_slotid = -1; 4820 tbl->highest_used_slotid = -1;
4772 spin_lock_init(&tbl->slot_tbl_lock); 4821 spin_lock_init(&tbl->slot_tbl_lock);
4773 rpc_init_priority_wait_queue(&tbl->slot_tbl_waitq, "ForeChannel Slot table"); 4822 rpc_init_priority_wait_queue(&tbl->slot_tbl_waitq, "ForeChannel Slot table");
4823 init_completion(&tbl->complete);
4774 4824
4775 tbl = &session->bc_slot_table; 4825 tbl = &session->bc_slot_table;
4776 tbl->highest_used_slotid = -1; 4826 tbl->highest_used_slotid = -1;
4777 spin_lock_init(&tbl->slot_tbl_lock); 4827 spin_lock_init(&tbl->slot_tbl_lock);
4778 rpc_init_wait_queue(&tbl->slot_tbl_waitq, "BackChannel Slot table"); 4828 rpc_init_wait_queue(&tbl->slot_tbl_waitq, "BackChannel Slot table");
4829 init_completion(&tbl->complete);
4779 4830
4780 session->session_state = 1<<NFS4_SESSION_INITING; 4831 session->session_state = 1<<NFS4_SESSION_INITING;
4781 4832
@@ -5269,13 +5320,23 @@ static void
5269nfs4_layoutget_prepare(struct rpc_task *task, void *calldata) 5320nfs4_layoutget_prepare(struct rpc_task *task, void *calldata)
5270{ 5321{
5271 struct nfs4_layoutget *lgp = calldata; 5322 struct nfs4_layoutget *lgp = calldata;
5272 struct inode *ino = lgp->args.inode; 5323 struct nfs_server *server = NFS_SERVER(lgp->args.inode);
5273 struct nfs_server *server = NFS_SERVER(ino);
5274 5324
5275 dprintk("--> %s\n", __func__); 5325 dprintk("--> %s\n", __func__);
5326 /* Note the is a race here, where a CB_LAYOUTRECALL can come in
5327 * right now covering the LAYOUTGET we are about to send.
5328 * However, that is not so catastrophic, and there seems
5329 * to be no way to prevent it completely.
5330 */
5276 if (nfs4_setup_sequence(server, &lgp->args.seq_args, 5331 if (nfs4_setup_sequence(server, &lgp->args.seq_args,
5277 &lgp->res.seq_res, 0, task)) 5332 &lgp->res.seq_res, 0, task))
5278 return; 5333 return;
5334 if (pnfs_choose_layoutget_stateid(&lgp->args.stateid,
5335 NFS_I(lgp->args.inode)->layout,
5336 lgp->args.ctx->state)) {
5337 rpc_exit(task, NFS4_OK);
5338 return;
5339 }
5279 rpc_call_start(task); 5340 rpc_call_start(task);
5280} 5341}
5281 5342
@@ -5302,7 +5363,6 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)
5302 return; 5363 return;
5303 } 5364 }
5304 } 5365 }
5305 lgp->status = task->tk_status;
5306 dprintk("<-- %s\n", __func__); 5366 dprintk("<-- %s\n", __func__);
5307} 5367}
5308 5368
@@ -5311,7 +5371,6 @@ static void nfs4_layoutget_release(void *calldata)
5311 struct nfs4_layoutget *lgp = calldata; 5371 struct nfs4_layoutget *lgp = calldata;
5312 5372
5313 dprintk("--> %s\n", __func__); 5373 dprintk("--> %s\n", __func__);
5314 put_layout_hdr(lgp->args.inode);
5315 if (lgp->res.layout.buf != NULL) 5374 if (lgp->res.layout.buf != NULL)
5316 free_page((unsigned long) lgp->res.layout.buf); 5375 free_page((unsigned long) lgp->res.layout.buf);
5317 put_nfs_open_context(lgp->args.ctx); 5376 put_nfs_open_context(lgp->args.ctx);
@@ -5356,13 +5415,10 @@ int nfs4_proc_layoutget(struct nfs4_layoutget *lgp)
5356 if (IS_ERR(task)) 5415 if (IS_ERR(task))
5357 return PTR_ERR(task); 5416 return PTR_ERR(task);
5358 status = nfs4_wait_for_completion_rpc_task(task); 5417 status = nfs4_wait_for_completion_rpc_task(task);
5359 if (status != 0) 5418 if (status == 0)
5360 goto out; 5419 status = task->tk_status;
5361 status = lgp->status; 5420 if (status == 0)
5362 if (status != 0) 5421 status = pnfs_layout_process(lgp);
5363 goto out;
5364 status = pnfs_layout_process(lgp);
5365out:
5366 rpc_put_task(task); 5422 rpc_put_task(task);
5367 dprintk("<-- %s status=%d\n", __func__, status); 5423 dprintk("<-- %s status=%d\n", __func__, status);
5368 return status; 5424 return status;
@@ -5493,9 +5549,10 @@ static const struct inode_operations nfs4_file_inode_operations = {
5493 .permission = nfs_permission, 5549 .permission = nfs_permission,
5494 .getattr = nfs_getattr, 5550 .getattr = nfs_getattr,
5495 .setattr = nfs_setattr, 5551 .setattr = nfs_setattr,
5496 .getxattr = nfs4_getxattr, 5552 .getxattr = generic_getxattr,
5497 .setxattr = nfs4_setxattr, 5553 .setxattr = generic_setxattr,
5498 .listxattr = nfs4_listxattr, 5554 .listxattr = generic_listxattr,
5555 .removexattr = generic_removexattr,
5499}; 5556};
5500 5557
5501const struct nfs_rpc_ops nfs_v4_clientops = { 5558const struct nfs_rpc_ops nfs_v4_clientops = {
@@ -5540,6 +5597,18 @@ const struct nfs_rpc_ops nfs_v4_clientops = {
5540 .open_context = nfs4_atomic_open, 5597 .open_context = nfs4_atomic_open,
5541}; 5598};
5542 5599
5600static const struct xattr_handler nfs4_xattr_nfs4_acl_handler = {
5601 .prefix = XATTR_NAME_NFSV4_ACL,
5602 .list = nfs4_xattr_list_nfs4_acl,
5603 .get = nfs4_xattr_get_nfs4_acl,
5604 .set = nfs4_xattr_set_nfs4_acl,
5605};
5606
5607const struct xattr_handler *nfs4_xattr_handlers[] = {
5608 &nfs4_xattr_nfs4_acl_handler,
5609 NULL
5610};
5611
5543/* 5612/*
5544 * Local variables: 5613 * Local variables:
5545 * c-basic-offset: 8 5614 * c-basic-offset: 8
diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c
index 72b6c580af1..402143d75fc 100644
--- a/fs/nfs/nfs4renewd.c
+++ b/fs/nfs/nfs4renewd.c
@@ -63,9 +63,14 @@ nfs4_renew_state(struct work_struct *work)
63 63
64 ops = clp->cl_mvops->state_renewal_ops; 64 ops = clp->cl_mvops->state_renewal_ops;
65 dprintk("%s: start\n", __func__); 65 dprintk("%s: start\n", __func__);
66 /* Are there any active superblocks? */ 66
67 if (list_empty(&clp->cl_superblocks)) 67 rcu_read_lock();
68 if (list_empty(&clp->cl_superblocks)) {
69 rcu_read_unlock();
68 goto out; 70 goto out;
71 }
72 rcu_read_unlock();
73
69 spin_lock(&clp->cl_lock); 74 spin_lock(&clp->cl_lock);
70 lease = clp->cl_lease_time; 75 lease = clp->cl_lease_time;
71 last = clp->cl_last_renewal; 76 last = clp->cl_last_renewal;
@@ -75,7 +80,7 @@ nfs4_renew_state(struct work_struct *work)
75 cred = ops->get_state_renewal_cred_locked(clp); 80 cred = ops->get_state_renewal_cred_locked(clp);
76 spin_unlock(&clp->cl_lock); 81 spin_unlock(&clp->cl_lock);
77 if (cred == NULL) { 82 if (cred == NULL) {
78 if (list_empty(&clp->cl_delegations)) { 83 if (!nfs_delegations_present(clp)) {
79 set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state); 84 set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
80 goto out; 85 goto out;
81 } 86 }
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index f575a312673..e6742b57a04 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -105,14 +105,17 @@ static void nfs4_clear_machine_cred(struct nfs_client *clp)
105 put_rpccred(cred); 105 put_rpccred(cred);
106} 106}
107 107
108struct rpc_cred *nfs4_get_renew_cred_locked(struct nfs_client *clp) 108static struct rpc_cred *
109nfs4_get_renew_cred_server_locked(struct nfs_server *server)
109{ 110{
111 struct rpc_cred *cred = NULL;
110 struct nfs4_state_owner *sp; 112 struct nfs4_state_owner *sp;
111 struct rb_node *pos; 113 struct rb_node *pos;
112 struct rpc_cred *cred = NULL;
113 114
114 for (pos = rb_first(&clp->cl_state_owners); pos != NULL; pos = rb_next(pos)) { 115 for (pos = rb_first(&server->state_owners);
115 sp = rb_entry(pos, struct nfs4_state_owner, so_client_node); 116 pos != NULL;
117 pos = rb_next(pos)) {
118 sp = rb_entry(pos, struct nfs4_state_owner, so_server_node);
116 if (list_empty(&sp->so_states)) 119 if (list_empty(&sp->so_states))
117 continue; 120 continue;
118 cred = get_rpccred(sp->so_cred); 121 cred = get_rpccred(sp->so_cred);
@@ -121,6 +124,28 @@ struct rpc_cred *nfs4_get_renew_cred_locked(struct nfs_client *clp)
121 return cred; 124 return cred;
122} 125}
123 126
127/**
128 * nfs4_get_renew_cred_locked - Acquire credential for a renew operation
129 * @clp: client state handle
130 *
131 * Returns an rpc_cred with reference count bumped, or NULL.
132 * Caller must hold clp->cl_lock.
133 */
134struct rpc_cred *nfs4_get_renew_cred_locked(struct nfs_client *clp)
135{
136 struct rpc_cred *cred = NULL;
137 struct nfs_server *server;
138
139 rcu_read_lock();
140 list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
141 cred = nfs4_get_renew_cred_server_locked(server);
142 if (cred != NULL)
143 break;
144 }
145 rcu_read_unlock();
146 return cred;
147}
148
124#if defined(CONFIG_NFS_V4_1) 149#if defined(CONFIG_NFS_V4_1)
125 150
126static int nfs41_setup_state_renewal(struct nfs_client *clp) 151static int nfs41_setup_state_renewal(struct nfs_client *clp)
@@ -142,6 +167,11 @@ static int nfs41_setup_state_renewal(struct nfs_client *clp)
142 return status; 167 return status;
143} 168}
144 169
170/*
171 * Back channel returns NFS4ERR_DELAY for new requests when
172 * NFS4_SESSION_DRAINING is set so there is no work to be done when draining
173 * is ended.
174 */
145static void nfs4_end_drain_session(struct nfs_client *clp) 175static void nfs4_end_drain_session(struct nfs_client *clp)
146{ 176{
147 struct nfs4_session *ses = clp->cl_session; 177 struct nfs4_session *ses = clp->cl_session;
@@ -165,22 +195,32 @@ static void nfs4_end_drain_session(struct nfs_client *clp)
165 } 195 }
166} 196}
167 197
168static int nfs4_begin_drain_session(struct nfs_client *clp) 198static int nfs4_wait_on_slot_tbl(struct nfs4_slot_table *tbl)
169{ 199{
170 struct nfs4_session *ses = clp->cl_session;
171 struct nfs4_slot_table *tbl = &ses->fc_slot_table;
172
173 spin_lock(&tbl->slot_tbl_lock); 200 spin_lock(&tbl->slot_tbl_lock);
174 set_bit(NFS4_SESSION_DRAINING, &ses->session_state);
175 if (tbl->highest_used_slotid != -1) { 201 if (tbl->highest_used_slotid != -1) {
176 INIT_COMPLETION(ses->complete); 202 INIT_COMPLETION(tbl->complete);
177 spin_unlock(&tbl->slot_tbl_lock); 203 spin_unlock(&tbl->slot_tbl_lock);
178 return wait_for_completion_interruptible(&ses->complete); 204 return wait_for_completion_interruptible(&tbl->complete);
179 } 205 }
180 spin_unlock(&tbl->slot_tbl_lock); 206 spin_unlock(&tbl->slot_tbl_lock);
181 return 0; 207 return 0;
182} 208}
183 209
210static int nfs4_begin_drain_session(struct nfs_client *clp)
211{
212 struct nfs4_session *ses = clp->cl_session;
213 int ret = 0;
214
215 set_bit(NFS4_SESSION_DRAINING, &ses->session_state);
216 /* back channel */
217 ret = nfs4_wait_on_slot_tbl(&ses->bc_slot_table);
218 if (ret)
219 return ret;
220 /* fore channel */
221 return nfs4_wait_on_slot_tbl(&ses->fc_slot_table);
222}
223
184int nfs41_init_clientid(struct nfs_client *clp, struct rpc_cred *cred) 224int nfs41_init_clientid(struct nfs_client *clp, struct rpc_cred *cred)
185{ 225{
186 int status; 226 int status;
@@ -210,28 +250,56 @@ struct rpc_cred *nfs4_get_exchange_id_cred(struct nfs_client *clp)
210 250
211#endif /* CONFIG_NFS_V4_1 */ 251#endif /* CONFIG_NFS_V4_1 */
212 252
213struct rpc_cred *nfs4_get_setclientid_cred(struct nfs_client *clp) 253static struct rpc_cred *
254nfs4_get_setclientid_cred_server(struct nfs_server *server)
214{ 255{
256 struct nfs_client *clp = server->nfs_client;
257 struct rpc_cred *cred = NULL;
215 struct nfs4_state_owner *sp; 258 struct nfs4_state_owner *sp;
216 struct rb_node *pos; 259 struct rb_node *pos;
260
261 spin_lock(&clp->cl_lock);
262 pos = rb_first(&server->state_owners);
263 if (pos != NULL) {
264 sp = rb_entry(pos, struct nfs4_state_owner, so_server_node);
265 cred = get_rpccred(sp->so_cred);
266 }
267 spin_unlock(&clp->cl_lock);
268 return cred;
269}
270
271/**
272 * nfs4_get_setclientid_cred - Acquire credential for a setclientid operation
273 * @clp: client state handle
274 *
275 * Returns an rpc_cred with reference count bumped, or NULL.
276 */
277struct rpc_cred *nfs4_get_setclientid_cred(struct nfs_client *clp)
278{
279 struct nfs_server *server;
217 struct rpc_cred *cred; 280 struct rpc_cred *cred;
218 281
219 spin_lock(&clp->cl_lock); 282 spin_lock(&clp->cl_lock);
220 cred = nfs4_get_machine_cred_locked(clp); 283 cred = nfs4_get_machine_cred_locked(clp);
284 spin_unlock(&clp->cl_lock);
221 if (cred != NULL) 285 if (cred != NULL)
222 goto out; 286 goto out;
223 pos = rb_first(&clp->cl_state_owners); 287
224 if (pos != NULL) { 288 rcu_read_lock();
225 sp = rb_entry(pos, struct nfs4_state_owner, so_client_node); 289 list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
226 cred = get_rpccred(sp->so_cred); 290 cred = nfs4_get_setclientid_cred_server(server);
291 if (cred != NULL)
292 break;
227 } 293 }
294 rcu_read_unlock();
295
228out: 296out:
229 spin_unlock(&clp->cl_lock);
230 return cred; 297 return cred;
231} 298}
232 299
233static void nfs_alloc_unique_id(struct rb_root *root, struct nfs_unique_id *new, 300static void nfs_alloc_unique_id_locked(struct rb_root *root,
234 __u64 minval, int maxbits) 301 struct nfs_unique_id *new,
302 __u64 minval, int maxbits)
235{ 303{
236 struct rb_node **p, *parent; 304 struct rb_node **p, *parent;
237 struct nfs_unique_id *pos; 305 struct nfs_unique_id *pos;
@@ -286,16 +354,15 @@ static void nfs_free_unique_id(struct rb_root *root, struct nfs_unique_id *id)
286} 354}
287 355
288static struct nfs4_state_owner * 356static struct nfs4_state_owner *
289nfs4_find_state_owner(struct nfs_server *server, struct rpc_cred *cred) 357nfs4_find_state_owner_locked(struct nfs_server *server, struct rpc_cred *cred)
290{ 358{
291 struct nfs_client *clp = server->nfs_client; 359 struct rb_node **p = &server->state_owners.rb_node,
292 struct rb_node **p = &clp->cl_state_owners.rb_node,
293 *parent = NULL; 360 *parent = NULL;
294 struct nfs4_state_owner *sp, *res = NULL; 361 struct nfs4_state_owner *sp, *res = NULL;
295 362
296 while (*p != NULL) { 363 while (*p != NULL) {
297 parent = *p; 364 parent = *p;
298 sp = rb_entry(parent, struct nfs4_state_owner, so_client_node); 365 sp = rb_entry(parent, struct nfs4_state_owner, so_server_node);
299 366
300 if (server < sp->so_server) { 367 if (server < sp->so_server) {
301 p = &parent->rb_left; 368 p = &parent->rb_left;
@@ -319,24 +386,17 @@ nfs4_find_state_owner(struct nfs_server *server, struct rpc_cred *cred)
319} 386}
320 387
321static struct nfs4_state_owner * 388static struct nfs4_state_owner *
322nfs4_insert_state_owner(struct nfs_client *clp, struct nfs4_state_owner *new) 389nfs4_insert_state_owner_locked(struct nfs4_state_owner *new)
323{ 390{
324 struct rb_node **p = &clp->cl_state_owners.rb_node, 391 struct nfs_server *server = new->so_server;
392 struct rb_node **p = &server->state_owners.rb_node,
325 *parent = NULL; 393 *parent = NULL;
326 struct nfs4_state_owner *sp; 394 struct nfs4_state_owner *sp;
327 395
328 while (*p != NULL) { 396 while (*p != NULL) {
329 parent = *p; 397 parent = *p;
330 sp = rb_entry(parent, struct nfs4_state_owner, so_client_node); 398 sp = rb_entry(parent, struct nfs4_state_owner, so_server_node);
331 399
332 if (new->so_server < sp->so_server) {
333 p = &parent->rb_left;
334 continue;
335 }
336 if (new->so_server > sp->so_server) {
337 p = &parent->rb_right;
338 continue;
339 }
340 if (new->so_cred < sp->so_cred) 400 if (new->so_cred < sp->so_cred)
341 p = &parent->rb_left; 401 p = &parent->rb_left;
342 else if (new->so_cred > sp->so_cred) 402 else if (new->so_cred > sp->so_cred)
@@ -346,18 +406,21 @@ nfs4_insert_state_owner(struct nfs_client *clp, struct nfs4_state_owner *new)
346 return sp; 406 return sp;
347 } 407 }
348 } 408 }
349 nfs_alloc_unique_id(&clp->cl_openowner_id, &new->so_owner_id, 1, 64); 409 nfs_alloc_unique_id_locked(&server->openowner_id,
350 rb_link_node(&new->so_client_node, parent, p); 410 &new->so_owner_id, 1, 64);
351 rb_insert_color(&new->so_client_node, &clp->cl_state_owners); 411 rb_link_node(&new->so_server_node, parent, p);
412 rb_insert_color(&new->so_server_node, &server->state_owners);
352 return new; 413 return new;
353} 414}
354 415
355static void 416static void
356nfs4_remove_state_owner(struct nfs_client *clp, struct nfs4_state_owner *sp) 417nfs4_remove_state_owner_locked(struct nfs4_state_owner *sp)
357{ 418{
358 if (!RB_EMPTY_NODE(&sp->so_client_node)) 419 struct nfs_server *server = sp->so_server;
359 rb_erase(&sp->so_client_node, &clp->cl_state_owners); 420
360 nfs_free_unique_id(&clp->cl_openowner_id, &sp->so_owner_id); 421 if (!RB_EMPTY_NODE(&sp->so_server_node))
422 rb_erase(&sp->so_server_node, &server->state_owners);
423 nfs_free_unique_id(&server->openowner_id, &sp->so_owner_id);
361} 424}
362 425
363/* 426/*
@@ -386,23 +449,32 @@ nfs4_alloc_state_owner(void)
386static void 449static void
387nfs4_drop_state_owner(struct nfs4_state_owner *sp) 450nfs4_drop_state_owner(struct nfs4_state_owner *sp)
388{ 451{
389 if (!RB_EMPTY_NODE(&sp->so_client_node)) { 452 if (!RB_EMPTY_NODE(&sp->so_server_node)) {
390 struct nfs_client *clp = sp->so_server->nfs_client; 453 struct nfs_server *server = sp->so_server;
454 struct nfs_client *clp = server->nfs_client;
391 455
392 spin_lock(&clp->cl_lock); 456 spin_lock(&clp->cl_lock);
393 rb_erase(&sp->so_client_node, &clp->cl_state_owners); 457 rb_erase(&sp->so_server_node, &server->state_owners);
394 RB_CLEAR_NODE(&sp->so_client_node); 458 RB_CLEAR_NODE(&sp->so_server_node);
395 spin_unlock(&clp->cl_lock); 459 spin_unlock(&clp->cl_lock);
396 } 460 }
397} 461}
398 462
399struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server, struct rpc_cred *cred) 463/**
464 * nfs4_get_state_owner - Look up a state owner given a credential
465 * @server: nfs_server to search
466 * @cred: RPC credential to match
467 *
468 * Returns a pointer to an instantiated nfs4_state_owner struct, or NULL.
469 */
470struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server,
471 struct rpc_cred *cred)
400{ 472{
401 struct nfs_client *clp = server->nfs_client; 473 struct nfs_client *clp = server->nfs_client;
402 struct nfs4_state_owner *sp, *new; 474 struct nfs4_state_owner *sp, *new;
403 475
404 spin_lock(&clp->cl_lock); 476 spin_lock(&clp->cl_lock);
405 sp = nfs4_find_state_owner(server, cred); 477 sp = nfs4_find_state_owner_locked(server, cred);
406 spin_unlock(&clp->cl_lock); 478 spin_unlock(&clp->cl_lock);
407 if (sp != NULL) 479 if (sp != NULL)
408 return sp; 480 return sp;
@@ -412,7 +484,7 @@ struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server, struct
412 new->so_server = server; 484 new->so_server = server;
413 new->so_cred = cred; 485 new->so_cred = cred;
414 spin_lock(&clp->cl_lock); 486 spin_lock(&clp->cl_lock);
415 sp = nfs4_insert_state_owner(clp, new); 487 sp = nfs4_insert_state_owner_locked(new);
416 spin_unlock(&clp->cl_lock); 488 spin_unlock(&clp->cl_lock);
417 if (sp == new) 489 if (sp == new)
418 get_rpccred(cred); 490 get_rpccred(cred);
@@ -423,6 +495,11 @@ struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server, struct
423 return sp; 495 return sp;
424} 496}
425 497
498/**
499 * nfs4_put_state_owner - Release a nfs4_state_owner
500 * @sp: state owner data to release
501 *
502 */
426void nfs4_put_state_owner(struct nfs4_state_owner *sp) 503void nfs4_put_state_owner(struct nfs4_state_owner *sp)
427{ 504{
428 struct nfs_client *clp = sp->so_server->nfs_client; 505 struct nfs_client *clp = sp->so_server->nfs_client;
@@ -430,7 +507,7 @@ void nfs4_put_state_owner(struct nfs4_state_owner *sp)
430 507
431 if (!atomic_dec_and_lock(&sp->so_count, &clp->cl_lock)) 508 if (!atomic_dec_and_lock(&sp->so_count, &clp->cl_lock))
432 return; 509 return;
433 nfs4_remove_state_owner(clp, sp); 510 nfs4_remove_state_owner_locked(sp);
434 spin_unlock(&clp->cl_lock); 511 spin_unlock(&clp->cl_lock);
435 rpc_destroy_wait_queue(&sp->so_sequence.wait); 512 rpc_destroy_wait_queue(&sp->so_sequence.wait);
436 put_rpccred(cred); 513 put_rpccred(cred);
@@ -585,8 +662,11 @@ static void __nfs4_close(struct path *path, struct nfs4_state *state,
585 if (!call_close) { 662 if (!call_close) {
586 nfs4_put_open_state(state); 663 nfs4_put_open_state(state);
587 nfs4_put_state_owner(owner); 664 nfs4_put_state_owner(owner);
588 } else 665 } else {
589 nfs4_do_close(path, state, gfp_mask, wait); 666 bool roc = pnfs_roc(state->inode);
667
668 nfs4_do_close(path, state, gfp_mask, wait, roc);
669 }
590} 670}
591 671
592void nfs4_close_state(struct path *path, struct nfs4_state *state, fmode_t fmode) 672void nfs4_close_state(struct path *path, struct nfs4_state *state, fmode_t fmode)
@@ -633,7 +713,8 @@ __nfs4_find_lock_state(struct nfs4_state *state, fl_owner_t fl_owner, pid_t fl_p
633static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, fl_owner_t fl_owner, pid_t fl_pid, unsigned int type) 713static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, fl_owner_t fl_owner, pid_t fl_pid, unsigned int type)
634{ 714{
635 struct nfs4_lock_state *lsp; 715 struct nfs4_lock_state *lsp;
636 struct nfs_client *clp = state->owner->so_server->nfs_client; 716 struct nfs_server *server = state->owner->so_server;
717 struct nfs_client *clp = server->nfs_client;
637 718
638 lsp = kzalloc(sizeof(*lsp), GFP_NOFS); 719 lsp = kzalloc(sizeof(*lsp), GFP_NOFS);
639 if (lsp == NULL) 720 if (lsp == NULL)
@@ -657,7 +738,7 @@ static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, f
657 return NULL; 738 return NULL;
658 } 739 }
659 spin_lock(&clp->cl_lock); 740 spin_lock(&clp->cl_lock);
660 nfs_alloc_unique_id(&clp->cl_lockowner_id, &lsp->ls_id, 1, 64); 741 nfs_alloc_unique_id_locked(&server->lockowner_id, &lsp->ls_id, 1, 64);
661 spin_unlock(&clp->cl_lock); 742 spin_unlock(&clp->cl_lock);
662 INIT_LIST_HEAD(&lsp->ls_locks); 743 INIT_LIST_HEAD(&lsp->ls_locks);
663 return lsp; 744 return lsp;
@@ -665,10 +746,11 @@ static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, f
665 746
666static void nfs4_free_lock_state(struct nfs4_lock_state *lsp) 747static void nfs4_free_lock_state(struct nfs4_lock_state *lsp)
667{ 748{
668 struct nfs_client *clp = lsp->ls_state->owner->so_server->nfs_client; 749 struct nfs_server *server = lsp->ls_state->owner->so_server;
750 struct nfs_client *clp = server->nfs_client;
669 751
670 spin_lock(&clp->cl_lock); 752 spin_lock(&clp->cl_lock);
671 nfs_free_unique_id(&clp->cl_lockowner_id, &lsp->ls_id); 753 nfs_free_unique_id(&server->lockowner_id, &lsp->ls_id);
672 spin_unlock(&clp->cl_lock); 754 spin_unlock(&clp->cl_lock);
673 rpc_destroy_wait_queue(&lsp->ls_sequence.wait); 755 rpc_destroy_wait_queue(&lsp->ls_sequence.wait);
674 kfree(lsp); 756 kfree(lsp);
@@ -1114,15 +1196,19 @@ static void nfs4_clear_open_state(struct nfs4_state *state)
1114 } 1196 }
1115} 1197}
1116 1198
1117static void nfs4_state_mark_reclaim_helper(struct nfs_client *clp, int (*mark_reclaim)(struct nfs_client *clp, struct nfs4_state *state)) 1199static void nfs4_reset_seqids(struct nfs_server *server,
1200 int (*mark_reclaim)(struct nfs_client *clp, struct nfs4_state *state))
1118{ 1201{
1202 struct nfs_client *clp = server->nfs_client;
1119 struct nfs4_state_owner *sp; 1203 struct nfs4_state_owner *sp;
1120 struct rb_node *pos; 1204 struct rb_node *pos;
1121 struct nfs4_state *state; 1205 struct nfs4_state *state;
1122 1206
1123 /* Reset all sequence ids to zero */ 1207 spin_lock(&clp->cl_lock);
1124 for (pos = rb_first(&clp->cl_state_owners); pos != NULL; pos = rb_next(pos)) { 1208 for (pos = rb_first(&server->state_owners);
1125 sp = rb_entry(pos, struct nfs4_state_owner, so_client_node); 1209 pos != NULL;
1210 pos = rb_next(pos)) {
1211 sp = rb_entry(pos, struct nfs4_state_owner, so_server_node);
1126 sp->so_seqid.flags = 0; 1212 sp->so_seqid.flags = 0;
1127 spin_lock(&sp->so_lock); 1213 spin_lock(&sp->so_lock);
1128 list_for_each_entry(state, &sp->so_states, open_states) { 1214 list_for_each_entry(state, &sp->so_states, open_states) {
@@ -1131,6 +1217,18 @@ static void nfs4_state_mark_reclaim_helper(struct nfs_client *clp, int (*mark_re
1131 } 1217 }
1132 spin_unlock(&sp->so_lock); 1218 spin_unlock(&sp->so_lock);
1133 } 1219 }
1220 spin_unlock(&clp->cl_lock);
1221}
1222
1223static void nfs4_state_mark_reclaim_helper(struct nfs_client *clp,
1224 int (*mark_reclaim)(struct nfs_client *clp, struct nfs4_state *state))
1225{
1226 struct nfs_server *server;
1227
1228 rcu_read_lock();
1229 list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link)
1230 nfs4_reset_seqids(server, mark_reclaim);
1231 rcu_read_unlock();
1134} 1232}
1135 1233
1136static void nfs4_state_start_reclaim_reboot(struct nfs_client *clp) 1234static void nfs4_state_start_reclaim_reboot(struct nfs_client *clp)
@@ -1148,25 +1246,41 @@ static void nfs4_reclaim_complete(struct nfs_client *clp,
1148 (void)ops->reclaim_complete(clp); 1246 (void)ops->reclaim_complete(clp);
1149} 1247}
1150 1248
1151static int nfs4_state_clear_reclaim_reboot(struct nfs_client *clp) 1249static void nfs4_clear_reclaim_server(struct nfs_server *server)
1152{ 1250{
1251 struct nfs_client *clp = server->nfs_client;
1153 struct nfs4_state_owner *sp; 1252 struct nfs4_state_owner *sp;
1154 struct rb_node *pos; 1253 struct rb_node *pos;
1155 struct nfs4_state *state; 1254 struct nfs4_state *state;
1156 1255
1157 if (!test_and_clear_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state)) 1256 spin_lock(&clp->cl_lock);
1158 return 0; 1257 for (pos = rb_first(&server->state_owners);
1159 1258 pos != NULL;
1160 for (pos = rb_first(&clp->cl_state_owners); pos != NULL; pos = rb_next(pos)) { 1259 pos = rb_next(pos)) {
1161 sp = rb_entry(pos, struct nfs4_state_owner, so_client_node); 1260 sp = rb_entry(pos, struct nfs4_state_owner, so_server_node);
1162 spin_lock(&sp->so_lock); 1261 spin_lock(&sp->so_lock);
1163 list_for_each_entry(state, &sp->so_states, open_states) { 1262 list_for_each_entry(state, &sp->so_states, open_states) {
1164 if (!test_and_clear_bit(NFS_STATE_RECLAIM_REBOOT, &state->flags)) 1263 if (!test_and_clear_bit(NFS_STATE_RECLAIM_REBOOT,
1264 &state->flags))
1165 continue; 1265 continue;
1166 nfs4_state_mark_reclaim_nograce(clp, state); 1266 nfs4_state_mark_reclaim_nograce(clp, state);
1167 } 1267 }
1168 spin_unlock(&sp->so_lock); 1268 spin_unlock(&sp->so_lock);
1169 } 1269 }
1270 spin_unlock(&clp->cl_lock);
1271}
1272
1273static int nfs4_state_clear_reclaim_reboot(struct nfs_client *clp)
1274{
1275 struct nfs_server *server;
1276
1277 if (!test_and_clear_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state))
1278 return 0;
1279
1280 rcu_read_lock();
1281 list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link)
1282 nfs4_clear_reclaim_server(server);
1283 rcu_read_unlock();
1170 1284
1171 nfs_delegation_reap_unclaimed(clp); 1285 nfs_delegation_reap_unclaimed(clp);
1172 return 1; 1286 return 1;
@@ -1238,27 +1352,40 @@ static int nfs4_recovery_handle_error(struct nfs_client *clp, int error)
1238 1352
1239static int nfs4_do_reclaim(struct nfs_client *clp, const struct nfs4_state_recovery_ops *ops) 1353static int nfs4_do_reclaim(struct nfs_client *clp, const struct nfs4_state_recovery_ops *ops)
1240{ 1354{
1355 struct nfs4_state_owner *sp;
1356 struct nfs_server *server;
1241 struct rb_node *pos; 1357 struct rb_node *pos;
1242 int status = 0; 1358 int status = 0;
1243 1359
1244restart: 1360restart:
1245 spin_lock(&clp->cl_lock); 1361 rcu_read_lock();
1246 for (pos = rb_first(&clp->cl_state_owners); pos != NULL; pos = rb_next(pos)) { 1362 list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
1247 struct nfs4_state_owner *sp = rb_entry(pos, struct nfs4_state_owner, so_client_node); 1363 spin_lock(&clp->cl_lock);
1248 if (!test_and_clear_bit(ops->owner_flag_bit, &sp->so_flags)) 1364 for (pos = rb_first(&server->state_owners);
1249 continue; 1365 pos != NULL;
1250 atomic_inc(&sp->so_count); 1366 pos = rb_next(pos)) {
1251 spin_unlock(&clp->cl_lock); 1367 sp = rb_entry(pos,
1252 status = nfs4_reclaim_open_state(sp, ops); 1368 struct nfs4_state_owner, so_server_node);
1253 if (status < 0) { 1369 if (!test_and_clear_bit(ops->owner_flag_bit,
1254 set_bit(ops->owner_flag_bit, &sp->so_flags); 1370 &sp->so_flags))
1371 continue;
1372 atomic_inc(&sp->so_count);
1373 spin_unlock(&clp->cl_lock);
1374 rcu_read_unlock();
1375
1376 status = nfs4_reclaim_open_state(sp, ops);
1377 if (status < 0) {
1378 set_bit(ops->owner_flag_bit, &sp->so_flags);
1379 nfs4_put_state_owner(sp);
1380 return nfs4_recovery_handle_error(clp, status);
1381 }
1382
1255 nfs4_put_state_owner(sp); 1383 nfs4_put_state_owner(sp);
1256 return nfs4_recovery_handle_error(clp, status); 1384 goto restart;
1257 } 1385 }
1258 nfs4_put_state_owner(sp); 1386 spin_unlock(&clp->cl_lock);
1259 goto restart;
1260 } 1387 }
1261 spin_unlock(&clp->cl_lock); 1388 rcu_read_unlock();
1262 return status; 1389 return status;
1263} 1390}
1264 1391
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index f313c4cce7e..4e2c168b6ee 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -71,8 +71,8 @@ static int nfs4_stat_to_errno(int);
71/* lock,open owner id: 71/* lock,open owner id:
72 * we currently use size 2 (u64) out of (NFS4_OPAQUE_LIMIT >> 2) 72 * we currently use size 2 (u64) out of (NFS4_OPAQUE_LIMIT >> 2)
73 */ 73 */
74#define open_owner_id_maxsz (1 + 4) 74#define open_owner_id_maxsz (1 + 1 + 4)
75#define lock_owner_id_maxsz (1 + 4) 75#define lock_owner_id_maxsz (1 + 1 + 4)
76#define decode_lockowner_maxsz (1 + XDR_QUADLEN(IDMAP_NAMESZ)) 76#define decode_lockowner_maxsz (1 + XDR_QUADLEN(IDMAP_NAMESZ))
77#define compound_encode_hdr_maxsz (3 + (NFS4_MAXTAGLEN >> 2)) 77#define compound_encode_hdr_maxsz (3 + (NFS4_MAXTAGLEN >> 2))
78#define compound_decode_hdr_maxsz (3 + (NFS4_MAXTAGLEN >> 2)) 78#define compound_decode_hdr_maxsz (3 + (NFS4_MAXTAGLEN >> 2))
@@ -1088,10 +1088,11 @@ static void encode_lockowner(struct xdr_stream *xdr, const struct nfs_lowner *lo
1088{ 1088{
1089 __be32 *p; 1089 __be32 *p;
1090 1090
1091 p = reserve_space(xdr, 28); 1091 p = reserve_space(xdr, 32);
1092 p = xdr_encode_hyper(p, lowner->clientid); 1092 p = xdr_encode_hyper(p, lowner->clientid);
1093 *p++ = cpu_to_be32(16); 1093 *p++ = cpu_to_be32(20);
1094 p = xdr_encode_opaque_fixed(p, "lock id:", 8); 1094 p = xdr_encode_opaque_fixed(p, "lock id:", 8);
1095 *p++ = cpu_to_be32(lowner->s_dev);
1095 xdr_encode_hyper(p, lowner->id); 1096 xdr_encode_hyper(p, lowner->id);
1096} 1097}
1097 1098
@@ -1210,10 +1211,11 @@ static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_opena
1210 *p++ = cpu_to_be32(OP_OPEN); 1211 *p++ = cpu_to_be32(OP_OPEN);
1211 *p = cpu_to_be32(arg->seqid->sequence->counter); 1212 *p = cpu_to_be32(arg->seqid->sequence->counter);
1212 encode_share_access(xdr, arg->fmode); 1213 encode_share_access(xdr, arg->fmode);
1213 p = reserve_space(xdr, 28); 1214 p = reserve_space(xdr, 32);
1214 p = xdr_encode_hyper(p, arg->clientid); 1215 p = xdr_encode_hyper(p, arg->clientid);
1215 *p++ = cpu_to_be32(16); 1216 *p++ = cpu_to_be32(20);
1216 p = xdr_encode_opaque_fixed(p, "open id:", 8); 1217 p = xdr_encode_opaque_fixed(p, "open id:", 8);
1218 *p++ = cpu_to_be32(arg->server->s_dev);
1217 xdr_encode_hyper(p, arg->id); 1219 xdr_encode_hyper(p, arg->id);
1218} 1220}
1219 1221
@@ -1510,7 +1512,7 @@ encode_restorefh(struct xdr_stream *xdr, struct compound_hdr *hdr)
1510 hdr->replen += decode_restorefh_maxsz; 1512 hdr->replen += decode_restorefh_maxsz;
1511} 1513}
1512 1514
1513static int 1515static void
1514encode_setacl(struct xdr_stream *xdr, struct nfs_setaclargs *arg, struct compound_hdr *hdr) 1516encode_setacl(struct xdr_stream *xdr, struct nfs_setaclargs *arg, struct compound_hdr *hdr)
1515{ 1517{
1516 __be32 *p; 1518 __be32 *p;
@@ -1521,14 +1523,12 @@ encode_setacl(struct xdr_stream *xdr, struct nfs_setaclargs *arg, struct compoun
1521 p = reserve_space(xdr, 2*4); 1523 p = reserve_space(xdr, 2*4);
1522 *p++ = cpu_to_be32(1); 1524 *p++ = cpu_to_be32(1);
1523 *p = cpu_to_be32(FATTR4_WORD0_ACL); 1525 *p = cpu_to_be32(FATTR4_WORD0_ACL);
1524 if (arg->acl_len % 4) 1526 BUG_ON(arg->acl_len % 4);
1525 return -EINVAL;
1526 p = reserve_space(xdr, 4); 1527 p = reserve_space(xdr, 4);
1527 *p = cpu_to_be32(arg->acl_len); 1528 *p = cpu_to_be32(arg->acl_len);
1528 xdr_write_pages(xdr, arg->acl_pages, arg->acl_pgbase, arg->acl_len); 1529 xdr_write_pages(xdr, arg->acl_pages, arg->acl_pgbase, arg->acl_len);
1529 hdr->nops++; 1530 hdr->nops++;
1530 hdr->replen += decode_setacl_maxsz; 1531 hdr->replen += decode_setacl_maxsz;
1531 return 0;
1532} 1532}
1533 1533
1534static void 1534static void
@@ -1789,7 +1789,6 @@ encode_layoutget(struct xdr_stream *xdr,
1789 const struct nfs4_layoutget_args *args, 1789 const struct nfs4_layoutget_args *args,
1790 struct compound_hdr *hdr) 1790 struct compound_hdr *hdr)
1791{ 1791{
1792 nfs4_stateid stateid;
1793 __be32 *p; 1792 __be32 *p;
1794 1793
1795 p = reserve_space(xdr, 44 + NFS4_STATEID_SIZE); 1794 p = reserve_space(xdr, 44 + NFS4_STATEID_SIZE);
@@ -1800,9 +1799,7 @@ encode_layoutget(struct xdr_stream *xdr,
1800 p = xdr_encode_hyper(p, args->range.offset); 1799 p = xdr_encode_hyper(p, args->range.offset);
1801 p = xdr_encode_hyper(p, args->range.length); 1800 p = xdr_encode_hyper(p, args->range.length);
1802 p = xdr_encode_hyper(p, args->minlength); 1801 p = xdr_encode_hyper(p, args->minlength);
1803 pnfs_get_layout_stateid(&stateid, NFS_I(args->inode)->layout, 1802 p = xdr_encode_opaque_fixed(p, &args->stateid.data, NFS4_STATEID_SIZE);
1804 args->ctx->state);
1805 p = xdr_encode_opaque_fixed(p, &stateid.data, NFS4_STATEID_SIZE);
1806 *p = cpu_to_be32(args->maxcount); 1803 *p = cpu_to_be32(args->maxcount);
1807 1804
1808 dprintk("%s: 1st type:0x%x iomode:%d off:%lu len:%lu mc:%d\n", 1805 dprintk("%s: 1st type:0x%x iomode:%d off:%lu len:%lu mc:%d\n",
@@ -1833,393 +1830,362 @@ static u32 nfs4_xdr_minorversion(const struct nfs4_sequence_args *args)
1833/* 1830/*
1834 * Encode an ACCESS request 1831 * Encode an ACCESS request
1835 */ 1832 */
1836static int nfs4_xdr_enc_access(struct rpc_rqst *req, __be32 *p, const struct nfs4_accessargs *args) 1833static void nfs4_xdr_enc_access(struct rpc_rqst *req, struct xdr_stream *xdr,
1834 const struct nfs4_accessargs *args)
1837{ 1835{
1838 struct xdr_stream xdr;
1839 struct compound_hdr hdr = { 1836 struct compound_hdr hdr = {
1840 .minorversion = nfs4_xdr_minorversion(&args->seq_args), 1837 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
1841 }; 1838 };
1842 1839
1843 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1840 encode_compound_hdr(xdr, req, &hdr);
1844 encode_compound_hdr(&xdr, req, &hdr); 1841 encode_sequence(xdr, &args->seq_args, &hdr);
1845 encode_sequence(&xdr, &args->seq_args, &hdr); 1842 encode_putfh(xdr, args->fh, &hdr);
1846 encode_putfh(&xdr, args->fh, &hdr); 1843 encode_access(xdr, args->access, &hdr);
1847 encode_access(&xdr, args->access, &hdr); 1844 encode_getfattr(xdr, args->bitmask, &hdr);
1848 encode_getfattr(&xdr, args->bitmask, &hdr);
1849 encode_nops(&hdr); 1845 encode_nops(&hdr);
1850 return 0;
1851} 1846}
1852 1847
1853/* 1848/*
1854 * Encode LOOKUP request 1849 * Encode LOOKUP request
1855 */ 1850 */
1856static int nfs4_xdr_enc_lookup(struct rpc_rqst *req, __be32 *p, const struct nfs4_lookup_arg *args) 1851static void nfs4_xdr_enc_lookup(struct rpc_rqst *req, struct xdr_stream *xdr,
1852 const struct nfs4_lookup_arg *args)
1857{ 1853{
1858 struct xdr_stream xdr;
1859 struct compound_hdr hdr = { 1854 struct compound_hdr hdr = {
1860 .minorversion = nfs4_xdr_minorversion(&args->seq_args), 1855 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
1861 }; 1856 };
1862 1857
1863 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1858 encode_compound_hdr(xdr, req, &hdr);
1864 encode_compound_hdr(&xdr, req, &hdr); 1859 encode_sequence(xdr, &args->seq_args, &hdr);
1865 encode_sequence(&xdr, &args->seq_args, &hdr); 1860 encode_putfh(xdr, args->dir_fh, &hdr);
1866 encode_putfh(&xdr, args->dir_fh, &hdr); 1861 encode_lookup(xdr, args->name, &hdr);
1867 encode_lookup(&xdr, args->name, &hdr); 1862 encode_getfh(xdr, &hdr);
1868 encode_getfh(&xdr, &hdr); 1863 encode_getfattr(xdr, args->bitmask, &hdr);
1869 encode_getfattr(&xdr, args->bitmask, &hdr);
1870 encode_nops(&hdr); 1864 encode_nops(&hdr);
1871 return 0;
1872} 1865}
1873 1866
1874/* 1867/*
1875 * Encode LOOKUP_ROOT request 1868 * Encode LOOKUP_ROOT request
1876 */ 1869 */
1877static int nfs4_xdr_enc_lookup_root(struct rpc_rqst *req, __be32 *p, const struct nfs4_lookup_root_arg *args) 1870static void nfs4_xdr_enc_lookup_root(struct rpc_rqst *req,
1871 struct xdr_stream *xdr,
1872 const struct nfs4_lookup_root_arg *args)
1878{ 1873{
1879 struct xdr_stream xdr;
1880 struct compound_hdr hdr = { 1874 struct compound_hdr hdr = {
1881 .minorversion = nfs4_xdr_minorversion(&args->seq_args), 1875 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
1882 }; 1876 };
1883 1877
1884 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1878 encode_compound_hdr(xdr, req, &hdr);
1885 encode_compound_hdr(&xdr, req, &hdr); 1879 encode_sequence(xdr, &args->seq_args, &hdr);
1886 encode_sequence(&xdr, &args->seq_args, &hdr); 1880 encode_putrootfh(xdr, &hdr);
1887 encode_putrootfh(&xdr, &hdr); 1881 encode_getfh(xdr, &hdr);
1888 encode_getfh(&xdr, &hdr); 1882 encode_getfattr(xdr, args->bitmask, &hdr);
1889 encode_getfattr(&xdr, args->bitmask, &hdr);
1890 encode_nops(&hdr); 1883 encode_nops(&hdr);
1891 return 0;
1892} 1884}
1893 1885
1894/* 1886/*
1895 * Encode REMOVE request 1887 * Encode REMOVE request
1896 */ 1888 */
1897static int nfs4_xdr_enc_remove(struct rpc_rqst *req, __be32 *p, const struct nfs_removeargs *args) 1889static void nfs4_xdr_enc_remove(struct rpc_rqst *req, struct xdr_stream *xdr,
1890 const struct nfs_removeargs *args)
1898{ 1891{
1899 struct xdr_stream xdr;
1900 struct compound_hdr hdr = { 1892 struct compound_hdr hdr = {
1901 .minorversion = nfs4_xdr_minorversion(&args->seq_args), 1893 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
1902 }; 1894 };
1903 1895
1904 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1896 encode_compound_hdr(xdr, req, &hdr);
1905 encode_compound_hdr(&xdr, req, &hdr); 1897 encode_sequence(xdr, &args->seq_args, &hdr);
1906 encode_sequence(&xdr, &args->seq_args, &hdr); 1898 encode_putfh(xdr, args->fh, &hdr);
1907 encode_putfh(&xdr, args->fh, &hdr); 1899 encode_remove(xdr, &args->name, &hdr);
1908 encode_remove(&xdr, &args->name, &hdr); 1900 encode_getfattr(xdr, args->bitmask, &hdr);
1909 encode_getfattr(&xdr, args->bitmask, &hdr);
1910 encode_nops(&hdr); 1901 encode_nops(&hdr);
1911 return 0;
1912} 1902}
1913 1903
1914/* 1904/*
1915 * Encode RENAME request 1905 * Encode RENAME request
1916 */ 1906 */
1917static int nfs4_xdr_enc_rename(struct rpc_rqst *req, __be32 *p, const struct nfs_renameargs *args) 1907static void nfs4_xdr_enc_rename(struct rpc_rqst *req, struct xdr_stream *xdr,
1908 const struct nfs_renameargs *args)
1918{ 1909{
1919 struct xdr_stream xdr;
1920 struct compound_hdr hdr = { 1910 struct compound_hdr hdr = {
1921 .minorversion = nfs4_xdr_minorversion(&args->seq_args), 1911 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
1922 }; 1912 };
1923 1913
1924 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1914 encode_compound_hdr(xdr, req, &hdr);
1925 encode_compound_hdr(&xdr, req, &hdr); 1915 encode_sequence(xdr, &args->seq_args, &hdr);
1926 encode_sequence(&xdr, &args->seq_args, &hdr); 1916 encode_putfh(xdr, args->old_dir, &hdr);
1927 encode_putfh(&xdr, args->old_dir, &hdr); 1917 encode_savefh(xdr, &hdr);
1928 encode_savefh(&xdr, &hdr); 1918 encode_putfh(xdr, args->new_dir, &hdr);
1929 encode_putfh(&xdr, args->new_dir, &hdr); 1919 encode_rename(xdr, args->old_name, args->new_name, &hdr);
1930 encode_rename(&xdr, args->old_name, args->new_name, &hdr); 1920 encode_getfattr(xdr, args->bitmask, &hdr);
1931 encode_getfattr(&xdr, args->bitmask, &hdr); 1921 encode_restorefh(xdr, &hdr);
1932 encode_restorefh(&xdr, &hdr); 1922 encode_getfattr(xdr, args->bitmask, &hdr);
1933 encode_getfattr(&xdr, args->bitmask, &hdr);
1934 encode_nops(&hdr); 1923 encode_nops(&hdr);
1935 return 0;
1936} 1924}
1937 1925
1938/* 1926/*
1939 * Encode LINK request 1927 * Encode LINK request
1940 */ 1928 */
1941static int nfs4_xdr_enc_link(struct rpc_rqst *req, __be32 *p, const struct nfs4_link_arg *args) 1929static void nfs4_xdr_enc_link(struct rpc_rqst *req, struct xdr_stream *xdr,
1930 const struct nfs4_link_arg *args)
1942{ 1931{
1943 struct xdr_stream xdr;
1944 struct compound_hdr hdr = { 1932 struct compound_hdr hdr = {
1945 .minorversion = nfs4_xdr_minorversion(&args->seq_args), 1933 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
1946 }; 1934 };
1947 1935
1948 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1936 encode_compound_hdr(xdr, req, &hdr);
1949 encode_compound_hdr(&xdr, req, &hdr); 1937 encode_sequence(xdr, &args->seq_args, &hdr);
1950 encode_sequence(&xdr, &args->seq_args, &hdr); 1938 encode_putfh(xdr, args->fh, &hdr);
1951 encode_putfh(&xdr, args->fh, &hdr); 1939 encode_savefh(xdr, &hdr);
1952 encode_savefh(&xdr, &hdr); 1940 encode_putfh(xdr, args->dir_fh, &hdr);
1953 encode_putfh(&xdr, args->dir_fh, &hdr); 1941 encode_link(xdr, args->name, &hdr);
1954 encode_link(&xdr, args->name, &hdr); 1942 encode_getfattr(xdr, args->bitmask, &hdr);
1955 encode_getfattr(&xdr, args->bitmask, &hdr); 1943 encode_restorefh(xdr, &hdr);
1956 encode_restorefh(&xdr, &hdr); 1944 encode_getfattr(xdr, args->bitmask, &hdr);
1957 encode_getfattr(&xdr, args->bitmask, &hdr);
1958 encode_nops(&hdr); 1945 encode_nops(&hdr);
1959 return 0;
1960} 1946}
1961 1947
1962/* 1948/*
1963 * Encode CREATE request 1949 * Encode CREATE request
1964 */ 1950 */
1965static int nfs4_xdr_enc_create(struct rpc_rqst *req, __be32 *p, const struct nfs4_create_arg *args) 1951static void nfs4_xdr_enc_create(struct rpc_rqst *req, struct xdr_stream *xdr,
1952 const struct nfs4_create_arg *args)
1966{ 1953{
1967 struct xdr_stream xdr;
1968 struct compound_hdr hdr = { 1954 struct compound_hdr hdr = {
1969 .minorversion = nfs4_xdr_minorversion(&args->seq_args), 1955 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
1970 }; 1956 };
1971 1957
1972 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1958 encode_compound_hdr(xdr, req, &hdr);
1973 encode_compound_hdr(&xdr, req, &hdr); 1959 encode_sequence(xdr, &args->seq_args, &hdr);
1974 encode_sequence(&xdr, &args->seq_args, &hdr); 1960 encode_putfh(xdr, args->dir_fh, &hdr);
1975 encode_putfh(&xdr, args->dir_fh, &hdr); 1961 encode_savefh(xdr, &hdr);
1976 encode_savefh(&xdr, &hdr); 1962 encode_create(xdr, args, &hdr);
1977 encode_create(&xdr, args, &hdr); 1963 encode_getfh(xdr, &hdr);
1978 encode_getfh(&xdr, &hdr); 1964 encode_getfattr(xdr, args->bitmask, &hdr);
1979 encode_getfattr(&xdr, args->bitmask, &hdr); 1965 encode_restorefh(xdr, &hdr);
1980 encode_restorefh(&xdr, &hdr); 1966 encode_getfattr(xdr, args->bitmask, &hdr);
1981 encode_getfattr(&xdr, args->bitmask, &hdr);
1982 encode_nops(&hdr); 1967 encode_nops(&hdr);
1983 return 0;
1984} 1968}
1985 1969
1986/* 1970/*
1987 * Encode SYMLINK request 1971 * Encode SYMLINK request
1988 */ 1972 */
1989static int nfs4_xdr_enc_symlink(struct rpc_rqst *req, __be32 *p, const struct nfs4_create_arg *args) 1973static void nfs4_xdr_enc_symlink(struct rpc_rqst *req, struct xdr_stream *xdr,
1974 const struct nfs4_create_arg *args)
1990{ 1975{
1991 return nfs4_xdr_enc_create(req, p, args); 1976 nfs4_xdr_enc_create(req, xdr, args);
1992} 1977}
1993 1978
1994/* 1979/*
1995 * Encode GETATTR request 1980 * Encode GETATTR request
1996 */ 1981 */
1997static int nfs4_xdr_enc_getattr(struct rpc_rqst *req, __be32 *p, const struct nfs4_getattr_arg *args) 1982static void nfs4_xdr_enc_getattr(struct rpc_rqst *req, struct xdr_stream *xdr,
1983 const struct nfs4_getattr_arg *args)
1998{ 1984{
1999 struct xdr_stream xdr;
2000 struct compound_hdr hdr = { 1985 struct compound_hdr hdr = {
2001 .minorversion = nfs4_xdr_minorversion(&args->seq_args), 1986 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
2002 }; 1987 };
2003 1988
2004 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1989 encode_compound_hdr(xdr, req, &hdr);
2005 encode_compound_hdr(&xdr, req, &hdr); 1990 encode_sequence(xdr, &args->seq_args, &hdr);
2006 encode_sequence(&xdr, &args->seq_args, &hdr); 1991 encode_putfh(xdr, args->fh, &hdr);
2007 encode_putfh(&xdr, args->fh, &hdr); 1992 encode_getfattr(xdr, args->bitmask, &hdr);
2008 encode_getfattr(&xdr, args->bitmask, &hdr);
2009 encode_nops(&hdr); 1993 encode_nops(&hdr);
2010 return 0;
2011} 1994}
2012 1995
2013/* 1996/*
2014 * Encode a CLOSE request 1997 * Encode a CLOSE request
2015 */ 1998 */
2016static int nfs4_xdr_enc_close(struct rpc_rqst *req, __be32 *p, struct nfs_closeargs *args) 1999static void nfs4_xdr_enc_close(struct rpc_rqst *req, struct xdr_stream *xdr,
2000 struct nfs_closeargs *args)
2017{ 2001{
2018 struct xdr_stream xdr;
2019 struct compound_hdr hdr = { 2002 struct compound_hdr hdr = {
2020 .minorversion = nfs4_xdr_minorversion(&args->seq_args), 2003 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
2021 }; 2004 };
2022 2005
2023 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 2006 encode_compound_hdr(xdr, req, &hdr);
2024 encode_compound_hdr(&xdr, req, &hdr); 2007 encode_sequence(xdr, &args->seq_args, &hdr);
2025 encode_sequence(&xdr, &args->seq_args, &hdr); 2008 encode_putfh(xdr, args->fh, &hdr);
2026 encode_putfh(&xdr, args->fh, &hdr); 2009 encode_close(xdr, args, &hdr);
2027 encode_close(&xdr, args, &hdr); 2010 encode_getfattr(xdr, args->bitmask, &hdr);
2028 encode_getfattr(&xdr, args->bitmask, &hdr);
2029 encode_nops(&hdr); 2011 encode_nops(&hdr);
2030 return 0;
2031} 2012}
2032 2013
2033/* 2014/*
2034 * Encode an OPEN request 2015 * Encode an OPEN request
2035 */ 2016 */
2036static int nfs4_xdr_enc_open(struct rpc_rqst *req, __be32 *p, struct nfs_openargs *args) 2017static void nfs4_xdr_enc_open(struct rpc_rqst *req, struct xdr_stream *xdr,
2018 struct nfs_openargs *args)
2037{ 2019{
2038 struct xdr_stream xdr;
2039 struct compound_hdr hdr = { 2020 struct compound_hdr hdr = {
2040 .minorversion = nfs4_xdr_minorversion(&args->seq_args), 2021 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
2041 }; 2022 };
2042 2023
2043 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 2024 encode_compound_hdr(xdr, req, &hdr);
2044 encode_compound_hdr(&xdr, req, &hdr); 2025 encode_sequence(xdr, &args->seq_args, &hdr);
2045 encode_sequence(&xdr, &args->seq_args, &hdr); 2026 encode_putfh(xdr, args->fh, &hdr);
2046 encode_putfh(&xdr, args->fh, &hdr); 2027 encode_savefh(xdr, &hdr);
2047 encode_savefh(&xdr, &hdr); 2028 encode_open(xdr, args, &hdr);
2048 encode_open(&xdr, args, &hdr); 2029 encode_getfh(xdr, &hdr);
2049 encode_getfh(&xdr, &hdr); 2030 encode_getfattr(xdr, args->bitmask, &hdr);
2050 encode_getfattr(&xdr, args->bitmask, &hdr); 2031 encode_restorefh(xdr, &hdr);
2051 encode_restorefh(&xdr, &hdr); 2032 encode_getfattr(xdr, args->bitmask, &hdr);
2052 encode_getfattr(&xdr, args->bitmask, &hdr);
2053 encode_nops(&hdr); 2033 encode_nops(&hdr);
2054 return 0;
2055} 2034}
2056 2035
2057/* 2036/*
2058 * Encode an OPEN_CONFIRM request 2037 * Encode an OPEN_CONFIRM request
2059 */ 2038 */
2060static int nfs4_xdr_enc_open_confirm(struct rpc_rqst *req, __be32 *p, struct nfs_open_confirmargs *args) 2039static void nfs4_xdr_enc_open_confirm(struct rpc_rqst *req,
2040 struct xdr_stream *xdr,
2041 struct nfs_open_confirmargs *args)
2061{ 2042{
2062 struct xdr_stream xdr;
2063 struct compound_hdr hdr = { 2043 struct compound_hdr hdr = {
2064 .nops = 0, 2044 .nops = 0,
2065 }; 2045 };
2066 2046
2067 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 2047 encode_compound_hdr(xdr, req, &hdr);
2068 encode_compound_hdr(&xdr, req, &hdr); 2048 encode_putfh(xdr, args->fh, &hdr);
2069 encode_putfh(&xdr, args->fh, &hdr); 2049 encode_open_confirm(xdr, args, &hdr);
2070 encode_open_confirm(&xdr, args, &hdr);
2071 encode_nops(&hdr); 2050 encode_nops(&hdr);
2072 return 0;
2073} 2051}
2074 2052
2075/* 2053/*
2076 * Encode an OPEN request with no attributes. 2054 * Encode an OPEN request with no attributes.
2077 */ 2055 */
2078static int nfs4_xdr_enc_open_noattr(struct rpc_rqst *req, __be32 *p, struct nfs_openargs *args) 2056static void nfs4_xdr_enc_open_noattr(struct rpc_rqst *req,
2057 struct xdr_stream *xdr,
2058 struct nfs_openargs *args)
2079{ 2059{
2080 struct xdr_stream xdr;
2081 struct compound_hdr hdr = { 2060 struct compound_hdr hdr = {
2082 .minorversion = nfs4_xdr_minorversion(&args->seq_args), 2061 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
2083 }; 2062 };
2084 2063
2085 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 2064 encode_compound_hdr(xdr, req, &hdr);
2086 encode_compound_hdr(&xdr, req, &hdr); 2065 encode_sequence(xdr, &args->seq_args, &hdr);
2087 encode_sequence(&xdr, &args->seq_args, &hdr); 2066 encode_putfh(xdr, args->fh, &hdr);
2088 encode_putfh(&xdr, args->fh, &hdr); 2067 encode_open(xdr, args, &hdr);
2089 encode_open(&xdr, args, &hdr); 2068 encode_getfattr(xdr, args->bitmask, &hdr);
2090 encode_getfattr(&xdr, args->bitmask, &hdr);
2091 encode_nops(&hdr); 2069 encode_nops(&hdr);
2092 return 0;
2093} 2070}
2094 2071
2095/* 2072/*
2096 * Encode an OPEN_DOWNGRADE request 2073 * Encode an OPEN_DOWNGRADE request
2097 */ 2074 */
2098static int nfs4_xdr_enc_open_downgrade(struct rpc_rqst *req, __be32 *p, struct nfs_closeargs *args) 2075static void nfs4_xdr_enc_open_downgrade(struct rpc_rqst *req,
2076 struct xdr_stream *xdr,
2077 struct nfs_closeargs *args)
2099{ 2078{
2100 struct xdr_stream xdr;
2101 struct compound_hdr hdr = { 2079 struct compound_hdr hdr = {
2102 .minorversion = nfs4_xdr_minorversion(&args->seq_args), 2080 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
2103 }; 2081 };
2104 2082
2105 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 2083 encode_compound_hdr(xdr, req, &hdr);
2106 encode_compound_hdr(&xdr, req, &hdr); 2084 encode_sequence(xdr, &args->seq_args, &hdr);
2107 encode_sequence(&xdr, &args->seq_args, &hdr); 2085 encode_putfh(xdr, args->fh, &hdr);
2108 encode_putfh(&xdr, args->fh, &hdr); 2086 encode_open_downgrade(xdr, args, &hdr);
2109 encode_open_downgrade(&xdr, args, &hdr); 2087 encode_getfattr(xdr, args->bitmask, &hdr);
2110 encode_getfattr(&xdr, args->bitmask, &hdr);
2111 encode_nops(&hdr); 2088 encode_nops(&hdr);
2112 return 0;
2113} 2089}
2114 2090
2115/* 2091/*
2116 * Encode a LOCK request 2092 * Encode a LOCK request
2117 */ 2093 */
2118static int nfs4_xdr_enc_lock(struct rpc_rqst *req, __be32 *p, struct nfs_lock_args *args) 2094static void nfs4_xdr_enc_lock(struct rpc_rqst *req, struct xdr_stream *xdr,
2095 struct nfs_lock_args *args)
2119{ 2096{
2120 struct xdr_stream xdr;
2121 struct compound_hdr hdr = { 2097 struct compound_hdr hdr = {
2122 .minorversion = nfs4_xdr_minorversion(&args->seq_args), 2098 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
2123 }; 2099 };
2124 2100
2125 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 2101 encode_compound_hdr(xdr, req, &hdr);
2126 encode_compound_hdr(&xdr, req, &hdr); 2102 encode_sequence(xdr, &args->seq_args, &hdr);
2127 encode_sequence(&xdr, &args->seq_args, &hdr); 2103 encode_putfh(xdr, args->fh, &hdr);
2128 encode_putfh(&xdr, args->fh, &hdr); 2104 encode_lock(xdr, args, &hdr);
2129 encode_lock(&xdr, args, &hdr);
2130 encode_nops(&hdr); 2105 encode_nops(&hdr);
2131 return 0;
2132} 2106}
2133 2107
2134/* 2108/*
2135 * Encode a LOCKT request 2109 * Encode a LOCKT request
2136 */ 2110 */
2137static int nfs4_xdr_enc_lockt(struct rpc_rqst *req, __be32 *p, struct nfs_lockt_args *args) 2111static void nfs4_xdr_enc_lockt(struct rpc_rqst *req, struct xdr_stream *xdr,
2112 struct nfs_lockt_args *args)
2138{ 2113{
2139 struct xdr_stream xdr;
2140 struct compound_hdr hdr = { 2114 struct compound_hdr hdr = {
2141 .minorversion = nfs4_xdr_minorversion(&args->seq_args), 2115 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
2142 }; 2116 };
2143 2117
2144 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 2118 encode_compound_hdr(xdr, req, &hdr);
2145 encode_compound_hdr(&xdr, req, &hdr); 2119 encode_sequence(xdr, &args->seq_args, &hdr);
2146 encode_sequence(&xdr, &args->seq_args, &hdr); 2120 encode_putfh(xdr, args->fh, &hdr);
2147 encode_putfh(&xdr, args->fh, &hdr); 2121 encode_lockt(xdr, args, &hdr);
2148 encode_lockt(&xdr, args, &hdr);
2149 encode_nops(&hdr); 2122 encode_nops(&hdr);
2150 return 0;
2151} 2123}
2152 2124
2153/* 2125/*
2154 * Encode a LOCKU request 2126 * Encode a LOCKU request
2155 */ 2127 */
2156static int nfs4_xdr_enc_locku(struct rpc_rqst *req, __be32 *p, struct nfs_locku_args *args) 2128static void nfs4_xdr_enc_locku(struct rpc_rqst *req, struct xdr_stream *xdr,
2129 struct nfs_locku_args *args)
2157{ 2130{
2158 struct xdr_stream xdr;
2159 struct compound_hdr hdr = { 2131 struct compound_hdr hdr = {
2160 .minorversion = nfs4_xdr_minorversion(&args->seq_args), 2132 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
2161 }; 2133 };
2162 2134
2163 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 2135 encode_compound_hdr(xdr, req, &hdr);
2164 encode_compound_hdr(&xdr, req, &hdr); 2136 encode_sequence(xdr, &args->seq_args, &hdr);
2165 encode_sequence(&xdr, &args->seq_args, &hdr); 2137 encode_putfh(xdr, args->fh, &hdr);
2166 encode_putfh(&xdr, args->fh, &hdr); 2138 encode_locku(xdr, args, &hdr);
2167 encode_locku(&xdr, args, &hdr);
2168 encode_nops(&hdr); 2139 encode_nops(&hdr);
2169 return 0;
2170} 2140}
2171 2141
2172static int nfs4_xdr_enc_release_lockowner(struct rpc_rqst *req, __be32 *p, struct nfs_release_lockowner_args *args) 2142static void nfs4_xdr_enc_release_lockowner(struct rpc_rqst *req,
2143 struct xdr_stream *xdr,
2144 struct nfs_release_lockowner_args *args)
2173{ 2145{
2174 struct xdr_stream xdr;
2175 struct compound_hdr hdr = { 2146 struct compound_hdr hdr = {
2176 .minorversion = 0, 2147 .minorversion = 0,
2177 }; 2148 };
2178 2149
2179 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 2150 encode_compound_hdr(xdr, req, &hdr);
2180 encode_compound_hdr(&xdr, req, &hdr); 2151 encode_release_lockowner(xdr, &args->lock_owner, &hdr);
2181 encode_release_lockowner(&xdr, &args->lock_owner, &hdr);
2182 encode_nops(&hdr); 2152 encode_nops(&hdr);
2183 return 0;
2184} 2153}
2185 2154
2186/* 2155/*
2187 * Encode a READLINK request 2156 * Encode a READLINK request
2188 */ 2157 */
2189static int nfs4_xdr_enc_readlink(struct rpc_rqst *req, __be32 *p, const struct nfs4_readlink *args) 2158static void nfs4_xdr_enc_readlink(struct rpc_rqst *req, struct xdr_stream *xdr,
2159 const struct nfs4_readlink *args)
2190{ 2160{
2191 struct xdr_stream xdr;
2192 struct compound_hdr hdr = { 2161 struct compound_hdr hdr = {
2193 .minorversion = nfs4_xdr_minorversion(&args->seq_args), 2162 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
2194 }; 2163 };
2195 2164
2196 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 2165 encode_compound_hdr(xdr, req, &hdr);
2197 encode_compound_hdr(&xdr, req, &hdr); 2166 encode_sequence(xdr, &args->seq_args, &hdr);
2198 encode_sequence(&xdr, &args->seq_args, &hdr); 2167 encode_putfh(xdr, args->fh, &hdr);
2199 encode_putfh(&xdr, args->fh, &hdr); 2168 encode_readlink(xdr, args, req, &hdr);
2200 encode_readlink(&xdr, args, req, &hdr);
2201 2169
2202 xdr_inline_pages(&req->rq_rcv_buf, hdr.replen << 2, args->pages, 2170 xdr_inline_pages(&req->rq_rcv_buf, hdr.replen << 2, args->pages,
2203 args->pgbase, args->pglen); 2171 args->pgbase, args->pglen);
2204 encode_nops(&hdr); 2172 encode_nops(&hdr);
2205 return 0;
2206} 2173}
2207 2174
2208/* 2175/*
2209 * Encode a READDIR request 2176 * Encode a READDIR request
2210 */ 2177 */
2211static int nfs4_xdr_enc_readdir(struct rpc_rqst *req, __be32 *p, const struct nfs4_readdir_arg *args) 2178static void nfs4_xdr_enc_readdir(struct rpc_rqst *req, struct xdr_stream *xdr,
2179 const struct nfs4_readdir_arg *args)
2212{ 2180{
2213 struct xdr_stream xdr;
2214 struct compound_hdr hdr = { 2181 struct compound_hdr hdr = {
2215 .minorversion = nfs4_xdr_minorversion(&args->seq_args), 2182 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
2216 }; 2183 };
2217 2184
2218 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 2185 encode_compound_hdr(xdr, req, &hdr);
2219 encode_compound_hdr(&xdr, req, &hdr); 2186 encode_sequence(xdr, &args->seq_args, &hdr);
2220 encode_sequence(&xdr, &args->seq_args, &hdr); 2187 encode_putfh(xdr, args->fh, &hdr);
2221 encode_putfh(&xdr, args->fh, &hdr); 2188 encode_readdir(xdr, args, req, &hdr);
2222 encode_readdir(&xdr, args, req, &hdr);
2223 2189
2224 xdr_inline_pages(&req->rq_rcv_buf, hdr.replen << 2, args->pages, 2190 xdr_inline_pages(&req->rq_rcv_buf, hdr.replen << 2, args->pages,
2225 args->pgbase, args->count); 2191 args->pgbase, args->count);
@@ -2227,428 +2193,387 @@ static int nfs4_xdr_enc_readdir(struct rpc_rqst *req, __be32 *p, const struct nf
2227 __func__, hdr.replen << 2, args->pages, 2193 __func__, hdr.replen << 2, args->pages,
2228 args->pgbase, args->count); 2194 args->pgbase, args->count);
2229 encode_nops(&hdr); 2195 encode_nops(&hdr);
2230 return 0;
2231} 2196}
2232 2197
2233/* 2198/*
2234 * Encode a READ request 2199 * Encode a READ request
2235 */ 2200 */
2236static int nfs4_xdr_enc_read(struct rpc_rqst *req, __be32 *p, struct nfs_readargs *args) 2201static void nfs4_xdr_enc_read(struct rpc_rqst *req, struct xdr_stream *xdr,
2202 struct nfs_readargs *args)
2237{ 2203{
2238 struct xdr_stream xdr;
2239 struct compound_hdr hdr = { 2204 struct compound_hdr hdr = {
2240 .minorversion = nfs4_xdr_minorversion(&args->seq_args), 2205 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
2241 }; 2206 };
2242 2207
2243 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 2208 encode_compound_hdr(xdr, req, &hdr);
2244 encode_compound_hdr(&xdr, req, &hdr); 2209 encode_sequence(xdr, &args->seq_args, &hdr);
2245 encode_sequence(&xdr, &args->seq_args, &hdr); 2210 encode_putfh(xdr, args->fh, &hdr);
2246 encode_putfh(&xdr, args->fh, &hdr); 2211 encode_read(xdr, args, &hdr);
2247 encode_read(&xdr, args, &hdr);
2248 2212
2249 xdr_inline_pages(&req->rq_rcv_buf, hdr.replen << 2, 2213 xdr_inline_pages(&req->rq_rcv_buf, hdr.replen << 2,
2250 args->pages, args->pgbase, args->count); 2214 args->pages, args->pgbase, args->count);
2251 req->rq_rcv_buf.flags |= XDRBUF_READ; 2215 req->rq_rcv_buf.flags |= XDRBUF_READ;
2252 encode_nops(&hdr); 2216 encode_nops(&hdr);
2253 return 0;
2254} 2217}
2255 2218
2256/* 2219/*
2257 * Encode an SETATTR request 2220 * Encode an SETATTR request
2258 */ 2221 */
2259static int nfs4_xdr_enc_setattr(struct rpc_rqst *req, __be32 *p, struct nfs_setattrargs *args) 2222static void nfs4_xdr_enc_setattr(struct rpc_rqst *req, struct xdr_stream *xdr,
2223 struct nfs_setattrargs *args)
2260{ 2224{
2261 struct xdr_stream xdr;
2262 struct compound_hdr hdr = { 2225 struct compound_hdr hdr = {
2263 .minorversion = nfs4_xdr_minorversion(&args->seq_args), 2226 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
2264 }; 2227 };
2265 2228
2266 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 2229 encode_compound_hdr(xdr, req, &hdr);
2267 encode_compound_hdr(&xdr, req, &hdr); 2230 encode_sequence(xdr, &args->seq_args, &hdr);
2268 encode_sequence(&xdr, &args->seq_args, &hdr); 2231 encode_putfh(xdr, args->fh, &hdr);
2269 encode_putfh(&xdr, args->fh, &hdr); 2232 encode_setattr(xdr, args, args->server, &hdr);
2270 encode_setattr(&xdr, args, args->server, &hdr); 2233 encode_getfattr(xdr, args->bitmask, &hdr);
2271 encode_getfattr(&xdr, args->bitmask, &hdr);
2272 encode_nops(&hdr); 2234 encode_nops(&hdr);
2273 return 0;
2274} 2235}
2275 2236
2276/* 2237/*
2277 * Encode a GETACL request 2238 * Encode a GETACL request
2278 */ 2239 */
2279static int 2240static void nfs4_xdr_enc_getacl(struct rpc_rqst *req, struct xdr_stream *xdr,
2280nfs4_xdr_enc_getacl(struct rpc_rqst *req, __be32 *p, 2241 struct nfs_getaclargs *args)
2281 struct nfs_getaclargs *args)
2282{ 2242{
2283 struct xdr_stream xdr;
2284 struct compound_hdr hdr = { 2243 struct compound_hdr hdr = {
2285 .minorversion = nfs4_xdr_minorversion(&args->seq_args), 2244 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
2286 }; 2245 };
2287 uint32_t replen; 2246 uint32_t replen;
2288 2247
2289 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 2248 encode_compound_hdr(xdr, req, &hdr);
2290 encode_compound_hdr(&xdr, req, &hdr); 2249 encode_sequence(xdr, &args->seq_args, &hdr);
2291 encode_sequence(&xdr, &args->seq_args, &hdr); 2250 encode_putfh(xdr, args->fh, &hdr);
2292 encode_putfh(&xdr, args->fh, &hdr);
2293 replen = hdr.replen + op_decode_hdr_maxsz + nfs4_fattr_bitmap_maxsz + 1; 2251 replen = hdr.replen + op_decode_hdr_maxsz + nfs4_fattr_bitmap_maxsz + 1;
2294 encode_getattr_two(&xdr, FATTR4_WORD0_ACL, 0, &hdr); 2252 encode_getattr_two(xdr, FATTR4_WORD0_ACL, 0, &hdr);
2295 2253
2296 xdr_inline_pages(&req->rq_rcv_buf, replen << 2, 2254 xdr_inline_pages(&req->rq_rcv_buf, replen << 2,
2297 args->acl_pages, args->acl_pgbase, args->acl_len); 2255 args->acl_pages, args->acl_pgbase, args->acl_len);
2298 encode_nops(&hdr); 2256 encode_nops(&hdr);
2299 return 0;
2300} 2257}
2301 2258
2302/* 2259/*
2303 * Encode a WRITE request 2260 * Encode a WRITE request
2304 */ 2261 */
2305static int nfs4_xdr_enc_write(struct rpc_rqst *req, __be32 *p, struct nfs_writeargs *args) 2262static void nfs4_xdr_enc_write(struct rpc_rqst *req, struct xdr_stream *xdr,
2263 struct nfs_writeargs *args)
2306{ 2264{
2307 struct xdr_stream xdr;
2308 struct compound_hdr hdr = { 2265 struct compound_hdr hdr = {
2309 .minorversion = nfs4_xdr_minorversion(&args->seq_args), 2266 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
2310 }; 2267 };
2311 2268
2312 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 2269 encode_compound_hdr(xdr, req, &hdr);
2313 encode_compound_hdr(&xdr, req, &hdr); 2270 encode_sequence(xdr, &args->seq_args, &hdr);
2314 encode_sequence(&xdr, &args->seq_args, &hdr); 2271 encode_putfh(xdr, args->fh, &hdr);
2315 encode_putfh(&xdr, args->fh, &hdr); 2272 encode_write(xdr, args, &hdr);
2316 encode_write(&xdr, args, &hdr);
2317 req->rq_snd_buf.flags |= XDRBUF_WRITE; 2273 req->rq_snd_buf.flags |= XDRBUF_WRITE;
2318 encode_getfattr(&xdr, args->bitmask, &hdr); 2274 encode_getfattr(xdr, args->bitmask, &hdr);
2319 encode_nops(&hdr); 2275 encode_nops(&hdr);
2320 return 0;
2321} 2276}
2322 2277
2323/* 2278/*
2324 * a COMMIT request 2279 * a COMMIT request
2325 */ 2280 */
2326static int nfs4_xdr_enc_commit(struct rpc_rqst *req, __be32 *p, struct nfs_writeargs *args) 2281static void nfs4_xdr_enc_commit(struct rpc_rqst *req, struct xdr_stream *xdr,
2282 struct nfs_writeargs *args)
2327{ 2283{
2328 struct xdr_stream xdr;
2329 struct compound_hdr hdr = { 2284 struct compound_hdr hdr = {
2330 .minorversion = nfs4_xdr_minorversion(&args->seq_args), 2285 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
2331 }; 2286 };
2332 2287
2333 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 2288 encode_compound_hdr(xdr, req, &hdr);
2334 encode_compound_hdr(&xdr, req, &hdr); 2289 encode_sequence(xdr, &args->seq_args, &hdr);
2335 encode_sequence(&xdr, &args->seq_args, &hdr); 2290 encode_putfh(xdr, args->fh, &hdr);
2336 encode_putfh(&xdr, args->fh, &hdr); 2291 encode_commit(xdr, args, &hdr);
2337 encode_commit(&xdr, args, &hdr); 2292 encode_getfattr(xdr, args->bitmask, &hdr);
2338 encode_getfattr(&xdr, args->bitmask, &hdr);
2339 encode_nops(&hdr); 2293 encode_nops(&hdr);
2340 return 0;
2341} 2294}
2342 2295
2343/* 2296/*
2344 * FSINFO request 2297 * FSINFO request
2345 */ 2298 */
2346static int nfs4_xdr_enc_fsinfo(struct rpc_rqst *req, __be32 *p, struct nfs4_fsinfo_arg *args) 2299static void nfs4_xdr_enc_fsinfo(struct rpc_rqst *req, struct xdr_stream *xdr,
2300 struct nfs4_fsinfo_arg *args)
2347{ 2301{
2348 struct xdr_stream xdr;
2349 struct compound_hdr hdr = { 2302 struct compound_hdr hdr = {
2350 .minorversion = nfs4_xdr_minorversion(&args->seq_args), 2303 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
2351 }; 2304 };
2352 2305
2353 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 2306 encode_compound_hdr(xdr, req, &hdr);
2354 encode_compound_hdr(&xdr, req, &hdr); 2307 encode_sequence(xdr, &args->seq_args, &hdr);
2355 encode_sequence(&xdr, &args->seq_args, &hdr); 2308 encode_putfh(xdr, args->fh, &hdr);
2356 encode_putfh(&xdr, args->fh, &hdr); 2309 encode_fsinfo(xdr, args->bitmask, &hdr);
2357 encode_fsinfo(&xdr, args->bitmask, &hdr);
2358 encode_nops(&hdr); 2310 encode_nops(&hdr);
2359 return 0;
2360} 2311}
2361 2312
2362/* 2313/*
2363 * a PATHCONF request 2314 * a PATHCONF request
2364 */ 2315 */
2365static int nfs4_xdr_enc_pathconf(struct rpc_rqst *req, __be32 *p, const struct nfs4_pathconf_arg *args) 2316static void nfs4_xdr_enc_pathconf(struct rpc_rqst *req, struct xdr_stream *xdr,
2317 const struct nfs4_pathconf_arg *args)
2366{ 2318{
2367 struct xdr_stream xdr;
2368 struct compound_hdr hdr = { 2319 struct compound_hdr hdr = {
2369 .minorversion = nfs4_xdr_minorversion(&args->seq_args), 2320 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
2370 }; 2321 };
2371 2322
2372 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 2323 encode_compound_hdr(xdr, req, &hdr);
2373 encode_compound_hdr(&xdr, req, &hdr); 2324 encode_sequence(xdr, &args->seq_args, &hdr);
2374 encode_sequence(&xdr, &args->seq_args, &hdr); 2325 encode_putfh(xdr, args->fh, &hdr);
2375 encode_putfh(&xdr, args->fh, &hdr); 2326 encode_getattr_one(xdr, args->bitmask[0] & nfs4_pathconf_bitmap[0],
2376 encode_getattr_one(&xdr, args->bitmask[0] & nfs4_pathconf_bitmap[0],
2377 &hdr); 2327 &hdr);
2378 encode_nops(&hdr); 2328 encode_nops(&hdr);
2379 return 0;
2380} 2329}
2381 2330
2382/* 2331/*
2383 * a STATFS request 2332 * a STATFS request
2384 */ 2333 */
2385static int nfs4_xdr_enc_statfs(struct rpc_rqst *req, __be32 *p, const struct nfs4_statfs_arg *args) 2334static void nfs4_xdr_enc_statfs(struct rpc_rqst *req, struct xdr_stream *xdr,
2335 const struct nfs4_statfs_arg *args)
2386{ 2336{
2387 struct xdr_stream xdr;
2388 struct compound_hdr hdr = { 2337 struct compound_hdr hdr = {
2389 .minorversion = nfs4_xdr_minorversion(&args->seq_args), 2338 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
2390 }; 2339 };
2391 2340
2392 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 2341 encode_compound_hdr(xdr, req, &hdr);
2393 encode_compound_hdr(&xdr, req, &hdr); 2342 encode_sequence(xdr, &args->seq_args, &hdr);
2394 encode_sequence(&xdr, &args->seq_args, &hdr); 2343 encode_putfh(xdr, args->fh, &hdr);
2395 encode_putfh(&xdr, args->fh, &hdr); 2344 encode_getattr_two(xdr, args->bitmask[0] & nfs4_statfs_bitmap[0],
2396 encode_getattr_two(&xdr, args->bitmask[0] & nfs4_statfs_bitmap[0],
2397 args->bitmask[1] & nfs4_statfs_bitmap[1], &hdr); 2345 args->bitmask[1] & nfs4_statfs_bitmap[1], &hdr);
2398 encode_nops(&hdr); 2346 encode_nops(&hdr);
2399 return 0;
2400} 2347}
2401 2348
2402/* 2349/*
2403 * GETATTR_BITMAP request 2350 * GETATTR_BITMAP request
2404 */ 2351 */
2405static int nfs4_xdr_enc_server_caps(struct rpc_rqst *req, __be32 *p, 2352static void nfs4_xdr_enc_server_caps(struct rpc_rqst *req,
2406 struct nfs4_server_caps_arg *args) 2353 struct xdr_stream *xdr,
2354 struct nfs4_server_caps_arg *args)
2407{ 2355{
2408 struct xdr_stream xdr;
2409 struct compound_hdr hdr = { 2356 struct compound_hdr hdr = {
2410 .minorversion = nfs4_xdr_minorversion(&args->seq_args), 2357 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
2411 }; 2358 };
2412 2359
2413 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 2360 encode_compound_hdr(xdr, req, &hdr);
2414 encode_compound_hdr(&xdr, req, &hdr); 2361 encode_sequence(xdr, &args->seq_args, &hdr);
2415 encode_sequence(&xdr, &args->seq_args, &hdr); 2362 encode_putfh(xdr, args->fhandle, &hdr);
2416 encode_putfh(&xdr, args->fhandle, &hdr); 2363 encode_getattr_one(xdr, FATTR4_WORD0_SUPPORTED_ATTRS|
2417 encode_getattr_one(&xdr, FATTR4_WORD0_SUPPORTED_ATTRS|
2418 FATTR4_WORD0_LINK_SUPPORT| 2364 FATTR4_WORD0_LINK_SUPPORT|
2419 FATTR4_WORD0_SYMLINK_SUPPORT| 2365 FATTR4_WORD0_SYMLINK_SUPPORT|
2420 FATTR4_WORD0_ACLSUPPORT, &hdr); 2366 FATTR4_WORD0_ACLSUPPORT, &hdr);
2421 encode_nops(&hdr); 2367 encode_nops(&hdr);
2422 return 0;
2423} 2368}
2424 2369
2425/* 2370/*
2426 * a RENEW request 2371 * a RENEW request
2427 */ 2372 */
2428static int nfs4_xdr_enc_renew(struct rpc_rqst *req, __be32 *p, struct nfs_client *clp) 2373static void nfs4_xdr_enc_renew(struct rpc_rqst *req, struct xdr_stream *xdr,
2374 struct nfs_client *clp)
2429{ 2375{
2430 struct xdr_stream xdr;
2431 struct compound_hdr hdr = { 2376 struct compound_hdr hdr = {
2432 .nops = 0, 2377 .nops = 0,
2433 }; 2378 };
2434 2379
2435 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 2380 encode_compound_hdr(xdr, req, &hdr);
2436 encode_compound_hdr(&xdr, req, &hdr); 2381 encode_renew(xdr, clp, &hdr);
2437 encode_renew(&xdr, clp, &hdr);
2438 encode_nops(&hdr); 2382 encode_nops(&hdr);
2439 return 0;
2440} 2383}
2441 2384
2442/* 2385/*
2443 * a SETCLIENTID request 2386 * a SETCLIENTID request
2444 */ 2387 */
2445static int nfs4_xdr_enc_setclientid(struct rpc_rqst *req, __be32 *p, struct nfs4_setclientid *sc) 2388static void nfs4_xdr_enc_setclientid(struct rpc_rqst *req,
2389 struct xdr_stream *xdr,
2390 struct nfs4_setclientid *sc)
2446{ 2391{
2447 struct xdr_stream xdr;
2448 struct compound_hdr hdr = { 2392 struct compound_hdr hdr = {
2449 .nops = 0, 2393 .nops = 0,
2450 }; 2394 };
2451 2395
2452 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 2396 encode_compound_hdr(xdr, req, &hdr);
2453 encode_compound_hdr(&xdr, req, &hdr); 2397 encode_setclientid(xdr, sc, &hdr);
2454 encode_setclientid(&xdr, sc, &hdr);
2455 encode_nops(&hdr); 2398 encode_nops(&hdr);
2456 return 0;
2457} 2399}
2458 2400
2459/* 2401/*
2460 * a SETCLIENTID_CONFIRM request 2402 * a SETCLIENTID_CONFIRM request
2461 */ 2403 */
2462static int nfs4_xdr_enc_setclientid_confirm(struct rpc_rqst *req, __be32 *p, struct nfs4_setclientid_res *arg) 2404static void nfs4_xdr_enc_setclientid_confirm(struct rpc_rqst *req,
2405 struct xdr_stream *xdr,
2406 struct nfs4_setclientid_res *arg)
2463{ 2407{
2464 struct xdr_stream xdr;
2465 struct compound_hdr hdr = { 2408 struct compound_hdr hdr = {
2466 .nops = 0, 2409 .nops = 0,
2467 }; 2410 };
2468 const u32 lease_bitmap[2] = { FATTR4_WORD0_LEASE_TIME, 0 }; 2411 const u32 lease_bitmap[2] = { FATTR4_WORD0_LEASE_TIME, 0 };
2469 2412
2470 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 2413 encode_compound_hdr(xdr, req, &hdr);
2471 encode_compound_hdr(&xdr, req, &hdr); 2414 encode_setclientid_confirm(xdr, arg, &hdr);
2472 encode_setclientid_confirm(&xdr, arg, &hdr); 2415 encode_putrootfh(xdr, &hdr);
2473 encode_putrootfh(&xdr, &hdr); 2416 encode_fsinfo(xdr, lease_bitmap, &hdr);
2474 encode_fsinfo(&xdr, lease_bitmap, &hdr);
2475 encode_nops(&hdr); 2417 encode_nops(&hdr);
2476 return 0;
2477} 2418}
2478 2419
2479/* 2420/*
2480 * DELEGRETURN request 2421 * DELEGRETURN request
2481 */ 2422 */
2482static int nfs4_xdr_enc_delegreturn(struct rpc_rqst *req, __be32 *p, const struct nfs4_delegreturnargs *args) 2423static void nfs4_xdr_enc_delegreturn(struct rpc_rqst *req,
2424 struct xdr_stream *xdr,
2425 const struct nfs4_delegreturnargs *args)
2483{ 2426{
2484 struct xdr_stream xdr;
2485 struct compound_hdr hdr = { 2427 struct compound_hdr hdr = {
2486 .minorversion = nfs4_xdr_minorversion(&args->seq_args), 2428 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
2487 }; 2429 };
2488 2430
2489 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 2431 encode_compound_hdr(xdr, req, &hdr);
2490 encode_compound_hdr(&xdr, req, &hdr); 2432 encode_sequence(xdr, &args->seq_args, &hdr);
2491 encode_sequence(&xdr, &args->seq_args, &hdr); 2433 encode_putfh(xdr, args->fhandle, &hdr);
2492 encode_putfh(&xdr, args->fhandle, &hdr); 2434 encode_delegreturn(xdr, args->stateid, &hdr);
2493 encode_delegreturn(&xdr, args->stateid, &hdr); 2435 encode_getfattr(xdr, args->bitmask, &hdr);
2494 encode_getfattr(&xdr, args->bitmask, &hdr);
2495 encode_nops(&hdr); 2436 encode_nops(&hdr);
2496 return 0;
2497} 2437}
2498 2438
2499/* 2439/*
2500 * Encode FS_LOCATIONS request 2440 * Encode FS_LOCATIONS request
2501 */ 2441 */
2502static int nfs4_xdr_enc_fs_locations(struct rpc_rqst *req, __be32 *p, struct nfs4_fs_locations_arg *args) 2442static void nfs4_xdr_enc_fs_locations(struct rpc_rqst *req,
2443 struct xdr_stream *xdr,
2444 struct nfs4_fs_locations_arg *args)
2503{ 2445{
2504 struct xdr_stream xdr;
2505 struct compound_hdr hdr = { 2446 struct compound_hdr hdr = {
2506 .minorversion = nfs4_xdr_minorversion(&args->seq_args), 2447 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
2507 }; 2448 };
2508 uint32_t replen; 2449 uint32_t replen;
2509 2450
2510 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 2451 encode_compound_hdr(xdr, req, &hdr);
2511 encode_compound_hdr(&xdr, req, &hdr); 2452 encode_sequence(xdr, &args->seq_args, &hdr);
2512 encode_sequence(&xdr, &args->seq_args, &hdr); 2453 encode_putfh(xdr, args->dir_fh, &hdr);
2513 encode_putfh(&xdr, args->dir_fh, &hdr); 2454 encode_lookup(xdr, args->name, &hdr);
2514 encode_lookup(&xdr, args->name, &hdr);
2515 replen = hdr.replen; /* get the attribute into args->page */ 2455 replen = hdr.replen; /* get the attribute into args->page */
2516 encode_fs_locations(&xdr, args->bitmask, &hdr); 2456 encode_fs_locations(xdr, args->bitmask, &hdr);
2517 2457
2518 xdr_inline_pages(&req->rq_rcv_buf, replen << 2, &args->page, 2458 xdr_inline_pages(&req->rq_rcv_buf, replen << 2, &args->page,
2519 0, PAGE_SIZE); 2459 0, PAGE_SIZE);
2520 encode_nops(&hdr); 2460 encode_nops(&hdr);
2521 return 0;
2522} 2461}
2523 2462
2524#if defined(CONFIG_NFS_V4_1) 2463#if defined(CONFIG_NFS_V4_1)
2525/* 2464/*
2526 * EXCHANGE_ID request 2465 * EXCHANGE_ID request
2527 */ 2466 */
2528static int nfs4_xdr_enc_exchange_id(struct rpc_rqst *req, uint32_t *p, 2467static void nfs4_xdr_enc_exchange_id(struct rpc_rqst *req,
2529 struct nfs41_exchange_id_args *args) 2468 struct xdr_stream *xdr,
2469 struct nfs41_exchange_id_args *args)
2530{ 2470{
2531 struct xdr_stream xdr;
2532 struct compound_hdr hdr = { 2471 struct compound_hdr hdr = {
2533 .minorversion = args->client->cl_mvops->minor_version, 2472 .minorversion = args->client->cl_mvops->minor_version,
2534 }; 2473 };
2535 2474
2536 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 2475 encode_compound_hdr(xdr, req, &hdr);
2537 encode_compound_hdr(&xdr, req, &hdr); 2476 encode_exchange_id(xdr, args, &hdr);
2538 encode_exchange_id(&xdr, args, &hdr);
2539 encode_nops(&hdr); 2477 encode_nops(&hdr);
2540 return 0;
2541} 2478}
2542 2479
2543/* 2480/*
2544 * a CREATE_SESSION request 2481 * a CREATE_SESSION request
2545 */ 2482 */
2546static int nfs4_xdr_enc_create_session(struct rpc_rqst *req, uint32_t *p, 2483static void nfs4_xdr_enc_create_session(struct rpc_rqst *req,
2547 struct nfs41_create_session_args *args) 2484 struct xdr_stream *xdr,
2485 struct nfs41_create_session_args *args)
2548{ 2486{
2549 struct xdr_stream xdr;
2550 struct compound_hdr hdr = { 2487 struct compound_hdr hdr = {
2551 .minorversion = args->client->cl_mvops->minor_version, 2488 .minorversion = args->client->cl_mvops->minor_version,
2552 }; 2489 };
2553 2490
2554 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 2491 encode_compound_hdr(xdr, req, &hdr);
2555 encode_compound_hdr(&xdr, req, &hdr); 2492 encode_create_session(xdr, args, &hdr);
2556 encode_create_session(&xdr, args, &hdr);
2557 encode_nops(&hdr); 2493 encode_nops(&hdr);
2558 return 0;
2559} 2494}
2560 2495
2561/* 2496/*
2562 * a DESTROY_SESSION request 2497 * a DESTROY_SESSION request
2563 */ 2498 */
2564static int nfs4_xdr_enc_destroy_session(struct rpc_rqst *req, uint32_t *p, 2499static void nfs4_xdr_enc_destroy_session(struct rpc_rqst *req,
2565 struct nfs4_session *session) 2500 struct xdr_stream *xdr,
2501 struct nfs4_session *session)
2566{ 2502{
2567 struct xdr_stream xdr;
2568 struct compound_hdr hdr = { 2503 struct compound_hdr hdr = {
2569 .minorversion = session->clp->cl_mvops->minor_version, 2504 .minorversion = session->clp->cl_mvops->minor_version,
2570 }; 2505 };
2571 2506
2572 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 2507 encode_compound_hdr(xdr, req, &hdr);
2573 encode_compound_hdr(&xdr, req, &hdr); 2508 encode_destroy_session(xdr, session, &hdr);
2574 encode_destroy_session(&xdr, session, &hdr);
2575 encode_nops(&hdr); 2509 encode_nops(&hdr);
2576 return 0;
2577} 2510}
2578 2511
2579/* 2512/*
2580 * a SEQUENCE request 2513 * a SEQUENCE request
2581 */ 2514 */
2582static int nfs4_xdr_enc_sequence(struct rpc_rqst *req, uint32_t *p, 2515static void nfs4_xdr_enc_sequence(struct rpc_rqst *req, struct xdr_stream *xdr,
2583 struct nfs4_sequence_args *args) 2516 struct nfs4_sequence_args *args)
2584{ 2517{
2585 struct xdr_stream xdr;
2586 struct compound_hdr hdr = { 2518 struct compound_hdr hdr = {
2587 .minorversion = nfs4_xdr_minorversion(args), 2519 .minorversion = nfs4_xdr_minorversion(args),
2588 }; 2520 };
2589 2521
2590 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 2522 encode_compound_hdr(xdr, req, &hdr);
2591 encode_compound_hdr(&xdr, req, &hdr); 2523 encode_sequence(xdr, args, &hdr);
2592 encode_sequence(&xdr, args, &hdr);
2593 encode_nops(&hdr); 2524 encode_nops(&hdr);
2594 return 0;
2595} 2525}
2596 2526
2597/* 2527/*
2598 * a GET_LEASE_TIME request 2528 * a GET_LEASE_TIME request
2599 */ 2529 */
2600static int nfs4_xdr_enc_get_lease_time(struct rpc_rqst *req, uint32_t *p, 2530static void nfs4_xdr_enc_get_lease_time(struct rpc_rqst *req,
2601 struct nfs4_get_lease_time_args *args) 2531 struct xdr_stream *xdr,
2532 struct nfs4_get_lease_time_args *args)
2602{ 2533{
2603 struct xdr_stream xdr;
2604 struct compound_hdr hdr = { 2534 struct compound_hdr hdr = {
2605 .minorversion = nfs4_xdr_minorversion(&args->la_seq_args), 2535 .minorversion = nfs4_xdr_minorversion(&args->la_seq_args),
2606 }; 2536 };
2607 const u32 lease_bitmap[2] = { FATTR4_WORD0_LEASE_TIME, 0 }; 2537 const u32 lease_bitmap[2] = { FATTR4_WORD0_LEASE_TIME, 0 };
2608 2538
2609 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 2539 encode_compound_hdr(xdr, req, &hdr);
2610 encode_compound_hdr(&xdr, req, &hdr); 2540 encode_sequence(xdr, &args->la_seq_args, &hdr);
2611 encode_sequence(&xdr, &args->la_seq_args, &hdr); 2541 encode_putrootfh(xdr, &hdr);
2612 encode_putrootfh(&xdr, &hdr); 2542 encode_fsinfo(xdr, lease_bitmap, &hdr);
2613 encode_fsinfo(&xdr, lease_bitmap, &hdr);
2614 encode_nops(&hdr); 2543 encode_nops(&hdr);
2615 return 0;
2616} 2544}
2617 2545
2618/* 2546/*
2619 * a RECLAIM_COMPLETE request 2547 * a RECLAIM_COMPLETE request
2620 */ 2548 */
2621static int nfs4_xdr_enc_reclaim_complete(struct rpc_rqst *req, uint32_t *p, 2549static void nfs4_xdr_enc_reclaim_complete(struct rpc_rqst *req,
2622 struct nfs41_reclaim_complete_args *args) 2550 struct xdr_stream *xdr,
2551 struct nfs41_reclaim_complete_args *args)
2623{ 2552{
2624 struct xdr_stream xdr;
2625 struct compound_hdr hdr = { 2553 struct compound_hdr hdr = {
2626 .minorversion = nfs4_xdr_minorversion(&args->seq_args) 2554 .minorversion = nfs4_xdr_minorversion(&args->seq_args)
2627 }; 2555 };
2628 2556
2629 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 2557 encode_compound_hdr(xdr, req, &hdr);
2630 encode_compound_hdr(&xdr, req, &hdr); 2558 encode_sequence(xdr, &args->seq_args, &hdr);
2631 encode_sequence(&xdr, &args->seq_args, &hdr); 2559 encode_reclaim_complete(xdr, args, &hdr);
2632 encode_reclaim_complete(&xdr, args, &hdr);
2633 encode_nops(&hdr); 2560 encode_nops(&hdr);
2634 return 0;
2635} 2561}
2636 2562
2637/* 2563/*
2638 * Encode GETDEVICEINFO request 2564 * Encode GETDEVICEINFO request
2639 */ 2565 */
2640static int nfs4_xdr_enc_getdeviceinfo(struct rpc_rqst *req, uint32_t *p, 2566static void nfs4_xdr_enc_getdeviceinfo(struct rpc_rqst *req,
2641 struct nfs4_getdeviceinfo_args *args) 2567 struct xdr_stream *xdr,
2568 struct nfs4_getdeviceinfo_args *args)
2642{ 2569{
2643 struct xdr_stream xdr;
2644 struct compound_hdr hdr = { 2570 struct compound_hdr hdr = {
2645 .minorversion = nfs4_xdr_minorversion(&args->seq_args), 2571 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
2646 }; 2572 };
2647 2573
2648 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 2574 encode_compound_hdr(xdr, req, &hdr);
2649 encode_compound_hdr(&xdr, req, &hdr); 2575 encode_sequence(xdr, &args->seq_args, &hdr);
2650 encode_sequence(&xdr, &args->seq_args, &hdr); 2576 encode_getdeviceinfo(xdr, args, &hdr);
2651 encode_getdeviceinfo(&xdr, args, &hdr);
2652 2577
2653 /* set up reply kvec. Subtract notification bitmap max size (2) 2578 /* set up reply kvec. Subtract notification bitmap max size (2)
2654 * so that notification bitmap is put in xdr_buf tail */ 2579 * so that notification bitmap is put in xdr_buf tail */
@@ -2657,27 +2582,24 @@ static int nfs4_xdr_enc_getdeviceinfo(struct rpc_rqst *req, uint32_t *p,
2657 args->pdev->pglen); 2582 args->pdev->pglen);
2658 2583
2659 encode_nops(&hdr); 2584 encode_nops(&hdr);
2660 return 0;
2661} 2585}
2662 2586
2663/* 2587/*
2664 * Encode LAYOUTGET request 2588 * Encode LAYOUTGET request
2665 */ 2589 */
2666static int nfs4_xdr_enc_layoutget(struct rpc_rqst *req, uint32_t *p, 2590static void nfs4_xdr_enc_layoutget(struct rpc_rqst *req,
2667 struct nfs4_layoutget_args *args) 2591 struct xdr_stream *xdr,
2592 struct nfs4_layoutget_args *args)
2668{ 2593{
2669 struct xdr_stream xdr;
2670 struct compound_hdr hdr = { 2594 struct compound_hdr hdr = {
2671 .minorversion = nfs4_xdr_minorversion(&args->seq_args), 2595 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
2672 }; 2596 };
2673 2597
2674 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 2598 encode_compound_hdr(xdr, req, &hdr);
2675 encode_compound_hdr(&xdr, req, &hdr); 2599 encode_sequence(xdr, &args->seq_args, &hdr);
2676 encode_sequence(&xdr, &args->seq_args, &hdr); 2600 encode_putfh(xdr, NFS_FH(args->inode), &hdr);
2677 encode_putfh(&xdr, NFS_FH(args->inode), &hdr); 2601 encode_layoutget(xdr, args, &hdr);
2678 encode_layoutget(&xdr, args, &hdr);
2679 encode_nops(&hdr); 2602 encode_nops(&hdr);
2680 return 0;
2681} 2603}
2682#endif /* CONFIG_NFS_V4_1 */ 2604#endif /* CONFIG_NFS_V4_1 */
2683 2605
@@ -4475,7 +4397,7 @@ static int decode_read(struct xdr_stream *xdr, struct rpc_rqst *req, struct nfs_
4475 goto out_overflow; 4397 goto out_overflow;
4476 eof = be32_to_cpup(p++); 4398 eof = be32_to_cpup(p++);
4477 count = be32_to_cpup(p); 4399 count = be32_to_cpup(p);
4478 hdrlen = (u8 *) p - (u8 *) iov->iov_base; 4400 hdrlen = (u8 *) xdr->p - (u8 *) iov->iov_base;
4479 recvd = req->rq_rcv_buf.len - hdrlen; 4401 recvd = req->rq_rcv_buf.len - hdrlen;
4480 if (count > recvd) { 4402 if (count > recvd) {
4481 dprintk("NFS: server cheating in read reply: " 4403 dprintk("NFS: server cheating in read reply: "
@@ -4518,7 +4440,7 @@ static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct n
4518 xdr_read_pages(xdr, pglen); 4440 xdr_read_pages(xdr, pglen);
4519 4441
4520 4442
4521 return 0; 4443 return pglen;
4522} 4444}
4523 4445
4524static int decode_readlink(struct xdr_stream *xdr, struct rpc_rqst *req) 4446static int decode_readlink(struct xdr_stream *xdr, struct rpc_rqst *req)
@@ -5000,7 +4922,7 @@ static int decode_getdeviceinfo(struct xdr_stream *xdr,
5000 goto out_overflow; 4922 goto out_overflow;
5001 len = be32_to_cpup(p); 4923 len = be32_to_cpup(p);
5002 if (len) { 4924 if (len) {
5003 int i; 4925 uint32_t i;
5004 4926
5005 p = xdr_inline_decode(xdr, 4 * len); 4927 p = xdr_inline_decode(xdr, 4 * len);
5006 if (unlikely(!p)) 4928 if (unlikely(!p))
@@ -5090,26 +5012,26 @@ out_overflow:
5090/* 5012/*
5091 * Decode OPEN_DOWNGRADE response 5013 * Decode OPEN_DOWNGRADE response
5092 */ 5014 */
5093static int nfs4_xdr_dec_open_downgrade(struct rpc_rqst *rqstp, __be32 *p, struct nfs_closeres *res) 5015static int nfs4_xdr_dec_open_downgrade(struct rpc_rqst *rqstp,
5016 struct xdr_stream *xdr,
5017 struct nfs_closeres *res)
5094{ 5018{
5095 struct xdr_stream xdr;
5096 struct compound_hdr hdr; 5019 struct compound_hdr hdr;
5097 int status; 5020 int status;
5098 5021
5099 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 5022 status = decode_compound_hdr(xdr, &hdr);
5100 status = decode_compound_hdr(&xdr, &hdr);
5101 if (status) 5023 if (status)
5102 goto out; 5024 goto out;
5103 status = decode_sequence(&xdr, &res->seq_res, rqstp); 5025 status = decode_sequence(xdr, &res->seq_res, rqstp);
5104 if (status) 5026 if (status)
5105 goto out; 5027 goto out;
5106 status = decode_putfh(&xdr); 5028 status = decode_putfh(xdr);
5107 if (status) 5029 if (status)
5108 goto out; 5030 goto out;
5109 status = decode_open_downgrade(&xdr, res); 5031 status = decode_open_downgrade(xdr, res);
5110 if (status != 0) 5032 if (status != 0)
5111 goto out; 5033 goto out;
5112 decode_getfattr(&xdr, res->fattr, res->server, 5034 decode_getfattr(xdr, res->fattr, res->server,
5113 !RPC_IS_ASYNC(rqstp->rq_task)); 5035 !RPC_IS_ASYNC(rqstp->rq_task));
5114out: 5036out:
5115 return status; 5037 return status;
@@ -5118,26 +5040,25 @@ out:
5118/* 5040/*
5119 * Decode ACCESS response 5041 * Decode ACCESS response
5120 */ 5042 */
5121static int nfs4_xdr_dec_access(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_accessres *res) 5043static int nfs4_xdr_dec_access(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
5044 struct nfs4_accessres *res)
5122{ 5045{
5123 struct xdr_stream xdr;
5124 struct compound_hdr hdr; 5046 struct compound_hdr hdr;
5125 int status; 5047 int status;
5126 5048
5127 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 5049 status = decode_compound_hdr(xdr, &hdr);
5128 status = decode_compound_hdr(&xdr, &hdr);
5129 if (status) 5050 if (status)
5130 goto out; 5051 goto out;
5131 status = decode_sequence(&xdr, &res->seq_res, rqstp); 5052 status = decode_sequence(xdr, &res->seq_res, rqstp);
5132 if (status) 5053 if (status)
5133 goto out; 5054 goto out;
5134 status = decode_putfh(&xdr); 5055 status = decode_putfh(xdr);
5135 if (status != 0) 5056 if (status != 0)
5136 goto out; 5057 goto out;
5137 status = decode_access(&xdr, res); 5058 status = decode_access(xdr, res);
5138 if (status != 0) 5059 if (status != 0)
5139 goto out; 5060 goto out;
5140 decode_getfattr(&xdr, res->fattr, res->server, 5061 decode_getfattr(xdr, res->fattr, res->server,
5141 !RPC_IS_ASYNC(rqstp->rq_task)); 5062 !RPC_IS_ASYNC(rqstp->rq_task));
5142out: 5063out:
5143 return status; 5064 return status;
@@ -5146,26 +5067,28 @@ out:
5146/* 5067/*
5147 * Decode LOOKUP response 5068 * Decode LOOKUP response
5148 */ 5069 */
5149static int nfs4_xdr_dec_lookup(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_lookup_res *res) 5070static int nfs4_xdr_dec_lookup(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
5071 struct nfs4_lookup_res *res)
5150{ 5072{
5151 struct xdr_stream xdr;
5152 struct compound_hdr hdr; 5073 struct compound_hdr hdr;
5153 int status; 5074 int status;
5154 5075
5155 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 5076 status = decode_compound_hdr(xdr, &hdr);
5156 status = decode_compound_hdr(&xdr, &hdr);
5157 if (status) 5077 if (status)
5158 goto out; 5078 goto out;
5159 status = decode_sequence(&xdr, &res->seq_res, rqstp); 5079 status = decode_sequence(xdr, &res->seq_res, rqstp);
5160 if (status) 5080 if (status)
5161 goto out; 5081 goto out;
5162 if ((status = decode_putfh(&xdr)) != 0) 5082 status = decode_putfh(xdr);
5083 if (status)
5163 goto out; 5084 goto out;
5164 if ((status = decode_lookup(&xdr)) != 0) 5085 status = decode_lookup(xdr);
5086 if (status)
5165 goto out; 5087 goto out;
5166 if ((status = decode_getfh(&xdr, res->fh)) != 0) 5088 status = decode_getfh(xdr, res->fh);
5089 if (status)
5167 goto out; 5090 goto out;
5168 status = decode_getfattr(&xdr, res->fattr, res->server 5091 status = decode_getfattr(xdr, res->fattr, res->server
5169 ,!RPC_IS_ASYNC(rqstp->rq_task)); 5092 ,!RPC_IS_ASYNC(rqstp->rq_task));
5170out: 5093out:
5171 return status; 5094 return status;
@@ -5174,23 +5097,25 @@ out:
5174/* 5097/*
5175 * Decode LOOKUP_ROOT response 5098 * Decode LOOKUP_ROOT response
5176 */ 5099 */
5177static int nfs4_xdr_dec_lookup_root(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_lookup_res *res) 5100static int nfs4_xdr_dec_lookup_root(struct rpc_rqst *rqstp,
5101 struct xdr_stream *xdr,
5102 struct nfs4_lookup_res *res)
5178{ 5103{
5179 struct xdr_stream xdr;
5180 struct compound_hdr hdr; 5104 struct compound_hdr hdr;
5181 int status; 5105 int status;
5182 5106
5183 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 5107 status = decode_compound_hdr(xdr, &hdr);
5184 status = decode_compound_hdr(&xdr, &hdr);
5185 if (status) 5108 if (status)
5186 goto out; 5109 goto out;
5187 status = decode_sequence(&xdr, &res->seq_res, rqstp); 5110 status = decode_sequence(xdr, &res->seq_res, rqstp);
5188 if (status) 5111 if (status)
5189 goto out; 5112 goto out;
5190 if ((status = decode_putrootfh(&xdr)) != 0) 5113 status = decode_putrootfh(xdr);
5114 if (status)
5191 goto out; 5115 goto out;
5192 if ((status = decode_getfh(&xdr, res->fh)) == 0) 5116 status = decode_getfh(xdr, res->fh);
5193 status = decode_getfattr(&xdr, res->fattr, res->server, 5117 if (status == 0)
5118 status = decode_getfattr(xdr, res->fattr, res->server,
5194 !RPC_IS_ASYNC(rqstp->rq_task)); 5119 !RPC_IS_ASYNC(rqstp->rq_task));
5195out: 5120out:
5196 return status; 5121 return status;
@@ -5199,24 +5124,25 @@ out:
5199/* 5124/*
5200 * Decode REMOVE response 5125 * Decode REMOVE response
5201 */ 5126 */
5202static int nfs4_xdr_dec_remove(struct rpc_rqst *rqstp, __be32 *p, struct nfs_removeres *res) 5127static int nfs4_xdr_dec_remove(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
5128 struct nfs_removeres *res)
5203{ 5129{
5204 struct xdr_stream xdr;
5205 struct compound_hdr hdr; 5130 struct compound_hdr hdr;
5206 int status; 5131 int status;
5207 5132
5208 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 5133 status = decode_compound_hdr(xdr, &hdr);
5209 status = decode_compound_hdr(&xdr, &hdr);
5210 if (status) 5134 if (status)
5211 goto out; 5135 goto out;
5212 status = decode_sequence(&xdr, &res->seq_res, rqstp); 5136 status = decode_sequence(xdr, &res->seq_res, rqstp);
5213 if (status) 5137 if (status)
5214 goto out; 5138 goto out;
5215 if ((status = decode_putfh(&xdr)) != 0) 5139 status = decode_putfh(xdr);
5140 if (status)
5216 goto out; 5141 goto out;
5217 if ((status = decode_remove(&xdr, &res->cinfo)) != 0) 5142 status = decode_remove(xdr, &res->cinfo);
5143 if (status)
5218 goto out; 5144 goto out;
5219 decode_getfattr(&xdr, res->dir_attr, res->server, 5145 decode_getfattr(xdr, res->dir_attr, res->server,
5220 !RPC_IS_ASYNC(rqstp->rq_task)); 5146 !RPC_IS_ASYNC(rqstp->rq_task));
5221out: 5147out:
5222 return status; 5148 return status;
@@ -5225,34 +5151,38 @@ out:
5225/* 5151/*
5226 * Decode RENAME response 5152 * Decode RENAME response
5227 */ 5153 */
5228static int nfs4_xdr_dec_rename(struct rpc_rqst *rqstp, __be32 *p, struct nfs_renameres *res) 5154static int nfs4_xdr_dec_rename(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
5155 struct nfs_renameres *res)
5229{ 5156{
5230 struct xdr_stream xdr;
5231 struct compound_hdr hdr; 5157 struct compound_hdr hdr;
5232 int status; 5158 int status;
5233 5159
5234 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 5160 status = decode_compound_hdr(xdr, &hdr);
5235 status = decode_compound_hdr(&xdr, &hdr);
5236 if (status) 5161 if (status)
5237 goto out; 5162 goto out;
5238 status = decode_sequence(&xdr, &res->seq_res, rqstp); 5163 status = decode_sequence(xdr, &res->seq_res, rqstp);
5239 if (status) 5164 if (status)
5240 goto out; 5165 goto out;
5241 if ((status = decode_putfh(&xdr)) != 0) 5166 status = decode_putfh(xdr);
5167 if (status)
5242 goto out; 5168 goto out;
5243 if ((status = decode_savefh(&xdr)) != 0) 5169 status = decode_savefh(xdr);
5170 if (status)
5244 goto out; 5171 goto out;
5245 if ((status = decode_putfh(&xdr)) != 0) 5172 status = decode_putfh(xdr);
5173 if (status)
5246 goto out; 5174 goto out;
5247 if ((status = decode_rename(&xdr, &res->old_cinfo, &res->new_cinfo)) != 0) 5175 status = decode_rename(xdr, &res->old_cinfo, &res->new_cinfo);
5176 if (status)
5248 goto out; 5177 goto out;
5249 /* Current FH is target directory */ 5178 /* Current FH is target directory */
5250 if (decode_getfattr(&xdr, res->new_fattr, res->server, 5179 if (decode_getfattr(xdr, res->new_fattr, res->server,
5251 !RPC_IS_ASYNC(rqstp->rq_task)) != 0) 5180 !RPC_IS_ASYNC(rqstp->rq_task)) != 0)
5252 goto out; 5181 goto out;
5253 if ((status = decode_restorefh(&xdr)) != 0) 5182 status = decode_restorefh(xdr);
5183 if (status)
5254 goto out; 5184 goto out;
5255 decode_getfattr(&xdr, res->old_fattr, res->server, 5185 decode_getfattr(xdr, res->old_fattr, res->server,
5256 !RPC_IS_ASYNC(rqstp->rq_task)); 5186 !RPC_IS_ASYNC(rqstp->rq_task));
5257out: 5187out:
5258 return status; 5188 return status;
@@ -5261,37 +5191,41 @@ out:
5261/* 5191/*
5262 * Decode LINK response 5192 * Decode LINK response
5263 */ 5193 */
5264static int nfs4_xdr_dec_link(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_link_res *res) 5194static int nfs4_xdr_dec_link(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
5195 struct nfs4_link_res *res)
5265{ 5196{
5266 struct xdr_stream xdr;
5267 struct compound_hdr hdr; 5197 struct compound_hdr hdr;
5268 int status; 5198 int status;
5269 5199
5270 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 5200 status = decode_compound_hdr(xdr, &hdr);
5271 status = decode_compound_hdr(&xdr, &hdr);
5272 if (status) 5201 if (status)
5273 goto out; 5202 goto out;
5274 status = decode_sequence(&xdr, &res->seq_res, rqstp); 5203 status = decode_sequence(xdr, &res->seq_res, rqstp);
5275 if (status) 5204 if (status)
5276 goto out; 5205 goto out;
5277 if ((status = decode_putfh(&xdr)) != 0) 5206 status = decode_putfh(xdr);
5207 if (status)
5278 goto out; 5208 goto out;
5279 if ((status = decode_savefh(&xdr)) != 0) 5209 status = decode_savefh(xdr);
5210 if (status)
5280 goto out; 5211 goto out;
5281 if ((status = decode_putfh(&xdr)) != 0) 5212 status = decode_putfh(xdr);
5213 if (status)
5282 goto out; 5214 goto out;
5283 if ((status = decode_link(&xdr, &res->cinfo)) != 0) 5215 status = decode_link(xdr, &res->cinfo);
5216 if (status)
5284 goto out; 5217 goto out;
5285 /* 5218 /*
5286 * Note order: OP_LINK leaves the directory as the current 5219 * Note order: OP_LINK leaves the directory as the current
5287 * filehandle. 5220 * filehandle.
5288 */ 5221 */
5289 if (decode_getfattr(&xdr, res->dir_attr, res->server, 5222 if (decode_getfattr(xdr, res->dir_attr, res->server,
5290 !RPC_IS_ASYNC(rqstp->rq_task)) != 0) 5223 !RPC_IS_ASYNC(rqstp->rq_task)) != 0)
5291 goto out; 5224 goto out;
5292 if ((status = decode_restorefh(&xdr)) != 0) 5225 status = decode_restorefh(xdr);
5226 if (status)
5293 goto out; 5227 goto out;
5294 decode_getfattr(&xdr, res->fattr, res->server, 5228 decode_getfattr(xdr, res->fattr, res->server,
5295 !RPC_IS_ASYNC(rqstp->rq_task)); 5229 !RPC_IS_ASYNC(rqstp->rq_task));
5296out: 5230out:
5297 return status; 5231 return status;
@@ -5300,33 +5234,37 @@ out:
5300/* 5234/*
5301 * Decode CREATE response 5235 * Decode CREATE response
5302 */ 5236 */
5303static int nfs4_xdr_dec_create(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_create_res *res) 5237static int nfs4_xdr_dec_create(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
5238 struct nfs4_create_res *res)
5304{ 5239{
5305 struct xdr_stream xdr;
5306 struct compound_hdr hdr; 5240 struct compound_hdr hdr;
5307 int status; 5241 int status;
5308 5242
5309 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 5243 status = decode_compound_hdr(xdr, &hdr);
5310 status = decode_compound_hdr(&xdr, &hdr);
5311 if (status) 5244 if (status)
5312 goto out; 5245 goto out;
5313 status = decode_sequence(&xdr, &res->seq_res, rqstp); 5246 status = decode_sequence(xdr, &res->seq_res, rqstp);
5314 if (status) 5247 if (status)
5315 goto out; 5248 goto out;
5316 if ((status = decode_putfh(&xdr)) != 0) 5249 status = decode_putfh(xdr);
5250 if (status)
5317 goto out; 5251 goto out;
5318 if ((status = decode_savefh(&xdr)) != 0) 5252 status = decode_savefh(xdr);
5253 if (status)
5319 goto out; 5254 goto out;
5320 if ((status = decode_create(&xdr,&res->dir_cinfo)) != 0) 5255 status = decode_create(xdr, &res->dir_cinfo);
5256 if (status)
5321 goto out; 5257 goto out;
5322 if ((status = decode_getfh(&xdr, res->fh)) != 0) 5258 status = decode_getfh(xdr, res->fh);
5259 if (status)
5323 goto out; 5260 goto out;
5324 if (decode_getfattr(&xdr, res->fattr, res->server, 5261 if (decode_getfattr(xdr, res->fattr, res->server,
5325 !RPC_IS_ASYNC(rqstp->rq_task)) != 0) 5262 !RPC_IS_ASYNC(rqstp->rq_task)) != 0)
5326 goto out; 5263 goto out;
5327 if ((status = decode_restorefh(&xdr)) != 0) 5264 status = decode_restorefh(xdr);
5265 if (status)
5328 goto out; 5266 goto out;
5329 decode_getfattr(&xdr, res->dir_fattr, res->server, 5267 decode_getfattr(xdr, res->dir_fattr, res->server,
5330 !RPC_IS_ASYNC(rqstp->rq_task)); 5268 !RPC_IS_ASYNC(rqstp->rq_task));
5331out: 5269out:
5332 return status; 5270 return status;
@@ -5335,31 +5273,31 @@ out:
5335/* 5273/*
5336 * Decode SYMLINK response 5274 * Decode SYMLINK response
5337 */ 5275 */
5338static int nfs4_xdr_dec_symlink(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_create_res *res) 5276static int nfs4_xdr_dec_symlink(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
5277 struct nfs4_create_res *res)
5339{ 5278{
5340 return nfs4_xdr_dec_create(rqstp, p, res); 5279 return nfs4_xdr_dec_create(rqstp, xdr, res);
5341} 5280}
5342 5281
5343/* 5282/*
5344 * Decode GETATTR response 5283 * Decode GETATTR response
5345 */ 5284 */
5346static int nfs4_xdr_dec_getattr(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_getattr_res *res) 5285static int nfs4_xdr_dec_getattr(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
5286 struct nfs4_getattr_res *res)
5347{ 5287{
5348 struct xdr_stream xdr;
5349 struct compound_hdr hdr; 5288 struct compound_hdr hdr;
5350 int status; 5289 int status;
5351 5290
5352 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 5291 status = decode_compound_hdr(xdr, &hdr);
5353 status = decode_compound_hdr(&xdr, &hdr);
5354 if (status) 5292 if (status)
5355 goto out; 5293 goto out;
5356 status = decode_sequence(&xdr, &res->seq_res, rqstp); 5294 status = decode_sequence(xdr, &res->seq_res, rqstp);
5357 if (status) 5295 if (status)
5358 goto out; 5296 goto out;
5359 status = decode_putfh(&xdr); 5297 status = decode_putfh(xdr);
5360 if (status) 5298 if (status)
5361 goto out; 5299 goto out;
5362 status = decode_getfattr(&xdr, res->fattr, res->server, 5300 status = decode_getfattr(xdr, res->fattr, res->server,
5363 !RPC_IS_ASYNC(rqstp->rq_task)); 5301 !RPC_IS_ASYNC(rqstp->rq_task));
5364out: 5302out:
5365 return status; 5303 return status;
@@ -5368,46 +5306,40 @@ out:
5368/* 5306/*
5369 * Encode an SETACL request 5307 * Encode an SETACL request
5370 */ 5308 */
5371static int 5309static void nfs4_xdr_enc_setacl(struct rpc_rqst *req, struct xdr_stream *xdr,
5372nfs4_xdr_enc_setacl(struct rpc_rqst *req, __be32 *p, struct nfs_setaclargs *args) 5310 struct nfs_setaclargs *args)
5373{ 5311{
5374 struct xdr_stream xdr;
5375 struct compound_hdr hdr = { 5312 struct compound_hdr hdr = {
5376 .minorversion = nfs4_xdr_minorversion(&args->seq_args), 5313 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
5377 }; 5314 };
5378 int status;
5379 5315
5380 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 5316 encode_compound_hdr(xdr, req, &hdr);
5381 encode_compound_hdr(&xdr, req, &hdr); 5317 encode_sequence(xdr, &args->seq_args, &hdr);
5382 encode_sequence(&xdr, &args->seq_args, &hdr); 5318 encode_putfh(xdr, args->fh, &hdr);
5383 encode_putfh(&xdr, args->fh, &hdr); 5319 encode_setacl(xdr, args, &hdr);
5384 status = encode_setacl(&xdr, args, &hdr);
5385 encode_nops(&hdr); 5320 encode_nops(&hdr);
5386 return status;
5387} 5321}
5388 5322
5389/* 5323/*
5390 * Decode SETACL response 5324 * Decode SETACL response
5391 */ 5325 */
5392static int 5326static int
5393nfs4_xdr_dec_setacl(struct rpc_rqst *rqstp, __be32 *p, 5327nfs4_xdr_dec_setacl(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
5394 struct nfs_setaclres *res) 5328 struct nfs_setaclres *res)
5395{ 5329{
5396 struct xdr_stream xdr;
5397 struct compound_hdr hdr; 5330 struct compound_hdr hdr;
5398 int status; 5331 int status;
5399 5332
5400 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 5333 status = decode_compound_hdr(xdr, &hdr);
5401 status = decode_compound_hdr(&xdr, &hdr);
5402 if (status) 5334 if (status)
5403 goto out; 5335 goto out;
5404 status = decode_sequence(&xdr, &res->seq_res, rqstp); 5336 status = decode_sequence(xdr, &res->seq_res, rqstp);
5405 if (status) 5337 if (status)
5406 goto out; 5338 goto out;
5407 status = decode_putfh(&xdr); 5339 status = decode_putfh(xdr);
5408 if (status) 5340 if (status)
5409 goto out; 5341 goto out;
5410 status = decode_setattr(&xdr); 5342 status = decode_setattr(xdr);
5411out: 5343out:
5412 return status; 5344 return status;
5413} 5345}
@@ -5416,24 +5348,22 @@ out:
5416 * Decode GETACL response 5348 * Decode GETACL response
5417 */ 5349 */
5418static int 5350static int
5419nfs4_xdr_dec_getacl(struct rpc_rqst *rqstp, __be32 *p, 5351nfs4_xdr_dec_getacl(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
5420 struct nfs_getaclres *res) 5352 struct nfs_getaclres *res)
5421{ 5353{
5422 struct xdr_stream xdr;
5423 struct compound_hdr hdr; 5354 struct compound_hdr hdr;
5424 int status; 5355 int status;
5425 5356
5426 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 5357 status = decode_compound_hdr(xdr, &hdr);
5427 status = decode_compound_hdr(&xdr, &hdr);
5428 if (status) 5358 if (status)
5429 goto out; 5359 goto out;
5430 status = decode_sequence(&xdr, &res->seq_res, rqstp); 5360 status = decode_sequence(xdr, &res->seq_res, rqstp);
5431 if (status) 5361 if (status)
5432 goto out; 5362 goto out;
5433 status = decode_putfh(&xdr); 5363 status = decode_putfh(xdr);
5434 if (status) 5364 if (status)
5435 goto out; 5365 goto out;
5436 status = decode_getacl(&xdr, rqstp, &res->acl_len); 5366 status = decode_getacl(xdr, rqstp, &res->acl_len);
5437 5367
5438out: 5368out:
5439 return status; 5369 return status;
@@ -5442,23 +5372,22 @@ out:
5442/* 5372/*
5443 * Decode CLOSE response 5373 * Decode CLOSE response
5444 */ 5374 */
5445static int nfs4_xdr_dec_close(struct rpc_rqst *rqstp, __be32 *p, struct nfs_closeres *res) 5375static int nfs4_xdr_dec_close(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
5376 struct nfs_closeres *res)
5446{ 5377{
5447 struct xdr_stream xdr;
5448 struct compound_hdr hdr; 5378 struct compound_hdr hdr;
5449 int status; 5379 int status;
5450 5380
5451 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 5381 status = decode_compound_hdr(xdr, &hdr);
5452 status = decode_compound_hdr(&xdr, &hdr);
5453 if (status) 5382 if (status)
5454 goto out; 5383 goto out;
5455 status = decode_sequence(&xdr, &res->seq_res, rqstp); 5384 status = decode_sequence(xdr, &res->seq_res, rqstp);
5456 if (status) 5385 if (status)
5457 goto out; 5386 goto out;
5458 status = decode_putfh(&xdr); 5387 status = decode_putfh(xdr);
5459 if (status) 5388 if (status)
5460 goto out; 5389 goto out;
5461 status = decode_close(&xdr, res); 5390 status = decode_close(xdr, res);
5462 if (status != 0) 5391 if (status != 0)
5463 goto out; 5392 goto out;
5464 /* 5393 /*
@@ -5467,7 +5396,7 @@ static int nfs4_xdr_dec_close(struct rpc_rqst *rqstp, __be32 *p, struct nfs_clos
5467 * an ESTALE error. Shouldn't be a problem, 5396 * an ESTALE error. Shouldn't be a problem,
5468 * though, since fattr->valid will remain unset. 5397 * though, since fattr->valid will remain unset.
5469 */ 5398 */
5470 decode_getfattr(&xdr, res->fattr, res->server, 5399 decode_getfattr(xdr, res->fattr, res->server,
5471 !RPC_IS_ASYNC(rqstp->rq_task)); 5400 !RPC_IS_ASYNC(rqstp->rq_task));
5472out: 5401out:
5473 return status; 5402 return status;
@@ -5476,36 +5405,35 @@ out:
5476/* 5405/*
5477 * Decode OPEN response 5406 * Decode OPEN response
5478 */ 5407 */
5479static int nfs4_xdr_dec_open(struct rpc_rqst *rqstp, __be32 *p, struct nfs_openres *res) 5408static int nfs4_xdr_dec_open(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
5409 struct nfs_openres *res)
5480{ 5410{
5481 struct xdr_stream xdr;
5482 struct compound_hdr hdr; 5411 struct compound_hdr hdr;
5483 int status; 5412 int status;
5484 5413
5485 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 5414 status = decode_compound_hdr(xdr, &hdr);
5486 status = decode_compound_hdr(&xdr, &hdr);
5487 if (status) 5415 if (status)
5488 goto out; 5416 goto out;
5489 status = decode_sequence(&xdr, &res->seq_res, rqstp); 5417 status = decode_sequence(xdr, &res->seq_res, rqstp);
5490 if (status) 5418 if (status)
5491 goto out; 5419 goto out;
5492 status = decode_putfh(&xdr); 5420 status = decode_putfh(xdr);
5493 if (status) 5421 if (status)
5494 goto out; 5422 goto out;
5495 status = decode_savefh(&xdr); 5423 status = decode_savefh(xdr);
5496 if (status) 5424 if (status)
5497 goto out; 5425 goto out;
5498 status = decode_open(&xdr, res); 5426 status = decode_open(xdr, res);
5499 if (status) 5427 if (status)
5500 goto out; 5428 goto out;
5501 if (decode_getfh(&xdr, &res->fh) != 0) 5429 if (decode_getfh(xdr, &res->fh) != 0)
5502 goto out; 5430 goto out;
5503 if (decode_getfattr(&xdr, res->f_attr, res->server, 5431 if (decode_getfattr(xdr, res->f_attr, res->server,
5504 !RPC_IS_ASYNC(rqstp->rq_task)) != 0) 5432 !RPC_IS_ASYNC(rqstp->rq_task)) != 0)
5505 goto out; 5433 goto out;
5506 if (decode_restorefh(&xdr) != 0) 5434 if (decode_restorefh(xdr) != 0)
5507 goto out; 5435 goto out;
5508 decode_getfattr(&xdr, res->dir_attr, res->server, 5436 decode_getfattr(xdr, res->dir_attr, res->server,
5509 !RPC_IS_ASYNC(rqstp->rq_task)); 5437 !RPC_IS_ASYNC(rqstp->rq_task));
5510out: 5438out:
5511 return status; 5439 return status;
@@ -5514,20 +5442,20 @@ out:
5514/* 5442/*
5515 * Decode OPEN_CONFIRM response 5443 * Decode OPEN_CONFIRM response
5516 */ 5444 */
5517static int nfs4_xdr_dec_open_confirm(struct rpc_rqst *rqstp, __be32 *p, struct nfs_open_confirmres *res) 5445static int nfs4_xdr_dec_open_confirm(struct rpc_rqst *rqstp,
5446 struct xdr_stream *xdr,
5447 struct nfs_open_confirmres *res)
5518{ 5448{
5519 struct xdr_stream xdr;
5520 struct compound_hdr hdr; 5449 struct compound_hdr hdr;
5521 int status; 5450 int status;
5522 5451
5523 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 5452 status = decode_compound_hdr(xdr, &hdr);
5524 status = decode_compound_hdr(&xdr, &hdr);
5525 if (status) 5453 if (status)
5526 goto out; 5454 goto out;
5527 status = decode_putfh(&xdr); 5455 status = decode_putfh(xdr);
5528 if (status) 5456 if (status)
5529 goto out; 5457 goto out;
5530 status = decode_open_confirm(&xdr, res); 5458 status = decode_open_confirm(xdr, res);
5531out: 5459out:
5532 return status; 5460 return status;
5533} 5461}
@@ -5535,26 +5463,26 @@ out:
5535/* 5463/*
5536 * Decode OPEN response 5464 * Decode OPEN response
5537 */ 5465 */
5538static int nfs4_xdr_dec_open_noattr(struct rpc_rqst *rqstp, __be32 *p, struct nfs_openres *res) 5466static int nfs4_xdr_dec_open_noattr(struct rpc_rqst *rqstp,
5467 struct xdr_stream *xdr,
5468 struct nfs_openres *res)
5539{ 5469{
5540 struct xdr_stream xdr;
5541 struct compound_hdr hdr; 5470 struct compound_hdr hdr;
5542 int status; 5471 int status;
5543 5472
5544 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 5473 status = decode_compound_hdr(xdr, &hdr);
5545 status = decode_compound_hdr(&xdr, &hdr);
5546 if (status) 5474 if (status)
5547 goto out; 5475 goto out;
5548 status = decode_sequence(&xdr, &res->seq_res, rqstp); 5476 status = decode_sequence(xdr, &res->seq_res, rqstp);
5549 if (status) 5477 if (status)
5550 goto out; 5478 goto out;
5551 status = decode_putfh(&xdr); 5479 status = decode_putfh(xdr);
5552 if (status) 5480 if (status)
5553 goto out; 5481 goto out;
5554 status = decode_open(&xdr, res); 5482 status = decode_open(xdr, res);
5555 if (status) 5483 if (status)
5556 goto out; 5484 goto out;
5557 decode_getfattr(&xdr, res->f_attr, res->server, 5485 decode_getfattr(xdr, res->f_attr, res->server,
5558 !RPC_IS_ASYNC(rqstp->rq_task)); 5486 !RPC_IS_ASYNC(rqstp->rq_task));
5559out: 5487out:
5560 return status; 5488 return status;
@@ -5563,26 +5491,26 @@ out:
5563/* 5491/*
5564 * Decode SETATTR response 5492 * Decode SETATTR response
5565 */ 5493 */
5566static int nfs4_xdr_dec_setattr(struct rpc_rqst *rqstp, __be32 *p, struct nfs_setattrres *res) 5494static int nfs4_xdr_dec_setattr(struct rpc_rqst *rqstp,
5495 struct xdr_stream *xdr,
5496 struct nfs_setattrres *res)
5567{ 5497{
5568 struct xdr_stream xdr;
5569 struct compound_hdr hdr; 5498 struct compound_hdr hdr;
5570 int status; 5499 int status;
5571 5500
5572 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 5501 status = decode_compound_hdr(xdr, &hdr);
5573 status = decode_compound_hdr(&xdr, &hdr);
5574 if (status) 5502 if (status)
5575 goto out; 5503 goto out;
5576 status = decode_sequence(&xdr, &res->seq_res, rqstp); 5504 status = decode_sequence(xdr, &res->seq_res, rqstp);
5577 if (status) 5505 if (status)
5578 goto out; 5506 goto out;
5579 status = decode_putfh(&xdr); 5507 status = decode_putfh(xdr);
5580 if (status) 5508 if (status)
5581 goto out; 5509 goto out;
5582 status = decode_setattr(&xdr); 5510 status = decode_setattr(xdr);
5583 if (status) 5511 if (status)
5584 goto out; 5512 goto out;
5585 decode_getfattr(&xdr, res->fattr, res->server, 5513 decode_getfattr(xdr, res->fattr, res->server,
5586 !RPC_IS_ASYNC(rqstp->rq_task)); 5514 !RPC_IS_ASYNC(rqstp->rq_task));
5587out: 5515out:
5588 return status; 5516 return status;
@@ -5591,23 +5519,22 @@ out:
5591/* 5519/*
5592 * Decode LOCK response 5520 * Decode LOCK response
5593 */ 5521 */
5594static int nfs4_xdr_dec_lock(struct rpc_rqst *rqstp, __be32 *p, struct nfs_lock_res *res) 5522static int nfs4_xdr_dec_lock(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
5523 struct nfs_lock_res *res)
5595{ 5524{
5596 struct xdr_stream xdr;
5597 struct compound_hdr hdr; 5525 struct compound_hdr hdr;
5598 int status; 5526 int status;
5599 5527
5600 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 5528 status = decode_compound_hdr(xdr, &hdr);
5601 status = decode_compound_hdr(&xdr, &hdr);
5602 if (status) 5529 if (status)
5603 goto out; 5530 goto out;
5604 status = decode_sequence(&xdr, &res->seq_res, rqstp); 5531 status = decode_sequence(xdr, &res->seq_res, rqstp);
5605 if (status) 5532 if (status)
5606 goto out; 5533 goto out;
5607 status = decode_putfh(&xdr); 5534 status = decode_putfh(xdr);
5608 if (status) 5535 if (status)
5609 goto out; 5536 goto out;
5610 status = decode_lock(&xdr, res); 5537 status = decode_lock(xdr, res);
5611out: 5538out:
5612 return status; 5539 return status;
5613} 5540}
@@ -5615,23 +5542,22 @@ out:
5615/* 5542/*
5616 * Decode LOCKT response 5543 * Decode LOCKT response
5617 */ 5544 */
5618static int nfs4_xdr_dec_lockt(struct rpc_rqst *rqstp, __be32 *p, struct nfs_lockt_res *res) 5545static int nfs4_xdr_dec_lockt(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
5546 struct nfs_lockt_res *res)
5619{ 5547{
5620 struct xdr_stream xdr;
5621 struct compound_hdr hdr; 5548 struct compound_hdr hdr;
5622 int status; 5549 int status;
5623 5550
5624 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 5551 status = decode_compound_hdr(xdr, &hdr);
5625 status = decode_compound_hdr(&xdr, &hdr);
5626 if (status) 5552 if (status)
5627 goto out; 5553 goto out;
5628 status = decode_sequence(&xdr, &res->seq_res, rqstp); 5554 status = decode_sequence(xdr, &res->seq_res, rqstp);
5629 if (status) 5555 if (status)
5630 goto out; 5556 goto out;
5631 status = decode_putfh(&xdr); 5557 status = decode_putfh(xdr);
5632 if (status) 5558 if (status)
5633 goto out; 5559 goto out;
5634 status = decode_lockt(&xdr, res); 5560 status = decode_lockt(xdr, res);
5635out: 5561out:
5636 return status; 5562 return status;
5637} 5563}
@@ -5639,61 +5565,58 @@ out:
5639/* 5565/*
5640 * Decode LOCKU response 5566 * Decode LOCKU response
5641 */ 5567 */
5642static int nfs4_xdr_dec_locku(struct rpc_rqst *rqstp, __be32 *p, struct nfs_locku_res *res) 5568static int nfs4_xdr_dec_locku(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
5569 struct nfs_locku_res *res)
5643{ 5570{
5644 struct xdr_stream xdr;
5645 struct compound_hdr hdr; 5571 struct compound_hdr hdr;
5646 int status; 5572 int status;
5647 5573
5648 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 5574 status = decode_compound_hdr(xdr, &hdr);
5649 status = decode_compound_hdr(&xdr, &hdr);
5650 if (status) 5575 if (status)
5651 goto out; 5576 goto out;
5652 status = decode_sequence(&xdr, &res->seq_res, rqstp); 5577 status = decode_sequence(xdr, &res->seq_res, rqstp);
5653 if (status) 5578 if (status)
5654 goto out; 5579 goto out;
5655 status = decode_putfh(&xdr); 5580 status = decode_putfh(xdr);
5656 if (status) 5581 if (status)
5657 goto out; 5582 goto out;
5658 status = decode_locku(&xdr, res); 5583 status = decode_locku(xdr, res);
5659out: 5584out:
5660 return status; 5585 return status;
5661} 5586}
5662 5587
5663static int nfs4_xdr_dec_release_lockowner(struct rpc_rqst *rqstp, __be32 *p, void *dummy) 5588static int nfs4_xdr_dec_release_lockowner(struct rpc_rqst *rqstp,
5589 struct xdr_stream *xdr, void *dummy)
5664{ 5590{
5665 struct xdr_stream xdr;
5666 struct compound_hdr hdr; 5591 struct compound_hdr hdr;
5667 int status; 5592 int status;
5668 5593
5669 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 5594 status = decode_compound_hdr(xdr, &hdr);
5670 status = decode_compound_hdr(&xdr, &hdr);
5671 if (!status) 5595 if (!status)
5672 status = decode_release_lockowner(&xdr); 5596 status = decode_release_lockowner(xdr);
5673 return status; 5597 return status;
5674} 5598}
5675 5599
5676/* 5600/*
5677 * Decode READLINK response 5601 * Decode READLINK response
5678 */ 5602 */
5679static int nfs4_xdr_dec_readlink(struct rpc_rqst *rqstp, __be32 *p, 5603static int nfs4_xdr_dec_readlink(struct rpc_rqst *rqstp,
5604 struct xdr_stream *xdr,
5680 struct nfs4_readlink_res *res) 5605 struct nfs4_readlink_res *res)
5681{ 5606{
5682 struct xdr_stream xdr;
5683 struct compound_hdr hdr; 5607 struct compound_hdr hdr;
5684 int status; 5608 int status;
5685 5609
5686 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 5610 status = decode_compound_hdr(xdr, &hdr);
5687 status = decode_compound_hdr(&xdr, &hdr);
5688 if (status) 5611 if (status)
5689 goto out; 5612 goto out;
5690 status = decode_sequence(&xdr, &res->seq_res, rqstp); 5613 status = decode_sequence(xdr, &res->seq_res, rqstp);
5691 if (status) 5614 if (status)
5692 goto out; 5615 goto out;
5693 status = decode_putfh(&xdr); 5616 status = decode_putfh(xdr);
5694 if (status) 5617 if (status)
5695 goto out; 5618 goto out;
5696 status = decode_readlink(&xdr, rqstp); 5619 status = decode_readlink(xdr, rqstp);
5697out: 5620out:
5698 return status; 5621 return status;
5699} 5622}
@@ -5701,23 +5624,22 @@ out:
5701/* 5624/*
5702 * Decode READDIR response 5625 * Decode READDIR response
5703 */ 5626 */
5704static int nfs4_xdr_dec_readdir(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_readdir_res *res) 5627static int nfs4_xdr_dec_readdir(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
5628 struct nfs4_readdir_res *res)
5705{ 5629{
5706 struct xdr_stream xdr;
5707 struct compound_hdr hdr; 5630 struct compound_hdr hdr;
5708 int status; 5631 int status;
5709 5632
5710 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 5633 status = decode_compound_hdr(xdr, &hdr);
5711 status = decode_compound_hdr(&xdr, &hdr);
5712 if (status) 5634 if (status)
5713 goto out; 5635 goto out;
5714 status = decode_sequence(&xdr, &res->seq_res, rqstp); 5636 status = decode_sequence(xdr, &res->seq_res, rqstp);
5715 if (status) 5637 if (status)
5716 goto out; 5638 goto out;
5717 status = decode_putfh(&xdr); 5639 status = decode_putfh(xdr);
5718 if (status) 5640 if (status)
5719 goto out; 5641 goto out;
5720 status = decode_readdir(&xdr, rqstp, res); 5642 status = decode_readdir(xdr, rqstp, res);
5721out: 5643out:
5722 return status; 5644 return status;
5723} 5645}
@@ -5725,23 +5647,22 @@ out:
5725/* 5647/*
5726 * Decode Read response 5648 * Decode Read response
5727 */ 5649 */
5728static int nfs4_xdr_dec_read(struct rpc_rqst *rqstp, __be32 *p, struct nfs_readres *res) 5650static int nfs4_xdr_dec_read(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
5651 struct nfs_readres *res)
5729{ 5652{
5730 struct xdr_stream xdr;
5731 struct compound_hdr hdr; 5653 struct compound_hdr hdr;
5732 int status; 5654 int status;
5733 5655
5734 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 5656 status = decode_compound_hdr(xdr, &hdr);
5735 status = decode_compound_hdr(&xdr, &hdr);
5736 if (status) 5657 if (status)
5737 goto out; 5658 goto out;
5738 status = decode_sequence(&xdr, &res->seq_res, rqstp); 5659 status = decode_sequence(xdr, &res->seq_res, rqstp);
5739 if (status) 5660 if (status)
5740 goto out; 5661 goto out;
5741 status = decode_putfh(&xdr); 5662 status = decode_putfh(xdr);
5742 if (status) 5663 if (status)
5743 goto out; 5664 goto out;
5744 status = decode_read(&xdr, rqstp, res); 5665 status = decode_read(xdr, rqstp, res);
5745 if (!status) 5666 if (!status)
5746 status = res->count; 5667 status = res->count;
5747out: 5668out:
@@ -5751,26 +5672,25 @@ out:
5751/* 5672/*
5752 * Decode WRITE response 5673 * Decode WRITE response
5753 */ 5674 */
5754static int nfs4_xdr_dec_write(struct rpc_rqst *rqstp, __be32 *p, struct nfs_writeres *res) 5675static int nfs4_xdr_dec_write(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
5676 struct nfs_writeres *res)
5755{ 5677{
5756 struct xdr_stream xdr;
5757 struct compound_hdr hdr; 5678 struct compound_hdr hdr;
5758 int status; 5679 int status;
5759 5680
5760 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 5681 status = decode_compound_hdr(xdr, &hdr);
5761 status = decode_compound_hdr(&xdr, &hdr);
5762 if (status) 5682 if (status)
5763 goto out; 5683 goto out;
5764 status = decode_sequence(&xdr, &res->seq_res, rqstp); 5684 status = decode_sequence(xdr, &res->seq_res, rqstp);
5765 if (status) 5685 if (status)
5766 goto out; 5686 goto out;
5767 status = decode_putfh(&xdr); 5687 status = decode_putfh(xdr);
5768 if (status) 5688 if (status)
5769 goto out; 5689 goto out;
5770 status = decode_write(&xdr, res); 5690 status = decode_write(xdr, res);
5771 if (status) 5691 if (status)
5772 goto out; 5692 goto out;
5773 decode_getfattr(&xdr, res->fattr, res->server, 5693 decode_getfattr(xdr, res->fattr, res->server,
5774 !RPC_IS_ASYNC(rqstp->rq_task)); 5694 !RPC_IS_ASYNC(rqstp->rq_task));
5775 if (!status) 5695 if (!status)
5776 status = res->count; 5696 status = res->count;
@@ -5781,26 +5701,25 @@ out:
5781/* 5701/*
5782 * Decode COMMIT response 5702 * Decode COMMIT response
5783 */ 5703 */
5784static int nfs4_xdr_dec_commit(struct rpc_rqst *rqstp, __be32 *p, struct nfs_writeres *res) 5704static int nfs4_xdr_dec_commit(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
5705 struct nfs_writeres *res)
5785{ 5706{
5786 struct xdr_stream xdr;
5787 struct compound_hdr hdr; 5707 struct compound_hdr hdr;
5788 int status; 5708 int status;
5789 5709
5790 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 5710 status = decode_compound_hdr(xdr, &hdr);
5791 status = decode_compound_hdr(&xdr, &hdr);
5792 if (status) 5711 if (status)
5793 goto out; 5712 goto out;
5794 status = decode_sequence(&xdr, &res->seq_res, rqstp); 5713 status = decode_sequence(xdr, &res->seq_res, rqstp);
5795 if (status) 5714 if (status)
5796 goto out; 5715 goto out;
5797 status = decode_putfh(&xdr); 5716 status = decode_putfh(xdr);
5798 if (status) 5717 if (status)
5799 goto out; 5718 goto out;
5800 status = decode_commit(&xdr, res); 5719 status = decode_commit(xdr, res);
5801 if (status) 5720 if (status)
5802 goto out; 5721 goto out;
5803 decode_getfattr(&xdr, res->fattr, res->server, 5722 decode_getfattr(xdr, res->fattr, res->server,
5804 !RPC_IS_ASYNC(rqstp->rq_task)); 5723 !RPC_IS_ASYNC(rqstp->rq_task));
5805out: 5724out:
5806 return status; 5725 return status;
@@ -5809,85 +5728,80 @@ out:
5809/* 5728/*
5810 * Decode FSINFO response 5729 * Decode FSINFO response
5811 */ 5730 */
5812static int nfs4_xdr_dec_fsinfo(struct rpc_rqst *req, __be32 *p, 5731static int nfs4_xdr_dec_fsinfo(struct rpc_rqst *req, struct xdr_stream *xdr,
5813 struct nfs4_fsinfo_res *res) 5732 struct nfs4_fsinfo_res *res)
5814{ 5733{
5815 struct xdr_stream xdr;
5816 struct compound_hdr hdr; 5734 struct compound_hdr hdr;
5817 int status; 5735 int status;
5818 5736
5819 xdr_init_decode(&xdr, &req->rq_rcv_buf, p); 5737 status = decode_compound_hdr(xdr, &hdr);
5820 status = decode_compound_hdr(&xdr, &hdr);
5821 if (!status) 5738 if (!status)
5822 status = decode_sequence(&xdr, &res->seq_res, req); 5739 status = decode_sequence(xdr, &res->seq_res, req);
5823 if (!status) 5740 if (!status)
5824 status = decode_putfh(&xdr); 5741 status = decode_putfh(xdr);
5825 if (!status) 5742 if (!status)
5826 status = decode_fsinfo(&xdr, res->fsinfo); 5743 status = decode_fsinfo(xdr, res->fsinfo);
5827 return status; 5744 return status;
5828} 5745}
5829 5746
5830/* 5747/*
5831 * Decode PATHCONF response 5748 * Decode PATHCONF response
5832 */ 5749 */
5833static int nfs4_xdr_dec_pathconf(struct rpc_rqst *req, __be32 *p, 5750static int nfs4_xdr_dec_pathconf(struct rpc_rqst *req, struct xdr_stream *xdr,
5834 struct nfs4_pathconf_res *res) 5751 struct nfs4_pathconf_res *res)
5835{ 5752{
5836 struct xdr_stream xdr;
5837 struct compound_hdr hdr; 5753 struct compound_hdr hdr;
5838 int status; 5754 int status;
5839 5755
5840 xdr_init_decode(&xdr, &req->rq_rcv_buf, p); 5756 status = decode_compound_hdr(xdr, &hdr);
5841 status = decode_compound_hdr(&xdr, &hdr);
5842 if (!status) 5757 if (!status)
5843 status = decode_sequence(&xdr, &res->seq_res, req); 5758 status = decode_sequence(xdr, &res->seq_res, req);
5844 if (!status) 5759 if (!status)
5845 status = decode_putfh(&xdr); 5760 status = decode_putfh(xdr);
5846 if (!status) 5761 if (!status)
5847 status = decode_pathconf(&xdr, res->pathconf); 5762 status = decode_pathconf(xdr, res->pathconf);
5848 return status; 5763 return status;
5849} 5764}
5850 5765
5851/* 5766/*
5852 * Decode STATFS response 5767 * Decode STATFS response
5853 */ 5768 */
5854static int nfs4_xdr_dec_statfs(struct rpc_rqst *req, __be32 *p, 5769static int nfs4_xdr_dec_statfs(struct rpc_rqst *req, struct xdr_stream *xdr,
5855 struct nfs4_statfs_res *res) 5770 struct nfs4_statfs_res *res)
5856{ 5771{
5857 struct xdr_stream xdr;
5858 struct compound_hdr hdr; 5772 struct compound_hdr hdr;
5859 int status; 5773 int status;
5860 5774
5861 xdr_init_decode(&xdr, &req->rq_rcv_buf, p); 5775 status = decode_compound_hdr(xdr, &hdr);
5862 status = decode_compound_hdr(&xdr, &hdr);
5863 if (!status) 5776 if (!status)
5864 status = decode_sequence(&xdr, &res->seq_res, req); 5777 status = decode_sequence(xdr, &res->seq_res, req);
5865 if (!status) 5778 if (!status)
5866 status = decode_putfh(&xdr); 5779 status = decode_putfh(xdr);
5867 if (!status) 5780 if (!status)
5868 status = decode_statfs(&xdr, res->fsstat); 5781 status = decode_statfs(xdr, res->fsstat);
5869 return status; 5782 return status;
5870} 5783}
5871 5784
5872/* 5785/*
5873 * Decode GETATTR_BITMAP response 5786 * Decode GETATTR_BITMAP response
5874 */ 5787 */
5875static int nfs4_xdr_dec_server_caps(struct rpc_rqst *req, __be32 *p, struct nfs4_server_caps_res *res) 5788static int nfs4_xdr_dec_server_caps(struct rpc_rqst *req,
5789 struct xdr_stream *xdr,
5790 struct nfs4_server_caps_res *res)
5876{ 5791{
5877 struct xdr_stream xdr;
5878 struct compound_hdr hdr; 5792 struct compound_hdr hdr;
5879 int status; 5793 int status;
5880 5794
5881 xdr_init_decode(&xdr, &req->rq_rcv_buf, p); 5795 status = decode_compound_hdr(xdr, &hdr);
5882 status = decode_compound_hdr(&xdr, &hdr);
5883 if (status) 5796 if (status)
5884 goto out; 5797 goto out;
5885 status = decode_sequence(&xdr, &res->seq_res, req); 5798 status = decode_sequence(xdr, &res->seq_res, req);
5886 if (status) 5799 if (status)
5887 goto out; 5800 goto out;
5888 if ((status = decode_putfh(&xdr)) != 0) 5801 status = decode_putfh(xdr);
5802 if (status)
5889 goto out; 5803 goto out;
5890 status = decode_server_caps(&xdr, res); 5804 status = decode_server_caps(xdr, res);
5891out: 5805out:
5892 return status; 5806 return status;
5893} 5807}
@@ -5895,79 +5809,77 @@ out:
5895/* 5809/*
5896 * Decode RENEW response 5810 * Decode RENEW response
5897 */ 5811 */
5898static int nfs4_xdr_dec_renew(struct rpc_rqst *rqstp, __be32 *p, void *dummy) 5812static int nfs4_xdr_dec_renew(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
5813 void *__unused)
5899{ 5814{
5900 struct xdr_stream xdr;
5901 struct compound_hdr hdr; 5815 struct compound_hdr hdr;
5902 int status; 5816 int status;
5903 5817
5904 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 5818 status = decode_compound_hdr(xdr, &hdr);
5905 status = decode_compound_hdr(&xdr, &hdr);
5906 if (!status) 5819 if (!status)
5907 status = decode_renew(&xdr); 5820 status = decode_renew(xdr);
5908 return status; 5821 return status;
5909} 5822}
5910 5823
5911/* 5824/*
5912 * Decode SETCLIENTID response 5825 * Decode SETCLIENTID response
5913 */ 5826 */
5914static int nfs4_xdr_dec_setclientid(struct rpc_rqst *req, __be32 *p, 5827static int nfs4_xdr_dec_setclientid(struct rpc_rqst *req,
5915 struct nfs4_setclientid_res *res) 5828 struct xdr_stream *xdr,
5829 struct nfs4_setclientid_res *res)
5916{ 5830{
5917 struct xdr_stream xdr;
5918 struct compound_hdr hdr; 5831 struct compound_hdr hdr;
5919 int status; 5832 int status;
5920 5833
5921 xdr_init_decode(&xdr, &req->rq_rcv_buf, p); 5834 status = decode_compound_hdr(xdr, &hdr);
5922 status = decode_compound_hdr(&xdr, &hdr);
5923 if (!status) 5835 if (!status)
5924 status = decode_setclientid(&xdr, res); 5836 status = decode_setclientid(xdr, res);
5925 return status; 5837 return status;
5926} 5838}
5927 5839
5928/* 5840/*
5929 * Decode SETCLIENTID_CONFIRM response 5841 * Decode SETCLIENTID_CONFIRM response
5930 */ 5842 */
5931static int nfs4_xdr_dec_setclientid_confirm(struct rpc_rqst *req, __be32 *p, struct nfs_fsinfo *fsinfo) 5843static int nfs4_xdr_dec_setclientid_confirm(struct rpc_rqst *req,
5844 struct xdr_stream *xdr,
5845 struct nfs_fsinfo *fsinfo)
5932{ 5846{
5933 struct xdr_stream xdr;
5934 struct compound_hdr hdr; 5847 struct compound_hdr hdr;
5935 int status; 5848 int status;
5936 5849
5937 xdr_init_decode(&xdr, &req->rq_rcv_buf, p); 5850 status = decode_compound_hdr(xdr, &hdr);
5938 status = decode_compound_hdr(&xdr, &hdr);
5939 if (!status) 5851 if (!status)
5940 status = decode_setclientid_confirm(&xdr); 5852 status = decode_setclientid_confirm(xdr);
5941 if (!status) 5853 if (!status)
5942 status = decode_putrootfh(&xdr); 5854 status = decode_putrootfh(xdr);
5943 if (!status) 5855 if (!status)
5944 status = decode_fsinfo(&xdr, fsinfo); 5856 status = decode_fsinfo(xdr, fsinfo);
5945 return status; 5857 return status;
5946} 5858}
5947 5859
5948/* 5860/*
5949 * Decode DELEGRETURN response 5861 * Decode DELEGRETURN response
5950 */ 5862 */
5951static int nfs4_xdr_dec_delegreturn(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_delegreturnres *res) 5863static int nfs4_xdr_dec_delegreturn(struct rpc_rqst *rqstp,
5864 struct xdr_stream *xdr,
5865 struct nfs4_delegreturnres *res)
5952{ 5866{
5953 struct xdr_stream xdr;
5954 struct compound_hdr hdr; 5867 struct compound_hdr hdr;
5955 int status; 5868 int status;
5956 5869
5957 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 5870 status = decode_compound_hdr(xdr, &hdr);
5958 status = decode_compound_hdr(&xdr, &hdr);
5959 if (status) 5871 if (status)
5960 goto out; 5872 goto out;
5961 status = decode_sequence(&xdr, &res->seq_res, rqstp); 5873 status = decode_sequence(xdr, &res->seq_res, rqstp);
5962 if (status) 5874 if (status)
5963 goto out; 5875 goto out;
5964 status = decode_putfh(&xdr); 5876 status = decode_putfh(xdr);
5965 if (status != 0) 5877 if (status != 0)
5966 goto out; 5878 goto out;
5967 status = decode_delegreturn(&xdr); 5879 status = decode_delegreturn(xdr);
5968 if (status != 0) 5880 if (status != 0)
5969 goto out; 5881 goto out;
5970 decode_getfattr(&xdr, res->fattr, res->server, 5882 decode_getfattr(xdr, res->fattr, res->server,
5971 !RPC_IS_ASYNC(rqstp->rq_task)); 5883 !RPC_IS_ASYNC(rqstp->rq_task));
5972out: 5884out:
5973 return status; 5885 return status;
@@ -5976,26 +5888,27 @@ out:
5976/* 5888/*
5977 * Decode FS_LOCATIONS response 5889 * Decode FS_LOCATIONS response
5978 */ 5890 */
5979static int nfs4_xdr_dec_fs_locations(struct rpc_rqst *req, __be32 *p, 5891static int nfs4_xdr_dec_fs_locations(struct rpc_rqst *req,
5892 struct xdr_stream *xdr,
5980 struct nfs4_fs_locations_res *res) 5893 struct nfs4_fs_locations_res *res)
5981{ 5894{
5982 struct xdr_stream xdr;
5983 struct compound_hdr hdr; 5895 struct compound_hdr hdr;
5984 int status; 5896 int status;
5985 5897
5986 xdr_init_decode(&xdr, &req->rq_rcv_buf, p); 5898 status = decode_compound_hdr(xdr, &hdr);
5987 status = decode_compound_hdr(&xdr, &hdr);
5988 if (status) 5899 if (status)
5989 goto out; 5900 goto out;
5990 status = decode_sequence(&xdr, &res->seq_res, req); 5901 status = decode_sequence(xdr, &res->seq_res, req);
5991 if (status) 5902 if (status)
5992 goto out; 5903 goto out;
5993 if ((status = decode_putfh(&xdr)) != 0) 5904 status = decode_putfh(xdr);
5905 if (status)
5994 goto out; 5906 goto out;
5995 if ((status = decode_lookup(&xdr)) != 0) 5907 status = decode_lookup(xdr);
5908 if (status)
5996 goto out; 5909 goto out;
5997 xdr_enter_page(&xdr, PAGE_SIZE); 5910 xdr_enter_page(xdr, PAGE_SIZE);
5998 status = decode_getfattr(&xdr, &res->fs_locations->fattr, 5911 status = decode_getfattr(xdr, &res->fs_locations->fattr,
5999 res->fs_locations->server, 5912 res->fs_locations->server,
6000 !RPC_IS_ASYNC(req->rq_task)); 5913 !RPC_IS_ASYNC(req->rq_task));
6001out: 5914out:
@@ -6006,129 +5919,122 @@ out:
6006/* 5919/*
6007 * Decode EXCHANGE_ID response 5920 * Decode EXCHANGE_ID response
6008 */ 5921 */
6009static int nfs4_xdr_dec_exchange_id(struct rpc_rqst *rqstp, uint32_t *p, 5922static int nfs4_xdr_dec_exchange_id(struct rpc_rqst *rqstp,
5923 struct xdr_stream *xdr,
6010 void *res) 5924 void *res)
6011{ 5925{
6012 struct xdr_stream xdr;
6013 struct compound_hdr hdr; 5926 struct compound_hdr hdr;
6014 int status; 5927 int status;
6015 5928
6016 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 5929 status = decode_compound_hdr(xdr, &hdr);
6017 status = decode_compound_hdr(&xdr, &hdr);
6018 if (!status) 5930 if (!status)
6019 status = decode_exchange_id(&xdr, res); 5931 status = decode_exchange_id(xdr, res);
6020 return status; 5932 return status;
6021} 5933}
6022 5934
6023/* 5935/*
6024 * Decode CREATE_SESSION response 5936 * Decode CREATE_SESSION response
6025 */ 5937 */
6026static int nfs4_xdr_dec_create_session(struct rpc_rqst *rqstp, uint32_t *p, 5938static int nfs4_xdr_dec_create_session(struct rpc_rqst *rqstp,
5939 struct xdr_stream *xdr,
6027 struct nfs41_create_session_res *res) 5940 struct nfs41_create_session_res *res)
6028{ 5941{
6029 struct xdr_stream xdr;
6030 struct compound_hdr hdr; 5942 struct compound_hdr hdr;
6031 int status; 5943 int status;
6032 5944
6033 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 5945 status = decode_compound_hdr(xdr, &hdr);
6034 status = decode_compound_hdr(&xdr, &hdr);
6035 if (!status) 5946 if (!status)
6036 status = decode_create_session(&xdr, res); 5947 status = decode_create_session(xdr, res);
6037 return status; 5948 return status;
6038} 5949}
6039 5950
6040/* 5951/*
6041 * Decode DESTROY_SESSION response 5952 * Decode DESTROY_SESSION response
6042 */ 5953 */
6043static int nfs4_xdr_dec_destroy_session(struct rpc_rqst *rqstp, uint32_t *p, 5954static int nfs4_xdr_dec_destroy_session(struct rpc_rqst *rqstp,
6044 void *dummy) 5955 struct xdr_stream *xdr,
5956 void *res)
6045{ 5957{
6046 struct xdr_stream xdr;
6047 struct compound_hdr hdr; 5958 struct compound_hdr hdr;
6048 int status; 5959 int status;
6049 5960
6050 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 5961 status = decode_compound_hdr(xdr, &hdr);
6051 status = decode_compound_hdr(&xdr, &hdr);
6052 if (!status) 5962 if (!status)
6053 status = decode_destroy_session(&xdr, dummy); 5963 status = decode_destroy_session(xdr, res);
6054 return status; 5964 return status;
6055} 5965}
6056 5966
6057/* 5967/*
6058 * Decode SEQUENCE response 5968 * Decode SEQUENCE response
6059 */ 5969 */
6060static int nfs4_xdr_dec_sequence(struct rpc_rqst *rqstp, uint32_t *p, 5970static int nfs4_xdr_dec_sequence(struct rpc_rqst *rqstp,
5971 struct xdr_stream *xdr,
6061 struct nfs4_sequence_res *res) 5972 struct nfs4_sequence_res *res)
6062{ 5973{
6063 struct xdr_stream xdr;
6064 struct compound_hdr hdr; 5974 struct compound_hdr hdr;
6065 int status; 5975 int status;
6066 5976
6067 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 5977 status = decode_compound_hdr(xdr, &hdr);
6068 status = decode_compound_hdr(&xdr, &hdr);
6069 if (!status) 5978 if (!status)
6070 status = decode_sequence(&xdr, res, rqstp); 5979 status = decode_sequence(xdr, res, rqstp);
6071 return status; 5980 return status;
6072} 5981}
6073 5982
6074/* 5983/*
6075 * Decode GET_LEASE_TIME response 5984 * Decode GET_LEASE_TIME response
6076 */ 5985 */
6077static int nfs4_xdr_dec_get_lease_time(struct rpc_rqst *rqstp, uint32_t *p, 5986static int nfs4_xdr_dec_get_lease_time(struct rpc_rqst *rqstp,
5987 struct xdr_stream *xdr,
6078 struct nfs4_get_lease_time_res *res) 5988 struct nfs4_get_lease_time_res *res)
6079{ 5989{
6080 struct xdr_stream xdr;
6081 struct compound_hdr hdr; 5990 struct compound_hdr hdr;
6082 int status; 5991 int status;
6083 5992
6084 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 5993 status = decode_compound_hdr(xdr, &hdr);
6085 status = decode_compound_hdr(&xdr, &hdr);
6086 if (!status) 5994 if (!status)
6087 status = decode_sequence(&xdr, &res->lr_seq_res, rqstp); 5995 status = decode_sequence(xdr, &res->lr_seq_res, rqstp);
6088 if (!status) 5996 if (!status)
6089 status = decode_putrootfh(&xdr); 5997 status = decode_putrootfh(xdr);
6090 if (!status) 5998 if (!status)
6091 status = decode_fsinfo(&xdr, res->lr_fsinfo); 5999 status = decode_fsinfo(xdr, res->lr_fsinfo);
6092 return status; 6000 return status;
6093} 6001}
6094 6002
6095/* 6003/*
6096 * Decode RECLAIM_COMPLETE response 6004 * Decode RECLAIM_COMPLETE response
6097 */ 6005 */
6098static int nfs4_xdr_dec_reclaim_complete(struct rpc_rqst *rqstp, uint32_t *p, 6006static int nfs4_xdr_dec_reclaim_complete(struct rpc_rqst *rqstp,
6007 struct xdr_stream *xdr,
6099 struct nfs41_reclaim_complete_res *res) 6008 struct nfs41_reclaim_complete_res *res)
6100{ 6009{
6101 struct xdr_stream xdr;
6102 struct compound_hdr hdr; 6010 struct compound_hdr hdr;
6103 int status; 6011 int status;
6104 6012
6105 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 6013 status = decode_compound_hdr(xdr, &hdr);
6106 status = decode_compound_hdr(&xdr, &hdr);
6107 if (!status) 6014 if (!status)
6108 status = decode_sequence(&xdr, &res->seq_res, rqstp); 6015 status = decode_sequence(xdr, &res->seq_res, rqstp);
6109 if (!status) 6016 if (!status)
6110 status = decode_reclaim_complete(&xdr, (void *)NULL); 6017 status = decode_reclaim_complete(xdr, (void *)NULL);
6111 return status; 6018 return status;
6112} 6019}
6113 6020
6114/* 6021/*
6115 * Decode GETDEVINFO response 6022 * Decode GETDEVINFO response
6116 */ 6023 */
6117static int nfs4_xdr_dec_getdeviceinfo(struct rpc_rqst *rqstp, uint32_t *p, 6024static int nfs4_xdr_dec_getdeviceinfo(struct rpc_rqst *rqstp,
6025 struct xdr_stream *xdr,
6118 struct nfs4_getdeviceinfo_res *res) 6026 struct nfs4_getdeviceinfo_res *res)
6119{ 6027{
6120 struct xdr_stream xdr;
6121 struct compound_hdr hdr; 6028 struct compound_hdr hdr;
6122 int status; 6029 int status;
6123 6030
6124 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 6031 status = decode_compound_hdr(xdr, &hdr);
6125 status = decode_compound_hdr(&xdr, &hdr);
6126 if (status != 0) 6032 if (status != 0)
6127 goto out; 6033 goto out;
6128 status = decode_sequence(&xdr, &res->seq_res, rqstp); 6034 status = decode_sequence(xdr, &res->seq_res, rqstp);
6129 if (status != 0) 6035 if (status != 0)
6130 goto out; 6036 goto out;
6131 status = decode_getdeviceinfo(&xdr, res->pdev); 6037 status = decode_getdeviceinfo(xdr, res->pdev);
6132out: 6038out:
6133 return status; 6039 return status;
6134} 6040}
@@ -6136,45 +6042,58 @@ out:
6136/* 6042/*
6137 * Decode LAYOUTGET response 6043 * Decode LAYOUTGET response
6138 */ 6044 */
6139static int nfs4_xdr_dec_layoutget(struct rpc_rqst *rqstp, uint32_t *p, 6045static int nfs4_xdr_dec_layoutget(struct rpc_rqst *rqstp,
6046 struct xdr_stream *xdr,
6140 struct nfs4_layoutget_res *res) 6047 struct nfs4_layoutget_res *res)
6141{ 6048{
6142 struct xdr_stream xdr;
6143 struct compound_hdr hdr; 6049 struct compound_hdr hdr;
6144 int status; 6050 int status;
6145 6051
6146 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 6052 status = decode_compound_hdr(xdr, &hdr);
6147 status = decode_compound_hdr(&xdr, &hdr);
6148 if (status) 6053 if (status)
6149 goto out; 6054 goto out;
6150 status = decode_sequence(&xdr, &res->seq_res, rqstp); 6055 status = decode_sequence(xdr, &res->seq_res, rqstp);
6151 if (status) 6056 if (status)
6152 goto out; 6057 goto out;
6153 status = decode_putfh(&xdr); 6058 status = decode_putfh(xdr);
6154 if (status) 6059 if (status)
6155 goto out; 6060 goto out;
6156 status = decode_layoutget(&xdr, rqstp, res); 6061 status = decode_layoutget(xdr, rqstp, res);
6157out: 6062out:
6158 return status; 6063 return status;
6159} 6064}
6160#endif /* CONFIG_NFS_V4_1 */ 6065#endif /* CONFIG_NFS_V4_1 */
6161 6066
6162__be32 *nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, 6067/**
6163 struct nfs_server *server, int plus) 6068 * nfs4_decode_dirent - Decode a single NFSv4 directory entry stored in
6069 * the local page cache.
6070 * @xdr: XDR stream where entry resides
6071 * @entry: buffer to fill in with entry data
6072 * @plus: boolean indicating whether this should be a readdirplus entry
6073 *
6074 * Returns zero if successful, otherwise a negative errno value is
6075 * returned.
6076 *
6077 * This function is not invoked during READDIR reply decoding, but
6078 * rather whenever an application invokes the getdents(2) system call
6079 * on a directory already in our cache.
6080 */
6081int nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
6082 int plus)
6164{ 6083{
6165 uint32_t bitmap[2] = {0}; 6084 uint32_t bitmap[2] = {0};
6166 uint32_t len; 6085 uint32_t len;
6167 __be32 *p = xdr_inline_decode(xdr, 4); 6086 __be32 *p = xdr_inline_decode(xdr, 4);
6168 if (unlikely(!p)) 6087 if (unlikely(!p))
6169 goto out_overflow; 6088 goto out_overflow;
6170 if (!ntohl(*p++)) { 6089 if (*p == xdr_zero) {
6171 p = xdr_inline_decode(xdr, 4); 6090 p = xdr_inline_decode(xdr, 4);
6172 if (unlikely(!p)) 6091 if (unlikely(!p))
6173 goto out_overflow; 6092 goto out_overflow;
6174 if (!ntohl(*p++)) 6093 if (*p == xdr_zero)
6175 return ERR_PTR(-EAGAIN); 6094 return -EAGAIN;
6176 entry->eof = 1; 6095 entry->eof = 1;
6177 return ERR_PTR(-EBADCOOKIE); 6096 return -EBADCOOKIE;
6178 } 6097 }
6179 6098
6180 p = xdr_inline_decode(xdr, 12); 6099 p = xdr_inline_decode(xdr, 12);
@@ -6182,7 +6101,7 @@ __be32 *nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
6182 goto out_overflow; 6101 goto out_overflow;
6183 entry->prev_cookie = entry->cookie; 6102 entry->prev_cookie = entry->cookie;
6184 p = xdr_decode_hyper(p, &entry->cookie); 6103 p = xdr_decode_hyper(p, &entry->cookie);
6185 entry->len = ntohl(*p++); 6104 entry->len = be32_to_cpup(p);
6186 6105
6187 p = xdr_inline_decode(xdr, entry->len); 6106 p = xdr_inline_decode(xdr, entry->len);
6188 if (unlikely(!p)) 6107 if (unlikely(!p))
@@ -6203,25 +6122,21 @@ __be32 *nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
6203 if (decode_attr_length(xdr, &len, &p) < 0) 6122 if (decode_attr_length(xdr, &len, &p) < 0)
6204 goto out_overflow; 6123 goto out_overflow;
6205 6124
6206 if (decode_getfattr_attrs(xdr, bitmap, entry->fattr, entry->fh, server, 1) < 0) 6125 if (decode_getfattr_attrs(xdr, bitmap, entry->fattr, entry->fh,
6126 entry->server, 1) < 0)
6207 goto out_overflow; 6127 goto out_overflow;
6208 if (entry->fattr->valid & NFS_ATTR_FATTR_FILEID) 6128 if (entry->fattr->valid & NFS_ATTR_FATTR_FILEID)
6209 entry->ino = entry->fattr->fileid; 6129 entry->ino = entry->fattr->fileid;
6210 6130
6211 if (verify_attr_len(xdr, p, len) < 0) 6131 entry->d_type = DT_UNKNOWN;
6212 goto out_overflow; 6132 if (entry->fattr->valid & NFS_ATTR_FATTR_TYPE)
6213 6133 entry->d_type = nfs_umode_to_dtype(entry->fattr->mode);
6214 p = xdr_inline_peek(xdr, 8);
6215 if (p != NULL)
6216 entry->eof = !p[0] && p[1];
6217 else
6218 entry->eof = 0;
6219 6134
6220 return p; 6135 return 0;
6221 6136
6222out_overflow: 6137out_overflow:
6223 print_overflow_msg(__func__, xdr); 6138 print_overflow_msg(__func__, xdr);
6224 return ERR_PTR(-EIO); 6139 return -EAGAIN;
6225} 6140}
6226 6141
6227/* 6142/*
@@ -6297,8 +6212,8 @@ nfs4_stat_to_errno(int stat)
6297#define PROC(proc, argtype, restype) \ 6212#define PROC(proc, argtype, restype) \
6298[NFSPROC4_CLNT_##proc] = { \ 6213[NFSPROC4_CLNT_##proc] = { \
6299 .p_proc = NFSPROC4_COMPOUND, \ 6214 .p_proc = NFSPROC4_COMPOUND, \
6300 .p_encode = (kxdrproc_t) nfs4_xdr_##argtype, \ 6215 .p_encode = (kxdreproc_t)nfs4_xdr_##argtype, \
6301 .p_decode = (kxdrproc_t) nfs4_xdr_##restype, \ 6216 .p_decode = (kxdrdproc_t)nfs4_xdr_##restype, \
6302 .p_arglen = NFS4_##argtype##_sz, \ 6217 .p_arglen = NFS4_##argtype##_sz, \
6303 .p_replen = NFS4_##restype##_sz, \ 6218 .p_replen = NFS4_##restype##_sz, \
6304 .p_statidx = NFSPROC4_CLNT_##proc, \ 6219 .p_statidx = NFSPROC4_CLNT_##proc, \
@@ -6306,50 +6221,50 @@ nfs4_stat_to_errno(int stat)
6306} 6221}
6307 6222
6308struct rpc_procinfo nfs4_procedures[] = { 6223struct rpc_procinfo nfs4_procedures[] = {
6309 PROC(READ, enc_read, dec_read), 6224 PROC(READ, enc_read, dec_read),
6310 PROC(WRITE, enc_write, dec_write), 6225 PROC(WRITE, enc_write, dec_write),
6311 PROC(COMMIT, enc_commit, dec_commit), 6226 PROC(COMMIT, enc_commit, dec_commit),
6312 PROC(OPEN, enc_open, dec_open), 6227 PROC(OPEN, enc_open, dec_open),
6313 PROC(OPEN_CONFIRM, enc_open_confirm, dec_open_confirm), 6228 PROC(OPEN_CONFIRM, enc_open_confirm, dec_open_confirm),
6314 PROC(OPEN_NOATTR, enc_open_noattr, dec_open_noattr), 6229 PROC(OPEN_NOATTR, enc_open_noattr, dec_open_noattr),
6315 PROC(OPEN_DOWNGRADE, enc_open_downgrade, dec_open_downgrade), 6230 PROC(OPEN_DOWNGRADE, enc_open_downgrade, dec_open_downgrade),
6316 PROC(CLOSE, enc_close, dec_close), 6231 PROC(CLOSE, enc_close, dec_close),
6317 PROC(SETATTR, enc_setattr, dec_setattr), 6232 PROC(SETATTR, enc_setattr, dec_setattr),
6318 PROC(FSINFO, enc_fsinfo, dec_fsinfo), 6233 PROC(FSINFO, enc_fsinfo, dec_fsinfo),
6319 PROC(RENEW, enc_renew, dec_renew), 6234 PROC(RENEW, enc_renew, dec_renew),
6320 PROC(SETCLIENTID, enc_setclientid, dec_setclientid), 6235 PROC(SETCLIENTID, enc_setclientid, dec_setclientid),
6321 PROC(SETCLIENTID_CONFIRM, enc_setclientid_confirm, dec_setclientid_confirm), 6236 PROC(SETCLIENTID_CONFIRM, enc_setclientid_confirm, dec_setclientid_confirm),
6322 PROC(LOCK, enc_lock, dec_lock), 6237 PROC(LOCK, enc_lock, dec_lock),
6323 PROC(LOCKT, enc_lockt, dec_lockt), 6238 PROC(LOCKT, enc_lockt, dec_lockt),
6324 PROC(LOCKU, enc_locku, dec_locku), 6239 PROC(LOCKU, enc_locku, dec_locku),
6325 PROC(ACCESS, enc_access, dec_access), 6240 PROC(ACCESS, enc_access, dec_access),
6326 PROC(GETATTR, enc_getattr, dec_getattr), 6241 PROC(GETATTR, enc_getattr, dec_getattr),
6327 PROC(LOOKUP, enc_lookup, dec_lookup), 6242 PROC(LOOKUP, enc_lookup, dec_lookup),
6328 PROC(LOOKUP_ROOT, enc_lookup_root, dec_lookup_root), 6243 PROC(LOOKUP_ROOT, enc_lookup_root, dec_lookup_root),
6329 PROC(REMOVE, enc_remove, dec_remove), 6244 PROC(REMOVE, enc_remove, dec_remove),
6330 PROC(RENAME, enc_rename, dec_rename), 6245 PROC(RENAME, enc_rename, dec_rename),
6331 PROC(LINK, enc_link, dec_link), 6246 PROC(LINK, enc_link, dec_link),
6332 PROC(SYMLINK, enc_symlink, dec_symlink), 6247 PROC(SYMLINK, enc_symlink, dec_symlink),
6333 PROC(CREATE, enc_create, dec_create), 6248 PROC(CREATE, enc_create, dec_create),
6334 PROC(PATHCONF, enc_pathconf, dec_pathconf), 6249 PROC(PATHCONF, enc_pathconf, dec_pathconf),
6335 PROC(STATFS, enc_statfs, dec_statfs), 6250 PROC(STATFS, enc_statfs, dec_statfs),
6336 PROC(READLINK, enc_readlink, dec_readlink), 6251 PROC(READLINK, enc_readlink, dec_readlink),
6337 PROC(READDIR, enc_readdir, dec_readdir), 6252 PROC(READDIR, enc_readdir, dec_readdir),
6338 PROC(SERVER_CAPS, enc_server_caps, dec_server_caps), 6253 PROC(SERVER_CAPS, enc_server_caps, dec_server_caps),
6339 PROC(DELEGRETURN, enc_delegreturn, dec_delegreturn), 6254 PROC(DELEGRETURN, enc_delegreturn, dec_delegreturn),
6340 PROC(GETACL, enc_getacl, dec_getacl), 6255 PROC(GETACL, enc_getacl, dec_getacl),
6341 PROC(SETACL, enc_setacl, dec_setacl), 6256 PROC(SETACL, enc_setacl, dec_setacl),
6342 PROC(FS_LOCATIONS, enc_fs_locations, dec_fs_locations), 6257 PROC(FS_LOCATIONS, enc_fs_locations, dec_fs_locations),
6343 PROC(RELEASE_LOCKOWNER, enc_release_lockowner, dec_release_lockowner), 6258 PROC(RELEASE_LOCKOWNER, enc_release_lockowner, dec_release_lockowner),
6344#if defined(CONFIG_NFS_V4_1) 6259#if defined(CONFIG_NFS_V4_1)
6345 PROC(EXCHANGE_ID, enc_exchange_id, dec_exchange_id), 6260 PROC(EXCHANGE_ID, enc_exchange_id, dec_exchange_id),
6346 PROC(CREATE_SESSION, enc_create_session, dec_create_session), 6261 PROC(CREATE_SESSION, enc_create_session, dec_create_session),
6347 PROC(DESTROY_SESSION, enc_destroy_session, dec_destroy_session), 6262 PROC(DESTROY_SESSION, enc_destroy_session, dec_destroy_session),
6348 PROC(SEQUENCE, enc_sequence, dec_sequence), 6263 PROC(SEQUENCE, enc_sequence, dec_sequence),
6349 PROC(GET_LEASE_TIME, enc_get_lease_time, dec_get_lease_time), 6264 PROC(GET_LEASE_TIME, enc_get_lease_time, dec_get_lease_time),
6350 PROC(RECLAIM_COMPLETE, enc_reclaim_complete, dec_reclaim_complete), 6265 PROC(RECLAIM_COMPLETE, enc_reclaim_complete, dec_reclaim_complete),
6351 PROC(GETDEVICEINFO, enc_getdeviceinfo, dec_getdeviceinfo), 6266 PROC(GETDEVICEINFO, enc_getdeviceinfo, dec_getdeviceinfo),
6352 PROC(LAYOUTGET, enc_layoutget, dec_layoutget), 6267 PROC(LAYOUTGET, enc_layoutget, dec_layoutget),
6353#endif /* CONFIG_NFS_V4_1 */ 6268#endif /* CONFIG_NFS_V4_1 */
6354}; 6269};
6355 6270
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 137b549e63d..e1164e3f9e6 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -26,12 +26,9 @@ static struct kmem_cache *nfs_page_cachep;
26static inline struct nfs_page * 26static inline struct nfs_page *
27nfs_page_alloc(void) 27nfs_page_alloc(void)
28{ 28{
29 struct nfs_page *p; 29 struct nfs_page *p = kmem_cache_zalloc(nfs_page_cachep, GFP_KERNEL);
30 p = kmem_cache_alloc(nfs_page_cachep, GFP_KERNEL); 30 if (p)
31 if (p) {
32 memset(p, 0, sizeof(*p));
33 INIT_LIST_HEAD(&p->wb_list); 31 INIT_LIST_HEAD(&p->wb_list);
34 }
35 return p; 32 return p;
36} 33}
37 34
@@ -115,7 +112,7 @@ int nfs_set_page_tag_locked(struct nfs_page *req)
115{ 112{
116 if (!nfs_lock_request_dontget(req)) 113 if (!nfs_lock_request_dontget(req))
117 return 0; 114 return 0;
118 if (req->wb_page != NULL) 115 if (test_bit(PG_MAPPED, &req->wb_flags))
119 radix_tree_tag_set(&NFS_I(req->wb_context->path.dentry->d_inode)->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_LOCKED); 116 radix_tree_tag_set(&NFS_I(req->wb_context->path.dentry->d_inode)->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_LOCKED);
120 return 1; 117 return 1;
121} 118}
@@ -125,7 +122,7 @@ int nfs_set_page_tag_locked(struct nfs_page *req)
125 */ 122 */
126void nfs_clear_page_tag_locked(struct nfs_page *req) 123void nfs_clear_page_tag_locked(struct nfs_page *req)
127{ 124{
128 if (req->wb_page != NULL) { 125 if (test_bit(PG_MAPPED, &req->wb_flags)) {
129 struct inode *inode = req->wb_context->path.dentry->d_inode; 126 struct inode *inode = req->wb_context->path.dentry->d_inode;
130 struct nfs_inode *nfsi = NFS_I(inode); 127 struct nfs_inode *nfsi = NFS_I(inode);
131 128
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index db773428f95..1b1bc1a0fb0 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -177,105 +177,149 @@ EXPORT_SYMBOL_GPL(pnfs_unregister_layoutdriver);
177 * pNFS client layout cache 177 * pNFS client layout cache
178 */ 178 */
179 179
180/* Need to hold i_lock if caller does not already hold reference */
181void
182get_layout_hdr(struct pnfs_layout_hdr *lo)
183{
184 atomic_inc(&lo->plh_refcount);
185}
186
180static void 187static void
181get_layout_hdr_locked(struct pnfs_layout_hdr *lo) 188destroy_layout_hdr(struct pnfs_layout_hdr *lo)
182{ 189{
183 assert_spin_locked(&lo->inode->i_lock); 190 dprintk("%s: freeing layout cache %p\n", __func__, lo);
184 lo->refcount++; 191 BUG_ON(!list_empty(&lo->plh_layouts));
192 NFS_I(lo->plh_inode)->layout = NULL;
193 kfree(lo);
185} 194}
186 195
187static void 196static void
188put_layout_hdr_locked(struct pnfs_layout_hdr *lo) 197put_layout_hdr_locked(struct pnfs_layout_hdr *lo)
189{ 198{
190 assert_spin_locked(&lo->inode->i_lock); 199 if (atomic_dec_and_test(&lo->plh_refcount))
191 BUG_ON(lo->refcount == 0); 200 destroy_layout_hdr(lo);
192
193 lo->refcount--;
194 if (!lo->refcount) {
195 dprintk("%s: freeing layout cache %p\n", __func__, lo);
196 BUG_ON(!list_empty(&lo->layouts));
197 NFS_I(lo->inode)->layout = NULL;
198 kfree(lo);
199 }
200} 201}
201 202
202void 203void
203put_layout_hdr(struct inode *inode) 204put_layout_hdr(struct pnfs_layout_hdr *lo)
204{ 205{
205 spin_lock(&inode->i_lock); 206 struct inode *inode = lo->plh_inode;
206 put_layout_hdr_locked(NFS_I(inode)->layout); 207
207 spin_unlock(&inode->i_lock); 208 if (atomic_dec_and_lock(&lo->plh_refcount, &inode->i_lock)) {
209 destroy_layout_hdr(lo);
210 spin_unlock(&inode->i_lock);
211 }
208} 212}
209 213
210static void 214static void
211init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg) 215init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg)
212{ 216{
213 INIT_LIST_HEAD(&lseg->fi_list); 217 INIT_LIST_HEAD(&lseg->pls_list);
214 kref_init(&lseg->kref); 218 atomic_set(&lseg->pls_refcount, 1);
215 lseg->layout = lo; 219 smp_mb();
220 set_bit(NFS_LSEG_VALID, &lseg->pls_flags);
221 lseg->pls_layout = lo;
216} 222}
217 223
218/* Called without i_lock held, as the free_lseg call may sleep */ 224static void free_lseg(struct pnfs_layout_segment *lseg)
219static void
220destroy_lseg(struct kref *kref)
221{ 225{
222 struct pnfs_layout_segment *lseg = 226 struct inode *ino = lseg->pls_layout->plh_inode;
223 container_of(kref, struct pnfs_layout_segment, kref);
224 struct inode *ino = lseg->layout->inode;
225 227
226 dprintk("--> %s\n", __func__);
227 NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg); 228 NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg);
228 /* Matched by get_layout_hdr_locked in pnfs_insert_layout */ 229 /* Matched by get_layout_hdr in pnfs_insert_layout */
229 put_layout_hdr(ino); 230 put_layout_hdr(NFS_I(ino)->layout);
230} 231}
231 232
232static void 233/* The use of tmp_list is necessary because pnfs_curr_ld->free_lseg
233put_lseg(struct pnfs_layout_segment *lseg) 234 * could sleep, so must be called outside of the lock.
235 * Returns 1 if object was removed, otherwise return 0.
236 */
237static int
238put_lseg_locked(struct pnfs_layout_segment *lseg,
239 struct list_head *tmp_list)
240{
241 dprintk("%s: lseg %p ref %d valid %d\n", __func__, lseg,
242 atomic_read(&lseg->pls_refcount),
243 test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
244 if (atomic_dec_and_test(&lseg->pls_refcount)) {
245 struct inode *ino = lseg->pls_layout->plh_inode;
246
247 BUG_ON(test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
248 list_del(&lseg->pls_list);
249 if (list_empty(&lseg->pls_layout->plh_segs)) {
250 struct nfs_client *clp;
251
252 clp = NFS_SERVER(ino)->nfs_client;
253 spin_lock(&clp->cl_lock);
254 /* List does not take a reference, so no need for put here */
255 list_del_init(&lseg->pls_layout->plh_layouts);
256 spin_unlock(&clp->cl_lock);
257 clear_bit(NFS_LAYOUT_BULK_RECALL, &lseg->pls_layout->plh_flags);
258 }
259 rpc_wake_up(&NFS_SERVER(ino)->roc_rpcwaitq);
260 list_add(&lseg->pls_list, tmp_list);
261 return 1;
262 }
263 return 0;
264}
265
266static bool
267should_free_lseg(u32 lseg_iomode, u32 recall_iomode)
234{ 268{
235 if (!lseg) 269 return (recall_iomode == IOMODE_ANY ||
236 return; 270 lseg_iomode == recall_iomode);
271}
237 272
238 dprintk("%s: lseg %p ref %d\n", __func__, lseg, 273/* Returns 1 if lseg is removed from list, 0 otherwise */
239 atomic_read(&lseg->kref.refcount)); 274static int mark_lseg_invalid(struct pnfs_layout_segment *lseg,
240 kref_put(&lseg->kref, destroy_lseg); 275 struct list_head *tmp_list)
276{
277 int rv = 0;
278
279 if (test_and_clear_bit(NFS_LSEG_VALID, &lseg->pls_flags)) {
280 /* Remove the reference keeping the lseg in the
281 * list. It will now be removed when all
282 * outstanding io is finished.
283 */
284 rv = put_lseg_locked(lseg, tmp_list);
285 }
286 return rv;
241} 287}
242 288
243static void 289/* Returns count of number of matching invalid lsegs remaining in list
244pnfs_clear_lseg_list(struct pnfs_layout_hdr *lo, struct list_head *tmp_list) 290 * after call.
291 */
292int
293mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
294 struct list_head *tmp_list,
295 u32 iomode)
245{ 296{
246 struct pnfs_layout_segment *lseg, *next; 297 struct pnfs_layout_segment *lseg, *next;
247 struct nfs_client *clp; 298 int invalid = 0, removed = 0;
248 299
249 dprintk("%s:Begin lo %p\n", __func__, lo); 300 dprintk("%s:Begin lo %p\n", __func__, lo);
250 301
251 assert_spin_locked(&lo->inode->i_lock); 302 list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list)
252 list_for_each_entry_safe(lseg, next, &lo->segs, fi_list) { 303 if (should_free_lseg(lseg->pls_range.iomode, iomode)) {
253 dprintk("%s: freeing lseg %p\n", __func__, lseg); 304 dprintk("%s: freeing lseg %p iomode %d "
254 list_move(&lseg->fi_list, tmp_list); 305 "offset %llu length %llu\n", __func__,
255 } 306 lseg, lseg->pls_range.iomode, lseg->pls_range.offset,
256 clp = NFS_SERVER(lo->inode)->nfs_client; 307 lseg->pls_range.length);
257 spin_lock(&clp->cl_lock); 308 invalid++;
258 /* List does not take a reference, so no need for put here */ 309 removed += mark_lseg_invalid(lseg, tmp_list);
259 list_del_init(&lo->layouts); 310 }
260 spin_unlock(&clp->cl_lock); 311 dprintk("%s:Return %i\n", __func__, invalid - removed);
261 write_seqlock(&lo->seqlock); 312 return invalid - removed;
262 clear_bit(NFS_LAYOUT_STATEID_SET, &lo->state);
263 write_sequnlock(&lo->seqlock);
264
265 dprintk("%s:Return\n", __func__);
266} 313}
267 314
268static void 315void
269pnfs_free_lseg_list(struct list_head *tmp_list) 316pnfs_free_lseg_list(struct list_head *free_me)
270{ 317{
271 struct pnfs_layout_segment *lseg; 318 struct pnfs_layout_segment *lseg, *tmp;
272 319
273 while (!list_empty(tmp_list)) { 320 list_for_each_entry_safe(lseg, tmp, free_me, pls_list) {
274 lseg = list_entry(tmp_list->next, struct pnfs_layout_segment, 321 list_del(&lseg->pls_list);
275 fi_list); 322 free_lseg(lseg);
276 dprintk("%s calling put_lseg on %p\n", __func__, lseg);
277 list_del(&lseg->fi_list);
278 put_lseg(lseg);
279 } 323 }
280} 324}
281 325
@@ -288,7 +332,8 @@ pnfs_destroy_layout(struct nfs_inode *nfsi)
288 spin_lock(&nfsi->vfs_inode.i_lock); 332 spin_lock(&nfsi->vfs_inode.i_lock);
289 lo = nfsi->layout; 333 lo = nfsi->layout;
290 if (lo) { 334 if (lo) {
291 pnfs_clear_lseg_list(lo, &tmp_list); 335 set_bit(NFS_LAYOUT_DESTROYED, &nfsi->layout->plh_flags);
336 mark_matching_lsegs_invalid(lo, &tmp_list, IOMODE_ANY);
292 /* Matched by refcount set to 1 in alloc_init_layout_hdr */ 337 /* Matched by refcount set to 1 in alloc_init_layout_hdr */
293 put_layout_hdr_locked(lo); 338 put_layout_hdr_locked(lo);
294 } 339 }
@@ -312,76 +357,80 @@ pnfs_destroy_all_layouts(struct nfs_client *clp)
312 357
313 while (!list_empty(&tmp_list)) { 358 while (!list_empty(&tmp_list)) {
314 lo = list_entry(tmp_list.next, struct pnfs_layout_hdr, 359 lo = list_entry(tmp_list.next, struct pnfs_layout_hdr,
315 layouts); 360 plh_layouts);
316 dprintk("%s freeing layout for inode %lu\n", __func__, 361 dprintk("%s freeing layout for inode %lu\n", __func__,
317 lo->inode->i_ino); 362 lo->plh_inode->i_ino);
318 pnfs_destroy_layout(NFS_I(lo->inode)); 363 pnfs_destroy_layout(NFS_I(lo->plh_inode));
319 } 364 }
320} 365}
321 366
322/* update lo->stateid with new if is more recent 367/* update lo->plh_stateid with new if is more recent */
323 * 368void
324 * lo->stateid could be the open stateid, in which case we just use what given. 369pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new,
325 */ 370 bool update_barrier)
326static void 371{
327pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, 372 u32 oldseq, newseq;
328 const nfs4_stateid *new) 373
329{ 374 oldseq = be32_to_cpu(lo->plh_stateid.stateid.seqid);
330 nfs4_stateid *old = &lo->stateid; 375 newseq = be32_to_cpu(new->stateid.seqid);
331 bool overwrite = false; 376 if ((int)(newseq - oldseq) > 0) {
332 377 memcpy(&lo->plh_stateid, &new->stateid, sizeof(new->stateid));
333 write_seqlock(&lo->seqlock); 378 if (update_barrier) {
334 if (!test_bit(NFS_LAYOUT_STATEID_SET, &lo->state) || 379 u32 new_barrier = be32_to_cpu(new->stateid.seqid);
335 memcmp(old->stateid.other, new->stateid.other, sizeof(new->stateid.other))) 380
336 overwrite = true; 381 if ((int)(new_barrier - lo->plh_barrier))
337 else { 382 lo->plh_barrier = new_barrier;
338 u32 oldseq, newseq; 383 } else {
339 384 /* Because of wraparound, we want to keep the barrier
340 oldseq = be32_to_cpu(old->stateid.seqid); 385 * "close" to the current seqids. It needs to be
341 newseq = be32_to_cpu(new->stateid.seqid); 386 * within 2**31 to count as "behind", so if it
342 if ((int)(newseq - oldseq) > 0) 387 * gets too near that limit, give us a litle leeway
343 overwrite = true; 388 * and bring it to within 2**30.
389 * NOTE - and yes, this is all unsigned arithmetic.
390 */
391 if (unlikely((newseq - lo->plh_barrier) > (3 << 29)))
392 lo->plh_barrier = newseq - (1 << 30);
393 }
344 } 394 }
345 if (overwrite)
346 memcpy(&old->stateid, &new->stateid, sizeof(new->stateid));
347 write_sequnlock(&lo->seqlock);
348} 395}
349 396
350static void 397/* lget is set to 1 if called from inside send_layoutget call chain */
351pnfs_layout_from_open_stateid(struct pnfs_layout_hdr *lo, 398static bool
352 struct nfs4_state *state) 399pnfs_layoutgets_blocked(struct pnfs_layout_hdr *lo, nfs4_stateid *stateid,
400 int lget)
353{ 401{
354 int seq; 402 if ((stateid) &&
355 403 (int)(lo->plh_barrier - be32_to_cpu(stateid->stateid.seqid)) >= 0)
356 dprintk("--> %s\n", __func__); 404 return true;
357 write_seqlock(&lo->seqlock); 405 return lo->plh_block_lgets ||
358 do { 406 test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) ||
359 seq = read_seqbegin(&state->seqlock); 407 (list_empty(&lo->plh_segs) &&
360 memcpy(lo->stateid.data, state->stateid.data, 408 (atomic_read(&lo->plh_outstanding) > lget));
361 sizeof(state->stateid.data));
362 } while (read_seqretry(&state->seqlock, seq));
363 set_bit(NFS_LAYOUT_STATEID_SET, &lo->state);
364 write_sequnlock(&lo->seqlock);
365 dprintk("<-- %s\n", __func__);
366} 409}
367 410
368void 411int
369pnfs_get_layout_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo, 412pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
370 struct nfs4_state *open_state) 413 struct nfs4_state *open_state)
371{ 414{
372 int seq; 415 int status = 0;
373 416
374 dprintk("--> %s\n", __func__); 417 dprintk("--> %s\n", __func__);
375 do { 418 spin_lock(&lo->plh_inode->i_lock);
376 seq = read_seqbegin(&lo->seqlock); 419 if (pnfs_layoutgets_blocked(lo, NULL, 1)) {
377 if (!test_bit(NFS_LAYOUT_STATEID_SET, &lo->state)) { 420 status = -EAGAIN;
378 /* This will trigger retry of the read */ 421 } else if (list_empty(&lo->plh_segs)) {
379 pnfs_layout_from_open_stateid(lo, open_state); 422 int seq;
380 } else 423
381 memcpy(dst->data, lo->stateid.data, 424 do {
382 sizeof(lo->stateid.data)); 425 seq = read_seqbegin(&open_state->seqlock);
383 } while (read_seqretry(&lo->seqlock, seq)); 426 memcpy(dst->data, open_state->stateid.data,
427 sizeof(open_state->stateid.data));
428 } while (read_seqretry(&open_state->seqlock, seq));
429 } else
430 memcpy(dst->data, lo->plh_stateid.data, sizeof(lo->plh_stateid.data));
431 spin_unlock(&lo->plh_inode->i_lock);
384 dprintk("<-- %s\n", __func__); 432 dprintk("<-- %s\n", __func__);
433 return status;
385} 434}
386 435
387/* 436/*
@@ -395,7 +444,7 @@ send_layoutget(struct pnfs_layout_hdr *lo,
395 struct nfs_open_context *ctx, 444 struct nfs_open_context *ctx,
396 u32 iomode) 445 u32 iomode)
397{ 446{
398 struct inode *ino = lo->inode; 447 struct inode *ino = lo->plh_inode;
399 struct nfs_server *server = NFS_SERVER(ino); 448 struct nfs_server *server = NFS_SERVER(ino);
400 struct nfs4_layoutget *lgp; 449 struct nfs4_layoutget *lgp;
401 struct pnfs_layout_segment *lseg = NULL; 450 struct pnfs_layout_segment *lseg = NULL;
@@ -404,10 +453,8 @@ send_layoutget(struct pnfs_layout_hdr *lo,
404 453
405 BUG_ON(ctx == NULL); 454 BUG_ON(ctx == NULL);
406 lgp = kzalloc(sizeof(*lgp), GFP_KERNEL); 455 lgp = kzalloc(sizeof(*lgp), GFP_KERNEL);
407 if (lgp == NULL) { 456 if (lgp == NULL)
408 put_layout_hdr(lo->inode);
409 return NULL; 457 return NULL;
410 }
411 lgp->args.minlength = NFS4_MAX_UINT64; 458 lgp->args.minlength = NFS4_MAX_UINT64;
412 lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE; 459 lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE;
413 lgp->args.range.iomode = iomode; 460 lgp->args.range.iomode = iomode;
@@ -424,11 +471,88 @@ send_layoutget(struct pnfs_layout_hdr *lo,
424 nfs4_proc_layoutget(lgp); 471 nfs4_proc_layoutget(lgp);
425 if (!lseg) { 472 if (!lseg) {
426 /* remember that LAYOUTGET failed and suspend trying */ 473 /* remember that LAYOUTGET failed and suspend trying */
427 set_bit(lo_fail_bit(iomode), &lo->state); 474 set_bit(lo_fail_bit(iomode), &lo->plh_flags);
428 } 475 }
429 return lseg; 476 return lseg;
430} 477}
431 478
479bool pnfs_roc(struct inode *ino)
480{
481 struct pnfs_layout_hdr *lo;
482 struct pnfs_layout_segment *lseg, *tmp;
483 LIST_HEAD(tmp_list);
484 bool found = false;
485
486 spin_lock(&ino->i_lock);
487 lo = NFS_I(ino)->layout;
488 if (!lo || !test_and_clear_bit(NFS_LAYOUT_ROC, &lo->plh_flags) ||
489 test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags))
490 goto out_nolayout;
491 list_for_each_entry_safe(lseg, tmp, &lo->plh_segs, pls_list)
492 if (test_bit(NFS_LSEG_ROC, &lseg->pls_flags)) {
493 mark_lseg_invalid(lseg, &tmp_list);
494 found = true;
495 }
496 if (!found)
497 goto out_nolayout;
498 lo->plh_block_lgets++;
499 get_layout_hdr(lo); /* matched in pnfs_roc_release */
500 spin_unlock(&ino->i_lock);
501 pnfs_free_lseg_list(&tmp_list);
502 return true;
503
504out_nolayout:
505 spin_unlock(&ino->i_lock);
506 return false;
507}
508
509void pnfs_roc_release(struct inode *ino)
510{
511 struct pnfs_layout_hdr *lo;
512
513 spin_lock(&ino->i_lock);
514 lo = NFS_I(ino)->layout;
515 lo->plh_block_lgets--;
516 put_layout_hdr_locked(lo);
517 spin_unlock(&ino->i_lock);
518}
519
520void pnfs_roc_set_barrier(struct inode *ino, u32 barrier)
521{
522 struct pnfs_layout_hdr *lo;
523
524 spin_lock(&ino->i_lock);
525 lo = NFS_I(ino)->layout;
526 if ((int)(barrier - lo->plh_barrier) > 0)
527 lo->plh_barrier = barrier;
528 spin_unlock(&ino->i_lock);
529}
530
531bool pnfs_roc_drain(struct inode *ino, u32 *barrier)
532{
533 struct nfs_inode *nfsi = NFS_I(ino);
534 struct pnfs_layout_segment *lseg;
535 bool found = false;
536
537 spin_lock(&ino->i_lock);
538 list_for_each_entry(lseg, &nfsi->layout->plh_segs, pls_list)
539 if (test_bit(NFS_LSEG_ROC, &lseg->pls_flags)) {
540 found = true;
541 break;
542 }
543 if (!found) {
544 struct pnfs_layout_hdr *lo = nfsi->layout;
545 u32 current_seqid = be32_to_cpu(lo->plh_stateid.stateid.seqid);
546
547 /* Since close does not return a layout stateid for use as
548 * a barrier, we choose the worst-case barrier.
549 */
550 *barrier = current_seqid + atomic_read(&lo->plh_outstanding);
551 }
552 spin_unlock(&ino->i_lock);
553 return found;
554}
555
432/* 556/*
433 * Compare two layout segments for sorting into layout cache. 557 * Compare two layout segments for sorting into layout cache.
434 * We want to preferentially return RW over RO layouts, so ensure those 558 * We want to preferentially return RW over RO layouts, so ensure those
@@ -450,37 +574,29 @@ pnfs_insert_layout(struct pnfs_layout_hdr *lo,
450 574
451 dprintk("%s:Begin\n", __func__); 575 dprintk("%s:Begin\n", __func__);
452 576
453 assert_spin_locked(&lo->inode->i_lock); 577 assert_spin_locked(&lo->plh_inode->i_lock);
454 if (list_empty(&lo->segs)) { 578 list_for_each_entry(lp, &lo->plh_segs, pls_list) {
455 struct nfs_client *clp = NFS_SERVER(lo->inode)->nfs_client; 579 if (cmp_layout(lp->pls_range.iomode, lseg->pls_range.iomode) > 0)
456
457 spin_lock(&clp->cl_lock);
458 BUG_ON(!list_empty(&lo->layouts));
459 list_add_tail(&lo->layouts, &clp->cl_layouts);
460 spin_unlock(&clp->cl_lock);
461 }
462 list_for_each_entry(lp, &lo->segs, fi_list) {
463 if (cmp_layout(lp->range.iomode, lseg->range.iomode) > 0)
464 continue; 580 continue;
465 list_add_tail(&lseg->fi_list, &lp->fi_list); 581 list_add_tail(&lseg->pls_list, &lp->pls_list);
466 dprintk("%s: inserted lseg %p " 582 dprintk("%s: inserted lseg %p "
467 "iomode %d offset %llu length %llu before " 583 "iomode %d offset %llu length %llu before "
468 "lp %p iomode %d offset %llu length %llu\n", 584 "lp %p iomode %d offset %llu length %llu\n",
469 __func__, lseg, lseg->range.iomode, 585 __func__, lseg, lseg->pls_range.iomode,
470 lseg->range.offset, lseg->range.length, 586 lseg->pls_range.offset, lseg->pls_range.length,
471 lp, lp->range.iomode, lp->range.offset, 587 lp, lp->pls_range.iomode, lp->pls_range.offset,
472 lp->range.length); 588 lp->pls_range.length);
473 found = 1; 589 found = 1;
474 break; 590 break;
475 } 591 }
476 if (!found) { 592 if (!found) {
477 list_add_tail(&lseg->fi_list, &lo->segs); 593 list_add_tail(&lseg->pls_list, &lo->plh_segs);
478 dprintk("%s: inserted lseg %p " 594 dprintk("%s: inserted lseg %p "
479 "iomode %d offset %llu length %llu at tail\n", 595 "iomode %d offset %llu length %llu at tail\n",
480 __func__, lseg, lseg->range.iomode, 596 __func__, lseg, lseg->pls_range.iomode,
481 lseg->range.offset, lseg->range.length); 597 lseg->pls_range.offset, lseg->pls_range.length);
482 } 598 }
483 get_layout_hdr_locked(lo); 599 get_layout_hdr(lo);
484 600
485 dprintk("%s:Return\n", __func__); 601 dprintk("%s:Return\n", __func__);
486} 602}
@@ -493,11 +609,11 @@ alloc_init_layout_hdr(struct inode *ino)
493 lo = kzalloc(sizeof(struct pnfs_layout_hdr), GFP_KERNEL); 609 lo = kzalloc(sizeof(struct pnfs_layout_hdr), GFP_KERNEL);
494 if (!lo) 610 if (!lo)
495 return NULL; 611 return NULL;
496 lo->refcount = 1; 612 atomic_set(&lo->plh_refcount, 1);
497 INIT_LIST_HEAD(&lo->layouts); 613 INIT_LIST_HEAD(&lo->plh_layouts);
498 INIT_LIST_HEAD(&lo->segs); 614 INIT_LIST_HEAD(&lo->plh_segs);
499 seqlock_init(&lo->seqlock); 615 INIT_LIST_HEAD(&lo->plh_bulk_recall);
500 lo->inode = ino; 616 lo->plh_inode = ino;
501 return lo; 617 return lo;
502} 618}
503 619
@@ -510,9 +626,12 @@ pnfs_find_alloc_layout(struct inode *ino)
510 dprintk("%s Begin ino=%p layout=%p\n", __func__, ino, nfsi->layout); 626 dprintk("%s Begin ino=%p layout=%p\n", __func__, ino, nfsi->layout);
511 627
512 assert_spin_locked(&ino->i_lock); 628 assert_spin_locked(&ino->i_lock);
513 if (nfsi->layout) 629 if (nfsi->layout) {
514 return nfsi->layout; 630 if (test_bit(NFS_LAYOUT_DESTROYED, &nfsi->layout->plh_flags))
515 631 return NULL;
632 else
633 return nfsi->layout;
634 }
516 spin_unlock(&ino->i_lock); 635 spin_unlock(&ino->i_lock);
517 new = alloc_init_layout_hdr(ino); 636 new = alloc_init_layout_hdr(ino);
518 spin_lock(&ino->i_lock); 637 spin_lock(&ino->i_lock);
@@ -538,31 +657,32 @@ pnfs_find_alloc_layout(struct inode *ino)
538static int 657static int
539is_matching_lseg(struct pnfs_layout_segment *lseg, u32 iomode) 658is_matching_lseg(struct pnfs_layout_segment *lseg, u32 iomode)
540{ 659{
541 return (iomode != IOMODE_RW || lseg->range.iomode == IOMODE_RW); 660 return (iomode != IOMODE_RW || lseg->pls_range.iomode == IOMODE_RW);
542} 661}
543 662
544/* 663/*
545 * lookup range in layout 664 * lookup range in layout
546 */ 665 */
547static struct pnfs_layout_segment * 666static struct pnfs_layout_segment *
548pnfs_has_layout(struct pnfs_layout_hdr *lo, u32 iomode) 667pnfs_find_lseg(struct pnfs_layout_hdr *lo, u32 iomode)
549{ 668{
550 struct pnfs_layout_segment *lseg, *ret = NULL; 669 struct pnfs_layout_segment *lseg, *ret = NULL;
551 670
552 dprintk("%s:Begin\n", __func__); 671 dprintk("%s:Begin\n", __func__);
553 672
554 assert_spin_locked(&lo->inode->i_lock); 673 assert_spin_locked(&lo->plh_inode->i_lock);
555 list_for_each_entry(lseg, &lo->segs, fi_list) { 674 list_for_each_entry(lseg, &lo->plh_segs, pls_list) {
556 if (is_matching_lseg(lseg, iomode)) { 675 if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags) &&
676 is_matching_lseg(lseg, iomode)) {
557 ret = lseg; 677 ret = lseg;
558 break; 678 break;
559 } 679 }
560 if (cmp_layout(iomode, lseg->range.iomode) > 0) 680 if (cmp_layout(iomode, lseg->pls_range.iomode) > 0)
561 break; 681 break;
562 } 682 }
563 683
564 dprintk("%s:Return lseg %p ref %d\n", 684 dprintk("%s:Return lseg %p ref %d\n",
565 __func__, ret, ret ? atomic_read(&ret->kref.refcount) : 0); 685 __func__, ret, ret ? atomic_read(&ret->pls_refcount) : 0);
566 return ret; 686 return ret;
567} 687}
568 688
@@ -576,6 +696,7 @@ pnfs_update_layout(struct inode *ino,
576 enum pnfs_iomode iomode) 696 enum pnfs_iomode iomode)
577{ 697{
578 struct nfs_inode *nfsi = NFS_I(ino); 698 struct nfs_inode *nfsi = NFS_I(ino);
699 struct nfs_client *clp = NFS_SERVER(ino)->nfs_client;
579 struct pnfs_layout_hdr *lo; 700 struct pnfs_layout_hdr *lo;
580 struct pnfs_layout_segment *lseg = NULL; 701 struct pnfs_layout_segment *lseg = NULL;
581 702
@@ -588,25 +709,53 @@ pnfs_update_layout(struct inode *ino,
588 goto out_unlock; 709 goto out_unlock;
589 } 710 }
590 711
591 /* Check to see if the layout for the given range already exists */ 712 /* Do we even need to bother with this? */
592 lseg = pnfs_has_layout(lo, iomode); 713 if (test_bit(NFS4CLNT_LAYOUTRECALL, &clp->cl_state) ||
593 if (lseg) { 714 test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) {
594 dprintk("%s: Using cached lseg %p for iomode %d)\n", 715 dprintk("%s matches recall, use MDS\n", __func__);
595 __func__, lseg, iomode);
596 goto out_unlock; 716 goto out_unlock;
597 } 717 }
718 /* Check to see if the layout for the given range already exists */
719 lseg = pnfs_find_lseg(lo, iomode);
720 if (lseg)
721 goto out_unlock;
598 722
599 /* if LAYOUTGET already failed once we don't try again */ 723 /* if LAYOUTGET already failed once we don't try again */
600 if (test_bit(lo_fail_bit(iomode), &nfsi->layout->state)) 724 if (test_bit(lo_fail_bit(iomode), &nfsi->layout->plh_flags))
725 goto out_unlock;
726
727 if (pnfs_layoutgets_blocked(lo, NULL, 0))
601 goto out_unlock; 728 goto out_unlock;
729 atomic_inc(&lo->plh_outstanding);
602 730
603 get_layout_hdr_locked(lo); /* Matched in nfs4_layoutget_release */ 731 get_layout_hdr(lo);
732 if (list_empty(&lo->plh_segs)) {
733 /* The lo must be on the clp list if there is any
734 * chance of a CB_LAYOUTRECALL(FILE) coming in.
735 */
736 spin_lock(&clp->cl_lock);
737 BUG_ON(!list_empty(&lo->plh_layouts));
738 list_add_tail(&lo->plh_layouts, &clp->cl_layouts);
739 spin_unlock(&clp->cl_lock);
740 }
604 spin_unlock(&ino->i_lock); 741 spin_unlock(&ino->i_lock);
605 742
606 lseg = send_layoutget(lo, ctx, iomode); 743 lseg = send_layoutget(lo, ctx, iomode);
744 if (!lseg) {
745 spin_lock(&ino->i_lock);
746 if (list_empty(&lo->plh_segs)) {
747 spin_lock(&clp->cl_lock);
748 list_del_init(&lo->plh_layouts);
749 spin_unlock(&clp->cl_lock);
750 clear_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
751 }
752 spin_unlock(&ino->i_lock);
753 }
754 atomic_dec(&lo->plh_outstanding);
755 put_layout_hdr(lo);
607out: 756out:
608 dprintk("%s end, state 0x%lx lseg %p\n", __func__, 757 dprintk("%s end, state 0x%lx lseg %p\n", __func__,
609 nfsi->layout->state, lseg); 758 nfsi->layout->plh_flags, lseg);
610 return lseg; 759 return lseg;
611out_unlock: 760out_unlock:
612 spin_unlock(&ino->i_lock); 761 spin_unlock(&ino->i_lock);
@@ -619,9 +768,21 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
619 struct pnfs_layout_hdr *lo = NFS_I(lgp->args.inode)->layout; 768 struct pnfs_layout_hdr *lo = NFS_I(lgp->args.inode)->layout;
620 struct nfs4_layoutget_res *res = &lgp->res; 769 struct nfs4_layoutget_res *res = &lgp->res;
621 struct pnfs_layout_segment *lseg; 770 struct pnfs_layout_segment *lseg;
622 struct inode *ino = lo->inode; 771 struct inode *ino = lo->plh_inode;
772 struct nfs_client *clp = NFS_SERVER(ino)->nfs_client;
623 int status = 0; 773 int status = 0;
624 774
775 /* Verify we got what we asked for.
776 * Note that because the xdr parsing only accepts a single
777 * element array, this can fail even if the server is behaving
778 * correctly.
779 */
780 if (lgp->args.range.iomode > res->range.iomode ||
781 res->range.offset != 0 ||
782 res->range.length != NFS4_MAX_UINT64) {
783 status = -EINVAL;
784 goto out;
785 }
625 /* Inject layout blob into I/O device driver */ 786 /* Inject layout blob into I/O device driver */
626 lseg = NFS_SERVER(ino)->pnfs_curr_ld->alloc_lseg(lo, res); 787 lseg = NFS_SERVER(ino)->pnfs_curr_ld->alloc_lseg(lo, res);
627 if (!lseg || IS_ERR(lseg)) { 788 if (!lseg || IS_ERR(lseg)) {
@@ -635,16 +796,37 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
635 } 796 }
636 797
637 spin_lock(&ino->i_lock); 798 spin_lock(&ino->i_lock);
799 if (test_bit(NFS4CLNT_LAYOUTRECALL, &clp->cl_state) ||
800 test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) {
801 dprintk("%s forget reply due to recall\n", __func__);
802 goto out_forget_reply;
803 }
804
805 if (pnfs_layoutgets_blocked(lo, &res->stateid, 1)) {
806 dprintk("%s forget reply due to state\n", __func__);
807 goto out_forget_reply;
808 }
638 init_lseg(lo, lseg); 809 init_lseg(lo, lseg);
639 lseg->range = res->range; 810 lseg->pls_range = res->range;
640 *lgp->lsegpp = lseg; 811 *lgp->lsegpp = lseg;
641 pnfs_insert_layout(lo, lseg); 812 pnfs_insert_layout(lo, lseg);
642 813
814 if (res->return_on_close) {
815 set_bit(NFS_LSEG_ROC, &lseg->pls_flags);
816 set_bit(NFS_LAYOUT_ROC, &lo->plh_flags);
817 }
818
643 /* Done processing layoutget. Set the layout stateid */ 819 /* Done processing layoutget. Set the layout stateid */
644 pnfs_set_layout_stateid(lo, &res->stateid); 820 pnfs_set_layout_stateid(lo, &res->stateid, false);
645 spin_unlock(&ino->i_lock); 821 spin_unlock(&ino->i_lock);
646out: 822out:
647 return status; 823 return status;
824
825out_forget_reply:
826 spin_unlock(&ino->i_lock);
827 lseg->pls_layout = lo;
828 NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg);
829 goto out;
648} 830}
649 831
650/* 832/*
@@ -769,7 +951,7 @@ pnfs_put_deviceid_cache(struct nfs_client *clp)
769{ 951{
770 struct pnfs_deviceid_cache *local = clp->cl_devid_cache; 952 struct pnfs_deviceid_cache *local = clp->cl_devid_cache;
771 953
772 dprintk("--> %s cl_devid_cache %p\n", __func__, clp->cl_devid_cache); 954 dprintk("--> %s ({%d})\n", __func__, atomic_read(&local->dc_ref));
773 if (atomic_dec_and_lock(&local->dc_ref, &clp->cl_lock)) { 955 if (atomic_dec_and_lock(&local->dc_ref, &clp->cl_lock)) {
774 int i; 956 int i;
775 /* Verify cache is empty */ 957 /* Verify cache is empty */
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index e12367d5048..e2612ea0cbe 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -30,11 +30,17 @@
30#ifndef FS_NFS_PNFS_H 30#ifndef FS_NFS_PNFS_H
31#define FS_NFS_PNFS_H 31#define FS_NFS_PNFS_H
32 32
33enum {
34 NFS_LSEG_VALID = 0, /* cleared when lseg is recalled/returned */
35 NFS_LSEG_ROC, /* roc bit received from server */
36};
37
33struct pnfs_layout_segment { 38struct pnfs_layout_segment {
34 struct list_head fi_list; 39 struct list_head pls_list;
35 struct pnfs_layout_range range; 40 struct pnfs_layout_range pls_range;
36 struct kref kref; 41 atomic_t pls_refcount;
37 struct pnfs_layout_hdr *layout; 42 unsigned long pls_flags;
43 struct pnfs_layout_hdr *pls_layout;
38}; 44};
39 45
40#ifdef CONFIG_NFS_V4_1 46#ifdef CONFIG_NFS_V4_1
@@ -44,7 +50,9 @@ struct pnfs_layout_segment {
44enum { 50enum {
45 NFS_LAYOUT_RO_FAILED = 0, /* get ro layout failed stop trying */ 51 NFS_LAYOUT_RO_FAILED = 0, /* get ro layout failed stop trying */
46 NFS_LAYOUT_RW_FAILED, /* get rw layout failed stop trying */ 52 NFS_LAYOUT_RW_FAILED, /* get rw layout failed stop trying */
47 NFS_LAYOUT_STATEID_SET, /* have a valid layout stateid */ 53 NFS_LAYOUT_BULK_RECALL, /* bulk recall affecting layout */
54 NFS_LAYOUT_ROC, /* some lseg had roc bit set */
55 NFS_LAYOUT_DESTROYED, /* no new use of layout allowed */
48}; 56};
49 57
50/* Per-layout driver specific registration structure */ 58/* Per-layout driver specific registration structure */
@@ -60,13 +68,16 @@ struct pnfs_layoutdriver_type {
60}; 68};
61 69
62struct pnfs_layout_hdr { 70struct pnfs_layout_hdr {
63 unsigned long refcount; 71 atomic_t plh_refcount;
64 struct list_head layouts; /* other client layouts */ 72 struct list_head plh_layouts; /* other client layouts */
65 struct list_head segs; /* layout segments list */ 73 struct list_head plh_bulk_recall; /* clnt list of bulk recalls */
66 seqlock_t seqlock; /* Protects the stateid */ 74 struct list_head plh_segs; /* layout segments list */
67 nfs4_stateid stateid; 75 nfs4_stateid plh_stateid;
68 unsigned long state; 76 atomic_t plh_outstanding; /* number of RPCs out */
69 struct inode *inode; 77 unsigned long plh_block_lgets; /* block LAYOUTGET if >0 */
78 u32 plh_barrier; /* ignore lower seqids */
79 unsigned long plh_flags;
80 struct inode *plh_inode;
70}; 81};
71 82
72struct pnfs_device { 83struct pnfs_device {
@@ -134,17 +145,30 @@ extern int nfs4_proc_getdeviceinfo(struct nfs_server *server,
134extern int nfs4_proc_layoutget(struct nfs4_layoutget *lgp); 145extern int nfs4_proc_layoutget(struct nfs4_layoutget *lgp);
135 146
136/* pnfs.c */ 147/* pnfs.c */
148void get_layout_hdr(struct pnfs_layout_hdr *lo);
137struct pnfs_layout_segment * 149struct pnfs_layout_segment *
138pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx, 150pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx,
139 enum pnfs_iomode access_type); 151 enum pnfs_iomode access_type);
140void set_pnfs_layoutdriver(struct nfs_server *, u32 id); 152void set_pnfs_layoutdriver(struct nfs_server *, u32 id);
141void unset_pnfs_layoutdriver(struct nfs_server *); 153void unset_pnfs_layoutdriver(struct nfs_server *);
142int pnfs_layout_process(struct nfs4_layoutget *lgp); 154int pnfs_layout_process(struct nfs4_layoutget *lgp);
155void pnfs_free_lseg_list(struct list_head *tmp_list);
143void pnfs_destroy_layout(struct nfs_inode *); 156void pnfs_destroy_layout(struct nfs_inode *);
144void pnfs_destroy_all_layouts(struct nfs_client *); 157void pnfs_destroy_all_layouts(struct nfs_client *);
145void put_layout_hdr(struct inode *inode); 158void put_layout_hdr(struct pnfs_layout_hdr *lo);
146void pnfs_get_layout_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo, 159void pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo,
147 struct nfs4_state *open_state); 160 const nfs4_stateid *new,
161 bool update_barrier);
162int pnfs_choose_layoutget_stateid(nfs4_stateid *dst,
163 struct pnfs_layout_hdr *lo,
164 struct nfs4_state *open_state);
165int mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
166 struct list_head *tmp_list,
167 u32 iomode);
168bool pnfs_roc(struct inode *ino);
169void pnfs_roc_release(struct inode *ino);
170void pnfs_roc_set_barrier(struct inode *ino, u32 barrier);
171bool pnfs_roc_drain(struct inode *ino, u32 *barrier);
148 172
149 173
150static inline int lo_fail_bit(u32 iomode) 174static inline int lo_fail_bit(u32 iomode)
@@ -176,6 +200,28 @@ pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx,
176 return NULL; 200 return NULL;
177} 201}
178 202
203static inline bool
204pnfs_roc(struct inode *ino)
205{
206 return false;
207}
208
209static inline void
210pnfs_roc_release(struct inode *ino)
211{
212}
213
214static inline void
215pnfs_roc_set_barrier(struct inode *ino, u32 barrier)
216{
217}
218
219static inline bool
220pnfs_roc_drain(struct inode *ino, u32 *barrier)
221{
222 return false;
223}
224
179static inline void set_pnfs_layoutdriver(struct nfs_server *s, u32 id) 225static inline void set_pnfs_layoutdriver(struct nfs_server *s, u32 id)
180{ 226{
181} 227}
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index 58e7f84fc1f..77d5e21c4ad 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -458,7 +458,7 @@ nfs_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page,
458 fattr = nfs_alloc_fattr(); 458 fattr = nfs_alloc_fattr();
459 status = -ENOMEM; 459 status = -ENOMEM;
460 if (fh == NULL || fattr == NULL) 460 if (fh == NULL || fattr == NULL)
461 goto out; 461 goto out_free;
462 462
463 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); 463 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
464 nfs_mark_for_revalidate(dir); 464 nfs_mark_for_revalidate(dir);
@@ -471,6 +471,7 @@ nfs_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page,
471 if (status == 0) 471 if (status == 0)
472 status = nfs_instantiate(dentry, fh, fattr); 472 status = nfs_instantiate(dentry, fh, fattr);
473 473
474out_free:
474 nfs_free_fattr(fattr); 475 nfs_free_fattr(fattr);
475 nfs_free_fhandle(fh); 476 nfs_free_fhandle(fh);
476out: 477out:
@@ -731,7 +732,7 @@ const struct nfs_rpc_ops nfs_v2_clientops = {
731 .statfs = nfs_proc_statfs, 732 .statfs = nfs_proc_statfs,
732 .fsinfo = nfs_proc_fsinfo, 733 .fsinfo = nfs_proc_fsinfo,
733 .pathconf = nfs_proc_pathconf, 734 .pathconf = nfs_proc_pathconf,
734 .decode_dirent = nfs_decode_dirent, 735 .decode_dirent = nfs2_decode_dirent,
735 .read_setup = nfs_proc_read_setup, 736 .read_setup = nfs_proc_read_setup,
736 .read_done = nfs_read_done, 737 .read_done = nfs_read_done,
737 .write_setup = nfs_proc_write_setup, 738 .write_setup = nfs_proc_write_setup,
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index e4b62c6f5a6..aedcaa7f291 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -152,7 +152,6 @@ static void nfs_readpage_release(struct nfs_page *req)
152 (long long)NFS_FILEID(req->wb_context->path.dentry->d_inode), 152 (long long)NFS_FILEID(req->wb_context->path.dentry->d_inode),
153 req->wb_bytes, 153 req->wb_bytes,
154 (long long)req_offset(req)); 154 (long long)req_offset(req));
155 nfs_clear_request(req);
156 nfs_release_request(req); 155 nfs_release_request(req);
157} 156}
158 157
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 0a42e8f4adc..b68c8607770 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -39,7 +39,6 @@
39#include <linux/nfs_mount.h> 39#include <linux/nfs_mount.h>
40#include <linux/nfs4_mount.h> 40#include <linux/nfs4_mount.h>
41#include <linux/lockd/bind.h> 41#include <linux/lockd/bind.h>
42#include <linux/smp_lock.h>
43#include <linux/seq_file.h> 42#include <linux/seq_file.h>
44#include <linux/mount.h> 43#include <linux/mount.h>
45#include <linux/mnt_namespace.h> 44#include <linux/mnt_namespace.h>
@@ -67,6 +66,12 @@
67 66
68#define NFSDBG_FACILITY NFSDBG_VFS 67#define NFSDBG_FACILITY NFSDBG_VFS
69 68
69#ifdef CONFIG_NFS_V3
70#define NFS_DEFAULT_VERSION 3
71#else
72#define NFS_DEFAULT_VERSION 2
73#endif
74
70enum { 75enum {
71 /* Mount options that take no arguments */ 76 /* Mount options that take no arguments */
72 Opt_soft, Opt_hard, 77 Opt_soft, Opt_hard,
@@ -593,7 +598,9 @@ static void nfs_show_mountd_options(struct seq_file *m, struct nfs_server *nfss,
593 598
594 if (nfss->mountd_version || showdefaults) 599 if (nfss->mountd_version || showdefaults)
595 seq_printf(m, ",mountvers=%u", nfss->mountd_version); 600 seq_printf(m, ",mountvers=%u", nfss->mountd_version);
596 if (nfss->mountd_port || showdefaults) 601 if ((nfss->mountd_port &&
602 nfss->mountd_port != (unsigned short)NFS_UNSPEC_PORT) ||
603 showdefaults)
597 seq_printf(m, ",mountport=%u", nfss->mountd_port); 604 seq_printf(m, ",mountport=%u", nfss->mountd_port);
598 605
599 nfs_show_mountd_netid(m, nfss, showdefaults); 606 nfs_show_mountd_netid(m, nfss, showdefaults);
@@ -1064,12 +1071,10 @@ static int nfs_parse_mount_options(char *raw,
1064 mnt->flags |= NFS_MOUNT_VER3; 1071 mnt->flags |= NFS_MOUNT_VER3;
1065 mnt->version = 3; 1072 mnt->version = 3;
1066 break; 1073 break;
1067#ifdef CONFIG_NFS_V4
1068 case Opt_v4: 1074 case Opt_v4:
1069 mnt->flags &= ~NFS_MOUNT_VER3; 1075 mnt->flags &= ~NFS_MOUNT_VER3;
1070 mnt->version = 4; 1076 mnt->version = 4;
1071 break; 1077 break;
1072#endif
1073 case Opt_udp: 1078 case Opt_udp:
1074 mnt->flags &= ~NFS_MOUNT_TCP; 1079 mnt->flags &= ~NFS_MOUNT_TCP;
1075 mnt->nfs_server.protocol = XPRT_TRANSPORT_UDP; 1080 mnt->nfs_server.protocol = XPRT_TRANSPORT_UDP;
@@ -1281,12 +1286,10 @@ static int nfs_parse_mount_options(char *raw,
1281 mnt->flags |= NFS_MOUNT_VER3; 1286 mnt->flags |= NFS_MOUNT_VER3;
1282 mnt->version = 3; 1287 mnt->version = 3;
1283 break; 1288 break;
1284#ifdef CONFIG_NFS_V4
1285 case NFS4_VERSION: 1289 case NFS4_VERSION:
1286 mnt->flags &= ~NFS_MOUNT_VER3; 1290 mnt->flags &= ~NFS_MOUNT_VER3;
1287 mnt->version = 4; 1291 mnt->version = 4;
1288 break; 1292 break;
1289#endif
1290 default: 1293 default:
1291 goto out_invalid_value; 1294 goto out_invalid_value;
1292 } 1295 }
@@ -2199,6 +2202,7 @@ static int nfs_set_super(struct super_block *s, void *data)
2199 2202
2200 s->s_flags = sb_mntdata->mntflags; 2203 s->s_flags = sb_mntdata->mntflags;
2201 s->s_fs_info = server; 2204 s->s_fs_info = server;
2205 s->s_d_op = server->nfs_client->rpc_ops->dentry_ops;
2202 ret = set_anon_super(s, server); 2206 ret = set_anon_super(s, server);
2203 if (ret == 0) 2207 if (ret == 0)
2204 server->s_dev = s->s_dev; 2208 server->s_dev = s->s_dev;
@@ -2277,7 +2281,7 @@ static int nfs_get_sb(struct file_system_type *fs_type,
2277 }; 2281 };
2278 int error = -ENOMEM; 2282 int error = -ENOMEM;
2279 2283
2280 data = nfs_alloc_parsed_mount_data(3); 2284 data = nfs_alloc_parsed_mount_data(NFS_DEFAULT_VERSION);
2281 mntfh = nfs_alloc_fhandle(); 2285 mntfh = nfs_alloc_fhandle();
2282 if (data == NULL || mntfh == NULL) 2286 if (data == NULL || mntfh == NULL)
2283 goto out_free_fh; 2287 goto out_free_fh;
@@ -2493,7 +2497,13 @@ static void nfs4_clone_super(struct super_block *sb,
2493 sb->s_maxbytes = old_sb->s_maxbytes; 2497 sb->s_maxbytes = old_sb->s_maxbytes;
2494 sb->s_time_gran = 1; 2498 sb->s_time_gran = 1;
2495 sb->s_op = old_sb->s_op; 2499 sb->s_op = old_sb->s_op;
2496 nfs_initialise_sb(sb); 2500 /*
2501 * The VFS shouldn't apply the umask to mode bits. We will do
2502 * so ourselves when necessary.
2503 */
2504 sb->s_flags |= MS_POSIXACL;
2505 sb->s_xattr = old_sb->s_xattr;
2506 nfs_initialise_sb(sb);
2497} 2507}
2498 2508
2499/* 2509/*
@@ -2503,6 +2513,12 @@ static void nfs4_fill_super(struct super_block *sb)
2503{ 2513{
2504 sb->s_time_gran = 1; 2514 sb->s_time_gran = 1;
2505 sb->s_op = &nfs4_sops; 2515 sb->s_op = &nfs4_sops;
2516 /*
2517 * The VFS shouldn't apply the umask to mode bits. We will do
2518 * so ourselves when necessary.
2519 */
2520 sb->s_flags |= MS_POSIXACL;
2521 sb->s_xattr = nfs4_xattr_handlers;
2506 nfs_initialise_sb(sb); 2522 nfs_initialise_sb(sb);
2507} 2523}
2508 2524
diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
index 7bdec853140..e313a51acdd 100644
--- a/fs/nfs/unlink.c
+++ b/fs/nfs/unlink.c
@@ -429,7 +429,7 @@ nfs_async_rename(struct inode *old_dir, struct inode *new_dir,
429 data = kzalloc(sizeof(*data), GFP_KERNEL); 429 data = kzalloc(sizeof(*data), GFP_KERNEL);
430 if (data == NULL) 430 if (data == NULL)
431 return ERR_PTR(-ENOMEM); 431 return ERR_PTR(-ENOMEM);
432 task_setup_data.callback_data = data, 432 task_setup_data.callback_data = data;
433 433
434 data->cred = rpc_lookup_cred(); 434 data->cred = rpc_lookup_cred();
435 if (IS_ERR(data->cred)) { 435 if (IS_ERR(data->cred)) {
@@ -496,7 +496,7 @@ nfs_sillyrename(struct inode *dir, struct dentry *dentry)
496 496
497 dfprintk(VFS, "NFS: silly-rename(%s/%s, ct=%d)\n", 497 dfprintk(VFS, "NFS: silly-rename(%s/%s, ct=%d)\n",
498 dentry->d_parent->d_name.name, dentry->d_name.name, 498 dentry->d_parent->d_name.name, dentry->d_name.name,
499 atomic_read(&dentry->d_count)); 499 dentry->d_count);
500 nfs_inc_stats(dir, NFSIOS_SILLYRENAME); 500 nfs_inc_stats(dir, NFSIOS_SILLYRENAME);
501 501
502 /* 502 /*
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 4c14c17a527..c8278f4046c 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -390,6 +390,7 @@ static int nfs_inode_add_request(struct inode *inode, struct nfs_page *req)
390 if (nfs_have_delegation(inode, FMODE_WRITE)) 390 if (nfs_have_delegation(inode, FMODE_WRITE))
391 nfsi->change_attr++; 391 nfsi->change_attr++;
392 } 392 }
393 set_bit(PG_MAPPED, &req->wb_flags);
393 SetPagePrivate(req->wb_page); 394 SetPagePrivate(req->wb_page);
394 set_page_private(req->wb_page, (unsigned long)req); 395 set_page_private(req->wb_page, (unsigned long)req);
395 nfsi->npages++; 396 nfsi->npages++;
@@ -415,6 +416,7 @@ static void nfs_inode_remove_request(struct nfs_page *req)
415 spin_lock(&inode->i_lock); 416 spin_lock(&inode->i_lock);
416 set_page_private(req->wb_page, 0); 417 set_page_private(req->wb_page, 0);
417 ClearPagePrivate(req->wb_page); 418 ClearPagePrivate(req->wb_page);
419 clear_bit(PG_MAPPED, &req->wb_flags);
418 radix_tree_delete(&nfsi->nfs_page_tree, req->wb_index); 420 radix_tree_delete(&nfsi->nfs_page_tree, req->wb_index);
419 nfsi->npages--; 421 nfsi->npages--;
420 if (!nfsi->npages) { 422 if (!nfsi->npages) {
@@ -422,7 +424,6 @@ static void nfs_inode_remove_request(struct nfs_page *req)
422 iput(inode); 424 iput(inode);
423 } else 425 } else
424 spin_unlock(&inode->i_lock); 426 spin_unlock(&inode->i_lock);
425 nfs_clear_request(req);
426 nfs_release_request(req); 427 nfs_release_request(req);
427} 428}
428 429
@@ -931,7 +932,7 @@ out_bad:
931 while (!list_empty(&list)) { 932 while (!list_empty(&list)) {
932 data = list_entry(list.next, struct nfs_write_data, pages); 933 data = list_entry(list.next, struct nfs_write_data, pages);
933 list_del(&data->pages); 934 list_del(&data->pages);
934 nfs_writedata_release(data); 935 nfs_writedata_free(data);
935 } 936 }
936 nfs_redirty_request(req); 937 nfs_redirty_request(req);
937 return -ENOMEM; 938 return -ENOMEM;
diff --git a/fs/nfs_common/nfsacl.c b/fs/nfs_common/nfsacl.c
index fc1c52571c0..84c27d69d42 100644
--- a/fs/nfs_common/nfsacl.c
+++ b/fs/nfs_common/nfsacl.c
@@ -42,6 +42,11 @@ struct nfsacl_encode_desc {
42 gid_t gid; 42 gid_t gid;
43}; 43};
44 44
45struct nfsacl_simple_acl {
46 struct posix_acl acl;
47 struct posix_acl_entry ace[4];
48};
49
45static int 50static int
46xdr_nfsace_encode(struct xdr_array2_desc *desc, void *elem) 51xdr_nfsace_encode(struct xdr_array2_desc *desc, void *elem)
47{ 52{
@@ -72,9 +77,20 @@ xdr_nfsace_encode(struct xdr_array2_desc *desc, void *elem)
72 return 0; 77 return 0;
73} 78}
74 79
75unsigned int 80/**
76nfsacl_encode(struct xdr_buf *buf, unsigned int base, struct inode *inode, 81 * nfsacl_encode - Encode an NFSv3 ACL
77 struct posix_acl *acl, int encode_entries, int typeflag) 82 *
83 * @buf: destination xdr_buf to contain XDR encoded ACL
84 * @base: byte offset in xdr_buf where XDR'd ACL begins
85 * @inode: inode of file whose ACL this is
86 * @acl: posix_acl to encode
87 * @encode_entries: whether to encode ACEs as well
88 * @typeflag: ACL type: NFS_ACL_DEFAULT or zero
89 *
90 * Returns size of encoded ACL in bytes or a negative errno value.
91 */
92int nfsacl_encode(struct xdr_buf *buf, unsigned int base, struct inode *inode,
93 struct posix_acl *acl, int encode_entries, int typeflag)
78{ 94{
79 int entries = (acl && acl->a_count) ? max_t(int, acl->a_count, 4) : 0; 95 int entries = (acl && acl->a_count) ? max_t(int, acl->a_count, 4) : 0;
80 struct nfsacl_encode_desc nfsacl_desc = { 96 struct nfsacl_encode_desc nfsacl_desc = {
@@ -88,17 +104,22 @@ nfsacl_encode(struct xdr_buf *buf, unsigned int base, struct inode *inode,
88 .uid = inode->i_uid, 104 .uid = inode->i_uid,
89 .gid = inode->i_gid, 105 .gid = inode->i_gid,
90 }; 106 };
107 struct nfsacl_simple_acl aclbuf;
91 int err; 108 int err;
92 struct posix_acl *acl2 = NULL;
93 109
94 if (entries > NFS_ACL_MAX_ENTRIES || 110 if (entries > NFS_ACL_MAX_ENTRIES ||
95 xdr_encode_word(buf, base, entries)) 111 xdr_encode_word(buf, base, entries))
96 return -EINVAL; 112 return -EINVAL;
97 if (encode_entries && acl && acl->a_count == 3) { 113 if (encode_entries && acl && acl->a_count == 3) {
98 /* Fake up an ACL_MASK entry. */ 114 struct posix_acl *acl2 = &aclbuf.acl;
99 acl2 = posix_acl_alloc(4, GFP_KERNEL); 115
100 if (!acl2) 116 /* Avoid the use of posix_acl_alloc(). nfsacl_encode() is
101 return -ENOMEM; 117 * invoked in contexts where a memory allocation failure is
118 * fatal. Fortunately this fake ACL is small enough to
119 * construct on the stack. */
120 memset(acl2, 0, sizeof(acl2));
121 posix_acl_init(acl2, 4);
122
102 /* Insert entries in canonical order: other orders seem 123 /* Insert entries in canonical order: other orders seem
103 to confuse Solaris VxFS. */ 124 to confuse Solaris VxFS. */
104 acl2->a_entries[0] = acl->a_entries[0]; /* ACL_USER_OBJ */ 125 acl2->a_entries[0] = acl->a_entries[0]; /* ACL_USER_OBJ */
@@ -109,8 +130,6 @@ nfsacl_encode(struct xdr_buf *buf, unsigned int base, struct inode *inode,
109 nfsacl_desc.acl = acl2; 130 nfsacl_desc.acl = acl2;
110 } 131 }
111 err = xdr_encode_array2(buf, base + 4, &nfsacl_desc.desc); 132 err = xdr_encode_array2(buf, base + 4, &nfsacl_desc.desc);
112 if (acl2)
113 posix_acl_release(acl2);
114 if (!err) 133 if (!err)
115 err = 8 + nfsacl_desc.desc.elem_size * 134 err = 8 + nfsacl_desc.desc.elem_size *
116 nfsacl_desc.desc.array_len; 135 nfsacl_desc.desc.array_len;
@@ -224,9 +243,18 @@ posix_acl_from_nfsacl(struct posix_acl *acl)
224 return 0; 243 return 0;
225} 244}
226 245
227unsigned int 246/**
228nfsacl_decode(struct xdr_buf *buf, unsigned int base, unsigned int *aclcnt, 247 * nfsacl_decode - Decode an NFSv3 ACL
229 struct posix_acl **pacl) 248 *
249 * @buf: xdr_buf containing XDR'd ACL data to decode
250 * @base: byte offset in xdr_buf where XDR'd ACL begins
251 * @aclcnt: count of ACEs in decoded posix_acl
252 * @pacl: buffer in which to place decoded posix_acl
253 *
254 * Returns the length of the decoded ACL in bytes, or a negative errno value.
255 */
256int nfsacl_decode(struct xdr_buf *buf, unsigned int base, unsigned int *aclcnt,
257 struct posix_acl **pacl)
230{ 258{
231 struct nfsacl_decode_desc nfsacl_desc = { 259 struct nfsacl_decode_desc nfsacl_desc = {
232 .desc = { 260 .desc = {
diff --git a/fs/nfsd/acl.h b/fs/nfsd/acl.h
new file mode 100644
index 00000000000..34e5c40af5e
--- /dev/null
+++ b/fs/nfsd/acl.h
@@ -0,0 +1,59 @@
1/*
2 * Common NFSv4 ACL handling definitions.
3 *
4 * Copyright (c) 2002 The Regents of the University of Michigan.
5 * All rights reserved.
6 *
7 * Marius Aamodt Eriksen <marius@umich.edu>
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 *
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 3. Neither the name of the University nor the names of its
19 * contributors may be used to endorse or promote products derived
20 * from this software without specific prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
23 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
24 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
25 * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
27 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
28 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
29 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
30 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
31 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
32 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33 */
34
35#ifndef LINUX_NFS4_ACL_H
36#define LINUX_NFS4_ACL_H
37
38#include <linux/posix_acl.h>
39
40/* Maximum ACL we'll accept from client; chosen (somewhat arbitrarily) to
41 * fit in a page: */
42#define NFS4_ACL_MAX 170
43
44struct nfs4_acl *nfs4_acl_new(int);
45int nfs4_acl_get_whotype(char *, u32);
46int nfs4_acl_write_who(int who, char *p);
47int nfs4_acl_permission(struct nfs4_acl *acl, uid_t owner, gid_t group,
48 uid_t who, u32 mask);
49
50#define NFS4_ACL_TYPE_DEFAULT 0x01
51#define NFS4_ACL_DIR 0x02
52#define NFS4_ACL_OWNER 0x04
53
54struct nfs4_acl *nfs4_acl_posix_to_nfsv4(struct posix_acl *,
55 struct posix_acl *, unsigned int flags);
56int nfs4_acl_nfsv4_to_posix(struct nfs4_acl *, struct posix_acl **,
57 struct posix_acl **, unsigned int flags);
58
59#endif /* LINUX_NFS4_ACL_H */
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index c0fcb7ab7f6..8b31e5f8795 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -1,4 +1,3 @@
1#define MSNFS /* HACK HACK */
2/* 1/*
3 * NFS exporting and validation. 2 * NFS exporting and validation.
4 * 3 *
@@ -1444,9 +1443,6 @@ static struct flags {
1444 { NFSEXP_NOSUBTREECHECK, {"no_subtree_check", ""}}, 1443 { NFSEXP_NOSUBTREECHECK, {"no_subtree_check", ""}},
1445 { NFSEXP_NOAUTHNLM, {"insecure_locks", ""}}, 1444 { NFSEXP_NOAUTHNLM, {"insecure_locks", ""}},
1446 { NFSEXP_V4ROOT, {"v4root", ""}}, 1445 { NFSEXP_V4ROOT, {"v4root", ""}},
1447#ifdef MSNFS
1448 { NFSEXP_MSNFS, {"msnfs", ""}},
1449#endif
1450 { 0, {"", ""}} 1446 { 0, {"", ""}}
1451}; 1447};
1452 1448
diff --git a/fs/nfsd/idmap.h b/fs/nfsd/idmap.h
new file mode 100644
index 00000000000..2f3be132153
--- /dev/null
+++ b/fs/nfsd/idmap.h
@@ -0,0 +1,62 @@
1/*
2 * Mapping of UID to name and vice versa.
3 *
4 * Copyright (c) 2002, 2003 The Regents of the University of
5 * Michigan. All rights reserved.
6 *
7 * Marius Aamodt Eriksen <marius@umich.edu>
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 *
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 3. Neither the name of the University nor the names of its
19 * contributors may be used to endorse or promote products derived
20 * from this software without specific prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
23 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
24 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
25 * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
27 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
28 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
29 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
30 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
31 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
32 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33 */
34
35#ifndef LINUX_NFSD_IDMAP_H
36#define LINUX_NFSD_IDMAP_H
37
38#include <linux/in.h>
39#include <linux/sunrpc/svc.h>
40
41/* XXX from linux/nfs_idmap.h */
42#define IDMAP_NAMESZ 128
43
44#ifdef CONFIG_NFSD_V4
45int nfsd_idmap_init(void);
46void nfsd_idmap_shutdown(void);
47#else
48static inline int nfsd_idmap_init(void)
49{
50 return 0;
51}
52static inline void nfsd_idmap_shutdown(void)
53{
54}
55#endif
56
57__be32 nfsd_map_name_to_uid(struct svc_rqst *, const char *, size_t, __u32 *);
58__be32 nfsd_map_name_to_gid(struct svc_rqst *, const char *, size_t, __u32 *);
59int nfsd_map_uid_to_name(struct svc_rqst *, __u32, char *);
60int nfsd_map_gid_to_name(struct svc_rqst *, __u32, char *);
61
62#endif /* LINUX_NFSD_IDMAP_H */
diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index 5b7e3021e06..2247fc91d5e 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -151,10 +151,10 @@ nfsd3_proc_read(struct svc_rqst *rqstp, struct nfsd3_readargs *argp,
151 __be32 nfserr; 151 __be32 nfserr;
152 u32 max_blocksize = svc_max_payload(rqstp); 152 u32 max_blocksize = svc_max_payload(rqstp);
153 153
154 dprintk("nfsd: READ(3) %s %lu bytes at %lu\n", 154 dprintk("nfsd: READ(3) %s %lu bytes at %Lu\n",
155 SVCFH_fmt(&argp->fh), 155 SVCFH_fmt(&argp->fh),
156 (unsigned long) argp->count, 156 (unsigned long) argp->count,
157 (unsigned long) argp->offset); 157 (unsigned long long) argp->offset);
158 158
159 /* Obtain buffer pointer for payload. 159 /* Obtain buffer pointer for payload.
160 * 1 (status) + 22 (post_op_attr) + 1 (count) + 1 (eof) 160 * 1 (status) + 22 (post_op_attr) + 1 (count) + 1 (eof)
@@ -191,10 +191,10 @@ nfsd3_proc_write(struct svc_rqst *rqstp, struct nfsd3_writeargs *argp,
191 __be32 nfserr; 191 __be32 nfserr;
192 unsigned long cnt = argp->len; 192 unsigned long cnt = argp->len;
193 193
194 dprintk("nfsd: WRITE(3) %s %d bytes at %ld%s\n", 194 dprintk("nfsd: WRITE(3) %s %d bytes at %Lu%s\n",
195 SVCFH_fmt(&argp->fh), 195 SVCFH_fmt(&argp->fh),
196 argp->len, 196 argp->len,
197 (unsigned long) argp->offset, 197 (unsigned long long) argp->offset,
198 argp->stable? " stable" : ""); 198 argp->stable? " stable" : "");
199 199
200 fh_copy(&resp->fh, &argp->fh); 200 fh_copy(&resp->fh, &argp->fh);
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index 2a533a0af2a..7e84a852cda 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -260,9 +260,11 @@ void fill_post_wcc(struct svc_fh *fhp)
260 err = vfs_getattr(fhp->fh_export->ex_path.mnt, fhp->fh_dentry, 260 err = vfs_getattr(fhp->fh_export->ex_path.mnt, fhp->fh_dentry,
261 &fhp->fh_post_attr); 261 &fhp->fh_post_attr);
262 fhp->fh_post_change = fhp->fh_dentry->d_inode->i_version; 262 fhp->fh_post_change = fhp->fh_dentry->d_inode->i_version;
263 if (err) 263 if (err) {
264 fhp->fh_post_saved = 0; 264 fhp->fh_post_saved = 0;
265 else 265 /* Grab the ctime anyway - set_change_info might use it */
266 fhp->fh_post_attr.ctime = fhp->fh_dentry->d_inode->i_ctime;
267 } else
266 fhp->fh_post_saved = 1; 268 fhp->fh_post_saved = 1;
267} 269}
268 270
diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c
index e4805261515..ad88f1c0a4c 100644
--- a/fs/nfsd/nfs4acl.c
+++ b/fs/nfsd/nfs4acl.c
@@ -36,7 +36,7 @@
36 36
37#include <linux/slab.h> 37#include <linux/slab.h>
38#include <linux/nfs_fs.h> 38#include <linux/nfs_fs.h>
39#include <linux/nfs4_acl.h> 39#include "acl.h"
40 40
41 41
42/* mode bit translations: */ 42/* mode bit translations: */
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 143da2eecd7..3be975e1891 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -50,11 +50,6 @@ enum {
50 NFSPROC4_CLNT_CB_SEQUENCE, 50 NFSPROC4_CLNT_CB_SEQUENCE,
51}; 51};
52 52
53enum nfs_cb_opnum4 {
54 OP_CB_RECALL = 4,
55 OP_CB_SEQUENCE = 11,
56};
57
58#define NFS4_MAXTAGLEN 20 53#define NFS4_MAXTAGLEN 20
59 54
60#define NFS4_enc_cb_null_sz 0 55#define NFS4_enc_cb_null_sz 0
@@ -79,61 +74,6 @@ enum nfs_cb_opnum4 {
79 cb_sequence_dec_sz + \ 74 cb_sequence_dec_sz + \
80 op_dec_sz) 75 op_dec_sz)
81 76
82/*
83* Generic encode routines from fs/nfs/nfs4xdr.c
84*/
85static inline __be32 *
86xdr_writemem(__be32 *p, const void *ptr, int nbytes)
87{
88 int tmp = XDR_QUADLEN(nbytes);
89 if (!tmp)
90 return p;
91 p[tmp-1] = 0;
92 memcpy(p, ptr, nbytes);
93 return p + tmp;
94}
95
96#define WRITE32(n) *p++ = htonl(n)
97#define WRITEMEM(ptr,nbytes) do { \
98 p = xdr_writemem(p, ptr, nbytes); \
99} while (0)
100#define RESERVE_SPACE(nbytes) do { \
101 p = xdr_reserve_space(xdr, nbytes); \
102 if (!p) dprintk("NFSD: RESERVE_SPACE(%d) failed in function %s\n", (int) (nbytes), __func__); \
103 BUG_ON(!p); \
104} while (0)
105
106/*
107 * Generic decode routines from fs/nfs/nfs4xdr.c
108 */
109#define DECODE_TAIL \
110 status = 0; \
111out: \
112 return status; \
113xdr_error: \
114 dprintk("NFSD: xdr error! (%s:%d)\n", __FILE__, __LINE__); \
115 status = -EIO; \
116 goto out
117
118#define READ32(x) (x) = ntohl(*p++)
119#define READ64(x) do { \
120 (x) = (u64)ntohl(*p++) << 32; \
121 (x) |= ntohl(*p++); \
122} while (0)
123#define READTIME(x) do { \
124 p++; \
125 (x.tv_sec) = ntohl(*p++); \
126 (x.tv_nsec) = ntohl(*p++); \
127} while (0)
128#define READ_BUF(nbytes) do { \
129 p = xdr_inline_decode(xdr, nbytes); \
130 if (!p) { \
131 dprintk("NFSD: %s: reply buffer overflowed in line %d.\n", \
132 __func__, __LINE__); \
133 return -EIO; \
134 } \
135} while (0)
136
137struct nfs4_cb_compound_hdr { 77struct nfs4_cb_compound_hdr {
138 /* args */ 78 /* args */
139 u32 ident; /* minorversion 0 only */ 79 u32 ident; /* minorversion 0 only */
@@ -144,295 +84,513 @@ struct nfs4_cb_compound_hdr {
144 int status; 84 int status;
145}; 85};
146 86
147static struct { 87/*
148int stat; 88 * Handle decode buffer overflows out-of-line.
149int errno; 89 */
150} nfs_cb_errtbl[] = { 90static void print_overflow_msg(const char *func, const struct xdr_stream *xdr)
151 { NFS4_OK, 0 }, 91{
152 { NFS4ERR_PERM, EPERM }, 92 dprintk("NFS: %s prematurely hit the end of our receive buffer. "
153 { NFS4ERR_NOENT, ENOENT }, 93 "Remaining buffer length is %tu words.\n",
154 { NFS4ERR_IO, EIO }, 94 func, xdr->end - xdr->p);
155 { NFS4ERR_NXIO, ENXIO }, 95}
156 { NFS4ERR_ACCESS, EACCES },
157 { NFS4ERR_EXIST, EEXIST },
158 { NFS4ERR_XDEV, EXDEV },
159 { NFS4ERR_NOTDIR, ENOTDIR },
160 { NFS4ERR_ISDIR, EISDIR },
161 { NFS4ERR_INVAL, EINVAL },
162 { NFS4ERR_FBIG, EFBIG },
163 { NFS4ERR_NOSPC, ENOSPC },
164 { NFS4ERR_ROFS, EROFS },
165 { NFS4ERR_MLINK, EMLINK },
166 { NFS4ERR_NAMETOOLONG, ENAMETOOLONG },
167 { NFS4ERR_NOTEMPTY, ENOTEMPTY },
168 { NFS4ERR_DQUOT, EDQUOT },
169 { NFS4ERR_STALE, ESTALE },
170 { NFS4ERR_BADHANDLE, EBADHANDLE },
171 { NFS4ERR_BAD_COOKIE, EBADCOOKIE },
172 { NFS4ERR_NOTSUPP, ENOTSUPP },
173 { NFS4ERR_TOOSMALL, ETOOSMALL },
174 { NFS4ERR_SERVERFAULT, ESERVERFAULT },
175 { NFS4ERR_BADTYPE, EBADTYPE },
176 { NFS4ERR_LOCKED, EAGAIN },
177 { NFS4ERR_RESOURCE, EREMOTEIO },
178 { NFS4ERR_SYMLINK, ELOOP },
179 { NFS4ERR_OP_ILLEGAL, EOPNOTSUPP },
180 { NFS4ERR_DEADLOCK, EDEADLK },
181 { -1, EIO }
182};
183 96
184static int 97static __be32 *xdr_encode_empty_array(__be32 *p)
185nfs_cb_stat_to_errno(int stat)
186{ 98{
187 int i; 99 *p++ = xdr_zero;
188 for (i = 0; nfs_cb_errtbl[i].stat != -1; i++) { 100 return p;
189 if (nfs_cb_errtbl[i].stat == stat)
190 return nfs_cb_errtbl[i].errno;
191 }
192 /* If we cannot translate the error, the recovery routines should
193 * handle it.
194 * Note: remaining NFSv4 error codes have values > 10000, so should
195 * not conflict with native Linux error codes.
196 */
197 return stat;
198} 101}
199 102
200/* 103/*
201 * XDR encode 104 * Encode/decode NFSv4 CB basic data types
105 *
106 * Basic NFSv4 callback data types are defined in section 15 of RFC
107 * 3530: "Network File System (NFS) version 4 Protocol" and section
108 * 20 of RFC 5661: "Network File System (NFS) Version 4 Minor Version
109 * 1 Protocol"
202 */ 110 */
203 111
204static void 112/*
205encode_stateid(struct xdr_stream *xdr, stateid_t *sid) 113 * nfs_cb_opnum4
114 *
115 * enum nfs_cb_opnum4 {
116 * OP_CB_GETATTR = 3,
117 * ...
118 * };
119 */
120enum nfs_cb_opnum4 {
121 OP_CB_GETATTR = 3,
122 OP_CB_RECALL = 4,
123 OP_CB_LAYOUTRECALL = 5,
124 OP_CB_NOTIFY = 6,
125 OP_CB_PUSH_DELEG = 7,
126 OP_CB_RECALL_ANY = 8,
127 OP_CB_RECALLABLE_OBJ_AVAIL = 9,
128 OP_CB_RECALL_SLOT = 10,
129 OP_CB_SEQUENCE = 11,
130 OP_CB_WANTS_CANCELLED = 12,
131 OP_CB_NOTIFY_LOCK = 13,
132 OP_CB_NOTIFY_DEVICEID = 14,
133 OP_CB_ILLEGAL = 10044
134};
135
136static void encode_nfs_cb_opnum4(struct xdr_stream *xdr, enum nfs_cb_opnum4 op)
206{ 137{
207 __be32 *p; 138 __be32 *p;
208 139
209 RESERVE_SPACE(sizeof(stateid_t)); 140 p = xdr_reserve_space(xdr, 4);
210 WRITE32(sid->si_generation); 141 *p = cpu_to_be32(op);
211 WRITEMEM(&sid->si_opaque, sizeof(stateid_opaque_t));
212} 142}
213 143
214static void 144/*
215encode_cb_compound_hdr(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr) 145 * nfs_fh4
146 *
147 * typedef opaque nfs_fh4<NFS4_FHSIZE>;
148 */
149static void encode_nfs_fh4(struct xdr_stream *xdr, const struct knfsd_fh *fh)
216{ 150{
217 __be32 * p; 151 u32 length = fh->fh_size;
152 __be32 *p;
218 153
219 RESERVE_SPACE(16); 154 BUG_ON(length > NFS4_FHSIZE);
220 WRITE32(0); /* tag length is always 0 */ 155 p = xdr_reserve_space(xdr, 4 + length);
221 WRITE32(hdr->minorversion); 156 xdr_encode_opaque(p, &fh->fh_base, length);
222 WRITE32(hdr->ident);
223 hdr->nops_p = p;
224 WRITE32(hdr->nops);
225} 157}
226 158
227static void encode_cb_nops(struct nfs4_cb_compound_hdr *hdr) 159/*
160 * stateid4
161 *
162 * struct stateid4 {
163 * uint32_t seqid;
164 * opaque other[12];
165 * };
166 */
167static void encode_stateid4(struct xdr_stream *xdr, const stateid_t *sid)
228{ 168{
229 *hdr->nops_p = htonl(hdr->nops); 169 __be32 *p;
170
171 p = xdr_reserve_space(xdr, NFS4_STATEID_SIZE);
172 *p++ = cpu_to_be32(sid->si_generation);
173 xdr_encode_opaque_fixed(p, &sid->si_opaque, NFS4_STATEID_OTHER_SIZE);
230} 174}
231 175
232static void 176/*
233encode_cb_recall(struct xdr_stream *xdr, struct nfs4_delegation *dp, 177 * sessionid4
234 struct nfs4_cb_compound_hdr *hdr) 178 *
179 * typedef opaque sessionid4[NFS4_SESSIONID_SIZE];
180 */
181static void encode_sessionid4(struct xdr_stream *xdr,
182 const struct nfsd4_session *session)
235{ 183{
236 __be32 *p; 184 __be32 *p;
237 int len = dp->dl_fh.fh_size; 185
238 186 p = xdr_reserve_space(xdr, NFS4_MAX_SESSIONID_LEN);
239 RESERVE_SPACE(4); 187 xdr_encode_opaque_fixed(p, session->se_sessionid.data,
240 WRITE32(OP_CB_RECALL); 188 NFS4_MAX_SESSIONID_LEN);
241 encode_stateid(xdr, &dp->dl_stateid);
242 RESERVE_SPACE(8 + (XDR_QUADLEN(len) << 2));
243 WRITE32(0); /* truncate optimization not implemented */
244 WRITE32(len);
245 WRITEMEM(&dp->dl_fh.fh_base, len);
246 hdr->nops++;
247} 189}
248 190
249static void 191/*
250encode_cb_sequence(struct xdr_stream *xdr, struct nfsd4_callback *cb, 192 * nfsstat4
251 struct nfs4_cb_compound_hdr *hdr) 193 */
252{ 194static const struct {
253 __be32 *p; 195 int stat;
254 struct nfsd4_session *ses = cb->cb_clp->cl_cb_session; 196 int errno;
197} nfs_cb_errtbl[] = {
198 { NFS4_OK, 0 },
199 { NFS4ERR_PERM, -EPERM },
200 { NFS4ERR_NOENT, -ENOENT },
201 { NFS4ERR_IO, -EIO },
202 { NFS4ERR_NXIO, -ENXIO },
203 { NFS4ERR_ACCESS, -EACCES },
204 { NFS4ERR_EXIST, -EEXIST },
205 { NFS4ERR_XDEV, -EXDEV },
206 { NFS4ERR_NOTDIR, -ENOTDIR },
207 { NFS4ERR_ISDIR, -EISDIR },
208 { NFS4ERR_INVAL, -EINVAL },
209 { NFS4ERR_FBIG, -EFBIG },
210 { NFS4ERR_NOSPC, -ENOSPC },
211 { NFS4ERR_ROFS, -EROFS },
212 { NFS4ERR_MLINK, -EMLINK },
213 { NFS4ERR_NAMETOOLONG, -ENAMETOOLONG },
214 { NFS4ERR_NOTEMPTY, -ENOTEMPTY },
215 { NFS4ERR_DQUOT, -EDQUOT },
216 { NFS4ERR_STALE, -ESTALE },
217 { NFS4ERR_BADHANDLE, -EBADHANDLE },
218 { NFS4ERR_BAD_COOKIE, -EBADCOOKIE },
219 { NFS4ERR_NOTSUPP, -ENOTSUPP },
220 { NFS4ERR_TOOSMALL, -ETOOSMALL },
221 { NFS4ERR_SERVERFAULT, -ESERVERFAULT },
222 { NFS4ERR_BADTYPE, -EBADTYPE },
223 { NFS4ERR_LOCKED, -EAGAIN },
224 { NFS4ERR_RESOURCE, -EREMOTEIO },
225 { NFS4ERR_SYMLINK, -ELOOP },
226 { NFS4ERR_OP_ILLEGAL, -EOPNOTSUPP },
227 { NFS4ERR_DEADLOCK, -EDEADLK },
228 { -1, -EIO }
229};
255 230
256 if (hdr->minorversion == 0) 231/*
257 return; 232 * If we cannot translate the error, the recovery routines should
233 * handle it.
234 *
235 * Note: remaining NFSv4 error codes have values > 10000, so should
236 * not conflict with native Linux error codes.
237 */
238static int nfs_cb_stat_to_errno(int status)
239{
240 int i;
258 241
259 RESERVE_SPACE(1 + NFS4_MAX_SESSIONID_LEN + 20); 242 for (i = 0; nfs_cb_errtbl[i].stat != -1; i++) {
243 if (nfs_cb_errtbl[i].stat == status)
244 return nfs_cb_errtbl[i].errno;
245 }
260 246
261 WRITE32(OP_CB_SEQUENCE); 247 dprintk("NFSD: Unrecognized NFS CB status value: %u\n", status);
262 WRITEMEM(ses->se_sessionid.data, NFS4_MAX_SESSIONID_LEN); 248 return -status;
263 WRITE32(ses->se_cb_seq_nr);
264 WRITE32(0); /* slotid, always 0 */
265 WRITE32(0); /* highest slotid always 0 */
266 WRITE32(0); /* cachethis always 0 */
267 WRITE32(0); /* FIXME: support referring_call_lists */
268 hdr->nops++;
269} 249}
270 250
271static int 251static int decode_cb_op_status(struct xdr_stream *xdr, enum nfs_opnum4 expected,
272nfs4_xdr_enc_cb_null(struct rpc_rqst *req, __be32 *p) 252 enum nfsstat4 *status)
273{ 253{
274 struct xdr_stream xdrs, *xdr = &xdrs; 254 __be32 *p;
255 u32 op;
275 256
276 xdr_init_encode(&xdrs, &req->rq_snd_buf, p); 257 p = xdr_inline_decode(xdr, 4 + 4);
277 RESERVE_SPACE(0); 258 if (unlikely(p == NULL))
259 goto out_overflow;
260 op = be32_to_cpup(p++);
261 if (unlikely(op != expected))
262 goto out_unexpected;
263 *status = be32_to_cpup(p);
278 return 0; 264 return 0;
265out_overflow:
266 print_overflow_msg(__func__, xdr);
267 return -EIO;
268out_unexpected:
269 dprintk("NFSD: Callback server returned operation %d but "
270 "we issued a request for %d\n", op, expected);
271 return -EIO;
279} 272}
280 273
281static int 274/*
282nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, __be32 *p, 275 * CB_COMPOUND4args
283 struct nfsd4_callback *cb) 276 *
277 * struct CB_COMPOUND4args {
278 * utf8str_cs tag;
279 * uint32_t minorversion;
280 * uint32_t callback_ident;
281 * nfs_cb_argop4 argarray<>;
282 * };
283*/
284static void encode_cb_compound4args(struct xdr_stream *xdr,
285 struct nfs4_cb_compound_hdr *hdr)
284{ 286{
285 struct xdr_stream xdr; 287 __be32 * p;
286 struct nfs4_delegation *args = cb->cb_op;
287 struct nfs4_cb_compound_hdr hdr = {
288 .ident = cb->cb_clp->cl_cb_ident,
289 .minorversion = cb->cb_minorversion,
290 };
291 288
292 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 289 p = xdr_reserve_space(xdr, 4 + 4 + 4 + 4);
293 encode_cb_compound_hdr(&xdr, &hdr); 290 p = xdr_encode_empty_array(p); /* empty tag */
294 encode_cb_sequence(&xdr, cb, &hdr); 291 *p++ = cpu_to_be32(hdr->minorversion);
295 encode_cb_recall(&xdr, args, &hdr); 292 *p++ = cpu_to_be32(hdr->ident);
296 encode_cb_nops(&hdr); 293
294 hdr->nops_p = p;
295 *p = cpu_to_be32(hdr->nops); /* argarray element count */
296}
297
298/*
299 * Update argarray element count
300 */
301static void encode_cb_nops(struct nfs4_cb_compound_hdr *hdr)
302{
303 BUG_ON(hdr->nops > NFS4_MAX_BACK_CHANNEL_OPS);
304 *hdr->nops_p = cpu_to_be32(hdr->nops);
305}
306
307/*
308 * CB_COMPOUND4res
309 *
310 * struct CB_COMPOUND4res {
311 * nfsstat4 status;
312 * utf8str_cs tag;
313 * nfs_cb_resop4 resarray<>;
314 * };
315 */
316static int decode_cb_compound4res(struct xdr_stream *xdr,
317 struct nfs4_cb_compound_hdr *hdr)
318{
319 u32 length;
320 __be32 *p;
321
322 p = xdr_inline_decode(xdr, 4 + 4);
323 if (unlikely(p == NULL))
324 goto out_overflow;
325 hdr->status = be32_to_cpup(p++);
326 /* Ignore the tag */
327 length = be32_to_cpup(p++);
328 p = xdr_inline_decode(xdr, length + 4);
329 if (unlikely(p == NULL))
330 goto out_overflow;
331 hdr->nops = be32_to_cpup(p);
297 return 0; 332 return 0;
333out_overflow:
334 print_overflow_msg(__func__, xdr);
335 return -EIO;
298} 336}
299 337
338/*
339 * CB_RECALL4args
340 *
341 * struct CB_RECALL4args {
342 * stateid4 stateid;
343 * bool truncate;
344 * nfs_fh4 fh;
345 * };
346 */
347static void encode_cb_recall4args(struct xdr_stream *xdr,
348 const struct nfs4_delegation *dp,
349 struct nfs4_cb_compound_hdr *hdr)
350{
351 __be32 *p;
352
353 encode_nfs_cb_opnum4(xdr, OP_CB_RECALL);
354 encode_stateid4(xdr, &dp->dl_stateid);
355
356 p = xdr_reserve_space(xdr, 4);
357 *p++ = xdr_zero; /* truncate */
300 358
301static int 359 encode_nfs_fh4(xdr, &dp->dl_fh);
302decode_cb_compound_hdr(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr){
303 __be32 *p;
304 u32 taglen;
305 360
306 READ_BUF(8); 361 hdr->nops++;
307 READ32(hdr->status);
308 /* We've got no use for the tag; ignore it: */
309 READ32(taglen);
310 READ_BUF(taglen + 4);
311 p += XDR_QUADLEN(taglen);
312 READ32(hdr->nops);
313 return 0;
314} 362}
315 363
316static int 364/*
317decode_cb_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected) 365 * CB_SEQUENCE4args
366 *
367 * struct CB_SEQUENCE4args {
368 * sessionid4 csa_sessionid;
369 * sequenceid4 csa_sequenceid;
370 * slotid4 csa_slotid;
371 * slotid4 csa_highest_slotid;
372 * bool csa_cachethis;
373 * referring_call_list4 csa_referring_call_lists<>;
374 * };
375 */
376static void encode_cb_sequence4args(struct xdr_stream *xdr,
377 const struct nfsd4_callback *cb,
378 struct nfs4_cb_compound_hdr *hdr)
318{ 379{
380 struct nfsd4_session *session = cb->cb_clp->cl_cb_session;
319 __be32 *p; 381 __be32 *p;
320 u32 op; 382
321 int32_t nfserr; 383 if (hdr->minorversion == 0)
322 384 return;
323 READ_BUF(8); 385
324 READ32(op); 386 encode_nfs_cb_opnum4(xdr, OP_CB_SEQUENCE);
325 if (op != expected) { 387 encode_sessionid4(xdr, session);
326 dprintk("NFSD: decode_cb_op_hdr: Callback server returned " 388
327 " operation %d but we issued a request for %d\n", 389 p = xdr_reserve_space(xdr, 4 + 4 + 4 + 4 + 4);
328 op, expected); 390 *p++ = cpu_to_be32(session->se_cb_seq_nr); /* csa_sequenceid */
329 return -EIO; 391 *p++ = xdr_zero; /* csa_slotid */
330 } 392 *p++ = xdr_zero; /* csa_highest_slotid */
331 READ32(nfserr); 393 *p++ = xdr_zero; /* csa_cachethis */
332 if (nfserr != NFS_OK) 394 xdr_encode_empty_array(p); /* csa_referring_call_lists */
333 return -nfs_cb_stat_to_errno(nfserr); 395
334 return 0; 396 hdr->nops++;
335} 397}
336 398
337/* 399/*
400 * CB_SEQUENCE4resok
401 *
402 * struct CB_SEQUENCE4resok {
403 * sessionid4 csr_sessionid;
404 * sequenceid4 csr_sequenceid;
405 * slotid4 csr_slotid;
406 * slotid4 csr_highest_slotid;
407 * slotid4 csr_target_highest_slotid;
408 * };
409 *
410 * union CB_SEQUENCE4res switch (nfsstat4 csr_status) {
411 * case NFS4_OK:
412 * CB_SEQUENCE4resok csr_resok4;
413 * default:
414 * void;
415 * };
416 *
338 * Our current back channel implmentation supports a single backchannel 417 * Our current back channel implmentation supports a single backchannel
339 * with a single slot. 418 * with a single slot.
340 */ 419 */
341static int 420static int decode_cb_sequence4resok(struct xdr_stream *xdr,
342decode_cb_sequence(struct xdr_stream *xdr, struct nfsd4_callback *cb, 421 struct nfsd4_callback *cb)
343 struct rpc_rqst *rqstp)
344{ 422{
345 struct nfsd4_session *ses = cb->cb_clp->cl_cb_session; 423 struct nfsd4_session *session = cb->cb_clp->cl_cb_session;
346 struct nfs4_sessionid id; 424 struct nfs4_sessionid id;
347 int status; 425 int status;
348 u32 dummy;
349 __be32 *p; 426 __be32 *p;
427 u32 dummy;
350 428
351 if (cb->cb_minorversion == 0) 429 status = -ESERVERFAULT;
352 return 0;
353
354 status = decode_cb_op_hdr(xdr, OP_CB_SEQUENCE);
355 if (status)
356 return status;
357 430
358 /* 431 /*
359 * If the server returns different values for sessionID, slotID or 432 * If the server returns different values for sessionID, slotID or
360 * sequence number, the server is looney tunes. 433 * sequence number, the server is looney tunes.
361 */ 434 */
362 status = -ESERVERFAULT; 435 p = xdr_inline_decode(xdr, NFS4_MAX_SESSIONID_LEN + 4 + 4);
363 436 if (unlikely(p == NULL))
364 READ_BUF(NFS4_MAX_SESSIONID_LEN + 16); 437 goto out_overflow;
365 memcpy(id.data, p, NFS4_MAX_SESSIONID_LEN); 438 memcpy(id.data, p, NFS4_MAX_SESSIONID_LEN);
366 p += XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN); 439 if (memcmp(id.data, session->se_sessionid.data,
367 if (memcmp(id.data, ses->se_sessionid.data, NFS4_MAX_SESSIONID_LEN)) { 440 NFS4_MAX_SESSIONID_LEN) != 0) {
368 dprintk("%s Invalid session id\n", __func__); 441 dprintk("NFS: %s Invalid session id\n", __func__);
369 goto out; 442 goto out;
370 } 443 }
371 READ32(dummy); 444 p += XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN);
372 if (dummy != ses->se_cb_seq_nr) { 445
373 dprintk("%s Invalid sequence number\n", __func__); 446 dummy = be32_to_cpup(p++);
447 if (dummy != session->se_cb_seq_nr) {
448 dprintk("NFS: %s Invalid sequence number\n", __func__);
374 goto out; 449 goto out;
375 } 450 }
376 READ32(dummy); /* slotid must be 0 */ 451
452 dummy = be32_to_cpup(p++);
377 if (dummy != 0) { 453 if (dummy != 0) {
378 dprintk("%s Invalid slotid\n", __func__); 454 dprintk("NFS: %s Invalid slotid\n", __func__);
379 goto out; 455 goto out;
380 } 456 }
381 /* FIXME: process highest slotid and target highest slotid */ 457
458 /*
459 * FIXME: process highest slotid and target highest slotid
460 */
382 status = 0; 461 status = 0;
383out: 462out:
384 return status; 463 return status;
464out_overflow:
465 print_overflow_msg(__func__, xdr);
466 return -EIO;
385} 467}
386 468
469static int decode_cb_sequence4res(struct xdr_stream *xdr,
470 struct nfsd4_callback *cb)
471{
472 enum nfsstat4 nfserr;
473 int status;
474
475 if (cb->cb_minorversion == 0)
476 return 0;
387 477
388static int 478 status = decode_cb_op_status(xdr, OP_CB_SEQUENCE, &nfserr);
389nfs4_xdr_dec_cb_null(struct rpc_rqst *req, __be32 *p) 479 if (unlikely(status))
480 goto out;
481 if (unlikely(nfserr != NFS4_OK))
482 goto out_default;
483 status = decode_cb_sequence4resok(xdr, cb);
484out:
485 return status;
486out_default:
487 return nfs_cb_stat_to_errno(status);
488}
489
490/*
491 * NFSv4.0 and NFSv4.1 XDR encode functions
492 *
493 * NFSv4.0 callback argument types are defined in section 15 of RFC
494 * 3530: "Network File System (NFS) version 4 Protocol" and section 20
495 * of RFC 5661: "Network File System (NFS) Version 4 Minor Version 1
496 * Protocol".
497 */
498
499/*
500 * NB: Without this zero space reservation, callbacks over krb5p fail
501 */
502static void nfs4_xdr_enc_cb_null(struct rpc_rqst *req, struct xdr_stream *xdr,
503 void *__unused)
504{
505 xdr_reserve_space(xdr, 0);
506}
507
508/*
509 * 20.2. Operation 4: CB_RECALL - Recall a Delegation
510 */
511static void nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, struct xdr_stream *xdr,
512 const struct nfsd4_callback *cb)
513{
514 const struct nfs4_delegation *args = cb->cb_op;
515 struct nfs4_cb_compound_hdr hdr = {
516 .ident = cb->cb_clp->cl_cb_ident,
517 .minorversion = cb->cb_minorversion,
518 };
519
520 encode_cb_compound4args(xdr, &hdr);
521 encode_cb_sequence4args(xdr, cb, &hdr);
522 encode_cb_recall4args(xdr, args, &hdr);
523 encode_cb_nops(&hdr);
524}
525
526
527/*
528 * NFSv4.0 and NFSv4.1 XDR decode functions
529 *
530 * NFSv4.0 callback result types are defined in section 15 of RFC
531 * 3530: "Network File System (NFS) version 4 Protocol" and section 20
532 * of RFC 5661: "Network File System (NFS) Version 4 Minor Version 1
533 * Protocol".
534 */
535
536static int nfs4_xdr_dec_cb_null(struct rpc_rqst *req, struct xdr_stream *xdr,
537 void *__unused)
390{ 538{
391 return 0; 539 return 0;
392} 540}
393 541
394static int 542/*
395nfs4_xdr_dec_cb_recall(struct rpc_rqst *rqstp, __be32 *p, 543 * 20.2. Operation 4: CB_RECALL - Recall a Delegation
396 struct nfsd4_callback *cb) 544 */
545static int nfs4_xdr_dec_cb_recall(struct rpc_rqst *rqstp,
546 struct xdr_stream *xdr,
547 struct nfsd4_callback *cb)
397{ 548{
398 struct xdr_stream xdr;
399 struct nfs4_cb_compound_hdr hdr; 549 struct nfs4_cb_compound_hdr hdr;
550 enum nfsstat4 nfserr;
400 int status; 551 int status;
401 552
402 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 553 status = decode_cb_compound4res(xdr, &hdr);
403 status = decode_cb_compound_hdr(&xdr, &hdr); 554 if (unlikely(status))
404 if (status)
405 goto out; 555 goto out;
406 if (cb) { 556
407 status = decode_cb_sequence(&xdr, cb, rqstp); 557 if (cb != NULL) {
408 if (status) 558 status = decode_cb_sequence4res(xdr, cb);
559 if (unlikely(status))
409 goto out; 560 goto out;
410 } 561 }
411 status = decode_cb_op_hdr(&xdr, OP_CB_RECALL); 562
563 status = decode_cb_op_status(xdr, OP_CB_RECALL, &nfserr);
564 if (unlikely(status))
565 goto out;
566 if (unlikely(nfserr != NFS4_OK))
567 goto out_default;
412out: 568out:
413 return status; 569 return status;
570out_default:
571 return nfs_cb_stat_to_errno(status);
414} 572}
415 573
416/* 574/*
417 * RPC procedure tables 575 * RPC procedure tables
418 */ 576 */
419#define PROC(proc, call, argtype, restype) \ 577#define PROC(proc, call, argtype, restype) \
420[NFSPROC4_CLNT_##proc] = { \ 578[NFSPROC4_CLNT_##proc] = { \
421 .p_proc = NFSPROC4_CB_##call, \ 579 .p_proc = NFSPROC4_CB_##call, \
422 .p_encode = (kxdrproc_t) nfs4_xdr_##argtype, \ 580 .p_encode = (kxdreproc_t)nfs4_xdr_enc_##argtype, \
423 .p_decode = (kxdrproc_t) nfs4_xdr_##restype, \ 581 .p_decode = (kxdrdproc_t)nfs4_xdr_dec_##restype, \
424 .p_arglen = NFS4_##argtype##_sz, \ 582 .p_arglen = NFS4_enc_##argtype##_sz, \
425 .p_replen = NFS4_##restype##_sz, \ 583 .p_replen = NFS4_dec_##restype##_sz, \
426 .p_statidx = NFSPROC4_CB_##call, \ 584 .p_statidx = NFSPROC4_CB_##call, \
427 .p_name = #proc, \ 585 .p_name = #proc, \
428} 586}
429 587
430static struct rpc_procinfo nfs4_cb_procedures[] = { 588static struct rpc_procinfo nfs4_cb_procedures[] = {
431 PROC(CB_NULL, NULL, enc_cb_null, dec_cb_null), 589 PROC(CB_NULL, NULL, cb_null, cb_null),
432 PROC(CB_RECALL, COMPOUND, enc_cb_recall, dec_cb_recall), 590 PROC(CB_RECALL, COMPOUND, cb_recall, cb_recall),
433}; 591};
434 592
435static struct rpc_version nfs_cb_version4 = { 593static struct rpc_version nfs_cb_version4 = {
436/* 594/*
437 * Note on the callback rpc program version number: despite language in rfc 595 * Note on the callback rpc program version number: despite language in rfc
438 * 5661 section 18.36.3 requiring servers to use 4 in this field, the 596 * 5661 section 18.36.3 requiring servers to use 4 in this field, the
@@ -440,29 +598,29 @@ static struct rpc_version nfs_cb_version4 = {
440 * in practice that appears to be what implementations use. The section 598 * in practice that appears to be what implementations use. The section
441 * 18.36.3 language is expected to be fixed in an erratum. 599 * 18.36.3 language is expected to be fixed in an erratum.
442 */ 600 */
443 .number = 1, 601 .number = 1,
444 .nrprocs = ARRAY_SIZE(nfs4_cb_procedures), 602 .nrprocs = ARRAY_SIZE(nfs4_cb_procedures),
445 .procs = nfs4_cb_procedures 603 .procs = nfs4_cb_procedures
446}; 604};
447 605
448static struct rpc_version * nfs_cb_version[] = { 606static struct rpc_version *nfs_cb_version[] = {
449 &nfs_cb_version4, 607 &nfs_cb_version4,
450}; 608};
451 609
452static struct rpc_program cb_program; 610static struct rpc_program cb_program;
453 611
454static struct rpc_stat cb_stats = { 612static struct rpc_stat cb_stats = {
455 .program = &cb_program 613 .program = &cb_program
456}; 614};
457 615
458#define NFS4_CALLBACK 0x40000000 616#define NFS4_CALLBACK 0x40000000
459static struct rpc_program cb_program = { 617static struct rpc_program cb_program = {
460 .name = "nfs4_cb", 618 .name = "nfs4_cb",
461 .number = NFS4_CALLBACK, 619 .number = NFS4_CALLBACK,
462 .nrvers = ARRAY_SIZE(nfs_cb_version), 620 .nrvers = ARRAY_SIZE(nfs_cb_version),
463 .version = nfs_cb_version, 621 .version = nfs_cb_version,
464 .stats = &cb_stats, 622 .stats = &cb_stats,
465 .pipe_dir_name = "/nfsd4_cb", 623 .pipe_dir_name = "/nfsd4_cb",
466}; 624};
467 625
468static int max_cb_time(void) 626static int max_cb_time(void)
@@ -470,10 +628,8 @@ static int max_cb_time(void)
470 return max(nfsd4_lease/10, (time_t)1) * HZ; 628 return max(nfsd4_lease/10, (time_t)1) * HZ;
471} 629}
472 630
473/* Reference counting, callback cleanup, etc., all look racy as heck.
474 * And why is cl_cb_set an atomic? */
475 631
476int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *conn) 632static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *conn, struct nfsd4_session *ses)
477{ 633{
478 struct rpc_timeout timeparms = { 634 struct rpc_timeout timeparms = {
479 .to_initval = max_cb_time(), 635 .to_initval = max_cb_time(),
@@ -483,6 +639,7 @@ int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *conn)
483 .net = &init_net, 639 .net = &init_net,
484 .address = (struct sockaddr *) &conn->cb_addr, 640 .address = (struct sockaddr *) &conn->cb_addr,
485 .addrsize = conn->cb_addrlen, 641 .addrsize = conn->cb_addrlen,
642 .saddress = (struct sockaddr *) &conn->cb_saddr,
486 .timeout = &timeparms, 643 .timeout = &timeparms,
487 .program = &cb_program, 644 .program = &cb_program,
488 .version = 0, 645 .version = 0,
@@ -499,6 +656,10 @@ int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *conn)
499 args.protocol = XPRT_TRANSPORT_TCP; 656 args.protocol = XPRT_TRANSPORT_TCP;
500 clp->cl_cb_ident = conn->cb_ident; 657 clp->cl_cb_ident = conn->cb_ident;
501 } else { 658 } else {
659 if (!conn->cb_xprt)
660 return -EINVAL;
661 clp->cl_cb_conn.cb_xprt = conn->cb_xprt;
662 clp->cl_cb_session = ses;
502 args.bc_xprt = conn->cb_xprt; 663 args.bc_xprt = conn->cb_xprt;
503 args.prognumber = clp->cl_cb_session->se_cb_prog; 664 args.prognumber = clp->cl_cb_session->se_cb_prog;
504 args.protocol = XPRT_TRANSPORT_BC_TCP; 665 args.protocol = XPRT_TRANSPORT_BC_TCP;
@@ -521,14 +682,20 @@ static void warn_no_callback_path(struct nfs4_client *clp, int reason)
521 (int)clp->cl_name.len, clp->cl_name.data, reason); 682 (int)clp->cl_name.len, clp->cl_name.data, reason);
522} 683}
523 684
685static void nfsd4_mark_cb_down(struct nfs4_client *clp, int reason)
686{
687 clp->cl_cb_state = NFSD4_CB_DOWN;
688 warn_no_callback_path(clp, reason);
689}
690
524static void nfsd4_cb_probe_done(struct rpc_task *task, void *calldata) 691static void nfsd4_cb_probe_done(struct rpc_task *task, void *calldata)
525{ 692{
526 struct nfs4_client *clp = container_of(calldata, struct nfs4_client, cl_cb_null); 693 struct nfs4_client *clp = container_of(calldata, struct nfs4_client, cl_cb_null);
527 694
528 if (task->tk_status) 695 if (task->tk_status)
529 warn_no_callback_path(clp, task->tk_status); 696 nfsd4_mark_cb_down(clp, task->tk_status);
530 else 697 else
531 atomic_set(&clp->cl_cb_set, 1); 698 clp->cl_cb_state = NFSD4_CB_UP;
532} 699}
533 700
534static const struct rpc_call_ops nfsd4_cb_probe_ops = { 701static const struct rpc_call_ops nfsd4_cb_probe_ops = {
@@ -551,6 +718,11 @@ int set_callback_cred(void)
551 718
552static struct workqueue_struct *callback_wq; 719static struct workqueue_struct *callback_wq;
553 720
721static void run_nfsd4_cb(struct nfsd4_callback *cb)
722{
723 queue_work(callback_wq, &cb->cb_work);
724}
725
554static void do_probe_callback(struct nfs4_client *clp) 726static void do_probe_callback(struct nfs4_client *clp)
555{ 727{
556 struct nfsd4_callback *cb = &clp->cl_cb_null; 728 struct nfsd4_callback *cb = &clp->cl_cb_null;
@@ -565,7 +737,7 @@ static void do_probe_callback(struct nfs4_client *clp)
565 737
566 cb->cb_ops = &nfsd4_cb_probe_ops; 738 cb->cb_ops = &nfsd4_cb_probe_ops;
567 739
568 queue_work(callback_wq, &cb->cb_work); 740 run_nfsd4_cb(cb);
569} 741}
570 742
571/* 743/*
@@ -574,14 +746,21 @@ static void do_probe_callback(struct nfs4_client *clp)
574 */ 746 */
575void nfsd4_probe_callback(struct nfs4_client *clp) 747void nfsd4_probe_callback(struct nfs4_client *clp)
576{ 748{
749 /* XXX: atomicity? Also, should we be using cl_cb_flags? */
750 clp->cl_cb_state = NFSD4_CB_UNKNOWN;
577 set_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_cb_flags); 751 set_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_cb_flags);
578 do_probe_callback(clp); 752 do_probe_callback(clp);
579} 753}
580 754
581void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *conn) 755void nfsd4_probe_callback_sync(struct nfs4_client *clp)
582{ 756{
583 BUG_ON(atomic_read(&clp->cl_cb_set)); 757 nfsd4_probe_callback(clp);
758 flush_workqueue(callback_wq);
759}
584 760
761void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *conn)
762{
763 clp->cl_cb_state = NFSD4_CB_UNKNOWN;
585 spin_lock(&clp->cl_lock); 764 spin_lock(&clp->cl_lock);
586 memcpy(&clp->cl_cb_conn, conn, sizeof(struct nfs4_cb_conn)); 765 memcpy(&clp->cl_cb_conn, conn, sizeof(struct nfs4_cb_conn));
587 spin_unlock(&clp->cl_lock); 766 spin_unlock(&clp->cl_lock);
@@ -592,24 +771,14 @@ void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *conn)
592 * If the slot is available, then mark it busy. Otherwise, set the 771 * If the slot is available, then mark it busy. Otherwise, set the
593 * thread for sleeping on the callback RPC wait queue. 772 * thread for sleeping on the callback RPC wait queue.
594 */ 773 */
595static int nfsd41_cb_setup_sequence(struct nfs4_client *clp, 774static bool nfsd41_cb_get_slot(struct nfs4_client *clp, struct rpc_task *task)
596 struct rpc_task *task)
597{ 775{
598 u32 *ptr = (u32 *)clp->cl_cb_session->se_sessionid.data;
599 int status = 0;
600
601 dprintk("%s: %u:%u:%u:%u\n", __func__,
602 ptr[0], ptr[1], ptr[2], ptr[3]);
603
604 if (test_and_set_bit(0, &clp->cl_cb_slot_busy) != 0) { 776 if (test_and_set_bit(0, &clp->cl_cb_slot_busy) != 0) {
605 rpc_sleep_on(&clp->cl_cb_waitq, task, NULL); 777 rpc_sleep_on(&clp->cl_cb_waitq, task, NULL);
606 dprintk("%s slot is busy\n", __func__); 778 dprintk("%s slot is busy\n", __func__);
607 status = -EAGAIN; 779 return false;
608 goto out;
609 } 780 }
610out: 781 return true;
611 dprintk("%s status=%d\n", __func__, status);
612 return status;
613} 782}
614 783
615/* 784/*
@@ -622,20 +791,19 @@ static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata)
622 struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall); 791 struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall);
623 struct nfs4_client *clp = dp->dl_client; 792 struct nfs4_client *clp = dp->dl_client;
624 u32 minorversion = clp->cl_minorversion; 793 u32 minorversion = clp->cl_minorversion;
625 int status = 0;
626 794
627 cb->cb_minorversion = minorversion; 795 cb->cb_minorversion = minorversion;
628 if (minorversion) { 796 if (minorversion) {
629 status = nfsd41_cb_setup_sequence(clp, task); 797 if (!nfsd41_cb_get_slot(clp, task))
630 if (status) {
631 if (status != -EAGAIN) {
632 /* terminate rpc task */
633 task->tk_status = status;
634 task->tk_action = NULL;
635 }
636 return; 798 return;
637 }
638 } 799 }
800 spin_lock(&clp->cl_lock);
801 if (list_empty(&cb->cb_per_client)) {
802 /* This is the first call, not a restart */
803 cb->cb_done = false;
804 list_add(&cb->cb_per_client, &clp->cl_callbacks);
805 }
806 spin_unlock(&clp->cl_lock);
639 rpc_call_start(task); 807 rpc_call_start(task);
640} 808}
641 809
@@ -671,15 +839,18 @@ static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata)
671 839
672 nfsd4_cb_done(task, calldata); 840 nfsd4_cb_done(task, calldata);
673 841
674 if (current_rpc_client == NULL) { 842 if (current_rpc_client != task->tk_client) {
675 /* We're shutting down; give up. */ 843 /* We're shutting down or changing cl_cb_client; leave
676 /* XXX: err, or is it ok just to fall through 844 * it to nfsd4_process_cb_update to restart the call if
677 * and rpc_restart_call? */ 845 * necessary. */
678 return; 846 return;
679 } 847 }
680 848
849 if (cb->cb_done)
850 return;
681 switch (task->tk_status) { 851 switch (task->tk_status) {
682 case 0: 852 case 0:
853 cb->cb_done = true;
683 return; 854 return;
684 case -EBADHANDLE: 855 case -EBADHANDLE:
685 case -NFS4ERR_BAD_STATEID: 856 case -NFS4ERR_BAD_STATEID:
@@ -688,32 +859,30 @@ static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata)
688 break; 859 break;
689 default: 860 default:
690 /* Network partition? */ 861 /* Network partition? */
691 atomic_set(&clp->cl_cb_set, 0); 862 nfsd4_mark_cb_down(clp, task->tk_status);
692 warn_no_callback_path(clp, task->tk_status);
693 if (current_rpc_client != task->tk_client) {
694 /* queue a callback on the new connection: */
695 atomic_inc(&dp->dl_count);
696 nfsd4_cb_recall(dp);
697 return;
698 }
699 } 863 }
700 if (dp->dl_retries--) { 864 if (dp->dl_retries--) {
701 rpc_delay(task, 2*HZ); 865 rpc_delay(task, 2*HZ);
702 task->tk_status = 0; 866 task->tk_status = 0;
703 rpc_restart_call_prepare(task); 867 rpc_restart_call_prepare(task);
704 return; 868 return;
705 } else {
706 atomic_set(&clp->cl_cb_set, 0);
707 warn_no_callback_path(clp, task->tk_status);
708 } 869 }
870 nfsd4_mark_cb_down(clp, task->tk_status);
871 cb->cb_done = true;
709} 872}
710 873
711static void nfsd4_cb_recall_release(void *calldata) 874static void nfsd4_cb_recall_release(void *calldata)
712{ 875{
713 struct nfsd4_callback *cb = calldata; 876 struct nfsd4_callback *cb = calldata;
877 struct nfs4_client *clp = cb->cb_clp;
714 struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall); 878 struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall);
715 879
716 nfs4_put_delegation(dp); 880 if (cb->cb_done) {
881 spin_lock(&clp->cl_lock);
882 list_del(&cb->cb_per_client);
883 spin_unlock(&clp->cl_lock);
884 nfs4_put_delegation(dp);
885 }
717} 886}
718 887
719static const struct rpc_call_ops nfsd4_cb_recall_ops = { 888static const struct rpc_call_ops nfsd4_cb_recall_ops = {
@@ -748,16 +917,33 @@ void nfsd4_shutdown_callback(struct nfs4_client *clp)
748 flush_workqueue(callback_wq); 917 flush_workqueue(callback_wq);
749} 918}
750 919
751void nfsd4_release_cb(struct nfsd4_callback *cb) 920static void nfsd4_release_cb(struct nfsd4_callback *cb)
752{ 921{
753 if (cb->cb_ops->rpc_release) 922 if (cb->cb_ops->rpc_release)
754 cb->cb_ops->rpc_release(cb); 923 cb->cb_ops->rpc_release(cb);
755} 924}
756 925
757void nfsd4_process_cb_update(struct nfsd4_callback *cb) 926/* requires cl_lock: */
927static struct nfsd4_conn * __nfsd4_find_backchannel(struct nfs4_client *clp)
928{
929 struct nfsd4_session *s;
930 struct nfsd4_conn *c;
931
932 list_for_each_entry(s, &clp->cl_sessions, se_perclnt) {
933 list_for_each_entry(c, &s->se_conns, cn_persession) {
934 if (c->cn_flags & NFS4_CDFC4_BACK)
935 return c;
936 }
937 }
938 return NULL;
939}
940
941static void nfsd4_process_cb_update(struct nfsd4_callback *cb)
758{ 942{
759 struct nfs4_cb_conn conn; 943 struct nfs4_cb_conn conn;
760 struct nfs4_client *clp = cb->cb_clp; 944 struct nfs4_client *clp = cb->cb_clp;
945 struct nfsd4_session *ses = NULL;
946 struct nfsd4_conn *c;
761 int err; 947 int err;
762 948
763 /* 949 /*
@@ -768,6 +954,10 @@ void nfsd4_process_cb_update(struct nfsd4_callback *cb)
768 rpc_shutdown_client(clp->cl_cb_client); 954 rpc_shutdown_client(clp->cl_cb_client);
769 clp->cl_cb_client = NULL; 955 clp->cl_cb_client = NULL;
770 } 956 }
957 if (clp->cl_cb_conn.cb_xprt) {
958 svc_xprt_put(clp->cl_cb_conn.cb_xprt);
959 clp->cl_cb_conn.cb_xprt = NULL;
960 }
771 if (test_bit(NFSD4_CLIENT_KILL, &clp->cl_cb_flags)) 961 if (test_bit(NFSD4_CLIENT_KILL, &clp->cl_cb_flags))
772 return; 962 return;
773 spin_lock(&clp->cl_lock); 963 spin_lock(&clp->cl_lock);
@@ -778,11 +968,22 @@ void nfsd4_process_cb_update(struct nfsd4_callback *cb)
778 BUG_ON(!clp->cl_cb_flags); 968 BUG_ON(!clp->cl_cb_flags);
779 clear_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_cb_flags); 969 clear_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_cb_flags);
780 memcpy(&conn, &cb->cb_clp->cl_cb_conn, sizeof(struct nfs4_cb_conn)); 970 memcpy(&conn, &cb->cb_clp->cl_cb_conn, sizeof(struct nfs4_cb_conn));
971 c = __nfsd4_find_backchannel(clp);
972 if (c) {
973 svc_xprt_get(c->cn_xprt);
974 conn.cb_xprt = c->cn_xprt;
975 ses = c->cn_session;
976 }
781 spin_unlock(&clp->cl_lock); 977 spin_unlock(&clp->cl_lock);
782 978
783 err = setup_callback_client(clp, &conn); 979 err = setup_callback_client(clp, &conn, ses);
784 if (err) 980 if (err) {
785 warn_no_callback_path(clp, err); 981 warn_no_callback_path(clp, err);
982 return;
983 }
984 /* Yay, the callback channel's back! Restart any callbacks: */
985 list_for_each_entry(cb, &clp->cl_callbacks, cb_per_client)
986 run_nfsd4_cb(cb);
786} 987}
787 988
788void nfsd4_do_callback_rpc(struct work_struct *w) 989void nfsd4_do_callback_rpc(struct work_struct *w)
@@ -807,10 +1008,11 @@ void nfsd4_do_callback_rpc(struct work_struct *w)
807void nfsd4_cb_recall(struct nfs4_delegation *dp) 1008void nfsd4_cb_recall(struct nfs4_delegation *dp)
808{ 1009{
809 struct nfsd4_callback *cb = &dp->dl_recall; 1010 struct nfsd4_callback *cb = &dp->dl_recall;
1011 struct nfs4_client *clp = dp->dl_client;
810 1012
811 dp->dl_retries = 1; 1013 dp->dl_retries = 1;
812 cb->cb_op = dp; 1014 cb->cb_op = dp;
813 cb->cb_clp = dp->dl_client; 1015 cb->cb_clp = clp;
814 cb->cb_msg.rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_RECALL]; 1016 cb->cb_msg.rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_RECALL];
815 cb->cb_msg.rpc_argp = cb; 1017 cb->cb_msg.rpc_argp = cb;
816 cb->cb_msg.rpc_resp = cb; 1018 cb->cb_msg.rpc_resp = cb;
@@ -819,5 +1021,8 @@ void nfsd4_cb_recall(struct nfs4_delegation *dp)
819 cb->cb_ops = &nfsd4_cb_recall_ops; 1021 cb->cb_ops = &nfsd4_cb_recall_ops;
820 dp->dl_retries = 1; 1022 dp->dl_retries = 1;
821 1023
822 queue_work(callback_wq, &dp->dl_recall.cb_work); 1024 INIT_LIST_HEAD(&cb->cb_per_client);
1025 cb->cb_done = true;
1026
1027 run_nfsd4_cb(&dp->dl_recall);
823} 1028}
diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c
index f0695e815f0..6d2c397d458 100644
--- a/fs/nfsd/nfs4idmap.c
+++ b/fs/nfsd/nfs4idmap.c
@@ -33,10 +33,11 @@
33 */ 33 */
34 34
35#include <linux/module.h> 35#include <linux/module.h>
36#include <linux/nfsd_idmap.h>
37#include <linux/seq_file.h> 36#include <linux/seq_file.h>
38#include <linux/sched.h> 37#include <linux/sched.h>
39#include <linux/slab.h> 38#include <linux/slab.h>
39#include "idmap.h"
40#include "nfsd.h"
40 41
41/* 42/*
42 * Cache entry 43 * Cache entry
@@ -514,7 +515,7 @@ rqst_authname(struct svc_rqst *rqstp)
514 return clp->name; 515 return clp->name;
515} 516}
516 517
517static int 518static __be32
518idmap_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen, 519idmap_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen,
519 uid_t *id) 520 uid_t *id)
520{ 521{
@@ -524,15 +525,15 @@ idmap_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen
524 int ret; 525 int ret;
525 526
526 if (namelen + 1 > sizeof(key.name)) 527 if (namelen + 1 > sizeof(key.name))
527 return -EINVAL; 528 return nfserr_badowner;
528 memcpy(key.name, name, namelen); 529 memcpy(key.name, name, namelen);
529 key.name[namelen] = '\0'; 530 key.name[namelen] = '\0';
530 strlcpy(key.authname, rqst_authname(rqstp), sizeof(key.authname)); 531 strlcpy(key.authname, rqst_authname(rqstp), sizeof(key.authname));
531 ret = idmap_lookup(rqstp, nametoid_lookup, &key, &nametoid_cache, &item); 532 ret = idmap_lookup(rqstp, nametoid_lookup, &key, &nametoid_cache, &item);
532 if (ret == -ENOENT) 533 if (ret == -ENOENT)
533 ret = -ESRCH; /* nfserr_badname */ 534 return nfserr_badowner;
534 if (ret) 535 if (ret)
535 return ret; 536 return nfserrno(ret);
536 *id = item->id; 537 *id = item->id;
537 cache_put(&item->h, &nametoid_cache); 538 cache_put(&item->h, &nametoid_cache);
538 return 0; 539 return 0;
@@ -560,14 +561,14 @@ idmap_id_to_name(struct svc_rqst *rqstp, int type, uid_t id, char *name)
560 return ret; 561 return ret;
561} 562}
562 563
563int 564__be32
564nfsd_map_name_to_uid(struct svc_rqst *rqstp, const char *name, size_t namelen, 565nfsd_map_name_to_uid(struct svc_rqst *rqstp, const char *name, size_t namelen,
565 __u32 *id) 566 __u32 *id)
566{ 567{
567 return idmap_name_to_id(rqstp, IDMAP_TYPE_USER, name, namelen, id); 568 return idmap_name_to_id(rqstp, IDMAP_TYPE_USER, name, namelen, id);
568} 569}
569 570
570int 571__be32
571nfsd_map_name_to_gid(struct svc_rqst *rqstp, const char *name, size_t namelen, 572nfsd_map_name_to_gid(struct svc_rqst *rqstp, const char *name, size_t namelen,
572 __u32 *id) 573 __u32 *id)
573{ 574{
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 0cdfd022bb7..db52546143d 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -604,9 +604,7 @@ nfsd4_link(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
604 return status; 604 return status;
605} 605}
606 606
607static __be32 607static __be32 nfsd4_do_lookupp(struct svc_rqst *rqstp, struct svc_fh *fh)
608nfsd4_lookupp(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
609 void *arg)
610{ 608{
611 struct svc_fh tmp_fh; 609 struct svc_fh tmp_fh;
612 __be32 ret; 610 __be32 ret;
@@ -615,13 +613,19 @@ nfsd4_lookupp(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
615 ret = exp_pseudoroot(rqstp, &tmp_fh); 613 ret = exp_pseudoroot(rqstp, &tmp_fh);
616 if (ret) 614 if (ret)
617 return ret; 615 return ret;
618 if (tmp_fh.fh_dentry == cstate->current_fh.fh_dentry) { 616 if (tmp_fh.fh_dentry == fh->fh_dentry) {
619 fh_put(&tmp_fh); 617 fh_put(&tmp_fh);
620 return nfserr_noent; 618 return nfserr_noent;
621 } 619 }
622 fh_put(&tmp_fh); 620 fh_put(&tmp_fh);
623 return nfsd_lookup(rqstp, &cstate->current_fh, 621 return nfsd_lookup(rqstp, fh, "..", 2, fh);
624 "..", 2, &cstate->current_fh); 622}
623
624static __be32
625nfsd4_lookupp(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
626 void *arg)
627{
628 return nfsd4_do_lookupp(rqstp, &cstate->current_fh);
625} 629}
626 630
627static __be32 631static __be32
@@ -769,10 +773,36 @@ nfsd4_secinfo(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
769 } else 773 } else
770 secinfo->si_exp = exp; 774 secinfo->si_exp = exp;
771 dput(dentry); 775 dput(dentry);
776 if (cstate->minorversion)
777 /* See rfc 5661 section 2.6.3.1.1.8 */
778 fh_put(&cstate->current_fh);
772 return err; 779 return err;
773} 780}
774 781
775static __be32 782static __be32
783nfsd4_secinfo_no_name(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
784 struct nfsd4_secinfo_no_name *sin)
785{
786 __be32 err;
787
788 switch (sin->sin_style) {
789 case NFS4_SECINFO_STYLE4_CURRENT_FH:
790 break;
791 case NFS4_SECINFO_STYLE4_PARENT:
792 err = nfsd4_do_lookupp(rqstp, &cstate->current_fh);
793 if (err)
794 return err;
795 break;
796 default:
797 return nfserr_inval;
798 }
799 exp_get(cstate->current_fh.fh_export);
800 sin->sin_exp = cstate->current_fh.fh_export;
801 fh_put(&cstate->current_fh);
802 return nfs_ok;
803}
804
805static __be32
776nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, 806nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
777 struct nfsd4_setattr *setattr) 807 struct nfsd4_setattr *setattr)
778{ 808{
@@ -974,8 +1004,8 @@ static const char *nfsd4_op_name(unsigned opnum);
974 * Also note, enforced elsewhere: 1004 * Also note, enforced elsewhere:
975 * - SEQUENCE other than as first op results in 1005 * - SEQUENCE other than as first op results in
976 * NFS4ERR_SEQUENCE_POS. (Enforced in nfsd4_sequence().) 1006 * NFS4ERR_SEQUENCE_POS. (Enforced in nfsd4_sequence().)
977 * - BIND_CONN_TO_SESSION must be the only op in its compound 1007 * - BIND_CONN_TO_SESSION must be the only op in its compound.
978 * (Will be enforced in nfsd4_bind_conn_to_session().) 1008 * (Enforced in nfsd4_bind_conn_to_session().)
979 * - DESTROY_SESSION must be the final operation in a compound, if 1009 * - DESTROY_SESSION must be the final operation in a compound, if
980 * sessionid's in SEQUENCE and DESTROY_SESSION are the same. 1010 * sessionid's in SEQUENCE and DESTROY_SESSION are the same.
981 * (Enforced in nfsd4_destroy_session().) 1011 * (Enforced in nfsd4_destroy_session().)
@@ -1126,10 +1156,6 @@ encode_op:
1126 1156
1127 nfsd4_increment_op_stats(op->opnum); 1157 nfsd4_increment_op_stats(op->opnum);
1128 } 1158 }
1129 if (!rqstp->rq_usedeferral && status == nfserr_dropit) {
1130 dprintk("%s Dropit - send NFS4ERR_DELAY\n", __func__);
1131 status = nfserr_jukebox;
1132 }
1133 1159
1134 resp->cstate.status = status; 1160 resp->cstate.status = status;
1135 fh_put(&resp->cstate.current_fh); 1161 fh_put(&resp->cstate.current_fh);
@@ -1300,6 +1326,11 @@ static struct nfsd4_operation nfsd4_ops[] = {
1300 .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP, 1326 .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP,
1301 .op_name = "OP_EXCHANGE_ID", 1327 .op_name = "OP_EXCHANGE_ID",
1302 }, 1328 },
1329 [OP_BIND_CONN_TO_SESSION] = {
1330 .op_func = (nfsd4op_func)nfsd4_bind_conn_to_session,
1331 .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP,
1332 .op_name = "OP_BIND_CONN_TO_SESSION",
1333 },
1303 [OP_CREATE_SESSION] = { 1334 [OP_CREATE_SESSION] = {
1304 .op_func = (nfsd4op_func)nfsd4_create_session, 1335 .op_func = (nfsd4op_func)nfsd4_create_session,
1305 .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP, 1336 .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP,
@@ -1320,6 +1351,10 @@ static struct nfsd4_operation nfsd4_ops[] = {
1320 .op_flags = ALLOWED_WITHOUT_FH, 1351 .op_flags = ALLOWED_WITHOUT_FH,
1321 .op_name = "OP_RECLAIM_COMPLETE", 1352 .op_name = "OP_RECLAIM_COMPLETE",
1322 }, 1353 },
1354 [OP_SECINFO_NO_NAME] = {
1355 .op_func = (nfsd4op_func)nfsd4_secinfo_no_name,
1356 .op_name = "OP_SECINFO_NO_NAME",
1357 },
1323}; 1358};
1324 1359
1325static const char *nfsd4_op_name(unsigned opnum) 1360static const char *nfsd4_op_name(unsigned opnum)
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 7e26caab2a2..ffb59ef6f82 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -302,7 +302,6 @@ purge_old(struct dentry *parent, struct dentry *child)
302{ 302{
303 int status; 303 int status;
304 304
305 /* note: we currently use this path only for minorversion 0 */
306 if (nfs4_has_reclaimed_state(child->d_name.name, false)) 305 if (nfs4_has_reclaimed_state(child->d_name.name, false))
307 return 0; 306 return 0;
308 307
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index f1e5ec6b510..d98d0213285 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -230,7 +230,8 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_f
230 dp->dl_client = clp; 230 dp->dl_client = clp;
231 get_nfs4_file(fp); 231 get_nfs4_file(fp);
232 dp->dl_file = fp; 232 dp->dl_file = fp;
233 nfs4_file_get_access(fp, O_RDONLY); 233 dp->dl_vfs_file = find_readable_file(fp);
234 get_file(dp->dl_vfs_file);
234 dp->dl_flock = NULL; 235 dp->dl_flock = NULL;
235 dp->dl_type = type; 236 dp->dl_type = type;
236 dp->dl_stateid.si_boot = boot_time; 237 dp->dl_stateid.si_boot = boot_time;
@@ -252,6 +253,7 @@ nfs4_put_delegation(struct nfs4_delegation *dp)
252 if (atomic_dec_and_test(&dp->dl_count)) { 253 if (atomic_dec_and_test(&dp->dl_count)) {
253 dprintk("NFSD: freeing dp %p\n",dp); 254 dprintk("NFSD: freeing dp %p\n",dp);
254 put_nfs4_file(dp->dl_file); 255 put_nfs4_file(dp->dl_file);
256 fput(dp->dl_vfs_file);
255 kmem_cache_free(deleg_slab, dp); 257 kmem_cache_free(deleg_slab, dp);
256 num_delegations--; 258 num_delegations--;
257 } 259 }
@@ -265,12 +267,10 @@ nfs4_put_delegation(struct nfs4_delegation *dp)
265static void 267static void
266nfs4_close_delegation(struct nfs4_delegation *dp) 268nfs4_close_delegation(struct nfs4_delegation *dp)
267{ 269{
268 struct file *filp = find_readable_file(dp->dl_file);
269
270 dprintk("NFSD: close_delegation dp %p\n",dp); 270 dprintk("NFSD: close_delegation dp %p\n",dp);
271 /* XXX: do we even need this check?: */
271 if (dp->dl_flock) 272 if (dp->dl_flock)
272 vfs_setlease(filp, F_UNLCK, &dp->dl_flock); 273 vfs_setlease(dp->dl_vfs_file, F_UNLCK, &dp->dl_flock);
273 nfs4_file_put_access(dp->dl_file, O_RDONLY);
274} 274}
275 275
276/* Called under the state lock. */ 276/* Called under the state lock. */
@@ -642,6 +642,7 @@ static void nfsd4_conn_lost(struct svc_xpt_user *u)
642 free_conn(c); 642 free_conn(c);
643 } 643 }
644 spin_unlock(&clp->cl_lock); 644 spin_unlock(&clp->cl_lock);
645 nfsd4_probe_callback(clp);
645} 646}
646 647
647static struct nfsd4_conn *alloc_conn(struct svc_rqst *rqstp, u32 flags) 648static struct nfsd4_conn *alloc_conn(struct svc_rqst *rqstp, u32 flags)
@@ -673,27 +674,39 @@ static void nfsd4_hash_conn(struct nfsd4_conn *conn, struct nfsd4_session *ses)
673 spin_unlock(&clp->cl_lock); 674 spin_unlock(&clp->cl_lock);
674} 675}
675 676
676static void nfsd4_register_conn(struct nfsd4_conn *conn) 677static int nfsd4_register_conn(struct nfsd4_conn *conn)
677{ 678{
678 conn->cn_xpt_user.callback = nfsd4_conn_lost; 679 conn->cn_xpt_user.callback = nfsd4_conn_lost;
679 register_xpt_user(conn->cn_xprt, &conn->cn_xpt_user); 680 return register_xpt_user(conn->cn_xprt, &conn->cn_xpt_user);
680} 681}
681 682
682static __be32 nfsd4_new_conn(struct svc_rqst *rqstp, struct nfsd4_session *ses) 683static __be32 nfsd4_new_conn(struct svc_rqst *rqstp, struct nfsd4_session *ses, u32 dir)
683{ 684{
684 struct nfsd4_conn *conn; 685 struct nfsd4_conn *conn;
685 u32 flags = NFS4_CDFC4_FORE; 686 int ret;
686 687
687 if (ses->se_flags & SESSION4_BACK_CHAN) 688 conn = alloc_conn(rqstp, dir);
688 flags |= NFS4_CDFC4_BACK;
689 conn = alloc_conn(rqstp, flags);
690 if (!conn) 689 if (!conn)
691 return nfserr_jukebox; 690 return nfserr_jukebox;
692 nfsd4_hash_conn(conn, ses); 691 nfsd4_hash_conn(conn, ses);
693 nfsd4_register_conn(conn); 692 ret = nfsd4_register_conn(conn);
693 if (ret)
694 /* oops; xprt is already down: */
695 nfsd4_conn_lost(&conn->cn_xpt_user);
694 return nfs_ok; 696 return nfs_ok;
695} 697}
696 698
699static __be32 nfsd4_new_conn_from_crses(struct svc_rqst *rqstp, struct nfsd4_session *ses)
700{
701 u32 dir = NFS4_CDFC4_FORE;
702
703 if (ses->se_flags & SESSION4_BACK_CHAN)
704 dir |= NFS4_CDFC4_BACK;
705
706 return nfsd4_new_conn(rqstp, ses, dir);
707}
708
709/* must be called under client_lock */
697static void nfsd4_del_conns(struct nfsd4_session *s) 710static void nfsd4_del_conns(struct nfsd4_session *s)
698{ 711{
699 struct nfs4_client *clp = s->se_client; 712 struct nfs4_client *clp = s->se_client;
@@ -745,6 +758,8 @@ static struct nfsd4_session *alloc_init_session(struct svc_rqst *rqstp, struct n
745 */ 758 */
746 slotsize = nfsd4_sanitize_slot_size(fchan->maxresp_cached); 759 slotsize = nfsd4_sanitize_slot_size(fchan->maxresp_cached);
747 numslots = nfsd4_get_drc_mem(slotsize, fchan->maxreqs); 760 numslots = nfsd4_get_drc_mem(slotsize, fchan->maxreqs);
761 if (numslots < 1)
762 return NULL;
748 763
749 new = alloc_session(slotsize, numslots); 764 new = alloc_session(slotsize, numslots);
750 if (!new) { 765 if (!new) {
@@ -765,25 +780,30 @@ static struct nfsd4_session *alloc_init_session(struct svc_rqst *rqstp, struct n
765 idx = hash_sessionid(&new->se_sessionid); 780 idx = hash_sessionid(&new->se_sessionid);
766 spin_lock(&client_lock); 781 spin_lock(&client_lock);
767 list_add(&new->se_hash, &sessionid_hashtbl[idx]); 782 list_add(&new->se_hash, &sessionid_hashtbl[idx]);
783 spin_lock(&clp->cl_lock);
768 list_add(&new->se_perclnt, &clp->cl_sessions); 784 list_add(&new->se_perclnt, &clp->cl_sessions);
785 spin_unlock(&clp->cl_lock);
769 spin_unlock(&client_lock); 786 spin_unlock(&client_lock);
770 787
771 status = nfsd4_new_conn(rqstp, new); 788 status = nfsd4_new_conn_from_crses(rqstp, new);
772 /* whoops: benny points out, status is ignored! (err, or bogus) */ 789 /* whoops: benny points out, status is ignored! (err, or bogus) */
773 if (status) { 790 if (status) {
774 free_session(&new->se_ref); 791 free_session(&new->se_ref);
775 return NULL; 792 return NULL;
776 } 793 }
777 if (!clp->cl_cb_session && (cses->flags & SESSION4_BACK_CHAN)) { 794 if (cses->flags & SESSION4_BACK_CHAN) {
778 struct sockaddr *sa = svc_addr(rqstp); 795 struct sockaddr *sa = svc_addr(rqstp);
779 796 /*
780 clp->cl_cb_session = new; 797 * This is a little silly; with sessions there's no real
781 clp->cl_cb_conn.cb_xprt = rqstp->rq_xprt; 798 * use for the callback address. Use the peer address
782 svc_xprt_get(rqstp->rq_xprt); 799 * as a reasonable default for now, but consider fixing
800 * the rpc client not to require an address in the
801 * future:
802 */
783 rpc_copy_addr((struct sockaddr *)&clp->cl_cb_conn.cb_addr, sa); 803 rpc_copy_addr((struct sockaddr *)&clp->cl_cb_conn.cb_addr, sa);
784 clp->cl_cb_conn.cb_addrlen = svc_addr_len(sa); 804 clp->cl_cb_conn.cb_addrlen = svc_addr_len(sa);
785 nfsd4_probe_callback(clp);
786 } 805 }
806 nfsd4_probe_callback(clp);
787 return new; 807 return new;
788} 808}
789 809
@@ -813,7 +833,9 @@ static void
813unhash_session(struct nfsd4_session *ses) 833unhash_session(struct nfsd4_session *ses)
814{ 834{
815 list_del(&ses->se_hash); 835 list_del(&ses->se_hash);
836 spin_lock(&ses->se_client->cl_lock);
816 list_del(&ses->se_perclnt); 837 list_del(&ses->se_perclnt);
838 spin_unlock(&ses->se_client->cl_lock);
817} 839}
818 840
819/* must be called under the client_lock */ 841/* must be called under the client_lock */
@@ -919,8 +941,10 @@ unhash_client_locked(struct nfs4_client *clp)
919 941
920 mark_client_expired(clp); 942 mark_client_expired(clp);
921 list_del(&clp->cl_lru); 943 list_del(&clp->cl_lru);
944 spin_lock(&clp->cl_lock);
922 list_for_each_entry(ses, &clp->cl_sessions, se_perclnt) 945 list_for_each_entry(ses, &clp->cl_sessions, se_perclnt)
923 list_del_init(&ses->se_hash); 946 list_del_init(&ses->se_hash);
947 spin_unlock(&clp->cl_lock);
924} 948}
925 949
926static void 950static void
@@ -1047,12 +1071,13 @@ static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir,
1047 1071
1048 memcpy(clp->cl_recdir, recdir, HEXDIR_LEN); 1072 memcpy(clp->cl_recdir, recdir, HEXDIR_LEN);
1049 atomic_set(&clp->cl_refcount, 0); 1073 atomic_set(&clp->cl_refcount, 0);
1050 atomic_set(&clp->cl_cb_set, 0); 1074 clp->cl_cb_state = NFSD4_CB_UNKNOWN;
1051 INIT_LIST_HEAD(&clp->cl_idhash); 1075 INIT_LIST_HEAD(&clp->cl_idhash);
1052 INIT_LIST_HEAD(&clp->cl_strhash); 1076 INIT_LIST_HEAD(&clp->cl_strhash);
1053 INIT_LIST_HEAD(&clp->cl_openowners); 1077 INIT_LIST_HEAD(&clp->cl_openowners);
1054 INIT_LIST_HEAD(&clp->cl_delegations); 1078 INIT_LIST_HEAD(&clp->cl_delegations);
1055 INIT_LIST_HEAD(&clp->cl_lru); 1079 INIT_LIST_HEAD(&clp->cl_lru);
1080 INIT_LIST_HEAD(&clp->cl_callbacks);
1056 spin_lock_init(&clp->cl_lock); 1081 spin_lock_init(&clp->cl_lock);
1057 INIT_WORK(&clp->cl_cb_null.cb_work, nfsd4_do_callback_rpc); 1082 INIT_WORK(&clp->cl_cb_null.cb_work, nfsd4_do_callback_rpc);
1058 clp->cl_time = get_seconds(); 1083 clp->cl_time = get_seconds();
@@ -1128,54 +1153,55 @@ find_unconfirmed_client(clientid_t *clid)
1128 return NULL; 1153 return NULL;
1129} 1154}
1130 1155
1131/* 1156static bool clp_used_exchangeid(struct nfs4_client *clp)
1132 * Return 1 iff clp's clientid establishment method matches the use_exchange_id
1133 * parameter. Matching is based on the fact the at least one of the
1134 * EXCHGID4_FLAG_USE_{NON_PNFS,PNFS_MDS,PNFS_DS} flags must be set for v4.1
1135 *
1136 * FIXME: we need to unify the clientid namespaces for nfsv4.x
1137 * and correctly deal with client upgrade/downgrade in EXCHANGE_ID
1138 * and SET_CLIENTID{,_CONFIRM}
1139 */
1140static inline int
1141match_clientid_establishment(struct nfs4_client *clp, bool use_exchange_id)
1142{ 1157{
1143 bool has_exchange_flags = (clp->cl_exchange_flags != 0); 1158 return clp->cl_exchange_flags != 0;
1144 return use_exchange_id == has_exchange_flags; 1159}
1145}
1146 1160
1147static struct nfs4_client * 1161static struct nfs4_client *
1148find_confirmed_client_by_str(const char *dname, unsigned int hashval, 1162find_confirmed_client_by_str(const char *dname, unsigned int hashval)
1149 bool use_exchange_id)
1150{ 1163{
1151 struct nfs4_client *clp; 1164 struct nfs4_client *clp;
1152 1165
1153 list_for_each_entry(clp, &conf_str_hashtbl[hashval], cl_strhash) { 1166 list_for_each_entry(clp, &conf_str_hashtbl[hashval], cl_strhash) {
1154 if (same_name(clp->cl_recdir, dname) && 1167 if (same_name(clp->cl_recdir, dname))
1155 match_clientid_establishment(clp, use_exchange_id))
1156 return clp; 1168 return clp;
1157 } 1169 }
1158 return NULL; 1170 return NULL;
1159} 1171}
1160 1172
1161static struct nfs4_client * 1173static struct nfs4_client *
1162find_unconfirmed_client_by_str(const char *dname, unsigned int hashval, 1174find_unconfirmed_client_by_str(const char *dname, unsigned int hashval)
1163 bool use_exchange_id)
1164{ 1175{
1165 struct nfs4_client *clp; 1176 struct nfs4_client *clp;
1166 1177
1167 list_for_each_entry(clp, &unconf_str_hashtbl[hashval], cl_strhash) { 1178 list_for_each_entry(clp, &unconf_str_hashtbl[hashval], cl_strhash) {
1168 if (same_name(clp->cl_recdir, dname) && 1179 if (same_name(clp->cl_recdir, dname))
1169 match_clientid_establishment(clp, use_exchange_id))
1170 return clp; 1180 return clp;
1171 } 1181 }
1172 return NULL; 1182 return NULL;
1173} 1183}
1174 1184
1185static void rpc_svcaddr2sockaddr(struct sockaddr *sa, unsigned short family, union svc_addr_u *svcaddr)
1186{
1187 switch (family) {
1188 case AF_INET:
1189 ((struct sockaddr_in *)sa)->sin_family = AF_INET;
1190 ((struct sockaddr_in *)sa)->sin_addr = svcaddr->addr;
1191 return;
1192 case AF_INET6:
1193 ((struct sockaddr_in6 *)sa)->sin6_family = AF_INET6;
1194 ((struct sockaddr_in6 *)sa)->sin6_addr = svcaddr->addr6;
1195 return;
1196 }
1197}
1198
1175static void 1199static void
1176gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se, u32 scopeid) 1200gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se, struct svc_rqst *rqstp)
1177{ 1201{
1178 struct nfs4_cb_conn *conn = &clp->cl_cb_conn; 1202 struct nfs4_cb_conn *conn = &clp->cl_cb_conn;
1203 struct sockaddr *sa = svc_addr(rqstp);
1204 u32 scopeid = rpc_get_scope_id(sa);
1179 unsigned short expected_family; 1205 unsigned short expected_family;
1180 1206
1181 /* Currently, we only support tcp and tcp6 for the callback channel */ 1207 /* Currently, we only support tcp and tcp6 for the callback channel */
@@ -1201,6 +1227,7 @@ gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se, u32 scopeid)
1201 1227
1202 conn->cb_prog = se->se_callback_prog; 1228 conn->cb_prog = se->se_callback_prog;
1203 conn->cb_ident = se->se_callback_ident; 1229 conn->cb_ident = se->se_callback_ident;
1230 rpc_svcaddr2sockaddr((struct sockaddr *)&conn->cb_saddr, expected_family, &rqstp->rq_daddr);
1204 return; 1231 return;
1205out_err: 1232out_err:
1206 conn->cb_addr.ss_family = AF_UNSPEC; 1233 conn->cb_addr.ss_family = AF_UNSPEC;
@@ -1340,7 +1367,7 @@ nfsd4_exchange_id(struct svc_rqst *rqstp,
1340 case SP4_NONE: 1367 case SP4_NONE:
1341 break; 1368 break;
1342 case SP4_SSV: 1369 case SP4_SSV:
1343 return nfserr_encr_alg_unsupp; 1370 return nfserr_serverfault;
1344 default: 1371 default:
1345 BUG(); /* checked by xdr code */ 1372 BUG(); /* checked by xdr code */
1346 case SP4_MACH_CRED: 1373 case SP4_MACH_CRED:
@@ -1357,8 +1384,12 @@ nfsd4_exchange_id(struct svc_rqst *rqstp,
1357 nfs4_lock_state(); 1384 nfs4_lock_state();
1358 status = nfs_ok; 1385 status = nfs_ok;
1359 1386
1360 conf = find_confirmed_client_by_str(dname, strhashval, true); 1387 conf = find_confirmed_client_by_str(dname, strhashval);
1361 if (conf) { 1388 if (conf) {
1389 if (!clp_used_exchangeid(conf)) {
1390 status = nfserr_clid_inuse; /* XXX: ? */
1391 goto out;
1392 }
1362 if (!same_verf(&verf, &conf->cl_verifier)) { 1393 if (!same_verf(&verf, &conf->cl_verifier)) {
1363 /* 18.35.4 case 8 */ 1394 /* 18.35.4 case 8 */
1364 if (exid->flags & EXCHGID4_FLAG_UPD_CONFIRMED_REC_A) { 1395 if (exid->flags & EXCHGID4_FLAG_UPD_CONFIRMED_REC_A) {
@@ -1399,7 +1430,7 @@ nfsd4_exchange_id(struct svc_rqst *rqstp,
1399 goto out; 1430 goto out;
1400 } 1431 }
1401 1432
1402 unconf = find_unconfirmed_client_by_str(dname, strhashval, true); 1433 unconf = find_unconfirmed_client_by_str(dname, strhashval);
1403 if (unconf) { 1434 if (unconf) {
1404 /* 1435 /*
1405 * Possible retry or client restart. Per 18.35.4 case 4, 1436 * Possible retry or client restart. Per 18.35.4 case 4,
@@ -1556,6 +1587,8 @@ nfsd4_create_session(struct svc_rqst *rqstp,
1556 status = nfs_ok; 1587 status = nfs_ok;
1557 memcpy(cr_ses->sessionid.data, new->se_sessionid.data, 1588 memcpy(cr_ses->sessionid.data, new->se_sessionid.data,
1558 NFS4_MAX_SESSIONID_LEN); 1589 NFS4_MAX_SESSIONID_LEN);
1590 memcpy(&cr_ses->fore_channel, &new->se_fchannel,
1591 sizeof(struct nfsd4_channel_attrs));
1559 cs_slot->sl_seqid++; 1592 cs_slot->sl_seqid++;
1560 cr_ses->seqid = cs_slot->sl_seqid; 1593 cr_ses->seqid = cs_slot->sl_seqid;
1561 1594
@@ -1577,6 +1610,45 @@ static bool nfsd4_last_compound_op(struct svc_rqst *rqstp)
1577 return argp->opcnt == resp->opcnt; 1610 return argp->opcnt == resp->opcnt;
1578} 1611}
1579 1612
1613static __be32 nfsd4_map_bcts_dir(u32 *dir)
1614{
1615 switch (*dir) {
1616 case NFS4_CDFC4_FORE:
1617 case NFS4_CDFC4_BACK:
1618 return nfs_ok;
1619 case NFS4_CDFC4_FORE_OR_BOTH:
1620 case NFS4_CDFC4_BACK_OR_BOTH:
1621 *dir = NFS4_CDFC4_BOTH;
1622 return nfs_ok;
1623 };
1624 return nfserr_inval;
1625}
1626
1627__be32 nfsd4_bind_conn_to_session(struct svc_rqst *rqstp,
1628 struct nfsd4_compound_state *cstate,
1629 struct nfsd4_bind_conn_to_session *bcts)
1630{
1631 __be32 status;
1632
1633 if (!nfsd4_last_compound_op(rqstp))
1634 return nfserr_not_only_op;
1635 spin_lock(&client_lock);
1636 cstate->session = find_in_sessionid_hashtbl(&bcts->sessionid);
1637 /* Sorta weird: we only need the refcnt'ing because new_conn acquires
1638 * client_lock iself: */
1639 if (cstate->session) {
1640 nfsd4_get_session(cstate->session);
1641 atomic_inc(&cstate->session->se_client->cl_refcount);
1642 }
1643 spin_unlock(&client_lock);
1644 if (!cstate->session)
1645 return nfserr_badsession;
1646
1647 status = nfsd4_map_bcts_dir(&bcts->dir);
1648 nfsd4_new_conn(rqstp, cstate->session, bcts->dir);
1649 return nfs_ok;
1650}
1651
1580static bool nfsd4_compound_in_session(struct nfsd4_session *session, struct nfs4_sessionid *sid) 1652static bool nfsd4_compound_in_session(struct nfsd4_session *session, struct nfs4_sessionid *sid)
1581{ 1653{
1582 if (!session) 1654 if (!session)
@@ -1615,8 +1687,7 @@ nfsd4_destroy_session(struct svc_rqst *r,
1615 spin_unlock(&client_lock); 1687 spin_unlock(&client_lock);
1616 1688
1617 nfs4_lock_state(); 1689 nfs4_lock_state();
1618 /* wait for callbacks */ 1690 nfsd4_probe_callback_sync(ses->se_client);
1619 nfsd4_shutdown_callback(ses->se_client);
1620 nfs4_unlock_state(); 1691 nfs4_unlock_state();
1621 1692
1622 nfsd4_del_conns(ses); 1693 nfsd4_del_conns(ses);
@@ -1644,6 +1715,7 @@ static void nfsd4_sequence_check_conn(struct nfsd4_conn *new, struct nfsd4_sessi
1644{ 1715{
1645 struct nfs4_client *clp = ses->se_client; 1716 struct nfs4_client *clp = ses->se_client;
1646 struct nfsd4_conn *c; 1717 struct nfsd4_conn *c;
1718 int ret;
1647 1719
1648 spin_lock(&clp->cl_lock); 1720 spin_lock(&clp->cl_lock);
1649 c = __nfsd4_find_conn(new->cn_xprt, ses); 1721 c = __nfsd4_find_conn(new->cn_xprt, ses);
@@ -1654,7 +1726,10 @@ static void nfsd4_sequence_check_conn(struct nfsd4_conn *new, struct nfsd4_sessi
1654 } 1726 }
1655 __nfsd4_hash_conn(new, ses); 1727 __nfsd4_hash_conn(new, ses);
1656 spin_unlock(&clp->cl_lock); 1728 spin_unlock(&clp->cl_lock);
1657 nfsd4_register_conn(new); 1729 ret = nfsd4_register_conn(new);
1730 if (ret)
1731 /* oops; xprt is already down: */
1732 nfsd4_conn_lost(&new->cn_xpt_user);
1658 return; 1733 return;
1659} 1734}
1660 1735
@@ -1725,8 +1800,12 @@ nfsd4_sequence(struct svc_rqst *rqstp,
1725out: 1800out:
1726 /* Hold a session reference until done processing the compound. */ 1801 /* Hold a session reference until done processing the compound. */
1727 if (cstate->session) { 1802 if (cstate->session) {
1803 struct nfs4_client *clp = session->se_client;
1804
1728 nfsd4_get_session(cstate->session); 1805 nfsd4_get_session(cstate->session);
1729 atomic_inc(&session->se_client->cl_refcount); 1806 atomic_inc(&clp->cl_refcount);
1807 if (clp->cl_cb_state == NFSD4_CB_DOWN)
1808 seq->status_flags |= SEQ4_STATUS_CB_PATH_DOWN;
1730 } 1809 }
1731 kfree(conn); 1810 kfree(conn);
1732 spin_unlock(&client_lock); 1811 spin_unlock(&client_lock);
@@ -1767,7 +1846,6 @@ __be32
1767nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, 1846nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
1768 struct nfsd4_setclientid *setclid) 1847 struct nfsd4_setclientid *setclid)
1769{ 1848{
1770 struct sockaddr *sa = svc_addr(rqstp);
1771 struct xdr_netobj clname = { 1849 struct xdr_netobj clname = {
1772 .len = setclid->se_namelen, 1850 .len = setclid->se_namelen,
1773 .data = setclid->se_name, 1851 .data = setclid->se_name,
@@ -1793,10 +1871,12 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
1793 strhashval = clientstr_hashval(dname); 1871 strhashval = clientstr_hashval(dname);
1794 1872
1795 nfs4_lock_state(); 1873 nfs4_lock_state();
1796 conf = find_confirmed_client_by_str(dname, strhashval, false); 1874 conf = find_confirmed_client_by_str(dname, strhashval);
1797 if (conf) { 1875 if (conf) {
1798 /* RFC 3530 14.2.33 CASE 0: */ 1876 /* RFC 3530 14.2.33 CASE 0: */
1799 status = nfserr_clid_inuse; 1877 status = nfserr_clid_inuse;
1878 if (clp_used_exchangeid(conf))
1879 goto out;
1800 if (!same_creds(&conf->cl_cred, &rqstp->rq_cred)) { 1880 if (!same_creds(&conf->cl_cred, &rqstp->rq_cred)) {
1801 char addr_str[INET6_ADDRSTRLEN]; 1881 char addr_str[INET6_ADDRSTRLEN];
1802 rpc_ntop((struct sockaddr *) &conf->cl_addr, addr_str, 1882 rpc_ntop((struct sockaddr *) &conf->cl_addr, addr_str,
@@ -1811,7 +1891,7 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
1811 * has a description of SETCLIENTID request processing consisting 1891 * has a description of SETCLIENTID request processing consisting
1812 * of 5 bullet points, labeled as CASE0 - CASE4 below. 1892 * of 5 bullet points, labeled as CASE0 - CASE4 below.
1813 */ 1893 */
1814 unconf = find_unconfirmed_client_by_str(dname, strhashval, false); 1894 unconf = find_unconfirmed_client_by_str(dname, strhashval);
1815 status = nfserr_resource; 1895 status = nfserr_resource;
1816 if (!conf) { 1896 if (!conf) {
1817 /* 1897 /*
@@ -1868,7 +1948,7 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
1868 * for consistent minorversion use throughout: 1948 * for consistent minorversion use throughout:
1869 */ 1949 */
1870 new->cl_minorversion = 0; 1950 new->cl_minorversion = 0;
1871 gen_callback(new, setclid, rpc_get_scope_id(sa)); 1951 gen_callback(new, setclid, rqstp);
1872 add_to_unconfirmed(new, strhashval); 1952 add_to_unconfirmed(new, strhashval);
1873 setclid->se_clientid.cl_boot = new->cl_clientid.cl_boot; 1953 setclid->se_clientid.cl_boot = new->cl_clientid.cl_boot;
1874 setclid->se_clientid.cl_id = new->cl_clientid.cl_id; 1954 setclid->se_clientid.cl_id = new->cl_clientid.cl_id;
@@ -1927,7 +2007,6 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
1927 if (!same_creds(&conf->cl_cred, &unconf->cl_cred)) 2007 if (!same_creds(&conf->cl_cred, &unconf->cl_cred))
1928 status = nfserr_clid_inuse; 2008 status = nfserr_clid_inuse;
1929 else { 2009 else {
1930 atomic_set(&conf->cl_cb_set, 0);
1931 nfsd4_change_callback(conf, &unconf->cl_cb_conn); 2010 nfsd4_change_callback(conf, &unconf->cl_cb_conn);
1932 nfsd4_probe_callback(conf); 2011 nfsd4_probe_callback(conf);
1933 expire_client(unconf); 2012 expire_client(unconf);
@@ -1956,7 +2035,7 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
1956 unsigned int hash = 2035 unsigned int hash =
1957 clientstr_hashval(unconf->cl_recdir); 2036 clientstr_hashval(unconf->cl_recdir);
1958 conf = find_confirmed_client_by_str(unconf->cl_recdir, 2037 conf = find_confirmed_client_by_str(unconf->cl_recdir,
1959 hash, false); 2038 hash);
1960 if (conf) { 2039 if (conf) {
1961 nfsd4_remove_clid_dir(conf); 2040 nfsd4_remove_clid_dir(conf);
1962 expire_client(conf); 2041 expire_client(conf);
@@ -2254,7 +2333,7 @@ nfs4_file_downgrade(struct nfs4_file *fp, unsigned int share_access)
2254 * Spawn a thread to perform a recall on the delegation represented 2333 * Spawn a thread to perform a recall on the delegation represented
2255 * by the lease (file_lock) 2334 * by the lease (file_lock)
2256 * 2335 *
2257 * Called from break_lease() with lock_kernel() held. 2336 * Called from break_lease() with lock_flocks() held.
2258 * Note: we assume break_lease will only call this *once* for any given 2337 * Note: we assume break_lease will only call this *once* for any given
2259 * lease. 2338 * lease.
2260 */ 2339 */
@@ -2278,7 +2357,7 @@ void nfsd_break_deleg_cb(struct file_lock *fl)
2278 list_add_tail(&dp->dl_recall_lru, &del_recall_lru); 2357 list_add_tail(&dp->dl_recall_lru, &del_recall_lru);
2279 spin_unlock(&recall_lock); 2358 spin_unlock(&recall_lock);
2280 2359
2281 /* only place dl_time is set. protected by lock_kernel*/ 2360 /* only place dl_time is set. protected by lock_flocks*/
2282 dp->dl_time = get_seconds(); 2361 dp->dl_time = get_seconds();
2283 2362
2284 /* 2363 /*
@@ -2292,41 +2371,6 @@ void nfsd_break_deleg_cb(struct file_lock *fl)
2292 nfsd4_cb_recall(dp); 2371 nfsd4_cb_recall(dp);
2293} 2372}
2294 2373
2295/*
2296 * The file_lock is being reapd.
2297 *
2298 * Called by locks_free_lock() with lock_kernel() held.
2299 */
2300static
2301void nfsd_release_deleg_cb(struct file_lock *fl)
2302{
2303 struct nfs4_delegation *dp = (struct nfs4_delegation *)fl->fl_owner;
2304
2305 dprintk("NFSD nfsd_release_deleg_cb: fl %p dp %p dl_count %d\n", fl,dp, atomic_read(&dp->dl_count));
2306
2307 if (!(fl->fl_flags & FL_LEASE) || !dp)
2308 return;
2309 dp->dl_flock = NULL;
2310}
2311
2312/*
2313 * Called from setlease() with lock_kernel() held
2314 */
2315static
2316int nfsd_same_client_deleg_cb(struct file_lock *onlist, struct file_lock *try)
2317{
2318 struct nfs4_delegation *onlistd =
2319 (struct nfs4_delegation *)onlist->fl_owner;
2320 struct nfs4_delegation *tryd =
2321 (struct nfs4_delegation *)try->fl_owner;
2322
2323 if (onlist->fl_lmops != try->fl_lmops)
2324 return 0;
2325
2326 return onlistd->dl_client == tryd->dl_client;
2327}
2328
2329
2330static 2374static
2331int nfsd_change_deleg_cb(struct file_lock **onlist, int arg) 2375int nfsd_change_deleg_cb(struct file_lock **onlist, int arg)
2332{ 2376{
@@ -2338,8 +2382,6 @@ int nfsd_change_deleg_cb(struct file_lock **onlist, int arg)
2338 2382
2339static const struct lock_manager_operations nfsd_lease_mng_ops = { 2383static const struct lock_manager_operations nfsd_lease_mng_ops = {
2340 .fl_break = nfsd_break_deleg_cb, 2384 .fl_break = nfsd_break_deleg_cb,
2341 .fl_release_private = nfsd_release_deleg_cb,
2342 .fl_mylease = nfsd_same_client_deleg_cb,
2343 .fl_change = nfsd_change_deleg_cb, 2385 .fl_change = nfsd_change_deleg_cb,
2344}; 2386};
2345 2387
@@ -2506,8 +2548,6 @@ static __be32 nfs4_get_vfs_file(struct svc_rqst *rqstp, struct nfs4_file
2506 if (!fp->fi_fds[oflag]) { 2548 if (!fp->fi_fds[oflag]) {
2507 status = nfsd_open(rqstp, cur_fh, S_IFREG, access, 2549 status = nfsd_open(rqstp, cur_fh, S_IFREG, access,
2508 &fp->fi_fds[oflag]); 2550 &fp->fi_fds[oflag]);
2509 if (status == nfserr_dropit)
2510 status = nfserr_jukebox;
2511 if (status) 2551 if (status)
2512 return status; 2552 return status;
2513 } 2553 }
@@ -2588,6 +2628,19 @@ nfs4_set_claim_prev(struct nfsd4_open *open)
2588 open->op_stateowner->so_client->cl_firststate = 1; 2628 open->op_stateowner->so_client->cl_firststate = 1;
2589} 2629}
2590 2630
2631/* Should we give out recallable state?: */
2632static bool nfsd4_cb_channel_good(struct nfs4_client *clp)
2633{
2634 if (clp->cl_cb_state == NFSD4_CB_UP)
2635 return true;
2636 /*
2637 * In the sessions case, since we don't have to establish a
2638 * separate connection for callbacks, we assume it's OK
2639 * until we hear otherwise:
2640 */
2641 return clp->cl_minorversion && clp->cl_cb_state == NFSD4_CB_UNKNOWN;
2642}
2643
2591/* 2644/*
2592 * Attempt to hand out a delegation. 2645 * Attempt to hand out a delegation.
2593 */ 2646 */
@@ -2596,10 +2649,11 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta
2596{ 2649{
2597 struct nfs4_delegation *dp; 2650 struct nfs4_delegation *dp;
2598 struct nfs4_stateowner *sop = stp->st_stateowner; 2651 struct nfs4_stateowner *sop = stp->st_stateowner;
2599 int cb_up = atomic_read(&sop->so_client->cl_cb_set); 2652 int cb_up;
2600 struct file_lock *fl; 2653 struct file_lock *fl;
2601 int status, flag = 0; 2654 int status, flag = 0;
2602 2655
2656 cb_up = nfsd4_cb_channel_good(sop->so_client);
2603 flag = NFS4_OPEN_DELEGATE_NONE; 2657 flag = NFS4_OPEN_DELEGATE_NONE;
2604 open->op_recall = 0; 2658 open->op_recall = 0;
2605 switch (open->op_claim_type) { 2659 switch (open->op_claim_type) {
@@ -2647,7 +2701,7 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta
2647 dp->dl_flock = fl; 2701 dp->dl_flock = fl;
2648 2702
2649 /* vfs_setlease checks to see if delegation should be handed out. 2703 /* vfs_setlease checks to see if delegation should be handed out.
2650 * the lock_manager callbacks fl_mylease and fl_change are used 2704 * the lock_manager callback fl_change is used
2651 */ 2705 */
2652 if ((status = vfs_setlease(fl->fl_file, fl->fl_type, &fl))) { 2706 if ((status = vfs_setlease(fl->fl_file, fl->fl_type, &fl))) {
2653 dprintk("NFSD: setlease failed [%d], no delegation\n", status); 2707 dprintk("NFSD: setlease failed [%d], no delegation\n", status);
@@ -2786,7 +2840,7 @@ nfsd4_renew(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
2786 renew_client(clp); 2840 renew_client(clp);
2787 status = nfserr_cb_path_down; 2841 status = nfserr_cb_path_down;
2788 if (!list_empty(&clp->cl_delegations) 2842 if (!list_empty(&clp->cl_delegations)
2789 && !atomic_read(&clp->cl_cb_set)) 2843 && clp->cl_cb_state != NFSD4_CB_UP)
2790 goto out; 2844 goto out;
2791 status = nfs_ok; 2845 status = nfs_ok;
2792out: 2846out:
@@ -3073,9 +3127,10 @@ nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate,
3073 if (status) 3127 if (status)
3074 goto out; 3128 goto out;
3075 renew_client(dp->dl_client); 3129 renew_client(dp->dl_client);
3076 if (filpp) 3130 if (filpp) {
3077 *filpp = find_readable_file(dp->dl_file); 3131 *filpp = find_readable_file(dp->dl_file);
3078 BUG_ON(!*filpp); 3132 BUG_ON(!*filpp);
3133 }
3079 } else { /* open or lock stateid */ 3134 } else { /* open or lock stateid */
3080 stp = find_stateid(stateid, flags); 3135 stp = find_stateid(stateid, flags);
3081 if (!stp) 3136 if (!stp)
@@ -4099,7 +4154,7 @@ nfs4_has_reclaimed_state(const char *name, bool use_exchange_id)
4099 unsigned int strhashval = clientstr_hashval(name); 4154 unsigned int strhashval = clientstr_hashval(name);
4100 struct nfs4_client *clp; 4155 struct nfs4_client *clp;
4101 4156
4102 clp = find_confirmed_client_by_str(name, strhashval, use_exchange_id); 4157 clp = find_confirmed_client_by_str(name, strhashval);
4103 return clp ? 1 : 0; 4158 return clp ? 1 : 0;
4104} 4159}
4105 4160
@@ -4328,7 +4383,7 @@ __nfs4_state_shutdown(void)
4328void 4383void
4329nfs4_state_shutdown(void) 4384nfs4_state_shutdown(void)
4330{ 4385{
4331 cancel_rearming_delayed_workqueue(laundry_wq, &laundromat_work); 4386 cancel_delayed_work_sync(&laundromat_work);
4332 destroy_workqueue(laundry_wq); 4387 destroy_workqueue(laundry_wq);
4333 locks_end_grace(&nfsd4_manager); 4388 locks_end_grace(&nfsd4_manager);
4334 nfs4_lock_state(); 4389 nfs4_lock_state();
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index f35a94a0402..956629b9cdc 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -44,13 +44,14 @@
44#include <linux/namei.h> 44#include <linux/namei.h>
45#include <linux/statfs.h> 45#include <linux/statfs.h>
46#include <linux/utsname.h> 46#include <linux/utsname.h>
47#include <linux/nfsd_idmap.h>
48#include <linux/nfs4_acl.h>
49#include <linux/sunrpc/svcauth_gss.h> 47#include <linux/sunrpc/svcauth_gss.h>
50 48
49#include "idmap.h"
50#include "acl.h"
51#include "xdr4.h" 51#include "xdr4.h"
52#include "vfs.h" 52#include "vfs.h"
53 53
54
54#define NFSDDBG_FACILITY NFSDDBG_XDR 55#define NFSDDBG_FACILITY NFSDDBG_XDR
55 56
56/* 57/*
@@ -288,17 +289,17 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval,
288 len += XDR_QUADLEN(dummy32) << 2; 289 len += XDR_QUADLEN(dummy32) << 2;
289 READMEM(buf, dummy32); 290 READMEM(buf, dummy32);
290 ace->whotype = nfs4_acl_get_whotype(buf, dummy32); 291 ace->whotype = nfs4_acl_get_whotype(buf, dummy32);
291 host_err = 0; 292 status = nfs_ok;
292 if (ace->whotype != NFS4_ACL_WHO_NAMED) 293 if (ace->whotype != NFS4_ACL_WHO_NAMED)
293 ace->who = 0; 294 ace->who = 0;
294 else if (ace->flag & NFS4_ACE_IDENTIFIER_GROUP) 295 else if (ace->flag & NFS4_ACE_IDENTIFIER_GROUP)
295 host_err = nfsd_map_name_to_gid(argp->rqstp, 296 status = nfsd_map_name_to_gid(argp->rqstp,
296 buf, dummy32, &ace->who); 297 buf, dummy32, &ace->who);
297 else 298 else
298 host_err = nfsd_map_name_to_uid(argp->rqstp, 299 status = nfsd_map_name_to_uid(argp->rqstp,
299 buf, dummy32, &ace->who); 300 buf, dummy32, &ace->who);
300 if (host_err) 301 if (status)
301 goto out_nfserr; 302 return status;
302 } 303 }
303 } else 304 } else
304 *acl = NULL; 305 *acl = NULL;
@@ -420,6 +421,21 @@ nfsd4_decode_access(struct nfsd4_compoundargs *argp, struct nfsd4_access *access
420 DECODE_TAIL; 421 DECODE_TAIL;
421} 422}
422 423
424static __be32 nfsd4_decode_bind_conn_to_session(struct nfsd4_compoundargs *argp, struct nfsd4_bind_conn_to_session *bcts)
425{
426 DECODE_HEAD;
427 u32 dummy;
428
429 READ_BUF(NFS4_MAX_SESSIONID_LEN + 8);
430 COPYMEM(bcts->sessionid.data, NFS4_MAX_SESSIONID_LEN);
431 READ32(bcts->dir);
432 /* XXX: Perhaps Tom Tucker could help us figure out how we
433 * should be using ctsa_use_conn_in_rdma_mode: */
434 READ32(dummy);
435
436 DECODE_TAIL;
437}
438
423static __be32 439static __be32
424nfsd4_decode_close(struct nfsd4_compoundargs *argp, struct nfsd4_close *close) 440nfsd4_decode_close(struct nfsd4_compoundargs *argp, struct nfsd4_close *close)
425{ 441{
@@ -847,6 +863,17 @@ nfsd4_decode_secinfo(struct nfsd4_compoundargs *argp,
847} 863}
848 864
849static __be32 865static __be32
866nfsd4_decode_secinfo_no_name(struct nfsd4_compoundargs *argp,
867 struct nfsd4_secinfo_no_name *sin)
868{
869 DECODE_HEAD;
870
871 READ_BUF(4);
872 READ32(sin->sin_style);
873 DECODE_TAIL;
874}
875
876static __be32
850nfsd4_decode_setattr(struct nfsd4_compoundargs *argp, struct nfsd4_setattr *setattr) 877nfsd4_decode_setattr(struct nfsd4_compoundargs *argp, struct nfsd4_setattr *setattr)
851{ 878{
852 __be32 status; 879 __be32 status;
@@ -1005,7 +1032,7 @@ static __be32
1005nfsd4_decode_exchange_id(struct nfsd4_compoundargs *argp, 1032nfsd4_decode_exchange_id(struct nfsd4_compoundargs *argp,
1006 struct nfsd4_exchange_id *exid) 1033 struct nfsd4_exchange_id *exid)
1007{ 1034{
1008 int dummy; 1035 int dummy, tmp;
1009 DECODE_HEAD; 1036 DECODE_HEAD;
1010 1037
1011 READ_BUF(NFS4_VERIFIER_SIZE); 1038 READ_BUF(NFS4_VERIFIER_SIZE);
@@ -1053,15 +1080,23 @@ nfsd4_decode_exchange_id(struct nfsd4_compoundargs *argp,
1053 1080
1054 /* ssp_hash_algs<> */ 1081 /* ssp_hash_algs<> */
1055 READ_BUF(4); 1082 READ_BUF(4);
1056 READ32(dummy); 1083 READ32(tmp);
1057 READ_BUF(dummy); 1084 while (tmp--) {
1058 p += XDR_QUADLEN(dummy); 1085 READ_BUF(4);
1086 READ32(dummy);
1087 READ_BUF(dummy);
1088 p += XDR_QUADLEN(dummy);
1089 }
1059 1090
1060 /* ssp_encr_algs<> */ 1091 /* ssp_encr_algs<> */
1061 READ_BUF(4); 1092 READ_BUF(4);
1062 READ32(dummy); 1093 READ32(tmp);
1063 READ_BUF(dummy); 1094 while (tmp--) {
1064 p += XDR_QUADLEN(dummy); 1095 READ_BUF(4);
1096 READ32(dummy);
1097 READ_BUF(dummy);
1098 p += XDR_QUADLEN(dummy);
1099 }
1065 1100
1066 /* ssp_window and ssp_num_gss_handles */ 1101 /* ssp_window and ssp_num_gss_handles */
1067 READ_BUF(8); 1102 READ_BUF(8);
@@ -1339,7 +1374,7 @@ static nfsd4_dec nfsd41_dec_ops[] = {
1339 1374
1340 /* new operations for NFSv4.1 */ 1375 /* new operations for NFSv4.1 */
1341 [OP_BACKCHANNEL_CTL] = (nfsd4_dec)nfsd4_decode_notsupp, 1376 [OP_BACKCHANNEL_CTL] = (nfsd4_dec)nfsd4_decode_notsupp,
1342 [OP_BIND_CONN_TO_SESSION]= (nfsd4_dec)nfsd4_decode_notsupp, 1377 [OP_BIND_CONN_TO_SESSION]= (nfsd4_dec)nfsd4_decode_bind_conn_to_session,
1343 [OP_EXCHANGE_ID] = (nfsd4_dec)nfsd4_decode_exchange_id, 1378 [OP_EXCHANGE_ID] = (nfsd4_dec)nfsd4_decode_exchange_id,
1344 [OP_CREATE_SESSION] = (nfsd4_dec)nfsd4_decode_create_session, 1379 [OP_CREATE_SESSION] = (nfsd4_dec)nfsd4_decode_create_session,
1345 [OP_DESTROY_SESSION] = (nfsd4_dec)nfsd4_decode_destroy_session, 1380 [OP_DESTROY_SESSION] = (nfsd4_dec)nfsd4_decode_destroy_session,
@@ -1350,7 +1385,7 @@ static nfsd4_dec nfsd41_dec_ops[] = {
1350 [OP_LAYOUTCOMMIT] = (nfsd4_dec)nfsd4_decode_notsupp, 1385 [OP_LAYOUTCOMMIT] = (nfsd4_dec)nfsd4_decode_notsupp,
1351 [OP_LAYOUTGET] = (nfsd4_dec)nfsd4_decode_notsupp, 1386 [OP_LAYOUTGET] = (nfsd4_dec)nfsd4_decode_notsupp,
1352 [OP_LAYOUTRETURN] = (nfsd4_dec)nfsd4_decode_notsupp, 1387 [OP_LAYOUTRETURN] = (nfsd4_dec)nfsd4_decode_notsupp,
1353 [OP_SECINFO_NO_NAME] = (nfsd4_dec)nfsd4_decode_notsupp, 1388 [OP_SECINFO_NO_NAME] = (nfsd4_dec)nfsd4_decode_secinfo_no_name,
1354 [OP_SEQUENCE] = (nfsd4_dec)nfsd4_decode_sequence, 1389 [OP_SEQUENCE] = (nfsd4_dec)nfsd4_decode_sequence,
1355 [OP_SET_SSV] = (nfsd4_dec)nfsd4_decode_notsupp, 1390 [OP_SET_SSV] = (nfsd4_dec)nfsd4_decode_notsupp,
1356 [OP_TEST_STATEID] = (nfsd4_dec)nfsd4_decode_notsupp, 1391 [OP_TEST_STATEID] = (nfsd4_dec)nfsd4_decode_notsupp,
@@ -2309,8 +2344,6 @@ nfsd4_encode_dirent(void *ccdv, const char *name, int namlen,
2309 case nfserr_resource: 2344 case nfserr_resource:
2310 nfserr = nfserr_toosmall; 2345 nfserr = nfserr_toosmall;
2311 goto fail; 2346 goto fail;
2312 case nfserr_dropit:
2313 goto fail;
2314 case nfserr_noent: 2347 case nfserr_noent:
2315 goto skip_entry; 2348 goto skip_entry;
2316 default: 2349 default:
@@ -2365,6 +2398,21 @@ nfsd4_encode_access(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_
2365 return nfserr; 2398 return nfserr;
2366} 2399}
2367 2400
2401static __be32 nfsd4_encode_bind_conn_to_session(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_bind_conn_to_session *bcts)
2402{
2403 __be32 *p;
2404
2405 if (!nfserr) {
2406 RESERVE_SPACE(NFS4_MAX_SESSIONID_LEN + 8);
2407 WRITEMEM(bcts->sessionid.data, NFS4_MAX_SESSIONID_LEN);
2408 WRITE32(bcts->dir);
2409 /* XXX: ? */
2410 WRITE32(0);
2411 ADJUST_ARGS();
2412 }
2413 return nfserr;
2414}
2415
2368static __be32 2416static __be32
2369nfsd4_encode_close(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_close *close) 2417nfsd4_encode_close(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_close *close)
2370{ 2418{
@@ -2826,11 +2874,10 @@ nfsd4_encode_rename(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_
2826} 2874}
2827 2875
2828static __be32 2876static __be32
2829nfsd4_encode_secinfo(struct nfsd4_compoundres *resp, __be32 nfserr, 2877nfsd4_do_encode_secinfo(struct nfsd4_compoundres *resp,
2830 struct nfsd4_secinfo *secinfo) 2878 __be32 nfserr,struct svc_export *exp)
2831{ 2879{
2832 int i = 0; 2880 int i = 0;
2833 struct svc_export *exp = secinfo->si_exp;
2834 u32 nflavs; 2881 u32 nflavs;
2835 struct exp_flavor_info *flavs; 2882 struct exp_flavor_info *flavs;
2836 struct exp_flavor_info def_flavs[2]; 2883 struct exp_flavor_info def_flavs[2];
@@ -2892,6 +2939,20 @@ out:
2892 return nfserr; 2939 return nfserr;
2893} 2940}
2894 2941
2942static __be32
2943nfsd4_encode_secinfo(struct nfsd4_compoundres *resp, __be32 nfserr,
2944 struct nfsd4_secinfo *secinfo)
2945{
2946 return nfsd4_do_encode_secinfo(resp, nfserr, secinfo->si_exp);
2947}
2948
2949static __be32
2950nfsd4_encode_secinfo_no_name(struct nfsd4_compoundres *resp, __be32 nfserr,
2951 struct nfsd4_secinfo_no_name *secinfo)
2952{
2953 return nfsd4_do_encode_secinfo(resp, nfserr, secinfo->sin_exp);
2954}
2955
2895/* 2956/*
2896 * The SETATTR encode routine is special -- it always encodes a bitmap, 2957 * The SETATTR encode routine is special -- it always encodes a bitmap,
2897 * regardless of the error status. 2958 * regardless of the error status.
@@ -3076,13 +3137,9 @@ nfsd4_encode_sequence(struct nfsd4_compoundres *resp, int nfserr,
3076 WRITE32(seq->seqid); 3137 WRITE32(seq->seqid);
3077 WRITE32(seq->slotid); 3138 WRITE32(seq->slotid);
3078 WRITE32(seq->maxslots); 3139 WRITE32(seq->maxslots);
3079 /* 3140 /* For now: target_maxslots = maxslots */
3080 * FIXME: for now:
3081 * target_maxslots = maxslots
3082 * status_flags = 0
3083 */
3084 WRITE32(seq->maxslots); 3141 WRITE32(seq->maxslots);
3085 WRITE32(0); 3142 WRITE32(seq->status_flags);
3086 3143
3087 ADJUST_ARGS(); 3144 ADJUST_ARGS();
3088 resp->cstate.datap = p; /* DRC cache data pointer */ 3145 resp->cstate.datap = p; /* DRC cache data pointer */
@@ -3143,7 +3200,7 @@ static nfsd4_enc nfsd4_enc_ops[] = {
3143 3200
3144 /* NFSv4.1 operations */ 3201 /* NFSv4.1 operations */
3145 [OP_BACKCHANNEL_CTL] = (nfsd4_enc)nfsd4_encode_noop, 3202 [OP_BACKCHANNEL_CTL] = (nfsd4_enc)nfsd4_encode_noop,
3146 [OP_BIND_CONN_TO_SESSION] = (nfsd4_enc)nfsd4_encode_noop, 3203 [OP_BIND_CONN_TO_SESSION] = (nfsd4_enc)nfsd4_encode_bind_conn_to_session,
3147 [OP_EXCHANGE_ID] = (nfsd4_enc)nfsd4_encode_exchange_id, 3204 [OP_EXCHANGE_ID] = (nfsd4_enc)nfsd4_encode_exchange_id,
3148 [OP_CREATE_SESSION] = (nfsd4_enc)nfsd4_encode_create_session, 3205 [OP_CREATE_SESSION] = (nfsd4_enc)nfsd4_encode_create_session,
3149 [OP_DESTROY_SESSION] = (nfsd4_enc)nfsd4_encode_destroy_session, 3206 [OP_DESTROY_SESSION] = (nfsd4_enc)nfsd4_encode_destroy_session,
@@ -3154,7 +3211,7 @@ static nfsd4_enc nfsd4_enc_ops[] = {
3154 [OP_LAYOUTCOMMIT] = (nfsd4_enc)nfsd4_encode_noop, 3211 [OP_LAYOUTCOMMIT] = (nfsd4_enc)nfsd4_encode_noop,
3155 [OP_LAYOUTGET] = (nfsd4_enc)nfsd4_encode_noop, 3212 [OP_LAYOUTGET] = (nfsd4_enc)nfsd4_encode_noop,
3156 [OP_LAYOUTRETURN] = (nfsd4_enc)nfsd4_encode_noop, 3213 [OP_LAYOUTRETURN] = (nfsd4_enc)nfsd4_encode_noop,
3157 [OP_SECINFO_NO_NAME] = (nfsd4_enc)nfsd4_encode_noop, 3214 [OP_SECINFO_NO_NAME] = (nfsd4_enc)nfsd4_encode_secinfo_no_name,
3158 [OP_SEQUENCE] = (nfsd4_enc)nfsd4_encode_sequence, 3215 [OP_SEQUENCE] = (nfsd4_enc)nfsd4_encode_sequence,
3159 [OP_SET_SSV] = (nfsd4_enc)nfsd4_encode_noop, 3216 [OP_SET_SSV] = (nfsd4_enc)nfsd4_encode_noop,
3160 [OP_TEST_STATEID] = (nfsd4_enc)nfsd4_encode_noop, 3217 [OP_TEST_STATEID] = (nfsd4_enc)nfsd4_encode_noop,
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 4514ebbee4d..33b3e2b0677 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -8,12 +8,12 @@
8#include <linux/namei.h> 8#include <linux/namei.h>
9#include <linux/ctype.h> 9#include <linux/ctype.h>
10 10
11#include <linux/nfsd_idmap.h>
12#include <linux/sunrpc/svcsock.h> 11#include <linux/sunrpc/svcsock.h>
13#include <linux/nfsd/syscall.h> 12#include <linux/nfsd/syscall.h>
14#include <linux/lockd/lockd.h> 13#include <linux/lockd/lockd.h>
15#include <linux/sunrpc/clnt.h> 14#include <linux/sunrpc/clnt.h>
16 15
16#include "idmap.h"
17#include "nfsd.h" 17#include "nfsd.h"
18#include "cache.h" 18#include "cache.h"
19 19
@@ -127,6 +127,7 @@ static ssize_t nfsctl_transaction_write(struct file *file, const char __user *bu
127 127
128static ssize_t nfsctl_transaction_read(struct file *file, char __user *buf, size_t size, loff_t *pos) 128static ssize_t nfsctl_transaction_read(struct file *file, char __user *buf, size_t size, loff_t *pos)
129{ 129{
130#ifdef CONFIG_NFSD_DEPRECATED
130 static int warned; 131 static int warned;
131 if (file->f_dentry->d_name.name[0] == '.' && !warned) { 132 if (file->f_dentry->d_name.name[0] == '.' && !warned) {
132 printk(KERN_INFO 133 printk(KERN_INFO
@@ -135,6 +136,7 @@ static ssize_t nfsctl_transaction_read(struct file *file, char __user *buf, size
135 current->comm, file->f_dentry->d_name.name); 136 current->comm, file->f_dentry->d_name.name);
136 warned = 1; 137 warned = 1;
137 } 138 }
139#endif
138 if (! file->private_data) { 140 if (! file->private_data) {
139 /* An attempt to read a transaction file without writing 141 /* An attempt to read a transaction file without writing
140 * causes a 0-byte write so that the file can return 142 * causes a 0-byte write so that the file can return
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index 6b641cf2c19..7ecfa242030 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -158,6 +158,7 @@ void nfsd_lockd_shutdown(void);
158#define nfserr_attrnotsupp cpu_to_be32(NFSERR_ATTRNOTSUPP) 158#define nfserr_attrnotsupp cpu_to_be32(NFSERR_ATTRNOTSUPP)
159#define nfserr_bad_xdr cpu_to_be32(NFSERR_BAD_XDR) 159#define nfserr_bad_xdr cpu_to_be32(NFSERR_BAD_XDR)
160#define nfserr_openmode cpu_to_be32(NFSERR_OPENMODE) 160#define nfserr_openmode cpu_to_be32(NFSERR_OPENMODE)
161#define nfserr_badowner cpu_to_be32(NFSERR_BADOWNER)
161#define nfserr_locks_held cpu_to_be32(NFSERR_LOCKS_HELD) 162#define nfserr_locks_held cpu_to_be32(NFSERR_LOCKS_HELD)
162#define nfserr_op_illegal cpu_to_be32(NFSERR_OP_ILLEGAL) 163#define nfserr_op_illegal cpu_to_be32(NFSERR_OP_ILLEGAL)
163#define nfserr_grace cpu_to_be32(NFSERR_GRACE) 164#define nfserr_grace cpu_to_be32(NFSERR_GRACE)
diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
index 08e17264784..e15dc45fc5e 100644
--- a/fs/nfsd/nfsproc.c
+++ b/fs/nfsd/nfsproc.c
@@ -735,9 +735,9 @@ nfserrno (int errno)
735 { nfserr_stale, -ESTALE }, 735 { nfserr_stale, -ESTALE },
736 { nfserr_jukebox, -ETIMEDOUT }, 736 { nfserr_jukebox, -ETIMEDOUT },
737 { nfserr_jukebox, -ERESTARTSYS }, 737 { nfserr_jukebox, -ERESTARTSYS },
738 { nfserr_dropit, -EAGAIN }, 738 { nfserr_jukebox, -EAGAIN },
739 { nfserr_dropit, -ENOMEM }, 739 { nfserr_jukebox, -EWOULDBLOCK },
740 { nfserr_badname, -ESRCH }, 740 { nfserr_jukebox, -ENOMEM },
741 { nfserr_io, -ETXTBSY }, 741 { nfserr_io, -ETXTBSY },
742 { nfserr_notsupp, -EOPNOTSUPP }, 742 { nfserr_notsupp, -EOPNOTSUPP },
743 { nfserr_toosmall, -ETOOSMALL }, 743 { nfserr_toosmall, -ETOOSMALL },
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 2bae1d86f5f..18743c4d8bc 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -608,7 +608,7 @@ nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp)
608 /* Now call the procedure handler, and encode NFS status. */ 608 /* Now call the procedure handler, and encode NFS status. */
609 nfserr = proc->pc_func(rqstp, rqstp->rq_argp, rqstp->rq_resp); 609 nfserr = proc->pc_func(rqstp, rqstp->rq_argp, rqstp->rq_resp);
610 nfserr = map_new_errors(rqstp->rq_vers, nfserr); 610 nfserr = map_new_errors(rqstp->rq_vers, nfserr);
611 if (nfserr == nfserr_dropit) { 611 if (nfserr == nfserr_dropit || rqstp->rq_dropme) {
612 dprintk("nfsd: Dropping request; may be revisited later\n"); 612 dprintk("nfsd: Dropping request; may be revisited later\n");
613 nfsd_cache_update(rqstp, RC_NOCACHE, NULL); 613 nfsd_cache_update(rqstp, RC_NOCACHE, NULL);
614 return 0; 614 return 0;
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 39adc27b068..3074656ba7b 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -68,10 +68,12 @@ typedef struct {
68struct nfsd4_callback { 68struct nfsd4_callback {
69 void *cb_op; 69 void *cb_op;
70 struct nfs4_client *cb_clp; 70 struct nfs4_client *cb_clp;
71 struct list_head cb_per_client;
71 u32 cb_minorversion; 72 u32 cb_minorversion;
72 struct rpc_message cb_msg; 73 struct rpc_message cb_msg;
73 const struct rpc_call_ops *cb_ops; 74 const struct rpc_call_ops *cb_ops;
74 struct work_struct cb_work; 75 struct work_struct cb_work;
76 bool cb_done;
75}; 77};
76 78
77struct nfs4_delegation { 79struct nfs4_delegation {
@@ -81,6 +83,7 @@ struct nfs4_delegation {
81 atomic_t dl_count; /* ref count */ 83 atomic_t dl_count; /* ref count */
82 struct nfs4_client *dl_client; 84 struct nfs4_client *dl_client;
83 struct nfs4_file *dl_file; 85 struct nfs4_file *dl_file;
86 struct file *dl_vfs_file;
84 struct file_lock *dl_flock; 87 struct file_lock *dl_flock;
85 u32 dl_type; 88 u32 dl_type;
86 time_t dl_time; 89 time_t dl_time;
@@ -95,6 +98,7 @@ struct nfs4_delegation {
95struct nfs4_cb_conn { 98struct nfs4_cb_conn {
96 /* SETCLIENTID info */ 99 /* SETCLIENTID info */
97 struct sockaddr_storage cb_addr; 100 struct sockaddr_storage cb_addr;
101 struct sockaddr_storage cb_saddr;
98 size_t cb_addrlen; 102 size_t cb_addrlen;
99 u32 cb_prog; /* used only in 4.0 case; 103 u32 cb_prog; /* used only in 4.0 case;
100 per-session otherwise */ 104 per-session otherwise */
@@ -146,6 +150,11 @@ struct nfsd4_create_session {
146 u32 gid; 150 u32 gid;
147}; 151};
148 152
153struct nfsd4_bind_conn_to_session {
154 struct nfs4_sessionid sessionid;
155 u32 dir;
156};
157
149/* The single slot clientid cache structure */ 158/* The single slot clientid cache structure */
150struct nfsd4_clid_slot { 159struct nfsd4_clid_slot {
151 u32 sl_seqid; 160 u32 sl_seqid;
@@ -235,9 +244,13 @@ struct nfs4_client {
235 unsigned long cl_cb_flags; 244 unsigned long cl_cb_flags;
236 struct rpc_clnt *cl_cb_client; 245 struct rpc_clnt *cl_cb_client;
237 u32 cl_cb_ident; 246 u32 cl_cb_ident;
238 atomic_t cl_cb_set; 247#define NFSD4_CB_UP 0
248#define NFSD4_CB_UNKNOWN 1
249#define NFSD4_CB_DOWN 2
250 int cl_cb_state;
239 struct nfsd4_callback cl_cb_null; 251 struct nfsd4_callback cl_cb_null;
240 struct nfsd4_session *cl_cb_session; 252 struct nfsd4_session *cl_cb_session;
253 struct list_head cl_callbacks; /* list of in-progress callbacks */
241 254
242 /* for all client information that callback code might need: */ 255 /* for all client information that callback code might need: */
243 spinlock_t cl_lock; 256 spinlock_t cl_lock;
@@ -454,6 +467,7 @@ extern __be32 nfs4_check_open_reclaim(clientid_t *clid);
454extern void nfs4_free_stateowner(struct kref *kref); 467extern void nfs4_free_stateowner(struct kref *kref);
455extern int set_callback_cred(void); 468extern int set_callback_cred(void);
456extern void nfsd4_probe_callback(struct nfs4_client *clp); 469extern void nfsd4_probe_callback(struct nfs4_client *clp);
470extern void nfsd4_probe_callback_sync(struct nfs4_client *clp);
457extern void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *); 471extern void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *);
458extern void nfsd4_do_callback_rpc(struct work_struct *); 472extern void nfsd4_do_callback_rpc(struct work_struct *);
459extern void nfsd4_cb_recall(struct nfs4_delegation *dp); 473extern void nfsd4_cb_recall(struct nfs4_delegation *dp);
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 184938fcff0..641117f2188 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1,4 +1,3 @@
1#define MSNFS /* HACK HACK */
2/* 1/*
3 * File operations used by nfsd. Some of these have been ripped from 2 * File operations used by nfsd. Some of these have been ripped from
4 * other parts of the kernel because they weren't exported, others 3 * other parts of the kernel because they weren't exported, others
@@ -35,8 +34,8 @@
35#endif /* CONFIG_NFSD_V3 */ 34#endif /* CONFIG_NFSD_V3 */
36 35
37#ifdef CONFIG_NFSD_V4 36#ifdef CONFIG_NFSD_V4
38#include <linux/nfs4_acl.h> 37#include "acl.h"
39#include <linux/nfsd_idmap.h> 38#include "idmap.h"
40#endif /* CONFIG_NFSD_V4 */ 39#endif /* CONFIG_NFSD_V4 */
41 40
42#include "nfsd.h" 41#include "nfsd.h"
@@ -88,8 +87,9 @@ nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp,
88 .dentry = dget(dentry)}; 87 .dentry = dget(dentry)};
89 int err = 0; 88 int err = 0;
90 89
91 while (d_mountpoint(path.dentry) && follow_down(&path)) 90 err = follow_down(&path, false);
92 ; 91 if (err < 0)
92 goto out;
93 93
94 exp2 = rqst_exp_get_by_name(rqstp, &path); 94 exp2 = rqst_exp_get_by_name(rqstp, &path);
95 if (IS_ERR(exp2)) { 95 if (IS_ERR(exp2)) {
@@ -273,6 +273,13 @@ out:
273 return err; 273 return err;
274} 274}
275 275
276static int nfsd_break_lease(struct inode *inode)
277{
278 if (!S_ISREG(inode->i_mode))
279 return 0;
280 return break_lease(inode, O_WRONLY | O_NONBLOCK);
281}
282
276/* 283/*
277 * Commit metadata changes to stable storage. 284 * Commit metadata changes to stable storage.
278 */ 285 */
@@ -375,16 +382,6 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
375 goto out; 382 goto out;
376 } 383 }
377 384
378 /*
379 * If we are changing the size of the file, then
380 * we need to break all leases.
381 */
382 host_err = break_lease(inode, O_WRONLY | O_NONBLOCK);
383 if (host_err == -EWOULDBLOCK)
384 host_err = -ETIMEDOUT;
385 if (host_err) /* ENOMEM or EWOULDBLOCK */
386 goto out_nfserr;
387
388 host_err = get_write_access(inode); 385 host_err = get_write_access(inode);
389 if (host_err) 386 if (host_err)
390 goto out_nfserr; 387 goto out_nfserr;
@@ -425,7 +422,11 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
425 422
426 err = nfserr_notsync; 423 err = nfserr_notsync;
427 if (!check_guard || guardtime == inode->i_ctime.tv_sec) { 424 if (!check_guard || guardtime == inode->i_ctime.tv_sec) {
425 host_err = nfsd_break_lease(inode);
426 if (host_err)
427 goto out_nfserr;
428 fh_lock(fhp); 428 fh_lock(fhp);
429
429 host_err = notify_change(dentry, iap); 430 host_err = notify_change(dentry, iap);
430 err = nfserrno(host_err); 431 err = nfserrno(host_err);
431 fh_unlock(fhp); 432 fh_unlock(fhp);
@@ -752,8 +753,6 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
752 */ 753 */
753 if (!(access & NFSD_MAY_NOT_BREAK_LEASE)) 754 if (!(access & NFSD_MAY_NOT_BREAK_LEASE))
754 host_err = break_lease(inode, O_NONBLOCK | ((access & NFSD_MAY_WRITE) ? O_WRONLY : 0)); 755 host_err = break_lease(inode, O_NONBLOCK | ((access & NFSD_MAY_WRITE) ? O_WRONLY : 0));
755 if (host_err == -EWOULDBLOCK)
756 host_err = -ETIMEDOUT;
757 if (host_err) /* NOMEM or WOULDBLOCK */ 756 if (host_err) /* NOMEM or WOULDBLOCK */
758 goto out_nfserr; 757 goto out_nfserr;
759 758
@@ -845,11 +844,6 @@ nfsd_splice_actor(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
845 struct page **pp = rqstp->rq_respages + rqstp->rq_resused; 844 struct page **pp = rqstp->rq_respages + rqstp->rq_resused;
846 struct page *page = buf->page; 845 struct page *page = buf->page;
847 size_t size; 846 size_t size;
848 int ret;
849
850 ret = buf->ops->confirm(pipe, buf);
851 if (unlikely(ret))
852 return ret;
853 847
854 size = sd->len; 848 size = sd->len;
855 849
@@ -879,15 +873,6 @@ static int nfsd_direct_splice_actor(struct pipe_inode_info *pipe,
879 return __splice_from_pipe(pipe, sd, nfsd_splice_actor); 873 return __splice_from_pipe(pipe, sd, nfsd_splice_actor);
880} 874}
881 875
882static inline int svc_msnfs(struct svc_fh *ffhp)
883{
884#ifdef MSNFS
885 return (ffhp->fh_export->ex_flags & NFSEXP_MSNFS);
886#else
887 return 0;
888#endif
889}
890
891static __be32 876static __be32
892nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, 877nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
893 loff_t offset, struct kvec *vec, int vlen, unsigned long *count) 878 loff_t offset, struct kvec *vec, int vlen, unsigned long *count)
@@ -900,9 +885,6 @@ nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
900 err = nfserr_perm; 885 err = nfserr_perm;
901 inode = file->f_path.dentry->d_inode; 886 inode = file->f_path.dentry->d_inode;
902 887
903 if (svc_msnfs(fhp) && !lock_may_read(inode, offset, *count))
904 goto out;
905
906 if (file->f_op->splice_read && rqstp->rq_splice_ok) { 888 if (file->f_op->splice_read && rqstp->rq_splice_ok) {
907 struct splice_desc sd = { 889 struct splice_desc sd = {
908 .len = 0, 890 .len = 0,
@@ -927,7 +909,6 @@ nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
927 fsnotify_access(file); 909 fsnotify_access(file);
928 } else 910 } else
929 err = nfserrno(host_err); 911 err = nfserrno(host_err);
930out:
931 return err; 912 return err;
932} 913}
933 914
@@ -992,14 +973,6 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
992 int stable = *stablep; 973 int stable = *stablep;
993 int use_wgather; 974 int use_wgather;
994 975
995#ifdef MSNFS
996 err = nfserr_perm;
997
998 if ((fhp->fh_export->ex_flags & NFSEXP_MSNFS) &&
999 (!lock_may_write(file->f_path.dentry->d_inode, offset, *cnt)))
1000 goto out;
1001#endif
1002
1003 dentry = file->f_path.dentry; 976 dentry = file->f_path.dentry;
1004 inode = dentry->d_inode; 977 inode = dentry->d_inode;
1005 exp = fhp->fh_export; 978 exp = fhp->fh_export;
@@ -1050,7 +1023,6 @@ out_nfserr:
1050 err = 0; 1023 err = 0;
1051 else 1024 else
1052 err = nfserrno(host_err); 1025 err = nfserrno(host_err);
1053out:
1054 return err; 1026 return err;
1055} 1027}
1056 1028
@@ -1670,6 +1642,12 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp,
1670 err = nfserrno(host_err); 1642 err = nfserrno(host_err);
1671 goto out_dput; 1643 goto out_dput;
1672 } 1644 }
1645 err = nfserr_noent;
1646 if (!dold->d_inode)
1647 goto out_drop_write;
1648 host_err = nfsd_break_lease(dold->d_inode);
1649 if (host_err)
1650 goto out_drop_write;
1673 host_err = vfs_link(dold, dirp, dnew); 1651 host_err = vfs_link(dold, dirp, dnew);
1674 if (!host_err) { 1652 if (!host_err) {
1675 err = nfserrno(commit_metadata(ffhp)); 1653 err = nfserrno(commit_metadata(ffhp));
@@ -1681,6 +1659,7 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp,
1681 else 1659 else
1682 err = nfserrno(host_err); 1660 err = nfserrno(host_err);
1683 } 1661 }
1662out_drop_write:
1684 mnt_drop_write(tfhp->fh_export->ex_path.mnt); 1663 mnt_drop_write(tfhp->fh_export->ex_path.mnt);
1685out_dput: 1664out_dput:
1686 dput(dnew); 1665 dput(dnew);
@@ -1755,13 +1734,6 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
1755 if (ndentry == trap) 1734 if (ndentry == trap)
1756 goto out_dput_new; 1735 goto out_dput_new;
1757 1736
1758 if (svc_msnfs(ffhp) &&
1759 ((atomic_read(&odentry->d_count) > 1)
1760 || (atomic_read(&ndentry->d_count) > 1))) {
1761 host_err = -EPERM;
1762 goto out_dput_new;
1763 }
1764
1765 host_err = -EXDEV; 1737 host_err = -EXDEV;
1766 if (ffhp->fh_export->ex_path.mnt != tfhp->fh_export->ex_path.mnt) 1738 if (ffhp->fh_export->ex_path.mnt != tfhp->fh_export->ex_path.mnt)
1767 goto out_dput_new; 1739 goto out_dput_new;
@@ -1769,15 +1741,17 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
1769 if (host_err) 1741 if (host_err)
1770 goto out_dput_new; 1742 goto out_dput_new;
1771 1743
1744 host_err = nfsd_break_lease(odentry->d_inode);
1745 if (host_err)
1746 goto out_drop_write;
1772 host_err = vfs_rename(fdir, odentry, tdir, ndentry); 1747 host_err = vfs_rename(fdir, odentry, tdir, ndentry);
1773 if (!host_err) { 1748 if (!host_err) {
1774 host_err = commit_metadata(tfhp); 1749 host_err = commit_metadata(tfhp);
1775 if (!host_err) 1750 if (!host_err)
1776 host_err = commit_metadata(ffhp); 1751 host_err = commit_metadata(ffhp);
1777 } 1752 }
1778 1753out_drop_write:
1779 mnt_drop_write(ffhp->fh_export->ex_path.mnt); 1754 mnt_drop_write(ffhp->fh_export->ex_path.mnt);
1780
1781 out_dput_new: 1755 out_dput_new:
1782 dput(ndentry); 1756 dput(ndentry);
1783 out_dput_old: 1757 out_dput_old:
@@ -1840,18 +1814,14 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
1840 if (host_err) 1814 if (host_err)
1841 goto out_nfserr; 1815 goto out_nfserr;
1842 1816
1843 if (type != S_IFDIR) { /* It's UNLINK */ 1817 host_err = nfsd_break_lease(rdentry->d_inode);
1844#ifdef MSNFS 1818 if (host_err)
1845 if ((fhp->fh_export->ex_flags & NFSEXP_MSNFS) && 1819 goto out_put;
1846 (atomic_read(&rdentry->d_count) > 1)) { 1820 if (type != S_IFDIR)
1847 host_err = -EPERM;
1848 } else
1849#endif
1850 host_err = vfs_unlink(dirp, rdentry); 1821 host_err = vfs_unlink(dirp, rdentry);
1851 } else { /* It's RMDIR */ 1822 else
1852 host_err = vfs_rmdir(dirp, rdentry); 1823 host_err = vfs_rmdir(dirp, rdentry);
1853 } 1824out_put:
1854
1855 dput(rdentry); 1825 dput(rdentry);
1856 1826
1857 if (!host_err) 1827 if (!host_err)
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index 4d476ff08ae..366401e1a53 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -311,6 +311,11 @@ struct nfsd4_secinfo {
311 struct svc_export *si_exp; /* response */ 311 struct svc_export *si_exp; /* response */
312}; 312};
313 313
314struct nfsd4_secinfo_no_name {
315 u32 sin_style; /* request */
316 struct svc_export *sin_exp; /* response */
317};
318
314struct nfsd4_setattr { 319struct nfsd4_setattr {
315 stateid_t sa_stateid; /* request */ 320 stateid_t sa_stateid; /* request */
316 u32 sa_bmval[3]; /* request */ 321 u32 sa_bmval[3]; /* request */
@@ -373,8 +378,8 @@ struct nfsd4_sequence {
373 u32 cachethis; /* request */ 378 u32 cachethis; /* request */
374#if 0 379#if 0
375 u32 target_maxslots; /* response */ 380 u32 target_maxslots; /* response */
376 u32 status_flags; /* response */
377#endif /* not yet */ 381#endif /* not yet */
382 u32 status_flags; /* response */
378}; 383};
379 384
380struct nfsd4_destroy_session { 385struct nfsd4_destroy_session {
@@ -422,6 +427,7 @@ struct nfsd4_op {
422 427
423 /* NFSv4.1 */ 428 /* NFSv4.1 */
424 struct nfsd4_exchange_id exchange_id; 429 struct nfsd4_exchange_id exchange_id;
430 struct nfsd4_bind_conn_to_session bind_conn_to_session;
425 struct nfsd4_create_session create_session; 431 struct nfsd4_create_session create_session;
426 struct nfsd4_destroy_session destroy_session; 432 struct nfsd4_destroy_session destroy_session;
427 struct nfsd4_sequence sequence; 433 struct nfsd4_sequence sequence;
@@ -484,18 +490,17 @@ static inline bool nfsd4_not_cached(struct nfsd4_compoundres *resp)
484static inline void 490static inline void
485set_change_info(struct nfsd4_change_info *cinfo, struct svc_fh *fhp) 491set_change_info(struct nfsd4_change_info *cinfo, struct svc_fh *fhp)
486{ 492{
487 BUG_ON(!fhp->fh_pre_saved || !fhp->fh_post_saved); 493 BUG_ON(!fhp->fh_pre_saved);
488 cinfo->atomic = 1; 494 cinfo->atomic = fhp->fh_post_saved;
489 cinfo->change_supported = IS_I_VERSION(fhp->fh_dentry->d_inode); 495 cinfo->change_supported = IS_I_VERSION(fhp->fh_dentry->d_inode);
490 if (cinfo->change_supported) { 496
491 cinfo->before_change = fhp->fh_pre_change; 497 cinfo->before_change = fhp->fh_pre_change;
492 cinfo->after_change = fhp->fh_post_change; 498 cinfo->after_change = fhp->fh_post_change;
493 } else { 499 cinfo->before_ctime_sec = fhp->fh_pre_ctime.tv_sec;
494 cinfo->before_ctime_sec = fhp->fh_pre_ctime.tv_sec; 500 cinfo->before_ctime_nsec = fhp->fh_pre_ctime.tv_nsec;
495 cinfo->before_ctime_nsec = fhp->fh_pre_ctime.tv_nsec; 501 cinfo->after_ctime_sec = fhp->fh_post_attr.ctime.tv_sec;
496 cinfo->after_ctime_sec = fhp->fh_post_attr.ctime.tv_sec; 502 cinfo->after_ctime_nsec = fhp->fh_post_attr.ctime.tv_nsec;
497 cinfo->after_ctime_nsec = fhp->fh_post_attr.ctime.tv_nsec; 503
498 }
499} 504}
500 505
501int nfs4svc_encode_voidres(struct svc_rqst *, __be32 *, void *); 506int nfs4svc_encode_voidres(struct svc_rqst *, __be32 *, void *);
@@ -519,6 +524,7 @@ extern __be32 nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp,
519 struct nfsd4_sequence *seq); 524 struct nfsd4_sequence *seq);
520extern __be32 nfsd4_exchange_id(struct svc_rqst *rqstp, 525extern __be32 nfsd4_exchange_id(struct svc_rqst *rqstp,
521 struct nfsd4_compound_state *, struct nfsd4_exchange_id *); 526 struct nfsd4_compound_state *, struct nfsd4_exchange_id *);
527extern __be32 nfsd4_bind_conn_to_session(struct svc_rqst *, struct nfsd4_compound_state *, struct nfsd4_bind_conn_to_session *);
522extern __be32 nfsd4_create_session(struct svc_rqst *, 528extern __be32 nfsd4_create_session(struct svc_rqst *,
523 struct nfsd4_compound_state *, 529 struct nfsd4_compound_state *,
524 struct nfsd4_create_session *); 530 struct nfsd4_create_session *);
diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c
index 8b782b062ba..3ee67c67cc5 100644
--- a/fs/nilfs2/bmap.c
+++ b/fs/nilfs2/bmap.c
@@ -35,7 +35,20 @@
35 35
36struct inode *nilfs_bmap_get_dat(const struct nilfs_bmap *bmap) 36struct inode *nilfs_bmap_get_dat(const struct nilfs_bmap *bmap)
37{ 37{
38 return nilfs_dat_inode(NILFS_I_NILFS(bmap->b_inode)); 38 return NILFS_I_NILFS(bmap->b_inode)->ns_dat;
39}
40
41static int nilfs_bmap_convert_error(struct nilfs_bmap *bmap,
42 const char *fname, int err)
43{
44 struct inode *inode = bmap->b_inode;
45
46 if (err == -EINVAL) {
47 nilfs_error(inode->i_sb, fname,
48 "broken bmap (inode number=%lu)\n", inode->i_ino);
49 err = -EIO;
50 }
51 return err;
39} 52}
40 53
41/** 54/**
@@ -66,8 +79,10 @@ int nilfs_bmap_lookup_at_level(struct nilfs_bmap *bmap, __u64 key, int level,
66 79
67 down_read(&bmap->b_sem); 80 down_read(&bmap->b_sem);
68 ret = bmap->b_ops->bop_lookup(bmap, key, level, ptrp); 81 ret = bmap->b_ops->bop_lookup(bmap, key, level, ptrp);
69 if (ret < 0) 82 if (ret < 0) {
83 ret = nilfs_bmap_convert_error(bmap, __func__, ret);
70 goto out; 84 goto out;
85 }
71 if (NILFS_BMAP_USE_VBN(bmap)) { 86 if (NILFS_BMAP_USE_VBN(bmap)) {
72 ret = nilfs_dat_translate(nilfs_bmap_get_dat(bmap), *ptrp, 87 ret = nilfs_dat_translate(nilfs_bmap_get_dat(bmap), *ptrp,
73 &blocknr); 88 &blocknr);
@@ -88,7 +103,8 @@ int nilfs_bmap_lookup_contig(struct nilfs_bmap *bmap, __u64 key, __u64 *ptrp,
88 down_read(&bmap->b_sem); 103 down_read(&bmap->b_sem);
89 ret = bmap->b_ops->bop_lookup_contig(bmap, key, ptrp, maxblocks); 104 ret = bmap->b_ops->bop_lookup_contig(bmap, key, ptrp, maxblocks);
90 up_read(&bmap->b_sem); 105 up_read(&bmap->b_sem);
91 return ret; 106
107 return nilfs_bmap_convert_error(bmap, __func__, ret);
92} 108}
93 109
94static int nilfs_bmap_do_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr) 110static int nilfs_bmap_do_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr)
@@ -144,7 +160,8 @@ int nilfs_bmap_insert(struct nilfs_bmap *bmap,
144 down_write(&bmap->b_sem); 160 down_write(&bmap->b_sem);
145 ret = nilfs_bmap_do_insert(bmap, key, rec); 161 ret = nilfs_bmap_do_insert(bmap, key, rec);
146 up_write(&bmap->b_sem); 162 up_write(&bmap->b_sem);
147 return ret; 163
164 return nilfs_bmap_convert_error(bmap, __func__, ret);
148} 165}
149 166
150static int nilfs_bmap_do_delete(struct nilfs_bmap *bmap, __u64 key) 167static int nilfs_bmap_do_delete(struct nilfs_bmap *bmap, __u64 key)
@@ -180,9 +197,12 @@ int nilfs_bmap_last_key(struct nilfs_bmap *bmap, unsigned long *key)
180 197
181 down_read(&bmap->b_sem); 198 down_read(&bmap->b_sem);
182 ret = bmap->b_ops->bop_last_key(bmap, &lastkey); 199 ret = bmap->b_ops->bop_last_key(bmap, &lastkey);
183 if (!ret)
184 *key = lastkey;
185 up_read(&bmap->b_sem); 200 up_read(&bmap->b_sem);
201
202 if (ret < 0)
203 ret = nilfs_bmap_convert_error(bmap, __func__, ret);
204 else
205 *key = lastkey;
186 return ret; 206 return ret;
187} 207}
188 208
@@ -210,7 +230,8 @@ int nilfs_bmap_delete(struct nilfs_bmap *bmap, unsigned long key)
210 down_write(&bmap->b_sem); 230 down_write(&bmap->b_sem);
211 ret = nilfs_bmap_do_delete(bmap, key); 231 ret = nilfs_bmap_do_delete(bmap, key);
212 up_write(&bmap->b_sem); 232 up_write(&bmap->b_sem);
213 return ret; 233
234 return nilfs_bmap_convert_error(bmap, __func__, ret);
214} 235}
215 236
216static int nilfs_bmap_do_truncate(struct nilfs_bmap *bmap, unsigned long key) 237static int nilfs_bmap_do_truncate(struct nilfs_bmap *bmap, unsigned long key)
@@ -261,7 +282,8 @@ int nilfs_bmap_truncate(struct nilfs_bmap *bmap, unsigned long key)
261 down_write(&bmap->b_sem); 282 down_write(&bmap->b_sem);
262 ret = nilfs_bmap_do_truncate(bmap, key); 283 ret = nilfs_bmap_do_truncate(bmap, key);
263 up_write(&bmap->b_sem); 284 up_write(&bmap->b_sem);
264 return ret; 285
286 return nilfs_bmap_convert_error(bmap, __func__, ret);
265} 287}
266 288
267/** 289/**
@@ -300,7 +322,8 @@ int nilfs_bmap_propagate(struct nilfs_bmap *bmap, struct buffer_head *bh)
300 down_write(&bmap->b_sem); 322 down_write(&bmap->b_sem);
301 ret = bmap->b_ops->bop_propagate(bmap, bh); 323 ret = bmap->b_ops->bop_propagate(bmap, bh);
302 up_write(&bmap->b_sem); 324 up_write(&bmap->b_sem);
303 return ret; 325
326 return nilfs_bmap_convert_error(bmap, __func__, ret);
304} 327}
305 328
306/** 329/**
@@ -344,7 +367,8 @@ int nilfs_bmap_assign(struct nilfs_bmap *bmap,
344 down_write(&bmap->b_sem); 367 down_write(&bmap->b_sem);
345 ret = bmap->b_ops->bop_assign(bmap, bh, blocknr, binfo); 368 ret = bmap->b_ops->bop_assign(bmap, bh, blocknr, binfo);
346 up_write(&bmap->b_sem); 369 up_write(&bmap->b_sem);
347 return ret; 370
371 return nilfs_bmap_convert_error(bmap, __func__, ret);
348} 372}
349 373
350/** 374/**
@@ -373,7 +397,8 @@ int nilfs_bmap_mark(struct nilfs_bmap *bmap, __u64 key, int level)
373 down_write(&bmap->b_sem); 397 down_write(&bmap->b_sem);
374 ret = bmap->b_ops->bop_mark(bmap, key, level); 398 ret = bmap->b_ops->bop_mark(bmap, key, level);
375 up_write(&bmap->b_sem); 399 up_write(&bmap->b_sem);
376 return ret; 400
401 return nilfs_bmap_convert_error(bmap, __func__, ret);
377} 402}
378 403
379/** 404/**
diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c
index 5115814cb74..388e9e8f528 100644
--- a/fs/nilfs2/btnode.c
+++ b/fs/nilfs2/btnode.c
@@ -104,8 +104,7 @@ int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr,
104 if (pblocknr == 0) { 104 if (pblocknr == 0) {
105 pblocknr = blocknr; 105 pblocknr = blocknr;
106 if (inode->i_ino != NILFS_DAT_INO) { 106 if (inode->i_ino != NILFS_DAT_INO) {
107 struct inode *dat = 107 struct inode *dat = NILFS_I_NILFS(inode)->ns_dat;
108 nilfs_dat_inode(NILFS_I_NILFS(inode));
109 108
110 /* blocknr is a virtual block number */ 109 /* blocknr is a virtual block number */
111 err = nilfs_dat_translate(dat, blocknr, &pblocknr); 110 err = nilfs_dat_translate(dat, blocknr, &pblocknr);
diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c
index 49c844dab33..59e5fe742f7 100644
--- a/fs/nilfs2/dat.c
+++ b/fs/nilfs2/dat.c
@@ -335,7 +335,7 @@ int nilfs_dat_move(struct inode *dat, __u64 vblocknr, sector_t blocknr)
335 * the device at this point. 335 * the device at this point.
336 * 336 *
337 * To prevent nilfs_dat_translate() from returning the 337 * To prevent nilfs_dat_translate() from returning the
338 * uncommited block number, this makes a copy of the entry 338 * uncommitted block number, this makes a copy of the entry
339 * buffer and redirects nilfs_dat_translate() to the copy. 339 * buffer and redirects nilfs_dat_translate() to the copy.
340 */ 340 */
341 if (!buffer_nilfs_redirected(entry_bh)) { 341 if (!buffer_nilfs_redirected(entry_bh)) {
diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c
index cb003c8ee1f..9d45773b79e 100644
--- a/fs/nilfs2/dir.c
+++ b/fs/nilfs2/dir.c
@@ -91,7 +91,6 @@ static void nilfs_commit_chunk(struct page *page,
91 unsigned from, unsigned to) 91 unsigned from, unsigned to)
92{ 92{
93 struct inode *dir = mapping->host; 93 struct inode *dir = mapping->host;
94 struct nilfs_sb_info *sbi = NILFS_SB(dir->i_sb);
95 loff_t pos = page_offset(page) + from; 94 loff_t pos = page_offset(page) + from;
96 unsigned len = to - from; 95 unsigned len = to - from;
97 unsigned nr_dirty, copied; 96 unsigned nr_dirty, copied;
@@ -103,7 +102,7 @@ static void nilfs_commit_chunk(struct page *page,
103 i_size_write(dir, pos + copied); 102 i_size_write(dir, pos + copied);
104 if (IS_DIRSYNC(dir)) 103 if (IS_DIRSYNC(dir))
105 nilfs_set_transaction_flag(NILFS_TI_SYNC); 104 nilfs_set_transaction_flag(NILFS_TI_SYNC);
106 err = nilfs_set_file_dirty(sbi, dir, nr_dirty); 105 err = nilfs_set_file_dirty(dir, nr_dirty);
107 WARN_ON(err); /* do not happen */ 106 WARN_ON(err); /* do not happen */
108 unlock_page(page); 107 unlock_page(page);
109} 108}
diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c
index c9a30d7ff6f..2f560c9fb80 100644
--- a/fs/nilfs2/file.c
+++ b/fs/nilfs2/file.c
@@ -155,6 +155,7 @@ const struct inode_operations nilfs_file_inode_operations = {
155 .truncate = nilfs_truncate, 155 .truncate = nilfs_truncate,
156 .setattr = nilfs_setattr, 156 .setattr = nilfs_setattr,
157 .permission = nilfs_permission, 157 .permission = nilfs_permission,
158 .fiemap = nilfs_fiemap,
158}; 159};
159 160
160/* end of file */ 161/* end of file */
diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c
index 33ad25ddd5c..caf9a6a3fb5 100644
--- a/fs/nilfs2/gcinode.c
+++ b/fs/nilfs2/gcinode.c
@@ -176,7 +176,6 @@ int nilfs_gccache_wait_and_mark_dirty(struct buffer_head *bh)
176int nilfs_init_gcinode(struct inode *inode) 176int nilfs_init_gcinode(struct inode *inode)
177{ 177{
178 struct nilfs_inode_info *ii = NILFS_I(inode); 178 struct nilfs_inode_info *ii = NILFS_I(inode);
179 struct the_nilfs *nilfs = NILFS_SB(inode->i_sb)->s_nilfs;
180 179
181 inode->i_mode = S_IFREG; 180 inode->i_mode = S_IFREG;
182 mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); 181 mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS);
@@ -186,14 +185,6 @@ int nilfs_init_gcinode(struct inode *inode)
186 ii->i_flags = 0; 185 ii->i_flags = 0;
187 nilfs_bmap_init_gc(ii->i_bmap); 186 nilfs_bmap_init_gc(ii->i_bmap);
188 187
189 /*
190 * Add the inode to GC inode list. Garbage Collection
191 * is serialized and no two processes manipulate the
192 * list simultaneously.
193 */
194 igrab(inode);
195 list_add(&NILFS_I(inode)->i_dirty, &nilfs->ns_gc_inodes);
196
197 return 0; 188 return 0;
198} 189}
199 190
diff --git a/fs/nilfs2/ifile.c b/fs/nilfs2/ifile.c
index 9f8a2da67f9..bfc73d3a30e 100644
--- a/fs/nilfs2/ifile.c
+++ b/fs/nilfs2/ifile.c
@@ -149,14 +149,9 @@ int nilfs_ifile_get_inode_block(struct inode *ifile, ino_t ino,
149 } 149 }
150 150
151 err = nilfs_palloc_get_entry_block(ifile, ino, 0, out_bh); 151 err = nilfs_palloc_get_entry_block(ifile, ino, 0, out_bh);
152 if (unlikely(err)) { 152 if (unlikely(err))
153 if (err == -EINVAL) 153 nilfs_warning(sb, __func__, "unable to read inode: %lu",
154 nilfs_error(sb, __func__, "ifile is broken"); 154 (unsigned long) ino);
155 else
156 nilfs_warning(sb, __func__,
157 "unable to read inode: %lu",
158 (unsigned long) ino);
159 }
160 return err; 155 return err;
161} 156}
162 157
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 71d4bc8464e..2fd440d8d6b 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -58,7 +58,7 @@ int nilfs_get_block(struct inode *inode, sector_t blkoff,
58 struct nilfs_inode_info *ii = NILFS_I(inode); 58 struct nilfs_inode_info *ii = NILFS_I(inode);
59 __u64 blknum = 0; 59 __u64 blknum = 0;
60 int err = 0, ret; 60 int err = 0, ret;
61 struct inode *dat = nilfs_dat_inode(NILFS_I_NILFS(inode)); 61 struct inode *dat = NILFS_I_NILFS(inode)->ns_dat;
62 unsigned maxblocks = bh_result->b_size >> inode->i_blkbits; 62 unsigned maxblocks = bh_result->b_size >> inode->i_blkbits;
63 63
64 down_read(&NILFS_MDT(dat)->mi_sem); 64 down_read(&NILFS_MDT(dat)->mi_sem);
@@ -96,11 +96,6 @@ int nilfs_get_block(struct inode *inode, sector_t blkoff,
96 inode->i_ino, 96 inode->i_ino,
97 (unsigned long long)blkoff); 97 (unsigned long long)blkoff);
98 err = 0; 98 err = 0;
99 } else if (err == -EINVAL) {
100 nilfs_error(inode->i_sb, __func__,
101 "broken bmap (inode=%lu)\n",
102 inode->i_ino);
103 err = -EIO;
104 } 99 }
105 nilfs_transaction_abort(inode->i_sb); 100 nilfs_transaction_abort(inode->i_sb);
106 goto out; 101 goto out;
@@ -109,6 +104,7 @@ int nilfs_get_block(struct inode *inode, sector_t blkoff,
109 nilfs_transaction_commit(inode->i_sb); /* never fails */ 104 nilfs_transaction_commit(inode->i_sb); /* never fails */
110 /* Error handling should be detailed */ 105 /* Error handling should be detailed */
111 set_buffer_new(bh_result); 106 set_buffer_new(bh_result);
107 set_buffer_delay(bh_result);
112 map_bh(bh_result, inode->i_sb, 0); /* dbn must be changed 108 map_bh(bh_result, inode->i_sb, 0); /* dbn must be changed
113 to proper value */ 109 to proper value */
114 } else if (ret == -ENOENT) { 110 } else if (ret == -ENOENT) {
@@ -185,10 +181,9 @@ static int nilfs_set_page_dirty(struct page *page)
185 181
186 if (ret) { 182 if (ret) {
187 struct inode *inode = page->mapping->host; 183 struct inode *inode = page->mapping->host;
188 struct nilfs_sb_info *sbi = NILFS_SB(inode->i_sb);
189 unsigned nr_dirty = 1 << (PAGE_SHIFT - inode->i_blkbits); 184 unsigned nr_dirty = 1 << (PAGE_SHIFT - inode->i_blkbits);
190 185
191 nilfs_set_file_dirty(sbi, inode, nr_dirty); 186 nilfs_set_file_dirty(inode, nr_dirty);
192 } 187 }
193 return ret; 188 return ret;
194} 189}
@@ -229,7 +224,7 @@ static int nilfs_write_end(struct file *file, struct address_space *mapping,
229 start + copied); 224 start + copied);
230 copied = generic_write_end(file, mapping, pos, len, copied, page, 225 copied = generic_write_end(file, mapping, pos, len, copied, page,
231 fsdata); 226 fsdata);
232 nilfs_set_file_dirty(NILFS_SB(inode->i_sb), inode, nr_dirty); 227 nilfs_set_file_dirty(inode, nr_dirty);
233 err = nilfs_transaction_commit(inode->i_sb); 228 err = nilfs_transaction_commit(inode->i_sb);
234 return err ? : copied; 229 return err ? : copied;
235} 230}
@@ -425,13 +420,12 @@ static int __nilfs_read_inode(struct super_block *sb,
425 struct nilfs_root *root, unsigned long ino, 420 struct nilfs_root *root, unsigned long ino,
426 struct inode *inode) 421 struct inode *inode)
427{ 422{
428 struct nilfs_sb_info *sbi = NILFS_SB(sb); 423 struct the_nilfs *nilfs = NILFS_SB(sb)->s_nilfs;
429 struct inode *dat = nilfs_dat_inode(sbi->s_nilfs);
430 struct buffer_head *bh; 424 struct buffer_head *bh;
431 struct nilfs_inode *raw_inode; 425 struct nilfs_inode *raw_inode;
432 int err; 426 int err;
433 427
434 down_read(&NILFS_MDT(dat)->mi_sem); /* XXX */ 428 down_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
435 err = nilfs_ifile_get_inode_block(root->ifile, ino, &bh); 429 err = nilfs_ifile_get_inode_block(root->ifile, ino, &bh);
436 if (unlikely(err)) 430 if (unlikely(err))
437 goto bad_inode; 431 goto bad_inode;
@@ -461,7 +455,7 @@ static int __nilfs_read_inode(struct super_block *sb,
461 } 455 }
462 nilfs_ifile_unmap_inode(root->ifile, ino, bh); 456 nilfs_ifile_unmap_inode(root->ifile, ino, bh);
463 brelse(bh); 457 brelse(bh);
464 up_read(&NILFS_MDT(dat)->mi_sem); /* XXX */ 458 up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
465 nilfs_set_inode_flags(inode); 459 nilfs_set_inode_flags(inode);
466 return 0; 460 return 0;
467 461
@@ -470,7 +464,7 @@ static int __nilfs_read_inode(struct super_block *sb,
470 brelse(bh); 464 brelse(bh);
471 465
472 bad_inode: 466 bad_inode:
473 up_read(&NILFS_MDT(dat)->mi_sem); /* XXX */ 467 up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
474 return err; 468 return err;
475} 469}
476 470
@@ -629,7 +623,7 @@ static void nilfs_truncate_bmap(struct nilfs_inode_info *ii,
629 623
630 if (!test_bit(NILFS_I_BMAP, &ii->i_state)) 624 if (!test_bit(NILFS_I_BMAP, &ii->i_state))
631 return; 625 return;
632 repeat: 626repeat:
633 ret = nilfs_bmap_last_key(ii->i_bmap, &b); 627 ret = nilfs_bmap_last_key(ii->i_bmap, &b);
634 if (ret == -ENOENT) 628 if (ret == -ENOENT)
635 return; 629 return;
@@ -646,14 +640,10 @@ static void nilfs_truncate_bmap(struct nilfs_inode_info *ii,
646 nilfs_bmap_truncate(ii->i_bmap, b) == 0)) 640 nilfs_bmap_truncate(ii->i_bmap, b) == 0))
647 goto repeat; 641 goto repeat;
648 642
649 failed: 643failed:
650 if (ret == -EINVAL) 644 nilfs_warning(ii->vfs_inode.i_sb, __func__,
651 nilfs_error(ii->vfs_inode.i_sb, __func__, 645 "failed to truncate bmap (ino=%lu, err=%d)",
652 "bmap is broken (ino=%lu)", ii->vfs_inode.i_ino); 646 ii->vfs_inode.i_ino, ret);
653 else
654 nilfs_warning(ii->vfs_inode.i_sb, __func__,
655 "failed to truncate bmap (ino=%lu, err=%d)",
656 ii->vfs_inode.i_ino, ret);
657} 647}
658 648
659void nilfs_truncate(struct inode *inode) 649void nilfs_truncate(struct inode *inode)
@@ -682,7 +672,7 @@ void nilfs_truncate(struct inode *inode)
682 nilfs_set_transaction_flag(NILFS_TI_SYNC); 672 nilfs_set_transaction_flag(NILFS_TI_SYNC);
683 673
684 nilfs_mark_inode_dirty(inode); 674 nilfs_mark_inode_dirty(inode);
685 nilfs_set_file_dirty(NILFS_SB(sb), inode, 0); 675 nilfs_set_file_dirty(inode, 0);
686 nilfs_transaction_commit(sb); 676 nilfs_transaction_commit(sb);
687 /* May construct a logical segment and may fail in sync mode. 677 /* May construct a logical segment and may fail in sync mode.
688 But truncate has no return value. */ 678 But truncate has no return value. */
@@ -785,20 +775,24 @@ out_err:
785 return err; 775 return err;
786} 776}
787 777
788int nilfs_permission(struct inode *inode, int mask) 778int nilfs_permission(struct inode *inode, int mask, unsigned int flags)
789{ 779{
790 struct nilfs_root *root = NILFS_I(inode)->i_root; 780 struct nilfs_root *root;
791 781
782 if (flags & IPERM_FLAG_RCU)
783 return -ECHILD;
784
785 root = NILFS_I(inode)->i_root;
792 if ((mask & MAY_WRITE) && root && 786 if ((mask & MAY_WRITE) && root &&
793 root->cno != NILFS_CPTREE_CURRENT_CNO) 787 root->cno != NILFS_CPTREE_CURRENT_CNO)
794 return -EROFS; /* snapshot is not writable */ 788 return -EROFS; /* snapshot is not writable */
795 789
796 return generic_permission(inode, mask, NULL); 790 return generic_permission(inode, mask, flags, NULL);
797} 791}
798 792
799int nilfs_load_inode_block(struct nilfs_sb_info *sbi, struct inode *inode, 793int nilfs_load_inode_block(struct inode *inode, struct buffer_head **pbh)
800 struct buffer_head **pbh)
801{ 794{
795 struct nilfs_sb_info *sbi = NILFS_SB(inode->i_sb);
802 struct nilfs_inode_info *ii = NILFS_I(inode); 796 struct nilfs_inode_info *ii = NILFS_I(inode);
803 int err; 797 int err;
804 798
@@ -839,9 +833,9 @@ int nilfs_inode_dirty(struct inode *inode)
839 return ret; 833 return ret;
840} 834}
841 835
842int nilfs_set_file_dirty(struct nilfs_sb_info *sbi, struct inode *inode, 836int nilfs_set_file_dirty(struct inode *inode, unsigned nr_dirty)
843 unsigned nr_dirty)
844{ 837{
838 struct nilfs_sb_info *sbi = NILFS_SB(inode->i_sb);
845 struct nilfs_inode_info *ii = NILFS_I(inode); 839 struct nilfs_inode_info *ii = NILFS_I(inode);
846 840
847 atomic_add(nr_dirty, &sbi->s_nilfs->ns_ndirtyblks); 841 atomic_add(nr_dirty, &sbi->s_nilfs->ns_ndirtyblks);
@@ -874,11 +868,10 @@ int nilfs_set_file_dirty(struct nilfs_sb_info *sbi, struct inode *inode,
874 868
875int nilfs_mark_inode_dirty(struct inode *inode) 869int nilfs_mark_inode_dirty(struct inode *inode)
876{ 870{
877 struct nilfs_sb_info *sbi = NILFS_SB(inode->i_sb);
878 struct buffer_head *ibh; 871 struct buffer_head *ibh;
879 int err; 872 int err;
880 873
881 err = nilfs_load_inode_block(sbi, inode, &ibh); 874 err = nilfs_load_inode_block(inode, &ibh);
882 if (unlikely(err)) { 875 if (unlikely(err)) {
883 nilfs_warning(inode->i_sb, __func__, 876 nilfs_warning(inode->i_sb, __func__,
884 "failed to reget inode block.\n"); 877 "failed to reget inode block.\n");
@@ -920,3 +913,134 @@ void nilfs_dirty_inode(struct inode *inode)
920 nilfs_mark_inode_dirty(inode); 913 nilfs_mark_inode_dirty(inode);
921 nilfs_transaction_commit(inode->i_sb); /* never fails */ 914 nilfs_transaction_commit(inode->i_sb); /* never fails */
922} 915}
916
917int nilfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
918 __u64 start, __u64 len)
919{
920 struct the_nilfs *nilfs = NILFS_I_NILFS(inode);
921 __u64 logical = 0, phys = 0, size = 0;
922 __u32 flags = 0;
923 loff_t isize;
924 sector_t blkoff, end_blkoff;
925 sector_t delalloc_blkoff;
926 unsigned long delalloc_blklen;
927 unsigned int blkbits = inode->i_blkbits;
928 int ret, n;
929
930 ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC);
931 if (ret)
932 return ret;
933
934 mutex_lock(&inode->i_mutex);
935
936 isize = i_size_read(inode);
937
938 blkoff = start >> blkbits;
939 end_blkoff = (start + len - 1) >> blkbits;
940
941 delalloc_blklen = nilfs_find_uncommitted_extent(inode, blkoff,
942 &delalloc_blkoff);
943
944 do {
945 __u64 blkphy;
946 unsigned int maxblocks;
947
948 if (delalloc_blklen && blkoff == delalloc_blkoff) {
949 if (size) {
950 /* End of the current extent */
951 ret = fiemap_fill_next_extent(
952 fieinfo, logical, phys, size, flags);
953 if (ret)
954 break;
955 }
956 if (blkoff > end_blkoff)
957 break;
958
959 flags = FIEMAP_EXTENT_MERGED | FIEMAP_EXTENT_DELALLOC;
960 logical = blkoff << blkbits;
961 phys = 0;
962 size = delalloc_blklen << blkbits;
963
964 blkoff = delalloc_blkoff + delalloc_blklen;
965 delalloc_blklen = nilfs_find_uncommitted_extent(
966 inode, blkoff, &delalloc_blkoff);
967 continue;
968 }
969
970 /*
971 * Limit the number of blocks that we look up so as
972 * not to get into the next delayed allocation extent.
973 */
974 maxblocks = INT_MAX;
975 if (delalloc_blklen)
976 maxblocks = min_t(sector_t, delalloc_blkoff - blkoff,
977 maxblocks);
978 blkphy = 0;
979
980 down_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
981 n = nilfs_bmap_lookup_contig(
982 NILFS_I(inode)->i_bmap, blkoff, &blkphy, maxblocks);
983 up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
984
985 if (n < 0) {
986 int past_eof;
987
988 if (unlikely(n != -ENOENT))
989 break; /* error */
990
991 /* HOLE */
992 blkoff++;
993 past_eof = ((blkoff << blkbits) >= isize);
994
995 if (size) {
996 /* End of the current extent */
997
998 if (past_eof)
999 flags |= FIEMAP_EXTENT_LAST;
1000
1001 ret = fiemap_fill_next_extent(
1002 fieinfo, logical, phys, size, flags);
1003 if (ret)
1004 break;
1005 size = 0;
1006 }
1007 if (blkoff > end_blkoff || past_eof)
1008 break;
1009 } else {
1010 if (size) {
1011 if (phys && blkphy << blkbits == phys + size) {
1012 /* The current extent goes on */
1013 size += n << blkbits;
1014 } else {
1015 /* Terminate the current extent */
1016 ret = fiemap_fill_next_extent(
1017 fieinfo, logical, phys, size,
1018 flags);
1019 if (ret || blkoff > end_blkoff)
1020 break;
1021
1022 /* Start another extent */
1023 flags = FIEMAP_EXTENT_MERGED;
1024 logical = blkoff << blkbits;
1025 phys = blkphy << blkbits;
1026 size = n << blkbits;
1027 }
1028 } else {
1029 /* Start a new extent */
1030 flags = FIEMAP_EXTENT_MERGED;
1031 logical = blkoff << blkbits;
1032 phys = blkphy << blkbits;
1033 size = n << blkbits;
1034 }
1035 blkoff += n;
1036 }
1037 cond_resched();
1038 } while (true);
1039
1040 /* If ret is 1 then we just hit the end of the extent array */
1041 if (ret == 1)
1042 ret = 0;
1043
1044 mutex_unlock(&inode->i_mutex);
1045 return ret;
1046}
diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index 3e90f86d5bf..496738963fd 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -233,7 +233,7 @@ nilfs_ioctl_do_get_vinfo(struct the_nilfs *nilfs, __u64 *posp, int flags,
233 int ret; 233 int ret;
234 234
235 down_read(&nilfs->ns_segctor_sem); 235 down_read(&nilfs->ns_segctor_sem);
236 ret = nilfs_dat_get_vinfo(nilfs_dat_inode(nilfs), buf, size, nmembs); 236 ret = nilfs_dat_get_vinfo(nilfs->ns_dat, buf, size, nmembs);
237 up_read(&nilfs->ns_segctor_sem); 237 up_read(&nilfs->ns_segctor_sem);
238 return ret; 238 return ret;
239} 239}
@@ -242,8 +242,7 @@ static ssize_t
242nilfs_ioctl_do_get_bdescs(struct the_nilfs *nilfs, __u64 *posp, int flags, 242nilfs_ioctl_do_get_bdescs(struct the_nilfs *nilfs, __u64 *posp, int flags,
243 void *buf, size_t size, size_t nmembs) 243 void *buf, size_t size, size_t nmembs)
244{ 244{
245 struct inode *dat = nilfs_dat_inode(nilfs); 245 struct nilfs_bmap *bmap = NILFS_I(nilfs->ns_dat)->i_bmap;
246 struct nilfs_bmap *bmap = NILFS_I(dat)->i_bmap;
247 struct nilfs_bdesc *bdescs = buf; 246 struct nilfs_bdesc *bdescs = buf;
248 int ret, i; 247 int ret, i;
249 248
@@ -337,6 +336,7 @@ static int nilfs_ioctl_move_blocks(struct super_block *sb,
337 struct nilfs_argv *argv, void *buf) 336 struct nilfs_argv *argv, void *buf)
338{ 337{
339 size_t nmembs = argv->v_nmembs; 338 size_t nmembs = argv->v_nmembs;
339 struct the_nilfs *nilfs = NILFS_SB(sb)->s_nilfs;
340 struct inode *inode; 340 struct inode *inode;
341 struct nilfs_vdesc *vdesc; 341 struct nilfs_vdesc *vdesc;
342 struct buffer_head *bh, *n; 342 struct buffer_head *bh, *n;
@@ -349,10 +349,21 @@ static int nilfs_ioctl_move_blocks(struct super_block *sb,
349 ino = vdesc->vd_ino; 349 ino = vdesc->vd_ino;
350 cno = vdesc->vd_cno; 350 cno = vdesc->vd_cno;
351 inode = nilfs_iget_for_gc(sb, ino, cno); 351 inode = nilfs_iget_for_gc(sb, ino, cno);
352 if (unlikely(inode == NULL)) { 352 if (IS_ERR(inode)) {
353 ret = -ENOMEM; 353 ret = PTR_ERR(inode);
354 goto failed; 354 goto failed;
355 } 355 }
356 if (list_empty(&NILFS_I(inode)->i_dirty)) {
357 /*
358 * Add the inode to GC inode list. Garbage Collection
359 * is serialized and no two processes manipulate the
360 * list simultaneously.
361 */
362 igrab(inode);
363 list_add(&NILFS_I(inode)->i_dirty,
364 &nilfs->ns_gc_inodes);
365 }
366
356 do { 367 do {
357 ret = nilfs_ioctl_move_inode_block(inode, vdesc, 368 ret = nilfs_ioctl_move_inode_block(inode, vdesc,
358 &buffers); 369 &buffers);
@@ -409,7 +420,7 @@ static int nilfs_ioctl_free_vblocknrs(struct the_nilfs *nilfs,
409 size_t nmembs = argv->v_nmembs; 420 size_t nmembs = argv->v_nmembs;
410 int ret; 421 int ret;
411 422
412 ret = nilfs_dat_freev(nilfs_dat_inode(nilfs), buf, nmembs); 423 ret = nilfs_dat_freev(nilfs->ns_dat, buf, nmembs);
413 424
414 return (ret < 0) ? ret : nmembs; 425 return (ret < 0) ? ret : nmembs;
415} 426}
@@ -418,8 +429,7 @@ static int nilfs_ioctl_mark_blocks_dirty(struct the_nilfs *nilfs,
418 struct nilfs_argv *argv, void *buf) 429 struct nilfs_argv *argv, void *buf)
419{ 430{
420 size_t nmembs = argv->v_nmembs; 431 size_t nmembs = argv->v_nmembs;
421 struct inode *dat = nilfs_dat_inode(nilfs); 432 struct nilfs_bmap *bmap = NILFS_I(nilfs->ns_dat)->i_bmap;
422 struct nilfs_bmap *bmap = NILFS_I(dat)->i_bmap;
423 struct nilfs_bdesc *bdescs = buf; 433 struct nilfs_bdesc *bdescs = buf;
424 int ret, i; 434 int ret, i;
425 435
@@ -438,7 +448,7 @@ static int nilfs_ioctl_mark_blocks_dirty(struct the_nilfs *nilfs,
438 /* skip dead block */ 448 /* skip dead block */
439 continue; 449 continue;
440 if (bdescs[i].bd_level == 0) { 450 if (bdescs[i].bd_level == 0) {
441 ret = nilfs_mdt_mark_block_dirty(dat, 451 ret = nilfs_mdt_mark_block_dirty(nilfs->ns_dat,
442 bdescs[i].bd_offset); 452 bdescs[i].bd_offset);
443 if (ret < 0) { 453 if (ret < 0) {
444 WARN_ON(ret == -ENOENT); 454 WARN_ON(ret == -ENOENT);
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index 39a5b84e2c9..6a0e2a189f6 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -237,8 +237,6 @@ static int nilfs_mdt_read_block(struct inode *inode, unsigned long block,
237 * 237 *
238 * %-ENOENT - the specified block does not exist (hole block) 238 * %-ENOENT - the specified block does not exist (hole block)
239 * 239 *
240 * %-EINVAL - bmap is broken. (the caller should call nilfs_error())
241 *
242 * %-EROFS - Read only filesystem (for create mode) 240 * %-EROFS - Read only filesystem (for create mode)
243 */ 241 */
244int nilfs_mdt_get_block(struct inode *inode, unsigned long blkoff, int create, 242int nilfs_mdt_get_block(struct inode *inode, unsigned long blkoff, int create,
@@ -273,8 +271,6 @@ int nilfs_mdt_get_block(struct inode *inode, unsigned long blkoff, int create,
273 * %-ENOMEM - Insufficient memory available. 271 * %-ENOMEM - Insufficient memory available.
274 * 272 *
275 * %-EIO - I/O error 273 * %-EIO - I/O error
276 *
277 * %-EINVAL - bmap is broken. (the caller should call nilfs_error())
278 */ 274 */
279int nilfs_mdt_delete_block(struct inode *inode, unsigned long block) 275int nilfs_mdt_delete_block(struct inode *inode, unsigned long block)
280{ 276{
@@ -350,8 +346,6 @@ int nilfs_mdt_forget_block(struct inode *inode, unsigned long block)
350 * %-EIO - I/O error 346 * %-EIO - I/O error
351 * 347 *
352 * %-ENOENT - the specified block does not exist (hole block) 348 * %-ENOENT - the specified block does not exist (hole block)
353 *
354 * %-EINVAL - bmap is broken. (the caller should call nilfs_error())
355 */ 349 */
356int nilfs_mdt_mark_block_dirty(struct inode *inode, unsigned long block) 350int nilfs_mdt_mark_block_dirty(struct inode *inode, unsigned long block)
357{ 351{
@@ -499,31 +493,29 @@ int nilfs_mdt_freeze_buffer(struct inode *inode, struct buffer_head *bh)
499 struct buffer_head *bh_frozen; 493 struct buffer_head *bh_frozen;
500 struct page *page; 494 struct page *page;
501 int blkbits = inode->i_blkbits; 495 int blkbits = inode->i_blkbits;
502 int ret = -ENOMEM;
503 496
504 page = grab_cache_page(&shadow->frozen_data, bh->b_page->index); 497 page = grab_cache_page(&shadow->frozen_data, bh->b_page->index);
505 if (!page) 498 if (!page)
506 return ret; 499 return -ENOMEM;
507 500
508 if (!page_has_buffers(page)) 501 if (!page_has_buffers(page))
509 create_empty_buffers(page, 1 << blkbits, 0); 502 create_empty_buffers(page, 1 << blkbits, 0);
510 503
511 bh_frozen = nilfs_page_get_nth_block(page, bh_offset(bh) >> blkbits); 504 bh_frozen = nilfs_page_get_nth_block(page, bh_offset(bh) >> blkbits);
512 if (bh_frozen) { 505
513 if (!buffer_uptodate(bh_frozen)) 506 if (!buffer_uptodate(bh_frozen))
514 nilfs_copy_buffer(bh_frozen, bh); 507 nilfs_copy_buffer(bh_frozen, bh);
515 if (list_empty(&bh_frozen->b_assoc_buffers)) { 508 if (list_empty(&bh_frozen->b_assoc_buffers)) {
516 list_add_tail(&bh_frozen->b_assoc_buffers, 509 list_add_tail(&bh_frozen->b_assoc_buffers,
517 &shadow->frozen_buffers); 510 &shadow->frozen_buffers);
518 set_buffer_nilfs_redirected(bh); 511 set_buffer_nilfs_redirected(bh);
519 } else { 512 } else {
520 brelse(bh_frozen); /* already frozen */ 513 brelse(bh_frozen); /* already frozen */
521 }
522 ret = 0;
523 } 514 }
515
524 unlock_page(page); 516 unlock_page(page);
525 page_cache_release(page); 517 page_cache_release(page);
526 return ret; 518 return 0;
527} 519}
528 520
529struct buffer_head * 521struct buffer_head *
diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c
index 6e9557ecf16..98034271cd0 100644
--- a/fs/nilfs2/namei.c
+++ b/fs/nilfs2/namei.c
@@ -577,6 +577,7 @@ const struct inode_operations nilfs_dir_inode_operations = {
577 .rename = nilfs_rename, 577 .rename = nilfs_rename,
578 .setattr = nilfs_setattr, 578 .setattr = nilfs_setattr,
579 .permission = nilfs_permission, 579 .permission = nilfs_permission,
580 .fiemap = nilfs_fiemap,
580}; 581};
581 582
582const struct inode_operations nilfs_special_inode_operations = { 583const struct inode_operations nilfs_special_inode_operations = {
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index f7560da5a56..777e8fd0430 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -190,11 +190,6 @@ static inline int nilfs_doing_construction(void)
190 return nilfs_test_transaction_flag(NILFS_TI_WRITER); 190 return nilfs_test_transaction_flag(NILFS_TI_WRITER);
191} 191}
192 192
193static inline struct inode *nilfs_dat_inode(const struct the_nilfs *nilfs)
194{
195 return nilfs->ns_dat;
196}
197
198/* 193/*
199 * function prototype 194 * function prototype
200 */ 195 */
@@ -256,14 +251,14 @@ extern void nilfs_update_inode(struct inode *, struct buffer_head *);
256extern void nilfs_truncate(struct inode *); 251extern void nilfs_truncate(struct inode *);
257extern void nilfs_evict_inode(struct inode *); 252extern void nilfs_evict_inode(struct inode *);
258extern int nilfs_setattr(struct dentry *, struct iattr *); 253extern int nilfs_setattr(struct dentry *, struct iattr *);
259int nilfs_permission(struct inode *inode, int mask); 254int nilfs_permission(struct inode *inode, int mask, unsigned int flags);
260extern int nilfs_load_inode_block(struct nilfs_sb_info *, struct inode *, 255int nilfs_load_inode_block(struct inode *inode, struct buffer_head **pbh);
261 struct buffer_head **);
262extern int nilfs_inode_dirty(struct inode *); 256extern int nilfs_inode_dirty(struct inode *);
263extern int nilfs_set_file_dirty(struct nilfs_sb_info *, struct inode *, 257int nilfs_set_file_dirty(struct inode *inode, unsigned nr_dirty);
264 unsigned);
265extern int nilfs_mark_inode_dirty(struct inode *); 258extern int nilfs_mark_inode_dirty(struct inode *);
266extern void nilfs_dirty_inode(struct inode *); 259extern void nilfs_dirty_inode(struct inode *);
260int nilfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
261 __u64 start, __u64 len);
267 262
268/* super.c */ 263/* super.c */
269extern struct inode *nilfs_alloc_inode(struct super_block *); 264extern struct inode *nilfs_alloc_inode(struct super_block *);
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index a6c3c2e817f..0c432416cfe 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -491,7 +491,7 @@ unsigned nilfs_page_count_clean_buffers(struct page *page,
491 } 491 }
492 return nc; 492 return nc;
493} 493}
494 494
495void nilfs_mapping_init_once(struct address_space *mapping) 495void nilfs_mapping_init_once(struct address_space *mapping)
496{ 496{
497 memset(mapping, 0, sizeof(*mapping)); 497 memset(mapping, 0, sizeof(*mapping));
@@ -546,3 +546,87 @@ int __nilfs_clear_page_dirty(struct page *page)
546 } 546 }
547 return TestClearPageDirty(page); 547 return TestClearPageDirty(page);
548} 548}
549
550/**
551 * nilfs_find_uncommitted_extent - find extent of uncommitted data
552 * @inode: inode
553 * @start_blk: start block offset (in)
554 * @blkoff: start offset of the found extent (out)
555 *
556 * This function searches an extent of buffers marked "delayed" which
557 * starts from a block offset equal to or larger than @start_blk. If
558 * such an extent was found, this will store the start offset in
559 * @blkoff and return its length in blocks. Otherwise, zero is
560 * returned.
561 */
562unsigned long nilfs_find_uncommitted_extent(struct inode *inode,
563 sector_t start_blk,
564 sector_t *blkoff)
565{
566 unsigned int i;
567 pgoff_t index;
568 unsigned int nblocks_in_page;
569 unsigned long length = 0;
570 sector_t b;
571 struct pagevec pvec;
572 struct page *page;
573
574 if (inode->i_mapping->nrpages == 0)
575 return 0;
576
577 index = start_blk >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
578 nblocks_in_page = 1U << (PAGE_CACHE_SHIFT - inode->i_blkbits);
579
580 pagevec_init(&pvec, 0);
581
582repeat:
583 pvec.nr = find_get_pages_contig(inode->i_mapping, index, PAGEVEC_SIZE,
584 pvec.pages);
585 if (pvec.nr == 0)
586 return length;
587
588 if (length > 0 && pvec.pages[0]->index > index)
589 goto out;
590
591 b = pvec.pages[0]->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
592 i = 0;
593 do {
594 page = pvec.pages[i];
595
596 lock_page(page);
597 if (page_has_buffers(page)) {
598 struct buffer_head *bh, *head;
599
600 bh = head = page_buffers(page);
601 do {
602 if (b < start_blk)
603 continue;
604 if (buffer_delay(bh)) {
605 if (length == 0)
606 *blkoff = b;
607 length++;
608 } else if (length > 0) {
609 goto out_locked;
610 }
611 } while (++b, bh = bh->b_this_page, bh != head);
612 } else {
613 if (length > 0)
614 goto out_locked;
615
616 b += nblocks_in_page;
617 }
618 unlock_page(page);
619
620 } while (++i < pagevec_count(&pvec));
621
622 index = page->index + 1;
623 pagevec_release(&pvec);
624 cond_resched();
625 goto repeat;
626
627out_locked:
628 unlock_page(page);
629out:
630 pagevec_release(&pvec);
631 return length;
632}
diff --git a/fs/nilfs2/page.h b/fs/nilfs2/page.h
index fb9e8a8a203..622df27cd89 100644
--- a/fs/nilfs2/page.h
+++ b/fs/nilfs2/page.h
@@ -66,6 +66,9 @@ void nilfs_mapping_init(struct address_space *mapping,
66 struct backing_dev_info *bdi, 66 struct backing_dev_info *bdi,
67 const struct address_space_operations *aops); 67 const struct address_space_operations *aops);
68unsigned nilfs_page_count_clean_buffers(struct page *, unsigned, unsigned); 68unsigned nilfs_page_count_clean_buffers(struct page *, unsigned, unsigned);
69unsigned long nilfs_find_uncommitted_extent(struct inode *inode,
70 sector_t start_blk,
71 sector_t *blkoff);
69 72
70#define NILFS_PAGE_BUG(page, m, a...) \ 73#define NILFS_PAGE_BUG(page, m, a...) \
71 do { nilfs_page_bug(page); BUG(); } while (0) 74 do { nilfs_page_bug(page); BUG(); } while (0)
diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c
index 5d2711c28da..3dfcd3b7d38 100644
--- a/fs/nilfs2/recovery.c
+++ b/fs/nilfs2/recovery.c
@@ -535,7 +535,7 @@ static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs,
535 if (unlikely(err)) 535 if (unlikely(err))
536 goto failed_page; 536 goto failed_page;
537 537
538 err = nilfs_set_file_dirty(sbi, inode, 1); 538 err = nilfs_set_file_dirty(inode, 1);
539 if (unlikely(err)) 539 if (unlikely(err))
540 goto failed_page; 540 goto failed_page;
541 541
diff --git a/fs/nilfs2/sb.h b/fs/nilfs2/sb.h
index 35a07157b98..7a17715f215 100644
--- a/fs/nilfs2/sb.h
+++ b/fs/nilfs2/sb.h
@@ -27,14 +27,6 @@
27#include <linux/types.h> 27#include <linux/types.h>
28#include <linux/fs.h> 28#include <linux/fs.h>
29 29
30/*
31 * Mount options
32 */
33struct nilfs_mount_options {
34 unsigned long mount_opt;
35 __u64 snapshot_cno;
36};
37
38struct the_nilfs; 30struct the_nilfs;
39struct nilfs_sc_info; 31struct nilfs_sc_info;
40 32
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 687d090cea3..55ebae5c7f3 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -504,17 +504,6 @@ static int nilfs_segctor_add_file_block(struct nilfs_sc_info *sci,
504 return err; 504 return err;
505} 505}
506 506
507static int nilfs_handle_bmap_error(int err, const char *fname,
508 struct inode *inode, struct super_block *sb)
509{
510 if (err == -EINVAL) {
511 nilfs_error(sb, fname, "broken bmap (inode=%lu)\n",
512 inode->i_ino);
513 err = -EIO;
514 }
515 return err;
516}
517
518/* 507/*
519 * Callback functions that enumerate, mark, and collect dirty blocks 508 * Callback functions that enumerate, mark, and collect dirty blocks
520 */ 509 */
@@ -524,9 +513,8 @@ static int nilfs_collect_file_data(struct nilfs_sc_info *sci,
524 int err; 513 int err;
525 514
526 err = nilfs_bmap_propagate(NILFS_I(inode)->i_bmap, bh); 515 err = nilfs_bmap_propagate(NILFS_I(inode)->i_bmap, bh);
527 if (unlikely(err < 0)) 516 if (err < 0)
528 return nilfs_handle_bmap_error(err, __func__, inode, 517 return err;
529 sci->sc_super);
530 518
531 err = nilfs_segctor_add_file_block(sci, bh, inode, 519 err = nilfs_segctor_add_file_block(sci, bh, inode,
532 sizeof(struct nilfs_binfo_v)); 520 sizeof(struct nilfs_binfo_v));
@@ -539,13 +527,7 @@ static int nilfs_collect_file_node(struct nilfs_sc_info *sci,
539 struct buffer_head *bh, 527 struct buffer_head *bh,
540 struct inode *inode) 528 struct inode *inode)
541{ 529{
542 int err; 530 return nilfs_bmap_propagate(NILFS_I(inode)->i_bmap, bh);
543
544 err = nilfs_bmap_propagate(NILFS_I(inode)->i_bmap, bh);
545 if (unlikely(err < 0))
546 return nilfs_handle_bmap_error(err, __func__, inode,
547 sci->sc_super);
548 return 0;
549} 531}
550 532
551static int nilfs_collect_file_bmap(struct nilfs_sc_info *sci, 533static int nilfs_collect_file_bmap(struct nilfs_sc_info *sci,
@@ -588,9 +570,8 @@ static int nilfs_collect_dat_data(struct nilfs_sc_info *sci,
588 int err; 570 int err;
589 571
590 err = nilfs_bmap_propagate(NILFS_I(inode)->i_bmap, bh); 572 err = nilfs_bmap_propagate(NILFS_I(inode)->i_bmap, bh);
591 if (unlikely(err < 0)) 573 if (err < 0)
592 return nilfs_handle_bmap_error(err, __func__, inode, 574 return err;
593 sci->sc_super);
594 575
595 err = nilfs_segctor_add_file_block(sci, bh, inode, sizeof(__le64)); 576 err = nilfs_segctor_add_file_block(sci, bh, inode, sizeof(__le64));
596 if (!err) 577 if (!err)
@@ -776,9 +757,8 @@ static int nilfs_test_metadata_dirty(struct the_nilfs *nilfs,
776 ret++; 757 ret++;
777 if (nilfs_mdt_fetch_dirty(nilfs->ns_sufile)) 758 if (nilfs_mdt_fetch_dirty(nilfs->ns_sufile))
778 ret++; 759 ret++;
779 if (ret || nilfs_doing_gc()) 760 if ((ret || nilfs_doing_gc()) && nilfs_mdt_fetch_dirty(nilfs->ns_dat))
780 if (nilfs_mdt_fetch_dirty(nilfs_dat_inode(nilfs))) 761 ret++;
781 ret++;
782 return ret; 762 return ret;
783} 763}
784 764
@@ -814,7 +794,7 @@ static void nilfs_segctor_clear_metadata_dirty(struct nilfs_sc_info *sci)
814 nilfs_mdt_clear_dirty(sci->sc_root->ifile); 794 nilfs_mdt_clear_dirty(sci->sc_root->ifile);
815 nilfs_mdt_clear_dirty(nilfs->ns_cpfile); 795 nilfs_mdt_clear_dirty(nilfs->ns_cpfile);
816 nilfs_mdt_clear_dirty(nilfs->ns_sufile); 796 nilfs_mdt_clear_dirty(nilfs->ns_sufile);
817 nilfs_mdt_clear_dirty(nilfs_dat_inode(nilfs)); 797 nilfs_mdt_clear_dirty(nilfs->ns_dat);
818} 798}
819 799
820static int nilfs_segctor_create_checkpoint(struct nilfs_sc_info *sci) 800static int nilfs_segctor_create_checkpoint(struct nilfs_sc_info *sci)
@@ -923,7 +903,7 @@ static void nilfs_segctor_fill_in_super_root(struct nilfs_sc_info *sci,
923 nilfs->ns_nongc_ctime : sci->sc_seg_ctime); 903 nilfs->ns_nongc_ctime : sci->sc_seg_ctime);
924 raw_sr->sr_flags = 0; 904 raw_sr->sr_flags = 0;
925 905
926 nilfs_write_inode_common(nilfs_dat_inode(nilfs), (void *)raw_sr + 906 nilfs_write_inode_common(nilfs->ns_dat, (void *)raw_sr +
927 NILFS_SR_DAT_OFFSET(isz), 1); 907 NILFS_SR_DAT_OFFSET(isz), 1);
928 nilfs_write_inode_common(nilfs->ns_cpfile, (void *)raw_sr + 908 nilfs_write_inode_common(nilfs->ns_cpfile, (void *)raw_sr +
929 NILFS_SR_CPFILE_OFFSET(isz), 1); 909 NILFS_SR_CPFILE_OFFSET(isz), 1);
@@ -1179,7 +1159,7 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode)
1179 sci->sc_stage.scnt++; /* Fall through */ 1159 sci->sc_stage.scnt++; /* Fall through */
1180 case NILFS_ST_DAT: 1160 case NILFS_ST_DAT:
1181 dat_stage: 1161 dat_stage:
1182 err = nilfs_segctor_scan_file(sci, nilfs_dat_inode(nilfs), 1162 err = nilfs_segctor_scan_file(sci, nilfs->ns_dat,
1183 &nilfs_sc_dat_ops); 1163 &nilfs_sc_dat_ops);
1184 if (unlikely(err)) 1164 if (unlikely(err))
1185 break; 1165 break;
@@ -1563,7 +1543,6 @@ nilfs_segctor_update_payload_blocknr(struct nilfs_sc_info *sci,
1563 return 0; 1543 return 0;
1564 1544
1565 failed_bmap: 1545 failed_bmap:
1566 err = nilfs_handle_bmap_error(err, __func__, inode, sci->sc_super);
1567 return err; 1546 return err;
1568} 1547}
1569 1548
@@ -1783,6 +1762,7 @@ static void nilfs_clear_copied_buffers(struct list_head *list, int err)
1783 if (!err) { 1762 if (!err) {
1784 set_buffer_uptodate(bh); 1763 set_buffer_uptodate(bh);
1785 clear_buffer_dirty(bh); 1764 clear_buffer_dirty(bh);
1765 clear_buffer_delay(bh);
1786 clear_buffer_nilfs_volatile(bh); 1766 clear_buffer_nilfs_volatile(bh);
1787 } 1767 }
1788 brelse(bh); /* for b_assoc_buffers */ 1768 brelse(bh); /* for b_assoc_buffers */
@@ -1909,6 +1889,7 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
1909 b_assoc_buffers) { 1889 b_assoc_buffers) {
1910 set_buffer_uptodate(bh); 1890 set_buffer_uptodate(bh);
1911 clear_buffer_dirty(bh); 1891 clear_buffer_dirty(bh);
1892 clear_buffer_delay(bh);
1912 clear_buffer_nilfs_volatile(bh); 1893 clear_buffer_nilfs_volatile(bh);
1913 clear_buffer_nilfs_redirected(bh); 1894 clear_buffer_nilfs_redirected(bh);
1914 if (bh == segbuf->sb_super_root) { 1895 if (bh == segbuf->sb_super_root) {
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index f804d41ec9d..58fd707174e 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -47,7 +47,6 @@
47#include <linux/crc32.h> 47#include <linux/crc32.h>
48#include <linux/vfs.h> 48#include <linux/vfs.h>
49#include <linux/writeback.h> 49#include <linux/writeback.h>
50#include <linux/kobject.h>
51#include <linux/seq_file.h> 50#include <linux/seq_file.h>
52#include <linux/mount.h> 51#include <linux/mount.h>
53#include "nilfs.h" 52#include "nilfs.h"
@@ -111,12 +110,17 @@ void nilfs_error(struct super_block *sb, const char *function,
111 const char *fmt, ...) 110 const char *fmt, ...)
112{ 111{
113 struct nilfs_sb_info *sbi = NILFS_SB(sb); 112 struct nilfs_sb_info *sbi = NILFS_SB(sb);
113 struct va_format vaf;
114 va_list args; 114 va_list args;
115 115
116 va_start(args, fmt); 116 va_start(args, fmt);
117 printk(KERN_CRIT "NILFS error (device %s): %s: ", sb->s_id, function); 117
118 vprintk(fmt, args); 118 vaf.fmt = fmt;
119 printk("\n"); 119 vaf.va = &args;
120
121 printk(KERN_CRIT "NILFS error (device %s): %s: %pV\n",
122 sb->s_id, function, &vaf);
123
120 va_end(args); 124 va_end(args);
121 125
122 if (!(sb->s_flags & MS_RDONLY)) { 126 if (!(sb->s_flags & MS_RDONLY)) {
@@ -136,13 +140,17 @@ void nilfs_error(struct super_block *sb, const char *function,
136void nilfs_warning(struct super_block *sb, const char *function, 140void nilfs_warning(struct super_block *sb, const char *function,
137 const char *fmt, ...) 141 const char *fmt, ...)
138{ 142{
143 struct va_format vaf;
139 va_list args; 144 va_list args;
140 145
141 va_start(args, fmt); 146 va_start(args, fmt);
142 printk(KERN_WARNING "NILFS warning (device %s): %s: ", 147
143 sb->s_id, function); 148 vaf.fmt = fmt;
144 vprintk(fmt, args); 149 vaf.va = &args;
145 printk("\n"); 150
151 printk(KERN_WARNING "NILFS warning (device %s): %s: %pV\n",
152 sb->s_id, function, &vaf);
153
146 va_end(args); 154 va_end(args);
147} 155}
148 156
@@ -162,10 +170,13 @@ struct inode *nilfs_alloc_inode(struct super_block *sb)
162 return &ii->vfs_inode; 170 return &ii->vfs_inode;
163} 171}
164 172
165void nilfs_destroy_inode(struct inode *inode) 173static void nilfs_i_callback(struct rcu_head *head)
166{ 174{
175 struct inode *inode = container_of(head, struct inode, i_rcu);
167 struct nilfs_mdt_info *mdi = NILFS_MDT(inode); 176 struct nilfs_mdt_info *mdi = NILFS_MDT(inode);
168 177
178 INIT_LIST_HEAD(&inode->i_dentry);
179
169 if (mdi) { 180 if (mdi) {
170 kfree(mdi->mi_bgl); /* kfree(NULL) is safe */ 181 kfree(mdi->mi_bgl); /* kfree(NULL) is safe */
171 kfree(mdi); 182 kfree(mdi);
@@ -173,6 +184,11 @@ void nilfs_destroy_inode(struct inode *inode)
173 kmem_cache_free(nilfs_inode_cachep, NILFS_I(inode)); 184 kmem_cache_free(nilfs_inode_cachep, NILFS_I(inode));
174} 185}
175 186
187void nilfs_destroy_inode(struct inode *inode)
188{
189 call_rcu(&inode->i_rcu, nilfs_i_callback);
190}
191
176static int nilfs_sync_super(struct nilfs_sb_info *sbi, int flag) 192static int nilfs_sync_super(struct nilfs_sb_info *sbi, int flag)
177{ 193{
178 struct the_nilfs *nilfs = sbi->s_nilfs; 194 struct the_nilfs *nilfs = sbi->s_nilfs;
@@ -688,7 +704,8 @@ skip_mount_setup:
688 sbp[0]->s_state = 704 sbp[0]->s_state =
689 cpu_to_le16(le16_to_cpu(sbp[0]->s_state) & ~NILFS_VALID_FS); 705 cpu_to_le16(le16_to_cpu(sbp[0]->s_state) & ~NILFS_VALID_FS);
690 /* synchronize sbp[1] with sbp[0] */ 706 /* synchronize sbp[1] with sbp[0] */
691 memcpy(sbp[1], sbp[0], nilfs->ns_sbsize); 707 if (sbp[1])
708 memcpy(sbp[1], sbp[0], nilfs->ns_sbsize);
692 return nilfs_commit_super(sbi, NILFS_SB_COMMIT_ALL); 709 return nilfs_commit_super(sbi, NILFS_SB_COMMIT_ALL);
693} 710}
694 711
@@ -838,7 +855,7 @@ static int nilfs_attach_snapshot(struct super_block *s, __u64 cno,
838 855
839static int nilfs_tree_was_touched(struct dentry *root_dentry) 856static int nilfs_tree_was_touched(struct dentry *root_dentry)
840{ 857{
841 return atomic_read(&root_dentry->d_count) > 1; 858 return root_dentry->d_count > 1;
842} 859}
843 860
844/** 861/**
@@ -1002,11 +1019,11 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
1002 struct nilfs_sb_info *sbi = NILFS_SB(sb); 1019 struct nilfs_sb_info *sbi = NILFS_SB(sb);
1003 struct the_nilfs *nilfs = sbi->s_nilfs; 1020 struct the_nilfs *nilfs = sbi->s_nilfs;
1004 unsigned long old_sb_flags; 1021 unsigned long old_sb_flags;
1005 struct nilfs_mount_options old_opts; 1022 unsigned long old_mount_opt;
1006 int err; 1023 int err;
1007 1024
1008 old_sb_flags = sb->s_flags; 1025 old_sb_flags = sb->s_flags;
1009 old_opts.mount_opt = sbi->s_mount_opt; 1026 old_mount_opt = sbi->s_mount_opt;
1010 1027
1011 if (!parse_options(data, sb, 1)) { 1028 if (!parse_options(data, sb, 1)) {
1012 err = -EINVAL; 1029 err = -EINVAL;
@@ -1075,7 +1092,7 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
1075 1092
1076 restore_opts: 1093 restore_opts:
1077 sb->s_flags = old_sb_flags; 1094 sb->s_flags = old_sb_flags;
1078 sbi->s_mount_opt = old_opts.mount_opt; 1095 sbi->s_mount_opt = old_mount_opt;
1079 return err; 1096 return err;
1080} 1097}
1081 1098
@@ -1147,14 +1164,14 @@ nilfs_mount(struct file_system_type *fs_type, int flags,
1147{ 1164{
1148 struct nilfs_super_data sd; 1165 struct nilfs_super_data sd;
1149 struct super_block *s; 1166 struct super_block *s;
1150 fmode_t mode = FMODE_READ; 1167 fmode_t mode = FMODE_READ | FMODE_EXCL;
1151 struct dentry *root_dentry; 1168 struct dentry *root_dentry;
1152 int err, s_new = false; 1169 int err, s_new = false;
1153 1170
1154 if (!(flags & MS_RDONLY)) 1171 if (!(flags & MS_RDONLY))
1155 mode |= FMODE_WRITE; 1172 mode |= FMODE_WRITE;
1156 1173
1157 sd.bdev = open_bdev_exclusive(dev_name, mode, fs_type); 1174 sd.bdev = blkdev_get_by_path(dev_name, mode, fs_type);
1158 if (IS_ERR(sd.bdev)) 1175 if (IS_ERR(sd.bdev))
1159 return ERR_CAST(sd.bdev); 1176 return ERR_CAST(sd.bdev);
1160 1177
@@ -1233,7 +1250,7 @@ nilfs_mount(struct file_system_type *fs_type, int flags,
1233 } 1250 }
1234 1251
1235 if (!s_new) 1252 if (!s_new)
1236 close_bdev_exclusive(sd.bdev, mode); 1253 blkdev_put(sd.bdev, mode);
1237 1254
1238 return root_dentry; 1255 return root_dentry;
1239 1256
@@ -1242,7 +1259,7 @@ nilfs_mount(struct file_system_type *fs_type, int flags,
1242 1259
1243 failed: 1260 failed:
1244 if (!s_new) 1261 if (!s_new)
1245 close_bdev_exclusive(sd.bdev, mode); 1262 blkdev_put(sd.bdev, mode);
1246 return ERR_PTR(err); 1263 return ERR_PTR(err);
1247} 1264}
1248 1265
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index 0254be2d73c..ad4ac607cf5 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -329,7 +329,6 @@ int load_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
329 printk(KERN_INFO "NILFS: recovery complete.\n"); 329 printk(KERN_INFO "NILFS: recovery complete.\n");
330 330
331 skip_recovery: 331 skip_recovery:
332 set_nilfs_loaded(nilfs);
333 nilfs_clear_recovery_info(&ri); 332 nilfs_clear_recovery_info(&ri);
334 sbi->s_super->s_flags = s_flags; 333 sbi->s_super->s_flags = s_flags;
335 return 0; 334 return 0;
@@ -651,12 +650,11 @@ int nilfs_discard_segments(struct the_nilfs *nilfs, __u64 *segnump,
651 650
652int nilfs_count_free_blocks(struct the_nilfs *nilfs, sector_t *nblocks) 651int nilfs_count_free_blocks(struct the_nilfs *nilfs, sector_t *nblocks)
653{ 652{
654 struct inode *dat = nilfs_dat_inode(nilfs);
655 unsigned long ncleansegs; 653 unsigned long ncleansegs;
656 654
657 down_read(&NILFS_MDT(dat)->mi_sem); /* XXX */ 655 down_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
658 ncleansegs = nilfs_sufile_get_ncleansegs(nilfs->ns_sufile); 656 ncleansegs = nilfs_sufile_get_ncleansegs(nilfs->ns_sufile);
659 up_read(&NILFS_MDT(dat)->mi_sem); /* XXX */ 657 up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
660 *nblocks = (sector_t)ncleansegs * nilfs->ns_blocks_per_segment; 658 *nblocks = (sector_t)ncleansegs * nilfs->ns_blocks_per_segment;
661 return 0; 659 return 0;
662} 660}
diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
index 69226e14b74..fd85e4c05c6 100644
--- a/fs/nilfs2/the_nilfs.h
+++ b/fs/nilfs2/the_nilfs.h
@@ -36,8 +36,6 @@
36/* the_nilfs struct */ 36/* the_nilfs struct */
37enum { 37enum {
38 THE_NILFS_INIT = 0, /* Information from super_block is set */ 38 THE_NILFS_INIT = 0, /* Information from super_block is set */
39 THE_NILFS_LOADED, /* Roll-back/roll-forward has done and
40 the latest checkpoint was loaded */
41 THE_NILFS_DISCONTINUED, /* 'next' pointer chain has broken */ 39 THE_NILFS_DISCONTINUED, /* 'next' pointer chain has broken */
42 THE_NILFS_GC_RUNNING, /* gc process is running */ 40 THE_NILFS_GC_RUNNING, /* gc process is running */
43 THE_NILFS_SB_DIRTY, /* super block is dirty */ 41 THE_NILFS_SB_DIRTY, /* super block is dirty */
@@ -178,7 +176,6 @@ static inline int nilfs_##name(struct the_nilfs *nilfs) \
178} 176}
179 177
180THE_NILFS_FNS(INIT, init) 178THE_NILFS_FNS(INIT, init)
181THE_NILFS_FNS(LOADED, loaded)
182THE_NILFS_FNS(DISCONTINUED, discontinued) 179THE_NILFS_FNS(DISCONTINUED, discontinued)
183THE_NILFS_FNS(GC_RUNNING, gc_running) 180THE_NILFS_FNS(GC_RUNNING, gc_running)
184THE_NILFS_FNS(SB_DIRTY, sb_dirty) 181THE_NILFS_FNS(SB_DIRTY, sb_dirty)
diff --git a/fs/notify/fanotify/Kconfig b/fs/notify/fanotify/Kconfig
index 3ac36b7bf6b..7dceff005a6 100644
--- a/fs/notify/fanotify/Kconfig
+++ b/fs/notify/fanotify/Kconfig
@@ -6,7 +6,7 @@ config FANOTIFY
6 ---help--- 6 ---help---
7 Say Y here to enable fanotify suport. fanotify is a file access 7 Say Y here to enable fanotify suport. fanotify is a file access
8 notification system which differs from inotify in that it sends 8 notification system which differs from inotify in that it sends
9 and open file descriptor to the userspace listener along with 9 an open file descriptor to the userspace listener along with
10 the event. 10 the event.
11 11
12 If unsure, say Y. 12 If unsure, say Y.
diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index b04f88eed09..f35794b97e8 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -92,7 +92,11 @@ static int fanotify_get_response_from_access(struct fsnotify_group *group,
92 92
93 pr_debug("%s: group=%p event=%p\n", __func__, group, event); 93 pr_debug("%s: group=%p event=%p\n", __func__, group, event);
94 94
95 wait_event(group->fanotify_data.access_waitq, event->response); 95 wait_event(group->fanotify_data.access_waitq, event->response ||
96 atomic_read(&group->fanotify_data.bypass_perm));
97
98 if (!event->response) /* bypass_perm set */
99 return 0;
96 100
97 /* userspace responded, convert to something usable */ 101 /* userspace responded, convert to something usable */
98 spin_lock(&event->lock); 102 spin_lock(&event->lock);
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 063224812b7..8b61220cffc 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -106,20 +106,29 @@ static int create_fd(struct fsnotify_group *group, struct fsnotify_event *event)
106 return client_fd; 106 return client_fd;
107} 107}
108 108
109static ssize_t fill_event_metadata(struct fsnotify_group *group, 109static int fill_event_metadata(struct fsnotify_group *group,
110 struct fanotify_event_metadata *metadata, 110 struct fanotify_event_metadata *metadata,
111 struct fsnotify_event *event) 111 struct fsnotify_event *event)
112{ 112{
113 int ret = 0;
114
113 pr_debug("%s: group=%p metadata=%p event=%p\n", __func__, 115 pr_debug("%s: group=%p metadata=%p event=%p\n", __func__,
114 group, metadata, event); 116 group, metadata, event);
115 117
116 metadata->event_len = FAN_EVENT_METADATA_LEN; 118 metadata->event_len = FAN_EVENT_METADATA_LEN;
119 metadata->metadata_len = FAN_EVENT_METADATA_LEN;
117 metadata->vers = FANOTIFY_METADATA_VERSION; 120 metadata->vers = FANOTIFY_METADATA_VERSION;
118 metadata->mask = event->mask & FAN_ALL_OUTGOING_EVENTS; 121 metadata->mask = event->mask & FAN_ALL_OUTGOING_EVENTS;
119 metadata->pid = pid_vnr(event->tgid); 122 metadata->pid = pid_vnr(event->tgid);
120 metadata->fd = create_fd(group, event); 123 if (unlikely(event->mask & FAN_Q_OVERFLOW))
124 metadata->fd = FAN_NOFD;
125 else {
126 metadata->fd = create_fd(group, event);
127 if (metadata->fd < 0)
128 ret = metadata->fd;
129 }
121 130
122 return metadata->fd; 131 return ret;
123} 132}
124 133
125#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS 134#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
@@ -200,7 +209,7 @@ static int prepare_for_access_response(struct fsnotify_group *group,
200 209
201 mutex_lock(&group->fanotify_data.access_mutex); 210 mutex_lock(&group->fanotify_data.access_mutex);
202 211
203 if (group->fanotify_data.bypass_perm) { 212 if (atomic_read(&group->fanotify_data.bypass_perm)) {
204 mutex_unlock(&group->fanotify_data.access_mutex); 213 mutex_unlock(&group->fanotify_data.access_mutex);
205 kmem_cache_free(fanotify_response_event_cache, re); 214 kmem_cache_free(fanotify_response_event_cache, re);
206 event->response = FAN_ALLOW; 215 event->response = FAN_ALLOW;
@@ -257,24 +266,34 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
257 266
258 pr_debug("%s: group=%p event=%p\n", __func__, group, event); 267 pr_debug("%s: group=%p event=%p\n", __func__, group, event);
259 268
260 fd = fill_event_metadata(group, &fanotify_event_metadata, event); 269 ret = fill_event_metadata(group, &fanotify_event_metadata, event);
261 if (fd < 0) 270 if (ret < 0)
262 return fd; 271 goto out;
263 272
273 fd = fanotify_event_metadata.fd;
264 ret = prepare_for_access_response(group, event, fd); 274 ret = prepare_for_access_response(group, event, fd);
265 if (ret) 275 if (ret)
266 goto out_close_fd; 276 goto out_close_fd;
267 277
268 ret = -EFAULT; 278 ret = -EFAULT;
269 if (copy_to_user(buf, &fanotify_event_metadata, FAN_EVENT_METADATA_LEN)) 279 if (copy_to_user(buf, &fanotify_event_metadata,
280 fanotify_event_metadata.event_len))
270 goto out_kill_access_response; 281 goto out_kill_access_response;
271 282
272 return FAN_EVENT_METADATA_LEN; 283 return fanotify_event_metadata.event_len;
273 284
274out_kill_access_response: 285out_kill_access_response:
275 remove_access_response(group, event, fd); 286 remove_access_response(group, event, fd);
276out_close_fd: 287out_close_fd:
277 sys_close(fd); 288 if (fd != FAN_NOFD)
289 sys_close(fd);
290out:
291#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
292 if (event->mask & FAN_ALL_PERM_EVENTS) {
293 event->response = FAN_DENY;
294 wake_up(&group->fanotify_data.access_waitq);
295 }
296#endif
278 return ret; 297 return ret;
279} 298}
280 299
@@ -382,7 +401,7 @@ static int fanotify_release(struct inode *ignored, struct file *file)
382 401
383 mutex_lock(&group->fanotify_data.access_mutex); 402 mutex_lock(&group->fanotify_data.access_mutex);
384 403
385 group->fanotify_data.bypass_perm = true; 404 atomic_inc(&group->fanotify_data.bypass_perm);
386 405
387 list_for_each_entry_safe(re, lre, &group->fanotify_data.access_list, list) { 406 list_for_each_entry_safe(re, lre, &group->fanotify_data.access_list, list) {
388 pr_debug("%s: found group=%p re=%p event=%p\n", __func__, group, 407 pr_debug("%s: found group=%p re=%p event=%p\n", __func__, group,
@@ -586,11 +605,10 @@ static int fanotify_add_vfsmount_mark(struct fsnotify_group *group,
586{ 605{
587 struct fsnotify_mark *fsn_mark; 606 struct fsnotify_mark *fsn_mark;
588 __u32 added; 607 __u32 added;
608 int ret = 0;
589 609
590 fsn_mark = fsnotify_find_vfsmount_mark(group, mnt); 610 fsn_mark = fsnotify_find_vfsmount_mark(group, mnt);
591 if (!fsn_mark) { 611 if (!fsn_mark) {
592 int ret;
593
594 if (atomic_read(&group->num_marks) > group->fanotify_data.max_marks) 612 if (atomic_read(&group->num_marks) > group->fanotify_data.max_marks)
595 return -ENOSPC; 613 return -ENOSPC;
596 614
@@ -600,17 +618,16 @@ static int fanotify_add_vfsmount_mark(struct fsnotify_group *group,
600 618
601 fsnotify_init_mark(fsn_mark, fanotify_free_mark); 619 fsnotify_init_mark(fsn_mark, fanotify_free_mark);
602 ret = fsnotify_add_mark(fsn_mark, group, NULL, mnt, 0); 620 ret = fsnotify_add_mark(fsn_mark, group, NULL, mnt, 0);
603 if (ret) { 621 if (ret)
604 fanotify_free_mark(fsn_mark); 622 goto err;
605 return ret;
606 }
607 } 623 }
608 added = fanotify_mark_add_to_mask(fsn_mark, mask, flags); 624 added = fanotify_mark_add_to_mask(fsn_mark, mask, flags);
609 fsnotify_put_mark(fsn_mark); 625
610 if (added & ~mnt->mnt_fsnotify_mask) 626 if (added & ~mnt->mnt_fsnotify_mask)
611 fsnotify_recalc_vfsmount_mask(mnt); 627 fsnotify_recalc_vfsmount_mask(mnt);
612 628err:
613 return 0; 629 fsnotify_put_mark(fsn_mark);
630 return ret;
614} 631}
615 632
616static int fanotify_add_inode_mark(struct fsnotify_group *group, 633static int fanotify_add_inode_mark(struct fsnotify_group *group,
@@ -619,6 +636,7 @@ static int fanotify_add_inode_mark(struct fsnotify_group *group,
619{ 636{
620 struct fsnotify_mark *fsn_mark; 637 struct fsnotify_mark *fsn_mark;
621 __u32 added; 638 __u32 added;
639 int ret = 0;
622 640
623 pr_debug("%s: group=%p inode=%p\n", __func__, group, inode); 641 pr_debug("%s: group=%p inode=%p\n", __func__, group, inode);
624 642
@@ -634,8 +652,6 @@ static int fanotify_add_inode_mark(struct fsnotify_group *group,
634 652
635 fsn_mark = fsnotify_find_inode_mark(group, inode); 653 fsn_mark = fsnotify_find_inode_mark(group, inode);
636 if (!fsn_mark) { 654 if (!fsn_mark) {
637 int ret;
638
639 if (atomic_read(&group->num_marks) > group->fanotify_data.max_marks) 655 if (atomic_read(&group->num_marks) > group->fanotify_data.max_marks)
640 return -ENOSPC; 656 return -ENOSPC;
641 657
@@ -645,16 +661,16 @@ static int fanotify_add_inode_mark(struct fsnotify_group *group,
645 661
646 fsnotify_init_mark(fsn_mark, fanotify_free_mark); 662 fsnotify_init_mark(fsn_mark, fanotify_free_mark);
647 ret = fsnotify_add_mark(fsn_mark, group, inode, NULL, 0); 663 ret = fsnotify_add_mark(fsn_mark, group, inode, NULL, 0);
648 if (ret) { 664 if (ret)
649 fanotify_free_mark(fsn_mark); 665 goto err;
650 return ret;
651 }
652 } 666 }
653 added = fanotify_mark_add_to_mask(fsn_mark, mask, flags); 667 added = fanotify_mark_add_to_mask(fsn_mark, mask, flags);
654 fsnotify_put_mark(fsn_mark); 668
655 if (added & ~inode->i_fsnotify_mask) 669 if (added & ~inode->i_fsnotify_mask)
656 fsnotify_recalc_inode_mask(inode); 670 fsnotify_recalc_inode_mask(inode);
657 return 0; 671err:
672 fsnotify_put_mark(fsn_mark);
673 return ret;
658} 674}
659 675
660/* fanotify syscalls */ 676/* fanotify syscalls */
@@ -687,8 +703,10 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
687 703
688 /* fsnotify_alloc_group takes a ref. Dropped in fanotify_release */ 704 /* fsnotify_alloc_group takes a ref. Dropped in fanotify_release */
689 group = fsnotify_alloc_group(&fanotify_fsnotify_ops); 705 group = fsnotify_alloc_group(&fanotify_fsnotify_ops);
690 if (IS_ERR(group)) 706 if (IS_ERR(group)) {
707 free_uid(user);
691 return PTR_ERR(group); 708 return PTR_ERR(group);
709 }
692 710
693 group->fanotify_data.user = user; 711 group->fanotify_data.user = user;
694 atomic_inc(&user->fanotify_listeners); 712 atomic_inc(&user->fanotify_listeners);
@@ -698,6 +716,7 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
698 mutex_init(&group->fanotify_data.access_mutex); 716 mutex_init(&group->fanotify_data.access_mutex);
699 init_waitqueue_head(&group->fanotify_data.access_waitq); 717 init_waitqueue_head(&group->fanotify_data.access_waitq);
700 INIT_LIST_HEAD(&group->fanotify_data.access_list); 718 INIT_LIST_HEAD(&group->fanotify_data.access_list);
719 atomic_set(&group->fanotify_data.bypass_perm, 0);
701#endif 720#endif
702 switch (flags & FAN_ALL_CLASS_BITS) { 721 switch (flags & FAN_ALL_CLASS_BITS) {
703 case FAN_CLASS_NOTIF: 722 case FAN_CLASS_NOTIF:
@@ -764,8 +783,10 @@ SYSCALL_DEFINE(fanotify_mark)(int fanotify_fd, unsigned int flags,
764 if (flags & ~FAN_ALL_MARK_FLAGS) 783 if (flags & ~FAN_ALL_MARK_FLAGS)
765 return -EINVAL; 784 return -EINVAL;
766 switch (flags & (FAN_MARK_ADD | FAN_MARK_REMOVE | FAN_MARK_FLUSH)) { 785 switch (flags & (FAN_MARK_ADD | FAN_MARK_REMOVE | FAN_MARK_FLUSH)) {
767 case FAN_MARK_ADD: 786 case FAN_MARK_ADD: /* fallthrough */
768 case FAN_MARK_REMOVE: 787 case FAN_MARK_REMOVE:
788 if (!mask)
789 return -EINVAL;
769 case FAN_MARK_FLUSH: 790 case FAN_MARK_FLUSH:
770 break; 791 break;
771 default: 792 default:
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index 20dc218707c..79b47cbb5cd 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -59,7 +59,7 @@ void __fsnotify_update_child_dentry_flags(struct inode *inode)
59 /* determine if the children should tell inode about their events */ 59 /* determine if the children should tell inode about their events */
60 watched = fsnotify_inode_watches_children(inode); 60 watched = fsnotify_inode_watches_children(inode);
61 61
62 spin_lock(&dcache_lock); 62 spin_lock(&inode->i_lock);
63 /* run all of the dentries associated with this inode. Since this is a 63 /* run all of the dentries associated with this inode. Since this is a
64 * directory, there damn well better only be one item on this list */ 64 * directory, there damn well better only be one item on this list */
65 list_for_each_entry(alias, &inode->i_dentry, d_alias) { 65 list_for_each_entry(alias, &inode->i_dentry, d_alias) {
@@ -68,19 +68,21 @@ void __fsnotify_update_child_dentry_flags(struct inode *inode)
68 /* run all of the children of the original inode and fix their 68 /* run all of the children of the original inode and fix their
69 * d_flags to indicate parental interest (their parent is the 69 * d_flags to indicate parental interest (their parent is the
70 * original inode) */ 70 * original inode) */
71 spin_lock(&alias->d_lock);
71 list_for_each_entry(child, &alias->d_subdirs, d_u.d_child) { 72 list_for_each_entry(child, &alias->d_subdirs, d_u.d_child) {
72 if (!child->d_inode) 73 if (!child->d_inode)
73 continue; 74 continue;
74 75
75 spin_lock(&child->d_lock); 76 spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_NESTED);
76 if (watched) 77 if (watched)
77 child->d_flags |= DCACHE_FSNOTIFY_PARENT_WATCHED; 78 child->d_flags |= DCACHE_FSNOTIFY_PARENT_WATCHED;
78 else 79 else
79 child->d_flags &= ~DCACHE_FSNOTIFY_PARENT_WATCHED; 80 child->d_flags &= ~DCACHE_FSNOTIFY_PARENT_WATCHED;
80 spin_unlock(&child->d_lock); 81 spin_unlock(&child->d_lock);
81 } 82 }
83 spin_unlock(&alias->d_lock);
82 } 84 }
83 spin_unlock(&dcache_lock); 85 spin_unlock(&inode->i_lock);
84} 86}
85 87
86/* Notify this dentry's parent about a child's events. */ 88/* Notify this dentry's parent about a child's events. */
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 444c305a468..4cd5d5d78f9 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -752,6 +752,7 @@ SYSCALL_DEFINE1(inotify_init1, int, flags)
752 if (ret >= 0) 752 if (ret >= 0)
753 return ret; 753 return ret;
754 754
755 fsnotify_put_group(group);
755 atomic_dec(&user->inotify_devs); 756 atomic_dec(&user->inotify_devs);
756out_free_uid: 757out_free_uid:
757 free_uid(user); 758 free_uid(user);
diff --git a/fs/ntfs/Makefile b/fs/ntfs/Makefile
index 58b6be99254..4ff028fcfd6 100644
--- a/fs/ntfs/Makefile
+++ b/fs/ntfs/Makefile
@@ -6,7 +6,7 @@ ntfs-objs := aops.o attrib.o collate.o compress.o debug.o dir.o file.o \
6 index.o inode.o mft.o mst.o namei.o runlist.o super.o sysctl.o \ 6 index.o inode.o mft.o mst.o namei.o runlist.o super.o sysctl.o \
7 unistr.o upcase.o 7 unistr.o upcase.o
8 8
9EXTRA_CFLAGS = -DNTFS_VERSION=\"2.1.29\" 9EXTRA_CFLAGS = -DNTFS_VERSION=\"2.1.30\"
10 10
11ifeq ($(CONFIG_NTFS_DEBUG),y) 11ifeq ($(CONFIG_NTFS_DEBUG),y)
12EXTRA_CFLAGS += -DDEBUG 12EXTRA_CFLAGS += -DDEBUG
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index 113ebd9f25a..f4b1057abdd 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * file.c - NTFS kernel file operations. Part of the Linux-NTFS project. 2 * file.c - NTFS kernel file operations. Part of the Linux-NTFS project.
3 * 3 *
4 * Copyright (c) 2001-2007 Anton Altaparmakov 4 * Copyright (c) 2001-2011 Anton Altaparmakov and Tuxera Inc.
5 * 5 *
6 * This program/include file is free software; you can redistribute it and/or 6 * This program/include file is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License as published 7 * modify it under the terms of the GNU General Public License as published
@@ -1380,15 +1380,14 @@ static inline void ntfs_set_next_iovec(const struct iovec **iovp,
1380 * pages (out to offset + bytes), to emulate ntfs_copy_from_user()'s 1380 * pages (out to offset + bytes), to emulate ntfs_copy_from_user()'s
1381 * single-segment behaviour. 1381 * single-segment behaviour.
1382 * 1382 *
1383 * We call the same helper (__ntfs_copy_from_user_iovec_inatomic()) both 1383 * We call the same helper (__ntfs_copy_from_user_iovec_inatomic()) both when
1384 * when atomic and when not atomic. This is ok because 1384 * atomic and when not atomic. This is ok because it calls
1385 * __ntfs_copy_from_user_iovec_inatomic() calls __copy_from_user_inatomic() 1385 * __copy_from_user_inatomic() and it is ok to call this when non-atomic. In
1386 * and it is ok to call this when non-atomic. 1386 * fact, the only difference between __copy_from_user_inatomic() and
1387 * Infact, the only difference between __copy_from_user_inatomic() and
1388 * __copy_from_user() is that the latter calls might_sleep() and the former 1387 * __copy_from_user() is that the latter calls might_sleep() and the former
1389 * should not zero the tail of the buffer on error. And on many 1388 * should not zero the tail of the buffer on error. And on many architectures
1390 * architectures __copy_from_user_inatomic() is just defined to 1389 * __copy_from_user_inatomic() is just defined to __copy_from_user() so it
1391 * __copy_from_user() so it makes no difference at all on those architectures. 1390 * makes no difference at all on those architectures.
1392 */ 1391 */
1393static inline size_t ntfs_copy_from_user_iovec(struct page **pages, 1392static inline size_t ntfs_copy_from_user_iovec(struct page **pages,
1394 unsigned nr_pages, unsigned ofs, const struct iovec **iov, 1393 unsigned nr_pages, unsigned ofs, const struct iovec **iov,
@@ -1409,28 +1408,28 @@ static inline size_t ntfs_copy_from_user_iovec(struct page **pages,
1409 if (unlikely(copied != len)) { 1408 if (unlikely(copied != len)) {
1410 /* Do it the slow way. */ 1409 /* Do it the slow way. */
1411 addr = kmap(*pages); 1410 addr = kmap(*pages);
1412 copied = __ntfs_copy_from_user_iovec_inatomic(addr + ofs, 1411 copied = __ntfs_copy_from_user_iovec_inatomic(addr +
1413 *iov, *iov_ofs, len); 1412 ofs, *iov, *iov_ofs, len);
1414 /*
1415 * Zero the rest of the target like __copy_from_user().
1416 */
1417 memset(addr + ofs + copied, 0, len - copied);
1418 kunmap(*pages);
1419 if (unlikely(copied != len)) 1413 if (unlikely(copied != len))
1420 goto err_out; 1414 goto err_out;
1415 kunmap(*pages);
1421 } 1416 }
1422 total += len; 1417 total += len;
1418 ntfs_set_next_iovec(iov, iov_ofs, len);
1423 bytes -= len; 1419 bytes -= len;
1424 if (!bytes) 1420 if (!bytes)
1425 break; 1421 break;
1426 ntfs_set_next_iovec(iov, iov_ofs, len);
1427 ofs = 0; 1422 ofs = 0;
1428 } while (++pages < last_page); 1423 } while (++pages < last_page);
1429out: 1424out:
1430 return total; 1425 return total;
1431err_out: 1426err_out:
1432 total += copied; 1427 BUG_ON(copied > len);
1433 /* Zero the rest of the target like __copy_from_user(). */ 1428 /* Zero the rest of the target like __copy_from_user(). */
1429 memset(addr + ofs + copied, 0, len - copied);
1430 kunmap(*pages);
1431 total += copied;
1432 ntfs_set_next_iovec(iov, iov_ofs, copied);
1434 while (++pages < last_page) { 1433 while (++pages < last_page) {
1435 bytes -= len; 1434 bytes -= len;
1436 if (!bytes) 1435 if (!bytes)
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index 93622b175fc..a627ed82c0a 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -332,6 +332,13 @@ struct inode *ntfs_alloc_big_inode(struct super_block *sb)
332 return NULL; 332 return NULL;
333} 333}
334 334
335static void ntfs_i_callback(struct rcu_head *head)
336{
337 struct inode *inode = container_of(head, struct inode, i_rcu);
338 INIT_LIST_HEAD(&inode->i_dentry);
339 kmem_cache_free(ntfs_big_inode_cache, NTFS_I(inode));
340}
341
335void ntfs_destroy_big_inode(struct inode *inode) 342void ntfs_destroy_big_inode(struct inode *inode)
336{ 343{
337 ntfs_inode *ni = NTFS_I(inode); 344 ntfs_inode *ni = NTFS_I(inode);
@@ -340,7 +347,7 @@ void ntfs_destroy_big_inode(struct inode *inode)
340 BUG_ON(ni->page); 347 BUG_ON(ni->page);
341 if (!atomic_dec_and_test(&ni->count)) 348 if (!atomic_dec_and_test(&ni->count))
342 BUG(); 349 BUG();
343 kmem_cache_free(ntfs_big_inode_cache, NTFS_I(inode)); 350 call_rcu(&inode->i_rcu, ntfs_i_callback);
344} 351}
345 352
346static inline ntfs_inode *ntfs_alloc_extent_inode(void) 353static inline ntfs_inode *ntfs_alloc_extent_inode(void)
diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c
index b572b672718..326e7475a22 100644
--- a/fs/ntfs/mft.c
+++ b/fs/ntfs/mft.c
@@ -1,7 +1,7 @@
1/** 1/**
2 * mft.c - NTFS kernel mft record operations. Part of the Linux-NTFS project. 2 * mft.c - NTFS kernel mft record operations. Part of the Linux-NTFS project.
3 * 3 *
4 * Copyright (c) 2001-2006 Anton Altaparmakov 4 * Copyright (c) 2001-2011 Anton Altaparmakov and Tuxera Inc.
5 * Copyright (c) 2002 Richard Russon 5 * Copyright (c) 2002 Richard Russon
6 * 6 *
7 * This program/include file is free software; you can redistribute it and/or 7 * This program/include file is free software; you can redistribute it and/or
@@ -2576,6 +2576,8 @@ mft_rec_already_initialized:
2576 flush_dcache_page(page); 2576 flush_dcache_page(page);
2577 SetPageUptodate(page); 2577 SetPageUptodate(page);
2578 if (base_ni) { 2578 if (base_ni) {
2579 MFT_RECORD *m_tmp;
2580
2579 /* 2581 /*
2580 * Setup the base mft record in the extent mft record. This 2582 * Setup the base mft record in the extent mft record. This
2581 * completes initialization of the allocated extent mft record 2583 * completes initialization of the allocated extent mft record
@@ -2588,11 +2590,11 @@ mft_rec_already_initialized:
2588 * attach it to the base inode @base_ni and map, pin, and lock 2590 * attach it to the base inode @base_ni and map, pin, and lock
2589 * its, i.e. the allocated, mft record. 2591 * its, i.e. the allocated, mft record.
2590 */ 2592 */
2591 m = map_extent_mft_record(base_ni, bit, &ni); 2593 m_tmp = map_extent_mft_record(base_ni, bit, &ni);
2592 if (IS_ERR(m)) { 2594 if (IS_ERR(m_tmp)) {
2593 ntfs_error(vol->sb, "Failed to map allocated extent " 2595 ntfs_error(vol->sb, "Failed to map allocated extent "
2594 "mft record 0x%llx.", (long long)bit); 2596 "mft record 0x%llx.", (long long)bit);
2595 err = PTR_ERR(m); 2597 err = PTR_ERR(m_tmp);
2596 /* Set the mft record itself not in use. */ 2598 /* Set the mft record itself not in use. */
2597 m->flags &= cpu_to_le16( 2599 m->flags &= cpu_to_le16(
2598 ~le16_to_cpu(MFT_RECORD_IN_USE)); 2600 ~le16_to_cpu(MFT_RECORD_IN_USE));
@@ -2603,6 +2605,7 @@ mft_rec_already_initialized:
2603 ntfs_unmap_page(page); 2605 ntfs_unmap_page(page);
2604 goto undo_mftbmp_alloc; 2606 goto undo_mftbmp_alloc;
2605 } 2607 }
2608 BUG_ON(m != m_tmp);
2606 /* 2609 /*
2607 * Make sure the allocated mft record is written out to disk. 2610 * Make sure the allocated mft record is written out to disk.
2608 * No need to set the inode dirty because the caller is going 2611 * No need to set the inode dirty because the caller is going
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index a30ecacc01f..29099a07b9f 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * super.c - NTFS kernel super block handling. Part of the Linux-NTFS project. 2 * super.c - NTFS kernel super block handling. Part of the Linux-NTFS project.
3 * 3 *
4 * Copyright (c) 2001-2007 Anton Altaparmakov 4 * Copyright (c) 2001-2011 Anton Altaparmakov and Tuxera Inc.
5 * Copyright (c) 2001,2002 Richard Russon 5 * Copyright (c) 2001,2002 Richard Russon
6 * 6 *
7 * This program/include file is free software; you can redistribute it and/or 7 * This program/include file is free software; you can redistribute it and/or
@@ -3193,8 +3193,8 @@ static void __exit exit_ntfs_fs(void)
3193 ntfs_sysctl(0); 3193 ntfs_sysctl(0);
3194} 3194}
3195 3195
3196MODULE_AUTHOR("Anton Altaparmakov <aia21@cantab.net>"); 3196MODULE_AUTHOR("Anton Altaparmakov <anton@tuxera.com>");
3197MODULE_DESCRIPTION("NTFS 1.2/3.x driver - Copyright (c) 2001-2007 Anton Altaparmakov"); 3197MODULE_DESCRIPTION("NTFS 1.2/3.x driver - Copyright (c) 2001-2011 Anton Altaparmakov and Tuxera Inc.");
3198MODULE_VERSION(NTFS_VERSION); 3198MODULE_VERSION(NTFS_VERSION);
3199MODULE_LICENSE("GPL"); 3199MODULE_LICENSE("GPL");
3200#ifdef DEBUG 3200#ifdef DEBUG
diff --git a/fs/ocfs2/Kconfig b/fs/ocfs2/Kconfig
index 0d840669698..77a8de5f711 100644
--- a/fs/ocfs2/Kconfig
+++ b/fs/ocfs2/Kconfig
@@ -1,7 +1,6 @@
1config OCFS2_FS 1config OCFS2_FS
2 tristate "OCFS2 file system support" 2 tristate "OCFS2 file system support"
3 depends on NET && SYSFS 3 depends on NET && SYSFS && CONFIGFS_FS
4 select CONFIGFS_FS
5 select JBD2 4 select JBD2
6 select CRC32 5 select CRC32
7 select QUOTA 6 select QUOTA
@@ -51,7 +50,7 @@ config OCFS2_FS_USERSPACE_CLUSTER
51 50
52config OCFS2_FS_STATS 51config OCFS2_FS_STATS
53 bool "OCFS2 statistics" 52 bool "OCFS2 statistics"
54 depends on OCFS2_FS 53 depends on OCFS2_FS && DEBUG_FS
55 default y 54 default y
56 help 55 help
57 This option allows some fs statistics to be captured. Enabling 56 This option allows some fs statistics to be captured. Enabling
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index 391915093fe..704f6b1742f 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -291,13 +291,17 @@ static int ocfs2_set_acl(handle_t *handle,
291 return ret; 291 return ret;
292} 292}
293 293
294int ocfs2_check_acl(struct inode *inode, int mask) 294int ocfs2_check_acl(struct inode *inode, int mask, unsigned int flags)
295{ 295{
296 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 296 struct ocfs2_super *osb;
297 struct buffer_head *di_bh = NULL; 297 struct buffer_head *di_bh = NULL;
298 struct posix_acl *acl; 298 struct posix_acl *acl;
299 int ret = -EAGAIN; 299 int ret = -EAGAIN;
300 300
301 if (flags & IPERM_FLAG_RCU)
302 return -ECHILD;
303
304 osb = OCFS2_SB(inode->i_sb);
301 if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) 305 if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
302 return ret; 306 return ret;
303 307
diff --git a/fs/ocfs2/acl.h b/fs/ocfs2/acl.h
index 5c5d31f0585..4fe7c9cf4bf 100644
--- a/fs/ocfs2/acl.h
+++ b/fs/ocfs2/acl.h
@@ -26,7 +26,7 @@ struct ocfs2_acl_entry {
26 __le32 e_id; 26 __le32 e_id;
27}; 27};
28 28
29extern int ocfs2_check_acl(struct inode *, int); 29extern int ocfs2_check_acl(struct inode *, int, unsigned int);
30extern int ocfs2_acl_chmod(struct inode *); 30extern int ocfs2_acl_chmod(struct inode *);
31extern int ocfs2_init_acl(handle_t *, struct inode *, struct inode *, 31extern int ocfs2_init_acl(handle_t *, struct inode *, struct inode *,
32 struct buffer_head *, struct buffer_head *, 32 struct buffer_head *, struct buffer_head *,
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 592fae5007d..e4984e259cb 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -565,7 +565,6 @@ static inline int ocfs2_et_sanity_check(struct ocfs2_extent_tree *et)
565 return ret; 565 return ret;
566} 566}
567 567
568static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc);
569static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt, 568static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt,
570 struct ocfs2_extent_block *eb); 569 struct ocfs2_extent_block *eb);
571static void ocfs2_adjust_rightmost_records(handle_t *handle, 570static void ocfs2_adjust_rightmost_records(handle_t *handle,
@@ -5858,6 +5857,7 @@ int ocfs2_truncate_log_append(struct ocfs2_super *osb,
5858 5857
5859 ocfs2_journal_dirty(handle, tl_bh); 5858 ocfs2_journal_dirty(handle, tl_bh);
5860 5859
5860 osb->truncated_clusters += num_clusters;
5861bail: 5861bail:
5862 mlog_exit(status); 5862 mlog_exit(status);
5863 return status; 5863 return status;
@@ -5929,6 +5929,8 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
5929 i--; 5929 i--;
5930 } 5930 }
5931 5931
5932 osb->truncated_clusters = 0;
5933
5932bail: 5934bail:
5933 mlog_exit(status); 5935 mlog_exit(status);
5934 return status; 5936 return status;
@@ -7139,64 +7141,6 @@ bail:
7139} 7141}
7140 7142
7141/* 7143/*
7142 * Expects the inode to already be locked.
7143 */
7144int ocfs2_prepare_truncate(struct ocfs2_super *osb,
7145 struct inode *inode,
7146 struct buffer_head *fe_bh,
7147 struct ocfs2_truncate_context **tc)
7148{
7149 int status;
7150 unsigned int new_i_clusters;
7151 struct ocfs2_dinode *fe;
7152 struct ocfs2_extent_block *eb;
7153 struct buffer_head *last_eb_bh = NULL;
7154
7155 mlog_entry_void();
7156
7157 *tc = NULL;
7158
7159 new_i_clusters = ocfs2_clusters_for_bytes(osb->sb,
7160 i_size_read(inode));
7161 fe = (struct ocfs2_dinode *) fe_bh->b_data;
7162
7163 mlog(0, "fe->i_clusters = %u, new_i_clusters = %u, fe->i_size ="
7164 "%llu\n", le32_to_cpu(fe->i_clusters), new_i_clusters,
7165 (unsigned long long)le64_to_cpu(fe->i_size));
7166
7167 *tc = kzalloc(sizeof(struct ocfs2_truncate_context), GFP_KERNEL);
7168 if (!(*tc)) {
7169 status = -ENOMEM;
7170 mlog_errno(status);
7171 goto bail;
7172 }
7173 ocfs2_init_dealloc_ctxt(&(*tc)->tc_dealloc);
7174
7175 if (fe->id2.i_list.l_tree_depth) {
7176 status = ocfs2_read_extent_block(INODE_CACHE(inode),
7177 le64_to_cpu(fe->i_last_eb_blk),
7178 &last_eb_bh);
7179 if (status < 0) {
7180 mlog_errno(status);
7181 goto bail;
7182 }
7183 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
7184 }
7185
7186 (*tc)->tc_last_eb_bh = last_eb_bh;
7187
7188 status = 0;
7189bail:
7190 if (status < 0) {
7191 if (*tc)
7192 ocfs2_free_truncate_context(*tc);
7193 *tc = NULL;
7194 }
7195 mlog_exit_void();
7196 return status;
7197}
7198
7199/*
7200 * 'start' is inclusive, 'end' is not. 7144 * 'start' is inclusive, 'end' is not.
7201 */ 7145 */
7202int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh, 7146int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh,
@@ -7270,18 +7214,3 @@ out_commit:
7270out: 7214out:
7271 return ret; 7215 return ret;
7272} 7216}
7273
7274static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc)
7275{
7276 /*
7277 * The caller is responsible for completing deallocation
7278 * before freeing the context.
7279 */
7280 if (tc->tc_dealloc.c_first_suballocator != NULL)
7281 mlog(ML_NOTICE,
7282 "Truncate completion has non-empty dealloc context\n");
7283
7284 brelse(tc->tc_last_eb_bh);
7285
7286 kfree(tc);
7287}
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index 55762b554b9..3bd08a03251 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -228,10 +228,6 @@ struct ocfs2_truncate_context {
228 228
229int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle, 229int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle,
230 u64 range_start, u64 range_end); 230 u64 range_start, u64 range_end);
231int ocfs2_prepare_truncate(struct ocfs2_super *osb,
232 struct inode *inode,
233 struct buffer_head *fe_bh,
234 struct ocfs2_truncate_context **tc);
235int ocfs2_commit_truncate(struct ocfs2_super *osb, 231int ocfs2_commit_truncate(struct ocfs2_super *osb,
236 struct inode *inode, 232 struct inode *inode,
237 struct buffer_head *di_bh); 233 struct buffer_head *di_bh);
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index f1e962cb3b7..1fbb0e20131 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -573,11 +573,14 @@ static void ocfs2_dio_end_io(struct kiocb *iocb,
573 /* this io's submitter should not have unlocked this before we could */ 573 /* this io's submitter should not have unlocked this before we could */
574 BUG_ON(!ocfs2_iocb_is_rw_locked(iocb)); 574 BUG_ON(!ocfs2_iocb_is_rw_locked(iocb));
575 575
576 if (ocfs2_iocb_is_sem_locked(iocb)) {
577 up_read(&inode->i_alloc_sem);
578 ocfs2_iocb_clear_sem_locked(iocb);
579 }
580
576 ocfs2_iocb_clear_rw_locked(iocb); 581 ocfs2_iocb_clear_rw_locked(iocb);
577 582
578 level = ocfs2_iocb_rw_locked_level(iocb); 583 level = ocfs2_iocb_rw_locked_level(iocb);
579 if (!level)
580 up_read(&inode->i_alloc_sem);
581 ocfs2_rw_unlock(inode, level); 584 ocfs2_rw_unlock(inode, level);
582 585
583 if (is_async) 586 if (is_async)
@@ -1627,6 +1630,43 @@ static int ocfs2_zero_tail(struct inode *inode, struct buffer_head *di_bh,
1627 return ret; 1630 return ret;
1628} 1631}
1629 1632
1633/*
1634 * Try to flush truncate logs if we can free enough clusters from it.
1635 * As for return value, "< 0" means error, "0" no space and "1" means
1636 * we have freed enough spaces and let the caller try to allocate again.
1637 */
1638static int ocfs2_try_to_free_truncate_log(struct ocfs2_super *osb,
1639 unsigned int needed)
1640{
1641 tid_t target;
1642 int ret = 0;
1643 unsigned int truncated_clusters;
1644
1645 mutex_lock(&osb->osb_tl_inode->i_mutex);
1646 truncated_clusters = osb->truncated_clusters;
1647 mutex_unlock(&osb->osb_tl_inode->i_mutex);
1648
1649 /*
1650 * Check whether we can succeed in allocating if we free
1651 * the truncate log.
1652 */
1653 if (truncated_clusters < needed)
1654 goto out;
1655
1656 ret = ocfs2_flush_truncate_log(osb);
1657 if (ret) {
1658 mlog_errno(ret);
1659 goto out;
1660 }
1661
1662 if (jbd2_journal_start_commit(osb->journal->j_journal, &target)) {
1663 jbd2_log_wait_commit(osb->journal->j_journal, target);
1664 ret = 1;
1665 }
1666out:
1667 return ret;
1668}
1669
1630int ocfs2_write_begin_nolock(struct file *filp, 1670int ocfs2_write_begin_nolock(struct file *filp,
1631 struct address_space *mapping, 1671 struct address_space *mapping,
1632 loff_t pos, unsigned len, unsigned flags, 1672 loff_t pos, unsigned len, unsigned flags,
@@ -1634,7 +1674,7 @@ int ocfs2_write_begin_nolock(struct file *filp,
1634 struct buffer_head *di_bh, struct page *mmap_page) 1674 struct buffer_head *di_bh, struct page *mmap_page)
1635{ 1675{
1636 int ret, cluster_of_pages, credits = OCFS2_INODE_UPDATE_CREDITS; 1676 int ret, cluster_of_pages, credits = OCFS2_INODE_UPDATE_CREDITS;
1637 unsigned int clusters_to_alloc, extents_to_split; 1677 unsigned int clusters_to_alloc, extents_to_split, clusters_need = 0;
1638 struct ocfs2_write_ctxt *wc; 1678 struct ocfs2_write_ctxt *wc;
1639 struct inode *inode = mapping->host; 1679 struct inode *inode = mapping->host;
1640 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1680 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
@@ -1643,7 +1683,9 @@ int ocfs2_write_begin_nolock(struct file *filp,
1643 struct ocfs2_alloc_context *meta_ac = NULL; 1683 struct ocfs2_alloc_context *meta_ac = NULL;
1644 handle_t *handle; 1684 handle_t *handle;
1645 struct ocfs2_extent_tree et; 1685 struct ocfs2_extent_tree et;
1686 int try_free = 1, ret1;
1646 1687
1688try_again:
1647 ret = ocfs2_alloc_write_ctxt(&wc, osb, pos, len, di_bh); 1689 ret = ocfs2_alloc_write_ctxt(&wc, osb, pos, len, di_bh);
1648 if (ret) { 1690 if (ret) {
1649 mlog_errno(ret); 1691 mlog_errno(ret);
@@ -1678,6 +1720,7 @@ int ocfs2_write_begin_nolock(struct file *filp,
1678 mlog_errno(ret); 1720 mlog_errno(ret);
1679 goto out; 1721 goto out;
1680 } else if (ret == 1) { 1722 } else if (ret == 1) {
1723 clusters_need = wc->w_clen;
1681 ret = ocfs2_refcount_cow(inode, filp, di_bh, 1724 ret = ocfs2_refcount_cow(inode, filp, di_bh,
1682 wc->w_cpos, wc->w_clen, UINT_MAX); 1725 wc->w_cpos, wc->w_clen, UINT_MAX);
1683 if (ret) { 1726 if (ret) {
@@ -1692,6 +1735,7 @@ int ocfs2_write_begin_nolock(struct file *filp,
1692 mlog_errno(ret); 1735 mlog_errno(ret);
1693 goto out; 1736 goto out;
1694 } 1737 }
1738 clusters_need += clusters_to_alloc;
1695 1739
1696 di = (struct ocfs2_dinode *)wc->w_di_bh->b_data; 1740 di = (struct ocfs2_dinode *)wc->w_di_bh->b_data;
1697 1741
@@ -1814,6 +1858,22 @@ out:
1814 ocfs2_free_alloc_context(data_ac); 1858 ocfs2_free_alloc_context(data_ac);
1815 if (meta_ac) 1859 if (meta_ac)
1816 ocfs2_free_alloc_context(meta_ac); 1860 ocfs2_free_alloc_context(meta_ac);
1861
1862 if (ret == -ENOSPC && try_free) {
1863 /*
1864 * Try to free some truncate log so that we can have enough
1865 * clusters to allocate.
1866 */
1867 try_free = 0;
1868
1869 ret1 = ocfs2_try_to_free_truncate_log(osb, clusters_need);
1870 if (ret1 == 1)
1871 goto try_again;
1872
1873 if (ret1 < 0)
1874 mlog_errno(ret1);
1875 }
1876
1817 return ret; 1877 return ret;
1818} 1878}
1819 1879
diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h
index 76bfdfda691..eceb456037c 100644
--- a/fs/ocfs2/aops.h
+++ b/fs/ocfs2/aops.h
@@ -68,8 +68,27 @@ static inline void ocfs2_iocb_set_rw_locked(struct kiocb *iocb, int level)
68 else 68 else
69 clear_bit(1, (unsigned long *)&iocb->private); 69 clear_bit(1, (unsigned long *)&iocb->private);
70} 70}
71
72/*
73 * Using a named enum representing lock types in terms of #N bit stored in
74 * iocb->private, which is going to be used for communication between
75 * ocfs2_dio_end_io() and ocfs2_file_aio_write/read().
76 */
77enum ocfs2_iocb_lock_bits {
78 OCFS2_IOCB_RW_LOCK = 0,
79 OCFS2_IOCB_RW_LOCK_LEVEL,
80 OCFS2_IOCB_SEM,
81 OCFS2_IOCB_NUM_LOCKS
82};
83
71#define ocfs2_iocb_clear_rw_locked(iocb) \ 84#define ocfs2_iocb_clear_rw_locked(iocb) \
72 clear_bit(0, (unsigned long *)&iocb->private) 85 clear_bit(OCFS2_IOCB_RW_LOCK, (unsigned long *)&iocb->private)
73#define ocfs2_iocb_rw_locked_level(iocb) \ 86#define ocfs2_iocb_rw_locked_level(iocb) \
74 test_bit(1, (unsigned long *)&iocb->private) 87 test_bit(OCFS2_IOCB_RW_LOCK_LEVEL, (unsigned long *)&iocb->private)
88#define ocfs2_iocb_set_sem_locked(iocb) \
89 set_bit(OCFS2_IOCB_SEM, (unsigned long *)&iocb->private)
90#define ocfs2_iocb_clear_sem_locked(iocb) \
91 clear_bit(OCFS2_IOCB_SEM, (unsigned long *)&iocb->private)
92#define ocfs2_iocb_is_sem_locked(iocb) \
93 test_bit(OCFS2_IOCB_SEM, (unsigned long *)&iocb->private)
75#endif /* OCFS2_FILE_H */ 94#endif /* OCFS2_FILE_H */
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 52c7557f3e2..b108e863d8f 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -82,6 +82,7 @@ static unsigned long o2hb_failed_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)];
82#define O2HB_DB_TYPE_REGION_LIVENODES 4 82#define O2HB_DB_TYPE_REGION_LIVENODES 4
83#define O2HB_DB_TYPE_REGION_NUMBER 5 83#define O2HB_DB_TYPE_REGION_NUMBER 5
84#define O2HB_DB_TYPE_REGION_ELAPSED_TIME 6 84#define O2HB_DB_TYPE_REGION_ELAPSED_TIME 6
85#define O2HB_DB_TYPE_REGION_PINNED 7
85struct o2hb_debug_buf { 86struct o2hb_debug_buf {
86 int db_type; 87 int db_type;
87 int db_size; 88 int db_size;
@@ -101,6 +102,7 @@ static struct o2hb_debug_buf *o2hb_db_failedregions;
101#define O2HB_DEBUG_FAILEDREGIONS "failed_regions" 102#define O2HB_DEBUG_FAILEDREGIONS "failed_regions"
102#define O2HB_DEBUG_REGION_NUMBER "num" 103#define O2HB_DEBUG_REGION_NUMBER "num"
103#define O2HB_DEBUG_REGION_ELAPSED_TIME "elapsed_time_in_ms" 104#define O2HB_DEBUG_REGION_ELAPSED_TIME "elapsed_time_in_ms"
105#define O2HB_DEBUG_REGION_PINNED "pinned"
104 106
105static struct dentry *o2hb_debug_dir; 107static struct dentry *o2hb_debug_dir;
106static struct dentry *o2hb_debug_livenodes; 108static struct dentry *o2hb_debug_livenodes;
@@ -132,6 +134,33 @@ char *o2hb_heartbeat_mode_desc[O2HB_HEARTBEAT_NUM_MODES] = {
132unsigned int o2hb_dead_threshold = O2HB_DEFAULT_DEAD_THRESHOLD; 134unsigned int o2hb_dead_threshold = O2HB_DEFAULT_DEAD_THRESHOLD;
133unsigned int o2hb_heartbeat_mode = O2HB_HEARTBEAT_LOCAL; 135unsigned int o2hb_heartbeat_mode = O2HB_HEARTBEAT_LOCAL;
134 136
137/*
138 * o2hb_dependent_users tracks the number of registered callbacks that depend
139 * on heartbeat. o2net and o2dlm are two entities that register this callback.
140 * However only o2dlm depends on the heartbeat. It does not want the heartbeat
141 * to stop while a dlm domain is still active.
142 */
143unsigned int o2hb_dependent_users;
144
145/*
146 * In global heartbeat mode, all regions are pinned if there are one or more
147 * dependent users and the quorum region count is <= O2HB_PIN_CUT_OFF. All
148 * regions are unpinned if the region count exceeds the cut off or the number
149 * of dependent users falls to zero.
150 */
151#define O2HB_PIN_CUT_OFF 3
152
153/*
154 * In local heartbeat mode, we assume the dlm domain name to be the same as
155 * region uuid. This is true for domains created for the file system but not
156 * necessarily true for userdlm domains. This is a known limitation.
157 *
158 * In global heartbeat mode, we pin/unpin all o2hb regions. This solution
159 * works for both file system and userdlm domains.
160 */
161static int o2hb_region_pin(const char *region_uuid);
162static void o2hb_region_unpin(const char *region_uuid);
163
135/* Only sets a new threshold if there are no active regions. 164/* Only sets a new threshold if there are no active regions.
136 * 165 *
137 * No locking or otherwise interesting code is required for reading 166 * No locking or otherwise interesting code is required for reading
@@ -186,7 +215,9 @@ struct o2hb_region {
186 struct config_item hr_item; 215 struct config_item hr_item;
187 216
188 struct list_head hr_all_item; 217 struct list_head hr_all_item;
189 unsigned hr_unclean_stop:1; 218 unsigned hr_unclean_stop:1,
219 hr_item_pinned:1,
220 hr_item_dropped:1;
190 221
191 /* protected by the hr_callback_sem */ 222 /* protected by the hr_callback_sem */
192 struct task_struct *hr_task; 223 struct task_struct *hr_task;
@@ -212,9 +243,11 @@ struct o2hb_region {
212 struct dentry *hr_debug_livenodes; 243 struct dentry *hr_debug_livenodes;
213 struct dentry *hr_debug_regnum; 244 struct dentry *hr_debug_regnum;
214 struct dentry *hr_debug_elapsed_time; 245 struct dentry *hr_debug_elapsed_time;
246 struct dentry *hr_debug_pinned;
215 struct o2hb_debug_buf *hr_db_livenodes; 247 struct o2hb_debug_buf *hr_db_livenodes;
216 struct o2hb_debug_buf *hr_db_regnum; 248 struct o2hb_debug_buf *hr_db_regnum;
217 struct o2hb_debug_buf *hr_db_elapsed_time; 249 struct o2hb_debug_buf *hr_db_elapsed_time;
250 struct o2hb_debug_buf *hr_db_pinned;
218 251
219 /* let the person setting up hb wait for it to return until it 252 /* let the person setting up hb wait for it to return until it
220 * has reached a 'steady' state. This will be fixed when we have 253 * has reached a 'steady' state. This will be fixed when we have
@@ -307,8 +340,7 @@ static void o2hb_arm_write_timeout(struct o2hb_region *reg)
307 340
308static void o2hb_disarm_write_timeout(struct o2hb_region *reg) 341static void o2hb_disarm_write_timeout(struct o2hb_region *reg)
309{ 342{
310 cancel_delayed_work(&reg->hr_write_timeout_work); 343 cancel_delayed_work_sync(&reg->hr_write_timeout_work);
311 flush_scheduled_work();
312} 344}
313 345
314static inline void o2hb_bio_wait_init(struct o2hb_bio_wait_ctxt *wc) 346static inline void o2hb_bio_wait_init(struct o2hb_bio_wait_ctxt *wc)
@@ -702,6 +734,14 @@ static void o2hb_set_quorum_device(struct o2hb_region *reg,
702 config_item_name(&reg->hr_item)); 734 config_item_name(&reg->hr_item));
703 735
704 set_bit(reg->hr_region_num, o2hb_quorum_region_bitmap); 736 set_bit(reg->hr_region_num, o2hb_quorum_region_bitmap);
737
738 /*
739 * If global heartbeat active, unpin all regions if the
740 * region count > CUT_OFF
741 */
742 if (o2hb_pop_count(&o2hb_quorum_region_bitmap,
743 O2NM_MAX_REGIONS) > O2HB_PIN_CUT_OFF)
744 o2hb_region_unpin(NULL);
705} 745}
706 746
707static int o2hb_check_slot(struct o2hb_region *reg, 747static int o2hb_check_slot(struct o2hb_region *reg,
@@ -1042,6 +1082,9 @@ static int o2hb_thread(void *data)
1042 1082
1043 set_user_nice(current, -20); 1083 set_user_nice(current, -20);
1044 1084
1085 /* Pin node */
1086 o2nm_depend_this_node();
1087
1045 while (!kthread_should_stop() && !reg->hr_unclean_stop) { 1088 while (!kthread_should_stop() && !reg->hr_unclean_stop) {
1046 /* We track the time spent inside 1089 /* We track the time spent inside
1047 * o2hb_do_disk_heartbeat so that we avoid more than 1090 * o2hb_do_disk_heartbeat so that we avoid more than
@@ -1091,6 +1134,9 @@ static int o2hb_thread(void *data)
1091 mlog_errno(ret); 1134 mlog_errno(ret);
1092 } 1135 }
1093 1136
1137 /* Unpin node */
1138 o2nm_undepend_this_node();
1139
1094 mlog(ML_HEARTBEAT|ML_KTHREAD, "hb thread exiting\n"); 1140 mlog(ML_HEARTBEAT|ML_KTHREAD, "hb thread exiting\n");
1095 1141
1096 return 0; 1142 return 0;
@@ -1143,6 +1189,12 @@ static int o2hb_debug_open(struct inode *inode, struct file *file)
1143 reg->hr_last_timeout_start)); 1189 reg->hr_last_timeout_start));
1144 goto done; 1190 goto done;
1145 1191
1192 case O2HB_DB_TYPE_REGION_PINNED:
1193 reg = (struct o2hb_region *)db->db_data;
1194 out += snprintf(buf + out, PAGE_SIZE - out, "%u\n",
1195 !!reg->hr_item_pinned);
1196 goto done;
1197
1146 default: 1198 default:
1147 goto done; 1199 goto done;
1148 } 1200 }
@@ -1316,6 +1368,8 @@ int o2hb_init(void)
1316 memset(o2hb_quorum_region_bitmap, 0, sizeof(o2hb_quorum_region_bitmap)); 1368 memset(o2hb_quorum_region_bitmap, 0, sizeof(o2hb_quorum_region_bitmap));
1317 memset(o2hb_failed_region_bitmap, 0, sizeof(o2hb_failed_region_bitmap)); 1369 memset(o2hb_failed_region_bitmap, 0, sizeof(o2hb_failed_region_bitmap));
1318 1370
1371 o2hb_dependent_users = 0;
1372
1319 return o2hb_debug_init(); 1373 return o2hb_debug_init();
1320} 1374}
1321 1375
@@ -1385,6 +1439,7 @@ static void o2hb_region_release(struct config_item *item)
1385 debugfs_remove(reg->hr_debug_livenodes); 1439 debugfs_remove(reg->hr_debug_livenodes);
1386 debugfs_remove(reg->hr_debug_regnum); 1440 debugfs_remove(reg->hr_debug_regnum);
1387 debugfs_remove(reg->hr_debug_elapsed_time); 1441 debugfs_remove(reg->hr_debug_elapsed_time);
1442 debugfs_remove(reg->hr_debug_pinned);
1388 debugfs_remove(reg->hr_debug_dir); 1443 debugfs_remove(reg->hr_debug_dir);
1389 1444
1390 spin_lock(&o2hb_live_lock); 1445 spin_lock(&o2hb_live_lock);
@@ -1674,7 +1729,7 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
1674 goto out; 1729 goto out;
1675 1730
1676 reg->hr_bdev = I_BDEV(filp->f_mapping->host); 1731 reg->hr_bdev = I_BDEV(filp->f_mapping->host);
1677 ret = blkdev_get(reg->hr_bdev, FMODE_WRITE | FMODE_READ); 1732 ret = blkdev_get(reg->hr_bdev, FMODE_WRITE | FMODE_READ, NULL);
1678 if (ret) { 1733 if (ret) {
1679 reg->hr_bdev = NULL; 1734 reg->hr_bdev = NULL;
1680 goto out; 1735 goto out;
@@ -1949,6 +2004,18 @@ static int o2hb_debug_region_init(struct o2hb_region *reg, struct dentry *dir)
1949 goto bail; 2004 goto bail;
1950 } 2005 }
1951 2006
2007 reg->hr_debug_pinned =
2008 o2hb_debug_create(O2HB_DEBUG_REGION_PINNED,
2009 reg->hr_debug_dir,
2010 &(reg->hr_db_pinned),
2011 sizeof(*(reg->hr_db_pinned)),
2012 O2HB_DB_TYPE_REGION_PINNED,
2013 0, 0, reg);
2014 if (!reg->hr_debug_pinned) {
2015 mlog_errno(ret);
2016 goto bail;
2017 }
2018
1952 ret = 0; 2019 ret = 0;
1953bail: 2020bail:
1954 return ret; 2021 return ret;
@@ -1964,8 +2031,10 @@ static struct config_item *o2hb_heartbeat_group_make_item(struct config_group *g
1964 if (reg == NULL) 2031 if (reg == NULL)
1965 return ERR_PTR(-ENOMEM); 2032 return ERR_PTR(-ENOMEM);
1966 2033
1967 if (strlen(name) > O2HB_MAX_REGION_NAME_LEN) 2034 if (strlen(name) > O2HB_MAX_REGION_NAME_LEN) {
1968 return ERR_PTR(-ENAMETOOLONG); 2035 ret = -ENAMETOOLONG;
2036 goto free;
2037 }
1969 2038
1970 spin_lock(&o2hb_live_lock); 2039 spin_lock(&o2hb_live_lock);
1971 reg->hr_region_num = 0; 2040 reg->hr_region_num = 0;
@@ -1974,7 +2043,8 @@ static struct config_item *o2hb_heartbeat_group_make_item(struct config_group *g
1974 O2NM_MAX_REGIONS); 2043 O2NM_MAX_REGIONS);
1975 if (reg->hr_region_num >= O2NM_MAX_REGIONS) { 2044 if (reg->hr_region_num >= O2NM_MAX_REGIONS) {
1976 spin_unlock(&o2hb_live_lock); 2045 spin_unlock(&o2hb_live_lock);
1977 return ERR_PTR(-EFBIG); 2046 ret = -EFBIG;
2047 goto free;
1978 } 2048 }
1979 set_bit(reg->hr_region_num, o2hb_region_bitmap); 2049 set_bit(reg->hr_region_num, o2hb_region_bitmap);
1980 } 2050 }
@@ -1986,10 +2056,13 @@ static struct config_item *o2hb_heartbeat_group_make_item(struct config_group *g
1986 ret = o2hb_debug_region_init(reg, o2hb_debug_dir); 2056 ret = o2hb_debug_region_init(reg, o2hb_debug_dir);
1987 if (ret) { 2057 if (ret) {
1988 config_item_put(&reg->hr_item); 2058 config_item_put(&reg->hr_item);
1989 return ERR_PTR(ret); 2059 goto free;
1990 } 2060 }
1991 2061
1992 return &reg->hr_item; 2062 return &reg->hr_item;
2063free:
2064 kfree(reg);
2065 return ERR_PTR(ret);
1993} 2066}
1994 2067
1995static void o2hb_heartbeat_group_drop_item(struct config_group *group, 2068static void o2hb_heartbeat_group_drop_item(struct config_group *group,
@@ -1997,15 +2070,20 @@ static void o2hb_heartbeat_group_drop_item(struct config_group *group,
1997{ 2070{
1998 struct task_struct *hb_task; 2071 struct task_struct *hb_task;
1999 struct o2hb_region *reg = to_o2hb_region(item); 2072 struct o2hb_region *reg = to_o2hb_region(item);
2073 int quorum_region = 0;
2000 2074
2001 /* stop the thread when the user removes the region dir */ 2075 /* stop the thread when the user removes the region dir */
2002 spin_lock(&o2hb_live_lock); 2076 spin_lock(&o2hb_live_lock);
2003 if (o2hb_global_heartbeat_active()) { 2077 if (o2hb_global_heartbeat_active()) {
2004 clear_bit(reg->hr_region_num, o2hb_region_bitmap); 2078 clear_bit(reg->hr_region_num, o2hb_region_bitmap);
2005 clear_bit(reg->hr_region_num, o2hb_live_region_bitmap); 2079 clear_bit(reg->hr_region_num, o2hb_live_region_bitmap);
2080 if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap))
2081 quorum_region = 1;
2082 clear_bit(reg->hr_region_num, o2hb_quorum_region_bitmap);
2006 } 2083 }
2007 hb_task = reg->hr_task; 2084 hb_task = reg->hr_task;
2008 reg->hr_task = NULL; 2085 reg->hr_task = NULL;
2086 reg->hr_item_dropped = 1;
2009 spin_unlock(&o2hb_live_lock); 2087 spin_unlock(&o2hb_live_lock);
2010 2088
2011 if (hb_task) 2089 if (hb_task)
@@ -2023,7 +2101,27 @@ static void o2hb_heartbeat_group_drop_item(struct config_group *group,
2023 if (o2hb_global_heartbeat_active()) 2101 if (o2hb_global_heartbeat_active())
2024 printk(KERN_NOTICE "o2hb: Heartbeat stopped on region %s\n", 2102 printk(KERN_NOTICE "o2hb: Heartbeat stopped on region %s\n",
2025 config_item_name(&reg->hr_item)); 2103 config_item_name(&reg->hr_item));
2104
2026 config_item_put(item); 2105 config_item_put(item);
2106
2107 if (!o2hb_global_heartbeat_active() || !quorum_region)
2108 return;
2109
2110 /*
2111 * If global heartbeat active and there are dependent users,
2112 * pin all regions if quorum region count <= CUT_OFF
2113 */
2114 spin_lock(&o2hb_live_lock);
2115
2116 if (!o2hb_dependent_users)
2117 goto unlock;
2118
2119 if (o2hb_pop_count(&o2hb_quorum_region_bitmap,
2120 O2NM_MAX_REGIONS) <= O2HB_PIN_CUT_OFF)
2121 o2hb_region_pin(NULL);
2122
2123unlock:
2124 spin_unlock(&o2hb_live_lock);
2027} 2125}
2028 2126
2029struct o2hb_heartbeat_group_attribute { 2127struct o2hb_heartbeat_group_attribute {
@@ -2209,63 +2307,138 @@ void o2hb_setup_callback(struct o2hb_callback_func *hc,
2209} 2307}
2210EXPORT_SYMBOL_GPL(o2hb_setup_callback); 2308EXPORT_SYMBOL_GPL(o2hb_setup_callback);
2211 2309
2212static struct o2hb_region *o2hb_find_region(const char *region_uuid) 2310/*
2311 * In local heartbeat mode, region_uuid passed matches the dlm domain name.
2312 * In global heartbeat mode, region_uuid passed is NULL.
2313 *
2314 * In local, we only pin the matching region. In global we pin all the active
2315 * regions.
2316 */
2317static int o2hb_region_pin(const char *region_uuid)
2213{ 2318{
2214 struct o2hb_region *p, *reg = NULL; 2319 int ret = 0, found = 0;
2320 struct o2hb_region *reg;
2321 char *uuid;
2215 2322
2216 assert_spin_locked(&o2hb_live_lock); 2323 assert_spin_locked(&o2hb_live_lock);
2217 2324
2218 list_for_each_entry(p, &o2hb_all_regions, hr_all_item) { 2325 list_for_each_entry(reg, &o2hb_all_regions, hr_all_item) {
2219 if (!strcmp(region_uuid, config_item_name(&p->hr_item))) { 2326 uuid = config_item_name(&reg->hr_item);
2220 reg = p; 2327
2221 break; 2328 /* local heartbeat */
2329 if (region_uuid) {
2330 if (strcmp(region_uuid, uuid))
2331 continue;
2332 found = 1;
2222 } 2333 }
2334
2335 if (reg->hr_item_pinned || reg->hr_item_dropped)
2336 goto skip_pin;
2337
2338 /* Ignore ENOENT only for local hb (userdlm domain) */
2339 ret = o2nm_depend_item(&reg->hr_item);
2340 if (!ret) {
2341 mlog(ML_CLUSTER, "Pin region %s\n", uuid);
2342 reg->hr_item_pinned = 1;
2343 } else {
2344 if (ret == -ENOENT && found)
2345 ret = 0;
2346 else {
2347 mlog(ML_ERROR, "Pin region %s fails with %d\n",
2348 uuid, ret);
2349 break;
2350 }
2351 }
2352skip_pin:
2353 if (found)
2354 break;
2223 } 2355 }
2224 2356
2225 return reg; 2357 return ret;
2226} 2358}
2227 2359
2228static int o2hb_region_get(const char *region_uuid) 2360/*
2361 * In local heartbeat mode, region_uuid passed matches the dlm domain name.
2362 * In global heartbeat mode, region_uuid passed is NULL.
2363 *
2364 * In local, we only unpin the matching region. In global we unpin all the
2365 * active regions.
2366 */
2367static void o2hb_region_unpin(const char *region_uuid)
2229{ 2368{
2230 int ret = 0;
2231 struct o2hb_region *reg; 2369 struct o2hb_region *reg;
2370 char *uuid;
2371 int found = 0;
2232 2372
2233 spin_lock(&o2hb_live_lock); 2373 assert_spin_locked(&o2hb_live_lock);
2234 2374
2235 reg = o2hb_find_region(region_uuid); 2375 list_for_each_entry(reg, &o2hb_all_regions, hr_all_item) {
2236 if (!reg) 2376 uuid = config_item_name(&reg->hr_item);
2237 ret = -ENOENT; 2377 if (region_uuid) {
2238 spin_unlock(&o2hb_live_lock); 2378 if (strcmp(region_uuid, uuid))
2379 continue;
2380 found = 1;
2381 }
2239 2382
2240 if (ret) 2383 if (reg->hr_item_pinned) {
2241 goto out; 2384 mlog(ML_CLUSTER, "Unpin region %s\n", uuid);
2385 o2nm_undepend_item(&reg->hr_item);
2386 reg->hr_item_pinned = 0;
2387 }
2388 if (found)
2389 break;
2390 }
2391}
2242 2392
2243 ret = o2nm_depend_this_node(); 2393static int o2hb_region_inc_user(const char *region_uuid)
2244 if (ret) 2394{
2245 goto out; 2395 int ret = 0;
2246 2396
2247 ret = o2nm_depend_item(&reg->hr_item); 2397 spin_lock(&o2hb_live_lock);
2248 if (ret)
2249 o2nm_undepend_this_node();
2250 2398
2251out: 2399 /* local heartbeat */
2400 if (!o2hb_global_heartbeat_active()) {
2401 ret = o2hb_region_pin(region_uuid);
2402 goto unlock;
2403 }
2404
2405 /*
2406 * if global heartbeat active and this is the first dependent user,
2407 * pin all regions if quorum region count <= CUT_OFF
2408 */
2409 o2hb_dependent_users++;
2410 if (o2hb_dependent_users > 1)
2411 goto unlock;
2412
2413 if (o2hb_pop_count(&o2hb_quorum_region_bitmap,
2414 O2NM_MAX_REGIONS) <= O2HB_PIN_CUT_OFF)
2415 ret = o2hb_region_pin(NULL);
2416
2417unlock:
2418 spin_unlock(&o2hb_live_lock);
2252 return ret; 2419 return ret;
2253} 2420}
2254 2421
2255static void o2hb_region_put(const char *region_uuid) 2422void o2hb_region_dec_user(const char *region_uuid)
2256{ 2423{
2257 struct o2hb_region *reg;
2258
2259 spin_lock(&o2hb_live_lock); 2424 spin_lock(&o2hb_live_lock);
2260 2425
2261 reg = o2hb_find_region(region_uuid); 2426 /* local heartbeat */
2427 if (!o2hb_global_heartbeat_active()) {
2428 o2hb_region_unpin(region_uuid);
2429 goto unlock;
2430 }
2262 2431
2263 spin_unlock(&o2hb_live_lock); 2432 /*
2433 * if global heartbeat active and there are no dependent users,
2434 * unpin all quorum regions
2435 */
2436 o2hb_dependent_users--;
2437 if (!o2hb_dependent_users)
2438 o2hb_region_unpin(NULL);
2264 2439
2265 if (reg) { 2440unlock:
2266 o2nm_undepend_item(&reg->hr_item); 2441 spin_unlock(&o2hb_live_lock);
2267 o2nm_undepend_this_node();
2268 }
2269} 2442}
2270 2443
2271int o2hb_register_callback(const char *region_uuid, 2444int o2hb_register_callback(const char *region_uuid,
@@ -2286,9 +2459,11 @@ int o2hb_register_callback(const char *region_uuid,
2286 } 2459 }
2287 2460
2288 if (region_uuid) { 2461 if (region_uuid) {
2289 ret = o2hb_region_get(region_uuid); 2462 ret = o2hb_region_inc_user(region_uuid);
2290 if (ret) 2463 if (ret) {
2464 mlog_errno(ret);
2291 goto out; 2465 goto out;
2466 }
2292 } 2467 }
2293 2468
2294 down_write(&o2hb_callback_sem); 2469 down_write(&o2hb_callback_sem);
@@ -2306,7 +2481,7 @@ int o2hb_register_callback(const char *region_uuid,
2306 up_write(&o2hb_callback_sem); 2481 up_write(&o2hb_callback_sem);
2307 ret = 0; 2482 ret = 0;
2308out: 2483out:
2309 mlog(ML_HEARTBEAT, "returning %d on behalf of %p for funcs %p\n", 2484 mlog(ML_CLUSTER, "returning %d on behalf of %p for funcs %p\n",
2310 ret, __builtin_return_address(0), hc); 2485 ret, __builtin_return_address(0), hc);
2311 return ret; 2486 return ret;
2312} 2487}
@@ -2317,7 +2492,7 @@ void o2hb_unregister_callback(const char *region_uuid,
2317{ 2492{
2318 BUG_ON(hc->hc_magic != O2HB_CB_MAGIC); 2493 BUG_ON(hc->hc_magic != O2HB_CB_MAGIC);
2319 2494
2320 mlog(ML_HEARTBEAT, "on behalf of %p for funcs %p\n", 2495 mlog(ML_CLUSTER, "on behalf of %p for funcs %p\n",
2321 __builtin_return_address(0), hc); 2496 __builtin_return_address(0), hc);
2322 2497
2323 /* XXX Can this happen _with_ a region reference? */ 2498 /* XXX Can this happen _with_ a region reference? */
@@ -2325,7 +2500,7 @@ void o2hb_unregister_callback(const char *region_uuid,
2325 return; 2500 return;
2326 2501
2327 if (region_uuid) 2502 if (region_uuid)
2328 o2hb_region_put(region_uuid); 2503 o2hb_region_dec_user(region_uuid);
2329 2504
2330 down_write(&o2hb_callback_sem); 2505 down_write(&o2hb_callback_sem);
2331 2506
diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c
index c7fba396392..6c61771469a 100644
--- a/fs/ocfs2/cluster/masklog.c
+++ b/fs/ocfs2/cluster/masklog.c
@@ -113,10 +113,11 @@ static struct mlog_attribute mlog_attrs[MLOG_MAX_BITS] = {
113 define_mask(QUOTA), 113 define_mask(QUOTA),
114 define_mask(REFCOUNT), 114 define_mask(REFCOUNT),
115 define_mask(BASTS), 115 define_mask(BASTS),
116 define_mask(RESERVATIONS),
117 define_mask(CLUSTER),
116 define_mask(ERROR), 118 define_mask(ERROR),
117 define_mask(NOTICE), 119 define_mask(NOTICE),
118 define_mask(KTHREAD), 120 define_mask(KTHREAD),
119 define_mask(RESERVATIONS),
120}; 121};
121 122
122static struct attribute *mlog_attr_ptrs[MLOG_MAX_BITS] = {NULL, }; 123static struct attribute *mlog_attr_ptrs[MLOG_MAX_BITS] = {NULL, };
diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h
index ea2ed9f56c9..34d6544357d 100644
--- a/fs/ocfs2/cluster/masklog.h
+++ b/fs/ocfs2/cluster/masklog.h
@@ -81,7 +81,7 @@
81#include <linux/sched.h> 81#include <linux/sched.h>
82 82
83/* bits that are frequently given and infrequently matched in the low word */ 83/* bits that are frequently given and infrequently matched in the low word */
84/* NOTE: If you add a flag, you need to also update mlog.c! */ 84/* NOTE: If you add a flag, you need to also update masklog.c! */
85#define ML_ENTRY 0x0000000000000001ULL /* func call entry */ 85#define ML_ENTRY 0x0000000000000001ULL /* func call entry */
86#define ML_EXIT 0x0000000000000002ULL /* func call exit */ 86#define ML_EXIT 0x0000000000000002ULL /* func call exit */
87#define ML_TCP 0x0000000000000004ULL /* net cluster/tcp.c */ 87#define ML_TCP 0x0000000000000004ULL /* net cluster/tcp.c */
@@ -114,13 +114,14 @@
114#define ML_XATTR 0x0000000020000000ULL /* ocfs2 extended attributes */ 114#define ML_XATTR 0x0000000020000000ULL /* ocfs2 extended attributes */
115#define ML_QUOTA 0x0000000040000000ULL /* ocfs2 quota operations */ 115#define ML_QUOTA 0x0000000040000000ULL /* ocfs2 quota operations */
116#define ML_REFCOUNT 0x0000000080000000ULL /* refcount tree operations */ 116#define ML_REFCOUNT 0x0000000080000000ULL /* refcount tree operations */
117#define ML_BASTS 0x0000001000000000ULL /* dlmglue asts and basts */ 117#define ML_BASTS 0x0000000100000000ULL /* dlmglue asts and basts */
118#define ML_RESERVATIONS 0x0000000200000000ULL /* ocfs2 alloc reservations */
119#define ML_CLUSTER 0x0000000400000000ULL /* cluster stack */
120
118/* bits that are infrequently given and frequently matched in the high word */ 121/* bits that are infrequently given and frequently matched in the high word */
119#define ML_ERROR 0x0000000100000000ULL /* sent to KERN_ERR */ 122#define ML_ERROR 0x1000000000000000ULL /* sent to KERN_ERR */
120#define ML_NOTICE 0x0000000200000000ULL /* sent to KERN_NOTICE */ 123#define ML_NOTICE 0x2000000000000000ULL /* sent to KERN_NOTICE */
121#define ML_KTHREAD 0x0000000400000000ULL /* kernel thread activity */ 124#define ML_KTHREAD 0x4000000000000000ULL /* kernel thread activity */
122#define ML_RESERVATIONS 0x0000000800000000ULL /* ocfs2 alloc reservations */
123#define ML_CLUSTER 0x0000001000000000ULL /* cluster stack */
124 125
125#define MLOG_INITIAL_AND_MASK (ML_ERROR|ML_NOTICE) 126#define MLOG_INITIAL_AND_MASK (ML_ERROR|ML_NOTICE)
126#define MLOG_INITIAL_NOT_MASK (ML_ENTRY|ML_EXIT) 127#define MLOG_INITIAL_NOT_MASK (ML_ENTRY|ML_EXIT)
diff --git a/fs/ocfs2/cluster/netdebug.c b/fs/ocfs2/cluster/netdebug.c
index a3f150e52b0..3a5835904b3 100644
--- a/fs/ocfs2/cluster/netdebug.c
+++ b/fs/ocfs2/cluster/netdebug.c
@@ -46,10 +46,15 @@
46#define O2NET_DEBUG_DIR "o2net" 46#define O2NET_DEBUG_DIR "o2net"
47#define SC_DEBUG_NAME "sock_containers" 47#define SC_DEBUG_NAME "sock_containers"
48#define NST_DEBUG_NAME "send_tracking" 48#define NST_DEBUG_NAME "send_tracking"
49#define STATS_DEBUG_NAME "stats"
50
51#define SHOW_SOCK_CONTAINERS 0
52#define SHOW_SOCK_STATS 1
49 53
50static struct dentry *o2net_dentry; 54static struct dentry *o2net_dentry;
51static struct dentry *sc_dentry; 55static struct dentry *sc_dentry;
52static struct dentry *nst_dentry; 56static struct dentry *nst_dentry;
57static struct dentry *stats_dentry;
53 58
54static DEFINE_SPINLOCK(o2net_debug_lock); 59static DEFINE_SPINLOCK(o2net_debug_lock);
55 60
@@ -123,37 +128,42 @@ static void *nst_seq_next(struct seq_file *seq, void *v, loff_t *pos)
123static int nst_seq_show(struct seq_file *seq, void *v) 128static int nst_seq_show(struct seq_file *seq, void *v)
124{ 129{
125 struct o2net_send_tracking *nst, *dummy_nst = seq->private; 130 struct o2net_send_tracking *nst, *dummy_nst = seq->private;
131 ktime_t now;
132 s64 sock, send, status;
126 133
127 spin_lock(&o2net_debug_lock); 134 spin_lock(&o2net_debug_lock);
128 nst = next_nst(dummy_nst); 135 nst = next_nst(dummy_nst);
136 if (!nst)
137 goto out;
129 138
130 if (nst != NULL) { 139 now = ktime_get();
131 /* get_task_comm isn't exported. oh well. */ 140 sock = ktime_to_us(ktime_sub(now, nst->st_sock_time));
132 seq_printf(seq, "%p:\n" 141 send = ktime_to_us(ktime_sub(now, nst->st_send_time));
133 " pid: %lu\n" 142 status = ktime_to_us(ktime_sub(now, nst->st_status_time));
134 " tgid: %lu\n" 143
135 " process name: %s\n" 144 /* get_task_comm isn't exported. oh well. */
136 " node: %u\n" 145 seq_printf(seq, "%p:\n"
137 " sc: %p\n" 146 " pid: %lu\n"
138 " message id: %d\n" 147 " tgid: %lu\n"
139 " message type: %u\n" 148 " process name: %s\n"
140 " message key: 0x%08x\n" 149 " node: %u\n"
141 " sock acquiry: %lu.%ld\n" 150 " sc: %p\n"
142 " send start: %lu.%ld\n" 151 " message id: %d\n"
143 " wait start: %lu.%ld\n", 152 " message type: %u\n"
144 nst, (unsigned long)nst->st_task->pid, 153 " message key: 0x%08x\n"
145 (unsigned long)nst->st_task->tgid, 154 " sock acquiry: %lld usecs ago\n"
146 nst->st_task->comm, nst->st_node, 155 " send start: %lld usecs ago\n"
147 nst->st_sc, nst->st_id, nst->st_msg_type, 156 " wait start: %lld usecs ago\n",
148 nst->st_msg_key, 157 nst, (unsigned long)task_pid_nr(nst->st_task),
149 nst->st_sock_time.tv_sec, 158 (unsigned long)nst->st_task->tgid,
150 (long)nst->st_sock_time.tv_usec, 159 nst->st_task->comm, nst->st_node,
151 nst->st_send_time.tv_sec, 160 nst->st_sc, nst->st_id, nst->st_msg_type,
152 (long)nst->st_send_time.tv_usec, 161 nst->st_msg_key,
153 nst->st_status_time.tv_sec, 162 (long long)sock,
154 (long)nst->st_status_time.tv_usec); 163 (long long)send,
155 } 164 (long long)status);
156 165
166out:
157 spin_unlock(&o2net_debug_lock); 167 spin_unlock(&o2net_debug_lock);
158 168
159 return 0; 169 return 0;
@@ -228,6 +238,11 @@ void o2net_debug_del_sc(struct o2net_sock_container *sc)
228 spin_unlock(&o2net_debug_lock); 238 spin_unlock(&o2net_debug_lock);
229} 239}
230 240
241struct o2net_sock_debug {
242 int dbg_ctxt;
243 struct o2net_sock_container *dbg_sock;
244};
245
231static struct o2net_sock_container 246static struct o2net_sock_container
232 *next_sc(struct o2net_sock_container *sc_start) 247 *next_sc(struct o2net_sock_container *sc_start)
233{ 248{
@@ -253,7 +268,8 @@ static struct o2net_sock_container
253 268
254static void *sc_seq_start(struct seq_file *seq, loff_t *pos) 269static void *sc_seq_start(struct seq_file *seq, loff_t *pos)
255{ 270{
256 struct o2net_sock_container *sc, *dummy_sc = seq->private; 271 struct o2net_sock_debug *sd = seq->private;
272 struct o2net_sock_container *sc, *dummy_sc = sd->dbg_sock;
257 273
258 spin_lock(&o2net_debug_lock); 274 spin_lock(&o2net_debug_lock);
259 sc = next_sc(dummy_sc); 275 sc = next_sc(dummy_sc);
@@ -264,7 +280,8 @@ static void *sc_seq_start(struct seq_file *seq, loff_t *pos)
264 280
265static void *sc_seq_next(struct seq_file *seq, void *v, loff_t *pos) 281static void *sc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
266{ 282{
267 struct o2net_sock_container *sc, *dummy_sc = seq->private; 283 struct o2net_sock_debug *sd = seq->private;
284 struct o2net_sock_container *sc, *dummy_sc = sd->dbg_sock;
268 285
269 spin_lock(&o2net_debug_lock); 286 spin_lock(&o2net_debug_lock);
270 sc = next_sc(dummy_sc); 287 sc = next_sc(dummy_sc);
@@ -276,65 +293,107 @@ static void *sc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
276 return sc; /* unused, just needs to be null when done */ 293 return sc; /* unused, just needs to be null when done */
277} 294}
278 295
279#define TV_SEC_USEC(TV) TV.tv_sec, (long)TV.tv_usec 296#ifdef CONFIG_OCFS2_FS_STATS
297# define sc_send_count(_s) ((_s)->sc_send_count)
298# define sc_recv_count(_s) ((_s)->sc_recv_count)
299# define sc_tv_acquiry_total_ns(_s) (ktime_to_ns((_s)->sc_tv_acquiry_total))
300# define sc_tv_send_total_ns(_s) (ktime_to_ns((_s)->sc_tv_send_total))
301# define sc_tv_status_total_ns(_s) (ktime_to_ns((_s)->sc_tv_status_total))
302# define sc_tv_process_total_ns(_s) (ktime_to_ns((_s)->sc_tv_process_total))
303#else
304# define sc_send_count(_s) (0U)
305# define sc_recv_count(_s) (0U)
306# define sc_tv_acquiry_total_ns(_s) (0LL)
307# define sc_tv_send_total_ns(_s) (0LL)
308# define sc_tv_status_total_ns(_s) (0LL)
309# define sc_tv_process_total_ns(_s) (0LL)
310#endif
311
312/* So that debugfs.ocfs2 can determine which format is being used */
313#define O2NET_STATS_STR_VERSION 1
314static void sc_show_sock_stats(struct seq_file *seq,
315 struct o2net_sock_container *sc)
316{
317 if (!sc)
318 return;
319
320 seq_printf(seq, "%d,%u,%lu,%lld,%lld,%lld,%lu,%lld\n", O2NET_STATS_STR_VERSION,
321 sc->sc_node->nd_num, (unsigned long)sc_send_count(sc),
322 (long long)sc_tv_acquiry_total_ns(sc),
323 (long long)sc_tv_send_total_ns(sc),
324 (long long)sc_tv_status_total_ns(sc),
325 (unsigned long)sc_recv_count(sc),
326 (long long)sc_tv_process_total_ns(sc));
327}
328
329static void sc_show_sock_container(struct seq_file *seq,
330 struct o2net_sock_container *sc)
331{
332 struct inet_sock *inet = NULL;
333 __be32 saddr = 0, daddr = 0;
334 __be16 sport = 0, dport = 0;
335
336 if (!sc)
337 return;
338
339 if (sc->sc_sock) {
340 inet = inet_sk(sc->sc_sock->sk);
341 /* the stack's structs aren't sparse endian clean */
342 saddr = (__force __be32)inet->inet_saddr;
343 daddr = (__force __be32)inet->inet_daddr;
344 sport = (__force __be16)inet->inet_sport;
345 dport = (__force __be16)inet->inet_dport;
346 }
347
348 /* XXX sigh, inet-> doesn't have sparse annotation so any
349 * use of it here generates a warning with -Wbitwise */
350 seq_printf(seq, "%p:\n"
351 " krefs: %d\n"
352 " sock: %pI4:%u -> "
353 "%pI4:%u\n"
354 " remote node: %s\n"
355 " page off: %zu\n"
356 " handshake ok: %u\n"
357 " timer: %lld usecs\n"
358 " data ready: %lld usecs\n"
359 " advance start: %lld usecs\n"
360 " advance stop: %lld usecs\n"
361 " func start: %lld usecs\n"
362 " func stop: %lld usecs\n"
363 " func key: 0x%08x\n"
364 " func type: %u\n",
365 sc,
366 atomic_read(&sc->sc_kref.refcount),
367 &saddr, inet ? ntohs(sport) : 0,
368 &daddr, inet ? ntohs(dport) : 0,
369 sc->sc_node->nd_name,
370 sc->sc_page_off,
371 sc->sc_handshake_ok,
372 (long long)ktime_to_us(sc->sc_tv_timer),
373 (long long)ktime_to_us(sc->sc_tv_data_ready),
374 (long long)ktime_to_us(sc->sc_tv_advance_start),
375 (long long)ktime_to_us(sc->sc_tv_advance_stop),
376 (long long)ktime_to_us(sc->sc_tv_func_start),
377 (long long)ktime_to_us(sc->sc_tv_func_stop),
378 sc->sc_msg_key,
379 sc->sc_msg_type);
380}
280 381
281static int sc_seq_show(struct seq_file *seq, void *v) 382static int sc_seq_show(struct seq_file *seq, void *v)
282{ 383{
283 struct o2net_sock_container *sc, *dummy_sc = seq->private; 384 struct o2net_sock_debug *sd = seq->private;
385 struct o2net_sock_container *sc, *dummy_sc = sd->dbg_sock;
284 386
285 spin_lock(&o2net_debug_lock); 387 spin_lock(&o2net_debug_lock);
286 sc = next_sc(dummy_sc); 388 sc = next_sc(dummy_sc);
287 389
288 if (sc != NULL) { 390 if (sc) {
289 struct inet_sock *inet = NULL; 391 if (sd->dbg_ctxt == SHOW_SOCK_CONTAINERS)
290 392 sc_show_sock_container(seq, sc);
291 __be32 saddr = 0, daddr = 0; 393 else
292 __be16 sport = 0, dport = 0; 394 sc_show_sock_stats(seq, sc);
293
294 if (sc->sc_sock) {
295 inet = inet_sk(sc->sc_sock->sk);
296 /* the stack's structs aren't sparse endian clean */
297 saddr = (__force __be32)inet->inet_saddr;
298 daddr = (__force __be32)inet->inet_daddr;
299 sport = (__force __be16)inet->inet_sport;
300 dport = (__force __be16)inet->inet_dport;
301 }
302
303 /* XXX sigh, inet-> doesn't have sparse annotation so any
304 * use of it here generates a warning with -Wbitwise */
305 seq_printf(seq, "%p:\n"
306 " krefs: %d\n"
307 " sock: %pI4:%u -> "
308 "%pI4:%u\n"
309 " remote node: %s\n"
310 " page off: %zu\n"
311 " handshake ok: %u\n"
312 " timer: %lu.%ld\n"
313 " data ready: %lu.%ld\n"
314 " advance start: %lu.%ld\n"
315 " advance stop: %lu.%ld\n"
316 " func start: %lu.%ld\n"
317 " func stop: %lu.%ld\n"
318 " func key: %u\n"
319 " func type: %u\n",
320 sc,
321 atomic_read(&sc->sc_kref.refcount),
322 &saddr, inet ? ntohs(sport) : 0,
323 &daddr, inet ? ntohs(dport) : 0,
324 sc->sc_node->nd_name,
325 sc->sc_page_off,
326 sc->sc_handshake_ok,
327 TV_SEC_USEC(sc->sc_tv_timer),
328 TV_SEC_USEC(sc->sc_tv_data_ready),
329 TV_SEC_USEC(sc->sc_tv_advance_start),
330 TV_SEC_USEC(sc->sc_tv_advance_stop),
331 TV_SEC_USEC(sc->sc_tv_func_start),
332 TV_SEC_USEC(sc->sc_tv_func_stop),
333 sc->sc_msg_key,
334 sc->sc_msg_type);
335 } 395 }
336 396
337
338 spin_unlock(&o2net_debug_lock); 397 spin_unlock(&o2net_debug_lock);
339 398
340 return 0; 399 return 0;
@@ -351,7 +410,7 @@ static const struct seq_operations sc_seq_ops = {
351 .show = sc_seq_show, 410 .show = sc_seq_show,
352}; 411};
353 412
354static int sc_fop_open(struct inode *inode, struct file *file) 413static int sc_common_open(struct file *file, struct o2net_sock_debug *sd)
355{ 414{
356 struct o2net_sock_container *dummy_sc; 415 struct o2net_sock_container *dummy_sc;
357 struct seq_file *seq; 416 struct seq_file *seq;
@@ -369,7 +428,8 @@ static int sc_fop_open(struct inode *inode, struct file *file)
369 goto out; 428 goto out;
370 429
371 seq = file->private_data; 430 seq = file->private_data;
372 seq->private = dummy_sc; 431 seq->private = sd;
432 sd->dbg_sock = dummy_sc;
373 o2net_debug_add_sc(dummy_sc); 433 o2net_debug_add_sc(dummy_sc);
374 434
375 dummy_sc = NULL; 435 dummy_sc = NULL;
@@ -382,12 +442,48 @@ out:
382static int sc_fop_release(struct inode *inode, struct file *file) 442static int sc_fop_release(struct inode *inode, struct file *file)
383{ 443{
384 struct seq_file *seq = file->private_data; 444 struct seq_file *seq = file->private_data;
385 struct o2net_sock_container *dummy_sc = seq->private; 445 struct o2net_sock_debug *sd = seq->private;
446 struct o2net_sock_container *dummy_sc = sd->dbg_sock;
386 447
387 o2net_debug_del_sc(dummy_sc); 448 o2net_debug_del_sc(dummy_sc);
388 return seq_release_private(inode, file); 449 return seq_release_private(inode, file);
389} 450}
390 451
452static int stats_fop_open(struct inode *inode, struct file *file)
453{
454 struct o2net_sock_debug *sd;
455
456 sd = kmalloc(sizeof(struct o2net_sock_debug), GFP_KERNEL);
457 if (sd == NULL)
458 return -ENOMEM;
459
460 sd->dbg_ctxt = SHOW_SOCK_STATS;
461 sd->dbg_sock = NULL;
462
463 return sc_common_open(file, sd);
464}
465
466static const struct file_operations stats_seq_fops = {
467 .open = stats_fop_open,
468 .read = seq_read,
469 .llseek = seq_lseek,
470 .release = sc_fop_release,
471};
472
473static int sc_fop_open(struct inode *inode, struct file *file)
474{
475 struct o2net_sock_debug *sd;
476
477 sd = kmalloc(sizeof(struct o2net_sock_debug), GFP_KERNEL);
478 if (sd == NULL)
479 return -ENOMEM;
480
481 sd->dbg_ctxt = SHOW_SOCK_CONTAINERS;
482 sd->dbg_sock = NULL;
483
484 return sc_common_open(file, sd);
485}
486
391static const struct file_operations sc_seq_fops = { 487static const struct file_operations sc_seq_fops = {
392 .open = sc_fop_open, 488 .open = sc_fop_open,
393 .read = seq_read, 489 .read = seq_read,
@@ -419,25 +515,29 @@ int o2net_debugfs_init(void)
419 goto bail; 515 goto bail;
420 } 516 }
421 517
518 stats_dentry = debugfs_create_file(STATS_DEBUG_NAME, S_IFREG|S_IRUSR,
519 o2net_dentry, NULL,
520 &stats_seq_fops);
521 if (!stats_dentry) {
522 mlog_errno(-ENOMEM);
523 goto bail;
524 }
525
422 return 0; 526 return 0;
423bail: 527bail:
424 if (sc_dentry) 528 debugfs_remove(stats_dentry);
425 debugfs_remove(sc_dentry); 529 debugfs_remove(sc_dentry);
426 if (nst_dentry) 530 debugfs_remove(nst_dentry);
427 debugfs_remove(nst_dentry); 531 debugfs_remove(o2net_dentry);
428 if (o2net_dentry)
429 debugfs_remove(o2net_dentry);
430 return -ENOMEM; 532 return -ENOMEM;
431} 533}
432 534
433void o2net_debugfs_exit(void) 535void o2net_debugfs_exit(void)
434{ 536{
435 if (sc_dentry) 537 debugfs_remove(stats_dentry);
436 debugfs_remove(sc_dentry); 538 debugfs_remove(sc_dentry);
437 if (nst_dentry) 539 debugfs_remove(nst_dentry);
438 debugfs_remove(nst_dentry); 540 debugfs_remove(o2net_dentry);
439 if (o2net_dentry)
440 debugfs_remove(o2net_dentry);
441} 541}
442 542
443#endif /* CONFIG_DEBUG_FS */ 543#endif /* CONFIG_DEBUG_FS */
diff --git a/fs/ocfs2/cluster/quorum.c b/fs/ocfs2/cluster/quorum.c
index cf3e1669621..a87366750f2 100644
--- a/fs/ocfs2/cluster/quorum.c
+++ b/fs/ocfs2/cluster/quorum.c
@@ -325,5 +325,7 @@ void o2quo_init(void)
325 325
326void o2quo_exit(void) 326void o2quo_exit(void)
327{ 327{
328 flush_scheduled_work(); 328 struct o2quo_state *qs = &o2quo_state;
329
330 flush_work_sync(&qs->qs_work);
329} 331}
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index 9aa426e4212..3b11cb1e38f 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -153,63 +153,114 @@ static void o2net_init_nst(struct o2net_send_tracking *nst, u32 msgtype,
153 nst->st_node = node; 153 nst->st_node = node;
154} 154}
155 155
156static void o2net_set_nst_sock_time(struct o2net_send_tracking *nst) 156static inline void o2net_set_nst_sock_time(struct o2net_send_tracking *nst)
157{ 157{
158 do_gettimeofday(&nst->st_sock_time); 158 nst->st_sock_time = ktime_get();
159} 159}
160 160
161static void o2net_set_nst_send_time(struct o2net_send_tracking *nst) 161static inline void o2net_set_nst_send_time(struct o2net_send_tracking *nst)
162{ 162{
163 do_gettimeofday(&nst->st_send_time); 163 nst->st_send_time = ktime_get();
164} 164}
165 165
166static void o2net_set_nst_status_time(struct o2net_send_tracking *nst) 166static inline void o2net_set_nst_status_time(struct o2net_send_tracking *nst)
167{ 167{
168 do_gettimeofday(&nst->st_status_time); 168 nst->st_status_time = ktime_get();
169} 169}
170 170
171static void o2net_set_nst_sock_container(struct o2net_send_tracking *nst, 171static inline void o2net_set_nst_sock_container(struct o2net_send_tracking *nst,
172 struct o2net_sock_container *sc) 172 struct o2net_sock_container *sc)
173{ 173{
174 nst->st_sc = sc; 174 nst->st_sc = sc;
175} 175}
176 176
177static void o2net_set_nst_msg_id(struct o2net_send_tracking *nst, u32 msg_id) 177static inline void o2net_set_nst_msg_id(struct o2net_send_tracking *nst,
178 u32 msg_id)
178{ 179{
179 nst->st_id = msg_id; 180 nst->st_id = msg_id;
180} 181}
181 182
182#else /* CONFIG_DEBUG_FS */ 183static inline void o2net_set_sock_timer(struct o2net_sock_container *sc)
183
184static inline void o2net_init_nst(struct o2net_send_tracking *nst, u32 msgtype,
185 u32 msgkey, struct task_struct *task, u8 node)
186{ 184{
185 sc->sc_tv_timer = ktime_get();
187} 186}
188 187
189static inline void o2net_set_nst_sock_time(struct o2net_send_tracking *nst) 188static inline void o2net_set_data_ready_time(struct o2net_sock_container *sc)
190{ 189{
190 sc->sc_tv_data_ready = ktime_get();
191} 191}
192 192
193static inline void o2net_set_nst_send_time(struct o2net_send_tracking *nst) 193static inline void o2net_set_advance_start_time(struct o2net_sock_container *sc)
194{ 194{
195 sc->sc_tv_advance_start = ktime_get();
195} 196}
196 197
197static inline void o2net_set_nst_status_time(struct o2net_send_tracking *nst) 198static inline void o2net_set_advance_stop_time(struct o2net_sock_container *sc)
198{ 199{
200 sc->sc_tv_advance_stop = ktime_get();
199} 201}
200 202
201static inline void o2net_set_nst_sock_container(struct o2net_send_tracking *nst, 203static inline void o2net_set_func_start_time(struct o2net_sock_container *sc)
202 struct o2net_sock_container *sc)
203{ 204{
205 sc->sc_tv_func_start = ktime_get();
204} 206}
205 207
206static inline void o2net_set_nst_msg_id(struct o2net_send_tracking *nst, 208static inline void o2net_set_func_stop_time(struct o2net_sock_container *sc)
207 u32 msg_id)
208{ 209{
210 sc->sc_tv_func_stop = ktime_get();
209} 211}
210 212
213static ktime_t o2net_get_func_run_time(struct o2net_sock_container *sc)
214{
215 return ktime_sub(sc->sc_tv_func_stop, sc->sc_tv_func_start);
216}
217#else /* CONFIG_DEBUG_FS */
218# define o2net_init_nst(a, b, c, d, e)
219# define o2net_set_nst_sock_time(a)
220# define o2net_set_nst_send_time(a)
221# define o2net_set_nst_status_time(a)
222# define o2net_set_nst_sock_container(a, b)
223# define o2net_set_nst_msg_id(a, b)
224# define o2net_set_sock_timer(a)
225# define o2net_set_data_ready_time(a)
226# define o2net_set_advance_start_time(a)
227# define o2net_set_advance_stop_time(a)
228# define o2net_set_func_start_time(a)
229# define o2net_set_func_stop_time(a)
230# define o2net_get_func_run_time(a) (ktime_t)0
211#endif /* CONFIG_DEBUG_FS */ 231#endif /* CONFIG_DEBUG_FS */
212 232
233#ifdef CONFIG_OCFS2_FS_STATS
234static void o2net_update_send_stats(struct o2net_send_tracking *nst,
235 struct o2net_sock_container *sc)
236{
237 sc->sc_tv_status_total = ktime_add(sc->sc_tv_status_total,
238 ktime_sub(ktime_get(),
239 nst->st_status_time));
240 sc->sc_tv_send_total = ktime_add(sc->sc_tv_send_total,
241 ktime_sub(nst->st_status_time,
242 nst->st_send_time));
243 sc->sc_tv_acquiry_total = ktime_add(sc->sc_tv_acquiry_total,
244 ktime_sub(nst->st_send_time,
245 nst->st_sock_time));
246 sc->sc_send_count++;
247}
248
249static void o2net_update_recv_stats(struct o2net_sock_container *sc)
250{
251 sc->sc_tv_process_total = ktime_add(sc->sc_tv_process_total,
252 o2net_get_func_run_time(sc));
253 sc->sc_recv_count++;
254}
255
256#else
257
258# define o2net_update_send_stats(a, b)
259
260# define o2net_update_recv_stats(sc)
261
262#endif /* CONFIG_OCFS2_FS_STATS */
263
213static inline int o2net_reconnect_delay(void) 264static inline int o2net_reconnect_delay(void)
214{ 265{
215 return o2nm_single_cluster->cl_reconnect_delay_ms; 266 return o2nm_single_cluster->cl_reconnect_delay_ms;
@@ -355,6 +406,7 @@ static void sc_kref_release(struct kref *kref)
355 sc->sc_sock = NULL; 406 sc->sc_sock = NULL;
356 } 407 }
357 408
409 o2nm_undepend_item(&sc->sc_node->nd_item);
358 o2nm_node_put(sc->sc_node); 410 o2nm_node_put(sc->sc_node);
359 sc->sc_node = NULL; 411 sc->sc_node = NULL;
360 412
@@ -376,6 +428,7 @@ static struct o2net_sock_container *sc_alloc(struct o2nm_node *node)
376{ 428{
377 struct o2net_sock_container *sc, *ret = NULL; 429 struct o2net_sock_container *sc, *ret = NULL;
378 struct page *page = NULL; 430 struct page *page = NULL;
431 int status = 0;
379 432
380 page = alloc_page(GFP_NOFS); 433 page = alloc_page(GFP_NOFS);
381 sc = kzalloc(sizeof(*sc), GFP_NOFS); 434 sc = kzalloc(sizeof(*sc), GFP_NOFS);
@@ -386,6 +439,13 @@ static struct o2net_sock_container *sc_alloc(struct o2nm_node *node)
386 o2nm_node_get(node); 439 o2nm_node_get(node);
387 sc->sc_node = node; 440 sc->sc_node = node;
388 441
442 /* pin the node item of the remote node */
443 status = o2nm_depend_item(&node->nd_item);
444 if (status) {
445 mlog_errno(status);
446 o2nm_node_put(node);
447 goto out;
448 }
389 INIT_WORK(&sc->sc_connect_work, o2net_sc_connect_completed); 449 INIT_WORK(&sc->sc_connect_work, o2net_sc_connect_completed);
390 INIT_WORK(&sc->sc_rx_work, o2net_rx_until_empty); 450 INIT_WORK(&sc->sc_rx_work, o2net_rx_until_empty);
391 INIT_WORK(&sc->sc_shutdown_work, o2net_shutdown_sc); 451 INIT_WORK(&sc->sc_shutdown_work, o2net_shutdown_sc);
@@ -546,7 +606,7 @@ static void o2net_data_ready(struct sock *sk, int bytes)
546 if (sk->sk_user_data) { 606 if (sk->sk_user_data) {
547 struct o2net_sock_container *sc = sk->sk_user_data; 607 struct o2net_sock_container *sc = sk->sk_user_data;
548 sclog(sc, "data_ready hit\n"); 608 sclog(sc, "data_ready hit\n");
549 do_gettimeofday(&sc->sc_tv_data_ready); 609 o2net_set_data_ready_time(sc);
550 o2net_sc_queue_work(sc, &sc->sc_rx_work); 610 o2net_sc_queue_work(sc, &sc->sc_rx_work);
551 ready = sc->sc_data_ready; 611 ready = sc->sc_data_ready;
552 } else { 612 } else {
@@ -1070,6 +1130,8 @@ int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec,
1070 o2net_set_nst_status_time(&nst); 1130 o2net_set_nst_status_time(&nst);
1071 wait_event(nsw.ns_wq, o2net_nsw_completed(nn, &nsw)); 1131 wait_event(nsw.ns_wq, o2net_nsw_completed(nn, &nsw));
1072 1132
1133 o2net_update_send_stats(&nst, sc);
1134
1073 /* Note that we avoid overwriting the callers status return 1135 /* Note that we avoid overwriting the callers status return
1074 * variable if a system error was reported on the other 1136 * variable if a system error was reported on the other
1075 * side. Callers beware. */ 1137 * side. Callers beware. */
@@ -1183,13 +1245,15 @@ static int o2net_process_message(struct o2net_sock_container *sc,
1183 if (syserr != O2NET_ERR_NONE) 1245 if (syserr != O2NET_ERR_NONE)
1184 goto out_respond; 1246 goto out_respond;
1185 1247
1186 do_gettimeofday(&sc->sc_tv_func_start); 1248 o2net_set_func_start_time(sc);
1187 sc->sc_msg_key = be32_to_cpu(hdr->key); 1249 sc->sc_msg_key = be32_to_cpu(hdr->key);
1188 sc->sc_msg_type = be16_to_cpu(hdr->msg_type); 1250 sc->sc_msg_type = be16_to_cpu(hdr->msg_type);
1189 handler_status = (nmh->nh_func)(hdr, sizeof(struct o2net_msg) + 1251 handler_status = (nmh->nh_func)(hdr, sizeof(struct o2net_msg) +
1190 be16_to_cpu(hdr->data_len), 1252 be16_to_cpu(hdr->data_len),
1191 nmh->nh_func_data, &ret_data); 1253 nmh->nh_func_data, &ret_data);
1192 do_gettimeofday(&sc->sc_tv_func_stop); 1254 o2net_set_func_stop_time(sc);
1255
1256 o2net_update_recv_stats(sc);
1193 1257
1194out_respond: 1258out_respond:
1195 /* this destroys the hdr, so don't use it after this */ 1259 /* this destroys the hdr, so don't use it after this */
@@ -1300,7 +1364,7 @@ static int o2net_advance_rx(struct o2net_sock_container *sc)
1300 size_t datalen; 1364 size_t datalen;
1301 1365
1302 sclog(sc, "receiving\n"); 1366 sclog(sc, "receiving\n");
1303 do_gettimeofday(&sc->sc_tv_advance_start); 1367 o2net_set_advance_start_time(sc);
1304 1368
1305 if (unlikely(sc->sc_handshake_ok == 0)) { 1369 if (unlikely(sc->sc_handshake_ok == 0)) {
1306 if(sc->sc_page_off < sizeof(struct o2net_handshake)) { 1370 if(sc->sc_page_off < sizeof(struct o2net_handshake)) {
@@ -1375,7 +1439,7 @@ static int o2net_advance_rx(struct o2net_sock_container *sc)
1375 1439
1376out: 1440out:
1377 sclog(sc, "ret = %d\n", ret); 1441 sclog(sc, "ret = %d\n", ret);
1378 do_gettimeofday(&sc->sc_tv_advance_stop); 1442 o2net_set_advance_stop_time(sc);
1379 return ret; 1443 return ret;
1380} 1444}
1381 1445
@@ -1475,27 +1539,28 @@ static void o2net_idle_timer(unsigned long data)
1475{ 1539{
1476 struct o2net_sock_container *sc = (struct o2net_sock_container *)data; 1540 struct o2net_sock_container *sc = (struct o2net_sock_container *)data;
1477 struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num); 1541 struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num);
1478 struct timeval now;
1479 1542
1480 do_gettimeofday(&now); 1543#ifdef CONFIG_DEBUG_FS
1544 ktime_t now = ktime_get();
1545#endif
1481 1546
1482 printk(KERN_NOTICE "o2net: connection to " SC_NODEF_FMT " has been idle for %u.%u " 1547 printk(KERN_NOTICE "o2net: connection to " SC_NODEF_FMT " has been idle for %u.%u "
1483 "seconds, shutting it down.\n", SC_NODEF_ARGS(sc), 1548 "seconds, shutting it down.\n", SC_NODEF_ARGS(sc),
1484 o2net_idle_timeout() / 1000, 1549 o2net_idle_timeout() / 1000,
1485 o2net_idle_timeout() % 1000); 1550 o2net_idle_timeout() % 1000);
1486 mlog(ML_NOTICE, "here are some times that might help debug the " 1551
1487 "situation: (tmr %ld.%ld now %ld.%ld dr %ld.%ld adv " 1552#ifdef CONFIG_DEBUG_FS
1488 "%ld.%ld:%ld.%ld func (%08x:%u) %ld.%ld:%ld.%ld)\n", 1553 mlog(ML_NOTICE, "Here are some times that might help debug the "
1489 sc->sc_tv_timer.tv_sec, (long) sc->sc_tv_timer.tv_usec, 1554 "situation: (Timer: %lld, Now %lld, DataReady %lld, Advance %lld-%lld, "
1490 now.tv_sec, (long) now.tv_usec, 1555 "Key 0x%08x, Func %u, FuncTime %lld-%lld)\n",
1491 sc->sc_tv_data_ready.tv_sec, (long) sc->sc_tv_data_ready.tv_usec, 1556 (long long)ktime_to_us(sc->sc_tv_timer), (long long)ktime_to_us(now),
1492 sc->sc_tv_advance_start.tv_sec, 1557 (long long)ktime_to_us(sc->sc_tv_data_ready),
1493 (long) sc->sc_tv_advance_start.tv_usec, 1558 (long long)ktime_to_us(sc->sc_tv_advance_start),
1494 sc->sc_tv_advance_stop.tv_sec, 1559 (long long)ktime_to_us(sc->sc_tv_advance_stop),
1495 (long) sc->sc_tv_advance_stop.tv_usec,
1496 sc->sc_msg_key, sc->sc_msg_type, 1560 sc->sc_msg_key, sc->sc_msg_type,
1497 sc->sc_tv_func_start.tv_sec, (long) sc->sc_tv_func_start.tv_usec, 1561 (long long)ktime_to_us(sc->sc_tv_func_start),
1498 sc->sc_tv_func_stop.tv_sec, (long) sc->sc_tv_func_stop.tv_usec); 1562 (long long)ktime_to_us(sc->sc_tv_func_stop));
1563#endif
1499 1564
1500 /* 1565 /*
1501 * Initialize the nn_timeout so that the next connection attempt 1566 * Initialize the nn_timeout so that the next connection attempt
@@ -1511,7 +1576,7 @@ static void o2net_sc_reset_idle_timer(struct o2net_sock_container *sc)
1511 o2net_sc_cancel_delayed_work(sc, &sc->sc_keepalive_work); 1576 o2net_sc_cancel_delayed_work(sc, &sc->sc_keepalive_work);
1512 o2net_sc_queue_delayed_work(sc, &sc->sc_keepalive_work, 1577 o2net_sc_queue_delayed_work(sc, &sc->sc_keepalive_work,
1513 msecs_to_jiffies(o2net_keepalive_delay())); 1578 msecs_to_jiffies(o2net_keepalive_delay()));
1514 do_gettimeofday(&sc->sc_tv_timer); 1579 o2net_set_sock_timer(sc);
1515 mod_timer(&sc->sc_idle_timeout, 1580 mod_timer(&sc->sc_idle_timeout,
1516 jiffies + msecs_to_jiffies(o2net_idle_timeout())); 1581 jiffies + msecs_to_jiffies(o2net_idle_timeout()));
1517} 1582}
diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h
index 15fdbdf9eb4..4cbcb65784a 100644
--- a/fs/ocfs2/cluster/tcp_internal.h
+++ b/fs/ocfs2/cluster/tcp_internal.h
@@ -166,18 +166,27 @@ struct o2net_sock_container {
166 /* original handlers for the sockets */ 166 /* original handlers for the sockets */
167 void (*sc_state_change)(struct sock *sk); 167 void (*sc_state_change)(struct sock *sk);
168 void (*sc_data_ready)(struct sock *sk, int bytes); 168 void (*sc_data_ready)(struct sock *sk, int bytes);
169#ifdef CONFIG_DEBUG_FS 169
170 struct list_head sc_net_debug_item;
171#endif
172 struct timeval sc_tv_timer;
173 struct timeval sc_tv_data_ready;
174 struct timeval sc_tv_advance_start;
175 struct timeval sc_tv_advance_stop;
176 struct timeval sc_tv_func_start;
177 struct timeval sc_tv_func_stop;
178 u32 sc_msg_key; 170 u32 sc_msg_key;
179 u16 sc_msg_type; 171 u16 sc_msg_type;
180 172
173#ifdef CONFIG_DEBUG_FS
174 struct list_head sc_net_debug_item;
175 ktime_t sc_tv_timer;
176 ktime_t sc_tv_data_ready;
177 ktime_t sc_tv_advance_start;
178 ktime_t sc_tv_advance_stop;
179 ktime_t sc_tv_func_start;
180 ktime_t sc_tv_func_stop;
181#endif
182#ifdef CONFIG_OCFS2_FS_STATS
183 ktime_t sc_tv_acquiry_total;
184 ktime_t sc_tv_send_total;
185 ktime_t sc_tv_status_total;
186 u32 sc_send_count;
187 u32 sc_recv_count;
188 ktime_t sc_tv_process_total;
189#endif
181 struct mutex sc_send_lock; 190 struct mutex sc_send_lock;
182}; 191};
183 192
@@ -220,9 +229,9 @@ struct o2net_send_tracking {
220 u32 st_msg_type; 229 u32 st_msg_type;
221 u32 st_msg_key; 230 u32 st_msg_key;
222 u8 st_node; 231 u8 st_node;
223 struct timeval st_sock_time; 232 ktime_t st_sock_time;
224 struct timeval st_send_time; 233 ktime_t st_send_time;
225 struct timeval st_status_time; 234 ktime_t st_status_time;
226}; 235};
227#else 236#else
228struct o2net_send_tracking { 237struct o2net_send_tracking {
diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c
index edaded48e7e..6d80ecc7834 100644
--- a/fs/ocfs2/dcache.c
+++ b/fs/ocfs2/dcache.c
@@ -52,9 +52,15 @@ void ocfs2_dentry_attach_gen(struct dentry *dentry)
52static int ocfs2_dentry_revalidate(struct dentry *dentry, 52static int ocfs2_dentry_revalidate(struct dentry *dentry,
53 struct nameidata *nd) 53 struct nameidata *nd)
54{ 54{
55 struct inode *inode = dentry->d_inode; 55 struct inode *inode;
56 int ret = 0; /* if all else fails, just return false */ 56 int ret = 0; /* if all else fails, just return false */
57 struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb); 57 struct ocfs2_super *osb;
58
59 if (nd->flags & LOOKUP_RCU)
60 return -ECHILD;
61
62 inode = dentry->d_inode;
63 osb = OCFS2_SB(dentry->d_sb);
58 64
59 mlog_entry("(0x%p, '%.*s')\n", dentry, 65 mlog_entry("(0x%p, '%.*s')\n", dentry,
60 dentry->d_name.len, dentry->d_name.name); 66 dentry->d_name.len, dentry->d_name.name);
@@ -169,23 +175,25 @@ struct dentry *ocfs2_find_local_alias(struct inode *inode,
169 struct list_head *p; 175 struct list_head *p;
170 struct dentry *dentry = NULL; 176 struct dentry *dentry = NULL;
171 177
172 spin_lock(&dcache_lock); 178 spin_lock(&inode->i_lock);
173
174 list_for_each(p, &inode->i_dentry) { 179 list_for_each(p, &inode->i_dentry) {
175 dentry = list_entry(p, struct dentry, d_alias); 180 dentry = list_entry(p, struct dentry, d_alias);
176 181
182 spin_lock(&dentry->d_lock);
177 if (ocfs2_match_dentry(dentry, parent_blkno, skip_unhashed)) { 183 if (ocfs2_match_dentry(dentry, parent_blkno, skip_unhashed)) {
178 mlog(0, "dentry found: %.*s\n", 184 mlog(0, "dentry found: %.*s\n",
179 dentry->d_name.len, dentry->d_name.name); 185 dentry->d_name.len, dentry->d_name.name);
180 186
181 dget_locked(dentry); 187 dget_dlock(dentry);
188 spin_unlock(&dentry->d_lock);
182 break; 189 break;
183 } 190 }
191 spin_unlock(&dentry->d_lock);
184 192
185 dentry = NULL; 193 dentry = NULL;
186 } 194 }
187 195
188 spin_unlock(&dcache_lock); 196 spin_unlock(&inode->i_lock);
189 197
190 return dentry; 198 return dentry;
191} 199}
@@ -476,7 +484,6 @@ static void ocfs2_dentry_iput(struct dentry *dentry, struct inode *inode)
476 484
477out: 485out:
478 iput(inode); 486 iput(inode);
479 ocfs2_dentry_attach_gen(dentry);
480} 487}
481 488
482/* 489/*
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index c49f6de0e7a..d417b3f9b0c 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -2461,8 +2461,10 @@ static int ocfs2_dx_dir_attach_index(struct ocfs2_super *osb,
2461 2461
2462 di->i_dx_root = cpu_to_le64(dr_blkno); 2462 di->i_dx_root = cpu_to_le64(dr_blkno);
2463 2463
2464 spin_lock(&OCFS2_I(dir)->ip_lock);
2464 OCFS2_I(dir)->ip_dyn_features |= OCFS2_INDEXED_DIR_FL; 2465 OCFS2_I(dir)->ip_dyn_features |= OCFS2_INDEXED_DIR_FL;
2465 di->i_dyn_features = cpu_to_le16(OCFS2_I(dir)->ip_dyn_features); 2466 di->i_dyn_features = cpu_to_le16(OCFS2_I(dir)->ip_dyn_features);
2467 spin_unlock(&OCFS2_I(dir)->ip_lock);
2466 2468
2467 ocfs2_journal_dirty(handle, di_bh); 2469 ocfs2_journal_dirty(handle, di_bh);
2468 2470
@@ -4466,8 +4468,10 @@ static int ocfs2_dx_dir_remove_index(struct inode *dir,
4466 goto out_commit; 4468 goto out_commit;
4467 } 4469 }
4468 4470
4471 spin_lock(&OCFS2_I(dir)->ip_lock);
4469 OCFS2_I(dir)->ip_dyn_features &= ~OCFS2_INDEXED_DIR_FL; 4472 OCFS2_I(dir)->ip_dyn_features &= ~OCFS2_INDEXED_DIR_FL;
4470 di->i_dyn_features = cpu_to_le16(OCFS2_I(dir)->ip_dyn_features); 4473 di->i_dyn_features = cpu_to_le16(OCFS2_I(dir)->ip_dyn_features);
4474 spin_unlock(&OCFS2_I(dir)->ip_lock);
4471 di->i_dx_root = cpu_to_le64(0ULL); 4475 di->i_dx_root = cpu_to_le64(0ULL);
4472 4476
4473 ocfs2_journal_dirty(handle, di_bh); 4477 ocfs2_journal_dirty(handle, di_bh);
diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c
index f4499915683..3a3ed4bb794 100644
--- a/fs/ocfs2/dlm/dlmast.c
+++ b/fs/ocfs2/dlm/dlmast.c
@@ -90,19 +90,29 @@ static int dlm_should_cancel_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
90 90
91void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock) 91void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
92{ 92{
93 mlog_entry_void(); 93 struct dlm_lock_resource *res;
94 94
95 BUG_ON(!dlm); 95 BUG_ON(!dlm);
96 BUG_ON(!lock); 96 BUG_ON(!lock);
97 97
98 res = lock->lockres;
99
98 assert_spin_locked(&dlm->ast_lock); 100 assert_spin_locked(&dlm->ast_lock);
101
99 if (!list_empty(&lock->ast_list)) { 102 if (!list_empty(&lock->ast_list)) {
100 mlog(ML_ERROR, "ast list not empty!! pending=%d, newlevel=%d\n", 103 mlog(ML_ERROR, "%s: res %.*s, lock %u:%llu, "
104 "AST list not empty, pending %d, newlevel %d\n",
105 dlm->name, res->lockname.len, res->lockname.name,
106 dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
107 dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
101 lock->ast_pending, lock->ml.type); 108 lock->ast_pending, lock->ml.type);
102 BUG(); 109 BUG();
103 } 110 }
104 if (lock->ast_pending) 111 if (lock->ast_pending)
105 mlog(0, "lock has an ast getting flushed right now\n"); 112 mlog(0, "%s: res %.*s, lock %u:%llu, AST getting flushed\n",
113 dlm->name, res->lockname.len, res->lockname.name,
114 dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
115 dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)));
106 116
107 /* putting lock on list, add a ref */ 117 /* putting lock on list, add a ref */
108 dlm_lock_get(lock); 118 dlm_lock_get(lock);
@@ -110,9 +120,10 @@ void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
110 120
111 /* check to see if this ast obsoletes the bast */ 121 /* check to see if this ast obsoletes the bast */
112 if (dlm_should_cancel_bast(dlm, lock)) { 122 if (dlm_should_cancel_bast(dlm, lock)) {
113 struct dlm_lock_resource *res = lock->lockres; 123 mlog(0, "%s: res %.*s, lock %u:%llu, Cancelling BAST\n",
114 mlog(0, "%s: cancelling bast for %.*s\n", 124 dlm->name, res->lockname.len, res->lockname.name,
115 dlm->name, res->lockname.len, res->lockname.name); 125 dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
126 dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)));
116 lock->bast_pending = 0; 127 lock->bast_pending = 0;
117 list_del_init(&lock->bast_list); 128 list_del_init(&lock->bast_list);
118 lock->ml.highest_blocked = LKM_IVMODE; 129 lock->ml.highest_blocked = LKM_IVMODE;
@@ -134,8 +145,6 @@ void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
134 145
135void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock) 146void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
136{ 147{
137 mlog_entry_void();
138
139 BUG_ON(!dlm); 148 BUG_ON(!dlm);
140 BUG_ON(!lock); 149 BUG_ON(!lock);
141 150
@@ -147,15 +156,21 @@ void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
147 156
148void __dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock) 157void __dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
149{ 158{
150 mlog_entry_void(); 159 struct dlm_lock_resource *res;
151 160
152 BUG_ON(!dlm); 161 BUG_ON(!dlm);
153 BUG_ON(!lock); 162 BUG_ON(!lock);
163
154 assert_spin_locked(&dlm->ast_lock); 164 assert_spin_locked(&dlm->ast_lock);
155 165
166 res = lock->lockres;
167
156 BUG_ON(!list_empty(&lock->bast_list)); 168 BUG_ON(!list_empty(&lock->bast_list));
157 if (lock->bast_pending) 169 if (lock->bast_pending)
158 mlog(0, "lock has a bast getting flushed right now\n"); 170 mlog(0, "%s: res %.*s, lock %u:%llu, BAST getting flushed\n",
171 dlm->name, res->lockname.len, res->lockname.name,
172 dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
173 dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)));
159 174
160 /* putting lock on list, add a ref */ 175 /* putting lock on list, add a ref */
161 dlm_lock_get(lock); 176 dlm_lock_get(lock);
@@ -167,8 +182,6 @@ void __dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
167 182
168void dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock) 183void dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
169{ 184{
170 mlog_entry_void();
171
172 BUG_ON(!dlm); 185 BUG_ON(!dlm);
173 BUG_ON(!lock); 186 BUG_ON(!lock);
174 187
@@ -213,7 +226,10 @@ void dlm_do_local_ast(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
213 dlm_astlockfunc_t *fn; 226 dlm_astlockfunc_t *fn;
214 struct dlm_lockstatus *lksb; 227 struct dlm_lockstatus *lksb;
215 228
216 mlog_entry_void(); 229 mlog(0, "%s: res %.*s, lock %u:%llu, Local AST\n", dlm->name,
230 res->lockname.len, res->lockname.name,
231 dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
232 dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)));
217 233
218 lksb = lock->lksb; 234 lksb = lock->lksb;
219 fn = lock->ast; 235 fn = lock->ast;
@@ -231,7 +247,10 @@ int dlm_do_remote_ast(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
231 struct dlm_lockstatus *lksb; 247 struct dlm_lockstatus *lksb;
232 int lksbflags; 248 int lksbflags;
233 249
234 mlog_entry_void(); 250 mlog(0, "%s: res %.*s, lock %u:%llu, Remote AST\n", dlm->name,
251 res->lockname.len, res->lockname.name,
252 dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
253 dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)));
235 254
236 lksb = lock->lksb; 255 lksb = lock->lksb;
237 BUG_ON(lock->ml.node == dlm->node_num); 256 BUG_ON(lock->ml.node == dlm->node_num);
@@ -250,9 +269,14 @@ void dlm_do_local_bast(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
250{ 269{
251 dlm_bastlockfunc_t *fn = lock->bast; 270 dlm_bastlockfunc_t *fn = lock->bast;
252 271
253 mlog_entry_void();
254 BUG_ON(lock->ml.node != dlm->node_num); 272 BUG_ON(lock->ml.node != dlm->node_num);
255 273
274 mlog(0, "%s: res %.*s, lock %u:%llu, Local BAST, blocked %d\n",
275 dlm->name, res->lockname.len, res->lockname.name,
276 dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
277 dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
278 blocked_type);
279
256 (*fn)(lock->astdata, blocked_type); 280 (*fn)(lock->astdata, blocked_type);
257} 281}
258 282
@@ -332,7 +356,8 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data,
332 /* cannot get a proxy ast message if this node owns it */ 356 /* cannot get a proxy ast message if this node owns it */
333 BUG_ON(res->owner == dlm->node_num); 357 BUG_ON(res->owner == dlm->node_num);
334 358
335 mlog(0, "lockres %.*s\n", res->lockname.len, res->lockname.name); 359 mlog(0, "%s: res %.*s\n", dlm->name, res->lockname.len,
360 res->lockname.name);
336 361
337 spin_lock(&res->spinlock); 362 spin_lock(&res->spinlock);
338 if (res->state & DLM_LOCK_RES_RECOVERING) { 363 if (res->state & DLM_LOCK_RES_RECOVERING) {
@@ -382,8 +407,12 @@ do_ast:
382 if (past->type == DLM_AST) { 407 if (past->type == DLM_AST) {
383 /* do not alter lock refcount. switching lists. */ 408 /* do not alter lock refcount. switching lists. */
384 list_move_tail(&lock->list, &res->granted); 409 list_move_tail(&lock->list, &res->granted);
385 mlog(0, "ast: Adding to granted list... type=%d, " 410 mlog(0, "%s: res %.*s, lock %u:%llu, Granted type %d => %d\n",
386 "convert_type=%d\n", lock->ml.type, lock->ml.convert_type); 411 dlm->name, res->lockname.len, res->lockname.name,
412 dlm_get_lock_cookie_node(be64_to_cpu(cookie)),
413 dlm_get_lock_cookie_seq(be64_to_cpu(cookie)),
414 lock->ml.type, lock->ml.convert_type);
415
387 if (lock->ml.convert_type != LKM_IVMODE) { 416 if (lock->ml.convert_type != LKM_IVMODE) {
388 lock->ml.type = lock->ml.convert_type; 417 lock->ml.type = lock->ml.convert_type;
389 lock->ml.convert_type = LKM_IVMODE; 418 lock->ml.convert_type = LKM_IVMODE;
@@ -426,9 +455,9 @@ int dlm_send_proxy_ast_msg(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
426 size_t veclen = 1; 455 size_t veclen = 1;
427 int status; 456 int status;
428 457
429 mlog_entry("res %.*s, to=%u, type=%d, blocked_type=%d\n", 458 mlog(0, "%s: res %.*s, to %u, type %d, blocked_type %d\n", dlm->name,
430 res->lockname.len, res->lockname.name, lock->ml.node, 459 res->lockname.len, res->lockname.name, lock->ml.node, msg_type,
431 msg_type, blocked_type); 460 blocked_type);
432 461
433 memset(&past, 0, sizeof(struct dlm_proxy_ast)); 462 memset(&past, 0, sizeof(struct dlm_proxy_ast));
434 past.node_idx = dlm->node_num; 463 past.node_idx = dlm->node_num;
@@ -441,7 +470,6 @@ int dlm_send_proxy_ast_msg(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
441 vec[0].iov_len = sizeof(struct dlm_proxy_ast); 470 vec[0].iov_len = sizeof(struct dlm_proxy_ast);
442 vec[0].iov_base = &past; 471 vec[0].iov_base = &past;
443 if (flags & DLM_LKSB_GET_LVB) { 472 if (flags & DLM_LKSB_GET_LVB) {
444 mlog(0, "returning requested LVB data\n");
445 be32_add_cpu(&past.flags, LKM_GET_LVB); 473 be32_add_cpu(&past.flags, LKM_GET_LVB);
446 vec[1].iov_len = DLM_LVB_LEN; 474 vec[1].iov_len = DLM_LVB_LEN;
447 vec[1].iov_base = lock->lksb->lvb; 475 vec[1].iov_base = lock->lksb->lvb;
@@ -451,8 +479,8 @@ int dlm_send_proxy_ast_msg(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
451 ret = o2net_send_message_vec(DLM_PROXY_AST_MSG, dlm->key, vec, veclen, 479 ret = o2net_send_message_vec(DLM_PROXY_AST_MSG, dlm->key, vec, veclen,
452 lock->ml.node, &status); 480 lock->ml.node, &status);
453 if (ret < 0) 481 if (ret < 0)
454 mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to " 482 mlog(ML_ERROR, "%s: res %.*s, error %d send AST to node %u\n",
455 "node %u\n", ret, DLM_PROXY_AST_MSG, dlm->key, 483 dlm->name, res->lockname.len, res->lockname.name, ret,
456 lock->ml.node); 484 lock->ml.node);
457 else { 485 else {
458 if (status == DLM_RECOVERING) { 486 if (status == DLM_RECOVERING) {
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index b36d0bf77a5..4bdf7baee34 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -50,10 +50,10 @@
50#define dlm_lockid_hash(_n, _l) full_name_hash(_n, _l) 50#define dlm_lockid_hash(_n, _l) full_name_hash(_n, _l)
51 51
52enum dlm_mle_type { 52enum dlm_mle_type {
53 DLM_MLE_BLOCK, 53 DLM_MLE_BLOCK = 0,
54 DLM_MLE_MASTER, 54 DLM_MLE_MASTER = 1,
55 DLM_MLE_MIGRATION, 55 DLM_MLE_MIGRATION = 2,
56 DLM_MLE_NUM_TYPES 56 DLM_MLE_NUM_TYPES = 3,
57}; 57};
58 58
59struct dlm_master_list_entry { 59struct dlm_master_list_entry {
@@ -82,8 +82,8 @@ struct dlm_master_list_entry {
82 82
83enum dlm_ast_type { 83enum dlm_ast_type {
84 DLM_AST = 0, 84 DLM_AST = 0,
85 DLM_BAST, 85 DLM_BAST = 1,
86 DLM_ASTUNLOCK 86 DLM_ASTUNLOCK = 2,
87}; 87};
88 88
89 89
@@ -119,9 +119,9 @@ struct dlm_recovery_ctxt
119 119
120enum dlm_ctxt_state { 120enum dlm_ctxt_state {
121 DLM_CTXT_NEW = 0, 121 DLM_CTXT_NEW = 0,
122 DLM_CTXT_JOINED, 122 DLM_CTXT_JOINED = 1,
123 DLM_CTXT_IN_SHUTDOWN, 123 DLM_CTXT_IN_SHUTDOWN = 2,
124 DLM_CTXT_LEAVING, 124 DLM_CTXT_LEAVING = 3,
125}; 125};
126 126
127struct dlm_ctxt 127struct dlm_ctxt
@@ -388,8 +388,8 @@ struct dlm_lock
388 388
389enum dlm_lockres_list { 389enum dlm_lockres_list {
390 DLM_GRANTED_LIST = 0, 390 DLM_GRANTED_LIST = 0,
391 DLM_CONVERTING_LIST, 391 DLM_CONVERTING_LIST = 1,
392 DLM_BLOCKED_LIST 392 DLM_BLOCKED_LIST = 2,
393}; 393};
394 394
395static inline int dlm_lvb_is_empty(char *lvb) 395static inline int dlm_lvb_is_empty(char *lvb)
@@ -427,27 +427,27 @@ struct dlm_node_iter
427 427
428 428
429enum { 429enum {
430 DLM_MASTER_REQUEST_MSG = 500, 430 DLM_MASTER_REQUEST_MSG = 500,
431 DLM_UNUSED_MSG1, /* 501 */ 431 DLM_UNUSED_MSG1 = 501,
432 DLM_ASSERT_MASTER_MSG, /* 502 */ 432 DLM_ASSERT_MASTER_MSG = 502,
433 DLM_CREATE_LOCK_MSG, /* 503 */ 433 DLM_CREATE_LOCK_MSG = 503,
434 DLM_CONVERT_LOCK_MSG, /* 504 */ 434 DLM_CONVERT_LOCK_MSG = 504,
435 DLM_PROXY_AST_MSG, /* 505 */ 435 DLM_PROXY_AST_MSG = 505,
436 DLM_UNLOCK_LOCK_MSG, /* 506 */ 436 DLM_UNLOCK_LOCK_MSG = 506,
437 DLM_DEREF_LOCKRES_MSG, /* 507 */ 437 DLM_DEREF_LOCKRES_MSG = 507,
438 DLM_MIGRATE_REQUEST_MSG, /* 508 */ 438 DLM_MIGRATE_REQUEST_MSG = 508,
439 DLM_MIG_LOCKRES_MSG, /* 509 */ 439 DLM_MIG_LOCKRES_MSG = 509,
440 DLM_QUERY_JOIN_MSG, /* 510 */ 440 DLM_QUERY_JOIN_MSG = 510,
441 DLM_ASSERT_JOINED_MSG, /* 511 */ 441 DLM_ASSERT_JOINED_MSG = 511,
442 DLM_CANCEL_JOIN_MSG, /* 512 */ 442 DLM_CANCEL_JOIN_MSG = 512,
443 DLM_EXIT_DOMAIN_MSG, /* 513 */ 443 DLM_EXIT_DOMAIN_MSG = 513,
444 DLM_MASTER_REQUERY_MSG, /* 514 */ 444 DLM_MASTER_REQUERY_MSG = 514,
445 DLM_LOCK_REQUEST_MSG, /* 515 */ 445 DLM_LOCK_REQUEST_MSG = 515,
446 DLM_RECO_DATA_DONE_MSG, /* 516 */ 446 DLM_RECO_DATA_DONE_MSG = 516,
447 DLM_BEGIN_RECO_MSG, /* 517 */ 447 DLM_BEGIN_RECO_MSG = 517,
448 DLM_FINALIZE_RECO_MSG, /* 518 */ 448 DLM_FINALIZE_RECO_MSG = 518,
449 DLM_QUERY_REGION, /* 519 */ 449 DLM_QUERY_REGION = 519,
450 DLM_QUERY_NODEINFO, /* 520 */ 450 DLM_QUERY_NODEINFO = 520,
451}; 451};
452 452
453struct dlm_reco_node_data 453struct dlm_reco_node_data
@@ -460,19 +460,19 @@ struct dlm_reco_node_data
460enum { 460enum {
461 DLM_RECO_NODE_DATA_DEAD = -1, 461 DLM_RECO_NODE_DATA_DEAD = -1,
462 DLM_RECO_NODE_DATA_INIT = 0, 462 DLM_RECO_NODE_DATA_INIT = 0,
463 DLM_RECO_NODE_DATA_REQUESTING, 463 DLM_RECO_NODE_DATA_REQUESTING = 1,
464 DLM_RECO_NODE_DATA_REQUESTED, 464 DLM_RECO_NODE_DATA_REQUESTED = 2,
465 DLM_RECO_NODE_DATA_RECEIVING, 465 DLM_RECO_NODE_DATA_RECEIVING = 3,
466 DLM_RECO_NODE_DATA_DONE, 466 DLM_RECO_NODE_DATA_DONE = 4,
467 DLM_RECO_NODE_DATA_FINALIZE_SENT, 467 DLM_RECO_NODE_DATA_FINALIZE_SENT = 5,
468}; 468};
469 469
470 470
471enum { 471enum {
472 DLM_MASTER_RESP_NO = 0, 472 DLM_MASTER_RESP_NO = 0,
473 DLM_MASTER_RESP_YES, 473 DLM_MASTER_RESP_YES = 1,
474 DLM_MASTER_RESP_MAYBE, 474 DLM_MASTER_RESP_MAYBE = 2,
475 DLM_MASTER_RESP_ERROR 475 DLM_MASTER_RESP_ERROR = 3,
476}; 476};
477 477
478 478
@@ -649,9 +649,9 @@ struct dlm_proxy_ast
649#define DLM_MOD_KEY (0x666c6172) 649#define DLM_MOD_KEY (0x666c6172)
650enum dlm_query_join_response_code { 650enum dlm_query_join_response_code {
651 JOIN_DISALLOW = 0, 651 JOIN_DISALLOW = 0,
652 JOIN_OK, 652 JOIN_OK = 1,
653 JOIN_OK_NO_MAP, 653 JOIN_OK_NO_MAP = 2,
654 JOIN_PROTOCOL_MISMATCH, 654 JOIN_PROTOCOL_MISMATCH = 3,
655}; 655};
656 656
657struct dlm_query_join_packet { 657struct dlm_query_join_packet {
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index 272ec8631a5..04a32be0aeb 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -370,92 +370,46 @@ static void dlm_debug_get(struct dlm_debug_ctxt *dc)
370 kref_get(&dc->debug_refcnt); 370 kref_get(&dc->debug_refcnt);
371} 371}
372 372
373static struct debug_buffer *debug_buffer_allocate(void) 373static int debug_release(struct inode *inode, struct file *file)
374{ 374{
375 struct debug_buffer *db = NULL; 375 free_page((unsigned long)file->private_data);
376 376 return 0;
377 db = kzalloc(sizeof(struct debug_buffer), GFP_KERNEL);
378 if (!db)
379 goto bail;
380
381 db->len = PAGE_SIZE;
382 db->buf = kmalloc(db->len, GFP_KERNEL);
383 if (!db->buf)
384 goto bail;
385
386 return db;
387bail:
388 kfree(db);
389 return NULL;
390}
391
392static ssize_t debug_buffer_read(struct file *file, char __user *buf,
393 size_t nbytes, loff_t *ppos)
394{
395 struct debug_buffer *db = file->private_data;
396
397 return simple_read_from_buffer(buf, nbytes, ppos, db->buf, db->len);
398}
399
400static loff_t debug_buffer_llseek(struct file *file, loff_t off, int whence)
401{
402 struct debug_buffer *db = file->private_data;
403 loff_t new = -1;
404
405 switch (whence) {
406 case 0:
407 new = off;
408 break;
409 case 1:
410 new = file->f_pos + off;
411 break;
412 }
413
414 if (new < 0 || new > db->len)
415 return -EINVAL;
416
417 return (file->f_pos = new);
418} 377}
419 378
420static int debug_buffer_release(struct inode *inode, struct file *file) 379static ssize_t debug_read(struct file *file, char __user *buf,
380 size_t nbytes, loff_t *ppos)
421{ 381{
422 struct debug_buffer *db = file->private_data; 382 return simple_read_from_buffer(buf, nbytes, ppos, file->private_data,
423 383 i_size_read(file->f_mapping->host));
424 if (db)
425 kfree(db->buf);
426 kfree(db);
427
428 return 0;
429} 384}
430/* end - util funcs */ 385/* end - util funcs */
431 386
432/* begin - purge list funcs */ 387/* begin - purge list funcs */
433static int debug_purgelist_print(struct dlm_ctxt *dlm, struct debug_buffer *db) 388static int debug_purgelist_print(struct dlm_ctxt *dlm, char *buf, int len)
434{ 389{
435 struct dlm_lock_resource *res; 390 struct dlm_lock_resource *res;
436 int out = 0; 391 int out = 0;
437 unsigned long total = 0; 392 unsigned long total = 0;
438 393
439 out += snprintf(db->buf + out, db->len - out, 394 out += snprintf(buf + out, len - out,
440 "Dumping Purgelist for Domain: %s\n", dlm->name); 395 "Dumping Purgelist for Domain: %s\n", dlm->name);
441 396
442 spin_lock(&dlm->spinlock); 397 spin_lock(&dlm->spinlock);
443 list_for_each_entry(res, &dlm->purge_list, purge) { 398 list_for_each_entry(res, &dlm->purge_list, purge) {
444 ++total; 399 ++total;
445 if (db->len - out < 100) 400 if (len - out < 100)
446 continue; 401 continue;
447 spin_lock(&res->spinlock); 402 spin_lock(&res->spinlock);
448 out += stringify_lockname(res->lockname.name, 403 out += stringify_lockname(res->lockname.name,
449 res->lockname.len, 404 res->lockname.len,
450 db->buf + out, db->len - out); 405 buf + out, len - out);
451 out += snprintf(db->buf + out, db->len - out, "\t%ld\n", 406 out += snprintf(buf + out, len - out, "\t%ld\n",
452 (jiffies - res->last_used)/HZ); 407 (jiffies - res->last_used)/HZ);
453 spin_unlock(&res->spinlock); 408 spin_unlock(&res->spinlock);
454 } 409 }
455 spin_unlock(&dlm->spinlock); 410 spin_unlock(&dlm->spinlock);
456 411
457 out += snprintf(db->buf + out, db->len - out, 412 out += snprintf(buf + out, len - out, "Total on list: %ld\n", total);
458 "Total on list: %ld\n", total);
459 413
460 return out; 414 return out;
461} 415}
@@ -463,15 +417,15 @@ static int debug_purgelist_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
463static int debug_purgelist_open(struct inode *inode, struct file *file) 417static int debug_purgelist_open(struct inode *inode, struct file *file)
464{ 418{
465 struct dlm_ctxt *dlm = inode->i_private; 419 struct dlm_ctxt *dlm = inode->i_private;
466 struct debug_buffer *db; 420 char *buf = NULL;
467 421
468 db = debug_buffer_allocate(); 422 buf = (char *) get_zeroed_page(GFP_NOFS);
469 if (!db) 423 if (!buf)
470 goto bail; 424 goto bail;
471 425
472 db->len = debug_purgelist_print(dlm, db); 426 i_size_write(inode, debug_purgelist_print(dlm, buf, PAGE_SIZE - 1));
473 427
474 file->private_data = db; 428 file->private_data = buf;
475 429
476 return 0; 430 return 0;
477bail: 431bail:
@@ -480,14 +434,14 @@ bail:
480 434
481static const struct file_operations debug_purgelist_fops = { 435static const struct file_operations debug_purgelist_fops = {
482 .open = debug_purgelist_open, 436 .open = debug_purgelist_open,
483 .release = debug_buffer_release, 437 .release = debug_release,
484 .read = debug_buffer_read, 438 .read = debug_read,
485 .llseek = debug_buffer_llseek, 439 .llseek = generic_file_llseek,
486}; 440};
487/* end - purge list funcs */ 441/* end - purge list funcs */
488 442
489/* begin - debug mle funcs */ 443/* begin - debug mle funcs */
490static int debug_mle_print(struct dlm_ctxt *dlm, struct debug_buffer *db) 444static int debug_mle_print(struct dlm_ctxt *dlm, char *buf, int len)
491{ 445{
492 struct dlm_master_list_entry *mle; 446 struct dlm_master_list_entry *mle;
493 struct hlist_head *bucket; 447 struct hlist_head *bucket;
@@ -495,7 +449,7 @@ static int debug_mle_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
495 int i, out = 0; 449 int i, out = 0;
496 unsigned long total = 0, longest = 0, bucket_count = 0; 450 unsigned long total = 0, longest = 0, bucket_count = 0;
497 451
498 out += snprintf(db->buf + out, db->len - out, 452 out += snprintf(buf + out, len - out,
499 "Dumping MLEs for Domain: %s\n", dlm->name); 453 "Dumping MLEs for Domain: %s\n", dlm->name);
500 454
501 spin_lock(&dlm->master_lock); 455 spin_lock(&dlm->master_lock);
@@ -506,16 +460,16 @@ static int debug_mle_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
506 master_hash_node); 460 master_hash_node);
507 ++total; 461 ++total;
508 ++bucket_count; 462 ++bucket_count;
509 if (db->len - out < 200) 463 if (len - out < 200)
510 continue; 464 continue;
511 out += dump_mle(mle, db->buf + out, db->len - out); 465 out += dump_mle(mle, buf + out, len - out);
512 } 466 }
513 longest = max(longest, bucket_count); 467 longest = max(longest, bucket_count);
514 bucket_count = 0; 468 bucket_count = 0;
515 } 469 }
516 spin_unlock(&dlm->master_lock); 470 spin_unlock(&dlm->master_lock);
517 471
518 out += snprintf(db->buf + out, db->len - out, 472 out += snprintf(buf + out, len - out,
519 "Total: %ld, Longest: %ld\n", total, longest); 473 "Total: %ld, Longest: %ld\n", total, longest);
520 return out; 474 return out;
521} 475}
@@ -523,15 +477,15 @@ static int debug_mle_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
523static int debug_mle_open(struct inode *inode, struct file *file) 477static int debug_mle_open(struct inode *inode, struct file *file)
524{ 478{
525 struct dlm_ctxt *dlm = inode->i_private; 479 struct dlm_ctxt *dlm = inode->i_private;
526 struct debug_buffer *db; 480 char *buf = NULL;
527 481
528 db = debug_buffer_allocate(); 482 buf = (char *) get_zeroed_page(GFP_NOFS);
529 if (!db) 483 if (!buf)
530 goto bail; 484 goto bail;
531 485
532 db->len = debug_mle_print(dlm, db); 486 i_size_write(inode, debug_mle_print(dlm, buf, PAGE_SIZE - 1));
533 487
534 file->private_data = db; 488 file->private_data = buf;
535 489
536 return 0; 490 return 0;
537bail: 491bail:
@@ -540,9 +494,9 @@ bail:
540 494
541static const struct file_operations debug_mle_fops = { 495static const struct file_operations debug_mle_fops = {
542 .open = debug_mle_open, 496 .open = debug_mle_open,
543 .release = debug_buffer_release, 497 .release = debug_release,
544 .read = debug_buffer_read, 498 .read = debug_read,
545 .llseek = debug_buffer_llseek, 499 .llseek = generic_file_llseek,
546}; 500};
547 501
548/* end - debug mle funcs */ 502/* end - debug mle funcs */
@@ -757,7 +711,7 @@ static const struct file_operations debug_lockres_fops = {
757/* end - debug lockres funcs */ 711/* end - debug lockres funcs */
758 712
759/* begin - debug state funcs */ 713/* begin - debug state funcs */
760static int debug_state_print(struct dlm_ctxt *dlm, struct debug_buffer *db) 714static int debug_state_print(struct dlm_ctxt *dlm, char *buf, int len)
761{ 715{
762 int out = 0; 716 int out = 0;
763 struct dlm_reco_node_data *node; 717 struct dlm_reco_node_data *node;
@@ -781,35 +735,35 @@ static int debug_state_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
781 } 735 }
782 736
783 /* Domain: xxxxxxxxxx Key: 0xdfbac769 */ 737 /* Domain: xxxxxxxxxx Key: 0xdfbac769 */
784 out += snprintf(db->buf + out, db->len - out, 738 out += snprintf(buf + out, len - out,
785 "Domain: %s Key: 0x%08x Protocol: %d.%d\n", 739 "Domain: %s Key: 0x%08x Protocol: %d.%d\n",
786 dlm->name, dlm->key, dlm->dlm_locking_proto.pv_major, 740 dlm->name, dlm->key, dlm->dlm_locking_proto.pv_major,
787 dlm->dlm_locking_proto.pv_minor); 741 dlm->dlm_locking_proto.pv_minor);
788 742
789 /* Thread Pid: xxx Node: xxx State: xxxxx */ 743 /* Thread Pid: xxx Node: xxx State: xxxxx */
790 out += snprintf(db->buf + out, db->len - out, 744 out += snprintf(buf + out, len - out,
791 "Thread Pid: %d Node: %d State: %s\n", 745 "Thread Pid: %d Node: %d State: %s\n",
792 dlm->dlm_thread_task->pid, dlm->node_num, state); 746 task_pid_nr(dlm->dlm_thread_task), dlm->node_num, state);
793 747
794 /* Number of Joins: xxx Joining Node: xxx */ 748 /* Number of Joins: xxx Joining Node: xxx */
795 out += snprintf(db->buf + out, db->len - out, 749 out += snprintf(buf + out, len - out,
796 "Number of Joins: %d Joining Node: %d\n", 750 "Number of Joins: %d Joining Node: %d\n",
797 dlm->num_joins, dlm->joining_node); 751 dlm->num_joins, dlm->joining_node);
798 752
799 /* Domain Map: xx xx xx */ 753 /* Domain Map: xx xx xx */
800 out += snprintf(db->buf + out, db->len - out, "Domain Map: "); 754 out += snprintf(buf + out, len - out, "Domain Map: ");
801 out += stringify_nodemap(dlm->domain_map, O2NM_MAX_NODES, 755 out += stringify_nodemap(dlm->domain_map, O2NM_MAX_NODES,
802 db->buf + out, db->len - out); 756 buf + out, len - out);
803 out += snprintf(db->buf + out, db->len - out, "\n"); 757 out += snprintf(buf + out, len - out, "\n");
804 758
805 /* Live Map: xx xx xx */ 759 /* Live Map: xx xx xx */
806 out += snprintf(db->buf + out, db->len - out, "Live Map: "); 760 out += snprintf(buf + out, len - out, "Live Map: ");
807 out += stringify_nodemap(dlm->live_nodes_map, O2NM_MAX_NODES, 761 out += stringify_nodemap(dlm->live_nodes_map, O2NM_MAX_NODES,
808 db->buf + out, db->len - out); 762 buf + out, len - out);
809 out += snprintf(db->buf + out, db->len - out, "\n"); 763 out += snprintf(buf + out, len - out, "\n");
810 764
811 /* Lock Resources: xxx (xxx) */ 765 /* Lock Resources: xxx (xxx) */
812 out += snprintf(db->buf + out, db->len - out, 766 out += snprintf(buf + out, len - out,
813 "Lock Resources: %d (%d)\n", 767 "Lock Resources: %d (%d)\n",
814 atomic_read(&dlm->res_cur_count), 768 atomic_read(&dlm->res_cur_count),
815 atomic_read(&dlm->res_tot_count)); 769 atomic_read(&dlm->res_tot_count));
@@ -821,29 +775,29 @@ static int debug_state_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
821 cur_mles += atomic_read(&dlm->mle_cur_count[i]); 775 cur_mles += atomic_read(&dlm->mle_cur_count[i]);
822 776
823 /* MLEs: xxx (xxx) */ 777 /* MLEs: xxx (xxx) */
824 out += snprintf(db->buf + out, db->len - out, 778 out += snprintf(buf + out, len - out,
825 "MLEs: %d (%d)\n", cur_mles, tot_mles); 779 "MLEs: %d (%d)\n", cur_mles, tot_mles);
826 780
827 /* Blocking: xxx (xxx) */ 781 /* Blocking: xxx (xxx) */
828 out += snprintf(db->buf + out, db->len - out, 782 out += snprintf(buf + out, len - out,
829 " Blocking: %d (%d)\n", 783 " Blocking: %d (%d)\n",
830 atomic_read(&dlm->mle_cur_count[DLM_MLE_BLOCK]), 784 atomic_read(&dlm->mle_cur_count[DLM_MLE_BLOCK]),
831 atomic_read(&dlm->mle_tot_count[DLM_MLE_BLOCK])); 785 atomic_read(&dlm->mle_tot_count[DLM_MLE_BLOCK]));
832 786
833 /* Mastery: xxx (xxx) */ 787 /* Mastery: xxx (xxx) */
834 out += snprintf(db->buf + out, db->len - out, 788 out += snprintf(buf + out, len - out,
835 " Mastery: %d (%d)\n", 789 " Mastery: %d (%d)\n",
836 atomic_read(&dlm->mle_cur_count[DLM_MLE_MASTER]), 790 atomic_read(&dlm->mle_cur_count[DLM_MLE_MASTER]),
837 atomic_read(&dlm->mle_tot_count[DLM_MLE_MASTER])); 791 atomic_read(&dlm->mle_tot_count[DLM_MLE_MASTER]));
838 792
839 /* Migration: xxx (xxx) */ 793 /* Migration: xxx (xxx) */
840 out += snprintf(db->buf + out, db->len - out, 794 out += snprintf(buf + out, len - out,
841 " Migration: %d (%d)\n", 795 " Migration: %d (%d)\n",
842 atomic_read(&dlm->mle_cur_count[DLM_MLE_MIGRATION]), 796 atomic_read(&dlm->mle_cur_count[DLM_MLE_MIGRATION]),
843 atomic_read(&dlm->mle_tot_count[DLM_MLE_MIGRATION])); 797 atomic_read(&dlm->mle_tot_count[DLM_MLE_MIGRATION]));
844 798
845 /* Lists: Dirty=Empty Purge=InUse PendingASTs=Empty ... */ 799 /* Lists: Dirty=Empty Purge=InUse PendingASTs=Empty ... */
846 out += snprintf(db->buf + out, db->len - out, 800 out += snprintf(buf + out, len - out,
847 "Lists: Dirty=%s Purge=%s PendingASTs=%s " 801 "Lists: Dirty=%s Purge=%s PendingASTs=%s "
848 "PendingBASTs=%s\n", 802 "PendingBASTs=%s\n",
849 (list_empty(&dlm->dirty_list) ? "Empty" : "InUse"), 803 (list_empty(&dlm->dirty_list) ? "Empty" : "InUse"),
@@ -852,12 +806,12 @@ static int debug_state_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
852 (list_empty(&dlm->pending_basts) ? "Empty" : "InUse")); 806 (list_empty(&dlm->pending_basts) ? "Empty" : "InUse"));
853 807
854 /* Purge Count: xxx Refs: xxx */ 808 /* Purge Count: xxx Refs: xxx */
855 out += snprintf(db->buf + out, db->len - out, 809 out += snprintf(buf + out, len - out,
856 "Purge Count: %d Refs: %d\n", dlm->purge_count, 810 "Purge Count: %d Refs: %d\n", dlm->purge_count,
857 atomic_read(&dlm->dlm_refs.refcount)); 811 atomic_read(&dlm->dlm_refs.refcount));
858 812
859 /* Dead Node: xxx */ 813 /* Dead Node: xxx */
860 out += snprintf(db->buf + out, db->len - out, 814 out += snprintf(buf + out, len - out,
861 "Dead Node: %d\n", dlm->reco.dead_node); 815 "Dead Node: %d\n", dlm->reco.dead_node);
862 816
863 /* What about DLM_RECO_STATE_FINALIZE? */ 817 /* What about DLM_RECO_STATE_FINALIZE? */
@@ -867,19 +821,19 @@ static int debug_state_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
867 state = "INACTIVE"; 821 state = "INACTIVE";
868 822
869 /* Recovery Pid: xxxx Master: xxx State: xxxx */ 823 /* Recovery Pid: xxxx Master: xxx State: xxxx */
870 out += snprintf(db->buf + out, db->len - out, 824 out += snprintf(buf + out, len - out,
871 "Recovery Pid: %d Master: %d State: %s\n", 825 "Recovery Pid: %d Master: %d State: %s\n",
872 dlm->dlm_reco_thread_task->pid, 826 task_pid_nr(dlm->dlm_reco_thread_task),
873 dlm->reco.new_master, state); 827 dlm->reco.new_master, state);
874 828
875 /* Recovery Map: xx xx */ 829 /* Recovery Map: xx xx */
876 out += snprintf(db->buf + out, db->len - out, "Recovery Map: "); 830 out += snprintf(buf + out, len - out, "Recovery Map: ");
877 out += stringify_nodemap(dlm->recovery_map, O2NM_MAX_NODES, 831 out += stringify_nodemap(dlm->recovery_map, O2NM_MAX_NODES,
878 db->buf + out, db->len - out); 832 buf + out, len - out);
879 out += snprintf(db->buf + out, db->len - out, "\n"); 833 out += snprintf(buf + out, len - out, "\n");
880 834
881 /* Recovery Node State: */ 835 /* Recovery Node State: */
882 out += snprintf(db->buf + out, db->len - out, "Recovery Node State:\n"); 836 out += snprintf(buf + out, len - out, "Recovery Node State:\n");
883 list_for_each_entry(node, &dlm->reco.node_data, list) { 837 list_for_each_entry(node, &dlm->reco.node_data, list) {
884 switch (node->state) { 838 switch (node->state) {
885 case DLM_RECO_NODE_DATA_INIT: 839 case DLM_RECO_NODE_DATA_INIT:
@@ -907,7 +861,7 @@ static int debug_state_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
907 state = "BAD"; 861 state = "BAD";
908 break; 862 break;
909 } 863 }
910 out += snprintf(db->buf + out, db->len - out, "\t%u - %s\n", 864 out += snprintf(buf + out, len - out, "\t%u - %s\n",
911 node->node_num, state); 865 node->node_num, state);
912 } 866 }
913 867
@@ -919,15 +873,15 @@ static int debug_state_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
919static int debug_state_open(struct inode *inode, struct file *file) 873static int debug_state_open(struct inode *inode, struct file *file)
920{ 874{
921 struct dlm_ctxt *dlm = inode->i_private; 875 struct dlm_ctxt *dlm = inode->i_private;
922 struct debug_buffer *db = NULL; 876 char *buf = NULL;
923 877
924 db = debug_buffer_allocate(); 878 buf = (char *) get_zeroed_page(GFP_NOFS);
925 if (!db) 879 if (!buf)
926 goto bail; 880 goto bail;
927 881
928 db->len = debug_state_print(dlm, db); 882 i_size_write(inode, debug_state_print(dlm, buf, PAGE_SIZE - 1));
929 883
930 file->private_data = db; 884 file->private_data = buf;
931 885
932 return 0; 886 return 0;
933bail: 887bail:
@@ -936,9 +890,9 @@ bail:
936 890
937static const struct file_operations debug_state_fops = { 891static const struct file_operations debug_state_fops = {
938 .open = debug_state_open, 892 .open = debug_state_open,
939 .release = debug_buffer_release, 893 .release = debug_release,
940 .read = debug_buffer_read, 894 .read = debug_read,
941 .llseek = debug_buffer_llseek, 895 .llseek = generic_file_llseek,
942}; 896};
943/* end - debug state funcs */ 897/* end - debug state funcs */
944 898
@@ -1002,14 +956,10 @@ void dlm_debug_shutdown(struct dlm_ctxt *dlm)
1002 struct dlm_debug_ctxt *dc = dlm->dlm_debug_ctxt; 956 struct dlm_debug_ctxt *dc = dlm->dlm_debug_ctxt;
1003 957
1004 if (dc) { 958 if (dc) {
1005 if (dc->debug_purgelist_dentry) 959 debugfs_remove(dc->debug_purgelist_dentry);
1006 debugfs_remove(dc->debug_purgelist_dentry); 960 debugfs_remove(dc->debug_mle_dentry);
1007 if (dc->debug_mle_dentry) 961 debugfs_remove(dc->debug_lockres_dentry);
1008 debugfs_remove(dc->debug_mle_dentry); 962 debugfs_remove(dc->debug_state_dentry);
1009 if (dc->debug_lockres_dentry)
1010 debugfs_remove(dc->debug_lockres_dentry);
1011 if (dc->debug_state_dentry)
1012 debugfs_remove(dc->debug_state_dentry);
1013 dlm_debug_put(dc); 963 dlm_debug_put(dc);
1014 } 964 }
1015} 965}
@@ -1040,8 +990,7 @@ bail:
1040 990
1041void dlm_destroy_debugfs_subroot(struct dlm_ctxt *dlm) 991void dlm_destroy_debugfs_subroot(struct dlm_ctxt *dlm)
1042{ 992{
1043 if (dlm->dlm_debugfs_subroot) 993 debugfs_remove(dlm->dlm_debugfs_subroot);
1044 debugfs_remove(dlm->dlm_debugfs_subroot);
1045} 994}
1046 995
1047/* debugfs root */ 996/* debugfs root */
@@ -1057,7 +1006,6 @@ int dlm_create_debugfs_root(void)
1057 1006
1058void dlm_destroy_debugfs_root(void) 1007void dlm_destroy_debugfs_root(void)
1059{ 1008{
1060 if (dlm_debugfs_root) 1009 debugfs_remove(dlm_debugfs_root);
1061 debugfs_remove(dlm_debugfs_root);
1062} 1010}
1063#endif /* CONFIG_DEBUG_FS */ 1011#endif /* CONFIG_DEBUG_FS */
diff --git a/fs/ocfs2/dlm/dlmdebug.h b/fs/ocfs2/dlm/dlmdebug.h
index 8c686d22f9c..1f27c4812d1 100644
--- a/fs/ocfs2/dlm/dlmdebug.h
+++ b/fs/ocfs2/dlm/dlmdebug.h
@@ -37,11 +37,6 @@ struct dlm_debug_ctxt {
37 struct dentry *debug_purgelist_dentry; 37 struct dentry *debug_purgelist_dentry;
38}; 38};
39 39
40struct debug_buffer {
41 int len;
42 char *buf;
43};
44
45struct debug_lockres { 40struct debug_lockres {
46 int dl_len; 41 int dl_len;
47 char *dl_buf; 42 char *dl_buf;
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 58a93b95373..7e38a072d72 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -460,8 +460,6 @@ redo_bucket:
460 } 460 }
461 cond_resched_lock(&dlm->spinlock); 461 cond_resched_lock(&dlm->spinlock);
462 num += n; 462 num += n;
463 mlog(0, "%s: touched %d lockreses in bucket %d "
464 "(tot=%d)\n", dlm->name, n, i, num);
465 } 463 }
466 spin_unlock(&dlm->spinlock); 464 spin_unlock(&dlm->spinlock);
467 wake_up(&dlm->dlm_thread_wq); 465 wake_up(&dlm->dlm_thread_wq);
@@ -959,7 +957,7 @@ static int dlm_match_regions(struct dlm_ctxt *dlm,
959 r += O2HB_MAX_REGION_NAME_LEN; 957 r += O2HB_MAX_REGION_NAME_LEN;
960 } 958 }
961 959
962 local = kmalloc(sizeof(qr->qr_regions), GFP_KERNEL); 960 local = kmalloc(sizeof(qr->qr_regions), GFP_ATOMIC);
963 if (!local) { 961 if (!local) {
964 status = -ENOMEM; 962 status = -ENOMEM;
965 goto bail; 963 goto bail;
@@ -1661,8 +1659,8 @@ bail:
1661 1659
1662static void dlm_unregister_domain_handlers(struct dlm_ctxt *dlm) 1660static void dlm_unregister_domain_handlers(struct dlm_ctxt *dlm)
1663{ 1661{
1664 o2hb_unregister_callback(NULL, &dlm->dlm_hb_up); 1662 o2hb_unregister_callback(dlm->name, &dlm->dlm_hb_up);
1665 o2hb_unregister_callback(NULL, &dlm->dlm_hb_down); 1663 o2hb_unregister_callback(dlm->name, &dlm->dlm_hb_down);
1666 o2net_unregister_handler_list(&dlm->dlm_domain_handlers); 1664 o2net_unregister_handler_list(&dlm->dlm_domain_handlers);
1667} 1665}
1668 1666
@@ -1674,13 +1672,13 @@ static int dlm_register_domain_handlers(struct dlm_ctxt *dlm)
1674 1672
1675 o2hb_setup_callback(&dlm->dlm_hb_down, O2HB_NODE_DOWN_CB, 1673 o2hb_setup_callback(&dlm->dlm_hb_down, O2HB_NODE_DOWN_CB,
1676 dlm_hb_node_down_cb, dlm, DLM_HB_NODE_DOWN_PRI); 1674 dlm_hb_node_down_cb, dlm, DLM_HB_NODE_DOWN_PRI);
1677 status = o2hb_register_callback(NULL, &dlm->dlm_hb_down); 1675 status = o2hb_register_callback(dlm->name, &dlm->dlm_hb_down);
1678 if (status) 1676 if (status)
1679 goto bail; 1677 goto bail;
1680 1678
1681 o2hb_setup_callback(&dlm->dlm_hb_up, O2HB_NODE_UP_CB, 1679 o2hb_setup_callback(&dlm->dlm_hb_up, O2HB_NODE_UP_CB,
1682 dlm_hb_node_up_cb, dlm, DLM_HB_NODE_UP_PRI); 1680 dlm_hb_node_up_cb, dlm, DLM_HB_NODE_UP_PRI);
1683 status = o2hb_register_callback(NULL, &dlm->dlm_hb_up); 1681 status = o2hb_register_callback(dlm->name, &dlm->dlm_hb_up);
1684 if (status) 1682 if (status)
1685 goto bail; 1683 goto bail;
1686 1684
diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c
index 69cf369961c..7009292aac5 100644
--- a/fs/ocfs2/dlm/dlmlock.c
+++ b/fs/ocfs2/dlm/dlmlock.c
@@ -106,6 +106,9 @@ static int dlm_can_grant_new_lock(struct dlm_lock_resource *res,
106 106
107 if (!dlm_lock_compatible(tmplock->ml.type, lock->ml.type)) 107 if (!dlm_lock_compatible(tmplock->ml.type, lock->ml.type))
108 return 0; 108 return 0;
109 if (!dlm_lock_compatible(tmplock->ml.convert_type,
110 lock->ml.type))
111 return 0;
109 } 112 }
110 113
111 return 1; 114 return 1;
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index f564b0e5f80..59f0f6bdfc6 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -2346,7 +2346,8 @@ static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data)
2346 */ 2346 */
2347static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm, 2347static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm,
2348 struct dlm_lock_resource *res, 2348 struct dlm_lock_resource *res,
2349 int *numlocks) 2349 int *numlocks,
2350 int *hasrefs)
2350{ 2351{
2351 int ret; 2352 int ret;
2352 int i; 2353 int i;
@@ -2356,6 +2357,9 @@ static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm,
2356 2357
2357 assert_spin_locked(&res->spinlock); 2358 assert_spin_locked(&res->spinlock);
2358 2359
2360 *numlocks = 0;
2361 *hasrefs = 0;
2362
2359 ret = -EINVAL; 2363 ret = -EINVAL;
2360 if (res->owner == DLM_LOCK_RES_OWNER_UNKNOWN) { 2364 if (res->owner == DLM_LOCK_RES_OWNER_UNKNOWN) {
2361 mlog(0, "cannot migrate lockres with unknown owner!\n"); 2365 mlog(0, "cannot migrate lockres with unknown owner!\n");
@@ -2386,7 +2390,13 @@ static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm,
2386 } 2390 }
2387 2391
2388 *numlocks = count; 2392 *numlocks = count;
2389 mlog(0, "migrateable lockres having %d locks\n", *numlocks); 2393
2394 count = find_next_bit(res->refmap, O2NM_MAX_NODES, 0);
2395 if (count < O2NM_MAX_NODES)
2396 *hasrefs = 1;
2397
2398 mlog(0, "%s: res %.*s, Migrateable, locks %d, refs %d\n", dlm->name,
2399 res->lockname.len, res->lockname.name, *numlocks, *hasrefs);
2390 2400
2391leave: 2401leave:
2392 return ret; 2402 return ret;
@@ -2408,7 +2418,7 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm,
2408 const char *name; 2418 const char *name;
2409 unsigned int namelen; 2419 unsigned int namelen;
2410 int mle_added = 0; 2420 int mle_added = 0;
2411 int numlocks; 2421 int numlocks, hasrefs;
2412 int wake = 0; 2422 int wake = 0;
2413 2423
2414 if (!dlm_grab(dlm)) 2424 if (!dlm_grab(dlm))
@@ -2417,13 +2427,13 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm,
2417 name = res->lockname.name; 2427 name = res->lockname.name;
2418 namelen = res->lockname.len; 2428 namelen = res->lockname.len;
2419 2429
2420 mlog(0, "migrating %.*s to %u\n", namelen, name, target); 2430 mlog(0, "%s: Migrating %.*s to %u\n", dlm->name, namelen, name, target);
2421 2431
2422 /* 2432 /*
2423 * ensure this lockres is a proper candidate for migration 2433 * ensure this lockres is a proper candidate for migration
2424 */ 2434 */
2425 spin_lock(&res->spinlock); 2435 spin_lock(&res->spinlock);
2426 ret = dlm_is_lockres_migrateable(dlm, res, &numlocks); 2436 ret = dlm_is_lockres_migrateable(dlm, res, &numlocks, &hasrefs);
2427 if (ret < 0) { 2437 if (ret < 0) {
2428 spin_unlock(&res->spinlock); 2438 spin_unlock(&res->spinlock);
2429 goto leave; 2439 goto leave;
@@ -2431,10 +2441,8 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm,
2431 spin_unlock(&res->spinlock); 2441 spin_unlock(&res->spinlock);
2432 2442
2433 /* no work to do */ 2443 /* no work to do */
2434 if (numlocks == 0) { 2444 if (numlocks == 0 && !hasrefs)
2435 mlog(0, "no locks were found on this lockres! done!\n");
2436 goto leave; 2445 goto leave;
2437 }
2438 2446
2439 /* 2447 /*
2440 * preallocate up front 2448 * preallocate up front
@@ -2459,14 +2467,14 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm,
2459 * find a node to migrate the lockres to 2467 * find a node to migrate the lockres to
2460 */ 2468 */
2461 2469
2462 mlog(0, "picking a migration node\n");
2463 spin_lock(&dlm->spinlock); 2470 spin_lock(&dlm->spinlock);
2464 /* pick a new node */ 2471 /* pick a new node */
2465 if (!test_bit(target, dlm->domain_map) || 2472 if (!test_bit(target, dlm->domain_map) ||
2466 target >= O2NM_MAX_NODES) { 2473 target >= O2NM_MAX_NODES) {
2467 target = dlm_pick_migration_target(dlm, res); 2474 target = dlm_pick_migration_target(dlm, res);
2468 } 2475 }
2469 mlog(0, "node %u chosen for migration\n", target); 2476 mlog(0, "%s: res %.*s, Node %u chosen for migration\n", dlm->name,
2477 namelen, name, target);
2470 2478
2471 if (target >= O2NM_MAX_NODES || 2479 if (target >= O2NM_MAX_NODES ||
2472 !test_bit(target, dlm->domain_map)) { 2480 !test_bit(target, dlm->domain_map)) {
@@ -2667,7 +2675,7 @@ int dlm_empty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
2667{ 2675{
2668 int ret; 2676 int ret;
2669 int lock_dropped = 0; 2677 int lock_dropped = 0;
2670 int numlocks; 2678 int numlocks, hasrefs;
2671 2679
2672 spin_lock(&res->spinlock); 2680 spin_lock(&res->spinlock);
2673 if (res->owner != dlm->node_num) { 2681 if (res->owner != dlm->node_num) {
@@ -2681,8 +2689,8 @@ int dlm_empty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
2681 } 2689 }
2682 2690
2683 /* No need to migrate a lockres having no locks */ 2691 /* No need to migrate a lockres having no locks */
2684 ret = dlm_is_lockres_migrateable(dlm, res, &numlocks); 2692 ret = dlm_is_lockres_migrateable(dlm, res, &numlocks, &hasrefs);
2685 if (ret >= 0 && numlocks == 0) { 2693 if (ret >= 0 && numlocks == 0 && !hasrefs) {
2686 spin_unlock(&res->spinlock); 2694 spin_unlock(&res->spinlock);
2687 goto leave; 2695 goto leave;
2688 } 2696 }
@@ -2915,6 +2923,12 @@ static u8 dlm_pick_migration_target(struct dlm_ctxt *dlm,
2915 } 2923 }
2916 queue++; 2924 queue++;
2917 } 2925 }
2926
2927 nodenum = find_next_bit(res->refmap, O2NM_MAX_NODES, 0);
2928 if (nodenum < O2NM_MAX_NODES) {
2929 spin_unlock(&res->spinlock);
2930 return nodenum;
2931 }
2918 spin_unlock(&res->spinlock); 2932 spin_unlock(&res->spinlock);
2919 mlog(0, "have not found a suitable target yet! checking domain map\n"); 2933 mlog(0, "have not found a suitable target yet! checking domain map\n");
2920 2934
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index 2211acf33d9..1d6d1d22c47 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -122,15 +122,13 @@ int __dlm_lockres_unused(struct dlm_lock_resource *res)
122void __dlm_lockres_calc_usage(struct dlm_ctxt *dlm, 122void __dlm_lockres_calc_usage(struct dlm_ctxt *dlm,
123 struct dlm_lock_resource *res) 123 struct dlm_lock_resource *res)
124{ 124{
125 mlog_entry("%.*s\n", res->lockname.len, res->lockname.name);
126
127 assert_spin_locked(&dlm->spinlock); 125 assert_spin_locked(&dlm->spinlock);
128 assert_spin_locked(&res->spinlock); 126 assert_spin_locked(&res->spinlock);
129 127
130 if (__dlm_lockres_unused(res)){ 128 if (__dlm_lockres_unused(res)){
131 if (list_empty(&res->purge)) { 129 if (list_empty(&res->purge)) {
132 mlog(0, "putting lockres %.*s:%p onto purge list\n", 130 mlog(0, "%s: Adding res %.*s to purge list\n",
133 res->lockname.len, res->lockname.name, res); 131 dlm->name, res->lockname.len, res->lockname.name);
134 132
135 res->last_used = jiffies; 133 res->last_used = jiffies;
136 dlm_lockres_get(res); 134 dlm_lockres_get(res);
@@ -138,8 +136,8 @@ void __dlm_lockres_calc_usage(struct dlm_ctxt *dlm,
138 dlm->purge_count++; 136 dlm->purge_count++;
139 } 137 }
140 } else if (!list_empty(&res->purge)) { 138 } else if (!list_empty(&res->purge)) {
141 mlog(0, "removing lockres %.*s:%p from purge list, owner=%u\n", 139 mlog(0, "%s: Removing res %.*s from purge list\n",
142 res->lockname.len, res->lockname.name, res, res->owner); 140 dlm->name, res->lockname.len, res->lockname.name);
143 141
144 list_del_init(&res->purge); 142 list_del_init(&res->purge);
145 dlm_lockres_put(res); 143 dlm_lockres_put(res);
@@ -150,7 +148,6 @@ void __dlm_lockres_calc_usage(struct dlm_ctxt *dlm,
150void dlm_lockres_calc_usage(struct dlm_ctxt *dlm, 148void dlm_lockres_calc_usage(struct dlm_ctxt *dlm,
151 struct dlm_lock_resource *res) 149 struct dlm_lock_resource *res)
152{ 150{
153 mlog_entry("%.*s\n", res->lockname.len, res->lockname.name);
154 spin_lock(&dlm->spinlock); 151 spin_lock(&dlm->spinlock);
155 spin_lock(&res->spinlock); 152 spin_lock(&res->spinlock);
156 153
@@ -171,9 +168,8 @@ static void dlm_purge_lockres(struct dlm_ctxt *dlm,
171 168
172 master = (res->owner == dlm->node_num); 169 master = (res->owner == dlm->node_num);
173 170
174 171 mlog(0, "%s: Purging res %.*s, master %d\n", dlm->name,
175 mlog(0, "purging lockres %.*s, master = %d\n", res->lockname.len, 172 res->lockname.len, res->lockname.name, master);
176 res->lockname.name, master);
177 173
178 if (!master) { 174 if (!master) {
179 res->state |= DLM_LOCK_RES_DROPPING_REF; 175 res->state |= DLM_LOCK_RES_DROPPING_REF;
@@ -189,27 +185,25 @@ static void dlm_purge_lockres(struct dlm_ctxt *dlm,
189 /* clear our bit from the master's refmap, ignore errors */ 185 /* clear our bit from the master's refmap, ignore errors */
190 ret = dlm_drop_lockres_ref(dlm, res); 186 ret = dlm_drop_lockres_ref(dlm, res);
191 if (ret < 0) { 187 if (ret < 0) {
192 mlog_errno(ret); 188 mlog(ML_ERROR, "%s: deref %.*s failed %d\n", dlm->name,
189 res->lockname.len, res->lockname.name, ret);
193 if (!dlm_is_host_down(ret)) 190 if (!dlm_is_host_down(ret))
194 BUG(); 191 BUG();
195 } 192 }
196 mlog(0, "%s:%.*s: dlm_deref_lockres returned %d\n",
197 dlm->name, res->lockname.len, res->lockname.name, ret);
198 spin_lock(&dlm->spinlock); 193 spin_lock(&dlm->spinlock);
199 spin_lock(&res->spinlock); 194 spin_lock(&res->spinlock);
200 } 195 }
201 196
202 if (!list_empty(&res->purge)) { 197 if (!list_empty(&res->purge)) {
203 mlog(0, "removing lockres %.*s:%p from purgelist, " 198 mlog(0, "%s: Removing res %.*s from purgelist, master %d\n",
204 "master = %d\n", res->lockname.len, res->lockname.name, 199 dlm->name, res->lockname.len, res->lockname.name, master);
205 res, master);
206 list_del_init(&res->purge); 200 list_del_init(&res->purge);
207 dlm_lockres_put(res); 201 dlm_lockres_put(res);
208 dlm->purge_count--; 202 dlm->purge_count--;
209 } 203 }
210 204
211 if (!__dlm_lockres_unused(res)) { 205 if (!__dlm_lockres_unused(res)) {
212 mlog(ML_ERROR, "found lockres %s:%.*s: in use after deref\n", 206 mlog(ML_ERROR, "%s: res %.*s in use after deref\n",
213 dlm->name, res->lockname.len, res->lockname.name); 207 dlm->name, res->lockname.len, res->lockname.name);
214 __dlm_print_one_lock_resource(res); 208 __dlm_print_one_lock_resource(res);
215 BUG(); 209 BUG();
@@ -266,10 +260,10 @@ static void dlm_run_purge_list(struct dlm_ctxt *dlm,
266 unused = __dlm_lockres_unused(lockres); 260 unused = __dlm_lockres_unused(lockres);
267 if (!unused || 261 if (!unused ||
268 (lockres->state & DLM_LOCK_RES_MIGRATING)) { 262 (lockres->state & DLM_LOCK_RES_MIGRATING)) {
269 mlog(0, "lockres %s:%.*s: is in use or " 263 mlog(0, "%s: res %.*s is in use or being remastered, "
270 "being remastered, used %d, state %d\n", 264 "used %d, state %d\n", dlm->name,
271 dlm->name, lockres->lockname.len, 265 lockres->lockname.len, lockres->lockname.name,
272 lockres->lockname.name, !unused, lockres->state); 266 !unused, lockres->state);
273 list_move_tail(&dlm->purge_list, &lockres->purge); 267 list_move_tail(&dlm->purge_list, &lockres->purge);
274 spin_unlock(&lockres->spinlock); 268 spin_unlock(&lockres->spinlock);
275 continue; 269 continue;
@@ -296,15 +290,12 @@ static void dlm_shuffle_lists(struct dlm_ctxt *dlm,
296 struct list_head *head; 290 struct list_head *head;
297 int can_grant = 1; 291 int can_grant = 1;
298 292
299 //mlog(0, "res->lockname.len=%d\n", res->lockname.len); 293 /*
300 //mlog(0, "res->lockname.name=%p\n", res->lockname.name); 294 * Because this function is called with the lockres
301 //mlog(0, "shuffle res %.*s\n", res->lockname.len,
302 // res->lockname.name);
303
304 /* because this function is called with the lockres
305 * spinlock, and because we know that it is not migrating/ 295 * spinlock, and because we know that it is not migrating/
306 * recovering/in-progress, it is fine to reserve asts and 296 * recovering/in-progress, it is fine to reserve asts and
307 * basts right before queueing them all throughout */ 297 * basts right before queueing them all throughout
298 */
308 assert_spin_locked(&dlm->ast_lock); 299 assert_spin_locked(&dlm->ast_lock);
309 assert_spin_locked(&res->spinlock); 300 assert_spin_locked(&res->spinlock);
310 BUG_ON((res->state & (DLM_LOCK_RES_MIGRATING| 301 BUG_ON((res->state & (DLM_LOCK_RES_MIGRATING|
@@ -314,13 +305,13 @@ static void dlm_shuffle_lists(struct dlm_ctxt *dlm,
314converting: 305converting:
315 if (list_empty(&res->converting)) 306 if (list_empty(&res->converting))
316 goto blocked; 307 goto blocked;
317 mlog(0, "res %.*s has locks on a convert queue\n", res->lockname.len, 308 mlog(0, "%s: res %.*s has locks on the convert queue\n", dlm->name,
318 res->lockname.name); 309 res->lockname.len, res->lockname.name);
319 310
320 target = list_entry(res->converting.next, struct dlm_lock, list); 311 target = list_entry(res->converting.next, struct dlm_lock, list);
321 if (target->ml.convert_type == LKM_IVMODE) { 312 if (target->ml.convert_type == LKM_IVMODE) {
322 mlog(ML_ERROR, "%.*s: converting a lock with no " 313 mlog(ML_ERROR, "%s: res %.*s converting lock to invalid mode\n",
323 "convert_type!\n", res->lockname.len, res->lockname.name); 314 dlm->name, res->lockname.len, res->lockname.name);
324 BUG(); 315 BUG();
325 } 316 }
326 head = &res->granted; 317 head = &res->granted;
@@ -365,9 +356,12 @@ converting:
365 spin_lock(&target->spinlock); 356 spin_lock(&target->spinlock);
366 BUG_ON(target->ml.highest_blocked != LKM_IVMODE); 357 BUG_ON(target->ml.highest_blocked != LKM_IVMODE);
367 358
368 mlog(0, "calling ast for converting lock: %.*s, have: %d, " 359 mlog(0, "%s: res %.*s, AST for Converting lock %u:%llu, type "
369 "granting: %d, node: %u\n", res->lockname.len, 360 "%d => %d, node %u\n", dlm->name, res->lockname.len,
370 res->lockname.name, target->ml.type, 361 res->lockname.name,
362 dlm_get_lock_cookie_node(be64_to_cpu(target->ml.cookie)),
363 dlm_get_lock_cookie_seq(be64_to_cpu(target->ml.cookie)),
364 target->ml.type,
371 target->ml.convert_type, target->ml.node); 365 target->ml.convert_type, target->ml.node);
372 366
373 target->ml.type = target->ml.convert_type; 367 target->ml.type = target->ml.convert_type;
@@ -428,11 +422,14 @@ blocked:
428 spin_lock(&target->spinlock); 422 spin_lock(&target->spinlock);
429 BUG_ON(target->ml.highest_blocked != LKM_IVMODE); 423 BUG_ON(target->ml.highest_blocked != LKM_IVMODE);
430 424
431 mlog(0, "calling ast for blocked lock: %.*s, granting: %d, " 425 mlog(0, "%s: res %.*s, AST for Blocked lock %u:%llu, type %d, "
432 "node: %u\n", res->lockname.len, res->lockname.name, 426 "node %u\n", dlm->name, res->lockname.len,
427 res->lockname.name,
428 dlm_get_lock_cookie_node(be64_to_cpu(target->ml.cookie)),
429 dlm_get_lock_cookie_seq(be64_to_cpu(target->ml.cookie)),
433 target->ml.type, target->ml.node); 430 target->ml.type, target->ml.node);
434 431
435 // target->ml.type is already correct 432 /* target->ml.type is already correct */
436 list_move_tail(&target->list, &res->granted); 433 list_move_tail(&target->list, &res->granted);
437 434
438 BUG_ON(!target->lksb); 435 BUG_ON(!target->lksb);
@@ -453,7 +450,6 @@ leave:
453/* must have NO locks when calling this with res !=NULL * */ 450/* must have NO locks when calling this with res !=NULL * */
454void dlm_kick_thread(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) 451void dlm_kick_thread(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
455{ 452{
456 mlog_entry("dlm=%p, res=%p\n", dlm, res);
457 if (res) { 453 if (res) {
458 spin_lock(&dlm->spinlock); 454 spin_lock(&dlm->spinlock);
459 spin_lock(&res->spinlock); 455 spin_lock(&res->spinlock);
@@ -466,8 +462,6 @@ void dlm_kick_thread(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
466 462
467void __dlm_dirty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) 463void __dlm_dirty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
468{ 464{
469 mlog_entry("dlm=%p, res=%p\n", dlm, res);
470
471 assert_spin_locked(&dlm->spinlock); 465 assert_spin_locked(&dlm->spinlock);
472 assert_spin_locked(&res->spinlock); 466 assert_spin_locked(&res->spinlock);
473 467
@@ -484,13 +478,16 @@ void __dlm_dirty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
484 res->state |= DLM_LOCK_RES_DIRTY; 478 res->state |= DLM_LOCK_RES_DIRTY;
485 } 479 }
486 } 480 }
481
482 mlog(0, "%s: res %.*s\n", dlm->name, res->lockname.len,
483 res->lockname.name);
487} 484}
488 485
489 486
490/* Launch the NM thread for the mounted volume */ 487/* Launch the NM thread for the mounted volume */
491int dlm_launch_thread(struct dlm_ctxt *dlm) 488int dlm_launch_thread(struct dlm_ctxt *dlm)
492{ 489{
493 mlog(0, "starting dlm thread...\n"); 490 mlog(0, "Starting dlm_thread...\n");
494 491
495 dlm->dlm_thread_task = kthread_run(dlm_thread, dlm, "dlm_thread"); 492 dlm->dlm_thread_task = kthread_run(dlm_thread, dlm, "dlm_thread");
496 if (IS_ERR(dlm->dlm_thread_task)) { 493 if (IS_ERR(dlm->dlm_thread_task)) {
@@ -505,7 +502,7 @@ int dlm_launch_thread(struct dlm_ctxt *dlm)
505void dlm_complete_thread(struct dlm_ctxt *dlm) 502void dlm_complete_thread(struct dlm_ctxt *dlm)
506{ 503{
507 if (dlm->dlm_thread_task) { 504 if (dlm->dlm_thread_task) {
508 mlog(ML_KTHREAD, "waiting for dlm thread to exit\n"); 505 mlog(ML_KTHREAD, "Waiting for dlm thread to exit\n");
509 kthread_stop(dlm->dlm_thread_task); 506 kthread_stop(dlm->dlm_thread_task);
510 dlm->dlm_thread_task = NULL; 507 dlm->dlm_thread_task = NULL;
511 } 508 }
@@ -536,7 +533,12 @@ static void dlm_flush_asts(struct dlm_ctxt *dlm)
536 /* get an extra ref on lock */ 533 /* get an extra ref on lock */
537 dlm_lock_get(lock); 534 dlm_lock_get(lock);
538 res = lock->lockres; 535 res = lock->lockres;
539 mlog(0, "delivering an ast for this lockres\n"); 536 mlog(0, "%s: res %.*s, Flush AST for lock %u:%llu, type %d, "
537 "node %u\n", dlm->name, res->lockname.len,
538 res->lockname.name,
539 dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
540 dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
541 lock->ml.type, lock->ml.node);
540 542
541 BUG_ON(!lock->ast_pending); 543 BUG_ON(!lock->ast_pending);
542 544
@@ -557,9 +559,9 @@ static void dlm_flush_asts(struct dlm_ctxt *dlm)
557 /* possible that another ast was queued while 559 /* possible that another ast was queued while
558 * we were delivering the last one */ 560 * we were delivering the last one */
559 if (!list_empty(&lock->ast_list)) { 561 if (!list_empty(&lock->ast_list)) {
560 mlog(0, "aha another ast got queued while " 562 mlog(0, "%s: res %.*s, AST queued while flushing last "
561 "we were finishing the last one. will " 563 "one\n", dlm->name, res->lockname.len,
562 "keep the ast_pending flag set.\n"); 564 res->lockname.name);
563 } else 565 } else
564 lock->ast_pending = 0; 566 lock->ast_pending = 0;
565 567
@@ -590,8 +592,12 @@ static void dlm_flush_asts(struct dlm_ctxt *dlm)
590 dlm_lock_put(lock); 592 dlm_lock_put(lock);
591 spin_unlock(&dlm->ast_lock); 593 spin_unlock(&dlm->ast_lock);
592 594
593 mlog(0, "delivering a bast for this lockres " 595 mlog(0, "%s: res %.*s, Flush BAST for lock %u:%llu, "
594 "(blocked = %d\n", hi); 596 "blocked %d, node %u\n",
597 dlm->name, res->lockname.len, res->lockname.name,
598 dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
599 dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
600 hi, lock->ml.node);
595 601
596 if (lock->ml.node != dlm->node_num) { 602 if (lock->ml.node != dlm->node_num) {
597 ret = dlm_send_proxy_bast(dlm, res, lock, hi); 603 ret = dlm_send_proxy_bast(dlm, res, lock, hi);
@@ -605,9 +611,9 @@ static void dlm_flush_asts(struct dlm_ctxt *dlm)
605 /* possible that another bast was queued while 611 /* possible that another bast was queued while
606 * we were delivering the last one */ 612 * we were delivering the last one */
607 if (!list_empty(&lock->bast_list)) { 613 if (!list_empty(&lock->bast_list)) {
608 mlog(0, "aha another bast got queued while " 614 mlog(0, "%s: res %.*s, BAST queued while flushing last "
609 "we were finishing the last one. will " 615 "one\n", dlm->name, res->lockname.len,
610 "keep the bast_pending flag set.\n"); 616 res->lockname.name);
611 } else 617 } else
612 lock->bast_pending = 0; 618 lock->bast_pending = 0;
613 619
@@ -675,11 +681,12 @@ static int dlm_thread(void *data)
675 spin_lock(&res->spinlock); 681 spin_lock(&res->spinlock);
676 if (res->owner != dlm->node_num) { 682 if (res->owner != dlm->node_num) {
677 __dlm_print_one_lock_resource(res); 683 __dlm_print_one_lock_resource(res);
678 mlog(ML_ERROR, "inprog:%s, mig:%s, reco:%s, dirty:%s\n", 684 mlog(ML_ERROR, "%s: inprog %d, mig %d, reco %d,"
679 res->state & DLM_LOCK_RES_IN_PROGRESS ? "yes" : "no", 685 " dirty %d\n", dlm->name,
680 res->state & DLM_LOCK_RES_MIGRATING ? "yes" : "no", 686 !!(res->state & DLM_LOCK_RES_IN_PROGRESS),
681 res->state & DLM_LOCK_RES_RECOVERING ? "yes" : "no", 687 !!(res->state & DLM_LOCK_RES_MIGRATING),
682 res->state & DLM_LOCK_RES_DIRTY ? "yes" : "no"); 688 !!(res->state & DLM_LOCK_RES_RECOVERING),
689 !!(res->state & DLM_LOCK_RES_DIRTY));
683 } 690 }
684 BUG_ON(res->owner != dlm->node_num); 691 BUG_ON(res->owner != dlm->node_num);
685 692
@@ -693,8 +700,8 @@ static int dlm_thread(void *data)
693 res->state &= ~DLM_LOCK_RES_DIRTY; 700 res->state &= ~DLM_LOCK_RES_DIRTY;
694 spin_unlock(&res->spinlock); 701 spin_unlock(&res->spinlock);
695 spin_unlock(&dlm->ast_lock); 702 spin_unlock(&dlm->ast_lock);
696 mlog(0, "delaying list shuffling for in-" 703 mlog(0, "%s: res %.*s, inprogress, delay list "
697 "progress lockres %.*s, state=%d\n", 704 "shuffle, state %d\n", dlm->name,
698 res->lockname.len, res->lockname.name, 705 res->lockname.len, res->lockname.name,
699 res->state); 706 res->state);
700 delay = 1; 707 delay = 1;
@@ -706,10 +713,6 @@ static int dlm_thread(void *data)
706 * spinlock and do NOT have the dlm lock. 713 * spinlock and do NOT have the dlm lock.
707 * safe to reserve/queue asts and run the lists. */ 714 * safe to reserve/queue asts and run the lists. */
708 715
709 mlog(0, "calling dlm_shuffle_lists with dlm=%s, "
710 "res=%.*s\n", dlm->name,
711 res->lockname.len, res->lockname.name);
712
713 /* called while holding lockres lock */ 716 /* called while holding lockres lock */
714 dlm_shuffle_lists(dlm, res); 717 dlm_shuffle_lists(dlm, res);
715 res->state &= ~DLM_LOCK_RES_DIRTY; 718 res->state &= ~DLM_LOCK_RES_DIRTY;
@@ -733,7 +736,8 @@ in_progress:
733 /* unlikely, but we may need to give time to 736 /* unlikely, but we may need to give time to
734 * other tasks */ 737 * other tasks */
735 if (!--n) { 738 if (!--n) {
736 mlog(0, "throttling dlm_thread\n"); 739 mlog(0, "%s: Throttling dlm thread\n",
740 dlm->name);
737 break; 741 break;
738 } 742 }
739 } 743 }
diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c
index b2df490a19e..8c5c0eddc36 100644
--- a/fs/ocfs2/dlmfs/dlmfs.c
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@ -351,11 +351,18 @@ static struct inode *dlmfs_alloc_inode(struct super_block *sb)
351 return &ip->ip_vfs_inode; 351 return &ip->ip_vfs_inode;
352} 352}
353 353
354static void dlmfs_destroy_inode(struct inode *inode) 354static void dlmfs_i_callback(struct rcu_head *head)
355{ 355{
356 struct inode *inode = container_of(head, struct inode, i_rcu);
357 INIT_LIST_HEAD(&inode->i_dentry);
356 kmem_cache_free(dlmfs_inode_cache, DLMFS_I(inode)); 358 kmem_cache_free(dlmfs_inode_cache, DLMFS_I(inode));
357} 359}
358 360
361static void dlmfs_destroy_inode(struct inode *inode)
362{
363 call_rcu(&inode->i_rcu, dlmfs_i_callback);
364}
365
359static void dlmfs_evict_inode(struct inode *inode) 366static void dlmfs_evict_inode(struct inode *inode)
360{ 367{
361 int status; 368 int status;
diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c
index 19ad145d2af..5dbc3062b4f 100644
--- a/fs/ocfs2/export.c
+++ b/fs/ocfs2/export.c
@@ -137,9 +137,7 @@ check_gen:
137 } 137 }
138 138
139 result = d_obtain_alias(inode); 139 result = d_obtain_alias(inode);
140 if (!IS_ERR(result)) 140 if (IS_ERR(result))
141 result->d_op = &ocfs2_dentry_ops;
142 else
143 mlog_errno(PTR_ERR(result)); 141 mlog_errno(PTR_ERR(result));
144 142
145bail: 143bail:
@@ -175,8 +173,6 @@ static struct dentry *ocfs2_get_parent(struct dentry *child)
175 } 173 }
176 174
177 parent = d_obtain_alias(ocfs2_iget(OCFS2_SB(dir->i_sb), blkno, 0, 0)); 175 parent = d_obtain_alias(ocfs2_iget(OCFS2_SB(dir->i_sb), blkno, 0, 0));
178 if (!IS_ERR(parent))
179 parent->d_op = &ocfs2_dentry_ops;
180 176
181bail_unlock: 177bail_unlock:
182 ocfs2_inode_unlock(dir, 0); 178 ocfs2_inode_unlock(dir, 0);
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 77b4c04a280..a6651956482 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1307,10 +1307,13 @@ bail:
1307 return err; 1307 return err;
1308} 1308}
1309 1309
1310int ocfs2_permission(struct inode *inode, int mask) 1310int ocfs2_permission(struct inode *inode, int mask, unsigned int flags)
1311{ 1311{
1312 int ret; 1312 int ret;
1313 1313
1314 if (flags & IPERM_FLAG_RCU)
1315 return -ECHILD;
1316
1314 mlog_entry_void(); 1317 mlog_entry_void();
1315 1318
1316 ret = ocfs2_inode_lock(inode, NULL, 0); 1319 ret = ocfs2_inode_lock(inode, NULL, 0);
@@ -1320,7 +1323,7 @@ int ocfs2_permission(struct inode *inode, int mask)
1320 goto out; 1323 goto out;
1321 } 1324 }
1322 1325
1323 ret = generic_permission(inode, mask, ocfs2_check_acl); 1326 ret = generic_permission(inode, mask, flags, ocfs2_check_acl);
1324 1327
1325 ocfs2_inode_unlock(inode, 0); 1328 ocfs2_inode_unlock(inode, 0);
1326out: 1329out:
@@ -1986,28 +1989,32 @@ int ocfs2_change_file_space(struct file *file, unsigned int cmd,
1986 return __ocfs2_change_file_space(file, inode, file->f_pos, cmd, sr, 0); 1989 return __ocfs2_change_file_space(file, inode, file->f_pos, cmd, sr, 0);
1987} 1990}
1988 1991
1989static long ocfs2_fallocate(struct inode *inode, int mode, loff_t offset, 1992static long ocfs2_fallocate(struct file *file, int mode, loff_t offset,
1990 loff_t len) 1993 loff_t len)
1991{ 1994{
1995 struct inode *inode = file->f_path.dentry->d_inode;
1992 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1996 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1993 struct ocfs2_space_resv sr; 1997 struct ocfs2_space_resv sr;
1994 int change_size = 1; 1998 int change_size = 1;
1999 int cmd = OCFS2_IOC_RESVSP64;
1995 2000
2001 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
2002 return -EOPNOTSUPP;
1996 if (!ocfs2_writes_unwritten_extents(osb)) 2003 if (!ocfs2_writes_unwritten_extents(osb))
1997 return -EOPNOTSUPP; 2004 return -EOPNOTSUPP;
1998 2005
1999 if (S_ISDIR(inode->i_mode))
2000 return -ENODEV;
2001
2002 if (mode & FALLOC_FL_KEEP_SIZE) 2006 if (mode & FALLOC_FL_KEEP_SIZE)
2003 change_size = 0; 2007 change_size = 0;
2004 2008
2009 if (mode & FALLOC_FL_PUNCH_HOLE)
2010 cmd = OCFS2_IOC_UNRESVSP64;
2011
2005 sr.l_whence = 0; 2012 sr.l_whence = 0;
2006 sr.l_start = (s64)offset; 2013 sr.l_start = (s64)offset;
2007 sr.l_len = (s64)len; 2014 sr.l_len = (s64)len;
2008 2015
2009 return __ocfs2_change_file_space(NULL, inode, offset, 2016 return __ocfs2_change_file_space(NULL, inode, offset, cmd, &sr,
2010 OCFS2_IOC_RESVSP64, &sr, change_size); 2017 change_size);
2011} 2018}
2012 2019
2013int ocfs2_check_range_for_refcount(struct inode *inode, loff_t pos, 2020int ocfs2_check_range_for_refcount(struct inode *inode, loff_t pos,
@@ -2241,11 +2248,15 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
2241 2248
2242 mutex_lock(&inode->i_mutex); 2249 mutex_lock(&inode->i_mutex);
2243 2250
2251 ocfs2_iocb_clear_sem_locked(iocb);
2252
2244relock: 2253relock:
2245 /* to match setattr's i_mutex -> i_alloc_sem -> rw_lock ordering */ 2254 /* to match setattr's i_mutex -> i_alloc_sem -> rw_lock ordering */
2246 if (direct_io) { 2255 if (direct_io) {
2247 down_read(&inode->i_alloc_sem); 2256 down_read(&inode->i_alloc_sem);
2248 have_alloc_sem = 1; 2257 have_alloc_sem = 1;
2258 /* communicate with ocfs2_dio_end_io */
2259 ocfs2_iocb_set_sem_locked(iocb);
2249 } 2260 }
2250 2261
2251 /* 2262 /*
@@ -2382,8 +2393,10 @@ out:
2382 ocfs2_rw_unlock(inode, rw_level); 2393 ocfs2_rw_unlock(inode, rw_level);
2383 2394
2384out_sems: 2395out_sems:
2385 if (have_alloc_sem) 2396 if (have_alloc_sem) {
2386 up_read(&inode->i_alloc_sem); 2397 up_read(&inode->i_alloc_sem);
2398 ocfs2_iocb_clear_sem_locked(iocb);
2399 }
2387 2400
2388 mutex_unlock(&inode->i_mutex); 2401 mutex_unlock(&inode->i_mutex);
2389 2402
@@ -2527,6 +2540,8 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
2527 goto bail; 2540 goto bail;
2528 } 2541 }
2529 2542
2543 ocfs2_iocb_clear_sem_locked(iocb);
2544
2530 /* 2545 /*
2531 * buffered reads protect themselves in ->readpage(). O_DIRECT reads 2546 * buffered reads protect themselves in ->readpage(). O_DIRECT reads
2532 * need locks to protect pending reads from racing with truncate. 2547 * need locks to protect pending reads from racing with truncate.
@@ -2534,6 +2549,7 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
2534 if (filp->f_flags & O_DIRECT) { 2549 if (filp->f_flags & O_DIRECT) {
2535 down_read(&inode->i_alloc_sem); 2550 down_read(&inode->i_alloc_sem);
2536 have_alloc_sem = 1; 2551 have_alloc_sem = 1;
2552 ocfs2_iocb_set_sem_locked(iocb);
2537 2553
2538 ret = ocfs2_rw_lock(inode, 0); 2554 ret = ocfs2_rw_lock(inode, 0);
2539 if (ret < 0) { 2555 if (ret < 0) {
@@ -2575,8 +2591,10 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
2575 } 2591 }
2576 2592
2577bail: 2593bail:
2578 if (have_alloc_sem) 2594 if (have_alloc_sem) {
2579 up_read(&inode->i_alloc_sem); 2595 up_read(&inode->i_alloc_sem);
2596 ocfs2_iocb_clear_sem_locked(iocb);
2597 }
2580 if (rw_level != -1) 2598 if (rw_level != -1)
2581 ocfs2_rw_unlock(inode, rw_level); 2599 ocfs2_rw_unlock(inode, rw_level);
2582 mlog_exit(ret); 2600 mlog_exit(ret);
@@ -2592,7 +2610,6 @@ const struct inode_operations ocfs2_file_iops = {
2592 .getxattr = generic_getxattr, 2610 .getxattr = generic_getxattr,
2593 .listxattr = ocfs2_listxattr, 2611 .listxattr = ocfs2_listxattr,
2594 .removexattr = generic_removexattr, 2612 .removexattr = generic_removexattr,
2595 .fallocate = ocfs2_fallocate,
2596 .fiemap = ocfs2_fiemap, 2613 .fiemap = ocfs2_fiemap,
2597}; 2614};
2598 2615
@@ -2624,6 +2641,7 @@ const struct file_operations ocfs2_fops = {
2624 .flock = ocfs2_flock, 2641 .flock = ocfs2_flock,
2625 .splice_read = ocfs2_file_splice_read, 2642 .splice_read = ocfs2_file_splice_read,
2626 .splice_write = ocfs2_file_splice_write, 2643 .splice_write = ocfs2_file_splice_write,
2644 .fallocate = ocfs2_fallocate,
2627}; 2645};
2628 2646
2629const struct file_operations ocfs2_dops = { 2647const struct file_operations ocfs2_dops = {
diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h
index 97bf761c9e7..f5afbbef670 100644
--- a/fs/ocfs2/file.h
+++ b/fs/ocfs2/file.h
@@ -61,7 +61,7 @@ int ocfs2_zero_extend(struct inode *inode, struct buffer_head *di_bh,
61int ocfs2_setattr(struct dentry *dentry, struct iattr *attr); 61int ocfs2_setattr(struct dentry *dentry, struct iattr *attr);
62int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry, 62int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry,
63 struct kstat *stat); 63 struct kstat *stat);
64int ocfs2_permission(struct inode *inode, int mask); 64int ocfs2_permission(struct inode *inode, int mask, unsigned int flags);
65 65
66int ocfs2_should_update_atime(struct inode *inode, 66int ocfs2_should_update_atime(struct inode *inode,
67 struct vfsmount *vfsmnt); 67 struct vfsmount *vfsmnt);
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index f935fd6600d..4068c6c4c6f 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -434,7 +434,7 @@ static int ocfs2_read_locked_inode(struct inode *inode,
434 * #1 and #2 can be simply solved by never taking the lock 434 * #1 and #2 can be simply solved by never taking the lock
435 * here for system files (which are the only type we read 435 * here for system files (which are the only type we read
436 * during mount). It's a heavier approach, but our main 436 * during mount). It's a heavier approach, but our main
437 * concern is user-accesible files anyway. 437 * concern is user-accessible files anyway.
438 * 438 *
439 * #3 works itself out because we'll eventually take the 439 * #3 works itself out because we'll eventually take the
440 * cluster lock before trusting anything anyway. 440 * cluster lock before trusting anything anyway.
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index ff5744e1e36..849fb4a2e81 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -147,7 +147,6 @@ static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry,
147 spin_unlock(&oi->ip_lock); 147 spin_unlock(&oi->ip_lock);
148 148
149bail_add: 149bail_add:
150 dentry->d_op = &ocfs2_dentry_ops;
151 ret = d_splice_alias(inode, dentry); 150 ret = d_splice_alias(inode, dentry);
152 151
153 if (inode) { 152 if (inode) {
@@ -415,7 +414,6 @@ static int ocfs2_mknod(struct inode *dir,
415 mlog_errno(status); 414 mlog_errno(status);
416 goto leave; 415 goto leave;
417 } 416 }
418 dentry->d_op = &ocfs2_dentry_ops;
419 417
420 status = ocfs2_add_entry(handle, dentry, inode, 418 status = ocfs2_add_entry(handle, dentry, inode,
421 OCFS2_I(inode)->ip_blkno, parent_fe_bh, 419 OCFS2_I(inode)->ip_blkno, parent_fe_bh,
@@ -743,7 +741,6 @@ static int ocfs2_link(struct dentry *old_dentry,
743 } 741 }
744 742
745 ihold(inode); 743 ihold(inode);
746 dentry->d_op = &ocfs2_dentry_ops;
747 d_instantiate(dentry, inode); 744 d_instantiate(dentry, inode);
748 745
749out_commit: 746out_commit:
@@ -1017,8 +1014,11 @@ static int ocfs2_double_lock(struct ocfs2_super *osb,
1017 * An error return must mean that no cluster locks 1014 * An error return must mean that no cluster locks
1018 * were held on function exit. 1015 * were held on function exit.
1019 */ 1016 */
1020 if (oi1->ip_blkno != oi2->ip_blkno) 1017 if (oi1->ip_blkno != oi2->ip_blkno) {
1021 ocfs2_inode_unlock(inode2, 1); 1018 ocfs2_inode_unlock(inode2, 1);
1019 brelse(*bh2);
1020 *bh2 = NULL;
1021 }
1022 1022
1023 if (status != -ENOENT) 1023 if (status != -ENOENT)
1024 mlog_errno(status); 1024 mlog_errno(status);
@@ -1794,7 +1794,6 @@ static int ocfs2_symlink(struct inode *dir,
1794 mlog_errno(status); 1794 mlog_errno(status);
1795 goto bail; 1795 goto bail;
1796 } 1796 }
1797 dentry->d_op = &ocfs2_dentry_ops;
1798 1797
1799 status = ocfs2_add_entry(handle, dentry, inode, 1798 status = ocfs2_add_entry(handle, dentry, inode,
1800 le64_to_cpu(fe->i_blkno), parent_fe_bh, 1799 le64_to_cpu(fe->i_blkno), parent_fe_bh,
@@ -2459,7 +2458,6 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
2459 goto out_commit; 2458 goto out_commit;
2460 } 2459 }
2461 2460
2462 dentry->d_op = &ocfs2_dentry_ops;
2463 d_instantiate(dentry, inode); 2461 d_instantiate(dentry, inode);
2464 status = 0; 2462 status = 0;
2465out_commit: 2463out_commit:
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index d8408217e3b..51cd6898e7f 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -159,7 +159,9 @@ struct ocfs2_lock_res {
159 char l_name[OCFS2_LOCK_ID_MAX_LEN]; 159 char l_name[OCFS2_LOCK_ID_MAX_LEN];
160 unsigned int l_ro_holders; 160 unsigned int l_ro_holders;
161 unsigned int l_ex_holders; 161 unsigned int l_ex_holders;
162 unsigned char l_level; 162 signed char l_level;
163 signed char l_requested;
164 signed char l_blocking;
163 165
164 /* Data packed - type enum ocfs2_lock_type */ 166 /* Data packed - type enum ocfs2_lock_type */
165 unsigned char l_type; 167 unsigned char l_type;
@@ -169,8 +171,6 @@ struct ocfs2_lock_res {
169 unsigned char l_action; 171 unsigned char l_action;
170 /* Data packed - enum type ocfs2_unlock_action */ 172 /* Data packed - enum type ocfs2_unlock_action */
171 unsigned char l_unlock_action; 173 unsigned char l_unlock_action;
172 unsigned char l_requested;
173 unsigned char l_blocking;
174 unsigned int l_pending_gen; 174 unsigned int l_pending_gen;
175 175
176 spinlock_t l_lock; 176 spinlock_t l_lock;
@@ -420,6 +420,11 @@ struct ocfs2_super
420 struct inode *osb_tl_inode; 420 struct inode *osb_tl_inode;
421 struct buffer_head *osb_tl_bh; 421 struct buffer_head *osb_tl_bh;
422 struct delayed_work osb_truncate_log_wq; 422 struct delayed_work osb_truncate_log_wq;
423 /*
424 * How many clusters in our truncate log.
425 * It must be protected by osb_tl_inode->i_mutex.
426 */
427 unsigned int truncated_clusters;
423 428
424 struct ocfs2_node_map osb_recovering_orphan_dirs; 429 struct ocfs2_node_map osb_recovering_orphan_dirs;
425 unsigned int *osb_orphan_wipes; 430 unsigned int *osb_orphan_wipes;
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index c2e4f8222e2..bf2e7764920 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -350,7 +350,7 @@ enum {
350#define OCFS2_LAST_LOCAL_SYSTEM_INODE LOCAL_GROUP_QUOTA_SYSTEM_INODE 350#define OCFS2_LAST_LOCAL_SYSTEM_INODE LOCAL_GROUP_QUOTA_SYSTEM_INODE
351 NUM_SYSTEM_INODES 351 NUM_SYSTEM_INODES
352}; 352};
353#define NUM_GLOBAL_SYSTEM_INODES OCFS2_LAST_GLOBAL_SYSTEM_INODE 353#define NUM_GLOBAL_SYSTEM_INODES OCFS2_FIRST_LOCAL_SYSTEM_INODE
354#define NUM_LOCAL_SYSTEM_INODES \ 354#define NUM_LOCAL_SYSTEM_INODES \
355 (NUM_SYSTEM_INODES - OCFS2_FIRST_LOCAL_SYSTEM_INODE) 355 (NUM_SYSTEM_INODES - OCFS2_FIRST_LOCAL_SYSTEM_INODE)
356 356
diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c
index 252e7c82f92..a5ebe421195 100644
--- a/fs/ocfs2/stack_user.c
+++ b/fs/ocfs2/stack_user.c
@@ -190,7 +190,7 @@ static struct ocfs2_live_connection *ocfs2_connection_find(const char *name)
190 return c; 190 return c;
191 } 191 }
192 192
193 return c; 193 return NULL;
194} 194}
195 195
196/* 196/*
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index 5fed60de763..71998d4d61d 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -1916,7 +1916,7 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
1916 if (res->sr_bg_blkno) { 1916 if (res->sr_bg_blkno) {
1917 /* Attempt to short-circuit the usual search mechanism 1917 /* Attempt to short-circuit the usual search mechanism
1918 * by jumping straight to the most recently used 1918 * by jumping straight to the most recently used
1919 * allocation group. This helps us mantain some 1919 * allocation group. This helps us maintain some
1920 * contiguousness across allocations. */ 1920 * contiguousness across allocations. */
1921 status = ocfs2_search_one_group(ac, handle, bits_wanted, 1921 status = ocfs2_search_one_group(ac, handle, bits_wanted,
1922 min_bits, res, &bits_left); 1922 min_bits, res, &bits_left);
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index f02c0ef3157..38f986d2447 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -41,7 +41,6 @@
41#include <linux/mount.h> 41#include <linux/mount.h>
42#include <linux/seq_file.h> 42#include <linux/seq_file.h>
43#include <linux/quotaops.h> 43#include <linux/quotaops.h>
44#include <linux/smp_lock.h>
45 44
46#define MLOG_MASK_PREFIX ML_SUPER 45#define MLOG_MASK_PREFIX ML_SUPER
47#include <cluster/masklog.h> 46#include <cluster/masklog.h>
@@ -570,11 +569,18 @@ static struct inode *ocfs2_alloc_inode(struct super_block *sb)
570 return &oi->vfs_inode; 569 return &oi->vfs_inode;
571} 570}
572 571
573static void ocfs2_destroy_inode(struct inode *inode) 572static void ocfs2_i_callback(struct rcu_head *head)
574{ 573{
574 struct inode *inode = container_of(head, struct inode, i_rcu);
575 INIT_LIST_HEAD(&inode->i_dentry);
575 kmem_cache_free(ocfs2_inode_cachep, OCFS2_I(inode)); 576 kmem_cache_free(ocfs2_inode_cachep, OCFS2_I(inode));
576} 577}
577 578
579static void ocfs2_destroy_inode(struct inode *inode)
580{
581 call_rcu(&inode->i_rcu, ocfs2_i_callback);
582}
583
578static unsigned long long ocfs2_max_file_offset(unsigned int bbits, 584static unsigned long long ocfs2_max_file_offset(unsigned int bbits,
579 unsigned int cbits) 585 unsigned int cbits)
580{ 586{
@@ -987,8 +993,7 @@ static void ocfs2_disable_quotas(struct ocfs2_super *osb)
987} 993}
988 994
989/* Handle quota on quotactl */ 995/* Handle quota on quotactl */
990static int ocfs2_quota_on(struct super_block *sb, int type, int format_id, 996static int ocfs2_quota_on(struct super_block *sb, int type, int format_id)
991 char *path)
992{ 997{
993 unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA, 998 unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
994 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA}; 999 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA};
@@ -1007,7 +1012,7 @@ static int ocfs2_quota_off(struct super_block *sb, int type)
1007} 1012}
1008 1013
1009static const struct quotactl_ops ocfs2_quotactl_ops = { 1014static const struct quotactl_ops ocfs2_quotactl_ops = {
1010 .quota_on = ocfs2_quota_on, 1015 .quota_on_meta = ocfs2_quota_on,
1011 .quota_off = ocfs2_quota_off, 1016 .quota_off = ocfs2_quota_off,
1012 .quota_sync = dquot_quota_sync, 1017 .quota_sync = dquot_quota_sync,
1013 .get_info = dquot_get_dqinfo, 1018 .get_info = dquot_get_dqinfo,
@@ -2091,6 +2096,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
2091 2096
2092 sb->s_fs_info = osb; 2097 sb->s_fs_info = osb;
2093 sb->s_op = &ocfs2_sops; 2098 sb->s_op = &ocfs2_sops;
2099 sb->s_d_op = &ocfs2_dentry_ops;
2094 sb->s_export_op = &ocfs2_export_ops; 2100 sb->s_export_op = &ocfs2_export_ops;
2095 sb->s_qcop = &ocfs2_quotactl_ops; 2101 sb->s_qcop = &ocfs2_quotactl_ops;
2096 sb->dq_op = &ocfs2_quota_operations; 2102 sb->dq_op = &ocfs2_quota_operations;
diff --git a/fs/open.c b/fs/open.c
index 4197b9ed023..e52389e1f05 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -223,7 +223,12 @@ int do_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
223 return -EINVAL; 223 return -EINVAL;
224 224
225 /* Return error if mode is not supported */ 225 /* Return error if mode is not supported */
226 if (mode && !(mode & FALLOC_FL_KEEP_SIZE)) 226 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
227 return -EOPNOTSUPP;
228
229 /* Punch hole must have keep size set */
230 if ((mode & FALLOC_FL_PUNCH_HOLE) &&
231 !(mode & FALLOC_FL_KEEP_SIZE))
227 return -EOPNOTSUPP; 232 return -EOPNOTSUPP;
228 233
229 if (!(file->f_mode & FMODE_WRITE)) 234 if (!(file->f_mode & FMODE_WRITE))
@@ -250,10 +255,10 @@ int do_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
250 if (((offset + len) > inode->i_sb->s_maxbytes) || ((offset + len) < 0)) 255 if (((offset + len) > inode->i_sb->s_maxbytes) || ((offset + len) < 0))
251 return -EFBIG; 256 return -EFBIG;
252 257
253 if (!inode->i_op->fallocate) 258 if (!file->f_op->fallocate)
254 return -EOPNOTSUPP; 259 return -EOPNOTSUPP;
255 260
256 return inode->i_op->fallocate(inode, mode, offset, len); 261 return file->f_op->fallocate(file, mode, offset, len);
257} 262}
258 263
259SYSCALL_DEFINE(fallocate)(int fd, int mode, loff_t offset, loff_t len) 264SYSCALL_DEFINE(fallocate)(int fd, int mode, loff_t offset, loff_t len)
diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c
index ddb1f41376e..a2a5bff774e 100644
--- a/fs/openpromfs/inode.c
+++ b/fs/openpromfs/inode.c
@@ -343,11 +343,18 @@ static struct inode *openprom_alloc_inode(struct super_block *sb)
343 return &oi->vfs_inode; 343 return &oi->vfs_inode;
344} 344}
345 345
346static void openprom_destroy_inode(struct inode *inode) 346static void openprom_i_callback(struct rcu_head *head)
347{ 347{
348 struct inode *inode = container_of(head, struct inode, i_rcu);
349 INIT_LIST_HEAD(&inode->i_dentry);
348 kmem_cache_free(op_inode_cachep, OP_I(inode)); 350 kmem_cache_free(op_inode_cachep, OP_I(inode));
349} 351}
350 352
353static void openprom_destroy_inode(struct inode *inode)
354{
355 call_rcu(&inode->i_rcu, openprom_i_callback);
356}
357
351static struct inode *openprom_iget(struct super_block *sb, ino_t ino) 358static struct inode *openprom_iget(struct super_block *sb, ino_t ino)
352{ 359{
353 struct inode *inode; 360 struct inode *inode;
@@ -418,7 +425,7 @@ out_no_root:
418static struct dentry *openprom_mount(struct file_system_type *fs_type, 425static struct dentry *openprom_mount(struct file_system_type *fs_type,
419 int flags, const char *dev_name, void *data) 426 int flags, const char *dev_name, void *data)
420{ 427{
421 return mount_single(fs_type, flags, data, openprom_fill_super); 428 return mount_single(fs_type, flags, data, openprom_fill_super);
422} 429}
423 430
424static struct file_system_type openprom_fs_type = { 431static struct file_system_type openprom_fs_type = {
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 0a8b0ad0c7e..9c21119512b 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -237,6 +237,13 @@ ssize_t part_size_show(struct device *dev,
237 return sprintf(buf, "%llu\n",(unsigned long long)p->nr_sects); 237 return sprintf(buf, "%llu\n",(unsigned long long)p->nr_sects);
238} 238}
239 239
240ssize_t part_ro_show(struct device *dev,
241 struct device_attribute *attr, char *buf)
242{
243 struct hd_struct *p = dev_to_part(dev);
244 return sprintf(buf, "%d\n", p->policy ? 1 : 0);
245}
246
240ssize_t part_alignment_offset_show(struct device *dev, 247ssize_t part_alignment_offset_show(struct device *dev,
241 struct device_attribute *attr, char *buf) 248 struct device_attribute *attr, char *buf)
242{ 249{
@@ -312,6 +319,7 @@ ssize_t part_fail_store(struct device *dev,
312static DEVICE_ATTR(partition, S_IRUGO, part_partition_show, NULL); 319static DEVICE_ATTR(partition, S_IRUGO, part_partition_show, NULL);
313static DEVICE_ATTR(start, S_IRUGO, part_start_show, NULL); 320static DEVICE_ATTR(start, S_IRUGO, part_start_show, NULL);
314static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL); 321static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL);
322static DEVICE_ATTR(ro, S_IRUGO, part_ro_show, NULL);
315static DEVICE_ATTR(alignment_offset, S_IRUGO, part_alignment_offset_show, NULL); 323static DEVICE_ATTR(alignment_offset, S_IRUGO, part_alignment_offset_show, NULL);
316static DEVICE_ATTR(discard_alignment, S_IRUGO, part_discard_alignment_show, 324static DEVICE_ATTR(discard_alignment, S_IRUGO, part_discard_alignment_show,
317 NULL); 325 NULL);
@@ -326,6 +334,7 @@ static struct attribute *part_attrs[] = {
326 &dev_attr_partition.attr, 334 &dev_attr_partition.attr,
327 &dev_attr_start.attr, 335 &dev_attr_start.attr,
328 &dev_attr_size.attr, 336 &dev_attr_size.attr,
337 &dev_attr_ro.attr,
329 &dev_attr_alignment_offset.attr, 338 &dev_attr_alignment_offset.attr,
330 &dev_attr_discard_alignment.attr, 339 &dev_attr_discard_alignment.attr,
331 &dev_attr_stat.attr, 340 &dev_attr_stat.attr,
@@ -372,6 +381,11 @@ static void delete_partition_rcu_cb(struct rcu_head *head)
372 put_device(part_to_dev(part)); 381 put_device(part_to_dev(part));
373} 382}
374 383
384void __delete_partition(struct hd_struct *part)
385{
386 call_rcu(&part->rcu_head, delete_partition_rcu_cb);
387}
388
375void delete_partition(struct gendisk *disk, int partno) 389void delete_partition(struct gendisk *disk, int partno)
376{ 390{
377 struct disk_part_tbl *ptbl = disk->part_tbl; 391 struct disk_part_tbl *ptbl = disk->part_tbl;
@@ -390,7 +404,7 @@ void delete_partition(struct gendisk *disk, int partno)
390 kobject_put(part->holder_dir); 404 kobject_put(part->holder_dir);
391 device_del(part_to_dev(part)); 405 device_del(part_to_dev(part));
392 406
393 call_rcu(&part->rcu_head, delete_partition_rcu_cb); 407 hd_struct_put(part);
394} 408}
395 409
396static ssize_t whole_disk_show(struct device *dev, 410static ssize_t whole_disk_show(struct device *dev,
@@ -489,6 +503,7 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno,
489 if (!dev_get_uevent_suppress(ddev)) 503 if (!dev_get_uevent_suppress(ddev))
490 kobject_uevent(&pdev->kobj, KOBJ_ADD); 504 kobject_uevent(&pdev->kobj, KOBJ_ADD);
491 505
506 hd_ref_init(p);
492 return p; 507 return p;
493 508
494out_free_info: 509out_free_info:
@@ -507,65 +522,6 @@ out_put:
507 return ERR_PTR(err); 522 return ERR_PTR(err);
508} 523}
509 524
510/* Not exported, helper to add_disk(). */
511void register_disk(struct gendisk *disk)
512{
513 struct device *ddev = disk_to_dev(disk);
514 struct block_device *bdev;
515 struct disk_part_iter piter;
516 struct hd_struct *part;
517 int err;
518
519 ddev->parent = disk->driverfs_dev;
520
521 dev_set_name(ddev, disk->disk_name);
522
523 /* delay uevents, until we scanned partition table */
524 dev_set_uevent_suppress(ddev, 1);
525
526 if (device_add(ddev))
527 return;
528 if (!sysfs_deprecated) {
529 err = sysfs_create_link(block_depr, &ddev->kobj,
530 kobject_name(&ddev->kobj));
531 if (err) {
532 device_del(ddev);
533 return;
534 }
535 }
536 disk->part0.holder_dir = kobject_create_and_add("holders", &ddev->kobj);
537 disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj);
538
539 /* No minors to use for partitions */
540 if (!disk_partitionable(disk))
541 goto exit;
542
543 /* No such device (e.g., media were just removed) */
544 if (!get_capacity(disk))
545 goto exit;
546
547 bdev = bdget_disk(disk, 0);
548 if (!bdev)
549 goto exit;
550
551 bdev->bd_invalidated = 1;
552 err = blkdev_get(bdev, FMODE_READ);
553 if (err < 0)
554 goto exit;
555 blkdev_put(bdev, FMODE_READ);
556
557exit:
558 /* announce disk after possible partitions are created */
559 dev_set_uevent_suppress(ddev, 0);
560 kobject_uevent(&ddev->kobj, KOBJ_ADD);
561
562 /* announce possible partitions */
563 disk_part_iter_init(&piter, disk, 0);
564 while ((part = disk_part_iter_next(&piter)))
565 kobject_uevent(&part_to_dev(part)->kobj, KOBJ_ADD);
566 disk_part_iter_exit(&piter);
567}
568
569static bool disk_unlock_native_capacity(struct gendisk *disk) 525static bool disk_unlock_native_capacity(struct gendisk *disk)
570{ 526{
571 const struct block_device_operations *bdops = disk->fops; 527 const struct block_device_operations *bdops = disk->fops;
@@ -728,33 +684,3 @@ fail:
728} 684}
729 685
730EXPORT_SYMBOL(read_dev_sector); 686EXPORT_SYMBOL(read_dev_sector);
731
732void del_gendisk(struct gendisk *disk)
733{
734 struct disk_part_iter piter;
735 struct hd_struct *part;
736
737 /* invalidate stuff */
738 disk_part_iter_init(&piter, disk,
739 DISK_PITER_INCL_EMPTY | DISK_PITER_REVERSE);
740 while ((part = disk_part_iter_next(&piter))) {
741 invalidate_partition(disk, part->partno);
742 delete_partition(disk, part->partno);
743 }
744 disk_part_iter_exit(&piter);
745
746 invalidate_partition(disk, 0);
747 blk_free_devt(disk_to_dev(disk)->devt);
748 set_capacity(disk, 0);
749 disk->flags &= ~GENHD_FL_UP;
750 unlink_gendisk(disk);
751 part_stat_set_all(&disk->part0, 0);
752 disk->part0.stamp = 0;
753
754 kobject_put(disk->part0.holder_dir);
755 kobject_put(disk->slave_dir);
756 disk->driverfs_dev = NULL;
757 if (!sysfs_deprecated)
758 sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk)));
759 device_del(disk_to_dev(disk));
760}
diff --git a/fs/pipe.c b/fs/pipe.c
index a8012a95572..da42f7db50d 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -441,7 +441,7 @@ redo:
441 break; 441 break;
442 } 442 }
443 if (do_wakeup) { 443 if (do_wakeup) {
444 wake_up_interruptible_sync(&pipe->wait); 444 wake_up_interruptible_sync_poll(&pipe->wait, POLLOUT | POLLWRNORM);
445 kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); 445 kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
446 } 446 }
447 pipe_wait(pipe); 447 pipe_wait(pipe);
@@ -450,7 +450,7 @@ redo:
450 450
451 /* Signal writers asynchronously that there is more room. */ 451 /* Signal writers asynchronously that there is more room. */
452 if (do_wakeup) { 452 if (do_wakeup) {
453 wake_up_interruptible_sync(&pipe->wait); 453 wake_up_interruptible_sync_poll(&pipe->wait, POLLOUT | POLLWRNORM);
454 kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); 454 kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
455 } 455 }
456 if (ret > 0) 456 if (ret > 0)
@@ -612,7 +612,7 @@ redo2:
612 break; 612 break;
613 } 613 }
614 if (do_wakeup) { 614 if (do_wakeup) {
615 wake_up_interruptible_sync(&pipe->wait); 615 wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLRDNORM);
616 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); 616 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
617 do_wakeup = 0; 617 do_wakeup = 0;
618 } 618 }
@@ -623,7 +623,7 @@ redo2:
623out: 623out:
624 mutex_unlock(&inode->i_mutex); 624 mutex_unlock(&inode->i_mutex);
625 if (do_wakeup) { 625 if (do_wakeup) {
626 wake_up_interruptible_sync(&pipe->wait); 626 wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLRDNORM);
627 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); 627 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
628 } 628 }
629 if (ret > 0) 629 if (ret > 0)
@@ -715,7 +715,7 @@ pipe_release(struct inode *inode, int decr, int decw)
715 if (!pipe->readers && !pipe->writers) { 715 if (!pipe->readers && !pipe->writers) {
716 free_pipe_info(inode); 716 free_pipe_info(inode);
717 } else { 717 } else {
718 wake_up_interruptible_sync(&pipe->wait); 718 wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM | POLLERR | POLLHUP);
719 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); 719 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
720 kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); 720 kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
721 } 721 }
@@ -999,12 +999,11 @@ struct file *create_write_pipe(int flags)
999 goto err; 999 goto err;
1000 1000
1001 err = -ENOMEM; 1001 err = -ENOMEM;
1002 path.dentry = d_alloc(pipe_mnt->mnt_sb->s_root, &name); 1002 path.dentry = d_alloc_pseudo(pipe_mnt->mnt_sb, &name);
1003 if (!path.dentry) 1003 if (!path.dentry)
1004 goto err_inode; 1004 goto err_inode;
1005 path.mnt = mntget(pipe_mnt); 1005 path.mnt = mntget(pipe_mnt);
1006 1006
1007 path.dentry->d_op = &pipefs_dentry_operations;
1008 d_instantiate(path.dentry, inode); 1007 d_instantiate(path.dentry, inode);
1009 1008
1010 err = -ENFILE; 1009 err = -ENFILE;
@@ -1199,12 +1198,24 @@ int pipe_proc_fn(struct ctl_table *table, int write, void __user *buf,
1199 return ret; 1198 return ret;
1200} 1199}
1201 1200
1201/*
1202 * After the inode slimming patch, i_pipe/i_bdev/i_cdev share the same
1203 * location, so checking ->i_pipe is not enough to verify that this is a
1204 * pipe.
1205 */
1206struct pipe_inode_info *get_pipe_info(struct file *file)
1207{
1208 struct inode *i = file->f_path.dentry->d_inode;
1209
1210 return S_ISFIFO(i->i_mode) ? i->i_pipe : NULL;
1211}
1212
1202long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg) 1213long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
1203{ 1214{
1204 struct pipe_inode_info *pipe; 1215 struct pipe_inode_info *pipe;
1205 long ret; 1216 long ret;
1206 1217
1207 pipe = file->f_path.dentry->d_inode->i_pipe; 1218 pipe = get_pipe_info(file);
1208 if (!pipe) 1219 if (!pipe)
1209 return -EBADF; 1220 return -EBADF;
1210 1221
@@ -1241,6 +1252,10 @@ out:
1241 return ret; 1252 return ret;
1242} 1253}
1243 1254
1255static const struct super_operations pipefs_ops = {
1256 .destroy_inode = free_inode_nonrcu,
1257};
1258
1244/* 1259/*
1245 * pipefs should _never_ be mounted by userland - too much of security hassle, 1260 * pipefs should _never_ be mounted by userland - too much of security hassle,
1246 * no real gain from having the whole whorehouse mounted. So we don't need 1261 * no real gain from having the whole whorehouse mounted. So we don't need
@@ -1250,7 +1265,8 @@ out:
1250static struct dentry *pipefs_mount(struct file_system_type *fs_type, 1265static struct dentry *pipefs_mount(struct file_system_type *fs_type,
1251 int flags, const char *dev_name, void *data) 1266 int flags, const char *dev_name, void *data)
1252{ 1267{
1253 return mount_pseudo(fs_type, "pipe:", NULL, PIPEFS_MAGIC); 1268 return mount_pseudo(fs_type, "pipe:", &pipefs_ops,
1269 &pipefs_dentry_operations, PIPEFS_MAGIC);
1254} 1270}
1255 1271
1256static struct file_system_type pipe_fs_type = { 1272static struct file_system_type pipe_fs_type = {
diff --git a/fs/pnode.c b/fs/pnode.c
index 8066b8dd748..d42514e3238 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -288,7 +288,7 @@ out:
288 */ 288 */
289static inline int do_refcount_check(struct vfsmount *mnt, int count) 289static inline int do_refcount_check(struct vfsmount *mnt, int count)
290{ 290{
291 int mycount = atomic_read(&mnt->mnt_count) - mnt->mnt_ghosts; 291 int mycount = mnt_get_count(mnt) - mnt->mnt_ghosts;
292 return (mycount > count); 292 return (mycount > count);
293} 293}
294 294
@@ -300,7 +300,7 @@ static inline int do_refcount_check(struct vfsmount *mnt, int count)
300 * Check if any of these mounts that **do not have submounts** 300 * Check if any of these mounts that **do not have submounts**
301 * have more references than 'refcnt'. If so return busy. 301 * have more references than 'refcnt'. If so return busy.
302 * 302 *
303 * vfsmount lock must be held for read or write 303 * vfsmount lock must be held for write
304 */ 304 */
305int propagate_mount_busy(struct vfsmount *mnt, int refcnt) 305int propagate_mount_busy(struct vfsmount *mnt, int refcnt)
306{ 306{
diff --git a/fs/posix_acl.c b/fs/posix_acl.c
index 39df95a0ec2..b1cf6bf4b41 100644
--- a/fs/posix_acl.c
+++ b/fs/posix_acl.c
@@ -22,6 +22,7 @@
22 22
23#include <linux/errno.h> 23#include <linux/errno.h>
24 24
25EXPORT_SYMBOL(posix_acl_init);
25EXPORT_SYMBOL(posix_acl_alloc); 26EXPORT_SYMBOL(posix_acl_alloc);
26EXPORT_SYMBOL(posix_acl_clone); 27EXPORT_SYMBOL(posix_acl_clone);
27EXPORT_SYMBOL(posix_acl_valid); 28EXPORT_SYMBOL(posix_acl_valid);
@@ -32,6 +33,16 @@ EXPORT_SYMBOL(posix_acl_chmod_masq);
32EXPORT_SYMBOL(posix_acl_permission); 33EXPORT_SYMBOL(posix_acl_permission);
33 34
34/* 35/*
36 * Init a fresh posix_acl
37 */
38void
39posix_acl_init(struct posix_acl *acl, int count)
40{
41 atomic_set(&acl->a_refcount, 1);
42 acl->a_count = count;
43}
44
45/*
35 * Allocate a new ACL with the specified number of entries. 46 * Allocate a new ACL with the specified number of entries.
36 */ 47 */
37struct posix_acl * 48struct posix_acl *
@@ -40,10 +51,8 @@ posix_acl_alloc(int count, gfp_t flags)
40 const size_t size = sizeof(struct posix_acl) + 51 const size_t size = sizeof(struct posix_acl) +
41 count * sizeof(struct posix_acl_entry); 52 count * sizeof(struct posix_acl_entry);
42 struct posix_acl *acl = kmalloc(size, flags); 53 struct posix_acl *acl = kmalloc(size, flags);
43 if (acl) { 54 if (acl)
44 atomic_set(&acl->a_refcount, 1); 55 posix_acl_init(acl, count);
45 acl->a_count = count;
46 }
47 return acl; 56 return acl;
48} 57}
49 58
diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig
index 6a0068841d9..15af6222f8a 100644
--- a/fs/proc/Kconfig
+++ b/fs/proc/Kconfig
@@ -1,5 +1,5 @@
1config PROC_FS 1config PROC_FS
2 bool "/proc file system support" if EMBEDDED 2 bool "/proc file system support" if EXPERT
3 default y 3 default y
4 help 4 help
5 This is a virtual file system providing information about the status 5 This is a virtual file system providing information about the status
@@ -40,7 +40,7 @@ config PROC_VMCORE
40 Exports the dump image of crashed kernel in ELF format. 40 Exports the dump image of crashed kernel in ELF format.
41 41
42config PROC_SYSCTL 42config PROC_SYSCTL
43 bool "Sysctl support (/proc/sys)" if EMBEDDED 43 bool "Sysctl support (/proc/sys)" if EXPERT
44 depends on PROC_FS 44 depends on PROC_FS
45 select SYSCTL 45 select SYSCTL
46 default y 46 default y
@@ -61,7 +61,7 @@ config PROC_SYSCTL
61config PROC_PAGE_MONITOR 61config PROC_PAGE_MONITOR
62 default y 62 default y
63 depends on PROC_FS && MMU 63 depends on PROC_FS && MMU
64 bool "Enable /proc page monitoring" if EMBEDDED 64 bool "Enable /proc page monitoring" if EXPERT
65 help 65 help
66 Various /proc files exist to monitor process memory utilization: 66 Various /proc files exist to monitor process memory utilization:
67 /proc/pid/smaps, /proc/pid/clear_refs, /proc/pid/pagemap, 67 /proc/pid/smaps, /proc/pid/clear_refs, /proc/pid/pagemap,
diff --git a/fs/proc/Makefile b/fs/proc/Makefile
index 2758e2afc51..df434c5f28f 100644
--- a/fs/proc/Makefile
+++ b/fs/proc/Makefile
@@ -10,6 +10,7 @@ proc-$(CONFIG_MMU) := mmu.o task_mmu.o
10proc-y += inode.o root.o base.o generic.o array.o \ 10proc-y += inode.o root.o base.o generic.o array.o \
11 proc_tty.o 11 proc_tty.o
12proc-y += cmdline.o 12proc-y += cmdline.o
13proc-y += consoles.o
13proc-y += cpuinfo.o 14proc-y += cpuinfo.o
14proc-y += devices.o 15proc-y += devices.o
15proc-y += interrupts.o 16proc-y += interrupts.o
diff --git a/fs/proc/array.c b/fs/proc/array.c
index fff6572676a..df2b703b9d0 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -95,7 +95,7 @@ static inline void task_name(struct seq_file *m, struct task_struct *p)
95 95
96 get_task_comm(tcomm, p); 96 get_task_comm(tcomm, p);
97 97
98 seq_printf(m, "Name:\t"); 98 seq_puts(m, "Name:\t");
99 end = m->buf + m->size; 99 end = m->buf + m->size;
100 buf = m->buf + m->count; 100 buf = m->buf + m->count;
101 name = tcomm; 101 name = tcomm;
@@ -122,7 +122,7 @@ static inline void task_name(struct seq_file *m, struct task_struct *p)
122 buf++; 122 buf++;
123 } 123 }
124 m->count = buf - m->buf; 124 m->count = buf - m->buf;
125 seq_printf(m, "\n"); 125 seq_putc(m, '\n');
126} 126}
127 127
128/* 128/*
@@ -208,7 +208,7 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
208 seq_printf(m, "%d ", GROUP_AT(group_info, g)); 208 seq_printf(m, "%d ", GROUP_AT(group_info, g));
209 put_cred(cred); 209 put_cred(cred);
210 210
211 seq_printf(m, "\n"); 211 seq_putc(m, '\n');
212} 212}
213 213
214static void render_sigset_t(struct seq_file *m, const char *header, 214static void render_sigset_t(struct seq_file *m, const char *header,
@@ -216,7 +216,7 @@ static void render_sigset_t(struct seq_file *m, const char *header,
216{ 216{
217 int i; 217 int i;
218 218
219 seq_printf(m, "%s", header); 219 seq_puts(m, header);
220 220
221 i = _NSIG; 221 i = _NSIG;
222 do { 222 do {
@@ -230,7 +230,7 @@ static void render_sigset_t(struct seq_file *m, const char *header,
230 seq_printf(m, "%x", x); 230 seq_printf(m, "%x", x);
231 } while (i >= 4); 231 } while (i >= 4);
232 232
233 seq_printf(m, "\n"); 233 seq_putc(m, '\n');
234} 234}
235 235
236static void collect_sigign_sigcatch(struct task_struct *p, sigset_t *ign, 236static void collect_sigign_sigcatch(struct task_struct *p, sigset_t *ign,
@@ -291,12 +291,12 @@ static void render_cap_t(struct seq_file *m, const char *header,
291{ 291{
292 unsigned __capi; 292 unsigned __capi;
293 293
294 seq_printf(m, "%s", header); 294 seq_puts(m, header);
295 CAP_FOR_EACH_U32(__capi) { 295 CAP_FOR_EACH_U32(__capi) {
296 seq_printf(m, "%08x", 296 seq_printf(m, "%08x",
297 a->cap[(_KERNEL_CAPABILITY_U32S-1) - __capi]); 297 a->cap[(_KERNEL_CAPABILITY_U32S-1) - __capi]);
298 } 298 }
299 seq_printf(m, "\n"); 299 seq_putc(m, '\n');
300} 300}
301 301
302static inline void task_cap(struct seq_file *m, struct task_struct *p) 302static inline void task_cap(struct seq_file *m, struct task_struct *p)
@@ -329,12 +329,12 @@ static inline void task_context_switch_counts(struct seq_file *m,
329 329
330static void task_cpus_allowed(struct seq_file *m, struct task_struct *task) 330static void task_cpus_allowed(struct seq_file *m, struct task_struct *task)
331{ 331{
332 seq_printf(m, "Cpus_allowed:\t"); 332 seq_puts(m, "Cpus_allowed:\t");
333 seq_cpumask(m, &task->cpus_allowed); 333 seq_cpumask(m, &task->cpus_allowed);
334 seq_printf(m, "\n"); 334 seq_putc(m, '\n');
335 seq_printf(m, "Cpus_allowed_list:\t"); 335 seq_puts(m, "Cpus_allowed_list:\t");
336 seq_cpumask_list(m, &task->cpus_allowed); 336 seq_cpumask_list(m, &task->cpus_allowed);
337 seq_printf(m, "\n"); 337 seq_putc(m, '\n');
338} 338}
339 339
340int proc_pid_status(struct seq_file *m, struct pid_namespace *ns, 340int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
@@ -535,15 +535,15 @@ int proc_tgid_stat(struct seq_file *m, struct pid_namespace *ns,
535int proc_pid_statm(struct seq_file *m, struct pid_namespace *ns, 535int proc_pid_statm(struct seq_file *m, struct pid_namespace *ns,
536 struct pid *pid, struct task_struct *task) 536 struct pid *pid, struct task_struct *task)
537{ 537{
538 int size = 0, resident = 0, shared = 0, text = 0, lib = 0, data = 0; 538 unsigned long size = 0, resident = 0, shared = 0, text = 0, data = 0;
539 struct mm_struct *mm = get_task_mm(task); 539 struct mm_struct *mm = get_task_mm(task);
540 540
541 if (mm) { 541 if (mm) {
542 size = task_statm(mm, &shared, &text, &data, &resident); 542 size = task_statm(mm, &shared, &text, &data, &resident);
543 mmput(mm); 543 mmput(mm);
544 } 544 }
545 seq_printf(m, "%d %d %d %d %d %d %d\n", 545 seq_printf(m, "%lu %lu %lu %lu 0 %lu 0\n",
546 size, resident, shared, text, lib, data, 0); 546 size, resident, shared, text, data);
547 547
548 return 0; 548 return 0;
549} 549}
diff --git a/fs/proc/base.c b/fs/proc/base.c
index f3d02ca461e..9d096e82b20 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -373,26 +373,20 @@ static int lstats_show_proc(struct seq_file *m, void *v)
373 return -ESRCH; 373 return -ESRCH;
374 seq_puts(m, "Latency Top version : v0.1\n"); 374 seq_puts(m, "Latency Top version : v0.1\n");
375 for (i = 0; i < 32; i++) { 375 for (i = 0; i < 32; i++) {
376 if (task->latency_record[i].backtrace[0]) { 376 struct latency_record *lr = &task->latency_record[i];
377 if (lr->backtrace[0]) {
377 int q; 378 int q;
378 seq_printf(m, "%i %li %li ", 379 seq_printf(m, "%i %li %li",
379 task->latency_record[i].count, 380 lr->count, lr->time, lr->max);
380 task->latency_record[i].time,
381 task->latency_record[i].max);
382 for (q = 0; q < LT_BACKTRACEDEPTH; q++) { 381 for (q = 0; q < LT_BACKTRACEDEPTH; q++) {
383 char sym[KSYM_SYMBOL_LEN]; 382 unsigned long bt = lr->backtrace[q];
384 char *c; 383 if (!bt)
385 if (!task->latency_record[i].backtrace[q])
386 break; 384 break;
387 if (task->latency_record[i].backtrace[q] == ULONG_MAX) 385 if (bt == ULONG_MAX)
388 break; 386 break;
389 sprint_symbol(sym, task->latency_record[i].backtrace[q]); 387 seq_printf(m, " %ps", (void *)bt);
390 c = strchr(sym, '+');
391 if (c)
392 *c = 0;
393 seq_printf(m, "%s ", sym);
394 } 388 }
395 seq_printf(m, "\n"); 389 seq_putc(m, '\n');
396 } 390 }
397 391
398 } 392 }
@@ -751,14 +745,7 @@ static int proc_single_show(struct seq_file *m, void *v)
751 745
752static int proc_single_open(struct inode *inode, struct file *filp) 746static int proc_single_open(struct inode *inode, struct file *filp)
753{ 747{
754 int ret; 748 return single_open(filp, proc_single_show, inode);
755 ret = single_open(filp, proc_single_show, NULL);
756 if (!ret) {
757 struct seq_file *m = filp->private_data;
758
759 m->private = inode;
760 }
761 return ret;
762} 749}
763 750
764static const struct file_operations proc_single_file_operations = { 751static const struct file_operations proc_single_file_operations = {
@@ -1164,7 +1151,7 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,
1164 goto err_task_lock; 1151 goto err_task_lock;
1165 } 1152 }
1166 1153
1167 if (oom_score_adj < task->signal->oom_score_adj && 1154 if (oom_score_adj < task->signal->oom_score_adj_min &&
1168 !capable(CAP_SYS_RESOURCE)) { 1155 !capable(CAP_SYS_RESOURCE)) {
1169 err = -EACCES; 1156 err = -EACCES;
1170 goto err_sighand; 1157 goto err_sighand;
@@ -1177,6 +1164,8 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,
1177 atomic_dec(&task->mm->oom_disable_count); 1164 atomic_dec(&task->mm->oom_disable_count);
1178 } 1165 }
1179 task->signal->oom_score_adj = oom_score_adj; 1166 task->signal->oom_score_adj = oom_score_adj;
1167 if (has_capability_noaudit(current, CAP_SYS_RESOURCE))
1168 task->signal->oom_score_adj_min = oom_score_adj;
1180 /* 1169 /*
1181 * Scale /proc/pid/oom_adj appropriately ensuring that OOM_DISABLE is 1170 * Scale /proc/pid/oom_adj appropriately ensuring that OOM_DISABLE is
1182 * always attainable. 1171 * always attainable.
@@ -1386,9 +1375,77 @@ sched_write(struct file *file, const char __user *buf,
1386 1375
1387static int sched_open(struct inode *inode, struct file *filp) 1376static int sched_open(struct inode *inode, struct file *filp)
1388{ 1377{
1378 return single_open(filp, sched_show, inode);
1379}
1380
1381static const struct file_operations proc_pid_sched_operations = {
1382 .open = sched_open,
1383 .read = seq_read,
1384 .write = sched_write,
1385 .llseek = seq_lseek,
1386 .release = single_release,
1387};
1388
1389#endif
1390
1391#ifdef CONFIG_SCHED_AUTOGROUP
1392/*
1393 * Print out autogroup related information:
1394 */
1395static int sched_autogroup_show(struct seq_file *m, void *v)
1396{
1397 struct inode *inode = m->private;
1398 struct task_struct *p;
1399
1400 p = get_proc_task(inode);
1401 if (!p)
1402 return -ESRCH;
1403 proc_sched_autogroup_show_task(p, m);
1404
1405 put_task_struct(p);
1406
1407 return 0;
1408}
1409
1410static ssize_t
1411sched_autogroup_write(struct file *file, const char __user *buf,
1412 size_t count, loff_t *offset)
1413{
1414 struct inode *inode = file->f_path.dentry->d_inode;
1415 struct task_struct *p;
1416 char buffer[PROC_NUMBUF];
1417 long nice;
1418 int err;
1419
1420 memset(buffer, 0, sizeof(buffer));
1421 if (count > sizeof(buffer) - 1)
1422 count = sizeof(buffer) - 1;
1423 if (copy_from_user(buffer, buf, count))
1424 return -EFAULT;
1425
1426 err = strict_strtol(strstrip(buffer), 0, &nice);
1427 if (err)
1428 return -EINVAL;
1429
1430 p = get_proc_task(inode);
1431 if (!p)
1432 return -ESRCH;
1433
1434 err = nice;
1435 err = proc_sched_autogroup_set_nice(p, &err);
1436 if (err)
1437 count = err;
1438
1439 put_task_struct(p);
1440
1441 return count;
1442}
1443
1444static int sched_autogroup_open(struct inode *inode, struct file *filp)
1445{
1389 int ret; 1446 int ret;
1390 1447
1391 ret = single_open(filp, sched_show, NULL); 1448 ret = single_open(filp, sched_autogroup_show, NULL);
1392 if (!ret) { 1449 if (!ret) {
1393 struct seq_file *m = filp->private_data; 1450 struct seq_file *m = filp->private_data;
1394 1451
@@ -1397,15 +1454,15 @@ static int sched_open(struct inode *inode, struct file *filp)
1397 return ret; 1454 return ret;
1398} 1455}
1399 1456
1400static const struct file_operations proc_pid_sched_operations = { 1457static const struct file_operations proc_pid_sched_autogroup_operations = {
1401 .open = sched_open, 1458 .open = sched_autogroup_open,
1402 .read = seq_read, 1459 .read = seq_read,
1403 .write = sched_write, 1460 .write = sched_autogroup_write,
1404 .llseek = seq_lseek, 1461 .llseek = seq_lseek,
1405 .release = single_release, 1462 .release = single_release,
1406}; 1463};
1407 1464
1408#endif 1465#endif /* CONFIG_SCHED_AUTOGROUP */
1409 1466
1410static ssize_t comm_write(struct file *file, const char __user *buf, 1467static ssize_t comm_write(struct file *file, const char __user *buf,
1411 size_t count, loff_t *offset) 1468 size_t count, loff_t *offset)
@@ -1454,15 +1511,7 @@ static int comm_show(struct seq_file *m, void *v)
1454 1511
1455static int comm_open(struct inode *inode, struct file *filp) 1512static int comm_open(struct inode *inode, struct file *filp)
1456{ 1513{
1457 int ret; 1514 return single_open(filp, comm_show, inode);
1458
1459 ret = single_open(filp, comm_show, NULL);
1460 if (!ret) {
1461 struct seq_file *m = filp->private_data;
1462
1463 m->private = inode;
1464 }
1465 return ret;
1466} 1515}
1467 1516
1468static const struct file_operations proc_pid_set_comm_operations = { 1517static const struct file_operations proc_pid_set_comm_operations = {
@@ -1574,7 +1623,7 @@ static int do_proc_readlink(struct path *path, char __user *buffer, int buflen)
1574 if (!tmp) 1623 if (!tmp)
1575 return -ENOMEM; 1624 return -ENOMEM;
1576 1625
1577 pathname = d_path_with_unreachable(path, tmp, PAGE_SIZE); 1626 pathname = d_path(path, tmp, PAGE_SIZE);
1578 len = PTR_ERR(pathname); 1627 len = PTR_ERR(pathname);
1579 if (IS_ERR(pathname)) 1628 if (IS_ERR(pathname))
1580 goto out; 1629 goto out;
@@ -1719,10 +1768,16 @@ static int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat
1719 */ 1768 */
1720static int pid_revalidate(struct dentry *dentry, struct nameidata *nd) 1769static int pid_revalidate(struct dentry *dentry, struct nameidata *nd)
1721{ 1770{
1722 struct inode *inode = dentry->d_inode; 1771 struct inode *inode;
1723 struct task_struct *task = get_proc_task(inode); 1772 struct task_struct *task;
1724 const struct cred *cred; 1773 const struct cred *cred;
1725 1774
1775 if (nd && nd->flags & LOOKUP_RCU)
1776 return -ECHILD;
1777
1778 inode = dentry->d_inode;
1779 task = get_proc_task(inode);
1780
1726 if (task) { 1781 if (task) {
1727 if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) || 1782 if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) ||
1728 task_dumpable(task)) { 1783 task_dumpable(task)) {
@@ -1744,7 +1799,7 @@ static int pid_revalidate(struct dentry *dentry, struct nameidata *nd)
1744 return 0; 1799 return 0;
1745} 1800}
1746 1801
1747static int pid_delete_dentry(struct dentry * dentry) 1802static int pid_delete_dentry(const struct dentry * dentry)
1748{ 1803{
1749 /* Is the task we represent dead? 1804 /* Is the task we represent dead?
1750 * If so, then don't put the dentry on the lru list, 1805 * If so, then don't put the dentry on the lru list,
@@ -1888,12 +1943,19 @@ static int proc_fd_link(struct inode *inode, struct path *path)
1888 1943
1889static int tid_fd_revalidate(struct dentry *dentry, struct nameidata *nd) 1944static int tid_fd_revalidate(struct dentry *dentry, struct nameidata *nd)
1890{ 1945{
1891 struct inode *inode = dentry->d_inode; 1946 struct inode *inode;
1892 struct task_struct *task = get_proc_task(inode); 1947 struct task_struct *task;
1893 int fd = proc_fd(inode); 1948 int fd;
1894 struct files_struct *files; 1949 struct files_struct *files;
1895 const struct cred *cred; 1950 const struct cred *cred;
1896 1951
1952 if (nd && nd->flags & LOOKUP_RCU)
1953 return -ECHILD;
1954
1955 inode = dentry->d_inode;
1956 task = get_proc_task(inode);
1957 fd = proc_fd(inode);
1958
1897 if (task) { 1959 if (task) {
1898 files = get_files_struct(task); 1960 files = get_files_struct(task);
1899 if (files) { 1961 if (files) {
@@ -1969,7 +2031,7 @@ static struct dentry *proc_fd_instantiate(struct inode *dir,
1969 inode->i_op = &proc_pid_link_inode_operations; 2031 inode->i_op = &proc_pid_link_inode_operations;
1970 inode->i_size = 64; 2032 inode->i_size = 64;
1971 ei->op.proc_get_link = proc_fd_link; 2033 ei->op.proc_get_link = proc_fd_link;
1972 dentry->d_op = &tid_fd_dentry_operations; 2034 d_set_d_op(dentry, &tid_fd_dentry_operations);
1973 d_add(dentry, inode); 2035 d_add(dentry, inode);
1974 /* Close the race of the process dying before we return the dentry */ 2036 /* Close the race of the process dying before we return the dentry */
1975 if (tid_fd_revalidate(dentry, NULL)) 2037 if (tid_fd_revalidate(dentry, NULL))
@@ -2101,11 +2163,13 @@ static const struct file_operations proc_fd_operations = {
2101 * /proc/pid/fd needs a special permission handler so that a process can still 2163 * /proc/pid/fd needs a special permission handler so that a process can still
2102 * access /proc/self/fd after it has executed a setuid(). 2164 * access /proc/self/fd after it has executed a setuid().
2103 */ 2165 */
2104static int proc_fd_permission(struct inode *inode, int mask) 2166static int proc_fd_permission(struct inode *inode, int mask, unsigned int flags)
2105{ 2167{
2106 int rv; 2168 int rv;
2107 2169
2108 rv = generic_permission(inode, mask, NULL); 2170 if (flags & IPERM_FLAG_RCU)
2171 return -ECHILD;
2172 rv = generic_permission(inode, mask, flags, NULL);
2109 if (rv == 0) 2173 if (rv == 0)
2110 return 0; 2174 return 0;
2111 if (task_pid(current) == proc_pid(inode)) 2175 if (task_pid(current) == proc_pid(inode))
@@ -2137,7 +2201,7 @@ static struct dentry *proc_fdinfo_instantiate(struct inode *dir,
2137 ei->fd = fd; 2201 ei->fd = fd;
2138 inode->i_mode = S_IFREG | S_IRUSR; 2202 inode->i_mode = S_IFREG | S_IRUSR;
2139 inode->i_fop = &proc_fdinfo_file_operations; 2203 inode->i_fop = &proc_fdinfo_file_operations;
2140 dentry->d_op = &tid_fd_dentry_operations; 2204 d_set_d_op(dentry, &tid_fd_dentry_operations);
2141 d_add(dentry, inode); 2205 d_add(dentry, inode);
2142 /* Close the race of the process dying before we return the dentry */ 2206 /* Close the race of the process dying before we return the dentry */
2143 if (tid_fd_revalidate(dentry, NULL)) 2207 if (tid_fd_revalidate(dentry, NULL))
@@ -2196,7 +2260,7 @@ static struct dentry *proc_pident_instantiate(struct inode *dir,
2196 if (p->fop) 2260 if (p->fop)
2197 inode->i_fop = p->fop; 2261 inode->i_fop = p->fop;
2198 ei->op = p->op; 2262 ei->op = p->op;
2199 dentry->d_op = &pid_dentry_operations; 2263 d_set_d_op(dentry, &pid_dentry_operations);
2200 d_add(dentry, inode); 2264 d_add(dentry, inode);
2201 /* Close the race of the process dying before we return the dentry */ 2265 /* Close the race of the process dying before we return the dentry */
2202 if (pid_revalidate(dentry, NULL)) 2266 if (pid_revalidate(dentry, NULL))
@@ -2563,8 +2627,14 @@ static const struct pid_entry proc_base_stuff[] = {
2563 */ 2627 */
2564static int proc_base_revalidate(struct dentry *dentry, struct nameidata *nd) 2628static int proc_base_revalidate(struct dentry *dentry, struct nameidata *nd)
2565{ 2629{
2566 struct inode *inode = dentry->d_inode; 2630 struct inode *inode;
2567 struct task_struct *task = get_proc_task(inode); 2631 struct task_struct *task;
2632
2633 if (nd->flags & LOOKUP_RCU)
2634 return -ECHILD;
2635
2636 inode = dentry->d_inode;
2637 task = get_proc_task(inode);
2568 if (task) { 2638 if (task) {
2569 put_task_struct(task); 2639 put_task_struct(task);
2570 return 1; 2640 return 1;
@@ -2615,7 +2685,7 @@ static struct dentry *proc_base_instantiate(struct inode *dir,
2615 if (p->fop) 2685 if (p->fop)
2616 inode->i_fop = p->fop; 2686 inode->i_fop = p->fop;
2617 ei->op = p->op; 2687 ei->op = p->op;
2618 dentry->d_op = &proc_base_dentry_operations; 2688 d_set_d_op(dentry, &proc_base_dentry_operations);
2619 d_add(dentry, inode); 2689 d_add(dentry, inode);
2620 error = NULL; 2690 error = NULL;
2621out: 2691out:
@@ -2733,6 +2803,9 @@ static const struct pid_entry tgid_base_stuff[] = {
2733#ifdef CONFIG_SCHED_DEBUG 2803#ifdef CONFIG_SCHED_DEBUG
2734 REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), 2804 REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations),
2735#endif 2805#endif
2806#ifdef CONFIG_SCHED_AUTOGROUP
2807 REG("autogroup", S_IRUGO|S_IWUSR, proc_pid_sched_autogroup_operations),
2808#endif
2736 REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations), 2809 REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations),
2737#ifdef CONFIG_HAVE_ARCH_TRACEHOOK 2810#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
2738 INF("syscall", S_IRUSR, proc_pid_syscall), 2811 INF("syscall", S_IRUSR, proc_pid_syscall),
@@ -2926,7 +2999,7 @@ static struct dentry *proc_pid_instantiate(struct inode *dir,
2926 inode->i_nlink = 2 + pid_entry_count_dirs(tgid_base_stuff, 2999 inode->i_nlink = 2 + pid_entry_count_dirs(tgid_base_stuff,
2927 ARRAY_SIZE(tgid_base_stuff)); 3000 ARRAY_SIZE(tgid_base_stuff));
2928 3001
2929 dentry->d_op = &pid_dentry_operations; 3002 d_set_d_op(dentry, &pid_dentry_operations);
2930 3003
2931 d_add(dentry, inode); 3004 d_add(dentry, inode);
2932 /* Close the race of the process dying before we return the dentry */ 3005 /* Close the race of the process dying before we return the dentry */
@@ -3169,7 +3242,7 @@ static struct dentry *proc_task_instantiate(struct inode *dir,
3169 inode->i_nlink = 2 + pid_entry_count_dirs(tid_base_stuff, 3242 inode->i_nlink = 2 + pid_entry_count_dirs(tid_base_stuff,
3170 ARRAY_SIZE(tid_base_stuff)); 3243 ARRAY_SIZE(tid_base_stuff));
3171 3244
3172 dentry->d_op = &pid_dentry_operations; 3245 d_set_d_op(dentry, &pid_dentry_operations);
3173 3246
3174 d_add(dentry, inode); 3247 d_add(dentry, inode);
3175 /* Close the race of the process dying before we return the dentry */ 3248 /* Close the race of the process dying before we return the dentry */
diff --git a/fs/proc/consoles.c b/fs/proc/consoles.c
new file mode 100644
index 00000000000..b701eaa482b
--- /dev/null
+++ b/fs/proc/consoles.c
@@ -0,0 +1,114 @@
1/*
2 * Copyright (c) 2010 Werner Fink, Jiri Slaby
3 *
4 * Licensed under GPLv2
5 */
6
7#include <linux/console.h>
8#include <linux/kernel.h>
9#include <linux/proc_fs.h>
10#include <linux/seq_file.h>
11#include <linux/tty_driver.h>
12
13/*
14 * This is handler for /proc/consoles
15 */
16static int show_console_dev(struct seq_file *m, void *v)
17{
18 static const struct {
19 short flag;
20 char name;
21 } con_flags[] = {
22 { CON_ENABLED, 'E' },
23 { CON_CONSDEV, 'C' },
24 { CON_BOOT, 'B' },
25 { CON_PRINTBUFFER, 'p' },
26 { CON_BRL, 'b' },
27 { CON_ANYTIME, 'a' },
28 };
29 char flags[ARRAY_SIZE(con_flags) + 1];
30 struct console *con = v;
31 unsigned int a;
32 int len;
33 dev_t dev = 0;
34
35 if (con->device) {
36 const struct tty_driver *driver;
37 int index;
38 driver = con->device(con, &index);
39 if (driver) {
40 dev = MKDEV(driver->major, driver->minor_start);
41 dev += index;
42 }
43 }
44
45 for (a = 0; a < ARRAY_SIZE(con_flags); a++)
46 flags[a] = (con->flags & con_flags[a].flag) ?
47 con_flags[a].name : ' ';
48 flags[a] = 0;
49
50 seq_printf(m, "%s%d%n", con->name, con->index, &len);
51 len = 21 - len;
52 if (len < 1)
53 len = 1;
54 seq_printf(m, "%*c%c%c%c (%s)", len, ' ', con->read ? 'R' : '-',
55 con->write ? 'W' : '-', con->unblank ? 'U' : '-',
56 flags);
57 if (dev)
58 seq_printf(m, " %4d:%d", MAJOR(dev), MINOR(dev));
59
60 seq_printf(m, "\n");
61
62 return 0;
63}
64
65static void *c_start(struct seq_file *m, loff_t *pos)
66{
67 struct console *con;
68 loff_t off = 0;
69
70 console_lock();
71 for_each_console(con)
72 if (off++ == *pos)
73 break;
74
75 return con;
76}
77
78static void *c_next(struct seq_file *m, void *v, loff_t *pos)
79{
80 struct console *con = v;
81 ++*pos;
82 return con->next;
83}
84
85static void c_stop(struct seq_file *m, void *v)
86{
87 console_unlock();
88}
89
90static const struct seq_operations consoles_op = {
91 .start = c_start,
92 .next = c_next,
93 .stop = c_stop,
94 .show = show_console_dev
95};
96
97static int consoles_open(struct inode *inode, struct file *file)
98{
99 return seq_open(file, &consoles_op);
100}
101
102static const struct file_operations proc_consoles_operations = {
103 .open = consoles_open,
104 .read = seq_read,
105 .llseek = seq_lseek,
106 .release = seq_release,
107};
108
109static int __init proc_consoles_init(void)
110{
111 proc_create("consoles", 0, NULL, &proc_consoles_operations);
112 return 0;
113}
114module_init(proc_consoles_init);
diff --git a/fs/proc/devices.c b/fs/proc/devices.c
index 59ee7da959c..b14347167c3 100644
--- a/fs/proc/devices.c
+++ b/fs/proc/devices.c
@@ -9,14 +9,14 @@ static int devinfo_show(struct seq_file *f, void *v)
9 9
10 if (i < CHRDEV_MAJOR_HASH_SIZE) { 10 if (i < CHRDEV_MAJOR_HASH_SIZE) {
11 if (i == 0) 11 if (i == 0)
12 seq_printf(f, "Character devices:\n"); 12 seq_puts(f, "Character devices:\n");
13 chrdev_show(f, i); 13 chrdev_show(f, i);
14 } 14 }
15#ifdef CONFIG_BLOCK 15#ifdef CONFIG_BLOCK
16 else { 16 else {
17 i -= CHRDEV_MAJOR_HASH_SIZE; 17 i -= CHRDEV_MAJOR_HASH_SIZE;
18 if (i == 0) 18 if (i == 0)
19 seq_printf(f, "\nBlock devices:\n"); 19 seq_puts(f, "\nBlock devices:\n");
20 blkdev_show(f, i); 20 blkdev_show(f, i);
21 } 21 }
22#endif 22#endif
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index dd29f033766..01e07f2a188 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -400,7 +400,7 @@ static const struct inode_operations proc_link_inode_operations = {
400 * smarter: we could keep a "volatile" flag in the 400 * smarter: we could keep a "volatile" flag in the
401 * inode to indicate which ones to keep. 401 * inode to indicate which ones to keep.
402 */ 402 */
403static int proc_delete_dentry(struct dentry * dentry) 403static int proc_delete_dentry(const struct dentry * dentry)
404{ 404{
405 return 1; 405 return 1;
406} 406}
@@ -425,13 +425,10 @@ struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir,
425 if (de->namelen != dentry->d_name.len) 425 if (de->namelen != dentry->d_name.len)
426 continue; 426 continue;
427 if (!memcmp(dentry->d_name.name, de->name, de->namelen)) { 427 if (!memcmp(dentry->d_name.name, de->name, de->namelen)) {
428 unsigned int ino;
429
430 ino = de->low_ino;
431 pde_get(de); 428 pde_get(de);
432 spin_unlock(&proc_subdir_lock); 429 spin_unlock(&proc_subdir_lock);
433 error = -EINVAL; 430 error = -EINVAL;
434 inode = proc_get_inode(dir->i_sb, ino, de); 431 inode = proc_get_inode(dir->i_sb, de);
435 goto out_unlock; 432 goto out_unlock;
436 } 433 }
437 } 434 }
@@ -439,7 +436,7 @@ struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir,
439out_unlock: 436out_unlock:
440 437
441 if (inode) { 438 if (inode) {
442 dentry->d_op = &proc_dentry_operations; 439 d_set_d_op(dentry, &proc_dentry_operations);
443 d_add(dentry, inode); 440 d_add(dentry, inode);
444 return NULL; 441 return NULL;
445 } 442 }
@@ -768,12 +765,7 @@ EXPORT_SYMBOL(proc_create_data);
768 765
769static void free_proc_entry(struct proc_dir_entry *de) 766static void free_proc_entry(struct proc_dir_entry *de)
770{ 767{
771 unsigned int ino = de->low_ino; 768 release_inode_number(de->low_ino);
772
773 if (ino < PROC_DYNAMIC_FIRST)
774 return;
775
776 release_inode_number(ino);
777 769
778 if (S_ISLNK(de->mode)) 770 if (S_ISLNK(de->mode))
779 kfree(de->data); 771 kfree(de->data);
@@ -834,12 +826,9 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent)
834 826
835 wait_for_completion(de->pde_unload_completion); 827 wait_for_completion(de->pde_unload_completion);
836 828
837 goto continue_removing; 829 spin_lock(&de->pde_unload_lock);
838 } 830 }
839 spin_unlock(&de->pde_unload_lock);
840 831
841continue_removing:
842 spin_lock(&de->pde_unload_lock);
843 while (!list_empty(&de->pde_openers)) { 832 while (!list_empty(&de->pde_openers)) {
844 struct pde_opener *pdeo; 833 struct pde_opener *pdeo;
845 834
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 9c2b5f48487..176ce4cda68 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -16,7 +16,6 @@
16#include <linux/limits.h> 16#include <linux/limits.h>
17#include <linux/init.h> 17#include <linux/init.h>
18#include <linux/module.h> 18#include <linux/module.h>
19#include <linux/smp_lock.h>
20#include <linux/sysctl.h> 19#include <linux/sysctl.h>
21#include <linux/slab.h> 20#include <linux/slab.h>
22 21
@@ -66,11 +65,18 @@ static struct inode *proc_alloc_inode(struct super_block *sb)
66 return inode; 65 return inode;
67} 66}
68 67
69static void proc_destroy_inode(struct inode *inode) 68static void proc_i_callback(struct rcu_head *head)
70{ 69{
70 struct inode *inode = container_of(head, struct inode, i_rcu);
71 INIT_LIST_HEAD(&inode->i_dentry);
71 kmem_cache_free(proc_inode_cachep, PROC_I(inode)); 72 kmem_cache_free(proc_inode_cachep, PROC_I(inode));
72} 73}
73 74
75static void proc_destroy_inode(struct inode *inode)
76{
77 call_rcu(&inode->i_rcu, proc_i_callback);
78}
79
74static void init_once(void *foo) 80static void init_once(void *foo)
75{ 81{
76 struct proc_inode *ei = (struct proc_inode *) foo; 82 struct proc_inode *ei = (struct proc_inode *) foo;
@@ -410,12 +416,11 @@ static const struct file_operations proc_reg_file_ops_no_compat = {
410}; 416};
411#endif 417#endif
412 418
413struct inode *proc_get_inode(struct super_block *sb, unsigned int ino, 419struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de)
414 struct proc_dir_entry *de)
415{ 420{
416 struct inode * inode; 421 struct inode * inode;
417 422
418 inode = iget_locked(sb, ino); 423 inode = iget_locked(sb, de->low_ino);
419 if (!inode) 424 if (!inode)
420 return NULL; 425 return NULL;
421 if (inode->i_state & I_NEW) { 426 if (inode->i_state & I_NEW) {
@@ -465,7 +470,7 @@ int proc_fill_super(struct super_block *s)
465 s->s_time_gran = 1; 470 s->s_time_gran = 1;
466 471
467 pde_get(&proc_root); 472 pde_get(&proc_root);
468 root_inode = proc_get_inode(s, PROC_ROOT_INO, &proc_root); 473 root_inode = proc_get_inode(s, &proc_root);
469 if (!root_inode) 474 if (!root_inode)
470 goto out_no_root; 475 goto out_no_root;
471 root_inode->i_uid = 0; 476 root_inode->i_uid = 0;
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 1f24a3eddd1..9ad561ded40 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -96,7 +96,8 @@ extern spinlock_t proc_subdir_lock;
96struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *); 96struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *);
97int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir); 97int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir);
98unsigned long task_vsize(struct mm_struct *); 98unsigned long task_vsize(struct mm_struct *);
99int task_statm(struct mm_struct *, int *, int *, int *, int *); 99unsigned long task_statm(struct mm_struct *,
100 unsigned long *, unsigned long *, unsigned long *, unsigned long *);
100void task_mem(struct seq_file *, struct mm_struct *); 101void task_mem(struct seq_file *, struct mm_struct *);
101 102
102static inline struct proc_dir_entry *pde_get(struct proc_dir_entry *pde) 103static inline struct proc_dir_entry *pde_get(struct proc_dir_entry *pde)
@@ -108,7 +109,7 @@ void pde_put(struct proc_dir_entry *pde);
108 109
109extern struct vfsmount *proc_mnt; 110extern struct vfsmount *proc_mnt;
110int proc_fill_super(struct super_block *); 111int proc_fill_super(struct super_block *);
111struct inode *proc_get_inode(struct super_block *, unsigned int, struct proc_dir_entry *); 112struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *);
112 113
113/* 114/*
114 * These are generic /proc routines that use the internal 115 * These are generic /proc routines that use the internal
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 6f37c391468..d245cb23dd7 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -558,7 +558,7 @@ static int open_kcore(struct inode *inode, struct file *filp)
558static const struct file_operations proc_kcore_operations = { 558static const struct file_operations proc_kcore_operations = {
559 .read = read_kcore, 559 .read = read_kcore,
560 .open = open_kcore, 560 .open = open_kcore,
561 .llseek = generic_file_llseek, 561 .llseek = default_llseek,
562}; 562};
563 563
564#ifdef CONFIG_MEMORY_HOTPLUG 564#ifdef CONFIG_MEMORY_HOTPLUG
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index a65239cfd97..ed257d14156 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -101,6 +101,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
101#ifdef CONFIG_MEMORY_FAILURE 101#ifdef CONFIG_MEMORY_FAILURE
102 "HardwareCorrupted: %5lu kB\n" 102 "HardwareCorrupted: %5lu kB\n"
103#endif 103#endif
104#ifdef CONFIG_TRANSPARENT_HUGEPAGE
105 "AnonHugePages: %8lu kB\n"
106#endif
104 , 107 ,
105 K(i.totalram), 108 K(i.totalram),
106 K(i.freeram), 109 K(i.freeram),
@@ -128,7 +131,12 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
128 K(i.freeswap), 131 K(i.freeswap),
129 K(global_page_state(NR_FILE_DIRTY)), 132 K(global_page_state(NR_FILE_DIRTY)),
130 K(global_page_state(NR_WRITEBACK)), 133 K(global_page_state(NR_WRITEBACK)),
131 K(global_page_state(NR_ANON_PAGES)), 134 K(global_page_state(NR_ANON_PAGES)
135#ifdef CONFIG_TRANSPARENT_HUGEPAGE
136 + global_page_state(NR_ANON_TRANSPARENT_HUGEPAGES) *
137 HPAGE_PMD_NR
138#endif
139 ),
132 K(global_page_state(NR_FILE_MAPPED)), 140 K(global_page_state(NR_FILE_MAPPED)),
133 K(global_page_state(NR_SHMEM)), 141 K(global_page_state(NR_SHMEM)),
134 K(global_page_state(NR_SLAB_RECLAIMABLE) + 142 K(global_page_state(NR_SLAB_RECLAIMABLE) +
@@ -151,6 +159,10 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
151#ifdef CONFIG_MEMORY_FAILURE 159#ifdef CONFIG_MEMORY_FAILURE
152 ,atomic_long_read(&mce_bad_pages) << (PAGE_SHIFT - 10) 160 ,atomic_long_read(&mce_bad_pages) << (PAGE_SHIFT - 10)
153#endif 161#endif
162#ifdef CONFIG_TRANSPARENT_HUGEPAGE
163 ,K(global_page_state(NR_ANON_TRANSPARENT_HUGEPAGES) *
164 HPAGE_PMD_NR)
165#endif
154 ); 166 );
155 167
156 hugetlb_report_meminfo(m); 168 hugetlb_report_meminfo(m);
diff --git a/fs/proc/page.c b/fs/proc/page.c
index 3b8b4566033..6d8e6a9e93a 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -40,7 +40,7 @@ static ssize_t kpagecount_read(struct file *file, char __user *buf,
40 ppage = pfn_to_page(pfn); 40 ppage = pfn_to_page(pfn);
41 else 41 else
42 ppage = NULL; 42 ppage = NULL;
43 if (!ppage) 43 if (!ppage || PageSlab(ppage))
44 pcount = 0; 44 pcount = 0;
45 else 45 else
46 pcount = page_mapcount(ppage); 46 pcount = page_mapcount(ppage);
@@ -116,15 +116,17 @@ u64 stable_page_flags(struct page *page)
116 if (PageHuge(page)) 116 if (PageHuge(page))
117 u |= 1 << KPF_HUGE; 117 u |= 1 << KPF_HUGE;
118 118
119 u |= kpf_copy_bit(k, KPF_LOCKED, PG_locked);
120
121 /* 119 /*
122 * Caveats on high order pages: 120 * Caveats on high order pages: page->_count will only be set
123 * PG_buddy will only be set on the head page; SLUB/SLQB do the same 121 * -1 on the head page; SLUB/SLQB do the same for PG_slab;
124 * for PG_slab; SLOB won't set PG_slab at all on compound pages. 122 * SLOB won't set PG_slab at all on compound pages.
125 */ 123 */
124 if (PageBuddy(page))
125 u |= 1 << KPF_BUDDY;
126
127 u |= kpf_copy_bit(k, KPF_LOCKED, PG_locked);
128
126 u |= kpf_copy_bit(k, KPF_SLAB, PG_slab); 129 u |= kpf_copy_bit(k, KPF_SLAB, PG_slab);
127 u |= kpf_copy_bit(k, KPF_BUDDY, PG_buddy);
128 130
129 u |= kpf_copy_bit(k, KPF_ERROR, PG_error); 131 u |= kpf_copy_bit(k, KPF_ERROR, PG_error);
130 u |= kpf_copy_bit(k, KPF_DIRTY, PG_dirty); 132 u |= kpf_copy_bit(k, KPF_DIRTY, PG_dirty);
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index b652cb00906..09a1f92a34e 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -5,6 +5,7 @@
5#include <linux/sysctl.h> 5#include <linux/sysctl.h>
6#include <linux/proc_fs.h> 6#include <linux/proc_fs.h>
7#include <linux/security.h> 7#include <linux/security.h>
8#include <linux/namei.h>
8#include "internal.h" 9#include "internal.h"
9 10
10static const struct dentry_operations proc_sys_dentry_operations; 11static const struct dentry_operations proc_sys_dentry_operations;
@@ -120,7 +121,7 @@ static struct dentry *proc_sys_lookup(struct inode *dir, struct dentry *dentry,
120 goto out; 121 goto out;
121 122
122 err = NULL; 123 err = NULL;
123 dentry->d_op = &proc_sys_dentry_operations; 124 d_set_d_op(dentry, &proc_sys_dentry_operations);
124 d_add(dentry, inode); 125 d_add(dentry, inode);
125 126
126out: 127out:
@@ -201,7 +202,7 @@ static int proc_sys_fill_cache(struct file *filp, void *dirent,
201 dput(child); 202 dput(child);
202 return -ENOMEM; 203 return -ENOMEM;
203 } else { 204 } else {
204 child->d_op = &proc_sys_dentry_operations; 205 d_set_d_op(child, &proc_sys_dentry_operations);
205 d_add(child, inode); 206 d_add(child, inode);
206 } 207 }
207 } else { 208 } else {
@@ -294,7 +295,7 @@ out:
294 return ret; 295 return ret;
295} 296}
296 297
297static int proc_sys_permission(struct inode *inode, int mask) 298static int proc_sys_permission(struct inode *inode, int mask,unsigned int flags)
298{ 299{
299 /* 300 /*
300 * sysctl entries that are not writeable, 301 * sysctl entries that are not writeable,
@@ -304,6 +305,9 @@ static int proc_sys_permission(struct inode *inode, int mask)
304 struct ctl_table *table; 305 struct ctl_table *table;
305 int error; 306 int error;
306 307
308 if (flags & IPERM_FLAG_RCU)
309 return -ECHILD;
310
307 /* Executable files are not allowed under /proc/sys/ */ 311 /* Executable files are not allowed under /proc/sys/ */
308 if ((mask & MAY_EXEC) && S_ISREG(inode->i_mode)) 312 if ((mask & MAY_EXEC) && S_ISREG(inode->i_mode))
309 return -EACCES; 313 return -EACCES;
@@ -389,23 +393,30 @@ static const struct inode_operations proc_sys_dir_operations = {
389 393
390static int proc_sys_revalidate(struct dentry *dentry, struct nameidata *nd) 394static int proc_sys_revalidate(struct dentry *dentry, struct nameidata *nd)
391{ 395{
396 if (nd->flags & LOOKUP_RCU)
397 return -ECHILD;
392 return !PROC_I(dentry->d_inode)->sysctl->unregistering; 398 return !PROC_I(dentry->d_inode)->sysctl->unregistering;
393} 399}
394 400
395static int proc_sys_delete(struct dentry *dentry) 401static int proc_sys_delete(const struct dentry *dentry)
396{ 402{
397 return !!PROC_I(dentry->d_inode)->sysctl->unregistering; 403 return !!PROC_I(dentry->d_inode)->sysctl->unregistering;
398} 404}
399 405
400static int proc_sys_compare(struct dentry *dir, struct qstr *qstr, 406static int proc_sys_compare(const struct dentry *parent,
401 struct qstr *name) 407 const struct inode *pinode,
408 const struct dentry *dentry, const struct inode *inode,
409 unsigned int len, const char *str, const struct qstr *name)
402{ 410{
403 struct dentry *dentry = container_of(qstr, struct dentry, d_name); 411 /* Although proc doesn't have negative dentries, rcu-walk means
404 if (qstr->len != name->len) 412 * that inode here can be NULL */
413 if (!inode)
414 return 0;
415 if (name->len != len)
405 return 1; 416 return 1;
406 if (memcmp(qstr->name, name->name, name->len)) 417 if (memcmp(name->name, str, len))
407 return 1; 418 return 1;
408 return !sysctl_is_seen(PROC_I(dentry->d_inode)->sysctl); 419 return !sysctl_is_seen(PROC_I(inode)->sysctl);
409} 420}
410 421
411static const struct dentry_operations proc_sys_dentry_operations = { 422static const struct dentry_operations proc_sys_dentry_operations = {
diff --git a/fs/proc/proc_tty.c b/fs/proc/proc_tty.c
index 83adcc86943..cb761f01030 100644
--- a/fs/proc/proc_tty.c
+++ b/fs/proc/proc_tty.c
@@ -36,27 +36,27 @@ static void show_tty_range(struct seq_file *m, struct tty_driver *p,
36 } 36 }
37 switch (p->type) { 37 switch (p->type) {
38 case TTY_DRIVER_TYPE_SYSTEM: 38 case TTY_DRIVER_TYPE_SYSTEM:
39 seq_printf(m, "system"); 39 seq_puts(m, "system");
40 if (p->subtype == SYSTEM_TYPE_TTY) 40 if (p->subtype == SYSTEM_TYPE_TTY)
41 seq_printf(m, ":/dev/tty"); 41 seq_puts(m, ":/dev/tty");
42 else if (p->subtype == SYSTEM_TYPE_SYSCONS) 42 else if (p->subtype == SYSTEM_TYPE_SYSCONS)
43 seq_printf(m, ":console"); 43 seq_puts(m, ":console");
44 else if (p->subtype == SYSTEM_TYPE_CONSOLE) 44 else if (p->subtype == SYSTEM_TYPE_CONSOLE)
45 seq_printf(m, ":vtmaster"); 45 seq_puts(m, ":vtmaster");
46 break; 46 break;
47 case TTY_DRIVER_TYPE_CONSOLE: 47 case TTY_DRIVER_TYPE_CONSOLE:
48 seq_printf(m, "console"); 48 seq_puts(m, "console");
49 break; 49 break;
50 case TTY_DRIVER_TYPE_SERIAL: 50 case TTY_DRIVER_TYPE_SERIAL:
51 seq_printf(m, "serial"); 51 seq_puts(m, "serial");
52 break; 52 break;
53 case TTY_DRIVER_TYPE_PTY: 53 case TTY_DRIVER_TYPE_PTY:
54 if (p->subtype == PTY_TYPE_MASTER) 54 if (p->subtype == PTY_TYPE_MASTER)
55 seq_printf(m, "pty:master"); 55 seq_puts(m, "pty:master");
56 else if (p->subtype == PTY_TYPE_SLAVE) 56 else if (p->subtype == PTY_TYPE_SLAVE)
57 seq_printf(m, "pty:slave"); 57 seq_puts(m, "pty:slave");
58 else 58 else
59 seq_printf(m, "pty"); 59 seq_puts(m, "pty");
60 break; 60 break;
61 default: 61 default:
62 seq_printf(m, "type:%d.%d", p->type, p->subtype); 62 seq_printf(m, "type:%d.%d", p->type, p->subtype);
@@ -74,19 +74,19 @@ static int show_tty_driver(struct seq_file *m, void *v)
74 /* pseudo-drivers first */ 74 /* pseudo-drivers first */
75 seq_printf(m, "%-20s /dev/%-8s ", "/dev/tty", "tty"); 75 seq_printf(m, "%-20s /dev/%-8s ", "/dev/tty", "tty");
76 seq_printf(m, "%3d %7d ", TTYAUX_MAJOR, 0); 76 seq_printf(m, "%3d %7d ", TTYAUX_MAJOR, 0);
77 seq_printf(m, "system:/dev/tty\n"); 77 seq_puts(m, "system:/dev/tty\n");
78 seq_printf(m, "%-20s /dev/%-8s ", "/dev/console", "console"); 78 seq_printf(m, "%-20s /dev/%-8s ", "/dev/console", "console");
79 seq_printf(m, "%3d %7d ", TTYAUX_MAJOR, 1); 79 seq_printf(m, "%3d %7d ", TTYAUX_MAJOR, 1);
80 seq_printf(m, "system:console\n"); 80 seq_puts(m, "system:console\n");
81#ifdef CONFIG_UNIX98_PTYS 81#ifdef CONFIG_UNIX98_PTYS
82 seq_printf(m, "%-20s /dev/%-8s ", "/dev/ptmx", "ptmx"); 82 seq_printf(m, "%-20s /dev/%-8s ", "/dev/ptmx", "ptmx");
83 seq_printf(m, "%3d %7d ", TTYAUX_MAJOR, 2); 83 seq_printf(m, "%3d %7d ", TTYAUX_MAJOR, 2);
84 seq_printf(m, "system\n"); 84 seq_puts(m, "system\n");
85#endif 85#endif
86#ifdef CONFIG_VT 86#ifdef CONFIG_VT
87 seq_printf(m, "%-20s /dev/%-8s ", "/dev/vc/0", "vc/0"); 87 seq_printf(m, "%-20s /dev/%-8s ", "/dev/vc/0", "vc/0");
88 seq_printf(m, "%3d %7d ", TTY_MAJOR, 0); 88 seq_printf(m, "%3d %7d ", TTY_MAJOR, 0);
89 seq_printf(m, "system:vtmaster\n"); 89 seq_puts(m, "system:vtmaster\n");
90#endif 90#endif
91 } 91 }
92 92
diff --git a/fs/proc/softirqs.c b/fs/proc/softirqs.c
index 37994737c98..62604be9f58 100644
--- a/fs/proc/softirqs.c
+++ b/fs/proc/softirqs.c
@@ -10,16 +10,16 @@ static int show_softirqs(struct seq_file *p, void *v)
10{ 10{
11 int i, j; 11 int i, j;
12 12
13 seq_printf(p, " "); 13 seq_puts(p, " ");
14 for_each_possible_cpu(i) 14 for_each_possible_cpu(i)
15 seq_printf(p, "CPU%-8d", i); 15 seq_printf(p, "CPU%-8d", i);
16 seq_printf(p, "\n"); 16 seq_putc(p, '\n');
17 17
18 for (i = 0; i < NR_SOFTIRQS; i++) { 18 for (i = 0; i < NR_SOFTIRQS; i++) {
19 seq_printf(p, "%12s:", softirq_to_name[i]); 19 seq_printf(p, "%12s:", softirq_to_name[i]);
20 for_each_possible_cpu(j) 20 for_each_possible_cpu(j)
21 seq_printf(p, " %10u", kstat_softirqs_cpu(i, j)); 21 seq_printf(p, " %10u", kstat_softirqs_cpu(i, j));
22 seq_printf(p, "\n"); 22 seq_putc(p, '\n');
23 } 23 }
24 return 0; 24 return 0;
25} 25}
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index e15a19c93ba..1cffa2b8a2f 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -126,7 +126,7 @@ static int show_stat(struct seq_file *p, void *v)
126 126
127 for (i = 0; i < NR_SOFTIRQS; i++) 127 for (i = 0; i < NR_SOFTIRQS; i++)
128 seq_printf(p, " %u", per_softirq_sums[i]); 128 seq_printf(p, " %u", per_softirq_sums[i]);
129 seq_printf(p, "\n"); 129 seq_putc(p, '\n');
130 130
131 return 0; 131 return 0;
132} 132}
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index da6b01d70f0..60b914860f8 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -66,8 +66,9 @@ unsigned long task_vsize(struct mm_struct *mm)
66 return PAGE_SIZE * mm->total_vm; 66 return PAGE_SIZE * mm->total_vm;
67} 67}
68 68
69int task_statm(struct mm_struct *mm, int *shared, int *text, 69unsigned long task_statm(struct mm_struct *mm,
70 int *data, int *resident) 70 unsigned long *shared, unsigned long *text,
71 unsigned long *data, unsigned long *resident)
71{ 72{
72 *shared = get_mm_counter(mm, MM_FILEPAGES); 73 *shared = get_mm_counter(mm, MM_FILEPAGES);
73 *text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) 74 *text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK))
@@ -417,7 +418,8 @@ static int show_smap(struct seq_file *m, void *v)
417 "Anonymous: %8lu kB\n" 418 "Anonymous: %8lu kB\n"
418 "Swap: %8lu kB\n" 419 "Swap: %8lu kB\n"
419 "KernelPageSize: %8lu kB\n" 420 "KernelPageSize: %8lu kB\n"
420 "MMUPageSize: %8lu kB\n", 421 "MMUPageSize: %8lu kB\n"
422 "Locked: %8lu kB\n",
421 (vma->vm_end - vma->vm_start) >> 10, 423 (vma->vm_end - vma->vm_start) >> 10,
422 mss.resident >> 10, 424 mss.resident >> 10,
423 (unsigned long)(mss.pss >> (10 + PSS_SHIFT)), 425 (unsigned long)(mss.pss >> (10 + PSS_SHIFT)),
@@ -429,7 +431,9 @@ static int show_smap(struct seq_file *m, void *v)
429 mss.anonymous >> 10, 431 mss.anonymous >> 10,
430 mss.swap >> 10, 432 mss.swap >> 10,
431 vma_kernel_pagesize(vma) >> 10, 433 vma_kernel_pagesize(vma) >> 10,
432 vma_mmu_pagesize(vma) >> 10); 434 vma_mmu_pagesize(vma) >> 10,
435 (vma->vm_flags & VM_LOCKED) ?
436 (unsigned long)(mss.pss >> (10 + PSS_SHIFT)) : 0);
433 437
434 if (m->count < m->size) /* vma is copied successfully */ 438 if (m->count < m->size) /* vma is copied successfully */
435 m->version = (vma != get_gate_vma(task)) ? vma->vm_start : 0; 439 m->version = (vma != get_gate_vma(task)) ? vma->vm_start : 0;
@@ -706,6 +710,7 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask,
706 * skip over unmapped regions. 710 * skip over unmapped regions.
707 */ 711 */
708#define PAGEMAP_WALK_SIZE (PMD_SIZE) 712#define PAGEMAP_WALK_SIZE (PMD_SIZE)
713#define PAGEMAP_WALK_MASK (PMD_MASK)
709static ssize_t pagemap_read(struct file *file, char __user *buf, 714static ssize_t pagemap_read(struct file *file, char __user *buf,
710 size_t count, loff_t *ppos) 715 size_t count, loff_t *ppos)
711{ 716{
@@ -776,7 +781,7 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
776 unsigned long end; 781 unsigned long end;
777 782
778 pm.pos = 0; 783 pm.pos = 0;
779 end = start_vaddr + PAGEMAP_WALK_SIZE; 784 end = (start_vaddr + PAGEMAP_WALK_SIZE) & PAGEMAP_WALK_MASK;
780 /* overflow ? */ 785 /* overflow ? */
781 if (end < start_vaddr || end > end_vaddr) 786 if (end < start_vaddr || end > end_vaddr)
782 end = end_vaddr; 787 end = end_vaddr;
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index cb6306e6384..b535d3e5d5f 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -92,13 +92,14 @@ unsigned long task_vsize(struct mm_struct *mm)
92 return vsize; 92 return vsize;
93} 93}
94 94
95int task_statm(struct mm_struct *mm, int *shared, int *text, 95unsigned long task_statm(struct mm_struct *mm,
96 int *data, int *resident) 96 unsigned long *shared, unsigned long *text,
97 unsigned long *data, unsigned long *resident)
97{ 98{
98 struct vm_area_struct *vma; 99 struct vm_area_struct *vma;
99 struct vm_region *region; 100 struct vm_region *region;
100 struct rb_node *p; 101 struct rb_node *p;
101 int size = kobjsize(mm); 102 unsigned long size = kobjsize(mm);
102 103
103 down_read(&mm->mmap_sem); 104 down_read(&mm->mmap_sem);
104 for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) { 105 for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) {
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 2367fb3f70b..74802bc5ded 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -499,7 +499,7 @@ static int __init parse_crash_elf64_headers(void)
499 /* Do some basic Verification. */ 499 /* Do some basic Verification. */
500 if (memcmp(ehdr.e_ident, ELFMAG, SELFMAG) != 0 || 500 if (memcmp(ehdr.e_ident, ELFMAG, SELFMAG) != 0 ||
501 (ehdr.e_type != ET_CORE) || 501 (ehdr.e_type != ET_CORE) ||
502 !vmcore_elf_check_arch(&ehdr) || 502 !vmcore_elf64_check_arch(&ehdr) ||
503 ehdr.e_ident[EI_CLASS] != ELFCLASS64 || 503 ehdr.e_ident[EI_CLASS] != ELFCLASS64 ||
504 ehdr.e_ident[EI_VERSION] != EV_CURRENT || 504 ehdr.e_ident[EI_VERSION] != EV_CURRENT ||
505 ehdr.e_version != EV_CURRENT || 505 ehdr.e_version != EV_CURRENT ||
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index fcada42f1aa..e63b4171d58 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -425,11 +425,18 @@ static struct inode *qnx4_alloc_inode(struct super_block *sb)
425 return &ei->vfs_inode; 425 return &ei->vfs_inode;
426} 426}
427 427
428static void qnx4_destroy_inode(struct inode *inode) 428static void qnx4_i_callback(struct rcu_head *head)
429{ 429{
430 struct inode *inode = container_of(head, struct inode, i_rcu);
431 INIT_LIST_HEAD(&inode->i_dentry);
430 kmem_cache_free(qnx4_inode_cachep, qnx4_i(inode)); 432 kmem_cache_free(qnx4_inode_cachep, qnx4_i(inode));
431} 433}
432 434
435static void qnx4_destroy_inode(struct inode *inode)
436{
437 call_rcu(&inode->i_rcu, qnx4_i_callback);
438}
439
433static void init_once(void *foo) 440static void init_once(void *foo)
434{ 441{
435 struct qnx4_inode_info *ei = (struct qnx4_inode_info *) foo; 442 struct qnx4_inode_info *ei = (struct qnx4_inode_info *) foo;
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 0fed41e6efc..a2a622e079f 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -133,16 +133,20 @@ __cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_data_lock);
133EXPORT_SYMBOL(dq_data_lock); 133EXPORT_SYMBOL(dq_data_lock);
134 134
135void __quota_error(struct super_block *sb, const char *func, 135void __quota_error(struct super_block *sb, const char *func,
136 const char *fmt, ...) 136 const char *fmt, ...)
137{ 137{
138 va_list args;
139
140 if (printk_ratelimit()) { 138 if (printk_ratelimit()) {
139 va_list args;
140 struct va_format vaf;
141
141 va_start(args, fmt); 142 va_start(args, fmt);
142 printk(KERN_ERR "Quota error (device %s): %s: ", 143
143 sb->s_id, func); 144 vaf.fmt = fmt;
144 vprintk(fmt, args); 145 vaf.va = &args;
145 printk("\n"); 146
147 printk(KERN_ERR "Quota error (device %s): %s: %pV\n",
148 sb->s_id, func, &vaf);
149
146 va_end(args); 150 va_end(args);
147 } 151 }
148} 152}
@@ -2185,8 +2189,8 @@ int dquot_resume(struct super_block *sb, int type)
2185} 2189}
2186EXPORT_SYMBOL(dquot_resume); 2190EXPORT_SYMBOL(dquot_resume);
2187 2191
2188int dquot_quota_on_path(struct super_block *sb, int type, int format_id, 2192int dquot_quota_on(struct super_block *sb, int type, int format_id,
2189 struct path *path) 2193 struct path *path)
2190{ 2194{
2191 int error = security_quota_on(path->dentry); 2195 int error = security_quota_on(path->dentry);
2192 if (error) 2196 if (error)
@@ -2200,20 +2204,6 @@ int dquot_quota_on_path(struct super_block *sb, int type, int format_id,
2200 DQUOT_LIMITS_ENABLED); 2204 DQUOT_LIMITS_ENABLED);
2201 return error; 2205 return error;
2202} 2206}
2203EXPORT_SYMBOL(dquot_quota_on_path);
2204
2205int dquot_quota_on(struct super_block *sb, int type, int format_id, char *name)
2206{
2207 struct path path;
2208 int error;
2209
2210 error = kern_path(name, LOOKUP_FOLLOW, &path);
2211 if (!error) {
2212 error = dquot_quota_on_path(sb, type, format_id, &path);
2213 path_put(&path);
2214 }
2215 return error;
2216}
2217EXPORT_SYMBOL(dquot_quota_on); 2207EXPORT_SYMBOL(dquot_quota_on);
2218 2208
2219/* 2209/*
diff --git a/fs/quota/quota.c b/fs/quota/quota.c
index b299961e1ed..b34bdb25490 100644
--- a/fs/quota/quota.c
+++ b/fs/quota/quota.c
@@ -64,18 +64,15 @@ static int quota_sync_all(int type)
64} 64}
65 65
66static int quota_quotaon(struct super_block *sb, int type, int cmd, qid_t id, 66static int quota_quotaon(struct super_block *sb, int type, int cmd, qid_t id,
67 void __user *addr) 67 struct path *path)
68{ 68{
69 char *pathname; 69 if (!sb->s_qcop->quota_on && !sb->s_qcop->quota_on_meta)
70 int ret = -ENOSYS; 70 return -ENOSYS;
71 71 if (sb->s_qcop->quota_on_meta)
72 pathname = getname(addr); 72 return sb->s_qcop->quota_on_meta(sb, type, id);
73 if (IS_ERR(pathname)) 73 if (IS_ERR(path))
74 return PTR_ERR(pathname); 74 return PTR_ERR(path);
75 if (sb->s_qcop->quota_on) 75 return sb->s_qcop->quota_on(sb, type, id, path);
76 ret = sb->s_qcop->quota_on(sb, type, id, pathname);
77 putname(pathname);
78 return ret;
79} 76}
80 77
81static int quota_getfmt(struct super_block *sb, int type, void __user *addr) 78static int quota_getfmt(struct super_block *sb, int type, void __user *addr)
@@ -241,7 +238,7 @@ static int quota_getxquota(struct super_block *sb, int type, qid_t id,
241 238
242/* Copy parameters and call proper function */ 239/* Copy parameters and call proper function */
243static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id, 240static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id,
244 void __user *addr) 241 void __user *addr, struct path *path)
245{ 242{
246 int ret; 243 int ret;
247 244
@@ -256,7 +253,7 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id,
256 253
257 switch (cmd) { 254 switch (cmd) {
258 case Q_QUOTAON: 255 case Q_QUOTAON:
259 return quota_quotaon(sb, type, cmd, id, addr); 256 return quota_quotaon(sb, type, cmd, id, path);
260 case Q_QUOTAOFF: 257 case Q_QUOTAOFF:
261 if (!sb->s_qcop->quota_off) 258 if (!sb->s_qcop->quota_off)
262 return -ENOSYS; 259 return -ENOSYS;
@@ -335,6 +332,7 @@ SYSCALL_DEFINE4(quotactl, unsigned int, cmd, const char __user *, special,
335{ 332{
336 uint cmds, type; 333 uint cmds, type;
337 struct super_block *sb = NULL; 334 struct super_block *sb = NULL;
335 struct path path, *pathp = NULL;
338 int ret; 336 int ret;
339 337
340 cmds = cmd >> SUBCMDSHIFT; 338 cmds = cmd >> SUBCMDSHIFT;
@@ -351,12 +349,27 @@ SYSCALL_DEFINE4(quotactl, unsigned int, cmd, const char __user *, special,
351 return -ENODEV; 349 return -ENODEV;
352 } 350 }
353 351
352 /*
353 * Path for quotaon has to be resolved before grabbing superblock
354 * because that gets s_umount sem which is also possibly needed by path
355 * resolution (think about autofs) and thus deadlocks could arise.
356 */
357 if (cmds == Q_QUOTAON) {
358 ret = user_path_at(AT_FDCWD, addr, LOOKUP_FOLLOW, &path);
359 if (ret)
360 pathp = ERR_PTR(ret);
361 else
362 pathp = &path;
363 }
364
354 sb = quotactl_block(special); 365 sb = quotactl_block(special);
355 if (IS_ERR(sb)) 366 if (IS_ERR(sb))
356 return PTR_ERR(sb); 367 return PTR_ERR(sb);
357 368
358 ret = do_quotactl(sb, type, cmds, id, addr); 369 ret = do_quotactl(sb, type, cmds, id, addr, pathp);
359 370
360 drop_super(sb); 371 drop_super(sb);
372 if (pathp && !IS_ERR(pathp))
373 path_put(pathp);
361 return ret; 374 return ret;
362} 375}
diff --git a/fs/quota/quota_tree.c b/fs/quota/quota_tree.c
index 9e48874eabc..e41c1becf09 100644
--- a/fs/quota/quota_tree.c
+++ b/fs/quota/quota_tree.c
@@ -468,8 +468,8 @@ static int remove_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot,
468 return -ENOMEM; 468 return -ENOMEM;
469 ret = read_blk(info, *blk, buf); 469 ret = read_blk(info, *blk, buf);
470 if (ret < 0) { 470 if (ret < 0) {
471 quota_error(dquot->dq_sb, "Can't read quota data " 471 quota_error(dquot->dq_sb, "Can't read quota data block %u",
472 "block %u", blk); 472 *blk);
473 goto out_buf; 473 goto out_buf;
474 } 474 }
475 newblk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]); 475 newblk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]);
@@ -493,8 +493,9 @@ static int remove_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot,
493 } else { 493 } else {
494 ret = write_blk(info, *blk, buf); 494 ret = write_blk(info, *blk, buf);
495 if (ret < 0) 495 if (ret < 0)
496 quota_error(dquot->dq_sb, "Can't write quota " 496 quota_error(dquot->dq_sb,
497 "tree block %u", blk); 497 "Can't write quota tree block %u",
498 *blk);
498 } 499 }
499 } 500 }
500out_buf: 501out_buf:
diff --git a/fs/read_write.c b/fs/read_write.c
index 431a0ed610c..5520f8ad550 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -9,7 +9,6 @@
9#include <linux/fcntl.h> 9#include <linux/fcntl.h>
10#include <linux/file.h> 10#include <linux/file.h>
11#include <linux/uio.h> 11#include <linux/uio.h>
12#include <linux/smp_lock.h>
13#include <linux/fsnotify.h> 12#include <linux/fsnotify.h>
14#include <linux/security.h> 13#include <linux/security.h>
15#include <linux/module.h> 14#include <linux/module.h>
@@ -31,18 +30,9 @@ const struct file_operations generic_ro_fops = {
31 30
32EXPORT_SYMBOL(generic_ro_fops); 31EXPORT_SYMBOL(generic_ro_fops);
33 32
34static int 33static inline int unsigned_offsets(struct file *file)
35__negative_fpos_check(struct file *file, loff_t pos, size_t count)
36{ 34{
37 /* 35 return file->f_mode & FMODE_UNSIGNED_OFFSET;
38 * pos or pos+count is negative here, check overflow.
39 * too big "count" will be caught in rw_verify_area().
40 */
41 if ((pos < 0) && (pos + count < pos))
42 return -EOVERFLOW;
43 if (file->f_mode & FMODE_UNSIGNED_OFFSET)
44 return 0;
45 return -EINVAL;
46} 36}
47 37
48/** 38/**
@@ -76,7 +66,7 @@ generic_file_llseek_unlocked(struct file *file, loff_t offset, int origin)
76 break; 66 break;
77 } 67 }
78 68
79 if (offset < 0 && __negative_fpos_check(file, offset, 0)) 69 if (offset < 0 && !unsigned_offsets(file))
80 return -EINVAL; 70 return -EINVAL;
81 if (offset > inode->i_sb->s_maxbytes) 71 if (offset > inode->i_sb->s_maxbytes)
82 return -EINVAL; 72 return -EINVAL;
@@ -153,7 +143,7 @@ loff_t default_llseek(struct file *file, loff_t offset, int origin)
153 offset += file->f_pos; 143 offset += file->f_pos;
154 } 144 }
155 retval = -EINVAL; 145 retval = -EINVAL;
156 if (offset >= 0 || !__negative_fpos_check(file, offset, 0)) { 146 if (offset >= 0 || unsigned_offsets(file)) {
157 if (offset != file->f_pos) { 147 if (offset != file->f_pos) {
158 file->f_pos = offset; 148 file->f_pos = offset;
159 file->f_version = 0; 149 file->f_version = 0;
@@ -253,9 +243,13 @@ int rw_verify_area(int read_write, struct file *file, loff_t *ppos, size_t count
253 if (unlikely((ssize_t) count < 0)) 243 if (unlikely((ssize_t) count < 0))
254 return retval; 244 return retval;
255 pos = *ppos; 245 pos = *ppos;
256 if (unlikely((pos < 0) || (loff_t) (pos + count) < 0)) { 246 if (unlikely(pos < 0)) {
257 retval = __negative_fpos_check(file, pos, count); 247 if (!unsigned_offsets(file))
258 if (retval) 248 return retval;
249 if (count >= -pos) /* both values are in 0..LLONG_MAX */
250 return -EOVERFLOW;
251 } else if (unlikely((loff_t) (pos + count) < 0)) {
252 if (!unsigned_offsets(file))
259 return retval; 253 return retval;
260 } 254 }
261 255
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 41656d40dc5..0bae036831e 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -8,7 +8,6 @@
8#include <linux/reiserfs_acl.h> 8#include <linux/reiserfs_acl.h>
9#include <linux/reiserfs_xattr.h> 9#include <linux/reiserfs_xattr.h>
10#include <linux/exportfs.h> 10#include <linux/exportfs.h>
11#include <linux/smp_lock.h>
12#include <linux/pagemap.h> 11#include <linux/pagemap.h>
13#include <linux/highmem.h> 12#include <linux/highmem.h>
14#include <linux/slab.h> 13#include <linux/slab.h>
diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c
index adf22b485ce..79265fdc317 100644
--- a/fs/reiserfs/ioctl.c
+++ b/fs/reiserfs/ioctl.c
@@ -9,7 +9,6 @@
9#include <linux/time.h> 9#include <linux/time.h>
10#include <asm/uaccess.h> 10#include <asm/uaccess.h>
11#include <linux/pagemap.h> 11#include <linux/pagemap.h>
12#include <linux/smp_lock.h>
13#include <linux/compat.h> 12#include <linux/compat.h>
14 13
15/* 14/*
@@ -184,12 +183,11 @@ int reiserfs_unpack(struct inode *inode, struct file *filp)
184 return 0; 183 return 0;
185 } 184 }
186 185
187 /* we need to make sure nobody is changing the file size beneath
188 ** us
189 */
190 reiserfs_mutex_lock_safe(&inode->i_mutex, inode->i_sb);
191 depth = reiserfs_write_lock_once(inode->i_sb); 186 depth = reiserfs_write_lock_once(inode->i_sb);
192 187
188 /* we need to make sure nobody is changing the file size beneath us */
189 reiserfs_mutex_lock_safe(&inode->i_mutex, inode->i_sb);
190
193 write_from = inode->i_size & (blocksize - 1); 191 write_from = inode->i_size & (blocksize - 1);
194 /* if we are on a block boundary, we are already unpacked. */ 192 /* if we are on a block boundary, we are already unpacked. */
195 if (write_from == 0) { 193 if (write_from == 0) {
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index 076c8b19468..3eea859e699 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -43,7 +43,6 @@
43#include <linux/fcntl.h> 43#include <linux/fcntl.h>
44#include <linux/stat.h> 44#include <linux/stat.h>
45#include <linux/string.h> 45#include <linux/string.h>
46#include <linux/smp_lock.h>
47#include <linux/buffer_head.h> 46#include <linux/buffer_head.h>
48#include <linux/workqueue.h> 47#include <linux/workqueue.h>
49#include <linux/writeback.h> 48#include <linux/writeback.h>
@@ -2552,8 +2551,6 @@ static int release_journal_dev(struct super_block *super,
2552 result = 0; 2551 result = 0;
2553 2552
2554 if (journal->j_dev_bd != NULL) { 2553 if (journal->j_dev_bd != NULL) {
2555 if (journal->j_dev_bd->bd_dev != super->s_dev)
2556 bd_release(journal->j_dev_bd);
2557 result = blkdev_put(journal->j_dev_bd, journal->j_dev_mode); 2554 result = blkdev_put(journal->j_dev_bd, journal->j_dev_mode);
2558 journal->j_dev_bd = NULL; 2555 journal->j_dev_bd = NULL;
2559 } 2556 }
@@ -2571,7 +2568,7 @@ static int journal_init_dev(struct super_block *super,
2571{ 2568{
2572 int result; 2569 int result;
2573 dev_t jdev; 2570 dev_t jdev;
2574 fmode_t blkdev_mode = FMODE_READ | FMODE_WRITE; 2571 fmode_t blkdev_mode = FMODE_READ | FMODE_WRITE | FMODE_EXCL;
2575 char b[BDEVNAME_SIZE]; 2572 char b[BDEVNAME_SIZE];
2576 2573
2577 result = 0; 2574 result = 0;
@@ -2585,7 +2582,10 @@ static int journal_init_dev(struct super_block *super,
2585 2582
2586 /* there is no "jdev" option and journal is on separate device */ 2583 /* there is no "jdev" option and journal is on separate device */
2587 if ((!jdev_name || !jdev_name[0])) { 2584 if ((!jdev_name || !jdev_name[0])) {
2588 journal->j_dev_bd = open_by_devnum(jdev, blkdev_mode); 2585 if (jdev == super->s_dev)
2586 blkdev_mode &= ~FMODE_EXCL;
2587 journal->j_dev_bd = blkdev_get_by_dev(jdev, blkdev_mode,
2588 journal);
2589 journal->j_dev_mode = blkdev_mode; 2589 journal->j_dev_mode = blkdev_mode;
2590 if (IS_ERR(journal->j_dev_bd)) { 2590 if (IS_ERR(journal->j_dev_bd)) {
2591 result = PTR_ERR(journal->j_dev_bd); 2591 result = PTR_ERR(journal->j_dev_bd);
@@ -2594,22 +2594,14 @@ static int journal_init_dev(struct super_block *super,
2594 "cannot init journal device '%s': %i", 2594 "cannot init journal device '%s': %i",
2595 __bdevname(jdev, b), result); 2595 __bdevname(jdev, b), result);
2596 return result; 2596 return result;
2597 } else if (jdev != super->s_dev) { 2597 } else if (jdev != super->s_dev)
2598 result = bd_claim(journal->j_dev_bd, journal);
2599 if (result) {
2600 blkdev_put(journal->j_dev_bd, blkdev_mode);
2601 return result;
2602 }
2603
2604 set_blocksize(journal->j_dev_bd, super->s_blocksize); 2598 set_blocksize(journal->j_dev_bd, super->s_blocksize);
2605 }
2606 2599
2607 return 0; 2600 return 0;
2608 } 2601 }
2609 2602
2610 journal->j_dev_mode = blkdev_mode; 2603 journal->j_dev_mode = blkdev_mode;
2611 journal->j_dev_bd = open_bdev_exclusive(jdev_name, 2604 journal->j_dev_bd = blkdev_get_by_path(jdev_name, blkdev_mode, journal);
2612 blkdev_mode, journal);
2613 if (IS_ERR(journal->j_dev_bd)) { 2605 if (IS_ERR(journal->j_dev_bd)) {
2614 result = PTR_ERR(journal->j_dev_bd); 2606 result = PTR_ERR(journal->j_dev_bd);
2615 journal->j_dev_bd = NULL; 2607 journal->j_dev_bd = NULL;
diff --git a/fs/reiserfs/prints.c b/fs/reiserfs/prints.c
index adbc6f53851..45de98b5946 100644
--- a/fs/reiserfs/prints.c
+++ b/fs/reiserfs/prints.c
@@ -586,13 +586,13 @@ void print_block(struct buffer_head *bh, ...) //int print_mode, int first, int l
586 va_list args; 586 va_list args;
587 int mode, first, last; 587 int mode, first, last;
588 588
589 va_start(args, bh);
590
591 if (!bh) { 589 if (!bh) {
592 printk("print_block: buffer is NULL\n"); 590 printk("print_block: buffer is NULL\n");
593 return; 591 return;
594 } 592 }
595 593
594 va_start(args, bh);
595
596 mode = va_arg(args, int); 596 mode = va_arg(args, int);
597 first = va_arg(args, int); 597 first = va_arg(args, int);
598 last = va_arg(args, int); 598 last = va_arg(args, int);
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 3bf7a6457f4..0aab04f4682 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -28,7 +28,6 @@
28#include <linux/mount.h> 28#include <linux/mount.h>
29#include <linux/namei.h> 29#include <linux/namei.h>
30#include <linux/crc32.h> 30#include <linux/crc32.h>
31#include <linux/smp_lock.h>
32 31
33struct file_system_type reiserfs_fs_type; 32struct file_system_type reiserfs_fs_type;
34 33
@@ -530,11 +529,18 @@ static struct inode *reiserfs_alloc_inode(struct super_block *sb)
530 return &ei->vfs_inode; 529 return &ei->vfs_inode;
531} 530}
532 531
533static void reiserfs_destroy_inode(struct inode *inode) 532static void reiserfs_i_callback(struct rcu_head *head)
534{ 533{
534 struct inode *inode = container_of(head, struct inode, i_rcu);
535 INIT_LIST_HEAD(&inode->i_dentry);
535 kmem_cache_free(reiserfs_inode_cachep, REISERFS_I(inode)); 536 kmem_cache_free(reiserfs_inode_cachep, REISERFS_I(inode));
536} 537}
537 538
539static void reiserfs_destroy_inode(struct inode *inode)
540{
541 call_rcu(&inode->i_rcu, reiserfs_i_callback);
542}
543
538static void init_once(void *foo) 544static void init_once(void *foo)
539{ 545{
540 struct reiserfs_inode_info *ei = (struct reiserfs_inode_info *)foo; 546 struct reiserfs_inode_info *ei = (struct reiserfs_inode_info *)foo;
@@ -626,7 +632,7 @@ static int reiserfs_acquire_dquot(struct dquot *);
626static int reiserfs_release_dquot(struct dquot *); 632static int reiserfs_release_dquot(struct dquot *);
627static int reiserfs_mark_dquot_dirty(struct dquot *); 633static int reiserfs_mark_dquot_dirty(struct dquot *);
628static int reiserfs_write_info(struct super_block *, int); 634static int reiserfs_write_info(struct super_block *, int);
629static int reiserfs_quota_on(struct super_block *, int, int, char *); 635static int reiserfs_quota_on(struct super_block *, int, int, struct path *);
630 636
631static const struct dquot_operations reiserfs_quota_operations = { 637static const struct dquot_operations reiserfs_quota_operations = {
632 .write_dquot = reiserfs_write_dquot, 638 .write_dquot = reiserfs_write_dquot,
@@ -2042,25 +2048,21 @@ static int reiserfs_quota_on_mount(struct super_block *sb, int type)
2042 * Standard function to be called on quota_on 2048 * Standard function to be called on quota_on
2043 */ 2049 */
2044static int reiserfs_quota_on(struct super_block *sb, int type, int format_id, 2050static int reiserfs_quota_on(struct super_block *sb, int type, int format_id,
2045 char *name) 2051 struct path *path)
2046{ 2052{
2047 int err; 2053 int err;
2048 struct path path;
2049 struct inode *inode; 2054 struct inode *inode;
2050 struct reiserfs_transaction_handle th; 2055 struct reiserfs_transaction_handle th;
2051 2056
2052 if (!(REISERFS_SB(sb)->s_mount_opt & (1 << REISERFS_QUOTA))) 2057 if (!(REISERFS_SB(sb)->s_mount_opt & (1 << REISERFS_QUOTA)))
2053 return -EINVAL; 2058 return -EINVAL;
2054 2059
2055 err = kern_path(name, LOOKUP_FOLLOW, &path);
2056 if (err)
2057 return err;
2058 /* Quotafile not on the same filesystem? */ 2060 /* Quotafile not on the same filesystem? */
2059 if (path.mnt->mnt_sb != sb) { 2061 if (path->mnt->mnt_sb != sb) {
2060 err = -EXDEV; 2062 err = -EXDEV;
2061 goto out; 2063 goto out;
2062 } 2064 }
2063 inode = path.dentry->d_inode; 2065 inode = path->dentry->d_inode;
2064 /* We must not pack tails for quota files on reiserfs for quota IO to work */ 2066 /* We must not pack tails for quota files on reiserfs for quota IO to work */
2065 if (!(REISERFS_I(inode)->i_flags & i_nopack_mask)) { 2067 if (!(REISERFS_I(inode)->i_flags & i_nopack_mask)) {
2066 err = reiserfs_unpack(inode, NULL); 2068 err = reiserfs_unpack(inode, NULL);
@@ -2076,7 +2078,7 @@ static int reiserfs_quota_on(struct super_block *sb, int type, int format_id,
2076 /* Journaling quota? */ 2078 /* Journaling quota? */
2077 if (REISERFS_SB(sb)->s_qf_names[type]) { 2079 if (REISERFS_SB(sb)->s_qf_names[type]) {
2078 /* Quotafile not of fs root? */ 2080 /* Quotafile not of fs root? */
2079 if (path.dentry->d_parent != sb->s_root) 2081 if (path->dentry->d_parent != sb->s_root)
2080 reiserfs_warning(sb, "super-6521", 2082 reiserfs_warning(sb, "super-6521",
2081 "Quota file not on filesystem root. " 2083 "Quota file not on filesystem root. "
2082 "Journalled quota will not work."); 2084 "Journalled quota will not work.");
@@ -2095,9 +2097,8 @@ static int reiserfs_quota_on(struct super_block *sb, int type, int format_id,
2095 if (err) 2097 if (err)
2096 goto out; 2098 goto out;
2097 } 2099 }
2098 err = dquot_quota_on_path(sb, type, format_id, &path); 2100 err = dquot_quota_on(sb, type, format_id, path);
2099out: 2101out:
2100 path_put(&path);
2101 return err; 2102 return err;
2102} 2103}
2103 2104
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index 5d04a7828e7..3cfb2e93364 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -870,11 +870,14 @@ out:
870 return err; 870 return err;
871} 871}
872 872
873static int reiserfs_check_acl(struct inode *inode, int mask) 873static int reiserfs_check_acl(struct inode *inode, int mask, unsigned int flags)
874{ 874{
875 struct posix_acl *acl; 875 struct posix_acl *acl;
876 int error = -EAGAIN; /* do regular unix permission checks by default */ 876 int error = -EAGAIN; /* do regular unix permission checks by default */
877 877
878 if (flags & IPERM_FLAG_RCU)
879 return -ECHILD;
880
878 acl = reiserfs_get_acl(inode, ACL_TYPE_ACCESS); 881 acl = reiserfs_get_acl(inode, ACL_TYPE_ACCESS);
879 882
880 if (acl) { 883 if (acl) {
@@ -951,8 +954,10 @@ static int xattr_mount_check(struct super_block *s)
951 return 0; 954 return 0;
952} 955}
953 956
954int reiserfs_permission(struct inode *inode, int mask) 957int reiserfs_permission(struct inode *inode, int mask, unsigned int flags)
955{ 958{
959 if (flags & IPERM_FLAG_RCU)
960 return -ECHILD;
956 /* 961 /*
957 * We don't do permission checks on the internal objects. 962 * We don't do permission checks on the internal objects.
958 * Permissions are determined by the "owning" object. 963 * Permissions are determined by the "owning" object.
@@ -965,13 +970,16 @@ int reiserfs_permission(struct inode *inode, int mask)
965 * Stat data v1 doesn't support ACLs. 970 * Stat data v1 doesn't support ACLs.
966 */ 971 */
967 if (get_inode_sd_version(inode) != STAT_DATA_V1) 972 if (get_inode_sd_version(inode) != STAT_DATA_V1)
968 return generic_permission(inode, mask, reiserfs_check_acl); 973 return generic_permission(inode, mask, flags,
974 reiserfs_check_acl);
969#endif 975#endif
970 return generic_permission(inode, mask, NULL); 976 return generic_permission(inode, mask, flags, NULL);
971} 977}
972 978
973static int xattr_hide_revalidate(struct dentry *dentry, struct nameidata *nd) 979static int xattr_hide_revalidate(struct dentry *dentry, struct nameidata *nd)
974{ 980{
981 if (nd->flags & LOOKUP_RCU)
982 return -ECHILD;
975 return -EPERM; 983 return -EPERM;
976} 984}
977 985
@@ -990,7 +998,7 @@ int reiserfs_lookup_privroot(struct super_block *s)
990 strlen(PRIVROOT_NAME)); 998 strlen(PRIVROOT_NAME));
991 if (!IS_ERR(dentry)) { 999 if (!IS_ERR(dentry)) {
992 REISERFS_SB(s)->priv_root = dentry; 1000 REISERFS_SB(s)->priv_root = dentry;
993 dentry->d_op = &xattr_lookup_poison_ops; 1001 d_set_d_op(dentry, &xattr_lookup_poison_ops);
994 if (dentry->d_inode) 1002 if (dentry->d_inode)
995 dentry->d_inode->i_flags |= S_PRIVATE; 1003 dentry->d_inode->i_flags |= S_PRIVATE;
996 } else 1004 } else
diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c
index 536d697a8a2..90d2fcb67a3 100644
--- a/fs/reiserfs/xattr_acl.c
+++ b/fs/reiserfs/xattr_acl.c
@@ -472,7 +472,9 @@ int reiserfs_acl_chmod(struct inode *inode)
472 struct reiserfs_transaction_handle th; 472 struct reiserfs_transaction_handle th;
473 size_t size = reiserfs_xattr_nblocks(inode, 473 size_t size = reiserfs_xattr_nblocks(inode,
474 reiserfs_acl_size(clone->a_count)); 474 reiserfs_acl_size(clone->a_count));
475 reiserfs_write_lock(inode->i_sb); 475 int depth;
476
477 depth = reiserfs_write_lock_once(inode->i_sb);
476 error = journal_begin(&th, inode->i_sb, size * 2); 478 error = journal_begin(&th, inode->i_sb, size * 2);
477 if (!error) { 479 if (!error) {
478 int error2; 480 int error2;
@@ -482,7 +484,7 @@ int reiserfs_acl_chmod(struct inode *inode)
482 if (error2) 484 if (error2)
483 error = error2; 485 error = error2;
484 } 486 }
485 reiserfs_write_unlock(inode->i_sb); 487 reiserfs_write_unlock_once(inode->i_sb, depth);
486 } 488 }
487 posix_acl_release(clone); 489 posix_acl_release(clone);
488 return error; 490 return error;
diff --git a/fs/romfs/super.c b/fs/romfs/super.c
index 6647f90e55c..2305e3121cb 100644
--- a/fs/romfs/super.c
+++ b/fs/romfs/super.c
@@ -400,11 +400,18 @@ static struct inode *romfs_alloc_inode(struct super_block *sb)
400/* 400/*
401 * return a spent inode to the slab cache 401 * return a spent inode to the slab cache
402 */ 402 */
403static void romfs_destroy_inode(struct inode *inode) 403static void romfs_i_callback(struct rcu_head *head)
404{ 404{
405 struct inode *inode = container_of(head, struct inode, i_rcu);
406 INIT_LIST_HEAD(&inode->i_dentry);
405 kmem_cache_free(romfs_inode_cachep, ROMFS_I(inode)); 407 kmem_cache_free(romfs_inode_cachep, ROMFS_I(inode));
406} 408}
407 409
410static void romfs_destroy_inode(struct inode *inode)
411{
412 call_rcu(&inode->i_rcu, romfs_i_callback);
413}
414
408/* 415/*
409 * get filesystem statistics 416 * get filesystem statistics
410 */ 417 */
diff --git a/fs/select.c b/fs/select.c
index b7b10aa3086..e56560d2b08 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -306,6 +306,8 @@ static int poll_select_copy_remaining(struct timespec *end_time, void __user *p,
306 rts.tv_sec = rts.tv_nsec = 0; 306 rts.tv_sec = rts.tv_nsec = 0;
307 307
308 if (timeval) { 308 if (timeval) {
309 if (sizeof(rtv) > sizeof(rtv.tv_sec) + sizeof(rtv.tv_usec))
310 memset(&rtv, 0, sizeof(rtv));
309 rtv.tv_sec = rts.tv_sec; 311 rtv.tv_sec = rts.tv_sec;
310 rtv.tv_usec = rts.tv_nsec / NSEC_PER_USEC; 312 rtv.tv_usec = rts.tv_nsec / NSEC_PER_USEC;
311 313
diff --git a/fs/splice.c b/fs/splice.c
index 8f1dfaecc8f..50a5d978da1 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -682,19 +682,14 @@ static int pipe_to_sendpage(struct pipe_inode_info *pipe,
682{ 682{
683 struct file *file = sd->u.file; 683 struct file *file = sd->u.file;
684 loff_t pos = sd->pos; 684 loff_t pos = sd->pos;
685 int ret, more; 685 int more;
686
687 ret = buf->ops->confirm(pipe, buf);
688 if (!ret) {
689 more = (sd->flags & SPLICE_F_MORE) || sd->len < sd->total_len;
690 if (file->f_op && file->f_op->sendpage)
691 ret = file->f_op->sendpage(file, buf->page, buf->offset,
692 sd->len, &pos, more);
693 else
694 ret = -EINVAL;
695 }
696 686
697 return ret; 687 if (!likely(file->f_op && file->f_op->sendpage))
688 return -EINVAL;
689
690 more = (sd->flags & SPLICE_F_MORE) || sd->len < sd->total_len;
691 return file->f_op->sendpage(file, buf->page, buf->offset,
692 sd->len, &pos, more);
698} 693}
699 694
700/* 695/*
@@ -727,13 +722,6 @@ int pipe_to_file(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
727 void *fsdata; 722 void *fsdata;
728 int ret; 723 int ret;
729 724
730 /*
731 * make sure the data in this buffer is uptodate
732 */
733 ret = buf->ops->confirm(pipe, buf);
734 if (unlikely(ret))
735 return ret;
736
737 offset = sd->pos & ~PAGE_CACHE_MASK; 725 offset = sd->pos & ~PAGE_CACHE_MASK;
738 726
739 this_len = sd->len; 727 this_len = sd->len;
@@ -805,12 +793,17 @@ int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_desc *sd,
805 if (sd->len > sd->total_len) 793 if (sd->len > sd->total_len)
806 sd->len = sd->total_len; 794 sd->len = sd->total_len;
807 795
808 ret = actor(pipe, buf, sd); 796 ret = buf->ops->confirm(pipe, buf);
809 if (ret <= 0) { 797 if (unlikely(ret)) {
810 if (ret == -ENODATA) 798 if (ret == -ENODATA)
811 ret = 0; 799 ret = 0;
812 return ret; 800 return ret;
813 } 801 }
802
803 ret = actor(pipe, buf, sd);
804 if (ret <= 0)
805 return ret;
806
814 buf->offset += ret; 807 buf->offset += ret;
815 buf->len -= ret; 808 buf->len -= ret;
816 809
@@ -1044,10 +1037,6 @@ static int write_pipe_buf(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
1044 int ret; 1037 int ret;
1045 void *data; 1038 void *data;
1046 1039
1047 ret = buf->ops->confirm(pipe, buf);
1048 if (ret)
1049 return ret;
1050
1051 data = buf->ops->map(pipe, buf, 0); 1040 data = buf->ops->map(pipe, buf, 0);
1052 ret = kernel_write(sd->u.file, data + buf->offset, sd->len, sd->pos); 1041 ret = kernel_write(sd->u.file, data + buf->offset, sd->len, sd->pos);
1053 buf->ops->unmap(pipe, buf, data); 1042 buf->ops->unmap(pipe, buf, data);
@@ -1311,18 +1300,6 @@ long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
1311static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe, 1300static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
1312 struct pipe_inode_info *opipe, 1301 struct pipe_inode_info *opipe,
1313 size_t len, unsigned int flags); 1302 size_t len, unsigned int flags);
1314/*
1315 * After the inode slimming patch, i_pipe/i_bdev/i_cdev share the same
1316 * location, so checking ->i_pipe is not enough to verify that this is a
1317 * pipe.
1318 */
1319static inline struct pipe_inode_info *pipe_info(struct inode *inode)
1320{
1321 if (S_ISFIFO(inode->i_mode))
1322 return inode->i_pipe;
1323
1324 return NULL;
1325}
1326 1303
1327/* 1304/*
1328 * Determine where to splice to/from. 1305 * Determine where to splice to/from.
@@ -1336,8 +1313,8 @@ static long do_splice(struct file *in, loff_t __user *off_in,
1336 loff_t offset, *off; 1313 loff_t offset, *off;
1337 long ret; 1314 long ret;
1338 1315
1339 ipipe = pipe_info(in->f_path.dentry->d_inode); 1316 ipipe = get_pipe_info(in);
1340 opipe = pipe_info(out->f_path.dentry->d_inode); 1317 opipe = get_pipe_info(out);
1341 1318
1342 if (ipipe && opipe) { 1319 if (ipipe && opipe) {
1343 if (off_in || off_out) 1320 if (off_in || off_out)
@@ -1507,10 +1484,6 @@ static int pipe_to_user(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
1507 char *src; 1484 char *src;
1508 int ret; 1485 int ret;
1509 1486
1510 ret = buf->ops->confirm(pipe, buf);
1511 if (unlikely(ret))
1512 return ret;
1513
1514 /* 1487 /*
1515 * See if we can use the atomic maps, by prefaulting in the 1488 * See if we can use the atomic maps, by prefaulting in the
1516 * pages and doing an atomic copy 1489 * pages and doing an atomic copy
@@ -1555,7 +1528,7 @@ static long vmsplice_to_user(struct file *file, const struct iovec __user *iov,
1555 int error; 1528 int error;
1556 long ret; 1529 long ret;
1557 1530
1558 pipe = pipe_info(file->f_path.dentry->d_inode); 1531 pipe = get_pipe_info(file);
1559 if (!pipe) 1532 if (!pipe)
1560 return -EBADF; 1533 return -EBADF;
1561 1534
@@ -1642,7 +1615,7 @@ static long vmsplice_to_pipe(struct file *file, const struct iovec __user *iov,
1642 }; 1615 };
1643 long ret; 1616 long ret;
1644 1617
1645 pipe = pipe_info(file->f_path.dentry->d_inode); 1618 pipe = get_pipe_info(file);
1646 if (!pipe) 1619 if (!pipe)
1647 return -EBADF; 1620 return -EBADF;
1648 1621
@@ -2022,8 +1995,8 @@ static int link_pipe(struct pipe_inode_info *ipipe,
2022static long do_tee(struct file *in, struct file *out, size_t len, 1995static long do_tee(struct file *in, struct file *out, size_t len,
2023 unsigned int flags) 1996 unsigned int flags)
2024{ 1997{
2025 struct pipe_inode_info *ipipe = pipe_info(in->f_path.dentry->d_inode); 1998 struct pipe_inode_info *ipipe = get_pipe_info(in);
2026 struct pipe_inode_info *opipe = pipe_info(out->f_path.dentry->d_inode); 1999 struct pipe_inode_info *opipe = get_pipe_info(out);
2027 int ret = -EINVAL; 2000 int ret = -EINVAL;
2028 2001
2029 /* 2002 /*
diff --git a/fs/squashfs/Kconfig b/fs/squashfs/Kconfig
index e5f63da64d0..aa68a8a3151 100644
--- a/fs/squashfs/Kconfig
+++ b/fs/squashfs/Kconfig
@@ -29,7 +29,6 @@ config SQUASHFS
29config SQUASHFS_XATTR 29config SQUASHFS_XATTR
30 bool "Squashfs XATTR support" 30 bool "Squashfs XATTR support"
31 depends on SQUASHFS 31 depends on SQUASHFS
32 default n
33 help 32 help
34 Saying Y here includes support for extended attributes (xattrs). 33 Saying Y here includes support for extended attributes (xattrs).
35 Xattrs are name:value pairs associated with inodes by 34 Xattrs are name:value pairs associated with inodes by
@@ -40,7 +39,6 @@ config SQUASHFS_XATTR
40config SQUASHFS_LZO 39config SQUASHFS_LZO
41 bool "Include support for LZO compressed file systems" 40 bool "Include support for LZO compressed file systems"
42 depends on SQUASHFS 41 depends on SQUASHFS
43 default n
44 select LZO_DECOMPRESS 42 select LZO_DECOMPRESS
45 help 43 help
46 Saying Y here includes support for reading Squashfs file systems 44 Saying Y here includes support for reading Squashfs file systems
@@ -53,10 +51,24 @@ config SQUASHFS_LZO
53 51
54 If unsure, say N. 52 If unsure, say N.
55 53
54config SQUASHFS_XZ
55 bool "Include support for XZ compressed file systems"
56 depends on SQUASHFS
57 select XZ_DEC
58 help
59 Saying Y here includes support for reading Squashfs file systems
60 compressed with XZ compression. XZ gives better compression than
61 the default zlib compression, at the expense of greater CPU and
62 memory overhead.
63
64 XZ is not the standard compression used in Squashfs and so most
65 file systems will be readable without selecting this option.
66
67 If unsure, say N.
68
56config SQUASHFS_EMBEDDED 69config SQUASHFS_EMBEDDED
57 bool "Additional option for memory-constrained systems" 70 bool "Additional option for memory-constrained systems"
58 depends on SQUASHFS 71 depends on SQUASHFS
59 default n
60 help 72 help
61 Saying Y here allows you to specify cache size. 73 Saying Y here allows you to specify cache size.
62 74
diff --git a/fs/squashfs/Makefile b/fs/squashfs/Makefile
index 7672bac8d32..cecf2bea07a 100644
--- a/fs/squashfs/Makefile
+++ b/fs/squashfs/Makefile
@@ -7,3 +7,4 @@ squashfs-y += block.o cache.o dir.o export.o file.o fragment.o id.o inode.o
7squashfs-y += namei.o super.o symlink.o zlib_wrapper.o decompressor.o 7squashfs-y += namei.o super.o symlink.o zlib_wrapper.o decompressor.o
8squashfs-$(CONFIG_SQUASHFS_XATTR) += xattr.o xattr_id.o 8squashfs-$(CONFIG_SQUASHFS_XATTR) += xattr.o xattr_id.o
9squashfs-$(CONFIG_SQUASHFS_LZO) += lzo_wrapper.o 9squashfs-$(CONFIG_SQUASHFS_LZO) += lzo_wrapper.o
10squashfs-$(CONFIG_SQUASHFS_XZ) += xz_wrapper.o
diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c
index 653c030eb84..8ab48bc2fa7 100644
--- a/fs/squashfs/block.c
+++ b/fs/squashfs/block.c
@@ -34,7 +34,6 @@
34 34
35#include "squashfs_fs.h" 35#include "squashfs_fs.h"
36#include "squashfs_fs_sb.h" 36#include "squashfs_fs_sb.h"
37#include "squashfs_fs_i.h"
38#include "squashfs.h" 37#include "squashfs.h"
39#include "decompressor.h" 38#include "decompressor.h"
40 39
@@ -64,6 +63,14 @@ static struct buffer_head *get_block_length(struct super_block *sb,
64 *length = (unsigned char) bh->b_data[*offset] | 63 *length = (unsigned char) bh->b_data[*offset] |
65 (unsigned char) bh->b_data[*offset + 1] << 8; 64 (unsigned char) bh->b_data[*offset + 1] << 8;
66 *offset += 2; 65 *offset += 2;
66
67 if (*offset == msblk->devblksize) {
68 put_bh(bh);
69 bh = sb_bread(sb, ++(*cur_index));
70 if (bh == NULL)
71 return NULL;
72 *offset = 0;
73 }
67 } 74 }
68 75
69 return bh; 76 return bh;
diff --git a/fs/squashfs/cache.c b/fs/squashfs/cache.c
index 57314bee905..26b15ae34d6 100644
--- a/fs/squashfs/cache.c
+++ b/fs/squashfs/cache.c
@@ -55,7 +55,6 @@
55 55
56#include "squashfs_fs.h" 56#include "squashfs_fs.h"
57#include "squashfs_fs_sb.h" 57#include "squashfs_fs_sb.h"
58#include "squashfs_fs_i.h"
59#include "squashfs.h" 58#include "squashfs.h"
60 59
61/* 60/*
diff --git a/fs/squashfs/decompressor.c b/fs/squashfs/decompressor.c
index 24af9ce9722..a5940e54c4d 100644
--- a/fs/squashfs/decompressor.c
+++ b/fs/squashfs/decompressor.c
@@ -27,7 +27,6 @@
27 27
28#include "squashfs_fs.h" 28#include "squashfs_fs.h"
29#include "squashfs_fs_sb.h" 29#include "squashfs_fs_sb.h"
30#include "squashfs_fs_i.h"
31#include "decompressor.h" 30#include "decompressor.h"
32#include "squashfs.h" 31#include "squashfs.h"
33 32
@@ -41,23 +40,26 @@ static const struct squashfs_decompressor squashfs_lzma_unsupported_comp_ops = {
41}; 40};
42 41
43#ifndef CONFIG_SQUASHFS_LZO 42#ifndef CONFIG_SQUASHFS_LZO
44static const struct squashfs_decompressor squashfs_lzo_unsupported_comp_ops = { 43static const struct squashfs_decompressor squashfs_lzo_comp_ops = {
45 NULL, NULL, NULL, LZO_COMPRESSION, "lzo", 0 44 NULL, NULL, NULL, LZO_COMPRESSION, "lzo", 0
46}; 45};
47#endif 46#endif
48 47
48#ifndef CONFIG_SQUASHFS_XZ
49static const struct squashfs_decompressor squashfs_xz_comp_ops = {
50 NULL, NULL, NULL, XZ_COMPRESSION, "xz", 0
51};
52#endif
53
49static const struct squashfs_decompressor squashfs_unknown_comp_ops = { 54static const struct squashfs_decompressor squashfs_unknown_comp_ops = {
50 NULL, NULL, NULL, 0, "unknown", 0 55 NULL, NULL, NULL, 0, "unknown", 0
51}; 56};
52 57
53static const struct squashfs_decompressor *decompressor[] = { 58static const struct squashfs_decompressor *decompressor[] = {
54 &squashfs_zlib_comp_ops, 59 &squashfs_zlib_comp_ops,
55 &squashfs_lzma_unsupported_comp_ops,
56#ifdef CONFIG_SQUASHFS_LZO
57 &squashfs_lzo_comp_ops, 60 &squashfs_lzo_comp_ops,
58#else 61 &squashfs_xz_comp_ops,
59 &squashfs_lzo_unsupported_comp_ops, 62 &squashfs_lzma_unsupported_comp_ops,
60#endif
61 &squashfs_unknown_comp_ops 63 &squashfs_unknown_comp_ops
62}; 64};
63 65
diff --git a/fs/squashfs/decompressor.h b/fs/squashfs/decompressor.h
index 7425f80783f..3b305a70f7a 100644
--- a/fs/squashfs/decompressor.h
+++ b/fs/squashfs/decompressor.h
@@ -52,4 +52,13 @@ static inline int squashfs_decompress(struct squashfs_sb_info *msblk,
52 return msblk->decompressor->decompress(msblk, buffer, bh, b, offset, 52 return msblk->decompressor->decompress(msblk, buffer, bh, b, offset,
53 length, srclength, pages); 53 length, srclength, pages);
54} 54}
55
56#ifdef CONFIG_SQUASHFS_XZ
57extern const struct squashfs_decompressor squashfs_xz_comp_ops;
58#endif
59
60#ifdef CONFIG_SQUASHFS_LZO
61extern const struct squashfs_decompressor squashfs_lzo_comp_ops;
62#endif
63
55#endif 64#endif
diff --git a/fs/squashfs/fragment.c b/fs/squashfs/fragment.c
index 7c90bbd6879..7eef571443c 100644
--- a/fs/squashfs/fragment.c
+++ b/fs/squashfs/fragment.c
@@ -39,7 +39,6 @@
39 39
40#include "squashfs_fs.h" 40#include "squashfs_fs.h"
41#include "squashfs_fs_sb.h" 41#include "squashfs_fs_sb.h"
42#include "squashfs_fs_i.h"
43#include "squashfs.h" 42#include "squashfs.h"
44 43
45/* 44/*
diff --git a/fs/squashfs/id.c b/fs/squashfs/id.c
index b7f64bcd2b7..d8f32452638 100644
--- a/fs/squashfs/id.c
+++ b/fs/squashfs/id.c
@@ -37,7 +37,6 @@
37 37
38#include "squashfs_fs.h" 38#include "squashfs_fs.h"
39#include "squashfs_fs_sb.h" 39#include "squashfs_fs_sb.h"
40#include "squashfs_fs_i.h"
41#include "squashfs.h" 40#include "squashfs.h"
42 41
43/* 42/*
diff --git a/fs/squashfs/lzo_wrapper.c b/fs/squashfs/lzo_wrapper.c
index 5d87789bf1c..7da759e34c5 100644
--- a/fs/squashfs/lzo_wrapper.c
+++ b/fs/squashfs/lzo_wrapper.c
@@ -29,7 +29,6 @@
29 29
30#include "squashfs_fs.h" 30#include "squashfs_fs.h"
31#include "squashfs_fs_sb.h" 31#include "squashfs_fs_sb.h"
32#include "squashfs_fs_i.h"
33#include "squashfs.h" 32#include "squashfs.h"
34#include "decompressor.h" 33#include "decompressor.h"
35 34
diff --git a/fs/squashfs/squashfs.h b/fs/squashfs/squashfs.h
index 5d45569d5f7..ba729d80887 100644
--- a/fs/squashfs/squashfs.h
+++ b/fs/squashfs/squashfs.h
@@ -27,11 +27,6 @@
27 27
28#define WARNING(s, args...) pr_warning("SQUASHFS: "s, ## args) 28#define WARNING(s, args...) pr_warning("SQUASHFS: "s, ## args)
29 29
30static inline struct squashfs_inode_info *squashfs_i(struct inode *inode)
31{
32 return list_entry(inode, struct squashfs_inode_info, vfs_inode);
33}
34
35/* block.c */ 30/* block.c */
36extern int squashfs_read_data(struct super_block *, void **, u64, int, u64 *, 31extern int squashfs_read_data(struct super_block *, void **, u64, int, u64 *,
37 int, int); 32 int, int);
@@ -104,6 +99,3 @@ extern const struct xattr_handler *squashfs_xattr_handlers[];
104 99
105/* zlib_wrapper.c */ 100/* zlib_wrapper.c */
106extern const struct squashfs_decompressor squashfs_zlib_comp_ops; 101extern const struct squashfs_decompressor squashfs_zlib_comp_ops;
107
108/* lzo_wrapper.c */
109extern const struct squashfs_decompressor squashfs_lzo_comp_ops;
diff --git a/fs/squashfs/squashfs_fs.h b/fs/squashfs/squashfs_fs.h
index c5137fc9ab1..39533feffd6 100644
--- a/fs/squashfs/squashfs_fs.h
+++ b/fs/squashfs/squashfs_fs.h
@@ -238,6 +238,7 @@ struct meta_index {
238#define ZLIB_COMPRESSION 1 238#define ZLIB_COMPRESSION 1
239#define LZMA_COMPRESSION 2 239#define LZMA_COMPRESSION 2
240#define LZO_COMPRESSION 3 240#define LZO_COMPRESSION 3
241#define XZ_COMPRESSION 4
241 242
242struct squashfs_super_block { 243struct squashfs_super_block {
243 __le32 s_magic; 244 __le32 s_magic;
diff --git a/fs/squashfs/squashfs_fs_i.h b/fs/squashfs/squashfs_fs_i.h
index d3e3a37f28a..359baefc01f 100644
--- a/fs/squashfs/squashfs_fs_i.h
+++ b/fs/squashfs/squashfs_fs_i.h
@@ -45,4 +45,10 @@ struct squashfs_inode_info {
45 }; 45 };
46 struct inode vfs_inode; 46 struct inode vfs_inode;
47}; 47};
48
49
50static inline struct squashfs_inode_info *squashfs_i(struct inode *inode)
51{
52 return list_entry(inode, struct squashfs_inode_info, vfs_inode);
53}
48#endif 54#endif
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index 24de30ba34c..20700b9f2b4 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -440,11 +440,18 @@ static struct inode *squashfs_alloc_inode(struct super_block *sb)
440} 440}
441 441
442 442
443static void squashfs_destroy_inode(struct inode *inode) 443static void squashfs_i_callback(struct rcu_head *head)
444{ 444{
445 struct inode *inode = container_of(head, struct inode, i_rcu);
446 INIT_LIST_HEAD(&inode->i_dentry);
445 kmem_cache_free(squashfs_inode_cachep, squashfs_i(inode)); 447 kmem_cache_free(squashfs_inode_cachep, squashfs_i(inode));
446} 448}
447 449
450static void squashfs_destroy_inode(struct inode *inode)
451{
452 call_rcu(&inode->i_rcu, squashfs_i_callback);
453}
454
448 455
449static struct file_system_type squashfs_fs_type = { 456static struct file_system_type squashfs_fs_type = {
450 .owner = THIS_MODULE, 457 .owner = THIS_MODULE,
diff --git a/fs/squashfs/xattr_id.c b/fs/squashfs/xattr_id.c
index d33be5dd6c3..05385dbe146 100644
--- a/fs/squashfs/xattr_id.c
+++ b/fs/squashfs/xattr_id.c
@@ -32,7 +32,6 @@
32 32
33#include "squashfs_fs.h" 33#include "squashfs_fs.h"
34#include "squashfs_fs_sb.h" 34#include "squashfs_fs_sb.h"
35#include "squashfs_fs_i.h"
36#include "squashfs.h" 35#include "squashfs.h"
37#include "xattr.h" 36#include "xattr.h"
38 37
diff --git a/fs/squashfs/xz_wrapper.c b/fs/squashfs/xz_wrapper.c
new file mode 100644
index 00000000000..c4eb4001825
--- /dev/null
+++ b/fs/squashfs/xz_wrapper.c
@@ -0,0 +1,147 @@
1/*
2 * Squashfs - a compressed read only filesystem for Linux
3 *
4 * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010
5 * Phillip Lougher <phillip@lougher.demon.co.uk>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version 2,
10 * or (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
20 *
21 * xz_wrapper.c
22 */
23
24
25#include <linux/mutex.h>
26#include <linux/buffer_head.h>
27#include <linux/slab.h>
28#include <linux/xz.h>
29
30#include "squashfs_fs.h"
31#include "squashfs_fs_sb.h"
32#include "squashfs_fs_i.h"
33#include "squashfs.h"
34#include "decompressor.h"
35
36struct squashfs_xz {
37 struct xz_dec *state;
38 struct xz_buf buf;
39};
40
41static void *squashfs_xz_init(struct squashfs_sb_info *msblk)
42{
43 int block_size = max_t(int, msblk->block_size, SQUASHFS_METADATA_SIZE);
44
45 struct squashfs_xz *stream = kmalloc(sizeof(*stream), GFP_KERNEL);
46 if (stream == NULL)
47 goto failed;
48
49 stream->state = xz_dec_init(XZ_PREALLOC, block_size);
50 if (stream->state == NULL)
51 goto failed;
52
53 return stream;
54
55failed:
56 ERROR("Failed to allocate xz workspace\n");
57 kfree(stream);
58 return NULL;
59}
60
61
62static void squashfs_xz_free(void *strm)
63{
64 struct squashfs_xz *stream = strm;
65
66 if (stream) {
67 xz_dec_end(stream->state);
68 kfree(stream);
69 }
70}
71
72
73static int squashfs_xz_uncompress(struct squashfs_sb_info *msblk, void **buffer,
74 struct buffer_head **bh, int b, int offset, int length, int srclength,
75 int pages)
76{
77 enum xz_ret xz_err;
78 int avail, total = 0, k = 0, page = 0;
79 struct squashfs_xz *stream = msblk->stream;
80
81 mutex_lock(&msblk->read_data_mutex);
82
83 xz_dec_reset(stream->state);
84 stream->buf.in_pos = 0;
85 stream->buf.in_size = 0;
86 stream->buf.out_pos = 0;
87 stream->buf.out_size = PAGE_CACHE_SIZE;
88 stream->buf.out = buffer[page++];
89
90 do {
91 if (stream->buf.in_pos == stream->buf.in_size && k < b) {
92 avail = min(length, msblk->devblksize - offset);
93 length -= avail;
94 wait_on_buffer(bh[k]);
95 if (!buffer_uptodate(bh[k]))
96 goto release_mutex;
97
98 stream->buf.in = bh[k]->b_data + offset;
99 stream->buf.in_size = avail;
100 stream->buf.in_pos = 0;
101 offset = 0;
102 }
103
104 if (stream->buf.out_pos == stream->buf.out_size
105 && page < pages) {
106 stream->buf.out = buffer[page++];
107 stream->buf.out_pos = 0;
108 total += PAGE_CACHE_SIZE;
109 }
110
111 xz_err = xz_dec_run(stream->state, &stream->buf);
112
113 if (stream->buf.in_pos == stream->buf.in_size && k < b)
114 put_bh(bh[k++]);
115 } while (xz_err == XZ_OK);
116
117 if (xz_err != XZ_STREAM_END) {
118 ERROR("xz_dec_run error, data probably corrupt\n");
119 goto release_mutex;
120 }
121
122 if (k < b) {
123 ERROR("xz_uncompress error, input remaining\n");
124 goto release_mutex;
125 }
126
127 total += stream->buf.out_pos;
128 mutex_unlock(&msblk->read_data_mutex);
129 return total;
130
131release_mutex:
132 mutex_unlock(&msblk->read_data_mutex);
133
134 for (; k < b; k++)
135 put_bh(bh[k]);
136
137 return -EIO;
138}
139
140const struct squashfs_decompressor squashfs_xz_comp_ops = {
141 .init = squashfs_xz_init,
142 .free = squashfs_xz_free,
143 .decompress = squashfs_xz_uncompress,
144 .id = XZ_COMPRESSION,
145 .name = "xz",
146 .supported = 1
147};
diff --git a/fs/squashfs/zlib_wrapper.c b/fs/squashfs/zlib_wrapper.c
index 7a603874e48..4661ae2b1ce 100644
--- a/fs/squashfs/zlib_wrapper.c
+++ b/fs/squashfs/zlib_wrapper.c
@@ -29,7 +29,6 @@
29 29
30#include "squashfs_fs.h" 30#include "squashfs_fs.h"
31#include "squashfs_fs_sb.h" 31#include "squashfs_fs_sb.h"
32#include "squashfs_fs_i.h"
33#include "squashfs.h" 32#include "squashfs.h"
34#include "decompressor.h" 33#include "decompressor.h"
35 34
@@ -66,8 +65,8 @@ static int zlib_uncompress(struct squashfs_sb_info *msblk, void **buffer,
66 struct buffer_head **bh, int b, int offset, int length, int srclength, 65 struct buffer_head **bh, int b, int offset, int length, int srclength,
67 int pages) 66 int pages)
68{ 67{
69 int zlib_err = 0, zlib_init = 0; 68 int zlib_err, zlib_init = 0;
70 int avail, bytes, k = 0, page = 0; 69 int k = 0, page = 0;
71 z_stream *stream = msblk->stream; 70 z_stream *stream = msblk->stream;
72 71
73 mutex_lock(&msblk->read_data_mutex); 72 mutex_lock(&msblk->read_data_mutex);
@@ -75,21 +74,14 @@ static int zlib_uncompress(struct squashfs_sb_info *msblk, void **buffer,
75 stream->avail_out = 0; 74 stream->avail_out = 0;
76 stream->avail_in = 0; 75 stream->avail_in = 0;
77 76
78 bytes = length;
79 do { 77 do {
80 if (stream->avail_in == 0 && k < b) { 78 if (stream->avail_in == 0 && k < b) {
81 avail = min(bytes, msblk->devblksize - offset); 79 int avail = min(length, msblk->devblksize - offset);
82 bytes -= avail; 80 length -= avail;
83 wait_on_buffer(bh[k]); 81 wait_on_buffer(bh[k]);
84 if (!buffer_uptodate(bh[k])) 82 if (!buffer_uptodate(bh[k]))
85 goto release_mutex; 83 goto release_mutex;
86 84
87 if (avail == 0) {
88 offset = 0;
89 put_bh(bh[k++]);
90 continue;
91 }
92
93 stream->next_in = bh[k]->b_data + offset; 85 stream->next_in = bh[k]->b_data + offset;
94 stream->avail_in = avail; 86 stream->avail_in = avail;
95 offset = 0; 87 offset = 0;
@@ -128,6 +120,11 @@ static int zlib_uncompress(struct squashfs_sb_info *msblk, void **buffer,
128 goto release_mutex; 120 goto release_mutex;
129 } 121 }
130 122
123 if (k < b) {
124 ERROR("zlib_uncompress error, data remaining\n");
125 goto release_mutex;
126 }
127
131 length = stream->total_out; 128 length = stream->total_out;
132 mutex_unlock(&msblk->read_data_mutex); 129 mutex_unlock(&msblk->read_data_mutex);
133 return length; 130 return length;
diff --git a/fs/stat.c b/fs/stat.c
index 12e90e21390..d5c61cf2b70 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -75,11 +75,13 @@ int vfs_fstatat(int dfd, const char __user *filename, struct kstat *stat,
75 int error = -EINVAL; 75 int error = -EINVAL;
76 int lookup_flags = 0; 76 int lookup_flags = 0;
77 77
78 if ((flag & ~AT_SYMLINK_NOFOLLOW) != 0) 78 if ((flag & ~(AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT)) != 0)
79 goto out; 79 goto out;
80 80
81 if (!(flag & AT_SYMLINK_NOFOLLOW)) 81 if (!(flag & AT_SYMLINK_NOFOLLOW))
82 lookup_flags |= LOOKUP_FOLLOW; 82 lookup_flags |= LOOKUP_FOLLOW;
83 if (flag & AT_NO_AUTOMOUNT)
84 lookup_flags |= LOOKUP_NO_AUTOMOUNT;
83 85
84 error = user_path_at(dfd, filename, lookup_flags, &path); 86 error = user_path_at(dfd, filename, lookup_flags, &path);
85 if (error) 87 if (error)
diff --git a/fs/super.c b/fs/super.c
index ca696155cd9..74e149efed8 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -30,6 +30,7 @@
30#include <linux/idr.h> 30#include <linux/idr.h>
31#include <linux/mutex.h> 31#include <linux/mutex.h>
32#include <linux/backing-dev.h> 32#include <linux/backing-dev.h>
33#include <linux/rculist_bl.h>
33#include "internal.h" 34#include "internal.h"
34 35
35 36
@@ -71,7 +72,7 @@ static struct super_block *alloc_super(struct file_system_type *type)
71 INIT_LIST_HEAD(&s->s_files); 72 INIT_LIST_HEAD(&s->s_files);
72#endif 73#endif
73 INIT_LIST_HEAD(&s->s_instances); 74 INIT_LIST_HEAD(&s->s_instances);
74 INIT_HLIST_HEAD(&s->s_anon); 75 INIT_HLIST_BL_HEAD(&s->s_anon);
75 INIT_LIST_HEAD(&s->s_inodes); 76 INIT_LIST_HEAD(&s->s_inodes);
76 INIT_LIST_HEAD(&s->s_dentry_lru); 77 INIT_LIST_HEAD(&s->s_dentry_lru);
77 init_rwsem(&s->s_umount); 78 init_rwsem(&s->s_umount);
@@ -766,13 +767,13 @@ struct dentry *mount_bdev(struct file_system_type *fs_type,
766{ 767{
767 struct block_device *bdev; 768 struct block_device *bdev;
768 struct super_block *s; 769 struct super_block *s;
769 fmode_t mode = FMODE_READ; 770 fmode_t mode = FMODE_READ | FMODE_EXCL;
770 int error = 0; 771 int error = 0;
771 772
772 if (!(flags & MS_RDONLY)) 773 if (!(flags & MS_RDONLY))
773 mode |= FMODE_WRITE; 774 mode |= FMODE_WRITE;
774 775
775 bdev = open_bdev_exclusive(dev_name, mode, fs_type); 776 bdev = blkdev_get_by_path(dev_name, mode, fs_type);
776 if (IS_ERR(bdev)) 777 if (IS_ERR(bdev))
777 return ERR_CAST(bdev); 778 return ERR_CAST(bdev);
778 779
@@ -801,13 +802,13 @@ struct dentry *mount_bdev(struct file_system_type *fs_type,
801 802
802 /* 803 /*
803 * s_umount nests inside bd_mutex during 804 * s_umount nests inside bd_mutex during
804 * __invalidate_device(). close_bdev_exclusive() 805 * __invalidate_device(). blkdev_put() acquires
805 * acquires bd_mutex and can't be called under 806 * bd_mutex and can't be called under s_umount. Drop
806 * s_umount. Drop s_umount temporarily. This is safe 807 * s_umount temporarily. This is safe as we're
807 * as we're holding an active reference. 808 * holding an active reference.
808 */ 809 */
809 up_write(&s->s_umount); 810 up_write(&s->s_umount);
810 close_bdev_exclusive(bdev, mode); 811 blkdev_put(bdev, mode);
811 down_write(&s->s_umount); 812 down_write(&s->s_umount);
812 } else { 813 } else {
813 char b[BDEVNAME_SIZE]; 814 char b[BDEVNAME_SIZE];
@@ -831,7 +832,7 @@ struct dentry *mount_bdev(struct file_system_type *fs_type,
831error_s: 832error_s:
832 error = PTR_ERR(s); 833 error = PTR_ERR(s);
833error_bdev: 834error_bdev:
834 close_bdev_exclusive(bdev, mode); 835 blkdev_put(bdev, mode);
835error: 836error:
836 return ERR_PTR(error); 837 return ERR_PTR(error);
837} 838}
@@ -862,7 +863,8 @@ void kill_block_super(struct super_block *sb)
862 bdev->bd_super = NULL; 863 bdev->bd_super = NULL;
863 generic_shutdown_super(sb); 864 generic_shutdown_super(sb);
864 sync_blockdev(bdev); 865 sync_blockdev(bdev);
865 close_bdev_exclusive(bdev, mode); 866 WARN_ON_ONCE(!(mode & FMODE_EXCL));
867 blkdev_put(bdev, mode | FMODE_EXCL);
866} 868}
867 869
868EXPORT_SYMBOL(kill_block_super); 870EXPORT_SYMBOL(kill_block_super);
diff --git a/fs/sysfs/Kconfig b/fs/sysfs/Kconfig
index f4b67588b9d..8c41feacbac 100644
--- a/fs/sysfs/Kconfig
+++ b/fs/sysfs/Kconfig
@@ -1,5 +1,5 @@
1config SYSFS 1config SYSFS
2 bool "sysfs file system support" if EMBEDDED 2 bool "sysfs file system support" if EXPERT
3 default y 3 default y
4 help 4 help
5 The sysfs filesystem is a virtual filesystem that the kernel uses to 5 The sysfs filesystem is a virtual filesystem that the kernel uses to
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index 7e54bac8c4b..ea9120a830d 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -231,7 +231,7 @@ void release_sysfs_dirent(struct sysfs_dirent * sd)
231 goto repeat; 231 goto repeat;
232} 232}
233 233
234static int sysfs_dentry_delete(struct dentry *dentry) 234static int sysfs_dentry_delete(const struct dentry *dentry)
235{ 235{
236 struct sysfs_dirent *sd = dentry->d_fsdata; 236 struct sysfs_dirent *sd = dentry->d_fsdata;
237 return !!(sd->s_flags & SYSFS_FLAG_REMOVED); 237 return !!(sd->s_flags & SYSFS_FLAG_REMOVED);
@@ -239,9 +239,13 @@ static int sysfs_dentry_delete(struct dentry *dentry)
239 239
240static int sysfs_dentry_revalidate(struct dentry *dentry, struct nameidata *nd) 240static int sysfs_dentry_revalidate(struct dentry *dentry, struct nameidata *nd)
241{ 241{
242 struct sysfs_dirent *sd = dentry->d_fsdata; 242 struct sysfs_dirent *sd;
243 int is_dir; 243 int is_dir;
244 244
245 if (nd->flags & LOOKUP_RCU)
246 return -ECHILD;
247
248 sd = dentry->d_fsdata;
245 mutex_lock(&sysfs_mutex); 249 mutex_lock(&sysfs_mutex);
246 250
247 /* The sysfs dirent has been deleted */ 251 /* The sysfs dirent has been deleted */
@@ -701,7 +705,7 @@ static struct dentry * sysfs_lookup(struct inode *dir, struct dentry *dentry,
701 /* instantiate and hash dentry */ 705 /* instantiate and hash dentry */
702 ret = d_find_alias(inode); 706 ret = d_find_alias(inode);
703 if (!ret) { 707 if (!ret) {
704 dentry->d_op = &sysfs_dentry_ops; 708 d_set_d_op(dentry, &sysfs_dentry_ops);
705 dentry->d_fsdata = sysfs_get(sd); 709 dentry->d_fsdata = sysfs_get(sd);
706 d_add(dentry, inode); 710 d_add(dentry, inode);
707 } else { 711 } else {
diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c
index 442f34ff1af..c8769dc222d 100644
--- a/fs/sysfs/group.c
+++ b/fs/sysfs/group.c
@@ -165,10 +165,7 @@ int sysfs_merge_group(struct kobject *kobj,
165 struct attribute *const *attr; 165 struct attribute *const *attr;
166 int i; 166 int i;
167 167
168 if (grp) 168 dir_sd = sysfs_get_dirent(kobj->sd, NULL, grp->name);
169 dir_sd = sysfs_get_dirent(kobj->sd, NULL, grp->name);
170 else
171 dir_sd = sysfs_get(kobj->sd);
172 if (!dir_sd) 169 if (!dir_sd)
173 return -ENOENT; 170 return -ENOENT;
174 171
@@ -195,10 +192,7 @@ void sysfs_unmerge_group(struct kobject *kobj,
195 struct sysfs_dirent *dir_sd; 192 struct sysfs_dirent *dir_sd;
196 struct attribute *const *attr; 193 struct attribute *const *attr;
197 194
198 if (grp) 195 dir_sd = sysfs_get_dirent(kobj->sd, NULL, grp->name);
199 dir_sd = sysfs_get_dirent(kobj->sd, NULL, grp->name);
200 else
201 dir_sd = sysfs_get(kobj->sd);
202 if (dir_sd) { 196 if (dir_sd) {
203 for (attr = grp->attrs; *attr; ++attr) 197 for (attr = grp->attrs; *attr; ++attr)
204 sysfs_hash_and_remove(dir_sd, NULL, (*attr)->name); 198 sysfs_hash_and_remove(dir_sd, NULL, (*attr)->name);
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index cffb1fd8ba3..0a12eb89cd3 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -19,6 +19,7 @@
19#include <linux/errno.h> 19#include <linux/errno.h>
20#include <linux/sched.h> 20#include <linux/sched.h>
21#include <linux/slab.h> 21#include <linux/slab.h>
22#include <linux/sysfs.h>
22#include <linux/xattr.h> 23#include <linux/xattr.h>
23#include <linux/security.h> 24#include <linux/security.h>
24#include "sysfs.h" 25#include "sysfs.h"
@@ -348,13 +349,18 @@ int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const void *ns, const cha
348 return -ENOENT; 349 return -ENOENT;
349} 350}
350 351
351int sysfs_permission(struct inode *inode, int mask) 352int sysfs_permission(struct inode *inode, int mask, unsigned int flags)
352{ 353{
353 struct sysfs_dirent *sd = inode->i_private; 354 struct sysfs_dirent *sd;
355
356 if (flags & IPERM_FLAG_RCU)
357 return -ECHILD;
358
359 sd = inode->i_private;
354 360
355 mutex_lock(&sysfs_mutex); 361 mutex_lock(&sysfs_mutex);
356 sysfs_refresh_inode(sd, inode); 362 sysfs_refresh_inode(sd, inode);
357 mutex_unlock(&sysfs_mutex); 363 mutex_unlock(&sysfs_mutex);
358 364
359 return generic_permission(inode, mask, NULL); 365 return generic_permission(inode, mask, flags, NULL);
360} 366}
diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
index d9be60a2e95..3d28af31d86 100644
--- a/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -9,6 +9,7 @@
9 */ 9 */
10 10
11#include <linux/lockdep.h> 11#include <linux/lockdep.h>
12#include <linux/kobject_ns.h>
12#include <linux/fs.h> 13#include <linux/fs.h>
13 14
14struct sysfs_open_dirent; 15struct sysfs_open_dirent;
@@ -200,7 +201,7 @@ static inline void __sysfs_put(struct sysfs_dirent *sd)
200struct inode *sysfs_get_inode(struct super_block *sb, struct sysfs_dirent *sd); 201struct inode *sysfs_get_inode(struct super_block *sb, struct sysfs_dirent *sd);
201void sysfs_evict_inode(struct inode *inode); 202void sysfs_evict_inode(struct inode *inode);
202int sysfs_sd_setattr(struct sysfs_dirent *sd, struct iattr *iattr); 203int sysfs_sd_setattr(struct sysfs_dirent *sd, struct iattr *iattr);
203int sysfs_permission(struct inode *inode, int mask); 204int sysfs_permission(struct inode *inode, int mask, unsigned int flags);
204int sysfs_setattr(struct dentry *dentry, struct iattr *iattr); 205int sysfs_setattr(struct dentry *dentry, struct iattr *iattr);
205int sysfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat); 206int sysfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat);
206int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value, 207int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value,
diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c
index de44d067b9e..0630eb969a2 100644
--- a/fs/sysv/inode.c
+++ b/fs/sysv/inode.c
@@ -333,11 +333,18 @@ static struct inode *sysv_alloc_inode(struct super_block *sb)
333 return &si->vfs_inode; 333 return &si->vfs_inode;
334} 334}
335 335
336static void sysv_destroy_inode(struct inode *inode) 336static void sysv_i_callback(struct rcu_head *head)
337{ 337{
338 struct inode *inode = container_of(head, struct inode, i_rcu);
339 INIT_LIST_HEAD(&inode->i_dentry);
338 kmem_cache_free(sysv_inode_cachep, SYSV_I(inode)); 340 kmem_cache_free(sysv_inode_cachep, SYSV_I(inode));
339} 341}
340 342
343static void sysv_destroy_inode(struct inode *inode)
344{
345 call_rcu(&inode->i_rcu, sysv_i_callback);
346}
347
341static void init_once(void *p) 348static void init_once(void *p)
342{ 349{
343 struct sysv_inode_info *si = (struct sysv_inode_info *)p; 350 struct sysv_inode_info *si = (struct sysv_inode_info *)p;
diff --git a/fs/sysv/namei.c b/fs/sysv/namei.c
index 11e7f7d11cd..b427b1208c2 100644
--- a/fs/sysv/namei.c
+++ b/fs/sysv/namei.c
@@ -27,7 +27,8 @@ static int add_nondir(struct dentry *dentry, struct inode *inode)
27 return err; 27 return err;
28} 28}
29 29
30static int sysv_hash(struct dentry *dentry, struct qstr *qstr) 30static int sysv_hash(const struct dentry *dentry, const struct inode *inode,
31 struct qstr *qstr)
31{ 32{
32 /* Truncate the name in place, avoids having to define a compare 33 /* Truncate the name in place, avoids having to define a compare
33 function. */ 34 function. */
@@ -47,7 +48,6 @@ static struct dentry *sysv_lookup(struct inode * dir, struct dentry * dentry, st
47 struct inode * inode = NULL; 48 struct inode * inode = NULL;
48 ino_t ino; 49 ino_t ino;
49 50
50 dentry->d_op = dir->i_sb->s_root->d_op;
51 if (dentry->d_name.len > SYSV_NAMELEN) 51 if (dentry->d_name.len > SYSV_NAMELEN)
52 return ERR_PTR(-ENAMETOOLONG); 52 return ERR_PTR(-ENAMETOOLONG);
53 ino = sysv_inode_by_name(dentry); 53 ino = sysv_inode_by_name(dentry);
diff --git a/fs/sysv/super.c b/fs/sysv/super.c
index 3d9c62be0c1..f60c196913e 100644
--- a/fs/sysv/super.c
+++ b/fs/sysv/super.c
@@ -332,6 +332,10 @@ static int complete_read_super(struct super_block *sb, int silent, int size)
332 sb->s_magic = SYSV_MAGIC_BASE + sbi->s_type; 332 sb->s_magic = SYSV_MAGIC_BASE + sbi->s_type;
333 /* set up enough so that it can read an inode */ 333 /* set up enough so that it can read an inode */
334 sb->s_op = &sysv_sops; 334 sb->s_op = &sysv_sops;
335 if (sbi->s_forced_ro)
336 sb->s_flags |= MS_RDONLY;
337 if (sbi->s_truncate)
338 sb->s_d_op = &sysv_dentry_operations;
335 root_inode = sysv_iget(sb, SYSV_ROOT_INO); 339 root_inode = sysv_iget(sb, SYSV_ROOT_INO);
336 if (IS_ERR(root_inode)) { 340 if (IS_ERR(root_inode)) {
337 printk("SysV FS: get root inode failed\n"); 341 printk("SysV FS: get root inode failed\n");
@@ -343,10 +347,6 @@ static int complete_read_super(struct super_block *sb, int silent, int size)
343 printk("SysV FS: get root dentry failed\n"); 347 printk("SysV FS: get root dentry failed\n");
344 return 0; 348 return 0;
345 } 349 }
346 if (sbi->s_forced_ro)
347 sb->s_flags |= MS_RDONLY;
348 if (sbi->s_truncate)
349 sb->s_root->d_op = &sysv_dentry_operations;
350 return 1; 350 return 1;
351} 351}
352 352
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 91fac54c70e..6e11c2975dc 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -272,12 +272,20 @@ static struct inode *ubifs_alloc_inode(struct super_block *sb)
272 return &ui->vfs_inode; 272 return &ui->vfs_inode;
273}; 273};
274 274
275static void ubifs_i_callback(struct rcu_head *head)
276{
277 struct inode *inode = container_of(head, struct inode, i_rcu);
278 struct ubifs_inode *ui = ubifs_inode(inode);
279 INIT_LIST_HEAD(&inode->i_dentry);
280 kmem_cache_free(ubifs_inode_slab, ui);
281}
282
275static void ubifs_destroy_inode(struct inode *inode) 283static void ubifs_destroy_inode(struct inode *inode)
276{ 284{
277 struct ubifs_inode *ui = ubifs_inode(inode); 285 struct ubifs_inode *ui = ubifs_inode(inode);
278 286
279 kfree(ui->data); 287 kfree(ui->data);
280 kmem_cache_free(ubifs_inode_slab, inode); 288 call_rcu(&inode->i_rcu, ubifs_i_callback);
281} 289}
282 290
283/* 291/*
diff --git a/fs/udf/Kconfig b/fs/udf/Kconfig
index f8def3c8ea4..0e0e99bd6bc 100644
--- a/fs/udf/Kconfig
+++ b/fs/udf/Kconfig
@@ -1,6 +1,5 @@
1config UDF_FS 1config UDF_FS
2 tristate "UDF file system support" 2 tristate "UDF file system support"
3 depends on BKL # needs serious work to remove
4 select CRC_ITU_T 3 select CRC_ITU_T
5 help 4 help
6 This is the new file system used on some CD-ROMs and DVDs. Say Y if 5 This is the new file system used on some CD-ROMs and DVDs. Say Y if
diff --git a/fs/udf/balloc.c b/fs/udf/balloc.c
index b608efaa4ce..306ee39ef2c 100644
--- a/fs/udf/balloc.c
+++ b/fs/udf/balloc.c
@@ -157,10 +157,9 @@ static void udf_bitmap_free_blocks(struct super_block *sb,
157 udf_debug("bit %ld already set\n", bit + i); 157 udf_debug("bit %ld already set\n", bit + i);
158 udf_debug("byte=%2x\n", 158 udf_debug("byte=%2x\n",
159 ((char *)bh->b_data)[(bit + i) >> 3]); 159 ((char *)bh->b_data)[(bit + i) >> 3]);
160 } else {
161 udf_add_free_space(sb, sbi->s_partition, 1);
162 } 160 }
163 } 161 }
162 udf_add_free_space(sb, sbi->s_partition, count);
164 mark_buffer_dirty(bh); 163 mark_buffer_dirty(bh);
165 if (overflow) { 164 if (overflow) {
166 block += count; 165 block += count;
diff --git a/fs/udf/dir.c b/fs/udf/dir.c
index 51552bf5022..eb8bfe2b89a 100644
--- a/fs/udf/dir.c
+++ b/fs/udf/dir.c
@@ -30,7 +30,6 @@
30#include <linux/errno.h> 30#include <linux/errno.h>
31#include <linux/mm.h> 31#include <linux/mm.h>
32#include <linux/slab.h> 32#include <linux/slab.h>
33#include <linux/smp_lock.h>
34#include <linux/buffer_head.h> 33#include <linux/buffer_head.h>
35 34
36#include "udf_i.h" 35#include "udf_i.h"
@@ -190,18 +189,14 @@ static int udf_readdir(struct file *filp, void *dirent, filldir_t filldir)
190 struct inode *dir = filp->f_path.dentry->d_inode; 189 struct inode *dir = filp->f_path.dentry->d_inode;
191 int result; 190 int result;
192 191
193 lock_kernel();
194
195 if (filp->f_pos == 0) { 192 if (filp->f_pos == 0) {
196 if (filldir(dirent, ".", 1, filp->f_pos, dir->i_ino, DT_DIR) < 0) { 193 if (filldir(dirent, ".", 1, filp->f_pos, dir->i_ino, DT_DIR) < 0) {
197 unlock_kernel();
198 return 0; 194 return 0;
199 } 195 }
200 filp->f_pos++; 196 filp->f_pos++;
201 } 197 }
202 198
203 result = do_udf_readdir(dir, filp, filldir, dirent); 199 result = do_udf_readdir(dir, filp, filldir, dirent);
204 unlock_kernel();
205 return result; 200 return result;
206} 201}
207 202
diff --git a/fs/udf/file.c b/fs/udf/file.c
index 66b9e7e7e4c..89c78486cbb 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -32,7 +32,6 @@
32#include <linux/string.h> /* memset */ 32#include <linux/string.h> /* memset */
33#include <linux/capability.h> 33#include <linux/capability.h>
34#include <linux/errno.h> 34#include <linux/errno.h>
35#include <linux/smp_lock.h>
36#include <linux/pagemap.h> 35#include <linux/pagemap.h>
37#include <linux/buffer_head.h> 36#include <linux/buffer_head.h>
38#include <linux/aio.h> 37#include <linux/aio.h>
@@ -114,6 +113,7 @@ static ssize_t udf_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
114 size_t count = iocb->ki_left; 113 size_t count = iocb->ki_left;
115 struct udf_inode_info *iinfo = UDF_I(inode); 114 struct udf_inode_info *iinfo = UDF_I(inode);
116 115
116 down_write(&iinfo->i_data_sem);
117 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { 117 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) {
118 if (file->f_flags & O_APPEND) 118 if (file->f_flags & O_APPEND)
119 pos = inode->i_size; 119 pos = inode->i_size;
@@ -126,6 +126,7 @@ static ssize_t udf_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
126 udf_expand_file_adinicb(inode, pos + count, &err); 126 udf_expand_file_adinicb(inode, pos + count, &err);
127 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { 127 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) {
128 udf_debug("udf_expand_adinicb: err=%d\n", err); 128 udf_debug("udf_expand_adinicb: err=%d\n", err);
129 up_write(&iinfo->i_data_sem);
129 return err; 130 return err;
130 } 131 }
131 } else { 132 } else {
@@ -135,6 +136,7 @@ static ssize_t udf_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
135 iinfo->i_lenAlloc = inode->i_size; 136 iinfo->i_lenAlloc = inode->i_size;
136 } 137 }
137 } 138 }
139 up_write(&iinfo->i_data_sem);
138 140
139 retval = generic_file_aio_write(iocb, iov, nr_segs, ppos); 141 retval = generic_file_aio_write(iocb, iov, nr_segs, ppos);
140 if (retval > 0) 142 if (retval > 0)
@@ -149,8 +151,6 @@ long udf_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
149 long old_block, new_block; 151 long old_block, new_block;
150 int result = -EINVAL; 152 int result = -EINVAL;
151 153
152 lock_kernel();
153
154 if (file_permission(filp, MAY_READ) != 0) { 154 if (file_permission(filp, MAY_READ) != 0) {
155 udf_debug("no permission to access inode %lu\n", inode->i_ino); 155 udf_debug("no permission to access inode %lu\n", inode->i_ino);
156 result = -EPERM; 156 result = -EPERM;
@@ -196,7 +196,6 @@ long udf_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
196 } 196 }
197 197
198out: 198out:
199 unlock_kernel();
200 return result; 199 return result;
201} 200}
202 201
@@ -204,10 +203,10 @@ static int udf_release_file(struct inode *inode, struct file *filp)
204{ 203{
205 if (filp->f_mode & FMODE_WRITE) { 204 if (filp->f_mode & FMODE_WRITE) {
206 mutex_lock(&inode->i_mutex); 205 mutex_lock(&inode->i_mutex);
207 lock_kernel(); 206 down_write(&UDF_I(inode)->i_data_sem);
208 udf_discard_prealloc(inode); 207 udf_discard_prealloc(inode);
209 udf_truncate_tail_extent(inode); 208 udf_truncate_tail_extent(inode);
210 unlock_kernel(); 209 up_write(&UDF_I(inode)->i_data_sem);
211 mutex_unlock(&inode->i_mutex); 210 mutex_unlock(&inode->i_mutex);
212 } 211 }
213 return 0; 212 return 0;
diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c
index 75d9304d0dc..6fb7e0adcda 100644
--- a/fs/udf/ialloc.c
+++ b/fs/udf/ialloc.c
@@ -92,28 +92,19 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err)
92 return NULL; 92 return NULL;
93 } 93 }
94 94
95 mutex_lock(&sbi->s_alloc_mutex);
96 if (sbi->s_lvid_bh) { 95 if (sbi->s_lvid_bh) {
97 struct logicalVolIntegrityDesc *lvid = 96 struct logicalVolIntegrityDescImpUse *lvidiu;
98 (struct logicalVolIntegrityDesc *) 97
99 sbi->s_lvid_bh->b_data; 98 iinfo->i_unique = lvid_get_unique_id(sb);
100 struct logicalVolIntegrityDescImpUse *lvidiu = 99 mutex_lock(&sbi->s_alloc_mutex);
101 udf_sb_lvidiu(sbi); 100 lvidiu = udf_sb_lvidiu(sbi);
102 struct logicalVolHeaderDesc *lvhd;
103 uint64_t uniqueID;
104 lvhd = (struct logicalVolHeaderDesc *)
105 (lvid->logicalVolContentsUse);
106 if (S_ISDIR(mode)) 101 if (S_ISDIR(mode))
107 le32_add_cpu(&lvidiu->numDirs, 1); 102 le32_add_cpu(&lvidiu->numDirs, 1);
108 else 103 else
109 le32_add_cpu(&lvidiu->numFiles, 1); 104 le32_add_cpu(&lvidiu->numFiles, 1);
110 iinfo->i_unique = uniqueID = le64_to_cpu(lvhd->uniqueID);
111 if (!(++uniqueID & 0x00000000FFFFFFFFUL))
112 uniqueID += 16;
113 lvhd->uniqueID = cpu_to_le64(uniqueID);
114 udf_updated_lvid(sb); 105 udf_updated_lvid(sb);
106 mutex_unlock(&sbi->s_alloc_mutex);
115 } 107 }
116 mutex_unlock(&sbi->s_alloc_mutex);
117 108
118 inode_init_owner(inode, dir, mode); 109 inode_init_owner(inode, dir, mode);
119 110
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index fc48f37aa2d..c6a2e782b97 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -31,7 +31,6 @@
31 31
32#include "udfdecl.h" 32#include "udfdecl.h"
33#include <linux/mm.h> 33#include <linux/mm.h>
34#include <linux/smp_lock.h>
35#include <linux/module.h> 34#include <linux/module.h>
36#include <linux/pagemap.h> 35#include <linux/pagemap.h>
37#include <linux/buffer_head.h> 36#include <linux/buffer_head.h>
@@ -51,6 +50,7 @@ MODULE_LICENSE("GPL");
51static mode_t udf_convert_permissions(struct fileEntry *); 50static mode_t udf_convert_permissions(struct fileEntry *);
52static int udf_update_inode(struct inode *, int); 51static int udf_update_inode(struct inode *, int);
53static void udf_fill_inode(struct inode *, struct buffer_head *); 52static void udf_fill_inode(struct inode *, struct buffer_head *);
53static int udf_sync_inode(struct inode *inode);
54static int udf_alloc_i_data(struct inode *inode, size_t size); 54static int udf_alloc_i_data(struct inode *inode, size_t size);
55static struct buffer_head *inode_getblk(struct inode *, sector_t, int *, 55static struct buffer_head *inode_getblk(struct inode *, sector_t, int *,
56 sector_t *, int *); 56 sector_t *, int *);
@@ -79,9 +79,7 @@ void udf_evict_inode(struct inode *inode)
79 want_delete = 1; 79 want_delete = 1;
80 inode->i_size = 0; 80 inode->i_size = 0;
81 udf_truncate(inode); 81 udf_truncate(inode);
82 lock_kernel();
83 udf_update_inode(inode, IS_SYNC(inode)); 82 udf_update_inode(inode, IS_SYNC(inode));
84 unlock_kernel();
85 } 83 }
86 invalidate_inode_buffers(inode); 84 invalidate_inode_buffers(inode);
87 end_writeback(inode); 85 end_writeback(inode);
@@ -97,9 +95,7 @@ void udf_evict_inode(struct inode *inode)
97 kfree(iinfo->i_ext.i_data); 95 kfree(iinfo->i_ext.i_data);
98 iinfo->i_ext.i_data = NULL; 96 iinfo->i_ext.i_data = NULL;
99 if (want_delete) { 97 if (want_delete) {
100 lock_kernel();
101 udf_free_inode(inode); 98 udf_free_inode(inode);
102 unlock_kernel();
103 } 99 }
104} 100}
105 101
@@ -302,10 +298,9 @@ static int udf_get_block(struct inode *inode, sector_t block,
302 err = -EIO; 298 err = -EIO;
303 new = 0; 299 new = 0;
304 bh = NULL; 300 bh = NULL;
305
306 lock_kernel();
307
308 iinfo = UDF_I(inode); 301 iinfo = UDF_I(inode);
302
303 down_write(&iinfo->i_data_sem);
309 if (block == iinfo->i_next_alloc_block + 1) { 304 if (block == iinfo->i_next_alloc_block + 1) {
310 iinfo->i_next_alloc_block++; 305 iinfo->i_next_alloc_block++;
311 iinfo->i_next_alloc_goal++; 306 iinfo->i_next_alloc_goal++;
@@ -324,7 +319,7 @@ static int udf_get_block(struct inode *inode, sector_t block,
324 map_bh(bh_result, inode->i_sb, phys); 319 map_bh(bh_result, inode->i_sb, phys);
325 320
326abort: 321abort:
327 unlock_kernel(); 322 up_write(&iinfo->i_data_sem);
328 return err; 323 return err;
329} 324}
330 325
@@ -1022,16 +1017,16 @@ void udf_truncate(struct inode *inode)
1022 if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) 1017 if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
1023 return; 1018 return;
1024 1019
1025 lock_kernel();
1026 iinfo = UDF_I(inode); 1020 iinfo = UDF_I(inode);
1027 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { 1021 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) {
1022 down_write(&iinfo->i_data_sem);
1028 if (inode->i_sb->s_blocksize < 1023 if (inode->i_sb->s_blocksize <
1029 (udf_file_entry_alloc_offset(inode) + 1024 (udf_file_entry_alloc_offset(inode) +
1030 inode->i_size)) { 1025 inode->i_size)) {
1031 udf_expand_file_adinicb(inode, inode->i_size, &err); 1026 udf_expand_file_adinicb(inode, inode->i_size, &err);
1032 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { 1027 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) {
1033 inode->i_size = iinfo->i_lenAlloc; 1028 inode->i_size = iinfo->i_lenAlloc;
1034 unlock_kernel(); 1029 up_write(&iinfo->i_data_sem);
1035 return; 1030 return;
1036 } else 1031 } else
1037 udf_truncate_extents(inode); 1032 udf_truncate_extents(inode);
@@ -1042,10 +1037,13 @@ void udf_truncate(struct inode *inode)
1042 offset - udf_file_entry_alloc_offset(inode)); 1037 offset - udf_file_entry_alloc_offset(inode));
1043 iinfo->i_lenAlloc = inode->i_size; 1038 iinfo->i_lenAlloc = inode->i_size;
1044 } 1039 }
1040 up_write(&iinfo->i_data_sem);
1045 } else { 1041 } else {
1046 block_truncate_page(inode->i_mapping, inode->i_size, 1042 block_truncate_page(inode->i_mapping, inode->i_size,
1047 udf_get_block); 1043 udf_get_block);
1044 down_write(&iinfo->i_data_sem);
1048 udf_truncate_extents(inode); 1045 udf_truncate_extents(inode);
1046 up_write(&iinfo->i_data_sem);
1049 } 1047 }
1050 1048
1051 inode->i_mtime = inode->i_ctime = current_fs_time(inode->i_sb); 1049 inode->i_mtime = inode->i_ctime = current_fs_time(inode->i_sb);
@@ -1053,7 +1051,6 @@ void udf_truncate(struct inode *inode)
1053 udf_sync_inode(inode); 1051 udf_sync_inode(inode);
1054 else 1052 else
1055 mark_inode_dirty(inode); 1053 mark_inode_dirty(inode);
1056 unlock_kernel();
1057} 1054}
1058 1055
1059static void __udf_read_inode(struct inode *inode) 1056static void __udf_read_inode(struct inode *inode)
@@ -1202,6 +1199,7 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
1202 return; 1199 return;
1203 } 1200 }
1204 1201
1202 read_lock(&sbi->s_cred_lock);
1205 inode->i_uid = le32_to_cpu(fe->uid); 1203 inode->i_uid = le32_to_cpu(fe->uid);
1206 if (inode->i_uid == -1 || 1204 if (inode->i_uid == -1 ||
1207 UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_UID_IGNORE) || 1205 UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_UID_IGNORE) ||
@@ -1214,13 +1212,6 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
1214 UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_GID_SET)) 1212 UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_GID_SET))
1215 inode->i_gid = UDF_SB(inode->i_sb)->s_gid; 1213 inode->i_gid = UDF_SB(inode->i_sb)->s_gid;
1216 1214
1217 inode->i_nlink = le16_to_cpu(fe->fileLinkCount);
1218 if (!inode->i_nlink)
1219 inode->i_nlink = 1;
1220
1221 inode->i_size = le64_to_cpu(fe->informationLength);
1222 iinfo->i_lenExtents = inode->i_size;
1223
1224 if (fe->icbTag.fileType != ICBTAG_FILE_TYPE_DIRECTORY && 1215 if (fe->icbTag.fileType != ICBTAG_FILE_TYPE_DIRECTORY &&
1225 sbi->s_fmode != UDF_INVALID_MODE) 1216 sbi->s_fmode != UDF_INVALID_MODE)
1226 inode->i_mode = sbi->s_fmode; 1217 inode->i_mode = sbi->s_fmode;
@@ -1230,6 +1221,14 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
1230 else 1221 else
1231 inode->i_mode = udf_convert_permissions(fe); 1222 inode->i_mode = udf_convert_permissions(fe);
1232 inode->i_mode &= ~sbi->s_umask; 1223 inode->i_mode &= ~sbi->s_umask;
1224 read_unlock(&sbi->s_cred_lock);
1225
1226 inode->i_nlink = le16_to_cpu(fe->fileLinkCount);
1227 if (!inode->i_nlink)
1228 inode->i_nlink = 1;
1229
1230 inode->i_size = le64_to_cpu(fe->informationLength);
1231 iinfo->i_lenExtents = inode->i_size;
1233 1232
1234 if (iinfo->i_efe == 0) { 1233 if (iinfo->i_efe == 0) {
1235 inode->i_blocks = le64_to_cpu(fe->logicalBlocksRecorded) << 1234 inode->i_blocks = le64_to_cpu(fe->logicalBlocksRecorded) <<
@@ -1373,16 +1372,10 @@ static mode_t udf_convert_permissions(struct fileEntry *fe)
1373 1372
1374int udf_write_inode(struct inode *inode, struct writeback_control *wbc) 1373int udf_write_inode(struct inode *inode, struct writeback_control *wbc)
1375{ 1374{
1376 int ret; 1375 return udf_update_inode(inode, wbc->sync_mode == WB_SYNC_ALL);
1377
1378 lock_kernel();
1379 ret = udf_update_inode(inode, wbc->sync_mode == WB_SYNC_ALL);
1380 unlock_kernel();
1381
1382 return ret;
1383} 1376}
1384 1377
1385int udf_sync_inode(struct inode *inode) 1378static int udf_sync_inode(struct inode *inode)
1386{ 1379{
1387 return udf_update_inode(inode, 1); 1380 return udf_update_inode(inode, 1);
1388} 1381}
@@ -2048,7 +2041,7 @@ long udf_block_map(struct inode *inode, sector_t block)
2048 struct extent_position epos = {}; 2041 struct extent_position epos = {};
2049 int ret; 2042 int ret;
2050 2043
2051 lock_kernel(); 2044 down_read(&UDF_I(inode)->i_data_sem);
2052 2045
2053 if (inode_bmap(inode, block, &epos, &eloc, &elen, &offset) == 2046 if (inode_bmap(inode, block, &epos, &eloc, &elen, &offset) ==
2054 (EXT_RECORDED_ALLOCATED >> 30)) 2047 (EXT_RECORDED_ALLOCATED >> 30))
@@ -2056,7 +2049,7 @@ long udf_block_map(struct inode *inode, sector_t block)
2056 else 2049 else
2057 ret = 0; 2050 ret = 0;
2058 2051
2059 unlock_kernel(); 2052 up_read(&UDF_I(inode)->i_data_sem);
2060 brelse(epos.bh); 2053 brelse(epos.bh);
2061 2054
2062 if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_VARCONV)) 2055 if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_VARCONV))
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index 6d8dc02baeb..2be0f9eb86d 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -27,7 +27,6 @@
27#include <linux/errno.h> 27#include <linux/errno.h>
28#include <linux/mm.h> 28#include <linux/mm.h>
29#include <linux/slab.h> 29#include <linux/slab.h>
30#include <linux/smp_lock.h>
31#include <linux/buffer_head.h> 30#include <linux/buffer_head.h>
32#include <linux/sched.h> 31#include <linux/sched.h>
33#include <linux/crc-itu-t.h> 32#include <linux/crc-itu-t.h>
@@ -228,10 +227,8 @@ static struct fileIdentDesc *udf_find_entry(struct inode *dir,
228 } 227 }
229 228
230 if ((cfi->fileCharacteristics & FID_FILE_CHAR_PARENT) && 229 if ((cfi->fileCharacteristics & FID_FILE_CHAR_PARENT) &&
231 isdotdot) { 230 isdotdot)
232 brelse(epos.bh); 231 goto out_ok;
233 return fi;
234 }
235 232
236 if (!lfi) 233 if (!lfi)
237 continue; 234 continue;
@@ -263,7 +260,6 @@ static struct dentry *udf_lookup(struct inode *dir, struct dentry *dentry,
263 if (dentry->d_name.len > UDF_NAME_LEN - 2) 260 if (dentry->d_name.len > UDF_NAME_LEN - 2)
264 return ERR_PTR(-ENAMETOOLONG); 261 return ERR_PTR(-ENAMETOOLONG);
265 262
266 lock_kernel();
267#ifdef UDF_RECOVERY 263#ifdef UDF_RECOVERY
268 /* temporary shorthand for specifying files by inode number */ 264 /* temporary shorthand for specifying files by inode number */
269 if (!strncmp(dentry->d_name.name, ".B=", 3)) { 265 if (!strncmp(dentry->d_name.name, ".B=", 3)) {
@@ -275,7 +271,6 @@ static struct dentry *udf_lookup(struct inode *dir, struct dentry *dentry,
275 }; 271 };
276 inode = udf_iget(dir->i_sb, lb); 272 inode = udf_iget(dir->i_sb, lb);
277 if (!inode) { 273 if (!inode) {
278 unlock_kernel();
279 return ERR_PTR(-EACCES); 274 return ERR_PTR(-EACCES);
280 } 275 }
281 } else 276 } else
@@ -291,11 +286,9 @@ static struct dentry *udf_lookup(struct inode *dir, struct dentry *dentry,
291 loc = lelb_to_cpu(cfi.icb.extLocation); 286 loc = lelb_to_cpu(cfi.icb.extLocation);
292 inode = udf_iget(dir->i_sb, &loc); 287 inode = udf_iget(dir->i_sb, &loc);
293 if (!inode) { 288 if (!inode) {
294 unlock_kernel();
295 return ERR_PTR(-EACCES); 289 return ERR_PTR(-EACCES);
296 } 290 }
297 } 291 }
298 unlock_kernel();
299 292
300 return d_splice_alias(inode, dentry); 293 return d_splice_alias(inode, dentry);
301} 294}
@@ -476,15 +469,19 @@ add:
476 f_pos >> dir->i_sb->s_blocksize_bits, 1, err); 469 f_pos >> dir->i_sb->s_blocksize_bits, 1, err);
477 if (!fibh->ebh) 470 if (!fibh->ebh)
478 goto out_err; 471 goto out_err;
472 /* Extents could have been merged, invalidate our position */
473 brelse(epos.bh);
474 epos.bh = NULL;
475 epos.block = dinfo->i_location;
476 epos.offset = udf_file_entry_alloc_offset(dir);
479 477
480 if (!fibh->soffset) { 478 if (!fibh->soffset) {
481 if (udf_next_aext(dir, &epos, &eloc, &elen, 1) == 479 /* Find the freshly allocated block */
482 (EXT_RECORDED_ALLOCATED >> 30)) { 480 while (udf_next_aext(dir, &epos, &eloc, &elen, 1) ==
483 block = eloc.logicalBlockNum + ((elen - 1) >> 481 (EXT_RECORDED_ALLOCATED >> 30))
482 ;
483 block = eloc.logicalBlockNum + ((elen - 1) >>
484 dir->i_sb->s_blocksize_bits); 484 dir->i_sb->s_blocksize_bits);
485 } else
486 block++;
487
488 brelse(fibh->sbh); 485 brelse(fibh->sbh);
489 fibh->sbh = fibh->ebh; 486 fibh->sbh = fibh->ebh;
490 fi = (struct fileIdentDesc *)(fibh->sbh->b_data); 487 fi = (struct fileIdentDesc *)(fibh->sbh->b_data);
@@ -562,10 +559,8 @@ static int udf_create(struct inode *dir, struct dentry *dentry, int mode,
562 int err; 559 int err;
563 struct udf_inode_info *iinfo; 560 struct udf_inode_info *iinfo;
564 561
565 lock_kernel();
566 inode = udf_new_inode(dir, mode, &err); 562 inode = udf_new_inode(dir, mode, &err);
567 if (!inode) { 563 if (!inode) {
568 unlock_kernel();
569 return err; 564 return err;
570 } 565 }
571 566
@@ -583,7 +578,6 @@ static int udf_create(struct inode *dir, struct dentry *dentry, int mode,
583 inode->i_nlink--; 578 inode->i_nlink--;
584 mark_inode_dirty(inode); 579 mark_inode_dirty(inode);
585 iput(inode); 580 iput(inode);
586 unlock_kernel();
587 return err; 581 return err;
588 } 582 }
589 cfi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize); 583 cfi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize);
@@ -596,7 +590,6 @@ static int udf_create(struct inode *dir, struct dentry *dentry, int mode,
596 if (fibh.sbh != fibh.ebh) 590 if (fibh.sbh != fibh.ebh)
597 brelse(fibh.ebh); 591 brelse(fibh.ebh);
598 brelse(fibh.sbh); 592 brelse(fibh.sbh);
599 unlock_kernel();
600 d_instantiate(dentry, inode); 593 d_instantiate(dentry, inode);
601 594
602 return 0; 595 return 0;
@@ -614,7 +607,6 @@ static int udf_mknod(struct inode *dir, struct dentry *dentry, int mode,
614 if (!old_valid_dev(rdev)) 607 if (!old_valid_dev(rdev))
615 return -EINVAL; 608 return -EINVAL;
616 609
617 lock_kernel();
618 err = -EIO; 610 err = -EIO;
619 inode = udf_new_inode(dir, mode, &err); 611 inode = udf_new_inode(dir, mode, &err);
620 if (!inode) 612 if (!inode)
@@ -627,7 +619,6 @@ static int udf_mknod(struct inode *dir, struct dentry *dentry, int mode,
627 inode->i_nlink--; 619 inode->i_nlink--;
628 mark_inode_dirty(inode); 620 mark_inode_dirty(inode);
629 iput(inode); 621 iput(inode);
630 unlock_kernel();
631 return err; 622 return err;
632 } 623 }
633 cfi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize); 624 cfi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize);
@@ -646,7 +637,6 @@ static int udf_mknod(struct inode *dir, struct dentry *dentry, int mode,
646 err = 0; 637 err = 0;
647 638
648out: 639out:
649 unlock_kernel();
650 return err; 640 return err;
651} 641}
652 642
@@ -659,7 +649,6 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, int mode)
659 struct udf_inode_info *dinfo = UDF_I(dir); 649 struct udf_inode_info *dinfo = UDF_I(dir);
660 struct udf_inode_info *iinfo; 650 struct udf_inode_info *iinfo;
661 651
662 lock_kernel();
663 err = -EMLINK; 652 err = -EMLINK;
664 if (dir->i_nlink >= (256 << sizeof(dir->i_nlink)) - 1) 653 if (dir->i_nlink >= (256 << sizeof(dir->i_nlink)) - 1)
665 goto out; 654 goto out;
@@ -712,7 +701,6 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, int mode)
712 err = 0; 701 err = 0;
713 702
714out: 703out:
715 unlock_kernel();
716 return err; 704 return err;
717} 705}
718 706
@@ -794,7 +782,6 @@ static int udf_rmdir(struct inode *dir, struct dentry *dentry)
794 struct kernel_lb_addr tloc; 782 struct kernel_lb_addr tloc;
795 783
796 retval = -ENOENT; 784 retval = -ENOENT;
797 lock_kernel();
798 fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi); 785 fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi);
799 if (!fi) 786 if (!fi)
800 goto out; 787 goto out;
@@ -826,7 +813,6 @@ end_rmdir:
826 brelse(fibh.sbh); 813 brelse(fibh.sbh);
827 814
828out: 815out:
829 unlock_kernel();
830 return retval; 816 return retval;
831} 817}
832 818
@@ -840,7 +826,6 @@ static int udf_unlink(struct inode *dir, struct dentry *dentry)
840 struct kernel_lb_addr tloc; 826 struct kernel_lb_addr tloc;
841 827
842 retval = -ENOENT; 828 retval = -ENOENT;
843 lock_kernel();
844 fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi); 829 fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi);
845 if (!fi) 830 if (!fi)
846 goto out; 831 goto out;
@@ -870,7 +855,6 @@ end_unlink:
870 brelse(fibh.sbh); 855 brelse(fibh.sbh);
871 856
872out: 857out:
873 unlock_kernel();
874 return retval; 858 return retval;
875} 859}
876 860
@@ -890,21 +874,21 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
890 int block; 874 int block;
891 unsigned char *name = NULL; 875 unsigned char *name = NULL;
892 int namelen; 876 int namelen;
893 struct buffer_head *bh;
894 struct udf_inode_info *iinfo; 877 struct udf_inode_info *iinfo;
878 struct super_block *sb = dir->i_sb;
895 879
896 lock_kernel();
897 inode = udf_new_inode(dir, S_IFLNK | S_IRWXUGO, &err); 880 inode = udf_new_inode(dir, S_IFLNK | S_IRWXUGO, &err);
898 if (!inode) 881 if (!inode)
899 goto out; 882 goto out;
900 883
884 iinfo = UDF_I(inode);
885 down_write(&iinfo->i_data_sem);
901 name = kmalloc(UDF_NAME_LEN, GFP_NOFS); 886 name = kmalloc(UDF_NAME_LEN, GFP_NOFS);
902 if (!name) { 887 if (!name) {
903 err = -ENOMEM; 888 err = -ENOMEM;
904 goto out_no_entry; 889 goto out_no_entry;
905 } 890 }
906 891
907 iinfo = UDF_I(inode);
908 inode->i_data.a_ops = &udf_symlink_aops; 892 inode->i_data.a_ops = &udf_symlink_aops;
909 inode->i_op = &udf_symlink_inode_operations; 893 inode->i_op = &udf_symlink_inode_operations;
910 894
@@ -912,7 +896,7 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
912 struct kernel_lb_addr eloc; 896 struct kernel_lb_addr eloc;
913 uint32_t bsize; 897 uint32_t bsize;
914 898
915 block = udf_new_block(inode->i_sb, inode, 899 block = udf_new_block(sb, inode,
916 iinfo->i_location.partitionReferenceNum, 900 iinfo->i_location.partitionReferenceNum,
917 iinfo->i_location.logicalBlockNum, &err); 901 iinfo->i_location.logicalBlockNum, &err);
918 if (!block) 902 if (!block)
@@ -923,17 +907,17 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
923 eloc.logicalBlockNum = block; 907 eloc.logicalBlockNum = block;
924 eloc.partitionReferenceNum = 908 eloc.partitionReferenceNum =
925 iinfo->i_location.partitionReferenceNum; 909 iinfo->i_location.partitionReferenceNum;
926 bsize = inode->i_sb->s_blocksize; 910 bsize = sb->s_blocksize;
927 iinfo->i_lenExtents = bsize; 911 iinfo->i_lenExtents = bsize;
928 udf_add_aext(inode, &epos, &eloc, bsize, 0); 912 udf_add_aext(inode, &epos, &eloc, bsize, 0);
929 brelse(epos.bh); 913 brelse(epos.bh);
930 914
931 block = udf_get_pblock(inode->i_sb, block, 915 block = udf_get_pblock(sb, block,
932 iinfo->i_location.partitionReferenceNum, 916 iinfo->i_location.partitionReferenceNum,
933 0); 917 0);
934 epos.bh = udf_tgetblk(inode->i_sb, block); 918 epos.bh = udf_tgetblk(sb, block);
935 lock_buffer(epos.bh); 919 lock_buffer(epos.bh);
936 memset(epos.bh->b_data, 0x00, inode->i_sb->s_blocksize); 920 memset(epos.bh->b_data, 0x00, bsize);
937 set_buffer_uptodate(epos.bh); 921 set_buffer_uptodate(epos.bh);
938 unlock_buffer(epos.bh); 922 unlock_buffer(epos.bh);
939 mark_buffer_dirty_inode(epos.bh, inode); 923 mark_buffer_dirty_inode(epos.bh, inode);
@@ -941,7 +925,7 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
941 } else 925 } else
942 ea = iinfo->i_ext.i_data + iinfo->i_lenEAttr; 926 ea = iinfo->i_ext.i_data + iinfo->i_lenEAttr;
943 927
944 eoffset = inode->i_sb->s_blocksize - udf_ext0_offset(inode); 928 eoffset = sb->s_blocksize - udf_ext0_offset(inode);
945 pc = (struct pathComponent *)ea; 929 pc = (struct pathComponent *)ea;
946 930
947 if (*symname == '/') { 931 if (*symname == '/') {
@@ -981,7 +965,7 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
981 } 965 }
982 966
983 if (pc->componentType == 5) { 967 if (pc->componentType == 5) {
984 namelen = udf_put_filename(inode->i_sb, compstart, name, 968 namelen = udf_put_filename(sb, compstart, name,
985 symname - compstart); 969 symname - compstart);
986 if (!namelen) 970 if (!namelen)
987 goto out_no_entry; 971 goto out_no_entry;
@@ -1015,27 +999,16 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
1015 fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err); 999 fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err);
1016 if (!fi) 1000 if (!fi)
1017 goto out_no_entry; 1001 goto out_no_entry;
1018 cfi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize); 1002 cfi.icb.extLength = cpu_to_le32(sb->s_blocksize);
1019 cfi.icb.extLocation = cpu_to_lelb(iinfo->i_location); 1003 cfi.icb.extLocation = cpu_to_lelb(iinfo->i_location);
1020 bh = UDF_SB(inode->i_sb)->s_lvid_bh; 1004 if (UDF_SB(inode->i_sb)->s_lvid_bh) {
1021 if (bh) {
1022 struct logicalVolIntegrityDesc *lvid =
1023 (struct logicalVolIntegrityDesc *)bh->b_data;
1024 struct logicalVolHeaderDesc *lvhd;
1025 uint64_t uniqueID;
1026 lvhd = (struct logicalVolHeaderDesc *)
1027 lvid->logicalVolContentsUse;
1028 uniqueID = le64_to_cpu(lvhd->uniqueID);
1029 *(__le32 *)((struct allocDescImpUse *)cfi.icb.impUse)->impUse = 1005 *(__le32 *)((struct allocDescImpUse *)cfi.icb.impUse)->impUse =
1030 cpu_to_le32(uniqueID & 0x00000000FFFFFFFFUL); 1006 cpu_to_le32(lvid_get_unique_id(sb));
1031 if (!(++uniqueID & 0x00000000FFFFFFFFUL))
1032 uniqueID += 16;
1033 lvhd->uniqueID = cpu_to_le64(uniqueID);
1034 mark_buffer_dirty(bh);
1035 } 1007 }
1036 udf_write_fi(dir, &cfi, fi, &fibh, NULL, NULL); 1008 udf_write_fi(dir, &cfi, fi, &fibh, NULL, NULL);
1037 if (UDF_I(dir)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) 1009 if (UDF_I(dir)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB)
1038 mark_inode_dirty(dir); 1010 mark_inode_dirty(dir);
1011 up_write(&iinfo->i_data_sem);
1039 if (fibh.sbh != fibh.ebh) 1012 if (fibh.sbh != fibh.ebh)
1040 brelse(fibh.ebh); 1013 brelse(fibh.ebh);
1041 brelse(fibh.sbh); 1014 brelse(fibh.sbh);
@@ -1044,10 +1017,10 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
1044 1017
1045out: 1018out:
1046 kfree(name); 1019 kfree(name);
1047 unlock_kernel();
1048 return err; 1020 return err;
1049 1021
1050out_no_entry: 1022out_no_entry:
1023 up_write(&iinfo->i_data_sem);
1051 inode_dec_link_count(inode); 1024 inode_dec_link_count(inode);
1052 iput(inode); 1025 iput(inode);
1053 goto out; 1026 goto out;
@@ -1060,36 +1033,20 @@ static int udf_link(struct dentry *old_dentry, struct inode *dir,
1060 struct udf_fileident_bh fibh; 1033 struct udf_fileident_bh fibh;
1061 struct fileIdentDesc cfi, *fi; 1034 struct fileIdentDesc cfi, *fi;
1062 int err; 1035 int err;
1063 struct buffer_head *bh;
1064 1036
1065 lock_kernel();
1066 if (inode->i_nlink >= (256 << sizeof(inode->i_nlink)) - 1) { 1037 if (inode->i_nlink >= (256 << sizeof(inode->i_nlink)) - 1) {
1067 unlock_kernel();
1068 return -EMLINK; 1038 return -EMLINK;
1069 } 1039 }
1070 1040
1071 fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err); 1041 fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err);
1072 if (!fi) { 1042 if (!fi) {
1073 unlock_kernel();
1074 return err; 1043 return err;
1075 } 1044 }
1076 cfi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize); 1045 cfi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize);
1077 cfi.icb.extLocation = cpu_to_lelb(UDF_I(inode)->i_location); 1046 cfi.icb.extLocation = cpu_to_lelb(UDF_I(inode)->i_location);
1078 bh = UDF_SB(inode->i_sb)->s_lvid_bh; 1047 if (UDF_SB(inode->i_sb)->s_lvid_bh) {
1079 if (bh) {
1080 struct logicalVolIntegrityDesc *lvid =
1081 (struct logicalVolIntegrityDesc *)bh->b_data;
1082 struct logicalVolHeaderDesc *lvhd;
1083 uint64_t uniqueID;
1084 lvhd = (struct logicalVolHeaderDesc *)
1085 (lvid->logicalVolContentsUse);
1086 uniqueID = le64_to_cpu(lvhd->uniqueID);
1087 *(__le32 *)((struct allocDescImpUse *)cfi.icb.impUse)->impUse = 1048 *(__le32 *)((struct allocDescImpUse *)cfi.icb.impUse)->impUse =
1088 cpu_to_le32(uniqueID & 0x00000000FFFFFFFFUL); 1049 cpu_to_le32(lvid_get_unique_id(inode->i_sb));
1089 if (!(++uniqueID & 0x00000000FFFFFFFFUL))
1090 uniqueID += 16;
1091 lvhd->uniqueID = cpu_to_le64(uniqueID);
1092 mark_buffer_dirty(bh);
1093 } 1050 }
1094 udf_write_fi(dir, &cfi, fi, &fibh, NULL, NULL); 1051 udf_write_fi(dir, &cfi, fi, &fibh, NULL, NULL);
1095 if (UDF_I(dir)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) 1052 if (UDF_I(dir)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB)
@@ -1103,7 +1060,6 @@ static int udf_link(struct dentry *old_dentry, struct inode *dir,
1103 mark_inode_dirty(inode); 1060 mark_inode_dirty(inode);
1104 ihold(inode); 1061 ihold(inode);
1105 d_instantiate(dentry, inode); 1062 d_instantiate(dentry, inode);
1106 unlock_kernel();
1107 1063
1108 return 0; 1064 return 0;
1109} 1065}
@@ -1124,7 +1080,6 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry,
1124 struct kernel_lb_addr tloc; 1080 struct kernel_lb_addr tloc;
1125 struct udf_inode_info *old_iinfo = UDF_I(old_inode); 1081 struct udf_inode_info *old_iinfo = UDF_I(old_inode);
1126 1082
1127 lock_kernel();
1128 ofi = udf_find_entry(old_dir, &old_dentry->d_name, &ofibh, &ocfi); 1083 ofi = udf_find_entry(old_dir, &old_dentry->d_name, &ofibh, &ocfi);
1129 if (ofi) { 1084 if (ofi) {
1130 if (ofibh.sbh != ofibh.ebh) 1085 if (ofibh.sbh != ofibh.ebh)
@@ -1248,7 +1203,6 @@ end_rename:
1248 brelse(nfibh.ebh); 1203 brelse(nfibh.ebh);
1249 brelse(nfibh.sbh); 1204 brelse(nfibh.sbh);
1250 } 1205 }
1251 unlock_kernel();
1252 1206
1253 return retval; 1207 return retval;
1254} 1208}
@@ -1261,7 +1215,6 @@ static struct dentry *udf_get_parent(struct dentry *child)
1261 struct fileIdentDesc cfi; 1215 struct fileIdentDesc cfi;
1262 struct udf_fileident_bh fibh; 1216 struct udf_fileident_bh fibh;
1263 1217
1264 lock_kernel();
1265 if (!udf_find_entry(child->d_inode, &dotdot, &fibh, &cfi)) 1218 if (!udf_find_entry(child->d_inode, &dotdot, &fibh, &cfi))
1266 goto out_unlock; 1219 goto out_unlock;
1267 1220
@@ -1273,11 +1226,9 @@ static struct dentry *udf_get_parent(struct dentry *child)
1273 inode = udf_iget(child->d_inode->i_sb, &tloc); 1226 inode = udf_iget(child->d_inode->i_sb, &tloc);
1274 if (!inode) 1227 if (!inode)
1275 goto out_unlock; 1228 goto out_unlock;
1276 unlock_kernel();
1277 1229
1278 return d_obtain_alias(inode); 1230 return d_obtain_alias(inode);
1279out_unlock: 1231out_unlock:
1280 unlock_kernel();
1281 return ERR_PTR(-EACCES); 1232 return ERR_PTR(-EACCES);
1282} 1233}
1283 1234
diff --git a/fs/udf/partition.c b/fs/udf/partition.c
index 745eb209be0..a71090ea0e0 100644
--- a/fs/udf/partition.c
+++ b/fs/udf/partition.c
@@ -25,6 +25,7 @@
25#include <linux/fs.h> 25#include <linux/fs.h>
26#include <linux/string.h> 26#include <linux/string.h>
27#include <linux/buffer_head.h> 27#include <linux/buffer_head.h>
28#include <linux/mutex.h>
28 29
29uint32_t udf_get_pblock(struct super_block *sb, uint32_t block, 30uint32_t udf_get_pblock(struct super_block *sb, uint32_t block,
30 uint16_t partition, uint32_t offset) 31 uint16_t partition, uint32_t offset)
@@ -159,7 +160,9 @@ int udf_relocate_blocks(struct super_block *sb, long old_block, long *new_block)
159 struct udf_sb_info *sbi = UDF_SB(sb); 160 struct udf_sb_info *sbi = UDF_SB(sb);
160 u16 reallocationTableLen; 161 u16 reallocationTableLen;
161 struct buffer_head *bh; 162 struct buffer_head *bh;
163 int ret = 0;
162 164
165 mutex_lock(&sbi->s_alloc_mutex);
163 for (i = 0; i < sbi->s_partitions; i++) { 166 for (i = 0; i < sbi->s_partitions; i++) {
164 struct udf_part_map *map = &sbi->s_partmaps[i]; 167 struct udf_part_map *map = &sbi->s_partmaps[i];
165 if (old_block > map->s_partition_root && 168 if (old_block > map->s_partition_root &&
@@ -175,8 +178,10 @@ int udf_relocate_blocks(struct super_block *sb, long old_block, long *new_block)
175 break; 178 break;
176 } 179 }
177 180
178 if (!st) 181 if (!st) {
179 return 1; 182 ret = 1;
183 goto out;
184 }
180 185
181 reallocationTableLen = 186 reallocationTableLen =
182 le16_to_cpu(st->reallocationTableLen); 187 le16_to_cpu(st->reallocationTableLen);
@@ -207,14 +212,16 @@ int udf_relocate_blocks(struct super_block *sb, long old_block, long *new_block)
207 ((old_block - 212 ((old_block -
208 map->s_partition_root) & 213 map->s_partition_root) &
209 (sdata->s_packet_len - 1)); 214 (sdata->s_packet_len - 1));
210 return 0; 215 ret = 0;
216 goto out;
211 } else if (origLoc == packet) { 217 } else if (origLoc == packet) {
212 *new_block = le32_to_cpu( 218 *new_block = le32_to_cpu(
213 entry->mappedLocation) + 219 entry->mappedLocation) +
214 ((old_block - 220 ((old_block -
215 map->s_partition_root) & 221 map->s_partition_root) &
216 (sdata->s_packet_len - 1)); 222 (sdata->s_packet_len - 1));
217 return 0; 223 ret = 0;
224 goto out;
218 } else if (origLoc > packet) 225 } else if (origLoc > packet)
219 break; 226 break;
220 } 227 }
@@ -251,20 +258,24 @@ int udf_relocate_blocks(struct super_block *sb, long old_block, long *new_block)
251 st->mapEntry[k].mappedLocation) + 258 st->mapEntry[k].mappedLocation) +
252 ((old_block - map->s_partition_root) & 259 ((old_block - map->s_partition_root) &
253 (sdata->s_packet_len - 1)); 260 (sdata->s_packet_len - 1));
254 return 0; 261 ret = 0;
262 goto out;
255 } 263 }
256 264
257 return 1; 265 ret = 1;
266 goto out;
258 } /* if old_block */ 267 } /* if old_block */
259 } 268 }
260 269
261 if (i == sbi->s_partitions) { 270 if (i == sbi->s_partitions) {
262 /* outside of partitions */ 271 /* outside of partitions */
263 /* for now, fail =) */ 272 /* for now, fail =) */
264 return 1; 273 ret = 1;
265 } 274 }
266 275
267 return 0; 276out:
277 mutex_unlock(&sbi->s_alloc_mutex);
278 return ret;
268} 279}
269 280
270static uint32_t udf_try_read_meta(struct inode *inode, uint32_t block, 281static uint32_t udf_try_read_meta(struct inode *inode, uint32_t block,
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 4a5c7c61836..7b27b063ff6 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -48,7 +48,6 @@
48#include <linux/stat.h> 48#include <linux/stat.h>
49#include <linux/cdrom.h> 49#include <linux/cdrom.h>
50#include <linux/nls.h> 50#include <linux/nls.h>
51#include <linux/smp_lock.h>
52#include <linux/buffer_head.h> 51#include <linux/buffer_head.h>
53#include <linux/vfs.h> 52#include <linux/vfs.h>
54#include <linux/vmalloc.h> 53#include <linux/vmalloc.h>
@@ -135,15 +134,23 @@ static struct inode *udf_alloc_inode(struct super_block *sb)
135 ei->i_next_alloc_block = 0; 134 ei->i_next_alloc_block = 0;
136 ei->i_next_alloc_goal = 0; 135 ei->i_next_alloc_goal = 0;
137 ei->i_strat4096 = 0; 136 ei->i_strat4096 = 0;
137 init_rwsem(&ei->i_data_sem);
138 138
139 return &ei->vfs_inode; 139 return &ei->vfs_inode;
140} 140}
141 141
142static void udf_destroy_inode(struct inode *inode) 142static void udf_i_callback(struct rcu_head *head)
143{ 143{
144 struct inode *inode = container_of(head, struct inode, i_rcu);
145 INIT_LIST_HEAD(&inode->i_dentry);
144 kmem_cache_free(udf_inode_cachep, UDF_I(inode)); 146 kmem_cache_free(udf_inode_cachep, UDF_I(inode));
145} 147}
146 148
149static void udf_destroy_inode(struct inode *inode)
150{
151 call_rcu(&inode->i_rcu, udf_i_callback);
152}
153
147static void init_once(void *foo) 154static void init_once(void *foo)
148{ 155{
149 struct udf_inode_info *ei = (struct udf_inode_info *)foo; 156 struct udf_inode_info *ei = (struct udf_inode_info *)foo;
@@ -567,13 +574,14 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options)
567 if (!udf_parse_options(options, &uopt, true)) 574 if (!udf_parse_options(options, &uopt, true))
568 return -EINVAL; 575 return -EINVAL;
569 576
570 lock_kernel(); 577 write_lock(&sbi->s_cred_lock);
571 sbi->s_flags = uopt.flags; 578 sbi->s_flags = uopt.flags;
572 sbi->s_uid = uopt.uid; 579 sbi->s_uid = uopt.uid;
573 sbi->s_gid = uopt.gid; 580 sbi->s_gid = uopt.gid;
574 sbi->s_umask = uopt.umask; 581 sbi->s_umask = uopt.umask;
575 sbi->s_fmode = uopt.fmode; 582 sbi->s_fmode = uopt.fmode;
576 sbi->s_dmode = uopt.dmode; 583 sbi->s_dmode = uopt.dmode;
584 write_unlock(&sbi->s_cred_lock);
577 585
578 if (sbi->s_lvid_bh) { 586 if (sbi->s_lvid_bh) {
579 int write_rev = le16_to_cpu(udf_sb_lvidiu(sbi)->minUDFWriteRev); 587 int write_rev = le16_to_cpu(udf_sb_lvidiu(sbi)->minUDFWriteRev);
@@ -590,7 +598,6 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options)
590 udf_open_lvid(sb); 598 udf_open_lvid(sb);
591 599
592out_unlock: 600out_unlock:
593 unlock_kernel();
594 return error; 601 return error;
595} 602}
596 603
@@ -959,9 +966,9 @@ static struct udf_bitmap *udf_sb_alloc_bitmap(struct super_block *sb, u32 index)
959 (sizeof(struct buffer_head *) * nr_groups); 966 (sizeof(struct buffer_head *) * nr_groups);
960 967
961 if (size <= PAGE_SIZE) 968 if (size <= PAGE_SIZE)
962 bitmap = kmalloc(size, GFP_KERNEL); 969 bitmap = kzalloc(size, GFP_KERNEL);
963 else 970 else
964 bitmap = vmalloc(size); /* TODO: get rid of vmalloc */ 971 bitmap = vzalloc(size); /* TODO: get rid of vzalloc */
965 972
966 if (bitmap == NULL) { 973 if (bitmap == NULL) {
967 udf_error(sb, __func__, 974 udf_error(sb, __func__,
@@ -970,7 +977,6 @@ static struct udf_bitmap *udf_sb_alloc_bitmap(struct super_block *sb, u32 index)
970 return NULL; 977 return NULL;
971 } 978 }
972 979
973 memset(bitmap, 0x00, size);
974 bitmap->s_block_bitmap = (struct buffer_head **)(bitmap + 1); 980 bitmap->s_block_bitmap = (struct buffer_head **)(bitmap + 1);
975 bitmap->s_nr_groups = nr_groups; 981 bitmap->s_nr_groups = nr_groups;
976 return bitmap; 982 return bitmap;
@@ -1774,6 +1780,8 @@ static void udf_open_lvid(struct super_block *sb)
1774 1780
1775 if (!bh) 1781 if (!bh)
1776 return; 1782 return;
1783
1784 mutex_lock(&sbi->s_alloc_mutex);
1777 lvid = (struct logicalVolIntegrityDesc *)bh->b_data; 1785 lvid = (struct logicalVolIntegrityDesc *)bh->b_data;
1778 lvidiu = udf_sb_lvidiu(sbi); 1786 lvidiu = udf_sb_lvidiu(sbi);
1779 1787
@@ -1790,6 +1798,7 @@ static void udf_open_lvid(struct super_block *sb)
1790 lvid->descTag.tagChecksum = udf_tag_checksum(&lvid->descTag); 1798 lvid->descTag.tagChecksum = udf_tag_checksum(&lvid->descTag);
1791 mark_buffer_dirty(bh); 1799 mark_buffer_dirty(bh);
1792 sbi->s_lvid_dirty = 0; 1800 sbi->s_lvid_dirty = 0;
1801 mutex_unlock(&sbi->s_alloc_mutex);
1793} 1802}
1794 1803
1795static void udf_close_lvid(struct super_block *sb) 1804static void udf_close_lvid(struct super_block *sb)
@@ -1802,6 +1811,7 @@ static void udf_close_lvid(struct super_block *sb)
1802 if (!bh) 1811 if (!bh)
1803 return; 1812 return;
1804 1813
1814 mutex_lock(&sbi->s_alloc_mutex);
1805 lvid = (struct logicalVolIntegrityDesc *)bh->b_data; 1815 lvid = (struct logicalVolIntegrityDesc *)bh->b_data;
1806 lvidiu = udf_sb_lvidiu(sbi); 1816 lvidiu = udf_sb_lvidiu(sbi);
1807 lvidiu->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX; 1817 lvidiu->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX;
@@ -1822,6 +1832,34 @@ static void udf_close_lvid(struct super_block *sb)
1822 lvid->descTag.tagChecksum = udf_tag_checksum(&lvid->descTag); 1832 lvid->descTag.tagChecksum = udf_tag_checksum(&lvid->descTag);
1823 mark_buffer_dirty(bh); 1833 mark_buffer_dirty(bh);
1824 sbi->s_lvid_dirty = 0; 1834 sbi->s_lvid_dirty = 0;
1835 mutex_unlock(&sbi->s_alloc_mutex);
1836}
1837
1838u64 lvid_get_unique_id(struct super_block *sb)
1839{
1840 struct buffer_head *bh;
1841 struct udf_sb_info *sbi = UDF_SB(sb);
1842 struct logicalVolIntegrityDesc *lvid;
1843 struct logicalVolHeaderDesc *lvhd;
1844 u64 uniqueID;
1845 u64 ret;
1846
1847 bh = sbi->s_lvid_bh;
1848 if (!bh)
1849 return 0;
1850
1851 lvid = (struct logicalVolIntegrityDesc *)bh->b_data;
1852 lvhd = (struct logicalVolHeaderDesc *)lvid->logicalVolContentsUse;
1853
1854 mutex_lock(&sbi->s_alloc_mutex);
1855 ret = uniqueID = le64_to_cpu(lvhd->uniqueID);
1856 if (!(++uniqueID & 0xFFFFFFFF))
1857 uniqueID += 16;
1858 lvhd->uniqueID = cpu_to_le64(uniqueID);
1859 mutex_unlock(&sbi->s_alloc_mutex);
1860 mark_buffer_dirty(bh);
1861
1862 return ret;
1825} 1863}
1826 1864
1827static void udf_sb_free_bitmap(struct udf_bitmap *bitmap) 1865static void udf_sb_free_bitmap(struct udf_bitmap *bitmap)
@@ -1879,8 +1917,6 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
1879 struct kernel_lb_addr rootdir, fileset; 1917 struct kernel_lb_addr rootdir, fileset;
1880 struct udf_sb_info *sbi; 1918 struct udf_sb_info *sbi;
1881 1919
1882 lock_kernel();
1883
1884 uopt.flags = (1 << UDF_FLAG_USE_AD_IN_ICB) | (1 << UDF_FLAG_STRICT); 1920 uopt.flags = (1 << UDF_FLAG_USE_AD_IN_ICB) | (1 << UDF_FLAG_STRICT);
1885 uopt.uid = -1; 1921 uopt.uid = -1;
1886 uopt.gid = -1; 1922 uopt.gid = -1;
@@ -1889,10 +1925,8 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
1889 uopt.dmode = UDF_INVALID_MODE; 1925 uopt.dmode = UDF_INVALID_MODE;
1890 1926
1891 sbi = kzalloc(sizeof(struct udf_sb_info), GFP_KERNEL); 1927 sbi = kzalloc(sizeof(struct udf_sb_info), GFP_KERNEL);
1892 if (!sbi) { 1928 if (!sbi)
1893 unlock_kernel();
1894 return -ENOMEM; 1929 return -ENOMEM;
1895 }
1896 1930
1897 sb->s_fs_info = sbi; 1931 sb->s_fs_info = sbi;
1898 1932
@@ -1929,6 +1963,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
1929 sbi->s_fmode = uopt.fmode; 1963 sbi->s_fmode = uopt.fmode;
1930 sbi->s_dmode = uopt.dmode; 1964 sbi->s_dmode = uopt.dmode;
1931 sbi->s_nls_map = uopt.nls_map; 1965 sbi->s_nls_map = uopt.nls_map;
1966 rwlock_init(&sbi->s_cred_lock);
1932 1967
1933 if (uopt.session == 0xFFFFFFFF) 1968 if (uopt.session == 0xFFFFFFFF)
1934 sbi->s_session = udf_get_last_session(sb); 1969 sbi->s_session = udf_get_last_session(sb);
@@ -2038,7 +2073,6 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
2038 goto error_out; 2073 goto error_out;
2039 } 2074 }
2040 sb->s_maxbytes = MAX_LFS_FILESIZE; 2075 sb->s_maxbytes = MAX_LFS_FILESIZE;
2041 unlock_kernel();
2042 return 0; 2076 return 0;
2043 2077
2044error_out: 2078error_out:
@@ -2059,7 +2093,6 @@ error_out:
2059 kfree(sbi); 2093 kfree(sbi);
2060 sb->s_fs_info = NULL; 2094 sb->s_fs_info = NULL;
2061 2095
2062 unlock_kernel();
2063 return -EINVAL; 2096 return -EINVAL;
2064} 2097}
2065 2098
@@ -2098,8 +2131,6 @@ static void udf_put_super(struct super_block *sb)
2098 2131
2099 sbi = UDF_SB(sb); 2132 sbi = UDF_SB(sb);
2100 2133
2101 lock_kernel();
2102
2103 if (sbi->s_vat_inode) 2134 if (sbi->s_vat_inode)
2104 iput(sbi->s_vat_inode); 2135 iput(sbi->s_vat_inode);
2105 if (sbi->s_partitions) 2136 if (sbi->s_partitions)
@@ -2115,8 +2146,6 @@ static void udf_put_super(struct super_block *sb)
2115 kfree(sbi->s_partmaps); 2146 kfree(sbi->s_partmaps);
2116 kfree(sb->s_fs_info); 2147 kfree(sb->s_fs_info);
2117 sb->s_fs_info = NULL; 2148 sb->s_fs_info = NULL;
2118
2119 unlock_kernel();
2120} 2149}
2121 2150
2122static int udf_sync_fs(struct super_block *sb, int wait) 2151static int udf_sync_fs(struct super_block *sb, int wait)
@@ -2179,8 +2208,6 @@ static unsigned int udf_count_free_bitmap(struct super_block *sb,
2179 uint16_t ident; 2208 uint16_t ident;
2180 struct spaceBitmapDesc *bm; 2209 struct spaceBitmapDesc *bm;
2181 2210
2182 lock_kernel();
2183
2184 loc.logicalBlockNum = bitmap->s_extPosition; 2211 loc.logicalBlockNum = bitmap->s_extPosition;
2185 loc.partitionReferenceNum = UDF_SB(sb)->s_partition; 2212 loc.partitionReferenceNum = UDF_SB(sb)->s_partition;
2186 bh = udf_read_ptagged(sb, &loc, 0, &ident); 2213 bh = udf_read_ptagged(sb, &loc, 0, &ident);
@@ -2217,10 +2244,7 @@ static unsigned int udf_count_free_bitmap(struct super_block *sb,
2217 } 2244 }
2218 } 2245 }
2219 brelse(bh); 2246 brelse(bh);
2220
2221out: 2247out:
2222 unlock_kernel();
2223
2224 return accum; 2248 return accum;
2225} 2249}
2226 2250
@@ -2233,8 +2257,7 @@ static unsigned int udf_count_free_table(struct super_block *sb,
2233 int8_t etype; 2257 int8_t etype;
2234 struct extent_position epos; 2258 struct extent_position epos;
2235 2259
2236 lock_kernel(); 2260 mutex_lock(&UDF_SB(sb)->s_alloc_mutex);
2237
2238 epos.block = UDF_I(table)->i_location; 2261 epos.block = UDF_I(table)->i_location;
2239 epos.offset = sizeof(struct unallocSpaceEntry); 2262 epos.offset = sizeof(struct unallocSpaceEntry);
2240 epos.bh = NULL; 2263 epos.bh = NULL;
@@ -2243,8 +2266,7 @@ static unsigned int udf_count_free_table(struct super_block *sb,
2243 accum += (elen >> table->i_sb->s_blocksize_bits); 2266 accum += (elen >> table->i_sb->s_blocksize_bits);
2244 2267
2245 brelse(epos.bh); 2268 brelse(epos.bh);
2246 2269 mutex_unlock(&UDF_SB(sb)->s_alloc_mutex);
2247 unlock_kernel();
2248 2270
2249 return accum; 2271 return accum;
2250} 2272}
diff --git a/fs/udf/symlink.c b/fs/udf/symlink.c
index 16064787d2b..b1d4488b0f1 100644
--- a/fs/udf/symlink.c
+++ b/fs/udf/symlink.c
@@ -27,7 +27,6 @@
27#include <linux/mm.h> 27#include <linux/mm.h>
28#include <linux/stat.h> 28#include <linux/stat.h>
29#include <linux/pagemap.h> 29#include <linux/pagemap.h>
30#include <linux/smp_lock.h>
31#include <linux/buffer_head.h> 30#include <linux/buffer_head.h>
32#include "udf_i.h" 31#include "udf_i.h"
33 32
@@ -78,13 +77,16 @@ static int udf_symlink_filler(struct file *file, struct page *page)
78 int err = -EIO; 77 int err = -EIO;
79 unsigned char *p = kmap(page); 78 unsigned char *p = kmap(page);
80 struct udf_inode_info *iinfo; 79 struct udf_inode_info *iinfo;
80 uint32_t pos;
81 81
82 lock_kernel();
83 iinfo = UDF_I(inode); 82 iinfo = UDF_I(inode);
83 pos = udf_block_map(inode, 0);
84
85 down_read(&iinfo->i_data_sem);
84 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { 86 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) {
85 symlink = iinfo->i_ext.i_data + iinfo->i_lenEAttr; 87 symlink = iinfo->i_ext.i_data + iinfo->i_lenEAttr;
86 } else { 88 } else {
87 bh = sb_bread(inode->i_sb, udf_block_map(inode, 0)); 89 bh = sb_bread(inode->i_sb, pos);
88 90
89 if (!bh) 91 if (!bh)
90 goto out; 92 goto out;
@@ -95,14 +97,14 @@ static int udf_symlink_filler(struct file *file, struct page *page)
95 udf_pc_to_char(inode->i_sb, symlink, inode->i_size, p); 97 udf_pc_to_char(inode->i_sb, symlink, inode->i_size, p);
96 brelse(bh); 98 brelse(bh);
97 99
98 unlock_kernel(); 100 up_read(&iinfo->i_data_sem);
99 SetPageUptodate(page); 101 SetPageUptodate(page);
100 kunmap(page); 102 kunmap(page);
101 unlock_page(page); 103 unlock_page(page);
102 return 0; 104 return 0;
103 105
104out: 106out:
105 unlock_kernel(); 107 up_read(&iinfo->i_data_sem);
106 SetPageError(page); 108 SetPageError(page);
107 kunmap(page); 109 kunmap(page);
108 unlock_page(page); 110 unlock_page(page);
diff --git a/fs/udf/udf_i.h b/fs/udf/udf_i.h
index e58d1de4107..d1bd31ea724 100644
--- a/fs/udf/udf_i.h
+++ b/fs/udf/udf_i.h
@@ -1,6 +1,18 @@
1#ifndef _UDF_I_H 1#ifndef _UDF_I_H
2#define _UDF_I_H 2#define _UDF_I_H
3 3
4/*
5 * The i_data_sem and i_mutex serve for protection of allocation information
6 * of a regular files and symlinks. This includes all extents belonging to
7 * the file/symlink, a fact whether data are in-inode or in external data
8 * blocks, preallocation, goal block information... When extents are read,
9 * i_mutex or i_data_sem must be held (for reading is enough in case of
10 * i_data_sem). When extents are changed, i_data_sem must be held for writing
11 * and also i_mutex must be held.
12 *
13 * For directories i_mutex is used for all the necessary protection.
14 */
15
4struct udf_inode_info { 16struct udf_inode_info {
5 struct timespec i_crtime; 17 struct timespec i_crtime;
6 /* Physical address of inode */ 18 /* Physical address of inode */
@@ -21,6 +33,7 @@ struct udf_inode_info {
21 struct long_ad *i_lad; 33 struct long_ad *i_lad;
22 __u8 *i_data; 34 __u8 *i_data;
23 } i_ext; 35 } i_ext;
36 struct rw_semaphore i_data_sem;
24 struct inode vfs_inode; 37 struct inode vfs_inode;
25}; 38};
26 39
diff --git a/fs/udf/udf_sb.h b/fs/udf/udf_sb.h
index d113b72c276..4858c191242 100644
--- a/fs/udf/udf_sb.h
+++ b/fs/udf/udf_sb.h
@@ -2,6 +2,7 @@
2#define __LINUX_UDF_SB_H 2#define __LINUX_UDF_SB_H
3 3
4#include <linux/mutex.h> 4#include <linux/mutex.h>
5#include <linux/bitops.h>
5 6
6/* Since UDF 2.01 is ISO 13346 based... */ 7/* Since UDF 2.01 is ISO 13346 based... */
7#define UDF_SUPER_MAGIC 0x15013346 8#define UDF_SUPER_MAGIC 0x15013346
@@ -128,6 +129,8 @@ struct udf_sb_info {
128 uid_t s_uid; 129 uid_t s_uid;
129 mode_t s_fmode; 130 mode_t s_fmode;
130 mode_t s_dmode; 131 mode_t s_dmode;
132 /* Lock protecting consistency of above permission settings */
133 rwlock_t s_cred_lock;
131 134
132 /* Root Info */ 135 /* Root Info */
133 struct timespec s_record_time; 136 struct timespec s_record_time;
@@ -139,7 +142,7 @@ struct udf_sb_info {
139 __u16 s_udfrev; 142 __u16 s_udfrev;
140 143
141 /* Miscellaneous flags */ 144 /* Miscellaneous flags */
142 __u32 s_flags; 145 unsigned long s_flags;
143 146
144 /* Encoding info */ 147 /* Encoding info */
145 struct nls_table *s_nls_map; 148 struct nls_table *s_nls_map;
@@ -161,8 +164,19 @@ struct logicalVolIntegrityDescImpUse *udf_sb_lvidiu(struct udf_sb_info *sbi);
161 164
162int udf_compute_nr_groups(struct super_block *sb, u32 partition); 165int udf_compute_nr_groups(struct super_block *sb, u32 partition);
163 166
164#define UDF_QUERY_FLAG(X,Y) ( UDF_SB(X)->s_flags & ( 1 << (Y) ) ) 167static inline int UDF_QUERY_FLAG(struct super_block *sb, int flag)
165#define UDF_SET_FLAG(X,Y) ( UDF_SB(X)->s_flags |= ( 1 << (Y) ) ) 168{
166#define UDF_CLEAR_FLAG(X,Y) ( UDF_SB(X)->s_flags &= ~( 1 << (Y) ) ) 169 return test_bit(flag, &UDF_SB(sb)->s_flags);
170}
171
172static inline void UDF_SET_FLAG(struct super_block *sb, int flag)
173{
174 set_bit(flag, &UDF_SB(sb)->s_flags);
175}
176
177static inline void UDF_CLEAR_FLAG(struct super_block *sb, int flag)
178{
179 clear_bit(flag, &UDF_SB(sb)->s_flags);
180}
167 181
168#endif /* __LINUX_UDF_SB_H */ 182#endif /* __LINUX_UDF_SB_H */
diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h
index 6995ab1f430..eba48209f9f 100644
--- a/fs/udf/udfdecl.h
+++ b/fs/udf/udfdecl.h
@@ -111,6 +111,8 @@ struct extent_position {
111}; 111};
112 112
113/* super.c */ 113/* super.c */
114
115__attribute__((format(printf, 3, 4)))
114extern void udf_warning(struct super_block *, const char *, const char *, ...); 116extern void udf_warning(struct super_block *, const char *, const char *, ...);
115static inline void udf_updated_lvid(struct super_block *sb) 117static inline void udf_updated_lvid(struct super_block *sb)
116{ 118{
@@ -123,6 +125,7 @@ static inline void udf_updated_lvid(struct super_block *sb)
123 sb->s_dirt = 1; 125 sb->s_dirt = 1;
124 UDF_SB(sb)->s_lvid_dirty = 1; 126 UDF_SB(sb)->s_lvid_dirty = 1;
125} 127}
128extern u64 lvid_get_unique_id(struct super_block *sb);
126 129
127/* namei.c */ 130/* namei.c */
128extern int udf_write_fi(struct inode *inode, struct fileIdentDesc *, 131extern int udf_write_fi(struct inode *inode, struct fileIdentDesc *,
@@ -133,7 +136,6 @@ extern int udf_write_fi(struct inode *inode, struct fileIdentDesc *,
133extern long udf_ioctl(struct file *, unsigned int, unsigned long); 136extern long udf_ioctl(struct file *, unsigned int, unsigned long);
134/* inode.c */ 137/* inode.c */
135extern struct inode *udf_iget(struct super_block *, struct kernel_lb_addr *); 138extern struct inode *udf_iget(struct super_block *, struct kernel_lb_addr *);
136extern int udf_sync_inode(struct inode *);
137extern void udf_expand_file_adinicb(struct inode *, int, int *); 139extern void udf_expand_file_adinicb(struct inode *, int, int *);
138extern struct buffer_head *udf_expand_dir_adinicb(struct inode *, int *, int *); 140extern struct buffer_head *udf_expand_dir_adinicb(struct inode *, int *, int *);
139extern struct buffer_head *udf_bread(struct inode *, int, int, int *); 141extern struct buffer_head *udf_bread(struct inode *, int, int, int *);
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index 2c47daed56d..2c61ac5d4e4 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -1412,11 +1412,18 @@ static struct inode *ufs_alloc_inode(struct super_block *sb)
1412 return &ei->vfs_inode; 1412 return &ei->vfs_inode;
1413} 1413}
1414 1414
1415static void ufs_destroy_inode(struct inode *inode) 1415static void ufs_i_callback(struct rcu_head *head)
1416{ 1416{
1417 struct inode *inode = container_of(head, struct inode, i_rcu);
1418 INIT_LIST_HEAD(&inode->i_dentry);
1417 kmem_cache_free(ufs_inode_cachep, UFS_I(inode)); 1419 kmem_cache_free(ufs_inode_cachep, UFS_I(inode));
1418} 1420}
1419 1421
1422static void ufs_destroy_inode(struct inode *inode)
1423{
1424 call_rcu(&inode->i_rcu, ufs_i_callback);
1425}
1426
1420static void init_once(void *foo) 1427static void init_once(void *foo)
1421{ 1428{
1422 struct ufs_inode_info *ei = (struct ufs_inode_info *) foo; 1429 struct ufs_inode_info *ei = (struct ufs_inode_info *) foo;
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 0dce969d6ca..faca4499709 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -98,6 +98,7 @@ xfs-y += $(addprefix $(XFS_LINUX)/, \
98 kmem.o \ 98 kmem.o \
99 xfs_aops.o \ 99 xfs_aops.o \
100 xfs_buf.o \ 100 xfs_buf.o \
101 xfs_discard.o \
101 xfs_export.o \ 102 xfs_export.o \
102 xfs_file.o \ 103 xfs_file.o \
103 xfs_fs_subr.o \ 104 xfs_fs_subr.o \
diff --git a/fs/xfs/linux-2.6/sv.h b/fs/xfs/linux-2.6/sv.h
deleted file mode 100644
index 4dfc7c37081..00000000000
--- a/fs/xfs/linux-2.6/sv.h
+++ /dev/null
@@ -1,59 +0,0 @@
1/*
2 * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_SUPPORT_SV_H__
19#define __XFS_SUPPORT_SV_H__
20
21#include <linux/wait.h>
22#include <linux/sched.h>
23#include <linux/spinlock.h>
24
25/*
26 * Synchronisation variables.
27 *
28 * (Parameters "pri", "svf" and "rts" are not implemented)
29 */
30
31typedef struct sv_s {
32 wait_queue_head_t waiters;
33} sv_t;
34
35static inline void _sv_wait(sv_t *sv, spinlock_t *lock)
36{
37 DECLARE_WAITQUEUE(wait, current);
38
39 add_wait_queue_exclusive(&sv->waiters, &wait);
40 __set_current_state(TASK_UNINTERRUPTIBLE);
41 spin_unlock(lock);
42
43 schedule();
44
45 remove_wait_queue(&sv->waiters, &wait);
46}
47
48#define sv_init(sv,flag,name) \
49 init_waitqueue_head(&(sv)->waiters)
50#define sv_destroy(sv) \
51 /*NOTHING*/
52#define sv_wait(sv, pri, lock, s) \
53 _sv_wait(sv, lock)
54#define sv_signal(sv) \
55 wake_up(&(sv)->waiters)
56#define sv_broadcast(sv) \
57 wake_up_all(&(sv)->waiters)
58
59#endif /* __XFS_SUPPORT_SV_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_acl.c b/fs/xfs/linux-2.6/xfs_acl.c
index b2771862fd3..39f4f809bb6 100644
--- a/fs/xfs/linux-2.6/xfs_acl.c
+++ b/fs/xfs/linux-2.6/xfs_acl.c
@@ -219,12 +219,13 @@ xfs_set_acl(struct inode *inode, int type, struct posix_acl *acl)
219} 219}
220 220
221int 221int
222xfs_check_acl(struct inode *inode, int mask) 222xfs_check_acl(struct inode *inode, int mask, unsigned int flags)
223{ 223{
224 struct xfs_inode *ip = XFS_I(inode); 224 struct xfs_inode *ip;
225 struct posix_acl *acl; 225 struct posix_acl *acl;
226 int error = -EAGAIN; 226 int error = -EAGAIN;
227 227
228 ip = XFS_I(inode);
228 trace_xfs_check_acl(ip); 229 trace_xfs_check_acl(ip);
229 230
230 /* 231 /*
@@ -234,6 +235,12 @@ xfs_check_acl(struct inode *inode, int mask)
234 if (!XFS_IFORK_Q(ip)) 235 if (!XFS_IFORK_Q(ip))
235 return -EAGAIN; 236 return -EAGAIN;
236 237
238 if (flags & IPERM_FLAG_RCU) {
239 if (!negative_cached_acl(inode, ACL_TYPE_ACCESS))
240 return -ECHILD;
241 return -EAGAIN;
242 }
243
237 acl = xfs_get_acl(inode, ACL_TYPE_ACCESS); 244 acl = xfs_get_acl(inode, ACL_TYPE_ACCESS);
238 if (IS_ERR(acl)) 245 if (IS_ERR(acl))
239 return PTR_ERR(acl); 246 return PTR_ERR(acl);
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index c9af48fffcd..ec7bbb5645b 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -38,15 +38,6 @@
38#include <linux/pagevec.h> 38#include <linux/pagevec.h>
39#include <linux/writeback.h> 39#include <linux/writeback.h>
40 40
41/*
42 * Types of I/O for bmap clustering and I/O completion tracking.
43 */
44enum {
45 IO_READ, /* mapping for a read */
46 IO_DELAY, /* mapping covers delalloc region */
47 IO_UNWRITTEN, /* mapping covers allocated but uninitialized data */
48 IO_NEW /* just allocated */
49};
50 41
51/* 42/*
52 * Prime number of hash buckets since address is used as the key. 43 * Prime number of hash buckets since address is used as the key.
@@ -182,9 +173,6 @@ xfs_setfilesize(
182 xfs_inode_t *ip = XFS_I(ioend->io_inode); 173 xfs_inode_t *ip = XFS_I(ioend->io_inode);
183 xfs_fsize_t isize; 174 xfs_fsize_t isize;
184 175
185 ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG);
186 ASSERT(ioend->io_type != IO_READ);
187
188 if (unlikely(ioend->io_error)) 176 if (unlikely(ioend->io_error))
189 return 0; 177 return 0;
190 178
@@ -244,10 +232,8 @@ xfs_end_io(
244 * We might have to update the on-disk file size after extending 232 * We might have to update the on-disk file size after extending
245 * writes. 233 * writes.
246 */ 234 */
247 if (ioend->io_type != IO_READ) { 235 error = xfs_setfilesize(ioend);
248 error = xfs_setfilesize(ioend); 236 ASSERT(!error || error == EAGAIN);
249 ASSERT(!error || error == EAGAIN);
250 }
251 237
252 /* 238 /*
253 * If we didn't complete processing of the ioend, requeue it to the 239 * If we didn't complete processing of the ioend, requeue it to the
@@ -318,14 +304,63 @@ STATIC int
318xfs_map_blocks( 304xfs_map_blocks(
319 struct inode *inode, 305 struct inode *inode,
320 loff_t offset, 306 loff_t offset,
321 ssize_t count,
322 struct xfs_bmbt_irec *imap, 307 struct xfs_bmbt_irec *imap,
323 int flags) 308 int type,
309 int nonblocking)
324{ 310{
325 int nmaps = 1; 311 struct xfs_inode *ip = XFS_I(inode);
326 int new = 0; 312 struct xfs_mount *mp = ip->i_mount;
313 ssize_t count = 1 << inode->i_blkbits;
314 xfs_fileoff_t offset_fsb, end_fsb;
315 int error = 0;
316 int bmapi_flags = XFS_BMAPI_ENTIRE;
317 int nimaps = 1;
318
319 if (XFS_FORCED_SHUTDOWN(mp))
320 return -XFS_ERROR(EIO);
321
322 if (type == IO_UNWRITTEN)
323 bmapi_flags |= XFS_BMAPI_IGSTATE;
324
325 if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
326 if (nonblocking)
327 return -XFS_ERROR(EAGAIN);
328 xfs_ilock(ip, XFS_ILOCK_SHARED);
329 }
330
331 ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
332 (ip->i_df.if_flags & XFS_IFEXTENTS));
333 ASSERT(offset <= mp->m_maxioffset);
334
335 if (offset + count > mp->m_maxioffset)
336 count = mp->m_maxioffset - offset;
337 end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count);
338 offset_fsb = XFS_B_TO_FSBT(mp, offset);
339 error = xfs_bmapi(NULL, ip, offset_fsb, end_fsb - offset_fsb,
340 bmapi_flags, NULL, 0, imap, &nimaps, NULL);
341 xfs_iunlock(ip, XFS_ILOCK_SHARED);
327 342
328 return -xfs_iomap(XFS_I(inode), offset, count, flags, imap, &nmaps, &new); 343 if (error)
344 return -XFS_ERROR(error);
345
346 if (type == IO_DELALLOC &&
347 (!nimaps || isnullstartblock(imap->br_startblock))) {
348 error = xfs_iomap_write_allocate(ip, offset, count, imap);
349 if (!error)
350 trace_xfs_map_blocks_alloc(ip, offset, count, type, imap);
351 return -XFS_ERROR(error);
352 }
353
354#ifdef DEBUG
355 if (type == IO_UNWRITTEN) {
356 ASSERT(nimaps);
357 ASSERT(imap->br_startblock != HOLESTARTBLOCK);
358 ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
359 }
360#endif
361 if (nimaps)
362 trace_xfs_map_blocks_found(ip, offset, count, type, imap);
363 return 0;
329} 364}
330 365
331STATIC int 366STATIC int
@@ -380,26 +415,18 @@ xfs_submit_ioend_bio(
380 415
381 submit_bio(wbc->sync_mode == WB_SYNC_ALL ? 416 submit_bio(wbc->sync_mode == WB_SYNC_ALL ?
382 WRITE_SYNC_PLUG : WRITE, bio); 417 WRITE_SYNC_PLUG : WRITE, bio);
383 ASSERT(!bio_flagged(bio, BIO_EOPNOTSUPP));
384 bio_put(bio);
385} 418}
386 419
387STATIC struct bio * 420STATIC struct bio *
388xfs_alloc_ioend_bio( 421xfs_alloc_ioend_bio(
389 struct buffer_head *bh) 422 struct buffer_head *bh)
390{ 423{
391 struct bio *bio;
392 int nvecs = bio_get_nr_vecs(bh->b_bdev); 424 int nvecs = bio_get_nr_vecs(bh->b_bdev);
393 425 struct bio *bio = bio_alloc(GFP_NOIO, nvecs);
394 do {
395 bio = bio_alloc(GFP_NOIO, nvecs);
396 nvecs >>= 1;
397 } while (!bio);
398 426
399 ASSERT(bio->bi_private == NULL); 427 ASSERT(bio->bi_private == NULL);
400 bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9); 428 bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
401 bio->bi_bdev = bh->b_bdev; 429 bio->bi_bdev = bh->b_bdev;
402 bio_get(bio);
403 return bio; 430 return bio;
404} 431}
405 432
@@ -470,9 +497,8 @@ xfs_submit_ioend(
470 /* Pass 1 - start writeback */ 497 /* Pass 1 - start writeback */
471 do { 498 do {
472 next = ioend->io_list; 499 next = ioend->io_list;
473 for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) { 500 for (bh = ioend->io_buffer_head; bh; bh = bh->b_private)
474 xfs_start_buffer_writeback(bh); 501 xfs_start_buffer_writeback(bh);
475 }
476 } while ((ioend = next) != NULL); 502 } while ((ioend = next) != NULL);
477 503
478 /* Pass 2 - submit I/O */ 504 /* Pass 2 - submit I/O */
@@ -600,117 +626,13 @@ xfs_map_at_offset(
600 ASSERT(imap->br_startblock != HOLESTARTBLOCK); 626 ASSERT(imap->br_startblock != HOLESTARTBLOCK);
601 ASSERT(imap->br_startblock != DELAYSTARTBLOCK); 627 ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
602 628
603 lock_buffer(bh);
604 xfs_map_buffer(inode, bh, imap, offset); 629 xfs_map_buffer(inode, bh, imap, offset);
605 bh->b_bdev = xfs_find_bdev_for_inode(inode);
606 set_buffer_mapped(bh); 630 set_buffer_mapped(bh);
607 clear_buffer_delay(bh); 631 clear_buffer_delay(bh);
608 clear_buffer_unwritten(bh); 632 clear_buffer_unwritten(bh);
609} 633}
610 634
611/* 635/*
612 * Look for a page at index that is suitable for clustering.
613 */
614STATIC unsigned int
615xfs_probe_page(
616 struct page *page,
617 unsigned int pg_offset)
618{
619 struct buffer_head *bh, *head;
620 int ret = 0;
621
622 if (PageWriteback(page))
623 return 0;
624 if (!PageDirty(page))
625 return 0;
626 if (!page->mapping)
627 return 0;
628 if (!page_has_buffers(page))
629 return 0;
630
631 bh = head = page_buffers(page);
632 do {
633 if (!buffer_uptodate(bh))
634 break;
635 if (!buffer_mapped(bh))
636 break;
637 ret += bh->b_size;
638 if (ret >= pg_offset)
639 break;
640 } while ((bh = bh->b_this_page) != head);
641
642 return ret;
643}
644
645STATIC size_t
646xfs_probe_cluster(
647 struct inode *inode,
648 struct page *startpage,
649 struct buffer_head *bh,
650 struct buffer_head *head)
651{
652 struct pagevec pvec;
653 pgoff_t tindex, tlast, tloff;
654 size_t total = 0;
655 int done = 0, i;
656
657 /* First sum forwards in this page */
658 do {
659 if (!buffer_uptodate(bh) || !buffer_mapped(bh))
660 return total;
661 total += bh->b_size;
662 } while ((bh = bh->b_this_page) != head);
663
664 /* if we reached the end of the page, sum forwards in following pages */
665 tlast = i_size_read(inode) >> PAGE_CACHE_SHIFT;
666 tindex = startpage->index + 1;
667
668 /* Prune this back to avoid pathological behavior */
669 tloff = min(tlast, startpage->index + 64);
670
671 pagevec_init(&pvec, 0);
672 while (!done && tindex <= tloff) {
673 unsigned len = min_t(pgoff_t, PAGEVEC_SIZE, tlast - tindex + 1);
674
675 if (!pagevec_lookup(&pvec, inode->i_mapping, tindex, len))
676 break;
677
678 for (i = 0; i < pagevec_count(&pvec); i++) {
679 struct page *page = pvec.pages[i];
680 size_t pg_offset, pg_len = 0;
681
682 if (tindex == tlast) {
683 pg_offset =
684 i_size_read(inode) & (PAGE_CACHE_SIZE - 1);
685 if (!pg_offset) {
686 done = 1;
687 break;
688 }
689 } else
690 pg_offset = PAGE_CACHE_SIZE;
691
692 if (page->index == tindex && trylock_page(page)) {
693 pg_len = xfs_probe_page(page, pg_offset);
694 unlock_page(page);
695 }
696
697 if (!pg_len) {
698 done = 1;
699 break;
700 }
701
702 total += pg_len;
703 tindex++;
704 }
705
706 pagevec_release(&pvec);
707 cond_resched();
708 }
709
710 return total;
711}
712
713/*
714 * Test if a given page is suitable for writing as part of an unwritten 636 * Test if a given page is suitable for writing as part of an unwritten
715 * or delayed allocate extent. 637 * or delayed allocate extent.
716 */ 638 */
@@ -731,9 +653,9 @@ xfs_is_delayed_page(
731 if (buffer_unwritten(bh)) 653 if (buffer_unwritten(bh))
732 acceptable = (type == IO_UNWRITTEN); 654 acceptable = (type == IO_UNWRITTEN);
733 else if (buffer_delay(bh)) 655 else if (buffer_delay(bh))
734 acceptable = (type == IO_DELAY); 656 acceptable = (type == IO_DELALLOC);
735 else if (buffer_dirty(bh) && buffer_mapped(bh)) 657 else if (buffer_dirty(bh) && buffer_mapped(bh))
736 acceptable = (type == IO_NEW); 658 acceptable = (type == IO_OVERWRITE);
737 else 659 else
738 break; 660 break;
739 } while ((bh = bh->b_this_page) != head); 661 } while ((bh = bh->b_this_page) != head);
@@ -758,8 +680,7 @@ xfs_convert_page(
758 loff_t tindex, 680 loff_t tindex,
759 struct xfs_bmbt_irec *imap, 681 struct xfs_bmbt_irec *imap,
760 xfs_ioend_t **ioendp, 682 xfs_ioend_t **ioendp,
761 struct writeback_control *wbc, 683 struct writeback_control *wbc)
762 int all_bh)
763{ 684{
764 struct buffer_head *bh, *head; 685 struct buffer_head *bh, *head;
765 xfs_off_t end_offset; 686 xfs_off_t end_offset;
@@ -814,37 +735,30 @@ xfs_convert_page(
814 continue; 735 continue;
815 } 736 }
816 737
817 if (buffer_unwritten(bh) || buffer_delay(bh)) { 738 if (buffer_unwritten(bh) || buffer_delay(bh) ||
739 buffer_mapped(bh)) {
818 if (buffer_unwritten(bh)) 740 if (buffer_unwritten(bh))
819 type = IO_UNWRITTEN; 741 type = IO_UNWRITTEN;
742 else if (buffer_delay(bh))
743 type = IO_DELALLOC;
820 else 744 else
821 type = IO_DELAY; 745 type = IO_OVERWRITE;
822 746
823 if (!xfs_imap_valid(inode, imap, offset)) { 747 if (!xfs_imap_valid(inode, imap, offset)) {
824 done = 1; 748 done = 1;
825 continue; 749 continue;
826 } 750 }
827 751
828 ASSERT(imap->br_startblock != HOLESTARTBLOCK); 752 lock_buffer(bh);
829 ASSERT(imap->br_startblock != DELAYSTARTBLOCK); 753 if (type != IO_OVERWRITE)
830 754 xfs_map_at_offset(inode, bh, imap, offset);
831 xfs_map_at_offset(inode, bh, imap, offset);
832 xfs_add_to_ioend(inode, bh, offset, type, 755 xfs_add_to_ioend(inode, bh, offset, type,
833 ioendp, done); 756 ioendp, done);
834 757
835 page_dirty--; 758 page_dirty--;
836 count++; 759 count++;
837 } else { 760 } else {
838 type = IO_NEW; 761 done = 1;
839 if (buffer_mapped(bh) && all_bh) {
840 lock_buffer(bh);
841 xfs_add_to_ioend(inode, bh, offset,
842 type, ioendp, done);
843 count++;
844 page_dirty--;
845 } else {
846 done = 1;
847 }
848 } 762 }
849 } while (offset += len, (bh = bh->b_this_page) != head); 763 } while (offset += len, (bh = bh->b_this_page) != head);
850 764
@@ -876,7 +790,6 @@ xfs_cluster_write(
876 struct xfs_bmbt_irec *imap, 790 struct xfs_bmbt_irec *imap,
877 xfs_ioend_t **ioendp, 791 xfs_ioend_t **ioendp,
878 struct writeback_control *wbc, 792 struct writeback_control *wbc,
879 int all_bh,
880 pgoff_t tlast) 793 pgoff_t tlast)
881{ 794{
882 struct pagevec pvec; 795 struct pagevec pvec;
@@ -891,7 +804,7 @@ xfs_cluster_write(
891 804
892 for (i = 0; i < pagevec_count(&pvec); i++) { 805 for (i = 0; i < pagevec_count(&pvec); i++) {
893 done = xfs_convert_page(inode, pvec.pages[i], tindex++, 806 done = xfs_convert_page(inode, pvec.pages[i], tindex++,
894 imap, ioendp, wbc, all_bh); 807 imap, ioendp, wbc);
895 if (done) 808 if (done)
896 break; 809 break;
897 } 810 }
@@ -934,9 +847,8 @@ xfs_aops_discard_page(
934 struct xfs_inode *ip = XFS_I(inode); 847 struct xfs_inode *ip = XFS_I(inode);
935 struct buffer_head *bh, *head; 848 struct buffer_head *bh, *head;
936 loff_t offset = page_offset(page); 849 loff_t offset = page_offset(page);
937 ssize_t len = 1 << inode->i_blkbits;
938 850
939 if (!xfs_is_delayed_page(page, IO_DELAY)) 851 if (!xfs_is_delayed_page(page, IO_DELALLOC))
940 goto out_invalidate; 852 goto out_invalidate;
941 853
942 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) 854 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
@@ -949,58 +861,14 @@ xfs_aops_discard_page(
949 xfs_ilock(ip, XFS_ILOCK_EXCL); 861 xfs_ilock(ip, XFS_ILOCK_EXCL);
950 bh = head = page_buffers(page); 862 bh = head = page_buffers(page);
951 do { 863 do {
952 int done;
953 xfs_fileoff_t offset_fsb;
954 xfs_bmbt_irec_t imap;
955 int nimaps = 1;
956 int error; 864 int error;
957 xfs_fsblock_t firstblock; 865 xfs_fileoff_t start_fsb;
958 xfs_bmap_free_t flist;
959 866
960 if (!buffer_delay(bh)) 867 if (!buffer_delay(bh))
961 goto next_buffer; 868 goto next_buffer;
962 869
963 offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset); 870 start_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
964 871 error = xfs_bmap_punch_delalloc_range(ip, start_fsb, 1);
965 /*
966 * Map the range first and check that it is a delalloc extent
967 * before trying to unmap the range. Otherwise we will be
968 * trying to remove a real extent (which requires a
969 * transaction) or a hole, which is probably a bad idea...
970 */
971 error = xfs_bmapi(NULL, ip, offset_fsb, 1,
972 XFS_BMAPI_ENTIRE, NULL, 0, &imap,
973 &nimaps, NULL);
974
975 if (error) {
976 /* something screwed, just bail */
977 if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
978 xfs_fs_cmn_err(CE_ALERT, ip->i_mount,
979 "page discard failed delalloc mapping lookup.");
980 }
981 break;
982 }
983 if (!nimaps) {
984 /* nothing there */
985 goto next_buffer;
986 }
987 if (imap.br_startblock != DELAYSTARTBLOCK) {
988 /* been converted, ignore */
989 goto next_buffer;
990 }
991 WARN_ON(imap.br_blockcount == 0);
992
993 /*
994 * Note: while we initialise the firstblock/flist pair, they
995 * should never be used because blocks should never be
996 * allocated or freed for a delalloc extent and hence we need
997 * don't cancel or finish them after the xfs_bunmapi() call.
998 */
999 xfs_bmap_init(&flist, &firstblock);
1000 error = xfs_bunmapi(NULL, ip, offset_fsb, 1, 0, 1, &firstblock,
1001 &flist, &done);
1002
1003 ASSERT(!flist.xbf_count && !flist.xbf_first);
1004 if (error) { 872 if (error) {
1005 /* something screwed, just bail */ 873 /* something screwed, just bail */
1006 if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { 874 if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
@@ -1010,7 +878,7 @@ xfs_aops_discard_page(
1010 break; 878 break;
1011 } 879 }
1012next_buffer: 880next_buffer:
1013 offset += len; 881 offset += 1 << inode->i_blkbits;
1014 882
1015 } while ((bh = bh->b_this_page) != head); 883 } while ((bh = bh->b_this_page) != head);
1016 884
@@ -1047,10 +915,10 @@ xfs_vm_writepage(
1047 unsigned int type; 915 unsigned int type;
1048 __uint64_t end_offset; 916 __uint64_t end_offset;
1049 pgoff_t end_index, last_index; 917 pgoff_t end_index, last_index;
1050 ssize_t size, len; 918 ssize_t len;
1051 int flags, err, imap_valid = 0, uptodate = 1; 919 int err, imap_valid = 0, uptodate = 1;
1052 int count = 0; 920 int count = 0;
1053 int all_bh = 0; 921 int nonblocking = 0;
1054 922
1055 trace_xfs_writepage(inode, page, 0); 923 trace_xfs_writepage(inode, page, 0);
1056 924
@@ -1101,109 +969,78 @@ xfs_vm_writepage(
1101 969
1102 bh = head = page_buffers(page); 970 bh = head = page_buffers(page);
1103 offset = page_offset(page); 971 offset = page_offset(page);
1104 flags = BMAPI_READ; 972 type = IO_OVERWRITE;
1105 type = IO_NEW; 973
974 if (wbc->sync_mode == WB_SYNC_NONE && wbc->nonblocking)
975 nonblocking = 1;
1106 976
1107 do { 977 do {
978 int new_ioend = 0;
979
1108 if (offset >= end_offset) 980 if (offset >= end_offset)
1109 break; 981 break;
1110 if (!buffer_uptodate(bh)) 982 if (!buffer_uptodate(bh))
1111 uptodate = 0; 983 uptodate = 0;
1112 984
1113 /* 985 /*
1114 * A hole may still be marked uptodate because discard_buffer 986 * set_page_dirty dirties all buffers in a page, independent
1115 * leaves the flag set. 987 * of their state. The dirty state however is entirely
988 * meaningless for holes (!mapped && uptodate), so skip
989 * buffers covering holes here.
1116 */ 990 */
1117 if (!buffer_mapped(bh) && buffer_uptodate(bh)) { 991 if (!buffer_mapped(bh) && buffer_uptodate(bh)) {
1118 ASSERT(!buffer_dirty(bh));
1119 imap_valid = 0; 992 imap_valid = 0;
1120 continue; 993 continue;
1121 } 994 }
1122 995
1123 if (imap_valid) 996 if (buffer_unwritten(bh)) {
1124 imap_valid = xfs_imap_valid(inode, &imap, offset); 997 if (type != IO_UNWRITTEN) {
1125
1126 if (buffer_unwritten(bh) || buffer_delay(bh)) {
1127 int new_ioend = 0;
1128
1129 /*
1130 * Make sure we don't use a read-only iomap
1131 */
1132 if (flags == BMAPI_READ)
1133 imap_valid = 0;
1134
1135 if (buffer_unwritten(bh)) {
1136 type = IO_UNWRITTEN; 998 type = IO_UNWRITTEN;
1137 flags = BMAPI_WRITE | BMAPI_IGNSTATE; 999 imap_valid = 0;
1138 } else if (buffer_delay(bh)) {
1139 type = IO_DELAY;
1140 flags = BMAPI_ALLOCATE;
1141
1142 if (wbc->sync_mode == WB_SYNC_NONE)
1143 flags |= BMAPI_TRYLOCK;
1144 }
1145
1146 if (!imap_valid) {
1147 /*
1148 * If we didn't have a valid mapping then we
1149 * need to ensure that we put the new mapping
1150 * in a new ioend structure. This needs to be
1151 * done to ensure that the ioends correctly
1152 * reflect the block mappings at io completion
1153 * for unwritten extent conversion.
1154 */
1155 new_ioend = 1;
1156 err = xfs_map_blocks(inode, offset, len,
1157 &imap, flags);
1158 if (err)
1159 goto error;
1160 imap_valid = xfs_imap_valid(inode, &imap,
1161 offset);
1162 } 1000 }
1163 if (imap_valid) { 1001 } else if (buffer_delay(bh)) {
1164 xfs_map_at_offset(inode, bh, &imap, offset); 1002 if (type != IO_DELALLOC) {
1165 xfs_add_to_ioend(inode, bh, offset, type, 1003 type = IO_DELALLOC;
1166 &ioend, new_ioend); 1004 imap_valid = 0;
1167 count++;
1168 } 1005 }
1169 } else if (buffer_uptodate(bh)) { 1006 } else if (buffer_uptodate(bh)) {
1170 /* 1007 if (type != IO_OVERWRITE) {
1171 * we got here because the buffer is already mapped. 1008 type = IO_OVERWRITE;
1172 * That means it must already have extents allocated 1009 imap_valid = 0;
1173 * underneath it. Map the extent by reading it. 1010 }
1174 */ 1011 } else {
1175 if (!imap_valid || flags != BMAPI_READ) { 1012 if (PageUptodate(page)) {
1176 flags = BMAPI_READ; 1013 ASSERT(buffer_mapped(bh));
1177 size = xfs_probe_cluster(inode, page, bh, head); 1014 imap_valid = 0;
1178 err = xfs_map_blocks(inode, offset, size,
1179 &imap, flags);
1180 if (err)
1181 goto error;
1182 imap_valid = xfs_imap_valid(inode, &imap,
1183 offset);
1184 } 1015 }
1016 continue;
1017 }
1185 1018
1019 if (imap_valid)
1020 imap_valid = xfs_imap_valid(inode, &imap, offset);
1021 if (!imap_valid) {
1186 /* 1022 /*
1187 * We set the type to IO_NEW in case we are doing a 1023 * If we didn't have a valid mapping then we need to
1188 * small write at EOF that is extending the file but 1024 * put the new mapping into a separate ioend structure.
1189 * without needing an allocation. We need to update the 1025 * This ensures non-contiguous extents always have
1190 * file size on I/O completion in this case so it is 1026 * separate ioends, which is particularly important
1191 * the same case as having just allocated a new extent 1027 * for unwritten extent conversion at I/O completion
1192 * that we are writing into for the first time. 1028 * time.
1193 */ 1029 */
1194 type = IO_NEW; 1030 new_ioend = 1;
1195 if (trylock_buffer(bh)) { 1031 err = xfs_map_blocks(inode, offset, &imap, type,
1196 if (imap_valid) 1032 nonblocking);
1197 all_bh = 1; 1033 if (err)
1198 xfs_add_to_ioend(inode, bh, offset, type, 1034 goto error;
1199 &ioend, !imap_valid); 1035 imap_valid = xfs_imap_valid(inode, &imap, offset);
1200 count++; 1036 }
1201 } else { 1037 if (imap_valid) {
1202 imap_valid = 0; 1038 lock_buffer(bh);
1203 } 1039 if (type != IO_OVERWRITE)
1204 } else if (PageUptodate(page)) { 1040 xfs_map_at_offset(inode, bh, &imap, offset);
1205 ASSERT(buffer_mapped(bh)); 1041 xfs_add_to_ioend(inode, bh, offset, type, &ioend,
1206 imap_valid = 0; 1042 new_ioend);
1043 count++;
1207 } 1044 }
1208 1045
1209 if (!iohead) 1046 if (!iohead)
@@ -1232,7 +1069,7 @@ xfs_vm_writepage(
1232 end_index = last_index; 1069 end_index = last_index;
1233 1070
1234 xfs_cluster_write(inode, page->index + 1, &imap, &ioend, 1071 xfs_cluster_write(inode, page->index + 1, &imap, &ioend,
1235 wbc, all_bh, end_index); 1072 wbc, end_index);
1236 } 1073 }
1237 1074
1238 if (iohead) 1075 if (iohead)
@@ -1301,13 +1138,19 @@ __xfs_get_blocks(
1301 int create, 1138 int create,
1302 int direct) 1139 int direct)
1303{ 1140{
1304 int flags = create ? BMAPI_WRITE : BMAPI_READ; 1141 struct xfs_inode *ip = XFS_I(inode);
1142 struct xfs_mount *mp = ip->i_mount;
1143 xfs_fileoff_t offset_fsb, end_fsb;
1144 int error = 0;
1145 int lockmode = 0;
1305 struct xfs_bmbt_irec imap; 1146 struct xfs_bmbt_irec imap;
1147 int nimaps = 1;
1306 xfs_off_t offset; 1148 xfs_off_t offset;
1307 ssize_t size; 1149 ssize_t size;
1308 int nimap = 1;
1309 int new = 0; 1150 int new = 0;
1310 int error; 1151
1152 if (XFS_FORCED_SHUTDOWN(mp))
1153 return -XFS_ERROR(EIO);
1311 1154
1312 offset = (xfs_off_t)iblock << inode->i_blkbits; 1155 offset = (xfs_off_t)iblock << inode->i_blkbits;
1313 ASSERT(bh_result->b_size >= (1 << inode->i_blkbits)); 1156 ASSERT(bh_result->b_size >= (1 << inode->i_blkbits));
@@ -1316,15 +1159,45 @@ __xfs_get_blocks(
1316 if (!create && direct && offset >= i_size_read(inode)) 1159 if (!create && direct && offset >= i_size_read(inode))
1317 return 0; 1160 return 0;
1318 1161
1319 if (direct && create) 1162 if (create) {
1320 flags |= BMAPI_DIRECT; 1163 lockmode = XFS_ILOCK_EXCL;
1164 xfs_ilock(ip, lockmode);
1165 } else {
1166 lockmode = xfs_ilock_map_shared(ip);
1167 }
1168
1169 ASSERT(offset <= mp->m_maxioffset);
1170 if (offset + size > mp->m_maxioffset)
1171 size = mp->m_maxioffset - offset;
1172 end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + size);
1173 offset_fsb = XFS_B_TO_FSBT(mp, offset);
1321 1174
1322 error = xfs_iomap(XFS_I(inode), offset, size, flags, &imap, &nimap, 1175 error = xfs_bmapi(NULL, ip, offset_fsb, end_fsb - offset_fsb,
1323 &new); 1176 XFS_BMAPI_ENTIRE, NULL, 0, &imap, &nimaps, NULL);
1324 if (error) 1177 if (error)
1325 return -error; 1178 goto out_unlock;
1326 if (nimap == 0) 1179
1327 return 0; 1180 if (create &&
1181 (!nimaps ||
1182 (imap.br_startblock == HOLESTARTBLOCK ||
1183 imap.br_startblock == DELAYSTARTBLOCK))) {
1184 if (direct) {
1185 error = xfs_iomap_write_direct(ip, offset, size,
1186 &imap, nimaps);
1187 } else {
1188 error = xfs_iomap_write_delay(ip, offset, size, &imap);
1189 }
1190 if (error)
1191 goto out_unlock;
1192
1193 trace_xfs_get_blocks_alloc(ip, offset, size, 0, &imap);
1194 } else if (nimaps) {
1195 trace_xfs_get_blocks_found(ip, offset, size, 0, &imap);
1196 } else {
1197 trace_xfs_get_blocks_notfound(ip, offset, size);
1198 goto out_unlock;
1199 }
1200 xfs_iunlock(ip, lockmode);
1328 1201
1329 if (imap.br_startblock != HOLESTARTBLOCK && 1202 if (imap.br_startblock != HOLESTARTBLOCK &&
1330 imap.br_startblock != DELAYSTARTBLOCK) { 1203 imap.br_startblock != DELAYSTARTBLOCK) {
@@ -1391,6 +1264,10 @@ __xfs_get_blocks(
1391 } 1264 }
1392 1265
1393 return 0; 1266 return 0;
1267
1268out_unlock:
1269 xfs_iunlock(ip, lockmode);
1270 return -error;
1394} 1271}
1395 1272
1396int 1273int
@@ -1478,7 +1355,7 @@ xfs_vm_direct_IO(
1478 ssize_t ret; 1355 ssize_t ret;
1479 1356
1480 if (rw & WRITE) { 1357 if (rw & WRITE) {
1481 iocb->private = xfs_alloc_ioend(inode, IO_NEW); 1358 iocb->private = xfs_alloc_ioend(inode, IO_DIRECT);
1482 1359
1483 ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov, 1360 ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov,
1484 offset, nr_segs, 1361 offset, nr_segs,
@@ -1504,11 +1381,42 @@ xfs_vm_write_failed(
1504 struct inode *inode = mapping->host; 1381 struct inode *inode = mapping->host;
1505 1382
1506 if (to > inode->i_size) { 1383 if (to > inode->i_size) {
1507 struct iattr ia = { 1384 /*
1508 .ia_valid = ATTR_SIZE | ATTR_FORCE, 1385 * punch out the delalloc blocks we have already allocated. We
1509 .ia_size = inode->i_size, 1386 * don't call xfs_setattr() to do this as we may be in the
1510 }; 1387 * middle of a multi-iovec write and so the vfs inode->i_size
1511 xfs_setattr(XFS_I(inode), &ia, XFS_ATTR_NOLOCK); 1388 * will not match the xfs ip->i_size and so it will zero too
1389 * much. Hence we jus truncate the page cache to zero what is
1390 * necessary and punch the delalloc blocks directly.
1391 */
1392 struct xfs_inode *ip = XFS_I(inode);
1393 xfs_fileoff_t start_fsb;
1394 xfs_fileoff_t end_fsb;
1395 int error;
1396
1397 truncate_pagecache(inode, to, inode->i_size);
1398
1399 /*
1400 * Check if there are any blocks that are outside of i_size
1401 * that need to be trimmed back.
1402 */
1403 start_fsb = XFS_B_TO_FSB(ip->i_mount, inode->i_size) + 1;
1404 end_fsb = XFS_B_TO_FSB(ip->i_mount, to);
1405 if (end_fsb <= start_fsb)
1406 return;
1407
1408 xfs_ilock(ip, XFS_ILOCK_EXCL);
1409 error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
1410 end_fsb - start_fsb);
1411 if (error) {
1412 /* something screwed, just bail */
1413 if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
1414 xfs_fs_cmn_err(CE_ALERT, ip->i_mount,
1415 "xfs_vm_write_failed: unable to clean up ino %lld",
1416 ip->i_ino);
1417 }
1418 }
1419 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1512 } 1420 }
1513} 1421}
1514 1422
diff --git a/fs/xfs/linux-2.6/xfs_aops.h b/fs/xfs/linux-2.6/xfs_aops.h
index c5057fb6237..71f721e1a71 100644
--- a/fs/xfs/linux-2.6/xfs_aops.h
+++ b/fs/xfs/linux-2.6/xfs_aops.h
@@ -23,6 +23,22 @@ extern struct workqueue_struct *xfsconvertd_workqueue;
23extern mempool_t *xfs_ioend_pool; 23extern mempool_t *xfs_ioend_pool;
24 24
25/* 25/*
26 * Types of I/O for bmap clustering and I/O completion tracking.
27 */
28enum {
29 IO_DIRECT = 0, /* special case for direct I/O ioends */
30 IO_DELALLOC, /* mapping covers delalloc region */
31 IO_UNWRITTEN, /* mapping covers allocated but uninitialized data */
32 IO_OVERWRITE, /* mapping covers already allocated extent */
33};
34
35#define XFS_IO_TYPES \
36 { 0, "" }, \
37 { IO_DELALLOC, "delalloc" }, \
38 { IO_UNWRITTEN, "unwritten" }, \
39 { IO_OVERWRITE, "overwrite" }
40
41/*
26 * xfs_ioend struct manages large extent writes for XFS. 42 * xfs_ioend struct manages large extent writes for XFS.
27 * It can manage several multi-page bio's at once. 43 * It can manage several multi-page bio's at once.
28 */ 44 */
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 63fd2c07cb5..ac1c7e8378d 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -44,12 +44,7 @@
44 44
45static kmem_zone_t *xfs_buf_zone; 45static kmem_zone_t *xfs_buf_zone;
46STATIC int xfsbufd(void *); 46STATIC int xfsbufd(void *);
47STATIC int xfsbufd_wakeup(struct shrinker *, int, gfp_t);
48STATIC void xfs_buf_delwri_queue(xfs_buf_t *, int); 47STATIC void xfs_buf_delwri_queue(xfs_buf_t *, int);
49static struct shrinker xfs_buf_shake = {
50 .shrink = xfsbufd_wakeup,
51 .seeks = DEFAULT_SEEKS,
52};
53 48
54static struct workqueue_struct *xfslogd_workqueue; 49static struct workqueue_struct *xfslogd_workqueue;
55struct workqueue_struct *xfsdatad_workqueue; 50struct workqueue_struct *xfsdatad_workqueue;
@@ -168,8 +163,79 @@ test_page_region(
168} 163}
169 164
170/* 165/*
171 * Internal xfs_buf_t object manipulation 166 * xfs_buf_lru_add - add a buffer to the LRU.
167 *
168 * The LRU takes a new reference to the buffer so that it will only be freed
169 * once the shrinker takes the buffer off the LRU.
172 */ 170 */
171STATIC void
172xfs_buf_lru_add(
173 struct xfs_buf *bp)
174{
175 struct xfs_buftarg *btp = bp->b_target;
176
177 spin_lock(&btp->bt_lru_lock);
178 if (list_empty(&bp->b_lru)) {
179 atomic_inc(&bp->b_hold);
180 list_add_tail(&bp->b_lru, &btp->bt_lru);
181 btp->bt_lru_nr++;
182 }
183 spin_unlock(&btp->bt_lru_lock);
184}
185
186/*
187 * xfs_buf_lru_del - remove a buffer from the LRU
188 *
189 * The unlocked check is safe here because it only occurs when there are not
190 * b_lru_ref counts left on the buffer under the pag->pag_buf_lock. It is there
191 * to optimise the shrinker removing the buffer from the LRU and calling
192 * xfs_buf_free(). i.e. it removes an unnecessary round trip on the
193 * bt_lru_lock.
194 */
195STATIC void
196xfs_buf_lru_del(
197 struct xfs_buf *bp)
198{
199 struct xfs_buftarg *btp = bp->b_target;
200
201 if (list_empty(&bp->b_lru))
202 return;
203
204 spin_lock(&btp->bt_lru_lock);
205 if (!list_empty(&bp->b_lru)) {
206 list_del_init(&bp->b_lru);
207 btp->bt_lru_nr--;
208 }
209 spin_unlock(&btp->bt_lru_lock);
210}
211
212/*
213 * When we mark a buffer stale, we remove the buffer from the LRU and clear the
214 * b_lru_ref count so that the buffer is freed immediately when the buffer
215 * reference count falls to zero. If the buffer is already on the LRU, we need
216 * to remove the reference that LRU holds on the buffer.
217 *
218 * This prevents build-up of stale buffers on the LRU.
219 */
220void
221xfs_buf_stale(
222 struct xfs_buf *bp)
223{
224 bp->b_flags |= XBF_STALE;
225 atomic_set(&(bp)->b_lru_ref, 0);
226 if (!list_empty(&bp->b_lru)) {
227 struct xfs_buftarg *btp = bp->b_target;
228
229 spin_lock(&btp->bt_lru_lock);
230 if (!list_empty(&bp->b_lru)) {
231 list_del_init(&bp->b_lru);
232 btp->bt_lru_nr--;
233 atomic_dec(&bp->b_hold);
234 }
235 spin_unlock(&btp->bt_lru_lock);
236 }
237 ASSERT(atomic_read(&bp->b_hold) >= 1);
238}
173 239
174STATIC void 240STATIC void
175_xfs_buf_initialize( 241_xfs_buf_initialize(
@@ -186,7 +252,9 @@ _xfs_buf_initialize(
186 252
187 memset(bp, 0, sizeof(xfs_buf_t)); 253 memset(bp, 0, sizeof(xfs_buf_t));
188 atomic_set(&bp->b_hold, 1); 254 atomic_set(&bp->b_hold, 1);
255 atomic_set(&bp->b_lru_ref, 1);
189 init_completion(&bp->b_iowait); 256 init_completion(&bp->b_iowait);
257 INIT_LIST_HEAD(&bp->b_lru);
190 INIT_LIST_HEAD(&bp->b_list); 258 INIT_LIST_HEAD(&bp->b_list);
191 RB_CLEAR_NODE(&bp->b_rbnode); 259 RB_CLEAR_NODE(&bp->b_rbnode);
192 sema_init(&bp->b_sema, 0); /* held, no waiters */ 260 sema_init(&bp->b_sema, 0); /* held, no waiters */
@@ -262,6 +330,8 @@ xfs_buf_free(
262{ 330{
263 trace_xfs_buf_free(bp, _RET_IP_); 331 trace_xfs_buf_free(bp, _RET_IP_);
264 332
333 ASSERT(list_empty(&bp->b_lru));
334
265 if (bp->b_flags & (_XBF_PAGE_CACHE|_XBF_PAGES)) { 335 if (bp->b_flags & (_XBF_PAGE_CACHE|_XBF_PAGES)) {
266 uint i; 336 uint i;
267 337
@@ -337,7 +407,6 @@ _xfs_buf_lookup_pages(
337 __func__, gfp_mask); 407 __func__, gfp_mask);
338 408
339 XFS_STATS_INC(xb_page_retries); 409 XFS_STATS_INC(xb_page_retries);
340 xfsbufd_wakeup(NULL, 0, gfp_mask);
341 congestion_wait(BLK_RW_ASYNC, HZ/50); 410 congestion_wait(BLK_RW_ASYNC, HZ/50);
342 goto retry; 411 goto retry;
343 } 412 }
@@ -488,29 +557,16 @@ found:
488 spin_unlock(&pag->pag_buf_lock); 557 spin_unlock(&pag->pag_buf_lock);
489 xfs_perag_put(pag); 558 xfs_perag_put(pag);
490 559
491 /* Attempt to get the semaphore without sleeping, 560 if (xfs_buf_cond_lock(bp)) {
492 * if this does not work then we need to drop the 561 /* failed, so wait for the lock if requested. */
493 * spinlock and do a hard attempt on the semaphore.
494 */
495 if (down_trylock(&bp->b_sema)) {
496 if (!(flags & XBF_TRYLOCK)) { 562 if (!(flags & XBF_TRYLOCK)) {
497 /* wait for buffer ownership */
498 xfs_buf_lock(bp); 563 xfs_buf_lock(bp);
499 XFS_STATS_INC(xb_get_locked_waited); 564 XFS_STATS_INC(xb_get_locked_waited);
500 } else { 565 } else {
501 /* We asked for a trylock and failed, no need
502 * to look at file offset and length here, we
503 * know that this buffer at least overlaps our
504 * buffer and is locked, therefore our buffer
505 * either does not exist, or is this buffer.
506 */
507 xfs_buf_rele(bp); 566 xfs_buf_rele(bp);
508 XFS_STATS_INC(xb_busy_locked); 567 XFS_STATS_INC(xb_busy_locked);
509 return NULL; 568 return NULL;
510 } 569 }
511 } else {
512 /* trylock worked */
513 XB_SET_OWNER(bp);
514 } 570 }
515 571
516 if (bp->b_flags & XBF_STALE) { 572 if (bp->b_flags & XBF_STALE) {
@@ -840,7 +896,7 @@ xfs_buf_rele(
840 trace_xfs_buf_rele(bp, _RET_IP_); 896 trace_xfs_buf_rele(bp, _RET_IP_);
841 897
842 if (!pag) { 898 if (!pag) {
843 ASSERT(!bp->b_relse); 899 ASSERT(list_empty(&bp->b_lru));
844 ASSERT(RB_EMPTY_NODE(&bp->b_rbnode)); 900 ASSERT(RB_EMPTY_NODE(&bp->b_rbnode));
845 if (atomic_dec_and_test(&bp->b_hold)) 901 if (atomic_dec_and_test(&bp->b_hold))
846 xfs_buf_free(bp); 902 xfs_buf_free(bp);
@@ -848,13 +904,15 @@ xfs_buf_rele(
848 } 904 }
849 905
850 ASSERT(!RB_EMPTY_NODE(&bp->b_rbnode)); 906 ASSERT(!RB_EMPTY_NODE(&bp->b_rbnode));
907
851 ASSERT(atomic_read(&bp->b_hold) > 0); 908 ASSERT(atomic_read(&bp->b_hold) > 0);
852 if (atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock)) { 909 if (atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock)) {
853 if (bp->b_relse) { 910 if (!(bp->b_flags & XBF_STALE) &&
854 atomic_inc(&bp->b_hold); 911 atomic_read(&bp->b_lru_ref)) {
912 xfs_buf_lru_add(bp);
855 spin_unlock(&pag->pag_buf_lock); 913 spin_unlock(&pag->pag_buf_lock);
856 bp->b_relse(bp);
857 } else { 914 } else {
915 xfs_buf_lru_del(bp);
858 ASSERT(!(bp->b_flags & (XBF_DELWRI|_XBF_DELWRI_Q))); 916 ASSERT(!(bp->b_flags & (XBF_DELWRI|_XBF_DELWRI_Q)));
859 rb_erase(&bp->b_rbnode, &pag->pag_buf_tree); 917 rb_erase(&bp->b_rbnode, &pag->pag_buf_tree);
860 spin_unlock(&pag->pag_buf_lock); 918 spin_unlock(&pag->pag_buf_lock);
@@ -876,10 +934,18 @@ xfs_buf_rele(
876 */ 934 */
877 935
878/* 936/*
879 * Locks a buffer object, if it is not already locked. 937 * Locks a buffer object, if it is not already locked. Note that this in
880 * Note that this in no way locks the underlying pages, so it is only 938 * no way locks the underlying pages, so it is only useful for
881 * useful for synchronizing concurrent use of buffer objects, not for 939 * synchronizing concurrent use of buffer objects, not for synchronizing
882 * synchronizing independent access to the underlying pages. 940 * independent access to the underlying pages.
941 *
942 * If we come across a stale, pinned, locked buffer, we know that we are
943 * being asked to lock a buffer that has been reallocated. Because it is
944 * pinned, we know that the log has not been pushed to disk and hence it
945 * will still be locked. Rather than continuing to have trylock attempts
946 * fail until someone else pushes the log, push it ourselves before
947 * returning. This means that the xfsaild will not get stuck trying
948 * to push on stale inode buffers.
883 */ 949 */
884int 950int
885xfs_buf_cond_lock( 951xfs_buf_cond_lock(
@@ -890,6 +956,8 @@ xfs_buf_cond_lock(
890 locked = down_trylock(&bp->b_sema) == 0; 956 locked = down_trylock(&bp->b_sema) == 0;
891 if (locked) 957 if (locked)
892 XB_SET_OWNER(bp); 958 XB_SET_OWNER(bp);
959 else if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE))
960 xfs_log_force(bp->b_target->bt_mount, 0);
893 961
894 trace_xfs_buf_cond_lock(bp, _RET_IP_); 962 trace_xfs_buf_cond_lock(bp, _RET_IP_);
895 return locked ? 0 : -EBUSY; 963 return locked ? 0 : -EBUSY;
@@ -1441,51 +1509,84 @@ xfs_buf_iomove(
1441 */ 1509 */
1442 1510
1443/* 1511/*
1444 * Wait for any bufs with callbacks that have been submitted but 1512 * Wait for any bufs with callbacks that have been submitted but have not yet
1445 * have not yet returned... walk the hash list for the target. 1513 * returned. These buffers will have an elevated hold count, so wait on those
1514 * while freeing all the buffers only held by the LRU.
1446 */ 1515 */
1447void 1516void
1448xfs_wait_buftarg( 1517xfs_wait_buftarg(
1449 struct xfs_buftarg *btp) 1518 struct xfs_buftarg *btp)
1450{ 1519{
1451 struct xfs_perag *pag; 1520 struct xfs_buf *bp;
1452 uint i;
1453 1521
1454 for (i = 0; i < btp->bt_mount->m_sb.sb_agcount; i++) { 1522restart:
1455 pag = xfs_perag_get(btp->bt_mount, i); 1523 spin_lock(&btp->bt_lru_lock);
1456 spin_lock(&pag->pag_buf_lock); 1524 while (!list_empty(&btp->bt_lru)) {
1457 while (rb_first(&pag->pag_buf_tree)) { 1525 bp = list_first_entry(&btp->bt_lru, struct xfs_buf, b_lru);
1458 spin_unlock(&pag->pag_buf_lock); 1526 if (atomic_read(&bp->b_hold) > 1) {
1527 spin_unlock(&btp->bt_lru_lock);
1459 delay(100); 1528 delay(100);
1460 spin_lock(&pag->pag_buf_lock); 1529 goto restart;
1461 } 1530 }
1462 spin_unlock(&pag->pag_buf_lock); 1531 /*
1463 xfs_perag_put(pag); 1532 * clear the LRU reference count so the buffer doesn't get
1533 * ignored in xfs_buf_rele().
1534 */
1535 atomic_set(&bp->b_lru_ref, 0);
1536 spin_unlock(&btp->bt_lru_lock);
1537 xfs_buf_rele(bp);
1538 spin_lock(&btp->bt_lru_lock);
1464 } 1539 }
1540 spin_unlock(&btp->bt_lru_lock);
1465} 1541}
1466 1542
1467/* 1543int
1468 * buftarg list for delwrite queue processing 1544xfs_buftarg_shrink(
1469 */ 1545 struct shrinker *shrink,
1470static LIST_HEAD(xfs_buftarg_list); 1546 int nr_to_scan,
1471static DEFINE_SPINLOCK(xfs_buftarg_lock); 1547 gfp_t mask)
1472
1473STATIC void
1474xfs_register_buftarg(
1475 xfs_buftarg_t *btp)
1476{ 1548{
1477 spin_lock(&xfs_buftarg_lock); 1549 struct xfs_buftarg *btp = container_of(shrink,
1478 list_add(&btp->bt_list, &xfs_buftarg_list); 1550 struct xfs_buftarg, bt_shrinker);
1479 spin_unlock(&xfs_buftarg_lock); 1551 struct xfs_buf *bp;
1480} 1552 LIST_HEAD(dispose);
1481 1553
1482STATIC void 1554 if (!nr_to_scan)
1483xfs_unregister_buftarg( 1555 return btp->bt_lru_nr;
1484 xfs_buftarg_t *btp) 1556
1485{ 1557 spin_lock(&btp->bt_lru_lock);
1486 spin_lock(&xfs_buftarg_lock); 1558 while (!list_empty(&btp->bt_lru)) {
1487 list_del(&btp->bt_list); 1559 if (nr_to_scan-- <= 0)
1488 spin_unlock(&xfs_buftarg_lock); 1560 break;
1561
1562 bp = list_first_entry(&btp->bt_lru, struct xfs_buf, b_lru);
1563
1564 /*
1565 * Decrement the b_lru_ref count unless the value is already
1566 * zero. If the value is already zero, we need to reclaim the
1567 * buffer, otherwise it gets another trip through the LRU.
1568 */
1569 if (!atomic_add_unless(&bp->b_lru_ref, -1, 0)) {
1570 list_move_tail(&bp->b_lru, &btp->bt_lru);
1571 continue;
1572 }
1573
1574 /*
1575 * remove the buffer from the LRU now to avoid needing another
1576 * lock round trip inside xfs_buf_rele().
1577 */
1578 list_move(&bp->b_lru, &dispose);
1579 btp->bt_lru_nr--;
1580 }
1581 spin_unlock(&btp->bt_lru_lock);
1582
1583 while (!list_empty(&dispose)) {
1584 bp = list_first_entry(&dispose, struct xfs_buf, b_lru);
1585 list_del_init(&bp->b_lru);
1586 xfs_buf_rele(bp);
1587 }
1588
1589 return btp->bt_lru_nr;
1489} 1590}
1490 1591
1491void 1592void
@@ -1493,17 +1594,14 @@ xfs_free_buftarg(
1493 struct xfs_mount *mp, 1594 struct xfs_mount *mp,
1494 struct xfs_buftarg *btp) 1595 struct xfs_buftarg *btp)
1495{ 1596{
1597 unregister_shrinker(&btp->bt_shrinker);
1598
1496 xfs_flush_buftarg(btp, 1); 1599 xfs_flush_buftarg(btp, 1);
1497 if (mp->m_flags & XFS_MOUNT_BARRIER) 1600 if (mp->m_flags & XFS_MOUNT_BARRIER)
1498 xfs_blkdev_issue_flush(btp); 1601 xfs_blkdev_issue_flush(btp);
1499 iput(btp->bt_mapping->host); 1602 iput(btp->bt_mapping->host);
1500 1603
1501 /* Unregister the buftarg first so that we don't get a
1502 * wakeup finding a non-existent task
1503 */
1504 xfs_unregister_buftarg(btp);
1505 kthread_stop(btp->bt_task); 1604 kthread_stop(btp->bt_task);
1506
1507 kmem_free(btp); 1605 kmem_free(btp);
1508} 1606}
1509 1607
@@ -1600,20 +1698,13 @@ xfs_alloc_delwrite_queue(
1600 xfs_buftarg_t *btp, 1698 xfs_buftarg_t *btp,
1601 const char *fsname) 1699 const char *fsname)
1602{ 1700{
1603 int error = 0;
1604
1605 INIT_LIST_HEAD(&btp->bt_list);
1606 INIT_LIST_HEAD(&btp->bt_delwrite_queue); 1701 INIT_LIST_HEAD(&btp->bt_delwrite_queue);
1607 spin_lock_init(&btp->bt_delwrite_lock); 1702 spin_lock_init(&btp->bt_delwrite_lock);
1608 btp->bt_flags = 0; 1703 btp->bt_flags = 0;
1609 btp->bt_task = kthread_run(xfsbufd, btp, "xfsbufd/%s", fsname); 1704 btp->bt_task = kthread_run(xfsbufd, btp, "xfsbufd/%s", fsname);
1610 if (IS_ERR(btp->bt_task)) { 1705 if (IS_ERR(btp->bt_task))
1611 error = PTR_ERR(btp->bt_task); 1706 return PTR_ERR(btp->bt_task);
1612 goto out_error; 1707 return 0;
1613 }
1614 xfs_register_buftarg(btp);
1615out_error:
1616 return error;
1617} 1708}
1618 1709
1619xfs_buftarg_t * 1710xfs_buftarg_t *
@@ -1630,12 +1721,17 @@ xfs_alloc_buftarg(
1630 btp->bt_mount = mp; 1721 btp->bt_mount = mp;
1631 btp->bt_dev = bdev->bd_dev; 1722 btp->bt_dev = bdev->bd_dev;
1632 btp->bt_bdev = bdev; 1723 btp->bt_bdev = bdev;
1724 INIT_LIST_HEAD(&btp->bt_lru);
1725 spin_lock_init(&btp->bt_lru_lock);
1633 if (xfs_setsize_buftarg_early(btp, bdev)) 1726 if (xfs_setsize_buftarg_early(btp, bdev))
1634 goto error; 1727 goto error;
1635 if (xfs_mapping_buftarg(btp, bdev)) 1728 if (xfs_mapping_buftarg(btp, bdev))
1636 goto error; 1729 goto error;
1637 if (xfs_alloc_delwrite_queue(btp, fsname)) 1730 if (xfs_alloc_delwrite_queue(btp, fsname))
1638 goto error; 1731 goto error;
1732 btp->bt_shrinker.shrink = xfs_buftarg_shrink;
1733 btp->bt_shrinker.seeks = DEFAULT_SEEKS;
1734 register_shrinker(&btp->bt_shrinker);
1639 return btp; 1735 return btp;
1640 1736
1641error: 1737error:
@@ -1740,27 +1836,6 @@ xfs_buf_runall_queues(
1740 flush_workqueue(queue); 1836 flush_workqueue(queue);
1741} 1837}
1742 1838
1743STATIC int
1744xfsbufd_wakeup(
1745 struct shrinker *shrink,
1746 int priority,
1747 gfp_t mask)
1748{
1749 xfs_buftarg_t *btp;
1750
1751 spin_lock(&xfs_buftarg_lock);
1752 list_for_each_entry(btp, &xfs_buftarg_list, bt_list) {
1753 if (test_bit(XBT_FORCE_SLEEP, &btp->bt_flags))
1754 continue;
1755 if (list_empty(&btp->bt_delwrite_queue))
1756 continue;
1757 set_bit(XBT_FORCE_FLUSH, &btp->bt_flags);
1758 wake_up_process(btp->bt_task);
1759 }
1760 spin_unlock(&xfs_buftarg_lock);
1761 return 0;
1762}
1763
1764/* 1839/*
1765 * Move as many buffers as specified to the supplied list 1840 * Move as many buffers as specified to the supplied list
1766 * indicating if we skipped any buffers to prevent deadlocks. 1841 * indicating if we skipped any buffers to prevent deadlocks.
@@ -1781,7 +1856,6 @@ xfs_buf_delwri_split(
1781 INIT_LIST_HEAD(list); 1856 INIT_LIST_HEAD(list);
1782 spin_lock(dwlk); 1857 spin_lock(dwlk);
1783 list_for_each_entry_safe(bp, n, dwq, b_list) { 1858 list_for_each_entry_safe(bp, n, dwq, b_list) {
1784 trace_xfs_buf_delwri_split(bp, _RET_IP_);
1785 ASSERT(bp->b_flags & XBF_DELWRI); 1859 ASSERT(bp->b_flags & XBF_DELWRI);
1786 1860
1787 if (!XFS_BUF_ISPINNED(bp) && !xfs_buf_cond_lock(bp)) { 1861 if (!XFS_BUF_ISPINNED(bp) && !xfs_buf_cond_lock(bp)) {
@@ -1795,6 +1869,7 @@ xfs_buf_delwri_split(
1795 _XBF_RUN_QUEUES); 1869 _XBF_RUN_QUEUES);
1796 bp->b_flags |= XBF_WRITE; 1870 bp->b_flags |= XBF_WRITE;
1797 list_move_tail(&bp->b_list, list); 1871 list_move_tail(&bp->b_list, list);
1872 trace_xfs_buf_delwri_split(bp, _RET_IP_);
1798 } else 1873 } else
1799 skipped++; 1874 skipped++;
1800 } 1875 }
@@ -1955,7 +2030,6 @@ xfs_buf_init(void)
1955 if (!xfsconvertd_workqueue) 2030 if (!xfsconvertd_workqueue)
1956 goto out_destroy_xfsdatad_workqueue; 2031 goto out_destroy_xfsdatad_workqueue;
1957 2032
1958 register_shrinker(&xfs_buf_shake);
1959 return 0; 2033 return 0;
1960 2034
1961 out_destroy_xfsdatad_workqueue: 2035 out_destroy_xfsdatad_workqueue:
@@ -1971,7 +2045,6 @@ xfs_buf_init(void)
1971void 2045void
1972xfs_buf_terminate(void) 2046xfs_buf_terminate(void)
1973{ 2047{
1974 unregister_shrinker(&xfs_buf_shake);
1975 destroy_workqueue(xfsconvertd_workqueue); 2048 destroy_workqueue(xfsconvertd_workqueue);
1976 destroy_workqueue(xfsdatad_workqueue); 2049 destroy_workqueue(xfsdatad_workqueue);
1977 destroy_workqueue(xfslogd_workqueue); 2050 destroy_workqueue(xfslogd_workqueue);
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index 383a3f37cf9..cbe65950e52 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -128,10 +128,15 @@ typedef struct xfs_buftarg {
128 128
129 /* per device delwri queue */ 129 /* per device delwri queue */
130 struct task_struct *bt_task; 130 struct task_struct *bt_task;
131 struct list_head bt_list;
132 struct list_head bt_delwrite_queue; 131 struct list_head bt_delwrite_queue;
133 spinlock_t bt_delwrite_lock; 132 spinlock_t bt_delwrite_lock;
134 unsigned long bt_flags; 133 unsigned long bt_flags;
134
135 /* LRU control structures */
136 struct shrinker bt_shrinker;
137 struct list_head bt_lru;
138 spinlock_t bt_lru_lock;
139 unsigned int bt_lru_nr;
135} xfs_buftarg_t; 140} xfs_buftarg_t;
136 141
137/* 142/*
@@ -147,8 +152,6 @@ typedef struct xfs_buftarg {
147 152
148struct xfs_buf; 153struct xfs_buf;
149typedef void (*xfs_buf_iodone_t)(struct xfs_buf *); 154typedef void (*xfs_buf_iodone_t)(struct xfs_buf *);
150typedef void (*xfs_buf_relse_t)(struct xfs_buf *);
151typedef int (*xfs_buf_bdstrat_t)(struct xfs_buf *);
152 155
153#define XB_PAGES 2 156#define XB_PAGES 2
154 157
@@ -164,9 +167,11 @@ typedef struct xfs_buf {
164 xfs_off_t b_file_offset; /* offset in file */ 167 xfs_off_t b_file_offset; /* offset in file */
165 size_t b_buffer_length;/* size of buffer in bytes */ 168 size_t b_buffer_length;/* size of buffer in bytes */
166 atomic_t b_hold; /* reference count */ 169 atomic_t b_hold; /* reference count */
170 atomic_t b_lru_ref; /* lru reclaim ref count */
167 xfs_buf_flags_t b_flags; /* status flags */ 171 xfs_buf_flags_t b_flags; /* status flags */
168 struct semaphore b_sema; /* semaphore for lockables */ 172 struct semaphore b_sema; /* semaphore for lockables */
169 173
174 struct list_head b_lru; /* lru list */
170 wait_queue_head_t b_waiters; /* unpin waiters */ 175 wait_queue_head_t b_waiters; /* unpin waiters */
171 struct list_head b_list; 176 struct list_head b_list;
172 struct xfs_perag *b_pag; /* contains rbtree root */ 177 struct xfs_perag *b_pag; /* contains rbtree root */
@@ -176,7 +181,6 @@ typedef struct xfs_buf {
176 void *b_addr; /* virtual address of buffer */ 181 void *b_addr; /* virtual address of buffer */
177 struct work_struct b_iodone_work; 182 struct work_struct b_iodone_work;
178 xfs_buf_iodone_t b_iodone; /* I/O completion function */ 183 xfs_buf_iodone_t b_iodone; /* I/O completion function */
179 xfs_buf_relse_t b_relse; /* releasing function */
180 struct completion b_iowait; /* queue for I/O waiters */ 184 struct completion b_iowait; /* queue for I/O waiters */
181 void *b_fspriv; 185 void *b_fspriv;
182 void *b_fspriv2; 186 void *b_fspriv2;
@@ -264,7 +268,8 @@ extern void xfs_buf_terminate(void);
264#define XFS_BUF_ZEROFLAGS(bp) ((bp)->b_flags &= \ 268#define XFS_BUF_ZEROFLAGS(bp) ((bp)->b_flags &= \
265 ~(XBF_READ|XBF_WRITE|XBF_ASYNC|XBF_DELWRI|XBF_ORDERED)) 269 ~(XBF_READ|XBF_WRITE|XBF_ASYNC|XBF_DELWRI|XBF_ORDERED))
266 270
267#define XFS_BUF_STALE(bp) ((bp)->b_flags |= XBF_STALE) 271void xfs_buf_stale(struct xfs_buf *bp);
272#define XFS_BUF_STALE(bp) xfs_buf_stale(bp);
268#define XFS_BUF_UNSTALE(bp) ((bp)->b_flags &= ~XBF_STALE) 273#define XFS_BUF_UNSTALE(bp) ((bp)->b_flags &= ~XBF_STALE)
269#define XFS_BUF_ISSTALE(bp) ((bp)->b_flags & XBF_STALE) 274#define XFS_BUF_ISSTALE(bp) ((bp)->b_flags & XBF_STALE)
270#define XFS_BUF_SUPER_STALE(bp) do { \ 275#define XFS_BUF_SUPER_STALE(bp) do { \
@@ -315,7 +320,6 @@ extern void xfs_buf_terminate(void);
315#define XFS_BUF_FSPRIVATE2(bp, type) ((type)(bp)->b_fspriv2) 320#define XFS_BUF_FSPRIVATE2(bp, type) ((type)(bp)->b_fspriv2)
316#define XFS_BUF_SET_FSPRIVATE2(bp, val) ((bp)->b_fspriv2 = (void*)(val)) 321#define XFS_BUF_SET_FSPRIVATE2(bp, val) ((bp)->b_fspriv2 = (void*)(val))
317#define XFS_BUF_SET_START(bp) do { } while (0) 322#define XFS_BUF_SET_START(bp) do { } while (0)
318#define XFS_BUF_SET_BRELSE_FUNC(bp, func) ((bp)->b_relse = (func))
319 323
320#define XFS_BUF_PTR(bp) (xfs_caddr_t)((bp)->b_addr) 324#define XFS_BUF_PTR(bp) (xfs_caddr_t)((bp)->b_addr)
321#define XFS_BUF_SET_PTR(bp, val, cnt) xfs_buf_associate_memory(bp, val, cnt) 325#define XFS_BUF_SET_PTR(bp, val, cnt) xfs_buf_associate_memory(bp, val, cnt)
@@ -328,9 +332,15 @@ extern void xfs_buf_terminate(void);
328#define XFS_BUF_SIZE(bp) ((bp)->b_buffer_length) 332#define XFS_BUF_SIZE(bp) ((bp)->b_buffer_length)
329#define XFS_BUF_SET_SIZE(bp, cnt) ((bp)->b_buffer_length = (cnt)) 333#define XFS_BUF_SET_SIZE(bp, cnt) ((bp)->b_buffer_length = (cnt))
330 334
331#define XFS_BUF_SET_VTYPE_REF(bp, type, ref) do { } while (0) 335static inline void
336xfs_buf_set_ref(
337 struct xfs_buf *bp,
338 int lru_ref)
339{
340 atomic_set(&bp->b_lru_ref, lru_ref);
341}
342#define XFS_BUF_SET_VTYPE_REF(bp, type, ref) xfs_buf_set_ref(bp, ref)
332#define XFS_BUF_SET_VTYPE(bp, type) do { } while (0) 343#define XFS_BUF_SET_VTYPE(bp, type) do { } while (0)
333#define XFS_BUF_SET_REF(bp, ref) do { } while (0)
334 344
335#define XFS_BUF_ISPINNED(bp) atomic_read(&((bp)->b_pin_count)) 345#define XFS_BUF_ISPINNED(bp) atomic_read(&((bp)->b_pin_count))
336 346
@@ -346,8 +356,7 @@ extern void xfs_buf_terminate(void);
346 356
347static inline void xfs_buf_relse(xfs_buf_t *bp) 357static inline void xfs_buf_relse(xfs_buf_t *bp)
348{ 358{
349 if (!bp->b_relse) 359 xfs_buf_unlock(bp);
350 xfs_buf_unlock(bp);
351 xfs_buf_rele(bp); 360 xfs_buf_rele(bp);
352} 361}
353 362
diff --git a/fs/xfs/linux-2.6/xfs_discard.c b/fs/xfs/linux-2.6/xfs_discard.c
new file mode 100644
index 00000000000..05201ae719e
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_discard.c
@@ -0,0 +1,191 @@
1/*
2 * Copyright (C) 2010 Red Hat, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#include "xfs.h"
19#include "xfs_sb.h"
20#include "xfs_inum.h"
21#include "xfs_log.h"
22#include "xfs_ag.h"
23#include "xfs_mount.h"
24#include "xfs_quota.h"
25#include "xfs_trans.h"
26#include "xfs_alloc_btree.h"
27#include "xfs_bmap_btree.h"
28#include "xfs_ialloc_btree.h"
29#include "xfs_btree.h"
30#include "xfs_inode.h"
31#include "xfs_alloc.h"
32#include "xfs_error.h"
33#include "xfs_discard.h"
34#include "xfs_trace.h"
35
36STATIC int
37xfs_trim_extents(
38 struct xfs_mount *mp,
39 xfs_agnumber_t agno,
40 xfs_fsblock_t start,
41 xfs_fsblock_t len,
42 xfs_fsblock_t minlen,
43 __uint64_t *blocks_trimmed)
44{
45 struct block_device *bdev = mp->m_ddev_targp->bt_bdev;
46 struct xfs_btree_cur *cur;
47 struct xfs_buf *agbp;
48 struct xfs_perag *pag;
49 int error;
50 int i;
51
52 pag = xfs_perag_get(mp, agno);
53
54 error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp);
55 if (error || !agbp)
56 goto out_put_perag;
57
58 cur = xfs_allocbt_init_cursor(mp, NULL, agbp, agno, XFS_BTNUM_CNT);
59
60 /*
61 * Force out the log. This means any transactions that might have freed
62 * space before we took the AGF buffer lock are now on disk, and the
63 * volatile disk cache is flushed.
64 */
65 xfs_log_force(mp, XFS_LOG_SYNC);
66
67 /*
68 * Look up the longest btree in the AGF and start with it.
69 */
70 error = xfs_alloc_lookup_le(cur, 0,
71 XFS_BUF_TO_AGF(agbp)->agf_longest, &i);
72 if (error)
73 goto out_del_cursor;
74
75 /*
76 * Loop until we are done with all extents that are large
77 * enough to be worth discarding.
78 */
79 while (i) {
80 xfs_agblock_t fbno;
81 xfs_extlen_t flen;
82
83 error = xfs_alloc_get_rec(cur, &fbno, &flen, &i);
84 if (error)
85 goto out_del_cursor;
86 XFS_WANT_CORRUPTED_GOTO(i == 1, out_del_cursor);
87 ASSERT(flen <= XFS_BUF_TO_AGF(agbp)->agf_longest);
88
89 /*
90 * Too small? Give up.
91 */
92 if (flen < minlen) {
93 trace_xfs_discard_toosmall(mp, agno, fbno, flen);
94 goto out_del_cursor;
95 }
96
97 /*
98 * If the extent is entirely outside of the range we are
99 * supposed to discard skip it. Do not bother to trim
100 * down partially overlapping ranges for now.
101 */
102 if (XFS_AGB_TO_FSB(mp, agno, fbno) + flen < start ||
103 XFS_AGB_TO_FSB(mp, agno, fbno) >= start + len) {
104 trace_xfs_discard_exclude(mp, agno, fbno, flen);
105 goto next_extent;
106 }
107
108 /*
109 * If any blocks in the range are still busy, skip the
110 * discard and try again the next time.
111 */
112 if (xfs_alloc_busy_search(mp, agno, fbno, flen)) {
113 trace_xfs_discard_busy(mp, agno, fbno, flen);
114 goto next_extent;
115 }
116
117 trace_xfs_discard_extent(mp, agno, fbno, flen);
118 error = -blkdev_issue_discard(bdev,
119 XFS_AGB_TO_DADDR(mp, agno, fbno),
120 XFS_FSB_TO_BB(mp, flen),
121 GFP_NOFS, 0);
122 if (error)
123 goto out_del_cursor;
124 *blocks_trimmed += flen;
125
126next_extent:
127 error = xfs_btree_decrement(cur, 0, &i);
128 if (error)
129 goto out_del_cursor;
130 }
131
132out_del_cursor:
133 xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
134 xfs_buf_relse(agbp);
135out_put_perag:
136 xfs_perag_put(pag);
137 return error;
138}
139
140int
141xfs_ioc_trim(
142 struct xfs_mount *mp,
143 struct fstrim_range __user *urange)
144{
145 struct request_queue *q = mp->m_ddev_targp->bt_bdev->bd_disk->queue;
146 unsigned int granularity = q->limits.discard_granularity;
147 struct fstrim_range range;
148 xfs_fsblock_t start, len, minlen;
149 xfs_agnumber_t start_agno, end_agno, agno;
150 __uint64_t blocks_trimmed = 0;
151 int error, last_error = 0;
152
153 if (!capable(CAP_SYS_ADMIN))
154 return -XFS_ERROR(EPERM);
155 if (copy_from_user(&range, urange, sizeof(range)))
156 return -XFS_ERROR(EFAULT);
157
158 /*
159 * Truncating down the len isn't actually quite correct, but using
160 * XFS_B_TO_FSB would mean we trivially get overflows for values
161 * of ULLONG_MAX or slightly lower. And ULLONG_MAX is the default
162 * used by the fstrim application. In the end it really doesn't
163 * matter as trimming blocks is an advisory interface.
164 */
165 start = XFS_B_TO_FSBT(mp, range.start);
166 len = XFS_B_TO_FSBT(mp, range.len);
167 minlen = XFS_B_TO_FSB(mp, max_t(u64, granularity, range.minlen));
168
169 start_agno = XFS_FSB_TO_AGNO(mp, start);
170 if (start_agno >= mp->m_sb.sb_agcount)
171 return -XFS_ERROR(EINVAL);
172
173 end_agno = XFS_FSB_TO_AGNO(mp, start + len);
174 if (end_agno >= mp->m_sb.sb_agcount)
175 end_agno = mp->m_sb.sb_agcount - 1;
176
177 for (agno = start_agno; agno <= end_agno; agno++) {
178 error = -xfs_trim_extents(mp, agno, start, len, minlen,
179 &blocks_trimmed);
180 if (error)
181 last_error = error;
182 }
183
184 if (last_error)
185 return last_error;
186
187 range.len = XFS_FSB_TO_B(mp, blocks_trimmed);
188 if (copy_to_user(urange, &range, sizeof(range)))
189 return -XFS_ERROR(EFAULT);
190 return 0;
191}
diff --git a/fs/xfs/linux-2.6/xfs_discard.h b/fs/xfs/linux-2.6/xfs_discard.h
new file mode 100644
index 00000000000..e82b6dd3e12
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_discard.h
@@ -0,0 +1,8 @@
1#ifndef XFS_DISCARD_H
2#define XFS_DISCARD_H 1
3
4struct fstrim_range;
5
6extern int xfs_ioc_trim(struct xfs_mount *, struct fstrim_range __user *);
7
8#endif /* XFS_DISCARD_H */
diff --git a/fs/xfs/linux-2.6/xfs_export.c b/fs/xfs/linux-2.6/xfs_export.c
index 3764d74790e..fc0114da7fd 100644
--- a/fs/xfs/linux-2.6/xfs_export.c
+++ b/fs/xfs/linux-2.6/xfs_export.c
@@ -70,8 +70,16 @@ xfs_fs_encode_fh(
70 else 70 else
71 fileid_type = FILEID_INO32_GEN_PARENT; 71 fileid_type = FILEID_INO32_GEN_PARENT;
72 72
73 /* filesystem may contain 64bit inode numbers */ 73 /*
74 if (!(XFS_M(inode->i_sb)->m_flags & XFS_MOUNT_SMALL_INUMS)) 74 * If the filesystem may contain 64bit inode numbers, we need
75 * to use larger file handles that can represent them.
76 *
77 * While we only allocate inodes that do not fit into 32 bits any
78 * large enough filesystem may contain them, thus the slightly
79 * confusing looking conditional below.
80 */
81 if (!(XFS_M(inode->i_sb)->m_flags & XFS_MOUNT_SMALL_INUMS) ||
82 (XFS_M(inode->i_sb)->m_flags & XFS_MOUNT_32BITINODES))
75 fileid_type |= XFS_FILEID_TYPE_64FLAG; 83 fileid_type |= XFS_FILEID_TYPE_64FLAG;
76 84
77 /* 85 /*
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index ba8ad422a16..a55c1b46b21 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -37,10 +37,45 @@
37#include "xfs_trace.h" 37#include "xfs_trace.h"
38 38
39#include <linux/dcache.h> 39#include <linux/dcache.h>
40#include <linux/falloc.h>
40 41
41static const struct vm_operations_struct xfs_file_vm_ops; 42static const struct vm_operations_struct xfs_file_vm_ops;
42 43
43/* 44/*
45 * Locking primitives for read and write IO paths to ensure we consistently use
46 * and order the inode->i_mutex, ip->i_lock and ip->i_iolock.
47 */
48static inline void
49xfs_rw_ilock(
50 struct xfs_inode *ip,
51 int type)
52{
53 if (type & XFS_IOLOCK_EXCL)
54 mutex_lock(&VFS_I(ip)->i_mutex);
55 xfs_ilock(ip, type);
56}
57
58static inline void
59xfs_rw_iunlock(
60 struct xfs_inode *ip,
61 int type)
62{
63 xfs_iunlock(ip, type);
64 if (type & XFS_IOLOCK_EXCL)
65 mutex_unlock(&VFS_I(ip)->i_mutex);
66}
67
68static inline void
69xfs_rw_ilock_demote(
70 struct xfs_inode *ip,
71 int type)
72{
73 xfs_ilock_demote(ip, type);
74 if (type & XFS_IOLOCK_EXCL)
75 mutex_unlock(&VFS_I(ip)->i_mutex);
76}
77
78/*
44 * xfs_iozero 79 * xfs_iozero
45 * 80 *
46 * xfs_iozero clears the specified range of buffer supplied, 81 * xfs_iozero clears the specified range of buffer supplied,
@@ -262,22 +297,21 @@ xfs_file_aio_read(
262 if (XFS_FORCED_SHUTDOWN(mp)) 297 if (XFS_FORCED_SHUTDOWN(mp))
263 return -EIO; 298 return -EIO;
264 299
265 if (unlikely(ioflags & IO_ISDIRECT))
266 mutex_lock(&inode->i_mutex);
267 xfs_ilock(ip, XFS_IOLOCK_SHARED);
268
269 if (unlikely(ioflags & IO_ISDIRECT)) { 300 if (unlikely(ioflags & IO_ISDIRECT)) {
301 xfs_rw_ilock(ip, XFS_IOLOCK_EXCL);
302
270 if (inode->i_mapping->nrpages) { 303 if (inode->i_mapping->nrpages) {
271 ret = -xfs_flushinval_pages(ip, 304 ret = -xfs_flushinval_pages(ip,
272 (iocb->ki_pos & PAGE_CACHE_MASK), 305 (iocb->ki_pos & PAGE_CACHE_MASK),
273 -1, FI_REMAPF_LOCKED); 306 -1, FI_REMAPF_LOCKED);
307 if (ret) {
308 xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL);
309 return ret;
310 }
274 } 311 }
275 mutex_unlock(&inode->i_mutex); 312 xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
276 if (ret) { 313 } else
277 xfs_iunlock(ip, XFS_IOLOCK_SHARED); 314 xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
278 return ret;
279 }
280 }
281 315
282 trace_xfs_file_read(ip, size, iocb->ki_pos, ioflags); 316 trace_xfs_file_read(ip, size, iocb->ki_pos, ioflags);
283 317
@@ -285,7 +319,7 @@ xfs_file_aio_read(
285 if (ret > 0) 319 if (ret > 0)
286 XFS_STATS_ADD(xs_read_bytes, ret); 320 XFS_STATS_ADD(xs_read_bytes, ret);
287 321
288 xfs_iunlock(ip, XFS_IOLOCK_SHARED); 322 xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
289 return ret; 323 return ret;
290} 324}
291 325
@@ -309,7 +343,7 @@ xfs_file_splice_read(
309 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) 343 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
310 return -EIO; 344 return -EIO;
311 345
312 xfs_ilock(ip, XFS_IOLOCK_SHARED); 346 xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
313 347
314 trace_xfs_file_splice_read(ip, count, *ppos, ioflags); 348 trace_xfs_file_splice_read(ip, count, *ppos, ioflags);
315 349
@@ -317,10 +351,61 @@ xfs_file_splice_read(
317 if (ret > 0) 351 if (ret > 0)
318 XFS_STATS_ADD(xs_read_bytes, ret); 352 XFS_STATS_ADD(xs_read_bytes, ret);
319 353
320 xfs_iunlock(ip, XFS_IOLOCK_SHARED); 354 xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
321 return ret; 355 return ret;
322} 356}
323 357
358STATIC void
359xfs_aio_write_isize_update(
360 struct inode *inode,
361 loff_t *ppos,
362 ssize_t bytes_written)
363{
364 struct xfs_inode *ip = XFS_I(inode);
365 xfs_fsize_t isize = i_size_read(inode);
366
367 if (bytes_written > 0)
368 XFS_STATS_ADD(xs_write_bytes, bytes_written);
369
370 if (unlikely(bytes_written < 0 && bytes_written != -EFAULT &&
371 *ppos > isize))
372 *ppos = isize;
373
374 if (*ppos > ip->i_size) {
375 xfs_rw_ilock(ip, XFS_ILOCK_EXCL);
376 if (*ppos > ip->i_size)
377 ip->i_size = *ppos;
378 xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
379 }
380}
381
382/*
383 * If this was a direct or synchronous I/O that failed (such as ENOSPC) then
384 * part of the I/O may have been written to disk before the error occured. In
385 * this case the on-disk file size may have been adjusted beyond the in-memory
386 * file size and now needs to be truncated back.
387 */
388STATIC void
389xfs_aio_write_newsize_update(
390 struct xfs_inode *ip)
391{
392 if (ip->i_new_size) {
393 xfs_rw_ilock(ip, XFS_ILOCK_EXCL);
394 ip->i_new_size = 0;
395 if (ip->i_d.di_size > ip->i_size)
396 ip->i_d.di_size = ip->i_size;
397 xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
398 }
399}
400
401/*
402 * xfs_file_splice_write() does not use xfs_rw_ilock() because
403 * generic_file_splice_write() takes the i_mutex itself. This, in theory,
404 * couuld cause lock inversions between the aio_write path and the splice path
405 * if someone is doing concurrent splice(2) based writes and write(2) based
406 * writes to the same inode. The only real way to fix this is to re-implement
407 * the generic code here with correct locking orders.
408 */
324STATIC ssize_t 409STATIC ssize_t
325xfs_file_splice_write( 410xfs_file_splice_write(
326 struct pipe_inode_info *pipe, 411 struct pipe_inode_info *pipe,
@@ -331,7 +416,7 @@ xfs_file_splice_write(
331{ 416{
332 struct inode *inode = outfilp->f_mapping->host; 417 struct inode *inode = outfilp->f_mapping->host;
333 struct xfs_inode *ip = XFS_I(inode); 418 struct xfs_inode *ip = XFS_I(inode);
334 xfs_fsize_t isize, new_size; 419 xfs_fsize_t new_size;
335 int ioflags = 0; 420 int ioflags = 0;
336 ssize_t ret; 421 ssize_t ret;
337 422
@@ -355,27 +440,9 @@ xfs_file_splice_write(
355 trace_xfs_file_splice_write(ip, count, *ppos, ioflags); 440 trace_xfs_file_splice_write(ip, count, *ppos, ioflags);
356 441
357 ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags); 442 ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags);
358 if (ret > 0)
359 XFS_STATS_ADD(xs_write_bytes, ret);
360
361 isize = i_size_read(inode);
362 if (unlikely(ret < 0 && ret != -EFAULT && *ppos > isize))
363 *ppos = isize;
364
365 if (*ppos > ip->i_size) {
366 xfs_ilock(ip, XFS_ILOCK_EXCL);
367 if (*ppos > ip->i_size)
368 ip->i_size = *ppos;
369 xfs_iunlock(ip, XFS_ILOCK_EXCL);
370 }
371 443
372 if (ip->i_new_size) { 444 xfs_aio_write_isize_update(inode, ppos, ret);
373 xfs_ilock(ip, XFS_ILOCK_EXCL); 445 xfs_aio_write_newsize_update(ip);
374 ip->i_new_size = 0;
375 if (ip->i_d.di_size > ip->i_size)
376 ip->i_d.di_size = ip->i_size;
377 xfs_iunlock(ip, XFS_ILOCK_EXCL);
378 }
379 xfs_iunlock(ip, XFS_IOLOCK_EXCL); 446 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
380 return ret; 447 return ret;
381} 448}
@@ -562,247 +629,314 @@ out_lock:
562 return error; 629 return error;
563} 630}
564 631
632/*
633 * Common pre-write limit and setup checks.
634 *
635 * Returns with iolock held according to @iolock.
636 */
565STATIC ssize_t 637STATIC ssize_t
566xfs_file_aio_write( 638xfs_file_aio_write_checks(
567 struct kiocb *iocb, 639 struct file *file,
568 const struct iovec *iovp, 640 loff_t *pos,
569 unsigned long nr_segs, 641 size_t *count,
570 loff_t pos) 642 int *iolock)
571{ 643{
572 struct file *file = iocb->ki_filp; 644 struct inode *inode = file->f_mapping->host;
573 struct address_space *mapping = file->f_mapping;
574 struct inode *inode = mapping->host;
575 struct xfs_inode *ip = XFS_I(inode); 645 struct xfs_inode *ip = XFS_I(inode);
576 struct xfs_mount *mp = ip->i_mount; 646 xfs_fsize_t new_size;
577 ssize_t ret = 0, error = 0; 647 int error = 0;
578 int ioflags = 0;
579 xfs_fsize_t isize, new_size;
580 int iolock;
581 size_t ocount = 0, count;
582 int need_i_mutex;
583 648
584 XFS_STATS_INC(xs_write_calls); 649 error = generic_write_checks(file, pos, count, S_ISBLK(inode->i_mode));
650 if (error) {
651 xfs_rw_iunlock(ip, XFS_ILOCK_EXCL | *iolock);
652 *iolock = 0;
653 return error;
654 }
585 655
586 BUG_ON(iocb->ki_pos != pos); 656 new_size = *pos + *count;
657 if (new_size > ip->i_size)
658 ip->i_new_size = new_size;
587 659
588 if (unlikely(file->f_flags & O_DIRECT)) 660 if (likely(!(file->f_mode & FMODE_NOCMTIME)))
589 ioflags |= IO_ISDIRECT; 661 file_update_time(file);
590 if (file->f_mode & FMODE_NOCMTIME) 662
591 ioflags |= IO_INVIS; 663 /*
664 * If the offset is beyond the size of the file, we need to zero any
665 * blocks that fall between the existing EOF and the start of this
666 * write.
667 */
668 if (*pos > ip->i_size)
669 error = -xfs_zero_eof(ip, *pos, ip->i_size);
592 670
593 error = generic_segment_checks(iovp, &nr_segs, &ocount, VERIFY_READ); 671 xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
594 if (error) 672 if (error)
595 return error; 673 return error;
596 674
597 count = ocount; 675 /*
598 if (count == 0) 676 * If we're writing the file then make sure to clear the setuid and
599 return 0; 677 * setgid bits if the process is not being run by root. This keeps
600 678 * people from modifying setuid and setgid binaries.
601 xfs_wait_for_freeze(mp, SB_FREEZE_WRITE); 679 */
680 return file_remove_suid(file);
602 681
603 if (XFS_FORCED_SHUTDOWN(mp)) 682}
604 return -EIO;
605 683
606relock: 684/*
607 if (ioflags & IO_ISDIRECT) { 685 * xfs_file_dio_aio_write - handle direct IO writes
608 iolock = XFS_IOLOCK_SHARED; 686 *
609 need_i_mutex = 0; 687 * Lock the inode appropriately to prepare for and issue a direct IO write.
610 } else { 688 * By separating it from the buffered write path we remove all the tricky to
611 iolock = XFS_IOLOCK_EXCL; 689 * follow locking changes and looping.
612 need_i_mutex = 1; 690 *
613 mutex_lock(&inode->i_mutex); 691 * If there are cached pages or we're extending the file, we need IOLOCK_EXCL
692 * until we're sure the bytes at the new EOF have been zeroed and/or the cached
693 * pages are flushed out.
694 *
695 * In most cases the direct IO writes will be done holding IOLOCK_SHARED
696 * allowing them to be done in parallel with reads and other direct IO writes.
697 * However, if the IO is not aligned to filesystem blocks, the direct IO layer
698 * needs to do sub-block zeroing and that requires serialisation against other
699 * direct IOs to the same block. In this case we need to serialise the
700 * submission of the unaligned IOs so that we don't get racing block zeroing in
701 * the dio layer. To avoid the problem with aio, we also need to wait for
702 * outstanding IOs to complete so that unwritten extent conversion is completed
703 * before we try to map the overlapping block. This is currently implemented by
704 * hitting it with a big hammer (i.e. xfs_ioend_wait()).
705 *
706 * Returns with locks held indicated by @iolock and errors indicated by
707 * negative return values.
708 */
709STATIC ssize_t
710xfs_file_dio_aio_write(
711 struct kiocb *iocb,
712 const struct iovec *iovp,
713 unsigned long nr_segs,
714 loff_t pos,
715 size_t ocount,
716 int *iolock)
717{
718 struct file *file = iocb->ki_filp;
719 struct address_space *mapping = file->f_mapping;
720 struct inode *inode = mapping->host;
721 struct xfs_inode *ip = XFS_I(inode);
722 struct xfs_mount *mp = ip->i_mount;
723 ssize_t ret = 0;
724 size_t count = ocount;
725 int unaligned_io = 0;
726 struct xfs_buftarg *target = XFS_IS_REALTIME_INODE(ip) ?
727 mp->m_rtdev_targp : mp->m_ddev_targp;
728
729 *iolock = 0;
730 if ((pos & target->bt_smask) || (count & target->bt_smask))
731 return -XFS_ERROR(EINVAL);
732
733 if ((pos & mp->m_blockmask) || ((pos + count) & mp->m_blockmask))
734 unaligned_io = 1;
735
736 if (unaligned_io || mapping->nrpages || pos > ip->i_size)
737 *iolock = XFS_IOLOCK_EXCL;
738 else
739 *iolock = XFS_IOLOCK_SHARED;
740 xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock);
741
742 ret = xfs_file_aio_write_checks(file, &pos, &count, iolock);
743 if (ret)
744 return ret;
745
746 if (mapping->nrpages) {
747 WARN_ON(*iolock != XFS_IOLOCK_EXCL);
748 ret = -xfs_flushinval_pages(ip, (pos & PAGE_CACHE_MASK), -1,
749 FI_REMAPF_LOCKED);
750 if (ret)
751 return ret;
614 } 752 }
615 753
616 xfs_ilock(ip, XFS_ILOCK_EXCL|iolock); 754 /*
617 755 * If we are doing unaligned IO, wait for all other IO to drain,
618start: 756 * otherwise demote the lock if we had to flush cached pages
619 error = -generic_write_checks(file, &pos, &count, 757 */
620 S_ISBLK(inode->i_mode)); 758 if (unaligned_io)
621 if (error) { 759 xfs_ioend_wait(ip);
622 xfs_iunlock(ip, XFS_ILOCK_EXCL|iolock); 760 else if (*iolock == XFS_IOLOCK_EXCL) {
623 goto out_unlock_mutex; 761 xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
762 *iolock = XFS_IOLOCK_SHARED;
624 } 763 }
625 764
626 if (ioflags & IO_ISDIRECT) { 765 trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0);
627 xfs_buftarg_t *target = 766 ret = generic_file_direct_write(iocb, iovp,
628 XFS_IS_REALTIME_INODE(ip) ? 767 &nr_segs, pos, &iocb->ki_pos, count, ocount);
629 mp->m_rtdev_targp : mp->m_ddev_targp;
630 768
631 if ((pos & target->bt_smask) || (count & target->bt_smask)) { 769 /* No fallback to buffered IO on errors for XFS. */
632 xfs_iunlock(ip, XFS_ILOCK_EXCL|iolock); 770 ASSERT(ret < 0 || ret == count);
633 return XFS_ERROR(-EINVAL); 771 return ret;
634 } 772}
635 773
636 if (!need_i_mutex && (mapping->nrpages || pos > ip->i_size)) { 774STATIC ssize_t
637 xfs_iunlock(ip, XFS_ILOCK_EXCL|iolock); 775xfs_file_buffered_aio_write(
638 iolock = XFS_IOLOCK_EXCL; 776 struct kiocb *iocb,
639 need_i_mutex = 1; 777 const struct iovec *iovp,
640 mutex_lock(&inode->i_mutex); 778 unsigned long nr_segs,
641 xfs_ilock(ip, XFS_ILOCK_EXCL|iolock); 779 loff_t pos,
642 goto start; 780 size_t ocount,
643 } 781 int *iolock)
644 } 782{
783 struct file *file = iocb->ki_filp;
784 struct address_space *mapping = file->f_mapping;
785 struct inode *inode = mapping->host;
786 struct xfs_inode *ip = XFS_I(inode);
787 ssize_t ret;
788 int enospc = 0;
789 size_t count = ocount;
645 790
646 new_size = pos + count; 791 *iolock = XFS_IOLOCK_EXCL;
647 if (new_size > ip->i_size) 792 xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock);
648 ip->i_new_size = new_size;
649 793
650 if (likely(!(ioflags & IO_INVIS))) 794 ret = xfs_file_aio_write_checks(file, &pos, &count, iolock);
651 file_update_time(file); 795 if (ret)
796 return ret;
652 797
798 /* We can write back this queue in page reclaim */
799 current->backing_dev_info = mapping->backing_dev_info;
800
801write_retry:
802 trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, 0);
803 ret = generic_file_buffered_write(iocb, iovp, nr_segs,
804 pos, &iocb->ki_pos, count, ret);
653 /* 805 /*
654 * If the offset is beyond the size of the file, we have a couple 806 * if we just got an ENOSPC, flush the inode now we aren't holding any
655 * of things to do. First, if there is already space allocated 807 * page locks and retry *once*
656 * we need to either create holes or zero the disk or ...
657 *
658 * If there is a page where the previous size lands, we need
659 * to zero it out up to the new size.
660 */ 808 */
661 809 if (ret == -ENOSPC && !enospc) {
662 if (pos > ip->i_size) { 810 ret = -xfs_flush_pages(ip, 0, -1, 0, FI_NONE);
663 error = xfs_zero_eof(ip, pos, ip->i_size); 811 if (ret)
664 if (error) { 812 return ret;
665 xfs_iunlock(ip, XFS_ILOCK_EXCL); 813 enospc = 1;
666 goto out_unlock_internal; 814 goto write_retry;
667 }
668 } 815 }
669 xfs_iunlock(ip, XFS_ILOCK_EXCL); 816 current->backing_dev_info = NULL;
817 return ret;
818}
670 819
671 /* 820STATIC ssize_t
672 * If we're writing the file then make sure to clear the 821xfs_file_aio_write(
673 * setuid and setgid bits if the process is not being run 822 struct kiocb *iocb,
674 * by root. This keeps people from modifying setuid and 823 const struct iovec *iovp,
675 * setgid binaries. 824 unsigned long nr_segs,
676 */ 825 loff_t pos)
677 error = -file_remove_suid(file); 826{
678 if (unlikely(error)) 827 struct file *file = iocb->ki_filp;
679 goto out_unlock_internal; 828 struct address_space *mapping = file->f_mapping;
829 struct inode *inode = mapping->host;
830 struct xfs_inode *ip = XFS_I(inode);
831 ssize_t ret;
832 int iolock;
833 size_t ocount = 0;
680 834
681 /* We can write back this queue in page reclaim */ 835 XFS_STATS_INC(xs_write_calls);
682 current->backing_dev_info = mapping->backing_dev_info;
683 836
684 if ((ioflags & IO_ISDIRECT)) { 837 BUG_ON(iocb->ki_pos != pos);
685 if (mapping->nrpages) {
686 WARN_ON(need_i_mutex == 0);
687 error = xfs_flushinval_pages(ip,
688 (pos & PAGE_CACHE_MASK),
689 -1, FI_REMAPF_LOCKED);
690 if (error)
691 goto out_unlock_internal;
692 }
693 838
694 if (need_i_mutex) { 839 ret = generic_segment_checks(iovp, &nr_segs, &ocount, VERIFY_READ);
695 /* demote the lock now the cached pages are gone */ 840 if (ret)
696 xfs_ilock_demote(ip, XFS_IOLOCK_EXCL); 841 return ret;
697 mutex_unlock(&inode->i_mutex);
698 842
699 iolock = XFS_IOLOCK_SHARED; 843 if (ocount == 0)
700 need_i_mutex = 0; 844 return 0;
701 }
702 845
703 trace_xfs_file_direct_write(ip, count, iocb->ki_pos, ioflags); 846 xfs_wait_for_freeze(ip->i_mount, SB_FREEZE_WRITE);
704 ret = generic_file_direct_write(iocb, iovp,
705 &nr_segs, pos, &iocb->ki_pos, count, ocount);
706 847
707 /* 848 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
708 * direct-io write to a hole: fall through to buffered I/O 849 return -EIO;
709 * for completing the rest of the request.
710 */
711 if (ret >= 0 && ret != count) {
712 XFS_STATS_ADD(xs_write_bytes, ret);
713 850
714 pos += ret; 851 if (unlikely(file->f_flags & O_DIRECT))
715 count -= ret; 852 ret = xfs_file_dio_aio_write(iocb, iovp, nr_segs, pos,
853 ocount, &iolock);
854 else
855 ret = xfs_file_buffered_aio_write(iocb, iovp, nr_segs, pos,
856 ocount, &iolock);
716 857
717 ioflags &= ~IO_ISDIRECT; 858 xfs_aio_write_isize_update(inode, &iocb->ki_pos, ret);
718 xfs_iunlock(ip, iolock);
719 goto relock;
720 }
721 } else {
722 int enospc = 0;
723 ssize_t ret2 = 0;
724 859
725write_retry: 860 if (ret <= 0)
726 trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, ioflags); 861 goto out_unlock;
727 ret2 = generic_file_buffered_write(iocb, iovp, nr_segs,
728 pos, &iocb->ki_pos, count, ret);
729 /*
730 * if we just got an ENOSPC, flush the inode now we
731 * aren't holding any page locks and retry *once*
732 */
733 if (ret2 == -ENOSPC && !enospc) {
734 error = xfs_flush_pages(ip, 0, -1, 0, FI_NONE);
735 if (error)
736 goto out_unlock_internal;
737 enospc = 1;
738 goto write_retry;
739 }
740 ret = ret2;
741 }
742 862
743 current->backing_dev_info = NULL; 863 /* Handle various SYNC-type writes */
864 if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) {
865 loff_t end = pos + ret - 1;
866 int error, error2;
744 867
745 isize = i_size_read(inode); 868 xfs_rw_iunlock(ip, iolock);
746 if (unlikely(ret < 0 && ret != -EFAULT && iocb->ki_pos > isize)) 869 error = filemap_write_and_wait_range(mapping, pos, end);
747 iocb->ki_pos = isize; 870 xfs_rw_ilock(ip, iolock);
748 871
749 if (iocb->ki_pos > ip->i_size) { 872 error2 = -xfs_file_fsync(file,
750 xfs_ilock(ip, XFS_ILOCK_EXCL); 873 (file->f_flags & __O_SYNC) ? 0 : 1);
751 if (iocb->ki_pos > ip->i_size) 874 if (error)
752 ip->i_size = iocb->ki_pos; 875 ret = error;
753 xfs_iunlock(ip, XFS_ILOCK_EXCL); 876 else if (error2)
877 ret = error2;
754 } 878 }
755 879
756 error = -ret; 880out_unlock:
757 if (ret <= 0) 881 xfs_aio_write_newsize_update(ip);
758 goto out_unlock_internal; 882 xfs_rw_iunlock(ip, iolock);
883 return ret;
884}
759 885
760 XFS_STATS_ADD(xs_write_bytes, ret); 886STATIC long
887xfs_file_fallocate(
888 struct file *file,
889 int mode,
890 loff_t offset,
891 loff_t len)
892{
893 struct inode *inode = file->f_path.dentry->d_inode;
894 long error;
895 loff_t new_size = 0;
896 xfs_flock64_t bf;
897 xfs_inode_t *ip = XFS_I(inode);
898 int cmd = XFS_IOC_RESVSP;
761 899
762 /* Handle various SYNC-type writes */ 900 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
763 if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) { 901 return -EOPNOTSUPP;
764 loff_t end = pos + ret - 1;
765 int error2;
766 902
767 xfs_iunlock(ip, iolock); 903 bf.l_whence = 0;
768 if (need_i_mutex) 904 bf.l_start = offset;
769 mutex_unlock(&inode->i_mutex); 905 bf.l_len = len;
770 906
771 error2 = filemap_write_and_wait_range(mapping, pos, end); 907 xfs_ilock(ip, XFS_IOLOCK_EXCL);
772 if (!error)
773 error = error2;
774 if (need_i_mutex)
775 mutex_lock(&inode->i_mutex);
776 xfs_ilock(ip, iolock);
777 908
778 error2 = -xfs_file_fsync(file, 909 if (mode & FALLOC_FL_PUNCH_HOLE)
779 (file->f_flags & __O_SYNC) ? 0 : 1); 910 cmd = XFS_IOC_UNRESVSP;
780 if (!error) 911
781 error = error2; 912 /* check the new inode size is valid before allocating */
913 if (!(mode & FALLOC_FL_KEEP_SIZE) &&
914 offset + len > i_size_read(inode)) {
915 new_size = offset + len;
916 error = inode_newsize_ok(inode, new_size);
917 if (error)
918 goto out_unlock;
782 } 919 }
783 920
784 out_unlock_internal: 921 error = -xfs_change_file_space(ip, cmd, &bf, 0, XFS_ATTR_NOLOCK);
785 if (ip->i_new_size) { 922 if (error)
786 xfs_ilock(ip, XFS_ILOCK_EXCL); 923 goto out_unlock;
787 ip->i_new_size = 0; 924
788 /* 925 /* Change file size if needed */
789 * If this was a direct or synchronous I/O that failed (such 926 if (new_size) {
790 * as ENOSPC) then part of the I/O may have been written to 927 struct iattr iattr;
791 * disk before the error occured. In this case the on-disk 928
792 * file size may have been adjusted beyond the in-memory file 929 iattr.ia_valid = ATTR_SIZE;
793 * size and now needs to be truncated back. 930 iattr.ia_size = new_size;
794 */ 931 error = -xfs_setattr(ip, &iattr, XFS_ATTR_NOLOCK);
795 if (ip->i_d.di_size > ip->i_size)
796 ip->i_d.di_size = ip->i_size;
797 xfs_iunlock(ip, XFS_ILOCK_EXCL);
798 } 932 }
799 xfs_iunlock(ip, iolock); 933
800 out_unlock_mutex: 934out_unlock:
801 if (need_i_mutex) 935 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
802 mutex_unlock(&inode->i_mutex); 936 return error;
803 return -error;
804} 937}
805 938
939
806STATIC int 940STATIC int
807xfs_file_open( 941xfs_file_open(
808 struct inode *inode, 942 struct inode *inode,
@@ -921,6 +1055,7 @@ const struct file_operations xfs_file_operations = {
921 .open = xfs_file_open, 1055 .open = xfs_file_open,
922 .release = xfs_file_release, 1056 .release = xfs_file_release,
923 .fsync = xfs_file_fsync, 1057 .fsync = xfs_file_fsync,
1058 .fallocate = xfs_file_fallocate,
924}; 1059};
925 1060
926const struct file_operations xfs_dir_file_operations = { 1061const struct file_operations xfs_dir_file_operations = {
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index 2ea238f6d38..f5e2a19e0f8 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -39,6 +39,7 @@
39#include "xfs_dfrag.h" 39#include "xfs_dfrag.h"
40#include "xfs_fsops.h" 40#include "xfs_fsops.h"
41#include "xfs_vnodeops.h" 41#include "xfs_vnodeops.h"
42#include "xfs_discard.h"
42#include "xfs_quota.h" 43#include "xfs_quota.h"
43#include "xfs_inode_item.h" 44#include "xfs_inode_item.h"
44#include "xfs_export.h" 45#include "xfs_export.h"
@@ -416,7 +417,7 @@ xfs_attrlist_by_handle(
416 if (IS_ERR(dentry)) 417 if (IS_ERR(dentry))
417 return PTR_ERR(dentry); 418 return PTR_ERR(dentry);
418 419
419 kbuf = kmalloc(al_hreq.buflen, GFP_KERNEL); 420 kbuf = kzalloc(al_hreq.buflen, GFP_KERNEL);
420 if (!kbuf) 421 if (!kbuf)
421 goto out_dput; 422 goto out_dput;
422 423
@@ -984,10 +985,22 @@ xfs_ioctl_setattr(
984 985
985 /* 986 /*
986 * Extent size must be a multiple of the appropriate block 987 * Extent size must be a multiple of the appropriate block
987 * size, if set at all. 988 * size, if set at all. It must also be smaller than the
989 * maximum extent size supported by the filesystem.
990 *
991 * Also, for non-realtime files, limit the extent size hint to
992 * half the size of the AGs in the filesystem so alignment
993 * doesn't result in extents larger than an AG.
988 */ 994 */
989 if (fa->fsx_extsize != 0) { 995 if (fa->fsx_extsize != 0) {
990 xfs_extlen_t size; 996 xfs_extlen_t size;
997 xfs_fsblock_t extsize_fsb;
998
999 extsize_fsb = XFS_B_TO_FSB(mp, fa->fsx_extsize);
1000 if (extsize_fsb > MAXEXTLEN) {
1001 code = XFS_ERROR(EINVAL);
1002 goto error_return;
1003 }
991 1004
992 if (XFS_IS_REALTIME_INODE(ip) || 1005 if (XFS_IS_REALTIME_INODE(ip) ||
993 ((mask & FSX_XFLAGS) && 1006 ((mask & FSX_XFLAGS) &&
@@ -996,6 +1009,10 @@ xfs_ioctl_setattr(
996 mp->m_sb.sb_blocklog; 1009 mp->m_sb.sb_blocklog;
997 } else { 1010 } else {
998 size = mp->m_sb.sb_blocksize; 1011 size = mp->m_sb.sb_blocksize;
1012 if (extsize_fsb > mp->m_sb.sb_agblocks / 2) {
1013 code = XFS_ERROR(EINVAL);
1014 goto error_return;
1015 }
999 } 1016 }
1000 1017
1001 if (fa->fsx_extsize % size) { 1018 if (fa->fsx_extsize % size) {
@@ -1294,6 +1311,8 @@ xfs_file_ioctl(
1294 trace_xfs_file_ioctl(ip); 1311 trace_xfs_file_ioctl(ip);
1295 1312
1296 switch (cmd) { 1313 switch (cmd) {
1314 case FITRIM:
1315 return xfs_ioc_trim(mp, arg);
1297 case XFS_IOC_ALLOCSP: 1316 case XFS_IOC_ALLOCSP:
1298 case XFS_IOC_FREESP: 1317 case XFS_IOC_FREESP:
1299 case XFS_IOC_RESVSP: 1318 case XFS_IOC_RESVSP:
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index 96107efc0c6..bd5727852fd 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -46,7 +46,6 @@
46#include <linux/namei.h> 46#include <linux/namei.h>
47#include <linux/posix_acl.h> 47#include <linux/posix_acl.h>
48#include <linux/security.h> 48#include <linux/security.h>
49#include <linux/falloc.h>
50#include <linux/fiemap.h> 49#include <linux/fiemap.h>
51#include <linux/slab.h> 50#include <linux/slab.h>
52 51
@@ -505,58 +504,6 @@ xfs_vn_setattr(
505 return -xfs_setattr(XFS_I(dentry->d_inode), iattr, 0); 504 return -xfs_setattr(XFS_I(dentry->d_inode), iattr, 0);
506} 505}
507 506
508STATIC long
509xfs_vn_fallocate(
510 struct inode *inode,
511 int mode,
512 loff_t offset,
513 loff_t len)
514{
515 long error;
516 loff_t new_size = 0;
517 xfs_flock64_t bf;
518 xfs_inode_t *ip = XFS_I(inode);
519
520 /* preallocation on directories not yet supported */
521 error = -ENODEV;
522 if (S_ISDIR(inode->i_mode))
523 goto out_error;
524
525 bf.l_whence = 0;
526 bf.l_start = offset;
527 bf.l_len = len;
528
529 xfs_ilock(ip, XFS_IOLOCK_EXCL);
530
531 /* check the new inode size is valid before allocating */
532 if (!(mode & FALLOC_FL_KEEP_SIZE) &&
533 offset + len > i_size_read(inode)) {
534 new_size = offset + len;
535 error = inode_newsize_ok(inode, new_size);
536 if (error)
537 goto out_unlock;
538 }
539
540 error = -xfs_change_file_space(ip, XFS_IOC_RESVSP, &bf,
541 0, XFS_ATTR_NOLOCK);
542 if (error)
543 goto out_unlock;
544
545 /* Change file size if needed */
546 if (new_size) {
547 struct iattr iattr;
548
549 iattr.ia_valid = ATTR_SIZE;
550 iattr.ia_size = new_size;
551 error = -xfs_setattr(ip, &iattr, XFS_ATTR_NOLOCK);
552 }
553
554out_unlock:
555 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
556out_error:
557 return error;
558}
559
560#define XFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR) 507#define XFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR)
561 508
562/* 509/*
@@ -650,7 +597,6 @@ static const struct inode_operations xfs_inode_operations = {
650 .getxattr = generic_getxattr, 597 .getxattr = generic_getxattr,
651 .removexattr = generic_removexattr, 598 .removexattr = generic_removexattr,
652 .listxattr = xfs_vn_listxattr, 599 .listxattr = xfs_vn_listxattr,
653 .fallocate = xfs_vn_fallocate,
654 .fiemap = xfs_vn_fiemap, 600 .fiemap = xfs_vn_fiemap,
655}; 601};
656 602
@@ -762,7 +708,8 @@ xfs_setup_inode(
762 inode->i_state = I_NEW; 708 inode->i_state = I_NEW;
763 709
764 inode_sb_list_add(inode); 710 inode_sb_list_add(inode);
765 insert_inode_hash(inode); 711 /* make the inode look hashed for the writeback code */
712 hlist_add_fake(&inode->i_hash);
766 713
767 inode->i_mode = ip->i_d.di_mode; 714 inode->i_mode = ip->i_d.di_mode;
768 inode->i_nlink = ip->i_d.di_nlink; 715 inode->i_nlink = ip->i_d.di_nlink;
diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h
index 214ddd71ff7..09649499774 100644
--- a/fs/xfs/linux-2.6/xfs_linux.h
+++ b/fs/xfs/linux-2.6/xfs_linux.h
@@ -37,7 +37,6 @@
37 37
38#include <kmem.h> 38#include <kmem.h>
39#include <mrlock.h> 39#include <mrlock.h>
40#include <sv.h>
41#include <time.h> 40#include <time.h>
42 41
43#include <support/debug.h> 42#include <support/debug.h>
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 9f3a78fe6ae..9731898083a 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -353,9 +353,6 @@ xfs_parseargs(
353 mp->m_qflags &= ~XFS_OQUOTA_ENFD; 353 mp->m_qflags &= ~XFS_OQUOTA_ENFD;
354 } else if (!strcmp(this_char, MNTOPT_DELAYLOG)) { 354 } else if (!strcmp(this_char, MNTOPT_DELAYLOG)) {
355 mp->m_flags |= XFS_MOUNT_DELAYLOG; 355 mp->m_flags |= XFS_MOUNT_DELAYLOG;
356 cmn_err(CE_WARN,
357 "Enabling EXPERIMENTAL delayed logging feature "
358 "- use at your own risk.\n");
359 } else if (!strcmp(this_char, MNTOPT_NODELAYLOG)) { 356 } else if (!strcmp(this_char, MNTOPT_NODELAYLOG)) {
360 mp->m_flags &= ~XFS_MOUNT_DELAYLOG; 357 mp->m_flags &= ~XFS_MOUNT_DELAYLOG;
361 } else if (!strcmp(this_char, "ihashsize")) { 358 } else if (!strcmp(this_char, "ihashsize")) {
@@ -609,7 +606,8 @@ xfs_blkdev_get(
609{ 606{
610 int error = 0; 607 int error = 0;
611 608
612 *bdevp = open_bdev_exclusive(name, FMODE_READ|FMODE_WRITE, mp); 609 *bdevp = blkdev_get_by_path(name, FMODE_READ|FMODE_WRITE|FMODE_EXCL,
610 mp);
613 if (IS_ERR(*bdevp)) { 611 if (IS_ERR(*bdevp)) {
614 error = PTR_ERR(*bdevp); 612 error = PTR_ERR(*bdevp);
615 printk("XFS: Invalid device [%s], error=%d\n", name, error); 613 printk("XFS: Invalid device [%s], error=%d\n", name, error);
@@ -623,7 +621,7 @@ xfs_blkdev_put(
623 struct block_device *bdev) 621 struct block_device *bdev)
624{ 622{
625 if (bdev) 623 if (bdev)
626 close_bdev_exclusive(bdev, FMODE_READ|FMODE_WRITE); 624 blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
627} 625}
628 626
629/* 627/*
@@ -837,8 +835,11 @@ xfsaild_wakeup(
837 struct xfs_ail *ailp, 835 struct xfs_ail *ailp,
838 xfs_lsn_t threshold_lsn) 836 xfs_lsn_t threshold_lsn)
839{ 837{
840 ailp->xa_target = threshold_lsn; 838 /* only ever move the target forwards */
841 wake_up_process(ailp->xa_task); 839 if (XFS_LSN_CMP(threshold_lsn, ailp->xa_target) > 0) {
840 ailp->xa_target = threshold_lsn;
841 wake_up_process(ailp->xa_task);
842 }
842} 843}
843 844
844STATIC int 845STATIC int
@@ -850,8 +851,17 @@ xfsaild(
850 long tout = 0; /* milliseconds */ 851 long tout = 0; /* milliseconds */
851 852
852 while (!kthread_should_stop()) { 853 while (!kthread_should_stop()) {
853 schedule_timeout_interruptible(tout ? 854 /*
854 msecs_to_jiffies(tout) : MAX_SCHEDULE_TIMEOUT); 855 * for short sleeps indicating congestion, don't allow us to
856 * get woken early. Otherwise all we do is bang on the AIL lock
857 * without making progress.
858 */
859 if (tout && tout <= 20)
860 __set_current_state(TASK_KILLABLE);
861 else
862 __set_current_state(TASK_INTERRUPTIBLE);
863 schedule_timeout(tout ?
864 msecs_to_jiffies(tout) : MAX_SCHEDULE_TIMEOUT);
855 865
856 /* swsusp */ 866 /* swsusp */
857 try_to_freeze(); 867 try_to_freeze();
@@ -938,7 +948,7 @@ out_reclaim:
938 * Slab object creation initialisation for the XFS inode. 948 * Slab object creation initialisation for the XFS inode.
939 * This covers only the idempotent fields in the XFS inode; 949 * This covers only the idempotent fields in the XFS inode;
940 * all other fields need to be initialised on allocation 950 * all other fields need to be initialised on allocation
941 * from the slab. This avoids the need to repeatedly intialise 951 * from the slab. This avoids the need to repeatedly initialise
942 * fields in the xfs inode that left in the initialise state 952 * fields in the xfs inode that left in the initialise state
943 * when freeing the inode. 953 * when freeing the inode.
944 */ 954 */
@@ -1121,6 +1131,8 @@ xfs_fs_evict_inode(
1121 */ 1131 */
1122 ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock)); 1132 ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock));
1123 mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); 1133 mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
1134 lockdep_set_class_and_name(&ip->i_iolock.mr_lock,
1135 &xfs_iolock_reclaimable, "xfs_iolock_reclaimable");
1124 1136
1125 xfs_inactive(ip); 1137 xfs_inactive(ip);
1126} 1138}
@@ -1402,7 +1414,7 @@ xfs_fs_freeze(
1402 1414
1403 xfs_save_resvblks(mp); 1415 xfs_save_resvblks(mp);
1404 xfs_quiesce_attr(mp); 1416 xfs_quiesce_attr(mp);
1405 return -xfs_fs_log_dummy(mp, SYNC_WAIT); 1417 return -xfs_fs_log_dummy(mp);
1406} 1418}
1407 1419
1408STATIC int 1420STATIC int
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index 37d33254981..e22f0057d21 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -53,14 +53,30 @@ xfs_inode_ag_walk_grab(
53{ 53{
54 struct inode *inode = VFS_I(ip); 54 struct inode *inode = VFS_I(ip);
55 55
56 ASSERT(rcu_read_lock_held());
57
58 /*
59 * check for stale RCU freed inode
60 *
61 * If the inode has been reallocated, it doesn't matter if it's not in
62 * the AG we are walking - we are walking for writeback, so if it
63 * passes all the "valid inode" checks and is dirty, then we'll write
64 * it back anyway. If it has been reallocated and still being
65 * initialised, the XFS_INEW check below will catch it.
66 */
67 spin_lock(&ip->i_flags_lock);
68 if (!ip->i_ino)
69 goto out_unlock_noent;
70
71 /* avoid new or reclaimable inodes. Leave for reclaim code to flush */
72 if (__xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM))
73 goto out_unlock_noent;
74 spin_unlock(&ip->i_flags_lock);
75
56 /* nothing to sync during shutdown */ 76 /* nothing to sync during shutdown */
57 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) 77 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
58 return EFSCORRUPTED; 78 return EFSCORRUPTED;
59 79
60 /* avoid new or reclaimable inodes. Leave for reclaim code to flush */
61 if (xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM))
62 return ENOENT;
63
64 /* If we can't grab the inode, it must on it's way to reclaim. */ 80 /* If we can't grab the inode, it must on it's way to reclaim. */
65 if (!igrab(inode)) 81 if (!igrab(inode))
66 return ENOENT; 82 return ENOENT;
@@ -72,6 +88,10 @@ xfs_inode_ag_walk_grab(
72 88
73 /* inode is valid */ 89 /* inode is valid */
74 return 0; 90 return 0;
91
92out_unlock_noent:
93 spin_unlock(&ip->i_flags_lock);
94 return ENOENT;
75} 95}
76 96
77STATIC int 97STATIC int
@@ -98,12 +118,12 @@ restart:
98 int error = 0; 118 int error = 0;
99 int i; 119 int i;
100 120
101 read_lock(&pag->pag_ici_lock); 121 rcu_read_lock();
102 nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, 122 nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
103 (void **)batch, first_index, 123 (void **)batch, first_index,
104 XFS_LOOKUP_BATCH); 124 XFS_LOOKUP_BATCH);
105 if (!nr_found) { 125 if (!nr_found) {
106 read_unlock(&pag->pag_ici_lock); 126 rcu_read_unlock();
107 break; 127 break;
108 } 128 }
109 129
@@ -118,18 +138,26 @@ restart:
118 batch[i] = NULL; 138 batch[i] = NULL;
119 139
120 /* 140 /*
121 * Update the index for the next lookup. Catch overflows 141 * Update the index for the next lookup. Catch
122 * into the next AG range which can occur if we have inodes 142 * overflows into the next AG range which can occur if
123 * in the last block of the AG and we are currently 143 * we have inodes in the last block of the AG and we
124 * pointing to the last inode. 144 * are currently pointing to the last inode.
145 *
146 * Because we may see inodes that are from the wrong AG
147 * due to RCU freeing and reallocation, only update the
148 * index if it lies in this AG. It was a race that lead
149 * us to see this inode, so another lookup from the
150 * same index will not find it again.
125 */ 151 */
152 if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag->pag_agno)
153 continue;
126 first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1); 154 first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
127 if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) 155 if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
128 done = 1; 156 done = 1;
129 } 157 }
130 158
131 /* unlock now we've grabbed the inodes. */ 159 /* unlock now we've grabbed the inodes. */
132 read_unlock(&pag->pag_ici_lock); 160 rcu_read_unlock();
133 161
134 for (i = 0; i < nr_found; i++) { 162 for (i = 0; i < nr_found; i++) {
135 if (!batch[i]) 163 if (!batch[i])
@@ -334,7 +362,7 @@ xfs_quiesce_data(
334 362
335 /* mark the log as covered if needed */ 363 /* mark the log as covered if needed */
336 if (xfs_log_need_covered(mp)) 364 if (xfs_log_need_covered(mp))
337 error2 = xfs_fs_log_dummy(mp, SYNC_WAIT); 365 error2 = xfs_fs_log_dummy(mp);
338 366
339 /* flush data-only devices */ 367 /* flush data-only devices */
340 if (mp->m_rtdev_targp) 368 if (mp->m_rtdev_targp)
@@ -475,13 +503,14 @@ xfs_sync_worker(
475 int error; 503 int error;
476 504
477 if (!(mp->m_flags & XFS_MOUNT_RDONLY)) { 505 if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {
478 xfs_log_force(mp, 0);
479 xfs_reclaim_inodes(mp, 0);
480 /* dgc: errors ignored here */ 506 /* dgc: errors ignored here */
481 error = xfs_qm_sync(mp, SYNC_TRYLOCK);
482 if (mp->m_super->s_frozen == SB_UNFROZEN && 507 if (mp->m_super->s_frozen == SB_UNFROZEN &&
483 xfs_log_need_covered(mp)) 508 xfs_log_need_covered(mp))
484 error = xfs_fs_log_dummy(mp, 0); 509 error = xfs_fs_log_dummy(mp);
510 else
511 xfs_log_force(mp, 0);
512 xfs_reclaim_inodes(mp, 0);
513 error = xfs_qm_sync(mp, SYNC_TRYLOCK);
485 } 514 }
486 mp->m_sync_seq++; 515 mp->m_sync_seq++;
487 wake_up(&mp->m_wait_single_sync_task); 516 wake_up(&mp->m_wait_single_sync_task);
@@ -592,12 +621,12 @@ xfs_inode_set_reclaim_tag(
592 struct xfs_perag *pag; 621 struct xfs_perag *pag;
593 622
594 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); 623 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
595 write_lock(&pag->pag_ici_lock); 624 spin_lock(&pag->pag_ici_lock);
596 spin_lock(&ip->i_flags_lock); 625 spin_lock(&ip->i_flags_lock);
597 __xfs_inode_set_reclaim_tag(pag, ip); 626 __xfs_inode_set_reclaim_tag(pag, ip);
598 __xfs_iflags_set(ip, XFS_IRECLAIMABLE); 627 __xfs_iflags_set(ip, XFS_IRECLAIMABLE);
599 spin_unlock(&ip->i_flags_lock); 628 spin_unlock(&ip->i_flags_lock);
600 write_unlock(&pag->pag_ici_lock); 629 spin_unlock(&pag->pag_ici_lock);
601 xfs_perag_put(pag); 630 xfs_perag_put(pag);
602} 631}
603 632
@@ -639,9 +668,14 @@ xfs_reclaim_inode_grab(
639 struct xfs_inode *ip, 668 struct xfs_inode *ip,
640 int flags) 669 int flags)
641{ 670{
671 ASSERT(rcu_read_lock_held());
672
673 /* quick check for stale RCU freed inode */
674 if (!ip->i_ino)
675 return 1;
642 676
643 /* 677 /*
644 * do some unlocked checks first to avoid unnecceary lock traffic. 678 * do some unlocked checks first to avoid unnecessary lock traffic.
645 * The first is a flush lock check, the second is a already in reclaim 679 * The first is a flush lock check, the second is a already in reclaim
646 * check. Only do these checks if we are not going to block on locks. 680 * check. Only do these checks if we are not going to block on locks.
647 */ 681 */
@@ -654,11 +688,16 @@ xfs_reclaim_inode_grab(
654 * The radix tree lock here protects a thread in xfs_iget from racing 688 * The radix tree lock here protects a thread in xfs_iget from racing
655 * with us starting reclaim on the inode. Once we have the 689 * with us starting reclaim on the inode. Once we have the
656 * XFS_IRECLAIM flag set it will not touch us. 690 * XFS_IRECLAIM flag set it will not touch us.
691 *
692 * Due to RCU lookup, we may find inodes that have been freed and only
693 * have XFS_IRECLAIM set. Indeed, we may see reallocated inodes that
694 * aren't candidates for reclaim at all, so we must check the
695 * XFS_IRECLAIMABLE is set first before proceeding to reclaim.
657 */ 696 */
658 spin_lock(&ip->i_flags_lock); 697 spin_lock(&ip->i_flags_lock);
659 ASSERT_ALWAYS(__xfs_iflags_test(ip, XFS_IRECLAIMABLE)); 698 if (!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) ||
660 if (__xfs_iflags_test(ip, XFS_IRECLAIM)) { 699 __xfs_iflags_test(ip, XFS_IRECLAIM)) {
661 /* ignore as it is already under reclaim */ 700 /* not a reclaim candidate. */
662 spin_unlock(&ip->i_flags_lock); 701 spin_unlock(&ip->i_flags_lock);
663 return 1; 702 return 1;
664 } 703 }
@@ -795,12 +834,12 @@ reclaim:
795 * added to the tree assert that it's been there before to catch 834 * added to the tree assert that it's been there before to catch
796 * problems with the inode life time early on. 835 * problems with the inode life time early on.
797 */ 836 */
798 write_lock(&pag->pag_ici_lock); 837 spin_lock(&pag->pag_ici_lock);
799 if (!radix_tree_delete(&pag->pag_ici_root, 838 if (!radix_tree_delete(&pag->pag_ici_root,
800 XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino))) 839 XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino)))
801 ASSERT(0); 840 ASSERT(0);
802 __xfs_inode_clear_reclaim(pag, ip); 841 __xfs_inode_clear_reclaim(pag, ip);
803 write_unlock(&pag->pag_ici_lock); 842 spin_unlock(&pag->pag_ici_lock);
804 843
805 /* 844 /*
806 * Here we do an (almost) spurious inode lock in order to coordinate 845 * Here we do an (almost) spurious inode lock in order to coordinate
@@ -853,6 +892,7 @@ restart:
853 if (trylock) { 892 if (trylock) {
854 if (!mutex_trylock(&pag->pag_ici_reclaim_lock)) { 893 if (!mutex_trylock(&pag->pag_ici_reclaim_lock)) {
855 skipped++; 894 skipped++;
895 xfs_perag_put(pag);
856 continue; 896 continue;
857 } 897 }
858 first_index = pag->pag_ici_reclaim_cursor; 898 first_index = pag->pag_ici_reclaim_cursor;
@@ -863,14 +903,14 @@ restart:
863 struct xfs_inode *batch[XFS_LOOKUP_BATCH]; 903 struct xfs_inode *batch[XFS_LOOKUP_BATCH];
864 int i; 904 int i;
865 905
866 write_lock(&pag->pag_ici_lock); 906 rcu_read_lock();
867 nr_found = radix_tree_gang_lookup_tag( 907 nr_found = radix_tree_gang_lookup_tag(
868 &pag->pag_ici_root, 908 &pag->pag_ici_root,
869 (void **)batch, first_index, 909 (void **)batch, first_index,
870 XFS_LOOKUP_BATCH, 910 XFS_LOOKUP_BATCH,
871 XFS_ICI_RECLAIM_TAG); 911 XFS_ICI_RECLAIM_TAG);
872 if (!nr_found) { 912 if (!nr_found) {
873 write_unlock(&pag->pag_ici_lock); 913 rcu_read_unlock();
874 break; 914 break;
875 } 915 }
876 916
@@ -890,14 +930,24 @@ restart:
890 * occur if we have inodes in the last block of 930 * occur if we have inodes in the last block of
891 * the AG and we are currently pointing to the 931 * the AG and we are currently pointing to the
892 * last inode. 932 * last inode.
933 *
934 * Because we may see inodes that are from the
935 * wrong AG due to RCU freeing and
936 * reallocation, only update the index if it
937 * lies in this AG. It was a race that lead us
938 * to see this inode, so another lookup from
939 * the same index will not find it again.
893 */ 940 */
941 if (XFS_INO_TO_AGNO(mp, ip->i_ino) !=
942 pag->pag_agno)
943 continue;
894 first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1); 944 first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
895 if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) 945 if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
896 done = 1; 946 done = 1;
897 } 947 }
898 948
899 /* unlock now we've grabbed the inodes. */ 949 /* unlock now we've grabbed the inodes. */
900 write_unlock(&pag->pag_ici_lock); 950 rcu_read_unlock();
901 951
902 for (i = 0; i < nr_found; i++) { 952 for (i = 0; i < nr_found; i++) {
903 if (!batch[i]) 953 if (!batch[i])
diff --git a/fs/xfs/linux-2.6/xfs_sysctl.c b/fs/xfs/linux-2.6/xfs_sysctl.c
index 7bb5092d6ae..ee3cee097e7 100644
--- a/fs/xfs/linux-2.6/xfs_sysctl.c
+++ b/fs/xfs/linux-2.6/xfs_sysctl.c
@@ -18,6 +18,7 @@
18#include "xfs.h" 18#include "xfs.h"
19#include <linux/sysctl.h> 19#include <linux/sysctl.h>
20#include <linux/proc_fs.h> 20#include <linux/proc_fs.h>
21#include "xfs_error.h"
21 22
22static struct ctl_table_header *xfs_table_header; 23static struct ctl_table_header *xfs_table_header;
23 24
@@ -51,6 +52,26 @@ xfs_stats_clear_proc_handler(
51 52
52 return ret; 53 return ret;
53} 54}
55
56STATIC int
57xfs_panic_mask_proc_handler(
58 ctl_table *ctl,
59 int write,
60 void __user *buffer,
61 size_t *lenp,
62 loff_t *ppos)
63{
64 int ret, *valp = ctl->data;
65
66 ret = proc_dointvec_minmax(ctl, write, buffer, lenp, ppos);
67 if (!ret && write) {
68 xfs_panic_mask = *valp;
69#ifdef DEBUG
70 xfs_panic_mask |= (XFS_PTAG_SHUTDOWN_CORRUPT | XFS_PTAG_LOGRES);
71#endif
72 }
73 return ret;
74}
54#endif /* CONFIG_PROC_FS */ 75#endif /* CONFIG_PROC_FS */
55 76
56static ctl_table xfs_table[] = { 77static ctl_table xfs_table[] = {
@@ -77,7 +98,7 @@ static ctl_table xfs_table[] = {
77 .data = &xfs_params.panic_mask.val, 98 .data = &xfs_params.panic_mask.val,
78 .maxlen = sizeof(int), 99 .maxlen = sizeof(int),
79 .mode = 0644, 100 .mode = 0644,
80 .proc_handler = proc_dointvec_minmax, 101 .proc_handler = xfs_panic_mask_proc_handler,
81 .extra1 = &xfs_params.panic_mask.min, 102 .extra1 = &xfs_params.panic_mask.min,
82 .extra2 = &xfs_params.panic_mask.max 103 .extra2 = &xfs_params.panic_mask.max
83 }, 104 },
diff --git a/fs/xfs/linux-2.6/xfs_trace.h b/fs/xfs/linux-2.6/xfs_trace.h
index acef2e98c59..2d0bcb47907 100644
--- a/fs/xfs/linux-2.6/xfs_trace.h
+++ b/fs/xfs/linux-2.6/xfs_trace.h
@@ -766,8 +766,8 @@ DECLARE_EVENT_CLASS(xfs_loggrant_class,
766 __field(int, curr_res) 766 __field(int, curr_res)
767 __field(int, unit_res) 767 __field(int, unit_res)
768 __field(unsigned int, flags) 768 __field(unsigned int, flags)
769 __field(void *, reserve_headq) 769 __field(int, reserveq)
770 __field(void *, write_headq) 770 __field(int, writeq)
771 __field(int, grant_reserve_cycle) 771 __field(int, grant_reserve_cycle)
772 __field(int, grant_reserve_bytes) 772 __field(int, grant_reserve_bytes)
773 __field(int, grant_write_cycle) 773 __field(int, grant_write_cycle)
@@ -784,19 +784,21 @@ DECLARE_EVENT_CLASS(xfs_loggrant_class,
784 __entry->curr_res = tic->t_curr_res; 784 __entry->curr_res = tic->t_curr_res;
785 __entry->unit_res = tic->t_unit_res; 785 __entry->unit_res = tic->t_unit_res;
786 __entry->flags = tic->t_flags; 786 __entry->flags = tic->t_flags;
787 __entry->reserve_headq = log->l_reserve_headq; 787 __entry->reserveq = list_empty(&log->l_reserveq);
788 __entry->write_headq = log->l_write_headq; 788 __entry->writeq = list_empty(&log->l_writeq);
789 __entry->grant_reserve_cycle = log->l_grant_reserve_cycle; 789 xlog_crack_grant_head(&log->l_grant_reserve_head,
790 __entry->grant_reserve_bytes = log->l_grant_reserve_bytes; 790 &__entry->grant_reserve_cycle,
791 __entry->grant_write_cycle = log->l_grant_write_cycle; 791 &__entry->grant_reserve_bytes);
792 __entry->grant_write_bytes = log->l_grant_write_bytes; 792 xlog_crack_grant_head(&log->l_grant_write_head,
793 &__entry->grant_write_cycle,
794 &__entry->grant_write_bytes);
793 __entry->curr_cycle = log->l_curr_cycle; 795 __entry->curr_cycle = log->l_curr_cycle;
794 __entry->curr_block = log->l_curr_block; 796 __entry->curr_block = log->l_curr_block;
795 __entry->tail_lsn = log->l_tail_lsn; 797 __entry->tail_lsn = atomic64_read(&log->l_tail_lsn);
796 ), 798 ),
797 TP_printk("dev %d:%d type %s t_ocnt %u t_cnt %u t_curr_res %u " 799 TP_printk("dev %d:%d type %s t_ocnt %u t_cnt %u t_curr_res %u "
798 "t_unit_res %u t_flags %s reserve_headq 0x%p " 800 "t_unit_res %u t_flags %s reserveq %s "
799 "write_headq 0x%p grant_reserve_cycle %d " 801 "writeq %s grant_reserve_cycle %d "
800 "grant_reserve_bytes %d grant_write_cycle %d " 802 "grant_reserve_bytes %d grant_write_cycle %d "
801 "grant_write_bytes %d curr_cycle %d curr_block %d " 803 "grant_write_bytes %d curr_cycle %d curr_block %d "
802 "tail_cycle %d tail_block %d", 804 "tail_cycle %d tail_block %d",
@@ -807,8 +809,8 @@ DECLARE_EVENT_CLASS(xfs_loggrant_class,
807 __entry->curr_res, 809 __entry->curr_res,
808 __entry->unit_res, 810 __entry->unit_res,
809 __print_flags(__entry->flags, "|", XLOG_TIC_FLAGS), 811 __print_flags(__entry->flags, "|", XLOG_TIC_FLAGS),
810 __entry->reserve_headq, 812 __entry->reserveq ? "empty" : "active",
811 __entry->write_headq, 813 __entry->writeq ? "empty" : "active",
812 __entry->grant_reserve_cycle, 814 __entry->grant_reserve_cycle,
813 __entry->grant_reserve_bytes, 815 __entry->grant_reserve_bytes,
814 __entry->grant_write_cycle, 816 __entry->grant_write_cycle,
@@ -835,6 +837,7 @@ DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep1);
835DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake1); 837DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake1);
836DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep2); 838DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep2);
837DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake2); 839DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake2);
840DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake_up);
838DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_enter); 841DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_enter);
839DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_exit); 842DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_exit);
840DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_error); 843DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_error);
@@ -842,6 +845,7 @@ DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_sleep1);
842DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake1); 845DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake1);
843DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_sleep2); 846DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_sleep2);
844DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake2); 847DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake2);
848DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake_up);
845DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_enter); 849DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_enter);
846DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_exit); 850DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_exit);
847DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_sub); 851DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_sub);
@@ -935,10 +939,10 @@ DEFINE_PAGE_EVENT(xfs_writepage);
935DEFINE_PAGE_EVENT(xfs_releasepage); 939DEFINE_PAGE_EVENT(xfs_releasepage);
936DEFINE_PAGE_EVENT(xfs_invalidatepage); 940DEFINE_PAGE_EVENT(xfs_invalidatepage);
937 941
938DECLARE_EVENT_CLASS(xfs_iomap_class, 942DECLARE_EVENT_CLASS(xfs_imap_class,
939 TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, 943 TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count,
940 int flags, struct xfs_bmbt_irec *irec), 944 int type, struct xfs_bmbt_irec *irec),
941 TP_ARGS(ip, offset, count, flags, irec), 945 TP_ARGS(ip, offset, count, type, irec),
942 TP_STRUCT__entry( 946 TP_STRUCT__entry(
943 __field(dev_t, dev) 947 __field(dev_t, dev)
944 __field(xfs_ino_t, ino) 948 __field(xfs_ino_t, ino)
@@ -946,7 +950,7 @@ DECLARE_EVENT_CLASS(xfs_iomap_class,
946 __field(loff_t, new_size) 950 __field(loff_t, new_size)
947 __field(loff_t, offset) 951 __field(loff_t, offset)
948 __field(size_t, count) 952 __field(size_t, count)
949 __field(int, flags) 953 __field(int, type)
950 __field(xfs_fileoff_t, startoff) 954 __field(xfs_fileoff_t, startoff)
951 __field(xfs_fsblock_t, startblock) 955 __field(xfs_fsblock_t, startblock)
952 __field(xfs_filblks_t, blockcount) 956 __field(xfs_filblks_t, blockcount)
@@ -958,13 +962,13 @@ DECLARE_EVENT_CLASS(xfs_iomap_class,
958 __entry->new_size = ip->i_new_size; 962 __entry->new_size = ip->i_new_size;
959 __entry->offset = offset; 963 __entry->offset = offset;
960 __entry->count = count; 964 __entry->count = count;
961 __entry->flags = flags; 965 __entry->type = type;
962 __entry->startoff = irec ? irec->br_startoff : 0; 966 __entry->startoff = irec ? irec->br_startoff : 0;
963 __entry->startblock = irec ? irec->br_startblock : 0; 967 __entry->startblock = irec ? irec->br_startblock : 0;
964 __entry->blockcount = irec ? irec->br_blockcount : 0; 968 __entry->blockcount = irec ? irec->br_blockcount : 0;
965 ), 969 ),
966 TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx " 970 TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx "
967 "offset 0x%llx count %zd flags %s " 971 "offset 0x%llx count %zd type %s "
968 "startoff 0x%llx startblock %lld blockcount 0x%llx", 972 "startoff 0x%llx startblock %lld blockcount 0x%llx",
969 MAJOR(__entry->dev), MINOR(__entry->dev), 973 MAJOR(__entry->dev), MINOR(__entry->dev),
970 __entry->ino, 974 __entry->ino,
@@ -972,20 +976,21 @@ DECLARE_EVENT_CLASS(xfs_iomap_class,
972 __entry->new_size, 976 __entry->new_size,
973 __entry->offset, 977 __entry->offset,
974 __entry->count, 978 __entry->count,
975 __print_flags(__entry->flags, "|", BMAPI_FLAGS), 979 __print_symbolic(__entry->type, XFS_IO_TYPES),
976 __entry->startoff, 980 __entry->startoff,
977 (__int64_t)__entry->startblock, 981 (__int64_t)__entry->startblock,
978 __entry->blockcount) 982 __entry->blockcount)
979) 983)
980 984
981#define DEFINE_IOMAP_EVENT(name) \ 985#define DEFINE_IOMAP_EVENT(name) \
982DEFINE_EVENT(xfs_iomap_class, name, \ 986DEFINE_EVENT(xfs_imap_class, name, \
983 TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, \ 987 TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, \
984 int flags, struct xfs_bmbt_irec *irec), \ 988 int type, struct xfs_bmbt_irec *irec), \
985 TP_ARGS(ip, offset, count, flags, irec)) 989 TP_ARGS(ip, offset, count, type, irec))
986DEFINE_IOMAP_EVENT(xfs_iomap_enter); 990DEFINE_IOMAP_EVENT(xfs_map_blocks_found);
987DEFINE_IOMAP_EVENT(xfs_iomap_found); 991DEFINE_IOMAP_EVENT(xfs_map_blocks_alloc);
988DEFINE_IOMAP_EVENT(xfs_iomap_alloc); 992DEFINE_IOMAP_EVENT(xfs_get_blocks_found);
993DEFINE_IOMAP_EVENT(xfs_get_blocks_alloc);
989 994
990DECLARE_EVENT_CLASS(xfs_simple_io_class, 995DECLARE_EVENT_CLASS(xfs_simple_io_class,
991 TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count), 996 TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count),
@@ -1022,6 +1027,7 @@ DEFINE_EVENT(xfs_simple_io_class, name, \
1022 TP_ARGS(ip, offset, count)) 1027 TP_ARGS(ip, offset, count))
1023DEFINE_SIMPLE_IO_EVENT(xfs_delalloc_enospc); 1028DEFINE_SIMPLE_IO_EVENT(xfs_delalloc_enospc);
1024DEFINE_SIMPLE_IO_EVENT(xfs_unwritten_convert); 1029DEFINE_SIMPLE_IO_EVENT(xfs_unwritten_convert);
1030DEFINE_SIMPLE_IO_EVENT(xfs_get_blocks_notfound);
1025 1031
1026 1032
1027TRACE_EVENT(xfs_itruncate_start, 1033TRACE_EVENT(xfs_itruncate_start,
@@ -1420,6 +1426,7 @@ DEFINE_EVENT(xfs_alloc_class, name, \
1420 TP_PROTO(struct xfs_alloc_arg *args), \ 1426 TP_PROTO(struct xfs_alloc_arg *args), \
1421 TP_ARGS(args)) 1427 TP_ARGS(args))
1422DEFINE_ALLOC_EVENT(xfs_alloc_exact_done); 1428DEFINE_ALLOC_EVENT(xfs_alloc_exact_done);
1429DEFINE_ALLOC_EVENT(xfs_alloc_exact_notfound);
1423DEFINE_ALLOC_EVENT(xfs_alloc_exact_error); 1430DEFINE_ALLOC_EVENT(xfs_alloc_exact_error);
1424DEFINE_ALLOC_EVENT(xfs_alloc_near_nominleft); 1431DEFINE_ALLOC_EVENT(xfs_alloc_near_nominleft);
1425DEFINE_ALLOC_EVENT(xfs_alloc_near_first); 1432DEFINE_ALLOC_EVENT(xfs_alloc_near_first);
@@ -1752,6 +1759,39 @@ DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_recover);
1752DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_cancel); 1759DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_cancel);
1753DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_skip); 1760DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_skip);
1754 1761
1762DECLARE_EVENT_CLASS(xfs_discard_class,
1763 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
1764 xfs_agblock_t agbno, xfs_extlen_t len),
1765 TP_ARGS(mp, agno, agbno, len),
1766 TP_STRUCT__entry(
1767 __field(dev_t, dev)
1768 __field(xfs_agnumber_t, agno)
1769 __field(xfs_agblock_t, agbno)
1770 __field(xfs_extlen_t, len)
1771 ),
1772 TP_fast_assign(
1773 __entry->dev = mp->m_super->s_dev;
1774 __entry->agno = agno;
1775 __entry->agbno = agbno;
1776 __entry->len = len;
1777 ),
1778 TP_printk("dev %d:%d agno %u agbno %u len %u\n",
1779 MAJOR(__entry->dev), MINOR(__entry->dev),
1780 __entry->agno,
1781 __entry->agbno,
1782 __entry->len)
1783)
1784
1785#define DEFINE_DISCARD_EVENT(name) \
1786DEFINE_EVENT(xfs_discard_class, name, \
1787 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
1788 xfs_agblock_t agbno, xfs_extlen_t len), \
1789 TP_ARGS(mp, agno, agbno, len))
1790DEFINE_DISCARD_EVENT(xfs_discard_extent);
1791DEFINE_DISCARD_EVENT(xfs_discard_toosmall);
1792DEFINE_DISCARD_EVENT(xfs_discard_exclude);
1793DEFINE_DISCARD_EVENT(xfs_discard_busy);
1794
1755#endif /* _TRACE_XFS_H */ 1795#endif /* _TRACE_XFS_H */
1756 1796
1757#undef TRACE_INCLUDE_PATH 1797#undef TRACE_INCLUDE_PATH
diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c
index faf8e1a83a1..d22aa310310 100644
--- a/fs/xfs/quota/xfs_dquot.c
+++ b/fs/xfs/quota/xfs_dquot.c
@@ -149,7 +149,6 @@ xfs_qm_dqdestroy(
149 ASSERT(list_empty(&dqp->q_freelist)); 149 ASSERT(list_empty(&dqp->q_freelist));
150 150
151 mutex_destroy(&dqp->q_qlock); 151 mutex_destroy(&dqp->q_qlock);
152 sv_destroy(&dqp->q_pinwait);
153 kmem_zone_free(xfs_Gqm->qm_dqzone, dqp); 152 kmem_zone_free(xfs_Gqm->qm_dqzone, dqp);
154 153
155 atomic_dec(&xfs_Gqm->qm_totaldquots); 154 atomic_dec(&xfs_Gqm->qm_totaldquots);
diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
index f8e854b4fde..206a2815ced 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/quota/xfs_qm.c
@@ -1863,12 +1863,14 @@ xfs_qm_dqreclaim_one(void)
1863 xfs_dquot_t *dqpout; 1863 xfs_dquot_t *dqpout;
1864 xfs_dquot_t *dqp; 1864 xfs_dquot_t *dqp;
1865 int restarts; 1865 int restarts;
1866 int startagain;
1866 1867
1867 restarts = 0; 1868 restarts = 0;
1868 dqpout = NULL; 1869 dqpout = NULL;
1869 1870
1870 /* lockorder: hashchainlock, freelistlock, mplistlock, dqlock, dqflock */ 1871 /* lockorder: hashchainlock, freelistlock, mplistlock, dqlock, dqflock */
1871startagain: 1872again:
1873 startagain = 0;
1872 mutex_lock(&xfs_Gqm->qm_dqfrlist_lock); 1874 mutex_lock(&xfs_Gqm->qm_dqfrlist_lock);
1873 1875
1874 list_for_each_entry(dqp, &xfs_Gqm->qm_dqfrlist, q_freelist) { 1876 list_for_each_entry(dqp, &xfs_Gqm->qm_dqfrlist, q_freelist) {
@@ -1885,13 +1887,10 @@ startagain:
1885 ASSERT(! (dqp->dq_flags & XFS_DQ_INACTIVE)); 1887 ASSERT(! (dqp->dq_flags & XFS_DQ_INACTIVE));
1886 1888
1887 trace_xfs_dqreclaim_want(dqp); 1889 trace_xfs_dqreclaim_want(dqp);
1888
1889 xfs_dqunlock(dqp);
1890 mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
1891 if (++restarts >= XFS_QM_RECLAIM_MAX_RESTARTS)
1892 return NULL;
1893 XQM_STATS_INC(xqmstats.xs_qm_dqwants); 1890 XQM_STATS_INC(xqmstats.xs_qm_dqwants);
1894 goto startagain; 1891 restarts++;
1892 startagain = 1;
1893 goto dqunlock;
1895 } 1894 }
1896 1895
1897 /* 1896 /*
@@ -1906,23 +1905,20 @@ startagain:
1906 ASSERT(list_empty(&dqp->q_mplist)); 1905 ASSERT(list_empty(&dqp->q_mplist));
1907 list_del_init(&dqp->q_freelist); 1906 list_del_init(&dqp->q_freelist);
1908 xfs_Gqm->qm_dqfrlist_cnt--; 1907 xfs_Gqm->qm_dqfrlist_cnt--;
1909 xfs_dqunlock(dqp);
1910 dqpout = dqp; 1908 dqpout = dqp;
1911 XQM_STATS_INC(xqmstats.xs_qm_dqinact_reclaims); 1909 XQM_STATS_INC(xqmstats.xs_qm_dqinact_reclaims);
1912 break; 1910 goto dqunlock;
1913 } 1911 }
1914 1912
1915 ASSERT(dqp->q_hash); 1913 ASSERT(dqp->q_hash);
1916 ASSERT(!list_empty(&dqp->q_mplist)); 1914 ASSERT(!list_empty(&dqp->q_mplist));
1917 1915
1918 /* 1916 /*
1919 * Try to grab the flush lock. If this dquot is in the process of 1917 * Try to grab the flush lock. If this dquot is in the process
1920 * getting flushed to disk, we don't want to reclaim it. 1918 * of getting flushed to disk, we don't want to reclaim it.
1921 */ 1919 */
1922 if (!xfs_dqflock_nowait(dqp)) { 1920 if (!xfs_dqflock_nowait(dqp))
1923 xfs_dqunlock(dqp); 1921 goto dqunlock;
1924 continue;
1925 }
1926 1922
1927 /* 1923 /*
1928 * We have the flush lock so we know that this is not in the 1924 * We have the flush lock so we know that this is not in the
@@ -1944,8 +1940,7 @@ startagain:
1944 xfs_fs_cmn_err(CE_WARN, mp, 1940 xfs_fs_cmn_err(CE_WARN, mp,
1945 "xfs_qm_dqreclaim: dquot %p flush failed", dqp); 1941 "xfs_qm_dqreclaim: dquot %p flush failed", dqp);
1946 } 1942 }
1947 xfs_dqunlock(dqp); /* dqflush unlocks dqflock */ 1943 goto dqunlock;
1948 continue;
1949 } 1944 }
1950 1945
1951 /* 1946 /*
@@ -1967,13 +1962,8 @@ startagain:
1967 */ 1962 */
1968 if (!mutex_trylock(&mp->m_quotainfo->qi_dqlist_lock)) { 1963 if (!mutex_trylock(&mp->m_quotainfo->qi_dqlist_lock)) {
1969 restarts++; 1964 restarts++;
1970 mutex_unlock(&dqp->q_hash->qh_lock); 1965 startagain = 1;
1971 xfs_dqfunlock(dqp); 1966 goto qhunlock;
1972 xfs_dqunlock(dqp);
1973 mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
1974 if (restarts++ >= XFS_QM_RECLAIM_MAX_RESTARTS)
1975 return NULL;
1976 goto startagain;
1977 } 1967 }
1978 1968
1979 ASSERT(dqp->q_nrefs == 0); 1969 ASSERT(dqp->q_nrefs == 0);
@@ -1986,14 +1976,20 @@ startagain:
1986 xfs_Gqm->qm_dqfrlist_cnt--; 1976 xfs_Gqm->qm_dqfrlist_cnt--;
1987 dqpout = dqp; 1977 dqpout = dqp;
1988 mutex_unlock(&mp->m_quotainfo->qi_dqlist_lock); 1978 mutex_unlock(&mp->m_quotainfo->qi_dqlist_lock);
1979qhunlock:
1989 mutex_unlock(&dqp->q_hash->qh_lock); 1980 mutex_unlock(&dqp->q_hash->qh_lock);
1990dqfunlock: 1981dqfunlock:
1991 xfs_dqfunlock(dqp); 1982 xfs_dqfunlock(dqp);
1983dqunlock:
1992 xfs_dqunlock(dqp); 1984 xfs_dqunlock(dqp);
1993 if (dqpout) 1985 if (dqpout)
1994 break; 1986 break;
1995 if (restarts >= XFS_QM_RECLAIM_MAX_RESTARTS) 1987 if (restarts >= XFS_QM_RECLAIM_MAX_RESTARTS)
1996 return NULL; 1988 break;
1989 if (startagain) {
1990 mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
1991 goto again;
1992 }
1997 } 1993 }
1998 mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock); 1994 mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
1999 return dqpout; 1995 return dqpout;
diff --git a/fs/xfs/support/debug.c b/fs/xfs/support/debug.c
index 975aa10e1a4..0df88897ef8 100644
--- a/fs/xfs/support/debug.c
+++ b/fs/xfs/support/debug.c
@@ -25,86 +25,78 @@
25#include "xfs_mount.h" 25#include "xfs_mount.h"
26#include "xfs_error.h" 26#include "xfs_error.h"
27 27
28static char message[1024]; /* keep it off the stack */
29static DEFINE_SPINLOCK(xfs_err_lock);
30
31/* Translate from CE_FOO to KERN_FOO, err_level(CE_FOO) == KERN_FOO */
32#define XFS_MAX_ERR_LEVEL 7
33#define XFS_ERR_MASK ((1 << 3) - 1)
34static const char * const err_level[XFS_MAX_ERR_LEVEL+1] =
35 {KERN_EMERG, KERN_ALERT, KERN_CRIT,
36 KERN_ERR, KERN_WARNING, KERN_NOTICE,
37 KERN_INFO, KERN_DEBUG};
38
39void 28void
40cmn_err(register int level, char *fmt, ...) 29cmn_err(
30 const char *lvl,
31 const char *fmt,
32 ...)
41{ 33{
42 char *fp = fmt; 34 struct va_format vaf;
43 int len; 35 va_list args;
44 ulong flags; 36
45 va_list ap; 37 va_start(args, fmt);
46 38 vaf.fmt = fmt;
47 level &= XFS_ERR_MASK; 39 vaf.va = &args;
48 if (level > XFS_MAX_ERR_LEVEL) 40
49 level = XFS_MAX_ERR_LEVEL; 41 printk("%s%pV", lvl, &vaf);
50 spin_lock_irqsave(&xfs_err_lock,flags); 42 va_end(args);
51 va_start(ap, fmt); 43
52 if (*fmt == '!') fp++; 44 BUG_ON(strncmp(lvl, KERN_EMERG, strlen(KERN_EMERG)) == 0);
53 len = vsnprintf(message, sizeof(message), fp, ap);
54 if (len >= sizeof(message))
55 len = sizeof(message) - 1;
56 if (message[len-1] == '\n')
57 message[len-1] = 0;
58 printk("%s%s\n", err_level[level], message);
59 va_end(ap);
60 spin_unlock_irqrestore(&xfs_err_lock,flags);
61 BUG_ON(level == CE_PANIC);
62} 45}
63 46
64void 47void
65xfs_fs_vcmn_err( 48xfs_fs_cmn_err(
66 int level, 49 const char *lvl,
67 struct xfs_mount *mp, 50 struct xfs_mount *mp,
68 char *fmt, 51 const char *fmt,
69 va_list ap) 52 ...)
70{ 53{
71 unsigned long flags; 54 struct va_format vaf;
72 int len = 0; 55 va_list args;
73 56
74 level &= XFS_ERR_MASK; 57 va_start(args, fmt);
75 if (level > XFS_MAX_ERR_LEVEL) 58 vaf.fmt = fmt;
76 level = XFS_MAX_ERR_LEVEL; 59 vaf.va = &args;
77 60
78 spin_lock_irqsave(&xfs_err_lock,flags); 61 printk("%sFilesystem %s: %pV", lvl, mp->m_fsname, &vaf);
62 va_end(args);
79 63
80 if (mp) { 64 BUG_ON(strncmp(lvl, KERN_EMERG, strlen(KERN_EMERG)) == 0);
81 len = sprintf(message, "Filesystem \"%s\": ", mp->m_fsname); 65}
66
67/* All callers to xfs_cmn_err use CE_ALERT, so don't bother testing lvl */
68void
69xfs_cmn_err(
70 int panic_tag,
71 const char *lvl,
72 struct xfs_mount *mp,
73 const char *fmt,
74 ...)
75{
76 struct va_format vaf;
77 va_list args;
78 int do_panic = 0;
82 79
83 /* 80 if (xfs_panic_mask && (xfs_panic_mask & panic_tag)) {
84 * Skip the printk if we can't print anything useful 81 printk(KERN_ALERT "XFS: Transforming an alert into a BUG.");
85 * due to an over-long device name. 82 do_panic = 1;
86 */
87 if (len >= sizeof(message))
88 goto out;
89 } 83 }
90 84
91 len = vsnprintf(message + len, sizeof(message) - len, fmt, ap); 85 va_start(args, fmt);
92 if (len >= sizeof(message)) 86 vaf.fmt = fmt;
93 len = sizeof(message) - 1; 87 vaf.va = &args;
94 if (message[len-1] == '\n')
95 message[len-1] = 0;
96 88
97 printk("%s%s\n", err_level[level], message); 89 printk(KERN_ALERT "Filesystem %s: %pV", mp->m_fsname, &vaf);
98 out: 90 va_end(args);
99 spin_unlock_irqrestore(&xfs_err_lock,flags);
100 91
101 BUG_ON(level == CE_PANIC); 92 BUG_ON(do_panic);
102} 93}
103 94
104void 95void
105assfail(char *expr, char *file, int line) 96assfail(char *expr, char *file, int line)
106{ 97{
107 printk("Assertion failed: %s, file: %s, line: %d\n", expr, file, line); 98 printk(KERN_CRIT "Assertion failed: %s, file: %s, line: %d\n", expr,
99 file, line);
108 BUG(); 100 BUG();
109} 101}
110 102
diff --git a/fs/xfs/support/debug.h b/fs/xfs/support/debug.h
index d2d20462fd4..05699f67d47 100644
--- a/fs/xfs/support/debug.h
+++ b/fs/xfs/support/debug.h
@@ -20,15 +20,22 @@
20 20
21#include <stdarg.h> 21#include <stdarg.h>
22 22
23#define CE_DEBUG 7 /* debug */ 23struct xfs_mount;
24#define CE_CONT 6 /* continuation */ 24
25#define CE_NOTE 5 /* notice */ 25#define CE_DEBUG KERN_DEBUG
26#define CE_WARN 4 /* warning */ 26#define CE_CONT KERN_INFO
27#define CE_ALERT 1 /* alert */ 27#define CE_NOTE KERN_NOTICE
28#define CE_PANIC 0 /* panic */ 28#define CE_WARN KERN_WARNING
29 29#define CE_ALERT KERN_ALERT
30extern void cmn_err(int, char *, ...) 30#define CE_PANIC KERN_EMERG
31 __attribute__ ((format (printf, 2, 3))); 31
32void cmn_err(const char *lvl, const char *fmt, ...)
33 __attribute__ ((format (printf, 2, 3)));
34void xfs_fs_cmn_err( const char *lvl, struct xfs_mount *mp,
35 const char *fmt, ...) __attribute__ ((format (printf, 3, 4)));
36void xfs_cmn_err( int panic_tag, const char *lvl, struct xfs_mount *mp,
37 const char *fmt, ...) __attribute__ ((format (printf, 4, 5)));
38
32extern void assfail(char *expr, char *f, int l); 39extern void assfail(char *expr, char *f, int l);
33 40
34#define ASSERT_ALWAYS(expr) \ 41#define ASSERT_ALWAYS(expr) \
diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h
index 0135e2a669d..11dd72070cb 100644
--- a/fs/xfs/xfs_acl.h
+++ b/fs/xfs/xfs_acl.h
@@ -42,7 +42,7 @@ struct xfs_acl {
42#define SGI_ACL_DEFAULT_SIZE (sizeof(SGI_ACL_DEFAULT)-1) 42#define SGI_ACL_DEFAULT_SIZE (sizeof(SGI_ACL_DEFAULT)-1)
43 43
44#ifdef CONFIG_XFS_POSIX_ACL 44#ifdef CONFIG_XFS_POSIX_ACL
45extern int xfs_check_acl(struct inode *inode, int mask); 45extern int xfs_check_acl(struct inode *inode, int mask, unsigned int flags);
46extern struct posix_acl *xfs_get_acl(struct inode *inode, int type); 46extern struct posix_acl *xfs_get_acl(struct inode *inode, int type);
47extern int xfs_inherit_acl(struct inode *inode, struct posix_acl *default_acl); 47extern int xfs_inherit_acl(struct inode *inode, struct posix_acl *default_acl);
48extern int xfs_acl_chmod(struct inode *inode); 48extern int xfs_acl_chmod(struct inode *inode);
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index 63c7a1a6c02..58632cc17f2 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -227,7 +227,7 @@ typedef struct xfs_perag {
227 227
228 atomic_t pagf_fstrms; /* # of filestreams active in this AG */ 228 atomic_t pagf_fstrms; /* # of filestreams active in this AG */
229 229
230 rwlock_t pag_ici_lock; /* incore inode lock */ 230 spinlock_t pag_ici_lock; /* incore inode cache lock */
231 struct radix_tree_root pag_ici_root; /* incore inode cache root */ 231 struct radix_tree_root pag_ici_root; /* incore inode cache root */
232 int pag_ici_reclaimable; /* reclaimable inodes */ 232 int pag_ici_reclaimable; /* reclaimable inodes */
233 struct mutex pag_ici_reclaim_lock; /* serialisation point */ 233 struct mutex pag_ici_reclaim_lock; /* serialisation point */
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index 112abc439ca..f3227984a9b 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -41,10 +41,6 @@
41#define XFSA_FIXUP_BNO_OK 1 41#define XFSA_FIXUP_BNO_OK 1
42#define XFSA_FIXUP_CNT_OK 2 42#define XFSA_FIXUP_CNT_OK 2
43 43
44static int
45xfs_alloc_busy_search(struct xfs_mount *mp, xfs_agnumber_t agno,
46 xfs_agblock_t bno, xfs_extlen_t len);
47
48/* 44/*
49 * Prototypes for per-ag allocation routines 45 * Prototypes for per-ag allocation routines
50 */ 46 */
@@ -94,7 +90,7 @@ xfs_alloc_lookup_ge(
94 * Lookup the first record less than or equal to [bno, len] 90 * Lookup the first record less than or equal to [bno, len]
95 * in the btree given by cur. 91 * in the btree given by cur.
96 */ 92 */
97STATIC int /* error */ 93int /* error */
98xfs_alloc_lookup_le( 94xfs_alloc_lookup_le(
99 struct xfs_btree_cur *cur, /* btree cursor */ 95 struct xfs_btree_cur *cur, /* btree cursor */
100 xfs_agblock_t bno, /* starting block of extent */ 96 xfs_agblock_t bno, /* starting block of extent */
@@ -127,7 +123,7 @@ xfs_alloc_update(
127/* 123/*
128 * Get the data from the pointed-to record. 124 * Get the data from the pointed-to record.
129 */ 125 */
130STATIC int /* error */ 126int /* error */
131xfs_alloc_get_rec( 127xfs_alloc_get_rec(
132 struct xfs_btree_cur *cur, /* btree cursor */ 128 struct xfs_btree_cur *cur, /* btree cursor */
133 xfs_agblock_t *bno, /* output: starting block of extent */ 129 xfs_agblock_t *bno, /* output: starting block of extent */
@@ -577,61 +573,58 @@ xfs_alloc_ag_vextent_exact(
577 xfs_extlen_t rlen; /* length of returned extent */ 573 xfs_extlen_t rlen; /* length of returned extent */
578 574
579 ASSERT(args->alignment == 1); 575 ASSERT(args->alignment == 1);
576
580 /* 577 /*
581 * Allocate/initialize a cursor for the by-number freespace btree. 578 * Allocate/initialize a cursor for the by-number freespace btree.
582 */ 579 */
583 bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp, 580 bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
584 args->agno, XFS_BTNUM_BNO); 581 args->agno, XFS_BTNUM_BNO);
582
585 /* 583 /*
586 * Lookup bno and minlen in the btree (minlen is irrelevant, really). 584 * Lookup bno and minlen in the btree (minlen is irrelevant, really).
587 * Look for the closest free block <= bno, it must contain bno 585 * Look for the closest free block <= bno, it must contain bno
588 * if any free block does. 586 * if any free block does.
589 */ 587 */
590 if ((error = xfs_alloc_lookup_le(bno_cur, args->agbno, args->minlen, &i))) 588 error = xfs_alloc_lookup_le(bno_cur, args->agbno, args->minlen, &i);
589 if (error)
591 goto error0; 590 goto error0;
592 if (!i) { 591 if (!i)
593 /* 592 goto not_found;
594 * Didn't find it, return null. 593
595 */
596 xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
597 args->agbno = NULLAGBLOCK;
598 return 0;
599 }
600 /* 594 /*
601 * Grab the freespace record. 595 * Grab the freespace record.
602 */ 596 */
603 if ((error = xfs_alloc_get_rec(bno_cur, &fbno, &flen, &i))) 597 error = xfs_alloc_get_rec(bno_cur, &fbno, &flen, &i);
598 if (error)
604 goto error0; 599 goto error0;
605 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 600 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
606 ASSERT(fbno <= args->agbno); 601 ASSERT(fbno <= args->agbno);
607 minend = args->agbno + args->minlen; 602 minend = args->agbno + args->minlen;
608 maxend = args->agbno + args->maxlen; 603 maxend = args->agbno + args->maxlen;
609 fend = fbno + flen; 604 fend = fbno + flen;
605
610 /* 606 /*
611 * Give up if the freespace isn't long enough for the minimum request. 607 * Give up if the freespace isn't long enough for the minimum request.
612 */ 608 */
613 if (fend < minend) { 609 if (fend < minend)
614 xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR); 610 goto not_found;
615 args->agbno = NULLAGBLOCK; 611
616 return 0;
617 }
618 /* 612 /*
619 * End of extent will be smaller of the freespace end and the 613 * End of extent will be smaller of the freespace end and the
620 * maximal requested end. 614 * maximal requested end.
621 */ 615 *
622 end = XFS_AGBLOCK_MIN(fend, maxend);
623 /*
624 * Fix the length according to mod and prod if given. 616 * Fix the length according to mod and prod if given.
625 */ 617 */
618 end = XFS_AGBLOCK_MIN(fend, maxend);
626 args->len = end - args->agbno; 619 args->len = end - args->agbno;
627 xfs_alloc_fix_len(args); 620 xfs_alloc_fix_len(args);
628 if (!xfs_alloc_fix_minleft(args)) { 621 if (!xfs_alloc_fix_minleft(args))
629 xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR); 622 goto not_found;
630 return 0; 623
631 }
632 rlen = args->len; 624 rlen = args->len;
633 ASSERT(args->agbno + rlen <= fend); 625 ASSERT(args->agbno + rlen <= fend);
634 end = args->agbno + rlen; 626 end = args->agbno + rlen;
627
635 /* 628 /*
636 * We are allocating agbno for rlen [agbno .. end] 629 * We are allocating agbno for rlen [agbno .. end]
637 * Allocate/initialize a cursor for the by-size btree. 630 * Allocate/initialize a cursor for the by-size btree.
@@ -640,16 +633,25 @@ xfs_alloc_ag_vextent_exact(
640 args->agno, XFS_BTNUM_CNT); 633 args->agno, XFS_BTNUM_CNT);
641 ASSERT(args->agbno + args->len <= 634 ASSERT(args->agbno + args->len <=
642 be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length)); 635 be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
643 if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, 636 error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, args->agbno,
644 args->agbno, args->len, XFSA_FIXUP_BNO_OK))) { 637 args->len, XFSA_FIXUP_BNO_OK);
638 if (error) {
645 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR); 639 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
646 goto error0; 640 goto error0;
647 } 641 }
642
648 xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR); 643 xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
649 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); 644 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
650 645
651 trace_xfs_alloc_exact_done(args);
652 args->wasfromfl = 0; 646 args->wasfromfl = 0;
647 trace_xfs_alloc_exact_done(args);
648 return 0;
649
650not_found:
651 /* Didn't find it, return null. */
652 xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
653 args->agbno = NULLAGBLOCK;
654 trace_xfs_alloc_exact_notfound(args);
653 return 0; 655 return 0;
654 656
655error0: 657error0:
@@ -659,6 +661,95 @@ error0:
659} 661}
660 662
661/* 663/*
664 * Search the btree in a given direction via the search cursor and compare
665 * the records found against the good extent we've already found.
666 */
667STATIC int
668xfs_alloc_find_best_extent(
669 struct xfs_alloc_arg *args, /* allocation argument structure */
670 struct xfs_btree_cur **gcur, /* good cursor */
671 struct xfs_btree_cur **scur, /* searching cursor */
672 xfs_agblock_t gdiff, /* difference for search comparison */
673 xfs_agblock_t *sbno, /* extent found by search */
674 xfs_extlen_t *slen,
675 xfs_extlen_t *slena, /* aligned length */
676 int dir) /* 0 = search right, 1 = search left */
677{
678 xfs_agblock_t bno;
679 xfs_agblock_t new;
680 xfs_agblock_t sdiff;
681 int error;
682 int i;
683
684 /* The good extent is perfect, no need to search. */
685 if (!gdiff)
686 goto out_use_good;
687
688 /*
689 * Look until we find a better one, run out of space or run off the end.
690 */
691 do {
692 error = xfs_alloc_get_rec(*scur, sbno, slen, &i);
693 if (error)
694 goto error0;
695 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
696 xfs_alloc_compute_aligned(*sbno, *slen, args->alignment,
697 args->minlen, &bno, slena);
698
699 /*
700 * The good extent is closer than this one.
701 */
702 if (!dir) {
703 if (bno >= args->agbno + gdiff)
704 goto out_use_good;
705 } else {
706 if (bno <= args->agbno - gdiff)
707 goto out_use_good;
708 }
709
710 /*
711 * Same distance, compare length and pick the best.
712 */
713 if (*slena >= args->minlen) {
714 args->len = XFS_EXTLEN_MIN(*slena, args->maxlen);
715 xfs_alloc_fix_len(args);
716
717 sdiff = xfs_alloc_compute_diff(args->agbno, args->len,
718 args->alignment, *sbno,
719 *slen, &new);
720
721 /*
722 * Choose closer size and invalidate other cursor.
723 */
724 if (sdiff < gdiff)
725 goto out_use_search;
726 goto out_use_good;
727 }
728
729 if (!dir)
730 error = xfs_btree_increment(*scur, 0, &i);
731 else
732 error = xfs_btree_decrement(*scur, 0, &i);
733 if (error)
734 goto error0;
735 } while (i);
736
737out_use_good:
738 xfs_btree_del_cursor(*scur, XFS_BTREE_NOERROR);
739 *scur = NULL;
740 return 0;
741
742out_use_search:
743 xfs_btree_del_cursor(*gcur, XFS_BTREE_NOERROR);
744 *gcur = NULL;
745 return 0;
746
747error0:
748 /* caller invalidates cursors */
749 return error;
750}
751
752/*
662 * Allocate a variable extent near bno in the allocation group agno. 753 * Allocate a variable extent near bno in the allocation group agno.
663 * Extent's length (returned in len) will be between minlen and maxlen, 754 * Extent's length (returned in len) will be between minlen and maxlen,
664 * and of the form k * prod + mod unless there's nothing that large. 755 * and of the form k * prod + mod unless there's nothing that large.
@@ -925,203 +1016,45 @@ xfs_alloc_ag_vextent_near(
925 } 1016 }
926 } 1017 }
927 } while (bno_cur_lt || bno_cur_gt); 1018 } while (bno_cur_lt || bno_cur_gt);
1019
928 /* 1020 /*
929 * Got both cursors still active, need to find better entry. 1021 * Got both cursors still active, need to find better entry.
930 */ 1022 */
931 if (bno_cur_lt && bno_cur_gt) { 1023 if (bno_cur_lt && bno_cur_gt) {
932 /*
933 * Left side is long enough, look for a right side entry.
934 */
935 if (ltlena >= args->minlen) { 1024 if (ltlena >= args->minlen) {
936 /* 1025 /*
937 * Fix up the length. 1026 * Left side is good, look for a right side entry.
938 */ 1027 */
939 args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen); 1028 args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen);
940 xfs_alloc_fix_len(args); 1029 xfs_alloc_fix_len(args);
941 rlen = args->len; 1030 ltdiff = xfs_alloc_compute_diff(args->agbno, args->len,
942 ltdiff = xfs_alloc_compute_diff(args->agbno, rlen,
943 args->alignment, ltbno, ltlen, &ltnew); 1031 args->alignment, ltbno, ltlen, &ltnew);
1032
1033 error = xfs_alloc_find_best_extent(args,
1034 &bno_cur_lt, &bno_cur_gt,
1035 ltdiff, &gtbno, &gtlen, &gtlena,
1036 0 /* search right */);
1037 } else {
1038 ASSERT(gtlena >= args->minlen);
1039
944 /* 1040 /*
945 * Not perfect. 1041 * Right side is good, look for a left side entry.
946 */
947 if (ltdiff) {
948 /*
949 * Look until we find a better one, run out of
950 * space, or run off the end.
951 */
952 while (bno_cur_lt && bno_cur_gt) {
953 if ((error = xfs_alloc_get_rec(
954 bno_cur_gt, &gtbno,
955 &gtlen, &i)))
956 goto error0;
957 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
958 xfs_alloc_compute_aligned(gtbno, gtlen,
959 args->alignment, args->minlen,
960 &gtbnoa, &gtlena);
961 /*
962 * The left one is clearly better.
963 */
964 if (gtbnoa >= args->agbno + ltdiff) {
965 xfs_btree_del_cursor(
966 bno_cur_gt,
967 XFS_BTREE_NOERROR);
968 bno_cur_gt = NULL;
969 break;
970 }
971 /*
972 * If we reach a big enough entry,
973 * compare the two and pick the best.
974 */
975 if (gtlena >= args->minlen) {
976 args->len =
977 XFS_EXTLEN_MIN(gtlena,
978 args->maxlen);
979 xfs_alloc_fix_len(args);
980 rlen = args->len;
981 gtdiff = xfs_alloc_compute_diff(
982 args->agbno, rlen,
983 args->alignment,
984 gtbno, gtlen, &gtnew);
985 /*
986 * Right side is better.
987 */
988 if (gtdiff < ltdiff) {
989 xfs_btree_del_cursor(
990 bno_cur_lt,
991 XFS_BTREE_NOERROR);
992 bno_cur_lt = NULL;
993 }
994 /*
995 * Left side is better.
996 */
997 else {
998 xfs_btree_del_cursor(
999 bno_cur_gt,
1000 XFS_BTREE_NOERROR);
1001 bno_cur_gt = NULL;
1002 }
1003 break;
1004 }
1005 /*
1006 * Fell off the right end.
1007 */
1008 if ((error = xfs_btree_increment(
1009 bno_cur_gt, 0, &i)))
1010 goto error0;
1011 if (!i) {
1012 xfs_btree_del_cursor(
1013 bno_cur_gt,
1014 XFS_BTREE_NOERROR);
1015 bno_cur_gt = NULL;
1016 break;
1017 }
1018 }
1019 }
1020 /*
1021 * The left side is perfect, trash the right side.
1022 */
1023 else {
1024 xfs_btree_del_cursor(bno_cur_gt,
1025 XFS_BTREE_NOERROR);
1026 bno_cur_gt = NULL;
1027 }
1028 }
1029 /*
1030 * It's the right side that was found first, look left.
1031 */
1032 else {
1033 /*
1034 * Fix up the length.
1035 */ 1042 */
1036 args->len = XFS_EXTLEN_MIN(gtlena, args->maxlen); 1043 args->len = XFS_EXTLEN_MIN(gtlena, args->maxlen);
1037 xfs_alloc_fix_len(args); 1044 xfs_alloc_fix_len(args);
1038 rlen = args->len; 1045 gtdiff = xfs_alloc_compute_diff(args->agbno, args->len,
1039 gtdiff = xfs_alloc_compute_diff(args->agbno, rlen,
1040 args->alignment, gtbno, gtlen, &gtnew); 1046 args->alignment, gtbno, gtlen, &gtnew);
1041 /* 1047
1042 * Right side entry isn't perfect. 1048 error = xfs_alloc_find_best_extent(args,
1043 */ 1049 &bno_cur_gt, &bno_cur_lt,
1044 if (gtdiff) { 1050 gtdiff, &ltbno, &ltlen, &ltlena,
1045 /* 1051 1 /* search left */);
1046 * Look until we find a better one, run out of
1047 * space, or run off the end.
1048 */
1049 while (bno_cur_lt && bno_cur_gt) {
1050 if ((error = xfs_alloc_get_rec(
1051 bno_cur_lt, &ltbno,
1052 &ltlen, &i)))
1053 goto error0;
1054 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1055 xfs_alloc_compute_aligned(ltbno, ltlen,
1056 args->alignment, args->minlen,
1057 &ltbnoa, &ltlena);
1058 /*
1059 * The right one is clearly better.
1060 */
1061 if (ltbnoa <= args->agbno - gtdiff) {
1062 xfs_btree_del_cursor(
1063 bno_cur_lt,
1064 XFS_BTREE_NOERROR);
1065 bno_cur_lt = NULL;
1066 break;
1067 }
1068 /*
1069 * If we reach a big enough entry,
1070 * compare the two and pick the best.
1071 */
1072 if (ltlena >= args->minlen) {
1073 args->len = XFS_EXTLEN_MIN(
1074 ltlena, args->maxlen);
1075 xfs_alloc_fix_len(args);
1076 rlen = args->len;
1077 ltdiff = xfs_alloc_compute_diff(
1078 args->agbno, rlen,
1079 args->alignment,
1080 ltbno, ltlen, &ltnew);
1081 /*
1082 * Left side is better.
1083 */
1084 if (ltdiff < gtdiff) {
1085 xfs_btree_del_cursor(
1086 bno_cur_gt,
1087 XFS_BTREE_NOERROR);
1088 bno_cur_gt = NULL;
1089 }
1090 /*
1091 * Right side is better.
1092 */
1093 else {
1094 xfs_btree_del_cursor(
1095 bno_cur_lt,
1096 XFS_BTREE_NOERROR);
1097 bno_cur_lt = NULL;
1098 }
1099 break;
1100 }
1101 /*
1102 * Fell off the left end.
1103 */
1104 if ((error = xfs_btree_decrement(
1105 bno_cur_lt, 0, &i)))
1106 goto error0;
1107 if (!i) {
1108 xfs_btree_del_cursor(bno_cur_lt,
1109 XFS_BTREE_NOERROR);
1110 bno_cur_lt = NULL;
1111 break;
1112 }
1113 }
1114 }
1115 /*
1116 * The right side is perfect, trash the left side.
1117 */
1118 else {
1119 xfs_btree_del_cursor(bno_cur_lt,
1120 XFS_BTREE_NOERROR);
1121 bno_cur_lt = NULL;
1122 }
1123 } 1052 }
1053
1054 if (error)
1055 goto error0;
1124 } 1056 }
1057
1125 /* 1058 /*
1126 * If we couldn't get anything, give up. 1059 * If we couldn't get anything, give up.
1127 */ 1060 */
@@ -1130,6 +1063,7 @@ xfs_alloc_ag_vextent_near(
1130 args->agbno = NULLAGBLOCK; 1063 args->agbno = NULLAGBLOCK;
1131 return 0; 1064 return 0;
1132 } 1065 }
1066
1133 /* 1067 /*
1134 * At this point we have selected a freespace entry, either to the 1068 * At this point we have selected a freespace entry, either to the
1135 * left or to the right. If it's on the right, copy all the 1069 * left or to the right. If it's on the right, copy all the
@@ -1146,6 +1080,7 @@ xfs_alloc_ag_vextent_near(
1146 j = 1; 1080 j = 1;
1147 } else 1081 } else
1148 j = 0; 1082 j = 0;
1083
1149 /* 1084 /*
1150 * Fix up the length and compute the useful address. 1085 * Fix up the length and compute the useful address.
1151 */ 1086 */
@@ -2676,7 +2611,7 @@ restart:
2676 * will require a synchronous transaction, but it can still be 2611 * will require a synchronous transaction, but it can still be
2677 * used to distinguish between a partial or exact match. 2612 * used to distinguish between a partial or exact match.
2678 */ 2613 */
2679static int 2614int
2680xfs_alloc_busy_search( 2615xfs_alloc_busy_search(
2681 struct xfs_mount *mp, 2616 struct xfs_mount *mp,
2682 xfs_agnumber_t agno, 2617 xfs_agnumber_t agno,
diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h
index 895009a9727..d0b3bc72005 100644
--- a/fs/xfs/xfs_alloc.h
+++ b/fs/xfs/xfs_alloc.h
@@ -19,6 +19,7 @@
19#define __XFS_ALLOC_H__ 19#define __XFS_ALLOC_H__
20 20
21struct xfs_buf; 21struct xfs_buf;
22struct xfs_btree_cur;
22struct xfs_mount; 23struct xfs_mount;
23struct xfs_perag; 24struct xfs_perag;
24struct xfs_trans; 25struct xfs_trans;
@@ -74,6 +75,22 @@ typedef unsigned int xfs_alloctype_t;
74#define XFS_ALLOC_SET_ASIDE(mp) (4 + ((mp)->m_sb.sb_agcount * 4)) 75#define XFS_ALLOC_SET_ASIDE(mp) (4 + ((mp)->m_sb.sb_agcount * 4))
75 76
76/* 77/*
78 * When deciding how much space to allocate out of an AG, we limit the
79 * allocation maximum size to the size of the AG. However, we cannot use all the
80 * blocks in the AG - some are permanently used by metadata. These
81 * blocks are generally:
82 * - the AG superblock, AGF, AGI and AGFL
83 * - the AGF (bno and cnt) and AGI btree root blocks
84 * - 4 blocks on the AGFL according to XFS_ALLOC_SET_ASIDE() limits
85 *
86 * The AG headers are sector sized, so the amount of space they take up is
87 * dependent on filesystem geometry. The others are all single blocks.
88 */
89#define XFS_ALLOC_AG_MAX_USABLE(mp) \
90 ((mp)->m_sb.sb_agblocks - XFS_BB_TO_FSB(mp, XFS_FSS_TO_BB(mp, 4)) - 7)
91
92
93/*
77 * Argument structure for xfs_alloc routines. 94 * Argument structure for xfs_alloc routines.
78 * This is turned into a structure to avoid having 20 arguments passed 95 * This is turned into a structure to avoid having 20 arguments passed
79 * down several levels of the stack. 96 * down several levels of the stack.
@@ -118,16 +135,16 @@ xfs_alloc_longest_free_extent(struct xfs_mount *mp,
118 struct xfs_perag *pag); 135 struct xfs_perag *pag);
119 136
120#ifdef __KERNEL__ 137#ifdef __KERNEL__
121
122void 138void
123xfs_alloc_busy_insert(xfs_trans_t *tp, 139xfs_alloc_busy_insert(struct xfs_trans *tp, xfs_agnumber_t agno,
124 xfs_agnumber_t agno, 140 xfs_agblock_t bno, xfs_extlen_t len);
125 xfs_agblock_t bno,
126 xfs_extlen_t len);
127 141
128void 142void
129xfs_alloc_busy_clear(struct xfs_mount *mp, struct xfs_busy_extent *busyp); 143xfs_alloc_busy_clear(struct xfs_mount *mp, struct xfs_busy_extent *busyp);
130 144
145int
146xfs_alloc_busy_search(struct xfs_mount *mp, xfs_agnumber_t agno,
147 xfs_agblock_t bno, xfs_extlen_t len);
131#endif /* __KERNEL__ */ 148#endif /* __KERNEL__ */
132 149
133/* 150/*
@@ -205,4 +222,18 @@ xfs_free_extent(
205 xfs_fsblock_t bno, /* starting block number of extent */ 222 xfs_fsblock_t bno, /* starting block number of extent */
206 xfs_extlen_t len); /* length of extent */ 223 xfs_extlen_t len); /* length of extent */
207 224
225int /* error */
226xfs_alloc_lookup_le(
227 struct xfs_btree_cur *cur, /* btree cursor */
228 xfs_agblock_t bno, /* starting block of extent */
229 xfs_extlen_t len, /* length of extent */
230 int *stat); /* success/failure */
231
232int /* error */
233xfs_alloc_get_rec(
234 struct xfs_btree_cur *cur, /* btree cursor */
235 xfs_agblock_t *bno, /* output: starting block of extent */
236 xfs_extlen_t *len, /* output: length of extent */
237 int *stat); /* output: success/failure */
238
208#endif /* __XFS_ALLOC_H__ */ 239#endif /* __XFS_ALLOC_H__ */
diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c
index a6cff8edcdb..71e90dc2aeb 100644
--- a/fs/xfs/xfs_attr_leaf.c
+++ b/fs/xfs/xfs_attr_leaf.c
@@ -637,7 +637,7 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context)
637 * It didn't all fit, so we have to sort everything on hashval. 637 * It didn't all fit, so we have to sort everything on hashval.
638 */ 638 */
639 sbsize = sf->hdr.count * sizeof(*sbuf); 639 sbsize = sf->hdr.count * sizeof(*sbuf);
640 sbp = sbuf = kmem_alloc(sbsize, KM_SLEEP); 640 sbp = sbuf = kmem_alloc(sbsize, KM_SLEEP | KM_NOFS);
641 641
642 /* 642 /*
643 * Scan the attribute list for the rest of the entries, storing 643 * Scan the attribute list for the rest of the entries, storing
@@ -2386,7 +2386,7 @@ xfs_attr_leaf_list_int(xfs_dabuf_t *bp, xfs_attr_list_context_t *context)
2386 args.dp = context->dp; 2386 args.dp = context->dp;
2387 args.whichfork = XFS_ATTR_FORK; 2387 args.whichfork = XFS_ATTR_FORK;
2388 args.valuelen = valuelen; 2388 args.valuelen = valuelen;
2389 args.value = kmem_alloc(valuelen, KM_SLEEP); 2389 args.value = kmem_alloc(valuelen, KM_SLEEP | KM_NOFS);
2390 args.rmtblkno = be32_to_cpu(name_rmt->valueblk); 2390 args.rmtblkno = be32_to_cpu(name_rmt->valueblk);
2391 args.rmtblkcnt = XFS_B_TO_FSB(args.dp->i_mount, valuelen); 2391 args.rmtblkcnt = XFS_B_TO_FSB(args.dp->i_mount, valuelen);
2392 retval = xfs_attr_rmtval_get(&args); 2392 retval = xfs_attr_rmtval_get(&args);
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 8abd12e32e1..dc3afd7739f 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -1038,17 +1038,34 @@ xfs_bmap_add_extent_delay_real(
1038 * Filling in the middle part of a previous delayed allocation. 1038 * Filling in the middle part of a previous delayed allocation.
1039 * Contiguity is impossible here. 1039 * Contiguity is impossible here.
1040 * This case is avoided almost all the time. 1040 * This case is avoided almost all the time.
1041 *
1042 * We start with a delayed allocation:
1043 *
1044 * +ddddddddddddddddddddddddddddddddddddddddddddddddddddddd+
1045 * PREV @ idx
1046 *
1047 * and we are allocating:
1048 * +rrrrrrrrrrrrrrrrr+
1049 * new
1050 *
1051 * and we set it up for insertion as:
1052 * +ddddddddddddddddddd+rrrrrrrrrrrrrrrrr+ddddddddddddddddd+
1053 * new
1054 * PREV @ idx LEFT RIGHT
1055 * inserted at idx + 1
1041 */ 1056 */
1042 temp = new->br_startoff - PREV.br_startoff; 1057 temp = new->br_startoff - PREV.br_startoff;
1043 trace_xfs_bmap_pre_update(ip, idx, 0, _THIS_IP_);
1044 xfs_bmbt_set_blockcount(ep, temp);
1045 r[0] = *new;
1046 r[1].br_state = PREV.br_state;
1047 r[1].br_startblock = 0;
1048 r[1].br_startoff = new_endoff;
1049 temp2 = PREV.br_startoff + PREV.br_blockcount - new_endoff; 1058 temp2 = PREV.br_startoff + PREV.br_blockcount - new_endoff;
1050 r[1].br_blockcount = temp2; 1059 trace_xfs_bmap_pre_update(ip, idx, 0, _THIS_IP_);
1051 xfs_iext_insert(ip, idx + 1, 2, &r[0], state); 1060 xfs_bmbt_set_blockcount(ep, temp); /* truncate PREV */
1061 LEFT = *new;
1062 RIGHT.br_state = PREV.br_state;
1063 RIGHT.br_startblock = nullstartblock(
1064 (int)xfs_bmap_worst_indlen(ip, temp2));
1065 RIGHT.br_startoff = new_endoff;
1066 RIGHT.br_blockcount = temp2;
1067 /* insert LEFT (r[0]) and RIGHT (r[1]) at the same time */
1068 xfs_iext_insert(ip, idx + 1, 2, &LEFT, state);
1052 ip->i_df.if_lastex = idx + 1; 1069 ip->i_df.if_lastex = idx + 1;
1053 ip->i_d.di_nextents++; 1070 ip->i_d.di_nextents++;
1054 if (cur == NULL) 1071 if (cur == NULL)
@@ -2430,7 +2447,7 @@ xfs_bmap_btalloc_nullfb(
2430 startag = ag = 0; 2447 startag = ag = 0;
2431 2448
2432 pag = xfs_perag_get(mp, ag); 2449 pag = xfs_perag_get(mp, ag);
2433 while (*blen < ap->alen) { 2450 while (*blen < args->maxlen) {
2434 if (!pag->pagf_init) { 2451 if (!pag->pagf_init) {
2435 error = xfs_alloc_pagf_init(mp, args->tp, ag, 2452 error = xfs_alloc_pagf_init(mp, args->tp, ag,
2436 XFS_ALLOC_FLAG_TRYLOCK); 2453 XFS_ALLOC_FLAG_TRYLOCK);
@@ -2452,7 +2469,7 @@ xfs_bmap_btalloc_nullfb(
2452 notinit = 1; 2469 notinit = 1;
2453 2470
2454 if (xfs_inode_is_filestream(ap->ip)) { 2471 if (xfs_inode_is_filestream(ap->ip)) {
2455 if (*blen >= ap->alen) 2472 if (*blen >= args->maxlen)
2456 break; 2473 break;
2457 2474
2458 if (ap->userdata) { 2475 if (ap->userdata) {
@@ -2498,14 +2515,14 @@ xfs_bmap_btalloc_nullfb(
2498 * If the best seen length is less than the request 2515 * If the best seen length is less than the request
2499 * length, use the best as the minimum. 2516 * length, use the best as the minimum.
2500 */ 2517 */
2501 else if (*blen < ap->alen) 2518 else if (*blen < args->maxlen)
2502 args->minlen = *blen; 2519 args->minlen = *blen;
2503 /* 2520 /*
2504 * Otherwise we've seen an extent as big as alen, 2521 * Otherwise we've seen an extent as big as maxlen,
2505 * use that as the minimum. 2522 * use that as the minimum.
2506 */ 2523 */
2507 else 2524 else
2508 args->minlen = ap->alen; 2525 args->minlen = args->maxlen;
2509 2526
2510 /* 2527 /*
2511 * set the failure fallback case to look in the selected 2528 * set the failure fallback case to look in the selected
@@ -2573,7 +2590,9 @@ xfs_bmap_btalloc(
2573 args.tp = ap->tp; 2590 args.tp = ap->tp;
2574 args.mp = mp; 2591 args.mp = mp;
2575 args.fsbno = ap->rval; 2592 args.fsbno = ap->rval;
2576 args.maxlen = MIN(ap->alen, mp->m_sb.sb_agblocks); 2593
2594 /* Trim the allocation back to the maximum an AG can fit. */
2595 args.maxlen = MIN(ap->alen, XFS_ALLOC_AG_MAX_USABLE(mp));
2577 args.firstblock = ap->firstblock; 2596 args.firstblock = ap->firstblock;
2578 blen = 0; 2597 blen = 0;
2579 if (nullfb) { 2598 if (nullfb) {
@@ -2621,7 +2640,7 @@ xfs_bmap_btalloc(
2621 /* 2640 /*
2622 * Adjust for alignment 2641 * Adjust for alignment
2623 */ 2642 */
2624 if (blen > args.alignment && blen <= ap->alen) 2643 if (blen > args.alignment && blen <= args.maxlen)
2625 args.minlen = blen - args.alignment; 2644 args.minlen = blen - args.alignment;
2626 args.minalignslop = 0; 2645 args.minalignslop = 0;
2627 } else { 2646 } else {
@@ -2640,7 +2659,7 @@ xfs_bmap_btalloc(
2640 * of minlen+alignment+slop doesn't go up 2659 * of minlen+alignment+slop doesn't go up
2641 * between the calls. 2660 * between the calls.
2642 */ 2661 */
2643 if (blen > mp->m_dalign && blen <= ap->alen) 2662 if (blen > mp->m_dalign && blen <= args.maxlen)
2644 nextminlen = blen - mp->m_dalign; 2663 nextminlen = blen - mp->m_dalign;
2645 else 2664 else
2646 nextminlen = args.minlen; 2665 nextminlen = args.minlen;
@@ -4485,6 +4504,16 @@ xfs_bmapi(
4485 /* Figure out the extent size, adjust alen */ 4504 /* Figure out the extent size, adjust alen */
4486 extsz = xfs_get_extsz_hint(ip); 4505 extsz = xfs_get_extsz_hint(ip);
4487 if (extsz) { 4506 if (extsz) {
4507 /*
4508 * make sure we don't exceed a single
4509 * extent length when we align the
4510 * extent by reducing length we are
4511 * going to allocate by the maximum
4512 * amount extent size aligment may
4513 * require.
4514 */
4515 alen = XFS_FILBLKS_MIN(len,
4516 MAXEXTLEN - (2 * extsz - 1));
4488 error = xfs_bmap_extsize_align(mp, 4517 error = xfs_bmap_extsize_align(mp,
4489 &got, &prev, extsz, 4518 &got, &prev, extsz,
4490 rt, eof, 4519 rt, eof,
@@ -5471,8 +5500,13 @@ xfs_getbmap(
5471 if (error) 5500 if (error)
5472 goto out_unlock_iolock; 5501 goto out_unlock_iolock;
5473 } 5502 }
5474 5503 /*
5475 ASSERT(ip->i_delayed_blks == 0); 5504 * even after flushing the inode, there can still be delalloc
5505 * blocks on the inode beyond EOF due to speculative
5506 * preallocation. These are not removed until the release
5507 * function is called or the inode is inactivated. Hence we
5508 * cannot assert here that ip->i_delayed_blks == 0.
5509 */
5476 } 5510 }
5477 5511
5478 lock = xfs_ilock_map_shared(ip); 5512 lock = xfs_ilock_map_shared(ip);
@@ -6070,3 +6104,79 @@ xfs_bmap_disk_count_leaves(
6070 *count += xfs_bmbt_disk_get_blockcount(frp); 6104 *count += xfs_bmbt_disk_get_blockcount(frp);
6071 } 6105 }
6072} 6106}
6107
6108/*
6109 * dead simple method of punching delayed allocation blocks from a range in
6110 * the inode. Walks a block at a time so will be slow, but is only executed in
6111 * rare error cases so the overhead is not critical. This will always punch out
6112 * both the start and end blocks, even if the ranges only partially overlap
6113 * them, so it is up to the caller to ensure that partial blocks are not
6114 * passed in.
6115 */
6116int
6117xfs_bmap_punch_delalloc_range(
6118 struct xfs_inode *ip,
6119 xfs_fileoff_t start_fsb,
6120 xfs_fileoff_t length)
6121{
6122 xfs_fileoff_t remaining = length;
6123 int error = 0;
6124
6125 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
6126
6127 do {
6128 int done;
6129 xfs_bmbt_irec_t imap;
6130 int nimaps = 1;
6131 xfs_fsblock_t firstblock;
6132 xfs_bmap_free_t flist;
6133
6134 /*
6135 * Map the range first and check that it is a delalloc extent
6136 * before trying to unmap the range. Otherwise we will be
6137 * trying to remove a real extent (which requires a
6138 * transaction) or a hole, which is probably a bad idea...
6139 */
6140 error = xfs_bmapi(NULL, ip, start_fsb, 1,
6141 XFS_BMAPI_ENTIRE, NULL, 0, &imap,
6142 &nimaps, NULL);
6143
6144 if (error) {
6145 /* something screwed, just bail */
6146 if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
6147 xfs_fs_cmn_err(CE_ALERT, ip->i_mount,
6148 "Failed delalloc mapping lookup ino %lld fsb %lld.",
6149 ip->i_ino, start_fsb);
6150 }
6151 break;
6152 }
6153 if (!nimaps) {
6154 /* nothing there */
6155 goto next_block;
6156 }
6157 if (imap.br_startblock != DELAYSTARTBLOCK) {
6158 /* been converted, ignore */
6159 goto next_block;
6160 }
6161 WARN_ON(imap.br_blockcount == 0);
6162
6163 /*
6164 * Note: while we initialise the firstblock/flist pair, they
6165 * should never be used because blocks should never be
6166 * allocated or freed for a delalloc extent and hence we don't
6167 * need to cancel or finish them after the xfs_bunmapi() call.
6168 */
6169 xfs_bmap_init(&flist, &firstblock);
6170 error = xfs_bunmapi(NULL, ip, start_fsb, 1, 0, 1, &firstblock,
6171 &flist, &done);
6172 if (error)
6173 break;
6174
6175 ASSERT(!flist.xbf_count && !flist.xbf_first);
6176next_block:
6177 start_fsb++;
6178 remaining--;
6179 } while(remaining > 0);
6180
6181 return error;
6182}
diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h
index 71ec9b6ecdf..3651191daea 100644
--- a/fs/xfs/xfs_bmap.h
+++ b/fs/xfs/xfs_bmap.h
@@ -394,6 +394,11 @@ xfs_bmap_count_blocks(
394 int whichfork, 394 int whichfork,
395 int *count); 395 int *count);
396 396
397int
398xfs_bmap_punch_delalloc_range(
399 struct xfs_inode *ip,
400 xfs_fileoff_t start_fsb,
401 xfs_fileoff_t length);
397#endif /* __KERNEL__ */ 402#endif /* __KERNEL__ */
398 403
399#endif /* __XFS_BMAP_H__ */ 404#endif /* __XFS_BMAP_H__ */
diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c
index 04f9cca8da7..2f9e97c128a 100644
--- a/fs/xfs/xfs_btree.c
+++ b/fs/xfs/xfs_btree.c
@@ -634,9 +634,8 @@ xfs_btree_read_bufl(
634 return error; 634 return error;
635 } 635 }
636 ASSERT(!bp || !XFS_BUF_GETERROR(bp)); 636 ASSERT(!bp || !XFS_BUF_GETERROR(bp));
637 if (bp != NULL) { 637 if (bp)
638 XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, refval); 638 XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, refval);
639 }
640 *bpp = bp; 639 *bpp = bp;
641 return 0; 640 return 0;
642} 641}
@@ -944,13 +943,13 @@ xfs_btree_set_refs(
944 switch (cur->bc_btnum) { 943 switch (cur->bc_btnum) {
945 case XFS_BTNUM_BNO: 944 case XFS_BTNUM_BNO:
946 case XFS_BTNUM_CNT: 945 case XFS_BTNUM_CNT:
947 XFS_BUF_SET_VTYPE_REF(*bpp, B_FS_MAP, XFS_ALLOC_BTREE_REF); 946 XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, XFS_ALLOC_BTREE_REF);
948 break; 947 break;
949 case XFS_BTNUM_INO: 948 case XFS_BTNUM_INO:
950 XFS_BUF_SET_VTYPE_REF(*bpp, B_FS_INOMAP, XFS_INO_BTREE_REF); 949 XFS_BUF_SET_VTYPE_REF(bp, B_FS_INOMAP, XFS_INO_BTREE_REF);
951 break; 950 break;
952 case XFS_BTNUM_BMAP: 951 case XFS_BTNUM_BMAP:
953 XFS_BUF_SET_VTYPE_REF(*bpp, B_FS_MAP, XFS_BMAP_BTREE_REF); 952 XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, XFS_BMAP_BTREE_REF);
954 break; 953 break;
955 default: 954 default:
956 ASSERT(0); 955 ASSERT(0);
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 2686d0d54c5..6f8c21ce0d6 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -141,8 +141,7 @@ xfs_buf_item_log_check(
141#define xfs_buf_item_log_check(x) 141#define xfs_buf_item_log_check(x)
142#endif 142#endif
143 143
144STATIC void xfs_buf_error_relse(xfs_buf_t *bp); 144STATIC void xfs_buf_do_callbacks(struct xfs_buf *bp);
145STATIC void xfs_buf_do_callbacks(xfs_buf_t *bp, xfs_log_item_t *lip);
146 145
147/* 146/*
148 * This returns the number of log iovecs needed to log the 147 * This returns the number of log iovecs needed to log the
@@ -428,13 +427,15 @@ xfs_buf_item_unpin(
428 427
429 if (remove) { 428 if (remove) {
430 /* 429 /*
431 * We have to remove the log item from the transaction 430 * If we are in a transaction context, we have to
432 * as we are about to release our reference to the 431 * remove the log item from the transaction as we are
433 * buffer. If we don't, the unlock that occurs later 432 * about to release our reference to the buffer. If we
434 * in xfs_trans_uncommit() will ry to reference the 433 * don't, the unlock that occurs later in
434 * xfs_trans_uncommit() will try to reference the
435 * buffer which we no longer have a hold on. 435 * buffer which we no longer have a hold on.
436 */ 436 */
437 xfs_trans_del_item(lip); 437 if (lip->li_desc)
438 xfs_trans_del_item(lip);
438 439
439 /* 440 /*
440 * Since the transaction no longer refers to the buffer, 441 * Since the transaction no longer refers to the buffer,
@@ -450,7 +451,7 @@ xfs_buf_item_unpin(
450 * xfs_trans_ail_delete() drops the AIL lock. 451 * xfs_trans_ail_delete() drops the AIL lock.
451 */ 452 */
452 if (bip->bli_flags & XFS_BLI_STALE_INODE) { 453 if (bip->bli_flags & XFS_BLI_STALE_INODE) {
453 xfs_buf_do_callbacks(bp, (xfs_log_item_t *)bip); 454 xfs_buf_do_callbacks(bp);
454 XFS_BUF_SET_FSPRIVATE(bp, NULL); 455 XFS_BUF_SET_FSPRIVATE(bp, NULL);
455 XFS_BUF_CLR_IODONE_FUNC(bp); 456 XFS_BUF_CLR_IODONE_FUNC(bp);
456 } else { 457 } else {
@@ -918,15 +919,26 @@ xfs_buf_attach_iodone(
918 XFS_BUF_SET_IODONE_FUNC(bp, xfs_buf_iodone_callbacks); 919 XFS_BUF_SET_IODONE_FUNC(bp, xfs_buf_iodone_callbacks);
919} 920}
920 921
922/*
923 * We can have many callbacks on a buffer. Running the callbacks individually
924 * can cause a lot of contention on the AIL lock, so we allow for a single
925 * callback to be able to scan the remaining lip->li_bio_list for other items
926 * of the same type and callback to be processed in the first call.
927 *
928 * As a result, the loop walking the callback list below will also modify the
929 * list. It removes the first item from the list and then runs the callback.
930 * The loop then restarts from the new head of the list. This allows the
931 * callback to scan and modify the list attached to the buffer and we don't
932 * have to care about maintaining a next item pointer.
933 */
921STATIC void 934STATIC void
922xfs_buf_do_callbacks( 935xfs_buf_do_callbacks(
923 xfs_buf_t *bp, 936 struct xfs_buf *bp)
924 xfs_log_item_t *lip)
925{ 937{
926 xfs_log_item_t *nlip; 938 struct xfs_log_item *lip;
927 939
928 while (lip != NULL) { 940 while ((lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *)) != NULL) {
929 nlip = lip->li_bio_list; 941 XFS_BUF_SET_FSPRIVATE(bp, lip->li_bio_list);
930 ASSERT(lip->li_cb != NULL); 942 ASSERT(lip->li_cb != NULL);
931 /* 943 /*
932 * Clear the next pointer so we don't have any 944 * Clear the next pointer so we don't have any
@@ -936,7 +948,6 @@ xfs_buf_do_callbacks(
936 */ 948 */
937 lip->li_bio_list = NULL; 949 lip->li_bio_list = NULL;
938 lip->li_cb(bp, lip); 950 lip->li_cb(bp, lip);
939 lip = nlip;
940 } 951 }
941} 952}
942 953
@@ -949,128 +960,76 @@ xfs_buf_do_callbacks(
949 */ 960 */
950void 961void
951xfs_buf_iodone_callbacks( 962xfs_buf_iodone_callbacks(
952 xfs_buf_t *bp) 963 struct xfs_buf *bp)
953{ 964{
954 xfs_log_item_t *lip; 965 struct xfs_log_item *lip = bp->b_fspriv;
955 static ulong lasttime; 966 struct xfs_mount *mp = lip->li_mountp;
956 static xfs_buftarg_t *lasttarg; 967 static ulong lasttime;
957 xfs_mount_t *mp; 968 static xfs_buftarg_t *lasttarg;
958 969
959 ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL); 970 if (likely(!XFS_BUF_GETERROR(bp)))
960 lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *); 971 goto do_callbacks;
961 972
962 if (XFS_BUF_GETERROR(bp) != 0) { 973 /*
963 /* 974 * If we've already decided to shutdown the filesystem because of
964 * If we've already decided to shutdown the filesystem 975 * I/O errors, there's no point in giving this a retry.
965 * because of IO errors, there's no point in giving this 976 */
966 * a retry. 977 if (XFS_FORCED_SHUTDOWN(mp)) {
967 */ 978 XFS_BUF_SUPER_STALE(bp);
968 mp = lip->li_mountp; 979 trace_xfs_buf_item_iodone(bp, _RET_IP_);
969 if (XFS_FORCED_SHUTDOWN(mp)) { 980 goto do_callbacks;
970 ASSERT(XFS_BUF_TARGET(bp) == mp->m_ddev_targp); 981 }
971 XFS_BUF_SUPER_STALE(bp);
972 trace_xfs_buf_item_iodone(bp, _RET_IP_);
973 xfs_buf_do_callbacks(bp, lip);
974 XFS_BUF_SET_FSPRIVATE(bp, NULL);
975 XFS_BUF_CLR_IODONE_FUNC(bp);
976 xfs_buf_ioend(bp, 0);
977 return;
978 }
979 982
980 if ((XFS_BUF_TARGET(bp) != lasttarg) || 983 if (XFS_BUF_TARGET(bp) != lasttarg ||
981 (time_after(jiffies, (lasttime + 5*HZ)))) { 984 time_after(jiffies, (lasttime + 5*HZ))) {
982 lasttime = jiffies; 985 lasttime = jiffies;
983 cmn_err(CE_ALERT, "Device %s, XFS metadata write error" 986 cmn_err(CE_ALERT, "Device %s, XFS metadata write error"
984 " block 0x%llx in %s", 987 " block 0x%llx in %s",
985 XFS_BUFTARG_NAME(XFS_BUF_TARGET(bp)), 988 XFS_BUFTARG_NAME(XFS_BUF_TARGET(bp)),
986 (__uint64_t)XFS_BUF_ADDR(bp), mp->m_fsname); 989 (__uint64_t)XFS_BUF_ADDR(bp), mp->m_fsname);
987 } 990 }
988 lasttarg = XFS_BUF_TARGET(bp); 991 lasttarg = XFS_BUF_TARGET(bp);
989 992
990 if (XFS_BUF_ISASYNC(bp)) { 993 /*
991 /* 994 * If the write was asynchronous then noone will be looking for the
992 * If the write was asynchronous then noone will be 995 * error. Clear the error state and write the buffer out again.
993 * looking for the error. Clear the error state 996 *
994 * and write the buffer out again delayed write. 997 * During sync or umount we'll write all pending buffers again
995 * 998 * synchronous, which will catch these errors if they keep hanging
996 * XXXsup This is OK, so long as we catch these 999 * around.
997 * before we start the umount; we don't want these 1000 */
998 * DELWRI metadata bufs to be hanging around. 1001 if (XFS_BUF_ISASYNC(bp)) {
999 */ 1002 XFS_BUF_ERROR(bp, 0); /* errno of 0 unsets the flag */
1000 XFS_BUF_ERROR(bp,0); /* errno of 0 unsets the flag */ 1003
1001 1004 if (!XFS_BUF_ISSTALE(bp)) {
1002 if (!(XFS_BUF_ISSTALE(bp))) { 1005 XFS_BUF_DELAYWRITE(bp);
1003 XFS_BUF_DELAYWRITE(bp);
1004 XFS_BUF_DONE(bp);
1005 XFS_BUF_SET_START(bp);
1006 }
1007 ASSERT(XFS_BUF_IODONE_FUNC(bp));
1008 trace_xfs_buf_item_iodone_async(bp, _RET_IP_);
1009 xfs_buf_relse(bp);
1010 } else {
1011 /*
1012 * If the write of the buffer was not asynchronous,
1013 * then we want to make sure to return the error
1014 * to the caller of bwrite(). Because of this we
1015 * cannot clear the B_ERROR state at this point.
1016 * Instead we install a callback function that
1017 * will be called when the buffer is released, and
1018 * that routine will clear the error state and
1019 * set the buffer to be written out again after
1020 * some delay.
1021 */
1022 /* We actually overwrite the existing b-relse
1023 function at times, but we're gonna be shutting down
1024 anyway. */
1025 XFS_BUF_SET_BRELSE_FUNC(bp,xfs_buf_error_relse);
1026 XFS_BUF_DONE(bp); 1006 XFS_BUF_DONE(bp);
1027 XFS_BUF_FINISH_IOWAIT(bp); 1007 XFS_BUF_SET_START(bp);
1028 } 1008 }
1009 ASSERT(XFS_BUF_IODONE_FUNC(bp));
1010 trace_xfs_buf_item_iodone_async(bp, _RET_IP_);
1011 xfs_buf_relse(bp);
1029 return; 1012 return;
1030 } 1013 }
1031 1014
1032 xfs_buf_do_callbacks(bp, lip); 1015 /*
1033 XFS_BUF_SET_FSPRIVATE(bp, NULL); 1016 * If the write of the buffer was synchronous, we want to make
1034 XFS_BUF_CLR_IODONE_FUNC(bp); 1017 * sure to return the error to the caller of xfs_bwrite().
1035 xfs_buf_ioend(bp, 0); 1018 */
1036}
1037
1038/*
1039 * This is a callback routine attached to a buffer which gets an error
1040 * when being written out synchronously.
1041 */
1042STATIC void
1043xfs_buf_error_relse(
1044 xfs_buf_t *bp)
1045{
1046 xfs_log_item_t *lip;
1047 xfs_mount_t *mp;
1048
1049 lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
1050 mp = (xfs_mount_t *)lip->li_mountp;
1051 ASSERT(XFS_BUF_TARGET(bp) == mp->m_ddev_targp);
1052
1053 XFS_BUF_STALE(bp); 1019 XFS_BUF_STALE(bp);
1054 XFS_BUF_DONE(bp); 1020 XFS_BUF_DONE(bp);
1055 XFS_BUF_UNDELAYWRITE(bp); 1021 XFS_BUF_UNDELAYWRITE(bp);
1056 XFS_BUF_ERROR(bp,0);
1057 1022
1058 trace_xfs_buf_error_relse(bp, _RET_IP_); 1023 trace_xfs_buf_error_relse(bp, _RET_IP_);
1024 xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
1059 1025
1060 if (! XFS_FORCED_SHUTDOWN(mp)) 1026do_callbacks:
1061 xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); 1027 xfs_buf_do_callbacks(bp);
1062 /*
1063 * We have to unpin the pinned buffers so do the
1064 * callbacks.
1065 */
1066 xfs_buf_do_callbacks(bp, lip);
1067 XFS_BUF_SET_FSPRIVATE(bp, NULL); 1028 XFS_BUF_SET_FSPRIVATE(bp, NULL);
1068 XFS_BUF_CLR_IODONE_FUNC(bp); 1029 XFS_BUF_CLR_IODONE_FUNC(bp);
1069 XFS_BUF_SET_BRELSE_FUNC(bp,NULL); 1030 xfs_buf_ioend(bp, 0);
1070 xfs_buf_relse(bp);
1071} 1031}
1072 1032
1073
1074/* 1033/*
1075 * This is the iodone() function for buffers which have been 1034 * This is the iodone() function for buffers which have been
1076 * logged. It is called when they are eventually flushed out. 1035 * logged. It is called when they are eventually flushed out.
diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h
index 0e2ed43f16c..b6ecd2061e7 100644
--- a/fs/xfs/xfs_buf_item.h
+++ b/fs/xfs/xfs_buf_item.h
@@ -105,17 +105,6 @@ typedef struct xfs_buf_log_item {
105 xfs_buf_log_format_t bli_format; /* in-log header */ 105 xfs_buf_log_format_t bli_format; /* in-log header */
106} xfs_buf_log_item_t; 106} xfs_buf_log_item_t;
107 107
108/*
109 * This structure is used during recovery to record the buf log
110 * items which have been canceled and should not be replayed.
111 */
112typedef struct xfs_buf_cancel {
113 xfs_daddr_t bc_blkno;
114 uint bc_len;
115 int bc_refcount;
116 struct xfs_buf_cancel *bc_next;
117} xfs_buf_cancel_t;
118
119void xfs_buf_item_init(struct xfs_buf *, struct xfs_mount *); 108void xfs_buf_item_init(struct xfs_buf *, struct xfs_mount *);
120void xfs_buf_item_relse(struct xfs_buf *); 109void xfs_buf_item_relse(struct xfs_buf *);
121void xfs_buf_item_log(xfs_buf_log_item_t *, uint, uint); 110void xfs_buf_item_log(xfs_buf_log_item_t *, uint, uint);
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
index 3b9582c60a2..e60490bc00a 100644
--- a/fs/xfs/xfs_dfrag.c
+++ b/fs/xfs/xfs_dfrag.c
@@ -377,6 +377,19 @@ xfs_swap_extents(
377 ip->i_d.di_format = tip->i_d.di_format; 377 ip->i_d.di_format = tip->i_d.di_format;
378 tip->i_d.di_format = tmp; 378 tip->i_d.di_format = tmp;
379 379
380 /*
381 * The extents in the source inode could still contain speculative
382 * preallocation beyond EOF (e.g. the file is open but not modified
383 * while defrag is in progress). In that case, we need to copy over the
384 * number of delalloc blocks the data fork in the source inode is
385 * tracking beyond EOF so that when the fork is truncated away when the
386 * temporary inode is unlinked we don't underrun the i_delayed_blks
387 * counter on that inode.
388 */
389 ASSERT(tip->i_delayed_blks == 0);
390 tip->i_delayed_blks = ip->i_delayed_blks;
391 ip->i_delayed_blks = 0;
392
380 ilf_fields = XFS_ILOG_CORE; 393 ilf_fields = XFS_ILOG_CORE;
381 394
382 switch(ip->i_d.di_format) { 395 switch(ip->i_d.di_format) {
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index ed999026766..4c7db74a05f 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -58,6 +58,7 @@ xfs_error_trap(int e)
58int xfs_etest[XFS_NUM_INJECT_ERROR]; 58int xfs_etest[XFS_NUM_INJECT_ERROR];
59int64_t xfs_etest_fsid[XFS_NUM_INJECT_ERROR]; 59int64_t xfs_etest_fsid[XFS_NUM_INJECT_ERROR];
60char * xfs_etest_fsname[XFS_NUM_INJECT_ERROR]; 60char * xfs_etest_fsname[XFS_NUM_INJECT_ERROR];
61int xfs_error_test_active;
61 62
62int 63int
63xfs_error_test(int error_tag, int *fsidp, char *expression, 64xfs_error_test(int error_tag, int *fsidp, char *expression,
@@ -108,6 +109,7 @@ xfs_errortag_add(int error_tag, xfs_mount_t *mp)
108 len = strlen(mp->m_fsname); 109 len = strlen(mp->m_fsname);
109 xfs_etest_fsname[i] = kmem_alloc(len + 1, KM_SLEEP); 110 xfs_etest_fsname[i] = kmem_alloc(len + 1, KM_SLEEP);
110 strcpy(xfs_etest_fsname[i], mp->m_fsname); 111 strcpy(xfs_etest_fsname[i], mp->m_fsname);
112 xfs_error_test_active++;
111 return 0; 113 return 0;
112 } 114 }
113 } 115 }
@@ -137,6 +139,7 @@ xfs_errortag_clearall(xfs_mount_t *mp, int loud)
137 xfs_etest_fsid[i] = 0LL; 139 xfs_etest_fsid[i] = 0LL;
138 kmem_free(xfs_etest_fsname[i]); 140 kmem_free(xfs_etest_fsname[i]);
139 xfs_etest_fsname[i] = NULL; 141 xfs_etest_fsname[i] = NULL;
142 xfs_error_test_active--;
140 } 143 }
141 } 144 }
142 145
@@ -149,37 +152,6 @@ xfs_errortag_clearall(xfs_mount_t *mp, int loud)
149} 152}
150#endif /* DEBUG */ 153#endif /* DEBUG */
151 154
152
153void
154xfs_fs_cmn_err(int level, xfs_mount_t *mp, char *fmt, ...)
155{
156 va_list ap;
157
158 va_start(ap, fmt);
159 xfs_fs_vcmn_err(level, mp, fmt, ap);
160 va_end(ap);
161}
162
163void
164xfs_cmn_err(int panic_tag, int level, xfs_mount_t *mp, char *fmt, ...)
165{
166 va_list ap;
167
168#ifdef DEBUG
169 xfs_panic_mask |= (XFS_PTAG_SHUTDOWN_CORRUPT | XFS_PTAG_LOGRES);
170#endif
171
172 if (xfs_panic_mask && (xfs_panic_mask & panic_tag)
173 && (level & CE_ALERT)) {
174 level &= ~CE_ALERT;
175 level |= CE_PANIC;
176 cmn_err(CE_ALERT, "XFS: Transforming an alert into a BUG.");
177 }
178 va_start(ap, fmt);
179 xfs_fs_vcmn_err(level, mp, fmt, ap);
180 va_end(ap);
181}
182
183void 155void
184xfs_error_report( 156xfs_error_report(
185 const char *tag, 157 const char *tag,
diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h
index c2c1a072bb8..10dce5475f0 100644
--- a/fs/xfs/xfs_error.h
+++ b/fs/xfs/xfs_error.h
@@ -127,16 +127,17 @@ extern void xfs_corruption_error(const char *tag, int level,
127#define XFS_RANDOM_BMAPIFORMAT XFS_RANDOM_DEFAULT 127#define XFS_RANDOM_BMAPIFORMAT XFS_RANDOM_DEFAULT
128 128
129#ifdef DEBUG 129#ifdef DEBUG
130extern int xfs_error_test_active;
130extern int xfs_error_test(int, int *, char *, int, char *, unsigned long); 131extern int xfs_error_test(int, int *, char *, int, char *, unsigned long);
131 132
132#define XFS_NUM_INJECT_ERROR 10 133#define XFS_NUM_INJECT_ERROR 10
133#define XFS_TEST_ERROR(expr, mp, tag, rf) \ 134#define XFS_TEST_ERROR(expr, mp, tag, rf) \
134 ((expr) || \ 135 ((expr) || (xfs_error_test_active && \
135 xfs_error_test((tag), (mp)->m_fixedfsid, "expr", __LINE__, __FILE__, \ 136 xfs_error_test((tag), (mp)->m_fixedfsid, "expr", __LINE__, __FILE__, \
136 (rf))) 137 (rf))))
137 138
138extern int xfs_errortag_add(int error_tag, xfs_mount_t *mp); 139extern int xfs_errortag_add(int error_tag, struct xfs_mount *mp);
139extern int xfs_errortag_clearall(xfs_mount_t *mp, int loud); 140extern int xfs_errortag_clearall(struct xfs_mount *mp, int loud);
140#else 141#else
141#define XFS_TEST_ERROR(expr, mp, tag, rf) (expr) 142#define XFS_TEST_ERROR(expr, mp, tag, rf) (expr)
142#define xfs_errortag_add(tag, mp) (ENOSYS) 143#define xfs_errortag_add(tag, mp) (ENOSYS)
@@ -161,21 +162,15 @@ extern int xfs_errortag_clearall(xfs_mount_t *mp, int loud);
161 162
162struct xfs_mount; 163struct xfs_mount;
163 164
164extern void xfs_fs_vcmn_err(int level, struct xfs_mount *mp,
165 char *fmt, va_list ap)
166 __attribute__ ((format (printf, 3, 0)));
167extern void xfs_cmn_err(int panic_tag, int level, struct xfs_mount *mp,
168 char *fmt, ...)
169 __attribute__ ((format (printf, 4, 5)));
170extern void xfs_fs_cmn_err(int level, struct xfs_mount *mp, char *fmt, ...)
171 __attribute__ ((format (printf, 3, 4)));
172
173extern void xfs_hex_dump(void *p, int length); 165extern void xfs_hex_dump(void *p, int length);
174 166
175#define xfs_fs_repair_cmn_err(level, mp, fmt, args...) \ 167#define xfs_fs_repair_cmn_err(level, mp, fmt, args...) \
176 xfs_fs_cmn_err(level, mp, fmt " Unmount and run xfs_repair.", ## args) 168 xfs_fs_cmn_err(level, mp, fmt " Unmount and run xfs_repair.", ## args)
177 169
178#define xfs_fs_mount_cmn_err(f, fmt, args...) \ 170#define xfs_fs_mount_cmn_err(f, fmt, args...) \
179 ((f & XFS_MFSI_QUIET)? (void)0 : cmn_err(CE_WARN, "XFS: " fmt, ## args)) 171 do { \
172 if (!(f & XFS_MFSI_QUIET)) \
173 cmn_err(CE_WARN, "XFS: " fmt, ## args); \
174 } while (0)
180 175
181#endif /* __XFS_ERROR_H__ */ 176#endif /* __XFS_ERROR_H__ */
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index a55e687bf56..d22e6262343 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -48,6 +48,28 @@ xfs_efi_item_free(
48} 48}
49 49
50/* 50/*
51 * Freeing the efi requires that we remove it from the AIL if it has already
52 * been placed there. However, the EFI may not yet have been placed in the AIL
53 * when called by xfs_efi_release() from EFD processing due to the ordering of
54 * committed vs unpin operations in bulk insert operations. Hence the
55 * test_and_clear_bit(XFS_EFI_COMMITTED) to ensure only the last caller frees
56 * the EFI.
57 */
58STATIC void
59__xfs_efi_release(
60 struct xfs_efi_log_item *efip)
61{
62 struct xfs_ail *ailp = efip->efi_item.li_ailp;
63
64 if (!test_and_clear_bit(XFS_EFI_COMMITTED, &efip->efi_flags)) {
65 spin_lock(&ailp->xa_lock);
66 /* xfs_trans_ail_delete() drops the AIL lock. */
67 xfs_trans_ail_delete(ailp, &efip->efi_item);
68 xfs_efi_item_free(efip);
69 }
70}
71
72/*
51 * This returns the number of iovecs needed to log the given efi item. 73 * This returns the number of iovecs needed to log the given efi item.
52 * We only need 1 iovec for an efi item. It just logs the efi_log_format 74 * We only need 1 iovec for an efi item. It just logs the efi_log_format
53 * structure. 75 * structure.
@@ -74,7 +96,8 @@ xfs_efi_item_format(
74 struct xfs_efi_log_item *efip = EFI_ITEM(lip); 96 struct xfs_efi_log_item *efip = EFI_ITEM(lip);
75 uint size; 97 uint size;
76 98
77 ASSERT(efip->efi_next_extent == efip->efi_format.efi_nextents); 99 ASSERT(atomic_read(&efip->efi_next_extent) ==
100 efip->efi_format.efi_nextents);
78 101
79 efip->efi_format.efi_type = XFS_LI_EFI; 102 efip->efi_format.efi_type = XFS_LI_EFI;
80 103
@@ -99,10 +122,12 @@ xfs_efi_item_pin(
99} 122}
100 123
101/* 124/*
102 * While EFIs cannot really be pinned, the unpin operation is the 125 * While EFIs cannot really be pinned, the unpin operation is the last place at
103 * last place at which the EFI is manipulated during a transaction. 126 * which the EFI is manipulated during a transaction. If we are being asked to
104 * Here we coordinate with xfs_efi_cancel() to determine who gets to 127 * remove the EFI it's because the transaction has been cancelled and by
105 * free the EFI. 128 * definition that means the EFI cannot be in the AIL so remove it from the
129 * transaction and free it. Otherwise coordinate with xfs_efi_release() (via
130 * XFS_EFI_COMMITTED) to determine who gets to free the EFI.
106 */ 131 */
107STATIC void 132STATIC void
108xfs_efi_item_unpin( 133xfs_efi_item_unpin(
@@ -110,20 +135,15 @@ xfs_efi_item_unpin(
110 int remove) 135 int remove)
111{ 136{
112 struct xfs_efi_log_item *efip = EFI_ITEM(lip); 137 struct xfs_efi_log_item *efip = EFI_ITEM(lip);
113 struct xfs_ail *ailp = lip->li_ailp;
114 138
115 spin_lock(&ailp->xa_lock); 139 if (remove) {
116 if (efip->efi_flags & XFS_EFI_CANCELED) { 140 ASSERT(!(lip->li_flags & XFS_LI_IN_AIL));
117 if (remove) 141 if (lip->li_desc)
118 xfs_trans_del_item(lip); 142 xfs_trans_del_item(lip);
119
120 /* xfs_trans_ail_delete() drops the AIL lock. */
121 xfs_trans_ail_delete(ailp, lip);
122 xfs_efi_item_free(efip); 143 xfs_efi_item_free(efip);
123 } else { 144 return;
124 efip->efi_flags |= XFS_EFI_COMMITTED;
125 spin_unlock(&ailp->xa_lock);
126 } 145 }
146 __xfs_efi_release(efip);
127} 147}
128 148
129/* 149/*
@@ -152,16 +172,20 @@ xfs_efi_item_unlock(
152} 172}
153 173
154/* 174/*
155 * The EFI is logged only once and cannot be moved in the log, so 175 * The EFI is logged only once and cannot be moved in the log, so simply return
156 * simply return the lsn at which it's been logged. The canceled 176 * the lsn at which it's been logged. For bulk transaction committed
157 * flag is not paid any attention here. Checking for that is delayed 177 * processing, the EFI may be processed but not yet unpinned prior to the EFD
158 * until the EFI is unpinned. 178 * being processed. Set the XFS_EFI_COMMITTED flag so this case can be detected
179 * when processing the EFD.
159 */ 180 */
160STATIC xfs_lsn_t 181STATIC xfs_lsn_t
161xfs_efi_item_committed( 182xfs_efi_item_committed(
162 struct xfs_log_item *lip, 183 struct xfs_log_item *lip,
163 xfs_lsn_t lsn) 184 xfs_lsn_t lsn)
164{ 185{
186 struct xfs_efi_log_item *efip = EFI_ITEM(lip);
187
188 set_bit(XFS_EFI_COMMITTED, &efip->efi_flags);
165 return lsn; 189 return lsn;
166} 190}
167 191
@@ -230,6 +254,7 @@ xfs_efi_init(
230 xfs_log_item_init(mp, &efip->efi_item, XFS_LI_EFI, &xfs_efi_item_ops); 254 xfs_log_item_init(mp, &efip->efi_item, XFS_LI_EFI, &xfs_efi_item_ops);
231 efip->efi_format.efi_nextents = nextents; 255 efip->efi_format.efi_nextents = nextents;
232 efip->efi_format.efi_id = (__psint_t)(void*)efip; 256 efip->efi_format.efi_id = (__psint_t)(void*)efip;
257 atomic_set(&efip->efi_next_extent, 0);
233 258
234 return efip; 259 return efip;
235} 260}
@@ -289,37 +314,18 @@ xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt)
289} 314}
290 315
291/* 316/*
292 * This is called by the efd item code below to release references to 317 * This is called by the efd item code below to release references to the given
293 * the given efi item. Each efd calls this with the number of 318 * efi item. Each efd calls this with the number of extents that it has
294 * extents that it has logged, and when the sum of these reaches 319 * logged, and when the sum of these reaches the total number of extents logged
295 * the total number of extents logged by this efi item we can free 320 * by this efi item we can free the efi item.
296 * the efi item.
297 *
298 * Freeing the efi item requires that we remove it from the AIL.
299 * We'll use the AIL lock to protect our counters as well as
300 * the removal from the AIL.
301 */ 321 */
302void 322void
303xfs_efi_release(xfs_efi_log_item_t *efip, 323xfs_efi_release(xfs_efi_log_item_t *efip,
304 uint nextents) 324 uint nextents)
305{ 325{
306 struct xfs_ail *ailp = efip->efi_item.li_ailp; 326 ASSERT(atomic_read(&efip->efi_next_extent) >= nextents);
307 int extents_left; 327 if (atomic_sub_and_test(nextents, &efip->efi_next_extent))
308 328 __xfs_efi_release(efip);
309 ASSERT(efip->efi_next_extent > 0);
310 ASSERT(efip->efi_flags & XFS_EFI_COMMITTED);
311
312 spin_lock(&ailp->xa_lock);
313 ASSERT(efip->efi_next_extent >= nextents);
314 efip->efi_next_extent -= nextents;
315 extents_left = efip->efi_next_extent;
316 if (extents_left == 0) {
317 /* xfs_trans_ail_delete() drops the AIL lock. */
318 xfs_trans_ail_delete(ailp, (xfs_log_item_t *)efip);
319 xfs_efi_item_free(efip);
320 } else {
321 spin_unlock(&ailp->xa_lock);
322 }
323} 329}
324 330
325static inline struct xfs_efd_log_item *EFD_ITEM(struct xfs_log_item *lip) 331static inline struct xfs_efd_log_item *EFD_ITEM(struct xfs_log_item *lip)
diff --git a/fs/xfs/xfs_extfree_item.h b/fs/xfs/xfs_extfree_item.h
index 0d22c56fdf6..375f68e4253 100644
--- a/fs/xfs/xfs_extfree_item.h
+++ b/fs/xfs/xfs_extfree_item.h
@@ -111,11 +111,10 @@ typedef struct xfs_efd_log_format_64 {
111#define XFS_EFI_MAX_FAST_EXTENTS 16 111#define XFS_EFI_MAX_FAST_EXTENTS 16
112 112
113/* 113/*
114 * Define EFI flags. 114 * Define EFI flag bits. Manipulated by set/clear/test_bit operators.
115 */ 115 */
116#define XFS_EFI_RECOVERED 0x1 116#define XFS_EFI_RECOVERED 1
117#define XFS_EFI_COMMITTED 0x2 117#define XFS_EFI_COMMITTED 2
118#define XFS_EFI_CANCELED 0x4
119 118
120/* 119/*
121 * This is the "extent free intention" log item. It is used 120 * This is the "extent free intention" log item. It is used
@@ -125,8 +124,8 @@ typedef struct xfs_efd_log_format_64 {
125 */ 124 */
126typedef struct xfs_efi_log_item { 125typedef struct xfs_efi_log_item {
127 xfs_log_item_t efi_item; 126 xfs_log_item_t efi_item;
128 uint efi_flags; /* misc flags */ 127 atomic_t efi_next_extent;
129 uint efi_next_extent; 128 unsigned long efi_flags; /* misc flags */
130 xfs_efi_log_format_t efi_format; 129 xfs_efi_log_format_t efi_format;
131} xfs_efi_log_item_t; 130} xfs_efi_log_item_t;
132 131
diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c
index 9b715dce569..9124425b7f2 100644
--- a/fs/xfs/xfs_filestream.c
+++ b/fs/xfs/xfs_filestream.c
@@ -744,9 +744,15 @@ xfs_filestream_new_ag(
744 * If the file's parent directory is known, take its iolock in exclusive 744 * If the file's parent directory is known, take its iolock in exclusive
745 * mode to prevent two sibling files from racing each other to migrate 745 * mode to prevent two sibling files from racing each other to migrate
746 * themselves and their parent to different AGs. 746 * themselves and their parent to different AGs.
747 *
748 * Note that we lock the parent directory iolock inside the child
749 * iolock here. That's fine as we never hold both parent and child
750 * iolock in any other place. This is different from the ilock,
751 * which requires locking of the child after the parent for namespace
752 * operations.
747 */ 753 */
748 if (pip) 754 if (pip)
749 xfs_ilock(pip, XFS_IOLOCK_EXCL); 755 xfs_ilock(pip, XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT);
750 756
751 /* 757 /*
752 * A new AG needs to be found for the file. If the file's parent 758 * A new AG needs to be found for the file. If the file's parent
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index a7c116e814a..cec89dd5d7d 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -374,6 +374,7 @@ xfs_growfs_data_private(
374 mp->m_maxicount = icount << mp->m_sb.sb_inopblog; 374 mp->m_maxicount = icount << mp->m_sb.sb_inopblog;
375 } else 375 } else
376 mp->m_maxicount = 0; 376 mp->m_maxicount = 0;
377 xfs_set_low_space_thresholds(mp);
377 378
378 /* update secondary superblocks. */ 379 /* update secondary superblocks. */
379 for (agno = 1; agno < nagcount; agno++) { 380 for (agno = 1; agno < nagcount; agno++) {
@@ -611,12 +612,13 @@ out:
611 * 612 *
612 * We cannot use an inode here for this - that will push dirty state back up 613 * We cannot use an inode here for this - that will push dirty state back up
613 * into the VFS and then periodic inode flushing will prevent log covering from 614 * into the VFS and then periodic inode flushing will prevent log covering from
614 * making progress. Hence we log a field in the superblock instead. 615 * making progress. Hence we log a field in the superblock instead and use a
616 * synchronous transaction to ensure the superblock is immediately unpinned
617 * and can be written back.
615 */ 618 */
616int 619int
617xfs_fs_log_dummy( 620xfs_fs_log_dummy(
618 xfs_mount_t *mp, 621 xfs_mount_t *mp)
619 int flags)
620{ 622{
621 xfs_trans_t *tp; 623 xfs_trans_t *tp;
622 int error; 624 int error;
@@ -631,8 +633,7 @@ xfs_fs_log_dummy(
631 633
632 /* log the UUID because it is an unchanging field */ 634 /* log the UUID because it is an unchanging field */
633 xfs_mod_sb(tp, XFS_SB_UUID); 635 xfs_mod_sb(tp, XFS_SB_UUID);
634 if (flags & SYNC_WAIT) 636 xfs_trans_set_sync(tp);
635 xfs_trans_set_sync(tp);
636 return xfs_trans_commit(tp, 0); 637 return xfs_trans_commit(tp, 0);
637} 638}
638 639
diff --git a/fs/xfs/xfs_fsops.h b/fs/xfs/xfs_fsops.h
index a786c5212c1..1b6a98b6688 100644
--- a/fs/xfs/xfs_fsops.h
+++ b/fs/xfs/xfs_fsops.h
@@ -25,6 +25,6 @@ extern int xfs_fs_counts(xfs_mount_t *mp, xfs_fsop_counts_t *cnt);
25extern int xfs_reserve_blocks(xfs_mount_t *mp, __uint64_t *inval, 25extern int xfs_reserve_blocks(xfs_mount_t *mp, __uint64_t *inval,
26 xfs_fsop_resblks_t *outval); 26 xfs_fsop_resblks_t *outval);
27extern int xfs_fs_goingdown(xfs_mount_t *mp, __uint32_t inflags); 27extern int xfs_fs_goingdown(xfs_mount_t *mp, __uint32_t inflags);
28extern int xfs_fs_log_dummy(xfs_mount_t *mp, int flags); 28extern int xfs_fs_log_dummy(struct xfs_mount *mp);
29 29
30#endif /* __XFS_FSOPS_H__ */ 30#endif /* __XFS_FSOPS_H__ */
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index 0cdd26932d8..cb9b6d1469f 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -43,6 +43,17 @@
43 43
44 44
45/* 45/*
46 * Define xfs inode iolock lockdep classes. We need to ensure that all active
47 * inodes are considered the same for lockdep purposes, including inodes that
48 * are recycled through the XFS_IRECLAIMABLE state. This is the only way to
49 * guarantee the locks are considered the same when there are multiple lock
50 * initialisation sites. Also, define a reclaimable inode class so it is
51 * obvious in lockdep reports which class the report is against.
52 */
53static struct lock_class_key xfs_iolock_active;
54struct lock_class_key xfs_iolock_reclaimable;
55
56/*
46 * Allocate and initialise an xfs_inode. 57 * Allocate and initialise an xfs_inode.
47 */ 58 */
48STATIC struct xfs_inode * 59STATIC struct xfs_inode *
@@ -69,8 +80,11 @@ xfs_inode_alloc(
69 ASSERT(atomic_read(&ip->i_pincount) == 0); 80 ASSERT(atomic_read(&ip->i_pincount) == 0);
70 ASSERT(!spin_is_locked(&ip->i_flags_lock)); 81 ASSERT(!spin_is_locked(&ip->i_flags_lock));
71 ASSERT(completion_done(&ip->i_flush)); 82 ASSERT(completion_done(&ip->i_flush));
83 ASSERT(ip->i_ino == 0);
72 84
73 mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); 85 mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
86 lockdep_set_class_and_name(&ip->i_iolock.mr_lock,
87 &xfs_iolock_active, "xfs_iolock_active");
74 88
75 /* initialise the xfs inode */ 89 /* initialise the xfs inode */
76 ip->i_ino = ino; 90 ip->i_ino = ino;
@@ -85,12 +99,20 @@ xfs_inode_alloc(
85 ip->i_size = 0; 99 ip->i_size = 0;
86 ip->i_new_size = 0; 100 ip->i_new_size = 0;
87 101
88 /* prevent anyone from using this yet */
89 VFS_I(ip)->i_state = I_NEW;
90
91 return ip; 102 return ip;
92} 103}
93 104
105STATIC void
106xfs_inode_free_callback(
107 struct rcu_head *head)
108{
109 struct inode *inode = container_of(head, struct inode, i_rcu);
110 struct xfs_inode *ip = XFS_I(inode);
111
112 INIT_LIST_HEAD(&inode->i_dentry);
113 kmem_zone_free(xfs_inode_zone, ip);
114}
115
94void 116void
95xfs_inode_free( 117xfs_inode_free(
96 struct xfs_inode *ip) 118 struct xfs_inode *ip)
@@ -134,7 +156,18 @@ xfs_inode_free(
134 ASSERT(!spin_is_locked(&ip->i_flags_lock)); 156 ASSERT(!spin_is_locked(&ip->i_flags_lock));
135 ASSERT(completion_done(&ip->i_flush)); 157 ASSERT(completion_done(&ip->i_flush));
136 158
137 kmem_zone_free(xfs_inode_zone, ip); 159 /*
160 * Because we use RCU freeing we need to ensure the inode always
161 * appears to be reclaimed with an invalid inode number when in the
162 * free state. The ip->i_flags_lock provides the barrier against lookup
163 * races.
164 */
165 spin_lock(&ip->i_flags_lock);
166 ip->i_flags = XFS_IRECLAIM;
167 ip->i_ino = 0;
168 spin_unlock(&ip->i_flags_lock);
169
170 call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback);
138} 171}
139 172
140/* 173/*
@@ -144,14 +177,29 @@ static int
144xfs_iget_cache_hit( 177xfs_iget_cache_hit(
145 struct xfs_perag *pag, 178 struct xfs_perag *pag,
146 struct xfs_inode *ip, 179 struct xfs_inode *ip,
180 xfs_ino_t ino,
147 int flags, 181 int flags,
148 int lock_flags) __releases(pag->pag_ici_lock) 182 int lock_flags) __releases(RCU)
149{ 183{
150 struct inode *inode = VFS_I(ip); 184 struct inode *inode = VFS_I(ip);
151 struct xfs_mount *mp = ip->i_mount; 185 struct xfs_mount *mp = ip->i_mount;
152 int error; 186 int error;
153 187
188 /*
189 * check for re-use of an inode within an RCU grace period due to the
190 * radix tree nodes not being updated yet. We monitor for this by
191 * setting the inode number to zero before freeing the inode structure.
192 * If the inode has been reallocated and set up, then the inode number
193 * will not match, so check for that, too.
194 */
154 spin_lock(&ip->i_flags_lock); 195 spin_lock(&ip->i_flags_lock);
196 if (ip->i_ino != ino) {
197 trace_xfs_iget_skip(ip);
198 XFS_STATS_INC(xs_ig_frecycle);
199 error = EAGAIN;
200 goto out_error;
201 }
202
155 203
156 /* 204 /*
157 * If we are racing with another cache hit that is currently 205 * If we are racing with another cache hit that is currently
@@ -194,7 +242,7 @@ xfs_iget_cache_hit(
194 ip->i_flags |= XFS_IRECLAIM; 242 ip->i_flags |= XFS_IRECLAIM;
195 243
196 spin_unlock(&ip->i_flags_lock); 244 spin_unlock(&ip->i_flags_lock);
197 read_unlock(&pag->pag_ici_lock); 245 rcu_read_unlock();
198 246
199 error = -inode_init_always(mp->m_super, inode); 247 error = -inode_init_always(mp->m_super, inode);
200 if (error) { 248 if (error) {
@@ -202,7 +250,7 @@ xfs_iget_cache_hit(
202 * Re-initializing the inode failed, and we are in deep 250 * Re-initializing the inode failed, and we are in deep
203 * trouble. Try to re-add it to the reclaim list. 251 * trouble. Try to re-add it to the reclaim list.
204 */ 252 */
205 read_lock(&pag->pag_ici_lock); 253 rcu_read_lock();
206 spin_lock(&ip->i_flags_lock); 254 spin_lock(&ip->i_flags_lock);
207 255
208 ip->i_flags &= ~XFS_INEW; 256 ip->i_flags &= ~XFS_INEW;
@@ -212,14 +260,20 @@ xfs_iget_cache_hit(
212 goto out_error; 260 goto out_error;
213 } 261 }
214 262
215 write_lock(&pag->pag_ici_lock); 263 spin_lock(&pag->pag_ici_lock);
216 spin_lock(&ip->i_flags_lock); 264 spin_lock(&ip->i_flags_lock);
217 ip->i_flags &= ~(XFS_IRECLAIMABLE | XFS_IRECLAIM); 265 ip->i_flags &= ~(XFS_IRECLAIMABLE | XFS_IRECLAIM);
218 ip->i_flags |= XFS_INEW; 266 ip->i_flags |= XFS_INEW;
219 __xfs_inode_clear_reclaim_tag(mp, pag, ip); 267 __xfs_inode_clear_reclaim_tag(mp, pag, ip);
220 inode->i_state = I_NEW; 268 inode->i_state = I_NEW;
269
270 ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock));
271 mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
272 lockdep_set_class_and_name(&ip->i_iolock.mr_lock,
273 &xfs_iolock_active, "xfs_iolock_active");
274
221 spin_unlock(&ip->i_flags_lock); 275 spin_unlock(&ip->i_flags_lock);
222 write_unlock(&pag->pag_ici_lock); 276 spin_unlock(&pag->pag_ici_lock);
223 } else { 277 } else {
224 /* If the VFS inode is being torn down, pause and try again. */ 278 /* If the VFS inode is being torn down, pause and try again. */
225 if (!igrab(inode)) { 279 if (!igrab(inode)) {
@@ -230,7 +284,7 @@ xfs_iget_cache_hit(
230 284
231 /* We've got a live one. */ 285 /* We've got a live one. */
232 spin_unlock(&ip->i_flags_lock); 286 spin_unlock(&ip->i_flags_lock);
233 read_unlock(&pag->pag_ici_lock); 287 rcu_read_unlock();
234 trace_xfs_iget_hit(ip); 288 trace_xfs_iget_hit(ip);
235 } 289 }
236 290
@@ -244,7 +298,7 @@ xfs_iget_cache_hit(
244 298
245out_error: 299out_error:
246 spin_unlock(&ip->i_flags_lock); 300 spin_unlock(&ip->i_flags_lock);
247 read_unlock(&pag->pag_ici_lock); 301 rcu_read_unlock();
248 return error; 302 return error;
249} 303}
250 304
@@ -297,7 +351,7 @@ xfs_iget_cache_miss(
297 BUG(); 351 BUG();
298 } 352 }
299 353
300 write_lock(&pag->pag_ici_lock); 354 spin_lock(&pag->pag_ici_lock);
301 355
302 /* insert the new inode */ 356 /* insert the new inode */
303 error = radix_tree_insert(&pag->pag_ici_root, agino, ip); 357 error = radix_tree_insert(&pag->pag_ici_root, agino, ip);
@@ -312,14 +366,14 @@ xfs_iget_cache_miss(
312 ip->i_udquot = ip->i_gdquot = NULL; 366 ip->i_udquot = ip->i_gdquot = NULL;
313 xfs_iflags_set(ip, XFS_INEW); 367 xfs_iflags_set(ip, XFS_INEW);
314 368
315 write_unlock(&pag->pag_ici_lock); 369 spin_unlock(&pag->pag_ici_lock);
316 radix_tree_preload_end(); 370 radix_tree_preload_end();
317 371
318 *ipp = ip; 372 *ipp = ip;
319 return 0; 373 return 0;
320 374
321out_preload_end: 375out_preload_end:
322 write_unlock(&pag->pag_ici_lock); 376 spin_unlock(&pag->pag_ici_lock);
323 radix_tree_preload_end(); 377 radix_tree_preload_end();
324 if (lock_flags) 378 if (lock_flags)
325 xfs_iunlock(ip, lock_flags); 379 xfs_iunlock(ip, lock_flags);
@@ -366,7 +420,7 @@ xfs_iget(
366 xfs_agino_t agino; 420 xfs_agino_t agino;
367 421
368 /* reject inode numbers outside existing AGs */ 422 /* reject inode numbers outside existing AGs */
369 if (XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount) 423 if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount)
370 return EINVAL; 424 return EINVAL;
371 425
372 /* get the perag structure and ensure that it's inode capable */ 426 /* get the perag structure and ensure that it's inode capable */
@@ -375,15 +429,15 @@ xfs_iget(
375 429
376again: 430again:
377 error = 0; 431 error = 0;
378 read_lock(&pag->pag_ici_lock); 432 rcu_read_lock();
379 ip = radix_tree_lookup(&pag->pag_ici_root, agino); 433 ip = radix_tree_lookup(&pag->pag_ici_root, agino);
380 434
381 if (ip) { 435 if (ip) {
382 error = xfs_iget_cache_hit(pag, ip, flags, lock_flags); 436 error = xfs_iget_cache_hit(pag, ip, ino, flags, lock_flags);
383 if (error) 437 if (error)
384 goto out_error_or_again; 438 goto out_error_or_again;
385 } else { 439 } else {
386 read_unlock(&pag->pag_ici_lock); 440 rcu_read_unlock();
387 XFS_STATS_INC(xs_ig_missed); 441 XFS_STATS_INC(xs_ig_missed);
388 442
389 error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip, 443 error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip,
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 108c7a085f9..be7cf625421 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -887,7 +887,7 @@ xfs_iread(
887 * around for a while. This helps to keep recently accessed 887 * around for a while. This helps to keep recently accessed
888 * meta-data in-core longer. 888 * meta-data in-core longer.
889 */ 889 */
890 XFS_BUF_SET_REF(bp, XFS_INO_REF); 890 xfs_buf_set_ref(bp, XFS_INO_REF);
891 891
892 /* 892 /*
893 * Use xfs_trans_brelse() to release the buffer containing the 893 * Use xfs_trans_brelse() to release the buffer containing the
@@ -2000,17 +2000,33 @@ xfs_ifree_cluster(
2000 */ 2000 */
2001 for (i = 0; i < ninodes; i++) { 2001 for (i = 0; i < ninodes; i++) {
2002retry: 2002retry:
2003 read_lock(&pag->pag_ici_lock); 2003 rcu_read_lock();
2004 ip = radix_tree_lookup(&pag->pag_ici_root, 2004 ip = radix_tree_lookup(&pag->pag_ici_root,
2005 XFS_INO_TO_AGINO(mp, (inum + i))); 2005 XFS_INO_TO_AGINO(mp, (inum + i)));
2006 2006
2007 /* Inode not in memory or stale, nothing to do */ 2007 /* Inode not in memory, nothing to do */
2008 if (!ip || xfs_iflags_test(ip, XFS_ISTALE)) { 2008 if (!ip) {
2009 read_unlock(&pag->pag_ici_lock); 2009 rcu_read_unlock();
2010 continue; 2010 continue;
2011 } 2011 }
2012 2012
2013 /* 2013 /*
2014 * because this is an RCU protected lookup, we could
2015 * find a recently freed or even reallocated inode
2016 * during the lookup. We need to check under the
2017 * i_flags_lock for a valid inode here. Skip it if it
2018 * is not valid, the wrong inode or stale.
2019 */
2020 spin_lock(&ip->i_flags_lock);
2021 if (ip->i_ino != inum + i ||
2022 __xfs_iflags_test(ip, XFS_ISTALE)) {
2023 spin_unlock(&ip->i_flags_lock);
2024 rcu_read_unlock();
2025 continue;
2026 }
2027 spin_unlock(&ip->i_flags_lock);
2028
2029 /*
2014 * Don't try to lock/unlock the current inode, but we 2030 * Don't try to lock/unlock the current inode, but we
2015 * _cannot_ skip the other inodes that we did not find 2031 * _cannot_ skip the other inodes that we did not find
2016 * in the list attached to the buffer and are not 2032 * in the list attached to the buffer and are not
@@ -2019,11 +2035,11 @@ retry:
2019 */ 2035 */
2020 if (ip != free_ip && 2036 if (ip != free_ip &&
2021 !xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) { 2037 !xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
2022 read_unlock(&pag->pag_ici_lock); 2038 rcu_read_unlock();
2023 delay(1); 2039 delay(1);
2024 goto retry; 2040 goto retry;
2025 } 2041 }
2026 read_unlock(&pag->pag_ici_lock); 2042 rcu_read_unlock();
2027 2043
2028 xfs_iflock(ip); 2044 xfs_iflock(ip);
2029 xfs_iflags_set(ip, XFS_ISTALE); 2045 xfs_iflags_set(ip, XFS_ISTALE);
@@ -2629,7 +2645,7 @@ xfs_iflush_cluster(
2629 2645
2630 mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1); 2646 mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1);
2631 first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask; 2647 first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask;
2632 read_lock(&pag->pag_ici_lock); 2648 rcu_read_lock();
2633 /* really need a gang lookup range call here */ 2649 /* really need a gang lookup range call here */
2634 nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)ilist, 2650 nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)ilist,
2635 first_index, inodes_per_cluster); 2651 first_index, inodes_per_cluster);
@@ -2640,9 +2656,21 @@ xfs_iflush_cluster(
2640 iq = ilist[i]; 2656 iq = ilist[i];
2641 if (iq == ip) 2657 if (iq == ip)
2642 continue; 2658 continue;
2643 /* if the inode lies outside this cluster, we're done. */ 2659
2644 if ((XFS_INO_TO_AGINO(mp, iq->i_ino) & mask) != first_index) 2660 /*
2645 break; 2661 * because this is an RCU protected lookup, we could find a
2662 * recently freed or even reallocated inode during the lookup.
2663 * We need to check under the i_flags_lock for a valid inode
2664 * here. Skip it if it is not valid or the wrong inode.
2665 */
2666 spin_lock(&ip->i_flags_lock);
2667 if (!ip->i_ino ||
2668 (XFS_INO_TO_AGINO(mp, iq->i_ino) & mask) != first_index) {
2669 spin_unlock(&ip->i_flags_lock);
2670 continue;
2671 }
2672 spin_unlock(&ip->i_flags_lock);
2673
2646 /* 2674 /*
2647 * Do an un-protected check to see if the inode is dirty and 2675 * Do an un-protected check to see if the inode is dirty and
2648 * is a candidate for flushing. These checks will be repeated 2676 * is a candidate for flushing. These checks will be repeated
@@ -2692,7 +2720,7 @@ xfs_iflush_cluster(
2692 } 2720 }
2693 2721
2694out_free: 2722out_free:
2695 read_unlock(&pag->pag_ici_lock); 2723 rcu_read_unlock();
2696 kmem_free(ilist); 2724 kmem_free(ilist);
2697out_put: 2725out_put:
2698 xfs_perag_put(pag); 2726 xfs_perag_put(pag);
@@ -2704,7 +2732,7 @@ cluster_corrupt_out:
2704 * Corruption detected in the clustering loop. Invalidate the 2732 * Corruption detected in the clustering loop. Invalidate the
2705 * inode buffer and shut down the filesystem. 2733 * inode buffer and shut down the filesystem.
2706 */ 2734 */
2707 read_unlock(&pag->pag_ici_lock); 2735 rcu_read_unlock();
2708 /* 2736 /*
2709 * Clean up the buffer. If it was B_DELWRI, just release it -- 2737 * Clean up the buffer. If it was B_DELWRI, just release it --
2710 * brelse can handle it with no problems. If not, shut down the 2738 * brelse can handle it with no problems. If not, shut down the
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index fb2ca2e4cdc..5c95fa8ec11 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -376,12 +376,13 @@ static inline void xfs_ifunlock(xfs_inode_t *ip)
376/* 376/*
377 * In-core inode flags. 377 * In-core inode flags.
378 */ 378 */
379#define XFS_IRECLAIM 0x0001 /* we have started reclaiming this inode */ 379#define XFS_IRECLAIM 0x0001 /* started reclaiming this inode */
380#define XFS_ISTALE 0x0002 /* inode has been staled */ 380#define XFS_ISTALE 0x0002 /* inode has been staled */
381#define XFS_IRECLAIMABLE 0x0004 /* inode can be reclaimed */ 381#define XFS_IRECLAIMABLE 0x0004 /* inode can be reclaimed */
382#define XFS_INEW 0x0008 /* inode has just been allocated */ 382#define XFS_INEW 0x0008 /* inode has just been allocated */
383#define XFS_IFILESTREAM 0x0010 /* inode is in a filestream directory */ 383#define XFS_IFILESTREAM 0x0010 /* inode is in a filestream directory */
384#define XFS_ITRUNCATED 0x0020 /* truncated down so flush-on-close */ 384#define XFS_ITRUNCATED 0x0020 /* truncated down so flush-on-close */
385#define XFS_IDIRTY_RELEASE 0x0040 /* dirty release already seen */
385 386
386/* 387/*
387 * Flags for inode locking. 388 * Flags for inode locking.
@@ -438,6 +439,8 @@ static inline void xfs_ifunlock(xfs_inode_t *ip)
438#define XFS_IOLOCK_DEP(flags) (((flags) & XFS_IOLOCK_DEP_MASK) >> XFS_IOLOCK_SHIFT) 439#define XFS_IOLOCK_DEP(flags) (((flags) & XFS_IOLOCK_DEP_MASK) >> XFS_IOLOCK_SHIFT)
439#define XFS_ILOCK_DEP(flags) (((flags) & XFS_ILOCK_DEP_MASK) >> XFS_ILOCK_SHIFT) 440#define XFS_ILOCK_DEP(flags) (((flags) & XFS_ILOCK_DEP_MASK) >> XFS_ILOCK_SHIFT)
440 441
442extern struct lock_class_key xfs_iolock_reclaimable;
443
441/* 444/*
442 * Flags for xfs_itruncate_start(). 445 * Flags for xfs_itruncate_start().
443 */ 446 */
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index c7ac020705d..fd4f398bd6f 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -657,18 +657,37 @@ xfs_inode_item_unlock(
657} 657}
658 658
659/* 659/*
660 * This is called to find out where the oldest active copy of the 660 * This is called to find out where the oldest active copy of the inode log
661 * inode log item in the on disk log resides now that the last log 661 * item in the on disk log resides now that the last log write of it completed
662 * write of it completed at the given lsn. Since we always re-log 662 * at the given lsn. Since we always re-log all dirty data in an inode, the
663 * all dirty data in an inode, the latest copy in the on disk log 663 * latest copy in the on disk log is the only one that matters. Therefore,
664 * is the only one that matters. Therefore, simply return the 664 * simply return the given lsn.
665 * given lsn. 665 *
666 * If the inode has been marked stale because the cluster is being freed, we
667 * don't want to (re-)insert this inode into the AIL. There is a race condition
668 * where the cluster buffer may be unpinned before the inode is inserted into
669 * the AIL during transaction committed processing. If the buffer is unpinned
670 * before the inode item has been committed and inserted, then it is possible
671 * for the buffer to be written and IO to complete before the inode is inserted
672 * into the AIL. In that case, we'd be inserting a clean, stale inode into the
673 * AIL which will never get removed. It will, however, get reclaimed which
674 * triggers an assert in xfs_inode_free() complaining about freeing an inode
675 * still in the AIL.
676 *
677 * To avoid this, return a lower LSN than the one passed in so that the
678 * transaction committed code will not move the inode forward in the AIL but
679 * will still unpin it properly.
666 */ 680 */
667STATIC xfs_lsn_t 681STATIC xfs_lsn_t
668xfs_inode_item_committed( 682xfs_inode_item_committed(
669 struct xfs_log_item *lip, 683 struct xfs_log_item *lip,
670 xfs_lsn_t lsn) 684 xfs_lsn_t lsn)
671{ 685{
686 struct xfs_inode_log_item *iip = INODE_ITEM(lip);
687 struct xfs_inode *ip = iip->ili_inode;
688
689 if (xfs_iflags_test(ip, XFS_ISTALE))
690 return lsn - 1;
672 return lsn; 691 return lsn;
673} 692}
674 693
@@ -823,15 +842,64 @@ xfs_inode_item_destroy(
823 * flushed to disk. It is responsible for removing the inode item 842 * flushed to disk. It is responsible for removing the inode item
824 * from the AIL if it has not been re-logged, and unlocking the inode's 843 * from the AIL if it has not been re-logged, and unlocking the inode's
825 * flush lock. 844 * flush lock.
845 *
846 * To reduce AIL lock traffic as much as possible, we scan the buffer log item
847 * list for other inodes that will run this function. We remove them from the
848 * buffer list so we can process all the inode IO completions in one AIL lock
849 * traversal.
826 */ 850 */
827void 851void
828xfs_iflush_done( 852xfs_iflush_done(
829 struct xfs_buf *bp, 853 struct xfs_buf *bp,
830 struct xfs_log_item *lip) 854 struct xfs_log_item *lip)
831{ 855{
832 struct xfs_inode_log_item *iip = INODE_ITEM(lip); 856 struct xfs_inode_log_item *iip;
833 xfs_inode_t *ip = iip->ili_inode; 857 struct xfs_log_item *blip;
858 struct xfs_log_item *next;
859 struct xfs_log_item *prev;
834 struct xfs_ail *ailp = lip->li_ailp; 860 struct xfs_ail *ailp = lip->li_ailp;
861 int need_ail = 0;
862
863 /*
864 * Scan the buffer IO completions for other inodes being completed and
865 * attach them to the current inode log item.
866 */
867 blip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
868 prev = NULL;
869 while (blip != NULL) {
870 if (lip->li_cb != xfs_iflush_done) {
871 prev = blip;
872 blip = blip->li_bio_list;
873 continue;
874 }
875
876 /* remove from list */
877 next = blip->li_bio_list;
878 if (!prev) {
879 XFS_BUF_SET_FSPRIVATE(bp, next);
880 } else {
881 prev->li_bio_list = next;
882 }
883
884 /* add to current list */
885 blip->li_bio_list = lip->li_bio_list;
886 lip->li_bio_list = blip;
887
888 /*
889 * while we have the item, do the unlocked check for needing
890 * the AIL lock.
891 */
892 iip = INODE_ITEM(blip);
893 if (iip->ili_logged && blip->li_lsn == iip->ili_flush_lsn)
894 need_ail++;
895
896 blip = next;
897 }
898
899 /* make sure we capture the state of the initial inode. */
900 iip = INODE_ITEM(lip);
901 if (iip->ili_logged && lip->li_lsn == iip->ili_flush_lsn)
902 need_ail++;
835 903
836 /* 904 /*
837 * We only want to pull the item from the AIL if it is 905 * We only want to pull the item from the AIL if it is
@@ -842,28 +910,37 @@ xfs_iflush_done(
842 * the lock since it's cheaper, and then we recheck while 910 * the lock since it's cheaper, and then we recheck while
843 * holding the lock before removing the inode from the AIL. 911 * holding the lock before removing the inode from the AIL.
844 */ 912 */
845 if (iip->ili_logged && lip->li_lsn == iip->ili_flush_lsn) { 913 if (need_ail) {
914 struct xfs_log_item *log_items[need_ail];
915 int i = 0;
846 spin_lock(&ailp->xa_lock); 916 spin_lock(&ailp->xa_lock);
847 if (lip->li_lsn == iip->ili_flush_lsn) { 917 for (blip = lip; blip; blip = blip->li_bio_list) {
848 /* xfs_trans_ail_delete() drops the AIL lock. */ 918 iip = INODE_ITEM(blip);
849 xfs_trans_ail_delete(ailp, lip); 919 if (iip->ili_logged &&
850 } else { 920 blip->li_lsn == iip->ili_flush_lsn) {
851 spin_unlock(&ailp->xa_lock); 921 log_items[i++] = blip;
922 }
923 ASSERT(i <= need_ail);
852 } 924 }
925 /* xfs_trans_ail_delete_bulk() drops the AIL lock. */
926 xfs_trans_ail_delete_bulk(ailp, log_items, i);
853 } 927 }
854 928
855 iip->ili_logged = 0;
856 929
857 /* 930 /*
858 * Clear the ili_last_fields bits now that we know that the 931 * clean up and unlock the flush lock now we are done. We can clear the
859 * data corresponding to them is safely on disk. 932 * ili_last_fields bits now that we know that the data corresponding to
933 * them is safely on disk.
860 */ 934 */
861 iip->ili_last_fields = 0; 935 for (blip = lip; blip; blip = next) {
936 next = blip->li_bio_list;
937 blip->li_bio_list = NULL;
862 938
863 /* 939 iip = INODE_ITEM(blip);
864 * Release the inode's flush lock since we're done with it. 940 iip->ili_logged = 0;
865 */ 941 iip->ili_last_fields = 0;
866 xfs_ifunlock(ip); 942 xfs_ifunlock(iip->ili_inode);
943 }
867} 944}
868 945
869/* 946/*
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 20576146369..8a0f044750c 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -47,127 +47,8 @@
47 47
48#define XFS_WRITEIO_ALIGN(mp,off) (((off) >> mp->m_writeio_log) \ 48#define XFS_WRITEIO_ALIGN(mp,off) (((off) >> mp->m_writeio_log) \
49 << mp->m_writeio_log) 49 << mp->m_writeio_log)
50#define XFS_STRAT_WRITE_IMAPS 2
51#define XFS_WRITE_IMAPS XFS_BMAP_MAX_NMAP 50#define XFS_WRITE_IMAPS XFS_BMAP_MAX_NMAP
52 51
53STATIC int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t,
54 int, struct xfs_bmbt_irec *, int *);
55STATIC int xfs_iomap_write_delay(struct xfs_inode *, xfs_off_t, size_t, int,
56 struct xfs_bmbt_irec *, int *);
57STATIC int xfs_iomap_write_allocate(struct xfs_inode *, xfs_off_t, size_t,
58 struct xfs_bmbt_irec *, int *);
59
60int
61xfs_iomap(
62 struct xfs_inode *ip,
63 xfs_off_t offset,
64 ssize_t count,
65 int flags,
66 struct xfs_bmbt_irec *imap,
67 int *nimaps,
68 int *new)
69{
70 struct xfs_mount *mp = ip->i_mount;
71 xfs_fileoff_t offset_fsb, end_fsb;
72 int error = 0;
73 int lockmode = 0;
74 int bmapi_flags = 0;
75
76 ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG);
77
78 *new = 0;
79
80 if (XFS_FORCED_SHUTDOWN(mp))
81 return XFS_ERROR(EIO);
82
83 trace_xfs_iomap_enter(ip, offset, count, flags, NULL);
84
85 switch (flags & (BMAPI_READ | BMAPI_WRITE | BMAPI_ALLOCATE)) {
86 case BMAPI_READ:
87 lockmode = xfs_ilock_map_shared(ip);
88 bmapi_flags = XFS_BMAPI_ENTIRE;
89 break;
90 case BMAPI_WRITE:
91 lockmode = XFS_ILOCK_EXCL;
92 if (flags & BMAPI_IGNSTATE)
93 bmapi_flags |= XFS_BMAPI_IGSTATE|XFS_BMAPI_ENTIRE;
94 xfs_ilock(ip, lockmode);
95 break;
96 case BMAPI_ALLOCATE:
97 lockmode = XFS_ILOCK_SHARED;
98 bmapi_flags = XFS_BMAPI_ENTIRE;
99
100 /* Attempt non-blocking lock */
101 if (flags & BMAPI_TRYLOCK) {
102 if (!xfs_ilock_nowait(ip, lockmode))
103 return XFS_ERROR(EAGAIN);
104 } else {
105 xfs_ilock(ip, lockmode);
106 }
107 break;
108 default:
109 BUG();
110 }
111
112 ASSERT(offset <= mp->m_maxioffset);
113 if ((xfs_fsize_t)offset + count > mp->m_maxioffset)
114 count = mp->m_maxioffset - offset;
115 end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count);
116 offset_fsb = XFS_B_TO_FSBT(mp, offset);
117
118 error = xfs_bmapi(NULL, ip, offset_fsb,
119 (xfs_filblks_t)(end_fsb - offset_fsb),
120 bmapi_flags, NULL, 0, imap,
121 nimaps, NULL);
122
123 if (error)
124 goto out;
125
126 switch (flags & (BMAPI_WRITE|BMAPI_ALLOCATE)) {
127 case BMAPI_WRITE:
128 /* If we found an extent, return it */
129 if (*nimaps &&
130 (imap->br_startblock != HOLESTARTBLOCK) &&
131 (imap->br_startblock != DELAYSTARTBLOCK)) {
132 trace_xfs_iomap_found(ip, offset, count, flags, imap);
133 break;
134 }
135
136 if (flags & BMAPI_DIRECT) {
137 error = xfs_iomap_write_direct(ip, offset, count, flags,
138 imap, nimaps);
139 } else {
140 error = xfs_iomap_write_delay(ip, offset, count, flags,
141 imap, nimaps);
142 }
143 if (!error) {
144 trace_xfs_iomap_alloc(ip, offset, count, flags, imap);
145 }
146 *new = 1;
147 break;
148 case BMAPI_ALLOCATE:
149 /* If we found an extent, return it */
150 xfs_iunlock(ip, lockmode);
151 lockmode = 0;
152
153 if (*nimaps && !isnullstartblock(imap->br_startblock)) {
154 trace_xfs_iomap_found(ip, offset, count, flags, imap);
155 break;
156 }
157
158 error = xfs_iomap_write_allocate(ip, offset, count,
159 imap, nimaps);
160 break;
161 }
162
163 ASSERT(*nimaps <= 1);
164
165out:
166 if (lockmode)
167 xfs_iunlock(ip, lockmode);
168 return XFS_ERROR(error);
169}
170
171STATIC int 52STATIC int
172xfs_iomap_eof_align_last_fsb( 53xfs_iomap_eof_align_last_fsb(
173 xfs_mount_t *mp, 54 xfs_mount_t *mp,
@@ -236,14 +117,13 @@ xfs_cmn_err_fsblock_zero(
236 return EFSCORRUPTED; 117 return EFSCORRUPTED;
237} 118}
238 119
239STATIC int 120int
240xfs_iomap_write_direct( 121xfs_iomap_write_direct(
241 xfs_inode_t *ip, 122 xfs_inode_t *ip,
242 xfs_off_t offset, 123 xfs_off_t offset,
243 size_t count, 124 size_t count,
244 int flags,
245 xfs_bmbt_irec_t *imap, 125 xfs_bmbt_irec_t *imap,
246 int *nmaps) 126 int nmaps)
247{ 127{
248 xfs_mount_t *mp = ip->i_mount; 128 xfs_mount_t *mp = ip->i_mount;
249 xfs_fileoff_t offset_fsb; 129 xfs_fileoff_t offset_fsb;
@@ -279,7 +159,7 @@ xfs_iomap_write_direct(
279 if (error) 159 if (error)
280 goto error_out; 160 goto error_out;
281 } else { 161 } else {
282 if (*nmaps && (imap->br_startblock == HOLESTARTBLOCK)) 162 if (nmaps && (imap->br_startblock == HOLESTARTBLOCK))
283 last_fsb = MIN(last_fsb, (xfs_fileoff_t) 163 last_fsb = MIN(last_fsb, (xfs_fileoff_t)
284 imap->br_blockcount + 164 imap->br_blockcount +
285 imap->br_startoff); 165 imap->br_startoff);
@@ -331,7 +211,7 @@ xfs_iomap_write_direct(
331 xfs_trans_ijoin(tp, ip); 211 xfs_trans_ijoin(tp, ip);
332 212
333 bmapi_flag = XFS_BMAPI_WRITE; 213 bmapi_flag = XFS_BMAPI_WRITE;
334 if ((flags & BMAPI_DIRECT) && (offset < ip->i_size || extsz)) 214 if (offset < ip->i_size || extsz)
335 bmapi_flag |= XFS_BMAPI_PREALLOC; 215 bmapi_flag |= XFS_BMAPI_PREALLOC;
336 216
337 /* 217 /*
@@ -370,7 +250,6 @@ xfs_iomap_write_direct(
370 goto error_out; 250 goto error_out;
371 } 251 }
372 252
373 *nmaps = 1;
374 return 0; 253 return 0;
375 254
376error0: /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */ 255error0: /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */
@@ -379,7 +258,6 @@ error0: /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */
379 258
380error1: /* Just cancel transaction */ 259error1: /* Just cancel transaction */
381 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); 260 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
382 *nmaps = 0; /* nothing set-up here */
383 261
384error_out: 262error_out:
385 return XFS_ERROR(error); 263 return XFS_ERROR(error);
@@ -389,6 +267,9 @@ error_out:
389 * If the caller is doing a write at the end of the file, then extend the 267 * If the caller is doing a write at the end of the file, then extend the
390 * allocation out to the file system's write iosize. We clean up any extra 268 * allocation out to the file system's write iosize. We clean up any extra
391 * space left over when the file is closed in xfs_inactive(). 269 * space left over when the file is closed in xfs_inactive().
270 *
271 * If we find we already have delalloc preallocation beyond EOF, don't do more
272 * preallocation as it is not needed.
392 */ 273 */
393STATIC int 274STATIC int
394xfs_iomap_eof_want_preallocate( 275xfs_iomap_eof_want_preallocate(
@@ -396,7 +277,6 @@ xfs_iomap_eof_want_preallocate(
396 xfs_inode_t *ip, 277 xfs_inode_t *ip,
397 xfs_off_t offset, 278 xfs_off_t offset,
398 size_t count, 279 size_t count,
399 int ioflag,
400 xfs_bmbt_irec_t *imap, 280 xfs_bmbt_irec_t *imap,
401 int nimaps, 281 int nimaps,
402 int *prealloc) 282 int *prealloc)
@@ -405,6 +285,7 @@ xfs_iomap_eof_want_preallocate(
405 xfs_filblks_t count_fsb; 285 xfs_filblks_t count_fsb;
406 xfs_fsblock_t firstblock; 286 xfs_fsblock_t firstblock;
407 int n, error, imaps; 287 int n, error, imaps;
288 int found_delalloc = 0;
408 289
409 *prealloc = 0; 290 *prealloc = 0;
410 if ((offset + count) <= ip->i_size) 291 if ((offset + count) <= ip->i_size)
@@ -429,20 +310,71 @@ xfs_iomap_eof_want_preallocate(
429 return 0; 310 return 0;
430 start_fsb += imap[n].br_blockcount; 311 start_fsb += imap[n].br_blockcount;
431 count_fsb -= imap[n].br_blockcount; 312 count_fsb -= imap[n].br_blockcount;
313
314 if (imap[n].br_startblock == DELAYSTARTBLOCK)
315 found_delalloc = 1;
432 } 316 }
433 } 317 }
434 *prealloc = 1; 318 if (!found_delalloc)
319 *prealloc = 1;
435 return 0; 320 return 0;
436} 321}
437 322
438STATIC int 323/*
324 * If we don't have a user specified preallocation size, dynamically increase
325 * the preallocation size as the size of the file grows. Cap the maximum size
326 * at a single extent or less if the filesystem is near full. The closer the
327 * filesystem is to full, the smaller the maximum preallocation.
328 */
329STATIC xfs_fsblock_t
330xfs_iomap_prealloc_size(
331 struct xfs_mount *mp,
332 struct xfs_inode *ip)
333{
334 xfs_fsblock_t alloc_blocks = 0;
335
336 if (!(mp->m_flags & XFS_MOUNT_DFLT_IOSIZE)) {
337 int shift = 0;
338 int64_t freesp;
339
340 /*
341 * rounddown_pow_of_two() returns an undefined result
342 * if we pass in alloc_blocks = 0. Hence the "+ 1" to
343 * ensure we always pass in a non-zero value.
344 */
345 alloc_blocks = XFS_B_TO_FSB(mp, ip->i_size) + 1;
346 alloc_blocks = XFS_FILEOFF_MIN(MAXEXTLEN,
347 rounddown_pow_of_two(alloc_blocks));
348
349 xfs_icsb_sync_counters(mp, XFS_ICSB_LAZY_COUNT);
350 freesp = mp->m_sb.sb_fdblocks;
351 if (freesp < mp->m_low_space[XFS_LOWSP_5_PCNT]) {
352 shift = 2;
353 if (freesp < mp->m_low_space[XFS_LOWSP_4_PCNT])
354 shift++;
355 if (freesp < mp->m_low_space[XFS_LOWSP_3_PCNT])
356 shift++;
357 if (freesp < mp->m_low_space[XFS_LOWSP_2_PCNT])
358 shift++;
359 if (freesp < mp->m_low_space[XFS_LOWSP_1_PCNT])
360 shift++;
361 }
362 if (shift)
363 alloc_blocks >>= shift;
364 }
365
366 if (alloc_blocks < mp->m_writeio_blocks)
367 alloc_blocks = mp->m_writeio_blocks;
368
369 return alloc_blocks;
370}
371
372int
439xfs_iomap_write_delay( 373xfs_iomap_write_delay(
440 xfs_inode_t *ip, 374 xfs_inode_t *ip,
441 xfs_off_t offset, 375 xfs_off_t offset,
442 size_t count, 376 size_t count,
443 int ioflag, 377 xfs_bmbt_irec_t *ret_imap)
444 xfs_bmbt_irec_t *ret_imap,
445 int *nmaps)
446{ 378{
447 xfs_mount_t *mp = ip->i_mount; 379 xfs_mount_t *mp = ip->i_mount;
448 xfs_fileoff_t offset_fsb; 380 xfs_fileoff_t offset_fsb;
@@ -469,16 +401,19 @@ xfs_iomap_write_delay(
469 extsz = xfs_get_extsz_hint(ip); 401 extsz = xfs_get_extsz_hint(ip);
470 offset_fsb = XFS_B_TO_FSBT(mp, offset); 402 offset_fsb = XFS_B_TO_FSBT(mp, offset);
471 403
404
472 error = xfs_iomap_eof_want_preallocate(mp, ip, offset, count, 405 error = xfs_iomap_eof_want_preallocate(mp, ip, offset, count,
473 ioflag, imap, XFS_WRITE_IMAPS, &prealloc); 406 imap, XFS_WRITE_IMAPS, &prealloc);
474 if (error) 407 if (error)
475 return error; 408 return error;
476 409
477retry: 410retry:
478 if (prealloc) { 411 if (prealloc) {
412 xfs_fsblock_t alloc_blocks = xfs_iomap_prealloc_size(mp, ip);
413
479 aligned_offset = XFS_WRITEIO_ALIGN(mp, (offset + count - 1)); 414 aligned_offset = XFS_WRITEIO_ALIGN(mp, (offset + count - 1));
480 ioalign = XFS_B_TO_FSBT(mp, aligned_offset); 415 ioalign = XFS_B_TO_FSBT(mp, aligned_offset);
481 last_fsb = ioalign + mp->m_writeio_blocks; 416 last_fsb = ioalign + alloc_blocks;
482 } else { 417 } else {
483 last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count))); 418 last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count)));
484 } 419 }
@@ -496,22 +431,31 @@ retry:
496 XFS_BMAPI_DELAY | XFS_BMAPI_WRITE | 431 XFS_BMAPI_DELAY | XFS_BMAPI_WRITE |
497 XFS_BMAPI_ENTIRE, &firstblock, 1, imap, 432 XFS_BMAPI_ENTIRE, &firstblock, 1, imap,
498 &nimaps, NULL); 433 &nimaps, NULL);
499 if (error && (error != ENOSPC)) 434 switch (error) {
435 case 0:
436 case ENOSPC:
437 case EDQUOT:
438 break;
439 default:
500 return XFS_ERROR(error); 440 return XFS_ERROR(error);
441 }
501 442
502 /* 443 /*
503 * If bmapi returned us nothing, and if we didn't get back EDQUOT, 444 * If bmapi returned us nothing, we got either ENOSPC or EDQUOT. For
504 * then we must have run out of space - flush all other inodes with 445 * ENOSPC, flush all other inodes with delalloc blocks to free up
505 * delalloc blocks and retry without EOF preallocation. 446 * some of the excess reserved metadata space. For both cases, retry
447 * without EOF preallocation.
506 */ 448 */
507 if (nimaps == 0) { 449 if (nimaps == 0) {
508 trace_xfs_delalloc_enospc(ip, offset, count); 450 trace_xfs_delalloc_enospc(ip, offset, count);
509 if (flushed) 451 if (flushed)
510 return XFS_ERROR(ENOSPC); 452 return XFS_ERROR(error ? error : ENOSPC);
511 453
512 xfs_iunlock(ip, XFS_ILOCK_EXCL); 454 if (error == ENOSPC) {
513 xfs_flush_inodes(ip); 455 xfs_iunlock(ip, XFS_ILOCK_EXCL);
514 xfs_ilock(ip, XFS_ILOCK_EXCL); 456 xfs_flush_inodes(ip);
457 xfs_ilock(ip, XFS_ILOCK_EXCL);
458 }
515 459
516 flushed = 1; 460 flushed = 1;
517 error = 0; 461 error = 0;
@@ -523,8 +467,6 @@ retry:
523 return xfs_cmn_err_fsblock_zero(ip, &imap[0]); 467 return xfs_cmn_err_fsblock_zero(ip, &imap[0]);
524 468
525 *ret_imap = imap[0]; 469 *ret_imap = imap[0];
526 *nmaps = 1;
527
528 return 0; 470 return 0;
529} 471}
530 472
@@ -538,13 +480,12 @@ retry:
538 * We no longer bother to look at the incoming map - all we have to 480 * We no longer bother to look at the incoming map - all we have to
539 * guarantee is that whatever we allocate fills the required range. 481 * guarantee is that whatever we allocate fills the required range.
540 */ 482 */
541STATIC int 483int
542xfs_iomap_write_allocate( 484xfs_iomap_write_allocate(
543 xfs_inode_t *ip, 485 xfs_inode_t *ip,
544 xfs_off_t offset, 486 xfs_off_t offset,
545 size_t count, 487 size_t count,
546 xfs_bmbt_irec_t *imap, 488 xfs_bmbt_irec_t *imap)
547 int *retmap)
548{ 489{
549 xfs_mount_t *mp = ip->i_mount; 490 xfs_mount_t *mp = ip->i_mount;
550 xfs_fileoff_t offset_fsb, last_block; 491 xfs_fileoff_t offset_fsb, last_block;
@@ -557,8 +498,6 @@ xfs_iomap_write_allocate(
557 int error = 0; 498 int error = 0;
558 int nres; 499 int nres;
559 500
560 *retmap = 0;
561
562 /* 501 /*
563 * Make sure that the dquots are there. 502 * Make sure that the dquots are there.
564 */ 503 */
@@ -680,7 +619,6 @@ xfs_iomap_write_allocate(
680 if ((offset_fsb >= imap->br_startoff) && 619 if ((offset_fsb >= imap->br_startoff) &&
681 (offset_fsb < (imap->br_startoff + 620 (offset_fsb < (imap->br_startoff +
682 imap->br_blockcount))) { 621 imap->br_blockcount))) {
683 *retmap = 1;
684 XFS_STATS_INC(xs_xstrat_quick); 622 XFS_STATS_INC(xs_xstrat_quick);
685 return 0; 623 return 0;
686 } 624 }
diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
index 7748a430f50..80615760959 100644
--- a/fs/xfs/xfs_iomap.h
+++ b/fs/xfs/xfs_iomap.h
@@ -18,30 +18,15 @@
18#ifndef __XFS_IOMAP_H__ 18#ifndef __XFS_IOMAP_H__
19#define __XFS_IOMAP_H__ 19#define __XFS_IOMAP_H__
20 20
21/* base extent manipulation calls */
22#define BMAPI_READ (1 << 0) /* read extents */
23#define BMAPI_WRITE (1 << 1) /* create extents */
24#define BMAPI_ALLOCATE (1 << 2) /* delayed allocate to real extents */
25
26/* modifiers */
27#define BMAPI_IGNSTATE (1 << 4) /* ignore unwritten state on read */
28#define BMAPI_DIRECT (1 << 5) /* direct instead of buffered write */
29#define BMAPI_MMA (1 << 6) /* allocate for mmap write */
30#define BMAPI_TRYLOCK (1 << 7) /* non-blocking request */
31
32#define BMAPI_FLAGS \
33 { BMAPI_READ, "READ" }, \
34 { BMAPI_WRITE, "WRITE" }, \
35 { BMAPI_ALLOCATE, "ALLOCATE" }, \
36 { BMAPI_IGNSTATE, "IGNSTATE" }, \
37 { BMAPI_DIRECT, "DIRECT" }, \
38 { BMAPI_TRYLOCK, "TRYLOCK" }
39
40struct xfs_inode; 21struct xfs_inode;
41struct xfs_bmbt_irec; 22struct xfs_bmbt_irec;
42 23
43extern int xfs_iomap(struct xfs_inode *, xfs_off_t, ssize_t, int, 24extern int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t,
44 struct xfs_bmbt_irec *, int *, int *); 25 struct xfs_bmbt_irec *, int);
26extern int xfs_iomap_write_delay(struct xfs_inode *, xfs_off_t, size_t,
27 struct xfs_bmbt_irec *);
28extern int xfs_iomap_write_allocate(struct xfs_inode *, xfs_off_t, size_t,
29 struct xfs_bmbt_irec *);
45extern int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, size_t); 30extern int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, size_t);
46 31
47#endif /* __XFS_IOMAP_H__*/ 32#endif /* __XFS_IOMAP_H__*/
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index cee4ab9f8a9..ae6fef1ff56 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -47,7 +47,7 @@ STATIC xlog_t * xlog_alloc_log(xfs_mount_t *mp,
47 xfs_buftarg_t *log_target, 47 xfs_buftarg_t *log_target,
48 xfs_daddr_t blk_offset, 48 xfs_daddr_t blk_offset,
49 int num_bblks); 49 int num_bblks);
50STATIC int xlog_space_left(xlog_t *log, int cycle, int bytes); 50STATIC int xlog_space_left(struct log *log, atomic64_t *head);
51STATIC int xlog_sync(xlog_t *log, xlog_in_core_t *iclog); 51STATIC int xlog_sync(xlog_t *log, xlog_in_core_t *iclog);
52STATIC void xlog_dealloc_log(xlog_t *log); 52STATIC void xlog_dealloc_log(xlog_t *log);
53 53
@@ -70,7 +70,7 @@ STATIC void xlog_state_want_sync(xlog_t *log, xlog_in_core_t *iclog);
70/* local functions to manipulate grant head */ 70/* local functions to manipulate grant head */
71STATIC int xlog_grant_log_space(xlog_t *log, 71STATIC int xlog_grant_log_space(xlog_t *log,
72 xlog_ticket_t *xtic); 72 xlog_ticket_t *xtic);
73STATIC void xlog_grant_push_ail(xfs_mount_t *mp, 73STATIC void xlog_grant_push_ail(struct log *log,
74 int need_bytes); 74 int need_bytes);
75STATIC void xlog_regrant_reserve_log_space(xlog_t *log, 75STATIC void xlog_regrant_reserve_log_space(xlog_t *log,
76 xlog_ticket_t *ticket); 76 xlog_ticket_t *ticket);
@@ -81,98 +81,73 @@ STATIC void xlog_ungrant_log_space(xlog_t *log,
81 81
82#if defined(DEBUG) 82#if defined(DEBUG)
83STATIC void xlog_verify_dest_ptr(xlog_t *log, char *ptr); 83STATIC void xlog_verify_dest_ptr(xlog_t *log, char *ptr);
84STATIC void xlog_verify_grant_head(xlog_t *log, int equals); 84STATIC void xlog_verify_grant_tail(struct log *log);
85STATIC void xlog_verify_iclog(xlog_t *log, xlog_in_core_t *iclog, 85STATIC void xlog_verify_iclog(xlog_t *log, xlog_in_core_t *iclog,
86 int count, boolean_t syncing); 86 int count, boolean_t syncing);
87STATIC void xlog_verify_tail_lsn(xlog_t *log, xlog_in_core_t *iclog, 87STATIC void xlog_verify_tail_lsn(xlog_t *log, xlog_in_core_t *iclog,
88 xfs_lsn_t tail_lsn); 88 xfs_lsn_t tail_lsn);
89#else 89#else
90#define xlog_verify_dest_ptr(a,b) 90#define xlog_verify_dest_ptr(a,b)
91#define xlog_verify_grant_head(a,b) 91#define xlog_verify_grant_tail(a)
92#define xlog_verify_iclog(a,b,c,d) 92#define xlog_verify_iclog(a,b,c,d)
93#define xlog_verify_tail_lsn(a,b,c) 93#define xlog_verify_tail_lsn(a,b,c)
94#endif 94#endif
95 95
96STATIC int xlog_iclogs_empty(xlog_t *log); 96STATIC int xlog_iclogs_empty(xlog_t *log);
97 97
98
99static void 98static void
100xlog_ins_ticketq(struct xlog_ticket **qp, struct xlog_ticket *tic) 99xlog_grant_sub_space(
100 struct log *log,
101 atomic64_t *head,
102 int bytes)
101{ 103{
102 if (*qp) { 104 int64_t head_val = atomic64_read(head);
103 tic->t_next = (*qp); 105 int64_t new, old;
104 tic->t_prev = (*qp)->t_prev;
105 (*qp)->t_prev->t_next = tic;
106 (*qp)->t_prev = tic;
107 } else {
108 tic->t_prev = tic->t_next = tic;
109 *qp = tic;
110 }
111 106
112 tic->t_flags |= XLOG_TIC_IN_Q; 107 do {
113} 108 int cycle, space;
114 109
115static void 110 xlog_crack_grant_head_val(head_val, &cycle, &space);
116xlog_del_ticketq(struct xlog_ticket **qp, struct xlog_ticket *tic)
117{
118 if (tic == tic->t_next) {
119 *qp = NULL;
120 } else {
121 *qp = tic->t_next;
122 tic->t_next->t_prev = tic->t_prev;
123 tic->t_prev->t_next = tic->t_next;
124 }
125 111
126 tic->t_next = tic->t_prev = NULL; 112 space -= bytes;
127 tic->t_flags &= ~XLOG_TIC_IN_Q; 113 if (space < 0) {
114 space += log->l_logsize;
115 cycle--;
116 }
117
118 old = head_val;
119 new = xlog_assign_grant_head_val(cycle, space);
120 head_val = atomic64_cmpxchg(head, old, new);
121 } while (head_val != old);
128} 122}
129 123
130static void 124static void
131xlog_grant_sub_space(struct log *log, int bytes) 125xlog_grant_add_space(
126 struct log *log,
127 atomic64_t *head,
128 int bytes)
132{ 129{
133 log->l_grant_write_bytes -= bytes; 130 int64_t head_val = atomic64_read(head);
134 if (log->l_grant_write_bytes < 0) { 131 int64_t new, old;
135 log->l_grant_write_bytes += log->l_logsize;
136 log->l_grant_write_cycle--;
137 }
138
139 log->l_grant_reserve_bytes -= bytes;
140 if ((log)->l_grant_reserve_bytes < 0) {
141 log->l_grant_reserve_bytes += log->l_logsize;
142 log->l_grant_reserve_cycle--;
143 }
144 132
145} 133 do {
134 int tmp;
135 int cycle, space;
146 136
147static void 137 xlog_crack_grant_head_val(head_val, &cycle, &space);
148xlog_grant_add_space_write(struct log *log, int bytes)
149{
150 int tmp = log->l_logsize - log->l_grant_write_bytes;
151 if (tmp > bytes)
152 log->l_grant_write_bytes += bytes;
153 else {
154 log->l_grant_write_cycle++;
155 log->l_grant_write_bytes = bytes - tmp;
156 }
157}
158 138
159static void 139 tmp = log->l_logsize - space;
160xlog_grant_add_space_reserve(struct log *log, int bytes) 140 if (tmp > bytes)
161{ 141 space += bytes;
162 int tmp = log->l_logsize - log->l_grant_reserve_bytes; 142 else {
163 if (tmp > bytes) 143 space = bytes - tmp;
164 log->l_grant_reserve_bytes += bytes; 144 cycle++;
165 else { 145 }
166 log->l_grant_reserve_cycle++;
167 log->l_grant_reserve_bytes = bytes - tmp;
168 }
169}
170 146
171static inline void 147 old = head_val;
172xlog_grant_add_space(struct log *log, int bytes) 148 new = xlog_assign_grant_head_val(cycle, space);
173{ 149 head_val = atomic64_cmpxchg(head, old, new);
174 xlog_grant_add_space_write(log, bytes); 150 } while (head_val != old);
175 xlog_grant_add_space_reserve(log, bytes);
176} 151}
177 152
178static void 153static void
@@ -355,7 +330,7 @@ xfs_log_reserve(
355 330
356 trace_xfs_log_reserve(log, internal_ticket); 331 trace_xfs_log_reserve(log, internal_ticket);
357 332
358 xlog_grant_push_ail(mp, internal_ticket->t_unit_res); 333 xlog_grant_push_ail(log, internal_ticket->t_unit_res);
359 retval = xlog_regrant_write_log_space(log, internal_ticket); 334 retval = xlog_regrant_write_log_space(log, internal_ticket);
360 } else { 335 } else {
361 /* may sleep if need to allocate more tickets */ 336 /* may sleep if need to allocate more tickets */
@@ -369,7 +344,7 @@ xfs_log_reserve(
369 344
370 trace_xfs_log_reserve(log, internal_ticket); 345 trace_xfs_log_reserve(log, internal_ticket);
371 346
372 xlog_grant_push_ail(mp, 347 xlog_grant_push_ail(log,
373 (internal_ticket->t_unit_res * 348 (internal_ticket->t_unit_res *
374 internal_ticket->t_cnt)); 349 internal_ticket->t_cnt));
375 retval = xlog_grant_log_space(log, internal_ticket); 350 retval = xlog_grant_log_space(log, internal_ticket);
@@ -402,7 +377,7 @@ xfs_log_mount(
402 cmn_err(CE_NOTE, "XFS mounting filesystem %s", mp->m_fsname); 377 cmn_err(CE_NOTE, "XFS mounting filesystem %s", mp->m_fsname);
403 else { 378 else {
404 cmn_err(CE_NOTE, 379 cmn_err(CE_NOTE,
405 "!Mounting filesystem \"%s\" in no-recovery mode. Filesystem will be inconsistent.", 380 "Mounting filesystem \"%s\" in no-recovery mode. Filesystem will be inconsistent.",
406 mp->m_fsname); 381 mp->m_fsname);
407 ASSERT(mp->m_flags & XFS_MOUNT_RDONLY); 382 ASSERT(mp->m_flags & XFS_MOUNT_RDONLY);
408 } 383 }
@@ -584,8 +559,8 @@ xfs_log_unmount_write(xfs_mount_t *mp)
584 if (!(iclog->ic_state == XLOG_STATE_ACTIVE || 559 if (!(iclog->ic_state == XLOG_STATE_ACTIVE ||
585 iclog->ic_state == XLOG_STATE_DIRTY)) { 560 iclog->ic_state == XLOG_STATE_DIRTY)) {
586 if (!XLOG_FORCED_SHUTDOWN(log)) { 561 if (!XLOG_FORCED_SHUTDOWN(log)) {
587 sv_wait(&iclog->ic_force_wait, PMEM, 562 xlog_wait(&iclog->ic_force_wait,
588 &log->l_icloglock, s); 563 &log->l_icloglock);
589 } else { 564 } else {
590 spin_unlock(&log->l_icloglock); 565 spin_unlock(&log->l_icloglock);
591 } 566 }
@@ -625,8 +600,8 @@ xfs_log_unmount_write(xfs_mount_t *mp)
625 || iclog->ic_state == XLOG_STATE_DIRTY 600 || iclog->ic_state == XLOG_STATE_DIRTY
626 || iclog->ic_state == XLOG_STATE_IOERROR) ) { 601 || iclog->ic_state == XLOG_STATE_IOERROR) ) {
627 602
628 sv_wait(&iclog->ic_force_wait, PMEM, 603 xlog_wait(&iclog->ic_force_wait,
629 &log->l_icloglock, s); 604 &log->l_icloglock);
630 } else { 605 } else {
631 spin_unlock(&log->l_icloglock); 606 spin_unlock(&log->l_icloglock);
632 } 607 }
@@ -703,55 +678,46 @@ xfs_log_move_tail(xfs_mount_t *mp,
703{ 678{
704 xlog_ticket_t *tic; 679 xlog_ticket_t *tic;
705 xlog_t *log = mp->m_log; 680 xlog_t *log = mp->m_log;
706 int need_bytes, free_bytes, cycle, bytes; 681 int need_bytes, free_bytes;
707 682
708 if (XLOG_FORCED_SHUTDOWN(log)) 683 if (XLOG_FORCED_SHUTDOWN(log))
709 return; 684 return;
710 685
711 if (tail_lsn == 0) { 686 if (tail_lsn == 0)
712 /* needed since sync_lsn is 64 bits */ 687 tail_lsn = atomic64_read(&log->l_last_sync_lsn);
713 spin_lock(&log->l_icloglock);
714 tail_lsn = log->l_last_sync_lsn;
715 spin_unlock(&log->l_icloglock);
716 }
717
718 spin_lock(&log->l_grant_lock);
719 688
720 /* Also an invalid lsn. 1 implies that we aren't passing in a valid 689 /* tail_lsn == 1 implies that we weren't passed a valid value. */
721 * tail_lsn. 690 if (tail_lsn != 1)
722 */ 691 atomic64_set(&log->l_tail_lsn, tail_lsn);
723 if (tail_lsn != 1) {
724 log->l_tail_lsn = tail_lsn;
725 }
726 692
727 if ((tic = log->l_write_headq)) { 693 if (!list_empty_careful(&log->l_writeq)) {
728#ifdef DEBUG 694#ifdef DEBUG
729 if (log->l_flags & XLOG_ACTIVE_RECOVERY) 695 if (log->l_flags & XLOG_ACTIVE_RECOVERY)
730 panic("Recovery problem"); 696 panic("Recovery problem");
731#endif 697#endif
732 cycle = log->l_grant_write_cycle; 698 spin_lock(&log->l_grant_write_lock);
733 bytes = log->l_grant_write_bytes; 699 free_bytes = xlog_space_left(log, &log->l_grant_write_head);
734 free_bytes = xlog_space_left(log, cycle, bytes); 700 list_for_each_entry(tic, &log->l_writeq, t_queue) {
735 do {
736 ASSERT(tic->t_flags & XLOG_TIC_PERM_RESERV); 701 ASSERT(tic->t_flags & XLOG_TIC_PERM_RESERV);
737 702
738 if (free_bytes < tic->t_unit_res && tail_lsn != 1) 703 if (free_bytes < tic->t_unit_res && tail_lsn != 1)
739 break; 704 break;
740 tail_lsn = 0; 705 tail_lsn = 0;
741 free_bytes -= tic->t_unit_res; 706 free_bytes -= tic->t_unit_res;
742 sv_signal(&tic->t_wait); 707 trace_xfs_log_regrant_write_wake_up(log, tic);
743 tic = tic->t_next; 708 wake_up(&tic->t_wait);
744 } while (tic != log->l_write_headq); 709 }
710 spin_unlock(&log->l_grant_write_lock);
745 } 711 }
746 if ((tic = log->l_reserve_headq)) { 712
713 if (!list_empty_careful(&log->l_reserveq)) {
747#ifdef DEBUG 714#ifdef DEBUG
748 if (log->l_flags & XLOG_ACTIVE_RECOVERY) 715 if (log->l_flags & XLOG_ACTIVE_RECOVERY)
749 panic("Recovery problem"); 716 panic("Recovery problem");
750#endif 717#endif
751 cycle = log->l_grant_reserve_cycle; 718 spin_lock(&log->l_grant_reserve_lock);
752 bytes = log->l_grant_reserve_bytes; 719 free_bytes = xlog_space_left(log, &log->l_grant_reserve_head);
753 free_bytes = xlog_space_left(log, cycle, bytes); 720 list_for_each_entry(tic, &log->l_reserveq, t_queue) {
754 do {
755 if (tic->t_flags & XLOG_TIC_PERM_RESERV) 721 if (tic->t_flags & XLOG_TIC_PERM_RESERV)
756 need_bytes = tic->t_unit_res*tic->t_cnt; 722 need_bytes = tic->t_unit_res*tic->t_cnt;
757 else 723 else
@@ -760,12 +726,12 @@ xfs_log_move_tail(xfs_mount_t *mp,
760 break; 726 break;
761 tail_lsn = 0; 727 tail_lsn = 0;
762 free_bytes -= need_bytes; 728 free_bytes -= need_bytes;
763 sv_signal(&tic->t_wait); 729 trace_xfs_log_grant_wake_up(log, tic);
764 tic = tic->t_next; 730 wake_up(&tic->t_wait);
765 } while (tic != log->l_reserve_headq); 731 }
732 spin_unlock(&log->l_grant_reserve_lock);
766 } 733 }
767 spin_unlock(&log->l_grant_lock); 734}
768} /* xfs_log_move_tail */
769 735
770/* 736/*
771 * Determine if we have a transaction that has gone to disk 737 * Determine if we have a transaction that has gone to disk
@@ -831,23 +797,19 @@ xfs_log_need_covered(xfs_mount_t *mp)
831 * We may be holding the log iclog lock upon entering this routine. 797 * We may be holding the log iclog lock upon entering this routine.
832 */ 798 */
833xfs_lsn_t 799xfs_lsn_t
834xlog_assign_tail_lsn(xfs_mount_t *mp) 800xlog_assign_tail_lsn(
801 struct xfs_mount *mp)
835{ 802{
836 xfs_lsn_t tail_lsn; 803 xfs_lsn_t tail_lsn;
837 xlog_t *log = mp->m_log; 804 struct log *log = mp->m_log;
838 805
839 tail_lsn = xfs_trans_ail_tail(mp->m_ail); 806 tail_lsn = xfs_trans_ail_tail(mp->m_ail);
840 spin_lock(&log->l_grant_lock); 807 if (!tail_lsn)
841 if (tail_lsn != 0) { 808 tail_lsn = atomic64_read(&log->l_last_sync_lsn);
842 log->l_tail_lsn = tail_lsn;
843 } else {
844 tail_lsn = log->l_tail_lsn = log->l_last_sync_lsn;
845 }
846 spin_unlock(&log->l_grant_lock);
847 809
810 atomic64_set(&log->l_tail_lsn, tail_lsn);
848 return tail_lsn; 811 return tail_lsn;
849} /* xlog_assign_tail_lsn */ 812}
850
851 813
852/* 814/*
853 * Return the space in the log between the tail and the head. The head 815 * Return the space in the log between the tail and the head. The head
@@ -864,21 +826,26 @@ xlog_assign_tail_lsn(xfs_mount_t *mp)
864 * result is that we return the size of the log as the amount of space left. 826 * result is that we return the size of the log as the amount of space left.
865 */ 827 */
866STATIC int 828STATIC int
867xlog_space_left(xlog_t *log, int cycle, int bytes) 829xlog_space_left(
868{ 830 struct log *log,
869 int free_bytes; 831 atomic64_t *head)
870 int tail_bytes; 832{
871 int tail_cycle; 833 int free_bytes;
872 834 int tail_bytes;
873 tail_bytes = BBTOB(BLOCK_LSN(log->l_tail_lsn)); 835 int tail_cycle;
874 tail_cycle = CYCLE_LSN(log->l_tail_lsn); 836 int head_cycle;
875 if ((tail_cycle == cycle) && (bytes >= tail_bytes)) { 837 int head_bytes;
876 free_bytes = log->l_logsize - (bytes - tail_bytes); 838
877 } else if ((tail_cycle + 1) < cycle) { 839 xlog_crack_grant_head(head, &head_cycle, &head_bytes);
840 xlog_crack_atomic_lsn(&log->l_tail_lsn, &tail_cycle, &tail_bytes);
841 tail_bytes = BBTOB(tail_bytes);
842 if (tail_cycle == head_cycle && head_bytes >= tail_bytes)
843 free_bytes = log->l_logsize - (head_bytes - tail_bytes);
844 else if (tail_cycle + 1 < head_cycle)
878 return 0; 845 return 0;
879 } else if (tail_cycle < cycle) { 846 else if (tail_cycle < head_cycle) {
880 ASSERT(tail_cycle == (cycle - 1)); 847 ASSERT(tail_cycle == (head_cycle - 1));
881 free_bytes = tail_bytes - bytes; 848 free_bytes = tail_bytes - head_bytes;
882 } else { 849 } else {
883 /* 850 /*
884 * The reservation head is behind the tail. 851 * The reservation head is behind the tail.
@@ -889,12 +856,12 @@ xlog_space_left(xlog_t *log, int cycle, int bytes)
889 "xlog_space_left: head behind tail\n" 856 "xlog_space_left: head behind tail\n"
890 " tail_cycle = %d, tail_bytes = %d\n" 857 " tail_cycle = %d, tail_bytes = %d\n"
891 " GH cycle = %d, GH bytes = %d", 858 " GH cycle = %d, GH bytes = %d",
892 tail_cycle, tail_bytes, cycle, bytes); 859 tail_cycle, tail_bytes, head_cycle, head_bytes);
893 ASSERT(0); 860 ASSERT(0);
894 free_bytes = log->l_logsize; 861 free_bytes = log->l_logsize;
895 } 862 }
896 return free_bytes; 863 return free_bytes;
897} /* xlog_space_left */ 864}
898 865
899 866
900/* 867/*
@@ -1047,12 +1014,16 @@ xlog_alloc_log(xfs_mount_t *mp,
1047 log->l_flags |= XLOG_ACTIVE_RECOVERY; 1014 log->l_flags |= XLOG_ACTIVE_RECOVERY;
1048 1015
1049 log->l_prev_block = -1; 1016 log->l_prev_block = -1;
1050 log->l_tail_lsn = xlog_assign_lsn(1, 0);
1051 /* log->l_tail_lsn = 0x100000000LL; cycle = 1; current block = 0 */ 1017 /* log->l_tail_lsn = 0x100000000LL; cycle = 1; current block = 0 */
1052 log->l_last_sync_lsn = log->l_tail_lsn; 1018 xlog_assign_atomic_lsn(&log->l_tail_lsn, 1, 0);
1019 xlog_assign_atomic_lsn(&log->l_last_sync_lsn, 1, 0);
1053 log->l_curr_cycle = 1; /* 0 is bad since this is initial value */ 1020 log->l_curr_cycle = 1; /* 0 is bad since this is initial value */
1054 log->l_grant_reserve_cycle = 1; 1021 xlog_assign_grant_head(&log->l_grant_reserve_head, 1, 0);
1055 log->l_grant_write_cycle = 1; 1022 xlog_assign_grant_head(&log->l_grant_write_head, 1, 0);
1023 INIT_LIST_HEAD(&log->l_reserveq);
1024 INIT_LIST_HEAD(&log->l_writeq);
1025 spin_lock_init(&log->l_grant_reserve_lock);
1026 spin_lock_init(&log->l_grant_write_lock);
1056 1027
1057 error = EFSCORRUPTED; 1028 error = EFSCORRUPTED;
1058 if (xfs_sb_version_hassector(&mp->m_sb)) { 1029 if (xfs_sb_version_hassector(&mp->m_sb)) {
@@ -1094,8 +1065,7 @@ xlog_alloc_log(xfs_mount_t *mp,
1094 log->l_xbuf = bp; 1065 log->l_xbuf = bp;
1095 1066
1096 spin_lock_init(&log->l_icloglock); 1067 spin_lock_init(&log->l_icloglock);
1097 spin_lock_init(&log->l_grant_lock); 1068 init_waitqueue_head(&log->l_flush_wait);
1098 sv_init(&log->l_flush_wait, 0, "flush_wait");
1099 1069
1100 /* log record size must be multiple of BBSIZE; see xlog_rec_header_t */ 1070 /* log record size must be multiple of BBSIZE; see xlog_rec_header_t */
1101 ASSERT((XFS_BUF_SIZE(bp) & BBMASK) == 0); 1071 ASSERT((XFS_BUF_SIZE(bp) & BBMASK) == 0);
@@ -1151,8 +1121,8 @@ xlog_alloc_log(xfs_mount_t *mp,
1151 1121
1152 ASSERT(XFS_BUF_ISBUSY(iclog->ic_bp)); 1122 ASSERT(XFS_BUF_ISBUSY(iclog->ic_bp));
1153 ASSERT(XFS_BUF_VALUSEMA(iclog->ic_bp) <= 0); 1123 ASSERT(XFS_BUF_VALUSEMA(iclog->ic_bp) <= 0);
1154 sv_init(&iclog->ic_force_wait, SV_DEFAULT, "iclog-force"); 1124 init_waitqueue_head(&iclog->ic_force_wait);
1155 sv_init(&iclog->ic_write_wait, SV_DEFAULT, "iclog-write"); 1125 init_waitqueue_head(&iclog->ic_write_wait);
1156 1126
1157 iclogp = &iclog->ic_next; 1127 iclogp = &iclog->ic_next;
1158 } 1128 }
@@ -1167,15 +1137,11 @@ xlog_alloc_log(xfs_mount_t *mp,
1167out_free_iclog: 1137out_free_iclog:
1168 for (iclog = log->l_iclog; iclog; iclog = prev_iclog) { 1138 for (iclog = log->l_iclog; iclog; iclog = prev_iclog) {
1169 prev_iclog = iclog->ic_next; 1139 prev_iclog = iclog->ic_next;
1170 if (iclog->ic_bp) { 1140 if (iclog->ic_bp)
1171 sv_destroy(&iclog->ic_force_wait);
1172 sv_destroy(&iclog->ic_write_wait);
1173 xfs_buf_free(iclog->ic_bp); 1141 xfs_buf_free(iclog->ic_bp);
1174 }
1175 kmem_free(iclog); 1142 kmem_free(iclog);
1176 } 1143 }
1177 spinlock_destroy(&log->l_icloglock); 1144 spinlock_destroy(&log->l_icloglock);
1178 spinlock_destroy(&log->l_grant_lock);
1179 xfs_buf_free(log->l_xbuf); 1145 xfs_buf_free(log->l_xbuf);
1180out_free_log: 1146out_free_log:
1181 kmem_free(log); 1147 kmem_free(log);
@@ -1223,61 +1189,60 @@ xlog_commit_record(
1223 * water mark. In this manner, we would be creating a low water mark. 1189 * water mark. In this manner, we would be creating a low water mark.
1224 */ 1190 */
1225STATIC void 1191STATIC void
1226xlog_grant_push_ail(xfs_mount_t *mp, 1192xlog_grant_push_ail(
1227 int need_bytes) 1193 struct log *log,
1194 int need_bytes)
1228{ 1195{
1229 xlog_t *log = mp->m_log; /* pointer to the log */ 1196 xfs_lsn_t threshold_lsn = 0;
1230 xfs_lsn_t tail_lsn; /* lsn of the log tail */ 1197 xfs_lsn_t last_sync_lsn;
1231 xfs_lsn_t threshold_lsn = 0; /* lsn we'd like to be at */ 1198 int free_blocks;
1232 int free_blocks; /* free blocks left to write to */ 1199 int free_bytes;
1233 int free_bytes; /* free bytes left to write to */ 1200 int threshold_block;
1234 int threshold_block; /* block in lsn we'd like to be at */ 1201 int threshold_cycle;
1235 int threshold_cycle; /* lsn cycle we'd like to be at */ 1202 int free_threshold;
1236 int free_threshold; 1203
1237 1204 ASSERT(BTOBB(need_bytes) < log->l_logBBsize);
1238 ASSERT(BTOBB(need_bytes) < log->l_logBBsize); 1205
1239 1206 free_bytes = xlog_space_left(log, &log->l_grant_reserve_head);
1240 spin_lock(&log->l_grant_lock); 1207 free_blocks = BTOBBT(free_bytes);
1241 free_bytes = xlog_space_left(log, 1208
1242 log->l_grant_reserve_cycle, 1209 /*
1243 log->l_grant_reserve_bytes); 1210 * Set the threshold for the minimum number of free blocks in the
1244 tail_lsn = log->l_tail_lsn; 1211 * log to the maximum of what the caller needs, one quarter of the
1245 free_blocks = BTOBBT(free_bytes); 1212 * log, and 256 blocks.
1246 1213 */
1247 /* 1214 free_threshold = BTOBB(need_bytes);
1248 * Set the threshold for the minimum number of free blocks in the 1215 free_threshold = MAX(free_threshold, (log->l_logBBsize >> 2));
1249 * log to the maximum of what the caller needs, one quarter of the 1216 free_threshold = MAX(free_threshold, 256);
1250 * log, and 256 blocks. 1217 if (free_blocks >= free_threshold)
1251 */ 1218 return;
1252 free_threshold = BTOBB(need_bytes); 1219
1253 free_threshold = MAX(free_threshold, (log->l_logBBsize >> 2)); 1220 xlog_crack_atomic_lsn(&log->l_tail_lsn, &threshold_cycle,
1254 free_threshold = MAX(free_threshold, 256); 1221 &threshold_block);
1255 if (free_blocks < free_threshold) { 1222 threshold_block += free_threshold;
1256 threshold_block = BLOCK_LSN(tail_lsn) + free_threshold;
1257 threshold_cycle = CYCLE_LSN(tail_lsn);
1258 if (threshold_block >= log->l_logBBsize) { 1223 if (threshold_block >= log->l_logBBsize) {
1259 threshold_block -= log->l_logBBsize; 1224 threshold_block -= log->l_logBBsize;
1260 threshold_cycle += 1; 1225 threshold_cycle += 1;
1261 } 1226 }
1262 threshold_lsn = xlog_assign_lsn(threshold_cycle, threshold_block); 1227 threshold_lsn = xlog_assign_lsn(threshold_cycle,
1228 threshold_block);
1229 /*
1230 * Don't pass in an lsn greater than the lsn of the last
1231 * log record known to be on disk. Use a snapshot of the last sync lsn
1232 * so that it doesn't change between the compare and the set.
1233 */
1234 last_sync_lsn = atomic64_read(&log->l_last_sync_lsn);
1235 if (XFS_LSN_CMP(threshold_lsn, last_sync_lsn) > 0)
1236 threshold_lsn = last_sync_lsn;
1263 1237
1264 /* Don't pass in an lsn greater than the lsn of the last 1238 /*
1265 * log record known to be on disk. 1239 * Get the transaction layer to kick the dirty buffers out to
1240 * disk asynchronously. No point in trying to do this if
1241 * the filesystem is shutting down.
1266 */ 1242 */
1267 if (XFS_LSN_CMP(threshold_lsn, log->l_last_sync_lsn) > 0) 1243 if (!XLOG_FORCED_SHUTDOWN(log))
1268 threshold_lsn = log->l_last_sync_lsn; 1244 xfs_trans_ail_push(log->l_ailp, threshold_lsn);
1269 } 1245}
1270 spin_unlock(&log->l_grant_lock);
1271
1272 /*
1273 * Get the transaction layer to kick the dirty buffers out to
1274 * disk asynchronously. No point in trying to do this if
1275 * the filesystem is shutting down.
1276 */
1277 if (threshold_lsn &&
1278 !XLOG_FORCED_SHUTDOWN(log))
1279 xfs_trans_ail_push(log->l_ailp, threshold_lsn);
1280} /* xlog_grant_push_ail */
1281 1246
1282/* 1247/*
1283 * The bdstrat callback function for log bufs. This gives us a central 1248 * The bdstrat callback function for log bufs. This gives us a central
@@ -1372,9 +1337,8 @@ xlog_sync(xlog_t *log,
1372 roundoff < BBTOB(1))); 1337 roundoff < BBTOB(1)));
1373 1338
1374 /* move grant heads by roundoff in sync */ 1339 /* move grant heads by roundoff in sync */
1375 spin_lock(&log->l_grant_lock); 1340 xlog_grant_add_space(log, &log->l_grant_reserve_head, roundoff);
1376 xlog_grant_add_space(log, roundoff); 1341 xlog_grant_add_space(log, &log->l_grant_write_head, roundoff);
1377 spin_unlock(&log->l_grant_lock);
1378 1342
1379 /* put cycle number in every block */ 1343 /* put cycle number in every block */
1380 xlog_pack_data(log, iclog, roundoff); 1344 xlog_pack_data(log, iclog, roundoff);
@@ -1489,15 +1453,12 @@ xlog_dealloc_log(xlog_t *log)
1489 1453
1490 iclog = log->l_iclog; 1454 iclog = log->l_iclog;
1491 for (i=0; i<log->l_iclog_bufs; i++) { 1455 for (i=0; i<log->l_iclog_bufs; i++) {
1492 sv_destroy(&iclog->ic_force_wait);
1493 sv_destroy(&iclog->ic_write_wait);
1494 xfs_buf_free(iclog->ic_bp); 1456 xfs_buf_free(iclog->ic_bp);
1495 next_iclog = iclog->ic_next; 1457 next_iclog = iclog->ic_next;
1496 kmem_free(iclog); 1458 kmem_free(iclog);
1497 iclog = next_iclog; 1459 iclog = next_iclog;
1498 } 1460 }
1499 spinlock_destroy(&log->l_icloglock); 1461 spinlock_destroy(&log->l_icloglock);
1500 spinlock_destroy(&log->l_grant_lock);
1501 1462
1502 xfs_buf_free(log->l_xbuf); 1463 xfs_buf_free(log->l_xbuf);
1503 log->l_mp->m_log = NULL; 1464 log->l_mp->m_log = NULL;
@@ -2232,7 +2193,7 @@ xlog_state_do_callback(
2232 lowest_lsn = xlog_get_lowest_lsn(log); 2193 lowest_lsn = xlog_get_lowest_lsn(log);
2233 if (lowest_lsn && 2194 if (lowest_lsn &&
2234 XFS_LSN_CMP(lowest_lsn, 2195 XFS_LSN_CMP(lowest_lsn,
2235 be64_to_cpu(iclog->ic_header.h_lsn)) < 0) { 2196 be64_to_cpu(iclog->ic_header.h_lsn)) < 0) {
2236 iclog = iclog->ic_next; 2197 iclog = iclog->ic_next;
2237 continue; /* Leave this iclog for 2198 continue; /* Leave this iclog for
2238 * another thread */ 2199 * another thread */
@@ -2240,23 +2201,21 @@ xlog_state_do_callback(
2240 2201
2241 iclog->ic_state = XLOG_STATE_CALLBACK; 2202 iclog->ic_state = XLOG_STATE_CALLBACK;
2242 2203
2243 spin_unlock(&log->l_icloglock);
2244 2204
2245 /* l_last_sync_lsn field protected by 2205 /*
2246 * l_grant_lock. Don't worry about iclog's lsn. 2206 * update the last_sync_lsn before we drop the
2247 * No one else can be here except us. 2207 * icloglock to ensure we are the only one that
2208 * can update it.
2248 */ 2209 */
2249 spin_lock(&log->l_grant_lock); 2210 ASSERT(XFS_LSN_CMP(atomic64_read(&log->l_last_sync_lsn),
2250 ASSERT(XFS_LSN_CMP(log->l_last_sync_lsn, 2211 be64_to_cpu(iclog->ic_header.h_lsn)) <= 0);
2251 be64_to_cpu(iclog->ic_header.h_lsn)) <= 0); 2212 atomic64_set(&log->l_last_sync_lsn,
2252 log->l_last_sync_lsn = 2213 be64_to_cpu(iclog->ic_header.h_lsn));
2253 be64_to_cpu(iclog->ic_header.h_lsn);
2254 spin_unlock(&log->l_grant_lock);
2255 2214
2256 } else { 2215 } else
2257 spin_unlock(&log->l_icloglock);
2258 ioerrors++; 2216 ioerrors++;
2259 } 2217
2218 spin_unlock(&log->l_icloglock);
2260 2219
2261 /* 2220 /*
2262 * Keep processing entries in the callback list until 2221 * Keep processing entries in the callback list until
@@ -2297,7 +2256,7 @@ xlog_state_do_callback(
2297 xlog_state_clean_log(log); 2256 xlog_state_clean_log(log);
2298 2257
2299 /* wake up threads waiting in xfs_log_force() */ 2258 /* wake up threads waiting in xfs_log_force() */
2300 sv_broadcast(&iclog->ic_force_wait); 2259 wake_up_all(&iclog->ic_force_wait);
2301 2260
2302 iclog = iclog->ic_next; 2261 iclog = iclog->ic_next;
2303 } while (first_iclog != iclog); 2262 } while (first_iclog != iclog);
@@ -2344,7 +2303,7 @@ xlog_state_do_callback(
2344 spin_unlock(&log->l_icloglock); 2303 spin_unlock(&log->l_icloglock);
2345 2304
2346 if (wake) 2305 if (wake)
2347 sv_broadcast(&log->l_flush_wait); 2306 wake_up_all(&log->l_flush_wait);
2348} 2307}
2349 2308
2350 2309
@@ -2395,7 +2354,7 @@ xlog_state_done_syncing(
2395 * iclog buffer, we wake them all, one will get to do the 2354 * iclog buffer, we wake them all, one will get to do the
2396 * I/O, the others get to wait for the result. 2355 * I/O, the others get to wait for the result.
2397 */ 2356 */
2398 sv_broadcast(&iclog->ic_write_wait); 2357 wake_up_all(&iclog->ic_write_wait);
2399 spin_unlock(&log->l_icloglock); 2358 spin_unlock(&log->l_icloglock);
2400 xlog_state_do_callback(log, aborted, iclog); /* also cleans log */ 2359 xlog_state_do_callback(log, aborted, iclog); /* also cleans log */
2401} /* xlog_state_done_syncing */ 2360} /* xlog_state_done_syncing */
@@ -2444,7 +2403,7 @@ restart:
2444 XFS_STATS_INC(xs_log_noiclogs); 2403 XFS_STATS_INC(xs_log_noiclogs);
2445 2404
2446 /* Wait for log writes to have flushed */ 2405 /* Wait for log writes to have flushed */
2447 sv_wait(&log->l_flush_wait, 0, &log->l_icloglock, 0); 2406 xlog_wait(&log->l_flush_wait, &log->l_icloglock);
2448 goto restart; 2407 goto restart;
2449 } 2408 }
2450 2409
@@ -2527,6 +2486,18 @@ restart:
2527 * 2486 *
2528 * Once a ticket gets put onto the reserveq, it will only return after 2487 * Once a ticket gets put onto the reserveq, it will only return after
2529 * the needed reservation is satisfied. 2488 * the needed reservation is satisfied.
2489 *
2490 * This function is structured so that it has a lock free fast path. This is
2491 * necessary because every new transaction reservation will come through this
2492 * path. Hence any lock will be globally hot if we take it unconditionally on
2493 * every pass.
2494 *
2495 * As tickets are only ever moved on and off the reserveq under the
2496 * l_grant_reserve_lock, we only need to take that lock if we are going
2497 * to add the ticket to the queue and sleep. We can avoid taking the lock if the
2498 * ticket was never added to the reserveq because the t_queue list head will be
2499 * empty and we hold the only reference to it so it can safely be checked
2500 * unlocked.
2530 */ 2501 */
2531STATIC int 2502STATIC int
2532xlog_grant_log_space(xlog_t *log, 2503xlog_grant_log_space(xlog_t *log,
@@ -2534,24 +2505,27 @@ xlog_grant_log_space(xlog_t *log,
2534{ 2505{
2535 int free_bytes; 2506 int free_bytes;
2536 int need_bytes; 2507 int need_bytes;
2537#ifdef DEBUG
2538 xfs_lsn_t tail_lsn;
2539#endif
2540
2541 2508
2542#ifdef DEBUG 2509#ifdef DEBUG
2543 if (log->l_flags & XLOG_ACTIVE_RECOVERY) 2510 if (log->l_flags & XLOG_ACTIVE_RECOVERY)
2544 panic("grant Recovery problem"); 2511 panic("grant Recovery problem");
2545#endif 2512#endif
2546 2513
2547 /* Is there space or do we need to sleep? */
2548 spin_lock(&log->l_grant_lock);
2549
2550 trace_xfs_log_grant_enter(log, tic); 2514 trace_xfs_log_grant_enter(log, tic);
2551 2515
2516 need_bytes = tic->t_unit_res;
2517 if (tic->t_flags & XFS_LOG_PERM_RESERV)
2518 need_bytes *= tic->t_ocnt;
2519
2552 /* something is already sleeping; insert new transaction at end */ 2520 /* something is already sleeping; insert new transaction at end */
2553 if (log->l_reserve_headq) { 2521 if (!list_empty_careful(&log->l_reserveq)) {
2554 xlog_ins_ticketq(&log->l_reserve_headq, tic); 2522 spin_lock(&log->l_grant_reserve_lock);
2523 /* recheck the queue now we are locked */
2524 if (list_empty(&log->l_reserveq)) {
2525 spin_unlock(&log->l_grant_reserve_lock);
2526 goto redo;
2527 }
2528 list_add_tail(&tic->t_queue, &log->l_reserveq);
2555 2529
2556 trace_xfs_log_grant_sleep1(log, tic); 2530 trace_xfs_log_grant_sleep1(log, tic);
2557 2531
@@ -2563,72 +2537,57 @@ xlog_grant_log_space(xlog_t *log,
2563 goto error_return; 2537 goto error_return;
2564 2538
2565 XFS_STATS_INC(xs_sleep_logspace); 2539 XFS_STATS_INC(xs_sleep_logspace);
2566 sv_wait(&tic->t_wait, PINOD|PLTWAIT, &log->l_grant_lock, s); 2540 xlog_wait(&tic->t_wait, &log->l_grant_reserve_lock);
2541
2567 /* 2542 /*
2568 * If we got an error, and the filesystem is shutting down, 2543 * If we got an error, and the filesystem is shutting down,
2569 * we'll catch it down below. So just continue... 2544 * we'll catch it down below. So just continue...
2570 */ 2545 */
2571 trace_xfs_log_grant_wake1(log, tic); 2546 trace_xfs_log_grant_wake1(log, tic);
2572 spin_lock(&log->l_grant_lock);
2573 } 2547 }
2574 if (tic->t_flags & XFS_LOG_PERM_RESERV)
2575 need_bytes = tic->t_unit_res*tic->t_ocnt;
2576 else
2577 need_bytes = tic->t_unit_res;
2578 2548
2579redo: 2549redo:
2580 if (XLOG_FORCED_SHUTDOWN(log)) 2550 if (XLOG_FORCED_SHUTDOWN(log))
2581 goto error_return; 2551 goto error_return_unlocked;
2582 2552
2583 free_bytes = xlog_space_left(log, log->l_grant_reserve_cycle, 2553 free_bytes = xlog_space_left(log, &log->l_grant_reserve_head);
2584 log->l_grant_reserve_bytes);
2585 if (free_bytes < need_bytes) { 2554 if (free_bytes < need_bytes) {
2586 if ((tic->t_flags & XLOG_TIC_IN_Q) == 0) 2555 spin_lock(&log->l_grant_reserve_lock);
2587 xlog_ins_ticketq(&log->l_reserve_headq, tic); 2556 if (list_empty(&tic->t_queue))
2557 list_add_tail(&tic->t_queue, &log->l_reserveq);
2588 2558
2589 trace_xfs_log_grant_sleep2(log, tic); 2559 trace_xfs_log_grant_sleep2(log, tic);
2590 2560
2591 spin_unlock(&log->l_grant_lock);
2592 xlog_grant_push_ail(log->l_mp, need_bytes);
2593 spin_lock(&log->l_grant_lock);
2594
2595 XFS_STATS_INC(xs_sleep_logspace);
2596 sv_wait(&tic->t_wait, PINOD|PLTWAIT, &log->l_grant_lock, s);
2597
2598 spin_lock(&log->l_grant_lock);
2599 if (XLOG_FORCED_SHUTDOWN(log)) 2561 if (XLOG_FORCED_SHUTDOWN(log))
2600 goto error_return; 2562 goto error_return;
2601 2563
2602 trace_xfs_log_grant_wake2(log, tic); 2564 xlog_grant_push_ail(log, need_bytes);
2565
2566 XFS_STATS_INC(xs_sleep_logspace);
2567 xlog_wait(&tic->t_wait, &log->l_grant_reserve_lock);
2603 2568
2569 trace_xfs_log_grant_wake2(log, tic);
2604 goto redo; 2570 goto redo;
2605 } else if (tic->t_flags & XLOG_TIC_IN_Q) 2571 }
2606 xlog_del_ticketq(&log->l_reserve_headq, tic);
2607 2572
2608 /* we've got enough space */ 2573 if (!list_empty(&tic->t_queue)) {
2609 xlog_grant_add_space(log, need_bytes); 2574 spin_lock(&log->l_grant_reserve_lock);
2610#ifdef DEBUG 2575 list_del_init(&tic->t_queue);
2611 tail_lsn = log->l_tail_lsn; 2576 spin_unlock(&log->l_grant_reserve_lock);
2612 /*
2613 * Check to make sure the grant write head didn't just over lap the
2614 * tail. If the cycles are the same, we can't be overlapping.
2615 * Otherwise, make sure that the cycles differ by exactly one and
2616 * check the byte count.
2617 */
2618 if (CYCLE_LSN(tail_lsn) != log->l_grant_write_cycle) {
2619 ASSERT(log->l_grant_write_cycle-1 == CYCLE_LSN(tail_lsn));
2620 ASSERT(log->l_grant_write_bytes <= BBTOB(BLOCK_LSN(tail_lsn)));
2621 } 2577 }
2622#endif 2578
2579 /* we've got enough space */
2580 xlog_grant_add_space(log, &log->l_grant_reserve_head, need_bytes);
2581 xlog_grant_add_space(log, &log->l_grant_write_head, need_bytes);
2623 trace_xfs_log_grant_exit(log, tic); 2582 trace_xfs_log_grant_exit(log, tic);
2624 xlog_verify_grant_head(log, 1); 2583 xlog_verify_grant_tail(log);
2625 spin_unlock(&log->l_grant_lock);
2626 return 0; 2584 return 0;
2627 2585
2628 error_return: 2586error_return_unlocked:
2629 if (tic->t_flags & XLOG_TIC_IN_Q) 2587 spin_lock(&log->l_grant_reserve_lock);
2630 xlog_del_ticketq(&log->l_reserve_headq, tic); 2588error_return:
2631 2589 list_del_init(&tic->t_queue);
2590 spin_unlock(&log->l_grant_reserve_lock);
2632 trace_xfs_log_grant_error(log, tic); 2591 trace_xfs_log_grant_error(log, tic);
2633 2592
2634 /* 2593 /*
@@ -2638,7 +2597,6 @@ redo:
2638 */ 2597 */
2639 tic->t_curr_res = 0; 2598 tic->t_curr_res = 0;
2640 tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */ 2599 tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */
2641 spin_unlock(&log->l_grant_lock);
2642 return XFS_ERROR(EIO); 2600 return XFS_ERROR(EIO);
2643} /* xlog_grant_log_space */ 2601} /* xlog_grant_log_space */
2644 2602
@@ -2646,17 +2604,14 @@ redo:
2646/* 2604/*
2647 * Replenish the byte reservation required by moving the grant write head. 2605 * Replenish the byte reservation required by moving the grant write head.
2648 * 2606 *
2649 * 2607 * Similar to xlog_grant_log_space, the function is structured to have a lock
2608 * free fast path.
2650 */ 2609 */
2651STATIC int 2610STATIC int
2652xlog_regrant_write_log_space(xlog_t *log, 2611xlog_regrant_write_log_space(xlog_t *log,
2653 xlog_ticket_t *tic) 2612 xlog_ticket_t *tic)
2654{ 2613{
2655 int free_bytes, need_bytes; 2614 int free_bytes, need_bytes;
2656 xlog_ticket_t *ntic;
2657#ifdef DEBUG
2658 xfs_lsn_t tail_lsn;
2659#endif
2660 2615
2661 tic->t_curr_res = tic->t_unit_res; 2616 tic->t_curr_res = tic->t_unit_res;
2662 xlog_tic_reset_res(tic); 2617 xlog_tic_reset_res(tic);
@@ -2669,12 +2624,9 @@ xlog_regrant_write_log_space(xlog_t *log,
2669 panic("regrant Recovery problem"); 2624 panic("regrant Recovery problem");
2670#endif 2625#endif
2671 2626
2672 spin_lock(&log->l_grant_lock);
2673
2674 trace_xfs_log_regrant_write_enter(log, tic); 2627 trace_xfs_log_regrant_write_enter(log, tic);
2675
2676 if (XLOG_FORCED_SHUTDOWN(log)) 2628 if (XLOG_FORCED_SHUTDOWN(log))
2677 goto error_return; 2629 goto error_return_unlocked;
2678 2630
2679 /* If there are other waiters on the queue then give them a 2631 /* If there are other waiters on the queue then give them a
2680 * chance at logspace before us. Wake up the first waiters, 2632 * chance at logspace before us. Wake up the first waiters,
@@ -2683,92 +2635,76 @@ xlog_regrant_write_log_space(xlog_t *log,
2683 * this transaction. 2635 * this transaction.
2684 */ 2636 */
2685 need_bytes = tic->t_unit_res; 2637 need_bytes = tic->t_unit_res;
2686 if ((ntic = log->l_write_headq)) { 2638 if (!list_empty_careful(&log->l_writeq)) {
2687 free_bytes = xlog_space_left(log, log->l_grant_write_cycle, 2639 struct xlog_ticket *ntic;
2688 log->l_grant_write_bytes); 2640
2689 do { 2641 spin_lock(&log->l_grant_write_lock);
2642 free_bytes = xlog_space_left(log, &log->l_grant_write_head);
2643 list_for_each_entry(ntic, &log->l_writeq, t_queue) {
2690 ASSERT(ntic->t_flags & XLOG_TIC_PERM_RESERV); 2644 ASSERT(ntic->t_flags & XLOG_TIC_PERM_RESERV);
2691 2645
2692 if (free_bytes < ntic->t_unit_res) 2646 if (free_bytes < ntic->t_unit_res)
2693 break; 2647 break;
2694 free_bytes -= ntic->t_unit_res; 2648 free_bytes -= ntic->t_unit_res;
2695 sv_signal(&ntic->t_wait); 2649 wake_up(&ntic->t_wait);
2696 ntic = ntic->t_next; 2650 }
2697 } while (ntic != log->l_write_headq);
2698
2699 if (ntic != log->l_write_headq) {
2700 if ((tic->t_flags & XLOG_TIC_IN_Q) == 0)
2701 xlog_ins_ticketq(&log->l_write_headq, tic);
2702 2651
2652 if (ntic != list_first_entry(&log->l_writeq,
2653 struct xlog_ticket, t_queue)) {
2654 if (list_empty(&tic->t_queue))
2655 list_add_tail(&tic->t_queue, &log->l_writeq);
2703 trace_xfs_log_regrant_write_sleep1(log, tic); 2656 trace_xfs_log_regrant_write_sleep1(log, tic);
2704 2657
2705 spin_unlock(&log->l_grant_lock); 2658 xlog_grant_push_ail(log, need_bytes);
2706 xlog_grant_push_ail(log->l_mp, need_bytes);
2707 spin_lock(&log->l_grant_lock);
2708 2659
2709 XFS_STATS_INC(xs_sleep_logspace); 2660 XFS_STATS_INC(xs_sleep_logspace);
2710 sv_wait(&tic->t_wait, PINOD|PLTWAIT, 2661 xlog_wait(&tic->t_wait, &log->l_grant_write_lock);
2711 &log->l_grant_lock, s);
2712
2713 /* If we're shutting down, this tic is already
2714 * off the queue */
2715 spin_lock(&log->l_grant_lock);
2716 if (XLOG_FORCED_SHUTDOWN(log))
2717 goto error_return;
2718
2719 trace_xfs_log_regrant_write_wake1(log, tic); 2662 trace_xfs_log_regrant_write_wake1(log, tic);
2720 } 2663 } else
2664 spin_unlock(&log->l_grant_write_lock);
2721 } 2665 }
2722 2666
2723redo: 2667redo:
2724 if (XLOG_FORCED_SHUTDOWN(log)) 2668 if (XLOG_FORCED_SHUTDOWN(log))
2725 goto error_return; 2669 goto error_return_unlocked;
2726 2670
2727 free_bytes = xlog_space_left(log, log->l_grant_write_cycle, 2671 free_bytes = xlog_space_left(log, &log->l_grant_write_head);
2728 log->l_grant_write_bytes);
2729 if (free_bytes < need_bytes) { 2672 if (free_bytes < need_bytes) {
2730 if ((tic->t_flags & XLOG_TIC_IN_Q) == 0) 2673 spin_lock(&log->l_grant_write_lock);
2731 xlog_ins_ticketq(&log->l_write_headq, tic); 2674 if (list_empty(&tic->t_queue))
2732 spin_unlock(&log->l_grant_lock); 2675 list_add_tail(&tic->t_queue, &log->l_writeq);
2733 xlog_grant_push_ail(log->l_mp, need_bytes);
2734 spin_lock(&log->l_grant_lock);
2735
2736 XFS_STATS_INC(xs_sleep_logspace);
2737 trace_xfs_log_regrant_write_sleep2(log, tic);
2738
2739 sv_wait(&tic->t_wait, PINOD|PLTWAIT, &log->l_grant_lock, s);
2740 2676
2741 /* If we're shutting down, this tic is already off the queue */
2742 spin_lock(&log->l_grant_lock);
2743 if (XLOG_FORCED_SHUTDOWN(log)) 2677 if (XLOG_FORCED_SHUTDOWN(log))
2744 goto error_return; 2678 goto error_return;
2745 2679
2680 xlog_grant_push_ail(log, need_bytes);
2681
2682 XFS_STATS_INC(xs_sleep_logspace);
2683 trace_xfs_log_regrant_write_sleep2(log, tic);
2684 xlog_wait(&tic->t_wait, &log->l_grant_write_lock);
2685
2746 trace_xfs_log_regrant_write_wake2(log, tic); 2686 trace_xfs_log_regrant_write_wake2(log, tic);
2747 goto redo; 2687 goto redo;
2748 } else if (tic->t_flags & XLOG_TIC_IN_Q) 2688 }
2749 xlog_del_ticketq(&log->l_write_headq, tic);
2750 2689
2751 /* we've got enough space */ 2690 if (!list_empty(&tic->t_queue)) {
2752 xlog_grant_add_space_write(log, need_bytes); 2691 spin_lock(&log->l_grant_write_lock);
2753#ifdef DEBUG 2692 list_del_init(&tic->t_queue);
2754 tail_lsn = log->l_tail_lsn; 2693 spin_unlock(&log->l_grant_write_lock);
2755 if (CYCLE_LSN(tail_lsn) != log->l_grant_write_cycle) {
2756 ASSERT(log->l_grant_write_cycle-1 == CYCLE_LSN(tail_lsn));
2757 ASSERT(log->l_grant_write_bytes <= BBTOB(BLOCK_LSN(tail_lsn)));
2758 } 2694 }
2759#endif
2760 2695
2696 /* we've got enough space */
2697 xlog_grant_add_space(log, &log->l_grant_write_head, need_bytes);
2761 trace_xfs_log_regrant_write_exit(log, tic); 2698 trace_xfs_log_regrant_write_exit(log, tic);
2762 2699 xlog_verify_grant_tail(log);
2763 xlog_verify_grant_head(log, 1);
2764 spin_unlock(&log->l_grant_lock);
2765 return 0; 2700 return 0;
2766 2701
2767 2702
2703 error_return_unlocked:
2704 spin_lock(&log->l_grant_write_lock);
2768 error_return: 2705 error_return:
2769 if (tic->t_flags & XLOG_TIC_IN_Q) 2706 list_del_init(&tic->t_queue);
2770 xlog_del_ticketq(&log->l_reserve_headq, tic); 2707 spin_unlock(&log->l_grant_write_lock);
2771
2772 trace_xfs_log_regrant_write_error(log, tic); 2708 trace_xfs_log_regrant_write_error(log, tic);
2773 2709
2774 /* 2710 /*
@@ -2778,7 +2714,6 @@ redo:
2778 */ 2714 */
2779 tic->t_curr_res = 0; 2715 tic->t_curr_res = 0;
2780 tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */ 2716 tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */
2781 spin_unlock(&log->l_grant_lock);
2782 return XFS_ERROR(EIO); 2717 return XFS_ERROR(EIO);
2783} /* xlog_regrant_write_log_space */ 2718} /* xlog_regrant_write_log_space */
2784 2719
@@ -2799,27 +2734,24 @@ xlog_regrant_reserve_log_space(xlog_t *log,
2799 if (ticket->t_cnt > 0) 2734 if (ticket->t_cnt > 0)
2800 ticket->t_cnt--; 2735 ticket->t_cnt--;
2801 2736
2802 spin_lock(&log->l_grant_lock); 2737 xlog_grant_sub_space(log, &log->l_grant_reserve_head,
2803 xlog_grant_sub_space(log, ticket->t_curr_res); 2738 ticket->t_curr_res);
2739 xlog_grant_sub_space(log, &log->l_grant_write_head,
2740 ticket->t_curr_res);
2804 ticket->t_curr_res = ticket->t_unit_res; 2741 ticket->t_curr_res = ticket->t_unit_res;
2805 xlog_tic_reset_res(ticket); 2742 xlog_tic_reset_res(ticket);
2806 2743
2807 trace_xfs_log_regrant_reserve_sub(log, ticket); 2744 trace_xfs_log_regrant_reserve_sub(log, ticket);
2808 2745
2809 xlog_verify_grant_head(log, 1);
2810
2811 /* just return if we still have some of the pre-reserved space */ 2746 /* just return if we still have some of the pre-reserved space */
2812 if (ticket->t_cnt > 0) { 2747 if (ticket->t_cnt > 0)
2813 spin_unlock(&log->l_grant_lock);
2814 return; 2748 return;
2815 }
2816 2749
2817 xlog_grant_add_space_reserve(log, ticket->t_unit_res); 2750 xlog_grant_add_space(log, &log->l_grant_reserve_head,
2751 ticket->t_unit_res);
2818 2752
2819 trace_xfs_log_regrant_reserve_exit(log, ticket); 2753 trace_xfs_log_regrant_reserve_exit(log, ticket);
2820 2754
2821 xlog_verify_grant_head(log, 0);
2822 spin_unlock(&log->l_grant_lock);
2823 ticket->t_curr_res = ticket->t_unit_res; 2755 ticket->t_curr_res = ticket->t_unit_res;
2824 xlog_tic_reset_res(ticket); 2756 xlog_tic_reset_res(ticket);
2825} /* xlog_regrant_reserve_log_space */ 2757} /* xlog_regrant_reserve_log_space */
@@ -2843,28 +2775,29 @@ STATIC void
2843xlog_ungrant_log_space(xlog_t *log, 2775xlog_ungrant_log_space(xlog_t *log,
2844 xlog_ticket_t *ticket) 2776 xlog_ticket_t *ticket)
2845{ 2777{
2778 int bytes;
2779
2846 if (ticket->t_cnt > 0) 2780 if (ticket->t_cnt > 0)
2847 ticket->t_cnt--; 2781 ticket->t_cnt--;
2848 2782
2849 spin_lock(&log->l_grant_lock);
2850 trace_xfs_log_ungrant_enter(log, ticket); 2783 trace_xfs_log_ungrant_enter(log, ticket);
2851
2852 xlog_grant_sub_space(log, ticket->t_curr_res);
2853
2854 trace_xfs_log_ungrant_sub(log, ticket); 2784 trace_xfs_log_ungrant_sub(log, ticket);
2855 2785
2856 /* If this is a permanent reservation ticket, we may be able to free 2786 /*
2787 * If this is a permanent reservation ticket, we may be able to free
2857 * up more space based on the remaining count. 2788 * up more space based on the remaining count.
2858 */ 2789 */
2790 bytes = ticket->t_curr_res;
2859 if (ticket->t_cnt > 0) { 2791 if (ticket->t_cnt > 0) {
2860 ASSERT(ticket->t_flags & XLOG_TIC_PERM_RESERV); 2792 ASSERT(ticket->t_flags & XLOG_TIC_PERM_RESERV);
2861 xlog_grant_sub_space(log, ticket->t_unit_res*ticket->t_cnt); 2793 bytes += ticket->t_unit_res*ticket->t_cnt;
2862 } 2794 }
2863 2795
2796 xlog_grant_sub_space(log, &log->l_grant_reserve_head, bytes);
2797 xlog_grant_sub_space(log, &log->l_grant_write_head, bytes);
2798
2864 trace_xfs_log_ungrant_exit(log, ticket); 2799 trace_xfs_log_ungrant_exit(log, ticket);
2865 2800
2866 xlog_verify_grant_head(log, 1);
2867 spin_unlock(&log->l_grant_lock);
2868 xfs_log_move_tail(log->l_mp, 1); 2801 xfs_log_move_tail(log->l_mp, 1);
2869} /* xlog_ungrant_log_space */ 2802} /* xlog_ungrant_log_space */
2870 2803
@@ -2901,11 +2834,11 @@ xlog_state_release_iclog(
2901 2834
2902 if (iclog->ic_state == XLOG_STATE_WANT_SYNC) { 2835 if (iclog->ic_state == XLOG_STATE_WANT_SYNC) {
2903 /* update tail before writing to iclog */ 2836 /* update tail before writing to iclog */
2904 xlog_assign_tail_lsn(log->l_mp); 2837 xfs_lsn_t tail_lsn = xlog_assign_tail_lsn(log->l_mp);
2905 sync++; 2838 sync++;
2906 iclog->ic_state = XLOG_STATE_SYNCING; 2839 iclog->ic_state = XLOG_STATE_SYNCING;
2907 iclog->ic_header.h_tail_lsn = cpu_to_be64(log->l_tail_lsn); 2840 iclog->ic_header.h_tail_lsn = cpu_to_be64(tail_lsn);
2908 xlog_verify_tail_lsn(log, iclog, log->l_tail_lsn); 2841 xlog_verify_tail_lsn(log, iclog, tail_lsn);
2909 /* cycle incremented when incrementing curr_block */ 2842 /* cycle incremented when incrementing curr_block */
2910 } 2843 }
2911 spin_unlock(&log->l_icloglock); 2844 spin_unlock(&log->l_icloglock);
@@ -3088,7 +3021,7 @@ maybe_sleep:
3088 return XFS_ERROR(EIO); 3021 return XFS_ERROR(EIO);
3089 } 3022 }
3090 XFS_STATS_INC(xs_log_force_sleep); 3023 XFS_STATS_INC(xs_log_force_sleep);
3091 sv_wait(&iclog->ic_force_wait, PINOD, &log->l_icloglock, s); 3024 xlog_wait(&iclog->ic_force_wait, &log->l_icloglock);
3092 /* 3025 /*
3093 * No need to grab the log lock here since we're 3026 * No need to grab the log lock here since we're
3094 * only deciding whether or not to return EIO 3027 * only deciding whether or not to return EIO
@@ -3206,8 +3139,8 @@ try_again:
3206 3139
3207 XFS_STATS_INC(xs_log_force_sleep); 3140 XFS_STATS_INC(xs_log_force_sleep);
3208 3141
3209 sv_wait(&iclog->ic_prev->ic_write_wait, 3142 xlog_wait(&iclog->ic_prev->ic_write_wait,
3210 PSWP, &log->l_icloglock, s); 3143 &log->l_icloglock);
3211 if (log_flushed) 3144 if (log_flushed)
3212 *log_flushed = 1; 3145 *log_flushed = 1;
3213 already_slept = 1; 3146 already_slept = 1;
@@ -3235,7 +3168,7 @@ try_again:
3235 return XFS_ERROR(EIO); 3168 return XFS_ERROR(EIO);
3236 } 3169 }
3237 XFS_STATS_INC(xs_log_force_sleep); 3170 XFS_STATS_INC(xs_log_force_sleep);
3238 sv_wait(&iclog->ic_force_wait, PSWP, &log->l_icloglock, s); 3171 xlog_wait(&iclog->ic_force_wait, &log->l_icloglock);
3239 /* 3172 /*
3240 * No need to grab the log lock here since we're 3173 * No need to grab the log lock here since we're
3241 * only deciding whether or not to return EIO 3174 * only deciding whether or not to return EIO
@@ -3310,10 +3243,8 @@ xfs_log_ticket_put(
3310 xlog_ticket_t *ticket) 3243 xlog_ticket_t *ticket)
3311{ 3244{
3312 ASSERT(atomic_read(&ticket->t_ref) > 0); 3245 ASSERT(atomic_read(&ticket->t_ref) > 0);
3313 if (atomic_dec_and_test(&ticket->t_ref)) { 3246 if (atomic_dec_and_test(&ticket->t_ref))
3314 sv_destroy(&ticket->t_wait);
3315 kmem_zone_free(xfs_log_ticket_zone, ticket); 3247 kmem_zone_free(xfs_log_ticket_zone, ticket);
3316 }
3317} 3248}
3318 3249
3319xlog_ticket_t * 3250xlog_ticket_t *
@@ -3435,6 +3366,7 @@ xlog_ticket_alloc(
3435 } 3366 }
3436 3367
3437 atomic_set(&tic->t_ref, 1); 3368 atomic_set(&tic->t_ref, 1);
3369 INIT_LIST_HEAD(&tic->t_queue);
3438 tic->t_unit_res = unit_bytes; 3370 tic->t_unit_res = unit_bytes;
3439 tic->t_curr_res = unit_bytes; 3371 tic->t_curr_res = unit_bytes;
3440 tic->t_cnt = cnt; 3372 tic->t_cnt = cnt;
@@ -3445,7 +3377,7 @@ xlog_ticket_alloc(
3445 tic->t_trans_type = 0; 3377 tic->t_trans_type = 0;
3446 if (xflags & XFS_LOG_PERM_RESERV) 3378 if (xflags & XFS_LOG_PERM_RESERV)
3447 tic->t_flags |= XLOG_TIC_PERM_RESERV; 3379 tic->t_flags |= XLOG_TIC_PERM_RESERV;
3448 sv_init(&tic->t_wait, SV_DEFAULT, "logtick"); 3380 init_waitqueue_head(&tic->t_wait);
3449 3381
3450 xlog_tic_reset_res(tic); 3382 xlog_tic_reset_res(tic);
3451 3383
@@ -3484,18 +3416,25 @@ xlog_verify_dest_ptr(
3484} 3416}
3485 3417
3486STATIC void 3418STATIC void
3487xlog_verify_grant_head(xlog_t *log, int equals) 3419xlog_verify_grant_tail(
3420 struct log *log)
3488{ 3421{
3489 if (log->l_grant_reserve_cycle == log->l_grant_write_cycle) { 3422 int tail_cycle, tail_blocks;
3490 if (equals) 3423 int cycle, space;
3491 ASSERT(log->l_grant_reserve_bytes >= log->l_grant_write_bytes); 3424
3492 else 3425 /*
3493 ASSERT(log->l_grant_reserve_bytes > log->l_grant_write_bytes); 3426 * Check to make sure the grant write head didn't just overlap the
3494 } else { 3427 * tail. If the cycles are the same, we can't be overlapping.
3495 ASSERT(log->l_grant_reserve_cycle-1 == log->l_grant_write_cycle); 3428 * Otherwise, make sure that the cycles differ by exactly one and
3496 ASSERT(log->l_grant_write_bytes >= log->l_grant_reserve_bytes); 3429 * check the byte count.
3497 } 3430 */
3498} /* xlog_verify_grant_head */ 3431 xlog_crack_grant_head(&log->l_grant_write_head, &cycle, &space);
3432 xlog_crack_atomic_lsn(&log->l_tail_lsn, &tail_cycle, &tail_blocks);
3433 if (tail_cycle != cycle) {
3434 ASSERT(cycle - 1 == tail_cycle);
3435 ASSERT(space <= BBTOB(tail_blocks));
3436 }
3437}
3499 3438
3500/* check if it will fit */ 3439/* check if it will fit */
3501STATIC void 3440STATIC void
@@ -3716,12 +3655,10 @@ xfs_log_force_umount(
3716 xlog_cil_force(log); 3655 xlog_cil_force(log);
3717 3656
3718 /* 3657 /*
3719 * We must hold both the GRANT lock and the LOG lock, 3658 * mark the filesystem and the log as in a shutdown state and wake
3720 * before we mark the filesystem SHUTDOWN and wake 3659 * everybody up to tell them the bad news.
3721 * everybody up to tell the bad news.
3722 */ 3660 */
3723 spin_lock(&log->l_icloglock); 3661 spin_lock(&log->l_icloglock);
3724 spin_lock(&log->l_grant_lock);
3725 mp->m_flags |= XFS_MOUNT_FS_SHUTDOWN; 3662 mp->m_flags |= XFS_MOUNT_FS_SHUTDOWN;
3726 if (mp->m_sb_bp) 3663 if (mp->m_sb_bp)
3727 XFS_BUF_DONE(mp->m_sb_bp); 3664 XFS_BUF_DONE(mp->m_sb_bp);
@@ -3742,27 +3679,21 @@ xfs_log_force_umount(
3742 spin_unlock(&log->l_icloglock); 3679 spin_unlock(&log->l_icloglock);
3743 3680
3744 /* 3681 /*
3745 * We don't want anybody waiting for log reservations 3682 * We don't want anybody waiting for log reservations after this. That
3746 * after this. That means we have to wake up everybody 3683 * means we have to wake up everybody queued up on reserveq as well as
3747 * queued up on reserve_headq as well as write_headq. 3684 * writeq. In addition, we make sure in xlog_{re}grant_log_space that
3748 * In addition, we make sure in xlog_{re}grant_log_space 3685 * we don't enqueue anything once the SHUTDOWN flag is set, and this
3749 * that we don't enqueue anything once the SHUTDOWN flag 3686 * action is protected by the grant locks.
3750 * is set, and this action is protected by the GRANTLOCK.
3751 */ 3687 */
3752 if ((tic = log->l_reserve_headq)) { 3688 spin_lock(&log->l_grant_reserve_lock);
3753 do { 3689 list_for_each_entry(tic, &log->l_reserveq, t_queue)
3754 sv_signal(&tic->t_wait); 3690 wake_up(&tic->t_wait);
3755 tic = tic->t_next; 3691 spin_unlock(&log->l_grant_reserve_lock);
3756 } while (tic != log->l_reserve_headq); 3692
3757 } 3693 spin_lock(&log->l_grant_write_lock);
3758 3694 list_for_each_entry(tic, &log->l_writeq, t_queue)
3759 if ((tic = log->l_write_headq)) { 3695 wake_up(&tic->t_wait);
3760 do { 3696 spin_unlock(&log->l_grant_write_lock);
3761 sv_signal(&tic->t_wait);
3762 tic = tic->t_next;
3763 } while (tic != log->l_write_headq);
3764 }
3765 spin_unlock(&log->l_grant_lock);
3766 3697
3767 if (!(log->l_iclog->ic_state & XLOG_STATE_IOERROR)) { 3698 if (!(log->l_iclog->ic_state & XLOG_STATE_IOERROR)) {
3768 ASSERT(!logerror); 3699 ASSERT(!logerror);
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index 916eb7db14d..3bd3291ef8d 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -191,7 +191,7 @@ void xfs_log_ticket_put(struct xlog_ticket *ticket);
191 191
192xlog_tid_t xfs_log_get_trans_ident(struct xfs_trans *tp); 192xlog_tid_t xfs_log_get_trans_ident(struct xfs_trans *tp);
193 193
194int xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp, 194void xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp,
195 struct xfs_log_vec *log_vector, 195 struct xfs_log_vec *log_vector,
196 xfs_lsn_t *commit_lsn, int flags); 196 xfs_lsn_t *commit_lsn, int flags);
197bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip); 197bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip);
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index 23d6ceb5e97..9ca59be0897 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -61,7 +61,7 @@ xlog_cil_init(
61 INIT_LIST_HEAD(&cil->xc_committing); 61 INIT_LIST_HEAD(&cil->xc_committing);
62 spin_lock_init(&cil->xc_cil_lock); 62 spin_lock_init(&cil->xc_cil_lock);
63 init_rwsem(&cil->xc_ctx_lock); 63 init_rwsem(&cil->xc_ctx_lock);
64 sv_init(&cil->xc_commit_wait, SV_DEFAULT, "cilwait"); 64 init_waitqueue_head(&cil->xc_commit_wait);
65 65
66 INIT_LIST_HEAD(&ctx->committing); 66 INIT_LIST_HEAD(&ctx->committing);
67 INIT_LIST_HEAD(&ctx->busy_extents); 67 INIT_LIST_HEAD(&ctx->busy_extents);
@@ -361,15 +361,10 @@ xlog_cil_committed(
361 int abort) 361 int abort)
362{ 362{
363 struct xfs_cil_ctx *ctx = args; 363 struct xfs_cil_ctx *ctx = args;
364 struct xfs_log_vec *lv;
365 int abortflag = abort ? XFS_LI_ABORTED : 0;
366 struct xfs_busy_extent *busyp, *n; 364 struct xfs_busy_extent *busyp, *n;
367 365
368 /* unpin all the log items */ 366 xfs_trans_committed_bulk(ctx->cil->xc_log->l_ailp, ctx->lv_chain,
369 for (lv = ctx->lv_chain; lv; lv = lv->lv_next ) { 367 ctx->start_lsn, abort);
370 xfs_trans_item_committed(lv->lv_item, ctx->start_lsn,
371 abortflag);
372 }
373 368
374 list_for_each_entry_safe(busyp, n, &ctx->busy_extents, list) 369 list_for_each_entry_safe(busyp, n, &ctx->busy_extents, list)
375 xfs_alloc_busy_clear(ctx->cil->xc_log->l_mp, busyp); 370 xfs_alloc_busy_clear(ctx->cil->xc_log->l_mp, busyp);
@@ -548,7 +543,7 @@ xlog_cil_push(
548 543
549 error = xlog_write(log, &lvhdr, tic, &ctx->start_lsn, NULL, 0); 544 error = xlog_write(log, &lvhdr, tic, &ctx->start_lsn, NULL, 0);
550 if (error) 545 if (error)
551 goto out_abort; 546 goto out_abort_free_ticket;
552 547
553 /* 548 /*
554 * now that we've written the checkpoint into the log, strictly 549 * now that we've written the checkpoint into the log, strictly
@@ -568,14 +563,15 @@ restart:
568 * It is still being pushed! Wait for the push to 563 * It is still being pushed! Wait for the push to
569 * complete, then start again from the beginning. 564 * complete, then start again from the beginning.
570 */ 565 */
571 sv_wait(&cil->xc_commit_wait, 0, &cil->xc_cil_lock, 0); 566 xlog_wait(&cil->xc_commit_wait, &cil->xc_cil_lock);
572 goto restart; 567 goto restart;
573 } 568 }
574 } 569 }
575 spin_unlock(&cil->xc_cil_lock); 570 spin_unlock(&cil->xc_cil_lock);
576 571
572 /* xfs_log_done always frees the ticket on error. */
577 commit_lsn = xfs_log_done(log->l_mp, tic, &commit_iclog, 0); 573 commit_lsn = xfs_log_done(log->l_mp, tic, &commit_iclog, 0);
578 if (error || commit_lsn == -1) 574 if (commit_lsn == -1)
579 goto out_abort; 575 goto out_abort;
580 576
581 /* attach all the transactions w/ busy extents to iclog */ 577 /* attach all the transactions w/ busy extents to iclog */
@@ -592,7 +588,7 @@ restart:
592 */ 588 */
593 spin_lock(&cil->xc_cil_lock); 589 spin_lock(&cil->xc_cil_lock);
594 ctx->commit_lsn = commit_lsn; 590 ctx->commit_lsn = commit_lsn;
595 sv_broadcast(&cil->xc_commit_wait); 591 wake_up_all(&cil->xc_commit_wait);
596 spin_unlock(&cil->xc_cil_lock); 592 spin_unlock(&cil->xc_cil_lock);
597 593
598 /* release the hounds! */ 594 /* release the hounds! */
@@ -605,6 +601,8 @@ out_free_ticket:
605 kmem_free(new_ctx); 601 kmem_free(new_ctx);
606 return 0; 602 return 0;
607 603
604out_abort_free_ticket:
605 xfs_log_ticket_put(tic);
608out_abort: 606out_abort:
609 xlog_cil_committed(ctx, XFS_LI_ABORTED); 607 xlog_cil_committed(ctx, XFS_LI_ABORTED);
610 return XFS_ERROR(EIO); 608 return XFS_ERROR(EIO);
@@ -627,7 +625,7 @@ out_abort:
627 * background commit, returns without it held once background commits are 625 * background commit, returns without it held once background commits are
628 * allowed again. 626 * allowed again.
629 */ 627 */
630int 628void
631xfs_log_commit_cil( 629xfs_log_commit_cil(
632 struct xfs_mount *mp, 630 struct xfs_mount *mp,
633 struct xfs_trans *tp, 631 struct xfs_trans *tp,
@@ -642,11 +640,6 @@ xfs_log_commit_cil(
642 if (flags & XFS_TRANS_RELEASE_LOG_RES) 640 if (flags & XFS_TRANS_RELEASE_LOG_RES)
643 log_flags = XFS_LOG_REL_PERM_RESERV; 641 log_flags = XFS_LOG_REL_PERM_RESERV;
644 642
645 if (XLOG_FORCED_SHUTDOWN(log)) {
646 xlog_cil_free_logvec(log_vector);
647 return XFS_ERROR(EIO);
648 }
649
650 /* 643 /*
651 * do all the hard work of formatting items (including memory 644 * do all the hard work of formatting items (including memory
652 * allocation) outside the CIL context lock. This prevents stalling CIL 645 * allocation) outside the CIL context lock. This prevents stalling CIL
@@ -706,7 +699,6 @@ xfs_log_commit_cil(
706 */ 699 */
707 if (push) 700 if (push)
708 xlog_cil_push(log, 0); 701 xlog_cil_push(log, 0);
709 return 0;
710} 702}
711 703
712/* 704/*
@@ -757,7 +749,7 @@ restart:
757 * It is still being pushed! Wait for the push to 749 * It is still being pushed! Wait for the push to
758 * complete, then start again from the beginning. 750 * complete, then start again from the beginning.
759 */ 751 */
760 sv_wait(&cil->xc_commit_wait, 0, &cil->xc_cil_lock, 0); 752 xlog_wait(&cil->xc_commit_wait, &cil->xc_cil_lock);
761 goto restart; 753 goto restart;
762 } 754 }
763 if (ctx->sequence != sequence) 755 if (ctx->sequence != sequence)
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index edcdfe01617..d5f8be8f4bf 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -21,7 +21,6 @@
21struct xfs_buf; 21struct xfs_buf;
22struct log; 22struct log;
23struct xlog_ticket; 23struct xlog_ticket;
24struct xfs_buf_cancel;
25struct xfs_mount; 24struct xfs_mount;
26 25
27/* 26/*
@@ -54,7 +53,6 @@ struct xfs_mount;
54 BTOBB(XLOG_MAX_ICLOGS << (xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? \ 53 BTOBB(XLOG_MAX_ICLOGS << (xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? \
55 XLOG_MAX_RECORD_BSHIFT : XLOG_BIG_RECORD_BSHIFT)) 54 XLOG_MAX_RECORD_BSHIFT : XLOG_BIG_RECORD_BSHIFT))
56 55
57
58static inline xfs_lsn_t xlog_assign_lsn(uint cycle, uint block) 56static inline xfs_lsn_t xlog_assign_lsn(uint cycle, uint block)
59{ 57{
60 return ((xfs_lsn_t)cycle << 32) | block; 58 return ((xfs_lsn_t)cycle << 32) | block;
@@ -133,12 +131,10 @@ static inline uint xlog_get_client_id(__be32 i)
133 */ 131 */
134#define XLOG_TIC_INITED 0x1 /* has been initialized */ 132#define XLOG_TIC_INITED 0x1 /* has been initialized */
135#define XLOG_TIC_PERM_RESERV 0x2 /* permanent reservation */ 133#define XLOG_TIC_PERM_RESERV 0x2 /* permanent reservation */
136#define XLOG_TIC_IN_Q 0x4
137 134
138#define XLOG_TIC_FLAGS \ 135#define XLOG_TIC_FLAGS \
139 { XLOG_TIC_INITED, "XLOG_TIC_INITED" }, \ 136 { XLOG_TIC_INITED, "XLOG_TIC_INITED" }, \
140 { XLOG_TIC_PERM_RESERV, "XLOG_TIC_PERM_RESERV" }, \ 137 { XLOG_TIC_PERM_RESERV, "XLOG_TIC_PERM_RESERV" }
141 { XLOG_TIC_IN_Q, "XLOG_TIC_IN_Q" }
142 138
143#endif /* __KERNEL__ */ 139#endif /* __KERNEL__ */
144 140
@@ -244,9 +240,8 @@ typedef struct xlog_res {
244} xlog_res_t; 240} xlog_res_t;
245 241
246typedef struct xlog_ticket { 242typedef struct xlog_ticket {
247 sv_t t_wait; /* ticket wait queue : 20 */ 243 wait_queue_head_t t_wait; /* ticket wait queue */
248 struct xlog_ticket *t_next; /* :4|8 */ 244 struct list_head t_queue; /* reserve/write queue */
249 struct xlog_ticket *t_prev; /* :4|8 */
250 xlog_tid_t t_tid; /* transaction identifier : 4 */ 245 xlog_tid_t t_tid; /* transaction identifier : 4 */
251 atomic_t t_ref; /* ticket reference count : 4 */ 246 atomic_t t_ref; /* ticket reference count : 4 */
252 int t_curr_res; /* current reservation in bytes : 4 */ 247 int t_curr_res; /* current reservation in bytes : 4 */
@@ -353,8 +348,8 @@ typedef union xlog_in_core2 {
353 * and move everything else out to subsequent cachelines. 348 * and move everything else out to subsequent cachelines.
354 */ 349 */
355typedef struct xlog_in_core { 350typedef struct xlog_in_core {
356 sv_t ic_force_wait; 351 wait_queue_head_t ic_force_wait;
357 sv_t ic_write_wait; 352 wait_queue_head_t ic_write_wait;
358 struct xlog_in_core *ic_next; 353 struct xlog_in_core *ic_next;
359 struct xlog_in_core *ic_prev; 354 struct xlog_in_core *ic_prev;
360 struct xfs_buf *ic_bp; 355 struct xfs_buf *ic_bp;
@@ -421,7 +416,7 @@ struct xfs_cil {
421 struct xfs_cil_ctx *xc_ctx; 416 struct xfs_cil_ctx *xc_ctx;
422 struct rw_semaphore xc_ctx_lock; 417 struct rw_semaphore xc_ctx_lock;
423 struct list_head xc_committing; 418 struct list_head xc_committing;
424 sv_t xc_commit_wait; 419 wait_queue_head_t xc_commit_wait;
425 xfs_lsn_t xc_current_sequence; 420 xfs_lsn_t xc_current_sequence;
426}; 421};
427 422
@@ -491,7 +486,7 @@ typedef struct log {
491 struct xfs_buftarg *l_targ; /* buftarg of log */ 486 struct xfs_buftarg *l_targ; /* buftarg of log */
492 uint l_flags; 487 uint l_flags;
493 uint l_quotaoffs_flag; /* XFS_DQ_*, for QUOTAOFFs */ 488 uint l_quotaoffs_flag; /* XFS_DQ_*, for QUOTAOFFs */
494 struct xfs_buf_cancel **l_buf_cancel_table; 489 struct list_head *l_buf_cancel_table;
495 int l_iclog_hsize; /* size of iclog header */ 490 int l_iclog_hsize; /* size of iclog header */
496 int l_iclog_heads; /* # of iclog header sectors */ 491 int l_iclog_heads; /* # of iclog header sectors */
497 uint l_sectBBsize; /* sector size in BBs (2^n) */ 492 uint l_sectBBsize; /* sector size in BBs (2^n) */
@@ -503,29 +498,40 @@ typedef struct log {
503 int l_logBBsize; /* size of log in BB chunks */ 498 int l_logBBsize; /* size of log in BB chunks */
504 499
505 /* The following block of fields are changed while holding icloglock */ 500 /* The following block of fields are changed while holding icloglock */
506 sv_t l_flush_wait ____cacheline_aligned_in_smp; 501 wait_queue_head_t l_flush_wait ____cacheline_aligned_in_smp;
507 /* waiting for iclog flush */ 502 /* waiting for iclog flush */
508 int l_covered_state;/* state of "covering disk 503 int l_covered_state;/* state of "covering disk
509 * log entries" */ 504 * log entries" */
510 xlog_in_core_t *l_iclog; /* head log queue */ 505 xlog_in_core_t *l_iclog; /* head log queue */
511 spinlock_t l_icloglock; /* grab to change iclog state */ 506 spinlock_t l_icloglock; /* grab to change iclog state */
512 xfs_lsn_t l_tail_lsn; /* lsn of 1st LR with unflushed
513 * buffers */
514 xfs_lsn_t l_last_sync_lsn;/* lsn of last LR on disk */
515 int l_curr_cycle; /* Cycle number of log writes */ 507 int l_curr_cycle; /* Cycle number of log writes */
516 int l_prev_cycle; /* Cycle number before last 508 int l_prev_cycle; /* Cycle number before last
517 * block increment */ 509 * block increment */
518 int l_curr_block; /* current logical log block */ 510 int l_curr_block; /* current logical log block */
519 int l_prev_block; /* previous logical log block */ 511 int l_prev_block; /* previous logical log block */
520 512
521 /* The following block of fields are changed while holding grant_lock */ 513 /*
522 spinlock_t l_grant_lock ____cacheline_aligned_in_smp; 514 * l_last_sync_lsn and l_tail_lsn are atomics so they can be set and
523 xlog_ticket_t *l_reserve_headq; 515 * read without needing to hold specific locks. To avoid operations
524 xlog_ticket_t *l_write_headq; 516 * contending with other hot objects, place each of them on a separate
525 int l_grant_reserve_cycle; 517 * cacheline.
526 int l_grant_reserve_bytes; 518 */
527 int l_grant_write_cycle; 519 /* lsn of last LR on disk */
528 int l_grant_write_bytes; 520 atomic64_t l_last_sync_lsn ____cacheline_aligned_in_smp;
521 /* lsn of 1st LR with unflushed buffers */
522 atomic64_t l_tail_lsn ____cacheline_aligned_in_smp;
523
524 /*
525 * ticket grant locks, queues and accounting have their own cachelines
526 * as these are quite hot and can be operated on concurrently.
527 */
528 spinlock_t l_grant_reserve_lock ____cacheline_aligned_in_smp;
529 struct list_head l_reserveq;
530 atomic64_t l_grant_reserve_head;
531
532 spinlock_t l_grant_write_lock ____cacheline_aligned_in_smp;
533 struct list_head l_writeq;
534 atomic64_t l_grant_write_head;
529 535
530 /* The following fields are used for debugging; need to hold icloglock */ 536 /* The following fields are used for debugging; need to hold icloglock */
531#ifdef DEBUG 537#ifdef DEBUG
@@ -534,6 +540,9 @@ typedef struct log {
534 540
535} xlog_t; 541} xlog_t;
536 542
543#define XLOG_BUF_CANCEL_BUCKET(log, blkno) \
544 ((log)->l_buf_cancel_table + ((__uint64_t)blkno % XLOG_BC_TABLE_SIZE))
545
537#define XLOG_FORCED_SHUTDOWN(log) ((log)->l_flags & XLOG_IO_ERROR) 546#define XLOG_FORCED_SHUTDOWN(log) ((log)->l_flags & XLOG_IO_ERROR)
538 547
539/* common routines */ 548/* common routines */
@@ -562,6 +571,61 @@ int xlog_write(struct log *log, struct xfs_log_vec *log_vector,
562 xlog_in_core_t **commit_iclog, uint flags); 571 xlog_in_core_t **commit_iclog, uint flags);
563 572
564/* 573/*
574 * When we crack an atomic LSN, we sample it first so that the value will not
575 * change while we are cracking it into the component values. This means we
576 * will always get consistent component values to work from. This should always
577 * be used to sample and crack LSNs that are stored and updated in atomic
578 * variables.
579 */
580static inline void
581xlog_crack_atomic_lsn(atomic64_t *lsn, uint *cycle, uint *block)
582{
583 xfs_lsn_t val = atomic64_read(lsn);
584
585 *cycle = CYCLE_LSN(val);
586 *block = BLOCK_LSN(val);
587}
588
589/*
590 * Calculate and assign a value to an atomic LSN variable from component pieces.
591 */
592static inline void
593xlog_assign_atomic_lsn(atomic64_t *lsn, uint cycle, uint block)
594{
595 atomic64_set(lsn, xlog_assign_lsn(cycle, block));
596}
597
598/*
599 * When we crack the grant head, we sample it first so that the value will not
600 * change while we are cracking it into the component values. This means we
601 * will always get consistent component values to work from.
602 */
603static inline void
604xlog_crack_grant_head_val(int64_t val, int *cycle, int *space)
605{
606 *cycle = val >> 32;
607 *space = val & 0xffffffff;
608}
609
610static inline void
611xlog_crack_grant_head(atomic64_t *head, int *cycle, int *space)
612{
613 xlog_crack_grant_head_val(atomic64_read(head), cycle, space);
614}
615
616static inline int64_t
617xlog_assign_grant_head_val(int cycle, int space)
618{
619 return ((int64_t)cycle << 32) | space;
620}
621
622static inline void
623xlog_assign_grant_head(atomic64_t *head, int cycle, int space)
624{
625 atomic64_set(head, xlog_assign_grant_head_val(cycle, space));
626}
627
628/*
565 * Committed Item List interfaces 629 * Committed Item List interfaces
566 */ 630 */
567int xlog_cil_init(struct log *log); 631int xlog_cil_init(struct log *log);
@@ -585,6 +649,21 @@ xlog_cil_force(struct log *log)
585 */ 649 */
586#define XLOG_UNMOUNT_REC_TYPE (-1U) 650#define XLOG_UNMOUNT_REC_TYPE (-1U)
587 651
652/*
653 * Wrapper function for waiting on a wait queue serialised against wakeups
654 * by a spinlock. This matches the semantics of all the wait queues used in the
655 * log code.
656 */
657static inline void xlog_wait(wait_queue_head_t *wq, spinlock_t *lock)
658{
659 DECLARE_WAITQUEUE(wait, current);
660
661 add_wait_queue_exclusive(wq, &wait);
662 __set_current_state(TASK_UNINTERRUPTIBLE);
663 spin_unlock(lock);
664 schedule();
665 remove_wait_queue(wq, &wait);
666}
588#endif /* __KERNEL__ */ 667#endif /* __KERNEL__ */
589 668
590#endif /* __XFS_LOG_PRIV_H__ */ 669#endif /* __XFS_LOG_PRIV_H__ */
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 966d3f97458..aa0ebb77690 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -53,6 +53,17 @@ STATIC void xlog_recover_check_summary(xlog_t *);
53#endif 53#endif
54 54
55/* 55/*
56 * This structure is used during recovery to record the buf log items which
57 * have been canceled and should not be replayed.
58 */
59struct xfs_buf_cancel {
60 xfs_daddr_t bc_blkno;
61 uint bc_len;
62 int bc_refcount;
63 struct list_head bc_list;
64};
65
66/*
56 * Sector aligned buffer routines for buffer create/read/write/access 67 * Sector aligned buffer routines for buffer create/read/write/access
57 */ 68 */
58 69
@@ -925,12 +936,12 @@ xlog_find_tail(
925 log->l_curr_cycle = be32_to_cpu(rhead->h_cycle); 936 log->l_curr_cycle = be32_to_cpu(rhead->h_cycle);
926 if (found == 2) 937 if (found == 2)
927 log->l_curr_cycle++; 938 log->l_curr_cycle++;
928 log->l_tail_lsn = be64_to_cpu(rhead->h_tail_lsn); 939 atomic64_set(&log->l_tail_lsn, be64_to_cpu(rhead->h_tail_lsn));
929 log->l_last_sync_lsn = be64_to_cpu(rhead->h_lsn); 940 atomic64_set(&log->l_last_sync_lsn, be64_to_cpu(rhead->h_lsn));
930 log->l_grant_reserve_cycle = log->l_curr_cycle; 941 xlog_assign_grant_head(&log->l_grant_reserve_head, log->l_curr_cycle,
931 log->l_grant_reserve_bytes = BBTOB(log->l_curr_block); 942 BBTOB(log->l_curr_block));
932 log->l_grant_write_cycle = log->l_curr_cycle; 943 xlog_assign_grant_head(&log->l_grant_write_head, log->l_curr_cycle,
933 log->l_grant_write_bytes = BBTOB(log->l_curr_block); 944 BBTOB(log->l_curr_block));
934 945
935 /* 946 /*
936 * Look for unmount record. If we find it, then we know there 947 * Look for unmount record. If we find it, then we know there
@@ -960,7 +971,7 @@ xlog_find_tail(
960 } 971 }
961 after_umount_blk = (i + hblks + (int) 972 after_umount_blk = (i + hblks + (int)
962 BTOBB(be32_to_cpu(rhead->h_len))) % log->l_logBBsize; 973 BTOBB(be32_to_cpu(rhead->h_len))) % log->l_logBBsize;
963 tail_lsn = log->l_tail_lsn; 974 tail_lsn = atomic64_read(&log->l_tail_lsn);
964 if (*head_blk == after_umount_blk && 975 if (*head_blk == after_umount_blk &&
965 be32_to_cpu(rhead->h_num_logops) == 1) { 976 be32_to_cpu(rhead->h_num_logops) == 1) {
966 umount_data_blk = (i + hblks) % log->l_logBBsize; 977 umount_data_blk = (i + hblks) % log->l_logBBsize;
@@ -975,12 +986,10 @@ xlog_find_tail(
975 * log records will point recovery to after the 986 * log records will point recovery to after the
976 * current unmount record. 987 * current unmount record.
977 */ 988 */
978 log->l_tail_lsn = 989 xlog_assign_atomic_lsn(&log->l_tail_lsn,
979 xlog_assign_lsn(log->l_curr_cycle, 990 log->l_curr_cycle, after_umount_blk);
980 after_umount_blk); 991 xlog_assign_atomic_lsn(&log->l_last_sync_lsn,
981 log->l_last_sync_lsn = 992 log->l_curr_cycle, after_umount_blk);
982 xlog_assign_lsn(log->l_curr_cycle,
983 after_umount_blk);
984 *tail_blk = after_umount_blk; 993 *tail_blk = after_umount_blk;
985 994
986 /* 995 /*
@@ -1605,82 +1614,45 @@ xlog_recover_reorder_trans(
1605 * record in the table to tell us how many times we expect to see this 1614 * record in the table to tell us how many times we expect to see this
1606 * record during the second pass. 1615 * record during the second pass.
1607 */ 1616 */
1608STATIC void 1617STATIC int
1609xlog_recover_do_buffer_pass1( 1618xlog_recover_buffer_pass1(
1610 xlog_t *log, 1619 struct log *log,
1611 xfs_buf_log_format_t *buf_f) 1620 xlog_recover_item_t *item)
1612{ 1621{
1613 xfs_buf_cancel_t *bcp; 1622 xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr;
1614 xfs_buf_cancel_t *nextp; 1623 struct list_head *bucket;
1615 xfs_buf_cancel_t *prevp; 1624 struct xfs_buf_cancel *bcp;
1616 xfs_buf_cancel_t **bucket;
1617 xfs_daddr_t blkno = 0;
1618 uint len = 0;
1619 ushort flags = 0;
1620
1621 switch (buf_f->blf_type) {
1622 case XFS_LI_BUF:
1623 blkno = buf_f->blf_blkno;
1624 len = buf_f->blf_len;
1625 flags = buf_f->blf_flags;
1626 break;
1627 }
1628 1625
1629 /* 1626 /*
1630 * If this isn't a cancel buffer item, then just return. 1627 * If this isn't a cancel buffer item, then just return.
1631 */ 1628 */
1632 if (!(flags & XFS_BLF_CANCEL)) { 1629 if (!(buf_f->blf_flags & XFS_BLF_CANCEL)) {
1633 trace_xfs_log_recover_buf_not_cancel(log, buf_f); 1630 trace_xfs_log_recover_buf_not_cancel(log, buf_f);
1634 return; 1631 return 0;
1635 }
1636
1637 /*
1638 * Insert an xfs_buf_cancel record into the hash table of
1639 * them. If there is already an identical record, bump
1640 * its reference count.
1641 */
1642 bucket = &log->l_buf_cancel_table[(__uint64_t)blkno %
1643 XLOG_BC_TABLE_SIZE];
1644 /*
1645 * If the hash bucket is empty then just insert a new record into
1646 * the bucket.
1647 */
1648 if (*bucket == NULL) {
1649 bcp = (xfs_buf_cancel_t *)kmem_alloc(sizeof(xfs_buf_cancel_t),
1650 KM_SLEEP);
1651 bcp->bc_blkno = blkno;
1652 bcp->bc_len = len;
1653 bcp->bc_refcount = 1;
1654 bcp->bc_next = NULL;
1655 *bucket = bcp;
1656 return;
1657 } 1632 }
1658 1633
1659 /* 1634 /*
1660 * The hash bucket is not empty, so search for duplicates of our 1635 * Insert an xfs_buf_cancel record into the hash table of them.
1661 * record. If we find one then just bump its refcount. If not 1636 * If there is already an identical record, bump its reference count.
1662 * then add us at the end of the list.
1663 */ 1637 */
1664 prevp = NULL; 1638 bucket = XLOG_BUF_CANCEL_BUCKET(log, buf_f->blf_blkno);
1665 nextp = *bucket; 1639 list_for_each_entry(bcp, bucket, bc_list) {
1666 while (nextp != NULL) { 1640 if (bcp->bc_blkno == buf_f->blf_blkno &&
1667 if (nextp->bc_blkno == blkno && nextp->bc_len == len) { 1641 bcp->bc_len == buf_f->blf_len) {
1668 nextp->bc_refcount++; 1642 bcp->bc_refcount++;
1669 trace_xfs_log_recover_buf_cancel_ref_inc(log, buf_f); 1643 trace_xfs_log_recover_buf_cancel_ref_inc(log, buf_f);
1670 return; 1644 return 0;
1671 } 1645 }
1672 prevp = nextp; 1646 }
1673 nextp = nextp->bc_next; 1647
1674 } 1648 bcp = kmem_alloc(sizeof(struct xfs_buf_cancel), KM_SLEEP);
1675 ASSERT(prevp != NULL); 1649 bcp->bc_blkno = buf_f->blf_blkno;
1676 bcp = (xfs_buf_cancel_t *)kmem_alloc(sizeof(xfs_buf_cancel_t), 1650 bcp->bc_len = buf_f->blf_len;
1677 KM_SLEEP);
1678 bcp->bc_blkno = blkno;
1679 bcp->bc_len = len;
1680 bcp->bc_refcount = 1; 1651 bcp->bc_refcount = 1;
1681 bcp->bc_next = NULL; 1652 list_add_tail(&bcp->bc_list, bucket);
1682 prevp->bc_next = bcp; 1653
1683 trace_xfs_log_recover_buf_cancel_add(log, buf_f); 1654 trace_xfs_log_recover_buf_cancel_add(log, buf_f);
1655 return 0;
1684} 1656}
1685 1657
1686/* 1658/*
@@ -1698,14 +1670,13 @@ xlog_recover_do_buffer_pass1(
1698 */ 1670 */
1699STATIC int 1671STATIC int
1700xlog_check_buffer_cancelled( 1672xlog_check_buffer_cancelled(
1701 xlog_t *log, 1673 struct log *log,
1702 xfs_daddr_t blkno, 1674 xfs_daddr_t blkno,
1703 uint len, 1675 uint len,
1704 ushort flags) 1676 ushort flags)
1705{ 1677{
1706 xfs_buf_cancel_t *bcp; 1678 struct list_head *bucket;
1707 xfs_buf_cancel_t *prevp; 1679 struct xfs_buf_cancel *bcp;
1708 xfs_buf_cancel_t **bucket;
1709 1680
1710 if (log->l_buf_cancel_table == NULL) { 1681 if (log->l_buf_cancel_table == NULL) {
1711 /* 1682 /*
@@ -1716,128 +1687,70 @@ xlog_check_buffer_cancelled(
1716 return 0; 1687 return 0;
1717 } 1688 }
1718 1689
1719 bucket = &log->l_buf_cancel_table[(__uint64_t)blkno %
1720 XLOG_BC_TABLE_SIZE];
1721 bcp = *bucket;
1722 if (bcp == NULL) {
1723 /*
1724 * There is no corresponding entry in the table built
1725 * in pass one, so this buffer has not been cancelled.
1726 */
1727 ASSERT(!(flags & XFS_BLF_CANCEL));
1728 return 0;
1729 }
1730
1731 /* 1690 /*
1732 * Search for an entry in the buffer cancel table that 1691 * Search for an entry in the cancel table that matches our buffer.
1733 * matches our buffer.
1734 */ 1692 */
1735 prevp = NULL; 1693 bucket = XLOG_BUF_CANCEL_BUCKET(log, blkno);
1736 while (bcp != NULL) { 1694 list_for_each_entry(bcp, bucket, bc_list) {
1737 if (bcp->bc_blkno == blkno && bcp->bc_len == len) { 1695 if (bcp->bc_blkno == blkno && bcp->bc_len == len)
1738 /* 1696 goto found;
1739 * We've got a match, so return 1 so that the
1740 * recovery of this buffer is cancelled.
1741 * If this buffer is actually a buffer cancel
1742 * log item, then decrement the refcount on the
1743 * one in the table and remove it if this is the
1744 * last reference.
1745 */
1746 if (flags & XFS_BLF_CANCEL) {
1747 bcp->bc_refcount--;
1748 if (bcp->bc_refcount == 0) {
1749 if (prevp == NULL) {
1750 *bucket = bcp->bc_next;
1751 } else {
1752 prevp->bc_next = bcp->bc_next;
1753 }
1754 kmem_free(bcp);
1755 }
1756 }
1757 return 1;
1758 }
1759 prevp = bcp;
1760 bcp = bcp->bc_next;
1761 } 1697 }
1698
1762 /* 1699 /*
1763 * We didn't find a corresponding entry in the table, so 1700 * We didn't find a corresponding entry in the table, so return 0 so
1764 * return 0 so that the buffer is NOT cancelled. 1701 * that the buffer is NOT cancelled.
1765 */ 1702 */
1766 ASSERT(!(flags & XFS_BLF_CANCEL)); 1703 ASSERT(!(flags & XFS_BLF_CANCEL));
1767 return 0; 1704 return 0;
1768}
1769 1705
1770STATIC int 1706found:
1771xlog_recover_do_buffer_pass2( 1707 /*
1772 xlog_t *log, 1708 * We've got a match, so return 1 so that the recovery of this buffer
1773 xfs_buf_log_format_t *buf_f) 1709 * is cancelled. If this buffer is actually a buffer cancel log
1774{ 1710 * item, then decrement the refcount on the one in the table and
1775 xfs_daddr_t blkno = 0; 1711 * remove it if this is the last reference.
1776 ushort flags = 0; 1712 */
1777 uint len = 0; 1713 if (flags & XFS_BLF_CANCEL) {
1778 1714 if (--bcp->bc_refcount == 0) {
1779 switch (buf_f->blf_type) { 1715 list_del(&bcp->bc_list);
1780 case XFS_LI_BUF: 1716 kmem_free(bcp);
1781 blkno = buf_f->blf_blkno; 1717 }
1782 flags = buf_f->blf_flags;
1783 len = buf_f->blf_len;
1784 break;
1785 } 1718 }
1786 1719 return 1;
1787 return xlog_check_buffer_cancelled(log, blkno, len, flags);
1788} 1720}
1789 1721
1790/* 1722/*
1791 * Perform recovery for a buffer full of inodes. In these buffers, 1723 * Perform recovery for a buffer full of inodes. In these buffers, the only
1792 * the only data which should be recovered is that which corresponds 1724 * data which should be recovered is that which corresponds to the
1793 * to the di_next_unlinked pointers in the on disk inode structures. 1725 * di_next_unlinked pointers in the on disk inode structures. The rest of the
1794 * The rest of the data for the inodes is always logged through the 1726 * data for the inodes is always logged through the inodes themselves rather
1795 * inodes themselves rather than the inode buffer and is recovered 1727 * than the inode buffer and is recovered in xlog_recover_inode_pass2().
1796 * in xlog_recover_do_inode_trans().
1797 * 1728 *
1798 * The only time when buffers full of inodes are fully recovered is 1729 * The only time when buffers full of inodes are fully recovered is when the
1799 * when the buffer is full of newly allocated inodes. In this case 1730 * buffer is full of newly allocated inodes. In this case the buffer will
1800 * the buffer will not be marked as an inode buffer and so will be 1731 * not be marked as an inode buffer and so will be sent to
1801 * sent to xlog_recover_do_reg_buffer() below during recovery. 1732 * xlog_recover_do_reg_buffer() below during recovery.
1802 */ 1733 */
1803STATIC int 1734STATIC int
1804xlog_recover_do_inode_buffer( 1735xlog_recover_do_inode_buffer(
1805 xfs_mount_t *mp, 1736 struct xfs_mount *mp,
1806 xlog_recover_item_t *item, 1737 xlog_recover_item_t *item,
1807 xfs_buf_t *bp, 1738 struct xfs_buf *bp,
1808 xfs_buf_log_format_t *buf_f) 1739 xfs_buf_log_format_t *buf_f)
1809{ 1740{
1810 int i; 1741 int i;
1811 int item_index; 1742 int item_index = 0;
1812 int bit; 1743 int bit = 0;
1813 int nbits; 1744 int nbits = 0;
1814 int reg_buf_offset; 1745 int reg_buf_offset = 0;
1815 int reg_buf_bytes; 1746 int reg_buf_bytes = 0;
1816 int next_unlinked_offset; 1747 int next_unlinked_offset;
1817 int inodes_per_buf; 1748 int inodes_per_buf;
1818 xfs_agino_t *logged_nextp; 1749 xfs_agino_t *logged_nextp;
1819 xfs_agino_t *buffer_nextp; 1750 xfs_agino_t *buffer_nextp;
1820 unsigned int *data_map = NULL;
1821 unsigned int map_size = 0;
1822 1751
1823 trace_xfs_log_recover_buf_inode_buf(mp->m_log, buf_f); 1752 trace_xfs_log_recover_buf_inode_buf(mp->m_log, buf_f);
1824 1753
1825 switch (buf_f->blf_type) {
1826 case XFS_LI_BUF:
1827 data_map = buf_f->blf_data_map;
1828 map_size = buf_f->blf_map_size;
1829 break;
1830 }
1831 /*
1832 * Set the variables corresponding to the current region to
1833 * 0 so that we'll initialize them on the first pass through
1834 * the loop.
1835 */
1836 reg_buf_offset = 0;
1837 reg_buf_bytes = 0;
1838 bit = 0;
1839 nbits = 0;
1840 item_index = 0;
1841 inodes_per_buf = XFS_BUF_COUNT(bp) >> mp->m_sb.sb_inodelog; 1754 inodes_per_buf = XFS_BUF_COUNT(bp) >> mp->m_sb.sb_inodelog;
1842 for (i = 0; i < inodes_per_buf; i++) { 1755 for (i = 0; i < inodes_per_buf; i++) {
1843 next_unlinked_offset = (i * mp->m_sb.sb_inodesize) + 1756 next_unlinked_offset = (i * mp->m_sb.sb_inodesize) +
@@ -1852,18 +1765,18 @@ xlog_recover_do_inode_buffer(
1852 * the current di_next_unlinked field. 1765 * the current di_next_unlinked field.
1853 */ 1766 */
1854 bit += nbits; 1767 bit += nbits;
1855 bit = xfs_next_bit(data_map, map_size, bit); 1768 bit = xfs_next_bit(buf_f->blf_data_map,
1769 buf_f->blf_map_size, bit);
1856 1770
1857 /* 1771 /*
1858 * If there are no more logged regions in the 1772 * If there are no more logged regions in the
1859 * buffer, then we're done. 1773 * buffer, then we're done.
1860 */ 1774 */
1861 if (bit == -1) { 1775 if (bit == -1)
1862 return 0; 1776 return 0;
1863 }
1864 1777
1865 nbits = xfs_contig_bits(data_map, map_size, 1778 nbits = xfs_contig_bits(buf_f->blf_data_map,
1866 bit); 1779 buf_f->blf_map_size, bit);
1867 ASSERT(nbits > 0); 1780 ASSERT(nbits > 0);
1868 reg_buf_offset = bit << XFS_BLF_SHIFT; 1781 reg_buf_offset = bit << XFS_BLF_SHIFT;
1869 reg_buf_bytes = nbits << XFS_BLF_SHIFT; 1782 reg_buf_bytes = nbits << XFS_BLF_SHIFT;
@@ -1875,9 +1788,8 @@ xlog_recover_do_inode_buffer(
1875 * di_next_unlinked field, then move on to the next 1788 * di_next_unlinked field, then move on to the next
1876 * di_next_unlinked field. 1789 * di_next_unlinked field.
1877 */ 1790 */
1878 if (next_unlinked_offset < reg_buf_offset) { 1791 if (next_unlinked_offset < reg_buf_offset)
1879 continue; 1792 continue;
1880 }
1881 1793
1882 ASSERT(item->ri_buf[item_index].i_addr != NULL); 1794 ASSERT(item->ri_buf[item_index].i_addr != NULL);
1883 ASSERT((item->ri_buf[item_index].i_len % XFS_BLF_CHUNK) == 0); 1795 ASSERT((item->ri_buf[item_index].i_len % XFS_BLF_CHUNK) == 0);
@@ -1913,36 +1825,29 @@ xlog_recover_do_inode_buffer(
1913 * given buffer. The bitmap in the buf log format structure indicates 1825 * given buffer. The bitmap in the buf log format structure indicates
1914 * where to place the logged data. 1826 * where to place the logged data.
1915 */ 1827 */
1916/*ARGSUSED*/
1917STATIC void 1828STATIC void
1918xlog_recover_do_reg_buffer( 1829xlog_recover_do_reg_buffer(
1919 struct xfs_mount *mp, 1830 struct xfs_mount *mp,
1920 xlog_recover_item_t *item, 1831 xlog_recover_item_t *item,
1921 xfs_buf_t *bp, 1832 struct xfs_buf *bp,
1922 xfs_buf_log_format_t *buf_f) 1833 xfs_buf_log_format_t *buf_f)
1923{ 1834{
1924 int i; 1835 int i;
1925 int bit; 1836 int bit;
1926 int nbits; 1837 int nbits;
1927 unsigned int *data_map = NULL;
1928 unsigned int map_size = 0;
1929 int error; 1838 int error;
1930 1839
1931 trace_xfs_log_recover_buf_reg_buf(mp->m_log, buf_f); 1840 trace_xfs_log_recover_buf_reg_buf(mp->m_log, buf_f);
1932 1841
1933 switch (buf_f->blf_type) {
1934 case XFS_LI_BUF:
1935 data_map = buf_f->blf_data_map;
1936 map_size = buf_f->blf_map_size;
1937 break;
1938 }
1939 bit = 0; 1842 bit = 0;
1940 i = 1; /* 0 is the buf format structure */ 1843 i = 1; /* 0 is the buf format structure */
1941 while (1) { 1844 while (1) {
1942 bit = xfs_next_bit(data_map, map_size, bit); 1845 bit = xfs_next_bit(buf_f->blf_data_map,
1846 buf_f->blf_map_size, bit);
1943 if (bit == -1) 1847 if (bit == -1)
1944 break; 1848 break;
1945 nbits = xfs_contig_bits(data_map, map_size, bit); 1849 nbits = xfs_contig_bits(buf_f->blf_data_map,
1850 buf_f->blf_map_size, bit);
1946 ASSERT(nbits > 0); 1851 ASSERT(nbits > 0);
1947 ASSERT(item->ri_buf[i].i_addr != NULL); 1852 ASSERT(item->ri_buf[i].i_addr != NULL);
1948 ASSERT(item->ri_buf[i].i_len % XFS_BLF_CHUNK == 0); 1853 ASSERT(item->ri_buf[i].i_len % XFS_BLF_CHUNK == 0);
@@ -2176,77 +2081,46 @@ xlog_recover_do_dquot_buffer(
2176 * for more details on the implementation of the table of cancel records. 2081 * for more details on the implementation of the table of cancel records.
2177 */ 2082 */
2178STATIC int 2083STATIC int
2179xlog_recover_do_buffer_trans( 2084xlog_recover_buffer_pass2(
2180 xlog_t *log, 2085 xlog_t *log,
2181 xlog_recover_item_t *item, 2086 xlog_recover_item_t *item)
2182 int pass)
2183{ 2087{
2184 xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr; 2088 xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr;
2185 xfs_mount_t *mp; 2089 xfs_mount_t *mp = log->l_mp;
2186 xfs_buf_t *bp; 2090 xfs_buf_t *bp;
2187 int error; 2091 int error;
2188 int cancel;
2189 xfs_daddr_t blkno;
2190 int len;
2191 ushort flags;
2192 uint buf_flags; 2092 uint buf_flags;
2193 2093
2194 if (pass == XLOG_RECOVER_PASS1) { 2094 /*
2195 /* 2095 * In this pass we only want to recover all the buffers which have
2196 * In this pass we're only looking for buf items 2096 * not been cancelled and are not cancellation buffers themselves.
2197 * with the XFS_BLF_CANCEL bit set. 2097 */
2198 */ 2098 if (xlog_check_buffer_cancelled(log, buf_f->blf_blkno,
2199 xlog_recover_do_buffer_pass1(log, buf_f); 2099 buf_f->blf_len, buf_f->blf_flags)) {
2100 trace_xfs_log_recover_buf_cancel(log, buf_f);
2200 return 0; 2101 return 0;
2201 } else {
2202 /*
2203 * In this pass we want to recover all the buffers
2204 * which have not been cancelled and are not
2205 * cancellation buffers themselves. The routine
2206 * we call here will tell us whether or not to
2207 * continue with the replay of this buffer.
2208 */
2209 cancel = xlog_recover_do_buffer_pass2(log, buf_f);
2210 if (cancel) {
2211 trace_xfs_log_recover_buf_cancel(log, buf_f);
2212 return 0;
2213 }
2214 } 2102 }
2103
2215 trace_xfs_log_recover_buf_recover(log, buf_f); 2104 trace_xfs_log_recover_buf_recover(log, buf_f);
2216 switch (buf_f->blf_type) {
2217 case XFS_LI_BUF:
2218 blkno = buf_f->blf_blkno;
2219 len = buf_f->blf_len;
2220 flags = buf_f->blf_flags;
2221 break;
2222 default:
2223 xfs_fs_cmn_err(CE_ALERT, log->l_mp,
2224 "xfs_log_recover: unknown buffer type 0x%x, logdev %s",
2225 buf_f->blf_type, log->l_mp->m_logname ?
2226 log->l_mp->m_logname : "internal");
2227 XFS_ERROR_REPORT("xlog_recover_do_buffer_trans",
2228 XFS_ERRLEVEL_LOW, log->l_mp);
2229 return XFS_ERROR(EFSCORRUPTED);
2230 }
2231 2105
2232 mp = log->l_mp;
2233 buf_flags = XBF_LOCK; 2106 buf_flags = XBF_LOCK;
2234 if (!(flags & XFS_BLF_INODE_BUF)) 2107 if (!(buf_f->blf_flags & XFS_BLF_INODE_BUF))
2235 buf_flags |= XBF_MAPPED; 2108 buf_flags |= XBF_MAPPED;
2236 2109
2237 bp = xfs_buf_read(mp->m_ddev_targp, blkno, len, buf_flags); 2110 bp = xfs_buf_read(mp->m_ddev_targp, buf_f->blf_blkno, buf_f->blf_len,
2111 buf_flags);
2238 if (XFS_BUF_ISERROR(bp)) { 2112 if (XFS_BUF_ISERROR(bp)) {
2239 xfs_ioerror_alert("xlog_recover_do..(read#1)", log->l_mp, 2113 xfs_ioerror_alert("xlog_recover_do..(read#1)", mp,
2240 bp, blkno); 2114 bp, buf_f->blf_blkno);
2241 error = XFS_BUF_GETERROR(bp); 2115 error = XFS_BUF_GETERROR(bp);
2242 xfs_buf_relse(bp); 2116 xfs_buf_relse(bp);
2243 return error; 2117 return error;
2244 } 2118 }
2245 2119
2246 error = 0; 2120 error = 0;
2247 if (flags & XFS_BLF_INODE_BUF) { 2121 if (buf_f->blf_flags & XFS_BLF_INODE_BUF) {
2248 error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f); 2122 error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f);
2249 } else if (flags & 2123 } else if (buf_f->blf_flags &
2250 (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) { 2124 (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) {
2251 xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f); 2125 xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f);
2252 } else { 2126 } else {
@@ -2286,16 +2160,14 @@ xlog_recover_do_buffer_trans(
2286} 2160}
2287 2161
2288STATIC int 2162STATIC int
2289xlog_recover_do_inode_trans( 2163xlog_recover_inode_pass2(
2290 xlog_t *log, 2164 xlog_t *log,
2291 xlog_recover_item_t *item, 2165 xlog_recover_item_t *item)
2292 int pass)
2293{ 2166{
2294 xfs_inode_log_format_t *in_f; 2167 xfs_inode_log_format_t *in_f;
2295 xfs_mount_t *mp; 2168 xfs_mount_t *mp = log->l_mp;
2296 xfs_buf_t *bp; 2169 xfs_buf_t *bp;
2297 xfs_dinode_t *dip; 2170 xfs_dinode_t *dip;
2298 xfs_ino_t ino;
2299 int len; 2171 int len;
2300 xfs_caddr_t src; 2172 xfs_caddr_t src;
2301 xfs_caddr_t dest; 2173 xfs_caddr_t dest;
@@ -2305,10 +2177,6 @@ xlog_recover_do_inode_trans(
2305 xfs_icdinode_t *dicp; 2177 xfs_icdinode_t *dicp;
2306 int need_free = 0; 2178 int need_free = 0;
2307 2179
2308 if (pass == XLOG_RECOVER_PASS1) {
2309 return 0;
2310 }
2311
2312 if (item->ri_buf[0].i_len == sizeof(xfs_inode_log_format_t)) { 2180 if (item->ri_buf[0].i_len == sizeof(xfs_inode_log_format_t)) {
2313 in_f = item->ri_buf[0].i_addr; 2181 in_f = item->ri_buf[0].i_addr;
2314 } else { 2182 } else {
@@ -2318,8 +2186,6 @@ xlog_recover_do_inode_trans(
2318 if (error) 2186 if (error)
2319 goto error; 2187 goto error;
2320 } 2188 }
2321 ino = in_f->ilf_ino;
2322 mp = log->l_mp;
2323 2189
2324 /* 2190 /*
2325 * Inode buffers can be freed, look out for it, 2191 * Inode buffers can be freed, look out for it,
@@ -2354,8 +2220,8 @@ xlog_recover_do_inode_trans(
2354 xfs_buf_relse(bp); 2220 xfs_buf_relse(bp);
2355 xfs_fs_cmn_err(CE_ALERT, mp, 2221 xfs_fs_cmn_err(CE_ALERT, mp,
2356 "xfs_inode_recover: Bad inode magic number, dino ptr = 0x%p, dino bp = 0x%p, ino = %Ld", 2222 "xfs_inode_recover: Bad inode magic number, dino ptr = 0x%p, dino bp = 0x%p, ino = %Ld",
2357 dip, bp, ino); 2223 dip, bp, in_f->ilf_ino);
2358 XFS_ERROR_REPORT("xlog_recover_do_inode_trans(1)", 2224 XFS_ERROR_REPORT("xlog_recover_inode_pass2(1)",
2359 XFS_ERRLEVEL_LOW, mp); 2225 XFS_ERRLEVEL_LOW, mp);
2360 error = EFSCORRUPTED; 2226 error = EFSCORRUPTED;
2361 goto error; 2227 goto error;
@@ -2365,8 +2231,8 @@ xlog_recover_do_inode_trans(
2365 xfs_buf_relse(bp); 2231 xfs_buf_relse(bp);
2366 xfs_fs_cmn_err(CE_ALERT, mp, 2232 xfs_fs_cmn_err(CE_ALERT, mp,
2367 "xfs_inode_recover: Bad inode log record, rec ptr 0x%p, ino %Ld", 2233 "xfs_inode_recover: Bad inode log record, rec ptr 0x%p, ino %Ld",
2368 item, ino); 2234 item, in_f->ilf_ino);
2369 XFS_ERROR_REPORT("xlog_recover_do_inode_trans(2)", 2235 XFS_ERROR_REPORT("xlog_recover_inode_pass2(2)",
2370 XFS_ERRLEVEL_LOW, mp); 2236 XFS_ERRLEVEL_LOW, mp);
2371 error = EFSCORRUPTED; 2237 error = EFSCORRUPTED;
2372 goto error; 2238 goto error;
@@ -2394,12 +2260,12 @@ xlog_recover_do_inode_trans(
2394 if (unlikely((dicp->di_mode & S_IFMT) == S_IFREG)) { 2260 if (unlikely((dicp->di_mode & S_IFMT) == S_IFREG)) {
2395 if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) && 2261 if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) &&
2396 (dicp->di_format != XFS_DINODE_FMT_BTREE)) { 2262 (dicp->di_format != XFS_DINODE_FMT_BTREE)) {
2397 XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(3)", 2263 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(3)",
2398 XFS_ERRLEVEL_LOW, mp, dicp); 2264 XFS_ERRLEVEL_LOW, mp, dicp);
2399 xfs_buf_relse(bp); 2265 xfs_buf_relse(bp);
2400 xfs_fs_cmn_err(CE_ALERT, mp, 2266 xfs_fs_cmn_err(CE_ALERT, mp,
2401 "xfs_inode_recover: Bad regular inode log record, rec ptr 0x%p, ino ptr = 0x%p, ino bp = 0x%p, ino %Ld", 2267 "xfs_inode_recover: Bad regular inode log record, rec ptr 0x%p, ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
2402 item, dip, bp, ino); 2268 item, dip, bp, in_f->ilf_ino);
2403 error = EFSCORRUPTED; 2269 error = EFSCORRUPTED;
2404 goto error; 2270 goto error;
2405 } 2271 }
@@ -2407,40 +2273,40 @@ xlog_recover_do_inode_trans(
2407 if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) && 2273 if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) &&
2408 (dicp->di_format != XFS_DINODE_FMT_BTREE) && 2274 (dicp->di_format != XFS_DINODE_FMT_BTREE) &&
2409 (dicp->di_format != XFS_DINODE_FMT_LOCAL)) { 2275 (dicp->di_format != XFS_DINODE_FMT_LOCAL)) {
2410 XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(4)", 2276 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(4)",
2411 XFS_ERRLEVEL_LOW, mp, dicp); 2277 XFS_ERRLEVEL_LOW, mp, dicp);
2412 xfs_buf_relse(bp); 2278 xfs_buf_relse(bp);
2413 xfs_fs_cmn_err(CE_ALERT, mp, 2279 xfs_fs_cmn_err(CE_ALERT, mp,
2414 "xfs_inode_recover: Bad dir inode log record, rec ptr 0x%p, ino ptr = 0x%p, ino bp = 0x%p, ino %Ld", 2280 "xfs_inode_recover: Bad dir inode log record, rec ptr 0x%p, ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
2415 item, dip, bp, ino); 2281 item, dip, bp, in_f->ilf_ino);
2416 error = EFSCORRUPTED; 2282 error = EFSCORRUPTED;
2417 goto error; 2283 goto error;
2418 } 2284 }
2419 } 2285 }
2420 if (unlikely(dicp->di_nextents + dicp->di_anextents > dicp->di_nblocks)){ 2286 if (unlikely(dicp->di_nextents + dicp->di_anextents > dicp->di_nblocks)){
2421 XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(5)", 2287 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(5)",
2422 XFS_ERRLEVEL_LOW, mp, dicp); 2288 XFS_ERRLEVEL_LOW, mp, dicp);
2423 xfs_buf_relse(bp); 2289 xfs_buf_relse(bp);
2424 xfs_fs_cmn_err(CE_ALERT, mp, 2290 xfs_fs_cmn_err(CE_ALERT, mp,
2425 "xfs_inode_recover: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, dino bp 0x%p, ino %Ld, total extents = %d, nblocks = %Ld", 2291 "xfs_inode_recover: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, dino bp 0x%p, ino %Ld, total extents = %d, nblocks = %Ld",
2426 item, dip, bp, ino, 2292 item, dip, bp, in_f->ilf_ino,
2427 dicp->di_nextents + dicp->di_anextents, 2293 dicp->di_nextents + dicp->di_anextents,
2428 dicp->di_nblocks); 2294 dicp->di_nblocks);
2429 error = EFSCORRUPTED; 2295 error = EFSCORRUPTED;
2430 goto error; 2296 goto error;
2431 } 2297 }
2432 if (unlikely(dicp->di_forkoff > mp->m_sb.sb_inodesize)) { 2298 if (unlikely(dicp->di_forkoff > mp->m_sb.sb_inodesize)) {
2433 XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(6)", 2299 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(6)",
2434 XFS_ERRLEVEL_LOW, mp, dicp); 2300 XFS_ERRLEVEL_LOW, mp, dicp);
2435 xfs_buf_relse(bp); 2301 xfs_buf_relse(bp);
2436 xfs_fs_cmn_err(CE_ALERT, mp, 2302 xfs_fs_cmn_err(CE_ALERT, mp,
2437 "xfs_inode_recover: Bad inode log rec ptr 0x%p, dino ptr 0x%p, dino bp 0x%p, ino %Ld, forkoff 0x%x", 2303 "xfs_inode_recover: Bad inode log rec ptr 0x%p, dino ptr 0x%p, dino bp 0x%p, ino %Ld, forkoff 0x%x",
2438 item, dip, bp, ino, dicp->di_forkoff); 2304 item, dip, bp, in_f->ilf_ino, dicp->di_forkoff);
2439 error = EFSCORRUPTED; 2305 error = EFSCORRUPTED;
2440 goto error; 2306 goto error;
2441 } 2307 }
2442 if (unlikely(item->ri_buf[1].i_len > sizeof(struct xfs_icdinode))) { 2308 if (unlikely(item->ri_buf[1].i_len > sizeof(struct xfs_icdinode))) {
2443 XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(7)", 2309 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(7)",
2444 XFS_ERRLEVEL_LOW, mp, dicp); 2310 XFS_ERRLEVEL_LOW, mp, dicp);
2445 xfs_buf_relse(bp); 2311 xfs_buf_relse(bp);
2446 xfs_fs_cmn_err(CE_ALERT, mp, 2312 xfs_fs_cmn_err(CE_ALERT, mp,
@@ -2532,7 +2398,7 @@ xlog_recover_do_inode_trans(
2532 break; 2398 break;
2533 2399
2534 default: 2400 default:
2535 xlog_warn("XFS: xlog_recover_do_inode_trans: Invalid flag"); 2401 xlog_warn("XFS: xlog_recover_inode_pass2: Invalid flag");
2536 ASSERT(0); 2402 ASSERT(0);
2537 xfs_buf_relse(bp); 2403 xfs_buf_relse(bp);
2538 error = EIO; 2404 error = EIO;
@@ -2556,18 +2422,11 @@ error:
2556 * of that type. 2422 * of that type.
2557 */ 2423 */
2558STATIC int 2424STATIC int
2559xlog_recover_do_quotaoff_trans( 2425xlog_recover_quotaoff_pass1(
2560 xlog_t *log, 2426 xlog_t *log,
2561 xlog_recover_item_t *item, 2427 xlog_recover_item_t *item)
2562 int pass)
2563{ 2428{
2564 xfs_qoff_logformat_t *qoff_f; 2429 xfs_qoff_logformat_t *qoff_f = item->ri_buf[0].i_addr;
2565
2566 if (pass == XLOG_RECOVER_PASS2) {
2567 return (0);
2568 }
2569
2570 qoff_f = item->ri_buf[0].i_addr;
2571 ASSERT(qoff_f); 2430 ASSERT(qoff_f);
2572 2431
2573 /* 2432 /*
@@ -2588,22 +2447,17 @@ xlog_recover_do_quotaoff_trans(
2588 * Recover a dquot record 2447 * Recover a dquot record
2589 */ 2448 */
2590STATIC int 2449STATIC int
2591xlog_recover_do_dquot_trans( 2450xlog_recover_dquot_pass2(
2592 xlog_t *log, 2451 xlog_t *log,
2593 xlog_recover_item_t *item, 2452 xlog_recover_item_t *item)
2594 int pass)
2595{ 2453{
2596 xfs_mount_t *mp; 2454 xfs_mount_t *mp = log->l_mp;
2597 xfs_buf_t *bp; 2455 xfs_buf_t *bp;
2598 struct xfs_disk_dquot *ddq, *recddq; 2456 struct xfs_disk_dquot *ddq, *recddq;
2599 int error; 2457 int error;
2600 xfs_dq_logformat_t *dq_f; 2458 xfs_dq_logformat_t *dq_f;
2601 uint type; 2459 uint type;
2602 2460
2603 if (pass == XLOG_RECOVER_PASS1) {
2604 return 0;
2605 }
2606 mp = log->l_mp;
2607 2461
2608 /* 2462 /*
2609 * Filesystems are required to send in quota flags at mount time. 2463 * Filesystems are required to send in quota flags at mount time.
@@ -2647,7 +2501,7 @@ xlog_recover_do_dquot_trans(
2647 if ((error = xfs_qm_dqcheck(recddq, 2501 if ((error = xfs_qm_dqcheck(recddq,
2648 dq_f->qlf_id, 2502 dq_f->qlf_id,
2649 0, XFS_QMOPT_DOWARN, 2503 0, XFS_QMOPT_DOWARN,
2650 "xlog_recover_do_dquot_trans (log copy)"))) { 2504 "xlog_recover_dquot_pass2 (log copy)"))) {
2651 return XFS_ERROR(EIO); 2505 return XFS_ERROR(EIO);
2652 } 2506 }
2653 ASSERT(dq_f->qlf_len == 1); 2507 ASSERT(dq_f->qlf_len == 1);
@@ -2670,7 +2524,7 @@ xlog_recover_do_dquot_trans(
2670 * minimal initialization then. 2524 * minimal initialization then.
2671 */ 2525 */
2672 if (xfs_qm_dqcheck(ddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN, 2526 if (xfs_qm_dqcheck(ddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN,
2673 "xlog_recover_do_dquot_trans")) { 2527 "xlog_recover_dquot_pass2")) {
2674 xfs_buf_relse(bp); 2528 xfs_buf_relse(bp);
2675 return XFS_ERROR(EIO); 2529 return XFS_ERROR(EIO);
2676 } 2530 }
@@ -2693,38 +2547,31 @@ xlog_recover_do_dquot_trans(
2693 * LSN. 2547 * LSN.
2694 */ 2548 */
2695STATIC int 2549STATIC int
2696xlog_recover_do_efi_trans( 2550xlog_recover_efi_pass2(
2697 xlog_t *log, 2551 xlog_t *log,
2698 xlog_recover_item_t *item, 2552 xlog_recover_item_t *item,
2699 xfs_lsn_t lsn, 2553 xfs_lsn_t lsn)
2700 int pass)
2701{ 2554{
2702 int error; 2555 int error;
2703 xfs_mount_t *mp; 2556 xfs_mount_t *mp = log->l_mp;
2704 xfs_efi_log_item_t *efip; 2557 xfs_efi_log_item_t *efip;
2705 xfs_efi_log_format_t *efi_formatp; 2558 xfs_efi_log_format_t *efi_formatp;
2706 2559
2707 if (pass == XLOG_RECOVER_PASS1) {
2708 return 0;
2709 }
2710
2711 efi_formatp = item->ri_buf[0].i_addr; 2560 efi_formatp = item->ri_buf[0].i_addr;
2712 2561
2713 mp = log->l_mp;
2714 efip = xfs_efi_init(mp, efi_formatp->efi_nextents); 2562 efip = xfs_efi_init(mp, efi_formatp->efi_nextents);
2715 if ((error = xfs_efi_copy_format(&(item->ri_buf[0]), 2563 if ((error = xfs_efi_copy_format(&(item->ri_buf[0]),
2716 &(efip->efi_format)))) { 2564 &(efip->efi_format)))) {
2717 xfs_efi_item_free(efip); 2565 xfs_efi_item_free(efip);
2718 return error; 2566 return error;
2719 } 2567 }
2720 efip->efi_next_extent = efi_formatp->efi_nextents; 2568 atomic_set(&efip->efi_next_extent, efi_formatp->efi_nextents);
2721 efip->efi_flags |= XFS_EFI_COMMITTED;
2722 2569
2723 spin_lock(&log->l_ailp->xa_lock); 2570 spin_lock(&log->l_ailp->xa_lock);
2724 /* 2571 /*
2725 * xfs_trans_ail_update() drops the AIL lock. 2572 * xfs_trans_ail_update() drops the AIL lock.
2726 */ 2573 */
2727 xfs_trans_ail_update(log->l_ailp, (xfs_log_item_t *)efip, lsn); 2574 xfs_trans_ail_update(log->l_ailp, &efip->efi_item, lsn);
2728 return 0; 2575 return 0;
2729} 2576}
2730 2577
@@ -2737,11 +2584,10 @@ xlog_recover_do_efi_trans(
2737 * efd format structure. If we find it, we remove the efi from the 2584 * efd format structure. If we find it, we remove the efi from the
2738 * AIL and free it. 2585 * AIL and free it.
2739 */ 2586 */
2740STATIC void 2587STATIC int
2741xlog_recover_do_efd_trans( 2588xlog_recover_efd_pass2(
2742 xlog_t *log, 2589 xlog_t *log,
2743 xlog_recover_item_t *item, 2590 xlog_recover_item_t *item)
2744 int pass)
2745{ 2591{
2746 xfs_efd_log_format_t *efd_formatp; 2592 xfs_efd_log_format_t *efd_formatp;
2747 xfs_efi_log_item_t *efip = NULL; 2593 xfs_efi_log_item_t *efip = NULL;
@@ -2750,10 +2596,6 @@ xlog_recover_do_efd_trans(
2750 struct xfs_ail_cursor cur; 2596 struct xfs_ail_cursor cur;
2751 struct xfs_ail *ailp = log->l_ailp; 2597 struct xfs_ail *ailp = log->l_ailp;
2752 2598
2753 if (pass == XLOG_RECOVER_PASS1) {
2754 return;
2755 }
2756
2757 efd_formatp = item->ri_buf[0].i_addr; 2599 efd_formatp = item->ri_buf[0].i_addr;
2758 ASSERT((item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_32_t) + 2600 ASSERT((item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_32_t) +
2759 ((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_32_t)))) || 2601 ((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_32_t)))) ||
@@ -2785,62 +2627,6 @@ xlog_recover_do_efd_trans(
2785 } 2627 }
2786 xfs_trans_ail_cursor_done(ailp, &cur); 2628 xfs_trans_ail_cursor_done(ailp, &cur);
2787 spin_unlock(&ailp->xa_lock); 2629 spin_unlock(&ailp->xa_lock);
2788}
2789
2790/*
2791 * Perform the transaction
2792 *
2793 * If the transaction modifies a buffer or inode, do it now. Otherwise,
2794 * EFIs and EFDs get queued up by adding entries into the AIL for them.
2795 */
2796STATIC int
2797xlog_recover_do_trans(
2798 xlog_t *log,
2799 xlog_recover_t *trans,
2800 int pass)
2801{
2802 int error = 0;
2803 xlog_recover_item_t *item;
2804
2805 error = xlog_recover_reorder_trans(log, trans, pass);
2806 if (error)
2807 return error;
2808
2809 list_for_each_entry(item, &trans->r_itemq, ri_list) {
2810 trace_xfs_log_recover_item_recover(log, trans, item, pass);
2811 switch (ITEM_TYPE(item)) {
2812 case XFS_LI_BUF:
2813 error = xlog_recover_do_buffer_trans(log, item, pass);
2814 break;
2815 case XFS_LI_INODE:
2816 error = xlog_recover_do_inode_trans(log, item, pass);
2817 break;
2818 case XFS_LI_EFI:
2819 error = xlog_recover_do_efi_trans(log, item,
2820 trans->r_lsn, pass);
2821 break;
2822 case XFS_LI_EFD:
2823 xlog_recover_do_efd_trans(log, item, pass);
2824 error = 0;
2825 break;
2826 case XFS_LI_DQUOT:
2827 error = xlog_recover_do_dquot_trans(log, item, pass);
2828 break;
2829 case XFS_LI_QUOTAOFF:
2830 error = xlog_recover_do_quotaoff_trans(log, item,
2831 pass);
2832 break;
2833 default:
2834 xlog_warn(
2835 "XFS: invalid item type (%d) xlog_recover_do_trans", ITEM_TYPE(item));
2836 ASSERT(0);
2837 error = XFS_ERROR(EIO);
2838 break;
2839 }
2840
2841 if (error)
2842 return error;
2843 }
2844 2630
2845 return 0; 2631 return 0;
2846} 2632}
@@ -2852,7 +2638,7 @@ xlog_recover_do_trans(
2852 */ 2638 */
2853STATIC void 2639STATIC void
2854xlog_recover_free_trans( 2640xlog_recover_free_trans(
2855 xlog_recover_t *trans) 2641 struct xlog_recover *trans)
2856{ 2642{
2857 xlog_recover_item_t *item, *n; 2643 xlog_recover_item_t *item, *n;
2858 int i; 2644 int i;
@@ -2871,17 +2657,95 @@ xlog_recover_free_trans(
2871} 2657}
2872 2658
2873STATIC int 2659STATIC int
2660xlog_recover_commit_pass1(
2661 struct log *log,
2662 struct xlog_recover *trans,
2663 xlog_recover_item_t *item)
2664{
2665 trace_xfs_log_recover_item_recover(log, trans, item, XLOG_RECOVER_PASS1);
2666
2667 switch (ITEM_TYPE(item)) {
2668 case XFS_LI_BUF:
2669 return xlog_recover_buffer_pass1(log, item);
2670 case XFS_LI_QUOTAOFF:
2671 return xlog_recover_quotaoff_pass1(log, item);
2672 case XFS_LI_INODE:
2673 case XFS_LI_EFI:
2674 case XFS_LI_EFD:
2675 case XFS_LI_DQUOT:
2676 /* nothing to do in pass 1 */
2677 return 0;
2678 default:
2679 xlog_warn(
2680 "XFS: invalid item type (%d) xlog_recover_commit_pass1",
2681 ITEM_TYPE(item));
2682 ASSERT(0);
2683 return XFS_ERROR(EIO);
2684 }
2685}
2686
2687STATIC int
2688xlog_recover_commit_pass2(
2689 struct log *log,
2690 struct xlog_recover *trans,
2691 xlog_recover_item_t *item)
2692{
2693 trace_xfs_log_recover_item_recover(log, trans, item, XLOG_RECOVER_PASS2);
2694
2695 switch (ITEM_TYPE(item)) {
2696 case XFS_LI_BUF:
2697 return xlog_recover_buffer_pass2(log, item);
2698 case XFS_LI_INODE:
2699 return xlog_recover_inode_pass2(log, item);
2700 case XFS_LI_EFI:
2701 return xlog_recover_efi_pass2(log, item, trans->r_lsn);
2702 case XFS_LI_EFD:
2703 return xlog_recover_efd_pass2(log, item);
2704 case XFS_LI_DQUOT:
2705 return xlog_recover_dquot_pass2(log, item);
2706 case XFS_LI_QUOTAOFF:
2707 /* nothing to do in pass2 */
2708 return 0;
2709 default:
2710 xlog_warn(
2711 "XFS: invalid item type (%d) xlog_recover_commit_pass2",
2712 ITEM_TYPE(item));
2713 ASSERT(0);
2714 return XFS_ERROR(EIO);
2715 }
2716}
2717
2718/*
2719 * Perform the transaction.
2720 *
2721 * If the transaction modifies a buffer or inode, do it now. Otherwise,
2722 * EFIs and EFDs get queued up by adding entries into the AIL for them.
2723 */
2724STATIC int
2874xlog_recover_commit_trans( 2725xlog_recover_commit_trans(
2875 xlog_t *log, 2726 struct log *log,
2876 xlog_recover_t *trans, 2727 struct xlog_recover *trans,
2877 int pass) 2728 int pass)
2878{ 2729{
2879 int error; 2730 int error = 0;
2731 xlog_recover_item_t *item;
2880 2732
2881 hlist_del(&trans->r_list); 2733 hlist_del(&trans->r_list);
2882 if ((error = xlog_recover_do_trans(log, trans, pass))) 2734
2735 error = xlog_recover_reorder_trans(log, trans, pass);
2736 if (error)
2883 return error; 2737 return error;
2884 xlog_recover_free_trans(trans); /* no error */ 2738
2739 list_for_each_entry(item, &trans->r_itemq, ri_list) {
2740 if (pass == XLOG_RECOVER_PASS1)
2741 error = xlog_recover_commit_pass1(log, trans, item);
2742 else
2743 error = xlog_recover_commit_pass2(log, trans, item);
2744 if (error)
2745 return error;
2746 }
2747
2748 xlog_recover_free_trans(trans);
2885 return 0; 2749 return 0;
2886} 2750}
2887 2751
@@ -3011,7 +2875,7 @@ xlog_recover_process_efi(
3011 xfs_extent_t *extp; 2875 xfs_extent_t *extp;
3012 xfs_fsblock_t startblock_fsb; 2876 xfs_fsblock_t startblock_fsb;
3013 2877
3014 ASSERT(!(efip->efi_flags & XFS_EFI_RECOVERED)); 2878 ASSERT(!test_bit(XFS_EFI_RECOVERED, &efip->efi_flags));
3015 2879
3016 /* 2880 /*
3017 * First check the validity of the extents described by the 2881 * First check the validity of the extents described by the
@@ -3050,7 +2914,7 @@ xlog_recover_process_efi(
3050 extp->ext_len); 2914 extp->ext_len);
3051 } 2915 }
3052 2916
3053 efip->efi_flags |= XFS_EFI_RECOVERED; 2917 set_bit(XFS_EFI_RECOVERED, &efip->efi_flags);
3054 error = xfs_trans_commit(tp, 0); 2918 error = xfs_trans_commit(tp, 0);
3055 return error; 2919 return error;
3056 2920
@@ -3107,7 +2971,7 @@ xlog_recover_process_efis(
3107 * Skip EFIs that we've already processed. 2971 * Skip EFIs that we've already processed.
3108 */ 2972 */
3109 efip = (xfs_efi_log_item_t *)lip; 2973 efip = (xfs_efi_log_item_t *)lip;
3110 if (efip->efi_flags & XFS_EFI_RECOVERED) { 2974 if (test_bit(XFS_EFI_RECOVERED, &efip->efi_flags)) {
3111 lip = xfs_trans_ail_cursor_next(ailp, &cur); 2975 lip = xfs_trans_ail_cursor_next(ailp, &cur);
3112 continue; 2976 continue;
3113 } 2977 }
@@ -3724,7 +3588,7 @@ xlog_do_log_recovery(
3724 xfs_daddr_t head_blk, 3588 xfs_daddr_t head_blk,
3725 xfs_daddr_t tail_blk) 3589 xfs_daddr_t tail_blk)
3726{ 3590{
3727 int error; 3591 int error, i;
3728 3592
3729 ASSERT(head_blk != tail_blk); 3593 ASSERT(head_blk != tail_blk);
3730 3594
@@ -3732,10 +3596,12 @@ xlog_do_log_recovery(
3732 * First do a pass to find all of the cancelled buf log items. 3596 * First do a pass to find all of the cancelled buf log items.
3733 * Store them in the buf_cancel_table for use in the second pass. 3597 * Store them in the buf_cancel_table for use in the second pass.
3734 */ 3598 */
3735 log->l_buf_cancel_table = 3599 log->l_buf_cancel_table = kmem_zalloc(XLOG_BC_TABLE_SIZE *
3736 (xfs_buf_cancel_t **)kmem_zalloc(XLOG_BC_TABLE_SIZE * 3600 sizeof(struct list_head),
3737 sizeof(xfs_buf_cancel_t*),
3738 KM_SLEEP); 3601 KM_SLEEP);
3602 for (i = 0; i < XLOG_BC_TABLE_SIZE; i++)
3603 INIT_LIST_HEAD(&log->l_buf_cancel_table[i]);
3604
3739 error = xlog_do_recovery_pass(log, head_blk, tail_blk, 3605 error = xlog_do_recovery_pass(log, head_blk, tail_blk,
3740 XLOG_RECOVER_PASS1); 3606 XLOG_RECOVER_PASS1);
3741 if (error != 0) { 3607 if (error != 0) {
@@ -3754,7 +3620,7 @@ xlog_do_log_recovery(
3754 int i; 3620 int i;
3755 3621
3756 for (i = 0; i < XLOG_BC_TABLE_SIZE; i++) 3622 for (i = 0; i < XLOG_BC_TABLE_SIZE; i++)
3757 ASSERT(log->l_buf_cancel_table[i] == NULL); 3623 ASSERT(list_empty(&log->l_buf_cancel_table[i]));
3758 } 3624 }
3759#endif /* DEBUG */ 3625#endif /* DEBUG */
3760 3626
@@ -3934,7 +3800,7 @@ xlog_recover_finish(
3934 log->l_flags &= ~XLOG_RECOVERY_NEEDED; 3800 log->l_flags &= ~XLOG_RECOVERY_NEEDED;
3935 } else { 3801 } else {
3936 cmn_err(CE_DEBUG, 3802 cmn_err(CE_DEBUG,
3937 "!Ending clean XFS mount for filesystem: %s\n", 3803 "Ending clean XFS mount for filesystem: %s\n",
3938 log->l_mp->m_fsname); 3804 log->l_mp->m_fsname);
3939 } 3805 }
3940 return 0; 3806 return 0;
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index b1498ab5a39..d447aef84bc 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -275,6 +275,7 @@ xfs_free_perag(
275 pag = radix_tree_delete(&mp->m_perag_tree, agno); 275 pag = radix_tree_delete(&mp->m_perag_tree, agno);
276 spin_unlock(&mp->m_perag_lock); 276 spin_unlock(&mp->m_perag_lock);
277 ASSERT(pag); 277 ASSERT(pag);
278 ASSERT(atomic_read(&pag->pag_ref) == 0);
278 call_rcu(&pag->rcu_head, __xfs_free_perag); 279 call_rcu(&pag->rcu_head, __xfs_free_perag);
279 } 280 }
280} 281}
@@ -471,7 +472,7 @@ xfs_initialize_perag(
471 goto out_unwind; 472 goto out_unwind;
472 pag->pag_agno = index; 473 pag->pag_agno = index;
473 pag->pag_mount = mp; 474 pag->pag_mount = mp;
474 rwlock_init(&pag->pag_ici_lock); 475 spin_lock_init(&pag->pag_ici_lock);
475 mutex_init(&pag->pag_ici_reclaim_lock); 476 mutex_init(&pag->pag_ici_reclaim_lock);
476 INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC); 477 INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC);
477 spin_lock_init(&pag->pag_buf_lock); 478 spin_lock_init(&pag->pag_buf_lock);
@@ -974,6 +975,24 @@ xfs_set_rw_sizes(xfs_mount_t *mp)
974} 975}
975 976
976/* 977/*
978 * precalculate the low space thresholds for dynamic speculative preallocation.
979 */
980void
981xfs_set_low_space_thresholds(
982 struct xfs_mount *mp)
983{
984 int i;
985
986 for (i = 0; i < XFS_LOWSP_MAX; i++) {
987 __uint64_t space = mp->m_sb.sb_dblocks;
988
989 do_div(space, 100);
990 mp->m_low_space[i] = space * (i + 1);
991 }
992}
993
994
995/*
977 * Set whether we're using inode alignment. 996 * Set whether we're using inode alignment.
978 */ 997 */
979STATIC void 998STATIC void
@@ -1195,6 +1214,9 @@ xfs_mountfs(
1195 */ 1214 */
1196 xfs_set_rw_sizes(mp); 1215 xfs_set_rw_sizes(mp);
1197 1216
1217 /* set the low space thresholds for dynamic preallocation */
1218 xfs_set_low_space_thresholds(mp);
1219
1198 /* 1220 /*
1199 * Set the inode cluster size. 1221 * Set the inode cluster size.
1200 * This may still be overridden by the file system 1222 * This may still be overridden by the file system
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 5861b498074..a62e8971539 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -103,6 +103,16 @@ extern int xfs_icsb_modify_counters(struct xfs_mount *, xfs_sb_field_t,
103 xfs_mod_incore_sb(mp, field, delta, rsvd) 103 xfs_mod_incore_sb(mp, field, delta, rsvd)
104#endif 104#endif
105 105
106/* dynamic preallocation free space thresholds, 5% down to 1% */
107enum {
108 XFS_LOWSP_1_PCNT = 0,
109 XFS_LOWSP_2_PCNT,
110 XFS_LOWSP_3_PCNT,
111 XFS_LOWSP_4_PCNT,
112 XFS_LOWSP_5_PCNT,
113 XFS_LOWSP_MAX,
114};
115
106typedef struct xfs_mount { 116typedef struct xfs_mount {
107 struct super_block *m_super; 117 struct super_block *m_super;
108 xfs_tid_t m_tid; /* next unused tid for fs */ 118 xfs_tid_t m_tid; /* next unused tid for fs */
@@ -202,6 +212,8 @@ typedef struct xfs_mount {
202 __int64_t m_update_flags; /* sb flags we need to update 212 __int64_t m_update_flags; /* sb flags we need to update
203 on the next remount,rw */ 213 on the next remount,rw */
204 struct shrinker m_inode_shrink; /* inode reclaim shrinker */ 214 struct shrinker m_inode_shrink; /* inode reclaim shrinker */
215 int64_t m_low_space[XFS_LOWSP_MAX];
216 /* low free space thresholds */
205} xfs_mount_t; 217} xfs_mount_t;
206 218
207/* 219/*
@@ -379,6 +391,8 @@ extern int xfs_sb_validate_fsb_count(struct xfs_sb *, __uint64_t);
379 391
380extern int xfs_dev_is_read_only(struct xfs_mount *, char *); 392extern int xfs_dev_is_read_only(struct xfs_mount *, char *);
381 393
394extern void xfs_set_low_space_thresholds(struct xfs_mount *);
395
382#endif /* __KERNEL__ */ 396#endif /* __KERNEL__ */
383 397
384extern void xfs_mod_sb(struct xfs_trans *, __int64_t); 398extern void xfs_mod_sb(struct xfs_trans *, __int64_t);
diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c
index 45ce15dc5b2..edfa178bafb 100644
--- a/fs/xfs/xfs_mru_cache.c
+++ b/fs/xfs/xfs_mru_cache.c
@@ -408,7 +408,7 @@ xfs_mru_cache_flush(
408 spin_lock(&mru->lock); 408 spin_lock(&mru->lock);
409 if (mru->queued) { 409 if (mru->queued) {
410 spin_unlock(&mru->lock); 410 spin_unlock(&mru->lock);
411 cancel_rearming_delayed_workqueue(xfs_mru_reap_wq, &mru->work); 411 cancel_delayed_work_sync(&mru->work);
412 spin_lock(&mru->lock); 412 spin_lock(&mru->lock);
413 } 413 }
414 414
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
index e0e64b113bd..9bb6eda4cd2 100644
--- a/fs/xfs/xfs_quota.h
+++ b/fs/xfs/xfs_quota.h
@@ -346,8 +346,17 @@ xfs_qm_vop_dqalloc(struct xfs_inode *ip, uid_t uid, gid_t gid, prid_t prid,
346#define xfs_trans_mod_dquot_byino(tp, ip, fields, delta) 346#define xfs_trans_mod_dquot_byino(tp, ip, fields, delta)
347#define xfs_trans_apply_dquot_deltas(tp) 347#define xfs_trans_apply_dquot_deltas(tp)
348#define xfs_trans_unreserve_and_mod_dquots(tp) 348#define xfs_trans_unreserve_and_mod_dquots(tp)
349#define xfs_trans_reserve_quota_nblks(tp, ip, nblks, ninos, flags) (0) 349static inline int xfs_trans_reserve_quota_nblks(struct xfs_trans *tp,
350#define xfs_trans_reserve_quota_bydquots(tp, mp, u, g, nb, ni, fl) (0) 350 struct xfs_inode *ip, long nblks, long ninos, uint flags)
351{
352 return 0;
353}
354static inline int xfs_trans_reserve_quota_bydquots(struct xfs_trans *tp,
355 struct xfs_mount *mp, struct xfs_dquot *udqp,
356 struct xfs_dquot *gdqp, long nblks, long nions, uint flags)
357{
358 return 0;
359}
351#define xfs_qm_vop_create_dqattach(tp, ip, u, g) 360#define xfs_qm_vop_create_dqattach(tp, ip, u, g)
352#define xfs_qm_vop_rename_dqattach(it) (0) 361#define xfs_qm_vop_rename_dqattach(it) (0)
353#define xfs_qm_vop_chown(tp, ip, old, new) (NULL) 362#define xfs_qm_vop_chown(tp, ip, old, new) (NULL)
@@ -357,11 +366,14 @@ xfs_qm_vop_dqalloc(struct xfs_inode *ip, uid_t uid, gid_t gid, prid_t prid,
357#define xfs_qm_dqdetach(ip) 366#define xfs_qm_dqdetach(ip)
358#define xfs_qm_dqrele(d) 367#define xfs_qm_dqrele(d)
359#define xfs_qm_statvfs(ip, s) 368#define xfs_qm_statvfs(ip, s)
360#define xfs_qm_sync(mp, fl) (0) 369static inline int xfs_qm_sync(struct xfs_mount *mp, int flags)
370{
371 return 0;
372}
361#define xfs_qm_newmount(mp, a, b) (0) 373#define xfs_qm_newmount(mp, a, b) (0)
362#define xfs_qm_mount_quotas(mp) 374#define xfs_qm_mount_quotas(mp)
363#define xfs_qm_unmount(mp) 375#define xfs_qm_unmount(mp)
364#define xfs_qm_unmount_quotas(mp) (0) 376#define xfs_qm_unmount_quotas(mp)
365#endif /* CONFIG_XFS_QUOTA */ 377#endif /* CONFIG_XFS_QUOTA */
366 378
367#define xfs_trans_unreserve_quota_nblks(tp, ip, nblks, ninos, flags) \ 379#define xfs_trans_unreserve_quota_nblks(tp, ip, nblks, ninos, flags) \
diff --git a/fs/xfs/xfs_rename.c b/fs/xfs/xfs_rename.c
index d2af0a8381a..77a59891734 100644
--- a/fs/xfs/xfs_rename.c
+++ b/fs/xfs/xfs_rename.c
@@ -297,6 +297,7 @@ xfs_rename(
297 * it and some incremental backup programs won't work without it. 297 * it and some incremental backup programs won't work without it.
298 */ 298 */
299 xfs_trans_ichgtime(tp, src_ip, XFS_ICHGTIME_CHG); 299 xfs_trans_ichgtime(tp, src_ip, XFS_ICHGTIME_CHG);
300 xfs_trans_log_inode(tp, src_ip, XFS_ILOG_CORE);
300 301
301 /* 302 /*
302 * Adjust the link count on src_dp. This is necessary when 303 * Adjust the link count on src_dp. This is necessary when
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index f6d956b7711..76922793f64 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -1137,7 +1137,7 @@ out_undo_fdblocks:
1137 if (blkdelta) 1137 if (blkdelta)
1138 xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, -blkdelta, rsvd); 1138 xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, -blkdelta, rsvd);
1139out: 1139out:
1140 ASSERT(error = 0); 1140 ASSERT(error == 0);
1141 return; 1141 return;
1142} 1142}
1143 1143
@@ -1350,7 +1350,7 @@ xfs_trans_fill_vecs(
1350 * they could be immediately flushed and we'd have to race with the flusher 1350 * they could be immediately flushed and we'd have to race with the flusher
1351 * trying to pull the item from the AIL as we add it. 1351 * trying to pull the item from the AIL as we add it.
1352 */ 1352 */
1353void 1353static void
1354xfs_trans_item_committed( 1354xfs_trans_item_committed(
1355 struct xfs_log_item *lip, 1355 struct xfs_log_item *lip,
1356 xfs_lsn_t commit_lsn, 1356 xfs_lsn_t commit_lsn,
@@ -1425,21 +1425,120 @@ xfs_trans_committed(
1425 xfs_trans_free(tp); 1425 xfs_trans_free(tp);
1426} 1426}
1427 1427
1428static inline void
1429xfs_log_item_batch_insert(
1430 struct xfs_ail *ailp,
1431 struct xfs_log_item **log_items,
1432 int nr_items,
1433 xfs_lsn_t commit_lsn)
1434{
1435 int i;
1436
1437 spin_lock(&ailp->xa_lock);
1438 /* xfs_trans_ail_update_bulk drops ailp->xa_lock */
1439 xfs_trans_ail_update_bulk(ailp, log_items, nr_items, commit_lsn);
1440
1441 for (i = 0; i < nr_items; i++)
1442 IOP_UNPIN(log_items[i], 0);
1443}
1444
1428/* 1445/*
1429 * Called from the trans_commit code when we notice that 1446 * Bulk operation version of xfs_trans_committed that takes a log vector of
1430 * the filesystem is in the middle of a forced shutdown. 1447 * items to insert into the AIL. This uses bulk AIL insertion techniques to
1448 * minimise lock traffic.
1449 *
1450 * If we are called with the aborted flag set, it is because a log write during
1451 * a CIL checkpoint commit has failed. In this case, all the items in the
1452 * checkpoint have already gone through IOP_COMMITED and IOP_UNLOCK, which
1453 * means that checkpoint commit abort handling is treated exactly the same
1454 * as an iclog write error even though we haven't started any IO yet. Hence in
1455 * this case all we need to do is IOP_COMMITTED processing, followed by an
1456 * IOP_UNPIN(aborted) call.
1457 */
1458void
1459xfs_trans_committed_bulk(
1460 struct xfs_ail *ailp,
1461 struct xfs_log_vec *log_vector,
1462 xfs_lsn_t commit_lsn,
1463 int aborted)
1464{
1465#define LOG_ITEM_BATCH_SIZE 32
1466 struct xfs_log_item *log_items[LOG_ITEM_BATCH_SIZE];
1467 struct xfs_log_vec *lv;
1468 int i = 0;
1469
1470 /* unpin all the log items */
1471 for (lv = log_vector; lv; lv = lv->lv_next ) {
1472 struct xfs_log_item *lip = lv->lv_item;
1473 xfs_lsn_t item_lsn;
1474
1475 if (aborted)
1476 lip->li_flags |= XFS_LI_ABORTED;
1477 item_lsn = IOP_COMMITTED(lip, commit_lsn);
1478
1479 /* item_lsn of -1 means the item was freed */
1480 if (XFS_LSN_CMP(item_lsn, (xfs_lsn_t)-1) == 0)
1481 continue;
1482
1483 /*
1484 * if we are aborting the operation, no point in inserting the
1485 * object into the AIL as we are in a shutdown situation.
1486 */
1487 if (aborted) {
1488 ASSERT(XFS_FORCED_SHUTDOWN(ailp->xa_mount));
1489 IOP_UNPIN(lip, 1);
1490 continue;
1491 }
1492
1493 if (item_lsn != commit_lsn) {
1494
1495 /*
1496 * Not a bulk update option due to unusual item_lsn.
1497 * Push into AIL immediately, rechecking the lsn once
1498 * we have the ail lock. Then unpin the item.
1499 */
1500 spin_lock(&ailp->xa_lock);
1501 if (XFS_LSN_CMP(item_lsn, lip->li_lsn) > 0)
1502 xfs_trans_ail_update(ailp, lip, item_lsn);
1503 else
1504 spin_unlock(&ailp->xa_lock);
1505 IOP_UNPIN(lip, 0);
1506 continue;
1507 }
1508
1509 /* Item is a candidate for bulk AIL insert. */
1510 log_items[i++] = lv->lv_item;
1511 if (i >= LOG_ITEM_BATCH_SIZE) {
1512 xfs_log_item_batch_insert(ailp, log_items,
1513 LOG_ITEM_BATCH_SIZE, commit_lsn);
1514 i = 0;
1515 }
1516 }
1517
1518 /* make sure we insert the remainder! */
1519 if (i)
1520 xfs_log_item_batch_insert(ailp, log_items, i, commit_lsn);
1521}
1522
1523/*
1524 * Called from the trans_commit code when we notice that the filesystem is in
1525 * the middle of a forced shutdown.
1526 *
1527 * When we are called here, we have already pinned all the items in the
1528 * transaction. However, neither IOP_COMMITTING or IOP_UNLOCK has been called
1529 * so we can simply walk the items in the transaction, unpin them with an abort
1530 * flag and then free the items. Note that unpinning the items can result in
1531 * them being freed immediately, so we need to use a safe list traversal method
1532 * here.
1431 */ 1533 */
1432STATIC void 1534STATIC void
1433xfs_trans_uncommit( 1535xfs_trans_uncommit(
1434 struct xfs_trans *tp, 1536 struct xfs_trans *tp,
1435 uint flags) 1537 uint flags)
1436{ 1538{
1437 struct xfs_log_item_desc *lidp; 1539 struct xfs_log_item_desc *lidp, *n;
1438 1540
1439 list_for_each_entry(lidp, &tp->t_items, lid_trans) { 1541 list_for_each_entry_safe(lidp, n, &tp->t_items, lid_trans) {
1440 /*
1441 * Unpin all but those that aren't dirty.
1442 */
1443 if (lidp->lid_flags & XFS_LID_DIRTY) 1542 if (lidp->lid_flags & XFS_LID_DIRTY)
1444 IOP_UNPIN(lidp->lid_item, 1); 1543 IOP_UNPIN(lidp->lid_item, 1);
1445 } 1544 }
@@ -1656,7 +1755,6 @@ xfs_trans_commit_cil(
1656 int flags) 1755 int flags)
1657{ 1756{
1658 struct xfs_log_vec *log_vector; 1757 struct xfs_log_vec *log_vector;
1659 int error;
1660 1758
1661 /* 1759 /*
1662 * Get each log item to allocate a vector structure for 1760 * Get each log item to allocate a vector structure for
@@ -1667,9 +1765,7 @@ xfs_trans_commit_cil(
1667 if (!log_vector) 1765 if (!log_vector)
1668 return ENOMEM; 1766 return ENOMEM;
1669 1767
1670 error = xfs_log_commit_cil(mp, tp, log_vector, commit_lsn, flags); 1768 xfs_log_commit_cil(mp, tp, log_vector, commit_lsn, flags);
1671 if (error)
1672 return error;
1673 1769
1674 current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); 1770 current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
1675 xfs_trans_free(tp); 1771 xfs_trans_free(tp);
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 246286b77a8..c2042b736b8 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -294,8 +294,8 @@ struct xfs_log_item_desc {
294#define XFS_ALLOC_BTREE_REF 2 294#define XFS_ALLOC_BTREE_REF 2
295#define XFS_BMAP_BTREE_REF 2 295#define XFS_BMAP_BTREE_REF 2
296#define XFS_DIR_BTREE_REF 2 296#define XFS_DIR_BTREE_REF 2
297#define XFS_INO_REF 2
297#define XFS_ATTR_BTREE_REF 1 298#define XFS_ATTR_BTREE_REF 1
298#define XFS_INO_REF 1
299#define XFS_DQUOT_REF 1 299#define XFS_DQUOT_REF 1
300 300
301#ifdef __KERNEL__ 301#ifdef __KERNEL__
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index dc9069568ff..c5bbbc45db9 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -28,8 +28,8 @@
28#include "xfs_trans_priv.h" 28#include "xfs_trans_priv.h"
29#include "xfs_error.h" 29#include "xfs_error.h"
30 30
31STATIC void xfs_ail_insert(struct xfs_ail *, xfs_log_item_t *); 31STATIC void xfs_ail_splice(struct xfs_ail *, struct list_head *, xfs_lsn_t);
32STATIC xfs_log_item_t * xfs_ail_delete(struct xfs_ail *, xfs_log_item_t *); 32STATIC void xfs_ail_delete(struct xfs_ail *, xfs_log_item_t *);
33STATIC xfs_log_item_t * xfs_ail_min(struct xfs_ail *); 33STATIC xfs_log_item_t * xfs_ail_min(struct xfs_ail *);
34STATIC xfs_log_item_t * xfs_ail_next(struct xfs_ail *, xfs_log_item_t *); 34STATIC xfs_log_item_t * xfs_ail_next(struct xfs_ail *, xfs_log_item_t *);
35 35
@@ -449,129 +449,152 @@ xfs_trans_unlocked_item(
449 xfs_log_move_tail(ailp->xa_mount, 1); 449 xfs_log_move_tail(ailp->xa_mount, 1);
450} /* xfs_trans_unlocked_item */ 450} /* xfs_trans_unlocked_item */
451 451
452
453/* 452/*
454 * Update the position of the item in the AIL with the new 453 * xfs_trans_ail_update_bulk - bulk AIL insertion operation.
455 * lsn. If it is not yet in the AIL, add it. Otherwise, move 454 *
456 * it to its new position by removing it and re-adding it. 455 * @xfs_trans_ail_update_bulk takes an array of log items that all need to be
456 * positioned at the same LSN in the AIL. If an item is not in the AIL, it will
457 * be added. Otherwise, it will be repositioned by removing it and re-adding
458 * it to the AIL. If we move the first item in the AIL, update the log tail to
459 * match the new minimum LSN in the AIL.
457 * 460 *
458 * Wakeup anyone with an lsn less than the item's lsn. If the item 461 * This function executes the update operations on all the items in the
459 * we move in the AIL is the minimum one, update the tail lsn in the 462 * array under a single hold of the AIL lock, which the caller must already
460 * log manager. 463 * hold. As a result, with the AIL lock held, we need to check each log
464 * item LSN to confirm it needs to be moved forward in the AIL.
461 * 465 *
462 * This function must be called with the AIL lock held. The lock 466 * To optimise the insert operation, we delete all the items from the AIL in
463 * is dropped before returning. 467 * the first pass, moving them into a temporary list, then splice the temporary
468 * list into the correct position in the AIL. This avoids needing to do an
469 * insert operation on every item.
470 *
471 * This function must be called with the AIL lock held. The lock is dropped
472 * before returning.
464 */ 473 */
465void 474void
466xfs_trans_ail_update( 475xfs_trans_ail_update_bulk(
467 struct xfs_ail *ailp, 476 struct xfs_ail *ailp,
468 xfs_log_item_t *lip, 477 struct xfs_log_item **log_items,
469 xfs_lsn_t lsn) __releases(ailp->xa_lock) 478 int nr_items,
479 xfs_lsn_t lsn) __releases(ailp->xa_lock)
470{ 480{
471 xfs_log_item_t *dlip = NULL; 481 xfs_log_item_t *mlip;
472 xfs_log_item_t *mlip; /* ptr to minimum lip */
473 xfs_lsn_t tail_lsn; 482 xfs_lsn_t tail_lsn;
483 int mlip_changed = 0;
484 int i;
485 LIST_HEAD(tmp);
474 486
475 mlip = xfs_ail_min(ailp); 487 mlip = xfs_ail_min(ailp);
476 488
477 if (lip->li_flags & XFS_LI_IN_AIL) { 489 for (i = 0; i < nr_items; i++) {
478 dlip = xfs_ail_delete(ailp, lip); 490 struct xfs_log_item *lip = log_items[i];
479 ASSERT(dlip == lip); 491 if (lip->li_flags & XFS_LI_IN_AIL) {
480 xfs_trans_ail_cursor_clear(ailp, dlip); 492 /* check if we really need to move the item */
481 } else { 493 if (XFS_LSN_CMP(lsn, lip->li_lsn) <= 0)
482 lip->li_flags |= XFS_LI_IN_AIL; 494 continue;
495
496 xfs_ail_delete(ailp, lip);
497 if (mlip == lip)
498 mlip_changed = 1;
499 } else {
500 lip->li_flags |= XFS_LI_IN_AIL;
501 }
502 lip->li_lsn = lsn;
503 list_add(&lip->li_ail, &tmp);
483 } 504 }
484 505
485 lip->li_lsn = lsn; 506 xfs_ail_splice(ailp, &tmp, lsn);
486 xfs_ail_insert(ailp, lip);
487 507
488 if (mlip == dlip) { 508 if (!mlip_changed) {
489 mlip = xfs_ail_min(ailp);
490 /*
491 * It is not safe to access mlip after the AIL lock is
492 * dropped, so we must get a copy of li_lsn before we do
493 * so. This is especially important on 32-bit platforms
494 * where accessing and updating 64-bit values like li_lsn
495 * is not atomic.
496 */
497 tail_lsn = mlip->li_lsn;
498 spin_unlock(&ailp->xa_lock);
499 xfs_log_move_tail(ailp->xa_mount, tail_lsn);
500 } else {
501 spin_unlock(&ailp->xa_lock); 509 spin_unlock(&ailp->xa_lock);
510 return;
502 } 511 }
503 512
504 513 /*
505} /* xfs_trans_update_ail */ 514 * It is not safe to access mlip after the AIL lock is dropped, so we
515 * must get a copy of li_lsn before we do so. This is especially
516 * important on 32-bit platforms where accessing and updating 64-bit
517 * values like li_lsn is not atomic.
518 */
519 mlip = xfs_ail_min(ailp);
520 tail_lsn = mlip->li_lsn;
521 spin_unlock(&ailp->xa_lock);
522 xfs_log_move_tail(ailp->xa_mount, tail_lsn);
523}
506 524
507/* 525/*
508 * Delete the given item from the AIL. It must already be in 526 * xfs_trans_ail_delete_bulk - remove multiple log items from the AIL
509 * the AIL.
510 * 527 *
511 * Wakeup anyone with an lsn less than item's lsn. If the item 528 * @xfs_trans_ail_delete_bulk takes an array of log items that all need to be
512 * we delete in the AIL is the minimum one, update the tail lsn in the 529 * removed from the AIL. The caller is already holding the AIL lock, and has done
513 * log manager. 530 * all the checks necessary to ensure the items passed in via @log_items are
531 * ready for deletion. This includes checking that the items are in the AIL.
514 * 532 *
515 * Clear the IN_AIL flag from the item, reset its lsn to 0, and 533 * For each log item to be removed, unlink it from the AIL, clear the IN_AIL
516 * bump the AIL's generation count to indicate that the tree 534 * flag from the item and reset the item's lsn to 0. If we remove the first
517 * has changed. 535 * item in the AIL, update the log tail to match the new minimum LSN in the
536 * AIL.
518 * 537 *
519 * This function must be called with the AIL lock held. The lock 538 * This function will not drop the AIL lock until all items are removed from
520 * is dropped before returning. 539 * the AIL to minimise the amount of lock traffic on the AIL. This does not
540 * greatly increase the AIL hold time, but does significantly reduce the amount
541 * of traffic on the lock, especially during IO completion.
542 *
543 * This function must be called with the AIL lock held. The lock is dropped
544 * before returning.
521 */ 545 */
522void 546void
523xfs_trans_ail_delete( 547xfs_trans_ail_delete_bulk(
524 struct xfs_ail *ailp, 548 struct xfs_ail *ailp,
525 xfs_log_item_t *lip) __releases(ailp->xa_lock) 549 struct xfs_log_item **log_items,
550 int nr_items) __releases(ailp->xa_lock)
526{ 551{
527 xfs_log_item_t *dlip;
528 xfs_log_item_t *mlip; 552 xfs_log_item_t *mlip;
529 xfs_lsn_t tail_lsn; 553 xfs_lsn_t tail_lsn;
554 int mlip_changed = 0;
555 int i;
530 556
531 if (lip->li_flags & XFS_LI_IN_AIL) { 557 mlip = xfs_ail_min(ailp);
532 mlip = xfs_ail_min(ailp);
533 dlip = xfs_ail_delete(ailp, lip);
534 ASSERT(dlip == lip);
535 xfs_trans_ail_cursor_clear(ailp, dlip);
536
537 558
538 lip->li_flags &= ~XFS_LI_IN_AIL; 559 for (i = 0; i < nr_items; i++) {
539 lip->li_lsn = 0; 560 struct xfs_log_item *lip = log_items[i];
561 if (!(lip->li_flags & XFS_LI_IN_AIL)) {
562 struct xfs_mount *mp = ailp->xa_mount;
540 563
541 if (mlip == dlip) {
542 mlip = xfs_ail_min(ailp);
543 /*
544 * It is not safe to access mlip after the AIL lock
545 * is dropped, so we must get a copy of li_lsn
546 * before we do so. This is especially important
547 * on 32-bit platforms where accessing and updating
548 * 64-bit values like li_lsn is not atomic.
549 */
550 tail_lsn = mlip ? mlip->li_lsn : 0;
551 spin_unlock(&ailp->xa_lock);
552 xfs_log_move_tail(ailp->xa_mount, tail_lsn);
553 } else {
554 spin_unlock(&ailp->xa_lock); 564 spin_unlock(&ailp->xa_lock);
565 if (!XFS_FORCED_SHUTDOWN(mp)) {
566 xfs_cmn_err(XFS_PTAG_AILDELETE, CE_ALERT, mp,
567 "%s: attempting to delete a log item that is not in the AIL",
568 __func__);
569 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
570 }
571 return;
555 } 572 }
573
574 xfs_ail_delete(ailp, lip);
575 lip->li_flags &= ~XFS_LI_IN_AIL;
576 lip->li_lsn = 0;
577 if (mlip == lip)
578 mlip_changed = 1;
556 } 579 }
557 else {
558 /*
559 * If the file system is not being shutdown, we are in
560 * serious trouble if we get to this stage.
561 */
562 struct xfs_mount *mp = ailp->xa_mount;
563 580
581 if (!mlip_changed) {
564 spin_unlock(&ailp->xa_lock); 582 spin_unlock(&ailp->xa_lock);
565 if (!XFS_FORCED_SHUTDOWN(mp)) { 583 return;
566 xfs_cmn_err(XFS_PTAG_AILDELETE, CE_ALERT, mp,
567 "%s: attempting to delete a log item that is not in the AIL",
568 __func__);
569 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
570 }
571 } 584 }
572}
573
574 585
586 /*
587 * It is not safe to access mlip after the AIL lock is dropped, so we
588 * must get a copy of li_lsn before we do so. This is especially
589 * important on 32-bit platforms where accessing and updating 64-bit
590 * values like li_lsn is not atomic. It is possible we've emptied the
591 * AIL here, so if that is the case, pass an LSN of 0 to the tail move.
592 */
593 mlip = xfs_ail_min(ailp);
594 tail_lsn = mlip ? mlip->li_lsn : 0;
595 spin_unlock(&ailp->xa_lock);
596 xfs_log_move_tail(ailp->xa_mount, tail_lsn);
597}
575 598
576/* 599/*
577 * The active item list (AIL) is a doubly linked list of log 600 * The active item list (AIL) is a doubly linked list of log
@@ -623,16 +646,13 @@ xfs_trans_ail_destroy(
623} 646}
624 647
625/* 648/*
626 * Insert the given log item into the AIL. 649 * splice the log item list into the AIL at the given LSN.
627 * We almost always insert at the end of the list, so on inserts
628 * we search from the end of the list to find where the
629 * new item belongs.
630 */ 650 */
631STATIC void 651STATIC void
632xfs_ail_insert( 652xfs_ail_splice(
633 struct xfs_ail *ailp, 653 struct xfs_ail *ailp,
634 xfs_log_item_t *lip) 654 struct list_head *list,
635/* ARGSUSED */ 655 xfs_lsn_t lsn)
636{ 656{
637 xfs_log_item_t *next_lip; 657 xfs_log_item_t *next_lip;
638 658
@@ -640,39 +660,33 @@ xfs_ail_insert(
640 * If the list is empty, just insert the item. 660 * If the list is empty, just insert the item.
641 */ 661 */
642 if (list_empty(&ailp->xa_ail)) { 662 if (list_empty(&ailp->xa_ail)) {
643 list_add(&lip->li_ail, &ailp->xa_ail); 663 list_splice(list, &ailp->xa_ail);
644 return; 664 return;
645 } 665 }
646 666
647 list_for_each_entry_reverse(next_lip, &ailp->xa_ail, li_ail) { 667 list_for_each_entry_reverse(next_lip, &ailp->xa_ail, li_ail) {
648 if (XFS_LSN_CMP(next_lip->li_lsn, lip->li_lsn) <= 0) 668 if (XFS_LSN_CMP(next_lip->li_lsn, lsn) <= 0)
649 break; 669 break;
650 } 670 }
651 671
652 ASSERT((&next_lip->li_ail == &ailp->xa_ail) || 672 ASSERT((&next_lip->li_ail == &ailp->xa_ail) ||
653 (XFS_LSN_CMP(next_lip->li_lsn, lip->li_lsn) <= 0)); 673 (XFS_LSN_CMP(next_lip->li_lsn, lsn) <= 0));
654
655 list_add(&lip->li_ail, &next_lip->li_ail);
656 674
657 xfs_ail_check(ailp, lip); 675 list_splice_init(list, &next_lip->li_ail);
658 return; 676 return;
659} 677}
660 678
661/* 679/*
662 * Delete the given item from the AIL. Return a pointer to the item. 680 * Delete the given item from the AIL.
663 */ 681 */
664/*ARGSUSED*/ 682STATIC void
665STATIC xfs_log_item_t *
666xfs_ail_delete( 683xfs_ail_delete(
667 struct xfs_ail *ailp, 684 struct xfs_ail *ailp,
668 xfs_log_item_t *lip) 685 xfs_log_item_t *lip)
669/* ARGSUSED */
670{ 686{
671 xfs_ail_check(ailp, lip); 687 xfs_ail_check(ailp, lip);
672
673 list_del(&lip->li_ail); 688 list_del(&lip->li_ail);
674 689 xfs_trans_ail_cursor_clear(ailp, lip);
675 return lip;
676} 690}
677 691
678/* 692/*
@@ -682,7 +696,6 @@ xfs_ail_delete(
682STATIC xfs_log_item_t * 696STATIC xfs_log_item_t *
683xfs_ail_min( 697xfs_ail_min(
684 struct xfs_ail *ailp) 698 struct xfs_ail *ailp)
685/* ARGSUSED */
686{ 699{
687 if (list_empty(&ailp->xa_ail)) 700 if (list_empty(&ailp->xa_ail))
688 return NULL; 701 return NULL;
@@ -699,7 +712,6 @@ STATIC xfs_log_item_t *
699xfs_ail_next( 712xfs_ail_next(
700 struct xfs_ail *ailp, 713 struct xfs_ail *ailp,
701 xfs_log_item_t *lip) 714 xfs_log_item_t *lip)
702/* ARGSUSED */
703{ 715{
704 if (lip->li_ail.next == &ailp->xa_ail) 716 if (lip->li_ail.next == &ailp->xa_ail)
705 return NULL; 717 return NULL;
diff --git a/fs/xfs/xfs_trans_extfree.c b/fs/xfs/xfs_trans_extfree.c
index f783d5e9fa7..f7590f5bade 100644
--- a/fs/xfs/xfs_trans_extfree.c
+++ b/fs/xfs/xfs_trans_extfree.c
@@ -69,12 +69,16 @@ xfs_trans_log_efi_extent(xfs_trans_t *tp,
69 tp->t_flags |= XFS_TRANS_DIRTY; 69 tp->t_flags |= XFS_TRANS_DIRTY;
70 efip->efi_item.li_desc->lid_flags |= XFS_LID_DIRTY; 70 efip->efi_item.li_desc->lid_flags |= XFS_LID_DIRTY;
71 71
72 next_extent = efip->efi_next_extent; 72 /*
73 * atomic_inc_return gives us the value after the increment;
74 * we want to use it as an array index so we need to subtract 1 from
75 * it.
76 */
77 next_extent = atomic_inc_return(&efip->efi_next_extent) - 1;
73 ASSERT(next_extent < efip->efi_format.efi_nextents); 78 ASSERT(next_extent < efip->efi_format.efi_nextents);
74 extp = &(efip->efi_format.efi_extents[next_extent]); 79 extp = &(efip->efi_format.efi_extents[next_extent]);
75 extp->ext_start = start_block; 80 extp->ext_start = start_block;
76 extp->ext_len = ext_len; 81 extp->ext_len = ext_len;
77 efip->efi_next_extent++;
78} 82}
79 83
80 84
diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h
index 62da86c90de..35162c238fa 100644
--- a/fs/xfs/xfs_trans_priv.h
+++ b/fs/xfs/xfs_trans_priv.h
@@ -22,15 +22,17 @@ struct xfs_log_item;
22struct xfs_log_item_desc; 22struct xfs_log_item_desc;
23struct xfs_mount; 23struct xfs_mount;
24struct xfs_trans; 24struct xfs_trans;
25struct xfs_ail;
26struct xfs_log_vec;
25 27
26void xfs_trans_add_item(struct xfs_trans *, struct xfs_log_item *); 28void xfs_trans_add_item(struct xfs_trans *, struct xfs_log_item *);
27void xfs_trans_del_item(struct xfs_log_item *); 29void xfs_trans_del_item(struct xfs_log_item *);
28void xfs_trans_free_items(struct xfs_trans *tp, xfs_lsn_t commit_lsn, 30void xfs_trans_free_items(struct xfs_trans *tp, xfs_lsn_t commit_lsn,
29 int flags); 31 int flags);
30void xfs_trans_item_committed(struct xfs_log_item *lip,
31 xfs_lsn_t commit_lsn, int aborted);
32void xfs_trans_unreserve_and_mod_sb(struct xfs_trans *tp); 32void xfs_trans_unreserve_and_mod_sb(struct xfs_trans *tp);
33 33
34void xfs_trans_committed_bulk(struct xfs_ail *ailp, struct xfs_log_vec *lv,
35 xfs_lsn_t commit_lsn, int aborted);
34/* 36/*
35 * AIL traversal cursor. 37 * AIL traversal cursor.
36 * 38 *
@@ -73,12 +75,29 @@ struct xfs_ail {
73/* 75/*
74 * From xfs_trans_ail.c 76 * From xfs_trans_ail.c
75 */ 77 */
76void xfs_trans_ail_update(struct xfs_ail *ailp, 78void xfs_trans_ail_update_bulk(struct xfs_ail *ailp,
77 struct xfs_log_item *lip, xfs_lsn_t lsn) 79 struct xfs_log_item **log_items, int nr_items,
78 __releases(ailp->xa_lock); 80 xfs_lsn_t lsn) __releases(ailp->xa_lock);
79void xfs_trans_ail_delete(struct xfs_ail *ailp, 81static inline void
80 struct xfs_log_item *lip) 82xfs_trans_ail_update(
81 __releases(ailp->xa_lock); 83 struct xfs_ail *ailp,
84 struct xfs_log_item *lip,
85 xfs_lsn_t lsn) __releases(ailp->xa_lock)
86{
87 xfs_trans_ail_update_bulk(ailp, &lip, 1, lsn);
88}
89
90void xfs_trans_ail_delete_bulk(struct xfs_ail *ailp,
91 struct xfs_log_item **log_items, int nr_items)
92 __releases(ailp->xa_lock);
93static inline void
94xfs_trans_ail_delete(
95 struct xfs_ail *ailp,
96 xfs_log_item_t *lip) __releases(ailp->xa_lock)
97{
98 xfs_trans_ail_delete_bulk(ailp, &lip, 1);
99}
100
82void xfs_trans_ail_push(struct xfs_ail *, xfs_lsn_t); 101void xfs_trans_ail_push(struct xfs_ail *, xfs_lsn_t);
83void xfs_trans_unlocked_item(struct xfs_ail *, 102void xfs_trans_unlocked_item(struct xfs_ail *,
84 xfs_log_item_t *); 103 xfs_log_item_t *);
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 8e4a63c4151..d8e6f8cd6f0 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -964,29 +964,48 @@ xfs_release(
964 xfs_flush_pages(ip, 0, -1, XBF_ASYNC, FI_NONE); 964 xfs_flush_pages(ip, 0, -1, XBF_ASYNC, FI_NONE);
965 } 965 }
966 966
967 if (ip->i_d.di_nlink != 0) { 967 if (ip->i_d.di_nlink == 0)
968 if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) && 968 return 0;
969 ((ip->i_size > 0) || (VN_CACHED(VFS_I(ip)) > 0 ||
970 ip->i_delayed_blks > 0)) &&
971 (ip->i_df.if_flags & XFS_IFEXTENTS)) &&
972 (!(ip->i_d.di_flags &
973 (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)))) {
974 969
975 /* 970 if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) &&
976 * If we can't get the iolock just skip truncating 971 ((ip->i_size > 0) || (VN_CACHED(VFS_I(ip)) > 0 ||
977 * the blocks past EOF because we could deadlock 972 ip->i_delayed_blks > 0)) &&
978 * with the mmap_sem otherwise. We'll get another 973 (ip->i_df.if_flags & XFS_IFEXTENTS)) &&
979 * chance to drop them once the last reference to 974 (!(ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)))) {
980 * the inode is dropped, so we'll never leak blocks
981 * permanently.
982 */
983 error = xfs_free_eofblocks(mp, ip,
984 XFS_FREE_EOF_TRYLOCK);
985 if (error)
986 return error;
987 }
988 }
989 975
976 /*
977 * If we can't get the iolock just skip truncating the blocks
978 * past EOF because we could deadlock with the mmap_sem
979 * otherwise. We'll get another chance to drop them once the
980 * last reference to the inode is dropped, so we'll never leak
981 * blocks permanently.
982 *
983 * Further, check if the inode is being opened, written and
984 * closed frequently and we have delayed allocation blocks
985 * outstanding (e.g. streaming writes from the NFS server),
986 * truncating the blocks past EOF will cause fragmentation to
987 * occur.
988 *
989 * In this case don't do the truncation, either, but we have to
990 * be careful how we detect this case. Blocks beyond EOF show
991 * up as i_delayed_blks even when the inode is clean, so we
992 * need to truncate them away first before checking for a dirty
993 * release. Hence on the first dirty close we will still remove
994 * the speculative allocation, but after that we will leave it
995 * in place.
996 */
997 if (xfs_iflags_test(ip, XFS_IDIRTY_RELEASE))
998 return 0;
999
1000 error = xfs_free_eofblocks(mp, ip,
1001 XFS_FREE_EOF_TRYLOCK);
1002 if (error)
1003 return error;
1004
1005 /* delalloc blocks after truncation means it really is dirty */
1006 if (ip->i_delayed_blks)
1007 xfs_iflags_set(ip, XFS_IDIRTY_RELEASE);
1008 }
990 return 0; 1009 return 0;
991} 1010}
992 1011