aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
authorArtem Bityutskiy <Artem.Bityutskiy@nokia.com>2011-03-25 11:41:20 -0400
committerArtem Bityutskiy <Artem.Bityutskiy@nokia.com>2011-03-25 11:41:20 -0400
commit7bf7e370d5919112c223a269462cd0b546903829 (patch)
tree03ccc715239df14ae168277dbccc9d9cf4d8a2c8 /fs
parent68b1a1e786f29c900fa1c516a402e24f0ece622a (diff)
parentd39dd11c3e6a7af5c20bfac40594db36cf270f42 (diff)
Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6 into for-linus-1
* 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6: (9356 commits) [media] rc: update for bitop name changes fs: simplify iget & friends fs: pull inode->i_lock up out of writeback_single_inode fs: rename inode_lock to inode_hash_lock fs: move i_wb_list out from under inode_lock fs: move i_sb_list out from under inode_lock fs: remove inode_lock from iput_final and prune_icache fs: Lock the inode LRU list separately fs: factor inode disposal fs: protect inode->i_state with inode->i_lock lib, arch: add filter argument to show_mem and fix private implementations SLUB: Write to per cpu data when allocating it slub: Fix debugobjects with lockless fastpath autofs4: Do not potentially dereference NULL pointer returned by fget() in autofs_dev_ioctl_setpipefd() autofs4 - remove autofs4_lock autofs4 - fix d_manage() return on rcu-walk autofs4 - fix autofs4_expire_indirect() traversal autofs4 - fix dentry leak in autofs4_expire_direct() autofs4 - reinstate last used update on access vfs - check non-mountpoint dentry might block in __follow_mount_rcu() ... NOTE! This merge commit was created to fix compilation error. The block tree was merged upstream and removed the 'elv_queue_empty()' function which the new 'mtdswap' driver is using. So a simple merge of the mtd tree with upstream does not compile. And the mtd tree has already be published, so re-basing it is not an option. To fix this unfortunate situation, I had to merge upstream into the mtd-2.6.git tree without committing, put the fixup patch on top of this, and then commit this. The result is that we do not have commits which do not compile. In other words, this merge commit "merges" 3 things: the MTD tree, the upstream tree, and the fixup patch.
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/acl.c34
-rw-r--r--fs/9p/cache.c204
-rw-r--r--fs/9p/cache.h64
-rw-r--r--fs/9p/fid.c127
-rw-r--r--fs/9p/fid.h5
-rw-r--r--fs/9p/v9fs.c108
-rw-r--r--fs/9p/v9fs.h59
-rw-r--r--fs/9p/v9fs_vfs.h26
-rw-r--r--fs/9p/vfs_addr.c194
-rw-r--r--fs/9p/vfs_dentry.c47
-rw-r--r--fs/9p/vfs_dir.c1
-rw-r--r--fs/9p/vfs_file.c323
-rw-r--r--fs/9p/vfs_inode.c322
-rw-r--r--fs/9p/vfs_inode_dotl.c206
-rw-r--r--fs/9p/vfs_super.c67
-rw-r--r--fs/Kconfig3
-rw-r--r--fs/Makefile3
-rw-r--r--fs/adfs/Kconfig1
-rw-r--r--fs/adfs/adfs.h25
-rw-r--r--fs/adfs/dir.c6
-rw-r--r--fs/adfs/dir_f.c23
-rw-r--r--fs/adfs/dir_fplus.c119
-rw-r--r--fs/adfs/inode.c69
-rw-r--r--fs/adfs/super.c36
-rw-r--r--fs/affs/Makefile2
-rw-r--r--fs/affs/file.c2
-rw-r--r--fs/afs/write.c1
-rw-r--r--fs/aio.c137
-rw-r--r--fs/attr.c4
-rw-r--r--fs/autofs4/autofs_i.h2
-rw-r--r--fs/autofs4/dev-ioctl.c4
-rw-r--r--fs/autofs4/expire.c84
-rw-r--r--fs/autofs4/root.c68
-rw-r--r--fs/autofs4/waitq.c6
-rw-r--r--fs/befs/linuxvfs.c1
-rw-r--r--fs/bfs/dir.c2
-rw-r--r--fs/bfs/file.c1
-rw-r--r--fs/binfmt_elf.c4
-rw-r--r--fs/bio-integrity.c3
-rw-r--r--fs/bio.c12
-rw-r--r--fs/block_dev.c63
-rw-r--r--fs/btrfs/acl.c8
-rw-r--r--fs/btrfs/compression.c27
-rw-r--r--fs/btrfs/ctree.h12
-rw-r--r--fs/btrfs/disk-io.c96
-rw-r--r--fs/btrfs/export.c10
-rw-r--r--fs/btrfs/extent-tree.c130
-rw-r--r--fs/btrfs/extent_io.c221
-rw-r--r--fs/btrfs/extent_io.h2
-rw-r--r--fs/btrfs/extent_map.c4
-rw-r--r--fs/btrfs/file-item.c5
-rw-r--r--fs/btrfs/file.c117
-rw-r--r--fs/btrfs/free-space-cache.c162
-rw-r--r--fs/btrfs/inode.c187
-rw-r--r--fs/btrfs/ioctl.c38
-rw-r--r--fs/btrfs/lzo.c21
-rw-r--r--fs/btrfs/ordered-data.c2
-rw-r--r--fs/btrfs/print-tree.c1
-rw-r--r--fs/btrfs/relocation.c43
-rw-r--r--fs/btrfs/super.c16
-rw-r--r--fs/btrfs/transaction.c5
-rw-r--r--fs/btrfs/tree-log.c35
-rw-r--r--fs/btrfs/volumes.c125
-rw-r--r--fs/btrfs/xattr.c6
-rw-r--r--fs/btrfs/xattr.h3
-rw-r--r--fs/btrfs/zlib.c3
-rw-r--r--fs/buffer.c53
-rw-r--r--fs/cachefiles/namei.c52
-rw-r--r--fs/ceph/caps.c43
-rw-r--r--fs/ceph/debugfs.c6
-rw-r--r--fs/ceph/dir.c49
-rw-r--r--fs/ceph/file.c10
-rw-r--r--fs/ceph/inode.c37
-rw-r--r--fs/ceph/mds_client.c10
-rw-r--r--fs/ceph/snap.c14
-rw-r--r--fs/ceph/super.c11
-rw-r--r--fs/ceph/super.h66
-rw-r--r--fs/ceph/xattr.c3
-rw-r--r--fs/cifs/Kconfig1
-rw-r--r--fs/cifs/Makefile2
-rw-r--r--fs/cifs/README5
-rw-r--r--fs/cifs/cifs_dfs_ref.c10
-rw-r--r--fs/cifs/cifsacl.c4
-rw-r--r--fs/cifs/cifsencrypt.c38
-rw-r--r--fs/cifs/cifsencrypt.h33
-rw-r--r--fs/cifs/cifsfs.c15
-rw-r--r--fs/cifs/cifsfs.h6
-rw-r--r--fs/cifs/cifsglob.h37
-rw-r--r--fs/cifs/cifsproto.h11
-rw-r--r--fs/cifs/cifssmb.c8
-rw-r--r--fs/cifs/connect.c70
-rw-r--r--fs/cifs/file.c241
-rw-r--r--fs/cifs/link.c59
-rw-r--r--fs/cifs/md4.c205
-rw-r--r--fs/cifs/md5.c366
-rw-r--r--fs/cifs/md5.h38
-rw-r--r--fs/cifs/misc.c116
-rw-r--r--fs/cifs/netmisc.c8
-rw-r--r--fs/cifs/readdir.c3
-rw-r--r--fs/cifs/sess.c8
-rw-r--r--fs/cifs/smbdes.c1
-rw-r--r--fs/cifs/smbencrypt.c92
-rw-r--r--fs/cifs/transport.c69
-rw-r--r--fs/coda/Makefile2
-rw-r--r--fs/coda/sysctl.c8
-rw-r--r--fs/compat.c72
-rw-r--r--fs/dcache.c132
-rw-r--r--fs/debugfs/inode.c26
-rw-r--r--fs/devpts/inode.c21
-rw-r--r--fs/direct-io.c13
-rw-r--r--fs/dlm/ast.c257
-rw-r--r--fs/dlm/ast.h7
-rw-r--r--fs/dlm/config.c4
-rw-r--r--fs/dlm/debug_fs.c4
-rw-r--r--fs/dlm/dlm_internal.h35
-rw-r--r--fs/dlm/lock.c38
-rw-r--r--fs/dlm/lowcomms.c8
-rw-r--r--fs/dlm/rcom.c4
-rw-r--r--fs/dlm/user.c185
-rw-r--r--fs/dlm/user.h3
-rw-r--r--fs/drop_caches.c24
-rw-r--r--fs/ecryptfs/dentry.c22
-rw-r--r--fs/ecryptfs/ecryptfs_kernel.h3
-rw-r--r--fs/ecryptfs/file.c1
-rw-r--r--fs/ecryptfs/inode.c138
-rw-r--r--fs/efs/inode.c1
-rw-r--r--fs/eventfd.c12
-rw-r--r--fs/eventpoll.c175
-rw-r--r--fs/exec.c20
-rw-r--r--fs/exofs/common.h18
-rw-r--r--fs/exofs/dir.c33
-rw-r--r--fs/exofs/exofs.h6
-rw-r--r--fs/exofs/file.c16
-rw-r--r--fs/exofs/inode.c50
-rw-r--r--fs/exofs/namei.c8
-rw-r--r--fs/exofs/super.c190
-rw-r--r--fs/exportfs/expfs.c11
-rw-r--r--fs/ext2/acl.c2
-rw-r--r--fs/ext2/ext2.h8
-rw-r--r--fs/ext2/ialloc.c5
-rw-r--r--fs/ext2/inode.c2
-rw-r--r--fs/ext2/ioctl.c6
-rw-r--r--fs/ext2/namei.c17
-rw-r--r--fs/ext2/xattr.h6
-rw-r--r--fs/ext2/xattr_security.c5
-rw-r--r--fs/ext3/acl.c2
-rw-r--r--fs/ext3/balloc.c21
-rw-r--r--fs/ext3/ialloc.c5
-rw-r--r--fs/ext3/inode.c3
-rw-r--r--fs/ext3/ioctl.c6
-rw-r--r--fs/ext3/namei.c17
-rw-r--r--fs/ext3/super.c8
-rw-r--r--fs/ext3/xattr.h4
-rw-r--r--fs/ext3/xattr_security.c5
-rw-r--r--fs/ext4/acl.c2
-rw-r--r--fs/ext4/ext4.h22
-rw-r--r--fs/ext4/extents.c14
-rw-r--r--fs/ext4/file.c60
-rw-r--r--fs/ext4/ialloc.c2
-rw-r--r--fs/ext4/inode.c4
-rw-r--r--fs/ext4/ioctl.c8
-rw-r--r--fs/ext4/mballoc.c100
-rw-r--r--fs/ext4/namei.c7
-rw-r--r--fs/ext4/page-io.c39
-rw-r--r--fs/ext4/super.c75
-rw-r--r--fs/ext4/xattr.h4
-rw-r--r--fs/ext4/xattr_security.c5
-rw-r--r--fs/fat/inode.c5
-rw-r--r--fs/fat/namei_vfat.c4
-rw-r--r--fs/fcntl.c39
-rw-r--r--fs/fhandle.c265
-rw-r--r--fs/fifo.c3
-rw-r--r--fs/file_table.c66
-rw-r--r--fs/freevxfs/vxfs_subr.c1
-rw-r--r--fs/fs-writeback.c141
-rw-r--r--fs/fuse/cuse.c14
-rw-r--r--fs/fuse/dev.c27
-rw-r--r--fs/fuse/dir.c45
-rw-r--r--fs/fuse/file.c54
-rw-r--r--fs/fuse/fuse_i.h7
-rw-r--r--fs/fuse/inode.c5
-rw-r--r--fs/generic_acl.c2
-rw-r--r--fs/gfs2/Makefile2
-rw-r--r--fs/gfs2/acl.c7
-rw-r--r--fs/gfs2/aops.c4
-rw-r--r--fs/gfs2/bmap.c20
-rw-r--r--fs/gfs2/dentry.c2
-rw-r--r--fs/gfs2/export.c8
-rw-r--r--fs/gfs2/file.c79
-rw-r--r--fs/gfs2/glock.c414
-rw-r--r--fs/gfs2/glock.h39
-rw-r--r--fs/gfs2/glops.c33
-rw-r--r--fs/gfs2/incore.h7
-rw-r--r--fs/gfs2/inode.c7
-rw-r--r--fs/gfs2/lock_dlm.c14
-rw-r--r--fs/gfs2/log.c36
-rw-r--r--fs/gfs2/lops.c22
-rw-r--r--fs/gfs2/main.c17
-rw-r--r--fs/gfs2/meta_io.c5
-rw-r--r--fs/gfs2/ops_fstype.c11
-rw-r--r--fs/gfs2/ops_inode.c10
-rw-r--r--fs/gfs2/quota.c14
-rw-r--r--fs/gfs2/rgrp.c34
-rw-r--r--fs/gfs2/rgrp.h2
-rw-r--r--fs/hfs/dir.c50
-rw-r--r--fs/hfs/inode.c2
-rw-r--r--fs/hfsplus/extents.c4
-rw-r--r--fs/hfsplus/inode.c2
-rw-r--r--fs/hfsplus/ioctl.c2
-rw-r--r--fs/hfsplus/part_tbl.c4
-rw-r--r--fs/hfsplus/super.c106
-rw-r--r--fs/hfsplus/wrapper.c4
-rw-r--r--fs/hpfs/Kconfig2
-rw-r--r--fs/hpfs/dir.c23
-rw-r--r--fs/hpfs/file.c10
-rw-r--r--fs/hpfs/hpfs_fn.h22
-rw-r--r--fs/hpfs/inode.c9
-rw-r--r--fs/hpfs/namei.c49
-rw-r--r--fs/hpfs/super.c23
-rw-r--r--fs/hugetlbfs/inode.c3
-rw-r--r--fs/inode.c733
-rw-r--r--fs/internal.h28
-rw-r--r--fs/ioctl.c28
-rw-r--r--fs/isofs/export.c8
-rw-r--r--fs/isofs/inode.c1
-rw-r--r--fs/jbd/commit.c22
-rw-r--r--fs/jbd/journal.c2
-rw-r--r--fs/jbd2/commit.c22
-rw-r--r--fs/jbd2/journal.c11
-rw-r--r--fs/jbd2/transaction.c21
-rw-r--r--fs/jffs2/acl.c2
-rw-r--r--fs/jffs2/compr_zlib.c7
-rw-r--r--fs/jffs2/dir.c9
-rw-r--r--fs/jffs2/nodelist.h2
-rw-r--r--fs/jffs2/security.c5
-rw-r--r--fs/jffs2/write.c18
-rw-r--r--fs/jffs2/xattr.h5
-rw-r--r--fs/jfs/Makefile2
-rw-r--r--fs/jfs/inode.c1
-rw-r--r--fs/jfs/ioctl.c2
-rw-r--r--fs/jfs/jfs_metapage.c1
-rw-r--r--fs/jfs/jfs_xattr.h5
-rw-r--r--fs/jfs/namei.c13
-rw-r--r--fs/jfs/xattr.c8
-rw-r--r--fs/lockd/host.c9
-rw-r--r--fs/locks.c13
-rw-r--r--fs/logfs/compr.c2
-rw-r--r--fs/logfs/dev_bdev.c2
-rw-r--r--fs/logfs/file.c2
-rw-r--r--fs/logfs/inode.c2
-rw-r--r--fs/minix/Kconfig8
-rw-r--r--fs/minix/inode.c1
-rw-r--r--fs/minix/minix.h74
-rw-r--r--fs/minix/namei.c8
-rw-r--r--fs/mpage.c8
-rw-r--r--fs/namei.c1598
-rw-r--r--fs/namespace.c355
-rw-r--r--fs/ncpfs/Makefile2
-rw-r--r--fs/nfs/callback.c109
-rw-r--r--fs/nfs/callback.h4
-rw-r--r--fs/nfs/callback_proc.c14
-rw-r--r--fs/nfs/callback_xdr.c5
-rw-r--r--fs/nfs/client.c146
-rw-r--r--fs/nfs/delegation.c6
-rw-r--r--fs/nfs/dir.c13
-rw-r--r--fs/nfs/direct.c42
-rw-r--r--fs/nfs/file.c4
-rw-r--r--fs/nfs/getroot.c42
-rw-r--r--fs/nfs/idmap.c90
-rw-r--r--fs/nfs/inode.c35
-rw-r--r--fs/nfs/internal.h46
-rw-r--r--fs/nfs/namespace.c66
-rw-r--r--fs/nfs/nfs3acl.c4
-rw-r--r--fs/nfs/nfs3proc.c1
-rw-r--r--fs/nfs/nfs3xdr.c5
-rw-r--r--fs/nfs/nfs4_fs.h38
-rw-r--r--fs/nfs/nfs4filelayout.c361
-rw-r--r--fs/nfs/nfs4filelayout.h19
-rw-r--r--fs/nfs/nfs4filelayoutdev.c265
-rw-r--r--fs/nfs/nfs4namespace.c41
-rw-r--r--fs/nfs/nfs4proc.c284
-rw-r--r--fs/nfs/nfs4renewd.c6
-rw-r--r--fs/nfs/nfs4state.c41
-rw-r--r--fs/nfs/nfs4xdr.c51
-rw-r--r--fs/nfs/nfsroot.c29
-rw-r--r--fs/nfs/pagelist.c22
-rw-r--r--fs/nfs/pnfs.c330
-rw-r--r--fs/nfs/pnfs.h118
-rw-r--r--fs/nfs/proc.c1
-rw-r--r--fs/nfs/read.c127
-rw-r--r--fs/nfs/super.c478
-rw-r--r--fs/nfs/unlink.c22
-rw-r--r--fs/nfs/write.c157
-rw-r--r--fs/nfs_common/nfsacl.c54
-rw-r--r--fs/nfsctl.c21
-rw-r--r--fs/nfsd/export.c1
-rw-r--r--fs/nfsd/nfs4callback.c8
-rw-r--r--fs/nfsd/nfs4idmap.c1
-rw-r--r--fs/nfsd/nfs4proc.c4
-rw-r--r--fs/nfsd/nfs4state.c347
-rw-r--r--fs/nfsd/nfs4xdr.c11
-rw-r--r--fs/nfsd/nfsctl.c35
-rw-r--r--fs/nfsd/state.h17
-rw-r--r--fs/nfsd/vfs.c21
-rw-r--r--fs/nilfs2/alloc.c12
-rw-r--r--fs/nilfs2/alloc.h2
-rw-r--r--fs/nilfs2/bmap.c12
-rw-r--r--fs/nilfs2/bmap.h3
-rw-r--r--fs/nilfs2/btnode.c12
-rw-r--r--fs/nilfs2/btnode.h1
-rw-r--r--fs/nilfs2/btree.c6
-rw-r--r--fs/nilfs2/dir.c5
-rw-r--r--fs/nilfs2/direct.c4
-rw-r--r--fs/nilfs2/file.c4
-rw-r--r--fs/nilfs2/gcinode.c1
-rw-r--r--fs/nilfs2/inode.c84
-rw-r--r--fs/nilfs2/ioctl.c115
-rw-r--r--fs/nilfs2/mdt.c13
-rw-r--r--fs/nilfs2/mdt.h2
-rw-r--r--fs/nilfs2/namei.c10
-rw-r--r--fs/nilfs2/nilfs.h33
-rw-r--r--fs/nilfs2/page.c18
-rw-r--r--fs/nilfs2/page.h4
-rw-r--r--fs/nilfs2/recovery.c32
-rw-r--r--fs/nilfs2/sb.h85
-rw-r--r--fs/nilfs2/segbuf.c2
-rw-r--r--fs/nilfs2/segment.c261
-rw-r--r--fs/nilfs2/segment.h14
-rw-r--r--fs/nilfs2/super.c219
-rw-r--r--fs/nilfs2/the_nilfs.c44
-rw-r--r--fs/nilfs2/the_nilfs.h51
-rw-r--r--fs/notify/fanotify/fanotify_user.c2
-rw-r--r--fs/notify/inode_mark.c42
-rw-r--r--fs/notify/inotify/inotify_user.c2
-rw-r--r--fs/notify/mark.c1
-rw-r--r--fs/notify/vfsmount_mark.c1
-rw-r--r--fs/ntfs/Makefile19
-rw-r--r--fs/ntfs/aops.c4
-rw-r--r--fs/ntfs/compress.c3
-rw-r--r--fs/ntfs/inode.c4
-rw-r--r--fs/ntfs/mft.c11
-rw-r--r--fs/ocfs2/Makefile4
-rw-r--r--fs/ocfs2/acl.c2
-rw-r--r--fs/ocfs2/aops.c1
-rw-r--r--fs/ocfs2/cluster/heartbeat.c4
-rw-r--r--fs/ocfs2/dcache.c2
-rw-r--r--fs/ocfs2/dir.c2
-rw-r--r--fs/ocfs2/dlm/Makefile2
-rw-r--r--fs/ocfs2/dlmfs/Makefile2
-rw-r--r--fs/ocfs2/export.c8
-rw-r--r--fs/ocfs2/ioctl.c2
-rw-r--r--fs/ocfs2/journal.h6
-rw-r--r--fs/ocfs2/namei.c4
-rw-r--r--fs/ocfs2/ocfs2.h10
-rw-r--r--fs/ocfs2/quota.h3
-rw-r--r--fs/ocfs2/quota_global.c27
-rw-r--r--fs/ocfs2/refcounttree.c12
-rw-r--r--fs/ocfs2/super.c35
-rw-r--r--fs/ocfs2/xattr.c10
-rw-r--r--fs/ocfs2/xattr.h4
-rw-r--r--fs/omfs/dir.c66
-rw-r--r--fs/omfs/file.c1
-rw-r--r--fs/open.c152
-rw-r--r--fs/partitions/check.c3
-rw-r--r--fs/partitions/ldm.c5
-rw-r--r--fs/partitions/mac.c17
-rw-r--r--fs/partitions/osf.c12
-rw-r--r--fs/posix_acl.c17
-rw-r--r--fs/proc/array.c7
-rw-r--r--fs/proc/base.c208
-rw-r--r--fs/proc/consoles.c4
-rw-r--r--fs/proc/generic.c8
-rw-r--r--fs/proc/inode.c10
-rw-r--r--fs/proc/internal.h1
-rw-r--r--fs/proc/proc_devtree.c2
-rw-r--r--fs/proc/proc_sysctl.c8
-rw-r--r--fs/proc/root.c32
-rw-r--r--fs/proc/task_mmu.c135
-rw-r--r--fs/proc/task_nommu.c6
-rw-r--r--fs/pstore/Kconfig13
-rw-r--r--fs/pstore/Makefile7
-rw-r--r--fs/pstore/inode.c311
-rw-r--r--fs/pstore/internal.h6
-rw-r--r--fs/pstore/platform.c201
-rw-r--r--fs/qnx4/inode.c1
-rw-r--r--fs/quota/dquot.c41
-rw-r--r--fs/quota/quota_v2.c2
-rw-r--r--fs/reiserfs/Makefile4
-rw-r--r--fs/reiserfs/inode.c8
-rw-r--r--fs/reiserfs/ioctl.c4
-rw-r--r--fs/reiserfs/journal.c2
-rw-r--r--fs/reiserfs/namei.c15
-rw-r--r--fs/reiserfs/xattr.c2
-rw-r--r--fs/reiserfs/xattr_acl.c2
-rw-r--r--fs/reiserfs/xattr_security.c3
-rw-r--r--fs/select.c3
-rw-r--r--fs/squashfs/Kconfig12
-rw-r--r--fs/squashfs/block.c8
-rw-r--r--fs/squashfs/decompressor.c34
-rw-r--r--fs/squashfs/decompressor.h7
-rw-r--r--fs/squashfs/dir.c9
-rw-r--r--fs/squashfs/lzo_wrapper.c4
-rw-r--r--fs/squashfs/namei.c12
-rw-r--r--fs/squashfs/squashfs.h1
-rw-r--r--fs/squashfs/squashfs_fs.h4
-rw-r--r--fs/squashfs/super.c15
-rw-r--r--fs/squashfs/xz_wrapper.c59
-rw-r--r--fs/squashfs/zlib_wrapper.c16
-rw-r--r--fs/stat.c7
-rw-r--r--fs/statfs.c176
-rw-r--r--fs/super.c166
-rw-r--r--fs/sync.c28
-rw-r--r--fs/sysv/itree.c1
-rw-r--r--fs/sysv/namei.c8
-rw-r--r--fs/ubifs/Kconfig32
-rw-r--r--fs/ubifs/commit.c58
-rw-r--r--fs/ubifs/debug.c34
-rw-r--r--fs/ubifs/debug.h30
-rw-r--r--fs/ubifs/dir.c18
-rw-r--r--fs/ubifs/file.c11
-rw-r--r--fs/ubifs/io.c201
-rw-r--r--fs/ubifs/ioctl.c2
-rw-r--r--fs/ubifs/journal.c28
-rw-r--r--fs/ubifs/lprops.c26
-rw-r--r--fs/ubifs/lpt_commit.c56
-rw-r--r--fs/ubifs/orphan.c10
-rw-r--r--fs/ubifs/recovery.c44
-rw-r--r--fs/ubifs/scan.c2
-rw-r--r--fs/ubifs/super.c55
-rw-r--r--fs/ubifs/tnc.c10
-rw-r--r--fs/ubifs/ubifs.h45
-rw-r--r--fs/udf/balloc.c11
-rw-r--r--fs/udf/file.c8
-rw-r--r--fs/udf/inode.c240
-rw-r--r--fs/udf/namei.c18
-rw-r--r--fs/udf/truncate.c146
-rw-r--r--fs/udf/udfdecl.h12
-rw-r--r--fs/ufs/Kconfig1
-rw-r--r--fs/ufs/inode.c79
-rw-r--r--fs/ufs/namei.c44
-rw-r--r--fs/ufs/super.c64
-rw-r--r--fs/ufs/truncate.c7
-rw-r--r--fs/ufs/ufs.h6
-rw-r--r--fs/ufs/util.c2
-rw-r--r--fs/ufs/util.h2
-rw-r--r--fs/utimes.c2
-rw-r--r--fs/xattr.c2
-rw-r--r--fs/xfs/Makefile12
-rw-r--r--fs/xfs/linux-2.6/kmem.c9
-rw-r--r--fs/xfs/linux-2.6/xfs_aops.c10
-rw-r--r--fs/xfs/linux-2.6/xfs_buf.c35
-rw-r--r--fs/xfs/linux-2.6/xfs_discard.c2
-rw-r--r--fs/xfs/linux-2.6/xfs_export.c4
-rw-r--r--fs/xfs/linux-2.6/xfs_ioctl.c31
-rw-r--r--fs/xfs/linux-2.6/xfs_iops.c9
-rw-r--r--fs/xfs/linux-2.6/xfs_linux.h23
-rw-r--r--fs/xfs/linux-2.6/xfs_message.c133
-rw-r--r--fs/xfs/linux-2.6/xfs_message.h38
-rw-r--r--fs/xfs/linux-2.6/xfs_super.c128
-rw-r--r--fs/xfs/linux-2.6/xfs_sync.c5
-rw-r--r--fs/xfs/linux-2.6/xfs_sysctl.c2
-rw-r--r--fs/xfs/quota/xfs_dquot.c48
-rw-r--r--fs/xfs/quota/xfs_dquot_item.c5
-rw-r--r--fs/xfs/quota/xfs_qm.c95
-rw-r--r--fs/xfs/quota/xfs_qm_bhv.c3
-rw-r--r--fs/xfs/quota/xfs_qm_syscalls.c85
-rw-r--r--fs/xfs/quota/xfs_trans_dquot.c5
-rw-r--r--fs/xfs/support/debug.c107
-rw-r--r--fs/xfs/support/debug.h61
-rw-r--r--fs/xfs/xfs_alloc.c158
-rw-r--r--fs/xfs/xfs_alloc.h16
-rw-r--r--fs/xfs/xfs_bmap.c85
-rw-r--r--fs/xfs/xfs_buf_item.c27
-rw-r--r--fs/xfs/xfs_da_btree.c9
-rw-r--r--fs/xfs/xfs_dfrag.c4
-rw-r--r--fs/xfs/xfs_dir2.c2
-rw-r--r--fs/xfs/xfs_dir2_node.c25
-rw-r--r--fs/xfs/xfs_error.c22
-rw-r--r--fs/xfs/xfs_error.h19
-rw-r--r--fs/xfs/xfs_extfree_item.c3
-rw-r--r--fs/xfs/xfs_fsops.c9
-rw-r--r--fs/xfs/xfs_ialloc.c82
-rw-r--r--fs/xfs/xfs_inode.c129
-rw-r--r--fs/xfs/xfs_inode.h23
-rw-r--r--fs/xfs/xfs_iomap.c19
-rw-r--r--fs/xfs/xfs_log.c124
-rw-r--r--fs/xfs/xfs_log.h2
-rw-r--r--fs/xfs/xfs_log_cil.c15
-rw-r--r--fs/xfs/xfs_log_priv.h4
-rw-r--r--fs/xfs/xfs_log_recover.c223
-rw-r--r--fs/xfs/xfs_mount.c148
-rw-r--r--fs/xfs/xfs_mru_cache.c2
-rw-r--r--fs/xfs/xfs_quota.h3
-rw-r--r--fs/xfs/xfs_rtalloc.c92
-rw-r--r--fs/xfs/xfs_rtalloc.h2
-rw-r--r--fs/xfs/xfs_rw.c58
-rw-r--r--fs/xfs/xfs_trans.c41
-rw-r--r--fs/xfs/xfs_trans.h2
-rw-r--r--fs/xfs/xfs_trans_ail.c2
-rw-r--r--fs/xfs/xfs_trans_buf.c6
-rw-r--r--fs/xfs/xfs_trans_inode.c22
-rw-r--r--fs/xfs/xfs_vnodeops.c74
502 files changed, 13303 insertions, 9389 deletions
diff --git a/fs/9p/acl.c b/fs/9p/acl.c
index 02a2cf616318..535ab6eccb1a 100644
--- a/fs/9p/acl.c
+++ b/fs/9p/acl.c
@@ -21,8 +21,8 @@
21#include <linux/posix_acl_xattr.h> 21#include <linux/posix_acl_xattr.h>
22#include "xattr.h" 22#include "xattr.h"
23#include "acl.h" 23#include "acl.h"
24#include "v9fs_vfs.h"
25#include "v9fs.h" 24#include "v9fs.h"
25#include "v9fs_vfs.h"
26 26
27static struct posix_acl *__v9fs_get_acl(struct p9_fid *fid, char *name) 27static struct posix_acl *__v9fs_get_acl(struct p9_fid *fid, char *name)
28{ 28{
@@ -59,7 +59,8 @@ int v9fs_get_acl(struct inode *inode, struct p9_fid *fid)
59 struct v9fs_session_info *v9ses; 59 struct v9fs_session_info *v9ses;
60 60
61 v9ses = v9fs_inode2v9ses(inode); 61 v9ses = v9fs_inode2v9ses(inode);
62 if ((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT) { 62 if (((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT) ||
63 ((v9ses->flags & V9FS_ACL_MASK) != V9FS_POSIX_ACL)) {
63 set_cached_acl(inode, ACL_TYPE_DEFAULT, NULL); 64 set_cached_acl(inode, ACL_TYPE_DEFAULT, NULL);
64 set_cached_acl(inode, ACL_TYPE_ACCESS, NULL); 65 set_cached_acl(inode, ACL_TYPE_ACCESS, NULL);
65 return 0; 66 return 0;
@@ -71,11 +72,15 @@ int v9fs_get_acl(struct inode *inode, struct p9_fid *fid)
71 if (!IS_ERR(dacl) && !IS_ERR(pacl)) { 72 if (!IS_ERR(dacl) && !IS_ERR(pacl)) {
72 set_cached_acl(inode, ACL_TYPE_DEFAULT, dacl); 73 set_cached_acl(inode, ACL_TYPE_DEFAULT, dacl);
73 set_cached_acl(inode, ACL_TYPE_ACCESS, pacl); 74 set_cached_acl(inode, ACL_TYPE_ACCESS, pacl);
74 posix_acl_release(dacl);
75 posix_acl_release(pacl);
76 } else 75 } else
77 retval = -EIO; 76 retval = -EIO;
78 77
78 if (!IS_ERR(dacl))
79 posix_acl_release(dacl);
80
81 if (!IS_ERR(pacl))
82 posix_acl_release(pacl);
83
79 return retval; 84 return retval;
80} 85}
81 86
@@ -100,9 +105,10 @@ int v9fs_check_acl(struct inode *inode, int mask, unsigned int flags)
100 return -ECHILD; 105 return -ECHILD;
101 106
102 v9ses = v9fs_inode2v9ses(inode); 107 v9ses = v9fs_inode2v9ses(inode);
103 if ((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT) { 108 if (((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT) ||
109 ((v9ses->flags & V9FS_ACL_MASK) != V9FS_POSIX_ACL)) {
104 /* 110 /*
105 * On access = client mode get the acl 111 * On access = client and acl = on mode get the acl
106 * values from the server 112 * values from the server
107 */ 113 */
108 return 0; 114 return 0;
@@ -128,6 +134,10 @@ static int v9fs_set_acl(struct dentry *dentry, int type, struct posix_acl *acl)
128 struct inode *inode = dentry->d_inode; 134 struct inode *inode = dentry->d_inode;
129 135
130 set_cached_acl(inode, type, acl); 136 set_cached_acl(inode, type, acl);
137
138 if (!acl)
139 return 0;
140
131 /* Set a setxattr request to server */ 141 /* Set a setxattr request to server */
132 size = posix_acl_xattr_size(acl->a_count); 142 size = posix_acl_xattr_size(acl->a_count);
133 buffer = kmalloc(size, GFP_KERNEL); 143 buffer = kmalloc(size, GFP_KERNEL);
@@ -177,10 +187,8 @@ int v9fs_acl_chmod(struct dentry *dentry)
177int v9fs_set_create_acl(struct dentry *dentry, 187int v9fs_set_create_acl(struct dentry *dentry,
178 struct posix_acl *dpacl, struct posix_acl *pacl) 188 struct posix_acl *dpacl, struct posix_acl *pacl)
179{ 189{
180 if (dpacl) 190 v9fs_set_acl(dentry, ACL_TYPE_DEFAULT, dpacl);
181 v9fs_set_acl(dentry, ACL_TYPE_DEFAULT, dpacl); 191 v9fs_set_acl(dentry, ACL_TYPE_ACCESS, pacl);
182 if (pacl)
183 v9fs_set_acl(dentry, ACL_TYPE_ACCESS, pacl);
184 posix_acl_release(dpacl); 192 posix_acl_release(dpacl);
185 posix_acl_release(pacl); 193 posix_acl_release(pacl);
186 return 0; 194 return 0;
@@ -254,7 +262,7 @@ static int v9fs_xattr_get_acl(struct dentry *dentry, const char *name,
254 if (strcmp(name, "") != 0) 262 if (strcmp(name, "") != 0)
255 return -EINVAL; 263 return -EINVAL;
256 264
257 v9ses = v9fs_inode2v9ses(dentry->d_inode); 265 v9ses = v9fs_dentry2v9ses(dentry);
258 /* 266 /*
259 * We allow set/get/list of acl when access=client is not specified 267 * We allow set/get/list of acl when access=client is not specified
260 */ 268 */
@@ -304,7 +312,7 @@ static int v9fs_xattr_set_acl(struct dentry *dentry, const char *name,
304 if (strcmp(name, "") != 0) 312 if (strcmp(name, "") != 0)
305 return -EINVAL; 313 return -EINVAL;
306 314
307 v9ses = v9fs_inode2v9ses(dentry->d_inode); 315 v9ses = v9fs_dentry2v9ses(dentry);
308 /* 316 /*
309 * set the attribute on the remote. Without even looking at the 317 * set the attribute on the remote. Without even looking at the
310 * xattr value. We leave it to the server to validate 318 * xattr value. We leave it to the server to validate
@@ -315,7 +323,7 @@ static int v9fs_xattr_set_acl(struct dentry *dentry, const char *name,
315 323
316 if (S_ISLNK(inode->i_mode)) 324 if (S_ISLNK(inode->i_mode))
317 return -EOPNOTSUPP; 325 return -EOPNOTSUPP;
318 if (!is_owner_or_cap(inode)) 326 if (!inode_owner_or_capable(inode))
319 return -EPERM; 327 return -EPERM;
320 if (value) { 328 if (value) {
321 /* update the cached acl value */ 329 /* update the cached acl value */
diff --git a/fs/9p/cache.c b/fs/9p/cache.c
index 0dbe0d139ac2..5b335c5086a1 100644
--- a/fs/9p/cache.c
+++ b/fs/9p/cache.c
@@ -33,67 +33,11 @@
33 33
34#define CACHETAG_LEN 11 34#define CACHETAG_LEN 11
35 35
36struct kmem_cache *vcookie_cache;
37
38struct fscache_netfs v9fs_cache_netfs = { 36struct fscache_netfs v9fs_cache_netfs = {
39 .name = "9p", 37 .name = "9p",
40 .version = 0, 38 .version = 0,
41}; 39};
42 40
43static void init_once(void *foo)
44{
45 struct v9fs_cookie *vcookie = (struct v9fs_cookie *) foo;
46 vcookie->fscache = NULL;
47 vcookie->qid = NULL;
48 inode_init_once(&vcookie->inode);
49}
50
51/**
52 * v9fs_init_vcookiecache - initialize a cache for vcookies to maintain
53 * vcookie to inode mapping
54 *
55 * Returns 0 on success.
56 */
57
58static int v9fs_init_vcookiecache(void)
59{
60 vcookie_cache = kmem_cache_create("vcookie_cache",
61 sizeof(struct v9fs_cookie),
62 0, (SLAB_RECLAIM_ACCOUNT|
63 SLAB_MEM_SPREAD),
64 init_once);
65 if (!vcookie_cache)
66 return -ENOMEM;
67
68 return 0;
69}
70
71/**
72 * v9fs_destroy_vcookiecache - destroy the cache of vcookies
73 *
74 */
75
76static void v9fs_destroy_vcookiecache(void)
77{
78 kmem_cache_destroy(vcookie_cache);
79}
80
81int __v9fs_cache_register(void)
82{
83 int ret;
84 ret = v9fs_init_vcookiecache();
85 if (ret < 0)
86 return ret;
87
88 return fscache_register_netfs(&v9fs_cache_netfs);
89}
90
91void __v9fs_cache_unregister(void)
92{
93 v9fs_destroy_vcookiecache();
94 fscache_unregister_netfs(&v9fs_cache_netfs);
95}
96
97/** 41/**
98 * v9fs_random_cachetag - Generate a random tag to be associated 42 * v9fs_random_cachetag - Generate a random tag to be associated
99 * with a new cache session. 43 * with a new cache session.
@@ -133,9 +77,9 @@ static uint16_t v9fs_cache_session_get_key(const void *cookie_netfs_data,
133} 77}
134 78
135const struct fscache_cookie_def v9fs_cache_session_index_def = { 79const struct fscache_cookie_def v9fs_cache_session_index_def = {
136 .name = "9P.session", 80 .name = "9P.session",
137 .type = FSCACHE_COOKIE_TYPE_INDEX, 81 .type = FSCACHE_COOKIE_TYPE_INDEX,
138 .get_key = v9fs_cache_session_get_key, 82 .get_key = v9fs_cache_session_get_key,
139}; 83};
140 84
141void v9fs_cache_session_get_cookie(struct v9fs_session_info *v9ses) 85void v9fs_cache_session_get_cookie(struct v9fs_session_info *v9ses)
@@ -163,33 +107,33 @@ void v9fs_cache_session_put_cookie(struct v9fs_session_info *v9ses)
163static uint16_t v9fs_cache_inode_get_key(const void *cookie_netfs_data, 107static uint16_t v9fs_cache_inode_get_key(const void *cookie_netfs_data,
164 void *buffer, uint16_t bufmax) 108 void *buffer, uint16_t bufmax)
165{ 109{
166 const struct v9fs_cookie *vcookie = cookie_netfs_data; 110 const struct v9fs_inode *v9inode = cookie_netfs_data;
167 memcpy(buffer, &vcookie->qid->path, sizeof(vcookie->qid->path)); 111 memcpy(buffer, &v9inode->fscache_key->path,
168 112 sizeof(v9inode->fscache_key->path));
169 P9_DPRINTK(P9_DEBUG_FSC, "inode %p get key %llu", &vcookie->inode, 113 P9_DPRINTK(P9_DEBUG_FSC, "inode %p get key %llu", &v9inode->vfs_inode,
170 vcookie->qid->path); 114 v9inode->fscache_key->path);
171 return sizeof(vcookie->qid->path); 115 return sizeof(v9inode->fscache_key->path);
172} 116}
173 117
174static void v9fs_cache_inode_get_attr(const void *cookie_netfs_data, 118static void v9fs_cache_inode_get_attr(const void *cookie_netfs_data,
175 uint64_t *size) 119 uint64_t *size)
176{ 120{
177 const struct v9fs_cookie *vcookie = cookie_netfs_data; 121 const struct v9fs_inode *v9inode = cookie_netfs_data;
178 *size = i_size_read(&vcookie->inode); 122 *size = i_size_read(&v9inode->vfs_inode);
179 123
180 P9_DPRINTK(P9_DEBUG_FSC, "inode %p get attr %llu", &vcookie->inode, 124 P9_DPRINTK(P9_DEBUG_FSC, "inode %p get attr %llu", &v9inode->vfs_inode,
181 *size); 125 *size);
182} 126}
183 127
184static uint16_t v9fs_cache_inode_get_aux(const void *cookie_netfs_data, 128static uint16_t v9fs_cache_inode_get_aux(const void *cookie_netfs_data,
185 void *buffer, uint16_t buflen) 129 void *buffer, uint16_t buflen)
186{ 130{
187 const struct v9fs_cookie *vcookie = cookie_netfs_data; 131 const struct v9fs_inode *v9inode = cookie_netfs_data;
188 memcpy(buffer, &vcookie->qid->version, sizeof(vcookie->qid->version)); 132 memcpy(buffer, &v9inode->fscache_key->version,
189 133 sizeof(v9inode->fscache_key->version));
190 P9_DPRINTK(P9_DEBUG_FSC, "inode %p get aux %u", &vcookie->inode, 134 P9_DPRINTK(P9_DEBUG_FSC, "inode %p get aux %u", &v9inode->vfs_inode,
191 vcookie->qid->version); 135 v9inode->fscache_key->version);
192 return sizeof(vcookie->qid->version); 136 return sizeof(v9inode->fscache_key->version);
193} 137}
194 138
195static enum 139static enum
@@ -197,13 +141,13 @@ fscache_checkaux v9fs_cache_inode_check_aux(void *cookie_netfs_data,
197 const void *buffer, 141 const void *buffer,
198 uint16_t buflen) 142 uint16_t buflen)
199{ 143{
200 const struct v9fs_cookie *vcookie = cookie_netfs_data; 144 const struct v9fs_inode *v9inode = cookie_netfs_data;
201 145
202 if (buflen != sizeof(vcookie->qid->version)) 146 if (buflen != sizeof(v9inode->fscache_key->version))
203 return FSCACHE_CHECKAUX_OBSOLETE; 147 return FSCACHE_CHECKAUX_OBSOLETE;
204 148
205 if (memcmp(buffer, &vcookie->qid->version, 149 if (memcmp(buffer, &v9inode->fscache_key->version,
206 sizeof(vcookie->qid->version))) 150 sizeof(v9inode->fscache_key->version)))
207 return FSCACHE_CHECKAUX_OBSOLETE; 151 return FSCACHE_CHECKAUX_OBSOLETE;
208 152
209 return FSCACHE_CHECKAUX_OKAY; 153 return FSCACHE_CHECKAUX_OKAY;
@@ -211,7 +155,7 @@ fscache_checkaux v9fs_cache_inode_check_aux(void *cookie_netfs_data,
211 155
212static void v9fs_cache_inode_now_uncached(void *cookie_netfs_data) 156static void v9fs_cache_inode_now_uncached(void *cookie_netfs_data)
213{ 157{
214 struct v9fs_cookie *vcookie = cookie_netfs_data; 158 struct v9fs_inode *v9inode = cookie_netfs_data;
215 struct pagevec pvec; 159 struct pagevec pvec;
216 pgoff_t first; 160 pgoff_t first;
217 int loop, nr_pages; 161 int loop, nr_pages;
@@ -220,7 +164,7 @@ static void v9fs_cache_inode_now_uncached(void *cookie_netfs_data)
220 first = 0; 164 first = 0;
221 165
222 for (;;) { 166 for (;;) {
223 nr_pages = pagevec_lookup(&pvec, vcookie->inode.i_mapping, 167 nr_pages = pagevec_lookup(&pvec, v9inode->vfs_inode.i_mapping,
224 first, 168 first,
225 PAGEVEC_SIZE - pagevec_count(&pvec)); 169 PAGEVEC_SIZE - pagevec_count(&pvec));
226 if (!nr_pages) 170 if (!nr_pages)
@@ -249,115 +193,114 @@ const struct fscache_cookie_def v9fs_cache_inode_index_def = {
249 193
250void v9fs_cache_inode_get_cookie(struct inode *inode) 194void v9fs_cache_inode_get_cookie(struct inode *inode)
251{ 195{
252 struct v9fs_cookie *vcookie; 196 struct v9fs_inode *v9inode;
253 struct v9fs_session_info *v9ses; 197 struct v9fs_session_info *v9ses;
254 198
255 if (!S_ISREG(inode->i_mode)) 199 if (!S_ISREG(inode->i_mode))
256 return; 200 return;
257 201
258 vcookie = v9fs_inode2cookie(inode); 202 v9inode = V9FS_I(inode);
259 if (vcookie->fscache) 203 if (v9inode->fscache)
260 return; 204 return;
261 205
262 v9ses = v9fs_inode2v9ses(inode); 206 v9ses = v9fs_inode2v9ses(inode);
263 vcookie->fscache = fscache_acquire_cookie(v9ses->fscache, 207 v9inode->fscache = fscache_acquire_cookie(v9ses->fscache,
264 &v9fs_cache_inode_index_def, 208 &v9fs_cache_inode_index_def,
265 vcookie); 209 v9inode);
266 210
267 P9_DPRINTK(P9_DEBUG_FSC, "inode %p get cookie %p", inode, 211 P9_DPRINTK(P9_DEBUG_FSC, "inode %p get cookie %p", inode,
268 vcookie->fscache); 212 v9inode->fscache);
269} 213}
270 214
271void v9fs_cache_inode_put_cookie(struct inode *inode) 215void v9fs_cache_inode_put_cookie(struct inode *inode)
272{ 216{
273 struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode); 217 struct v9fs_inode *v9inode = V9FS_I(inode);
274 218
275 if (!vcookie->fscache) 219 if (!v9inode->fscache)
276 return; 220 return;
277 P9_DPRINTK(P9_DEBUG_FSC, "inode %p put cookie %p", inode, 221 P9_DPRINTK(P9_DEBUG_FSC, "inode %p put cookie %p", inode,
278 vcookie->fscache); 222 v9inode->fscache);
279 223
280 fscache_relinquish_cookie(vcookie->fscache, 0); 224 fscache_relinquish_cookie(v9inode->fscache, 0);
281 vcookie->fscache = NULL; 225 v9inode->fscache = NULL;
282} 226}
283 227
284void v9fs_cache_inode_flush_cookie(struct inode *inode) 228void v9fs_cache_inode_flush_cookie(struct inode *inode)
285{ 229{
286 struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode); 230 struct v9fs_inode *v9inode = V9FS_I(inode);
287 231
288 if (!vcookie->fscache) 232 if (!v9inode->fscache)
289 return; 233 return;
290 P9_DPRINTK(P9_DEBUG_FSC, "inode %p flush cookie %p", inode, 234 P9_DPRINTK(P9_DEBUG_FSC, "inode %p flush cookie %p", inode,
291 vcookie->fscache); 235 v9inode->fscache);
292 236
293 fscache_relinquish_cookie(vcookie->fscache, 1); 237 fscache_relinquish_cookie(v9inode->fscache, 1);
294 vcookie->fscache = NULL; 238 v9inode->fscache = NULL;
295} 239}
296 240
297void v9fs_cache_inode_set_cookie(struct inode *inode, struct file *filp) 241void v9fs_cache_inode_set_cookie(struct inode *inode, struct file *filp)
298{ 242{
299 struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode); 243 struct v9fs_inode *v9inode = V9FS_I(inode);
300 struct p9_fid *fid; 244 struct p9_fid *fid;
301 245
302 if (!vcookie->fscache) 246 if (!v9inode->fscache)
303 return; 247 return;
304 248
305 spin_lock(&vcookie->lock); 249 spin_lock(&v9inode->fscache_lock);
306 fid = filp->private_data; 250 fid = filp->private_data;
307 if ((filp->f_flags & O_ACCMODE) != O_RDONLY) 251 if ((filp->f_flags & O_ACCMODE) != O_RDONLY)
308 v9fs_cache_inode_flush_cookie(inode); 252 v9fs_cache_inode_flush_cookie(inode);
309 else 253 else
310 v9fs_cache_inode_get_cookie(inode); 254 v9fs_cache_inode_get_cookie(inode);
311 255
312 spin_unlock(&vcookie->lock); 256 spin_unlock(&v9inode->fscache_lock);
313} 257}
314 258
315void v9fs_cache_inode_reset_cookie(struct inode *inode) 259void v9fs_cache_inode_reset_cookie(struct inode *inode)
316{ 260{
317 struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode); 261 struct v9fs_inode *v9inode = V9FS_I(inode);
318 struct v9fs_session_info *v9ses; 262 struct v9fs_session_info *v9ses;
319 struct fscache_cookie *old; 263 struct fscache_cookie *old;
320 264
321 if (!vcookie->fscache) 265 if (!v9inode->fscache)
322 return; 266 return;
323 267
324 old = vcookie->fscache; 268 old = v9inode->fscache;
325 269
326 spin_lock(&vcookie->lock); 270 spin_lock(&v9inode->fscache_lock);
327 fscache_relinquish_cookie(vcookie->fscache, 1); 271 fscache_relinquish_cookie(v9inode->fscache, 1);
328 272
329 v9ses = v9fs_inode2v9ses(inode); 273 v9ses = v9fs_inode2v9ses(inode);
330 vcookie->fscache = fscache_acquire_cookie(v9ses->fscache, 274 v9inode->fscache = fscache_acquire_cookie(v9ses->fscache,
331 &v9fs_cache_inode_index_def, 275 &v9fs_cache_inode_index_def,
332 vcookie); 276 v9inode);
333
334 P9_DPRINTK(P9_DEBUG_FSC, "inode %p revalidating cookie old %p new %p", 277 P9_DPRINTK(P9_DEBUG_FSC, "inode %p revalidating cookie old %p new %p",
335 inode, old, vcookie->fscache); 278 inode, old, v9inode->fscache);
336 279
337 spin_unlock(&vcookie->lock); 280 spin_unlock(&v9inode->fscache_lock);
338} 281}
339 282
340int __v9fs_fscache_release_page(struct page *page, gfp_t gfp) 283int __v9fs_fscache_release_page(struct page *page, gfp_t gfp)
341{ 284{
342 struct inode *inode = page->mapping->host; 285 struct inode *inode = page->mapping->host;
343 struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode); 286 struct v9fs_inode *v9inode = V9FS_I(inode);
344 287
345 BUG_ON(!vcookie->fscache); 288 BUG_ON(!v9inode->fscache);
346 289
347 return fscache_maybe_release_page(vcookie->fscache, page, gfp); 290 return fscache_maybe_release_page(v9inode->fscache, page, gfp);
348} 291}
349 292
350void __v9fs_fscache_invalidate_page(struct page *page) 293void __v9fs_fscache_invalidate_page(struct page *page)
351{ 294{
352 struct inode *inode = page->mapping->host; 295 struct inode *inode = page->mapping->host;
353 struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode); 296 struct v9fs_inode *v9inode = V9FS_I(inode);
354 297
355 BUG_ON(!vcookie->fscache); 298 BUG_ON(!v9inode->fscache);
356 299
357 if (PageFsCache(page)) { 300 if (PageFsCache(page)) {
358 fscache_wait_on_page_write(vcookie->fscache, page); 301 fscache_wait_on_page_write(v9inode->fscache, page);
359 BUG_ON(!PageLocked(page)); 302 BUG_ON(!PageLocked(page));
360 fscache_uncache_page(vcookie->fscache, page); 303 fscache_uncache_page(v9inode->fscache, page);
361 } 304 }
362} 305}
363 306
@@ -380,13 +323,13 @@ static void v9fs_vfs_readpage_complete(struct page *page, void *data,
380int __v9fs_readpage_from_fscache(struct inode *inode, struct page *page) 323int __v9fs_readpage_from_fscache(struct inode *inode, struct page *page)
381{ 324{
382 int ret; 325 int ret;
383 const struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode); 326 const struct v9fs_inode *v9inode = V9FS_I(inode);
384 327
385 P9_DPRINTK(P9_DEBUG_FSC, "inode %p page %p", inode, page); 328 P9_DPRINTK(P9_DEBUG_FSC, "inode %p page %p", inode, page);
386 if (!vcookie->fscache) 329 if (!v9inode->fscache)
387 return -ENOBUFS; 330 return -ENOBUFS;
388 331
389 ret = fscache_read_or_alloc_page(vcookie->fscache, 332 ret = fscache_read_or_alloc_page(v9inode->fscache,
390 page, 333 page,
391 v9fs_vfs_readpage_complete, 334 v9fs_vfs_readpage_complete,
392 NULL, 335 NULL,
@@ -418,13 +361,13 @@ int __v9fs_readpages_from_fscache(struct inode *inode,
418 unsigned *nr_pages) 361 unsigned *nr_pages)
419{ 362{
420 int ret; 363 int ret;
421 const struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode); 364 const struct v9fs_inode *v9inode = V9FS_I(inode);
422 365
423 P9_DPRINTK(P9_DEBUG_FSC, "inode %p pages %u", inode, *nr_pages); 366 P9_DPRINTK(P9_DEBUG_FSC, "inode %p pages %u", inode, *nr_pages);
424 if (!vcookie->fscache) 367 if (!v9inode->fscache)
425 return -ENOBUFS; 368 return -ENOBUFS;
426 369
427 ret = fscache_read_or_alloc_pages(vcookie->fscache, 370 ret = fscache_read_or_alloc_pages(v9inode->fscache,
428 mapping, pages, nr_pages, 371 mapping, pages, nr_pages,
429 v9fs_vfs_readpage_complete, 372 v9fs_vfs_readpage_complete,
430 NULL, 373 NULL,
@@ -453,11 +396,22 @@ int __v9fs_readpages_from_fscache(struct inode *inode,
453void __v9fs_readpage_to_fscache(struct inode *inode, struct page *page) 396void __v9fs_readpage_to_fscache(struct inode *inode, struct page *page)
454{ 397{
455 int ret; 398 int ret;
456 const struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode); 399 const struct v9fs_inode *v9inode = V9FS_I(inode);
457 400
458 P9_DPRINTK(P9_DEBUG_FSC, "inode %p page %p", inode, page); 401 P9_DPRINTK(P9_DEBUG_FSC, "inode %p page %p", inode, page);
459 ret = fscache_write_page(vcookie->fscache, page, GFP_KERNEL); 402 ret = fscache_write_page(v9inode->fscache, page, GFP_KERNEL);
460 P9_DPRINTK(P9_DEBUG_FSC, "ret = %d", ret); 403 P9_DPRINTK(P9_DEBUG_FSC, "ret = %d", ret);
461 if (ret != 0) 404 if (ret != 0)
462 v9fs_uncache_page(inode, page); 405 v9fs_uncache_page(inode, page);
463} 406}
407
408/*
409 * wait for a page to complete writing to the cache
410 */
411void __v9fs_fscache_wait_on_page_write(struct inode *inode, struct page *page)
412{
413 const struct v9fs_inode *v9inode = V9FS_I(inode);
414 P9_DPRINTK(P9_DEBUG_FSC, "inode %p page %p", inode, page);
415 if (PageFsCache(page))
416 fscache_wait_on_page_write(v9inode->fscache, page);
417}
diff --git a/fs/9p/cache.h b/fs/9p/cache.h
index a94192bfaee8..049507a5b01c 100644
--- a/fs/9p/cache.h
+++ b/fs/9p/cache.h
@@ -25,20 +25,6 @@
25#include <linux/fscache.h> 25#include <linux/fscache.h>
26#include <linux/spinlock.h> 26#include <linux/spinlock.h>
27 27
28extern struct kmem_cache *vcookie_cache;
29
30struct v9fs_cookie {
31 spinlock_t lock;
32 struct inode inode;
33 struct fscache_cookie *fscache;
34 struct p9_qid *qid;
35};
36
37static inline struct v9fs_cookie *v9fs_inode2cookie(const struct inode *inode)
38{
39 return container_of(inode, struct v9fs_cookie, inode);
40}
41
42extern struct fscache_netfs v9fs_cache_netfs; 28extern struct fscache_netfs v9fs_cache_netfs;
43extern const struct fscache_cookie_def v9fs_cache_session_index_def; 29extern const struct fscache_cookie_def v9fs_cache_session_index_def;
44extern const struct fscache_cookie_def v9fs_cache_inode_index_def; 30extern const struct fscache_cookie_def v9fs_cache_inode_index_def;
@@ -64,23 +50,8 @@ extern int __v9fs_readpages_from_fscache(struct inode *inode,
64 struct list_head *pages, 50 struct list_head *pages,
65 unsigned *nr_pages); 51 unsigned *nr_pages);
66extern void __v9fs_readpage_to_fscache(struct inode *inode, struct page *page); 52extern void __v9fs_readpage_to_fscache(struct inode *inode, struct page *page);
67 53extern void __v9fs_fscache_wait_on_page_write(struct inode *inode,
68 54 struct page *page);
69/**
70 * v9fs_cache_register - Register v9fs file system with the cache
71 */
72static inline int v9fs_cache_register(void)
73{
74 return __v9fs_cache_register();
75}
76
77/**
78 * v9fs_cache_unregister - Unregister v9fs from the cache
79 */
80static inline void v9fs_cache_unregister(void)
81{
82 __v9fs_cache_unregister();
83}
84 55
85static inline int v9fs_fscache_release_page(struct page *page, 56static inline int v9fs_fscache_release_page(struct page *page,
86 gfp_t gfp) 57 gfp_t gfp)
@@ -117,28 +88,27 @@ static inline void v9fs_readpage_to_fscache(struct inode *inode,
117 88
118static inline void v9fs_uncache_page(struct inode *inode, struct page *page) 89static inline void v9fs_uncache_page(struct inode *inode, struct page *page)
119{ 90{
120 struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode); 91 struct v9fs_inode *v9inode = V9FS_I(inode);
121 fscache_uncache_page(vcookie->fscache, page); 92 fscache_uncache_page(v9inode->fscache, page);
122 BUG_ON(PageFsCache(page)); 93 BUG_ON(PageFsCache(page));
123} 94}
124 95
125static inline void v9fs_vcookie_set_qid(struct inode *inode, 96static inline void v9fs_fscache_set_key(struct inode *inode,
126 struct p9_qid *qid) 97 struct p9_qid *qid)
127{ 98{
128 struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode); 99 struct v9fs_inode *v9inode = V9FS_I(inode);
129 spin_lock(&vcookie->lock); 100 spin_lock(&v9inode->fscache_lock);
130 vcookie->qid = qid; 101 v9inode->fscache_key = qid;
131 spin_unlock(&vcookie->lock); 102 spin_unlock(&v9inode->fscache_lock);
132} 103}
133 104
134#else /* CONFIG_9P_FSCACHE */ 105static inline void v9fs_fscache_wait_on_page_write(struct inode *inode,
135 106 struct page *page)
136static inline int v9fs_cache_register(void)
137{ 107{
138 return 1; 108 return __v9fs_fscache_wait_on_page_write(inode, page);
139} 109}
140 110
141static inline void v9fs_cache_unregister(void) {} 111#else /* CONFIG_9P_FSCACHE */
142 112
143static inline int v9fs_fscache_release_page(struct page *page, 113static inline int v9fs_fscache_release_page(struct page *page,
144 gfp_t gfp) { 114 gfp_t gfp) {
@@ -168,9 +138,11 @@ static inline void v9fs_readpage_to_fscache(struct inode *inode,
168static inline void v9fs_uncache_page(struct inode *inode, struct page *page) 138static inline void v9fs_uncache_page(struct inode *inode, struct page *page)
169{} 139{}
170 140
171static inline void v9fs_vcookie_set_qid(struct inode *inode, 141static inline void v9fs_fscache_wait_on_page_write(struct inode *inode,
172 struct p9_qid *qid) 142 struct page *page)
173{} 143{
144 return;
145}
174 146
175#endif /* CONFIG_9P_FSCACHE */ 147#endif /* CONFIG_9P_FSCACHE */
176#endif /* _9P_CACHE_H */ 148#endif /* _9P_CACHE_H */
diff --git a/fs/9p/fid.c b/fs/9p/fid.c
index b00223c99d70..0ee594569dcc 100644
--- a/fs/9p/fid.c
+++ b/fs/9p/fid.c
@@ -125,46 +125,17 @@ err_out:
125 return -ENOMEM; 125 return -ENOMEM;
126} 126}
127 127
128/** 128static struct p9_fid *v9fs_fid_lookup_with_uid(struct dentry *dentry,
129 * v9fs_fid_lookup - lookup for a fid, try to walk if not found 129 uid_t uid, int any)
130 * @dentry: dentry to look for fid in
131 *
132 * Look for a fid in the specified dentry for the current user.
133 * If no fid is found, try to create one walking from a fid from the parent
134 * dentry (if it has one), or the root dentry. If the user haven't accessed
135 * the fs yet, attach now and walk from the root.
136 */
137
138struct p9_fid *v9fs_fid_lookup(struct dentry *dentry)
139{ 130{
140 int i, n, l, clone, any, access;
141 u32 uid;
142 struct p9_fid *fid, *old_fid = NULL;
143 struct dentry *ds; 131 struct dentry *ds;
144 struct v9fs_session_info *v9ses;
145 char **wnames, *uname; 132 char **wnames, *uname;
133 int i, n, l, clone, access;
134 struct v9fs_session_info *v9ses;
135 struct p9_fid *fid, *old_fid = NULL;
146 136
147 v9ses = v9fs_inode2v9ses(dentry->d_inode); 137 v9ses = v9fs_dentry2v9ses(dentry);
148 access = v9ses->flags & V9FS_ACCESS_MASK; 138 access = v9ses->flags & V9FS_ACCESS_MASK;
149 switch (access) {
150 case V9FS_ACCESS_SINGLE:
151 case V9FS_ACCESS_USER:
152 case V9FS_ACCESS_CLIENT:
153 uid = current_fsuid();
154 any = 0;
155 break;
156
157 case V9FS_ACCESS_ANY:
158 uid = v9ses->uid;
159 any = 1;
160 break;
161
162 default:
163 uid = ~0;
164 any = 0;
165 break;
166 }
167
168 fid = v9fs_fid_find(dentry, uid, any); 139 fid = v9fs_fid_find(dentry, uid, any);
169 if (fid) 140 if (fid)
170 return fid; 141 return fid;
@@ -250,6 +221,45 @@ err_out:
250 return fid; 221 return fid;
251} 222}
252 223
224/**
225 * v9fs_fid_lookup - lookup for a fid, try to walk if not found
226 * @dentry: dentry to look for fid in
227 *
228 * Look for a fid in the specified dentry for the current user.
229 * If no fid is found, try to create one walking from a fid from the parent
230 * dentry (if it has one), or the root dentry. If the user haven't accessed
231 * the fs yet, attach now and walk from the root.
232 */
233
234struct p9_fid *v9fs_fid_lookup(struct dentry *dentry)
235{
236 uid_t uid;
237 int any, access;
238 struct v9fs_session_info *v9ses;
239
240 v9ses = v9fs_dentry2v9ses(dentry);
241 access = v9ses->flags & V9FS_ACCESS_MASK;
242 switch (access) {
243 case V9FS_ACCESS_SINGLE:
244 case V9FS_ACCESS_USER:
245 case V9FS_ACCESS_CLIENT:
246 uid = current_fsuid();
247 any = 0;
248 break;
249
250 case V9FS_ACCESS_ANY:
251 uid = v9ses->uid;
252 any = 1;
253 break;
254
255 default:
256 uid = ~0;
257 any = 0;
258 break;
259 }
260 return v9fs_fid_lookup_with_uid(dentry, uid, any);
261}
262
253struct p9_fid *v9fs_fid_clone(struct dentry *dentry) 263struct p9_fid *v9fs_fid_clone(struct dentry *dentry)
254{ 264{
255 struct p9_fid *fid, *ret; 265 struct p9_fid *fid, *ret;
@@ -261,3 +271,50 @@ struct p9_fid *v9fs_fid_clone(struct dentry *dentry)
261 ret = p9_client_walk(fid, 0, NULL, 1); 271 ret = p9_client_walk(fid, 0, NULL, 1);
262 return ret; 272 return ret;
263} 273}
274
275static struct p9_fid *v9fs_fid_clone_with_uid(struct dentry *dentry, uid_t uid)
276{
277 struct p9_fid *fid, *ret;
278
279 fid = v9fs_fid_lookup_with_uid(dentry, uid, 0);
280 if (IS_ERR(fid))
281 return fid;
282
283 ret = p9_client_walk(fid, 0, NULL, 1);
284 return ret;
285}
286
287struct p9_fid *v9fs_writeback_fid(struct dentry *dentry)
288{
289 int err, flags;
290 struct p9_fid *fid;
291 struct v9fs_session_info *v9ses;
292
293 v9ses = v9fs_dentry2v9ses(dentry);
294 fid = v9fs_fid_clone_with_uid(dentry, 0);
295 if (IS_ERR(fid))
296 goto error_out;
297 /*
298 * writeback fid will only be used to write back the
299 * dirty pages. We always request for the open fid in read-write
300 * mode so that a partial page write which result in page
301 * read can work.
302 *
303 * we don't have a tsyncfs operation for older version
304 * of protocol. So make sure the write back fid is
305 * opened in O_SYNC mode.
306 */
307 if (!v9fs_proto_dotl(v9ses))
308 flags = O_RDWR | O_SYNC;
309 else
310 flags = O_RDWR;
311
312 err = p9_client_open(fid, flags);
313 if (err < 0) {
314 p9_client_clunk(fid);
315 fid = ERR_PTR(err);
316 goto error_out;
317 }
318error_out:
319 return fid;
320}
diff --git a/fs/9p/fid.h b/fs/9p/fid.h
index c3bbd6af996d..bb0b6e7f58fc 100644
--- a/fs/9p/fid.h
+++ b/fs/9p/fid.h
@@ -19,7 +19,8 @@
19 * Boston, MA 02111-1301 USA 19 * Boston, MA 02111-1301 USA
20 * 20 *
21 */ 21 */
22 22#ifndef FS_9P_FID_H
23#define FS_9P_FID_H
23#include <linux/list.h> 24#include <linux/list.h>
24 25
25/** 26/**
@@ -45,3 +46,5 @@ struct v9fs_dentry {
45struct p9_fid *v9fs_fid_lookup(struct dentry *dentry); 46struct p9_fid *v9fs_fid_lookup(struct dentry *dentry);
46struct p9_fid *v9fs_fid_clone(struct dentry *dentry); 47struct p9_fid *v9fs_fid_clone(struct dentry *dentry);
47int v9fs_fid_add(struct dentry *dentry, struct p9_fid *fid); 48int v9fs_fid_add(struct dentry *dentry, struct p9_fid *fid);
49struct p9_fid *v9fs_writeback_fid(struct dentry *dentry);
50#endif
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index 2f77cd33ba83..c82b017f51f3 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -39,6 +39,7 @@
39 39
40static DEFINE_SPINLOCK(v9fs_sessionlist_lock); 40static DEFINE_SPINLOCK(v9fs_sessionlist_lock);
41static LIST_HEAD(v9fs_sessionlist); 41static LIST_HEAD(v9fs_sessionlist);
42struct kmem_cache *v9fs_inode_cache;
42 43
43/* 44/*
44 * Option Parsing (code inspired by NFS code) 45 * Option Parsing (code inspired by NFS code)
@@ -55,7 +56,7 @@ enum {
55 /* Cache options */ 56 /* Cache options */
56 Opt_cache_loose, Opt_fscache, 57 Opt_cache_loose, Opt_fscache,
57 /* Access options */ 58 /* Access options */
58 Opt_access, 59 Opt_access, Opt_posixacl,
59 /* Error token */ 60 /* Error token */
60 Opt_err 61 Opt_err
61}; 62};
@@ -73,6 +74,7 @@ static const match_table_t tokens = {
73 {Opt_fscache, "fscache"}, 74 {Opt_fscache, "fscache"},
74 {Opt_cachetag, "cachetag=%s"}, 75 {Opt_cachetag, "cachetag=%s"},
75 {Opt_access, "access=%s"}, 76 {Opt_access, "access=%s"},
77 {Opt_posixacl, "posixacl"},
76 {Opt_err, NULL} 78 {Opt_err, NULL}
77}; 79};
78 80
@@ -194,15 +196,7 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
194 else if (strcmp(s, "any") == 0) 196 else if (strcmp(s, "any") == 0)
195 v9ses->flags |= V9FS_ACCESS_ANY; 197 v9ses->flags |= V9FS_ACCESS_ANY;
196 else if (strcmp(s, "client") == 0) { 198 else if (strcmp(s, "client") == 0) {
197#ifdef CONFIG_9P_FS_POSIX_ACL
198 v9ses->flags |= V9FS_ACCESS_CLIENT; 199 v9ses->flags |= V9FS_ACCESS_CLIENT;
199#else
200 P9_DPRINTK(P9_DEBUG_ERROR,
201 "access=client option not supported\n");
202 kfree(s);
203 ret = -EINVAL;
204 goto free_and_return;
205#endif
206 } else { 200 } else {
207 v9ses->flags |= V9FS_ACCESS_SINGLE; 201 v9ses->flags |= V9FS_ACCESS_SINGLE;
208 v9ses->uid = simple_strtoul(s, &e, 10); 202 v9ses->uid = simple_strtoul(s, &e, 10);
@@ -212,6 +206,16 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
212 kfree(s); 206 kfree(s);
213 break; 207 break;
214 208
209 case Opt_posixacl:
210#ifdef CONFIG_9P_FS_POSIX_ACL
211 v9ses->flags |= V9FS_POSIX_ACL;
212#else
213 P9_DPRINTK(P9_DEBUG_ERROR,
214 "Not defined CONFIG_9P_FS_POSIX_ACL. "
215 "Ignoring posixacl option\n");
216#endif
217 break;
218
215 default: 219 default:
216 continue; 220 continue;
217 } 221 }
@@ -260,19 +264,12 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
260 list_add(&v9ses->slist, &v9fs_sessionlist); 264 list_add(&v9ses->slist, &v9fs_sessionlist);
261 spin_unlock(&v9fs_sessionlist_lock); 265 spin_unlock(&v9fs_sessionlist_lock);
262 266
263 v9ses->flags = V9FS_ACCESS_USER;
264 strcpy(v9ses->uname, V9FS_DEFUSER); 267 strcpy(v9ses->uname, V9FS_DEFUSER);
265 strcpy(v9ses->aname, V9FS_DEFANAME); 268 strcpy(v9ses->aname, V9FS_DEFANAME);
266 v9ses->uid = ~0; 269 v9ses->uid = ~0;
267 v9ses->dfltuid = V9FS_DEFUID; 270 v9ses->dfltuid = V9FS_DEFUID;
268 v9ses->dfltgid = V9FS_DEFGID; 271 v9ses->dfltgid = V9FS_DEFGID;
269 272
270 rc = v9fs_parse_options(v9ses, data);
271 if (rc < 0) {
272 retval = rc;
273 goto error;
274 }
275
276 v9ses->clnt = p9_client_create(dev_name, data); 273 v9ses->clnt = p9_client_create(dev_name, data);
277 if (IS_ERR(v9ses->clnt)) { 274 if (IS_ERR(v9ses->clnt)) {
278 retval = PTR_ERR(v9ses->clnt); 275 retval = PTR_ERR(v9ses->clnt);
@@ -281,10 +278,20 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
281 goto error; 278 goto error;
282 } 279 }
283 280
284 if (p9_is_proto_dotl(v9ses->clnt)) 281 v9ses->flags = V9FS_ACCESS_USER;
282
283 if (p9_is_proto_dotl(v9ses->clnt)) {
284 v9ses->flags = V9FS_ACCESS_CLIENT;
285 v9ses->flags |= V9FS_PROTO_2000L; 285 v9ses->flags |= V9FS_PROTO_2000L;
286 else if (p9_is_proto_dotu(v9ses->clnt)) 286 } else if (p9_is_proto_dotu(v9ses->clnt)) {
287 v9ses->flags |= V9FS_PROTO_2000U; 287 v9ses->flags |= V9FS_PROTO_2000U;
288 }
289
290 rc = v9fs_parse_options(v9ses, data);
291 if (rc < 0) {
292 retval = rc;
293 goto error;
294 }
288 295
289 v9ses->maxdata = v9ses->clnt->msize - P9_IOHDRSZ; 296 v9ses->maxdata = v9ses->clnt->msize - P9_IOHDRSZ;
290 297
@@ -306,6 +313,14 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
306 v9ses->flags |= V9FS_ACCESS_ANY; 313 v9ses->flags |= V9FS_ACCESS_ANY;
307 v9ses->uid = ~0; 314 v9ses->uid = ~0;
308 } 315 }
316 if (!v9fs_proto_dotl(v9ses) ||
317 !((v9ses->flags & V9FS_ACCESS_MASK) == V9FS_ACCESS_CLIENT)) {
318 /*
319 * We support ACL checks on clinet only if the protocol is
320 * 9P2000.L and access is V9FS_ACCESS_CLIENT.
321 */
322 v9ses->flags &= ~V9FS_ACL_MASK;
323 }
309 324
310 fid = p9_client_attach(v9ses->clnt, NULL, v9ses->uname, ~0, 325 fid = p9_client_attach(v9ses->clnt, NULL, v9ses->uname, ~0,
311 v9ses->aname); 326 v9ses->aname);
@@ -467,6 +482,63 @@ static void v9fs_sysfs_cleanup(void)
467 kobject_put(v9fs_kobj); 482 kobject_put(v9fs_kobj);
468} 483}
469 484
485static void v9fs_inode_init_once(void *foo)
486{
487 struct v9fs_inode *v9inode = (struct v9fs_inode *)foo;
488#ifdef CONFIG_9P_FSCACHE
489 v9inode->fscache = NULL;
490 v9inode->fscache_key = NULL;
491#endif
492 inode_init_once(&v9inode->vfs_inode);
493}
494
495/**
496 * v9fs_init_inode_cache - initialize a cache for 9P
497 * Returns 0 on success.
498 */
499static int v9fs_init_inode_cache(void)
500{
501 v9fs_inode_cache = kmem_cache_create("v9fs_inode_cache",
502 sizeof(struct v9fs_inode),
503 0, (SLAB_RECLAIM_ACCOUNT|
504 SLAB_MEM_SPREAD),
505 v9fs_inode_init_once);
506 if (!v9fs_inode_cache)
507 return -ENOMEM;
508
509 return 0;
510}
511
512/**
513 * v9fs_destroy_inode_cache - destroy the cache of 9P inode
514 *
515 */
516static void v9fs_destroy_inode_cache(void)
517{
518 kmem_cache_destroy(v9fs_inode_cache);
519}
520
521static int v9fs_cache_register(void)
522{
523 int ret;
524 ret = v9fs_init_inode_cache();
525 if (ret < 0)
526 return ret;
527#ifdef CONFIG_9P_FSCACHE
528 return fscache_register_netfs(&v9fs_cache_netfs);
529#else
530 return ret;
531#endif
532}
533
534static void v9fs_cache_unregister(void)
535{
536 v9fs_destroy_inode_cache();
537#ifdef CONFIG_9P_FSCACHE
538 fscache_unregister_netfs(&v9fs_cache_netfs);
539#endif
540}
541
470/** 542/**
471 * init_v9fs - Initialize module 543 * init_v9fs - Initialize module
472 * 544 *
diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h
index c4b5d8864f0d..9665c2b840e6 100644
--- a/fs/9p/v9fs.h
+++ b/fs/9p/v9fs.h
@@ -20,6 +20,9 @@
20 * Boston, MA 02111-1301 USA 20 * Boston, MA 02111-1301 USA
21 * 21 *
22 */ 22 */
23#ifndef FS_9P_V9FS_H
24#define FS_9P_V9FS_H
25
23#include <linux/backing-dev.h> 26#include <linux/backing-dev.h>
24 27
25/** 28/**
@@ -28,8 +31,10 @@
28 * @V9FS_PROTO_2000L: whether or not to use 9P2000.l extensions 31 * @V9FS_PROTO_2000L: whether or not to use 9P2000.l extensions
29 * @V9FS_ACCESS_SINGLE: only the mounting user can access the hierarchy 32 * @V9FS_ACCESS_SINGLE: only the mounting user can access the hierarchy
30 * @V9FS_ACCESS_USER: a new attach will be issued for every user (default) 33 * @V9FS_ACCESS_USER: a new attach will be issued for every user (default)
34 * @V9FS_ACCESS_CLIENT: Just like user, but access check is performed on client.
31 * @V9FS_ACCESS_ANY: use a single attach for all users 35 * @V9FS_ACCESS_ANY: use a single attach for all users
32 * @V9FS_ACCESS_MASK: bit mask of different ACCESS options 36 * @V9FS_ACCESS_MASK: bit mask of different ACCESS options
37 * @V9FS_POSIX_ACL: POSIX ACLs are enforced
33 * 38 *
34 * Session flags reflect options selected by users at mount time 39 * Session flags reflect options selected by users at mount time
35 */ 40 */
@@ -37,13 +42,15 @@
37 V9FS_ACCESS_USER | \ 42 V9FS_ACCESS_USER | \
38 V9FS_ACCESS_CLIENT) 43 V9FS_ACCESS_CLIENT)
39#define V9FS_ACCESS_MASK V9FS_ACCESS_ANY 44#define V9FS_ACCESS_MASK V9FS_ACCESS_ANY
45#define V9FS_ACL_MASK V9FS_POSIX_ACL
40 46
41enum p9_session_flags { 47enum p9_session_flags {
42 V9FS_PROTO_2000U = 0x01, 48 V9FS_PROTO_2000U = 0x01,
43 V9FS_PROTO_2000L = 0x02, 49 V9FS_PROTO_2000L = 0x02,
44 V9FS_ACCESS_SINGLE = 0x04, 50 V9FS_ACCESS_SINGLE = 0x04,
45 V9FS_ACCESS_USER = 0x08, 51 V9FS_ACCESS_USER = 0x08,
46 V9FS_ACCESS_CLIENT = 0x10 52 V9FS_ACCESS_CLIENT = 0x10,
53 V9FS_POSIX_ACL = 0x20
47}; 54};
48 55
49/* possible values of ->cache */ 56/* possible values of ->cache */
@@ -109,8 +116,29 @@ struct v9fs_session_info {
109 struct list_head slist; /* list of sessions registered with v9fs */ 116 struct list_head slist; /* list of sessions registered with v9fs */
110 struct backing_dev_info bdi; 117 struct backing_dev_info bdi;
111 struct rw_semaphore rename_sem; 118 struct rw_semaphore rename_sem;
119 struct p9_fid *root_fid; /* Used for file system sync */
120};
121
122/* cache_validity flags */
123#define V9FS_INO_INVALID_ATTR 0x01
124
125struct v9fs_inode {
126#ifdef CONFIG_9P_FSCACHE
127 spinlock_t fscache_lock;
128 struct fscache_cookie *fscache;
129 struct p9_qid *fscache_key;
130#endif
131 unsigned int cache_validity;
132 struct p9_fid *writeback_fid;
133 struct mutex v_mutex;
134 struct inode vfs_inode;
112}; 135};
113 136
137static inline struct v9fs_inode *V9FS_I(const struct inode *inode)
138{
139 return container_of(inode, struct v9fs_inode, vfs_inode);
140}
141
114struct p9_fid *v9fs_session_init(struct v9fs_session_info *, const char *, 142struct p9_fid *v9fs_session_init(struct v9fs_session_info *, const char *,
115 char *); 143 char *);
116extern void v9fs_session_close(struct v9fs_session_info *v9ses); 144extern void v9fs_session_close(struct v9fs_session_info *v9ses);
@@ -124,16 +152,15 @@ extern int v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
124 struct inode *new_dir, struct dentry *new_dentry); 152 struct inode *new_dir, struct dentry *new_dentry);
125extern void v9fs_vfs_put_link(struct dentry *dentry, struct nameidata *nd, 153extern void v9fs_vfs_put_link(struct dentry *dentry, struct nameidata *nd,
126 void *p); 154 void *p);
127extern struct inode *v9fs_inode(struct v9fs_session_info *v9ses, 155extern struct inode *v9fs_inode_from_fid(struct v9fs_session_info *v9ses,
128 struct p9_fid *fid, 156 struct p9_fid *fid,
129 struct super_block *sb); 157 struct super_block *sb);
130
131extern const struct inode_operations v9fs_dir_inode_operations_dotl; 158extern const struct inode_operations v9fs_dir_inode_operations_dotl;
132extern const struct inode_operations v9fs_file_inode_operations_dotl; 159extern const struct inode_operations v9fs_file_inode_operations_dotl;
133extern const struct inode_operations v9fs_symlink_inode_operations_dotl; 160extern const struct inode_operations v9fs_symlink_inode_operations_dotl;
134extern struct inode *v9fs_inode_dotl(struct v9fs_session_info *v9ses, 161extern struct inode *v9fs_inode_from_fid_dotl(struct v9fs_session_info *v9ses,
135 struct p9_fid *fid, 162 struct p9_fid *fid,
136 struct super_block *sb); 163 struct super_block *sb);
137 164
138/* other default globals */ 165/* other default globals */
139#define V9FS_PORT 564 166#define V9FS_PORT 564
@@ -147,6 +174,11 @@ static inline struct v9fs_session_info *v9fs_inode2v9ses(struct inode *inode)
147 return (inode->i_sb->s_fs_info); 174 return (inode->i_sb->s_fs_info);
148} 175}
149 176
177static inline struct v9fs_session_info *v9fs_dentry2v9ses(struct dentry *dentry)
178{
179 return dentry->d_sb->s_fs_info;
180}
181
150static inline int v9fs_proto_dotu(struct v9fs_session_info *v9ses) 182static inline int v9fs_proto_dotu(struct v9fs_session_info *v9ses)
151{ 183{
152 return v9ses->flags & V9FS_PROTO_2000U; 184 return v9ses->flags & V9FS_PROTO_2000U;
@@ -158,7 +190,7 @@ static inline int v9fs_proto_dotl(struct v9fs_session_info *v9ses)
158} 190}
159 191
160/** 192/**
161 * v9fs_inode_from_fid - Helper routine to populate an inode by 193 * v9fs_get_inode_from_fid - Helper routine to populate an inode by
162 * issuing a attribute request 194 * issuing a attribute request
163 * @v9ses: session information 195 * @v9ses: session information
164 * @fid: fid to issue attribute request for 196 * @fid: fid to issue attribute request for
@@ -166,11 +198,12 @@ static inline int v9fs_proto_dotl(struct v9fs_session_info *v9ses)
166 * 198 *
167 */ 199 */
168static inline struct inode * 200static inline struct inode *
169v9fs_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid, 201v9fs_get_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid,
170 struct super_block *sb) 202 struct super_block *sb)
171{ 203{
172 if (v9fs_proto_dotl(v9ses)) 204 if (v9fs_proto_dotl(v9ses))
173 return v9fs_inode_dotl(v9ses, fid, sb); 205 return v9fs_inode_from_fid_dotl(v9ses, fid, sb);
174 else 206 else
175 return v9fs_inode(v9ses, fid, sb); 207 return v9fs_inode_from_fid(v9ses, fid, sb);
176} 208}
209#endif
diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h
index b789f8e597ec..4014160903a9 100644
--- a/fs/9p/v9fs_vfs.h
+++ b/fs/9p/v9fs_vfs.h
@@ -20,6 +20,8 @@
20 * Boston, MA 02111-1301 USA 20 * Boston, MA 02111-1301 USA
21 * 21 *
22 */ 22 */
23#ifndef FS_9P_V9FS_VFS_H
24#define FS_9P_V9FS_VFS_H
23 25
24/* plan9 semantics are that created files are implicitly opened. 26/* plan9 semantics are that created files are implicitly opened.
25 * But linux semantics are that you call create, then open. 27 * But linux semantics are that you call create, then open.
@@ -36,6 +38,7 @@
36 * unlink calls remove, which is an implicit clunk. So we have to track 38 * unlink calls remove, which is an implicit clunk. So we have to track
37 * that kind of thing so that we don't try to clunk a dead fid. 39 * that kind of thing so that we don't try to clunk a dead fid.
38 */ 40 */
41#define P9_LOCK_TIMEOUT (30*HZ)
39 42
40extern struct file_system_type v9fs_fs_type; 43extern struct file_system_type v9fs_fs_type;
41extern const struct address_space_operations v9fs_addr_operations; 44extern const struct address_space_operations v9fs_addr_operations;
@@ -45,13 +48,15 @@ extern const struct file_operations v9fs_dir_operations;
45extern const struct file_operations v9fs_dir_operations_dotl; 48extern const struct file_operations v9fs_dir_operations_dotl;
46extern const struct dentry_operations v9fs_dentry_operations; 49extern const struct dentry_operations v9fs_dentry_operations;
47extern const struct dentry_operations v9fs_cached_dentry_operations; 50extern const struct dentry_operations v9fs_cached_dentry_operations;
51extern const struct file_operations v9fs_cached_file_operations;
52extern const struct file_operations v9fs_cached_file_operations_dotl;
53extern struct kmem_cache *v9fs_inode_cache;
48 54
49#ifdef CONFIG_9P_FSCACHE
50struct inode *v9fs_alloc_inode(struct super_block *sb); 55struct inode *v9fs_alloc_inode(struct super_block *sb);
51void v9fs_destroy_inode(struct inode *inode); 56void v9fs_destroy_inode(struct inode *inode);
52#endif
53
54struct inode *v9fs_get_inode(struct super_block *sb, int mode); 57struct inode *v9fs_get_inode(struct super_block *sb, int mode);
58int v9fs_init_inode(struct v9fs_session_info *v9ses,
59 struct inode *inode, int mode);
55void v9fs_evict_inode(struct inode *inode); 60void v9fs_evict_inode(struct inode *inode);
56ino_t v9fs_qid2ino(struct p9_qid *qid); 61ino_t v9fs_qid2ino(struct p9_qid *qid);
57void v9fs_stat2inode(struct p9_wstat *, struct inode *, struct super_block *); 62void v9fs_stat2inode(struct p9_wstat *, struct inode *, struct super_block *);
@@ -62,8 +67,19 @@ void v9fs_inode2stat(struct inode *inode, struct p9_wstat *stat);
62int v9fs_uflags2omode(int uflags, int extended); 67int v9fs_uflags2omode(int uflags, int extended);
63 68
64ssize_t v9fs_file_readn(struct file *, char *, char __user *, u32, u64); 69ssize_t v9fs_file_readn(struct file *, char *, char __user *, u32, u64);
70ssize_t v9fs_fid_readn(struct p9_fid *, char *, char __user *, u32, u64);
65void v9fs_blank_wstat(struct p9_wstat *wstat); 71void v9fs_blank_wstat(struct p9_wstat *wstat);
66int v9fs_vfs_setattr_dotl(struct dentry *, struct iattr *); 72int v9fs_vfs_setattr_dotl(struct dentry *, struct iattr *);
67int v9fs_file_fsync_dotl(struct file *filp, int datasync); 73int v9fs_file_fsync_dotl(struct file *filp, int datasync);
68 74ssize_t v9fs_file_write_internal(struct inode *, struct p9_fid *,
69#define P9_LOCK_TIMEOUT (30*HZ) 75 const char __user *, size_t, loff_t *, int);
76int v9fs_refresh_inode(struct p9_fid *fid, struct inode *inode);
77int v9fs_refresh_inode_dotl(struct p9_fid *fid, struct inode *inode);
78static inline void v9fs_invalidate_inode_attr(struct inode *inode)
79{
80 struct v9fs_inode *v9inode;
81 v9inode = V9FS_I(inode);
82 v9inode->cache_validity |= V9FS_INO_INVALID_ATTR;
83 return;
84}
85#endif
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
index b7f2a8e3863e..2524e4cbb8ea 100644
--- a/fs/9p/vfs_addr.c
+++ b/fs/9p/vfs_addr.c
@@ -39,16 +39,16 @@
39#include "v9fs.h" 39#include "v9fs.h"
40#include "v9fs_vfs.h" 40#include "v9fs_vfs.h"
41#include "cache.h" 41#include "cache.h"
42#include "fid.h"
42 43
43/** 44/**
44 * v9fs_vfs_readpage - read an entire page in from 9P 45 * v9fs_fid_readpage - read an entire page in from 9P
45 * 46 *
46 * @filp: file being read 47 * @fid: fid being read
47 * @page: structure to page 48 * @page: structure to page
48 * 49 *
49 */ 50 */
50 51static int v9fs_fid_readpage(struct p9_fid *fid, struct page *page)
51static int v9fs_vfs_readpage(struct file *filp, struct page *page)
52{ 52{
53 int retval; 53 int retval;
54 loff_t offset; 54 loff_t offset;
@@ -67,7 +67,7 @@ static int v9fs_vfs_readpage(struct file *filp, struct page *page)
67 buffer = kmap(page); 67 buffer = kmap(page);
68 offset = page_offset(page); 68 offset = page_offset(page);
69 69
70 retval = v9fs_file_readn(filp, buffer, NULL, PAGE_CACHE_SIZE, offset); 70 retval = v9fs_fid_readn(fid, buffer, NULL, PAGE_CACHE_SIZE, offset);
71 if (retval < 0) { 71 if (retval < 0) {
72 v9fs_uncache_page(inode, page); 72 v9fs_uncache_page(inode, page);
73 goto done; 73 goto done;
@@ -87,6 +87,19 @@ done:
87} 87}
88 88
89/** 89/**
90 * v9fs_vfs_readpage - read an entire page in from 9P
91 *
92 * @filp: file being read
93 * @page: structure to page
94 *
95 */
96
97static int v9fs_vfs_readpage(struct file *filp, struct page *page)
98{
99 return v9fs_fid_readpage(filp->private_data, page);
100}
101
102/**
90 * v9fs_vfs_readpages - read a set of pages from 9P 103 * v9fs_vfs_readpages - read a set of pages from 9P
91 * 104 *
92 * @filp: file being read 105 * @filp: file being read
@@ -124,7 +137,6 @@ static int v9fs_release_page(struct page *page, gfp_t gfp)
124{ 137{
125 if (PagePrivate(page)) 138 if (PagePrivate(page))
126 return 0; 139 return 0;
127
128 return v9fs_fscache_release_page(page, gfp); 140 return v9fs_fscache_release_page(page, gfp);
129} 141}
130 142
@@ -137,20 +149,89 @@ static int v9fs_release_page(struct page *page, gfp_t gfp)
137 149
138static void v9fs_invalidate_page(struct page *page, unsigned long offset) 150static void v9fs_invalidate_page(struct page *page, unsigned long offset)
139{ 151{
152 /*
153 * If called with zero offset, we should release
154 * the private state assocated with the page
155 */
140 if (offset == 0) 156 if (offset == 0)
141 v9fs_fscache_invalidate_page(page); 157 v9fs_fscache_invalidate_page(page);
142} 158}
143 159
160static int v9fs_vfs_writepage_locked(struct page *page)
161{
162 char *buffer;
163 int retval, len;
164 loff_t offset, size;
165 mm_segment_t old_fs;
166 struct v9fs_inode *v9inode;
167 struct inode *inode = page->mapping->host;
168
169 v9inode = V9FS_I(inode);
170 size = i_size_read(inode);
171 if (page->index == size >> PAGE_CACHE_SHIFT)
172 len = size & ~PAGE_CACHE_MASK;
173 else
174 len = PAGE_CACHE_SIZE;
175
176 set_page_writeback(page);
177
178 buffer = kmap(page);
179 offset = page_offset(page);
180
181 old_fs = get_fs();
182 set_fs(get_ds());
183 /* We should have writeback_fid always set */
184 BUG_ON(!v9inode->writeback_fid);
185
186 retval = v9fs_file_write_internal(inode,
187 v9inode->writeback_fid,
188 (__force const char __user *)buffer,
189 len, &offset, 0);
190 if (retval > 0)
191 retval = 0;
192
193 set_fs(old_fs);
194 kunmap(page);
195 end_page_writeback(page);
196 return retval;
197}
198
199static int v9fs_vfs_writepage(struct page *page, struct writeback_control *wbc)
200{
201 int retval;
202
203 retval = v9fs_vfs_writepage_locked(page);
204 if (retval < 0) {
205 if (retval == -EAGAIN) {
206 redirty_page_for_writepage(wbc, page);
207 retval = 0;
208 } else {
209 SetPageError(page);
210 mapping_set_error(page->mapping, retval);
211 }
212 } else
213 retval = 0;
214
215 unlock_page(page);
216 return retval;
217}
218
144/** 219/**
145 * v9fs_launder_page - Writeback a dirty page 220 * v9fs_launder_page - Writeback a dirty page
146 * Since the writes go directly to the server, we simply return a 0
147 * here to indicate success.
148 *
149 * Returns 0 on success. 221 * Returns 0 on success.
150 */ 222 */
151 223
152static int v9fs_launder_page(struct page *page) 224static int v9fs_launder_page(struct page *page)
153{ 225{
226 int retval;
227 struct inode *inode = page->mapping->host;
228
229 v9fs_fscache_wait_on_page_write(inode, page);
230 if (clear_page_dirty_for_io(page)) {
231 retval = v9fs_vfs_writepage_locked(page);
232 if (retval)
233 return retval;
234 }
154 return 0; 235 return 0;
155} 236}
156 237
@@ -173,9 +254,15 @@ static int v9fs_launder_page(struct page *page)
173 * with an error. 254 * with an error.
174 * 255 *
175 */ 256 */
176ssize_t v9fs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, 257static ssize_t
177 loff_t pos, unsigned long nr_segs) 258v9fs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
259 loff_t pos, unsigned long nr_segs)
178{ 260{
261 /*
262 * FIXME
263 * Now that we do caching with cache mode enabled, We need
264 * to support direct IO
265 */
179 P9_DPRINTK(P9_DEBUG_VFS, "v9fs_direct_IO: v9fs_direct_IO (%s) " 266 P9_DPRINTK(P9_DEBUG_VFS, "v9fs_direct_IO: v9fs_direct_IO (%s) "
180 "off/no(%lld/%lu) EINVAL\n", 267 "off/no(%lld/%lu) EINVAL\n",
181 iocb->ki_filp->f_path.dentry->d_name.name, 268 iocb->ki_filp->f_path.dentry->d_name.name,
@@ -183,11 +270,84 @@ ssize_t v9fs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
183 270
184 return -EINVAL; 271 return -EINVAL;
185} 272}
273
274static int v9fs_write_begin(struct file *filp, struct address_space *mapping,
275 loff_t pos, unsigned len, unsigned flags,
276 struct page **pagep, void **fsdata)
277{
278 int retval = 0;
279 struct page *page;
280 struct v9fs_inode *v9inode;
281 pgoff_t index = pos >> PAGE_CACHE_SHIFT;
282 struct inode *inode = mapping->host;
283
284 v9inode = V9FS_I(inode);
285start:
286 page = grab_cache_page_write_begin(mapping, index, flags);
287 if (!page) {
288 retval = -ENOMEM;
289 goto out;
290 }
291 BUG_ON(!v9inode->writeback_fid);
292 if (PageUptodate(page))
293 goto out;
294
295 if (len == PAGE_CACHE_SIZE)
296 goto out;
297
298 retval = v9fs_fid_readpage(v9inode->writeback_fid, page);
299 page_cache_release(page);
300 if (!retval)
301 goto start;
302out:
303 *pagep = page;
304 return retval;
305}
306
307static int v9fs_write_end(struct file *filp, struct address_space *mapping,
308 loff_t pos, unsigned len, unsigned copied,
309 struct page *page, void *fsdata)
310{
311 loff_t last_pos = pos + copied;
312 struct inode *inode = page->mapping->host;
313
314 if (unlikely(copied < len)) {
315 /*
316 * zero out the rest of the area
317 */
318 unsigned from = pos & (PAGE_CACHE_SIZE - 1);
319
320 zero_user(page, from + copied, len - copied);
321 flush_dcache_page(page);
322 }
323
324 if (!PageUptodate(page))
325 SetPageUptodate(page);
326 /*
327 * No need to use i_size_read() here, the i_size
328 * cannot change under us because we hold the i_mutex.
329 */
330 if (last_pos > inode->i_size) {
331 inode_add_bytes(inode, last_pos - inode->i_size);
332 i_size_write(inode, last_pos);
333 }
334 set_page_dirty(page);
335 unlock_page(page);
336 page_cache_release(page);
337
338 return copied;
339}
340
341
186const struct address_space_operations v9fs_addr_operations = { 342const struct address_space_operations v9fs_addr_operations = {
187 .readpage = v9fs_vfs_readpage, 343 .readpage = v9fs_vfs_readpage,
188 .readpages = v9fs_vfs_readpages, 344 .readpages = v9fs_vfs_readpages,
189 .releasepage = v9fs_release_page, 345 .set_page_dirty = __set_page_dirty_nobuffers,
190 .invalidatepage = v9fs_invalidate_page, 346 .writepage = v9fs_vfs_writepage,
191 .launder_page = v9fs_launder_page, 347 .write_begin = v9fs_write_begin,
192 .direct_IO = v9fs_direct_IO, 348 .write_end = v9fs_write_end,
349 .releasepage = v9fs_release_page,
350 .invalidatepage = v9fs_invalidate_page,
351 .launder_page = v9fs_launder_page,
352 .direct_IO = v9fs_direct_IO,
193}; 353};
diff --git a/fs/9p/vfs_dentry.c b/fs/9p/vfs_dentry.c
index 233b7d4ffe5e..b6a3b9f7fe4d 100644
--- a/fs/9p/vfs_dentry.c
+++ b/fs/9p/vfs_dentry.c
@@ -63,20 +63,15 @@ static int v9fs_dentry_delete(const struct dentry *dentry)
63 * v9fs_cached_dentry_delete - called when dentry refcount equals 0 63 * v9fs_cached_dentry_delete - called when dentry refcount equals 0
64 * @dentry: dentry in question 64 * @dentry: dentry in question
65 * 65 *
66 * Only return 1 if our inode is invalid. Only non-synthetic files
67 * (ones without mtime == 0) should be calling this function.
68 *
69 */ 66 */
70
71static int v9fs_cached_dentry_delete(const struct dentry *dentry) 67static int v9fs_cached_dentry_delete(const struct dentry *dentry)
72{ 68{
73 struct inode *inode = dentry->d_inode; 69 P9_DPRINTK(P9_DEBUG_VFS, " dentry: %s (%p)\n",
74 P9_DPRINTK(P9_DEBUG_VFS, " dentry: %s (%p)\n", dentry->d_name.name, 70 dentry->d_name.name, dentry);
75 dentry);
76 71
77 if(!inode) 72 /* Don't cache negative dentries */
73 if (!dentry->d_inode)
78 return 1; 74 return 1;
79
80 return 0; 75 return 0;
81} 76}
82 77
@@ -105,7 +100,41 @@ static void v9fs_dentry_release(struct dentry *dentry)
105 } 100 }
106} 101}
107 102
103static int v9fs_lookup_revalidate(struct dentry *dentry, struct nameidata *nd)
104{
105 struct p9_fid *fid;
106 struct inode *inode;
107 struct v9fs_inode *v9inode;
108
109 if (nd->flags & LOOKUP_RCU)
110 return -ECHILD;
111
112 inode = dentry->d_inode;
113 if (!inode)
114 goto out_valid;
115
116 v9inode = V9FS_I(inode);
117 if (v9inode->cache_validity & V9FS_INO_INVALID_ATTR) {
118 int retval;
119 struct v9fs_session_info *v9ses;
120 fid = v9fs_fid_lookup(dentry);
121 if (IS_ERR(fid))
122 return PTR_ERR(fid);
123
124 v9ses = v9fs_inode2v9ses(inode);
125 if (v9fs_proto_dotl(v9ses))
126 retval = v9fs_refresh_inode_dotl(fid, inode);
127 else
128 retval = v9fs_refresh_inode(fid, inode);
129 if (retval <= 0)
130 return retval;
131 }
132out_valid:
133 return 1;
134}
135
108const struct dentry_operations v9fs_cached_dentry_operations = { 136const struct dentry_operations v9fs_cached_dentry_operations = {
137 .d_revalidate = v9fs_lookup_revalidate,
109 .d_delete = v9fs_cached_dentry_delete, 138 .d_delete = v9fs_cached_dentry_delete,
110 .d_release = v9fs_dentry_release, 139 .d_release = v9fs_dentry_release,
111}; 140};
diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c
index b84ebe8cefed..9c2bdda5cd9d 100644
--- a/fs/9p/vfs_dir.c
+++ b/fs/9p/vfs_dir.c
@@ -295,7 +295,6 @@ int v9fs_dir_release(struct inode *inode, struct file *filp)
295 P9_DPRINTK(P9_DEBUG_VFS, 295 P9_DPRINTK(P9_DEBUG_VFS,
296 "v9fs_dir_release: inode: %p filp: %p fid: %d\n", 296 "v9fs_dir_release: inode: %p filp: %p fid: %d\n",
297 inode, filp, fid ? fid->fid : -1); 297 inode, filp, fid ? fid->fid : -1);
298 filemap_write_and_wait(inode->i_mapping);
299 if (fid) 298 if (fid)
300 p9_client_clunk(fid); 299 p9_client_clunk(fid);
301 return 0; 300 return 0;
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 240c30674396..ffed55817f0c 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -44,8 +44,7 @@
44#include "fid.h" 44#include "fid.h"
45#include "cache.h" 45#include "cache.h"
46 46
47static const struct file_operations v9fs_cached_file_operations; 47static const struct vm_operations_struct v9fs_file_vm_ops;
48static const struct file_operations v9fs_cached_file_operations_dotl;
49 48
50/** 49/**
51 * v9fs_file_open - open a file (or directory) 50 * v9fs_file_open - open a file (or directory)
@@ -57,11 +56,13 @@ static const struct file_operations v9fs_cached_file_operations_dotl;
57int v9fs_file_open(struct inode *inode, struct file *file) 56int v9fs_file_open(struct inode *inode, struct file *file)
58{ 57{
59 int err; 58 int err;
59 struct v9fs_inode *v9inode;
60 struct v9fs_session_info *v9ses; 60 struct v9fs_session_info *v9ses;
61 struct p9_fid *fid; 61 struct p9_fid *fid;
62 int omode; 62 int omode;
63 63
64 P9_DPRINTK(P9_DEBUG_VFS, "inode: %p file: %p\n", inode, file); 64 P9_DPRINTK(P9_DEBUG_VFS, "inode: %p file: %p\n", inode, file);
65 v9inode = V9FS_I(inode);
65 v9ses = v9fs_inode2v9ses(inode); 66 v9ses = v9fs_inode2v9ses(inode);
66 if (v9fs_proto_dotl(v9ses)) 67 if (v9fs_proto_dotl(v9ses))
67 omode = file->f_flags; 68 omode = file->f_flags;
@@ -89,20 +90,34 @@ int v9fs_file_open(struct inode *inode, struct file *file)
89 } 90 }
90 91
91 file->private_data = fid; 92 file->private_data = fid;
92 if ((fid->qid.version) && (v9ses->cache)) { 93 mutex_lock(&v9inode->v_mutex);
93 P9_DPRINTK(P9_DEBUG_VFS, "cached"); 94 if (v9ses->cache && !v9inode->writeback_fid &&
94 /* enable cached file options */ 95 ((file->f_flags & O_ACCMODE) != O_RDONLY)) {
95 if(file->f_op == &v9fs_file_operations) 96 /*
96 file->f_op = &v9fs_cached_file_operations; 97 * clone a fid and add it to writeback_fid
97 else if (file->f_op == &v9fs_file_operations_dotl) 98 * we do it during open time instead of
98 file->f_op = &v9fs_cached_file_operations_dotl; 99 * page dirty time via write_begin/page_mkwrite
99 100 * because we want write after unlink usecase
101 * to work.
102 */
103 fid = v9fs_writeback_fid(file->f_path.dentry);
104 if (IS_ERR(fid)) {
105 err = PTR_ERR(fid);
106 mutex_unlock(&v9inode->v_mutex);
107 goto out_error;
108 }
109 v9inode->writeback_fid = (void *) fid;
110 }
111 mutex_unlock(&v9inode->v_mutex);
100#ifdef CONFIG_9P_FSCACHE 112#ifdef CONFIG_9P_FSCACHE
113 if (v9ses->cache)
101 v9fs_cache_inode_set_cookie(inode, file); 114 v9fs_cache_inode_set_cookie(inode, file);
102#endif 115#endif
103 }
104
105 return 0; 116 return 0;
117out_error:
118 p9_client_clunk(file->private_data);
119 file->private_data = NULL;
120 return err;
106} 121}
107 122
108/** 123/**
@@ -335,25 +350,22 @@ out_err:
335} 350}
336 351
337/** 352/**
338 * v9fs_file_readn - read from a file 353 * v9fs_fid_readn - read from a fid
339 * @filp: file pointer to read 354 * @fid: fid to read
340 * @data: data buffer to read data into 355 * @data: data buffer to read data into
341 * @udata: user data buffer to read data into 356 * @udata: user data buffer to read data into
342 * @count: size of buffer 357 * @count: size of buffer
343 * @offset: offset at which to read data 358 * @offset: offset at which to read data
344 * 359 *
345 */ 360 */
346
347ssize_t 361ssize_t
348v9fs_file_readn(struct file *filp, char *data, char __user *udata, u32 count, 362v9fs_fid_readn(struct p9_fid *fid, char *data, char __user *udata, u32 count,
349 u64 offset) 363 u64 offset)
350{ 364{
351 int n, total, size; 365 int n, total, size;
352 struct p9_fid *fid = filp->private_data;
353 366
354 P9_DPRINTK(P9_DEBUG_VFS, "fid %d offset %llu count %d\n", fid->fid, 367 P9_DPRINTK(P9_DEBUG_VFS, "fid %d offset %llu count %d\n", fid->fid,
355 (long long unsigned) offset, count); 368 (long long unsigned) offset, count);
356
357 n = 0; 369 n = 0;
358 total = 0; 370 total = 0;
359 size = fid->iounit ? fid->iounit : fid->clnt->msize - P9_IOHDRSZ; 371 size = fid->iounit ? fid->iounit : fid->clnt->msize - P9_IOHDRSZ;
@@ -379,6 +391,22 @@ v9fs_file_readn(struct file *filp, char *data, char __user *udata, u32 count,
379} 391}
380 392
381/** 393/**
394 * v9fs_file_readn - read from a file
395 * @filp: file pointer to read
396 * @data: data buffer to read data into
397 * @udata: user data buffer to read data into
398 * @count: size of buffer
399 * @offset: offset at which to read data
400 *
401 */
402ssize_t
403v9fs_file_readn(struct file *filp, char *data, char __user *udata, u32 count,
404 u64 offset)
405{
406 return v9fs_fid_readn(filp->private_data, data, udata, count, offset);
407}
408
409/**
382 * v9fs_file_read - read from a file 410 * v9fs_file_read - read from a file
383 * @filp: file pointer to read 411 * @filp: file pointer to read
384 * @udata: user data buffer to read data into 412 * @udata: user data buffer to read data into
@@ -410,45 +438,22 @@ v9fs_file_read(struct file *filp, char __user *udata, size_t count,
410 return ret; 438 return ret;
411} 439}
412 440
413/** 441ssize_t
414 * v9fs_file_write - write to a file 442v9fs_file_write_internal(struct inode *inode, struct p9_fid *fid,
415 * @filp: file pointer to write 443 const char __user *data, size_t count,
416 * @data: data buffer to write data from 444 loff_t *offset, int invalidate)
417 * @count: size of buffer
418 * @offset: offset at which to write data
419 *
420 */
421
422static ssize_t
423v9fs_file_write(struct file *filp, const char __user * data,
424 size_t count, loff_t * offset)
425{ 445{
426 ssize_t retval;
427 size_t total = 0;
428 int n; 446 int n;
429 struct p9_fid *fid; 447 loff_t i_size;
448 size_t total = 0;
430 struct p9_client *clnt; 449 struct p9_client *clnt;
431 struct inode *inode = filp->f_path.dentry->d_inode;
432 loff_t origin = *offset; 450 loff_t origin = *offset;
433 unsigned long pg_start, pg_end; 451 unsigned long pg_start, pg_end;
434 452
435 P9_DPRINTK(P9_DEBUG_VFS, "data %p count %d offset %x\n", data, 453 P9_DPRINTK(P9_DEBUG_VFS, "data %p count %d offset %x\n", data,
436 (int)count, (int)*offset); 454 (int)count, (int)*offset);
437 455
438 fid = filp->private_data;
439 clnt = fid->clnt; 456 clnt = fid->clnt;
440
441 retval = generic_write_checks(filp, &origin, &count, 0);
442 if (retval)
443 goto out;
444
445 retval = -EINVAL;
446 if ((ssize_t) count < 0)
447 goto out;
448 retval = 0;
449 if (!count)
450 goto out;
451
452 do { 457 do {
453 n = p9_client_write(fid, NULL, data+total, origin+total, count); 458 n = p9_client_write(fid, NULL, data+total, origin+total, count);
454 if (n <= 0) 459 if (n <= 0)
@@ -457,25 +462,63 @@ v9fs_file_write(struct file *filp, const char __user * data,
457 total += n; 462 total += n;
458 } while (count > 0); 463 } while (count > 0);
459 464
460 if (total > 0) { 465 if (invalidate && (total > 0)) {
461 pg_start = origin >> PAGE_CACHE_SHIFT; 466 pg_start = origin >> PAGE_CACHE_SHIFT;
462 pg_end = (origin + total - 1) >> PAGE_CACHE_SHIFT; 467 pg_end = (origin + total - 1) >> PAGE_CACHE_SHIFT;
463 if (inode->i_mapping && inode->i_mapping->nrpages) 468 if (inode->i_mapping && inode->i_mapping->nrpages)
464 invalidate_inode_pages2_range(inode->i_mapping, 469 invalidate_inode_pages2_range(inode->i_mapping,
465 pg_start, pg_end); 470 pg_start, pg_end);
466 *offset += total; 471 *offset += total;
467 i_size_write(inode, i_size_read(inode) + total); 472 i_size = i_size_read(inode);
468 inode->i_blocks = (i_size_read(inode) + 512 - 1) >> 9; 473 if (*offset > i_size) {
474 inode_add_bytes(inode, *offset - i_size);
475 i_size_write(inode, *offset);
476 }
469 } 477 }
470
471 if (n < 0) 478 if (n < 0)
472 retval = n; 479 return n;
473 else 480
474 retval = total; 481 return total;
482}
483
484/**
485 * v9fs_file_write - write to a file
486 * @filp: file pointer to write
487 * @data: data buffer to write data from
488 * @count: size of buffer
489 * @offset: offset at which to write data
490 *
491 */
492static ssize_t
493v9fs_file_write(struct file *filp, const char __user * data,
494 size_t count, loff_t *offset)
495{
496 ssize_t retval = 0;
497 loff_t origin = *offset;
498
499
500 retval = generic_write_checks(filp, &origin, &count, 0);
501 if (retval)
502 goto out;
503
504 retval = -EINVAL;
505 if ((ssize_t) count < 0)
506 goto out;
507 retval = 0;
508 if (!count)
509 goto out;
510
511 retval = v9fs_file_write_internal(filp->f_path.dentry->d_inode,
512 filp->private_data,
513 data, count, &origin, 1);
514 /* update offset on successful write */
515 if (retval > 0)
516 *offset = origin;
475out: 517out:
476 return retval; 518 return retval;
477} 519}
478 520
521
479static int v9fs_file_fsync(struct file *filp, int datasync) 522static int v9fs_file_fsync(struct file *filp, int datasync)
480{ 523{
481 struct p9_fid *fid; 524 struct p9_fid *fid;
@@ -505,28 +548,182 @@ int v9fs_file_fsync_dotl(struct file *filp, int datasync)
505 return retval; 548 return retval;
506} 549}
507 550
508static const struct file_operations v9fs_cached_file_operations = { 551static int
552v9fs_file_mmap(struct file *file, struct vm_area_struct *vma)
553{
554 int retval;
555
556 retval = generic_file_mmap(file, vma);
557 if (!retval)
558 vma->vm_ops = &v9fs_file_vm_ops;
559
560 return retval;
561}
562
563static int
564v9fs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
565{
566 struct v9fs_inode *v9inode;
567 struct page *page = vmf->page;
568 struct file *filp = vma->vm_file;
569 struct inode *inode = filp->f_path.dentry->d_inode;
570
571
572 P9_DPRINTK(P9_DEBUG_VFS, "page %p fid %lx\n",
573 page, (unsigned long)filp->private_data);
574
575 v9inode = V9FS_I(inode);
576 /* make sure the cache has finished storing the page */
577 v9fs_fscache_wait_on_page_write(inode, page);
578 BUG_ON(!v9inode->writeback_fid);
579 lock_page(page);
580 if (page->mapping != inode->i_mapping)
581 goto out_unlock;
582
583 return VM_FAULT_LOCKED;
584out_unlock:
585 unlock_page(page);
586 return VM_FAULT_NOPAGE;
587}
588
589static ssize_t
590v9fs_direct_read(struct file *filp, char __user *udata, size_t count,
591 loff_t *offsetp)
592{
593 loff_t size, offset;
594 struct inode *inode;
595 struct address_space *mapping;
596
597 offset = *offsetp;
598 mapping = filp->f_mapping;
599 inode = mapping->host;
600 if (!count)
601 return 0;
602 size = i_size_read(inode);
603 if (offset < size)
604 filemap_write_and_wait_range(mapping, offset,
605 offset + count - 1);
606
607 return v9fs_file_read(filp, udata, count, offsetp);
608}
609
610/**
611 * v9fs_cached_file_read - read from a file
612 * @filp: file pointer to read
613 * @udata: user data buffer to read data into
614 * @count: size of buffer
615 * @offset: offset at which to read data
616 *
617 */
618static ssize_t
619v9fs_cached_file_read(struct file *filp, char __user *data, size_t count,
620 loff_t *offset)
621{
622 if (filp->f_flags & O_DIRECT)
623 return v9fs_direct_read(filp, data, count, offset);
624 return do_sync_read(filp, data, count, offset);
625}
626
627static ssize_t
628v9fs_direct_write(struct file *filp, const char __user * data,
629 size_t count, loff_t *offsetp)
630{
631 loff_t offset;
632 ssize_t retval;
633 struct inode *inode;
634 struct address_space *mapping;
635
636 offset = *offsetp;
637 mapping = filp->f_mapping;
638 inode = mapping->host;
639 if (!count)
640 return 0;
641
642 mutex_lock(&inode->i_mutex);
643 retval = filemap_write_and_wait_range(mapping, offset,
644 offset + count - 1);
645 if (retval)
646 goto err_out;
647 /*
648 * After a write we want buffered reads to be sure to go to disk to get
649 * the new data. We invalidate clean cached page from the region we're
650 * about to write. We do this *before* the write so that if we fail
651 * here we fall back to buffered write
652 */
653 if (mapping->nrpages) {
654 pgoff_t pg_start = offset >> PAGE_CACHE_SHIFT;
655 pgoff_t pg_end = (offset + count - 1) >> PAGE_CACHE_SHIFT;
656
657 retval = invalidate_inode_pages2_range(mapping,
658 pg_start, pg_end);
659 /*
660 * If a page can not be invalidated, fall back
661 * to buffered write.
662 */
663 if (retval) {
664 if (retval == -EBUSY)
665 goto buff_write;
666 goto err_out;
667 }
668 }
669 retval = v9fs_file_write(filp, data, count, offsetp);
670err_out:
671 mutex_unlock(&inode->i_mutex);
672 return retval;
673
674buff_write:
675 mutex_unlock(&inode->i_mutex);
676 return do_sync_write(filp, data, count, offsetp);
677}
678
679/**
680 * v9fs_cached_file_write - write to a file
681 * @filp: file pointer to write
682 * @data: data buffer to write data from
683 * @count: size of buffer
684 * @offset: offset at which to write data
685 *
686 */
687static ssize_t
688v9fs_cached_file_write(struct file *filp, const char __user * data,
689 size_t count, loff_t *offset)
690{
691
692 if (filp->f_flags & O_DIRECT)
693 return v9fs_direct_write(filp, data, count, offset);
694 return do_sync_write(filp, data, count, offset);
695}
696
697static const struct vm_operations_struct v9fs_file_vm_ops = {
698 .fault = filemap_fault,
699 .page_mkwrite = v9fs_vm_page_mkwrite,
700};
701
702
703const struct file_operations v9fs_cached_file_operations = {
509 .llseek = generic_file_llseek, 704 .llseek = generic_file_llseek,
510 .read = do_sync_read, 705 .read = v9fs_cached_file_read,
706 .write = v9fs_cached_file_write,
511 .aio_read = generic_file_aio_read, 707 .aio_read = generic_file_aio_read,
512 .write = v9fs_file_write, 708 .aio_write = generic_file_aio_write,
513 .open = v9fs_file_open, 709 .open = v9fs_file_open,
514 .release = v9fs_dir_release, 710 .release = v9fs_dir_release,
515 .lock = v9fs_file_lock, 711 .lock = v9fs_file_lock,
516 .mmap = generic_file_readonly_mmap, 712 .mmap = v9fs_file_mmap,
517 .fsync = v9fs_file_fsync, 713 .fsync = v9fs_file_fsync,
518}; 714};
519 715
520static const struct file_operations v9fs_cached_file_operations_dotl = { 716const struct file_operations v9fs_cached_file_operations_dotl = {
521 .llseek = generic_file_llseek, 717 .llseek = generic_file_llseek,
522 .read = do_sync_read, 718 .read = v9fs_cached_file_read,
719 .write = v9fs_cached_file_write,
523 .aio_read = generic_file_aio_read, 720 .aio_read = generic_file_aio_read,
524 .write = v9fs_file_write, 721 .aio_write = generic_file_aio_write,
525 .open = v9fs_file_open, 722 .open = v9fs_file_open,
526 .release = v9fs_dir_release, 723 .release = v9fs_dir_release,
527 .lock = v9fs_file_lock_dotl, 724 .lock = v9fs_file_lock_dotl,
528 .flock = v9fs_file_flock_dotl, 725 .flock = v9fs_file_flock_dotl,
529 .mmap = generic_file_readonly_mmap, 726 .mmap = v9fs_file_mmap,
530 .fsync = v9fs_file_fsync_dotl, 727 .fsync = v9fs_file_fsync_dotl,
531}; 728};
532 729
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index b76a40bdf4c2..7f6c67703195 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -203,26 +203,26 @@ v9fs_blank_wstat(struct p9_wstat *wstat)
203 wstat->extension = NULL; 203 wstat->extension = NULL;
204} 204}
205 205
206#ifdef CONFIG_9P_FSCACHE
207/** 206/**
208 * v9fs_alloc_inode - helper function to allocate an inode 207 * v9fs_alloc_inode - helper function to allocate an inode
209 * This callback is executed before setting up the inode so that we
210 * can associate a vcookie with each inode.
211 * 208 *
212 */ 209 */
213
214struct inode *v9fs_alloc_inode(struct super_block *sb) 210struct inode *v9fs_alloc_inode(struct super_block *sb)
215{ 211{
216 struct v9fs_cookie *vcookie; 212 struct v9fs_inode *v9inode;
217 vcookie = (struct v9fs_cookie *)kmem_cache_alloc(vcookie_cache, 213 v9inode = (struct v9fs_inode *)kmem_cache_alloc(v9fs_inode_cache,
218 GFP_KERNEL); 214 GFP_KERNEL);
219 if (!vcookie) 215 if (!v9inode)
220 return NULL; 216 return NULL;
221 217#ifdef CONFIG_9P_FSCACHE
222 vcookie->fscache = NULL; 218 v9inode->fscache = NULL;
223 vcookie->qid = NULL; 219 v9inode->fscache_key = NULL;
224 spin_lock_init(&vcookie->lock); 220 spin_lock_init(&v9inode->fscache_lock);
225 return &vcookie->inode; 221#endif
222 v9inode->writeback_fid = NULL;
223 v9inode->cache_validity = 0;
224 mutex_init(&v9inode->v_mutex);
225 return &v9inode->vfs_inode;
226} 226}
227 227
228/** 228/**
@@ -234,35 +234,18 @@ static void v9fs_i_callback(struct rcu_head *head)
234{ 234{
235 struct inode *inode = container_of(head, struct inode, i_rcu); 235 struct inode *inode = container_of(head, struct inode, i_rcu);
236 INIT_LIST_HEAD(&inode->i_dentry); 236 INIT_LIST_HEAD(&inode->i_dentry);
237 kmem_cache_free(vcookie_cache, v9fs_inode2cookie(inode)); 237 kmem_cache_free(v9fs_inode_cache, V9FS_I(inode));
238} 238}
239 239
240void v9fs_destroy_inode(struct inode *inode) 240void v9fs_destroy_inode(struct inode *inode)
241{ 241{
242 call_rcu(&inode->i_rcu, v9fs_i_callback); 242 call_rcu(&inode->i_rcu, v9fs_i_callback);
243} 243}
244#endif
245
246/**
247 * v9fs_get_inode - helper function to setup an inode
248 * @sb: superblock
249 * @mode: mode to setup inode with
250 *
251 */
252 244
253struct inode *v9fs_get_inode(struct super_block *sb, int mode) 245int v9fs_init_inode(struct v9fs_session_info *v9ses,
246 struct inode *inode, int mode)
254{ 247{
255 int err; 248 int err = 0;
256 struct inode *inode;
257 struct v9fs_session_info *v9ses = sb->s_fs_info;
258
259 P9_DPRINTK(P9_DEBUG_VFS, "super block: %p mode: %o\n", sb, mode);
260
261 inode = new_inode(sb);
262 if (!inode) {
263 P9_EPRINTK(KERN_WARNING, "Problem allocating inode\n");
264 return ERR_PTR(-ENOMEM);
265 }
266 249
267 inode_init_owner(inode, NULL, mode); 250 inode_init_owner(inode, NULL, mode);
268 inode->i_blocks = 0; 251 inode->i_blocks = 0;
@@ -292,14 +275,20 @@ struct inode *v9fs_get_inode(struct super_block *sb, int mode)
292 case S_IFREG: 275 case S_IFREG:
293 if (v9fs_proto_dotl(v9ses)) { 276 if (v9fs_proto_dotl(v9ses)) {
294 inode->i_op = &v9fs_file_inode_operations_dotl; 277 inode->i_op = &v9fs_file_inode_operations_dotl;
295 inode->i_fop = &v9fs_file_operations_dotl; 278 if (v9ses->cache)
279 inode->i_fop =
280 &v9fs_cached_file_operations_dotl;
281 else
282 inode->i_fop = &v9fs_file_operations_dotl;
296 } else { 283 } else {
297 inode->i_op = &v9fs_file_inode_operations; 284 inode->i_op = &v9fs_file_inode_operations;
298 inode->i_fop = &v9fs_file_operations; 285 if (v9ses->cache)
286 inode->i_fop = &v9fs_cached_file_operations;
287 else
288 inode->i_fop = &v9fs_file_operations;
299 } 289 }
300 290
301 break; 291 break;
302
303 case S_IFLNK: 292 case S_IFLNK:
304 if (!v9fs_proto_dotu(v9ses) && !v9fs_proto_dotl(v9ses)) { 293 if (!v9fs_proto_dotu(v9ses) && !v9fs_proto_dotl(v9ses)) {
305 P9_DPRINTK(P9_DEBUG_ERROR, "extended modes used with " 294 P9_DPRINTK(P9_DEBUG_ERROR, "extended modes used with "
@@ -335,12 +324,37 @@ struct inode *v9fs_get_inode(struct super_block *sb, int mode)
335 err = -EINVAL; 324 err = -EINVAL;
336 goto error; 325 goto error;
337 } 326 }
327error:
328 return err;
338 329
339 return inode; 330}
340 331
341error: 332/**
342 iput(inode); 333 * v9fs_get_inode - helper function to setup an inode
343 return ERR_PTR(err); 334 * @sb: superblock
335 * @mode: mode to setup inode with
336 *
337 */
338
339struct inode *v9fs_get_inode(struct super_block *sb, int mode)
340{
341 int err;
342 struct inode *inode;
343 struct v9fs_session_info *v9ses = sb->s_fs_info;
344
345 P9_DPRINTK(P9_DEBUG_VFS, "super block: %p mode: %o\n", sb, mode);
346
347 inode = new_inode(sb);
348 if (!inode) {
349 P9_EPRINTK(KERN_WARNING, "Problem allocating inode\n");
350 return ERR_PTR(-ENOMEM);
351 }
352 err = v9fs_init_inode(v9ses, inode, mode);
353 if (err) {
354 iput(inode);
355 return ERR_PTR(err);
356 }
357 return inode;
344} 358}
345 359
346/* 360/*
@@ -403,6 +417,8 @@ error:
403 */ 417 */
404void v9fs_evict_inode(struct inode *inode) 418void v9fs_evict_inode(struct inode *inode)
405{ 419{
420 struct v9fs_inode *v9inode = V9FS_I(inode);
421
406 truncate_inode_pages(inode->i_mapping, 0); 422 truncate_inode_pages(inode->i_mapping, 0);
407 end_writeback(inode); 423 end_writeback(inode);
408 filemap_fdatawrite(inode->i_mapping); 424 filemap_fdatawrite(inode->i_mapping);
@@ -410,41 +426,67 @@ void v9fs_evict_inode(struct inode *inode)
410#ifdef CONFIG_9P_FSCACHE 426#ifdef CONFIG_9P_FSCACHE
411 v9fs_cache_inode_put_cookie(inode); 427 v9fs_cache_inode_put_cookie(inode);
412#endif 428#endif
429 /* clunk the fid stashed in writeback_fid */
430 if (v9inode->writeback_fid) {
431 p9_client_clunk(v9inode->writeback_fid);
432 v9inode->writeback_fid = NULL;
433 }
413} 434}
414 435
415struct inode * 436static struct inode *v9fs_qid_iget(struct super_block *sb,
416v9fs_inode(struct v9fs_session_info *v9ses, struct p9_fid *fid, 437 struct p9_qid *qid,
417 struct super_block *sb) 438 struct p9_wstat *st)
418{ 439{
419 int err, umode; 440 int retval, umode;
420 struct inode *ret = NULL; 441 unsigned long i_ino;
421 struct p9_wstat *st; 442 struct inode *inode;
422 443 struct v9fs_session_info *v9ses = sb->s_fs_info;
423 st = p9_client_stat(fid);
424 if (IS_ERR(st))
425 return ERR_CAST(st);
426 444
445 i_ino = v9fs_qid2ino(qid);
446 inode = iget_locked(sb, i_ino);
447 if (!inode)
448 return ERR_PTR(-ENOMEM);
449 if (!(inode->i_state & I_NEW))
450 return inode;
451 /*
452 * initialize the inode with the stat info
453 * FIXME!! we may need support for stale inodes
454 * later.
455 */
427 umode = p9mode2unixmode(v9ses, st->mode); 456 umode = p9mode2unixmode(v9ses, st->mode);
428 ret = v9fs_get_inode(sb, umode); 457 retval = v9fs_init_inode(v9ses, inode, umode);
429 if (IS_ERR(ret)) { 458 if (retval)
430 err = PTR_ERR(ret);
431 goto error; 459 goto error;
432 }
433
434 v9fs_stat2inode(st, ret, sb);
435 ret->i_ino = v9fs_qid2ino(&st->qid);
436 460
461 v9fs_stat2inode(st, inode, sb);
437#ifdef CONFIG_9P_FSCACHE 462#ifdef CONFIG_9P_FSCACHE
438 v9fs_vcookie_set_qid(ret, &st->qid); 463 v9fs_fscache_set_key(inode, &st->qid);
439 v9fs_cache_inode_get_cookie(ret); 464 v9fs_cache_inode_get_cookie(inode);
440#endif 465#endif
441 p9stat_free(st); 466 unlock_new_inode(inode);
442 kfree(st); 467 return inode;
443 return ret;
444error: 468error:
469 unlock_new_inode(inode);
470 iput(inode);
471 return ERR_PTR(retval);
472
473}
474
475struct inode *
476v9fs_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid,
477 struct super_block *sb)
478{
479 struct p9_wstat *st;
480 struct inode *inode = NULL;
481
482 st = p9_client_stat(fid);
483 if (IS_ERR(st))
484 return ERR_CAST(st);
485
486 inode = v9fs_qid_iget(sb, &st->qid, st);
445 p9stat_free(st); 487 p9stat_free(st);
446 kfree(st); 488 kfree(st);
447 return ERR_PTR(err); 489 return inode;
448} 490}
449 491
450/** 492/**
@@ -458,8 +500,8 @@ error:
458static int v9fs_remove(struct inode *dir, struct dentry *file, int rmdir) 500static int v9fs_remove(struct inode *dir, struct dentry *file, int rmdir)
459{ 501{
460 int retval; 502 int retval;
461 struct inode *file_inode;
462 struct p9_fid *v9fid; 503 struct p9_fid *v9fid;
504 struct inode *file_inode;
463 505
464 P9_DPRINTK(P9_DEBUG_VFS, "inode: %p dentry: %p rmdir: %d\n", dir, file, 506 P9_DPRINTK(P9_DEBUG_VFS, "inode: %p dentry: %p rmdir: %d\n", dir, file,
465 rmdir); 507 rmdir);
@@ -470,8 +512,20 @@ static int v9fs_remove(struct inode *dir, struct dentry *file, int rmdir)
470 return PTR_ERR(v9fid); 512 return PTR_ERR(v9fid);
471 513
472 retval = p9_client_remove(v9fid); 514 retval = p9_client_remove(v9fid);
473 if (!retval) 515 if (!retval) {
474 drop_nlink(file_inode); 516 /*
517 * directories on unlink should have zero
518 * link count
519 */
520 if (rmdir) {
521 clear_nlink(file_inode);
522 drop_nlink(dir);
523 } else
524 drop_nlink(file_inode);
525
526 v9fs_invalidate_inode_attr(file_inode);
527 v9fs_invalidate_inode_attr(dir);
528 }
475 return retval; 529 return retval;
476} 530}
477 531
@@ -531,7 +585,7 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir,
531 } 585 }
532 586
533 /* instantiate inode and assign the unopened fid to the dentry */ 587 /* instantiate inode and assign the unopened fid to the dentry */
534 inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb); 588 inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb);
535 if (IS_ERR(inode)) { 589 if (IS_ERR(inode)) {
536 err = PTR_ERR(inode); 590 err = PTR_ERR(inode);
537 P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", err); 591 P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", err);
@@ -570,9 +624,10 @@ v9fs_vfs_create(struct inode *dir, struct dentry *dentry, int mode,
570 int err; 624 int err;
571 u32 perm; 625 u32 perm;
572 int flags; 626 int flags;
573 struct v9fs_session_info *v9ses;
574 struct p9_fid *fid;
575 struct file *filp; 627 struct file *filp;
628 struct v9fs_inode *v9inode;
629 struct v9fs_session_info *v9ses;
630 struct p9_fid *fid, *inode_fid;
576 631
577 err = 0; 632 err = 0;
578 fid = NULL; 633 fid = NULL;
@@ -592,8 +647,29 @@ v9fs_vfs_create(struct inode *dir, struct dentry *dentry, int mode,
592 goto error; 647 goto error;
593 } 648 }
594 649
650 v9fs_invalidate_inode_attr(dir);
595 /* if we are opening a file, assign the open fid to the file */ 651 /* if we are opening a file, assign the open fid to the file */
596 if (nd && nd->flags & LOOKUP_OPEN) { 652 if (nd && nd->flags & LOOKUP_OPEN) {
653 v9inode = V9FS_I(dentry->d_inode);
654 mutex_lock(&v9inode->v_mutex);
655 if (v9ses->cache && !v9inode->writeback_fid &&
656 ((flags & O_ACCMODE) != O_RDONLY)) {
657 /*
658 * clone a fid and add it to writeback_fid
659 * we do it during open time instead of
660 * page dirty time via write_begin/page_mkwrite
661 * because we want write after unlink usecase
662 * to work.
663 */
664 inode_fid = v9fs_writeback_fid(dentry);
665 if (IS_ERR(inode_fid)) {
666 err = PTR_ERR(inode_fid);
667 mutex_unlock(&v9inode->v_mutex);
668 goto error;
669 }
670 v9inode->writeback_fid = (void *) inode_fid;
671 }
672 mutex_unlock(&v9inode->v_mutex);
597 filp = lookup_instantiate_filp(nd, dentry, generic_file_open); 673 filp = lookup_instantiate_filp(nd, dentry, generic_file_open);
598 if (IS_ERR(filp)) { 674 if (IS_ERR(filp)) {
599 err = PTR_ERR(filp); 675 err = PTR_ERR(filp);
@@ -601,6 +677,10 @@ v9fs_vfs_create(struct inode *dir, struct dentry *dentry, int mode,
601 } 677 }
602 678
603 filp->private_data = fid; 679 filp->private_data = fid;
680#ifdef CONFIG_9P_FSCACHE
681 if (v9ses->cache)
682 v9fs_cache_inode_set_cookie(dentry->d_inode, filp);
683#endif
604 } else 684 } else
605 p9_client_clunk(fid); 685 p9_client_clunk(fid);
606 686
@@ -625,8 +705,8 @@ static int v9fs_vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
625{ 705{
626 int err; 706 int err;
627 u32 perm; 707 u32 perm;
628 struct v9fs_session_info *v9ses;
629 struct p9_fid *fid; 708 struct p9_fid *fid;
709 struct v9fs_session_info *v9ses;
630 710
631 P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", dentry->d_name.name); 711 P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", dentry->d_name.name);
632 err = 0; 712 err = 0;
@@ -636,6 +716,9 @@ static int v9fs_vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
636 if (IS_ERR(fid)) { 716 if (IS_ERR(fid)) {
637 err = PTR_ERR(fid); 717 err = PTR_ERR(fid);
638 fid = NULL; 718 fid = NULL;
719 } else {
720 inc_nlink(dir);
721 v9fs_invalidate_inode_attr(dir);
639 } 722 }
640 723
641 if (fid) 724 if (fid)
@@ -687,7 +770,7 @@ struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
687 return ERR_PTR(result); 770 return ERR_PTR(result);
688 } 771 }
689 772
690 inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb); 773 inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb);
691 if (IS_ERR(inode)) { 774 if (IS_ERR(inode)) {
692 result = PTR_ERR(inode); 775 result = PTR_ERR(inode);
693 inode = NULL; 776 inode = NULL;
@@ -747,17 +830,19 @@ int
747v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry, 830v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
748 struct inode *new_dir, struct dentry *new_dentry) 831 struct inode *new_dir, struct dentry *new_dentry)
749{ 832{
833 int retval;
750 struct inode *old_inode; 834 struct inode *old_inode;
835 struct inode *new_inode;
751 struct v9fs_session_info *v9ses; 836 struct v9fs_session_info *v9ses;
752 struct p9_fid *oldfid; 837 struct p9_fid *oldfid;
753 struct p9_fid *olddirfid; 838 struct p9_fid *olddirfid;
754 struct p9_fid *newdirfid; 839 struct p9_fid *newdirfid;
755 struct p9_wstat wstat; 840 struct p9_wstat wstat;
756 int retval;
757 841
758 P9_DPRINTK(P9_DEBUG_VFS, "\n"); 842 P9_DPRINTK(P9_DEBUG_VFS, "\n");
759 retval = 0; 843 retval = 0;
760 old_inode = old_dentry->d_inode; 844 old_inode = old_dentry->d_inode;
845 new_inode = new_dentry->d_inode;
761 v9ses = v9fs_inode2v9ses(old_inode); 846 v9ses = v9fs_inode2v9ses(old_inode);
762 oldfid = v9fs_fid_lookup(old_dentry); 847 oldfid = v9fs_fid_lookup(old_dentry);
763 if (IS_ERR(oldfid)) 848 if (IS_ERR(oldfid))
@@ -798,9 +883,30 @@ v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
798 retval = p9_client_wstat(oldfid, &wstat); 883 retval = p9_client_wstat(oldfid, &wstat);
799 884
800clunk_newdir: 885clunk_newdir:
801 if (!retval) 886 if (!retval) {
887 if (new_inode) {
888 if (S_ISDIR(new_inode->i_mode))
889 clear_nlink(new_inode);
890 else
891 drop_nlink(new_inode);
892 /*
893 * Work around vfs rename rehash bug with
894 * FS_RENAME_DOES_D_MOVE
895 */
896 v9fs_invalidate_inode_attr(new_inode);
897 }
898 if (S_ISDIR(old_inode->i_mode)) {
899 if (!new_inode)
900 inc_nlink(new_dir);
901 drop_nlink(old_dir);
902 }
903 v9fs_invalidate_inode_attr(old_inode);
904 v9fs_invalidate_inode_attr(old_dir);
905 v9fs_invalidate_inode_attr(new_dir);
906
802 /* successful rename */ 907 /* successful rename */
803 d_move(old_dentry, new_dentry); 908 d_move(old_dentry, new_dentry);
909 }
804 up_write(&v9ses->rename_sem); 910 up_write(&v9ses->rename_sem);
805 p9_client_clunk(newdirfid); 911 p9_client_clunk(newdirfid);
806 912
@@ -830,10 +936,11 @@ v9fs_vfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
830 936
831 P9_DPRINTK(P9_DEBUG_VFS, "dentry: %p\n", dentry); 937 P9_DPRINTK(P9_DEBUG_VFS, "dentry: %p\n", dentry);
832 err = -EPERM; 938 err = -EPERM;
833 v9ses = v9fs_inode2v9ses(dentry->d_inode); 939 v9ses = v9fs_dentry2v9ses(dentry);
834 if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) 940 if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
835 return simple_getattr(mnt, dentry, stat); 941 generic_fillattr(dentry->d_inode, stat);
836 942 return 0;
943 }
837 fid = v9fs_fid_lookup(dentry); 944 fid = v9fs_fid_lookup(dentry);
838 if (IS_ERR(fid)) 945 if (IS_ERR(fid))
839 return PTR_ERR(fid); 946 return PTR_ERR(fid);
@@ -865,8 +972,12 @@ static int v9fs_vfs_setattr(struct dentry *dentry, struct iattr *iattr)
865 struct p9_wstat wstat; 972 struct p9_wstat wstat;
866 973
867 P9_DPRINTK(P9_DEBUG_VFS, "\n"); 974 P9_DPRINTK(P9_DEBUG_VFS, "\n");
975 retval = inode_change_ok(dentry->d_inode, iattr);
976 if (retval)
977 return retval;
978
868 retval = -EPERM; 979 retval = -EPERM;
869 v9ses = v9fs_inode2v9ses(dentry->d_inode); 980 v9ses = v9fs_dentry2v9ses(dentry);
870 fid = v9fs_fid_lookup(dentry); 981 fid = v9fs_fid_lookup(dentry);
871 if(IS_ERR(fid)) 982 if(IS_ERR(fid))
872 return PTR_ERR(fid); 983 return PTR_ERR(fid);
@@ -892,16 +1003,19 @@ static int v9fs_vfs_setattr(struct dentry *dentry, struct iattr *iattr)
892 wstat.n_gid = iattr->ia_gid; 1003 wstat.n_gid = iattr->ia_gid;
893 } 1004 }
894 1005
1006 /* Write all dirty data */
1007 if (S_ISREG(dentry->d_inode->i_mode))
1008 filemap_write_and_wait(dentry->d_inode->i_mapping);
1009
895 retval = p9_client_wstat(fid, &wstat); 1010 retval = p9_client_wstat(fid, &wstat);
896 if (retval < 0) 1011 if (retval < 0)
897 return retval; 1012 return retval;
898 1013
899 if ((iattr->ia_valid & ATTR_SIZE) && 1014 if ((iattr->ia_valid & ATTR_SIZE) &&
900 iattr->ia_size != i_size_read(dentry->d_inode)) { 1015 iattr->ia_size != i_size_read(dentry->d_inode))
901 retval = vmtruncate(dentry->d_inode, iattr->ia_size); 1016 truncate_setsize(dentry->d_inode, iattr->ia_size);
902 if (retval) 1017
903 return retval; 1018 v9fs_invalidate_inode_attr(dentry->d_inode);
904 }
905 1019
906 setattr_copy(dentry->d_inode, iattr); 1020 setattr_copy(dentry->d_inode, iattr);
907 mark_inode_dirty(dentry->d_inode); 1021 mark_inode_dirty(dentry->d_inode);
@@ -924,6 +1038,7 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode,
924 char tag_name[14]; 1038 char tag_name[14];
925 unsigned int i_nlink; 1039 unsigned int i_nlink;
926 struct v9fs_session_info *v9ses = sb->s_fs_info; 1040 struct v9fs_session_info *v9ses = sb->s_fs_info;
1041 struct v9fs_inode *v9inode = V9FS_I(inode);
927 1042
928 inode->i_nlink = 1; 1043 inode->i_nlink = 1;
929 1044
@@ -983,6 +1098,7 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode,
983 1098
984 /* not real number of blocks, but 512 byte ones ... */ 1099 /* not real number of blocks, but 512 byte ones ... */
985 inode->i_blocks = (i_size_read(inode) + 512 - 1) >> 9; 1100 inode->i_blocks = (i_size_read(inode) + 512 - 1) >> 9;
1101 v9inode->cache_validity &= ~V9FS_INO_INVALID_ATTR;
986} 1102}
987 1103
988/** 1104/**
@@ -1023,7 +1139,7 @@ static int v9fs_readlink(struct dentry *dentry, char *buffer, int buflen)
1023 1139
1024 P9_DPRINTK(P9_DEBUG_VFS, " %s\n", dentry->d_name.name); 1140 P9_DPRINTK(P9_DEBUG_VFS, " %s\n", dentry->d_name.name);
1025 retval = -EPERM; 1141 retval = -EPERM;
1026 v9ses = v9fs_inode2v9ses(dentry->d_inode); 1142 v9ses = v9fs_dentry2v9ses(dentry);
1027 fid = v9fs_fid_lookup(dentry); 1143 fid = v9fs_fid_lookup(dentry);
1028 if (IS_ERR(fid)) 1144 if (IS_ERR(fid))
1029 return PTR_ERR(fid); 1145 return PTR_ERR(fid);
@@ -1115,8 +1231,8 @@ static int v9fs_vfs_mkspecial(struct inode *dir, struct dentry *dentry,
1115 int mode, const char *extension) 1231 int mode, const char *extension)
1116{ 1232{
1117 u32 perm; 1233 u32 perm;
1118 struct v9fs_session_info *v9ses;
1119 struct p9_fid *fid; 1234 struct p9_fid *fid;
1235 struct v9fs_session_info *v9ses;
1120 1236
1121 v9ses = v9fs_inode2v9ses(dir); 1237 v9ses = v9fs_inode2v9ses(dir);
1122 if (!v9fs_proto_dotu(v9ses)) { 1238 if (!v9fs_proto_dotu(v9ses)) {
@@ -1130,6 +1246,7 @@ static int v9fs_vfs_mkspecial(struct inode *dir, struct dentry *dentry,
1130 if (IS_ERR(fid)) 1246 if (IS_ERR(fid))
1131 return PTR_ERR(fid); 1247 return PTR_ERR(fid);
1132 1248
1249 v9fs_invalidate_inode_attr(dir);
1133 p9_client_clunk(fid); 1250 p9_client_clunk(fid);
1134 return 0; 1251 return 0;
1135} 1252}
@@ -1166,8 +1283,8 @@ v9fs_vfs_link(struct dentry *old_dentry, struct inode *dir,
1166 struct dentry *dentry) 1283 struct dentry *dentry)
1167{ 1284{
1168 int retval; 1285 int retval;
1169 struct p9_fid *oldfid;
1170 char *name; 1286 char *name;
1287 struct p9_fid *oldfid;
1171 1288
1172 P9_DPRINTK(P9_DEBUG_VFS, 1289 P9_DPRINTK(P9_DEBUG_VFS,
1173 " %lu,%s,%s\n", dir->i_ino, dentry->d_name.name, 1290 " %lu,%s,%s\n", dir->i_ino, dentry->d_name.name,
@@ -1186,7 +1303,10 @@ v9fs_vfs_link(struct dentry *old_dentry, struct inode *dir,
1186 sprintf(name, "%d\n", oldfid->fid); 1303 sprintf(name, "%d\n", oldfid->fid);
1187 retval = v9fs_vfs_mkspecial(dir, dentry, P9_DMLINK, name); 1304 retval = v9fs_vfs_mkspecial(dir, dentry, P9_DMLINK, name);
1188 __putname(name); 1305 __putname(name);
1189 1306 if (!retval) {
1307 v9fs_refresh_inode(oldfid, old_dentry->d_inode);
1308 v9fs_invalidate_inode_attr(dir);
1309 }
1190clunk_fid: 1310clunk_fid:
1191 p9_client_clunk(oldfid); 1311 p9_client_clunk(oldfid);
1192 return retval; 1312 return retval;
@@ -1237,6 +1357,32 @@ v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev)
1237 return retval; 1357 return retval;
1238} 1358}
1239 1359
1360int v9fs_refresh_inode(struct p9_fid *fid, struct inode *inode)
1361{
1362 loff_t i_size;
1363 struct p9_wstat *st;
1364 struct v9fs_session_info *v9ses;
1365
1366 v9ses = v9fs_inode2v9ses(inode);
1367 st = p9_client_stat(fid);
1368 if (IS_ERR(st))
1369 return PTR_ERR(st);
1370
1371 spin_lock(&inode->i_lock);
1372 /*
1373 * We don't want to refresh inode->i_size,
1374 * because we may have cached data
1375 */
1376 i_size = inode->i_size;
1377 v9fs_stat2inode(st, inode, inode->i_sb);
1378 if (v9ses->cache)
1379 inode->i_size = i_size;
1380 spin_unlock(&inode->i_lock);
1381 p9stat_free(st);
1382 kfree(st);
1383 return 0;
1384}
1385
1240static const struct inode_operations v9fs_dir_inode_operations_dotu = { 1386static const struct inode_operations v9fs_dir_inode_operations_dotu = {
1241 .create = v9fs_vfs_create, 1387 .create = v9fs_vfs_create,
1242 .lookup = v9fs_vfs_lookup, 1388 .lookup = v9fs_vfs_lookup,
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index fe3ffa9aace4..ffbb113d5f33 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -86,40 +86,63 @@ static struct dentry *v9fs_dentry_from_dir_inode(struct inode *inode)
86 return dentry; 86 return dentry;
87} 87}
88 88
89static struct inode *v9fs_qid_iget_dotl(struct super_block *sb,
90 struct p9_qid *qid,
91 struct p9_fid *fid,
92 struct p9_stat_dotl *st)
93{
94 int retval;
95 unsigned long i_ino;
96 struct inode *inode;
97 struct v9fs_session_info *v9ses = sb->s_fs_info;
98
99 i_ino = v9fs_qid2ino(qid);
100 inode = iget_locked(sb, i_ino);
101 if (!inode)
102 return ERR_PTR(-ENOMEM);
103 if (!(inode->i_state & I_NEW))
104 return inode;
105 /*
106 * initialize the inode with the stat info
107 * FIXME!! we may need support for stale inodes
108 * later.
109 */
110 retval = v9fs_init_inode(v9ses, inode, st->st_mode);
111 if (retval)
112 goto error;
113
114 v9fs_stat2inode_dotl(st, inode);
115#ifdef CONFIG_9P_FSCACHE
116 v9fs_fscache_set_key(inode, &st->qid);
117 v9fs_cache_inode_get_cookie(inode);
118#endif
119 retval = v9fs_get_acl(inode, fid);
120 if (retval)
121 goto error;
122
123 unlock_new_inode(inode);
124 return inode;
125error:
126 unlock_new_inode(inode);
127 iput(inode);
128 return ERR_PTR(retval);
129
130}
131
89struct inode * 132struct inode *
90v9fs_inode_dotl(struct v9fs_session_info *v9ses, struct p9_fid *fid, 133v9fs_inode_from_fid_dotl(struct v9fs_session_info *v9ses, struct p9_fid *fid,
91 struct super_block *sb) 134 struct super_block *sb)
92{ 135{
93 struct inode *ret = NULL;
94 int err;
95 struct p9_stat_dotl *st; 136 struct p9_stat_dotl *st;
137 struct inode *inode = NULL;
96 138
97 st = p9_client_getattr_dotl(fid, P9_STATS_BASIC); 139 st = p9_client_getattr_dotl(fid, P9_STATS_BASIC);
98 if (IS_ERR(st)) 140 if (IS_ERR(st))
99 return ERR_CAST(st); 141 return ERR_CAST(st);
100 142
101 ret = v9fs_get_inode(sb, st->st_mode); 143 inode = v9fs_qid_iget_dotl(sb, &st->qid, fid, st);
102 if (IS_ERR(ret)) {
103 err = PTR_ERR(ret);
104 goto error;
105 }
106
107 v9fs_stat2inode_dotl(st, ret);
108 ret->i_ino = v9fs_qid2ino(&st->qid);
109#ifdef CONFIG_9P_FSCACHE
110 v9fs_vcookie_set_qid(ret, &st->qid);
111 v9fs_cache_inode_get_cookie(ret);
112#endif
113 err = v9fs_get_acl(ret, fid);
114 if (err) {
115 iput(ret);
116 goto error;
117 }
118 kfree(st); 144 kfree(st);
119 return ret; 145 return inode;
120error:
121 kfree(st);
122 return ERR_PTR(err);
123} 146}
124 147
125/** 148/**
@@ -136,16 +159,17 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode,
136 struct nameidata *nd) 159 struct nameidata *nd)
137{ 160{
138 int err = 0; 161 int err = 0;
139 char *name = NULL;
140 gid_t gid; 162 gid_t gid;
141 int flags; 163 int flags;
142 mode_t mode; 164 mode_t mode;
143 struct v9fs_session_info *v9ses; 165 char *name = NULL;
144 struct p9_fid *fid = NULL;
145 struct p9_fid *dfid, *ofid;
146 struct file *filp; 166 struct file *filp;
147 struct p9_qid qid; 167 struct p9_qid qid;
148 struct inode *inode; 168 struct inode *inode;
169 struct p9_fid *fid = NULL;
170 struct v9fs_inode *v9inode;
171 struct p9_fid *dfid, *ofid, *inode_fid;
172 struct v9fs_session_info *v9ses;
149 struct posix_acl *pacl = NULL, *dacl = NULL; 173 struct posix_acl *pacl = NULL, *dacl = NULL;
150 174
151 v9ses = v9fs_inode2v9ses(dir); 175 v9ses = v9fs_inode2v9ses(dir);
@@ -196,6 +220,7 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode,
196 err); 220 err);
197 goto error; 221 goto error;
198 } 222 }
223 v9fs_invalidate_inode_attr(dir);
199 224
200 /* instantiate inode and assign the unopened fid to the dentry */ 225 /* instantiate inode and assign the unopened fid to the dentry */
201 fid = p9_client_walk(dfid, 1, &name, 1); 226 fid = p9_client_walk(dfid, 1, &name, 1);
@@ -205,7 +230,7 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode,
205 fid = NULL; 230 fid = NULL;
206 goto error; 231 goto error;
207 } 232 }
208 inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb); 233 inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb);
209 if (IS_ERR(inode)) { 234 if (IS_ERR(inode)) {
210 err = PTR_ERR(inode); 235 err = PTR_ERR(inode);
211 P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", err); 236 P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", err);
@@ -219,6 +244,26 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode,
219 /* Now set the ACL based on the default value */ 244 /* Now set the ACL based on the default value */
220 v9fs_set_create_acl(dentry, dacl, pacl); 245 v9fs_set_create_acl(dentry, dacl, pacl);
221 246
247 v9inode = V9FS_I(inode);
248 mutex_lock(&v9inode->v_mutex);
249 if (v9ses->cache && !v9inode->writeback_fid &&
250 ((flags & O_ACCMODE) != O_RDONLY)) {
251 /*
252 * clone a fid and add it to writeback_fid
253 * we do it during open time instead of
254 * page dirty time via write_begin/page_mkwrite
255 * because we want write after unlink usecase
256 * to work.
257 */
258 inode_fid = v9fs_writeback_fid(dentry);
259 if (IS_ERR(inode_fid)) {
260 err = PTR_ERR(inode_fid);
261 mutex_unlock(&v9inode->v_mutex);
262 goto error;
263 }
264 v9inode->writeback_fid = (void *) inode_fid;
265 }
266 mutex_unlock(&v9inode->v_mutex);
222 /* Since we are opening a file, assign the open fid to the file */ 267 /* Since we are opening a file, assign the open fid to the file */
223 filp = lookup_instantiate_filp(nd, dentry, generic_file_open); 268 filp = lookup_instantiate_filp(nd, dentry, generic_file_open);
224 if (IS_ERR(filp)) { 269 if (IS_ERR(filp)) {
@@ -226,6 +271,10 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode,
226 return PTR_ERR(filp); 271 return PTR_ERR(filp);
227 } 272 }
228 filp->private_data = ofid; 273 filp->private_data = ofid;
274#ifdef CONFIG_9P_FSCACHE
275 if (v9ses->cache)
276 v9fs_cache_inode_set_cookie(inode, filp);
277#endif
229 return 0; 278 return 0;
230 279
231error: 280error:
@@ -300,7 +349,7 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir,
300 goto error; 349 goto error;
301 } 350 }
302 351
303 inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb); 352 inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb);
304 if (IS_ERR(inode)) { 353 if (IS_ERR(inode)) {
305 err = PTR_ERR(inode); 354 err = PTR_ERR(inode);
306 P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", 355 P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n",
@@ -327,7 +376,8 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir,
327 } 376 }
328 /* Now set the ACL based on the default value */ 377 /* Now set the ACL based on the default value */
329 v9fs_set_create_acl(dentry, dacl, pacl); 378 v9fs_set_create_acl(dentry, dacl, pacl);
330 379 inc_nlink(dir);
380 v9fs_invalidate_inode_attr(dir);
331error: 381error:
332 if (fid) 382 if (fid)
333 p9_client_clunk(fid); 383 p9_client_clunk(fid);
@@ -345,10 +395,11 @@ v9fs_vfs_getattr_dotl(struct vfsmount *mnt, struct dentry *dentry,
345 395
346 P9_DPRINTK(P9_DEBUG_VFS, "dentry: %p\n", dentry); 396 P9_DPRINTK(P9_DEBUG_VFS, "dentry: %p\n", dentry);
347 err = -EPERM; 397 err = -EPERM;
348 v9ses = v9fs_inode2v9ses(dentry->d_inode); 398 v9ses = v9fs_dentry2v9ses(dentry);
349 if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) 399 if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
350 return simple_getattr(mnt, dentry, stat); 400 generic_fillattr(dentry->d_inode, stat);
351 401 return 0;
402 }
352 fid = v9fs_fid_lookup(dentry); 403 fid = v9fs_fid_lookup(dentry);
353 if (IS_ERR(fid)) 404 if (IS_ERR(fid))
354 return PTR_ERR(fid); 405 return PTR_ERR(fid);
@@ -401,22 +452,24 @@ int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr)
401 p9attr.mtime_nsec = iattr->ia_mtime.tv_nsec; 452 p9attr.mtime_nsec = iattr->ia_mtime.tv_nsec;
402 453
403 retval = -EPERM; 454 retval = -EPERM;
404 v9ses = v9fs_inode2v9ses(dentry->d_inode); 455 v9ses = v9fs_dentry2v9ses(dentry);
405 fid = v9fs_fid_lookup(dentry); 456 fid = v9fs_fid_lookup(dentry);
406 if (IS_ERR(fid)) 457 if (IS_ERR(fid))
407 return PTR_ERR(fid); 458 return PTR_ERR(fid);
408 459
460 /* Write all dirty data */
461 if (S_ISREG(dentry->d_inode->i_mode))
462 filemap_write_and_wait(dentry->d_inode->i_mapping);
463
409 retval = p9_client_setattr(fid, &p9attr); 464 retval = p9_client_setattr(fid, &p9attr);
410 if (retval < 0) 465 if (retval < 0)
411 return retval; 466 return retval;
412 467
413 if ((iattr->ia_valid & ATTR_SIZE) && 468 if ((iattr->ia_valid & ATTR_SIZE) &&
414 iattr->ia_size != i_size_read(dentry->d_inode)) { 469 iattr->ia_size != i_size_read(dentry->d_inode))
415 retval = vmtruncate(dentry->d_inode, iattr->ia_size); 470 truncate_setsize(dentry->d_inode, iattr->ia_size);
416 if (retval)
417 return retval;
418 }
419 471
472 v9fs_invalidate_inode_attr(dentry->d_inode);
420 setattr_copy(dentry->d_inode, iattr); 473 setattr_copy(dentry->d_inode, iattr);
421 mark_inode_dirty(dentry->d_inode); 474 mark_inode_dirty(dentry->d_inode);
422 if (iattr->ia_valid & ATTR_MODE) { 475 if (iattr->ia_valid & ATTR_MODE) {
@@ -439,6 +492,7 @@ int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr)
439void 492void
440v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode) 493v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode)
441{ 494{
495 struct v9fs_inode *v9inode = V9FS_I(inode);
442 496
443 if ((stat->st_result_mask & P9_STATS_BASIC) == P9_STATS_BASIC) { 497 if ((stat->st_result_mask & P9_STATS_BASIC) == P9_STATS_BASIC) {
444 inode->i_atime.tv_sec = stat->st_atime_sec; 498 inode->i_atime.tv_sec = stat->st_atime_sec;
@@ -497,20 +551,21 @@ v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode)
497 /* Currently we don't support P9_STATS_BTIME and P9_STATS_DATA_VERSION 551 /* Currently we don't support P9_STATS_BTIME and P9_STATS_DATA_VERSION
498 * because the inode structure does not have fields for them. 552 * because the inode structure does not have fields for them.
499 */ 553 */
554 v9inode->cache_validity &= ~V9FS_INO_INVALID_ATTR;
500} 555}
501 556
502static int 557static int
503v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry, 558v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry,
504 const char *symname) 559 const char *symname)
505{ 560{
506 struct v9fs_session_info *v9ses;
507 struct p9_fid *dfid;
508 struct p9_fid *fid = NULL;
509 struct inode *inode;
510 struct p9_qid qid;
511 char *name;
512 int err; 561 int err;
513 gid_t gid; 562 gid_t gid;
563 char *name;
564 struct p9_qid qid;
565 struct inode *inode;
566 struct p9_fid *dfid;
567 struct p9_fid *fid = NULL;
568 struct v9fs_session_info *v9ses;
514 569
515 name = (char *) dentry->d_name.name; 570 name = (char *) dentry->d_name.name;
516 P9_DPRINTK(P9_DEBUG_VFS, "v9fs_vfs_symlink_dotl : %lu,%s,%s\n", 571 P9_DPRINTK(P9_DEBUG_VFS, "v9fs_vfs_symlink_dotl : %lu,%s,%s\n",
@@ -534,6 +589,7 @@ v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry,
534 goto error; 589 goto error;
535 } 590 }
536 591
592 v9fs_invalidate_inode_attr(dir);
537 if (v9ses->cache) { 593 if (v9ses->cache) {
538 /* Now walk from the parent so we can get an unopened fid. */ 594 /* Now walk from the parent so we can get an unopened fid. */
539 fid = p9_client_walk(dfid, 1, &name, 1); 595 fid = p9_client_walk(dfid, 1, &name, 1);
@@ -546,7 +602,7 @@ v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry,
546 } 602 }
547 603
548 /* instantiate inode and assign the unopened fid to dentry */ 604 /* instantiate inode and assign the unopened fid to dentry */
549 inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb); 605 inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb);
550 if (IS_ERR(inode)) { 606 if (IS_ERR(inode)) {
551 err = PTR_ERR(inode); 607 err = PTR_ERR(inode);
552 P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", 608 P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n",
@@ -588,10 +644,10 @@ v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir,
588 struct dentry *dentry) 644 struct dentry *dentry)
589{ 645{
590 int err; 646 int err;
591 struct p9_fid *dfid, *oldfid;
592 char *name; 647 char *name;
593 struct v9fs_session_info *v9ses;
594 struct dentry *dir_dentry; 648 struct dentry *dir_dentry;
649 struct p9_fid *dfid, *oldfid;
650 struct v9fs_session_info *v9ses;
595 651
596 P9_DPRINTK(P9_DEBUG_VFS, "dir ino: %lu, old_name: %s, new_name: %s\n", 652 P9_DPRINTK(P9_DEBUG_VFS, "dir ino: %lu, old_name: %s, new_name: %s\n",
597 dir->i_ino, old_dentry->d_name.name, 653 dir->i_ino, old_dentry->d_name.name,
@@ -616,29 +672,17 @@ v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir,
616 return err; 672 return err;
617 } 673 }
618 674
675 v9fs_invalidate_inode_attr(dir);
619 if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) { 676 if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
620 /* Get the latest stat info from server. */ 677 /* Get the latest stat info from server. */
621 struct p9_fid *fid; 678 struct p9_fid *fid;
622 struct p9_stat_dotl *st;
623
624 fid = v9fs_fid_lookup(old_dentry); 679 fid = v9fs_fid_lookup(old_dentry);
625 if (IS_ERR(fid)) 680 if (IS_ERR(fid))
626 return PTR_ERR(fid); 681 return PTR_ERR(fid);
627 682
628 st = p9_client_getattr_dotl(fid, P9_STATS_BASIC); 683 v9fs_refresh_inode_dotl(fid, old_dentry->d_inode);
629 if (IS_ERR(st))
630 return PTR_ERR(st);
631
632 v9fs_stat2inode_dotl(st, old_dentry->d_inode);
633
634 kfree(st);
635 } else {
636 /* Caching disabled. No need to get upto date stat info.
637 * This dentry will be released immediately. So, just hold the
638 * inode
639 */
640 ihold(old_dentry->d_inode);
641 } 684 }
685 ihold(old_dentry->d_inode);
642 d_instantiate(dentry, old_dentry->d_inode); 686 d_instantiate(dentry, old_dentry->d_inode);
643 687
644 return err; 688 return err;
@@ -657,12 +701,12 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode,
657 dev_t rdev) 701 dev_t rdev)
658{ 702{
659 int err; 703 int err;
704 gid_t gid;
660 char *name; 705 char *name;
661 mode_t mode; 706 mode_t mode;
662 struct v9fs_session_info *v9ses; 707 struct v9fs_session_info *v9ses;
663 struct p9_fid *fid = NULL, *dfid = NULL; 708 struct p9_fid *fid = NULL, *dfid = NULL;
664 struct inode *inode; 709 struct inode *inode;
665 gid_t gid;
666 struct p9_qid qid; 710 struct p9_qid qid;
667 struct dentry *dir_dentry; 711 struct dentry *dir_dentry;
668 struct posix_acl *dacl = NULL, *pacl = NULL; 712 struct posix_acl *dacl = NULL, *pacl = NULL;
@@ -699,6 +743,7 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode,
699 if (err < 0) 743 if (err < 0)
700 goto error; 744 goto error;
701 745
746 v9fs_invalidate_inode_attr(dir);
702 /* instantiate inode and assign the unopened fid to the dentry */ 747 /* instantiate inode and assign the unopened fid to the dentry */
703 if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) { 748 if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
704 fid = p9_client_walk(dfid, 1, &name, 1); 749 fid = p9_client_walk(dfid, 1, &name, 1);
@@ -710,7 +755,7 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode,
710 goto error; 755 goto error;
711 } 756 }
712 757
713 inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb); 758 inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb);
714 if (IS_ERR(inode)) { 759 if (IS_ERR(inode)) {
715 err = PTR_ERR(inode); 760 err = PTR_ERR(inode);
716 P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", 761 P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n",
@@ -782,6 +827,31 @@ ndset:
782 return NULL; 827 return NULL;
783} 828}
784 829
830int v9fs_refresh_inode_dotl(struct p9_fid *fid, struct inode *inode)
831{
832 loff_t i_size;
833 struct p9_stat_dotl *st;
834 struct v9fs_session_info *v9ses;
835
836 v9ses = v9fs_inode2v9ses(inode);
837 st = p9_client_getattr_dotl(fid, P9_STATS_ALL);
838 if (IS_ERR(st))
839 return PTR_ERR(st);
840
841 spin_lock(&inode->i_lock);
842 /*
843 * We don't want to refresh inode->i_size,
844 * because we may have cached data
845 */
846 i_size = inode->i_size;
847 v9fs_stat2inode_dotl(st, inode);
848 if (v9ses->cache)
849 inode->i_size = i_size;
850 spin_unlock(&inode->i_lock);
851 kfree(st);
852 return 0;
853}
854
785const struct inode_operations v9fs_dir_inode_operations_dotl = { 855const struct inode_operations v9fs_dir_inode_operations_dotl = {
786 .create = v9fs_vfs_create_dotl, 856 .create = v9fs_vfs_create_dotl,
787 .lookup = v9fs_vfs_lookup, 857 .lookup = v9fs_vfs_lookup,
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index dbaabe3b8131..f3eed3383e4f 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -86,12 +86,15 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses,
86 } else 86 } else
87 sb->s_op = &v9fs_super_ops; 87 sb->s_op = &v9fs_super_ops;
88 sb->s_bdi = &v9ses->bdi; 88 sb->s_bdi = &v9ses->bdi;
89 if (v9ses->cache)
90 sb->s_bdi->ra_pages = (VM_MAX_READAHEAD * 1024)/PAGE_CACHE_SIZE;
89 91
90 sb->s_flags = flags | MS_ACTIVE | MS_SYNCHRONOUS | MS_DIRSYNC | 92 sb->s_flags = flags | MS_ACTIVE | MS_DIRSYNC | MS_NOATIME;
91 MS_NOATIME; 93 if (!v9ses->cache)
94 sb->s_flags |= MS_SYNCHRONOUS;
92 95
93#ifdef CONFIG_9P_FS_POSIX_ACL 96#ifdef CONFIG_9P_FS_POSIX_ACL
94 if ((v9ses->flags & V9FS_ACCESS_MASK) == V9FS_ACCESS_CLIENT) 97 if ((v9ses->flags & V9FS_ACL_MASK) == V9FS_POSIX_ACL)
95 sb->s_flags |= MS_POSIXACL; 98 sb->s_flags |= MS_POSIXACL;
96#endif 99#endif
97 100
@@ -151,7 +154,6 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags,
151 retval = PTR_ERR(inode); 154 retval = PTR_ERR(inode);
152 goto release_sb; 155 goto release_sb;
153 } 156 }
154
155 root = d_alloc_root(inode); 157 root = d_alloc_root(inode);
156 if (!root) { 158 if (!root) {
157 iput(inode); 159 iput(inode);
@@ -166,7 +168,7 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags,
166 retval = PTR_ERR(st); 168 retval = PTR_ERR(st);
167 goto release_sb; 169 goto release_sb;
168 } 170 }
169 171 root->d_inode->i_ino = v9fs_qid2ino(&st->qid);
170 v9fs_stat2inode_dotl(st, root->d_inode); 172 v9fs_stat2inode_dotl(st, root->d_inode);
171 kfree(st); 173 kfree(st);
172 } else { 174 } else {
@@ -183,10 +185,21 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags,
183 p9stat_free(st); 185 p9stat_free(st);
184 kfree(st); 186 kfree(st);
185 } 187 }
188 v9fs_fid_add(root, fid);
186 retval = v9fs_get_acl(inode, fid); 189 retval = v9fs_get_acl(inode, fid);
187 if (retval) 190 if (retval)
188 goto release_sb; 191 goto release_sb;
189 v9fs_fid_add(root, fid); 192 /*
193 * Add the root fid to session info. This is used
194 * for file system sync. We want a cloned fid here
195 * so that we can do a sync_filesystem after a
196 * shrink_dcache_for_umount
197 */
198 v9ses->root_fid = v9fs_fid_clone(root);
199 if (IS_ERR(v9ses->root_fid)) {
200 retval = PTR_ERR(v9ses->root_fid);
201 goto release_sb;
202 }
190 203
191 P9_DPRINTK(P9_DEBUG_VFS, " simple set mount, return 0\n"); 204 P9_DPRINTK(P9_DEBUG_VFS, " simple set mount, return 0\n");
192 return dget(sb->s_root); 205 return dget(sb->s_root);
@@ -197,15 +210,11 @@ close_session:
197 v9fs_session_close(v9ses); 210 v9fs_session_close(v9ses);
198 kfree(v9ses); 211 kfree(v9ses);
199 return ERR_PTR(retval); 212 return ERR_PTR(retval);
200
201release_sb: 213release_sb:
202 /* 214 /*
203 * we will do the session_close and root dentry release 215 * we will do the session_close and root dentry
204 * in the below call. But we need to clunk fid, because we haven't 216 * release in the below call.
205 * attached the fid to dentry so it won't get clunked
206 * automatically.
207 */ 217 */
208 p9_client_clunk(fid);
209 deactivate_locked_super(sb); 218 deactivate_locked_super(sb);
210 return ERR_PTR(retval); 219 return ERR_PTR(retval);
211} 220}
@@ -223,7 +232,7 @@ static void v9fs_kill_super(struct super_block *s)
223 P9_DPRINTK(P9_DEBUG_VFS, " %p\n", s); 232 P9_DPRINTK(P9_DEBUG_VFS, " %p\n", s);
224 233
225 kill_anon_super(s); 234 kill_anon_super(s);
226 235 p9_client_clunk(v9ses->root_fid);
227 v9fs_session_cancel(v9ses); 236 v9fs_session_cancel(v9ses);
228 v9fs_session_close(v9ses); 237 v9fs_session_close(v9ses);
229 kfree(v9ses); 238 kfree(v9ses);
@@ -253,7 +262,7 @@ static int v9fs_statfs(struct dentry *dentry, struct kstatfs *buf)
253 goto done; 262 goto done;
254 } 263 }
255 264
256 v9ses = v9fs_inode2v9ses(dentry->d_inode); 265 v9ses = v9fs_dentry2v9ses(dentry);
257 if (v9fs_proto_dotl(v9ses)) { 266 if (v9fs_proto_dotl(v9ses)) {
258 res = p9_client_statfs(fid, &rs); 267 res = p9_client_statfs(fid, &rs);
259 if (res == 0) { 268 if (res == 0) {
@@ -276,11 +285,31 @@ done:
276 return res; 285 return res;
277} 286}
278 287
288static int v9fs_sync_fs(struct super_block *sb, int wait)
289{
290 struct v9fs_session_info *v9ses = sb->s_fs_info;
291
292 P9_DPRINTK(P9_DEBUG_VFS, "v9fs_sync_fs: super_block %p\n", sb);
293 return p9_client_sync_fs(v9ses->root_fid);
294}
295
296static int v9fs_drop_inode(struct inode *inode)
297{
298 struct v9fs_session_info *v9ses;
299 v9ses = v9fs_inode2v9ses(inode);
300 if (v9ses->cache)
301 return generic_drop_inode(inode);
302 /*
303 * in case of non cached mode always drop the
304 * the inode because we want the inode attribute
305 * to always match that on the server.
306 */
307 return 1;
308}
309
279static const struct super_operations v9fs_super_ops = { 310static const struct super_operations v9fs_super_ops = {
280#ifdef CONFIG_9P_FSCACHE
281 .alloc_inode = v9fs_alloc_inode, 311 .alloc_inode = v9fs_alloc_inode,
282 .destroy_inode = v9fs_destroy_inode, 312 .destroy_inode = v9fs_destroy_inode,
283#endif
284 .statfs = simple_statfs, 313 .statfs = simple_statfs,
285 .evict_inode = v9fs_evict_inode, 314 .evict_inode = v9fs_evict_inode,
286 .show_options = generic_show_options, 315 .show_options = generic_show_options,
@@ -288,11 +317,11 @@ static const struct super_operations v9fs_super_ops = {
288}; 317};
289 318
290static const struct super_operations v9fs_super_ops_dotl = { 319static const struct super_operations v9fs_super_ops_dotl = {
291#ifdef CONFIG_9P_FSCACHE
292 .alloc_inode = v9fs_alloc_inode, 320 .alloc_inode = v9fs_alloc_inode,
293 .destroy_inode = v9fs_destroy_inode, 321 .destroy_inode = v9fs_destroy_inode,
294#endif 322 .sync_fs = v9fs_sync_fs,
295 .statfs = v9fs_statfs, 323 .statfs = v9fs_statfs,
324 .drop_inode = v9fs_drop_inode,
296 .evict_inode = v9fs_evict_inode, 325 .evict_inode = v9fs_evict_inode,
297 .show_options = generic_show_options, 326 .show_options = generic_show_options,
298 .umount_begin = v9fs_umount_begin, 327 .umount_begin = v9fs_umount_begin,
@@ -303,5 +332,5 @@ struct file_system_type v9fs_fs_type = {
303 .mount = v9fs_mount, 332 .mount = v9fs_mount,
304 .kill_sb = v9fs_kill_super, 333 .kill_sb = v9fs_kill_super,
305 .owner = THIS_MODULE, 334 .owner = THIS_MODULE,
306 .fs_flags = FS_RENAME_DOES_D_MOVE, 335 .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT,
307}; 336};
diff --git a/fs/Kconfig b/fs/Kconfig
index 3db9caa57edc..f3aa9b08b228 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -47,7 +47,7 @@ config FS_POSIX_ACL
47 def_bool n 47 def_bool n
48 48
49config EXPORTFS 49config EXPORTFS
50 tristate 50 bool
51 51
52config FILE_LOCKING 52config FILE_LOCKING
53 bool "Enable POSIX file locking API" if EXPERT 53 bool "Enable POSIX file locking API" if EXPERT
@@ -187,6 +187,7 @@ source "fs/omfs/Kconfig"
187source "fs/hpfs/Kconfig" 187source "fs/hpfs/Kconfig"
188source "fs/qnx4/Kconfig" 188source "fs/qnx4/Kconfig"
189source "fs/romfs/Kconfig" 189source "fs/romfs/Kconfig"
190source "fs/pstore/Kconfig"
190source "fs/sysv/Kconfig" 191source "fs/sysv/Kconfig"
191source "fs/ufs/Kconfig" 192source "fs/ufs/Kconfig"
192source "fs/exofs/Kconfig" 193source "fs/exofs/Kconfig"
diff --git a/fs/Makefile b/fs/Makefile
index a7f7cef0c0c8..fb68c2b8cf8a 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -48,6 +48,8 @@ obj-$(CONFIG_FS_POSIX_ACL) += posix_acl.o xattr_acl.o
48obj-$(CONFIG_NFS_COMMON) += nfs_common/ 48obj-$(CONFIG_NFS_COMMON) += nfs_common/
49obj-$(CONFIG_GENERIC_ACL) += generic_acl.o 49obj-$(CONFIG_GENERIC_ACL) += generic_acl.o
50 50
51obj-$(CONFIG_FHANDLE) += fhandle.o
52
51obj-y += quota/ 53obj-y += quota/
52 54
53obj-$(CONFIG_PROC_FS) += proc/ 55obj-$(CONFIG_PROC_FS) += proc/
@@ -121,3 +123,4 @@ obj-$(CONFIG_BTRFS_FS) += btrfs/
121obj-$(CONFIG_GFS2_FS) += gfs2/ 123obj-$(CONFIG_GFS2_FS) += gfs2/
122obj-$(CONFIG_EXOFS_FS) += exofs/ 124obj-$(CONFIG_EXOFS_FS) += exofs/
123obj-$(CONFIG_CEPH_FS) += ceph/ 125obj-$(CONFIG_CEPH_FS) += ceph/
126obj-$(CONFIG_PSTORE) += pstore/
diff --git a/fs/adfs/Kconfig b/fs/adfs/Kconfig
index 1dd5f34b3cf2..e55182a74605 100644
--- a/fs/adfs/Kconfig
+++ b/fs/adfs/Kconfig
@@ -1,7 +1,6 @@
1config ADFS_FS 1config ADFS_FS
2 tristate "ADFS file system support (EXPERIMENTAL)" 2 tristate "ADFS file system support (EXPERIMENTAL)"
3 depends on BLOCK && EXPERIMENTAL 3 depends on BLOCK && EXPERIMENTAL
4 depends on BKL # need to fix
5 help 4 help
6 The Acorn Disc Filing System is the standard file system of the 5 The Acorn Disc Filing System is the standard file system of the
7 RiscOS operating system which runs on Acorn's ARM-based Risc PC 6 RiscOS operating system which runs on Acorn's ARM-based Risc PC
diff --git a/fs/adfs/adfs.h b/fs/adfs/adfs.h
index 2ff622f6f547..718ac1f440c6 100644
--- a/fs/adfs/adfs.h
+++ b/fs/adfs/adfs.h
@@ -50,6 +50,7 @@ struct adfs_sb_info {
50 gid_t s_gid; /* owner gid */ 50 gid_t s_gid; /* owner gid */
51 umode_t s_owner_mask; /* ADFS owner perm -> unix perm */ 51 umode_t s_owner_mask; /* ADFS owner perm -> unix perm */
52 umode_t s_other_mask; /* ADFS other perm -> unix perm */ 52 umode_t s_other_mask; /* ADFS other perm -> unix perm */
53 int s_ftsuffix; /* ,xyz hex filetype suffix option */
53 54
54 __u32 s_ids_per_zone; /* max. no ids in one zone */ 55 __u32 s_ids_per_zone; /* max. no ids in one zone */
55 __u32 s_idlen; /* length of ID in map */ 56 __u32 s_idlen; /* length of ID in map */
@@ -79,6 +80,10 @@ struct adfs_dir {
79 80
80 int nr_buffers; 81 int nr_buffers;
81 struct buffer_head *bh[4]; 82 struct buffer_head *bh[4];
83
84 /* big directories need allocated buffers */
85 struct buffer_head **bh_fplus;
86
82 unsigned int pos; 87 unsigned int pos;
83 unsigned int parent_id; 88 unsigned int parent_id;
84 89
@@ -89,7 +94,7 @@ struct adfs_dir {
89/* 94/*
90 * This is the overall maximum name length 95 * This is the overall maximum name length
91 */ 96 */
92#define ADFS_MAX_NAME_LEN 256 97#define ADFS_MAX_NAME_LEN (256 + 4) /* +4 for ,xyz hex filetype suffix */
93struct object_info { 98struct object_info {
94 __u32 parent_id; /* parent object id */ 99 __u32 parent_id; /* parent object id */
95 __u32 file_id; /* object id */ 100 __u32 file_id; /* object id */
@@ -97,10 +102,26 @@ struct object_info {
97 __u32 execaddr; /* execution address */ 102 __u32 execaddr; /* execution address */
98 __u32 size; /* size */ 103 __u32 size; /* size */
99 __u8 attr; /* RISC OS attributes */ 104 __u8 attr; /* RISC OS attributes */
100 unsigned char name_len; /* name length */ 105 unsigned int name_len; /* name length */
101 char name[ADFS_MAX_NAME_LEN];/* file name */ 106 char name[ADFS_MAX_NAME_LEN];/* file name */
107
108 /* RISC OS file type (12-bit: derived from loadaddr) */
109 __u16 filetype;
102}; 110};
103 111
112/* RISC OS 12-bit filetype converts to ,xyz hex filename suffix */
113static inline int append_filetype_suffix(char *buf, __u16 filetype)
114{
115 if (filetype == 0xffff) /* no explicit 12-bit file type was set */
116 return 0;
117
118 *buf++ = ',';
119 *buf++ = hex_asc_lo(filetype >> 8);
120 *buf++ = hex_asc_lo(filetype >> 4);
121 *buf++ = hex_asc_lo(filetype >> 0);
122 return 4;
123}
124
104struct adfs_dir_ops { 125struct adfs_dir_ops {
105 int (*read)(struct super_block *sb, unsigned int id, unsigned int sz, struct adfs_dir *dir); 126 int (*read)(struct super_block *sb, unsigned int id, unsigned int sz, struct adfs_dir *dir);
106 int (*setpos)(struct adfs_dir *dir, unsigned int fpos); 127 int (*setpos)(struct adfs_dir *dir, unsigned int fpos);
diff --git a/fs/adfs/dir.c b/fs/adfs/dir.c
index 3b4a764ed780..3d83075aaa2e 100644
--- a/fs/adfs/dir.c
+++ b/fs/adfs/dir.c
@@ -9,7 +9,6 @@
9 * 9 *
10 * Common directory handling for ADFS 10 * Common directory handling for ADFS
11 */ 11 */
12#include <linux/smp_lock.h>
13#include "adfs.h" 12#include "adfs.h"
14 13
15/* 14/*
@@ -27,8 +26,6 @@ adfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
27 struct adfs_dir dir; 26 struct adfs_dir dir;
28 int ret = 0; 27 int ret = 0;
29 28
30 lock_kernel();
31
32 if (filp->f_pos >> 32) 29 if (filp->f_pos >> 32)
33 goto out; 30 goto out;
34 31
@@ -70,7 +67,6 @@ free_out:
70 ops->free(&dir); 67 ops->free(&dir);
71 68
72out: 69out:
73 unlock_kernel();
74 return ret; 70 return ret;
75} 71}
76 72
@@ -276,7 +272,6 @@ adfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
276 struct object_info obj; 272 struct object_info obj;
277 int error; 273 int error;
278 274
279 lock_kernel();
280 error = adfs_dir_lookup_byname(dir, &dentry->d_name, &obj); 275 error = adfs_dir_lookup_byname(dir, &dentry->d_name, &obj);
281 if (error == 0) { 276 if (error == 0) {
282 error = -EACCES; 277 error = -EACCES;
@@ -288,7 +283,6 @@ adfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
288 if (inode) 283 if (inode)
289 error = 0; 284 error = 0;
290 } 285 }
291 unlock_kernel();
292 d_add(dentry, inode); 286 d_add(dentry, inode);
293 return ERR_PTR(error); 287 return ERR_PTR(error);
294} 288}
diff --git a/fs/adfs/dir_f.c b/fs/adfs/dir_f.c
index bafc71222e25..4bbe853ee50a 100644
--- a/fs/adfs/dir_f.c
+++ b/fs/adfs/dir_f.c
@@ -52,7 +52,6 @@ static inline int adfs_readname(char *buf, char *ptr, int maxlen)
52 *buf++ = *ptr; 52 *buf++ = *ptr;
53 ptr++; 53 ptr++;
54 } 54 }
55 *buf = '\0';
56 55
57 return buf - old_buf; 56 return buf - old_buf;
58} 57}
@@ -208,7 +207,8 @@ release_buffers:
208 * convert a disk-based directory entry to a Linux ADFS directory entry 207 * convert a disk-based directory entry to a Linux ADFS directory entry
209 */ 208 */
210static inline void 209static inline void
211adfs_dir2obj(struct object_info *obj, struct adfs_direntry *de) 210adfs_dir2obj(struct adfs_dir *dir, struct object_info *obj,
211 struct adfs_direntry *de)
212{ 212{
213 obj->name_len = adfs_readname(obj->name, de->dirobname, ADFS_F_NAME_LEN); 213 obj->name_len = adfs_readname(obj->name, de->dirobname, ADFS_F_NAME_LEN);
214 obj->file_id = adfs_readval(de->dirinddiscadd, 3); 214 obj->file_id = adfs_readval(de->dirinddiscadd, 3);
@@ -216,6 +216,23 @@ adfs_dir2obj(struct object_info *obj, struct adfs_direntry *de)
216 obj->execaddr = adfs_readval(de->direxec, 4); 216 obj->execaddr = adfs_readval(de->direxec, 4);
217 obj->size = adfs_readval(de->dirlen, 4); 217 obj->size = adfs_readval(de->dirlen, 4);
218 obj->attr = de->newdiratts; 218 obj->attr = de->newdiratts;
219 obj->filetype = -1;
220
221 /*
222 * object is a file and is filetyped and timestamped?
223 * RISC OS 12-bit filetype is stored in load_address[19:8]
224 */
225 if ((0 == (obj->attr & ADFS_NDA_DIRECTORY)) &&
226 (0xfff00000 == (0xfff00000 & obj->loadaddr))) {
227 obj->filetype = (__u16) ((0x000fff00 & obj->loadaddr) >> 8);
228
229 /* optionally append the ,xyz hex filetype suffix */
230 if (ADFS_SB(dir->sb)->s_ftsuffix)
231 obj->name_len +=
232 append_filetype_suffix(
233 &obj->name[obj->name_len],
234 obj->filetype);
235 }
219} 236}
220 237
221/* 238/*
@@ -260,7 +277,7 @@ __adfs_dir_get(struct adfs_dir *dir, int pos, struct object_info *obj)
260 if (!de.dirobname[0]) 277 if (!de.dirobname[0])
261 return -ENOENT; 278 return -ENOENT;
262 279
263 adfs_dir2obj(obj, &de); 280 adfs_dir2obj(dir, obj, &de);
264 281
265 return 0; 282 return 0;
266} 283}
diff --git a/fs/adfs/dir_fplus.c b/fs/adfs/dir_fplus.c
index 1796bb352d05..d9e3bee4e653 100644
--- a/fs/adfs/dir_fplus.c
+++ b/fs/adfs/dir_fplus.c
@@ -8,6 +8,7 @@
8 * published by the Free Software Foundation. 8 * published by the Free Software Foundation.
9 */ 9 */
10#include <linux/buffer_head.h> 10#include <linux/buffer_head.h>
11#include <linux/slab.h>
11#include "adfs.h" 12#include "adfs.h"
12#include "dir_fplus.h" 13#include "dir_fplus.h"
13 14
@@ -22,30 +23,53 @@ adfs_fplus_read(struct super_block *sb, unsigned int id, unsigned int sz, struct
22 23
23 dir->nr_buffers = 0; 24 dir->nr_buffers = 0;
24 25
26 /* start off using fixed bh set - only alloc for big dirs */
27 dir->bh_fplus = &dir->bh[0];
28
25 block = __adfs_block_map(sb, id, 0); 29 block = __adfs_block_map(sb, id, 0);
26 if (!block) { 30 if (!block) {
27 adfs_error(sb, "dir object %X has a hole at offset 0", id); 31 adfs_error(sb, "dir object %X has a hole at offset 0", id);
28 goto out; 32 goto out;
29 } 33 }
30 34
31 dir->bh[0] = sb_bread(sb, block); 35 dir->bh_fplus[0] = sb_bread(sb, block);
32 if (!dir->bh[0]) 36 if (!dir->bh_fplus[0])
33 goto out; 37 goto out;
34 dir->nr_buffers += 1; 38 dir->nr_buffers += 1;
35 39
36 h = (struct adfs_bigdirheader *)dir->bh[0]->b_data; 40 h = (struct adfs_bigdirheader *)dir->bh_fplus[0]->b_data;
37 size = le32_to_cpu(h->bigdirsize); 41 size = le32_to_cpu(h->bigdirsize);
38 if (size != sz) { 42 if (size != sz) {
39 printk(KERN_WARNING "adfs: adfs_fplus_read: directory header size\n" 43 printk(KERN_WARNING "adfs: adfs_fplus_read:"
40 " does not match directory size\n"); 44 " directory header size %X\n"
45 " does not match directory size %X\n",
46 size, sz);
41 } 47 }
42 48
43 if (h->bigdirversion[0] != 0 || h->bigdirversion[1] != 0 || 49 if (h->bigdirversion[0] != 0 || h->bigdirversion[1] != 0 ||
44 h->bigdirversion[2] != 0 || size & 2047 || 50 h->bigdirversion[2] != 0 || size & 2047 ||
45 h->bigdirstartname != cpu_to_le32(BIGDIRSTARTNAME)) 51 h->bigdirstartname != cpu_to_le32(BIGDIRSTARTNAME)) {
52 printk(KERN_WARNING "adfs: dir object %X has"
53 " malformed dir header\n", id);
46 goto out; 54 goto out;
55 }
47 56
48 size >>= sb->s_blocksize_bits; 57 size >>= sb->s_blocksize_bits;
58 if (size > sizeof(dir->bh)/sizeof(dir->bh[0])) {
59 /* this directory is too big for fixed bh set, must allocate */
60 struct buffer_head **bh_fplus =
61 kzalloc(size * sizeof(struct buffer_head *),
62 GFP_KERNEL);
63 if (!bh_fplus) {
64 adfs_error(sb, "not enough memory for"
65 " dir object %X (%d blocks)", id, size);
66 goto out;
67 }
68 dir->bh_fplus = bh_fplus;
69 /* copy over the pointer to the block that we've already read */
70 dir->bh_fplus[0] = dir->bh[0];
71 }
72
49 for (blk = 1; blk < size; blk++) { 73 for (blk = 1; blk < size; blk++) {
50 block = __adfs_block_map(sb, id, blk); 74 block = __adfs_block_map(sb, id, blk);
51 if (!block) { 75 if (!block) {
@@ -53,25 +77,44 @@ adfs_fplus_read(struct super_block *sb, unsigned int id, unsigned int sz, struct
53 goto out; 77 goto out;
54 } 78 }
55 79
56 dir->bh[blk] = sb_bread(sb, block); 80 dir->bh_fplus[blk] = sb_bread(sb, block);
57 if (!dir->bh[blk]) 81 if (!dir->bh_fplus[blk]) {
82 adfs_error(sb, "dir object %X failed read for"
83 " offset %d, mapped block %X",
84 id, blk, block);
58 goto out; 85 goto out;
59 dir->nr_buffers = blk; 86 }
87
88 dir->nr_buffers += 1;
60 } 89 }
61 90
62 t = (struct adfs_bigdirtail *)(dir->bh[size - 1]->b_data + (sb->s_blocksize - 8)); 91 t = (struct adfs_bigdirtail *)
92 (dir->bh_fplus[size - 1]->b_data + (sb->s_blocksize - 8));
63 93
64 if (t->bigdirendname != cpu_to_le32(BIGDIRENDNAME) || 94 if (t->bigdirendname != cpu_to_le32(BIGDIRENDNAME) ||
65 t->bigdirendmasseq != h->startmasseq || 95 t->bigdirendmasseq != h->startmasseq ||
66 t->reserved[0] != 0 || t->reserved[1] != 0) 96 t->reserved[0] != 0 || t->reserved[1] != 0) {
97 printk(KERN_WARNING "adfs: dir object %X has "
98 "malformed dir end\n", id);
67 goto out; 99 goto out;
100 }
68 101
69 dir->parent_id = le32_to_cpu(h->bigdirparent); 102 dir->parent_id = le32_to_cpu(h->bigdirparent);
70 dir->sb = sb; 103 dir->sb = sb;
71 return 0; 104 return 0;
105
72out: 106out:
73 for (i = 0; i < dir->nr_buffers; i++) 107 if (dir->bh_fplus) {
74 brelse(dir->bh[i]); 108 for (i = 0; i < dir->nr_buffers; i++)
109 brelse(dir->bh_fplus[i]);
110
111 if (&dir->bh[0] != dir->bh_fplus)
112 kfree(dir->bh_fplus);
113
114 dir->bh_fplus = NULL;
115 }
116
117 dir->nr_buffers = 0;
75 dir->sb = NULL; 118 dir->sb = NULL;
76 return ret; 119 return ret;
77} 120}
@@ -79,7 +122,8 @@ out:
79static int 122static int
80adfs_fplus_setpos(struct adfs_dir *dir, unsigned int fpos) 123adfs_fplus_setpos(struct adfs_dir *dir, unsigned int fpos)
81{ 124{
82 struct adfs_bigdirheader *h = (struct adfs_bigdirheader *)dir->bh[0]->b_data; 125 struct adfs_bigdirheader *h =
126 (struct adfs_bigdirheader *) dir->bh_fplus[0]->b_data;
83 int ret = -ENOENT; 127 int ret = -ENOENT;
84 128
85 if (fpos <= le32_to_cpu(h->bigdirentries)) { 129 if (fpos <= le32_to_cpu(h->bigdirentries)) {
@@ -102,21 +146,27 @@ dir_memcpy(struct adfs_dir *dir, unsigned int offset, void *to, int len)
102 partial = sb->s_blocksize - offset; 146 partial = sb->s_blocksize - offset;
103 147
104 if (partial >= len) 148 if (partial >= len)
105 memcpy(to, dir->bh[buffer]->b_data + offset, len); 149 memcpy(to, dir->bh_fplus[buffer]->b_data + offset, len);
106 else { 150 else {
107 char *c = (char *)to; 151 char *c = (char *)to;
108 152
109 remainder = len - partial; 153 remainder = len - partial;
110 154
111 memcpy(c, dir->bh[buffer]->b_data + offset, partial); 155 memcpy(c,
112 memcpy(c + partial, dir->bh[buffer + 1]->b_data, remainder); 156 dir->bh_fplus[buffer]->b_data + offset,
157 partial);
158
159 memcpy(c + partial,
160 dir->bh_fplus[buffer + 1]->b_data,
161 remainder);
113 } 162 }
114} 163}
115 164
116static int 165static int
117adfs_fplus_getnext(struct adfs_dir *dir, struct object_info *obj) 166adfs_fplus_getnext(struct adfs_dir *dir, struct object_info *obj)
118{ 167{
119 struct adfs_bigdirheader *h = (struct adfs_bigdirheader *)dir->bh[0]->b_data; 168 struct adfs_bigdirheader *h =
169 (struct adfs_bigdirheader *) dir->bh_fplus[0]->b_data;
120 struct adfs_bigdirentry bde; 170 struct adfs_bigdirentry bde;
121 unsigned int offset; 171 unsigned int offset;
122 int i, ret = -ENOENT; 172 int i, ret = -ENOENT;
@@ -147,6 +197,24 @@ adfs_fplus_getnext(struct adfs_dir *dir, struct object_info *obj)
147 if (obj->name[i] == '/') 197 if (obj->name[i] == '/')
148 obj->name[i] = '.'; 198 obj->name[i] = '.';
149 199
200 obj->filetype = -1;
201
202 /*
203 * object is a file and is filetyped and timestamped?
204 * RISC OS 12-bit filetype is stored in load_address[19:8]
205 */
206 if ((0 == (obj->attr & ADFS_NDA_DIRECTORY)) &&
207 (0xfff00000 == (0xfff00000 & obj->loadaddr))) {
208 obj->filetype = (__u16) ((0x000fff00 & obj->loadaddr) >> 8);
209
210 /* optionally append the ,xyz hex filetype suffix */
211 if (ADFS_SB(dir->sb)->s_ftsuffix)
212 obj->name_len +=
213 append_filetype_suffix(
214 &obj->name[obj->name_len],
215 obj->filetype);
216 }
217
150 dir->pos += 1; 218 dir->pos += 1;
151 ret = 0; 219 ret = 0;
152out: 220out:
@@ -160,7 +228,7 @@ adfs_fplus_sync(struct adfs_dir *dir)
160 int i; 228 int i;
161 229
162 for (i = dir->nr_buffers - 1; i >= 0; i--) { 230 for (i = dir->nr_buffers - 1; i >= 0; i--) {
163 struct buffer_head *bh = dir->bh[i]; 231 struct buffer_head *bh = dir->bh_fplus[i];
164 sync_dirty_buffer(bh); 232 sync_dirty_buffer(bh);
165 if (buffer_req(bh) && !buffer_uptodate(bh)) 233 if (buffer_req(bh) && !buffer_uptodate(bh))
166 err = -EIO; 234 err = -EIO;
@@ -174,8 +242,17 @@ adfs_fplus_free(struct adfs_dir *dir)
174{ 242{
175 int i; 243 int i;
176 244
177 for (i = 0; i < dir->nr_buffers; i++) 245 if (dir->bh_fplus) {
178 brelse(dir->bh[i]); 246 for (i = 0; i < dir->nr_buffers; i++)
247 brelse(dir->bh_fplus[i]);
248
249 if (&dir->bh[0] != dir->bh_fplus)
250 kfree(dir->bh_fplus);
251
252 dir->bh_fplus = NULL;
253 }
254
255 dir->nr_buffers = 0;
179 dir->sb = NULL; 256 dir->sb = NULL;
180} 257}
181 258
diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c
index 65794b8fe79e..d5250c5aae21 100644
--- a/fs/adfs/inode.c
+++ b/fs/adfs/inode.c
@@ -7,7 +7,6 @@
7 * it under the terms of the GNU General Public License version 2 as 7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation. 8 * published by the Free Software Foundation.
9 */ 9 */
10#include <linux/smp_lock.h>
11#include <linux/buffer_head.h> 10#include <linux/buffer_head.h>
12#include <linux/writeback.h> 11#include <linux/writeback.h>
13#include "adfs.h" 12#include "adfs.h"
@@ -73,32 +72,18 @@ static sector_t _adfs_bmap(struct address_space *mapping, sector_t block)
73static const struct address_space_operations adfs_aops = { 72static const struct address_space_operations adfs_aops = {
74 .readpage = adfs_readpage, 73 .readpage = adfs_readpage,
75 .writepage = adfs_writepage, 74 .writepage = adfs_writepage,
76 .sync_page = block_sync_page,
77 .write_begin = adfs_write_begin, 75 .write_begin = adfs_write_begin,
78 .write_end = generic_write_end, 76 .write_end = generic_write_end,
79 .bmap = _adfs_bmap 77 .bmap = _adfs_bmap
80}; 78};
81 79
82static inline unsigned int
83adfs_filetype(struct inode *inode)
84{
85 unsigned int type;
86
87 if (ADFS_I(inode)->stamped)
88 type = (ADFS_I(inode)->loadaddr >> 8) & 0xfff;
89 else
90 type = (unsigned int) -1;
91
92 return type;
93}
94
95/* 80/*
96 * Convert ADFS attributes and filetype to Linux permission. 81 * Convert ADFS attributes and filetype to Linux permission.
97 */ 82 */
98static umode_t 83static umode_t
99adfs_atts2mode(struct super_block *sb, struct inode *inode) 84adfs_atts2mode(struct super_block *sb, struct inode *inode)
100{ 85{
101 unsigned int filetype, attr = ADFS_I(inode)->attr; 86 unsigned int attr = ADFS_I(inode)->attr;
102 umode_t mode, rmask; 87 umode_t mode, rmask;
103 struct adfs_sb_info *asb = ADFS_SB(sb); 88 struct adfs_sb_info *asb = ADFS_SB(sb);
104 89
@@ -107,9 +92,7 @@ adfs_atts2mode(struct super_block *sb, struct inode *inode)
107 return S_IFDIR | S_IXUGO | mode; 92 return S_IFDIR | S_IXUGO | mode;
108 } 93 }
109 94
110 filetype = adfs_filetype(inode); 95 switch (ADFS_I(inode)->filetype) {
111
112 switch (filetype) {
113 case 0xfc0: /* LinkFS */ 96 case 0xfc0: /* LinkFS */
114 return S_IFLNK|S_IRWXUGO; 97 return S_IFLNK|S_IRWXUGO;
115 98
@@ -175,50 +158,48 @@ adfs_mode2atts(struct super_block *sb, struct inode *inode)
175 158
176/* 159/*
177 * Convert an ADFS time to Unix time. ADFS has a 40-bit centi-second time 160 * Convert an ADFS time to Unix time. ADFS has a 40-bit centi-second time
178 * referenced to 1 Jan 1900 (til 2248) 161 * referenced to 1 Jan 1900 (til 2248) so we need to discard 2208988800 seconds
162 * of time to convert from RISC OS epoch to Unix epoch.
179 */ 163 */
180static void 164static void
181adfs_adfs2unix_time(struct timespec *tv, struct inode *inode) 165adfs_adfs2unix_time(struct timespec *tv, struct inode *inode)
182{ 166{
183 unsigned int high, low; 167 unsigned int high, low;
168 /* 01 Jan 1970 00:00:00 (Unix epoch) as nanoseconds since
169 * 01 Jan 1900 00:00:00 (RISC OS epoch)
170 */
171 static const s64 nsec_unix_epoch_diff_risc_os_epoch =
172 2208988800000000000LL;
173 s64 nsec;
184 174
185 if (ADFS_I(inode)->stamped == 0) 175 if (ADFS_I(inode)->stamped == 0)
186 goto cur_time; 176 goto cur_time;
187 177
188 high = ADFS_I(inode)->loadaddr << 24; 178 high = ADFS_I(inode)->loadaddr & 0xFF; /* top 8 bits of timestamp */
189 low = ADFS_I(inode)->execaddr; 179 low = ADFS_I(inode)->execaddr; /* bottom 32 bits of timestamp */
190 180
191 high |= low >> 8; 181 /* convert 40-bit centi-seconds to 32-bit seconds
192 low &= 255; 182 * going via nanoseconds to retain precision
183 */
184 nsec = (((s64) high << 32) | (s64) low) * 10000000; /* cs to ns */
193 185
194 /* Files dated pre 01 Jan 1970 00:00:00. */ 186 /* Files dated pre 01 Jan 1970 00:00:00. */
195 if (high < 0x336e996a) 187 if (nsec < nsec_unix_epoch_diff_risc_os_epoch)
196 goto too_early; 188 goto too_early;
197 189
198 /* Files dated post 18 Jan 2038 03:14:05. */ 190 /* convert from RISC OS to Unix epoch */
199 if (high >= 0x656e9969) 191 nsec -= nsec_unix_epoch_diff_risc_os_epoch;
200 goto too_late;
201
202 /* discard 2208988800 (0x336e996a00) seconds of time */
203 high -= 0x336e996a;
204 192
205 /* convert 40-bit centi-seconds to 32-bit seconds */ 193 *tv = ns_to_timespec(nsec);
206 tv->tv_sec = (((high % 100) << 8) + low) / 100 + (high / 100 << 8);
207 tv->tv_nsec = 0;
208 return; 194 return;
209 195
210 cur_time: 196 cur_time:
211 *tv = CURRENT_TIME_SEC; 197 *tv = CURRENT_TIME;
212 return; 198 return;
213 199
214 too_early: 200 too_early:
215 tv->tv_sec = tv->tv_nsec = 0; 201 tv->tv_sec = tv->tv_nsec = 0;
216 return; 202 return;
217
218 too_late:
219 tv->tv_sec = 0x7ffffffd;
220 tv->tv_nsec = 0;
221 return;
222} 203}
223 204
224/* 205/*
@@ -280,7 +261,8 @@ adfs_iget(struct super_block *sb, struct object_info *obj)
280 ADFS_I(inode)->loadaddr = obj->loadaddr; 261 ADFS_I(inode)->loadaddr = obj->loadaddr;
281 ADFS_I(inode)->execaddr = obj->execaddr; 262 ADFS_I(inode)->execaddr = obj->execaddr;
282 ADFS_I(inode)->attr = obj->attr; 263 ADFS_I(inode)->attr = obj->attr;
283 ADFS_I(inode)->stamped = ((obj->loadaddr & 0xfff00000) == 0xfff00000); 264 ADFS_I(inode)->filetype = obj->filetype;
265 ADFS_I(inode)->stamped = ((obj->loadaddr & 0xfff00000) == 0xfff00000);
284 266
285 inode->i_mode = adfs_atts2mode(sb, inode); 267 inode->i_mode = adfs_atts2mode(sb, inode);
286 adfs_adfs2unix_time(&inode->i_mtime, inode); 268 adfs_adfs2unix_time(&inode->i_mtime, inode);
@@ -316,8 +298,6 @@ adfs_notify_change(struct dentry *dentry, struct iattr *attr)
316 unsigned int ia_valid = attr->ia_valid; 298 unsigned int ia_valid = attr->ia_valid;
317 int error; 299 int error;
318 300
319 lock_kernel();
320
321 error = inode_change_ok(inode, attr); 301 error = inode_change_ok(inode, attr);
322 302
323 /* 303 /*
@@ -359,7 +339,6 @@ adfs_notify_change(struct dentry *dentry, struct iattr *attr)
359 if (ia_valid & (ATTR_SIZE | ATTR_MTIME | ATTR_MODE)) 339 if (ia_valid & (ATTR_SIZE | ATTR_MTIME | ATTR_MODE))
360 mark_inode_dirty(inode); 340 mark_inode_dirty(inode);
361out: 341out:
362 unlock_kernel();
363 return error; 342 return error;
364} 343}
365 344
@@ -374,7 +353,6 @@ int adfs_write_inode(struct inode *inode, struct writeback_control *wbc)
374 struct object_info obj; 353 struct object_info obj;
375 int ret; 354 int ret;
376 355
377 lock_kernel();
378 obj.file_id = inode->i_ino; 356 obj.file_id = inode->i_ino;
379 obj.name_len = 0; 357 obj.name_len = 0;
380 obj.parent_id = ADFS_I(inode)->parent_id; 358 obj.parent_id = ADFS_I(inode)->parent_id;
@@ -384,6 +362,5 @@ int adfs_write_inode(struct inode *inode, struct writeback_control *wbc)
384 obj.size = inode->i_size; 362 obj.size = inode->i_size;
385 363
386 ret = adfs_dir_update(sb, &obj, wbc->sync_mode == WB_SYNC_ALL); 364 ret = adfs_dir_update(sb, &obj, wbc->sync_mode == WB_SYNC_ALL);
387 unlock_kernel();
388 return ret; 365 return ret;
389} 366}
diff --git a/fs/adfs/super.c b/fs/adfs/super.c
index 2d7954049fbe..c8bf36a1996a 100644
--- a/fs/adfs/super.c
+++ b/fs/adfs/super.c
@@ -14,7 +14,6 @@
14#include <linux/mount.h> 14#include <linux/mount.h>
15#include <linux/seq_file.h> 15#include <linux/seq_file.h>
16#include <linux/slab.h> 16#include <linux/slab.h>
17#include <linux/smp_lock.h>
18#include <linux/statfs.h> 17#include <linux/statfs.h>
19#include "adfs.h" 18#include "adfs.h"
20#include "dir_f.h" 19#include "dir_f.h"
@@ -120,15 +119,11 @@ static void adfs_put_super(struct super_block *sb)
120 int i; 119 int i;
121 struct adfs_sb_info *asb = ADFS_SB(sb); 120 struct adfs_sb_info *asb = ADFS_SB(sb);
122 121
123 lock_kernel();
124
125 for (i = 0; i < asb->s_map_size; i++) 122 for (i = 0; i < asb->s_map_size; i++)
126 brelse(asb->s_map[i].dm_bh); 123 brelse(asb->s_map[i].dm_bh);
127 kfree(asb->s_map); 124 kfree(asb->s_map);
128 kfree(asb); 125 kfree(asb);
129 sb->s_fs_info = NULL; 126 sb->s_fs_info = NULL;
130
131 unlock_kernel();
132} 127}
133 128
134static int adfs_show_options(struct seq_file *seq, struct vfsmount *mnt) 129static int adfs_show_options(struct seq_file *seq, struct vfsmount *mnt)
@@ -143,17 +138,20 @@ static int adfs_show_options(struct seq_file *seq, struct vfsmount *mnt)
143 seq_printf(seq, ",ownmask=%o", asb->s_owner_mask); 138 seq_printf(seq, ",ownmask=%o", asb->s_owner_mask);
144 if (asb->s_other_mask != ADFS_DEFAULT_OTHER_MASK) 139 if (asb->s_other_mask != ADFS_DEFAULT_OTHER_MASK)
145 seq_printf(seq, ",othmask=%o", asb->s_other_mask); 140 seq_printf(seq, ",othmask=%o", asb->s_other_mask);
141 if (asb->s_ftsuffix != 0)
142 seq_printf(seq, ",ftsuffix=%u", asb->s_ftsuffix);
146 143
147 return 0; 144 return 0;
148} 145}
149 146
150enum {Opt_uid, Opt_gid, Opt_ownmask, Opt_othmask, Opt_err}; 147enum {Opt_uid, Opt_gid, Opt_ownmask, Opt_othmask, Opt_ftsuffix, Opt_err};
151 148
152static const match_table_t tokens = { 149static const match_table_t tokens = {
153 {Opt_uid, "uid=%u"}, 150 {Opt_uid, "uid=%u"},
154 {Opt_gid, "gid=%u"}, 151 {Opt_gid, "gid=%u"},
155 {Opt_ownmask, "ownmask=%o"}, 152 {Opt_ownmask, "ownmask=%o"},
156 {Opt_othmask, "othmask=%o"}, 153 {Opt_othmask, "othmask=%o"},
154 {Opt_ftsuffix, "ftsuffix=%u"},
157 {Opt_err, NULL} 155 {Opt_err, NULL}
158}; 156};
159 157
@@ -194,6 +192,11 @@ static int parse_options(struct super_block *sb, char *options)
194 return -EINVAL; 192 return -EINVAL;
195 asb->s_other_mask = option; 193 asb->s_other_mask = option;
196 break; 194 break;
195 case Opt_ftsuffix:
196 if (match_int(args, &option))
197 return -EINVAL;
198 asb->s_ftsuffix = option;
199 break;
197 default: 200 default:
198 printk("ADFS-fs: unrecognised mount option \"%s\" " 201 printk("ADFS-fs: unrecognised mount option \"%s\" "
199 "or missing value\n", p); 202 "or missing value\n", p);
@@ -359,15 +362,11 @@ static int adfs_fill_super(struct super_block *sb, void *data, int silent)
359 struct adfs_sb_info *asb; 362 struct adfs_sb_info *asb;
360 struct inode *root; 363 struct inode *root;
361 364
362 lock_kernel();
363
364 sb->s_flags |= MS_NODIRATIME; 365 sb->s_flags |= MS_NODIRATIME;
365 366
366 asb = kzalloc(sizeof(*asb), GFP_KERNEL); 367 asb = kzalloc(sizeof(*asb), GFP_KERNEL);
367 if (!asb) { 368 if (!asb)
368 unlock_kernel();
369 return -ENOMEM; 369 return -ENOMEM;
370 }
371 sb->s_fs_info = asb; 370 sb->s_fs_info = asb;
372 371
373 /* set default options */ 372 /* set default options */
@@ -375,6 +374,7 @@ static int adfs_fill_super(struct super_block *sb, void *data, int silent)
375 asb->s_gid = 0; 374 asb->s_gid = 0;
376 asb->s_owner_mask = ADFS_DEFAULT_OWNER_MASK; 375 asb->s_owner_mask = ADFS_DEFAULT_OWNER_MASK;
377 asb->s_other_mask = ADFS_DEFAULT_OTHER_MASK; 376 asb->s_other_mask = ADFS_DEFAULT_OTHER_MASK;
377 asb->s_ftsuffix = 0;
378 378
379 if (parse_options(sb, data)) 379 if (parse_options(sb, data))
380 goto error; 380 goto error;
@@ -454,11 +454,13 @@ static int adfs_fill_super(struct super_block *sb, void *data, int silent)
454 454
455 root_obj.parent_id = root_obj.file_id = le32_to_cpu(dr->root); 455 root_obj.parent_id = root_obj.file_id = le32_to_cpu(dr->root);
456 root_obj.name_len = 0; 456 root_obj.name_len = 0;
457 root_obj.loadaddr = 0; 457 /* Set root object date as 01 Jan 1987 00:00:00 */
458 root_obj.execaddr = 0; 458 root_obj.loadaddr = 0xfff0003f;
459 root_obj.execaddr = 0xec22c000;
459 root_obj.size = ADFS_NEWDIR_SIZE; 460 root_obj.size = ADFS_NEWDIR_SIZE;
460 root_obj.attr = ADFS_NDA_DIRECTORY | ADFS_NDA_OWNER_READ | 461 root_obj.attr = ADFS_NDA_DIRECTORY | ADFS_NDA_OWNER_READ |
461 ADFS_NDA_OWNER_WRITE | ADFS_NDA_PUBLIC_READ; 462 ADFS_NDA_OWNER_WRITE | ADFS_NDA_PUBLIC_READ;
463 root_obj.filetype = -1;
462 464
463 /* 465 /*
464 * If this is a F+ disk with variable length directories, 466 * If this is a F+ disk with variable length directories,
@@ -472,6 +474,12 @@ static int adfs_fill_super(struct super_block *sb, void *data, int silent)
472 asb->s_dir = &adfs_f_dir_ops; 474 asb->s_dir = &adfs_f_dir_ops;
473 asb->s_namelen = ADFS_F_NAME_LEN; 475 asb->s_namelen = ADFS_F_NAME_LEN;
474 } 476 }
477 /*
478 * ,xyz hex filetype suffix may be added by driver
479 * to files that have valid RISC OS filetype
480 */
481 if (asb->s_ftsuffix)
482 asb->s_namelen += 4;
475 483
476 sb->s_d_op = &adfs_dentry_operations; 484 sb->s_d_op = &adfs_dentry_operations;
477 root = adfs_iget(sb, &root_obj); 485 root = adfs_iget(sb, &root_obj);
@@ -485,7 +493,6 @@ static int adfs_fill_super(struct super_block *sb, void *data, int silent)
485 adfs_error(sb, "get root inode failed\n"); 493 adfs_error(sb, "get root inode failed\n");
486 goto error; 494 goto error;
487 } 495 }
488 unlock_kernel();
489 return 0; 496 return 0;
490 497
491error_free_bh: 498error_free_bh:
@@ -493,7 +500,6 @@ error_free_bh:
493error: 500error:
494 sb->s_fs_info = NULL; 501 sb->s_fs_info = NULL;
495 kfree(asb); 502 kfree(asb);
496 unlock_kernel();
497 return -EINVAL; 503 return -EINVAL;
498} 504}
499 505
diff --git a/fs/affs/Makefile b/fs/affs/Makefile
index b2c4f54446f3..3988b4a78339 100644
--- a/fs/affs/Makefile
+++ b/fs/affs/Makefile
@@ -2,7 +2,7 @@
2# Makefile for the Linux affs filesystem routines. 2# Makefile for the Linux affs filesystem routines.
3# 3#
4 4
5#EXTRA_CFLAGS=-DDEBUG=1 5#ccflags-y := -DDEBUG=1
6 6
7obj-$(CONFIG_AFFS_FS) += affs.o 7obj-$(CONFIG_AFFS_FS) += affs.o
8 8
diff --git a/fs/affs/file.c b/fs/affs/file.c
index 0a90dcd46de2..acf321b70fcd 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -429,7 +429,6 @@ static sector_t _affs_bmap(struct address_space *mapping, sector_t block)
429const struct address_space_operations affs_aops = { 429const struct address_space_operations affs_aops = {
430 .readpage = affs_readpage, 430 .readpage = affs_readpage,
431 .writepage = affs_writepage, 431 .writepage = affs_writepage,
432 .sync_page = block_sync_page,
433 .write_begin = affs_write_begin, 432 .write_begin = affs_write_begin,
434 .write_end = generic_write_end, 433 .write_end = generic_write_end,
435 .bmap = _affs_bmap 434 .bmap = _affs_bmap
@@ -786,7 +785,6 @@ out:
786const struct address_space_operations affs_aops_ofs = { 785const struct address_space_operations affs_aops_ofs = {
787 .readpage = affs_readpage_ofs, 786 .readpage = affs_readpage_ofs,
788 //.writepage = affs_writepage_ofs, 787 //.writepage = affs_writepage_ofs,
789 //.sync_page = affs_sync_page_ofs,
790 .write_begin = affs_write_begin_ofs, 788 .write_begin = affs_write_begin_ofs,
791 .write_end = affs_write_end_ofs 789 .write_end = affs_write_end_ofs
792}; 790};
diff --git a/fs/afs/write.c b/fs/afs/write.c
index 15690bb1d3b5..789b3afb3423 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -140,6 +140,7 @@ int afs_write_begin(struct file *file, struct address_space *mapping,
140 candidate->first = candidate->last = index; 140 candidate->first = candidate->last = index;
141 candidate->offset_first = from; 141 candidate->offset_first = from;
142 candidate->to_last = to; 142 candidate->to_last = to;
143 INIT_LIST_HEAD(&candidate->link);
143 candidate->usage = 1; 144 candidate->usage = 1;
144 candidate->state = AFS_WBACK_PENDING; 145 candidate->state = AFS_WBACK_PENDING;
145 init_waitqueue_head(&candidate->waitq); 146 init_waitqueue_head(&candidate->waitq);
diff --git a/fs/aio.c b/fs/aio.c
index fc557a3be0a9..e29ec485af25 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -34,8 +34,6 @@
34#include <linux/security.h> 34#include <linux/security.h>
35#include <linux/eventfd.h> 35#include <linux/eventfd.h>
36#include <linux/blkdev.h> 36#include <linux/blkdev.h>
37#include <linux/mempool.h>
38#include <linux/hash.h>
39#include <linux/compat.h> 37#include <linux/compat.h>
40 38
41#include <asm/kmap_types.h> 39#include <asm/kmap_types.h>
@@ -65,14 +63,6 @@ static DECLARE_WORK(fput_work, aio_fput_routine);
65static DEFINE_SPINLOCK(fput_lock); 63static DEFINE_SPINLOCK(fput_lock);
66static LIST_HEAD(fput_head); 64static LIST_HEAD(fput_head);
67 65
68#define AIO_BATCH_HASH_BITS 3 /* allocated on-stack, so don't go crazy */
69#define AIO_BATCH_HASH_SIZE (1 << AIO_BATCH_HASH_BITS)
70struct aio_batch_entry {
71 struct hlist_node list;
72 struct address_space *mapping;
73};
74mempool_t *abe_pool;
75
76static void aio_kick_handler(struct work_struct *); 66static void aio_kick_handler(struct work_struct *);
77static void aio_queue_work(struct kioctx *); 67static void aio_queue_work(struct kioctx *);
78 68
@@ -85,9 +75,8 @@ static int __init aio_setup(void)
85 kiocb_cachep = KMEM_CACHE(kiocb, SLAB_HWCACHE_ALIGN|SLAB_PANIC); 75 kiocb_cachep = KMEM_CACHE(kiocb, SLAB_HWCACHE_ALIGN|SLAB_PANIC);
86 kioctx_cachep = KMEM_CACHE(kioctx,SLAB_HWCACHE_ALIGN|SLAB_PANIC); 76 kioctx_cachep = KMEM_CACHE(kioctx,SLAB_HWCACHE_ALIGN|SLAB_PANIC);
87 77
88 aio_wq = create_workqueue("aio"); 78 aio_wq = alloc_workqueue("aio", 0, 1); /* used to limit concurrency */
89 abe_pool = mempool_create_kmalloc_pool(1, sizeof(struct aio_batch_entry)); 79 BUG_ON(!aio_wq);
90 BUG_ON(!aio_wq || !abe_pool);
91 80
92 pr_debug("aio_setup: sizeof(struct page) = %d\n", (int)sizeof(struct page)); 81 pr_debug("aio_setup: sizeof(struct page) = %d\n", (int)sizeof(struct page));
93 82
@@ -239,15 +228,23 @@ static void __put_ioctx(struct kioctx *ctx)
239 call_rcu(&ctx->rcu_head, ctx_rcu_free); 228 call_rcu(&ctx->rcu_head, ctx_rcu_free);
240} 229}
241 230
242#define get_ioctx(kioctx) do { \ 231static inline void get_ioctx(struct kioctx *kioctx)
243 BUG_ON(atomic_read(&(kioctx)->users) <= 0); \ 232{
244 atomic_inc(&(kioctx)->users); \ 233 BUG_ON(atomic_read(&kioctx->users) <= 0);
245} while (0) 234 atomic_inc(&kioctx->users);
246#define put_ioctx(kioctx) do { \ 235}
247 BUG_ON(atomic_read(&(kioctx)->users) <= 0); \ 236
248 if (unlikely(atomic_dec_and_test(&(kioctx)->users))) \ 237static inline int try_get_ioctx(struct kioctx *kioctx)
249 __put_ioctx(kioctx); \ 238{
250} while (0) 239 return atomic_inc_not_zero(&kioctx->users);
240}
241
242static inline void put_ioctx(struct kioctx *kioctx)
243{
244 BUG_ON(atomic_read(&kioctx->users) <= 0);
245 if (unlikely(atomic_dec_and_test(&kioctx->users)))
246 __put_ioctx(kioctx);
247}
251 248
252/* ioctx_alloc 249/* ioctx_alloc
253 * Allocates and initializes an ioctx. Returns an ERR_PTR if it failed. 250 * Allocates and initializes an ioctx. Returns an ERR_PTR if it failed.
@@ -512,7 +509,7 @@ static inline void really_put_req(struct kioctx *ctx, struct kiocb *req)
512 ctx->reqs_active--; 509 ctx->reqs_active--;
513 510
514 if (unlikely(!ctx->reqs_active && ctx->dead)) 511 if (unlikely(!ctx->reqs_active && ctx->dead))
515 wake_up(&ctx->wait); 512 wake_up_all(&ctx->wait);
516} 513}
517 514
518static void aio_fput_routine(struct work_struct *data) 515static void aio_fput_routine(struct work_struct *data)
@@ -569,7 +566,7 @@ static int __aio_put_req(struct kioctx *ctx, struct kiocb *req)
569 spin_lock(&fput_lock); 566 spin_lock(&fput_lock);
570 list_add(&req->ki_list, &fput_head); 567 list_add(&req->ki_list, &fput_head);
571 spin_unlock(&fput_lock); 568 spin_unlock(&fput_lock);
572 queue_work(aio_wq, &fput_work); 569 schedule_work(&fput_work);
573 } else { 570 } else {
574 req->ki_filp = NULL; 571 req->ki_filp = NULL;
575 really_put_req(ctx, req); 572 really_put_req(ctx, req);
@@ -601,8 +598,13 @@ static struct kioctx *lookup_ioctx(unsigned long ctx_id)
601 rcu_read_lock(); 598 rcu_read_lock();
602 599
603 hlist_for_each_entry_rcu(ctx, n, &mm->ioctx_list, list) { 600 hlist_for_each_entry_rcu(ctx, n, &mm->ioctx_list, list) {
604 if (ctx->user_id == ctx_id && !ctx->dead) { 601 /*
605 get_ioctx(ctx); 602 * RCU protects us against accessing freed memory but
603 * we have to be careful not to get a reference when the
604 * reference count already dropped to 0 (ctx->dead test
605 * is unreliable because of races).
606 */
607 if (ctx->user_id == ctx_id && !ctx->dead && try_get_ioctx(ctx)){
606 ret = ctx; 608 ret = ctx;
607 break; 609 break;
608 } 610 }
@@ -1216,7 +1218,7 @@ static void io_destroy(struct kioctx *ioctx)
1216 * by other CPUs at this point. Right now, we rely on the 1218 * by other CPUs at this point. Right now, we rely on the
1217 * locking done by the above calls to ensure this consistency. 1219 * locking done by the above calls to ensure this consistency.
1218 */ 1220 */
1219 wake_up(&ioctx->wait); 1221 wake_up_all(&ioctx->wait);
1220 put_ioctx(ioctx); /* once for the lookup */ 1222 put_ioctx(ioctx); /* once for the lookup */
1221} 1223}
1222 1224
@@ -1512,57 +1514,8 @@ static ssize_t aio_setup_iocb(struct kiocb *kiocb, bool compat)
1512 return 0; 1514 return 0;
1513} 1515}
1514 1516
1515static void aio_batch_add(struct address_space *mapping,
1516 struct hlist_head *batch_hash)
1517{
1518 struct aio_batch_entry *abe;
1519 struct hlist_node *pos;
1520 unsigned bucket;
1521
1522 bucket = hash_ptr(mapping, AIO_BATCH_HASH_BITS);
1523 hlist_for_each_entry(abe, pos, &batch_hash[bucket], list) {
1524 if (abe->mapping == mapping)
1525 return;
1526 }
1527
1528 abe = mempool_alloc(abe_pool, GFP_KERNEL);
1529
1530 /*
1531 * we should be using igrab here, but
1532 * we don't want to hammer on the global
1533 * inode spinlock just to take an extra
1534 * reference on a file that we must already
1535 * have a reference to.
1536 *
1537 * When we're called, we always have a reference
1538 * on the file, so we must always have a reference
1539 * on the inode, so ihold() is safe here.
1540 */
1541 ihold(mapping->host);
1542 abe->mapping = mapping;
1543 hlist_add_head(&abe->list, &batch_hash[bucket]);
1544 return;
1545}
1546
1547static void aio_batch_free(struct hlist_head *batch_hash)
1548{
1549 struct aio_batch_entry *abe;
1550 struct hlist_node *pos, *n;
1551 int i;
1552
1553 for (i = 0; i < AIO_BATCH_HASH_SIZE; i++) {
1554 hlist_for_each_entry_safe(abe, pos, n, &batch_hash[i], list) {
1555 blk_run_address_space(abe->mapping);
1556 iput(abe->mapping->host);
1557 hlist_del(&abe->list);
1558 mempool_free(abe, abe_pool);
1559 }
1560 }
1561}
1562
1563static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, 1517static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
1564 struct iocb *iocb, struct hlist_head *batch_hash, 1518 struct iocb *iocb, bool compat)
1565 bool compat)
1566{ 1519{
1567 struct kiocb *req; 1520 struct kiocb *req;
1568 struct file *file; 1521 struct file *file;
@@ -1629,6 +1582,23 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
1629 goto out_put_req; 1582 goto out_put_req;
1630 1583
1631 spin_lock_irq(&ctx->ctx_lock); 1584 spin_lock_irq(&ctx->ctx_lock);
1585 /*
1586 * We could have raced with io_destroy() and are currently holding a
1587 * reference to ctx which should be destroyed. We cannot submit IO
1588 * since ctx gets freed as soon as io_submit() puts its reference. The
1589 * check here is reliable: io_destroy() sets ctx->dead before waiting
1590 * for outstanding IO and the barrier between these two is realized by
1591 * unlock of mm->ioctx_lock and lock of ctx->ctx_lock. Analogously we
1592 * increment ctx->reqs_active before checking for ctx->dead and the
1593 * barrier is realized by unlock and lock of ctx->ctx_lock. Thus if we
1594 * don't see ctx->dead set here, io_destroy() waits for our IO to
1595 * finish.
1596 */
1597 if (ctx->dead) {
1598 spin_unlock_irq(&ctx->ctx_lock);
1599 ret = -EINVAL;
1600 goto out_put_req;
1601 }
1632 aio_run_iocb(req); 1602 aio_run_iocb(req);
1633 if (!list_empty(&ctx->run_list)) { 1603 if (!list_empty(&ctx->run_list)) {
1634 /* drain the run list */ 1604 /* drain the run list */
@@ -1636,11 +1606,6 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
1636 ; 1606 ;
1637 } 1607 }
1638 spin_unlock_irq(&ctx->ctx_lock); 1608 spin_unlock_irq(&ctx->ctx_lock);
1639 if (req->ki_opcode == IOCB_CMD_PREAD ||
1640 req->ki_opcode == IOCB_CMD_PREADV ||
1641 req->ki_opcode == IOCB_CMD_PWRITE ||
1642 req->ki_opcode == IOCB_CMD_PWRITEV)
1643 aio_batch_add(file->f_mapping, batch_hash);
1644 1609
1645 aio_put_req(req); /* drop extra ref to req */ 1610 aio_put_req(req); /* drop extra ref to req */
1646 return 0; 1611 return 0;
@@ -1657,7 +1622,7 @@ long do_io_submit(aio_context_t ctx_id, long nr,
1657 struct kioctx *ctx; 1622 struct kioctx *ctx;
1658 long ret = 0; 1623 long ret = 0;
1659 int i; 1624 int i;
1660 struct hlist_head batch_hash[AIO_BATCH_HASH_SIZE] = { { 0, }, }; 1625 struct blk_plug plug;
1661 1626
1662 if (unlikely(nr < 0)) 1627 if (unlikely(nr < 0))
1663 return -EINVAL; 1628 return -EINVAL;
@@ -1674,6 +1639,8 @@ long do_io_submit(aio_context_t ctx_id, long nr,
1674 return -EINVAL; 1639 return -EINVAL;
1675 } 1640 }
1676 1641
1642 blk_start_plug(&plug);
1643
1677 /* 1644 /*
1678 * AKPM: should this return a partial result if some of the IOs were 1645 * AKPM: should this return a partial result if some of the IOs were
1679 * successfully submitted? 1646 * successfully submitted?
@@ -1692,11 +1659,11 @@ long do_io_submit(aio_context_t ctx_id, long nr,
1692 break; 1659 break;
1693 } 1660 }
1694 1661
1695 ret = io_submit_one(ctx, user_iocb, &tmp, batch_hash, compat); 1662 ret = io_submit_one(ctx, user_iocb, &tmp, compat);
1696 if (ret) 1663 if (ret)
1697 break; 1664 break;
1698 } 1665 }
1699 aio_batch_free(batch_hash); 1666 blk_finish_plug(&plug);
1700 1667
1701 put_ioctx(ctx); 1668 put_ioctx(ctx);
1702 return i ? i : ret; 1669 return i ? i : ret;
diff --git a/fs/attr.c b/fs/attr.c
index 7ca41811afa1..1007ed616314 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -59,7 +59,7 @@ int inode_change_ok(const struct inode *inode, struct iattr *attr)
59 59
60 /* Make sure a caller can chmod. */ 60 /* Make sure a caller can chmod. */
61 if (ia_valid & ATTR_MODE) { 61 if (ia_valid & ATTR_MODE) {
62 if (!is_owner_or_cap(inode)) 62 if (!inode_owner_or_capable(inode))
63 return -EPERM; 63 return -EPERM;
64 /* Also check the setgid bit! */ 64 /* Also check the setgid bit! */
65 if (!in_group_p((ia_valid & ATTR_GID) ? attr->ia_gid : 65 if (!in_group_p((ia_valid & ATTR_GID) ? attr->ia_gid :
@@ -69,7 +69,7 @@ int inode_change_ok(const struct inode *inode, struct iattr *attr)
69 69
70 /* Check for setting the inode time. */ 70 /* Check for setting the inode time. */
71 if (ia_valid & (ATTR_MTIME_SET | ATTR_ATIME_SET | ATTR_TIMES_SET)) { 71 if (ia_valid & (ATTR_MTIME_SET | ATTR_ATIME_SET | ATTR_TIMES_SET)) {
72 if (!is_owner_or_cap(inode)) 72 if (!inode_owner_or_capable(inode))
73 return -EPERM; 73 return -EPERM;
74 } 74 }
75 75
diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index 54f923792728..475f9c597cb7 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -61,8 +61,6 @@ do { \
61 current->pid, __func__, ##args); \ 61 current->pid, __func__, ##args); \
62} while (0) 62} while (0)
63 63
64extern spinlock_t autofs4_lock;
65
66/* Unified info structure. This is pointed to by both the dentry and 64/* Unified info structure. This is pointed to by both the dentry and
67 inode structures. Each file in the filesystem has an instance of this 65 inode structures. Each file in the filesystem has an instance of this
68 structure. It holds a reference to the dentry, so dentries are never 66 structure. It holds a reference to the dentry, so dentries are never
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index 1442da4860e5..509fe1eb66ae 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -372,6 +372,10 @@ static int autofs_dev_ioctl_setpipefd(struct file *fp,
372 return -EBUSY; 372 return -EBUSY;
373 } else { 373 } else {
374 struct file *pipe = fget(pipefd); 374 struct file *pipe = fget(pipefd);
375 if (!pipe) {
376 err = -EBADF;
377 goto out;
378 }
375 if (!pipe->f_op || !pipe->f_op->write) { 379 if (!pipe->f_op || !pipe->f_op->write) {
376 err = -EPIPE; 380 err = -EPIPE;
377 fput(pipe); 381 fput(pipe);
diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index f43100b9662b..450f529a4eae 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -87,18 +87,70 @@ done:
87} 87}
88 88
89/* 89/*
90 * Calculate and dget next entry in the subdirs list under root.
91 */
92static struct dentry *get_next_positive_subdir(struct dentry *prev,
93 struct dentry *root)
94{
95 struct autofs_sb_info *sbi = autofs4_sbi(root->d_sb);
96 struct list_head *next;
97 struct dentry *p, *q;
98
99 spin_lock(&sbi->lookup_lock);
100
101 if (prev == NULL) {
102 spin_lock(&root->d_lock);
103 prev = dget_dlock(root);
104 next = prev->d_subdirs.next;
105 p = prev;
106 goto start;
107 }
108
109 p = prev;
110 spin_lock(&p->d_lock);
111again:
112 next = p->d_u.d_child.next;
113start:
114 if (next == &root->d_subdirs) {
115 spin_unlock(&p->d_lock);
116 spin_unlock(&sbi->lookup_lock);
117 dput(prev);
118 return NULL;
119 }
120
121 q = list_entry(next, struct dentry, d_u.d_child);
122
123 spin_lock_nested(&q->d_lock, DENTRY_D_LOCK_NESTED);
124 /* Negative dentry - try next */
125 if (!simple_positive(q)) {
126 spin_unlock(&p->d_lock);
127 p = q;
128 goto again;
129 }
130 dget_dlock(q);
131 spin_unlock(&q->d_lock);
132 spin_unlock(&p->d_lock);
133 spin_unlock(&sbi->lookup_lock);
134
135 dput(prev);
136
137 return q;
138}
139
140/*
90 * Calculate and dget next entry in top down tree traversal. 141 * Calculate and dget next entry in top down tree traversal.
91 */ 142 */
92static struct dentry *get_next_positive_dentry(struct dentry *prev, 143static struct dentry *get_next_positive_dentry(struct dentry *prev,
93 struct dentry *root) 144 struct dentry *root)
94{ 145{
146 struct autofs_sb_info *sbi = autofs4_sbi(root->d_sb);
95 struct list_head *next; 147 struct list_head *next;
96 struct dentry *p, *ret; 148 struct dentry *p, *ret;
97 149
98 if (prev == NULL) 150 if (prev == NULL)
99 return dget(root); 151 return dget(root);
100 152
101 spin_lock(&autofs4_lock); 153 spin_lock(&sbi->lookup_lock);
102relock: 154relock:
103 p = prev; 155 p = prev;
104 spin_lock(&p->d_lock); 156 spin_lock(&p->d_lock);
@@ -110,7 +162,7 @@ again:
110 162
111 if (p == root) { 163 if (p == root) {
112 spin_unlock(&p->d_lock); 164 spin_unlock(&p->d_lock);
113 spin_unlock(&autofs4_lock); 165 spin_unlock(&sbi->lookup_lock);
114 dput(prev); 166 dput(prev);
115 return NULL; 167 return NULL;
116 } 168 }
@@ -140,7 +192,7 @@ again:
140 dget_dlock(ret); 192 dget_dlock(ret);
141 spin_unlock(&ret->d_lock); 193 spin_unlock(&ret->d_lock);
142 spin_unlock(&p->d_lock); 194 spin_unlock(&p->d_lock);
143 spin_unlock(&autofs4_lock); 195 spin_unlock(&sbi->lookup_lock);
144 196
145 dput(prev); 197 dput(prev);
146 198
@@ -290,11 +342,8 @@ struct dentry *autofs4_expire_direct(struct super_block *sb,
290 spin_lock(&sbi->fs_lock); 342 spin_lock(&sbi->fs_lock);
291 ino = autofs4_dentry_ino(root); 343 ino = autofs4_dentry_ino(root);
292 /* No point expiring a pending mount */ 344 /* No point expiring a pending mount */
293 if (ino->flags & AUTOFS_INF_PENDING) { 345 if (ino->flags & AUTOFS_INF_PENDING)
294 spin_unlock(&sbi->fs_lock); 346 goto out;
295 return NULL;
296 }
297 managed_dentry_set_transit(root);
298 if (!autofs4_direct_busy(mnt, root, timeout, do_now)) { 347 if (!autofs4_direct_busy(mnt, root, timeout, do_now)) {
299 struct autofs_info *ino = autofs4_dentry_ino(root); 348 struct autofs_info *ino = autofs4_dentry_ino(root);
300 ino->flags |= AUTOFS_INF_EXPIRING; 349 ino->flags |= AUTOFS_INF_EXPIRING;
@@ -302,7 +351,7 @@ struct dentry *autofs4_expire_direct(struct super_block *sb,
302 spin_unlock(&sbi->fs_lock); 351 spin_unlock(&sbi->fs_lock);
303 return root; 352 return root;
304 } 353 }
305 managed_dentry_clear_transit(root); 354out:
306 spin_unlock(&sbi->fs_lock); 355 spin_unlock(&sbi->fs_lock);
307 dput(root); 356 dput(root);
308 357
@@ -336,13 +385,12 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
336 timeout = sbi->exp_timeout; 385 timeout = sbi->exp_timeout;
337 386
338 dentry = NULL; 387 dentry = NULL;
339 while ((dentry = get_next_positive_dentry(dentry, root))) { 388 while ((dentry = get_next_positive_subdir(dentry, root))) {
340 spin_lock(&sbi->fs_lock); 389 spin_lock(&sbi->fs_lock);
341 ino = autofs4_dentry_ino(dentry); 390 ino = autofs4_dentry_ino(dentry);
342 /* No point expiring a pending mount */ 391 /* No point expiring a pending mount */
343 if (ino->flags & AUTOFS_INF_PENDING) 392 if (ino->flags & AUTOFS_INF_PENDING)
344 goto cont; 393 goto next;
345 managed_dentry_set_transit(dentry);
346 394
347 /* 395 /*
348 * Case 1: (i) indirect mount or top level pseudo direct mount 396 * Case 1: (i) indirect mount or top level pseudo direct mount
@@ -402,8 +450,6 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
402 } 450 }
403 } 451 }
404next: 452next:
405 managed_dentry_clear_transit(dentry);
406cont:
407 spin_unlock(&sbi->fs_lock); 453 spin_unlock(&sbi->fs_lock);
408 } 454 }
409 return NULL; 455 return NULL;
@@ -415,13 +461,13 @@ found:
415 ino->flags |= AUTOFS_INF_EXPIRING; 461 ino->flags |= AUTOFS_INF_EXPIRING;
416 init_completion(&ino->expire_complete); 462 init_completion(&ino->expire_complete);
417 spin_unlock(&sbi->fs_lock); 463 spin_unlock(&sbi->fs_lock);
418 spin_lock(&autofs4_lock); 464 spin_lock(&sbi->lookup_lock);
419 spin_lock(&expired->d_parent->d_lock); 465 spin_lock(&expired->d_parent->d_lock);
420 spin_lock_nested(&expired->d_lock, DENTRY_D_LOCK_NESTED); 466 spin_lock_nested(&expired->d_lock, DENTRY_D_LOCK_NESTED);
421 list_move(&expired->d_parent->d_subdirs, &expired->d_u.d_child); 467 list_move(&expired->d_parent->d_subdirs, &expired->d_u.d_child);
422 spin_unlock(&expired->d_lock); 468 spin_unlock(&expired->d_lock);
423 spin_unlock(&expired->d_parent->d_lock); 469 spin_unlock(&expired->d_parent->d_lock);
424 spin_unlock(&autofs4_lock); 470 spin_unlock(&sbi->lookup_lock);
425 return expired; 471 return expired;
426} 472}
427 473
@@ -484,8 +530,6 @@ int autofs4_expire_run(struct super_block *sb,
484 spin_lock(&sbi->fs_lock); 530 spin_lock(&sbi->fs_lock);
485 ino = autofs4_dentry_ino(dentry); 531 ino = autofs4_dentry_ino(dentry);
486 ino->flags &= ~AUTOFS_INF_EXPIRING; 532 ino->flags &= ~AUTOFS_INF_EXPIRING;
487 if (!d_unhashed(dentry))
488 managed_dentry_clear_transit(dentry);
489 complete_all(&ino->expire_complete); 533 complete_all(&ino->expire_complete);
490 spin_unlock(&sbi->fs_lock); 534 spin_unlock(&sbi->fs_lock);
491 535
@@ -513,9 +557,7 @@ int autofs4_do_expire_multi(struct super_block *sb, struct vfsmount *mnt,
513 spin_lock(&sbi->fs_lock); 557 spin_lock(&sbi->fs_lock);
514 ino->flags &= ~AUTOFS_INF_EXPIRING; 558 ino->flags &= ~AUTOFS_INF_EXPIRING;
515 spin_lock(&dentry->d_lock); 559 spin_lock(&dentry->d_lock);
516 if (ret) 560 if (!ret) {
517 __managed_dentry_clear_transit(dentry);
518 else {
519 if ((IS_ROOT(dentry) || 561 if ((IS_ROOT(dentry) ||
520 (autofs_type_indirect(sbi->type) && 562 (autofs_type_indirect(sbi->type) &&
521 IS_ROOT(dentry->d_parent))) && 563 IS_ROOT(dentry->d_parent))) &&
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index 014e7aba3b08..96804a17bbd0 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -23,8 +23,6 @@
23 23
24#include "autofs_i.h" 24#include "autofs_i.h"
25 25
26DEFINE_SPINLOCK(autofs4_lock);
27
28static int autofs4_dir_symlink(struct inode *,struct dentry *,const char *); 26static int autofs4_dir_symlink(struct inode *,struct dentry *,const char *);
29static int autofs4_dir_unlink(struct inode *,struct dentry *); 27static int autofs4_dir_unlink(struct inode *,struct dentry *);
30static int autofs4_dir_rmdir(struct inode *,struct dentry *); 28static int autofs4_dir_rmdir(struct inode *,struct dentry *);
@@ -36,7 +34,7 @@ static long autofs4_root_compat_ioctl(struct file *,unsigned int,unsigned long);
36static int autofs4_dir_open(struct inode *inode, struct file *file); 34static int autofs4_dir_open(struct inode *inode, struct file *file);
37static struct dentry *autofs4_lookup(struct inode *,struct dentry *, struct nameidata *); 35static struct dentry *autofs4_lookup(struct inode *,struct dentry *, struct nameidata *);
38static struct vfsmount *autofs4_d_automount(struct path *); 36static struct vfsmount *autofs4_d_automount(struct path *);
39static int autofs4_d_manage(struct dentry *, bool, bool); 37static int autofs4_d_manage(struct dentry *, bool);
40static void autofs4_dentry_release(struct dentry *); 38static void autofs4_dentry_release(struct dentry *);
41 39
42const struct file_operations autofs4_root_operations = { 40const struct file_operations autofs4_root_operations = {
@@ -125,15 +123,15 @@ static int autofs4_dir_open(struct inode *inode, struct file *file)
125 * autofs file system so just let the libfs routines handle 123 * autofs file system so just let the libfs routines handle
126 * it. 124 * it.
127 */ 125 */
128 spin_lock(&autofs4_lock); 126 spin_lock(&sbi->lookup_lock);
129 spin_lock(&dentry->d_lock); 127 spin_lock(&dentry->d_lock);
130 if (!d_mountpoint(dentry) && list_empty(&dentry->d_subdirs)) { 128 if (!d_mountpoint(dentry) && list_empty(&dentry->d_subdirs)) {
131 spin_unlock(&dentry->d_lock); 129 spin_unlock(&dentry->d_lock);
132 spin_unlock(&autofs4_lock); 130 spin_unlock(&sbi->lookup_lock);
133 return -ENOENT; 131 return -ENOENT;
134 } 132 }
135 spin_unlock(&dentry->d_lock); 133 spin_unlock(&dentry->d_lock);
136 spin_unlock(&autofs4_lock); 134 spin_unlock(&sbi->lookup_lock);
137 135
138out: 136out:
139 return dcache_dir_open(inode, file); 137 return dcache_dir_open(inode, file);
@@ -171,7 +169,6 @@ static struct dentry *autofs4_lookup_active(struct dentry *dentry)
171 const unsigned char *str = name->name; 169 const unsigned char *str = name->name;
172 struct list_head *p, *head; 170 struct list_head *p, *head;
173 171
174 spin_lock(&autofs4_lock);
175 spin_lock(&sbi->lookup_lock); 172 spin_lock(&sbi->lookup_lock);
176 head = &sbi->active_list; 173 head = &sbi->active_list;
177 list_for_each(p, head) { 174 list_for_each(p, head) {
@@ -204,14 +201,12 @@ static struct dentry *autofs4_lookup_active(struct dentry *dentry)
204 dget_dlock(active); 201 dget_dlock(active);
205 spin_unlock(&active->d_lock); 202 spin_unlock(&active->d_lock);
206 spin_unlock(&sbi->lookup_lock); 203 spin_unlock(&sbi->lookup_lock);
207 spin_unlock(&autofs4_lock);
208 return active; 204 return active;
209 } 205 }
210next: 206next:
211 spin_unlock(&active->d_lock); 207 spin_unlock(&active->d_lock);
212 } 208 }
213 spin_unlock(&sbi->lookup_lock); 209 spin_unlock(&sbi->lookup_lock);
214 spin_unlock(&autofs4_lock);
215 210
216 return NULL; 211 return NULL;
217} 212}
@@ -226,7 +221,6 @@ static struct dentry *autofs4_lookup_expiring(struct dentry *dentry)
226 const unsigned char *str = name->name; 221 const unsigned char *str = name->name;
227 struct list_head *p, *head; 222 struct list_head *p, *head;
228 223
229 spin_lock(&autofs4_lock);
230 spin_lock(&sbi->lookup_lock); 224 spin_lock(&sbi->lookup_lock);
231 head = &sbi->expiring_list; 225 head = &sbi->expiring_list;
232 list_for_each(p, head) { 226 list_for_each(p, head) {
@@ -259,14 +253,12 @@ static struct dentry *autofs4_lookup_expiring(struct dentry *dentry)
259 dget_dlock(expiring); 253 dget_dlock(expiring);
260 spin_unlock(&expiring->d_lock); 254 spin_unlock(&expiring->d_lock);
261 spin_unlock(&sbi->lookup_lock); 255 spin_unlock(&sbi->lookup_lock);
262 spin_unlock(&autofs4_lock);
263 return expiring; 256 return expiring;
264 } 257 }
265next: 258next:
266 spin_unlock(&expiring->d_lock); 259 spin_unlock(&expiring->d_lock);
267 } 260 }
268 spin_unlock(&sbi->lookup_lock); 261 spin_unlock(&sbi->lookup_lock);
269 spin_unlock(&autofs4_lock);
270 262
271 return NULL; 263 return NULL;
272} 264}
@@ -275,17 +267,16 @@ static int autofs4_mount_wait(struct dentry *dentry)
275{ 267{
276 struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); 268 struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
277 struct autofs_info *ino = autofs4_dentry_ino(dentry); 269 struct autofs_info *ino = autofs4_dentry_ino(dentry);
278 int status; 270 int status = 0;
279 271
280 if (ino->flags & AUTOFS_INF_PENDING) { 272 if (ino->flags & AUTOFS_INF_PENDING) {
281 DPRINTK("waiting for mount name=%.*s", 273 DPRINTK("waiting for mount name=%.*s",
282 dentry->d_name.len, dentry->d_name.name); 274 dentry->d_name.len, dentry->d_name.name);
283 status = autofs4_wait(sbi, dentry, NFY_MOUNT); 275 status = autofs4_wait(sbi, dentry, NFY_MOUNT);
284 DPRINTK("mount wait done status=%d", status); 276 DPRINTK("mount wait done status=%d", status);
285 ino->last_used = jiffies;
286 return status;
287 } 277 }
288 return 0; 278 ino->last_used = jiffies;
279 return status;
289} 280}
290 281
291static int do_expire_wait(struct dentry *dentry) 282static int do_expire_wait(struct dentry *dentry)
@@ -319,9 +310,12 @@ static struct dentry *autofs4_mountpoint_changed(struct path *path)
319 */ 310 */
320 if (autofs_type_indirect(sbi->type) && d_unhashed(dentry)) { 311 if (autofs_type_indirect(sbi->type) && d_unhashed(dentry)) {
321 struct dentry *parent = dentry->d_parent; 312 struct dentry *parent = dentry->d_parent;
313 struct autofs_info *ino;
322 struct dentry *new = d_lookup(parent, &dentry->d_name); 314 struct dentry *new = d_lookup(parent, &dentry->d_name);
323 if (!new) 315 if (!new)
324 return NULL; 316 return NULL;
317 ino = autofs4_dentry_ino(new);
318 ino->last_used = jiffies;
325 dput(path->dentry); 319 dput(path->dentry);
326 path->dentry = new; 320 path->dentry = new;
327 } 321 }
@@ -338,18 +332,6 @@ static struct vfsmount *autofs4_d_automount(struct path *path)
338 DPRINTK("dentry=%p %.*s", 332 DPRINTK("dentry=%p %.*s",
339 dentry, dentry->d_name.len, dentry->d_name.name); 333 dentry, dentry->d_name.len, dentry->d_name.name);
340 334
341 /*
342 * Someone may have manually umounted this or it was a submount
343 * that has gone away.
344 */
345 spin_lock(&dentry->d_lock);
346 if (!d_mountpoint(dentry) && list_empty(&dentry->d_subdirs)) {
347 if (!(dentry->d_flags & DCACHE_MANAGE_TRANSIT) &&
348 (dentry->d_flags & DCACHE_NEED_AUTOMOUNT))
349 __managed_dentry_set_transit(path->dentry);
350 }
351 spin_unlock(&dentry->d_lock);
352
353 /* The daemon never triggers a mount. */ 335 /* The daemon never triggers a mount. */
354 if (autofs4_oz_mode(sbi)) 336 if (autofs4_oz_mode(sbi))
355 return NULL; 337 return NULL;
@@ -418,18 +400,17 @@ static struct vfsmount *autofs4_d_automount(struct path *path)
418done: 400done:
419 if (!(ino->flags & AUTOFS_INF_EXPIRING)) { 401 if (!(ino->flags & AUTOFS_INF_EXPIRING)) {
420 /* 402 /*
421 * Any needed mounting has been completed and the path updated 403 * Any needed mounting has been completed and the path
422 * so turn this into a normal dentry so we don't continually 404 * updated so clear DCACHE_NEED_AUTOMOUNT so we don't
423 * call ->d_automount() and ->d_manage(). 405 * call ->d_automount() on rootless multi-mounts since
424 */ 406 * it can lead to an incorrect ELOOP error return.
425 spin_lock(&dentry->d_lock); 407 *
426 __managed_dentry_clear_transit(dentry);
427 /*
428 * Only clear DMANAGED_AUTOMOUNT for rootless multi-mounts and 408 * Only clear DMANAGED_AUTOMOUNT for rootless multi-mounts and
429 * symlinks as in all other cases the dentry will be covered by 409 * symlinks as in all other cases the dentry will be covered by
430 * an actual mount so ->d_automount() won't be called during 410 * an actual mount so ->d_automount() won't be called during
431 * the follow. 411 * the follow.
432 */ 412 */
413 spin_lock(&dentry->d_lock);
433 if ((!d_mountpoint(dentry) && 414 if ((!d_mountpoint(dentry) &&
434 !list_empty(&dentry->d_subdirs)) || 415 !list_empty(&dentry->d_subdirs)) ||
435 (dentry->d_inode && S_ISLNK(dentry->d_inode->i_mode))) 416 (dentry->d_inode && S_ISLNK(dentry->d_inode->i_mode)))
@@ -446,7 +427,7 @@ done:
446 return NULL; 427 return NULL;
447} 428}
448 429
449int autofs4_d_manage(struct dentry *dentry, bool mounting_here, bool rcu_walk) 430int autofs4_d_manage(struct dentry *dentry, bool rcu_walk)
450{ 431{
451 struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); 432 struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
452 433
@@ -454,7 +435,9 @@ int autofs4_d_manage(struct dentry *dentry, bool mounting_here, bool rcu_walk)
454 dentry, dentry->d_name.len, dentry->d_name.name); 435 dentry, dentry->d_name.len, dentry->d_name.name);
455 436
456 /* The daemon never waits. */ 437 /* The daemon never waits. */
457 if (autofs4_oz_mode(sbi) || mounting_here) { 438 if (autofs4_oz_mode(sbi)) {
439 if (rcu_walk)
440 return 0;
458 if (!d_mountpoint(dentry)) 441 if (!d_mountpoint(dentry))
459 return -EISDIR; 442 return -EISDIR;
460 return 0; 443 return 0;
@@ -612,12 +595,12 @@ static int autofs4_dir_unlink(struct inode *dir, struct dentry *dentry)
612 595
613 dir->i_mtime = CURRENT_TIME; 596 dir->i_mtime = CURRENT_TIME;
614 597
615 spin_lock(&autofs4_lock); 598 spin_lock(&sbi->lookup_lock);
616 autofs4_add_expiring(dentry); 599 __autofs4_add_expiring(dentry);
617 spin_lock(&dentry->d_lock); 600 spin_lock(&dentry->d_lock);
618 __d_drop(dentry); 601 __d_drop(dentry);
619 spin_unlock(&dentry->d_lock); 602 spin_unlock(&dentry->d_lock);
620 spin_unlock(&autofs4_lock); 603 spin_unlock(&sbi->lookup_lock);
621 604
622 return 0; 605 return 0;
623} 606}
@@ -686,20 +669,17 @@ static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry)
686 if (!autofs4_oz_mode(sbi)) 669 if (!autofs4_oz_mode(sbi))
687 return -EACCES; 670 return -EACCES;
688 671
689 spin_lock(&autofs4_lock);
690 spin_lock(&sbi->lookup_lock); 672 spin_lock(&sbi->lookup_lock);
691 spin_lock(&dentry->d_lock); 673 spin_lock(&dentry->d_lock);
692 if (!list_empty(&dentry->d_subdirs)) { 674 if (!list_empty(&dentry->d_subdirs)) {
693 spin_unlock(&dentry->d_lock); 675 spin_unlock(&dentry->d_lock);
694 spin_unlock(&sbi->lookup_lock); 676 spin_unlock(&sbi->lookup_lock);
695 spin_unlock(&autofs4_lock);
696 return -ENOTEMPTY; 677 return -ENOTEMPTY;
697 } 678 }
698 __autofs4_add_expiring(dentry); 679 __autofs4_add_expiring(dentry);
699 spin_unlock(&sbi->lookup_lock);
700 __d_drop(dentry); 680 __d_drop(dentry);
701 spin_unlock(&dentry->d_lock); 681 spin_unlock(&dentry->d_lock);
702 spin_unlock(&autofs4_lock); 682 spin_unlock(&sbi->lookup_lock);
703 683
704 if (sbi->version < 5) 684 if (sbi->version < 5)
705 autofs_clear_leaf_automount_flags(dentry); 685 autofs_clear_leaf_automount_flags(dentry);
diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c
index 56010056b2e6..25435987d6ae 100644
--- a/fs/autofs4/waitq.c
+++ b/fs/autofs4/waitq.c
@@ -197,12 +197,12 @@ rename_retry:
197 197
198 seq = read_seqbegin(&rename_lock); 198 seq = read_seqbegin(&rename_lock);
199 rcu_read_lock(); 199 rcu_read_lock();
200 spin_lock(&autofs4_lock); 200 spin_lock(&sbi->fs_lock);
201 for (tmp = dentry ; tmp != root ; tmp = tmp->d_parent) 201 for (tmp = dentry ; tmp != root ; tmp = tmp->d_parent)
202 len += tmp->d_name.len + 1; 202 len += tmp->d_name.len + 1;
203 203
204 if (!len || --len > NAME_MAX) { 204 if (!len || --len > NAME_MAX) {
205 spin_unlock(&autofs4_lock); 205 spin_unlock(&sbi->fs_lock);
206 rcu_read_unlock(); 206 rcu_read_unlock();
207 if (read_seqretry(&rename_lock, seq)) 207 if (read_seqretry(&rename_lock, seq))
208 goto rename_retry; 208 goto rename_retry;
@@ -218,7 +218,7 @@ rename_retry:
218 p -= tmp->d_name.len; 218 p -= tmp->d_name.len;
219 strncpy(p, tmp->d_name.name, tmp->d_name.len); 219 strncpy(p, tmp->d_name.name, tmp->d_name.len);
220 } 220 }
221 spin_unlock(&autofs4_lock); 221 spin_unlock(&sbi->fs_lock);
222 rcu_read_unlock(); 222 rcu_read_unlock();
223 if (read_seqretry(&rename_lock, seq)) 223 if (read_seqretry(&rename_lock, seq))
224 goto rename_retry; 224 goto rename_retry;
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index b1d0c794747b..06457ed8f3e7 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -75,7 +75,6 @@ static const struct inode_operations befs_dir_inode_operations = {
75 75
76static const struct address_space_operations befs_aops = { 76static const struct address_space_operations befs_aops = {
77 .readpage = befs_readpage, 77 .readpage = befs_readpage,
78 .sync_page = block_sync_page,
79 .bmap = befs_bmap, 78 .bmap = befs_bmap,
80}; 79};
81 80
diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c
index 685ecff3ab31..b14cebfd9047 100644
--- a/fs/bfs/dir.c
+++ b/fs/bfs/dir.c
@@ -97,7 +97,7 @@ static int bfs_create(struct inode *dir, struct dentry *dentry, int mode,
97 if (!inode) 97 if (!inode)
98 return -ENOSPC; 98 return -ENOSPC;
99 mutex_lock(&info->bfs_lock); 99 mutex_lock(&info->bfs_lock);
100 ino = find_first_zero_bit(info->si_imap, info->si_lasti); 100 ino = find_first_zero_bit(info->si_imap, info->si_lasti + 1);
101 if (ino > info->si_lasti) { 101 if (ino > info->si_lasti) {
102 mutex_unlock(&info->bfs_lock); 102 mutex_unlock(&info->bfs_lock);
103 iput(inode); 103 iput(inode);
diff --git a/fs/bfs/file.c b/fs/bfs/file.c
index eb67edd0f8ea..f20e8a71062f 100644
--- a/fs/bfs/file.c
+++ b/fs/bfs/file.c
@@ -186,7 +186,6 @@ static sector_t bfs_bmap(struct address_space *mapping, sector_t block)
186const struct address_space_operations bfs_aops = { 186const struct address_space_operations bfs_aops = {
187 .readpage = bfs_readpage, 187 .readpage = bfs_readpage,
188 .writepage = bfs_writepage, 188 .writepage = bfs_writepage,
189 .sync_page = block_sync_page,
190 .write_begin = bfs_write_begin, 189 .write_begin = bfs_write_begin,
191 .write_end = generic_write_end, 190 .write_end = generic_write_end,
192 .bmap = bfs_bmap, 191 .bmap = bfs_bmap,
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index d5b640ba6cb1..f34078d702d3 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -570,7 +570,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
570 unsigned long elf_entry; 570 unsigned long elf_entry;
571 unsigned long interp_load_addr = 0; 571 unsigned long interp_load_addr = 0;
572 unsigned long start_code, end_code, start_data, end_data; 572 unsigned long start_code, end_code, start_data, end_data;
573 unsigned long reloc_func_desc = 0; 573 unsigned long reloc_func_desc __maybe_unused = 0;
574 int executable_stack = EXSTACK_DEFAULT; 574 int executable_stack = EXSTACK_DEFAULT;
575 unsigned long def_flags = 0; 575 unsigned long def_flags = 0;
576 struct { 576 struct {
@@ -1906,7 +1906,7 @@ static int elf_core_dump(struct coredump_params *cprm)
1906 segs = current->mm->map_count; 1906 segs = current->mm->map_count;
1907 segs += elf_core_extra_phdrs(); 1907 segs += elf_core_extra_phdrs();
1908 1908
1909 gate_vma = get_gate_vma(current); 1909 gate_vma = get_gate_vma(current->mm);
1910 if (gate_vma != NULL) 1910 if (gate_vma != NULL)
1911 segs++; 1911 segs++;
1912 1912
diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c
index e49cce234c65..9c5e6b2cd11a 100644
--- a/fs/bio-integrity.c
+++ b/fs/bio-integrity.c
@@ -761,6 +761,9 @@ int bioset_integrity_create(struct bio_set *bs, int pool_size)
761{ 761{
762 unsigned int max_slab = vecs_to_idx(BIO_MAX_PAGES); 762 unsigned int max_slab = vecs_to_idx(BIO_MAX_PAGES);
763 763
764 if (bs->bio_integrity_pool)
765 return 0;
766
764 bs->bio_integrity_pool = 767 bs->bio_integrity_pool =
765 mempool_create_slab_pool(pool_size, bip_slab[max_slab].slab); 768 mempool_create_slab_pool(pool_size, bip_slab[max_slab].slab);
766 769
diff --git a/fs/bio.c b/fs/bio.c
index 4bd454fa844e..4d6d4b6c2bf1 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -43,7 +43,7 @@ static mempool_t *bio_split_pool __read_mostly;
43 * unsigned short 43 * unsigned short
44 */ 44 */
45#define BV(x) { .nr_vecs = x, .name = "biovec-"__stringify(x) } 45#define BV(x) { .nr_vecs = x, .name = "biovec-"__stringify(x) }
46struct biovec_slab bvec_slabs[BIOVEC_NR_POOLS] __read_mostly = { 46static struct biovec_slab bvec_slabs[BIOVEC_NR_POOLS] __read_mostly = {
47 BV(1), BV(4), BV(16), BV(64), BV(128), BV(BIO_MAX_PAGES), 47 BV(1), BV(4), BV(16), BV(64), BV(128), BV(BIO_MAX_PAGES),
48}; 48};
49#undef BV 49#undef BV
@@ -111,7 +111,7 @@ static struct kmem_cache *bio_find_or_create_slab(unsigned int extra_size)
111 if (!slab) 111 if (!slab)
112 goto out_unlock; 112 goto out_unlock;
113 113
114 printk("bio: create slab <%s> at %d\n", bslab->name, entry); 114 printk(KERN_INFO "bio: create slab <%s> at %d\n", bslab->name, entry);
115 bslab->slab = slab; 115 bslab->slab = slab;
116 bslab->slab_ref = 1; 116 bslab->slab_ref = 1;
117 bslab->slab_size = sz; 117 bslab->slab_size = sz;
@@ -1636,9 +1636,6 @@ struct bio_set *bioset_create(unsigned int pool_size, unsigned int front_pad)
1636 if (!bs->bio_pool) 1636 if (!bs->bio_pool)
1637 goto bad; 1637 goto bad;
1638 1638
1639 if (bioset_integrity_create(bs, pool_size))
1640 goto bad;
1641
1642 if (!biovec_create_pools(bs, pool_size)) 1639 if (!biovec_create_pools(bs, pool_size))
1643 return bs; 1640 return bs;
1644 1641
@@ -1656,12 +1653,10 @@ static void __init biovec_init_slabs(void)
1656 int size; 1653 int size;
1657 struct biovec_slab *bvs = bvec_slabs + i; 1654 struct biovec_slab *bvs = bvec_slabs + i;
1658 1655
1659#ifndef CONFIG_BLK_DEV_INTEGRITY
1660 if (bvs->nr_vecs <= BIO_INLINE_VECS) { 1656 if (bvs->nr_vecs <= BIO_INLINE_VECS) {
1661 bvs->slab = NULL; 1657 bvs->slab = NULL;
1662 continue; 1658 continue;
1663 } 1659 }
1664#endif
1665 1660
1666 size = bvs->nr_vecs * sizeof(struct bio_vec); 1661 size = bvs->nr_vecs * sizeof(struct bio_vec);
1667 bvs->slab = kmem_cache_create(bvs->name, size, 0, 1662 bvs->slab = kmem_cache_create(bvs->name, size, 0,
@@ -1684,6 +1679,9 @@ static int __init init_bio(void)
1684 if (!fs_bio_set) 1679 if (!fs_bio_set)
1685 panic("bio: can't allocate bios\n"); 1680 panic("bio: can't allocate bios\n");
1686 1681
1682 if (bioset_integrity_create(fs_bio_set, BIO_POOL_SIZE))
1683 panic("bio: can't create integrity pool\n");
1684
1687 bio_split_pool = mempool_create_kmalloc_pool(BIO_SPLIT_ENTRIES, 1685 bio_split_pool = mempool_create_kmalloc_pool(BIO_SPLIT_ENTRIES,
1688 sizeof(struct bio_pair)); 1686 sizeof(struct bio_pair));
1689 if (!bio_split_pool) 1687 if (!bio_split_pool)
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 333a7bb4cb9c..c1511c674f53 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -55,11 +55,13 @@ EXPORT_SYMBOL(I_BDEV);
55static void bdev_inode_switch_bdi(struct inode *inode, 55static void bdev_inode_switch_bdi(struct inode *inode,
56 struct backing_dev_info *dst) 56 struct backing_dev_info *dst)
57{ 57{
58 spin_lock(&inode_lock); 58 spin_lock(&inode_wb_list_lock);
59 spin_lock(&inode->i_lock);
59 inode->i_data.backing_dev_info = dst; 60 inode->i_data.backing_dev_info = dst;
60 if (inode->i_state & I_DIRTY) 61 if (inode->i_state & I_DIRTY)
61 list_move(&inode->i_wb_list, &dst->wb.b_dirty); 62 list_move(&inode->i_wb_list, &dst->wb.b_dirty);
62 spin_unlock(&inode_lock); 63 spin_unlock(&inode->i_lock);
64 spin_unlock(&inode_wb_list_lock);
63} 65}
64 66
65static sector_t max_block(struct block_device *bdev) 67static sector_t max_block(struct block_device *bdev)
@@ -873,6 +875,11 @@ int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk)
873 ret = add_symlink(bdev->bd_part->holder_dir, &disk_to_dev(disk)->kobj); 875 ret = add_symlink(bdev->bd_part->holder_dir, &disk_to_dev(disk)->kobj);
874 if (ret) 876 if (ret)
875 goto out_del; 877 goto out_del;
878 /*
879 * bdev could be deleted beneath us which would implicitly destroy
880 * the holder directory. Hold on to it.
881 */
882 kobject_get(bdev->bd_part->holder_dir);
876 883
877 list_add(&holder->list, &bdev->bd_holder_disks); 884 list_add(&holder->list, &bdev->bd_holder_disks);
878 goto out_unlock; 885 goto out_unlock;
@@ -909,6 +916,7 @@ void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk)
909 del_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj); 916 del_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj);
910 del_symlink(bdev->bd_part->holder_dir, 917 del_symlink(bdev->bd_part->holder_dir,
911 &disk_to_dev(disk)->kobj); 918 &disk_to_dev(disk)->kobj);
919 kobject_put(bdev->bd_part->holder_dir);
912 list_del_init(&holder->list); 920 list_del_init(&holder->list);
913 kfree(holder); 921 kfree(holder);
914 } 922 }
@@ -922,14 +930,15 @@ EXPORT_SYMBOL_GPL(bd_unlink_disk_holder);
922 * flush_disk - invalidates all buffer-cache entries on a disk 930 * flush_disk - invalidates all buffer-cache entries on a disk
923 * 931 *
924 * @bdev: struct block device to be flushed 932 * @bdev: struct block device to be flushed
933 * @kill_dirty: flag to guide handling of dirty inodes
925 * 934 *
926 * Invalidates all buffer-cache entries on a disk. It should be called 935 * Invalidates all buffer-cache entries on a disk. It should be called
927 * when a disk has been changed -- either by a media change or online 936 * when a disk has been changed -- either by a media change or online
928 * resize. 937 * resize.
929 */ 938 */
930static void flush_disk(struct block_device *bdev) 939static void flush_disk(struct block_device *bdev, bool kill_dirty)
931{ 940{
932 if (__invalidate_device(bdev)) { 941 if (__invalidate_device(bdev, kill_dirty)) {
933 char name[BDEVNAME_SIZE] = ""; 942 char name[BDEVNAME_SIZE] = "";
934 943
935 if (bdev->bd_disk) 944 if (bdev->bd_disk)
@@ -966,7 +975,7 @@ void check_disk_size_change(struct gendisk *disk, struct block_device *bdev)
966 "%s: detected capacity change from %lld to %lld\n", 975 "%s: detected capacity change from %lld to %lld\n",
967 name, bdev_size, disk_size); 976 name, bdev_size, disk_size);
968 i_size_write(bdev->bd_inode, disk_size); 977 i_size_write(bdev->bd_inode, disk_size);
969 flush_disk(bdev); 978 flush_disk(bdev, false);
970 } 979 }
971} 980}
972EXPORT_SYMBOL(check_disk_size_change); 981EXPORT_SYMBOL(check_disk_size_change);
@@ -1019,7 +1028,7 @@ int check_disk_change(struct block_device *bdev)
1019 if (!(events & DISK_EVENT_MEDIA_CHANGE)) 1028 if (!(events & DISK_EVENT_MEDIA_CHANGE))
1020 return 0; 1029 return 0;
1021 1030
1022 flush_disk(bdev); 1031 flush_disk(bdev, true);
1023 if (bdops->revalidate_disk) 1032 if (bdops->revalidate_disk)
1024 bdops->revalidate_disk(bdev->bd_disk); 1033 bdops->revalidate_disk(bdev->bd_disk);
1025 return 1; 1034 return 1;
@@ -1080,6 +1089,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
1080 if (!disk) 1089 if (!disk)
1081 goto out; 1090 goto out;
1082 1091
1092 disk_block_events(disk);
1083 mutex_lock_nested(&bdev->bd_mutex, for_part); 1093 mutex_lock_nested(&bdev->bd_mutex, for_part);
1084 if (!bdev->bd_openers) { 1094 if (!bdev->bd_openers) {
1085 bdev->bd_disk = disk; 1095 bdev->bd_disk = disk;
@@ -1101,10 +1111,11 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
1101 */ 1111 */
1102 disk_put_part(bdev->bd_part); 1112 disk_put_part(bdev->bd_part);
1103 bdev->bd_part = NULL; 1113 bdev->bd_part = NULL;
1104 module_put(disk->fops->owner);
1105 put_disk(disk);
1106 bdev->bd_disk = NULL; 1114 bdev->bd_disk = NULL;
1107 mutex_unlock(&bdev->bd_mutex); 1115 mutex_unlock(&bdev->bd_mutex);
1116 disk_unblock_events(disk);
1117 module_put(disk->fops->owner);
1118 put_disk(disk);
1108 goto restart; 1119 goto restart;
1109 } 1120 }
1110 if (ret) 1121 if (ret)
@@ -1141,9 +1152,6 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
1141 bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9); 1152 bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9);
1142 } 1153 }
1143 } else { 1154 } else {
1144 module_put(disk->fops->owner);
1145 put_disk(disk);
1146 disk = NULL;
1147 if (bdev->bd_contains == bdev) { 1155 if (bdev->bd_contains == bdev) {
1148 if (bdev->bd_disk->fops->open) { 1156 if (bdev->bd_disk->fops->open) {
1149 ret = bdev->bd_disk->fops->open(bdev, mode); 1157 ret = bdev->bd_disk->fops->open(bdev, mode);
@@ -1153,11 +1161,15 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
1153 if (bdev->bd_invalidated) 1161 if (bdev->bd_invalidated)
1154 rescan_partitions(bdev->bd_disk, bdev); 1162 rescan_partitions(bdev->bd_disk, bdev);
1155 } 1163 }
1164 /* only one opener holds refs to the module and disk */
1165 module_put(disk->fops->owner);
1166 put_disk(disk);
1156 } 1167 }
1157 bdev->bd_openers++; 1168 bdev->bd_openers++;
1158 if (for_part) 1169 if (for_part)
1159 bdev->bd_part_count++; 1170 bdev->bd_part_count++;
1160 mutex_unlock(&bdev->bd_mutex); 1171 mutex_unlock(&bdev->bd_mutex);
1172 disk_unblock_events(disk);
1161 return 0; 1173 return 0;
1162 1174
1163 out_clear: 1175 out_clear:
@@ -1170,10 +1182,10 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
1170 bdev->bd_contains = NULL; 1182 bdev->bd_contains = NULL;
1171 out_unlock_bdev: 1183 out_unlock_bdev:
1172 mutex_unlock(&bdev->bd_mutex); 1184 mutex_unlock(&bdev->bd_mutex);
1173 out: 1185 disk_unblock_events(disk);
1174 if (disk) 1186 module_put(disk->fops->owner);
1175 module_put(disk->fops->owner);
1176 put_disk(disk); 1187 put_disk(disk);
1188 out:
1177 bdput(bdev); 1189 bdput(bdev);
1178 1190
1179 return ret; 1191 return ret;
@@ -1215,12 +1227,6 @@ int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder)
1215 1227
1216 res = __blkdev_get(bdev, mode, 0); 1228 res = __blkdev_get(bdev, mode, 0);
1217 1229
1218 /* __blkdev_get() may alter read only status, check it afterwards */
1219 if (!res && (mode & FMODE_WRITE) && bdev_read_only(bdev)) {
1220 __blkdev_put(bdev, mode, 0);
1221 res = -EACCES;
1222 }
1223
1224 if (whole) { 1230 if (whole) {
1225 /* finish claiming */ 1231 /* finish claiming */
1226 mutex_lock(&bdev->bd_mutex); 1232 mutex_lock(&bdev->bd_mutex);
@@ -1298,6 +1304,11 @@ struct block_device *blkdev_get_by_path(const char *path, fmode_t mode,
1298 if (err) 1304 if (err)
1299 return ERR_PTR(err); 1305 return ERR_PTR(err);
1300 1306
1307 if ((mode & FMODE_WRITE) && bdev_read_only(bdev)) {
1308 blkdev_put(bdev, mode);
1309 return ERR_PTR(-EACCES);
1310 }
1311
1301 return bdev; 1312 return bdev;
1302} 1313}
1303EXPORT_SYMBOL(blkdev_get_by_path); 1314EXPORT_SYMBOL(blkdev_get_by_path);
@@ -1440,14 +1451,13 @@ int blkdev_put(struct block_device *bdev, fmode_t mode)
1440 if (bdev_free) { 1451 if (bdev_free) {
1441 if (bdev->bd_write_holder) { 1452 if (bdev->bd_write_holder) {
1442 disk_unblock_events(bdev->bd_disk); 1453 disk_unblock_events(bdev->bd_disk);
1443 bdev->bd_write_holder = false;
1444 } else
1445 disk_check_events(bdev->bd_disk); 1454 disk_check_events(bdev->bd_disk);
1455 bdev->bd_write_holder = false;
1456 }
1446 } 1457 }
1447 1458
1448 mutex_unlock(&bdev->bd_mutex); 1459 mutex_unlock(&bdev->bd_mutex);
1449 } else 1460 }
1450 disk_check_events(bdev->bd_disk);
1451 1461
1452 return __blkdev_put(bdev, mode, 0); 1462 return __blkdev_put(bdev, mode, 0);
1453} 1463}
@@ -1521,7 +1531,6 @@ static int blkdev_releasepage(struct page *page, gfp_t wait)
1521static const struct address_space_operations def_blk_aops = { 1531static const struct address_space_operations def_blk_aops = {
1522 .readpage = blkdev_readpage, 1532 .readpage = blkdev_readpage,
1523 .writepage = blkdev_writepage, 1533 .writepage = blkdev_writepage,
1524 .sync_page = block_sync_page,
1525 .write_begin = blkdev_write_begin, 1534 .write_begin = blkdev_write_begin,
1526 .write_end = blkdev_write_end, 1535 .write_end = blkdev_write_end,
1527 .writepages = generic_writepages, 1536 .writepages = generic_writepages,
@@ -1601,7 +1610,7 @@ fail:
1601} 1610}
1602EXPORT_SYMBOL(lookup_bdev); 1611EXPORT_SYMBOL(lookup_bdev);
1603 1612
1604int __invalidate_device(struct block_device *bdev) 1613int __invalidate_device(struct block_device *bdev, bool kill_dirty)
1605{ 1614{
1606 struct super_block *sb = get_super(bdev); 1615 struct super_block *sb = get_super(bdev);
1607 int res = 0; 1616 int res = 0;
@@ -1614,7 +1623,7 @@ int __invalidate_device(struct block_device *bdev)
1614 * hold). 1623 * hold).
1615 */ 1624 */
1616 shrink_dcache_sb(sb); 1625 shrink_dcache_sb(sb);
1617 res = invalidate_inodes(sb); 1626 res = invalidate_inodes(sb, kill_dirty);
1618 drop_super(sb); 1627 drop_super(sb);
1619 } 1628 }
1620 invalidate_bdev(bdev); 1629 invalidate_bdev(bdev);
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 15b5ca2a2606..de34bfad9ec3 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -37,6 +37,9 @@ static struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
37 char *value = NULL; 37 char *value = NULL;
38 struct posix_acl *acl; 38 struct posix_acl *acl;
39 39
40 if (!IS_POSIXACL(inode))
41 return NULL;
42
40 acl = get_cached_acl(inode, type); 43 acl = get_cached_acl(inode, type);
41 if (acl != ACL_NOT_CACHED) 44 if (acl != ACL_NOT_CACHED)
42 return acl; 45 return acl;
@@ -84,6 +87,9 @@ static int btrfs_xattr_acl_get(struct dentry *dentry, const char *name,
84 struct posix_acl *acl; 87 struct posix_acl *acl;
85 int ret = 0; 88 int ret = 0;
86 89
90 if (!IS_POSIXACL(dentry->d_inode))
91 return -EOPNOTSUPP;
92
87 acl = btrfs_get_acl(dentry->d_inode, type); 93 acl = btrfs_get_acl(dentry->d_inode, type);
88 94
89 if (IS_ERR(acl)) 95 if (IS_ERR(acl))
@@ -164,7 +170,7 @@ static int btrfs_xattr_acl_set(struct dentry *dentry, const char *name,
164 int ret; 170 int ret;
165 struct posix_acl *acl = NULL; 171 struct posix_acl *acl = NULL;
166 172
167 if (!is_owner_or_cap(dentry->d_inode)) 173 if (!inode_owner_or_capable(dentry->d_inode))
168 return -EPERM; 174 return -EPERM;
169 175
170 if (!IS_POSIXACL(dentry->d_inode)) 176 if (!IS_POSIXACL(dentry->d_inode))
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index f745287fbf2e..4d2110eafe29 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -562,7 +562,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
562 u64 em_len; 562 u64 em_len;
563 u64 em_start; 563 u64 em_start;
564 struct extent_map *em; 564 struct extent_map *em;
565 int ret; 565 int ret = -ENOMEM;
566 u32 *sums; 566 u32 *sums;
567 567
568 tree = &BTRFS_I(inode)->io_tree; 568 tree = &BTRFS_I(inode)->io_tree;
@@ -577,6 +577,9 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
577 577
578 compressed_len = em->block_len; 578 compressed_len = em->block_len;
579 cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS); 579 cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS);
580 if (!cb)
581 goto out;
582
580 atomic_set(&cb->pending_bios, 0); 583 atomic_set(&cb->pending_bios, 0);
581 cb->errors = 0; 584 cb->errors = 0;
582 cb->inode = inode; 585 cb->inode = inode;
@@ -597,13 +600,18 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
597 600
598 nr_pages = (compressed_len + PAGE_CACHE_SIZE - 1) / 601 nr_pages = (compressed_len + PAGE_CACHE_SIZE - 1) /
599 PAGE_CACHE_SIZE; 602 PAGE_CACHE_SIZE;
600 cb->compressed_pages = kmalloc(sizeof(struct page *) * nr_pages, 603 cb->compressed_pages = kzalloc(sizeof(struct page *) * nr_pages,
601 GFP_NOFS); 604 GFP_NOFS);
605 if (!cb->compressed_pages)
606 goto fail1;
607
602 bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; 608 bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
603 609
604 for (page_index = 0; page_index < nr_pages; page_index++) { 610 for (page_index = 0; page_index < nr_pages; page_index++) {
605 cb->compressed_pages[page_index] = alloc_page(GFP_NOFS | 611 cb->compressed_pages[page_index] = alloc_page(GFP_NOFS |
606 __GFP_HIGHMEM); 612 __GFP_HIGHMEM);
613 if (!cb->compressed_pages[page_index])
614 goto fail2;
607 } 615 }
608 cb->nr_pages = nr_pages; 616 cb->nr_pages = nr_pages;
609 617
@@ -614,6 +622,8 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
614 cb->len = uncompressed_len; 622 cb->len = uncompressed_len;
615 623
616 comp_bio = compressed_bio_alloc(bdev, cur_disk_byte, GFP_NOFS); 624 comp_bio = compressed_bio_alloc(bdev, cur_disk_byte, GFP_NOFS);
625 if (!comp_bio)
626 goto fail2;
617 comp_bio->bi_private = cb; 627 comp_bio->bi_private = cb;
618 comp_bio->bi_end_io = end_compressed_bio_read; 628 comp_bio->bi_end_io = end_compressed_bio_read;
619 atomic_inc(&cb->pending_bios); 629 atomic_inc(&cb->pending_bios);
@@ -681,6 +691,17 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
681 691
682 bio_put(comp_bio); 692 bio_put(comp_bio);
683 return 0; 693 return 0;
694
695fail2:
696 for (page_index = 0; page_index < nr_pages; page_index++)
697 free_page((unsigned long)cb->compressed_pages[page_index]);
698
699 kfree(cb->compressed_pages);
700fail1:
701 kfree(cb);
702out:
703 free_extent_map(em);
704 return ret;
684} 705}
685 706
686static struct list_head comp_idle_workspace[BTRFS_COMPRESS_TYPES]; 707static struct list_head comp_idle_workspace[BTRFS_COMPRESS_TYPES];
@@ -900,7 +921,7 @@ int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page,
900 return ret; 921 return ret;
901} 922}
902 923
903void __exit btrfs_exit_compress(void) 924void btrfs_exit_compress(void)
904{ 925{
905 free_workspaces(); 926 free_workspaces();
906} 927}
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 2c98b3af6052..7f78cc78fdd0 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -729,6 +729,15 @@ struct btrfs_space_info {
729 u64 disk_total; /* total bytes on disk, takes mirrors into 729 u64 disk_total; /* total bytes on disk, takes mirrors into
730 account */ 730 account */
731 731
732 /*
733 * we bump reservation progress every time we decrement
734 * bytes_reserved. This way people waiting for reservations
735 * know something good has happened and they can check
736 * for progress. The number here isn't to be trusted, it
737 * just shows reclaim activity
738 */
739 unsigned long reservation_progress;
740
732 int full; /* indicates that we cannot allocate any more 741 int full; /* indicates that we cannot allocate any more
733 chunks for this space */ 742 chunks for this space */
734 int force_alloc; /* set if we need to force a chunk alloc for 743 int force_alloc; /* set if we need to force a chunk alloc for
@@ -1254,6 +1263,7 @@ struct btrfs_root {
1254#define BTRFS_MOUNT_SPACE_CACHE (1 << 12) 1263#define BTRFS_MOUNT_SPACE_CACHE (1 << 12)
1255#define BTRFS_MOUNT_CLEAR_CACHE (1 << 13) 1264#define BTRFS_MOUNT_CLEAR_CACHE (1 << 13)
1256#define BTRFS_MOUNT_USER_SUBVOL_RM_ALLOWED (1 << 14) 1265#define BTRFS_MOUNT_USER_SUBVOL_RM_ALLOWED (1 << 14)
1266#define BTRFS_MOUNT_ENOSPC_DEBUG (1 << 15)
1257 1267
1258#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) 1268#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt)
1259#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) 1269#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt)
@@ -2218,6 +2228,8 @@ int btrfs_error_unpin_extent_range(struct btrfs_root *root,
2218 u64 start, u64 end); 2228 u64 start, u64 end);
2219int btrfs_error_discard_extent(struct btrfs_root *root, u64 bytenr, 2229int btrfs_error_discard_extent(struct btrfs_root *root, u64 bytenr,
2220 u64 num_bytes); 2230 u64 num_bytes);
2231int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans,
2232 struct btrfs_root *root, u64 type);
2221 2233
2222/* ctree.c */ 2234/* ctree.c */
2223int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, 2235int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index b531c36455d8..830d261d0e6b 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -359,10 +359,14 @@ static int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
359 359
360 tree = &BTRFS_I(page->mapping->host)->io_tree; 360 tree = &BTRFS_I(page->mapping->host)->io_tree;
361 361
362 if (page->private == EXTENT_PAGE_PRIVATE) 362 if (page->private == EXTENT_PAGE_PRIVATE) {
363 WARN_ON(1);
363 goto out; 364 goto out;
364 if (!page->private) 365 }
366 if (!page->private) {
367 WARN_ON(1);
365 goto out; 368 goto out;
369 }
366 len = page->private >> 2; 370 len = page->private >> 2;
367 WARN_ON(len == 0); 371 WARN_ON(len == 0);
368 372
@@ -843,7 +847,6 @@ static const struct address_space_operations btree_aops = {
843 .writepages = btree_writepages, 847 .writepages = btree_writepages,
844 .releasepage = btree_releasepage, 848 .releasepage = btree_releasepage,
845 .invalidatepage = btree_invalidatepage, 849 .invalidatepage = btree_invalidatepage,
846 .sync_page = block_sync_page,
847#ifdef CONFIG_MIGRATION 850#ifdef CONFIG_MIGRATION
848 .migratepage = btree_migratepage, 851 .migratepage = btree_migratepage,
849#endif 852#endif
@@ -1327,82 +1330,6 @@ static int btrfs_congested_fn(void *congested_data, int bdi_bits)
1327} 1330}
1328 1331
1329/* 1332/*
1330 * this unplugs every device on the box, and it is only used when page
1331 * is null
1332 */
1333static void __unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
1334{
1335 struct btrfs_device *device;
1336 struct btrfs_fs_info *info;
1337
1338 info = (struct btrfs_fs_info *)bdi->unplug_io_data;
1339 list_for_each_entry(device, &info->fs_devices->devices, dev_list) {
1340 if (!device->bdev)
1341 continue;
1342
1343 bdi = blk_get_backing_dev_info(device->bdev);
1344 if (bdi->unplug_io_fn)
1345 bdi->unplug_io_fn(bdi, page);
1346 }
1347}
1348
1349static void btrfs_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
1350{
1351 struct inode *inode;
1352 struct extent_map_tree *em_tree;
1353 struct extent_map *em;
1354 struct address_space *mapping;
1355 u64 offset;
1356
1357 /* the generic O_DIRECT read code does this */
1358 if (1 || !page) {
1359 __unplug_io_fn(bdi, page);
1360 return;
1361 }
1362
1363 /*
1364 * page->mapping may change at any time. Get a consistent copy
1365 * and use that for everything below
1366 */
1367 smp_mb();
1368 mapping = page->mapping;
1369 if (!mapping)
1370 return;
1371
1372 inode = mapping->host;
1373
1374 /*
1375 * don't do the expensive searching for a small number of
1376 * devices
1377 */
1378 if (BTRFS_I(inode)->root->fs_info->fs_devices->open_devices <= 2) {
1379 __unplug_io_fn(bdi, page);
1380 return;
1381 }
1382
1383 offset = page_offset(page);
1384
1385 em_tree = &BTRFS_I(inode)->extent_tree;
1386 read_lock(&em_tree->lock);
1387 em = lookup_extent_mapping(em_tree, offset, PAGE_CACHE_SIZE);
1388 read_unlock(&em_tree->lock);
1389 if (!em) {
1390 __unplug_io_fn(bdi, page);
1391 return;
1392 }
1393
1394 if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
1395 free_extent_map(em);
1396 __unplug_io_fn(bdi, page);
1397 return;
1398 }
1399 offset = offset - em->start;
1400 btrfs_unplug_page(&BTRFS_I(inode)->root->fs_info->mapping_tree,
1401 em->block_start + offset, page);
1402 free_extent_map(em);
1403}
1404
1405/*
1406 * If this fails, caller must call bdi_destroy() to get rid of the 1333 * If this fails, caller must call bdi_destroy() to get rid of the
1407 * bdi again. 1334 * bdi again.
1408 */ 1335 */
@@ -1416,8 +1343,6 @@ static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
1416 return err; 1343 return err;
1417 1344
1418 bdi->ra_pages = default_backing_dev_info.ra_pages; 1345 bdi->ra_pages = default_backing_dev_info.ra_pages;
1419 bdi->unplug_io_fn = btrfs_unplug_io_fn;
1420 bdi->unplug_io_data = info;
1421 bdi->congested_fn = btrfs_congested_fn; 1346 bdi->congested_fn = btrfs_congested_fn;
1422 bdi->congested_data = info; 1347 bdi->congested_data = info;
1423 return 0; 1348 return 0;
@@ -1550,6 +1475,7 @@ static int transaction_kthread(void *arg)
1550 spin_unlock(&root->fs_info->new_trans_lock); 1475 spin_unlock(&root->fs_info->new_trans_lock);
1551 1476
1552 trans = btrfs_join_transaction(root, 1); 1477 trans = btrfs_join_transaction(root, 1);
1478 BUG_ON(IS_ERR(trans));
1553 if (transid == trans->transid) { 1479 if (transid == trans->transid) {
1554 ret = btrfs_commit_transaction(trans, root); 1480 ret = btrfs_commit_transaction(trans, root);
1555 BUG_ON(ret); 1481 BUG_ON(ret);
@@ -2453,10 +2379,14 @@ int btrfs_commit_super(struct btrfs_root *root)
2453 up_write(&root->fs_info->cleanup_work_sem); 2379 up_write(&root->fs_info->cleanup_work_sem);
2454 2380
2455 trans = btrfs_join_transaction(root, 1); 2381 trans = btrfs_join_transaction(root, 1);
2382 if (IS_ERR(trans))
2383 return PTR_ERR(trans);
2456 ret = btrfs_commit_transaction(trans, root); 2384 ret = btrfs_commit_transaction(trans, root);
2457 BUG_ON(ret); 2385 BUG_ON(ret);
2458 /* run commit again to drop the original snapshot */ 2386 /* run commit again to drop the original snapshot */
2459 trans = btrfs_join_transaction(root, 1); 2387 trans = btrfs_join_transaction(root, 1);
2388 if (IS_ERR(trans))
2389 return PTR_ERR(trans);
2460 btrfs_commit_transaction(trans, root); 2390 btrfs_commit_transaction(trans, root);
2461 ret = btrfs_write_and_wait_transaction(NULL, root); 2391 ret = btrfs_write_and_wait_transaction(NULL, root);
2462 BUG_ON(ret); 2392 BUG_ON(ret);
@@ -2484,7 +2414,7 @@ int close_ctree(struct btrfs_root *root)
2484 * ERROR state on disk. 2414 * ERROR state on disk.
2485 * 2415 *
2486 * 2. when btrfs flips readonly just in btrfs_commit_super, 2416 * 2. when btrfs flips readonly just in btrfs_commit_super,
2487 * and in such case, btrfs cannnot write sb via btrfs_commit_super, 2417 * and in such case, btrfs cannot write sb via btrfs_commit_super,
2488 * and since fs_state has been set BTRFS_SUPER_FLAG_ERROR flag, 2418 * and since fs_state has been set BTRFS_SUPER_FLAG_ERROR flag,
2489 * btrfs will cleanup all FS resources first and write sb then. 2419 * btrfs will cleanup all FS resources first and write sb then.
2490 */ 2420 */
@@ -2554,6 +2484,8 @@ int close_ctree(struct btrfs_root *root)
2554 kfree(fs_info->chunk_root); 2484 kfree(fs_info->chunk_root);
2555 kfree(fs_info->dev_root); 2485 kfree(fs_info->dev_root);
2556 kfree(fs_info->csum_root); 2486 kfree(fs_info->csum_root);
2487 kfree(fs_info);
2488
2557 return 0; 2489 return 0;
2558} 2490}
2559 2491
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index 9786963b07e5..b4ffad859adb 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -21,9 +21,13 @@ static int btrfs_encode_fh(struct dentry *dentry, u32 *fh, int *max_len,
21 int len = *max_len; 21 int len = *max_len;
22 int type; 22 int type;
23 23
24 if ((len < BTRFS_FID_SIZE_NON_CONNECTABLE) || 24 if (connectable && (len < BTRFS_FID_SIZE_CONNECTABLE)) {
25 (connectable && len < BTRFS_FID_SIZE_CONNECTABLE)) 25 *max_len = BTRFS_FID_SIZE_CONNECTABLE;
26 return 255; 26 return 255;
27 } else if (len < BTRFS_FID_SIZE_NON_CONNECTABLE) {
28 *max_len = BTRFS_FID_SIZE_NON_CONNECTABLE;
29 return 255;
30 }
27 31
28 len = BTRFS_FID_SIZE_NON_CONNECTABLE; 32 len = BTRFS_FID_SIZE_NON_CONNECTABLE;
29 type = FILEID_BTRFS_WITHOUT_PARENT; 33 type = FILEID_BTRFS_WITHOUT_PARENT;
@@ -171,6 +175,8 @@ static struct dentry *btrfs_get_parent(struct dentry *child)
171 int ret; 175 int ret;
172 176
173 path = btrfs_alloc_path(); 177 path = btrfs_alloc_path();
178 if (!path)
179 return ERR_PTR(-ENOMEM);
174 180
175 if (dir->i_ino == BTRFS_FIRST_FREE_OBJECTID) { 181 if (dir->i_ino == BTRFS_FIRST_FREE_OBJECTID) {
176 key.objectid = root->root_key.objectid; 182 key.objectid = root->root_key.objectid;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index b55269340cec..7b3089b5c2df 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -320,11 +320,6 @@ static int caching_kthread(void *data)
320 if (!path) 320 if (!path)
321 return -ENOMEM; 321 return -ENOMEM;
322 322
323 exclude_super_stripes(extent_root, block_group);
324 spin_lock(&block_group->space_info->lock);
325 block_group->space_info->bytes_readonly += block_group->bytes_super;
326 spin_unlock(&block_group->space_info->lock);
327
328 last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET); 323 last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
329 324
330 /* 325 /*
@@ -467,8 +462,10 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
467 cache->cached = BTRFS_CACHE_NO; 462 cache->cached = BTRFS_CACHE_NO;
468 } 463 }
469 spin_unlock(&cache->lock); 464 spin_unlock(&cache->lock);
470 if (ret == 1) 465 if (ret == 1) {
466 free_excluded_extents(fs_info->extent_root, cache);
471 return 0; 467 return 0;
468 }
472 } 469 }
473 470
474 if (load_cache_only) 471 if (load_cache_only)
@@ -3344,21 +3341,24 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
3344 u64 reserved; 3341 u64 reserved;
3345 u64 max_reclaim; 3342 u64 max_reclaim;
3346 u64 reclaimed = 0; 3343 u64 reclaimed = 0;
3347 int pause = 1; 3344 long time_left;
3348 int nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT; 3345 int nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT;
3346 int loops = 0;
3347 unsigned long progress;
3349 3348
3350 block_rsv = &root->fs_info->delalloc_block_rsv; 3349 block_rsv = &root->fs_info->delalloc_block_rsv;
3351 space_info = block_rsv->space_info; 3350 space_info = block_rsv->space_info;
3352 3351
3353 smp_mb(); 3352 smp_mb();
3354 reserved = space_info->bytes_reserved; 3353 reserved = space_info->bytes_reserved;
3354 progress = space_info->reservation_progress;
3355 3355
3356 if (reserved == 0) 3356 if (reserved == 0)
3357 return 0; 3357 return 0;
3358 3358
3359 max_reclaim = min(reserved, to_reclaim); 3359 max_reclaim = min(reserved, to_reclaim);
3360 3360
3361 while (1) { 3361 while (loops < 1024) {
3362 /* have the flusher threads jump in and do some IO */ 3362 /* have the flusher threads jump in and do some IO */
3363 smp_mb(); 3363 smp_mb();
3364 nr_pages = min_t(unsigned long, nr_pages, 3364 nr_pages = min_t(unsigned long, nr_pages,
@@ -3371,17 +3371,31 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
3371 reserved = space_info->bytes_reserved; 3371 reserved = space_info->bytes_reserved;
3372 spin_unlock(&space_info->lock); 3372 spin_unlock(&space_info->lock);
3373 3373
3374 loops++;
3375
3374 if (reserved == 0 || reclaimed >= max_reclaim) 3376 if (reserved == 0 || reclaimed >= max_reclaim)
3375 break; 3377 break;
3376 3378
3377 if (trans && trans->transaction->blocked) 3379 if (trans && trans->transaction->blocked)
3378 return -EAGAIN; 3380 return -EAGAIN;
3379 3381
3380 __set_current_state(TASK_INTERRUPTIBLE); 3382 time_left = schedule_timeout_interruptible(1);
3381 schedule_timeout(pause); 3383
3382 pause <<= 1; 3384 /* We were interrupted, exit */
3383 if (pause > HZ / 10) 3385 if (time_left)
3384 pause = HZ / 10; 3386 break;
3387
3388 /* we've kicked the IO a few times, if anything has been freed,
3389 * exit. There is no sense in looping here for a long time
3390 * when we really need to commit the transaction, or there are
3391 * just too many writers without enough free space
3392 */
3393
3394 if (loops > 3) {
3395 smp_mb();
3396 if (progress != space_info->reservation_progress)
3397 break;
3398 }
3385 3399
3386 } 3400 }
3387 return reclaimed >= to_reclaim; 3401 return reclaimed >= to_reclaim;
@@ -3588,10 +3602,23 @@ void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv,
3588 3602
3589 if (num_bytes > 0) { 3603 if (num_bytes > 0) {
3590 if (dest) { 3604 if (dest) {
3591 block_rsv_add_bytes(dest, num_bytes, 0); 3605 spin_lock(&dest->lock);
3592 } else { 3606 if (!dest->full) {
3607 u64 bytes_to_add;
3608
3609 bytes_to_add = dest->size - dest->reserved;
3610 bytes_to_add = min(num_bytes, bytes_to_add);
3611 dest->reserved += bytes_to_add;
3612 if (dest->reserved >= dest->size)
3613 dest->full = 1;
3614 num_bytes -= bytes_to_add;
3615 }
3616 spin_unlock(&dest->lock);
3617 }
3618 if (num_bytes) {
3593 spin_lock(&space_info->lock); 3619 spin_lock(&space_info->lock);
3594 space_info->bytes_reserved -= num_bytes; 3620 space_info->bytes_reserved -= num_bytes;
3621 space_info->reservation_progress++;
3595 spin_unlock(&space_info->lock); 3622 spin_unlock(&space_info->lock);
3596 } 3623 }
3597 } 3624 }
@@ -3824,6 +3851,7 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
3824 if (block_rsv->reserved >= block_rsv->size) { 3851 if (block_rsv->reserved >= block_rsv->size) {
3825 num_bytes = block_rsv->reserved - block_rsv->size; 3852 num_bytes = block_rsv->reserved - block_rsv->size;
3826 sinfo->bytes_reserved -= num_bytes; 3853 sinfo->bytes_reserved -= num_bytes;
3854 sinfo->reservation_progress++;
3827 block_rsv->reserved = block_rsv->size; 3855 block_rsv->reserved = block_rsv->size;
3828 block_rsv->full = 1; 3856 block_rsv->full = 1;
3829 } 3857 }
@@ -3985,7 +4013,6 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
3985 to_reserve = 0; 4013 to_reserve = 0;
3986 } 4014 }
3987 spin_unlock(&BTRFS_I(inode)->accounting_lock); 4015 spin_unlock(&BTRFS_I(inode)->accounting_lock);
3988
3989 to_reserve += calc_csum_metadata_size(inode, num_bytes); 4016 to_reserve += calc_csum_metadata_size(inode, num_bytes);
3990 ret = reserve_metadata_bytes(NULL, root, block_rsv, to_reserve, 1); 4017 ret = reserve_metadata_bytes(NULL, root, block_rsv, to_reserve, 1);
3991 if (ret) 4018 if (ret)
@@ -4012,6 +4039,7 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
4012 4039
4013 num_bytes = ALIGN(num_bytes, root->sectorsize); 4040 num_bytes = ALIGN(num_bytes, root->sectorsize);
4014 atomic_dec(&BTRFS_I(inode)->outstanding_extents); 4041 atomic_dec(&BTRFS_I(inode)->outstanding_extents);
4042 WARN_ON(atomic_read(&BTRFS_I(inode)->outstanding_extents) < 0);
4015 4043
4016 spin_lock(&BTRFS_I(inode)->accounting_lock); 4044 spin_lock(&BTRFS_I(inode)->accounting_lock);
4017 nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents); 4045 nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents);
@@ -4112,6 +4140,7 @@ static int update_block_group(struct btrfs_trans_handle *trans,
4112 btrfs_set_block_group_used(&cache->item, old_val); 4140 btrfs_set_block_group_used(&cache->item, old_val);
4113 cache->reserved -= num_bytes; 4141 cache->reserved -= num_bytes;
4114 cache->space_info->bytes_reserved -= num_bytes; 4142 cache->space_info->bytes_reserved -= num_bytes;
4143 cache->space_info->reservation_progress++;
4115 cache->space_info->bytes_used += num_bytes; 4144 cache->space_info->bytes_used += num_bytes;
4116 cache->space_info->disk_used += num_bytes * factor; 4145 cache->space_info->disk_used += num_bytes * factor;
4117 spin_unlock(&cache->lock); 4146 spin_unlock(&cache->lock);
@@ -4163,6 +4192,7 @@ static int pin_down_extent(struct btrfs_root *root,
4163 if (reserved) { 4192 if (reserved) {
4164 cache->reserved -= num_bytes; 4193 cache->reserved -= num_bytes;
4165 cache->space_info->bytes_reserved -= num_bytes; 4194 cache->space_info->bytes_reserved -= num_bytes;
4195 cache->space_info->reservation_progress++;
4166 } 4196 }
4167 spin_unlock(&cache->lock); 4197 spin_unlock(&cache->lock);
4168 spin_unlock(&cache->space_info->lock); 4198 spin_unlock(&cache->space_info->lock);
@@ -4213,6 +4243,7 @@ static int update_reserved_bytes(struct btrfs_block_group_cache *cache,
4213 space_info->bytes_readonly += num_bytes; 4243 space_info->bytes_readonly += num_bytes;
4214 cache->reserved -= num_bytes; 4244 cache->reserved -= num_bytes;
4215 space_info->bytes_reserved -= num_bytes; 4245 space_info->bytes_reserved -= num_bytes;
4246 space_info->reservation_progress++;
4216 } 4247 }
4217 spin_unlock(&cache->lock); 4248 spin_unlock(&cache->lock);
4218 spin_unlock(&space_info->lock); 4249 spin_unlock(&space_info->lock);
@@ -4691,6 +4722,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
4691 if (ret) { 4722 if (ret) {
4692 spin_lock(&cache->space_info->lock); 4723 spin_lock(&cache->space_info->lock);
4693 cache->space_info->bytes_reserved -= buf->len; 4724 cache->space_info->bytes_reserved -= buf->len;
4725 cache->space_info->reservation_progress++;
4694 spin_unlock(&cache->space_info->lock); 4726 spin_unlock(&cache->space_info->lock);
4695 } 4727 }
4696 goto out; 4728 goto out;
@@ -5355,7 +5387,7 @@ again:
5355 num_bytes, data, 1); 5387 num_bytes, data, 1);
5356 goto again; 5388 goto again;
5357 } 5389 }
5358 if (ret == -ENOSPC) { 5390 if (ret == -ENOSPC && btrfs_test_opt(root, ENOSPC_DEBUG)) {
5359 struct btrfs_space_info *sinfo; 5391 struct btrfs_space_info *sinfo;
5360 5392
5361 sinfo = __find_space_info(root->fs_info, data); 5393 sinfo = __find_space_info(root->fs_info, data);
@@ -5633,6 +5665,7 @@ use_block_rsv(struct btrfs_trans_handle *trans,
5633 struct btrfs_root *root, u32 blocksize) 5665 struct btrfs_root *root, u32 blocksize)
5634{ 5666{
5635 struct btrfs_block_rsv *block_rsv; 5667 struct btrfs_block_rsv *block_rsv;
5668 struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
5636 int ret; 5669 int ret;
5637 5670
5638 block_rsv = get_block_rsv(trans, root); 5671 block_rsv = get_block_rsv(trans, root);
@@ -5640,14 +5673,39 @@ use_block_rsv(struct btrfs_trans_handle *trans,
5640 if (block_rsv->size == 0) { 5673 if (block_rsv->size == 0) {
5641 ret = reserve_metadata_bytes(trans, root, block_rsv, 5674 ret = reserve_metadata_bytes(trans, root, block_rsv,
5642 blocksize, 0); 5675 blocksize, 0);
5643 if (ret) 5676 /*
5677 * If we couldn't reserve metadata bytes try and use some from
5678 * the global reserve.
5679 */
5680 if (ret && block_rsv != global_rsv) {
5681 ret = block_rsv_use_bytes(global_rsv, blocksize);
5682 if (!ret)
5683 return global_rsv;
5684 return ERR_PTR(ret);
5685 } else if (ret) {
5644 return ERR_PTR(ret); 5686 return ERR_PTR(ret);
5687 }
5645 return block_rsv; 5688 return block_rsv;
5646 } 5689 }
5647 5690
5648 ret = block_rsv_use_bytes(block_rsv, blocksize); 5691 ret = block_rsv_use_bytes(block_rsv, blocksize);
5649 if (!ret) 5692 if (!ret)
5650 return block_rsv; 5693 return block_rsv;
5694 if (ret) {
5695 WARN_ON(1);
5696 ret = reserve_metadata_bytes(trans, root, block_rsv, blocksize,
5697 0);
5698 if (!ret) {
5699 spin_lock(&block_rsv->lock);
5700 block_rsv->size += blocksize;
5701 spin_unlock(&block_rsv->lock);
5702 return block_rsv;
5703 } else if (ret && block_rsv != global_rsv) {
5704 ret = block_rsv_use_bytes(global_rsv, blocksize);
5705 if (!ret)
5706 return global_rsv;
5707 }
5708 }
5651 5709
5652 return ERR_PTR(-ENOSPC); 5710 return ERR_PTR(-ENOSPC);
5653} 5711}
@@ -6221,6 +6279,8 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
6221 BUG_ON(!wc); 6279 BUG_ON(!wc);
6222 6280
6223 trans = btrfs_start_transaction(tree_root, 0); 6281 trans = btrfs_start_transaction(tree_root, 0);
6282 BUG_ON(IS_ERR(trans));
6283
6224 if (block_rsv) 6284 if (block_rsv)
6225 trans->block_rsv = block_rsv; 6285 trans->block_rsv = block_rsv;
6226 6286
@@ -6318,6 +6378,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
6318 6378
6319 btrfs_end_transaction_throttle(trans, tree_root); 6379 btrfs_end_transaction_throttle(trans, tree_root);
6320 trans = btrfs_start_transaction(tree_root, 0); 6380 trans = btrfs_start_transaction(tree_root, 0);
6381 BUG_ON(IS_ERR(trans));
6321 if (block_rsv) 6382 if (block_rsv)
6322 trans->block_rsv = block_rsv; 6383 trans->block_rsv = block_rsv;
6323 } 6384 }
@@ -6446,6 +6507,8 @@ static noinline int relocate_inode_pages(struct inode *inode, u64 start,
6446 int ret = 0; 6507 int ret = 0;
6447 6508
6448 ra = kzalloc(sizeof(*ra), GFP_NOFS); 6509 ra = kzalloc(sizeof(*ra), GFP_NOFS);
6510 if (!ra)
6511 return -ENOMEM;
6449 6512
6450 mutex_lock(&inode->i_mutex); 6513 mutex_lock(&inode->i_mutex);
6451 first_index = start >> PAGE_CACHE_SHIFT; 6514 first_index = start >> PAGE_CACHE_SHIFT;
@@ -6531,7 +6594,7 @@ static noinline int relocate_data_extent(struct inode *reloc_inode,
6531 u64 end = start + extent_key->offset - 1; 6594 u64 end = start + extent_key->offset - 1;
6532 6595
6533 em = alloc_extent_map(GFP_NOFS); 6596 em = alloc_extent_map(GFP_NOFS);
6534 BUG_ON(!em || IS_ERR(em)); 6597 BUG_ON(!em);
6535 6598
6536 em->start = start; 6599 em->start = start;
6537 em->len = extent_key->offset; 6600 em->len = extent_key->offset;
@@ -7477,7 +7540,7 @@ int btrfs_drop_dead_reloc_roots(struct btrfs_root *root)
7477 BUG_ON(reloc_root->commit_root != NULL); 7540 BUG_ON(reloc_root->commit_root != NULL);
7478 while (1) { 7541 while (1) {
7479 trans = btrfs_join_transaction(root, 1); 7542 trans = btrfs_join_transaction(root, 1);
7480 BUG_ON(!trans); 7543 BUG_ON(IS_ERR(trans));
7481 7544
7482 mutex_lock(&root->fs_info->drop_mutex); 7545 mutex_lock(&root->fs_info->drop_mutex);
7483 ret = btrfs_drop_snapshot(trans, reloc_root); 7546 ret = btrfs_drop_snapshot(trans, reloc_root);
@@ -7535,7 +7598,7 @@ int btrfs_cleanup_reloc_trees(struct btrfs_root *root)
7535 7598
7536 if (found) { 7599 if (found) {
7537 trans = btrfs_start_transaction(root, 1); 7600 trans = btrfs_start_transaction(root, 1);
7538 BUG_ON(!trans); 7601 BUG_ON(IS_ERR(trans));
7539 ret = btrfs_commit_transaction(trans, root); 7602 ret = btrfs_commit_transaction(trans, root);
7540 BUG_ON(ret); 7603 BUG_ON(ret);
7541 } 7604 }
@@ -7779,7 +7842,7 @@ static noinline int relocate_one_extent(struct btrfs_root *extent_root,
7779 7842
7780 7843
7781 trans = btrfs_start_transaction(extent_root, 1); 7844 trans = btrfs_start_transaction(extent_root, 1);
7782 BUG_ON(!trans); 7845 BUG_ON(IS_ERR(trans));
7783 7846
7784 if (extent_key->objectid == 0) { 7847 if (extent_key->objectid == 0) {
7785 ret = del_extent_zero(trans, extent_root, path, extent_key); 7848 ret = del_extent_zero(trans, extent_root, path, extent_key);
@@ -8013,6 +8076,13 @@ out:
8013 return ret; 8076 return ret;
8014} 8077}
8015 8078
8079int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans,
8080 struct btrfs_root *root, u64 type)
8081{
8082 u64 alloc_flags = get_alloc_profile(root, type);
8083 return do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, 1);
8084}
8085
8016/* 8086/*
8017 * helper to account the unused space of all the readonly block group in the 8087 * helper to account the unused space of all the readonly block group in the
8018 * list. takes mirrors into account. 8088 * list. takes mirrors into account.
@@ -8270,6 +8340,13 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
8270 if (block_group->cached == BTRFS_CACHE_STARTED) 8340 if (block_group->cached == BTRFS_CACHE_STARTED)
8271 wait_block_group_cache_done(block_group); 8341 wait_block_group_cache_done(block_group);
8272 8342
8343 /*
8344 * We haven't cached this block group, which means we could
8345 * possibly have excluded extents on this block group.
8346 */
8347 if (block_group->cached == BTRFS_CACHE_NO)
8348 free_excluded_extents(info->extent_root, block_group);
8349
8273 btrfs_remove_free_space_cache(block_group); 8350 btrfs_remove_free_space_cache(block_group);
8274 btrfs_put_block_group(block_group); 8351 btrfs_put_block_group(block_group);
8275 8352
@@ -8385,6 +8462,13 @@ int btrfs_read_block_groups(struct btrfs_root *root)
8385 cache->sectorsize = root->sectorsize; 8462 cache->sectorsize = root->sectorsize;
8386 8463
8387 /* 8464 /*
8465 * We need to exclude the super stripes now so that the space
8466 * info has super bytes accounted for, otherwise we'll think
8467 * we have more space than we actually do.
8468 */
8469 exclude_super_stripes(root, cache);
8470
8471 /*
8388 * check for two cases, either we are full, and therefore 8472 * check for two cases, either we are full, and therefore
8389 * don't need to bother with the caching work since we won't 8473 * don't need to bother with the caching work since we won't
8390 * find any space, or we are empty, and we can just add all 8474 * find any space, or we are empty, and we can just add all
@@ -8392,12 +8476,10 @@ int btrfs_read_block_groups(struct btrfs_root *root)
8392 * time, particularly in the full case. 8476 * time, particularly in the full case.
8393 */ 8477 */
8394 if (found_key.offset == btrfs_block_group_used(&cache->item)) { 8478 if (found_key.offset == btrfs_block_group_used(&cache->item)) {
8395 exclude_super_stripes(root, cache);
8396 cache->last_byte_to_unpin = (u64)-1; 8479 cache->last_byte_to_unpin = (u64)-1;
8397 cache->cached = BTRFS_CACHE_FINISHED; 8480 cache->cached = BTRFS_CACHE_FINISHED;
8398 free_excluded_extents(root, cache); 8481 free_excluded_extents(root, cache);
8399 } else if (btrfs_block_group_used(&cache->item) == 0) { 8482 } else if (btrfs_block_group_used(&cache->item) == 0) {
8400 exclude_super_stripes(root, cache);
8401 cache->last_byte_to_unpin = (u64)-1; 8483 cache->last_byte_to_unpin = (u64)-1;
8402 cache->cached = BTRFS_CACHE_FINISHED; 8484 cache->cached = BTRFS_CACHE_FINISHED;
8403 add_new_free_space(cache, root->fs_info, 8485 add_new_free_space(cache, root->fs_info,
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 2e993cf1766e..b5b92824a271 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1433,12 +1433,13 @@ int extent_clear_unlock_delalloc(struct inode *inode,
1433 */ 1433 */
1434u64 count_range_bits(struct extent_io_tree *tree, 1434u64 count_range_bits(struct extent_io_tree *tree,
1435 u64 *start, u64 search_end, u64 max_bytes, 1435 u64 *start, u64 search_end, u64 max_bytes,
1436 unsigned long bits) 1436 unsigned long bits, int contig)
1437{ 1437{
1438 struct rb_node *node; 1438 struct rb_node *node;
1439 struct extent_state *state; 1439 struct extent_state *state;
1440 u64 cur_start = *start; 1440 u64 cur_start = *start;
1441 u64 total_bytes = 0; 1441 u64 total_bytes = 0;
1442 u64 last = 0;
1442 int found = 0; 1443 int found = 0;
1443 1444
1444 if (search_end <= cur_start) { 1445 if (search_end <= cur_start) {
@@ -1463,7 +1464,9 @@ u64 count_range_bits(struct extent_io_tree *tree,
1463 state = rb_entry(node, struct extent_state, rb_node); 1464 state = rb_entry(node, struct extent_state, rb_node);
1464 if (state->start > search_end) 1465 if (state->start > search_end)
1465 break; 1466 break;
1466 if (state->end >= cur_start && (state->state & bits)) { 1467 if (contig && found && state->start > last + 1)
1468 break;
1469 if (state->end >= cur_start && (state->state & bits) == bits) {
1467 total_bytes += min(search_end, state->end) + 1 - 1470 total_bytes += min(search_end, state->end) + 1 -
1468 max(cur_start, state->start); 1471 max(cur_start, state->start);
1469 if (total_bytes >= max_bytes) 1472 if (total_bytes >= max_bytes)
@@ -1472,6 +1475,9 @@ u64 count_range_bits(struct extent_io_tree *tree,
1472 *start = state->start; 1475 *start = state->start;
1473 found = 1; 1476 found = 1;
1474 } 1477 }
1478 last = state->end;
1479 } else if (contig && found) {
1480 break;
1475 } 1481 }
1476 node = rb_next(node); 1482 node = rb_next(node);
1477 if (!node) 1483 if (!node)
@@ -1865,7 +1871,7 @@ static int submit_one_bio(int rw, struct bio *bio, int mirror_num,
1865 bio_get(bio); 1871 bio_get(bio);
1866 1872
1867 if (tree->ops && tree->ops->submit_bio_hook) 1873 if (tree->ops && tree->ops->submit_bio_hook)
1868 tree->ops->submit_bio_hook(page->mapping->host, rw, bio, 1874 ret = tree->ops->submit_bio_hook(page->mapping->host, rw, bio,
1869 mirror_num, bio_flags, start); 1875 mirror_num, bio_flags, start);
1870 else 1876 else
1871 submit_bio(rw, bio); 1877 submit_bio(rw, bio);
@@ -1920,6 +1926,8 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree,
1920 nr = bio_get_nr_vecs(bdev); 1926 nr = bio_get_nr_vecs(bdev);
1921 1927
1922 bio = btrfs_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH); 1928 bio = btrfs_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH);
1929 if (!bio)
1930 return -ENOMEM;
1923 1931
1924 bio_add_page(bio, page, page_size, offset); 1932 bio_add_page(bio, page, page_size, offset);
1925 bio->bi_end_io = end_io_func; 1933 bio->bi_end_io = end_io_func;
@@ -1944,6 +1952,7 @@ void set_page_extent_mapped(struct page *page)
1944 1952
1945static void set_page_extent_head(struct page *page, unsigned long len) 1953static void set_page_extent_head(struct page *page, unsigned long len)
1946{ 1954{
1955 WARN_ON(!PagePrivate(page));
1947 set_page_private(page, EXTENT_PAGE_PRIVATE_FIRST_PAGE | len << 2); 1956 set_page_private(page, EXTENT_PAGE_PRIVATE_FIRST_PAGE | len << 2);
1948} 1957}
1949 1958
@@ -2126,7 +2135,7 @@ int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
2126 ret = __extent_read_full_page(tree, page, get_extent, &bio, 0, 2135 ret = __extent_read_full_page(tree, page, get_extent, &bio, 0,
2127 &bio_flags); 2136 &bio_flags);
2128 if (bio) 2137 if (bio)
2129 submit_one_bio(READ, bio, 0, bio_flags); 2138 ret = submit_one_bio(READ, bio, 0, bio_flags);
2130 return ret; 2139 return ret;
2131} 2140}
2132 2141
@@ -2179,7 +2188,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2179 unsigned long nr_written = 0; 2188 unsigned long nr_written = 0;
2180 2189
2181 if (wbc->sync_mode == WB_SYNC_ALL) 2190 if (wbc->sync_mode == WB_SYNC_ALL)
2182 write_flags = WRITE_SYNC_PLUG; 2191 write_flags = WRITE_SYNC;
2183 else 2192 else
2184 write_flags = WRITE; 2193 write_flags = WRITE;
2185 2194
@@ -2819,9 +2828,17 @@ int try_release_extent_state(struct extent_map_tree *map,
2819 * at this point we can safely clear everything except the 2828 * at this point we can safely clear everything except the
2820 * locked bit and the nodatasum bit 2829 * locked bit and the nodatasum bit
2821 */ 2830 */
2822 clear_extent_bit(tree, start, end, 2831 ret = clear_extent_bit(tree, start, end,
2823 ~(EXTENT_LOCKED | EXTENT_NODATASUM), 2832 ~(EXTENT_LOCKED | EXTENT_NODATASUM),
2824 0, 0, NULL, mask); 2833 0, 0, NULL, mask);
2834
2835 /* if clear_extent_bit failed for enomem reasons,
2836 * we can't allow the release to continue.
2837 */
2838 if (ret < 0)
2839 ret = 0;
2840 else
2841 ret = 1;
2825 } 2842 }
2826 return ret; 2843 return ret;
2827} 2844}
@@ -2901,6 +2918,46 @@ out:
2901 return sector; 2918 return sector;
2902} 2919}
2903 2920
2921/*
2922 * helper function for fiemap, which doesn't want to see any holes.
2923 * This maps until we find something past 'last'
2924 */
2925static struct extent_map *get_extent_skip_holes(struct inode *inode,
2926 u64 offset,
2927 u64 last,
2928 get_extent_t *get_extent)
2929{
2930 u64 sectorsize = BTRFS_I(inode)->root->sectorsize;
2931 struct extent_map *em;
2932 u64 len;
2933
2934 if (offset >= last)
2935 return NULL;
2936
2937 while(1) {
2938 len = last - offset;
2939 if (len == 0)
2940 break;
2941 len = (len + sectorsize - 1) & ~(sectorsize - 1);
2942 em = get_extent(inode, NULL, 0, offset, len, 0);
2943 if (!em || IS_ERR(em))
2944 return em;
2945
2946 /* if this isn't a hole return it */
2947 if (!test_bit(EXTENT_FLAG_VACANCY, &em->flags) &&
2948 em->block_start != EXTENT_MAP_HOLE) {
2949 return em;
2950 }
2951
2952 /* this is a hole, advance to the next extent */
2953 offset = extent_map_end(em);
2954 free_extent_map(em);
2955 if (offset >= last)
2956 break;
2957 }
2958 return NULL;
2959}
2960
2904int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 2961int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2905 __u64 start, __u64 len, get_extent_t *get_extent) 2962 __u64 start, __u64 len, get_extent_t *get_extent)
2906{ 2963{
@@ -2910,16 +2967,19 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2910 u32 flags = 0; 2967 u32 flags = 0;
2911 u32 found_type; 2968 u32 found_type;
2912 u64 last; 2969 u64 last;
2970 u64 last_for_get_extent = 0;
2913 u64 disko = 0; 2971 u64 disko = 0;
2972 u64 isize = i_size_read(inode);
2914 struct btrfs_key found_key; 2973 struct btrfs_key found_key;
2915 struct extent_map *em = NULL; 2974 struct extent_map *em = NULL;
2916 struct extent_state *cached_state = NULL; 2975 struct extent_state *cached_state = NULL;
2917 struct btrfs_path *path; 2976 struct btrfs_path *path;
2918 struct btrfs_file_extent_item *item; 2977 struct btrfs_file_extent_item *item;
2919 int end = 0; 2978 int end = 0;
2920 u64 em_start = 0, em_len = 0; 2979 u64 em_start = 0;
2980 u64 em_len = 0;
2981 u64 em_end = 0;
2921 unsigned long emflags; 2982 unsigned long emflags;
2922 int hole = 0;
2923 2983
2924 if (len == 0) 2984 if (len == 0)
2925 return -EINVAL; 2985 return -EINVAL;
@@ -2929,6 +2989,10 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2929 return -ENOMEM; 2989 return -ENOMEM;
2930 path->leave_spinning = 1; 2990 path->leave_spinning = 1;
2931 2991
2992 /*
2993 * lookup the last file extent. We're not using i_size here
2994 * because there might be preallocation past i_size
2995 */
2932 ret = btrfs_lookup_file_extent(NULL, BTRFS_I(inode)->root, 2996 ret = btrfs_lookup_file_extent(NULL, BTRFS_I(inode)->root,
2933 path, inode->i_ino, -1, 0); 2997 path, inode->i_ino, -1, 0);
2934 if (ret < 0) { 2998 if (ret < 0) {
@@ -2942,18 +3006,38 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2942 btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); 3006 btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
2943 found_type = btrfs_key_type(&found_key); 3007 found_type = btrfs_key_type(&found_key);
2944 3008
2945 /* No extents, just return */ 3009 /* No extents, but there might be delalloc bits */
2946 if (found_key.objectid != inode->i_ino || 3010 if (found_key.objectid != inode->i_ino ||
2947 found_type != BTRFS_EXTENT_DATA_KEY) { 3011 found_type != BTRFS_EXTENT_DATA_KEY) {
2948 btrfs_free_path(path); 3012 /* have to trust i_size as the end */
2949 return 0; 3013 last = (u64)-1;
3014 last_for_get_extent = isize;
3015 } else {
3016 /*
3017 * remember the start of the last extent. There are a
3018 * bunch of different factors that go into the length of the
3019 * extent, so its much less complex to remember where it started
3020 */
3021 last = found_key.offset;
3022 last_for_get_extent = last + 1;
2950 } 3023 }
2951 last = found_key.offset;
2952 btrfs_free_path(path); 3024 btrfs_free_path(path);
2953 3025
3026 /*
3027 * we might have some extents allocated but more delalloc past those
3028 * extents. so, we trust isize unless the start of the last extent is
3029 * beyond isize
3030 */
3031 if (last < isize) {
3032 last = (u64)-1;
3033 last_for_get_extent = isize;
3034 }
3035
2954 lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0, 3036 lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0,
2955 &cached_state, GFP_NOFS); 3037 &cached_state, GFP_NOFS);
2956 em = get_extent(inode, NULL, 0, off, max - off, 0); 3038
3039 em = get_extent_skip_holes(inode, off, last_for_get_extent,
3040 get_extent);
2957 if (!em) 3041 if (!em)
2958 goto out; 3042 goto out;
2959 if (IS_ERR(em)) { 3043 if (IS_ERR(em)) {
@@ -2962,22 +3046,38 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2962 } 3046 }
2963 3047
2964 while (!end) { 3048 while (!end) {
2965 hole = 0; 3049 u64 offset_in_extent;
2966 off = em->start + em->len;
2967 if (off >= max)
2968 end = 1;
2969 3050
2970 if (em->block_start == EXTENT_MAP_HOLE) { 3051 /* break if the extent we found is outside the range */
2971 hole = 1; 3052 if (em->start >= max || extent_map_end(em) < off)
2972 goto next; 3053 break;
2973 }
2974 3054
2975 em_start = em->start; 3055 /*
2976 em_len = em->len; 3056 * get_extent may return an extent that starts before our
3057 * requested range. We have to make sure the ranges
3058 * we return to fiemap always move forward and don't
3059 * overlap, so adjust the offsets here
3060 */
3061 em_start = max(em->start, off);
2977 3062
3063 /*
3064 * record the offset from the start of the extent
3065 * for adjusting the disk offset below
3066 */
3067 offset_in_extent = em_start - em->start;
3068 em_end = extent_map_end(em);
3069 em_len = em_end - em_start;
3070 emflags = em->flags;
2978 disko = 0; 3071 disko = 0;
2979 flags = 0; 3072 flags = 0;
2980 3073
3074 /*
3075 * bump off for our next call to get_extent
3076 */
3077 off = extent_map_end(em);
3078 if (off >= max)
3079 end = 1;
3080
2981 if (em->block_start == EXTENT_MAP_LAST_BYTE) { 3081 if (em->block_start == EXTENT_MAP_LAST_BYTE) {
2982 end = 1; 3082 end = 1;
2983 flags |= FIEMAP_EXTENT_LAST; 3083 flags |= FIEMAP_EXTENT_LAST;
@@ -2988,42 +3088,34 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2988 flags |= (FIEMAP_EXTENT_DELALLOC | 3088 flags |= (FIEMAP_EXTENT_DELALLOC |
2989 FIEMAP_EXTENT_UNKNOWN); 3089 FIEMAP_EXTENT_UNKNOWN);
2990 } else { 3090 } else {
2991 disko = em->block_start; 3091 disko = em->block_start + offset_in_extent;
2992 } 3092 }
2993 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) 3093 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
2994 flags |= FIEMAP_EXTENT_ENCODED; 3094 flags |= FIEMAP_EXTENT_ENCODED;
2995 3095
2996next:
2997 emflags = em->flags;
2998 free_extent_map(em); 3096 free_extent_map(em);
2999 em = NULL; 3097 em = NULL;
3000 if (!end) { 3098 if ((em_start >= last) || em_len == (u64)-1 ||
3001 em = get_extent(inode, NULL, 0, off, max - off, 0); 3099 (last == (u64)-1 && isize <= em_end)) {
3002 if (!em)
3003 goto out;
3004 if (IS_ERR(em)) {
3005 ret = PTR_ERR(em);
3006 goto out;
3007 }
3008 emflags = em->flags;
3009 }
3010
3011 if (test_bit(EXTENT_FLAG_VACANCY, &emflags)) {
3012 flags |= FIEMAP_EXTENT_LAST; 3100 flags |= FIEMAP_EXTENT_LAST;
3013 end = 1; 3101 end = 1;
3014 } 3102 }
3015 3103
3016 if (em_start == last) { 3104 /* now scan forward to see if this is really the last extent. */
3105 em = get_extent_skip_holes(inode, off, last_for_get_extent,
3106 get_extent);
3107 if (IS_ERR(em)) {
3108 ret = PTR_ERR(em);
3109 goto out;
3110 }
3111 if (!em) {
3017 flags |= FIEMAP_EXTENT_LAST; 3112 flags |= FIEMAP_EXTENT_LAST;
3018 end = 1; 3113 end = 1;
3019 } 3114 }
3020 3115 ret = fiemap_fill_next_extent(fieinfo, em_start, disko,
3021 if (!hole) { 3116 em_len, flags);
3022 ret = fiemap_fill_next_extent(fieinfo, em_start, disko, 3117 if (ret)
3023 em_len, flags); 3118 goto out_free;
3024 if (ret)
3025 goto out_free;
3026 }
3027 } 3119 }
3028out_free: 3120out_free:
3029 free_extent_map(em); 3121 free_extent_map(em);
@@ -3192,7 +3284,13 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
3192 } 3284 }
3193 if (!PageUptodate(p)) 3285 if (!PageUptodate(p))
3194 uptodate = 0; 3286 uptodate = 0;
3195 unlock_page(p); 3287
3288 /*
3289 * see below about how we avoid a nasty race with release page
3290 * and why we unlock later
3291 */
3292 if (i != 0)
3293 unlock_page(p);
3196 } 3294 }
3197 if (uptodate) 3295 if (uptodate)
3198 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 3296 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
@@ -3216,9 +3314,26 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
3216 atomic_inc(&eb->refs); 3314 atomic_inc(&eb->refs);
3217 spin_unlock(&tree->buffer_lock); 3315 spin_unlock(&tree->buffer_lock);
3218 radix_tree_preload_end(); 3316 radix_tree_preload_end();
3317
3318 /*
3319 * there is a race where release page may have
3320 * tried to find this extent buffer in the radix
3321 * but failed. It will tell the VM it is safe to
3322 * reclaim the, and it will clear the page private bit.
3323 * We must make sure to set the page private bit properly
3324 * after the extent buffer is in the radix tree so
3325 * it doesn't get lost
3326 */
3327 set_page_extent_mapped(eb->first_page);
3328 set_page_extent_head(eb->first_page, eb->len);
3329 if (!page0)
3330 unlock_page(eb->first_page);
3219 return eb; 3331 return eb;
3220 3332
3221free_eb: 3333free_eb:
3334 if (eb->first_page && !page0)
3335 unlock_page(eb->first_page);
3336
3222 if (!atomic_dec_and_test(&eb->refs)) 3337 if (!atomic_dec_and_test(&eb->refs))
3223 return exists; 3338 return exists;
3224 btrfs_release_extent_buffer(eb); 3339 btrfs_release_extent_buffer(eb);
@@ -3269,10 +3384,11 @@ int clear_extent_buffer_dirty(struct extent_io_tree *tree,
3269 continue; 3384 continue;
3270 3385
3271 lock_page(page); 3386 lock_page(page);
3387 WARN_ON(!PagePrivate(page));
3388
3389 set_page_extent_mapped(page);
3272 if (i == 0) 3390 if (i == 0)
3273 set_page_extent_head(page, eb->len); 3391 set_page_extent_head(page, eb->len);
3274 else
3275 set_page_private(page, EXTENT_PAGE_PRIVATE);
3276 3392
3277 clear_page_dirty_for_io(page); 3393 clear_page_dirty_for_io(page);
3278 spin_lock_irq(&page->mapping->tree_lock); 3394 spin_lock_irq(&page->mapping->tree_lock);
@@ -3462,6 +3578,13 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
3462 3578
3463 for (i = start_i; i < num_pages; i++) { 3579 for (i = start_i; i < num_pages; i++) {
3464 page = extent_buffer_page(eb, i); 3580 page = extent_buffer_page(eb, i);
3581
3582 WARN_ON(!PagePrivate(page));
3583
3584 set_page_extent_mapped(page);
3585 if (i == 0)
3586 set_page_extent_head(page, eb->len);
3587
3465 if (inc_all_pages) 3588 if (inc_all_pages)
3466 page_cache_get(page); 3589 page_cache_get(page);
3467 if (!PageUptodate(page)) { 3590 if (!PageUptodate(page)) {
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 7083cfafd061..9318dfefd59c 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -191,7 +191,7 @@ void extent_io_exit(void);
191 191
192u64 count_range_bits(struct extent_io_tree *tree, 192u64 count_range_bits(struct extent_io_tree *tree,
193 u64 *start, u64 search_end, 193 u64 *start, u64 search_end,
194 u64 max_bytes, unsigned long bits); 194 u64 max_bytes, unsigned long bits, int contig);
195 195
196void free_extent_state(struct extent_state *state); 196void free_extent_state(struct extent_state *state);
197int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, 197int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index b0e1fce12530..2b6c12e983b3 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -51,8 +51,8 @@ struct extent_map *alloc_extent_map(gfp_t mask)
51{ 51{
52 struct extent_map *em; 52 struct extent_map *em;
53 em = kmem_cache_alloc(extent_map_cache, mask); 53 em = kmem_cache_alloc(extent_map_cache, mask);
54 if (!em || IS_ERR(em)) 54 if (!em)
55 return em; 55 return NULL;
56 em->in_tree = 0; 56 em->in_tree = 0;
57 em->flags = 0; 57 em->flags = 0;
58 em->compress_type = BTRFS_COMPRESS_NONE; 58 em->compress_type = BTRFS_COMPRESS_NONE;
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index a562a250ae77..4f19a3e1bf32 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -536,6 +536,8 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
536 root = root->fs_info->csum_root; 536 root = root->fs_info->csum_root;
537 537
538 path = btrfs_alloc_path(); 538 path = btrfs_alloc_path();
539 if (!path)
540 return -ENOMEM;
539 541
540 while (1) { 542 while (1) {
541 key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; 543 key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
@@ -548,7 +550,10 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
548 if (path->slots[0] == 0) 550 if (path->slots[0] == 0)
549 goto out; 551 goto out;
550 path->slots[0]--; 552 path->slots[0]--;
553 } else if (ret < 0) {
554 goto out;
551 } 555 }
556
552 leaf = path->nodes[0]; 557 leaf = path->nodes[0];
553 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 558 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
554 559
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index c800d58f3013..f447b783bb84 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -70,6 +70,19 @@ static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
70 70
71 /* Flush processor's dcache for this page */ 71 /* Flush processor's dcache for this page */
72 flush_dcache_page(page); 72 flush_dcache_page(page);
73
74 /*
75 * if we get a partial write, we can end up with
76 * partially up to date pages. These add
77 * a lot of complexity, so make sure they don't
78 * happen by forcing this copy to be retried.
79 *
80 * The rest of the btrfs_file_write code will fall
81 * back to page at a time copies after we return 0.
82 */
83 if (!PageUptodate(page) && copied < count)
84 copied = 0;
85
73 iov_iter_advance(i, copied); 86 iov_iter_advance(i, copied);
74 write_bytes -= copied; 87 write_bytes -= copied;
75 total_copied += copied; 88 total_copied += copied;
@@ -186,6 +199,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
186 split = alloc_extent_map(GFP_NOFS); 199 split = alloc_extent_map(GFP_NOFS);
187 if (!split2) 200 if (!split2)
188 split2 = alloc_extent_map(GFP_NOFS); 201 split2 = alloc_extent_map(GFP_NOFS);
202 BUG_ON(!split || !split2);
189 203
190 write_lock(&em_tree->lock); 204 write_lock(&em_tree->lock);
191 em = lookup_extent_mapping(em_tree, start, len); 205 em = lookup_extent_mapping(em_tree, start, len);
@@ -762,6 +776,27 @@ out:
762} 776}
763 777
764/* 778/*
779 * on error we return an unlocked page and the error value
780 * on success we return a locked page and 0
781 */
782static int prepare_uptodate_page(struct page *page, u64 pos)
783{
784 int ret = 0;
785
786 if ((pos & (PAGE_CACHE_SIZE - 1)) && !PageUptodate(page)) {
787 ret = btrfs_readpage(NULL, page);
788 if (ret)
789 return ret;
790 lock_page(page);
791 if (!PageUptodate(page)) {
792 unlock_page(page);
793 return -EIO;
794 }
795 }
796 return 0;
797}
798
799/*
765 * this gets pages into the page cache and locks them down, it also properly 800 * this gets pages into the page cache and locks them down, it also properly
766 * waits for data=ordered extents to finish before allowing the pages to be 801 * waits for data=ordered extents to finish before allowing the pages to be
767 * modified. 802 * modified.
@@ -776,6 +811,7 @@ static noinline int prepare_pages(struct btrfs_root *root, struct file *file,
776 unsigned long index = pos >> PAGE_CACHE_SHIFT; 811 unsigned long index = pos >> PAGE_CACHE_SHIFT;
777 struct inode *inode = fdentry(file)->d_inode; 812 struct inode *inode = fdentry(file)->d_inode;
778 int err = 0; 813 int err = 0;
814 int faili = 0;
779 u64 start_pos; 815 u64 start_pos;
780 u64 last_pos; 816 u64 last_pos;
781 817
@@ -793,11 +829,24 @@ again:
793 for (i = 0; i < num_pages; i++) { 829 for (i = 0; i < num_pages; i++) {
794 pages[i] = grab_cache_page(inode->i_mapping, index + i); 830 pages[i] = grab_cache_page(inode->i_mapping, index + i);
795 if (!pages[i]) { 831 if (!pages[i]) {
832 faili = i - 1;
796 err = -ENOMEM; 833 err = -ENOMEM;
797 BUG_ON(1); 834 goto fail;
835 }
836
837 if (i == 0)
838 err = prepare_uptodate_page(pages[i], pos);
839 if (i == num_pages - 1)
840 err = prepare_uptodate_page(pages[i],
841 pos + write_bytes);
842 if (err) {
843 page_cache_release(pages[i]);
844 faili = i - 1;
845 goto fail;
798 } 846 }
799 wait_on_page_writeback(pages[i]); 847 wait_on_page_writeback(pages[i]);
800 } 848 }
849 err = 0;
801 if (start_pos < inode->i_size) { 850 if (start_pos < inode->i_size) {
802 struct btrfs_ordered_extent *ordered; 851 struct btrfs_ordered_extent *ordered;
803 lock_extent_bits(&BTRFS_I(inode)->io_tree, 852 lock_extent_bits(&BTRFS_I(inode)->io_tree,
@@ -837,6 +886,14 @@ again:
837 WARN_ON(!PageLocked(pages[i])); 886 WARN_ON(!PageLocked(pages[i]));
838 } 887 }
839 return 0; 888 return 0;
889fail:
890 while (faili >= 0) {
891 unlock_page(pages[faili]);
892 page_cache_release(pages[faili]);
893 faili--;
894 }
895 return err;
896
840} 897}
841 898
842static ssize_t btrfs_file_aio_write(struct kiocb *iocb, 899static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
@@ -846,7 +903,6 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
846 struct file *file = iocb->ki_filp; 903 struct file *file = iocb->ki_filp;
847 struct inode *inode = fdentry(file)->d_inode; 904 struct inode *inode = fdentry(file)->d_inode;
848 struct btrfs_root *root = BTRFS_I(inode)->root; 905 struct btrfs_root *root = BTRFS_I(inode)->root;
849 struct page *pinned[2];
850 struct page **pages = NULL; 906 struct page **pages = NULL;
851 struct iov_iter i; 907 struct iov_iter i;
852 loff_t *ppos = &iocb->ki_pos; 908 loff_t *ppos = &iocb->ki_pos;
@@ -867,9 +923,6 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
867 will_write = ((file->f_flags & O_DSYNC) || IS_SYNC(inode) || 923 will_write = ((file->f_flags & O_DSYNC) || IS_SYNC(inode) ||
868 (file->f_flags & O_DIRECT)); 924 (file->f_flags & O_DIRECT));
869 925
870 pinned[0] = NULL;
871 pinned[1] = NULL;
872
873 start_pos = pos; 926 start_pos = pos;
874 927
875 vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); 928 vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
@@ -946,6 +999,10 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
946 PAGE_CACHE_SIZE, PAGE_CACHE_SIZE / 999 PAGE_CACHE_SIZE, PAGE_CACHE_SIZE /
947 (sizeof(struct page *))); 1000 (sizeof(struct page *)));
948 pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL); 1001 pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL);
1002 if (!pages) {
1003 ret = -ENOMEM;
1004 goto out;
1005 }
949 1006
950 /* generic_write_checks can change our pos */ 1007 /* generic_write_checks can change our pos */
951 start_pos = pos; 1008 start_pos = pos;
@@ -953,39 +1010,13 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
953 first_index = pos >> PAGE_CACHE_SHIFT; 1010 first_index = pos >> PAGE_CACHE_SHIFT;
954 last_index = (pos + iov_iter_count(&i)) >> PAGE_CACHE_SHIFT; 1011 last_index = (pos + iov_iter_count(&i)) >> PAGE_CACHE_SHIFT;
955 1012
956 /*
957 * there are lots of better ways to do this, but this code
958 * makes sure the first and last page in the file range are
959 * up to date and ready for cow
960 */
961 if ((pos & (PAGE_CACHE_SIZE - 1))) {
962 pinned[0] = grab_cache_page(inode->i_mapping, first_index);
963 if (!PageUptodate(pinned[0])) {
964 ret = btrfs_readpage(NULL, pinned[0]);
965 BUG_ON(ret);
966 wait_on_page_locked(pinned[0]);
967 } else {
968 unlock_page(pinned[0]);
969 }
970 }
971 if ((pos + iov_iter_count(&i)) & (PAGE_CACHE_SIZE - 1)) {
972 pinned[1] = grab_cache_page(inode->i_mapping, last_index);
973 if (!PageUptodate(pinned[1])) {
974 ret = btrfs_readpage(NULL, pinned[1]);
975 BUG_ON(ret);
976 wait_on_page_locked(pinned[1]);
977 } else {
978 unlock_page(pinned[1]);
979 }
980 }
981
982 while (iov_iter_count(&i) > 0) { 1013 while (iov_iter_count(&i) > 0) {
983 size_t offset = pos & (PAGE_CACHE_SIZE - 1); 1014 size_t offset = pos & (PAGE_CACHE_SIZE - 1);
984 size_t write_bytes = min(iov_iter_count(&i), 1015 size_t write_bytes = min(iov_iter_count(&i),
985 nrptrs * (size_t)PAGE_CACHE_SIZE - 1016 nrptrs * (size_t)PAGE_CACHE_SIZE -
986 offset); 1017 offset);
987 size_t num_pages = (write_bytes + PAGE_CACHE_SIZE - 1) >> 1018 size_t num_pages = (write_bytes + offset +
988 PAGE_CACHE_SHIFT; 1019 PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
989 1020
990 WARN_ON(num_pages > nrptrs); 1021 WARN_ON(num_pages > nrptrs);
991 memset(pages, 0, sizeof(struct page *) * nrptrs); 1022 memset(pages, 0, sizeof(struct page *) * nrptrs);
@@ -1015,8 +1046,20 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
1015 1046
1016 copied = btrfs_copy_from_user(pos, num_pages, 1047 copied = btrfs_copy_from_user(pos, num_pages,
1017 write_bytes, pages, &i); 1048 write_bytes, pages, &i);
1018 dirty_pages = (copied + PAGE_CACHE_SIZE - 1) >> 1049
1019 PAGE_CACHE_SHIFT; 1050 /*
1051 * if we have trouble faulting in the pages, fall
1052 * back to one page at a time
1053 */
1054 if (copied < write_bytes)
1055 nrptrs = 1;
1056
1057 if (copied == 0)
1058 dirty_pages = 0;
1059 else
1060 dirty_pages = (copied + offset +
1061 PAGE_CACHE_SIZE - 1) >>
1062 PAGE_CACHE_SHIFT;
1020 1063
1021 if (num_pages > dirty_pages) { 1064 if (num_pages > dirty_pages) {
1022 if (copied > 0) 1065 if (copied > 0)
@@ -1060,10 +1103,6 @@ out:
1060 err = ret; 1103 err = ret;
1061 1104
1062 kfree(pages); 1105 kfree(pages);
1063 if (pinned[0])
1064 page_cache_release(pinned[0]);
1065 if (pinned[1])
1066 page_cache_release(pinned[1]);
1067 *ppos = pos; 1106 *ppos = pos;
1068 1107
1069 /* 1108 /*
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 60d684266959..a0390657451b 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -987,11 +987,18 @@ tree_search_offset(struct btrfs_block_group_cache *block_group,
987 return entry; 987 return entry;
988} 988}
989 989
990static void unlink_free_space(struct btrfs_block_group_cache *block_group, 990static inline void
991 struct btrfs_free_space *info) 991__unlink_free_space(struct btrfs_block_group_cache *block_group,
992 struct btrfs_free_space *info)
992{ 993{
993 rb_erase(&info->offset_index, &block_group->free_space_offset); 994 rb_erase(&info->offset_index, &block_group->free_space_offset);
994 block_group->free_extents--; 995 block_group->free_extents--;
996}
997
998static void unlink_free_space(struct btrfs_block_group_cache *block_group,
999 struct btrfs_free_space *info)
1000{
1001 __unlink_free_space(block_group, info);
995 block_group->free_space -= info->bytes; 1002 block_group->free_space -= info->bytes;
996} 1003}
997 1004
@@ -1016,14 +1023,18 @@ static void recalculate_thresholds(struct btrfs_block_group_cache *block_group)
1016 u64 max_bytes; 1023 u64 max_bytes;
1017 u64 bitmap_bytes; 1024 u64 bitmap_bytes;
1018 u64 extent_bytes; 1025 u64 extent_bytes;
1026 u64 size = block_group->key.offset;
1019 1027
1020 /* 1028 /*
1021 * The goal is to keep the total amount of memory used per 1gb of space 1029 * The goal is to keep the total amount of memory used per 1gb of space
1022 * at or below 32k, so we need to adjust how much memory we allow to be 1030 * at or below 32k, so we need to adjust how much memory we allow to be
1023 * used by extent based free space tracking 1031 * used by extent based free space tracking
1024 */ 1032 */
1025 max_bytes = MAX_CACHE_BYTES_PER_GIG * 1033 if (size < 1024 * 1024 * 1024)
1026 (div64_u64(block_group->key.offset, 1024 * 1024 * 1024)); 1034 max_bytes = MAX_CACHE_BYTES_PER_GIG;
1035 else
1036 max_bytes = MAX_CACHE_BYTES_PER_GIG *
1037 div64_u64(size, 1024 * 1024 * 1024);
1027 1038
1028 /* 1039 /*
1029 * we want to account for 1 more bitmap than what we have so we can make 1040 * we want to account for 1 more bitmap than what we have so we can make
@@ -1171,6 +1182,16 @@ static void add_new_bitmap(struct btrfs_block_group_cache *block_group,
1171 recalculate_thresholds(block_group); 1182 recalculate_thresholds(block_group);
1172} 1183}
1173 1184
1185static void free_bitmap(struct btrfs_block_group_cache *block_group,
1186 struct btrfs_free_space *bitmap_info)
1187{
1188 unlink_free_space(block_group, bitmap_info);
1189 kfree(bitmap_info->bitmap);
1190 kfree(bitmap_info);
1191 block_group->total_bitmaps--;
1192 recalculate_thresholds(block_group);
1193}
1194
1174static noinline int remove_from_bitmap(struct btrfs_block_group_cache *block_group, 1195static noinline int remove_from_bitmap(struct btrfs_block_group_cache *block_group,
1175 struct btrfs_free_space *bitmap_info, 1196 struct btrfs_free_space *bitmap_info,
1176 u64 *offset, u64 *bytes) 1197 u64 *offset, u64 *bytes)
@@ -1195,6 +1216,7 @@ again:
1195 */ 1216 */
1196 search_start = *offset; 1217 search_start = *offset;
1197 search_bytes = *bytes; 1218 search_bytes = *bytes;
1219 search_bytes = min(search_bytes, end - search_start + 1);
1198 ret = search_bitmap(block_group, bitmap_info, &search_start, 1220 ret = search_bitmap(block_group, bitmap_info, &search_start,
1199 &search_bytes); 1221 &search_bytes);
1200 BUG_ON(ret < 0 || search_start != *offset); 1222 BUG_ON(ret < 0 || search_start != *offset);
@@ -1211,13 +1233,8 @@ again:
1211 1233
1212 if (*bytes) { 1234 if (*bytes) {
1213 struct rb_node *next = rb_next(&bitmap_info->offset_index); 1235 struct rb_node *next = rb_next(&bitmap_info->offset_index);
1214 if (!bitmap_info->bytes) { 1236 if (!bitmap_info->bytes)
1215 unlink_free_space(block_group, bitmap_info); 1237 free_bitmap(block_group, bitmap_info);
1216 kfree(bitmap_info->bitmap);
1217 kfree(bitmap_info);
1218 block_group->total_bitmaps--;
1219 recalculate_thresholds(block_group);
1220 }
1221 1238
1222 /* 1239 /*
1223 * no entry after this bitmap, but we still have bytes to 1240 * no entry after this bitmap, but we still have bytes to
@@ -1250,13 +1267,8 @@ again:
1250 return -EAGAIN; 1267 return -EAGAIN;
1251 1268
1252 goto again; 1269 goto again;
1253 } else if (!bitmap_info->bytes) { 1270 } else if (!bitmap_info->bytes)
1254 unlink_free_space(block_group, bitmap_info); 1271 free_bitmap(block_group, bitmap_info);
1255 kfree(bitmap_info->bitmap);
1256 kfree(bitmap_info);
1257 block_group->total_bitmaps--;
1258 recalculate_thresholds(block_group);
1259 }
1260 1272
1261 return 0; 1273 return 0;
1262} 1274}
@@ -1359,22 +1371,14 @@ out:
1359 return ret; 1371 return ret;
1360} 1372}
1361 1373
1362int btrfs_add_free_space(struct btrfs_block_group_cache *block_group, 1374bool try_merge_free_space(struct btrfs_block_group_cache *block_group,
1363 u64 offset, u64 bytes) 1375 struct btrfs_free_space *info, bool update_stat)
1364{ 1376{
1365 struct btrfs_free_space *right_info = NULL; 1377 struct btrfs_free_space *left_info;
1366 struct btrfs_free_space *left_info = NULL; 1378 struct btrfs_free_space *right_info;
1367 struct btrfs_free_space *info = NULL; 1379 bool merged = false;
1368 int ret = 0; 1380 u64 offset = info->offset;
1369 1381 u64 bytes = info->bytes;
1370 info = kzalloc(sizeof(struct btrfs_free_space), GFP_NOFS);
1371 if (!info)
1372 return -ENOMEM;
1373
1374 info->offset = offset;
1375 info->bytes = bytes;
1376
1377 spin_lock(&block_group->tree_lock);
1378 1382
1379 /* 1383 /*
1380 * first we want to see if there is free space adjacent to the range we 1384 * first we want to see if there is free space adjacent to the range we
@@ -1388,37 +1392,62 @@ int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
1388 else 1392 else
1389 left_info = tree_search_offset(block_group, offset - 1, 0, 0); 1393 left_info = tree_search_offset(block_group, offset - 1, 0, 0);
1390 1394
1391 /*
1392 * If there was no extent directly to the left or right of this new
1393 * extent then we know we're going to have to allocate a new extent, so
1394 * before we do that see if we need to drop this into a bitmap
1395 */
1396 if ((!left_info || left_info->bitmap) &&
1397 (!right_info || right_info->bitmap)) {
1398 ret = insert_into_bitmap(block_group, info);
1399
1400 if (ret < 0) {
1401 goto out;
1402 } else if (ret) {
1403 ret = 0;
1404 goto out;
1405 }
1406 }
1407
1408 if (right_info && !right_info->bitmap) { 1395 if (right_info && !right_info->bitmap) {
1409 unlink_free_space(block_group, right_info); 1396 if (update_stat)
1397 unlink_free_space(block_group, right_info);
1398 else
1399 __unlink_free_space(block_group, right_info);
1410 info->bytes += right_info->bytes; 1400 info->bytes += right_info->bytes;
1411 kfree(right_info); 1401 kfree(right_info);
1402 merged = true;
1412 } 1403 }
1413 1404
1414 if (left_info && !left_info->bitmap && 1405 if (left_info && !left_info->bitmap &&
1415 left_info->offset + left_info->bytes == offset) { 1406 left_info->offset + left_info->bytes == offset) {
1416 unlink_free_space(block_group, left_info); 1407 if (update_stat)
1408 unlink_free_space(block_group, left_info);
1409 else
1410 __unlink_free_space(block_group, left_info);
1417 info->offset = left_info->offset; 1411 info->offset = left_info->offset;
1418 info->bytes += left_info->bytes; 1412 info->bytes += left_info->bytes;
1419 kfree(left_info); 1413 kfree(left_info);
1414 merged = true;
1420 } 1415 }
1421 1416
1417 return merged;
1418}
1419
1420int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
1421 u64 offset, u64 bytes)
1422{
1423 struct btrfs_free_space *info;
1424 int ret = 0;
1425
1426 info = kzalloc(sizeof(struct btrfs_free_space), GFP_NOFS);
1427 if (!info)
1428 return -ENOMEM;
1429
1430 info->offset = offset;
1431 info->bytes = bytes;
1432
1433 spin_lock(&block_group->tree_lock);
1434
1435 if (try_merge_free_space(block_group, info, true))
1436 goto link;
1437
1438 /*
1439 * There was no extent directly to the left or right of this new
1440 * extent then we know we're going to have to allocate a new extent, so
1441 * before we do that see if we need to drop this into a bitmap
1442 */
1443 ret = insert_into_bitmap(block_group, info);
1444 if (ret < 0) {
1445 goto out;
1446 } else if (ret) {
1447 ret = 0;
1448 goto out;
1449 }
1450link:
1422 ret = link_free_space(block_group, info); 1451 ret = link_free_space(block_group, info);
1423 if (ret) 1452 if (ret)
1424 kfree(info); 1453 kfree(info);
@@ -1621,6 +1650,7 @@ __btrfs_return_cluster_to_free_space(
1621 node = rb_next(&entry->offset_index); 1650 node = rb_next(&entry->offset_index);
1622 rb_erase(&entry->offset_index, &cluster->root); 1651 rb_erase(&entry->offset_index, &cluster->root);
1623 BUG_ON(entry->bitmap); 1652 BUG_ON(entry->bitmap);
1653 try_merge_free_space(block_group, entry, false);
1624 tree_insert_offset(&block_group->free_space_offset, 1654 tree_insert_offset(&block_group->free_space_offset,
1625 entry->offset, &entry->offset_index, 0); 1655 entry->offset, &entry->offset_index, 0);
1626 } 1656 }
@@ -1685,13 +1715,8 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
1685 ret = offset; 1715 ret = offset;
1686 if (entry->bitmap) { 1716 if (entry->bitmap) {
1687 bitmap_clear_bits(block_group, entry, offset, bytes); 1717 bitmap_clear_bits(block_group, entry, offset, bytes);
1688 if (!entry->bytes) { 1718 if (!entry->bytes)
1689 unlink_free_space(block_group, entry); 1719 free_bitmap(block_group, entry);
1690 kfree(entry->bitmap);
1691 kfree(entry);
1692 block_group->total_bitmaps--;
1693 recalculate_thresholds(block_group);
1694 }
1695 } else { 1720 } else {
1696 unlink_free_space(block_group, entry); 1721 unlink_free_space(block_group, entry);
1697 entry->offset += bytes; 1722 entry->offset += bytes;
@@ -1789,6 +1814,8 @@ static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group_cache *block_group,
1789 1814
1790 ret = search_start; 1815 ret = search_start;
1791 bitmap_clear_bits(block_group, entry, ret, bytes); 1816 bitmap_clear_bits(block_group, entry, ret, bytes);
1817 if (entry->bytes == 0)
1818 free_bitmap(block_group, entry);
1792out: 1819out:
1793 spin_unlock(&cluster->lock); 1820 spin_unlock(&cluster->lock);
1794 spin_unlock(&block_group->tree_lock); 1821 spin_unlock(&block_group->tree_lock);
@@ -1842,15 +1869,26 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
1842 entry->offset += bytes; 1869 entry->offset += bytes;
1843 entry->bytes -= bytes; 1870 entry->bytes -= bytes;
1844 1871
1845 if (entry->bytes == 0) { 1872 if (entry->bytes == 0)
1846 rb_erase(&entry->offset_index, &cluster->root); 1873 rb_erase(&entry->offset_index, &cluster->root);
1847 kfree(entry);
1848 }
1849 break; 1874 break;
1850 } 1875 }
1851out: 1876out:
1852 spin_unlock(&cluster->lock); 1877 spin_unlock(&cluster->lock);
1853 1878
1879 if (!ret)
1880 return 0;
1881
1882 spin_lock(&block_group->tree_lock);
1883
1884 block_group->free_space -= bytes;
1885 if (entry->bytes == 0) {
1886 block_group->free_extents--;
1887 kfree(entry);
1888 }
1889
1890 spin_unlock(&block_group->tree_lock);
1891
1854 return ret; 1892 return ret;
1855} 1893}
1856 1894
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 160b55b3e132..119520bdb9a5 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -90,13 +90,14 @@ static noinline int cow_file_range(struct inode *inode,
90 unsigned long *nr_written, int unlock); 90 unsigned long *nr_written, int unlock);
91 91
92static int btrfs_init_inode_security(struct btrfs_trans_handle *trans, 92static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
93 struct inode *inode, struct inode *dir) 93 struct inode *inode, struct inode *dir,
94 const struct qstr *qstr)
94{ 95{
95 int err; 96 int err;
96 97
97 err = btrfs_init_acl(trans, inode, dir); 98 err = btrfs_init_acl(trans, inode, dir);
98 if (!err) 99 if (!err)
99 err = btrfs_xattr_security_init(trans, inode, dir); 100 err = btrfs_xattr_security_init(trans, inode, dir, qstr);
100 return err; 101 return err;
101} 102}
102 103
@@ -416,7 +417,7 @@ again:
416 } 417 }
417 if (start == 0) { 418 if (start == 0) {
418 trans = btrfs_join_transaction(root, 1); 419 trans = btrfs_join_transaction(root, 1);
419 BUG_ON(!trans); 420 BUG_ON(IS_ERR(trans));
420 btrfs_set_trans_block_group(trans, inode); 421 btrfs_set_trans_block_group(trans, inode);
421 trans->block_rsv = &root->fs_info->delalloc_block_rsv; 422 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
422 423
@@ -612,6 +613,7 @@ retry:
612 GFP_NOFS); 613 GFP_NOFS);
613 614
614 trans = btrfs_join_transaction(root, 1); 615 trans = btrfs_join_transaction(root, 1);
616 BUG_ON(IS_ERR(trans));
615 ret = btrfs_reserve_extent(trans, root, 617 ret = btrfs_reserve_extent(trans, root,
616 async_extent->compressed_size, 618 async_extent->compressed_size,
617 async_extent->compressed_size, 619 async_extent->compressed_size,
@@ -643,6 +645,7 @@ retry:
643 async_extent->ram_size - 1, 0); 645 async_extent->ram_size - 1, 0);
644 646
645 em = alloc_extent_map(GFP_NOFS); 647 em = alloc_extent_map(GFP_NOFS);
648 BUG_ON(!em);
646 em->start = async_extent->start; 649 em->start = async_extent->start;
647 em->len = async_extent->ram_size; 650 em->len = async_extent->ram_size;
648 em->orig_start = em->start; 651 em->orig_start = em->start;
@@ -771,7 +774,7 @@ static noinline int cow_file_range(struct inode *inode,
771 774
772 BUG_ON(root == root->fs_info->tree_root); 775 BUG_ON(root == root->fs_info->tree_root);
773 trans = btrfs_join_transaction(root, 1); 776 trans = btrfs_join_transaction(root, 1);
774 BUG_ON(!trans); 777 BUG_ON(IS_ERR(trans));
775 btrfs_set_trans_block_group(trans, inode); 778 btrfs_set_trans_block_group(trans, inode);
776 trans->block_rsv = &root->fs_info->delalloc_block_rsv; 779 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
777 780
@@ -819,6 +822,7 @@ static noinline int cow_file_range(struct inode *inode,
819 BUG_ON(ret); 822 BUG_ON(ret);
820 823
821 em = alloc_extent_map(GFP_NOFS); 824 em = alloc_extent_map(GFP_NOFS);
825 BUG_ON(!em);
822 em->start = start; 826 em->start = start;
823 em->orig_start = em->start; 827 em->orig_start = em->start;
824 ram_size = ins.offset; 828 ram_size = ins.offset;
@@ -1049,7 +1053,7 @@ static noinline int run_delalloc_nocow(struct inode *inode,
1049 } else { 1053 } else {
1050 trans = btrfs_join_transaction(root, 1); 1054 trans = btrfs_join_transaction(root, 1);
1051 } 1055 }
1052 BUG_ON(!trans); 1056 BUG_ON(IS_ERR(trans));
1053 1057
1054 cow_start = (u64)-1; 1058 cow_start = (u64)-1;
1055 cur_offset = start; 1059 cur_offset = start;
@@ -1168,6 +1172,7 @@ out_check:
1168 struct extent_map_tree *em_tree; 1172 struct extent_map_tree *em_tree;
1169 em_tree = &BTRFS_I(inode)->extent_tree; 1173 em_tree = &BTRFS_I(inode)->extent_tree;
1170 em = alloc_extent_map(GFP_NOFS); 1174 em = alloc_extent_map(GFP_NOFS);
1175 BUG_ON(!em);
1171 em->start = cur_offset; 1176 em->start = cur_offset;
1172 em->orig_start = em->start; 1177 em->orig_start = em->start;
1173 em->len = num_bytes; 1178 em->len = num_bytes;
@@ -1557,6 +1562,7 @@ out:
1557out_page: 1562out_page:
1558 unlock_page(page); 1563 unlock_page(page);
1559 page_cache_release(page); 1564 page_cache_release(page);
1565 kfree(fixup);
1560} 1566}
1561 1567
1562/* 1568/*
@@ -1703,7 +1709,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1703 trans = btrfs_join_transaction_nolock(root, 1); 1709 trans = btrfs_join_transaction_nolock(root, 1);
1704 else 1710 else
1705 trans = btrfs_join_transaction(root, 1); 1711 trans = btrfs_join_transaction(root, 1);
1706 BUG_ON(!trans); 1712 BUG_ON(IS_ERR(trans));
1707 btrfs_set_trans_block_group(trans, inode); 1713 btrfs_set_trans_block_group(trans, inode);
1708 trans->block_rsv = &root->fs_info->delalloc_block_rsv; 1714 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
1709 ret = btrfs_update_inode(trans, root, inode); 1715 ret = btrfs_update_inode(trans, root, inode);
@@ -1720,6 +1726,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1720 trans = btrfs_join_transaction_nolock(root, 1); 1726 trans = btrfs_join_transaction_nolock(root, 1);
1721 else 1727 else
1722 trans = btrfs_join_transaction(root, 1); 1728 trans = btrfs_join_transaction(root, 1);
1729 BUG_ON(IS_ERR(trans));
1723 btrfs_set_trans_block_group(trans, inode); 1730 btrfs_set_trans_block_group(trans, inode);
1724 trans->block_rsv = &root->fs_info->delalloc_block_rsv; 1731 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
1725 1732
@@ -1907,7 +1914,7 @@ static int btrfs_clean_io_failures(struct inode *inode, u64 start)
1907 1914
1908 private = 0; 1915 private = 0;
1909 if (count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private, 1916 if (count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private,
1910 (u64)-1, 1, EXTENT_DIRTY)) { 1917 (u64)-1, 1, EXTENT_DIRTY, 0)) {
1911 ret = get_state_private(&BTRFS_I(inode)->io_failure_tree, 1918 ret = get_state_private(&BTRFS_I(inode)->io_failure_tree,
1912 start, &private_failure); 1919 start, &private_failure);
1913 if (ret == 0) { 1920 if (ret == 0) {
@@ -2354,6 +2361,7 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
2354 */ 2361 */
2355 if (is_bad_inode(inode)) { 2362 if (is_bad_inode(inode)) {
2356 trans = btrfs_start_transaction(root, 0); 2363 trans = btrfs_start_transaction(root, 0);
2364 BUG_ON(IS_ERR(trans));
2357 btrfs_orphan_del(trans, inode); 2365 btrfs_orphan_del(trans, inode);
2358 btrfs_end_transaction(trans, root); 2366 btrfs_end_transaction(trans, root);
2359 iput(inode); 2367 iput(inode);
@@ -2381,6 +2389,7 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
2381 2389
2382 if (root->orphan_block_rsv || root->orphan_item_inserted) { 2390 if (root->orphan_block_rsv || root->orphan_item_inserted) {
2383 trans = btrfs_join_transaction(root, 1); 2391 trans = btrfs_join_transaction(root, 1);
2392 BUG_ON(IS_ERR(trans));
2384 btrfs_end_transaction(trans, root); 2393 btrfs_end_transaction(trans, root);
2385 } 2394 }
2386 2395
@@ -2641,7 +2650,7 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
2641 path = btrfs_alloc_path(); 2650 path = btrfs_alloc_path();
2642 if (!path) { 2651 if (!path) {
2643 ret = -ENOMEM; 2652 ret = -ENOMEM;
2644 goto err; 2653 goto out;
2645 } 2654 }
2646 2655
2647 path->leave_spinning = 1; 2656 path->leave_spinning = 1;
@@ -2714,9 +2723,10 @@ static int check_path_shared(struct btrfs_root *root,
2714 struct extent_buffer *eb; 2723 struct extent_buffer *eb;
2715 int level; 2724 int level;
2716 u64 refs = 1; 2725 u64 refs = 1;
2717 int uninitialized_var(ret);
2718 2726
2719 for (level = 0; level < BTRFS_MAX_LEVEL; level++) { 2727 for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
2728 int ret;
2729
2720 if (!path->nodes[level]) 2730 if (!path->nodes[level])
2721 break; 2731 break;
2722 eb = path->nodes[level]; 2732 eb = path->nodes[level];
@@ -2727,7 +2737,7 @@ static int check_path_shared(struct btrfs_root *root,
2727 if (refs > 1) 2737 if (refs > 1)
2728 return 1; 2738 return 1;
2729 } 2739 }
2730 return ret; /* XXX callers? */ 2740 return 0;
2731} 2741}
2732 2742
2733/* 2743/*
@@ -4134,7 +4144,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
4134 } 4144 }
4135 srcu_read_unlock(&root->fs_info->subvol_srcu, index); 4145 srcu_read_unlock(&root->fs_info->subvol_srcu, index);
4136 4146
4137 if (root != sub_root) { 4147 if (!IS_ERR(inode) && root != sub_root) {
4138 down_read(&root->fs_info->cleanup_work_sem); 4148 down_read(&root->fs_info->cleanup_work_sem);
4139 if (!(inode->i_sb->s_flags & MS_RDONLY)) 4149 if (!(inode->i_sb->s_flags & MS_RDONLY))
4140 btrfs_orphan_cleanup(sub_root); 4150 btrfs_orphan_cleanup(sub_root);
@@ -4347,6 +4357,8 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc)
4347 trans = btrfs_join_transaction_nolock(root, 1); 4357 trans = btrfs_join_transaction_nolock(root, 1);
4348 else 4358 else
4349 trans = btrfs_join_transaction(root, 1); 4359 trans = btrfs_join_transaction(root, 1);
4360 if (IS_ERR(trans))
4361 return PTR_ERR(trans);
4350 btrfs_set_trans_block_group(trans, inode); 4362 btrfs_set_trans_block_group(trans, inode);
4351 if (nolock) 4363 if (nolock)
4352 ret = btrfs_end_transaction_nolock(trans, root); 4364 ret = btrfs_end_transaction_nolock(trans, root);
@@ -4372,6 +4384,7 @@ void btrfs_dirty_inode(struct inode *inode)
4372 return; 4384 return;
4373 4385
4374 trans = btrfs_join_transaction(root, 1); 4386 trans = btrfs_join_transaction(root, 1);
4387 BUG_ON(IS_ERR(trans));
4375 btrfs_set_trans_block_group(trans, inode); 4388 btrfs_set_trans_block_group(trans, inode);
4376 4389
4377 ret = btrfs_update_inode(trans, root, inode); 4390 ret = btrfs_update_inode(trans, root, inode);
@@ -4692,7 +4705,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
4692 if (IS_ERR(inode)) 4705 if (IS_ERR(inode))
4693 goto out_unlock; 4706 goto out_unlock;
4694 4707
4695 err = btrfs_init_inode_security(trans, inode, dir); 4708 err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
4696 if (err) { 4709 if (err) {
4697 drop_inode = 1; 4710 drop_inode = 1;
4698 goto out_unlock; 4711 goto out_unlock;
@@ -4753,7 +4766,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
4753 if (IS_ERR(inode)) 4766 if (IS_ERR(inode))
4754 goto out_unlock; 4767 goto out_unlock;
4755 4768
4756 err = btrfs_init_inode_security(trans, inode, dir); 4769 err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
4757 if (err) { 4770 if (err) {
4758 drop_inode = 1; 4771 drop_inode = 1;
4759 goto out_unlock; 4772 goto out_unlock;
@@ -4794,9 +4807,6 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
4794 int err; 4807 int err;
4795 int drop_inode = 0; 4808 int drop_inode = 0;
4796 4809
4797 if (inode->i_nlink == 0)
4798 return -ENOENT;
4799
4800 /* do not allow sys_link's with other subvols of the same device */ 4810 /* do not allow sys_link's with other subvols of the same device */
4801 if (root->objectid != BTRFS_I(inode)->root->objectid) 4811 if (root->objectid != BTRFS_I(inode)->root->objectid)
4802 return -EPERM; 4812 return -EPERM;
@@ -4809,10 +4819,11 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
4809 goto fail; 4819 goto fail;
4810 4820
4811 /* 4821 /*
4812 * 1 item for inode ref 4822 * 2 items for inode and inode ref
4813 * 2 items for dir items 4823 * 2 items for dir items
4824 * 1 item for parent inode
4814 */ 4825 */
4815 trans = btrfs_start_transaction(root, 3); 4826 trans = btrfs_start_transaction(root, 5);
4816 if (IS_ERR(trans)) { 4827 if (IS_ERR(trans)) {
4817 err = PTR_ERR(trans); 4828 err = PTR_ERR(trans);
4818 goto fail; 4829 goto fail;
@@ -4881,7 +4892,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
4881 4892
4882 drop_on_err = 1; 4893 drop_on_err = 1;
4883 4894
4884 err = btrfs_init_inode_security(trans, inode, dir); 4895 err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
4885 if (err) 4896 if (err)
4886 goto out_fail; 4897 goto out_fail;
4887 4898
@@ -5176,6 +5187,8 @@ again:
5176 em = NULL; 5187 em = NULL;
5177 btrfs_release_path(root, path); 5188 btrfs_release_path(root, path);
5178 trans = btrfs_join_transaction(root, 1); 5189 trans = btrfs_join_transaction(root, 1);
5190 if (IS_ERR(trans))
5191 return ERR_CAST(trans);
5179 goto again; 5192 goto again;
5180 } 5193 }
5181 map = kmap(page); 5194 map = kmap(page);
@@ -5266,6 +5279,128 @@ out:
5266 return em; 5279 return em;
5267} 5280}
5268 5281
/*
 * fiemap-oriented wrapper around btrfs_get_extent() (wired up as the
 * get_extent callback in btrfs_fiemap() elsewhere in this patch).
 *
 * btrfs_get_extent() reports a hole for ranges whose data is still only
 * delalloc (dirty in memory, no allocated extent yet).  For fiemap we do
 * want to surface those ranges, so when the mapping comes back as a hole
 * this routine scans the inode's io_tree for EXTENT_DELALLOC bits inside
 * [start, start+len) and, if any overlap, synthesizes an extent_map with
 * block_start == EXTENT_MAP_DELALLOC, clipped against the hole mapping
 * that btrfs_get_extent() returned.
 *
 * Returns an extent_map the caller must free with free_extent_map(), or
 * an ERR_PTR on failure (only -ENOMEM is generated locally; other errors
 * are passed through from btrfs_get_extent()).
 */
5282struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *page,
5283 size_t pg_offset, u64 start, u64 len,
5284 int create)
5285{
5286 struct extent_map *em;
5287 struct extent_map *hole_em = NULL;
5288 u64 range_start = start;
5289 u64 end;
5290 u64 found;
5291 u64 found_end;
5292 int err = 0;
5293
 /* first ask the regular extent lookup; errors pass straight through */
5294 em = btrfs_get_extent(inode, page, pg_offset, start, len, create);
5295 if (IS_ERR(em))
5296 return em;
5297 if (em) {
5298 /*
5299 * if our em maps to a hole, there might
5300 * actually be delalloc bytes behind it
5301 */
5302 if (em->block_start != EXTENT_MAP_HOLE)
5303 return em;
5304 else
5305 hole_em = em;
5306 }
5307
5308 /* check to see if we've wrapped (len == -1 or similar) */
5309 end = start + len;
5310 if (end < start)
5311 end = (u64)-1;
5312 else
5313 end -= 1;
5314
5315 em = NULL;
5316
5317 /* ok, we didn't find anything, lets look for delalloc */
 /*
 * count_range_bits() reports how many EXTENT_DELALLOC bytes lie in
 * [range_start, end] and, judging by the use below, advances
 * range_start to where the first delalloc byte was found.
 */
5318 found = count_range_bits(&BTRFS_I(inode)->io_tree, &range_start,
5319 end, len, EXTENT_DELALLOC, 1);
5320 found_end = range_start + found;
 /* found_end wrapped past u64 max: clamp to the end of the address space */
5321 if (found_end < range_start)
5322 found_end = (u64)-1;
5323
5324 /*
5325 * we didn't find anything useful, return
5326 * the original results from get_extent()
5327 */
5328 if (range_start > end || found_end <= start) {
 /* hand ownership of the hole mapping back to the caller */
5329 em = hole_em;
5330 hole_em = NULL;
5331 goto out;
5332 }
5333
5334 /* adjust the range_start to make sure it doesn't
5335 * go backwards from the start they passed in
5336 */
5337 range_start = max(start,range_start);
5338 found = found_end - range_start;
5339
5340 if (found > 0) {
5341 u64 hole_start = start;
5342 u64 hole_len = len;
5343
5344 em = alloc_extent_map(GFP_NOFS);
5345 if (!em) {
5346 err = -ENOMEM;
5347 goto out;
5348 }
5349 /*
5350 * when btrfs_get_extent can't find anything it
5351 * returns one huge hole
5352 *
5353 * make sure what it found really fits our range, and
5354 * adjust to make sure it is based on the start from
5355 * the caller
5356 */
5357 if (hole_em) {
5358 u64 calc_end = extent_map_end(hole_em);
5359
 /* hole entirely outside [start, end]: discard it */
5360 if (calc_end <= start || (hole_em->start > end)) {
5361 free_extent_map(hole_em);
5362 hole_em = NULL;
5363 } else {
5364 hole_start = max(hole_em->start, start);
5365 hole_len = calc_end - hole_start;
5366 }
5367 }
5368 em->bdev = NULL;
5369 if (hole_em && range_start > hole_start) {
5370 /* our hole starts before our delalloc, so we
5371 * have to return just the parts of the hole
5372 * that go until the delalloc starts
5373 */
5374 em->len = min(hole_len,
5375 range_start - hole_start);
5376 em->start = hole_start;
5377 em->orig_start = hole_start;
5378 /*
5379 * don't adjust block start at all,
5380 * it is fixed at EXTENT_MAP_HOLE
5381 */
5382 em->block_start = hole_em->block_start;
5383 em->block_len = hole_len;
5384 } else {
 /* delalloc comes first: report the delalloc range itself */
5385 em->start = range_start;
5386 em->len = found;
5387 em->orig_start = range_start;
5388 em->block_start = EXTENT_MAP_DELALLOC;
5389 em->block_len = found;
5390 }
5391 } else if (hole_em) {
 /*
 * nothing left after clipping: return the original hole.
 * Deliberately bypasses out: so hole_em isn't freed here.
 */
5392 return hole_em;
5393 }
5394out:
5395
 /*
 * hole_em may be NULL or already consumed on several paths —
 * presumably free_extent_map() tolerates NULL; verify against
 * its definition in extent_map.c.
 */
5396 free_extent_map(hole_em);
 /* err is only ever -ENOMEM from the alloc_extent_map() failure above */
5397 if (err) {
5398 free_extent_map(em);
5399 return ERR_PTR(err);
5400 }
5401 return em;
5402}
5403
5269static struct extent_map *btrfs_new_extent_direct(struct inode *inode, 5404static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
5270 u64 start, u64 len) 5405 u64 start, u64 len)
5271{ 5406{
@@ -5280,8 +5415,8 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
5280 btrfs_drop_extent_cache(inode, start, start + len - 1, 0); 5415 btrfs_drop_extent_cache(inode, start, start + len - 1, 0);
5281 5416
5282 trans = btrfs_join_transaction(root, 0); 5417 trans = btrfs_join_transaction(root, 0);
5283 if (!trans) 5418 if (IS_ERR(trans))
5284 return ERR_PTR(-ENOMEM); 5419 return ERR_CAST(trans);
5285 5420
5286 trans->block_rsv = &root->fs_info->delalloc_block_rsv; 5421 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
5287 5422
@@ -5505,7 +5640,7 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
5505 * while we look for nocow cross refs 5640 * while we look for nocow cross refs
5506 */ 5641 */
5507 trans = btrfs_join_transaction(root, 0); 5642 trans = btrfs_join_transaction(root, 0);
5508 if (!trans) 5643 if (IS_ERR(trans))
5509 goto must_cow; 5644 goto must_cow;
5510 5645
5511 if (can_nocow_odirect(trans, inode, start, len) == 1) { 5646 if (can_nocow_odirect(trans, inode, start, len) == 1) {
@@ -5640,7 +5775,7 @@ again:
5640 BUG_ON(!ordered); 5775 BUG_ON(!ordered);
5641 5776
5642 trans = btrfs_join_transaction(root, 1); 5777 trans = btrfs_join_transaction(root, 1);
5643 if (!trans) { 5778 if (IS_ERR(trans)) {
5644 err = -ENOMEM; 5779 err = -ENOMEM;
5645 goto out; 5780 goto out;
5646 } 5781 }
@@ -5920,6 +6055,7 @@ static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode,
5920 if (!skip_sum) { 6055 if (!skip_sum) {
5921 dip->csums = kmalloc(sizeof(u32) * bio->bi_vcnt, GFP_NOFS); 6056 dip->csums = kmalloc(sizeof(u32) * bio->bi_vcnt, GFP_NOFS);
5922 if (!dip->csums) { 6057 if (!dip->csums) {
6058 kfree(dip);
5923 ret = -ENOMEM; 6059 ret = -ENOMEM;
5924 goto free_ordered; 6060 goto free_ordered;
5925 } 6061 }
@@ -6088,7 +6224,7 @@ out:
6088static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 6224static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
6089 __u64 start, __u64 len) 6225 __u64 start, __u64 len)
6090{ 6226{
6091 return extent_fiemap(inode, fieinfo, start, len, btrfs_get_extent); 6227 return extent_fiemap(inode, fieinfo, start, len, btrfs_get_extent_fiemap);
6092} 6228}
6093 6229
6094int btrfs_readpage(struct file *file, struct page *page) 6230int btrfs_readpage(struct file *file, struct page *page)
@@ -6968,7 +7104,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
6968 if (IS_ERR(inode)) 7104 if (IS_ERR(inode))
6969 goto out_unlock; 7105 goto out_unlock;
6970 7106
6971 err = btrfs_init_inode_security(trans, inode, dir); 7107 err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
6972 if (err) { 7108 if (err) {
6973 drop_inode = 1; 7109 drop_inode = 1;
6974 goto out_unlock; 7110 goto out_unlock;
@@ -7204,7 +7340,6 @@ static const struct address_space_operations btrfs_aops = {
7204 .writepage = btrfs_writepage, 7340 .writepage = btrfs_writepage,
7205 .writepages = btrfs_writepages, 7341 .writepages = btrfs_writepages,
7206 .readpages = btrfs_readpages, 7342 .readpages = btrfs_readpages,
7207 .sync_page = block_sync_page,
7208 .direct_IO = btrfs_direct_IO, 7343 .direct_IO = btrfs_direct_IO,
7209 .invalidatepage = btrfs_invalidatepage, 7344 .invalidatepage = btrfs_invalidatepage,
7210 .releasepage = btrfs_releasepage, 7345 .releasepage = btrfs_releasepage,
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index a506a22b522a..d1bace3df9b6 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -158,7 +158,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
158 FS_SYNC_FL | FS_DIRSYNC_FL)) 158 FS_SYNC_FL | FS_DIRSYNC_FL))
159 return -EOPNOTSUPP; 159 return -EOPNOTSUPP;
160 160
161 if (!is_owner_or_cap(inode)) 161 if (!inode_owner_or_capable(inode))
162 return -EACCES; 162 return -EACCES;
163 163
164 mutex_lock(&inode->i_mutex); 164 mutex_lock(&inode->i_mutex);
@@ -203,7 +203,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
203 203
204 204
205 trans = btrfs_join_transaction(root, 1); 205 trans = btrfs_join_transaction(root, 1);
206 BUG_ON(!trans); 206 BUG_ON(IS_ERR(trans));
207 207
208 ret = btrfs_update_inode(trans, root, inode); 208 ret = btrfs_update_inode(trans, root, inode);
209 BUG_ON(ret); 209 BUG_ON(ret);
@@ -907,6 +907,10 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
907 907
908 if (new_size > old_size) { 908 if (new_size > old_size) {
909 trans = btrfs_start_transaction(root, 0); 909 trans = btrfs_start_transaction(root, 0);
910 if (IS_ERR(trans)) {
911 ret = PTR_ERR(trans);
912 goto out_unlock;
913 }
910 ret = btrfs_grow_device(trans, device, new_size); 914 ret = btrfs_grow_device(trans, device, new_size);
911 btrfs_commit_transaction(trans, root); 915 btrfs_commit_transaction(trans, root);
912 } else { 916 } else {
@@ -1067,12 +1071,15 @@ static noinline int btrfs_ioctl_subvol_setflags(struct file *file,
1067 if (copy_from_user(&flags, arg, sizeof(flags))) 1071 if (copy_from_user(&flags, arg, sizeof(flags)))
1068 return -EFAULT; 1072 return -EFAULT;
1069 1073
1070 if (flags & ~BTRFS_SUBVOL_CREATE_ASYNC) 1074 if (flags & BTRFS_SUBVOL_CREATE_ASYNC)
1071 return -EINVAL; 1075 return -EINVAL;
1072 1076
1073 if (flags & ~BTRFS_SUBVOL_RDONLY) 1077 if (flags & ~BTRFS_SUBVOL_RDONLY)
1074 return -EOPNOTSUPP; 1078 return -EOPNOTSUPP;
1075 1079
1080 if (!inode_owner_or_capable(inode))
1081 return -EACCES;
1082
1076 down_write(&root->fs_info->subvol_sem); 1083 down_write(&root->fs_info->subvol_sem);
1077 1084
1078 /* nothing to do */ 1085 /* nothing to do */
@@ -1093,7 +1100,7 @@ static noinline int btrfs_ioctl_subvol_setflags(struct file *file,
1093 goto out_reset; 1100 goto out_reset;
1094 } 1101 }
1095 1102
1096 ret = btrfs_update_root(trans, root, 1103 ret = btrfs_update_root(trans, root->fs_info->tree_root,
1097 &root->root_key, &root->root_item); 1104 &root->root_key, &root->root_item);
1098 1105
1099 btrfs_commit_transaction(trans, root); 1106 btrfs_commit_transaction(trans, root);
@@ -1898,7 +1905,10 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1898 1905
1899 memcpy(&new_key, &key, sizeof(new_key)); 1906 memcpy(&new_key, &key, sizeof(new_key));
1900 new_key.objectid = inode->i_ino; 1907 new_key.objectid = inode->i_ino;
1901 new_key.offset = key.offset + destoff - off; 1908 if (off <= key.offset)
1909 new_key.offset = key.offset + destoff - off;
1910 else
1911 new_key.offset = destoff;
1902 1912
1903 trans = btrfs_start_transaction(root, 1); 1913 trans = btrfs_start_transaction(root, 1);
1904 if (IS_ERR(trans)) { 1914 if (IS_ERR(trans)) {
@@ -2082,7 +2092,7 @@ static long btrfs_ioctl_trans_start(struct file *file)
2082 2092
2083 ret = -ENOMEM; 2093 ret = -ENOMEM;
2084 trans = btrfs_start_ioctl_transaction(root, 0); 2094 trans = btrfs_start_ioctl_transaction(root, 0);
2085 if (!trans) 2095 if (IS_ERR(trans))
2086 goto out_drop; 2096 goto out_drop;
2087 2097
2088 file->private_data = trans; 2098 file->private_data = trans;
@@ -2138,9 +2148,9 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
2138 path->leave_spinning = 1; 2148 path->leave_spinning = 1;
2139 2149
2140 trans = btrfs_start_transaction(root, 1); 2150 trans = btrfs_start_transaction(root, 1);
2141 if (!trans) { 2151 if (IS_ERR(trans)) {
2142 btrfs_free_path(path); 2152 btrfs_free_path(path);
2143 return -ENOMEM; 2153 return PTR_ERR(trans);
2144 } 2154 }
2145 2155
2146 dir_id = btrfs_super_root_dir(&root->fs_info->super_copy); 2156 dir_id = btrfs_super_root_dir(&root->fs_info->super_copy);
@@ -2201,7 +2211,7 @@ long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)
2201 int num_types = 4; 2211 int num_types = 4;
2202 int alloc_size; 2212 int alloc_size;
2203 int ret = 0; 2213 int ret = 0;
2204 int slot_count = 0; 2214 u64 slot_count = 0;
2205 int i, c; 2215 int i, c;
2206 2216
2207 if (copy_from_user(&space_args, 2217 if (copy_from_user(&space_args,
@@ -2240,7 +2250,7 @@ long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)
2240 goto out; 2250 goto out;
2241 } 2251 }
2242 2252
2243 slot_count = min_t(int, space_args.space_slots, slot_count); 2253 slot_count = min_t(u64, space_args.space_slots, slot_count);
2244 2254
2245 alloc_size = sizeof(*dest) * slot_count; 2255 alloc_size = sizeof(*dest) * slot_count;
2246 2256
@@ -2260,6 +2270,9 @@ long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)
2260 for (i = 0; i < num_types; i++) { 2270 for (i = 0; i < num_types; i++) {
2261 struct btrfs_space_info *tmp; 2271 struct btrfs_space_info *tmp;
2262 2272
2273 if (!slot_count)
2274 break;
2275
2263 info = NULL; 2276 info = NULL;
2264 rcu_read_lock(); 2277 rcu_read_lock();
2265 list_for_each_entry_rcu(tmp, &root->fs_info->space_info, 2278 list_for_each_entry_rcu(tmp, &root->fs_info->space_info,
@@ -2281,7 +2294,10 @@ long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)
2281 memcpy(dest, &space, sizeof(space)); 2294 memcpy(dest, &space, sizeof(space));
2282 dest++; 2295 dest++;
2283 space_args.total_spaces++; 2296 space_args.total_spaces++;
2297 slot_count--;
2284 } 2298 }
2299 if (!slot_count)
2300 break;
2285 } 2301 }
2286 up_read(&info->groups_sem); 2302 up_read(&info->groups_sem);
2287 } 2303 }
@@ -2334,6 +2350,8 @@ static noinline long btrfs_ioctl_start_sync(struct file *file, void __user *argp
2334 u64 transid; 2350 u64 transid;
2335 2351
2336 trans = btrfs_start_transaction(root, 0); 2352 trans = btrfs_start_transaction(root, 0);
2353 if (IS_ERR(trans))
2354 return PTR_ERR(trans);
2337 transid = trans->transid; 2355 transid = trans->transid;
2338 btrfs_commit_transaction_async(trans, root, 0); 2356 btrfs_commit_transaction_async(trans, root, 0);
2339 2357
diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
index cc9b450399df..a178f5ebea78 100644
--- a/fs/btrfs/lzo.c
+++ b/fs/btrfs/lzo.c
@@ -280,6 +280,7 @@ static int lzo_decompress_biovec(struct list_head *ws,
280 unsigned long tot_out; 280 unsigned long tot_out;
281 unsigned long tot_len; 281 unsigned long tot_len;
282 char *buf; 282 char *buf;
283 bool may_late_unmap, need_unmap;
283 284
284 data_in = kmap(pages_in[0]); 285 data_in = kmap(pages_in[0]);
285 tot_len = read_compress_length(data_in); 286 tot_len = read_compress_length(data_in);
@@ -300,11 +301,13 @@ static int lzo_decompress_biovec(struct list_head *ws,
300 301
301 tot_in += in_len; 302 tot_in += in_len;
302 working_bytes = in_len; 303 working_bytes = in_len;
304 may_late_unmap = need_unmap = false;
303 305
304 /* fast path: avoid using the working buffer */ 306 /* fast path: avoid using the working buffer */
305 if (in_page_bytes_left >= in_len) { 307 if (in_page_bytes_left >= in_len) {
306 buf = data_in + in_offset; 308 buf = data_in + in_offset;
307 bytes = in_len; 309 bytes = in_len;
310 may_late_unmap = true;
308 goto cont; 311 goto cont;
309 } 312 }
310 313
@@ -329,14 +332,17 @@ cont:
329 if (working_bytes == 0 && tot_in >= tot_len) 332 if (working_bytes == 0 && tot_in >= tot_len)
330 break; 333 break;
331 334
332 kunmap(pages_in[page_in_index]); 335 if (page_in_index + 1 >= total_pages_in) {
333 page_in_index++;
334 if (page_in_index >= total_pages_in) {
335 ret = -1; 336 ret = -1;
336 data_in = NULL;
337 goto done; 337 goto done;
338 } 338 }
339 data_in = kmap(pages_in[page_in_index]); 339
340 if (may_late_unmap)
341 need_unmap = true;
342 else
343 kunmap(pages_in[page_in_index]);
344
345 data_in = kmap(pages_in[++page_in_index]);
340 346
341 in_page_bytes_left = PAGE_CACHE_SIZE; 347 in_page_bytes_left = PAGE_CACHE_SIZE;
342 in_offset = 0; 348 in_offset = 0;
@@ -346,6 +352,8 @@ cont:
346 out_len = lzo1x_worst_compress(PAGE_CACHE_SIZE); 352 out_len = lzo1x_worst_compress(PAGE_CACHE_SIZE);
347 ret = lzo1x_decompress_safe(buf, in_len, workspace->buf, 353 ret = lzo1x_decompress_safe(buf, in_len, workspace->buf,
348 &out_len); 354 &out_len);
355 if (need_unmap)
356 kunmap(pages_in[page_in_index - 1]);
349 if (ret != LZO_E_OK) { 357 if (ret != LZO_E_OK) {
350 printk(KERN_WARNING "btrfs decompress failed\n"); 358 printk(KERN_WARNING "btrfs decompress failed\n");
351 ret = -1; 359 ret = -1;
@@ -363,8 +371,7 @@ cont:
363 break; 371 break;
364 } 372 }
365done: 373done:
366 if (data_in) 374 kunmap(pages_in[page_in_index]);
367 kunmap(pages_in[page_in_index]);
368 return ret; 375 return ret;
369} 376}
370 377
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 2b61e1ddcd99..083a55477375 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -141,7 +141,7 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
141 u64 file_offset) 141 u64 file_offset)
142{ 142{
143 struct rb_root *root = &tree->tree; 143 struct rb_root *root = &tree->tree;
144 struct rb_node *prev; 144 struct rb_node *prev = NULL;
145 struct rb_node *ret; 145 struct rb_node *ret;
146 struct btrfs_ordered_extent *entry; 146 struct btrfs_ordered_extent *entry;
147 147
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index 0d126be22b63..fb2605d998e9 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -260,6 +260,7 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
260#else 260#else
261 BUG(); 261 BUG();
262#endif 262#endif
263 break;
263 case BTRFS_BLOCK_GROUP_ITEM_KEY: 264 case BTRFS_BLOCK_GROUP_ITEM_KEY:
264 bi = btrfs_item_ptr(l, i, 265 bi = btrfs_item_ptr(l, i,
265 struct btrfs_block_group_item); 266 struct btrfs_block_group_item);
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 045c9c2b2d7e..31ade5802ae8 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -1157,6 +1157,7 @@ static int clone_backref_node(struct btrfs_trans_handle *trans,
1157 new_node->bytenr = dest->node->start; 1157 new_node->bytenr = dest->node->start;
1158 new_node->level = node->level; 1158 new_node->level = node->level;
1159 new_node->lowest = node->lowest; 1159 new_node->lowest = node->lowest;
1160 new_node->checked = 1;
1160 new_node->root = dest; 1161 new_node->root = dest;
1161 1162
1162 if (!node->lowest) { 1163 if (!node->lowest) {
@@ -2028,6 +2029,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
2028 2029
2029 while (1) { 2030 while (1) {
2030 trans = btrfs_start_transaction(root, 0); 2031 trans = btrfs_start_transaction(root, 0);
2032 BUG_ON(IS_ERR(trans));
2031 trans->block_rsv = rc->block_rsv; 2033 trans->block_rsv = rc->block_rsv;
2032 2034
2033 ret = btrfs_block_rsv_check(trans, root, rc->block_rsv, 2035 ret = btrfs_block_rsv_check(trans, root, rc->block_rsv,
@@ -2147,6 +2149,12 @@ again:
2147 } 2149 }
2148 2150
2149 trans = btrfs_join_transaction(rc->extent_root, 1); 2151 trans = btrfs_join_transaction(rc->extent_root, 1);
2152 if (IS_ERR(trans)) {
2153 if (!err)
2154 btrfs_block_rsv_release(rc->extent_root,
2155 rc->block_rsv, num_bytes);
2156 return PTR_ERR(trans);
2157 }
2150 2158
2151 if (!err) { 2159 if (!err) {
2152 if (num_bytes != rc->merging_rsv_size) { 2160 if (num_bytes != rc->merging_rsv_size) {
@@ -3222,6 +3230,7 @@ truncate:
3222 trans = btrfs_join_transaction(root, 0); 3230 trans = btrfs_join_transaction(root, 0);
3223 if (IS_ERR(trans)) { 3231 if (IS_ERR(trans)) {
3224 btrfs_free_path(path); 3232 btrfs_free_path(path);
3233 ret = PTR_ERR(trans);
3225 goto out; 3234 goto out;
3226 } 3235 }
3227 3236
@@ -3628,6 +3637,7 @@ int prepare_to_relocate(struct reloc_control *rc)
3628 set_reloc_control(rc); 3637 set_reloc_control(rc);
3629 3638
3630 trans = btrfs_join_transaction(rc->extent_root, 1); 3639 trans = btrfs_join_transaction(rc->extent_root, 1);
3640 BUG_ON(IS_ERR(trans));
3631 btrfs_commit_transaction(trans, rc->extent_root); 3641 btrfs_commit_transaction(trans, rc->extent_root);
3632 return 0; 3642 return 0;
3633} 3643}
@@ -3644,6 +3654,7 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
3644 u32 item_size; 3654 u32 item_size;
3645 int ret; 3655 int ret;
3646 int err = 0; 3656 int err = 0;
3657 int progress = 0;
3647 3658
3648 path = btrfs_alloc_path(); 3659 path = btrfs_alloc_path();
3649 if (!path) 3660 if (!path)
@@ -3656,8 +3667,10 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
3656 } 3667 }
3657 3668
3658 while (1) { 3669 while (1) {
3670 progress++;
3659 trans = btrfs_start_transaction(rc->extent_root, 0); 3671 trans = btrfs_start_transaction(rc->extent_root, 0);
3660 3672 BUG_ON(IS_ERR(trans));
3673restart:
3661 if (update_backref_cache(trans, &rc->backref_cache)) { 3674 if (update_backref_cache(trans, &rc->backref_cache)) {
3662 btrfs_end_transaction(trans, rc->extent_root); 3675 btrfs_end_transaction(trans, rc->extent_root);
3663 continue; 3676 continue;
@@ -3770,6 +3783,15 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
3770 } 3783 }
3771 } 3784 }
3772 } 3785 }
3786 if (trans && progress && err == -ENOSPC) {
3787 ret = btrfs_force_chunk_alloc(trans, rc->extent_root,
3788 rc->block_group->flags);
3789 if (ret == 0) {
3790 err = 0;
3791 progress = 0;
3792 goto restart;
3793 }
3794 }
3773 3795
3774 btrfs_release_path(rc->extent_root, path); 3796 btrfs_release_path(rc->extent_root, path);
3775 clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY, 3797 clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY,
@@ -3804,7 +3826,10 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
3804 3826
3805 /* get rid of pinned extents */ 3827 /* get rid of pinned extents */
3806 trans = btrfs_join_transaction(rc->extent_root, 1); 3828 trans = btrfs_join_transaction(rc->extent_root, 1);
3807 btrfs_commit_transaction(trans, rc->extent_root); 3829 if (IS_ERR(trans))
3830 err = PTR_ERR(trans);
3831 else
3832 btrfs_commit_transaction(trans, rc->extent_root);
3808out_free: 3833out_free:
3809 btrfs_free_block_rsv(rc->extent_root, rc->block_rsv); 3834 btrfs_free_block_rsv(rc->extent_root, rc->block_rsv);
3810 btrfs_free_path(path); 3835 btrfs_free_path(path);
@@ -4022,6 +4047,7 @@ static noinline_for_stack int mark_garbage_root(struct btrfs_root *root)
4022 int ret; 4047 int ret;
4023 4048
4024 trans = btrfs_start_transaction(root->fs_info->tree_root, 0); 4049 trans = btrfs_start_transaction(root->fs_info->tree_root, 0);
4050 BUG_ON(IS_ERR(trans));
4025 4051
4026 memset(&root->root_item.drop_progress, 0, 4052 memset(&root->root_item.drop_progress, 0,
4027 sizeof(root->root_item.drop_progress)); 4053 sizeof(root->root_item.drop_progress));
@@ -4125,6 +4151,11 @@ int btrfs_recover_relocation(struct btrfs_root *root)
4125 set_reloc_control(rc); 4151 set_reloc_control(rc);
4126 4152
4127 trans = btrfs_join_transaction(rc->extent_root, 1); 4153 trans = btrfs_join_transaction(rc->extent_root, 1);
4154 if (IS_ERR(trans)) {
4155 unset_reloc_control(rc);
4156 err = PTR_ERR(trans);
4157 goto out_free;
4158 }
4128 4159
4129 rc->merge_reloc_tree = 1; 4160 rc->merge_reloc_tree = 1;
4130 4161
@@ -4154,9 +4185,13 @@ int btrfs_recover_relocation(struct btrfs_root *root)
4154 unset_reloc_control(rc); 4185 unset_reloc_control(rc);
4155 4186
4156 trans = btrfs_join_transaction(rc->extent_root, 1); 4187 trans = btrfs_join_transaction(rc->extent_root, 1);
4157 btrfs_commit_transaction(trans, rc->extent_root); 4188 if (IS_ERR(trans))
4158out: 4189 err = PTR_ERR(trans);
4190 else
4191 btrfs_commit_transaction(trans, rc->extent_root);
4192out_free:
4159 kfree(rc); 4193 kfree(rc);
4194out:
4160 while (!list_empty(&reloc_roots)) { 4195 while (!list_empty(&reloc_roots)) {
4161 reloc_root = list_entry(reloc_roots.next, 4196 reloc_root = list_entry(reloc_roots.next,
4162 struct btrfs_root, root_list); 4197 struct btrfs_root, root_list);
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index b2130c46fdb5..d39a9895d932 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -155,7 +155,8 @@ enum {
155 Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl, Opt_compress, 155 Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl, Opt_compress,
156 Opt_compress_type, Opt_compress_force, Opt_compress_force_type, 156 Opt_compress_type, Opt_compress_force, Opt_compress_force_type,
157 Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard, 157 Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard,
158 Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed, Opt_err, 158 Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed,
159 Opt_enospc_debug, Opt_err,
159}; 160};
160 161
161static match_table_t tokens = { 162static match_table_t tokens = {
@@ -184,6 +185,7 @@ static match_table_t tokens = {
184 {Opt_space_cache, "space_cache"}, 185 {Opt_space_cache, "space_cache"},
185 {Opt_clear_cache, "clear_cache"}, 186 {Opt_clear_cache, "clear_cache"},
186 {Opt_user_subvol_rm_allowed, "user_subvol_rm_allowed"}, 187 {Opt_user_subvol_rm_allowed, "user_subvol_rm_allowed"},
188 {Opt_enospc_debug, "enospc_debug"},
187 {Opt_err, NULL}, 189 {Opt_err, NULL},
188}; 190};
189 191
@@ -358,6 +360,9 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
358 case Opt_user_subvol_rm_allowed: 360 case Opt_user_subvol_rm_allowed:
359 btrfs_set_opt(info->mount_opt, USER_SUBVOL_RM_ALLOWED); 361 btrfs_set_opt(info->mount_opt, USER_SUBVOL_RM_ALLOWED);
360 break; 362 break;
363 case Opt_enospc_debug:
364 btrfs_set_opt(info->mount_opt, ENOSPC_DEBUG);
365 break;
361 case Opt_err: 366 case Opt_err:
362 printk(KERN_INFO "btrfs: unrecognized mount option " 367 printk(KERN_INFO "btrfs: unrecognized mount option "
363 "'%s'\n", p); 368 "'%s'\n", p);
@@ -383,7 +388,7 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags,
383 struct btrfs_fs_devices **fs_devices) 388 struct btrfs_fs_devices **fs_devices)
384{ 389{
385 substring_t args[MAX_OPT_ARGS]; 390 substring_t args[MAX_OPT_ARGS];
386 char *opts, *p; 391 char *opts, *orig, *p;
387 int error = 0; 392 int error = 0;
388 int intarg; 393 int intarg;
389 394
@@ -397,6 +402,7 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags,
397 opts = kstrdup(options, GFP_KERNEL); 402 opts = kstrdup(options, GFP_KERNEL);
398 if (!opts) 403 if (!opts)
399 return -ENOMEM; 404 return -ENOMEM;
405 orig = opts;
400 406
401 while ((p = strsep(&opts, ",")) != NULL) { 407 while ((p = strsep(&opts, ",")) != NULL) {
402 int token; 408 int token;
@@ -432,7 +438,7 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags,
432 } 438 }
433 439
434 out_free_opts: 440 out_free_opts:
435 kfree(opts); 441 kfree(orig);
436 out: 442 out:
437 /* 443 /*
438 * If no subvolume name is specified we use the default one. Allocate 444 * If no subvolume name is specified we use the default one. Allocate
@@ -623,6 +629,8 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
623 btrfs_wait_ordered_extents(root, 0, 0); 629 btrfs_wait_ordered_extents(root, 0, 0);
624 630
625 trans = btrfs_start_transaction(root, 0); 631 trans = btrfs_start_transaction(root, 0);
632 if (IS_ERR(trans))
633 return PTR_ERR(trans);
626 ret = btrfs_commit_transaction(trans, root); 634 ret = btrfs_commit_transaction(trans, root);
627 return ret; 635 return ret;
628} 636}
@@ -761,6 +769,8 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
761 } 769 }
762 770
763 btrfs_close_devices(fs_devices); 771 btrfs_close_devices(fs_devices);
772 kfree(fs_info);
773 kfree(tree_root);
764 } else { 774 } else {
765 char b[BDEVNAME_SIZE]; 775 char b[BDEVNAME_SIZE];
766 776
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index bae5c7b8bbe2..3d73c8d93bbb 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -1161,6 +1161,11 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
1161 INIT_DELAYED_WORK(&ac->work, do_async_commit); 1161 INIT_DELAYED_WORK(&ac->work, do_async_commit);
1162 ac->root = root; 1162 ac->root = root;
1163 ac->newtrans = btrfs_join_transaction(root, 0); 1163 ac->newtrans = btrfs_join_transaction(root, 0);
1164 if (IS_ERR(ac->newtrans)) {
1165 int err = PTR_ERR(ac->newtrans);
1166 kfree(ac);
1167 return err;
1168 }
1164 1169
1165 /* take transaction reference */ 1170 /* take transaction reference */
1166 mutex_lock(&root->fs_info->trans_mutex); 1171 mutex_lock(&root->fs_info->trans_mutex);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 054744ac5719..a4bbb854dfd2 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -338,6 +338,12 @@ static noinline int overwrite_item(struct btrfs_trans_handle *trans,
338 } 338 }
339 dst_copy = kmalloc(item_size, GFP_NOFS); 339 dst_copy = kmalloc(item_size, GFP_NOFS);
340 src_copy = kmalloc(item_size, GFP_NOFS); 340 src_copy = kmalloc(item_size, GFP_NOFS);
341 if (!dst_copy || !src_copy) {
342 btrfs_release_path(root, path);
343 kfree(dst_copy);
344 kfree(src_copy);
345 return -ENOMEM;
346 }
341 347
342 read_extent_buffer(eb, src_copy, src_ptr, item_size); 348 read_extent_buffer(eb, src_copy, src_ptr, item_size);
343 349
@@ -665,6 +671,9 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
665 btrfs_dir_item_key_to_cpu(leaf, di, &location); 671 btrfs_dir_item_key_to_cpu(leaf, di, &location);
666 name_len = btrfs_dir_name_len(leaf, di); 672 name_len = btrfs_dir_name_len(leaf, di);
667 name = kmalloc(name_len, GFP_NOFS); 673 name = kmalloc(name_len, GFP_NOFS);
674 if (!name)
675 return -ENOMEM;
676
668 read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len); 677 read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len);
669 btrfs_release_path(root, path); 678 btrfs_release_path(root, path);
670 679
@@ -744,6 +753,9 @@ static noinline int backref_in_log(struct btrfs_root *log,
744 int match = 0; 753 int match = 0;
745 754
746 path = btrfs_alloc_path(); 755 path = btrfs_alloc_path();
756 if (!path)
757 return -ENOMEM;
758
747 ret = btrfs_search_slot(NULL, log, key, path, 0, 0); 759 ret = btrfs_search_slot(NULL, log, key, path, 0, 0);
748 if (ret != 0) 760 if (ret != 0)
749 goto out; 761 goto out;
@@ -967,6 +979,8 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
967 key.offset = (u64)-1; 979 key.offset = (u64)-1;
968 980
969 path = btrfs_alloc_path(); 981 path = btrfs_alloc_path();
982 if (!path)
983 return -ENOMEM;
970 984
971 while (1) { 985 while (1) {
972 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 986 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
@@ -1178,6 +1192,9 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
1178 1192
1179 name_len = btrfs_dir_name_len(eb, di); 1193 name_len = btrfs_dir_name_len(eb, di);
1180 name = kmalloc(name_len, GFP_NOFS); 1194 name = kmalloc(name_len, GFP_NOFS);
1195 if (!name)
1196 return -ENOMEM;
1197
1181 log_type = btrfs_dir_type(eb, di); 1198 log_type = btrfs_dir_type(eb, di);
1182 read_extent_buffer(eb, name, (unsigned long)(di + 1), 1199 read_extent_buffer(eb, name, (unsigned long)(di + 1),
1183 name_len); 1200 name_len);
@@ -1692,6 +1709,8 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
1692 root_owner = btrfs_header_owner(parent); 1709 root_owner = btrfs_header_owner(parent);
1693 1710
1694 next = btrfs_find_create_tree_block(root, bytenr, blocksize); 1711 next = btrfs_find_create_tree_block(root, bytenr, blocksize);
1712 if (!next)
1713 return -ENOMEM;
1695 1714
1696 if (*level == 1) { 1715 if (*level == 1) {
1697 wc->process_func(root, next, wc, ptr_gen); 1716 wc->process_func(root, next, wc, ptr_gen);
@@ -2032,6 +2051,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2032 wait_log_commit(trans, log_root_tree, 2051 wait_log_commit(trans, log_root_tree,
2033 log_root_tree->log_transid); 2052 log_root_tree->log_transid);
2034 mutex_unlock(&log_root_tree->log_mutex); 2053 mutex_unlock(&log_root_tree->log_mutex);
2054 ret = 0;
2035 goto out; 2055 goto out;
2036 } 2056 }
2037 atomic_set(&log_root_tree->log_commit[index2], 1); 2057 atomic_set(&log_root_tree->log_commit[index2], 1);
@@ -2096,7 +2116,7 @@ out:
2096 smp_mb(); 2116 smp_mb();
2097 if (waitqueue_active(&root->log_commit_wait[index1])) 2117 if (waitqueue_active(&root->log_commit_wait[index1]))
2098 wake_up(&root->log_commit_wait[index1]); 2118 wake_up(&root->log_commit_wait[index1]);
2099 return 0; 2119 return ret;
2100} 2120}
2101 2121
2102static void free_log_tree(struct btrfs_trans_handle *trans, 2122static void free_log_tree(struct btrfs_trans_handle *trans,
@@ -2194,6 +2214,9 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
2194 2214
2195 log = root->log_root; 2215 log = root->log_root;
2196 path = btrfs_alloc_path(); 2216 path = btrfs_alloc_path();
2217 if (!path)
2218 return -ENOMEM;
2219
2197 di = btrfs_lookup_dir_item(trans, log, path, dir->i_ino, 2220 di = btrfs_lookup_dir_item(trans, log, path, dir->i_ino,
2198 name, name_len, -1); 2221 name, name_len, -1);
2199 if (IS_ERR(di)) { 2222 if (IS_ERR(di)) {
@@ -2594,6 +2617,9 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
2594 2617
2595 ins_data = kmalloc(nr * sizeof(struct btrfs_key) + 2618 ins_data = kmalloc(nr * sizeof(struct btrfs_key) +
2596 nr * sizeof(u32), GFP_NOFS); 2619 nr * sizeof(u32), GFP_NOFS);
2620 if (!ins_data)
2621 return -ENOMEM;
2622
2597 ins_sizes = (u32 *)ins_data; 2623 ins_sizes = (u32 *)ins_data;
2598 ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32)); 2624 ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32));
2599 2625
@@ -2725,7 +2751,13 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
2725 log = root->log_root; 2751 log = root->log_root;
2726 2752
2727 path = btrfs_alloc_path(); 2753 path = btrfs_alloc_path();
2754 if (!path)
2755 return -ENOMEM;
2728 dst_path = btrfs_alloc_path(); 2756 dst_path = btrfs_alloc_path();
2757 if (!dst_path) {
2758 btrfs_free_path(path);
2759 return -ENOMEM;
2760 }
2729 2761
2730 min_key.objectid = inode->i_ino; 2762 min_key.objectid = inode->i_ino;
2731 min_key.type = BTRFS_INODE_ITEM_KEY; 2763 min_key.type = BTRFS_INODE_ITEM_KEY;
@@ -3080,6 +3112,7 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
3080 BUG_ON(!path); 3112 BUG_ON(!path);
3081 3113
3082 trans = btrfs_start_transaction(fs_info->tree_root, 0); 3114 trans = btrfs_start_transaction(fs_info->tree_root, 0);
3115 BUG_ON(IS_ERR(trans));
3083 3116
3084 wc.trans = trans; 3117 wc.trans = trans;
3085 wc.pin = 1; 3118 wc.pin = 1;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index d158530233b7..9d554e8e6583 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -162,7 +162,6 @@ static noinline int run_scheduled_bios(struct btrfs_device *device)
162 struct bio *cur; 162 struct bio *cur;
163 int again = 0; 163 int again = 0;
164 unsigned long num_run; 164 unsigned long num_run;
165 unsigned long num_sync_run;
166 unsigned long batch_run = 0; 165 unsigned long batch_run = 0;
167 unsigned long limit; 166 unsigned long limit;
168 unsigned long last_waited = 0; 167 unsigned long last_waited = 0;
@@ -173,11 +172,6 @@ static noinline int run_scheduled_bios(struct btrfs_device *device)
173 limit = btrfs_async_submit_limit(fs_info); 172 limit = btrfs_async_submit_limit(fs_info);
174 limit = limit * 2 / 3; 173 limit = limit * 2 / 3;
175 174
176 /* we want to make sure that every time we switch from the sync
177 * list to the normal list, we unplug
178 */
179 num_sync_run = 0;
180
181loop: 175loop:
182 spin_lock(&device->io_lock); 176 spin_lock(&device->io_lock);
183 177
@@ -223,15 +217,6 @@ loop_lock:
223 217
224 spin_unlock(&device->io_lock); 218 spin_unlock(&device->io_lock);
225 219
226 /*
227 * if we're doing the regular priority list, make sure we unplug
228 * for any high prio bios we've sent down
229 */
230 if (pending_bios == &device->pending_bios && num_sync_run > 0) {
231 num_sync_run = 0;
232 blk_run_backing_dev(bdi, NULL);
233 }
234
235 while (pending) { 220 while (pending) {
236 221
237 rmb(); 222 rmb();
@@ -259,19 +244,11 @@ loop_lock:
259 244
260 BUG_ON(atomic_read(&cur->bi_cnt) == 0); 245 BUG_ON(atomic_read(&cur->bi_cnt) == 0);
261 246
262 if (cur->bi_rw & REQ_SYNC)
263 num_sync_run++;
264
265 submit_bio(cur->bi_rw, cur); 247 submit_bio(cur->bi_rw, cur);
266 num_run++; 248 num_run++;
267 batch_run++; 249 batch_run++;
268 if (need_resched()) { 250 if (need_resched())
269 if (num_sync_run) {
270 blk_run_backing_dev(bdi, NULL);
271 num_sync_run = 0;
272 }
273 cond_resched(); 251 cond_resched();
274 }
275 252
276 /* 253 /*
277 * we made progress, there is more work to do and the bdi 254 * we made progress, there is more work to do and the bdi
@@ -304,13 +281,8 @@ loop_lock:
304 * against it before looping 281 * against it before looping
305 */ 282 */
306 last_waited = ioc->last_waited; 283 last_waited = ioc->last_waited;
307 if (need_resched()) { 284 if (need_resched())
308 if (num_sync_run) {
309 blk_run_backing_dev(bdi, NULL);
310 num_sync_run = 0;
311 }
312 cond_resched(); 285 cond_resched();
313 }
314 continue; 286 continue;
315 } 287 }
316 spin_lock(&device->io_lock); 288 spin_lock(&device->io_lock);
@@ -323,22 +295,6 @@ loop_lock:
323 } 295 }
324 } 296 }
325 297
326 if (num_sync_run) {
327 num_sync_run = 0;
328 blk_run_backing_dev(bdi, NULL);
329 }
330 /*
331 * IO has already been through a long path to get here. Checksumming,
332 * async helper threads, perhaps compression. We've done a pretty
333 * good job of collecting a batch of IO and should just unplug
334 * the device right away.
335 *
336 * This will help anyone who is waiting on the IO, they might have
337 * already unplugged, but managed to do so before the bio they
338 * cared about found its way down here.
339 */
340 blk_run_backing_dev(bdi, NULL);
341
342 cond_resched(); 298 cond_resched();
343 if (again) 299 if (again)
344 goto loop; 300 goto loop;
@@ -1213,6 +1169,10 @@ static int btrfs_rm_dev_item(struct btrfs_root *root,
1213 return -ENOMEM; 1169 return -ENOMEM;
1214 1170
1215 trans = btrfs_start_transaction(root, 0); 1171 trans = btrfs_start_transaction(root, 0);
1172 if (IS_ERR(trans)) {
1173 btrfs_free_path(path);
1174 return PTR_ERR(trans);
1175 }
1216 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 1176 key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1217 key.type = BTRFS_DEV_ITEM_KEY; 1177 key.type = BTRFS_DEV_ITEM_KEY;
1218 key.offset = device->devid; 1178 key.offset = device->devid;
@@ -1334,11 +1294,11 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1334 1294
1335 ret = btrfs_shrink_device(device, 0); 1295 ret = btrfs_shrink_device(device, 0);
1336 if (ret) 1296 if (ret)
1337 goto error_brelse; 1297 goto error_undo;
1338 1298
1339 ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device); 1299 ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device);
1340 if (ret) 1300 if (ret)
1341 goto error_brelse; 1301 goto error_undo;
1342 1302
1343 device->in_fs_metadata = 0; 1303 device->in_fs_metadata = 0;
1344 1304
@@ -1412,6 +1372,13 @@ out:
1412 mutex_unlock(&root->fs_info->volume_mutex); 1372 mutex_unlock(&root->fs_info->volume_mutex);
1413 mutex_unlock(&uuid_mutex); 1373 mutex_unlock(&uuid_mutex);
1414 return ret; 1374 return ret;
1375error_undo:
1376 if (device->writeable) {
1377 list_add(&device->dev_alloc_list,
1378 &root->fs_info->fs_devices->alloc_list);
1379 root->fs_info->fs_devices->rw_devices++;
1380 }
1381 goto error_brelse;
1415} 1382}
1416 1383
1417/* 1384/*
@@ -1601,11 +1568,19 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1601 1568
1602 ret = find_next_devid(root, &device->devid); 1569 ret = find_next_devid(root, &device->devid);
1603 if (ret) { 1570 if (ret) {
1571 kfree(device->name);
1604 kfree(device); 1572 kfree(device);
1605 goto error; 1573 goto error;
1606 } 1574 }
1607 1575
1608 trans = btrfs_start_transaction(root, 0); 1576 trans = btrfs_start_transaction(root, 0);
1577 if (IS_ERR(trans)) {
1578 kfree(device->name);
1579 kfree(device);
1580 ret = PTR_ERR(trans);
1581 goto error;
1582 }
1583
1609 lock_chunks(root); 1584 lock_chunks(root);
1610 1585
1611 device->writeable = 1; 1586 device->writeable = 1;
@@ -1621,7 +1596,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1621 device->dev_root = root->fs_info->dev_root; 1596 device->dev_root = root->fs_info->dev_root;
1622 device->bdev = bdev; 1597 device->bdev = bdev;
1623 device->in_fs_metadata = 1; 1598 device->in_fs_metadata = 1;
1624 device->mode = 0; 1599 device->mode = FMODE_EXCL;
1625 set_blocksize(device->bdev, 4096); 1600 set_blocksize(device->bdev, 4096);
1626 1601
1627 if (seeding_dev) { 1602 if (seeding_dev) {
@@ -1873,7 +1848,7 @@ static int btrfs_relocate_chunk(struct btrfs_root *root,
1873 return ret; 1848 return ret;
1874 1849
1875 trans = btrfs_start_transaction(root, 0); 1850 trans = btrfs_start_transaction(root, 0);
1876 BUG_ON(!trans); 1851 BUG_ON(IS_ERR(trans));
1877 1852
1878 lock_chunks(root); 1853 lock_chunks(root);
1879 1854
@@ -2047,7 +2022,7 @@ int btrfs_balance(struct btrfs_root *dev_root)
2047 BUG_ON(ret); 2022 BUG_ON(ret);
2048 2023
2049 trans = btrfs_start_transaction(dev_root, 0); 2024 trans = btrfs_start_transaction(dev_root, 0);
2050 BUG_ON(!trans); 2025 BUG_ON(IS_ERR(trans));
2051 2026
2052 ret = btrfs_grow_device(trans, device, old_size); 2027 ret = btrfs_grow_device(trans, device, old_size);
2053 BUG_ON(ret); 2028 BUG_ON(ret);
@@ -2213,6 +2188,11 @@ again:
2213 2188
2214 /* Shrinking succeeded, else we would be at "done". */ 2189 /* Shrinking succeeded, else we would be at "done". */
2215 trans = btrfs_start_transaction(root, 0); 2190 trans = btrfs_start_transaction(root, 0);
2191 if (IS_ERR(trans)) {
2192 ret = PTR_ERR(trans);
2193 goto done;
2194 }
2195
2216 lock_chunks(root); 2196 lock_chunks(root);
2217 2197
2218 device->disk_total_bytes = new_size; 2198 device->disk_total_bytes = new_size;
@@ -2931,7 +2911,7 @@ static int find_live_mirror(struct map_lookup *map, int first, int num,
2931static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, 2911static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
2932 u64 logical, u64 *length, 2912 u64 logical, u64 *length,
2933 struct btrfs_multi_bio **multi_ret, 2913 struct btrfs_multi_bio **multi_ret,
2934 int mirror_num, struct page *unplug_page) 2914 int mirror_num)
2935{ 2915{
2936 struct extent_map *em; 2916 struct extent_map *em;
2937 struct map_lookup *map; 2917 struct map_lookup *map;
@@ -2963,11 +2943,6 @@ again:
2963 em = lookup_extent_mapping(em_tree, logical, *length); 2943 em = lookup_extent_mapping(em_tree, logical, *length);
2964 read_unlock(&em_tree->lock); 2944 read_unlock(&em_tree->lock);
2965 2945
2966 if (!em && unplug_page) {
2967 kfree(multi);
2968 return 0;
2969 }
2970
2971 if (!em) { 2946 if (!em) {
2972 printk(KERN_CRIT "unable to find logical %llu len %llu\n", 2947 printk(KERN_CRIT "unable to find logical %llu len %llu\n",
2973 (unsigned long long)logical, 2948 (unsigned long long)logical,
@@ -3023,13 +2998,13 @@ again:
3023 *length = em->len - offset; 2998 *length = em->len - offset;
3024 } 2999 }
3025 3000
3026 if (!multi_ret && !unplug_page) 3001 if (!multi_ret)
3027 goto out; 3002 goto out;
3028 3003
3029 num_stripes = 1; 3004 num_stripes = 1;
3030 stripe_index = 0; 3005 stripe_index = 0;
3031 if (map->type & BTRFS_BLOCK_GROUP_RAID1) { 3006 if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
3032 if (unplug_page || (rw & REQ_WRITE)) 3007 if (rw & REQ_WRITE)
3033 num_stripes = map->num_stripes; 3008 num_stripes = map->num_stripes;
3034 else if (mirror_num) 3009 else if (mirror_num)
3035 stripe_index = mirror_num - 1; 3010 stripe_index = mirror_num - 1;
@@ -3051,7 +3026,7 @@ again:
3051 stripe_index = do_div(stripe_nr, factor); 3026 stripe_index = do_div(stripe_nr, factor);
3052 stripe_index *= map->sub_stripes; 3027 stripe_index *= map->sub_stripes;
3053 3028
3054 if (unplug_page || (rw & REQ_WRITE)) 3029 if (rw & REQ_WRITE)
3055 num_stripes = map->sub_stripes; 3030 num_stripes = map->sub_stripes;
3056 else if (mirror_num) 3031 else if (mirror_num)
3057 stripe_index += mirror_num - 1; 3032 stripe_index += mirror_num - 1;
@@ -3071,22 +3046,10 @@ again:
3071 BUG_ON(stripe_index >= map->num_stripes); 3046 BUG_ON(stripe_index >= map->num_stripes);
3072 3047
3073 for (i = 0; i < num_stripes; i++) { 3048 for (i = 0; i < num_stripes; i++) {
3074 if (unplug_page) { 3049 multi->stripes[i].physical =
3075 struct btrfs_device *device; 3050 map->stripes[stripe_index].physical +
3076 struct backing_dev_info *bdi; 3051 stripe_offset + stripe_nr * map->stripe_len;
3077 3052 multi->stripes[i].dev = map->stripes[stripe_index].dev;
3078 device = map->stripes[stripe_index].dev;
3079 if (device->bdev) {
3080 bdi = blk_get_backing_dev_info(device->bdev);
3081 if (bdi->unplug_io_fn)
3082 bdi->unplug_io_fn(bdi, unplug_page);
3083 }
3084 } else {
3085 multi->stripes[i].physical =
3086 map->stripes[stripe_index].physical +
3087 stripe_offset + stripe_nr * map->stripe_len;
3088 multi->stripes[i].dev = map->stripes[stripe_index].dev;
3089 }
3090 stripe_index++; 3053 stripe_index++;
3091 } 3054 }
3092 if (multi_ret) { 3055 if (multi_ret) {
@@ -3104,7 +3067,7 @@ int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
3104 struct btrfs_multi_bio **multi_ret, int mirror_num) 3067 struct btrfs_multi_bio **multi_ret, int mirror_num)
3105{ 3068{
3106 return __btrfs_map_block(map_tree, rw, logical, length, multi_ret, 3069 return __btrfs_map_block(map_tree, rw, logical, length, multi_ret,
3107 mirror_num, NULL); 3070 mirror_num);
3108} 3071}
3109 3072
3110int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, 3073int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
@@ -3172,14 +3135,6 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
3172 return 0; 3135 return 0;
3173} 3136}
3174 3137
3175int btrfs_unplug_page(struct btrfs_mapping_tree *map_tree,
3176 u64 logical, struct page *page)
3177{
3178 u64 length = PAGE_CACHE_SIZE;
3179 return __btrfs_map_block(map_tree, READ, logical, &length,
3180 NULL, 0, page);
3181}
3182
3183static void end_bio_multi_stripe(struct bio *bio, int err) 3138static void end_bio_multi_stripe(struct bio *bio, int err)
3184{ 3139{
3185 struct btrfs_multi_bio *multi = bio->bi_private; 3140 struct btrfs_multi_bio *multi = bio->bi_private;
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index a5776531dc2b..d779cefcfd7d 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -370,7 +370,8 @@ int btrfs_removexattr(struct dentry *dentry, const char *name)
370} 370}
371 371
372int btrfs_xattr_security_init(struct btrfs_trans_handle *trans, 372int btrfs_xattr_security_init(struct btrfs_trans_handle *trans,
373 struct inode *inode, struct inode *dir) 373 struct inode *inode, struct inode *dir,
374 const struct qstr *qstr)
374{ 375{
375 int err; 376 int err;
376 size_t len; 377 size_t len;
@@ -378,7 +379,8 @@ int btrfs_xattr_security_init(struct btrfs_trans_handle *trans,
378 char *suffix; 379 char *suffix;
379 char *name; 380 char *name;
380 381
381 err = security_inode_init_security(inode, dir, &suffix, &value, &len); 382 err = security_inode_init_security(inode, dir, qstr, &suffix, &value,
383 &len);
382 if (err) { 384 if (err) {
383 if (err == -EOPNOTSUPP) 385 if (err == -EOPNOTSUPP)
384 return 0; 386 return 0;
diff --git a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h
index 7a43fd640bbb..b3cc8039134b 100644
--- a/fs/btrfs/xattr.h
+++ b/fs/btrfs/xattr.h
@@ -37,6 +37,7 @@ extern int btrfs_setxattr(struct dentry *dentry, const char *name,
37extern int btrfs_removexattr(struct dentry *dentry, const char *name); 37extern int btrfs_removexattr(struct dentry *dentry, const char *name);
38 38
39extern int btrfs_xattr_security_init(struct btrfs_trans_handle *trans, 39extern int btrfs_xattr_security_init(struct btrfs_trans_handle *trans,
40 struct inode *inode, struct inode *dir); 40 struct inode *inode, struct inode *dir,
41 const struct qstr *qstr);
41 42
42#endif /* __XATTR__ */ 43#endif /* __XATTR__ */
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index f5ec2d44150d..faccd47c6c46 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -57,7 +57,8 @@ static struct list_head *zlib_alloc_workspace(void)
57 if (!workspace) 57 if (!workspace)
58 return ERR_PTR(-ENOMEM); 58 return ERR_PTR(-ENOMEM);
59 59
60 workspace->def_strm.workspace = vmalloc(zlib_deflate_workspacesize()); 60 workspace->def_strm.workspace = vmalloc(zlib_deflate_workspacesize(
61 MAX_WBITS, MAX_MEM_LEVEL));
61 workspace->inf_strm.workspace = vmalloc(zlib_inflate_workspacesize()); 62 workspace->inf_strm.workspace = vmalloc(zlib_inflate_workspacesize());
62 workspace->buf = kmalloc(PAGE_CACHE_SIZE, GFP_NOFS); 63 workspace->buf = kmalloc(PAGE_CACHE_SIZE, GFP_NOFS);
63 if (!workspace->def_strm.workspace || 64 if (!workspace->def_strm.workspace ||
diff --git a/fs/buffer.c b/fs/buffer.c
index 2219a76e2caf..a08bb8e61c6f 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -54,23 +54,15 @@ init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
54} 54}
55EXPORT_SYMBOL(init_buffer); 55EXPORT_SYMBOL(init_buffer);
56 56
57static int sync_buffer(void *word) 57static int sleep_on_buffer(void *word)
58{ 58{
59 struct block_device *bd;
60 struct buffer_head *bh
61 = container_of(word, struct buffer_head, b_state);
62
63 smp_mb();
64 bd = bh->b_bdev;
65 if (bd)
66 blk_run_address_space(bd->bd_inode->i_mapping);
67 io_schedule(); 59 io_schedule();
68 return 0; 60 return 0;
69} 61}
70 62
71void __lock_buffer(struct buffer_head *bh) 63void __lock_buffer(struct buffer_head *bh)
72{ 64{
73 wait_on_bit_lock(&bh->b_state, BH_Lock, sync_buffer, 65 wait_on_bit_lock(&bh->b_state, BH_Lock, sleep_on_buffer,
74 TASK_UNINTERRUPTIBLE); 66 TASK_UNINTERRUPTIBLE);
75} 67}
76EXPORT_SYMBOL(__lock_buffer); 68EXPORT_SYMBOL(__lock_buffer);
@@ -90,7 +82,7 @@ EXPORT_SYMBOL(unlock_buffer);
90 */ 82 */
91void __wait_on_buffer(struct buffer_head * bh) 83void __wait_on_buffer(struct buffer_head * bh)
92{ 84{
93 wait_on_bit(&bh->b_state, BH_Lock, sync_buffer, TASK_UNINTERRUPTIBLE); 85 wait_on_bit(&bh->b_state, BH_Lock, sleep_on_buffer, TASK_UNINTERRUPTIBLE);
94} 86}
95EXPORT_SYMBOL(__wait_on_buffer); 87EXPORT_SYMBOL(__wait_on_buffer);
96 88
@@ -749,10 +741,12 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
749{ 741{
750 struct buffer_head *bh; 742 struct buffer_head *bh;
751 struct list_head tmp; 743 struct list_head tmp;
752 struct address_space *mapping, *prev_mapping = NULL; 744 struct address_space *mapping;
753 int err = 0, err2; 745 int err = 0, err2;
746 struct blk_plug plug;
754 747
755 INIT_LIST_HEAD(&tmp); 748 INIT_LIST_HEAD(&tmp);
749 blk_start_plug(&plug);
756 750
757 spin_lock(lock); 751 spin_lock(lock);
758 while (!list_empty(list)) { 752 while (!list_empty(list)) {
@@ -775,7 +769,7 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
775 * still in flight on potentially older 769 * still in flight on potentially older
776 * contents. 770 * contents.
777 */ 771 */
778 write_dirty_buffer(bh, WRITE_SYNC_PLUG); 772 write_dirty_buffer(bh, WRITE_SYNC);
779 773
780 /* 774 /*
781 * Kick off IO for the previous mapping. Note 775 * Kick off IO for the previous mapping. Note
@@ -783,16 +777,16 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
783 * wait_on_buffer() will do that for us 777 * wait_on_buffer() will do that for us
784 * through sync_buffer(). 778 * through sync_buffer().
785 */ 779 */
786 if (prev_mapping && prev_mapping != mapping)
787 blk_run_address_space(prev_mapping);
788 prev_mapping = mapping;
789
790 brelse(bh); 780 brelse(bh);
791 spin_lock(lock); 781 spin_lock(lock);
792 } 782 }
793 } 783 }
794 } 784 }
795 785
786 spin_unlock(lock);
787 blk_finish_plug(&plug);
788 spin_lock(lock);
789
796 while (!list_empty(&tmp)) { 790 while (!list_empty(&tmp)) {
797 bh = BH_ENTRY(tmp.prev); 791 bh = BH_ENTRY(tmp.prev);
798 get_bh(bh); 792 get_bh(bh);
@@ -1144,7 +1138,7 @@ __getblk_slow(struct block_device *bdev, sector_t block, int size)
1144 * inode list. 1138 * inode list.
1145 * 1139 *
1146 * mark_buffer_dirty() is atomic. It takes bh->b_page->mapping->private_lock, 1140 * mark_buffer_dirty() is atomic. It takes bh->b_page->mapping->private_lock,
1147 * mapping->tree_lock and the global inode_lock. 1141 * mapping->tree_lock and mapping->host->i_lock.
1148 */ 1142 */
1149void mark_buffer_dirty(struct buffer_head *bh) 1143void mark_buffer_dirty(struct buffer_head *bh)
1150{ 1144{
@@ -1614,14 +1608,8 @@ EXPORT_SYMBOL(unmap_underlying_metadata);
1614 * prevents this contention from occurring. 1608 * prevents this contention from occurring.
1615 * 1609 *
1616 * If block_write_full_page() is called with wbc->sync_mode == 1610 * If block_write_full_page() is called with wbc->sync_mode ==
1617 * WB_SYNC_ALL, the writes are posted using WRITE_SYNC_PLUG; this 1611 * WB_SYNC_ALL, the writes are posted using WRITE_SYNC; this
1618 * causes the writes to be flagged as synchronous writes, but the 1612 * causes the writes to be flagged as synchronous writes.
1619 * block device queue will NOT be unplugged, since usually many pages
1620 * will be pushed to the out before the higher-level caller actually
1621 * waits for the writes to be completed. The various wait functions,
1622 * such as wait_on_writeback_range() will ultimately call sync_page()
1623 * which will ultimately call blk_run_backing_dev(), which will end up
1624 * unplugging the device queue.
1625 */ 1613 */
1626static int __block_write_full_page(struct inode *inode, struct page *page, 1614static int __block_write_full_page(struct inode *inode, struct page *page,
1627 get_block_t *get_block, struct writeback_control *wbc, 1615 get_block_t *get_block, struct writeback_control *wbc,
@@ -1634,7 +1622,7 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
1634 const unsigned blocksize = 1 << inode->i_blkbits; 1622 const unsigned blocksize = 1 << inode->i_blkbits;
1635 int nr_underway = 0; 1623 int nr_underway = 0;
1636 int write_op = (wbc->sync_mode == WB_SYNC_ALL ? 1624 int write_op = (wbc->sync_mode == WB_SYNC_ALL ?
1637 WRITE_SYNC_PLUG : WRITE); 1625 WRITE_SYNC : WRITE);
1638 1626
1639 BUG_ON(!PageLocked(page)); 1627 BUG_ON(!PageLocked(page));
1640 1628
@@ -3138,17 +3126,6 @@ out:
3138} 3126}
3139EXPORT_SYMBOL(try_to_free_buffers); 3127EXPORT_SYMBOL(try_to_free_buffers);
3140 3128
3141void block_sync_page(struct page *page)
3142{
3143 struct address_space *mapping;
3144
3145 smp_mb();
3146 mapping = page_mapping(page);
3147 if (mapping)
3148 blk_run_backing_dev(mapping->backing_dev_info, page);
3149}
3150EXPORT_SYMBOL(block_sync_page);
3151
3152/* 3129/*
3153 * There are no bdflush tunables left. But distributions are 3130 * There are no bdflush tunables left. But distributions are
3154 * still running obsolete flush daemons, so we terminate them here. 3131 * still running obsolete flush daemons, so we terminate them here.
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index 42c7fafc8bfe..a0358c2189cb 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -275,6 +275,7 @@ static int cachefiles_bury_object(struct cachefiles_cache *cache,
275 bool preemptive) 275 bool preemptive)
276{ 276{
277 struct dentry *grave, *trap; 277 struct dentry *grave, *trap;
278 struct path path, path_to_graveyard;
278 char nbuffer[8 + 8 + 1]; 279 char nbuffer[8 + 8 + 1];
279 int ret; 280 int ret;
280 281
@@ -287,10 +288,18 @@ static int cachefiles_bury_object(struct cachefiles_cache *cache,
287 /* non-directories can just be unlinked */ 288 /* non-directories can just be unlinked */
288 if (!S_ISDIR(rep->d_inode->i_mode)) { 289 if (!S_ISDIR(rep->d_inode->i_mode)) {
289 _debug("unlink stale object"); 290 _debug("unlink stale object");
290 ret = vfs_unlink(dir->d_inode, rep);
291 291
292 if (preemptive) 292 path.mnt = cache->mnt;
293 cachefiles_mark_object_buried(cache, rep); 293 path.dentry = dir;
294 ret = security_path_unlink(&path, rep);
295 if (ret < 0) {
296 cachefiles_io_error(cache, "Unlink security error");
297 } else {
298 ret = vfs_unlink(dir->d_inode, rep);
299
300 if (preemptive)
301 cachefiles_mark_object_buried(cache, rep);
302 }
294 303
295 mutex_unlock(&dir->d_inode->i_mutex); 304 mutex_unlock(&dir->d_inode->i_mutex);
296 305
@@ -379,12 +388,23 @@ try_again:
379 } 388 }
380 389
381 /* attempt the rename */ 390 /* attempt the rename */
382 ret = vfs_rename(dir->d_inode, rep, cache->graveyard->d_inode, grave); 391 path.mnt = cache->mnt;
383 if (ret != 0 && ret != -ENOMEM) 392 path.dentry = dir;
384 cachefiles_io_error(cache, "Rename failed with error %d", ret); 393 path_to_graveyard.mnt = cache->mnt;
394 path_to_graveyard.dentry = cache->graveyard;
395 ret = security_path_rename(&path, rep, &path_to_graveyard, grave);
396 if (ret < 0) {
397 cachefiles_io_error(cache, "Rename security error %d", ret);
398 } else {
399 ret = vfs_rename(dir->d_inode, rep,
400 cache->graveyard->d_inode, grave);
401 if (ret != 0 && ret != -ENOMEM)
402 cachefiles_io_error(cache,
403 "Rename failed with error %d", ret);
385 404
386 if (preemptive) 405 if (preemptive)
387 cachefiles_mark_object_buried(cache, rep); 406 cachefiles_mark_object_buried(cache, rep);
407 }
388 408
389 unlock_rename(cache->graveyard, dir); 409 unlock_rename(cache->graveyard, dir);
390 dput(grave); 410 dput(grave);
@@ -448,6 +468,7 @@ int cachefiles_walk_to_object(struct cachefiles_object *parent,
448{ 468{
449 struct cachefiles_cache *cache; 469 struct cachefiles_cache *cache;
450 struct dentry *dir, *next = NULL; 470 struct dentry *dir, *next = NULL;
471 struct path path;
451 unsigned long start; 472 unsigned long start;
452 const char *name; 473 const char *name;
453 int ret, nlen; 474 int ret, nlen;
@@ -458,6 +479,7 @@ int cachefiles_walk_to_object(struct cachefiles_object *parent,
458 479
459 cache = container_of(parent->fscache.cache, 480 cache = container_of(parent->fscache.cache,
460 struct cachefiles_cache, cache); 481 struct cachefiles_cache, cache);
482 path.mnt = cache->mnt;
461 483
462 ASSERT(parent->dentry); 484 ASSERT(parent->dentry);
463 ASSERT(parent->dentry->d_inode); 485 ASSERT(parent->dentry->d_inode);
@@ -511,6 +533,10 @@ lookup_again:
511 if (ret < 0) 533 if (ret < 0)
512 goto create_error; 534 goto create_error;
513 535
536 path.dentry = dir;
537 ret = security_path_mkdir(&path, next, 0);
538 if (ret < 0)
539 goto create_error;
514 start = jiffies; 540 start = jiffies;
515 ret = vfs_mkdir(dir->d_inode, next, 0); 541 ret = vfs_mkdir(dir->d_inode, next, 0);
516 cachefiles_hist(cachefiles_mkdir_histogram, start); 542 cachefiles_hist(cachefiles_mkdir_histogram, start);
@@ -536,6 +562,10 @@ lookup_again:
536 if (ret < 0) 562 if (ret < 0)
537 goto create_error; 563 goto create_error;
538 564
565 path.dentry = dir;
566 ret = security_path_mknod(&path, next, S_IFREG, 0);
567 if (ret < 0)
568 goto create_error;
539 start = jiffies; 569 start = jiffies;
540 ret = vfs_create(dir->d_inode, next, S_IFREG, NULL); 570 ret = vfs_create(dir->d_inode, next, S_IFREG, NULL);
541 cachefiles_hist(cachefiles_create_histogram, start); 571 cachefiles_hist(cachefiles_create_histogram, start);
@@ -692,6 +722,7 @@ struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache,
692{ 722{
693 struct dentry *subdir; 723 struct dentry *subdir;
694 unsigned long start; 724 unsigned long start;
725 struct path path;
695 int ret; 726 int ret;
696 727
697 _enter(",,%s", dirname); 728 _enter(",,%s", dirname);
@@ -719,6 +750,11 @@ struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache,
719 750
720 _debug("attempt mkdir"); 751 _debug("attempt mkdir");
721 752
753 path.mnt = cache->mnt;
754 path.dentry = dir;
755 ret = security_path_mkdir(&path, subdir, 0700);
756 if (ret < 0)
757 goto mkdir_error;
722 ret = vfs_mkdir(dir->d_inode, subdir, 0700); 758 ret = vfs_mkdir(dir->d_inode, subdir, 0700);
723 if (ret < 0) 759 if (ret < 0)
724 goto mkdir_error; 760 goto mkdir_error;
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 60d27bc9eb83..6b61ded701e1 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1560,9 +1560,10 @@ retry_locked:
1560 /* NOTE: no side-effects allowed, until we take s_mutex */ 1560 /* NOTE: no side-effects allowed, until we take s_mutex */
1561 1561
1562 revoking = cap->implemented & ~cap->issued; 1562 revoking = cap->implemented & ~cap->issued;
1563 if (revoking) 1563 dout(" mds%d cap %p issued %s implemented %s revoking %s\n",
1564 dout(" mds%d revoking %s\n", cap->mds, 1564 cap->mds, cap, ceph_cap_string(cap->issued),
1565 ceph_cap_string(revoking)); 1565 ceph_cap_string(cap->implemented),
1566 ceph_cap_string(revoking));
1566 1567
1567 if (cap == ci->i_auth_cap && 1568 if (cap == ci->i_auth_cap &&
1568 (cap->issued & CEPH_CAP_FILE_WR)) { 1569 (cap->issued & CEPH_CAP_FILE_WR)) {
@@ -1658,6 +1659,8 @@ ack:
1658 1659
1659 if (cap == ci->i_auth_cap && ci->i_dirty_caps) 1660 if (cap == ci->i_auth_cap && ci->i_dirty_caps)
1660 flushing = __mark_caps_flushing(inode, session); 1661 flushing = __mark_caps_flushing(inode, session);
1662 else
1663 flushing = 0;
1661 1664
1662 mds = cap->mds; /* remember mds, so we don't repeat */ 1665 mds = cap->mds; /* remember mds, so we don't repeat */
1663 sent++; 1666 sent++;
@@ -1940,6 +1943,35 @@ void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
1940 } 1943 }
1941} 1944}
1942 1945
1946static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc,
1947 struct ceph_mds_session *session,
1948 struct inode *inode)
1949{
1950 struct ceph_inode_info *ci = ceph_inode(inode);
1951 struct ceph_cap *cap;
1952 int delayed = 0;
1953
1954 spin_lock(&inode->i_lock);
1955 cap = ci->i_auth_cap;
1956 dout("kick_flushing_inode_caps %p flushing %s flush_seq %lld\n", inode,
1957 ceph_cap_string(ci->i_flushing_caps), ci->i_cap_flush_seq);
1958 __ceph_flush_snaps(ci, &session, 1);
1959 if (ci->i_flushing_caps) {
1960 delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
1961 __ceph_caps_used(ci),
1962 __ceph_caps_wanted(ci),
1963 cap->issued | cap->implemented,
1964 ci->i_flushing_caps, NULL);
1965 if (delayed) {
1966 spin_lock(&inode->i_lock);
1967 __cap_delay_requeue(mdsc, ci);
1968 spin_unlock(&inode->i_lock);
1969 }
1970 } else {
1971 spin_unlock(&inode->i_lock);
1972 }
1973}
1974
1943 1975
1944/* 1976/*
1945 * Take references to capabilities we hold, so that we don't release 1977 * Take references to capabilities we hold, so that we don't release
@@ -2687,7 +2719,7 @@ static void handle_cap_import(struct ceph_mds_client *mdsc,
2687 ceph_add_cap(inode, session, cap_id, -1, 2719 ceph_add_cap(inode, session, cap_id, -1,
2688 issued, wanted, seq, mseq, realmino, CEPH_CAP_FLAG_AUTH, 2720 issued, wanted, seq, mseq, realmino, CEPH_CAP_FLAG_AUTH,
2689 NULL /* no caps context */); 2721 NULL /* no caps context */);
2690 try_flush_caps(inode, session, NULL); 2722 kick_flushing_inode_caps(mdsc, session, inode);
2691 up_read(&mdsc->snap_rwsem); 2723 up_read(&mdsc->snap_rwsem);
2692 2724
2693 /* make sure we re-request max_size, if necessary */ 2725 /* make sure we re-request max_size, if necessary */
@@ -2785,8 +2817,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
2785 case CEPH_CAP_OP_IMPORT: 2817 case CEPH_CAP_OP_IMPORT:
2786 handle_cap_import(mdsc, inode, h, session, 2818 handle_cap_import(mdsc, inode, h, session,
2787 snaptrace, snaptrace_len); 2819 snaptrace, snaptrace_len);
2788 ceph_check_caps(ceph_inode(inode), CHECK_CAPS_NODELAY, 2820 ceph_check_caps(ceph_inode(inode), 0, session);
2789 session);
2790 goto done_unlocked; 2821 goto done_unlocked;
2791 } 2822 }
2792 2823
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index 08f65faac112..0dba6915712b 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -210,8 +210,6 @@ int ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
210 if (!fsc->debugfs_congestion_kb) 210 if (!fsc->debugfs_congestion_kb)
211 goto out; 211 goto out;
212 212
213 dout("a\n");
214
215 snprintf(name, sizeof(name), "../../bdi/%s", 213 snprintf(name, sizeof(name), "../../bdi/%s",
216 dev_name(fsc->backing_dev_info.dev)); 214 dev_name(fsc->backing_dev_info.dev));
217 fsc->debugfs_bdi = 215 fsc->debugfs_bdi =
@@ -221,7 +219,6 @@ int ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
221 if (!fsc->debugfs_bdi) 219 if (!fsc->debugfs_bdi)
222 goto out; 220 goto out;
223 221
224 dout("b\n");
225 fsc->debugfs_mdsmap = debugfs_create_file("mdsmap", 222 fsc->debugfs_mdsmap = debugfs_create_file("mdsmap",
226 0600, 223 0600,
227 fsc->client->debugfs_dir, 224 fsc->client->debugfs_dir,
@@ -230,7 +227,6 @@ int ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
230 if (!fsc->debugfs_mdsmap) 227 if (!fsc->debugfs_mdsmap)
231 goto out; 228 goto out;
232 229
233 dout("ca\n");
234 fsc->debugfs_mdsc = debugfs_create_file("mdsc", 230 fsc->debugfs_mdsc = debugfs_create_file("mdsc",
235 0600, 231 0600,
236 fsc->client->debugfs_dir, 232 fsc->client->debugfs_dir,
@@ -239,7 +235,6 @@ int ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
239 if (!fsc->debugfs_mdsc) 235 if (!fsc->debugfs_mdsc)
240 goto out; 236 goto out;
241 237
242 dout("da\n");
243 fsc->debugfs_caps = debugfs_create_file("caps", 238 fsc->debugfs_caps = debugfs_create_file("caps",
244 0400, 239 0400,
245 fsc->client->debugfs_dir, 240 fsc->client->debugfs_dir,
@@ -248,7 +243,6 @@ int ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
248 if (!fsc->debugfs_caps) 243 if (!fsc->debugfs_caps)
249 goto out; 244 goto out;
250 245
251 dout("ea\n");
252 fsc->debugfs_dentry_lru = debugfs_create_file("dentry_lru", 246 fsc->debugfs_dentry_lru = debugfs_create_file("dentry_lru",
253 0600, 247 0600,
254 fsc->client->debugfs_dir, 248 fsc->client->debugfs_dir,
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 0bc68de8edd7..1a867a3601ae 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -161,7 +161,7 @@ more:
161 filp->f_pos = di->offset; 161 filp->f_pos = di->offset;
162 err = filldir(dirent, dentry->d_name.name, 162 err = filldir(dirent, dentry->d_name.name,
163 dentry->d_name.len, di->offset, 163 dentry->d_name.len, di->offset,
164 dentry->d_inode->i_ino, 164 ceph_translate_ino(dentry->d_sb, dentry->d_inode->i_ino),
165 dentry->d_inode->i_mode >> 12); 165 dentry->d_inode->i_mode >> 12);
166 166
167 if (last) { 167 if (last) {
@@ -245,15 +245,17 @@ static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir)
245 245
246 dout("readdir off 0 -> '.'\n"); 246 dout("readdir off 0 -> '.'\n");
247 if (filldir(dirent, ".", 1, ceph_make_fpos(0, 0), 247 if (filldir(dirent, ".", 1, ceph_make_fpos(0, 0),
248 inode->i_ino, inode->i_mode >> 12) < 0) 248 ceph_translate_ino(inode->i_sb, inode->i_ino),
249 inode->i_mode >> 12) < 0)
249 return 0; 250 return 0;
250 filp->f_pos = 1; 251 filp->f_pos = 1;
251 off = 1; 252 off = 1;
252 } 253 }
253 if (filp->f_pos == 1) { 254 if (filp->f_pos == 1) {
255 ino_t ino = filp->f_dentry->d_parent->d_inode->i_ino;
254 dout("readdir off 1 -> '..'\n"); 256 dout("readdir off 1 -> '..'\n");
255 if (filldir(dirent, "..", 2, ceph_make_fpos(0, 1), 257 if (filldir(dirent, "..", 2, ceph_make_fpos(0, 1),
256 filp->f_dentry->d_parent->d_inode->i_ino, 258 ceph_translate_ino(inode->i_sb, ino),
257 inode->i_mode >> 12) < 0) 259 inode->i_mode >> 12) < 0)
258 return 0; 260 return 0;
259 filp->f_pos = 2; 261 filp->f_pos = 2;
@@ -377,7 +379,8 @@ more:
377 if (filldir(dirent, 379 if (filldir(dirent,
378 rinfo->dir_dname[off - fi->offset], 380 rinfo->dir_dname[off - fi->offset],
379 rinfo->dir_dname_len[off - fi->offset], 381 rinfo->dir_dname_len[off - fi->offset],
380 pos, ino, ftype) < 0) { 382 pos,
383 ceph_translate_ino(inode->i_sb, ino), ftype) < 0) {
381 dout("filldir stopping us...\n"); 384 dout("filldir stopping us...\n");
382 return 0; 385 return 0;
383 } 386 }
@@ -409,7 +412,7 @@ more:
409 spin_lock(&inode->i_lock); 412 spin_lock(&inode->i_lock);
410 if (ci->i_release_count == fi->dir_release_count) { 413 if (ci->i_release_count == fi->dir_release_count) {
411 dout(" marking %p complete\n", inode); 414 dout(" marking %p complete\n", inode);
412 ci->i_ceph_flags |= CEPH_I_COMPLETE; 415 /* ci->i_ceph_flags |= CEPH_I_COMPLETE; */
413 ci->i_max_offset = filp->f_pos; 416 ci->i_max_offset = filp->f_pos;
414 } 417 }
415 spin_unlock(&inode->i_lock); 418 spin_unlock(&inode->i_lock);
@@ -496,6 +499,7 @@ struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
496 499
497 /* .snap dir? */ 500 /* .snap dir? */
498 if (err == -ENOENT && 501 if (err == -ENOENT &&
502 ceph_snap(parent) == CEPH_NOSNAP &&
499 strcmp(dentry->d_name.name, 503 strcmp(dentry->d_name.name,
500 fsc->mount_options->snapdir_name) == 0) { 504 fsc->mount_options->snapdir_name) == 0) {
501 struct inode *inode = ceph_get_snapdir(parent); 505 struct inode *inode = ceph_get_snapdir(parent);
@@ -992,7 +996,7 @@ static int ceph_d_revalidate(struct dentry *dentry, struct nameidata *nd)
992{ 996{
993 struct inode *dir; 997 struct inode *dir;
994 998
995 if (nd->flags & LOOKUP_RCU) 999 if (nd && nd->flags & LOOKUP_RCU)
996 return -ECHILD; 1000 return -ECHILD;
997 1001
998 dir = dentry->d_parent->d_inode; 1002 dir = dentry->d_parent->d_inode;
@@ -1023,34 +1027,13 @@ out_touch:
1023} 1027}
1024 1028
1025/* 1029/*
1026 * When a dentry is released, clear the dir I_COMPLETE if it was part 1030 * Release our ceph_dentry_info.
1027 * of the current dir gen or if this is in the snapshot namespace.
1028 */ 1031 */
1029static void ceph_dentry_release(struct dentry *dentry) 1032static void ceph_d_release(struct dentry *dentry)
1030{ 1033{
1031 struct ceph_dentry_info *di = ceph_dentry(dentry); 1034 struct ceph_dentry_info *di = ceph_dentry(dentry);
1032 struct inode *parent_inode = NULL;
1033 u64 snapid = CEPH_NOSNAP;
1034 1035
1035 if (!IS_ROOT(dentry)) { 1036 dout("d_release %p\n", dentry);
1036 parent_inode = dentry->d_parent->d_inode;
1037 if (parent_inode)
1038 snapid = ceph_snap(parent_inode);
1039 }
1040 dout("dentry_release %p parent %p\n", dentry, parent_inode);
1041 if (parent_inode && snapid != CEPH_SNAPDIR) {
1042 struct ceph_inode_info *ci = ceph_inode(parent_inode);
1043
1044 spin_lock(&parent_inode->i_lock);
1045 if (ci->i_shared_gen == di->lease_shared_gen ||
1046 snapid <= CEPH_MAXSNAP) {
1047 dout(" clearing %p complete (d_release)\n",
1048 parent_inode);
1049 ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
1050 ci->i_release_count++;
1051 }
1052 spin_unlock(&parent_inode->i_lock);
1053 }
1054 if (di) { 1037 if (di) {
1055 ceph_dentry_lru_del(dentry); 1038 ceph_dentry_lru_del(dentry);
1056 if (di->lease_session) 1039 if (di->lease_session)
@@ -1275,14 +1258,14 @@ const struct inode_operations ceph_dir_iops = {
1275 1258
1276const struct dentry_operations ceph_dentry_ops = { 1259const struct dentry_operations ceph_dentry_ops = {
1277 .d_revalidate = ceph_d_revalidate, 1260 .d_revalidate = ceph_d_revalidate,
1278 .d_release = ceph_dentry_release, 1261 .d_release = ceph_d_release,
1279}; 1262};
1280 1263
1281const struct dentry_operations ceph_snapdir_dentry_ops = { 1264const struct dentry_operations ceph_snapdir_dentry_ops = {
1282 .d_revalidate = ceph_snapdir_d_revalidate, 1265 .d_revalidate = ceph_snapdir_d_revalidate,
1283 .d_release = ceph_dentry_release, 1266 .d_release = ceph_d_release,
1284}; 1267};
1285 1268
1286const struct dentry_operations ceph_snap_dentry_ops = { 1269const struct dentry_operations ceph_snap_dentry_ops = {
1287 .d_release = ceph_dentry_release, 1270 .d_release = ceph_d_release,
1288}; 1271};
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 7d0e4a82d898..159b512d5a27 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -564,11 +564,19 @@ more:
564 * start_request so that a tid has been assigned. 564 * start_request so that a tid has been assigned.
565 */ 565 */
566 spin_lock(&ci->i_unsafe_lock); 566 spin_lock(&ci->i_unsafe_lock);
567 list_add(&req->r_unsafe_item, &ci->i_unsafe_writes); 567 list_add_tail(&req->r_unsafe_item,
568 &ci->i_unsafe_writes);
568 spin_unlock(&ci->i_unsafe_lock); 569 spin_unlock(&ci->i_unsafe_lock);
569 ceph_get_cap_refs(ci, CEPH_CAP_FILE_WR); 570 ceph_get_cap_refs(ci, CEPH_CAP_FILE_WR);
570 } 571 }
572
571 ret = ceph_osdc_wait_request(&fsc->client->osdc, req); 573 ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
574 if (ret < 0 && req->r_safe_callback) {
575 spin_lock(&ci->i_unsafe_lock);
576 list_del_init(&req->r_unsafe_item);
577 spin_unlock(&ci->i_unsafe_lock);
578 ceph_put_cap_refs(ci, CEPH_CAP_FILE_WR);
579 }
572 } 580 }
573 581
574 if (file->f_flags & O_DIRECT) 582 if (file->f_flags & O_DIRECT)
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index e835eff551e3..b54c97da1c43 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -36,6 +36,13 @@ static void ceph_vmtruncate_work(struct work_struct *work);
36/* 36/*
37 * find or create an inode, given the ceph ino number 37 * find or create an inode, given the ceph ino number
38 */ 38 */
39static int ceph_set_ino_cb(struct inode *inode, void *data)
40{
41 ceph_inode(inode)->i_vino = *(struct ceph_vino *)data;
42 inode->i_ino = ceph_vino_to_ino(*(struct ceph_vino *)data);
43 return 0;
44}
45
39struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino) 46struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino)
40{ 47{
41 struct inode *inode; 48 struct inode *inode;
@@ -707,13 +714,9 @@ static int fill_inode(struct inode *inode,
707 (issued & CEPH_CAP_FILE_EXCL) == 0 && 714 (issued & CEPH_CAP_FILE_EXCL) == 0 &&
708 (ci->i_ceph_flags & CEPH_I_COMPLETE) == 0) { 715 (ci->i_ceph_flags & CEPH_I_COMPLETE) == 0) {
709 dout(" marking %p complete (empty)\n", inode); 716 dout(" marking %p complete (empty)\n", inode);
710 ci->i_ceph_flags |= CEPH_I_COMPLETE; 717 /* ci->i_ceph_flags |= CEPH_I_COMPLETE; */
711 ci->i_max_offset = 2; 718 ci->i_max_offset = 2;
712 } 719 }
713
714 /* it may be better to set st_size in getattr instead? */
715 if (ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb), RBYTES))
716 inode->i_size = ci->i_rbytes;
717 break; 720 break;
718 default: 721 default:
719 pr_err("fill_inode %llx.%llx BAD mode 0%o\n", 722 pr_err("fill_inode %llx.%llx BAD mode 0%o\n",
@@ -1034,9 +1037,6 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
1034 dout("fill_trace doing d_move %p -> %p\n", 1037 dout("fill_trace doing d_move %p -> %p\n",
1035 req->r_old_dentry, dn); 1038 req->r_old_dentry, dn);
1036 1039
1037 /* d_move screws up d_subdirs order */
1038 ceph_i_clear(dir, CEPH_I_COMPLETE);
1039
1040 d_move(req->r_old_dentry, dn); 1040 d_move(req->r_old_dentry, dn);
1041 dout(" src %p '%.*s' dst %p '%.*s'\n", 1041 dout(" src %p '%.*s' dst %p '%.*s'\n",
1042 req->r_old_dentry, 1042 req->r_old_dentry,
@@ -1048,12 +1048,15 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
1048 rehashing bug in vfs_rename_dir */ 1048 rehashing bug in vfs_rename_dir */
1049 ceph_invalidate_dentry_lease(dn); 1049 ceph_invalidate_dentry_lease(dn);
1050 1050
1051 /* take overwritten dentry's readdir offset */ 1051 /*
1052 dout("dn %p gets %p offset %lld (old offset %lld)\n", 1052 * d_move() puts the renamed dentry at the end of
1053 req->r_old_dentry, dn, ceph_dentry(dn)->offset, 1053 * d_subdirs. We need to assign it an appropriate
1054 * directory offset so we can behave when holding
1055 * I_COMPLETE.
1056 */
1057 ceph_set_dentry_offset(req->r_old_dentry);
1058 dout("dn %p gets new offset %lld\n", req->r_old_dentry,
1054 ceph_dentry(req->r_old_dentry)->offset); 1059 ceph_dentry(req->r_old_dentry)->offset);
1055 ceph_dentry(req->r_old_dentry)->offset =
1056 ceph_dentry(dn)->offset;
1057 1060
1058 dn = req->r_old_dentry; /* use old_dentry */ 1061 dn = req->r_old_dentry; /* use old_dentry */
1059 in = dn->d_inode; 1062 in = dn->d_inode;
@@ -1813,13 +1816,17 @@ int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry,
1813 err = ceph_do_getattr(inode, CEPH_STAT_CAP_INODE_ALL); 1816 err = ceph_do_getattr(inode, CEPH_STAT_CAP_INODE_ALL);
1814 if (!err) { 1817 if (!err) {
1815 generic_fillattr(inode, stat); 1818 generic_fillattr(inode, stat);
1816 stat->ino = inode->i_ino; 1819 stat->ino = ceph_translate_ino(inode->i_sb, inode->i_ino);
1817 if (ceph_snap(inode) != CEPH_NOSNAP) 1820 if (ceph_snap(inode) != CEPH_NOSNAP)
1818 stat->dev = ceph_snap(inode); 1821 stat->dev = ceph_snap(inode);
1819 else 1822 else
1820 stat->dev = 0; 1823 stat->dev = 0;
1821 if (S_ISDIR(inode->i_mode)) { 1824 if (S_ISDIR(inode->i_mode)) {
1822 stat->size = ci->i_rbytes; 1825 if (ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb),
1826 RBYTES))
1827 stat->size = ci->i_rbytes;
1828 else
1829 stat->size = ci->i_files + ci->i_subdirs;
1823 stat->blocks = 0; 1830 stat->blocks = 0;
1824 stat->blksize = 65536; 1831 stat->blksize = 65536;
1825 } 1832 }
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 1e30d194a8e3..a1ee8fa3a8e7 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -693,9 +693,11 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
693 dout("choose_mds %p %llx.%llx " 693 dout("choose_mds %p %llx.%llx "
694 "frag %u mds%d (%d/%d)\n", 694 "frag %u mds%d (%d/%d)\n",
695 inode, ceph_vinop(inode), 695 inode, ceph_vinop(inode),
696 frag.frag, frag.mds, 696 frag.frag, mds,
697 (int)r, frag.ndist); 697 (int)r, frag.ndist);
698 return mds; 698 if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >=
699 CEPH_MDS_STATE_ACTIVE)
700 return mds;
699 } 701 }
700 702
701 /* since this file/dir wasn't known to be 703 /* since this file/dir wasn't known to be
@@ -708,7 +710,9 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
708 dout("choose_mds %p %llx.%llx " 710 dout("choose_mds %p %llx.%llx "
709 "frag %u mds%d (auth)\n", 711 "frag %u mds%d (auth)\n",
710 inode, ceph_vinop(inode), frag.frag, mds); 712 inode, ceph_vinop(inode), frag.frag, mds);
711 return mds; 713 if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >=
714 CEPH_MDS_STATE_ACTIVE)
715 return mds;
712 } 716 }
713 } 717 }
714 } 718 }
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index 39c243acd062..f40b9139e437 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -584,10 +584,14 @@ static void queue_realm_cap_snaps(struct ceph_snap_realm *realm)
584 if (lastinode) 584 if (lastinode)
585 iput(lastinode); 585 iput(lastinode);
586 586
587 dout("queue_realm_cap_snaps %p %llx children\n", realm, realm->ino); 587 list_for_each_entry(child, &realm->children, child_item) {
588 list_for_each_entry(child, &realm->children, child_item) 588 dout("queue_realm_cap_snaps %p %llx queue child %p %llx\n",
589 queue_realm_cap_snaps(child); 589 realm, realm->ino, child, child->ino);
590 list_del_init(&child->dirty_item);
591 list_add(&child->dirty_item, &realm->dirty_item);
592 }
590 593
594 list_del_init(&realm->dirty_item);
591 dout("queue_realm_cap_snaps %p %llx done\n", realm, realm->ino); 595 dout("queue_realm_cap_snaps %p %llx done\n", realm, realm->ino);
592} 596}
593 597
@@ -683,7 +687,9 @@ more:
683 * queue cap snaps _after_ we've built the new snap contexts, 687 * queue cap snaps _after_ we've built the new snap contexts,
684 * so that i_head_snapc can be set appropriately. 688 * so that i_head_snapc can be set appropriately.
685 */ 689 */
686 list_for_each_entry(realm, &dirty_realms, dirty_item) { 690 while (!list_empty(&dirty_realms)) {
691 realm = list_first_entry(&dirty_realms, struct ceph_snap_realm,
692 dirty_item);
687 queue_realm_cap_snaps(realm); 693 queue_realm_cap_snaps(realm);
688 } 694 }
689 695
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index bf6f0f34082a..a9e78b4a258c 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -131,6 +131,7 @@ enum {
131 Opt_rbytes, 131 Opt_rbytes,
132 Opt_norbytes, 132 Opt_norbytes,
133 Opt_noasyncreaddir, 133 Opt_noasyncreaddir,
134 Opt_ino32,
134}; 135};
135 136
136static match_table_t fsopt_tokens = { 137static match_table_t fsopt_tokens = {
@@ -150,6 +151,7 @@ static match_table_t fsopt_tokens = {
150 {Opt_rbytes, "rbytes"}, 151 {Opt_rbytes, "rbytes"},
151 {Opt_norbytes, "norbytes"}, 152 {Opt_norbytes, "norbytes"},
152 {Opt_noasyncreaddir, "noasyncreaddir"}, 153 {Opt_noasyncreaddir, "noasyncreaddir"},
154 {Opt_ino32, "ino32"},
153 {-1, NULL} 155 {-1, NULL}
154}; 156};
155 157
@@ -225,6 +227,9 @@ static int parse_fsopt_token(char *c, void *private)
225 case Opt_noasyncreaddir: 227 case Opt_noasyncreaddir:
226 fsopt->flags |= CEPH_MOUNT_OPT_NOASYNCREADDIR; 228 fsopt->flags |= CEPH_MOUNT_OPT_NOASYNCREADDIR;
227 break; 229 break;
230 case Opt_ino32:
231 fsopt->flags |= CEPH_MOUNT_OPT_INO32;
232 break;
228 default: 233 default:
229 BUG_ON(token); 234 BUG_ON(token);
230 } 235 }
@@ -288,8 +293,10 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt,
288 fsopt->sb_flags = flags; 293 fsopt->sb_flags = flags;
289 fsopt->flags = CEPH_MOUNT_OPT_DEFAULT; 294 fsopt->flags = CEPH_MOUNT_OPT_DEFAULT;
290 295
291 fsopt->rsize = CEPH_MOUNT_RSIZE_DEFAULT; 296 fsopt->rsize = CEPH_RSIZE_DEFAULT;
292 fsopt->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL); 297 fsopt->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL);
298 fsopt->caps_wanted_delay_min = CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT;
299 fsopt->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT;
293 fsopt->cap_release_safety = CEPH_CAP_RELEASE_SAFETY_DEFAULT; 300 fsopt->cap_release_safety = CEPH_CAP_RELEASE_SAFETY_DEFAULT;
294 fsopt->max_readdir = CEPH_MAX_READDIR_DEFAULT; 301 fsopt->max_readdir = CEPH_MAX_READDIR_DEFAULT;
295 fsopt->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT; 302 fsopt->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT;
@@ -368,7 +375,7 @@ static int ceph_show_options(struct seq_file *m, struct vfsmount *mnt)
368 375
369 if (fsopt->wsize) 376 if (fsopt->wsize)
370 seq_printf(m, ",wsize=%d", fsopt->wsize); 377 seq_printf(m, ",wsize=%d", fsopt->wsize);
371 if (fsopt->rsize != CEPH_MOUNT_RSIZE_DEFAULT) 378 if (fsopt->rsize != CEPH_RSIZE_DEFAULT)
372 seq_printf(m, ",rsize=%d", fsopt->rsize); 379 seq_printf(m, ",rsize=%d", fsopt->rsize);
373 if (fsopt->congestion_kb != default_congestion_kb()) 380 if (fsopt->congestion_kb != default_congestion_kb())
374 seq_printf(m, ",write_congestion_kb=%d", fsopt->congestion_kb); 381 seq_printf(m, ",write_congestion_kb=%d", fsopt->congestion_kb);
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 20b907d76ae2..619fe719968f 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -27,6 +27,7 @@
27#define CEPH_MOUNT_OPT_DIRSTAT (1<<4) /* `cat dirname` for stats */ 27#define CEPH_MOUNT_OPT_DIRSTAT (1<<4) /* `cat dirname` for stats */
28#define CEPH_MOUNT_OPT_RBYTES (1<<5) /* dir st_bytes = rbytes */ 28#define CEPH_MOUNT_OPT_RBYTES (1<<5) /* dir st_bytes = rbytes */
29#define CEPH_MOUNT_OPT_NOASYNCREADDIR (1<<7) /* no dcache readdir */ 29#define CEPH_MOUNT_OPT_NOASYNCREADDIR (1<<7) /* no dcache readdir */
30#define CEPH_MOUNT_OPT_INO32 (1<<8) /* 32 bit inos */
30 31
31#define CEPH_MOUNT_OPT_DEFAULT (CEPH_MOUNT_OPT_RBYTES) 32#define CEPH_MOUNT_OPT_DEFAULT (CEPH_MOUNT_OPT_RBYTES)
32 33
@@ -35,6 +36,7 @@
35#define ceph_test_mount_opt(fsc, opt) \ 36#define ceph_test_mount_opt(fsc, opt) \
36 (!!((fsc)->mount_options->flags & CEPH_MOUNT_OPT_##opt)) 37 (!!((fsc)->mount_options->flags & CEPH_MOUNT_OPT_##opt))
37 38
39#define CEPH_RSIZE_DEFAULT (512*1024) /* readahead */
38#define CEPH_MAX_READDIR_DEFAULT 1024 40#define CEPH_MAX_READDIR_DEFAULT 1024
39#define CEPH_MAX_READDIR_BYTES_DEFAULT (512*1024) 41#define CEPH_MAX_READDIR_BYTES_DEFAULT (512*1024)
40#define CEPH_SNAPDIRNAME_DEFAULT ".snap" 42#define CEPH_SNAPDIRNAME_DEFAULT ".snap"
@@ -319,6 +321,16 @@ static inline struct ceph_inode_info *ceph_inode(struct inode *inode)
319 return container_of(inode, struct ceph_inode_info, vfs_inode); 321 return container_of(inode, struct ceph_inode_info, vfs_inode);
320} 322}
321 323
324static inline struct ceph_fs_client *ceph_inode_to_client(struct inode *inode)
325{
326 return (struct ceph_fs_client *)inode->i_sb->s_fs_info;
327}
328
329static inline struct ceph_fs_client *ceph_sb_to_client(struct super_block *sb)
330{
331 return (struct ceph_fs_client *)sb->s_fs_info;
332}
333
322static inline struct ceph_vino ceph_vino(struct inode *inode) 334static inline struct ceph_vino ceph_vino(struct inode *inode)
323{ 335{
324 return ceph_inode(inode)->i_vino; 336 return ceph_inode(inode)->i_vino;
@@ -327,19 +339,49 @@ static inline struct ceph_vino ceph_vino(struct inode *inode)
327/* 339/*
328 * ino_t is <64 bits on many architectures, blech. 340 * ino_t is <64 bits on many architectures, blech.
329 * 341 *
330 * don't include snap in ino hash, at least for now. 342 * i_ino (kernel inode) st_ino (userspace)
343 * i386 32 32
344 * x86_64+ino32 64 32
345 * x86_64 64 64
346 */
347static inline u32 ceph_ino_to_ino32(ino_t ino)
348{
349 ino ^= ino >> (sizeof(ino) * 8 - 32);
350 if (!ino)
351 ino = 1;
352 return ino;
353}
354
355/*
356 * kernel i_ino value
331 */ 357 */
332static inline ino_t ceph_vino_to_ino(struct ceph_vino vino) 358static inline ino_t ceph_vino_to_ino(struct ceph_vino vino)
333{ 359{
334 ino_t ino = (ino_t)vino.ino; /* ^ (vino.snap << 20); */ 360 ino_t ino = (ino_t)vino.ino; /* ^ (vino.snap << 20); */
335#if BITS_PER_LONG == 32 361#if BITS_PER_LONG == 32
336 ino ^= vino.ino >> (sizeof(u64)-sizeof(ino_t)) * 8; 362 ino = ceph_ino_to_ino32(ino);
337 if (!ino)
338 ino = 1;
339#endif 363#endif
340 return ino; 364 return ino;
341} 365}
342 366
367/*
368 * user-visible ino (stat, filldir)
369 */
370#if BITS_PER_LONG == 32
371static inline ino_t ceph_translate_ino(struct super_block *sb, ino_t ino)
372{
373 return ino;
374}
375#else
376static inline ino_t ceph_translate_ino(struct super_block *sb, ino_t ino)
377{
378 if (ceph_test_mount_opt(ceph_sb_to_client(sb), INO32))
379 ino = ceph_ino_to_ino32(ino);
380 return ino;
381}
382#endif
383
384
343/* for printf-style formatting */ 385/* for printf-style formatting */
344#define ceph_vinop(i) ceph_inode(i)->i_vino.ino, ceph_inode(i)->i_vino.snap 386#define ceph_vinop(i) ceph_inode(i)->i_vino.ino, ceph_inode(i)->i_vino.snap
345 387
@@ -428,13 +470,6 @@ static inline loff_t ceph_make_fpos(unsigned frag, unsigned off)
428 return ((loff_t)frag << 32) | (loff_t)off; 470 return ((loff_t)frag << 32) | (loff_t)off;
429} 471}
430 472
431static inline int ceph_set_ino_cb(struct inode *inode, void *data)
432{
433 ceph_inode(inode)->i_vino = *(struct ceph_vino *)data;
434 inode->i_ino = ceph_vino_to_ino(*(struct ceph_vino *)data);
435 return 0;
436}
437
438/* 473/*
439 * caps helpers 474 * caps helpers
440 */ 475 */
@@ -503,15 +538,6 @@ extern void ceph_reservation_status(struct ceph_fs_client *client,
503 int *total, int *avail, int *used, 538 int *total, int *avail, int *used,
504 int *reserved, int *min); 539 int *reserved, int *min);
505 540
506static inline struct ceph_fs_client *ceph_inode_to_client(struct inode *inode)
507{
508 return (struct ceph_fs_client *)inode->i_sb->s_fs_info;
509}
510
511static inline struct ceph_fs_client *ceph_sb_to_client(struct super_block *sb)
512{
513 return (struct ceph_fs_client *)sb->s_fs_info;
514}
515 541
516 542
517/* 543/*
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 6e12a6ba5f79..8c9eba6ef9df 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -219,6 +219,7 @@ static struct ceph_inode_xattr *__get_xattr(struct ceph_inode_info *ci,
219 struct rb_node **p; 219 struct rb_node **p;
220 struct rb_node *parent = NULL; 220 struct rb_node *parent = NULL;
221 struct ceph_inode_xattr *xattr = NULL; 221 struct ceph_inode_xattr *xattr = NULL;
222 int name_len = strlen(name);
222 int c; 223 int c;
223 224
224 p = &ci->i_xattrs.index.rb_node; 225 p = &ci->i_xattrs.index.rb_node;
@@ -226,6 +227,8 @@ static struct ceph_inode_xattr *__get_xattr(struct ceph_inode_info *ci,
226 parent = *p; 227 parent = *p;
227 xattr = rb_entry(parent, struct ceph_inode_xattr, node); 228 xattr = rb_entry(parent, struct ceph_inode_xattr, node);
228 c = strncmp(name, xattr->name, xattr->name_len); 229 c = strncmp(name, xattr->name, xattr->name_len);
230 if (c == 0 && name_len > xattr->name_len)
231 c = 1;
229 if (c < 0) 232 if (c < 0)
230 p = &(*p)->rb_left; 233 p = &(*p)->rb_left;
231 else if (c > 0) 234 else if (c > 0)
diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig
index ee45648b0d1a..7cb0f7f847e4 100644
--- a/fs/cifs/Kconfig
+++ b/fs/cifs/Kconfig
@@ -3,6 +3,7 @@ config CIFS
3 depends on INET 3 depends on INET
4 select NLS 4 select NLS
5 select CRYPTO 5 select CRYPTO
6 select CRYPTO_MD4
6 select CRYPTO_MD5 7 select CRYPTO_MD5
7 select CRYPTO_HMAC 8 select CRYPTO_HMAC
8 select CRYPTO_ARC4 9 select CRYPTO_ARC4
diff --git a/fs/cifs/Makefile b/fs/cifs/Makefile
index 43b19dd39191..d87558448e3d 100644
--- a/fs/cifs/Makefile
+++ b/fs/cifs/Makefile
@@ -5,7 +5,7 @@ obj-$(CONFIG_CIFS) += cifs.o
5 5
6cifs-y := cifsfs.o cifssmb.o cifs_debug.o connect.o dir.o file.o inode.o \ 6cifs-y := cifsfs.o cifssmb.o cifs_debug.o connect.o dir.o file.o inode.o \
7 link.o misc.o netmisc.o smbdes.o smbencrypt.o transport.o asn1.o \ 7 link.o misc.o netmisc.o smbdes.o smbencrypt.o transport.o asn1.o \
8 md4.o md5.o cifs_unicode.o nterr.o xattr.o cifsencrypt.o \ 8 cifs_unicode.o nterr.o xattr.o cifsencrypt.o \
9 readdir.o ioctl.o sess.o export.o 9 readdir.o ioctl.o sess.o export.o
10 10
11cifs-$(CONFIG_CIFS_ACL) += cifsacl.o 11cifs-$(CONFIG_CIFS_ACL) += cifsacl.o
diff --git a/fs/cifs/README b/fs/cifs/README
index 46af99ab3614..fe1683590828 100644
--- a/fs/cifs/README
+++ b/fs/cifs/README
@@ -452,6 +452,11 @@ A partial list of the supported mount options follows:
452 if oplock (caching token) is granted and held. Note that 452 if oplock (caching token) is granted and held. Note that
453 direct allows write operations larger than page size 453 direct allows write operations larger than page size
454 to be sent to the server. 454 to be sent to the server.
455 strictcache Use for switching on strict cache mode. In this mode the
456 client read from the cache all the time it has Oplock Level II,
457 otherwise - read from the server. All written data are stored
458 in the cache, but if the client doesn't have Exclusive Oplock,
459 it writes the data to the server.
455 acl Allow setfacl and getfacl to manage posix ACLs if server 460 acl Allow setfacl and getfacl to manage posix ACLs if server
456 supports them. (default) 461 supports them. (default)
457 noacl Do not allow setfacl and getfacl calls on this mount 462 noacl Do not allow setfacl and getfacl calls on this mount
diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index 7ed36536e754..0a265ad9e426 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -282,8 +282,6 @@ static struct vfsmount *cifs_dfs_do_automount(struct dentry *mntpt)
282 cFYI(1, "in %s", __func__); 282 cFYI(1, "in %s", __func__);
283 BUG_ON(IS_ROOT(mntpt)); 283 BUG_ON(IS_ROOT(mntpt));
284 284
285 xid = GetXid();
286
287 /* 285 /*
288 * The MSDFS spec states that paths in DFS referral requests and 286 * The MSDFS spec states that paths in DFS referral requests and
289 * responses must be prefixed by a single '\' character instead of 287 * responses must be prefixed by a single '\' character instead of
@@ -293,20 +291,21 @@ static struct vfsmount *cifs_dfs_do_automount(struct dentry *mntpt)
293 mnt = ERR_PTR(-ENOMEM); 291 mnt = ERR_PTR(-ENOMEM);
294 full_path = build_path_from_dentry(mntpt); 292 full_path = build_path_from_dentry(mntpt);
295 if (full_path == NULL) 293 if (full_path == NULL)
296 goto free_xid; 294 goto cdda_exit;
297 295
298 cifs_sb = CIFS_SB(mntpt->d_inode->i_sb); 296 cifs_sb = CIFS_SB(mntpt->d_inode->i_sb);
299 tlink = cifs_sb_tlink(cifs_sb); 297 tlink = cifs_sb_tlink(cifs_sb);
300 mnt = ERR_PTR(-EINVAL);
301 if (IS_ERR(tlink)) { 298 if (IS_ERR(tlink)) {
302 mnt = ERR_CAST(tlink); 299 mnt = ERR_CAST(tlink);
303 goto free_full_path; 300 goto free_full_path;
304 } 301 }
305 ses = tlink_tcon(tlink)->ses; 302 ses = tlink_tcon(tlink)->ses;
306 303
304 xid = GetXid();
307 rc = get_dfs_path(xid, ses, full_path + 1, cifs_sb->local_nls, 305 rc = get_dfs_path(xid, ses, full_path + 1, cifs_sb->local_nls,
308 &num_referrals, &referrals, 306 &num_referrals, &referrals,
309 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); 307 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
308 FreeXid(xid);
310 309
311 cifs_put_tlink(tlink); 310 cifs_put_tlink(tlink);
312 311
@@ -339,8 +338,7 @@ success:
339 free_dfs_info_array(referrals, num_referrals); 338 free_dfs_info_array(referrals, num_referrals);
340free_full_path: 339free_full_path:
341 kfree(full_path); 340 kfree(full_path);
342free_xid: 341cdda_exit:
343 FreeXid(xid);
344 cFYI(1, "leaving %s" , __func__); 342 cFYI(1, "leaving %s" , __func__);
345 return mnt; 343 return mnt;
346} 344}
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index 1e7636b145a8..beeebf194234 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -372,6 +372,10 @@ static void parse_dacl(struct cifs_acl *pdacl, char *end_of_acl,
372 372
373 ppace = kmalloc(num_aces * sizeof(struct cifs_ace *), 373 ppace = kmalloc(num_aces * sizeof(struct cifs_ace *),
374 GFP_KERNEL); 374 GFP_KERNEL);
375 if (!ppace) {
376 cERROR(1, "DACL memory allocation error");
377 return;
378 }
375 379
376 for (i = 0; i < num_aces; ++i) { 380 for (i = 0; i < num_aces; ++i) {
377 ppace[i] = (struct cifs_ace *) (acl_base + acl_size); 381 ppace[i] = (struct cifs_ace *) (acl_base + acl_size);
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index 66f3d50d0676..a51585f9852b 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -24,7 +24,6 @@
24#include "cifspdu.h" 24#include "cifspdu.h"
25#include "cifsglob.h" 25#include "cifsglob.h"
26#include "cifs_debug.h" 26#include "cifs_debug.h"
27#include "md5.h"
28#include "cifs_unicode.h" 27#include "cifs_unicode.h"
29#include "cifsproto.h" 28#include "cifsproto.h"
30#include "ntlmssp.h" 29#include "ntlmssp.h"
@@ -37,11 +36,6 @@
37/* Note that the smb header signature field on input contains the 36/* Note that the smb header signature field on input contains the
38 sequence number before this function is called */ 37 sequence number before this function is called */
39 38
40extern void mdfour(unsigned char *out, unsigned char *in, int n);
41extern void E_md4hash(const unsigned char *passwd, unsigned char *p16);
42extern void SMBencrypt(unsigned char *passwd, const unsigned char *c8,
43 unsigned char *p24);
44
45static int cifs_calculate_signature(const struct smb_hdr *cifs_pdu, 39static int cifs_calculate_signature(const struct smb_hdr *cifs_pdu,
46 struct TCP_Server_Info *server, char *signature) 40 struct TCP_Server_Info *server, char *signature)
47{ 41{
@@ -234,6 +228,7 @@ int cifs_verify_signature(struct smb_hdr *cifs_pdu,
234/* first calculate 24 bytes ntlm response and then 16 byte session key */ 228/* first calculate 24 bytes ntlm response and then 16 byte session key */
235int setup_ntlm_response(struct cifsSesInfo *ses) 229int setup_ntlm_response(struct cifsSesInfo *ses)
236{ 230{
231 int rc = 0;
237 unsigned int temp_len = CIFS_SESS_KEY_SIZE + CIFS_AUTH_RESP_SIZE; 232 unsigned int temp_len = CIFS_SESS_KEY_SIZE + CIFS_AUTH_RESP_SIZE;
238 char temp_key[CIFS_SESS_KEY_SIZE]; 233 char temp_key[CIFS_SESS_KEY_SIZE];
239 234
@@ -247,13 +242,26 @@ int setup_ntlm_response(struct cifsSesInfo *ses)
247 } 242 }
248 ses->auth_key.len = temp_len; 243 ses->auth_key.len = temp_len;
249 244
250 SMBNTencrypt(ses->password, ses->server->cryptkey, 245 rc = SMBNTencrypt(ses->password, ses->server->cryptkey,
251 ses->auth_key.response + CIFS_SESS_KEY_SIZE); 246 ses->auth_key.response + CIFS_SESS_KEY_SIZE);
247 if (rc) {
248 cFYI(1, "%s Can't generate NTLM response, error: %d",
249 __func__, rc);
250 return rc;
251 }
252 252
253 E_md4hash(ses->password, temp_key); 253 rc = E_md4hash(ses->password, temp_key);
254 mdfour(ses->auth_key.response, temp_key, CIFS_SESS_KEY_SIZE); 254 if (rc) {
255 cFYI(1, "%s Can't generate NT hash, error: %d", __func__, rc);
256 return rc;
257 }
255 258
256 return 0; 259 rc = mdfour(ses->auth_key.response, temp_key, CIFS_SESS_KEY_SIZE);
260 if (rc)
261 cFYI(1, "%s Can't generate NTLM session key, error: %d",
262 __func__, rc);
263
264 return rc;
257} 265}
258 266
259#ifdef CONFIG_CIFS_WEAK_PW_HASH 267#ifdef CONFIG_CIFS_WEAK_PW_HASH
@@ -649,9 +657,10 @@ calc_seckey(struct cifsSesInfo *ses)
649 get_random_bytes(sec_key, CIFS_SESS_KEY_SIZE); 657 get_random_bytes(sec_key, CIFS_SESS_KEY_SIZE);
650 658
651 tfm_arc4 = crypto_alloc_blkcipher("ecb(arc4)", 0, CRYPTO_ALG_ASYNC); 659 tfm_arc4 = crypto_alloc_blkcipher("ecb(arc4)", 0, CRYPTO_ALG_ASYNC);
652 if (!tfm_arc4 || IS_ERR(tfm_arc4)) { 660 if (IS_ERR(tfm_arc4)) {
661 rc = PTR_ERR(tfm_arc4);
653 cERROR(1, "could not allocate crypto API arc4\n"); 662 cERROR(1, "could not allocate crypto API arc4\n");
654 return PTR_ERR(tfm_arc4); 663 return rc;
655 } 664 }
656 665
657 desc.tfm = tfm_arc4; 666 desc.tfm = tfm_arc4;
@@ -700,14 +709,13 @@ cifs_crypto_shash_allocate(struct TCP_Server_Info *server)
700 unsigned int size; 709 unsigned int size;
701 710
702 server->secmech.hmacmd5 = crypto_alloc_shash("hmac(md5)", 0, 0); 711 server->secmech.hmacmd5 = crypto_alloc_shash("hmac(md5)", 0, 0);
703 if (!server->secmech.hmacmd5 || 712 if (IS_ERR(server->secmech.hmacmd5)) {
704 IS_ERR(server->secmech.hmacmd5)) {
705 cERROR(1, "could not allocate crypto hmacmd5\n"); 713 cERROR(1, "could not allocate crypto hmacmd5\n");
706 return PTR_ERR(server->secmech.hmacmd5); 714 return PTR_ERR(server->secmech.hmacmd5);
707 } 715 }
708 716
709 server->secmech.md5 = crypto_alloc_shash("md5", 0, 0); 717 server->secmech.md5 = crypto_alloc_shash("md5", 0, 0);
710 if (!server->secmech.md5 || IS_ERR(server->secmech.md5)) { 718 if (IS_ERR(server->secmech.md5)) {
711 cERROR(1, "could not allocate crypto md5\n"); 719 cERROR(1, "could not allocate crypto md5\n");
712 rc = PTR_ERR(server->secmech.md5); 720 rc = PTR_ERR(server->secmech.md5);
713 goto crypto_allocate_md5_fail; 721 goto crypto_allocate_md5_fail;
diff --git a/fs/cifs/cifsencrypt.h b/fs/cifs/cifsencrypt.h
deleted file mode 100644
index 15d2ec006474..000000000000
--- a/fs/cifs/cifsencrypt.h
+++ /dev/null
@@ -1,33 +0,0 @@
1/*
2 * fs/cifs/cifsencrypt.h
3 *
4 * Copyright (c) International Business Machines Corp., 2005
5 * Author(s): Steve French (sfrench@us.ibm.com)
6 *
7 * Externs for misc. small encryption routines
8 * so we do not have to put them in cifsproto.h
9 *
10 * This library is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU Lesser General Public License as published
12 * by the Free Software Foundation; either version 2.1 of the License, or
13 * (at your option) any later version.
14 *
15 * This library is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
18 * the GNU Lesser General Public License for more details.
19 *
20 * You should have received a copy of the GNU Lesser General Public License
21 * along with this library; if not, write to the Free Software
22 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
23 */
24
25/* md4.c */
26extern void mdfour(unsigned char *out, unsigned char *in, int n);
27/* smbdes.c */
28extern void E_P16(unsigned char *p14, unsigned char *p16);
29extern void E_P24(unsigned char *p21, const unsigned char *c8,
30 unsigned char *p24);
31
32
33
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index a8323f1dc1c4..f2970136d17d 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -600,10 +600,17 @@ static ssize_t cifs_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
600{ 600{
601 struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode; 601 struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
602 ssize_t written; 602 ssize_t written;
603 int rc;
603 604
604 written = generic_file_aio_write(iocb, iov, nr_segs, pos); 605 written = generic_file_aio_write(iocb, iov, nr_segs, pos);
605 if (!CIFS_I(inode)->clientCanCacheAll) 606
606 filemap_fdatawrite(inode->i_mapping); 607 if (CIFS_I(inode)->clientCanCacheAll)
608 return written;
609
610 rc = filemap_fdatawrite(inode->i_mapping);
611 if (rc)
612 cFYI(1, "cifs_file_aio_write: %d rc on %p inode", rc, inode);
613
607 return written; 614 return written;
608} 615}
609 616
@@ -737,7 +744,7 @@ const struct file_operations cifs_file_strict_ops = {
737 .read = do_sync_read, 744 .read = do_sync_read,
738 .write = do_sync_write, 745 .write = do_sync_write,
739 .aio_read = cifs_strict_readv, 746 .aio_read = cifs_strict_readv,
740 .aio_write = cifs_file_aio_write, 747 .aio_write = cifs_strict_writev,
741 .open = cifs_open, 748 .open = cifs_open,
742 .release = cifs_close, 749 .release = cifs_close,
743 .lock = cifs_lock, 750 .lock = cifs_lock,
@@ -793,7 +800,7 @@ const struct file_operations cifs_file_strict_nobrl_ops = {
793 .read = do_sync_read, 800 .read = do_sync_read,
794 .write = do_sync_write, 801 .write = do_sync_write,
795 .aio_read = cifs_strict_readv, 802 .aio_read = cifs_strict_readv,
796 .aio_write = cifs_file_aio_write, 803 .aio_write = cifs_strict_writev,
797 .open = cifs_open, 804 .open = cifs_open,
798 .release = cifs_close, 805 .release = cifs_close,
799 .fsync = cifs_strict_fsync, 806 .fsync = cifs_strict_fsync,
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index f23206d46531..a9371b6578c0 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -85,7 +85,9 @@ extern ssize_t cifs_user_read(struct file *file, char __user *read_data,
85extern ssize_t cifs_strict_readv(struct kiocb *iocb, const struct iovec *iov, 85extern ssize_t cifs_strict_readv(struct kiocb *iocb, const struct iovec *iov,
86 unsigned long nr_segs, loff_t pos); 86 unsigned long nr_segs, loff_t pos);
87extern ssize_t cifs_user_write(struct file *file, const char __user *write_data, 87extern ssize_t cifs_user_write(struct file *file, const char __user *write_data,
88 size_t write_size, loff_t *poffset); 88 size_t write_size, loff_t *poffset);
89extern ssize_t cifs_strict_writev(struct kiocb *iocb, const struct iovec *iov,
90 unsigned long nr_segs, loff_t pos);
89extern int cifs_lock(struct file *, int, struct file_lock *); 91extern int cifs_lock(struct file *, int, struct file_lock *);
90extern int cifs_fsync(struct file *, int); 92extern int cifs_fsync(struct file *, int);
91extern int cifs_strict_fsync(struct file *, int); 93extern int cifs_strict_fsync(struct file *, int);
@@ -125,5 +127,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
125extern const struct export_operations cifs_export_ops; 127extern const struct export_operations cifs_export_ops;
126#endif /* EXPERIMENTAL */ 128#endif /* EXPERIMENTAL */
127 129
128#define CIFS_VERSION "1.69" 130#define CIFS_VERSION "1.71"
129#endif /* _CIFSFS_H */ 131#endif /* _CIFSFS_H */
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 5bfb75346cb0..17afb0fbcaed 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -166,6 +166,9 @@ struct TCP_Server_Info {
166 struct socket *ssocket; 166 struct socket *ssocket;
167 struct sockaddr_storage dstaddr; 167 struct sockaddr_storage dstaddr;
168 struct sockaddr_storage srcaddr; /* locally bind to this IP */ 168 struct sockaddr_storage srcaddr; /* locally bind to this IP */
169#ifdef CONFIG_NET_NS
170 struct net *net;
171#endif
169 wait_queue_head_t response_q; 172 wait_queue_head_t response_q;
170 wait_queue_head_t request_q; /* if more than maxmpx to srvr must block*/ 173 wait_queue_head_t request_q; /* if more than maxmpx to srvr must block*/
171 struct list_head pending_mid_q; 174 struct list_head pending_mid_q;
@@ -185,6 +188,8 @@ struct TCP_Server_Info {
185 /* multiplexed reads or writes */ 188 /* multiplexed reads or writes */
186 unsigned int maxBuf; /* maxBuf specifies the maximum */ 189 unsigned int maxBuf; /* maxBuf specifies the maximum */
187 /* message size the server can send or receive for non-raw SMBs */ 190 /* message size the server can send or receive for non-raw SMBs */
191 /* maxBuf is returned by SMB NegotiateProtocol so maxBuf is only 0 */
192 /* when socket is setup (and during reconnect) before NegProt sent */
188 unsigned int max_rw; /* maxRw specifies the maximum */ 193 unsigned int max_rw; /* maxRw specifies the maximum */
189 /* message size the server can send or receive for */ 194 /* message size the server can send or receive for */
190 /* SMB_COM_WRITE_RAW or SMB_COM_READ_RAW. */ 195 /* SMB_COM_WRITE_RAW or SMB_COM_READ_RAW. */
@@ -217,6 +222,36 @@ struct TCP_Server_Info {
217}; 222};
218 223
219/* 224/*
225 * Macros to allow the TCP_Server_Info->net field and related code to drop out
226 * when CONFIG_NET_NS isn't set.
227 */
228
229#ifdef CONFIG_NET_NS
230
231static inline struct net *cifs_net_ns(struct TCP_Server_Info *srv)
232{
233 return srv->net;
234}
235
236static inline void cifs_set_net_ns(struct TCP_Server_Info *srv, struct net *net)
237{
238 srv->net = net;
239}
240
241#else
242
243static inline struct net *cifs_net_ns(struct TCP_Server_Info *srv)
244{
245 return &init_net;
246}
247
248static inline void cifs_set_net_ns(struct TCP_Server_Info *srv, struct net *net)
249{
250}
251
252#endif
253
254/*
220 * Session structure. One of these for each uid session with a particular host 255 * Session structure. One of these for each uid session with a particular host
221 */ 256 */
222struct cifsSesInfo { 257struct cifsSesInfo {
@@ -619,7 +654,7 @@ static inline void free_dfs_info_array(struct dfs_info3_param *param,
619#define MID_REQUEST_SUBMITTED 2 654#define MID_REQUEST_SUBMITTED 2
620#define MID_RESPONSE_RECEIVED 4 655#define MID_RESPONSE_RECEIVED 4
621#define MID_RETRY_NEEDED 8 /* session closed while this request out */ 656#define MID_RETRY_NEEDED 8 /* session closed while this request out */
622#define MID_NO_RESP_NEEDED 0x10 657#define MID_RESPONSE_MALFORMED 0x10
623 658
624/* Types of response buffer returned from SendReceive2 */ 659/* Types of response buffer returned from SendReceive2 */
625#define CIFS_NO_BUFFER 0 /* Response buffer not returned */ 660#define CIFS_NO_BUFFER 0 /* Response buffer not returned */
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 982895fa7615..8096f27ad9a8 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -85,6 +85,8 @@ extern int checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length);
85extern bool is_valid_oplock_break(struct smb_hdr *smb, 85extern bool is_valid_oplock_break(struct smb_hdr *smb,
86 struct TCP_Server_Info *); 86 struct TCP_Server_Info *);
87extern bool is_size_safe_to_change(struct cifsInodeInfo *, __u64 eof); 87extern bool is_size_safe_to_change(struct cifsInodeInfo *, __u64 eof);
88extern void cifs_update_eof(struct cifsInodeInfo *cifsi, loff_t offset,
89 unsigned int bytes_written);
88extern struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *, bool); 90extern struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *, bool);
89extern struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *, bool); 91extern struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *, bool);
90extern unsigned int smbCalcSize(struct smb_hdr *ptr); 92extern unsigned int smbCalcSize(struct smb_hdr *ptr);
@@ -373,7 +375,7 @@ extern int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *,
373extern int cifs_verify_signature(struct smb_hdr *, 375extern int cifs_verify_signature(struct smb_hdr *,
374 struct TCP_Server_Info *server, 376 struct TCP_Server_Info *server,
375 __u32 expected_sequence_number); 377 __u32 expected_sequence_number);
376extern void SMBNTencrypt(unsigned char *, unsigned char *, unsigned char *); 378extern int SMBNTencrypt(unsigned char *, unsigned char *, unsigned char *);
377extern int setup_ntlm_response(struct cifsSesInfo *); 379extern int setup_ntlm_response(struct cifsSesInfo *);
378extern int setup_ntlmv2_rsp(struct cifsSesInfo *, const struct nls_table *); 380extern int setup_ntlmv2_rsp(struct cifsSesInfo *, const struct nls_table *);
379extern int cifs_crypto_shash_allocate(struct TCP_Server_Info *); 381extern int cifs_crypto_shash_allocate(struct TCP_Server_Info *);
@@ -423,4 +425,11 @@ extern bool CIFSCouldBeMFSymlink(const struct cifs_fattr *fattr);
423extern int CIFSCheckMFSymlink(struct cifs_fattr *fattr, 425extern int CIFSCheckMFSymlink(struct cifs_fattr *fattr,
424 const unsigned char *path, 426 const unsigned char *path,
425 struct cifs_sb_info *cifs_sb, int xid); 427 struct cifs_sb_info *cifs_sb, int xid);
428extern int mdfour(unsigned char *, unsigned char *, int);
429extern int E_md4hash(const unsigned char *passwd, unsigned char *p16);
430extern void SMBencrypt(unsigned char *passwd, const unsigned char *c8,
431 unsigned char *p24);
432extern void E_P16(unsigned char *p14, unsigned char *p16);
433extern void E_P24(unsigned char *p21, const unsigned char *c8,
434 unsigned char *p24);
426#endif /* _CIFSPROTO_H */ 435#endif /* _CIFSPROTO_H */
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 3106f5e5c633..904aa47e3515 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -136,9 +136,6 @@ cifs_reconnect_tcon(struct cifsTconInfo *tcon, int smb_command)
136 } 136 }
137 } 137 }
138 138
139 if (ses->status == CifsExiting)
140 return -EIO;
141
142 /* 139 /*
143 * Give demultiplex thread up to 10 seconds to reconnect, should be 140 * Give demultiplex thread up to 10 seconds to reconnect, should be
144 * greater than cifs socket timeout which is 7 seconds 141 * greater than cifs socket timeout which is 7 seconds
@@ -156,7 +153,7 @@ cifs_reconnect_tcon(struct cifsTconInfo *tcon, int smb_command)
156 * retrying until process is killed or server comes 153 * retrying until process is killed or server comes
157 * back on-line 154 * back on-line
158 */ 155 */
159 if (!tcon->retry || ses->status == CifsExiting) { 156 if (!tcon->retry) {
160 cFYI(1, "gave up waiting on reconnect in smb_init"); 157 cFYI(1, "gave up waiting on reconnect in smb_init");
161 return -EHOSTDOWN; 158 return -EHOSTDOWN;
162 } 159 }
@@ -4914,7 +4911,6 @@ CIFSSMBSetFileSize(const int xid, struct cifsTconInfo *tcon, __u64 size,
4914 __u16 fid, __u32 pid_of_opener, bool SetAllocation) 4911 __u16 fid, __u32 pid_of_opener, bool SetAllocation)
4915{ 4912{
4916 struct smb_com_transaction2_sfi_req *pSMB = NULL; 4913 struct smb_com_transaction2_sfi_req *pSMB = NULL;
4917 char *data_offset;
4918 struct file_end_of_file_info *parm_data; 4914 struct file_end_of_file_info *parm_data;
4919 int rc = 0; 4915 int rc = 0;
4920 __u16 params, param_offset, offset, byte_count, count; 4916 __u16 params, param_offset, offset, byte_count, count;
@@ -4938,8 +4934,6 @@ CIFSSMBSetFileSize(const int xid, struct cifsTconInfo *tcon, __u64 size,
4938 param_offset = offsetof(struct smb_com_transaction2_sfi_req, Fid) - 4; 4934 param_offset = offsetof(struct smb_com_transaction2_sfi_req, Fid) - 4;
4939 offset = param_offset + params; 4935 offset = param_offset + params;
4940 4936
4941 data_offset = (char *) (&pSMB->hdr.Protocol) + offset;
4942
4943 count = sizeof(struct file_end_of_file_info); 4937 count = sizeof(struct file_end_of_file_info);
4944 pSMB->MaxParameterCount = cpu_to_le16(2); 4938 pSMB->MaxParameterCount = cpu_to_le16(2);
4945 /* BB find exact max SMB PDU from sess structure BB */ 4939 /* BB find exact max SMB PDU from sess structure BB */
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 18d3c7724d6e..8d6c17ab593d 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -55,9 +55,6 @@
55/* SMB echo "timeout" -- FIXME: tunable? */ 55/* SMB echo "timeout" -- FIXME: tunable? */
56#define SMB_ECHO_INTERVAL (60 * HZ) 56#define SMB_ECHO_INTERVAL (60 * HZ)
57 57
58extern void SMBNTencrypt(unsigned char *passwd, unsigned char *c8,
59 unsigned char *p24);
60
61extern mempool_t *cifs_req_poolp; 58extern mempool_t *cifs_req_poolp;
62 59
63struct smb_vol { 60struct smb_vol {
@@ -87,6 +84,7 @@ struct smb_vol {
87 bool no_xattr:1; /* set if xattr (EA) support should be disabled*/ 84 bool no_xattr:1; /* set if xattr (EA) support should be disabled*/
88 bool server_ino:1; /* use inode numbers from server ie UniqueId */ 85 bool server_ino:1; /* use inode numbers from server ie UniqueId */
89 bool direct_io:1; 86 bool direct_io:1;
87 bool strict_io:1; /* strict cache behavior */
90 bool remap:1; /* set to remap seven reserved chars in filenames */ 88 bool remap:1; /* set to remap seven reserved chars in filenames */
91 bool posix_paths:1; /* unset to not ask for posix pathnames. */ 89 bool posix_paths:1; /* unset to not ask for posix pathnames. */
92 bool no_linux_ext:1; 90 bool no_linux_ext:1;
@@ -339,8 +337,13 @@ cifs_echo_request(struct work_struct *work)
339 struct TCP_Server_Info *server = container_of(work, 337 struct TCP_Server_Info *server = container_of(work,
340 struct TCP_Server_Info, echo.work); 338 struct TCP_Server_Info, echo.work);
341 339
342 /* no need to ping if we got a response recently */ 340 /*
343 if (time_before(jiffies, server->lstrp + SMB_ECHO_INTERVAL - HZ)) 341 * We cannot send an echo until the NEGOTIATE_PROTOCOL request is
342 * done, which is indicated by maxBuf != 0. Also, no need to ping if
343 * we got a response recently
344 */
345 if (server->maxBuf == 0 ||
346 time_before(jiffies, server->lstrp + SMB_ECHO_INTERVAL - HZ))
344 goto requeue_echo; 347 goto requeue_echo;
345 348
346 rc = CIFSSMBEcho(server); 349 rc = CIFSSMBEcho(server);
@@ -580,14 +583,23 @@ incomplete_rcv:
580 else if (reconnect == 1) 583 else if (reconnect == 1)
581 continue; 584 continue;
582 585
583 length += 4; /* account for rfc1002 hdr */ 586 total_read += 4; /* account for rfc1002 hdr */
584 587
588 dump_smb(smb_buffer, total_read);
585 589
586 dump_smb(smb_buffer, length); 590 /*
587 if (checkSMB(smb_buffer, smb_buffer->Mid, total_read+4)) { 591 * We know that we received enough to get to the MID as we
588 cifs_dump_mem("Bad SMB: ", smb_buffer, 48); 592 * checked the pdu_length earlier. Now check to see
589 continue; 593 * if the rest of the header is OK. We borrow the length
590 } 594 * var for the rest of the loop to avoid a new stack var.
595 *
596 * 48 bytes is enough to display the header and a little bit
597 * into the payload for debugging purposes.
598 */
599 length = checkSMB(smb_buffer, smb_buffer->Mid, total_read);
600 if (length != 0)
601 cifs_dump_mem("Bad SMB: ", smb_buffer,
602 min_t(unsigned int, total_read, 48));
591 603
592 mid_entry = NULL; 604 mid_entry = NULL;
593 server->lstrp = jiffies; 605 server->lstrp = jiffies;
@@ -599,7 +611,8 @@ incomplete_rcv:
599 if ((mid_entry->mid == smb_buffer->Mid) && 611 if ((mid_entry->mid == smb_buffer->Mid) &&
600 (mid_entry->midState == MID_REQUEST_SUBMITTED) && 612 (mid_entry->midState == MID_REQUEST_SUBMITTED) &&
601 (mid_entry->command == smb_buffer->Command)) { 613 (mid_entry->command == smb_buffer->Command)) {
602 if (check2ndT2(smb_buffer,server->maxBuf) > 0) { 614 if (length == 0 &&
615 check2ndT2(smb_buffer, server->maxBuf) > 0) {
603 /* We have a multipart transact2 resp */ 616 /* We have a multipart transact2 resp */
604 isMultiRsp = true; 617 isMultiRsp = true;
605 if (mid_entry->resp_buf) { 618 if (mid_entry->resp_buf) {
@@ -634,12 +647,17 @@ incomplete_rcv:
634 mid_entry->resp_buf = smb_buffer; 647 mid_entry->resp_buf = smb_buffer;
635 mid_entry->largeBuf = isLargeBuf; 648 mid_entry->largeBuf = isLargeBuf;
636multi_t2_fnd: 649multi_t2_fnd:
637 mid_entry->midState = MID_RESPONSE_RECEIVED; 650 if (length == 0)
638 list_del_init(&mid_entry->qhead); 651 mid_entry->midState =
639 mid_entry->callback(mid_entry); 652 MID_RESPONSE_RECEIVED;
653 else
654 mid_entry->midState =
655 MID_RESPONSE_MALFORMED;
640#ifdef CONFIG_CIFS_STATS2 656#ifdef CONFIG_CIFS_STATS2
641 mid_entry->when_received = jiffies; 657 mid_entry->when_received = jiffies;
642#endif 658#endif
659 list_del_init(&mid_entry->qhead);
660 mid_entry->callback(mid_entry);
643 break; 661 break;
644 } 662 }
645 mid_entry = NULL; 663 mid_entry = NULL;
@@ -655,6 +673,9 @@ multi_t2_fnd:
655 else 673 else
656 smallbuf = NULL; 674 smallbuf = NULL;
657 } 675 }
676 } else if (length != 0) {
677 /* response sanity checks failed */
678 continue;
658 } else if (!is_valid_oplock_break(smb_buffer, server) && 679 } else if (!is_valid_oplock_break(smb_buffer, server) &&
659 !isMultiRsp) { 680 !isMultiRsp) {
660 cERROR(1, "No task to wake, unknown frame received! " 681 cERROR(1, "No task to wake, unknown frame received! "
@@ -1344,6 +1365,8 @@ cifs_parse_mount_options(char *options, const char *devname,
1344 vol->direct_io = 1; 1365 vol->direct_io = 1;
1345 } else if (strnicmp(data, "forcedirectio", 13) == 0) { 1366 } else if (strnicmp(data, "forcedirectio", 13) == 0) {
1346 vol->direct_io = 1; 1367 vol->direct_io = 1;
1368 } else if (strnicmp(data, "strictcache", 11) == 0) {
1369 vol->strict_io = 1;
1347 } else if (strnicmp(data, "noac", 4) == 0) { 1370 } else if (strnicmp(data, "noac", 4) == 0) {
1348 printk(KERN_WARNING "CIFS: Mount option noac not " 1371 printk(KERN_WARNING "CIFS: Mount option noac not "
1349 "supported. Instead set " 1372 "supported. Instead set "
@@ -1568,6 +1591,9 @@ cifs_find_tcp_session(struct sockaddr *addr, struct smb_vol *vol)
1568 1591
1569 spin_lock(&cifs_tcp_ses_lock); 1592 spin_lock(&cifs_tcp_ses_lock);
1570 list_for_each_entry(server, &cifs_tcp_ses_list, tcp_ses_list) { 1593 list_for_each_entry(server, &cifs_tcp_ses_list, tcp_ses_list) {
1594 if (!net_eq(cifs_net_ns(server), current->nsproxy->net_ns))
1595 continue;
1596
1571 if (!match_address(server, addr, 1597 if (!match_address(server, addr,
1572 (struct sockaddr *)&vol->srcaddr)) 1598 (struct sockaddr *)&vol->srcaddr))
1573 continue; 1599 continue;
@@ -1598,6 +1624,8 @@ cifs_put_tcp_session(struct TCP_Server_Info *server)
1598 return; 1624 return;
1599 } 1625 }
1600 1626
1627 put_net(cifs_net_ns(server));
1628
1601 list_del_init(&server->tcp_ses_list); 1629 list_del_init(&server->tcp_ses_list);
1602 spin_unlock(&cifs_tcp_ses_lock); 1630 spin_unlock(&cifs_tcp_ses_lock);
1603 1631
@@ -1672,6 +1700,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
1672 goto out_err; 1700 goto out_err;
1673 } 1701 }
1674 1702
1703 cifs_set_net_ns(tcp_ses, get_net(current->nsproxy->net_ns));
1675 tcp_ses->hostname = extract_hostname(volume_info->UNC); 1704 tcp_ses->hostname = extract_hostname(volume_info->UNC);
1676 if (IS_ERR(tcp_ses->hostname)) { 1705 if (IS_ERR(tcp_ses->hostname)) {
1677 rc = PTR_ERR(tcp_ses->hostname); 1706 rc = PTR_ERR(tcp_ses->hostname);
@@ -1752,6 +1781,8 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
1752out_err_crypto_release: 1781out_err_crypto_release:
1753 cifs_crypto_shash_release(tcp_ses); 1782 cifs_crypto_shash_release(tcp_ses);
1754 1783
1784 put_net(cifs_net_ns(tcp_ses));
1785
1755out_err: 1786out_err:
1756 if (tcp_ses) { 1787 if (tcp_ses) {
1757 if (!IS_ERR(tcp_ses->hostname)) 1788 if (!IS_ERR(tcp_ses->hostname))
@@ -2263,8 +2294,8 @@ generic_ip_connect(struct TCP_Server_Info *server)
2263 } 2294 }
2264 2295
2265 if (socket == NULL) { 2296 if (socket == NULL) {
2266 rc = sock_create_kern(sfamily, SOCK_STREAM, 2297 rc = __sock_create(cifs_net_ns(server), sfamily, SOCK_STREAM,
2267 IPPROTO_TCP, &socket); 2298 IPPROTO_TCP, &socket, 1);
2268 if (rc < 0) { 2299 if (rc < 0) {
2269 cERROR(1, "Error %d creating socket", rc); 2300 cERROR(1, "Error %d creating socket", rc);
2270 server->ssocket = NULL; 2301 server->ssocket = NULL;
@@ -2576,6 +2607,8 @@ static void setup_cifs_sb(struct smb_vol *pvolume_info,
2576 if (pvolume_info->multiuser) 2607 if (pvolume_info->multiuser)
2577 cifs_sb->mnt_cifs_flags |= (CIFS_MOUNT_MULTIUSER | 2608 cifs_sb->mnt_cifs_flags |= (CIFS_MOUNT_MULTIUSER |
2578 CIFS_MOUNT_NO_PERM); 2609 CIFS_MOUNT_NO_PERM);
2610 if (pvolume_info->strict_io)
2611 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_STRICT_IO;
2579 if (pvolume_info->direct_io) { 2612 if (pvolume_info->direct_io) {
2580 cFYI(1, "mounting share using direct i/o"); 2613 cFYI(1, "mounting share using direct i/o");
2581 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_DIRECT_IO; 2614 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_DIRECT_IO;
@@ -2977,7 +3010,8 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
2977 bcc_ptr); 3010 bcc_ptr);
2978 else 3011 else
2979#endif /* CIFS_WEAK_PW_HASH */ 3012#endif /* CIFS_WEAK_PW_HASH */
2980 SMBNTencrypt(tcon->password, ses->server->cryptkey, bcc_ptr); 3013 rc = SMBNTencrypt(tcon->password, ses->server->cryptkey,
3014 bcc_ptr);
2981 3015
2982 bcc_ptr += CIFS_AUTH_RESP_SIZE; 3016 bcc_ptr += CIFS_AUTH_RESP_SIZE;
2983 if (ses->capabilities & CAP_UNICODE) { 3017 if (ses->capabilities & CAP_UNICODE) {
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index d7d65a70678e..c27d236738fc 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -346,7 +346,6 @@ int cifs_open(struct inode *inode, struct file *file)
346 struct cifsTconInfo *tcon; 346 struct cifsTconInfo *tcon;
347 struct tcon_link *tlink; 347 struct tcon_link *tlink;
348 struct cifsFileInfo *pCifsFile = NULL; 348 struct cifsFileInfo *pCifsFile = NULL;
349 struct cifsInodeInfo *pCifsInode;
350 char *full_path = NULL; 349 char *full_path = NULL;
351 bool posix_open_ok = false; 350 bool posix_open_ok = false;
352 __u16 netfid; 351 __u16 netfid;
@@ -361,8 +360,6 @@ int cifs_open(struct inode *inode, struct file *file)
361 } 360 }
362 tcon = tlink_tcon(tlink); 361 tcon = tlink_tcon(tlink);
363 362
364 pCifsInode = CIFS_I(file->f_path.dentry->d_inode);
365
366 full_path = build_path_from_dentry(file->f_path.dentry); 363 full_path = build_path_from_dentry(file->f_path.dentry);
367 if (full_path == NULL) { 364 if (full_path == NULL) {
368 rc = -ENOMEM; 365 rc = -ENOMEM;
@@ -848,7 +845,7 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
848} 845}
849 846
850/* update the file size (if needed) after a write */ 847/* update the file size (if needed) after a write */
851static void 848void
852cifs_update_eof(struct cifsInodeInfo *cifsi, loff_t offset, 849cifs_update_eof(struct cifsInodeInfo *cifsi, loff_t offset,
853 unsigned int bytes_written) 850 unsigned int bytes_written)
854{ 851{
@@ -1146,7 +1143,6 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to)
1146 char *write_data; 1143 char *write_data;
1147 int rc = -EFAULT; 1144 int rc = -EFAULT;
1148 int bytes_written = 0; 1145 int bytes_written = 0;
1149 struct cifs_sb_info *cifs_sb;
1150 struct inode *inode; 1146 struct inode *inode;
1151 struct cifsFileInfo *open_file; 1147 struct cifsFileInfo *open_file;
1152 1148
@@ -1154,7 +1150,6 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to)
1154 return -EFAULT; 1150 return -EFAULT;
1155 1151
1156 inode = page->mapping->host; 1152 inode = page->mapping->host;
1157 cifs_sb = CIFS_SB(inode->i_sb);
1158 1153
1159 offset += (loff_t)from; 1154 offset += (loff_t)from;
1160 write_data = kmap(page); 1155 write_data = kmap(page);
@@ -1574,34 +1569,6 @@ int cifs_fsync(struct file *file, int datasync)
1574 return rc; 1569 return rc;
1575} 1570}
1576 1571
1577/* static void cifs_sync_page(struct page *page)
1578{
1579 struct address_space *mapping;
1580 struct inode *inode;
1581 unsigned long index = page->index;
1582 unsigned int rpages = 0;
1583 int rc = 0;
1584
1585 cFYI(1, "sync page %p", page);
1586 mapping = page->mapping;
1587 if (!mapping)
1588 return 0;
1589 inode = mapping->host;
1590 if (!inode)
1591 return; */
1592
1593/* fill in rpages then
1594 result = cifs_pagein_inode(inode, index, rpages); */ /* BB finish */
1595
1596/* cFYI(1, "rpages is %d for sync page of Index %ld", rpages, index);
1597
1598#if 0
1599 if (rc < 0)
1600 return rc;
1601 return 0;
1602#endif
1603} */
1604
1605/* 1572/*
1606 * As file closes, flush all cached write data for this inode checking 1573 * As file closes, flush all cached write data for this inode checking
1607 * for write behind errors. 1574 * for write behind errors.
@@ -1619,13 +1586,215 @@ int cifs_flush(struct file *file, fl_owner_t id)
1619 return rc; 1586 return rc;
1620} 1587}
1621 1588
1589static int
1590cifs_write_allocate_pages(struct page **pages, unsigned long num_pages)
1591{
1592 int rc = 0;
1593 unsigned long i;
1594
1595 for (i = 0; i < num_pages; i++) {
1596 pages[i] = alloc_page(__GFP_HIGHMEM);
1597 if (!pages[i]) {
1598 /*
1599 * save number of pages we have already allocated and
1600 * return with ENOMEM error
1601 */
1602 num_pages = i;
1603 rc = -ENOMEM;
1604 goto error;
1605 }
1606 }
1607
1608 return rc;
1609
1610error:
1611 for (i = 0; i < num_pages; i++)
1612 put_page(pages[i]);
1613 return rc;
1614}
1615
1616static inline
1617size_t get_numpages(const size_t wsize, const size_t len, size_t *cur_len)
1618{
1619 size_t num_pages;
1620 size_t clen;
1621
1622 clen = min_t(const size_t, len, wsize);
1623 num_pages = clen / PAGE_CACHE_SIZE;
1624 if (clen % PAGE_CACHE_SIZE)
1625 num_pages++;
1626
1627 if (cur_len)
1628 *cur_len = clen;
1629
1630 return num_pages;
1631}
1632
1633static ssize_t
1634cifs_iovec_write(struct file *file, const struct iovec *iov,
1635 unsigned long nr_segs, loff_t *poffset)
1636{
1637 unsigned int written;
1638 unsigned long num_pages, npages, i;
1639 size_t copied, len, cur_len;
1640 ssize_t total_written = 0;
1641 struct kvec *to_send;
1642 struct page **pages;
1643 struct iov_iter it;
1644 struct inode *inode;
1645 struct cifsFileInfo *open_file;
1646 struct cifsTconInfo *pTcon;
1647 struct cifs_sb_info *cifs_sb;
1648 int xid, rc;
1649
1650 len = iov_length(iov, nr_segs);
1651 if (!len)
1652 return 0;
1653
1654 rc = generic_write_checks(file, poffset, &len, 0);
1655 if (rc)
1656 return rc;
1657
1658 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
1659 num_pages = get_numpages(cifs_sb->wsize, len, &cur_len);
1660
1661 pages = kmalloc(sizeof(struct pages *)*num_pages, GFP_KERNEL);
1662 if (!pages)
1663 return -ENOMEM;
1664
1665 to_send = kmalloc(sizeof(struct kvec)*(num_pages + 1), GFP_KERNEL);
1666 if (!to_send) {
1667 kfree(pages);
1668 return -ENOMEM;
1669 }
1670
1671 rc = cifs_write_allocate_pages(pages, num_pages);
1672 if (rc) {
1673 kfree(pages);
1674 kfree(to_send);
1675 return rc;
1676 }
1677
1678 xid = GetXid();
1679 open_file = file->private_data;
1680 pTcon = tlink_tcon(open_file->tlink);
1681 inode = file->f_path.dentry->d_inode;
1682
1683 iov_iter_init(&it, iov, nr_segs, len, 0);
1684 npages = num_pages;
1685
1686 do {
1687 size_t save_len = cur_len;
1688 for (i = 0; i < npages; i++) {
1689 copied = min_t(const size_t, cur_len, PAGE_CACHE_SIZE);
1690 copied = iov_iter_copy_from_user(pages[i], &it, 0,
1691 copied);
1692 cur_len -= copied;
1693 iov_iter_advance(&it, copied);
1694 to_send[i+1].iov_base = kmap(pages[i]);
1695 to_send[i+1].iov_len = copied;
1696 }
1697
1698 cur_len = save_len - cur_len;
1699
1700 do {
1701 if (open_file->invalidHandle) {
1702 rc = cifs_reopen_file(open_file, false);
1703 if (rc != 0)
1704 break;
1705 }
1706 rc = CIFSSMBWrite2(xid, pTcon, open_file->netfid,
1707 cur_len, *poffset, &written,
1708 to_send, npages, 0);
1709 } while (rc == -EAGAIN);
1710
1711 for (i = 0; i < npages; i++)
1712 kunmap(pages[i]);
1713
1714 if (written) {
1715 len -= written;
1716 total_written += written;
1717 cifs_update_eof(CIFS_I(inode), *poffset, written);
1718 *poffset += written;
1719 } else if (rc < 0) {
1720 if (!total_written)
1721 total_written = rc;
1722 break;
1723 }
1724
1725 /* get length and number of kvecs of the next write */
1726 npages = get_numpages(cifs_sb->wsize, len, &cur_len);
1727 } while (len > 0);
1728
1729 if (total_written > 0) {
1730 spin_lock(&inode->i_lock);
1731 if (*poffset > inode->i_size)
1732 i_size_write(inode, *poffset);
1733 spin_unlock(&inode->i_lock);
1734 }
1735
1736 cifs_stats_bytes_written(pTcon, total_written);
1737 mark_inode_dirty_sync(inode);
1738
1739 for (i = 0; i < num_pages; i++)
1740 put_page(pages[i]);
1741 kfree(to_send);
1742 kfree(pages);
1743 FreeXid(xid);
1744 return total_written;
1745}
1746
1747static ssize_t cifs_user_writev(struct kiocb *iocb, const struct iovec *iov,
1748 unsigned long nr_segs, loff_t pos)
1749{
1750 ssize_t written;
1751 struct inode *inode;
1752
1753 inode = iocb->ki_filp->f_path.dentry->d_inode;
1754
1755 /*
1756 * BB - optimize the way when signing is disabled. We can drop this
1757 * extra memory-to-memory copying and use iovec buffers for constructing
1758 * write request.
1759 */
1760
1761 written = cifs_iovec_write(iocb->ki_filp, iov, nr_segs, &pos);
1762 if (written > 0) {
1763 CIFS_I(inode)->invalid_mapping = true;
1764 iocb->ki_pos = pos;
1765 }
1766
1767 return written;
1768}
1769
1770ssize_t cifs_strict_writev(struct kiocb *iocb, const struct iovec *iov,
1771 unsigned long nr_segs, loff_t pos)
1772{
1773 struct inode *inode;
1774
1775 inode = iocb->ki_filp->f_path.dentry->d_inode;
1776
1777 if (CIFS_I(inode)->clientCanCacheAll)
1778 return generic_file_aio_write(iocb, iov, nr_segs, pos);
1779
1780 /*
1781 * In strict cache mode we need to write the data to the server exactly
1782 * from the pos to pos+len-1 rather than flush all affected pages
1783 * because it may cause a error with mandatory locks on these pages but
1784 * not on the region from pos to ppos+len-1.
1785 */
1786
1787 return cifs_user_writev(iocb, iov, nr_segs, pos);
1788}
1789
1622static ssize_t 1790static ssize_t
1623cifs_iovec_read(struct file *file, const struct iovec *iov, 1791cifs_iovec_read(struct file *file, const struct iovec *iov,
1624 unsigned long nr_segs, loff_t *poffset) 1792 unsigned long nr_segs, loff_t *poffset)
1625{ 1793{
1626 int rc; 1794 int rc;
1627 int xid; 1795 int xid;
1628 unsigned int total_read, bytes_read = 0; 1796 ssize_t total_read;
1797 unsigned int bytes_read = 0;
1629 size_t len, cur_len; 1798 size_t len, cur_len;
1630 int iov_offset = 0; 1799 int iov_offset = 0;
1631 struct cifs_sb_info *cifs_sb; 1800 struct cifs_sb_info *cifs_sb;
@@ -2313,7 +2482,6 @@ const struct address_space_operations cifs_addr_ops = {
2313 .set_page_dirty = __set_page_dirty_nobuffers, 2482 .set_page_dirty = __set_page_dirty_nobuffers,
2314 .releasepage = cifs_release_page, 2483 .releasepage = cifs_release_page,
2315 .invalidatepage = cifs_invalidate_page, 2484 .invalidatepage = cifs_invalidate_page,
2316 /* .sync_page = cifs_sync_page, */
2317 /* .direct_IO = */ 2485 /* .direct_IO = */
2318}; 2486};
2319 2487
@@ -2331,6 +2499,5 @@ const struct address_space_operations cifs_addr_ops_smallbuf = {
2331 .set_page_dirty = __set_page_dirty_nobuffers, 2499 .set_page_dirty = __set_page_dirty_nobuffers,
2332 .releasepage = cifs_release_page, 2500 .releasepage = cifs_release_page,
2333 .invalidatepage = cifs_invalidate_page, 2501 .invalidatepage = cifs_invalidate_page,
2334 /* .sync_page = cifs_sync_page, */
2335 /* .direct_IO = */ 2502 /* .direct_IO = */
2336}; 2503};
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index 306769de2fb5..e8804d373404 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -28,7 +28,6 @@
28#include "cifsproto.h" 28#include "cifsproto.h"
29#include "cifs_debug.h" 29#include "cifs_debug.h"
30#include "cifs_fs_sb.h" 30#include "cifs_fs_sb.h"
31#include "md5.h"
32 31
33#define CIFS_MF_SYMLINK_LEN_OFFSET (4+1) 32#define CIFS_MF_SYMLINK_LEN_OFFSET (4+1)
34#define CIFS_MF_SYMLINK_MD5_OFFSET (CIFS_MF_SYMLINK_LEN_OFFSET+(4+1)) 33#define CIFS_MF_SYMLINK_MD5_OFFSET (CIFS_MF_SYMLINK_LEN_OFFSET+(4+1))
@@ -47,6 +46,45 @@
47 md5_hash[12], md5_hash[13], md5_hash[14], md5_hash[15] 46 md5_hash[12], md5_hash[13], md5_hash[14], md5_hash[15]
48 47
49static int 48static int
49symlink_hash(unsigned int link_len, const char *link_str, u8 *md5_hash)
50{
51 int rc;
52 unsigned int size;
53 struct crypto_shash *md5;
54 struct sdesc *sdescmd5;
55
56 md5 = crypto_alloc_shash("md5", 0, 0);
57 if (IS_ERR(md5)) {
58 rc = PTR_ERR(md5);
59 cERROR(1, "%s: Crypto md5 allocation error %d\n", __func__, rc);
60 return rc;
61 }
62 size = sizeof(struct shash_desc) + crypto_shash_descsize(md5);
63 sdescmd5 = kmalloc(size, GFP_KERNEL);
64 if (!sdescmd5) {
65 rc = -ENOMEM;
66 cERROR(1, "%s: Memory allocation failure\n", __func__);
67 goto symlink_hash_err;
68 }
69 sdescmd5->shash.tfm = md5;
70 sdescmd5->shash.flags = 0x0;
71
72 rc = crypto_shash_init(&sdescmd5->shash);
73 if (rc) {
74 cERROR(1, "%s: Could not init md5 shash\n", __func__);
75 goto symlink_hash_err;
76 }
77 crypto_shash_update(&sdescmd5->shash, link_str, link_len);
78 rc = crypto_shash_final(&sdescmd5->shash, md5_hash);
79
80symlink_hash_err:
81 crypto_free_shash(md5);
82 kfree(sdescmd5);
83
84 return rc;
85}
86
87static int
50CIFSParseMFSymlink(const u8 *buf, 88CIFSParseMFSymlink(const u8 *buf,
51 unsigned int buf_len, 89 unsigned int buf_len,
52 unsigned int *_link_len, 90 unsigned int *_link_len,
@@ -56,7 +94,6 @@ CIFSParseMFSymlink(const u8 *buf,
56 unsigned int link_len; 94 unsigned int link_len;
57 const char *md5_str1; 95 const char *md5_str1;
58 const char *link_str; 96 const char *link_str;
59 struct MD5Context md5_ctx;
60 u8 md5_hash[16]; 97 u8 md5_hash[16];
61 char md5_str2[34]; 98 char md5_str2[34];
62 99
@@ -70,9 +107,11 @@ CIFSParseMFSymlink(const u8 *buf,
70 if (rc != 1) 107 if (rc != 1)
71 return -EINVAL; 108 return -EINVAL;
72 109
73 cifs_MD5_init(&md5_ctx); 110 rc = symlink_hash(link_len, link_str, md5_hash);
74 cifs_MD5_update(&md5_ctx, (const u8 *)link_str, link_len); 111 if (rc) {
75 cifs_MD5_final(md5_hash, &md5_ctx); 112 cFYI(1, "%s: MD5 hash failure: %d\n", __func__, rc);
113 return rc;
114 }
76 115
77 snprintf(md5_str2, sizeof(md5_str2), 116 snprintf(md5_str2, sizeof(md5_str2),
78 CIFS_MF_SYMLINK_MD5_FORMAT, 117 CIFS_MF_SYMLINK_MD5_FORMAT,
@@ -94,9 +133,9 @@ CIFSParseMFSymlink(const u8 *buf,
94static int 133static int
95CIFSFormatMFSymlink(u8 *buf, unsigned int buf_len, const char *link_str) 134CIFSFormatMFSymlink(u8 *buf, unsigned int buf_len, const char *link_str)
96{ 135{
136 int rc;
97 unsigned int link_len; 137 unsigned int link_len;
98 unsigned int ofs; 138 unsigned int ofs;
99 struct MD5Context md5_ctx;
100 u8 md5_hash[16]; 139 u8 md5_hash[16];
101 140
102 if (buf_len != CIFS_MF_SYMLINK_FILE_SIZE) 141 if (buf_len != CIFS_MF_SYMLINK_FILE_SIZE)
@@ -107,9 +146,11 @@ CIFSFormatMFSymlink(u8 *buf, unsigned int buf_len, const char *link_str)
107 if (link_len > CIFS_MF_SYMLINK_LINK_MAXLEN) 146 if (link_len > CIFS_MF_SYMLINK_LINK_MAXLEN)
108 return -ENAMETOOLONG; 147 return -ENAMETOOLONG;
109 148
110 cifs_MD5_init(&md5_ctx); 149 rc = symlink_hash(link_len, link_str, md5_hash);
111 cifs_MD5_update(&md5_ctx, (const u8 *)link_str, link_len); 150 if (rc) {
112 cifs_MD5_final(md5_hash, &md5_ctx); 151 cFYI(1, "%s: MD5 hash failure: %d\n", __func__, rc);
152 return rc;
153 }
113 154
114 snprintf(buf, buf_len, 155 snprintf(buf, buf_len,
115 CIFS_MF_SYMLINK_LEN_FORMAT CIFS_MF_SYMLINK_MD5_FORMAT, 156 CIFS_MF_SYMLINK_LEN_FORMAT CIFS_MF_SYMLINK_MD5_FORMAT,
diff --git a/fs/cifs/md4.c b/fs/cifs/md4.c
deleted file mode 100644
index a725c2609d67..000000000000
--- a/fs/cifs/md4.c
+++ /dev/null
@@ -1,205 +0,0 @@
1/*
2 Unix SMB/Netbios implementation.
3 Version 1.9.
4 a implementation of MD4 designed for use in the SMB authentication protocol
5 Copyright (C) Andrew Tridgell 1997-1998.
6 Modified by Steve French (sfrench@us.ibm.com) 2002-2003
7
8 This program is free software; you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 2 of the License, or
11 (at your option) any later version.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program; if not, write to the Free Software
20 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
21*/
22#include <linux/module.h>
23#include <linux/fs.h>
24#include "cifsencrypt.h"
25
26/* NOTE: This code makes no attempt to be fast! */
27
28static __u32
29F(__u32 X, __u32 Y, __u32 Z)
30{
31 return (X & Y) | ((~X) & Z);
32}
33
34static __u32
35G(__u32 X, __u32 Y, __u32 Z)
36{
37 return (X & Y) | (X & Z) | (Y & Z);
38}
39
40static __u32
41H(__u32 X, __u32 Y, __u32 Z)
42{
43 return X ^ Y ^ Z;
44}
45
46static __u32
47lshift(__u32 x, int s)
48{
49 x &= 0xFFFFFFFF;
50 return ((x << s) & 0xFFFFFFFF) | (x >> (32 - s));
51}
52
53#define ROUND1(a,b,c,d,k,s) (*a) = lshift((*a) + F(*b,*c,*d) + X[k], s)
54#define ROUND2(a,b,c,d,k,s) (*a) = lshift((*a) + G(*b,*c,*d) + X[k] + (__u32)0x5A827999,s)
55#define ROUND3(a,b,c,d,k,s) (*a) = lshift((*a) + H(*b,*c,*d) + X[k] + (__u32)0x6ED9EBA1,s)
56
57/* this applies md4 to 64 byte chunks */
58static void
59mdfour64(__u32 *M, __u32 *A, __u32 *B, __u32 *C, __u32 *D)
60{
61 int j;
62 __u32 AA, BB, CC, DD;
63 __u32 X[16];
64
65
66 for (j = 0; j < 16; j++)
67 X[j] = M[j];
68
69 AA = *A;
70 BB = *B;
71 CC = *C;
72 DD = *D;
73
74 ROUND1(A, B, C, D, 0, 3);
75 ROUND1(D, A, B, C, 1, 7);
76 ROUND1(C, D, A, B, 2, 11);
77 ROUND1(B, C, D, A, 3, 19);
78 ROUND1(A, B, C, D, 4, 3);
79 ROUND1(D, A, B, C, 5, 7);
80 ROUND1(C, D, A, B, 6, 11);
81 ROUND1(B, C, D, A, 7, 19);
82 ROUND1(A, B, C, D, 8, 3);
83 ROUND1(D, A, B, C, 9, 7);
84 ROUND1(C, D, A, B, 10, 11);
85 ROUND1(B, C, D, A, 11, 19);
86 ROUND1(A, B, C, D, 12, 3);
87 ROUND1(D, A, B, C, 13, 7);
88 ROUND1(C, D, A, B, 14, 11);
89 ROUND1(B, C, D, A, 15, 19);
90
91 ROUND2(A, B, C, D, 0, 3);
92 ROUND2(D, A, B, C, 4, 5);
93 ROUND2(C, D, A, B, 8, 9);
94 ROUND2(B, C, D, A, 12, 13);
95 ROUND2(A, B, C, D, 1, 3);
96 ROUND2(D, A, B, C, 5, 5);
97 ROUND2(C, D, A, B, 9, 9);
98 ROUND2(B, C, D, A, 13, 13);
99 ROUND2(A, B, C, D, 2, 3);
100 ROUND2(D, A, B, C, 6, 5);
101 ROUND2(C, D, A, B, 10, 9);
102 ROUND2(B, C, D, A, 14, 13);
103 ROUND2(A, B, C, D, 3, 3);
104 ROUND2(D, A, B, C, 7, 5);
105 ROUND2(C, D, A, B, 11, 9);
106 ROUND2(B, C, D, A, 15, 13);
107
108 ROUND3(A, B, C, D, 0, 3);
109 ROUND3(D, A, B, C, 8, 9);
110 ROUND3(C, D, A, B, 4, 11);
111 ROUND3(B, C, D, A, 12, 15);
112 ROUND3(A, B, C, D, 2, 3);
113 ROUND3(D, A, B, C, 10, 9);
114 ROUND3(C, D, A, B, 6, 11);
115 ROUND3(B, C, D, A, 14, 15);
116 ROUND3(A, B, C, D, 1, 3);
117 ROUND3(D, A, B, C, 9, 9);
118 ROUND3(C, D, A, B, 5, 11);
119 ROUND3(B, C, D, A, 13, 15);
120 ROUND3(A, B, C, D, 3, 3);
121 ROUND3(D, A, B, C, 11, 9);
122 ROUND3(C, D, A, B, 7, 11);
123 ROUND3(B, C, D, A, 15, 15);
124
125 *A += AA;
126 *B += BB;
127 *C += CC;
128 *D += DD;
129
130 *A &= 0xFFFFFFFF;
131 *B &= 0xFFFFFFFF;
132 *C &= 0xFFFFFFFF;
133 *D &= 0xFFFFFFFF;
134
135 for (j = 0; j < 16; j++)
136 X[j] = 0;
137}
138
139static void
140copy64(__u32 *M, unsigned char *in)
141{
142 int i;
143
144 for (i = 0; i < 16; i++)
145 M[i] = (in[i * 4 + 3] << 24) | (in[i * 4 + 2] << 16) |
146 (in[i * 4 + 1] << 8) | (in[i * 4 + 0] << 0);
147}
148
149static void
150copy4(unsigned char *out, __u32 x)
151{
152 out[0] = x & 0xFF;
153 out[1] = (x >> 8) & 0xFF;
154 out[2] = (x >> 16) & 0xFF;
155 out[3] = (x >> 24) & 0xFF;
156}
157
158/* produce a md4 message digest from data of length n bytes */
159void
160mdfour(unsigned char *out, unsigned char *in, int n)
161{
162 unsigned char buf[128];
163 __u32 M[16];
164 __u32 b = n * 8;
165 int i;
166 __u32 A = 0x67452301;
167 __u32 B = 0xefcdab89;
168 __u32 C = 0x98badcfe;
169 __u32 D = 0x10325476;
170
171 while (n > 64) {
172 copy64(M, in);
173 mdfour64(M, &A, &B, &C, &D);
174 in += 64;
175 n -= 64;
176 }
177
178 for (i = 0; i < 128; i++)
179 buf[i] = 0;
180 memcpy(buf, in, n);
181 buf[n] = 0x80;
182
183 if (n <= 55) {
184 copy4(buf + 56, b);
185 copy64(M, buf);
186 mdfour64(M, &A, &B, &C, &D);
187 } else {
188 copy4(buf + 120, b);
189 copy64(M, buf);
190 mdfour64(M, &A, &B, &C, &D);
191 copy64(M, buf + 64);
192 mdfour64(M, &A, &B, &C, &D);
193 }
194
195 for (i = 0; i < 128; i++)
196 buf[i] = 0;
197 copy64(M, buf);
198
199 copy4(out, A);
200 copy4(out + 4, B);
201 copy4(out + 8, C);
202 copy4(out + 12, D);
203
204 A = B = C = D = 0;
205}
diff --git a/fs/cifs/md5.c b/fs/cifs/md5.c
deleted file mode 100644
index 98b66a54c319..000000000000
--- a/fs/cifs/md5.c
+++ /dev/null
@@ -1,366 +0,0 @@
1/*
2 * This code implements the MD5 message-digest algorithm.
3 * The algorithm is due to Ron Rivest. This code was
4 * written by Colin Plumb in 1993, no copyright is claimed.
5 * This code is in the public domain; do with it what you wish.
6 *
7 * Equivalent code is available from RSA Data Security, Inc.
8 * This code has been tested against that, and is equivalent,
9 * except that you don't need to include two pages of legalese
10 * with every copy.
11 *
12 * To compute the message digest of a chunk of bytes, declare an
13 * MD5Context structure, pass it to cifs_MD5_init, call cifs_MD5_update as
14 * needed on buffers full of bytes, and then call cifs_MD5_final, which
15 * will fill a supplied 16-byte array with the digest.
16 */
17
18/* This code slightly modified to fit into Samba by
19 abartlet@samba.org Jun 2001
20 and to fit the cifs vfs by
21 Steve French sfrench@us.ibm.com */
22
23#include <linux/string.h>
24#include "md5.h"
25
26static void MD5Transform(__u32 buf[4], __u32 const in[16]);
27
28/*
29 * Note: this code is harmless on little-endian machines.
30 */
31static void
32byteReverse(unsigned char *buf, unsigned longs)
33{
34 __u32 t;
35 do {
36 t = (__u32) ((unsigned) buf[3] << 8 | buf[2]) << 16 |
37 ((unsigned) buf[1] << 8 | buf[0]);
38 *(__u32 *) buf = t;
39 buf += 4;
40 } while (--longs);
41}
42
43/*
44 * Start MD5 accumulation. Set bit count to 0 and buffer to mysterious
45 * initialization constants.
46 */
47void
48cifs_MD5_init(struct MD5Context *ctx)
49{
50 ctx->buf[0] = 0x67452301;
51 ctx->buf[1] = 0xefcdab89;
52 ctx->buf[2] = 0x98badcfe;
53 ctx->buf[3] = 0x10325476;
54
55 ctx->bits[0] = 0;
56 ctx->bits[1] = 0;
57}
58
59/*
60 * Update context to reflect the concatenation of another buffer full
61 * of bytes.
62 */
63void
64cifs_MD5_update(struct MD5Context *ctx, unsigned char const *buf, unsigned len)
65{
66 register __u32 t;
67
68 /* Update bitcount */
69
70 t = ctx->bits[0];
71 if ((ctx->bits[0] = t + ((__u32) len << 3)) < t)
72 ctx->bits[1]++; /* Carry from low to high */
73 ctx->bits[1] += len >> 29;
74
75 t = (t >> 3) & 0x3f; /* Bytes already in shsInfo->data */
76
77 /* Handle any leading odd-sized chunks */
78
79 if (t) {
80 unsigned char *p = (unsigned char *) ctx->in + t;
81
82 t = 64 - t;
83 if (len < t) {
84 memmove(p, buf, len);
85 return;
86 }
87 memmove(p, buf, t);
88 byteReverse(ctx->in, 16);
89 MD5Transform(ctx->buf, (__u32 *) ctx->in);
90 buf += t;
91 len -= t;
92 }
93 /* Process data in 64-byte chunks */
94
95 while (len >= 64) {
96 memmove(ctx->in, buf, 64);
97 byteReverse(ctx->in, 16);
98 MD5Transform(ctx->buf, (__u32 *) ctx->in);
99 buf += 64;
100 len -= 64;
101 }
102
103 /* Handle any remaining bytes of data. */
104
105 memmove(ctx->in, buf, len);
106}
107
108/*
109 * Final wrapup - pad to 64-byte boundary with the bit pattern
110 * 1 0* (64-bit count of bits processed, MSB-first)
111 */
112void
113cifs_MD5_final(unsigned char digest[16], struct MD5Context *ctx)
114{
115 unsigned int count;
116 unsigned char *p;
117
118 /* Compute number of bytes mod 64 */
119 count = (ctx->bits[0] >> 3) & 0x3F;
120
121 /* Set the first char of padding to 0x80. This is safe since there is
122 always at least one byte free */
123 p = ctx->in + count;
124 *p++ = 0x80;
125
126 /* Bytes of padding needed to make 64 bytes */
127 count = 64 - 1 - count;
128
129 /* Pad out to 56 mod 64 */
130 if (count < 8) {
131 /* Two lots of padding: Pad the first block to 64 bytes */
132 memset(p, 0, count);
133 byteReverse(ctx->in, 16);
134 MD5Transform(ctx->buf, (__u32 *) ctx->in);
135
136 /* Now fill the next block with 56 bytes */
137 memset(ctx->in, 0, 56);
138 } else {
139 /* Pad block to 56 bytes */
140 memset(p, 0, count - 8);
141 }
142 byteReverse(ctx->in, 14);
143
144 /* Append length in bits and transform */
145 ((__u32 *) ctx->in)[14] = ctx->bits[0];
146 ((__u32 *) ctx->in)[15] = ctx->bits[1];
147
148 MD5Transform(ctx->buf, (__u32 *) ctx->in);
149 byteReverse((unsigned char *) ctx->buf, 4);
150 memmove(digest, ctx->buf, 16);
151 memset(ctx, 0, sizeof(*ctx)); /* In case it's sensitive */
152}
153
154/* The four core functions - F1 is optimized somewhat */
155
156/* #define F1(x, y, z) (x & y | ~x & z) */
157#define F1(x, y, z) (z ^ (x & (y ^ z)))
158#define F2(x, y, z) F1(z, x, y)
159#define F3(x, y, z) (x ^ y ^ z)
160#define F4(x, y, z) (y ^ (x | ~z))
161
162/* This is the central step in the MD5 algorithm. */
163#define MD5STEP(f, w, x, y, z, data, s) \
164 (w += f(x, y, z) + data, w = w<<s | w>>(32-s), w += x)
165
166/*
167 * The core of the MD5 algorithm, this alters an existing MD5 hash to
168 * reflect the addition of 16 longwords of new data. cifs_MD5_update blocks
169 * the data and converts bytes into longwords for this routine.
170 */
171static void
172MD5Transform(__u32 buf[4], __u32 const in[16])
173{
174 register __u32 a, b, c, d;
175
176 a = buf[0];
177 b = buf[1];
178 c = buf[2];
179 d = buf[3];
180
181 MD5STEP(F1, a, b, c, d, in[0] + 0xd76aa478, 7);
182 MD5STEP(F1, d, a, b, c, in[1] + 0xe8c7b756, 12);
183 MD5STEP(F1, c, d, a, b, in[2] + 0x242070db, 17);
184 MD5STEP(F1, b, c, d, a, in[3] + 0xc1bdceee, 22);
185 MD5STEP(F1, a, b, c, d, in[4] + 0xf57c0faf, 7);
186 MD5STEP(F1, d, a, b, c, in[5] + 0x4787c62a, 12);
187 MD5STEP(F1, c, d, a, b, in[6] + 0xa8304613, 17);
188 MD5STEP(F1, b, c, d, a, in[7] + 0xfd469501, 22);
189 MD5STEP(F1, a, b, c, d, in[8] + 0x698098d8, 7);
190 MD5STEP(F1, d, a, b, c, in[9] + 0x8b44f7af, 12);
191 MD5STEP(F1, c, d, a, b, in[10] + 0xffff5bb1, 17);
192 MD5STEP(F1, b, c, d, a, in[11] + 0x895cd7be, 22);
193 MD5STEP(F1, a, b, c, d, in[12] + 0x6b901122, 7);
194 MD5STEP(F1, d, a, b, c, in[13] + 0xfd987193, 12);
195 MD5STEP(F1, c, d, a, b, in[14] + 0xa679438e, 17);
196 MD5STEP(F1, b, c, d, a, in[15] + 0x49b40821, 22);
197
198 MD5STEP(F2, a, b, c, d, in[1] + 0xf61e2562, 5);
199 MD5STEP(F2, d, a, b, c, in[6] + 0xc040b340, 9);
200 MD5STEP(F2, c, d, a, b, in[11] + 0x265e5a51, 14);
201 MD5STEP(F2, b, c, d, a, in[0] + 0xe9b6c7aa, 20);
202 MD5STEP(F2, a, b, c, d, in[5] + 0xd62f105d, 5);
203 MD5STEP(F2, d, a, b, c, in[10] + 0x02441453, 9);
204 MD5STEP(F2, c, d, a, b, in[15] + 0xd8a1e681, 14);
205 MD5STEP(F2, b, c, d, a, in[4] + 0xe7d3fbc8, 20);
206 MD5STEP(F2, a, b, c, d, in[9] + 0x21e1cde6, 5);
207 MD5STEP(F2, d, a, b, c, in[14] + 0xc33707d6, 9);
208 MD5STEP(F2, c, d, a, b, in[3] + 0xf4d50d87, 14);
209 MD5STEP(F2, b, c, d, a, in[8] + 0x455a14ed, 20);
210 MD5STEP(F2, a, b, c, d, in[13] + 0xa9e3e905, 5);
211 MD5STEP(F2, d, a, b, c, in[2] + 0xfcefa3f8, 9);
212 MD5STEP(F2, c, d, a, b, in[7] + 0x676f02d9, 14);
213 MD5STEP(F2, b, c, d, a, in[12] + 0x8d2a4c8a, 20);
214
215 MD5STEP(F3, a, b, c, d, in[5] + 0xfffa3942, 4);
216 MD5STEP(F3, d, a, b, c, in[8] + 0x8771f681, 11);
217 MD5STEP(F3, c, d, a, b, in[11] + 0x6d9d6122, 16);
218 MD5STEP(F3, b, c, d, a, in[14] + 0xfde5380c, 23);
219 MD5STEP(F3, a, b, c, d, in[1] + 0xa4beea44, 4);
220 MD5STEP(F3, d, a, b, c, in[4] + 0x4bdecfa9, 11);
221 MD5STEP(F3, c, d, a, b, in[7] + 0xf6bb4b60, 16);
222 MD5STEP(F3, b, c, d, a, in[10] + 0xbebfbc70, 23);
223 MD5STEP(F3, a, b, c, d, in[13] + 0x289b7ec6, 4);
224 MD5STEP(F3, d, a, b, c, in[0] + 0xeaa127fa, 11);
225 MD5STEP(F3, c, d, a, b, in[3] + 0xd4ef3085, 16);
226 MD5STEP(F3, b, c, d, a, in[6] + 0x04881d05, 23);
227 MD5STEP(F3, a, b, c, d, in[9] + 0xd9d4d039, 4);
228 MD5STEP(F3, d, a, b, c, in[12] + 0xe6db99e5, 11);
229 MD5STEP(F3, c, d, a, b, in[15] + 0x1fa27cf8, 16);
230 MD5STEP(F3, b, c, d, a, in[2] + 0xc4ac5665, 23);
231
232 MD5STEP(F4, a, b, c, d, in[0] + 0xf4292244, 6);
233 MD5STEP(F4, d, a, b, c, in[7] + 0x432aff97, 10);
234 MD5STEP(F4, c, d, a, b, in[14] + 0xab9423a7, 15);
235 MD5STEP(F4, b, c, d, a, in[5] + 0xfc93a039, 21);
236 MD5STEP(F4, a, b, c, d, in[12] + 0x655b59c3, 6);
237 MD5STEP(F4, d, a, b, c, in[3] + 0x8f0ccc92, 10);
238 MD5STEP(F4, c, d, a, b, in[10] + 0xffeff47d, 15);
239 MD5STEP(F4, b, c, d, a, in[1] + 0x85845dd1, 21);
240 MD5STEP(F4, a, b, c, d, in[8] + 0x6fa87e4f, 6);
241 MD5STEP(F4, d, a, b, c, in[15] + 0xfe2ce6e0, 10);
242 MD5STEP(F4, c, d, a, b, in[6] + 0xa3014314, 15);
243 MD5STEP(F4, b, c, d, a, in[13] + 0x4e0811a1, 21);
244 MD5STEP(F4, a, b, c, d, in[4] + 0xf7537e82, 6);
245 MD5STEP(F4, d, a, b, c, in[11] + 0xbd3af235, 10);
246 MD5STEP(F4, c, d, a, b, in[2] + 0x2ad7d2bb, 15);
247 MD5STEP(F4, b, c, d, a, in[9] + 0xeb86d391, 21);
248
249 buf[0] += a;
250 buf[1] += b;
251 buf[2] += c;
252 buf[3] += d;
253}
254
255#if 0 /* currently unused */
256/***********************************************************************
257 the rfc 2104 version of hmac_md5 initialisation.
258***********************************************************************/
259static void
260hmac_md5_init_rfc2104(unsigned char *key, int key_len,
261 struct HMACMD5Context *ctx)
262{
263 int i;
264
265 /* if key is longer than 64 bytes reset it to key=MD5(key) */
266 if (key_len > 64) {
267 unsigned char tk[16];
268 struct MD5Context tctx;
269
270 cifs_MD5_init(&tctx);
271 cifs_MD5_update(&tctx, key, key_len);
272 cifs_MD5_final(tk, &tctx);
273
274 key = tk;
275 key_len = 16;
276 }
277
278 /* start out by storing key in pads */
279 memset(ctx->k_ipad, 0, sizeof(ctx->k_ipad));
280 memset(ctx->k_opad, 0, sizeof(ctx->k_opad));
281 memcpy(ctx->k_ipad, key, key_len);
282 memcpy(ctx->k_opad, key, key_len);
283
284 /* XOR key with ipad and opad values */
285 for (i = 0; i < 64; i++) {
286 ctx->k_ipad[i] ^= 0x36;
287 ctx->k_opad[i] ^= 0x5c;
288 }
289
290 cifs_MD5_init(&ctx->ctx);
291 cifs_MD5_update(&ctx->ctx, ctx->k_ipad, 64);
292}
293#endif
294
295/***********************************************************************
296 the microsoft version of hmac_md5 initialisation.
297***********************************************************************/
298void
299hmac_md5_init_limK_to_64(const unsigned char *key, int key_len,
300 struct HMACMD5Context *ctx)
301{
302 int i;
303
304 /* if key is longer than 64 bytes truncate it */
305 if (key_len > 64)
306 key_len = 64;
307
308 /* start out by storing key in pads */
309 memset(ctx->k_ipad, 0, sizeof(ctx->k_ipad));
310 memset(ctx->k_opad, 0, sizeof(ctx->k_opad));
311 memcpy(ctx->k_ipad, key, key_len);
312 memcpy(ctx->k_opad, key, key_len);
313
314 /* XOR key with ipad and opad values */
315 for (i = 0; i < 64; i++) {
316 ctx->k_ipad[i] ^= 0x36;
317 ctx->k_opad[i] ^= 0x5c;
318 }
319
320 cifs_MD5_init(&ctx->ctx);
321 cifs_MD5_update(&ctx->ctx, ctx->k_ipad, 64);
322}
323
324/***********************************************************************
325 update hmac_md5 "inner" buffer
326***********************************************************************/
327void
328hmac_md5_update(const unsigned char *text, int text_len,
329 struct HMACMD5Context *ctx)
330{
331 cifs_MD5_update(&ctx->ctx, text, text_len); /* then text of datagram */
332}
333
334/***********************************************************************
335 finish off hmac_md5 "inner" buffer and generate outer one.
336***********************************************************************/
337void
338hmac_md5_final(unsigned char *digest, struct HMACMD5Context *ctx)
339{
340 struct MD5Context ctx_o;
341
342 cifs_MD5_final(digest, &ctx->ctx);
343
344 cifs_MD5_init(&ctx_o);
345 cifs_MD5_update(&ctx_o, ctx->k_opad, 64);
346 cifs_MD5_update(&ctx_o, digest, 16);
347 cifs_MD5_final(digest, &ctx_o);
348}
349
350/***********************************************************
351 single function to calculate an HMAC MD5 digest from data.
352 use the microsoft hmacmd5 init method because the key is 16 bytes.
353************************************************************/
354#if 0 /* currently unused */
355static void
356hmac_md5(unsigned char key[16], unsigned char *data, int data_len,
357 unsigned char *digest)
358{
359 struct HMACMD5Context ctx;
360 hmac_md5_init_limK_to_64(key, 16, &ctx);
361 if (data_len != 0)
362 hmac_md5_update(data, data_len, &ctx);
363
364 hmac_md5_final(digest, &ctx);
365}
366#endif
diff --git a/fs/cifs/md5.h b/fs/cifs/md5.h
deleted file mode 100644
index 6fba8cb402fd..000000000000
--- a/fs/cifs/md5.h
+++ /dev/null
@@ -1,38 +0,0 @@
1#ifndef MD5_H
2#define MD5_H
3#ifndef HEADER_MD5_H
4/* Try to avoid clashes with OpenSSL */
5#define HEADER_MD5_H
6#endif
7
8struct MD5Context {
9 __u32 buf[4];
10 __u32 bits[2];
11 unsigned char in[64];
12};
13#endif /* !MD5_H */
14
15#ifndef _HMAC_MD5_H
16struct HMACMD5Context {
17 struct MD5Context ctx;
18 unsigned char k_ipad[65];
19 unsigned char k_opad[65];
20};
21#endif /* _HMAC_MD5_H */
22
23void cifs_MD5_init(struct MD5Context *context);
24void cifs_MD5_update(struct MD5Context *context, unsigned char const *buf,
25 unsigned len);
26void cifs_MD5_final(unsigned char digest[16], struct MD5Context *context);
27
28/* The following definitions come from lib/hmacmd5.c */
29
30/* void hmac_md5_init_rfc2104(unsigned char *key, int key_len,
31 struct HMACMD5Context *ctx);*/
32void hmac_md5_init_limK_to_64(const unsigned char *key, int key_len,
33 struct HMACMD5Context *ctx);
34void hmac_md5_update(const unsigned char *text, int text_len,
35 struct HMACMD5Context *ctx);
36void hmac_md5_final(unsigned char *digest, struct HMACMD5Context *ctx);
37/* void hmac_md5(unsigned char key[16], unsigned char *data, int data_len,
38 unsigned char *digest);*/
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index a09e077ba925..2a930a752a78 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -236,10 +236,7 @@ __u16 GetNextMid(struct TCP_Server_Info *server)
236{ 236{
237 __u16 mid = 0; 237 __u16 mid = 0;
238 __u16 last_mid; 238 __u16 last_mid;
239 int collision; 239 bool collision;
240
241 if (server == NULL)
242 return mid;
243 240
244 spin_lock(&GlobalMid_Lock); 241 spin_lock(&GlobalMid_Lock);
245 last_mid = server->CurrentMid; /* we do not want to loop forever */ 242 last_mid = server->CurrentMid; /* we do not want to loop forever */
@@ -252,24 +249,38 @@ __u16 GetNextMid(struct TCP_Server_Info *server)
252 (and it would also have to have been a request that 249 (and it would also have to have been a request that
253 did not time out) */ 250 did not time out) */
254 while (server->CurrentMid != last_mid) { 251 while (server->CurrentMid != last_mid) {
255 struct list_head *tmp;
256 struct mid_q_entry *mid_entry; 252 struct mid_q_entry *mid_entry;
253 unsigned int num_mids;
257 254
258 collision = 0; 255 collision = false;
259 if (server->CurrentMid == 0) 256 if (server->CurrentMid == 0)
260 server->CurrentMid++; 257 server->CurrentMid++;
261 258
262 list_for_each(tmp, &server->pending_mid_q) { 259 num_mids = 0;
263 mid_entry = list_entry(tmp, struct mid_q_entry, qhead); 260 list_for_each_entry(mid_entry, &server->pending_mid_q, qhead) {
264 261 ++num_mids;
265 if ((mid_entry->mid == server->CurrentMid) && 262 if (mid_entry->mid == server->CurrentMid &&
266 (mid_entry->midState == MID_REQUEST_SUBMITTED)) { 263 mid_entry->midState == MID_REQUEST_SUBMITTED) {
267 /* This mid is in use, try a different one */ 264 /* This mid is in use, try a different one */
268 collision = 1; 265 collision = true;
269 break; 266 break;
270 } 267 }
271 } 268 }
272 if (collision == 0) { 269
270 /*
271 * if we have more than 32k mids in the list, then something
272 * is very wrong. Possibly a local user is trying to DoS the
273 * box by issuing long-running calls and SIGKILL'ing them. If
274 * we get to 2^16 mids then we're in big trouble as this
275 * function could loop forever.
276 *
277 * Go ahead and assign out the mid in this situation, but force
278 * an eventual reconnect to clean out the pending_mid_q.
279 */
280 if (num_mids > 32768)
281 server->tcpStatus = CifsNeedReconnect;
282
283 if (!collision) {
273 mid = server->CurrentMid; 284 mid = server->CurrentMid;
274 break; 285 break;
275 } 286 }
@@ -381,29 +392,31 @@ header_assemble(struct smb_hdr *buffer, char smb_command /* command */ ,
381} 392}
382 393
383static int 394static int
384checkSMBhdr(struct smb_hdr *smb, __u16 mid) 395check_smb_hdr(struct smb_hdr *smb, __u16 mid)
385{ 396{
386 /* Make sure that this really is an SMB, that it is a response, 397 /* does it have the right SMB "signature" ? */
387 and that the message ids match */ 398 if (*(__le32 *) smb->Protocol != cpu_to_le32(0x424d53ff)) {
388 if ((*(__le32 *) smb->Protocol == cpu_to_le32(0x424d53ff)) && 399 cERROR(1, "Bad protocol string signature header 0x%x",
389 (mid == smb->Mid)) { 400 *(unsigned int *)smb->Protocol);
390 if (smb->Flags & SMBFLG_RESPONSE) 401 return 1;
391 return 0; 402 }
392 else { 403
393 /* only one valid case where server sends us request */ 404 /* Make sure that message ids match */
394 if (smb->Command == SMB_COM_LOCKING_ANDX) 405 if (mid != smb->Mid) {
395 return 0; 406 cERROR(1, "Mids do not match. received=%u expected=%u",
396 else 407 smb->Mid, mid);
397 cERROR(1, "Received Request not response"); 408 return 1;
398 }
399 } else { /* bad signature or mid */
400 if (*(__le32 *) smb->Protocol != cpu_to_le32(0x424d53ff))
401 cERROR(1, "Bad protocol string signature header %x",
402 *(unsigned int *) smb->Protocol);
403 if (mid != smb->Mid)
404 cERROR(1, "Mids do not match");
405 } 409 }
406 cERROR(1, "bad smb detected. The Mid=%d", smb->Mid); 410
411 /* if it's a response then accept */
412 if (smb->Flags & SMBFLG_RESPONSE)
413 return 0;
414
415 /* only one valid case where server sends us request */
416 if (smb->Command == SMB_COM_LOCKING_ANDX)
417 return 0;
418
419 cERROR(1, "Server sent request, not response. mid=%u", smb->Mid);
407 return 1; 420 return 1;
408} 421}
409 422
@@ -448,7 +461,7 @@ checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length)
448 return 1; 461 return 1;
449 } 462 }
450 463
451 if (checkSMBhdr(smb, mid)) 464 if (check_smb_hdr(smb, mid))
452 return 1; 465 return 1;
453 clc_len = smbCalcSize_LE(smb); 466 clc_len = smbCalcSize_LE(smb);
454 467
@@ -465,25 +478,26 @@ checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length)
465 if (((4 + len) & 0xFFFF) == (clc_len & 0xFFFF)) 478 if (((4 + len) & 0xFFFF) == (clc_len & 0xFFFF))
466 return 0; /* bcc wrapped */ 479 return 0; /* bcc wrapped */
467 } 480 }
468 cFYI(1, "Calculated size %d vs length %d mismatch for mid %d", 481 cFYI(1, "Calculated size %u vs length %u mismatch for mid=%u",
469 clc_len, 4 + len, smb->Mid); 482 clc_len, 4 + len, smb->Mid);
470 /* Windows XP can return a few bytes too much, presumably 483
471 an illegal pad, at the end of byte range lock responses 484 if (4 + len < clc_len) {
472 so we allow for that three byte pad, as long as actual 485 cERROR(1, "RFC1001 size %u smaller than SMB for mid=%u",
473 received length is as long or longer than calculated length */
474 /* We have now had to extend this more, since there is a
475 case in which it needs to be bigger still to handle a
476 malformed response to transact2 findfirst from WinXP when
477 access denied is returned and thus bcc and wct are zero
478 but server says length is 0x21 bytes too long as if the server
479 forget to reset the smb rfc1001 length when it reset the
480 wct and bcc to minimum size and drop the t2 parms and data */
481 if ((4+len > clc_len) && (len <= clc_len + 512))
482 return 0;
483 else {
484 cERROR(1, "RFC1001 size %d bigger than SMB for Mid=%d",
485 len, smb->Mid); 486 len, smb->Mid);
486 return 1; 487 return 1;
488 } else if (len > clc_len + 512) {
489 /*
490 * Some servers (Windows XP in particular) send more
491 * data than the lengths in the SMB packet would
492 * indicate on certain calls (byte range locks and
493 * trans2 find first calls in particular). While the
494 * client can handle such a frame by ignoring the
495 * trailing data, we choose limit the amount of extra
496 * data to 512 bytes.
497 */
498 cERROR(1, "RFC1001 size %u more than 512 bytes larger "
499 "than SMB for mid=%u", len, smb->Mid);
500 return 1;
487 } 501 }
488 } 502 }
489 return 0; 503 return 0;
diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c
index 8d9189f64477..79f641eeda30 100644
--- a/fs/cifs/netmisc.c
+++ b/fs/cifs/netmisc.c
@@ -170,7 +170,7 @@ cifs_convert_address(struct sockaddr *dst, const char *src, int len)
170{ 170{
171 int rc, alen, slen; 171 int rc, alen, slen;
172 const char *pct; 172 const char *pct;
173 char *endp, scope_id[13]; 173 char scope_id[13];
174 struct sockaddr_in *s4 = (struct sockaddr_in *) dst; 174 struct sockaddr_in *s4 = (struct sockaddr_in *) dst;
175 struct sockaddr_in6 *s6 = (struct sockaddr_in6 *) dst; 175 struct sockaddr_in6 *s6 = (struct sockaddr_in6 *) dst;
176 176
@@ -197,9 +197,9 @@ cifs_convert_address(struct sockaddr *dst, const char *src, int len)
197 memcpy(scope_id, pct + 1, slen); 197 memcpy(scope_id, pct + 1, slen);
198 scope_id[slen] = '\0'; 198 scope_id[slen] = '\0';
199 199
200 s6->sin6_scope_id = (u32) simple_strtoul(pct, &endp, 0); 200 rc = strict_strtoul(scope_id, 0,
201 if (endp != scope_id + slen) 201 (unsigned long *)&s6->sin6_scope_id);
202 return 0; 202 rc = (rc == 0) ? 1 : 0;
203 } 203 }
204 204
205 return rc; 205 return rc;
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index 7f25cc3d2256..f8e4cd2a7912 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -764,7 +764,6 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
764{ 764{
765 int rc = 0; 765 int rc = 0;
766 int xid, i; 766 int xid, i;
767 struct cifs_sb_info *cifs_sb;
768 struct cifsTconInfo *pTcon; 767 struct cifsTconInfo *pTcon;
769 struct cifsFileInfo *cifsFile = NULL; 768 struct cifsFileInfo *cifsFile = NULL;
770 char *current_entry; 769 char *current_entry;
@@ -775,8 +774,6 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
775 774
776 xid = GetXid(); 775 xid = GetXid();
777 776
778 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
779
780 /* 777 /*
781 * Ensure FindFirst doesn't fail before doing filldir() for '.' and 778 * Ensure FindFirst doesn't fail before doing filldir() for '.' and
782 * '..'. Otherwise we won't be able to notify VFS in case of failure. 779 * '..'. Otherwise we won't be able to notify VFS in case of failure.
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index 1adc9625a344..16765703131b 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -656,13 +656,13 @@ ssetup_ntlmssp_authenticate:
656 656
657 if (type == LANMAN) { 657 if (type == LANMAN) {
658#ifdef CONFIG_CIFS_WEAK_PW_HASH 658#ifdef CONFIG_CIFS_WEAK_PW_HASH
659 char lnm_session_key[CIFS_SESS_KEY_SIZE]; 659 char lnm_session_key[CIFS_AUTH_RESP_SIZE];
660 660
661 pSMB->req.hdr.Flags2 &= ~SMBFLG2_UNICODE; 661 pSMB->req.hdr.Flags2 &= ~SMBFLG2_UNICODE;
662 662
663 /* no capabilities flags in old lanman negotiation */ 663 /* no capabilities flags in old lanman negotiation */
664 664
665 pSMB->old_req.PasswordLength = cpu_to_le16(CIFS_SESS_KEY_SIZE); 665 pSMB->old_req.PasswordLength = cpu_to_le16(CIFS_AUTH_RESP_SIZE);
666 666
667 /* Calculate hash with password and copy into bcc_ptr. 667 /* Calculate hash with password and copy into bcc_ptr.
668 * Encryption Key (stored as in cryptkey) gets used if the 668 * Encryption Key (stored as in cryptkey) gets used if the
@@ -675,8 +675,8 @@ ssetup_ntlmssp_authenticate:
675 true : false, lnm_session_key); 675 true : false, lnm_session_key);
676 676
677 ses->flags |= CIFS_SES_LANMAN; 677 ses->flags |= CIFS_SES_LANMAN;
678 memcpy(bcc_ptr, (char *)lnm_session_key, CIFS_SESS_KEY_SIZE); 678 memcpy(bcc_ptr, (char *)lnm_session_key, CIFS_AUTH_RESP_SIZE);
679 bcc_ptr += CIFS_SESS_KEY_SIZE; 679 bcc_ptr += CIFS_AUTH_RESP_SIZE;
680 680
681 /* can not sign if LANMAN negotiated so no need 681 /* can not sign if LANMAN negotiated so no need
682 to calculate signing key? but what if server 682 to calculate signing key? but what if server
diff --git a/fs/cifs/smbdes.c b/fs/cifs/smbdes.c
index b6b6dcb500bf..04721485925d 100644
--- a/fs/cifs/smbdes.c
+++ b/fs/cifs/smbdes.c
@@ -45,7 +45,6 @@
45 up with a different answer to the one above) 45 up with a different answer to the one above)
46*/ 46*/
47#include <linux/slab.h> 47#include <linux/slab.h>
48#include "cifsencrypt.h"
49#define uchar unsigned char 48#define uchar unsigned char
50 49
51static uchar perm1[56] = { 57, 49, 41, 33, 25, 17, 9, 50static uchar perm1[56] = { 57, 49, 41, 33, 25, 17, 9,
diff --git a/fs/cifs/smbencrypt.c b/fs/cifs/smbencrypt.c
index 192ea51af20f..b5041c849981 100644
--- a/fs/cifs/smbencrypt.c
+++ b/fs/cifs/smbencrypt.c
@@ -32,9 +32,8 @@
32#include "cifs_unicode.h" 32#include "cifs_unicode.h"
33#include "cifspdu.h" 33#include "cifspdu.h"
34#include "cifsglob.h" 34#include "cifsglob.h"
35#include "md5.h"
36#include "cifs_debug.h" 35#include "cifs_debug.h"
37#include "cifsencrypt.h" 36#include "cifsproto.h"
38 37
39#ifndef false 38#ifndef false
40#define false 0 39#define false 0
@@ -48,14 +47,58 @@
48#define SSVALX(buf,pos,val) (CVAL(buf,pos)=(val)&0xFF,CVAL(buf,pos+1)=(val)>>8) 47#define SSVALX(buf,pos,val) (CVAL(buf,pos)=(val)&0xFF,CVAL(buf,pos+1)=(val)>>8)
49#define SSVAL(buf,pos,val) SSVALX((buf),(pos),((__u16)(val))) 48#define SSVAL(buf,pos,val) SSVALX((buf),(pos),((__u16)(val)))
50 49
51/*The following definitions come from libsmb/smbencrypt.c */ 50/* produce a md4 message digest from data of length n bytes */
51int
52mdfour(unsigned char *md4_hash, unsigned char *link_str, int link_len)
53{
54 int rc;
55 unsigned int size;
56 struct crypto_shash *md4;
57 struct sdesc *sdescmd4;
58
59 md4 = crypto_alloc_shash("md4", 0, 0);
60 if (IS_ERR(md4)) {
61 rc = PTR_ERR(md4);
62 cERROR(1, "%s: Crypto md4 allocation error %d\n", __func__, rc);
63 return rc;
64 }
65 size = sizeof(struct shash_desc) + crypto_shash_descsize(md4);
66 sdescmd4 = kmalloc(size, GFP_KERNEL);
67 if (!sdescmd4) {
68 rc = -ENOMEM;
69 cERROR(1, "%s: Memory allocation failure\n", __func__);
70 goto mdfour_err;
71 }
72 sdescmd4->shash.tfm = md4;
73 sdescmd4->shash.flags = 0x0;
74
75 rc = crypto_shash_init(&sdescmd4->shash);
76 if (rc) {
77 cERROR(1, "%s: Could not init md4 shash\n", __func__);
78 goto mdfour_err;
79 }
80 crypto_shash_update(&sdescmd4->shash, link_str, link_len);
81 rc = crypto_shash_final(&sdescmd4->shash, md4_hash);
52 82
53void SMBencrypt(unsigned char *passwd, const unsigned char *c8, 83mdfour_err:
54 unsigned char *p24); 84 crypto_free_shash(md4);
55void E_md4hash(const unsigned char *passwd, unsigned char *p16); 85 kfree(sdescmd4);
56static void SMBOWFencrypt(unsigned char passwd[16], const unsigned char *c8, 86
57 unsigned char p24[24]); 87 return rc;
58void SMBNTencrypt(unsigned char *passwd, unsigned char *c8, unsigned char *p24); 88}
89
90/* Does the des encryption from the NT or LM MD4 hash. */
91static void
92SMBOWFencrypt(unsigned char passwd[16], const unsigned char *c8,
93 unsigned char p24[24])
94{
95 unsigned char p21[21];
96
97 memset(p21, '\0', 21);
98
99 memcpy(p21, passwd, 16);
100 E_P24(p21, c8, p24);
101}
59 102
60/* 103/*
61 This implements the X/Open SMB password encryption 104 This implements the X/Open SMB password encryption
@@ -118,9 +161,10 @@ _my_mbstowcs(__u16 *dst, const unsigned char *src, int len)
118 * Creates the MD4 Hash of the users password in NT UNICODE. 161 * Creates the MD4 Hash of the users password in NT UNICODE.
119 */ 162 */
120 163
121void 164int
122E_md4hash(const unsigned char *passwd, unsigned char *p16) 165E_md4hash(const unsigned char *passwd, unsigned char *p16)
123{ 166{
167 int rc;
124 int len; 168 int len;
125 __u16 wpwd[129]; 169 __u16 wpwd[129];
126 170
@@ -139,8 +183,10 @@ E_md4hash(const unsigned char *passwd, unsigned char *p16)
139 /* Calculate length in bytes */ 183 /* Calculate length in bytes */
140 len = _my_wcslen(wpwd) * sizeof(__u16); 184 len = _my_wcslen(wpwd) * sizeof(__u16);
141 185
142 mdfour(p16, (unsigned char *) wpwd, len); 186 rc = mdfour(p16, (unsigned char *) wpwd, len);
143 memset(wpwd, 0, 129 * 2); 187 memset(wpwd, 0, 129 * 2);
188
189 return rc;
144} 190}
145 191
146#if 0 /* currently unused */ 192#if 0 /* currently unused */
@@ -212,19 +258,6 @@ ntv2_owf_gen(const unsigned char owf[16], const char *user_n,
212} 258}
213#endif 259#endif
214 260
215/* Does the des encryption from the NT or LM MD4 hash. */
216static void
217SMBOWFencrypt(unsigned char passwd[16], const unsigned char *c8,
218 unsigned char p24[24])
219{
220 unsigned char p21[21];
221
222 memset(p21, '\0', 21);
223
224 memcpy(p21, passwd, 16);
225 E_P24(p21, c8, p24);
226}
227
228/* Does the des encryption from the FIRST 8 BYTES of the NT or LM MD4 hash. */ 261/* Does the des encryption from the FIRST 8 BYTES of the NT or LM MD4 hash. */
229#if 0 /* currently unused */ 262#if 0 /* currently unused */
230static void 263static void
@@ -242,16 +275,21 @@ NTLMSSPOWFencrypt(unsigned char passwd[8],
242#endif 275#endif
243 276
244/* Does the NT MD4 hash then des encryption. */ 277/* Does the NT MD4 hash then des encryption. */
245 278int
246void
247SMBNTencrypt(unsigned char *passwd, unsigned char *c8, unsigned char *p24) 279SMBNTencrypt(unsigned char *passwd, unsigned char *c8, unsigned char *p24)
248{ 280{
281 int rc;
249 unsigned char p21[21]; 282 unsigned char p21[21];
250 283
251 memset(p21, '\0', 21); 284 memset(p21, '\0', 21);
252 285
253 E_md4hash(passwd, p21); 286 rc = E_md4hash(passwd, p21);
287 if (rc) {
288 cFYI(1, "%s Can't generate NT hash, error: %d", __func__, rc);
289 return rc;
290 }
254 SMBOWFencrypt(p21, c8, p24); 291 SMBOWFencrypt(p21, c8, p24);
292 return rc;
255} 293}
256 294
257 295
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index c1ccca1a933f..46d8756f2b24 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -236,9 +236,9 @@ smb_sendv(struct TCP_Server_Info *server, struct kvec *iov, int n_vec)
236 server->tcpStatus = CifsNeedReconnect; 236 server->tcpStatus = CifsNeedReconnect;
237 } 237 }
238 238
239 if (rc < 0) { 239 if (rc < 0 && rc != -EINTR)
240 cERROR(1, "Error %d sending data on socket to server", rc); 240 cERROR(1, "Error %d sending data on socket to server", rc);
241 } else 241 else
242 rc = 0; 242 rc = 0;
243 243
244 /* Don't want to modify the buffer as a 244 /* Don't want to modify the buffer as a
@@ -359,6 +359,10 @@ cifs_call_async(struct TCP_Server_Info *server, struct smb_hdr *in_buf,
359 if (rc) 359 if (rc)
360 return rc; 360 return rc;
361 361
362 /* enable signing if server requires it */
363 if (server->secMode & (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
364 in_buf->Flags2 |= SMBFLG2_SECURITY_SIGNATURE;
365
362 mutex_lock(&server->srv_mutex); 366 mutex_lock(&server->srv_mutex);
363 mid = AllocMidQEntry(in_buf, server); 367 mid = AllocMidQEntry(in_buf, server);
364 if (mid == NULL) { 368 if (mid == NULL) {
@@ -453,6 +457,9 @@ sync_mid_result(struct mid_q_entry *mid, struct TCP_Server_Info *server)
453 case MID_RETRY_NEEDED: 457 case MID_RETRY_NEEDED:
454 rc = -EAGAIN; 458 rc = -EAGAIN;
455 break; 459 break;
460 case MID_RESPONSE_MALFORMED:
461 rc = -EIO;
462 break;
456 default: 463 default:
457 cERROR(1, "%s: invalid mid state mid=%d state=%d", __func__, 464 cERROR(1, "%s: invalid mid state mid=%d state=%d", __func__,
458 mid->mid, mid->midState); 465 mid->mid, mid->midState);
@@ -570,17 +577,33 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
570#endif 577#endif
571 578
572 mutex_unlock(&ses->server->srv_mutex); 579 mutex_unlock(&ses->server->srv_mutex);
573 cifs_small_buf_release(in_buf);
574 580
575 if (rc < 0) 581 if (rc < 0) {
582 cifs_small_buf_release(in_buf);
576 goto out; 583 goto out;
584 }
577 585
578 if (long_op == CIFS_ASYNC_OP) 586 if (long_op == CIFS_ASYNC_OP) {
587 cifs_small_buf_release(in_buf);
579 goto out; 588 goto out;
589 }
580 590
581 rc = wait_for_response(ses->server, midQ); 591 rc = wait_for_response(ses->server, midQ);
582 if (rc != 0) 592 if (rc != 0) {
583 goto out; 593 send_nt_cancel(ses->server, in_buf, midQ);
594 spin_lock(&GlobalMid_Lock);
595 if (midQ->midState == MID_REQUEST_SUBMITTED) {
596 midQ->callback = DeleteMidQEntry;
597 spin_unlock(&GlobalMid_Lock);
598 cifs_small_buf_release(in_buf);
599 atomic_dec(&ses->server->inFlight);
600 wake_up(&ses->server->request_q);
601 return rc;
602 }
603 spin_unlock(&GlobalMid_Lock);
604 }
605
606 cifs_small_buf_release(in_buf);
584 607
585 rc = sync_mid_result(midQ, ses->server); 608 rc = sync_mid_result(midQ, ses->server);
586 if (rc != 0) { 609 if (rc != 0) {
@@ -724,8 +747,19 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
724 goto out; 747 goto out;
725 748
726 rc = wait_for_response(ses->server, midQ); 749 rc = wait_for_response(ses->server, midQ);
727 if (rc != 0) 750 if (rc != 0) {
728 goto out; 751 send_nt_cancel(ses->server, in_buf, midQ);
752 spin_lock(&GlobalMid_Lock);
753 if (midQ->midState == MID_REQUEST_SUBMITTED) {
754 /* no longer considered to be "in-flight" */
755 midQ->callback = DeleteMidQEntry;
756 spin_unlock(&GlobalMid_Lock);
757 atomic_dec(&ses->server->inFlight);
758 wake_up(&ses->server->request_q);
759 return rc;
760 }
761 spin_unlock(&GlobalMid_Lock);
762 }
729 763
730 rc = sync_mid_result(midQ, ses->server); 764 rc = sync_mid_result(midQ, ses->server);
731 if (rc != 0) { 765 if (rc != 0) {
@@ -922,10 +956,21 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
922 } 956 }
923 } 957 }
924 958
925 if (wait_for_response(ses->server, midQ) == 0) { 959 rc = wait_for_response(ses->server, midQ);
926 /* We got the response - restart system call. */ 960 if (rc) {
927 rstart = 1; 961 send_nt_cancel(ses->server, in_buf, midQ);
962 spin_lock(&GlobalMid_Lock);
963 if (midQ->midState == MID_REQUEST_SUBMITTED) {
964 /* no longer considered to be "in-flight" */
965 midQ->callback = DeleteMidQEntry;
966 spin_unlock(&GlobalMid_Lock);
967 return rc;
968 }
969 spin_unlock(&GlobalMid_Lock);
928 } 970 }
971
972 /* We got the response - restart system call. */
973 rstart = 1;
929 } 974 }
930 975
931 rc = sync_mid_result(midQ, ses->server); 976 rc = sync_mid_result(midQ, ses->server);
diff --git a/fs/coda/Makefile b/fs/coda/Makefile
index 6c22e61da397..1bab69a0d347 100644
--- a/fs/coda/Makefile
+++ b/fs/coda/Makefile
@@ -9,4 +9,4 @@ coda-objs := psdev.o cache.o cnode.o inode.o dir.o file.o upcall.o \
9 9
10# If you want debugging output, please uncomment the following line. 10# If you want debugging output, please uncomment the following line.
11 11
12# EXTRA_CFLAGS += -DDEBUG -DDEBUG_SMB_MALLOC=1 12# ccflags-y := -DDEBUG -DDEBUG_SMB_MALLOC=1
diff --git a/fs/coda/sysctl.c b/fs/coda/sysctl.c
index c6405ce3c50e..06d27a41807f 100644
--- a/fs/coda/sysctl.c
+++ b/fs/coda/sysctl.c
@@ -13,7 +13,6 @@
13 13
14#ifdef CONFIG_SYSCTL 14#ifdef CONFIG_SYSCTL
15static struct ctl_table_header *fs_table_header; 15static struct ctl_table_header *fs_table_header;
16#endif
17 16
18static ctl_table coda_table[] = { 17static ctl_table coda_table[] = {
19 { 18 {
@@ -40,7 +39,6 @@ static ctl_table coda_table[] = {
40 {} 39 {}
41}; 40};
42 41
43#ifdef CONFIG_SYSCTL
44static ctl_table fs_table[] = { 42static ctl_table fs_table[] = {
45 { 43 {
46 .procname = "coda", 44 .procname = "coda",
@@ -49,22 +47,18 @@ static ctl_table fs_table[] = {
49 }, 47 },
50 {} 48 {}
51}; 49};
52#endif
53 50
54void coda_sysctl_init(void) 51void coda_sysctl_init(void)
55{ 52{
56#ifdef CONFIG_SYSCTL
57 if ( !fs_table_header ) 53 if ( !fs_table_header )
58 fs_table_header = register_sysctl_table(fs_table); 54 fs_table_header = register_sysctl_table(fs_table);
59#endif
60} 55}
61 56
62void coda_sysctl_clean(void) 57void coda_sysctl_clean(void)
63{ 58{
64#ifdef CONFIG_SYSCTL
65 if ( fs_table_header ) { 59 if ( fs_table_header ) {
66 unregister_sysctl_table(fs_table_header); 60 unregister_sysctl_table(fs_table_header);
67 fs_table_header = NULL; 61 fs_table_header = NULL;
68 } 62 }
69#endif
70} 63}
64#endif
diff --git a/fs/compat.c b/fs/compat.c
index f6fd0a00e6cc..72fe6cda9108 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -262,35 +262,19 @@ static int put_compat_statfs(struct compat_statfs __user *ubuf, struct kstatfs *
262 */ 262 */
263asmlinkage long compat_sys_statfs(const char __user *pathname, struct compat_statfs __user *buf) 263asmlinkage long compat_sys_statfs(const char __user *pathname, struct compat_statfs __user *buf)
264{ 264{
265 struct path path; 265 struct kstatfs tmp;
266 int error; 266 int error = user_statfs(pathname, &tmp);
267 267 if (!error)
268 error = user_path(pathname, &path); 268 error = put_compat_statfs(buf, &tmp);
269 if (!error) {
270 struct kstatfs tmp;
271 error = vfs_statfs(&path, &tmp);
272 if (!error)
273 error = put_compat_statfs(buf, &tmp);
274 path_put(&path);
275 }
276 return error; 269 return error;
277} 270}
278 271
279asmlinkage long compat_sys_fstatfs(unsigned int fd, struct compat_statfs __user *buf) 272asmlinkage long compat_sys_fstatfs(unsigned int fd, struct compat_statfs __user *buf)
280{ 273{
281 struct file * file;
282 struct kstatfs tmp; 274 struct kstatfs tmp;
283 int error; 275 int error = fd_statfs(fd, &tmp);
284
285 error = -EBADF;
286 file = fget(fd);
287 if (!file)
288 goto out;
289 error = vfs_statfs(&file->f_path, &tmp);
290 if (!error) 276 if (!error)
291 error = put_compat_statfs(buf, &tmp); 277 error = put_compat_statfs(buf, &tmp);
292 fput(file);
293out:
294 return error; 278 return error;
295} 279}
296 280
@@ -329,41 +313,29 @@ static int put_compat_statfs64(struct compat_statfs64 __user *ubuf, struct kstat
329 313
330asmlinkage long compat_sys_statfs64(const char __user *pathname, compat_size_t sz, struct compat_statfs64 __user *buf) 314asmlinkage long compat_sys_statfs64(const char __user *pathname, compat_size_t sz, struct compat_statfs64 __user *buf)
331{ 315{
332 struct path path; 316 struct kstatfs tmp;
333 int error; 317 int error;
334 318
335 if (sz != sizeof(*buf)) 319 if (sz != sizeof(*buf))
336 return -EINVAL; 320 return -EINVAL;
337 321
338 error = user_path(pathname, &path); 322 error = user_statfs(pathname, &tmp);
339 if (!error) { 323 if (!error)
340 struct kstatfs tmp; 324 error = put_compat_statfs64(buf, &tmp);
341 error = vfs_statfs(&path, &tmp);
342 if (!error)
343 error = put_compat_statfs64(buf, &tmp);
344 path_put(&path);
345 }
346 return error; 325 return error;
347} 326}
348 327
349asmlinkage long compat_sys_fstatfs64(unsigned int fd, compat_size_t sz, struct compat_statfs64 __user *buf) 328asmlinkage long compat_sys_fstatfs64(unsigned int fd, compat_size_t sz, struct compat_statfs64 __user *buf)
350{ 329{
351 struct file * file;
352 struct kstatfs tmp; 330 struct kstatfs tmp;
353 int error; 331 int error;
354 332
355 if (sz != sizeof(*buf)) 333 if (sz != sizeof(*buf))
356 return -EINVAL; 334 return -EINVAL;
357 335
358 error = -EBADF; 336 error = fd_statfs(fd, &tmp);
359 file = fget(fd);
360 if (!file)
361 goto out;
362 error = vfs_statfs(&file->f_path, &tmp);
363 if (!error) 337 if (!error)
364 error = put_compat_statfs64(buf, &tmp); 338 error = put_compat_statfs64(buf, &tmp);
365 fput(file);
366out:
367 return error; 339 return error;
368} 340}
369 341
@@ -1228,7 +1200,9 @@ compat_sys_preadv(unsigned long fd, const struct compat_iovec __user *vec,
1228 file = fget_light(fd, &fput_needed); 1200 file = fget_light(fd, &fput_needed);
1229 if (!file) 1201 if (!file)
1230 return -EBADF; 1202 return -EBADF;
1231 ret = compat_readv(file, vec, vlen, &pos); 1203 ret = -ESPIPE;
1204 if (file->f_mode & FMODE_PREAD)
1205 ret = compat_readv(file, vec, vlen, &pos);
1232 fput_light(file, fput_needed); 1206 fput_light(file, fput_needed);
1233 return ret; 1207 return ret;
1234} 1208}
@@ -1285,7 +1259,9 @@ compat_sys_pwritev(unsigned long fd, const struct compat_iovec __user *vec,
1285 file = fget_light(fd, &fput_needed); 1259 file = fget_light(fd, &fput_needed);
1286 if (!file) 1260 if (!file)
1287 return -EBADF; 1261 return -EBADF;
1288 ret = compat_writev(file, vec, vlen, &pos); 1262 ret = -ESPIPE;
1263 if (file->f_mode & FMODE_PWRITE)
1264 ret = compat_writev(file, vec, vlen, &pos);
1289 fput_light(file, fput_needed); 1265 fput_light(file, fput_needed);
1290 return ret; 1266 return ret;
1291} 1267}
@@ -1695,9 +1671,6 @@ int compat_set_fd_set(unsigned long nr, compat_ulong_t __user *ufdset,
1695 * Update: ERESTARTSYS breaks at least the xview clock binary, so 1671 * Update: ERESTARTSYS breaks at least the xview clock binary, so
1696 * I'm trying ERESTARTNOHAND which restart only when you want to. 1672 * I'm trying ERESTARTNOHAND which restart only when you want to.
1697 */ 1673 */
1698#define MAX_SELECT_SECONDS \
1699 ((unsigned long) (MAX_SCHEDULE_TIMEOUT / HZ)-1)
1700
1701int compat_core_sys_select(int n, compat_ulong_t __user *inp, 1674int compat_core_sys_select(int n, compat_ulong_t __user *inp,
1702 compat_ulong_t __user *outp, compat_ulong_t __user *exp, 1675 compat_ulong_t __user *outp, compat_ulong_t __user *exp,
1703 struct timespec *end_time) 1676 struct timespec *end_time)
@@ -2308,3 +2281,16 @@ asmlinkage long compat_sys_timerfd_gettime(int ufd,
2308} 2281}
2309 2282
2310#endif /* CONFIG_TIMERFD */ 2283#endif /* CONFIG_TIMERFD */
2284
2285#ifdef CONFIG_FHANDLE
2286/*
2287 * Exactly like fs/open.c:sys_open_by_handle_at(), except that it
2288 * doesn't set the O_LARGEFILE flag.
2289 */
2290asmlinkage long
2291compat_sys_open_by_handle_at(int mountdirfd,
2292 struct file_handle __user *handle, int flags)
2293{
2294 return do_handle_open(mountdirfd, handle, flags);
2295}
2296#endif
diff --git a/fs/dcache.c b/fs/dcache.c
index 9f493ee4dcba..ad25c4cec7d5 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -176,6 +176,7 @@ static void d_free(struct dentry *dentry)
176 176
177/** 177/**
178 * dentry_rcuwalk_barrier - invalidate in-progress rcu-walk lookups 178 * dentry_rcuwalk_barrier - invalidate in-progress rcu-walk lookups
179 * @dentry: the target dentry
179 * After this call, in-progress rcu-walk path lookup will fail. This 180 * After this call, in-progress rcu-walk path lookup will fail. This
180 * should be called after unhashing, and after changing d_inode (if 181 * should be called after unhashing, and after changing d_inode (if
181 * the dentry has not already been unhashed). 182 * the dentry has not already been unhashed).
@@ -281,6 +282,7 @@ static void dentry_lru_move_tail(struct dentry *dentry)
281/** 282/**
282 * d_kill - kill dentry and return parent 283 * d_kill - kill dentry and return parent
283 * @dentry: dentry to kill 284 * @dentry: dentry to kill
285 * @parent: parent dentry
284 * 286 *
285 * The dentry must already be unhashed and removed from the LRU. 287 * The dentry must already be unhashed and removed from the LRU.
286 * 288 *
@@ -294,8 +296,12 @@ static struct dentry *d_kill(struct dentry *dentry, struct dentry *parent)
294 __releases(parent->d_lock) 296 __releases(parent->d_lock)
295 __releases(dentry->d_inode->i_lock) 297 __releases(dentry->d_inode->i_lock)
296{ 298{
297 dentry->d_parent = NULL;
298 list_del(&dentry->d_u.d_child); 299 list_del(&dentry->d_u.d_child);
300 /*
301 * Inform try_to_ascend() that we are no longer attached to the
302 * dentry tree
303 */
304 dentry->d_flags |= DCACHE_DISCONNECTED;
299 if (parent) 305 if (parent)
300 spin_unlock(&parent->d_lock); 306 spin_unlock(&parent->d_lock);
301 dentry_iput(dentry); 307 dentry_iput(dentry);
@@ -1010,6 +1016,35 @@ void shrink_dcache_for_umount(struct super_block *sb)
1010} 1016}
1011 1017
1012/* 1018/*
1019 * This tries to ascend one level of parenthood, but
1020 * we can race with renaming, so we need to re-check
1021 * the parenthood after dropping the lock and check
1022 * that the sequence number still matches.
1023 */
1024static struct dentry *try_to_ascend(struct dentry *old, int locked, unsigned seq)
1025{
1026 struct dentry *new = old->d_parent;
1027
1028 rcu_read_lock();
1029 spin_unlock(&old->d_lock);
1030 spin_lock(&new->d_lock);
1031
1032 /*
1033 * might go back up the wrong parent if we have had a rename
1034 * or deletion
1035 */
1036 if (new != old->d_parent ||
1037 (old->d_flags & DCACHE_DISCONNECTED) ||
1038 (!locked && read_seqretry(&rename_lock, seq))) {
1039 spin_unlock(&new->d_lock);
1040 new = NULL;
1041 }
1042 rcu_read_unlock();
1043 return new;
1044}
1045
1046
1047/*
1013 * Search for at least 1 mount point in the dentry's subdirs. 1048 * Search for at least 1 mount point in the dentry's subdirs.
1014 * We descend to the next level whenever the d_subdirs 1049 * We descend to the next level whenever the d_subdirs
1015 * list is non-empty and continue searching. 1050 * list is non-empty and continue searching.
@@ -1064,24 +1099,10 @@ resume:
1064 * All done at this level ... ascend and resume the search. 1099 * All done at this level ... ascend and resume the search.
1065 */ 1100 */
1066 if (this_parent != parent) { 1101 if (this_parent != parent) {
1067 struct dentry *tmp; 1102 struct dentry *child = this_parent;
1068 struct dentry *child; 1103 this_parent = try_to_ascend(this_parent, locked, seq);
1069 1104 if (!this_parent)
1070 tmp = this_parent->d_parent;
1071 rcu_read_lock();
1072 spin_unlock(&this_parent->d_lock);
1073 child = this_parent;
1074 this_parent = tmp;
1075 spin_lock(&this_parent->d_lock);
1076 /* might go back up the wrong parent if we have had a rename
1077 * or deletion */
1078 if (this_parent != child->d_parent ||
1079 (!locked && read_seqretry(&rename_lock, seq))) {
1080 spin_unlock(&this_parent->d_lock);
1081 rcu_read_unlock();
1082 goto rename_retry; 1105 goto rename_retry;
1083 }
1084 rcu_read_unlock();
1085 next = child->d_u.d_child.next; 1106 next = child->d_u.d_child.next;
1086 goto resume; 1107 goto resume;
1087 } 1108 }
@@ -1179,24 +1200,10 @@ resume:
1179 * All done at this level ... ascend and resume the search. 1200 * All done at this level ... ascend and resume the search.
1180 */ 1201 */
1181 if (this_parent != parent) { 1202 if (this_parent != parent) {
1182 struct dentry *tmp; 1203 struct dentry *child = this_parent;
1183 struct dentry *child; 1204 this_parent = try_to_ascend(this_parent, locked, seq);
1184 1205 if (!this_parent)
1185 tmp = this_parent->d_parent;
1186 rcu_read_lock();
1187 spin_unlock(&this_parent->d_lock);
1188 child = this_parent;
1189 this_parent = tmp;
1190 spin_lock(&this_parent->d_lock);
1191 /* might go back up the wrong parent if we have had a rename
1192 * or deletion */
1193 if (this_parent != child->d_parent ||
1194 (!locked && read_seqretry(&rename_lock, seq))) {
1195 spin_unlock(&this_parent->d_lock);
1196 rcu_read_unlock();
1197 goto rename_retry; 1206 goto rename_retry;
1198 }
1199 rcu_read_unlock();
1200 next = child->d_u.d_child.next; 1207 next = child->d_u.d_child.next;
1201 goto resume; 1208 goto resume;
1202 } 1209 }
@@ -1521,6 +1528,28 @@ struct dentry * d_alloc_root(struct inode * root_inode)
1521} 1528}
1522EXPORT_SYMBOL(d_alloc_root); 1529EXPORT_SYMBOL(d_alloc_root);
1523 1530
1531static struct dentry * __d_find_any_alias(struct inode *inode)
1532{
1533 struct dentry *alias;
1534
1535 if (list_empty(&inode->i_dentry))
1536 return NULL;
1537 alias = list_first_entry(&inode->i_dentry, struct dentry, d_alias);
1538 __dget(alias);
1539 return alias;
1540}
1541
1542static struct dentry * d_find_any_alias(struct inode *inode)
1543{
1544 struct dentry *de;
1545
1546 spin_lock(&inode->i_lock);
1547 de = __d_find_any_alias(inode);
1548 spin_unlock(&inode->i_lock);
1549 return de;
1550}
1551
1552
1524/** 1553/**
1525 * d_obtain_alias - find or allocate a dentry for a given inode 1554 * d_obtain_alias - find or allocate a dentry for a given inode
1526 * @inode: inode to allocate the dentry for 1555 * @inode: inode to allocate the dentry for
@@ -1550,7 +1579,7 @@ struct dentry *d_obtain_alias(struct inode *inode)
1550 if (IS_ERR(inode)) 1579 if (IS_ERR(inode))
1551 return ERR_CAST(inode); 1580 return ERR_CAST(inode);
1552 1581
1553 res = d_find_alias(inode); 1582 res = d_find_any_alias(inode);
1554 if (res) 1583 if (res)
1555 goto out_iput; 1584 goto out_iput;
1556 1585
@@ -1563,7 +1592,7 @@ struct dentry *d_obtain_alias(struct inode *inode)
1563 1592
1564 1593
1565 spin_lock(&inode->i_lock); 1594 spin_lock(&inode->i_lock);
1566 res = __d_find_alias(inode, 0); 1595 res = __d_find_any_alias(inode);
1567 if (res) { 1596 if (res) {
1568 spin_unlock(&inode->i_lock); 1597 spin_unlock(&inode->i_lock);
1569 dput(tmp); 1598 dput(tmp);
@@ -1583,10 +1612,13 @@ struct dentry *d_obtain_alias(struct inode *inode)
1583 __bit_spin_unlock(0, (unsigned long *)&tmp->d_sb->s_anon.first); 1612 __bit_spin_unlock(0, (unsigned long *)&tmp->d_sb->s_anon.first);
1584 spin_unlock(&tmp->d_lock); 1613 spin_unlock(&tmp->d_lock);
1585 spin_unlock(&inode->i_lock); 1614 spin_unlock(&inode->i_lock);
1615 security_d_instantiate(tmp, inode);
1586 1616
1587 return tmp; 1617 return tmp;
1588 1618
1589 out_iput: 1619 out_iput:
1620 if (res && !IS_ERR(res))
1621 security_d_instantiate(res, inode);
1590 iput(inode); 1622 iput(inode);
1591 return res; 1623 return res;
1592} 1624}
@@ -1779,7 +1811,7 @@ struct dentry *__d_lookup_rcu(struct dentry *parent, struct qstr *name,
1779 * false-negative result. d_lookup() protects against concurrent 1811 * false-negative result. d_lookup() protects against concurrent
1780 * renames using rename_lock seqlock. 1812 * renames using rename_lock seqlock.
1781 * 1813 *
1782 * See Documentation/vfs/dcache-locking.txt for more details. 1814 * See Documentation/filesystems/path-lookup.txt for more details.
1783 */ 1815 */
1784 hlist_bl_for_each_entry_rcu(dentry, node, &b->head, d_hash) { 1816 hlist_bl_for_each_entry_rcu(dentry, node, &b->head, d_hash) {
1785 struct inode *i; 1817 struct inode *i;
@@ -1899,7 +1931,7 @@ struct dentry *__d_lookup(struct dentry *parent, struct qstr *name)
1899 * false-negative result. d_lookup() protects against concurrent 1931 * false-negative result. d_lookup() protects against concurrent
1900 * renames using rename_lock seqlock. 1932 * renames using rename_lock seqlock.
1901 * 1933 *
1902 * See Documentation/vfs/dcache-locking.txt for more details. 1934 * See Documentation/filesystems/path-lookup.txt for more details.
1903 */ 1935 */
1904 rcu_read_lock(); 1936 rcu_read_lock();
1905 1937
@@ -1973,7 +2005,7 @@ out:
1973/** 2005/**
1974 * d_validate - verify dentry provided from insecure source (deprecated) 2006 * d_validate - verify dentry provided from insecure source (deprecated)
1975 * @dentry: The dentry alleged to be valid child of @dparent 2007 * @dentry: The dentry alleged to be valid child of @dparent
1976 * @parent: The parent dentry (known to be valid) 2008 * @dparent: The parent dentry (known to be valid)
1977 * 2009 *
1978 * An insecure source has sent us a dentry, here we verify it and dget() it. 2010 * An insecure source has sent us a dentry, here we verify it and dget() it.
1979 * This is used by ncpfs in its readdir implementation. 2011 * This is used by ncpfs in its readdir implementation.
@@ -2918,28 +2950,14 @@ resume:
2918 spin_unlock(&dentry->d_lock); 2950 spin_unlock(&dentry->d_lock);
2919 } 2951 }
2920 if (this_parent != root) { 2952 if (this_parent != root) {
2921 struct dentry *tmp; 2953 struct dentry *child = this_parent;
2922 struct dentry *child;
2923
2924 tmp = this_parent->d_parent;
2925 if (!(this_parent->d_flags & DCACHE_GENOCIDE)) { 2954 if (!(this_parent->d_flags & DCACHE_GENOCIDE)) {
2926 this_parent->d_flags |= DCACHE_GENOCIDE; 2955 this_parent->d_flags |= DCACHE_GENOCIDE;
2927 this_parent->d_count--; 2956 this_parent->d_count--;
2928 } 2957 }
2929 rcu_read_lock(); 2958 this_parent = try_to_ascend(this_parent, locked, seq);
2930 spin_unlock(&this_parent->d_lock); 2959 if (!this_parent)
2931 child = this_parent;
2932 this_parent = tmp;
2933 spin_lock(&this_parent->d_lock);
2934 /* might go back up the wrong parent if we have had a rename
2935 * or deletion */
2936 if (this_parent != child->d_parent ||
2937 (!locked && read_seqretry(&rename_lock, seq))) {
2938 spin_unlock(&this_parent->d_lock);
2939 rcu_read_unlock();
2940 goto rename_retry; 2960 goto rename_retry;
2941 }
2942 rcu_read_unlock();
2943 next = child->d_u.d_child.next; 2961 next = child->d_u.d_child.next;
2944 goto resume; 2962 goto resume;
2945 } 2963 }
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 37a8ca7c1222..e7a7a2f07324 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -13,9 +13,6 @@
13 * 13 *
14 */ 14 */
15 15
16/* uncomment to get debug messages from the debug filesystem, ah the irony. */
17/* #define DEBUG */
18
19#include <linux/module.h> 16#include <linux/module.h>
20#include <linux/fs.h> 17#include <linux/fs.h>
21#include <linux/mount.h> 18#include <linux/mount.h>
@@ -310,7 +307,7 @@ struct dentry *debugfs_create_symlink(const char *name, struct dentry *parent,
310} 307}
311EXPORT_SYMBOL_GPL(debugfs_create_symlink); 308EXPORT_SYMBOL_GPL(debugfs_create_symlink);
312 309
313static void __debugfs_remove(struct dentry *dentry, struct dentry *parent) 310static int __debugfs_remove(struct dentry *dentry, struct dentry *parent)
314{ 311{
315 int ret = 0; 312 int ret = 0;
316 313
@@ -333,6 +330,7 @@ static void __debugfs_remove(struct dentry *dentry, struct dentry *parent)
333 dput(dentry); 330 dput(dentry);
334 } 331 }
335 } 332 }
333 return ret;
336} 334}
337 335
338/** 336/**
@@ -351,7 +349,8 @@ static void __debugfs_remove(struct dentry *dentry, struct dentry *parent)
351void debugfs_remove(struct dentry *dentry) 349void debugfs_remove(struct dentry *dentry)
352{ 350{
353 struct dentry *parent; 351 struct dentry *parent;
354 352 int ret;
353
355 if (!dentry) 354 if (!dentry)
356 return; 355 return;
357 356
@@ -360,9 +359,10 @@ void debugfs_remove(struct dentry *dentry)
360 return; 359 return;
361 360
362 mutex_lock(&parent->d_inode->i_mutex); 361 mutex_lock(&parent->d_inode->i_mutex);
363 __debugfs_remove(dentry, parent); 362 ret = __debugfs_remove(dentry, parent);
364 mutex_unlock(&parent->d_inode->i_mutex); 363 mutex_unlock(&parent->d_inode->i_mutex);
365 simple_release_fs(&debugfs_mount, &debugfs_mount_count); 364 if (!ret)
365 simple_release_fs(&debugfs_mount, &debugfs_mount_count);
366} 366}
367EXPORT_SYMBOL_GPL(debugfs_remove); 367EXPORT_SYMBOL_GPL(debugfs_remove);
368 368
@@ -540,17 +540,5 @@ static int __init debugfs_init(void)
540 540
541 return retval; 541 return retval;
542} 542}
543
544static void __exit debugfs_exit(void)
545{
546 debugfs_registered = false;
547
548 simple_release_fs(&debugfs_mount, &debugfs_mount_count);
549 unregister_filesystem(&debug_fs_type);
550 kobject_put(debug_kobj);
551}
552
553core_initcall(debugfs_init); 543core_initcall(debugfs_init);
554module_exit(debugfs_exit);
555MODULE_LICENSE("GPL");
556 544
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 1bb547c9cad6..2f27e578d466 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -479,6 +479,7 @@ int devpts_pty_new(struct inode *ptmx_inode, struct tty_struct *tty)
479 struct dentry *root = sb->s_root; 479 struct dentry *root = sb->s_root;
480 struct pts_fs_info *fsi = DEVPTS_SB(sb); 480 struct pts_fs_info *fsi = DEVPTS_SB(sb);
481 struct pts_mount_opts *opts = &fsi->mount_opts; 481 struct pts_mount_opts *opts = &fsi->mount_opts;
482 int ret = 0;
482 char s[12]; 483 char s[12];
483 484
484 /* We're supposed to be given the slave end of a pty */ 485 /* We're supposed to be given the slave end of a pty */
@@ -501,14 +502,17 @@ int devpts_pty_new(struct inode *ptmx_inode, struct tty_struct *tty)
501 mutex_lock(&root->d_inode->i_mutex); 502 mutex_lock(&root->d_inode->i_mutex);
502 503
503 dentry = d_alloc_name(root, s); 504 dentry = d_alloc_name(root, s);
504 if (!IS_ERR(dentry)) { 505 if (dentry) {
505 d_add(dentry, inode); 506 d_add(dentry, inode);
506 fsnotify_create(root->d_inode, dentry); 507 fsnotify_create(root->d_inode, dentry);
508 } else {
509 iput(inode);
510 ret = -ENOMEM;
507 } 511 }
508 512
509 mutex_unlock(&root->d_inode->i_mutex); 513 mutex_unlock(&root->d_inode->i_mutex);
510 514
511 return 0; 515 return ret;
512} 516}
513 517
514struct tty_struct *devpts_get_tty(struct inode *pts_inode, int number) 518struct tty_struct *devpts_get_tty(struct inode *pts_inode, int number)
@@ -544,17 +548,12 @@ void devpts_pty_kill(struct tty_struct *tty)
544 mutex_lock(&root->d_inode->i_mutex); 548 mutex_lock(&root->d_inode->i_mutex);
545 549
546 dentry = d_find_alias(inode); 550 dentry = d_find_alias(inode);
547 if (IS_ERR(dentry))
548 goto out;
549
550 if (dentry) {
551 inode->i_nlink--;
552 d_delete(dentry);
553 dput(dentry); /* d_alloc_name() in devpts_pty_new() */
554 }
555 551
552 inode->i_nlink--;
553 d_delete(dentry);
554 dput(dentry); /* d_alloc_name() in devpts_pty_new() */
556 dput(dentry); /* d_find_alias above */ 555 dput(dentry); /* d_find_alias above */
557out: 556
558 mutex_unlock(&root->d_inode->i_mutex); 557 mutex_unlock(&root->d_inode->i_mutex);
559} 558}
560 559
diff --git a/fs/direct-io.c b/fs/direct-io.c
index b044705eedd4..ac5f164170e3 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -645,11 +645,11 @@ static int dio_send_cur_page(struct dio *dio)
645 /* 645 /*
646 * See whether this new request is contiguous with the old. 646 * See whether this new request is contiguous with the old.
647 * 647 *
648 * Btrfs cannot handl having logically non-contiguous requests 648 * Btrfs cannot handle having logically non-contiguous requests
649 * submitted. For exmple if you have 649 * submitted. For example if you have
650 * 650 *
651 * Logical: [0-4095][HOLE][8192-12287] 651 * Logical: [0-4095][HOLE][8192-12287]
652 * Phyiscal: [0-4095] [4096-8181] 652 * Physical: [0-4095] [4096-8191]
653 * 653 *
654 * We cannot submit those pages together as one BIO. So if our 654 * We cannot submit those pages together as one BIO. So if our
655 * current logical offset in the file does not equal what would 655 * current logical offset in the file does not equal what would
@@ -1110,11 +1110,8 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
1110 ((rw & READ) || (dio->result == dio->size))) 1110 ((rw & READ) || (dio->result == dio->size)))
1111 ret = -EIOCBQUEUED; 1111 ret = -EIOCBQUEUED;
1112 1112
1113 if (ret != -EIOCBQUEUED) { 1113 if (ret != -EIOCBQUEUED)
1114 /* All IO is now issued, send it on its way */
1115 blk_run_address_space(inode->i_mapping);
1116 dio_await_completion(dio); 1114 dio_await_completion(dio);
1117 }
1118 1115
1119 /* 1116 /*
1120 * Sync will always be dropping the final ref and completing the 1117 * Sync will always be dropping the final ref and completing the
@@ -1176,7 +1173,7 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1176 struct dio *dio; 1173 struct dio *dio;
1177 1174
1178 if (rw & WRITE) 1175 if (rw & WRITE)
1179 rw = WRITE_ODIRECT_PLUG; 1176 rw = WRITE_ODIRECT;
1180 1177
1181 if (bdev) 1178 if (bdev)
1182 bdev_blkbits = blksize_bits(bdev_logical_block_size(bdev)); 1179 bdev_blkbits = blksize_bits(bdev_logical_block_size(bdev));
diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c
index 4314f0d48d85..abc49f292454 100644
--- a/fs/dlm/ast.c
+++ b/fs/dlm/ast.c
@@ -18,6 +18,7 @@
18 18
19#define WAKE_ASTS 0 19#define WAKE_ASTS 0
20 20
21static uint64_t ast_seq_count;
21static struct list_head ast_queue; 22static struct list_head ast_queue;
22static spinlock_t ast_queue_lock; 23static spinlock_t ast_queue_lock;
23static struct task_struct * astd_task; 24static struct task_struct * astd_task;
@@ -25,40 +26,186 @@ static unsigned long astd_wakeflags;
25static struct mutex astd_running; 26static struct mutex astd_running;
26 27
27 28
29static void dlm_dump_lkb_callbacks(struct dlm_lkb *lkb)
30{
31 int i;
32
33 log_print("last_bast %x %llu flags %x mode %d sb %d %x",
34 lkb->lkb_id,
35 (unsigned long long)lkb->lkb_last_bast.seq,
36 lkb->lkb_last_bast.flags,
37 lkb->lkb_last_bast.mode,
38 lkb->lkb_last_bast.sb_status,
39 lkb->lkb_last_bast.sb_flags);
40
41 log_print("last_cast %x %llu flags %x mode %d sb %d %x",
42 lkb->lkb_id,
43 (unsigned long long)lkb->lkb_last_cast.seq,
44 lkb->lkb_last_cast.flags,
45 lkb->lkb_last_cast.mode,
46 lkb->lkb_last_cast.sb_status,
47 lkb->lkb_last_cast.sb_flags);
48
49 for (i = 0; i < DLM_CALLBACKS_SIZE; i++) {
50 log_print("cb %x %llu flags %x mode %d sb %d %x",
51 lkb->lkb_id,
52 (unsigned long long)lkb->lkb_callbacks[i].seq,
53 lkb->lkb_callbacks[i].flags,
54 lkb->lkb_callbacks[i].mode,
55 lkb->lkb_callbacks[i].sb_status,
56 lkb->lkb_callbacks[i].sb_flags);
57 }
58}
59
28void dlm_del_ast(struct dlm_lkb *lkb) 60void dlm_del_ast(struct dlm_lkb *lkb)
29{ 61{
30 spin_lock(&ast_queue_lock); 62 spin_lock(&ast_queue_lock);
31 if (lkb->lkb_ast_type & (AST_COMP | AST_BAST)) 63 if (!list_empty(&lkb->lkb_astqueue))
32 list_del(&lkb->lkb_astqueue); 64 list_del_init(&lkb->lkb_astqueue);
33 spin_unlock(&ast_queue_lock); 65 spin_unlock(&ast_queue_lock);
34} 66}
35 67
36void dlm_add_ast(struct dlm_lkb *lkb, int type, int mode) 68int dlm_add_lkb_callback(struct dlm_lkb *lkb, uint32_t flags, int mode,
69 int status, uint32_t sbflags, uint64_t seq)
37{ 70{
71 struct dlm_ls *ls = lkb->lkb_resource->res_ls;
72 uint64_t prev_seq;
73 int prev_mode;
74 int i;
75
76 for (i = 0; i < DLM_CALLBACKS_SIZE; i++) {
77 if (lkb->lkb_callbacks[i].seq)
78 continue;
79
80 /*
81 * Suppress some redundant basts here, do more on removal.
82 * Don't even add a bast if the callback just before it
83 * is a bast for the same mode or a more restrictive mode.
84 * (the addional > PR check is needed for PR/CW inversion)
85 */
86
87 if ((i > 0) && (flags & DLM_CB_BAST) &&
88 (lkb->lkb_callbacks[i-1].flags & DLM_CB_BAST)) {
89
90 prev_seq = lkb->lkb_callbacks[i-1].seq;
91 prev_mode = lkb->lkb_callbacks[i-1].mode;
92
93 if ((prev_mode == mode) ||
94 (prev_mode > mode && prev_mode > DLM_LOCK_PR)) {
95
96 log_debug(ls, "skip %x add bast %llu mode %d "
97 "for bast %llu mode %d",
98 lkb->lkb_id,
99 (unsigned long long)seq,
100 mode,
101 (unsigned long long)prev_seq,
102 prev_mode);
103 return 0;
104 }
105 }
106
107 lkb->lkb_callbacks[i].seq = seq;
108 lkb->lkb_callbacks[i].flags = flags;
109 lkb->lkb_callbacks[i].mode = mode;
110 lkb->lkb_callbacks[i].sb_status = status;
111 lkb->lkb_callbacks[i].sb_flags = (sbflags & 0x000000FF);
112 break;
113 }
114
115 if (i == DLM_CALLBACKS_SIZE) {
116 log_error(ls, "no callbacks %x %llu flags %x mode %d sb %d %x",
117 lkb->lkb_id, (unsigned long long)seq,
118 flags, mode, status, sbflags);
119 dlm_dump_lkb_callbacks(lkb);
120 return -1;
121 }
122
123 return 0;
124}
125
126int dlm_rem_lkb_callback(struct dlm_ls *ls, struct dlm_lkb *lkb,
127 struct dlm_callback *cb, int *resid)
128{
129 int i;
130
131 *resid = 0;
132
133 if (!lkb->lkb_callbacks[0].seq)
134 return -ENOENT;
135
136 /* oldest undelivered cb is callbacks[0] */
137
138 memcpy(cb, &lkb->lkb_callbacks[0], sizeof(struct dlm_callback));
139 memset(&lkb->lkb_callbacks[0], 0, sizeof(struct dlm_callback));
140
141 /* shift others down */
142
143 for (i = 1; i < DLM_CALLBACKS_SIZE; i++) {
144 if (!lkb->lkb_callbacks[i].seq)
145 break;
146 memcpy(&lkb->lkb_callbacks[i-1], &lkb->lkb_callbacks[i],
147 sizeof(struct dlm_callback));
148 memset(&lkb->lkb_callbacks[i], 0, sizeof(struct dlm_callback));
149 (*resid)++;
150 }
151
152 /* if cb is a bast, it should be skipped if the blocking mode is
153 compatible with the last granted mode */
154
155 if ((cb->flags & DLM_CB_BAST) && lkb->lkb_last_cast.seq) {
156 if (dlm_modes_compat(cb->mode, lkb->lkb_last_cast.mode)) {
157 cb->flags |= DLM_CB_SKIP;
158
159 log_debug(ls, "skip %x bast %llu mode %d "
160 "for cast %llu mode %d",
161 lkb->lkb_id,
162 (unsigned long long)cb->seq,
163 cb->mode,
164 (unsigned long long)lkb->lkb_last_cast.seq,
165 lkb->lkb_last_cast.mode);
166 return 0;
167 }
168 }
169
170 if (cb->flags & DLM_CB_CAST) {
171 memcpy(&lkb->lkb_last_cast, cb, sizeof(struct dlm_callback));
172 lkb->lkb_last_cast_time = ktime_get();
173 }
174
175 if (cb->flags & DLM_CB_BAST) {
176 memcpy(&lkb->lkb_last_bast, cb, sizeof(struct dlm_callback));
177 lkb->lkb_last_bast_time = ktime_get();
178 }
179
180 return 0;
181}
182
183void dlm_add_ast(struct dlm_lkb *lkb, uint32_t flags, int mode, int status,
184 uint32_t sbflags)
185{
186 uint64_t seq;
187 int rv;
188
189 spin_lock(&ast_queue_lock);
190
191 seq = ++ast_seq_count;
192
38 if (lkb->lkb_flags & DLM_IFL_USER) { 193 if (lkb->lkb_flags & DLM_IFL_USER) {
39 dlm_user_add_ast(lkb, type, mode); 194 spin_unlock(&ast_queue_lock);
195 dlm_user_add_ast(lkb, flags, mode, status, sbflags, seq);
40 return; 196 return;
41 } 197 }
42 198
43 spin_lock(&ast_queue_lock); 199 rv = dlm_add_lkb_callback(lkb, flags, mode, status, sbflags, seq);
44 if (!(lkb->lkb_ast_type & (AST_COMP | AST_BAST))) { 200 if (rv < 0) {
201 spin_unlock(&ast_queue_lock);
202 return;
203 }
204
205 if (list_empty(&lkb->lkb_astqueue)) {
45 kref_get(&lkb->lkb_ref); 206 kref_get(&lkb->lkb_ref);
46 list_add_tail(&lkb->lkb_astqueue, &ast_queue); 207 list_add_tail(&lkb->lkb_astqueue, &ast_queue);
47 lkb->lkb_ast_first = type;
48 } 208 }
49
50 /* sanity check, this should not happen */
51
52 if ((type == AST_COMP) && (lkb->lkb_ast_type & AST_COMP))
53 log_print("repeat cast %d castmode %d lock %x %s",
54 mode, lkb->lkb_castmode,
55 lkb->lkb_id, lkb->lkb_resource->res_name);
56
57 lkb->lkb_ast_type |= type;
58 if (type == AST_BAST)
59 lkb->lkb_bastmode = mode;
60 else
61 lkb->lkb_castmode = mode;
62 spin_unlock(&ast_queue_lock); 209 spin_unlock(&ast_queue_lock);
63 210
64 set_bit(WAKE_ASTS, &astd_wakeflags); 211 set_bit(WAKE_ASTS, &astd_wakeflags);
@@ -72,7 +219,8 @@ static void process_asts(void)
72 struct dlm_lkb *lkb; 219 struct dlm_lkb *lkb;
73 void (*castfn) (void *astparam); 220 void (*castfn) (void *astparam);
74 void (*bastfn) (void *astparam, int mode); 221 void (*bastfn) (void *astparam, int mode);
75 int type, first, bastmode, castmode, do_bast, do_cast, last_castmode; 222 struct dlm_callback callbacks[DLM_CALLBACKS_SIZE];
223 int i, rv, resid;
76 224
77repeat: 225repeat:
78 spin_lock(&ast_queue_lock); 226 spin_lock(&ast_queue_lock);
@@ -83,54 +231,45 @@ repeat:
83 if (dlm_locking_stopped(ls)) 231 if (dlm_locking_stopped(ls))
84 continue; 232 continue;
85 233
86 list_del(&lkb->lkb_astqueue); 234 /* we remove from astqueue list and remove everything in
87 type = lkb->lkb_ast_type; 235 lkb_callbacks before releasing the spinlock so empty
88 lkb->lkb_ast_type = 0; 236 lkb_astqueue is always consistent with empty lkb_callbacks */
89 first = lkb->lkb_ast_first; 237
90 lkb->lkb_ast_first = 0; 238 list_del_init(&lkb->lkb_astqueue);
91 bastmode = lkb->lkb_bastmode; 239
92 castmode = lkb->lkb_castmode;
93 castfn = lkb->lkb_astfn; 240 castfn = lkb->lkb_astfn;
94 bastfn = lkb->lkb_bastfn; 241 bastfn = lkb->lkb_bastfn;
95 spin_unlock(&ast_queue_lock);
96 242
97 do_cast = (type & AST_COMP) && castfn; 243 memset(&callbacks, 0, sizeof(callbacks));
98 do_bast = (type & AST_BAST) && bastfn;
99 244
100 /* Skip a bast if its blocking mode is compatible with the 245 for (i = 0; i < DLM_CALLBACKS_SIZE; i++) {
101 granted mode of the preceding cast. */ 246 rv = dlm_rem_lkb_callback(ls, lkb, &callbacks[i], &resid);
247 if (rv < 0)
248 break;
249 }
250 spin_unlock(&ast_queue_lock);
102 251
103 if (do_bast) { 252 if (resid) {
104 if (first == AST_COMP) 253 /* shouldn't happen, for loop should have removed all */
105 last_castmode = castmode; 254 log_error(ls, "callback resid %d lkb %x",
106 else 255 resid, lkb->lkb_id);
107 last_castmode = lkb->lkb_castmode_done;
108 if (dlm_modes_compat(bastmode, last_castmode))
109 do_bast = 0;
110 } 256 }
111 257
112 if (first == AST_COMP) { 258 for (i = 0; i < DLM_CALLBACKS_SIZE; i++) {
113 if (do_cast) 259 if (!callbacks[i].seq)
114 castfn(lkb->lkb_astparam); 260 break;
115 if (do_bast) 261 if (callbacks[i].flags & DLM_CB_SKIP) {
116 bastfn(lkb->lkb_astparam, bastmode); 262 continue;
117 } else if (first == AST_BAST) { 263 } else if (callbacks[i].flags & DLM_CB_BAST) {
118 if (do_bast) 264 bastfn(lkb->lkb_astparam, callbacks[i].mode);
119 bastfn(lkb->lkb_astparam, bastmode); 265 } else if (callbacks[i].flags & DLM_CB_CAST) {
120 if (do_cast) 266 lkb->lkb_lksb->sb_status = callbacks[i].sb_status;
267 lkb->lkb_lksb->sb_flags = callbacks[i].sb_flags;
121 castfn(lkb->lkb_astparam); 268 castfn(lkb->lkb_astparam);
122 } else { 269 }
123 log_error(ls, "bad ast_first %d ast_type %d",
124 first, type);
125 } 270 }
126 271
127 if (do_cast) 272 /* removes ref for ast_queue, may cause lkb to be freed */
128 lkb->lkb_castmode_done = castmode;
129 if (do_bast)
130 lkb->lkb_bastmode_done = bastmode;
131
132 /* this removes the reference added by dlm_add_ast
133 and may result in the lkb being freed */
134 dlm_put_lkb(lkb); 273 dlm_put_lkb(lkb);
135 274
136 cond_resched(); 275 cond_resched();
diff --git a/fs/dlm/ast.h b/fs/dlm/ast.h
index bcb1aaba519d..8aa89c9b5611 100644
--- a/fs/dlm/ast.h
+++ b/fs/dlm/ast.h
@@ -13,8 +13,13 @@
13#ifndef __ASTD_DOT_H__ 13#ifndef __ASTD_DOT_H__
14#define __ASTD_DOT_H__ 14#define __ASTD_DOT_H__
15 15
16void dlm_add_ast(struct dlm_lkb *lkb, int type, int mode);
17void dlm_del_ast(struct dlm_lkb *lkb); 16void dlm_del_ast(struct dlm_lkb *lkb);
17int dlm_add_lkb_callback(struct dlm_lkb *lkb, uint32_t flags, int mode,
18 int status, uint32_t sbflags, uint64_t seq);
19int dlm_rem_lkb_callback(struct dlm_ls *ls, struct dlm_lkb *lkb,
20 struct dlm_callback *cb, int *resid);
21void dlm_add_ast(struct dlm_lkb *lkb, uint32_t flags, int mode, int status,
22 uint32_t sbflags);
18 23
19void dlm_astd_wake(void); 24void dlm_astd_wake(void);
20int dlm_astd_start(void); 25int dlm_astd_start(void);
diff --git a/fs/dlm/config.c b/fs/dlm/config.c
index b54bca03d92f..0d329ff8ed4c 100644
--- a/fs/dlm/config.c
+++ b/fs/dlm/config.c
@@ -977,9 +977,9 @@ int dlm_our_addr(struct sockaddr_storage *addr, int num)
977/* Config file defaults */ 977/* Config file defaults */
978#define DEFAULT_TCP_PORT 21064 978#define DEFAULT_TCP_PORT 21064
979#define DEFAULT_BUFFER_SIZE 4096 979#define DEFAULT_BUFFER_SIZE 4096
980#define DEFAULT_RSBTBL_SIZE 256 980#define DEFAULT_RSBTBL_SIZE 1024
981#define DEFAULT_LKBTBL_SIZE 1024 981#define DEFAULT_LKBTBL_SIZE 1024
982#define DEFAULT_DIRTBL_SIZE 512 982#define DEFAULT_DIRTBL_SIZE 1024
983#define DEFAULT_RECOVER_TIMER 5 983#define DEFAULT_RECOVER_TIMER 5
984#define DEFAULT_TOSS_SECS 10 984#define DEFAULT_TOSS_SECS 10
985#define DEFAULT_SCAN_SECS 5 985#define DEFAULT_SCAN_SECS 5
diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c
index 6b42ba807dfd..59779237e2b4 100644
--- a/fs/dlm/debug_fs.c
+++ b/fs/dlm/debug_fs.c
@@ -257,12 +257,12 @@ static int print_format3_lock(struct seq_file *s, struct dlm_lkb *lkb,
257 lkb->lkb_status, 257 lkb->lkb_status,
258 lkb->lkb_grmode, 258 lkb->lkb_grmode,
259 lkb->lkb_rqmode, 259 lkb->lkb_rqmode,
260 lkb->lkb_bastmode, 260 lkb->lkb_last_bast.mode,
261 rsb_lookup, 261 rsb_lookup,
262 lkb->lkb_wait_type, 262 lkb->lkb_wait_type,
263 lkb->lkb_lvbseq, 263 lkb->lkb_lvbseq,
264 (unsigned long long)ktime_to_ns(lkb->lkb_timestamp), 264 (unsigned long long)ktime_to_ns(lkb->lkb_timestamp),
265 (unsigned long long)ktime_to_ns(lkb->lkb_time_bast)); 265 (unsigned long long)ktime_to_ns(lkb->lkb_last_bast_time));
266 return rv; 266 return rv;
267} 267}
268 268
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
index f632b58cd222..b94204913011 100644
--- a/fs/dlm/dlm_internal.h
+++ b/fs/dlm/dlm_internal.h
@@ -192,11 +192,6 @@ struct dlm_args {
192 * lkb is a process copy, the nodeid specifies the lock master. 192 * lkb is a process copy, the nodeid specifies the lock master.
193 */ 193 */
194 194
195/* lkb_ast_type */
196
197#define AST_COMP 1
198#define AST_BAST 2
199
200/* lkb_status */ 195/* lkb_status */
201 196
202#define DLM_LKSTS_WAITING 1 197#define DLM_LKSTS_WAITING 1
@@ -217,6 +212,20 @@ struct dlm_args {
217#define DLM_IFL_USER 0x00000001 212#define DLM_IFL_USER 0x00000001
218#define DLM_IFL_ORPHAN 0x00000002 213#define DLM_IFL_ORPHAN 0x00000002
219 214
215#define DLM_CALLBACKS_SIZE 6
216
217#define DLM_CB_CAST 0x00000001
218#define DLM_CB_BAST 0x00000002
219#define DLM_CB_SKIP 0x00000004
220
221struct dlm_callback {
222 uint64_t seq;
223 uint32_t flags; /* DLM_CBF_ */
224 int sb_status; /* copy to lksb status */
225 uint8_t sb_flags; /* copy to lksb flags */
226 int8_t mode; /* rq mode of bast, gr mode of cast */
227};
228
220struct dlm_lkb { 229struct dlm_lkb {
221 struct dlm_rsb *lkb_resource; /* the rsb */ 230 struct dlm_rsb *lkb_resource; /* the rsb */
222 struct kref lkb_ref; 231 struct kref lkb_ref;
@@ -236,13 +245,6 @@ struct dlm_lkb {
236 245
237 int8_t lkb_wait_type; /* type of reply waiting for */ 246 int8_t lkb_wait_type; /* type of reply waiting for */
238 int8_t lkb_wait_count; 247 int8_t lkb_wait_count;
239 int8_t lkb_ast_type; /* type of ast queued for */
240 int8_t lkb_ast_first; /* type of first ast queued */
241
242 int8_t lkb_bastmode; /* req mode of queued bast */
243 int8_t lkb_castmode; /* gr mode of queued cast */
244 int8_t lkb_bastmode_done; /* last delivered bastmode */
245 int8_t lkb_castmode_done; /* last delivered castmode */
246 248
247 struct list_head lkb_idtbl_list; /* lockspace lkbtbl */ 249 struct list_head lkb_idtbl_list; /* lockspace lkbtbl */
248 struct list_head lkb_statequeue; /* rsb g/c/w list */ 250 struct list_head lkb_statequeue; /* rsb g/c/w list */
@@ -251,10 +253,15 @@ struct dlm_lkb {
251 struct list_head lkb_astqueue; /* need ast to be sent */ 253 struct list_head lkb_astqueue; /* need ast to be sent */
252 struct list_head lkb_ownqueue; /* list of locks for a process */ 254 struct list_head lkb_ownqueue; /* list of locks for a process */
253 struct list_head lkb_time_list; 255 struct list_head lkb_time_list;
254 ktime_t lkb_time_bast; /* for debugging */
255 ktime_t lkb_timestamp; 256 ktime_t lkb_timestamp;
256 unsigned long lkb_timeout_cs; 257 unsigned long lkb_timeout_cs;
257 258
259 struct dlm_callback lkb_callbacks[DLM_CALLBACKS_SIZE];
260 struct dlm_callback lkb_last_cast;
261 struct dlm_callback lkb_last_bast;
262 ktime_t lkb_last_cast_time; /* for debugging */
263 ktime_t lkb_last_bast_time; /* for debugging */
264
258 char *lkb_lvbptr; 265 char *lkb_lvbptr;
259 struct dlm_lksb *lkb_lksb; /* caller's status block */ 266 struct dlm_lksb *lkb_lksb; /* caller's status block */
260 void (*lkb_astfn) (void *astparam); 267 void (*lkb_astfn) (void *astparam);
@@ -544,8 +551,6 @@ struct dlm_user_args {
544 (dlm_user_proc) on the struct file, 551 (dlm_user_proc) on the struct file,
545 the process's locks point back to it*/ 552 the process's locks point back to it*/
546 struct dlm_lksb lksb; 553 struct dlm_lksb lksb;
547 int old_mode;
548 int update_user_lvb;
549 struct dlm_lksb __user *user_lksb; 554 struct dlm_lksb __user *user_lksb;
550 void __user *castparam; 555 void __user *castparam;
551 void __user *castaddr; 556 void __user *castaddr;
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index 64e5f3efdd81..04b8c449303f 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -160,10 +160,10 @@ static const int __quecvt_compat_matrix[8][8] = {
160void dlm_print_lkb(struct dlm_lkb *lkb) 160void dlm_print_lkb(struct dlm_lkb *lkb)
161{ 161{
162 printk(KERN_ERR "lkb: nodeid %d id %x remid %x exflags %x flags %x\n" 162 printk(KERN_ERR "lkb: nodeid %d id %x remid %x exflags %x flags %x\n"
163 " status %d rqmode %d grmode %d wait_type %d ast_type %d\n", 163 " status %d rqmode %d grmode %d wait_type %d\n",
164 lkb->lkb_nodeid, lkb->lkb_id, lkb->lkb_remid, lkb->lkb_exflags, 164 lkb->lkb_nodeid, lkb->lkb_id, lkb->lkb_remid, lkb->lkb_exflags,
165 lkb->lkb_flags, lkb->lkb_status, lkb->lkb_rqmode, 165 lkb->lkb_flags, lkb->lkb_status, lkb->lkb_rqmode,
166 lkb->lkb_grmode, lkb->lkb_wait_type, lkb->lkb_ast_type); 166 lkb->lkb_grmode, lkb->lkb_wait_type);
167} 167}
168 168
169static void dlm_print_rsb(struct dlm_rsb *r) 169static void dlm_print_rsb(struct dlm_rsb *r)
@@ -305,10 +305,7 @@ static void queue_cast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
305 rv = -EDEADLK; 305 rv = -EDEADLK;
306 } 306 }
307 307
308 lkb->lkb_lksb->sb_status = rv; 308 dlm_add_ast(lkb, DLM_CB_CAST, lkb->lkb_grmode, rv, lkb->lkb_sbflags);
309 lkb->lkb_lksb->sb_flags = lkb->lkb_sbflags;
310
311 dlm_add_ast(lkb, AST_COMP, lkb->lkb_grmode);
312} 309}
313 310
314static inline void queue_cast_overlap(struct dlm_rsb *r, struct dlm_lkb *lkb) 311static inline void queue_cast_overlap(struct dlm_rsb *r, struct dlm_lkb *lkb)
@@ -319,13 +316,10 @@ static inline void queue_cast_overlap(struct dlm_rsb *r, struct dlm_lkb *lkb)
319 316
320static void queue_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rqmode) 317static void queue_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rqmode)
321{ 318{
322 lkb->lkb_time_bast = ktime_get();
323
324 if (is_master_copy(lkb)) { 319 if (is_master_copy(lkb)) {
325 lkb->lkb_bastmode = rqmode; /* printed by debugfs */
326 send_bast(r, lkb, rqmode); 320 send_bast(r, lkb, rqmode);
327 } else { 321 } else {
328 dlm_add_ast(lkb, AST_BAST, rqmode); 322 dlm_add_ast(lkb, DLM_CB_BAST, rqmode, 0, 0);
329 } 323 }
330} 324}
331 325
@@ -600,6 +594,7 @@ static int create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret)
600 INIT_LIST_HEAD(&lkb->lkb_ownqueue); 594 INIT_LIST_HEAD(&lkb->lkb_ownqueue);
601 INIT_LIST_HEAD(&lkb->lkb_rsb_lookup); 595 INIT_LIST_HEAD(&lkb->lkb_rsb_lookup);
602 INIT_LIST_HEAD(&lkb->lkb_time_list); 596 INIT_LIST_HEAD(&lkb->lkb_time_list);
597 INIT_LIST_HEAD(&lkb->lkb_astqueue);
603 598
604 get_random_bytes(&bucket, sizeof(bucket)); 599 get_random_bytes(&bucket, sizeof(bucket));
605 bucket &= (ls->ls_lkbtbl_size - 1); 600 bucket &= (ls->ls_lkbtbl_size - 1);
@@ -2819,9 +2814,9 @@ static void send_args(struct dlm_rsb *r, struct dlm_lkb *lkb,
2819 not from lkb fields */ 2814 not from lkb fields */
2820 2815
2821 if (lkb->lkb_bastfn) 2816 if (lkb->lkb_bastfn)
2822 ms->m_asts |= AST_BAST; 2817 ms->m_asts |= DLM_CB_BAST;
2823 if (lkb->lkb_astfn) 2818 if (lkb->lkb_astfn)
2824 ms->m_asts |= AST_COMP; 2819 ms->m_asts |= DLM_CB_CAST;
2825 2820
2826 /* compare with switch in create_message; send_remove() doesn't 2821 /* compare with switch in create_message; send_remove() doesn't
2827 use send_args() */ 2822 use send_args() */
@@ -3122,8 +3117,8 @@ static int receive_request_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
3122 lkb->lkb_grmode = DLM_LOCK_IV; 3117 lkb->lkb_grmode = DLM_LOCK_IV;
3123 lkb->lkb_rqmode = ms->m_rqmode; 3118 lkb->lkb_rqmode = ms->m_rqmode;
3124 3119
3125 lkb->lkb_bastfn = (ms->m_asts & AST_BAST) ? &fake_bastfn : NULL; 3120 lkb->lkb_bastfn = (ms->m_asts & DLM_CB_BAST) ? &fake_bastfn : NULL;
3126 lkb->lkb_astfn = (ms->m_asts & AST_COMP) ? &fake_astfn : NULL; 3121 lkb->lkb_astfn = (ms->m_asts & DLM_CB_CAST) ? &fake_astfn : NULL;
3127 3122
3128 if (lkb->lkb_exflags & DLM_LKF_VALBLK) { 3123 if (lkb->lkb_exflags & DLM_LKF_VALBLK) {
3129 /* lkb was just created so there won't be an lvb yet */ 3124 /* lkb was just created so there won't be an lvb yet */
@@ -4412,8 +4407,8 @@ static int receive_rcom_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
4412 lkb->lkb_grmode = rl->rl_grmode; 4407 lkb->lkb_grmode = rl->rl_grmode;
4413 /* don't set lkb_status because add_lkb wants to itself */ 4408 /* don't set lkb_status because add_lkb wants to itself */
4414 4409
4415 lkb->lkb_bastfn = (rl->rl_asts & AST_BAST) ? &fake_bastfn : NULL; 4410 lkb->lkb_bastfn = (rl->rl_asts & DLM_CB_BAST) ? &fake_bastfn : NULL;
4416 lkb->lkb_astfn = (rl->rl_asts & AST_COMP) ? &fake_astfn : NULL; 4411 lkb->lkb_astfn = (rl->rl_asts & DLM_CB_CAST) ? &fake_astfn : NULL;
4417 4412
4418 if (lkb->lkb_exflags & DLM_LKF_VALBLK) { 4413 if (lkb->lkb_exflags & DLM_LKF_VALBLK) {
4419 int lvblen = rc->rc_header.h_length - sizeof(struct dlm_rcom) - 4414 int lvblen = rc->rc_header.h_length - sizeof(struct dlm_rcom) -
@@ -4589,7 +4584,6 @@ int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua,
4589 error = set_lock_args(mode, &ua->lksb, flags, namelen, timeout_cs, 4584 error = set_lock_args(mode, &ua->lksb, flags, namelen, timeout_cs,
4590 fake_astfn, ua, fake_bastfn, &args); 4585 fake_astfn, ua, fake_bastfn, &args);
4591 lkb->lkb_flags |= DLM_IFL_USER; 4586 lkb->lkb_flags |= DLM_IFL_USER;
4592 ua->old_mode = DLM_LOCK_IV;
4593 4587
4594 if (error) { 4588 if (error) {
4595 __put_lkb(ls, lkb); 4589 __put_lkb(ls, lkb);
@@ -4658,7 +4652,6 @@ int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
4658 ua->bastparam = ua_tmp->bastparam; 4652 ua->bastparam = ua_tmp->bastparam;
4659 ua->bastaddr = ua_tmp->bastaddr; 4653 ua->bastaddr = ua_tmp->bastaddr;
4660 ua->user_lksb = ua_tmp->user_lksb; 4654 ua->user_lksb = ua_tmp->user_lksb;
4661 ua->old_mode = lkb->lkb_grmode;
4662 4655
4663 error = set_lock_args(mode, &ua->lksb, flags, 0, timeout_cs, 4656 error = set_lock_args(mode, &ua->lksb, flags, 0, timeout_cs,
4664 fake_astfn, ua, fake_bastfn, &args); 4657 fake_astfn, ua, fake_bastfn, &args);
@@ -4917,8 +4910,9 @@ void dlm_clear_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc)
4917 } 4910 }
4918 4911
4919 list_for_each_entry_safe(lkb, safe, &proc->asts, lkb_astqueue) { 4912 list_for_each_entry_safe(lkb, safe, &proc->asts, lkb_astqueue) {
4920 lkb->lkb_ast_type = 0; 4913 memset(&lkb->lkb_callbacks, 0,
4921 list_del(&lkb->lkb_astqueue); 4914 sizeof(struct dlm_callback) * DLM_CALLBACKS_SIZE);
4915 list_del_init(&lkb->lkb_astqueue);
4922 dlm_put_lkb(lkb); 4916 dlm_put_lkb(lkb);
4923 } 4917 }
4924 4918
@@ -4958,7 +4952,9 @@ static void purge_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc)
4958 4952
4959 spin_lock(&proc->asts_spin); 4953 spin_lock(&proc->asts_spin);
4960 list_for_each_entry_safe(lkb, safe, &proc->asts, lkb_astqueue) { 4954 list_for_each_entry_safe(lkb, safe, &proc->asts, lkb_astqueue) {
4961 list_del(&lkb->lkb_astqueue); 4955 memset(&lkb->lkb_callbacks, 0,
4956 sizeof(struct dlm_callback) * DLM_CALLBACKS_SIZE);
4957 list_del_init(&lkb->lkb_astqueue);
4962 dlm_put_lkb(lkb); 4958 dlm_put_lkb(lkb);
4963 } 4959 }
4964 spin_unlock(&proc->asts_spin); 4960 spin_unlock(&proc->asts_spin);
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 9c64ae9e4c1a..bffa1e73b9a9 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -1468,15 +1468,15 @@ static void work_stop(void)
1468 1468
1469static int work_start(void) 1469static int work_start(void)
1470{ 1470{
1471 recv_workqueue = alloc_workqueue("dlm_recv", WQ_MEM_RECLAIM | 1471 recv_workqueue = alloc_workqueue("dlm_recv",
1472 WQ_HIGHPRI | WQ_FREEZEABLE, 0); 1472 WQ_UNBOUND | WQ_MEM_RECLAIM, 1);
1473 if (!recv_workqueue) { 1473 if (!recv_workqueue) {
1474 log_print("can't start dlm_recv"); 1474 log_print("can't start dlm_recv");
1475 return -ENOMEM; 1475 return -ENOMEM;
1476 } 1476 }
1477 1477
1478 send_workqueue = alloc_workqueue("dlm_send", WQ_MEM_RECLAIM | 1478 send_workqueue = alloc_workqueue("dlm_send",
1479 WQ_HIGHPRI | WQ_FREEZEABLE, 0); 1479 WQ_UNBOUND | WQ_MEM_RECLAIM, 1);
1480 if (!send_workqueue) { 1480 if (!send_workqueue) {
1481 log_print("can't start dlm_send"); 1481 log_print("can't start dlm_send");
1482 destroy_workqueue(recv_workqueue); 1482 destroy_workqueue(recv_workqueue);
diff --git a/fs/dlm/rcom.c b/fs/dlm/rcom.c
index 3c83a49a48a3..f10a50f24e8f 100644
--- a/fs/dlm/rcom.c
+++ b/fs/dlm/rcom.c
@@ -321,9 +321,9 @@ static void pack_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb,
321 rl->rl_wait_type = cpu_to_le16(lkb->lkb_wait_type); 321 rl->rl_wait_type = cpu_to_le16(lkb->lkb_wait_type);
322 322
323 if (lkb->lkb_bastfn) 323 if (lkb->lkb_bastfn)
324 rl->rl_asts |= AST_BAST; 324 rl->rl_asts |= DLM_CB_BAST;
325 if (lkb->lkb_astfn) 325 if (lkb->lkb_astfn)
326 rl->rl_asts |= AST_COMP; 326 rl->rl_asts |= DLM_CB_CAST;
327 327
328 rl->rl_namelen = cpu_to_le16(r->res_length); 328 rl->rl_namelen = cpu_to_le16(r->res_length);
329 memcpy(rl->rl_name, r->res_name, r->res_length); 329 memcpy(rl->rl_name, r->res_name, r->res_length);
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index 66d6c16bf440..d5ab3fe7c198 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -24,6 +24,7 @@
24#include "lock.h" 24#include "lock.h"
25#include "lvb_table.h" 25#include "lvb_table.h"
26#include "user.h" 26#include "user.h"
27#include "ast.h"
27 28
28static const char name_prefix[] = "dlm"; 29static const char name_prefix[] = "dlm";
29static const struct file_operations device_fops; 30static const struct file_operations device_fops;
@@ -152,19 +153,16 @@ static void compat_output(struct dlm_lock_result *res,
152 not related to the lifetime of the lkb struct which is managed 153 not related to the lifetime of the lkb struct which is managed
153 entirely by refcount. */ 154 entirely by refcount. */
154 155
155static int lkb_is_endoflife(struct dlm_lkb *lkb, int sb_status, int type) 156static int lkb_is_endoflife(int mode, int status)
156{ 157{
157 switch (sb_status) { 158 switch (status) {
158 case -DLM_EUNLOCK: 159 case -DLM_EUNLOCK:
159 return 1; 160 return 1;
160 case -DLM_ECANCEL: 161 case -DLM_ECANCEL:
161 case -ETIMEDOUT: 162 case -ETIMEDOUT:
162 case -EDEADLK: 163 case -EDEADLK:
163 if (lkb->lkb_grmode == DLM_LOCK_IV)
164 return 1;
165 break;
166 case -EAGAIN: 164 case -EAGAIN:
167 if (type == AST_COMP && lkb->lkb_grmode == DLM_LOCK_IV) 165 if (mode == DLM_LOCK_IV)
168 return 1; 166 return 1;
169 break; 167 break;
170 } 168 }
@@ -174,12 +172,13 @@ static int lkb_is_endoflife(struct dlm_lkb *lkb, int sb_status, int type)
174/* we could possibly check if the cancel of an orphan has resulted in the lkb 172/* we could possibly check if the cancel of an orphan has resulted in the lkb
175 being removed and then remove that lkb from the orphans list and free it */ 173 being removed and then remove that lkb from the orphans list and free it */
176 174
177void dlm_user_add_ast(struct dlm_lkb *lkb, int type, int mode) 175void dlm_user_add_ast(struct dlm_lkb *lkb, uint32_t flags, int mode,
176 int status, uint32_t sbflags, uint64_t seq)
178{ 177{
179 struct dlm_ls *ls; 178 struct dlm_ls *ls;
180 struct dlm_user_args *ua; 179 struct dlm_user_args *ua;
181 struct dlm_user_proc *proc; 180 struct dlm_user_proc *proc;
182 int eol = 0, ast_type; 181 int rv;
183 182
184 if (lkb->lkb_flags & (DLM_IFL_ORPHAN | DLM_IFL_DEAD)) 183 if (lkb->lkb_flags & (DLM_IFL_ORPHAN | DLM_IFL_DEAD))
185 return; 184 return;
@@ -200,49 +199,29 @@ void dlm_user_add_ast(struct dlm_lkb *lkb, int type, int mode)
200 ua = lkb->lkb_ua; 199 ua = lkb->lkb_ua;
201 proc = ua->proc; 200 proc = ua->proc;
202 201
203 if (type == AST_BAST && ua->bastaddr == NULL) 202 if ((flags & DLM_CB_BAST) && ua->bastaddr == NULL)
204 goto out; 203 goto out;
205 204
205 if ((flags & DLM_CB_CAST) && lkb_is_endoflife(mode, status))
206 lkb->lkb_flags |= DLM_IFL_ENDOFLIFE;
207
206 spin_lock(&proc->asts_spin); 208 spin_lock(&proc->asts_spin);
207 209
208 ast_type = lkb->lkb_ast_type; 210 rv = dlm_add_lkb_callback(lkb, flags, mode, status, sbflags, seq);
209 lkb->lkb_ast_type |= type; 211 if (rv < 0) {
210 if (type == AST_BAST) 212 spin_unlock(&proc->asts_spin);
211 lkb->lkb_bastmode = mode; 213 goto out;
212 else 214 }
213 lkb->lkb_castmode = mode;
214 215
215 if (!ast_type) { 216 if (list_empty(&lkb->lkb_astqueue)) {
216 kref_get(&lkb->lkb_ref); 217 kref_get(&lkb->lkb_ref);
217 list_add_tail(&lkb->lkb_astqueue, &proc->asts); 218 list_add_tail(&lkb->lkb_astqueue, &proc->asts);
218 lkb->lkb_ast_first = type;
219 wake_up_interruptible(&proc->wait); 219 wake_up_interruptible(&proc->wait);
220 } 220 }
221 if (type == AST_COMP && (ast_type & AST_COMP))
222 log_debug(ls, "ast overlap %x status %x %x",
223 lkb->lkb_id, ua->lksb.sb_status, lkb->lkb_flags);
224
225 eol = lkb_is_endoflife(lkb, ua->lksb.sb_status, type);
226 if (eol) {
227 lkb->lkb_flags |= DLM_IFL_ENDOFLIFE;
228 }
229
230 /* We want to copy the lvb to userspace when the completion
231 ast is read if the status is 0, the lock has an lvb and
232 lvb_ops says we should. We could probably have set_lvb_lock()
233 set update_user_lvb instead and not need old_mode */
234
235 if ((lkb->lkb_ast_type & AST_COMP) &&
236 (lkb->lkb_lksb->sb_status == 0) &&
237 lkb->lkb_lksb->sb_lvbptr &&
238 dlm_lvb_operations[ua->old_mode + 1][lkb->lkb_grmode + 1])
239 ua->update_user_lvb = 1;
240 else
241 ua->update_user_lvb = 0;
242
243 spin_unlock(&proc->asts_spin); 221 spin_unlock(&proc->asts_spin);
244 222
245 if (eol) { 223 if (lkb->lkb_flags & DLM_IFL_ENDOFLIFE) {
224 /* N.B. spin_lock locks_spin, not asts_spin */
246 spin_lock(&proc->locks_spin); 225 spin_lock(&proc->locks_spin);
247 if (!list_empty(&lkb->lkb_ownqueue)) { 226 if (!list_empty(&lkb->lkb_ownqueue)) {
248 list_del_init(&lkb->lkb_ownqueue); 227 list_del_init(&lkb->lkb_ownqueue);
@@ -705,8 +684,9 @@ static int device_close(struct inode *inode, struct file *file)
705 return 0; 684 return 0;
706} 685}
707 686
708static int copy_result_to_user(struct dlm_user_args *ua, int compat, int type, 687static int copy_result_to_user(struct dlm_user_args *ua, int compat,
709 int mode, char __user *buf, size_t count) 688 uint32_t flags, int mode, int copy_lvb,
689 char __user *buf, size_t count)
710{ 690{
711#ifdef CONFIG_COMPAT 691#ifdef CONFIG_COMPAT
712 struct dlm_lock_result32 result32; 692 struct dlm_lock_result32 result32;
@@ -730,7 +710,7 @@ static int copy_result_to_user(struct dlm_user_args *ua, int compat, int type,
730 notes that a new blocking AST address and parameter are set even if 710 notes that a new blocking AST address and parameter are set even if
731 the conversion fails, so maybe we should just do that. */ 711 the conversion fails, so maybe we should just do that. */
732 712
733 if (type == AST_BAST) { 713 if (flags & DLM_CB_BAST) {
734 result.user_astaddr = ua->bastaddr; 714 result.user_astaddr = ua->bastaddr;
735 result.user_astparam = ua->bastparam; 715 result.user_astparam = ua->bastparam;
736 result.bast_mode = mode; 716 result.bast_mode = mode;
@@ -750,8 +730,7 @@ static int copy_result_to_user(struct dlm_user_args *ua, int compat, int type,
750 /* copy lvb to userspace if there is one, it's been updated, and 730 /* copy lvb to userspace if there is one, it's been updated, and
751 the user buffer has space for it */ 731 the user buffer has space for it */
752 732
753 if (ua->update_user_lvb && ua->lksb.sb_lvbptr && 733 if (copy_lvb && ua->lksb.sb_lvbptr && count >= len + DLM_USER_LVB_LEN) {
754 count >= len + DLM_USER_LVB_LEN) {
755 if (copy_to_user(buf+len, ua->lksb.sb_lvbptr, 734 if (copy_to_user(buf+len, ua->lksb.sb_lvbptr,
756 DLM_USER_LVB_LEN)) { 735 DLM_USER_LVB_LEN)) {
757 error = -EFAULT; 736 error = -EFAULT;
@@ -801,13 +780,12 @@ static ssize_t device_read(struct file *file, char __user *buf, size_t count,
801 struct dlm_user_proc *proc = file->private_data; 780 struct dlm_user_proc *proc = file->private_data;
802 struct dlm_lkb *lkb; 781 struct dlm_lkb *lkb;
803 DECLARE_WAITQUEUE(wait, current); 782 DECLARE_WAITQUEUE(wait, current);
804 int error = 0, removed; 783 struct dlm_callback cb;
805 int ret_type, ret_mode; 784 int rv, resid, copy_lvb = 0;
806 int bastmode, castmode, do_bast, do_cast;
807 785
808 if (count == sizeof(struct dlm_device_version)) { 786 if (count == sizeof(struct dlm_device_version)) {
809 error = copy_version_to_user(buf, count); 787 rv = copy_version_to_user(buf, count);
810 return error; 788 return rv;
811 } 789 }
812 790
813 if (!proc) { 791 if (!proc) {
@@ -854,92 +832,57 @@ static ssize_t device_read(struct file *file, char __user *buf, size_t count,
854 } 832 }
855 } 833 }
856 834
857 /* there may be both completion and blocking asts to return for 835 /* if we empty lkb_callbacks, we don't want to unlock the spinlock
858 the lkb, don't remove lkb from asts list unless no asts remain */ 836 without removing lkb_astqueue; so empty lkb_astqueue is always
837 consistent with empty lkb_callbacks */
859 838
860 lkb = list_entry(proc->asts.next, struct dlm_lkb, lkb_astqueue); 839 lkb = list_entry(proc->asts.next, struct dlm_lkb, lkb_astqueue);
861 840
862 removed = 0; 841 rv = dlm_rem_lkb_callback(lkb->lkb_resource->res_ls, lkb, &cb, &resid);
863 ret_type = 0; 842 if (rv < 0) {
864 ret_mode = 0; 843 /* this shouldn't happen; lkb should have been removed from
865 do_bast = lkb->lkb_ast_type & AST_BAST; 844 list when resid was zero */
866 do_cast = lkb->lkb_ast_type & AST_COMP; 845 log_print("dlm_rem_lkb_callback empty %x", lkb->lkb_id);
867 bastmode = lkb->lkb_bastmode; 846 list_del_init(&lkb->lkb_astqueue);
868 castmode = lkb->lkb_castmode; 847 spin_unlock(&proc->asts_spin);
869 848 /* removes ref for proc->asts, may cause lkb to be freed */
870 /* when both are queued figure out which to do first and 849 dlm_put_lkb(lkb);
871 switch first so the other goes in the next read */ 850 goto try_another;
872
873 if (do_cast && do_bast) {
874 if (lkb->lkb_ast_first == AST_COMP) {
875 ret_type = AST_COMP;
876 ret_mode = castmode;
877 lkb->lkb_ast_type &= ~AST_COMP;
878 lkb->lkb_ast_first = AST_BAST;
879 } else {
880 ret_type = AST_BAST;
881 ret_mode = bastmode;
882 lkb->lkb_ast_type &= ~AST_BAST;
883 lkb->lkb_ast_first = AST_COMP;
884 }
885 } else {
886 ret_type = lkb->lkb_ast_first;
887 ret_mode = (ret_type == AST_COMP) ? castmode : bastmode;
888 lkb->lkb_ast_type &= ~ret_type;
889 lkb->lkb_ast_first = 0;
890 } 851 }
852 if (!resid)
853 list_del_init(&lkb->lkb_astqueue);
854 spin_unlock(&proc->asts_spin);
891 855
892 /* if we're doing a bast but the bast is unnecessary, then 856 if (cb.flags & DLM_CB_SKIP) {
893 switch to do nothing or do a cast if that was needed next */ 857 /* removes ref for proc->asts, may cause lkb to be freed */
894 858 if (!resid)
895 if ((ret_type == AST_BAST) && 859 dlm_put_lkb(lkb);
896 dlm_modes_compat(bastmode, lkb->lkb_castmode_done)) { 860 goto try_another;
897 ret_type = 0;
898 ret_mode = 0;
899
900 if (do_cast) {
901 ret_type = AST_COMP;
902 ret_mode = castmode;
903 lkb->lkb_ast_type &= ~AST_COMP;
904 lkb->lkb_ast_first = 0;
905 }
906 } 861 }
907 862
908 if (lkb->lkb_ast_first != lkb->lkb_ast_type) { 863 if (cb.flags & DLM_CB_CAST) {
909 log_print("device_read %x ast_first %x ast_type %x", 864 int old_mode, new_mode;
910 lkb->lkb_id, lkb->lkb_ast_first, lkb->lkb_ast_type);
911 }
912 865
913 if (!lkb->lkb_ast_type) { 866 old_mode = lkb->lkb_last_cast.mode;
914 list_del(&lkb->lkb_astqueue); 867 new_mode = cb.mode;
915 removed = 1;
916 }
917 spin_unlock(&proc->asts_spin);
918 868
919 if (ret_type) { 869 if (!cb.sb_status && lkb->lkb_lksb->sb_lvbptr &&
920 error = copy_result_to_user(lkb->lkb_ua, 870 dlm_lvb_operations[old_mode + 1][new_mode + 1])
921 test_bit(DLM_PROC_FLAGS_COMPAT, &proc->flags), 871 copy_lvb = 1;
922 ret_type, ret_mode, buf, count);
923 872
924 if (ret_type == AST_COMP) 873 lkb->lkb_lksb->sb_status = cb.sb_status;
925 lkb->lkb_castmode_done = castmode; 874 lkb->lkb_lksb->sb_flags = cb.sb_flags;
926 if (ret_type == AST_BAST)
927 lkb->lkb_bastmode_done = bastmode;
928 } 875 }
929 876
930 /* removes reference for the proc->asts lists added by 877 rv = copy_result_to_user(lkb->lkb_ua,
931 dlm_user_add_ast() and may result in the lkb being freed */ 878 test_bit(DLM_PROC_FLAGS_COMPAT, &proc->flags),
879 cb.flags, cb.mode, copy_lvb, buf, count);
932 880
933 if (removed) 881 /* removes ref for proc->asts, may cause lkb to be freed */
882 if (!resid)
934 dlm_put_lkb(lkb); 883 dlm_put_lkb(lkb);
935 884
936 /* the bast that was queued was eliminated (see unnecessary above), 885 return rv;
937 leaving nothing to return */
938
939 if (!ret_type)
940 goto try_another;
941
942 return error;
943} 886}
944 887
945static unsigned int device_poll(struct file *file, poll_table *wait) 888static unsigned int device_poll(struct file *file, poll_table *wait)
diff --git a/fs/dlm/user.h b/fs/dlm/user.h
index f196091dd7ff..00499ab8835f 100644
--- a/fs/dlm/user.h
+++ b/fs/dlm/user.h
@@ -9,7 +9,8 @@
9#ifndef __USER_DOT_H__ 9#ifndef __USER_DOT_H__
10#define __USER_DOT_H__ 10#define __USER_DOT_H__
11 11
12void dlm_user_add_ast(struct dlm_lkb *lkb, int type, int mode); 12void dlm_user_add_ast(struct dlm_lkb *lkb, uint32_t flags, int mode,
13 int status, uint32_t sbflags, uint64_t seq);
13int dlm_user_init(void); 14int dlm_user_init(void);
14void dlm_user_exit(void); 15void dlm_user_exit(void);
15int dlm_device_deregister(struct dlm_ls *ls); 16int dlm_device_deregister(struct dlm_ls *ls);
diff --git a/fs/drop_caches.c b/fs/drop_caches.c
index 2195c213ab2f..98b77c89494c 100644
--- a/fs/drop_caches.c
+++ b/fs/drop_caches.c
@@ -8,6 +8,7 @@
8#include <linux/writeback.h> 8#include <linux/writeback.h>
9#include <linux/sysctl.h> 9#include <linux/sysctl.h>
10#include <linux/gfp.h> 10#include <linux/gfp.h>
11#include "internal.h"
11 12
12/* A global variable is a bit ugly, but it keeps the code simple */ 13/* A global variable is a bit ugly, but it keeps the code simple */
13int sysctl_drop_caches; 14int sysctl_drop_caches;
@@ -16,20 +17,23 @@ static void drop_pagecache_sb(struct super_block *sb, void *unused)
16{ 17{
17 struct inode *inode, *toput_inode = NULL; 18 struct inode *inode, *toput_inode = NULL;
18 19
19 spin_lock(&inode_lock); 20 spin_lock(&inode_sb_list_lock);
20 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { 21 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
21 if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) 22 spin_lock(&inode->i_lock);
22 continue; 23 if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) ||
23 if (inode->i_mapping->nrpages == 0) 24 (inode->i_mapping->nrpages == 0)) {
25 spin_unlock(&inode->i_lock);
24 continue; 26 continue;
27 }
25 __iget(inode); 28 __iget(inode);
26 spin_unlock(&inode_lock); 29 spin_unlock(&inode->i_lock);
30 spin_unlock(&inode_sb_list_lock);
27 invalidate_mapping_pages(inode->i_mapping, 0, -1); 31 invalidate_mapping_pages(inode->i_mapping, 0, -1);
28 iput(toput_inode); 32 iput(toput_inode);
29 toput_inode = inode; 33 toput_inode = inode;
30 spin_lock(&inode_lock); 34 spin_lock(&inode_sb_list_lock);
31 } 35 }
32 spin_unlock(&inode_lock); 36 spin_unlock(&inode_sb_list_lock);
33 iput(toput_inode); 37 iput(toput_inode);
34} 38}
35 39
@@ -45,7 +49,11 @@ static void drop_slab(void)
45int drop_caches_sysctl_handler(ctl_table *table, int write, 49int drop_caches_sysctl_handler(ctl_table *table, int write,
46 void __user *buffer, size_t *length, loff_t *ppos) 50 void __user *buffer, size_t *length, loff_t *ppos)
47{ 51{
48 proc_dointvec_minmax(table, write, buffer, length, ppos); 52 int ret;
53
54 ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
55 if (ret)
56 return ret;
49 if (write) { 57 if (write) {
50 if (sysctl_drop_caches & 1) 58 if (sysctl_drop_caches & 1)
51 iterate_supers(drop_pagecache_sb, NULL); 59 iterate_supers(drop_pagecache_sb, NULL);
diff --git a/fs/ecryptfs/dentry.c b/fs/ecryptfs/dentry.c
index 6fc4f319b550..534c1d46e69e 100644
--- a/fs/ecryptfs/dentry.c
+++ b/fs/ecryptfs/dentry.c
@@ -46,24 +46,28 @@ static int ecryptfs_d_revalidate(struct dentry *dentry, struct nameidata *nd)
46{ 46{
47 struct dentry *lower_dentry; 47 struct dentry *lower_dentry;
48 struct vfsmount *lower_mnt; 48 struct vfsmount *lower_mnt;
49 struct dentry *dentry_save; 49 struct dentry *dentry_save = NULL;
50 struct vfsmount *vfsmount_save; 50 struct vfsmount *vfsmount_save = NULL;
51 int rc = 1; 51 int rc = 1;
52 52
53 if (nd->flags & LOOKUP_RCU) 53 if (nd && nd->flags & LOOKUP_RCU)
54 return -ECHILD; 54 return -ECHILD;
55 55
56 lower_dentry = ecryptfs_dentry_to_lower(dentry); 56 lower_dentry = ecryptfs_dentry_to_lower(dentry);
57 lower_mnt = ecryptfs_dentry_to_lower_mnt(dentry); 57 lower_mnt = ecryptfs_dentry_to_lower_mnt(dentry);
58 if (!lower_dentry->d_op || !lower_dentry->d_op->d_revalidate) 58 if (!lower_dentry->d_op || !lower_dentry->d_op->d_revalidate)
59 goto out; 59 goto out;
60 dentry_save = nd->path.dentry; 60 if (nd) {
61 vfsmount_save = nd->path.mnt; 61 dentry_save = nd->path.dentry;
62 nd->path.dentry = lower_dentry; 62 vfsmount_save = nd->path.mnt;
63 nd->path.mnt = lower_mnt; 63 nd->path.dentry = lower_dentry;
64 nd->path.mnt = lower_mnt;
65 }
64 rc = lower_dentry->d_op->d_revalidate(lower_dentry, nd); 66 rc = lower_dentry->d_op->d_revalidate(lower_dentry, nd);
65 nd->path.dentry = dentry_save; 67 if (nd) {
66 nd->path.mnt = vfsmount_save; 68 nd->path.dentry = dentry_save;
69 nd->path.mnt = vfsmount_save;
70 }
67 if (dentry->d_inode) { 71 if (dentry->d_inode) {
68 struct inode *lower_inode = 72 struct inode *lower_inode =
69 ecryptfs_inode_to_lower(dentry->d_inode); 73 ecryptfs_inode_to_lower(dentry->d_inode);
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index dbc84ed96336..e00753496e3e 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -632,8 +632,7 @@ int ecryptfs_interpose(struct dentry *hidden_dentry,
632 u32 flags); 632 u32 flags);
633int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry, 633int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry,
634 struct dentry *lower_dentry, 634 struct dentry *lower_dentry,
635 struct inode *ecryptfs_dir_inode, 635 struct inode *ecryptfs_dir_inode);
636 struct nameidata *ecryptfs_nd);
637int ecryptfs_decode_and_decrypt_filename(char **decrypted_name, 636int ecryptfs_decode_and_decrypt_filename(char **decrypted_name,
638 size_t *decrypted_name_size, 637 size_t *decrypted_name_size,
639 struct dentry *ecryptfs_dentry, 638 struct dentry *ecryptfs_dentry,
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index 81e10e6a9443..7d1050e254f9 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -317,6 +317,7 @@ ecryptfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
317 317
318const struct file_operations ecryptfs_dir_fops = { 318const struct file_operations ecryptfs_dir_fops = {
319 .readdir = ecryptfs_readdir, 319 .readdir = ecryptfs_readdir,
320 .read = generic_read_dir,
320 .unlocked_ioctl = ecryptfs_unlocked_ioctl, 321 .unlocked_ioctl = ecryptfs_unlocked_ioctl,
321#ifdef CONFIG_COMPAT 322#ifdef CONFIG_COMPAT
322 .compat_ioctl = ecryptfs_compat_ioctl, 323 .compat_ioctl = ecryptfs_compat_ioctl,
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index bd33f87a1907..b592938a84bc 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -74,16 +74,20 @@ ecryptfs_create_underlying_file(struct inode *lower_dir_inode,
74 unsigned int flags_save; 74 unsigned int flags_save;
75 int rc; 75 int rc;
76 76
77 dentry_save = nd->path.dentry; 77 if (nd) {
78 vfsmount_save = nd->path.mnt; 78 dentry_save = nd->path.dentry;
79 flags_save = nd->flags; 79 vfsmount_save = nd->path.mnt;
80 nd->path.dentry = lower_dentry; 80 flags_save = nd->flags;
81 nd->path.mnt = lower_mnt; 81 nd->path.dentry = lower_dentry;
82 nd->flags &= ~LOOKUP_OPEN; 82 nd->path.mnt = lower_mnt;
83 nd->flags &= ~LOOKUP_OPEN;
84 }
83 rc = vfs_create(lower_dir_inode, lower_dentry, mode, nd); 85 rc = vfs_create(lower_dir_inode, lower_dentry, mode, nd);
84 nd->path.dentry = dentry_save; 86 if (nd) {
85 nd->path.mnt = vfsmount_save; 87 nd->path.dentry = dentry_save;
86 nd->flags = flags_save; 88 nd->path.mnt = vfsmount_save;
89 nd->flags = flags_save;
90 }
87 return rc; 91 return rc;
88} 92}
89 93
@@ -241,8 +245,7 @@ out:
241 */ 245 */
242int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry, 246int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry,
243 struct dentry *lower_dentry, 247 struct dentry *lower_dentry,
244 struct inode *ecryptfs_dir_inode, 248 struct inode *ecryptfs_dir_inode)
245 struct nameidata *ecryptfs_nd)
246{ 249{
247 struct dentry *lower_dir_dentry; 250 struct dentry *lower_dir_dentry;
248 struct vfsmount *lower_mnt; 251 struct vfsmount *lower_mnt;
@@ -290,8 +293,6 @@ int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry,
290 goto out; 293 goto out;
291 if (special_file(lower_inode->i_mode)) 294 if (special_file(lower_inode->i_mode))
292 goto out; 295 goto out;
293 if (!ecryptfs_nd)
294 goto out;
295 /* Released in this function */ 296 /* Released in this function */
296 page_virt = kmem_cache_zalloc(ecryptfs_header_cache_2, GFP_USER); 297 page_virt = kmem_cache_zalloc(ecryptfs_header_cache_2, GFP_USER);
297 if (!page_virt) { 298 if (!page_virt) {
@@ -349,75 +350,6 @@ out:
349} 350}
350 351
351/** 352/**
352 * ecryptfs_new_lower_dentry
353 * @name: The name of the new dentry.
354 * @lower_dir_dentry: Parent directory of the new dentry.
355 * @nd: nameidata from last lookup.
356 *
357 * Create a new dentry or get it from lower parent dir.
358 */
359static struct dentry *
360ecryptfs_new_lower_dentry(struct qstr *name, struct dentry *lower_dir_dentry,
361 struct nameidata *nd)
362{
363 struct dentry *new_dentry;
364 struct dentry *tmp;
365 struct inode *lower_dir_inode;
366
367 lower_dir_inode = lower_dir_dentry->d_inode;
368
369 tmp = d_alloc(lower_dir_dentry, name);
370 if (!tmp)
371 return ERR_PTR(-ENOMEM);
372
373 mutex_lock(&lower_dir_inode->i_mutex);
374 new_dentry = lower_dir_inode->i_op->lookup(lower_dir_inode, tmp, nd);
375 mutex_unlock(&lower_dir_inode->i_mutex);
376
377 if (!new_dentry)
378 new_dentry = tmp;
379 else
380 dput(tmp);
381
382 return new_dentry;
383}
384
385
386/**
387 * ecryptfs_lookup_one_lower
388 * @ecryptfs_dentry: The eCryptfs dentry that we are looking up
389 * @lower_dir_dentry: lower parent directory
390 * @name: lower file name
391 *
392 * Get the lower dentry from vfs. If lower dentry does not exist yet,
393 * create it.
394 */
395static struct dentry *
396ecryptfs_lookup_one_lower(struct dentry *ecryptfs_dentry,
397 struct dentry *lower_dir_dentry, struct qstr *name)
398{
399 struct nameidata nd;
400 struct vfsmount *lower_mnt;
401 int err;
402
403 lower_mnt = mntget(ecryptfs_dentry_to_lower_mnt(
404 ecryptfs_dentry->d_parent));
405 err = vfs_path_lookup(lower_dir_dentry, lower_mnt, name->name , 0, &nd);
406 mntput(lower_mnt);
407
408 if (!err) {
409 /* we dont need the mount */
410 mntput(nd.path.mnt);
411 return nd.path.dentry;
412 }
413 if (err != -ENOENT)
414 return ERR_PTR(err);
415
416 /* create a new lower dentry */
417 return ecryptfs_new_lower_dentry(name, lower_dir_dentry, &nd);
418}
419
420/**
421 * ecryptfs_lookup 353 * ecryptfs_lookup
422 * @ecryptfs_dir_inode: The eCryptfs directory inode 354 * @ecryptfs_dir_inode: The eCryptfs directory inode
423 * @ecryptfs_dentry: The eCryptfs dentry that we are looking up 355 * @ecryptfs_dentry: The eCryptfs dentry that we are looking up
@@ -434,7 +366,6 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
434 size_t encrypted_and_encoded_name_size; 366 size_t encrypted_and_encoded_name_size;
435 struct ecryptfs_mount_crypt_stat *mount_crypt_stat = NULL; 367 struct ecryptfs_mount_crypt_stat *mount_crypt_stat = NULL;
436 struct dentry *lower_dir_dentry, *lower_dentry; 368 struct dentry *lower_dir_dentry, *lower_dentry;
437 struct qstr lower_name;
438 int rc = 0; 369 int rc = 0;
439 370
440 if ((ecryptfs_dentry->d_name.len == 1 371 if ((ecryptfs_dentry->d_name.len == 1
@@ -444,20 +375,14 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
444 goto out_d_drop; 375 goto out_d_drop;
445 } 376 }
446 lower_dir_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry->d_parent); 377 lower_dir_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry->d_parent);
447 lower_name.name = ecryptfs_dentry->d_name.name; 378 mutex_lock(&lower_dir_dentry->d_inode->i_mutex);
448 lower_name.len = ecryptfs_dentry->d_name.len; 379 lower_dentry = lookup_one_len(ecryptfs_dentry->d_name.name,
449 lower_name.hash = ecryptfs_dentry->d_name.hash; 380 lower_dir_dentry,
450 if (lower_dir_dentry->d_op && lower_dir_dentry->d_op->d_hash) { 381 ecryptfs_dentry->d_name.len);
451 rc = lower_dir_dentry->d_op->d_hash(lower_dir_dentry, 382 mutex_unlock(&lower_dir_dentry->d_inode->i_mutex);
452 lower_dir_dentry->d_inode, &lower_name);
453 if (rc < 0)
454 goto out_d_drop;
455 }
456 lower_dentry = ecryptfs_lookup_one_lower(ecryptfs_dentry,
457 lower_dir_dentry, &lower_name);
458 if (IS_ERR(lower_dentry)) { 383 if (IS_ERR(lower_dentry)) {
459 rc = PTR_ERR(lower_dentry); 384 rc = PTR_ERR(lower_dentry);
460 ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_lower() returned " 385 ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_len() returned "
461 "[%d] on lower_dentry = [%s]\n", __func__, rc, 386 "[%d] on lower_dentry = [%s]\n", __func__, rc,
462 encrypted_and_encoded_name); 387 encrypted_and_encoded_name);
463 goto out_d_drop; 388 goto out_d_drop;
@@ -479,28 +404,21 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
479 "filename; rc = [%d]\n", __func__, rc); 404 "filename; rc = [%d]\n", __func__, rc);
480 goto out_d_drop; 405 goto out_d_drop;
481 } 406 }
482 lower_name.name = encrypted_and_encoded_name; 407 mutex_lock(&lower_dir_dentry->d_inode->i_mutex);
483 lower_name.len = encrypted_and_encoded_name_size; 408 lower_dentry = lookup_one_len(encrypted_and_encoded_name,
484 lower_name.hash = full_name_hash(lower_name.name, lower_name.len); 409 lower_dir_dentry,
485 if (lower_dir_dentry->d_op && lower_dir_dentry->d_op->d_hash) { 410 encrypted_and_encoded_name_size);
486 rc = lower_dir_dentry->d_op->d_hash(lower_dir_dentry, 411 mutex_unlock(&lower_dir_dentry->d_inode->i_mutex);
487 lower_dir_dentry->d_inode, &lower_name);
488 if (rc < 0)
489 goto out_d_drop;
490 }
491 lower_dentry = ecryptfs_lookup_one_lower(ecryptfs_dentry,
492 lower_dir_dentry, &lower_name);
493 if (IS_ERR(lower_dentry)) { 412 if (IS_ERR(lower_dentry)) {
494 rc = PTR_ERR(lower_dentry); 413 rc = PTR_ERR(lower_dentry);
495 ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_lower() returned " 414 ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_len() returned "
496 "[%d] on lower_dentry = [%s]\n", __func__, rc, 415 "[%d] on lower_dentry = [%s]\n", __func__, rc,
497 encrypted_and_encoded_name); 416 encrypted_and_encoded_name);
498 goto out_d_drop; 417 goto out_d_drop;
499 } 418 }
500lookup_and_interpose: 419lookup_and_interpose:
501 rc = ecryptfs_lookup_and_interpose_lower(ecryptfs_dentry, lower_dentry, 420 rc = ecryptfs_lookup_and_interpose_lower(ecryptfs_dentry, lower_dentry,
502 ecryptfs_dir_inode, 421 ecryptfs_dir_inode);
503 ecryptfs_nd);
504 goto out; 422 goto out;
505out_d_drop: 423out_d_drop:
506 d_drop(ecryptfs_dentry); 424 d_drop(ecryptfs_dentry);
@@ -1092,6 +1010,8 @@ int ecryptfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
1092 rc = vfs_getattr(ecryptfs_dentry_to_lower_mnt(dentry), 1010 rc = vfs_getattr(ecryptfs_dentry_to_lower_mnt(dentry),
1093 ecryptfs_dentry_to_lower(dentry), &lower_stat); 1011 ecryptfs_dentry_to_lower(dentry), &lower_stat);
1094 if (!rc) { 1012 if (!rc) {
1013 fsstack_copy_attr_all(dentry->d_inode,
1014 ecryptfs_inode_to_lower(dentry->d_inode));
1095 generic_fillattr(dentry->d_inode, stat); 1015 generic_fillattr(dentry->d_inode, stat);
1096 stat->blocks = lower_stat.blocks; 1016 stat->blocks = lower_stat.blocks;
1097 } 1017 }
diff --git a/fs/efs/inode.c b/fs/efs/inode.c
index a8e7797b9477..9c13412e6c99 100644
--- a/fs/efs/inode.c
+++ b/fs/efs/inode.c
@@ -23,7 +23,6 @@ static sector_t _efs_bmap(struct address_space *mapping, sector_t block)
23} 23}
24static const struct address_space_operations efs_aops = { 24static const struct address_space_operations efs_aops = {
25 .readpage = efs_readpage, 25 .readpage = efs_readpage,
26 .sync_page = block_sync_page,
27 .bmap = _efs_bmap 26 .bmap = _efs_bmap
28}; 27};
29 28
diff --git a/fs/eventfd.c b/fs/eventfd.c
index e0194b3e14d6..d9a591773919 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -99,7 +99,7 @@ EXPORT_SYMBOL_GPL(eventfd_ctx_get);
99 * @ctx: [in] Pointer to eventfd context. 99 * @ctx: [in] Pointer to eventfd context.
100 * 100 *
101 * The eventfd context reference must have been previously acquired either 101 * The eventfd context reference must have been previously acquired either
102 * with eventfd_ctx_get() or eventfd_ctx_fdget()). 102 * with eventfd_ctx_get() or eventfd_ctx_fdget().
103 */ 103 */
104void eventfd_ctx_put(struct eventfd_ctx *ctx) 104void eventfd_ctx_put(struct eventfd_ctx *ctx)
105{ 105{
@@ -146,9 +146,9 @@ static void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt)
146 * eventfd_ctx_remove_wait_queue - Read the current counter and removes wait queue. 146 * eventfd_ctx_remove_wait_queue - Read the current counter and removes wait queue.
147 * @ctx: [in] Pointer to eventfd context. 147 * @ctx: [in] Pointer to eventfd context.
148 * @wait: [in] Wait queue to be removed. 148 * @wait: [in] Wait queue to be removed.
149 * @cnt: [out] Pointer to the 64bit conter value. 149 * @cnt: [out] Pointer to the 64-bit counter value.
150 * 150 *
151 * Returns zero if successful, or the following error codes: 151 * Returns %0 if successful, or the following error codes:
152 * 152 *
153 * -EAGAIN : The operation would have blocked. 153 * -EAGAIN : The operation would have blocked.
154 * 154 *
@@ -175,11 +175,11 @@ EXPORT_SYMBOL_GPL(eventfd_ctx_remove_wait_queue);
175 * eventfd_ctx_read - Reads the eventfd counter or wait if it is zero. 175 * eventfd_ctx_read - Reads the eventfd counter or wait if it is zero.
176 * @ctx: [in] Pointer to eventfd context. 176 * @ctx: [in] Pointer to eventfd context.
177 * @no_wait: [in] Different from zero if the operation should not block. 177 * @no_wait: [in] Different from zero if the operation should not block.
178 * @cnt: [out] Pointer to the 64bit conter value. 178 * @cnt: [out] Pointer to the 64-bit counter value.
179 * 179 *
180 * Returns zero if successful, or the following error codes: 180 * Returns %0 if successful, or the following error codes:
181 * 181 *
182 * -EAGAIN : The operation would have blocked but @no_wait was nonzero. 182 * -EAGAIN : The operation would have blocked but @no_wait was non-zero.
183 * -ERESTARTSYS : A signal interrupted the wait operation. 183 * -ERESTARTSYS : A signal interrupted the wait operation.
184 * 184 *
185 * If @no_wait is zero, the function might sleep until the eventfd internal 185 * If @no_wait is zero, the function might sleep until the eventfd internal
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index cc8a9b7d6064..ed38801b57a7 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -62,7 +62,14 @@
62 * This mutex is acquired by ep_free() during the epoll file 62 * This mutex is acquired by ep_free() during the epoll file
63 * cleanup path and it is also acquired by eventpoll_release_file() 63 * cleanup path and it is also acquired by eventpoll_release_file()
64 * if a file has been pushed inside an epoll set and it is then 64 * if a file has been pushed inside an epoll set and it is then
65 * close()d without a previous call toepoll_ctl(EPOLL_CTL_DEL). 65 * close()d without a previous call to epoll_ctl(EPOLL_CTL_DEL).
66 * It is also acquired when inserting an epoll fd onto another epoll
67 * fd. We do this so that we walk the epoll tree and ensure that this
68 * insertion does not create a cycle of epoll file descriptors, which
69 * could lead to deadlock. We need a global mutex to prevent two
70 * simultaneous inserts (A into B and B into A) from racing and
71 * constructing a cycle without either insert observing that it is
72 * going to.
66 * It is possible to drop the "ep->mtx" and to use the global 73 * It is possible to drop the "ep->mtx" and to use the global
67 * mutex "epmutex" (together with "ep->lock") to have it working, 74 * mutex "epmutex" (together with "ep->lock") to have it working,
68 * but having "ep->mtx" will make the interface more scalable. 75 * but having "ep->mtx" will make the interface more scalable.
@@ -145,11 +152,11 @@ struct epitem {
145 152
146/* 153/*
147 * This structure is stored inside the "private_data" member of the file 154 * This structure is stored inside the "private_data" member of the file
148 * structure and rapresent the main data sructure for the eventpoll 155 * structure and represents the main data structure for the eventpoll
149 * interface. 156 * interface.
150 */ 157 */
151struct eventpoll { 158struct eventpoll {
152 /* Protect the this structure access */ 159 /* Protect the access to this structure */
153 spinlock_t lock; 160 spinlock_t lock;
154 161
155 /* 162 /*
@@ -224,6 +231,9 @@ static long max_user_watches __read_mostly;
224 */ 231 */
225static DEFINE_MUTEX(epmutex); 232static DEFINE_MUTEX(epmutex);
226 233
234/* Used to check for epoll file descriptor inclusion loops */
235static struct nested_calls poll_loop_ncalls;
236
227/* Used for safe wake up implementation */ 237/* Used for safe wake up implementation */
228static struct nested_calls poll_safewake_ncalls; 238static struct nested_calls poll_safewake_ncalls;
229 239
@@ -306,6 +316,19 @@ static void ep_nested_calls_init(struct nested_calls *ncalls)
306} 316}
307 317
308/** 318/**
319 * ep_events_available - Checks if ready events might be available.
320 *
321 * @ep: Pointer to the eventpoll context.
322 *
323 * Returns: Returns a value different than zero if ready events are available,
324 * or zero otherwise.
325 */
326static inline int ep_events_available(struct eventpoll *ep)
327{
328 return !list_empty(&ep->rdllist) || ep->ovflist != EP_UNACTIVE_PTR;
329}
330
331/**
309 * ep_call_nested - Perform a bound (possibly) nested call, by checking 332 * ep_call_nested - Perform a bound (possibly) nested call, by checking
310 * that the recursion limit is not exceeded, and that 333 * that the recursion limit is not exceeded, and that
311 * the same nested call (by the meaning of same cookie) is 334 * the same nested call (by the meaning of same cookie) is
@@ -783,7 +806,7 @@ static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd)
783 806
784/* 807/*
785 * This is the callback that is passed to the wait queue wakeup 808 * This is the callback that is passed to the wait queue wakeup
786 * machanism. It is called by the stored file descriptors when they 809 * mechanism. It is called by the stored file descriptors when they
787 * have events to report. 810 * have events to report.
788 */ 811 */
789static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *key) 812static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *key)
@@ -814,9 +837,9 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k
814 goto out_unlock; 837 goto out_unlock;
815 838
816 /* 839 /*
817 * If we are trasfering events to userspace, we can hold no locks 840 * If we are transferring events to userspace, we can hold no locks
818 * (because we're accessing user memory, and because of linux f_op->poll() 841 * (because we're accessing user memory, and because of linux f_op->poll()
819 * semantics). All the events that happens during that period of time are 842 * semantics). All the events that happen during that period of time are
820 * chained in ep->ovflist and requeued later on. 843 * chained in ep->ovflist and requeued later on.
821 */ 844 */
822 if (unlikely(ep->ovflist != EP_UNACTIVE_PTR)) { 845 if (unlikely(ep->ovflist != EP_UNACTIVE_PTR)) {
@@ -1114,31 +1137,63 @@ static int ep_send_events(struct eventpoll *ep,
1114 return ep_scan_ready_list(ep, ep_send_events_proc, &esed); 1137 return ep_scan_ready_list(ep, ep_send_events_proc, &esed);
1115} 1138}
1116 1139
1140static inline struct timespec ep_set_mstimeout(long ms)
1141{
1142 struct timespec now, ts = {
1143 .tv_sec = ms / MSEC_PER_SEC,
1144 .tv_nsec = NSEC_PER_MSEC * (ms % MSEC_PER_SEC),
1145 };
1146
1147 ktime_get_ts(&now);
1148 return timespec_add_safe(now, ts);
1149}
1150
1151/**
1152 * ep_poll - Retrieves ready events, and delivers them to the caller supplied
1153 * event buffer.
1154 *
1155 * @ep: Pointer to the eventpoll context.
1156 * @events: Pointer to the userspace buffer where the ready events should be
1157 * stored.
1158 * @maxevents: Size (in terms of number of events) of the caller event buffer.
1159 * @timeout: Maximum timeout for the ready events fetch operation, in
1160 * milliseconds. If the @timeout is zero, the function will not block,
1161 * while if the @timeout is less than zero, the function will block
1162 * until at least one event has been retrieved (or an error
1163 * occurred).
1164 *
1165 * Returns: Returns the number of ready events which have been fetched, or an
1166 * error code, in case of error.
1167 */
1117static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, 1168static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
1118 int maxevents, long timeout) 1169 int maxevents, long timeout)
1119{ 1170{
1120 int res, eavail, timed_out = 0; 1171 int res = 0, eavail, timed_out = 0;
1121 unsigned long flags; 1172 unsigned long flags;
1122 long slack; 1173 long slack = 0;
1123 wait_queue_t wait; 1174 wait_queue_t wait;
1124 struct timespec end_time;
1125 ktime_t expires, *to = NULL; 1175 ktime_t expires, *to = NULL;
1126 1176
1127 if (timeout > 0) { 1177 if (timeout > 0) {
1128 ktime_get_ts(&end_time); 1178 struct timespec end_time = ep_set_mstimeout(timeout);
1129 timespec_add_ns(&end_time, (u64)timeout * NSEC_PER_MSEC); 1179
1130 slack = select_estimate_accuracy(&end_time); 1180 slack = select_estimate_accuracy(&end_time);
1131 to = &expires; 1181 to = &expires;
1132 *to = timespec_to_ktime(end_time); 1182 *to = timespec_to_ktime(end_time);
1133 } else if (timeout == 0) { 1183 } else if (timeout == 0) {
1184 /*
1185 * Avoid the unnecessary trip to the wait queue loop, if the
1186 * caller specified a non blocking operation.
1187 */
1134 timed_out = 1; 1188 timed_out = 1;
1189 spin_lock_irqsave(&ep->lock, flags);
1190 goto check_events;
1135 } 1191 }
1136 1192
1137retry: 1193fetch_events:
1138 spin_lock_irqsave(&ep->lock, flags); 1194 spin_lock_irqsave(&ep->lock, flags);
1139 1195
1140 res = 0; 1196 if (!ep_events_available(ep)) {
1141 if (list_empty(&ep->rdllist)) {
1142 /* 1197 /*
1143 * We don't have any available event to return to the caller. 1198 * We don't have any available event to return to the caller.
1144 * We need to sleep here, and we will be wake up by 1199 * We need to sleep here, and we will be wake up by
@@ -1154,7 +1209,7 @@ retry:
1154 * to TASK_INTERRUPTIBLE before doing the checks. 1209 * to TASK_INTERRUPTIBLE before doing the checks.
1155 */ 1210 */
1156 set_current_state(TASK_INTERRUPTIBLE); 1211 set_current_state(TASK_INTERRUPTIBLE);
1157 if (!list_empty(&ep->rdllist) || timed_out) 1212 if (ep_events_available(ep) || timed_out)
1158 break; 1213 break;
1159 if (signal_pending(current)) { 1214 if (signal_pending(current)) {
1160 res = -EINTR; 1215 res = -EINTR;
@@ -1171,8 +1226,9 @@ retry:
1171 1226
1172 set_current_state(TASK_RUNNING); 1227 set_current_state(TASK_RUNNING);
1173 } 1228 }
1229check_events:
1174 /* Is it worth to try to dig for events ? */ 1230 /* Is it worth to try to dig for events ? */
1175 eavail = !list_empty(&ep->rdllist) || ep->ovflist != EP_UNACTIVE_PTR; 1231 eavail = ep_events_available(ep);
1176 1232
1177 spin_unlock_irqrestore(&ep->lock, flags); 1233 spin_unlock_irqrestore(&ep->lock, flags);
1178 1234
@@ -1183,11 +1239,67 @@ retry:
1183 */ 1239 */
1184 if (!res && eavail && 1240 if (!res && eavail &&
1185 !(res = ep_send_events(ep, events, maxevents)) && !timed_out) 1241 !(res = ep_send_events(ep, events, maxevents)) && !timed_out)
1186 goto retry; 1242 goto fetch_events;
1187 1243
1188 return res; 1244 return res;
1189} 1245}
1190 1246
1247/**
1248 * ep_loop_check_proc - Callback function to be passed to the @ep_call_nested()
1249 * API, to verify that adding an epoll file inside another
1250 * epoll structure, does not violate the constraints, in
1251 * terms of closed loops, or too deep chains (which can
1252 * result in excessive stack usage).
1253 *
1254 * @priv: Pointer to the epoll file to be currently checked.
1255 * @cookie: Original cookie for this call. This is the top-of-the-chain epoll
1256 * data structure pointer.
1257 * @call_nests: Current dept of the @ep_call_nested() call stack.
1258 *
1259 * Returns: Returns zero if adding the epoll @file inside current epoll
1260 * structure @ep does not violate the constraints, or -1 otherwise.
1261 */
1262static int ep_loop_check_proc(void *priv, void *cookie, int call_nests)
1263{
1264 int error = 0;
1265 struct file *file = priv;
1266 struct eventpoll *ep = file->private_data;
1267 struct rb_node *rbp;
1268 struct epitem *epi;
1269
1270 mutex_lock(&ep->mtx);
1271 for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) {
1272 epi = rb_entry(rbp, struct epitem, rbn);
1273 if (unlikely(is_file_epoll(epi->ffd.file))) {
1274 error = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,
1275 ep_loop_check_proc, epi->ffd.file,
1276 epi->ffd.file->private_data, current);
1277 if (error != 0)
1278 break;
1279 }
1280 }
1281 mutex_unlock(&ep->mtx);
1282
1283 return error;
1284}
1285
1286/**
1287 * ep_loop_check - Performs a check to verify that adding an epoll file (@file)
1288 * another epoll file (represented by @ep) does not create
1289 * closed loops or too deep chains.
1290 *
1291 * @ep: Pointer to the epoll private data structure.
1292 * @file: Pointer to the epoll file to be checked.
1293 *
1294 * Returns: Returns zero if adding the epoll @file inside current epoll
1295 * structure @ep does not violate the constraints, or -1 otherwise.
1296 */
1297static int ep_loop_check(struct eventpoll *ep, struct file *file)
1298{
1299 return ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,
1300 ep_loop_check_proc, file, ep, current);
1301}
1302
1191/* 1303/*
1192 * Open an eventpoll file descriptor. 1304 * Open an eventpoll file descriptor.
1193 */ 1305 */
@@ -1236,6 +1348,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
1236 struct epoll_event __user *, event) 1348 struct epoll_event __user *, event)
1237{ 1349{
1238 int error; 1350 int error;
1351 int did_lock_epmutex = 0;
1239 struct file *file, *tfile; 1352 struct file *file, *tfile;
1240 struct eventpoll *ep; 1353 struct eventpoll *ep;
1241 struct epitem *epi; 1354 struct epitem *epi;
@@ -1277,6 +1390,25 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
1277 */ 1390 */
1278 ep = file->private_data; 1391 ep = file->private_data;
1279 1392
1393 /*
1394 * When we insert an epoll file descriptor, inside another epoll file
1395 * descriptor, there is the change of creating closed loops, which are
1396 * better be handled here, than in more critical paths.
1397 *
1398 * We hold epmutex across the loop check and the insert in this case, in
1399 * order to prevent two separate inserts from racing and each doing the
1400 * insert "at the same time" such that ep_loop_check passes on both
1401 * before either one does the insert, thereby creating a cycle.
1402 */
1403 if (unlikely(is_file_epoll(tfile) && op == EPOLL_CTL_ADD)) {
1404 mutex_lock(&epmutex);
1405 did_lock_epmutex = 1;
1406 error = -ELOOP;
1407 if (ep_loop_check(ep, tfile) != 0)
1408 goto error_tgt_fput;
1409 }
1410
1411
1280 mutex_lock(&ep->mtx); 1412 mutex_lock(&ep->mtx);
1281 1413
1282 /* 1414 /*
@@ -1312,6 +1444,9 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
1312 mutex_unlock(&ep->mtx); 1444 mutex_unlock(&ep->mtx);
1313 1445
1314error_tgt_fput: 1446error_tgt_fput:
1447 if (unlikely(did_lock_epmutex))
1448 mutex_unlock(&epmutex);
1449
1315 fput(tfile); 1450 fput(tfile);
1316error_fput: 1451error_fput:
1317 fput(file); 1452 fput(file);
@@ -1431,6 +1566,12 @@ static int __init eventpoll_init(void)
1431 EP_ITEM_COST; 1566 EP_ITEM_COST;
1432 BUG_ON(max_user_watches < 0); 1567 BUG_ON(max_user_watches < 0);
1433 1568
1569 /*
1570 * Initialize the structure used to perform epoll file descriptor
1571 * inclusion loops checks.
1572 */
1573 ep_nested_calls_init(&poll_loop_ncalls);
1574
1434 /* Initialize the structure used to perform safe poll wait head wake ups */ 1575 /* Initialize the structure used to perform safe poll wait head wake ups */
1435 ep_nested_calls_init(&poll_safewake_ncalls); 1576 ep_nested_calls_init(&poll_safewake_ncalls);
1436 1577
diff --git a/fs/exec.c b/fs/exec.c
index c62efcb959c7..5e62d26a4fec 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -115,13 +115,16 @@ SYSCALL_DEFINE1(uselib, const char __user *, library)
115 struct file *file; 115 struct file *file;
116 char *tmp = getname(library); 116 char *tmp = getname(library);
117 int error = PTR_ERR(tmp); 117 int error = PTR_ERR(tmp);
118 static const struct open_flags uselib_flags = {
119 .open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC,
120 .acc_mode = MAY_READ | MAY_EXEC | MAY_OPEN,
121 .intent = LOOKUP_OPEN
122 };
118 123
119 if (IS_ERR(tmp)) 124 if (IS_ERR(tmp))
120 goto out; 125 goto out;
121 126
122 file = do_filp_open(AT_FDCWD, tmp, 127 file = do_filp_open(AT_FDCWD, tmp, &uselib_flags, LOOKUP_FOLLOW);
123 O_LARGEFILE | O_RDONLY | FMODE_EXEC, 0,
124 MAY_READ | MAY_EXEC | MAY_OPEN);
125 putname(tmp); 128 putname(tmp);
126 error = PTR_ERR(file); 129 error = PTR_ERR(file);
127 if (IS_ERR(file)) 130 if (IS_ERR(file))
@@ -721,10 +724,13 @@ struct file *open_exec(const char *name)
721{ 724{
722 struct file *file; 725 struct file *file;
723 int err; 726 int err;
727 static const struct open_flags open_exec_flags = {
728 .open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC,
729 .acc_mode = MAY_EXEC | MAY_OPEN,
730 .intent = LOOKUP_OPEN
731 };
724 732
725 file = do_filp_open(AT_FDCWD, name, 733 file = do_filp_open(AT_FDCWD, name, &open_exec_flags, LOOKUP_FOLLOW);
726 O_LARGEFILE | O_RDONLY | FMODE_EXEC, 0,
727 MAY_EXEC | MAY_OPEN);
728 if (IS_ERR(file)) 734 if (IS_ERR(file))
729 goto out; 735 goto out;
730 736
@@ -1869,7 +1875,7 @@ static void wait_for_dump_helpers(struct file *file)
1869 1875
1870 1876
1871/* 1877/*
1872 * uhm_pipe_setup 1878 * umh_pipe_setup
1873 * helper function to customize the process used 1879 * helper function to customize the process used
1874 * to collect the core in userspace. Specifically 1880 * to collect the core in userspace. Specifically
1875 * it sets up a pipe and installs it as fd 0 (stdin) 1881 * it sets up a pipe and installs it as fd 0 (stdin)
diff --git a/fs/exofs/common.h b/fs/exofs/common.h
index f0d520312d8b..5e74ad3d4009 100644
--- a/fs/exofs/common.h
+++ b/fs/exofs/common.h
@@ -53,10 +53,14 @@
53#define EXOFS_ROOT_ID 0x10002 /* object ID for root directory */ 53#define EXOFS_ROOT_ID 0x10002 /* object ID for root directory */
54 54
55/* exofs Application specific page/attribute */ 55/* exofs Application specific page/attribute */
56/* Inode attrs */
56# define EXOFS_APAGE_FS_DATA (OSD_APAGE_APP_DEFINED_FIRST + 3) 57# define EXOFS_APAGE_FS_DATA (OSD_APAGE_APP_DEFINED_FIRST + 3)
57# define EXOFS_ATTR_INODE_DATA 1 58# define EXOFS_ATTR_INODE_DATA 1
58# define EXOFS_ATTR_INODE_FILE_LAYOUT 2 59# define EXOFS_ATTR_INODE_FILE_LAYOUT 2
59# define EXOFS_ATTR_INODE_DIR_LAYOUT 3 60# define EXOFS_ATTR_INODE_DIR_LAYOUT 3
61/* Partition attrs */
62# define EXOFS_APAGE_SB_DATA (0xF0000000U + 3)
63# define EXOFS_ATTR_SB_STATS 1
60 64
61/* 65/*
62 * The maximum number of files we can have is limited by the size of the 66 * The maximum number of files we can have is limited by the size of the
@@ -86,8 +90,8 @@ enum {
86 */ 90 */
87enum {EXOFS_FSCB_VER = 1, EXOFS_DT_VER = 1}; 91enum {EXOFS_FSCB_VER = 1, EXOFS_DT_VER = 1};
88struct exofs_fscb { 92struct exofs_fscb {
89 __le64 s_nextid; /* Highest object ID used */ 93 __le64 s_nextid; /* Only used after mkfs */
90 __le64 s_numfiles; /* Number of files on fs */ 94 __le64 s_numfiles; /* Only used after mkfs */
91 __le32 s_version; /* == EXOFS_FSCB_VER */ 95 __le32 s_version; /* == EXOFS_FSCB_VER */
92 __le16 s_magic; /* Magic signature */ 96 __le16 s_magic; /* Magic signature */
93 __le16 s_newfs; /* Non-zero if this is a new fs */ 97 __le16 s_newfs; /* Non-zero if this is a new fs */
@@ -98,6 +102,16 @@ struct exofs_fscb {
98} __packed; 102} __packed;
99 103
100/* 104/*
105 * This struct is set on the FS partition's attributes.
106 * [EXOFS_APAGE_SB_DATA, EXOFS_ATTR_SB_STATS] and is written together
107 * with the create command, to atomically persist the sb writeable information.
108 */
109struct exofs_sb_stats {
110 __le64 s_nextid; /* Highest object ID used */
111 __le64 s_numfiles; /* Number of files on fs */
112} __packed;
113
114/*
101 * Describes the raid used in the FS. It is part of the device table. 115 * Describes the raid used in the FS. It is part of the device table.
102 * This here is taken from the pNFS-objects definition. In exofs we 116 * This here is taken from the pNFS-objects definition. In exofs we
103 * use one raid policy through-out the filesystem. (NOTE: the funny 117 * use one raid policy through-out the filesystem. (NOTE: the funny
diff --git a/fs/exofs/dir.c b/fs/exofs/dir.c
index dcc941d82d67..d0941c6a1f72 100644
--- a/fs/exofs/dir.c
+++ b/fs/exofs/dir.c
@@ -124,7 +124,7 @@ out:
124 124
125Ebadsize: 125Ebadsize:
126 EXOFS_ERR("ERROR [exofs_check_page]: " 126 EXOFS_ERR("ERROR [exofs_check_page]: "
127 "size of directory #%lu is not a multiple of chunk size", 127 "size of directory(0x%lx) is not a multiple of chunk size\n",
128 dir->i_ino 128 dir->i_ino
129 ); 129 );
130 goto fail; 130 goto fail;
@@ -142,8 +142,8 @@ Espan:
142 goto bad_entry; 142 goto bad_entry;
143bad_entry: 143bad_entry:
144 EXOFS_ERR( 144 EXOFS_ERR(
145 "ERROR [exofs_check_page]: bad entry in directory #%lu: %s - " 145 "ERROR [exofs_check_page]: bad entry in directory(0x%lx): %s - "
146 "offset=%lu, inode=%llu, rec_len=%d, name_len=%d", 146 "offset=%lu, inode=0x%llu, rec_len=%d, name_len=%d\n",
147 dir->i_ino, error, (page->index<<PAGE_CACHE_SHIFT)+offs, 147 dir->i_ino, error, (page->index<<PAGE_CACHE_SHIFT)+offs,
148 _LLU(le64_to_cpu(p->inode_no)), 148 _LLU(le64_to_cpu(p->inode_no)),
149 rec_len, p->name_len); 149 rec_len, p->name_len);
@@ -151,8 +151,8 @@ bad_entry:
151Eend: 151Eend:
152 p = (struct exofs_dir_entry *)(kaddr + offs); 152 p = (struct exofs_dir_entry *)(kaddr + offs);
153 EXOFS_ERR("ERROR [exofs_check_page]: " 153 EXOFS_ERR("ERROR [exofs_check_page]: "
154 "entry in directory #%lu spans the page boundary" 154 "entry in directory(0x%lx) spans the page boundary"
155 "offset=%lu, inode=%llu", 155 "offset=%lu, inode=0x%llx\n",
156 dir->i_ino, (page->index<<PAGE_CACHE_SHIFT)+offs, 156 dir->i_ino, (page->index<<PAGE_CACHE_SHIFT)+offs,
157 _LLU(le64_to_cpu(p->inode_no))); 157 _LLU(le64_to_cpu(p->inode_no)));
158fail: 158fail:
@@ -261,9 +261,8 @@ exofs_readdir(struct file *filp, void *dirent, filldir_t filldir)
261 struct page *page = exofs_get_page(inode, n); 261 struct page *page = exofs_get_page(inode, n);
262 262
263 if (IS_ERR(page)) { 263 if (IS_ERR(page)) {
264 EXOFS_ERR("ERROR: " 264 EXOFS_ERR("ERROR: bad page in directory(0x%lx)\n",
265 "bad page in #%lu", 265 inode->i_ino);
266 inode->i_ino);
267 filp->f_pos += PAGE_CACHE_SIZE - offset; 266 filp->f_pos += PAGE_CACHE_SIZE - offset;
268 return PTR_ERR(page); 267 return PTR_ERR(page);
269 } 268 }
@@ -283,7 +282,8 @@ exofs_readdir(struct file *filp, void *dirent, filldir_t filldir)
283 for (; (char *)de <= limit; de = exofs_next_entry(de)) { 282 for (; (char *)de <= limit; de = exofs_next_entry(de)) {
284 if (de->rec_len == 0) { 283 if (de->rec_len == 0) {
285 EXOFS_ERR("ERROR: " 284 EXOFS_ERR("ERROR: "
286 "zero-length directory entry"); 285 "zero-length entry in directory(0x%lx)\n",
286 inode->i_ino);
287 exofs_put_page(page); 287 exofs_put_page(page);
288 return -EIO; 288 return -EIO;
289 } 289 }
@@ -342,9 +342,9 @@ struct exofs_dir_entry *exofs_find_entry(struct inode *dir,
342 kaddr += exofs_last_byte(dir, n) - reclen; 342 kaddr += exofs_last_byte(dir, n) - reclen;
343 while ((char *) de <= kaddr) { 343 while ((char *) de <= kaddr) {
344 if (de->rec_len == 0) { 344 if (de->rec_len == 0) {
345 EXOFS_ERR( 345 EXOFS_ERR("ERROR: zero-length entry in "
346 "ERROR: exofs_find_entry: " 346 "directory(0x%lx)\n",
347 "zero-length directory entry"); 347 dir->i_ino);
348 exofs_put_page(page); 348 exofs_put_page(page);
349 goto out; 349 goto out;
350 } 350 }
@@ -472,7 +472,8 @@ int exofs_add_link(struct dentry *dentry, struct inode *inode)
472 } 472 }
473 if (de->rec_len == 0) { 473 if (de->rec_len == 0) {
474 EXOFS_ERR("ERROR: exofs_add_link: " 474 EXOFS_ERR("ERROR: exofs_add_link: "
475 "zero-length directory entry"); 475 "zero-length entry in directory(0x%lx)\n",
476 inode->i_ino);
476 err = -EIO; 477 err = -EIO;
477 goto out_unlock; 478 goto out_unlock;
478 } 479 }
@@ -491,7 +492,8 @@ int exofs_add_link(struct dentry *dentry, struct inode *inode)
491 exofs_put_page(page); 492 exofs_put_page(page);
492 } 493 }
493 494
494 EXOFS_ERR("exofs_add_link: BAD dentry=%p or inode=%p", dentry, inode); 495 EXOFS_ERR("exofs_add_link: BAD dentry=%p or inode=0x%lx\n",
496 dentry, inode->i_ino);
495 return -EINVAL; 497 return -EINVAL;
496 498
497got_it: 499got_it:
@@ -542,7 +544,8 @@ int exofs_delete_entry(struct exofs_dir_entry *dir, struct page *page)
542 while (de < dir) { 544 while (de < dir) {
543 if (de->rec_len == 0) { 545 if (de->rec_len == 0) {
544 EXOFS_ERR("ERROR: exofs_delete_entry:" 546 EXOFS_ERR("ERROR: exofs_delete_entry:"
545 "zero-length directory entry"); 547 "zero-length entry in directory(0x%lx)\n",
548 inode->i_ino);
546 err = -EIO; 549 err = -EIO;
547 goto out; 550 goto out;
548 } 551 }
diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h
index 2dc925fa1010..c965806c2821 100644
--- a/fs/exofs/exofs.h
+++ b/fs/exofs/exofs.h
@@ -77,7 +77,7 @@ struct exofs_layout {
77 * our extension to the in-memory superblock 77 * our extension to the in-memory superblock
78 */ 78 */
79struct exofs_sb_info { 79struct exofs_sb_info {
80 struct exofs_fscb s_fscb; /* Written often, pre-allocate*/ 80 struct exofs_sb_stats s_ess; /* Written often, pre-allocate*/
81 int s_timeout; /* timeout for OSD operations */ 81 int s_timeout; /* timeout for OSD operations */
82 uint64_t s_nextid; /* highest object ID used */ 82 uint64_t s_nextid; /* highest object ID used */
83 uint32_t s_numfiles; /* number of files on fs */ 83 uint32_t s_numfiles; /* number of files on fs */
@@ -256,6 +256,8 @@ static inline int exofs_oi_read(struct exofs_i_info *oi,
256} 256}
257 257
258/* inode.c */ 258/* inode.c */
259unsigned exofs_max_io_pages(struct exofs_layout *layout,
260 unsigned expected_pages);
259int exofs_setattr(struct dentry *, struct iattr *); 261int exofs_setattr(struct dentry *, struct iattr *);
260int exofs_write_begin(struct file *file, struct address_space *mapping, 262int exofs_write_begin(struct file *file, struct address_space *mapping,
261 loff_t pos, unsigned len, unsigned flags, 263 loff_t pos, unsigned len, unsigned flags,
@@ -279,7 +281,7 @@ int exofs_set_link(struct inode *, struct exofs_dir_entry *, struct page *,
279 struct inode *); 281 struct inode *);
280 282
281/* super.c */ 283/* super.c */
282int exofs_sync_fs(struct super_block *sb, int wait); 284int exofs_sbi_write_stats(struct exofs_sb_info *sbi);
283 285
284/********************* 286/*********************
285 * operation vectors * 287 * operation vectors *
diff --git a/fs/exofs/file.c b/fs/exofs/file.c
index b905c79b4f0a..45ca323d8363 100644
--- a/fs/exofs/file.c
+++ b/fs/exofs/file.c
@@ -45,22 +45,8 @@ static int exofs_release_file(struct inode *inode, struct file *filp)
45static int exofs_file_fsync(struct file *filp, int datasync) 45static int exofs_file_fsync(struct file *filp, int datasync)
46{ 46{
47 int ret; 47 int ret;
48 struct inode *inode = filp->f_mapping->host;
49 struct super_block *sb;
50
51 if (!(inode->i_state & I_DIRTY))
52 return 0;
53 if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
54 return 0;
55
56 ret = sync_inode_metadata(inode, 1);
57
58 /* This is a good place to write the sb */
59 /* TODO: Sechedule an sb-sync on create */
60 sb = inode->i_sb;
61 if (sb->s_dirt)
62 exofs_sync_fs(sb, 1);
63 48
49 ret = sync_inode_metadata(filp->f_mapping->host, 1);
64 return ret; 50 return ret;
65} 51}
66 52
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index 42685424817b..8472c098445d 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -43,6 +43,17 @@ enum { BIO_MAX_PAGES_KMALLOC =
43 PAGE_SIZE / sizeof(struct page *), 43 PAGE_SIZE / sizeof(struct page *),
44}; 44};
45 45
46unsigned exofs_max_io_pages(struct exofs_layout *layout,
47 unsigned expected_pages)
48{
49 unsigned pages = min_t(unsigned, expected_pages, MAX_PAGES_KMALLOC);
50
51 /* TODO: easily support bio chaining */
52 pages = min_t(unsigned, pages,
53 layout->group_width * BIO_MAX_PAGES_KMALLOC);
54 return pages;
55}
56
46struct page_collect { 57struct page_collect {
47 struct exofs_sb_info *sbi; 58 struct exofs_sb_info *sbi;
48 struct inode *inode; 59 struct inode *inode;
@@ -97,8 +108,7 @@ static void _pcol_reset(struct page_collect *pcol)
97 108
98static int pcol_try_alloc(struct page_collect *pcol) 109static int pcol_try_alloc(struct page_collect *pcol)
99{ 110{
100 unsigned pages = min_t(unsigned, pcol->expected_pages, 111 unsigned pages;
101 MAX_PAGES_KMALLOC);
102 112
103 if (!pcol->ios) { /* First time allocate io_state */ 113 if (!pcol->ios) { /* First time allocate io_state */
104 int ret = exofs_get_io_state(&pcol->sbi->layout, &pcol->ios); 114 int ret = exofs_get_io_state(&pcol->sbi->layout, &pcol->ios);
@@ -108,8 +118,7 @@ static int pcol_try_alloc(struct page_collect *pcol)
108 } 118 }
109 119
110 /* TODO: easily support bio chaining */ 120 /* TODO: easily support bio chaining */
111 pages = min_t(unsigned, pages, 121 pages = exofs_max_io_pages(&pcol->sbi->layout, pcol->expected_pages);
112 pcol->sbi->layout.group_width * BIO_MAX_PAGES_KMALLOC);
113 122
114 for (; pages; pages >>= 1) { 123 for (; pages; pages >>= 1) {
115 pcol->pages = kmalloc(pages * sizeof(struct page *), 124 pcol->pages = kmalloc(pages * sizeof(struct page *),
@@ -350,8 +359,10 @@ static int readpage_strip(void *data, struct page *page)
350 359
351 if (!pcol->read_4_write) 360 if (!pcol->read_4_write)
352 unlock_page(page); 361 unlock_page(page);
353 EXOFS_DBGMSG("readpage_strip(0x%lx, 0x%lx) empty page," 362 EXOFS_DBGMSG("readpage_strip(0x%lx) empty page len=%zx "
354 " splitting\n", inode->i_ino, page->index); 363 "read_4_write=%d index=0x%lx end_index=0x%lx "
364 "splitting\n", inode->i_ino, len,
365 pcol->read_4_write, page->index, end_index);
355 366
356 return read_exec(pcol); 367 return read_exec(pcol);
357 } 368 }
@@ -722,11 +733,28 @@ int exofs_write_begin(struct file *file, struct address_space *mapping,
722 733
723 /* read modify write */ 734 /* read modify write */
724 if (!PageUptodate(page) && (len != PAGE_CACHE_SIZE)) { 735 if (!PageUptodate(page) && (len != PAGE_CACHE_SIZE)) {
736 loff_t i_size = i_size_read(mapping->host);
737 pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
738 size_t rlen;
739
740 if (page->index < end_index)
741 rlen = PAGE_CACHE_SIZE;
742 else if (page->index == end_index)
743 rlen = i_size & ~PAGE_CACHE_MASK;
744 else
745 rlen = 0;
746
747 if (!rlen) {
748 clear_highpage(page);
749 SetPageUptodate(page);
750 goto out;
751 }
752
725 ret = _readpage(page, true); 753 ret = _readpage(page, true);
726 if (ret) { 754 if (ret) {
727 /*SetPageError was done by _readpage. Is it ok?*/ 755 /*SetPageError was done by _readpage. Is it ok?*/
728 unlock_page(page); 756 unlock_page(page);
729 EXOFS_DBGMSG("__readpage_filler failed\n"); 757 EXOFS_DBGMSG("__readpage failed\n");
730 } 758 }
731 } 759 }
732out: 760out:
@@ -795,7 +823,6 @@ const struct address_space_operations exofs_aops = {
795 .direct_IO = NULL, /* TODO: Should be trivial to do */ 823 .direct_IO = NULL, /* TODO: Should be trivial to do */
796 824
797 /* With these NULL has special meaning or default is not exported */ 825 /* With these NULL has special meaning or default is not exported */
798 .sync_page = NULL,
799 .get_xip_mem = NULL, 826 .get_xip_mem = NULL,
800 .migratepage = NULL, 827 .migratepage = NULL,
801 .launder_page = NULL, 828 .launder_page = NULL,
@@ -1074,6 +1101,7 @@ int __exofs_wait_obj_created(struct exofs_i_info *oi)
1074 } 1101 }
1075 return unlikely(is_bad_inode(&oi->vfs_inode)) ? -EIO : 0; 1102 return unlikely(is_bad_inode(&oi->vfs_inode)) ? -EIO : 0;
1076} 1103}
1104
1077/* 1105/*
1078 * Callback function from exofs_new_inode(). The important thing is that we 1106 * Callback function from exofs_new_inode(). The important thing is that we
1079 * set the obj_created flag so that other methods know that the object exists on 1107 * set the obj_created flag so that other methods know that the object exists on
@@ -1132,7 +1160,6 @@ struct inode *exofs_new_inode(struct inode *dir, int mode)
1132 sbi = sb->s_fs_info; 1160 sbi = sb->s_fs_info;
1133 1161
1134 inode->i_mapping->backing_dev_info = sb->s_bdi; 1162 inode->i_mapping->backing_dev_info = sb->s_bdi;
1135 sb->s_dirt = 1;
1136 inode_init_owner(inode, dir, mode); 1163 inode_init_owner(inode, dir, mode);
1137 inode->i_ino = sbi->s_nextid++; 1164 inode->i_ino = sbi->s_nextid++;
1138 inode->i_blkbits = EXOFS_BLKSHIFT; 1165 inode->i_blkbits = EXOFS_BLKSHIFT;
@@ -1143,6 +1170,8 @@ struct inode *exofs_new_inode(struct inode *dir, int mode)
1143 spin_unlock(&sbi->s_next_gen_lock); 1170 spin_unlock(&sbi->s_next_gen_lock);
1144 insert_inode_hash(inode); 1171 insert_inode_hash(inode);
1145 1172
1173 exofs_sbi_write_stats(sbi); /* Make sure new sbi->s_nextid is on disk */
1174
1146 mark_inode_dirty(inode); 1175 mark_inode_dirty(inode);
1147 1176
1148 ret = exofs_get_io_state(&sbi->layout, &ios); 1177 ret = exofs_get_io_state(&sbi->layout, &ios);
@@ -1273,7 +1302,8 @@ out:
1273 1302
1274int exofs_write_inode(struct inode *inode, struct writeback_control *wbc) 1303int exofs_write_inode(struct inode *inode, struct writeback_control *wbc)
1275{ 1304{
1276 return exofs_update_inode(inode, wbc->sync_mode == WB_SYNC_ALL); 1305 /* FIXME: fix fsync and use wbc->sync_mode == WB_SYNC_ALL */
1306 return exofs_update_inode(inode, 1);
1277} 1307}
1278 1308
1279/* 1309/*
diff --git a/fs/exofs/namei.c b/fs/exofs/namei.c
index 264e95d02830..4d70db110cfc 100644
--- a/fs/exofs/namei.c
+++ b/fs/exofs/namei.c
@@ -272,7 +272,6 @@ static int exofs_rename(struct inode *old_dir, struct dentry *old_dentry,
272 new_de = exofs_find_entry(new_dir, new_dentry, &new_page); 272 new_de = exofs_find_entry(new_dir, new_dentry, &new_page);
273 if (!new_de) 273 if (!new_de)
274 goto out_dir; 274 goto out_dir;
275 inode_inc_link_count(old_inode);
276 err = exofs_set_link(new_dir, new_de, new_page, old_inode); 275 err = exofs_set_link(new_dir, new_de, new_page, old_inode);
277 new_inode->i_ctime = CURRENT_TIME; 276 new_inode->i_ctime = CURRENT_TIME;
278 if (dir_de) 277 if (dir_de)
@@ -286,12 +285,9 @@ static int exofs_rename(struct inode *old_dir, struct dentry *old_dentry,
286 if (new_dir->i_nlink >= EXOFS_LINK_MAX) 285 if (new_dir->i_nlink >= EXOFS_LINK_MAX)
287 goto out_dir; 286 goto out_dir;
288 } 287 }
289 inode_inc_link_count(old_inode);
290 err = exofs_add_link(new_dentry, old_inode); 288 err = exofs_add_link(new_dentry, old_inode);
291 if (err) { 289 if (err)
292 inode_dec_link_count(old_inode);
293 goto out_dir; 290 goto out_dir;
294 }
295 if (dir_de) 291 if (dir_de)
296 inode_inc_link_count(new_dir); 292 inode_inc_link_count(new_dir);
297 } 293 }
@@ -299,7 +295,7 @@ static int exofs_rename(struct inode *old_dir, struct dentry *old_dentry,
299 old_inode->i_ctime = CURRENT_TIME; 295 old_inode->i_ctime = CURRENT_TIME;
300 296
301 exofs_delete_entry(old_de, old_page); 297 exofs_delete_entry(old_de, old_page);
302 inode_dec_link_count(old_inode); 298 mark_inode_dirty(old_inode);
303 299
304 if (dir_de) { 300 if (dir_de) {
305 err = exofs_set_link(old_inode, dir_de, dir_page, new_dir); 301 err = exofs_set_link(old_inode, dir_de, dir_page, new_dir);
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index 8c6c4669b381..06065bd37fc3 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -48,6 +48,7 @@
48 * struct to hold what we get from mount options 48 * struct to hold what we get from mount options
49 */ 49 */
50struct exofs_mountopt { 50struct exofs_mountopt {
51 bool is_osdname;
51 const char *dev_name; 52 const char *dev_name;
52 uint64_t pid; 53 uint64_t pid;
53 int timeout; 54 int timeout;
@@ -56,7 +57,7 @@ struct exofs_mountopt {
56/* 57/*
57 * exofs-specific mount-time options. 58 * exofs-specific mount-time options.
58 */ 59 */
59enum { Opt_pid, Opt_to, Opt_mkfs, Opt_format, Opt_err }; 60enum { Opt_name, Opt_pid, Opt_to, Opt_err };
60 61
61/* 62/*
62 * Our mount-time options. These should ideally be 64-bit unsigned, but the 63 * Our mount-time options. These should ideally be 64-bit unsigned, but the
@@ -64,6 +65,7 @@ enum { Opt_pid, Opt_to, Opt_mkfs, Opt_format, Opt_err };
64 * sufficient for most applications now. 65 * sufficient for most applications now.
65 */ 66 */
66static match_table_t tokens = { 67static match_table_t tokens = {
68 {Opt_name, "osdname=%s"},
67 {Opt_pid, "pid=%u"}, 69 {Opt_pid, "pid=%u"},
68 {Opt_to, "to=%u"}, 70 {Opt_to, "to=%u"},
69 {Opt_err, NULL} 71 {Opt_err, NULL}
@@ -94,6 +96,14 @@ static int parse_options(char *options, struct exofs_mountopt *opts)
94 96
95 token = match_token(p, tokens, args); 97 token = match_token(p, tokens, args);
96 switch (token) { 98 switch (token) {
99 case Opt_name:
100 opts->dev_name = match_strdup(&args[0]);
101 if (unlikely(!opts->dev_name)) {
102 EXOFS_ERR("Error allocating dev_name");
103 return -ENOMEM;
104 }
105 opts->is_osdname = true;
106 break;
97 case Opt_pid: 107 case Opt_pid:
98 if (0 == match_strlcpy(str, &args[0], sizeof(str))) 108 if (0 == match_strlcpy(str, &args[0], sizeof(str)))
99 return -EINVAL; 109 return -EINVAL;
@@ -203,6 +213,101 @@ static void destroy_inodecache(void)
203static const struct super_operations exofs_sops; 213static const struct super_operations exofs_sops;
204static const struct export_operations exofs_export_ops; 214static const struct export_operations exofs_export_ops;
205 215
216static const struct osd_attr g_attr_sb_stats = ATTR_DEF(
217 EXOFS_APAGE_SB_DATA,
218 EXOFS_ATTR_SB_STATS,
219 sizeof(struct exofs_sb_stats));
220
221static int __sbi_read_stats(struct exofs_sb_info *sbi)
222{
223 struct osd_attr attrs[] = {
224 [0] = g_attr_sb_stats,
225 };
226 struct exofs_io_state *ios;
227 int ret;
228
229 ret = exofs_get_io_state(&sbi->layout, &ios);
230 if (unlikely(ret)) {
231 EXOFS_ERR("%s: exofs_get_io_state failed.\n", __func__);
232 return ret;
233 }
234
235 ios->cred = sbi->s_cred;
236
237 ios->in_attr = attrs;
238 ios->in_attr_len = ARRAY_SIZE(attrs);
239
240 ret = exofs_sbi_read(ios);
241 if (unlikely(ret)) {
242 EXOFS_ERR("Error reading super_block stats => %d\n", ret);
243 goto out;
244 }
245
246 ret = extract_attr_from_ios(ios, &attrs[0]);
247 if (ret) {
248 EXOFS_ERR("%s: extract_attr of sb_stats failed\n", __func__);
249 goto out;
250 }
251 if (attrs[0].len) {
252 struct exofs_sb_stats *ess;
253
254 if (unlikely(attrs[0].len != sizeof(*ess))) {
255 EXOFS_ERR("%s: Wrong version of exofs_sb_stats "
256 "size(%d) != expected(%zd)\n",
257 __func__, attrs[0].len, sizeof(*ess));
258 goto out;
259 }
260
261 ess = attrs[0].val_ptr;
262 sbi->s_nextid = le64_to_cpu(ess->s_nextid);
263 sbi->s_numfiles = le32_to_cpu(ess->s_numfiles);
264 }
265
266out:
267 exofs_put_io_state(ios);
268 return ret;
269}
270
271static void stats_done(struct exofs_io_state *ios, void *p)
272{
273 exofs_put_io_state(ios);
274 /* Good thanks nothing to do anymore */
275}
276
277/* Asynchronously write the stats attribute */
278int exofs_sbi_write_stats(struct exofs_sb_info *sbi)
279{
280 struct osd_attr attrs[] = {
281 [0] = g_attr_sb_stats,
282 };
283 struct exofs_io_state *ios;
284 int ret;
285
286 ret = exofs_get_io_state(&sbi->layout, &ios);
287 if (unlikely(ret)) {
288 EXOFS_ERR("%s: exofs_get_io_state failed.\n", __func__);
289 return ret;
290 }
291
292 sbi->s_ess.s_nextid = cpu_to_le64(sbi->s_nextid);
293 sbi->s_ess.s_numfiles = cpu_to_le64(sbi->s_numfiles);
294 attrs[0].val_ptr = &sbi->s_ess;
295
296 ios->cred = sbi->s_cred;
297 ios->done = stats_done;
298 ios->private = sbi;
299 ios->out_attr = attrs;
300 ios->out_attr_len = ARRAY_SIZE(attrs);
301
302 ret = exofs_sbi_write(ios);
303 if (unlikely(ret)) {
304 EXOFS_ERR("%s: exofs_sbi_write failed.\n", __func__);
305 exofs_put_io_state(ios);
306 }
307
308 return ret;
309}
310
206/* 311/*
207 * Write the superblock to the OSD 312 * Write the superblock to the OSD
208 */ 313 */
@@ -213,18 +318,25 @@ int exofs_sync_fs(struct super_block *sb, int wait)
213 struct exofs_io_state *ios; 318 struct exofs_io_state *ios;
214 int ret = -ENOMEM; 319 int ret = -ENOMEM;
215 320
216 lock_super(sb); 321 fscb = kmalloc(sizeof(*fscb), GFP_KERNEL);
322 if (unlikely(!fscb))
323 return -ENOMEM;
324
217 sbi = sb->s_fs_info; 325 sbi = sb->s_fs_info;
218 fscb = &sbi->s_fscb;
219 326
327 /* NOTE: We no longer dirty the super_block anywhere in exofs. The
328 * reason we write the fscb here on unmount is so we can stay backwards
329 * compatible with fscb->s_version == 1. (What we are not compatible
330 * with is if a new version FS crashed and then we try to mount an old
331 * version). Otherwise the exofs_fscb is read-only from mkfs time. All
332 * the writeable info is set in exofs_sbi_write_stats() above.
333 */
220 ret = exofs_get_io_state(&sbi->layout, &ios); 334 ret = exofs_get_io_state(&sbi->layout, &ios);
221 if (ret) 335 if (unlikely(ret))
222 goto out; 336 goto out;
223 337
224 /* Note: We only write the changing part of the fscb. .i.e upto the 338 lock_super(sb);
225 * the fscb->s_dev_table_oid member. There is no read-modify-write 339
226 * here.
227 */
228 ios->length = offsetof(struct exofs_fscb, s_dev_table_oid); 340 ios->length = offsetof(struct exofs_fscb, s_dev_table_oid);
229 memset(fscb, 0, ios->length); 341 memset(fscb, 0, ios->length);
230 fscb->s_nextid = cpu_to_le64(sbi->s_nextid); 342 fscb->s_nextid = cpu_to_le64(sbi->s_nextid);
@@ -239,16 +351,17 @@ int exofs_sync_fs(struct super_block *sb, int wait)
239 ios->cred = sbi->s_cred; 351 ios->cred = sbi->s_cred;
240 352
241 ret = exofs_sbi_write(ios); 353 ret = exofs_sbi_write(ios);
242 if (unlikely(ret)) { 354 if (unlikely(ret))
243 EXOFS_ERR("%s: exofs_sbi_write failed.\n", __func__); 355 EXOFS_ERR("%s: exofs_sbi_write failed.\n", __func__);
244 goto out; 356 else
245 } 357 sb->s_dirt = 0;
246 sb->s_dirt = 0; 358
247 359
360 unlock_super(sb);
248out: 361out:
249 EXOFS_DBGMSG("s_nextid=0x%llx ret=%d\n", _LLU(sbi->s_nextid), ret); 362 EXOFS_DBGMSG("s_nextid=0x%llx ret=%d\n", _LLU(sbi->s_nextid), ret);
250 exofs_put_io_state(ios); 363 exofs_put_io_state(ios);
251 unlock_super(sb); 364 kfree(fscb);
252 return ret; 365 return ret;
253} 366}
254 367
@@ -292,13 +405,14 @@ static void exofs_put_super(struct super_block *sb)
292 int num_pend; 405 int num_pend;
293 struct exofs_sb_info *sbi = sb->s_fs_info; 406 struct exofs_sb_info *sbi = sb->s_fs_info;
294 407
295 if (sb->s_dirt)
296 exofs_write_super(sb);
297
298 /* make sure there are no pending commands */ 408 /* make sure there are no pending commands */
299 for (num_pend = atomic_read(&sbi->s_curr_pending); num_pend > 0; 409 for (num_pend = atomic_read(&sbi->s_curr_pending); num_pend > 0;
300 num_pend = atomic_read(&sbi->s_curr_pending)) { 410 num_pend = atomic_read(&sbi->s_curr_pending)) {
301 wait_queue_head_t wq; 411 wait_queue_head_t wq;
412
413 printk(KERN_NOTICE "%s: !!Pending operations in flight. "
414 "This is a BUG. please report to osd-dev@open-osd.org\n",
415 __func__);
302 init_waitqueue_head(&wq); 416 init_waitqueue_head(&wq);
303 wait_event_timeout(wq, 417 wait_event_timeout(wq,
304 (atomic_read(&sbi->s_curr_pending) == 0), 418 (atomic_read(&sbi->s_curr_pending) == 0),
@@ -390,6 +504,23 @@ static int _read_and_match_data_map(struct exofs_sb_info *sbi, unsigned numdevs,
390 return 0; 504 return 0;
391} 505}
392 506
507static unsigned __ra_pages(struct exofs_layout *layout)
508{
509 const unsigned _MIN_RA = 32; /* min 128K read-ahead */
510 unsigned ra_pages = layout->group_width * layout->stripe_unit /
511 PAGE_SIZE;
512 unsigned max_io_pages = exofs_max_io_pages(layout, ~0);
513
514 ra_pages *= 2; /* two stripes */
515 if (ra_pages < _MIN_RA)
516 ra_pages = roundup(_MIN_RA, ra_pages / 2);
517
518 if (ra_pages > max_io_pages)
519 ra_pages = max_io_pages;
520
521 return ra_pages;
522}
523
393/* @odi is valid only as long as @fscb_dev is valid */ 524/* @odi is valid only as long as @fscb_dev is valid */
394static int exofs_devs_2_odi(struct exofs_dt_device_info *dt_dev, 525static int exofs_devs_2_odi(struct exofs_dt_device_info *dt_dev,
395 struct osd_dev_info *odi) 526 struct osd_dev_info *odi)
@@ -495,7 +626,7 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info **psbi,
495 } 626 }
496 627
497 od = osduld_info_lookup(&odi); 628 od = osduld_info_lookup(&odi);
498 if (unlikely(IS_ERR(od))) { 629 if (IS_ERR(od)) {
499 ret = PTR_ERR(od); 630 ret = PTR_ERR(od);
500 EXOFS_ERR("ERROR: device requested is not found " 631 EXOFS_ERR("ERROR: device requested is not found "
501 "osd_name-%s =>%d\n", odi.osdname, ret); 632 "osd_name-%s =>%d\n", odi.osdname, ret);
@@ -558,9 +689,17 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
558 goto free_bdi; 689 goto free_bdi;
559 690
560 /* use mount options to fill superblock */ 691 /* use mount options to fill superblock */
561 od = osduld_path_lookup(opts->dev_name); 692 if (opts->is_osdname) {
693 struct osd_dev_info odi = {.systemid_len = 0};
694
695 odi.osdname_len = strlen(opts->dev_name);
696 odi.osdname = (u8 *)opts->dev_name;
697 od = osduld_info_lookup(&odi);
698 } else {
699 od = osduld_path_lookup(opts->dev_name);
700 }
562 if (IS_ERR(od)) { 701 if (IS_ERR(od)) {
563 ret = PTR_ERR(od); 702 ret = -EINVAL;
564 goto free_sbi; 703 goto free_sbi;
565 } 704 }
566 705
@@ -594,6 +733,7 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
594 goto free_sbi; 733 goto free_sbi;
595 734
596 sb->s_magic = le16_to_cpu(fscb.s_magic); 735 sb->s_magic = le16_to_cpu(fscb.s_magic);
736 /* NOTE: we read below to be backward compatible with old versions */
597 sbi->s_nextid = le64_to_cpu(fscb.s_nextid); 737 sbi->s_nextid = le64_to_cpu(fscb.s_nextid);
598 sbi->s_numfiles = le32_to_cpu(fscb.s_numfiles); 738 sbi->s_numfiles = le32_to_cpu(fscb.s_numfiles);
599 739
@@ -604,7 +744,7 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
604 ret = -EINVAL; 744 ret = -EINVAL;
605 goto free_sbi; 745 goto free_sbi;
606 } 746 }
607 if (le32_to_cpu(fscb.s_version) != EXOFS_FSCB_VER) { 747 if (le32_to_cpu(fscb.s_version) > EXOFS_FSCB_VER) {
608 EXOFS_ERR("ERROR: Bad FSCB version expected-%d got-%d\n", 748 EXOFS_ERR("ERROR: Bad FSCB version expected-%d got-%d\n",
609 EXOFS_FSCB_VER, le32_to_cpu(fscb.s_version)); 749 EXOFS_FSCB_VER, le32_to_cpu(fscb.s_version));
610 ret = -EINVAL; 750 ret = -EINVAL;
@@ -622,7 +762,10 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
622 goto free_sbi; 762 goto free_sbi;
623 } 763 }
624 764
765 __sbi_read_stats(sbi);
766
625 /* set up operation vectors */ 767 /* set up operation vectors */
768 sbi->bdi.ra_pages = __ra_pages(&sbi->layout);
626 sb->s_bdi = &sbi->bdi; 769 sb->s_bdi = &sbi->bdi;
627 sb->s_fs_info = sbi; 770 sb->s_fs_info = sbi;
628 sb->s_op = &exofs_sops; 771 sb->s_op = &exofs_sops;
@@ -652,6 +795,8 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
652 795
653 _exofs_print_device("Mounting", opts->dev_name, sbi->layout.s_ods[0], 796 _exofs_print_device("Mounting", opts->dev_name, sbi->layout.s_ods[0],
654 sbi->layout.s_pid); 797 sbi->layout.s_pid);
798 if (opts->is_osdname)
799 kfree(opts->dev_name);
655 return 0; 800 return 0;
656 801
657free_sbi: 802free_sbi:
@@ -660,6 +805,8 @@ free_bdi:
660 EXOFS_ERR("Unable to mount exofs on %s pid=0x%llx err=%d\n", 805 EXOFS_ERR("Unable to mount exofs on %s pid=0x%llx err=%d\n",
661 opts->dev_name, sbi->layout.s_pid, ret); 806 opts->dev_name, sbi->layout.s_pid, ret);
662 exofs_free_sbi(sbi); 807 exofs_free_sbi(sbi);
808 if (opts->is_osdname)
809 kfree(opts->dev_name);
663 return ret; 810 return ret;
664} 811}
665 812
@@ -677,7 +824,8 @@ static struct dentry *exofs_mount(struct file_system_type *type,
677 if (ret) 824 if (ret)
678 return ERR_PTR(ret); 825 return ERR_PTR(ret);
679 826
680 opts.dev_name = dev_name; 827 if (!opts.dev_name)
828 opts.dev_name = dev_name;
681 return mount_nodev(type, flags, &opts, exofs_fill_super); 829 return mount_nodev(type, flags, &opts, exofs_fill_super);
682} 830}
683 831
diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index 4b6825740dd5..b05acb796135 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -320,9 +320,14 @@ static int export_encode_fh(struct dentry *dentry, struct fid *fid,
320 struct inode * inode = dentry->d_inode; 320 struct inode * inode = dentry->d_inode;
321 int len = *max_len; 321 int len = *max_len;
322 int type = FILEID_INO32_GEN; 322 int type = FILEID_INO32_GEN;
323 323
324 if (len < 2 || (connectable && len < 4)) 324 if (connectable && (len < 4)) {
325 *max_len = 4;
326 return 255;
327 } else if (len < 2) {
328 *max_len = 2;
325 return 255; 329 return 255;
330 }
326 331
327 len = 2; 332 len = 2;
328 fid->i32.ino = inode->i_ino; 333 fid->i32.ino = inode->i_ino;
@@ -369,6 +374,8 @@ struct dentry *exportfs_decode_fh(struct vfsmount *mnt, struct fid *fid,
369 /* 374 /*
370 * Try to get any dentry for the given file handle from the filesystem. 375 * Try to get any dentry for the given file handle from the filesystem.
371 */ 376 */
377 if (!nop || !nop->fh_to_dentry)
378 return ERR_PTR(-ESTALE);
372 result = nop->fh_to_dentry(mnt->mnt_sb, fid, fh_len, fileid_type); 379 result = nop->fh_to_dentry(mnt->mnt_sb, fid, fh_len, fileid_type);
373 if (!result) 380 if (!result)
374 result = ERR_PTR(-ESTALE); 381 result = ERR_PTR(-ESTALE);
diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c
index 7b4180554a62..abea5a17c764 100644
--- a/fs/ext2/acl.c
+++ b/fs/ext2/acl.c
@@ -406,7 +406,7 @@ ext2_xattr_set_acl(struct dentry *dentry, const char *name, const void *value,
406 return -EINVAL; 406 return -EINVAL;
407 if (!test_opt(dentry->d_sb, POSIX_ACL)) 407 if (!test_opt(dentry->d_sb, POSIX_ACL))
408 return -EOPNOTSUPP; 408 return -EOPNOTSUPP;
409 if (!is_owner_or_cap(dentry->d_inode)) 409 if (!inode_owner_or_capable(dentry->d_inode))
410 return -EPERM; 410 return -EPERM;
411 411
412 if (value) { 412 if (value) {
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index 6346a2acf326..645be9e7ee47 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -110,7 +110,7 @@ extern struct ext2_dir_entry_2 * ext2_dotdot (struct inode *, struct page **);
110extern void ext2_set_link(struct inode *, struct ext2_dir_entry_2 *, struct page *, struct inode *, int); 110extern void ext2_set_link(struct inode *, struct ext2_dir_entry_2 *, struct page *, struct inode *, int);
111 111
112/* ialloc.c */ 112/* ialloc.c */
113extern struct inode * ext2_new_inode (struct inode *, int); 113extern struct inode * ext2_new_inode (struct inode *, int, const struct qstr *);
114extern void ext2_free_inode (struct inode *); 114extern void ext2_free_inode (struct inode *);
115extern unsigned long ext2_count_free_inodes (struct super_block *); 115extern unsigned long ext2_count_free_inodes (struct super_block *);
116extern void ext2_check_inodes_bitmap (struct super_block *); 116extern void ext2_check_inodes_bitmap (struct super_block *);
@@ -174,3 +174,9 @@ ext2_group_first_block_no(struct super_block *sb, unsigned long group_no)
174 return group_no * (ext2_fsblk_t)EXT2_BLOCKS_PER_GROUP(sb) + 174 return group_no * (ext2_fsblk_t)EXT2_BLOCKS_PER_GROUP(sb) +
175 le32_to_cpu(EXT2_SB(sb)->s_es->s_first_data_block); 175 le32_to_cpu(EXT2_SB(sb)->s_es->s_first_data_block);
176} 176}
177
178#define ext2_set_bit __test_and_set_bit_le
179#define ext2_clear_bit __test_and_clear_bit_le
180#define ext2_test_bit test_bit_le
181#define ext2_find_first_zero_bit find_first_zero_bit_le
182#define ext2_find_next_zero_bit find_next_zero_bit_le
diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c
index ad70479aabff..ee9ed31948e1 100644
--- a/fs/ext2/ialloc.c
+++ b/fs/ext2/ialloc.c
@@ -429,7 +429,8 @@ found:
429 return group; 429 return group;
430} 430}
431 431
432struct inode *ext2_new_inode(struct inode *dir, int mode) 432struct inode *ext2_new_inode(struct inode *dir, int mode,
433 const struct qstr *qstr)
433{ 434{
434 struct super_block *sb; 435 struct super_block *sb;
435 struct buffer_head *bitmap_bh = NULL; 436 struct buffer_head *bitmap_bh = NULL;
@@ -585,7 +586,7 @@ got:
585 if (err) 586 if (err)
586 goto fail_free_drop; 587 goto fail_free_drop;
587 588
588 err = ext2_init_security(inode,dir); 589 err = ext2_init_security(inode, dir, qstr);
589 if (err) 590 if (err)
590 goto fail_free_drop; 591 goto fail_free_drop;
591 592
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 40ad210a5049..c47f706878b5 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -860,7 +860,6 @@ const struct address_space_operations ext2_aops = {
860 .readpage = ext2_readpage, 860 .readpage = ext2_readpage,
861 .readpages = ext2_readpages, 861 .readpages = ext2_readpages,
862 .writepage = ext2_writepage, 862 .writepage = ext2_writepage,
863 .sync_page = block_sync_page,
864 .write_begin = ext2_write_begin, 863 .write_begin = ext2_write_begin,
865 .write_end = ext2_write_end, 864 .write_end = ext2_write_end,
866 .bmap = ext2_bmap, 865 .bmap = ext2_bmap,
@@ -880,7 +879,6 @@ const struct address_space_operations ext2_nobh_aops = {
880 .readpage = ext2_readpage, 879 .readpage = ext2_readpage,
881 .readpages = ext2_readpages, 880 .readpages = ext2_readpages,
882 .writepage = ext2_nobh_writepage, 881 .writepage = ext2_nobh_writepage,
883 .sync_page = block_sync_page,
884 .write_begin = ext2_nobh_write_begin, 882 .write_begin = ext2_nobh_write_begin,
885 .write_end = nobh_write_end, 883 .write_end = nobh_write_end,
886 .bmap = ext2_bmap, 884 .bmap = ext2_bmap,
diff --git a/fs/ext2/ioctl.c b/fs/ext2/ioctl.c
index e7431309bdca..f81e250ac5c4 100644
--- a/fs/ext2/ioctl.c
+++ b/fs/ext2/ioctl.c
@@ -39,7 +39,7 @@ long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
39 if (ret) 39 if (ret)
40 return ret; 40 return ret;
41 41
42 if (!is_owner_or_cap(inode)) { 42 if (!inode_owner_or_capable(inode)) {
43 ret = -EACCES; 43 ret = -EACCES;
44 goto setflags_out; 44 goto setflags_out;
45 } 45 }
@@ -89,7 +89,7 @@ setflags_out:
89 case EXT2_IOC_GETVERSION: 89 case EXT2_IOC_GETVERSION:
90 return put_user(inode->i_generation, (int __user *) arg); 90 return put_user(inode->i_generation, (int __user *) arg);
91 case EXT2_IOC_SETVERSION: 91 case EXT2_IOC_SETVERSION:
92 if (!is_owner_or_cap(inode)) 92 if (!inode_owner_or_capable(inode))
93 return -EPERM; 93 return -EPERM;
94 ret = mnt_want_write(filp->f_path.mnt); 94 ret = mnt_want_write(filp->f_path.mnt);
95 if (ret) 95 if (ret)
@@ -115,7 +115,7 @@ setflags_out:
115 if (!test_opt(inode->i_sb, RESERVATION) ||!S_ISREG(inode->i_mode)) 115 if (!test_opt(inode->i_sb, RESERVATION) ||!S_ISREG(inode->i_mode))
116 return -ENOTTY; 116 return -ENOTTY;
117 117
118 if (!is_owner_or_cap(inode)) 118 if (!inode_owner_or_capable(inode))
119 return -EACCES; 119 return -EACCES;
120 120
121 if (get_user(rsv_window_size, (int __user *)arg)) 121 if (get_user(rsv_window_size, (int __user *)arg))
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index 2e1d8341d827..ed5c5d496ee9 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -104,7 +104,7 @@ static int ext2_create (struct inode * dir, struct dentry * dentry, int mode, st
104 104
105 dquot_initialize(dir); 105 dquot_initialize(dir);
106 106
107 inode = ext2_new_inode(dir, mode); 107 inode = ext2_new_inode(dir, mode, &dentry->d_name);
108 if (IS_ERR(inode)) 108 if (IS_ERR(inode))
109 return PTR_ERR(inode); 109 return PTR_ERR(inode);
110 110
@@ -133,7 +133,7 @@ static int ext2_mknod (struct inode * dir, struct dentry *dentry, int mode, dev_
133 133
134 dquot_initialize(dir); 134 dquot_initialize(dir);
135 135
136 inode = ext2_new_inode (dir, mode); 136 inode = ext2_new_inode (dir, mode, &dentry->d_name);
137 err = PTR_ERR(inode); 137 err = PTR_ERR(inode);
138 if (!IS_ERR(inode)) { 138 if (!IS_ERR(inode)) {
139 init_special_inode(inode, inode->i_mode, rdev); 139 init_special_inode(inode, inode->i_mode, rdev);
@@ -159,7 +159,7 @@ static int ext2_symlink (struct inode * dir, struct dentry * dentry,
159 159
160 dquot_initialize(dir); 160 dquot_initialize(dir);
161 161
162 inode = ext2_new_inode (dir, S_IFLNK | S_IRWXUGO); 162 inode = ext2_new_inode (dir, S_IFLNK | S_IRWXUGO, &dentry->d_name);
163 err = PTR_ERR(inode); 163 err = PTR_ERR(inode);
164 if (IS_ERR(inode)) 164 if (IS_ERR(inode))
165 goto out; 165 goto out;
@@ -230,7 +230,7 @@ static int ext2_mkdir(struct inode * dir, struct dentry * dentry, int mode)
230 230
231 inode_inc_link_count(dir); 231 inode_inc_link_count(dir);
232 232
233 inode = ext2_new_inode (dir, S_IFDIR | mode); 233 inode = ext2_new_inode(dir, S_IFDIR | mode, &dentry->d_name);
234 err = PTR_ERR(inode); 234 err = PTR_ERR(inode);
235 if (IS_ERR(inode)) 235 if (IS_ERR(inode))
236 goto out_dir; 236 goto out_dir;
@@ -344,7 +344,6 @@ static int ext2_rename (struct inode * old_dir, struct dentry * old_dentry,
344 new_de = ext2_find_entry (new_dir, &new_dentry->d_name, &new_page); 344 new_de = ext2_find_entry (new_dir, &new_dentry->d_name, &new_page);
345 if (!new_de) 345 if (!new_de)
346 goto out_dir; 346 goto out_dir;
347 inode_inc_link_count(old_inode);
348 ext2_set_link(new_dir, new_de, new_page, old_inode, 1); 347 ext2_set_link(new_dir, new_de, new_page, old_inode, 1);
349 new_inode->i_ctime = CURRENT_TIME_SEC; 348 new_inode->i_ctime = CURRENT_TIME_SEC;
350 if (dir_de) 349 if (dir_de)
@@ -356,12 +355,9 @@ static int ext2_rename (struct inode * old_dir, struct dentry * old_dentry,
356 if (new_dir->i_nlink >= EXT2_LINK_MAX) 355 if (new_dir->i_nlink >= EXT2_LINK_MAX)
357 goto out_dir; 356 goto out_dir;
358 } 357 }
359 inode_inc_link_count(old_inode);
360 err = ext2_add_link(new_dentry, old_inode); 358 err = ext2_add_link(new_dentry, old_inode);
361 if (err) { 359 if (err)
362 inode_dec_link_count(old_inode);
363 goto out_dir; 360 goto out_dir;
364 }
365 if (dir_de) 361 if (dir_de)
366 inode_inc_link_count(new_dir); 362 inode_inc_link_count(new_dir);
367 } 363 }
@@ -369,12 +365,11 @@ static int ext2_rename (struct inode * old_dir, struct dentry * old_dentry,
369 /* 365 /*
370 * Like most other Unix systems, set the ctime for inodes on a 366 * Like most other Unix systems, set the ctime for inodes on a
371 * rename. 367 * rename.
372 * inode_dec_link_count() will mark the inode dirty.
373 */ 368 */
374 old_inode->i_ctime = CURRENT_TIME_SEC; 369 old_inode->i_ctime = CURRENT_TIME_SEC;
370 mark_inode_dirty(old_inode);
375 371
376 ext2_delete_entry (old_de, old_page); 372 ext2_delete_entry (old_de, old_page);
377 inode_dec_link_count(old_inode);
378 373
379 if (dir_de) { 374 if (dir_de) {
380 if (old_dir != new_dir) 375 if (old_dir != new_dir)
diff --git a/fs/ext2/xattr.h b/fs/ext2/xattr.h
index a1a1c2184616..5e41cccff762 100644
--- a/fs/ext2/xattr.h
+++ b/fs/ext2/xattr.h
@@ -116,9 +116,11 @@ exit_ext2_xattr(void)
116# endif /* CONFIG_EXT2_FS_XATTR */ 116# endif /* CONFIG_EXT2_FS_XATTR */
117 117
118#ifdef CONFIG_EXT2_FS_SECURITY 118#ifdef CONFIG_EXT2_FS_SECURITY
119extern int ext2_init_security(struct inode *inode, struct inode *dir); 119extern int ext2_init_security(struct inode *inode, struct inode *dir,
120 const struct qstr *qstr);
120#else 121#else
121static inline int ext2_init_security(struct inode *inode, struct inode *dir) 122static inline int ext2_init_security(struct inode *inode, struct inode *dir,
123 const struct qstr *qstr)
122{ 124{
123 return 0; 125 return 0;
124} 126}
diff --git a/fs/ext2/xattr_security.c b/fs/ext2/xattr_security.c
index 3004e15d5da5..5d979b4347b0 100644
--- a/fs/ext2/xattr_security.c
+++ b/fs/ext2/xattr_security.c
@@ -47,14 +47,15 @@ ext2_xattr_security_set(struct dentry *dentry, const char *name,
47} 47}
48 48
49int 49int
50ext2_init_security(struct inode *inode, struct inode *dir) 50ext2_init_security(struct inode *inode, struct inode *dir,
51 const struct qstr *qstr)
51{ 52{
52 int err; 53 int err;
53 size_t len; 54 size_t len;
54 void *value; 55 void *value;
55 char *name; 56 char *name;
56 57
57 err = security_inode_init_security(inode, dir, &name, &value, &len); 58 err = security_inode_init_security(inode, dir, qstr, &name, &value, &len);
58 if (err) { 59 if (err) {
59 if (err == -EOPNOTSUPP) 60 if (err == -EOPNOTSUPP)
60 return 0; 61 return 0;
diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c
index e4fa49e6c539..9d021c0d472a 100644
--- a/fs/ext3/acl.c
+++ b/fs/ext3/acl.c
@@ -435,7 +435,7 @@ ext3_xattr_set_acl(struct dentry *dentry, const char *name, const void *value,
435 return -EINVAL; 435 return -EINVAL;
436 if (!test_opt(inode->i_sb, POSIX_ACL)) 436 if (!test_opt(inode->i_sb, POSIX_ACL))
437 return -EOPNOTSUPP; 437 return -EOPNOTSUPP;
438 if (!is_owner_or_cap(inode)) 438 if (!inode_owner_or_capable(inode))
439 return -EPERM; 439 return -EPERM;
440 440
441 if (value) { 441 if (value) {
diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c
index 045995c8ce5a..153242187fce 100644
--- a/fs/ext3/balloc.c
+++ b/fs/ext3/balloc.c
@@ -1991,6 +1991,7 @@ ext3_grpblk_t ext3_trim_all_free(struct super_block *sb, unsigned int group,
1991 spin_unlock(sb_bgl_lock(sbi, group)); 1991 spin_unlock(sb_bgl_lock(sbi, group));
1992 percpu_counter_sub(&sbi->s_freeblocks_counter, next - start); 1992 percpu_counter_sub(&sbi->s_freeblocks_counter, next - start);
1993 1993
1994 free_blocks -= next - start;
1994 /* Do not issue a TRIM on extents smaller than minblocks */ 1995 /* Do not issue a TRIM on extents smaller than minblocks */
1995 if ((next - start) < minblocks) 1996 if ((next - start) < minblocks)
1996 goto free_extent; 1997 goto free_extent;
@@ -2040,7 +2041,7 @@ free_extent:
2040 cond_resched(); 2041 cond_resched();
2041 2042
2042 /* No more suitable extents */ 2043 /* No more suitable extents */
2043 if ((free_blocks - count) < minblocks) 2044 if (free_blocks < minblocks)
2044 break; 2045 break;
2045 } 2046 }
2046 2047
@@ -2090,7 +2091,8 @@ int ext3_trim_fs(struct super_block *sb, struct fstrim_range *range)
2090 ext3_fsblk_t max_blks = le32_to_cpu(es->s_blocks_count); 2091 ext3_fsblk_t max_blks = le32_to_cpu(es->s_blocks_count);
2091 int ret = 0; 2092 int ret = 0;
2092 2093
2093 start = range->start >> sb->s_blocksize_bits; 2094 start = (range->start >> sb->s_blocksize_bits) +
2095 le32_to_cpu(es->s_first_data_block);
2094 len = range->len >> sb->s_blocksize_bits; 2096 len = range->len >> sb->s_blocksize_bits;
2095 minlen = range->minlen >> sb->s_blocksize_bits; 2097 minlen = range->minlen >> sb->s_blocksize_bits;
2096 trimmed = 0; 2098 trimmed = 0;
@@ -2099,10 +2101,6 @@ int ext3_trim_fs(struct super_block *sb, struct fstrim_range *range)
2099 return -EINVAL; 2101 return -EINVAL;
2100 if (start >= max_blks) 2102 if (start >= max_blks)
2101 goto out; 2103 goto out;
2102 if (start < le32_to_cpu(es->s_first_data_block)) {
2103 len -= le32_to_cpu(es->s_first_data_block) - start;
2104 start = le32_to_cpu(es->s_first_data_block);
2105 }
2106 if (start + len > max_blks) 2104 if (start + len > max_blks)
2107 len = max_blks - start; 2105 len = max_blks - start;
2108 2106
@@ -2129,10 +2127,15 @@ int ext3_trim_fs(struct super_block *sb, struct fstrim_range *range)
2129 if (free_blocks < minlen) 2127 if (free_blocks < minlen)
2130 continue; 2128 continue;
2131 2129
2132 if (len >= EXT3_BLOCKS_PER_GROUP(sb)) 2130 /*
2133 len -= (EXT3_BLOCKS_PER_GROUP(sb) - first_block); 2131 * For all the groups except the last one, last block will
2134 else 2132 * always be EXT3_BLOCKS_PER_GROUP(sb), so we only need to
2133 * change it for the last group in which case first_block +
2134 * len < EXT3_BLOCKS_PER_GROUP(sb).
2135 */
2136 if (first_block + len < EXT3_BLOCKS_PER_GROUP(sb))
2135 last_block = first_block + len; 2137 last_block = first_block + len;
2138 len -= last_block - first_block;
2136 2139
2137 ret = ext3_trim_all_free(sb, group, first_block, 2140 ret = ext3_trim_all_free(sb, group, first_block,
2138 last_block, minlen); 2141 last_block, minlen);
diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c
index 9724aef22460..bfc2dc43681d 100644
--- a/fs/ext3/ialloc.c
+++ b/fs/ext3/ialloc.c
@@ -404,7 +404,8 @@ static int find_group_other(struct super_block *sb, struct inode *parent)
404 * For other inodes, search forward from the parent directory's block 404 * For other inodes, search forward from the parent directory's block
405 * group to find a free inode. 405 * group to find a free inode.
406 */ 406 */
407struct inode *ext3_new_inode(handle_t *handle, struct inode * dir, int mode) 407struct inode *ext3_new_inode(handle_t *handle, struct inode * dir,
408 const struct qstr *qstr, int mode)
408{ 409{
409 struct super_block *sb; 410 struct super_block *sb;
410 struct buffer_head *bitmap_bh = NULL; 411 struct buffer_head *bitmap_bh = NULL;
@@ -589,7 +590,7 @@ got:
589 if (err) 590 if (err)
590 goto fail_free_drop; 591 goto fail_free_drop;
591 592
592 err = ext3_init_security(handle,inode, dir); 593 err = ext3_init_security(handle, inode, dir, qstr);
593 if (err) 594 if (err)
594 goto fail_free_drop; 595 goto fail_free_drop;
595 596
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index ae94f6d949f5..fe2541d250e4 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -1894,7 +1894,6 @@ static const struct address_space_operations ext3_ordered_aops = {
1894 .readpage = ext3_readpage, 1894 .readpage = ext3_readpage,
1895 .readpages = ext3_readpages, 1895 .readpages = ext3_readpages,
1896 .writepage = ext3_ordered_writepage, 1896 .writepage = ext3_ordered_writepage,
1897 .sync_page = block_sync_page,
1898 .write_begin = ext3_write_begin, 1897 .write_begin = ext3_write_begin,
1899 .write_end = ext3_ordered_write_end, 1898 .write_end = ext3_ordered_write_end,
1900 .bmap = ext3_bmap, 1899 .bmap = ext3_bmap,
@@ -1910,7 +1909,6 @@ static const struct address_space_operations ext3_writeback_aops = {
1910 .readpage = ext3_readpage, 1909 .readpage = ext3_readpage,
1911 .readpages = ext3_readpages, 1910 .readpages = ext3_readpages,
1912 .writepage = ext3_writeback_writepage, 1911 .writepage = ext3_writeback_writepage,
1913 .sync_page = block_sync_page,
1914 .write_begin = ext3_write_begin, 1912 .write_begin = ext3_write_begin,
1915 .write_end = ext3_writeback_write_end, 1913 .write_end = ext3_writeback_write_end,
1916 .bmap = ext3_bmap, 1914 .bmap = ext3_bmap,
@@ -1926,7 +1924,6 @@ static const struct address_space_operations ext3_journalled_aops = {
1926 .readpage = ext3_readpage, 1924 .readpage = ext3_readpage,
1927 .readpages = ext3_readpages, 1925 .readpages = ext3_readpages,
1928 .writepage = ext3_journalled_writepage, 1926 .writepage = ext3_journalled_writepage,
1929 .sync_page = block_sync_page,
1930 .write_begin = ext3_write_begin, 1927 .write_begin = ext3_write_begin,
1931 .write_end = ext3_journalled_write_end, 1928 .write_end = ext3_journalled_write_end,
1932 .set_page_dirty = ext3_journalled_set_page_dirty, 1929 .set_page_dirty = ext3_journalled_set_page_dirty,
diff --git a/fs/ext3/ioctl.c b/fs/ext3/ioctl.c
index fc080dd561f7..f4090bd2f345 100644
--- a/fs/ext3/ioctl.c
+++ b/fs/ext3/ioctl.c
@@ -38,7 +38,7 @@ long ext3_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
38 unsigned int oldflags; 38 unsigned int oldflags;
39 unsigned int jflag; 39 unsigned int jflag;
40 40
41 if (!is_owner_or_cap(inode)) 41 if (!inode_owner_or_capable(inode))
42 return -EACCES; 42 return -EACCES;
43 43
44 if (get_user(flags, (int __user *) arg)) 44 if (get_user(flags, (int __user *) arg))
@@ -123,7 +123,7 @@ flags_out:
123 __u32 generation; 123 __u32 generation;
124 int err; 124 int err;
125 125
126 if (!is_owner_or_cap(inode)) 126 if (!inode_owner_or_capable(inode))
127 return -EPERM; 127 return -EPERM;
128 128
129 err = mnt_want_write(filp->f_path.mnt); 129 err = mnt_want_write(filp->f_path.mnt);
@@ -192,7 +192,7 @@ setversion_out:
192 if (err) 192 if (err)
193 return err; 193 return err;
194 194
195 if (!is_owner_or_cap(inode)) { 195 if (!inode_owner_or_capable(inode)) {
196 err = -EACCES; 196 err = -EACCES;
197 goto setrsvsz_out; 197 goto setrsvsz_out;
198 } 198 }
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index b27ba71810ec..32f3b8695859 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -1540,8 +1540,8 @@ static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry,
1540 goto cleanup; 1540 goto cleanup;
1541 node2 = (struct dx_node *)(bh2->b_data); 1541 node2 = (struct dx_node *)(bh2->b_data);
1542 entries2 = node2->entries; 1542 entries2 = node2->entries;
1543 memset(&node2->fake, 0, sizeof(struct fake_dirent));
1543 node2->fake.rec_len = ext3_rec_len_to_disk(sb->s_blocksize); 1544 node2->fake.rec_len = ext3_rec_len_to_disk(sb->s_blocksize);
1544 node2->fake.inode = 0;
1545 BUFFER_TRACE(frame->bh, "get_write_access"); 1545 BUFFER_TRACE(frame->bh, "get_write_access");
1546 err = ext3_journal_get_write_access(handle, frame->bh); 1546 err = ext3_journal_get_write_access(handle, frame->bh);
1547 if (err) 1547 if (err)
@@ -1710,7 +1710,7 @@ retry:
1710 if (IS_DIRSYNC(dir)) 1710 if (IS_DIRSYNC(dir))
1711 handle->h_sync = 1; 1711 handle->h_sync = 1;
1712 1712
1713 inode = ext3_new_inode (handle, dir, mode); 1713 inode = ext3_new_inode (handle, dir, &dentry->d_name, mode);
1714 err = PTR_ERR(inode); 1714 err = PTR_ERR(inode);
1715 if (!IS_ERR(inode)) { 1715 if (!IS_ERR(inode)) {
1716 inode->i_op = &ext3_file_inode_operations; 1716 inode->i_op = &ext3_file_inode_operations;
@@ -1746,7 +1746,7 @@ retry:
1746 if (IS_DIRSYNC(dir)) 1746 if (IS_DIRSYNC(dir))
1747 handle->h_sync = 1; 1747 handle->h_sync = 1;
1748 1748
1749 inode = ext3_new_inode (handle, dir, mode); 1749 inode = ext3_new_inode (handle, dir, &dentry->d_name, mode);
1750 err = PTR_ERR(inode); 1750 err = PTR_ERR(inode);
1751 if (!IS_ERR(inode)) { 1751 if (!IS_ERR(inode)) {
1752 init_special_inode(inode, inode->i_mode, rdev); 1752 init_special_inode(inode, inode->i_mode, rdev);
@@ -1784,7 +1784,7 @@ retry:
1784 if (IS_DIRSYNC(dir)) 1784 if (IS_DIRSYNC(dir))
1785 handle->h_sync = 1; 1785 handle->h_sync = 1;
1786 1786
1787 inode = ext3_new_inode (handle, dir, S_IFDIR | mode); 1787 inode = ext3_new_inode (handle, dir, &dentry->d_name, S_IFDIR | mode);
1788 err = PTR_ERR(inode); 1788 err = PTR_ERR(inode);
1789 if (IS_ERR(inode)) 1789 if (IS_ERR(inode))
1790 goto out_stop; 1790 goto out_stop;
@@ -2206,7 +2206,7 @@ retry:
2206 if (IS_DIRSYNC(dir)) 2206 if (IS_DIRSYNC(dir))
2207 handle->h_sync = 1; 2207 handle->h_sync = 1;
2208 2208
2209 inode = ext3_new_inode (handle, dir, S_IFLNK|S_IRWXUGO); 2209 inode = ext3_new_inode (handle, dir, &dentry->d_name, S_IFLNK|S_IRWXUGO);
2210 err = PTR_ERR(inode); 2210 err = PTR_ERR(inode);
2211 if (IS_ERR(inode)) 2211 if (IS_ERR(inode))
2212 goto out_stop; 2212 goto out_stop;
@@ -2253,13 +2253,6 @@ static int ext3_link (struct dentry * old_dentry,
2253 2253
2254 dquot_initialize(dir); 2254 dquot_initialize(dir);
2255 2255
2256 /*
2257 * Return -ENOENT if we've raced with unlink and i_nlink is 0. Doing
2258 * otherwise has the potential to corrupt the orphan inode list.
2259 */
2260 if (inode->i_nlink == 0)
2261 return -ENOENT;
2262
2263retry: 2256retry:
2264 handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + 2257 handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
2265 EXT3_INDEX_EXTRA_TRANS_BLOCKS); 2258 EXT3_INDEX_EXTRA_TRANS_BLOCKS);
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 85c8cc8f2473..071689f86e18 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -1464,6 +1464,13 @@ static void ext3_orphan_cleanup (struct super_block * sb,
1464 return; 1464 return;
1465 } 1465 }
1466 1466
1467 /* Check if feature set allows readwrite operations */
1468 if (EXT3_HAS_RO_COMPAT_FEATURE(sb, ~EXT3_FEATURE_RO_COMPAT_SUPP)) {
1469 ext3_msg(sb, KERN_INFO, "Skipping orphan cleanup due to "
1470 "unknown ROCOMPAT features");
1471 return;
1472 }
1473
1467 if (EXT3_SB(sb)->s_mount_state & EXT3_ERROR_FS) { 1474 if (EXT3_SB(sb)->s_mount_state & EXT3_ERROR_FS) {
1468 if (es->s_last_orphan) 1475 if (es->s_last_orphan)
1469 jbd_debug(1, "Errors on filesystem, " 1476 jbd_debug(1, "Errors on filesystem, "
@@ -1936,6 +1943,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
1936 sb->s_qcop = &ext3_qctl_operations; 1943 sb->s_qcop = &ext3_qctl_operations;
1937 sb->dq_op = &ext3_quota_operations; 1944 sb->dq_op = &ext3_quota_operations;
1938#endif 1945#endif
1946 memcpy(sb->s_uuid, es->s_uuid, sizeof(es->s_uuid));
1939 INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */ 1947 INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */
1940 mutex_init(&sbi->s_orphan_lock); 1948 mutex_init(&sbi->s_orphan_lock);
1941 mutex_init(&sbi->s_resize_lock); 1949 mutex_init(&sbi->s_resize_lock);
diff --git a/fs/ext3/xattr.h b/fs/ext3/xattr.h
index 377fe7201169..2be4f69bfa64 100644
--- a/fs/ext3/xattr.h
+++ b/fs/ext3/xattr.h
@@ -128,10 +128,10 @@ exit_ext3_xattr(void)
128 128
129#ifdef CONFIG_EXT3_FS_SECURITY 129#ifdef CONFIG_EXT3_FS_SECURITY
130extern int ext3_init_security(handle_t *handle, struct inode *inode, 130extern int ext3_init_security(handle_t *handle, struct inode *inode,
131 struct inode *dir); 131 struct inode *dir, const struct qstr *qstr);
132#else 132#else
133static inline int ext3_init_security(handle_t *handle, struct inode *inode, 133static inline int ext3_init_security(handle_t *handle, struct inode *inode,
134 struct inode *dir) 134 struct inode *dir, const struct qstr *qstr)
135{ 135{
136 return 0; 136 return 0;
137} 137}
diff --git a/fs/ext3/xattr_security.c b/fs/ext3/xattr_security.c
index 03a99bfc59f9..b8d9f83aa5c5 100644
--- a/fs/ext3/xattr_security.c
+++ b/fs/ext3/xattr_security.c
@@ -49,14 +49,15 @@ ext3_xattr_security_set(struct dentry *dentry, const char *name,
49} 49}
50 50
51int 51int
52ext3_init_security(handle_t *handle, struct inode *inode, struct inode *dir) 52ext3_init_security(handle_t *handle, struct inode *inode, struct inode *dir,
53 const struct qstr *qstr)
53{ 54{
54 int err; 55 int err;
55 size_t len; 56 size_t len;
56 void *value; 57 void *value;
57 char *name; 58 char *name;
58 59
59 err = security_inode_init_security(inode, dir, &name, &value, &len); 60 err = security_inode_init_security(inode, dir, qstr, &name, &value, &len);
60 if (err) { 61 if (err) {
61 if (err == -EOPNOTSUPP) 62 if (err == -EOPNOTSUPP)
62 return 0; 63 return 0;
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
index e0270d1f8d82..21eacd7b7d79 100644
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -433,7 +433,7 @@ ext4_xattr_set_acl(struct dentry *dentry, const char *name, const void *value,
433 return -EINVAL; 433 return -EINVAL;
434 if (!test_opt(inode->i_sb, POSIX_ACL)) 434 if (!test_opt(inode->i_sb, POSIX_ACL))
435 return -EOPNOTSUPP; 435 return -EOPNOTSUPP;
436 if (!is_owner_or_cap(inode)) 436 if (!inode_owner_or_capable(inode))
437 return -EPERM; 437 return -EPERM;
438 438
439 if (value) { 439 if (value) {
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 0c8d97b56f34..4daaf2b753f4 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -848,6 +848,7 @@ struct ext4_inode_info {
848 atomic_t i_ioend_count; /* Number of outstanding io_end structs */ 848 atomic_t i_ioend_count; /* Number of outstanding io_end structs */
849 /* current io_end structure for async DIO write*/ 849 /* current io_end structure for async DIO write*/
850 ext4_io_end_t *cur_aio_dio; 850 ext4_io_end_t *cur_aio_dio;
851 atomic_t i_aiodio_unwritten; /* Nr. of inflight conversions pending */
851 852
852 spinlock_t i_block_reservation_lock; 853 spinlock_t i_block_reservation_lock;
853 854
@@ -922,14 +923,14 @@ struct ext4_inode_info {
922#define test_opt2(sb, opt) (EXT4_SB(sb)->s_mount_opt2 & \ 923#define test_opt2(sb, opt) (EXT4_SB(sb)->s_mount_opt2 & \
923 EXT4_MOUNT2_##opt) 924 EXT4_MOUNT2_##opt)
924 925
925#define ext4_set_bit ext2_set_bit 926#define ext4_set_bit __test_and_set_bit_le
926#define ext4_set_bit_atomic ext2_set_bit_atomic 927#define ext4_set_bit_atomic ext2_set_bit_atomic
927#define ext4_clear_bit ext2_clear_bit 928#define ext4_clear_bit __test_and_clear_bit_le
928#define ext4_clear_bit_atomic ext2_clear_bit_atomic 929#define ext4_clear_bit_atomic ext2_clear_bit_atomic
929#define ext4_test_bit ext2_test_bit 930#define ext4_test_bit test_bit_le
930#define ext4_find_first_zero_bit ext2_find_first_zero_bit 931#define ext4_find_first_zero_bit find_first_zero_bit_le
931#define ext4_find_next_zero_bit ext2_find_next_zero_bit 932#define ext4_find_next_zero_bit find_next_zero_bit_le
932#define ext4_find_next_bit ext2_find_next_bit 933#define ext4_find_next_bit find_next_bit_le
933 934
934/* 935/*
935 * Maximal mount counts between two filesystem checks 936 * Maximal mount counts between two filesystem checks
@@ -2119,6 +2120,15 @@ static inline void set_bitmap_uptodate(struct buffer_head *bh)
2119 2120
2120#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) 2121#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)
2121 2122
2123/* For ioend & aio unwritten conversion wait queues */
2124#define EXT4_WQ_HASH_SZ 37
2125#define ext4_ioend_wq(v) (&ext4__ioend_wq[((unsigned long)(v)) %\
2126 EXT4_WQ_HASH_SZ])
2127#define ext4_aio_mutex(v) (&ext4__aio_mutex[((unsigned long)(v)) %\
2128 EXT4_WQ_HASH_SZ])
2129extern wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ];
2130extern struct mutex ext4__aio_mutex[EXT4_WQ_HASH_SZ];
2131
2122#endif /* __KERNEL__ */ 2132#endif /* __KERNEL__ */
2123 2133
2124#endif /* _EXT4_H */ 2134#endif /* _EXT4_H */
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 63a75810b7c3..7516fb9c0bd5 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -131,7 +131,7 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
131 * fragmenting the file system's free space. Maybe we 131 * fragmenting the file system's free space. Maybe we
132 * should have some hueristics or some way to allow 132 * should have some hueristics or some way to allow
133 * userspace to pass a hint to file system, 133 * userspace to pass a hint to file system,
134 * especiially if the latter case turns out to be 134 * especially if the latter case turns out to be
135 * common. 135 * common.
136 */ 136 */
137 ex = path[depth].p_ext; 137 ex = path[depth].p_ext;
@@ -2844,7 +2844,7 @@ fix_extent_len:
2844 * ext4_get_blocks_dio_write() when DIO to write 2844 * ext4_get_blocks_dio_write() when DIO to write
2845 * to an uninitialized extent. 2845 * to an uninitialized extent.
2846 * 2846 *
2847 * Writing to an uninitized extent may result in splitting the uninitialized 2847 * Writing to an uninitialized extent may result in splitting the uninitialized
2848 * extent into multiple /initialized uninitialized extents (up to three) 2848 * extent into multiple /initialized uninitialized extents (up to three)
2849 * There are three possibilities: 2849 * There are three possibilities:
2850 * a> There is no split required: Entire extent should be uninitialized 2850 * a> There is no split required: Entire extent should be uninitialized
@@ -3174,9 +3174,10 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
3174 * that this IO needs to convertion to written when IO is 3174 * that this IO needs to convertion to written when IO is
3175 * completed 3175 * completed
3176 */ 3176 */
3177 if (io) 3177 if (io && !(io->flag & EXT4_IO_END_UNWRITTEN)) {
3178 io->flag = EXT4_IO_END_UNWRITTEN; 3178 io->flag = EXT4_IO_END_UNWRITTEN;
3179 else 3179 atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten);
3180 } else
3180 ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); 3181 ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
3181 if (ext4_should_dioread_nolock(inode)) 3182 if (ext4_should_dioread_nolock(inode))
3182 map->m_flags |= EXT4_MAP_UNINIT; 3183 map->m_flags |= EXT4_MAP_UNINIT;
@@ -3463,9 +3464,10 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3463 * that we need to perform convertion when IO is done. 3464 * that we need to perform convertion when IO is done.
3464 */ 3465 */
3465 if ((flags & EXT4_GET_BLOCKS_PRE_IO)) { 3466 if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
3466 if (io) 3467 if (io && !(io->flag & EXT4_IO_END_UNWRITTEN)) {
3467 io->flag = EXT4_IO_END_UNWRITTEN; 3468 io->flag = EXT4_IO_END_UNWRITTEN;
3468 else 3469 atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten);
3470 } else
3469 ext4_set_inode_state(inode, 3471 ext4_set_inode_state(inode,
3470 EXT4_STATE_DIO_UNWRITTEN); 3472 EXT4_STATE_DIO_UNWRITTEN);
3471 } 3473 }
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 2e8322c8aa88..7b80d543b89e 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -55,11 +55,47 @@ static int ext4_release_file(struct inode *inode, struct file *filp)
55 return 0; 55 return 0;
56} 56}
57 57
58static void ext4_aiodio_wait(struct inode *inode)
59{
60 wait_queue_head_t *wq = ext4_ioend_wq(inode);
61
62 wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_aiodio_unwritten) == 0));
63}
64
65/*
66 * This tests whether the IO in question is block-aligned or not.
67 * Ext4 utilizes unwritten extents when hole-filling during direct IO, and they
68 * are converted to written only after the IO is complete. Until they are
69 * mapped, these blocks appear as holes, so dio_zero_block() will assume that
70 * it needs to zero out portions of the start and/or end block. If 2 AIO
71 * threads are at work on the same unwritten block, they must be synchronized
72 * or one thread will zero the other's data, causing corruption.
73 */
74static int
75ext4_unaligned_aio(struct inode *inode, const struct iovec *iov,
76 unsigned long nr_segs, loff_t pos)
77{
78 struct super_block *sb = inode->i_sb;
79 int blockmask = sb->s_blocksize - 1;
80 size_t count = iov_length(iov, nr_segs);
81 loff_t final_size = pos + count;
82
83 if (pos >= inode->i_size)
84 return 0;
85
86 if ((pos & blockmask) || (final_size & blockmask))
87 return 1;
88
89 return 0;
90}
91
58static ssize_t 92static ssize_t
59ext4_file_write(struct kiocb *iocb, const struct iovec *iov, 93ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
60 unsigned long nr_segs, loff_t pos) 94 unsigned long nr_segs, loff_t pos)
61{ 95{
62 struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode; 96 struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
97 int unaligned_aio = 0;
98 int ret;
63 99
64 /* 100 /*
65 * If we have encountered a bitmap-format file, the size limit 101 * If we have encountered a bitmap-format file, the size limit
@@ -78,9 +114,31 @@ ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
78 nr_segs = iov_shorten((struct iovec *)iov, nr_segs, 114 nr_segs = iov_shorten((struct iovec *)iov, nr_segs,
79 sbi->s_bitmap_maxbytes - pos); 115 sbi->s_bitmap_maxbytes - pos);
80 } 116 }
117 } else if (unlikely((iocb->ki_filp->f_flags & O_DIRECT) &&
118 !is_sync_kiocb(iocb))) {
119 unaligned_aio = ext4_unaligned_aio(inode, iov, nr_segs, pos);
81 } 120 }
82 121
83 return generic_file_aio_write(iocb, iov, nr_segs, pos); 122 /* Unaligned direct AIO must be serialized; see comment above */
123 if (unaligned_aio) {
124 static unsigned long unaligned_warn_time;
125
126 /* Warn about this once per day */
127 if (printk_timed_ratelimit(&unaligned_warn_time, 60*60*24*HZ))
128 ext4_msg(inode->i_sb, KERN_WARNING,
129 "Unaligned AIO/DIO on inode %ld by %s; "
130 "performance will be poor.",
131 inode->i_ino, current->comm);
132 mutex_lock(ext4_aio_mutex(inode));
133 ext4_aiodio_wait(inode);
134 }
135
136 ret = generic_file_aio_write(iocb, iov, nr_segs, pos);
137
138 if (unaligned_aio)
139 mutex_unlock(ext4_aio_mutex(inode));
140
141 return ret;
84} 142}
85 143
86static const struct vm_operations_struct ext4_file_vm_ops = { 144static const struct vm_operations_struct ext4_file_vm_ops = {
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index eb9097aec6f0..78b79e1bd7ed 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -1042,7 +1042,7 @@ got:
1042 if (err) 1042 if (err)
1043 goto fail_free_drop; 1043 goto fail_free_drop;
1044 1044
1045 err = ext4_init_security(handle, inode, dir); 1045 err = ext4_init_security(handle, inode, dir, qstr);
1046 if (err) 1046 if (err)
1047 goto fail_free_drop; 1047 goto fail_free_drop;
1048 1048
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 9f7f9e49914f..9297ad46c465 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3903,7 +3903,6 @@ static const struct address_space_operations ext4_ordered_aops = {
3903 .readpage = ext4_readpage, 3903 .readpage = ext4_readpage,
3904 .readpages = ext4_readpages, 3904 .readpages = ext4_readpages,
3905 .writepage = ext4_writepage, 3905 .writepage = ext4_writepage,
3906 .sync_page = block_sync_page,
3907 .write_begin = ext4_write_begin, 3906 .write_begin = ext4_write_begin,
3908 .write_end = ext4_ordered_write_end, 3907 .write_end = ext4_ordered_write_end,
3909 .bmap = ext4_bmap, 3908 .bmap = ext4_bmap,
@@ -3919,7 +3918,6 @@ static const struct address_space_operations ext4_writeback_aops = {
3919 .readpage = ext4_readpage, 3918 .readpage = ext4_readpage,
3920 .readpages = ext4_readpages, 3919 .readpages = ext4_readpages,
3921 .writepage = ext4_writepage, 3920 .writepage = ext4_writepage,
3922 .sync_page = block_sync_page,
3923 .write_begin = ext4_write_begin, 3921 .write_begin = ext4_write_begin,
3924 .write_end = ext4_writeback_write_end, 3922 .write_end = ext4_writeback_write_end,
3925 .bmap = ext4_bmap, 3923 .bmap = ext4_bmap,
@@ -3935,7 +3933,6 @@ static const struct address_space_operations ext4_journalled_aops = {
3935 .readpage = ext4_readpage, 3933 .readpage = ext4_readpage,
3936 .readpages = ext4_readpages, 3934 .readpages = ext4_readpages,
3937 .writepage = ext4_writepage, 3935 .writepage = ext4_writepage,
3938 .sync_page = block_sync_page,
3939 .write_begin = ext4_write_begin, 3936 .write_begin = ext4_write_begin,
3940 .write_end = ext4_journalled_write_end, 3937 .write_end = ext4_journalled_write_end,
3941 .set_page_dirty = ext4_journalled_set_page_dirty, 3938 .set_page_dirty = ext4_journalled_set_page_dirty,
@@ -3951,7 +3948,6 @@ static const struct address_space_operations ext4_da_aops = {
3951 .readpages = ext4_readpages, 3948 .readpages = ext4_readpages,
3952 .writepage = ext4_writepage, 3949 .writepage = ext4_writepage,
3953 .writepages = ext4_da_writepages, 3950 .writepages = ext4_da_writepages,
3954 .sync_page = block_sync_page,
3955 .write_begin = ext4_da_write_begin, 3951 .write_begin = ext4_da_write_begin,
3956 .write_end = ext4_da_write_end, 3952 .write_end = ext4_da_write_end,
3957 .bmap = ext4_bmap, 3953 .bmap = ext4_bmap,
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index eb3bc2fe647e..a84faa110bcd 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -38,7 +38,7 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
38 unsigned int oldflags; 38 unsigned int oldflags;
39 unsigned int jflag; 39 unsigned int jflag;
40 40
41 if (!is_owner_or_cap(inode)) 41 if (!inode_owner_or_capable(inode))
42 return -EACCES; 42 return -EACCES;
43 43
44 if (get_user(flags, (int __user *) arg)) 44 if (get_user(flags, (int __user *) arg))
@@ -146,7 +146,7 @@ flags_out:
146 __u32 generation; 146 __u32 generation;
147 int err; 147 int err;
148 148
149 if (!is_owner_or_cap(inode)) 149 if (!inode_owner_or_capable(inode))
150 return -EPERM; 150 return -EPERM;
151 151
152 err = mnt_want_write(filp->f_path.mnt); 152 err = mnt_want_write(filp->f_path.mnt);
@@ -298,7 +298,7 @@ mext_out:
298 case EXT4_IOC_MIGRATE: 298 case EXT4_IOC_MIGRATE:
299 { 299 {
300 int err; 300 int err;
301 if (!is_owner_or_cap(inode)) 301 if (!inode_owner_or_capable(inode))
302 return -EACCES; 302 return -EACCES;
303 303
304 err = mnt_want_write(filp->f_path.mnt); 304 err = mnt_want_write(filp->f_path.mnt);
@@ -320,7 +320,7 @@ mext_out:
320 case EXT4_IOC_ALLOC_DA_BLKS: 320 case EXT4_IOC_ALLOC_DA_BLKS:
321 { 321 {
322 int err; 322 int err;
323 if (!is_owner_or_cap(inode)) 323 if (!inode_owner_or_capable(inode))
324 return -EACCES; 324 return -EACCES;
325 325
326 err = mnt_want_write(filp->f_path.mnt); 326 err = mnt_want_write(filp->f_path.mnt);
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 851f49b2f9d2..d1fe09aea73d 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -342,10 +342,15 @@ static struct kmem_cache *ext4_free_ext_cachep;
342/* We create slab caches for groupinfo data structures based on the 342/* We create slab caches for groupinfo data structures based on the
343 * superblock block size. There will be one per mounted filesystem for 343 * superblock block size. There will be one per mounted filesystem for
344 * each unique s_blocksize_bits */ 344 * each unique s_blocksize_bits */
345#define NR_GRPINFO_CACHES \ 345#define NR_GRPINFO_CACHES 8
346 (EXT4_MAX_BLOCK_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE + 1)
347static struct kmem_cache *ext4_groupinfo_caches[NR_GRPINFO_CACHES]; 346static struct kmem_cache *ext4_groupinfo_caches[NR_GRPINFO_CACHES];
348 347
348static const char *ext4_groupinfo_slab_names[NR_GRPINFO_CACHES] = {
349 "ext4_groupinfo_1k", "ext4_groupinfo_2k", "ext4_groupinfo_4k",
350 "ext4_groupinfo_8k", "ext4_groupinfo_16k", "ext4_groupinfo_32k",
351 "ext4_groupinfo_64k", "ext4_groupinfo_128k"
352};
353
349static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, 354static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
350 ext4_group_t group); 355 ext4_group_t group);
351static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, 356static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
@@ -2414,6 +2419,55 @@ err_freesgi:
2414 return -ENOMEM; 2419 return -ENOMEM;
2415} 2420}
2416 2421
2422static void ext4_groupinfo_destroy_slabs(void)
2423{
2424 int i;
2425
2426 for (i = 0; i < NR_GRPINFO_CACHES; i++) {
2427 if (ext4_groupinfo_caches[i])
2428 kmem_cache_destroy(ext4_groupinfo_caches[i]);
2429 ext4_groupinfo_caches[i] = NULL;
2430 }
2431}
2432
2433static int ext4_groupinfo_create_slab(size_t size)
2434{
2435 static DEFINE_MUTEX(ext4_grpinfo_slab_create_mutex);
2436 int slab_size;
2437 int blocksize_bits = order_base_2(size);
2438 int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE;
2439 struct kmem_cache *cachep;
2440
2441 if (cache_index >= NR_GRPINFO_CACHES)
2442 return -EINVAL;
2443
2444 if (unlikely(cache_index < 0))
2445 cache_index = 0;
2446
2447 mutex_lock(&ext4_grpinfo_slab_create_mutex);
2448 if (ext4_groupinfo_caches[cache_index]) {
2449 mutex_unlock(&ext4_grpinfo_slab_create_mutex);
2450 return 0; /* Already created */
2451 }
2452
2453 slab_size = offsetof(struct ext4_group_info,
2454 bb_counters[blocksize_bits + 2]);
2455
2456 cachep = kmem_cache_create(ext4_groupinfo_slab_names[cache_index],
2457 slab_size, 0, SLAB_RECLAIM_ACCOUNT,
2458 NULL);
2459
2460 mutex_unlock(&ext4_grpinfo_slab_create_mutex);
2461 if (!cachep) {
2462 printk(KERN_EMERG "EXT4: no memory for groupinfo slab cache\n");
2463 return -ENOMEM;
2464 }
2465
2466 ext4_groupinfo_caches[cache_index] = cachep;
2467
2468 return 0;
2469}
2470
2417int ext4_mb_init(struct super_block *sb, int needs_recovery) 2471int ext4_mb_init(struct super_block *sb, int needs_recovery)
2418{ 2472{
2419 struct ext4_sb_info *sbi = EXT4_SB(sb); 2473 struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -2421,9 +2475,6 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
2421 unsigned offset; 2475 unsigned offset;
2422 unsigned max; 2476 unsigned max;
2423 int ret; 2477 int ret;
2424 int cache_index;
2425 struct kmem_cache *cachep;
2426 char *namep = NULL;
2427 2478
2428 i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_offsets); 2479 i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_offsets);
2429 2480
@@ -2440,30 +2491,9 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
2440 goto out; 2491 goto out;
2441 } 2492 }
2442 2493
2443 cache_index = sb->s_blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE; 2494 ret = ext4_groupinfo_create_slab(sb->s_blocksize);
2444 cachep = ext4_groupinfo_caches[cache_index]; 2495 if (ret < 0)
2445 if (!cachep) { 2496 goto out;
2446 char name[32];
2447 int len = offsetof(struct ext4_group_info,
2448 bb_counters[sb->s_blocksize_bits + 2]);
2449
2450 sprintf(name, "ext4_groupinfo_%d", sb->s_blocksize_bits);
2451 namep = kstrdup(name, GFP_KERNEL);
2452 if (!namep) {
2453 ret = -ENOMEM;
2454 goto out;
2455 }
2456
2457 /* Need to free the kmem_cache_name() when we
2458 * destroy the slab */
2459 cachep = kmem_cache_create(namep, len, 0,
2460 SLAB_RECLAIM_ACCOUNT, NULL);
2461 if (!cachep) {
2462 ret = -ENOMEM;
2463 goto out;
2464 }
2465 ext4_groupinfo_caches[cache_index] = cachep;
2466 }
2467 2497
2468 /* order 0 is regular bitmap */ 2498 /* order 0 is regular bitmap */
2469 sbi->s_mb_maxs[0] = sb->s_blocksize << 3; 2499 sbi->s_mb_maxs[0] = sb->s_blocksize << 3;
@@ -2520,7 +2550,6 @@ out:
2520 if (ret) { 2550 if (ret) {
2521 kfree(sbi->s_mb_offsets); 2551 kfree(sbi->s_mb_offsets);
2522 kfree(sbi->s_mb_maxs); 2552 kfree(sbi->s_mb_maxs);
2523 kfree(namep);
2524 } 2553 }
2525 return ret; 2554 return ret;
2526} 2555}
@@ -2734,7 +2763,6 @@ int __init ext4_init_mballoc(void)
2734 2763
2735void ext4_exit_mballoc(void) 2764void ext4_exit_mballoc(void)
2736{ 2765{
2737 int i;
2738 /* 2766 /*
2739 * Wait for completion of call_rcu()'s on ext4_pspace_cachep 2767 * Wait for completion of call_rcu()'s on ext4_pspace_cachep
2740 * before destroying the slab cache. 2768 * before destroying the slab cache.
@@ -2743,15 +2771,7 @@ void ext4_exit_mballoc(void)
2743 kmem_cache_destroy(ext4_pspace_cachep); 2771 kmem_cache_destroy(ext4_pspace_cachep);
2744 kmem_cache_destroy(ext4_ac_cachep); 2772 kmem_cache_destroy(ext4_ac_cachep);
2745 kmem_cache_destroy(ext4_free_ext_cachep); 2773 kmem_cache_destroy(ext4_free_ext_cachep);
2746 2774 ext4_groupinfo_destroy_slabs();
2747 for (i = 0; i < NR_GRPINFO_CACHES; i++) {
2748 struct kmem_cache *cachep = ext4_groupinfo_caches[i];
2749 if (cachep) {
2750 char *name = (char *)kmem_cache_name(cachep);
2751 kmem_cache_destroy(cachep);
2752 kfree(name);
2753 }
2754 }
2755 ext4_remove_debugfs_entry(); 2775 ext4_remove_debugfs_entry();
2756} 2776}
2757 2777
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 5485390d32c5..e781b7ea5630 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -2304,13 +2304,6 @@ static int ext4_link(struct dentry *old_dentry,
2304 2304
2305 dquot_initialize(dir); 2305 dquot_initialize(dir);
2306 2306
2307 /*
2308 * Return -ENOENT if we've raced with unlink and i_nlink is 0. Doing
2309 * otherwise has the potential to corrupt the orphan inode list.
2310 */
2311 if (inode->i_nlink == 0)
2312 return -ENOENT;
2313
2314retry: 2307retry:
2315 handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + 2308 handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
2316 EXT4_INDEX_EXTRA_TRANS_BLOCKS); 2309 EXT4_INDEX_EXTRA_TRANS_BLOCKS);
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 7270dcfca92a..e2cd90e4bb7c 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -32,14 +32,8 @@
32 32
33static struct kmem_cache *io_page_cachep, *io_end_cachep; 33static struct kmem_cache *io_page_cachep, *io_end_cachep;
34 34
35#define WQ_HASH_SZ 37
36#define to_ioend_wq(v) (&ioend_wq[((unsigned long)v) % WQ_HASH_SZ])
37static wait_queue_head_t ioend_wq[WQ_HASH_SZ];
38
39int __init ext4_init_pageio(void) 35int __init ext4_init_pageio(void)
40{ 36{
41 int i;
42
43 io_page_cachep = KMEM_CACHE(ext4_io_page, SLAB_RECLAIM_ACCOUNT); 37 io_page_cachep = KMEM_CACHE(ext4_io_page, SLAB_RECLAIM_ACCOUNT);
44 if (io_page_cachep == NULL) 38 if (io_page_cachep == NULL)
45 return -ENOMEM; 39 return -ENOMEM;
@@ -48,9 +42,6 @@ int __init ext4_init_pageio(void)
48 kmem_cache_destroy(io_page_cachep); 42 kmem_cache_destroy(io_page_cachep);
49 return -ENOMEM; 43 return -ENOMEM;
50 } 44 }
51 for (i = 0; i < WQ_HASH_SZ; i++)
52 init_waitqueue_head(&ioend_wq[i]);
53
54 return 0; 45 return 0;
55} 46}
56 47
@@ -62,7 +53,7 @@ void ext4_exit_pageio(void)
62 53
63void ext4_ioend_wait(struct inode *inode) 54void ext4_ioend_wait(struct inode *inode)
64{ 55{
65 wait_queue_head_t *wq = to_ioend_wq(inode); 56 wait_queue_head_t *wq = ext4_ioend_wq(inode);
66 57
67 wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_ioend_count) == 0)); 58 wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_ioend_count) == 0));
68} 59}
@@ -87,7 +78,7 @@ void ext4_free_io_end(ext4_io_end_t *io)
87 for (i = 0; i < io->num_io_pages; i++) 78 for (i = 0; i < io->num_io_pages; i++)
88 put_io_page(io->pages[i]); 79 put_io_page(io->pages[i]);
89 io->num_io_pages = 0; 80 io->num_io_pages = 0;
90 wq = to_ioend_wq(io->inode); 81 wq = ext4_ioend_wq(io->inode);
91 if (atomic_dec_and_test(&EXT4_I(io->inode)->i_ioend_count) && 82 if (atomic_dec_and_test(&EXT4_I(io->inode)->i_ioend_count) &&
92 waitqueue_active(wq)) 83 waitqueue_active(wq))
93 wake_up_all(wq); 84 wake_up_all(wq);
@@ -102,6 +93,7 @@ int ext4_end_io_nolock(ext4_io_end_t *io)
102 struct inode *inode = io->inode; 93 struct inode *inode = io->inode;
103 loff_t offset = io->offset; 94 loff_t offset = io->offset;
104 ssize_t size = io->size; 95 ssize_t size = io->size;
96 wait_queue_head_t *wq;
105 int ret = 0; 97 int ret = 0;
106 98
107 ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p," 99 ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p,"
@@ -126,7 +118,16 @@ int ext4_end_io_nolock(ext4_io_end_t *io)
126 if (io->iocb) 118 if (io->iocb)
127 aio_complete(io->iocb, io->result, 0); 119 aio_complete(io->iocb, io->result, 0);
128 /* clear the DIO AIO unwritten flag */ 120 /* clear the DIO AIO unwritten flag */
129 io->flag &= ~EXT4_IO_END_UNWRITTEN; 121 if (io->flag & EXT4_IO_END_UNWRITTEN) {
122 io->flag &= ~EXT4_IO_END_UNWRITTEN;
123 /* Wake up anyone waiting on unwritten extent conversion */
124 wq = ext4_ioend_wq(io->inode);
125 if (atomic_dec_and_test(&EXT4_I(inode)->i_aiodio_unwritten) &&
126 waitqueue_active(wq)) {
127 wake_up_all(wq);
128 }
129 }
130
130 return ret; 131 return ret;
131} 132}
132 133
@@ -190,6 +191,7 @@ static void ext4_end_bio(struct bio *bio, int error)
190 struct inode *inode; 191 struct inode *inode;
191 unsigned long flags; 192 unsigned long flags;
192 int i; 193 int i;
194 sector_t bi_sector = bio->bi_sector;
193 195
194 BUG_ON(!io_end); 196 BUG_ON(!io_end);
195 bio->bi_private = NULL; 197 bio->bi_private = NULL;
@@ -207,9 +209,7 @@ static void ext4_end_bio(struct bio *bio, int error)
207 if (error) 209 if (error)
208 SetPageError(page); 210 SetPageError(page);
209 BUG_ON(!head); 211 BUG_ON(!head);
210 if (head->b_size == PAGE_CACHE_SIZE) 212 if (head->b_size != PAGE_CACHE_SIZE) {
211 clear_buffer_dirty(head);
212 else {
213 loff_t offset; 213 loff_t offset;
214 loff_t io_end_offset = io_end->offset + io_end->size; 214 loff_t io_end_offset = io_end->offset + io_end->size;
215 215
@@ -221,7 +221,6 @@ static void ext4_end_bio(struct bio *bio, int error)
221 if (error) 221 if (error)
222 buffer_io_error(bh); 222 buffer_io_error(bh);
223 223
224 clear_buffer_dirty(bh);
225 } 224 }
226 if (buffer_delay(bh)) 225 if (buffer_delay(bh))
227 partial_write = 1; 226 partial_write = 1;
@@ -257,7 +256,7 @@ static void ext4_end_bio(struct bio *bio, int error)
257 (unsigned long long) io_end->offset, 256 (unsigned long long) io_end->offset,
258 (long) io_end->size, 257 (long) io_end->size,
259 (unsigned long long) 258 (unsigned long long)
260 bio->bi_sector >> (inode->i_blkbits - 9)); 259 bi_sector >> (inode->i_blkbits - 9));
261 } 260 }
262 261
263 /* Add the io_end to per-inode completed io list*/ 262 /* Add the io_end to per-inode completed io list*/
@@ -311,8 +310,7 @@ static int io_submit_init(struct ext4_io_submit *io,
311 io_end->offset = (page->index << PAGE_CACHE_SHIFT) + bh_offset(bh); 310 io_end->offset = (page->index << PAGE_CACHE_SHIFT) + bh_offset(bh);
312 311
313 io->io_bio = bio; 312 io->io_bio = bio;
314 io->io_op = (wbc->sync_mode == WB_SYNC_ALL ? 313 io->io_op = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE);
315 WRITE_SYNC_PLUG : WRITE);
316 io->io_next_block = bh->b_blocknr; 314 io->io_next_block = bh->b_blocknr;
317 return 0; 315 return 0;
318} 316}
@@ -380,6 +378,7 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
380 378
381 blocksize = 1 << inode->i_blkbits; 379 blocksize = 1 << inode->i_blkbits;
382 380
381 BUG_ON(!PageLocked(page));
383 BUG_ON(PageWriteback(page)); 382 BUG_ON(PageWriteback(page));
384 set_page_writeback(page); 383 set_page_writeback(page);
385 ClearPageError(page); 384 ClearPageError(page);
@@ -397,12 +396,14 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
397 for (bh = head = page_buffers(page), block_start = 0; 396 for (bh = head = page_buffers(page), block_start = 0;
398 bh != head || !block_start; 397 bh != head || !block_start;
399 block_start = block_end, bh = bh->b_this_page) { 398 block_start = block_end, bh = bh->b_this_page) {
399
400 block_end = block_start + blocksize; 400 block_end = block_start + blocksize;
401 if (block_start >= len) { 401 if (block_start >= len) {
402 clear_buffer_dirty(bh); 402 clear_buffer_dirty(bh);
403 set_buffer_uptodate(bh); 403 set_buffer_uptodate(bh);
404 continue; 404 continue;
405 } 405 }
406 clear_buffer_dirty(bh);
406 ret = io_submit_add_bh(io, io_page, inode, wbc, bh); 407 ret = io_submit_add_bh(io, io_page, inode, wbc, bh);
407 if (ret) { 408 if (ret) {
408 /* 409 /*
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 48ce561fafac..203f9e4a70be 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -77,6 +77,7 @@ static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags,
77 const char *dev_name, void *data); 77 const char *dev_name, void *data);
78static void ext4_destroy_lazyinit_thread(void); 78static void ext4_destroy_lazyinit_thread(void);
79static void ext4_unregister_li_request(struct super_block *sb); 79static void ext4_unregister_li_request(struct super_block *sb);
80static void ext4_clear_request_list(void);
80 81
81#if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) 82#if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
82static struct file_system_type ext3_fs_type = { 83static struct file_system_type ext3_fs_type = {
@@ -832,6 +833,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
832 ei->i_sync_tid = 0; 833 ei->i_sync_tid = 0;
833 ei->i_datasync_tid = 0; 834 ei->i_datasync_tid = 0;
834 atomic_set(&ei->i_ioend_count, 0); 835 atomic_set(&ei->i_ioend_count, 0);
836 atomic_set(&ei->i_aiodio_unwritten, 0);
835 837
836 return &ei->vfs_inode; 838 return &ei->vfs_inode;
837} 839}
@@ -2716,6 +2718,8 @@ static void ext4_unregister_li_request(struct super_block *sb)
2716 mutex_unlock(&ext4_li_info->li_list_mtx); 2718 mutex_unlock(&ext4_li_info->li_list_mtx);
2717} 2719}
2718 2720
2721static struct task_struct *ext4_lazyinit_task;
2722
2719/* 2723/*
2720 * This is the function where ext4lazyinit thread lives. It walks 2724 * This is the function where ext4lazyinit thread lives. It walks
2721 * through the request list searching for next scheduled filesystem. 2725 * through the request list searching for next scheduled filesystem.
@@ -2784,6 +2788,10 @@ cont_thread:
2784 if (time_before(jiffies, next_wakeup)) 2788 if (time_before(jiffies, next_wakeup))
2785 schedule(); 2789 schedule();
2786 finish_wait(&eli->li_wait_daemon, &wait); 2790 finish_wait(&eli->li_wait_daemon, &wait);
2791 if (kthread_should_stop()) {
2792 ext4_clear_request_list();
2793 goto exit_thread;
2794 }
2787 } 2795 }
2788 2796
2789exit_thread: 2797exit_thread:
@@ -2808,6 +2816,7 @@ exit_thread:
2808 wake_up(&eli->li_wait_task); 2816 wake_up(&eli->li_wait_task);
2809 2817
2810 kfree(ext4_li_info); 2818 kfree(ext4_li_info);
2819 ext4_lazyinit_task = NULL;
2811 ext4_li_info = NULL; 2820 ext4_li_info = NULL;
2812 mutex_unlock(&ext4_li_mtx); 2821 mutex_unlock(&ext4_li_mtx);
2813 2822
@@ -2830,11 +2839,10 @@ static void ext4_clear_request_list(void)
2830 2839
2831static int ext4_run_lazyinit_thread(void) 2840static int ext4_run_lazyinit_thread(void)
2832{ 2841{
2833 struct task_struct *t; 2842 ext4_lazyinit_task = kthread_run(ext4_lazyinit_thread,
2834 2843 ext4_li_info, "ext4lazyinit");
2835 t = kthread_run(ext4_lazyinit_thread, ext4_li_info, "ext4lazyinit"); 2844 if (IS_ERR(ext4_lazyinit_task)) {
2836 if (IS_ERR(t)) { 2845 int err = PTR_ERR(ext4_lazyinit_task);
2837 int err = PTR_ERR(t);
2838 ext4_clear_request_list(); 2846 ext4_clear_request_list();
2839 del_timer_sync(&ext4_li_info->li_timer); 2847 del_timer_sync(&ext4_li_info->li_timer);
2840 kfree(ext4_li_info); 2848 kfree(ext4_li_info);
@@ -2985,16 +2993,10 @@ static void ext4_destroy_lazyinit_thread(void)
2985 * If thread exited earlier 2993 * If thread exited earlier
2986 * there's nothing to be done. 2994 * there's nothing to be done.
2987 */ 2995 */
2988 if (!ext4_li_info) 2996 if (!ext4_li_info || !ext4_lazyinit_task)
2989 return; 2997 return;
2990 2998
2991 ext4_clear_request_list(); 2999 kthread_stop(ext4_lazyinit_task);
2992
2993 while (ext4_li_info->li_task) {
2994 wake_up(&ext4_li_info->li_wait_daemon);
2995 wait_event(ext4_li_info->li_wait_task,
2996 ext4_li_info->li_task == NULL);
2997 }
2998} 3000}
2999 3001
3000static int ext4_fill_super(struct super_block *sb, void *data, int silent) 3002static int ext4_fill_super(struct super_block *sb, void *data, int silent)
@@ -3413,6 +3415,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3413 sb->s_qcop = &ext4_qctl_operations; 3415 sb->s_qcop = &ext4_qctl_operations;
3414 sb->dq_op = &ext4_quota_operations; 3416 sb->dq_op = &ext4_quota_operations;
3415#endif 3417#endif
3418 memcpy(sb->s_uuid, es->s_uuid, sizeof(es->s_uuid));
3419
3416 INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */ 3420 INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */
3417 mutex_init(&sbi->s_orphan_lock); 3421 mutex_init(&sbi->s_orphan_lock);
3418 mutex_init(&sbi->s_resize_lock); 3422 mutex_init(&sbi->s_resize_lock);
@@ -3507,7 +3511,12 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3507 percpu_counter_set(&sbi->s_dirtyblocks_counter, 0); 3511 percpu_counter_set(&sbi->s_dirtyblocks_counter, 0);
3508 3512
3509no_journal: 3513no_journal:
3510 EXT4_SB(sb)->dio_unwritten_wq = create_workqueue("ext4-dio-unwritten"); 3514 /*
3515 * The maximum number of concurrent works can be high and
3516 * concurrency isn't really necessary. Limit it to 1.
3517 */
3518 EXT4_SB(sb)->dio_unwritten_wq =
3519 alloc_workqueue("ext4-dio-unwritten", WQ_MEM_RECLAIM, 1);
3511 if (!EXT4_SB(sb)->dio_unwritten_wq) { 3520 if (!EXT4_SB(sb)->dio_unwritten_wq) {
3512 printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n"); 3521 printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n");
3513 goto failed_mount_wq; 3522 goto failed_mount_wq;
@@ -4768,7 +4777,7 @@ static struct file_system_type ext4_fs_type = {
4768 .fs_flags = FS_REQUIRES_DEV, 4777 .fs_flags = FS_REQUIRES_DEV,
4769}; 4778};
4770 4779
4771int __init ext4_init_feat_adverts(void) 4780static int __init ext4_init_feat_adverts(void)
4772{ 4781{
4773 struct ext4_features *ef; 4782 struct ext4_features *ef;
4774 int ret = -ENOMEM; 4783 int ret = -ENOMEM;
@@ -4792,23 +4801,44 @@ out:
4792 return ret; 4801 return ret;
4793} 4802}
4794 4803
4804static void ext4_exit_feat_adverts(void)
4805{
4806 kobject_put(&ext4_feat->f_kobj);
4807 wait_for_completion(&ext4_feat->f_kobj_unregister);
4808 kfree(ext4_feat);
4809}
4810
4811/* Shared across all ext4 file systems */
4812wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ];
4813struct mutex ext4__aio_mutex[EXT4_WQ_HASH_SZ];
4814
4795static int __init ext4_init_fs(void) 4815static int __init ext4_init_fs(void)
4796{ 4816{
4797 int err; 4817 int i, err;
4798 4818
4799 ext4_check_flag_values(); 4819 ext4_check_flag_values();
4820
4821 for (i = 0; i < EXT4_WQ_HASH_SZ; i++) {
4822 mutex_init(&ext4__aio_mutex[i]);
4823 init_waitqueue_head(&ext4__ioend_wq[i]);
4824 }
4825
4800 err = ext4_init_pageio(); 4826 err = ext4_init_pageio();
4801 if (err) 4827 if (err)
4802 return err; 4828 return err;
4803 err = ext4_init_system_zone(); 4829 err = ext4_init_system_zone();
4804 if (err) 4830 if (err)
4805 goto out5; 4831 goto out7;
4806 ext4_kset = kset_create_and_add("ext4", NULL, fs_kobj); 4832 ext4_kset = kset_create_and_add("ext4", NULL, fs_kobj);
4807 if (!ext4_kset) 4833 if (!ext4_kset)
4808 goto out4; 4834 goto out6;
4809 ext4_proc_root = proc_mkdir("fs/ext4", NULL); 4835 ext4_proc_root = proc_mkdir("fs/ext4", NULL);
4836 if (!ext4_proc_root)
4837 goto out5;
4810 4838
4811 err = ext4_init_feat_adverts(); 4839 err = ext4_init_feat_adverts();
4840 if (err)
4841 goto out4;
4812 4842
4813 err = ext4_init_mballoc(); 4843 err = ext4_init_mballoc();
4814 if (err) 4844 if (err)
@@ -4838,12 +4868,14 @@ out1:
4838out2: 4868out2:
4839 ext4_exit_mballoc(); 4869 ext4_exit_mballoc();
4840out3: 4870out3:
4841 kfree(ext4_feat); 4871 ext4_exit_feat_adverts();
4872out4:
4842 remove_proc_entry("fs/ext4", NULL); 4873 remove_proc_entry("fs/ext4", NULL);
4874out5:
4843 kset_unregister(ext4_kset); 4875 kset_unregister(ext4_kset);
4844out4: 4876out6:
4845 ext4_exit_system_zone(); 4877 ext4_exit_system_zone();
4846out5: 4878out7:
4847 ext4_exit_pageio(); 4879 ext4_exit_pageio();
4848 return err; 4880 return err;
4849} 4881}
@@ -4857,6 +4889,7 @@ static void __exit ext4_exit_fs(void)
4857 destroy_inodecache(); 4889 destroy_inodecache();
4858 ext4_exit_xattr(); 4890 ext4_exit_xattr();
4859 ext4_exit_mballoc(); 4891 ext4_exit_mballoc();
4892 ext4_exit_feat_adverts();
4860 remove_proc_entry("fs/ext4", NULL); 4893 remove_proc_entry("fs/ext4", NULL);
4861 kset_unregister(ext4_kset); 4894 kset_unregister(ext4_kset);
4862 ext4_exit_system_zone(); 4895 ext4_exit_system_zone();
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
index 1ef16520b950..25b7387ff183 100644
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -145,10 +145,10 @@ ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
145 145
146#ifdef CONFIG_EXT4_FS_SECURITY 146#ifdef CONFIG_EXT4_FS_SECURITY
147extern int ext4_init_security(handle_t *handle, struct inode *inode, 147extern int ext4_init_security(handle_t *handle, struct inode *inode,
148 struct inode *dir); 148 struct inode *dir, const struct qstr *qstr);
149#else 149#else
150static inline int ext4_init_security(handle_t *handle, struct inode *inode, 150static inline int ext4_init_security(handle_t *handle, struct inode *inode,
151 struct inode *dir) 151 struct inode *dir, const struct qstr *qstr)
152{ 152{
153 return 0; 153 return 0;
154} 154}
diff --git a/fs/ext4/xattr_security.c b/fs/ext4/xattr_security.c
index 9b21268e121c..007c3bfbf094 100644
--- a/fs/ext4/xattr_security.c
+++ b/fs/ext4/xattr_security.c
@@ -49,14 +49,15 @@ ext4_xattr_security_set(struct dentry *dentry, const char *name,
49} 49}
50 50
51int 51int
52ext4_init_security(handle_t *handle, struct inode *inode, struct inode *dir) 52ext4_init_security(handle_t *handle, struct inode *inode, struct inode *dir,
53 const struct qstr *qstr)
53{ 54{
54 int err; 55 int err;
55 size_t len; 56 size_t len;
56 void *value; 57 void *value;
57 char *name; 58 char *name;
58 59
59 err = security_inode_init_security(inode, dir, &name, &value, &len); 60 err = security_inode_init_security(inode, dir, qstr, &name, &value, &len);
60 if (err) { 61 if (err) {
61 if (err == -EOPNOTSUPP) 62 if (err == -EOPNOTSUPP)
62 return 0; 63 return 0;
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 86753fe10bd1..8d68690bdcf1 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -236,7 +236,6 @@ static const struct address_space_operations fat_aops = {
236 .readpages = fat_readpages, 236 .readpages = fat_readpages,
237 .writepage = fat_writepage, 237 .writepage = fat_writepage,
238 .writepages = fat_writepages, 238 .writepages = fat_writepages,
239 .sync_page = block_sync_page,
240 .write_begin = fat_write_begin, 239 .write_begin = fat_write_begin,
241 .write_end = fat_write_end, 240 .write_end = fat_write_end,
242 .direct_IO = fat_direct_IO, 241 .direct_IO = fat_direct_IO,
@@ -757,8 +756,10 @@ fat_encode_fh(struct dentry *de, __u32 *fh, int *lenp, int connectable)
757 struct inode *inode = de->d_inode; 756 struct inode *inode = de->d_inode;
758 u32 ipos_h, ipos_m, ipos_l; 757 u32 ipos_h, ipos_m, ipos_l;
759 758
760 if (len < 5) 759 if (len < 5) {
760 *lenp = 5;
761 return 255; /* no room */ 761 return 255; /* no room */
762 }
762 763
763 ipos_h = MSDOS_I(inode)->i_pos >> 8; 764 ipos_h = MSDOS_I(inode)->i_pos >> 8;
764 ipos_m = (MSDOS_I(inode)->i_pos & 0xf0) << 24; 765 ipos_m = (MSDOS_I(inode)->i_pos & 0xf0) << 24;
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index f88f752babd9..adae3fb7451a 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -43,7 +43,7 @@ static int vfat_revalidate_shortname(struct dentry *dentry)
43 43
44static int vfat_revalidate(struct dentry *dentry, struct nameidata *nd) 44static int vfat_revalidate(struct dentry *dentry, struct nameidata *nd)
45{ 45{
46 if (nd->flags & LOOKUP_RCU) 46 if (nd && nd->flags & LOOKUP_RCU)
47 return -ECHILD; 47 return -ECHILD;
48 48
49 /* This is not negative dentry. Always valid. */ 49 /* This is not negative dentry. Always valid. */
@@ -54,7 +54,7 @@ static int vfat_revalidate(struct dentry *dentry, struct nameidata *nd)
54 54
55static int vfat_revalidate_ci(struct dentry *dentry, struct nameidata *nd) 55static int vfat_revalidate_ci(struct dentry *dentry, struct nameidata *nd)
56{ 56{
57 if (nd->flags & LOOKUP_RCU) 57 if (nd && nd->flags & LOOKUP_RCU)
58 return -ECHILD; 58 return -ECHILD;
59 59
60 /* 60 /*
diff --git a/fs/fcntl.c b/fs/fcntl.c
index ecc8b3954ed6..22764c7c8382 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -131,7 +131,7 @@ SYSCALL_DEFINE2(dup2, unsigned int, oldfd, unsigned int, newfd)
131SYSCALL_DEFINE1(dup, unsigned int, fildes) 131SYSCALL_DEFINE1(dup, unsigned int, fildes)
132{ 132{
133 int ret = -EBADF; 133 int ret = -EBADF;
134 struct file *file = fget(fildes); 134 struct file *file = fget_raw(fildes);
135 135
136 if (file) { 136 if (file) {
137 ret = get_unused_fd(); 137 ret = get_unused_fd();
@@ -159,7 +159,7 @@ static int setfl(int fd, struct file * filp, unsigned long arg)
159 159
160 /* O_NOATIME can only be set by the owner or superuser */ 160 /* O_NOATIME can only be set by the owner or superuser */
161 if ((arg & O_NOATIME) && !(filp->f_flags & O_NOATIME)) 161 if ((arg & O_NOATIME) && !(filp->f_flags & O_NOATIME))
162 if (!is_owner_or_cap(inode)) 162 if (!inode_owner_or_capable(inode))
163 return -EPERM; 163 return -EPERM;
164 164
165 /* required for strict SunOS emulation */ 165 /* required for strict SunOS emulation */
@@ -426,15 +426,35 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
426 return err; 426 return err;
427} 427}
428 428
429static int check_fcntl_cmd(unsigned cmd)
430{
431 switch (cmd) {
432 case F_DUPFD:
433 case F_DUPFD_CLOEXEC:
434 case F_GETFD:
435 case F_SETFD:
436 case F_GETFL:
437 return 1;
438 }
439 return 0;
440}
441
429SYSCALL_DEFINE3(fcntl, unsigned int, fd, unsigned int, cmd, unsigned long, arg) 442SYSCALL_DEFINE3(fcntl, unsigned int, fd, unsigned int, cmd, unsigned long, arg)
430{ 443{
431 struct file *filp; 444 struct file *filp;
432 long err = -EBADF; 445 long err = -EBADF;
433 446
434 filp = fget(fd); 447 filp = fget_raw(fd);
435 if (!filp) 448 if (!filp)
436 goto out; 449 goto out;
437 450
451 if (unlikely(filp->f_mode & FMODE_PATH)) {
452 if (!check_fcntl_cmd(cmd)) {
453 fput(filp);
454 goto out;
455 }
456 }
457
438 err = security_file_fcntl(filp, cmd, arg); 458 err = security_file_fcntl(filp, cmd, arg);
439 if (err) { 459 if (err) {
440 fput(filp); 460 fput(filp);
@@ -456,10 +476,17 @@ SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd,
456 long err; 476 long err;
457 477
458 err = -EBADF; 478 err = -EBADF;
459 filp = fget(fd); 479 filp = fget_raw(fd);
460 if (!filp) 480 if (!filp)
461 goto out; 481 goto out;
462 482
483 if (unlikely(filp->f_mode & FMODE_PATH)) {
484 if (!check_fcntl_cmd(cmd)) {
485 fput(filp);
486 goto out;
487 }
488 }
489
463 err = security_file_fcntl(filp, cmd, arg); 490 err = security_file_fcntl(filp, cmd, arg);
464 if (err) { 491 if (err) {
465 fput(filp); 492 fput(filp);
@@ -808,14 +835,14 @@ static int __init fcntl_init(void)
808 * Exceptions: O_NONBLOCK is a two bit define on parisc; O_NDELAY 835 * Exceptions: O_NONBLOCK is a two bit define on parisc; O_NDELAY
809 * is defined as O_NONBLOCK on some platforms and not on others. 836 * is defined as O_NONBLOCK on some platforms and not on others.
810 */ 837 */
811 BUILD_BUG_ON(18 - 1 /* for O_RDONLY being 0 */ != HWEIGHT32( 838 BUILD_BUG_ON(19 - 1 /* for O_RDONLY being 0 */ != HWEIGHT32(
812 O_RDONLY | O_WRONLY | O_RDWR | 839 O_RDONLY | O_WRONLY | O_RDWR |
813 O_CREAT | O_EXCL | O_NOCTTY | 840 O_CREAT | O_EXCL | O_NOCTTY |
814 O_TRUNC | O_APPEND | /* O_NONBLOCK | */ 841 O_TRUNC | O_APPEND | /* O_NONBLOCK | */
815 __O_SYNC | O_DSYNC | FASYNC | 842 __O_SYNC | O_DSYNC | FASYNC |
816 O_DIRECT | O_LARGEFILE | O_DIRECTORY | 843 O_DIRECT | O_LARGEFILE | O_DIRECTORY |
817 O_NOFOLLOW | O_NOATIME | O_CLOEXEC | 844 O_NOFOLLOW | O_NOATIME | O_CLOEXEC |
818 FMODE_EXEC 845 __FMODE_EXEC | O_PATH
819 )); 846 ));
820 847
821 fasync_cache = kmem_cache_create("fasync_cache", 848 fasync_cache = kmem_cache_create("fasync_cache",
diff --git a/fs/fhandle.c b/fs/fhandle.c
new file mode 100644
index 000000000000..bf93ad2bee07
--- /dev/null
+++ b/fs/fhandle.c
@@ -0,0 +1,265 @@
1#include <linux/syscalls.h>
2#include <linux/slab.h>
3#include <linux/fs.h>
4#include <linux/file.h>
5#include <linux/mount.h>
6#include <linux/namei.h>
7#include <linux/exportfs.h>
8#include <linux/fs_struct.h>
9#include <linux/fsnotify.h>
10#include <asm/uaccess.h>
11#include "internal.h"
12
13static long do_sys_name_to_handle(struct path *path,
14 struct file_handle __user *ufh,
15 int __user *mnt_id)
16{
17 long retval;
18 struct file_handle f_handle;
19 int handle_dwords, handle_bytes;
20 struct file_handle *handle = NULL;
21
22 /*
23 * We need t make sure wether the file system
24 * support decoding of the file handle
25 */
26 if (!path->mnt->mnt_sb->s_export_op ||
27 !path->mnt->mnt_sb->s_export_op->fh_to_dentry)
28 return -EOPNOTSUPP;
29
30 if (copy_from_user(&f_handle, ufh, sizeof(struct file_handle)))
31 return -EFAULT;
32
33 if (f_handle.handle_bytes > MAX_HANDLE_SZ)
34 return -EINVAL;
35
36 handle = kmalloc(sizeof(struct file_handle) + f_handle.handle_bytes,
37 GFP_KERNEL);
38 if (!handle)
39 return -ENOMEM;
40
41 /* convert handle size to multiple of sizeof(u32) */
42 handle_dwords = f_handle.handle_bytes >> 2;
43
44 /* we ask for a non connected handle */
45 retval = exportfs_encode_fh(path->dentry,
46 (struct fid *)handle->f_handle,
47 &handle_dwords, 0);
48 handle->handle_type = retval;
49 /* convert handle size to bytes */
50 handle_bytes = handle_dwords * sizeof(u32);
51 handle->handle_bytes = handle_bytes;
52 if ((handle->handle_bytes > f_handle.handle_bytes) ||
53 (retval == 255) || (retval == -ENOSPC)) {
54 /* As per old exportfs_encode_fh documentation
55 * we could return ENOSPC to indicate overflow
56 * But file system returned 255 always. So handle
57 * both the values
58 */
59 /*
60 * set the handle size to zero so we copy only
61 * non variable part of the file_handle
62 */
63 handle_bytes = 0;
64 retval = -EOVERFLOW;
65 } else
66 retval = 0;
67 /* copy the mount id */
68 if (copy_to_user(mnt_id, &path->mnt->mnt_id, sizeof(*mnt_id)) ||
69 copy_to_user(ufh, handle,
70 sizeof(struct file_handle) + handle_bytes))
71 retval = -EFAULT;
72 kfree(handle);
73 return retval;
74}
75
76/**
77 * sys_name_to_handle_at: convert name to handle
78 * @dfd: directory relative to which name is interpreted if not absolute
79 * @name: name that should be converted to handle.
80 * @handle: resulting file handle
81 * @mnt_id: mount id of the file system containing the file
82 * @flag: flag value to indicate whether to follow symlink or not
83 *
84 * @handle->handle_size indicate the space available to store the
85 * variable part of the file handle in bytes. If there is not
86 * enough space, the field is updated to return the minimum
87 * value required.
88 */
89SYSCALL_DEFINE5(name_to_handle_at, int, dfd, const char __user *, name,
90 struct file_handle __user *, handle, int __user *, mnt_id,
91 int, flag)
92{
93 struct path path;
94 int lookup_flags;
95 int err;
96
97 if ((flag & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH)) != 0)
98 return -EINVAL;
99
100 lookup_flags = (flag & AT_SYMLINK_FOLLOW) ? LOOKUP_FOLLOW : 0;
101 if (flag & AT_EMPTY_PATH)
102 lookup_flags |= LOOKUP_EMPTY;
103 err = user_path_at(dfd, name, lookup_flags, &path);
104 if (!err) {
105 err = do_sys_name_to_handle(&path, handle, mnt_id);
106 path_put(&path);
107 }
108 return err;
109}
110
111static struct vfsmount *get_vfsmount_from_fd(int fd)
112{
113 struct path path;
114
115 if (fd == AT_FDCWD) {
116 struct fs_struct *fs = current->fs;
117 spin_lock(&fs->lock);
118 path = fs->pwd;
119 mntget(path.mnt);
120 spin_unlock(&fs->lock);
121 } else {
122 int fput_needed;
123 struct file *file = fget_light(fd, &fput_needed);
124 if (!file)
125 return ERR_PTR(-EBADF);
126 path = file->f_path;
127 mntget(path.mnt);
128 fput_light(file, fput_needed);
129 }
130 return path.mnt;
131}
132
133static int vfs_dentry_acceptable(void *context, struct dentry *dentry)
134{
135 return 1;
136}
137
138static int do_handle_to_path(int mountdirfd, struct file_handle *handle,
139 struct path *path)
140{
141 int retval = 0;
142 int handle_dwords;
143
144 path->mnt = get_vfsmount_from_fd(mountdirfd);
145 if (IS_ERR(path->mnt)) {
146 retval = PTR_ERR(path->mnt);
147 goto out_err;
148 }
149 /* change the handle size to multiple of sizeof(u32) */
150 handle_dwords = handle->handle_bytes >> 2;
151 path->dentry = exportfs_decode_fh(path->mnt,
152 (struct fid *)handle->f_handle,
153 handle_dwords, handle->handle_type,
154 vfs_dentry_acceptable, NULL);
155 if (IS_ERR(path->dentry)) {
156 retval = PTR_ERR(path->dentry);
157 goto out_mnt;
158 }
159 return 0;
160out_mnt:
161 mntput(path->mnt);
162out_err:
163 return retval;
164}
165
166static int handle_to_path(int mountdirfd, struct file_handle __user *ufh,
167 struct path *path)
168{
169 int retval = 0;
170 struct file_handle f_handle;
171 struct file_handle *handle = NULL;
172
173 /*
174 * With handle we don't look at the execute bit on the
175 * the directory. Ideally we would like CAP_DAC_SEARCH.
176 * But we don't have that
177 */
178 if (!capable(CAP_DAC_READ_SEARCH)) {
179 retval = -EPERM;
180 goto out_err;
181 }
182 if (copy_from_user(&f_handle, ufh, sizeof(struct file_handle))) {
183 retval = -EFAULT;
184 goto out_err;
185 }
186 if ((f_handle.handle_bytes > MAX_HANDLE_SZ) ||
187 (f_handle.handle_bytes == 0)) {
188 retval = -EINVAL;
189 goto out_err;
190 }
191 handle = kmalloc(sizeof(struct file_handle) + f_handle.handle_bytes,
192 GFP_KERNEL);
193 if (!handle) {
194 retval = -ENOMEM;
195 goto out_err;
196 }
197 /* copy the full handle */
198 if (copy_from_user(handle, ufh,
199 sizeof(struct file_handle) +
200 f_handle.handle_bytes)) {
201 retval = -EFAULT;
202 goto out_handle;
203 }
204
205 retval = do_handle_to_path(mountdirfd, handle, path);
206
207out_handle:
208 kfree(handle);
209out_err:
210 return retval;
211}
212
213long do_handle_open(int mountdirfd,
214 struct file_handle __user *ufh, int open_flag)
215{
216 long retval = 0;
217 struct path path;
218 struct file *file;
219 int fd;
220
221 retval = handle_to_path(mountdirfd, ufh, &path);
222 if (retval)
223 return retval;
224
225 fd = get_unused_fd_flags(open_flag);
226 if (fd < 0) {
227 path_put(&path);
228 return fd;
229 }
230 file = file_open_root(path.dentry, path.mnt, "", open_flag);
231 if (IS_ERR(file)) {
232 put_unused_fd(fd);
233 retval = PTR_ERR(file);
234 } else {
235 retval = fd;
236 fsnotify_open(file);
237 fd_install(fd, file);
238 }
239 path_put(&path);
240 return retval;
241}
242
243/**
244 * sys_open_by_handle_at: Open the file handle
245 * @mountdirfd: directory file descriptor
246 * @handle: file handle to be opened
247 * @flag: open flags.
248 *
249 * @mountdirfd indicate the directory file descriptor
250 * of the mount point. file handle is decoded relative
251 * to the vfsmount pointed by the @mountdirfd. @flags
252 * value is same as the open(2) flags.
253 */
254SYSCALL_DEFINE3(open_by_handle_at, int, mountdirfd,
255 struct file_handle __user *, handle,
256 int, flags)
257{
258 long ret;
259
260 if (force_o_largefile())
261 flags |= O_LARGEFILE;
262
263 ret = do_handle_open(mountdirfd, handle, flags);
264 return ret;
265}
diff --git a/fs/fifo.c b/fs/fifo.c
index 4e303c22d5ee..b1a524d798e7 100644
--- a/fs/fifo.c
+++ b/fs/fifo.c
@@ -66,8 +66,7 @@ static int fifo_open(struct inode *inode, struct file *filp)
66 /* suppress POLLHUP until we have 66 /* suppress POLLHUP until we have
67 * seen a writer */ 67 * seen a writer */
68 filp->f_version = pipe->w_counter; 68 filp->f_version = pipe->w_counter;
69 } else 69 } else {
70 {
71 wait_for_partner(inode, &pipe->w_counter); 70 wait_for_partner(inode, &pipe->w_counter);
72 if(signal_pending(current)) 71 if(signal_pending(current))
73 goto err_rd; 72 goto err_rd;
diff --git a/fs/file_table.c b/fs/file_table.c
index c3e89adf53c0..01e4c1e8e6b6 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -125,13 +125,13 @@ struct file *get_empty_filp(void)
125 goto fail; 125 goto fail;
126 126
127 percpu_counter_inc(&nr_files); 127 percpu_counter_inc(&nr_files);
128 f->f_cred = get_cred(cred);
128 if (security_file_alloc(f)) 129 if (security_file_alloc(f))
129 goto fail_sec; 130 goto fail_sec;
130 131
131 INIT_LIST_HEAD(&f->f_u.fu_list); 132 INIT_LIST_HEAD(&f->f_u.fu_list);
132 atomic_long_set(&f->f_count, 1); 133 atomic_long_set(&f->f_count, 1);
133 rwlock_init(&f->f_owner.lock); 134 rwlock_init(&f->f_owner.lock);
134 f->f_cred = get_cred(cred);
135 spin_lock_init(&f->f_lock); 135 spin_lock_init(&f->f_lock);
136 eventpoll_init_file(f); 136 eventpoll_init_file(f);
137 /* f->f_version: 0 */ 137 /* f->f_version: 0 */
@@ -190,7 +190,8 @@ struct file *alloc_file(struct path *path, fmode_t mode,
190 file_take_write(file); 190 file_take_write(file);
191 WARN_ON(mnt_clone_write(path->mnt)); 191 WARN_ON(mnt_clone_write(path->mnt));
192 } 192 }
193 ima_counts_get(file); 193 if ((mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ)
194 i_readcount_inc(path->dentry->d_inode);
194 return file; 195 return file;
195} 196}
196EXPORT_SYMBOL(alloc_file); 197EXPORT_SYMBOL(alloc_file);
@@ -246,11 +247,15 @@ static void __fput(struct file *file)
246 file->f_op->release(inode, file); 247 file->f_op->release(inode, file);
247 security_file_free(file); 248 security_file_free(file);
248 ima_file_free(file); 249 ima_file_free(file);
249 if (unlikely(S_ISCHR(inode->i_mode) && inode->i_cdev != NULL)) 250 if (unlikely(S_ISCHR(inode->i_mode) && inode->i_cdev != NULL &&
251 !(file->f_mode & FMODE_PATH))) {
250 cdev_put(inode->i_cdev); 252 cdev_put(inode->i_cdev);
253 }
251 fops_put(file->f_op); 254 fops_put(file->f_op);
252 put_pid(file->f_owner.pid); 255 put_pid(file->f_owner.pid);
253 file_sb_list_del(file); 256 file_sb_list_del(file);
257 if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ)
258 i_readcount_dec(inode);
254 if (file->f_mode & FMODE_WRITE) 259 if (file->f_mode & FMODE_WRITE)
255 drop_file_write_access(file); 260 drop_file_write_access(file);
256 file->f_path.dentry = NULL; 261 file->f_path.dentry = NULL;
@@ -276,11 +281,10 @@ struct file *fget(unsigned int fd)
276 rcu_read_lock(); 281 rcu_read_lock();
277 file = fcheck_files(files, fd); 282 file = fcheck_files(files, fd);
278 if (file) { 283 if (file) {
279 if (!atomic_long_inc_not_zero(&file->f_count)) { 284 /* File object ref couldn't be taken */
280 /* File object ref couldn't be taken */ 285 if (file->f_mode & FMODE_PATH ||
281 rcu_read_unlock(); 286 !atomic_long_inc_not_zero(&file->f_count))
282 return NULL; 287 file = NULL;
283 }
284 } 288 }
285 rcu_read_unlock(); 289 rcu_read_unlock();
286 290
@@ -289,6 +293,25 @@ struct file *fget(unsigned int fd)
289 293
290EXPORT_SYMBOL(fget); 294EXPORT_SYMBOL(fget);
291 295
296struct file *fget_raw(unsigned int fd)
297{
298 struct file *file;
299 struct files_struct *files = current->files;
300
301 rcu_read_lock();
302 file = fcheck_files(files, fd);
303 if (file) {
304 /* File object ref couldn't be taken */
305 if (!atomic_long_inc_not_zero(&file->f_count))
306 file = NULL;
307 }
308 rcu_read_unlock();
309
310 return file;
311}
312
313EXPORT_SYMBOL(fget_raw);
314
292/* 315/*
293 * Lightweight file lookup - no refcnt increment if fd table isn't shared. 316 * Lightweight file lookup - no refcnt increment if fd table isn't shared.
294 * 317 *
@@ -313,6 +336,33 @@ struct file *fget_light(unsigned int fd, int *fput_needed)
313 *fput_needed = 0; 336 *fput_needed = 0;
314 if (atomic_read(&files->count) == 1) { 337 if (atomic_read(&files->count) == 1) {
315 file = fcheck_files(files, fd); 338 file = fcheck_files(files, fd);
339 if (file && (file->f_mode & FMODE_PATH))
340 file = NULL;
341 } else {
342 rcu_read_lock();
343 file = fcheck_files(files, fd);
344 if (file) {
345 if (!(file->f_mode & FMODE_PATH) &&
346 atomic_long_inc_not_zero(&file->f_count))
347 *fput_needed = 1;
348 else
349 /* Didn't get the reference, someone's freed */
350 file = NULL;
351 }
352 rcu_read_unlock();
353 }
354
355 return file;
356}
357
358struct file *fget_raw_light(unsigned int fd, int *fput_needed)
359{
360 struct file *file;
361 struct files_struct *files = current->files;
362
363 *fput_needed = 0;
364 if (atomic_read(&files->count) == 1) {
365 file = fcheck_files(files, fd);
316 } else { 366 } else {
317 rcu_read_lock(); 367 rcu_read_lock();
318 file = fcheck_files(files, fd); 368 file = fcheck_files(files, fd);
diff --git a/fs/freevxfs/vxfs_subr.c b/fs/freevxfs/vxfs_subr.c
index 1429f3ae1e86..5d318c44f855 100644
--- a/fs/freevxfs/vxfs_subr.c
+++ b/fs/freevxfs/vxfs_subr.c
@@ -44,7 +44,6 @@ static sector_t vxfs_bmap(struct address_space *, sector_t);
44const struct address_space_operations vxfs_aops = { 44const struct address_space_operations vxfs_aops = {
45 .readpage = vxfs_readpage, 45 .readpage = vxfs_readpage,
46 .bmap = vxfs_bmap, 46 .bmap = vxfs_bmap,
47 .sync_page = block_sync_page,
48}; 47};
49 48
50inline void 49inline void
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 59c6e4956786..b5ed541fb137 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -176,6 +176,17 @@ void bdi_start_background_writeback(struct backing_dev_info *bdi)
176} 176}
177 177
178/* 178/*
179 * Remove the inode from the writeback list it is on.
180 */
181void inode_wb_list_del(struct inode *inode)
182{
183 spin_lock(&inode_wb_list_lock);
184 list_del_init(&inode->i_wb_list);
185 spin_unlock(&inode_wb_list_lock);
186}
187
188
189/*
179 * Redirty an inode: set its when-it-was dirtied timestamp and move it to the 190 * Redirty an inode: set its when-it-was dirtied timestamp and move it to the
180 * furthest end of its superblock's dirty-inode list. 191 * furthest end of its superblock's dirty-inode list.
181 * 192 *
@@ -188,6 +199,7 @@ static void redirty_tail(struct inode *inode)
188{ 199{
189 struct bdi_writeback *wb = &inode_to_bdi(inode)->wb; 200 struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
190 201
202 assert_spin_locked(&inode_wb_list_lock);
191 if (!list_empty(&wb->b_dirty)) { 203 if (!list_empty(&wb->b_dirty)) {
192 struct inode *tail; 204 struct inode *tail;
193 205
@@ -205,14 +217,17 @@ static void requeue_io(struct inode *inode)
205{ 217{
206 struct bdi_writeback *wb = &inode_to_bdi(inode)->wb; 218 struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
207 219
220 assert_spin_locked(&inode_wb_list_lock);
208 list_move(&inode->i_wb_list, &wb->b_more_io); 221 list_move(&inode->i_wb_list, &wb->b_more_io);
209} 222}
210 223
211static void inode_sync_complete(struct inode *inode) 224static void inode_sync_complete(struct inode *inode)
212{ 225{
213 /* 226 /*
214 * Prevent speculative execution through spin_unlock(&inode_lock); 227 * Prevent speculative execution through
228 * spin_unlock(&inode_wb_list_lock);
215 */ 229 */
230
216 smp_mb(); 231 smp_mb();
217 wake_up_bit(&inode->i_state, __I_SYNC); 232 wake_up_bit(&inode->i_state, __I_SYNC);
218} 233}
@@ -286,6 +301,7 @@ static void move_expired_inodes(struct list_head *delaying_queue,
286 */ 301 */
287static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this) 302static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this)
288{ 303{
304 assert_spin_locked(&inode_wb_list_lock);
289 list_splice_init(&wb->b_more_io, &wb->b_io); 305 list_splice_init(&wb->b_more_io, &wb->b_io);
290 move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this); 306 move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this);
291} 307}
@@ -306,25 +322,25 @@ static void inode_wait_for_writeback(struct inode *inode)
306 wait_queue_head_t *wqh; 322 wait_queue_head_t *wqh;
307 323
308 wqh = bit_waitqueue(&inode->i_state, __I_SYNC); 324 wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
309 while (inode->i_state & I_SYNC) { 325 while (inode->i_state & I_SYNC) {
310 spin_unlock(&inode_lock); 326 spin_unlock(&inode->i_lock);
327 spin_unlock(&inode_wb_list_lock);
311 __wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE); 328 __wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE);
312 spin_lock(&inode_lock); 329 spin_lock(&inode_wb_list_lock);
330 spin_lock(&inode->i_lock);
313 } 331 }
314} 332}
315 333
316/* 334/*
317 * Write out an inode's dirty pages. Called under inode_lock. Either the 335 * Write out an inode's dirty pages. Called under inode_wb_list_lock and
318 * caller has ref on the inode (either via __iget or via syscall against an fd) 336 * inode->i_lock. Either the caller has an active reference on the inode or
319 * or the inode has I_WILL_FREE set (via generic_forget_inode) 337 * the inode has I_WILL_FREE set.
320 * 338 *
321 * If `wait' is set, wait on the writeout. 339 * If `wait' is set, wait on the writeout.
322 * 340 *
323 * The whole writeout design is quite complex and fragile. We want to avoid 341 * The whole writeout design is quite complex and fragile. We want to avoid
324 * starvation of particular inodes when others are being redirtied, prevent 342 * starvation of particular inodes when others are being redirtied, prevent
325 * livelocks, etc. 343 * livelocks, etc.
326 *
327 * Called under inode_lock.
328 */ 344 */
329static int 345static int
330writeback_single_inode(struct inode *inode, struct writeback_control *wbc) 346writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
@@ -333,6 +349,9 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
333 unsigned dirty; 349 unsigned dirty;
334 int ret; 350 int ret;
335 351
352 assert_spin_locked(&inode_wb_list_lock);
353 assert_spin_locked(&inode->i_lock);
354
336 if (!atomic_read(&inode->i_count)) 355 if (!atomic_read(&inode->i_count))
337 WARN_ON(!(inode->i_state & (I_WILL_FREE|I_FREEING))); 356 WARN_ON(!(inode->i_state & (I_WILL_FREE|I_FREEING)));
338 else 357 else
@@ -363,7 +382,8 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
363 /* Set I_SYNC, reset I_DIRTY_PAGES */ 382 /* Set I_SYNC, reset I_DIRTY_PAGES */
364 inode->i_state |= I_SYNC; 383 inode->i_state |= I_SYNC;
365 inode->i_state &= ~I_DIRTY_PAGES; 384 inode->i_state &= ~I_DIRTY_PAGES;
366 spin_unlock(&inode_lock); 385 spin_unlock(&inode->i_lock);
386 spin_unlock(&inode_wb_list_lock);
367 387
368 ret = do_writepages(mapping, wbc); 388 ret = do_writepages(mapping, wbc);
369 389
@@ -383,10 +403,10 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
383 * due to delalloc, clear dirty metadata flags right before 403 * due to delalloc, clear dirty metadata flags right before
384 * write_inode() 404 * write_inode()
385 */ 405 */
386 spin_lock(&inode_lock); 406 spin_lock(&inode->i_lock);
387 dirty = inode->i_state & I_DIRTY; 407 dirty = inode->i_state & I_DIRTY;
388 inode->i_state &= ~(I_DIRTY_SYNC | I_DIRTY_DATASYNC); 408 inode->i_state &= ~(I_DIRTY_SYNC | I_DIRTY_DATASYNC);
389 spin_unlock(&inode_lock); 409 spin_unlock(&inode->i_lock);
390 /* Don't write the inode if only I_DIRTY_PAGES was set */ 410 /* Don't write the inode if only I_DIRTY_PAGES was set */
391 if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) { 411 if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
392 int err = write_inode(inode, wbc); 412 int err = write_inode(inode, wbc);
@@ -394,7 +414,8 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
394 ret = err; 414 ret = err;
395 } 415 }
396 416
397 spin_lock(&inode_lock); 417 spin_lock(&inode_wb_list_lock);
418 spin_lock(&inode->i_lock);
398 inode->i_state &= ~I_SYNC; 419 inode->i_state &= ~I_SYNC;
399 if (!(inode->i_state & I_FREEING)) { 420 if (!(inode->i_state & I_FREEING)) {
400 if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { 421 if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
@@ -506,7 +527,9 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb,
506 * kind does not need peridic writeout yet, and for the latter 527 * kind does not need peridic writeout yet, and for the latter
507 * kind writeout is handled by the freer. 528 * kind writeout is handled by the freer.
508 */ 529 */
530 spin_lock(&inode->i_lock);
509 if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) { 531 if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
532 spin_unlock(&inode->i_lock);
510 requeue_io(inode); 533 requeue_io(inode);
511 continue; 534 continue;
512 } 535 }
@@ -515,10 +538,13 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb,
515 * Was this inode dirtied after sync_sb_inodes was called? 538 * Was this inode dirtied after sync_sb_inodes was called?
516 * This keeps sync from extra jobs and livelock. 539 * This keeps sync from extra jobs and livelock.
517 */ 540 */
518 if (inode_dirtied_after(inode, wbc->wb_start)) 541 if (inode_dirtied_after(inode, wbc->wb_start)) {
542 spin_unlock(&inode->i_lock);
519 return 1; 543 return 1;
544 }
520 545
521 __iget(inode); 546 __iget(inode);
547
522 pages_skipped = wbc->pages_skipped; 548 pages_skipped = wbc->pages_skipped;
523 writeback_single_inode(inode, wbc); 549 writeback_single_inode(inode, wbc);
524 if (wbc->pages_skipped != pages_skipped) { 550 if (wbc->pages_skipped != pages_skipped) {
@@ -528,10 +554,11 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb,
528 */ 554 */
529 redirty_tail(inode); 555 redirty_tail(inode);
530 } 556 }
531 spin_unlock(&inode_lock); 557 spin_unlock(&inode->i_lock);
558 spin_unlock(&inode_wb_list_lock);
532 iput(inode); 559 iput(inode);
533 cond_resched(); 560 cond_resched();
534 spin_lock(&inode_lock); 561 spin_lock(&inode_wb_list_lock);
535 if (wbc->nr_to_write <= 0) { 562 if (wbc->nr_to_write <= 0) {
536 wbc->more_io = 1; 563 wbc->more_io = 1;
537 return 1; 564 return 1;
@@ -550,7 +577,7 @@ void writeback_inodes_wb(struct bdi_writeback *wb,
550 577
551 if (!wbc->wb_start) 578 if (!wbc->wb_start)
552 wbc->wb_start = jiffies; /* livelock avoidance */ 579 wbc->wb_start = jiffies; /* livelock avoidance */
553 spin_lock(&inode_lock); 580 spin_lock(&inode_wb_list_lock);
554 if (!wbc->for_kupdate || list_empty(&wb->b_io)) 581 if (!wbc->for_kupdate || list_empty(&wb->b_io))
555 queue_io(wb, wbc->older_than_this); 582 queue_io(wb, wbc->older_than_this);
556 583
@@ -568,7 +595,7 @@ void writeback_inodes_wb(struct bdi_writeback *wb,
568 if (ret) 595 if (ret)
569 break; 596 break;
570 } 597 }
571 spin_unlock(&inode_lock); 598 spin_unlock(&inode_wb_list_lock);
572 /* Leave any unwritten inodes on b_io */ 599 /* Leave any unwritten inodes on b_io */
573} 600}
574 601
@@ -577,11 +604,11 @@ static void __writeback_inodes_sb(struct super_block *sb,
577{ 604{
578 WARN_ON(!rwsem_is_locked(&sb->s_umount)); 605 WARN_ON(!rwsem_is_locked(&sb->s_umount));
579 606
580 spin_lock(&inode_lock); 607 spin_lock(&inode_wb_list_lock);
581 if (!wbc->for_kupdate || list_empty(&wb->b_io)) 608 if (!wbc->for_kupdate || list_empty(&wb->b_io))
582 queue_io(wb, wbc->older_than_this); 609 queue_io(wb, wbc->older_than_this);
583 writeback_sb_inodes(sb, wb, wbc, true); 610 writeback_sb_inodes(sb, wb, wbc, true);
584 spin_unlock(&inode_lock); 611 spin_unlock(&inode_wb_list_lock);
585} 612}
586 613
587/* 614/*
@@ -720,13 +747,15 @@ static long wb_writeback(struct bdi_writeback *wb,
720 * become available for writeback. Otherwise 747 * become available for writeback. Otherwise
721 * we'll just busyloop. 748 * we'll just busyloop.
722 */ 749 */
723 spin_lock(&inode_lock); 750 spin_lock(&inode_wb_list_lock);
724 if (!list_empty(&wb->b_more_io)) { 751 if (!list_empty(&wb->b_more_io)) {
725 inode = wb_inode(wb->b_more_io.prev); 752 inode = wb_inode(wb->b_more_io.prev);
726 trace_wbc_writeback_wait(&wbc, wb->bdi); 753 trace_wbc_writeback_wait(&wbc, wb->bdi);
754 spin_lock(&inode->i_lock);
727 inode_wait_for_writeback(inode); 755 inode_wait_for_writeback(inode);
756 spin_unlock(&inode->i_lock);
728 } 757 }
729 spin_unlock(&inode_lock); 758 spin_unlock(&inode_wb_list_lock);
730 } 759 }
731 760
732 return wrote; 761 return wrote;
@@ -992,7 +1021,6 @@ void __mark_inode_dirty(struct inode *inode, int flags)
992{ 1021{
993 struct super_block *sb = inode->i_sb; 1022 struct super_block *sb = inode->i_sb;
994 struct backing_dev_info *bdi = NULL; 1023 struct backing_dev_info *bdi = NULL;
995 bool wakeup_bdi = false;
996 1024
997 /* 1025 /*
998 * Don't do this for I_DIRTY_PAGES - that doesn't actually 1026 * Don't do this for I_DIRTY_PAGES - that doesn't actually
@@ -1016,7 +1044,7 @@ void __mark_inode_dirty(struct inode *inode, int flags)
1016 if (unlikely(block_dump)) 1044 if (unlikely(block_dump))
1017 block_dump___mark_inode_dirty(inode); 1045 block_dump___mark_inode_dirty(inode);
1018 1046
1019 spin_lock(&inode_lock); 1047 spin_lock(&inode->i_lock);
1020 if ((inode->i_state & flags) != flags) { 1048 if ((inode->i_state & flags) != flags) {
1021 const int was_dirty = inode->i_state & I_DIRTY; 1049 const int was_dirty = inode->i_state & I_DIRTY;
1022 1050
@@ -1028,7 +1056,7 @@ void __mark_inode_dirty(struct inode *inode, int flags)
1028 * superblock list, based upon its state. 1056 * superblock list, based upon its state.
1029 */ 1057 */
1030 if (inode->i_state & I_SYNC) 1058 if (inode->i_state & I_SYNC)
1031 goto out; 1059 goto out_unlock_inode;
1032 1060
1033 /* 1061 /*
1034 * Only add valid (hashed) inodes to the superblock's 1062 * Only add valid (hashed) inodes to the superblock's
@@ -1036,16 +1064,17 @@ void __mark_inode_dirty(struct inode *inode, int flags)
1036 */ 1064 */
1037 if (!S_ISBLK(inode->i_mode)) { 1065 if (!S_ISBLK(inode->i_mode)) {
1038 if (inode_unhashed(inode)) 1066 if (inode_unhashed(inode))
1039 goto out; 1067 goto out_unlock_inode;
1040 } 1068 }
1041 if (inode->i_state & I_FREEING) 1069 if (inode->i_state & I_FREEING)
1042 goto out; 1070 goto out_unlock_inode;
1043 1071
1044 /* 1072 /*
1045 * If the inode was already on b_dirty/b_io/b_more_io, don't 1073 * If the inode was already on b_dirty/b_io/b_more_io, don't
1046 * reposition it (that would break b_dirty time-ordering). 1074 * reposition it (that would break b_dirty time-ordering).
1047 */ 1075 */
1048 if (!was_dirty) { 1076 if (!was_dirty) {
1077 bool wakeup_bdi = false;
1049 bdi = inode_to_bdi(inode); 1078 bdi = inode_to_bdi(inode);
1050 1079
1051 if (bdi_cap_writeback_dirty(bdi)) { 1080 if (bdi_cap_writeback_dirty(bdi)) {
@@ -1062,15 +1091,20 @@ void __mark_inode_dirty(struct inode *inode, int flags)
1062 wakeup_bdi = true; 1091 wakeup_bdi = true;
1063 } 1092 }
1064 1093
1094 spin_unlock(&inode->i_lock);
1095 spin_lock(&inode_wb_list_lock);
1065 inode->dirtied_when = jiffies; 1096 inode->dirtied_when = jiffies;
1066 list_move(&inode->i_wb_list, &bdi->wb.b_dirty); 1097 list_move(&inode->i_wb_list, &bdi->wb.b_dirty);
1098 spin_unlock(&inode_wb_list_lock);
1099
1100 if (wakeup_bdi)
1101 bdi_wakeup_thread_delayed(bdi);
1102 return;
1067 } 1103 }
1068 } 1104 }
1069out: 1105out_unlock_inode:
1070 spin_unlock(&inode_lock); 1106 spin_unlock(&inode->i_lock);
1071 1107
1072 if (wakeup_bdi)
1073 bdi_wakeup_thread_delayed(bdi);
1074} 1108}
1075EXPORT_SYMBOL(__mark_inode_dirty); 1109EXPORT_SYMBOL(__mark_inode_dirty);
1076 1110
@@ -1101,7 +1135,7 @@ static void wait_sb_inodes(struct super_block *sb)
1101 */ 1135 */
1102 WARN_ON(!rwsem_is_locked(&sb->s_umount)); 1136 WARN_ON(!rwsem_is_locked(&sb->s_umount));
1103 1137
1104 spin_lock(&inode_lock); 1138 spin_lock(&inode_sb_list_lock);
1105 1139
1106 /* 1140 /*
1107 * Data integrity sync. Must wait for all pages under writeback, 1141 * Data integrity sync. Must wait for all pages under writeback,
@@ -1111,22 +1145,25 @@ static void wait_sb_inodes(struct super_block *sb)
1111 * we still have to wait for that writeout. 1145 * we still have to wait for that writeout.
1112 */ 1146 */
1113 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { 1147 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
1114 struct address_space *mapping; 1148 struct address_space *mapping = inode->i_mapping;
1115 1149
1116 if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) 1150 spin_lock(&inode->i_lock);
1117 continue; 1151 if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) ||
1118 mapping = inode->i_mapping; 1152 (mapping->nrpages == 0)) {
1119 if (mapping->nrpages == 0) 1153 spin_unlock(&inode->i_lock);
1120 continue; 1154 continue;
1155 }
1121 __iget(inode); 1156 __iget(inode);
1122 spin_unlock(&inode_lock); 1157 spin_unlock(&inode->i_lock);
1158 spin_unlock(&inode_sb_list_lock);
1159
1123 /* 1160 /*
1124 * We hold a reference to 'inode' so it couldn't have 1161 * We hold a reference to 'inode' so it couldn't have been
1125 * been removed from s_inodes list while we dropped the 1162 * removed from s_inodes list while we dropped the
1126 * inode_lock. We cannot iput the inode now as we can 1163 * inode_sb_list_lock. We cannot iput the inode now as we can
1127 * be holding the last reference and we cannot iput it 1164 * be holding the last reference and we cannot iput it under
1128 * under inode_lock. So we keep the reference and iput 1165 * inode_sb_list_lock. So we keep the reference and iput it
1129 * it later. 1166 * later.
1130 */ 1167 */
1131 iput(old_inode); 1168 iput(old_inode);
1132 old_inode = inode; 1169 old_inode = inode;
@@ -1135,9 +1172,9 @@ static void wait_sb_inodes(struct super_block *sb)
1135 1172
1136 cond_resched(); 1173 cond_resched();
1137 1174
1138 spin_lock(&inode_lock); 1175 spin_lock(&inode_sb_list_lock);
1139 } 1176 }
1140 spin_unlock(&inode_lock); 1177 spin_unlock(&inode_sb_list_lock);
1141 iput(old_inode); 1178 iput(old_inode);
1142} 1179}
1143 1180
@@ -1271,9 +1308,11 @@ int write_inode_now(struct inode *inode, int sync)
1271 wbc.nr_to_write = 0; 1308 wbc.nr_to_write = 0;
1272 1309
1273 might_sleep(); 1310 might_sleep();
1274 spin_lock(&inode_lock); 1311 spin_lock(&inode_wb_list_lock);
1312 spin_lock(&inode->i_lock);
1275 ret = writeback_single_inode(inode, &wbc); 1313 ret = writeback_single_inode(inode, &wbc);
1276 spin_unlock(&inode_lock); 1314 spin_unlock(&inode->i_lock);
1315 spin_unlock(&inode_wb_list_lock);
1277 if (sync) 1316 if (sync)
1278 inode_sync_wait(inode); 1317 inode_sync_wait(inode);
1279 return ret; 1318 return ret;
@@ -1295,9 +1334,11 @@ int sync_inode(struct inode *inode, struct writeback_control *wbc)
1295{ 1334{
1296 int ret; 1335 int ret;
1297 1336
1298 spin_lock(&inode_lock); 1337 spin_lock(&inode_wb_list_lock);
1338 spin_lock(&inode->i_lock);
1299 ret = writeback_single_inode(inode, wbc); 1339 ret = writeback_single_inode(inode, wbc);
1300 spin_unlock(&inode_lock); 1340 spin_unlock(&inode->i_lock);
1341 spin_unlock(&inode_wb_list_lock);
1301 return ret; 1342 return ret;
1302} 1343}
1303EXPORT_SYMBOL(sync_inode); 1344EXPORT_SYMBOL(sync_inode);
diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c
index 3e87cce5837d..b6cca47f7b07 100644
--- a/fs/fuse/cuse.c
+++ b/fs/fuse/cuse.c
@@ -305,7 +305,7 @@ static void cuse_gendev_release(struct device *dev)
305static void cuse_process_init_reply(struct fuse_conn *fc, struct fuse_req *req) 305static void cuse_process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
306{ 306{
307 struct cuse_conn *cc = fc_to_cc(fc); 307 struct cuse_conn *cc = fc_to_cc(fc);
308 struct cuse_init_out *arg = &req->misc.cuse_init_out; 308 struct cuse_init_out *arg = req->out.args[0].value;
309 struct page *page = req->pages[0]; 309 struct page *page = req->pages[0];
310 struct cuse_devinfo devinfo = { }; 310 struct cuse_devinfo devinfo = { };
311 struct device *dev; 311 struct device *dev;
@@ -384,6 +384,7 @@ static void cuse_process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
384 dev_set_uevent_suppress(dev, 0); 384 dev_set_uevent_suppress(dev, 0);
385 kobject_uevent(&dev->kobj, KOBJ_ADD); 385 kobject_uevent(&dev->kobj, KOBJ_ADD);
386out: 386out:
387 kfree(arg);
387 __free_page(page); 388 __free_page(page);
388 return; 389 return;
389 390
@@ -405,6 +406,7 @@ static int cuse_send_init(struct cuse_conn *cc)
405 struct page *page; 406 struct page *page;
406 struct fuse_conn *fc = &cc->fc; 407 struct fuse_conn *fc = &cc->fc;
407 struct cuse_init_in *arg; 408 struct cuse_init_in *arg;
409 void *outarg;
408 410
409 BUILD_BUG_ON(CUSE_INIT_INFO_MAX > PAGE_SIZE); 411 BUILD_BUG_ON(CUSE_INIT_INFO_MAX > PAGE_SIZE);
410 412
@@ -419,6 +421,10 @@ static int cuse_send_init(struct cuse_conn *cc)
419 if (!page) 421 if (!page)
420 goto err_put_req; 422 goto err_put_req;
421 423
424 outarg = kzalloc(sizeof(struct cuse_init_out), GFP_KERNEL);
425 if (!outarg)
426 goto err_free_page;
427
422 arg = &req->misc.cuse_init_in; 428 arg = &req->misc.cuse_init_in;
423 arg->major = FUSE_KERNEL_VERSION; 429 arg->major = FUSE_KERNEL_VERSION;
424 arg->minor = FUSE_KERNEL_MINOR_VERSION; 430 arg->minor = FUSE_KERNEL_MINOR_VERSION;
@@ -429,7 +435,7 @@ static int cuse_send_init(struct cuse_conn *cc)
429 req->in.args[0].value = arg; 435 req->in.args[0].value = arg;
430 req->out.numargs = 2; 436 req->out.numargs = 2;
431 req->out.args[0].size = sizeof(struct cuse_init_out); 437 req->out.args[0].size = sizeof(struct cuse_init_out);
432 req->out.args[0].value = &req->misc.cuse_init_out; 438 req->out.args[0].value = outarg;
433 req->out.args[1].size = CUSE_INIT_INFO_MAX; 439 req->out.args[1].size = CUSE_INIT_INFO_MAX;
434 req->out.argvar = 1; 440 req->out.argvar = 1;
435 req->out.argpages = 1; 441 req->out.argpages = 1;
@@ -440,6 +446,8 @@ static int cuse_send_init(struct cuse_conn *cc)
440 446
441 return 0; 447 return 0;
442 448
449err_free_page:
450 __free_page(page);
443err_put_req: 451err_put_req:
444 fuse_put_request(fc, req); 452 fuse_put_request(fc, req);
445err: 453err:
@@ -458,7 +466,7 @@ static void cuse_fc_release(struct fuse_conn *fc)
458 * @file: file struct being opened 466 * @file: file struct being opened
459 * 467 *
460 * Userland CUSE server can create a CUSE device by opening /dev/cuse 468 * Userland CUSE server can create a CUSE device by opening /dev/cuse
461 * and replying to the initilaization request kernel sends. This 469 * and replying to the initialization request kernel sends. This
462 * function is responsible for handling CUSE device initialization. 470 * function is responsible for handling CUSE device initialization.
463 * Because the fd opened by this function is used during 471 * Because the fd opened by this function is used during
464 * initialization, this function only creates cuse_conn and sends 472 * initialization, this function only creates cuse_conn and sends
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index cf8d28d1fbad..640fc229df10 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -737,14 +737,12 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep)
737 if (WARN_ON(PageMlocked(oldpage))) 737 if (WARN_ON(PageMlocked(oldpage)))
738 goto out_fallback_unlock; 738 goto out_fallback_unlock;
739 739
740 remove_from_page_cache(oldpage); 740 err = replace_page_cache_page(oldpage, newpage, GFP_KERNEL);
741 page_cache_release(oldpage);
742
743 err = add_to_page_cache_locked(newpage, mapping, index, GFP_KERNEL);
744 if (err) { 741 if (err) {
745 printk(KERN_WARNING "fuse_try_move_page: failed to add page"); 742 unlock_page(newpage);
746 goto out_fallback_unlock; 743 return err;
747 } 744 }
745
748 page_cache_get(newpage); 746 page_cache_get(newpage);
749 747
750 if (!(buf->flags & PIPE_BUF_FLAG_LRU)) 748 if (!(buf->flags & PIPE_BUF_FLAG_LRU))
@@ -1910,6 +1908,21 @@ __acquires(fc->lock)
1910 kfree(dequeue_forget(fc, 1, NULL)); 1908 kfree(dequeue_forget(fc, 1, NULL));
1911} 1909}
1912 1910
1911static void end_polls(struct fuse_conn *fc)
1912{
1913 struct rb_node *p;
1914
1915 p = rb_first(&fc->polled_files);
1916
1917 while (p) {
1918 struct fuse_file *ff;
1919 ff = rb_entry(p, struct fuse_file, polled_node);
1920 wake_up_interruptible_all(&ff->poll_wait);
1921
1922 p = rb_next(p);
1923 }
1924}
1925
1913/* 1926/*
1914 * Abort all requests. 1927 * Abort all requests.
1915 * 1928 *
@@ -1937,6 +1950,7 @@ void fuse_abort_conn(struct fuse_conn *fc)
1937 fc->blocked = 0; 1950 fc->blocked = 0;
1938 end_io_requests(fc); 1951 end_io_requests(fc);
1939 end_queued_requests(fc); 1952 end_queued_requests(fc);
1953 end_polls(fc);
1940 wake_up_all(&fc->waitq); 1954 wake_up_all(&fc->waitq);
1941 wake_up_all(&fc->blocked_waitq); 1955 wake_up_all(&fc->blocked_waitq);
1942 kill_fasync(&fc->fasync, SIGIO, POLL_IN); 1956 kill_fasync(&fc->fasync, SIGIO, POLL_IN);
@@ -1953,6 +1967,7 @@ int fuse_dev_release(struct inode *inode, struct file *file)
1953 fc->connected = 0; 1967 fc->connected = 0;
1954 fc->blocked = 0; 1968 fc->blocked = 0;
1955 end_queued_requests(fc); 1969 end_queued_requests(fc);
1970 end_polls(fc);
1956 wake_up_all(&fc->blocked_waitq); 1971 wake_up_all(&fc->blocked_waitq);
1957 spin_unlock(&fc->lock); 1972 spin_unlock(&fc->lock);
1958 fuse_conn_put(fc); 1973 fuse_conn_put(fc);
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index bfed8447ed80..c6ba49bd95b3 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -158,10 +158,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd)
158{ 158{
159 struct inode *inode; 159 struct inode *inode;
160 160
161 if (nd->flags & LOOKUP_RCU) 161 inode = ACCESS_ONCE(entry->d_inode);
162 return -ECHILD;
163
164 inode = entry->d_inode;
165 if (inode && is_bad_inode(inode)) 162 if (inode && is_bad_inode(inode))
166 return 0; 163 return 0;
167 else if (fuse_dentry_time(entry) < get_jiffies_64()) { 164 else if (fuse_dentry_time(entry) < get_jiffies_64()) {
@@ -177,6 +174,9 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd)
177 if (!inode) 174 if (!inode)
178 return 0; 175 return 0;
179 176
177 if (nd->flags & LOOKUP_RCU)
178 return -ECHILD;
179
180 fc = get_fuse_conn(inode); 180 fc = get_fuse_conn(inode);
181 req = fuse_get_req(fc); 181 req = fuse_get_req(fc);
182 if (IS_ERR(req)) 182 if (IS_ERR(req))
@@ -970,6 +970,14 @@ static int fuse_access(struct inode *inode, int mask)
970 return err; 970 return err;
971} 971}
972 972
973static int fuse_perm_getattr(struct inode *inode, int flags)
974{
975 if (flags & IPERM_FLAG_RCU)
976 return -ECHILD;
977
978 return fuse_do_getattr(inode, NULL, NULL);
979}
980
973/* 981/*
974 * Check permission. The two basic access models of FUSE are: 982 * Check permission. The two basic access models of FUSE are:
975 * 983 *
@@ -989,9 +997,6 @@ static int fuse_permission(struct inode *inode, int mask, unsigned int flags)
989 bool refreshed = false; 997 bool refreshed = false;
990 int err = 0; 998 int err = 0;
991 999
992 if (flags & IPERM_FLAG_RCU)
993 return -ECHILD;
994
995 if (!fuse_allow_task(fc, current)) 1000 if (!fuse_allow_task(fc, current))
996 return -EACCES; 1001 return -EACCES;
997 1002
@@ -1000,9 +1005,15 @@ static int fuse_permission(struct inode *inode, int mask, unsigned int flags)
1000 */ 1005 */
1001 if ((fc->flags & FUSE_DEFAULT_PERMISSIONS) || 1006 if ((fc->flags & FUSE_DEFAULT_PERMISSIONS) ||
1002 ((mask & MAY_EXEC) && S_ISREG(inode->i_mode))) { 1007 ((mask & MAY_EXEC) && S_ISREG(inode->i_mode))) {
1003 err = fuse_update_attributes(inode, NULL, NULL, &refreshed); 1008 struct fuse_inode *fi = get_fuse_inode(inode);
1004 if (err) 1009
1005 return err; 1010 if (fi->i_time < get_jiffies_64()) {
1011 refreshed = true;
1012
1013 err = fuse_perm_getattr(inode, flags);
1014 if (err)
1015 return err;
1016 }
1006 } 1017 }
1007 1018
1008 if (fc->flags & FUSE_DEFAULT_PERMISSIONS) { 1019 if (fc->flags & FUSE_DEFAULT_PERMISSIONS) {
@@ -1012,7 +1023,7 @@ static int fuse_permission(struct inode *inode, int mask, unsigned int flags)
1012 attributes. This is also needed, because the root 1023 attributes. This is also needed, because the root
1013 node will at first have no permissions */ 1024 node will at first have no permissions */
1014 if (err == -EACCES && !refreshed) { 1025 if (err == -EACCES && !refreshed) {
1015 err = fuse_do_getattr(inode, NULL, NULL); 1026 err = fuse_perm_getattr(inode, flags);
1016 if (!err) 1027 if (!err)
1017 err = generic_permission(inode, mask, 1028 err = generic_permission(inode, mask,
1018 flags, NULL); 1029 flags, NULL);
@@ -1023,13 +1034,16 @@ static int fuse_permission(struct inode *inode, int mask, unsigned int flags)
1023 noticed immediately, only after the attribute 1034 noticed immediately, only after the attribute
1024 timeout has expired */ 1035 timeout has expired */
1025 } else if (mask & (MAY_ACCESS | MAY_CHDIR)) { 1036 } else if (mask & (MAY_ACCESS | MAY_CHDIR)) {
1037 if (flags & IPERM_FLAG_RCU)
1038 return -ECHILD;
1039
1026 err = fuse_access(inode, mask); 1040 err = fuse_access(inode, mask);
1027 } else if ((mask & MAY_EXEC) && S_ISREG(inode->i_mode)) { 1041 } else if ((mask & MAY_EXEC) && S_ISREG(inode->i_mode)) {
1028 if (!(inode->i_mode & S_IXUGO)) { 1042 if (!(inode->i_mode & S_IXUGO)) {
1029 if (refreshed) 1043 if (refreshed)
1030 return -EACCES; 1044 return -EACCES;
1031 1045
1032 err = fuse_do_getattr(inode, NULL, NULL); 1046 err = fuse_perm_getattr(inode, flags);
1033 if (!err && !(inode->i_mode & S_IXUGO)) 1047 if (!err && !(inode->i_mode & S_IXUGO))
1034 return -EACCES; 1048 return -EACCES;
1035 } 1049 }
@@ -1283,8 +1297,11 @@ static int fuse_do_setattr(struct dentry *entry, struct iattr *attr,
1283 if (err) 1297 if (err)
1284 return err; 1298 return err;
1285 1299
1286 if ((attr->ia_valid & ATTR_OPEN) && fc->atomic_o_trunc) 1300 if (attr->ia_valid & ATTR_OPEN) {
1287 return 0; 1301 if (fc->atomic_o_trunc)
1302 return 0;
1303 file = NULL;
1304 }
1288 1305
1289 if (attr->ia_valid & ATTR_SIZE) 1306 if (attr->ia_valid & ATTR_SIZE)
1290 is_truncate = true; 1307 is_truncate = true;
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 95da1bc1c826..6ea00734984e 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -86,18 +86,52 @@ struct fuse_file *fuse_file_get(struct fuse_file *ff)
86 return ff; 86 return ff;
87} 87}
88 88
89static void fuse_release_async(struct work_struct *work)
90{
91 struct fuse_req *req;
92 struct fuse_conn *fc;
93 struct path path;
94
95 req = container_of(work, struct fuse_req, misc.release.work);
96 path = req->misc.release.path;
97 fc = get_fuse_conn(path.dentry->d_inode);
98
99 fuse_put_request(fc, req);
100 path_put(&path);
101}
102
89static void fuse_release_end(struct fuse_conn *fc, struct fuse_req *req) 103static void fuse_release_end(struct fuse_conn *fc, struct fuse_req *req)
90{ 104{
91 path_put(&req->misc.release.path); 105 if (fc->destroy_req) {
106 /*
107 * If this is a fuseblk mount, then it's possible that
108 * releasing the path will result in releasing the
109 * super block and sending the DESTROY request. If
110 * the server is single threaded, this would hang.
111 * For this reason do the path_put() in a separate
112 * thread.
113 */
114 atomic_inc(&req->count);
115 INIT_WORK(&req->misc.release.work, fuse_release_async);
116 schedule_work(&req->misc.release.work);
117 } else {
118 path_put(&req->misc.release.path);
119 }
92} 120}
93 121
94static void fuse_file_put(struct fuse_file *ff) 122static void fuse_file_put(struct fuse_file *ff, bool sync)
95{ 123{
96 if (atomic_dec_and_test(&ff->count)) { 124 if (atomic_dec_and_test(&ff->count)) {
97 struct fuse_req *req = ff->reserved_req; 125 struct fuse_req *req = ff->reserved_req;
98 126
99 req->end = fuse_release_end; 127 if (sync) {
100 fuse_request_send_background(ff->fc, req); 128 fuse_request_send(ff->fc, req);
129 path_put(&req->misc.release.path);
130 fuse_put_request(ff->fc, req);
131 } else {
132 req->end = fuse_release_end;
133 fuse_request_send_background(ff->fc, req);
134 }
101 kfree(ff); 135 kfree(ff);
102 } 136 }
103} 137}
@@ -188,7 +222,7 @@ static void fuse_prepare_release(struct fuse_file *ff, int flags, int opcode)
188 rb_erase(&ff->polled_node, &fc->polled_files); 222 rb_erase(&ff->polled_node, &fc->polled_files);
189 spin_unlock(&fc->lock); 223 spin_unlock(&fc->lock);
190 224
191 wake_up_interruptible_sync(&ff->poll_wait); 225 wake_up_interruptible_all(&ff->poll_wait);
192 226
193 inarg->fh = ff->fh; 227 inarg->fh = ff->fh;
194 inarg->flags = flags; 228 inarg->flags = flags;
@@ -219,8 +253,12 @@ void fuse_release_common(struct file *file, int opcode)
219 * Normally this will send the RELEASE request, however if 253 * Normally this will send the RELEASE request, however if
220 * some asynchronous READ or WRITE requests are outstanding, 254 * some asynchronous READ or WRITE requests are outstanding,
221 * the sending will be delayed. 255 * the sending will be delayed.
256 *
257 * Make the release synchronous if this is a fuseblk mount,
258 * synchronous RELEASE is allowed (and desirable) in this case
259 * because the server can be trusted not to screw up.
222 */ 260 */
223 fuse_file_put(ff); 261 fuse_file_put(ff, ff->fc->destroy_req != NULL);
224} 262}
225 263
226static int fuse_open(struct inode *inode, struct file *file) 264static int fuse_open(struct inode *inode, struct file *file)
@@ -558,7 +596,7 @@ static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_req *req)
558 page_cache_release(page); 596 page_cache_release(page);
559 } 597 }
560 if (req->ff) 598 if (req->ff)
561 fuse_file_put(req->ff); 599 fuse_file_put(req->ff, false);
562} 600}
563 601
564static void fuse_send_readpages(struct fuse_req *req, struct file *file) 602static void fuse_send_readpages(struct fuse_req *req, struct file *file)
@@ -1137,7 +1175,7 @@ static ssize_t fuse_direct_write(struct file *file, const char __user *buf,
1137static void fuse_writepage_free(struct fuse_conn *fc, struct fuse_req *req) 1175static void fuse_writepage_free(struct fuse_conn *fc, struct fuse_req *req)
1138{ 1176{
1139 __free_page(req->pages[0]); 1177 __free_page(req->pages[0]);
1140 fuse_file_put(req->ff); 1178 fuse_file_put(req->ff, false);
1141} 1179}
1142 1180
1143static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req) 1181static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req)
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index ae5744a2f9e9..b788becada76 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -21,6 +21,7 @@
21#include <linux/rwsem.h> 21#include <linux/rwsem.h>
22#include <linux/rbtree.h> 22#include <linux/rbtree.h>
23#include <linux/poll.h> 23#include <linux/poll.h>
24#include <linux/workqueue.h>
24 25
25/** Max number of pages that can be used in a single read request */ 26/** Max number of pages that can be used in a single read request */
26#define FUSE_MAX_PAGES_PER_REQ 32 27#define FUSE_MAX_PAGES_PER_REQ 32
@@ -262,13 +263,15 @@ struct fuse_req {
262 /** Data for asynchronous requests */ 263 /** Data for asynchronous requests */
263 union { 264 union {
264 struct { 265 struct {
265 struct fuse_release_in in; 266 union {
267 struct fuse_release_in in;
268 struct work_struct work;
269 };
266 struct path path; 270 struct path path;
267 } release; 271 } release;
268 struct fuse_init_in init_in; 272 struct fuse_init_in init_in;
269 struct fuse_init_out init_out; 273 struct fuse_init_out init_out;
270 struct cuse_init_in cuse_init_in; 274 struct cuse_init_in cuse_init_in;
271 struct cuse_init_out cuse_init_out;
272 struct { 275 struct {
273 struct fuse_read_in in; 276 struct fuse_read_in in;
274 u64 attr_ver; 277 u64 attr_ver;
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 9e3f68cc1bd1..cc6ec4b2f0ff 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -637,8 +637,10 @@ static int fuse_encode_fh(struct dentry *dentry, u32 *fh, int *max_len,
637 u64 nodeid; 637 u64 nodeid;
638 u32 generation; 638 u32 generation;
639 639
640 if (*max_len < len) 640 if (*max_len < len) {
641 *max_len = len;
641 return 255; 642 return 255;
643 }
642 644
643 nodeid = get_fuse_inode(inode)->nodeid; 645 nodeid = get_fuse_inode(inode)->nodeid;
644 generation = inode->i_generation; 646 generation = inode->i_generation;
@@ -868,7 +870,6 @@ static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb)
868 870
869 fc->bdi.name = "fuse"; 871 fc->bdi.name = "fuse";
870 fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; 872 fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
871 fc->bdi.unplug_io_fn = default_unplug_io_fn;
872 /* fuse does it's own writeback accounting */ 873 /* fuse does it's own writeback accounting */
873 fc->bdi.capabilities = BDI_CAP_NO_ACCT_WB; 874 fc->bdi.capabilities = BDI_CAP_NO_ACCT_WB;
874 875
diff --git a/fs/generic_acl.c b/fs/generic_acl.c
index 06c48a891832..8f26d1a58912 100644
--- a/fs/generic_acl.c
+++ b/fs/generic_acl.c
@@ -74,7 +74,7 @@ generic_acl_set(struct dentry *dentry, const char *name, const void *value,
74 return -EINVAL; 74 return -EINVAL;
75 if (S_ISLNK(inode->i_mode)) 75 if (S_ISLNK(inode->i_mode))
76 return -EOPNOTSUPP; 76 return -EOPNOTSUPP;
77 if (!is_owner_or_cap(inode)) 77 if (!inode_owner_or_capable(inode))
78 return -EPERM; 78 return -EPERM;
79 if (value) { 79 if (value) {
80 acl = posix_acl_from_xattr(value, size); 80 acl = posix_acl_from_xattr(value, size);
diff --git a/fs/gfs2/Makefile b/fs/gfs2/Makefile
index 21f7e46da4c0..f3d23ef4e876 100644
--- a/fs/gfs2/Makefile
+++ b/fs/gfs2/Makefile
@@ -1,4 +1,4 @@
1EXTRA_CFLAGS := -I$(src) 1ccflags-y := -I$(src)
2obj-$(CONFIG_GFS2_FS) += gfs2.o 2obj-$(CONFIG_GFS2_FS) += gfs2.o
3gfs2-y := acl.o bmap.o dir.o xattr.o glock.o \ 3gfs2-y := acl.o bmap.o dir.o xattr.o glock.o \
4 glops.o inode.o log.o lops.o main.o meta_io.o \ 4 glops.o inode.o log.o lops.o main.o meta_io.o \
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
index 7118f1a780a9..cbc07155b1a0 100644
--- a/fs/gfs2/acl.c
+++ b/fs/gfs2/acl.c
@@ -80,8 +80,11 @@ int gfs2_check_acl(struct inode *inode, int mask, unsigned int flags)
80 struct posix_acl *acl; 80 struct posix_acl *acl;
81 int error; 81 int error;
82 82
83 if (flags & IPERM_FLAG_RCU) 83 if (flags & IPERM_FLAG_RCU) {
84 return -ECHILD; 84 if (!negative_cached_acl(inode, ACL_TYPE_ACCESS))
85 return -ECHILD;
86 return -EAGAIN;
87 }
85 88
86 acl = gfs2_acl_get(GFS2_I(inode), ACL_TYPE_ACCESS); 89 acl = gfs2_acl_get(GFS2_I(inode), ACL_TYPE_ACCESS);
87 if (IS_ERR(acl)) 90 if (IS_ERR(acl))
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 4f36f8832b9b..c71995b111bf 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -695,6 +695,7 @@ out:
695 if (error == 0) 695 if (error == 0)
696 return 0; 696 return 0;
697 697
698 unlock_page(page);
698 page_cache_release(page); 699 page_cache_release(page);
699 700
700 gfs2_trans_end(sdp); 701 gfs2_trans_end(sdp);
@@ -1116,7 +1117,6 @@ static const struct address_space_operations gfs2_writeback_aops = {
1116 .writepages = gfs2_writeback_writepages, 1117 .writepages = gfs2_writeback_writepages,
1117 .readpage = gfs2_readpage, 1118 .readpage = gfs2_readpage,
1118 .readpages = gfs2_readpages, 1119 .readpages = gfs2_readpages,
1119 .sync_page = block_sync_page,
1120 .write_begin = gfs2_write_begin, 1120 .write_begin = gfs2_write_begin,
1121 .write_end = gfs2_write_end, 1121 .write_end = gfs2_write_end,
1122 .bmap = gfs2_bmap, 1122 .bmap = gfs2_bmap,
@@ -1132,7 +1132,6 @@ static const struct address_space_operations gfs2_ordered_aops = {
1132 .writepage = gfs2_ordered_writepage, 1132 .writepage = gfs2_ordered_writepage,
1133 .readpage = gfs2_readpage, 1133 .readpage = gfs2_readpage,
1134 .readpages = gfs2_readpages, 1134 .readpages = gfs2_readpages,
1135 .sync_page = block_sync_page,
1136 .write_begin = gfs2_write_begin, 1135 .write_begin = gfs2_write_begin,
1137 .write_end = gfs2_write_end, 1136 .write_end = gfs2_write_end,
1138 .set_page_dirty = gfs2_set_page_dirty, 1137 .set_page_dirty = gfs2_set_page_dirty,
@@ -1150,7 +1149,6 @@ static const struct address_space_operations gfs2_jdata_aops = {
1150 .writepages = gfs2_jdata_writepages, 1149 .writepages = gfs2_jdata_writepages,
1151 .readpage = gfs2_readpage, 1150 .readpage = gfs2_readpage,
1152 .readpages = gfs2_readpages, 1151 .readpages = gfs2_readpages,
1153 .sync_page = block_sync_page,
1154 .write_begin = gfs2_write_begin, 1152 .write_begin = gfs2_write_begin,
1155 .write_end = gfs2_write_end, 1153 .write_end = gfs2_write_end,
1156 .set_page_dirty = gfs2_set_page_dirty, 1154 .set_page_dirty = gfs2_set_page_dirty,
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 3c4039d5eef1..ef3dc4b9fae2 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -21,6 +21,7 @@
21#include "meta_io.h" 21#include "meta_io.h"
22#include "quota.h" 22#include "quota.h"
23#include "rgrp.h" 23#include "rgrp.h"
24#include "super.h"
24#include "trans.h" 25#include "trans.h"
25#include "dir.h" 26#include "dir.h"
26#include "util.h" 27#include "util.h"
@@ -757,7 +758,7 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
757 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 758 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
758 struct gfs2_rgrp_list rlist; 759 struct gfs2_rgrp_list rlist;
759 u64 bn, bstart; 760 u64 bn, bstart;
760 u32 blen; 761 u32 blen, btotal;
761 __be64 *p; 762 __be64 *p;
762 unsigned int rg_blocks = 0; 763 unsigned int rg_blocks = 0;
763 int metadata; 764 int metadata;
@@ -839,6 +840,7 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
839 840
840 bstart = 0; 841 bstart = 0;
841 blen = 0; 842 blen = 0;
843 btotal = 0;
842 844
843 for (p = top; p < bottom; p++) { 845 for (p = top; p < bottom; p++) {
844 if (!*p) 846 if (!*p)
@@ -851,9 +853,11 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
851 else { 853 else {
852 if (bstart) { 854 if (bstart) {
853 if (metadata) 855 if (metadata)
854 gfs2_free_meta(ip, bstart, blen); 856 __gfs2_free_meta(ip, bstart, blen);
855 else 857 else
856 gfs2_free_data(ip, bstart, blen); 858 __gfs2_free_data(ip, bstart, blen);
859
860 btotal += blen;
857 } 861 }
858 862
859 bstart = bn; 863 bstart = bn;
@@ -865,11 +869,17 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
865 } 869 }
866 if (bstart) { 870 if (bstart) {
867 if (metadata) 871 if (metadata)
868 gfs2_free_meta(ip, bstart, blen); 872 __gfs2_free_meta(ip, bstart, blen);
869 else 873 else
870 gfs2_free_data(ip, bstart, blen); 874 __gfs2_free_data(ip, bstart, blen);
875
876 btotal += blen;
871 } 877 }
872 878
879 gfs2_statfs_change(sdp, 0, +btotal, 0);
880 gfs2_quota_change(ip, -(s64)btotal, ip->i_inode.i_uid,
881 ip->i_inode.i_gid);
882
873 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; 883 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
874 884
875 gfs2_dinode_out(ip, dibh->b_data); 885 gfs2_dinode_out(ip, dibh->b_data);
diff --git a/fs/gfs2/dentry.c b/fs/gfs2/dentry.c
index 4a456338b873..0da8da2c991d 100644
--- a/fs/gfs2/dentry.c
+++ b/fs/gfs2/dentry.c
@@ -44,7 +44,7 @@ static int gfs2_drevalidate(struct dentry *dentry, struct nameidata *nd)
44 int error; 44 int error;
45 int had_lock = 0; 45 int had_lock = 0;
46 46
47 if (nd->flags & LOOKUP_RCU) 47 if (nd && nd->flags & LOOKUP_RCU)
48 return -ECHILD; 48 return -ECHILD;
49 49
50 parent = dget_parent(dentry); 50 parent = dget_parent(dentry);
diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c
index 9023db8184f9..b5a5e60df0d5 100644
--- a/fs/gfs2/export.c
+++ b/fs/gfs2/export.c
@@ -36,9 +36,13 @@ static int gfs2_encode_fh(struct dentry *dentry, __u32 *p, int *len,
36 struct super_block *sb = inode->i_sb; 36 struct super_block *sb = inode->i_sb;
37 struct gfs2_inode *ip = GFS2_I(inode); 37 struct gfs2_inode *ip = GFS2_I(inode);
38 38
39 if (*len < GFS2_SMALL_FH_SIZE || 39 if (connectable && (*len < GFS2_LARGE_FH_SIZE)) {
40 (connectable && *len < GFS2_LARGE_FH_SIZE)) 40 *len = GFS2_LARGE_FH_SIZE;
41 return 255; 41 return 255;
42 } else if (*len < GFS2_SMALL_FH_SIZE) {
43 *len = GFS2_SMALL_FH_SIZE;
44 return 255;
45 }
42 46
43 fh[0] = cpu_to_be32(ip->i_no_formal_ino >> 32); 47 fh[0] = cpu_to_be32(ip->i_no_formal_ino >> 32);
44 fh[1] = cpu_to_be32(ip->i_no_formal_ino & 0xFFFFFFFF); 48 fh[1] = cpu_to_be32(ip->i_no_formal_ino & 0xFFFFFFFF);
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 7cfdcb913363..b2682e073eee 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -221,7 +221,7 @@ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask)
221 goto out_drop_write; 221 goto out_drop_write;
222 222
223 error = -EACCES; 223 error = -EACCES;
224 if (!is_owner_or_cap(inode)) 224 if (!inode_owner_or_capable(inode))
225 goto out; 225 goto out;
226 226
227 error = 0; 227 error = 0;
@@ -448,15 +448,20 @@ static int gfs2_mmap(struct file *file, struct vm_area_struct *vma)
448{ 448{
449 struct gfs2_inode *ip = GFS2_I(file->f_mapping->host); 449 struct gfs2_inode *ip = GFS2_I(file->f_mapping->host);
450 450
451 if (!(file->f_flags & O_NOATIME)) { 451 if (!(file->f_flags & O_NOATIME) &&
452 !IS_NOATIME(&ip->i_inode)) {
452 struct gfs2_holder i_gh; 453 struct gfs2_holder i_gh;
453 int error; 454 int error;
454 455
455 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh); 456 gfs2_holder_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
456 error = gfs2_glock_nq(&i_gh); 457 error = gfs2_glock_nq(&i_gh);
457 file_accessed(file); 458 if (error == 0) {
458 if (error == 0) 459 file_accessed(file);
459 gfs2_glock_dq_uninit(&i_gh); 460 gfs2_glock_dq(&i_gh);
461 }
462 gfs2_holder_uninit(&i_gh);
463 if (error)
464 return error;
460 } 465 }
461 vma->vm_ops = &gfs2_vm_ops; 466 vma->vm_ops = &gfs2_vm_ops;
462 vma->vm_flags |= VM_CAN_NONLINEAR; 467 vma->vm_flags |= VM_CAN_NONLINEAR;
@@ -617,8 +622,7 @@ static void empty_write_end(struct page *page, unsigned from,
617{ 622{
618 struct gfs2_inode *ip = GFS2_I(page->mapping->host); 623 struct gfs2_inode *ip = GFS2_I(page->mapping->host);
619 624
620 page_zero_new_buffers(page, from, to); 625 zero_user(page, from, to-from);
621 flush_dcache_page(page);
622 mark_page_accessed(page); 626 mark_page_accessed(page);
623 627
624 if (!gfs2_is_writeback(ip)) 628 if (!gfs2_is_writeback(ip))
@@ -627,36 +631,43 @@ static void empty_write_end(struct page *page, unsigned from,
627 block_commit_write(page, from, to); 631 block_commit_write(page, from, to);
628} 632}
629 633
630static int write_empty_blocks(struct page *page, unsigned from, unsigned to) 634static int needs_empty_write(sector_t block, struct inode *inode)
631{ 635{
632 unsigned start, end, next;
633 struct buffer_head *bh, *head;
634 int error; 636 int error;
637 struct buffer_head bh_map = { .b_state = 0, .b_blocknr = 0 };
635 638
636 if (!page_has_buffers(page)) { 639 bh_map.b_size = 1 << inode->i_blkbits;
637 error = __block_write_begin(page, from, to - from, gfs2_block_map); 640 error = gfs2_block_map(inode, block, &bh_map, 0);
638 if (unlikely(error)) 641 if (unlikely(error))
639 return error; 642 return error;
643 return !buffer_mapped(&bh_map);
644}
640 645
641 empty_write_end(page, from, to); 646static int write_empty_blocks(struct page *page, unsigned from, unsigned to)
642 return 0; 647{
643 } 648 struct inode *inode = page->mapping->host;
649 unsigned start, end, next, blksize;
650 sector_t block = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
651 int ret;
644 652
645 bh = head = page_buffers(page); 653 blksize = 1 << inode->i_blkbits;
646 next = end = 0; 654 next = end = 0;
647 while (next < from) { 655 while (next < from) {
648 next += bh->b_size; 656 next += blksize;
649 bh = bh->b_this_page; 657 block++;
650 } 658 }
651 start = next; 659 start = next;
652 do { 660 do {
653 next += bh->b_size; 661 next += blksize;
654 if (buffer_mapped(bh)) { 662 ret = needs_empty_write(block, inode);
663 if (unlikely(ret < 0))
664 return ret;
665 if (ret == 0) {
655 if (end) { 666 if (end) {
656 error = __block_write_begin(page, start, end - start, 667 ret = __block_write_begin(page, start, end - start,
657 gfs2_block_map); 668 gfs2_block_map);
658 if (unlikely(error)) 669 if (unlikely(ret))
659 return error; 670 return ret;
660 empty_write_end(page, start, end); 671 empty_write_end(page, start, end);
661 end = 0; 672 end = 0;
662 } 673 }
@@ -664,13 +675,13 @@ static int write_empty_blocks(struct page *page, unsigned from, unsigned to)
664 } 675 }
665 else 676 else
666 end = next; 677 end = next;
667 bh = bh->b_this_page; 678 block++;
668 } while (next < to); 679 } while (next < to);
669 680
670 if (end) { 681 if (end) {
671 error = __block_write_begin(page, start, end - start, gfs2_block_map); 682 ret = __block_write_begin(page, start, end - start, gfs2_block_map);
672 if (unlikely(error)) 683 if (unlikely(ret))
673 return error; 684 return ret;
674 empty_write_end(page, start, end); 685 empty_write_end(page, start, end);
675 } 686 }
676 687
@@ -976,8 +987,10 @@ static void do_unflock(struct file *file, struct file_lock *fl)
976 987
977 mutex_lock(&fp->f_fl_mutex); 988 mutex_lock(&fp->f_fl_mutex);
978 flock_lock_file_wait(file, fl); 989 flock_lock_file_wait(file, fl);
979 if (fl_gh->gh_gl) 990 if (fl_gh->gh_gl) {
980 gfs2_glock_dq_uninit(fl_gh); 991 gfs2_glock_dq_wait(fl_gh);
992 gfs2_holder_uninit(fl_gh);
993 }
981 mutex_unlock(&fp->f_fl_mutex); 994 mutex_unlock(&fp->f_fl_mutex);
982} 995}
983 996
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 08a8beb152e6..e2431313491f 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -26,6 +26,9 @@
26#include <linux/freezer.h> 26#include <linux/freezer.h>
27#include <linux/workqueue.h> 27#include <linux/workqueue.h>
28#include <linux/jiffies.h> 28#include <linux/jiffies.h>
29#include <linux/rcupdate.h>
30#include <linux/rculist_bl.h>
31#include <linux/bit_spinlock.h>
29 32
30#include "gfs2.h" 33#include "gfs2.h"
31#include "incore.h" 34#include "incore.h"
@@ -41,10 +44,6 @@
41#define CREATE_TRACE_POINTS 44#define CREATE_TRACE_POINTS
42#include "trace_gfs2.h" 45#include "trace_gfs2.h"
43 46
44struct gfs2_gl_hash_bucket {
45 struct hlist_head hb_list;
46};
47
48struct gfs2_glock_iter { 47struct gfs2_glock_iter {
49 int hash; /* hash bucket index */ 48 int hash; /* hash bucket index */
50 struct gfs2_sbd *sdp; /* incore superblock */ 49 struct gfs2_sbd *sdp; /* incore superblock */
@@ -54,7 +53,6 @@ struct gfs2_glock_iter {
54 53
55typedef void (*glock_examiner) (struct gfs2_glock * gl); 54typedef void (*glock_examiner) (struct gfs2_glock * gl);
56 55
57static int gfs2_dump_lockstate(struct gfs2_sbd *sdp);
58static int __dump_glock(struct seq_file *seq, const struct gfs2_glock *gl); 56static int __dump_glock(struct seq_file *seq, const struct gfs2_glock *gl);
59#define GLOCK_BUG_ON(gl,x) do { if (unlikely(x)) { __dump_glock(NULL, gl); BUG(); } } while(0) 57#define GLOCK_BUG_ON(gl,x) do { if (unlikely(x)) { __dump_glock(NULL, gl); BUG(); } } while(0)
60static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int target); 58static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int target);
@@ -70,57 +68,9 @@ static DEFINE_SPINLOCK(lru_lock);
70#define GFS2_GL_HASH_SIZE (1 << GFS2_GL_HASH_SHIFT) 68#define GFS2_GL_HASH_SIZE (1 << GFS2_GL_HASH_SHIFT)
71#define GFS2_GL_HASH_MASK (GFS2_GL_HASH_SIZE - 1) 69#define GFS2_GL_HASH_MASK (GFS2_GL_HASH_SIZE - 1)
72 70
73static struct gfs2_gl_hash_bucket gl_hash_table[GFS2_GL_HASH_SIZE]; 71static struct hlist_bl_head gl_hash_table[GFS2_GL_HASH_SIZE];
74static struct dentry *gfs2_root; 72static struct dentry *gfs2_root;
75 73
76/*
77 * Despite what you might think, the numbers below are not arbitrary :-)
78 * They are taken from the ipv4 routing hash code, which is well tested
79 * and thus should be nearly optimal. Later on we might tweek the numbers
80 * but for now this should be fine.
81 *
82 * The reason for putting the locks in a separate array from the list heads
83 * is that we can have fewer locks than list heads and save memory. We use
84 * the same hash function for both, but with a different hash mask.
85 */
86#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
87 defined(CONFIG_PROVE_LOCKING)
88
89#ifdef CONFIG_LOCKDEP
90# define GL_HASH_LOCK_SZ 256
91#else
92# if NR_CPUS >= 32
93# define GL_HASH_LOCK_SZ 4096
94# elif NR_CPUS >= 16
95# define GL_HASH_LOCK_SZ 2048
96# elif NR_CPUS >= 8
97# define GL_HASH_LOCK_SZ 1024
98# elif NR_CPUS >= 4
99# define GL_HASH_LOCK_SZ 512
100# else
101# define GL_HASH_LOCK_SZ 256
102# endif
103#endif
104
105/* We never want more locks than chains */
106#if GFS2_GL_HASH_SIZE < GL_HASH_LOCK_SZ
107# undef GL_HASH_LOCK_SZ
108# define GL_HASH_LOCK_SZ GFS2_GL_HASH_SIZE
109#endif
110
111static rwlock_t gl_hash_locks[GL_HASH_LOCK_SZ];
112
113static inline rwlock_t *gl_lock_addr(unsigned int x)
114{
115 return &gl_hash_locks[x & (GL_HASH_LOCK_SZ-1)];
116}
117#else /* not SMP, so no spinlocks required */
118static inline rwlock_t *gl_lock_addr(unsigned int x)
119{
120 return NULL;
121}
122#endif
123
124/** 74/**
125 * gl_hash() - Turn glock number into hash bucket number 75 * gl_hash() - Turn glock number into hash bucket number
126 * @lock: The glock number 76 * @lock: The glock number
@@ -141,25 +91,35 @@ static unsigned int gl_hash(const struct gfs2_sbd *sdp,
141 return h; 91 return h;
142} 92}
143 93
144/** 94static inline void spin_lock_bucket(unsigned int hash)
145 * glock_free() - Perform a few checks and then release struct gfs2_glock 95{
146 * @gl: The glock to release 96 struct hlist_bl_head *bl = &gl_hash_table[hash];
147 * 97 bit_spin_lock(0, (unsigned long *)bl);
148 * Also calls lock module to release its internal structure for this glock. 98}
149 *
150 */
151 99
152static void glock_free(struct gfs2_glock *gl) 100static inline void spin_unlock_bucket(unsigned int hash)
101{
102 struct hlist_bl_head *bl = &gl_hash_table[hash];
103 __bit_spin_unlock(0, (unsigned long *)bl);
104}
105
106static void gfs2_glock_dealloc(struct rcu_head *rcu)
107{
108 struct gfs2_glock *gl = container_of(rcu, struct gfs2_glock, gl_rcu);
109
110 if (gl->gl_ops->go_flags & GLOF_ASPACE)
111 kmem_cache_free(gfs2_glock_aspace_cachep, gl);
112 else
113 kmem_cache_free(gfs2_glock_cachep, gl);
114}
115
116void gfs2_glock_free(struct gfs2_glock *gl)
153{ 117{
154 struct gfs2_sbd *sdp = gl->gl_sbd; 118 struct gfs2_sbd *sdp = gl->gl_sbd;
155 struct address_space *mapping = gfs2_glock2aspace(gl);
156 struct kmem_cache *cachep = gfs2_glock_cachep;
157 119
158 GLOCK_BUG_ON(gl, mapping && mapping->nrpages); 120 call_rcu(&gl->gl_rcu, gfs2_glock_dealloc);
159 trace_gfs2_glock_put(gl); 121 if (atomic_dec_and_test(&sdp->sd_glock_disposal))
160 if (mapping) 122 wake_up(&sdp->sd_glock_wait);
161 cachep = gfs2_glock_aspace_cachep;
162 sdp->sd_lockstruct.ls_ops->lm_put_lock(cachep, gl);
163} 123}
164 124
165/** 125/**
@@ -185,34 +145,49 @@ static int demote_ok(const struct gfs2_glock *gl)
185{ 145{
186 const struct gfs2_glock_operations *glops = gl->gl_ops; 146 const struct gfs2_glock_operations *glops = gl->gl_ops;
187 147
148 /* assert_spin_locked(&gl->gl_spin); */
149
188 if (gl->gl_state == LM_ST_UNLOCKED) 150 if (gl->gl_state == LM_ST_UNLOCKED)
189 return 0; 151 return 0;
190 if (!list_empty(&gl->gl_holders)) 152 if (test_bit(GLF_LFLUSH, &gl->gl_flags))
153 return 0;
154 if ((gl->gl_name.ln_type != LM_TYPE_INODE) &&
155 !list_empty(&gl->gl_holders))
191 return 0; 156 return 0;
192 if (glops->go_demote_ok) 157 if (glops->go_demote_ok)
193 return glops->go_demote_ok(gl); 158 return glops->go_demote_ok(gl);
194 return 1; 159 return 1;
195} 160}
196 161
162
197/** 163/**
198 * gfs2_glock_schedule_for_reclaim - Add a glock to the reclaim list 164 * __gfs2_glock_schedule_for_reclaim - Add a glock to the reclaim list
199 * @gl: the glock 165 * @gl: the glock
200 * 166 *
167 * If the glock is demotable, then we add it (or move it) to the end
168 * of the glock LRU list.
201 */ 169 */
202 170
203static void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl) 171static void __gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl)
204{ 172{
205 int may_reclaim; 173 if (demote_ok(gl)) {
206 may_reclaim = (demote_ok(gl) && 174 spin_lock(&lru_lock);
207 (atomic_read(&gl->gl_ref) == 1 || 175
208 (gl->gl_name.ln_type == LM_TYPE_INODE && 176 if (!list_empty(&gl->gl_lru))
209 atomic_read(&gl->gl_ref) <= 2))); 177 list_del_init(&gl->gl_lru);
210 spin_lock(&lru_lock); 178 else
211 if (list_empty(&gl->gl_lru) && may_reclaim) { 179 atomic_inc(&lru_count);
180
212 list_add_tail(&gl->gl_lru, &lru_list); 181 list_add_tail(&gl->gl_lru, &lru_list);
213 atomic_inc(&lru_count); 182 spin_unlock(&lru_lock);
214 } 183 }
215 spin_unlock(&lru_lock); 184}
185
186void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl)
187{
188 spin_lock(&gl->gl_spin);
189 __gfs2_glock_schedule_for_reclaim(gl);
190 spin_unlock(&gl->gl_spin);
216} 191}
217 192
218/** 193/**
@@ -227,7 +202,6 @@ void gfs2_glock_put_nolock(struct gfs2_glock *gl)
227{ 202{
228 if (atomic_dec_and_test(&gl->gl_ref)) 203 if (atomic_dec_and_test(&gl->gl_ref))
229 GLOCK_BUG_ON(gl, 1); 204 GLOCK_BUG_ON(gl, 1);
230 gfs2_glock_schedule_for_reclaim(gl);
231} 205}
232 206
233/** 207/**
@@ -236,30 +210,26 @@ void gfs2_glock_put_nolock(struct gfs2_glock *gl)
236 * 210 *
237 */ 211 */
238 212
239int gfs2_glock_put(struct gfs2_glock *gl) 213void gfs2_glock_put(struct gfs2_glock *gl)
240{ 214{
241 int rv = 0; 215 struct gfs2_sbd *sdp = gl->gl_sbd;
216 struct address_space *mapping = gfs2_glock2aspace(gl);
242 217
243 write_lock(gl_lock_addr(gl->gl_hash)); 218 if (atomic_dec_and_test(&gl->gl_ref)) {
244 if (atomic_dec_and_lock(&gl->gl_ref, &lru_lock)) { 219 spin_lock_bucket(gl->gl_hash);
245 hlist_del(&gl->gl_list); 220 hlist_bl_del_rcu(&gl->gl_list);
221 spin_unlock_bucket(gl->gl_hash);
222 spin_lock(&lru_lock);
246 if (!list_empty(&gl->gl_lru)) { 223 if (!list_empty(&gl->gl_lru)) {
247 list_del_init(&gl->gl_lru); 224 list_del_init(&gl->gl_lru);
248 atomic_dec(&lru_count); 225 atomic_dec(&lru_count);
249 } 226 }
250 spin_unlock(&lru_lock); 227 spin_unlock(&lru_lock);
251 write_unlock(gl_lock_addr(gl->gl_hash));
252 GLOCK_BUG_ON(gl, !list_empty(&gl->gl_holders)); 228 GLOCK_BUG_ON(gl, !list_empty(&gl->gl_holders));
253 glock_free(gl); 229 GLOCK_BUG_ON(gl, mapping && mapping->nrpages);
254 rv = 1; 230 trace_gfs2_glock_put(gl);
255 goto out; 231 sdp->sd_lockstruct.ls_ops->lm_put_lock(gl);
256 } 232 }
257 spin_lock(&gl->gl_spin);
258 gfs2_glock_schedule_for_reclaim(gl);
259 spin_unlock(&gl->gl_spin);
260 write_unlock(gl_lock_addr(gl->gl_hash));
261out:
262 return rv;
263} 233}
264 234
265/** 235/**
@@ -275,17 +245,15 @@ static struct gfs2_glock *search_bucket(unsigned int hash,
275 const struct lm_lockname *name) 245 const struct lm_lockname *name)
276{ 246{
277 struct gfs2_glock *gl; 247 struct gfs2_glock *gl;
278 struct hlist_node *h; 248 struct hlist_bl_node *h;
279 249
280 hlist_for_each_entry(gl, h, &gl_hash_table[hash].hb_list, gl_list) { 250 hlist_bl_for_each_entry_rcu(gl, h, &gl_hash_table[hash], gl_list) {
281 if (!lm_name_equal(&gl->gl_name, name)) 251 if (!lm_name_equal(&gl->gl_name, name))
282 continue; 252 continue;
283 if (gl->gl_sbd != sdp) 253 if (gl->gl_sbd != sdp)
284 continue; 254 continue;
285 255 if (atomic_inc_not_zero(&gl->gl_ref))
286 atomic_inc(&gl->gl_ref); 256 return gl;
287
288 return gl;
289 } 257 }
290 258
291 return NULL; 259 return NULL;
@@ -743,10 +711,11 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
743 struct gfs2_glock *gl, *tmp; 711 struct gfs2_glock *gl, *tmp;
744 unsigned int hash = gl_hash(sdp, &name); 712 unsigned int hash = gl_hash(sdp, &name);
745 struct address_space *mapping; 713 struct address_space *mapping;
714 struct kmem_cache *cachep;
746 715
747 read_lock(gl_lock_addr(hash)); 716 rcu_read_lock();
748 gl = search_bucket(hash, sdp, &name); 717 gl = search_bucket(hash, sdp, &name);
749 read_unlock(gl_lock_addr(hash)); 718 rcu_read_unlock();
750 719
751 *glp = gl; 720 *glp = gl;
752 if (gl) 721 if (gl)
@@ -755,9 +724,10 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
755 return -ENOENT; 724 return -ENOENT;
756 725
757 if (glops->go_flags & GLOF_ASPACE) 726 if (glops->go_flags & GLOF_ASPACE)
758 gl = kmem_cache_alloc(gfs2_glock_aspace_cachep, GFP_KERNEL); 727 cachep = gfs2_glock_aspace_cachep;
759 else 728 else
760 gl = kmem_cache_alloc(gfs2_glock_cachep, GFP_KERNEL); 729 cachep = gfs2_glock_cachep;
730 gl = kmem_cache_alloc(cachep, GFP_KERNEL);
761 if (!gl) 731 if (!gl)
762 return -ENOMEM; 732 return -ENOMEM;
763 733
@@ -790,15 +760,16 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
790 mapping->writeback_index = 0; 760 mapping->writeback_index = 0;
791 } 761 }
792 762
793 write_lock(gl_lock_addr(hash)); 763 spin_lock_bucket(hash);
794 tmp = search_bucket(hash, sdp, &name); 764 tmp = search_bucket(hash, sdp, &name);
795 if (tmp) { 765 if (tmp) {
796 write_unlock(gl_lock_addr(hash)); 766 spin_unlock_bucket(hash);
797 glock_free(gl); 767 kmem_cache_free(cachep, gl);
768 atomic_dec(&sdp->sd_glock_disposal);
798 gl = tmp; 769 gl = tmp;
799 } else { 770 } else {
800 hlist_add_head(&gl->gl_list, &gl_hash_table[hash].hb_list); 771 hlist_bl_add_head_rcu(&gl->gl_list, &gl_hash_table[hash]);
801 write_unlock(gl_lock_addr(hash)); 772 spin_unlock_bucket(hash);
802 } 773 }
803 774
804 *glp = gl; 775 *glp = gl;
@@ -1007,13 +978,13 @@ fail:
1007 insert_pt = &gh2->gh_list; 978 insert_pt = &gh2->gh_list;
1008 } 979 }
1009 set_bit(GLF_QUEUED, &gl->gl_flags); 980 set_bit(GLF_QUEUED, &gl->gl_flags);
981 trace_gfs2_glock_queue(gh, 1);
1010 if (likely(insert_pt == NULL)) { 982 if (likely(insert_pt == NULL)) {
1011 list_add_tail(&gh->gh_list, &gl->gl_holders); 983 list_add_tail(&gh->gh_list, &gl->gl_holders);
1012 if (unlikely(gh->gh_flags & LM_FLAG_PRIORITY)) 984 if (unlikely(gh->gh_flags & LM_FLAG_PRIORITY))
1013 goto do_cancel; 985 goto do_cancel;
1014 return; 986 return;
1015 } 987 }
1016 trace_gfs2_glock_queue(gh, 1);
1017 list_add_tail(&gh->gh_list, insert_pt); 988 list_add_tail(&gh->gh_list, insert_pt);
1018do_cancel: 989do_cancel:
1019 gh = list_entry(gl->gl_holders.next, struct gfs2_holder, gh_list); 990 gh = list_entry(gl->gl_holders.next, struct gfs2_holder, gh_list);
@@ -1113,6 +1084,7 @@ void gfs2_glock_dq(struct gfs2_holder *gh)
1113 !test_bit(GLF_DEMOTE, &gl->gl_flags)) 1084 !test_bit(GLF_DEMOTE, &gl->gl_flags))
1114 fast_path = 1; 1085 fast_path = 1;
1115 } 1086 }
1087 __gfs2_glock_schedule_for_reclaim(gl);
1116 trace_gfs2_glock_queue(gh, 0); 1088 trace_gfs2_glock_queue(gh, 0);
1117 spin_unlock(&gl->gl_spin); 1089 spin_unlock(&gl->gl_spin);
1118 if (likely(fast_path)) 1090 if (likely(fast_path))
@@ -1276,10 +1248,8 @@ int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs)
1276 1248
1277void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs) 1249void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs)
1278{ 1250{
1279 unsigned int x; 1251 while (num_gh--)
1280 1252 gfs2_glock_dq(&ghs[num_gh]);
1281 for (x = 0; x < num_gh; x++)
1282 gfs2_glock_dq(&ghs[x]);
1283} 1253}
1284 1254
1285/** 1255/**
@@ -1291,10 +1261,8 @@ void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs)
1291 1261
1292void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs) 1262void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs)
1293{ 1263{
1294 unsigned int x; 1264 while (num_gh--)
1295 1265 gfs2_glock_dq_uninit(&ghs[num_gh]);
1296 for (x = 0; x < num_gh; x++)
1297 gfs2_glock_dq_uninit(&ghs[x]);
1298} 1266}
1299 1267
1300void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state) 1268void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state)
@@ -1440,42 +1408,30 @@ static struct shrinker glock_shrinker = {
1440 * @sdp: the filesystem 1408 * @sdp: the filesystem
1441 * @bucket: the bucket 1409 * @bucket: the bucket
1442 * 1410 *
1443 * Returns: 1 if the bucket has entries
1444 */ 1411 */
1445 1412
1446static int examine_bucket(glock_examiner examiner, struct gfs2_sbd *sdp, 1413static void examine_bucket(glock_examiner examiner, const struct gfs2_sbd *sdp,
1447 unsigned int hash) 1414 unsigned int hash)
1448{ 1415{
1449 struct gfs2_glock *gl, *prev = NULL; 1416 struct gfs2_glock *gl;
1450 int has_entries = 0; 1417 struct hlist_bl_head *head = &gl_hash_table[hash];
1451 struct hlist_head *head = &gl_hash_table[hash].hb_list; 1418 struct hlist_bl_node *pos;
1452 1419
1453 read_lock(gl_lock_addr(hash)); 1420 rcu_read_lock();
1454 /* Can't use hlist_for_each_entry - don't want prefetch here */ 1421 hlist_bl_for_each_entry_rcu(gl, pos, head, gl_list) {
1455 if (hlist_empty(head)) 1422 if ((gl->gl_sbd == sdp) && atomic_read(&gl->gl_ref))
1456 goto out;
1457 gl = list_entry(head->first, struct gfs2_glock, gl_list);
1458 while(1) {
1459 if (!sdp || gl->gl_sbd == sdp) {
1460 gfs2_glock_hold(gl);
1461 read_unlock(gl_lock_addr(hash));
1462 if (prev)
1463 gfs2_glock_put(prev);
1464 prev = gl;
1465 examiner(gl); 1423 examiner(gl);
1466 has_entries = 1;
1467 read_lock(gl_lock_addr(hash));
1468 }
1469 if (gl->gl_list.next == NULL)
1470 break;
1471 gl = list_entry(gl->gl_list.next, struct gfs2_glock, gl_list);
1472 } 1424 }
1473out: 1425 rcu_read_unlock();
1474 read_unlock(gl_lock_addr(hash));
1475 if (prev)
1476 gfs2_glock_put(prev);
1477 cond_resched(); 1426 cond_resched();
1478 return has_entries; 1427}
1428
1429static void glock_hash_walk(glock_examiner examiner, const struct gfs2_sbd *sdp)
1430{
1431 unsigned x;
1432
1433 for (x = 0; x < GFS2_GL_HASH_SIZE; x++)
1434 examine_bucket(examiner, sdp, x);
1479} 1435}
1480 1436
1481 1437
@@ -1529,10 +1485,21 @@ static void clear_glock(struct gfs2_glock *gl)
1529 1485
1530void gfs2_glock_thaw(struct gfs2_sbd *sdp) 1486void gfs2_glock_thaw(struct gfs2_sbd *sdp)
1531{ 1487{
1532 unsigned x; 1488 glock_hash_walk(thaw_glock, sdp);
1489}
1533 1490
1534 for (x = 0; x < GFS2_GL_HASH_SIZE; x++) 1491static int dump_glock(struct seq_file *seq, struct gfs2_glock *gl)
1535 examine_bucket(thaw_glock, sdp, x); 1492{
1493 int ret;
1494 spin_lock(&gl->gl_spin);
1495 ret = __dump_glock(seq, gl);
1496 spin_unlock(&gl->gl_spin);
1497 return ret;
1498}
1499
1500static void dump_glock_func(struct gfs2_glock *gl)
1501{
1502 dump_glock(NULL, gl);
1536} 1503}
1537 1504
1538/** 1505/**
@@ -1545,13 +1512,10 @@ void gfs2_glock_thaw(struct gfs2_sbd *sdp)
1545 1512
1546void gfs2_gl_hash_clear(struct gfs2_sbd *sdp) 1513void gfs2_gl_hash_clear(struct gfs2_sbd *sdp)
1547{ 1514{
1548 unsigned int x; 1515 glock_hash_walk(clear_glock, sdp);
1549
1550 for (x = 0; x < GFS2_GL_HASH_SIZE; x++)
1551 examine_bucket(clear_glock, sdp, x);
1552 flush_workqueue(glock_workqueue); 1516 flush_workqueue(glock_workqueue);
1553 wait_event(sdp->sd_glock_wait, atomic_read(&sdp->sd_glock_disposal) == 0); 1517 wait_event(sdp->sd_glock_wait, atomic_read(&sdp->sd_glock_disposal) == 0);
1554 gfs2_dump_lockstate(sdp); 1518 glock_hash_walk(dump_glock_func, sdp);
1555} 1519}
1556 1520
1557void gfs2_glock_finish_truncate(struct gfs2_inode *ip) 1521void gfs2_glock_finish_truncate(struct gfs2_inode *ip)
@@ -1717,73 +1681,22 @@ out:
1717 return error; 1681 return error;
1718} 1682}
1719 1683
1720static int dump_glock(struct seq_file *seq, struct gfs2_glock *gl)
1721{
1722 int ret;
1723 spin_lock(&gl->gl_spin);
1724 ret = __dump_glock(seq, gl);
1725 spin_unlock(&gl->gl_spin);
1726 return ret;
1727}
1728 1684
1729/**
1730 * gfs2_dump_lockstate - print out the current lockstate
1731 * @sdp: the filesystem
1732 * @ub: the buffer to copy the information into
1733 *
1734 * If @ub is NULL, dump the lockstate to the console.
1735 *
1736 */
1737
1738static int gfs2_dump_lockstate(struct gfs2_sbd *sdp)
1739{
1740 struct gfs2_glock *gl;
1741 struct hlist_node *h;
1742 unsigned int x;
1743 int error = 0;
1744
1745 for (x = 0; x < GFS2_GL_HASH_SIZE; x++) {
1746
1747 read_lock(gl_lock_addr(x));
1748
1749 hlist_for_each_entry(gl, h, &gl_hash_table[x].hb_list, gl_list) {
1750 if (gl->gl_sbd != sdp)
1751 continue;
1752
1753 error = dump_glock(NULL, gl);
1754 if (error)
1755 break;
1756 }
1757
1758 read_unlock(gl_lock_addr(x));
1759
1760 if (error)
1761 break;
1762 }
1763
1764
1765 return error;
1766}
1767 1685
1768 1686
1769int __init gfs2_glock_init(void) 1687int __init gfs2_glock_init(void)
1770{ 1688{
1771 unsigned i; 1689 unsigned i;
1772 for(i = 0; i < GFS2_GL_HASH_SIZE; i++) { 1690 for(i = 0; i < GFS2_GL_HASH_SIZE; i++) {
1773 INIT_HLIST_HEAD(&gl_hash_table[i].hb_list); 1691 INIT_HLIST_BL_HEAD(&gl_hash_table[i]);
1774 }
1775#ifdef GL_HASH_LOCK_SZ
1776 for(i = 0; i < GL_HASH_LOCK_SZ; i++) {
1777 rwlock_init(&gl_hash_locks[i]);
1778 } 1692 }
1779#endif
1780 1693
1781 glock_workqueue = alloc_workqueue("glock_workqueue", WQ_MEM_RECLAIM | 1694 glock_workqueue = alloc_workqueue("glock_workqueue", WQ_MEM_RECLAIM |
1782 WQ_HIGHPRI | WQ_FREEZEABLE, 0); 1695 WQ_HIGHPRI | WQ_FREEZABLE, 0);
1783 if (IS_ERR(glock_workqueue)) 1696 if (IS_ERR(glock_workqueue))
1784 return PTR_ERR(glock_workqueue); 1697 return PTR_ERR(glock_workqueue);
1785 gfs2_delete_workqueue = alloc_workqueue("delete_workqueue", 1698 gfs2_delete_workqueue = alloc_workqueue("delete_workqueue",
1786 WQ_MEM_RECLAIM | WQ_FREEZEABLE, 1699 WQ_MEM_RECLAIM | WQ_FREEZABLE,
1787 0); 1700 0);
1788 if (IS_ERR(gfs2_delete_workqueue)) { 1701 if (IS_ERR(gfs2_delete_workqueue)) {
1789 destroy_workqueue(glock_workqueue); 1702 destroy_workqueue(glock_workqueue);
@@ -1802,62 +1715,54 @@ void gfs2_glock_exit(void)
1802 destroy_workqueue(gfs2_delete_workqueue); 1715 destroy_workqueue(gfs2_delete_workqueue);
1803} 1716}
1804 1717
1718static inline struct gfs2_glock *glock_hash_chain(unsigned hash)
1719{
1720 return hlist_bl_entry(hlist_bl_first_rcu(&gl_hash_table[hash]),
1721 struct gfs2_glock, gl_list);
1722}
1723
1724static inline struct gfs2_glock *glock_hash_next(struct gfs2_glock *gl)
1725{
1726 return hlist_bl_entry(rcu_dereference(gl->gl_list.next),
1727 struct gfs2_glock, gl_list);
1728}
1729
1805static int gfs2_glock_iter_next(struct gfs2_glock_iter *gi) 1730static int gfs2_glock_iter_next(struct gfs2_glock_iter *gi)
1806{ 1731{
1807 struct gfs2_glock *gl; 1732 struct gfs2_glock *gl;
1808 1733
1809restart: 1734 do {
1810 read_lock(gl_lock_addr(gi->hash)); 1735 gl = gi->gl;
1811 gl = gi->gl; 1736 if (gl) {
1812 if (gl) { 1737 gi->gl = glock_hash_next(gl);
1813 gi->gl = hlist_entry(gl->gl_list.next, 1738 } else {
1814 struct gfs2_glock, gl_list); 1739 gi->gl = glock_hash_chain(gi->hash);
1815 } else { 1740 }
1816 gi->gl = hlist_entry(gl_hash_table[gi->hash].hb_list.first, 1741 while (gi->gl == NULL) {
1817 struct gfs2_glock, gl_list); 1742 gi->hash++;
1818 } 1743 if (gi->hash >= GFS2_GL_HASH_SIZE) {
1819 if (gi->gl) 1744 rcu_read_unlock();
1820 gfs2_glock_hold(gi->gl); 1745 return 1;
1821 read_unlock(gl_lock_addr(gi->hash)); 1746 }
1822 if (gl) 1747 gi->gl = glock_hash_chain(gi->hash);
1823 gfs2_glock_put(gl); 1748 }
1824 while (gi->gl == NULL) { 1749 /* Skip entries for other sb and dead entries */
1825 gi->hash++; 1750 } while (gi->sdp != gi->gl->gl_sbd || atomic_read(&gi->gl->gl_ref) == 0);
1826 if (gi->hash >= GFS2_GL_HASH_SIZE)
1827 return 1;
1828 read_lock(gl_lock_addr(gi->hash));
1829 gi->gl = hlist_entry(gl_hash_table[gi->hash].hb_list.first,
1830 struct gfs2_glock, gl_list);
1831 if (gi->gl)
1832 gfs2_glock_hold(gi->gl);
1833 read_unlock(gl_lock_addr(gi->hash));
1834 }
1835
1836 if (gi->sdp != gi->gl->gl_sbd)
1837 goto restart;
1838 1751
1839 return 0; 1752 return 0;
1840} 1753}
1841 1754
1842static void gfs2_glock_iter_free(struct gfs2_glock_iter *gi)
1843{
1844 if (gi->gl)
1845 gfs2_glock_put(gi->gl);
1846 gi->gl = NULL;
1847}
1848
1849static void *gfs2_glock_seq_start(struct seq_file *seq, loff_t *pos) 1755static void *gfs2_glock_seq_start(struct seq_file *seq, loff_t *pos)
1850{ 1756{
1851 struct gfs2_glock_iter *gi = seq->private; 1757 struct gfs2_glock_iter *gi = seq->private;
1852 loff_t n = *pos; 1758 loff_t n = *pos;
1853 1759
1854 gi->hash = 0; 1760 gi->hash = 0;
1761 rcu_read_lock();
1855 1762
1856 do { 1763 do {
1857 if (gfs2_glock_iter_next(gi)) { 1764 if (gfs2_glock_iter_next(gi))
1858 gfs2_glock_iter_free(gi);
1859 return NULL; 1765 return NULL;
1860 }
1861 } while (n--); 1766 } while (n--);
1862 1767
1863 return gi->gl; 1768 return gi->gl;
@@ -1870,10 +1775,8 @@ static void *gfs2_glock_seq_next(struct seq_file *seq, void *iter_ptr,
1870 1775
1871 (*pos)++; 1776 (*pos)++;
1872 1777
1873 if (gfs2_glock_iter_next(gi)) { 1778 if (gfs2_glock_iter_next(gi))
1874 gfs2_glock_iter_free(gi);
1875 return NULL; 1779 return NULL;
1876 }
1877 1780
1878 return gi->gl; 1781 return gi->gl;
1879} 1782}
@@ -1881,7 +1784,10 @@ static void *gfs2_glock_seq_next(struct seq_file *seq, void *iter_ptr,
1881static void gfs2_glock_seq_stop(struct seq_file *seq, void *iter_ptr) 1784static void gfs2_glock_seq_stop(struct seq_file *seq, void *iter_ptr)
1882{ 1785{
1883 struct gfs2_glock_iter *gi = seq->private; 1786 struct gfs2_glock_iter *gi = seq->private;
1884 gfs2_glock_iter_free(gi); 1787
1788 if (gi->gl)
1789 rcu_read_unlock();
1790 gi->gl = NULL;
1885} 1791}
1886 1792
1887static int gfs2_glock_seq_show(struct seq_file *seq, void *iter_ptr) 1793static int gfs2_glock_seq_show(struct seq_file *seq, void *iter_ptr)
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index 691851ceb615..aea160690e94 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -118,7 +118,7 @@ struct lm_lockops {
118 int (*lm_mount) (struct gfs2_sbd *sdp, const char *fsname); 118 int (*lm_mount) (struct gfs2_sbd *sdp, const char *fsname);
119 void (*lm_unmount) (struct gfs2_sbd *sdp); 119 void (*lm_unmount) (struct gfs2_sbd *sdp);
120 void (*lm_withdraw) (struct gfs2_sbd *sdp); 120 void (*lm_withdraw) (struct gfs2_sbd *sdp);
121 void (*lm_put_lock) (struct kmem_cache *cachep, struct gfs2_glock *gl); 121 void (*lm_put_lock) (struct gfs2_glock *gl);
122 int (*lm_lock) (struct gfs2_glock *gl, unsigned int req_state, 122 int (*lm_lock) (struct gfs2_glock *gl, unsigned int req_state,
123 unsigned int flags); 123 unsigned int flags);
124 void (*lm_cancel) (struct gfs2_glock *gl); 124 void (*lm_cancel) (struct gfs2_glock *gl);
@@ -174,7 +174,7 @@ int gfs2_glock_get(struct gfs2_sbd *sdp,
174 int create, struct gfs2_glock **glp); 174 int create, struct gfs2_glock **glp);
175void gfs2_glock_hold(struct gfs2_glock *gl); 175void gfs2_glock_hold(struct gfs2_glock *gl);
176void gfs2_glock_put_nolock(struct gfs2_glock *gl); 176void gfs2_glock_put_nolock(struct gfs2_glock *gl);
177int gfs2_glock_put(struct gfs2_glock *gl); 177void gfs2_glock_put(struct gfs2_glock *gl);
178void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, unsigned flags, 178void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, unsigned flags,
179 struct gfs2_holder *gh); 179 struct gfs2_holder *gh);
180void gfs2_holder_reinit(unsigned int state, unsigned flags, 180void gfs2_holder_reinit(unsigned int state, unsigned flags,
@@ -223,25 +223,22 @@ static inline int gfs2_glock_nq_init(struct gfs2_glock *gl,
223 return error; 223 return error;
224} 224}
225 225
226/* Lock Value Block functions */ 226extern void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state);
227 227extern void gfs2_glock_complete(struct gfs2_glock *gl, int ret);
228int gfs2_lvb_hold(struct gfs2_glock *gl); 228extern void gfs2_reclaim_glock(struct gfs2_sbd *sdp);
229void gfs2_lvb_unhold(struct gfs2_glock *gl); 229extern void gfs2_gl_hash_clear(struct gfs2_sbd *sdp);
230 230extern void gfs2_glock_finish_truncate(struct gfs2_inode *ip);
231void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state); 231extern void gfs2_glock_thaw(struct gfs2_sbd *sdp);
232void gfs2_glock_complete(struct gfs2_glock *gl, int ret); 232extern void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl);
233void gfs2_reclaim_glock(struct gfs2_sbd *sdp); 233extern void gfs2_glock_free(struct gfs2_glock *gl);
234void gfs2_gl_hash_clear(struct gfs2_sbd *sdp); 234
235void gfs2_glock_finish_truncate(struct gfs2_inode *ip); 235extern int __init gfs2_glock_init(void);
236void gfs2_glock_thaw(struct gfs2_sbd *sdp); 236extern void gfs2_glock_exit(void);
237 237
238int __init gfs2_glock_init(void); 238extern int gfs2_create_debugfs_file(struct gfs2_sbd *sdp);
239void gfs2_glock_exit(void); 239extern void gfs2_delete_debugfs_file(struct gfs2_sbd *sdp);
240 240extern int gfs2_register_debugfs(void);
241int gfs2_create_debugfs_file(struct gfs2_sbd *sdp); 241extern void gfs2_unregister_debugfs(void);
242void gfs2_delete_debugfs_file(struct gfs2_sbd *sdp);
243int gfs2_register_debugfs(void);
244void gfs2_unregister_debugfs(void);
245 242
246extern const struct lm_lockops gfs2_dlm_ops; 243extern const struct lm_lockops gfs2_dlm_ops;
247 244
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 263561bf1a50..3754e3cbf02b 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -56,20 +56,26 @@ static void gfs2_ail_empty_gl(struct gfs2_glock *gl)
56 BUG_ON(current->journal_info); 56 BUG_ON(current->journal_info);
57 current->journal_info = &tr; 57 current->journal_info = &tr;
58 58
59 gfs2_log_lock(sdp); 59 spin_lock(&sdp->sd_ail_lock);
60 while (!list_empty(head)) { 60 while (!list_empty(head)) {
61 bd = list_entry(head->next, struct gfs2_bufdata, 61 bd = list_entry(head->next, struct gfs2_bufdata,
62 bd_ail_gl_list); 62 bd_ail_gl_list);
63 bh = bd->bd_bh; 63 bh = bd->bd_bh;
64 gfs2_remove_from_ail(bd); 64 gfs2_remove_from_ail(bd);
65 spin_unlock(&sdp->sd_ail_lock);
66
65 bd->bd_bh = NULL; 67 bd->bd_bh = NULL;
66 bh->b_private = NULL; 68 bh->b_private = NULL;
67 bd->bd_blkno = bh->b_blocknr; 69 bd->bd_blkno = bh->b_blocknr;
70 gfs2_log_lock(sdp);
68 gfs2_assert_withdraw(sdp, !buffer_busy(bh)); 71 gfs2_assert_withdraw(sdp, !buffer_busy(bh));
69 gfs2_trans_add_revoke(sdp, bd); 72 gfs2_trans_add_revoke(sdp, bd);
73 gfs2_log_unlock(sdp);
74
75 spin_lock(&sdp->sd_ail_lock);
70 } 76 }
71 gfs2_assert_withdraw(sdp, !atomic_read(&gl->gl_ail_count)); 77 gfs2_assert_withdraw(sdp, !atomic_read(&gl->gl_ail_count));
72 gfs2_log_unlock(sdp); 78 spin_unlock(&sdp->sd_ail_lock);
73 79
74 gfs2_trans_end(sdp); 80 gfs2_trans_end(sdp);
75 gfs2_log_flush(sdp, NULL); 81 gfs2_log_flush(sdp, NULL);
@@ -206,8 +212,17 @@ static void inode_go_inval(struct gfs2_glock *gl, int flags)
206static int inode_go_demote_ok(const struct gfs2_glock *gl) 212static int inode_go_demote_ok(const struct gfs2_glock *gl)
207{ 213{
208 struct gfs2_sbd *sdp = gl->gl_sbd; 214 struct gfs2_sbd *sdp = gl->gl_sbd;
215 struct gfs2_holder *gh;
216
209 if (sdp->sd_jindex == gl->gl_object || sdp->sd_rindex == gl->gl_object) 217 if (sdp->sd_jindex == gl->gl_object || sdp->sd_rindex == gl->gl_object)
210 return 0; 218 return 0;
219
220 if (!list_empty(&gl->gl_holders)) {
221 gh = list_entry(gl->gl_holders.next, struct gfs2_holder, gh_list);
222 if (gh->gh_list.next != &gl->gl_holders)
223 return 0;
224 }
225
211 return 1; 226 return 1;
212} 227}
213 228
@@ -272,19 +287,6 @@ static int inode_go_dump(struct seq_file *seq, const struct gfs2_glock *gl)
272} 287}
273 288
274/** 289/**
275 * rgrp_go_demote_ok - Check to see if it's ok to unlock a RG's glock
276 * @gl: the glock
277 *
278 * Returns: 1 if it's ok
279 */
280
281static int rgrp_go_demote_ok(const struct gfs2_glock *gl)
282{
283 const struct address_space *mapping = (const struct address_space *)(gl + 1);
284 return !mapping->nrpages;
285}
286
287/**
288 * rgrp_go_lock - operation done after an rgrp lock is locked by 290 * rgrp_go_lock - operation done after an rgrp lock is locked by
289 * a first holder on this node. 291 * a first holder on this node.
290 * @gl: the glock 292 * @gl: the glock
@@ -410,7 +412,6 @@ const struct gfs2_glock_operations gfs2_inode_glops = {
410const struct gfs2_glock_operations gfs2_rgrp_glops = { 412const struct gfs2_glock_operations gfs2_rgrp_glops = {
411 .go_xmote_th = rgrp_go_sync, 413 .go_xmote_th = rgrp_go_sync,
412 .go_inval = rgrp_go_inval, 414 .go_inval = rgrp_go_inval,
413 .go_demote_ok = rgrp_go_demote_ok,
414 .go_lock = rgrp_go_lock, 415 .go_lock = rgrp_go_lock,
415 .go_unlock = rgrp_go_unlock, 416 .go_unlock = rgrp_go_unlock,
416 .go_dump = gfs2_rgrp_dump, 417 .go_dump = gfs2_rgrp_dump,
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index a79790c06275..870a89d6d4dc 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -15,6 +15,8 @@
15#include <linux/workqueue.h> 15#include <linux/workqueue.h>
16#include <linux/dlm.h> 16#include <linux/dlm.h>
17#include <linux/buffer_head.h> 17#include <linux/buffer_head.h>
18#include <linux/rcupdate.h>
19#include <linux/rculist_bl.h>
18 20
19#define DIO_WAIT 0x00000010 21#define DIO_WAIT 0x00000010
20#define DIO_METADATA 0x00000020 22#define DIO_METADATA 0x00000020
@@ -201,7 +203,7 @@ enum {
201}; 203};
202 204
203struct gfs2_glock { 205struct gfs2_glock {
204 struct hlist_node gl_list; 206 struct hlist_bl_node gl_list;
205 unsigned long gl_flags; /* GLF_... */ 207 unsigned long gl_flags; /* GLF_... */
206 struct lm_lockname gl_name; 208 struct lm_lockname gl_name;
207 atomic_t gl_ref; 209 atomic_t gl_ref;
@@ -234,6 +236,7 @@ struct gfs2_glock {
234 atomic_t gl_ail_count; 236 atomic_t gl_ail_count;
235 struct delayed_work gl_work; 237 struct delayed_work gl_work;
236 struct work_struct gl_delete; 238 struct work_struct gl_delete;
239 struct rcu_head gl_rcu;
237}; 240};
238 241
239#define GFS2_MIN_LVB_SIZE 32 /* Min size of LVB that gfs2 supports */ 242#define GFS2_MIN_LVB_SIZE 32 /* Min size of LVB that gfs2 supports */
@@ -314,6 +317,7 @@ enum {
314 QDF_USER = 0, 317 QDF_USER = 0,
315 QDF_CHANGE = 1, 318 QDF_CHANGE = 1,
316 QDF_LOCKED = 2, 319 QDF_LOCKED = 2,
320 QDF_REFRESH = 3,
317}; 321};
318 322
319struct gfs2_quota_data { 323struct gfs2_quota_data {
@@ -647,6 +651,7 @@ struct gfs2_sbd {
647 unsigned int sd_log_flush_head; 651 unsigned int sd_log_flush_head;
648 u64 sd_log_flush_wrapped; 652 u64 sd_log_flush_wrapped;
649 653
654 spinlock_t sd_ail_lock;
650 struct list_head sd_ail1_list; 655 struct list_head sd_ail1_list;
651 struct list_head sd_ail2_list; 656 struct list_head sd_ail2_list;
652 u64 sd_ail_sync_gen; 657 u64 sd_ail_sync_gen;
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 7aa7d4f8984a..97d54a28776a 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -763,14 +763,15 @@ fail:
763 return error; 763 return error;
764} 764}
765 765
766static int gfs2_security_init(struct gfs2_inode *dip, struct gfs2_inode *ip) 766static int gfs2_security_init(struct gfs2_inode *dip, struct gfs2_inode *ip,
767 const struct qstr *qstr)
767{ 768{
768 int err; 769 int err;
769 size_t len; 770 size_t len;
770 void *value; 771 void *value;
771 char *name; 772 char *name;
772 773
773 err = security_inode_init_security(&ip->i_inode, &dip->i_inode, 774 err = security_inode_init_security(&ip->i_inode, &dip->i_inode, qstr,
774 &name, &value, &len); 775 &name, &value, &len);
775 776
776 if (err) { 777 if (err) {
@@ -854,7 +855,7 @@ struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name,
854 if (error) 855 if (error)
855 goto fail_gunlock2; 856 goto fail_gunlock2;
856 857
857 error = gfs2_security_init(dip, GFS2_I(inode)); 858 error = gfs2_security_init(dip, GFS2_I(inode), name);
858 if (error) 859 if (error)
859 goto fail_gunlock2; 860 goto fail_gunlock2;
860 861
diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c
index 6e493aee28f8..98c80d8c2a62 100644
--- a/fs/gfs2/lock_dlm.c
+++ b/fs/gfs2/lock_dlm.c
@@ -22,7 +22,6 @@ static void gdlm_ast(void *arg)
22{ 22{
23 struct gfs2_glock *gl = arg; 23 struct gfs2_glock *gl = arg;
24 unsigned ret = gl->gl_state; 24 unsigned ret = gl->gl_state;
25 struct gfs2_sbd *sdp = gl->gl_sbd;
26 25
27 BUG_ON(gl->gl_lksb.sb_flags & DLM_SBF_DEMOTED); 26 BUG_ON(gl->gl_lksb.sb_flags & DLM_SBF_DEMOTED);
28 27
@@ -31,12 +30,7 @@ static void gdlm_ast(void *arg)
31 30
32 switch (gl->gl_lksb.sb_status) { 31 switch (gl->gl_lksb.sb_status) {
33 case -DLM_EUNLOCK: /* Unlocked, so glock can be freed */ 32 case -DLM_EUNLOCK: /* Unlocked, so glock can be freed */
34 if (gl->gl_ops->go_flags & GLOF_ASPACE) 33 gfs2_glock_free(gl);
35 kmem_cache_free(gfs2_glock_aspace_cachep, gl);
36 else
37 kmem_cache_free(gfs2_glock_cachep, gl);
38 if (atomic_dec_and_test(&sdp->sd_glock_disposal))
39 wake_up(&sdp->sd_glock_wait);
40 return; 34 return;
41 case -DLM_ECANCEL: /* Cancel while getting lock */ 35 case -DLM_ECANCEL: /* Cancel while getting lock */
42 ret |= LM_OUT_CANCELED; 36 ret |= LM_OUT_CANCELED;
@@ -164,16 +158,14 @@ static int gdlm_lock(struct gfs2_glock *gl, unsigned int req_state,
164 GDLM_STRNAME_BYTES - 1, 0, gdlm_ast, gl, gdlm_bast); 158 GDLM_STRNAME_BYTES - 1, 0, gdlm_ast, gl, gdlm_bast);
165} 159}
166 160
167static void gdlm_put_lock(struct kmem_cache *cachep, struct gfs2_glock *gl) 161static void gdlm_put_lock(struct gfs2_glock *gl)
168{ 162{
169 struct gfs2_sbd *sdp = gl->gl_sbd; 163 struct gfs2_sbd *sdp = gl->gl_sbd;
170 struct lm_lockstruct *ls = &sdp->sd_lockstruct; 164 struct lm_lockstruct *ls = &sdp->sd_lockstruct;
171 int error; 165 int error;
172 166
173 if (gl->gl_lksb.sb_lkid == 0) { 167 if (gl->gl_lksb.sb_lkid == 0) {
174 kmem_cache_free(cachep, gl); 168 gfs2_glock_free(gl);
175 if (atomic_dec_and_test(&sdp->sd_glock_disposal))
176 wake_up(&sdp->sd_glock_wait);
177 return; 169 return;
178 } 170 }
179 171
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index eb01f3575e10..5b102c1887fd 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -67,7 +67,7 @@ unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct,
67 * @mapping: The associated mapping (maybe NULL) 67 * @mapping: The associated mapping (maybe NULL)
68 * @bd: The gfs2_bufdata to remove 68 * @bd: The gfs2_bufdata to remove
69 * 69 *
70 * The log lock _must_ be held when calling this function 70 * The ail lock _must_ be held when calling this function
71 * 71 *
72 */ 72 */
73 73
@@ -88,8 +88,8 @@ void gfs2_remove_from_ail(struct gfs2_bufdata *bd)
88 */ 88 */
89 89
90static void gfs2_ail1_start_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai) 90static void gfs2_ail1_start_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
91__releases(&sdp->sd_log_lock) 91__releases(&sdp->sd_ail_lock)
92__acquires(&sdp->sd_log_lock) 92__acquires(&sdp->sd_ail_lock)
93{ 93{
94 struct gfs2_bufdata *bd, *s; 94 struct gfs2_bufdata *bd, *s;
95 struct buffer_head *bh; 95 struct buffer_head *bh;
@@ -117,16 +117,16 @@ __acquires(&sdp->sd_log_lock)
117 list_move(&bd->bd_ail_st_list, &ai->ai_ail1_list); 117 list_move(&bd->bd_ail_st_list, &ai->ai_ail1_list);
118 118
119 get_bh(bh); 119 get_bh(bh);
120 gfs2_log_unlock(sdp); 120 spin_unlock(&sdp->sd_ail_lock);
121 lock_buffer(bh); 121 lock_buffer(bh);
122 if (test_clear_buffer_dirty(bh)) { 122 if (test_clear_buffer_dirty(bh)) {
123 bh->b_end_io = end_buffer_write_sync; 123 bh->b_end_io = end_buffer_write_sync;
124 submit_bh(WRITE_SYNC_PLUG, bh); 124 submit_bh(WRITE_SYNC, bh);
125 } else { 125 } else {
126 unlock_buffer(bh); 126 unlock_buffer(bh);
127 brelse(bh); 127 brelse(bh);
128 } 128 }
129 gfs2_log_lock(sdp); 129 spin_lock(&sdp->sd_ail_lock);
130 130
131 retry = 1; 131 retry = 1;
132 break; 132 break;
@@ -175,10 +175,10 @@ static void gfs2_ail1_start(struct gfs2_sbd *sdp)
175 struct gfs2_ail *ai; 175 struct gfs2_ail *ai;
176 int done = 0; 176 int done = 0;
177 177
178 gfs2_log_lock(sdp); 178 spin_lock(&sdp->sd_ail_lock);
179 head = &sdp->sd_ail1_list; 179 head = &sdp->sd_ail1_list;
180 if (list_empty(head)) { 180 if (list_empty(head)) {
181 gfs2_log_unlock(sdp); 181 spin_unlock(&sdp->sd_ail_lock);
182 return; 182 return;
183 } 183 }
184 sync_gen = sdp->sd_ail_sync_gen++; 184 sync_gen = sdp->sd_ail_sync_gen++;
@@ -189,13 +189,13 @@ static void gfs2_ail1_start(struct gfs2_sbd *sdp)
189 if (ai->ai_sync_gen >= sync_gen) 189 if (ai->ai_sync_gen >= sync_gen)
190 continue; 190 continue;
191 ai->ai_sync_gen = sync_gen; 191 ai->ai_sync_gen = sync_gen;
192 gfs2_ail1_start_one(sdp, ai); /* This may drop log lock */ 192 gfs2_ail1_start_one(sdp, ai); /* This may drop ail lock */
193 done = 0; 193 done = 0;
194 break; 194 break;
195 } 195 }
196 } 196 }
197 197
198 gfs2_log_unlock(sdp); 198 spin_unlock(&sdp->sd_ail_lock);
199} 199}
200 200
201static int gfs2_ail1_empty(struct gfs2_sbd *sdp, int flags) 201static int gfs2_ail1_empty(struct gfs2_sbd *sdp, int flags)
@@ -203,7 +203,7 @@ static int gfs2_ail1_empty(struct gfs2_sbd *sdp, int flags)
203 struct gfs2_ail *ai, *s; 203 struct gfs2_ail *ai, *s;
204 int ret; 204 int ret;
205 205
206 gfs2_log_lock(sdp); 206 spin_lock(&sdp->sd_ail_lock);
207 207
208 list_for_each_entry_safe_reverse(ai, s, &sdp->sd_ail1_list, ai_list) { 208 list_for_each_entry_safe_reverse(ai, s, &sdp->sd_ail1_list, ai_list) {
209 if (gfs2_ail1_empty_one(sdp, ai, flags)) 209 if (gfs2_ail1_empty_one(sdp, ai, flags))
@@ -214,7 +214,7 @@ static int gfs2_ail1_empty(struct gfs2_sbd *sdp, int flags)
214 214
215 ret = list_empty(&sdp->sd_ail1_list); 215 ret = list_empty(&sdp->sd_ail1_list);
216 216
217 gfs2_log_unlock(sdp); 217 spin_unlock(&sdp->sd_ail_lock);
218 218
219 return ret; 219 return ret;
220} 220}
@@ -247,7 +247,7 @@ static void ail2_empty(struct gfs2_sbd *sdp, unsigned int new_tail)
247 int wrap = (new_tail < old_tail); 247 int wrap = (new_tail < old_tail);
248 int a, b, rm; 248 int a, b, rm;
249 249
250 gfs2_log_lock(sdp); 250 spin_lock(&sdp->sd_ail_lock);
251 251
252 list_for_each_entry_safe(ai, safe, &sdp->sd_ail2_list, ai_list) { 252 list_for_each_entry_safe(ai, safe, &sdp->sd_ail2_list, ai_list) {
253 a = (old_tail <= ai->ai_first); 253 a = (old_tail <= ai->ai_first);
@@ -263,7 +263,7 @@ static void ail2_empty(struct gfs2_sbd *sdp, unsigned int new_tail)
263 kfree(ai); 263 kfree(ai);
264 } 264 }
265 265
266 gfs2_log_unlock(sdp); 266 spin_unlock(&sdp->sd_ail_lock);
267} 267}
268 268
269/** 269/**
@@ -421,7 +421,7 @@ static unsigned int current_tail(struct gfs2_sbd *sdp)
421 struct gfs2_ail *ai; 421 struct gfs2_ail *ai;
422 unsigned int tail; 422 unsigned int tail;
423 423
424 gfs2_log_lock(sdp); 424 spin_lock(&sdp->sd_ail_lock);
425 425
426 if (list_empty(&sdp->sd_ail1_list)) { 426 if (list_empty(&sdp->sd_ail1_list)) {
427 tail = sdp->sd_log_head; 427 tail = sdp->sd_log_head;
@@ -430,7 +430,7 @@ static unsigned int current_tail(struct gfs2_sbd *sdp)
430 tail = ai->ai_first; 430 tail = ai->ai_first;
431 } 431 }
432 432
433 gfs2_log_unlock(sdp); 433 spin_unlock(&sdp->sd_ail_lock);
434 434
435 return tail; 435 return tail;
436} 436}
@@ -647,7 +647,7 @@ static void gfs2_ordered_write(struct gfs2_sbd *sdp)
647 lock_buffer(bh); 647 lock_buffer(bh);
648 if (buffer_mapped(bh) && test_clear_buffer_dirty(bh)) { 648 if (buffer_mapped(bh) && test_clear_buffer_dirty(bh)) {
649 bh->b_end_io = end_buffer_write_sync; 649 bh->b_end_io = end_buffer_write_sync;
650 submit_bh(WRITE_SYNC_PLUG, bh); 650 submit_bh(WRITE_SYNC, bh);
651 } else { 651 } else {
652 unlock_buffer(bh); 652 unlock_buffer(bh);
653 brelse(bh); 653 brelse(bh);
@@ -743,10 +743,12 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl)
743 sdp->sd_log_commited_databuf = 0; 743 sdp->sd_log_commited_databuf = 0;
744 sdp->sd_log_commited_revoke = 0; 744 sdp->sd_log_commited_revoke = 0;
745 745
746 spin_lock(&sdp->sd_ail_lock);
746 if (!list_empty(&ai->ai_ail1_list)) { 747 if (!list_empty(&ai->ai_ail1_list)) {
747 list_add(&ai->ai_list, &sdp->sd_ail1_list); 748 list_add(&ai->ai_list, &sdp->sd_ail1_list);
748 ai = NULL; 749 ai = NULL;
749 } 750 }
751 spin_unlock(&sdp->sd_ail_lock);
750 gfs2_log_unlock(sdp); 752 gfs2_log_unlock(sdp);
751 trace_gfs2_log_flush(sdp, 0); 753 trace_gfs2_log_flush(sdp, 0);
752 up_write(&sdp->sd_log_flush_lock); 754 up_write(&sdp->sd_log_flush_lock);
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index bf33f822058d..51d27f00ebb4 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -51,8 +51,10 @@ static void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh)
51 /* If this buffer is in the AIL and it has already been written 51 /* If this buffer is in the AIL and it has already been written
52 * to in-place disk block, remove it from the AIL. 52 * to in-place disk block, remove it from the AIL.
53 */ 53 */
54 spin_lock(&sdp->sd_ail_lock);
54 if (bd->bd_ail) 55 if (bd->bd_ail)
55 list_move(&bd->bd_ail_st_list, &bd->bd_ail->ai_ail2_list); 56 list_move(&bd->bd_ail_st_list, &bd->bd_ail->ai_ail2_list);
57 spin_unlock(&sdp->sd_ail_lock);
56 get_bh(bh); 58 get_bh(bh);
57 atomic_inc(&sdp->sd_log_pinned); 59 atomic_inc(&sdp->sd_log_pinned);
58 trace_gfs2_pin(bd, 1); 60 trace_gfs2_pin(bd, 1);
@@ -80,7 +82,7 @@ static void gfs2_unpin(struct gfs2_sbd *sdp, struct buffer_head *bh,
80 mark_buffer_dirty(bh); 82 mark_buffer_dirty(bh);
81 clear_buffer_pinned(bh); 83 clear_buffer_pinned(bh);
82 84
83 gfs2_log_lock(sdp); 85 spin_lock(&sdp->sd_ail_lock);
84 if (bd->bd_ail) { 86 if (bd->bd_ail) {
85 list_del(&bd->bd_ail_st_list); 87 list_del(&bd->bd_ail_st_list);
86 brelse(bh); 88 brelse(bh);
@@ -91,9 +93,11 @@ static void gfs2_unpin(struct gfs2_sbd *sdp, struct buffer_head *bh,
91 } 93 }
92 bd->bd_ail = ai; 94 bd->bd_ail = ai;
93 list_add(&bd->bd_ail_st_list, &ai->ai_ail1_list); 95 list_add(&bd->bd_ail_st_list, &ai->ai_ail1_list);
94 clear_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags); 96 spin_unlock(&sdp->sd_ail_lock);
97
98 if (test_and_clear_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags))
99 gfs2_glock_schedule_for_reclaim(bd->bd_gl);
95 trace_gfs2_pin(bd, 0); 100 trace_gfs2_pin(bd, 0);
96 gfs2_log_unlock(sdp);
97 unlock_buffer(bh); 101 unlock_buffer(bh);
98 atomic_dec(&sdp->sd_log_pinned); 102 atomic_dec(&sdp->sd_log_pinned);
99} 103}
@@ -200,7 +204,7 @@ static void buf_lo_before_commit(struct gfs2_sbd *sdp)
200 } 204 }
201 205
202 gfs2_log_unlock(sdp); 206 gfs2_log_unlock(sdp);
203 submit_bh(WRITE_SYNC_PLUG, bh); 207 submit_bh(WRITE_SYNC, bh);
204 gfs2_log_lock(sdp); 208 gfs2_log_lock(sdp);
205 209
206 n = 0; 210 n = 0;
@@ -210,7 +214,7 @@ static void buf_lo_before_commit(struct gfs2_sbd *sdp)
210 gfs2_log_unlock(sdp); 214 gfs2_log_unlock(sdp);
211 lock_buffer(bd2->bd_bh); 215 lock_buffer(bd2->bd_bh);
212 bh = gfs2_log_fake_buf(sdp, bd2->bd_bh); 216 bh = gfs2_log_fake_buf(sdp, bd2->bd_bh);
213 submit_bh(WRITE_SYNC_PLUG, bh); 217 submit_bh(WRITE_SYNC, bh);
214 gfs2_log_lock(sdp); 218 gfs2_log_lock(sdp);
215 if (++n >= num) 219 if (++n >= num)
216 break; 220 break;
@@ -352,7 +356,7 @@ static void revoke_lo_before_commit(struct gfs2_sbd *sdp)
352 sdp->sd_log_num_revoke--; 356 sdp->sd_log_num_revoke--;
353 357
354 if (offset + sizeof(u64) > sdp->sd_sb.sb_bsize) { 358 if (offset + sizeof(u64) > sdp->sd_sb.sb_bsize) {
355 submit_bh(WRITE_SYNC_PLUG, bh); 359 submit_bh(WRITE_SYNC, bh);
356 360
357 bh = gfs2_log_get_buf(sdp); 361 bh = gfs2_log_get_buf(sdp);
358 mh = (struct gfs2_meta_header *)bh->b_data; 362 mh = (struct gfs2_meta_header *)bh->b_data;
@@ -369,7 +373,7 @@ static void revoke_lo_before_commit(struct gfs2_sbd *sdp)
369 } 373 }
370 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_revoke); 374 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_revoke);
371 375
372 submit_bh(WRITE_SYNC_PLUG, bh); 376 submit_bh(WRITE_SYNC, bh);
373} 377}
374 378
375static void revoke_lo_before_scan(struct gfs2_jdesc *jd, 379static void revoke_lo_before_scan(struct gfs2_jdesc *jd,
@@ -571,7 +575,7 @@ static void gfs2_write_blocks(struct gfs2_sbd *sdp, struct buffer_head *bh,
571 ptr = bh_log_ptr(bh); 575 ptr = bh_log_ptr(bh);
572 576
573 get_bh(bh); 577 get_bh(bh);
574 submit_bh(WRITE_SYNC_PLUG, bh); 578 submit_bh(WRITE_SYNC, bh);
575 gfs2_log_lock(sdp); 579 gfs2_log_lock(sdp);
576 while(!list_empty(list)) { 580 while(!list_empty(list)) {
577 bd = list_entry(list->next, struct gfs2_bufdata, bd_le.le_list); 581 bd = list_entry(list->next, struct gfs2_bufdata, bd_le.le_list);
@@ -597,7 +601,7 @@ static void gfs2_write_blocks(struct gfs2_sbd *sdp, struct buffer_head *bh,
597 } else { 601 } else {
598 bh1 = gfs2_log_fake_buf(sdp, bd->bd_bh); 602 bh1 = gfs2_log_fake_buf(sdp, bd->bd_bh);
599 } 603 }
600 submit_bh(WRITE_SYNC_PLUG, bh1); 604 submit_bh(WRITE_SYNC, bh1);
601 gfs2_log_lock(sdp); 605 gfs2_log_lock(sdp);
602 ptr += 2; 606 ptr += 2;
603 } 607 }
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index ebef7ab6e17e..888a5f5a1a58 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -14,6 +14,8 @@
14#include <linux/module.h> 14#include <linux/module.h>
15#include <linux/init.h> 15#include <linux/init.h>
16#include <linux/gfs2_ondisk.h> 16#include <linux/gfs2_ondisk.h>
17#include <linux/rcupdate.h>
18#include <linux/rculist_bl.h>
17#include <asm/atomic.h> 19#include <asm/atomic.h>
18 20
19#include "gfs2.h" 21#include "gfs2.h"
@@ -45,7 +47,7 @@ static void gfs2_init_glock_once(void *foo)
45{ 47{
46 struct gfs2_glock *gl = foo; 48 struct gfs2_glock *gl = foo;
47 49
48 INIT_HLIST_NODE(&gl->gl_list); 50 INIT_HLIST_BL_NODE(&gl->gl_list);
49 spin_lock_init(&gl->gl_spin); 51 spin_lock_init(&gl->gl_spin);
50 INIT_LIST_HEAD(&gl->gl_holders); 52 INIT_LIST_HEAD(&gl->gl_holders);
51 INIT_LIST_HEAD(&gl->gl_lru); 53 INIT_LIST_HEAD(&gl->gl_lru);
@@ -59,14 +61,7 @@ static void gfs2_init_gl_aspace_once(void *foo)
59 struct address_space *mapping = (struct address_space *)(gl + 1); 61 struct address_space *mapping = (struct address_space *)(gl + 1);
60 62
61 gfs2_init_glock_once(gl); 63 gfs2_init_glock_once(gl);
62 memset(mapping, 0, sizeof(*mapping)); 64 address_space_init_once(mapping);
63 INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC);
64 spin_lock_init(&mapping->tree_lock);
65 spin_lock_init(&mapping->i_mmap_lock);
66 INIT_LIST_HEAD(&mapping->private_list);
67 spin_lock_init(&mapping->private_lock);
68 INIT_RAW_PRIO_TREE_ROOT(&mapping->i_mmap);
69 INIT_LIST_HEAD(&mapping->i_mmap_nonlinear);
70} 65}
71 66
72/** 67/**
@@ -144,7 +139,7 @@ static int __init init_gfs2_fs(void)
144 139
145 error = -ENOMEM; 140 error = -ENOMEM;
146 gfs_recovery_wq = alloc_workqueue("gfs_recovery", 141 gfs_recovery_wq = alloc_workqueue("gfs_recovery",
147 WQ_MEM_RECLAIM | WQ_FREEZEABLE, 0); 142 WQ_MEM_RECLAIM | WQ_FREEZABLE, 0);
148 if (!gfs_recovery_wq) 143 if (!gfs_recovery_wq)
149 goto fail_wq; 144 goto fail_wq;
150 145
@@ -198,6 +193,8 @@ static void __exit exit_gfs2_fs(void)
198 unregister_filesystem(&gfs2meta_fs_type); 193 unregister_filesystem(&gfs2meta_fs_type);
199 destroy_workqueue(gfs_recovery_wq); 194 destroy_workqueue(gfs_recovery_wq);
200 195
196 rcu_barrier();
197
201 kmem_cache_destroy(gfs2_quotad_cachep); 198 kmem_cache_destroy(gfs2_quotad_cachep);
202 kmem_cache_destroy(gfs2_rgrpd_cachep); 199 kmem_cache_destroy(gfs2_rgrpd_cachep);
203 kmem_cache_destroy(gfs2_bufdata_cachep); 200 kmem_cache_destroy(gfs2_bufdata_cachep);
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 939739c7b3f9..675349b5a133 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -37,7 +37,7 @@ static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wb
37 struct buffer_head *bh, *head; 37 struct buffer_head *bh, *head;
38 int nr_underway = 0; 38 int nr_underway = 0;
39 int write_op = REQ_META | 39 int write_op = REQ_META |
40 (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC_PLUG : WRITE); 40 (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE);
41 41
42 BUG_ON(!PageLocked(page)); 42 BUG_ON(!PageLocked(page));
43 BUG_ON(!page_has_buffers(page)); 43 BUG_ON(!page_has_buffers(page));
@@ -94,7 +94,6 @@ static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wb
94const struct address_space_operations gfs2_meta_aops = { 94const struct address_space_operations gfs2_meta_aops = {
95 .writepage = gfs2_aspace_writepage, 95 .writepage = gfs2_aspace_writepage,
96 .releasepage = gfs2_releasepage, 96 .releasepage = gfs2_releasepage,
97 .sync_page = block_sync_page,
98}; 97};
99 98
100/** 99/**
@@ -326,6 +325,7 @@ void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr, int
326 brelse(bh); 325 brelse(bh);
327 } 326 }
328 if (bd) { 327 if (bd) {
328 spin_lock(&sdp->sd_ail_lock);
329 if (bd->bd_ail) { 329 if (bd->bd_ail) {
330 gfs2_remove_from_ail(bd); 330 gfs2_remove_from_ail(bd);
331 bh->b_private = NULL; 331 bh->b_private = NULL;
@@ -333,6 +333,7 @@ void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr, int
333 bd->bd_blkno = bh->b_blocknr; 333 bd->bd_blkno = bh->b_blocknr;
334 gfs2_trans_add_revoke(sdp, bd); 334 gfs2_trans_add_revoke(sdp, bd);
335 } 335 }
336 spin_unlock(&sdp->sd_ail_lock);
336 } 337 }
337 clear_buffer_dirty(bh); 338 clear_buffer_dirty(bh);
338 clear_buffer_uptodate(bh); 339 clear_buffer_uptodate(bh);
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 777927ce6f79..42ef24355afb 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -99,6 +99,7 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
99 99
100 init_waitqueue_head(&sdp->sd_log_waitq); 100 init_waitqueue_head(&sdp->sd_log_waitq);
101 init_waitqueue_head(&sdp->sd_logd_waitq); 101 init_waitqueue_head(&sdp->sd_logd_waitq);
102 spin_lock_init(&sdp->sd_ail_lock);
102 INIT_LIST_HEAD(&sdp->sd_ail1_list); 103 INIT_LIST_HEAD(&sdp->sd_ail1_list);
103 INIT_LIST_HEAD(&sdp->sd_ail2_list); 104 INIT_LIST_HEAD(&sdp->sd_ail2_list);
104 105
@@ -928,17 +929,9 @@ static const match_table_t nolock_tokens = {
928 { Opt_err, NULL }, 929 { Opt_err, NULL },
929}; 930};
930 931
931static void nolock_put_lock(struct kmem_cache *cachep, struct gfs2_glock *gl)
932{
933 struct gfs2_sbd *sdp = gl->gl_sbd;
934 kmem_cache_free(cachep, gl);
935 if (atomic_dec_and_test(&sdp->sd_glock_disposal))
936 wake_up(&sdp->sd_glock_wait);
937}
938
939static const struct lm_lockops nolock_ops = { 932static const struct lm_lockops nolock_ops = {
940 .lm_proto_name = "lock_nolock", 933 .lm_proto_name = "lock_nolock",
941 .lm_put_lock = nolock_put_lock, 934 .lm_put_lock = gfs2_glock_free,
942 .lm_tokens = &nolock_tokens, 935 .lm_tokens = &nolock_tokens,
943}; 936};
944 937
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index d8b26ac2e20b..09e436a50723 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -1026,9 +1026,9 @@ static void gfs2_put_link(struct dentry *dentry, struct nameidata *nd, void *p)
1026 1026
1027/** 1027/**
1028 * gfs2_permission - 1028 * gfs2_permission -
1029 * @inode: 1029 * @inode: The inode
1030 * @mask: 1030 * @mask: The mask to be tested
1031 * @nd: passed from Linux VFS, ignored by us 1031 * @flags: Indicates whether this is an RCU path walk or not
1032 * 1032 *
1033 * This may be called from the VFS directly, or from within GFS2 with the 1033 * This may be called from the VFS directly, or from within GFS2 with the
1034 * inode locked, so we look to see if the glock is already locked and only 1034 * inode locked, so we look to see if the glock is already locked and only
@@ -1044,11 +1044,11 @@ int gfs2_permission(struct inode *inode, int mask, unsigned int flags)
1044 int error; 1044 int error;
1045 int unlock = 0; 1045 int unlock = 0;
1046 1046
1047 if (flags & IPERM_FLAG_RCU)
1048 return -ECHILD;
1049 1047
1050 ip = GFS2_I(inode); 1048 ip = GFS2_I(inode);
1051 if (gfs2_glock_is_locked_by_me(ip->i_gl) == NULL) { 1049 if (gfs2_glock_is_locked_by_me(ip->i_gl) == NULL) {
1050 if (flags & IPERM_FLAG_RCU)
1051 return -ECHILD;
1052 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh); 1052 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
1053 if (error) 1053 if (error)
1054 return error; 1054 return error;
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index a689901963de..e23d9864c418 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -834,6 +834,7 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda)
834 goto out_end_trans; 834 goto out_end_trans;
835 835
836 do_qc(qd, -qd->qd_change_sync); 836 do_qc(qd, -qd->qd_change_sync);
837 set_bit(QDF_REFRESH, &qd->qd_flags);
837 } 838 }
838 839
839 error = 0; 840 error = 0;
@@ -929,6 +930,7 @@ int gfs2_quota_lock(struct gfs2_inode *ip, u32 uid, u32 gid)
929{ 930{
930 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 931 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
931 struct gfs2_alloc *al = ip->i_alloc; 932 struct gfs2_alloc *al = ip->i_alloc;
933 struct gfs2_quota_data *qd;
932 unsigned int x; 934 unsigned int x;
933 int error = 0; 935 int error = 0;
934 936
@@ -942,7 +944,11 @@ int gfs2_quota_lock(struct gfs2_inode *ip, u32 uid, u32 gid)
942 sort_qd, NULL); 944 sort_qd, NULL);
943 945
944 for (x = 0; x < al->al_qd_num; x++) { 946 for (x = 0; x < al->al_qd_num; x++) {
945 error = do_glock(al->al_qd[x], NO_FORCE, &al->al_qd_ghs[x]); 947 int force = NO_FORCE;
948 qd = al->al_qd[x];
949 if (test_and_clear_bit(QDF_REFRESH, &qd->qd_flags))
950 force = FORCE;
951 error = do_glock(qd, force, &al->al_qd_ghs[x]);
946 if (error) 952 if (error)
947 break; 953 break;
948 } 954 }
@@ -1587,6 +1593,8 @@ static int gfs2_set_dqblk(struct super_block *sb, int type, qid_t id,
1587 1593
1588 offset = qd2offset(qd); 1594 offset = qd2offset(qd);
1589 alloc_required = gfs2_write_alloc_required(ip, offset, sizeof(struct gfs2_quota)); 1595 alloc_required = gfs2_write_alloc_required(ip, offset, sizeof(struct gfs2_quota));
1596 if (gfs2_is_stuffed(ip))
1597 alloc_required = 1;
1590 if (alloc_required) { 1598 if (alloc_required) {
1591 al = gfs2_alloc_get(ip); 1599 al = gfs2_alloc_get(ip);
1592 if (al == NULL) 1600 if (al == NULL)
@@ -1600,7 +1608,9 @@ static int gfs2_set_dqblk(struct super_block *sb, int type, qid_t id,
1600 blocks += gfs2_rg_blocks(al); 1608 blocks += gfs2_rg_blocks(al);
1601 } 1609 }
1602 1610
1603 error = gfs2_trans_begin(sdp, blocks + RES_DINODE + 1, 0); 1611 /* Some quotas span block boundaries and can update two blocks,
1612 adding an extra block to the transaction to handle such quotas */
1613 error = gfs2_trans_begin(sdp, blocks + RES_DINODE + 2, 0);
1604 if (error) 1614 if (error)
1605 goto out_release; 1615 goto out_release;
1606 1616
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 7293ea27020c..cf930cd9664a 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -1602,7 +1602,7 @@ rgrp_error:
1602 * 1602 *
1603 */ 1603 */
1604 1604
1605void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen) 1605void __gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen)
1606{ 1606{
1607 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 1607 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1608 struct gfs2_rgrpd *rgd; 1608 struct gfs2_rgrpd *rgd;
@@ -1617,7 +1617,21 @@ void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen)
1617 gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); 1617 gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
1618 1618
1619 gfs2_trans_add_rg(rgd); 1619 gfs2_trans_add_rg(rgd);
1620}
1620 1621
1622/**
1623 * gfs2_free_data - free a contiguous run of data block(s)
1624 * @ip: the inode these blocks are being freed from
1625 * @bstart: first block of a run of contiguous blocks
1626 * @blen: the length of the block run
1627 *
1628 */
1629
1630void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen)
1631{
1632 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1633
1634 __gfs2_free_data(ip, bstart, blen);
1621 gfs2_statfs_change(sdp, 0, +blen, 0); 1635 gfs2_statfs_change(sdp, 0, +blen, 0);
1622 gfs2_quota_change(ip, -(s64)blen, ip->i_inode.i_uid, ip->i_inode.i_gid); 1636 gfs2_quota_change(ip, -(s64)blen, ip->i_inode.i_uid, ip->i_inode.i_gid);
1623} 1637}
@@ -1630,7 +1644,7 @@ void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen)
1630 * 1644 *
1631 */ 1645 */
1632 1646
1633void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen) 1647void __gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen)
1634{ 1648{
1635 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 1649 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1636 struct gfs2_rgrpd *rgd; 1650 struct gfs2_rgrpd *rgd;
@@ -1645,10 +1659,24 @@ void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen)
1645 gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); 1659 gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
1646 1660
1647 gfs2_trans_add_rg(rgd); 1661 gfs2_trans_add_rg(rgd);
1662 gfs2_meta_wipe(ip, bstart, blen);
1663}
1648 1664
1665/**
1666 * gfs2_free_meta - free a contiguous run of data block(s)
1667 * @ip: the inode these blocks are being freed from
1668 * @bstart: first block of a run of contiguous blocks
1669 * @blen: the length of the block run
1670 *
1671 */
1672
1673void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen)
1674{
1675 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1676
1677 __gfs2_free_meta(ip, bstart, blen);
1649 gfs2_statfs_change(sdp, 0, +blen, 0); 1678 gfs2_statfs_change(sdp, 0, +blen, 0);
1650 gfs2_quota_change(ip, -(s64)blen, ip->i_inode.i_uid, ip->i_inode.i_gid); 1679 gfs2_quota_change(ip, -(s64)blen, ip->i_inode.i_uid, ip->i_inode.i_gid);
1651 gfs2_meta_wipe(ip, bstart, blen);
1652} 1680}
1653 1681
1654void gfs2_unlink_di(struct inode *inode) 1682void gfs2_unlink_di(struct inode *inode)
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index 50c2bb04369c..a80e3034ac47 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -52,7 +52,9 @@ extern int gfs2_ri_update(struct gfs2_inode *ip);
52extern int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n); 52extern int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n);
53extern int gfs2_alloc_di(struct gfs2_inode *ip, u64 *bn, u64 *generation); 53extern int gfs2_alloc_di(struct gfs2_inode *ip, u64 *bn, u64 *generation);
54 54
55extern void __gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen);
55extern void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen); 56extern void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen);
57extern void __gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen);
56extern void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen); 58extern void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen);
57extern void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip); 59extern void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip);
58extern void gfs2_unlink_di(struct inode *inode); 60extern void gfs2_unlink_di(struct inode *inode);
diff --git a/fs/hfs/dir.c b/fs/hfs/dir.c
index afa66aaa2237..b4d70b13be92 100644
--- a/fs/hfs/dir.c
+++ b/fs/hfs/dir.c
@@ -238,46 +238,22 @@ static int hfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
238} 238}
239 239
240/* 240/*
241 * hfs_unlink() 241 * hfs_remove()
242 * 242 *
243 * This is the unlink() entry in the inode_operations structure for 243 * This serves as both unlink() and rmdir() in the inode_operations
244 * regular HFS directories. The purpose is to delete an existing 244 * structure for regular HFS directories. The purpose is to delete
245 * file, given the inode for the parent directory and the name 245 * an existing child, given the inode for the parent directory and
246 * (and its length) of the existing file. 246 * the name (and its length) of the existing directory.
247 */
248static int hfs_unlink(struct inode *dir, struct dentry *dentry)
249{
250 struct inode *inode;
251 int res;
252
253 inode = dentry->d_inode;
254 res = hfs_cat_delete(inode->i_ino, dir, &dentry->d_name);
255 if (res)
256 return res;
257
258 drop_nlink(inode);
259 hfs_delete_inode(inode);
260 inode->i_ctime = CURRENT_TIME_SEC;
261 mark_inode_dirty(inode);
262
263 return res;
264}
265
266/*
267 * hfs_rmdir()
268 * 247 *
269 * This is the rmdir() entry in the inode_operations structure for 248 * HFS does not have hardlinks, so both rmdir and unlink set the
270 * regular HFS directories. The purpose is to delete an existing 249 * link count to 0. The only difference is the emptiness check.
271 * directory, given the inode for the parent directory and the name
272 * (and its length) of the existing directory.
273 */ 250 */
274static int hfs_rmdir(struct inode *dir, struct dentry *dentry) 251static int hfs_remove(struct inode *dir, struct dentry *dentry)
275{ 252{
276 struct inode *inode; 253 struct inode *inode = dentry->d_inode;
277 int res; 254 int res;
278 255
279 inode = dentry->d_inode; 256 if (S_ISDIR(inode->i_mode) && inode->i_size != 2)
280 if (inode->i_size != 2)
281 return -ENOTEMPTY; 257 return -ENOTEMPTY;
282 res = hfs_cat_delete(inode->i_ino, dir, &dentry->d_name); 258 res = hfs_cat_delete(inode->i_ino, dir, &dentry->d_name);
283 if (res) 259 if (res)
@@ -307,7 +283,7 @@ static int hfs_rename(struct inode *old_dir, struct dentry *old_dentry,
307 283
308 /* Unlink destination if it already exists */ 284 /* Unlink destination if it already exists */
309 if (new_dentry->d_inode) { 285 if (new_dentry->d_inode) {
310 res = hfs_unlink(new_dir, new_dentry); 286 res = hfs_remove(new_dir, new_dentry);
311 if (res) 287 if (res)
312 return res; 288 return res;
313 } 289 }
@@ -332,9 +308,9 @@ const struct file_operations hfs_dir_operations = {
332const struct inode_operations hfs_dir_inode_operations = { 308const struct inode_operations hfs_dir_inode_operations = {
333 .create = hfs_create, 309 .create = hfs_create,
334 .lookup = hfs_lookup, 310 .lookup = hfs_lookup,
335 .unlink = hfs_unlink, 311 .unlink = hfs_remove,
336 .mkdir = hfs_mkdir, 312 .mkdir = hfs_mkdir,
337 .rmdir = hfs_rmdir, 313 .rmdir = hfs_remove,
338 .rename = hfs_rename, 314 .rename = hfs_rename,
339 .setattr = hfs_inode_setattr, 315 .setattr = hfs_inode_setattr,
340}; 316};
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index dffb4e996643..fff16c968e67 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -150,7 +150,6 @@ static int hfs_writepages(struct address_space *mapping,
150const struct address_space_operations hfs_btree_aops = { 150const struct address_space_operations hfs_btree_aops = {
151 .readpage = hfs_readpage, 151 .readpage = hfs_readpage,
152 .writepage = hfs_writepage, 152 .writepage = hfs_writepage,
153 .sync_page = block_sync_page,
154 .write_begin = hfs_write_begin, 153 .write_begin = hfs_write_begin,
155 .write_end = generic_write_end, 154 .write_end = generic_write_end,
156 .bmap = hfs_bmap, 155 .bmap = hfs_bmap,
@@ -160,7 +159,6 @@ const struct address_space_operations hfs_btree_aops = {
160const struct address_space_operations hfs_aops = { 159const struct address_space_operations hfs_aops = {
161 .readpage = hfs_readpage, 160 .readpage = hfs_readpage,
162 .writepage = hfs_writepage, 161 .writepage = hfs_writepage,
163 .sync_page = block_sync_page,
164 .write_begin = hfs_write_begin, 162 .write_begin = hfs_write_begin,
165 .write_end = generic_write_end, 163 .write_end = generic_write_end,
166 .bmap = hfs_bmap, 164 .bmap = hfs_bmap,
diff --git a/fs/hfsplus/extents.c b/fs/hfsplus/extents.c
index 52a0bcaa7b6d..b1991a2a08e0 100644
--- a/fs/hfsplus/extents.c
+++ b/fs/hfsplus/extents.c
@@ -397,8 +397,8 @@ int hfsplus_file_extend(struct inode *inode)
397 u32 start, len, goal; 397 u32 start, len, goal;
398 int res; 398 int res;
399 399
400 if (sbi->total_blocks - sbi->free_blocks + 8 > 400 if (sbi->alloc_file->i_size * 8 <
401 sbi->alloc_file->i_size * 8) { 401 sbi->total_blocks - sbi->free_blocks + 8) {
402 /* extend alloc file */ 402 /* extend alloc file */
403 printk(KERN_ERR "hfs: extend alloc file! " 403 printk(KERN_ERR "hfs: extend alloc file! "
404 "(%llu,%u,%u)\n", 404 "(%llu,%u,%u)\n",
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index a8df651747f0..b248a6cfcad9 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -146,7 +146,6 @@ static int hfsplus_writepages(struct address_space *mapping,
146const struct address_space_operations hfsplus_btree_aops = { 146const struct address_space_operations hfsplus_btree_aops = {
147 .readpage = hfsplus_readpage, 147 .readpage = hfsplus_readpage,
148 .writepage = hfsplus_writepage, 148 .writepage = hfsplus_writepage,
149 .sync_page = block_sync_page,
150 .write_begin = hfsplus_write_begin, 149 .write_begin = hfsplus_write_begin,
151 .write_end = generic_write_end, 150 .write_end = generic_write_end,
152 .bmap = hfsplus_bmap, 151 .bmap = hfsplus_bmap,
@@ -156,7 +155,6 @@ const struct address_space_operations hfsplus_btree_aops = {
156const struct address_space_operations hfsplus_aops = { 155const struct address_space_operations hfsplus_aops = {
157 .readpage = hfsplus_readpage, 156 .readpage = hfsplus_readpage,
158 .writepage = hfsplus_writepage, 157 .writepage = hfsplus_writepage,
159 .sync_page = block_sync_page,
160 .write_begin = hfsplus_write_begin, 158 .write_begin = hfsplus_write_begin,
161 .write_end = generic_write_end, 159 .write_end = generic_write_end,
162 .bmap = hfsplus_bmap, 160 .bmap = hfsplus_bmap,
diff --git a/fs/hfsplus/ioctl.c b/fs/hfsplus/ioctl.c
index 508ce662ce12..fbaa6690c8e0 100644
--- a/fs/hfsplus/ioctl.c
+++ b/fs/hfsplus/ioctl.c
@@ -47,7 +47,7 @@ static int hfsplus_ioctl_setflags(struct file *file, int __user *user_flags)
47 if (err) 47 if (err)
48 goto out; 48 goto out;
49 49
50 if (!is_owner_or_cap(inode)) { 50 if (!inode_owner_or_capable(inode)) {
51 err = -EACCES; 51 err = -EACCES;
52 goto out_drop_write; 52 goto out_drop_write;
53 } 53 }
diff --git a/fs/hfsplus/part_tbl.c b/fs/hfsplus/part_tbl.c
index d66ad113b1cc..40ad88c12c64 100644
--- a/fs/hfsplus/part_tbl.c
+++ b/fs/hfsplus/part_tbl.c
@@ -134,7 +134,7 @@ int hfs_part_find(struct super_block *sb,
134 res = hfsplus_submit_bio(sb->s_bdev, *part_start + HFS_PMAP_BLK, 134 res = hfsplus_submit_bio(sb->s_bdev, *part_start + HFS_PMAP_BLK,
135 data, READ); 135 data, READ);
136 if (res) 136 if (res)
137 return res; 137 goto out;
138 138
139 switch (be16_to_cpu(*((__be16 *)data))) { 139 switch (be16_to_cpu(*((__be16 *)data))) {
140 case HFS_OLD_PMAP_MAGIC: 140 case HFS_OLD_PMAP_MAGIC:
@@ -147,7 +147,7 @@ int hfs_part_find(struct super_block *sb,
147 res = -ENOENT; 147 res = -ENOENT;
148 break; 148 break;
149 } 149 }
150 150out:
151 kfree(data); 151 kfree(data);
152 return res; 152 return res;
153} 153}
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 9a3b4795f43c..b49b55584c84 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -338,20 +338,22 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
338 struct inode *root, *inode; 338 struct inode *root, *inode;
339 struct qstr str; 339 struct qstr str;
340 struct nls_table *nls = NULL; 340 struct nls_table *nls = NULL;
341 int err = -EINVAL; 341 int err;
342 342
343 err = -EINVAL;
343 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); 344 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
344 if (!sbi) 345 if (!sbi)
345 return -ENOMEM; 346 goto out;
346 347
347 sb->s_fs_info = sbi; 348 sb->s_fs_info = sbi;
348 mutex_init(&sbi->alloc_mutex); 349 mutex_init(&sbi->alloc_mutex);
349 mutex_init(&sbi->vh_mutex); 350 mutex_init(&sbi->vh_mutex);
350 hfsplus_fill_defaults(sbi); 351 hfsplus_fill_defaults(sbi);
352
353 err = -EINVAL;
351 if (!hfsplus_parse_options(data, sbi)) { 354 if (!hfsplus_parse_options(data, sbi)) {
352 printk(KERN_ERR "hfs: unable to parse mount options\n"); 355 printk(KERN_ERR "hfs: unable to parse mount options\n");
353 err = -EINVAL; 356 goto out_unload_nls;
354 goto cleanup;
355 } 357 }
356 358
357 /* temporarily use utf8 to correctly find the hidden dir below */ 359 /* temporarily use utf8 to correctly find the hidden dir below */
@@ -359,16 +361,14 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
359 sbi->nls = load_nls("utf8"); 361 sbi->nls = load_nls("utf8");
360 if (!sbi->nls) { 362 if (!sbi->nls) {
361 printk(KERN_ERR "hfs: unable to load nls for utf8\n"); 363 printk(KERN_ERR "hfs: unable to load nls for utf8\n");
362 err = -EINVAL; 364 goto out_unload_nls;
363 goto cleanup;
364 } 365 }
365 366
366 /* Grab the volume header */ 367 /* Grab the volume header */
367 if (hfsplus_read_wrapper(sb)) { 368 if (hfsplus_read_wrapper(sb)) {
368 if (!silent) 369 if (!silent)
369 printk(KERN_WARNING "hfs: unable to find HFS+ superblock\n"); 370 printk(KERN_WARNING "hfs: unable to find HFS+ superblock\n");
370 err = -EINVAL; 371 goto out_unload_nls;
371 goto cleanup;
372 } 372 }
373 vhdr = sbi->s_vhdr; 373 vhdr = sbi->s_vhdr;
374 374
@@ -377,7 +377,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
377 if (be16_to_cpu(vhdr->version) < HFSPLUS_MIN_VERSION || 377 if (be16_to_cpu(vhdr->version) < HFSPLUS_MIN_VERSION ||
378 be16_to_cpu(vhdr->version) > HFSPLUS_CURRENT_VERSION) { 378 be16_to_cpu(vhdr->version) > HFSPLUS_CURRENT_VERSION) {
379 printk(KERN_ERR "hfs: wrong filesystem version\n"); 379 printk(KERN_ERR "hfs: wrong filesystem version\n");
380 goto cleanup; 380 goto out_free_vhdr;
381 } 381 }
382 sbi->total_blocks = be32_to_cpu(vhdr->total_blocks); 382 sbi->total_blocks = be32_to_cpu(vhdr->total_blocks);
383 sbi->free_blocks = be32_to_cpu(vhdr->free_blocks); 383 sbi->free_blocks = be32_to_cpu(vhdr->free_blocks);
@@ -421,19 +421,19 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
421 sbi->ext_tree = hfs_btree_open(sb, HFSPLUS_EXT_CNID); 421 sbi->ext_tree = hfs_btree_open(sb, HFSPLUS_EXT_CNID);
422 if (!sbi->ext_tree) { 422 if (!sbi->ext_tree) {
423 printk(KERN_ERR "hfs: failed to load extents file\n"); 423 printk(KERN_ERR "hfs: failed to load extents file\n");
424 goto cleanup; 424 goto out_free_vhdr;
425 } 425 }
426 sbi->cat_tree = hfs_btree_open(sb, HFSPLUS_CAT_CNID); 426 sbi->cat_tree = hfs_btree_open(sb, HFSPLUS_CAT_CNID);
427 if (!sbi->cat_tree) { 427 if (!sbi->cat_tree) {
428 printk(KERN_ERR "hfs: failed to load catalog file\n"); 428 printk(KERN_ERR "hfs: failed to load catalog file\n");
429 goto cleanup; 429 goto out_close_ext_tree;
430 } 430 }
431 431
432 inode = hfsplus_iget(sb, HFSPLUS_ALLOC_CNID); 432 inode = hfsplus_iget(sb, HFSPLUS_ALLOC_CNID);
433 if (IS_ERR(inode)) { 433 if (IS_ERR(inode)) {
434 printk(KERN_ERR "hfs: failed to load allocation file\n"); 434 printk(KERN_ERR "hfs: failed to load allocation file\n");
435 err = PTR_ERR(inode); 435 err = PTR_ERR(inode);
436 goto cleanup; 436 goto out_close_cat_tree;
437 } 437 }
438 sbi->alloc_file = inode; 438 sbi->alloc_file = inode;
439 439
@@ -442,14 +442,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
442 if (IS_ERR(root)) { 442 if (IS_ERR(root)) {
443 printk(KERN_ERR "hfs: failed to load root directory\n"); 443 printk(KERN_ERR "hfs: failed to load root directory\n");
444 err = PTR_ERR(root); 444 err = PTR_ERR(root);
445 goto cleanup; 445 goto out_put_alloc_file;
446 }
447 sb->s_d_op = &hfsplus_dentry_operations;
448 sb->s_root = d_alloc_root(root);
449 if (!sb->s_root) {
450 iput(root);
451 err = -ENOMEM;
452 goto cleanup;
453 } 446 }
454 447
455 str.len = sizeof(HFSP_HIDDENDIR_NAME) - 1; 448 str.len = sizeof(HFSP_HIDDENDIR_NAME) - 1;
@@ -459,46 +452,69 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
459 if (!hfs_brec_read(&fd, &entry, sizeof(entry))) { 452 if (!hfs_brec_read(&fd, &entry, sizeof(entry))) {
460 hfs_find_exit(&fd); 453 hfs_find_exit(&fd);
461 if (entry.type != cpu_to_be16(HFSPLUS_FOLDER)) 454 if (entry.type != cpu_to_be16(HFSPLUS_FOLDER))
462 goto cleanup; 455 goto out_put_root;
463 inode = hfsplus_iget(sb, be32_to_cpu(entry.folder.id)); 456 inode = hfsplus_iget(sb, be32_to_cpu(entry.folder.id));
464 if (IS_ERR(inode)) { 457 if (IS_ERR(inode)) {
465 err = PTR_ERR(inode); 458 err = PTR_ERR(inode);
466 goto cleanup; 459 goto out_put_root;
467 } 460 }
468 sbi->hidden_dir = inode; 461 sbi->hidden_dir = inode;
469 } else 462 } else
470 hfs_find_exit(&fd); 463 hfs_find_exit(&fd);
471 464
472 if (sb->s_flags & MS_RDONLY) 465 if (!(sb->s_flags & MS_RDONLY)) {
473 goto out; 466 /*
467 * H+LX == hfsplusutils, H+Lx == this driver, H+lx is unused
468 * all three are registered with Apple for our use
469 */
470 vhdr->last_mount_vers = cpu_to_be32(HFSP_MOUNT_VERSION);
471 vhdr->modify_date = hfsp_now2mt();
472 be32_add_cpu(&vhdr->write_count, 1);
473 vhdr->attributes &= cpu_to_be32(~HFSPLUS_VOL_UNMNT);
474 vhdr->attributes |= cpu_to_be32(HFSPLUS_VOL_INCNSTNT);
475 hfsplus_sync_fs(sb, 1);
474 476
475 /* H+LX == hfsplusutils, H+Lx == this driver, H+lx is unused 477 if (!sbi->hidden_dir) {
476 * all three are registered with Apple for our use 478 mutex_lock(&sbi->vh_mutex);
477 */ 479 sbi->hidden_dir = hfsplus_new_inode(sb, S_IFDIR);
478 vhdr->last_mount_vers = cpu_to_be32(HFSP_MOUNT_VERSION); 480 hfsplus_create_cat(sbi->hidden_dir->i_ino, root, &str,
479 vhdr->modify_date = hfsp_now2mt(); 481 sbi->hidden_dir);
480 be32_add_cpu(&vhdr->write_count, 1); 482 mutex_unlock(&sbi->vh_mutex);
481 vhdr->attributes &= cpu_to_be32(~HFSPLUS_VOL_UNMNT); 483
482 vhdr->attributes |= cpu_to_be32(HFSPLUS_VOL_INCNSTNT); 484 hfsplus_mark_inode_dirty(sbi->hidden_dir,
483 hfsplus_sync_fs(sb, 1); 485 HFSPLUS_I_CAT_DIRTY);
484 486 }
485 if (!sbi->hidden_dir) {
486 mutex_lock(&sbi->vh_mutex);
487 sbi->hidden_dir = hfsplus_new_inode(sb, S_IFDIR);
488 hfsplus_create_cat(sbi->hidden_dir->i_ino, sb->s_root->d_inode,
489 &str, sbi->hidden_dir);
490 mutex_unlock(&sbi->vh_mutex);
491
492 hfsplus_mark_inode_dirty(sbi->hidden_dir, HFSPLUS_I_CAT_DIRTY);
493 } 487 }
494out: 488
489 sb->s_d_op = &hfsplus_dentry_operations;
490 sb->s_root = d_alloc_root(root);
491 if (!sb->s_root) {
492 err = -ENOMEM;
493 goto out_put_hidden_dir;
494 }
495
495 unload_nls(sbi->nls); 496 unload_nls(sbi->nls);
496 sbi->nls = nls; 497 sbi->nls = nls;
497 return 0; 498 return 0;
498 499
499cleanup: 500out_put_hidden_dir:
500 hfsplus_put_super(sb); 501 iput(sbi->hidden_dir);
502out_put_root:
503 iput(sbi->alloc_file);
504out_put_alloc_file:
505 iput(sbi->alloc_file);
506out_close_cat_tree:
507 hfs_btree_close(sbi->cat_tree);
508out_close_ext_tree:
509 hfs_btree_close(sbi->ext_tree);
510out_free_vhdr:
511 kfree(sbi->s_vhdr);
512 kfree(sbi->s_backup_vhdr);
513out_unload_nls:
514 unload_nls(sbi->nls);
501 unload_nls(nls); 515 unload_nls(nls);
516 kfree(sbi);
517out:
502 return err; 518 return err;
503} 519}
504 520
diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c
index 196231794f64..3031d81f5f0f 100644
--- a/fs/hfsplus/wrapper.c
+++ b/fs/hfsplus/wrapper.c
@@ -167,7 +167,7 @@ reread:
167 break; 167 break;
168 case cpu_to_be16(HFSP_WRAP_MAGIC): 168 case cpu_to_be16(HFSP_WRAP_MAGIC):
169 if (!hfsplus_read_mdb(sbi->s_vhdr, &wd)) 169 if (!hfsplus_read_mdb(sbi->s_vhdr, &wd))
170 goto out; 170 goto out_free_backup_vhdr;
171 wd.ablk_size >>= HFSPLUS_SECTOR_SHIFT; 171 wd.ablk_size >>= HFSPLUS_SECTOR_SHIFT;
172 part_start += wd.ablk_start + wd.embed_start * wd.ablk_size; 172 part_start += wd.ablk_start + wd.embed_start * wd.ablk_size;
173 part_size = wd.embed_count * wd.ablk_size; 173 part_size = wd.embed_count * wd.ablk_size;
@@ -179,7 +179,7 @@ reread:
179 * (should do this only for cdrom/loop though) 179 * (should do this only for cdrom/loop though)
180 */ 180 */
181 if (hfs_part_find(sb, &part_start, &part_size)) 181 if (hfs_part_find(sb, &part_start, &part_size))
182 goto out; 182 goto out_free_backup_vhdr;
183 goto reread; 183 goto reread;
184 } 184 }
185 185
diff --git a/fs/hpfs/Kconfig b/fs/hpfs/Kconfig
index 63b6f5632318..0c39dc3ef7d7 100644
--- a/fs/hpfs/Kconfig
+++ b/fs/hpfs/Kconfig
@@ -1,7 +1,7 @@
1config HPFS_FS 1config HPFS_FS
2 tristate "OS/2 HPFS file system support" 2 tristate "OS/2 HPFS file system support"
3 depends on BLOCK 3 depends on BLOCK
4 depends on BKL # nontrivial to fix 4 depends on BROKEN || !PREEMPT
5 help 5 help
6 OS/2 is IBM's operating system for PC's, the same as Warp, and HPFS 6 OS/2 is IBM's operating system for PC's, the same as Warp, and HPFS
7 is the file system used for organizing files on OS/2 hard disk 7 is the file system used for organizing files on OS/2 hard disk
diff --git a/fs/hpfs/dir.c b/fs/hpfs/dir.c
index d32f63a569f7..b3d7c0ddb609 100644
--- a/fs/hpfs/dir.c
+++ b/fs/hpfs/dir.c
@@ -6,16 +6,15 @@
6 * directory VFS functions 6 * directory VFS functions
7 */ 7 */
8 8
9#include <linux/smp_lock.h>
10#include <linux/slab.h> 9#include <linux/slab.h>
11#include "hpfs_fn.h" 10#include "hpfs_fn.h"
12 11
13static int hpfs_dir_release(struct inode *inode, struct file *filp) 12static int hpfs_dir_release(struct inode *inode, struct file *filp)
14{ 13{
15 lock_kernel(); 14 hpfs_lock(inode->i_sb);
16 hpfs_del_pos(inode, &filp->f_pos); 15 hpfs_del_pos(inode, &filp->f_pos);
17 /*hpfs_write_if_changed(inode);*/ 16 /*hpfs_write_if_changed(inode);*/
18 unlock_kernel(); 17 hpfs_unlock(inode->i_sb);
19 return 0; 18 return 0;
20} 19}
21 20
@@ -30,7 +29,7 @@ static loff_t hpfs_dir_lseek(struct file *filp, loff_t off, int whence)
30 struct hpfs_inode_info *hpfs_inode = hpfs_i(i); 29 struct hpfs_inode_info *hpfs_inode = hpfs_i(i);
31 struct super_block *s = i->i_sb; 30 struct super_block *s = i->i_sb;
32 31
33 lock_kernel(); 32 hpfs_lock(s);
34 33
35 /*printk("dir lseek\n");*/ 34 /*printk("dir lseek\n");*/
36 if (new_off == 0 || new_off == 1 || new_off == 11 || new_off == 12 || new_off == 13) goto ok; 35 if (new_off == 0 || new_off == 1 || new_off == 11 || new_off == 12 || new_off == 13) goto ok;
@@ -43,12 +42,12 @@ static loff_t hpfs_dir_lseek(struct file *filp, loff_t off, int whence)
43 } 42 }
44 mutex_unlock(&i->i_mutex); 43 mutex_unlock(&i->i_mutex);
45ok: 44ok:
46 unlock_kernel(); 45 hpfs_unlock(s);
47 return filp->f_pos = new_off; 46 return filp->f_pos = new_off;
48fail: 47fail:
49 mutex_unlock(&i->i_mutex); 48 mutex_unlock(&i->i_mutex);
50 /*printk("illegal lseek: %016llx\n", new_off);*/ 49 /*printk("illegal lseek: %016llx\n", new_off);*/
51 unlock_kernel(); 50 hpfs_unlock(s);
52 return -ESPIPE; 51 return -ESPIPE;
53} 52}
54 53
@@ -64,7 +63,7 @@ static int hpfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
64 int c1, c2 = 0; 63 int c1, c2 = 0;
65 int ret = 0; 64 int ret = 0;
66 65
67 lock_kernel(); 66 hpfs_lock(inode->i_sb);
68 67
69 if (hpfs_sb(inode->i_sb)->sb_chk) { 68 if (hpfs_sb(inode->i_sb)->sb_chk) {
70 if (hpfs_chk_sectors(inode->i_sb, inode->i_ino, 1, "dir_fnode")) { 69 if (hpfs_chk_sectors(inode->i_sb, inode->i_ino, 1, "dir_fnode")) {
@@ -167,7 +166,7 @@ static int hpfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
167 hpfs_brelse4(&qbh); 166 hpfs_brelse4(&qbh);
168 } 167 }
169out: 168out:
170 unlock_kernel(); 169 hpfs_unlock(inode->i_sb);
171 return ret; 170 return ret;
172} 171}
173 172
@@ -197,10 +196,10 @@ struct dentry *hpfs_lookup(struct inode *dir, struct dentry *dentry, struct name
197 struct inode *result = NULL; 196 struct inode *result = NULL;
198 struct hpfs_inode_info *hpfs_result; 197 struct hpfs_inode_info *hpfs_result;
199 198
200 lock_kernel(); 199 hpfs_lock(dir->i_sb);
201 if ((err = hpfs_chk_name(name, &len))) { 200 if ((err = hpfs_chk_name(name, &len))) {
202 if (err == -ENAMETOOLONG) { 201 if (err == -ENAMETOOLONG) {
203 unlock_kernel(); 202 hpfs_unlock(dir->i_sb);
204 return ERR_PTR(-ENAMETOOLONG); 203 return ERR_PTR(-ENAMETOOLONG);
205 } 204 }
206 goto end_add; 205 goto end_add;
@@ -298,7 +297,7 @@ struct dentry *hpfs_lookup(struct inode *dir, struct dentry *dentry, struct name
298 297
299 end: 298 end:
300 end_add: 299 end_add:
301 unlock_kernel(); 300 hpfs_unlock(dir->i_sb);
302 d_add(dentry, result); 301 d_add(dentry, result);
303 return NULL; 302 return NULL;
304 303
@@ -311,7 +310,7 @@ struct dentry *hpfs_lookup(struct inode *dir, struct dentry *dentry, struct name
311 310
312 /*bail:*/ 311 /*bail:*/
313 312
314 unlock_kernel(); 313 hpfs_unlock(dir->i_sb);
315 return ERR_PTR(-ENOENT); 314 return ERR_PTR(-ENOENT);
316} 315}
317 316
diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c
index c0340887c7ea..9b9eb6933e43 100644
--- a/fs/hpfs/file.c
+++ b/fs/hpfs/file.c
@@ -6,16 +6,15 @@
6 * file VFS functions 6 * file VFS functions
7 */ 7 */
8 8
9#include <linux/smp_lock.h>
10#include "hpfs_fn.h" 9#include "hpfs_fn.h"
11 10
12#define BLOCKS(size) (((size) + 511) >> 9) 11#define BLOCKS(size) (((size) + 511) >> 9)
13 12
14static int hpfs_file_release(struct inode *inode, struct file *file) 13static int hpfs_file_release(struct inode *inode, struct file *file)
15{ 14{
16 lock_kernel(); 15 hpfs_lock(inode->i_sb);
17 hpfs_write_if_changed(inode); 16 hpfs_write_if_changed(inode);
18 unlock_kernel(); 17 hpfs_unlock(inode->i_sb);
19 return 0; 18 return 0;
20} 19}
21 20
@@ -49,14 +48,14 @@ static secno hpfs_bmap(struct inode *inode, unsigned file_secno)
49static void hpfs_truncate(struct inode *i) 48static void hpfs_truncate(struct inode *i)
50{ 49{
51 if (IS_IMMUTABLE(i)) return /*-EPERM*/; 50 if (IS_IMMUTABLE(i)) return /*-EPERM*/;
52 lock_kernel(); 51 hpfs_lock(i->i_sb);
53 hpfs_i(i)->i_n_secs = 0; 52 hpfs_i(i)->i_n_secs = 0;
54 i->i_blocks = 1 + ((i->i_size + 511) >> 9); 53 i->i_blocks = 1 + ((i->i_size + 511) >> 9);
55 hpfs_i(i)->mmu_private = i->i_size; 54 hpfs_i(i)->mmu_private = i->i_size;
56 hpfs_truncate_btree(i->i_sb, i->i_ino, 1, ((i->i_size + 511) >> 9)); 55 hpfs_truncate_btree(i->i_sb, i->i_ino, 1, ((i->i_size + 511) >> 9));
57 hpfs_write_inode(i); 56 hpfs_write_inode(i);
58 hpfs_i(i)->i_n_secs = 0; 57 hpfs_i(i)->i_n_secs = 0;
59 unlock_kernel(); 58 hpfs_unlock(i->i_sb);
60} 59}
61 60
62static int hpfs_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) 61static int hpfs_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create)
@@ -120,7 +119,6 @@ static sector_t _hpfs_bmap(struct address_space *mapping, sector_t block)
120const struct address_space_operations hpfs_aops = { 119const struct address_space_operations hpfs_aops = {
121 .readpage = hpfs_readpage, 120 .readpage = hpfs_readpage,
122 .writepage = hpfs_writepage, 121 .writepage = hpfs_writepage,
123 .sync_page = block_sync_page,
124 .write_begin = hpfs_write_begin, 122 .write_begin = hpfs_write_begin,
125 .write_end = generic_write_end, 123 .write_end = generic_write_end,
126 .bmap = _hpfs_bmap 124 .bmap = _hpfs_bmap
diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h
index 1c43dbea55e8..c15adbca07ff 100644
--- a/fs/hpfs/hpfs_fn.h
+++ b/fs/hpfs/hpfs_fn.h
@@ -342,3 +342,25 @@ static inline time32_t gmt_to_local(struct super_block *s, time_t t)
342 extern struct timezone sys_tz; 342 extern struct timezone sys_tz;
343 return t - sys_tz.tz_minuteswest * 60 - hpfs_sb(s)->sb_timeshift; 343 return t - sys_tz.tz_minuteswest * 60 - hpfs_sb(s)->sb_timeshift;
344} 344}
345
346/*
347 * Locking:
348 *
349 * hpfs_lock() is a leftover from the big kernel lock.
350 * Right now, these functions are empty and only left
351 * for documentation purposes. The file system no longer
352 * works on SMP systems, so the lock is not needed
353 * any more.
354 *
355 * If someone is interested in making it work again, this
356 * would be the place to start by adding a per-superblock
357 * mutex and fixing all the bugs and performance issues
358 * caused by that.
359 */
360static inline void hpfs_lock(struct super_block *s)
361{
362}
363
364static inline void hpfs_unlock(struct super_block *s)
365{
366}
diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c
index 1ae35baa539e..87f1f787e767 100644
--- a/fs/hpfs/inode.c
+++ b/fs/hpfs/inode.c
@@ -6,7 +6,6 @@
6 * inode VFS functions 6 * inode VFS functions
7 */ 7 */
8 8
9#include <linux/smp_lock.h>
10#include <linux/slab.h> 9#include <linux/slab.h>
11#include "hpfs_fn.h" 10#include "hpfs_fn.h"
12 11
@@ -267,7 +266,7 @@ int hpfs_setattr(struct dentry *dentry, struct iattr *attr)
267 struct inode *inode = dentry->d_inode; 266 struct inode *inode = dentry->d_inode;
268 int error = -EINVAL; 267 int error = -EINVAL;
269 268
270 lock_kernel(); 269 hpfs_lock(inode->i_sb);
271 if (inode->i_ino == hpfs_sb(inode->i_sb)->sb_root) 270 if (inode->i_ino == hpfs_sb(inode->i_sb)->sb_root)
272 goto out_unlock; 271 goto out_unlock;
273 if ((attr->ia_valid & ATTR_SIZE) && attr->ia_size > inode->i_size) 272 if ((attr->ia_valid & ATTR_SIZE) && attr->ia_size > inode->i_size)
@@ -290,7 +289,7 @@ int hpfs_setattr(struct dentry *dentry, struct iattr *attr)
290 hpfs_write_inode(inode); 289 hpfs_write_inode(inode);
291 290
292 out_unlock: 291 out_unlock:
293 unlock_kernel(); 292 hpfs_unlock(inode->i_sb);
294 return error; 293 return error;
295} 294}
296 295
@@ -307,8 +306,8 @@ void hpfs_evict_inode(struct inode *inode)
307 truncate_inode_pages(&inode->i_data, 0); 306 truncate_inode_pages(&inode->i_data, 0);
308 end_writeback(inode); 307 end_writeback(inode);
309 if (!inode->i_nlink) { 308 if (!inode->i_nlink) {
310 lock_kernel(); 309 hpfs_lock(inode->i_sb);
311 hpfs_remove_fnode(inode->i_sb, inode->i_ino); 310 hpfs_remove_fnode(inode->i_sb, inode->i_ino);
312 unlock_kernel(); 311 hpfs_unlock(inode->i_sb);
313 } 312 }
314} 313}
diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c
index f4ad9e31ddc4..d5f8c8a19023 100644
--- a/fs/hpfs/namei.c
+++ b/fs/hpfs/namei.c
@@ -6,7 +6,6 @@
6 * adding & removing files & directories 6 * adding & removing files & directories
7 */ 7 */
8#include <linux/sched.h> 8#include <linux/sched.h>
9#include <linux/smp_lock.h>
10#include "hpfs_fn.h" 9#include "hpfs_fn.h"
11 10
12static int hpfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) 11static int hpfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
@@ -25,7 +24,7 @@ static int hpfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
25 struct hpfs_dirent dee; 24 struct hpfs_dirent dee;
26 int err; 25 int err;
27 if ((err = hpfs_chk_name(name, &len))) return err==-ENOENT ? -EINVAL : err; 26 if ((err = hpfs_chk_name(name, &len))) return err==-ENOENT ? -EINVAL : err;
28 lock_kernel(); 27 hpfs_lock(dir->i_sb);
29 err = -ENOSPC; 28 err = -ENOSPC;
30 fnode = hpfs_alloc_fnode(dir->i_sb, hpfs_i(dir)->i_dno, &fno, &bh); 29 fnode = hpfs_alloc_fnode(dir->i_sb, hpfs_i(dir)->i_dno, &fno, &bh);
31 if (!fnode) 30 if (!fnode)
@@ -103,7 +102,7 @@ static int hpfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
103 } 102 }
104 d_instantiate(dentry, result); 103 d_instantiate(dentry, result);
105 mutex_unlock(&hpfs_i(dir)->i_mutex); 104 mutex_unlock(&hpfs_i(dir)->i_mutex);
106 unlock_kernel(); 105 hpfs_unlock(dir->i_sb);
107 return 0; 106 return 0;
108bail3: 107bail3:
109 mutex_unlock(&hpfs_i(dir)->i_mutex); 108 mutex_unlock(&hpfs_i(dir)->i_mutex);
@@ -115,7 +114,7 @@ bail1:
115 brelse(bh); 114 brelse(bh);
116 hpfs_free_sectors(dir->i_sb, fno, 1); 115 hpfs_free_sectors(dir->i_sb, fno, 1);
117bail: 116bail:
118 unlock_kernel(); 117 hpfs_unlock(dir->i_sb);
119 return err; 118 return err;
120} 119}
121 120
@@ -132,7 +131,7 @@ static int hpfs_create(struct inode *dir, struct dentry *dentry, int mode, struc
132 int err; 131 int err;
133 if ((err = hpfs_chk_name(name, &len))) 132 if ((err = hpfs_chk_name(name, &len)))
134 return err==-ENOENT ? -EINVAL : err; 133 return err==-ENOENT ? -EINVAL : err;
135 lock_kernel(); 134 hpfs_lock(dir->i_sb);
136 err = -ENOSPC; 135 err = -ENOSPC;
137 fnode = hpfs_alloc_fnode(dir->i_sb, hpfs_i(dir)->i_dno, &fno, &bh); 136 fnode = hpfs_alloc_fnode(dir->i_sb, hpfs_i(dir)->i_dno, &fno, &bh);
138 if (!fnode) 137 if (!fnode)
@@ -195,7 +194,7 @@ static int hpfs_create(struct inode *dir, struct dentry *dentry, int mode, struc
195 } 194 }
196 d_instantiate(dentry, result); 195 d_instantiate(dentry, result);
197 mutex_unlock(&hpfs_i(dir)->i_mutex); 196 mutex_unlock(&hpfs_i(dir)->i_mutex);
198 unlock_kernel(); 197 hpfs_unlock(dir->i_sb);
199 return 0; 198 return 0;
200 199
201bail2: 200bail2:
@@ -205,7 +204,7 @@ bail1:
205 brelse(bh); 204 brelse(bh);
206 hpfs_free_sectors(dir->i_sb, fno, 1); 205 hpfs_free_sectors(dir->i_sb, fno, 1);
207bail: 206bail:
208 unlock_kernel(); 207 hpfs_unlock(dir->i_sb);
209 return err; 208 return err;
210} 209}
211 210
@@ -224,7 +223,7 @@ static int hpfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t
224 if (hpfs_sb(dir->i_sb)->sb_eas < 2) return -EPERM; 223 if (hpfs_sb(dir->i_sb)->sb_eas < 2) return -EPERM;
225 if (!new_valid_dev(rdev)) 224 if (!new_valid_dev(rdev))
226 return -EINVAL; 225 return -EINVAL;
227 lock_kernel(); 226 hpfs_lock(dir->i_sb);
228 err = -ENOSPC; 227 err = -ENOSPC;
229 fnode = hpfs_alloc_fnode(dir->i_sb, hpfs_i(dir)->i_dno, &fno, &bh); 228 fnode = hpfs_alloc_fnode(dir->i_sb, hpfs_i(dir)->i_dno, &fno, &bh);
230 if (!fnode) 229 if (!fnode)
@@ -274,7 +273,7 @@ static int hpfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t
274 d_instantiate(dentry, result); 273 d_instantiate(dentry, result);
275 mutex_unlock(&hpfs_i(dir)->i_mutex); 274 mutex_unlock(&hpfs_i(dir)->i_mutex);
276 brelse(bh); 275 brelse(bh);
277 unlock_kernel(); 276 hpfs_unlock(dir->i_sb);
278 return 0; 277 return 0;
279bail2: 278bail2:
280 mutex_unlock(&hpfs_i(dir)->i_mutex); 279 mutex_unlock(&hpfs_i(dir)->i_mutex);
@@ -283,7 +282,7 @@ bail1:
283 brelse(bh); 282 brelse(bh);
284 hpfs_free_sectors(dir->i_sb, fno, 1); 283 hpfs_free_sectors(dir->i_sb, fno, 1);
285bail: 284bail:
286 unlock_kernel(); 285 hpfs_unlock(dir->i_sb);
287 return err; 286 return err;
288} 287}
289 288
@@ -299,9 +298,9 @@ static int hpfs_symlink(struct inode *dir, struct dentry *dentry, const char *sy
299 struct inode *result; 298 struct inode *result;
300 int err; 299 int err;
301 if ((err = hpfs_chk_name(name, &len))) return err==-ENOENT ? -EINVAL : err; 300 if ((err = hpfs_chk_name(name, &len))) return err==-ENOENT ? -EINVAL : err;
302 lock_kernel(); 301 hpfs_lock(dir->i_sb);
303 if (hpfs_sb(dir->i_sb)->sb_eas < 2) { 302 if (hpfs_sb(dir->i_sb)->sb_eas < 2) {
304 unlock_kernel(); 303 hpfs_unlock(dir->i_sb);
305 return -EPERM; 304 return -EPERM;
306 } 305 }
307 err = -ENOSPC; 306 err = -ENOSPC;
@@ -354,7 +353,7 @@ static int hpfs_symlink(struct inode *dir, struct dentry *dentry, const char *sy
354 hpfs_write_inode_nolock(result); 353 hpfs_write_inode_nolock(result);
355 d_instantiate(dentry, result); 354 d_instantiate(dentry, result);
356 mutex_unlock(&hpfs_i(dir)->i_mutex); 355 mutex_unlock(&hpfs_i(dir)->i_mutex);
357 unlock_kernel(); 356 hpfs_unlock(dir->i_sb);
358 return 0; 357 return 0;
359bail2: 358bail2:
360 mutex_unlock(&hpfs_i(dir)->i_mutex); 359 mutex_unlock(&hpfs_i(dir)->i_mutex);
@@ -363,7 +362,7 @@ bail1:
363 brelse(bh); 362 brelse(bh);
364 hpfs_free_sectors(dir->i_sb, fno, 1); 363 hpfs_free_sectors(dir->i_sb, fno, 1);
365bail: 364bail:
366 unlock_kernel(); 365 hpfs_unlock(dir->i_sb);
367 return err; 366 return err;
368} 367}
369 368
@@ -380,7 +379,7 @@ static int hpfs_unlink(struct inode *dir, struct dentry *dentry)
380 int rep = 0; 379 int rep = 0;
381 int err; 380 int err;
382 381
383 lock_kernel(); 382 hpfs_lock(dir->i_sb);
384 hpfs_adjust_length(name, &len); 383 hpfs_adjust_length(name, &len);
385again: 384again:
386 mutex_lock(&hpfs_i(inode)->i_parent_mutex); 385 mutex_lock(&hpfs_i(inode)->i_parent_mutex);
@@ -416,7 +415,7 @@ again:
416 dentry_unhash(dentry); 415 dentry_unhash(dentry);
417 if (!d_unhashed(dentry)) { 416 if (!d_unhashed(dentry)) {
418 dput(dentry); 417 dput(dentry);
419 unlock_kernel(); 418 hpfs_unlock(dir->i_sb);
420 return -ENOSPC; 419 return -ENOSPC;
421 } 420 }
422 if (generic_permission(inode, MAY_WRITE, 0, NULL) || 421 if (generic_permission(inode, MAY_WRITE, 0, NULL) ||
@@ -435,7 +434,7 @@ again:
435 if (!err) 434 if (!err)
436 goto again; 435 goto again;
437 } 436 }
438 unlock_kernel(); 437 hpfs_unlock(dir->i_sb);
439 return -ENOSPC; 438 return -ENOSPC;
440 default: 439 default:
441 drop_nlink(inode); 440 drop_nlink(inode);
@@ -448,7 +447,7 @@ out1:
448out: 447out:
449 mutex_unlock(&hpfs_i(dir)->i_mutex); 448 mutex_unlock(&hpfs_i(dir)->i_mutex);
450 mutex_unlock(&hpfs_i(inode)->i_parent_mutex); 449 mutex_unlock(&hpfs_i(inode)->i_parent_mutex);
451 unlock_kernel(); 450 hpfs_unlock(dir->i_sb);
452 return err; 451 return err;
453} 452}
454 453
@@ -466,7 +465,7 @@ static int hpfs_rmdir(struct inode *dir, struct dentry *dentry)
466 int r; 465 int r;
467 466
468 hpfs_adjust_length(name, &len); 467 hpfs_adjust_length(name, &len);
469 lock_kernel(); 468 hpfs_lock(dir->i_sb);
470 mutex_lock(&hpfs_i(inode)->i_parent_mutex); 469 mutex_lock(&hpfs_i(inode)->i_parent_mutex);
471 mutex_lock(&hpfs_i(dir)->i_mutex); 470 mutex_lock(&hpfs_i(dir)->i_mutex);
472 err = -ENOENT; 471 err = -ENOENT;
@@ -508,7 +507,7 @@ out1:
508out: 507out:
509 mutex_unlock(&hpfs_i(dir)->i_mutex); 508 mutex_unlock(&hpfs_i(dir)->i_mutex);
510 mutex_unlock(&hpfs_i(inode)->i_parent_mutex); 509 mutex_unlock(&hpfs_i(inode)->i_parent_mutex);
511 unlock_kernel(); 510 hpfs_unlock(dir->i_sb);
512 return err; 511 return err;
513} 512}
514 513
@@ -521,21 +520,21 @@ static int hpfs_symlink_readpage(struct file *file, struct page *page)
521 int err; 520 int err;
522 521
523 err = -EIO; 522 err = -EIO;
524 lock_kernel(); 523 hpfs_lock(i->i_sb);
525 if (!(fnode = hpfs_map_fnode(i->i_sb, i->i_ino, &bh))) 524 if (!(fnode = hpfs_map_fnode(i->i_sb, i->i_ino, &bh)))
526 goto fail; 525 goto fail;
527 err = hpfs_read_ea(i->i_sb, fnode, "SYMLINK", link, PAGE_SIZE); 526 err = hpfs_read_ea(i->i_sb, fnode, "SYMLINK", link, PAGE_SIZE);
528 brelse(bh); 527 brelse(bh);
529 if (err) 528 if (err)
530 goto fail; 529 goto fail;
531 unlock_kernel(); 530 hpfs_unlock(i->i_sb);
532 SetPageUptodate(page); 531 SetPageUptodate(page);
533 kunmap(page); 532 kunmap(page);
534 unlock_page(page); 533 unlock_page(page);
535 return 0; 534 return 0;
536 535
537fail: 536fail:
538 unlock_kernel(); 537 hpfs_unlock(i->i_sb);
539 SetPageError(page); 538 SetPageError(page);
540 kunmap(page); 539 kunmap(page);
541 unlock_page(page); 540 unlock_page(page);
@@ -567,7 +566,7 @@ static int hpfs_rename(struct inode *old_dir, struct dentry *old_dentry,
567 err = 0; 566 err = 0;
568 hpfs_adjust_length(old_name, &old_len); 567 hpfs_adjust_length(old_name, &old_len);
569 568
570 lock_kernel(); 569 hpfs_lock(i->i_sb);
571 /* order doesn't matter, due to VFS exclusion */ 570 /* order doesn't matter, due to VFS exclusion */
572 mutex_lock(&hpfs_i(i)->i_parent_mutex); 571 mutex_lock(&hpfs_i(i)->i_parent_mutex);
573 if (new_inode) 572 if (new_inode)
@@ -659,7 +658,7 @@ end1:
659 mutex_unlock(&hpfs_i(i)->i_parent_mutex); 658 mutex_unlock(&hpfs_i(i)->i_parent_mutex);
660 if (new_inode) 659 if (new_inode)
661 mutex_unlock(&hpfs_i(new_inode)->i_parent_mutex); 660 mutex_unlock(&hpfs_i(new_inode)->i_parent_mutex);
662 unlock_kernel(); 661 hpfs_unlock(i->i_sb);
663 return err; 662 return err;
664} 663}
665 664
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index b30426b1fc97..c89b40808587 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -13,7 +13,6 @@
13#include <linux/statfs.h> 13#include <linux/statfs.h>
14#include <linux/magic.h> 14#include <linux/magic.h>
15#include <linux/sched.h> 15#include <linux/sched.h>
16#include <linux/smp_lock.h>
17#include <linux/bitmap.h> 16#include <linux/bitmap.h>
18#include <linux/slab.h> 17#include <linux/slab.h>
19 18
@@ -103,15 +102,11 @@ static void hpfs_put_super(struct super_block *s)
103{ 102{
104 struct hpfs_sb_info *sbi = hpfs_sb(s); 103 struct hpfs_sb_info *sbi = hpfs_sb(s);
105 104
106 lock_kernel();
107
108 kfree(sbi->sb_cp_table); 105 kfree(sbi->sb_cp_table);
109 kfree(sbi->sb_bmp_dir); 106 kfree(sbi->sb_bmp_dir);
110 unmark_dirty(s); 107 unmark_dirty(s);
111 s->s_fs_info = NULL; 108 s->s_fs_info = NULL;
112 kfree(sbi); 109 kfree(sbi);
113
114 unlock_kernel();
115} 110}
116 111
117unsigned hpfs_count_one_bitmap(struct super_block *s, secno secno) 112unsigned hpfs_count_one_bitmap(struct super_block *s, secno secno)
@@ -143,7 +138,7 @@ static int hpfs_statfs(struct dentry *dentry, struct kstatfs *buf)
143 struct super_block *s = dentry->d_sb; 138 struct super_block *s = dentry->d_sb;
144 struct hpfs_sb_info *sbi = hpfs_sb(s); 139 struct hpfs_sb_info *sbi = hpfs_sb(s);
145 u64 id = huge_encode_dev(s->s_bdev->bd_dev); 140 u64 id = huge_encode_dev(s->s_bdev->bd_dev);
146 lock_kernel(); 141 hpfs_lock(s);
147 142
148 /*if (sbi->sb_n_free == -1) {*/ 143 /*if (sbi->sb_n_free == -1) {*/
149 sbi->sb_n_free = count_bitmaps(s); 144 sbi->sb_n_free = count_bitmaps(s);
@@ -160,7 +155,7 @@ static int hpfs_statfs(struct dentry *dentry, struct kstatfs *buf)
160 buf->f_fsid.val[1] = (u32)(id >> 32); 155 buf->f_fsid.val[1] = (u32)(id >> 32);
161 buf->f_namelen = 254; 156 buf->f_namelen = 254;
162 157
163 unlock_kernel(); 158 hpfs_unlock(s);
164 159
165 return 0; 160 return 0;
166} 161}
@@ -406,7 +401,7 @@ static int hpfs_remount_fs(struct super_block *s, int *flags, char *data)
406 401
407 *flags |= MS_NOATIME; 402 *flags |= MS_NOATIME;
408 403
409 lock_kernel(); 404 hpfs_lock(s);
410 lock_super(s); 405 lock_super(s);
411 uid = sbi->sb_uid; gid = sbi->sb_gid; 406 uid = sbi->sb_uid; gid = sbi->sb_gid;
412 umask = 0777 & ~sbi->sb_mode; 407 umask = 0777 & ~sbi->sb_mode;
@@ -441,12 +436,12 @@ static int hpfs_remount_fs(struct super_block *s, int *flags, char *data)
441 replace_mount_options(s, new_opts); 436 replace_mount_options(s, new_opts);
442 437
443 unlock_super(s); 438 unlock_super(s);
444 unlock_kernel(); 439 hpfs_unlock(s);
445 return 0; 440 return 0;
446 441
447out_err: 442out_err:
448 unlock_super(s); 443 unlock_super(s);
449 unlock_kernel(); 444 hpfs_unlock(s);
450 kfree(new_opts); 445 kfree(new_opts);
451 return -EINVAL; 446 return -EINVAL;
452} 447}
@@ -484,13 +479,15 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent)
484 479
485 int o; 480 int o;
486 481
487 lock_kernel(); 482 if (num_possible_cpus() > 1) {
483 printk(KERN_ERR "HPFS is not SMP safe\n");
484 return -EINVAL;
485 }
488 486
489 save_mount_options(s, options); 487 save_mount_options(s, options);
490 488
491 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); 489 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
492 if (!sbi) { 490 if (!sbi) {
493 unlock_kernel();
494 return -ENOMEM; 491 return -ENOMEM;
495 } 492 }
496 s->s_fs_info = sbi; 493 s->s_fs_info = sbi;
@@ -677,7 +674,6 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent)
677 root->i_blocks = 5; 674 root->i_blocks = 5;
678 hpfs_brelse4(&qbh); 675 hpfs_brelse4(&qbh);
679 } 676 }
680 unlock_kernel();
681 return 0; 677 return 0;
682 678
683bail4: brelse(bh2); 679bail4: brelse(bh2);
@@ -689,7 +685,6 @@ bail0:
689 kfree(sbi->sb_cp_table); 685 kfree(sbi->sb_cp_table);
690 s->s_fs_info = NULL; 686 s->s_fs_info = NULL;
691 kfree(sbi); 687 kfree(sbi);
692 unlock_kernel();
693 return -EINVAL; 688 return -EINVAL;
694} 689}
695 690
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 9885082b470f..b9eeb1cd03ff 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -332,8 +332,7 @@ static void truncate_huge_page(struct page *page)
332{ 332{
333 cancel_dirty_page(page, /* No IO accounting for huge pages? */0); 333 cancel_dirty_page(page, /* No IO accounting for huge pages? */0);
334 ClearPageUptodate(page); 334 ClearPageUptodate(page);
335 remove_from_page_cache(page); 335 delete_from_page_cache(page);
336 put_page(page);
337} 336}
338 337
339static void truncate_hugepages(struct inode *inode, loff_t lstart) 338static void truncate_hugepages(struct inode *inode, loff_t lstart)
diff --git a/fs/inode.c b/fs/inode.c
index da85e56378f3..05a1f75ae791 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -25,6 +25,39 @@
25#include <linux/async.h> 25#include <linux/async.h>
26#include <linux/posix_acl.h> 26#include <linux/posix_acl.h>
27#include <linux/ima.h> 27#include <linux/ima.h>
28#include <linux/cred.h>
29#include "internal.h"
30
31/*
32 * inode locking rules.
33 *
34 * inode->i_lock protects:
35 * inode->i_state, inode->i_hash, __iget()
36 * inode_lru_lock protects:
37 * inode_lru, inode->i_lru
38 * inode_sb_list_lock protects:
39 * sb->s_inodes, inode->i_sb_list
40 * inode_wb_list_lock protects:
41 * bdi->wb.b_{dirty,io,more_io}, inode->i_wb_list
42 * inode_hash_lock protects:
43 * inode_hashtable, inode->i_hash
44 *
45 * Lock ordering:
46 *
47 * inode_sb_list_lock
48 * inode->i_lock
49 * inode_lru_lock
50 *
51 * inode_wb_list_lock
52 * inode->i_lock
53 *
54 * inode_hash_lock
55 * inode_sb_list_lock
56 * inode->i_lock
57 *
58 * iunique_lock
59 * inode_hash_lock
60 */
28 61
29/* 62/*
30 * This is needed for the following functions: 63 * This is needed for the following functions:
@@ -59,6 +92,8 @@
59 92
60static unsigned int i_hash_mask __read_mostly; 93static unsigned int i_hash_mask __read_mostly;
61static unsigned int i_hash_shift __read_mostly; 94static unsigned int i_hash_shift __read_mostly;
95static struct hlist_head *inode_hashtable __read_mostly;
96static __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_hash_lock);
62 97
63/* 98/*
64 * Each inode can be on two separate lists. One is 99 * Each inode can be on two separate lists. One is
@@ -73,27 +108,19 @@ static unsigned int i_hash_shift __read_mostly;
73 */ 108 */
74 109
75static LIST_HEAD(inode_lru); 110static LIST_HEAD(inode_lru);
76static struct hlist_head *inode_hashtable __read_mostly; 111static DEFINE_SPINLOCK(inode_lru_lock);
77 112
78/* 113__cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_sb_list_lock);
79 * A simple spinlock to protect the list manipulations. 114__cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_wb_list_lock);
80 *
81 * NOTE! You also have to own the lock if you change
82 * the i_state of an inode while it is in use..
83 */
84DEFINE_SPINLOCK(inode_lock);
85 115
86/* 116/*
87 * iprune_sem provides exclusion between the kswapd or try_to_free_pages 117 * iprune_sem provides exclusion between the icache shrinking and the
88 * icache shrinking path, and the umount path. Without this exclusion, 118 * umount path.
89 * by the time prune_icache calls iput for the inode whose pages it has
90 * been invalidating, or by the time it calls clear_inode & destroy_inode
91 * from its final dispose_list, the struct super_block they refer to
92 * (for inode->i_sb->s_op) may already have been freed and reused.
93 * 119 *
94 * We make this an rwsem because the fastpath is icache shrinking. In 120 * We don't actually need it to protect anything in the umount path,
95 * some cases a filesystem may be doing a significant amount of work in 121 * but only need to cycle through it to make sure any inode that
96 * its inode reclaim code, so this should improve parallelism. 122 * prune_icache took off the LRU list has been fully torn down by the
123 * time we are past evict_inodes.
97 */ 124 */
98static DECLARE_RWSEM(iprune_sem); 125static DECLARE_RWSEM(iprune_sem);
99 126
@@ -139,15 +166,6 @@ int proc_nr_inodes(ctl_table *table, int write,
139} 166}
140#endif 167#endif
141 168
142static void wake_up_inode(struct inode *inode)
143{
144 /*
145 * Prevent speculative execution through spin_unlock(&inode_lock);
146 */
147 smp_mb();
148 wake_up_bit(&inode->i_state, __I_NEW);
149}
150
151/** 169/**
152 * inode_init_always - perform inode structure intialisation 170 * inode_init_always - perform inode structure intialisation
153 * @sb: superblock inode belongs to 171 * @sb: superblock inode belongs to
@@ -295,6 +313,20 @@ static void destroy_inode(struct inode *inode)
295 call_rcu(&inode->i_rcu, i_callback); 313 call_rcu(&inode->i_rcu, i_callback);
296} 314}
297 315
316void address_space_init_once(struct address_space *mapping)
317{
318 memset(mapping, 0, sizeof(*mapping));
319 INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC);
320 spin_lock_init(&mapping->tree_lock);
321 spin_lock_init(&mapping->i_mmap_lock);
322 INIT_LIST_HEAD(&mapping->private_list);
323 spin_lock_init(&mapping->private_lock);
324 INIT_RAW_PRIO_TREE_ROOT(&mapping->i_mmap);
325 INIT_LIST_HEAD(&mapping->i_mmap_nonlinear);
326 mutex_init(&mapping->unmap_mutex);
327}
328EXPORT_SYMBOL(address_space_init_once);
329
298/* 330/*
299 * These are initializations that only need to be done 331 * These are initializations that only need to be done
300 * once, because the fields are idempotent across use 332 * once, because the fields are idempotent across use
@@ -308,13 +340,7 @@ void inode_init_once(struct inode *inode)
308 INIT_LIST_HEAD(&inode->i_devices); 340 INIT_LIST_HEAD(&inode->i_devices);
309 INIT_LIST_HEAD(&inode->i_wb_list); 341 INIT_LIST_HEAD(&inode->i_wb_list);
310 INIT_LIST_HEAD(&inode->i_lru); 342 INIT_LIST_HEAD(&inode->i_lru);
311 INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC); 343 address_space_init_once(&inode->i_data);
312 spin_lock_init(&inode->i_data.tree_lock);
313 spin_lock_init(&inode->i_data.i_mmap_lock);
314 INIT_LIST_HEAD(&inode->i_data.private_list);
315 spin_lock_init(&inode->i_data.private_lock);
316 INIT_RAW_PRIO_TREE_ROOT(&inode->i_data.i_mmap);
317 INIT_LIST_HEAD(&inode->i_data.i_mmap_nonlinear);
318 i_size_ordered_init(inode); 344 i_size_ordered_init(inode);
319#ifdef CONFIG_FSNOTIFY 345#ifdef CONFIG_FSNOTIFY
320 INIT_HLIST_HEAD(&inode->i_fsnotify_marks); 346 INIT_HLIST_HEAD(&inode->i_fsnotify_marks);
@@ -330,7 +356,7 @@ static void init_once(void *foo)
330} 356}
331 357
332/* 358/*
333 * inode_lock must be held 359 * inode->i_lock must be held
334 */ 360 */
335void __iget(struct inode *inode) 361void __iget(struct inode *inode)
336{ 362{
@@ -348,23 +374,22 @@ EXPORT_SYMBOL(ihold);
348 374
349static void inode_lru_list_add(struct inode *inode) 375static void inode_lru_list_add(struct inode *inode)
350{ 376{
377 spin_lock(&inode_lru_lock);
351 if (list_empty(&inode->i_lru)) { 378 if (list_empty(&inode->i_lru)) {
352 list_add(&inode->i_lru, &inode_lru); 379 list_add(&inode->i_lru, &inode_lru);
353 inodes_stat.nr_unused++; 380 inodes_stat.nr_unused++;
354 } 381 }
382 spin_unlock(&inode_lru_lock);
355} 383}
356 384
357static void inode_lru_list_del(struct inode *inode) 385static void inode_lru_list_del(struct inode *inode)
358{ 386{
387 spin_lock(&inode_lru_lock);
359 if (!list_empty(&inode->i_lru)) { 388 if (!list_empty(&inode->i_lru)) {
360 list_del_init(&inode->i_lru); 389 list_del_init(&inode->i_lru);
361 inodes_stat.nr_unused--; 390 inodes_stat.nr_unused--;
362 } 391 }
363} 392 spin_unlock(&inode_lru_lock);
364
365static inline void __inode_sb_list_add(struct inode *inode)
366{
367 list_add(&inode->i_sb_list, &inode->i_sb->s_inodes);
368} 393}
369 394
370/** 395/**
@@ -373,15 +398,17 @@ static inline void __inode_sb_list_add(struct inode *inode)
373 */ 398 */
374void inode_sb_list_add(struct inode *inode) 399void inode_sb_list_add(struct inode *inode)
375{ 400{
376 spin_lock(&inode_lock); 401 spin_lock(&inode_sb_list_lock);
377 __inode_sb_list_add(inode); 402 list_add(&inode->i_sb_list, &inode->i_sb->s_inodes);
378 spin_unlock(&inode_lock); 403 spin_unlock(&inode_sb_list_lock);
379} 404}
380EXPORT_SYMBOL_GPL(inode_sb_list_add); 405EXPORT_SYMBOL_GPL(inode_sb_list_add);
381 406
382static inline void __inode_sb_list_del(struct inode *inode) 407static inline void inode_sb_list_del(struct inode *inode)
383{ 408{
409 spin_lock(&inode_sb_list_lock);
384 list_del_init(&inode->i_sb_list); 410 list_del_init(&inode->i_sb_list);
411 spin_unlock(&inode_sb_list_lock);
385} 412}
386 413
387static unsigned long hash(struct super_block *sb, unsigned long hashval) 414static unsigned long hash(struct super_block *sb, unsigned long hashval)
@@ -406,24 +433,15 @@ void __insert_inode_hash(struct inode *inode, unsigned long hashval)
406{ 433{
407 struct hlist_head *b = inode_hashtable + hash(inode->i_sb, hashval); 434 struct hlist_head *b = inode_hashtable + hash(inode->i_sb, hashval);
408 435
409 spin_lock(&inode_lock); 436 spin_lock(&inode_hash_lock);
437 spin_lock(&inode->i_lock);
410 hlist_add_head(&inode->i_hash, b); 438 hlist_add_head(&inode->i_hash, b);
411 spin_unlock(&inode_lock); 439 spin_unlock(&inode->i_lock);
440 spin_unlock(&inode_hash_lock);
412} 441}
413EXPORT_SYMBOL(__insert_inode_hash); 442EXPORT_SYMBOL(__insert_inode_hash);
414 443
415/** 444/**
416 * __remove_inode_hash - remove an inode from the hash
417 * @inode: inode to unhash
418 *
419 * Remove an inode from the superblock.
420 */
421static void __remove_inode_hash(struct inode *inode)
422{
423 hlist_del_init(&inode->i_hash);
424}
425
426/**
427 * remove_inode_hash - remove an inode from the hash 445 * remove_inode_hash - remove an inode from the hash
428 * @inode: inode to unhash 446 * @inode: inode to unhash
429 * 447 *
@@ -431,9 +449,11 @@ static void __remove_inode_hash(struct inode *inode)
431 */ 449 */
432void remove_inode_hash(struct inode *inode) 450void remove_inode_hash(struct inode *inode)
433{ 451{
434 spin_lock(&inode_lock); 452 spin_lock(&inode_hash_lock);
453 spin_lock(&inode->i_lock);
435 hlist_del_init(&inode->i_hash); 454 hlist_del_init(&inode->i_hash);
436 spin_unlock(&inode_lock); 455 spin_unlock(&inode->i_lock);
456 spin_unlock(&inode_hash_lock);
437} 457}
438EXPORT_SYMBOL(remove_inode_hash); 458EXPORT_SYMBOL(remove_inode_hash);
439 459
@@ -450,10 +470,29 @@ void end_writeback(struct inode *inode)
450} 470}
451EXPORT_SYMBOL(end_writeback); 471EXPORT_SYMBOL(end_writeback);
452 472
473/*
474 * Free the inode passed in, removing it from the lists it is still connected
475 * to. We remove any pages still attached to the inode and wait for any IO that
476 * is still in progress before finally destroying the inode.
477 *
478 * An inode must already be marked I_FREEING so that we avoid the inode being
479 * moved back onto lists if we race with other code that manipulates the lists
480 * (e.g. writeback_single_inode). The caller is responsible for setting this.
481 *
482 * An inode must already be removed from the LRU list before being evicted from
483 * the cache. This should occur atomically with setting the I_FREEING state
484 * flag, so no inodes here should ever be on the LRU when being evicted.
485 */
453static void evict(struct inode *inode) 486static void evict(struct inode *inode)
454{ 487{
455 const struct super_operations *op = inode->i_sb->s_op; 488 const struct super_operations *op = inode->i_sb->s_op;
456 489
490 BUG_ON(!(inode->i_state & I_FREEING));
491 BUG_ON(!list_empty(&inode->i_lru));
492
493 inode_wb_list_del(inode);
494 inode_sb_list_del(inode);
495
457 if (op->evict_inode) { 496 if (op->evict_inode) {
458 op->evict_inode(inode); 497 op->evict_inode(inode);
459 } else { 498 } else {
@@ -465,6 +504,15 @@ static void evict(struct inode *inode)
465 bd_forget(inode); 504 bd_forget(inode);
466 if (S_ISCHR(inode->i_mode) && inode->i_cdev) 505 if (S_ISCHR(inode->i_mode) && inode->i_cdev)
467 cd_forget(inode); 506 cd_forget(inode);
507
508 remove_inode_hash(inode);
509
510 spin_lock(&inode->i_lock);
511 wake_up_bit(&inode->i_state, __I_NEW);
512 BUG_ON(inode->i_state != (I_FREEING | I_CLEAR));
513 spin_unlock(&inode->i_lock);
514
515 destroy_inode(inode);
468} 516}
469 517
470/* 518/*
@@ -483,14 +531,6 @@ static void dispose_list(struct list_head *head)
483 list_del_init(&inode->i_lru); 531 list_del_init(&inode->i_lru);
484 532
485 evict(inode); 533 evict(inode);
486
487 spin_lock(&inode_lock);
488 __remove_inode_hash(inode);
489 __inode_sb_list_del(inode);
490 spin_unlock(&inode_lock);
491
492 wake_up_inode(inode);
493 destroy_inode(inode);
494 } 534 }
495} 535}
496 536
@@ -508,74 +548,77 @@ void evict_inodes(struct super_block *sb)
508 struct inode *inode, *next; 548 struct inode *inode, *next;
509 LIST_HEAD(dispose); 549 LIST_HEAD(dispose);
510 550
511 down_write(&iprune_sem); 551 spin_lock(&inode_sb_list_lock);
512
513 spin_lock(&inode_lock);
514 list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) { 552 list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
515 if (atomic_read(&inode->i_count)) 553 if (atomic_read(&inode->i_count))
516 continue; 554 continue;
517 555
556 spin_lock(&inode->i_lock);
518 if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) { 557 if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
519 WARN_ON(1); 558 spin_unlock(&inode->i_lock);
520 continue; 559 continue;
521 } 560 }
522 561
523 inode->i_state |= I_FREEING; 562 inode->i_state |= I_FREEING;
524 563 inode_lru_list_del(inode);
525 /* 564 spin_unlock(&inode->i_lock);
526 * Move the inode off the IO lists and LRU once I_FREEING is 565 list_add(&inode->i_lru, &dispose);
527 * set so that it won't get moved back on there if it is dirty.
528 */
529 list_move(&inode->i_lru, &dispose);
530 list_del_init(&inode->i_wb_list);
531 if (!(inode->i_state & (I_DIRTY | I_SYNC)))
532 inodes_stat.nr_unused--;
533 } 566 }
534 spin_unlock(&inode_lock); 567 spin_unlock(&inode_sb_list_lock);
535 568
536 dispose_list(&dispose); 569 dispose_list(&dispose);
570
571 /*
572 * Cycle through iprune_sem to make sure any inode that prune_icache
573 * moved off the list before we took the lock has been fully torn
574 * down.
575 */
576 down_write(&iprune_sem);
537 up_write(&iprune_sem); 577 up_write(&iprune_sem);
538} 578}
539 579
540/** 580/**
541 * invalidate_inodes - attempt to free all inodes on a superblock 581 * invalidate_inodes - attempt to free all inodes on a superblock
542 * @sb: superblock to operate on 582 * @sb: superblock to operate on
583 * @kill_dirty: flag to guide handling of dirty inodes
543 * 584 *
544 * Attempts to free all inodes for a given superblock. If there were any 585 * Attempts to free all inodes for a given superblock. If there were any
545 * busy inodes return a non-zero value, else zero. 586 * busy inodes return a non-zero value, else zero.
587 * If @kill_dirty is set, discard dirty inodes too, otherwise treat
588 * them as busy.
546 */ 589 */
547int invalidate_inodes(struct super_block *sb) 590int invalidate_inodes(struct super_block *sb, bool kill_dirty)
548{ 591{
549 int busy = 0; 592 int busy = 0;
550 struct inode *inode, *next; 593 struct inode *inode, *next;
551 LIST_HEAD(dispose); 594 LIST_HEAD(dispose);
552 595
553 down_write(&iprune_sem); 596 spin_lock(&inode_sb_list_lock);
554
555 spin_lock(&inode_lock);
556 list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) { 597 list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
557 if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) 598 spin_lock(&inode->i_lock);
599 if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
600 spin_unlock(&inode->i_lock);
558 continue; 601 continue;
602 }
603 if (inode->i_state & I_DIRTY && !kill_dirty) {
604 spin_unlock(&inode->i_lock);
605 busy = 1;
606 continue;
607 }
559 if (atomic_read(&inode->i_count)) { 608 if (atomic_read(&inode->i_count)) {
609 spin_unlock(&inode->i_lock);
560 busy = 1; 610 busy = 1;
561 continue; 611 continue;
562 } 612 }
563 613
564 inode->i_state |= I_FREEING; 614 inode->i_state |= I_FREEING;
565 615 inode_lru_list_del(inode);
566 /* 616 spin_unlock(&inode->i_lock);
567 * Move the inode off the IO lists and LRU once I_FREEING is 617 list_add(&inode->i_lru, &dispose);
568 * set so that it won't get moved back on there if it is dirty.
569 */
570 list_move(&inode->i_lru, &dispose);
571 list_del_init(&inode->i_wb_list);
572 if (!(inode->i_state & (I_DIRTY | I_SYNC)))
573 inodes_stat.nr_unused--;
574 } 618 }
575 spin_unlock(&inode_lock); 619 spin_unlock(&inode_sb_list_lock);
576 620
577 dispose_list(&dispose); 621 dispose_list(&dispose);
578 up_write(&iprune_sem);
579 622
580 return busy; 623 return busy;
581} 624}
@@ -595,7 +638,7 @@ static int can_unuse(struct inode *inode)
595 638
596/* 639/*
597 * Scan `goal' inodes on the unused list for freeable ones. They are moved to a 640 * Scan `goal' inodes on the unused list for freeable ones. They are moved to a
598 * temporary list and then are freed outside inode_lock by dispose_list(). 641 * temporary list and then are freed outside inode_lru_lock by dispose_list().
599 * 642 *
600 * Any inodes which are pinned purely because of attached pagecache have their 643 * Any inodes which are pinned purely because of attached pagecache have their
601 * pagecache removed. If the inode has metadata buffers attached to 644 * pagecache removed. If the inode has metadata buffers attached to
@@ -616,7 +659,7 @@ static void prune_icache(int nr_to_scan)
616 unsigned long reap = 0; 659 unsigned long reap = 0;
617 660
618 down_read(&iprune_sem); 661 down_read(&iprune_sem);
619 spin_lock(&inode_lock); 662 spin_lock(&inode_lru_lock);
620 for (nr_scanned = 0; nr_scanned < nr_to_scan; nr_scanned++) { 663 for (nr_scanned = 0; nr_scanned < nr_to_scan; nr_scanned++) {
621 struct inode *inode; 664 struct inode *inode;
622 665
@@ -626,53 +669,67 @@ static void prune_icache(int nr_to_scan)
626 inode = list_entry(inode_lru.prev, struct inode, i_lru); 669 inode = list_entry(inode_lru.prev, struct inode, i_lru);
627 670
628 /* 671 /*
672 * we are inverting the inode_lru_lock/inode->i_lock here,
673 * so use a trylock. If we fail to get the lock, just move the
674 * inode to the back of the list so we don't spin on it.
675 */
676 if (!spin_trylock(&inode->i_lock)) {
677 list_move(&inode->i_lru, &inode_lru);
678 continue;
679 }
680
681 /*
629 * Referenced or dirty inodes are still in use. Give them 682 * Referenced or dirty inodes are still in use. Give them
630 * another pass through the LRU as we canot reclaim them now. 683 * another pass through the LRU as we canot reclaim them now.
631 */ 684 */
632 if (atomic_read(&inode->i_count) || 685 if (atomic_read(&inode->i_count) ||
633 (inode->i_state & ~I_REFERENCED)) { 686 (inode->i_state & ~I_REFERENCED)) {
634 list_del_init(&inode->i_lru); 687 list_del_init(&inode->i_lru);
688 spin_unlock(&inode->i_lock);
635 inodes_stat.nr_unused--; 689 inodes_stat.nr_unused--;
636 continue; 690 continue;
637 } 691 }
638 692
639 /* recently referenced inodes get one more pass */ 693 /* recently referenced inodes get one more pass */
640 if (inode->i_state & I_REFERENCED) { 694 if (inode->i_state & I_REFERENCED) {
641 list_move(&inode->i_lru, &inode_lru);
642 inode->i_state &= ~I_REFERENCED; 695 inode->i_state &= ~I_REFERENCED;
696 list_move(&inode->i_lru, &inode_lru);
697 spin_unlock(&inode->i_lock);
643 continue; 698 continue;
644 } 699 }
645 if (inode_has_buffers(inode) || inode->i_data.nrpages) { 700 if (inode_has_buffers(inode) || inode->i_data.nrpages) {
646 __iget(inode); 701 __iget(inode);
647 spin_unlock(&inode_lock); 702 spin_unlock(&inode->i_lock);
703 spin_unlock(&inode_lru_lock);
648 if (remove_inode_buffers(inode)) 704 if (remove_inode_buffers(inode))
649 reap += invalidate_mapping_pages(&inode->i_data, 705 reap += invalidate_mapping_pages(&inode->i_data,
650 0, -1); 706 0, -1);
651 iput(inode); 707 iput(inode);
652 spin_lock(&inode_lock); 708 spin_lock(&inode_lru_lock);
653 709
654 if (inode != list_entry(inode_lru.next, 710 if (inode != list_entry(inode_lru.next,
655 struct inode, i_lru)) 711 struct inode, i_lru))
656 continue; /* wrong inode or list_empty */ 712 continue; /* wrong inode or list_empty */
657 if (!can_unuse(inode)) 713 /* avoid lock inversions with trylock */
714 if (!spin_trylock(&inode->i_lock))
715 continue;
716 if (!can_unuse(inode)) {
717 spin_unlock(&inode->i_lock);
658 continue; 718 continue;
719 }
659 } 720 }
660 WARN_ON(inode->i_state & I_NEW); 721 WARN_ON(inode->i_state & I_NEW);
661 inode->i_state |= I_FREEING; 722 inode->i_state |= I_FREEING;
723 spin_unlock(&inode->i_lock);
662 724
663 /*
664 * Move the inode off the IO lists and LRU once I_FREEING is
665 * set so that it won't get moved back on there if it is dirty.
666 */
667 list_move(&inode->i_lru, &freeable); 725 list_move(&inode->i_lru, &freeable);
668 list_del_init(&inode->i_wb_list);
669 inodes_stat.nr_unused--; 726 inodes_stat.nr_unused--;
670 } 727 }
671 if (current_is_kswapd()) 728 if (current_is_kswapd())
672 __count_vm_events(KSWAPD_INODESTEAL, reap); 729 __count_vm_events(KSWAPD_INODESTEAL, reap);
673 else 730 else
674 __count_vm_events(PGINODESTEAL, reap); 731 __count_vm_events(PGINODESTEAL, reap);
675 spin_unlock(&inode_lock); 732 spin_unlock(&inode_lru_lock);
676 733
677 dispose_list(&freeable); 734 dispose_list(&freeable);
678 up_read(&iprune_sem); 735 up_read(&iprune_sem);
@@ -721,15 +778,21 @@ static struct inode *find_inode(struct super_block *sb,
721 778
722repeat: 779repeat:
723 hlist_for_each_entry(inode, node, head, i_hash) { 780 hlist_for_each_entry(inode, node, head, i_hash) {
724 if (inode->i_sb != sb) 781 spin_lock(&inode->i_lock);
782 if (inode->i_sb != sb) {
783 spin_unlock(&inode->i_lock);
725 continue; 784 continue;
726 if (!test(inode, data)) 785 }
786 if (!test(inode, data)) {
787 spin_unlock(&inode->i_lock);
727 continue; 788 continue;
789 }
728 if (inode->i_state & (I_FREEING|I_WILL_FREE)) { 790 if (inode->i_state & (I_FREEING|I_WILL_FREE)) {
729 __wait_on_freeing_inode(inode); 791 __wait_on_freeing_inode(inode);
730 goto repeat; 792 goto repeat;
731 } 793 }
732 __iget(inode); 794 __iget(inode);
795 spin_unlock(&inode->i_lock);
733 return inode; 796 return inode;
734 } 797 }
735 return NULL; 798 return NULL;
@@ -747,15 +810,21 @@ static struct inode *find_inode_fast(struct super_block *sb,
747 810
748repeat: 811repeat:
749 hlist_for_each_entry(inode, node, head, i_hash) { 812 hlist_for_each_entry(inode, node, head, i_hash) {
750 if (inode->i_ino != ino) 813 spin_lock(&inode->i_lock);
814 if (inode->i_ino != ino) {
815 spin_unlock(&inode->i_lock);
751 continue; 816 continue;
752 if (inode->i_sb != sb) 817 }
818 if (inode->i_sb != sb) {
819 spin_unlock(&inode->i_lock);
753 continue; 820 continue;
821 }
754 if (inode->i_state & (I_FREEING|I_WILL_FREE)) { 822 if (inode->i_state & (I_FREEING|I_WILL_FREE)) {
755 __wait_on_freeing_inode(inode); 823 __wait_on_freeing_inode(inode);
756 goto repeat; 824 goto repeat;
757 } 825 }
758 __iget(inode); 826 __iget(inode);
827 spin_unlock(&inode->i_lock);
759 return inode; 828 return inode;
760 } 829 }
761 return NULL; 830 return NULL;
@@ -815,19 +884,26 @@ struct inode *new_inode(struct super_block *sb)
815{ 884{
816 struct inode *inode; 885 struct inode *inode;
817 886
818 spin_lock_prefetch(&inode_lock); 887 spin_lock_prefetch(&inode_sb_list_lock);
819 888
820 inode = alloc_inode(sb); 889 inode = alloc_inode(sb);
821 if (inode) { 890 if (inode) {
822 spin_lock(&inode_lock); 891 spin_lock(&inode->i_lock);
823 __inode_sb_list_add(inode);
824 inode->i_state = 0; 892 inode->i_state = 0;
825 spin_unlock(&inode_lock); 893 spin_unlock(&inode->i_lock);
894 inode_sb_list_add(inode);
826 } 895 }
827 return inode; 896 return inode;
828} 897}
829EXPORT_SYMBOL(new_inode); 898EXPORT_SYMBOL(new_inode);
830 899
900/**
901 * unlock_new_inode - clear the I_NEW state and wake up any waiters
902 * @inode: new inode to unlock
903 *
904 * Called when the inode is fully initialised to clear the new state of the
905 * inode and wake up anyone waiting for the inode to finish initialisation.
906 */
831void unlock_new_inode(struct inode *inode) 907void unlock_new_inode(struct inode *inode)
832{ 908{
833#ifdef CONFIG_DEBUG_LOCK_ALLOC 909#ifdef CONFIG_DEBUG_LOCK_ALLOC
@@ -847,51 +923,67 @@ void unlock_new_inode(struct inode *inode)
847 } 923 }
848 } 924 }
849#endif 925#endif
850 /* 926 spin_lock(&inode->i_lock);
851 * This is special! We do not need the spinlock when clearing I_NEW,
852 * because we're guaranteed that nobody else tries to do anything about
853 * the state of the inode when it is locked, as we just created it (so
854 * there can be no old holders that haven't tested I_NEW).
855 * However we must emit the memory barrier so that other CPUs reliably
856 * see the clearing of I_NEW after the other inode initialisation has
857 * completed.
858 */
859 smp_mb();
860 WARN_ON(!(inode->i_state & I_NEW)); 927 WARN_ON(!(inode->i_state & I_NEW));
861 inode->i_state &= ~I_NEW; 928 inode->i_state &= ~I_NEW;
862 wake_up_inode(inode); 929 wake_up_bit(&inode->i_state, __I_NEW);
930 spin_unlock(&inode->i_lock);
863} 931}
864EXPORT_SYMBOL(unlock_new_inode); 932EXPORT_SYMBOL(unlock_new_inode);
865 933
866/* 934/**
867 * This is called without the inode lock held.. Be careful. 935 * iget5_locked - obtain an inode from a mounted file system
936 * @sb: super block of file system
937 * @hashval: hash value (usually inode number) to get
938 * @test: callback used for comparisons between inodes
939 * @set: callback used to initialize a new struct inode
940 * @data: opaque data pointer to pass to @test and @set
941 *
942 * Search for the inode specified by @hashval and @data in the inode cache,
943 * and if present it is return it with an increased reference count. This is
944 * a generalized version of iget_locked() for file systems where the inode
945 * number is not sufficient for unique identification of an inode.
868 * 946 *
869 * We no longer cache the sb_flags in i_flags - see fs.h 947 * If the inode is not in cache, allocate a new inode and return it locked,
870 * -- rmk@arm.uk.linux.org 948 * hashed, and with the I_NEW flag set. The file system gets to fill it in
949 * before unlocking it via unlock_new_inode().
950 *
951 * Note both @test and @set are called with the inode_hash_lock held, so can't
952 * sleep.
871 */ 953 */
872static struct inode *get_new_inode(struct super_block *sb, 954struct inode *iget5_locked(struct super_block *sb, unsigned long hashval,
873 struct hlist_head *head, 955 int (*test)(struct inode *, void *),
874 int (*test)(struct inode *, void *), 956 int (*set)(struct inode *, void *), void *data)
875 int (*set)(struct inode *, void *),
876 void *data)
877{ 957{
958 struct hlist_head *head = inode_hashtable + hash(sb, hashval);
878 struct inode *inode; 959 struct inode *inode;
879 960
961 spin_lock(&inode_hash_lock);
962 inode = find_inode(sb, head, test, data);
963 spin_unlock(&inode_hash_lock);
964
965 if (inode) {
966 wait_on_inode(inode);
967 return inode;
968 }
969
880 inode = alloc_inode(sb); 970 inode = alloc_inode(sb);
881 if (inode) { 971 if (inode) {
882 struct inode *old; 972 struct inode *old;
883 973
884 spin_lock(&inode_lock); 974 spin_lock(&inode_hash_lock);
885 /* We released the lock, so.. */ 975 /* We released the lock, so.. */
886 old = find_inode(sb, head, test, data); 976 old = find_inode(sb, head, test, data);
887 if (!old) { 977 if (!old) {
888 if (set(inode, data)) 978 if (set(inode, data))
889 goto set_failed; 979 goto set_failed;
890 980
891 hlist_add_head(&inode->i_hash, head); 981 spin_lock(&inode->i_lock);
892 __inode_sb_list_add(inode);
893 inode->i_state = I_NEW; 982 inode->i_state = I_NEW;
894 spin_unlock(&inode_lock); 983 hlist_add_head(&inode->i_hash, head);
984 spin_unlock(&inode->i_lock);
985 inode_sb_list_add(inode);
986 spin_unlock(&inode_hash_lock);
895 987
896 /* Return the locked inode with I_NEW set, the 988 /* Return the locked inode with I_NEW set, the
897 * caller is responsible for filling in the contents 989 * caller is responsible for filling in the contents
@@ -904,7 +996,7 @@ static struct inode *get_new_inode(struct super_block *sb,
904 * us. Use the old inode instead of the one we just 996 * us. Use the old inode instead of the one we just
905 * allocated. 997 * allocated.
906 */ 998 */
907 spin_unlock(&inode_lock); 999 spin_unlock(&inode_hash_lock);
908 destroy_inode(inode); 1000 destroy_inode(inode);
909 inode = old; 1001 inode = old;
910 wait_on_inode(inode); 1002 wait_on_inode(inode);
@@ -912,33 +1004,53 @@ static struct inode *get_new_inode(struct super_block *sb,
912 return inode; 1004 return inode;
913 1005
914set_failed: 1006set_failed:
915 spin_unlock(&inode_lock); 1007 spin_unlock(&inode_hash_lock);
916 destroy_inode(inode); 1008 destroy_inode(inode);
917 return NULL; 1009 return NULL;
918} 1010}
1011EXPORT_SYMBOL(iget5_locked);
919 1012
920/* 1013/**
921 * get_new_inode_fast is the fast path version of get_new_inode, see the 1014 * iget_locked - obtain an inode from a mounted file system
922 * comment at iget_locked for details. 1015 * @sb: super block of file system
1016 * @ino: inode number to get
1017 *
1018 * Search for the inode specified by @ino in the inode cache and if present
1019 * return it with an increased reference count. This is for file systems
1020 * where the inode number is sufficient for unique identification of an inode.
1021 *
1022 * If the inode is not in cache, allocate a new inode and return it locked,
1023 * hashed, and with the I_NEW flag set. The file system gets to fill it in
1024 * before unlocking it via unlock_new_inode().
923 */ 1025 */
924static struct inode *get_new_inode_fast(struct super_block *sb, 1026struct inode *iget_locked(struct super_block *sb, unsigned long ino)
925 struct hlist_head *head, unsigned long ino)
926{ 1027{
1028 struct hlist_head *head = inode_hashtable + hash(sb, ino);
927 struct inode *inode; 1029 struct inode *inode;
928 1030
1031 spin_lock(&inode_hash_lock);
1032 inode = find_inode_fast(sb, head, ino);
1033 spin_unlock(&inode_hash_lock);
1034 if (inode) {
1035 wait_on_inode(inode);
1036 return inode;
1037 }
1038
929 inode = alloc_inode(sb); 1039 inode = alloc_inode(sb);
930 if (inode) { 1040 if (inode) {
931 struct inode *old; 1041 struct inode *old;
932 1042
933 spin_lock(&inode_lock); 1043 spin_lock(&inode_hash_lock);
934 /* We released the lock, so.. */ 1044 /* We released the lock, so.. */
935 old = find_inode_fast(sb, head, ino); 1045 old = find_inode_fast(sb, head, ino);
936 if (!old) { 1046 if (!old) {
937 inode->i_ino = ino; 1047 inode->i_ino = ino;
938 hlist_add_head(&inode->i_hash, head); 1048 spin_lock(&inode->i_lock);
939 __inode_sb_list_add(inode);
940 inode->i_state = I_NEW; 1049 inode->i_state = I_NEW;
941 spin_unlock(&inode_lock); 1050 hlist_add_head(&inode->i_hash, head);
1051 spin_unlock(&inode->i_lock);
1052 inode_sb_list_add(inode);
1053 spin_unlock(&inode_hash_lock);
942 1054
943 /* Return the locked inode with I_NEW set, the 1055 /* Return the locked inode with I_NEW set, the
944 * caller is responsible for filling in the contents 1056 * caller is responsible for filling in the contents
@@ -951,13 +1063,14 @@ static struct inode *get_new_inode_fast(struct super_block *sb,
951 * us. Use the old inode instead of the one we just 1063 * us. Use the old inode instead of the one we just
952 * allocated. 1064 * allocated.
953 */ 1065 */
954 spin_unlock(&inode_lock); 1066 spin_unlock(&inode_hash_lock);
955 destroy_inode(inode); 1067 destroy_inode(inode);
956 inode = old; 1068 inode = old;
957 wait_on_inode(inode); 1069 wait_on_inode(inode);
958 } 1070 }
959 return inode; 1071 return inode;
960} 1072}
1073EXPORT_SYMBOL(iget_locked);
961 1074
962/* 1075/*
963 * search the inode cache for a matching inode number. 1076 * search the inode cache for a matching inode number.
@@ -972,10 +1085,14 @@ static int test_inode_iunique(struct super_block *sb, unsigned long ino)
972 struct hlist_node *node; 1085 struct hlist_node *node;
973 struct inode *inode; 1086 struct inode *inode;
974 1087
1088 spin_lock(&inode_hash_lock);
975 hlist_for_each_entry(inode, node, b, i_hash) { 1089 hlist_for_each_entry(inode, node, b, i_hash) {
976 if (inode->i_ino == ino && inode->i_sb == sb) 1090 if (inode->i_ino == ino && inode->i_sb == sb) {
1091 spin_unlock(&inode_hash_lock);
977 return 0; 1092 return 0;
1093 }
978 } 1094 }
1095 spin_unlock(&inode_hash_lock);
979 1096
980 return 1; 1097 return 1;
981} 1098}
@@ -1005,7 +1122,6 @@ ino_t iunique(struct super_block *sb, ino_t max_reserved)
1005 static unsigned int counter; 1122 static unsigned int counter;
1006 ino_t res; 1123 ino_t res;
1007 1124
1008 spin_lock(&inode_lock);
1009 spin_lock(&iunique_lock); 1125 spin_lock(&iunique_lock);
1010 do { 1126 do {
1011 if (counter <= max_reserved) 1127 if (counter <= max_reserved)
@@ -1013,7 +1129,6 @@ ino_t iunique(struct super_block *sb, ino_t max_reserved)
1013 res = counter++; 1129 res = counter++;
1014 } while (!test_inode_iunique(sb, res)); 1130 } while (!test_inode_iunique(sb, res));
1015 spin_unlock(&iunique_lock); 1131 spin_unlock(&iunique_lock);
1016 spin_unlock(&inode_lock);
1017 1132
1018 return res; 1133 return res;
1019} 1134}
@@ -1021,116 +1136,50 @@ EXPORT_SYMBOL(iunique);
1021 1136
1022struct inode *igrab(struct inode *inode) 1137struct inode *igrab(struct inode *inode)
1023{ 1138{
1024 spin_lock(&inode_lock); 1139 spin_lock(&inode->i_lock);
1025 if (!(inode->i_state & (I_FREEING|I_WILL_FREE))) 1140 if (!(inode->i_state & (I_FREEING|I_WILL_FREE))) {
1026 __iget(inode); 1141 __iget(inode);
1027 else 1142 spin_unlock(&inode->i_lock);
1143 } else {
1144 spin_unlock(&inode->i_lock);
1028 /* 1145 /*
1029 * Handle the case where s_op->clear_inode is not been 1146 * Handle the case where s_op->clear_inode is not been
1030 * called yet, and somebody is calling igrab 1147 * called yet, and somebody is calling igrab
1031 * while the inode is getting freed. 1148 * while the inode is getting freed.
1032 */ 1149 */
1033 inode = NULL; 1150 inode = NULL;
1034 spin_unlock(&inode_lock); 1151 }
1035 return inode; 1152 return inode;
1036} 1153}
1037EXPORT_SYMBOL(igrab); 1154EXPORT_SYMBOL(igrab);
1038 1155
1039/** 1156/**
1040 * ifind - internal function, you want ilookup5() or iget5().
1041 * @sb: super block of file system to search
1042 * @head: the head of the list to search
1043 * @test: callback used for comparisons between inodes
1044 * @data: opaque data pointer to pass to @test
1045 * @wait: if true wait for the inode to be unlocked, if false do not
1046 *
1047 * ifind() searches for the inode specified by @data in the inode
1048 * cache. This is a generalized version of ifind_fast() for file systems where
1049 * the inode number is not sufficient for unique identification of an inode.
1050 *
1051 * If the inode is in the cache, the inode is returned with an incremented
1052 * reference count.
1053 *
1054 * Otherwise NULL is returned.
1055 *
1056 * Note, @test is called with the inode_lock held, so can't sleep.
1057 */
1058static struct inode *ifind(struct super_block *sb,
1059 struct hlist_head *head, int (*test)(struct inode *, void *),
1060 void *data, const int wait)
1061{
1062 struct inode *inode;
1063
1064 spin_lock(&inode_lock);
1065 inode = find_inode(sb, head, test, data);
1066 if (inode) {
1067 spin_unlock(&inode_lock);
1068 if (likely(wait))
1069 wait_on_inode(inode);
1070 return inode;
1071 }
1072 spin_unlock(&inode_lock);
1073 return NULL;
1074}
1075
1076/**
1077 * ifind_fast - internal function, you want ilookup() or iget().
1078 * @sb: super block of file system to search
1079 * @head: head of the list to search
1080 * @ino: inode number to search for
1081 *
1082 * ifind_fast() searches for the inode @ino in the inode cache. This is for
1083 * file systems where the inode number is sufficient for unique identification
1084 * of an inode.
1085 *
1086 * If the inode is in the cache, the inode is returned with an incremented
1087 * reference count.
1088 *
1089 * Otherwise NULL is returned.
1090 */
1091static struct inode *ifind_fast(struct super_block *sb,
1092 struct hlist_head *head, unsigned long ino)
1093{
1094 struct inode *inode;
1095
1096 spin_lock(&inode_lock);
1097 inode = find_inode_fast(sb, head, ino);
1098 if (inode) {
1099 spin_unlock(&inode_lock);
1100 wait_on_inode(inode);
1101 return inode;
1102 }
1103 spin_unlock(&inode_lock);
1104 return NULL;
1105}
1106
1107/**
1108 * ilookup5_nowait - search for an inode in the inode cache 1157 * ilookup5_nowait - search for an inode in the inode cache
1109 * @sb: super block of file system to search 1158 * @sb: super block of file system to search
1110 * @hashval: hash value (usually inode number) to search for 1159 * @hashval: hash value (usually inode number) to search for
1111 * @test: callback used for comparisons between inodes 1160 * @test: callback used for comparisons between inodes
1112 * @data: opaque data pointer to pass to @test 1161 * @data: opaque data pointer to pass to @test
1113 * 1162 *
1114 * ilookup5() uses ifind() to search for the inode specified by @hashval and 1163 * Search for the inode specified by @hashval and @data in the inode cache.
1115 * @data in the inode cache. This is a generalized version of ilookup() for
1116 * file systems where the inode number is not sufficient for unique
1117 * identification of an inode.
1118 *
1119 * If the inode is in the cache, the inode is returned with an incremented 1164 * If the inode is in the cache, the inode is returned with an incremented
1120 * reference count. Note, the inode lock is not waited upon so you have to be 1165 * reference count.
1121 * very careful what you do with the returned inode. You probably should be
1122 * using ilookup5() instead.
1123 * 1166 *
1124 * Otherwise NULL is returned. 1167 * Note: I_NEW is not waited upon so you have to be very careful what you do
1168 * with the returned inode. You probably should be using ilookup5() instead.
1125 * 1169 *
1126 * Note, @test is called with the inode_lock held, so can't sleep. 1170 * Note: @test is called with the inode_hash_lock held, so can't sleep.
1127 */ 1171 */
1128struct inode *ilookup5_nowait(struct super_block *sb, unsigned long hashval, 1172struct inode *ilookup5_nowait(struct super_block *sb, unsigned long hashval,
1129 int (*test)(struct inode *, void *), void *data) 1173 int (*test)(struct inode *, void *), void *data)
1130{ 1174{
1131 struct hlist_head *head = inode_hashtable + hash(sb, hashval); 1175 struct hlist_head *head = inode_hashtable + hash(sb, hashval);
1176 struct inode *inode;
1132 1177
1133 return ifind(sb, head, test, data, 0); 1178 spin_lock(&inode_hash_lock);
1179 inode = find_inode(sb, head, test, data);
1180 spin_unlock(&inode_hash_lock);
1181
1182 return inode;
1134} 1183}
1135EXPORT_SYMBOL(ilookup5_nowait); 1184EXPORT_SYMBOL(ilookup5_nowait);
1136 1185
@@ -1141,24 +1190,24 @@ EXPORT_SYMBOL(ilookup5_nowait);
1141 * @test: callback used for comparisons between inodes 1190 * @test: callback used for comparisons between inodes
1142 * @data: opaque data pointer to pass to @test 1191 * @data: opaque data pointer to pass to @test
1143 * 1192 *
1144 * ilookup5() uses ifind() to search for the inode specified by @hashval and 1193 * Search for the inode specified by @hashval and @data in the inode cache,
1145 * @data in the inode cache. This is a generalized version of ilookup() for 1194 * and if the inode is in the cache, return the inode with an incremented
1146 * file systems where the inode number is not sufficient for unique 1195 * reference count. Waits on I_NEW before returning the inode.
1147 * identification of an inode.
1148 *
1149 * If the inode is in the cache, the inode lock is waited upon and the inode is
1150 * returned with an incremented reference count. 1196 * returned with an incremented reference count.
1151 * 1197 *
1152 * Otherwise NULL is returned. 1198 * This is a generalized version of ilookup() for file systems where the
1199 * inode number is not sufficient for unique identification of an inode.
1153 * 1200 *
1154 * Note, @test is called with the inode_lock held, so can't sleep. 1201 * Note: @test is called with the inode_hash_lock held, so can't sleep.
1155 */ 1202 */
1156struct inode *ilookup5(struct super_block *sb, unsigned long hashval, 1203struct inode *ilookup5(struct super_block *sb, unsigned long hashval,
1157 int (*test)(struct inode *, void *), void *data) 1204 int (*test)(struct inode *, void *), void *data)
1158{ 1205{
1159 struct hlist_head *head = inode_hashtable + hash(sb, hashval); 1206 struct inode *inode = ilookup5_nowait(sb, hashval, test, data);
1160 1207
1161 return ifind(sb, head, test, data, 1); 1208 if (inode)
1209 wait_on_inode(inode);
1210 return inode;
1162} 1211}
1163EXPORT_SYMBOL(ilookup5); 1212EXPORT_SYMBOL(ilookup5);
1164 1213
@@ -1167,91 +1216,23 @@ EXPORT_SYMBOL(ilookup5);
1167 * @sb: super block of file system to search 1216 * @sb: super block of file system to search
1168 * @ino: inode number to search for 1217 * @ino: inode number to search for
1169 * 1218 *
1170 * ilookup() uses ifind_fast() to search for the inode @ino in the inode cache. 1219 * Search for the inode @ino in the inode cache, and if the inode is in the
1171 * This is for file systems where the inode number is sufficient for unique 1220 * cache, the inode is returned with an incremented reference count.
1172 * identification of an inode.
1173 *
1174 * If the inode is in the cache, the inode is returned with an incremented
1175 * reference count.
1176 *
1177 * Otherwise NULL is returned.
1178 */ 1221 */
1179struct inode *ilookup(struct super_block *sb, unsigned long ino) 1222struct inode *ilookup(struct super_block *sb, unsigned long ino)
1180{ 1223{
1181 struct hlist_head *head = inode_hashtable + hash(sb, ino); 1224 struct hlist_head *head = inode_hashtable + hash(sb, ino);
1182
1183 return ifind_fast(sb, head, ino);
1184}
1185EXPORT_SYMBOL(ilookup);
1186
1187/**
1188 * iget5_locked - obtain an inode from a mounted file system
1189 * @sb: super block of file system
1190 * @hashval: hash value (usually inode number) to get
1191 * @test: callback used for comparisons between inodes
1192 * @set: callback used to initialize a new struct inode
1193 * @data: opaque data pointer to pass to @test and @set
1194 *
1195 * iget5_locked() uses ifind() to search for the inode specified by @hashval
1196 * and @data in the inode cache and if present it is returned with an increased
1197 * reference count. This is a generalized version of iget_locked() for file
1198 * systems where the inode number is not sufficient for unique identification
1199 * of an inode.
1200 *
1201 * If the inode is not in cache, get_new_inode() is called to allocate a new
1202 * inode and this is returned locked, hashed, and with the I_NEW flag set. The
1203 * file system gets to fill it in before unlocking it via unlock_new_inode().
1204 *
1205 * Note both @test and @set are called with the inode_lock held, so can't sleep.
1206 */
1207struct inode *iget5_locked(struct super_block *sb, unsigned long hashval,
1208 int (*test)(struct inode *, void *),
1209 int (*set)(struct inode *, void *), void *data)
1210{
1211 struct hlist_head *head = inode_hashtable + hash(sb, hashval);
1212 struct inode *inode; 1225 struct inode *inode;
1213 1226
1214 inode = ifind(sb, head, test, data, 1); 1227 spin_lock(&inode_hash_lock);
1215 if (inode) 1228 inode = find_inode_fast(sb, head, ino);
1216 return inode; 1229 spin_unlock(&inode_hash_lock);
1217 /*
1218 * get_new_inode() will do the right thing, re-trying the search
1219 * in case it had to block at any point.
1220 */
1221 return get_new_inode(sb, head, test, set, data);
1222}
1223EXPORT_SYMBOL(iget5_locked);
1224
1225/**
1226 * iget_locked - obtain an inode from a mounted file system
1227 * @sb: super block of file system
1228 * @ino: inode number to get
1229 *
1230 * iget_locked() uses ifind_fast() to search for the inode specified by @ino in
1231 * the inode cache and if present it is returned with an increased reference
1232 * count. This is for file systems where the inode number is sufficient for
1233 * unique identification of an inode.
1234 *
1235 * If the inode is not in cache, get_new_inode_fast() is called to allocate a
1236 * new inode and this is returned locked, hashed, and with the I_NEW flag set.
1237 * The file system gets to fill it in before unlocking it via
1238 * unlock_new_inode().
1239 */
1240struct inode *iget_locked(struct super_block *sb, unsigned long ino)
1241{
1242 struct hlist_head *head = inode_hashtable + hash(sb, ino);
1243 struct inode *inode;
1244 1230
1245 inode = ifind_fast(sb, head, ino);
1246 if (inode) 1231 if (inode)
1247 return inode; 1232 wait_on_inode(inode);
1248 /* 1233 return inode;
1249 * get_new_inode_fast() will do the right thing, re-trying the search
1250 * in case it had to block at any point.
1251 */
1252 return get_new_inode_fast(sb, head, ino);
1253} 1234}
1254EXPORT_SYMBOL(iget_locked); 1235EXPORT_SYMBOL(ilookup);
1255 1236
1256int insert_inode_locked(struct inode *inode) 1237int insert_inode_locked(struct inode *inode)
1257{ 1238{
@@ -1259,27 +1240,33 @@ int insert_inode_locked(struct inode *inode)
1259 ino_t ino = inode->i_ino; 1240 ino_t ino = inode->i_ino;
1260 struct hlist_head *head = inode_hashtable + hash(sb, ino); 1241 struct hlist_head *head = inode_hashtable + hash(sb, ino);
1261 1242
1262 inode->i_state |= I_NEW;
1263 while (1) { 1243 while (1) {
1264 struct hlist_node *node; 1244 struct hlist_node *node;
1265 struct inode *old = NULL; 1245 struct inode *old = NULL;
1266 spin_lock(&inode_lock); 1246 spin_lock(&inode_hash_lock);
1267 hlist_for_each_entry(old, node, head, i_hash) { 1247 hlist_for_each_entry(old, node, head, i_hash) {
1268 if (old->i_ino != ino) 1248 if (old->i_ino != ino)
1269 continue; 1249 continue;
1270 if (old->i_sb != sb) 1250 if (old->i_sb != sb)
1271 continue; 1251 continue;
1272 if (old->i_state & (I_FREEING|I_WILL_FREE)) 1252 spin_lock(&old->i_lock);
1253 if (old->i_state & (I_FREEING|I_WILL_FREE)) {
1254 spin_unlock(&old->i_lock);
1273 continue; 1255 continue;
1256 }
1274 break; 1257 break;
1275 } 1258 }
1276 if (likely(!node)) { 1259 if (likely(!node)) {
1260 spin_lock(&inode->i_lock);
1261 inode->i_state |= I_NEW;
1277 hlist_add_head(&inode->i_hash, head); 1262 hlist_add_head(&inode->i_hash, head);
1278 spin_unlock(&inode_lock); 1263 spin_unlock(&inode->i_lock);
1264 spin_unlock(&inode_hash_lock);
1279 return 0; 1265 return 0;
1280 } 1266 }
1281 __iget(old); 1267 __iget(old);
1282 spin_unlock(&inode_lock); 1268 spin_unlock(&old->i_lock);
1269 spin_unlock(&inode_hash_lock);
1283 wait_on_inode(old); 1270 wait_on_inode(old);
1284 if (unlikely(!inode_unhashed(old))) { 1271 if (unlikely(!inode_unhashed(old))) {
1285 iput(old); 1272 iput(old);
@@ -1296,29 +1283,34 @@ int insert_inode_locked4(struct inode *inode, unsigned long hashval,
1296 struct super_block *sb = inode->i_sb; 1283 struct super_block *sb = inode->i_sb;
1297 struct hlist_head *head = inode_hashtable + hash(sb, hashval); 1284 struct hlist_head *head = inode_hashtable + hash(sb, hashval);
1298 1285
1299 inode->i_state |= I_NEW;
1300
1301 while (1) { 1286 while (1) {
1302 struct hlist_node *node; 1287 struct hlist_node *node;
1303 struct inode *old = NULL; 1288 struct inode *old = NULL;
1304 1289
1305 spin_lock(&inode_lock); 1290 spin_lock(&inode_hash_lock);
1306 hlist_for_each_entry(old, node, head, i_hash) { 1291 hlist_for_each_entry(old, node, head, i_hash) {
1307 if (old->i_sb != sb) 1292 if (old->i_sb != sb)
1308 continue; 1293 continue;
1309 if (!test(old, data)) 1294 if (!test(old, data))
1310 continue; 1295 continue;
1311 if (old->i_state & (I_FREEING|I_WILL_FREE)) 1296 spin_lock(&old->i_lock);
1297 if (old->i_state & (I_FREEING|I_WILL_FREE)) {
1298 spin_unlock(&old->i_lock);
1312 continue; 1299 continue;
1300 }
1313 break; 1301 break;
1314 } 1302 }
1315 if (likely(!node)) { 1303 if (likely(!node)) {
1304 spin_lock(&inode->i_lock);
1305 inode->i_state |= I_NEW;
1316 hlist_add_head(&inode->i_hash, head); 1306 hlist_add_head(&inode->i_hash, head);
1317 spin_unlock(&inode_lock); 1307 spin_unlock(&inode->i_lock);
1308 spin_unlock(&inode_hash_lock);
1318 return 0; 1309 return 0;
1319 } 1310 }
1320 __iget(old); 1311 __iget(old);
1321 spin_unlock(&inode_lock); 1312 spin_unlock(&old->i_lock);
1313 spin_unlock(&inode_hash_lock);
1322 wait_on_inode(old); 1314 wait_on_inode(old);
1323 if (unlikely(!inode_unhashed(old))) { 1315 if (unlikely(!inode_unhashed(old))) {
1324 iput(old); 1316 iput(old);
@@ -1363,47 +1355,35 @@ static void iput_final(struct inode *inode)
1363 const struct super_operations *op = inode->i_sb->s_op; 1355 const struct super_operations *op = inode->i_sb->s_op;
1364 int drop; 1356 int drop;
1365 1357
1358 WARN_ON(inode->i_state & I_NEW);
1359
1366 if (op && op->drop_inode) 1360 if (op && op->drop_inode)
1367 drop = op->drop_inode(inode); 1361 drop = op->drop_inode(inode);
1368 else 1362 else
1369 drop = generic_drop_inode(inode); 1363 drop = generic_drop_inode(inode);
1370 1364
1365 if (!drop && (sb->s_flags & MS_ACTIVE)) {
1366 inode->i_state |= I_REFERENCED;
1367 if (!(inode->i_state & (I_DIRTY|I_SYNC)))
1368 inode_lru_list_add(inode);
1369 spin_unlock(&inode->i_lock);
1370 return;
1371 }
1372
1371 if (!drop) { 1373 if (!drop) {
1372 if (sb->s_flags & MS_ACTIVE) {
1373 inode->i_state |= I_REFERENCED;
1374 if (!(inode->i_state & (I_DIRTY|I_SYNC))) {
1375 inode_lru_list_add(inode);
1376 }
1377 spin_unlock(&inode_lock);
1378 return;
1379 }
1380 WARN_ON(inode->i_state & I_NEW);
1381 inode->i_state |= I_WILL_FREE; 1374 inode->i_state |= I_WILL_FREE;
1382 spin_unlock(&inode_lock); 1375 spin_unlock(&inode->i_lock);
1383 write_inode_now(inode, 1); 1376 write_inode_now(inode, 1);
1384 spin_lock(&inode_lock); 1377 spin_lock(&inode->i_lock);
1385 WARN_ON(inode->i_state & I_NEW); 1378 WARN_ON(inode->i_state & I_NEW);
1386 inode->i_state &= ~I_WILL_FREE; 1379 inode->i_state &= ~I_WILL_FREE;
1387 __remove_inode_hash(inode);
1388 } 1380 }
1389 1381
1390 WARN_ON(inode->i_state & I_NEW);
1391 inode->i_state |= I_FREEING; 1382 inode->i_state |= I_FREEING;
1392
1393 /*
1394 * Move the inode off the IO lists and LRU once I_FREEING is
1395 * set so that it won't get moved back on there if it is dirty.
1396 */
1397 inode_lru_list_del(inode); 1383 inode_lru_list_del(inode);
1398 list_del_init(&inode->i_wb_list); 1384 spin_unlock(&inode->i_lock);
1399 1385
1400 __inode_sb_list_del(inode);
1401 spin_unlock(&inode_lock);
1402 evict(inode); 1386 evict(inode);
1403 remove_inode_hash(inode);
1404 wake_up_inode(inode);
1405 BUG_ON(inode->i_state != (I_FREEING | I_CLEAR));
1406 destroy_inode(inode);
1407} 1387}
1408 1388
1409/** 1389/**
@@ -1420,7 +1400,7 @@ void iput(struct inode *inode)
1420 if (inode) { 1400 if (inode) {
1421 BUG_ON(inode->i_state & I_CLEAR); 1401 BUG_ON(inode->i_state & I_CLEAR);
1422 1402
1423 if (atomic_dec_and_lock(&inode->i_count, &inode_lock)) 1403 if (atomic_dec_and_lock(&inode->i_count, &inode->i_lock))
1424 iput_final(inode); 1404 iput_final(inode);
1425 } 1405 }
1426} 1406}
@@ -1599,9 +1579,8 @@ EXPORT_SYMBOL(inode_wait);
1599 * to recheck inode state. 1579 * to recheck inode state.
1600 * 1580 *
1601 * It doesn't matter if I_NEW is not set initially, a call to 1581 * It doesn't matter if I_NEW is not set initially, a call to
1602 * wake_up_inode() after removing from the hash list will DTRT. 1582 * wake_up_bit(&inode->i_state, __I_NEW) after removing from the hash list
1603 * 1583 * will DTRT.
1604 * This is called with inode_lock held.
1605 */ 1584 */
1606static void __wait_on_freeing_inode(struct inode *inode) 1585static void __wait_on_freeing_inode(struct inode *inode)
1607{ 1586{
@@ -1609,10 +1588,11 @@ static void __wait_on_freeing_inode(struct inode *inode)
1609 DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW); 1588 DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW);
1610 wq = bit_waitqueue(&inode->i_state, __I_NEW); 1589 wq = bit_waitqueue(&inode->i_state, __I_NEW);
1611 prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE); 1590 prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
1612 spin_unlock(&inode_lock); 1591 spin_unlock(&inode->i_lock);
1592 spin_unlock(&inode_hash_lock);
1613 schedule(); 1593 schedule();
1614 finish_wait(wq, &wait.wait); 1594 finish_wait(wq, &wait.wait);
1615 spin_lock(&inode_lock); 1595 spin_lock(&inode_hash_lock);
1616} 1596}
1617 1597
1618static __initdata unsigned long ihash_entries; 1598static __initdata unsigned long ihash_entries;
@@ -1704,7 +1684,7 @@ void init_special_inode(struct inode *inode, umode_t mode, dev_t rdev)
1704EXPORT_SYMBOL(init_special_inode); 1684EXPORT_SYMBOL(init_special_inode);
1705 1685
1706/** 1686/**
1707 * Init uid,gid,mode for new inode according to posix standards 1687 * inode_init_owner - Init uid,gid,mode for new inode according to posix standards
1708 * @inode: New inode 1688 * @inode: New inode
1709 * @dir: Directory inode 1689 * @dir: Directory inode
1710 * @mode: mode of the new inode 1690 * @mode: mode of the new inode
@@ -1722,3 +1702,22 @@ void inode_init_owner(struct inode *inode, const struct inode *dir,
1722 inode->i_mode = mode; 1702 inode->i_mode = mode;
1723} 1703}
1724EXPORT_SYMBOL(inode_init_owner); 1704EXPORT_SYMBOL(inode_init_owner);
1705
1706/**
1707 * inode_owner_or_capable - check current task permissions to inode
1708 * @inode: inode being checked
1709 *
1710 * Return true if current either has CAP_FOWNER to the inode, or
1711 * owns the file.
1712 */
1713bool inode_owner_or_capable(const struct inode *inode)
1714{
1715 struct user_namespace *ns = inode_userns(inode);
1716
1717 if (current_user_ns() == ns && current_fsuid() == inode->i_uid)
1718 return true;
1719 if (ns_capable(ns, CAP_FOWNER))
1720 return true;
1721 return false;
1722}
1723EXPORT_SYMBOL(inode_owner_or_capable);
diff --git a/fs/internal.h b/fs/internal.h
index 0663568b1247..b29c46e4e32f 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -12,6 +12,7 @@
12#include <linux/lglock.h> 12#include <linux/lglock.h>
13 13
14struct super_block; 14struct super_block;
15struct file_system_type;
15struct linux_binprm; 16struct linux_binprm;
16struct path; 17struct path;
17 18
@@ -61,10 +62,9 @@ extern int check_unsafe_exec(struct linux_binprm *);
61extern int copy_mount_options(const void __user *, unsigned long *); 62extern int copy_mount_options(const void __user *, unsigned long *);
62extern int copy_mount_string(const void __user *, char **); 63extern int copy_mount_string(const void __user *, char **);
63 64
64extern void free_vfsmnt(struct vfsmount *);
65extern struct vfsmount *alloc_vfsmnt(const char *);
66extern unsigned int mnt_get_count(struct vfsmount *mnt); 65extern unsigned int mnt_get_count(struct vfsmount *mnt);
67extern struct vfsmount *__lookup_mnt(struct vfsmount *, struct dentry *, int); 66extern struct vfsmount *__lookup_mnt(struct vfsmount *, struct dentry *, int);
67extern struct vfsmount *lookup_mnt(struct path *);
68extern void mnt_set_mountpoint(struct vfsmount *, struct dentry *, 68extern void mnt_set_mountpoint(struct vfsmount *, struct dentry *,
69 struct vfsmount *); 69 struct vfsmount *);
70extern void release_mounts(struct list_head *); 70extern void release_mounts(struct list_head *);
@@ -99,6 +99,8 @@ extern struct file *get_empty_filp(void);
99extern int do_remount_sb(struct super_block *, int, void *, int); 99extern int do_remount_sb(struct super_block *, int, void *, int);
100extern void __put_super(struct super_block *sb); 100extern void __put_super(struct super_block *sb);
101extern void put_super(struct super_block *sb); 101extern void put_super(struct super_block *sb);
102extern struct dentry *mount_fs(struct file_system_type *,
103 int, const char *, void *);
102 104
103/* 105/*
104 * open.c 106 * open.c
@@ -106,10 +108,30 @@ extern void put_super(struct super_block *sb);
106struct nameidata; 108struct nameidata;
107extern struct file *nameidata_to_filp(struct nameidata *); 109extern struct file *nameidata_to_filp(struct nameidata *);
108extern void release_open_intent(struct nameidata *); 110extern void release_open_intent(struct nameidata *);
111struct open_flags {
112 int open_flag;
113 int mode;
114 int acc_mode;
115 int intent;
116};
117extern struct file *do_filp_open(int dfd, const char *pathname,
118 const struct open_flags *op, int lookup_flags);
119extern struct file *do_file_open_root(struct dentry *, struct vfsmount *,
120 const char *, const struct open_flags *, int lookup_flags);
121
122extern long do_handle_open(int mountdirfd,
123 struct file_handle __user *ufh, int open_flag);
109 124
110/* 125/*
111 * inode.c 126 * inode.c
112 */ 127 */
128extern spinlock_t inode_sb_list_lock;
129
130/*
131 * fs-writeback.c
132 */
133extern void inode_wb_list_del(struct inode *inode);
134
113extern int get_nr_dirty_inodes(void); 135extern int get_nr_dirty_inodes(void);
114extern void evict_inodes(struct super_block *); 136extern void evict_inodes(struct super_block *);
115extern int invalidate_inodes(struct super_block *); 137extern int invalidate_inodes(struct super_block *, bool);
diff --git a/fs/ioctl.c b/fs/ioctl.c
index a59635e295fa..1d9b9fcb2db4 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -273,6 +273,13 @@ int __generic_block_fiemap(struct inode *inode,
273 len = isize; 273 len = isize;
274 } 274 }
275 275
276 /*
277 * Some filesystems can't deal with being asked to map less than
278 * blocksize, so make sure our len is at least block length.
279 */
280 if (logical_to_blk(inode, len) == 0)
281 len = blk_to_logical(inode, 1);
282
276 start_blk = logical_to_blk(inode, start); 283 start_blk = logical_to_blk(inode, start);
277 last_blk = logical_to_blk(inode, start + len - 1); 284 last_blk = logical_to_blk(inode, start + len - 1);
278 285
@@ -541,6 +548,7 @@ int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd,
541{ 548{
542 int error = 0; 549 int error = 0;
543 int __user *argp = (int __user *)arg; 550 int __user *argp = (int __user *)arg;
551 struct inode *inode = filp->f_path.dentry->d_inode;
544 552
545 switch (cmd) { 553 switch (cmd) {
546 case FIOCLEX: 554 case FIOCLEX:
@@ -560,13 +568,11 @@ int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd,
560 break; 568 break;
561 569
562 case FIOQSIZE: 570 case FIOQSIZE:
563 if (S_ISDIR(filp->f_path.dentry->d_inode->i_mode) || 571 if (S_ISDIR(inode->i_mode) || S_ISREG(inode->i_mode) ||
564 S_ISREG(filp->f_path.dentry->d_inode->i_mode) || 572 S_ISLNK(inode->i_mode)) {
565 S_ISLNK(filp->f_path.dentry->d_inode->i_mode)) { 573 loff_t res = inode_get_bytes(inode);
566 loff_t res = 574 error = copy_to_user(argp, &res, sizeof(res)) ?
567 inode_get_bytes(filp->f_path.dentry->d_inode); 575 -EFAULT : 0;
568 error = copy_to_user((loff_t __user *)arg, &res,
569 sizeof(res)) ? -EFAULT : 0;
570 } else 576 } else
571 error = -ENOTTY; 577 error = -ENOTTY;
572 break; 578 break;
@@ -583,14 +589,10 @@ int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd,
583 return ioctl_fiemap(filp, arg); 589 return ioctl_fiemap(filp, arg);
584 590
585 case FIGETBSZ: 591 case FIGETBSZ:
586 { 592 return put_user(inode->i_sb->s_blocksize, argp);
587 struct inode *inode = filp->f_path.dentry->d_inode;
588 int __user *p = (int __user *)arg;
589 return put_user(inode->i_sb->s_blocksize, p);
590 }
591 593
592 default: 594 default:
593 if (S_ISREG(filp->f_path.dentry->d_inode->i_mode)) 595 if (S_ISREG(inode->i_mode))
594 error = file_ioctl(filp, cmd, arg); 596 error = file_ioctl(filp, cmd, arg);
595 else 597 else
596 error = vfs_ioctl(filp, cmd, arg); 598 error = vfs_ioctl(filp, cmd, arg);
diff --git a/fs/isofs/export.c b/fs/isofs/export.c
index ed752cb38474..dd4687ff30d0 100644
--- a/fs/isofs/export.c
+++ b/fs/isofs/export.c
@@ -124,9 +124,13 @@ isofs_export_encode_fh(struct dentry *dentry,
124 * offset of the inode and the upper 16 bits of fh32[1] to 124 * offset of the inode and the upper 16 bits of fh32[1] to
125 * hold the offset of the parent. 125 * hold the offset of the parent.
126 */ 126 */
127 127 if (connectable && (len < 5)) {
128 if (len < 3 || (connectable && len < 5)) 128 *max_len = 5;
129 return 255;
130 } else if (len < 3) {
131 *max_len = 3;
129 return 255; 132 return 255;
133 }
130 134
131 len = 3; 135 len = 3;
132 fh32[0] = ei->i_iget5_block; 136 fh32[0] = ei->i_iget5_block;
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index a0f3833c0dbf..3db5ba4568fc 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -1158,7 +1158,6 @@ static sector_t _isofs_bmap(struct address_space *mapping, sector_t block)
1158 1158
1159static const struct address_space_operations isofs_aops = { 1159static const struct address_space_operations isofs_aops = {
1160 .readpage = isofs_readpage, 1160 .readpage = isofs_readpage,
1161 .sync_page = block_sync_page,
1162 .bmap = _isofs_bmap 1161 .bmap = _isofs_bmap
1163}; 1162};
1164 1163
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index 34a4861c14b8..da871ee084d3 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -20,6 +20,7 @@
20#include <linux/mm.h> 20#include <linux/mm.h>
21#include <linux/pagemap.h> 21#include <linux/pagemap.h>
22#include <linux/bio.h> 22#include <linux/bio.h>
23#include <linux/blkdev.h>
23 24
24/* 25/*
25 * Default IO end handler for temporary BJ_IO buffer_heads. 26 * Default IO end handler for temporary BJ_IO buffer_heads.
@@ -294,7 +295,7 @@ void journal_commit_transaction(journal_t *journal)
294 int first_tag = 0; 295 int first_tag = 0;
295 int tag_flag; 296 int tag_flag;
296 int i; 297 int i;
297 int write_op = WRITE_SYNC; 298 struct blk_plug plug;
298 299
299 /* 300 /*
300 * First job: lock down the current transaction and wait for 301 * First job: lock down the current transaction and wait for
@@ -327,13 +328,6 @@ void journal_commit_transaction(journal_t *journal)
327 spin_lock(&journal->j_state_lock); 328 spin_lock(&journal->j_state_lock);
328 commit_transaction->t_state = T_LOCKED; 329 commit_transaction->t_state = T_LOCKED;
329 330
330 /*
331 * Use plugged writes here, since we want to submit several before
332 * we unplug the device. We don't do explicit unplugging in here,
333 * instead we rely on sync_buffer() doing the unplug for us.
334 */
335 if (commit_transaction->t_synchronous_commit)
336 write_op = WRITE_SYNC_PLUG;
337 spin_lock(&commit_transaction->t_handle_lock); 331 spin_lock(&commit_transaction->t_handle_lock);
338 while (commit_transaction->t_updates) { 332 while (commit_transaction->t_updates) {
339 DEFINE_WAIT(wait); 333 DEFINE_WAIT(wait);
@@ -418,8 +412,10 @@ void journal_commit_transaction(journal_t *journal)
418 * Now start flushing things to disk, in the order they appear 412 * Now start flushing things to disk, in the order they appear
419 * on the transaction lists. Data blocks go first. 413 * on the transaction lists. Data blocks go first.
420 */ 414 */
415 blk_start_plug(&plug);
421 err = journal_submit_data_buffers(journal, commit_transaction, 416 err = journal_submit_data_buffers(journal, commit_transaction,
422 write_op); 417 WRITE_SYNC);
418 blk_finish_plug(&plug);
423 419
424 /* 420 /*
425 * Wait for all previously submitted IO to complete. 421 * Wait for all previously submitted IO to complete.
@@ -480,7 +476,9 @@ void journal_commit_transaction(journal_t *journal)
480 err = 0; 476 err = 0;
481 } 477 }
482 478
483 journal_write_revoke_records(journal, commit_transaction, write_op); 479 blk_start_plug(&plug);
480
481 journal_write_revoke_records(journal, commit_transaction, WRITE_SYNC);
484 482
485 /* 483 /*
486 * If we found any dirty or locked buffers, then we should have 484 * If we found any dirty or locked buffers, then we should have
@@ -650,7 +648,7 @@ start_journal_io:
650 clear_buffer_dirty(bh); 648 clear_buffer_dirty(bh);
651 set_buffer_uptodate(bh); 649 set_buffer_uptodate(bh);
652 bh->b_end_io = journal_end_buffer_io_sync; 650 bh->b_end_io = journal_end_buffer_io_sync;
653 submit_bh(write_op, bh); 651 submit_bh(WRITE_SYNC, bh);
654 } 652 }
655 cond_resched(); 653 cond_resched();
656 654
@@ -661,6 +659,8 @@ start_journal_io:
661 } 659 }
662 } 660 }
663 661
662 blk_finish_plug(&plug);
663
664 /* Lo and behold: we have just managed to send a transaction to 664 /* Lo and behold: we have just managed to send a transaction to
665 the log. Before we can commit it, wait for the IO so far to 665 the log. Before we can commit it, wait for the IO so far to
666 complete. Control buffers being written are on the 666 complete. Control buffers being written are on the
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index da1b5e4ffce1..eb11601f2e00 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -839,7 +839,7 @@ journal_t * journal_init_inode (struct inode *inode)
839 err = journal_bmap(journal, 0, &blocknr); 839 err = journal_bmap(journal, 0, &blocknr);
840 /* If that failed, give up */ 840 /* If that failed, give up */
841 if (err) { 841 if (err) {
842 printk(KERN_ERR "%s: Cannnot locate journal superblock\n", 842 printk(KERN_ERR "%s: Cannot locate journal superblock\n",
843 __func__); 843 __func__);
844 goto out_err; 844 goto out_err;
845 } 845 }
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index f3ad1598b201..fa36d7662b21 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -137,9 +137,9 @@ static int journal_submit_commit_record(journal_t *journal,
137 if (journal->j_flags & JBD2_BARRIER && 137 if (journal->j_flags & JBD2_BARRIER &&
138 !JBD2_HAS_INCOMPAT_FEATURE(journal, 138 !JBD2_HAS_INCOMPAT_FEATURE(journal,
139 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) 139 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT))
140 ret = submit_bh(WRITE_SYNC_PLUG | WRITE_FLUSH_FUA, bh); 140 ret = submit_bh(WRITE_SYNC | WRITE_FLUSH_FUA, bh);
141 else 141 else
142 ret = submit_bh(WRITE_SYNC_PLUG, bh); 142 ret = submit_bh(WRITE_SYNC, bh);
143 143
144 *cbh = bh; 144 *cbh = bh;
145 return ret; 145 return ret;
@@ -329,7 +329,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
329 int tag_bytes = journal_tag_bytes(journal); 329 int tag_bytes = journal_tag_bytes(journal);
330 struct buffer_head *cbh = NULL; /* For transactional checksums */ 330 struct buffer_head *cbh = NULL; /* For transactional checksums */
331 __u32 crc32_sum = ~0; 331 __u32 crc32_sum = ~0;
332 int write_op = WRITE_SYNC; 332 struct blk_plug plug;
333 333
334 /* 334 /*
335 * First job: lock down the current transaction and wait for 335 * First job: lock down the current transaction and wait for
@@ -363,13 +363,6 @@ void jbd2_journal_commit_transaction(journal_t *journal)
363 write_lock(&journal->j_state_lock); 363 write_lock(&journal->j_state_lock);
364 commit_transaction->t_state = T_LOCKED; 364 commit_transaction->t_state = T_LOCKED;
365 365
366 /*
367 * Use plugged writes here, since we want to submit several before
368 * we unplug the device. We don't do explicit unplugging in here,
369 * instead we rely on sync_buffer() doing the unplug for us.
370 */
371 if (commit_transaction->t_synchronous_commit)
372 write_op = WRITE_SYNC_PLUG;
373 trace_jbd2_commit_locking(journal, commit_transaction); 366 trace_jbd2_commit_locking(journal, commit_transaction);
374 stats.run.rs_wait = commit_transaction->t_max_wait; 367 stats.run.rs_wait = commit_transaction->t_max_wait;
375 stats.run.rs_locked = jiffies; 368 stats.run.rs_locked = jiffies;
@@ -469,8 +462,10 @@ void jbd2_journal_commit_transaction(journal_t *journal)
469 if (err) 462 if (err)
470 jbd2_journal_abort(journal, err); 463 jbd2_journal_abort(journal, err);
471 464
465 blk_start_plug(&plug);
472 jbd2_journal_write_revoke_records(journal, commit_transaction, 466 jbd2_journal_write_revoke_records(journal, commit_transaction,
473 write_op); 467 WRITE_SYNC);
468 blk_finish_plug(&plug);
474 469
475 jbd_debug(3, "JBD: commit phase 2\n"); 470 jbd_debug(3, "JBD: commit phase 2\n");
476 471
@@ -497,6 +492,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
497 err = 0; 492 err = 0;
498 descriptor = NULL; 493 descriptor = NULL;
499 bufs = 0; 494 bufs = 0;
495 blk_start_plug(&plug);
500 while (commit_transaction->t_buffers) { 496 while (commit_transaction->t_buffers) {
501 497
502 /* Find the next buffer to be journaled... */ 498 /* Find the next buffer to be journaled... */
@@ -658,7 +654,7 @@ start_journal_io:
658 clear_buffer_dirty(bh); 654 clear_buffer_dirty(bh);
659 set_buffer_uptodate(bh); 655 set_buffer_uptodate(bh);
660 bh->b_end_io = journal_end_buffer_io_sync; 656 bh->b_end_io = journal_end_buffer_io_sync;
661 submit_bh(write_op, bh); 657 submit_bh(WRITE_SYNC, bh);
662 } 658 }
663 cond_resched(); 659 cond_resched();
664 stats.run.rs_blocks_logged += bufs; 660 stats.run.rs_blocks_logged += bufs;
@@ -699,6 +695,8 @@ start_journal_io:
699 __jbd2_journal_abort_hard(journal); 695 __jbd2_journal_abort_hard(journal);
700 } 696 }
701 697
698 blk_finish_plug(&plug);
699
702 /* Lo and behold: we have just managed to send a transaction to 700 /* Lo and behold: we have just managed to send a transaction to
703 the log. Before we can commit it, wait for the IO so far to 701 the log. Before we can commit it, wait for the IO so far to
704 complete. Control buffers being written are on the 702 complete. Control buffers being written are on the
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 9e4686900f18..90407b8fece7 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -473,7 +473,8 @@ int __jbd2_log_space_left(journal_t *journal)
473} 473}
474 474
475/* 475/*
476 * Called under j_state_lock. Returns true if a transaction commit was started. 476 * Called with j_state_lock locked for writing.
477 * Returns true if a transaction commit was started.
477 */ 478 */
478int __jbd2_log_start_commit(journal_t *journal, tid_t target) 479int __jbd2_log_start_commit(journal_t *journal, tid_t target)
479{ 480{
@@ -520,11 +521,13 @@ int jbd2_journal_force_commit_nested(journal_t *journal)
520{ 521{
521 transaction_t *transaction = NULL; 522 transaction_t *transaction = NULL;
522 tid_t tid; 523 tid_t tid;
524 int need_to_start = 0;
523 525
524 read_lock(&journal->j_state_lock); 526 read_lock(&journal->j_state_lock);
525 if (journal->j_running_transaction && !current->journal_info) { 527 if (journal->j_running_transaction && !current->journal_info) {
526 transaction = journal->j_running_transaction; 528 transaction = journal->j_running_transaction;
527 __jbd2_log_start_commit(journal, transaction->t_tid); 529 if (!tid_geq(journal->j_commit_request, transaction->t_tid))
530 need_to_start = 1;
528 } else if (journal->j_committing_transaction) 531 } else if (journal->j_committing_transaction)
529 transaction = journal->j_committing_transaction; 532 transaction = journal->j_committing_transaction;
530 533
@@ -535,6 +538,8 @@ int jbd2_journal_force_commit_nested(journal_t *journal)
535 538
536 tid = transaction->t_tid; 539 tid = transaction->t_tid;
537 read_unlock(&journal->j_state_lock); 540 read_unlock(&journal->j_state_lock);
541 if (need_to_start)
542 jbd2_log_start_commit(journal, tid);
538 jbd2_log_wait_commit(journal, tid); 543 jbd2_log_wait_commit(journal, tid);
539 return 1; 544 return 1;
540} 545}
@@ -986,7 +991,7 @@ journal_t * jbd2_journal_init_inode (struct inode *inode)
986 err = jbd2_journal_bmap(journal, 0, &blocknr); 991 err = jbd2_journal_bmap(journal, 0, &blocknr);
987 /* If that failed, give up */ 992 /* If that failed, give up */
988 if (err) { 993 if (err) {
989 printk(KERN_ERR "%s: Cannnot locate journal superblock\n", 994 printk(KERN_ERR "%s: Cannot locate journal superblock\n",
990 __func__); 995 __func__);
991 goto out_err; 996 goto out_err;
992 } 997 }
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index faad2bd787c7..1d1191050f99 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -117,10 +117,10 @@ static inline void update_t_max_wait(transaction_t *transaction)
117static int start_this_handle(journal_t *journal, handle_t *handle, 117static int start_this_handle(journal_t *journal, handle_t *handle,
118 int gfp_mask) 118 int gfp_mask)
119{ 119{
120 transaction_t *transaction; 120 transaction_t *transaction, *new_transaction = NULL;
121 int needed; 121 tid_t tid;
122 int nblocks = handle->h_buffer_credits; 122 int needed, need_to_start;
123 transaction_t *new_transaction = NULL; 123 int nblocks = handle->h_buffer_credits;
124 124
125 if (nblocks > journal->j_max_transaction_buffers) { 125 if (nblocks > journal->j_max_transaction_buffers) {
126 printk(KERN_ERR "JBD: %s wants too many credits (%d > %d)\n", 126 printk(KERN_ERR "JBD: %s wants too many credits (%d > %d)\n",
@@ -222,8 +222,11 @@ repeat:
222 atomic_sub(nblocks, &transaction->t_outstanding_credits); 222 atomic_sub(nblocks, &transaction->t_outstanding_credits);
223 prepare_to_wait(&journal->j_wait_transaction_locked, &wait, 223 prepare_to_wait(&journal->j_wait_transaction_locked, &wait,
224 TASK_UNINTERRUPTIBLE); 224 TASK_UNINTERRUPTIBLE);
225 __jbd2_log_start_commit(journal, transaction->t_tid); 225 tid = transaction->t_tid;
226 need_to_start = !tid_geq(journal->j_commit_request, tid);
226 read_unlock(&journal->j_state_lock); 227 read_unlock(&journal->j_state_lock);
228 if (need_to_start)
229 jbd2_log_start_commit(journal, tid);
227 schedule(); 230 schedule();
228 finish_wait(&journal->j_wait_transaction_locked, &wait); 231 finish_wait(&journal->j_wait_transaction_locked, &wait);
229 goto repeat; 232 goto repeat;
@@ -442,7 +445,8 @@ int jbd2__journal_restart(handle_t *handle, int nblocks, int gfp_mask)
442{ 445{
443 transaction_t *transaction = handle->h_transaction; 446 transaction_t *transaction = handle->h_transaction;
444 journal_t *journal = transaction->t_journal; 447 journal_t *journal = transaction->t_journal;
445 int ret; 448 tid_t tid;
449 int need_to_start, ret;
446 450
447 /* If we've had an abort of any type, don't even think about 451 /* If we've had an abort of any type, don't even think about
448 * actually doing the restart! */ 452 * actually doing the restart! */
@@ -465,8 +469,11 @@ int jbd2__journal_restart(handle_t *handle, int nblocks, int gfp_mask)
465 spin_unlock(&transaction->t_handle_lock); 469 spin_unlock(&transaction->t_handle_lock);
466 470
467 jbd_debug(2, "restarting handle %p\n", handle); 471 jbd_debug(2, "restarting handle %p\n", handle);
468 __jbd2_log_start_commit(journal, transaction->t_tid); 472 tid = transaction->t_tid;
473 need_to_start = !tid_geq(journal->j_commit_request, tid);
469 read_unlock(&journal->j_state_lock); 474 read_unlock(&journal->j_state_lock);
475 if (need_to_start)
476 jbd2_log_start_commit(journal, tid);
470 477
471 lock_map_release(&handle->h_lockdep_map); 478 lock_map_release(&handle->h_lockdep_map);
472 handle->h_buffer_credits = nblocks; 479 handle->h_buffer_credits = nblocks;
diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c
index 95b79672150a..828a0e1ea438 100644
--- a/fs/jffs2/acl.c
+++ b/fs/jffs2/acl.c
@@ -402,7 +402,7 @@ static int jffs2_acl_setxattr(struct dentry *dentry, const char *name,
402 402
403 if (name[0] != '\0') 403 if (name[0] != '\0')
404 return -EINVAL; 404 return -EINVAL;
405 if (!is_owner_or_cap(dentry->d_inode)) 405 if (!inode_owner_or_capable(dentry->d_inode))
406 return -EPERM; 406 return -EPERM;
407 407
408 if (value) { 408 if (value) {
diff --git a/fs/jffs2/compr_zlib.c b/fs/jffs2/compr_zlib.c
index fd05a0b9431d..5a001020c542 100644
--- a/fs/jffs2/compr_zlib.c
+++ b/fs/jffs2/compr_zlib.c
@@ -40,12 +40,13 @@ static z_stream inf_strm, def_strm;
40 40
41static int __init alloc_workspaces(void) 41static int __init alloc_workspaces(void)
42{ 42{
43 def_strm.workspace = vmalloc(zlib_deflate_workspacesize()); 43 def_strm.workspace = vmalloc(zlib_deflate_workspacesize(MAX_WBITS,
44 MAX_MEM_LEVEL));
44 if (!def_strm.workspace) { 45 if (!def_strm.workspace) {
45 printk(KERN_WARNING "Failed to allocate %d bytes for deflate workspace\n", zlib_deflate_workspacesize()); 46 printk(KERN_WARNING "Failed to allocate %d bytes for deflate workspace\n", zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL));
46 return -ENOMEM; 47 return -ENOMEM;
47 } 48 }
48 D1(printk(KERN_DEBUG "Allocated %d bytes for deflate workspace\n", zlib_deflate_workspacesize())); 49 D1(printk(KERN_DEBUG "Allocated %d bytes for deflate workspace\n", zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL)));
49 inf_strm.workspace = vmalloc(zlib_inflate_workspacesize()); 50 inf_strm.workspace = vmalloc(zlib_inflate_workspacesize());
50 if (!inf_strm.workspace) { 51 if (!inf_strm.workspace) {
51 printk(KERN_WARNING "Failed to allocate %d bytes for inflate workspace\n", zlib_inflate_workspacesize()); 52 printk(KERN_WARNING "Failed to allocate %d bytes for inflate workspace\n", zlib_inflate_workspacesize());
diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c
index 92978658ed18..82faddd1f321 100644
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -215,8 +215,7 @@ static int jffs2_create(struct inode *dir_i, struct dentry *dentry, int mode,
215 no chance of AB-BA deadlock involving its f->sem). */ 215 no chance of AB-BA deadlock involving its f->sem). */
216 mutex_unlock(&f->sem); 216 mutex_unlock(&f->sem);
217 217
218 ret = jffs2_do_create(c, dir_f, f, ri, 218 ret = jffs2_do_create(c, dir_f, f, ri, &dentry->d_name);
219 dentry->d_name.name, dentry->d_name.len);
220 if (ret) 219 if (ret)
221 goto fail; 220 goto fail;
222 221
@@ -386,7 +385,7 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char
386 385
387 jffs2_complete_reservation(c); 386 jffs2_complete_reservation(c);
388 387
389 ret = jffs2_init_security(inode, dir_i); 388 ret = jffs2_init_security(inode, dir_i, &dentry->d_name);
390 if (ret) 389 if (ret)
391 goto fail; 390 goto fail;
392 391
@@ -530,7 +529,7 @@ static int jffs2_mkdir (struct inode *dir_i, struct dentry *dentry, int mode)
530 529
531 jffs2_complete_reservation(c); 530 jffs2_complete_reservation(c);
532 531
533 ret = jffs2_init_security(inode, dir_i); 532 ret = jffs2_init_security(inode, dir_i, &dentry->d_name);
534 if (ret) 533 if (ret)
535 goto fail; 534 goto fail;
536 535
@@ -703,7 +702,7 @@ static int jffs2_mknod (struct inode *dir_i, struct dentry *dentry, int mode, de
703 702
704 jffs2_complete_reservation(c); 703 jffs2_complete_reservation(c);
705 704
706 ret = jffs2_init_security(inode, dir_i); 705 ret = jffs2_init_security(inode, dir_i, &dentry->d_name);
707 if (ret) 706 if (ret)
708 goto fail; 707 goto fail;
709 708
diff --git a/fs/jffs2/nodelist.h b/fs/jffs2/nodelist.h
index 5a53d9bdb2b5..e4619b00f7c5 100644
--- a/fs/jffs2/nodelist.h
+++ b/fs/jffs2/nodelist.h
@@ -401,7 +401,7 @@ int jffs2_write_inode_range(struct jffs2_sb_info *c, struct jffs2_inode_info *f,
401 struct jffs2_raw_inode *ri, unsigned char *buf, 401 struct jffs2_raw_inode *ri, unsigned char *buf,
402 uint32_t offset, uint32_t writelen, uint32_t *retlen); 402 uint32_t offset, uint32_t writelen, uint32_t *retlen);
403int jffs2_do_create(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, struct jffs2_inode_info *f, 403int jffs2_do_create(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, struct jffs2_inode_info *f,
404 struct jffs2_raw_inode *ri, const char *name, int namelen); 404 struct jffs2_raw_inode *ri, const struct qstr *qstr);
405int jffs2_do_unlink(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, const char *name, 405int jffs2_do_unlink(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, const char *name,
406 int namelen, struct jffs2_inode_info *dead_f, uint32_t time); 406 int namelen, struct jffs2_inode_info *dead_f, uint32_t time);
407int jffs2_do_link(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, uint32_t ino, 407int jffs2_do_link(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, uint32_t ino,
diff --git a/fs/jffs2/security.c b/fs/jffs2/security.c
index 239f51216a68..cfeb7164b085 100644
--- a/fs/jffs2/security.c
+++ b/fs/jffs2/security.c
@@ -23,14 +23,15 @@
23#include "nodelist.h" 23#include "nodelist.h"
24 24
25/* ---- Initial Security Label Attachment -------------- */ 25/* ---- Initial Security Label Attachment -------------- */
26int jffs2_init_security(struct inode *inode, struct inode *dir) 26int jffs2_init_security(struct inode *inode, struct inode *dir,
27 const struct qstr *qstr)
27{ 28{
28 int rc; 29 int rc;
29 size_t len; 30 size_t len;
30 void *value; 31 void *value;
31 char *name; 32 char *name;
32 33
33 rc = security_inode_init_security(inode, dir, &name, &value, &len); 34 rc = security_inode_init_security(inode, dir, qstr, &name, &value, &len);
34 if (rc) { 35 if (rc) {
35 if (rc == -EOPNOTSUPP) 36 if (rc == -EOPNOTSUPP)
36 return 0; 37 return 0;
diff --git a/fs/jffs2/write.c b/fs/jffs2/write.c
index c819eb0e982d..30d175b6d290 100644
--- a/fs/jffs2/write.c
+++ b/fs/jffs2/write.c
@@ -424,7 +424,9 @@ int jffs2_write_inode_range(struct jffs2_sb_info *c, struct jffs2_inode_info *f,
424 return ret; 424 return ret;
425} 425}
426 426
427int jffs2_do_create(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, struct jffs2_inode_info *f, struct jffs2_raw_inode *ri, const char *name, int namelen) 427int jffs2_do_create(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f,
428 struct jffs2_inode_info *f, struct jffs2_raw_inode *ri,
429 const struct qstr *qstr)
428{ 430{
429 struct jffs2_raw_dirent *rd; 431 struct jffs2_raw_dirent *rd;
430 struct jffs2_full_dnode *fn; 432 struct jffs2_full_dnode *fn;
@@ -466,15 +468,15 @@ int jffs2_do_create(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, str
466 mutex_unlock(&f->sem); 468 mutex_unlock(&f->sem);
467 jffs2_complete_reservation(c); 469 jffs2_complete_reservation(c);
468 470
469 ret = jffs2_init_security(&f->vfs_inode, &dir_f->vfs_inode); 471 ret = jffs2_init_security(&f->vfs_inode, &dir_f->vfs_inode, qstr);
470 if (ret) 472 if (ret)
471 return ret; 473 return ret;
472 ret = jffs2_init_acl_post(&f->vfs_inode); 474 ret = jffs2_init_acl_post(&f->vfs_inode);
473 if (ret) 475 if (ret)
474 return ret; 476 return ret;
475 477
476 ret = jffs2_reserve_space(c, sizeof(*rd)+namelen, &alloclen, 478 ret = jffs2_reserve_space(c, sizeof(*rd)+qstr->len, &alloclen,
477 ALLOC_NORMAL, JFFS2_SUMMARY_DIRENT_SIZE(namelen)); 479 ALLOC_NORMAL, JFFS2_SUMMARY_DIRENT_SIZE(qstr->len));
478 480
479 if (ret) { 481 if (ret) {
480 /* Eep. */ 482 /* Eep. */
@@ -493,19 +495,19 @@ int jffs2_do_create(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, str
493 495
494 rd->magic = cpu_to_je16(JFFS2_MAGIC_BITMASK); 496 rd->magic = cpu_to_je16(JFFS2_MAGIC_BITMASK);
495 rd->nodetype = cpu_to_je16(JFFS2_NODETYPE_DIRENT); 497 rd->nodetype = cpu_to_je16(JFFS2_NODETYPE_DIRENT);
496 rd->totlen = cpu_to_je32(sizeof(*rd) + namelen); 498 rd->totlen = cpu_to_je32(sizeof(*rd) + qstr->len);
497 rd->hdr_crc = cpu_to_je32(crc32(0, rd, sizeof(struct jffs2_unknown_node)-4)); 499 rd->hdr_crc = cpu_to_je32(crc32(0, rd, sizeof(struct jffs2_unknown_node)-4));
498 500
499 rd->pino = cpu_to_je32(dir_f->inocache->ino); 501 rd->pino = cpu_to_je32(dir_f->inocache->ino);
500 rd->version = cpu_to_je32(++dir_f->highest_version); 502 rd->version = cpu_to_je32(++dir_f->highest_version);
501 rd->ino = ri->ino; 503 rd->ino = ri->ino;
502 rd->mctime = ri->ctime; 504 rd->mctime = ri->ctime;
503 rd->nsize = namelen; 505 rd->nsize = qstr->len;
504 rd->type = DT_REG; 506 rd->type = DT_REG;
505 rd->node_crc = cpu_to_je32(crc32(0, rd, sizeof(*rd)-8)); 507 rd->node_crc = cpu_to_je32(crc32(0, rd, sizeof(*rd)-8));
506 rd->name_crc = cpu_to_je32(crc32(0, name, namelen)); 508 rd->name_crc = cpu_to_je32(crc32(0, qstr->name, qstr->len));
507 509
508 fd = jffs2_write_dirent(c, dir_f, rd, name, namelen, ALLOC_NORMAL); 510 fd = jffs2_write_dirent(c, dir_f, rd, qstr->name, qstr->len, ALLOC_NORMAL);
509 511
510 jffs2_free_raw_dirent(rd); 512 jffs2_free_raw_dirent(rd);
511 513
diff --git a/fs/jffs2/xattr.h b/fs/jffs2/xattr.h
index cf4f5759b42b..7be4beb306f3 100644
--- a/fs/jffs2/xattr.h
+++ b/fs/jffs2/xattr.h
@@ -121,10 +121,11 @@ extern ssize_t jffs2_listxattr(struct dentry *, char *, size_t);
121#endif /* CONFIG_JFFS2_FS_XATTR */ 121#endif /* CONFIG_JFFS2_FS_XATTR */
122 122
123#ifdef CONFIG_JFFS2_FS_SECURITY 123#ifdef CONFIG_JFFS2_FS_SECURITY
124extern int jffs2_init_security(struct inode *inode, struct inode *dir); 124extern int jffs2_init_security(struct inode *inode, struct inode *dir,
125 const struct qstr *qstr);
125extern const struct xattr_handler jffs2_security_xattr_handler; 126extern const struct xattr_handler jffs2_security_xattr_handler;
126#else 127#else
127#define jffs2_init_security(inode,dir) (0) 128#define jffs2_init_security(inode,dir,qstr) (0)
128#endif /* CONFIG_JFFS2_FS_SECURITY */ 129#endif /* CONFIG_JFFS2_FS_SECURITY */
129 130
130#endif /* _JFFS2_FS_XATTR_H_ */ 131#endif /* _JFFS2_FS_XATTR_H_ */
diff --git a/fs/jfs/Makefile b/fs/jfs/Makefile
index 3adb6395e42d..a58fa72d7e59 100644
--- a/fs/jfs/Makefile
+++ b/fs/jfs/Makefile
@@ -13,4 +13,4 @@ jfs-y := super.o file.o inode.o namei.o jfs_mount.o jfs_umount.o \
13 13
14jfs-$(CONFIG_JFS_POSIX_ACL) += acl.o 14jfs-$(CONFIG_JFS_POSIX_ACL) += acl.o
15 15
16EXTRA_CFLAGS += -D_JFS_4K 16ccflags-y := -D_JFS_4K
diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c
index 9978803ceedc..eddbb373209e 100644
--- a/fs/jfs/inode.c
+++ b/fs/jfs/inode.c
@@ -352,7 +352,6 @@ const struct address_space_operations jfs_aops = {
352 .readpages = jfs_readpages, 352 .readpages = jfs_readpages,
353 .writepage = jfs_writepage, 353 .writepage = jfs_writepage,
354 .writepages = jfs_writepages, 354 .writepages = jfs_writepages,
355 .sync_page = block_sync_page,
356 .write_begin = jfs_write_begin, 355 .write_begin = jfs_write_begin,
357 .write_end = nobh_write_end, 356 .write_end = nobh_write_end,
358 .bmap = jfs_bmap, 357 .bmap = jfs_bmap,
diff --git a/fs/jfs/ioctl.c b/fs/jfs/ioctl.c
index afe222bf300f..6f98a1866776 100644
--- a/fs/jfs/ioctl.c
+++ b/fs/jfs/ioctl.c
@@ -72,7 +72,7 @@ long jfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
72 if (err) 72 if (err)
73 return err; 73 return err;
74 74
75 if (!is_owner_or_cap(inode)) { 75 if (!inode_owner_or_capable(inode)) {
76 err = -EACCES; 76 err = -EACCES;
77 goto setflags_out; 77 goto setflags_out;
78 } 78 }
diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c
index 48b44bd8267b..6740d34cd82b 100644
--- a/fs/jfs/jfs_metapage.c
+++ b/fs/jfs/jfs_metapage.c
@@ -583,7 +583,6 @@ static void metapage_invalidatepage(struct page *page, unsigned long offset)
583const struct address_space_operations jfs_metapage_aops = { 583const struct address_space_operations jfs_metapage_aops = {
584 .readpage = metapage_readpage, 584 .readpage = metapage_readpage,
585 .writepage = metapage_writepage, 585 .writepage = metapage_writepage,
586 .sync_page = block_sync_page,
587 .releasepage = metapage_releasepage, 586 .releasepage = metapage_releasepage,
588 .invalidatepage = metapage_invalidatepage, 587 .invalidatepage = metapage_invalidatepage,
589 .set_page_dirty = __set_page_dirty_nobuffers, 588 .set_page_dirty = __set_page_dirty_nobuffers,
diff --git a/fs/jfs/jfs_xattr.h b/fs/jfs/jfs_xattr.h
index 88b6cc535bf2..e9e100fd7c09 100644
--- a/fs/jfs/jfs_xattr.h
+++ b/fs/jfs/jfs_xattr.h
@@ -62,10 +62,11 @@ extern ssize_t jfs_listxattr(struct dentry *, char *, size_t);
62extern int jfs_removexattr(struct dentry *, const char *); 62extern int jfs_removexattr(struct dentry *, const char *);
63 63
64#ifdef CONFIG_JFS_SECURITY 64#ifdef CONFIG_JFS_SECURITY
65extern int jfs_init_security(tid_t, struct inode *, struct inode *); 65extern int jfs_init_security(tid_t, struct inode *, struct inode *,
66 const struct qstr *);
66#else 67#else
67static inline int jfs_init_security(tid_t tid, struct inode *inode, 68static inline int jfs_init_security(tid_t tid, struct inode *inode,
68 struct inode *dir) 69 struct inode *dir, const struct qstr *qstr)
69{ 70{
70 return 0; 71 return 0;
71} 72}
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index 81ead850ddb6..eaaf2b511e89 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -115,7 +115,7 @@ static int jfs_create(struct inode *dip, struct dentry *dentry, int mode,
115 if (rc) 115 if (rc)
116 goto out3; 116 goto out3;
117 117
118 rc = jfs_init_security(tid, ip, dip); 118 rc = jfs_init_security(tid, ip, dip, &dentry->d_name);
119 if (rc) { 119 if (rc) {
120 txAbort(tid, 0); 120 txAbort(tid, 0);
121 goto out3; 121 goto out3;
@@ -253,7 +253,7 @@ static int jfs_mkdir(struct inode *dip, struct dentry *dentry, int mode)
253 if (rc) 253 if (rc)
254 goto out3; 254 goto out3;
255 255
256 rc = jfs_init_security(tid, ip, dip); 256 rc = jfs_init_security(tid, ip, dip, &dentry->d_name);
257 if (rc) { 257 if (rc) {
258 txAbort(tid, 0); 258 txAbort(tid, 0);
259 goto out3; 259 goto out3;
@@ -809,9 +809,6 @@ static int jfs_link(struct dentry *old_dentry,
809 if (ip->i_nlink == JFS_LINK_MAX) 809 if (ip->i_nlink == JFS_LINK_MAX)
810 return -EMLINK; 810 return -EMLINK;
811 811
812 if (ip->i_nlink == 0)
813 return -ENOENT;
814
815 dquot_initialize(dir); 812 dquot_initialize(dir);
816 813
817 tid = txBegin(ip->i_sb, 0); 814 tid = txBegin(ip->i_sb, 0);
@@ -932,7 +929,7 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry,
932 mutex_lock_nested(&JFS_IP(dip)->commit_mutex, COMMIT_MUTEX_PARENT); 929 mutex_lock_nested(&JFS_IP(dip)->commit_mutex, COMMIT_MUTEX_PARENT);
933 mutex_lock_nested(&JFS_IP(ip)->commit_mutex, COMMIT_MUTEX_CHILD); 930 mutex_lock_nested(&JFS_IP(ip)->commit_mutex, COMMIT_MUTEX_CHILD);
934 931
935 rc = jfs_init_security(tid, ip, dip); 932 rc = jfs_init_security(tid, ip, dip, &dentry->d_name);
936 if (rc) 933 if (rc)
937 goto out3; 934 goto out3;
938 935
@@ -1395,7 +1392,7 @@ static int jfs_mknod(struct inode *dir, struct dentry *dentry,
1395 if (rc) 1392 if (rc)
1396 goto out3; 1393 goto out3;
1397 1394
1398 rc = jfs_init_security(tid, ip, dir); 1395 rc = jfs_init_security(tid, ip, dir, &dentry->d_name);
1399 if (rc) { 1396 if (rc) {
1400 txAbort(tid, 0); 1397 txAbort(tid, 0);
1401 goto out3; 1398 goto out3;
@@ -1600,7 +1597,7 @@ out:
1600 1597
1601static int jfs_ci_revalidate(struct dentry *dentry, struct nameidata *nd) 1598static int jfs_ci_revalidate(struct dentry *dentry, struct nameidata *nd)
1602{ 1599{
1603 if (nd->flags & LOOKUP_RCU) 1600 if (nd && nd->flags & LOOKUP_RCU)
1604 return -ECHILD; 1601 return -ECHILD;
1605 /* 1602 /*
1606 * This is not negative dentry. Always valid. 1603 * This is not negative dentry. Always valid.
diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c
index 2d7f165d0f1d..24838f1eeee5 100644
--- a/fs/jfs/xattr.c
+++ b/fs/jfs/xattr.c
@@ -678,7 +678,7 @@ static int can_set_system_xattr(struct inode *inode, const char *name,
678 struct posix_acl *acl; 678 struct posix_acl *acl;
679 int rc; 679 int rc;
680 680
681 if (!is_owner_or_cap(inode)) 681 if (!inode_owner_or_capable(inode))
682 return -EPERM; 682 return -EPERM;
683 683
684 /* 684 /*
@@ -1091,7 +1091,8 @@ int jfs_removexattr(struct dentry *dentry, const char *name)
1091} 1091}
1092 1092
1093#ifdef CONFIG_JFS_SECURITY 1093#ifdef CONFIG_JFS_SECURITY
1094int jfs_init_security(tid_t tid, struct inode *inode, struct inode *dir) 1094int jfs_init_security(tid_t tid, struct inode *inode, struct inode *dir,
1095 const struct qstr *qstr)
1095{ 1096{
1096 int rc; 1097 int rc;
1097 size_t len; 1098 size_t len;
@@ -1099,7 +1100,8 @@ int jfs_init_security(tid_t tid, struct inode *inode, struct inode *dir)
1099 char *suffix; 1100 char *suffix;
1100 char *name; 1101 char *name;
1101 1102
1102 rc = security_inode_init_security(inode, dir, &suffix, &value, &len); 1103 rc = security_inode_init_security(inode, dir, qstr, &suffix, &value,
1104 &len);
1103 if (rc) { 1105 if (rc) {
1104 if (rc == -EOPNOTSUPP) 1106 if (rc == -EOPNOTSUPP)
1105 return 0; 1107 return 0;
diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index 5f1bcb2f06f3..b7c99bfb3da6 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -520,7 +520,7 @@ static struct nlm_host *next_host_state(struct hlist_head *cache,
520 struct nsm_handle *nsm, 520 struct nsm_handle *nsm,
521 const struct nlm_reboot *info) 521 const struct nlm_reboot *info)
522{ 522{
523 struct nlm_host *host = NULL; 523 struct nlm_host *host;
524 struct hlist_head *chain; 524 struct hlist_head *chain;
525 struct hlist_node *pos; 525 struct hlist_node *pos;
526 526
@@ -532,12 +532,13 @@ static struct nlm_host *next_host_state(struct hlist_head *cache,
532 host->h_state++; 532 host->h_state++;
533 533
534 nlm_get_host(host); 534 nlm_get_host(host);
535 goto out; 535 mutex_unlock(&nlm_host_mutex);
536 return host;
536 } 537 }
537 } 538 }
538out: 539
539 mutex_unlock(&nlm_host_mutex); 540 mutex_unlock(&nlm_host_mutex);
540 return host; 541 return NULL;
541} 542}
542 543
543/** 544/**
diff --git a/fs/locks.c b/fs/locks.c
index 0f3998291f78..0a4f50dfadfb 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -145,7 +145,6 @@ static DEFINE_SPINLOCK(file_lock_lock);
145 145
146/* 146/*
147 * Protects the two list heads above, plus the inode->i_flock list 147 * Protects the two list heads above, plus the inode->i_flock list
148 * FIXME: should use a spinlock, once lockd and ceph are ready.
149 */ 148 */
150void lock_flocks(void) 149void lock_flocks(void)
151{ 150{
@@ -415,17 +414,7 @@ static int flock64_to_posix_lock(struct file *filp, struct file_lock *fl,
415 fl->fl_ops = NULL; 414 fl->fl_ops = NULL;
416 fl->fl_lmops = NULL; 415 fl->fl_lmops = NULL;
417 416
418 switch (l->l_type) { 417 return assign_type(fl, l->l_type);
419 case F_RDLCK:
420 case F_WRLCK:
421 case F_UNLCK:
422 fl->fl_type = l->l_type;
423 break;
424 default:
425 return -EINVAL;
426 }
427
428 return (0);
429} 418}
430#endif 419#endif
431 420
diff --git a/fs/logfs/compr.c b/fs/logfs/compr.c
index 44bbfd249abc..961f02b86d97 100644
--- a/fs/logfs/compr.c
+++ b/fs/logfs/compr.c
@@ -81,7 +81,7 @@ error:
81 81
82int __init logfs_compr_init(void) 82int __init logfs_compr_init(void)
83{ 83{
84 size_t size = max(zlib_deflate_workspacesize(), 84 size_t size = max(zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL),
85 zlib_inflate_workspacesize()); 85 zlib_inflate_workspacesize());
86 stream.workspace = vmalloc(size); 86 stream.workspace = vmalloc(size);
87 if (!stream.workspace) 87 if (!stream.workspace)
diff --git a/fs/logfs/dev_bdev.c b/fs/logfs/dev_bdev.c
index 723bc5bca09a..1adc8d455f0e 100644
--- a/fs/logfs/dev_bdev.c
+++ b/fs/logfs/dev_bdev.c
@@ -39,7 +39,6 @@ static int sync_request(struct page *page, struct block_device *bdev, int rw)
39 bio.bi_end_io = request_complete; 39 bio.bi_end_io = request_complete;
40 40
41 submit_bio(rw, &bio); 41 submit_bio(rw, &bio);
42 generic_unplug_device(bdev_get_queue(bdev));
43 wait_for_completion(&complete); 42 wait_for_completion(&complete);
44 return test_bit(BIO_UPTODATE, &bio.bi_flags) ? 0 : -EIO; 43 return test_bit(BIO_UPTODATE, &bio.bi_flags) ? 0 : -EIO;
45} 44}
@@ -168,7 +167,6 @@ static void bdev_writeseg(struct super_block *sb, u64 ofs, size_t len)
168 } 167 }
169 len = PAGE_ALIGN(len); 168 len = PAGE_ALIGN(len);
170 __bdev_writeseg(sb, ofs, ofs >> PAGE_SHIFT, len >> PAGE_SHIFT); 169 __bdev_writeseg(sb, ofs, ofs >> PAGE_SHIFT, len >> PAGE_SHIFT);
171 generic_unplug_device(bdev_get_queue(logfs_super(sb)->s_bdev));
172} 170}
173 171
174 172
diff --git a/fs/logfs/file.c b/fs/logfs/file.c
index e86376b87af1..c2ad7028def4 100644
--- a/fs/logfs/file.c
+++ b/fs/logfs/file.c
@@ -196,7 +196,7 @@ long logfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
196 if (IS_RDONLY(inode)) 196 if (IS_RDONLY(inode))
197 return -EROFS; 197 return -EROFS;
198 198
199 if (!is_owner_or_cap(inode)) 199 if (!inode_owner_or_capable(inode))
200 return -EACCES; 200 return -EACCES;
201 201
202 err = get_user(flags, (int __user *)arg); 202 err = get_user(flags, (int __user *)arg);
diff --git a/fs/logfs/inode.c b/fs/logfs/inode.c
index 03b8c240aeda..edfea7a3a747 100644
--- a/fs/logfs/inode.c
+++ b/fs/logfs/inode.c
@@ -293,7 +293,7 @@ static int logfs_write_inode(struct inode *inode, struct writeback_control *wbc)
293 return ret; 293 return ret;
294} 294}
295 295
296/* called with inode_lock held */ 296/* called with inode->i_lock held */
297static int logfs_drop_inode(struct inode *inode) 297static int logfs_drop_inode(struct inode *inode)
298{ 298{
299 struct logfs_super *super = logfs_super(inode->i_sb); 299 struct logfs_super *super = logfs_super(inode->i_sb);
diff --git a/fs/minix/Kconfig b/fs/minix/Kconfig
index 0fd7ca994264..6624684dd5de 100644
--- a/fs/minix/Kconfig
+++ b/fs/minix/Kconfig
@@ -15,3 +15,11 @@ config MINIX_FS
15 module will be called minix. Note that the file system of your root 15 module will be called minix. Note that the file system of your root
16 partition (the one containing the directory /) cannot be compiled as 16 partition (the one containing the directory /) cannot be compiled as
17 a module. 17 a module.
18
19config MINIX_FS_NATIVE_ENDIAN
20 def_bool MINIX_FS
21 depends on H8300 || M32R || MICROBLAZE || MIPS || S390 || SUPERH || SPARC || XTENSA || (M68K && !MMU)
22
23config MINIX_FS_BIG_ENDIAN_16BIT_INDEXED
24 def_bool MINIX_FS
25 depends on M68K && MMU
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index ae0b83f476a6..adcdc0a4e182 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -399,7 +399,6 @@ static sector_t minix_bmap(struct address_space *mapping, sector_t block)
399static const struct address_space_operations minix_aops = { 399static const struct address_space_operations minix_aops = {
400 .readpage = minix_readpage, 400 .readpage = minix_readpage,
401 .writepage = minix_writepage, 401 .writepage = minix_writepage,
402 .sync_page = block_sync_page,
403 .write_begin = minix_write_begin, 402 .write_begin = minix_write_begin,
404 .write_end = generic_write_end, 403 .write_end = generic_write_end,
405 .bmap = minix_bmap 404 .bmap = minix_bmap
diff --git a/fs/minix/minix.h b/fs/minix/minix.h
index 407b1c84911e..341e2122879a 100644
--- a/fs/minix/minix.h
+++ b/fs/minix/minix.h
@@ -88,4 +88,78 @@ static inline struct minix_inode_info *minix_i(struct inode *inode)
88 return list_entry(inode, struct minix_inode_info, vfs_inode); 88 return list_entry(inode, struct minix_inode_info, vfs_inode);
89} 89}
90 90
91#if defined(CONFIG_MINIX_FS_NATIVE_ENDIAN) && \
92 defined(CONFIG_MINIX_FS_BIG_ENDIAN_16BIT_INDEXED)
93
94#error Minix file system byte order broken
95
96#elif defined(CONFIG_MINIX_FS_NATIVE_ENDIAN)
97
98/*
99 * big-endian 32 or 64 bit indexed bitmaps on big-endian system or
100 * little-endian bitmaps on little-endian system
101 */
102
103#define minix_test_and_set_bit(nr, addr) \
104 __test_and_set_bit((nr), (unsigned long *)(addr))
105#define minix_set_bit(nr, addr) \
106 __set_bit((nr), (unsigned long *)(addr))
107#define minix_test_and_clear_bit(nr, addr) \
108 __test_and_clear_bit((nr), (unsigned long *)(addr))
109#define minix_test_bit(nr, addr) \
110 test_bit((nr), (unsigned long *)(addr))
111#define minix_find_first_zero_bit(addr, size) \
112 find_first_zero_bit((unsigned long *)(addr), (size))
113
114#elif defined(CONFIG_MINIX_FS_BIG_ENDIAN_16BIT_INDEXED)
115
116/*
117 * big-endian 16bit indexed bitmaps
118 */
119
120static inline int minix_find_first_zero_bit(const void *vaddr, unsigned size)
121{
122 const unsigned short *p = vaddr, *addr = vaddr;
123 unsigned short num;
124
125 if (!size)
126 return 0;
127
128 size = (size >> 4) + ((size & 15) > 0);
129 while (*p++ == 0xffff) {
130 if (--size == 0)
131 return (p - addr) << 4;
132 }
133
134 num = *--p;
135 return ((p - addr) << 4) + ffz(num);
136}
137
138#define minix_test_and_set_bit(nr, addr) \
139 __test_and_set_bit((nr) ^ 16, (unsigned long *)(addr))
140#define minix_set_bit(nr, addr) \
141 __set_bit((nr) ^ 16, (unsigned long *)(addr))
142#define minix_test_and_clear_bit(nr, addr) \
143 __test_and_clear_bit((nr) ^ 16, (unsigned long *)(addr))
144
145static inline int minix_test_bit(int nr, const void *vaddr)
146{
147 const unsigned short *p = vaddr;
148 return (p[nr >> 4] & (1U << (nr & 15))) != 0;
149}
150
151#else
152
153/*
154 * little-endian bitmaps
155 */
156
157#define minix_test_and_set_bit __test_and_set_bit_le
158#define minix_set_bit __set_bit_le
159#define minix_test_and_clear_bit __test_and_clear_bit_le
160#define minix_test_bit test_bit_le
161#define minix_find_first_zero_bit find_first_zero_bit_le
162
163#endif
164
91#endif /* FS_MINIX_H */ 165#endif /* FS_MINIX_H */
diff --git a/fs/minix/namei.c b/fs/minix/namei.c
index ce7337ddfdbf..6e6777f1b4b2 100644
--- a/fs/minix/namei.c
+++ b/fs/minix/namei.c
@@ -213,7 +213,6 @@ static int minix_rename(struct inode * old_dir, struct dentry *old_dentry,
213 new_de = minix_find_entry(new_dentry, &new_page); 213 new_de = minix_find_entry(new_dentry, &new_page);
214 if (!new_de) 214 if (!new_de)
215 goto out_dir; 215 goto out_dir;
216 inode_inc_link_count(old_inode);
217 minix_set_link(new_de, new_page, old_inode); 216 minix_set_link(new_de, new_page, old_inode);
218 new_inode->i_ctime = CURRENT_TIME_SEC; 217 new_inode->i_ctime = CURRENT_TIME_SEC;
219 if (dir_de) 218 if (dir_de)
@@ -225,18 +224,15 @@ static int minix_rename(struct inode * old_dir, struct dentry *old_dentry,
225 if (new_dir->i_nlink >= info->s_link_max) 224 if (new_dir->i_nlink >= info->s_link_max)
226 goto out_dir; 225 goto out_dir;
227 } 226 }
228 inode_inc_link_count(old_inode);
229 err = minix_add_link(new_dentry, old_inode); 227 err = minix_add_link(new_dentry, old_inode);
230 if (err) { 228 if (err)
231 inode_dec_link_count(old_inode);
232 goto out_dir; 229 goto out_dir;
233 }
234 if (dir_de) 230 if (dir_de)
235 inode_inc_link_count(new_dir); 231 inode_inc_link_count(new_dir);
236 } 232 }
237 233
238 minix_delete_entry(old_de, old_page); 234 minix_delete_entry(old_de, old_page);
239 inode_dec_link_count(old_inode); 235 mark_inode_dirty(old_inode);
240 236
241 if (dir_de) { 237 if (dir_de) {
242 minix_set_link(dir_de, dir_page, new_dir); 238 minix_set_link(dir_de, dir_page, new_dir);
diff --git a/fs/mpage.c b/fs/mpage.c
index d78455a81ec9..0afc809e46e0 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -364,6 +364,9 @@ mpage_readpages(struct address_space *mapping, struct list_head *pages,
364 sector_t last_block_in_bio = 0; 364 sector_t last_block_in_bio = 0;
365 struct buffer_head map_bh; 365 struct buffer_head map_bh;
366 unsigned long first_logical_block = 0; 366 unsigned long first_logical_block = 0;
367 struct blk_plug plug;
368
369 blk_start_plug(&plug);
367 370
368 map_bh.b_state = 0; 371 map_bh.b_state = 0;
369 map_bh.b_size = 0; 372 map_bh.b_size = 0;
@@ -385,6 +388,7 @@ mpage_readpages(struct address_space *mapping, struct list_head *pages,
385 BUG_ON(!list_empty(pages)); 388 BUG_ON(!list_empty(pages));
386 if (bio) 389 if (bio)
387 mpage_bio_submit(READ, bio); 390 mpage_bio_submit(READ, bio);
391 blk_finish_plug(&plug);
388 return 0; 392 return 0;
389} 393}
390EXPORT_SYMBOL(mpage_readpages); 394EXPORT_SYMBOL(mpage_readpages);
@@ -666,8 +670,11 @@ int
666mpage_writepages(struct address_space *mapping, 670mpage_writepages(struct address_space *mapping,
667 struct writeback_control *wbc, get_block_t get_block) 671 struct writeback_control *wbc, get_block_t get_block)
668{ 672{
673 struct blk_plug plug;
669 int ret; 674 int ret;
670 675
676 blk_start_plug(&plug);
677
671 if (!get_block) 678 if (!get_block)
672 ret = generic_writepages(mapping, wbc); 679 ret = generic_writepages(mapping, wbc);
673 else { 680 else {
@@ -682,6 +689,7 @@ mpage_writepages(struct address_space *mapping,
682 if (mpd.bio) 689 if (mpd.bio)
683 mpage_bio_submit(WRITE, mpd.bio); 690 mpage_bio_submit(WRITE, mpd.bio);
684 } 691 }
692 blk_finish_plug(&plug);
685 return ret; 693 return ret;
686} 694}
687EXPORT_SYMBOL(mpage_writepages); 695EXPORT_SYMBOL(mpage_writepages);
diff --git a/fs/namei.c b/fs/namei.c
index 7d77f24d32a9..3cb616d38d9c 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -136,7 +136,7 @@ static int do_getname(const char __user *filename, char *page)
136 return retval; 136 return retval;
137} 137}
138 138
139char * getname(const char __user * filename) 139static char *getname_flags(const char __user * filename, int flags)
140{ 140{
141 char *tmp, *result; 141 char *tmp, *result;
142 142
@@ -147,14 +147,21 @@ char * getname(const char __user * filename)
147 147
148 result = tmp; 148 result = tmp;
149 if (retval < 0) { 149 if (retval < 0) {
150 __putname(tmp); 150 if (retval != -ENOENT || !(flags & LOOKUP_EMPTY)) {
151 result = ERR_PTR(retval); 151 __putname(tmp);
152 result = ERR_PTR(retval);
153 }
152 } 154 }
153 } 155 }
154 audit_getname(result); 156 audit_getname(result);
155 return result; 157 return result;
156} 158}
157 159
160char *getname(const char __user * filename)
161{
162 return getname_flags(filename, 0);
163}
164
158#ifdef CONFIG_AUDITSYSCALL 165#ifdef CONFIG_AUDITSYSCALL
159void putname(const char *name) 166void putname(const char *name)
160{ 167{
@@ -176,6 +183,9 @@ static int acl_permission_check(struct inode *inode, int mask, unsigned int flag
176 183
177 mask &= MAY_READ | MAY_WRITE | MAY_EXEC; 184 mask &= MAY_READ | MAY_WRITE | MAY_EXEC;
178 185
186 if (current_user_ns() != inode_userns(inode))
187 goto other_perms;
188
179 if (current_fsuid() == inode->i_uid) 189 if (current_fsuid() == inode->i_uid)
180 mode >>= 6; 190 mode >>= 6;
181 else { 191 else {
@@ -189,6 +199,7 @@ static int acl_permission_check(struct inode *inode, int mask, unsigned int flag
189 mode >>= 3; 199 mode >>= 3;
190 } 200 }
191 201
202other_perms:
192 /* 203 /*
193 * If the DACs are ok we don't need any capability check. 204 * If the DACs are ok we don't need any capability check.
194 */ 205 */
@@ -230,7 +241,7 @@ int generic_permission(struct inode *inode, int mask, unsigned int flags,
230 * Executable DACs are overridable if at least one exec bit is set. 241 * Executable DACs are overridable if at least one exec bit is set.
231 */ 242 */
232 if (!(mask & MAY_EXEC) || execute_ok(inode)) 243 if (!(mask & MAY_EXEC) || execute_ok(inode))
233 if (capable(CAP_DAC_OVERRIDE)) 244 if (ns_capable(inode_userns(inode), CAP_DAC_OVERRIDE))
234 return 0; 245 return 0;
235 246
236 /* 247 /*
@@ -238,7 +249,7 @@ int generic_permission(struct inode *inode, int mask, unsigned int flags,
238 */ 249 */
239 mask &= MAY_READ | MAY_WRITE | MAY_EXEC; 250 mask &= MAY_READ | MAY_WRITE | MAY_EXEC;
240 if (mask == MAY_READ || (S_ISDIR(inode->i_mode) && !(mask & MAY_WRITE))) 251 if (mask == MAY_READ || (S_ISDIR(inode->i_mode) && !(mask & MAY_WRITE)))
241 if (capable(CAP_DAC_READ_SEARCH)) 252 if (ns_capable(inode_userns(inode), CAP_DAC_READ_SEARCH))
242 return 0; 253 return 0;
243 254
244 return -EACCES; 255 return -EACCES;
@@ -401,9 +412,11 @@ static int nameidata_drop_rcu(struct nameidata *nd)
401{ 412{
402 struct fs_struct *fs = current->fs; 413 struct fs_struct *fs = current->fs;
403 struct dentry *dentry = nd->path.dentry; 414 struct dentry *dentry = nd->path.dentry;
415 int want_root = 0;
404 416
405 BUG_ON(!(nd->flags & LOOKUP_RCU)); 417 BUG_ON(!(nd->flags & LOOKUP_RCU));
406 if (nd->root.mnt) { 418 if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) {
419 want_root = 1;
407 spin_lock(&fs->lock); 420 spin_lock(&fs->lock);
408 if (nd->root.mnt != fs->root.mnt || 421 if (nd->root.mnt != fs->root.mnt ||
409 nd->root.dentry != fs->root.dentry) 422 nd->root.dentry != fs->root.dentry)
@@ -414,7 +427,7 @@ static int nameidata_drop_rcu(struct nameidata *nd)
414 goto err; 427 goto err;
415 BUG_ON(nd->inode != dentry->d_inode); 428 BUG_ON(nd->inode != dentry->d_inode);
416 spin_unlock(&dentry->d_lock); 429 spin_unlock(&dentry->d_lock);
417 if (nd->root.mnt) { 430 if (want_root) {
418 path_get(&nd->root); 431 path_get(&nd->root);
419 spin_unlock(&fs->lock); 432 spin_unlock(&fs->lock);
420 } 433 }
@@ -427,7 +440,7 @@ static int nameidata_drop_rcu(struct nameidata *nd)
427err: 440err:
428 spin_unlock(&dentry->d_lock); 441 spin_unlock(&dentry->d_lock);
429err_root: 442err_root:
430 if (nd->root.mnt) 443 if (want_root)
431 spin_unlock(&fs->lock); 444 spin_unlock(&fs->lock);
432 return -ECHILD; 445 return -ECHILD;
433} 446}
@@ -454,17 +467,11 @@ static int nameidata_dentry_drop_rcu(struct nameidata *nd, struct dentry *dentry
454{ 467{
455 struct fs_struct *fs = current->fs; 468 struct fs_struct *fs = current->fs;
456 struct dentry *parent = nd->path.dentry; 469 struct dentry *parent = nd->path.dentry;
457 470 int want_root = 0;
458 /*
459 * It can be possible to revalidate the dentry that we started
460 * the path walk with. force_reval_path may also revalidate the
461 * dentry already committed to the nameidata.
462 */
463 if (unlikely(parent == dentry))
464 return nameidata_drop_rcu(nd);
465 471
466 BUG_ON(!(nd->flags & LOOKUP_RCU)); 472 BUG_ON(!(nd->flags & LOOKUP_RCU));
467 if (nd->root.mnt) { 473 if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) {
474 want_root = 1;
468 spin_lock(&fs->lock); 475 spin_lock(&fs->lock);
469 if (nd->root.mnt != fs->root.mnt || 476 if (nd->root.mnt != fs->root.mnt ||
470 nd->root.dentry != fs->root.dentry) 477 nd->root.dentry != fs->root.dentry)
@@ -484,7 +491,7 @@ static int nameidata_dentry_drop_rcu(struct nameidata *nd, struct dentry *dentry
484 parent->d_count++; 491 parent->d_count++;
485 spin_unlock(&dentry->d_lock); 492 spin_unlock(&dentry->d_lock);
486 spin_unlock(&parent->d_lock); 493 spin_unlock(&parent->d_lock);
487 if (nd->root.mnt) { 494 if (want_root) {
488 path_get(&nd->root); 495 path_get(&nd->root);
489 spin_unlock(&fs->lock); 496 spin_unlock(&fs->lock);
490 } 497 }
@@ -498,7 +505,7 @@ err:
498 spin_unlock(&dentry->d_lock); 505 spin_unlock(&dentry->d_lock);
499 spin_unlock(&parent->d_lock); 506 spin_unlock(&parent->d_lock);
500err_root: 507err_root:
501 if (nd->root.mnt) 508 if (want_root)
502 spin_unlock(&fs->lock); 509 spin_unlock(&fs->lock);
503 return -ECHILD; 510 return -ECHILD;
504} 511}
@@ -506,8 +513,16 @@ err_root:
506/* Try to drop out of rcu-walk mode if we were in it, otherwise do nothing. */ 513/* Try to drop out of rcu-walk mode if we were in it, otherwise do nothing. */
507static inline int nameidata_dentry_drop_rcu_maybe(struct nameidata *nd, struct dentry *dentry) 514static inline int nameidata_dentry_drop_rcu_maybe(struct nameidata *nd, struct dentry *dentry)
508{ 515{
509 if (nd->flags & LOOKUP_RCU) 516 if (nd->flags & LOOKUP_RCU) {
510 return nameidata_dentry_drop_rcu(nd, dentry); 517 if (unlikely(nameidata_dentry_drop_rcu(nd, dentry))) {
518 nd->flags &= ~LOOKUP_RCU;
519 if (!(nd->flags & LOOKUP_ROOT))
520 nd->root.mnt = NULL;
521 rcu_read_unlock();
522 br_read_unlock(vfsmount_lock);
523 return -ECHILD;
524 }
525 }
511 return 0; 526 return 0;
512} 527}
513 528
@@ -526,7 +541,8 @@ static int nameidata_drop_rcu_last(struct nameidata *nd)
526 541
527 BUG_ON(!(nd->flags & LOOKUP_RCU)); 542 BUG_ON(!(nd->flags & LOOKUP_RCU));
528 nd->flags &= ~LOOKUP_RCU; 543 nd->flags &= ~LOOKUP_RCU;
529 nd->root.mnt = NULL; 544 if (!(nd->flags & LOOKUP_ROOT))
545 nd->root.mnt = NULL;
530 spin_lock(&dentry->d_lock); 546 spin_lock(&dentry->d_lock);
531 if (!__d_rcu_to_refcount(dentry, nd->seq)) 547 if (!__d_rcu_to_refcount(dentry, nd->seq))
532 goto err_unlock; 548 goto err_unlock;
@@ -547,53 +563,31 @@ err_unlock:
547 return -ECHILD; 563 return -ECHILD;
548} 564}
549 565
550/* Try to drop out of rcu-walk mode if we were in it, otherwise do nothing. */
551static inline int nameidata_drop_rcu_last_maybe(struct nameidata *nd)
552{
553 if (likely(nd->flags & LOOKUP_RCU))
554 return nameidata_drop_rcu_last(nd);
555 return 0;
556}
557
558/** 566/**
559 * release_open_intent - free up open intent resources 567 * release_open_intent - free up open intent resources
560 * @nd: pointer to nameidata 568 * @nd: pointer to nameidata
561 */ 569 */
562void release_open_intent(struct nameidata *nd) 570void release_open_intent(struct nameidata *nd)
563{ 571{
564 if (nd->intent.open.file->f_path.dentry == NULL) 572 struct file *file = nd->intent.open.file;
565 put_filp(nd->intent.open.file);
566 else
567 fput(nd->intent.open.file);
568}
569
570/*
571 * Call d_revalidate and handle filesystems that request rcu-walk
572 * to be dropped. This may be called and return in rcu-walk mode,
573 * regardless of success or error. If -ECHILD is returned, the caller
574 * must return -ECHILD back up the path walk stack so path walk may
575 * be restarted in ref-walk mode.
576 */
577static int d_revalidate(struct dentry *dentry, struct nameidata *nd)
578{
579 int status;
580 573
581 status = dentry->d_op->d_revalidate(dentry, nd); 574 if (file && !IS_ERR(file)) {
582 if (status == -ECHILD) { 575 if (file->f_path.dentry == NULL)
583 if (nameidata_dentry_drop_rcu(nd, dentry)) 576 put_filp(file);
584 return status; 577 else
585 status = dentry->d_op->d_revalidate(dentry, nd); 578 fput(file);
586 } 579 }
580}
587 581
588 return status; 582static inline int d_revalidate(struct dentry *dentry, struct nameidata *nd)
583{
584 return dentry->d_op->d_revalidate(dentry, nd);
589} 585}
590 586
591static inline struct dentry * 587static struct dentry *
592do_revalidate(struct dentry *dentry, struct nameidata *nd) 588do_revalidate(struct dentry *dentry, struct nameidata *nd)
593{ 589{
594 int status; 590 int status = d_revalidate(dentry, nd);
595
596 status = d_revalidate(dentry, nd);
597 if (unlikely(status <= 0)) { 591 if (unlikely(status <= 0)) {
598 /* 592 /*
599 * The dentry failed validation. 593 * The dentry failed validation.
@@ -602,37 +596,18 @@ do_revalidate(struct dentry *dentry, struct nameidata *nd)
602 * to return a fail status. 596 * to return a fail status.
603 */ 597 */
604 if (status < 0) { 598 if (status < 0) {
605 /* If we're in rcu-walk, we don't have a ref */ 599 dput(dentry);
606 if (!(nd->flags & LOOKUP_RCU))
607 dput(dentry);
608 dentry = ERR_PTR(status); 600 dentry = ERR_PTR(status);
609 601 } else if (!d_invalidate(dentry)) {
610 } else { 602 dput(dentry);
611 /* Don't d_invalidate in rcu-walk mode */ 603 dentry = NULL;
612 if (nameidata_dentry_drop_rcu_maybe(nd, dentry))
613 return ERR_PTR(-ECHILD);
614 if (!d_invalidate(dentry)) {
615 dput(dentry);
616 dentry = NULL;
617 }
618 } 604 }
619 } 605 }
620 return dentry; 606 return dentry;
621} 607}
622 608
623static inline int need_reval_dot(struct dentry *dentry)
624{
625 if (likely(!(dentry->d_flags & DCACHE_OP_REVALIDATE)))
626 return 0;
627
628 if (likely(!(dentry->d_sb->s_type->fs_flags & FS_REVAL_DOT)))
629 return 0;
630
631 return 1;
632}
633
634/* 609/*
635 * force_reval_path - force revalidation of a dentry 610 * handle_reval_path - force revalidation of a dentry
636 * 611 *
637 * In some situations the path walking code will trust dentries without 612 * In some situations the path walking code will trust dentries without
638 * revalidating them. This causes problems for filesystems that depend on 613 * revalidating them. This causes problems for filesystems that depend on
@@ -646,30 +621,28 @@ static inline int need_reval_dot(struct dentry *dentry)
646 * invalidate the dentry. It's up to the caller to handle putting references 621 * invalidate the dentry. It's up to the caller to handle putting references
647 * to the path if necessary. 622 * to the path if necessary.
648 */ 623 */
649static int 624static inline int handle_reval_path(struct nameidata *nd)
650force_reval_path(struct path *path, struct nameidata *nd)
651{ 625{
626 struct dentry *dentry = nd->path.dentry;
652 int status; 627 int status;
653 struct dentry *dentry = path->dentry;
654 628
655 /* 629 if (likely(!(nd->flags & LOOKUP_JUMPED)))
656 * only check on filesystems where it's possible for the dentry to 630 return 0;
657 * become stale. 631
658 */ 632 if (likely(!(dentry->d_flags & DCACHE_OP_REVALIDATE)))
659 if (!need_reval_dot(dentry)) 633 return 0;
634
635 if (likely(!(dentry->d_sb->s_type->fs_flags & FS_REVAL_DOT)))
660 return 0; 636 return 0;
661 637
638 /* Note: we do not d_invalidate() */
662 status = d_revalidate(dentry, nd); 639 status = d_revalidate(dentry, nd);
663 if (status > 0) 640 if (status > 0)
664 return 0; 641 return 0;
665 642
666 if (!status) { 643 if (!status)
667 /* Don't d_invalidate in rcu-walk mode */
668 if (nameidata_drop_rcu(nd))
669 return -ECHILD;
670 d_invalidate(dentry);
671 status = -ESTALE; 644 status = -ESTALE;
672 } 645
673 return status; 646 return status;
674} 647}
675 648
@@ -685,6 +658,7 @@ force_reval_path(struct path *path, struct nameidata *nd)
685static inline int exec_permission(struct inode *inode, unsigned int flags) 658static inline int exec_permission(struct inode *inode, unsigned int flags)
686{ 659{
687 int ret; 660 int ret;
661 struct user_namespace *ns = inode_userns(inode);
688 662
689 if (inode->i_op->permission) { 663 if (inode->i_op->permission) {
690 ret = inode->i_op->permission(inode, MAY_EXEC, flags); 664 ret = inode->i_op->permission(inode, MAY_EXEC, flags);
@@ -697,7 +671,8 @@ static inline int exec_permission(struct inode *inode, unsigned int flags)
697 if (ret == -ECHILD) 671 if (ret == -ECHILD)
698 return ret; 672 return ret;
699 673
700 if (capable(CAP_DAC_OVERRIDE) || capable(CAP_DAC_READ_SEARCH)) 674 if (ns_capable(ns, CAP_DAC_OVERRIDE) ||
675 ns_capable(ns, CAP_DAC_READ_SEARCH))
701 goto ok; 676 goto ok;
702 677
703 return ret; 678 return ret;
@@ -738,6 +713,7 @@ static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *l
738 path_put(&nd->path); 713 path_put(&nd->path);
739 nd->path = nd->root; 714 nd->path = nd->root;
740 path_get(&nd->root); 715 path_get(&nd->root);
716 nd->flags |= LOOKUP_JUMPED;
741 } 717 }
742 nd->inode = nd->path.dentry->d_inode; 718 nd->inode = nd->path.dentry->d_inode;
743 719
@@ -767,18 +743,43 @@ static inline void path_to_nameidata(const struct path *path,
767 nd->path.dentry = path->dentry; 743 nd->path.dentry = path->dentry;
768} 744}
769 745
746static inline void put_link(struct nameidata *nd, struct path *link, void *cookie)
747{
748 struct inode *inode = link->dentry->d_inode;
749 if (!IS_ERR(cookie) && inode->i_op->put_link)
750 inode->i_op->put_link(link->dentry, nd, cookie);
751 path_put(link);
752}
753
770static __always_inline int 754static __always_inline int
771__do_follow_link(const struct path *link, struct nameidata *nd, void **p) 755follow_link(struct path *link, struct nameidata *nd, void **p)
772{ 756{
773 int error; 757 int error;
774 struct dentry *dentry = link->dentry; 758 struct dentry *dentry = link->dentry;
775 759
776 touch_atime(link->mnt, dentry); 760 BUG_ON(nd->flags & LOOKUP_RCU);
777 nd_set_link(nd, NULL);
778 761
779 if (link->mnt == nd->path.mnt) 762 if (link->mnt == nd->path.mnt)
780 mntget(link->mnt); 763 mntget(link->mnt);
781 764
765 if (unlikely(current->total_link_count >= 40)) {
766 *p = ERR_PTR(-ELOOP); /* no ->put_link(), please */
767 path_put(&nd->path);
768 return -ELOOP;
769 }
770 cond_resched();
771 current->total_link_count++;
772
773 touch_atime(link->mnt, dentry);
774 nd_set_link(nd, NULL);
775
776 error = security_inode_follow_link(link->dentry, nd);
777 if (error) {
778 *p = ERR_PTR(error); /* no ->put_link(), please */
779 path_put(&nd->path);
780 return error;
781 }
782
782 nd->last_type = LAST_BIND; 783 nd->last_type = LAST_BIND;
783 *p = dentry->d_inode->i_op->follow_link(dentry, nd); 784 *p = dentry->d_inode->i_op->follow_link(dentry, nd);
784 error = PTR_ERR(*p); 785 error = PTR_ERR(*p);
@@ -788,50 +789,18 @@ __do_follow_link(const struct path *link, struct nameidata *nd, void **p)
788 if (s) 789 if (s)
789 error = __vfs_follow_link(nd, s); 790 error = __vfs_follow_link(nd, s);
790 else if (nd->last_type == LAST_BIND) { 791 else if (nd->last_type == LAST_BIND) {
791 error = force_reval_path(&nd->path, nd); 792 nd->flags |= LOOKUP_JUMPED;
792 if (error) 793 nd->inode = nd->path.dentry->d_inode;
794 if (nd->inode->i_op->follow_link) {
795 /* stepped on a _really_ weird one */
793 path_put(&nd->path); 796 path_put(&nd->path);
797 error = -ELOOP;
798 }
794 } 799 }
795 } 800 }
796 return error; 801 return error;
797} 802}
798 803
799/*
800 * This limits recursive symlink follows to 8, while
801 * limiting consecutive symlinks to 40.
802 *
803 * Without that kind of total limit, nasty chains of consecutive
804 * symlinks can cause almost arbitrarily long lookups.
805 */
806static inline int do_follow_link(struct path *path, struct nameidata *nd)
807{
808 void *cookie;
809 int err = -ELOOP;
810 if (current->link_count >= MAX_NESTED_LINKS)
811 goto loop;
812 if (current->total_link_count >= 40)
813 goto loop;
814 BUG_ON(nd->depth >= MAX_NESTED_LINKS);
815 cond_resched();
816 err = security_inode_follow_link(path->dentry, nd);
817 if (err)
818 goto loop;
819 current->link_count++;
820 current->total_link_count++;
821 nd->depth++;
822 err = __do_follow_link(path, nd, &cookie);
823 if (!IS_ERR(cookie) && path->dentry->d_inode->i_op->put_link)
824 path->dentry->d_inode->i_op->put_link(path->dentry, nd, cookie);
825 path_put(path);
826 current->link_count--;
827 nd->depth--;
828 return err;
829loop:
830 path_put_conditional(path, nd);
831 path_put(&nd->path);
832 return err;
833}
834
835static int follow_up_rcu(struct path *path) 804static int follow_up_rcu(struct path *path)
836{ 805{
837 struct vfsmount *parent; 806 struct vfsmount *parent;
@@ -970,8 +939,7 @@ static int follow_managed(struct path *path, unsigned flags)
970 if (managed & DCACHE_MANAGE_TRANSIT) { 939 if (managed & DCACHE_MANAGE_TRANSIT) {
971 BUG_ON(!path->dentry->d_op); 940 BUG_ON(!path->dentry->d_op);
972 BUG_ON(!path->dentry->d_op->d_manage); 941 BUG_ON(!path->dentry->d_op->d_manage);
973 ret = path->dentry->d_op->d_manage(path->dentry, 942 ret = path->dentry->d_op->d_manage(path->dentry, false);
974 false, false);
975 if (ret < 0) 943 if (ret < 0)
976 return ret == -EISDIR ? 0 : ret; 944 return ret == -EISDIR ? 0 : ret;
977 } 945 }
@@ -1024,6 +992,12 @@ int follow_down_one(struct path *path)
1024 return 0; 992 return 0;
1025} 993}
1026 994
995static inline bool managed_dentry_might_block(struct dentry *dentry)
996{
997 return (dentry->d_flags & DCACHE_MANAGE_TRANSIT &&
998 dentry->d_op->d_manage(dentry, true) < 0);
999}
1000
1027/* 1001/*
1028 * Skip to top of mountpoint pile in rcuwalk mode. We abort the rcu-walk if we 1002 * Skip to top of mountpoint pile in rcuwalk mode. We abort the rcu-walk if we
1029 * meet a managed dentry and we're not walking to "..". True is returned to 1003 * meet a managed dentry and we're not walking to "..". True is returned to
@@ -1032,19 +1006,26 @@ int follow_down_one(struct path *path)
1032static bool __follow_mount_rcu(struct nameidata *nd, struct path *path, 1006static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
1033 struct inode **inode, bool reverse_transit) 1007 struct inode **inode, bool reverse_transit)
1034{ 1008{
1035 while (d_mountpoint(path->dentry)) { 1009 for (;;) {
1036 struct vfsmount *mounted; 1010 struct vfsmount *mounted;
1037 if (unlikely(path->dentry->d_flags & DCACHE_MANAGE_TRANSIT) && 1011 /*
1038 !reverse_transit && 1012 * Don't forget we might have a non-mountpoint managed dentry
1039 path->dentry->d_op->d_manage(path->dentry, false, true) < 0) 1013 * that wants to block transit.
1014 */
1015 *inode = path->dentry->d_inode;
1016 if (!reverse_transit &&
1017 unlikely(managed_dentry_might_block(path->dentry)))
1040 return false; 1018 return false;
1019
1020 if (!d_mountpoint(path->dentry))
1021 break;
1022
1041 mounted = __lookup_mnt(path->mnt, path->dentry, 1); 1023 mounted = __lookup_mnt(path->mnt, path->dentry, 1);
1042 if (!mounted) 1024 if (!mounted)
1043 break; 1025 break;
1044 path->mnt = mounted; 1026 path->mnt = mounted;
1045 path->dentry = mounted->mnt_root; 1027 path->dentry = mounted->mnt_root;
1046 nd->seq = read_seqcount_begin(&path->dentry->d_seq); 1028 nd->seq = read_seqcount_begin(&path->dentry->d_seq);
1047 *inode = path->dentry->d_inode;
1048 } 1029 }
1049 1030
1050 if (unlikely(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT)) 1031 if (unlikely(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT))
@@ -1070,7 +1051,7 @@ static int follow_dotdot_rcu(struct nameidata *nd)
1070 1051
1071 seq = read_seqcount_begin(&parent->d_seq); 1052 seq = read_seqcount_begin(&parent->d_seq);
1072 if (read_seqcount_retry(&old->d_seq, nd->seq)) 1053 if (read_seqcount_retry(&old->d_seq, nd->seq))
1073 return -ECHILD; 1054 goto failed;
1074 inode = parent->d_inode; 1055 inode = parent->d_inode;
1075 nd->path.dentry = parent; 1056 nd->path.dentry = parent;
1076 nd->seq = seq; 1057 nd->seq = seq;
@@ -1083,8 +1064,15 @@ static int follow_dotdot_rcu(struct nameidata *nd)
1083 } 1064 }
1084 __follow_mount_rcu(nd, &nd->path, &inode, true); 1065 __follow_mount_rcu(nd, &nd->path, &inode, true);
1085 nd->inode = inode; 1066 nd->inode = inode;
1086
1087 return 0; 1067 return 0;
1068
1069failed:
1070 nd->flags &= ~LOOKUP_RCU;
1071 if (!(nd->flags & LOOKUP_ROOT))
1072 nd->root.mnt = NULL;
1073 rcu_read_unlock();
1074 br_read_unlock(vfsmount_lock);
1075 return -ECHILD;
1088} 1076}
1089 1077
1090/* 1078/*
@@ -1095,7 +1083,7 @@ static int follow_dotdot_rcu(struct nameidata *nd)
1095 * Care must be taken as namespace_sem may be held (indicated by mounting_here 1083 * Care must be taken as namespace_sem may be held (indicated by mounting_here
1096 * being true). 1084 * being true).
1097 */ 1085 */
1098int follow_down(struct path *path, bool mounting_here) 1086int follow_down(struct path *path)
1099{ 1087{
1100 unsigned managed; 1088 unsigned managed;
1101 int ret; 1089 int ret;
@@ -1116,7 +1104,7 @@ int follow_down(struct path *path, bool mounting_here)
1116 BUG_ON(!path->dentry->d_op); 1104 BUG_ON(!path->dentry->d_op);
1117 BUG_ON(!path->dentry->d_op->d_manage); 1105 BUG_ON(!path->dentry->d_op->d_manage);
1118 ret = path->dentry->d_op->d_manage( 1106 ret = path->dentry->d_op->d_manage(
1119 path->dentry, mounting_here, false); 1107 path->dentry, false);
1120 if (ret < 0) 1108 if (ret < 0)
1121 return ret == -EISDIR ? 0 : ret; 1109 return ret == -EISDIR ? 0 : ret;
1122 } 1110 }
@@ -1218,57 +1206,85 @@ static int do_lookup(struct nameidata *nd, struct qstr *name,
1218{ 1206{
1219 struct vfsmount *mnt = nd->path.mnt; 1207 struct vfsmount *mnt = nd->path.mnt;
1220 struct dentry *dentry, *parent = nd->path.dentry; 1208 struct dentry *dentry, *parent = nd->path.dentry;
1221 struct inode *dir; 1209 int need_reval = 1;
1210 int status = 1;
1222 int err; 1211 int err;
1223 1212
1224 /* 1213 /*
1225 * See if the low-level filesystem might want
1226 * to use its own hash..
1227 */
1228 if (unlikely(parent->d_flags & DCACHE_OP_HASH)) {
1229 err = parent->d_op->d_hash(parent, nd->inode, name);
1230 if (err < 0)
1231 return err;
1232 }
1233
1234 /*
1235 * Rename seqlock is not required here because in the off chance 1214 * Rename seqlock is not required here because in the off chance
1236 * of a false negative due to a concurrent rename, we're going to 1215 * of a false negative due to a concurrent rename, we're going to
1237 * do the non-racy lookup, below. 1216 * do the non-racy lookup, below.
1238 */ 1217 */
1239 if (nd->flags & LOOKUP_RCU) { 1218 if (nd->flags & LOOKUP_RCU) {
1240 unsigned seq; 1219 unsigned seq;
1241
1242 *inode = nd->inode; 1220 *inode = nd->inode;
1243 dentry = __d_lookup_rcu(parent, name, &seq, inode); 1221 dentry = __d_lookup_rcu(parent, name, &seq, inode);
1244 if (!dentry) { 1222 if (!dentry)
1245 if (nameidata_drop_rcu(nd)) 1223 goto unlazy;
1246 return -ECHILD; 1224
1247 goto need_lookup;
1248 }
1249 /* Memory barrier in read_seqcount_begin of child is enough */ 1225 /* Memory barrier in read_seqcount_begin of child is enough */
1250 if (__read_seqcount_retry(&parent->d_seq, nd->seq)) 1226 if (__read_seqcount_retry(&parent->d_seq, nd->seq))
1251 return -ECHILD; 1227 return -ECHILD;
1252
1253 nd->seq = seq; 1228 nd->seq = seq;
1254 if (dentry->d_flags & DCACHE_OP_REVALIDATE) 1229
1255 goto need_revalidate; 1230 if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE)) {
1256done2: 1231 status = d_revalidate(dentry, nd);
1232 if (unlikely(status <= 0)) {
1233 if (status != -ECHILD)
1234 need_reval = 0;
1235 goto unlazy;
1236 }
1237 }
1257 path->mnt = mnt; 1238 path->mnt = mnt;
1258 path->dentry = dentry; 1239 path->dentry = dentry;
1259 if (likely(__follow_mount_rcu(nd, path, inode, false))) 1240 if (likely(__follow_mount_rcu(nd, path, inode, false)))
1260 return 0; 1241 return 0;
1261 if (nameidata_drop_rcu(nd)) 1242unlazy:
1262 return -ECHILD; 1243 if (dentry) {
1263 /* fallthru */ 1244 if (nameidata_dentry_drop_rcu(nd, dentry))
1245 return -ECHILD;
1246 } else {
1247 if (nameidata_drop_rcu(nd))
1248 return -ECHILD;
1249 }
1250 } else {
1251 dentry = __d_lookup(parent, name);
1264 } 1252 }
1265 dentry = __d_lookup(parent, name); 1253
1266 if (!dentry) 1254retry:
1267 goto need_lookup; 1255 if (unlikely(!dentry)) {
1268found: 1256 struct inode *dir = parent->d_inode;
1269 if (dentry->d_flags & DCACHE_OP_REVALIDATE) 1257 BUG_ON(nd->inode != dir);
1270 goto need_revalidate; 1258
1271done: 1259 mutex_lock(&dir->i_mutex);
1260 dentry = d_lookup(parent, name);
1261 if (likely(!dentry)) {
1262 dentry = d_alloc_and_lookup(parent, name, nd);
1263 if (IS_ERR(dentry)) {
1264 mutex_unlock(&dir->i_mutex);
1265 return PTR_ERR(dentry);
1266 }
1267 /* known good */
1268 need_reval = 0;
1269 status = 1;
1270 }
1271 mutex_unlock(&dir->i_mutex);
1272 }
1273 if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE) && need_reval)
1274 status = d_revalidate(dentry, nd);
1275 if (unlikely(status <= 0)) {
1276 if (status < 0) {
1277 dput(dentry);
1278 return status;
1279 }
1280 if (!d_invalidate(dentry)) {
1281 dput(dentry);
1282 dentry = NULL;
1283 need_reval = 1;
1284 goto retry;
1285 }
1286 }
1287
1272 path->mnt = mnt; 1288 path->mnt = mnt;
1273 path->dentry = dentry; 1289 path->dentry = dentry;
1274 err = follow_managed(path, nd->flags); 1290 err = follow_managed(path, nd->flags);
@@ -1278,49 +1294,113 @@ done:
1278 } 1294 }
1279 *inode = path->dentry->d_inode; 1295 *inode = path->dentry->d_inode;
1280 return 0; 1296 return 0;
1297}
1281 1298
1282need_lookup: 1299static inline int may_lookup(struct nameidata *nd)
1283 dir = parent->d_inode; 1300{
1284 BUG_ON(nd->inode != dir); 1301 if (nd->flags & LOOKUP_RCU) {
1302 int err = exec_permission(nd->inode, IPERM_FLAG_RCU);
1303 if (err != -ECHILD)
1304 return err;
1305 if (nameidata_drop_rcu(nd))
1306 return -ECHILD;
1307 }
1308 return exec_permission(nd->inode, 0);
1309}
1285 1310
1286 mutex_lock(&dir->i_mutex); 1311static inline int handle_dots(struct nameidata *nd, int type)
1287 /* 1312{
1288 * First re-do the cached lookup just in case it was created 1313 if (type == LAST_DOTDOT) {
1289 * while we waited for the directory semaphore, or the first 1314 if (nd->flags & LOOKUP_RCU) {
1290 * lookup failed due to an unrelated rename. 1315 if (follow_dotdot_rcu(nd))
1291 * 1316 return -ECHILD;
1292 * This could use version numbering or similar to avoid unnecessary 1317 } else
1293 * cache lookups, but then we'd have to do the first lookup in the 1318 follow_dotdot(nd);
1294 * non-racy way. However in the common case here, everything should 1319 }
1295 * be hot in cache, so would it be a big win? 1320 return 0;
1296 */ 1321}
1297 dentry = d_lookup(parent, name); 1322
1298 if (likely(!dentry)) { 1323static void terminate_walk(struct nameidata *nd)
1299 dentry = d_alloc_and_lookup(parent, name, nd); 1324{
1300 mutex_unlock(&dir->i_mutex); 1325 if (!(nd->flags & LOOKUP_RCU)) {
1301 if (IS_ERR(dentry)) 1326 path_put(&nd->path);
1302 goto fail; 1327 } else {
1303 goto done; 1328 nd->flags &= ~LOOKUP_RCU;
1329 if (!(nd->flags & LOOKUP_ROOT))
1330 nd->root.mnt = NULL;
1331 rcu_read_unlock();
1332 br_read_unlock(vfsmount_lock);
1304 } 1333 }
1334}
1335
1336static inline int walk_component(struct nameidata *nd, struct path *path,
1337 struct qstr *name, int type, int follow)
1338{
1339 struct inode *inode;
1340 int err;
1305 /* 1341 /*
1306 * Uhhuh! Nasty case: the cache was re-populated while 1342 * "." and ".." are special - ".." especially so because it has
1307 * we waited on the semaphore. Need to revalidate. 1343 * to be able to know about the current root directory and
1344 * parent relationships.
1308 */ 1345 */
1309 mutex_unlock(&dir->i_mutex); 1346 if (unlikely(type != LAST_NORM))
1310 goto found; 1347 return handle_dots(nd, type);
1348 err = do_lookup(nd, name, path, &inode);
1349 if (unlikely(err)) {
1350 terminate_walk(nd);
1351 return err;
1352 }
1353 if (!inode) {
1354 path_to_nameidata(path, nd);
1355 terminate_walk(nd);
1356 return -ENOENT;
1357 }
1358 if (unlikely(inode->i_op->follow_link) && follow) {
1359 if (nameidata_dentry_drop_rcu_maybe(nd, path->dentry))
1360 return -ECHILD;
1361 BUG_ON(inode != path->dentry->d_inode);
1362 return 1;
1363 }
1364 path_to_nameidata(path, nd);
1365 nd->inode = inode;
1366 return 0;
1367}
1311 1368
1312need_revalidate: 1369/*
1313 dentry = do_revalidate(dentry, nd); 1370 * This limits recursive symlink follows to 8, while
1314 if (!dentry) 1371 * limiting consecutive symlinks to 40.
1315 goto need_lookup; 1372 *
1316 if (IS_ERR(dentry)) 1373 * Without that kind of total limit, nasty chains of consecutive
1317 goto fail; 1374 * symlinks can cause almost arbitrarily long lookups.
1318 if (nd->flags & LOOKUP_RCU) 1375 */
1319 goto done2; 1376static inline int nested_symlink(struct path *path, struct nameidata *nd)
1320 goto done; 1377{
1378 int res;
1321 1379
1322fail: 1380 BUG_ON(nd->depth >= MAX_NESTED_LINKS);
1323 return PTR_ERR(dentry); 1381 if (unlikely(current->link_count >= MAX_NESTED_LINKS)) {
1382 path_put_conditional(path, nd);
1383 path_put(&nd->path);
1384 return -ELOOP;
1385 }
1386
1387 nd->depth++;
1388 current->link_count++;
1389
1390 do {
1391 struct path link = *path;
1392 void *cookie;
1393
1394 res = follow_link(&link, nd, &cookie);
1395 if (!res)
1396 res = walk_component(nd, path, &nd->last,
1397 nd->last_type, LOOKUP_FOLLOW);
1398 put_link(nd, &link, cookie);
1399 } while (res > 0);
1400
1401 current->link_count--;
1402 nd->depth--;
1403 return res;
1324} 1404}
1325 1405
1326/* 1406/*
@@ -1340,30 +1420,18 @@ static int link_path_walk(const char *name, struct nameidata *nd)
1340 while (*name=='/') 1420 while (*name=='/')
1341 name++; 1421 name++;
1342 if (!*name) 1422 if (!*name)
1343 goto return_reval; 1423 return 0;
1344
1345 if (nd->depth)
1346 lookup_flags = LOOKUP_FOLLOW | (nd->flags & LOOKUP_CONTINUE);
1347 1424
1348 /* At this point we know we have a real path component. */ 1425 /* At this point we know we have a real path component. */
1349 for(;;) { 1426 for(;;) {
1350 struct inode *inode;
1351 unsigned long hash; 1427 unsigned long hash;
1352 struct qstr this; 1428 struct qstr this;
1353 unsigned int c; 1429 unsigned int c;
1430 int type;
1354 1431
1355 nd->flags |= LOOKUP_CONTINUE; 1432 nd->flags |= LOOKUP_CONTINUE;
1356 if (nd->flags & LOOKUP_RCU) { 1433
1357 err = exec_permission(nd->inode, IPERM_FLAG_RCU); 1434 err = may_lookup(nd);
1358 if (err == -ECHILD) {
1359 if (nameidata_drop_rcu(nd))
1360 return -ECHILD;
1361 goto exec_again;
1362 }
1363 } else {
1364exec_again:
1365 err = exec_permission(nd->inode, 0);
1366 }
1367 if (err) 1435 if (err)
1368 break; 1436 break;
1369 1437
@@ -1379,56 +1447,43 @@ exec_again:
1379 this.len = name - (const char *) this.name; 1447 this.len = name - (const char *) this.name;
1380 this.hash = end_name_hash(hash); 1448 this.hash = end_name_hash(hash);
1381 1449
1450 type = LAST_NORM;
1451 if (this.name[0] == '.') switch (this.len) {
1452 case 2:
1453 if (this.name[1] == '.') {
1454 type = LAST_DOTDOT;
1455 nd->flags |= LOOKUP_JUMPED;
1456 }
1457 break;
1458 case 1:
1459 type = LAST_DOT;
1460 }
1461 if (likely(type == LAST_NORM)) {
1462 struct dentry *parent = nd->path.dentry;
1463 nd->flags &= ~LOOKUP_JUMPED;
1464 if (unlikely(parent->d_flags & DCACHE_OP_HASH)) {
1465 err = parent->d_op->d_hash(parent, nd->inode,
1466 &this);
1467 if (err < 0)
1468 break;
1469 }
1470 }
1471
1382 /* remove trailing slashes? */ 1472 /* remove trailing slashes? */
1383 if (!c) 1473 if (!c)
1384 goto last_component; 1474 goto last_component;
1385 while (*++name == '/'); 1475 while (*++name == '/');
1386 if (!*name) 1476 if (!*name)
1387 goto last_with_slashes; 1477 goto last_component;
1388 1478
1389 /* 1479 err = walk_component(nd, &next, &this, type, LOOKUP_FOLLOW);
1390 * "." and ".." are special - ".." especially so because it has 1480 if (err < 0)
1391 * to be able to know about the current root directory and 1481 return err;
1392 * parent relationships.
1393 */
1394 if (this.name[0] == '.') switch (this.len) {
1395 default:
1396 break;
1397 case 2:
1398 if (this.name[1] != '.')
1399 break;
1400 if (nd->flags & LOOKUP_RCU) {
1401 if (follow_dotdot_rcu(nd))
1402 return -ECHILD;
1403 } else
1404 follow_dotdot(nd);
1405 /* fallthrough */
1406 case 1:
1407 continue;
1408 }
1409 /* This does the actual lookups.. */
1410 err = do_lookup(nd, &this, &next, &inode);
1411 if (err)
1412 break;
1413 err = -ENOENT;
1414 if (!inode)
1415 goto out_dput;
1416 1482
1417 if (inode->i_op->follow_link) { 1483 if (err) {
1418 /* We commonly drop rcu-walk here */ 1484 err = nested_symlink(&next, nd);
1419 if (nameidata_dentry_drop_rcu_maybe(nd, next.dentry))
1420 return -ECHILD;
1421 BUG_ON(inode != next.dentry->d_inode);
1422 err = do_follow_link(&next, nd);
1423 if (err) 1485 if (err)
1424 goto return_err; 1486 return err;
1425 nd->inode = nd->path.dentry->d_inode;
1426 err = -ENOENT;
1427 if (!nd->inode)
1428 break;
1429 } else {
1430 path_to_nameidata(&next, nd);
1431 nd->inode = inode;
1432 } 1487 }
1433 err = -ENOTDIR; 1488 err = -ENOTDIR;
1434 if (!nd->inode->i_op->lookup) 1489 if (!nd->inode->i_op->lookup)
@@ -1436,209 +1491,109 @@ exec_again:
1436 continue; 1491 continue;
1437 /* here ends the main loop */ 1492 /* here ends the main loop */
1438 1493
1439last_with_slashes:
1440 lookup_flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
1441last_component: 1494last_component:
1442 /* Clear LOOKUP_CONTINUE iff it was previously unset */ 1495 /* Clear LOOKUP_CONTINUE iff it was previously unset */
1443 nd->flags &= lookup_flags | ~LOOKUP_CONTINUE; 1496 nd->flags &= lookup_flags | ~LOOKUP_CONTINUE;
1444 if (lookup_flags & LOOKUP_PARENT)
1445 goto lookup_parent;
1446 if (this.name[0] == '.') switch (this.len) {
1447 default:
1448 break;
1449 case 2:
1450 if (this.name[1] != '.')
1451 break;
1452 if (nd->flags & LOOKUP_RCU) {
1453 if (follow_dotdot_rcu(nd))
1454 return -ECHILD;
1455 } else
1456 follow_dotdot(nd);
1457 /* fallthrough */
1458 case 1:
1459 goto return_reval;
1460 }
1461 err = do_lookup(nd, &this, &next, &inode);
1462 if (err)
1463 break;
1464 if (inode && unlikely(inode->i_op->follow_link) &&
1465 (lookup_flags & LOOKUP_FOLLOW)) {
1466 if (nameidata_dentry_drop_rcu_maybe(nd, next.dentry))
1467 return -ECHILD;
1468 BUG_ON(inode != next.dentry->d_inode);
1469 err = do_follow_link(&next, nd);
1470 if (err)
1471 goto return_err;
1472 nd->inode = nd->path.dentry->d_inode;
1473 } else {
1474 path_to_nameidata(&next, nd);
1475 nd->inode = inode;
1476 }
1477 err = -ENOENT;
1478 if (!nd->inode)
1479 break;
1480 if (lookup_flags & LOOKUP_DIRECTORY) {
1481 err = -ENOTDIR;
1482 if (!nd->inode->i_op->lookup)
1483 break;
1484 }
1485 goto return_base;
1486lookup_parent:
1487 nd->last = this; 1497 nd->last = this;
1488 nd->last_type = LAST_NORM; 1498 nd->last_type = type;
1489 if (this.name[0] != '.')
1490 goto return_base;
1491 if (this.len == 1)
1492 nd->last_type = LAST_DOT;
1493 else if (this.len == 2 && this.name[1] == '.')
1494 nd->last_type = LAST_DOTDOT;
1495 else
1496 goto return_base;
1497return_reval:
1498 /*
1499 * We bypassed the ordinary revalidation routines.
1500 * We may need to check the cached dentry for staleness.
1501 */
1502 if (need_reval_dot(nd->path.dentry)) {
1503 /* Note: we do not d_invalidate() */
1504 err = d_revalidate(nd->path.dentry, nd);
1505 if (!err)
1506 err = -ESTALE;
1507 if (err < 0)
1508 break;
1509 }
1510return_base:
1511 if (nameidata_drop_rcu_last_maybe(nd))
1512 return -ECHILD;
1513 return 0; 1499 return 0;
1514out_dput:
1515 if (!(nd->flags & LOOKUP_RCU))
1516 path_put_conditional(&next, nd);
1517 break;
1518 } 1500 }
1519 if (!(nd->flags & LOOKUP_RCU)) 1501 terminate_walk(nd);
1520 path_put(&nd->path);
1521return_err:
1522 return err; 1502 return err;
1523} 1503}
1524 1504
1525static inline int path_walk_rcu(const char *name, struct nameidata *nd) 1505static int path_init(int dfd, const char *name, unsigned int flags,
1526{ 1506 struct nameidata *nd, struct file **fp)
1527 current->total_link_count = 0;
1528
1529 return link_path_walk(name, nd);
1530}
1531
1532static inline int path_walk_simple(const char *name, struct nameidata *nd)
1533{
1534 current->total_link_count = 0;
1535
1536 return link_path_walk(name, nd);
1537}
1538
1539static int path_walk(const char *name, struct nameidata *nd)
1540{
1541 struct path save = nd->path;
1542 int result;
1543
1544 current->total_link_count = 0;
1545
1546 /* make sure the stuff we saved doesn't go away */
1547 path_get(&save);
1548
1549 result = link_path_walk(name, nd);
1550 if (result == -ESTALE) {
1551 /* nd->path had been dropped */
1552 current->total_link_count = 0;
1553 nd->path = save;
1554 path_get(&nd->path);
1555 nd->flags |= LOOKUP_REVAL;
1556 result = link_path_walk(name, nd);
1557 }
1558
1559 path_put(&save);
1560
1561 return result;
1562}
1563
1564static void path_finish_rcu(struct nameidata *nd)
1565{
1566 if (nd->flags & LOOKUP_RCU) {
1567 /* RCU dangling. Cancel it. */
1568 nd->flags &= ~LOOKUP_RCU;
1569 nd->root.mnt = NULL;
1570 rcu_read_unlock();
1571 br_read_unlock(vfsmount_lock);
1572 }
1573 if (nd->file)
1574 fput(nd->file);
1575}
1576
1577static int path_init_rcu(int dfd, const char *name, unsigned int flags, struct nameidata *nd)
1578{ 1507{
1579 int retval = 0; 1508 int retval = 0;
1580 int fput_needed; 1509 int fput_needed;
1581 struct file *file; 1510 struct file *file;
1582 1511
1583 nd->last_type = LAST_ROOT; /* if there are only slashes... */ 1512 nd->last_type = LAST_ROOT; /* if there are only slashes... */
1584 nd->flags = flags | LOOKUP_RCU; 1513 nd->flags = flags | LOOKUP_JUMPED;
1585 nd->depth = 0; 1514 nd->depth = 0;
1515 if (flags & LOOKUP_ROOT) {
1516 struct inode *inode = nd->root.dentry->d_inode;
1517 if (*name) {
1518 if (!inode->i_op->lookup)
1519 return -ENOTDIR;
1520 retval = inode_permission(inode, MAY_EXEC);
1521 if (retval)
1522 return retval;
1523 }
1524 nd->path = nd->root;
1525 nd->inode = inode;
1526 if (flags & LOOKUP_RCU) {
1527 br_read_lock(vfsmount_lock);
1528 rcu_read_lock();
1529 nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
1530 } else {
1531 path_get(&nd->path);
1532 }
1533 return 0;
1534 }
1535
1586 nd->root.mnt = NULL; 1536 nd->root.mnt = NULL;
1587 nd->file = NULL;
1588 1537
1589 if (*name=='/') { 1538 if (*name=='/') {
1590 struct fs_struct *fs = current->fs; 1539 if (flags & LOOKUP_RCU) {
1591 unsigned seq; 1540 br_read_lock(vfsmount_lock);
1592 1541 rcu_read_lock();
1593 br_read_lock(vfsmount_lock); 1542 set_root_rcu(nd);
1594 rcu_read_lock(); 1543 } else {
1595 1544 set_root(nd);
1596 do { 1545 path_get(&nd->root);
1597 seq = read_seqcount_begin(&fs->seq); 1546 }
1598 nd->root = fs->root; 1547 nd->path = nd->root;
1599 nd->path = nd->root;
1600 nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
1601 } while (read_seqcount_retry(&fs->seq, seq));
1602
1603 } else if (dfd == AT_FDCWD) { 1548 } else if (dfd == AT_FDCWD) {
1604 struct fs_struct *fs = current->fs; 1549 if (flags & LOOKUP_RCU) {
1605 unsigned seq; 1550 struct fs_struct *fs = current->fs;
1551 unsigned seq;
1606 1552
1607 br_read_lock(vfsmount_lock); 1553 br_read_lock(vfsmount_lock);
1608 rcu_read_lock(); 1554 rcu_read_lock();
1609
1610 do {
1611 seq = read_seqcount_begin(&fs->seq);
1612 nd->path = fs->pwd;
1613 nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
1614 } while (read_seqcount_retry(&fs->seq, seq));
1615 1555
1556 do {
1557 seq = read_seqcount_begin(&fs->seq);
1558 nd->path = fs->pwd;
1559 nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
1560 } while (read_seqcount_retry(&fs->seq, seq));
1561 } else {
1562 get_fs_pwd(current->fs, &nd->path);
1563 }
1616 } else { 1564 } else {
1617 struct dentry *dentry; 1565 struct dentry *dentry;
1618 1566
1619 file = fget_light(dfd, &fput_needed); 1567 file = fget_raw_light(dfd, &fput_needed);
1620 retval = -EBADF; 1568 retval = -EBADF;
1621 if (!file) 1569 if (!file)
1622 goto out_fail; 1570 goto out_fail;
1623 1571
1624 dentry = file->f_path.dentry; 1572 dentry = file->f_path.dentry;
1625 1573
1626 retval = -ENOTDIR; 1574 if (*name) {
1627 if (!S_ISDIR(dentry->d_inode->i_mode)) 1575 retval = -ENOTDIR;
1628 goto fput_fail; 1576 if (!S_ISDIR(dentry->d_inode->i_mode))
1577 goto fput_fail;
1629 1578
1630 retval = file_permission(file, MAY_EXEC); 1579 retval = file_permission(file, MAY_EXEC);
1631 if (retval) 1580 if (retval)
1632 goto fput_fail; 1581 goto fput_fail;
1582 }
1633 1583
1634 nd->path = file->f_path; 1584 nd->path = file->f_path;
1635 if (fput_needed) 1585 if (flags & LOOKUP_RCU) {
1636 nd->file = file; 1586 if (fput_needed)
1637 1587 *fp = file;
1638 nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq); 1588 nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
1639 br_read_lock(vfsmount_lock); 1589 br_read_lock(vfsmount_lock);
1640 rcu_read_lock(); 1590 rcu_read_lock();
1591 } else {
1592 path_get(&file->f_path);
1593 fput_light(file, fput_needed);
1594 }
1641 } 1595 }
1596
1642 nd->inode = nd->path.dentry->d_inode; 1597 nd->inode = nd->path.dentry->d_inode;
1643 return 0; 1598 return 0;
1644 1599
@@ -1648,60 +1603,23 @@ out_fail:
1648 return retval; 1603 return retval;
1649} 1604}
1650 1605
1651static int path_init(int dfd, const char *name, unsigned int flags, struct nameidata *nd) 1606static inline int lookup_last(struct nameidata *nd, struct path *path)
1652{ 1607{
1653 int retval = 0; 1608 if (nd->last_type == LAST_NORM && nd->last.name[nd->last.len])
1654 int fput_needed; 1609 nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
1655 struct file *file;
1656
1657 nd->last_type = LAST_ROOT; /* if there are only slashes... */
1658 nd->flags = flags;
1659 nd->depth = 0;
1660 nd->root.mnt = NULL;
1661
1662 if (*name=='/') {
1663 set_root(nd);
1664 nd->path = nd->root;
1665 path_get(&nd->root);
1666 } else if (dfd == AT_FDCWD) {
1667 get_fs_pwd(current->fs, &nd->path);
1668 } else {
1669 struct dentry *dentry;
1670
1671 file = fget_light(dfd, &fput_needed);
1672 retval = -EBADF;
1673 if (!file)
1674 goto out_fail;
1675
1676 dentry = file->f_path.dentry;
1677 1610
1678 retval = -ENOTDIR; 1611 nd->flags &= ~LOOKUP_PARENT;
1679 if (!S_ISDIR(dentry->d_inode->i_mode)) 1612 return walk_component(nd, path, &nd->last, nd->last_type,
1680 goto fput_fail; 1613 nd->flags & LOOKUP_FOLLOW);
1681
1682 retval = file_permission(file, MAY_EXEC);
1683 if (retval)
1684 goto fput_fail;
1685
1686 nd->path = file->f_path;
1687 path_get(&file->f_path);
1688
1689 fput_light(file, fput_needed);
1690 }
1691 nd->inode = nd->path.dentry->d_inode;
1692 return 0;
1693
1694fput_fail:
1695 fput_light(file, fput_needed);
1696out_fail:
1697 return retval;
1698} 1614}
1699 1615
1700/* Returns 0 and nd will be valid on success; Retuns error, otherwise. */ 1616/* Returns 0 and nd will be valid on success; Retuns error, otherwise. */
1701static int do_path_lookup(int dfd, const char *name, 1617static int path_lookupat(int dfd, const char *name,
1702 unsigned int flags, struct nameidata *nd) 1618 unsigned int flags, struct nameidata *nd)
1703{ 1619{
1704 int retval; 1620 struct file *base = NULL;
1621 struct path path;
1622 int err;
1705 1623
1706 /* 1624 /*
1707 * Path walking is largely split up into 2 different synchronisation 1625 * Path walking is largely split up into 2 different synchronisation
@@ -1717,44 +1635,78 @@ static int do_path_lookup(int dfd, const char *name,
1717 * be handled by restarting a traditional ref-walk (which will always 1635 * be handled by restarting a traditional ref-walk (which will always
1718 * be able to complete). 1636 * be able to complete).
1719 */ 1637 */
1720 retval = path_init_rcu(dfd, name, flags, nd); 1638 err = path_init(dfd, name, flags | LOOKUP_PARENT, nd, &base);
1721 if (unlikely(retval)) 1639
1722 return retval; 1640 if (unlikely(err))
1723 retval = path_walk_rcu(name, nd); 1641 return err;
1724 path_finish_rcu(nd); 1642
1725 if (nd->root.mnt) { 1643 current->total_link_count = 0;
1726 path_put(&nd->root); 1644 err = link_path_walk(name, nd);
1727 nd->root.mnt = NULL; 1645
1646 if (!err && !(flags & LOOKUP_PARENT)) {
1647 err = lookup_last(nd, &path);
1648 while (err > 0) {
1649 void *cookie;
1650 struct path link = path;
1651 nd->flags |= LOOKUP_PARENT;
1652 err = follow_link(&link, nd, &cookie);
1653 if (!err)
1654 err = lookup_last(nd, &path);
1655 put_link(nd, &link, cookie);
1656 }
1728 } 1657 }
1729 1658
1730 if (unlikely(retval == -ECHILD || retval == -ESTALE)) { 1659 if (nd->flags & LOOKUP_RCU) {
1731 /* slower, locked walk */ 1660 /* went all way through without dropping RCU */
1732 if (retval == -ESTALE) 1661 BUG_ON(err);
1733 flags |= LOOKUP_REVAL; 1662 if (nameidata_drop_rcu_last(nd))
1734 retval = path_init(dfd, name, flags, nd); 1663 err = -ECHILD;
1735 if (unlikely(retval)) 1664 }
1736 return retval; 1665
1737 retval = path_walk(name, nd); 1666 if (!err) {
1738 if (nd->root.mnt) { 1667 err = handle_reval_path(nd);
1739 path_put(&nd->root); 1668 if (err)
1740 nd->root.mnt = NULL; 1669 path_put(&nd->path);
1670 }
1671
1672 if (!err && nd->flags & LOOKUP_DIRECTORY) {
1673 if (!nd->inode->i_op->lookup) {
1674 path_put(&nd->path);
1675 err = -ENOTDIR;
1741 } 1676 }
1742 } 1677 }
1743 1678
1679 if (base)
1680 fput(base);
1681
1682 if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) {
1683 path_put(&nd->root);
1684 nd->root.mnt = NULL;
1685 }
1686 return err;
1687}
1688
1689static int do_path_lookup(int dfd, const char *name,
1690 unsigned int flags, struct nameidata *nd)
1691{
1692 int retval = path_lookupat(dfd, name, flags | LOOKUP_RCU, nd);
1693 if (unlikely(retval == -ECHILD))
1694 retval = path_lookupat(dfd, name, flags, nd);
1695 if (unlikely(retval == -ESTALE))
1696 retval = path_lookupat(dfd, name, flags | LOOKUP_REVAL, nd);
1697
1744 if (likely(!retval)) { 1698 if (likely(!retval)) {
1745 if (unlikely(!audit_dummy_context())) { 1699 if (unlikely(!audit_dummy_context())) {
1746 if (nd->path.dentry && nd->inode) 1700 if (nd->path.dentry && nd->inode)
1747 audit_inode(name, nd->path.dentry); 1701 audit_inode(name, nd->path.dentry);
1748 } 1702 }
1749 } 1703 }
1750
1751 return retval; 1704 return retval;
1752} 1705}
1753 1706
1754int path_lookup(const char *name, unsigned int flags, 1707int kern_path_parent(const char *name, struct nameidata *nd)
1755 struct nameidata *nd)
1756{ 1708{
1757 return do_path_lookup(AT_FDCWD, name, flags, nd); 1709 return do_path_lookup(AT_FDCWD, name, LOOKUP_PARENT, nd);
1758} 1710}
1759 1711
1760int kern_path(const char *name, unsigned int flags, struct path *path) 1712int kern_path(const char *name, unsigned int flags, struct path *path)
@@ -1778,29 +1730,10 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
1778 const char *name, unsigned int flags, 1730 const char *name, unsigned int flags,
1779 struct nameidata *nd) 1731 struct nameidata *nd)
1780{ 1732{
1781 int retval; 1733 nd->root.dentry = dentry;
1782 1734 nd->root.mnt = mnt;
1783 /* same as do_path_lookup */ 1735 /* the first argument of do_path_lookup() is ignored with LOOKUP_ROOT */
1784 nd->last_type = LAST_ROOT; 1736 return do_path_lookup(AT_FDCWD, name, flags | LOOKUP_ROOT, nd);
1785 nd->flags = flags;
1786 nd->depth = 0;
1787
1788 nd->path.dentry = dentry;
1789 nd->path.mnt = mnt;
1790 path_get(&nd->path);
1791 nd->root = nd->path;
1792 path_get(&nd->root);
1793 nd->inode = nd->path.dentry->d_inode;
1794
1795 retval = path_walk(name, nd);
1796 if (unlikely(!retval && !audit_dummy_context() && nd->path.dentry &&
1797 nd->inode))
1798 audit_inode(name, nd->path.dentry);
1799
1800 path_put(&nd->root);
1801 nd->root.mnt = NULL;
1802
1803 return retval;
1804} 1737}
1805 1738
1806static struct dentry *__lookup_hash(struct qstr *name, 1739static struct dentry *__lookup_hash(struct qstr *name,
@@ -1815,17 +1748,6 @@ static struct dentry *__lookup_hash(struct qstr *name,
1815 return ERR_PTR(err); 1748 return ERR_PTR(err);
1816 1749
1817 /* 1750 /*
1818 * See if the low-level filesystem might want
1819 * to use its own hash..
1820 */
1821 if (base->d_flags & DCACHE_OP_HASH) {
1822 err = base->d_op->d_hash(base, inode, name);
1823 dentry = ERR_PTR(err);
1824 if (err < 0)
1825 goto out;
1826 }
1827
1828 /*
1829 * Don't bother with __d_lookup: callers are for creat as 1751 * Don't bother with __d_lookup: callers are for creat as
1830 * well as unlink, so a lot of the time it would cost 1752 * well as unlink, so a lot of the time it would cost
1831 * a double lookup. 1753 * a double lookup.
@@ -1837,7 +1759,7 @@ static struct dentry *__lookup_hash(struct qstr *name,
1837 1759
1838 if (!dentry) 1760 if (!dentry)
1839 dentry = d_alloc_and_lookup(base, name, nd); 1761 dentry = d_alloc_and_lookup(base, name, nd);
1840out: 1762
1841 return dentry; 1763 return dentry;
1842} 1764}
1843 1765
@@ -1851,28 +1773,6 @@ static struct dentry *lookup_hash(struct nameidata *nd)
1851 return __lookup_hash(&nd->last, nd->path.dentry, nd); 1773 return __lookup_hash(&nd->last, nd->path.dentry, nd);
1852} 1774}
1853 1775
1854static int __lookup_one_len(const char *name, struct qstr *this,
1855 struct dentry *base, int len)
1856{
1857 unsigned long hash;
1858 unsigned int c;
1859
1860 this->name = name;
1861 this->len = len;
1862 if (!len)
1863 return -EACCES;
1864
1865 hash = init_name_hash();
1866 while (len--) {
1867 c = *(const unsigned char *)name++;
1868 if (c == '/' || c == '\0')
1869 return -EACCES;
1870 hash = partial_name_hash(c, hash);
1871 }
1872 this->hash = end_name_hash(hash);
1873 return 0;
1874}
1875
1876/** 1776/**
1877 * lookup_one_len - filesystem helper to lookup single pathname component 1777 * lookup_one_len - filesystem helper to lookup single pathname component
1878 * @name: pathname component to lookup 1778 * @name: pathname component to lookup
@@ -1886,14 +1786,34 @@ static int __lookup_one_len(const char *name, struct qstr *this,
1886 */ 1786 */
1887struct dentry *lookup_one_len(const char *name, struct dentry *base, int len) 1787struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
1888{ 1788{
1889 int err;
1890 struct qstr this; 1789 struct qstr this;
1790 unsigned long hash;
1791 unsigned int c;
1891 1792
1892 WARN_ON_ONCE(!mutex_is_locked(&base->d_inode->i_mutex)); 1793 WARN_ON_ONCE(!mutex_is_locked(&base->d_inode->i_mutex));
1893 1794
1894 err = __lookup_one_len(name, &this, base, len); 1795 this.name = name;
1895 if (err) 1796 this.len = len;
1896 return ERR_PTR(err); 1797 if (!len)
1798 return ERR_PTR(-EACCES);
1799
1800 hash = init_name_hash();
1801 while (len--) {
1802 c = *(const unsigned char *)name++;
1803 if (c == '/' || c == '\0')
1804 return ERR_PTR(-EACCES);
1805 hash = partial_name_hash(c, hash);
1806 }
1807 this.hash = end_name_hash(hash);
1808 /*
1809 * See if the low-level filesystem might want
1810 * to use its own hash..
1811 */
1812 if (base->d_flags & DCACHE_OP_HASH) {
1813 int err = base->d_op->d_hash(base, base->d_inode, &this);
1814 if (err < 0)
1815 return ERR_PTR(err);
1816 }
1897 1817
1898 return __lookup_hash(&this, base, NULL); 1818 return __lookup_hash(&this, base, NULL);
1899} 1819}
@@ -1902,7 +1822,7 @@ int user_path_at(int dfd, const char __user *name, unsigned flags,
1902 struct path *path) 1822 struct path *path)
1903{ 1823{
1904 struct nameidata nd; 1824 struct nameidata nd;
1905 char *tmp = getname(name); 1825 char *tmp = getname_flags(name, flags);
1906 int err = PTR_ERR(tmp); 1826 int err = PTR_ERR(tmp);
1907 if (!IS_ERR(tmp)) { 1827 if (!IS_ERR(tmp)) {
1908 1828
@@ -1944,11 +1864,15 @@ static inline int check_sticky(struct inode *dir, struct inode *inode)
1944 1864
1945 if (!(dir->i_mode & S_ISVTX)) 1865 if (!(dir->i_mode & S_ISVTX))
1946 return 0; 1866 return 0;
1867 if (current_user_ns() != inode_userns(inode))
1868 goto other_userns;
1947 if (inode->i_uid == fsuid) 1869 if (inode->i_uid == fsuid)
1948 return 0; 1870 return 0;
1949 if (dir->i_uid == fsuid) 1871 if (dir->i_uid == fsuid)
1950 return 0; 1872 return 0;
1951 return !capable(CAP_FOWNER); 1873
1874other_userns:
1875 return !ns_capable(inode_userns(inode), CAP_FOWNER);
1952} 1876}
1953 1877
1954/* 1878/*
@@ -2082,12 +2006,16 @@ int vfs_create(struct inode *dir, struct dentry *dentry, int mode,
2082 return error; 2006 return error;
2083} 2007}
2084 2008
2085int may_open(struct path *path, int acc_mode, int flag) 2009static int may_open(struct path *path, int acc_mode, int flag)
2086{ 2010{
2087 struct dentry *dentry = path->dentry; 2011 struct dentry *dentry = path->dentry;
2088 struct inode *inode = dentry->d_inode; 2012 struct inode *inode = dentry->d_inode;
2089 int error; 2013 int error;
2090 2014
2015 /* O_PATH? */
2016 if (!acc_mode)
2017 return 0;
2018
2091 if (!inode) 2019 if (!inode)
2092 return -ENOENT; 2020 return -ENOENT;
2093 2021
@@ -2124,7 +2052,7 @@ int may_open(struct path *path, int acc_mode, int flag)
2124 } 2052 }
2125 2053
2126 /* O_NOATIME can only be set by the owner or superuser */ 2054 /* O_NOATIME can only be set by the owner or superuser */
2127 if (flag & O_NOATIME && !is_owner_or_cap(inode)) 2055 if (flag & O_NOATIME && !inode_owner_or_capable(inode))
2128 return -EPERM; 2056 return -EPERM;
2129 2057
2130 /* 2058 /*
@@ -2156,34 +2084,6 @@ static int handle_truncate(struct file *filp)
2156} 2084}
2157 2085
2158/* 2086/*
2159 * Be careful about ever adding any more callers of this
2160 * function. Its flags must be in the namei format, not
2161 * what get passed to sys_open().
2162 */
2163static int __open_namei_create(struct nameidata *nd, struct path *path,
2164 int open_flag, int mode)
2165{
2166 int error;
2167 struct dentry *dir = nd->path.dentry;
2168
2169 if (!IS_POSIXACL(dir->d_inode))
2170 mode &= ~current_umask();
2171 error = security_path_mknod(&nd->path, path->dentry, mode, 0);
2172 if (error)
2173 goto out_unlock;
2174 error = vfs_create(dir->d_inode, path->dentry, mode, nd);
2175out_unlock:
2176 mutex_unlock(&dir->d_inode->i_mutex);
2177 dput(nd->path.dentry);
2178 nd->path.dentry = path->dentry;
2179
2180 if (error)
2181 return error;
2182 /* Don't check for write permission, don't truncate */
2183 return may_open(&nd->path, 0, open_flag & ~O_TRUNC);
2184}
2185
2186/*
2187 * Note that while the flag value (low two bits) for sys_open means: 2087 * Note that while the flag value (low two bits) for sys_open means:
2188 * 00 - read-only 2088 * 00 - read-only
2189 * 01 - write-only 2089 * 01 - write-only
@@ -2207,128 +2107,115 @@ static inline int open_to_namei_flags(int flag)
2207 return flag; 2107 return flag;
2208} 2108}
2209 2109
2210static int open_will_truncate(int flag, struct inode *inode)
2211{
2212 /*
2213 * We'll never write to the fs underlying
2214 * a device file.
2215 */
2216 if (special_file(inode->i_mode))
2217 return 0;
2218 return (flag & O_TRUNC);
2219}
2220
2221static struct file *finish_open(struct nameidata *nd,
2222 int open_flag, int acc_mode)
2223{
2224 struct file *filp;
2225 int will_truncate;
2226 int error;
2227
2228 will_truncate = open_will_truncate(open_flag, nd->path.dentry->d_inode);
2229 if (will_truncate) {
2230 error = mnt_want_write(nd->path.mnt);
2231 if (error)
2232 goto exit;
2233 }
2234 error = may_open(&nd->path, acc_mode, open_flag);
2235 if (error) {
2236 if (will_truncate)
2237 mnt_drop_write(nd->path.mnt);
2238 goto exit;
2239 }
2240 filp = nameidata_to_filp(nd);
2241 if (!IS_ERR(filp)) {
2242 error = ima_file_check(filp, acc_mode);
2243 if (error) {
2244 fput(filp);
2245 filp = ERR_PTR(error);
2246 }
2247 }
2248 if (!IS_ERR(filp)) {
2249 if (will_truncate) {
2250 error = handle_truncate(filp);
2251 if (error) {
2252 fput(filp);
2253 filp = ERR_PTR(error);
2254 }
2255 }
2256 }
2257 /*
2258 * It is now safe to drop the mnt write
2259 * because the filp has had a write taken
2260 * on its behalf.
2261 */
2262 if (will_truncate)
2263 mnt_drop_write(nd->path.mnt);
2264 path_put(&nd->path);
2265 return filp;
2266
2267exit:
2268 if (!IS_ERR(nd->intent.open.file))
2269 release_open_intent(nd);
2270 path_put(&nd->path);
2271 return ERR_PTR(error);
2272}
2273
2274/* 2110/*
2275 * Handle O_CREAT case for do_filp_open 2111 * Handle the last step of open()
2276 */ 2112 */
2277static struct file *do_last(struct nameidata *nd, struct path *path, 2113static struct file *do_last(struct nameidata *nd, struct path *path,
2278 int open_flag, int acc_mode, 2114 const struct open_flags *op, const char *pathname)
2279 int mode, const char *pathname)
2280{ 2115{
2281 struct dentry *dir = nd->path.dentry; 2116 struct dentry *dir = nd->path.dentry;
2117 struct dentry *dentry;
2118 int open_flag = op->open_flag;
2119 int will_truncate = open_flag & O_TRUNC;
2120 int want_write = 0;
2121 int acc_mode = op->acc_mode;
2282 struct file *filp; 2122 struct file *filp;
2283 int error = -EISDIR; 2123 int error;
2124
2125 nd->flags &= ~LOOKUP_PARENT;
2126 nd->flags |= op->intent;
2284 2127
2285 switch (nd->last_type) { 2128 switch (nd->last_type) {
2286 case LAST_DOTDOT: 2129 case LAST_DOTDOT:
2287 follow_dotdot(nd);
2288 dir = nd->path.dentry;
2289 case LAST_DOT: 2130 case LAST_DOT:
2290 if (need_reval_dot(dir)) { 2131 error = handle_dots(nd, nd->last_type);
2291 int status = d_revalidate(nd->path.dentry, nd); 2132 if (error)
2292 if (!status) 2133 return ERR_PTR(error);
2293 status = -ESTALE;
2294 if (status < 0) {
2295 error = status;
2296 goto exit;
2297 }
2298 }
2299 /* fallthrough */ 2134 /* fallthrough */
2300 case LAST_ROOT: 2135 case LAST_ROOT:
2301 goto exit; 2136 if (nd->flags & LOOKUP_RCU) {
2137 if (nameidata_drop_rcu_last(nd))
2138 return ERR_PTR(-ECHILD);
2139 }
2140 error = handle_reval_path(nd);
2141 if (error)
2142 goto exit;
2143 audit_inode(pathname, nd->path.dentry);
2144 if (open_flag & O_CREAT) {
2145 error = -EISDIR;
2146 goto exit;
2147 }
2148 goto ok;
2302 case LAST_BIND: 2149 case LAST_BIND:
2150 /* can't be RCU mode here */
2151 error = handle_reval_path(nd);
2152 if (error)
2153 goto exit;
2303 audit_inode(pathname, dir); 2154 audit_inode(pathname, dir);
2304 goto ok; 2155 goto ok;
2305 } 2156 }
2306 2157
2158 if (!(open_flag & O_CREAT)) {
2159 int symlink_ok = 0;
2160 if (nd->last.name[nd->last.len])
2161 nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
2162 if (open_flag & O_PATH && !(nd->flags & LOOKUP_FOLLOW))
2163 symlink_ok = 1;
2164 /* we _can_ be in RCU mode here */
2165 error = walk_component(nd, path, &nd->last, LAST_NORM,
2166 !symlink_ok);
2167 if (error < 0)
2168 return ERR_PTR(error);
2169 if (error) /* symlink */
2170 return NULL;
2171 /* sayonara */
2172 if (nd->flags & LOOKUP_RCU) {
2173 if (nameidata_drop_rcu_last(nd))
2174 return ERR_PTR(-ECHILD);
2175 }
2176
2177 error = -ENOTDIR;
2178 if (nd->flags & LOOKUP_DIRECTORY) {
2179 if (!nd->inode->i_op->lookup)
2180 goto exit;
2181 }
2182 audit_inode(pathname, nd->path.dentry);
2183 goto ok;
2184 }
2185
2186 /* create side of things */
2187
2188 if (nd->flags & LOOKUP_RCU) {
2189 if (nameidata_drop_rcu_last(nd))
2190 return ERR_PTR(-ECHILD);
2191 }
2192
2193 audit_inode(pathname, dir);
2194 error = -EISDIR;
2307 /* trailing slashes? */ 2195 /* trailing slashes? */
2308 if (nd->last.name[nd->last.len]) 2196 if (nd->last.name[nd->last.len])
2309 goto exit; 2197 goto exit;
2310 2198
2311 mutex_lock(&dir->d_inode->i_mutex); 2199 mutex_lock(&dir->d_inode->i_mutex);
2312 2200
2313 path->dentry = lookup_hash(nd); 2201 dentry = lookup_hash(nd);
2314 path->mnt = nd->path.mnt; 2202 error = PTR_ERR(dentry);
2315 2203 if (IS_ERR(dentry)) {
2316 error = PTR_ERR(path->dentry);
2317 if (IS_ERR(path->dentry)) {
2318 mutex_unlock(&dir->d_inode->i_mutex); 2204 mutex_unlock(&dir->d_inode->i_mutex);
2319 goto exit; 2205 goto exit;
2320 } 2206 }
2321 2207
2322 if (IS_ERR(nd->intent.open.file)) { 2208 path->dentry = dentry;
2323 error = PTR_ERR(nd->intent.open.file); 2209 path->mnt = nd->path.mnt;
2324 goto exit_mutex_unlock;
2325 }
2326 2210
2327 /* Negative dentry, just create the file */ 2211 /* Negative dentry, just create the file */
2328 if (!path->dentry->d_inode) { 2212 if (!dentry->d_inode) {
2213 int mode = op->mode;
2214 if (!IS_POSIXACL(dir->d_inode))
2215 mode &= ~current_umask();
2329 /* 2216 /*
2330 * This write is needed to ensure that a 2217 * This write is needed to ensure that a
2331 * ro->rw transition does not occur between 2218 * rw->ro transition does not occur between
2332 * the time when the file is created and when 2219 * the time when the file is created and when
2333 * a permanent write count is taken through 2220 * a permanent write count is taken through
2334 * the 'struct file' in nameidata_to_filp(). 2221 * the 'struct file' in nameidata_to_filp().
@@ -2336,22 +2223,21 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
2336 error = mnt_want_write(nd->path.mnt); 2223 error = mnt_want_write(nd->path.mnt);
2337 if (error) 2224 if (error)
2338 goto exit_mutex_unlock; 2225 goto exit_mutex_unlock;
2339 error = __open_namei_create(nd, path, open_flag, mode); 2226 want_write = 1;
2340 if (error) { 2227 /* Don't check for write permission, don't truncate */
2341 mnt_drop_write(nd->path.mnt); 2228 open_flag &= ~O_TRUNC;
2342 goto exit; 2229 will_truncate = 0;
2343 } 2230 acc_mode = MAY_OPEN;
2344 filp = nameidata_to_filp(nd); 2231 error = security_path_mknod(&nd->path, dentry, mode, 0);
2345 mnt_drop_write(nd->path.mnt); 2232 if (error)
2346 path_put(&nd->path); 2233 goto exit_mutex_unlock;
2347 if (!IS_ERR(filp)) { 2234 error = vfs_create(dir->d_inode, dentry, mode, nd);
2348 error = ima_file_check(filp, acc_mode); 2235 if (error)
2349 if (error) { 2236 goto exit_mutex_unlock;
2350 fput(filp); 2237 mutex_unlock(&dir->d_inode->i_mutex);
2351 filp = ERR_PTR(error); 2238 dput(nd->path.dentry);
2352 } 2239 nd->path.dentry = dentry;
2353 } 2240 goto common;
2354 return filp;
2355 } 2241 }
2356 2242
2357 /* 2243 /*
@@ -2381,7 +2267,40 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
2381 if (S_ISDIR(nd->inode->i_mode)) 2267 if (S_ISDIR(nd->inode->i_mode))
2382 goto exit; 2268 goto exit;
2383ok: 2269ok:
2384 filp = finish_open(nd, open_flag, acc_mode); 2270 if (!S_ISREG(nd->inode->i_mode))
2271 will_truncate = 0;
2272
2273 if (will_truncate) {
2274 error = mnt_want_write(nd->path.mnt);
2275 if (error)
2276 goto exit;
2277 want_write = 1;
2278 }
2279common:
2280 error = may_open(&nd->path, acc_mode, open_flag);
2281 if (error)
2282 goto exit;
2283 filp = nameidata_to_filp(nd);
2284 if (!IS_ERR(filp)) {
2285 error = ima_file_check(filp, op->acc_mode);
2286 if (error) {
2287 fput(filp);
2288 filp = ERR_PTR(error);
2289 }
2290 }
2291 if (!IS_ERR(filp)) {
2292 if (will_truncate) {
2293 error = handle_truncate(filp);
2294 if (error) {
2295 fput(filp);
2296 filp = ERR_PTR(error);
2297 }
2298 }
2299 }
2300out:
2301 if (want_write)
2302 mnt_drop_write(nd->path.mnt);
2303 path_put(&nd->path);
2385 return filp; 2304 return filp;
2386 2305
2387exit_mutex_unlock: 2306exit_mutex_unlock:
@@ -2389,199 +2308,103 @@ exit_mutex_unlock:
2389exit_dput: 2308exit_dput:
2390 path_put_conditional(path, nd); 2309 path_put_conditional(path, nd);
2391exit: 2310exit:
2392 if (!IS_ERR(nd->intent.open.file)) 2311 filp = ERR_PTR(error);
2393 release_open_intent(nd); 2312 goto out;
2394 path_put(&nd->path);
2395 return ERR_PTR(error);
2396} 2313}
2397 2314
2398/* 2315static struct file *path_openat(int dfd, const char *pathname,
2399 * Note that the low bits of the passed in "open_flag" 2316 struct nameidata *nd, const struct open_flags *op, int flags)
2400 * are not the same as in the local variable "flag". See
2401 * open_to_namei_flags() for more details.
2402 */
2403struct file *do_filp_open(int dfd, const char *pathname,
2404 int open_flag, int mode, int acc_mode)
2405{ 2317{
2318 struct file *base = NULL;
2406 struct file *filp; 2319 struct file *filp;
2407 struct nameidata nd;
2408 int error;
2409 struct path path; 2320 struct path path;
2410 int count = 0; 2321 int error;
2411 int flag = open_to_namei_flags(open_flag);
2412 int flags;
2413
2414 if (!(open_flag & O_CREAT))
2415 mode = 0;
2416
2417 /* Must never be set by userspace */
2418 open_flag &= ~FMODE_NONOTIFY;
2419
2420 /*
2421 * O_SYNC is implemented as __O_SYNC|O_DSYNC. As many places only
2422 * check for O_DSYNC if the need any syncing at all we enforce it's
2423 * always set instead of having to deal with possibly weird behaviour
2424 * for malicious applications setting only __O_SYNC.
2425 */
2426 if (open_flag & __O_SYNC)
2427 open_flag |= O_DSYNC;
2428
2429 if (!acc_mode)
2430 acc_mode = MAY_OPEN | ACC_MODE(open_flag);
2431
2432 /* O_TRUNC implies we need access checks for write permissions */
2433 if (open_flag & O_TRUNC)
2434 acc_mode |= MAY_WRITE;
2435
2436 /* Allow the LSM permission hook to distinguish append
2437 access from general write access. */
2438 if (open_flag & O_APPEND)
2439 acc_mode |= MAY_APPEND;
2440
2441 flags = LOOKUP_OPEN;
2442 if (open_flag & O_CREAT) {
2443 flags |= LOOKUP_CREATE;
2444 if (open_flag & O_EXCL)
2445 flags |= LOOKUP_EXCL;
2446 }
2447 if (open_flag & O_DIRECTORY)
2448 flags |= LOOKUP_DIRECTORY;
2449 if (!(open_flag & O_NOFOLLOW))
2450 flags |= LOOKUP_FOLLOW;
2451 2322
2452 filp = get_empty_filp(); 2323 filp = get_empty_filp();
2453 if (!filp) 2324 if (!filp)
2454 return ERR_PTR(-ENFILE); 2325 return ERR_PTR(-ENFILE);
2455 2326
2456 filp->f_flags = open_flag; 2327 filp->f_flags = op->open_flag;
2457 nd.intent.open.file = filp; 2328 nd->intent.open.file = filp;
2458 nd.intent.open.flags = flag; 2329 nd->intent.open.flags = open_to_namei_flags(op->open_flag);
2459 nd.intent.open.create_mode = mode; 2330 nd->intent.open.create_mode = op->mode;
2460
2461 if (open_flag & O_CREAT)
2462 goto creat;
2463 2331
2464 /* !O_CREAT, simple open */ 2332 error = path_init(dfd, pathname, flags | LOOKUP_PARENT, nd, &base);
2465 error = do_path_lookup(dfd, pathname, flags, &nd);
2466 if (unlikely(error)) 2333 if (unlikely(error))
2467 goto out_filp; 2334 goto out_filp;
2468 error = -ELOOP;
2469 if (!(nd.flags & LOOKUP_FOLLOW)) {
2470 if (nd.inode->i_op->follow_link)
2471 goto out_path;
2472 }
2473 error = -ENOTDIR;
2474 if (nd.flags & LOOKUP_DIRECTORY) {
2475 if (!nd.inode->i_op->lookup)
2476 goto out_path;
2477 }
2478 audit_inode(pathname, nd.path.dentry);
2479 filp = finish_open(&nd, open_flag, acc_mode);
2480 return filp;
2481
2482creat:
2483 /* OK, have to create the file. Find the parent. */
2484 error = path_init_rcu(dfd, pathname,
2485 LOOKUP_PARENT | (flags & LOOKUP_REVAL), &nd);
2486 if (error)
2487 goto out_filp;
2488 error = path_walk_rcu(pathname, &nd);
2489 path_finish_rcu(&nd);
2490 if (unlikely(error == -ECHILD || error == -ESTALE)) {
2491 /* slower, locked walk */
2492 if (error == -ESTALE) {
2493reval:
2494 flags |= LOOKUP_REVAL;
2495 }
2496 error = path_init(dfd, pathname,
2497 LOOKUP_PARENT | (flags & LOOKUP_REVAL), &nd);
2498 if (error)
2499 goto out_filp;
2500 2335
2501 error = path_walk_simple(pathname, &nd); 2336 current->total_link_count = 0;
2502 } 2337 error = link_path_walk(pathname, nd);
2503 if (unlikely(error)) 2338 if (unlikely(error))
2504 goto out_filp; 2339 goto out_filp;
2505 if (unlikely(!audit_dummy_context()))
2506 audit_inode(pathname, nd.path.dentry);
2507 2340
2508 /* 2341 filp = do_last(nd, &path, op, pathname);
2509 * We have the parent and last component.
2510 */
2511 nd.flags = flags;
2512 filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname);
2513 while (unlikely(!filp)) { /* trailing symlink */ 2342 while (unlikely(!filp)) { /* trailing symlink */
2514 struct path link = path; 2343 struct path link = path;
2515 struct inode *linki = link.dentry->d_inode;
2516 void *cookie; 2344 void *cookie;
2517 error = -ELOOP; 2345 if (!(nd->flags & LOOKUP_FOLLOW)) {
2518 if (!(nd.flags & LOOKUP_FOLLOW)) 2346 path_put_conditional(&path, nd);
2519 goto exit_dput; 2347 path_put(&nd->path);
2520 if (count++ == 32) 2348 filp = ERR_PTR(-ELOOP);
2521 goto exit_dput; 2349 break;
2522 /*
2523 * This is subtle. Instead of calling do_follow_link() we do
2524 * the thing by hands. The reason is that this way we have zero
2525 * link_count and path_walk() (called from ->follow_link)
2526 * honoring LOOKUP_PARENT. After that we have the parent and
2527 * last component, i.e. we are in the same situation as after
2528 * the first path_walk(). Well, almost - if the last component
2529 * is normal we get its copy stored in nd->last.name and we will
2530 * have to putname() it when we are done. Procfs-like symlinks
2531 * just set LAST_BIND.
2532 */
2533 nd.flags |= LOOKUP_PARENT;
2534 error = security_inode_follow_link(link.dentry, &nd);
2535 if (error)
2536 goto exit_dput;
2537 error = __do_follow_link(&link, &nd, &cookie);
2538 if (unlikely(error)) {
2539 if (!IS_ERR(cookie) && linki->i_op->put_link)
2540 linki->i_op->put_link(link.dentry, &nd, cookie);
2541 /* nd.path had been dropped */
2542 nd.path = link;
2543 goto out_path;
2544 } 2350 }
2545 nd.flags &= ~LOOKUP_PARENT; 2351 nd->flags |= LOOKUP_PARENT;
2546 filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname); 2352 nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL);
2547 if (linki->i_op->put_link) 2353 error = follow_link(&link, nd, &cookie);
2548 linki->i_op->put_link(link.dentry, &nd, cookie); 2354 if (unlikely(error))
2549 path_put(&link); 2355 filp = ERR_PTR(error);
2356 else
2357 filp = do_last(nd, &path, op, pathname);
2358 put_link(nd, &link, cookie);
2550 } 2359 }
2551out: 2360out:
2552 if (nd.root.mnt) 2361 if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT))
2553 path_put(&nd.root); 2362 path_put(&nd->root);
2554 if (filp == ERR_PTR(-ESTALE) && !(flags & LOOKUP_REVAL)) 2363 if (base)
2555 goto reval; 2364 fput(base);
2365 release_open_intent(nd);
2556 return filp; 2366 return filp;
2557 2367
2558exit_dput:
2559 path_put_conditional(&path, &nd);
2560out_path:
2561 path_put(&nd.path);
2562out_filp: 2368out_filp:
2563 if (!IS_ERR(nd.intent.open.file))
2564 release_open_intent(&nd);
2565 filp = ERR_PTR(error); 2369 filp = ERR_PTR(error);
2566 goto out; 2370 goto out;
2567} 2371}
2568 2372
2569/** 2373struct file *do_filp_open(int dfd, const char *pathname,
2570 * filp_open - open file and return file pointer 2374 const struct open_flags *op, int flags)
2571 *
2572 * @filename: path to open
2573 * @flags: open flags as per the open(2) second argument
2574 * @mode: mode for the new file if O_CREAT is set, else ignored
2575 *
2576 * This is the helper to open a file from kernelspace if you really
2577 * have to. But in generally you should not do this, so please move
2578 * along, nothing to see here..
2579 */
2580struct file *filp_open(const char *filename, int flags, int mode)
2581{ 2375{
2582 return do_filp_open(AT_FDCWD, filename, flags, mode, 0); 2376 struct nameidata nd;
2377 struct file *filp;
2378
2379 filp = path_openat(dfd, pathname, &nd, op, flags | LOOKUP_RCU);
2380 if (unlikely(filp == ERR_PTR(-ECHILD)))
2381 filp = path_openat(dfd, pathname, &nd, op, flags);
2382 if (unlikely(filp == ERR_PTR(-ESTALE)))
2383 filp = path_openat(dfd, pathname, &nd, op, flags | LOOKUP_REVAL);
2384 return filp;
2385}
2386
2387struct file *do_file_open_root(struct dentry *dentry, struct vfsmount *mnt,
2388 const char *name, const struct open_flags *op, int flags)
2389{
2390 struct nameidata nd;
2391 struct file *file;
2392
2393 nd.root.mnt = mnt;
2394 nd.root.dentry = dentry;
2395
2396 flags |= LOOKUP_ROOT;
2397
2398 if (dentry->d_inode->i_op->follow_link && op->intent & LOOKUP_OPEN)
2399 return ERR_PTR(-ELOOP);
2400
2401 file = path_openat(-1, name, &nd, op, flags | LOOKUP_RCU);
2402 if (unlikely(file == ERR_PTR(-ECHILD)))
2403 file = path_openat(-1, name, &nd, op, flags);
2404 if (unlikely(file == ERR_PTR(-ESTALE)))
2405 file = path_openat(-1, name, &nd, op, flags | LOOKUP_REVAL);
2406 return file;
2583} 2407}
2584EXPORT_SYMBOL(filp_open);
2585 2408
2586/** 2409/**
2587 * lookup_create - lookup a dentry, creating it if it doesn't exist 2410 * lookup_create - lookup a dentry, creating it if it doesn't exist
@@ -2643,7 +2466,8 @@ int vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
2643 if (error) 2466 if (error)
2644 return error; 2467 return error;
2645 2468
2646 if ((S_ISCHR(mode) || S_ISBLK(mode)) && !capable(CAP_MKNOD)) 2469 if ((S_ISCHR(mode) || S_ISBLK(mode)) &&
2470 !ns_capable(inode_userns(dir), CAP_MKNOD))
2647 return -EPERM; 2471 return -EPERM;
2648 2472
2649 if (!dir->i_op->mknod) 2473 if (!dir->i_op->mknod)
@@ -3120,7 +2944,11 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de
3120 return error; 2944 return error;
3121 2945
3122 mutex_lock(&inode->i_mutex); 2946 mutex_lock(&inode->i_mutex);
3123 error = dir->i_op->link(old_dentry, dir, new_dentry); 2947 /* Make sure we don't allow creating hardlink to an unlinked file */
2948 if (inode->i_nlink == 0)
2949 error = -ENOENT;
2950 else
2951 error = dir->i_op->link(old_dentry, dir, new_dentry);
3124 mutex_unlock(&inode->i_mutex); 2952 mutex_unlock(&inode->i_mutex);
3125 if (!error) 2953 if (!error)
3126 fsnotify_link(dir, inode, new_dentry); 2954 fsnotify_link(dir, inode, new_dentry);
@@ -3142,15 +2970,27 @@ SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname,
3142 struct dentry *new_dentry; 2970 struct dentry *new_dentry;
3143 struct nameidata nd; 2971 struct nameidata nd;
3144 struct path old_path; 2972 struct path old_path;
2973 int how = 0;
3145 int error; 2974 int error;
3146 char *to; 2975 char *to;
3147 2976
3148 if ((flags & ~AT_SYMLINK_FOLLOW) != 0) 2977 if ((flags & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH)) != 0)
3149 return -EINVAL; 2978 return -EINVAL;
2979 /*
2980 * To use null names we require CAP_DAC_READ_SEARCH
2981 * This ensures that not everyone will be able to create
2982 * handlink using the passed filedescriptor.
2983 */
2984 if (flags & AT_EMPTY_PATH) {
2985 if (!capable(CAP_DAC_READ_SEARCH))
2986 return -ENOENT;
2987 how = LOOKUP_EMPTY;
2988 }
2989
2990 if (flags & AT_SYMLINK_FOLLOW)
2991 how |= LOOKUP_FOLLOW;
3150 2992
3151 error = user_path_at(olddfd, oldname, 2993 error = user_path_at(olddfd, oldname, how, &old_path);
3152 flags & AT_SYMLINK_FOLLOW ? LOOKUP_FOLLOW : 0,
3153 &old_path);
3154 if (error) 2994 if (error)
3155 return error; 2995 return error;
3156 2996
@@ -3587,7 +3427,7 @@ EXPORT_SYMBOL(page_readlink);
3587EXPORT_SYMBOL(__page_symlink); 3427EXPORT_SYMBOL(__page_symlink);
3588EXPORT_SYMBOL(page_symlink); 3428EXPORT_SYMBOL(page_symlink);
3589EXPORT_SYMBOL(page_symlink_inode_operations); 3429EXPORT_SYMBOL(page_symlink_inode_operations);
3590EXPORT_SYMBOL(path_lookup); 3430EXPORT_SYMBOL(kern_path_parent);
3591EXPORT_SYMBOL(kern_path); 3431EXPORT_SYMBOL(kern_path);
3592EXPORT_SYMBOL(vfs_path_lookup); 3432EXPORT_SYMBOL(vfs_path_lookup);
3593EXPORT_SYMBOL(inode_permission); 3433EXPORT_SYMBOL(inode_permission);
diff --git a/fs/namespace.c b/fs/namespace.c
index 7b0b95371696..7dba2ed03429 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -196,7 +196,7 @@ unsigned int mnt_get_count(struct vfsmount *mnt)
196#endif 196#endif
197} 197}
198 198
199struct vfsmount *alloc_vfsmnt(const char *name) 199static struct vfsmount *alloc_vfsmnt(const char *name)
200{ 200{
201 struct vfsmount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL); 201 struct vfsmount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
202 if (mnt) { 202 if (mnt) {
@@ -466,15 +466,7 @@ static void __mnt_unmake_readonly(struct vfsmount *mnt)
466 br_write_unlock(vfsmount_lock); 466 br_write_unlock(vfsmount_lock);
467} 467}
468 468
469void simple_set_mnt(struct vfsmount *mnt, struct super_block *sb) 469static void free_vfsmnt(struct vfsmount *mnt)
470{
471 mnt->mnt_sb = sb;
472 mnt->mnt_root = dget(sb->s_root);
473}
474
475EXPORT_SYMBOL(simple_set_mnt);
476
477void free_vfsmnt(struct vfsmount *mnt)
478{ 470{
479 kfree(mnt->mnt_devname); 471 kfree(mnt->mnt_devname);
480 mnt_free_id(mnt); 472 mnt_free_id(mnt);
@@ -678,6 +670,36 @@ static struct vfsmount *skip_mnt_tree(struct vfsmount *p)
678 return p; 670 return p;
679} 671}
680 672
673struct vfsmount *
674vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void *data)
675{
676 struct vfsmount *mnt;
677 struct dentry *root;
678
679 if (!type)
680 return ERR_PTR(-ENODEV);
681
682 mnt = alloc_vfsmnt(name);
683 if (!mnt)
684 return ERR_PTR(-ENOMEM);
685
686 if (flags & MS_KERNMOUNT)
687 mnt->mnt_flags = MNT_INTERNAL;
688
689 root = mount_fs(type, flags, name, data);
690 if (IS_ERR(root)) {
691 free_vfsmnt(mnt);
692 return ERR_CAST(root);
693 }
694
695 mnt->mnt_root = root;
696 mnt->mnt_sb = root->d_sb;
697 mnt->mnt_mountpoint = mnt->mnt_root;
698 mnt->mnt_parent = mnt;
699 return mnt;
700}
701EXPORT_SYMBOL_GPL(vfs_kern_mount);
702
681static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root, 703static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root,
682 int flag) 704 int flag)
683{ 705{
@@ -978,7 +1000,13 @@ static int show_vfsmnt(struct seq_file *m, void *v)
978 int err = 0; 1000 int err = 0;
979 struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt }; 1001 struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt };
980 1002
981 mangle(m, mnt->mnt_devname ? mnt->mnt_devname : "none"); 1003 if (mnt->mnt_sb->s_op->show_devname) {
1004 err = mnt->mnt_sb->s_op->show_devname(m, mnt);
1005 if (err)
1006 goto out;
1007 } else {
1008 mangle(m, mnt->mnt_devname ? mnt->mnt_devname : "none");
1009 }
982 seq_putc(m, ' '); 1010 seq_putc(m, ' ');
983 seq_path(m, &mnt_path, " \t\n\\"); 1011 seq_path(m, &mnt_path, " \t\n\\");
984 seq_putc(m, ' '); 1012 seq_putc(m, ' ');
@@ -1002,6 +1030,18 @@ const struct seq_operations mounts_op = {
1002 .show = show_vfsmnt 1030 .show = show_vfsmnt
1003}; 1031};
1004 1032
1033static int uuid_is_nil(u8 *uuid)
1034{
1035 int i;
1036 u8 *cp = (u8 *)uuid;
1037
1038 for (i = 0; i < 16; i++) {
1039 if (*cp++)
1040 return 0;
1041 }
1042 return 1;
1043}
1044
1005static int show_mountinfo(struct seq_file *m, void *v) 1045static int show_mountinfo(struct seq_file *m, void *v)
1006{ 1046{
1007 struct proc_mounts *p = m->private; 1047 struct proc_mounts *p = m->private;
@@ -1013,7 +1053,12 @@ static int show_mountinfo(struct seq_file *m, void *v)
1013 1053
1014 seq_printf(m, "%i %i %u:%u ", mnt->mnt_id, mnt->mnt_parent->mnt_id, 1054 seq_printf(m, "%i %i %u:%u ", mnt->mnt_id, mnt->mnt_parent->mnt_id,
1015 MAJOR(sb->s_dev), MINOR(sb->s_dev)); 1055 MAJOR(sb->s_dev), MINOR(sb->s_dev));
1016 seq_dentry(m, mnt->mnt_root, " \t\n\\"); 1056 if (sb->s_op->show_path)
1057 err = sb->s_op->show_path(m, mnt);
1058 else
1059 seq_dentry(m, mnt->mnt_root, " \t\n\\");
1060 if (err)
1061 goto out;
1017 seq_putc(m, ' '); 1062 seq_putc(m, ' ');
1018 seq_path_root(m, &mnt_path, &root, " \t\n\\"); 1063 seq_path_root(m, &mnt_path, &root, " \t\n\\");
1019 if (root.mnt != p->root.mnt || root.dentry != p->root.dentry) { 1064 if (root.mnt != p->root.mnt || root.dentry != p->root.dentry) {
@@ -1040,11 +1085,20 @@ static int show_mountinfo(struct seq_file *m, void *v)
1040 if (IS_MNT_UNBINDABLE(mnt)) 1085 if (IS_MNT_UNBINDABLE(mnt))
1041 seq_puts(m, " unbindable"); 1086 seq_puts(m, " unbindable");
1042 1087
1088 if (!uuid_is_nil(mnt->mnt_sb->s_uuid))
1089 /* print the uuid */
1090 seq_printf(m, " uuid:%pU", mnt->mnt_sb->s_uuid);
1091
1043 /* Filesystem specific data */ 1092 /* Filesystem specific data */
1044 seq_puts(m, " - "); 1093 seq_puts(m, " - ");
1045 show_type(m, sb); 1094 show_type(m, sb);
1046 seq_putc(m, ' '); 1095 seq_putc(m, ' ');
1047 mangle(m, mnt->mnt_devname ? mnt->mnt_devname : "none"); 1096 if (sb->s_op->show_devname)
1097 err = sb->s_op->show_devname(m, mnt);
1098 else
1099 mangle(m, mnt->mnt_devname ? mnt->mnt_devname : "none");
1100 if (err)
1101 goto out;
1048 seq_puts(m, sb->s_flags & MS_RDONLY ? " ro" : " rw"); 1102 seq_puts(m, sb->s_flags & MS_RDONLY ? " ro" : " rw");
1049 err = show_sb_opts(m, sb); 1103 err = show_sb_opts(m, sb);
1050 if (err) 1104 if (err)
@@ -1070,11 +1124,15 @@ static int show_vfsstat(struct seq_file *m, void *v)
1070 int err = 0; 1124 int err = 0;
1071 1125
1072 /* device */ 1126 /* device */
1073 if (mnt->mnt_devname) { 1127 if (mnt->mnt_sb->s_op->show_devname) {
1074 seq_puts(m, "device "); 1128 err = mnt->mnt_sb->s_op->show_devname(m, mnt);
1075 mangle(m, mnt->mnt_devname); 1129 } else {
1076 } else 1130 if (mnt->mnt_devname) {
1077 seq_puts(m, "no device"); 1131 seq_puts(m, "device ");
1132 mangle(m, mnt->mnt_devname);
1133 } else
1134 seq_puts(m, "no device");
1135 }
1078 1136
1079 /* mount point */ 1137 /* mount point */
1080 seq_puts(m, " mounted on "); 1138 seq_puts(m, " mounted on ");
@@ -1088,7 +1146,8 @@ static int show_vfsstat(struct seq_file *m, void *v)
1088 /* optional statistics */ 1146 /* optional statistics */
1089 if (mnt->mnt_sb->s_op->show_stats) { 1147 if (mnt->mnt_sb->s_op->show_stats) {
1090 seq_putc(m, ' '); 1148 seq_putc(m, ' ');
1091 err = mnt->mnt_sb->s_op->show_stats(m, mnt); 1149 if (!err)
1150 err = mnt->mnt_sb->s_op->show_stats(m, mnt);
1092 } 1151 }
1093 1152
1094 seq_putc(m, '\n'); 1153 seq_putc(m, '\n');
@@ -1244,7 +1303,7 @@ static int do_umount(struct vfsmount *mnt, int flags)
1244 */ 1303 */
1245 br_write_lock(vfsmount_lock); 1304 br_write_lock(vfsmount_lock);
1246 if (mnt_get_count(mnt) != 2) { 1305 if (mnt_get_count(mnt) != 2) {
1247 br_write_lock(vfsmount_lock); 1306 br_write_unlock(vfsmount_lock);
1248 return -EBUSY; 1307 return -EBUSY;
1249 } 1308 }
1250 br_write_unlock(vfsmount_lock); 1309 br_write_unlock(vfsmount_lock);
@@ -1604,9 +1663,35 @@ static int attach_recursive_mnt(struct vfsmount *source_mnt,
1604 return err; 1663 return err;
1605} 1664}
1606 1665
1666static int lock_mount(struct path *path)
1667{
1668 struct vfsmount *mnt;
1669retry:
1670 mutex_lock(&path->dentry->d_inode->i_mutex);
1671 if (unlikely(cant_mount(path->dentry))) {
1672 mutex_unlock(&path->dentry->d_inode->i_mutex);
1673 return -ENOENT;
1674 }
1675 down_write(&namespace_sem);
1676 mnt = lookup_mnt(path);
1677 if (likely(!mnt))
1678 return 0;
1679 up_write(&namespace_sem);
1680 mutex_unlock(&path->dentry->d_inode->i_mutex);
1681 path_put(path);
1682 path->mnt = mnt;
1683 path->dentry = dget(mnt->mnt_root);
1684 goto retry;
1685}
1686
1687static void unlock_mount(struct path *path)
1688{
1689 up_write(&namespace_sem);
1690 mutex_unlock(&path->dentry->d_inode->i_mutex);
1691}
1692
1607static int graft_tree(struct vfsmount *mnt, struct path *path) 1693static int graft_tree(struct vfsmount *mnt, struct path *path)
1608{ 1694{
1609 int err;
1610 if (mnt->mnt_sb->s_flags & MS_NOUSER) 1695 if (mnt->mnt_sb->s_flags & MS_NOUSER)
1611 return -EINVAL; 1696 return -EINVAL;
1612 1697
@@ -1614,16 +1699,10 @@ static int graft_tree(struct vfsmount *mnt, struct path *path)
1614 S_ISDIR(mnt->mnt_root->d_inode->i_mode)) 1699 S_ISDIR(mnt->mnt_root->d_inode->i_mode))
1615 return -ENOTDIR; 1700 return -ENOTDIR;
1616 1701
1617 err = -ENOENT; 1702 if (d_unlinked(path->dentry))
1618 mutex_lock(&path->dentry->d_inode->i_mutex); 1703 return -ENOENT;
1619 if (cant_mount(path->dentry))
1620 goto out_unlock;
1621 1704
1622 if (!d_unlinked(path->dentry)) 1705 return attach_recursive_mnt(mnt, path, NULL);
1623 err = attach_recursive_mnt(mnt, path, NULL);
1624out_unlock:
1625 mutex_unlock(&path->dentry->d_inode->i_mutex);
1626 return err;
1627} 1706}
1628 1707
1629/* 1708/*
@@ -1686,6 +1765,7 @@ static int do_change_type(struct path *path, int flag)
1686static int do_loopback(struct path *path, char *old_name, 1765static int do_loopback(struct path *path, char *old_name,
1687 int recurse) 1766 int recurse)
1688{ 1767{
1768 LIST_HEAD(umount_list);
1689 struct path old_path; 1769 struct path old_path;
1690 struct vfsmount *mnt = NULL; 1770 struct vfsmount *mnt = NULL;
1691 int err = mount_is_safe(path); 1771 int err = mount_is_safe(path);
@@ -1697,13 +1777,16 @@ static int do_loopback(struct path *path, char *old_name,
1697 if (err) 1777 if (err)
1698 return err; 1778 return err;
1699 1779
1700 down_write(&namespace_sem); 1780 err = lock_mount(path);
1781 if (err)
1782 goto out;
1783
1701 err = -EINVAL; 1784 err = -EINVAL;
1702 if (IS_MNT_UNBINDABLE(old_path.mnt)) 1785 if (IS_MNT_UNBINDABLE(old_path.mnt))
1703 goto out; 1786 goto out2;
1704 1787
1705 if (!check_mnt(path->mnt) || !check_mnt(old_path.mnt)) 1788 if (!check_mnt(path->mnt) || !check_mnt(old_path.mnt))
1706 goto out; 1789 goto out2;
1707 1790
1708 err = -ENOMEM; 1791 err = -ENOMEM;
1709 if (recurse) 1792 if (recurse)
@@ -1712,20 +1795,18 @@ static int do_loopback(struct path *path, char *old_name,
1712 mnt = clone_mnt(old_path.mnt, old_path.dentry, 0); 1795 mnt = clone_mnt(old_path.mnt, old_path.dentry, 0);
1713 1796
1714 if (!mnt) 1797 if (!mnt)
1715 goto out; 1798 goto out2;
1716 1799
1717 err = graft_tree(mnt, path); 1800 err = graft_tree(mnt, path);
1718 if (err) { 1801 if (err) {
1719 LIST_HEAD(umount_list);
1720
1721 br_write_lock(vfsmount_lock); 1802 br_write_lock(vfsmount_lock);
1722 umount_tree(mnt, 0, &umount_list); 1803 umount_tree(mnt, 0, &umount_list);
1723 br_write_unlock(vfsmount_lock); 1804 br_write_unlock(vfsmount_lock);
1724 release_mounts(&umount_list);
1725 } 1805 }
1726 1806out2:
1807 unlock_mount(path);
1808 release_mounts(&umount_list);
1727out: 1809out:
1728 up_write(&namespace_sem);
1729 path_put(&old_path); 1810 path_put(&old_path);
1730 return err; 1811 return err;
1731} 1812}
@@ -1767,6 +1848,10 @@ static int do_remount(struct path *path, int flags, int mnt_flags,
1767 if (path->dentry != path->mnt->mnt_root) 1848 if (path->dentry != path->mnt->mnt_root)
1768 return -EINVAL; 1849 return -EINVAL;
1769 1850
1851 err = security_sb_remount(sb, data);
1852 if (err)
1853 return err;
1854
1770 down_write(&sb->s_umount); 1855 down_write(&sb->s_umount);
1771 if (flags & MS_BIND) 1856 if (flags & MS_BIND)
1772 err = change_mount_flags(path->mnt, flags); 1857 err = change_mount_flags(path->mnt, flags);
@@ -1810,18 +1895,12 @@ static int do_move_mount(struct path *path, char *old_name)
1810 if (err) 1895 if (err)
1811 return err; 1896 return err;
1812 1897
1813 down_write(&namespace_sem); 1898 err = lock_mount(path);
1814 err = follow_down(path, true);
1815 if (err < 0) 1899 if (err < 0)
1816 goto out; 1900 goto out;
1817 1901
1818 err = -EINVAL; 1902 err = -EINVAL;
1819 if (!check_mnt(path->mnt) || !check_mnt(old_path.mnt)) 1903 if (!check_mnt(path->mnt) || !check_mnt(old_path.mnt))
1820 goto out;
1821
1822 err = -ENOENT;
1823 mutex_lock(&path->dentry->d_inode->i_mutex);
1824 if (cant_mount(path->dentry))
1825 goto out1; 1904 goto out1;
1826 1905
1827 if (d_unlinked(path->dentry)) 1906 if (d_unlinked(path->dentry))
@@ -1863,16 +1942,87 @@ static int do_move_mount(struct path *path, char *old_name)
1863 * automatically */ 1942 * automatically */
1864 list_del_init(&old_path.mnt->mnt_expire); 1943 list_del_init(&old_path.mnt->mnt_expire);
1865out1: 1944out1:
1866 mutex_unlock(&path->dentry->d_inode->i_mutex); 1945 unlock_mount(path);
1867out: 1946out:
1868 up_write(&namespace_sem);
1869 if (!err) 1947 if (!err)
1870 path_put(&parent_path); 1948 path_put(&parent_path);
1871 path_put(&old_path); 1949 path_put(&old_path);
1872 return err; 1950 return err;
1873} 1951}
1874 1952
1875static int do_add_mount(struct vfsmount *, struct path *, int); 1953static struct vfsmount *fs_set_subtype(struct vfsmount *mnt, const char *fstype)
1954{
1955 int err;
1956 const char *subtype = strchr(fstype, '.');
1957 if (subtype) {
1958 subtype++;
1959 err = -EINVAL;
1960 if (!subtype[0])
1961 goto err;
1962 } else
1963 subtype = "";
1964
1965 mnt->mnt_sb->s_subtype = kstrdup(subtype, GFP_KERNEL);
1966 err = -ENOMEM;
1967 if (!mnt->mnt_sb->s_subtype)
1968 goto err;
1969 return mnt;
1970
1971 err:
1972 mntput(mnt);
1973 return ERR_PTR(err);
1974}
1975
1976struct vfsmount *
1977do_kern_mount(const char *fstype, int flags, const char *name, void *data)
1978{
1979 struct file_system_type *type = get_fs_type(fstype);
1980 struct vfsmount *mnt;
1981 if (!type)
1982 return ERR_PTR(-ENODEV);
1983 mnt = vfs_kern_mount(type, flags, name, data);
1984 if (!IS_ERR(mnt) && (type->fs_flags & FS_HAS_SUBTYPE) &&
1985 !mnt->mnt_sb->s_subtype)
1986 mnt = fs_set_subtype(mnt, fstype);
1987 put_filesystem(type);
1988 return mnt;
1989}
1990EXPORT_SYMBOL_GPL(do_kern_mount);
1991
1992/*
1993 * add a mount into a namespace's mount tree
1994 */
1995static int do_add_mount(struct vfsmount *newmnt, struct path *path, int mnt_flags)
1996{
1997 int err;
1998
1999 mnt_flags &= ~(MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL);
2000
2001 err = lock_mount(path);
2002 if (err)
2003 return err;
2004
2005 err = -EINVAL;
2006 if (!(mnt_flags & MNT_SHRINKABLE) && !check_mnt(path->mnt))
2007 goto unlock;
2008
2009 /* Refuse the same filesystem on the same mount point */
2010 err = -EBUSY;
2011 if (path->mnt->mnt_sb == newmnt->mnt_sb &&
2012 path->mnt->mnt_root == path->dentry)
2013 goto unlock;
2014
2015 err = -EINVAL;
2016 if (S_ISLNK(newmnt->mnt_root->d_inode->i_mode))
2017 goto unlock;
2018
2019 newmnt->mnt_flags = mnt_flags;
2020 err = graft_tree(newmnt, path);
2021
2022unlock:
2023 unlock_mount(path);
2024 return err;
2025}
1876 2026
1877/* 2027/*
1878 * create a new mount for userspace and request it to be added into the 2028 * create a new mount for userspace and request it to be added into the
@@ -1932,43 +2082,6 @@ fail:
1932 return err; 2082 return err;
1933} 2083}
1934 2084
1935/*
1936 * add a mount into a namespace's mount tree
1937 */
1938static int do_add_mount(struct vfsmount *newmnt, struct path *path, int mnt_flags)
1939{
1940 int err;
1941
1942 mnt_flags &= ~(MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL);
1943
1944 down_write(&namespace_sem);
1945 /* Something was mounted here while we slept */
1946 err = follow_down(path, true);
1947 if (err < 0)
1948 goto unlock;
1949
1950 err = -EINVAL;
1951 if (!(mnt_flags & MNT_SHRINKABLE) && !check_mnt(path->mnt))
1952 goto unlock;
1953
1954 /* Refuse the same filesystem on the same mount point */
1955 err = -EBUSY;
1956 if (path->mnt->mnt_sb == newmnt->mnt_sb &&
1957 path->mnt->mnt_root == path->dentry)
1958 goto unlock;
1959
1960 err = -EINVAL;
1961 if (S_ISLNK(newmnt->mnt_root->d_inode->i_mode))
1962 goto unlock;
1963
1964 newmnt->mnt_flags = mnt_flags;
1965 err = graft_tree(newmnt, path);
1966
1967unlock:
1968 up_write(&namespace_sem);
1969 return err;
1970}
1971
1972/** 2085/**
1973 * mnt_set_expiry - Put a mount on an expiration list 2086 * mnt_set_expiry - Put a mount on an expiration list
1974 * @mnt: The mount to list. 2087 * @mnt: The mount to list.
@@ -2469,65 +2582,60 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
2469 error = user_path_dir(new_root, &new); 2582 error = user_path_dir(new_root, &new);
2470 if (error) 2583 if (error)
2471 goto out0; 2584 goto out0;
2472 error = -EINVAL;
2473 if (!check_mnt(new.mnt))
2474 goto out1;
2475 2585
2476 error = user_path_dir(put_old, &old); 2586 error = user_path_dir(put_old, &old);
2477 if (error) 2587 if (error)
2478 goto out1; 2588 goto out1;
2479 2589
2480 error = security_sb_pivotroot(&old, &new); 2590 error = security_sb_pivotroot(&old, &new);
2481 if (error) { 2591 if (error)
2482 path_put(&old); 2592 goto out2;
2483 goto out1;
2484 }
2485 2593
2486 get_fs_root(current->fs, &root); 2594 get_fs_root(current->fs, &root);
2487 down_write(&namespace_sem); 2595 error = lock_mount(&old);
2488 mutex_lock(&old.dentry->d_inode->i_mutex); 2596 if (error)
2597 goto out3;
2598
2489 error = -EINVAL; 2599 error = -EINVAL;
2490 if (IS_MNT_SHARED(old.mnt) || 2600 if (IS_MNT_SHARED(old.mnt) ||
2491 IS_MNT_SHARED(new.mnt->mnt_parent) || 2601 IS_MNT_SHARED(new.mnt->mnt_parent) ||
2492 IS_MNT_SHARED(root.mnt->mnt_parent)) 2602 IS_MNT_SHARED(root.mnt->mnt_parent))
2493 goto out2; 2603 goto out4;
2494 if (!check_mnt(root.mnt)) 2604 if (!check_mnt(root.mnt) || !check_mnt(new.mnt))
2495 goto out2; 2605 goto out4;
2496 error = -ENOENT; 2606 error = -ENOENT;
2497 if (cant_mount(old.dentry))
2498 goto out2;
2499 if (d_unlinked(new.dentry)) 2607 if (d_unlinked(new.dentry))
2500 goto out2; 2608 goto out4;
2501 if (d_unlinked(old.dentry)) 2609 if (d_unlinked(old.dentry))
2502 goto out2; 2610 goto out4;
2503 error = -EBUSY; 2611 error = -EBUSY;
2504 if (new.mnt == root.mnt || 2612 if (new.mnt == root.mnt ||
2505 old.mnt == root.mnt) 2613 old.mnt == root.mnt)
2506 goto out2; /* loop, on the same file system */ 2614 goto out4; /* loop, on the same file system */
2507 error = -EINVAL; 2615 error = -EINVAL;
2508 if (root.mnt->mnt_root != root.dentry) 2616 if (root.mnt->mnt_root != root.dentry)
2509 goto out2; /* not a mountpoint */ 2617 goto out4; /* not a mountpoint */
2510 if (root.mnt->mnt_parent == root.mnt) 2618 if (root.mnt->mnt_parent == root.mnt)
2511 goto out2; /* not attached */ 2619 goto out4; /* not attached */
2512 if (new.mnt->mnt_root != new.dentry) 2620 if (new.mnt->mnt_root != new.dentry)
2513 goto out2; /* not a mountpoint */ 2621 goto out4; /* not a mountpoint */
2514 if (new.mnt->mnt_parent == new.mnt) 2622 if (new.mnt->mnt_parent == new.mnt)
2515 goto out2; /* not attached */ 2623 goto out4; /* not attached */
2516 /* make sure we can reach put_old from new_root */ 2624 /* make sure we can reach put_old from new_root */
2517 tmp = old.mnt; 2625 tmp = old.mnt;
2518 br_write_lock(vfsmount_lock);
2519 if (tmp != new.mnt) { 2626 if (tmp != new.mnt) {
2520 for (;;) { 2627 for (;;) {
2521 if (tmp->mnt_parent == tmp) 2628 if (tmp->mnt_parent == tmp)
2522 goto out3; /* already mounted on put_old */ 2629 goto out4; /* already mounted on put_old */
2523 if (tmp->mnt_parent == new.mnt) 2630 if (tmp->mnt_parent == new.mnt)
2524 break; 2631 break;
2525 tmp = tmp->mnt_parent; 2632 tmp = tmp->mnt_parent;
2526 } 2633 }
2527 if (!is_subdir(tmp->mnt_mountpoint, new.dentry)) 2634 if (!is_subdir(tmp->mnt_mountpoint, new.dentry))
2528 goto out3; 2635 goto out4;
2529 } else if (!is_subdir(old.dentry, new.dentry)) 2636 } else if (!is_subdir(old.dentry, new.dentry))
2530 goto out3; 2637 goto out4;
2638 br_write_lock(vfsmount_lock);
2531 detach_mnt(new.mnt, &parent_path); 2639 detach_mnt(new.mnt, &parent_path);
2532 detach_mnt(root.mnt, &root_parent); 2640 detach_mnt(root.mnt, &root_parent);
2533 /* mount old root on put_old */ 2641 /* mount old root on put_old */
@@ -2537,22 +2645,21 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
2537 touch_mnt_namespace(current->nsproxy->mnt_ns); 2645 touch_mnt_namespace(current->nsproxy->mnt_ns);
2538 br_write_unlock(vfsmount_lock); 2646 br_write_unlock(vfsmount_lock);
2539 chroot_fs_refs(&root, &new); 2647 chroot_fs_refs(&root, &new);
2540
2541 error = 0; 2648 error = 0;
2542 path_put(&root_parent); 2649out4:
2543 path_put(&parent_path); 2650 unlock_mount(&old);
2544out2: 2651 if (!error) {
2545 mutex_unlock(&old.dentry->d_inode->i_mutex); 2652 path_put(&root_parent);
2546 up_write(&namespace_sem); 2653 path_put(&parent_path);
2654 }
2655out3:
2547 path_put(&root); 2656 path_put(&root);
2657out2:
2548 path_put(&old); 2658 path_put(&old);
2549out1: 2659out1:
2550 path_put(&new); 2660 path_put(&new);
2551out0: 2661out0:
2552 return error; 2662 return error;
2553out3:
2554 br_write_unlock(vfsmount_lock);
2555 goto out2;
2556} 2663}
2557 2664
2558static void __init init_mount_tree(void) 2665static void __init init_mount_tree(void)
@@ -2594,7 +2701,7 @@ void __init mnt_init(void)
2594 if (!mount_hashtable) 2701 if (!mount_hashtable)
2595 panic("Failed to allocate mount hash table\n"); 2702 panic("Failed to allocate mount hash table\n");
2596 2703
2597 printk("Mount-cache hash table entries: %lu\n", HASH_SIZE); 2704 printk(KERN_INFO "Mount-cache hash table entries: %lu\n", HASH_SIZE);
2598 2705
2599 for (u = 0; u < HASH_SIZE; u++) 2706 for (u = 0; u < HASH_SIZE; u++)
2600 INIT_LIST_HEAD(&mount_hashtable[u]); 2707 INIT_LIST_HEAD(&mount_hashtable[u]);
@@ -2627,3 +2734,9 @@ void put_mnt_ns(struct mnt_namespace *ns)
2627 kfree(ns); 2734 kfree(ns);
2628} 2735}
2629EXPORT_SYMBOL(put_mnt_ns); 2736EXPORT_SYMBOL(put_mnt_ns);
2737
2738struct vfsmount *kern_mount_data(struct file_system_type *type, void *data)
2739{
2740 return vfs_kern_mount(type, MS_KERNMOUNT, type->name, data);
2741}
2742EXPORT_SYMBOL_GPL(kern_mount_data);
diff --git a/fs/ncpfs/Makefile b/fs/ncpfs/Makefile
index 68ea095100a8..c66af563f2ce 100644
--- a/fs/ncpfs/Makefile
+++ b/fs/ncpfs/Makefile
@@ -11,6 +11,6 @@ ncpfs-$(CONFIG_NCPFS_EXTRAS) += symlink.o
11ncpfs-$(CONFIG_NCPFS_NFS_NS) += symlink.o 11ncpfs-$(CONFIG_NCPFS_NFS_NS) += symlink.o
12 12
13# If you want debugging output, please uncomment the following line 13# If you want debugging output, please uncomment the following line
14# EXTRA_CFLAGS += -DDEBUG_NCP=1 14# ccflags-y := -DDEBUG_NCP=1
15 15
16CFLAGS_ncplib_kernel.o := -finline-functions 16CFLAGS_ncplib_kernel.o := -finline-functions
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 199016528fcb..e3d294269058 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -135,33 +135,6 @@ out_err:
135 135
136#if defined(CONFIG_NFS_V4_1) 136#if defined(CONFIG_NFS_V4_1)
137/* 137/*
138 * * CB_SEQUENCE operations will fail until the callback sessionid is set.
139 * */
140int nfs4_set_callback_sessionid(struct nfs_client *clp)
141{
142 struct svc_serv *serv = clp->cl_rpcclient->cl_xprt->bc_serv;
143 struct nfs4_sessionid *bc_sid;
144
145 if (!serv->sv_bc_xprt)
146 return -EINVAL;
147
148 /* on success freed in xprt_free */
149 bc_sid = kmalloc(sizeof(struct nfs4_sessionid), GFP_KERNEL);
150 if (!bc_sid)
151 return -ENOMEM;
152 memcpy(bc_sid->data, &clp->cl_session->sess_id.data,
153 NFS4_MAX_SESSIONID_LEN);
154 spin_lock_bh(&serv->sv_cb_lock);
155 serv->sv_bc_xprt->xpt_bc_sid = bc_sid;
156 spin_unlock_bh(&serv->sv_cb_lock);
157 dprintk("%s set xpt_bc_sid=%u:%u:%u:%u for sv_bc_xprt %p\n", __func__,
158 ((u32 *)bc_sid->data)[0], ((u32 *)bc_sid->data)[1],
159 ((u32 *)bc_sid->data)[2], ((u32 *)bc_sid->data)[3],
160 serv->sv_bc_xprt);
161 return 0;
162}
163
164/*
165 * The callback service for NFSv4.1 callbacks 138 * The callback service for NFSv4.1 callbacks
166 */ 139 */
167static int 140static int
@@ -266,10 +239,6 @@ static inline void nfs_callback_bc_serv(u32 minorversion, struct rpc_xprt *xprt,
266 struct nfs_callback_data *cb_info) 239 struct nfs_callback_data *cb_info)
267{ 240{
268} 241}
269int nfs4_set_callback_sessionid(struct nfs_client *clp)
270{
271 return 0;
272}
273#endif /* CONFIG_NFS_V4_1 */ 242#endif /* CONFIG_NFS_V4_1 */
274 243
275/* 244/*
@@ -359,78 +328,58 @@ void nfs_callback_down(int minorversion)
359 mutex_unlock(&nfs_callback_mutex); 328 mutex_unlock(&nfs_callback_mutex);
360} 329}
361 330
362static int check_gss_callback_principal(struct nfs_client *clp, 331/* Boolean check of RPC_AUTH_GSS principal */
363 struct svc_rqst *rqstp) 332int
333check_gss_callback_principal(struct nfs_client *clp, struct svc_rqst *rqstp)
364{ 334{
365 struct rpc_clnt *r = clp->cl_rpcclient; 335 struct rpc_clnt *r = clp->cl_rpcclient;
366 char *p = svc_gss_principal(rqstp); 336 char *p = svc_gss_principal(rqstp);
367 337
338 if (rqstp->rq_authop->flavour != RPC_AUTH_GSS)
339 return 1;
340
368 /* No RPC_AUTH_GSS on NFSv4.1 back channel yet */ 341 /* No RPC_AUTH_GSS on NFSv4.1 back channel yet */
369 if (clp->cl_minorversion != 0) 342 if (clp->cl_minorversion != 0)
370 return SVC_DROP; 343 return 0;
371 /* 344 /*
372 * It might just be a normal user principal, in which case 345 * It might just be a normal user principal, in which case
373 * userspace won't bother to tell us the name at all. 346 * userspace won't bother to tell us the name at all.
374 */ 347 */
375 if (p == NULL) 348 if (p == NULL)
376 return SVC_DENIED; 349 return 0;
377 350
378 /* Expect a GSS_C_NT_HOSTBASED_NAME like "nfs@serverhostname" */ 351 /* Expect a GSS_C_NT_HOSTBASED_NAME like "nfs@serverhostname" */
379 352
380 if (memcmp(p, "nfs@", 4) != 0) 353 if (memcmp(p, "nfs@", 4) != 0)
381 return SVC_DENIED; 354 return 0;
382 p += 4; 355 p += 4;
383 if (strcmp(p, r->cl_server) != 0) 356 if (strcmp(p, r->cl_server) != 0)
384 return SVC_DENIED; 357 return 0;
385 return SVC_OK; 358 return 1;
386} 359}
387 360
388/* pg_authenticate method helper */ 361/*
389static struct nfs_client *nfs_cb_find_client(struct svc_rqst *rqstp) 362 * pg_authenticate method for nfsv4 callback threads.
390{ 363 *
391 struct nfs4_sessionid *sessionid = bc_xprt_sid(rqstp); 364 * The authflavor has been negotiated, so an incorrect flavor is a server
392 int is_cb_compound = rqstp->rq_proc == CB_COMPOUND ? 1 : 0; 365 * bug. Drop packets with incorrect authflavor.
393 366 *
394 dprintk("--> %s rq_proc %d\n", __func__, rqstp->rq_proc); 367 * All other checking done after NFS decoding where the nfs_client can be
395 if (svc_is_backchannel(rqstp)) 368 * found in nfs4_callback_compound
396 /* Sessionid (usually) set after CB_NULL ping */ 369 */
397 return nfs4_find_client_sessionid(svc_addr(rqstp), sessionid,
398 is_cb_compound);
399 else
400 /* No callback identifier in pg_authenticate */
401 return nfs4_find_client_no_ident(svc_addr(rqstp));
402}
403
404/* pg_authenticate method for nfsv4 callback threads. */
405static int nfs_callback_authenticate(struct svc_rqst *rqstp) 370static int nfs_callback_authenticate(struct svc_rqst *rqstp)
406{ 371{
407 struct nfs_client *clp;
408 RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]);
409 int ret = SVC_OK;
410
411 /* Don't talk to strangers */
412 clp = nfs_cb_find_client(rqstp);
413 if (clp == NULL)
414 return SVC_DROP;
415
416 dprintk("%s: %s NFSv4 callback!\n", __func__,
417 svc_print_addr(rqstp, buf, sizeof(buf)));
418
419 switch (rqstp->rq_authop->flavour) { 372 switch (rqstp->rq_authop->flavour) {
420 case RPC_AUTH_NULL: 373 case RPC_AUTH_NULL:
421 if (rqstp->rq_proc != CB_NULL) 374 if (rqstp->rq_proc != CB_NULL)
422 ret = SVC_DENIED; 375 return SVC_DROP;
423 break; 376 break;
424 case RPC_AUTH_UNIX: 377 case RPC_AUTH_GSS:
425 break; 378 /* No RPC_AUTH_GSS support yet in NFSv4.1 */
426 case RPC_AUTH_GSS: 379 if (svc_is_backchannel(rqstp))
427 ret = check_gss_callback_principal(clp, rqstp); 380 return SVC_DROP;
428 break;
429 default:
430 ret = SVC_DENIED;
431 } 381 }
432 nfs_put_client(clp); 382 return SVC_OK;
433 return ret;
434} 383}
435 384
436/* 385/*
diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h
index d3b44f9bd747..46d93ce7311b 100644
--- a/fs/nfs/callback.h
+++ b/fs/nfs/callback.h
@@ -7,6 +7,7 @@
7 */ 7 */
8#ifndef __LINUX_FS_NFS_CALLBACK_H 8#ifndef __LINUX_FS_NFS_CALLBACK_H
9#define __LINUX_FS_NFS_CALLBACK_H 9#define __LINUX_FS_NFS_CALLBACK_H
10#include <linux/sunrpc/svc.h>
10 11
11#define NFS4_CALLBACK 0x40000000 12#define NFS4_CALLBACK 0x40000000
12#define NFS4_CALLBACK_XDRSIZE 2048 13#define NFS4_CALLBACK_XDRSIZE 2048
@@ -37,7 +38,6 @@ enum nfs4_callback_opnum {
37struct cb_process_state { 38struct cb_process_state {
38 __be32 drc_status; 39 __be32 drc_status;
39 struct nfs_client *clp; 40 struct nfs_client *clp;
40 struct nfs4_sessionid *svc_sid; /* v4.1 callback service sessionid */
41}; 41};
42 42
43struct cb_compound_hdr_arg { 43struct cb_compound_hdr_arg {
@@ -168,7 +168,7 @@ extern unsigned nfs4_callback_layoutrecall(
168extern void nfs4_check_drain_bc_complete(struct nfs4_session *ses); 168extern void nfs4_check_drain_bc_complete(struct nfs4_session *ses);
169extern void nfs4_cb_take_slot(struct nfs_client *clp); 169extern void nfs4_cb_take_slot(struct nfs_client *clp);
170#endif /* CONFIG_NFS_V4_1 */ 170#endif /* CONFIG_NFS_V4_1 */
171 171extern int check_gss_callback_principal(struct nfs_client *, struct svc_rqst *);
172extern __be32 nfs4_callback_getattr(struct cb_getattrargs *args, 172extern __be32 nfs4_callback_getattr(struct cb_getattrargs *args,
173 struct cb_getattrres *res, 173 struct cb_getattrres *res,
174 struct cb_process_state *cps); 174 struct cb_process_state *cps);
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 4bb91cb2620d..2f41dccea18e 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -188,10 +188,10 @@ static u32 initiate_bulk_draining(struct nfs_client *clp,
188 rv = NFS4ERR_DELAY; 188 rv = NFS4ERR_DELAY;
189 list_del_init(&lo->plh_bulk_recall); 189 list_del_init(&lo->plh_bulk_recall);
190 spin_unlock(&ino->i_lock); 190 spin_unlock(&ino->i_lock);
191 pnfs_free_lseg_list(&free_me_list);
191 put_layout_hdr(lo); 192 put_layout_hdr(lo);
192 iput(ino); 193 iput(ino);
193 } 194 }
194 pnfs_free_lseg_list(&free_me_list);
195 return rv; 195 return rv;
196} 196}
197 197
@@ -373,17 +373,11 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args,
373{ 373{
374 struct nfs_client *clp; 374 struct nfs_client *clp;
375 int i; 375 int i;
376 __be32 status; 376 __be32 status = htonl(NFS4ERR_BADSESSION);
377 377
378 cps->clp = NULL; 378 cps->clp = NULL;
379 379
380 status = htonl(NFS4ERR_BADSESSION); 380 clp = nfs4_find_client_sessionid(args->csa_addr, &args->csa_sessionid);
381 /* Incoming session must match the callback session */
382 if (memcmp(&args->csa_sessionid, cps->svc_sid, NFS4_MAX_SESSIONID_LEN))
383 goto out;
384
385 clp = nfs4_find_client_sessionid(args->csa_addr,
386 &args->csa_sessionid, 1);
387 if (clp == NULL) 381 if (clp == NULL)
388 goto out; 382 goto out;
389 383
@@ -414,9 +408,9 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args,
414 res->csr_highestslotid = NFS41_BC_MAX_CALLBACKS - 1; 408 res->csr_highestslotid = NFS41_BC_MAX_CALLBACKS - 1;
415 res->csr_target_highestslotid = NFS41_BC_MAX_CALLBACKS - 1; 409 res->csr_target_highestslotid = NFS41_BC_MAX_CALLBACKS - 1;
416 nfs4_cb_take_slot(clp); 410 nfs4_cb_take_slot(clp);
417 cps->clp = clp; /* put in nfs4_callback_compound */
418 411
419out: 412out:
413 cps->clp = clp; /* put in nfs4_callback_compound */
420 for (i = 0; i < args->csa_nrclists; i++) 414 for (i = 0; i < args->csa_nrclists; i++)
421 kfree(args->csa_rclists[i].rcl_refcalls); 415 kfree(args->csa_rclists[i].rcl_refcalls);
422 kfree(args->csa_rclists); 416 kfree(args->csa_rclists);
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index 23112c263f81..14e0f9371d14 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -794,10 +794,9 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r
794 794
795 if (hdr_arg.minorversion == 0) { 795 if (hdr_arg.minorversion == 0) {
796 cps.clp = nfs4_find_client_ident(hdr_arg.cb_ident); 796 cps.clp = nfs4_find_client_ident(hdr_arg.cb_ident);
797 if (!cps.clp) 797 if (!cps.clp || !check_gss_callback_principal(cps.clp, rqstp))
798 return rpc_drop_reply; 798 return rpc_drop_reply;
799 } else 799 }
800 cps.svc_sid = bc_xprt_sid(rqstp);
801 800
802 hdr_res.taglen = hdr_arg.taglen; 801 hdr_res.taglen = hdr_arg.taglen;
803 hdr_res.tag = hdr_arg.tag; 802 hdr_res.tag = hdr_arg.tag;
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 192f2f860265..139be9647d80 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -82,6 +82,11 @@ retry:
82#endif /* CONFIG_NFS_V4 */ 82#endif /* CONFIG_NFS_V4 */
83 83
84/* 84/*
85 * Turn off NFSv4 uid/gid mapping when using AUTH_SYS
86 */
87static int nfs4_disable_idmapping = 0;
88
89/*
85 * RPC cruft for NFS 90 * RPC cruft for NFS
86 */ 91 */
87static struct rpc_version *nfs_version[5] = { 92static struct rpc_version *nfs_version[5] = {
@@ -481,7 +486,12 @@ static struct nfs_client *nfs_match_client(const struct nfs_client_initdata *dat
481 * Look up a client by IP address and protocol version 486 * Look up a client by IP address and protocol version
482 * - creates a new record if one doesn't yet exist 487 * - creates a new record if one doesn't yet exist
483 */ 488 */
484static struct nfs_client *nfs_get_client(const struct nfs_client_initdata *cl_init) 489static struct nfs_client *
490nfs_get_client(const struct nfs_client_initdata *cl_init,
491 const struct rpc_timeout *timeparms,
492 const char *ip_addr,
493 rpc_authflavor_t authflavour,
494 int noresvport)
485{ 495{
486 struct nfs_client *clp, *new = NULL; 496 struct nfs_client *clp, *new = NULL;
487 int error; 497 int error;
@@ -512,6 +522,13 @@ install_client:
512 clp = new; 522 clp = new;
513 list_add(&clp->cl_share_link, &nfs_client_list); 523 list_add(&clp->cl_share_link, &nfs_client_list);
514 spin_unlock(&nfs_client_lock); 524 spin_unlock(&nfs_client_lock);
525
526 error = cl_init->rpc_ops->init_client(clp, timeparms, ip_addr,
527 authflavour, noresvport);
528 if (error < 0) {
529 nfs_put_client(clp);
530 return ERR_PTR(error);
531 }
515 dprintk("--> nfs_get_client() = %p [new]\n", clp); 532 dprintk("--> nfs_get_client() = %p [new]\n", clp);
516 return clp; 533 return clp;
517 534
@@ -767,9 +784,9 @@ static int nfs_init_server_rpcclient(struct nfs_server *server,
767/* 784/*
768 * Initialise an NFS2 or NFS3 client 785 * Initialise an NFS2 or NFS3 client
769 */ 786 */
770static int nfs_init_client(struct nfs_client *clp, 787int nfs_init_client(struct nfs_client *clp, const struct rpc_timeout *timeparms,
771 const struct rpc_timeout *timeparms, 788 const char *ip_addr, rpc_authflavor_t authflavour,
772 const struct nfs_parsed_mount_data *data) 789 int noresvport)
773{ 790{
774 int error; 791 int error;
775 792
@@ -784,7 +801,7 @@ static int nfs_init_client(struct nfs_client *clp,
784 * - RFC 2623, sec 2.3.2 801 * - RFC 2623, sec 2.3.2
785 */ 802 */
786 error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_UNIX, 803 error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_UNIX,
787 0, data->flags & NFS_MOUNT_NORESVPORT); 804 0, noresvport);
788 if (error < 0) 805 if (error < 0)
789 goto error; 806 goto error;
790 nfs_mark_client_ready(clp, NFS_CS_READY); 807 nfs_mark_client_ready(clp, NFS_CS_READY);
@@ -820,19 +837,17 @@ static int nfs_init_server(struct nfs_server *server,
820 cl_init.rpc_ops = &nfs_v3_clientops; 837 cl_init.rpc_ops = &nfs_v3_clientops;
821#endif 838#endif
822 839
840 nfs_init_timeout_values(&timeparms, data->nfs_server.protocol,
841 data->timeo, data->retrans);
842
823 /* Allocate or find a client reference we can use */ 843 /* Allocate or find a client reference we can use */
824 clp = nfs_get_client(&cl_init); 844 clp = nfs_get_client(&cl_init, &timeparms, NULL, RPC_AUTH_UNIX,
845 data->flags & NFS_MOUNT_NORESVPORT);
825 if (IS_ERR(clp)) { 846 if (IS_ERR(clp)) {
826 dprintk("<-- nfs_init_server() = error %ld\n", PTR_ERR(clp)); 847 dprintk("<-- nfs_init_server() = error %ld\n", PTR_ERR(clp));
827 return PTR_ERR(clp); 848 return PTR_ERR(clp);
828 } 849 }
829 850
830 nfs_init_timeout_values(&timeparms, data->nfs_server.protocol,
831 data->timeo, data->retrans);
832 error = nfs_init_client(clp, &timeparms, data);
833 if (error < 0)
834 goto error;
835
836 server->nfs_client = clp; 851 server->nfs_client = clp;
837 852
838 /* Initialise the client representation from the mount data */ 853 /* Initialise the client representation from the mount data */
@@ -1009,14 +1024,19 @@ static void nfs_server_insert_lists(struct nfs_server *server)
1009 spin_lock(&nfs_client_lock); 1024 spin_lock(&nfs_client_lock);
1010 list_add_tail_rcu(&server->client_link, &clp->cl_superblocks); 1025 list_add_tail_rcu(&server->client_link, &clp->cl_superblocks);
1011 list_add_tail(&server->master_link, &nfs_volume_list); 1026 list_add_tail(&server->master_link, &nfs_volume_list);
1027 clear_bit(NFS_CS_STOP_RENEW, &clp->cl_res_state);
1012 spin_unlock(&nfs_client_lock); 1028 spin_unlock(&nfs_client_lock);
1013 1029
1014} 1030}
1015 1031
1016static void nfs_server_remove_lists(struct nfs_server *server) 1032static void nfs_server_remove_lists(struct nfs_server *server)
1017{ 1033{
1034 struct nfs_client *clp = server->nfs_client;
1035
1018 spin_lock(&nfs_client_lock); 1036 spin_lock(&nfs_client_lock);
1019 list_del_rcu(&server->client_link); 1037 list_del_rcu(&server->client_link);
1038 if (clp && list_empty(&clp->cl_superblocks))
1039 set_bit(NFS_CS_STOP_RENEW, &clp->cl_res_state);
1020 list_del(&server->master_link); 1040 list_del(&server->master_link);
1021 spin_unlock(&nfs_client_lock); 1041 spin_unlock(&nfs_client_lock);
1022 1042
@@ -1206,16 +1226,11 @@ nfs4_find_client_ident(int cb_ident)
1206 * For CB_COMPOUND calls, find a client by IP address, protocol version, 1226 * For CB_COMPOUND calls, find a client by IP address, protocol version,
1207 * minorversion, and sessionID 1227 * minorversion, and sessionID
1208 * 1228 *
1209 * CREATE_SESSION triggers a CB_NULL ping from servers. The callback service
1210 * sessionid can only be set after the CREATE_SESSION return, so a CB_NULL
1211 * can arrive before the callback sessionid is set. For CB_NULL calls,
1212 * find a client by IP address protocol version, and minorversion.
1213 *
1214 * Returns NULL if no such client 1229 * Returns NULL if no such client
1215 */ 1230 */
1216struct nfs_client * 1231struct nfs_client *
1217nfs4_find_client_sessionid(const struct sockaddr *addr, 1232nfs4_find_client_sessionid(const struct sockaddr *addr,
1218 struct nfs4_sessionid *sid, int is_cb_compound) 1233 struct nfs4_sessionid *sid)
1219{ 1234{
1220 struct nfs_client *clp; 1235 struct nfs_client *clp;
1221 1236
@@ -1227,9 +1242,9 @@ nfs4_find_client_sessionid(const struct sockaddr *addr,
1227 if (!nfs4_has_session(clp)) 1242 if (!nfs4_has_session(clp))
1228 continue; 1243 continue;
1229 1244
1230 /* Match sessionid unless cb_null call*/ 1245 /* Match sessionid*/
1231 if (is_cb_compound && (memcmp(clp->cl_session->sess_id.data, 1246 if (memcmp(clp->cl_session->sess_id.data,
1232 sid->data, NFS4_MAX_SESSIONID_LEN) != 0)) 1247 sid->data, NFS4_MAX_SESSIONID_LEN) != 0)
1233 continue; 1248 continue;
1234 1249
1235 atomic_inc(&clp->cl_count); 1250 atomic_inc(&clp->cl_count);
@@ -1244,7 +1259,7 @@ nfs4_find_client_sessionid(const struct sockaddr *addr,
1244 1259
1245struct nfs_client * 1260struct nfs_client *
1246nfs4_find_client_sessionid(const struct sockaddr *addr, 1261nfs4_find_client_sessionid(const struct sockaddr *addr,
1247 struct nfs4_sessionid *sid, int is_cb_compound) 1262 struct nfs4_sessionid *sid)
1248{ 1263{
1249 return NULL; 1264 return NULL;
1250} 1265}
@@ -1312,11 +1327,11 @@ static int nfs4_init_client_minor_version(struct nfs_client *clp)
1312/* 1327/*
1313 * Initialise an NFS4 client record 1328 * Initialise an NFS4 client record
1314 */ 1329 */
1315static int nfs4_init_client(struct nfs_client *clp, 1330int nfs4_init_client(struct nfs_client *clp,
1316 const struct rpc_timeout *timeparms, 1331 const struct rpc_timeout *timeparms,
1317 const char *ip_addr, 1332 const char *ip_addr,
1318 rpc_authflavor_t authflavour, 1333 rpc_authflavor_t authflavour,
1319 int flags) 1334 int noresvport)
1320{ 1335{
1321 int error; 1336 int error;
1322 1337
@@ -1330,7 +1345,7 @@ static int nfs4_init_client(struct nfs_client *clp,
1330 clp->rpc_ops = &nfs_v4_clientops; 1345 clp->rpc_ops = &nfs_v4_clientops;
1331 1346
1332 error = nfs_create_rpc_client(clp, timeparms, authflavour, 1347 error = nfs_create_rpc_client(clp, timeparms, authflavour,
1333 1, flags & NFS_MOUNT_NORESVPORT); 1348 1, noresvport);
1334 if (error < 0) 1349 if (error < 0)
1335 goto error; 1350 goto error;
1336 strlcpy(clp->cl_ipaddr, ip_addr, sizeof(clp->cl_ipaddr)); 1351 strlcpy(clp->cl_ipaddr, ip_addr, sizeof(clp->cl_ipaddr));
@@ -1383,27 +1398,71 @@ static int nfs4_set_client(struct nfs_server *server,
1383 dprintk("--> nfs4_set_client()\n"); 1398 dprintk("--> nfs4_set_client()\n");
1384 1399
1385 /* Allocate or find a client reference we can use */ 1400 /* Allocate or find a client reference we can use */
1386 clp = nfs_get_client(&cl_init); 1401 clp = nfs_get_client(&cl_init, timeparms, ip_addr, authflavour,
1402 server->flags & NFS_MOUNT_NORESVPORT);
1387 if (IS_ERR(clp)) { 1403 if (IS_ERR(clp)) {
1388 error = PTR_ERR(clp); 1404 error = PTR_ERR(clp);
1389 goto error; 1405 goto error;
1390 } 1406 }
1391 error = nfs4_init_client(clp, timeparms, ip_addr, authflavour, 1407
1392 server->flags); 1408 /*
1393 if (error < 0) 1409 * Query for the lease time on clientid setup or renewal
1394 goto error_put; 1410 *
1411 * Note that this will be set on nfs_clients that were created
1412 * only for the DS role and did not set this bit, but now will
1413 * serve a dual role.
1414 */
1415 set_bit(NFS_CS_CHECK_LEASE_TIME, &clp->cl_res_state);
1395 1416
1396 server->nfs_client = clp; 1417 server->nfs_client = clp;
1397 dprintk("<-- nfs4_set_client() = 0 [new %p]\n", clp); 1418 dprintk("<-- nfs4_set_client() = 0 [new %p]\n", clp);
1398 return 0; 1419 return 0;
1399
1400error_put:
1401 nfs_put_client(clp);
1402error: 1420error:
1403 dprintk("<-- nfs4_set_client() = xerror %d\n", error); 1421 dprintk("<-- nfs4_set_client() = xerror %d\n", error);
1404 return error; 1422 return error;
1405} 1423}
1406 1424
1425/*
1426 * Set up a pNFS Data Server client.
1427 *
1428 * Return any existing nfs_client that matches server address,port,version
1429 * and minorversion.
1430 *
1431 * For a new nfs_client, use a soft mount (default), a low retrans and a
1432 * low timeout interval so that if a connection is lost, we retry through
1433 * the MDS.
1434 */
1435struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp,
1436 const struct sockaddr *ds_addr,
1437 int ds_addrlen, int ds_proto)
1438{
1439 struct nfs_client_initdata cl_init = {
1440 .addr = ds_addr,
1441 .addrlen = ds_addrlen,
1442 .rpc_ops = &nfs_v4_clientops,
1443 .proto = ds_proto,
1444 .minorversion = mds_clp->cl_minorversion,
1445 };
1446 struct rpc_timeout ds_timeout = {
1447 .to_initval = 15 * HZ,
1448 .to_maxval = 15 * HZ,
1449 .to_retries = 1,
1450 .to_exponential = 1,
1451 };
1452 struct nfs_client *clp;
1453
1454 /*
1455 * Set an authflavor equual to the MDS value. Use the MDS nfs_client
1456 * cl_ipaddr so as to use the same EXCHANGE_ID co_ownerid as the MDS
1457 * (section 13.1 RFC 5661).
1458 */
1459 clp = nfs_get_client(&cl_init, &ds_timeout, mds_clp->cl_ipaddr,
1460 mds_clp->cl_rpcclient->cl_auth->au_flavor, 0);
1461
1462 dprintk("<-- %s %p\n", __func__, clp);
1463 return clp;
1464}
1465EXPORT_SYMBOL(nfs4_set_ds_client);
1407 1466
1408/* 1467/*
1409 * Session has been established, and the client marked ready. 1468 * Session has been established, and the client marked ready.
@@ -1440,6 +1499,10 @@ static int nfs4_server_common_setup(struct nfs_server *server,
1440 BUG_ON(!server->nfs_client->rpc_ops); 1499 BUG_ON(!server->nfs_client->rpc_ops);
1441 BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops); 1500 BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops);
1442 1501
1502 /* data servers support only a subset of NFSv4.1 */
1503 if (is_ds_only_client(server->nfs_client))
1504 return -EPROTONOSUPPORT;
1505
1443 fattr = nfs_alloc_fattr(); 1506 fattr = nfs_alloc_fattr();
1444 if (fattr == NULL) 1507 if (fattr == NULL)
1445 return -ENOMEM; 1508 return -ENOMEM;
@@ -1509,6 +1572,13 @@ static int nfs4_init_server(struct nfs_server *server,
1509 if (error < 0) 1572 if (error < 0)
1510 goto error; 1573 goto error;
1511 1574
1575 /*
1576 * Don't use NFS uid/gid mapping if we're using AUTH_SYS or lower
1577 * authentication.
1578 */
1579 if (nfs4_disable_idmapping && data->auth_flavors[0] == RPC_AUTH_UNIX)
1580 server->caps |= NFS_CAP_UIDGID_NOMAP;
1581
1512 if (data->rsize) 1582 if (data->rsize)
1513 server->rsize = nfs_block_size(data->rsize, NULL); 1583 server->rsize = nfs_block_size(data->rsize, NULL);
1514 if (data->wsize) 1584 if (data->wsize)
@@ -1926,3 +1996,7 @@ void nfs_fs_proc_exit(void)
1926} 1996}
1927 1997
1928#endif /* CONFIG_PROC_FS */ 1998#endif /* CONFIG_PROC_FS */
1999
2000module_param(nfs4_disable_idmapping, bool, 0644);
2001MODULE_PARM_DESC(nfs4_disable_idmapping,
2002 "Turn off NFSv4 idmapping when using 'sec=sys'");
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 364e4328f392..bbbc6bf5cb2e 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -23,8 +23,6 @@
23 23
24static void nfs_do_free_delegation(struct nfs_delegation *delegation) 24static void nfs_do_free_delegation(struct nfs_delegation *delegation)
25{ 25{
26 if (delegation->cred)
27 put_rpccred(delegation->cred);
28 kfree(delegation); 26 kfree(delegation);
29} 27}
30 28
@@ -37,6 +35,10 @@ static void nfs_free_delegation_callback(struct rcu_head *head)
37 35
38static void nfs_free_delegation(struct nfs_delegation *delegation) 36static void nfs_free_delegation(struct nfs_delegation *delegation)
39{ 37{
38 if (delegation->cred) {
39 put_rpccred(delegation->cred);
40 delegation->cred = NULL;
41 }
40 call_rcu(&delegation->rcu, nfs_free_delegation_callback); 42 call_rcu(&delegation->rcu, nfs_free_delegation_callback);
41} 43}
42 44
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 2c3eb33b904d..abdf38d5971d 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1169,11 +1169,23 @@ static void nfs_dentry_iput(struct dentry *dentry, struct inode *inode)
1169 iput(inode); 1169 iput(inode);
1170} 1170}
1171 1171
1172static void nfs_d_release(struct dentry *dentry)
1173{
1174 /* free cached devname value, if it survived that far */
1175 if (unlikely(dentry->d_fsdata)) {
1176 if (dentry->d_flags & DCACHE_NFSFS_RENAMED)
1177 WARN_ON(1);
1178 else
1179 kfree(dentry->d_fsdata);
1180 }
1181}
1182
1172const struct dentry_operations nfs_dentry_operations = { 1183const struct dentry_operations nfs_dentry_operations = {
1173 .d_revalidate = nfs_lookup_revalidate, 1184 .d_revalidate = nfs_lookup_revalidate,
1174 .d_delete = nfs_dentry_delete, 1185 .d_delete = nfs_dentry_delete,
1175 .d_iput = nfs_dentry_iput, 1186 .d_iput = nfs_dentry_iput,
1176 .d_automount = nfs_d_automount, 1187 .d_automount = nfs_d_automount,
1188 .d_release = nfs_d_release,
1177}; 1189};
1178 1190
1179static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd) 1191static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd)
@@ -1248,6 +1260,7 @@ const struct dentry_operations nfs4_dentry_operations = {
1248 .d_delete = nfs_dentry_delete, 1260 .d_delete = nfs_dentry_delete,
1249 .d_iput = nfs_dentry_iput, 1261 .d_iput = nfs_dentry_iput,
1250 .d_automount = nfs_d_automount, 1262 .d_automount = nfs_d_automount,
1263 .d_release = nfs_d_release,
1251}; 1264};
1252 1265
1253/* 1266/*
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index e6ace0d93c71..8eea25366717 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -45,6 +45,7 @@
45#include <linux/pagemap.h> 45#include <linux/pagemap.h>
46#include <linux/kref.h> 46#include <linux/kref.h>
47#include <linux/slab.h> 47#include <linux/slab.h>
48#include <linux/task_io_accounting_ops.h>
48 49
49#include <linux/nfs_fs.h> 50#include <linux/nfs_fs.h>
50#include <linux/nfs_page.h> 51#include <linux/nfs_page.h>
@@ -407,15 +408,18 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
407 pos += vec->iov_len; 408 pos += vec->iov_len;
408 } 409 }
409 410
411 /*
412 * If no bytes were started, return the error, and let the
413 * generic layer handle the completion.
414 */
415 if (requested_bytes == 0) {
416 nfs_direct_req_release(dreq);
417 return result < 0 ? result : -EIO;
418 }
419
410 if (put_dreq(dreq)) 420 if (put_dreq(dreq))
411 nfs_direct_complete(dreq); 421 nfs_direct_complete(dreq);
412 422 return 0;
413 if (requested_bytes != 0)
414 return 0;
415
416 if (result < 0)
417 return result;
418 return -EIO;
419} 423}
420 424
421static ssize_t nfs_direct_read(struct kiocb *iocb, const struct iovec *iov, 425static ssize_t nfs_direct_read(struct kiocb *iocb, const struct iovec *iov,
@@ -646,8 +650,7 @@ static void nfs_direct_write_result(struct rpc_task *task, void *calldata)
646{ 650{
647 struct nfs_write_data *data = calldata; 651 struct nfs_write_data *data = calldata;
648 652
649 if (nfs_writeback_done(task, data) != 0) 653 nfs_writeback_done(task, data);
650 return;
651} 654}
652 655
653/* 656/*
@@ -841,15 +844,18 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
841 pos += vec->iov_len; 844 pos += vec->iov_len;
842 } 845 }
843 846
847 /*
848 * If no bytes were started, return the error, and let the
849 * generic layer handle the completion.
850 */
851 if (requested_bytes == 0) {
852 nfs_direct_req_release(dreq);
853 return result < 0 ? result : -EIO;
854 }
855
844 if (put_dreq(dreq)) 856 if (put_dreq(dreq))
845 nfs_direct_write_complete(dreq, dreq->inode); 857 nfs_direct_write_complete(dreq, dreq->inode);
846 858 return 0;
847 if (requested_bytes != 0)
848 return 0;
849
850 if (result < 0)
851 return result;
852 return -EIO;
853} 859}
854 860
855static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov, 861static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov,
@@ -932,6 +938,8 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, const struct iovec *iov,
932 if (retval) 938 if (retval)
933 goto out; 939 goto out;
934 940
941 task_io_account_read(count);
942
935 retval = nfs_direct_read(iocb, iov, nr_segs, pos); 943 retval = nfs_direct_read(iocb, iov, nr_segs, pos);
936 if (retval > 0) 944 if (retval > 0)
937 iocb->ki_pos = pos + retval; 945 iocb->ki_pos = pos + retval;
@@ -993,6 +1001,8 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
993 if (retval) 1001 if (retval)
994 goto out; 1002 goto out;
995 1003
1004 task_io_account_write(count);
1005
996 retval = nfs_direct_write(iocb, iov, nr_segs, pos, count); 1006 retval = nfs_direct_write(iocb, iov, nr_segs, pos, count);
997 1007
998 if (retval > 0) 1008 if (retval > 0)
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 7bf029ef4084..d85a534b15cd 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -387,10 +387,6 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping,
387 file->f_path.dentry->d_name.name, 387 file->f_path.dentry->d_name.name,
388 mapping->host->i_ino, len, (long long) pos); 388 mapping->host->i_ino, len, (long long) pos);
389 389
390 pnfs_update_layout(mapping->host,
391 nfs_file_open_context(file),
392 IOMODE_RW);
393
394start: 390start:
395 /* 391 /*
396 * Prevent starvation issues if someone is doing a consistency 392 * Prevent starvation issues if someone is doing a consistency
diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c
index b5ffe8fa291f..1084792bc0fe 100644
--- a/fs/nfs/getroot.c
+++ b/fs/nfs/getroot.c
@@ -75,18 +75,25 @@ static int nfs_superblock_set_dummy_root(struct super_block *sb, struct inode *i
75/* 75/*
76 * get an NFS2/NFS3 root dentry from the root filehandle 76 * get an NFS2/NFS3 root dentry from the root filehandle
77 */ 77 */
78struct dentry *nfs_get_root(struct super_block *sb, struct nfs_fh *mntfh) 78struct dentry *nfs_get_root(struct super_block *sb, struct nfs_fh *mntfh,
79 const char *devname)
79{ 80{
80 struct nfs_server *server = NFS_SB(sb); 81 struct nfs_server *server = NFS_SB(sb);
81 struct nfs_fsinfo fsinfo; 82 struct nfs_fsinfo fsinfo;
82 struct dentry *ret; 83 struct dentry *ret;
83 struct inode *inode; 84 struct inode *inode;
85 void *name = kstrdup(devname, GFP_KERNEL);
84 int error; 86 int error;
85 87
88 if (!name)
89 return ERR_PTR(-ENOMEM);
90
86 /* get the actual root for this mount */ 91 /* get the actual root for this mount */
87 fsinfo.fattr = nfs_alloc_fattr(); 92 fsinfo.fattr = nfs_alloc_fattr();
88 if (fsinfo.fattr == NULL) 93 if (fsinfo.fattr == NULL) {
94 kfree(name);
89 return ERR_PTR(-ENOMEM); 95 return ERR_PTR(-ENOMEM);
96 }
90 97
91 error = server->nfs_client->rpc_ops->getroot(server, mntfh, &fsinfo); 98 error = server->nfs_client->rpc_ops->getroot(server, mntfh, &fsinfo);
92 if (error < 0) { 99 if (error < 0) {
@@ -119,7 +126,15 @@ struct dentry *nfs_get_root(struct super_block *sb, struct nfs_fh *mntfh)
119 } 126 }
120 127
121 security_d_instantiate(ret, inode); 128 security_d_instantiate(ret, inode);
129 spin_lock(&ret->d_lock);
130 if (IS_ROOT(ret) && !(ret->d_flags & DCACHE_NFSFS_RENAMED)) {
131 ret->d_fsdata = name;
132 name = NULL;
133 }
134 spin_unlock(&ret->d_lock);
122out: 135out:
136 if (name)
137 kfree(name);
123 nfs_free_fattr(fsinfo.fattr); 138 nfs_free_fattr(fsinfo.fattr);
124 return ret; 139 return ret;
125} 140}
@@ -169,27 +184,35 @@ out:
169/* 184/*
170 * get an NFS4 root dentry from the root filehandle 185 * get an NFS4 root dentry from the root filehandle
171 */ 186 */
172struct dentry *nfs4_get_root(struct super_block *sb, struct nfs_fh *mntfh) 187struct dentry *nfs4_get_root(struct super_block *sb, struct nfs_fh *mntfh,
188 const char *devname)
173{ 189{
174 struct nfs_server *server = NFS_SB(sb); 190 struct nfs_server *server = NFS_SB(sb);
175 struct nfs_fattr *fattr = NULL; 191 struct nfs_fattr *fattr = NULL;
176 struct dentry *ret; 192 struct dentry *ret;
177 struct inode *inode; 193 struct inode *inode;
194 void *name = kstrdup(devname, GFP_KERNEL);
178 int error; 195 int error;
179 196
180 dprintk("--> nfs4_get_root()\n"); 197 dprintk("--> nfs4_get_root()\n");
181 198
199 if (!name)
200 return ERR_PTR(-ENOMEM);
201
182 /* get the info about the server and filesystem */ 202 /* get the info about the server and filesystem */
183 error = nfs4_server_capabilities(server, mntfh); 203 error = nfs4_server_capabilities(server, mntfh);
184 if (error < 0) { 204 if (error < 0) {
185 dprintk("nfs_get_root: getcaps error = %d\n", 205 dprintk("nfs_get_root: getcaps error = %d\n",
186 -error); 206 -error);
207 kfree(name);
187 return ERR_PTR(error); 208 return ERR_PTR(error);
188 } 209 }
189 210
190 fattr = nfs_alloc_fattr(); 211 fattr = nfs_alloc_fattr();
191 if (fattr == NULL) 212 if (fattr == NULL) {
192 return ERR_PTR(-ENOMEM);; 213 kfree(name);
214 return ERR_PTR(-ENOMEM);
215 }
193 216
194 /* get the actual root for this mount */ 217 /* get the actual root for this mount */
195 error = server->nfs_client->rpc_ops->getattr(server, mntfh, fattr); 218 error = server->nfs_client->rpc_ops->getattr(server, mntfh, fattr);
@@ -223,8 +246,15 @@ struct dentry *nfs4_get_root(struct super_block *sb, struct nfs_fh *mntfh)
223 } 246 }
224 247
225 security_d_instantiate(ret, inode); 248 security_d_instantiate(ret, inode);
226 249 spin_lock(&ret->d_lock);
250 if (IS_ROOT(ret) && !(ret->d_flags & DCACHE_NFSFS_RENAMED)) {
251 ret->d_fsdata = name;
252 name = NULL;
253 }
254 spin_unlock(&ret->d_lock);
227out: 255out:
256 if (name)
257 kfree(name);
228 nfs_free_fattr(fattr); 258 nfs_free_fattr(fattr);
229 dprintk("<-- nfs4_get_root()\n"); 259 dprintk("<-- nfs4_get_root()\n");
230 return ret; 260 return ret;
diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c
index 18696882f1c6..79664a1025af 100644
--- a/fs/nfs/idmap.c
+++ b/fs/nfs/idmap.c
@@ -33,16 +33,41 @@
33 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 33 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
34 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 34 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
35 */ 35 */
36#include <linux/types.h>
37#include <linux/string.h>
38#include <linux/kernel.h>
39
40static int nfs_map_string_to_numeric(const char *name, size_t namelen, __u32 *res)
41{
42 unsigned long val;
43 char buf[16];
44
45 if (memchr(name, '@', namelen) != NULL || namelen >= sizeof(buf))
46 return 0;
47 memcpy(buf, name, namelen);
48 buf[namelen] = '\0';
49 if (strict_strtoul(buf, 0, &val) != 0)
50 return 0;
51 *res = val;
52 return 1;
53}
54
55static int nfs_map_numeric_to_string(__u32 id, char *buf, size_t buflen)
56{
57 return snprintf(buf, buflen, "%u", id);
58}
36 59
37#ifdef CONFIG_NFS_USE_NEW_IDMAPPER 60#ifdef CONFIG_NFS_USE_NEW_IDMAPPER
38 61
39#include <linux/slab.h> 62#include <linux/slab.h>
40#include <linux/cred.h> 63#include <linux/cred.h>
64#include <linux/sunrpc/sched.h>
65#include <linux/nfs4.h>
66#include <linux/nfs_fs_sb.h>
41#include <linux/nfs_idmap.h> 67#include <linux/nfs_idmap.h>
42#include <linux/keyctl.h> 68#include <linux/keyctl.h>
43#include <linux/key-type.h> 69#include <linux/key-type.h>
44#include <linux/rcupdate.h> 70#include <linux/rcupdate.h>
45#include <linux/kernel.h>
46#include <linux/err.h> 71#include <linux/err.h>
47 72
48#include <keys/user-type.h> 73#include <keys/user-type.h>
@@ -219,23 +244,39 @@ static int nfs_idmap_lookup_id(const char *name, size_t namelen,
219 return ret; 244 return ret;
220} 245}
221 246
222int nfs_map_name_to_uid(struct nfs_client *clp, const char *name, size_t namelen, __u32 *uid) 247int nfs_map_name_to_uid(const struct nfs_server *server, const char *name, size_t namelen, __u32 *uid)
223{ 248{
249 if (nfs_map_string_to_numeric(name, namelen, uid))
250 return 0;
224 return nfs_idmap_lookup_id(name, namelen, "uid", uid); 251 return nfs_idmap_lookup_id(name, namelen, "uid", uid);
225} 252}
226 253
227int nfs_map_group_to_gid(struct nfs_client *clp, const char *name, size_t namelen, __u32 *gid) 254int nfs_map_group_to_gid(const struct nfs_server *server, const char *name, size_t namelen, __u32 *gid)
228{ 255{
256 if (nfs_map_string_to_numeric(name, namelen, gid))
257 return 0;
229 return nfs_idmap_lookup_id(name, namelen, "gid", gid); 258 return nfs_idmap_lookup_id(name, namelen, "gid", gid);
230} 259}
231 260
232int nfs_map_uid_to_name(struct nfs_client *clp, __u32 uid, char *buf, size_t buflen) 261int nfs_map_uid_to_name(const struct nfs_server *server, __u32 uid, char *buf, size_t buflen)
233{ 262{
234 return nfs_idmap_lookup_name(uid, "user", buf, buflen); 263 int ret = -EINVAL;
264
265 if (!(server->caps & NFS_CAP_UIDGID_NOMAP))
266 ret = nfs_idmap_lookup_name(uid, "user", buf, buflen);
267 if (ret < 0)
268 ret = nfs_map_numeric_to_string(uid, buf, buflen);
269 return ret;
235} 270}
236int nfs_map_gid_to_group(struct nfs_client *clp, __u32 gid, char *buf, size_t buflen) 271int nfs_map_gid_to_group(const struct nfs_server *server, __u32 gid, char *buf, size_t buflen)
237{ 272{
238 return nfs_idmap_lookup_name(gid, "group", buf, buflen); 273 int ret = -EINVAL;
274
275 if (!(server->caps & NFS_CAP_UIDGID_NOMAP))
276 ret = nfs_idmap_lookup_name(gid, "group", buf, buflen);
277 if (ret < 0)
278 ret = nfs_map_numeric_to_string(gid, buf, buflen);
279 return ret;
239} 280}
240 281
241#else /* CONFIG_NFS_USE_NEW_IDMAPPER not defined */ 282#else /* CONFIG_NFS_USE_NEW_IDMAPPER not defined */
@@ -243,7 +284,6 @@ int nfs_map_gid_to_group(struct nfs_client *clp, __u32 gid, char *buf, size_t bu
243#include <linux/module.h> 284#include <linux/module.h>
244#include <linux/mutex.h> 285#include <linux/mutex.h>
245#include <linux/init.h> 286#include <linux/init.h>
246#include <linux/types.h>
247#include <linux/slab.h> 287#include <linux/slab.h>
248#include <linux/socket.h> 288#include <linux/socket.h>
249#include <linux/in.h> 289#include <linux/in.h>
@@ -695,31 +735,45 @@ static unsigned int fnvhash32(const void *buf, size_t buflen)
695 return hash; 735 return hash;
696} 736}
697 737
698int nfs_map_name_to_uid(struct nfs_client *clp, const char *name, size_t namelen, __u32 *uid) 738int nfs_map_name_to_uid(const struct nfs_server *server, const char *name, size_t namelen, __u32 *uid)
699{ 739{
700 struct idmap *idmap = clp->cl_idmap; 740 struct idmap *idmap = server->nfs_client->cl_idmap;
701 741
742 if (nfs_map_string_to_numeric(name, namelen, uid))
743 return 0;
702 return nfs_idmap_id(idmap, &idmap->idmap_user_hash, name, namelen, uid); 744 return nfs_idmap_id(idmap, &idmap->idmap_user_hash, name, namelen, uid);
703} 745}
704 746
705int nfs_map_group_to_gid(struct nfs_client *clp, const char *name, size_t namelen, __u32 *uid) 747int nfs_map_group_to_gid(const struct nfs_server *server, const char *name, size_t namelen, __u32 *uid)
706{ 748{
707 struct idmap *idmap = clp->cl_idmap; 749 struct idmap *idmap = server->nfs_client->cl_idmap;
708 750
751 if (nfs_map_string_to_numeric(name, namelen, uid))
752 return 0;
709 return nfs_idmap_id(idmap, &idmap->idmap_group_hash, name, namelen, uid); 753 return nfs_idmap_id(idmap, &idmap->idmap_group_hash, name, namelen, uid);
710} 754}
711 755
712int nfs_map_uid_to_name(struct nfs_client *clp, __u32 uid, char *buf, size_t buflen) 756int nfs_map_uid_to_name(const struct nfs_server *server, __u32 uid, char *buf, size_t buflen)
713{ 757{
714 struct idmap *idmap = clp->cl_idmap; 758 struct idmap *idmap = server->nfs_client->cl_idmap;
759 int ret = -EINVAL;
715 760
716 return nfs_idmap_name(idmap, &idmap->idmap_user_hash, uid, buf); 761 if (!(server->caps & NFS_CAP_UIDGID_NOMAP))
762 ret = nfs_idmap_name(idmap, &idmap->idmap_user_hash, uid, buf);
763 if (ret < 0)
764 ret = nfs_map_numeric_to_string(uid, buf, buflen);
765 return ret;
717} 766}
718int nfs_map_gid_to_group(struct nfs_client *clp, __u32 uid, char *buf, size_t buflen) 767int nfs_map_gid_to_group(const struct nfs_server *server, __u32 uid, char *buf, size_t buflen)
719{ 768{
720 struct idmap *idmap = clp->cl_idmap; 769 struct idmap *idmap = server->nfs_client->cl_idmap;
770 int ret = -EINVAL;
721 771
722 return nfs_idmap_name(idmap, &idmap->idmap_group_hash, uid, buf); 772 if (!(server->caps & NFS_CAP_UIDGID_NOMAP))
773 ret = nfs_idmap_name(idmap, &idmap->idmap_group_hash, uid, buf);
774 if (ret < 0)
775 ret = nfs_map_numeric_to_string(uid, buf, buflen);
776 return ret;
723} 777}
724 778
725#endif /* CONFIG_NFS_USE_NEW_IDMAPPER */ 779#endif /* CONFIG_NFS_USE_NEW_IDMAPPER */
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index d8512423ba72..01768e5e2c9b 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -37,6 +37,7 @@
37#include <linux/inet.h> 37#include <linux/inet.h>
38#include <linux/nfs_xdr.h> 38#include <linux/nfs_xdr.h>
39#include <linux/slab.h> 39#include <linux/slab.h>
40#include <linux/compat.h>
40 41
41#include <asm/system.h> 42#include <asm/system.h>
42#include <asm/uaccess.h> 43#include <asm/uaccess.h>
@@ -89,7 +90,11 @@ int nfs_wait_bit_killable(void *word)
89 */ 90 */
90u64 nfs_compat_user_ino64(u64 fileid) 91u64 nfs_compat_user_ino64(u64 fileid)
91{ 92{
92 int ino; 93#ifdef CONFIG_COMPAT
94 compat_ulong_t ino;
95#else
96 unsigned long ino;
97#endif
93 98
94 if (enable_ino64) 99 if (enable_ino64)
95 return fileid; 100 return fileid;
@@ -881,9 +886,10 @@ out:
881 return ret; 886 return ret;
882} 887}
883 888
884static void nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr) 889static unsigned long nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr)
885{ 890{
886 struct nfs_inode *nfsi = NFS_I(inode); 891 struct nfs_inode *nfsi = NFS_I(inode);
892 unsigned long ret = 0;
887 893
888 if ((fattr->valid & NFS_ATTR_FATTR_PRECHANGE) 894 if ((fattr->valid & NFS_ATTR_FATTR_PRECHANGE)
889 && (fattr->valid & NFS_ATTR_FATTR_CHANGE) 895 && (fattr->valid & NFS_ATTR_FATTR_CHANGE)
@@ -891,25 +897,32 @@ static void nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr)
891 nfsi->change_attr = fattr->change_attr; 897 nfsi->change_attr = fattr->change_attr;
892 if (S_ISDIR(inode->i_mode)) 898 if (S_ISDIR(inode->i_mode))
893 nfsi->cache_validity |= NFS_INO_INVALID_DATA; 899 nfsi->cache_validity |= NFS_INO_INVALID_DATA;
900 ret |= NFS_INO_INVALID_ATTR;
894 } 901 }
895 /* If we have atomic WCC data, we may update some attributes */ 902 /* If we have atomic WCC data, we may update some attributes */
896 if ((fattr->valid & NFS_ATTR_FATTR_PRECTIME) 903 if ((fattr->valid & NFS_ATTR_FATTR_PRECTIME)
897 && (fattr->valid & NFS_ATTR_FATTR_CTIME) 904 && (fattr->valid & NFS_ATTR_FATTR_CTIME)
898 && timespec_equal(&inode->i_ctime, &fattr->pre_ctime)) 905 && timespec_equal(&inode->i_ctime, &fattr->pre_ctime)) {
899 memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime)); 906 memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime));
907 ret |= NFS_INO_INVALID_ATTR;
908 }
900 909
901 if ((fattr->valid & NFS_ATTR_FATTR_PREMTIME) 910 if ((fattr->valid & NFS_ATTR_FATTR_PREMTIME)
902 && (fattr->valid & NFS_ATTR_FATTR_MTIME) 911 && (fattr->valid & NFS_ATTR_FATTR_MTIME)
903 && timespec_equal(&inode->i_mtime, &fattr->pre_mtime)) { 912 && timespec_equal(&inode->i_mtime, &fattr->pre_mtime)) {
904 memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime)); 913 memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime));
905 if (S_ISDIR(inode->i_mode)) 914 if (S_ISDIR(inode->i_mode))
906 nfsi->cache_validity |= NFS_INO_INVALID_DATA; 915 nfsi->cache_validity |= NFS_INO_INVALID_DATA;
916 ret |= NFS_INO_INVALID_ATTR;
907 } 917 }
908 if ((fattr->valid & NFS_ATTR_FATTR_PRESIZE) 918 if ((fattr->valid & NFS_ATTR_FATTR_PRESIZE)
909 && (fattr->valid & NFS_ATTR_FATTR_SIZE) 919 && (fattr->valid & NFS_ATTR_FATTR_SIZE)
910 && i_size_read(inode) == nfs_size_to_loff_t(fattr->pre_size) 920 && i_size_read(inode) == nfs_size_to_loff_t(fattr->pre_size)
911 && nfsi->npages == 0) 921 && nfsi->npages == 0) {
912 i_size_write(inode, nfs_size_to_loff_t(fattr->size)); 922 i_size_write(inode, nfs_size_to_loff_t(fattr->size));
923 ret |= NFS_INO_INVALID_ATTR;
924 }
925 return ret;
913} 926}
914 927
915/** 928/**
@@ -1223,7 +1236,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
1223 | NFS_INO_REVAL_PAGECACHE); 1236 | NFS_INO_REVAL_PAGECACHE);
1224 1237
1225 /* Do atomic weak cache consistency updates */ 1238 /* Do atomic weak cache consistency updates */
1226 nfs_wcc_update_inode(inode, fattr); 1239 invalid |= nfs_wcc_update_inode(inode, fattr);
1227 1240
1228 /* More cache consistency checks */ 1241 /* More cache consistency checks */
1229 if (fattr->valid & NFS_ATTR_FATTR_CHANGE) { 1242 if (fattr->valid & NFS_ATTR_FATTR_CHANGE) {
@@ -1505,7 +1518,7 @@ static int nfsiod_start(void)
1505{ 1518{
1506 struct workqueue_struct *wq; 1519 struct workqueue_struct *wq;
1507 dprintk("RPC: creating workqueue nfsiod\n"); 1520 dprintk("RPC: creating workqueue nfsiod\n");
1508 wq = alloc_workqueue("nfsiod", WQ_RESCUER, 0); 1521 wq = alloc_workqueue("nfsiod", WQ_MEM_RECLAIM, 0);
1509 if (wq == NULL) 1522 if (wq == NULL)
1510 return -ENOMEM; 1523 return -ENOMEM;
1511 nfsiod_workqueue = wq; 1524 nfsiod_workqueue = wq;
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 4644f04b4b46..72e0bddf7a2f 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -133,8 +133,7 @@ extern void nfs_put_client(struct nfs_client *);
133extern struct nfs_client *nfs4_find_client_no_ident(const struct sockaddr *); 133extern struct nfs_client *nfs4_find_client_no_ident(const struct sockaddr *);
134extern struct nfs_client *nfs4_find_client_ident(int); 134extern struct nfs_client *nfs4_find_client_ident(int);
135extern struct nfs_client * 135extern struct nfs_client *
136nfs4_find_client_sessionid(const struct sockaddr *, struct nfs4_sessionid *, 136nfs4_find_client_sessionid(const struct sockaddr *, struct nfs4_sessionid *);
137 int);
138extern struct nfs_server *nfs_create_server( 137extern struct nfs_server *nfs_create_server(
139 const struct nfs_parsed_mount_data *, 138 const struct nfs_parsed_mount_data *,
140 struct nfs_fh *); 139 struct nfs_fh *);
@@ -149,6 +148,9 @@ extern struct nfs_server *nfs_clone_server(struct nfs_server *,
149 struct nfs_fattr *); 148 struct nfs_fattr *);
150extern void nfs_mark_client_ready(struct nfs_client *clp, int state); 149extern void nfs_mark_client_ready(struct nfs_client *clp, int state);
151extern int nfs4_check_client_ready(struct nfs_client *clp); 150extern int nfs4_check_client_ready(struct nfs_client *clp);
151extern struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp,
152 const struct sockaddr *ds_addr,
153 int ds_addrlen, int ds_proto);
152#ifdef CONFIG_PROC_FS 154#ifdef CONFIG_PROC_FS
153extern int __init nfs_fs_proc_init(void); 155extern int __init nfs_fs_proc_init(void);
154extern void nfs_fs_proc_exit(void); 156extern void nfs_fs_proc_exit(void);
@@ -164,10 +166,10 @@ static inline void nfs_fs_proc_exit(void)
164 166
165/* nfs4namespace.c */ 167/* nfs4namespace.c */
166#ifdef CONFIG_NFS_V4 168#ifdef CONFIG_NFS_V4
167extern struct vfsmount *nfs_do_refmount(const struct vfsmount *mnt_parent, struct dentry *dentry); 169extern struct vfsmount *nfs_do_refmount(struct dentry *dentry);
168#else 170#else
169static inline 171static inline
170struct vfsmount *nfs_do_refmount(const struct vfsmount *mnt_parent, struct dentry *dentry) 172struct vfsmount *nfs_do_refmount(struct dentry *dentry)
171{ 173{
172 return ERR_PTR(-ENOENT); 174 return ERR_PTR(-ENOENT);
173} 175}
@@ -214,8 +216,14 @@ extern const u32 nfs41_maxwrite_overhead;
214extern struct rpc_procinfo nfs4_procedures[]; 216extern struct rpc_procinfo nfs4_procedures[];
215#endif 217#endif
216 218
219extern int nfs4_init_ds_session(struct nfs_client *clp);
220
217/* proc.c */ 221/* proc.c */
218void nfs_close_context(struct nfs_open_context *ctx, int is_sync); 222void nfs_close_context(struct nfs_open_context *ctx, int is_sync);
223extern int nfs_init_client(struct nfs_client *clp,
224 const struct rpc_timeout *timeparms,
225 const char *ip_addr, rpc_authflavor_t authflavour,
226 int noresvport);
219 227
220/* dir.c */ 228/* dir.c */
221extern int nfs_access_cache_shrinker(struct shrinker *shrink, 229extern int nfs_access_cache_shrinker(struct shrinker *shrink,
@@ -248,24 +256,30 @@ extern void nfs_sb_active(struct super_block *sb);
248extern void nfs_sb_deactive(struct super_block *sb); 256extern void nfs_sb_deactive(struct super_block *sb);
249 257
250/* namespace.c */ 258/* namespace.c */
251extern char *nfs_path(const char *base, 259extern char *nfs_path(char **p, struct dentry *dentry,
252 const struct dentry *droot,
253 const struct dentry *dentry,
254 char *buffer, ssize_t buflen); 260 char *buffer, ssize_t buflen);
255extern struct vfsmount *nfs_d_automount(struct path *path); 261extern struct vfsmount *nfs_d_automount(struct path *path);
256 262
257/* getroot.c */ 263/* getroot.c */
258extern struct dentry *nfs_get_root(struct super_block *, struct nfs_fh *); 264extern struct dentry *nfs_get_root(struct super_block *, struct nfs_fh *,
265 const char *);
259#ifdef CONFIG_NFS_V4 266#ifdef CONFIG_NFS_V4
260extern struct dentry *nfs4_get_root(struct super_block *, struct nfs_fh *); 267extern struct dentry *nfs4_get_root(struct super_block *, struct nfs_fh *,
268 const char *);
261 269
262extern int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh); 270extern int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh);
263#endif 271#endif
264 272
265/* read.c */ 273/* read.c */
274extern int nfs_initiate_read(struct nfs_read_data *data, struct rpc_clnt *clnt,
275 const struct rpc_call_ops *call_ops);
266extern void nfs_read_prepare(struct rpc_task *task, void *calldata); 276extern void nfs_read_prepare(struct rpc_task *task, void *calldata);
267 277
268/* write.c */ 278/* write.c */
279extern int nfs_initiate_write(struct nfs_write_data *data,
280 struct rpc_clnt *clnt,
281 const struct rpc_call_ops *call_ops,
282 int how);
269extern void nfs_write_prepare(struct rpc_task *task, void *calldata); 283extern void nfs_write_prepare(struct rpc_task *task, void *calldata);
270#ifdef CONFIG_MIGRATION 284#ifdef CONFIG_MIGRATION
271extern int nfs_migrate_page(struct address_space *, 285extern int nfs_migrate_page(struct address_space *,
@@ -275,6 +289,13 @@ extern int nfs_migrate_page(struct address_space *,
275#endif 289#endif
276 290
277/* nfs4proc.c */ 291/* nfs4proc.c */
292extern void nfs4_reset_read(struct rpc_task *task, struct nfs_read_data *data);
293extern int nfs4_init_client(struct nfs_client *clp,
294 const struct rpc_timeout *timeparms,
295 const char *ip_addr,
296 rpc_authflavor_t authflavour,
297 int noresvport);
298extern void nfs4_reset_write(struct rpc_task *task, struct nfs_write_data *data);
278extern int _nfs4_call_sync(struct nfs_server *server, 299extern int _nfs4_call_sync(struct nfs_server *server,
279 struct rpc_message *msg, 300 struct rpc_message *msg,
280 struct nfs4_sequence_args *args, 301 struct nfs4_sequence_args *args,
@@ -289,12 +310,11 @@ extern int _nfs4_call_sync_session(struct nfs_server *server,
289/* 310/*
290 * Determine the device name as a string 311 * Determine the device name as a string
291 */ 312 */
292static inline char *nfs_devname(const struct vfsmount *mnt_parent, 313static inline char *nfs_devname(struct dentry *dentry,
293 const struct dentry *dentry,
294 char *buffer, ssize_t buflen) 314 char *buffer, ssize_t buflen)
295{ 315{
296 return nfs_path(mnt_parent->mnt_devname, mnt_parent->mnt_root, 316 char *dummy;
297 dentry, buffer, buflen); 317 return nfs_path(&dummy, dentry, buffer, buflen);
298} 318}
299 319
300/* 320/*
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index f32b8603dca8..bf1c68009ffd 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -25,33 +25,30 @@ static LIST_HEAD(nfs_automount_list);
25static DECLARE_DELAYED_WORK(nfs_automount_task, nfs_expire_automounts); 25static DECLARE_DELAYED_WORK(nfs_automount_task, nfs_expire_automounts);
26int nfs_mountpoint_expiry_timeout = 500 * HZ; 26int nfs_mountpoint_expiry_timeout = 500 * HZ;
27 27
28static struct vfsmount *nfs_do_submount(const struct vfsmount *mnt_parent, 28static struct vfsmount *nfs_do_submount(struct dentry *dentry,
29 const struct dentry *dentry,
30 struct nfs_fh *fh, 29 struct nfs_fh *fh,
31 struct nfs_fattr *fattr); 30 struct nfs_fattr *fattr);
32 31
33/* 32/*
34 * nfs_path - reconstruct the path given an arbitrary dentry 33 * nfs_path - reconstruct the path given an arbitrary dentry
35 * @base - arbitrary string to prepend to the path 34 * @base - used to return pointer to the end of devname part of path
36 * @droot - pointer to root dentry for mountpoint
37 * @dentry - pointer to dentry 35 * @dentry - pointer to dentry
38 * @buffer - result buffer 36 * @buffer - result buffer
39 * @buflen - length of buffer 37 * @buflen - length of buffer
40 * 38 *
41 * Helper function for constructing the path from the 39 * Helper function for constructing the server pathname
42 * root dentry to an arbitrary hashed dentry. 40 * by arbitrary hashed dentry.
43 * 41 *
44 * This is mainly for use in figuring out the path on the 42 * This is mainly for use in figuring out the path on the
45 * server side when automounting on top of an existing partition. 43 * server side when automounting on top of an existing partition
44 * and in generating /proc/mounts and friends.
46 */ 45 */
47char *nfs_path(const char *base, 46char *nfs_path(char **p, struct dentry *dentry, char *buffer, ssize_t buflen)
48 const struct dentry *droot,
49 const struct dentry *dentry,
50 char *buffer, ssize_t buflen)
51{ 47{
52 char *end; 48 char *end;
53 int namelen; 49 int namelen;
54 unsigned seq; 50 unsigned seq;
51 const char *base;
55 52
56rename_retry: 53rename_retry:
57 end = buffer+buflen; 54 end = buffer+buflen;
@@ -60,7 +57,10 @@ rename_retry:
60 57
61 seq = read_seqbegin(&rename_lock); 58 seq = read_seqbegin(&rename_lock);
62 rcu_read_lock(); 59 rcu_read_lock();
63 while (!IS_ROOT(dentry) && dentry != droot) { 60 while (1) {
61 spin_lock(&dentry->d_lock);
62 if (IS_ROOT(dentry))
63 break;
64 namelen = dentry->d_name.len; 64 namelen = dentry->d_name.len;
65 buflen -= namelen + 1; 65 buflen -= namelen + 1;
66 if (buflen < 0) 66 if (buflen < 0)
@@ -68,27 +68,47 @@ rename_retry:
68 end -= namelen; 68 end -= namelen;
69 memcpy(end, dentry->d_name.name, namelen); 69 memcpy(end, dentry->d_name.name, namelen);
70 *--end = '/'; 70 *--end = '/';
71 spin_unlock(&dentry->d_lock);
71 dentry = dentry->d_parent; 72 dentry = dentry->d_parent;
72 } 73 }
73 rcu_read_unlock(); 74 if (read_seqretry(&rename_lock, seq)) {
74 if (read_seqretry(&rename_lock, seq)) 75 spin_unlock(&dentry->d_lock);
76 rcu_read_unlock();
75 goto rename_retry; 77 goto rename_retry;
78 }
76 if (*end != '/') { 79 if (*end != '/') {
77 if (--buflen < 0) 80 if (--buflen < 0) {
81 spin_unlock(&dentry->d_lock);
82 rcu_read_unlock();
78 goto Elong; 83 goto Elong;
84 }
79 *--end = '/'; 85 *--end = '/';
80 } 86 }
87 *p = end;
88 base = dentry->d_fsdata;
89 if (!base) {
90 spin_unlock(&dentry->d_lock);
91 rcu_read_unlock();
92 WARN_ON(1);
93 return end;
94 }
81 namelen = strlen(base); 95 namelen = strlen(base);
82 /* Strip off excess slashes in base string */ 96 /* Strip off excess slashes in base string */
83 while (namelen > 0 && base[namelen - 1] == '/') 97 while (namelen > 0 && base[namelen - 1] == '/')
84 namelen--; 98 namelen--;
85 buflen -= namelen; 99 buflen -= namelen;
86 if (buflen < 0) 100 if (buflen < 0) {
101 spin_unlock(&dentry->d_lock);
102 rcu_read_unlock();
87 goto Elong; 103 goto Elong;
104 }
88 end -= namelen; 105 end -= namelen;
89 memcpy(end, base, namelen); 106 memcpy(end, base, namelen);
107 spin_unlock(&dentry->d_lock);
108 rcu_read_unlock();
90 return end; 109 return end;
91Elong_unlock: 110Elong_unlock:
111 spin_unlock(&dentry->d_lock);
92 rcu_read_unlock(); 112 rcu_read_unlock();
93 if (read_seqretry(&rename_lock, seq)) 113 if (read_seqretry(&rename_lock, seq))
94 goto rename_retry; 114 goto rename_retry;
@@ -143,9 +163,9 @@ struct vfsmount *nfs_d_automount(struct path *path)
143 } 163 }
144 164
145 if (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) 165 if (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL)
146 mnt = nfs_do_refmount(path->mnt, path->dentry); 166 mnt = nfs_do_refmount(path->dentry);
147 else 167 else
148 mnt = nfs_do_submount(path->mnt, path->dentry, fh, fattr); 168 mnt = nfs_do_submount(path->dentry, fh, fattr);
149 if (IS_ERR(mnt)) 169 if (IS_ERR(mnt))
150 goto out; 170 goto out;
151 171
@@ -209,19 +229,17 @@ static struct vfsmount *nfs_do_clone_mount(struct nfs_server *server,
209 229
210/** 230/**
211 * nfs_do_submount - set up mountpoint when crossing a filesystem boundary 231 * nfs_do_submount - set up mountpoint when crossing a filesystem boundary
212 * @mnt_parent - mountpoint of parent directory
213 * @dentry - parent directory 232 * @dentry - parent directory
214 * @fh - filehandle for new root dentry 233 * @fh - filehandle for new root dentry
215 * @fattr - attributes for new root inode 234 * @fattr - attributes for new root inode
216 * 235 *
217 */ 236 */
218static struct vfsmount *nfs_do_submount(const struct vfsmount *mnt_parent, 237static struct vfsmount *nfs_do_submount(struct dentry *dentry,
219 const struct dentry *dentry,
220 struct nfs_fh *fh, 238 struct nfs_fh *fh,
221 struct nfs_fattr *fattr) 239 struct nfs_fattr *fattr)
222{ 240{
223 struct nfs_clone_mount mountdata = { 241 struct nfs_clone_mount mountdata = {
224 .sb = mnt_parent->mnt_sb, 242 .sb = dentry->d_sb,
225 .dentry = dentry, 243 .dentry = dentry,
226 .fh = fh, 244 .fh = fh,
227 .fattr = fattr, 245 .fattr = fattr,
@@ -237,11 +255,11 @@ static struct vfsmount *nfs_do_submount(const struct vfsmount *mnt_parent,
237 dentry->d_name.name); 255 dentry->d_name.name);
238 if (page == NULL) 256 if (page == NULL)
239 goto out; 257 goto out;
240 devname = nfs_devname(mnt_parent, dentry, page, PAGE_SIZE); 258 devname = nfs_devname(dentry, page, PAGE_SIZE);
241 mnt = (struct vfsmount *)devname; 259 mnt = (struct vfsmount *)devname;
242 if (IS_ERR(devname)) 260 if (IS_ERR(devname))
243 goto free_page; 261 goto free_page;
244 mnt = nfs_do_clone_mount(NFS_SB(mnt_parent->mnt_sb), devname, &mountdata); 262 mnt = nfs_do_clone_mount(NFS_SB(dentry->d_sb), devname, &mountdata);
245free_page: 263free_page:
246 free_page((unsigned long)page); 264 free_page((unsigned long)page);
247out: 265out:
diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c
index 9f88c5f4c7e2..274342771655 100644
--- a/fs/nfs/nfs3acl.c
+++ b/fs/nfs/nfs3acl.c
@@ -311,8 +311,8 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
311 if (!nfs_server_capable(inode, NFS_CAP_ACLS)) 311 if (!nfs_server_capable(inode, NFS_CAP_ACLS))
312 goto out; 312 goto out;
313 313
314 /* We are doing this here, because XDR marshalling can only 314 /* We are doing this here because XDR marshalling does not
315 return -ENOMEM. */ 315 * return any results, it BUGs. */
316 status = -ENOSPC; 316 status = -ENOSPC;
317 if (acl != NULL && acl->a_count > NFS_ACL_MAX_ENTRIES) 317 if (acl != NULL && acl->a_count > NFS_ACL_MAX_ENTRIES)
318 goto out; 318 goto out;
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index ce939c062a52..d0c80d8b3f96 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -885,4 +885,5 @@ const struct nfs_rpc_ops nfs_v3_clientops = {
885 .lock = nfs3_proc_lock, 885 .lock = nfs3_proc_lock,
886 .clear_acl_cache = nfs3_forget_cached_acls, 886 .clear_acl_cache = nfs3_forget_cached_acls,
887 .close_context = nfs_close_context, 887 .close_context = nfs_close_context,
888 .init_client = nfs_init_client,
888}; 889};
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index 01c5e8b1941d..183c6b123d0f 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -1328,10 +1328,13 @@ static void nfs3_xdr_enc_setacl3args(struct rpc_rqst *req,
1328 1328
1329 encode_nfs_fh3(xdr, NFS_FH(args->inode)); 1329 encode_nfs_fh3(xdr, NFS_FH(args->inode));
1330 encode_uint32(xdr, args->mask); 1330 encode_uint32(xdr, args->mask);
1331
1332 base = req->rq_slen;
1331 if (args->npages != 0) 1333 if (args->npages != 0)
1332 xdr_write_pages(xdr, args->pages, 0, args->len); 1334 xdr_write_pages(xdr, args->pages, 0, args->len);
1335 else
1336 xdr_reserve_space(xdr, NFS_ACL_INLINE_BUFSIZE);
1333 1337
1334 base = req->rq_slen;
1335 error = nfsacl_encode(xdr->buf, base, args->inode, 1338 error = nfsacl_encode(xdr->buf, base, args->inode,
1336 (args->mask & NFS_ACL) ? 1339 (args->mask & NFS_ACL) ?
1337 args->acl_access : NULL, 1, 0); 1340 args->acl_access : NULL, 1, 0);
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 7a7474073148..c64be1cff080 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -252,6 +252,9 @@ static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *ser
252extern int nfs4_setup_sequence(const struct nfs_server *server, 252extern int nfs4_setup_sequence(const struct nfs_server *server,
253 struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, 253 struct nfs4_sequence_args *args, struct nfs4_sequence_res *res,
254 int cache_reply, struct rpc_task *task); 254 int cache_reply, struct rpc_task *task);
255extern int nfs41_setup_sequence(struct nfs4_session *session,
256 struct nfs4_sequence_args *args, struct nfs4_sequence_res *res,
257 int cache_reply, struct rpc_task *task);
255extern void nfs4_destroy_session(struct nfs4_session *session); 258extern void nfs4_destroy_session(struct nfs4_session *session);
256extern struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp); 259extern struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp);
257extern int nfs4_proc_create_session(struct nfs_client *); 260extern int nfs4_proc_create_session(struct nfs_client *);
@@ -259,6 +262,19 @@ extern int nfs4_proc_destroy_session(struct nfs4_session *);
259extern int nfs4_init_session(struct nfs_server *server); 262extern int nfs4_init_session(struct nfs_server *server);
260extern int nfs4_proc_get_lease_time(struct nfs_client *clp, 263extern int nfs4_proc_get_lease_time(struct nfs_client *clp,
261 struct nfs_fsinfo *fsinfo); 264 struct nfs_fsinfo *fsinfo);
265
266static inline bool
267is_ds_only_client(struct nfs_client *clp)
268{
269 return (clp->cl_exchange_flags & EXCHGID4_FLAG_MASK_PNFS) ==
270 EXCHGID4_FLAG_USE_PNFS_DS;
271}
272
273static inline bool
274is_ds_client(struct nfs_client *clp)
275{
276 return clp->cl_exchange_flags & EXCHGID4_FLAG_USE_PNFS_DS;
277}
262#else /* CONFIG_NFS_v4_1 */ 278#else /* CONFIG_NFS_v4_1 */
263static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *server) 279static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *server)
264{ 280{
@@ -276,6 +292,18 @@ static inline int nfs4_init_session(struct nfs_server *server)
276{ 292{
277 return 0; 293 return 0;
278} 294}
295
296static inline bool
297is_ds_only_client(struct nfs_client *clp)
298{
299 return false;
300}
301
302static inline bool
303is_ds_client(struct nfs_client *clp)
304{
305 return false;
306}
279#endif /* CONFIG_NFS_V4_1 */ 307#endif /* CONFIG_NFS_V4_1 */
280 308
281extern const struct nfs4_minor_version_ops *nfs_v4_minor_ops[]; 309extern const struct nfs4_minor_version_ops *nfs_v4_minor_ops[];
@@ -298,6 +326,11 @@ struct rpc_cred *nfs4_get_renew_cred_locked(struct nfs_client *clp);
298#if defined(CONFIG_NFS_V4_1) 326#if defined(CONFIG_NFS_V4_1)
299struct rpc_cred *nfs4_get_machine_cred_locked(struct nfs_client *clp); 327struct rpc_cred *nfs4_get_machine_cred_locked(struct nfs_client *clp);
300struct rpc_cred *nfs4_get_exchange_id_cred(struct nfs_client *clp); 328struct rpc_cred *nfs4_get_exchange_id_cred(struct nfs_client *clp);
329extern void nfs4_schedule_session_recovery(struct nfs4_session *);
330#else
331static inline void nfs4_schedule_session_recovery(struct nfs4_session *session)
332{
333}
301#endif /* CONFIG_NFS_V4_1 */ 334#endif /* CONFIG_NFS_V4_1 */
302 335
303extern struct nfs4_state_owner * nfs4_get_state_owner(struct nfs_server *, struct rpc_cred *); 336extern struct nfs4_state_owner * nfs4_get_state_owner(struct nfs_server *, struct rpc_cred *);
@@ -307,10 +340,9 @@ extern void nfs4_put_open_state(struct nfs4_state *);
307extern void nfs4_close_state(struct path *, struct nfs4_state *, fmode_t); 340extern void nfs4_close_state(struct path *, struct nfs4_state *, fmode_t);
308extern void nfs4_close_sync(struct path *, struct nfs4_state *, fmode_t); 341extern void nfs4_close_sync(struct path *, struct nfs4_state *, fmode_t);
309extern void nfs4_state_set_mode_locked(struct nfs4_state *, fmode_t); 342extern void nfs4_state_set_mode_locked(struct nfs4_state *, fmode_t);
310extern void nfs4_schedule_state_recovery(struct nfs_client *); 343extern void nfs4_schedule_lease_recovery(struct nfs_client *);
311extern void nfs4_schedule_state_manager(struct nfs_client *); 344extern void nfs4_schedule_state_manager(struct nfs_client *);
312extern int nfs4_state_mark_reclaim_nograce(struct nfs_client *clp, struct nfs4_state *state); 345extern void nfs4_schedule_stateid_recovery(const struct nfs_server *, struct nfs4_state *);
313extern int nfs4_state_mark_reclaim_reboot(struct nfs_client *clp, struct nfs4_state *state);
314extern void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags); 346extern void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags);
315extern void nfs41_handle_recall_slot(struct nfs_client *clp); 347extern void nfs41_handle_recall_slot(struct nfs_client *clp);
316extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp); 348extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp);
diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index 23f930caf1e2..428558464817 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -40,32 +40,309 @@ MODULE_LICENSE("GPL");
40MODULE_AUTHOR("Dean Hildebrand <dhildebz@umich.edu>"); 40MODULE_AUTHOR("Dean Hildebrand <dhildebz@umich.edu>");
41MODULE_DESCRIPTION("The NFSv4 file layout driver"); 41MODULE_DESCRIPTION("The NFSv4 file layout driver");
42 42
43static int 43#define FILELAYOUT_POLL_RETRY_MAX (15*HZ)
44filelayout_set_layoutdriver(struct nfs_server *nfss) 44
45static loff_t
46filelayout_get_dense_offset(struct nfs4_filelayout_segment *flseg,
47 loff_t offset)
45{ 48{
46 int status = pnfs_alloc_init_deviceid_cache(nfss->nfs_client, 49 u32 stripe_width = flseg->stripe_unit * flseg->dsaddr->stripe_count;
47 nfs4_fl_free_deviceid_callback); 50 u64 tmp;
48 if (status) { 51
49 printk(KERN_WARNING "%s: deviceid cache could not be " 52 offset -= flseg->pattern_offset;
50 "initialized\n", __func__); 53 tmp = offset;
51 return status; 54 do_div(tmp, stripe_width);
55
56 return tmp * flseg->stripe_unit + do_div(offset, flseg->stripe_unit);
57}
58
59/* This function is used by the layout driver to calculate the
60 * offset of the file on the dserver based on whether the
61 * layout type is STRIPE_DENSE or STRIPE_SPARSE
62 */
63static loff_t
64filelayout_get_dserver_offset(struct pnfs_layout_segment *lseg, loff_t offset)
65{
66 struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg);
67
68 switch (flseg->stripe_type) {
69 case STRIPE_SPARSE:
70 return offset;
71
72 case STRIPE_DENSE:
73 return filelayout_get_dense_offset(flseg, offset);
52 } 74 }
53 dprintk("%s: deviceid cache has been initialized successfully\n", 75
54 __func__); 76 BUG();
77}
78
79/* For data server errors we don't recover from */
80static void
81filelayout_set_lo_fail(struct pnfs_layout_segment *lseg)
82{
83 if (lseg->pls_range.iomode == IOMODE_RW) {
84 dprintk("%s Setting layout IOMODE_RW fail bit\n", __func__);
85 set_bit(lo_fail_bit(IOMODE_RW), &lseg->pls_layout->plh_flags);
86 } else {
87 dprintk("%s Setting layout IOMODE_READ fail bit\n", __func__);
88 set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags);
89 }
90}
91
92static int filelayout_async_handle_error(struct rpc_task *task,
93 struct nfs4_state *state,
94 struct nfs_client *clp,
95 int *reset)
96{
97 if (task->tk_status >= 0)
98 return 0;
99
100 *reset = 0;
101
102 switch (task->tk_status) {
103 case -NFS4ERR_BADSESSION:
104 case -NFS4ERR_BADSLOT:
105 case -NFS4ERR_BAD_HIGH_SLOT:
106 case -NFS4ERR_DEADSESSION:
107 case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
108 case -NFS4ERR_SEQ_FALSE_RETRY:
109 case -NFS4ERR_SEQ_MISORDERED:
110 dprintk("%s ERROR %d, Reset session. Exchangeid "
111 "flags 0x%x\n", __func__, task->tk_status,
112 clp->cl_exchange_flags);
113 nfs4_schedule_session_recovery(clp->cl_session);
114 break;
115 case -NFS4ERR_DELAY:
116 case -NFS4ERR_GRACE:
117 case -EKEYEXPIRED:
118 rpc_delay(task, FILELAYOUT_POLL_RETRY_MAX);
119 break;
120 default:
121 dprintk("%s DS error. Retry through MDS %d\n", __func__,
122 task->tk_status);
123 *reset = 1;
124 break;
125 }
126 task->tk_status = 0;
127 return -EAGAIN;
128}
129
130/* NFS_PROTO call done callback routines */
131
132static int filelayout_read_done_cb(struct rpc_task *task,
133 struct nfs_read_data *data)
134{
135 struct nfs_client *clp = data->ds_clp;
136 int reset = 0;
137
138 dprintk("%s DS read\n", __func__);
139
140 if (filelayout_async_handle_error(task, data->args.context->state,
141 data->ds_clp, &reset) == -EAGAIN) {
142 dprintk("%s calling restart ds_clp %p ds_clp->cl_session %p\n",
143 __func__, data->ds_clp, data->ds_clp->cl_session);
144 if (reset) {
145 filelayout_set_lo_fail(data->lseg);
146 nfs4_reset_read(task, data);
147 clp = NFS_SERVER(data->inode)->nfs_client;
148 }
149 nfs_restart_rpc(task, clp);
150 return -EAGAIN;
151 }
152
55 return 0; 153 return 0;
56} 154}
57 155
58/* Clear out the layout by destroying its device list */ 156/*
59static int 157 * Call ops for the async read/write cases
60filelayout_clear_layoutdriver(struct nfs_server *nfss) 158 * In the case of dense layouts, the offset needs to be reset to its
159 * original value.
160 */
161static void filelayout_read_prepare(struct rpc_task *task, void *data)
61{ 162{
62 dprintk("--> %s\n", __func__); 163 struct nfs_read_data *rdata = (struct nfs_read_data *)data;
164
165 rdata->read_done_cb = filelayout_read_done_cb;
166
167 if (nfs41_setup_sequence(rdata->ds_clp->cl_session,
168 &rdata->args.seq_args, &rdata->res.seq_res,
169 0, task))
170 return;
171
172 rpc_call_start(task);
173}
174
175static void filelayout_read_call_done(struct rpc_task *task, void *data)
176{
177 struct nfs_read_data *rdata = (struct nfs_read_data *)data;
178
179 dprintk("--> %s task->tk_status %d\n", __func__, task->tk_status);
180
181 /* Note this may cause RPC to be resent */
182 rdata->mds_ops->rpc_call_done(task, data);
183}
184
185static void filelayout_read_release(void *data)
186{
187 struct nfs_read_data *rdata = (struct nfs_read_data *)data;
188
189 rdata->mds_ops->rpc_release(data);
190}
191
192static int filelayout_write_done_cb(struct rpc_task *task,
193 struct nfs_write_data *data)
194{
195 int reset = 0;
196
197 if (filelayout_async_handle_error(task, data->args.context->state,
198 data->ds_clp, &reset) == -EAGAIN) {
199 struct nfs_client *clp;
200
201 dprintk("%s calling restart ds_clp %p ds_clp->cl_session %p\n",
202 __func__, data->ds_clp, data->ds_clp->cl_session);
203 if (reset) {
204 filelayout_set_lo_fail(data->lseg);
205 nfs4_reset_write(task, data);
206 clp = NFS_SERVER(data->inode)->nfs_client;
207 } else
208 clp = data->ds_clp;
209 nfs_restart_rpc(task, clp);
210 return -EAGAIN;
211 }
63 212
64 if (nfss->nfs_client->cl_devid_cache)
65 pnfs_put_deviceid_cache(nfss->nfs_client);
66 return 0; 213 return 0;
67} 214}
68 215
216static void filelayout_write_prepare(struct rpc_task *task, void *data)
217{
218 struct nfs_write_data *wdata = (struct nfs_write_data *)data;
219
220 if (nfs41_setup_sequence(wdata->ds_clp->cl_session,
221 &wdata->args.seq_args, &wdata->res.seq_res,
222 0, task))
223 return;
224
225 rpc_call_start(task);
226}
227
228static void filelayout_write_call_done(struct rpc_task *task, void *data)
229{
230 struct nfs_write_data *wdata = (struct nfs_write_data *)data;
231
232 /* Note this may cause RPC to be resent */
233 wdata->mds_ops->rpc_call_done(task, data);
234}
235
236static void filelayout_write_release(void *data)
237{
238 struct nfs_write_data *wdata = (struct nfs_write_data *)data;
239
240 wdata->mds_ops->rpc_release(data);
241}
242
243struct rpc_call_ops filelayout_read_call_ops = {
244 .rpc_call_prepare = filelayout_read_prepare,
245 .rpc_call_done = filelayout_read_call_done,
246 .rpc_release = filelayout_read_release,
247};
248
249struct rpc_call_ops filelayout_write_call_ops = {
250 .rpc_call_prepare = filelayout_write_prepare,
251 .rpc_call_done = filelayout_write_call_done,
252 .rpc_release = filelayout_write_release,
253};
254
255static enum pnfs_try_status
256filelayout_read_pagelist(struct nfs_read_data *data)
257{
258 struct pnfs_layout_segment *lseg = data->lseg;
259 struct nfs4_pnfs_ds *ds;
260 loff_t offset = data->args.offset;
261 u32 j, idx;
262 struct nfs_fh *fh;
263 int status;
264
265 dprintk("--> %s ino %lu pgbase %u req %Zu@%llu\n",
266 __func__, data->inode->i_ino,
267 data->args.pgbase, (size_t)data->args.count, offset);
268
269 /* Retrieve the correct rpc_client for the byte range */
270 j = nfs4_fl_calc_j_index(lseg, offset);
271 idx = nfs4_fl_calc_ds_index(lseg, j);
272 ds = nfs4_fl_prepare_ds(lseg, idx);
273 if (!ds) {
274 /* Either layout fh index faulty, or ds connect failed */
275 set_bit(lo_fail_bit(IOMODE_RW), &lseg->pls_layout->plh_flags);
276 set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags);
277 return PNFS_NOT_ATTEMPTED;
278 }
279 dprintk("%s USE DS:ip %x %hu\n", __func__,
280 ntohl(ds->ds_ip_addr), ntohs(ds->ds_port));
281
282 /* No multipath support. Use first DS */
283 data->ds_clp = ds->ds_clp;
284 fh = nfs4_fl_select_ds_fh(lseg, j);
285 if (fh)
286 data->args.fh = fh;
287
288 data->args.offset = filelayout_get_dserver_offset(lseg, offset);
289 data->mds_offset = offset;
290
291 /* Perform an asynchronous read to ds */
292 status = nfs_initiate_read(data, ds->ds_clp->cl_rpcclient,
293 &filelayout_read_call_ops);
294 BUG_ON(status != 0);
295 return PNFS_ATTEMPTED;
296}
297
298/* Perform async writes. */
299static enum pnfs_try_status
300filelayout_write_pagelist(struct nfs_write_data *data, int sync)
301{
302 struct pnfs_layout_segment *lseg = data->lseg;
303 struct nfs4_pnfs_ds *ds;
304 loff_t offset = data->args.offset;
305 u32 j, idx;
306 struct nfs_fh *fh;
307 int status;
308
309 /* Retrieve the correct rpc_client for the byte range */
310 j = nfs4_fl_calc_j_index(lseg, offset);
311 idx = nfs4_fl_calc_ds_index(lseg, j);
312 ds = nfs4_fl_prepare_ds(lseg, idx);
313 if (!ds) {
314 printk(KERN_ERR "%s: prepare_ds failed, use MDS\n", __func__);
315 set_bit(lo_fail_bit(IOMODE_RW), &lseg->pls_layout->plh_flags);
316 set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags);
317 return PNFS_NOT_ATTEMPTED;
318 }
319 dprintk("%s ino %lu sync %d req %Zu@%llu DS:%x:%hu\n", __func__,
320 data->inode->i_ino, sync, (size_t) data->args.count, offset,
321 ntohl(ds->ds_ip_addr), ntohs(ds->ds_port));
322
323 /* We can't handle commit to ds yet */
324 if (!FILELAYOUT_LSEG(lseg)->commit_through_mds)
325 data->args.stable = NFS_FILE_SYNC;
326
327 data->write_done_cb = filelayout_write_done_cb;
328 data->ds_clp = ds->ds_clp;
329 fh = nfs4_fl_select_ds_fh(lseg, j);
330 if (fh)
331 data->args.fh = fh;
332 /*
333 * Get the file offset on the dserver. Set the write offset to
334 * this offset and save the original offset.
335 */
336 data->args.offset = filelayout_get_dserver_offset(lseg, offset);
337 data->mds_offset = offset;
338
339 /* Perform an asynchronous write */
340 status = nfs_initiate_write(data, ds->ds_clp->cl_rpcclient,
341 &filelayout_write_call_ops, sync);
342 BUG_ON(status != 0);
343 return PNFS_ATTEMPTED;
344}
345
69/* 346/*
70 * filelayout_check_layout() 347 * filelayout_check_layout()
71 * 348 *
@@ -92,14 +369,14 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo,
92 goto out; 369 goto out;
93 } 370 }
94 371
95 if (fl->stripe_unit % PAGE_SIZE) { 372 if (!fl->stripe_unit || fl->stripe_unit % PAGE_SIZE) {
96 dprintk("%s Stripe unit (%u) not page aligned\n", 373 dprintk("%s Invalid stripe unit (%u)\n",
97 __func__, fl->stripe_unit); 374 __func__, fl->stripe_unit);
98 goto out; 375 goto out;
99 } 376 }
100 377
101 /* find and reference the deviceid */ 378 /* find and reference the deviceid */
102 dsaddr = nfs4_fl_find_get_deviceid(nfss->nfs_client, id); 379 dsaddr = nfs4_fl_find_get_deviceid(id);
103 if (dsaddr == NULL) { 380 if (dsaddr == NULL) {
104 dsaddr = get_device_info(lo->plh_inode, id); 381 dsaddr = get_device_info(lo->plh_inode, id);
105 if (dsaddr == NULL) 382 if (dsaddr == NULL)
@@ -134,7 +411,7 @@ out:
134 dprintk("--> %s returns %d\n", __func__, status); 411 dprintk("--> %s returns %d\n", __func__, status);
135 return status; 412 return status;
136out_put: 413out_put:
137 pnfs_put_deviceid(nfss->nfs_client->cl_devid_cache, &dsaddr->deviceid); 414 nfs4_fl_put_deviceid(dsaddr);
138 goto out; 415 goto out;
139} 416}
140 417
@@ -243,23 +520,47 @@ filelayout_alloc_lseg(struct pnfs_layout_hdr *layoutid,
243static void 520static void
244filelayout_free_lseg(struct pnfs_layout_segment *lseg) 521filelayout_free_lseg(struct pnfs_layout_segment *lseg)
245{ 522{
246 struct nfs_server *nfss = NFS_SERVER(lseg->pls_layout->plh_inode);
247 struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg); 523 struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg);
248 524
249 dprintk("--> %s\n", __func__); 525 dprintk("--> %s\n", __func__);
250 pnfs_put_deviceid(nfss->nfs_client->cl_devid_cache, 526 nfs4_fl_put_deviceid(fl->dsaddr);
251 &fl->dsaddr->deviceid);
252 _filelayout_free_lseg(fl); 527 _filelayout_free_lseg(fl);
253} 528}
254 529
530/*
531 * filelayout_pg_test(). Called by nfs_can_coalesce_requests()
532 *
533 * return 1 : coalesce page
534 * return 0 : don't coalesce page
535 */
536int
537filelayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
538 struct nfs_page *req)
539{
540 u64 p_stripe, r_stripe;
541 u32 stripe_unit;
542
543 if (!pgio->pg_lseg)
544 return 1;
545 p_stripe = (u64)prev->wb_index << PAGE_CACHE_SHIFT;
546 r_stripe = (u64)req->wb_index << PAGE_CACHE_SHIFT;
547 stripe_unit = FILELAYOUT_LSEG(pgio->pg_lseg)->stripe_unit;
548
549 do_div(p_stripe, stripe_unit);
550 do_div(r_stripe, stripe_unit);
551
552 return (p_stripe == r_stripe);
553}
554
255static struct pnfs_layoutdriver_type filelayout_type = { 555static struct pnfs_layoutdriver_type filelayout_type = {
256 .id = LAYOUT_NFSV4_1_FILES, 556 .id = LAYOUT_NFSV4_1_FILES,
257 .name = "LAYOUT_NFSV4_1_FILES", 557 .name = "LAYOUT_NFSV4_1_FILES",
258 .owner = THIS_MODULE, 558 .owner = THIS_MODULE,
259 .set_layoutdriver = filelayout_set_layoutdriver, 559 .alloc_lseg = filelayout_alloc_lseg,
260 .clear_layoutdriver = filelayout_clear_layoutdriver, 560 .free_lseg = filelayout_free_lseg,
261 .alloc_lseg = filelayout_alloc_lseg, 561 .pg_test = filelayout_pg_test,
262 .free_lseg = filelayout_free_lseg, 562 .read_pagelist = filelayout_read_pagelist,
563 .write_pagelist = filelayout_write_pagelist,
263}; 564};
264 565
265static int __init nfs4filelayout_init(void) 566static int __init nfs4filelayout_init(void)
diff --git a/fs/nfs/nfs4filelayout.h b/fs/nfs/nfs4filelayout.h
index bbf60dd2ab9d..ee0c907742b5 100644
--- a/fs/nfs/nfs4filelayout.h
+++ b/fs/nfs/nfs4filelayout.h
@@ -55,8 +55,14 @@ struct nfs4_pnfs_ds {
55 atomic_t ds_count; 55 atomic_t ds_count;
56}; 56};
57 57
58/* nfs4_file_layout_dsaddr flags */
59#define NFS4_DEVICE_ID_NEG_ENTRY 0x00000001
60
58struct nfs4_file_layout_dsaddr { 61struct nfs4_file_layout_dsaddr {
59 struct pnfs_deviceid_node deviceid; 62 struct hlist_node node;
63 struct nfs4_deviceid deviceid;
64 atomic_t ref;
65 unsigned long flags;
60 u32 stripe_count; 66 u32 stripe_count;
61 u8 *stripe_indices; 67 u8 *stripe_indices;
62 u32 ds_num; 68 u32 ds_num;
@@ -83,11 +89,18 @@ FILELAYOUT_LSEG(struct pnfs_layout_segment *lseg)
83 generic_hdr); 89 generic_hdr);
84} 90}
85 91
86extern void nfs4_fl_free_deviceid_callback(struct pnfs_deviceid_node *); 92extern struct nfs_fh *
93nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, u32 j);
94
87extern void print_ds(struct nfs4_pnfs_ds *ds); 95extern void print_ds(struct nfs4_pnfs_ds *ds);
88extern void print_deviceid(struct nfs4_deviceid *dev_id); 96extern void print_deviceid(struct nfs4_deviceid *dev_id);
97u32 nfs4_fl_calc_j_index(struct pnfs_layout_segment *lseg, loff_t offset);
98u32 nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, u32 j);
99struct nfs4_pnfs_ds *nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg,
100 u32 ds_idx);
89extern struct nfs4_file_layout_dsaddr * 101extern struct nfs4_file_layout_dsaddr *
90nfs4_fl_find_get_deviceid(struct nfs_client *, struct nfs4_deviceid *dev_id); 102nfs4_fl_find_get_deviceid(struct nfs4_deviceid *dev_id);
103extern void nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr);
91struct nfs4_file_layout_dsaddr * 104struct nfs4_file_layout_dsaddr *
92get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id); 105get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id);
93 106
diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c
index 51fe64ace55a..68143c162e3b 100644
--- a/fs/nfs/nfs4filelayoutdev.c
+++ b/fs/nfs/nfs4filelayoutdev.c
@@ -37,6 +37,30 @@
37#define NFSDBG_FACILITY NFSDBG_PNFS_LD 37#define NFSDBG_FACILITY NFSDBG_PNFS_LD
38 38
39/* 39/*
40 * Device ID RCU cache. A device ID is unique per client ID and layout type.
41 */
42#define NFS4_FL_DEVICE_ID_HASH_BITS 5
43#define NFS4_FL_DEVICE_ID_HASH_SIZE (1 << NFS4_FL_DEVICE_ID_HASH_BITS)
44#define NFS4_FL_DEVICE_ID_HASH_MASK (NFS4_FL_DEVICE_ID_HASH_SIZE - 1)
45
46static inline u32
47nfs4_fl_deviceid_hash(struct nfs4_deviceid *id)
48{
49 unsigned char *cptr = (unsigned char *)id->data;
50 unsigned int nbytes = NFS4_DEVICEID4_SIZE;
51 u32 x = 0;
52
53 while (nbytes--) {
54 x *= 37;
55 x += *cptr++;
56 }
57 return x & NFS4_FL_DEVICE_ID_HASH_MASK;
58}
59
60static struct hlist_head filelayout_deviceid_cache[NFS4_FL_DEVICE_ID_HASH_SIZE];
61static DEFINE_SPINLOCK(filelayout_deviceid_lock);
62
63/*
40 * Data server cache 64 * Data server cache
41 * 65 *
42 * Data servers can be mapped to different device ids. 66 * Data servers can be mapped to different device ids.
@@ -104,6 +128,67 @@ _data_server_lookup_locked(u32 ip_addr, u32 port)
104 return NULL; 128 return NULL;
105} 129}
106 130
131/*
132 * Create an rpc connection to the nfs4_pnfs_ds data server
133 * Currently only support IPv4
134 */
135static int
136nfs4_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds)
137{
138 struct nfs_client *clp;
139 struct sockaddr_in sin;
140 int status = 0;
141
142 dprintk("--> %s ip:port %x:%hu au_flavor %d\n", __func__,
143 ntohl(ds->ds_ip_addr), ntohs(ds->ds_port),
144 mds_srv->nfs_client->cl_rpcclient->cl_auth->au_flavor);
145
146 sin.sin_family = AF_INET;
147 sin.sin_addr.s_addr = ds->ds_ip_addr;
148 sin.sin_port = ds->ds_port;
149
150 clp = nfs4_set_ds_client(mds_srv->nfs_client, (struct sockaddr *)&sin,
151 sizeof(sin), IPPROTO_TCP);
152 if (IS_ERR(clp)) {
153 status = PTR_ERR(clp);
154 goto out;
155 }
156
157 if ((clp->cl_exchange_flags & EXCHGID4_FLAG_MASK_PNFS) != 0) {
158 if (!is_ds_client(clp)) {
159 status = -ENODEV;
160 goto out_put;
161 }
162 ds->ds_clp = clp;
163 dprintk("%s [existing] ip=%x, port=%hu\n", __func__,
164 ntohl(ds->ds_ip_addr), ntohs(ds->ds_port));
165 goto out;
166 }
167
168 /*
169 * Do not set NFS_CS_CHECK_LEASE_TIME instead set the DS lease to
170 * be equal to the MDS lease. Renewal is scheduled in create_session.
171 */
172 spin_lock(&mds_srv->nfs_client->cl_lock);
173 clp->cl_lease_time = mds_srv->nfs_client->cl_lease_time;
174 spin_unlock(&mds_srv->nfs_client->cl_lock);
175 clp->cl_last_renewal = jiffies;
176
177 /* New nfs_client */
178 status = nfs4_init_ds_session(clp);
179 if (status)
180 goto out_put;
181
182 ds->ds_clp = clp;
183 dprintk("%s [new] ip=%x, port=%hu\n", __func__, ntohl(ds->ds_ip_addr),
184 ntohs(ds->ds_port));
185out:
186 return status;
187out_put:
188 nfs_put_client(clp);
189 goto out;
190}
191
107static void 192static void
108destroy_ds(struct nfs4_pnfs_ds *ds) 193destroy_ds(struct nfs4_pnfs_ds *ds)
109{ 194{
@@ -122,7 +207,7 @@ nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr)
122 struct nfs4_pnfs_ds *ds; 207 struct nfs4_pnfs_ds *ds;
123 int i; 208 int i;
124 209
125 print_deviceid(&dsaddr->deviceid.de_id); 210 print_deviceid(&dsaddr->deviceid);
126 211
127 for (i = 0; i < dsaddr->ds_num; i++) { 212 for (i = 0; i < dsaddr->ds_num; i++) {
128 ds = dsaddr->ds_list[i]; 213 ds = dsaddr->ds_list[i];
@@ -139,15 +224,6 @@ nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr)
139 kfree(dsaddr); 224 kfree(dsaddr);
140} 225}
141 226
142void
143nfs4_fl_free_deviceid_callback(struct pnfs_deviceid_node *device)
144{
145 struct nfs4_file_layout_dsaddr *dsaddr =
146 container_of(device, struct nfs4_file_layout_dsaddr, deviceid);
147
148 nfs4_fl_free_deviceid(dsaddr);
149}
150
151static struct nfs4_pnfs_ds * 227static struct nfs4_pnfs_ds *
152nfs4_pnfs_ds_add(struct inode *inode, u32 ip_addr, u32 port) 228nfs4_pnfs_ds_add(struct inode *inode, u32 ip_addr, u32 port)
153{ 229{
@@ -214,17 +290,26 @@ decode_and_add_ds(__be32 **pp, struct inode *inode)
214 290
215 /* ipv6 length plus port is legal */ 291 /* ipv6 length plus port is legal */
216 if (rlen > INET6_ADDRSTRLEN + 8) { 292 if (rlen > INET6_ADDRSTRLEN + 8) {
217 dprintk("%s Invalid address, length %d\n", __func__, 293 dprintk("%s: Invalid address, length %d\n", __func__,
218 rlen); 294 rlen);
219 goto out_err; 295 goto out_err;
220 } 296 }
221 buf = kmalloc(rlen + 1, GFP_KERNEL); 297 buf = kmalloc(rlen + 1, GFP_KERNEL);
298 if (!buf) {
299 dprintk("%s: Not enough memory\n", __func__);
300 goto out_err;
301 }
222 buf[rlen] = '\0'; 302 buf[rlen] = '\0';
223 memcpy(buf, r_addr, rlen); 303 memcpy(buf, r_addr, rlen);
224 304
225 /* replace the port dots with dashes for the in4_pton() delimiter*/ 305 /* replace the port dots with dashes for the in4_pton() delimiter*/
226 for (i = 0; i < 2; i++) { 306 for (i = 0; i < 2; i++) {
227 char *res = strrchr(buf, '.'); 307 char *res = strrchr(buf, '.');
308 if (!res) {
309 dprintk("%s: Failed finding expected dots in port\n",
310 __func__);
311 goto out_free;
312 }
228 *res = '-'; 313 *res = '-';
229 } 314 }
230 315
@@ -240,7 +325,7 @@ decode_and_add_ds(__be32 **pp, struct inode *inode)
240 port = htons((tmp[0] << 8) | (tmp[1])); 325 port = htons((tmp[0] << 8) | (tmp[1]));
241 326
242 ds = nfs4_pnfs_ds_add(inode, ip_addr, port); 327 ds = nfs4_pnfs_ds_add(inode, ip_addr, port);
243 dprintk("%s Decoded address and port %s\n", __func__, buf); 328 dprintk("%s: Decoded address and port %s\n", __func__, buf);
244out_free: 329out_free:
245 kfree(buf); 330 kfree(buf);
246out_err: 331out_err:
@@ -291,7 +376,7 @@ decode_device(struct inode *ino, struct pnfs_device *pdev)
291 dsaddr->stripe_count = cnt; 376 dsaddr->stripe_count = cnt;
292 dsaddr->ds_num = num; 377 dsaddr->ds_num = num;
293 378
294 memcpy(&dsaddr->deviceid.de_id, &pdev->dev_id, sizeof(pdev->dev_id)); 379 memcpy(&dsaddr->deviceid, &pdev->dev_id, sizeof(pdev->dev_id));
295 380
296 /* Go back an read stripe indices */ 381 /* Go back an read stripe indices */
297 p = indicesp; 382 p = indicesp;
@@ -341,28 +426,37 @@ out_err:
341} 426}
342 427
343/* 428/*
344 * Decode the opaque device specified in 'dev' 429 * Decode the opaque device specified in 'dev' and add it to the cache of
345 * and add it to the list of available devices. 430 * available devices.
346 * If the deviceid is already cached, nfs4_add_deviceid will return
347 * a pointer to the cached struct and throw away the new.
348 */ 431 */
349static struct nfs4_file_layout_dsaddr* 432static struct nfs4_file_layout_dsaddr *
350decode_and_add_device(struct inode *inode, struct pnfs_device *dev) 433decode_and_add_device(struct inode *inode, struct pnfs_device *dev)
351{ 434{
352 struct nfs4_file_layout_dsaddr *dsaddr; 435 struct nfs4_file_layout_dsaddr *d, *new;
353 struct pnfs_deviceid_node *d; 436 long hash;
354 437
355 dsaddr = decode_device(inode, dev); 438 new = decode_device(inode, dev);
356 if (!dsaddr) { 439 if (!new) {
357 printk(KERN_WARNING "%s: Could not decode or add device\n", 440 printk(KERN_WARNING "%s: Could not decode or add device\n",
358 __func__); 441 __func__);
359 return NULL; 442 return NULL;
360 } 443 }
361 444
362 d = pnfs_add_deviceid(NFS_SERVER(inode)->nfs_client->cl_devid_cache, 445 spin_lock(&filelayout_deviceid_lock);
363 &dsaddr->deviceid); 446 d = nfs4_fl_find_get_deviceid(&new->deviceid);
447 if (d) {
448 spin_unlock(&filelayout_deviceid_lock);
449 nfs4_fl_free_deviceid(new);
450 return d;
451 }
364 452
365 return container_of(d, struct nfs4_file_layout_dsaddr, deviceid); 453 INIT_HLIST_NODE(&new->node);
454 atomic_set(&new->ref, 1);
455 hash = nfs4_fl_deviceid_hash(&new->deviceid);
456 hlist_add_head_rcu(&new->node, &filelayout_deviceid_cache[hash]);
457 spin_unlock(&filelayout_deviceid_lock);
458
459 return new;
366} 460}
367 461
368/* 462/*
@@ -437,12 +531,123 @@ out_free:
437 return dsaddr; 531 return dsaddr;
438} 532}
439 533
534void
535nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr)
536{
537 if (atomic_dec_and_lock(&dsaddr->ref, &filelayout_deviceid_lock)) {
538 hlist_del_rcu(&dsaddr->node);
539 spin_unlock(&filelayout_deviceid_lock);
540
541 synchronize_rcu();
542 nfs4_fl_free_deviceid(dsaddr);
543 }
544}
545
440struct nfs4_file_layout_dsaddr * 546struct nfs4_file_layout_dsaddr *
441nfs4_fl_find_get_deviceid(struct nfs_client *clp, struct nfs4_deviceid *id) 547nfs4_fl_find_get_deviceid(struct nfs4_deviceid *id)
548{
549 struct nfs4_file_layout_dsaddr *d;
550 struct hlist_node *n;
551 long hash = nfs4_fl_deviceid_hash(id);
552
553
554 rcu_read_lock();
555 hlist_for_each_entry_rcu(d, n, &filelayout_deviceid_cache[hash], node) {
556 if (!memcmp(&d->deviceid, id, sizeof(*id))) {
557 if (!atomic_inc_not_zero(&d->ref))
558 goto fail;
559 rcu_read_unlock();
560 return d;
561 }
562 }
563fail:
564 rcu_read_unlock();
565 return NULL;
566}
567
568/*
569 * Want res = (offset - layout->pattern_offset)/ layout->stripe_unit
570 * Then: ((res + fsi) % dsaddr->stripe_count)
571 */
572u32
573nfs4_fl_calc_j_index(struct pnfs_layout_segment *lseg, loff_t offset)
574{
575 struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg);
576 u64 tmp;
577
578 tmp = offset - flseg->pattern_offset;
579 do_div(tmp, flseg->stripe_unit);
580 tmp += flseg->first_stripe_index;
581 return do_div(tmp, flseg->dsaddr->stripe_count);
582}
583
584u32
585nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, u32 j)
586{
587 return FILELAYOUT_LSEG(lseg)->dsaddr->stripe_indices[j];
588}
589
590struct nfs_fh *
591nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, u32 j)
592{
593 struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg);
594 u32 i;
595
596 if (flseg->stripe_type == STRIPE_SPARSE) {
597 if (flseg->num_fh == 1)
598 i = 0;
599 else if (flseg->num_fh == 0)
600 /* Use the MDS OPEN fh set in nfs_read_rpcsetup */
601 return NULL;
602 else
603 i = nfs4_fl_calc_ds_index(lseg, j);
604 } else
605 i = j;
606 return flseg->fh_array[i];
607}
608
609static void
610filelayout_mark_devid_negative(struct nfs4_file_layout_dsaddr *dsaddr,
611 int err, u32 ds_addr)
612{
613 u32 *p = (u32 *)&dsaddr->deviceid;
614
615 printk(KERN_ERR "NFS: data server %x connection error %d."
616 " Deviceid [%x%x%x%x] marked out of use.\n",
617 ds_addr, err, p[0], p[1], p[2], p[3]);
618
619 spin_lock(&filelayout_deviceid_lock);
620 dsaddr->flags |= NFS4_DEVICE_ID_NEG_ENTRY;
621 spin_unlock(&filelayout_deviceid_lock);
622}
623
624struct nfs4_pnfs_ds *
625nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx)
442{ 626{
443 struct pnfs_deviceid_node *d; 627 struct nfs4_file_layout_dsaddr *dsaddr = FILELAYOUT_LSEG(lseg)->dsaddr;
628 struct nfs4_pnfs_ds *ds = dsaddr->ds_list[ds_idx];
629
630 if (ds == NULL) {
631 printk(KERN_ERR "%s: No data server for offset index %d\n",
632 __func__, ds_idx);
633 return NULL;
634 }
444 635
445 d = pnfs_find_get_deviceid(clp->cl_devid_cache, id); 636 if (!ds->ds_clp) {
446 return (d == NULL) ? NULL : 637 struct nfs_server *s = NFS_SERVER(lseg->pls_layout->plh_inode);
447 container_of(d, struct nfs4_file_layout_dsaddr, deviceid); 638 int err;
639
640 if (dsaddr->flags & NFS4_DEVICE_ID_NEG_ENTRY) {
641 /* Already tried to connect, don't try again */
642 dprintk("%s Deviceid marked out of use\n", __func__);
643 return NULL;
644 }
645 err = nfs4_ds_connect(s, ds);
646 if (err) {
647 filelayout_mark_devid_negative(dsaddr, err,
648 ntohl(ds->ds_ip_addr));
649 return NULL;
650 }
651 }
652 return ds;
448} 653}
diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c
index 3c2a1724fbd2..bb80c49b6533 100644
--- a/fs/nfs/nfs4namespace.c
+++ b/fs/nfs/nfs4namespace.c
@@ -54,33 +54,29 @@ Elong:
54/* 54/*
55 * Determine the mount path as a string 55 * Determine the mount path as a string
56 */ 56 */
57static char *nfs4_path(const struct vfsmount *mnt_parent, 57static char *nfs4_path(struct dentry *dentry, char *buffer, ssize_t buflen)
58 const struct dentry *dentry,
59 char *buffer, ssize_t buflen)
60{ 58{
61 const char *srvpath; 59 char *limit;
62 60 char *path = nfs_path(&limit, dentry, buffer, buflen);
63 srvpath = strchr(mnt_parent->mnt_devname, ':'); 61 if (!IS_ERR(path)) {
64 if (srvpath) 62 char *colon = strchr(path, ':');
65 srvpath++; 63 if (colon && colon < limit)
66 else 64 path = colon + 1;
67 srvpath = mnt_parent->mnt_devname; 65 }
68 66 return path;
69 return nfs_path(srvpath, mnt_parent->mnt_root, dentry, buffer, buflen);
70} 67}
71 68
72/* 69/*
73 * Check that fs_locations::fs_root [RFC3530 6.3] is a prefix for what we 70 * Check that fs_locations::fs_root [RFC3530 6.3] is a prefix for what we
74 * believe to be the server path to this dentry 71 * believe to be the server path to this dentry
75 */ 72 */
76static int nfs4_validate_fspath(const struct vfsmount *mnt_parent, 73static int nfs4_validate_fspath(struct dentry *dentry,
77 const struct dentry *dentry,
78 const struct nfs4_fs_locations *locations, 74 const struct nfs4_fs_locations *locations,
79 char *page, char *page2) 75 char *page, char *page2)
80{ 76{
81 const char *path, *fs_path; 77 const char *path, *fs_path;
82 78
83 path = nfs4_path(mnt_parent, dentry, page, PAGE_SIZE); 79 path = nfs4_path(dentry, page, PAGE_SIZE);
84 if (IS_ERR(path)) 80 if (IS_ERR(path))
85 return PTR_ERR(path); 81 return PTR_ERR(path);
86 82
@@ -165,20 +161,18 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata,
165 161
166/** 162/**
167 * nfs_follow_referral - set up mountpoint when hitting a referral on moved error 163 * nfs_follow_referral - set up mountpoint when hitting a referral on moved error
168 * @mnt_parent - mountpoint of parent directory
169 * @dentry - parent directory 164 * @dentry - parent directory
170 * @locations - array of NFSv4 server location information 165 * @locations - array of NFSv4 server location information
171 * 166 *
172 */ 167 */
173static struct vfsmount *nfs_follow_referral(const struct vfsmount *mnt_parent, 168static struct vfsmount *nfs_follow_referral(struct dentry *dentry,
174 const struct dentry *dentry,
175 const struct nfs4_fs_locations *locations) 169 const struct nfs4_fs_locations *locations)
176{ 170{
177 struct vfsmount *mnt = ERR_PTR(-ENOENT); 171 struct vfsmount *mnt = ERR_PTR(-ENOENT);
178 struct nfs_clone_mount mountdata = { 172 struct nfs_clone_mount mountdata = {
179 .sb = mnt_parent->mnt_sb, 173 .sb = dentry->d_sb,
180 .dentry = dentry, 174 .dentry = dentry,
181 .authflavor = NFS_SB(mnt_parent->mnt_sb)->client->cl_auth->au_flavor, 175 .authflavor = NFS_SB(dentry->d_sb)->client->cl_auth->au_flavor,
182 }; 176 };
183 char *page = NULL, *page2 = NULL; 177 char *page = NULL, *page2 = NULL;
184 int loc, error; 178 int loc, error;
@@ -198,7 +192,7 @@ static struct vfsmount *nfs_follow_referral(const struct vfsmount *mnt_parent,
198 goto out; 192 goto out;
199 193
200 /* Ensure fs path is a prefix of current dentry path */ 194 /* Ensure fs path is a prefix of current dentry path */
201 error = nfs4_validate_fspath(mnt_parent, dentry, locations, page, page2); 195 error = nfs4_validate_fspath(dentry, locations, page, page2);
202 if (error < 0) { 196 if (error < 0) {
203 mnt = ERR_PTR(error); 197 mnt = ERR_PTR(error);
204 goto out; 198 goto out;
@@ -225,11 +219,10 @@ out:
225 219
226/* 220/*
227 * nfs_do_refmount - handle crossing a referral on server 221 * nfs_do_refmount - handle crossing a referral on server
228 * @mnt_parent - mountpoint of referral
229 * @dentry - dentry of referral 222 * @dentry - dentry of referral
230 * 223 *
231 */ 224 */
232struct vfsmount *nfs_do_refmount(const struct vfsmount *mnt_parent, struct dentry *dentry) 225struct vfsmount *nfs_do_refmount(struct dentry *dentry)
233{ 226{
234 struct vfsmount *mnt = ERR_PTR(-ENOMEM); 227 struct vfsmount *mnt = ERR_PTR(-ENOMEM);
235 struct dentry *parent; 228 struct dentry *parent;
@@ -262,7 +255,7 @@ struct vfsmount *nfs_do_refmount(const struct vfsmount *mnt_parent, struct dentr
262 fs_locations->fs_path.ncomponents <= 0) 255 fs_locations->fs_path.ncomponents <= 0)
263 goto out_free; 256 goto out_free;
264 257
265 mnt = nfs_follow_referral(mnt_parent, dentry, fs_locations); 258 mnt = nfs_follow_referral(dentry, fs_locations);
266out_free: 259out_free:
267 __free_page(page); 260 __free_page(page);
268 kfree(fs_locations); 261 kfree(fs_locations);
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 9d992b0346e3..1d84e7088af9 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -50,6 +50,7 @@
50#include <linux/module.h> 50#include <linux/module.h>
51#include <linux/sunrpc/bc_xprt.h> 51#include <linux/sunrpc/bc_xprt.h>
52#include <linux/xattr.h> 52#include <linux/xattr.h>
53#include <linux/utsname.h>
53 54
54#include "nfs4_fs.h" 55#include "nfs4_fs.h"
55#include "delegation.h" 56#include "delegation.h"
@@ -84,6 +85,9 @@ static int nfs4_map_errors(int err)
84 switch (err) { 85 switch (err) {
85 case -NFS4ERR_RESOURCE: 86 case -NFS4ERR_RESOURCE:
86 return -EREMOTEIO; 87 return -EREMOTEIO;
88 case -NFS4ERR_BADOWNER:
89 case -NFS4ERR_BADNAME:
90 return -EINVAL;
87 default: 91 default:
88 dprintk("%s could not handle NFSv4 error %d\n", 92 dprintk("%s could not handle NFSv4 error %d\n",
89 __func__, -err); 93 __func__, -err);
@@ -240,7 +244,7 @@ static int nfs4_delay(struct rpc_clnt *clnt, long *timeout)
240/* This is the error handling routine for processes that are allowed 244/* This is the error handling routine for processes that are allowed
241 * to sleep. 245 * to sleep.
242 */ 246 */
243static int nfs4_handle_exception(const struct nfs_server *server, int errorcode, struct nfs4_exception *exception) 247static int nfs4_handle_exception(struct nfs_server *server, int errorcode, struct nfs4_exception *exception)
244{ 248{
245 struct nfs_client *clp = server->nfs_client; 249 struct nfs_client *clp = server->nfs_client;
246 struct nfs4_state *state = exception->state; 250 struct nfs4_state *state = exception->state;
@@ -255,12 +259,13 @@ static int nfs4_handle_exception(const struct nfs_server *server, int errorcode,
255 case -NFS4ERR_OPENMODE: 259 case -NFS4ERR_OPENMODE:
256 if (state == NULL) 260 if (state == NULL)
257 break; 261 break;
258 nfs4_state_mark_reclaim_nograce(clp, state); 262 nfs4_schedule_stateid_recovery(server, state);
259 goto do_state_recovery; 263 goto wait_on_recovery;
260 case -NFS4ERR_STALE_STATEID: 264 case -NFS4ERR_STALE_STATEID:
261 case -NFS4ERR_STALE_CLIENTID: 265 case -NFS4ERR_STALE_CLIENTID:
262 case -NFS4ERR_EXPIRED: 266 case -NFS4ERR_EXPIRED:
263 goto do_state_recovery; 267 nfs4_schedule_lease_recovery(clp);
268 goto wait_on_recovery;
264#if defined(CONFIG_NFS_V4_1) 269#if defined(CONFIG_NFS_V4_1)
265 case -NFS4ERR_BADSESSION: 270 case -NFS4ERR_BADSESSION:
266 case -NFS4ERR_BADSLOT: 271 case -NFS4ERR_BADSLOT:
@@ -271,7 +276,7 @@ static int nfs4_handle_exception(const struct nfs_server *server, int errorcode,
271 case -NFS4ERR_SEQ_MISORDERED: 276 case -NFS4ERR_SEQ_MISORDERED:
272 dprintk("%s ERROR: %d Reset session\n", __func__, 277 dprintk("%s ERROR: %d Reset session\n", __func__,
273 errorcode); 278 errorcode);
274 nfs4_schedule_state_recovery(clp); 279 nfs4_schedule_session_recovery(clp->cl_session);
275 exception->retry = 1; 280 exception->retry = 1;
276 break; 281 break;
277#endif /* defined(CONFIG_NFS_V4_1) */ 282#endif /* defined(CONFIG_NFS_V4_1) */
@@ -291,11 +296,23 @@ static int nfs4_handle_exception(const struct nfs_server *server, int errorcode,
291 break; 296 break;
292 case -NFS4ERR_OLD_STATEID: 297 case -NFS4ERR_OLD_STATEID:
293 exception->retry = 1; 298 exception->retry = 1;
299 break;
300 case -NFS4ERR_BADOWNER:
301 /* The following works around a Linux server bug! */
302 case -NFS4ERR_BADNAME:
303 if (server->caps & NFS_CAP_UIDGID_NOMAP) {
304 server->caps &= ~NFS_CAP_UIDGID_NOMAP;
305 exception->retry = 1;
306 printk(KERN_WARNING "NFS: v4 server %s "
307 "does not accept raw "
308 "uid/gids. "
309 "Reenabling the idmapper.\n",
310 server->nfs_client->cl_hostname);
311 }
294 } 312 }
295 /* We failed to handle the error */ 313 /* We failed to handle the error */
296 return nfs4_map_errors(ret); 314 return nfs4_map_errors(ret);
297do_state_recovery: 315wait_on_recovery:
298 nfs4_schedule_state_recovery(clp);
299 ret = nfs4_wait_clnt_recover(clp); 316 ret = nfs4_wait_clnt_recover(clp);
300 if (ret == 0) 317 if (ret == 0)
301 exception->retry = 1; 318 exception->retry = 1;
@@ -434,8 +451,8 @@ static int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *
434 clp = res->sr_session->clp; 451 clp = res->sr_session->clp;
435 do_renew_lease(clp, timestamp); 452 do_renew_lease(clp, timestamp);
436 /* Check sequence flags */ 453 /* Check sequence flags */
437 if (atomic_read(&clp->cl_count) > 1) 454 if (res->sr_status_flags != 0)
438 nfs41_handle_sequence_flag_errors(clp, res->sr_status_flags); 455 nfs4_schedule_lease_recovery(clp);
439 break; 456 break;
440 case -NFS4ERR_DELAY: 457 case -NFS4ERR_DELAY:
441 /* The server detected a resend of the RPC call and 458 /* The server detected a resend of the RPC call and
@@ -504,7 +521,7 @@ out:
504 return ret_id; 521 return ret_id;
505} 522}
506 523
507static int nfs41_setup_sequence(struct nfs4_session *session, 524int nfs41_setup_sequence(struct nfs4_session *session,
508 struct nfs4_sequence_args *args, 525 struct nfs4_sequence_args *args,
509 struct nfs4_sequence_res *res, 526 struct nfs4_sequence_res *res,
510 int cache_reply, 527 int cache_reply,
@@ -570,6 +587,7 @@ static int nfs41_setup_sequence(struct nfs4_session *session,
570 res->sr_status = 1; 587 res->sr_status = 1;
571 return 0; 588 return 0;
572} 589}
590EXPORT_SYMBOL_GPL(nfs41_setup_sequence);
573 591
574int nfs4_setup_sequence(const struct nfs_server *server, 592int nfs4_setup_sequence(const struct nfs_server *server,
575 struct nfs4_sequence_args *args, 593 struct nfs4_sequence_args *args,
@@ -1254,14 +1272,13 @@ int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state
1254 case -NFS4ERR_BAD_HIGH_SLOT: 1272 case -NFS4ERR_BAD_HIGH_SLOT:
1255 case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: 1273 case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
1256 case -NFS4ERR_DEADSESSION: 1274 case -NFS4ERR_DEADSESSION:
1257 nfs4_schedule_state_recovery( 1275 nfs4_schedule_session_recovery(server->nfs_client->cl_session);
1258 server->nfs_client);
1259 goto out; 1276 goto out;
1260 case -NFS4ERR_STALE_CLIENTID: 1277 case -NFS4ERR_STALE_CLIENTID:
1261 case -NFS4ERR_STALE_STATEID: 1278 case -NFS4ERR_STALE_STATEID:
1262 case -NFS4ERR_EXPIRED: 1279 case -NFS4ERR_EXPIRED:
1263 /* Don't recall a delegation if it was lost */ 1280 /* Don't recall a delegation if it was lost */
1264 nfs4_schedule_state_recovery(server->nfs_client); 1281 nfs4_schedule_lease_recovery(server->nfs_client);
1265 goto out; 1282 goto out;
1266 case -ERESTARTSYS: 1283 case -ERESTARTSYS:
1267 /* 1284 /*
@@ -1270,7 +1287,7 @@ int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state
1270 */ 1287 */
1271 case -NFS4ERR_ADMIN_REVOKED: 1288 case -NFS4ERR_ADMIN_REVOKED:
1272 case -NFS4ERR_BAD_STATEID: 1289 case -NFS4ERR_BAD_STATEID:
1273 nfs4_state_mark_reclaim_nograce(server->nfs_client, state); 1290 nfs4_schedule_stateid_recovery(server, state);
1274 case -EKEYEXPIRED: 1291 case -EKEYEXPIRED:
1275 /* 1292 /*
1276 * User RPCSEC_GSS context has expired. 1293 * User RPCSEC_GSS context has expired.
@@ -1573,9 +1590,8 @@ static int _nfs4_proc_open(struct nfs4_opendata *data)
1573 return 0; 1590 return 0;
1574} 1591}
1575 1592
1576static int nfs4_recover_expired_lease(struct nfs_server *server) 1593static int nfs4_client_recover_expired_lease(struct nfs_client *clp)
1577{ 1594{
1578 struct nfs_client *clp = server->nfs_client;
1579 unsigned int loop; 1595 unsigned int loop;
1580 int ret; 1596 int ret;
1581 1597
@@ -1586,12 +1602,17 @@ static int nfs4_recover_expired_lease(struct nfs_server *server)
1586 if (!test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) && 1602 if (!test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) &&
1587 !test_bit(NFS4CLNT_CHECK_LEASE,&clp->cl_state)) 1603 !test_bit(NFS4CLNT_CHECK_LEASE,&clp->cl_state))
1588 break; 1604 break;
1589 nfs4_schedule_state_recovery(clp); 1605 nfs4_schedule_state_manager(clp);
1590 ret = -EIO; 1606 ret = -EIO;
1591 } 1607 }
1592 return ret; 1608 return ret;
1593} 1609}
1594 1610
1611static int nfs4_recover_expired_lease(struct nfs_server *server)
1612{
1613 return nfs4_client_recover_expired_lease(server->nfs_client);
1614}
1615
1595/* 1616/*
1596 * OPEN_EXPIRED: 1617 * OPEN_EXPIRED:
1597 * reclaim state on the server after a network partition. 1618 * reclaim state on the server after a network partition.
@@ -3069,15 +3090,10 @@ static int nfs4_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle,
3069 return err; 3090 return err;
3070} 3091}
3071 3092
3072static int nfs4_read_done(struct rpc_task *task, struct nfs_read_data *data) 3093static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_read_data *data)
3073{ 3094{
3074 struct nfs_server *server = NFS_SERVER(data->inode); 3095 struct nfs_server *server = NFS_SERVER(data->inode);
3075 3096
3076 dprintk("--> %s\n", __func__);
3077
3078 if (!nfs4_sequence_done(task, &data->res.seq_res))
3079 return -EAGAIN;
3080
3081 if (nfs4_async_handle_error(task, server, data->args.context->state) == -EAGAIN) { 3097 if (nfs4_async_handle_error(task, server, data->args.context->state) == -EAGAIN) {
3082 nfs_restart_rpc(task, server->nfs_client); 3098 nfs_restart_rpc(task, server->nfs_client);
3083 return -EAGAIN; 3099 return -EAGAIN;
@@ -3089,19 +3105,44 @@ static int nfs4_read_done(struct rpc_task *task, struct nfs_read_data *data)
3089 return 0; 3105 return 0;
3090} 3106}
3091 3107
3108static int nfs4_read_done(struct rpc_task *task, struct nfs_read_data *data)
3109{
3110
3111 dprintk("--> %s\n", __func__);
3112
3113 if (!nfs4_sequence_done(task, &data->res.seq_res))
3114 return -EAGAIN;
3115
3116 return data->read_done_cb(task, data);
3117}
3118
3092static void nfs4_proc_read_setup(struct nfs_read_data *data, struct rpc_message *msg) 3119static void nfs4_proc_read_setup(struct nfs_read_data *data, struct rpc_message *msg)
3093{ 3120{
3094 data->timestamp = jiffies; 3121 data->timestamp = jiffies;
3122 data->read_done_cb = nfs4_read_done_cb;
3095 msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ]; 3123 msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ];
3096} 3124}
3097 3125
3098static int nfs4_write_done(struct rpc_task *task, struct nfs_write_data *data) 3126/* Reset the the nfs_read_data to send the read to the MDS. */
3127void nfs4_reset_read(struct rpc_task *task, struct nfs_read_data *data)
3128{
3129 dprintk("%s Reset task for i/o through\n", __func__);
3130 put_lseg(data->lseg);
3131 data->lseg = NULL;
3132 /* offsets will differ in the dense stripe case */
3133 data->args.offset = data->mds_offset;
3134 data->ds_clp = NULL;
3135 data->args.fh = NFS_FH(data->inode);
3136 data->read_done_cb = nfs4_read_done_cb;
3137 task->tk_ops = data->mds_ops;
3138 rpc_task_reset_client(task, NFS_CLIENT(data->inode));
3139}
3140EXPORT_SYMBOL_GPL(nfs4_reset_read);
3141
3142static int nfs4_write_done_cb(struct rpc_task *task, struct nfs_write_data *data)
3099{ 3143{
3100 struct inode *inode = data->inode; 3144 struct inode *inode = data->inode;
3101 3145
3102 if (!nfs4_sequence_done(task, &data->res.seq_res))
3103 return -EAGAIN;
3104
3105 if (nfs4_async_handle_error(task, NFS_SERVER(inode), data->args.context->state) == -EAGAIN) { 3146 if (nfs4_async_handle_error(task, NFS_SERVER(inode), data->args.context->state) == -EAGAIN) {
3106 nfs_restart_rpc(task, NFS_SERVER(inode)->nfs_client); 3147 nfs_restart_rpc(task, NFS_SERVER(inode)->nfs_client);
3107 return -EAGAIN; 3148 return -EAGAIN;
@@ -3113,11 +3154,41 @@ static int nfs4_write_done(struct rpc_task *task, struct nfs_write_data *data)
3113 return 0; 3154 return 0;
3114} 3155}
3115 3156
3157static int nfs4_write_done(struct rpc_task *task, struct nfs_write_data *data)
3158{
3159 if (!nfs4_sequence_done(task, &data->res.seq_res))
3160 return -EAGAIN;
3161 return data->write_done_cb(task, data);
3162}
3163
3164/* Reset the the nfs_write_data to send the write to the MDS. */
3165void nfs4_reset_write(struct rpc_task *task, struct nfs_write_data *data)
3166{
3167 dprintk("%s Reset task for i/o through\n", __func__);
3168 put_lseg(data->lseg);
3169 data->lseg = NULL;
3170 data->ds_clp = NULL;
3171 data->write_done_cb = nfs4_write_done_cb;
3172 data->args.fh = NFS_FH(data->inode);
3173 data->args.bitmask = data->res.server->cache_consistency_bitmask;
3174 data->args.offset = data->mds_offset;
3175 data->res.fattr = &data->fattr;
3176 task->tk_ops = data->mds_ops;
3177 rpc_task_reset_client(task, NFS_CLIENT(data->inode));
3178}
3179EXPORT_SYMBOL_GPL(nfs4_reset_write);
3180
3116static void nfs4_proc_write_setup(struct nfs_write_data *data, struct rpc_message *msg) 3181static void nfs4_proc_write_setup(struct nfs_write_data *data, struct rpc_message *msg)
3117{ 3182{
3118 struct nfs_server *server = NFS_SERVER(data->inode); 3183 struct nfs_server *server = NFS_SERVER(data->inode);
3119 3184
3120 data->args.bitmask = server->cache_consistency_bitmask; 3185 if (data->lseg) {
3186 data->args.bitmask = NULL;
3187 data->res.fattr = NULL;
3188 } else
3189 data->args.bitmask = server->cache_consistency_bitmask;
3190 if (!data->write_done_cb)
3191 data->write_done_cb = nfs4_write_done_cb;
3121 data->res.server = server; 3192 data->res.server = server;
3122 data->timestamp = jiffies; 3193 data->timestamp = jiffies;
3123 3194
@@ -3177,7 +3248,7 @@ static void nfs4_renew_done(struct rpc_task *task, void *calldata)
3177 if (task->tk_status < 0) { 3248 if (task->tk_status < 0) {
3178 /* Unless we're shutting down, schedule state recovery! */ 3249 /* Unless we're shutting down, schedule state recovery! */
3179 if (test_bit(NFS_CS_RENEWD, &clp->cl_res_state) != 0) 3250 if (test_bit(NFS_CS_RENEWD, &clp->cl_res_state) != 0)
3180 nfs4_schedule_state_recovery(clp); 3251 nfs4_schedule_lease_recovery(clp);
3181 return; 3252 return;
3182 } 3253 }
3183 do_renew_lease(clp, timestamp); 3254 do_renew_lease(clp, timestamp);
@@ -3251,6 +3322,35 @@ static void buf_to_pages(const void *buf, size_t buflen,
3251 } 3322 }
3252} 3323}
3253 3324
3325static int buf_to_pages_noslab(const void *buf, size_t buflen,
3326 struct page **pages, unsigned int *pgbase)
3327{
3328 struct page *newpage, **spages;
3329 int rc = 0;
3330 size_t len;
3331 spages = pages;
3332
3333 do {
3334 len = min_t(size_t, PAGE_CACHE_SIZE, buflen);
3335 newpage = alloc_page(GFP_KERNEL);
3336
3337 if (newpage == NULL)
3338 goto unwind;
3339 memcpy(page_address(newpage), buf, len);
3340 buf += len;
3341 buflen -= len;
3342 *pages++ = newpage;
3343 rc++;
3344 } while (buflen != 0);
3345
3346 return rc;
3347
3348unwind:
3349 for(; rc > 0; rc--)
3350 __free_page(spages[rc-1]);
3351 return -ENOMEM;
3352}
3353
3254struct nfs4_cached_acl { 3354struct nfs4_cached_acl {
3255 int cached; 3355 int cached;
3256 size_t len; 3356 size_t len;
@@ -3419,13 +3519,23 @@ static int __nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t bufl
3419 .rpc_argp = &arg, 3519 .rpc_argp = &arg,
3420 .rpc_resp = &res, 3520 .rpc_resp = &res,
3421 }; 3521 };
3422 int ret; 3522 int ret, i;
3423 3523
3424 if (!nfs4_server_supports_acls(server)) 3524 if (!nfs4_server_supports_acls(server))
3425 return -EOPNOTSUPP; 3525 return -EOPNOTSUPP;
3526 i = buf_to_pages_noslab(buf, buflen, arg.acl_pages, &arg.acl_pgbase);
3527 if (i < 0)
3528 return i;
3426 nfs_inode_return_delegation(inode); 3529 nfs_inode_return_delegation(inode);
3427 buf_to_pages(buf, buflen, arg.acl_pages, &arg.acl_pgbase);
3428 ret = nfs4_call_sync(server, &msg, &arg, &res, 1); 3530 ret = nfs4_call_sync(server, &msg, &arg, &res, 1);
3531
3532 /*
3533 * Free each page after tx, so the only ref left is
3534 * held by the network stack
3535 */
3536 for (; i > 0; i--)
3537 put_page(pages[i-1]);
3538
3429 /* 3539 /*
3430 * Acl update can result in inode attribute update. 3540 * Acl update can result in inode attribute update.
3431 * so mark the attribute cache invalid. 3541 * so mark the attribute cache invalid.
@@ -3463,12 +3573,13 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
3463 case -NFS4ERR_OPENMODE: 3573 case -NFS4ERR_OPENMODE:
3464 if (state == NULL) 3574 if (state == NULL)
3465 break; 3575 break;
3466 nfs4_state_mark_reclaim_nograce(clp, state); 3576 nfs4_schedule_stateid_recovery(server, state);
3467 goto do_state_recovery; 3577 goto wait_on_recovery;
3468 case -NFS4ERR_STALE_STATEID: 3578 case -NFS4ERR_STALE_STATEID:
3469 case -NFS4ERR_STALE_CLIENTID: 3579 case -NFS4ERR_STALE_CLIENTID:
3470 case -NFS4ERR_EXPIRED: 3580 case -NFS4ERR_EXPIRED:
3471 goto do_state_recovery; 3581 nfs4_schedule_lease_recovery(clp);
3582 goto wait_on_recovery;
3472#if defined(CONFIG_NFS_V4_1) 3583#if defined(CONFIG_NFS_V4_1)
3473 case -NFS4ERR_BADSESSION: 3584 case -NFS4ERR_BADSESSION:
3474 case -NFS4ERR_BADSLOT: 3585 case -NFS4ERR_BADSLOT:
@@ -3479,7 +3590,7 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
3479 case -NFS4ERR_SEQ_MISORDERED: 3590 case -NFS4ERR_SEQ_MISORDERED:
3480 dprintk("%s ERROR %d, Reset session\n", __func__, 3591 dprintk("%s ERROR %d, Reset session\n", __func__,
3481 task->tk_status); 3592 task->tk_status);
3482 nfs4_schedule_state_recovery(clp); 3593 nfs4_schedule_session_recovery(clp->cl_session);
3483 task->tk_status = 0; 3594 task->tk_status = 0;
3484 return -EAGAIN; 3595 return -EAGAIN;
3485#endif /* CONFIG_NFS_V4_1 */ 3596#endif /* CONFIG_NFS_V4_1 */
@@ -3496,9 +3607,8 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
3496 } 3607 }
3497 task->tk_status = nfs4_map_errors(task->tk_status); 3608 task->tk_status = nfs4_map_errors(task->tk_status);
3498 return 0; 3609 return 0;
3499do_state_recovery: 3610wait_on_recovery:
3500 rpc_sleep_on(&clp->cl_rpcwaitq, task, NULL); 3611 rpc_sleep_on(&clp->cl_rpcwaitq, task, NULL);
3501 nfs4_schedule_state_recovery(clp);
3502 if (test_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) == 0) 3612 if (test_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) == 0)
3503 rpc_wake_up_queued_task(&clp->cl_rpcwaitq, task); 3613 rpc_wake_up_queued_task(&clp->cl_rpcwaitq, task);
3504 task->tk_status = 0; 3614 task->tk_status = 0;
@@ -4109,7 +4219,7 @@ static void nfs4_lock_release(void *calldata)
4109 task = nfs4_do_unlck(&data->fl, data->ctx, data->lsp, 4219 task = nfs4_do_unlck(&data->fl, data->ctx, data->lsp,
4110 data->arg.lock_seqid); 4220 data->arg.lock_seqid);
4111 if (!IS_ERR(task)) 4221 if (!IS_ERR(task))
4112 rpc_put_task(task); 4222 rpc_put_task_async(task);
4113 dprintk("%s: cancelling lock!\n", __func__); 4223 dprintk("%s: cancelling lock!\n", __func__);
4114 } else 4224 } else
4115 nfs_free_seqid(data->arg.lock_seqid); 4225 nfs_free_seqid(data->arg.lock_seqid);
@@ -4133,23 +4243,18 @@ static const struct rpc_call_ops nfs4_recover_lock_ops = {
4133 4243
4134static void nfs4_handle_setlk_error(struct nfs_server *server, struct nfs4_lock_state *lsp, int new_lock_owner, int error) 4244static void nfs4_handle_setlk_error(struct nfs_server *server, struct nfs4_lock_state *lsp, int new_lock_owner, int error)
4135{ 4245{
4136 struct nfs_client *clp = server->nfs_client;
4137 struct nfs4_state *state = lsp->ls_state;
4138
4139 switch (error) { 4246 switch (error) {
4140 case -NFS4ERR_ADMIN_REVOKED: 4247 case -NFS4ERR_ADMIN_REVOKED:
4141 case -NFS4ERR_BAD_STATEID: 4248 case -NFS4ERR_BAD_STATEID:
4142 case -NFS4ERR_EXPIRED: 4249 lsp->ls_seqid.flags &= ~NFS_SEQID_CONFIRMED;
4143 if (new_lock_owner != 0 || 4250 if (new_lock_owner != 0 ||
4144 (lsp->ls_flags & NFS_LOCK_INITIALIZED) != 0) 4251 (lsp->ls_flags & NFS_LOCK_INITIALIZED) != 0)
4145 nfs4_state_mark_reclaim_nograce(clp, state); 4252 nfs4_schedule_stateid_recovery(server, lsp->ls_state);
4146 lsp->ls_seqid.flags &= ~NFS_SEQID_CONFIRMED;
4147 break; 4253 break;
4148 case -NFS4ERR_STALE_STATEID: 4254 case -NFS4ERR_STALE_STATEID:
4149 if (new_lock_owner != 0 ||
4150 (lsp->ls_flags & NFS_LOCK_INITIALIZED) != 0)
4151 nfs4_state_mark_reclaim_reboot(clp, state);
4152 lsp->ls_seqid.flags &= ~NFS_SEQID_CONFIRMED; 4255 lsp->ls_seqid.flags &= ~NFS_SEQID_CONFIRMED;
4256 case -NFS4ERR_EXPIRED:
4257 nfs4_schedule_lease_recovery(server->nfs_client);
4153 }; 4258 };
4154} 4259}
4155 4260
@@ -4365,12 +4470,14 @@ int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl)
4365 case -NFS4ERR_EXPIRED: 4470 case -NFS4ERR_EXPIRED:
4366 case -NFS4ERR_STALE_CLIENTID: 4471 case -NFS4ERR_STALE_CLIENTID:
4367 case -NFS4ERR_STALE_STATEID: 4472 case -NFS4ERR_STALE_STATEID:
4473 nfs4_schedule_lease_recovery(server->nfs_client);
4474 goto out;
4368 case -NFS4ERR_BADSESSION: 4475 case -NFS4ERR_BADSESSION:
4369 case -NFS4ERR_BADSLOT: 4476 case -NFS4ERR_BADSLOT:
4370 case -NFS4ERR_BAD_HIGH_SLOT: 4477 case -NFS4ERR_BAD_HIGH_SLOT:
4371 case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: 4478 case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
4372 case -NFS4ERR_DEADSESSION: 4479 case -NFS4ERR_DEADSESSION:
4373 nfs4_schedule_state_recovery(server->nfs_client); 4480 nfs4_schedule_session_recovery(server->nfs_client->cl_session);
4374 goto out; 4481 goto out;
4375 case -ERESTARTSYS: 4482 case -ERESTARTSYS:
4376 /* 4483 /*
@@ -4380,7 +4487,7 @@ int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl)
4380 case -NFS4ERR_ADMIN_REVOKED: 4487 case -NFS4ERR_ADMIN_REVOKED:
4381 case -NFS4ERR_BAD_STATEID: 4488 case -NFS4ERR_BAD_STATEID:
4382 case -NFS4ERR_OPENMODE: 4489 case -NFS4ERR_OPENMODE:
4383 nfs4_state_mark_reclaim_nograce(server->nfs_client, state); 4490 nfs4_schedule_stateid_recovery(server, state);
4384 err = 0; 4491 err = 0;
4385 goto out; 4492 goto out;
4386 case -EKEYEXPIRED: 4493 case -EKEYEXPIRED:
@@ -4572,27 +4679,16 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred)
4572 *p = htonl((u32)clp->cl_boot_time.tv_nsec); 4679 *p = htonl((u32)clp->cl_boot_time.tv_nsec);
4573 args.verifier = &verifier; 4680 args.verifier = &verifier;
4574 4681
4575 while (1) { 4682 args.id_len = scnprintf(args.id, sizeof(args.id),
4576 args.id_len = scnprintf(args.id, sizeof(args.id), 4683 "%s/%s.%s/%u",
4577 "%s/%s %u", 4684 clp->cl_ipaddr,
4578 clp->cl_ipaddr, 4685 init_utsname()->nodename,
4579 rpc_peeraddr2str(clp->cl_rpcclient, 4686 init_utsname()->domainname,
4580 RPC_DISPLAY_ADDR), 4687 clp->cl_rpcclient->cl_auth->au_flavor);
4581 clp->cl_id_uniquifier);
4582
4583 status = rpc_call_sync(clp->cl_rpcclient, &msg, 0);
4584
4585 if (status != -NFS4ERR_CLID_INUSE)
4586 break;
4587
4588 if (signalled())
4589 break;
4590
4591 if (++clp->cl_id_uniquifier == 0)
4592 break;
4593 }
4594 4688
4595 status = nfs4_check_cl_exchange_flags(clp->cl_exchange_flags); 4689 status = rpc_call_sync(clp->cl_rpcclient, &msg, 0);
4690 if (!status)
4691 status = nfs4_check_cl_exchange_flags(clp->cl_exchange_flags);
4596 dprintk("<-- %s status= %d\n", __func__, status); 4692 dprintk("<-- %s status= %d\n", __func__, status);
4597 return status; 4693 return status;
4598} 4694}
@@ -4998,10 +5094,20 @@ int nfs4_proc_create_session(struct nfs_client *clp)
4998 int status; 5094 int status;
4999 unsigned *ptr; 5095 unsigned *ptr;
5000 struct nfs4_session *session = clp->cl_session; 5096 struct nfs4_session *session = clp->cl_session;
5097 long timeout = 0;
5098 int err;
5001 5099
5002 dprintk("--> %s clp=%p session=%p\n", __func__, clp, session); 5100 dprintk("--> %s clp=%p session=%p\n", __func__, clp, session);
5003 5101
5004 status = _nfs4_proc_create_session(clp); 5102 do {
5103 status = _nfs4_proc_create_session(clp);
5104 if (status == -NFS4ERR_DELAY) {
5105 err = nfs4_delay(clp->cl_rpcclient, &timeout);
5106 if (err)
5107 status = err;
5108 }
5109 } while (status == -NFS4ERR_DELAY);
5110
5005 if (status) 5111 if (status)
5006 goto out; 5112 goto out;
5007 5113
@@ -5083,6 +5189,27 @@ int nfs4_init_session(struct nfs_server *server)
5083 return ret; 5189 return ret;
5084} 5190}
5085 5191
5192int nfs4_init_ds_session(struct nfs_client *clp)
5193{
5194 struct nfs4_session *session = clp->cl_session;
5195 int ret;
5196
5197 if (!test_and_clear_bit(NFS4_SESSION_INITING, &session->session_state))
5198 return 0;
5199
5200 ret = nfs4_client_recover_expired_lease(clp);
5201 if (!ret)
5202 /* Test for the DS role */
5203 if (!is_ds_client(clp))
5204 ret = -ENODEV;
5205 if (!ret)
5206 ret = nfs4_check_client_ready(clp);
5207 return ret;
5208
5209}
5210EXPORT_SYMBOL_GPL(nfs4_init_ds_session);
5211
5212
5086/* 5213/*
5087 * Renew the cl_session lease. 5214 * Renew the cl_session lease.
5088 */ 5215 */
@@ -5110,7 +5237,7 @@ static int nfs41_sequence_handle_errors(struct rpc_task *task, struct nfs_client
5110 rpc_delay(task, NFS4_POLL_RETRY_MAX); 5237 rpc_delay(task, NFS4_POLL_RETRY_MAX);
5111 return -EAGAIN; 5238 return -EAGAIN;
5112 default: 5239 default:
5113 nfs4_schedule_state_recovery(clp); 5240 nfs4_schedule_lease_recovery(clp);
5114 } 5241 }
5115 return 0; 5242 return 0;
5116} 5243}
@@ -5197,7 +5324,7 @@ static int nfs41_proc_async_sequence(struct nfs_client *clp, struct rpc_cred *cr
5197 if (IS_ERR(task)) 5324 if (IS_ERR(task))
5198 ret = PTR_ERR(task); 5325 ret = PTR_ERR(task);
5199 else 5326 else
5200 rpc_put_task(task); 5327 rpc_put_task_async(task);
5201 dprintk("<-- %s status=%d\n", __func__, ret); 5328 dprintk("<-- %s status=%d\n", __func__, ret);
5202 return ret; 5329 return ret;
5203} 5330}
@@ -5213,8 +5340,13 @@ static int nfs4_proc_sequence(struct nfs_client *clp, struct rpc_cred *cred)
5213 goto out; 5340 goto out;
5214 } 5341 }
5215 ret = rpc_wait_for_completion_task(task); 5342 ret = rpc_wait_for_completion_task(task);
5216 if (!ret) 5343 if (!ret) {
5344 struct nfs4_sequence_res *res = task->tk_msg.rpc_resp;
5345
5346 if (task->tk_status == 0)
5347 nfs41_handle_sequence_flag_errors(clp, res->sr_status_flags);
5217 ret = task->tk_status; 5348 ret = task->tk_status;
5349 }
5218 rpc_put_task(task); 5350 rpc_put_task(task);
5219out: 5351out:
5220 dprintk("<-- %s status=%d\n", __func__, ret); 5352 dprintk("<-- %s status=%d\n", __func__, ret);
@@ -5251,7 +5383,7 @@ static int nfs41_reclaim_complete_handle_errors(struct rpc_task *task, struct nf
5251 rpc_delay(task, NFS4_POLL_RETRY_MAX); 5383 rpc_delay(task, NFS4_POLL_RETRY_MAX);
5252 return -EAGAIN; 5384 return -EAGAIN;
5253 default: 5385 default:
5254 nfs4_schedule_state_recovery(clp); 5386 nfs4_schedule_lease_recovery(clp);
5255 } 5387 }
5256 return 0; 5388 return 0;
5257} 5389}
@@ -5319,6 +5451,9 @@ static int nfs41_proc_reclaim_complete(struct nfs_client *clp)
5319 status = PTR_ERR(task); 5451 status = PTR_ERR(task);
5320 goto out; 5452 goto out;
5321 } 5453 }
5454 status = nfs4_wait_for_completion_rpc_task(task);
5455 if (status == 0)
5456 status = task->tk_status;
5322 rpc_put_task(task); 5457 rpc_put_task(task);
5323 return 0; 5458 return 0;
5324out: 5459out:
@@ -5605,6 +5740,7 @@ const struct nfs_rpc_ops nfs_v4_clientops = {
5605 .clear_acl_cache = nfs4_zap_acl_attr, 5740 .clear_acl_cache = nfs4_zap_acl_attr,
5606 .close_context = nfs4_close_context, 5741 .close_context = nfs4_close_context,
5607 .open_context = nfs4_atomic_open, 5742 .open_context = nfs4_atomic_open,
5743 .init_client = nfs4_init_client,
5608}; 5744};
5609 5745
5610static const struct xattr_handler nfs4_xattr_nfs4_acl_handler = { 5746static const struct xattr_handler nfs4_xattr_nfs4_acl_handler = {
diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c
index 402143d75fc5..df8e7f3ca56d 100644
--- a/fs/nfs/nfs4renewd.c
+++ b/fs/nfs/nfs4renewd.c
@@ -64,12 +64,8 @@ nfs4_renew_state(struct work_struct *work)
64 ops = clp->cl_mvops->state_renewal_ops; 64 ops = clp->cl_mvops->state_renewal_ops;
65 dprintk("%s: start\n", __func__); 65 dprintk("%s: start\n", __func__);
66 66
67 rcu_read_lock(); 67 if (test_bit(NFS_CS_STOP_RENEW, &clp->cl_res_state))
68 if (list_empty(&clp->cl_superblocks)) {
69 rcu_read_unlock();
70 goto out; 68 goto out;
71 }
72 rcu_read_unlock();
73 69
74 spin_lock(&clp->cl_lock); 70 spin_lock(&clp->cl_lock);
75 lease = clp->cl_lease_time; 71 lease = clp->cl_lease_time;
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 2336d532cf66..ab1bf5bb021f 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -153,6 +153,11 @@ static int nfs41_setup_state_renewal(struct nfs_client *clp)
153 int status; 153 int status;
154 struct nfs_fsinfo fsinfo; 154 struct nfs_fsinfo fsinfo;
155 155
156 if (!test_bit(NFS_CS_CHECK_LEASE_TIME, &clp->cl_res_state)) {
157 nfs4_schedule_state_renewal(clp);
158 return 0;
159 }
160
156 status = nfs4_proc_get_lease_time(clp, &fsinfo); 161 status = nfs4_proc_get_lease_time(clp, &fsinfo);
157 if (status == 0) { 162 if (status == 0) {
158 /* Update lease time and schedule renewal */ 163 /* Update lease time and schedule renewal */
@@ -232,12 +237,6 @@ int nfs41_init_clientid(struct nfs_client *clp, struct rpc_cred *cred)
232 status = nfs4_proc_create_session(clp); 237 status = nfs4_proc_create_session(clp);
233 if (status != 0) 238 if (status != 0)
234 goto out; 239 goto out;
235 status = nfs4_set_callback_sessionid(clp);
236 if (status != 0) {
237 printk(KERN_WARNING "Sessionid not set. No callback service\n");
238 nfs_callback_down(1);
239 status = 0;
240 }
241 nfs41_setup_state_renewal(clp); 240 nfs41_setup_state_renewal(clp);
242 nfs_mark_client_ready(clp, NFS_CS_READY); 241 nfs_mark_client_ready(clp, NFS_CS_READY);
243out: 242out:
@@ -1013,9 +1012,9 @@ void nfs4_schedule_state_manager(struct nfs_client *clp)
1013} 1012}
1014 1013
1015/* 1014/*
1016 * Schedule a state recovery attempt 1015 * Schedule a lease recovery attempt
1017 */ 1016 */
1018void nfs4_schedule_state_recovery(struct nfs_client *clp) 1017void nfs4_schedule_lease_recovery(struct nfs_client *clp)
1019{ 1018{
1020 if (!clp) 1019 if (!clp)
1021 return; 1020 return;
@@ -1024,7 +1023,7 @@ void nfs4_schedule_state_recovery(struct nfs_client *clp)
1024 nfs4_schedule_state_manager(clp); 1023 nfs4_schedule_state_manager(clp);
1025} 1024}
1026 1025
1027int nfs4_state_mark_reclaim_reboot(struct nfs_client *clp, struct nfs4_state *state) 1026static int nfs4_state_mark_reclaim_reboot(struct nfs_client *clp, struct nfs4_state *state)
1028{ 1027{
1029 1028
1030 set_bit(NFS_STATE_RECLAIM_REBOOT, &state->flags); 1029 set_bit(NFS_STATE_RECLAIM_REBOOT, &state->flags);
@@ -1038,7 +1037,7 @@ int nfs4_state_mark_reclaim_reboot(struct nfs_client *clp, struct nfs4_state *st
1038 return 1; 1037 return 1;
1039} 1038}
1040 1039
1041int nfs4_state_mark_reclaim_nograce(struct nfs_client *clp, struct nfs4_state *state) 1040static int nfs4_state_mark_reclaim_nograce(struct nfs_client *clp, struct nfs4_state *state)
1042{ 1041{
1043 set_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags); 1042 set_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags);
1044 clear_bit(NFS_STATE_RECLAIM_REBOOT, &state->flags); 1043 clear_bit(NFS_STATE_RECLAIM_REBOOT, &state->flags);
@@ -1047,6 +1046,14 @@ int nfs4_state_mark_reclaim_nograce(struct nfs_client *clp, struct nfs4_state *s
1047 return 1; 1046 return 1;
1048} 1047}
1049 1048
1049void nfs4_schedule_stateid_recovery(const struct nfs_server *server, struct nfs4_state *state)
1050{
1051 struct nfs_client *clp = server->nfs_client;
1052
1053 nfs4_state_mark_reclaim_nograce(clp, state);
1054 nfs4_schedule_state_manager(clp);
1055}
1056
1050static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_recovery_ops *ops) 1057static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_recovery_ops *ops)
1051{ 1058{
1052 struct inode *inode = state->inode; 1059 struct inode *inode = state->inode;
@@ -1442,10 +1449,16 @@ static int nfs4_reclaim_lease(struct nfs_client *clp)
1442} 1449}
1443 1450
1444#ifdef CONFIG_NFS_V4_1 1451#ifdef CONFIG_NFS_V4_1
1452void nfs4_schedule_session_recovery(struct nfs4_session *session)
1453{
1454 nfs4_schedule_lease_recovery(session->clp);
1455}
1456EXPORT_SYMBOL_GPL(nfs4_schedule_session_recovery);
1457
1445void nfs41_handle_recall_slot(struct nfs_client *clp) 1458void nfs41_handle_recall_slot(struct nfs_client *clp)
1446{ 1459{
1447 set_bit(NFS4CLNT_RECALL_SLOT, &clp->cl_state); 1460 set_bit(NFS4CLNT_RECALL_SLOT, &clp->cl_state);
1448 nfs4_schedule_state_recovery(clp); 1461 nfs4_schedule_state_manager(clp);
1449} 1462}
1450 1463
1451static void nfs4_reset_all_state(struct nfs_client *clp) 1464static void nfs4_reset_all_state(struct nfs_client *clp)
@@ -1453,7 +1466,7 @@ static void nfs4_reset_all_state(struct nfs_client *clp)
1453 if (test_and_set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) == 0) { 1466 if (test_and_set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) == 0) {
1454 clp->cl_boot_time = CURRENT_TIME; 1467 clp->cl_boot_time = CURRENT_TIME;
1455 nfs4_state_start_reclaim_nograce(clp); 1468 nfs4_state_start_reclaim_nograce(clp);
1456 nfs4_schedule_state_recovery(clp); 1469 nfs4_schedule_state_manager(clp);
1457 } 1470 }
1458} 1471}
1459 1472
@@ -1461,7 +1474,7 @@ static void nfs41_handle_server_reboot(struct nfs_client *clp)
1461{ 1474{
1462 if (test_and_set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) == 0) { 1475 if (test_and_set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) == 0) {
1463 nfs4_state_start_reclaim_reboot(clp); 1476 nfs4_state_start_reclaim_reboot(clp);
1464 nfs4_schedule_state_recovery(clp); 1477 nfs4_schedule_state_manager(clp);
1465 } 1478 }
1466} 1479}
1467 1480
@@ -1481,7 +1494,7 @@ static void nfs41_handle_cb_path_down(struct nfs_client *clp)
1481{ 1494{
1482 nfs_expire_all_delegations(clp); 1495 nfs_expire_all_delegations(clp);
1483 if (test_and_set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state) == 0) 1496 if (test_and_set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state) == 0)
1484 nfs4_schedule_state_recovery(clp); 1497 nfs4_schedule_state_manager(clp);
1485} 1498}
1486 1499
1487void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags) 1500void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags)
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 2ab8e5cb8f59..0cf560f77884 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -844,7 +844,7 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const
844 if (iap->ia_valid & ATTR_MODE) 844 if (iap->ia_valid & ATTR_MODE)
845 len += 4; 845 len += 4;
846 if (iap->ia_valid & ATTR_UID) { 846 if (iap->ia_valid & ATTR_UID) {
847 owner_namelen = nfs_map_uid_to_name(server->nfs_client, iap->ia_uid, owner_name, IDMAP_NAMESZ); 847 owner_namelen = nfs_map_uid_to_name(server, iap->ia_uid, owner_name, IDMAP_NAMESZ);
848 if (owner_namelen < 0) { 848 if (owner_namelen < 0) {
849 dprintk("nfs: couldn't resolve uid %d to string\n", 849 dprintk("nfs: couldn't resolve uid %d to string\n",
850 iap->ia_uid); 850 iap->ia_uid);
@@ -856,7 +856,7 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const
856 len += 4 + (XDR_QUADLEN(owner_namelen) << 2); 856 len += 4 + (XDR_QUADLEN(owner_namelen) << 2);
857 } 857 }
858 if (iap->ia_valid & ATTR_GID) { 858 if (iap->ia_valid & ATTR_GID) {
859 owner_grouplen = nfs_map_gid_to_group(server->nfs_client, iap->ia_gid, owner_group, IDMAP_NAMESZ); 859 owner_grouplen = nfs_map_gid_to_group(server, iap->ia_gid, owner_group, IDMAP_NAMESZ);
860 if (owner_grouplen < 0) { 860 if (owner_grouplen < 0) {
861 dprintk("nfs: couldn't resolve gid %d to string\n", 861 dprintk("nfs: couldn't resolve gid %d to string\n",
862 iap->ia_gid); 862 iap->ia_gid);
@@ -1384,7 +1384,7 @@ static void encode_putrootfh(struct xdr_stream *xdr, struct compound_hdr *hdr)
1384 hdr->replen += decode_putrootfh_maxsz; 1384 hdr->replen += decode_putrootfh_maxsz;
1385} 1385}
1386 1386
1387static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context *ctx, const struct nfs_lock_context *l_ctx) 1387static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context *ctx, const struct nfs_lock_context *l_ctx, int zero_seqid)
1388{ 1388{
1389 nfs4_stateid stateid; 1389 nfs4_stateid stateid;
1390 __be32 *p; 1390 __be32 *p;
@@ -1392,6 +1392,8 @@ static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context
1392 p = reserve_space(xdr, NFS4_STATEID_SIZE); 1392 p = reserve_space(xdr, NFS4_STATEID_SIZE);
1393 if (ctx->state != NULL) { 1393 if (ctx->state != NULL) {
1394 nfs4_copy_stateid(&stateid, ctx->state, l_ctx->lockowner, l_ctx->pid); 1394 nfs4_copy_stateid(&stateid, ctx->state, l_ctx->lockowner, l_ctx->pid);
1395 if (zero_seqid)
1396 stateid.stateid.seqid = 0;
1395 xdr_encode_opaque_fixed(p, stateid.data, NFS4_STATEID_SIZE); 1397 xdr_encode_opaque_fixed(p, stateid.data, NFS4_STATEID_SIZE);
1396 } else 1398 } else
1397 xdr_encode_opaque_fixed(p, zero_stateid.data, NFS4_STATEID_SIZE); 1399 xdr_encode_opaque_fixed(p, zero_stateid.data, NFS4_STATEID_SIZE);
@@ -1404,7 +1406,8 @@ static void encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args,
1404 p = reserve_space(xdr, 4); 1406 p = reserve_space(xdr, 4);
1405 *p = cpu_to_be32(OP_READ); 1407 *p = cpu_to_be32(OP_READ);
1406 1408
1407 encode_stateid(xdr, args->context, args->lock_context); 1409 encode_stateid(xdr, args->context, args->lock_context,
1410 hdr->minorversion);
1408 1411
1409 p = reserve_space(xdr, 12); 1412 p = reserve_space(xdr, 12);
1410 p = xdr_encode_hyper(p, args->offset); 1413 p = xdr_encode_hyper(p, args->offset);
@@ -1592,7 +1595,8 @@ static void encode_write(struct xdr_stream *xdr, const struct nfs_writeargs *arg
1592 p = reserve_space(xdr, 4); 1595 p = reserve_space(xdr, 4);
1593 *p = cpu_to_be32(OP_WRITE); 1596 *p = cpu_to_be32(OP_WRITE);
1594 1597
1595 encode_stateid(xdr, args->context, args->lock_context); 1598 encode_stateid(xdr, args->context, args->lock_context,
1599 hdr->minorversion);
1596 1600
1597 p = reserve_space(xdr, 16); 1601 p = reserve_space(xdr, 16);
1598 p = xdr_encode_hyper(p, args->offset); 1602 p = xdr_encode_hyper(p, args->offset);
@@ -1660,7 +1664,7 @@ static void encode_create_session(struct xdr_stream *xdr,
1660 1664
1661 p = reserve_space(xdr, 20 + 2*28 + 20 + len + 12); 1665 p = reserve_space(xdr, 20 + 2*28 + 20 + len + 12);
1662 *p++ = cpu_to_be32(OP_CREATE_SESSION); 1666 *p++ = cpu_to_be32(OP_CREATE_SESSION);
1663 p = xdr_encode_hyper(p, clp->cl_ex_clid); 1667 p = xdr_encode_hyper(p, clp->cl_clientid);
1664 *p++ = cpu_to_be32(clp->cl_seqid); /*Sequence id */ 1668 *p++ = cpu_to_be32(clp->cl_seqid); /*Sequence id */
1665 *p++ = cpu_to_be32(args->flags); /*flags */ 1669 *p++ = cpu_to_be32(args->flags); /*flags */
1666 1670
@@ -2271,7 +2275,8 @@ static void nfs4_xdr_enc_write(struct rpc_rqst *req, struct xdr_stream *xdr,
2271 encode_putfh(xdr, args->fh, &hdr); 2275 encode_putfh(xdr, args->fh, &hdr);
2272 encode_write(xdr, args, &hdr); 2276 encode_write(xdr, args, &hdr);
2273 req->rq_snd_buf.flags |= XDRBUF_WRITE; 2277 req->rq_snd_buf.flags |= XDRBUF_WRITE;
2274 encode_getfattr(xdr, args->bitmask, &hdr); 2278 if (args->bitmask)
2279 encode_getfattr(xdr, args->bitmask, &hdr);
2275 encode_nops(&hdr); 2280 encode_nops(&hdr);
2276} 2281}
2277 2282
@@ -3382,7 +3387,7 @@ out_overflow:
3382} 3387}
3383 3388
3384static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, 3389static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap,
3385 struct nfs_client *clp, uint32_t *uid, int may_sleep) 3390 const struct nfs_server *server, uint32_t *uid, int may_sleep)
3386{ 3391{
3387 uint32_t len; 3392 uint32_t len;
3388 __be32 *p; 3393 __be32 *p;
@@ -3402,7 +3407,7 @@ static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap,
3402 if (!may_sleep) { 3407 if (!may_sleep) {
3403 /* do nothing */ 3408 /* do nothing */
3404 } else if (len < XDR_MAX_NETOBJ) { 3409 } else if (len < XDR_MAX_NETOBJ) {
3405 if (nfs_map_name_to_uid(clp, (char *)p, len, uid) == 0) 3410 if (nfs_map_name_to_uid(server, (char *)p, len, uid) == 0)
3406 ret = NFS_ATTR_FATTR_OWNER; 3411 ret = NFS_ATTR_FATTR_OWNER;
3407 else 3412 else
3408 dprintk("%s: nfs_map_name_to_uid failed!\n", 3413 dprintk("%s: nfs_map_name_to_uid failed!\n",
@@ -3420,7 +3425,7 @@ out_overflow:
3420} 3425}
3421 3426
3422static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, 3427static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap,
3423 struct nfs_client *clp, uint32_t *gid, int may_sleep) 3428 const struct nfs_server *server, uint32_t *gid, int may_sleep)
3424{ 3429{
3425 uint32_t len; 3430 uint32_t len;
3426 __be32 *p; 3431 __be32 *p;
@@ -3440,7 +3445,7 @@ static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap,
3440 if (!may_sleep) { 3445 if (!may_sleep) {
3441 /* do nothing */ 3446 /* do nothing */
3442 } else if (len < XDR_MAX_NETOBJ) { 3447 } else if (len < XDR_MAX_NETOBJ) {
3443 if (nfs_map_group_to_gid(clp, (char *)p, len, gid) == 0) 3448 if (nfs_map_group_to_gid(server, (char *)p, len, gid) == 0)
3444 ret = NFS_ATTR_FATTR_GROUP; 3449 ret = NFS_ATTR_FATTR_GROUP;
3445 else 3450 else
3446 dprintk("%s: nfs_map_group_to_gid failed!\n", 3451 dprintk("%s: nfs_map_group_to_gid failed!\n",
@@ -3939,14 +3944,12 @@ static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap,
3939 goto xdr_error; 3944 goto xdr_error;
3940 fattr->valid |= status; 3945 fattr->valid |= status;
3941 3946
3942 status = decode_attr_owner(xdr, bitmap, server->nfs_client, 3947 status = decode_attr_owner(xdr, bitmap, server, &fattr->uid, may_sleep);
3943 &fattr->uid, may_sleep);
3944 if (status < 0) 3948 if (status < 0)
3945 goto xdr_error; 3949 goto xdr_error;
3946 fattr->valid |= status; 3950 fattr->valid |= status;
3947 3951
3948 status = decode_attr_group(xdr, bitmap, server->nfs_client, 3952 status = decode_attr_group(xdr, bitmap, server, &fattr->gid, may_sleep);
3949 &fattr->gid, may_sleep);
3950 if (status < 0) 3953 if (status < 0)
3951 goto xdr_error; 3954 goto xdr_error;
3952 fattr->valid |= status; 3955 fattr->valid |= status;
@@ -4694,7 +4697,7 @@ static int decode_exchange_id(struct xdr_stream *xdr,
4694 p = xdr_inline_decode(xdr, 8); 4697 p = xdr_inline_decode(xdr, 8);
4695 if (unlikely(!p)) 4698 if (unlikely(!p))
4696 goto out_overflow; 4699 goto out_overflow;
4697 xdr_decode_hyper(p, &clp->cl_ex_clid); 4700 xdr_decode_hyper(p, &clp->cl_clientid);
4698 p = xdr_inline_decode(xdr, 12); 4701 p = xdr_inline_decode(xdr, 12);
4699 if (unlikely(!p)) 4702 if (unlikely(!p))
4700 goto out_overflow; 4703 goto out_overflow;
@@ -5690,8 +5693,9 @@ static int nfs4_xdr_dec_write(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
5690 status = decode_write(xdr, res); 5693 status = decode_write(xdr, res);
5691 if (status) 5694 if (status)
5692 goto out; 5695 goto out;
5693 decode_getfattr(xdr, res->fattr, res->server, 5696 if (res->fattr)
5694 !RPC_IS_ASYNC(rqstp->rq_task)); 5697 decode_getfattr(xdr, res->fattr, res->server,
5698 !RPC_IS_ASYNC(rqstp->rq_task));
5695 if (!status) 5699 if (!status)
5696 status = res->count; 5700 status = res->count;
5697out: 5701out:
@@ -6086,11 +6090,11 @@ int nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
6086 __be32 *p = xdr_inline_decode(xdr, 4); 6090 __be32 *p = xdr_inline_decode(xdr, 4);
6087 if (unlikely(!p)) 6091 if (unlikely(!p))
6088 goto out_overflow; 6092 goto out_overflow;
6089 if (!ntohl(*p++)) { 6093 if (*p == xdr_zero) {
6090 p = xdr_inline_decode(xdr, 4); 6094 p = xdr_inline_decode(xdr, 4);
6091 if (unlikely(!p)) 6095 if (unlikely(!p))
6092 goto out_overflow; 6096 goto out_overflow;
6093 if (!ntohl(*p++)) 6097 if (*p == xdr_zero)
6094 return -EAGAIN; 6098 return -EAGAIN;
6095 entry->eof = 1; 6099 entry->eof = 1;
6096 return -EBADCOOKIE; 6100 return -EBADCOOKIE;
@@ -6101,7 +6105,7 @@ int nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
6101 goto out_overflow; 6105 goto out_overflow;
6102 entry->prev_cookie = entry->cookie; 6106 entry->prev_cookie = entry->cookie;
6103 p = xdr_decode_hyper(p, &entry->cookie); 6107 p = xdr_decode_hyper(p, &entry->cookie);
6104 entry->len = ntohl(*p++); 6108 entry->len = be32_to_cpup(p);
6105 6109
6106 p = xdr_inline_decode(xdr, entry->len); 6110 p = xdr_inline_decode(xdr, entry->len);
6107 if (unlikely(!p)) 6111 if (unlikely(!p))
@@ -6132,9 +6136,6 @@ int nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
6132 if (entry->fattr->valid & NFS_ATTR_FATTR_TYPE) 6136 if (entry->fattr->valid & NFS_ATTR_FATTR_TYPE)
6133 entry->d_type = nfs_umode_to_dtype(entry->fattr->mode); 6137 entry->d_type = nfs_umode_to_dtype(entry->fattr->mode);
6134 6138
6135 if (verify_attr_len(xdr, p, len) < 0)
6136 goto out_overflow;
6137
6138 return 0; 6139 return 0;
6139 6140
6140out_overflow: 6141out_overflow:
@@ -6170,8 +6171,6 @@ static struct {
6170 { NFS4ERR_DQUOT, -EDQUOT }, 6171 { NFS4ERR_DQUOT, -EDQUOT },
6171 { NFS4ERR_STALE, -ESTALE }, 6172 { NFS4ERR_STALE, -ESTALE },
6172 { NFS4ERR_BADHANDLE, -EBADHANDLE }, 6173 { NFS4ERR_BADHANDLE, -EBADHANDLE },
6173 { NFS4ERR_BADOWNER, -EINVAL },
6174 { NFS4ERR_BADNAME, -EINVAL },
6175 { NFS4ERR_BAD_COOKIE, -EBADCOOKIE }, 6174 { NFS4ERR_BAD_COOKIE, -EBADCOOKIE },
6176 { NFS4ERR_NOTSUPP, -ENOTSUPP }, 6175 { NFS4ERR_NOTSUPP, -ENOTSUPP },
6177 { NFS4ERR_TOOSMALL, -ETOOSMALL }, 6176 { NFS4ERR_TOOSMALL, -ETOOSMALL },
diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c
index 903908a20023..c541093a5bf2 100644
--- a/fs/nfs/nfsroot.c
+++ b/fs/nfs/nfsroot.c
@@ -86,11 +86,14 @@
86/* Default path we try to mount. "%s" gets replaced by our IP address */ 86/* Default path we try to mount. "%s" gets replaced by our IP address */
87#define NFS_ROOT "/tftpboot/%s" 87#define NFS_ROOT "/tftpboot/%s"
88 88
89/* Default NFSROOT mount options. */
90#define NFS_DEF_OPTIONS "udp"
91
89/* Parameters passed from the kernel command line */ 92/* Parameters passed from the kernel command line */
90static char nfs_root_parms[256] __initdata = ""; 93static char nfs_root_parms[256] __initdata = "";
91 94
92/* Text-based mount options passed to super.c */ 95/* Text-based mount options passed to super.c */
93static char nfs_root_options[256] __initdata = ""; 96static char nfs_root_options[256] __initdata = NFS_DEF_OPTIONS;
94 97
95/* Address of NFS server */ 98/* Address of NFS server */
96static __be32 servaddr __initdata = htonl(INADDR_NONE); 99static __be32 servaddr __initdata = htonl(INADDR_NONE);
@@ -160,8 +163,14 @@ static int __init root_nfs_copy(char *dest, const char *src,
160} 163}
161 164
162static int __init root_nfs_cat(char *dest, const char *src, 165static int __init root_nfs_cat(char *dest, const char *src,
163 const size_t destlen) 166 const size_t destlen)
164{ 167{
168 size_t len = strlen(dest);
169
170 if (len && dest[len - 1] != ',')
171 if (strlcat(dest, ",", destlen) > destlen)
172 return -1;
173
165 if (strlcat(dest, src, destlen) > destlen) 174 if (strlcat(dest, src, destlen) > destlen)
166 return -1; 175 return -1;
167 return 0; 176 return 0;
@@ -194,16 +203,6 @@ static int __init root_nfs_parse_options(char *incoming, char *exppath,
194 if (root_nfs_cat(nfs_root_options, incoming, 203 if (root_nfs_cat(nfs_root_options, incoming,
195 sizeof(nfs_root_options))) 204 sizeof(nfs_root_options)))
196 return -1; 205 return -1;
197
198 /*
199 * Possibly prepare for more options to be appended
200 */
201 if (nfs_root_options[0] != '\0' &&
202 nfs_root_options[strlen(nfs_root_options)] != ',')
203 if (root_nfs_cat(nfs_root_options, ",",
204 sizeof(nfs_root_options)))
205 return -1;
206
207 return 0; 206 return 0;
208} 207}
209 208
@@ -217,7 +216,7 @@ static int __init root_nfs_parse_options(char *incoming, char *exppath,
217 */ 216 */
218static int __init root_nfs_data(char *cmdline) 217static int __init root_nfs_data(char *cmdline)
219{ 218{
220 char addr_option[sizeof("nolock,addr=") + INET_ADDRSTRLEN + 1]; 219 char mand_options[sizeof("nolock,addr=") + INET_ADDRSTRLEN + 1];
221 int len, retval = -1; 220 int len, retval = -1;
222 char *tmp = NULL; 221 char *tmp = NULL;
223 const size_t tmplen = sizeof(nfs_export_path); 222 const size_t tmplen = sizeof(nfs_export_path);
@@ -244,9 +243,9 @@ static int __init root_nfs_data(char *cmdline)
244 * Append mandatory options for nfsroot so they override 243 * Append mandatory options for nfsroot so they override
245 * what has come before 244 * what has come before
246 */ 245 */
247 snprintf(addr_option, sizeof(addr_option), "nolock,addr=%pI4", 246 snprintf(mand_options, sizeof(mand_options), "nolock,addr=%pI4",
248 &servaddr); 247 &servaddr);
249 if (root_nfs_cat(nfs_root_options, addr_option, 248 if (root_nfs_cat(nfs_root_options, mand_options,
250 sizeof(nfs_root_options))) 249 sizeof(nfs_root_options)))
251 goto out_optionstoolong; 250 goto out_optionstoolong;
252 251
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index e1164e3f9e69..23e794410669 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -20,6 +20,7 @@
20#include <linux/nfs_mount.h> 20#include <linux/nfs_mount.h>
21 21
22#include "internal.h" 22#include "internal.h"
23#include "pnfs.h"
23 24
24static struct kmem_cache *nfs_page_cachep; 25static struct kmem_cache *nfs_page_cachep;
25 26
@@ -213,7 +214,7 @@ nfs_wait_on_request(struct nfs_page *req)
213 */ 214 */
214void nfs_pageio_init(struct nfs_pageio_descriptor *desc, 215void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
215 struct inode *inode, 216 struct inode *inode,
216 int (*doio)(struct inode *, struct list_head *, unsigned int, size_t, int), 217 int (*doio)(struct nfs_pageio_descriptor *),
217 size_t bsize, 218 size_t bsize,
218 int io_flags) 219 int io_flags)
219{ 220{
@@ -226,6 +227,7 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
226 desc->pg_doio = doio; 227 desc->pg_doio = doio;
227 desc->pg_ioflags = io_flags; 228 desc->pg_ioflags = io_flags;
228 desc->pg_error = 0; 229 desc->pg_error = 0;
230 desc->pg_lseg = NULL;
229} 231}
230 232
231/** 233/**
@@ -240,7 +242,8 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
240 * Return 'true' if this is the case, else return 'false'. 242 * Return 'true' if this is the case, else return 'false'.
241 */ 243 */
242static int nfs_can_coalesce_requests(struct nfs_page *prev, 244static int nfs_can_coalesce_requests(struct nfs_page *prev,
243 struct nfs_page *req) 245 struct nfs_page *req,
246 struct nfs_pageio_descriptor *pgio)
244{ 247{
245 if (req->wb_context->cred != prev->wb_context->cred) 248 if (req->wb_context->cred != prev->wb_context->cred)
246 return 0; 249 return 0;
@@ -254,6 +257,12 @@ static int nfs_can_coalesce_requests(struct nfs_page *prev,
254 return 0; 257 return 0;
255 if (prev->wb_pgbase + prev->wb_bytes != PAGE_CACHE_SIZE) 258 if (prev->wb_pgbase + prev->wb_bytes != PAGE_CACHE_SIZE)
256 return 0; 259 return 0;
260 /*
261 * Non-whole file layouts need to check that req is inside of
262 * pgio->pg_lseg.
263 */
264 if (pgio->pg_test && !pgio->pg_test(pgio, prev, req))
265 return 0;
257 return 1; 266 return 1;
258} 267}
259 268
@@ -286,7 +295,7 @@ static int nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc,
286 if (newlen > desc->pg_bsize) 295 if (newlen > desc->pg_bsize)
287 return 0; 296 return 0;
288 prev = nfs_list_entry(desc->pg_list.prev); 297 prev = nfs_list_entry(desc->pg_list.prev);
289 if (!nfs_can_coalesce_requests(prev, req)) 298 if (!nfs_can_coalesce_requests(prev, req, desc))
290 return 0; 299 return 0;
291 } else 300 } else
292 desc->pg_base = req->wb_pgbase; 301 desc->pg_base = req->wb_pgbase;
@@ -302,12 +311,7 @@ static int nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc,
302static void nfs_pageio_doio(struct nfs_pageio_descriptor *desc) 311static void nfs_pageio_doio(struct nfs_pageio_descriptor *desc)
303{ 312{
304 if (!list_empty(&desc->pg_list)) { 313 if (!list_empty(&desc->pg_list)) {
305 int error = desc->pg_doio(desc->pg_inode, 314 int error = desc->pg_doio(desc);
306 &desc->pg_list,
307 nfs_page_array_len(desc->pg_base,
308 desc->pg_count),
309 desc->pg_count,
310 desc->pg_ioflags);
311 if (error < 0) 315 if (error < 0)
312 desc->pg_error = error; 316 desc->pg_error = error;
313 else 317 else
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index bc4089769735..f38813a0a295 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -30,6 +30,7 @@
30#include <linux/nfs_fs.h> 30#include <linux/nfs_fs.h>
31#include "internal.h" 31#include "internal.h"
32#include "pnfs.h" 32#include "pnfs.h"
33#include "iostat.h"
33 34
34#define NFSDBG_FACILITY NFSDBG_PNFS 35#define NFSDBG_FACILITY NFSDBG_PNFS
35 36
@@ -74,10 +75,8 @@ find_pnfs_driver(u32 id)
74void 75void
75unset_pnfs_layoutdriver(struct nfs_server *nfss) 76unset_pnfs_layoutdriver(struct nfs_server *nfss)
76{ 77{
77 if (nfss->pnfs_curr_ld) { 78 if (nfss->pnfs_curr_ld)
78 nfss->pnfs_curr_ld->clear_layoutdriver(nfss);
79 module_put(nfss->pnfs_curr_ld->owner); 79 module_put(nfss->pnfs_curr_ld->owner);
80 }
81 nfss->pnfs_curr_ld = NULL; 80 nfss->pnfs_curr_ld = NULL;
82} 81}
83 82
@@ -115,13 +114,7 @@ set_pnfs_layoutdriver(struct nfs_server *server, u32 id)
115 goto out_no_driver; 114 goto out_no_driver;
116 } 115 }
117 server->pnfs_curr_ld = ld_type; 116 server->pnfs_curr_ld = ld_type;
118 if (ld_type->set_layoutdriver(server)) { 117
119 printk(KERN_ERR
120 "%s: Error initializing mount point for layout driver %u.\n",
121 __func__, id);
122 module_put(ld_type->owner);
123 goto out_no_driver;
124 }
125 dprintk("%s: pNFS module for %u set\n", __func__, id); 118 dprintk("%s: pNFS module for %u set\n", __func__, id);
126 return; 119 return;
127 120
@@ -230,37 +223,41 @@ static void free_lseg(struct pnfs_layout_segment *lseg)
230 put_layout_hdr(NFS_I(ino)->layout); 223 put_layout_hdr(NFS_I(ino)->layout);
231} 224}
232 225
233/* The use of tmp_list is necessary because pnfs_curr_ld->free_lseg 226static void
234 * could sleep, so must be called outside of the lock. 227put_lseg_common(struct pnfs_layout_segment *lseg)
235 * Returns 1 if object was removed, otherwise return 0. 228{
236 */ 229 struct inode *inode = lseg->pls_layout->plh_inode;
237static int 230
238put_lseg_locked(struct pnfs_layout_segment *lseg, 231 BUG_ON(test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
239 struct list_head *tmp_list) 232 list_del_init(&lseg->pls_list);
233 if (list_empty(&lseg->pls_layout->plh_segs)) {
234 set_bit(NFS_LAYOUT_DESTROYED, &lseg->pls_layout->plh_flags);
235 /* Matched by initial refcount set in alloc_init_layout_hdr */
236 put_layout_hdr_locked(lseg->pls_layout);
237 }
238 rpc_wake_up(&NFS_SERVER(inode)->roc_rpcwaitq);
239}
240
241void
242put_lseg(struct pnfs_layout_segment *lseg)
240{ 243{
244 struct inode *inode;
245
246 if (!lseg)
247 return;
248
241 dprintk("%s: lseg %p ref %d valid %d\n", __func__, lseg, 249 dprintk("%s: lseg %p ref %d valid %d\n", __func__, lseg,
242 atomic_read(&lseg->pls_refcount), 250 atomic_read(&lseg->pls_refcount),
243 test_bit(NFS_LSEG_VALID, &lseg->pls_flags)); 251 test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
244 if (atomic_dec_and_test(&lseg->pls_refcount)) { 252 inode = lseg->pls_layout->plh_inode;
245 struct inode *ino = lseg->pls_layout->plh_inode; 253 if (atomic_dec_and_lock(&lseg->pls_refcount, &inode->i_lock)) {
254 LIST_HEAD(free_me);
246 255
247 BUG_ON(test_bit(NFS_LSEG_VALID, &lseg->pls_flags)); 256 put_lseg_common(lseg);
248 list_del(&lseg->pls_list); 257 list_add(&lseg->pls_list, &free_me);
249 if (list_empty(&lseg->pls_layout->plh_segs)) { 258 spin_unlock(&inode->i_lock);
250 struct nfs_client *clp; 259 pnfs_free_lseg_list(&free_me);
251
252 clp = NFS_SERVER(ino)->nfs_client;
253 spin_lock(&clp->cl_lock);
254 /* List does not take a reference, so no need for put here */
255 list_del_init(&lseg->pls_layout->plh_layouts);
256 spin_unlock(&clp->cl_lock);
257 clear_bit(NFS_LAYOUT_BULK_RECALL, &lseg->pls_layout->plh_flags);
258 }
259 rpc_wake_up(&NFS_SERVER(ino)->roc_rpcwaitq);
260 list_add(&lseg->pls_list, tmp_list);
261 return 1;
262 } 260 }
263 return 0;
264} 261}
265 262
266static bool 263static bool
@@ -281,7 +278,13 @@ static int mark_lseg_invalid(struct pnfs_layout_segment *lseg,
281 * list. It will now be removed when all 278 * list. It will now be removed when all
282 * outstanding io is finished. 279 * outstanding io is finished.
283 */ 280 */
284 rv = put_lseg_locked(lseg, tmp_list); 281 dprintk("%s: lseg %p ref %d\n", __func__, lseg,
282 atomic_read(&lseg->pls_refcount));
283 if (atomic_dec_and_test(&lseg->pls_refcount)) {
284 put_lseg_common(lseg);
285 list_add(&lseg->pls_list, tmp_list);
286 rv = 1;
287 }
285 } 288 }
286 return rv; 289 return rv;
287} 290}
@@ -299,6 +302,11 @@ mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
299 302
300 dprintk("%s:Begin lo %p\n", __func__, lo); 303 dprintk("%s:Begin lo %p\n", __func__, lo);
301 304
305 if (list_empty(&lo->plh_segs)) {
306 if (!test_and_set_bit(NFS_LAYOUT_DESTROYED, &lo->plh_flags))
307 put_layout_hdr_locked(lo);
308 return 0;
309 }
302 list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list) 310 list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list)
303 if (should_free_lseg(lseg->pls_range.iomode, iomode)) { 311 if (should_free_lseg(lseg->pls_range.iomode, iomode)) {
304 dprintk("%s: freeing lseg %p iomode %d " 312 dprintk("%s: freeing lseg %p iomode %d "
@@ -312,11 +320,27 @@ mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
312 return invalid - removed; 320 return invalid - removed;
313} 321}
314 322
323/* note free_me must contain lsegs from a single layout_hdr */
315void 324void
316pnfs_free_lseg_list(struct list_head *free_me) 325pnfs_free_lseg_list(struct list_head *free_me)
317{ 326{
318 struct pnfs_layout_segment *lseg, *tmp; 327 struct pnfs_layout_segment *lseg, *tmp;
328 struct pnfs_layout_hdr *lo;
329
330 if (list_empty(free_me))
331 return;
319 332
333 lo = list_first_entry(free_me, struct pnfs_layout_segment,
334 pls_list)->pls_layout;
335
336 if (test_bit(NFS_LAYOUT_DESTROYED, &lo->plh_flags)) {
337 struct nfs_client *clp;
338
339 clp = NFS_SERVER(lo->plh_inode)->nfs_client;
340 spin_lock(&clp->cl_lock);
341 list_del_init(&lo->plh_layouts);
342 spin_unlock(&clp->cl_lock);
343 }
320 list_for_each_entry_safe(lseg, tmp, free_me, pls_list) { 344 list_for_each_entry_safe(lseg, tmp, free_me, pls_list) {
321 list_del(&lseg->pls_list); 345 list_del(&lseg->pls_list);
322 free_lseg(lseg); 346 free_lseg(lseg);
@@ -332,10 +356,8 @@ pnfs_destroy_layout(struct nfs_inode *nfsi)
332 spin_lock(&nfsi->vfs_inode.i_lock); 356 spin_lock(&nfsi->vfs_inode.i_lock);
333 lo = nfsi->layout; 357 lo = nfsi->layout;
334 if (lo) { 358 if (lo) {
335 set_bit(NFS_LAYOUT_DESTROYED, &nfsi->layout->plh_flags); 359 lo->plh_block_lgets++; /* permanently block new LAYOUTGETs */
336 mark_matching_lsegs_invalid(lo, &tmp_list, IOMODE_ANY); 360 mark_matching_lsegs_invalid(lo, &tmp_list, IOMODE_ANY);
337 /* Matched by refcount set to 1 in alloc_init_layout_hdr */
338 put_layout_hdr_locked(lo);
339 } 361 }
340 spin_unlock(&nfsi->vfs_inode.i_lock); 362 spin_unlock(&nfsi->vfs_inode.i_lock);
341 pnfs_free_lseg_list(&tmp_list); 363 pnfs_free_lseg_list(&tmp_list);
@@ -403,6 +425,7 @@ pnfs_layoutgets_blocked(struct pnfs_layout_hdr *lo, nfs4_stateid *stateid,
403 (int)(lo->plh_barrier - be32_to_cpu(stateid->stateid.seqid)) >= 0) 425 (int)(lo->plh_barrier - be32_to_cpu(stateid->stateid.seqid)) >= 0)
404 return true; 426 return true;
405 return lo->plh_block_lgets || 427 return lo->plh_block_lgets ||
428 test_bit(NFS_LAYOUT_DESTROYED, &lo->plh_flags) ||
406 test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) || 429 test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) ||
407 (list_empty(&lo->plh_segs) && 430 (list_empty(&lo->plh_segs) &&
408 (atomic_read(&lo->plh_outstanding) > lget)); 431 (atomic_read(&lo->plh_outstanding) > lget));
@@ -674,7 +697,7 @@ pnfs_find_lseg(struct pnfs_layout_hdr *lo, u32 iomode)
674 list_for_each_entry(lseg, &lo->plh_segs, pls_list) { 697 list_for_each_entry(lseg, &lo->plh_segs, pls_list) {
675 if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags) && 698 if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags) &&
676 is_matching_lseg(lseg, iomode)) { 699 is_matching_lseg(lseg, iomode)) {
677 ret = lseg; 700 ret = get_lseg(lseg);
678 break; 701 break;
679 } 702 }
680 if (cmp_layout(iomode, lseg->pls_range.iomode) > 0) 703 if (cmp_layout(iomode, lseg->pls_range.iomode) > 0)
@@ -699,6 +722,7 @@ pnfs_update_layout(struct inode *ino,
699 struct nfs_client *clp = NFS_SERVER(ino)->nfs_client; 722 struct nfs_client *clp = NFS_SERVER(ino)->nfs_client;
700 struct pnfs_layout_hdr *lo; 723 struct pnfs_layout_hdr *lo;
701 struct pnfs_layout_segment *lseg = NULL; 724 struct pnfs_layout_segment *lseg = NULL;
725 bool first = false;
702 726
703 if (!pnfs_enabled_sb(NFS_SERVER(ino))) 727 if (!pnfs_enabled_sb(NFS_SERVER(ino)))
704 return NULL; 728 return NULL;
@@ -715,21 +739,25 @@ pnfs_update_layout(struct inode *ino,
715 dprintk("%s matches recall, use MDS\n", __func__); 739 dprintk("%s matches recall, use MDS\n", __func__);
716 goto out_unlock; 740 goto out_unlock;
717 } 741 }
718 /* Check to see if the layout for the given range already exists */
719 lseg = pnfs_find_lseg(lo, iomode);
720 if (lseg)
721 goto out_unlock;
722 742
723 /* if LAYOUTGET already failed once we don't try again */ 743 /* if LAYOUTGET already failed once we don't try again */
724 if (test_bit(lo_fail_bit(iomode), &nfsi->layout->plh_flags)) 744 if (test_bit(lo_fail_bit(iomode), &nfsi->layout->plh_flags))
725 goto out_unlock; 745 goto out_unlock;
726 746
747 /* Check to see if the layout for the given range already exists */
748 lseg = pnfs_find_lseg(lo, iomode);
749 if (lseg)
750 goto out_unlock;
751
727 if (pnfs_layoutgets_blocked(lo, NULL, 0)) 752 if (pnfs_layoutgets_blocked(lo, NULL, 0))
728 goto out_unlock; 753 goto out_unlock;
729 atomic_inc(&lo->plh_outstanding); 754 atomic_inc(&lo->plh_outstanding);
730 755
731 get_layout_hdr(lo); 756 get_layout_hdr(lo);
732 if (list_empty(&lo->plh_segs)) { 757 if (list_empty(&lo->plh_segs))
758 first = true;
759 spin_unlock(&ino->i_lock);
760 if (first) {
733 /* The lo must be on the clp list if there is any 761 /* The lo must be on the clp list if there is any
734 * chance of a CB_LAYOUTRECALL(FILE) coming in. 762 * chance of a CB_LAYOUTRECALL(FILE) coming in.
735 */ 763 */
@@ -738,24 +766,18 @@ pnfs_update_layout(struct inode *ino,
738 list_add_tail(&lo->plh_layouts, &clp->cl_layouts); 766 list_add_tail(&lo->plh_layouts, &clp->cl_layouts);
739 spin_unlock(&clp->cl_lock); 767 spin_unlock(&clp->cl_lock);
740 } 768 }
741 spin_unlock(&ino->i_lock);
742 769
743 lseg = send_layoutget(lo, ctx, iomode); 770 lseg = send_layoutget(lo, ctx, iomode);
744 if (!lseg) { 771 if (!lseg && first) {
745 spin_lock(&ino->i_lock); 772 spin_lock(&clp->cl_lock);
746 if (list_empty(&lo->plh_segs)) { 773 list_del_init(&lo->plh_layouts);
747 spin_lock(&clp->cl_lock); 774 spin_unlock(&clp->cl_lock);
748 list_del_init(&lo->plh_layouts);
749 spin_unlock(&clp->cl_lock);
750 clear_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
751 }
752 spin_unlock(&ino->i_lock);
753 } 775 }
754 atomic_dec(&lo->plh_outstanding); 776 atomic_dec(&lo->plh_outstanding);
755 put_layout_hdr(lo); 777 put_layout_hdr(lo);
756out: 778out:
757 dprintk("%s end, state 0x%lx lseg %p\n", __func__, 779 dprintk("%s end, state 0x%lx lseg %p\n", __func__,
758 nfsi->layout->plh_flags, lseg); 780 nfsi->layout ? nfsi->layout->plh_flags : -1, lseg);
759 return lseg; 781 return lseg;
760out_unlock: 782out_unlock:
761 spin_unlock(&ino->i_lock); 783 spin_unlock(&ino->i_lock);
@@ -808,7 +830,7 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
808 } 830 }
809 init_lseg(lo, lseg); 831 init_lseg(lo, lseg);
810 lseg->pls_range = res->range; 832 lseg->pls_range = res->range;
811 *lgp->lsegpp = lseg; 833 *lgp->lsegpp = get_lseg(lseg);
812 pnfs_insert_layout(lo, lseg); 834 pnfs_insert_layout(lo, lseg);
813 835
814 if (res->return_on_close) { 836 if (res->return_on_close) {
@@ -829,137 +851,97 @@ out_forget_reply:
829 goto out; 851 goto out;
830} 852}
831 853
832/* 854static int pnfs_read_pg_test(struct nfs_pageio_descriptor *pgio,
833 * Device ID cache. Currently supports one layout type per struct nfs_client. 855 struct nfs_page *prev,
834 * Add layout type to the lookup key to expand to support multiple types. 856 struct nfs_page *req)
835 */
836int
837pnfs_alloc_init_deviceid_cache(struct nfs_client *clp,
838 void (*free_callback)(struct pnfs_deviceid_node *))
839{ 857{
840 struct pnfs_deviceid_cache *c; 858 if (pgio->pg_count == prev->wb_bytes) {
841 859 /* This is first coelesce call for a series of nfs_pages */
842 c = kzalloc(sizeof(struct pnfs_deviceid_cache), GFP_KERNEL); 860 pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
843 if (!c) 861 prev->wb_context,
844 return -ENOMEM; 862 IOMODE_READ);
845 spin_lock(&clp->cl_lock);
846 if (clp->cl_devid_cache != NULL) {
847 atomic_inc(&clp->cl_devid_cache->dc_ref);
848 dprintk("%s [kref [%d]]\n", __func__,
849 atomic_read(&clp->cl_devid_cache->dc_ref));
850 kfree(c);
851 } else {
852 /* kzalloc initializes hlists */
853 spin_lock_init(&c->dc_lock);
854 atomic_set(&c->dc_ref, 1);
855 c->dc_free_callback = free_callback;
856 clp->cl_devid_cache = c;
857 dprintk("%s [new]\n", __func__);
858 } 863 }
859 spin_unlock(&clp->cl_lock); 864 return NFS_SERVER(pgio->pg_inode)->pnfs_curr_ld->pg_test(pgio, prev, req);
860 return 0;
861} 865}
862EXPORT_SYMBOL_GPL(pnfs_alloc_init_deviceid_cache);
863 866
864/*
865 * Called from pnfs_layoutdriver_type->free_lseg
866 * last layout segment reference frees deviceid
867 */
868void 867void
869pnfs_put_deviceid(struct pnfs_deviceid_cache *c, 868pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *inode)
870 struct pnfs_deviceid_node *devid)
871{ 869{
872 struct nfs4_deviceid *id = &devid->de_id; 870 struct pnfs_layoutdriver_type *ld;
873 struct pnfs_deviceid_node *d;
874 struct hlist_node *n;
875 long h = nfs4_deviceid_hash(id);
876 871
877 dprintk("%s [%d]\n", __func__, atomic_read(&devid->de_ref)); 872 ld = NFS_SERVER(inode)->pnfs_curr_ld;
878 if (!atomic_dec_and_lock(&devid->de_ref, &c->dc_lock)) 873 pgio->pg_test = (ld && ld->pg_test) ? pnfs_read_pg_test : NULL;
879 return; 874}
880 875
881 hlist_for_each_entry_rcu(d, n, &c->dc_deviceids[h], de_node) 876static int pnfs_write_pg_test(struct nfs_pageio_descriptor *pgio,
882 if (!memcmp(&d->de_id, id, sizeof(*id))) { 877 struct nfs_page *prev,
883 hlist_del_rcu(&d->de_node); 878 struct nfs_page *req)
884 spin_unlock(&c->dc_lock); 879{
885 synchronize_rcu(); 880 if (pgio->pg_count == prev->wb_bytes) {
886 c->dc_free_callback(devid); 881 /* This is first coelesce call for a series of nfs_pages */
887 return; 882 pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
888 } 883 prev->wb_context,
889 spin_unlock(&c->dc_lock); 884 IOMODE_RW);
890 /* Why wasn't it found in the list? */
891 BUG();
892}
893EXPORT_SYMBOL_GPL(pnfs_put_deviceid);
894
895/* Find and reference a deviceid */
896struct pnfs_deviceid_node *
897pnfs_find_get_deviceid(struct pnfs_deviceid_cache *c, struct nfs4_deviceid *id)
898{
899 struct pnfs_deviceid_node *d;
900 struct hlist_node *n;
901 long hash = nfs4_deviceid_hash(id);
902
903 dprintk("--> %s hash %ld\n", __func__, hash);
904 rcu_read_lock();
905 hlist_for_each_entry_rcu(d, n, &c->dc_deviceids[hash], de_node) {
906 if (!memcmp(&d->de_id, id, sizeof(*id))) {
907 if (!atomic_inc_not_zero(&d->de_ref)) {
908 goto fail;
909 } else {
910 rcu_read_unlock();
911 return d;
912 }
913 }
914 } 885 }
915fail: 886 return NFS_SERVER(pgio->pg_inode)->pnfs_curr_ld->pg_test(pgio, prev, req);
916 rcu_read_unlock(); 887}
917 return NULL; 888
889void
890pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *inode)
891{
892 struct pnfs_layoutdriver_type *ld;
893
894 ld = NFS_SERVER(inode)->pnfs_curr_ld;
895 pgio->pg_test = (ld && ld->pg_test) ? pnfs_write_pg_test : NULL;
896}
897
898enum pnfs_try_status
899pnfs_try_to_write_data(struct nfs_write_data *wdata,
900 const struct rpc_call_ops *call_ops, int how)
901{
902 struct inode *inode = wdata->inode;
903 enum pnfs_try_status trypnfs;
904 struct nfs_server *nfss = NFS_SERVER(inode);
905
906 wdata->mds_ops = call_ops;
907
908 dprintk("%s: Writing ino:%lu %u@%llu (how %d)\n", __func__,
909 inode->i_ino, wdata->args.count, wdata->args.offset, how);
910
911 trypnfs = nfss->pnfs_curr_ld->write_pagelist(wdata, how);
912 if (trypnfs == PNFS_NOT_ATTEMPTED) {
913 put_lseg(wdata->lseg);
914 wdata->lseg = NULL;
915 } else
916 nfs_inc_stats(inode, NFSIOS_PNFS_WRITE);
917
918 dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs);
919 return trypnfs;
918} 920}
919EXPORT_SYMBOL_GPL(pnfs_find_get_deviceid);
920 921
921/* 922/*
922 * Add a deviceid to the cache. 923 * Call the appropriate parallel I/O subsystem read function.
923 * GETDEVICEINFOs for same deviceid can race. If deviceid is found, discard new
924 */ 924 */
925struct pnfs_deviceid_node * 925enum pnfs_try_status
926pnfs_add_deviceid(struct pnfs_deviceid_cache *c, struct pnfs_deviceid_node *new) 926pnfs_try_to_read_data(struct nfs_read_data *rdata,
927{ 927 const struct rpc_call_ops *call_ops)
928 struct pnfs_deviceid_node *d;
929 long hash = nfs4_deviceid_hash(&new->de_id);
930
931 dprintk("--> %s hash %ld\n", __func__, hash);
932 spin_lock(&c->dc_lock);
933 d = pnfs_find_get_deviceid(c, &new->de_id);
934 if (d) {
935 spin_unlock(&c->dc_lock);
936 dprintk("%s [discard]\n", __func__);
937 c->dc_free_callback(new);
938 return d;
939 }
940 INIT_HLIST_NODE(&new->de_node);
941 atomic_set(&new->de_ref, 1);
942 hlist_add_head_rcu(&new->de_node, &c->dc_deviceids[hash]);
943 spin_unlock(&c->dc_lock);
944 dprintk("%s [new]\n", __func__);
945 return new;
946}
947EXPORT_SYMBOL_GPL(pnfs_add_deviceid);
948
949void
950pnfs_put_deviceid_cache(struct nfs_client *clp)
951{ 928{
952 struct pnfs_deviceid_cache *local = clp->cl_devid_cache; 929 struct inode *inode = rdata->inode;
930 struct nfs_server *nfss = NFS_SERVER(inode);
931 enum pnfs_try_status trypnfs;
953 932
954 dprintk("--> %s cl_devid_cache %p\n", __func__, clp->cl_devid_cache); 933 rdata->mds_ops = call_ops;
955 if (atomic_dec_and_lock(&local->dc_ref, &clp->cl_lock)) { 934
956 int i; 935 dprintk("%s: Reading ino:%lu %u@%llu\n",
957 /* Verify cache is empty */ 936 __func__, inode->i_ino, rdata->args.count, rdata->args.offset);
958 for (i = 0; i < NFS4_DEVICE_ID_HASH_SIZE; i++) 937
959 BUG_ON(!hlist_empty(&local->dc_deviceids[i])); 938 trypnfs = nfss->pnfs_curr_ld->read_pagelist(rdata);
960 clp->cl_devid_cache = NULL; 939 if (trypnfs == PNFS_NOT_ATTEMPTED) {
961 spin_unlock(&clp->cl_lock); 940 put_lseg(rdata->lseg);
962 kfree(local); 941 rdata->lseg = NULL;
942 } else {
943 nfs_inc_stats(inode, NFSIOS_PNFS_READ);
963 } 944 }
945 dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs);
946 return trypnfs;
964} 947}
965EXPORT_SYMBOL_GPL(pnfs_put_deviceid_cache);
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index e2612ea0cbed..6380b9405bcd 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -30,6 +30,8 @@
30#ifndef FS_NFS_PNFS_H 30#ifndef FS_NFS_PNFS_H
31#define FS_NFS_PNFS_H 31#define FS_NFS_PNFS_H
32 32
33#include <linux/nfs_page.h>
34
33enum { 35enum {
34 NFS_LSEG_VALID = 0, /* cleared when lseg is recalled/returned */ 36 NFS_LSEG_VALID = 0, /* cleared when lseg is recalled/returned */
35 NFS_LSEG_ROC, /* roc bit received from server */ 37 NFS_LSEG_ROC, /* roc bit received from server */
@@ -43,6 +45,11 @@ struct pnfs_layout_segment {
43 struct pnfs_layout_hdr *pls_layout; 45 struct pnfs_layout_hdr *pls_layout;
44}; 46};
45 47
48enum pnfs_try_status {
49 PNFS_ATTEMPTED = 0,
50 PNFS_NOT_ATTEMPTED = 1,
51};
52
46#ifdef CONFIG_NFS_V4_1 53#ifdef CONFIG_NFS_V4_1
47 54
48#define LAYOUT_NFSV4_1_MODULE_PREFIX "nfs-layouttype4" 55#define LAYOUT_NFSV4_1_MODULE_PREFIX "nfs-layouttype4"
@@ -61,10 +68,18 @@ struct pnfs_layoutdriver_type {
61 const u32 id; 68 const u32 id;
62 const char *name; 69 const char *name;
63 struct module *owner; 70 struct module *owner;
64 int (*set_layoutdriver) (struct nfs_server *);
65 int (*clear_layoutdriver) (struct nfs_server *);
66 struct pnfs_layout_segment * (*alloc_lseg) (struct pnfs_layout_hdr *layoutid, struct nfs4_layoutget_res *lgr); 71 struct pnfs_layout_segment * (*alloc_lseg) (struct pnfs_layout_hdr *layoutid, struct nfs4_layoutget_res *lgr);
67 void (*free_lseg) (struct pnfs_layout_segment *lseg); 72 void (*free_lseg) (struct pnfs_layout_segment *lseg);
73
74 /* test for nfs page cache coalescing */
75 int (*pg_test)(struct nfs_pageio_descriptor *, struct nfs_page *, struct nfs_page *);
76
77 /*
78 * Return PNFS_ATTEMPTED to indicate the layout code has attempted
79 * I/O, else return PNFS_NOT_ATTEMPTED to fall back to normal NFS
80 */
81 enum pnfs_try_status (*read_pagelist) (struct nfs_read_data *nfs_data);
82 enum pnfs_try_status (*write_pagelist) (struct nfs_write_data *nfs_data, int how);
68}; 83};
69 84
70struct pnfs_layout_hdr { 85struct pnfs_layout_hdr {
@@ -90,52 +105,6 @@ struct pnfs_device {
90 unsigned int pglen; 105 unsigned int pglen;
91}; 106};
92 107
93/*
94 * Device ID RCU cache. A device ID is unique per client ID and layout type.
95 */
96#define NFS4_DEVICE_ID_HASH_BITS 5
97#define NFS4_DEVICE_ID_HASH_SIZE (1 << NFS4_DEVICE_ID_HASH_BITS)
98#define NFS4_DEVICE_ID_HASH_MASK (NFS4_DEVICE_ID_HASH_SIZE - 1)
99
100static inline u32
101nfs4_deviceid_hash(struct nfs4_deviceid *id)
102{
103 unsigned char *cptr = (unsigned char *)id->data;
104 unsigned int nbytes = NFS4_DEVICEID4_SIZE;
105 u32 x = 0;
106
107 while (nbytes--) {
108 x *= 37;
109 x += *cptr++;
110 }
111 return x & NFS4_DEVICE_ID_HASH_MASK;
112}
113
114struct pnfs_deviceid_node {
115 struct hlist_node de_node;
116 struct nfs4_deviceid de_id;
117 atomic_t de_ref;
118};
119
120struct pnfs_deviceid_cache {
121 spinlock_t dc_lock;
122 atomic_t dc_ref;
123 void (*dc_free_callback)(struct pnfs_deviceid_node *);
124 struct hlist_head dc_deviceids[NFS4_DEVICE_ID_HASH_SIZE];
125};
126
127extern int pnfs_alloc_init_deviceid_cache(struct nfs_client *,
128 void (*free_callback)(struct pnfs_deviceid_node *));
129extern void pnfs_put_deviceid_cache(struct nfs_client *);
130extern struct pnfs_deviceid_node *pnfs_find_get_deviceid(
131 struct pnfs_deviceid_cache *,
132 struct nfs4_deviceid *);
133extern struct pnfs_deviceid_node *pnfs_add_deviceid(
134 struct pnfs_deviceid_cache *,
135 struct pnfs_deviceid_node *);
136extern void pnfs_put_deviceid(struct pnfs_deviceid_cache *c,
137 struct pnfs_deviceid_node *devid);
138
139extern int pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *); 108extern int pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *);
140extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *); 109extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *);
141 110
@@ -146,11 +115,18 @@ extern int nfs4_proc_layoutget(struct nfs4_layoutget *lgp);
146 115
147/* pnfs.c */ 116/* pnfs.c */
148void get_layout_hdr(struct pnfs_layout_hdr *lo); 117void get_layout_hdr(struct pnfs_layout_hdr *lo);
118void put_lseg(struct pnfs_layout_segment *lseg);
149struct pnfs_layout_segment * 119struct pnfs_layout_segment *
150pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx, 120pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx,
151 enum pnfs_iomode access_type); 121 enum pnfs_iomode access_type);
152void set_pnfs_layoutdriver(struct nfs_server *, u32 id); 122void set_pnfs_layoutdriver(struct nfs_server *, u32 id);
153void unset_pnfs_layoutdriver(struct nfs_server *); 123void unset_pnfs_layoutdriver(struct nfs_server *);
124enum pnfs_try_status pnfs_try_to_write_data(struct nfs_write_data *,
125 const struct rpc_call_ops *, int);
126enum pnfs_try_status pnfs_try_to_read_data(struct nfs_read_data *,
127 const struct rpc_call_ops *);
128void pnfs_pageio_init_read(struct nfs_pageio_descriptor *, struct inode *);
129void pnfs_pageio_init_write(struct nfs_pageio_descriptor *, struct inode *);
154int pnfs_layout_process(struct nfs4_layoutget *lgp); 130int pnfs_layout_process(struct nfs4_layoutget *lgp);
155void pnfs_free_lseg_list(struct list_head *tmp_list); 131void pnfs_free_lseg_list(struct list_head *tmp_list);
156void pnfs_destroy_layout(struct nfs_inode *); 132void pnfs_destroy_layout(struct nfs_inode *);
@@ -177,6 +153,16 @@ static inline int lo_fail_bit(u32 iomode)
177 NFS_LAYOUT_RW_FAILED : NFS_LAYOUT_RO_FAILED; 153 NFS_LAYOUT_RW_FAILED : NFS_LAYOUT_RO_FAILED;
178} 154}
179 155
156static inline struct pnfs_layout_segment *
157get_lseg(struct pnfs_layout_segment *lseg)
158{
159 if (lseg) {
160 atomic_inc(&lseg->pls_refcount);
161 smp_mb__after_atomic_inc();
162 }
163 return lseg;
164}
165
180/* Return true if a layout driver is being used for this mountpoint */ 166/* Return true if a layout driver is being used for this mountpoint */
181static inline int pnfs_enabled_sb(struct nfs_server *nfss) 167static inline int pnfs_enabled_sb(struct nfs_server *nfss)
182{ 168{
@@ -194,12 +180,36 @@ static inline void pnfs_destroy_layout(struct nfs_inode *nfsi)
194} 180}
195 181
196static inline struct pnfs_layout_segment * 182static inline struct pnfs_layout_segment *
183get_lseg(struct pnfs_layout_segment *lseg)
184{
185 return NULL;
186}
187
188static inline void put_lseg(struct pnfs_layout_segment *lseg)
189{
190}
191
192static inline struct pnfs_layout_segment *
197pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx, 193pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx,
198 enum pnfs_iomode access_type) 194 enum pnfs_iomode access_type)
199{ 195{
200 return NULL; 196 return NULL;
201} 197}
202 198
199static inline enum pnfs_try_status
200pnfs_try_to_read_data(struct nfs_read_data *data,
201 const struct rpc_call_ops *call_ops)
202{
203 return PNFS_NOT_ATTEMPTED;
204}
205
206static inline enum pnfs_try_status
207pnfs_try_to_write_data(struct nfs_write_data *data,
208 const struct rpc_call_ops *call_ops, int how)
209{
210 return PNFS_NOT_ATTEMPTED;
211}
212
203static inline bool 213static inline bool
204pnfs_roc(struct inode *ino) 214pnfs_roc(struct inode *ino)
205{ 215{
@@ -230,6 +240,18 @@ static inline void unset_pnfs_layoutdriver(struct nfs_server *s)
230{ 240{
231} 241}
232 242
243static inline void
244pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *ino)
245{
246 pgio->pg_test = NULL;
247}
248
249static inline void
250pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *ino)
251{
252 pgio->pg_test = NULL;
253}
254
233#endif /* CONFIG_NFS_V4_1 */ 255#endif /* CONFIG_NFS_V4_1 */
234 256
235#endif /* FS_NFS_PNFS_H */ 257#endif /* FS_NFS_PNFS_H */
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index 77d5e21c4ad6..b8ec170f2a0f 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -741,4 +741,5 @@ const struct nfs_rpc_ops nfs_v2_clientops = {
741 .lock = nfs_proc_lock, 741 .lock = nfs_proc_lock,
742 .lock_check_bounds = nfs_lock_check_bounds, 742 .lock_check_bounds = nfs_lock_check_bounds,
743 .close_context = nfs_close_context, 743 .close_context = nfs_close_context,
744 .init_client = nfs_init_client,
744}; 745};
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index aedcaa7f291f..7cded2b12a05 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -18,19 +18,20 @@
18#include <linux/sunrpc/clnt.h> 18#include <linux/sunrpc/clnt.h>
19#include <linux/nfs_fs.h> 19#include <linux/nfs_fs.h>
20#include <linux/nfs_page.h> 20#include <linux/nfs_page.h>
21#include <linux/module.h>
21 22
22#include <asm/system.h> 23#include <asm/system.h>
24#include "pnfs.h"
23 25
24#include "nfs4_fs.h" 26#include "nfs4_fs.h"
25#include "internal.h" 27#include "internal.h"
26#include "iostat.h" 28#include "iostat.h"
27#include "fscache.h" 29#include "fscache.h"
28#include "pnfs.h"
29 30
30#define NFSDBG_FACILITY NFSDBG_PAGECACHE 31#define NFSDBG_FACILITY NFSDBG_PAGECACHE
31 32
32static int nfs_pagein_multi(struct inode *, struct list_head *, unsigned int, size_t, int); 33static int nfs_pagein_multi(struct nfs_pageio_descriptor *desc);
33static int nfs_pagein_one(struct inode *, struct list_head *, unsigned int, size_t, int); 34static int nfs_pagein_one(struct nfs_pageio_descriptor *desc);
34static const struct rpc_call_ops nfs_read_partial_ops; 35static const struct rpc_call_ops nfs_read_partial_ops;
35static const struct rpc_call_ops nfs_read_full_ops; 36static const struct rpc_call_ops nfs_read_full_ops;
36 37
@@ -69,6 +70,7 @@ void nfs_readdata_free(struct nfs_read_data *p)
69 70
70static void nfs_readdata_release(struct nfs_read_data *rdata) 71static void nfs_readdata_release(struct nfs_read_data *rdata)
71{ 72{
73 put_lseg(rdata->lseg);
72 put_nfs_open_context(rdata->args.context); 74 put_nfs_open_context(rdata->args.context);
73 nfs_readdata_free(rdata); 75 nfs_readdata_free(rdata);
74} 76}
@@ -114,14 +116,13 @@ static void nfs_readpage_truncate_uninitialised_page(struct nfs_read_data *data)
114int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode, 116int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
115 struct page *page) 117 struct page *page)
116{ 118{
117 LIST_HEAD(one_request);
118 struct nfs_page *new; 119 struct nfs_page *new;
119 unsigned int len; 120 unsigned int len;
121 struct nfs_pageio_descriptor pgio;
120 122
121 len = nfs_page_length(page); 123 len = nfs_page_length(page);
122 if (len == 0) 124 if (len == 0)
123 return nfs_return_empty_page(page); 125 return nfs_return_empty_page(page);
124 pnfs_update_layout(inode, ctx, IOMODE_READ);
125 new = nfs_create_request(ctx, inode, page, 0, len); 126 new = nfs_create_request(ctx, inode, page, 0, len);
126 if (IS_ERR(new)) { 127 if (IS_ERR(new)) {
127 unlock_page(page); 128 unlock_page(page);
@@ -130,11 +131,14 @@ int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
130 if (len < PAGE_CACHE_SIZE) 131 if (len < PAGE_CACHE_SIZE)
131 zero_user_segment(page, len, PAGE_CACHE_SIZE); 132 zero_user_segment(page, len, PAGE_CACHE_SIZE);
132 133
133 nfs_list_add_request(new, &one_request); 134 nfs_pageio_init(&pgio, inode, NULL, 0, 0);
135 nfs_list_add_request(new, &pgio.pg_list);
136 pgio.pg_count = len;
137
134 if (NFS_SERVER(inode)->rsize < PAGE_CACHE_SIZE) 138 if (NFS_SERVER(inode)->rsize < PAGE_CACHE_SIZE)
135 nfs_pagein_multi(inode, &one_request, 1, len, 0); 139 nfs_pagein_multi(&pgio);
136 else 140 else
137 nfs_pagein_one(inode, &one_request, 1, len, 0); 141 nfs_pagein_one(&pgio);
138 return 0; 142 return 0;
139} 143}
140 144
@@ -155,24 +159,20 @@ static void nfs_readpage_release(struct nfs_page *req)
155 nfs_release_request(req); 159 nfs_release_request(req);
156} 160}
157 161
158/* 162int nfs_initiate_read(struct nfs_read_data *data, struct rpc_clnt *clnt,
159 * Set up the NFS read request struct 163 const struct rpc_call_ops *call_ops)
160 */
161static int nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data,
162 const struct rpc_call_ops *call_ops,
163 unsigned int count, unsigned int offset)
164{ 164{
165 struct inode *inode = req->wb_context->path.dentry->d_inode; 165 struct inode *inode = data->inode;
166 int swap_flags = IS_SWAPFILE(inode) ? NFS_RPC_SWAPFLAGS : 0; 166 int swap_flags = IS_SWAPFILE(inode) ? NFS_RPC_SWAPFLAGS : 0;
167 struct rpc_task *task; 167 struct rpc_task *task;
168 struct rpc_message msg = { 168 struct rpc_message msg = {
169 .rpc_argp = &data->args, 169 .rpc_argp = &data->args,
170 .rpc_resp = &data->res, 170 .rpc_resp = &data->res,
171 .rpc_cred = req->wb_context->cred, 171 .rpc_cred = data->cred,
172 }; 172 };
173 struct rpc_task_setup task_setup_data = { 173 struct rpc_task_setup task_setup_data = {
174 .task = &data->task, 174 .task = &data->task,
175 .rpc_client = NFS_CLIENT(inode), 175 .rpc_client = clnt,
176 .rpc_message = &msg, 176 .rpc_message = &msg,
177 .callback_ops = call_ops, 177 .callback_ops = call_ops,
178 .callback_data = data, 178 .callback_data = data,
@@ -180,9 +180,39 @@ static int nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data,
180 .flags = RPC_TASK_ASYNC | swap_flags, 180 .flags = RPC_TASK_ASYNC | swap_flags,
181 }; 181 };
182 182
183 /* Set up the initial task struct. */
184 NFS_PROTO(inode)->read_setup(data, &msg);
185
186 dprintk("NFS: %5u initiated read call (req %s/%lld, %u bytes @ "
187 "offset %llu)\n",
188 data->task.tk_pid,
189 inode->i_sb->s_id,
190 (long long)NFS_FILEID(inode),
191 data->args.count,
192 (unsigned long long)data->args.offset);
193
194 task = rpc_run_task(&task_setup_data);
195 if (IS_ERR(task))
196 return PTR_ERR(task);
197 rpc_put_task(task);
198 return 0;
199}
200EXPORT_SYMBOL_GPL(nfs_initiate_read);
201
202/*
203 * Set up the NFS read request struct
204 */
205static int nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data,
206 const struct rpc_call_ops *call_ops,
207 unsigned int count, unsigned int offset,
208 struct pnfs_layout_segment *lseg)
209{
210 struct inode *inode = req->wb_context->path.dentry->d_inode;
211
183 data->req = req; 212 data->req = req;
184 data->inode = inode; 213 data->inode = inode;
185 data->cred = msg.rpc_cred; 214 data->cred = req->wb_context->cred;
215 data->lseg = get_lseg(lseg);
186 216
187 data->args.fh = NFS_FH(inode); 217 data->args.fh = NFS_FH(inode);
188 data->args.offset = req_offset(req) + offset; 218 data->args.offset = req_offset(req) + offset;
@@ -197,21 +227,11 @@ static int nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data,
197 data->res.eof = 0; 227 data->res.eof = 0;
198 nfs_fattr_init(&data->fattr); 228 nfs_fattr_init(&data->fattr);
199 229
200 /* Set up the initial task struct. */ 230 if (data->lseg &&
201 NFS_PROTO(inode)->read_setup(data, &msg); 231 (pnfs_try_to_read_data(data, call_ops) == PNFS_ATTEMPTED))
202 232 return 0;
203 dprintk("NFS: %5u initiated read call (req %s/%Ld, %u bytes @ offset %Lu)\n",
204 data->task.tk_pid,
205 inode->i_sb->s_id,
206 (long long)NFS_FILEID(inode),
207 count,
208 (unsigned long long)data->args.offset);
209 233
210 task = rpc_run_task(&task_setup_data); 234 return nfs_initiate_read(data, NFS_CLIENT(inode), call_ops);
211 if (IS_ERR(task))
212 return PTR_ERR(task);
213 rpc_put_task(task);
214 return 0;
215} 235}
216 236
217static void 237static void
@@ -240,20 +260,21 @@ nfs_async_read_error(struct list_head *head)
240 * won't see the new data until our attribute cache is updated. This is more 260 * won't see the new data until our attribute cache is updated. This is more
241 * or less conventional NFS client behavior. 261 * or less conventional NFS client behavior.
242 */ 262 */
243static int nfs_pagein_multi(struct inode *inode, struct list_head *head, unsigned int npages, size_t count, int flags) 263static int nfs_pagein_multi(struct nfs_pageio_descriptor *desc)
244{ 264{
245 struct nfs_page *req = nfs_list_entry(head->next); 265 struct nfs_page *req = nfs_list_entry(desc->pg_list.next);
246 struct page *page = req->wb_page; 266 struct page *page = req->wb_page;
247 struct nfs_read_data *data; 267 struct nfs_read_data *data;
248 size_t rsize = NFS_SERVER(inode)->rsize, nbytes; 268 size_t rsize = NFS_SERVER(desc->pg_inode)->rsize, nbytes;
249 unsigned int offset; 269 unsigned int offset;
250 int requests = 0; 270 int requests = 0;
251 int ret = 0; 271 int ret = 0;
272 struct pnfs_layout_segment *lseg;
252 LIST_HEAD(list); 273 LIST_HEAD(list);
253 274
254 nfs_list_remove_request(req); 275 nfs_list_remove_request(req);
255 276
256 nbytes = count; 277 nbytes = desc->pg_count;
257 do { 278 do {
258 size_t len = min(nbytes,rsize); 279 size_t len = min(nbytes,rsize);
259 280
@@ -266,9 +287,11 @@ static int nfs_pagein_multi(struct inode *inode, struct list_head *head, unsigne
266 } while(nbytes != 0); 287 } while(nbytes != 0);
267 atomic_set(&req->wb_complete, requests); 288 atomic_set(&req->wb_complete, requests);
268 289
290 BUG_ON(desc->pg_lseg != NULL);
291 lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, IOMODE_READ);
269 ClearPageError(page); 292 ClearPageError(page);
270 offset = 0; 293 offset = 0;
271 nbytes = count; 294 nbytes = desc->pg_count;
272 do { 295 do {
273 int ret2; 296 int ret2;
274 297
@@ -280,12 +303,14 @@ static int nfs_pagein_multi(struct inode *inode, struct list_head *head, unsigne
280 if (nbytes < rsize) 303 if (nbytes < rsize)
281 rsize = nbytes; 304 rsize = nbytes;
282 ret2 = nfs_read_rpcsetup(req, data, &nfs_read_partial_ops, 305 ret2 = nfs_read_rpcsetup(req, data, &nfs_read_partial_ops,
283 rsize, offset); 306 rsize, offset, lseg);
284 if (ret == 0) 307 if (ret == 0)
285 ret = ret2; 308 ret = ret2;
286 offset += rsize; 309 offset += rsize;
287 nbytes -= rsize; 310 nbytes -= rsize;
288 } while (nbytes != 0); 311 } while (nbytes != 0);
312 put_lseg(lseg);
313 desc->pg_lseg = NULL;
289 314
290 return ret; 315 return ret;
291 316
@@ -300,16 +325,21 @@ out_bad:
300 return -ENOMEM; 325 return -ENOMEM;
301} 326}
302 327
303static int nfs_pagein_one(struct inode *inode, struct list_head *head, unsigned int npages, size_t count, int flags) 328static int nfs_pagein_one(struct nfs_pageio_descriptor *desc)
304{ 329{
305 struct nfs_page *req; 330 struct nfs_page *req;
306 struct page **pages; 331 struct page **pages;
307 struct nfs_read_data *data; 332 struct nfs_read_data *data;
333 struct list_head *head = &desc->pg_list;
334 struct pnfs_layout_segment *lseg = desc->pg_lseg;
308 int ret = -ENOMEM; 335 int ret = -ENOMEM;
309 336
310 data = nfs_readdata_alloc(npages); 337 data = nfs_readdata_alloc(nfs_page_array_len(desc->pg_base,
311 if (!data) 338 desc->pg_count));
312 goto out_bad; 339 if (!data) {
340 nfs_async_read_error(head);
341 goto out;
342 }
313 343
314 pages = data->pagevec; 344 pages = data->pagevec;
315 while (!list_empty(head)) { 345 while (!list_empty(head)) {
@@ -320,10 +350,14 @@ static int nfs_pagein_one(struct inode *inode, struct list_head *head, unsigned
320 *pages++ = req->wb_page; 350 *pages++ = req->wb_page;
321 } 351 }
322 req = nfs_list_entry(data->pages.next); 352 req = nfs_list_entry(data->pages.next);
353 if ((!lseg) && list_is_singular(&data->pages))
354 lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, IOMODE_READ);
323 355
324 return nfs_read_rpcsetup(req, data, &nfs_read_full_ops, count, 0); 356 ret = nfs_read_rpcsetup(req, data, &nfs_read_full_ops, desc->pg_count,
325out_bad: 357 0, lseg);
326 nfs_async_read_error(head); 358out:
359 put_lseg(lseg);
360 desc->pg_lseg = NULL;
327 return ret; 361 return ret;
328} 362}
329 363
@@ -366,6 +400,7 @@ static void nfs_readpage_retry(struct rpc_task *task, struct nfs_read_data *data
366 return; 400 return;
367 401
368 /* Yes, so retry the read at the end of the data */ 402 /* Yes, so retry the read at the end of the data */
403 data->mds_offset += resp->count;
369 argp->offset += resp->count; 404 argp->offset += resp->count;
370 argp->pgbase += resp->count; 405 argp->pgbase += resp->count;
371 argp->count -= resp->count; 406 argp->count -= resp->count;
@@ -625,7 +660,7 @@ int nfs_readpages(struct file *filp, struct address_space *mapping,
625 if (ret == 0) 660 if (ret == 0)
626 goto read_complete; /* all pages were read */ 661 goto read_complete; /* all pages were read */
627 662
628 pnfs_update_layout(inode, desc.ctx, IOMODE_READ); 663 pnfs_pageio_init_read(&pgio, inode);
629 if (rsize < PAGE_CACHE_SIZE) 664 if (rsize < PAGE_CACHE_SIZE)
630 nfs_pageio_init(&pgio, inode, nfs_pagein_multi, rsize, 0); 665 nfs_pageio_init(&pgio, inode, nfs_pagein_multi, rsize, 0);
631 else 666 else
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index b68c8607770f..2b8e9a5e366a 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -263,8 +263,11 @@ static match_table_t nfs_local_lock_tokens = {
263static void nfs_umount_begin(struct super_block *); 263static void nfs_umount_begin(struct super_block *);
264static int nfs_statfs(struct dentry *, struct kstatfs *); 264static int nfs_statfs(struct dentry *, struct kstatfs *);
265static int nfs_show_options(struct seq_file *, struct vfsmount *); 265static int nfs_show_options(struct seq_file *, struct vfsmount *);
266static int nfs_show_devname(struct seq_file *, struct vfsmount *);
267static int nfs_show_path(struct seq_file *, struct vfsmount *);
266static int nfs_show_stats(struct seq_file *, struct vfsmount *); 268static int nfs_show_stats(struct seq_file *, struct vfsmount *);
267static int nfs_get_sb(struct file_system_type *, int, const char *, void *, struct vfsmount *); 269static struct dentry *nfs_fs_mount(struct file_system_type *,
270 int, const char *, void *);
268static struct dentry *nfs_xdev_mount(struct file_system_type *fs_type, 271static struct dentry *nfs_xdev_mount(struct file_system_type *fs_type,
269 int flags, const char *dev_name, void *raw_data); 272 int flags, const char *dev_name, void *raw_data);
270static void nfs_put_super(struct super_block *); 273static void nfs_put_super(struct super_block *);
@@ -274,7 +277,7 @@ static int nfs_remount(struct super_block *sb, int *flags, char *raw_data);
274static struct file_system_type nfs_fs_type = { 277static struct file_system_type nfs_fs_type = {
275 .owner = THIS_MODULE, 278 .owner = THIS_MODULE,
276 .name = "nfs", 279 .name = "nfs",
277 .get_sb = nfs_get_sb, 280 .mount = nfs_fs_mount,
278 .kill_sb = nfs_kill_super, 281 .kill_sb = nfs_kill_super,
279 .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA, 282 .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
280}; 283};
@@ -296,6 +299,8 @@ static const struct super_operations nfs_sops = {
296 .evict_inode = nfs_evict_inode, 299 .evict_inode = nfs_evict_inode,
297 .umount_begin = nfs_umount_begin, 300 .umount_begin = nfs_umount_begin,
298 .show_options = nfs_show_options, 301 .show_options = nfs_show_options,
302 .show_devname = nfs_show_devname,
303 .show_path = nfs_show_path,
299 .show_stats = nfs_show_stats, 304 .show_stats = nfs_show_stats,
300 .remount_fs = nfs_remount, 305 .remount_fs = nfs_remount,
301}; 306};
@@ -303,16 +308,16 @@ static const struct super_operations nfs_sops = {
303#ifdef CONFIG_NFS_V4 308#ifdef CONFIG_NFS_V4
304static int nfs4_validate_text_mount_data(void *options, 309static int nfs4_validate_text_mount_data(void *options,
305 struct nfs_parsed_mount_data *args, const char *dev_name); 310 struct nfs_parsed_mount_data *args, const char *dev_name);
306static int nfs4_try_mount(int flags, const char *dev_name, 311static struct dentry *nfs4_try_mount(int flags, const char *dev_name,
307 struct nfs_parsed_mount_data *data, struct vfsmount *mnt); 312 struct nfs_parsed_mount_data *data);
308static int nfs4_get_sb(struct file_system_type *fs_type, 313static struct dentry *nfs4_mount(struct file_system_type *fs_type,
309 int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt); 314 int flags, const char *dev_name, void *raw_data);
310static struct dentry *nfs4_remote_mount(struct file_system_type *fs_type, 315static struct dentry *nfs4_remote_mount(struct file_system_type *fs_type,
311 int flags, const char *dev_name, void *raw_data); 316 int flags, const char *dev_name, void *raw_data);
312static struct dentry *nfs4_xdev_mount(struct file_system_type *fs_type, 317static struct dentry *nfs4_xdev_mount(struct file_system_type *fs_type,
313 int flags, const char *dev_name, void *raw_data); 318 int flags, const char *dev_name, void *raw_data);
314static int nfs4_referral_get_sb(struct file_system_type *fs_type, 319static struct dentry *nfs4_referral_mount(struct file_system_type *fs_type,
315 int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt); 320 int flags, const char *dev_name, void *raw_data);
316static struct dentry *nfs4_remote_referral_mount(struct file_system_type *fs_type, 321static struct dentry *nfs4_remote_referral_mount(struct file_system_type *fs_type,
317 int flags, const char *dev_name, void *raw_data); 322 int flags, const char *dev_name, void *raw_data);
318static void nfs4_kill_super(struct super_block *sb); 323static void nfs4_kill_super(struct super_block *sb);
@@ -320,7 +325,7 @@ static void nfs4_kill_super(struct super_block *sb);
320static struct file_system_type nfs4_fs_type = { 325static struct file_system_type nfs4_fs_type = {
321 .owner = THIS_MODULE, 326 .owner = THIS_MODULE,
322 .name = "nfs4", 327 .name = "nfs4",
323 .get_sb = nfs4_get_sb, 328 .mount = nfs4_mount,
324 .kill_sb = nfs4_kill_super, 329 .kill_sb = nfs4_kill_super,
325 .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA, 330 .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
326}; 331};
@@ -352,7 +357,7 @@ static struct file_system_type nfs4_remote_referral_fs_type = {
352struct file_system_type nfs4_referral_fs_type = { 357struct file_system_type nfs4_referral_fs_type = {
353 .owner = THIS_MODULE, 358 .owner = THIS_MODULE,
354 .name = "nfs4", 359 .name = "nfs4",
355 .get_sb = nfs4_referral_get_sb, 360 .mount = nfs4_referral_mount,
356 .kill_sb = nfs4_kill_super, 361 .kill_sb = nfs4_kill_super,
357 .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA, 362 .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
358}; 363};
@@ -366,6 +371,8 @@ static const struct super_operations nfs4_sops = {
366 .evict_inode = nfs4_evict_inode, 371 .evict_inode = nfs4_evict_inode,
367 .umount_begin = nfs_umount_begin, 372 .umount_begin = nfs_umount_begin,
368 .show_options = nfs_show_options, 373 .show_options = nfs_show_options,
374 .show_devname = nfs_show_devname,
375 .show_path = nfs_show_path,
369 .show_stats = nfs_show_stats, 376 .show_stats = nfs_show_stats,
370 .remount_fs = nfs_remount, 377 .remount_fs = nfs_remount,
371}; 378};
@@ -726,6 +733,28 @@ static int nfs_show_options(struct seq_file *m, struct vfsmount *mnt)
726 return 0; 733 return 0;
727} 734}
728 735
736static int nfs_show_devname(struct seq_file *m, struct vfsmount *mnt)
737{
738 char *page = (char *) __get_free_page(GFP_KERNEL);
739 char *devname, *dummy;
740 int err = 0;
741 if (!page)
742 return -ENOMEM;
743 devname = nfs_path(&dummy, mnt->mnt_root, page, PAGE_SIZE);
744 if (IS_ERR(devname))
745 err = PTR_ERR(devname);
746 else
747 seq_escape(m, devname, " \t\n\\");
748 free_page((unsigned long)page);
749 return err;
750}
751
752static int nfs_show_path(struct seq_file *m, struct vfsmount *mnt)
753{
754 seq_puts(m, "/");
755 return 0;
756}
757
729/* 758/*
730 * Present statistical information for this VFS mountpoint 759 * Present statistical information for this VFS mountpoint
731 */ 760 */
@@ -979,6 +1008,27 @@ static int nfs_parse_security_flavors(char *value,
979 return 1; 1008 return 1;
980} 1009}
981 1010
1011static int nfs_get_option_str(substring_t args[], char **option)
1012{
1013 kfree(*option);
1014 *option = match_strdup(args);
1015 return !option;
1016}
1017
1018static int nfs_get_option_ul(substring_t args[], unsigned long *option)
1019{
1020 int rc;
1021 char *string;
1022
1023 string = match_strdup(args);
1024 if (string == NULL)
1025 return -ENOMEM;
1026 rc = strict_strtoul(string, 10, option);
1027 kfree(string);
1028
1029 return rc;
1030}
1031
982/* 1032/*
983 * Error-check and convert a string of mount options from user space into 1033 * Error-check and convert a string of mount options from user space into
984 * a data structure. The whole mount string is processed; bad options are 1034 * a data structure. The whole mount string is processed; bad options are
@@ -1127,155 +1177,82 @@ static int nfs_parse_mount_options(char *raw,
1127 * options that take numeric values 1177 * options that take numeric values
1128 */ 1178 */
1129 case Opt_port: 1179 case Opt_port:
1130 string = match_strdup(args); 1180 if (nfs_get_option_ul(args, &option) ||
1131 if (string == NULL) 1181 option > USHRT_MAX)
1132 goto out_nomem;
1133 rc = strict_strtoul(string, 10, &option);
1134 kfree(string);
1135 if (rc != 0 || option > USHRT_MAX)
1136 goto out_invalid_value; 1182 goto out_invalid_value;
1137 mnt->nfs_server.port = option; 1183 mnt->nfs_server.port = option;
1138 break; 1184 break;
1139 case Opt_rsize: 1185 case Opt_rsize:
1140 string = match_strdup(args); 1186 if (nfs_get_option_ul(args, &option))
1141 if (string == NULL)
1142 goto out_nomem;
1143 rc = strict_strtoul(string, 10, &option);
1144 kfree(string);
1145 if (rc != 0)
1146 goto out_invalid_value; 1187 goto out_invalid_value;
1147 mnt->rsize = option; 1188 mnt->rsize = option;
1148 break; 1189 break;
1149 case Opt_wsize: 1190 case Opt_wsize:
1150 string = match_strdup(args); 1191 if (nfs_get_option_ul(args, &option))
1151 if (string == NULL)
1152 goto out_nomem;
1153 rc = strict_strtoul(string, 10, &option);
1154 kfree(string);
1155 if (rc != 0)
1156 goto out_invalid_value; 1192 goto out_invalid_value;
1157 mnt->wsize = option; 1193 mnt->wsize = option;
1158 break; 1194 break;
1159 case Opt_bsize: 1195 case Opt_bsize:
1160 string = match_strdup(args); 1196 if (nfs_get_option_ul(args, &option))
1161 if (string == NULL)
1162 goto out_nomem;
1163 rc = strict_strtoul(string, 10, &option);
1164 kfree(string);
1165 if (rc != 0)
1166 goto out_invalid_value; 1197 goto out_invalid_value;
1167 mnt->bsize = option; 1198 mnt->bsize = option;
1168 break; 1199 break;
1169 case Opt_timeo: 1200 case Opt_timeo:
1170 string = match_strdup(args); 1201 if (nfs_get_option_ul(args, &option) || option == 0)
1171 if (string == NULL)
1172 goto out_nomem;
1173 rc = strict_strtoul(string, 10, &option);
1174 kfree(string);
1175 if (rc != 0 || option == 0)
1176 goto out_invalid_value; 1202 goto out_invalid_value;
1177 mnt->timeo = option; 1203 mnt->timeo = option;
1178 break; 1204 break;
1179 case Opt_retrans: 1205 case Opt_retrans:
1180 string = match_strdup(args); 1206 if (nfs_get_option_ul(args, &option) || option == 0)
1181 if (string == NULL)
1182 goto out_nomem;
1183 rc = strict_strtoul(string, 10, &option);
1184 kfree(string);
1185 if (rc != 0 || option == 0)
1186 goto out_invalid_value; 1207 goto out_invalid_value;
1187 mnt->retrans = option; 1208 mnt->retrans = option;
1188 break; 1209 break;
1189 case Opt_acregmin: 1210 case Opt_acregmin:
1190 string = match_strdup(args); 1211 if (nfs_get_option_ul(args, &option))
1191 if (string == NULL)
1192 goto out_nomem;
1193 rc = strict_strtoul(string, 10, &option);
1194 kfree(string);
1195 if (rc != 0)
1196 goto out_invalid_value; 1212 goto out_invalid_value;
1197 mnt->acregmin = option; 1213 mnt->acregmin = option;
1198 break; 1214 break;
1199 case Opt_acregmax: 1215 case Opt_acregmax:
1200 string = match_strdup(args); 1216 if (nfs_get_option_ul(args, &option))
1201 if (string == NULL)
1202 goto out_nomem;
1203 rc = strict_strtoul(string, 10, &option);
1204 kfree(string);
1205 if (rc != 0)
1206 goto out_invalid_value; 1217 goto out_invalid_value;
1207 mnt->acregmax = option; 1218 mnt->acregmax = option;
1208 break; 1219 break;
1209 case Opt_acdirmin: 1220 case Opt_acdirmin:
1210 string = match_strdup(args); 1221 if (nfs_get_option_ul(args, &option))
1211 if (string == NULL)
1212 goto out_nomem;
1213 rc = strict_strtoul(string, 10, &option);
1214 kfree(string);
1215 if (rc != 0)
1216 goto out_invalid_value; 1222 goto out_invalid_value;
1217 mnt->acdirmin = option; 1223 mnt->acdirmin = option;
1218 break; 1224 break;
1219 case Opt_acdirmax: 1225 case Opt_acdirmax:
1220 string = match_strdup(args); 1226 if (nfs_get_option_ul(args, &option))
1221 if (string == NULL)
1222 goto out_nomem;
1223 rc = strict_strtoul(string, 10, &option);
1224 kfree(string);
1225 if (rc != 0)
1226 goto out_invalid_value; 1227 goto out_invalid_value;
1227 mnt->acdirmax = option; 1228 mnt->acdirmax = option;
1228 break; 1229 break;
1229 case Opt_actimeo: 1230 case Opt_actimeo:
1230 string = match_strdup(args); 1231 if (nfs_get_option_ul(args, &option))
1231 if (string == NULL)
1232 goto out_nomem;
1233 rc = strict_strtoul(string, 10, &option);
1234 kfree(string);
1235 if (rc != 0)
1236 goto out_invalid_value; 1232 goto out_invalid_value;
1237 mnt->acregmin = mnt->acregmax = 1233 mnt->acregmin = mnt->acregmax =
1238 mnt->acdirmin = mnt->acdirmax = option; 1234 mnt->acdirmin = mnt->acdirmax = option;
1239 break; 1235 break;
1240 case Opt_namelen: 1236 case Opt_namelen:
1241 string = match_strdup(args); 1237 if (nfs_get_option_ul(args, &option))
1242 if (string == NULL)
1243 goto out_nomem;
1244 rc = strict_strtoul(string, 10, &option);
1245 kfree(string);
1246 if (rc != 0)
1247 goto out_invalid_value; 1238 goto out_invalid_value;
1248 mnt->namlen = option; 1239 mnt->namlen = option;
1249 break; 1240 break;
1250 case Opt_mountport: 1241 case Opt_mountport:
1251 string = match_strdup(args); 1242 if (nfs_get_option_ul(args, &option) ||
1252 if (string == NULL) 1243 option > USHRT_MAX)
1253 goto out_nomem;
1254 rc = strict_strtoul(string, 10, &option);
1255 kfree(string);
1256 if (rc != 0 || option > USHRT_MAX)
1257 goto out_invalid_value; 1244 goto out_invalid_value;
1258 mnt->mount_server.port = option; 1245 mnt->mount_server.port = option;
1259 break; 1246 break;
1260 case Opt_mountvers: 1247 case Opt_mountvers:
1261 string = match_strdup(args); 1248 if (nfs_get_option_ul(args, &option) ||
1262 if (string == NULL)
1263 goto out_nomem;
1264 rc = strict_strtoul(string, 10, &option);
1265 kfree(string);
1266 if (rc != 0 ||
1267 option < NFS_MNT_VERSION || 1249 option < NFS_MNT_VERSION ||
1268 option > NFS_MNT3_VERSION) 1250 option > NFS_MNT3_VERSION)
1269 goto out_invalid_value; 1251 goto out_invalid_value;
1270 mnt->mount_server.version = option; 1252 mnt->mount_server.version = option;
1271 break; 1253 break;
1272 case Opt_nfsvers: 1254 case Opt_nfsvers:
1273 string = match_strdup(args); 1255 if (nfs_get_option_ul(args, &option))
1274 if (string == NULL)
1275 goto out_nomem;
1276 rc = strict_strtoul(string, 10, &option);
1277 kfree(string);
1278 if (rc != 0)
1279 goto out_invalid_value; 1256 goto out_invalid_value;
1280 switch (option) { 1257 switch (option) {
1281 case NFS2_VERSION: 1258 case NFS2_VERSION:
@@ -1295,12 +1272,7 @@ static int nfs_parse_mount_options(char *raw,
1295 } 1272 }
1296 break; 1273 break;
1297 case Opt_minorversion: 1274 case Opt_minorversion:
1298 string = match_strdup(args); 1275 if (nfs_get_option_ul(args, &option))
1299 if (string == NULL)
1300 goto out_nomem;
1301 rc = strict_strtoul(string, 10, &option);
1302 kfree(string);
1303 if (rc != 0)
1304 goto out_invalid_value; 1276 goto out_invalid_value;
1305 if (option > NFS4_MAX_MINOR_VERSION) 1277 if (option > NFS4_MAX_MINOR_VERSION)
1306 goto out_invalid_value; 1278 goto out_invalid_value;
@@ -1336,21 +1308,18 @@ static int nfs_parse_mount_options(char *raw,
1336 case Opt_xprt_udp: 1308 case Opt_xprt_udp:
1337 mnt->flags &= ~NFS_MOUNT_TCP; 1309 mnt->flags &= ~NFS_MOUNT_TCP;
1338 mnt->nfs_server.protocol = XPRT_TRANSPORT_UDP; 1310 mnt->nfs_server.protocol = XPRT_TRANSPORT_UDP;
1339 kfree(string);
1340 break; 1311 break;
1341 case Opt_xprt_tcp6: 1312 case Opt_xprt_tcp6:
1342 protofamily = AF_INET6; 1313 protofamily = AF_INET6;
1343 case Opt_xprt_tcp: 1314 case Opt_xprt_tcp:
1344 mnt->flags |= NFS_MOUNT_TCP; 1315 mnt->flags |= NFS_MOUNT_TCP;
1345 mnt->nfs_server.protocol = XPRT_TRANSPORT_TCP; 1316 mnt->nfs_server.protocol = XPRT_TRANSPORT_TCP;
1346 kfree(string);
1347 break; 1317 break;
1348 case Opt_xprt_rdma: 1318 case Opt_xprt_rdma:
1349 /* vector side protocols to TCP */ 1319 /* vector side protocols to TCP */
1350 mnt->flags |= NFS_MOUNT_TCP; 1320 mnt->flags |= NFS_MOUNT_TCP;
1351 mnt->nfs_server.protocol = XPRT_TRANSPORT_RDMA; 1321 mnt->nfs_server.protocol = XPRT_TRANSPORT_RDMA;
1352 xprt_load_transport(string); 1322 xprt_load_transport(string);
1353 kfree(string);
1354 break; 1323 break;
1355 default: 1324 default:
1356 dfprintk(MOUNT, "NFS: unrecognized " 1325 dfprintk(MOUNT, "NFS: unrecognized "
@@ -1358,6 +1327,7 @@ static int nfs_parse_mount_options(char *raw,
1358 kfree(string); 1327 kfree(string);
1359 return 0; 1328 return 0;
1360 } 1329 }
1330 kfree(string);
1361 break; 1331 break;
1362 case Opt_mountproto: 1332 case Opt_mountproto:
1363 string = match_strdup(args); 1333 string = match_strdup(args);
@@ -1400,18 +1370,13 @@ static int nfs_parse_mount_options(char *raw,
1400 goto out_invalid_address; 1370 goto out_invalid_address;
1401 break; 1371 break;
1402 case Opt_clientaddr: 1372 case Opt_clientaddr:
1403 string = match_strdup(args); 1373 if (nfs_get_option_str(args, &mnt->client_address))
1404 if (string == NULL)
1405 goto out_nomem; 1374 goto out_nomem;
1406 kfree(mnt->client_address);
1407 mnt->client_address = string;
1408 break; 1375 break;
1409 case Opt_mounthost: 1376 case Opt_mounthost:
1410 string = match_strdup(args); 1377 if (nfs_get_option_str(args,
1411 if (string == NULL) 1378 &mnt->mount_server.hostname))
1412 goto out_nomem; 1379 goto out_nomem;
1413 kfree(mnt->mount_server.hostname);
1414 mnt->mount_server.hostname = string;
1415 break; 1380 break;
1416 case Opt_mountaddr: 1381 case Opt_mountaddr:
1417 string = match_strdup(args); 1382 string = match_strdup(args);
@@ -1451,11 +1416,8 @@ static int nfs_parse_mount_options(char *raw,
1451 }; 1416 };
1452 break; 1417 break;
1453 case Opt_fscache_uniq: 1418 case Opt_fscache_uniq:
1454 string = match_strdup(args); 1419 if (nfs_get_option_str(args, &mnt->fscache_uniq))
1455 if (string == NULL)
1456 goto out_nomem; 1420 goto out_nomem;
1457 kfree(mnt->fscache_uniq);
1458 mnt->fscache_uniq = string;
1459 mnt->options |= NFS_OPTION_FSCACHE; 1421 mnt->options |= NFS_OPTION_FSCACHE;
1460 break; 1422 break;
1461 case Opt_local_lock: 1423 case Opt_local_lock:
@@ -1665,99 +1627,59 @@ static int nfs_try_mount(struct nfs_parsed_mount_data *args,
1665 return nfs_walk_authlist(args, &request); 1627 return nfs_walk_authlist(args, &request);
1666} 1628}
1667 1629
1668static int nfs_parse_simple_hostname(const char *dev_name, 1630/*
1669 char **hostname, size_t maxnamlen, 1631 * Split "dev_name" into "hostname:export_path".
1670 char **export_path, size_t maxpathlen) 1632 *
1633 * The leftmost colon demarks the split between the server's hostname
1634 * and the export path. If the hostname starts with a left square
1635 * bracket, then it may contain colons.
1636 *
1637 * Note: caller frees hostname and export path, even on error.
1638 */
1639static int nfs_parse_devname(const char *dev_name,
1640 char **hostname, size_t maxnamlen,
1641 char **export_path, size_t maxpathlen)
1671{ 1642{
1672 size_t len; 1643 size_t len;
1673 char *colon, *comma; 1644 char *end;
1674
1675 colon = strchr(dev_name, ':');
1676 if (colon == NULL)
1677 goto out_bad_devname;
1678
1679 len = colon - dev_name;
1680 if (len > maxnamlen)
1681 goto out_hostname;
1682 1645
1683 /* N.B. caller will free nfs_server.hostname in all cases */ 1646 /* Is the host name protected with square brakcets? */
1684 *hostname = kstrndup(dev_name, len, GFP_KERNEL); 1647 if (*dev_name == '[') {
1685 if (!*hostname) 1648 end = strchr(++dev_name, ']');
1686 goto out_nomem; 1649 if (end == NULL || end[1] != ':')
1687
1688 /* kill possible hostname list: not supported */
1689 comma = strchr(*hostname, ',');
1690 if (comma != NULL) {
1691 if (comma == *hostname)
1692 goto out_bad_devname; 1650 goto out_bad_devname;
1693 *comma = '\0';
1694 }
1695 1651
1696 colon++; 1652 len = end - dev_name;
1697 len = strlen(colon); 1653 end++;
1698 if (len > maxpathlen) 1654 } else {
1699 goto out_path; 1655 char *comma;
1700 *export_path = kstrndup(colon, len, GFP_KERNEL);
1701 if (!*export_path)
1702 goto out_nomem;
1703
1704 dfprintk(MOUNT, "NFS: MNTPATH: '%s'\n", *export_path);
1705 return 0;
1706
1707out_bad_devname:
1708 dfprintk(MOUNT, "NFS: device name not in host:path format\n");
1709 return -EINVAL;
1710
1711out_nomem:
1712 dfprintk(MOUNT, "NFS: not enough memory to parse device name\n");
1713 return -ENOMEM;
1714
1715out_hostname:
1716 dfprintk(MOUNT, "NFS: server hostname too long\n");
1717 return -ENAMETOOLONG;
1718
1719out_path:
1720 dfprintk(MOUNT, "NFS: export pathname too long\n");
1721 return -ENAMETOOLONG;
1722}
1723
1724/*
1725 * Hostname has square brackets around it because it contains one or
1726 * more colons. We look for the first closing square bracket, and a
1727 * colon must follow it.
1728 */
1729static int nfs_parse_protected_hostname(const char *dev_name,
1730 char **hostname, size_t maxnamlen,
1731 char **export_path, size_t maxpathlen)
1732{
1733 size_t len;
1734 char *start, *end;
1735 1656
1736 start = (char *)(dev_name + 1); 1657 end = strchr(dev_name, ':');
1658 if (end == NULL)
1659 goto out_bad_devname;
1660 len = end - dev_name;
1737 1661
1738 end = strchr(start, ']'); 1662 /* kill possible hostname list: not supported */
1739 if (end == NULL) 1663 comma = strchr(dev_name, ',');
1740 goto out_bad_devname; 1664 if (comma != NULL && comma < end)
1741 if (*(end + 1) != ':') 1665 *comma = 0;
1742 goto out_bad_devname; 1666 }
1743 1667
1744 len = end - start;
1745 if (len > maxnamlen) 1668 if (len > maxnamlen)
1746 goto out_hostname; 1669 goto out_hostname;
1747 1670
1748 /* N.B. caller will free nfs_server.hostname in all cases */ 1671 /* N.B. caller will free nfs_server.hostname in all cases */
1749 *hostname = kstrndup(start, len, GFP_KERNEL); 1672 *hostname = kstrndup(dev_name, len, GFP_KERNEL);
1750 if (*hostname == NULL) 1673 if (*hostname == NULL)
1751 goto out_nomem; 1674 goto out_nomem;
1752 1675 len = strlen(++end);
1753 end += 2;
1754 len = strlen(end);
1755 if (len > maxpathlen) 1676 if (len > maxpathlen)
1756 goto out_path; 1677 goto out_path;
1757 *export_path = kstrndup(end, len, GFP_KERNEL); 1678 *export_path = kstrndup(end, len, GFP_KERNEL);
1758 if (!*export_path) 1679 if (!*export_path)
1759 goto out_nomem; 1680 goto out_nomem;
1760 1681
1682 dfprintk(MOUNT, "NFS: MNTPATH: '%s'\n", *export_path);
1761 return 0; 1683 return 0;
1762 1684
1763out_bad_devname: 1685out_bad_devname:
@@ -1778,29 +1700,6 @@ out_path:
1778} 1700}
1779 1701
1780/* 1702/*
1781 * Split "dev_name" into "hostname:export_path".
1782 *
1783 * The leftmost colon demarks the split between the server's hostname
1784 * and the export path. If the hostname starts with a left square
1785 * bracket, then it may contain colons.
1786 *
1787 * Note: caller frees hostname and export path, even on error.
1788 */
1789static int nfs_parse_devname(const char *dev_name,
1790 char **hostname, size_t maxnamlen,
1791 char **export_path, size_t maxpathlen)
1792{
1793 if (*dev_name == '[')
1794 return nfs_parse_protected_hostname(dev_name,
1795 hostname, maxnamlen,
1796 export_path, maxpathlen);
1797
1798 return nfs_parse_simple_hostname(dev_name,
1799 hostname, maxnamlen,
1800 export_path, maxpathlen);
1801}
1802
1803/*
1804 * Validate the NFS2/NFS3 mount data 1703 * Validate the NFS2/NFS3 mount data
1805 * - fills in the mount root filehandle 1704 * - fills in the mount root filehandle
1806 * 1705 *
@@ -2267,19 +2166,19 @@ static int nfs_bdi_register(struct nfs_server *server)
2267 return bdi_register_dev(&server->backing_dev_info, server->s_dev); 2166 return bdi_register_dev(&server->backing_dev_info, server->s_dev);
2268} 2167}
2269 2168
2270static int nfs_get_sb(struct file_system_type *fs_type, 2169static struct dentry *nfs_fs_mount(struct file_system_type *fs_type,
2271 int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt) 2170 int flags, const char *dev_name, void *raw_data)
2272{ 2171{
2273 struct nfs_server *server = NULL; 2172 struct nfs_server *server = NULL;
2274 struct super_block *s; 2173 struct super_block *s;
2275 struct nfs_parsed_mount_data *data; 2174 struct nfs_parsed_mount_data *data;
2276 struct nfs_fh *mntfh; 2175 struct nfs_fh *mntfh;
2277 struct dentry *mntroot; 2176 struct dentry *mntroot = ERR_PTR(-ENOMEM);
2278 int (*compare_super)(struct super_block *, void *) = nfs_compare_super; 2177 int (*compare_super)(struct super_block *, void *) = nfs_compare_super;
2279 struct nfs_sb_mountdata sb_mntdata = { 2178 struct nfs_sb_mountdata sb_mntdata = {
2280 .mntflags = flags, 2179 .mntflags = flags,
2281 }; 2180 };
2282 int error = -ENOMEM; 2181 int error;
2283 2182
2284 data = nfs_alloc_parsed_mount_data(NFS_DEFAULT_VERSION); 2183 data = nfs_alloc_parsed_mount_data(NFS_DEFAULT_VERSION);
2285 mntfh = nfs_alloc_fhandle(); 2184 mntfh = nfs_alloc_fhandle();
@@ -2290,12 +2189,14 @@ static int nfs_get_sb(struct file_system_type *fs_type,
2290 2189
2291 /* Validate the mount data */ 2190 /* Validate the mount data */
2292 error = nfs_validate_mount_data(raw_data, data, mntfh, dev_name); 2191 error = nfs_validate_mount_data(raw_data, data, mntfh, dev_name);
2293 if (error < 0) 2192 if (error < 0) {
2193 mntroot = ERR_PTR(error);
2294 goto out; 2194 goto out;
2195 }
2295 2196
2296#ifdef CONFIG_NFS_V4 2197#ifdef CONFIG_NFS_V4
2297 if (data->version == 4) { 2198 if (data->version == 4) {
2298 error = nfs4_try_mount(flags, dev_name, data, mnt); 2199 mntroot = nfs4_try_mount(flags, dev_name, data);
2299 kfree(data->client_address); 2200 kfree(data->client_address);
2300 kfree(data->nfs_server.export_path); 2201 kfree(data->nfs_server.export_path);
2301 goto out; 2202 goto out;
@@ -2305,7 +2206,7 @@ static int nfs_get_sb(struct file_system_type *fs_type,
2305 /* Get a volume representation */ 2206 /* Get a volume representation */
2306 server = nfs_create_server(data, mntfh); 2207 server = nfs_create_server(data, mntfh);
2307 if (IS_ERR(server)) { 2208 if (IS_ERR(server)) {
2308 error = PTR_ERR(server); 2209 mntroot = ERR_CAST(server);
2309 goto out; 2210 goto out;
2310 } 2211 }
2311 sb_mntdata.server = server; 2212 sb_mntdata.server = server;
@@ -2316,7 +2217,7 @@ static int nfs_get_sb(struct file_system_type *fs_type,
2316 /* Get a superblock - note that we may end up sharing one that already exists */ 2217 /* Get a superblock - note that we may end up sharing one that already exists */
2317 s = sget(fs_type, compare_super, nfs_set_super, &sb_mntdata); 2218 s = sget(fs_type, compare_super, nfs_set_super, &sb_mntdata);
2318 if (IS_ERR(s)) { 2219 if (IS_ERR(s)) {
2319 error = PTR_ERR(s); 2220 mntroot = ERR_CAST(s);
2320 goto out_err_nosb; 2221 goto out_err_nosb;
2321 } 2222 }
2322 2223
@@ -2325,8 +2226,10 @@ static int nfs_get_sb(struct file_system_type *fs_type,
2325 server = NULL; 2226 server = NULL;
2326 } else { 2227 } else {
2327 error = nfs_bdi_register(server); 2228 error = nfs_bdi_register(server);
2328 if (error) 2229 if (error) {
2230 mntroot = ERR_PTR(error);
2329 goto error_splat_bdi; 2231 goto error_splat_bdi;
2232 }
2330 } 2233 }
2331 2234
2332 if (!s->s_root) { 2235 if (!s->s_root) {
@@ -2336,20 +2239,15 @@ static int nfs_get_sb(struct file_system_type *fs_type,
2336 s, data ? data->fscache_uniq : NULL, NULL); 2239 s, data ? data->fscache_uniq : NULL, NULL);
2337 } 2240 }
2338 2241
2339 mntroot = nfs_get_root(s, mntfh); 2242 mntroot = nfs_get_root(s, mntfh, dev_name);
2340 if (IS_ERR(mntroot)) { 2243 if (IS_ERR(mntroot))
2341 error = PTR_ERR(mntroot);
2342 goto error_splat_super; 2244 goto error_splat_super;
2343 }
2344 2245
2345 error = security_sb_set_mnt_opts(s, &data->lsm_opts); 2246 error = security_sb_set_mnt_opts(s, &data->lsm_opts);
2346 if (error) 2247 if (error)
2347 goto error_splat_root; 2248 goto error_splat_root;
2348 2249
2349 s->s_flags |= MS_ACTIVE; 2250 s->s_flags |= MS_ACTIVE;
2350 mnt->mnt_sb = s;
2351 mnt->mnt_root = mntroot;
2352 error = 0;
2353 2251
2354out: 2252out:
2355 kfree(data->nfs_server.hostname); 2253 kfree(data->nfs_server.hostname);
@@ -2359,7 +2257,7 @@ out:
2359out_free_fh: 2257out_free_fh:
2360 nfs_free_fhandle(mntfh); 2258 nfs_free_fhandle(mntfh);
2361 kfree(data); 2259 kfree(data);
2362 return error; 2260 return mntroot;
2363 2261
2364out_err_nosb: 2262out_err_nosb:
2365 nfs_free_server(server); 2263 nfs_free_server(server);
@@ -2367,6 +2265,7 @@ out_err_nosb:
2367 2265
2368error_splat_root: 2266error_splat_root:
2369 dput(mntroot); 2267 dput(mntroot);
2268 mntroot = ERR_PTR(error);
2370error_splat_super: 2269error_splat_super:
2371 if (server && !s->s_root) 2270 if (server && !s->s_root)
2372 bdi_unregister(&server->backing_dev_info); 2271 bdi_unregister(&server->backing_dev_info);
@@ -2450,7 +2349,7 @@ nfs_xdev_mount(struct file_system_type *fs_type, int flags,
2450 nfs_fscache_get_super_cookie(s, NULL, data); 2349 nfs_fscache_get_super_cookie(s, NULL, data);
2451 } 2350 }
2452 2351
2453 mntroot = nfs_get_root(s, data->fh); 2352 mntroot = nfs_get_root(s, data->fh, dev_name);
2454 if (IS_ERR(mntroot)) { 2353 if (IS_ERR(mntroot)) {
2455 error = PTR_ERR(mntroot); 2354 error = PTR_ERR(mntroot);
2456 goto error_splat_super; 2355 goto error_splat_super;
@@ -2718,7 +2617,7 @@ nfs4_remote_mount(struct file_system_type *fs_type, int flags,
2718 s, data ? data->fscache_uniq : NULL, NULL); 2617 s, data ? data->fscache_uniq : NULL, NULL);
2719 } 2618 }
2720 2619
2721 mntroot = nfs4_get_root(s, mntfh); 2620 mntroot = nfs4_get_root(s, mntfh, dev_name);
2722 if (IS_ERR(mntroot)) { 2621 if (IS_ERR(mntroot)) {
2723 error = PTR_ERR(mntroot); 2622 error = PTR_ERR(mntroot);
2724 goto error_splat_super; 2623 goto error_splat_super;
@@ -2771,27 +2670,6 @@ static struct vfsmount *nfs_do_root_mount(struct file_system_type *fs_type,
2771 return root_mnt; 2670 return root_mnt;
2772} 2671}
2773 2672
2774static void nfs_fix_devname(const struct path *path, struct vfsmount *mnt)
2775{
2776 char *page = (char *) __get_free_page(GFP_KERNEL);
2777 char *devname, *tmp;
2778
2779 if (page == NULL)
2780 return;
2781 devname = nfs_path(path->mnt->mnt_devname,
2782 path->mnt->mnt_root, path->dentry,
2783 page, PAGE_SIZE);
2784 if (IS_ERR(devname))
2785 goto out_freepage;
2786 tmp = kstrdup(devname, GFP_KERNEL);
2787 if (tmp == NULL)
2788 goto out_freepage;
2789 kfree(mnt->mnt_devname);
2790 mnt->mnt_devname = tmp;
2791out_freepage:
2792 free_page((unsigned long)page);
2793}
2794
2795struct nfs_referral_count { 2673struct nfs_referral_count {
2796 struct list_head list; 2674 struct list_head list;
2797 const struct task_struct *task; 2675 const struct task_struct *task;
@@ -2858,17 +2736,18 @@ static void nfs_referral_loop_unprotect(void)
2858 kfree(p); 2736 kfree(p);
2859} 2737}
2860 2738
2861static int nfs_follow_remote_path(struct vfsmount *root_mnt, 2739static struct dentry *nfs_follow_remote_path(struct vfsmount *root_mnt,
2862 const char *export_path, struct vfsmount *mnt_target) 2740 const char *export_path)
2863{ 2741{
2864 struct nameidata *nd = NULL; 2742 struct nameidata *nd = NULL;
2865 struct mnt_namespace *ns_private; 2743 struct mnt_namespace *ns_private;
2866 struct super_block *s; 2744 struct super_block *s;
2745 struct dentry *dentry;
2867 int ret; 2746 int ret;
2868 2747
2869 nd = kmalloc(sizeof(*nd), GFP_KERNEL); 2748 nd = kmalloc(sizeof(*nd), GFP_KERNEL);
2870 if (nd == NULL) 2749 if (nd == NULL)
2871 return -ENOMEM; 2750 return ERR_PTR(-ENOMEM);
2872 2751
2873 ns_private = create_mnt_ns(root_mnt); 2752 ns_private = create_mnt_ns(root_mnt);
2874 ret = PTR_ERR(ns_private); 2753 ret = PTR_ERR(ns_private);
@@ -2890,32 +2769,27 @@ static int nfs_follow_remote_path(struct vfsmount *root_mnt,
2890 2769
2891 s = nd->path.mnt->mnt_sb; 2770 s = nd->path.mnt->mnt_sb;
2892 atomic_inc(&s->s_active); 2771 atomic_inc(&s->s_active);
2893 mnt_target->mnt_sb = s; 2772 dentry = dget(nd->path.dentry);
2894 mnt_target->mnt_root = dget(nd->path.dentry);
2895
2896 /* Correct the device pathname */
2897 nfs_fix_devname(&nd->path, mnt_target);
2898 2773
2899 path_put(&nd->path); 2774 path_put(&nd->path);
2900 kfree(nd); 2775 kfree(nd);
2901 down_write(&s->s_umount); 2776 down_write(&s->s_umount);
2902 return 0; 2777 return dentry;
2903out_put_mnt_ns: 2778out_put_mnt_ns:
2904 put_mnt_ns(ns_private); 2779 put_mnt_ns(ns_private);
2905out_mntput: 2780out_mntput:
2906 mntput(root_mnt); 2781 mntput(root_mnt);
2907out_err: 2782out_err:
2908 kfree(nd); 2783 kfree(nd);
2909 return ret; 2784 return ERR_PTR(ret);
2910} 2785}
2911 2786
2912static int nfs4_try_mount(int flags, const char *dev_name, 2787static struct dentry *nfs4_try_mount(int flags, const char *dev_name,
2913 struct nfs_parsed_mount_data *data, 2788 struct nfs_parsed_mount_data *data)
2914 struct vfsmount *mnt)
2915{ 2789{
2916 char *export_path; 2790 char *export_path;
2917 struct vfsmount *root_mnt; 2791 struct vfsmount *root_mnt;
2918 int error; 2792 struct dentry *res;
2919 2793
2920 dfprintk(MOUNT, "--> nfs4_try_mount()\n"); 2794 dfprintk(MOUNT, "--> nfs4_try_mount()\n");
2921 2795
@@ -2925,26 +2799,25 @@ static int nfs4_try_mount(int flags, const char *dev_name,
2925 data->nfs_server.hostname); 2799 data->nfs_server.hostname);
2926 data->nfs_server.export_path = export_path; 2800 data->nfs_server.export_path = export_path;
2927 2801
2928 error = PTR_ERR(root_mnt); 2802 res = ERR_CAST(root_mnt);
2929 if (IS_ERR(root_mnt)) 2803 if (!IS_ERR(root_mnt))
2930 goto out; 2804 res = nfs_follow_remote_path(root_mnt, export_path);
2931
2932 error = nfs_follow_remote_path(root_mnt, export_path, mnt);
2933 2805
2934out: 2806 dfprintk(MOUNT, "<-- nfs4_try_mount() = %ld%s\n",
2935 dfprintk(MOUNT, "<-- nfs4_try_mount() = %d%s\n", error, 2807 IS_ERR(res) ? PTR_ERR(res) : 0,
2936 error != 0 ? " [error]" : ""); 2808 IS_ERR(res) ? " [error]" : "");
2937 return error; 2809 return res;
2938} 2810}
2939 2811
2940/* 2812/*
2941 * Get the superblock for an NFS4 mountpoint 2813 * Get the superblock for an NFS4 mountpoint
2942 */ 2814 */
2943static int nfs4_get_sb(struct file_system_type *fs_type, 2815static struct dentry *nfs4_mount(struct file_system_type *fs_type,
2944 int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt) 2816 int flags, const char *dev_name, void *raw_data)
2945{ 2817{
2946 struct nfs_parsed_mount_data *data; 2818 struct nfs_parsed_mount_data *data;
2947 int error = -ENOMEM; 2819 int error = -ENOMEM;
2820 struct dentry *res = ERR_PTR(-ENOMEM);
2948 2821
2949 data = nfs_alloc_parsed_mount_data(4); 2822 data = nfs_alloc_parsed_mount_data(4);
2950 if (data == NULL) 2823 if (data == NULL)
@@ -2952,10 +2825,14 @@ static int nfs4_get_sb(struct file_system_type *fs_type,
2952 2825
2953 /* Validate the mount data */ 2826 /* Validate the mount data */
2954 error = nfs4_validate_mount_data(raw_data, data, dev_name); 2827 error = nfs4_validate_mount_data(raw_data, data, dev_name);
2955 if (error < 0) 2828 if (error < 0) {
2829 res = ERR_PTR(error);
2956 goto out; 2830 goto out;
2831 }
2957 2832
2958 error = nfs4_try_mount(flags, dev_name, data, mnt); 2833 res = nfs4_try_mount(flags, dev_name, data);
2834 if (IS_ERR(res))
2835 error = PTR_ERR(res);
2959 2836
2960out: 2837out:
2961 kfree(data->client_address); 2838 kfree(data->client_address);
@@ -2964,9 +2841,9 @@ out:
2964 kfree(data->fscache_uniq); 2841 kfree(data->fscache_uniq);
2965out_free_data: 2842out_free_data:
2966 kfree(data); 2843 kfree(data);
2967 dprintk("<-- nfs4_get_sb() = %d%s\n", error, 2844 dprintk("<-- nfs4_mount() = %d%s\n", error,
2968 error != 0 ? " [error]" : ""); 2845 error != 0 ? " [error]" : "");
2969 return error; 2846 return res;
2970} 2847}
2971 2848
2972static void nfs4_kill_super(struct super_block *sb) 2849static void nfs4_kill_super(struct super_block *sb)
@@ -3033,7 +2910,7 @@ nfs4_xdev_mount(struct file_system_type *fs_type, int flags,
3033 nfs_fscache_get_super_cookie(s, NULL, data); 2910 nfs_fscache_get_super_cookie(s, NULL, data);
3034 } 2911 }
3035 2912
3036 mntroot = nfs4_get_root(s, data->fh); 2913 mntroot = nfs4_get_root(s, data->fh, dev_name);
3037 if (IS_ERR(mntroot)) { 2914 if (IS_ERR(mntroot)) {
3038 error = PTR_ERR(mntroot); 2915 error = PTR_ERR(mntroot);
3039 goto error_splat_super; 2916 goto error_splat_super;
@@ -3120,7 +2997,7 @@ nfs4_remote_referral_mount(struct file_system_type *fs_type, int flags,
3120 nfs_fscache_get_super_cookie(s, NULL, data); 2997 nfs_fscache_get_super_cookie(s, NULL, data);
3121 } 2998 }
3122 2999
3123 mntroot = nfs4_get_root(s, mntfh); 3000 mntroot = nfs4_get_root(s, mntfh, dev_name);
3124 if (IS_ERR(mntroot)) { 3001 if (IS_ERR(mntroot)) {
3125 error = PTR_ERR(mntroot); 3002 error = PTR_ERR(mntroot);
3126 goto error_splat_super; 3003 goto error_splat_super;
@@ -3160,16 +3037,15 @@ error_splat_bdi:
3160/* 3037/*
3161 * Create an NFS4 server record on referral traversal 3038 * Create an NFS4 server record on referral traversal
3162 */ 3039 */
3163static int nfs4_referral_get_sb(struct file_system_type *fs_type, 3040static struct dentry *nfs4_referral_mount(struct file_system_type *fs_type,
3164 int flags, const char *dev_name, void *raw_data, 3041 int flags, const char *dev_name, void *raw_data)
3165 struct vfsmount *mnt)
3166{ 3042{
3167 struct nfs_clone_mount *data = raw_data; 3043 struct nfs_clone_mount *data = raw_data;
3168 char *export_path; 3044 char *export_path;
3169 struct vfsmount *root_mnt; 3045 struct vfsmount *root_mnt;
3170 int error; 3046 struct dentry *res;
3171 3047
3172 dprintk("--> nfs4_referral_get_sb()\n"); 3048 dprintk("--> nfs4_referral_mount()\n");
3173 3049
3174 export_path = data->mnt_path; 3050 export_path = data->mnt_path;
3175 data->mnt_path = "/"; 3051 data->mnt_path = "/";
@@ -3178,15 +3054,13 @@ static int nfs4_referral_get_sb(struct file_system_type *fs_type,
3178 flags, data, data->hostname); 3054 flags, data, data->hostname);
3179 data->mnt_path = export_path; 3055 data->mnt_path = export_path;
3180 3056
3181 error = PTR_ERR(root_mnt); 3057 res = ERR_CAST(root_mnt);
3182 if (IS_ERR(root_mnt)) 3058 if (!IS_ERR(root_mnt))
3183 goto out; 3059 res = nfs_follow_remote_path(root_mnt, export_path);
3184 3060 dprintk("<-- nfs4_referral_mount() = %ld%s\n",
3185 error = nfs_follow_remote_path(root_mnt, export_path, mnt); 3061 IS_ERR(res) ? PTR_ERR(res) : 0,
3186out: 3062 IS_ERR(res) ? " [error]" : "");
3187 dprintk("<-- nfs4_referral_get_sb() = %d%s\n", error, 3063 return res;
3188 error != 0 ? " [error]" : "");
3189 return error;
3190} 3064}
3191 3065
3192#endif /* CONFIG_NFS_V4 */ 3066#endif /* CONFIG_NFS_V4 */
diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
index e313a51acdd1..8d6864c2a5fa 100644
--- a/fs/nfs/unlink.c
+++ b/fs/nfs/unlink.c
@@ -148,6 +148,7 @@ static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct n
148 alias = d_lookup(parent, &data->args.name); 148 alias = d_lookup(parent, &data->args.name);
149 if (alias != NULL) { 149 if (alias != NULL) {
150 int ret = 0; 150 int ret = 0;
151 void *devname_garbage = NULL;
151 152
152 /* 153 /*
153 * Hey, we raced with lookup... See if we need to transfer 154 * Hey, we raced with lookup... See if we need to transfer
@@ -157,6 +158,7 @@ static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct n
157 spin_lock(&alias->d_lock); 158 spin_lock(&alias->d_lock);
158 if (alias->d_inode != NULL && 159 if (alias->d_inode != NULL &&
159 !(alias->d_flags & DCACHE_NFSFS_RENAMED)) { 160 !(alias->d_flags & DCACHE_NFSFS_RENAMED)) {
161 devname_garbage = alias->d_fsdata;
160 alias->d_fsdata = data; 162 alias->d_fsdata = data;
161 alias->d_flags |= DCACHE_NFSFS_RENAMED; 163 alias->d_flags |= DCACHE_NFSFS_RENAMED;
162 ret = 1; 164 ret = 1;
@@ -164,6 +166,13 @@ static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct n
164 spin_unlock(&alias->d_lock); 166 spin_unlock(&alias->d_lock);
165 nfs_dec_sillycount(dir); 167 nfs_dec_sillycount(dir);
166 dput(alias); 168 dput(alias);
169 /*
170 * If we'd displaced old cached devname, free it. At that
171 * point dentry is definitely not a root, so we won't need
172 * that anymore.
173 */
174 if (devname_garbage)
175 kfree(devname_garbage);
167 return ret; 176 return ret;
168 } 177 }
169 data->dir = igrab(dir); 178 data->dir = igrab(dir);
@@ -180,7 +189,7 @@ static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct n
180 task_setup_data.rpc_client = NFS_CLIENT(dir); 189 task_setup_data.rpc_client = NFS_CLIENT(dir);
181 task = rpc_run_task(&task_setup_data); 190 task = rpc_run_task(&task_setup_data);
182 if (!IS_ERR(task)) 191 if (!IS_ERR(task))
183 rpc_put_task(task); 192 rpc_put_task_async(task);
184 return 1; 193 return 1;
185} 194}
186 195
@@ -252,6 +261,7 @@ nfs_async_unlink(struct inode *dir, struct dentry *dentry)
252{ 261{
253 struct nfs_unlinkdata *data; 262 struct nfs_unlinkdata *data;
254 int status = -ENOMEM; 263 int status = -ENOMEM;
264 void *devname_garbage = NULL;
255 265
256 data = kzalloc(sizeof(*data), GFP_KERNEL); 266 data = kzalloc(sizeof(*data), GFP_KERNEL);
257 if (data == NULL) 267 if (data == NULL)
@@ -269,8 +279,16 @@ nfs_async_unlink(struct inode *dir, struct dentry *dentry)
269 if (dentry->d_flags & DCACHE_NFSFS_RENAMED) 279 if (dentry->d_flags & DCACHE_NFSFS_RENAMED)
270 goto out_unlock; 280 goto out_unlock;
271 dentry->d_flags |= DCACHE_NFSFS_RENAMED; 281 dentry->d_flags |= DCACHE_NFSFS_RENAMED;
282 devname_garbage = dentry->d_fsdata;
272 dentry->d_fsdata = data; 283 dentry->d_fsdata = data;
273 spin_unlock(&dentry->d_lock); 284 spin_unlock(&dentry->d_lock);
285 /*
286 * If we'd displaced old cached devname, free it. At that
287 * point dentry is definitely not a root, so we won't need
288 * that anymore.
289 */
290 if (devname_garbage)
291 kfree(devname_garbage);
274 return 0; 292 return 0;
275out_unlock: 293out_unlock:
276 spin_unlock(&dentry->d_lock); 294 spin_unlock(&dentry->d_lock);
@@ -299,6 +317,7 @@ nfs_complete_unlink(struct dentry *dentry, struct inode *inode)
299 if (dentry->d_flags & DCACHE_NFSFS_RENAMED) { 317 if (dentry->d_flags & DCACHE_NFSFS_RENAMED) {
300 dentry->d_flags &= ~DCACHE_NFSFS_RENAMED; 318 dentry->d_flags &= ~DCACHE_NFSFS_RENAMED;
301 data = dentry->d_fsdata; 319 data = dentry->d_fsdata;
320 dentry->d_fsdata = NULL;
302 } 321 }
303 spin_unlock(&dentry->d_lock); 322 spin_unlock(&dentry->d_lock);
304 323
@@ -315,6 +334,7 @@ nfs_cancel_async_unlink(struct dentry *dentry)
315 struct nfs_unlinkdata *data = dentry->d_fsdata; 334 struct nfs_unlinkdata *data = dentry->d_fsdata;
316 335
317 dentry->d_flags &= ~DCACHE_NFSFS_RENAMED; 336 dentry->d_flags &= ~DCACHE_NFSFS_RENAMED;
337 dentry->d_fsdata = NULL;
318 spin_unlock(&dentry->d_lock); 338 spin_unlock(&dentry->d_lock);
319 nfs_free_unlinkdata(data); 339 nfs_free_unlinkdata(data);
320 return; 340 return;
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 10d648ea128b..47a3ad63e0d5 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -28,6 +28,7 @@
28#include "iostat.h" 28#include "iostat.h"
29#include "nfs4_fs.h" 29#include "nfs4_fs.h"
30#include "fscache.h" 30#include "fscache.h"
31#include "pnfs.h"
31 32
32#define NFSDBG_FACILITY NFSDBG_PAGECACHE 33#define NFSDBG_FACILITY NFSDBG_PAGECACHE
33 34
@@ -96,6 +97,7 @@ void nfs_writedata_free(struct nfs_write_data *p)
96 97
97static void nfs_writedata_release(struct nfs_write_data *wdata) 98static void nfs_writedata_release(struct nfs_write_data *wdata)
98{ 99{
100 put_lseg(wdata->lseg);
99 put_nfs_open_context(wdata->args.context); 101 put_nfs_open_context(wdata->args.context);
100 nfs_writedata_free(wdata); 102 nfs_writedata_free(wdata);
101} 103}
@@ -781,25 +783,21 @@ static int flush_task_priority(int how)
781 return RPC_PRIORITY_NORMAL; 783 return RPC_PRIORITY_NORMAL;
782} 784}
783 785
784/* 786int nfs_initiate_write(struct nfs_write_data *data,
785 * Set up the argument/result storage required for the RPC call. 787 struct rpc_clnt *clnt,
786 */ 788 const struct rpc_call_ops *call_ops,
787static int nfs_write_rpcsetup(struct nfs_page *req, 789 int how)
788 struct nfs_write_data *data,
789 const struct rpc_call_ops *call_ops,
790 unsigned int count, unsigned int offset,
791 int how)
792{ 790{
793 struct inode *inode = req->wb_context->path.dentry->d_inode; 791 struct inode *inode = data->inode;
794 int priority = flush_task_priority(how); 792 int priority = flush_task_priority(how);
795 struct rpc_task *task; 793 struct rpc_task *task;
796 struct rpc_message msg = { 794 struct rpc_message msg = {
797 .rpc_argp = &data->args, 795 .rpc_argp = &data->args,
798 .rpc_resp = &data->res, 796 .rpc_resp = &data->res,
799 .rpc_cred = req->wb_context->cred, 797 .rpc_cred = data->cred,
800 }; 798 };
801 struct rpc_task_setup task_setup_data = { 799 struct rpc_task_setup task_setup_data = {
802 .rpc_client = NFS_CLIENT(inode), 800 .rpc_client = clnt,
803 .task = &data->task, 801 .task = &data->task,
804 .rpc_message = &msg, 802 .rpc_message = &msg,
805 .callback_ops = call_ops, 803 .callback_ops = call_ops,
@@ -810,12 +808,52 @@ static int nfs_write_rpcsetup(struct nfs_page *req,
810 }; 808 };
811 int ret = 0; 809 int ret = 0;
812 810
811 /* Set up the initial task struct. */
812 NFS_PROTO(inode)->write_setup(data, &msg);
813
814 dprintk("NFS: %5u initiated write call "
815 "(req %s/%lld, %u bytes @ offset %llu)\n",
816 data->task.tk_pid,
817 inode->i_sb->s_id,
818 (long long)NFS_FILEID(inode),
819 data->args.count,
820 (unsigned long long)data->args.offset);
821
822 task = rpc_run_task(&task_setup_data);
823 if (IS_ERR(task)) {
824 ret = PTR_ERR(task);
825 goto out;
826 }
827 if (how & FLUSH_SYNC) {
828 ret = rpc_wait_for_completion_task(task);
829 if (ret == 0)
830 ret = task->tk_status;
831 }
832 rpc_put_task(task);
833out:
834 return ret;
835}
836EXPORT_SYMBOL_GPL(nfs_initiate_write);
837
838/*
839 * Set up the argument/result storage required for the RPC call.
840 */
841static int nfs_write_rpcsetup(struct nfs_page *req,
842 struct nfs_write_data *data,
843 const struct rpc_call_ops *call_ops,
844 unsigned int count, unsigned int offset,
845 struct pnfs_layout_segment *lseg,
846 int how)
847{
848 struct inode *inode = req->wb_context->path.dentry->d_inode;
849
813 /* Set up the RPC argument and reply structs 850 /* Set up the RPC argument and reply structs
814 * NB: take care not to mess about with data->commit et al. */ 851 * NB: take care not to mess about with data->commit et al. */
815 852
816 data->req = req; 853 data->req = req;
817 data->inode = inode = req->wb_context->path.dentry->d_inode; 854 data->inode = inode = req->wb_context->path.dentry->d_inode;
818 data->cred = msg.rpc_cred; 855 data->cred = req->wb_context->cred;
856 data->lseg = get_lseg(lseg);
819 857
820 data->args.fh = NFS_FH(inode); 858 data->args.fh = NFS_FH(inode);
821 data->args.offset = req_offset(req) + offset; 859 data->args.offset = req_offset(req) + offset;
@@ -836,30 +874,11 @@ static int nfs_write_rpcsetup(struct nfs_page *req,
836 data->res.verf = &data->verf; 874 data->res.verf = &data->verf;
837 nfs_fattr_init(&data->fattr); 875 nfs_fattr_init(&data->fattr);
838 876
839 /* Set up the initial task struct. */ 877 if (data->lseg &&
840 NFS_PROTO(inode)->write_setup(data, &msg); 878 (pnfs_try_to_write_data(data, call_ops, how) == PNFS_ATTEMPTED))
841 879 return 0;
842 dprintk("NFS: %5u initiated write call "
843 "(req %s/%lld, %u bytes @ offset %llu)\n",
844 data->task.tk_pid,
845 inode->i_sb->s_id,
846 (long long)NFS_FILEID(inode),
847 count,
848 (unsigned long long)data->args.offset);
849 880
850 task = rpc_run_task(&task_setup_data); 881 return nfs_initiate_write(data, NFS_CLIENT(inode), call_ops, how);
851 if (IS_ERR(task)) {
852 ret = PTR_ERR(task);
853 goto out;
854 }
855 if (how & FLUSH_SYNC) {
856 ret = rpc_wait_for_completion_task(task);
857 if (ret == 0)
858 ret = task->tk_status;
859 }
860 rpc_put_task(task);
861out:
862 return ret;
863} 882}
864 883
865/* If a nfs_flush_* function fails, it should remove reqs from @head and 884/* If a nfs_flush_* function fails, it should remove reqs from @head and
@@ -879,20 +898,21 @@ static void nfs_redirty_request(struct nfs_page *req)
879 * Generate multiple small requests to write out a single 898 * Generate multiple small requests to write out a single
880 * contiguous dirty area on one page. 899 * contiguous dirty area on one page.
881 */ 900 */
882static int nfs_flush_multi(struct inode *inode, struct list_head *head, unsigned int npages, size_t count, int how) 901static int nfs_flush_multi(struct nfs_pageio_descriptor *desc)
883{ 902{
884 struct nfs_page *req = nfs_list_entry(head->next); 903 struct nfs_page *req = nfs_list_entry(desc->pg_list.next);
885 struct page *page = req->wb_page; 904 struct page *page = req->wb_page;
886 struct nfs_write_data *data; 905 struct nfs_write_data *data;
887 size_t wsize = NFS_SERVER(inode)->wsize, nbytes; 906 size_t wsize = NFS_SERVER(desc->pg_inode)->wsize, nbytes;
888 unsigned int offset; 907 unsigned int offset;
889 int requests = 0; 908 int requests = 0;
890 int ret = 0; 909 int ret = 0;
910 struct pnfs_layout_segment *lseg;
891 LIST_HEAD(list); 911 LIST_HEAD(list);
892 912
893 nfs_list_remove_request(req); 913 nfs_list_remove_request(req);
894 914
895 nbytes = count; 915 nbytes = desc->pg_count;
896 do { 916 do {
897 size_t len = min(nbytes, wsize); 917 size_t len = min(nbytes, wsize);
898 918
@@ -905,9 +925,11 @@ static int nfs_flush_multi(struct inode *inode, struct list_head *head, unsigned
905 } while (nbytes != 0); 925 } while (nbytes != 0);
906 atomic_set(&req->wb_complete, requests); 926 atomic_set(&req->wb_complete, requests);
907 927
928 BUG_ON(desc->pg_lseg);
929 lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, IOMODE_RW);
908 ClearPageError(page); 930 ClearPageError(page);
909 offset = 0; 931 offset = 0;
910 nbytes = count; 932 nbytes = desc->pg_count;
911 do { 933 do {
912 int ret2; 934 int ret2;
913 935
@@ -919,20 +941,22 @@ static int nfs_flush_multi(struct inode *inode, struct list_head *head, unsigned
919 if (nbytes < wsize) 941 if (nbytes < wsize)
920 wsize = nbytes; 942 wsize = nbytes;
921 ret2 = nfs_write_rpcsetup(req, data, &nfs_write_partial_ops, 943 ret2 = nfs_write_rpcsetup(req, data, &nfs_write_partial_ops,
922 wsize, offset, how); 944 wsize, offset, lseg, desc->pg_ioflags);
923 if (ret == 0) 945 if (ret == 0)
924 ret = ret2; 946 ret = ret2;
925 offset += wsize; 947 offset += wsize;
926 nbytes -= wsize; 948 nbytes -= wsize;
927 } while (nbytes != 0); 949 } while (nbytes != 0);
928 950
951 put_lseg(lseg);
952 desc->pg_lseg = NULL;
929 return ret; 953 return ret;
930 954
931out_bad: 955out_bad:
932 while (!list_empty(&list)) { 956 while (!list_empty(&list)) {
933 data = list_entry(list.next, struct nfs_write_data, pages); 957 data = list_entry(list.next, struct nfs_write_data, pages);
934 list_del(&data->pages); 958 list_del(&data->pages);
935 nfs_writedata_release(data); 959 nfs_writedata_free(data);
936 } 960 }
937 nfs_redirty_request(req); 961 nfs_redirty_request(req);
938 return -ENOMEM; 962 return -ENOMEM;
@@ -946,16 +970,26 @@ out_bad:
946 * This is the case if nfs_updatepage detects a conflicting request 970 * This is the case if nfs_updatepage detects a conflicting request
947 * that has been written but not committed. 971 * that has been written but not committed.
948 */ 972 */
949static int nfs_flush_one(struct inode *inode, struct list_head *head, unsigned int npages, size_t count, int how) 973static int nfs_flush_one(struct nfs_pageio_descriptor *desc)
950{ 974{
951 struct nfs_page *req; 975 struct nfs_page *req;
952 struct page **pages; 976 struct page **pages;
953 struct nfs_write_data *data; 977 struct nfs_write_data *data;
978 struct list_head *head = &desc->pg_list;
979 struct pnfs_layout_segment *lseg = desc->pg_lseg;
980 int ret;
954 981
955 data = nfs_writedata_alloc(npages); 982 data = nfs_writedata_alloc(nfs_page_array_len(desc->pg_base,
956 if (!data) 983 desc->pg_count));
957 goto out_bad; 984 if (!data) {
958 985 while (!list_empty(head)) {
986 req = nfs_list_entry(head->next);
987 nfs_list_remove_request(req);
988 nfs_redirty_request(req);
989 }
990 ret = -ENOMEM;
991 goto out;
992 }
959 pages = data->pagevec; 993 pages = data->pagevec;
960 while (!list_empty(head)) { 994 while (!list_empty(head)) {
961 req = nfs_list_entry(head->next); 995 req = nfs_list_entry(head->next);
@@ -965,16 +999,15 @@ static int nfs_flush_one(struct inode *inode, struct list_head *head, unsigned i
965 *pages++ = req->wb_page; 999 *pages++ = req->wb_page;
966 } 1000 }
967 req = nfs_list_entry(data->pages.next); 1001 req = nfs_list_entry(data->pages.next);
1002 if ((!lseg) && list_is_singular(&data->pages))
1003 lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, IOMODE_RW);
968 1004
969 /* Set up the argument struct */ 1005 /* Set up the argument struct */
970 return nfs_write_rpcsetup(req, data, &nfs_write_full_ops, count, 0, how); 1006 ret = nfs_write_rpcsetup(req, data, &nfs_write_full_ops, desc->pg_count, 0, lseg, desc->pg_ioflags);
971 out_bad: 1007out:
972 while (!list_empty(head)) { 1008 put_lseg(lseg); /* Cleans any gotten in ->pg_test */
973 req = nfs_list_entry(head->next); 1009 desc->pg_lseg = NULL;
974 nfs_list_remove_request(req); 1010 return ret;
975 nfs_redirty_request(req);
976 }
977 return -ENOMEM;
978} 1011}
979 1012
980static void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, 1013static void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio,
@@ -982,6 +1015,8 @@ static void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio,
982{ 1015{
983 size_t wsize = NFS_SERVER(inode)->wsize; 1016 size_t wsize = NFS_SERVER(inode)->wsize;
984 1017
1018 pnfs_pageio_init_write(pgio, inode);
1019
985 if (wsize < PAGE_CACHE_SIZE) 1020 if (wsize < PAGE_CACHE_SIZE)
986 nfs_pageio_init(pgio, inode, nfs_flush_multi, wsize, ioflags); 1021 nfs_pageio_init(pgio, inode, nfs_flush_multi, wsize, ioflags);
987 else 1022 else
@@ -1132,7 +1167,7 @@ static const struct rpc_call_ops nfs_write_full_ops = {
1132/* 1167/*
1133 * This function is called when the WRITE call is complete. 1168 * This function is called when the WRITE call is complete.
1134 */ 1169 */
1135int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data) 1170void nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
1136{ 1171{
1137 struct nfs_writeargs *argp = &data->args; 1172 struct nfs_writeargs *argp = &data->args;
1138 struct nfs_writeres *resp = &data->res; 1173 struct nfs_writeres *resp = &data->res;
@@ -1151,7 +1186,7 @@ int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
1151 */ 1186 */
1152 status = NFS_PROTO(data->inode)->write_done(task, data); 1187 status = NFS_PROTO(data->inode)->write_done(task, data);
1153 if (status != 0) 1188 if (status != 0)
1154 return status; 1189 return;
1155 nfs_add_stats(data->inode, NFSIOS_SERVERWRITTENBYTES, resp->count); 1190 nfs_add_stats(data->inode, NFSIOS_SERVERWRITTENBYTES, resp->count);
1156 1191
1157#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) 1192#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
@@ -1166,6 +1201,7 @@ int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
1166 */ 1201 */
1167 static unsigned long complain; 1202 static unsigned long complain;
1168 1203
1204 /* Note this will print the MDS for a DS write */
1169 if (time_before(complain, jiffies)) { 1205 if (time_before(complain, jiffies)) {
1170 dprintk("NFS: faulty NFS server %s:" 1206 dprintk("NFS: faulty NFS server %s:"
1171 " (committed = %d) != (stable = %d)\n", 1207 " (committed = %d) != (stable = %d)\n",
@@ -1186,6 +1222,7 @@ int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
1186 /* Was this an NFSv2 write or an NFSv3 stable write? */ 1222 /* Was this an NFSv2 write or an NFSv3 stable write? */
1187 if (resp->verf->committed != NFS_UNSTABLE) { 1223 if (resp->verf->committed != NFS_UNSTABLE) {
1188 /* Resend from where the server left off */ 1224 /* Resend from where the server left off */
1225 data->mds_offset += resp->count;
1189 argp->offset += resp->count; 1226 argp->offset += resp->count;
1190 argp->pgbase += resp->count; 1227 argp->pgbase += resp->count;
1191 argp->count -= resp->count; 1228 argp->count -= resp->count;
@@ -1196,7 +1233,7 @@ int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
1196 argp->stable = NFS_FILE_SYNC; 1233 argp->stable = NFS_FILE_SYNC;
1197 } 1234 }
1198 nfs_restart_rpc(task, server->nfs_client); 1235 nfs_restart_rpc(task, server->nfs_client);
1199 return -EAGAIN; 1236 return;
1200 } 1237 }
1201 if (time_before(complain, jiffies)) { 1238 if (time_before(complain, jiffies)) {
1202 printk(KERN_WARNING 1239 printk(KERN_WARNING
@@ -1207,7 +1244,7 @@ int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
1207 /* Can't do anything about it except throw an error. */ 1244 /* Can't do anything about it except throw an error. */
1208 task->tk_status = -EIO; 1245 task->tk_status = -EIO;
1209 } 1246 }
1210 return 0; 1247 return;
1211} 1248}
1212 1249
1213 1250
@@ -1292,6 +1329,8 @@ static int nfs_commit_rpcsetup(struct list_head *head,
1292 task = rpc_run_task(&task_setup_data); 1329 task = rpc_run_task(&task_setup_data);
1293 if (IS_ERR(task)) 1330 if (IS_ERR(task))
1294 return PTR_ERR(task); 1331 return PTR_ERR(task);
1332 if (how & FLUSH_SYNC)
1333 rpc_wait_for_completion_task(task);
1295 rpc_put_task(task); 1334 rpc_put_task(task);
1296 return 0; 1335 return 0;
1297} 1336}
diff --git a/fs/nfs_common/nfsacl.c b/fs/nfs_common/nfsacl.c
index fc1c52571c03..84c27d69d421 100644
--- a/fs/nfs_common/nfsacl.c
+++ b/fs/nfs_common/nfsacl.c
@@ -42,6 +42,11 @@ struct nfsacl_encode_desc {
42 gid_t gid; 42 gid_t gid;
43}; 43};
44 44
45struct nfsacl_simple_acl {
46 struct posix_acl acl;
47 struct posix_acl_entry ace[4];
48};
49
45static int 50static int
46xdr_nfsace_encode(struct xdr_array2_desc *desc, void *elem) 51xdr_nfsace_encode(struct xdr_array2_desc *desc, void *elem)
47{ 52{
@@ -72,9 +77,20 @@ xdr_nfsace_encode(struct xdr_array2_desc *desc, void *elem)
72 return 0; 77 return 0;
73} 78}
74 79
75unsigned int 80/**
76nfsacl_encode(struct xdr_buf *buf, unsigned int base, struct inode *inode, 81 * nfsacl_encode - Encode an NFSv3 ACL
77 struct posix_acl *acl, int encode_entries, int typeflag) 82 *
83 * @buf: destination xdr_buf to contain XDR encoded ACL
84 * @base: byte offset in xdr_buf where XDR'd ACL begins
85 * @inode: inode of file whose ACL this is
86 * @acl: posix_acl to encode
87 * @encode_entries: whether to encode ACEs as well
88 * @typeflag: ACL type: NFS_ACL_DEFAULT or zero
89 *
90 * Returns size of encoded ACL in bytes or a negative errno value.
91 */
92int nfsacl_encode(struct xdr_buf *buf, unsigned int base, struct inode *inode,
93 struct posix_acl *acl, int encode_entries, int typeflag)
78{ 94{
79 int entries = (acl && acl->a_count) ? max_t(int, acl->a_count, 4) : 0; 95 int entries = (acl && acl->a_count) ? max_t(int, acl->a_count, 4) : 0;
80 struct nfsacl_encode_desc nfsacl_desc = { 96 struct nfsacl_encode_desc nfsacl_desc = {
@@ -88,17 +104,22 @@ nfsacl_encode(struct xdr_buf *buf, unsigned int base, struct inode *inode,
88 .uid = inode->i_uid, 104 .uid = inode->i_uid,
89 .gid = inode->i_gid, 105 .gid = inode->i_gid,
90 }; 106 };
107 struct nfsacl_simple_acl aclbuf;
91 int err; 108 int err;
92 struct posix_acl *acl2 = NULL;
93 109
94 if (entries > NFS_ACL_MAX_ENTRIES || 110 if (entries > NFS_ACL_MAX_ENTRIES ||
95 xdr_encode_word(buf, base, entries)) 111 xdr_encode_word(buf, base, entries))
96 return -EINVAL; 112 return -EINVAL;
97 if (encode_entries && acl && acl->a_count == 3) { 113 if (encode_entries && acl && acl->a_count == 3) {
98 /* Fake up an ACL_MASK entry. */ 114 struct posix_acl *acl2 = &aclbuf.acl;
99 acl2 = posix_acl_alloc(4, GFP_KERNEL); 115
100 if (!acl2) 116 /* Avoid the use of posix_acl_alloc(). nfsacl_encode() is
101 return -ENOMEM; 117 * invoked in contexts where a memory allocation failure is
118 * fatal. Fortunately this fake ACL is small enough to
119 * construct on the stack. */
120 memset(acl2, 0, sizeof(acl2));
121 posix_acl_init(acl2, 4);
122
102 /* Insert entries in canonical order: other orders seem 123 /* Insert entries in canonical order: other orders seem
103 to confuse Solaris VxFS. */ 124 to confuse Solaris VxFS. */
104 acl2->a_entries[0] = acl->a_entries[0]; /* ACL_USER_OBJ */ 125 acl2->a_entries[0] = acl->a_entries[0]; /* ACL_USER_OBJ */
@@ -109,8 +130,6 @@ nfsacl_encode(struct xdr_buf *buf, unsigned int base, struct inode *inode,
109 nfsacl_desc.acl = acl2; 130 nfsacl_desc.acl = acl2;
110 } 131 }
111 err = xdr_encode_array2(buf, base + 4, &nfsacl_desc.desc); 132 err = xdr_encode_array2(buf, base + 4, &nfsacl_desc.desc);
112 if (acl2)
113 posix_acl_release(acl2);
114 if (!err) 133 if (!err)
115 err = 8 + nfsacl_desc.desc.elem_size * 134 err = 8 + nfsacl_desc.desc.elem_size *
116 nfsacl_desc.desc.array_len; 135 nfsacl_desc.desc.array_len;
@@ -224,9 +243,18 @@ posix_acl_from_nfsacl(struct posix_acl *acl)
224 return 0; 243 return 0;
225} 244}
226 245
227unsigned int 246/**
228nfsacl_decode(struct xdr_buf *buf, unsigned int base, unsigned int *aclcnt, 247 * nfsacl_decode - Decode an NFSv3 ACL
229 struct posix_acl **pacl) 248 *
249 * @buf: xdr_buf containing XDR'd ACL data to decode
250 * @base: byte offset in xdr_buf where XDR'd ACL begins
251 * @aclcnt: count of ACEs in decoded posix_acl
252 * @pacl: buffer in which to place decoded posix_acl
253 *
254 * Returns the length of the decoded ACL in bytes, or a negative errno value.
255 */
256int nfsacl_decode(struct xdr_buf *buf, unsigned int base, unsigned int *aclcnt,
257 struct posix_acl **pacl)
230{ 258{
231 struct nfsacl_decode_desc nfsacl_desc = { 259 struct nfsacl_decode_desc nfsacl_desc = {
232 .desc = { 260 .desc = {
diff --git a/fs/nfsctl.c b/fs/nfsctl.c
index bf9cbd242ddd..124e8fcb0dd6 100644
--- a/fs/nfsctl.c
+++ b/fs/nfsctl.c
@@ -22,30 +22,17 @@
22 22
23static struct file *do_open(char *name, int flags) 23static struct file *do_open(char *name, int flags)
24{ 24{
25 struct nameidata nd;
26 struct vfsmount *mnt; 25 struct vfsmount *mnt;
27 int error; 26 struct file *file;
28 27
29 mnt = do_kern_mount("nfsd", 0, "nfsd", NULL); 28 mnt = do_kern_mount("nfsd", 0, "nfsd", NULL);
30 if (IS_ERR(mnt)) 29 if (IS_ERR(mnt))
31 return (struct file *)mnt; 30 return (struct file *)mnt;
32 31
33 error = vfs_path_lookup(mnt->mnt_root, mnt, name, 0, &nd); 32 file = file_open_root(mnt->mnt_root, mnt, name, flags);
34 mntput(mnt); /* drop do_kern_mount reference */
35 if (error)
36 return ERR_PTR(error);
37
38 if (flags == O_RDWR)
39 error = may_open(&nd.path, MAY_READ|MAY_WRITE, flags);
40 else
41 error = may_open(&nd.path, MAY_WRITE, flags);
42 33
43 if (!error) 34 mntput(mnt); /* drop do_kern_mount reference */
44 return dentry_open(nd.path.dentry, nd.path.mnt, flags, 35 return file;
45 current_cred());
46
47 path_put(&nd.path);
48 return ERR_PTR(error);
49} 36}
50 37
51static struct { 38static struct {
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index 8b31e5f8795d..ad000aeb21a2 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -299,7 +299,6 @@ svc_expkey_update(struct svc_expkey *new, struct svc_expkey *old)
299 299
300#define EXPORT_HASHBITS 8 300#define EXPORT_HASHBITS 8
301#define EXPORT_HASHMAX (1<< EXPORT_HASHBITS) 301#define EXPORT_HASHMAX (1<< EXPORT_HASHBITS)
302#define EXPORT_HASHMASK (EXPORT_HASHMAX -1)
303 302
304static struct cache_head *export_table[EXPORT_HASHMAX]; 303static struct cache_head *export_table[EXPORT_HASHMAX];
305 304
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 3be975e18919..02eb4edf0ece 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -432,7 +432,7 @@ static int decode_cb_sequence4resok(struct xdr_stream *xdr,
432 * If the server returns different values for sessionID, slotID or 432 * If the server returns different values for sessionID, slotID or
433 * sequence number, the server is looney tunes. 433 * sequence number, the server is looney tunes.
434 */ 434 */
435 p = xdr_inline_decode(xdr, NFS4_MAX_SESSIONID_LEN + 4 + 4); 435 p = xdr_inline_decode(xdr, NFS4_MAX_SESSIONID_LEN + 4 + 4 + 4 + 4);
436 if (unlikely(p == NULL)) 436 if (unlikely(p == NULL))
437 goto out_overflow; 437 goto out_overflow;
438 memcpy(id.data, p, NFS4_MAX_SESSIONID_LEN); 438 memcpy(id.data, p, NFS4_MAX_SESSIONID_LEN);
@@ -484,7 +484,7 @@ static int decode_cb_sequence4res(struct xdr_stream *xdr,
484out: 484out:
485 return status; 485 return status;
486out_default: 486out_default:
487 return nfs_cb_stat_to_errno(status); 487 return nfs_cb_stat_to_errno(nfserr);
488} 488}
489 489
490/* 490/*
@@ -564,11 +564,9 @@ static int nfs4_xdr_dec_cb_recall(struct rpc_rqst *rqstp,
564 if (unlikely(status)) 564 if (unlikely(status))
565 goto out; 565 goto out;
566 if (unlikely(nfserr != NFS4_OK)) 566 if (unlikely(nfserr != NFS4_OK))
567 goto out_default; 567 status = nfs_cb_stat_to_errno(nfserr);
568out: 568out:
569 return status; 569 return status;
570out_default:
571 return nfs_cb_stat_to_errno(status);
572} 570}
573 571
574/* 572/*
diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c
index 6d2c397d458b..55780a22fdbd 100644
--- a/fs/nfsd/nfs4idmap.c
+++ b/fs/nfsd/nfs4idmap.c
@@ -63,7 +63,6 @@ struct ent {
63 63
64#define ENT_HASHBITS 8 64#define ENT_HASHBITS 8
65#define ENT_HASHMAX (1 << ENT_HASHBITS) 65#define ENT_HASHMAX (1 << ENT_HASHBITS)
66#define ENT_HASHMASK (ENT_HASHMAX - 1)
67 66
68static void 67static void
69ent_init(struct cache_head *cnew, struct cache_head *citm) 68ent_init(struct cache_head *cnew, struct cache_head *citm)
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index db52546143d1..5fcb1396a7e3 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -984,8 +984,8 @@ typedef __be32(*nfsd4op_func)(struct svc_rqst *, struct nfsd4_compound_state *,
984 void *); 984 void *);
985enum nfsd4_op_flags { 985enum nfsd4_op_flags {
986 ALLOWED_WITHOUT_FH = 1 << 0, /* No current filehandle required */ 986 ALLOWED_WITHOUT_FH = 1 << 0, /* No current filehandle required */
987 ALLOWED_ON_ABSENT_FS = 2 << 0, /* ops processed on absent fs */ 987 ALLOWED_ON_ABSENT_FS = 1 << 1, /* ops processed on absent fs */
988 ALLOWED_AS_FIRST_OP = 3 << 0, /* ops reqired first in compound */ 988 ALLOWED_AS_FIRST_OP = 1 << 2, /* ops reqired first in compound */
989}; 989};
990 990
991struct nfsd4_operation { 991struct nfsd4_operation {
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index d98d0213285d..fbde6f79922e 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -148,7 +148,7 @@ static struct list_head ownerstr_hashtbl[OWNER_HASH_SIZE];
148/* hash table for nfs4_file */ 148/* hash table for nfs4_file */
149#define FILE_HASH_BITS 8 149#define FILE_HASH_BITS 8
150#define FILE_HASH_SIZE (1 << FILE_HASH_BITS) 150#define FILE_HASH_SIZE (1 << FILE_HASH_BITS)
151#define FILE_HASH_MASK (FILE_HASH_SIZE - 1) 151
152/* hash table for (open)nfs4_stateid */ 152/* hash table for (open)nfs4_stateid */
153#define STATEID_HASH_BITS 10 153#define STATEID_HASH_BITS 10
154#define STATEID_HASH_SIZE (1 << STATEID_HASH_BITS) 154#define STATEID_HASH_SIZE (1 << STATEID_HASH_BITS)
@@ -230,9 +230,6 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_f
230 dp->dl_client = clp; 230 dp->dl_client = clp;
231 get_nfs4_file(fp); 231 get_nfs4_file(fp);
232 dp->dl_file = fp; 232 dp->dl_file = fp;
233 dp->dl_vfs_file = find_readable_file(fp);
234 get_file(dp->dl_vfs_file);
235 dp->dl_flock = NULL;
236 dp->dl_type = type; 233 dp->dl_type = type;
237 dp->dl_stateid.si_boot = boot_time; 234 dp->dl_stateid.si_boot = boot_time;
238 dp->dl_stateid.si_stateownerid = current_delegid++; 235 dp->dl_stateid.si_stateownerid = current_delegid++;
@@ -241,8 +238,6 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_f
241 fh_copy_shallow(&dp->dl_fh, &current_fh->fh_handle); 238 fh_copy_shallow(&dp->dl_fh, &current_fh->fh_handle);
242 dp->dl_time = 0; 239 dp->dl_time = 0;
243 atomic_set(&dp->dl_count, 1); 240 atomic_set(&dp->dl_count, 1);
244 list_add(&dp->dl_perfile, &fp->fi_delegations);
245 list_add(&dp->dl_perclnt, &clp->cl_delegations);
246 INIT_WORK(&dp->dl_recall.cb_work, nfsd4_do_callback_rpc); 241 INIT_WORK(&dp->dl_recall.cb_work, nfsd4_do_callback_rpc);
247 return dp; 242 return dp;
248} 243}
@@ -253,36 +248,30 @@ nfs4_put_delegation(struct nfs4_delegation *dp)
253 if (atomic_dec_and_test(&dp->dl_count)) { 248 if (atomic_dec_and_test(&dp->dl_count)) {
254 dprintk("NFSD: freeing dp %p\n",dp); 249 dprintk("NFSD: freeing dp %p\n",dp);
255 put_nfs4_file(dp->dl_file); 250 put_nfs4_file(dp->dl_file);
256 fput(dp->dl_vfs_file);
257 kmem_cache_free(deleg_slab, dp); 251 kmem_cache_free(deleg_slab, dp);
258 num_delegations--; 252 num_delegations--;
259 } 253 }
260} 254}
261 255
262/* Remove the associated file_lock first, then remove the delegation. 256static void nfs4_put_deleg_lease(struct nfs4_file *fp)
263 * lease_modify() is called to remove the FS_LEASE file_lock from
264 * the i_flock list, eventually calling nfsd's lock_manager
265 * fl_release_callback.
266 */
267static void
268nfs4_close_delegation(struct nfs4_delegation *dp)
269{ 257{
270 dprintk("NFSD: close_delegation dp %p\n",dp); 258 if (atomic_dec_and_test(&fp->fi_delegees)) {
271 /* XXX: do we even need this check?: */ 259 vfs_setlease(fp->fi_deleg_file, F_UNLCK, &fp->fi_lease);
272 if (dp->dl_flock) 260 fp->fi_lease = NULL;
273 vfs_setlease(dp->dl_vfs_file, F_UNLCK, &dp->dl_flock); 261 fp->fi_deleg_file = NULL;
262 }
274} 263}
275 264
276/* Called under the state lock. */ 265/* Called under the state lock. */
277static void 266static void
278unhash_delegation(struct nfs4_delegation *dp) 267unhash_delegation(struct nfs4_delegation *dp)
279{ 268{
280 list_del_init(&dp->dl_perfile);
281 list_del_init(&dp->dl_perclnt); 269 list_del_init(&dp->dl_perclnt);
282 spin_lock(&recall_lock); 270 spin_lock(&recall_lock);
271 list_del_init(&dp->dl_perfile);
283 list_del_init(&dp->dl_recall_lru); 272 list_del_init(&dp->dl_recall_lru);
284 spin_unlock(&recall_lock); 273 spin_unlock(&recall_lock);
285 nfs4_close_delegation(dp); 274 nfs4_put_deleg_lease(dp->dl_file);
286 nfs4_put_delegation(dp); 275 nfs4_put_delegation(dp);
287} 276}
288 277
@@ -327,64 +316,6 @@ static struct list_head unconf_id_hashtbl[CLIENT_HASH_SIZE];
327static struct list_head client_lru; 316static struct list_head client_lru;
328static struct list_head close_lru; 317static struct list_head close_lru;
329 318
330static void unhash_generic_stateid(struct nfs4_stateid *stp)
331{
332 list_del(&stp->st_hash);
333 list_del(&stp->st_perfile);
334 list_del(&stp->st_perstateowner);
335}
336
337static void free_generic_stateid(struct nfs4_stateid *stp)
338{
339 put_nfs4_file(stp->st_file);
340 kmem_cache_free(stateid_slab, stp);
341}
342
343static void release_lock_stateid(struct nfs4_stateid *stp)
344{
345 struct file *file;
346
347 unhash_generic_stateid(stp);
348 file = find_any_file(stp->st_file);
349 if (file)
350 locks_remove_posix(file, (fl_owner_t)stp->st_stateowner);
351 free_generic_stateid(stp);
352}
353
354static void unhash_lockowner(struct nfs4_stateowner *sop)
355{
356 struct nfs4_stateid *stp;
357
358 list_del(&sop->so_idhash);
359 list_del(&sop->so_strhash);
360 list_del(&sop->so_perstateid);
361 while (!list_empty(&sop->so_stateids)) {
362 stp = list_first_entry(&sop->so_stateids,
363 struct nfs4_stateid, st_perstateowner);
364 release_lock_stateid(stp);
365 }
366}
367
368static void release_lockowner(struct nfs4_stateowner *sop)
369{
370 unhash_lockowner(sop);
371 nfs4_put_stateowner(sop);
372}
373
374static void
375release_stateid_lockowners(struct nfs4_stateid *open_stp)
376{
377 struct nfs4_stateowner *lock_sop;
378
379 while (!list_empty(&open_stp->st_lockowners)) {
380 lock_sop = list_entry(open_stp->st_lockowners.next,
381 struct nfs4_stateowner, so_perstateid);
382 /* list_del(&open_stp->st_lockowners); */
383 BUG_ON(lock_sop->so_is_open_owner);
384 release_lockowner(lock_sop);
385 }
386}
387
388/* 319/*
389 * We store the NONE, READ, WRITE, and BOTH bits separately in the 320 * We store the NONE, READ, WRITE, and BOTH bits separately in the
390 * st_{access,deny}_bmap field of the stateid, in order to track not 321 * st_{access,deny}_bmap field of the stateid, in order to track not
@@ -457,13 +388,71 @@ static int nfs4_access_bmap_to_omode(struct nfs4_stateid *stp)
457 return nfs4_access_to_omode(access); 388 return nfs4_access_to_omode(access);
458} 389}
459 390
460static void release_open_stateid(struct nfs4_stateid *stp) 391static void unhash_generic_stateid(struct nfs4_stateid *stp)
392{
393 list_del(&stp->st_hash);
394 list_del(&stp->st_perfile);
395 list_del(&stp->st_perstateowner);
396}
397
398static void free_generic_stateid(struct nfs4_stateid *stp)
461{ 399{
462 int oflag = nfs4_access_bmap_to_omode(stp); 400 int oflag = nfs4_access_bmap_to_omode(stp);
463 401
402 nfs4_file_put_access(stp->st_file, oflag);
403 put_nfs4_file(stp->st_file);
404 kmem_cache_free(stateid_slab, stp);
405}
406
407static void release_lock_stateid(struct nfs4_stateid *stp)
408{
409 struct file *file;
410
411 unhash_generic_stateid(stp);
412 file = find_any_file(stp->st_file);
413 if (file)
414 locks_remove_posix(file, (fl_owner_t)stp->st_stateowner);
415 free_generic_stateid(stp);
416}
417
418static void unhash_lockowner(struct nfs4_stateowner *sop)
419{
420 struct nfs4_stateid *stp;
421
422 list_del(&sop->so_idhash);
423 list_del(&sop->so_strhash);
424 list_del(&sop->so_perstateid);
425 while (!list_empty(&sop->so_stateids)) {
426 stp = list_first_entry(&sop->so_stateids,
427 struct nfs4_stateid, st_perstateowner);
428 release_lock_stateid(stp);
429 }
430}
431
432static void release_lockowner(struct nfs4_stateowner *sop)
433{
434 unhash_lockowner(sop);
435 nfs4_put_stateowner(sop);
436}
437
438static void
439release_stateid_lockowners(struct nfs4_stateid *open_stp)
440{
441 struct nfs4_stateowner *lock_sop;
442
443 while (!list_empty(&open_stp->st_lockowners)) {
444 lock_sop = list_entry(open_stp->st_lockowners.next,
445 struct nfs4_stateowner, so_perstateid);
446 /* list_del(&open_stp->st_lockowners); */
447 BUG_ON(lock_sop->so_is_open_owner);
448 release_lockowner(lock_sop);
449 }
450}
451
452static void release_open_stateid(struct nfs4_stateid *stp)
453{
464 unhash_generic_stateid(stp); 454 unhash_generic_stateid(stp);
465 release_stateid_lockowners(stp); 455 release_stateid_lockowners(stp);
466 nfs4_file_put_access(stp->st_file, oflag);
467 free_generic_stateid(stp); 456 free_generic_stateid(stp);
468} 457}
469 458
@@ -619,7 +608,8 @@ static void init_forechannel_attrs(struct nfsd4_channel_attrs *new, struct nfsd4
619 u32 maxrpc = nfsd_serv->sv_max_mesg; 608 u32 maxrpc = nfsd_serv->sv_max_mesg;
620 609
621 new->maxreqs = numslots; 610 new->maxreqs = numslots;
622 new->maxresp_cached = slotsize + NFSD_MIN_HDR_SEQ_SZ; 611 new->maxresp_cached = min_t(u32, req->maxresp_cached,
612 slotsize + NFSD_MIN_HDR_SEQ_SZ);
623 new->maxreq_sz = min_t(u32, req->maxreq_sz, maxrpc); 613 new->maxreq_sz = min_t(u32, req->maxreq_sz, maxrpc);
624 new->maxresp_sz = min_t(u32, req->maxresp_sz, maxrpc); 614 new->maxresp_sz = min_t(u32, req->maxresp_sz, maxrpc);
625 new->maxops = min_t(u32, req->maxops, NFSD_MAX_OPS_PER_COMPOUND); 615 new->maxops = min_t(u32, req->maxops, NFSD_MAX_OPS_PER_COMPOUND);
@@ -958,8 +948,6 @@ expire_client(struct nfs4_client *clp)
958 spin_lock(&recall_lock); 948 spin_lock(&recall_lock);
959 while (!list_empty(&clp->cl_delegations)) { 949 while (!list_empty(&clp->cl_delegations)) {
960 dp = list_entry(clp->cl_delegations.next, struct nfs4_delegation, dl_perclnt); 950 dp = list_entry(clp->cl_delegations.next, struct nfs4_delegation, dl_perclnt);
961 dprintk("NFSD: expire client. dp %p, fp %p\n", dp,
962 dp->dl_flock);
963 list_del_init(&dp->dl_perclnt); 951 list_del_init(&dp->dl_perclnt);
964 list_move(&dp->dl_recall_lru, &reaplist); 952 list_move(&dp->dl_recall_lru, &reaplist);
965 } 953 }
@@ -2078,6 +2066,7 @@ alloc_init_file(struct inode *ino)
2078 fp->fi_inode = igrab(ino); 2066 fp->fi_inode = igrab(ino);
2079 fp->fi_id = current_fileid++; 2067 fp->fi_id = current_fileid++;
2080 fp->fi_had_conflict = false; 2068 fp->fi_had_conflict = false;
2069 fp->fi_lease = NULL;
2081 memset(fp->fi_fds, 0, sizeof(fp->fi_fds)); 2070 memset(fp->fi_fds, 0, sizeof(fp->fi_fds));
2082 memset(fp->fi_access, 0, sizeof(fp->fi_access)); 2071 memset(fp->fi_access, 0, sizeof(fp->fi_access));
2083 spin_lock(&recall_lock); 2072 spin_lock(&recall_lock);
@@ -2329,23 +2318,8 @@ nfs4_file_downgrade(struct nfs4_file *fp, unsigned int share_access)
2329 nfs4_file_put_access(fp, O_RDONLY); 2318 nfs4_file_put_access(fp, O_RDONLY);
2330} 2319}
2331 2320
2332/* 2321static void nfsd_break_one_deleg(struct nfs4_delegation *dp)
2333 * Spawn a thread to perform a recall on the delegation represented
2334 * by the lease (file_lock)
2335 *
2336 * Called from break_lease() with lock_flocks() held.
2337 * Note: we assume break_lease will only call this *once* for any given
2338 * lease.
2339 */
2340static
2341void nfsd_break_deleg_cb(struct file_lock *fl)
2342{ 2322{
2343 struct nfs4_delegation *dp = (struct nfs4_delegation *)fl->fl_owner;
2344
2345 dprintk("NFSD nfsd_break_deleg_cb: dp %p fl %p\n",dp,fl);
2346 if (!dp)
2347 return;
2348
2349 /* We're assuming the state code never drops its reference 2323 /* We're assuming the state code never drops its reference
2350 * without first removing the lease. Since we're in this lease 2324 * without first removing the lease. Since we're in this lease
2351 * callback (and since the lease code is serialized by the kernel 2325 * callback (and since the lease code is serialized by the kernel
@@ -2353,22 +2327,35 @@ void nfsd_break_deleg_cb(struct file_lock *fl)
2353 * it's safe to take a reference: */ 2327 * it's safe to take a reference: */
2354 atomic_inc(&dp->dl_count); 2328 atomic_inc(&dp->dl_count);
2355 2329
2356 spin_lock(&recall_lock);
2357 list_add_tail(&dp->dl_recall_lru, &del_recall_lru); 2330 list_add_tail(&dp->dl_recall_lru, &del_recall_lru);
2358 spin_unlock(&recall_lock);
2359 2331
2360 /* only place dl_time is set. protected by lock_flocks*/ 2332 /* only place dl_time is set. protected by lock_flocks*/
2361 dp->dl_time = get_seconds(); 2333 dp->dl_time = get_seconds();
2362 2334
2335 nfsd4_cb_recall(dp);
2336}
2337
2338/* Called from break_lease() with lock_flocks() held. */
2339static void nfsd_break_deleg_cb(struct file_lock *fl)
2340{
2341 struct nfs4_file *fp = (struct nfs4_file *)fl->fl_owner;
2342 struct nfs4_delegation *dp;
2343
2344 BUG_ON(!fp);
2345 /* We assume break_lease is only called once per lease: */
2346 BUG_ON(fp->fi_had_conflict);
2363 /* 2347 /*
2364 * We don't want the locks code to timeout the lease for us; 2348 * We don't want the locks code to timeout the lease for us;
2365 * we'll remove it ourself if the delegation isn't returned 2349 * we'll remove it ourself if a delegation isn't returned
2366 * in time. 2350 * in time:
2367 */ 2351 */
2368 fl->fl_break_time = 0; 2352 fl->fl_break_time = 0;
2369 2353
2370 dp->dl_file->fi_had_conflict = true; 2354 spin_lock(&recall_lock);
2371 nfsd4_cb_recall(dp); 2355 fp->fi_had_conflict = true;
2356 list_for_each_entry(dp, &fp->fi_delegations, dl_perfile)
2357 nfsd_break_one_deleg(dp);
2358 spin_unlock(&recall_lock);
2372} 2359}
2373 2360
2374static 2361static
@@ -2461,10 +2448,13 @@ find_delegation_file(struct nfs4_file *fp, stateid_t *stid)
2461{ 2448{
2462 struct nfs4_delegation *dp; 2449 struct nfs4_delegation *dp;
2463 2450
2464 list_for_each_entry(dp, &fp->fi_delegations, dl_perfile) { 2451 spin_lock(&recall_lock);
2465 if (dp->dl_stateid.si_stateownerid == stid->si_stateownerid) 2452 list_for_each_entry(dp, &fp->fi_delegations, dl_perfile)
2453 if (dp->dl_stateid.si_stateownerid == stid->si_stateownerid) {
2454 spin_unlock(&recall_lock);
2466 return dp; 2455 return dp;
2467 } 2456 }
2457 spin_unlock(&recall_lock);
2468 return NULL; 2458 return NULL;
2469} 2459}
2470 2460
@@ -2641,6 +2631,66 @@ static bool nfsd4_cb_channel_good(struct nfs4_client *clp)
2641 return clp->cl_minorversion && clp->cl_cb_state == NFSD4_CB_UNKNOWN; 2631 return clp->cl_minorversion && clp->cl_cb_state == NFSD4_CB_UNKNOWN;
2642} 2632}
2643 2633
2634static struct file_lock *nfs4_alloc_init_lease(struct nfs4_delegation *dp, int flag)
2635{
2636 struct file_lock *fl;
2637
2638 fl = locks_alloc_lock();
2639 if (!fl)
2640 return NULL;
2641 locks_init_lock(fl);
2642 fl->fl_lmops = &nfsd_lease_mng_ops;
2643 fl->fl_flags = FL_LEASE;
2644 fl->fl_type = flag == NFS4_OPEN_DELEGATE_READ? F_RDLCK: F_WRLCK;
2645 fl->fl_end = OFFSET_MAX;
2646 fl->fl_owner = (fl_owner_t)(dp->dl_file);
2647 fl->fl_pid = current->tgid;
2648 return fl;
2649}
2650
2651static int nfs4_setlease(struct nfs4_delegation *dp, int flag)
2652{
2653 struct nfs4_file *fp = dp->dl_file;
2654 struct file_lock *fl;
2655 int status;
2656
2657 fl = nfs4_alloc_init_lease(dp, flag);
2658 if (!fl)
2659 return -ENOMEM;
2660 fl->fl_file = find_readable_file(fp);
2661 list_add(&dp->dl_perclnt, &dp->dl_client->cl_delegations);
2662 status = vfs_setlease(fl->fl_file, fl->fl_type, &fl);
2663 if (status) {
2664 list_del_init(&dp->dl_perclnt);
2665 locks_free_lock(fl);
2666 return -ENOMEM;
2667 }
2668 fp->fi_lease = fl;
2669 fp->fi_deleg_file = fl->fl_file;
2670 get_file(fp->fi_deleg_file);
2671 atomic_set(&fp->fi_delegees, 1);
2672 list_add(&dp->dl_perfile, &fp->fi_delegations);
2673 return 0;
2674}
2675
2676static int nfs4_set_delegation(struct nfs4_delegation *dp, int flag)
2677{
2678 struct nfs4_file *fp = dp->dl_file;
2679
2680 if (!fp->fi_lease)
2681 return nfs4_setlease(dp, flag);
2682 spin_lock(&recall_lock);
2683 if (fp->fi_had_conflict) {
2684 spin_unlock(&recall_lock);
2685 return -EAGAIN;
2686 }
2687 atomic_inc(&fp->fi_delegees);
2688 list_add(&dp->dl_perfile, &fp->fi_delegations);
2689 spin_unlock(&recall_lock);
2690 list_add(&dp->dl_perclnt, &dp->dl_client->cl_delegations);
2691 return 0;
2692}
2693
2644/* 2694/*
2645 * Attempt to hand out a delegation. 2695 * Attempt to hand out a delegation.
2646 */ 2696 */
@@ -2650,7 +2700,6 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta
2650 struct nfs4_delegation *dp; 2700 struct nfs4_delegation *dp;
2651 struct nfs4_stateowner *sop = stp->st_stateowner; 2701 struct nfs4_stateowner *sop = stp->st_stateowner;
2652 int cb_up; 2702 int cb_up;
2653 struct file_lock *fl;
2654 int status, flag = 0; 2703 int status, flag = 0;
2655 2704
2656 cb_up = nfsd4_cb_channel_good(sop->so_client); 2705 cb_up = nfsd4_cb_channel_good(sop->so_client);
@@ -2681,36 +2730,11 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta
2681 } 2730 }
2682 2731
2683 dp = alloc_init_deleg(sop->so_client, stp, fh, flag); 2732 dp = alloc_init_deleg(sop->so_client, stp, fh, flag);
2684 if (dp == NULL) { 2733 if (dp == NULL)
2685 flag = NFS4_OPEN_DELEGATE_NONE; 2734 goto out_no_deleg;
2686 goto out; 2735 status = nfs4_set_delegation(dp, flag);
2687 } 2736 if (status)
2688 status = -ENOMEM; 2737 goto out_free;
2689 fl = locks_alloc_lock();
2690 if (!fl)
2691 goto out;
2692 locks_init_lock(fl);
2693 fl->fl_lmops = &nfsd_lease_mng_ops;
2694 fl->fl_flags = FL_LEASE;
2695 fl->fl_type = flag == NFS4_OPEN_DELEGATE_READ? F_RDLCK: F_WRLCK;
2696 fl->fl_end = OFFSET_MAX;
2697 fl->fl_owner = (fl_owner_t)dp;
2698 fl->fl_file = find_readable_file(stp->st_file);
2699 BUG_ON(!fl->fl_file);
2700 fl->fl_pid = current->tgid;
2701 dp->dl_flock = fl;
2702
2703 /* vfs_setlease checks to see if delegation should be handed out.
2704 * the lock_manager callback fl_change is used
2705 */
2706 if ((status = vfs_setlease(fl->fl_file, fl->fl_type, &fl))) {
2707 dprintk("NFSD: setlease failed [%d], no delegation\n", status);
2708 dp->dl_flock = NULL;
2709 locks_free_lock(fl);
2710 unhash_delegation(dp);
2711 flag = NFS4_OPEN_DELEGATE_NONE;
2712 goto out;
2713 }
2714 2738
2715 memcpy(&open->op_delegate_stateid, &dp->dl_stateid, sizeof(dp->dl_stateid)); 2739 memcpy(&open->op_delegate_stateid, &dp->dl_stateid, sizeof(dp->dl_stateid));
2716 2740
@@ -2722,6 +2746,12 @@ out:
2722 && open->op_delegate_type != NFS4_OPEN_DELEGATE_NONE) 2746 && open->op_delegate_type != NFS4_OPEN_DELEGATE_NONE)
2723 dprintk("NFSD: WARNING: refusing delegation reclaim\n"); 2747 dprintk("NFSD: WARNING: refusing delegation reclaim\n");
2724 open->op_delegate_type = flag; 2748 open->op_delegate_type = flag;
2749 return;
2750out_free:
2751 nfs4_put_delegation(dp);
2752out_no_deleg:
2753 flag = NFS4_OPEN_DELEGATE_NONE;
2754 goto out;
2725} 2755}
2726 2756
2727/* 2757/*
@@ -2916,8 +2946,6 @@ nfs4_laundromat(void)
2916 test_val = u; 2946 test_val = u;
2917 break; 2947 break;
2918 } 2948 }
2919 dprintk("NFSD: purging unused delegation dp %p, fp %p\n",
2920 dp, dp->dl_flock);
2921 list_move(&dp->dl_recall_lru, &reaplist); 2949 list_move(&dp->dl_recall_lru, &reaplist);
2922 } 2950 }
2923 spin_unlock(&recall_lock); 2951 spin_unlock(&recall_lock);
@@ -3128,7 +3156,7 @@ nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate,
3128 goto out; 3156 goto out;
3129 renew_client(dp->dl_client); 3157 renew_client(dp->dl_client);
3130 if (filpp) { 3158 if (filpp) {
3131 *filpp = find_readable_file(dp->dl_file); 3159 *filpp = dp->dl_file->fi_deleg_file;
3132 BUG_ON(!*filpp); 3160 BUG_ON(!*filpp);
3133 } 3161 }
3134 } else { /* open or lock stateid */ 3162 } else { /* open or lock stateid */
@@ -3708,6 +3736,7 @@ alloc_init_lock_stateid(struct nfs4_stateowner *sop, struct nfs4_file *fp, struc
3708 stp->st_stateid.si_stateownerid = sop->so_id; 3736 stp->st_stateid.si_stateownerid = sop->so_id;
3709 stp->st_stateid.si_fileid = fp->fi_id; 3737 stp->st_stateid.si_fileid = fp->fi_id;
3710 stp->st_stateid.si_generation = 0; 3738 stp->st_stateid.si_generation = 0;
3739 stp->st_access_bmap = 0;
3711 stp->st_deny_bmap = open_stp->st_deny_bmap; 3740 stp->st_deny_bmap = open_stp->st_deny_bmap;
3712 stp->st_openstp = open_stp; 3741 stp->st_openstp = open_stp;
3713 3742
@@ -3722,6 +3751,17 @@ check_lock_length(u64 offset, u64 length)
3722 LOFF_OVERFLOW(offset, length))); 3751 LOFF_OVERFLOW(offset, length)));
3723} 3752}
3724 3753
3754static void get_lock_access(struct nfs4_stateid *lock_stp, u32 access)
3755{
3756 struct nfs4_file *fp = lock_stp->st_file;
3757 int oflag = nfs4_access_to_omode(access);
3758
3759 if (test_bit(access, &lock_stp->st_access_bmap))
3760 return;
3761 nfs4_file_get_access(fp, oflag);
3762 __set_bit(access, &lock_stp->st_access_bmap);
3763}
3764
3725/* 3765/*
3726 * LOCK operation 3766 * LOCK operation
3727 */ 3767 */
@@ -3738,7 +3778,6 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
3738 struct file_lock conflock; 3778 struct file_lock conflock;
3739 __be32 status = 0; 3779 __be32 status = 0;
3740 unsigned int strhashval; 3780 unsigned int strhashval;
3741 unsigned int cmd;
3742 int err; 3781 int err;
3743 3782
3744 dprintk("NFSD: nfsd4_lock: start=%Ld length=%Ld\n", 3783 dprintk("NFSD: nfsd4_lock: start=%Ld length=%Ld\n",
@@ -3820,22 +3859,18 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
3820 switch (lock->lk_type) { 3859 switch (lock->lk_type) {
3821 case NFS4_READ_LT: 3860 case NFS4_READ_LT:
3822 case NFS4_READW_LT: 3861 case NFS4_READW_LT:
3823 if (find_readable_file(lock_stp->st_file)) { 3862 filp = find_readable_file(lock_stp->st_file);
3824 nfs4_get_vfs_file(rqstp, fp, &cstate->current_fh, NFS4_SHARE_ACCESS_READ); 3863 if (filp)
3825 filp = find_readable_file(lock_stp->st_file); 3864 get_lock_access(lock_stp, NFS4_SHARE_ACCESS_READ);
3826 }
3827 file_lock.fl_type = F_RDLCK; 3865 file_lock.fl_type = F_RDLCK;
3828 cmd = F_SETLK; 3866 break;
3829 break;
3830 case NFS4_WRITE_LT: 3867 case NFS4_WRITE_LT:
3831 case NFS4_WRITEW_LT: 3868 case NFS4_WRITEW_LT:
3832 if (find_writeable_file(lock_stp->st_file)) { 3869 filp = find_writeable_file(lock_stp->st_file);
3833 nfs4_get_vfs_file(rqstp, fp, &cstate->current_fh, NFS4_SHARE_ACCESS_WRITE); 3870 if (filp)
3834 filp = find_writeable_file(lock_stp->st_file); 3871 get_lock_access(lock_stp, NFS4_SHARE_ACCESS_WRITE);
3835 }
3836 file_lock.fl_type = F_WRLCK; 3872 file_lock.fl_type = F_WRLCK;
3837 cmd = F_SETLK; 3873 break;
3838 break;
3839 default: 3874 default:
3840 status = nfserr_inval; 3875 status = nfserr_inval;
3841 goto out; 3876 goto out;
@@ -3859,7 +3894,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
3859 * Note: locks.c uses the BKL to protect the inode's lock list. 3894 * Note: locks.c uses the BKL to protect the inode's lock list.
3860 */ 3895 */
3861 3896
3862 err = vfs_lock_file(filp, cmd, &file_lock, &conflock); 3897 err = vfs_lock_file(filp, F_SETLK, &file_lock, &conflock);
3863 switch (-err) { 3898 switch (-err) {
3864 case 0: /* success! */ 3899 case 0: /* success! */
3865 update_stateid(&lock_stp->st_stateid); 3900 update_stateid(&lock_stp->st_stateid);
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 956629b9cdc9..c6766af00d98 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -317,8 +317,8 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval,
317 READ_BUF(dummy32); 317 READ_BUF(dummy32);
318 len += (XDR_QUADLEN(dummy32) << 2); 318 len += (XDR_QUADLEN(dummy32) << 2);
319 READMEM(buf, dummy32); 319 READMEM(buf, dummy32);
320 if ((host_err = nfsd_map_name_to_uid(argp->rqstp, buf, dummy32, &iattr->ia_uid))) 320 if ((status = nfsd_map_name_to_uid(argp->rqstp, buf, dummy32, &iattr->ia_uid)))
321 goto out_nfserr; 321 return status;
322 iattr->ia_valid |= ATTR_UID; 322 iattr->ia_valid |= ATTR_UID;
323 } 323 }
324 if (bmval[1] & FATTR4_WORD1_OWNER_GROUP) { 324 if (bmval[1] & FATTR4_WORD1_OWNER_GROUP) {
@@ -328,8 +328,8 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval,
328 READ_BUF(dummy32); 328 READ_BUF(dummy32);
329 len += (XDR_QUADLEN(dummy32) << 2); 329 len += (XDR_QUADLEN(dummy32) << 2);
330 READMEM(buf, dummy32); 330 READMEM(buf, dummy32);
331 if ((host_err = nfsd_map_name_to_gid(argp->rqstp, buf, dummy32, &iattr->ia_gid))) 331 if ((status = nfsd_map_name_to_gid(argp->rqstp, buf, dummy32, &iattr->ia_gid)))
332 goto out_nfserr; 332 return status;
333 iattr->ia_valid |= ATTR_GID; 333 iattr->ia_valid |= ATTR_GID;
334 } 334 }
335 if (bmval[1] & FATTR4_WORD1_TIME_ACCESS_SET) { 335 if (bmval[1] & FATTR4_WORD1_TIME_ACCESS_SET) {
@@ -1215,8 +1215,6 @@ nfsd4_decode_create_session(struct nfsd4_compoundargs *argp,
1215 READ_BUF(4); 1215 READ_BUF(4);
1216 READ32(dummy); 1216 READ32(dummy);
1217 READ_BUF(dummy * 4); 1217 READ_BUF(dummy * 4);
1218 for (i = 0; i < dummy; ++i)
1219 READ32(dummy);
1220 break; 1218 break;
1221 case RPC_AUTH_GSS: 1219 case RPC_AUTH_GSS:
1222 dprintk("RPC_AUTH_GSS callback secflavor " 1220 dprintk("RPC_AUTH_GSS callback secflavor "
@@ -1232,7 +1230,6 @@ nfsd4_decode_create_session(struct nfsd4_compoundargs *argp,
1232 READ_BUF(4); 1230 READ_BUF(4);
1233 READ32(dummy); 1231 READ32(dummy);
1234 READ_BUF(dummy); 1232 READ_BUF(dummy);
1235 p += XDR_QUADLEN(dummy);
1236 break; 1233 break;
1237 default: 1234 default:
1238 dprintk("Illegal callback secflavor\n"); 1235 dprintk("Illegal callback secflavor\n");
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 33b3e2b06779..1f5eae40f34e 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -12,13 +12,14 @@
12#include <linux/nfsd/syscall.h> 12#include <linux/nfsd/syscall.h>
13#include <linux/lockd/lockd.h> 13#include <linux/lockd/lockd.h>
14#include <linux/sunrpc/clnt.h> 14#include <linux/sunrpc/clnt.h>
15#include <linux/sunrpc/gss_api.h>
15 16
16#include "idmap.h" 17#include "idmap.h"
17#include "nfsd.h" 18#include "nfsd.h"
18#include "cache.h" 19#include "cache.h"
19 20
20/* 21/*
21 * We have a single directory with 9 nodes in it. 22 * We have a single directory with several nodes in it.
22 */ 23 */
23enum { 24enum {
24 NFSD_Root = 1, 25 NFSD_Root = 1,
@@ -42,6 +43,7 @@ enum {
42 NFSD_Versions, 43 NFSD_Versions,
43 NFSD_Ports, 44 NFSD_Ports,
44 NFSD_MaxBlkSize, 45 NFSD_MaxBlkSize,
46 NFSD_SupportedEnctypes,
45 /* 47 /*
46 * The below MUST come last. Otherwise we leave a hole in nfsd_files[] 48 * The below MUST come last. Otherwise we leave a hole in nfsd_files[]
47 * with !CONFIG_NFSD_V4 and simple_fill_super() goes oops 49 * with !CONFIG_NFSD_V4 and simple_fill_super() goes oops
@@ -187,6 +189,34 @@ static struct file_operations export_features_operations = {
187 .release = single_release, 189 .release = single_release,
188}; 190};
189 191
192#ifdef CONFIG_SUNRPC_GSS
193static int supported_enctypes_show(struct seq_file *m, void *v)
194{
195 struct gss_api_mech *k5mech;
196
197 k5mech = gss_mech_get_by_name("krb5");
198 if (k5mech == NULL)
199 goto out;
200 if (k5mech->gm_upcall_enctypes != NULL)
201 seq_printf(m, k5mech->gm_upcall_enctypes);
202 gss_mech_put(k5mech);
203out:
204 return 0;
205}
206
207static int supported_enctypes_open(struct inode *inode, struct file *file)
208{
209 return single_open(file, supported_enctypes_show, NULL);
210}
211
212static struct file_operations supported_enctypes_ops = {
213 .open = supported_enctypes_open,
214 .read = seq_read,
215 .llseek = seq_lseek,
216 .release = single_release,
217};
218#endif /* CONFIG_SUNRPC_GSS */
219
190extern int nfsd_pool_stats_open(struct inode *inode, struct file *file); 220extern int nfsd_pool_stats_open(struct inode *inode, struct file *file);
191extern int nfsd_pool_stats_release(struct inode *inode, struct file *file); 221extern int nfsd_pool_stats_release(struct inode *inode, struct file *file);
192 222
@@ -1397,6 +1427,9 @@ static int nfsd_fill_super(struct super_block * sb, void * data, int silent)
1397 [NFSD_Versions] = {"versions", &transaction_ops, S_IWUSR|S_IRUSR}, 1427 [NFSD_Versions] = {"versions", &transaction_ops, S_IWUSR|S_IRUSR},
1398 [NFSD_Ports] = {"portlist", &transaction_ops, S_IWUSR|S_IRUGO}, 1428 [NFSD_Ports] = {"portlist", &transaction_ops, S_IWUSR|S_IRUGO},
1399 [NFSD_MaxBlkSize] = {"max_block_size", &transaction_ops, S_IWUSR|S_IRUGO}, 1429 [NFSD_MaxBlkSize] = {"max_block_size", &transaction_ops, S_IWUSR|S_IRUGO},
1430#ifdef CONFIG_SUNRPC_GSS
1431 [NFSD_SupportedEnctypes] = {"supported_krb5_enctypes", &supported_enctypes_ops, S_IRUGO},
1432#endif /* CONFIG_SUNRPC_GSS */
1400#ifdef CONFIG_NFSD_V4 1433#ifdef CONFIG_NFSD_V4
1401 [NFSD_Leasetime] = {"nfsv4leasetime", &transaction_ops, S_IWUSR|S_IRUSR}, 1434 [NFSD_Leasetime] = {"nfsv4leasetime", &transaction_ops, S_IWUSR|S_IRUSR},
1402 [NFSD_Gracetime] = {"nfsv4gracetime", &transaction_ops, S_IWUSR|S_IRUSR}, 1435 [NFSD_Gracetime] = {"nfsv4gracetime", &transaction_ops, S_IWUSR|S_IRUSR},
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 3074656ba7bf..6bd2f3c21f2b 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -83,8 +83,6 @@ struct nfs4_delegation {
83 atomic_t dl_count; /* ref count */ 83 atomic_t dl_count; /* ref count */
84 struct nfs4_client *dl_client; 84 struct nfs4_client *dl_client;
85 struct nfs4_file *dl_file; 85 struct nfs4_file *dl_file;
86 struct file *dl_vfs_file;
87 struct file_lock *dl_flock;
88 u32 dl_type; 86 u32 dl_type;
89 time_t dl_time; 87 time_t dl_time;
90/* For recall: */ 88/* For recall: */
@@ -369,16 +367,15 @@ struct nfs4_file {
369 struct list_head fi_delegations; 367 struct list_head fi_delegations;
370 /* One each for O_RDONLY, O_WRONLY, O_RDWR: */ 368 /* One each for O_RDONLY, O_WRONLY, O_RDWR: */
371 struct file * fi_fds[3]; 369 struct file * fi_fds[3];
372 /* One each for O_RDONLY, O_WRONLY: */
373 atomic_t fi_access[2];
374 /* 370 /*
375 * Each open stateid contributes 1 to either fi_readers or 371 * Each open or lock stateid contributes 1 to either
376 * fi_writers, or both, depending on the open mode. A 372 * fi_access[O_RDONLY], fi_access[O_WRONLY], or both, depending
377 * delegation also takes an fi_readers reference. Lock 373 * on open or lock mode:
378 * stateid's take none.
379 */ 374 */
380 atomic_t fi_readers; 375 atomic_t fi_access[2];
381 atomic_t fi_writers; 376 struct file *fi_deleg_file;
377 struct file_lock *fi_lease;
378 atomic_t fi_delegees;
382 struct inode *fi_inode; 379 struct inode *fi_inode;
383 u32 fi_id; /* used with stateowner->so_id 380 u32 fi_id; /* used with stateowner->so_id
384 * for stateid_hashtbl hash */ 381 * for stateid_hashtbl hash */
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 641117f2188d..2e1cebde90df 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -87,7 +87,7 @@ nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp,
87 .dentry = dget(dentry)}; 87 .dentry = dget(dentry)};
88 int err = 0; 88 int err = 0;
89 89
90 err = follow_down(&path, false); 90 err = follow_down(&path);
91 if (err < 0) 91 if (err < 0)
92 goto out; 92 goto out;
93 93
@@ -808,7 +808,7 @@ nfsd_get_raparms(dev_t dev, ino_t ino)
808 if (ra->p_count == 0) 808 if (ra->p_count == 0)
809 frap = rap; 809 frap = rap;
810 } 810 }
811 depth = nfsdstats.ra_size*11/10; 811 depth = nfsdstats.ra_size;
812 if (!frap) { 812 if (!frap) {
813 spin_unlock(&rab->pb_lock); 813 spin_unlock(&rab->pb_lock);
814 return NULL; 814 return NULL;
@@ -1744,6 +1744,11 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
1744 host_err = nfsd_break_lease(odentry->d_inode); 1744 host_err = nfsd_break_lease(odentry->d_inode);
1745 if (host_err) 1745 if (host_err)
1746 goto out_drop_write; 1746 goto out_drop_write;
1747 if (ndentry->d_inode) {
1748 host_err = nfsd_break_lease(ndentry->d_inode);
1749 if (host_err)
1750 goto out_drop_write;
1751 }
1747 host_err = vfs_rename(fdir, odentry, tdir, ndentry); 1752 host_err = vfs_rename(fdir, odentry, tdir, ndentry);
1748 if (!host_err) { 1753 if (!host_err) {
1749 host_err = commit_metadata(tfhp); 1754 host_err = commit_metadata(tfhp);
@@ -1812,22 +1817,22 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
1812 1817
1813 host_err = mnt_want_write(fhp->fh_export->ex_path.mnt); 1818 host_err = mnt_want_write(fhp->fh_export->ex_path.mnt);
1814 if (host_err) 1819 if (host_err)
1815 goto out_nfserr; 1820 goto out_put;
1816 1821
1817 host_err = nfsd_break_lease(rdentry->d_inode); 1822 host_err = nfsd_break_lease(rdentry->d_inode);
1818 if (host_err) 1823 if (host_err)
1819 goto out_put; 1824 goto out_drop_write;
1820 if (type != S_IFDIR) 1825 if (type != S_IFDIR)
1821 host_err = vfs_unlink(dirp, rdentry); 1826 host_err = vfs_unlink(dirp, rdentry);
1822 else 1827 else
1823 host_err = vfs_rmdir(dirp, rdentry); 1828 host_err = vfs_rmdir(dirp, rdentry);
1824out_put:
1825 dput(rdentry);
1826
1827 if (!host_err) 1829 if (!host_err)
1828 host_err = commit_metadata(fhp); 1830 host_err = commit_metadata(fhp);
1829 1831out_drop_write:
1830 mnt_drop_write(fhp->fh_export->ex_path.mnt); 1832 mnt_drop_write(fhp->fh_export->ex_path.mnt);
1833out_put:
1834 dput(rdentry);
1835
1831out_nfserr: 1836out_nfserr:
1832 err = nfserrno(host_err); 1837 err = nfserrno(host_err);
1833out: 1838out:
diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c
index d7fd696e595c..0a0a66d98cce 100644
--- a/fs/nilfs2/alloc.c
+++ b/fs/nilfs2/alloc.c
@@ -521,8 +521,8 @@ void nilfs_palloc_commit_free_entry(struct inode *inode,
521 group_offset, bitmap)) 521 group_offset, bitmap))
522 printk(KERN_WARNING "%s: entry number %llu already freed\n", 522 printk(KERN_WARNING "%s: entry number %llu already freed\n",
523 __func__, (unsigned long long)req->pr_entry_nr); 523 __func__, (unsigned long long)req->pr_entry_nr);
524 524 else
525 nilfs_palloc_group_desc_add_entries(inode, group, desc, 1); 525 nilfs_palloc_group_desc_add_entries(inode, group, desc, 1);
526 526
527 kunmap(req->pr_bitmap_bh->b_page); 527 kunmap(req->pr_bitmap_bh->b_page);
528 kunmap(req->pr_desc_bh->b_page); 528 kunmap(req->pr_desc_bh->b_page);
@@ -558,8 +558,8 @@ void nilfs_palloc_abort_alloc_entry(struct inode *inode,
558 group_offset, bitmap)) 558 group_offset, bitmap))
559 printk(KERN_WARNING "%s: entry number %llu already freed\n", 559 printk(KERN_WARNING "%s: entry number %llu already freed\n",
560 __func__, (unsigned long long)req->pr_entry_nr); 560 __func__, (unsigned long long)req->pr_entry_nr);
561 561 else
562 nilfs_palloc_group_desc_add_entries(inode, group, desc, 1); 562 nilfs_palloc_group_desc_add_entries(inode, group, desc, 1);
563 563
564 kunmap(req->pr_bitmap_bh->b_page); 564 kunmap(req->pr_bitmap_bh->b_page);
565 kunmap(req->pr_desc_bh->b_page); 565 kunmap(req->pr_desc_bh->b_page);
@@ -665,7 +665,7 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems)
665 for (j = i, n = 0; 665 for (j = i, n = 0;
666 (j < nitems) && nilfs_palloc_group_is_in(inode, group, 666 (j < nitems) && nilfs_palloc_group_is_in(inode, group,
667 entry_nrs[j]); 667 entry_nrs[j]);
668 j++, n++) { 668 j++) {
669 nilfs_palloc_group(inode, entry_nrs[j], &group_offset); 669 nilfs_palloc_group(inode, entry_nrs[j], &group_offset);
670 if (!nilfs_clear_bit_atomic( 670 if (!nilfs_clear_bit_atomic(
671 nilfs_mdt_bgl_lock(inode, group), 671 nilfs_mdt_bgl_lock(inode, group),
@@ -674,6 +674,8 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems)
674 "%s: entry number %llu already freed\n", 674 "%s: entry number %llu already freed\n",
675 __func__, 675 __func__,
676 (unsigned long long)entry_nrs[j]); 676 (unsigned long long)entry_nrs[j]);
677 } else {
678 n++;
677 } 679 }
678 } 680 }
679 nilfs_palloc_group_desc_add_entries(inode, group, desc, n); 681 nilfs_palloc_group_desc_add_entries(inode, group, desc, n);
diff --git a/fs/nilfs2/alloc.h b/fs/nilfs2/alloc.h
index 9af34a7e6e13..f5fde36b9e28 100644
--- a/fs/nilfs2/alloc.h
+++ b/fs/nilfs2/alloc.h
@@ -74,7 +74,7 @@ int nilfs_palloc_freev(struct inode *, __u64 *, size_t);
74 74
75#define nilfs_set_bit_atomic ext2_set_bit_atomic 75#define nilfs_set_bit_atomic ext2_set_bit_atomic
76#define nilfs_clear_bit_atomic ext2_clear_bit_atomic 76#define nilfs_clear_bit_atomic ext2_clear_bit_atomic
77#define nilfs_find_next_zero_bit ext2_find_next_zero_bit 77#define nilfs_find_next_zero_bit find_next_zero_bit_le
78 78
79/* 79/*
80 * persistent object allocator cache 80 * persistent object allocator cache
diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c
index 3ee67c67cc52..4723f04e9b12 100644
--- a/fs/nilfs2/bmap.c
+++ b/fs/nilfs2/bmap.c
@@ -25,7 +25,6 @@
25#include <linux/errno.h> 25#include <linux/errno.h>
26#include "nilfs.h" 26#include "nilfs.h"
27#include "bmap.h" 27#include "bmap.h"
28#include "sb.h"
29#include "btree.h" 28#include "btree.h"
30#include "direct.h" 29#include "direct.h"
31#include "btnode.h" 30#include "btnode.h"
@@ -425,17 +424,6 @@ int nilfs_bmap_test_and_clear_dirty(struct nilfs_bmap *bmap)
425/* 424/*
426 * Internal use only 425 * Internal use only
427 */ 426 */
428
429void nilfs_bmap_add_blocks(const struct nilfs_bmap *bmap, int n)
430{
431 inode_add_bytes(bmap->b_inode, (1 << bmap->b_inode->i_blkbits) * n);
432}
433
434void nilfs_bmap_sub_blocks(const struct nilfs_bmap *bmap, int n)
435{
436 inode_sub_bytes(bmap->b_inode, (1 << bmap->b_inode->i_blkbits) * n);
437}
438
439__u64 nilfs_bmap_data_get_key(const struct nilfs_bmap *bmap, 427__u64 nilfs_bmap_data_get_key(const struct nilfs_bmap *bmap,
440 const struct buffer_head *bh) 428 const struct buffer_head *bh)
441{ 429{
diff --git a/fs/nilfs2/bmap.h b/fs/nilfs2/bmap.h
index bde1c0aa2e15..40d9f453d31c 100644
--- a/fs/nilfs2/bmap.h
+++ b/fs/nilfs2/bmap.h
@@ -240,9 +240,6 @@ __u64 nilfs_bmap_data_get_key(const struct nilfs_bmap *,
240__u64 nilfs_bmap_find_target_seq(const struct nilfs_bmap *, __u64); 240__u64 nilfs_bmap_find_target_seq(const struct nilfs_bmap *, __u64);
241__u64 nilfs_bmap_find_target_in_group(const struct nilfs_bmap *); 241__u64 nilfs_bmap_find_target_in_group(const struct nilfs_bmap *);
242 242
243void nilfs_bmap_add_blocks(const struct nilfs_bmap *, int);
244void nilfs_bmap_sub_blocks(const struct nilfs_bmap *, int);
245
246 243
247/* Assume that bmap semaphore is locked. */ 244/* Assume that bmap semaphore is locked. */
248static inline int nilfs_bmap_dirty(const struct nilfs_bmap *bmap) 245static inline int nilfs_bmap_dirty(const struct nilfs_bmap *bmap)
diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c
index 388e9e8f5286..609cd223eea8 100644
--- a/fs/nilfs2/btnode.c
+++ b/fs/nilfs2/btnode.c
@@ -34,20 +34,10 @@
34#include "page.h" 34#include "page.h"
35#include "btnode.h" 35#include "btnode.h"
36 36
37
38void nilfs_btnode_cache_init_once(struct address_space *btnc)
39{
40 nilfs_mapping_init_once(btnc);
41}
42
43static const struct address_space_operations def_btnode_aops = {
44 .sync_page = block_sync_page,
45};
46
47void nilfs_btnode_cache_init(struct address_space *btnc, 37void nilfs_btnode_cache_init(struct address_space *btnc,
48 struct backing_dev_info *bdi) 38 struct backing_dev_info *bdi)
49{ 39{
50 nilfs_mapping_init(btnc, bdi, &def_btnode_aops); 40 nilfs_mapping_init(btnc, bdi);
51} 41}
52 42
53void nilfs_btnode_cache_clear(struct address_space *btnc) 43void nilfs_btnode_cache_clear(struct address_space *btnc)
diff --git a/fs/nilfs2/btnode.h b/fs/nilfs2/btnode.h
index 79037494f1e0..1b8ebd888c28 100644
--- a/fs/nilfs2/btnode.h
+++ b/fs/nilfs2/btnode.h
@@ -37,7 +37,6 @@ struct nilfs_btnode_chkey_ctxt {
37 struct buffer_head *newbh; 37 struct buffer_head *newbh;
38}; 38};
39 39
40void nilfs_btnode_cache_init_once(struct address_space *);
41void nilfs_btnode_cache_init(struct address_space *, struct backing_dev_info *); 40void nilfs_btnode_cache_init(struct address_space *, struct backing_dev_info *);
42void nilfs_btnode_cache_clear(struct address_space *); 41void nilfs_btnode_cache_clear(struct address_space *);
43struct buffer_head *nilfs_btnode_create_block(struct address_space *btnc, 42struct buffer_head *nilfs_btnode_create_block(struct address_space *btnc,
diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c
index 300c2bc00c3f..d451ae0e0bf3 100644
--- a/fs/nilfs2/btree.c
+++ b/fs/nilfs2/btree.c
@@ -1174,7 +1174,7 @@ static int nilfs_btree_insert(struct nilfs_bmap *btree, __u64 key, __u64 ptr)
1174 if (ret < 0) 1174 if (ret < 0)
1175 goto out; 1175 goto out;
1176 nilfs_btree_commit_insert(btree, path, level, key, ptr); 1176 nilfs_btree_commit_insert(btree, path, level, key, ptr);
1177 nilfs_bmap_add_blocks(btree, stats.bs_nblocks); 1177 nilfs_inode_add_blocks(btree->b_inode, stats.bs_nblocks);
1178 1178
1179 out: 1179 out:
1180 nilfs_btree_free_path(path); 1180 nilfs_btree_free_path(path);
@@ -1511,7 +1511,7 @@ static int nilfs_btree_delete(struct nilfs_bmap *btree, __u64 key)
1511 if (ret < 0) 1511 if (ret < 0)
1512 goto out; 1512 goto out;
1513 nilfs_btree_commit_delete(btree, path, level, dat); 1513 nilfs_btree_commit_delete(btree, path, level, dat);
1514 nilfs_bmap_sub_blocks(btree, stats.bs_nblocks); 1514 nilfs_inode_sub_blocks(btree->b_inode, stats.bs_nblocks);
1515 1515
1516out: 1516out:
1517 nilfs_btree_free_path(path); 1517 nilfs_btree_free_path(path);
@@ -1776,7 +1776,7 @@ int nilfs_btree_convert_and_insert(struct nilfs_bmap *btree,
1776 return ret; 1776 return ret;
1777 nilfs_btree_commit_convert_and_insert(btree, key, ptr, keys, ptrs, n, 1777 nilfs_btree_commit_convert_and_insert(btree, key, ptr, keys, ptrs, n,
1778 di, ni, bh); 1778 di, ni, bh);
1779 nilfs_bmap_add_blocks(btree, stats.bs_nblocks); 1779 nilfs_inode_add_blocks(btree->b_inode, stats.bs_nblocks);
1780 return 0; 1780 return 0;
1781} 1781}
1782 1782
diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c
index 9d45773b79e6..3a1923943b14 100644
--- a/fs/nilfs2/dir.c
+++ b/fs/nilfs2/dir.c
@@ -440,7 +440,6 @@ void nilfs_set_link(struct inode *dir, struct nilfs_dir_entry *de,
440 nilfs_commit_chunk(page, mapping, from, to); 440 nilfs_commit_chunk(page, mapping, from, to);
441 nilfs_put_page(page); 441 nilfs_put_page(page);
442 dir->i_mtime = dir->i_ctime = CURRENT_TIME; 442 dir->i_mtime = dir->i_ctime = CURRENT_TIME;
443/* NILFS_I(dir)->i_flags &= ~NILFS_BTREE_FL; */
444} 443}
445 444
446/* 445/*
@@ -531,7 +530,6 @@ got_it:
531 nilfs_set_de_type(de, inode); 530 nilfs_set_de_type(de, inode);
532 nilfs_commit_chunk(page, page->mapping, from, to); 531 nilfs_commit_chunk(page, page->mapping, from, to);
533 dir->i_mtime = dir->i_ctime = CURRENT_TIME; 532 dir->i_mtime = dir->i_ctime = CURRENT_TIME;
534/* NILFS_I(dir)->i_flags &= ~NILFS_BTREE_FL; */
535 nilfs_mark_inode_dirty(dir); 533 nilfs_mark_inode_dirty(dir);
536 /* OFFSET_CACHE */ 534 /* OFFSET_CACHE */
537out_put: 535out_put:
@@ -579,7 +577,6 @@ int nilfs_delete_entry(struct nilfs_dir_entry *dir, struct page *page)
579 dir->inode = 0; 577 dir->inode = 0;
580 nilfs_commit_chunk(page, mapping, from, to); 578 nilfs_commit_chunk(page, mapping, from, to);
581 inode->i_ctime = inode->i_mtime = CURRENT_TIME; 579 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
582/* NILFS_I(inode)->i_flags &= ~NILFS_BTREE_FL; */
583out: 580out:
584 nilfs_put_page(page); 581 nilfs_put_page(page);
585 return err; 582 return err;
@@ -684,7 +681,7 @@ const struct file_operations nilfs_dir_operations = {
684 .readdir = nilfs_readdir, 681 .readdir = nilfs_readdir,
685 .unlocked_ioctl = nilfs_ioctl, 682 .unlocked_ioctl = nilfs_ioctl,
686#ifdef CONFIG_COMPAT 683#ifdef CONFIG_COMPAT
687 .compat_ioctl = nilfs_ioctl, 684 .compat_ioctl = nilfs_compat_ioctl,
688#endif /* CONFIG_COMPAT */ 685#endif /* CONFIG_COMPAT */
689 .fsync = nilfs_sync_file, 686 .fsync = nilfs_sync_file,
690 687
diff --git a/fs/nilfs2/direct.c b/fs/nilfs2/direct.c
index 324d80c57518..82f4865e86dd 100644
--- a/fs/nilfs2/direct.c
+++ b/fs/nilfs2/direct.c
@@ -146,7 +146,7 @@ static int nilfs_direct_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr)
146 if (NILFS_BMAP_USE_VBN(bmap)) 146 if (NILFS_BMAP_USE_VBN(bmap))
147 nilfs_bmap_set_target_v(bmap, key, req.bpr_ptr); 147 nilfs_bmap_set_target_v(bmap, key, req.bpr_ptr);
148 148
149 nilfs_bmap_add_blocks(bmap, 1); 149 nilfs_inode_add_blocks(bmap->b_inode, 1);
150 } 150 }
151 return ret; 151 return ret;
152} 152}
@@ -168,7 +168,7 @@ static int nilfs_direct_delete(struct nilfs_bmap *bmap, __u64 key)
168 if (!ret) { 168 if (!ret) {
169 nilfs_bmap_commit_end_ptr(bmap, &req, dat); 169 nilfs_bmap_commit_end_ptr(bmap, &req, dat);
170 nilfs_direct_set_ptr(bmap, key, NILFS_BMAP_INVALID_PTR); 170 nilfs_direct_set_ptr(bmap, key, NILFS_BMAP_INVALID_PTR);
171 nilfs_bmap_sub_blocks(bmap, 1); 171 nilfs_inode_sub_blocks(bmap->b_inode, 1);
172 } 172 }
173 return ret; 173 return ret;
174} 174}
diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c
index 2f560c9fb808..93589fccdd97 100644
--- a/fs/nilfs2/file.c
+++ b/fs/nilfs2/file.c
@@ -59,7 +59,7 @@ static int nilfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
59 struct nilfs_transaction_info ti; 59 struct nilfs_transaction_info ti;
60 int ret; 60 int ret;
61 61
62 if (unlikely(nilfs_near_disk_full(NILFS_SB(inode->i_sb)->s_nilfs))) 62 if (unlikely(nilfs_near_disk_full(inode->i_sb->s_fs_info)))
63 return VM_FAULT_SIGBUS; /* -ENOSPC */ 63 return VM_FAULT_SIGBUS; /* -ENOSPC */
64 64
65 lock_page(page); 65 lock_page(page);
@@ -142,7 +142,7 @@ const struct file_operations nilfs_file_operations = {
142 .aio_write = generic_file_aio_write, 142 .aio_write = generic_file_aio_write,
143 .unlocked_ioctl = nilfs_ioctl, 143 .unlocked_ioctl = nilfs_ioctl,
144#ifdef CONFIG_COMPAT 144#ifdef CONFIG_COMPAT
145 .compat_ioctl = nilfs_ioctl, 145 .compat_ioctl = nilfs_compat_ioctl,
146#endif /* CONFIG_COMPAT */ 146#endif /* CONFIG_COMPAT */
147 .mmap = nilfs_file_mmap, 147 .mmap = nilfs_file_mmap,
148 .open = generic_file_open, 148 .open = generic_file_open,
diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c
index caf9a6a3fb54..1c2a3e23f8b2 100644
--- a/fs/nilfs2/gcinode.c
+++ b/fs/nilfs2/gcinode.c
@@ -49,7 +49,6 @@
49#include "ifile.h" 49#include "ifile.h"
50 50
51static const struct address_space_operations def_gcinode_aops = { 51static const struct address_space_operations def_gcinode_aops = {
52 .sync_page = block_sync_page,
53}; 52};
54 53
55/* 54/*
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 2fd440d8d6b8..c0aa27490c02 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -41,6 +41,24 @@ struct nilfs_iget_args {
41 int for_gc; 41 int for_gc;
42}; 42};
43 43
44void nilfs_inode_add_blocks(struct inode *inode, int n)
45{
46 struct nilfs_root *root = NILFS_I(inode)->i_root;
47
48 inode_add_bytes(inode, (1 << inode->i_blkbits) * n);
49 if (root)
50 atomic_add(n, &root->blocks_count);
51}
52
53void nilfs_inode_sub_blocks(struct inode *inode, int n)
54{
55 struct nilfs_root *root = NILFS_I(inode)->i_root;
56
57 inode_sub_bytes(inode, (1 << inode->i_blkbits) * n);
58 if (root)
59 atomic_sub(n, &root->blocks_count);
60}
61
44/** 62/**
45 * nilfs_get_block() - get a file block on the filesystem (callback function) 63 * nilfs_get_block() - get a file block on the filesystem (callback function)
46 * @inode - inode struct of the target file 64 * @inode - inode struct of the target file
@@ -262,7 +280,6 @@ nilfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
262const struct address_space_operations nilfs_aops = { 280const struct address_space_operations nilfs_aops = {
263 .writepage = nilfs_writepage, 281 .writepage = nilfs_writepage,
264 .readpage = nilfs_readpage, 282 .readpage = nilfs_readpage,
265 .sync_page = block_sync_page,
266 .writepages = nilfs_writepages, 283 .writepages = nilfs_writepages,
267 .set_page_dirty = nilfs_set_page_dirty, 284 .set_page_dirty = nilfs_set_page_dirty,
268 .readpages = nilfs_readpages, 285 .readpages = nilfs_readpages,
@@ -277,7 +294,7 @@ const struct address_space_operations nilfs_aops = {
277struct inode *nilfs_new_inode(struct inode *dir, int mode) 294struct inode *nilfs_new_inode(struct inode *dir, int mode)
278{ 295{
279 struct super_block *sb = dir->i_sb; 296 struct super_block *sb = dir->i_sb;
280 struct nilfs_sb_info *sbi = NILFS_SB(sb); 297 struct the_nilfs *nilfs = sb->s_fs_info;
281 struct inode *inode; 298 struct inode *inode;
282 struct nilfs_inode_info *ii; 299 struct nilfs_inode_info *ii;
283 struct nilfs_root *root; 300 struct nilfs_root *root;
@@ -315,19 +332,16 @@ struct inode *nilfs_new_inode(struct inode *dir, int mode)
315 /* No lock is needed; iget() ensures it. */ 332 /* No lock is needed; iget() ensures it. */
316 } 333 }
317 334
318 ii->i_flags = NILFS_I(dir)->i_flags; 335 ii->i_flags = nilfs_mask_flags(
319 if (S_ISLNK(mode)) 336 mode, NILFS_I(dir)->i_flags & NILFS_FL_INHERITED);
320 ii->i_flags &= ~(NILFS_IMMUTABLE_FL | NILFS_APPEND_FL);
321 if (!S_ISDIR(mode))
322 ii->i_flags &= ~NILFS_DIRSYNC_FL;
323 337
324 /* ii->i_file_acl = 0; */ 338 /* ii->i_file_acl = 0; */
325 /* ii->i_dir_acl = 0; */ 339 /* ii->i_dir_acl = 0; */
326 ii->i_dir_start_lookup = 0; 340 ii->i_dir_start_lookup = 0;
327 nilfs_set_inode_flags(inode); 341 nilfs_set_inode_flags(inode);
328 spin_lock(&sbi->s_next_gen_lock); 342 spin_lock(&nilfs->ns_next_gen_lock);
329 inode->i_generation = sbi->s_next_generation++; 343 inode->i_generation = nilfs->ns_next_generation++;
330 spin_unlock(&sbi->s_next_gen_lock); 344 spin_unlock(&nilfs->ns_next_gen_lock);
331 insert_inode_hash(inode); 345 insert_inode_hash(inode);
332 346
333 err = nilfs_init_acl(inode, dir); 347 err = nilfs_init_acl(inode, dir);
@@ -359,17 +373,15 @@ void nilfs_set_inode_flags(struct inode *inode)
359 373
360 inode->i_flags &= ~(S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME | 374 inode->i_flags &= ~(S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME |
361 S_DIRSYNC); 375 S_DIRSYNC);
362 if (flags & NILFS_SYNC_FL) 376 if (flags & FS_SYNC_FL)
363 inode->i_flags |= S_SYNC; 377 inode->i_flags |= S_SYNC;
364 if (flags & NILFS_APPEND_FL) 378 if (flags & FS_APPEND_FL)
365 inode->i_flags |= S_APPEND; 379 inode->i_flags |= S_APPEND;
366 if (flags & NILFS_IMMUTABLE_FL) 380 if (flags & FS_IMMUTABLE_FL)
367 inode->i_flags |= S_IMMUTABLE; 381 inode->i_flags |= S_IMMUTABLE;
368#ifndef NILFS_ATIME_DISABLE 382 if (flags & FS_NOATIME_FL)
369 if (flags & NILFS_NOATIME_FL)
370#endif
371 inode->i_flags |= S_NOATIME; 383 inode->i_flags |= S_NOATIME;
372 if (flags & NILFS_DIRSYNC_FL) 384 if (flags & FS_DIRSYNC_FL)
373 inode->i_flags |= S_DIRSYNC; 385 inode->i_flags |= S_DIRSYNC;
374 mapping_set_gfp_mask(inode->i_mapping, 386 mapping_set_gfp_mask(inode->i_mapping,
375 mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS); 387 mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS);
@@ -420,7 +432,7 @@ static int __nilfs_read_inode(struct super_block *sb,
420 struct nilfs_root *root, unsigned long ino, 432 struct nilfs_root *root, unsigned long ino,
421 struct inode *inode) 433 struct inode *inode)
422{ 434{
423 struct the_nilfs *nilfs = NILFS_SB(sb)->s_nilfs; 435 struct the_nilfs *nilfs = sb->s_fs_info;
424 struct buffer_head *bh; 436 struct buffer_head *bh;
425 struct nilfs_inode *raw_inode; 437 struct nilfs_inode *raw_inode;
426 int err; 438 int err;
@@ -707,6 +719,7 @@ void nilfs_evict_inode(struct inode *inode)
707 struct nilfs_transaction_info ti; 719 struct nilfs_transaction_info ti;
708 struct super_block *sb = inode->i_sb; 720 struct super_block *sb = inode->i_sb;
709 struct nilfs_inode_info *ii = NILFS_I(inode); 721 struct nilfs_inode_info *ii = NILFS_I(inode);
722 int ret;
710 723
711 if (inode->i_nlink || !ii->i_root || unlikely(is_bad_inode(inode))) { 724 if (inode->i_nlink || !ii->i_root || unlikely(is_bad_inode(inode))) {
712 if (inode->i_data.nrpages) 725 if (inode->i_data.nrpages)
@@ -725,8 +738,9 @@ void nilfs_evict_inode(struct inode *inode)
725 nilfs_mark_inode_dirty(inode); 738 nilfs_mark_inode_dirty(inode);
726 end_writeback(inode); 739 end_writeback(inode);
727 740
728 nilfs_ifile_delete_inode(ii->i_root->ifile, inode->i_ino); 741 ret = nilfs_ifile_delete_inode(ii->i_root->ifile, inode->i_ino);
729 atomic_dec(&ii->i_root->inodes_count); 742 if (!ret)
743 atomic_dec(&ii->i_root->inodes_count);
730 744
731 nilfs_clear_inode(inode); 745 nilfs_clear_inode(inode);
732 746
@@ -792,18 +806,18 @@ int nilfs_permission(struct inode *inode, int mask, unsigned int flags)
792 806
793int nilfs_load_inode_block(struct inode *inode, struct buffer_head **pbh) 807int nilfs_load_inode_block(struct inode *inode, struct buffer_head **pbh)
794{ 808{
795 struct nilfs_sb_info *sbi = NILFS_SB(inode->i_sb); 809 struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
796 struct nilfs_inode_info *ii = NILFS_I(inode); 810 struct nilfs_inode_info *ii = NILFS_I(inode);
797 int err; 811 int err;
798 812
799 spin_lock(&sbi->s_inode_lock); 813 spin_lock(&nilfs->ns_inode_lock);
800 if (ii->i_bh == NULL) { 814 if (ii->i_bh == NULL) {
801 spin_unlock(&sbi->s_inode_lock); 815 spin_unlock(&nilfs->ns_inode_lock);
802 err = nilfs_ifile_get_inode_block(ii->i_root->ifile, 816 err = nilfs_ifile_get_inode_block(ii->i_root->ifile,
803 inode->i_ino, pbh); 817 inode->i_ino, pbh);
804 if (unlikely(err)) 818 if (unlikely(err))
805 return err; 819 return err;
806 spin_lock(&sbi->s_inode_lock); 820 spin_lock(&nilfs->ns_inode_lock);
807 if (ii->i_bh == NULL) 821 if (ii->i_bh == NULL)
808 ii->i_bh = *pbh; 822 ii->i_bh = *pbh;
809 else { 823 else {
@@ -814,36 +828,36 @@ int nilfs_load_inode_block(struct inode *inode, struct buffer_head **pbh)
814 *pbh = ii->i_bh; 828 *pbh = ii->i_bh;
815 829
816 get_bh(*pbh); 830 get_bh(*pbh);
817 spin_unlock(&sbi->s_inode_lock); 831 spin_unlock(&nilfs->ns_inode_lock);
818 return 0; 832 return 0;
819} 833}
820 834
821int nilfs_inode_dirty(struct inode *inode) 835int nilfs_inode_dirty(struct inode *inode)
822{ 836{
823 struct nilfs_inode_info *ii = NILFS_I(inode); 837 struct nilfs_inode_info *ii = NILFS_I(inode);
824 struct nilfs_sb_info *sbi = NILFS_SB(inode->i_sb); 838 struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
825 int ret = 0; 839 int ret = 0;
826 840
827 if (!list_empty(&ii->i_dirty)) { 841 if (!list_empty(&ii->i_dirty)) {
828 spin_lock(&sbi->s_inode_lock); 842 spin_lock(&nilfs->ns_inode_lock);
829 ret = test_bit(NILFS_I_DIRTY, &ii->i_state) || 843 ret = test_bit(NILFS_I_DIRTY, &ii->i_state) ||
830 test_bit(NILFS_I_BUSY, &ii->i_state); 844 test_bit(NILFS_I_BUSY, &ii->i_state);
831 spin_unlock(&sbi->s_inode_lock); 845 spin_unlock(&nilfs->ns_inode_lock);
832 } 846 }
833 return ret; 847 return ret;
834} 848}
835 849
836int nilfs_set_file_dirty(struct inode *inode, unsigned nr_dirty) 850int nilfs_set_file_dirty(struct inode *inode, unsigned nr_dirty)
837{ 851{
838 struct nilfs_sb_info *sbi = NILFS_SB(inode->i_sb);
839 struct nilfs_inode_info *ii = NILFS_I(inode); 852 struct nilfs_inode_info *ii = NILFS_I(inode);
853 struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
840 854
841 atomic_add(nr_dirty, &sbi->s_nilfs->ns_ndirtyblks); 855 atomic_add(nr_dirty, &nilfs->ns_ndirtyblks);
842 856
843 if (test_and_set_bit(NILFS_I_DIRTY, &ii->i_state)) 857 if (test_and_set_bit(NILFS_I_DIRTY, &ii->i_state))
844 return 0; 858 return 0;
845 859
846 spin_lock(&sbi->s_inode_lock); 860 spin_lock(&nilfs->ns_inode_lock);
847 if (!test_bit(NILFS_I_QUEUED, &ii->i_state) && 861 if (!test_bit(NILFS_I_QUEUED, &ii->i_state) &&
848 !test_bit(NILFS_I_BUSY, &ii->i_state)) { 862 !test_bit(NILFS_I_BUSY, &ii->i_state)) {
849 /* Because this routine may race with nilfs_dispose_list(), 863 /* Because this routine may race with nilfs_dispose_list(),
@@ -851,18 +865,18 @@ int nilfs_set_file_dirty(struct inode *inode, unsigned nr_dirty)
851 if (list_empty(&ii->i_dirty) && igrab(inode) == NULL) { 865 if (list_empty(&ii->i_dirty) && igrab(inode) == NULL) {
852 /* This will happen when somebody is freeing 866 /* This will happen when somebody is freeing
853 this inode. */ 867 this inode. */
854 nilfs_warning(sbi->s_super, __func__, 868 nilfs_warning(inode->i_sb, __func__,
855 "cannot get inode (ino=%lu)\n", 869 "cannot get inode (ino=%lu)\n",
856 inode->i_ino); 870 inode->i_ino);
857 spin_unlock(&sbi->s_inode_lock); 871 spin_unlock(&nilfs->ns_inode_lock);
858 return -EINVAL; /* NILFS_I_DIRTY may remain for 872 return -EINVAL; /* NILFS_I_DIRTY may remain for
859 freeing inode */ 873 freeing inode */
860 } 874 }
861 list_del(&ii->i_dirty); 875 list_del(&ii->i_dirty);
862 list_add_tail(&ii->i_dirty, &sbi->s_dirty_files); 876 list_add_tail(&ii->i_dirty, &nilfs->ns_dirty_files);
863 set_bit(NILFS_I_QUEUED, &ii->i_state); 877 set_bit(NILFS_I_QUEUED, &ii->i_state);
864 } 878 }
865 spin_unlock(&sbi->s_inode_lock); 879 spin_unlock(&nilfs->ns_inode_lock);
866 return 0; 880 return 0;
867} 881}
868 882
diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index 496738963fdb..f2469ba6246b 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -26,7 +26,9 @@
26#include <linux/capability.h> /* capable() */ 26#include <linux/capability.h> /* capable() */
27#include <linux/uaccess.h> /* copy_from_user(), copy_to_user() */ 27#include <linux/uaccess.h> /* copy_from_user(), copy_to_user() */
28#include <linux/vmalloc.h> 28#include <linux/vmalloc.h>
29#include <linux/compat.h> /* compat_ptr() */
29#include <linux/mount.h> /* mnt_want_write(), mnt_drop_write() */ 30#include <linux/mount.h> /* mnt_want_write(), mnt_drop_write() */
31#include <linux/buffer_head.h>
30#include <linux/nilfs2_fs.h> 32#include <linux/nilfs2_fs.h>
31#include "nilfs.h" 33#include "nilfs.h"
32#include "segment.h" 34#include "segment.h"
@@ -97,11 +99,74 @@ static int nilfs_ioctl_wrap_copy(struct the_nilfs *nilfs,
97 return ret; 99 return ret;
98} 100}
99 101
102static int nilfs_ioctl_getflags(struct inode *inode, void __user *argp)
103{
104 unsigned int flags = NILFS_I(inode)->i_flags & FS_FL_USER_VISIBLE;
105
106 return put_user(flags, (int __user *)argp);
107}
108
109static int nilfs_ioctl_setflags(struct inode *inode, struct file *filp,
110 void __user *argp)
111{
112 struct nilfs_transaction_info ti;
113 unsigned int flags, oldflags;
114 int ret;
115
116 if (!inode_owner_or_capable(inode))
117 return -EACCES;
118
119 if (get_user(flags, (int __user *)argp))
120 return -EFAULT;
121
122 ret = mnt_want_write(filp->f_path.mnt);
123 if (ret)
124 return ret;
125
126 flags = nilfs_mask_flags(inode->i_mode, flags);
127
128 mutex_lock(&inode->i_mutex);
129
130 oldflags = NILFS_I(inode)->i_flags;
131
132 /*
133 * The IMMUTABLE and APPEND_ONLY flags can only be changed by the
134 * relevant capability.
135 */
136 ret = -EPERM;
137 if (((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) &&
138 !capable(CAP_LINUX_IMMUTABLE))
139 goto out;
140
141 ret = nilfs_transaction_begin(inode->i_sb, &ti, 0);
142 if (ret)
143 goto out;
144
145 NILFS_I(inode)->i_flags = (oldflags & ~FS_FL_USER_MODIFIABLE) |
146 (flags & FS_FL_USER_MODIFIABLE);
147
148 nilfs_set_inode_flags(inode);
149 inode->i_ctime = CURRENT_TIME;
150 if (IS_SYNC(inode))
151 nilfs_set_transaction_flag(NILFS_TI_SYNC);
152
153 nilfs_mark_inode_dirty(inode);
154 ret = nilfs_transaction_commit(inode->i_sb);
155out:
156 mutex_unlock(&inode->i_mutex);
157 mnt_drop_write(filp->f_path.mnt);
158 return ret;
159}
160
161static int nilfs_ioctl_getversion(struct inode *inode, void __user *argp)
162{
163 return put_user(inode->i_generation, (int __user *)argp);
164}
165
100static int nilfs_ioctl_change_cpmode(struct inode *inode, struct file *filp, 166static int nilfs_ioctl_change_cpmode(struct inode *inode, struct file *filp,
101 unsigned int cmd, void __user *argp) 167 unsigned int cmd, void __user *argp)
102{ 168{
103 struct the_nilfs *nilfs = NILFS_SB(inode->i_sb)->s_nilfs; 169 struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
104 struct inode *cpfile = nilfs->ns_cpfile;
105 struct nilfs_transaction_info ti; 170 struct nilfs_transaction_info ti;
106 struct nilfs_cpmode cpmode; 171 struct nilfs_cpmode cpmode;
107 int ret; 172 int ret;
@@ -121,7 +186,7 @@ static int nilfs_ioctl_change_cpmode(struct inode *inode, struct file *filp,
121 186
122 nilfs_transaction_begin(inode->i_sb, &ti, 0); 187 nilfs_transaction_begin(inode->i_sb, &ti, 0);
123 ret = nilfs_cpfile_change_cpmode( 188 ret = nilfs_cpfile_change_cpmode(
124 cpfile, cpmode.cm_cno, cpmode.cm_mode); 189 nilfs->ns_cpfile, cpmode.cm_cno, cpmode.cm_mode);
125 if (unlikely(ret < 0)) 190 if (unlikely(ret < 0))
126 nilfs_transaction_abort(inode->i_sb); 191 nilfs_transaction_abort(inode->i_sb);
127 else 192 else
@@ -137,7 +202,7 @@ static int
137nilfs_ioctl_delete_checkpoint(struct inode *inode, struct file *filp, 202nilfs_ioctl_delete_checkpoint(struct inode *inode, struct file *filp,
138 unsigned int cmd, void __user *argp) 203 unsigned int cmd, void __user *argp)
139{ 204{
140 struct inode *cpfile = NILFS_SB(inode->i_sb)->s_nilfs->ns_cpfile; 205 struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
141 struct nilfs_transaction_info ti; 206 struct nilfs_transaction_info ti;
142 __u64 cno; 207 __u64 cno;
143 int ret; 208 int ret;
@@ -154,7 +219,7 @@ nilfs_ioctl_delete_checkpoint(struct inode *inode, struct file *filp,
154 goto out; 219 goto out;
155 220
156 nilfs_transaction_begin(inode->i_sb, &ti, 0); 221 nilfs_transaction_begin(inode->i_sb, &ti, 0);
157 ret = nilfs_cpfile_delete_checkpoint(cpfile, cno); 222 ret = nilfs_cpfile_delete_checkpoint(nilfs->ns_cpfile, cno);
158 if (unlikely(ret < 0)) 223 if (unlikely(ret < 0))
159 nilfs_transaction_abort(inode->i_sb); 224 nilfs_transaction_abort(inode->i_sb);
160 else 225 else
@@ -180,7 +245,7 @@ nilfs_ioctl_do_get_cpinfo(struct the_nilfs *nilfs, __u64 *posp, int flags,
180static int nilfs_ioctl_get_cpstat(struct inode *inode, struct file *filp, 245static int nilfs_ioctl_get_cpstat(struct inode *inode, struct file *filp,
181 unsigned int cmd, void __user *argp) 246 unsigned int cmd, void __user *argp)
182{ 247{
183 struct the_nilfs *nilfs = NILFS_SB(inode->i_sb)->s_nilfs; 248 struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
184 struct nilfs_cpstat cpstat; 249 struct nilfs_cpstat cpstat;
185 int ret; 250 int ret;
186 251
@@ -211,7 +276,7 @@ nilfs_ioctl_do_get_suinfo(struct the_nilfs *nilfs, __u64 *posp, int flags,
211static int nilfs_ioctl_get_sustat(struct inode *inode, struct file *filp, 276static int nilfs_ioctl_get_sustat(struct inode *inode, struct file *filp,
212 unsigned int cmd, void __user *argp) 277 unsigned int cmd, void __user *argp)
213{ 278{
214 struct the_nilfs *nilfs = NILFS_SB(inode->i_sb)->s_nilfs; 279 struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
215 struct nilfs_sustat sustat; 280 struct nilfs_sustat sustat;
216 int ret; 281 int ret;
217 282
@@ -267,7 +332,7 @@ nilfs_ioctl_do_get_bdescs(struct the_nilfs *nilfs, __u64 *posp, int flags,
267static int nilfs_ioctl_get_bdescs(struct inode *inode, struct file *filp, 332static int nilfs_ioctl_get_bdescs(struct inode *inode, struct file *filp,
268 unsigned int cmd, void __user *argp) 333 unsigned int cmd, void __user *argp)
269{ 334{
270 struct the_nilfs *nilfs = NILFS_SB(inode->i_sb)->s_nilfs; 335 struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
271 struct nilfs_argv argv; 336 struct nilfs_argv argv;
272 int ret; 337 int ret;
273 338
@@ -336,7 +401,7 @@ static int nilfs_ioctl_move_blocks(struct super_block *sb,
336 struct nilfs_argv *argv, void *buf) 401 struct nilfs_argv *argv, void *buf)
337{ 402{
338 size_t nmembs = argv->v_nmembs; 403 size_t nmembs = argv->v_nmembs;
339 struct the_nilfs *nilfs = NILFS_SB(sb)->s_nilfs; 404 struct the_nilfs *nilfs = sb->s_fs_info;
340 struct inode *inode; 405 struct inode *inode;
341 struct nilfs_vdesc *vdesc; 406 struct nilfs_vdesc *vdesc;
342 struct buffer_head *bh, *n; 407 struct buffer_head *bh, *n;
@@ -550,7 +615,7 @@ static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp,
550 ret = PTR_ERR(kbufs[4]); 615 ret = PTR_ERR(kbufs[4]);
551 goto out; 616 goto out;
552 } 617 }
553 nilfs = NILFS_SB(inode->i_sb)->s_nilfs; 618 nilfs = inode->i_sb->s_fs_info;
554 619
555 for (n = 0; n < 4; n++) { 620 for (n = 0; n < 4; n++) {
556 ret = -EINVAL; 621 ret = -EINVAL;
@@ -623,7 +688,7 @@ static int nilfs_ioctl_sync(struct inode *inode, struct file *filp,
623 return ret; 688 return ret;
624 689
625 if (argp != NULL) { 690 if (argp != NULL) {
626 nilfs = NILFS_SB(inode->i_sb)->s_nilfs; 691 nilfs = inode->i_sb->s_fs_info;
627 down_read(&nilfs->ns_segctor_sem); 692 down_read(&nilfs->ns_segctor_sem);
628 cno = nilfs->ns_cno - 1; 693 cno = nilfs->ns_cno - 1;
629 up_read(&nilfs->ns_segctor_sem); 694 up_read(&nilfs->ns_segctor_sem);
@@ -641,7 +706,7 @@ static int nilfs_ioctl_get_info(struct inode *inode, struct file *filp,
641 void *, size_t, size_t)) 706 void *, size_t, size_t))
642 707
643{ 708{
644 struct the_nilfs *nilfs = NILFS_SB(inode->i_sb)->s_nilfs; 709 struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
645 struct nilfs_argv argv; 710 struct nilfs_argv argv;
646 int ret; 711 int ret;
647 712
@@ -666,6 +731,12 @@ long nilfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
666 void __user *argp = (void __user *)arg; 731 void __user *argp = (void __user *)arg;
667 732
668 switch (cmd) { 733 switch (cmd) {
734 case FS_IOC_GETFLAGS:
735 return nilfs_ioctl_getflags(inode, argp);
736 case FS_IOC_SETFLAGS:
737 return nilfs_ioctl_setflags(inode, filp, argp);
738 case FS_IOC_GETVERSION:
739 return nilfs_ioctl_getversion(inode, argp);
669 case NILFS_IOCTL_CHANGE_CPMODE: 740 case NILFS_IOCTL_CHANGE_CPMODE:
670 return nilfs_ioctl_change_cpmode(inode, filp, cmd, argp); 741 return nilfs_ioctl_change_cpmode(inode, filp, cmd, argp);
671 case NILFS_IOCTL_DELETE_CHECKPOINT: 742 case NILFS_IOCTL_DELETE_CHECKPOINT:
@@ -696,3 +767,23 @@ long nilfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
696 return -ENOTTY; 767 return -ENOTTY;
697 } 768 }
698} 769}
770
771#ifdef CONFIG_COMPAT
772long nilfs_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
773{
774 switch (cmd) {
775 case FS_IOC32_GETFLAGS:
776 cmd = FS_IOC_GETFLAGS;
777 break;
778 case FS_IOC32_SETFLAGS:
779 cmd = FS_IOC_SETFLAGS;
780 break;
781 case FS_IOC32_GETVERSION:
782 cmd = FS_IOC_GETVERSION;
783 break;
784 default:
785 return -ENOIOCTLCMD;
786 }
787 return nilfs_ioctl(filp, cmd, (unsigned long)compat_ptr(arg));
788}
789#endif
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index 6a0e2a189f60..a649b05f7069 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -399,7 +399,6 @@ nilfs_mdt_write_page(struct page *page, struct writeback_control *wbc)
399 399
400static const struct address_space_operations def_mdt_aops = { 400static const struct address_space_operations def_mdt_aops = {
401 .writepage = nilfs_mdt_write_page, 401 .writepage = nilfs_mdt_write_page,
402 .sync_page = block_sync_page,
403}; 402};
404 403
405static const struct inode_operations def_mdt_iops; 404static const struct inode_operations def_mdt_iops;
@@ -438,10 +437,6 @@ void nilfs_mdt_set_entry_size(struct inode *inode, unsigned entry_size,
438 mi->mi_first_entry_offset = DIV_ROUND_UP(header_size, entry_size); 437 mi->mi_first_entry_offset = DIV_ROUND_UP(header_size, entry_size);
439} 438}
440 439
441static const struct address_space_operations shadow_map_aops = {
442 .sync_page = block_sync_page,
443};
444
445/** 440/**
446 * nilfs_mdt_setup_shadow_map - setup shadow map and bind it to metadata file 441 * nilfs_mdt_setup_shadow_map - setup shadow map and bind it to metadata file
447 * @inode: inode of the metadata file 442 * @inode: inode of the metadata file
@@ -454,10 +449,10 @@ int nilfs_mdt_setup_shadow_map(struct inode *inode,
454 struct backing_dev_info *bdi = inode->i_sb->s_bdi; 449 struct backing_dev_info *bdi = inode->i_sb->s_bdi;
455 450
456 INIT_LIST_HEAD(&shadow->frozen_buffers); 451 INIT_LIST_HEAD(&shadow->frozen_buffers);
457 nilfs_mapping_init_once(&shadow->frozen_data); 452 address_space_init_once(&shadow->frozen_data);
458 nilfs_mapping_init(&shadow->frozen_data, bdi, &shadow_map_aops); 453 nilfs_mapping_init(&shadow->frozen_data, bdi);
459 nilfs_mapping_init_once(&shadow->frozen_btnodes); 454 address_space_init_once(&shadow->frozen_btnodes);
460 nilfs_mapping_init(&shadow->frozen_btnodes, bdi, &shadow_map_aops); 455 nilfs_mapping_init(&shadow->frozen_btnodes, bdi);
461 mi->mi_shadow = shadow; 456 mi->mi_shadow = shadow;
462 return 0; 457 return 0;
463} 458}
diff --git a/fs/nilfs2/mdt.h b/fs/nilfs2/mdt.h
index b13734bf3521..ed68563ec708 100644
--- a/fs/nilfs2/mdt.h
+++ b/fs/nilfs2/mdt.h
@@ -66,7 +66,7 @@ static inline struct nilfs_mdt_info *NILFS_MDT(const struct inode *inode)
66 66
67static inline struct the_nilfs *NILFS_I_NILFS(struct inode *inode) 67static inline struct the_nilfs *NILFS_I_NILFS(struct inode *inode)
68{ 68{
69 return NILFS_SB(inode->i_sb)->s_nilfs; 69 return inode->i_sb->s_fs_info;
70} 70}
71 71
72/* Default GFP flags using highmem */ 72/* Default GFP flags using highmem */
diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c
index 98034271cd02..546849b3e88f 100644
--- a/fs/nilfs2/namei.c
+++ b/fs/nilfs2/namei.c
@@ -397,7 +397,6 @@ static int nilfs_rename(struct inode *old_dir, struct dentry *old_dentry,
397 new_de = nilfs_find_entry(new_dir, &new_dentry->d_name, &new_page); 397 new_de = nilfs_find_entry(new_dir, &new_dentry->d_name, &new_page);
398 if (!new_de) 398 if (!new_de)
399 goto out_dir; 399 goto out_dir;
400 inc_nlink(old_inode);
401 nilfs_set_link(new_dir, new_de, new_page, old_inode); 400 nilfs_set_link(new_dir, new_de, new_page, old_inode);
402 nilfs_mark_inode_dirty(new_dir); 401 nilfs_mark_inode_dirty(new_dir);
403 new_inode->i_ctime = CURRENT_TIME; 402 new_inode->i_ctime = CURRENT_TIME;
@@ -411,13 +410,9 @@ static int nilfs_rename(struct inode *old_dir, struct dentry *old_dentry,
411 if (new_dir->i_nlink >= NILFS_LINK_MAX) 410 if (new_dir->i_nlink >= NILFS_LINK_MAX)
412 goto out_dir; 411 goto out_dir;
413 } 412 }
414 inc_nlink(old_inode);
415 err = nilfs_add_link(new_dentry, old_inode); 413 err = nilfs_add_link(new_dentry, old_inode);
416 if (err) { 414 if (err)
417 drop_nlink(old_inode);
418 nilfs_mark_inode_dirty(old_inode);
419 goto out_dir; 415 goto out_dir;
420 }
421 if (dir_de) { 416 if (dir_de) {
422 inc_nlink(new_dir); 417 inc_nlink(new_dir);
423 nilfs_mark_inode_dirty(new_dir); 418 nilfs_mark_inode_dirty(new_dir);
@@ -431,7 +426,6 @@ static int nilfs_rename(struct inode *old_dir, struct dentry *old_dentry,
431 old_inode->i_ctime = CURRENT_TIME; 426 old_inode->i_ctime = CURRENT_TIME;
432 427
433 nilfs_delete_entry(old_de, old_page); 428 nilfs_delete_entry(old_de, old_page);
434 drop_nlink(old_inode);
435 429
436 if (dir_de) { 430 if (dir_de) {
437 nilfs_set_link(old_inode, dir_de, dir_page, new_dir); 431 nilfs_set_link(old_inode, dir_de, dir_page, new_dir);
@@ -488,7 +482,7 @@ static struct dentry *nilfs_get_dentry(struct super_block *sb, u64 cno,
488 if (ino < NILFS_FIRST_INO(sb) && ino != NILFS_ROOT_INO) 482 if (ino < NILFS_FIRST_INO(sb) && ino != NILFS_ROOT_INO)
489 return ERR_PTR(-ESTALE); 483 return ERR_PTR(-ESTALE);
490 484
491 root = nilfs_lookup_root(NILFS_SB(sb)->s_nilfs, cno); 485 root = nilfs_lookup_root(sb->s_fs_info, cno);
492 if (!root) 486 if (!root)
493 return ERR_PTR(-ESTALE); 487 return ERR_PTR(-ESTALE);
494 488
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index 777e8fd04304..856e8e4e0b74 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -30,7 +30,6 @@
30#include <linux/blkdev.h> 30#include <linux/blkdev.h>
31#include <linux/nilfs2_fs.h> 31#include <linux/nilfs2_fs.h>
32#include "the_nilfs.h" 32#include "the_nilfs.h"
33#include "sb.h"
34#include "bmap.h" 33#include "bmap.h"
35 34
36/* 35/*
@@ -122,7 +121,7 @@ enum {
122#define NILFS_SYS_INO_BITS \ 121#define NILFS_SYS_INO_BITS \
123 ((unsigned int)(1 << NILFS_ROOT_INO) | NILFS_MDT_INO_BITS) 122 ((unsigned int)(1 << NILFS_ROOT_INO) | NILFS_MDT_INO_BITS)
124 123
125#define NILFS_FIRST_INO(sb) (NILFS_SB(sb)->s_nilfs->ns_first_ino) 124#define NILFS_FIRST_INO(sb) (((struct the_nilfs *)sb->s_fs_info)->ns_first_ino)
126 125
127#define NILFS_MDT_INODE(sb, ino) \ 126#define NILFS_MDT_INODE(sb, ino) \
128 ((ino) < NILFS_FIRST_INO(sb) && (NILFS_MDT_INO_BITS & (1 << (ino)))) 127 ((ino) < NILFS_FIRST_INO(sb) && (NILFS_MDT_INO_BITS & (1 << (ino))))
@@ -212,6 +211,23 @@ static inline int nilfs_init_acl(struct inode *inode, struct inode *dir)
212 211
213#define NILFS_ATIME_DISABLE 212#define NILFS_ATIME_DISABLE
214 213
214/* Flags that should be inherited by new inodes from their parent. */
215#define NILFS_FL_INHERITED \
216 (FS_SECRM_FL | FS_UNRM_FL | FS_COMPR_FL | FS_SYNC_FL | \
217 FS_IMMUTABLE_FL | FS_APPEND_FL | FS_NODUMP_FL | FS_NOATIME_FL |\
218 FS_COMPRBLK_FL | FS_NOCOMP_FL | FS_NOTAIL_FL | FS_DIRSYNC_FL)
219
220/* Mask out flags that are inappropriate for the given type of inode. */
221static inline __u32 nilfs_mask_flags(umode_t mode, __u32 flags)
222{
223 if (S_ISDIR(mode))
224 return flags;
225 else if (S_ISREG(mode))
226 return flags & ~(FS_DIRSYNC_FL | FS_TOPDIR_FL);
227 else
228 return flags & (FS_NODUMP_FL | FS_NOATIME_FL);
229}
230
215/* dir.c */ 231/* dir.c */
216extern int nilfs_add_link(struct dentry *, struct inode *); 232extern int nilfs_add_link(struct dentry *, struct inode *);
217extern ino_t nilfs_inode_by_name(struct inode *, const struct qstr *); 233extern ino_t nilfs_inode_by_name(struct inode *, const struct qstr *);
@@ -229,10 +245,13 @@ extern int nilfs_sync_file(struct file *, int);
229 245
230/* ioctl.c */ 246/* ioctl.c */
231long nilfs_ioctl(struct file *, unsigned int, unsigned long); 247long nilfs_ioctl(struct file *, unsigned int, unsigned long);
248long nilfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
232int nilfs_ioctl_prepare_clean_segments(struct the_nilfs *, struct nilfs_argv *, 249int nilfs_ioctl_prepare_clean_segments(struct the_nilfs *, struct nilfs_argv *,
233 void **); 250 void **);
234 251
235/* inode.c */ 252/* inode.c */
253void nilfs_inode_add_blocks(struct inode *inode, int n);
254void nilfs_inode_sub_blocks(struct inode *inode, int n);
236extern struct inode *nilfs_new_inode(struct inode *, int); 255extern struct inode *nilfs_new_inode(struct inode *, int);
237extern void nilfs_free_inode(struct inode *); 256extern void nilfs_free_inode(struct inode *);
238extern int nilfs_get_block(struct inode *, sector_t, struct buffer_head *, int); 257extern int nilfs_get_block(struct inode *, sector_t, struct buffer_head *, int);
@@ -275,11 +294,11 @@ extern int nilfs_check_feature_compatibility(struct super_block *,
275 struct nilfs_super_block *); 294 struct nilfs_super_block *);
276extern void nilfs_set_log_cursor(struct nilfs_super_block *, 295extern void nilfs_set_log_cursor(struct nilfs_super_block *,
277 struct the_nilfs *); 296 struct the_nilfs *);
278extern struct nilfs_super_block **nilfs_prepare_super(struct nilfs_sb_info *, 297struct nilfs_super_block **nilfs_prepare_super(struct super_block *sb,
279 int flip); 298 int flip);
280extern int nilfs_commit_super(struct nilfs_sb_info *, int); 299int nilfs_commit_super(struct super_block *sb, int flag);
281extern int nilfs_cleanup_super(struct nilfs_sb_info *); 300int nilfs_cleanup_super(struct super_block *sb);
282int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno, int curr_mnt, 301int nilfs_attach_checkpoint(struct super_block *sb, __u64 cno, int curr_mnt,
283 struct nilfs_root **root); 302 struct nilfs_root **root);
284int nilfs_checkpoint_is_mounted(struct super_block *sb, __u64 cno); 303int nilfs_checkpoint_is_mounted(struct super_block *sb, __u64 cno);
285 304
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index 0c432416cfef..4d2a1ee0eb47 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -492,29 +492,15 @@ unsigned nilfs_page_count_clean_buffers(struct page *page,
492 return nc; 492 return nc;
493} 493}
494 494
495void nilfs_mapping_init_once(struct address_space *mapping)
496{
497 memset(mapping, 0, sizeof(*mapping));
498 INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC);
499 spin_lock_init(&mapping->tree_lock);
500 INIT_LIST_HEAD(&mapping->private_list);
501 spin_lock_init(&mapping->private_lock);
502
503 spin_lock_init(&mapping->i_mmap_lock);
504 INIT_RAW_PRIO_TREE_ROOT(&mapping->i_mmap);
505 INIT_LIST_HEAD(&mapping->i_mmap_nonlinear);
506}
507
508void nilfs_mapping_init(struct address_space *mapping, 495void nilfs_mapping_init(struct address_space *mapping,
509 struct backing_dev_info *bdi, 496 struct backing_dev_info *bdi)
510 const struct address_space_operations *aops)
511{ 497{
512 mapping->host = NULL; 498 mapping->host = NULL;
513 mapping->flags = 0; 499 mapping->flags = 0;
514 mapping_set_gfp_mask(mapping, GFP_NOFS); 500 mapping_set_gfp_mask(mapping, GFP_NOFS);
515 mapping->assoc_mapping = NULL; 501 mapping->assoc_mapping = NULL;
516 mapping->backing_dev_info = bdi; 502 mapping->backing_dev_info = bdi;
517 mapping->a_ops = aops; 503 mapping->a_ops = NULL;
518} 504}
519 505
520/* 506/*
diff --git a/fs/nilfs2/page.h b/fs/nilfs2/page.h
index 622df27cd891..f06b79ad7493 100644
--- a/fs/nilfs2/page.h
+++ b/fs/nilfs2/page.h
@@ -61,10 +61,8 @@ void nilfs_free_private_page(struct page *);
61int nilfs_copy_dirty_pages(struct address_space *, struct address_space *); 61int nilfs_copy_dirty_pages(struct address_space *, struct address_space *);
62void nilfs_copy_back_pages(struct address_space *, struct address_space *); 62void nilfs_copy_back_pages(struct address_space *, struct address_space *);
63void nilfs_clear_dirty_pages(struct address_space *); 63void nilfs_clear_dirty_pages(struct address_space *);
64void nilfs_mapping_init_once(struct address_space *mapping);
65void nilfs_mapping_init(struct address_space *mapping, 64void nilfs_mapping_init(struct address_space *mapping,
66 struct backing_dev_info *bdi, 65 struct backing_dev_info *bdi);
67 const struct address_space_operations *aops);
68unsigned nilfs_page_count_clean_buffers(struct page *, unsigned, unsigned); 66unsigned nilfs_page_count_clean_buffers(struct page *, unsigned, unsigned);
69unsigned long nilfs_find_uncommitted_extent(struct inode *inode, 67unsigned long nilfs_find_uncommitted_extent(struct inode *inode,
70 sector_t start_blk, 68 sector_t start_blk,
diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c
index 3dfcd3b7d389..ba4a64518f38 100644
--- a/fs/nilfs2/recovery.c
+++ b/fs/nilfs2/recovery.c
@@ -425,7 +425,7 @@ void nilfs_dispose_segment_list(struct list_head *head)
425} 425}
426 426
427static int nilfs_prepare_segment_for_recovery(struct the_nilfs *nilfs, 427static int nilfs_prepare_segment_for_recovery(struct the_nilfs *nilfs,
428 struct nilfs_sb_info *sbi, 428 struct super_block *sb,
429 struct nilfs_recovery_info *ri) 429 struct nilfs_recovery_info *ri)
430{ 430{
431 struct list_head *head = &ri->ri_used_segments; 431 struct list_head *head = &ri->ri_used_segments;
@@ -501,7 +501,7 @@ static int nilfs_recovery_copy_block(struct the_nilfs *nilfs,
501} 501}
502 502
503static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs, 503static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs,
504 struct nilfs_sb_info *sbi, 504 struct super_block *sb,
505 struct nilfs_root *root, 505 struct nilfs_root *root,
506 struct list_head *head, 506 struct list_head *head,
507 unsigned long *nr_salvaged_blocks) 507 unsigned long *nr_salvaged_blocks)
@@ -514,7 +514,7 @@ static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs,
514 int err = 0, err2 = 0; 514 int err = 0, err2 = 0;
515 515
516 list_for_each_entry_safe(rb, n, head, list) { 516 list_for_each_entry_safe(rb, n, head, list) {
517 inode = nilfs_iget(sbi->s_super, root, rb->ino); 517 inode = nilfs_iget(sb, root, rb->ino);
518 if (IS_ERR(inode)) { 518 if (IS_ERR(inode)) {
519 err = PTR_ERR(inode); 519 err = PTR_ERR(inode);
520 inode = NULL; 520 inode = NULL;
@@ -572,11 +572,11 @@ static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs,
572 * nilfs_do_roll_forward - salvage logical segments newer than the latest 572 * nilfs_do_roll_forward - salvage logical segments newer than the latest
573 * checkpoint 573 * checkpoint
574 * @nilfs: nilfs object 574 * @nilfs: nilfs object
575 * @sbi: nilfs_sb_info 575 * @sb: super block instance
576 * @ri: pointer to a nilfs_recovery_info 576 * @ri: pointer to a nilfs_recovery_info
577 */ 577 */
578static int nilfs_do_roll_forward(struct the_nilfs *nilfs, 578static int nilfs_do_roll_forward(struct the_nilfs *nilfs,
579 struct nilfs_sb_info *sbi, 579 struct super_block *sb,
580 struct nilfs_root *root, 580 struct nilfs_root *root,
581 struct nilfs_recovery_info *ri) 581 struct nilfs_recovery_info *ri)
582{ 582{
@@ -648,7 +648,7 @@ static int nilfs_do_roll_forward(struct the_nilfs *nilfs,
648 goto failed; 648 goto failed;
649 if (flags & NILFS_SS_LOGEND) { 649 if (flags & NILFS_SS_LOGEND) {
650 err = nilfs_recover_dsync_blocks( 650 err = nilfs_recover_dsync_blocks(
651 nilfs, sbi, root, &dsync_blocks, 651 nilfs, sb, root, &dsync_blocks,
652 &nsalvaged_blocks); 652 &nsalvaged_blocks);
653 if (unlikely(err)) 653 if (unlikely(err))
654 goto failed; 654 goto failed;
@@ -681,7 +681,7 @@ static int nilfs_do_roll_forward(struct the_nilfs *nilfs,
681 681
682 if (nsalvaged_blocks) { 682 if (nsalvaged_blocks) {
683 printk(KERN_INFO "NILFS (device %s): salvaged %lu blocks\n", 683 printk(KERN_INFO "NILFS (device %s): salvaged %lu blocks\n",
684 sbi->s_super->s_id, nsalvaged_blocks); 684 sb->s_id, nsalvaged_blocks);
685 ri->ri_need_recovery = NILFS_RECOVERY_ROLLFORWARD_DONE; 685 ri->ri_need_recovery = NILFS_RECOVERY_ROLLFORWARD_DONE;
686 } 686 }
687 out: 687 out:
@@ -695,7 +695,7 @@ static int nilfs_do_roll_forward(struct the_nilfs *nilfs,
695 printk(KERN_ERR 695 printk(KERN_ERR
696 "NILFS (device %s): Error roll-forwarding " 696 "NILFS (device %s): Error roll-forwarding "
697 "(err=%d, pseg block=%llu). ", 697 "(err=%d, pseg block=%llu). ",
698 sbi->s_super->s_id, err, (unsigned long long)pseg_start); 698 sb->s_id, err, (unsigned long long)pseg_start);
699 goto out; 699 goto out;
700} 700}
701 701
@@ -724,7 +724,7 @@ static void nilfs_finish_roll_forward(struct the_nilfs *nilfs,
724/** 724/**
725 * nilfs_salvage_orphan_logs - salvage logs written after the latest checkpoint 725 * nilfs_salvage_orphan_logs - salvage logs written after the latest checkpoint
726 * @nilfs: nilfs object 726 * @nilfs: nilfs object
727 * @sbi: nilfs_sb_info 727 * @sb: super block instance
728 * @ri: pointer to a nilfs_recovery_info struct to store search results. 728 * @ri: pointer to a nilfs_recovery_info struct to store search results.
729 * 729 *
730 * Return Value: On success, 0 is returned. On error, one of the following 730 * Return Value: On success, 0 is returned. On error, one of the following
@@ -741,7 +741,7 @@ static void nilfs_finish_roll_forward(struct the_nilfs *nilfs,
741 * %-ENOMEM - Insufficient memory available. 741 * %-ENOMEM - Insufficient memory available.
742 */ 742 */
743int nilfs_salvage_orphan_logs(struct the_nilfs *nilfs, 743int nilfs_salvage_orphan_logs(struct the_nilfs *nilfs,
744 struct nilfs_sb_info *sbi, 744 struct super_block *sb,
745 struct nilfs_recovery_info *ri) 745 struct nilfs_recovery_info *ri)
746{ 746{
747 struct nilfs_root *root; 747 struct nilfs_root *root;
@@ -750,32 +750,32 @@ int nilfs_salvage_orphan_logs(struct the_nilfs *nilfs,
750 if (ri->ri_lsegs_start == 0 || ri->ri_lsegs_end == 0) 750 if (ri->ri_lsegs_start == 0 || ri->ri_lsegs_end == 0)
751 return 0; 751 return 0;
752 752
753 err = nilfs_attach_checkpoint(sbi, ri->ri_cno, true, &root); 753 err = nilfs_attach_checkpoint(sb, ri->ri_cno, true, &root);
754 if (unlikely(err)) { 754 if (unlikely(err)) {
755 printk(KERN_ERR 755 printk(KERN_ERR
756 "NILFS: error loading the latest checkpoint.\n"); 756 "NILFS: error loading the latest checkpoint.\n");
757 return err; 757 return err;
758 } 758 }
759 759
760 err = nilfs_do_roll_forward(nilfs, sbi, root, ri); 760 err = nilfs_do_roll_forward(nilfs, sb, root, ri);
761 if (unlikely(err)) 761 if (unlikely(err))
762 goto failed; 762 goto failed;
763 763
764 if (ri->ri_need_recovery == NILFS_RECOVERY_ROLLFORWARD_DONE) { 764 if (ri->ri_need_recovery == NILFS_RECOVERY_ROLLFORWARD_DONE) {
765 err = nilfs_prepare_segment_for_recovery(nilfs, sbi, ri); 765 err = nilfs_prepare_segment_for_recovery(nilfs, sb, ri);
766 if (unlikely(err)) { 766 if (unlikely(err)) {
767 printk(KERN_ERR "NILFS: Error preparing segments for " 767 printk(KERN_ERR "NILFS: Error preparing segments for "
768 "recovery.\n"); 768 "recovery.\n");
769 goto failed; 769 goto failed;
770 } 770 }
771 771
772 err = nilfs_attach_segment_constructor(sbi, root); 772 err = nilfs_attach_log_writer(sb, root);
773 if (unlikely(err)) 773 if (unlikely(err))
774 goto failed; 774 goto failed;
775 775
776 set_nilfs_discontinued(nilfs); 776 set_nilfs_discontinued(nilfs);
777 err = nilfs_construct_segment(sbi->s_super); 777 err = nilfs_construct_segment(sb);
778 nilfs_detach_segment_constructor(sbi); 778 nilfs_detach_log_writer(sb);
779 779
780 if (unlikely(err)) { 780 if (unlikely(err)) {
781 printk(KERN_ERR "NILFS: Oops! recovery failed. " 781 printk(KERN_ERR "NILFS: Oops! recovery failed. "
diff --git a/fs/nilfs2/sb.h b/fs/nilfs2/sb.h
deleted file mode 100644
index 7a17715f215f..000000000000
--- a/fs/nilfs2/sb.h
+++ /dev/null
@@ -1,85 +0,0 @@
1/*
2 * sb.h - NILFS on-memory super block structure.
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Ryusuke Konishi <ryusuke@osrg.net>
21 *
22 */
23
24#ifndef _NILFS_SB
25#define _NILFS_SB
26
27#include <linux/types.h>
28#include <linux/fs.h>
29
30struct the_nilfs;
31struct nilfs_sc_info;
32
33/*
34 * NILFS super-block data in memory
35 */
36struct nilfs_sb_info {
37 /* Mount options */
38 unsigned long s_mount_opt;
39 uid_t s_resuid;
40 gid_t s_resgid;
41
42 unsigned long s_interval; /* construction interval */
43 unsigned long s_watermark; /* threshold of data amount
44 for the segment construction */
45
46 /* Fundamental members */
47 struct super_block *s_super; /* reverse pointer to super_block */
48 struct the_nilfs *s_nilfs;
49
50 /* Segment constructor */
51 struct list_head s_dirty_files; /* dirty files list */
52 struct nilfs_sc_info *s_sc_info; /* segment constructor info */
53 spinlock_t s_inode_lock; /* Lock for the nilfs inode.
54 It covers s_dirty_files list */
55
56 /* Inode allocator */
57 spinlock_t s_next_gen_lock;
58 u32 s_next_generation;
59};
60
61static inline struct nilfs_sb_info *NILFS_SB(struct super_block *sb)
62{
63 return sb->s_fs_info;
64}
65
66static inline struct nilfs_sc_info *NILFS_SC(struct nilfs_sb_info *sbi)
67{
68 return sbi->s_sc_info;
69}
70
71/*
72 * Bit operations for the mount option
73 */
74#define nilfs_clear_opt(sbi, opt) \
75 do { (sbi)->s_mount_opt &= ~NILFS_MOUNT_##opt; } while (0)
76#define nilfs_set_opt(sbi, opt) \
77 do { (sbi)->s_mount_opt |= NILFS_MOUNT_##opt; } while (0)
78#define nilfs_test_opt(sbi, opt) ((sbi)->s_mount_opt & NILFS_MOUNT_##opt)
79#define nilfs_write_opt(sbi, mask, opt) \
80 do { (sbi)->s_mount_opt = \
81 (((sbi)->s_mount_opt & ~NILFS_MOUNT_##mask) | \
82 NILFS_MOUNT_##opt); \
83 } while (0)
84
85#endif /* _NILFS_SB */
diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c
index 0f83e93935b2..2853ff20f85a 100644
--- a/fs/nilfs2/segbuf.c
+++ b/fs/nilfs2/segbuf.c
@@ -509,7 +509,7 @@ static int nilfs_segbuf_write(struct nilfs_segment_buffer *segbuf,
509 * Last BIO is always sent through the following 509 * Last BIO is always sent through the following
510 * submission. 510 * submission.
511 */ 511 */
512 rw |= REQ_SYNC | REQ_UNPLUG; 512 rw |= REQ_SYNC;
513 res = nilfs_segbuf_submit_bio(segbuf, &wi, rw); 513 res = nilfs_segbuf_submit_bio(segbuf, &wi, rw);
514 } 514 }
515 515
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 55ebae5c7f39..afe4f2183454 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -104,8 +104,7 @@ struct nilfs_sc_operations {
104static void nilfs_segctor_start_timer(struct nilfs_sc_info *); 104static void nilfs_segctor_start_timer(struct nilfs_sc_info *);
105static void nilfs_segctor_do_flush(struct nilfs_sc_info *, int); 105static void nilfs_segctor_do_flush(struct nilfs_sc_info *, int);
106static void nilfs_segctor_do_immediate_flush(struct nilfs_sc_info *); 106static void nilfs_segctor_do_immediate_flush(struct nilfs_sc_info *);
107static void nilfs_dispose_list(struct nilfs_sb_info *, struct list_head *, 107static void nilfs_dispose_list(struct the_nilfs *, struct list_head *, int);
108 int);
109 108
110#define nilfs_cnt32_gt(a, b) \ 109#define nilfs_cnt32_gt(a, b) \
111 (typecheck(__u32, a) && typecheck(__u32, b) && \ 110 (typecheck(__u32, a) && typecheck(__u32, b) && \
@@ -182,7 +181,6 @@ int nilfs_transaction_begin(struct super_block *sb,
182 struct nilfs_transaction_info *ti, 181 struct nilfs_transaction_info *ti,
183 int vacancy_check) 182 int vacancy_check)
184{ 183{
185 struct nilfs_sb_info *sbi;
186 struct the_nilfs *nilfs; 184 struct the_nilfs *nilfs;
187 int ret = nilfs_prepare_segment_lock(ti); 185 int ret = nilfs_prepare_segment_lock(ti);
188 186
@@ -193,8 +191,7 @@ int nilfs_transaction_begin(struct super_block *sb,
193 191
194 vfs_check_frozen(sb, SB_FREEZE_WRITE); 192 vfs_check_frozen(sb, SB_FREEZE_WRITE);
195 193
196 sbi = NILFS_SB(sb); 194 nilfs = sb->s_fs_info;
197 nilfs = sbi->s_nilfs;
198 down_read(&nilfs->ns_segctor_sem); 195 down_read(&nilfs->ns_segctor_sem);
199 if (vacancy_check && nilfs_near_disk_full(nilfs)) { 196 if (vacancy_check && nilfs_near_disk_full(nilfs)) {
200 up_read(&nilfs->ns_segctor_sem); 197 up_read(&nilfs->ns_segctor_sem);
@@ -225,8 +222,7 @@ int nilfs_transaction_begin(struct super_block *sb,
225int nilfs_transaction_commit(struct super_block *sb) 222int nilfs_transaction_commit(struct super_block *sb)
226{ 223{
227 struct nilfs_transaction_info *ti = current->journal_info; 224 struct nilfs_transaction_info *ti = current->journal_info;
228 struct nilfs_sb_info *sbi; 225 struct the_nilfs *nilfs = sb->s_fs_info;
229 struct nilfs_sc_info *sci;
230 int err = 0; 226 int err = 0;
231 227
232 BUG_ON(ti == NULL || ti->ti_magic != NILFS_TI_MAGIC); 228 BUG_ON(ti == NULL || ti->ti_magic != NILFS_TI_MAGIC);
@@ -235,16 +231,15 @@ int nilfs_transaction_commit(struct super_block *sb)
235 ti->ti_count--; 231 ti->ti_count--;
236 return 0; 232 return 0;
237 } 233 }
238 sbi = NILFS_SB(sb); 234 if (nilfs->ns_writer) {
239 sci = NILFS_SC(sbi); 235 struct nilfs_sc_info *sci = nilfs->ns_writer;
240 if (sci != NULL) { 236
241 if (ti->ti_flags & NILFS_TI_COMMIT) 237 if (ti->ti_flags & NILFS_TI_COMMIT)
242 nilfs_segctor_start_timer(sci); 238 nilfs_segctor_start_timer(sci);
243 if (atomic_read(&sbi->s_nilfs->ns_ndirtyblks) > 239 if (atomic_read(&nilfs->ns_ndirtyblks) > sci->sc_watermark)
244 sci->sc_watermark)
245 nilfs_segctor_do_flush(sci, 0); 240 nilfs_segctor_do_flush(sci, 0);
246 } 241 }
247 up_read(&sbi->s_nilfs->ns_segctor_sem); 242 up_read(&nilfs->ns_segctor_sem);
248 current->journal_info = ti->ti_save; 243 current->journal_info = ti->ti_save;
249 244
250 if (ti->ti_flags & NILFS_TI_SYNC) 245 if (ti->ti_flags & NILFS_TI_SYNC)
@@ -257,13 +252,14 @@ int nilfs_transaction_commit(struct super_block *sb)
257void nilfs_transaction_abort(struct super_block *sb) 252void nilfs_transaction_abort(struct super_block *sb)
258{ 253{
259 struct nilfs_transaction_info *ti = current->journal_info; 254 struct nilfs_transaction_info *ti = current->journal_info;
255 struct the_nilfs *nilfs = sb->s_fs_info;
260 256
261 BUG_ON(ti == NULL || ti->ti_magic != NILFS_TI_MAGIC); 257 BUG_ON(ti == NULL || ti->ti_magic != NILFS_TI_MAGIC);
262 if (ti->ti_count > 0) { 258 if (ti->ti_count > 0) {
263 ti->ti_count--; 259 ti->ti_count--;
264 return; 260 return;
265 } 261 }
266 up_read(&NILFS_SB(sb)->s_nilfs->ns_segctor_sem); 262 up_read(&nilfs->ns_segctor_sem);
267 263
268 current->journal_info = ti->ti_save; 264 current->journal_info = ti->ti_save;
269 if (ti->ti_flags & NILFS_TI_DYNAMIC_ALLOC) 265 if (ti->ti_flags & NILFS_TI_DYNAMIC_ALLOC)
@@ -272,9 +268,8 @@ void nilfs_transaction_abort(struct super_block *sb)
272 268
273void nilfs_relax_pressure_in_lock(struct super_block *sb) 269void nilfs_relax_pressure_in_lock(struct super_block *sb)
274{ 270{
275 struct nilfs_sb_info *sbi = NILFS_SB(sb); 271 struct the_nilfs *nilfs = sb->s_fs_info;
276 struct nilfs_sc_info *sci = NILFS_SC(sbi); 272 struct nilfs_sc_info *sci = nilfs->ns_writer;
277 struct the_nilfs *nilfs = sbi->s_nilfs;
278 273
279 if (!sci || !sci->sc_flush_request) 274 if (!sci || !sci->sc_flush_request)
280 return; 275 return;
@@ -294,11 +289,13 @@ void nilfs_relax_pressure_in_lock(struct super_block *sb)
294 downgrade_write(&nilfs->ns_segctor_sem); 289 downgrade_write(&nilfs->ns_segctor_sem);
295} 290}
296 291
297static void nilfs_transaction_lock(struct nilfs_sb_info *sbi, 292static void nilfs_transaction_lock(struct super_block *sb,
298 struct nilfs_transaction_info *ti, 293 struct nilfs_transaction_info *ti,
299 int gcflag) 294 int gcflag)
300{ 295{
301 struct nilfs_transaction_info *cur_ti = current->journal_info; 296 struct nilfs_transaction_info *cur_ti = current->journal_info;
297 struct the_nilfs *nilfs = sb->s_fs_info;
298 struct nilfs_sc_info *sci = nilfs->ns_writer;
302 299
303 WARN_ON(cur_ti); 300 WARN_ON(cur_ti);
304 ti->ti_flags = NILFS_TI_WRITER; 301 ti->ti_flags = NILFS_TI_WRITER;
@@ -309,30 +306,31 @@ static void nilfs_transaction_lock(struct nilfs_sb_info *sbi,
309 current->journal_info = ti; 306 current->journal_info = ti;
310 307
311 for (;;) { 308 for (;;) {
312 down_write(&sbi->s_nilfs->ns_segctor_sem); 309 down_write(&nilfs->ns_segctor_sem);
313 if (!test_bit(NILFS_SC_PRIOR_FLUSH, &NILFS_SC(sbi)->sc_flags)) 310 if (!test_bit(NILFS_SC_PRIOR_FLUSH, &sci->sc_flags))
314 break; 311 break;
315 312
316 nilfs_segctor_do_immediate_flush(NILFS_SC(sbi)); 313 nilfs_segctor_do_immediate_flush(sci);
317 314
318 up_write(&sbi->s_nilfs->ns_segctor_sem); 315 up_write(&nilfs->ns_segctor_sem);
319 yield(); 316 yield();
320 } 317 }
321 if (gcflag) 318 if (gcflag)
322 ti->ti_flags |= NILFS_TI_GC; 319 ti->ti_flags |= NILFS_TI_GC;
323} 320}
324 321
325static void nilfs_transaction_unlock(struct nilfs_sb_info *sbi) 322static void nilfs_transaction_unlock(struct super_block *sb)
326{ 323{
327 struct nilfs_transaction_info *ti = current->journal_info; 324 struct nilfs_transaction_info *ti = current->journal_info;
325 struct the_nilfs *nilfs = sb->s_fs_info;
328 326
329 BUG_ON(ti == NULL || ti->ti_magic != NILFS_TI_MAGIC); 327 BUG_ON(ti == NULL || ti->ti_magic != NILFS_TI_MAGIC);
330 BUG_ON(ti->ti_count > 0); 328 BUG_ON(ti->ti_count > 0);
331 329
332 up_write(&sbi->s_nilfs->ns_segctor_sem); 330 up_write(&nilfs->ns_segctor_sem);
333 current->journal_info = ti->ti_save; 331 current->journal_info = ti->ti_save;
334 if (!list_empty(&ti->ti_garbage)) 332 if (!list_empty(&ti->ti_garbage))
335 nilfs_dispose_list(sbi, &ti->ti_garbage, 0); 333 nilfs_dispose_list(nilfs, &ti->ti_garbage, 0);
336} 334}
337 335
338static void *nilfs_segctor_map_segsum_entry(struct nilfs_sc_info *sci, 336static void *nilfs_segctor_map_segsum_entry(struct nilfs_sc_info *sci,
@@ -430,7 +428,8 @@ static void nilfs_segctor_begin_finfo(struct nilfs_sc_info *sci,
430 nilfs_segctor_map_segsum_entry( 428 nilfs_segctor_map_segsum_entry(
431 sci, &sci->sc_binfo_ptr, sizeof(struct nilfs_finfo)); 429 sci, &sci->sc_binfo_ptr, sizeof(struct nilfs_finfo));
432 430
433 if (inode->i_sb && !test_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags)) 431 if (NILFS_I(inode)->i_root &&
432 !test_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags))
434 set_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags); 433 set_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags);
435 /* skip finfo */ 434 /* skip finfo */
436} 435}
@@ -713,7 +712,7 @@ static void nilfs_lookup_dirty_node_buffers(struct inode *inode,
713 } 712 }
714} 713}
715 714
716static void nilfs_dispose_list(struct nilfs_sb_info *sbi, 715static void nilfs_dispose_list(struct the_nilfs *nilfs,
717 struct list_head *head, int force) 716 struct list_head *head, int force)
718{ 717{
719 struct nilfs_inode_info *ii, *n; 718 struct nilfs_inode_info *ii, *n;
@@ -721,7 +720,7 @@ static void nilfs_dispose_list(struct nilfs_sb_info *sbi,
721 unsigned nv = 0; 720 unsigned nv = 0;
722 721
723 while (!list_empty(head)) { 722 while (!list_empty(head)) {
724 spin_lock(&sbi->s_inode_lock); 723 spin_lock(&nilfs->ns_inode_lock);
725 list_for_each_entry_safe(ii, n, head, i_dirty) { 724 list_for_each_entry_safe(ii, n, head, i_dirty) {
726 list_del_init(&ii->i_dirty); 725 list_del_init(&ii->i_dirty);
727 if (force) { 726 if (force) {
@@ -732,14 +731,14 @@ static void nilfs_dispose_list(struct nilfs_sb_info *sbi,
732 } else if (test_bit(NILFS_I_DIRTY, &ii->i_state)) { 731 } else if (test_bit(NILFS_I_DIRTY, &ii->i_state)) {
733 set_bit(NILFS_I_QUEUED, &ii->i_state); 732 set_bit(NILFS_I_QUEUED, &ii->i_state);
734 list_add_tail(&ii->i_dirty, 733 list_add_tail(&ii->i_dirty,
735 &sbi->s_dirty_files); 734 &nilfs->ns_dirty_files);
736 continue; 735 continue;
737 } 736 }
738 ivec[nv++] = ii; 737 ivec[nv++] = ii;
739 if (nv == SC_N_INODEVEC) 738 if (nv == SC_N_INODEVEC)
740 break; 739 break;
741 } 740 }
742 spin_unlock(&sbi->s_inode_lock); 741 spin_unlock(&nilfs->ns_inode_lock);
743 742
744 for (pii = ivec; nv > 0; pii++, nv--) 743 for (pii = ivec; nv > 0; pii++, nv--)
745 iput(&(*pii)->vfs_inode); 744 iput(&(*pii)->vfs_inode);
@@ -772,24 +771,23 @@ static int nilfs_segctor_clean(struct nilfs_sc_info *sci)
772 771
773static int nilfs_segctor_confirm(struct nilfs_sc_info *sci) 772static int nilfs_segctor_confirm(struct nilfs_sc_info *sci)
774{ 773{
775 struct nilfs_sb_info *sbi = sci->sc_sbi; 774 struct the_nilfs *nilfs = sci->sc_super->s_fs_info;
776 int ret = 0; 775 int ret = 0;
777 776
778 if (nilfs_test_metadata_dirty(sbi->s_nilfs, sci->sc_root)) 777 if (nilfs_test_metadata_dirty(nilfs, sci->sc_root))
779 set_bit(NILFS_SC_DIRTY, &sci->sc_flags); 778 set_bit(NILFS_SC_DIRTY, &sci->sc_flags);
780 779
781 spin_lock(&sbi->s_inode_lock); 780 spin_lock(&nilfs->ns_inode_lock);
782 if (list_empty(&sbi->s_dirty_files) && nilfs_segctor_clean(sci)) 781 if (list_empty(&nilfs->ns_dirty_files) && nilfs_segctor_clean(sci))
783 ret++; 782 ret++;
784 783
785 spin_unlock(&sbi->s_inode_lock); 784 spin_unlock(&nilfs->ns_inode_lock);
786 return ret; 785 return ret;
787} 786}
788 787
789static void nilfs_segctor_clear_metadata_dirty(struct nilfs_sc_info *sci) 788static void nilfs_segctor_clear_metadata_dirty(struct nilfs_sc_info *sci)
790{ 789{
791 struct nilfs_sb_info *sbi = sci->sc_sbi; 790 struct the_nilfs *nilfs = sci->sc_super->s_fs_info;
792 struct the_nilfs *nilfs = sbi->s_nilfs;
793 791
794 nilfs_mdt_clear_dirty(sci->sc_root->ifile); 792 nilfs_mdt_clear_dirty(sci->sc_root->ifile);
795 nilfs_mdt_clear_dirty(nilfs->ns_cpfile); 793 nilfs_mdt_clear_dirty(nilfs->ns_cpfile);
@@ -799,7 +797,7 @@ static void nilfs_segctor_clear_metadata_dirty(struct nilfs_sc_info *sci)
799 797
800static int nilfs_segctor_create_checkpoint(struct nilfs_sc_info *sci) 798static int nilfs_segctor_create_checkpoint(struct nilfs_sc_info *sci)
801{ 799{
802 struct the_nilfs *nilfs = sci->sc_sbi->s_nilfs; 800 struct the_nilfs *nilfs = sci->sc_super->s_fs_info;
803 struct buffer_head *bh_cp; 801 struct buffer_head *bh_cp;
804 struct nilfs_checkpoint *raw_cp; 802 struct nilfs_checkpoint *raw_cp;
805 int err; 803 int err;
@@ -823,8 +821,7 @@ static int nilfs_segctor_create_checkpoint(struct nilfs_sc_info *sci)
823 821
824static int nilfs_segctor_fill_in_checkpoint(struct nilfs_sc_info *sci) 822static int nilfs_segctor_fill_in_checkpoint(struct nilfs_sc_info *sci)
825{ 823{
826 struct nilfs_sb_info *sbi = sci->sc_sbi; 824 struct the_nilfs *nilfs = sci->sc_super->s_fs_info;
827 struct the_nilfs *nilfs = sbi->s_nilfs;
828 struct buffer_head *bh_cp; 825 struct buffer_head *bh_cp;
829 struct nilfs_checkpoint *raw_cp; 826 struct nilfs_checkpoint *raw_cp;
830 int err; 827 int err;
@@ -1048,8 +1045,7 @@ static int nilfs_segctor_scan_file_dsync(struct nilfs_sc_info *sci,
1048 1045
1049static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode) 1046static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode)
1050{ 1047{
1051 struct nilfs_sb_info *sbi = sci->sc_sbi; 1048 struct the_nilfs *nilfs = sci->sc_super->s_fs_info;
1052 struct the_nilfs *nilfs = sbi->s_nilfs;
1053 struct list_head *head; 1049 struct list_head *head;
1054 struct nilfs_inode_info *ii; 1050 struct nilfs_inode_info *ii;
1055 size_t ndone; 1051 size_t ndone;
@@ -1858,7 +1854,7 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
1858{ 1854{
1859 struct nilfs_segment_buffer *segbuf; 1855 struct nilfs_segment_buffer *segbuf;
1860 struct page *bd_page = NULL, *fs_page = NULL; 1856 struct page *bd_page = NULL, *fs_page = NULL;
1861 struct the_nilfs *nilfs = sci->sc_sbi->s_nilfs; 1857 struct the_nilfs *nilfs = sci->sc_super->s_fs_info;
1862 int update_sr = false; 1858 int update_sr = false;
1863 1859
1864 list_for_each_entry(segbuf, &sci->sc_write_logs, sb_list) { 1860 list_for_each_entry(segbuf, &sci->sc_write_logs, sb_list) {
@@ -1962,30 +1958,30 @@ static int nilfs_segctor_wait(struct nilfs_sc_info *sci)
1962 return ret; 1958 return ret;
1963} 1959}
1964 1960
1965static int nilfs_segctor_check_in_files(struct nilfs_sc_info *sci, 1961static int nilfs_segctor_collect_dirty_files(struct nilfs_sc_info *sci,
1966 struct nilfs_sb_info *sbi) 1962 struct the_nilfs *nilfs)
1967{ 1963{
1968 struct nilfs_inode_info *ii, *n; 1964 struct nilfs_inode_info *ii, *n;
1969 struct inode *ifile = sci->sc_root->ifile; 1965 struct inode *ifile = sci->sc_root->ifile;
1970 1966
1971 spin_lock(&sbi->s_inode_lock); 1967 spin_lock(&nilfs->ns_inode_lock);
1972 retry: 1968 retry:
1973 list_for_each_entry_safe(ii, n, &sbi->s_dirty_files, i_dirty) { 1969 list_for_each_entry_safe(ii, n, &nilfs->ns_dirty_files, i_dirty) {
1974 if (!ii->i_bh) { 1970 if (!ii->i_bh) {
1975 struct buffer_head *ibh; 1971 struct buffer_head *ibh;
1976 int err; 1972 int err;
1977 1973
1978 spin_unlock(&sbi->s_inode_lock); 1974 spin_unlock(&nilfs->ns_inode_lock);
1979 err = nilfs_ifile_get_inode_block( 1975 err = nilfs_ifile_get_inode_block(
1980 ifile, ii->vfs_inode.i_ino, &ibh); 1976 ifile, ii->vfs_inode.i_ino, &ibh);
1981 if (unlikely(err)) { 1977 if (unlikely(err)) {
1982 nilfs_warning(sbi->s_super, __func__, 1978 nilfs_warning(sci->sc_super, __func__,
1983 "failed to get inode block.\n"); 1979 "failed to get inode block.\n");
1984 return err; 1980 return err;
1985 } 1981 }
1986 nilfs_mdt_mark_buffer_dirty(ibh); 1982 nilfs_mdt_mark_buffer_dirty(ibh);
1987 nilfs_mdt_mark_dirty(ifile); 1983 nilfs_mdt_mark_dirty(ifile);
1988 spin_lock(&sbi->s_inode_lock); 1984 spin_lock(&nilfs->ns_inode_lock);
1989 if (likely(!ii->i_bh)) 1985 if (likely(!ii->i_bh))
1990 ii->i_bh = ibh; 1986 ii->i_bh = ibh;
1991 else 1987 else
@@ -1998,18 +1994,18 @@ static int nilfs_segctor_check_in_files(struct nilfs_sc_info *sci,
1998 list_del(&ii->i_dirty); 1994 list_del(&ii->i_dirty);
1999 list_add_tail(&ii->i_dirty, &sci->sc_dirty_files); 1995 list_add_tail(&ii->i_dirty, &sci->sc_dirty_files);
2000 } 1996 }
2001 spin_unlock(&sbi->s_inode_lock); 1997 spin_unlock(&nilfs->ns_inode_lock);
2002 1998
2003 return 0; 1999 return 0;
2004} 2000}
2005 2001
2006static void nilfs_segctor_check_out_files(struct nilfs_sc_info *sci, 2002static void nilfs_segctor_drop_written_files(struct nilfs_sc_info *sci,
2007 struct nilfs_sb_info *sbi) 2003 struct the_nilfs *nilfs)
2008{ 2004{
2009 struct nilfs_transaction_info *ti = current->journal_info; 2005 struct nilfs_transaction_info *ti = current->journal_info;
2010 struct nilfs_inode_info *ii, *n; 2006 struct nilfs_inode_info *ii, *n;
2011 2007
2012 spin_lock(&sbi->s_inode_lock); 2008 spin_lock(&nilfs->ns_inode_lock);
2013 list_for_each_entry_safe(ii, n, &sci->sc_dirty_files, i_dirty) { 2009 list_for_each_entry_safe(ii, n, &sci->sc_dirty_files, i_dirty) {
2014 if (!test_and_clear_bit(NILFS_I_UPDATED, &ii->i_state) || 2010 if (!test_and_clear_bit(NILFS_I_UPDATED, &ii->i_state) ||
2015 test_bit(NILFS_I_DIRTY, &ii->i_state)) 2011 test_bit(NILFS_I_DIRTY, &ii->i_state))
@@ -2021,7 +2017,7 @@ static void nilfs_segctor_check_out_files(struct nilfs_sc_info *sci,
2021 list_del(&ii->i_dirty); 2017 list_del(&ii->i_dirty);
2022 list_add_tail(&ii->i_dirty, &ti->ti_garbage); 2018 list_add_tail(&ii->i_dirty, &ti->ti_garbage);
2023 } 2019 }
2024 spin_unlock(&sbi->s_inode_lock); 2020 spin_unlock(&nilfs->ns_inode_lock);
2025} 2021}
2026 2022
2027/* 2023/*
@@ -2029,15 +2025,14 @@ static void nilfs_segctor_check_out_files(struct nilfs_sc_info *sci,
2029 */ 2025 */
2030static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode) 2026static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
2031{ 2027{
2032 struct nilfs_sb_info *sbi = sci->sc_sbi; 2028 struct the_nilfs *nilfs = sci->sc_super->s_fs_info;
2033 struct the_nilfs *nilfs = sbi->s_nilfs;
2034 struct page *failed_page; 2029 struct page *failed_page;
2035 int err; 2030 int err;
2036 2031
2037 sci->sc_stage.scnt = NILFS_ST_INIT; 2032 sci->sc_stage.scnt = NILFS_ST_INIT;
2038 sci->sc_cno = nilfs->ns_cno; 2033 sci->sc_cno = nilfs->ns_cno;
2039 2034
2040 err = nilfs_segctor_check_in_files(sci, sbi); 2035 err = nilfs_segctor_collect_dirty_files(sci, nilfs);
2041 if (unlikely(err)) 2036 if (unlikely(err))
2042 goto out; 2037 goto out;
2043 2038
@@ -2115,7 +2110,7 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
2115 } while (sci->sc_stage.scnt != NILFS_ST_DONE); 2110 } while (sci->sc_stage.scnt != NILFS_ST_DONE);
2116 2111
2117 out: 2112 out:
2118 nilfs_segctor_check_out_files(sci, sbi); 2113 nilfs_segctor_drop_written_files(sci, nilfs);
2119 return err; 2114 return err;
2120 2115
2121 failed_to_write: 2116 failed_to_write:
@@ -2168,8 +2163,8 @@ static void nilfs_segctor_do_flush(struct nilfs_sc_info *sci, int bn)
2168 */ 2163 */
2169void nilfs_flush_segment(struct super_block *sb, ino_t ino) 2164void nilfs_flush_segment(struct super_block *sb, ino_t ino)
2170{ 2165{
2171 struct nilfs_sb_info *sbi = NILFS_SB(sb); 2166 struct the_nilfs *nilfs = sb->s_fs_info;
2172 struct nilfs_sc_info *sci = NILFS_SC(sbi); 2167 struct nilfs_sc_info *sci = nilfs->ns_writer;
2173 2168
2174 if (!sci || nilfs_doing_construction()) 2169 if (!sci || nilfs_doing_construction())
2175 return; 2170 return;
@@ -2258,8 +2253,8 @@ static void nilfs_segctor_wakeup(struct nilfs_sc_info *sci, int err)
2258 */ 2253 */
2259int nilfs_construct_segment(struct super_block *sb) 2254int nilfs_construct_segment(struct super_block *sb)
2260{ 2255{
2261 struct nilfs_sb_info *sbi = NILFS_SB(sb); 2256 struct the_nilfs *nilfs = sb->s_fs_info;
2262 struct nilfs_sc_info *sci = NILFS_SC(sbi); 2257 struct nilfs_sc_info *sci = nilfs->ns_writer;
2263 struct nilfs_transaction_info *ti; 2258 struct nilfs_transaction_info *ti;
2264 int err; 2259 int err;
2265 2260
@@ -2296,8 +2291,8 @@ int nilfs_construct_segment(struct super_block *sb)
2296int nilfs_construct_dsync_segment(struct super_block *sb, struct inode *inode, 2291int nilfs_construct_dsync_segment(struct super_block *sb, struct inode *inode,
2297 loff_t start, loff_t end) 2292 loff_t start, loff_t end)
2298{ 2293{
2299 struct nilfs_sb_info *sbi = NILFS_SB(sb); 2294 struct the_nilfs *nilfs = sb->s_fs_info;
2300 struct nilfs_sc_info *sci = NILFS_SC(sbi); 2295 struct nilfs_sc_info *sci = nilfs->ns_writer;
2301 struct nilfs_inode_info *ii; 2296 struct nilfs_inode_info *ii;
2302 struct nilfs_transaction_info ti; 2297 struct nilfs_transaction_info ti;
2303 int err = 0; 2298 int err = 0;
@@ -2305,33 +2300,33 @@ int nilfs_construct_dsync_segment(struct super_block *sb, struct inode *inode,
2305 if (!sci) 2300 if (!sci)
2306 return -EROFS; 2301 return -EROFS;
2307 2302
2308 nilfs_transaction_lock(sbi, &ti, 0); 2303 nilfs_transaction_lock(sb, &ti, 0);
2309 2304
2310 ii = NILFS_I(inode); 2305 ii = NILFS_I(inode);
2311 if (test_bit(NILFS_I_INODE_DIRTY, &ii->i_state) || 2306 if (test_bit(NILFS_I_INODE_DIRTY, &ii->i_state) ||
2312 nilfs_test_opt(sbi, STRICT_ORDER) || 2307 nilfs_test_opt(nilfs, STRICT_ORDER) ||
2313 test_bit(NILFS_SC_UNCLOSED, &sci->sc_flags) || 2308 test_bit(NILFS_SC_UNCLOSED, &sci->sc_flags) ||
2314 nilfs_discontinued(sbi->s_nilfs)) { 2309 nilfs_discontinued(nilfs)) {
2315 nilfs_transaction_unlock(sbi); 2310 nilfs_transaction_unlock(sb);
2316 err = nilfs_segctor_sync(sci); 2311 err = nilfs_segctor_sync(sci);
2317 return err; 2312 return err;
2318 } 2313 }
2319 2314
2320 spin_lock(&sbi->s_inode_lock); 2315 spin_lock(&nilfs->ns_inode_lock);
2321 if (!test_bit(NILFS_I_QUEUED, &ii->i_state) && 2316 if (!test_bit(NILFS_I_QUEUED, &ii->i_state) &&
2322 !test_bit(NILFS_I_BUSY, &ii->i_state)) { 2317 !test_bit(NILFS_I_BUSY, &ii->i_state)) {
2323 spin_unlock(&sbi->s_inode_lock); 2318 spin_unlock(&nilfs->ns_inode_lock);
2324 nilfs_transaction_unlock(sbi); 2319 nilfs_transaction_unlock(sb);
2325 return 0; 2320 return 0;
2326 } 2321 }
2327 spin_unlock(&sbi->s_inode_lock); 2322 spin_unlock(&nilfs->ns_inode_lock);
2328 sci->sc_dsync_inode = ii; 2323 sci->sc_dsync_inode = ii;
2329 sci->sc_dsync_start = start; 2324 sci->sc_dsync_start = start;
2330 sci->sc_dsync_end = end; 2325 sci->sc_dsync_end = end;
2331 2326
2332 err = nilfs_segctor_do_construct(sci, SC_LSEG_DSYNC); 2327 err = nilfs_segctor_do_construct(sci, SC_LSEG_DSYNC);
2333 2328
2334 nilfs_transaction_unlock(sbi); 2329 nilfs_transaction_unlock(sb);
2335 return err; 2330 return err;
2336} 2331}
2337 2332
@@ -2387,8 +2382,7 @@ static void nilfs_segctor_notify(struct nilfs_sc_info *sci, int mode, int err)
2387 */ 2382 */
2388static int nilfs_segctor_construct(struct nilfs_sc_info *sci, int mode) 2383static int nilfs_segctor_construct(struct nilfs_sc_info *sci, int mode)
2389{ 2384{
2390 struct nilfs_sb_info *sbi = sci->sc_sbi; 2385 struct the_nilfs *nilfs = sci->sc_super->s_fs_info;
2391 struct the_nilfs *nilfs = sbi->s_nilfs;
2392 struct nilfs_super_block **sbp; 2386 struct nilfs_super_block **sbp;
2393 int err = 0; 2387 int err = 0;
2394 2388
@@ -2406,11 +2400,12 @@ static int nilfs_segctor_construct(struct nilfs_sc_info *sci, int mode)
2406 nilfs_discontinued(nilfs)) { 2400 nilfs_discontinued(nilfs)) {
2407 down_write(&nilfs->ns_sem); 2401 down_write(&nilfs->ns_sem);
2408 err = -EIO; 2402 err = -EIO;
2409 sbp = nilfs_prepare_super(sbi, 2403 sbp = nilfs_prepare_super(sci->sc_super,
2410 nilfs_sb_will_flip(nilfs)); 2404 nilfs_sb_will_flip(nilfs));
2411 if (likely(sbp)) { 2405 if (likely(sbp)) {
2412 nilfs_set_log_cursor(sbp[0], nilfs); 2406 nilfs_set_log_cursor(sbp[0], nilfs);
2413 err = nilfs_commit_super(sbi, NILFS_SB_COMMIT); 2407 err = nilfs_commit_super(sci->sc_super,
2408 NILFS_SB_COMMIT);
2414 } 2409 }
2415 up_write(&nilfs->ns_sem); 2410 up_write(&nilfs->ns_sem);
2416 } 2411 }
@@ -2442,16 +2437,15 @@ nilfs_remove_written_gcinodes(struct the_nilfs *nilfs, struct list_head *head)
2442int nilfs_clean_segments(struct super_block *sb, struct nilfs_argv *argv, 2437int nilfs_clean_segments(struct super_block *sb, struct nilfs_argv *argv,
2443 void **kbufs) 2438 void **kbufs)
2444{ 2439{
2445 struct nilfs_sb_info *sbi = NILFS_SB(sb); 2440 struct the_nilfs *nilfs = sb->s_fs_info;
2446 struct nilfs_sc_info *sci = NILFS_SC(sbi); 2441 struct nilfs_sc_info *sci = nilfs->ns_writer;
2447 struct the_nilfs *nilfs = sbi->s_nilfs;
2448 struct nilfs_transaction_info ti; 2442 struct nilfs_transaction_info ti;
2449 int err; 2443 int err;
2450 2444
2451 if (unlikely(!sci)) 2445 if (unlikely(!sci))
2452 return -EROFS; 2446 return -EROFS;
2453 2447
2454 nilfs_transaction_lock(sbi, &ti, 1); 2448 nilfs_transaction_lock(sb, &ti, 1);
2455 2449
2456 err = nilfs_mdt_save_to_shadow_map(nilfs->ns_dat); 2450 err = nilfs_mdt_save_to_shadow_map(nilfs->ns_dat);
2457 if (unlikely(err)) 2451 if (unlikely(err))
@@ -2479,14 +2473,14 @@ int nilfs_clean_segments(struct super_block *sb, struct nilfs_argv *argv,
2479 set_current_state(TASK_INTERRUPTIBLE); 2473 set_current_state(TASK_INTERRUPTIBLE);
2480 schedule_timeout(sci->sc_interval); 2474 schedule_timeout(sci->sc_interval);
2481 } 2475 }
2482 if (nilfs_test_opt(sbi, DISCARD)) { 2476 if (nilfs_test_opt(nilfs, DISCARD)) {
2483 int ret = nilfs_discard_segments(nilfs, sci->sc_freesegs, 2477 int ret = nilfs_discard_segments(nilfs, sci->sc_freesegs,
2484 sci->sc_nfreesegs); 2478 sci->sc_nfreesegs);
2485 if (ret) { 2479 if (ret) {
2486 printk(KERN_WARNING 2480 printk(KERN_WARNING
2487 "NILFS warning: error %d on discard request, " 2481 "NILFS warning: error %d on discard request, "
2488 "turning discards off for the device\n", ret); 2482 "turning discards off for the device\n", ret);
2489 nilfs_clear_opt(sbi, DISCARD); 2483 nilfs_clear_opt(nilfs, DISCARD);
2490 } 2484 }
2491 } 2485 }
2492 2486
@@ -2494,16 +2488,15 @@ int nilfs_clean_segments(struct super_block *sb, struct nilfs_argv *argv,
2494 sci->sc_freesegs = NULL; 2488 sci->sc_freesegs = NULL;
2495 sci->sc_nfreesegs = 0; 2489 sci->sc_nfreesegs = 0;
2496 nilfs_mdt_clear_shadow_map(nilfs->ns_dat); 2490 nilfs_mdt_clear_shadow_map(nilfs->ns_dat);
2497 nilfs_transaction_unlock(sbi); 2491 nilfs_transaction_unlock(sb);
2498 return err; 2492 return err;
2499} 2493}
2500 2494
2501static void nilfs_segctor_thread_construct(struct nilfs_sc_info *sci, int mode) 2495static void nilfs_segctor_thread_construct(struct nilfs_sc_info *sci, int mode)
2502{ 2496{
2503 struct nilfs_sb_info *sbi = sci->sc_sbi;
2504 struct nilfs_transaction_info ti; 2497 struct nilfs_transaction_info ti;
2505 2498
2506 nilfs_transaction_lock(sbi, &ti, 0); 2499 nilfs_transaction_lock(sci->sc_super, &ti, 0);
2507 nilfs_segctor_construct(sci, mode); 2500 nilfs_segctor_construct(sci, mode);
2508 2501
2509 /* 2502 /*
@@ -2514,7 +2507,7 @@ static void nilfs_segctor_thread_construct(struct nilfs_sc_info *sci, int mode)
2514 if (test_bit(NILFS_SC_UNCLOSED, &sci->sc_flags)) 2507 if (test_bit(NILFS_SC_UNCLOSED, &sci->sc_flags))
2515 nilfs_segctor_start_timer(sci); 2508 nilfs_segctor_start_timer(sci);
2516 2509
2517 nilfs_transaction_unlock(sbi); 2510 nilfs_transaction_unlock(sci->sc_super);
2518} 2511}
2519 2512
2520static void nilfs_segctor_do_immediate_flush(struct nilfs_sc_info *sci) 2513static void nilfs_segctor_do_immediate_flush(struct nilfs_sc_info *sci)
@@ -2560,7 +2553,7 @@ static int nilfs_segctor_flush_mode(struct nilfs_sc_info *sci)
2560static int nilfs_segctor_thread(void *arg) 2553static int nilfs_segctor_thread(void *arg)
2561{ 2554{
2562 struct nilfs_sc_info *sci = (struct nilfs_sc_info *)arg; 2555 struct nilfs_sc_info *sci = (struct nilfs_sc_info *)arg;
2563 struct the_nilfs *nilfs = sci->sc_sbi->s_nilfs; 2556 struct the_nilfs *nilfs = sci->sc_super->s_fs_info;
2564 int timeout = 0; 2557 int timeout = 0;
2565 2558
2566 sci->sc_timer.data = (unsigned long)current; 2559 sci->sc_timer.data = (unsigned long)current;
@@ -2671,17 +2664,17 @@ static void nilfs_segctor_kill_thread(struct nilfs_sc_info *sci)
2671/* 2664/*
2672 * Setup & clean-up functions 2665 * Setup & clean-up functions
2673 */ 2666 */
2674static struct nilfs_sc_info *nilfs_segctor_new(struct nilfs_sb_info *sbi, 2667static struct nilfs_sc_info *nilfs_segctor_new(struct super_block *sb,
2675 struct nilfs_root *root) 2668 struct nilfs_root *root)
2676{ 2669{
2670 struct the_nilfs *nilfs = sb->s_fs_info;
2677 struct nilfs_sc_info *sci; 2671 struct nilfs_sc_info *sci;
2678 2672
2679 sci = kzalloc(sizeof(*sci), GFP_KERNEL); 2673 sci = kzalloc(sizeof(*sci), GFP_KERNEL);
2680 if (!sci) 2674 if (!sci)
2681 return NULL; 2675 return NULL;
2682 2676
2683 sci->sc_sbi = sbi; 2677 sci->sc_super = sb;
2684 sci->sc_super = sbi->s_super;
2685 2678
2686 nilfs_get_root(root); 2679 nilfs_get_root(root);
2687 sci->sc_root = root; 2680 sci->sc_root = root;
@@ -2701,10 +2694,10 @@ static struct nilfs_sc_info *nilfs_segctor_new(struct nilfs_sb_info *sbi,
2701 sci->sc_mjcp_freq = HZ * NILFS_SC_DEFAULT_SR_FREQ; 2694 sci->sc_mjcp_freq = HZ * NILFS_SC_DEFAULT_SR_FREQ;
2702 sci->sc_watermark = NILFS_SC_DEFAULT_WATERMARK; 2695 sci->sc_watermark = NILFS_SC_DEFAULT_WATERMARK;
2703 2696
2704 if (sbi->s_interval) 2697 if (nilfs->ns_interval)
2705 sci->sc_interval = sbi->s_interval; 2698 sci->sc_interval = nilfs->ns_interval;
2706 if (sbi->s_watermark) 2699 if (nilfs->ns_watermark)
2707 sci->sc_watermark = sbi->s_watermark; 2700 sci->sc_watermark = nilfs->ns_watermark;
2708 return sci; 2701 return sci;
2709} 2702}
2710 2703
@@ -2715,12 +2708,11 @@ static void nilfs_segctor_write_out(struct nilfs_sc_info *sci)
2715 /* The segctord thread was stopped and its timer was removed. 2708 /* The segctord thread was stopped and its timer was removed.
2716 But some tasks remain. */ 2709 But some tasks remain. */
2717 do { 2710 do {
2718 struct nilfs_sb_info *sbi = sci->sc_sbi;
2719 struct nilfs_transaction_info ti; 2711 struct nilfs_transaction_info ti;
2720 2712
2721 nilfs_transaction_lock(sbi, &ti, 0); 2713 nilfs_transaction_lock(sci->sc_super, &ti, 0);
2722 ret = nilfs_segctor_construct(sci, SC_LSEG_SR); 2714 ret = nilfs_segctor_construct(sci, SC_LSEG_SR);
2723 nilfs_transaction_unlock(sbi); 2715 nilfs_transaction_unlock(sci->sc_super);
2724 2716
2725 } while (ret && retrycount-- > 0); 2717 } while (ret && retrycount-- > 0);
2726} 2718}
@@ -2735,10 +2727,10 @@ static void nilfs_segctor_write_out(struct nilfs_sc_info *sci)
2735 */ 2727 */
2736static void nilfs_segctor_destroy(struct nilfs_sc_info *sci) 2728static void nilfs_segctor_destroy(struct nilfs_sc_info *sci)
2737{ 2729{
2738 struct nilfs_sb_info *sbi = sci->sc_sbi; 2730 struct the_nilfs *nilfs = sci->sc_super->s_fs_info;
2739 int flag; 2731 int flag;
2740 2732
2741 up_write(&sbi->s_nilfs->ns_segctor_sem); 2733 up_write(&nilfs->ns_segctor_sem);
2742 2734
2743 spin_lock(&sci->sc_state_lock); 2735 spin_lock(&sci->sc_state_lock);
2744 nilfs_segctor_kill_thread(sci); 2736 nilfs_segctor_kill_thread(sci);
@@ -2752,9 +2744,9 @@ static void nilfs_segctor_destroy(struct nilfs_sc_info *sci)
2752 WARN_ON(!list_empty(&sci->sc_copied_buffers)); 2744 WARN_ON(!list_empty(&sci->sc_copied_buffers));
2753 2745
2754 if (!list_empty(&sci->sc_dirty_files)) { 2746 if (!list_empty(&sci->sc_dirty_files)) {
2755 nilfs_warning(sbi->s_super, __func__, 2747 nilfs_warning(sci->sc_super, __func__,
2756 "dirty file(s) after the final construction\n"); 2748 "dirty file(s) after the final construction\n");
2757 nilfs_dispose_list(sbi, &sci->sc_dirty_files, 1); 2749 nilfs_dispose_list(nilfs, &sci->sc_dirty_files, 1);
2758 } 2750 }
2759 2751
2760 WARN_ON(!list_empty(&sci->sc_segbufs)); 2752 WARN_ON(!list_empty(&sci->sc_segbufs));
@@ -2762,79 +2754,78 @@ static void nilfs_segctor_destroy(struct nilfs_sc_info *sci)
2762 2754
2763 nilfs_put_root(sci->sc_root); 2755 nilfs_put_root(sci->sc_root);
2764 2756
2765 down_write(&sbi->s_nilfs->ns_segctor_sem); 2757 down_write(&nilfs->ns_segctor_sem);
2766 2758
2767 del_timer_sync(&sci->sc_timer); 2759 del_timer_sync(&sci->sc_timer);
2768 kfree(sci); 2760 kfree(sci);
2769} 2761}
2770 2762
2771/** 2763/**
2772 * nilfs_attach_segment_constructor - attach a segment constructor 2764 * nilfs_attach_log_writer - attach log writer
2773 * @sbi: nilfs_sb_info 2765 * @sb: super block instance
2774 * @root: root object of the current filesystem tree 2766 * @root: root object of the current filesystem tree
2775 * 2767 *
2776 * nilfs_attach_segment_constructor() allocates a struct nilfs_sc_info, 2768 * This allocates a log writer object, initializes it, and starts the
2777 * initializes it, and starts the segment constructor. 2769 * log writer.
2778 * 2770 *
2779 * Return Value: On success, 0 is returned. On error, one of the following 2771 * Return Value: On success, 0 is returned. On error, one of the following
2780 * negative error code is returned. 2772 * negative error code is returned.
2781 * 2773 *
2782 * %-ENOMEM - Insufficient memory available. 2774 * %-ENOMEM - Insufficient memory available.
2783 */ 2775 */
2784int nilfs_attach_segment_constructor(struct nilfs_sb_info *sbi, 2776int nilfs_attach_log_writer(struct super_block *sb, struct nilfs_root *root)
2785 struct nilfs_root *root)
2786{ 2777{
2778 struct the_nilfs *nilfs = sb->s_fs_info;
2787 int err; 2779 int err;
2788 2780
2789 if (NILFS_SC(sbi)) { 2781 if (nilfs->ns_writer) {
2790 /* 2782 /*
2791 * This happens if the filesystem was remounted 2783 * This happens if the filesystem was remounted
2792 * read/write after nilfs_error degenerated it into a 2784 * read/write after nilfs_error degenerated it into a
2793 * read-only mount. 2785 * read-only mount.
2794 */ 2786 */
2795 nilfs_detach_segment_constructor(sbi); 2787 nilfs_detach_log_writer(sb);
2796 } 2788 }
2797 2789
2798 sbi->s_sc_info = nilfs_segctor_new(sbi, root); 2790 nilfs->ns_writer = nilfs_segctor_new(sb, root);
2799 if (!sbi->s_sc_info) 2791 if (!nilfs->ns_writer)
2800 return -ENOMEM; 2792 return -ENOMEM;
2801 2793
2802 err = nilfs_segctor_start_thread(NILFS_SC(sbi)); 2794 err = nilfs_segctor_start_thread(nilfs->ns_writer);
2803 if (err) { 2795 if (err) {
2804 kfree(sbi->s_sc_info); 2796 kfree(nilfs->ns_writer);
2805 sbi->s_sc_info = NULL; 2797 nilfs->ns_writer = NULL;
2806 } 2798 }
2807 return err; 2799 return err;
2808} 2800}
2809 2801
2810/** 2802/**
2811 * nilfs_detach_segment_constructor - destroy the segment constructor 2803 * nilfs_detach_log_writer - destroy log writer
2812 * @sbi: nilfs_sb_info 2804 * @sb: super block instance
2813 * 2805 *
2814 * nilfs_detach_segment_constructor() kills the segment constructor daemon, 2806 * This kills log writer daemon, frees the log writer object, and
2815 * frees the struct nilfs_sc_info, and destroy the dirty file list. 2807 * destroys list of dirty files.
2816 */ 2808 */
2817void nilfs_detach_segment_constructor(struct nilfs_sb_info *sbi) 2809void nilfs_detach_log_writer(struct super_block *sb)
2818{ 2810{
2819 struct the_nilfs *nilfs = sbi->s_nilfs; 2811 struct the_nilfs *nilfs = sb->s_fs_info;
2820 LIST_HEAD(garbage_list); 2812 LIST_HEAD(garbage_list);
2821 2813
2822 down_write(&nilfs->ns_segctor_sem); 2814 down_write(&nilfs->ns_segctor_sem);
2823 if (NILFS_SC(sbi)) { 2815 if (nilfs->ns_writer) {
2824 nilfs_segctor_destroy(NILFS_SC(sbi)); 2816 nilfs_segctor_destroy(nilfs->ns_writer);
2825 sbi->s_sc_info = NULL; 2817 nilfs->ns_writer = NULL;
2826 } 2818 }
2827 2819
2828 /* Force to free the list of dirty files */ 2820 /* Force to free the list of dirty files */
2829 spin_lock(&sbi->s_inode_lock); 2821 spin_lock(&nilfs->ns_inode_lock);
2830 if (!list_empty(&sbi->s_dirty_files)) { 2822 if (!list_empty(&nilfs->ns_dirty_files)) {
2831 list_splice_init(&sbi->s_dirty_files, &garbage_list); 2823 list_splice_init(&nilfs->ns_dirty_files, &garbage_list);
2832 nilfs_warning(sbi->s_super, __func__, 2824 nilfs_warning(sb, __func__,
2833 "Non empty dirty list after the last " 2825 "Hit dirty file after stopped log writer\n");
2834 "segment construction\n"); 2826 }
2835 } 2827 spin_unlock(&nilfs->ns_inode_lock);
2836 spin_unlock(&sbi->s_inode_lock);
2837 up_write(&nilfs->ns_segctor_sem); 2828 up_write(&nilfs->ns_segctor_sem);
2838 2829
2839 nilfs_dispose_list(sbi, &garbage_list, 1); 2830 nilfs_dispose_list(nilfs, &garbage_list, 1);
2840} 2831}
diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h
index cd8056e7cbed..6c02a86745fb 100644
--- a/fs/nilfs2/segment.h
+++ b/fs/nilfs2/segment.h
@@ -27,7 +27,7 @@
27#include <linux/fs.h> 27#include <linux/fs.h>
28#include <linux/buffer_head.h> 28#include <linux/buffer_head.h>
29#include <linux/nilfs2_fs.h> 29#include <linux/nilfs2_fs.h>
30#include "sb.h" 30#include "nilfs.h"
31 31
32struct nilfs_root; 32struct nilfs_root;
33 33
@@ -88,7 +88,6 @@ struct nilfs_segsum_pointer {
88/** 88/**
89 * struct nilfs_sc_info - Segment constructor information 89 * struct nilfs_sc_info - Segment constructor information
90 * @sc_super: Back pointer to super_block struct 90 * @sc_super: Back pointer to super_block struct
91 * @sc_sbi: Back pointer to nilfs_sb_info struct
92 * @sc_root: root object of the current filesystem tree 91 * @sc_root: root object of the current filesystem tree
93 * @sc_nblk_inc: Block count of current generation 92 * @sc_nblk_inc: Block count of current generation
94 * @sc_dirty_files: List of files to be written 93 * @sc_dirty_files: List of files to be written
@@ -131,7 +130,6 @@ struct nilfs_segsum_pointer {
131 */ 130 */
132struct nilfs_sc_info { 131struct nilfs_sc_info {
133 struct super_block *sc_super; 132 struct super_block *sc_super;
134 struct nilfs_sb_info *sc_sbi;
135 struct nilfs_root *sc_root; 133 struct nilfs_root *sc_root;
136 134
137 unsigned long sc_nblk_inc; 135 unsigned long sc_nblk_inc;
@@ -235,18 +233,16 @@ extern void nilfs_flush_segment(struct super_block *, ino_t);
235extern int nilfs_clean_segments(struct super_block *, struct nilfs_argv *, 233extern int nilfs_clean_segments(struct super_block *, struct nilfs_argv *,
236 void **); 234 void **);
237 235
238int nilfs_attach_segment_constructor(struct nilfs_sb_info *sbi, 236int nilfs_attach_log_writer(struct super_block *sb, struct nilfs_root *root);
239 struct nilfs_root *root); 237void nilfs_detach_log_writer(struct super_block *sb);
240extern void nilfs_detach_segment_constructor(struct nilfs_sb_info *);
241 238
242/* recovery.c */ 239/* recovery.c */
243extern int nilfs_read_super_root_block(struct the_nilfs *, sector_t, 240extern int nilfs_read_super_root_block(struct the_nilfs *, sector_t,
244 struct buffer_head **, int); 241 struct buffer_head **, int);
245extern int nilfs_search_super_root(struct the_nilfs *, 242extern int nilfs_search_super_root(struct the_nilfs *,
246 struct nilfs_recovery_info *); 243 struct nilfs_recovery_info *);
247extern int nilfs_salvage_orphan_logs(struct the_nilfs *, 244int nilfs_salvage_orphan_logs(struct the_nilfs *nilfs, struct super_block *sb,
248 struct nilfs_sb_info *, 245 struct nilfs_recovery_info *ri);
249 struct nilfs_recovery_info *);
250extern void nilfs_dispose_segment_list(struct list_head *); 246extern void nilfs_dispose_segment_list(struct list_head *);
251 247
252#endif /* _NILFS_SEGMENT_H */ 248#endif /* _NILFS_SEGMENT_H */
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 0994f6a76c07..062cca065195 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -43,7 +43,6 @@
43#include <linux/init.h> 43#include <linux/init.h>
44#include <linux/blkdev.h> 44#include <linux/blkdev.h>
45#include <linux/parser.h> 45#include <linux/parser.h>
46#include <linux/random.h>
47#include <linux/crc32.h> 46#include <linux/crc32.h>
48#include <linux/vfs.h> 47#include <linux/vfs.h>
49#include <linux/writeback.h> 48#include <linux/writeback.h>
@@ -72,23 +71,23 @@ struct kmem_cache *nilfs_transaction_cachep;
72struct kmem_cache *nilfs_segbuf_cachep; 71struct kmem_cache *nilfs_segbuf_cachep;
73struct kmem_cache *nilfs_btree_path_cache; 72struct kmem_cache *nilfs_btree_path_cache;
74 73
75static int nilfs_setup_super(struct nilfs_sb_info *sbi, int is_mount); 74static int nilfs_setup_super(struct super_block *sb, int is_mount);
76static int nilfs_remount(struct super_block *sb, int *flags, char *data); 75static int nilfs_remount(struct super_block *sb, int *flags, char *data);
77 76
78static void nilfs_set_error(struct nilfs_sb_info *sbi) 77static void nilfs_set_error(struct super_block *sb)
79{ 78{
80 struct the_nilfs *nilfs = sbi->s_nilfs; 79 struct the_nilfs *nilfs = sb->s_fs_info;
81 struct nilfs_super_block **sbp; 80 struct nilfs_super_block **sbp;
82 81
83 down_write(&nilfs->ns_sem); 82 down_write(&nilfs->ns_sem);
84 if (!(nilfs->ns_mount_state & NILFS_ERROR_FS)) { 83 if (!(nilfs->ns_mount_state & NILFS_ERROR_FS)) {
85 nilfs->ns_mount_state |= NILFS_ERROR_FS; 84 nilfs->ns_mount_state |= NILFS_ERROR_FS;
86 sbp = nilfs_prepare_super(sbi, 0); 85 sbp = nilfs_prepare_super(sb, 0);
87 if (likely(sbp)) { 86 if (likely(sbp)) {
88 sbp[0]->s_state |= cpu_to_le16(NILFS_ERROR_FS); 87 sbp[0]->s_state |= cpu_to_le16(NILFS_ERROR_FS);
89 if (sbp[1]) 88 if (sbp[1])
90 sbp[1]->s_state |= cpu_to_le16(NILFS_ERROR_FS); 89 sbp[1]->s_state |= cpu_to_le16(NILFS_ERROR_FS);
91 nilfs_commit_super(sbi, NILFS_SB_COMMIT_ALL); 90 nilfs_commit_super(sb, NILFS_SB_COMMIT_ALL);
92 } 91 }
93 } 92 }
94 up_write(&nilfs->ns_sem); 93 up_write(&nilfs->ns_sem);
@@ -109,7 +108,7 @@ static void nilfs_set_error(struct nilfs_sb_info *sbi)
109void nilfs_error(struct super_block *sb, const char *function, 108void nilfs_error(struct super_block *sb, const char *function,
110 const char *fmt, ...) 109 const char *fmt, ...)
111{ 110{
112 struct nilfs_sb_info *sbi = NILFS_SB(sb); 111 struct the_nilfs *nilfs = sb->s_fs_info;
113 struct va_format vaf; 112 struct va_format vaf;
114 va_list args; 113 va_list args;
115 114
@@ -124,15 +123,15 @@ void nilfs_error(struct super_block *sb, const char *function,
124 va_end(args); 123 va_end(args);
125 124
126 if (!(sb->s_flags & MS_RDONLY)) { 125 if (!(sb->s_flags & MS_RDONLY)) {
127 nilfs_set_error(sbi); 126 nilfs_set_error(sb);
128 127
129 if (nilfs_test_opt(sbi, ERRORS_RO)) { 128 if (nilfs_test_opt(nilfs, ERRORS_RO)) {
130 printk(KERN_CRIT "Remounting filesystem read-only\n"); 129 printk(KERN_CRIT "Remounting filesystem read-only\n");
131 sb->s_flags |= MS_RDONLY; 130 sb->s_flags |= MS_RDONLY;
132 } 131 }
133 } 132 }
134 133
135 if (nilfs_test_opt(sbi, ERRORS_PANIC)) 134 if (nilfs_test_opt(nilfs, ERRORS_PANIC))
136 panic("NILFS (device %s): panic forced after error\n", 135 panic("NILFS (device %s): panic forced after error\n",
137 sb->s_id); 136 sb->s_id);
138} 137}
@@ -189,14 +188,14 @@ void nilfs_destroy_inode(struct inode *inode)
189 call_rcu(&inode->i_rcu, nilfs_i_callback); 188 call_rcu(&inode->i_rcu, nilfs_i_callback);
190} 189}
191 190
192static int nilfs_sync_super(struct nilfs_sb_info *sbi, int flag) 191static int nilfs_sync_super(struct super_block *sb, int flag)
193{ 192{
194 struct the_nilfs *nilfs = sbi->s_nilfs; 193 struct the_nilfs *nilfs = sb->s_fs_info;
195 int err; 194 int err;
196 195
197 retry: 196 retry:
198 set_buffer_dirty(nilfs->ns_sbh[0]); 197 set_buffer_dirty(nilfs->ns_sbh[0]);
199 if (nilfs_test_opt(sbi, BARRIER)) { 198 if (nilfs_test_opt(nilfs, BARRIER)) {
200 err = __sync_dirty_buffer(nilfs->ns_sbh[0], 199 err = __sync_dirty_buffer(nilfs->ns_sbh[0],
201 WRITE_SYNC | WRITE_FLUSH_FUA); 200 WRITE_SYNC | WRITE_FLUSH_FUA);
202 } else { 201 } else {
@@ -263,10 +262,10 @@ void nilfs_set_log_cursor(struct nilfs_super_block *sbp,
263 spin_unlock(&nilfs->ns_last_segment_lock); 262 spin_unlock(&nilfs->ns_last_segment_lock);
264} 263}
265 264
266struct nilfs_super_block **nilfs_prepare_super(struct nilfs_sb_info *sbi, 265struct nilfs_super_block **nilfs_prepare_super(struct super_block *sb,
267 int flip) 266 int flip)
268{ 267{
269 struct the_nilfs *nilfs = sbi->s_nilfs; 268 struct the_nilfs *nilfs = sb->s_fs_info;
270 struct nilfs_super_block **sbp = nilfs->ns_sbp; 269 struct nilfs_super_block **sbp = nilfs->ns_sbp;
271 270
272 /* nilfs->ns_sem must be locked by the caller. */ 271 /* nilfs->ns_sem must be locked by the caller. */
@@ -276,7 +275,7 @@ struct nilfs_super_block **nilfs_prepare_super(struct nilfs_sb_info *sbi,
276 memcpy(sbp[0], sbp[1], nilfs->ns_sbsize); 275 memcpy(sbp[0], sbp[1], nilfs->ns_sbsize);
277 } else { 276 } else {
278 printk(KERN_CRIT "NILFS: superblock broke on dev %s\n", 277 printk(KERN_CRIT "NILFS: superblock broke on dev %s\n",
279 sbi->s_super->s_id); 278 sb->s_id);
280 return NULL; 279 return NULL;
281 } 280 }
282 } else if (sbp[1] && 281 } else if (sbp[1] &&
@@ -290,9 +289,9 @@ struct nilfs_super_block **nilfs_prepare_super(struct nilfs_sb_info *sbi,
290 return sbp; 289 return sbp;
291} 290}
292 291
293int nilfs_commit_super(struct nilfs_sb_info *sbi, int flag) 292int nilfs_commit_super(struct super_block *sb, int flag)
294{ 293{
295 struct the_nilfs *nilfs = sbi->s_nilfs; 294 struct the_nilfs *nilfs = sb->s_fs_info;
296 struct nilfs_super_block **sbp = nilfs->ns_sbp; 295 struct nilfs_super_block **sbp = nilfs->ns_sbp;
297 time_t t; 296 time_t t;
298 297
@@ -312,27 +311,28 @@ int nilfs_commit_super(struct nilfs_sb_info *sbi, int flag)
312 nilfs->ns_sbsize)); 311 nilfs->ns_sbsize));
313 } 312 }
314 clear_nilfs_sb_dirty(nilfs); 313 clear_nilfs_sb_dirty(nilfs);
315 return nilfs_sync_super(sbi, flag); 314 return nilfs_sync_super(sb, flag);
316} 315}
317 316
318/** 317/**
319 * nilfs_cleanup_super() - write filesystem state for cleanup 318 * nilfs_cleanup_super() - write filesystem state for cleanup
320 * @sbi: nilfs_sb_info to be unmounted or degraded to read-only 319 * @sb: super block instance to be unmounted or degraded to read-only
321 * 320 *
322 * This function restores state flags in the on-disk super block. 321 * This function restores state flags in the on-disk super block.
323 * This will set "clean" flag (i.e. NILFS_VALID_FS) unless the 322 * This will set "clean" flag (i.e. NILFS_VALID_FS) unless the
324 * filesystem was not clean previously. 323 * filesystem was not clean previously.
325 */ 324 */
326int nilfs_cleanup_super(struct nilfs_sb_info *sbi) 325int nilfs_cleanup_super(struct super_block *sb)
327{ 326{
327 struct the_nilfs *nilfs = sb->s_fs_info;
328 struct nilfs_super_block **sbp; 328 struct nilfs_super_block **sbp;
329 int flag = NILFS_SB_COMMIT; 329 int flag = NILFS_SB_COMMIT;
330 int ret = -EIO; 330 int ret = -EIO;
331 331
332 sbp = nilfs_prepare_super(sbi, 0); 332 sbp = nilfs_prepare_super(sb, 0);
333 if (sbp) { 333 if (sbp) {
334 sbp[0]->s_state = cpu_to_le16(sbi->s_nilfs->ns_mount_state); 334 sbp[0]->s_state = cpu_to_le16(nilfs->ns_mount_state);
335 nilfs_set_log_cursor(sbp[0], sbi->s_nilfs); 335 nilfs_set_log_cursor(sbp[0], nilfs);
336 if (sbp[1] && sbp[0]->s_last_cno == sbp[1]->s_last_cno) { 336 if (sbp[1] && sbp[0]->s_last_cno == sbp[1]->s_last_cno) {
337 /* 337 /*
338 * make the "clean" flag also to the opposite 338 * make the "clean" flag also to the opposite
@@ -342,21 +342,20 @@ int nilfs_cleanup_super(struct nilfs_sb_info *sbi)
342 sbp[1]->s_state = sbp[0]->s_state; 342 sbp[1]->s_state = sbp[0]->s_state;
343 flag = NILFS_SB_COMMIT_ALL; 343 flag = NILFS_SB_COMMIT_ALL;
344 } 344 }
345 ret = nilfs_commit_super(sbi, flag); 345 ret = nilfs_commit_super(sb, flag);
346 } 346 }
347 return ret; 347 return ret;
348} 348}
349 349
350static void nilfs_put_super(struct super_block *sb) 350static void nilfs_put_super(struct super_block *sb)
351{ 351{
352 struct nilfs_sb_info *sbi = NILFS_SB(sb); 352 struct the_nilfs *nilfs = sb->s_fs_info;
353 struct the_nilfs *nilfs = sbi->s_nilfs;
354 353
355 nilfs_detach_segment_constructor(sbi); 354 nilfs_detach_log_writer(sb);
356 355
357 if (!(sb->s_flags & MS_RDONLY)) { 356 if (!(sb->s_flags & MS_RDONLY)) {
358 down_write(&nilfs->ns_sem); 357 down_write(&nilfs->ns_sem);
359 nilfs_cleanup_super(sbi); 358 nilfs_cleanup_super(sb);
360 up_write(&nilfs->ns_sem); 359 up_write(&nilfs->ns_sem);
361 } 360 }
362 361
@@ -365,15 +364,12 @@ static void nilfs_put_super(struct super_block *sb)
365 iput(nilfs->ns_dat); 364 iput(nilfs->ns_dat);
366 365
367 destroy_nilfs(nilfs); 366 destroy_nilfs(nilfs);
368 sbi->s_super = NULL;
369 sb->s_fs_info = NULL; 367 sb->s_fs_info = NULL;
370 kfree(sbi);
371} 368}
372 369
373static int nilfs_sync_fs(struct super_block *sb, int wait) 370static int nilfs_sync_fs(struct super_block *sb, int wait)
374{ 371{
375 struct nilfs_sb_info *sbi = NILFS_SB(sb); 372 struct the_nilfs *nilfs = sb->s_fs_info;
376 struct the_nilfs *nilfs = sbi->s_nilfs;
377 struct nilfs_super_block **sbp; 373 struct nilfs_super_block **sbp;
378 int err = 0; 374 int err = 0;
379 375
@@ -383,10 +379,10 @@ static int nilfs_sync_fs(struct super_block *sb, int wait)
383 379
384 down_write(&nilfs->ns_sem); 380 down_write(&nilfs->ns_sem);
385 if (nilfs_sb_dirty(nilfs)) { 381 if (nilfs_sb_dirty(nilfs)) {
386 sbp = nilfs_prepare_super(sbi, nilfs_sb_will_flip(nilfs)); 382 sbp = nilfs_prepare_super(sb, nilfs_sb_will_flip(nilfs));
387 if (likely(sbp)) { 383 if (likely(sbp)) {
388 nilfs_set_log_cursor(sbp[0], nilfs); 384 nilfs_set_log_cursor(sbp[0], nilfs);
389 nilfs_commit_super(sbi, NILFS_SB_COMMIT); 385 nilfs_commit_super(sb, NILFS_SB_COMMIT);
390 } 386 }
391 } 387 }
392 up_write(&nilfs->ns_sem); 388 up_write(&nilfs->ns_sem);
@@ -394,10 +390,10 @@ static int nilfs_sync_fs(struct super_block *sb, int wait)
394 return err; 390 return err;
395} 391}
396 392
397int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno, int curr_mnt, 393int nilfs_attach_checkpoint(struct super_block *sb, __u64 cno, int curr_mnt,
398 struct nilfs_root **rootp) 394 struct nilfs_root **rootp)
399{ 395{
400 struct the_nilfs *nilfs = sbi->s_nilfs; 396 struct the_nilfs *nilfs = sb->s_fs_info;
401 struct nilfs_root *root; 397 struct nilfs_root *root;
402 struct nilfs_checkpoint *raw_cp; 398 struct nilfs_checkpoint *raw_cp;
403 struct buffer_head *bh_cp; 399 struct buffer_head *bh_cp;
@@ -426,7 +422,7 @@ int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno, int curr_mnt,
426 goto failed; 422 goto failed;
427 } 423 }
428 424
429 err = nilfs_ifile_read(sbi->s_super, root, nilfs->ns_inode_size, 425 err = nilfs_ifile_read(sb, root, nilfs->ns_inode_size,
430 &raw_cp->cp_ifile_inode, &root->ifile); 426 &raw_cp->cp_ifile_inode, &root->ifile);
431 if (err) 427 if (err)
432 goto failed_bh; 428 goto failed_bh;
@@ -450,8 +446,7 @@ int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno, int curr_mnt,
450 446
451static int nilfs_freeze(struct super_block *sb) 447static int nilfs_freeze(struct super_block *sb)
452{ 448{
453 struct nilfs_sb_info *sbi = NILFS_SB(sb); 449 struct the_nilfs *nilfs = sb->s_fs_info;
454 struct the_nilfs *nilfs = sbi->s_nilfs;
455 int err; 450 int err;
456 451
457 if (sb->s_flags & MS_RDONLY) 452 if (sb->s_flags & MS_RDONLY)
@@ -459,21 +454,20 @@ static int nilfs_freeze(struct super_block *sb)
459 454
460 /* Mark super block clean */ 455 /* Mark super block clean */
461 down_write(&nilfs->ns_sem); 456 down_write(&nilfs->ns_sem);
462 err = nilfs_cleanup_super(sbi); 457 err = nilfs_cleanup_super(sb);
463 up_write(&nilfs->ns_sem); 458 up_write(&nilfs->ns_sem);
464 return err; 459 return err;
465} 460}
466 461
467static int nilfs_unfreeze(struct super_block *sb) 462static int nilfs_unfreeze(struct super_block *sb)
468{ 463{
469 struct nilfs_sb_info *sbi = NILFS_SB(sb); 464 struct the_nilfs *nilfs = sb->s_fs_info;
470 struct the_nilfs *nilfs = sbi->s_nilfs;
471 465
472 if (sb->s_flags & MS_RDONLY) 466 if (sb->s_flags & MS_RDONLY)
473 return 0; 467 return 0;
474 468
475 down_write(&nilfs->ns_sem); 469 down_write(&nilfs->ns_sem);
476 nilfs_setup_super(sbi, false); 470 nilfs_setup_super(sb, false);
477 up_write(&nilfs->ns_sem); 471 up_write(&nilfs->ns_sem);
478 return 0; 472 return 0;
479} 473}
@@ -530,22 +524,22 @@ static int nilfs_statfs(struct dentry *dentry, struct kstatfs *buf)
530static int nilfs_show_options(struct seq_file *seq, struct vfsmount *vfs) 524static int nilfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
531{ 525{
532 struct super_block *sb = vfs->mnt_sb; 526 struct super_block *sb = vfs->mnt_sb;
533 struct nilfs_sb_info *sbi = NILFS_SB(sb); 527 struct the_nilfs *nilfs = sb->s_fs_info;
534 struct nilfs_root *root = NILFS_I(vfs->mnt_root->d_inode)->i_root; 528 struct nilfs_root *root = NILFS_I(vfs->mnt_root->d_inode)->i_root;
535 529
536 if (!nilfs_test_opt(sbi, BARRIER)) 530 if (!nilfs_test_opt(nilfs, BARRIER))
537 seq_puts(seq, ",nobarrier"); 531 seq_puts(seq, ",nobarrier");
538 if (root->cno != NILFS_CPTREE_CURRENT_CNO) 532 if (root->cno != NILFS_CPTREE_CURRENT_CNO)
539 seq_printf(seq, ",cp=%llu", (unsigned long long)root->cno); 533 seq_printf(seq, ",cp=%llu", (unsigned long long)root->cno);
540 if (nilfs_test_opt(sbi, ERRORS_PANIC)) 534 if (nilfs_test_opt(nilfs, ERRORS_PANIC))
541 seq_puts(seq, ",errors=panic"); 535 seq_puts(seq, ",errors=panic");
542 if (nilfs_test_opt(sbi, ERRORS_CONT)) 536 if (nilfs_test_opt(nilfs, ERRORS_CONT))
543 seq_puts(seq, ",errors=continue"); 537 seq_puts(seq, ",errors=continue");
544 if (nilfs_test_opt(sbi, STRICT_ORDER)) 538 if (nilfs_test_opt(nilfs, STRICT_ORDER))
545 seq_puts(seq, ",order=strict"); 539 seq_puts(seq, ",order=strict");
546 if (nilfs_test_opt(sbi, NORECOVERY)) 540 if (nilfs_test_opt(nilfs, NORECOVERY))
547 seq_puts(seq, ",norecovery"); 541 seq_puts(seq, ",norecovery");
548 if (nilfs_test_opt(sbi, DISCARD)) 542 if (nilfs_test_opt(nilfs, DISCARD))
549 seq_puts(seq, ",discard"); 543 seq_puts(seq, ",discard");
550 544
551 return 0; 545 return 0;
@@ -594,7 +588,7 @@ static match_table_t tokens = {
594 588
595static int parse_options(char *options, struct super_block *sb, int is_remount) 589static int parse_options(char *options, struct super_block *sb, int is_remount)
596{ 590{
597 struct nilfs_sb_info *sbi = NILFS_SB(sb); 591 struct the_nilfs *nilfs = sb->s_fs_info;
598 char *p; 592 char *p;
599 substring_t args[MAX_OPT_ARGS]; 593 substring_t args[MAX_OPT_ARGS];
600 594
@@ -609,29 +603,29 @@ static int parse_options(char *options, struct super_block *sb, int is_remount)
609 token = match_token(p, tokens, args); 603 token = match_token(p, tokens, args);
610 switch (token) { 604 switch (token) {
611 case Opt_barrier: 605 case Opt_barrier:
612 nilfs_set_opt(sbi, BARRIER); 606 nilfs_set_opt(nilfs, BARRIER);
613 break; 607 break;
614 case Opt_nobarrier: 608 case Opt_nobarrier:
615 nilfs_clear_opt(sbi, BARRIER); 609 nilfs_clear_opt(nilfs, BARRIER);
616 break; 610 break;
617 case Opt_order: 611 case Opt_order:
618 if (strcmp(args[0].from, "relaxed") == 0) 612 if (strcmp(args[0].from, "relaxed") == 0)
619 /* Ordered data semantics */ 613 /* Ordered data semantics */
620 nilfs_clear_opt(sbi, STRICT_ORDER); 614 nilfs_clear_opt(nilfs, STRICT_ORDER);
621 else if (strcmp(args[0].from, "strict") == 0) 615 else if (strcmp(args[0].from, "strict") == 0)
622 /* Strict in-order semantics */ 616 /* Strict in-order semantics */
623 nilfs_set_opt(sbi, STRICT_ORDER); 617 nilfs_set_opt(nilfs, STRICT_ORDER);
624 else 618 else
625 return 0; 619 return 0;
626 break; 620 break;
627 case Opt_err_panic: 621 case Opt_err_panic:
628 nilfs_write_opt(sbi, ERROR_MODE, ERRORS_PANIC); 622 nilfs_write_opt(nilfs, ERROR_MODE, ERRORS_PANIC);
629 break; 623 break;
630 case Opt_err_ro: 624 case Opt_err_ro:
631 nilfs_write_opt(sbi, ERROR_MODE, ERRORS_RO); 625 nilfs_write_opt(nilfs, ERROR_MODE, ERRORS_RO);
632 break; 626 break;
633 case Opt_err_cont: 627 case Opt_err_cont:
634 nilfs_write_opt(sbi, ERROR_MODE, ERRORS_CONT); 628 nilfs_write_opt(nilfs, ERROR_MODE, ERRORS_CONT);
635 break; 629 break;
636 case Opt_snapshot: 630 case Opt_snapshot:
637 if (is_remount) { 631 if (is_remount) {
@@ -642,13 +636,13 @@ static int parse_options(char *options, struct super_block *sb, int is_remount)
642 } 636 }
643 break; 637 break;
644 case Opt_norecovery: 638 case Opt_norecovery:
645 nilfs_set_opt(sbi, NORECOVERY); 639 nilfs_set_opt(nilfs, NORECOVERY);
646 break; 640 break;
647 case Opt_discard: 641 case Opt_discard:
648 nilfs_set_opt(sbi, DISCARD); 642 nilfs_set_opt(nilfs, DISCARD);
649 break; 643 break;
650 case Opt_nodiscard: 644 case Opt_nodiscard:
651 nilfs_clear_opt(sbi, DISCARD); 645 nilfs_clear_opt(nilfs, DISCARD);
652 break; 646 break;
653 default: 647 default:
654 printk(KERN_ERR 648 printk(KERN_ERR
@@ -660,22 +654,24 @@ static int parse_options(char *options, struct super_block *sb, int is_remount)
660} 654}
661 655
662static inline void 656static inline void
663nilfs_set_default_options(struct nilfs_sb_info *sbi, 657nilfs_set_default_options(struct super_block *sb,
664 struct nilfs_super_block *sbp) 658 struct nilfs_super_block *sbp)
665{ 659{
666 sbi->s_mount_opt = 660 struct the_nilfs *nilfs = sb->s_fs_info;
661
662 nilfs->ns_mount_opt =
667 NILFS_MOUNT_ERRORS_RO | NILFS_MOUNT_BARRIER; 663 NILFS_MOUNT_ERRORS_RO | NILFS_MOUNT_BARRIER;
668} 664}
669 665
670static int nilfs_setup_super(struct nilfs_sb_info *sbi, int is_mount) 666static int nilfs_setup_super(struct super_block *sb, int is_mount)
671{ 667{
672 struct the_nilfs *nilfs = sbi->s_nilfs; 668 struct the_nilfs *nilfs = sb->s_fs_info;
673 struct nilfs_super_block **sbp; 669 struct nilfs_super_block **sbp;
674 int max_mnt_count; 670 int max_mnt_count;
675 int mnt_count; 671 int mnt_count;
676 672
677 /* nilfs->ns_sem must be locked by the caller. */ 673 /* nilfs->ns_sem must be locked by the caller. */
678 sbp = nilfs_prepare_super(sbi, 0); 674 sbp = nilfs_prepare_super(sb, 0);
679 if (!sbp) 675 if (!sbp)
680 return -EIO; 676 return -EIO;
681 677
@@ -704,8 +700,9 @@ skip_mount_setup:
704 sbp[0]->s_state = 700 sbp[0]->s_state =
705 cpu_to_le16(le16_to_cpu(sbp[0]->s_state) & ~NILFS_VALID_FS); 701 cpu_to_le16(le16_to_cpu(sbp[0]->s_state) & ~NILFS_VALID_FS);
706 /* synchronize sbp[1] with sbp[0] */ 702 /* synchronize sbp[1] with sbp[0] */
707 memcpy(sbp[1], sbp[0], nilfs->ns_sbsize); 703 if (sbp[1])
708 return nilfs_commit_super(sbi, NILFS_SB_COMMIT_ALL); 704 memcpy(sbp[1], sbp[0], nilfs->ns_sbsize);
705 return nilfs_commit_super(sb, NILFS_SB_COMMIT_ALL);
709} 706}
710 707
711struct nilfs_super_block *nilfs_read_super_block(struct super_block *sb, 708struct nilfs_super_block *nilfs_read_super_block(struct super_block *sb,
@@ -726,7 +723,7 @@ int nilfs_store_magic_and_option(struct super_block *sb,
726 struct nilfs_super_block *sbp, 723 struct nilfs_super_block *sbp,
727 char *data) 724 char *data)
728{ 725{
729 struct nilfs_sb_info *sbi = NILFS_SB(sb); 726 struct the_nilfs *nilfs = sb->s_fs_info;
730 727
731 sb->s_magic = le16_to_cpu(sbp->s_magic); 728 sb->s_magic = le16_to_cpu(sbp->s_magic);
732 729
@@ -735,12 +732,12 @@ int nilfs_store_magic_and_option(struct super_block *sb,
735 sb->s_flags |= MS_NOATIME; 732 sb->s_flags |= MS_NOATIME;
736#endif 733#endif
737 734
738 nilfs_set_default_options(sbi, sbp); 735 nilfs_set_default_options(sb, sbp);
739 736
740 sbi->s_resuid = le16_to_cpu(sbp->s_def_resuid); 737 nilfs->ns_resuid = le16_to_cpu(sbp->s_def_resuid);
741 sbi->s_resgid = le16_to_cpu(sbp->s_def_resgid); 738 nilfs->ns_resgid = le16_to_cpu(sbp->s_def_resgid);
742 sbi->s_interval = le32_to_cpu(sbp->s_c_interval); 739 nilfs->ns_interval = le32_to_cpu(sbp->s_c_interval);
743 sbi->s_watermark = le32_to_cpu(sbp->s_c_block_max); 740 nilfs->ns_watermark = le32_to_cpu(sbp->s_c_block_max);
744 741
745 return !parse_options(data, sb, 0) ? -EINVAL : 0 ; 742 return !parse_options(data, sb, 0) ? -EINVAL : 0 ;
746} 743}
@@ -821,7 +818,7 @@ static int nilfs_get_root_dentry(struct super_block *sb,
821static int nilfs_attach_snapshot(struct super_block *s, __u64 cno, 818static int nilfs_attach_snapshot(struct super_block *s, __u64 cno,
822 struct dentry **root_dentry) 819 struct dentry **root_dentry)
823{ 820{
824 struct the_nilfs *nilfs = NILFS_SB(s)->s_nilfs; 821 struct the_nilfs *nilfs = s->s_fs_info;
825 struct nilfs_root *root; 822 struct nilfs_root *root;
826 int ret; 823 int ret;
827 824
@@ -839,7 +836,7 @@ static int nilfs_attach_snapshot(struct super_block *s, __u64 cno,
839 goto out; 836 goto out;
840 } 837 }
841 838
842 ret = nilfs_attach_checkpoint(NILFS_SB(s), cno, false, &root); 839 ret = nilfs_attach_checkpoint(s, cno, false, &root);
843 if (ret) { 840 if (ret) {
844 printk(KERN_ERR "NILFS: error loading snapshot " 841 printk(KERN_ERR "NILFS: error loading snapshot "
845 "(checkpoint number=%llu).\n", 842 "(checkpoint number=%llu).\n",
@@ -873,7 +870,7 @@ static int nilfs_try_to_shrink_tree(struct dentry *root_dentry)
873 870
874int nilfs_checkpoint_is_mounted(struct super_block *sb, __u64 cno) 871int nilfs_checkpoint_is_mounted(struct super_block *sb, __u64 cno)
875{ 872{
876 struct the_nilfs *nilfs = NILFS_SB(sb)->s_nilfs; 873 struct the_nilfs *nilfs = sb->s_fs_info;
877 struct nilfs_root *root; 874 struct nilfs_root *root;
878 struct inode *inode; 875 struct inode *inode;
879 struct dentry *dentry; 876 struct dentry *dentry;
@@ -886,7 +883,7 @@ int nilfs_checkpoint_is_mounted(struct super_block *sb, __u64 cno)
886 return true; /* protect recent checkpoints */ 883 return true; /* protect recent checkpoints */
887 884
888 ret = false; 885 ret = false;
889 root = nilfs_lookup_root(NILFS_SB(sb)->s_nilfs, cno); 886 root = nilfs_lookup_root(nilfs, cno);
890 if (root) { 887 if (root) {
891 inode = nilfs_ilookup(sb, root, NILFS_ROOT_INO); 888 inode = nilfs_ilookup(sb, root, NILFS_ROOT_INO);
892 if (inode) { 889 if (inode) {
@@ -916,43 +913,21 @@ static int
916nilfs_fill_super(struct super_block *sb, void *data, int silent) 913nilfs_fill_super(struct super_block *sb, void *data, int silent)
917{ 914{
918 struct the_nilfs *nilfs; 915 struct the_nilfs *nilfs;
919 struct nilfs_sb_info *sbi;
920 struct nilfs_root *fsroot; 916 struct nilfs_root *fsroot;
921 struct backing_dev_info *bdi; 917 struct backing_dev_info *bdi;
922 __u64 cno; 918 __u64 cno;
923 int err; 919 int err;
924 920
925 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); 921 nilfs = alloc_nilfs(sb->s_bdev);
926 if (!sbi) 922 if (!nilfs)
927 return -ENOMEM; 923 return -ENOMEM;
928 924
929 sb->s_fs_info = sbi; 925 sb->s_fs_info = nilfs;
930 sbi->s_super = sb;
931
932 nilfs = alloc_nilfs(sb->s_bdev);
933 if (!nilfs) {
934 err = -ENOMEM;
935 goto failed_sbi;
936 }
937 sbi->s_nilfs = nilfs;
938 926
939 err = init_nilfs(nilfs, sbi, (char *)data); 927 err = init_nilfs(nilfs, sb, (char *)data);
940 if (err) 928 if (err)
941 goto failed_nilfs; 929 goto failed_nilfs;
942 930
943 spin_lock_init(&sbi->s_inode_lock);
944 INIT_LIST_HEAD(&sbi->s_dirty_files);
945
946 /*
947 * Following initialization is overlapped because
948 * nilfs_sb_info structure has been cleared at the beginning.
949 * But we reserve them to keep our interest and make ready
950 * for the future change.
951 */
952 get_random_bytes(&sbi->s_next_generation,
953 sizeof(sbi->s_next_generation));
954 spin_lock_init(&sbi->s_next_gen_lock);
955
956 sb->s_op = &nilfs_sops; 931 sb->s_op = &nilfs_sops;
957 sb->s_export_op = &nilfs_export_ops; 932 sb->s_export_op = &nilfs_export_ops;
958 sb->s_root = NULL; 933 sb->s_root = NULL;
@@ -961,12 +936,12 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent)
961 bdi = sb->s_bdev->bd_inode->i_mapping->backing_dev_info; 936 bdi = sb->s_bdev->bd_inode->i_mapping->backing_dev_info;
962 sb->s_bdi = bdi ? : &default_backing_dev_info; 937 sb->s_bdi = bdi ? : &default_backing_dev_info;
963 938
964 err = load_nilfs(nilfs, sbi); 939 err = load_nilfs(nilfs, sb);
965 if (err) 940 if (err)
966 goto failed_nilfs; 941 goto failed_nilfs;
967 942
968 cno = nilfs_last_cno(nilfs); 943 cno = nilfs_last_cno(nilfs);
969 err = nilfs_attach_checkpoint(sbi, cno, true, &fsroot); 944 err = nilfs_attach_checkpoint(sb, cno, true, &fsroot);
970 if (err) { 945 if (err) {
971 printk(KERN_ERR "NILFS: error loading last checkpoint " 946 printk(KERN_ERR "NILFS: error loading last checkpoint "
972 "(checkpoint number=%llu).\n", (unsigned long long)cno); 947 "(checkpoint number=%llu).\n", (unsigned long long)cno);
@@ -974,7 +949,7 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent)
974 } 949 }
975 950
976 if (!(sb->s_flags & MS_RDONLY)) { 951 if (!(sb->s_flags & MS_RDONLY)) {
977 err = nilfs_attach_segment_constructor(sbi, fsroot); 952 err = nilfs_attach_log_writer(sb, fsroot);
978 if (err) 953 if (err)
979 goto failed_checkpoint; 954 goto failed_checkpoint;
980 } 955 }
@@ -987,14 +962,14 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent)
987 962
988 if (!(sb->s_flags & MS_RDONLY)) { 963 if (!(sb->s_flags & MS_RDONLY)) {
989 down_write(&nilfs->ns_sem); 964 down_write(&nilfs->ns_sem);
990 nilfs_setup_super(sbi, true); 965 nilfs_setup_super(sb, true);
991 up_write(&nilfs->ns_sem); 966 up_write(&nilfs->ns_sem);
992 } 967 }
993 968
994 return 0; 969 return 0;
995 970
996 failed_segctor: 971 failed_segctor:
997 nilfs_detach_segment_constructor(sbi); 972 nilfs_detach_log_writer(sb);
998 973
999 failed_checkpoint: 974 failed_checkpoint:
1000 nilfs_put_root(fsroot); 975 nilfs_put_root(fsroot);
@@ -1006,23 +981,18 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent)
1006 981
1007 failed_nilfs: 982 failed_nilfs:
1008 destroy_nilfs(nilfs); 983 destroy_nilfs(nilfs);
1009
1010 failed_sbi:
1011 sb->s_fs_info = NULL;
1012 kfree(sbi);
1013 return err; 984 return err;
1014} 985}
1015 986
1016static int nilfs_remount(struct super_block *sb, int *flags, char *data) 987static int nilfs_remount(struct super_block *sb, int *flags, char *data)
1017{ 988{
1018 struct nilfs_sb_info *sbi = NILFS_SB(sb); 989 struct the_nilfs *nilfs = sb->s_fs_info;
1019 struct the_nilfs *nilfs = sbi->s_nilfs;
1020 unsigned long old_sb_flags; 990 unsigned long old_sb_flags;
1021 unsigned long old_mount_opt; 991 unsigned long old_mount_opt;
1022 int err; 992 int err;
1023 993
1024 old_sb_flags = sb->s_flags; 994 old_sb_flags = sb->s_flags;
1025 old_mount_opt = sbi->s_mount_opt; 995 old_mount_opt = nilfs->ns_mount_opt;
1026 996
1027 if (!parse_options(data, sb, 1)) { 997 if (!parse_options(data, sb, 1)) {
1028 err = -EINVAL; 998 err = -EINVAL;
@@ -1042,8 +1012,8 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
1042 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) 1012 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
1043 goto out; 1013 goto out;
1044 if (*flags & MS_RDONLY) { 1014 if (*flags & MS_RDONLY) {
1045 /* Shutting down the segment constructor */ 1015 /* Shutting down log writer */
1046 nilfs_detach_segment_constructor(sbi); 1016 nilfs_detach_log_writer(sb);
1047 sb->s_flags |= MS_RDONLY; 1017 sb->s_flags |= MS_RDONLY;
1048 1018
1049 /* 1019 /*
@@ -1051,7 +1021,7 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
1051 * the RDONLY flag and then mark the partition as valid again. 1021 * the RDONLY flag and then mark the partition as valid again.
1052 */ 1022 */
1053 down_write(&nilfs->ns_sem); 1023 down_write(&nilfs->ns_sem);
1054 nilfs_cleanup_super(sbi); 1024 nilfs_cleanup_super(sb);
1055 up_write(&nilfs->ns_sem); 1025 up_write(&nilfs->ns_sem);
1056 } else { 1026 } else {
1057 __u64 features; 1027 __u64 features;
@@ -1078,12 +1048,12 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
1078 sb->s_flags &= ~MS_RDONLY; 1048 sb->s_flags &= ~MS_RDONLY;
1079 1049
1080 root = NILFS_I(sb->s_root->d_inode)->i_root; 1050 root = NILFS_I(sb->s_root->d_inode)->i_root;
1081 err = nilfs_attach_segment_constructor(sbi, root); 1051 err = nilfs_attach_log_writer(sb, root);
1082 if (err) 1052 if (err)
1083 goto restore_opts; 1053 goto restore_opts;
1084 1054
1085 down_write(&nilfs->ns_sem); 1055 down_write(&nilfs->ns_sem);
1086 nilfs_setup_super(sbi, true); 1056 nilfs_setup_super(sb, true);
1087 up_write(&nilfs->ns_sem); 1057 up_write(&nilfs->ns_sem);
1088 } 1058 }
1089 out: 1059 out:
@@ -1091,13 +1061,12 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
1091 1061
1092 restore_opts: 1062 restore_opts:
1093 sb->s_flags = old_sb_flags; 1063 sb->s_flags = old_sb_flags;
1094 sbi->s_mount_opt = old_mount_opt; 1064 nilfs->ns_mount_opt = old_mount_opt;
1095 return err; 1065 return err;
1096} 1066}
1097 1067
1098struct nilfs_super_data { 1068struct nilfs_super_data {
1099 struct block_device *bdev; 1069 struct block_device *bdev;
1100 struct nilfs_sb_info *sbi;
1101 __u64 cno; 1070 __u64 cno;
1102 int flags; 1071 int flags;
1103}; 1072};
@@ -1278,7 +1247,7 @@ static void nilfs_inode_init_once(void *obj)
1278#ifdef CONFIG_NILFS_XATTR 1247#ifdef CONFIG_NILFS_XATTR
1279 init_rwsem(&ii->xattr_sem); 1248 init_rwsem(&ii->xattr_sem);
1280#endif 1249#endif
1281 nilfs_btnode_cache_init_once(&ii->i_btnode_cache); 1250 address_space_init_once(&ii->i_btnode_cache);
1282 ii->i_bmap = &ii->i_bmap_data; 1251 ii->i_bmap = &ii->i_bmap_data;
1283 inode_init_once(&ii->vfs_inode); 1252 inode_init_once(&ii->vfs_inode);
1284} 1253}
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index ad4ac607cf57..d2acd1a651f3 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -25,6 +25,7 @@
25#include <linux/slab.h> 25#include <linux/slab.h>
26#include <linux/blkdev.h> 26#include <linux/blkdev.h>
27#include <linux/backing-dev.h> 27#include <linux/backing-dev.h>
28#include <linux/random.h>
28#include <linux/crc32.h> 29#include <linux/crc32.h>
29#include "nilfs.h" 30#include "nilfs.h"
30#include "segment.h" 31#include "segment.h"
@@ -75,7 +76,10 @@ struct the_nilfs *alloc_nilfs(struct block_device *bdev)
75 nilfs->ns_bdev = bdev; 76 nilfs->ns_bdev = bdev;
76 atomic_set(&nilfs->ns_ndirtyblks, 0); 77 atomic_set(&nilfs->ns_ndirtyblks, 0);
77 init_rwsem(&nilfs->ns_sem); 78 init_rwsem(&nilfs->ns_sem);
79 INIT_LIST_HEAD(&nilfs->ns_dirty_files);
78 INIT_LIST_HEAD(&nilfs->ns_gc_inodes); 80 INIT_LIST_HEAD(&nilfs->ns_gc_inodes);
81 spin_lock_init(&nilfs->ns_inode_lock);
82 spin_lock_init(&nilfs->ns_next_gen_lock);
79 spin_lock_init(&nilfs->ns_last_segment_lock); 83 spin_lock_init(&nilfs->ns_last_segment_lock);
80 nilfs->ns_cptree = RB_ROOT; 84 nilfs->ns_cptree = RB_ROOT;
81 spin_lock_init(&nilfs->ns_cptree_lock); 85 spin_lock_init(&nilfs->ns_cptree_lock);
@@ -197,16 +201,16 @@ static int nilfs_store_log_cursor(struct the_nilfs *nilfs,
197/** 201/**
198 * load_nilfs - load and recover the nilfs 202 * load_nilfs - load and recover the nilfs
199 * @nilfs: the_nilfs structure to be released 203 * @nilfs: the_nilfs structure to be released
200 * @sbi: nilfs_sb_info used to recover past segment 204 * @sb: super block isntance used to recover past segment
201 * 205 *
202 * load_nilfs() searches and load the latest super root, 206 * load_nilfs() searches and load the latest super root,
203 * attaches the last segment, and does recovery if needed. 207 * attaches the last segment, and does recovery if needed.
204 * The caller must call this exclusively for simultaneous mounts. 208 * The caller must call this exclusively for simultaneous mounts.
205 */ 209 */
206int load_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi) 210int load_nilfs(struct the_nilfs *nilfs, struct super_block *sb)
207{ 211{
208 struct nilfs_recovery_info ri; 212 struct nilfs_recovery_info ri;
209 unsigned int s_flags = sbi->s_super->s_flags; 213 unsigned int s_flags = sb->s_flags;
210 int really_read_only = bdev_read_only(nilfs->ns_bdev); 214 int really_read_only = bdev_read_only(nilfs->ns_bdev);
211 int valid_fs = nilfs_valid_fs(nilfs); 215 int valid_fs = nilfs_valid_fs(nilfs);
212 int err; 216 int err;
@@ -271,7 +275,7 @@ int load_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
271 goto scan_error; 275 goto scan_error;
272 } 276 }
273 277
274 err = nilfs_load_super_root(nilfs, sbi->s_super, ri.ri_super_root); 278 err = nilfs_load_super_root(nilfs, sb, ri.ri_super_root);
275 if (unlikely(err)) { 279 if (unlikely(err)) {
276 printk(KERN_ERR "NILFS: error loading super root.\n"); 280 printk(KERN_ERR "NILFS: error loading super root.\n");
277 goto failed; 281 goto failed;
@@ -283,7 +287,7 @@ int load_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
283 if (s_flags & MS_RDONLY) { 287 if (s_flags & MS_RDONLY) {
284 __u64 features; 288 __u64 features;
285 289
286 if (nilfs_test_opt(sbi, NORECOVERY)) { 290 if (nilfs_test_opt(nilfs, NORECOVERY)) {
287 printk(KERN_INFO "NILFS: norecovery option specified. " 291 printk(KERN_INFO "NILFS: norecovery option specified. "
288 "skipping roll-forward recovery\n"); 292 "skipping roll-forward recovery\n");
289 goto skip_recovery; 293 goto skip_recovery;
@@ -304,21 +308,21 @@ int load_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
304 err = -EROFS; 308 err = -EROFS;
305 goto failed_unload; 309 goto failed_unload;
306 } 310 }
307 sbi->s_super->s_flags &= ~MS_RDONLY; 311 sb->s_flags &= ~MS_RDONLY;
308 } else if (nilfs_test_opt(sbi, NORECOVERY)) { 312 } else if (nilfs_test_opt(nilfs, NORECOVERY)) {
309 printk(KERN_ERR "NILFS: recovery cancelled because norecovery " 313 printk(KERN_ERR "NILFS: recovery cancelled because norecovery "
310 "option was specified for a read/write mount\n"); 314 "option was specified for a read/write mount\n");
311 err = -EINVAL; 315 err = -EINVAL;
312 goto failed_unload; 316 goto failed_unload;
313 } 317 }
314 318
315 err = nilfs_salvage_orphan_logs(nilfs, sbi, &ri); 319 err = nilfs_salvage_orphan_logs(nilfs, sb, &ri);
316 if (err) 320 if (err)
317 goto failed_unload; 321 goto failed_unload;
318 322
319 down_write(&nilfs->ns_sem); 323 down_write(&nilfs->ns_sem);
320 nilfs->ns_mount_state |= NILFS_VALID_FS; /* set "clean" flag */ 324 nilfs->ns_mount_state |= NILFS_VALID_FS; /* set "clean" flag */
321 err = nilfs_cleanup_super(sbi); 325 err = nilfs_cleanup_super(sb);
322 up_write(&nilfs->ns_sem); 326 up_write(&nilfs->ns_sem);
323 327
324 if (err) { 328 if (err) {
@@ -330,7 +334,7 @@ int load_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
330 334
331 skip_recovery: 335 skip_recovery:
332 nilfs_clear_recovery_info(&ri); 336 nilfs_clear_recovery_info(&ri);
333 sbi->s_super->s_flags = s_flags; 337 sb->s_flags = s_flags;
334 return 0; 338 return 0;
335 339
336 scan_error: 340 scan_error:
@@ -344,7 +348,7 @@ int load_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
344 348
345 failed: 349 failed:
346 nilfs_clear_recovery_info(&ri); 350 nilfs_clear_recovery_info(&ri);
347 sbi->s_super->s_flags = s_flags; 351 sb->s_flags = s_flags;
348 return err; 352 return err;
349} 353}
350 354
@@ -475,10 +479,13 @@ static int nilfs_load_super_block(struct the_nilfs *nilfs,
475 return -EIO; 479 return -EIO;
476 } 480 }
477 printk(KERN_WARNING 481 printk(KERN_WARNING
478 "NILFS warning: unable to read primary superblock\n"); 482 "NILFS warning: unable to read primary superblock "
479 } else if (!sbp[1]) 483 "(blocksize = %d)\n", blocksize);
484 } else if (!sbp[1]) {
480 printk(KERN_WARNING 485 printk(KERN_WARNING
481 "NILFS warning: unable to read secondary superblock\n"); 486 "NILFS warning: unable to read secondary superblock "
487 "(blocksize = %d)\n", blocksize);
488 }
482 489
483 /* 490 /*
484 * Compare two super blocks and set 1 in swp if the secondary 491 * Compare two super blocks and set 1 in swp if the secondary
@@ -505,7 +512,7 @@ static int nilfs_load_super_block(struct the_nilfs *nilfs,
505 512
506 if (!valid[!swp]) 513 if (!valid[!swp])
507 printk(KERN_WARNING "NILFS warning: broken superblock. " 514 printk(KERN_WARNING "NILFS warning: broken superblock. "
508 "using spare superblock.\n"); 515 "using spare superblock (blocksize = %d).\n", blocksize);
509 if (swp) 516 if (swp)
510 nilfs_swap_super_block(nilfs); 517 nilfs_swap_super_block(nilfs);
511 518
@@ -519,7 +526,6 @@ static int nilfs_load_super_block(struct the_nilfs *nilfs,
519/** 526/**
520 * init_nilfs - initialize a NILFS instance. 527 * init_nilfs - initialize a NILFS instance.
521 * @nilfs: the_nilfs structure 528 * @nilfs: the_nilfs structure
522 * @sbi: nilfs_sb_info
523 * @sb: super block 529 * @sb: super block
524 * @data: mount options 530 * @data: mount options
525 * 531 *
@@ -530,9 +536,8 @@ static int nilfs_load_super_block(struct the_nilfs *nilfs,
530 * Return Value: On success, 0 is returned. On error, a negative error 536 * Return Value: On success, 0 is returned. On error, a negative error
531 * code is returned. 537 * code is returned.
532 */ 538 */
533int init_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, char *data) 539int init_nilfs(struct the_nilfs *nilfs, struct super_block *sb, char *data)
534{ 540{
535 struct super_block *sb = sbi->s_super;
536 struct nilfs_super_block *sbp; 541 struct nilfs_super_block *sbp;
537 int blocksize; 542 int blocksize;
538 int err; 543 int err;
@@ -588,6 +593,9 @@ int init_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, char *data)
588 nilfs->ns_blocksize_bits = sb->s_blocksize_bits; 593 nilfs->ns_blocksize_bits = sb->s_blocksize_bits;
589 nilfs->ns_blocksize = blocksize; 594 nilfs->ns_blocksize = blocksize;
590 595
596 get_random_bytes(&nilfs->ns_next_generation,
597 sizeof(nilfs->ns_next_generation));
598
591 err = nilfs_store_disk_layout(nilfs, sbp); 599 err = nilfs_store_disk_layout(nilfs, sbp);
592 if (err) 600 if (err)
593 goto failed_sbh; 601 goto failed_sbh;
diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
index fd85e4c05c6b..f4968145c2a3 100644
--- a/fs/nilfs2/the_nilfs.h
+++ b/fs/nilfs2/the_nilfs.h
@@ -31,7 +31,8 @@
31#include <linux/blkdev.h> 31#include <linux/blkdev.h>
32#include <linux/backing-dev.h> 32#include <linux/backing-dev.h>
33#include <linux/slab.h> 33#include <linux/slab.h>
34#include "sb.h" 34
35struct nilfs_sc_info;
35 36
36/* the_nilfs struct */ 37/* the_nilfs struct */
37enum { 38enum {
@@ -65,13 +66,23 @@ enum {
65 * @ns_last_cno: checkpoint number of the latest segment 66 * @ns_last_cno: checkpoint number of the latest segment
66 * @ns_prot_seq: least sequence number of segments which must not be reclaimed 67 * @ns_prot_seq: least sequence number of segments which must not be reclaimed
67 * @ns_prev_seq: base sequence number used to decide if advance log cursor 68 * @ns_prev_seq: base sequence number used to decide if advance log cursor
68 * @ns_segctor_sem: segment constructor semaphore 69 * @ns_writer: log writer
70 * @ns_segctor_sem: semaphore protecting log write
69 * @ns_dat: DAT file inode 71 * @ns_dat: DAT file inode
70 * @ns_cpfile: checkpoint file inode 72 * @ns_cpfile: checkpoint file inode
71 * @ns_sufile: segusage file inode 73 * @ns_sufile: segusage file inode
72 * @ns_cptree: rb-tree of all mounted checkpoints (nilfs_root) 74 * @ns_cptree: rb-tree of all mounted checkpoints (nilfs_root)
73 * @ns_cptree_lock: lock protecting @ns_cptree 75 * @ns_cptree_lock: lock protecting @ns_cptree
76 * @ns_dirty_files: list of dirty files
77 * @ns_inode_lock: lock protecting @ns_dirty_files
74 * @ns_gc_inodes: dummy inodes to keep live blocks 78 * @ns_gc_inodes: dummy inodes to keep live blocks
79 * @ns_next_generation: next generation number for inodes
80 * @ns_next_gen_lock: lock protecting @ns_next_generation
81 * @ns_mount_opt: mount options
82 * @ns_resuid: uid for reserved blocks
83 * @ns_resgid: gid for reserved blocks
84 * @ns_interval: checkpoint creation interval
85 * @ns_watermark: watermark for the number of dirty buffers
75 * @ns_blocksize_bits: bit length of block size 86 * @ns_blocksize_bits: bit length of block size
76 * @ns_blocksize: block size 87 * @ns_blocksize: block size
77 * @ns_nsegments: number of segments in filesystem 88 * @ns_nsegments: number of segments in filesystem
@@ -131,6 +142,7 @@ struct the_nilfs {
131 u64 ns_prot_seq; 142 u64 ns_prot_seq;
132 u64 ns_prev_seq; 143 u64 ns_prev_seq;
133 144
145 struct nilfs_sc_info *ns_writer;
134 struct rw_semaphore ns_segctor_sem; 146 struct rw_semaphore ns_segctor_sem;
135 147
136 /* 148 /*
@@ -145,9 +157,25 @@ struct the_nilfs {
145 struct rb_root ns_cptree; 157 struct rb_root ns_cptree;
146 spinlock_t ns_cptree_lock; 158 spinlock_t ns_cptree_lock;
147 159
160 /* Dirty inode list */
161 struct list_head ns_dirty_files;
162 spinlock_t ns_inode_lock;
163
148 /* GC inode list */ 164 /* GC inode list */
149 struct list_head ns_gc_inodes; 165 struct list_head ns_gc_inodes;
150 166
167 /* Inode allocator */
168 u32 ns_next_generation;
169 spinlock_t ns_next_gen_lock;
170
171 /* Mount options */
172 unsigned long ns_mount_opt;
173
174 uid_t ns_resuid;
175 gid_t ns_resgid;
176 unsigned long ns_interval;
177 unsigned long ns_watermark;
178
151 /* Disk layout information (static) */ 179 /* Disk layout information (static) */
152 unsigned int ns_blocksize_bits; 180 unsigned int ns_blocksize_bits;
153 unsigned int ns_blocksize; 181 unsigned int ns_blocksize;
@@ -180,6 +208,20 @@ THE_NILFS_FNS(DISCONTINUED, discontinued)
180THE_NILFS_FNS(GC_RUNNING, gc_running) 208THE_NILFS_FNS(GC_RUNNING, gc_running)
181THE_NILFS_FNS(SB_DIRTY, sb_dirty) 209THE_NILFS_FNS(SB_DIRTY, sb_dirty)
182 210
211/*
212 * Mount option operations
213 */
214#define nilfs_clear_opt(nilfs, opt) \
215 do { (nilfs)->ns_mount_opt &= ~NILFS_MOUNT_##opt; } while (0)
216#define nilfs_set_opt(nilfs, opt) \
217 do { (nilfs)->ns_mount_opt |= NILFS_MOUNT_##opt; } while (0)
218#define nilfs_test_opt(nilfs, opt) ((nilfs)->ns_mount_opt & NILFS_MOUNT_##opt)
219#define nilfs_write_opt(nilfs, mask, opt) \
220 do { (nilfs)->ns_mount_opt = \
221 (((nilfs)->ns_mount_opt & ~NILFS_MOUNT_##mask) | \
222 NILFS_MOUNT_##opt); \
223 } while (0)
224
183/** 225/**
184 * struct nilfs_root - nilfs root object 226 * struct nilfs_root - nilfs root object
185 * @cno: checkpoint number 227 * @cno: checkpoint number
@@ -224,15 +266,14 @@ static inline int nilfs_sb_will_flip(struct the_nilfs *nilfs)
224void nilfs_set_last_segment(struct the_nilfs *, sector_t, u64, __u64); 266void nilfs_set_last_segment(struct the_nilfs *, sector_t, u64, __u64);
225struct the_nilfs *alloc_nilfs(struct block_device *bdev); 267struct the_nilfs *alloc_nilfs(struct block_device *bdev);
226void destroy_nilfs(struct the_nilfs *nilfs); 268void destroy_nilfs(struct the_nilfs *nilfs);
227int init_nilfs(struct the_nilfs *, struct nilfs_sb_info *, char *); 269int init_nilfs(struct the_nilfs *nilfs, struct super_block *sb, char *data);
228int load_nilfs(struct the_nilfs *, struct nilfs_sb_info *); 270int load_nilfs(struct the_nilfs *nilfs, struct super_block *sb);
229int nilfs_discard_segments(struct the_nilfs *, __u64 *, size_t); 271int nilfs_discard_segments(struct the_nilfs *, __u64 *, size_t);
230int nilfs_count_free_blocks(struct the_nilfs *, sector_t *); 272int nilfs_count_free_blocks(struct the_nilfs *, sector_t *);
231struct nilfs_root *nilfs_lookup_root(struct the_nilfs *nilfs, __u64 cno); 273struct nilfs_root *nilfs_lookup_root(struct the_nilfs *nilfs, __u64 cno);
232struct nilfs_root *nilfs_find_or_create_root(struct the_nilfs *nilfs, 274struct nilfs_root *nilfs_find_or_create_root(struct the_nilfs *nilfs,
233 __u64 cno); 275 __u64 cno);
234void nilfs_put_root(struct nilfs_root *root); 276void nilfs_put_root(struct nilfs_root *root);
235struct nilfs_sb_info *nilfs_find_sbinfo(struct the_nilfs *, int, __u64);
236int nilfs_near_disk_full(struct the_nilfs *); 277int nilfs_near_disk_full(struct the_nilfs *);
237void nilfs_fall_back_super_block(struct the_nilfs *); 278void nilfs_fall_back_super_block(struct the_nilfs *);
238void nilfs_swap_super_block(struct the_nilfs *); 279void nilfs_swap_super_block(struct the_nilfs *);
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 8b61220cffc5..6b1305dc26c0 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -876,7 +876,7 @@ SYSCALL_ALIAS(sys_fanotify_mark, SyS_fanotify_mark);
876#endif 876#endif
877 877
878/* 878/*
879 * fanotify_user_setup - Our initialization function. Note that we cannnot return 879 * fanotify_user_setup - Our initialization function. Note that we cannot return
880 * error because we have compiled-in VFS hooks. So an (unlikely) failure here 880 * error because we have compiled-in VFS hooks. So an (unlikely) failure here
881 * must result in panic(). 881 * must result in panic().
882 */ 882 */
diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c
index 4c29fcf557d1..07ea8d3e6ea2 100644
--- a/fs/notify/inode_mark.c
+++ b/fs/notify/inode_mark.c
@@ -22,13 +22,14 @@
22#include <linux/module.h> 22#include <linux/module.h>
23#include <linux/mutex.h> 23#include <linux/mutex.h>
24#include <linux/spinlock.h> 24#include <linux/spinlock.h>
25#include <linux/writeback.h> /* for inode_lock */
26 25
27#include <asm/atomic.h> 26#include <asm/atomic.h>
28 27
29#include <linux/fsnotify_backend.h> 28#include <linux/fsnotify_backend.h>
30#include "fsnotify.h" 29#include "fsnotify.h"
31 30
31#include "../internal.h"
32
32/* 33/*
33 * Recalculate the mask of events relevant to a given inode locked. 34 * Recalculate the mask of events relevant to a given inode locked.
34 */ 35 */
@@ -237,15 +238,14 @@ out:
237 * fsnotify_unmount_inodes - an sb is unmounting. handle any watched inodes. 238 * fsnotify_unmount_inodes - an sb is unmounting. handle any watched inodes.
238 * @list: list of inodes being unmounted (sb->s_inodes) 239 * @list: list of inodes being unmounted (sb->s_inodes)
239 * 240 *
240 * Called with inode_lock held, protecting the unmounting super block's list 241 * Called during unmount with no locks held, so needs to be safe against
241 * of inodes, and with iprune_mutex held, keeping shrink_icache_memory() at bay. 242 * concurrent modifiers. We temporarily drop inode_sb_list_lock and CAN block.
242 * We temporarily drop inode_lock, however, and CAN block.
243 */ 243 */
244void fsnotify_unmount_inodes(struct list_head *list) 244void fsnotify_unmount_inodes(struct list_head *list)
245{ 245{
246 struct inode *inode, *next_i, *need_iput = NULL; 246 struct inode *inode, *next_i, *need_iput = NULL;
247 247
248 spin_lock(&inode_lock); 248 spin_lock(&inode_sb_list_lock);
249 list_for_each_entry_safe(inode, next_i, list, i_sb_list) { 249 list_for_each_entry_safe(inode, next_i, list, i_sb_list) {
250 struct inode *need_iput_tmp; 250 struct inode *need_iput_tmp;
251 251
@@ -254,8 +254,11 @@ void fsnotify_unmount_inodes(struct list_head *list)
254 * I_WILL_FREE, or I_NEW which is fine because by that point 254 * I_WILL_FREE, or I_NEW which is fine because by that point
255 * the inode cannot have any associated watches. 255 * the inode cannot have any associated watches.
256 */ 256 */
257 if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) 257 spin_lock(&inode->i_lock);
258 if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) {
259 spin_unlock(&inode->i_lock);
258 continue; 260 continue;
261 }
259 262
260 /* 263 /*
261 * If i_count is zero, the inode cannot have any watches and 264 * If i_count is zero, the inode cannot have any watches and
@@ -263,8 +266,10 @@ void fsnotify_unmount_inodes(struct list_head *list)
263 * evict all inodes with zero i_count from icache which is 266 * evict all inodes with zero i_count from icache which is
264 * unnecessarily violent and may in fact be illegal to do. 267 * unnecessarily violent and may in fact be illegal to do.
265 */ 268 */
266 if (!atomic_read(&inode->i_count)) 269 if (!atomic_read(&inode->i_count)) {
270 spin_unlock(&inode->i_lock);
267 continue; 271 continue;
272 }
268 273
269 need_iput_tmp = need_iput; 274 need_iput_tmp = need_iput;
270 need_iput = NULL; 275 need_iput = NULL;
@@ -274,22 +279,25 @@ void fsnotify_unmount_inodes(struct list_head *list)
274 __iget(inode); 279 __iget(inode);
275 else 280 else
276 need_iput_tmp = NULL; 281 need_iput_tmp = NULL;
282 spin_unlock(&inode->i_lock);
277 283
278 /* In case the dropping of a reference would nuke next_i. */ 284 /* In case the dropping of a reference would nuke next_i. */
279 if ((&next_i->i_sb_list != list) && 285 if ((&next_i->i_sb_list != list) &&
280 atomic_read(&next_i->i_count) && 286 atomic_read(&next_i->i_count)) {
281 !(next_i->i_state & (I_FREEING | I_WILL_FREE))) { 287 spin_lock(&next_i->i_lock);
282 __iget(next_i); 288 if (!(next_i->i_state & (I_FREEING | I_WILL_FREE))) {
283 need_iput = next_i; 289 __iget(next_i);
290 need_iput = next_i;
291 }
292 spin_unlock(&next_i->i_lock);
284 } 293 }
285 294
286 /* 295 /*
287 * We can safely drop inode_lock here because we hold 296 * We can safely drop inode_sb_list_lock here because we hold
288 * references on both inode and next_i. Also no new inodes 297 * references on both inode and next_i. Also no new inodes
289 * will be added since the umount has begun. Finally, 298 * will be added since the umount has begun.
290 * iprune_mutex keeps shrink_icache_memory() away.
291 */ 299 */
292 spin_unlock(&inode_lock); 300 spin_unlock(&inode_sb_list_lock);
293 301
294 if (need_iput_tmp) 302 if (need_iput_tmp)
295 iput(need_iput_tmp); 303 iput(need_iput_tmp);
@@ -301,7 +309,7 @@ void fsnotify_unmount_inodes(struct list_head *list)
301 309
302 iput(inode); 310 iput(inode);
303 311
304 spin_lock(&inode_lock); 312 spin_lock(&inode_sb_list_lock);
305 } 313 }
306 spin_unlock(&inode_lock); 314 spin_unlock(&inode_sb_list_lock);
307} 315}
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 4cd5d5d78f9f..bd46e7c8a0ef 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -841,7 +841,7 @@ out:
841} 841}
842 842
843/* 843/*
844 * inotify_user_setup - Our initialization function. Note that we cannnot return 844 * inotify_user_setup - Our initialization function. Note that we cannot return
845 * error because we have compiled-in VFS hooks. So an (unlikely) failure here 845 * error because we have compiled-in VFS hooks. So an (unlikely) failure here
846 * must result in panic(). 846 * must result in panic().
847 */ 847 */
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
index 325185e514bb..50c00856f730 100644
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -91,7 +91,6 @@
91#include <linux/slab.h> 91#include <linux/slab.h>
92#include <linux/spinlock.h> 92#include <linux/spinlock.h>
93#include <linux/srcu.h> 93#include <linux/srcu.h>
94#include <linux/writeback.h> /* for inode_lock */
95 94
96#include <asm/atomic.h> 95#include <asm/atomic.h>
97 96
diff --git a/fs/notify/vfsmount_mark.c b/fs/notify/vfsmount_mark.c
index 85eebff6d0d7..e86577d6c5c3 100644
--- a/fs/notify/vfsmount_mark.c
+++ b/fs/notify/vfsmount_mark.c
@@ -23,7 +23,6 @@
23#include <linux/mount.h> 23#include <linux/mount.h>
24#include <linux/mutex.h> 24#include <linux/mutex.h>
25#include <linux/spinlock.h> 25#include <linux/spinlock.h>
26#include <linux/writeback.h> /* for inode_lock */
27 26
28#include <asm/atomic.h> 27#include <asm/atomic.h>
29 28
diff --git a/fs/ntfs/Makefile b/fs/ntfs/Makefile
index 4ff028fcfd6e..30206b238433 100644
--- a/fs/ntfs/Makefile
+++ b/fs/ntfs/Makefile
@@ -2,18 +2,13 @@
2 2
3obj-$(CONFIG_NTFS_FS) += ntfs.o 3obj-$(CONFIG_NTFS_FS) += ntfs.o
4 4
5ntfs-objs := aops.o attrib.o collate.o compress.o debug.o dir.o file.o \ 5ntfs-y := aops.o attrib.o collate.o compress.o debug.o dir.o file.o \
6 index.o inode.o mft.o mst.o namei.o runlist.o super.o sysctl.o \ 6 index.o inode.o mft.o mst.o namei.o runlist.o super.o sysctl.o \
7 unistr.o upcase.o 7 unistr.o upcase.o
8 8
9EXTRA_CFLAGS = -DNTFS_VERSION=\"2.1.30\" 9ntfs-$(CONFIG_NTFS_RW) += bitmap.o lcnalloc.o logfile.o quota.o usnjrnl.o
10 10
11ifeq ($(CONFIG_NTFS_DEBUG),y) 11ccflags-y := -DNTFS_VERSION=\"2.1.30\"
12EXTRA_CFLAGS += -DDEBUG 12ccflags-$(CONFIG_NTFS_DEBUG) += -DDEBUG
13endif 13ccflags-$(CONFIG_NTFS_RW) += -DNTFS_RW
14 14
15ifeq ($(CONFIG_NTFS_RW),y)
16EXTRA_CFLAGS += -DNTFS_RW
17
18ntfs-objs += bitmap.o lcnalloc.o logfile.o quota.o usnjrnl.o
19endif
diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
index c3c2c7ac9020..0b1e885b8cf8 100644
--- a/fs/ntfs/aops.c
+++ b/fs/ntfs/aops.c
@@ -1543,8 +1543,6 @@ err_out:
1543 */ 1543 */
1544const struct address_space_operations ntfs_aops = { 1544const struct address_space_operations ntfs_aops = {
1545 .readpage = ntfs_readpage, /* Fill page with data. */ 1545 .readpage = ntfs_readpage, /* Fill page with data. */
1546 .sync_page = block_sync_page, /* Currently, just unplugs the
1547 disk request queue. */
1548#ifdef NTFS_RW 1546#ifdef NTFS_RW
1549 .writepage = ntfs_writepage, /* Write dirty page to disk. */ 1547 .writepage = ntfs_writepage, /* Write dirty page to disk. */
1550#endif /* NTFS_RW */ 1548#endif /* NTFS_RW */
@@ -1560,8 +1558,6 @@ const struct address_space_operations ntfs_aops = {
1560 */ 1558 */
1561const struct address_space_operations ntfs_mst_aops = { 1559const struct address_space_operations ntfs_mst_aops = {
1562 .readpage = ntfs_readpage, /* Fill page with data. */ 1560 .readpage = ntfs_readpage, /* Fill page with data. */
1563 .sync_page = block_sync_page, /* Currently, just unplugs the
1564 disk request queue. */
1565#ifdef NTFS_RW 1561#ifdef NTFS_RW
1566 .writepage = ntfs_writepage, /* Write dirty page to disk. */ 1562 .writepage = ntfs_writepage, /* Write dirty page to disk. */
1567 .set_page_dirty = __set_page_dirty_nobuffers, /* Set the page dirty 1563 .set_page_dirty = __set_page_dirty_nobuffers, /* Set the page dirty
diff --git a/fs/ntfs/compress.c b/fs/ntfs/compress.c
index 6551c7cbad92..ef9ed854255c 100644
--- a/fs/ntfs/compress.c
+++ b/fs/ntfs/compress.c
@@ -698,8 +698,7 @@ lock_retry_remap:
698 "uptodate! Unplugging the disk queue " 698 "uptodate! Unplugging the disk queue "
699 "and rescheduling."); 699 "and rescheduling.");
700 get_bh(tbh); 700 get_bh(tbh);
701 blk_run_address_space(mapping); 701 io_schedule();
702 schedule();
703 put_bh(tbh); 702 put_bh(tbh);
704 if (unlikely(!buffer_uptodate(tbh))) 703 if (unlikely(!buffer_uptodate(tbh)))
705 goto read_err; 704 goto read_err;
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index a627ed82c0a3..0b56c6b7ec01 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -54,7 +54,7 @@
54 * 54 *
55 * Return 1 if the attributes match and 0 if not. 55 * Return 1 if the attributes match and 0 if not.
56 * 56 *
57 * NOTE: This function runs with the inode_lock spin lock held so it is not 57 * NOTE: This function runs with the inode->i_lock spin lock held so it is not
58 * allowed to sleep. 58 * allowed to sleep.
59 */ 59 */
60int ntfs_test_inode(struct inode *vi, ntfs_attr *na) 60int ntfs_test_inode(struct inode *vi, ntfs_attr *na)
@@ -98,7 +98,7 @@ int ntfs_test_inode(struct inode *vi, ntfs_attr *na)
98 * 98 *
99 * Return 0 on success and -errno on error. 99 * Return 0 on success and -errno on error.
100 * 100 *
101 * NOTE: This function runs with the inode_lock spin lock held so it is not 101 * NOTE: This function runs with the inode->i_lock spin lock held so it is not
102 * allowed to sleep. (Hence the GFP_ATOMIC allocation.) 102 * allowed to sleep. (Hence the GFP_ATOMIC allocation.)
103 */ 103 */
104static int ntfs_init_locked_inode(struct inode *vi, ntfs_attr *na) 104static int ntfs_init_locked_inode(struct inode *vi, ntfs_attr *na)
diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c
index b572b6727181..326e7475a22a 100644
--- a/fs/ntfs/mft.c
+++ b/fs/ntfs/mft.c
@@ -1,7 +1,7 @@
1/** 1/**
2 * mft.c - NTFS kernel mft record operations. Part of the Linux-NTFS project. 2 * mft.c - NTFS kernel mft record operations. Part of the Linux-NTFS project.
3 * 3 *
4 * Copyright (c) 2001-2006 Anton Altaparmakov 4 * Copyright (c) 2001-2011 Anton Altaparmakov and Tuxera Inc.
5 * Copyright (c) 2002 Richard Russon 5 * Copyright (c) 2002 Richard Russon
6 * 6 *
7 * This program/include file is free software; you can redistribute it and/or 7 * This program/include file is free software; you can redistribute it and/or
@@ -2576,6 +2576,8 @@ mft_rec_already_initialized:
2576 flush_dcache_page(page); 2576 flush_dcache_page(page);
2577 SetPageUptodate(page); 2577 SetPageUptodate(page);
2578 if (base_ni) { 2578 if (base_ni) {
2579 MFT_RECORD *m_tmp;
2580
2579 /* 2581 /*
2580 * Setup the base mft record in the extent mft record. This 2582 * Setup the base mft record in the extent mft record. This
2581 * completes initialization of the allocated extent mft record 2583 * completes initialization of the allocated extent mft record
@@ -2588,11 +2590,11 @@ mft_rec_already_initialized:
2588 * attach it to the base inode @base_ni and map, pin, and lock 2590 * attach it to the base inode @base_ni and map, pin, and lock
2589 * its, i.e. the allocated, mft record. 2591 * its, i.e. the allocated, mft record.
2590 */ 2592 */
2591 m = map_extent_mft_record(base_ni, bit, &ni); 2593 m_tmp = map_extent_mft_record(base_ni, bit, &ni);
2592 if (IS_ERR(m)) { 2594 if (IS_ERR(m_tmp)) {
2593 ntfs_error(vol->sb, "Failed to map allocated extent " 2595 ntfs_error(vol->sb, "Failed to map allocated extent "
2594 "mft record 0x%llx.", (long long)bit); 2596 "mft record 0x%llx.", (long long)bit);
2595 err = PTR_ERR(m); 2597 err = PTR_ERR(m_tmp);
2596 /* Set the mft record itself not in use. */ 2598 /* Set the mft record itself not in use. */
2597 m->flags &= cpu_to_le16( 2599 m->flags &= cpu_to_le16(
2598 ~le16_to_cpu(MFT_RECORD_IN_USE)); 2600 ~le16_to_cpu(MFT_RECORD_IN_USE));
@@ -2603,6 +2605,7 @@ mft_rec_already_initialized:
2603 ntfs_unmap_page(page); 2605 ntfs_unmap_page(page);
2604 goto undo_mftbmp_alloc; 2606 goto undo_mftbmp_alloc;
2605 } 2607 }
2608 BUG_ON(m != m_tmp);
2606 /* 2609 /*
2607 * Make sure the allocated mft record is written out to disk. 2610 * Make sure the allocated mft record is written out to disk.
2608 * No need to set the inode dirty because the caller is going 2611 * No need to set the inode dirty because the caller is going
diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile
index 07d9fd854350..d8a0313e99e6 100644
--- a/fs/ocfs2/Makefile
+++ b/fs/ocfs2/Makefile
@@ -1,6 +1,6 @@
1EXTRA_CFLAGS += -Ifs/ocfs2 1ccflags-y := -Ifs/ocfs2
2 2
3EXTRA_CFLAGS += -DCATCH_BH_JBD_RACES 3ccflags-y += -DCATCH_BH_JBD_RACES
4 4
5obj-$(CONFIG_OCFS2_FS) += \ 5obj-$(CONFIG_OCFS2_FS) += \
6 ocfs2.o \ 6 ocfs2.o \
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index 704f6b1742f3..90f2729b7a5b 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -497,7 +497,7 @@ static int ocfs2_xattr_set_acl(struct dentry *dentry, const char *name,
497 if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) 497 if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
498 return -EOPNOTSUPP; 498 return -EOPNOTSUPP;
499 499
500 if (!is_owner_or_cap(inode)) 500 if (!inode_owner_or_capable(inode))
501 return -EPERM; 501 return -EPERM;
502 502
503 if (value) { 503 if (value) {
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 1fbb0e20131b..daea0359e974 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -2043,7 +2043,6 @@ const struct address_space_operations ocfs2_aops = {
2043 .write_begin = ocfs2_write_begin, 2043 .write_begin = ocfs2_write_begin,
2044 .write_end = ocfs2_write_end, 2044 .write_end = ocfs2_write_end,
2045 .bmap = ocfs2_bmap, 2045 .bmap = ocfs2_bmap,
2046 .sync_page = block_sync_page,
2047 .direct_IO = ocfs2_direct_IO, 2046 .direct_IO = ocfs2_direct_IO,
2048 .invalidatepage = ocfs2_invalidatepage, 2047 .invalidatepage = ocfs2_invalidatepage,
2049 .releasepage = ocfs2_releasepage, 2048 .releasepage = ocfs2_releasepage,
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index b108e863d8f6..1adab287bd24 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -367,11 +367,7 @@ static inline void o2hb_bio_wait_dec(struct o2hb_bio_wait_ctxt *wc,
367static void o2hb_wait_on_io(struct o2hb_region *reg, 367static void o2hb_wait_on_io(struct o2hb_region *reg,
368 struct o2hb_bio_wait_ctxt *wc) 368 struct o2hb_bio_wait_ctxt *wc)
369{ 369{
370 struct address_space *mapping = reg->hr_bdev->bd_inode->i_mapping;
371
372 blk_run_address_space(mapping);
373 o2hb_bio_wait_dec(wc, 1); 370 o2hb_bio_wait_dec(wc, 1);
374
375 wait_for_completion(&wc->wc_io_complete); 371 wait_for_completion(&wc->wc_io_complete);
376} 372}
377 373
diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c
index 6d80ecc7834f..7eb90403fc8a 100644
--- a/fs/ocfs2/dcache.c
+++ b/fs/ocfs2/dcache.c
@@ -56,7 +56,7 @@ static int ocfs2_dentry_revalidate(struct dentry *dentry,
56 int ret = 0; /* if all else fails, just return false */ 56 int ret = 0; /* if all else fails, just return false */
57 struct ocfs2_super *osb; 57 struct ocfs2_super *osb;
58 58
59 if (nd->flags & LOOKUP_RCU) 59 if (nd && nd->flags & LOOKUP_RCU)
60 return -ECHILD; 60 return -ECHILD;
61 61
62 inode = dentry->d_inode; 62 inode = dentry->d_inode;
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index d417b3f9b0c7..f97b6f1c61dd 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -354,7 +354,7 @@ static inline int ocfs2_match(int len,
354/* 354/*
355 * Returns 0 if not found, -1 on failure, and 1 on success 355 * Returns 0 if not found, -1 on failure, and 1 on success
356 */ 356 */
357static int inline ocfs2_search_dirblock(struct buffer_head *bh, 357static inline int ocfs2_search_dirblock(struct buffer_head *bh,
358 struct inode *dir, 358 struct inode *dir,
359 const char *name, int namelen, 359 const char *name, int namelen,
360 unsigned long offset, 360 unsigned long offset,
diff --git a/fs/ocfs2/dlm/Makefile b/fs/ocfs2/dlm/Makefile
index dcebf0d920fa..c8a044efbb15 100644
--- a/fs/ocfs2/dlm/Makefile
+++ b/fs/ocfs2/dlm/Makefile
@@ -1,4 +1,4 @@
1EXTRA_CFLAGS += -Ifs/ocfs2 1ccflags-y := -Ifs/ocfs2
2 2
3obj-$(CONFIG_OCFS2_FS_O2CB) += ocfs2_dlm.o 3obj-$(CONFIG_OCFS2_FS_O2CB) += ocfs2_dlm.o
4 4
diff --git a/fs/ocfs2/dlmfs/Makefile b/fs/ocfs2/dlmfs/Makefile
index df69b4856d0d..f14be89a6701 100644
--- a/fs/ocfs2/dlmfs/Makefile
+++ b/fs/ocfs2/dlmfs/Makefile
@@ -1,4 +1,4 @@
1EXTRA_CFLAGS += -Ifs/ocfs2 1ccflags-y := -Ifs/ocfs2
2 2
3obj-$(CONFIG_OCFS2_FS) += ocfs2_dlmfs.o 3obj-$(CONFIG_OCFS2_FS) += ocfs2_dlmfs.o
4 4
diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c
index 5dbc3062b4fd..254652a9b542 100644
--- a/fs/ocfs2/export.c
+++ b/fs/ocfs2/export.c
@@ -197,8 +197,12 @@ static int ocfs2_encode_fh(struct dentry *dentry, u32 *fh_in, int *max_len,
197 dentry->d_name.len, dentry->d_name.name, 197 dentry->d_name.len, dentry->d_name.name,
198 fh, len, connectable); 198 fh, len, connectable);
199 199
200 if (len < 3 || (connectable && len < 6)) { 200 if (connectable && (len < 6)) {
201 mlog(ML_ERROR, "fh buffer is too small for encoding\n"); 201 *max_len = 6;
202 type = 255;
203 goto bail;
204 } else if (len < 3) {
205 *max_len = 3;
202 type = 255; 206 type = 255;
203 goto bail; 207 goto bail;
204 } 208 }
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index 7a4868196152..09de77ce002a 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -82,7 +82,7 @@ static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags,
82 } 82 }
83 83
84 status = -EACCES; 84 status = -EACCES;
85 if (!is_owner_or_cap(inode)) 85 if (!inode_owner_or_capable(inode))
86 goto bail_unlock; 86 goto bail_unlock;
87 87
88 if (!S_ISDIR(inode->i_mode)) 88 if (!S_ISDIR(inode->i_mode))
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index 43e56b97f9c0..6180da1e37e6 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -405,9 +405,9 @@ static inline int ocfs2_remove_extent_credits(struct super_block *sb)
405 ocfs2_quota_trans_credits(sb); 405 ocfs2_quota_trans_credits(sb);
406} 406}
407 407
408/* data block for new dir/symlink, 2 for bitmap updates (bitmap fe + 408/* data block for new dir/symlink, allocation of directory block, dx_root
409 * bitmap block for the new bit) dx_root update for free list */ 409 * update for free list */
410#define OCFS2_DIR_LINK_ADDITIONAL_CREDITS (1 + 2 + 1) 410#define OCFS2_DIR_LINK_ADDITIONAL_CREDITS (1 + OCFS2_SUBALLOC_ALLOC + 1)
411 411
412static inline int ocfs2_add_dir_index_credits(struct super_block *sb) 412static inline int ocfs2_add_dir_index_credits(struct super_block *sb)
413{ 413{
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 849fb4a2e814..d6c25d76b537 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -293,7 +293,7 @@ static int ocfs2_mknod(struct inode *dir,
293 } 293 }
294 294
295 /* get security xattr */ 295 /* get security xattr */
296 status = ocfs2_init_security_get(inode, dir, &si); 296 status = ocfs2_init_security_get(inode, dir, &dentry->d_name, &si);
297 if (status) { 297 if (status) {
298 if (status == -EOPNOTSUPP) 298 if (status == -EOPNOTSUPP)
299 si.enable = 0; 299 si.enable = 0;
@@ -1665,7 +1665,7 @@ static int ocfs2_symlink(struct inode *dir,
1665 } 1665 }
1666 1666
1667 /* get security xattr */ 1667 /* get security xattr */
1668 status = ocfs2_init_security_get(inode, dir, &si); 1668 status = ocfs2_init_security_get(inode, dir, &dentry->d_name, &si);
1669 if (status) { 1669 if (status) {
1670 if (status == -EOPNOTSUPP) 1670 if (status == -EOPNOTSUPP)
1671 si.enable = 0; 1671 si.enable = 0;
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 51cd6898e7f1..1a97ba1ec3fc 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -831,18 +831,18 @@ static inline unsigned int ocfs2_clusters_to_megabytes(struct super_block *sb,
831 831
832static inline void _ocfs2_set_bit(unsigned int bit, unsigned long *bitmap) 832static inline void _ocfs2_set_bit(unsigned int bit, unsigned long *bitmap)
833{ 833{
834 ext2_set_bit(bit, bitmap); 834 __test_and_set_bit_le(bit, bitmap);
835} 835}
836#define ocfs2_set_bit(bit, addr) _ocfs2_set_bit((bit), (unsigned long *)(addr)) 836#define ocfs2_set_bit(bit, addr) _ocfs2_set_bit((bit), (unsigned long *)(addr))
837 837
838static inline void _ocfs2_clear_bit(unsigned int bit, unsigned long *bitmap) 838static inline void _ocfs2_clear_bit(unsigned int bit, unsigned long *bitmap)
839{ 839{
840 ext2_clear_bit(bit, bitmap); 840 __test_and_clear_bit_le(bit, bitmap);
841} 841}
842#define ocfs2_clear_bit(bit, addr) _ocfs2_clear_bit((bit), (unsigned long *)(addr)) 842#define ocfs2_clear_bit(bit, addr) _ocfs2_clear_bit((bit), (unsigned long *)(addr))
843 843
844#define ocfs2_test_bit ext2_test_bit 844#define ocfs2_test_bit test_bit_le
845#define ocfs2_find_next_zero_bit ext2_find_next_zero_bit 845#define ocfs2_find_next_zero_bit find_next_zero_bit_le
846#define ocfs2_find_next_bit ext2_find_next_bit 846#define ocfs2_find_next_bit find_next_bit_le
847#endif /* OCFS2_H */ 847#endif /* OCFS2_H */
848 848
diff --git a/fs/ocfs2/quota.h b/fs/ocfs2/quota.h
index 196fcb52d95d..d5ab56cbe5c5 100644
--- a/fs/ocfs2/quota.h
+++ b/fs/ocfs2/quota.h
@@ -114,7 +114,4 @@ int ocfs2_local_write_dquot(struct dquot *dquot);
114extern const struct dquot_operations ocfs2_quota_operations; 114extern const struct dquot_operations ocfs2_quota_operations;
115extern struct quota_format_type ocfs2_quota_format; 115extern struct quota_format_type ocfs2_quota_format;
116 116
117int ocfs2_quota_setup(void);
118void ocfs2_quota_shutdown(void);
119
120#endif /* _OCFS2_QUOTA_H */ 117#endif /* _OCFS2_QUOTA_H */
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index 4607923eb24c..a73f64166481 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -63,8 +63,6 @@
63 * write to gf 63 * write to gf
64 */ 64 */
65 65
66static struct workqueue_struct *ocfs2_quota_wq = NULL;
67
68static void qsync_work_fn(struct work_struct *work); 66static void qsync_work_fn(struct work_struct *work);
69 67
70static void ocfs2_global_disk2memdqb(struct dquot *dquot, void *dp) 68static void ocfs2_global_disk2memdqb(struct dquot *dquot, void *dp)
@@ -400,8 +398,8 @@ int ocfs2_global_read_info(struct super_block *sb, int type)
400 OCFS2_QBLK_RESERVED_SPACE; 398 OCFS2_QBLK_RESERVED_SPACE;
401 oinfo->dqi_gi.dqi_qtree_depth = qtree_depth(&oinfo->dqi_gi); 399 oinfo->dqi_gi.dqi_qtree_depth = qtree_depth(&oinfo->dqi_gi);
402 INIT_DELAYED_WORK(&oinfo->dqi_sync_work, qsync_work_fn); 400 INIT_DELAYED_WORK(&oinfo->dqi_sync_work, qsync_work_fn);
403 queue_delayed_work(ocfs2_quota_wq, &oinfo->dqi_sync_work, 401 schedule_delayed_work(&oinfo->dqi_sync_work,
404 msecs_to_jiffies(oinfo->dqi_syncms)); 402 msecs_to_jiffies(oinfo->dqi_syncms));
405 403
406out_err: 404out_err:
407 mlog_exit(status); 405 mlog_exit(status);
@@ -635,8 +633,8 @@ static void qsync_work_fn(struct work_struct *work)
635 struct super_block *sb = oinfo->dqi_gqinode->i_sb; 633 struct super_block *sb = oinfo->dqi_gqinode->i_sb;
636 634
637 dquot_scan_active(sb, ocfs2_sync_dquot_helper, oinfo->dqi_type); 635 dquot_scan_active(sb, ocfs2_sync_dquot_helper, oinfo->dqi_type);
638 queue_delayed_work(ocfs2_quota_wq, &oinfo->dqi_sync_work, 636 schedule_delayed_work(&oinfo->dqi_sync_work,
639 msecs_to_jiffies(oinfo->dqi_syncms)); 637 msecs_to_jiffies(oinfo->dqi_syncms));
640} 638}
641 639
642/* 640/*
@@ -923,20 +921,3 @@ const struct dquot_operations ocfs2_quota_operations = {
923 .alloc_dquot = ocfs2_alloc_dquot, 921 .alloc_dquot = ocfs2_alloc_dquot,
924 .destroy_dquot = ocfs2_destroy_dquot, 922 .destroy_dquot = ocfs2_destroy_dquot,
925}; 923};
926
927int ocfs2_quota_setup(void)
928{
929 ocfs2_quota_wq = create_workqueue("o2quot");
930 if (!ocfs2_quota_wq)
931 return -ENOMEM;
932 return 0;
933}
934
935void ocfs2_quota_shutdown(void)
936{
937 if (ocfs2_quota_wq) {
938 flush_workqueue(ocfs2_quota_wq);
939 destroy_workqueue(ocfs2_quota_wq);
940 ocfs2_quota_wq = NULL;
941 }
942}
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index b5f9160e93e9..c384d634872a 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -3228,7 +3228,7 @@ static int ocfs2_make_clusters_writable(struct super_block *sb,
3228 u32 num_clusters, unsigned int e_flags) 3228 u32 num_clusters, unsigned int e_flags)
3229{ 3229{
3230 int ret, delete, index, credits = 0; 3230 int ret, delete, index, credits = 0;
3231 u32 new_bit, new_len; 3231 u32 new_bit, new_len, orig_num_clusters;
3232 unsigned int set_len; 3232 unsigned int set_len;
3233 struct ocfs2_super *osb = OCFS2_SB(sb); 3233 struct ocfs2_super *osb = OCFS2_SB(sb);
3234 handle_t *handle; 3234 handle_t *handle;
@@ -3261,6 +3261,8 @@ static int ocfs2_make_clusters_writable(struct super_block *sb,
3261 goto out; 3261 goto out;
3262 } 3262 }
3263 3263
3264 orig_num_clusters = num_clusters;
3265
3264 while (num_clusters) { 3266 while (num_clusters) {
3265 ret = ocfs2_get_refcount_rec(ref_ci, context->ref_root_bh, 3267 ret = ocfs2_get_refcount_rec(ref_ci, context->ref_root_bh,
3266 p_cluster, num_clusters, 3268 p_cluster, num_clusters,
@@ -3348,7 +3350,8 @@ static int ocfs2_make_clusters_writable(struct super_block *sb,
3348 * in write-back mode. 3350 * in write-back mode.
3349 */ 3351 */
3350 if (context->get_clusters == ocfs2_di_get_clusters) { 3352 if (context->get_clusters == ocfs2_di_get_clusters) {
3351 ret = ocfs2_cow_sync_writeback(sb, context, cpos, num_clusters); 3353 ret = ocfs2_cow_sync_writeback(sb, context, cpos,
3354 orig_num_clusters);
3352 if (ret) 3355 if (ret)
3353 mlog_errno(ret); 3356 mlog_errno(ret);
3354 } 3357 }
@@ -4325,7 +4328,8 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir,
4325 4328
4326 /* If the security isn't preserved, we need to re-initialize them. */ 4329 /* If the security isn't preserved, we need to re-initialize them. */
4327 if (!preserve) { 4330 if (!preserve) {
4328 error = ocfs2_init_security_and_acl(dir, new_orphan_inode); 4331 error = ocfs2_init_security_and_acl(dir, new_orphan_inode,
4332 &new_dentry->d_name);
4329 if (error) 4333 if (error)
4330 mlog_errno(error); 4334 mlog_errno(error);
4331 } 4335 }
@@ -4376,7 +4380,7 @@ static int ocfs2_user_path_parent(const char __user *path,
4376 if (IS_ERR(s)) 4380 if (IS_ERR(s))
4377 return PTR_ERR(s); 4381 return PTR_ERR(s);
4378 4382
4379 error = path_lookup(s, LOOKUP_PARENT, nd); 4383 error = kern_path_parent(s, nd);
4380 if (error) 4384 if (error)
4381 putname(s); 4385 putname(s);
4382 else 4386 else
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 38f986d2447e..236ed1bdca2c 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1316,7 +1316,7 @@ static int ocfs2_parse_options(struct super_block *sb,
1316 struct mount_options *mopt, 1316 struct mount_options *mopt,
1317 int is_remount) 1317 int is_remount)
1318{ 1318{
1319 int status; 1319 int status, user_stack = 0;
1320 char *p; 1320 char *p;
1321 u32 tmp; 1321 u32 tmp;
1322 1322
@@ -1459,6 +1459,15 @@ static int ocfs2_parse_options(struct super_block *sb,
1459 memcpy(mopt->cluster_stack, args[0].from, 1459 memcpy(mopt->cluster_stack, args[0].from,
1460 OCFS2_STACK_LABEL_LEN); 1460 OCFS2_STACK_LABEL_LEN);
1461 mopt->cluster_stack[OCFS2_STACK_LABEL_LEN] = '\0'; 1461 mopt->cluster_stack[OCFS2_STACK_LABEL_LEN] = '\0';
1462 /*
1463 * Open code the memcmp here as we don't have
1464 * an osb to pass to
1465 * ocfs2_userspace_stack().
1466 */
1467 if (memcmp(mopt->cluster_stack,
1468 OCFS2_CLASSIC_CLUSTER_STACK,
1469 OCFS2_STACK_LABEL_LEN))
1470 user_stack = 1;
1462 break; 1471 break;
1463 case Opt_inode64: 1472 case Opt_inode64:
1464 mopt->mount_opt |= OCFS2_MOUNT_INODE64; 1473 mopt->mount_opt |= OCFS2_MOUNT_INODE64;
@@ -1514,13 +1523,16 @@ static int ocfs2_parse_options(struct super_block *sb,
1514 } 1523 }
1515 } 1524 }
1516 1525
1517 /* Ensure only one heartbeat mode */ 1526 if (user_stack == 0) {
1518 tmp = mopt->mount_opt & (OCFS2_MOUNT_HB_LOCAL | OCFS2_MOUNT_HB_GLOBAL | 1527 /* Ensure only one heartbeat mode */
1519 OCFS2_MOUNT_HB_NONE); 1528 tmp = mopt->mount_opt & (OCFS2_MOUNT_HB_LOCAL |
1520 if (hweight32(tmp) != 1) { 1529 OCFS2_MOUNT_HB_GLOBAL |
1521 mlog(ML_ERROR, "Invalid heartbeat mount options\n"); 1530 OCFS2_MOUNT_HB_NONE);
1522 status = 0; 1531 if (hweight32(tmp) != 1) {
1523 goto bail; 1532 mlog(ML_ERROR, "Invalid heartbeat mount options\n");
1533 status = 0;
1534 goto bail;
1535 }
1524 } 1536 }
1525 1537
1526 status = 1; 1538 status = 1;
@@ -1645,16 +1657,11 @@ static int __init ocfs2_init(void)
1645 mlog(ML_ERROR, "Unable to create ocfs2 debugfs root.\n"); 1657 mlog(ML_ERROR, "Unable to create ocfs2 debugfs root.\n");
1646 } 1658 }
1647 1659
1648 status = ocfs2_quota_setup();
1649 if (status)
1650 goto leave;
1651
1652 ocfs2_set_locking_protocol(); 1660 ocfs2_set_locking_protocol();
1653 1661
1654 status = register_quota_format(&ocfs2_quota_format); 1662 status = register_quota_format(&ocfs2_quota_format);
1655leave: 1663leave:
1656 if (status < 0) { 1664 if (status < 0) {
1657 ocfs2_quota_shutdown();
1658 ocfs2_free_mem_caches(); 1665 ocfs2_free_mem_caches();
1659 exit_ocfs2_uptodate_cache(); 1666 exit_ocfs2_uptodate_cache();
1660 } 1667 }
@@ -1671,8 +1678,6 @@ static void __exit ocfs2_exit(void)
1671{ 1678{
1672 mlog_entry_void(); 1679 mlog_entry_void();
1673 1680
1674 ocfs2_quota_shutdown();
1675
1676 if (ocfs2_wq) { 1681 if (ocfs2_wq) {
1677 flush_workqueue(ocfs2_wq); 1682 flush_workqueue(ocfs2_wq);
1678 destroy_workqueue(ocfs2_wq); 1683 destroy_workqueue(ocfs2_wq);
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 67cd43914641..6bb602486c6b 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -7185,7 +7185,8 @@ out:
7185 * must not hold any lock expect i_mutex. 7185 * must not hold any lock expect i_mutex.
7186 */ 7186 */
7187int ocfs2_init_security_and_acl(struct inode *dir, 7187int ocfs2_init_security_and_acl(struct inode *dir,
7188 struct inode *inode) 7188 struct inode *inode,
7189 const struct qstr *qstr)
7189{ 7190{
7190 int ret = 0; 7191 int ret = 0;
7191 struct buffer_head *dir_bh = NULL; 7192 struct buffer_head *dir_bh = NULL;
@@ -7193,7 +7194,7 @@ int ocfs2_init_security_and_acl(struct inode *dir,
7193 .enable = 1, 7194 .enable = 1,
7194 }; 7195 };
7195 7196
7196 ret = ocfs2_init_security_get(inode, dir, &si); 7197 ret = ocfs2_init_security_get(inode, dir, qstr, &si);
7197 if (!ret) { 7198 if (!ret) {
7198 ret = ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_SECURITY, 7199 ret = ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_SECURITY,
7199 si.name, si.value, si.value_len, 7200 si.name, si.value, si.value_len,
@@ -7261,13 +7262,14 @@ static int ocfs2_xattr_security_set(struct dentry *dentry, const char *name,
7261 7262
7262int ocfs2_init_security_get(struct inode *inode, 7263int ocfs2_init_security_get(struct inode *inode,
7263 struct inode *dir, 7264 struct inode *dir,
7265 const struct qstr *qstr,
7264 struct ocfs2_security_xattr_info *si) 7266 struct ocfs2_security_xattr_info *si)
7265{ 7267{
7266 /* check whether ocfs2 support feature xattr */ 7268 /* check whether ocfs2 support feature xattr */
7267 if (!ocfs2_supports_xattr(OCFS2_SB(dir->i_sb))) 7269 if (!ocfs2_supports_xattr(OCFS2_SB(dir->i_sb)))
7268 return -EOPNOTSUPP; 7270 return -EOPNOTSUPP;
7269 return security_inode_init_security(inode, dir, &si->name, &si->value, 7271 return security_inode_init_security(inode, dir, qstr, &si->name,
7270 &si->value_len); 7272 &si->value, &si->value_len);
7271} 7273}
7272 7274
7273int ocfs2_init_security_set(handle_t *handle, 7275int ocfs2_init_security_set(handle_t *handle,
diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h
index aa64bb37a65b..d63cfb72316b 100644
--- a/fs/ocfs2/xattr.h
+++ b/fs/ocfs2/xattr.h
@@ -57,6 +57,7 @@ int ocfs2_has_inline_xattr_value_outside(struct inode *inode,
57 struct ocfs2_dinode *di); 57 struct ocfs2_dinode *di);
58int ocfs2_xattr_remove(struct inode *, struct buffer_head *); 58int ocfs2_xattr_remove(struct inode *, struct buffer_head *);
59int ocfs2_init_security_get(struct inode *, struct inode *, 59int ocfs2_init_security_get(struct inode *, struct inode *,
60 const struct qstr *,
60 struct ocfs2_security_xattr_info *); 61 struct ocfs2_security_xattr_info *);
61int ocfs2_init_security_set(handle_t *, struct inode *, 62int ocfs2_init_security_set(handle_t *, struct inode *,
62 struct buffer_head *, 63 struct buffer_head *,
@@ -94,5 +95,6 @@ int ocfs2_reflink_xattrs(struct inode *old_inode,
94 struct buffer_head *new_bh, 95 struct buffer_head *new_bh,
95 bool preserve_security); 96 bool preserve_security);
96int ocfs2_init_security_and_acl(struct inode *dir, 97int ocfs2_init_security_and_acl(struct inode *dir,
97 struct inode *inode); 98 struct inode *inode,
99 const struct qstr *qstr);
98#endif /* OCFS2_XATTR_H */ 100#endif /* OCFS2_XATTR_H */
diff --git a/fs/omfs/dir.c b/fs/omfs/dir.c
index 393f3f659da7..de4ff29f1e05 100644
--- a/fs/omfs/dir.c
+++ b/fs/omfs/dir.c
@@ -235,33 +235,22 @@ static int omfs_dir_is_empty(struct inode *inode)
235 return *ptr != ~0; 235 return *ptr != ~0;
236} 236}
237 237
238static int omfs_unlink(struct inode *dir, struct dentry *dentry) 238static int omfs_remove(struct inode *dir, struct dentry *dentry)
239{ 239{
240 int ret;
241 struct inode *inode = dentry->d_inode; 240 struct inode *inode = dentry->d_inode;
241 int ret;
242
243 if (S_ISDIR(inode->i_mode) && !omfs_dir_is_empty(inode))
244 return -ENOTEMPTY;
242 245
243 ret = omfs_delete_entry(dentry); 246 ret = omfs_delete_entry(dentry);
244 if (ret) 247 if (ret)
245 goto end_unlink; 248 return ret;
246 249
247 inode_dec_link_count(inode); 250 clear_nlink(inode);
251 mark_inode_dirty(inode);
248 mark_inode_dirty(dir); 252 mark_inode_dirty(dir);
249 253 return 0;
250end_unlink:
251 return ret;
252}
253
254static int omfs_rmdir(struct inode *dir, struct dentry *dentry)
255{
256 int err = -ENOTEMPTY;
257 struct inode *inode = dentry->d_inode;
258
259 if (omfs_dir_is_empty(inode)) {
260 err = omfs_unlink(dir, dentry);
261 if (!err)
262 inode_dec_link_count(inode);
263 }
264 return err;
265} 254}
266 255
267static int omfs_add_node(struct inode *dir, struct dentry *dentry, int mode) 256static int omfs_add_node(struct inode *dir, struct dentry *dentry, int mode)
@@ -372,9 +361,10 @@ static int omfs_fill_chain(struct file *filp, void *dirent, filldir_t filldir,
372 361
373 res = filldir(dirent, oi->i_name, strnlen(oi->i_name, 362 res = filldir(dirent, oi->i_name, strnlen(oi->i_name,
374 OMFS_NAMELEN), filp->f_pos, self, d_type); 363 OMFS_NAMELEN), filp->f_pos, self, d_type);
375 if (res == 0)
376 filp->f_pos++;
377 brelse(bh); 364 brelse(bh);
365 if (res < 0)
366 break;
367 filp->f_pos++;
378 } 368 }
379out: 369out:
380 return res; 370 return res;
@@ -385,44 +375,28 @@ static int omfs_rename(struct inode *old_dir, struct dentry *old_dentry,
385{ 375{
386 struct inode *new_inode = new_dentry->d_inode; 376 struct inode *new_inode = new_dentry->d_inode;
387 struct inode *old_inode = old_dentry->d_inode; 377 struct inode *old_inode = old_dentry->d_inode;
388 struct buffer_head *bh;
389 int is_dir;
390 int err; 378 int err;
391 379
392 is_dir = S_ISDIR(old_inode->i_mode);
393
394 if (new_inode) { 380 if (new_inode) {
395 /* overwriting existing file/dir */ 381 /* overwriting existing file/dir */
396 err = -ENOTEMPTY; 382 err = omfs_remove(new_dir, new_dentry);
397 if (is_dir && !omfs_dir_is_empty(new_inode))
398 goto out;
399
400 err = -ENOENT;
401 bh = omfs_find_entry(new_dir, new_dentry->d_name.name,
402 new_dentry->d_name.len);
403 if (IS_ERR(bh))
404 goto out;
405 brelse(bh);
406
407 err = omfs_unlink(new_dir, new_dentry);
408 if (err) 383 if (err)
409 goto out; 384 goto out;
410 } 385 }
411 386
412 /* since omfs locates files by name, we need to unlink _before_ 387 /* since omfs locates files by name, we need to unlink _before_
413 * adding the new link or we won't find the old one */ 388 * adding the new link or we won't find the old one */
414 inode_inc_link_count(old_inode); 389 err = omfs_delete_entry(old_dentry);
415 err = omfs_unlink(old_dir, old_dentry); 390 if (err)
416 if (err) {
417 inode_dec_link_count(old_inode);
418 goto out; 391 goto out;
419 }
420 392
393 mark_inode_dirty(old_dir);
421 err = omfs_add_link(new_dentry, old_inode); 394 err = omfs_add_link(new_dentry, old_inode);
422 if (err) 395 if (err)
423 goto out; 396 goto out;
424 397
425 old_inode->i_ctime = CURRENT_TIME_SEC; 398 old_inode->i_ctime = CURRENT_TIME_SEC;
399 mark_inode_dirty(old_inode);
426out: 400out:
427 return err; 401 return err;
428} 402}
@@ -488,8 +462,8 @@ const struct inode_operations omfs_dir_inops = {
488 .mkdir = omfs_mkdir, 462 .mkdir = omfs_mkdir,
489 .rename = omfs_rename, 463 .rename = omfs_rename,
490 .create = omfs_create, 464 .create = omfs_create,
491 .unlink = omfs_unlink, 465 .unlink = omfs_remove,
492 .rmdir = omfs_rmdir, 466 .rmdir = omfs_remove,
493}; 467};
494 468
495const struct file_operations omfs_dir_operations = { 469const struct file_operations omfs_dir_operations = {
diff --git a/fs/omfs/file.c b/fs/omfs/file.c
index 8a6d34fa668a..d738a7e493dd 100644
--- a/fs/omfs/file.c
+++ b/fs/omfs/file.c
@@ -372,7 +372,6 @@ const struct address_space_operations omfs_aops = {
372 .readpages = omfs_readpages, 372 .readpages = omfs_readpages,
373 .writepage = omfs_writepage, 373 .writepage = omfs_writepage,
374 .writepages = omfs_writepages, 374 .writepages = omfs_writepages,
375 .sync_page = block_sync_page,
376 .write_begin = omfs_write_begin, 375 .write_begin = omfs_write_begin,
377 .write_end = generic_write_end, 376 .write_end = generic_write_end,
378 .bmap = omfs_bmap, 377 .bmap = omfs_bmap,
diff --git a/fs/open.c b/fs/open.c
index e52389e1f05b..b52cf013ffa1 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -233,6 +233,14 @@ int do_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
233 233
234 if (!(file->f_mode & FMODE_WRITE)) 234 if (!(file->f_mode & FMODE_WRITE))
235 return -EBADF; 235 return -EBADF;
236
237 /* It's not possible punch hole on append only file */
238 if (mode & FALLOC_FL_PUNCH_HOLE && IS_APPEND(inode))
239 return -EPERM;
240
241 if (IS_IMMUTABLE(inode))
242 return -EPERM;
243
236 /* 244 /*
237 * Revalidate the write permissions, in case security policy has 245 * Revalidate the write permissions, in case security policy has
238 * changed since the files were opened. 246 * changed since the files were opened.
@@ -565,13 +573,15 @@ SYSCALL_DEFINE5(fchownat, int, dfd, const char __user *, filename, uid_t, user,
565{ 573{
566 struct path path; 574 struct path path;
567 int error = -EINVAL; 575 int error = -EINVAL;
568 int follow; 576 int lookup_flags;
569 577
570 if ((flag & ~AT_SYMLINK_NOFOLLOW) != 0) 578 if ((flag & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0)
571 goto out; 579 goto out;
572 580
573 follow = (flag & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW; 581 lookup_flags = (flag & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW;
574 error = user_path_at(dfd, filename, follow, &path); 582 if (flag & AT_EMPTY_PATH)
583 lookup_flags |= LOOKUP_EMPTY;
584 error = user_path_at(dfd, filename, lookup_flags, &path);
575 if (error) 585 if (error)
576 goto out; 586 goto out;
577 error = mnt_want_write(path.mnt); 587 error = mnt_want_write(path.mnt);
@@ -661,11 +671,16 @@ static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt,
661 int (*open)(struct inode *, struct file *), 671 int (*open)(struct inode *, struct file *),
662 const struct cred *cred) 672 const struct cred *cred)
663{ 673{
674 static const struct file_operations empty_fops = {};
664 struct inode *inode; 675 struct inode *inode;
665 int error; 676 int error;
666 677
667 f->f_mode = OPEN_FMODE(f->f_flags) | FMODE_LSEEK | 678 f->f_mode = OPEN_FMODE(f->f_flags) | FMODE_LSEEK |
668 FMODE_PREAD | FMODE_PWRITE; 679 FMODE_PREAD | FMODE_PWRITE;
680
681 if (unlikely(f->f_flags & O_PATH))
682 f->f_mode = FMODE_PATH;
683
669 inode = dentry->d_inode; 684 inode = dentry->d_inode;
670 if (f->f_mode & FMODE_WRITE) { 685 if (f->f_mode & FMODE_WRITE) {
671 error = __get_file_write_access(inode, mnt); 686 error = __get_file_write_access(inode, mnt);
@@ -679,9 +694,15 @@ static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt,
679 f->f_path.dentry = dentry; 694 f->f_path.dentry = dentry;
680 f->f_path.mnt = mnt; 695 f->f_path.mnt = mnt;
681 f->f_pos = 0; 696 f->f_pos = 0;
682 f->f_op = fops_get(inode->i_fop);
683 file_sb_list_add(f, inode->i_sb); 697 file_sb_list_add(f, inode->i_sb);
684 698
699 if (unlikely(f->f_mode & FMODE_PATH)) {
700 f->f_op = &empty_fops;
701 return f;
702 }
703
704 f->f_op = fops_get(inode->i_fop);
705
685 error = security_dentry_open(f, cred); 706 error = security_dentry_open(f, cred);
686 if (error) 707 if (error)
687 goto cleanup_all; 708 goto cleanup_all;
@@ -693,7 +714,8 @@ static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt,
693 if (error) 714 if (error)
694 goto cleanup_all; 715 goto cleanup_all;
695 } 716 }
696 ima_counts_get(f); 717 if ((f->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ)
718 i_readcount_inc(inode);
697 719
698 f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC); 720 f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC);
699 721
@@ -790,6 +812,8 @@ struct file *nameidata_to_filp(struct nameidata *nd)
790 812
791 /* Pick up the filp from the open intent */ 813 /* Pick up the filp from the open intent */
792 filp = nd->intent.open.file; 814 filp = nd->intent.open.file;
815 nd->intent.open.file = NULL;
816
793 /* Has the filesystem initialised the file for us? */ 817 /* Has the filesystem initialised the file for us? */
794 if (filp->f_path.dentry == NULL) { 818 if (filp->f_path.dentry == NULL) {
795 path_get(&nd->path); 819 path_get(&nd->path);
@@ -811,17 +835,8 @@ struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags,
811 835
812 validate_creds(cred); 836 validate_creds(cred);
813 837
814 /* 838 /* We must always pass in a valid mount pointer. */
815 * We must always pass in a valid mount pointer. Historically 839 BUG_ON(!mnt);
816 * callers got away with not passing it, but we must enforce this at
817 * the earliest possible point now to avoid strange problems deep in the
818 * filesystem stack.
819 */
820 if (!mnt) {
821 printk(KERN_WARNING "%s called with NULL vfsmount\n", __func__);
822 dump_stack();
823 return ERR_PTR(-EINVAL);
824 }
825 840
826 error = -ENFILE; 841 error = -ENFILE;
827 f = get_empty_filp(); 842 f = get_empty_filp();
@@ -880,15 +895,110 @@ void fd_install(unsigned int fd, struct file *file)
880 895
881EXPORT_SYMBOL(fd_install); 896EXPORT_SYMBOL(fd_install);
882 897
898static inline int build_open_flags(int flags, int mode, struct open_flags *op)
899{
900 int lookup_flags = 0;
901 int acc_mode;
902
903 if (!(flags & O_CREAT))
904 mode = 0;
905 op->mode = mode;
906
907 /* Must never be set by userspace */
908 flags &= ~FMODE_NONOTIFY;
909
910 /*
911 * O_SYNC is implemented as __O_SYNC|O_DSYNC. As many places only
912 * check for O_DSYNC if the need any syncing at all we enforce it's
913 * always set instead of having to deal with possibly weird behaviour
914 * for malicious applications setting only __O_SYNC.
915 */
916 if (flags & __O_SYNC)
917 flags |= O_DSYNC;
918
919 /*
920 * If we have O_PATH in the open flag. Then we
921 * cannot have anything other than the below set of flags
922 */
923 if (flags & O_PATH) {
924 flags &= O_DIRECTORY | O_NOFOLLOW | O_PATH;
925 acc_mode = 0;
926 } else {
927 acc_mode = MAY_OPEN | ACC_MODE(flags);
928 }
929
930 op->open_flag = flags;
931
932 /* O_TRUNC implies we need access checks for write permissions */
933 if (flags & O_TRUNC)
934 acc_mode |= MAY_WRITE;
935
936 /* Allow the LSM permission hook to distinguish append
937 access from general write access. */
938 if (flags & O_APPEND)
939 acc_mode |= MAY_APPEND;
940
941 op->acc_mode = acc_mode;
942
943 op->intent = flags & O_PATH ? 0 : LOOKUP_OPEN;
944
945 if (flags & O_CREAT) {
946 op->intent |= LOOKUP_CREATE;
947 if (flags & O_EXCL)
948 op->intent |= LOOKUP_EXCL;
949 }
950
951 if (flags & O_DIRECTORY)
952 lookup_flags |= LOOKUP_DIRECTORY;
953 if (!(flags & O_NOFOLLOW))
954 lookup_flags |= LOOKUP_FOLLOW;
955 return lookup_flags;
956}
957
958/**
959 * filp_open - open file and return file pointer
960 *
961 * @filename: path to open
962 * @flags: open flags as per the open(2) second argument
963 * @mode: mode for the new file if O_CREAT is set, else ignored
964 *
965 * This is the helper to open a file from kernelspace if you really
966 * have to. But in generally you should not do this, so please move
967 * along, nothing to see here..
968 */
969struct file *filp_open(const char *filename, int flags, int mode)
970{
971 struct open_flags op;
972 int lookup = build_open_flags(flags, mode, &op);
973 return do_filp_open(AT_FDCWD, filename, &op, lookup);
974}
975EXPORT_SYMBOL(filp_open);
976
977struct file *file_open_root(struct dentry *dentry, struct vfsmount *mnt,
978 const char *filename, int flags)
979{
980 struct open_flags op;
981 int lookup = build_open_flags(flags, 0, &op);
982 if (flags & O_CREAT)
983 return ERR_PTR(-EINVAL);
984 if (!filename && (flags & O_DIRECTORY))
985 if (!dentry->d_inode->i_op->lookup)
986 return ERR_PTR(-ENOTDIR);
987 return do_file_open_root(dentry, mnt, filename, &op, lookup);
988}
989EXPORT_SYMBOL(file_open_root);
990
883long do_sys_open(int dfd, const char __user *filename, int flags, int mode) 991long do_sys_open(int dfd, const char __user *filename, int flags, int mode)
884{ 992{
993 struct open_flags op;
994 int lookup = build_open_flags(flags, mode, &op);
885 char *tmp = getname(filename); 995 char *tmp = getname(filename);
886 int fd = PTR_ERR(tmp); 996 int fd = PTR_ERR(tmp);
887 997
888 if (!IS_ERR(tmp)) { 998 if (!IS_ERR(tmp)) {
889 fd = get_unused_fd_flags(flags); 999 fd = get_unused_fd_flags(flags);
890 if (fd >= 0) { 1000 if (fd >= 0) {
891 struct file *f = do_filp_open(dfd, tmp, flags, mode, 0); 1001 struct file *f = do_filp_open(dfd, tmp, &op, lookup);
892 if (IS_ERR(f)) { 1002 if (IS_ERR(f)) {
893 put_unused_fd(fd); 1003 put_unused_fd(fd);
894 fd = PTR_ERR(f); 1004 fd = PTR_ERR(f);
@@ -958,8 +1068,10 @@ int filp_close(struct file *filp, fl_owner_t id)
958 if (filp->f_op && filp->f_op->flush) 1068 if (filp->f_op && filp->f_op->flush)
959 retval = filp->f_op->flush(filp, id); 1069 retval = filp->f_op->flush(filp, id);
960 1070
961 dnotify_flush(filp, id); 1071 if (likely(!(filp->f_mode & FMODE_PATH))) {
962 locks_remove_posix(filp, id); 1072 dnotify_flush(filp, id);
1073 locks_remove_posix(filp, id);
1074 }
963 fput(filp); 1075 fput(filp);
964 return retval; 1076 return retval;
965} 1077}
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 9c21119512b9..ac546975031f 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -290,7 +290,8 @@ ssize_t part_inflight_show(struct device *dev,
290{ 290{
291 struct hd_struct *p = dev_to_part(dev); 291 struct hd_struct *p = dev_to_part(dev);
292 292
293 return sprintf(buf, "%8u %8u\n", p->in_flight[0], p->in_flight[1]); 293 return sprintf(buf, "%8u %8u\n", atomic_read(&p->in_flight[0]),
294 atomic_read(&p->in_flight[1]));
294} 295}
295 296
296#ifdef CONFIG_FAIL_MAKE_REQUEST 297#ifdef CONFIG_FAIL_MAKE_REQUEST
diff --git a/fs/partitions/ldm.c b/fs/partitions/ldm.c
index 789c625c7aa5..b10e3540d5b7 100644
--- a/fs/partitions/ldm.c
+++ b/fs/partitions/ldm.c
@@ -251,6 +251,11 @@ static bool ldm_parse_vmdb (const u8 *data, struct vmdb *vm)
251 } 251 }
252 252
253 vm->vblk_size = get_unaligned_be32(data + 0x08); 253 vm->vblk_size = get_unaligned_be32(data + 0x08);
254 if (vm->vblk_size == 0) {
255 ldm_error ("Illegal VBLK size");
256 return false;
257 }
258
254 vm->vblk_offset = get_unaligned_be32(data + 0x0C); 259 vm->vblk_offset = get_unaligned_be32(data + 0x0C);
255 vm->last_vblk_seq = get_unaligned_be32(data + 0x04); 260 vm->last_vblk_seq = get_unaligned_be32(data + 0x04);
256 261
diff --git a/fs/partitions/mac.c b/fs/partitions/mac.c
index 68d6a216ee79..11f688bd76c5 100644
--- a/fs/partitions/mac.c
+++ b/fs/partitions/mac.c
@@ -29,10 +29,9 @@ static inline void mac_fix_string(char *stg, int len)
29 29
30int mac_partition(struct parsed_partitions *state) 30int mac_partition(struct parsed_partitions *state)
31{ 31{
32 int slot = 1;
33 Sector sect; 32 Sector sect;
34 unsigned char *data; 33 unsigned char *data;
35 int blk, blocks_in_map; 34 int slot, blocks_in_map;
36 unsigned secsize; 35 unsigned secsize;
37#ifdef CONFIG_PPC_PMAC 36#ifdef CONFIG_PPC_PMAC
38 int found_root = 0; 37 int found_root = 0;
@@ -59,10 +58,14 @@ int mac_partition(struct parsed_partitions *state)
59 put_dev_sector(sect); 58 put_dev_sector(sect);
60 return 0; /* not a MacOS disk */ 59 return 0; /* not a MacOS disk */
61 } 60 }
62 strlcat(state->pp_buf, " [mac]", PAGE_SIZE);
63 blocks_in_map = be32_to_cpu(part->map_count); 61 blocks_in_map = be32_to_cpu(part->map_count);
64 for (blk = 1; blk <= blocks_in_map; ++blk) { 62 if (blocks_in_map < 0 || blocks_in_map >= DISK_MAX_PARTS) {
65 int pos = blk * secsize; 63 put_dev_sector(sect);
64 return 0;
65 }
66 strlcat(state->pp_buf, " [mac]", PAGE_SIZE);
67 for (slot = 1; slot <= blocks_in_map; ++slot) {
68 int pos = slot * secsize;
66 put_dev_sector(sect); 69 put_dev_sector(sect);
67 data = read_part_sector(state, pos/512, &sect); 70 data = read_part_sector(state, pos/512, &sect);
68 if (!data) 71 if (!data)
@@ -113,13 +116,11 @@ int mac_partition(struct parsed_partitions *state)
113 } 116 }
114 117
115 if (goodness > found_root_goodness) { 118 if (goodness > found_root_goodness) {
116 found_root = blk; 119 found_root = slot;
117 found_root_goodness = goodness; 120 found_root_goodness = goodness;
118 } 121 }
119 } 122 }
120#endif /* CONFIG_PPC_PMAC */ 123#endif /* CONFIG_PPC_PMAC */
121
122 ++slot;
123 } 124 }
124#ifdef CONFIG_PPC_PMAC 125#ifdef CONFIG_PPC_PMAC
125 if (found_root_goodness) 126 if (found_root_goodness)
diff --git a/fs/partitions/osf.c b/fs/partitions/osf.c
index 48cec7cbca17..764b86a01965 100644
--- a/fs/partitions/osf.c
+++ b/fs/partitions/osf.c
@@ -10,10 +10,13 @@
10#include "check.h" 10#include "check.h"
11#include "osf.h" 11#include "osf.h"
12 12
13#define MAX_OSF_PARTITIONS 18
14
13int osf_partition(struct parsed_partitions *state) 15int osf_partition(struct parsed_partitions *state)
14{ 16{
15 int i; 17 int i;
16 int slot = 1; 18 int slot = 1;
19 unsigned int npartitions;
17 Sector sect; 20 Sector sect;
18 unsigned char *data; 21 unsigned char *data;
19 struct disklabel { 22 struct disklabel {
@@ -45,7 +48,7 @@ int osf_partition(struct parsed_partitions *state)
45 u8 p_fstype; 48 u8 p_fstype;
46 u8 p_frag; 49 u8 p_frag;
47 __le16 p_cpg; 50 __le16 p_cpg;
48 } d_partitions[8]; 51 } d_partitions[MAX_OSF_PARTITIONS];
49 } * label; 52 } * label;
50 struct d_partition * partition; 53 struct d_partition * partition;
51 54
@@ -63,7 +66,12 @@ int osf_partition(struct parsed_partitions *state)
63 put_dev_sector(sect); 66 put_dev_sector(sect);
64 return 0; 67 return 0;
65 } 68 }
66 for (i = 0 ; i < le16_to_cpu(label->d_npartitions); i++, partition++) { 69 npartitions = le16_to_cpu(label->d_npartitions);
70 if (npartitions > MAX_OSF_PARTITIONS) {
71 put_dev_sector(sect);
72 return 0;
73 }
74 for (i = 0 ; i < npartitions; i++, partition++) {
67 if (slot == state->limit) 75 if (slot == state->limit)
68 break; 76 break;
69 if (le32_to_cpu(partition->p_size)) 77 if (le32_to_cpu(partition->p_size))
diff --git a/fs/posix_acl.c b/fs/posix_acl.c
index 39df95a0ec25..b1cf6bf4b41d 100644
--- a/fs/posix_acl.c
+++ b/fs/posix_acl.c
@@ -22,6 +22,7 @@
22 22
23#include <linux/errno.h> 23#include <linux/errno.h>
24 24
25EXPORT_SYMBOL(posix_acl_init);
25EXPORT_SYMBOL(posix_acl_alloc); 26EXPORT_SYMBOL(posix_acl_alloc);
26EXPORT_SYMBOL(posix_acl_clone); 27EXPORT_SYMBOL(posix_acl_clone);
27EXPORT_SYMBOL(posix_acl_valid); 28EXPORT_SYMBOL(posix_acl_valid);
@@ -32,6 +33,16 @@ EXPORT_SYMBOL(posix_acl_chmod_masq);
32EXPORT_SYMBOL(posix_acl_permission); 33EXPORT_SYMBOL(posix_acl_permission);
33 34
34/* 35/*
36 * Init a fresh posix_acl
37 */
38void
39posix_acl_init(struct posix_acl *acl, int count)
40{
41 atomic_set(&acl->a_refcount, 1);
42 acl->a_count = count;
43}
44
45/*
35 * Allocate a new ACL with the specified number of entries. 46 * Allocate a new ACL with the specified number of entries.
36 */ 47 */
37struct posix_acl * 48struct posix_acl *
@@ -40,10 +51,8 @@ posix_acl_alloc(int count, gfp_t flags)
40 const size_t size = sizeof(struct posix_acl) + 51 const size_t size = sizeof(struct posix_acl) +
41 count * sizeof(struct posix_acl_entry); 52 count * sizeof(struct posix_acl_entry);
42 struct posix_acl *acl = kmalloc(size, flags); 53 struct posix_acl *acl = kmalloc(size, flags);
43 if (acl) { 54 if (acl)
44 atomic_set(&acl->a_refcount, 1); 55 posix_acl_init(acl, count);
45 acl->a_count = count;
46 }
47 return acl; 56 return acl;
48} 57}
49 58
diff --git a/fs/proc/array.c b/fs/proc/array.c
index df2b703b9d0f..5e4f776b0917 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -353,9 +353,6 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
353 task_cap(m, task); 353 task_cap(m, task);
354 task_cpus_allowed(m, task); 354 task_cpus_allowed(m, task);
355 cpuset_task_status_allowed(m, task); 355 cpuset_task_status_allowed(m, task);
356#if defined(CONFIG_S390)
357 task_show_regs(m, task);
358#endif
359 task_context_switch_counts(m, task); 356 task_context_switch_counts(m, task);
360 return 0; 357 return 0;
361} 358}
@@ -492,8 +489,8 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
492 vsize, 489 vsize,
493 mm ? get_mm_rss(mm) : 0, 490 mm ? get_mm_rss(mm) : 0,
494 rsslim, 491 rsslim,
495 mm ? mm->start_code : 0, 492 mm ? (permitted ? mm->start_code : 1) : 0,
496 mm ? mm->end_code : 0, 493 mm ? (permitted ? mm->end_code : 1) : 0,
497 (permitted && mm) ? mm->start_stack : 0, 494 (permitted && mm) ? mm->start_stack : 0,
498 esp, 495 esp,
499 eip, 496 eip,
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 9d096e82b201..5a670c11aeac 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -191,17 +191,20 @@ static int proc_root_link(struct inode *inode, struct path *path)
191 return result; 191 return result;
192} 192}
193 193
194/* 194static struct mm_struct *__check_mem_permission(struct task_struct *task)
195 * Return zero if current may access user memory in @task, -error if not.
196 */
197static int check_mem_permission(struct task_struct *task)
198{ 195{
196 struct mm_struct *mm;
197
198 mm = get_task_mm(task);
199 if (!mm)
200 return ERR_PTR(-EINVAL);
201
199 /* 202 /*
200 * A task can always look at itself, in case it chooses 203 * A task can always look at itself, in case it chooses
201 * to use system calls instead of load instructions. 204 * to use system calls instead of load instructions.
202 */ 205 */
203 if (task == current) 206 if (task == current)
204 return 0; 207 return mm;
205 208
206 /* 209 /*
207 * If current is actively ptrace'ing, and would also be 210 * If current is actively ptrace'ing, and would also be
@@ -213,27 +216,53 @@ static int check_mem_permission(struct task_struct *task)
213 match = (tracehook_tracer_task(task) == current); 216 match = (tracehook_tracer_task(task) == current);
214 rcu_read_unlock(); 217 rcu_read_unlock();
215 if (match && ptrace_may_access(task, PTRACE_MODE_ATTACH)) 218 if (match && ptrace_may_access(task, PTRACE_MODE_ATTACH))
216 return 0; 219 return mm;
217 } 220 }
218 221
219 /* 222 /*
220 * Noone else is allowed. 223 * Noone else is allowed.
221 */ 224 */
222 return -EPERM; 225 mmput(mm);
226 return ERR_PTR(-EPERM);
227}
228
229/*
230 * If current may access user memory in @task return a reference to the
231 * corresponding mm, otherwise ERR_PTR.
232 */
233static struct mm_struct *check_mem_permission(struct task_struct *task)
234{
235 struct mm_struct *mm;
236 int err;
237
238 /*
239 * Avoid racing if task exec's as we might get a new mm but validate
240 * against old credentials.
241 */
242 err = mutex_lock_killable(&task->signal->cred_guard_mutex);
243 if (err)
244 return ERR_PTR(err);
245
246 mm = __check_mem_permission(task);
247 mutex_unlock(&task->signal->cred_guard_mutex);
248
249 return mm;
223} 250}
224 251
225struct mm_struct *mm_for_maps(struct task_struct *task) 252struct mm_struct *mm_for_maps(struct task_struct *task)
226{ 253{
227 struct mm_struct *mm; 254 struct mm_struct *mm;
255 int err;
228 256
229 if (mutex_lock_killable(&task->signal->cred_guard_mutex)) 257 err = mutex_lock_killable(&task->signal->cred_guard_mutex);
230 return NULL; 258 if (err)
259 return ERR_PTR(err);
231 260
232 mm = get_task_mm(task); 261 mm = get_task_mm(task);
233 if (mm && mm != current->mm && 262 if (mm && mm != current->mm &&
234 !ptrace_may_access(task, PTRACE_MODE_READ)) { 263 !ptrace_may_access(task, PTRACE_MODE_READ)) {
235 mmput(mm); 264 mmput(mm);
236 mm = NULL; 265 mm = ERR_PTR(-EACCES);
237 } 266 }
238 mutex_unlock(&task->signal->cred_guard_mutex); 267 mutex_unlock(&task->signal->cred_guard_mutex);
239 268
@@ -279,9 +308,9 @@ out:
279 308
280static int proc_pid_auxv(struct task_struct *task, char *buffer) 309static int proc_pid_auxv(struct task_struct *task, char *buffer)
281{ 310{
282 int res = 0; 311 struct mm_struct *mm = mm_for_maps(task);
283 struct mm_struct *mm = get_task_mm(task); 312 int res = PTR_ERR(mm);
284 if (mm) { 313 if (mm && !IS_ERR(mm)) {
285 unsigned int nwords = 0; 314 unsigned int nwords = 0;
286 do { 315 do {
287 nwords += 2; 316 nwords += 2;
@@ -318,6 +347,23 @@ static int proc_pid_wchan(struct task_struct *task, char *buffer)
318} 347}
319#endif /* CONFIG_KALLSYMS */ 348#endif /* CONFIG_KALLSYMS */
320 349
350static int lock_trace(struct task_struct *task)
351{
352 int err = mutex_lock_killable(&task->signal->cred_guard_mutex);
353 if (err)
354 return err;
355 if (!ptrace_may_access(task, PTRACE_MODE_ATTACH)) {
356 mutex_unlock(&task->signal->cred_guard_mutex);
357 return -EPERM;
358 }
359 return 0;
360}
361
362static void unlock_trace(struct task_struct *task)
363{
364 mutex_unlock(&task->signal->cred_guard_mutex);
365}
366
321#ifdef CONFIG_STACKTRACE 367#ifdef CONFIG_STACKTRACE
322 368
323#define MAX_STACK_TRACE_DEPTH 64 369#define MAX_STACK_TRACE_DEPTH 64
@@ -327,6 +373,7 @@ static int proc_pid_stack(struct seq_file *m, struct pid_namespace *ns,
327{ 373{
328 struct stack_trace trace; 374 struct stack_trace trace;
329 unsigned long *entries; 375 unsigned long *entries;
376 int err;
330 int i; 377 int i;
331 378
332 entries = kmalloc(MAX_STACK_TRACE_DEPTH * sizeof(*entries), GFP_KERNEL); 379 entries = kmalloc(MAX_STACK_TRACE_DEPTH * sizeof(*entries), GFP_KERNEL);
@@ -337,15 +384,20 @@ static int proc_pid_stack(struct seq_file *m, struct pid_namespace *ns,
337 trace.max_entries = MAX_STACK_TRACE_DEPTH; 384 trace.max_entries = MAX_STACK_TRACE_DEPTH;
338 trace.entries = entries; 385 trace.entries = entries;
339 trace.skip = 0; 386 trace.skip = 0;
340 save_stack_trace_tsk(task, &trace);
341 387
342 for (i = 0; i < trace.nr_entries; i++) { 388 err = lock_trace(task);
343 seq_printf(m, "[<%p>] %pS\n", 389 if (!err) {
344 (void *)entries[i], (void *)entries[i]); 390 save_stack_trace_tsk(task, &trace);
391
392 for (i = 0; i < trace.nr_entries; i++) {
393 seq_printf(m, "[<%pK>] %pS\n",
394 (void *)entries[i], (void *)entries[i]);
395 }
396 unlock_trace(task);
345 } 397 }
346 kfree(entries); 398 kfree(entries);
347 399
348 return 0; 400 return err;
349} 401}
350#endif 402#endif
351 403
@@ -508,18 +560,22 @@ static int proc_pid_syscall(struct task_struct *task, char *buffer)
508{ 560{
509 long nr; 561 long nr;
510 unsigned long args[6], sp, pc; 562 unsigned long args[6], sp, pc;
563 int res = lock_trace(task);
564 if (res)
565 return res;
511 566
512 if (task_current_syscall(task, &nr, args, 6, &sp, &pc)) 567 if (task_current_syscall(task, &nr, args, 6, &sp, &pc))
513 return sprintf(buffer, "running\n"); 568 res = sprintf(buffer, "running\n");
514 569 else if (nr < 0)
515 if (nr < 0) 570 res = sprintf(buffer, "%ld 0x%lx 0x%lx\n", nr, sp, pc);
516 return sprintf(buffer, "%ld 0x%lx 0x%lx\n", nr, sp, pc); 571 else
517 572 res = sprintf(buffer,
518 return sprintf(buffer,
519 "%ld 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx\n", 573 "%ld 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx\n",
520 nr, 574 nr,
521 args[0], args[1], args[2], args[3], args[4], args[5], 575 args[0], args[1], args[2], args[3], args[4], args[5],
522 sp, pc); 576 sp, pc);
577 unlock_trace(task);
578 return res;
523} 579}
524#endif /* CONFIG_HAVE_ARCH_TRACEHOOK */ 580#endif /* CONFIG_HAVE_ARCH_TRACEHOOK */
525 581
@@ -775,18 +831,14 @@ static ssize_t mem_read(struct file * file, char __user * buf,
775 if (!task) 831 if (!task)
776 goto out_no_task; 832 goto out_no_task;
777 833
778 if (check_mem_permission(task))
779 goto out;
780
781 ret = -ENOMEM; 834 ret = -ENOMEM;
782 page = (char *)__get_free_page(GFP_TEMPORARY); 835 page = (char *)__get_free_page(GFP_TEMPORARY);
783 if (!page) 836 if (!page)
784 goto out; 837 goto out;
785 838
786 ret = 0; 839 mm = check_mem_permission(task);
787 840 ret = PTR_ERR(mm);
788 mm = get_task_mm(task); 841 if (IS_ERR(mm))
789 if (!mm)
790 goto out_free; 842 goto out_free;
791 843
792 ret = -EIO; 844 ret = -EIO;
@@ -800,8 +852,8 @@ static ssize_t mem_read(struct file * file, char __user * buf,
800 int this_len, retval; 852 int this_len, retval;
801 853
802 this_len = (count > PAGE_SIZE) ? PAGE_SIZE : count; 854 this_len = (count > PAGE_SIZE) ? PAGE_SIZE : count;
803 retval = access_process_vm(task, src, page, this_len, 0); 855 retval = access_remote_vm(mm, src, page, this_len, 0);
804 if (!retval || check_mem_permission(task)) { 856 if (!retval) {
805 if (!ret) 857 if (!ret)
806 ret = -EIO; 858 ret = -EIO;
807 break; 859 break;
@@ -829,10 +881,6 @@ out_no_task:
829 return ret; 881 return ret;
830} 882}
831 883
832#define mem_write NULL
833
834#ifndef mem_write
835/* This is a security hazard */
836static ssize_t mem_write(struct file * file, const char __user *buf, 884static ssize_t mem_write(struct file * file, const char __user *buf,
837 size_t count, loff_t *ppos) 885 size_t count, loff_t *ppos)
838{ 886{
@@ -840,18 +888,25 @@ static ssize_t mem_write(struct file * file, const char __user *buf,
840 char *page; 888 char *page;
841 struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode); 889 struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode);
842 unsigned long dst = *ppos; 890 unsigned long dst = *ppos;
891 struct mm_struct *mm;
843 892
844 copied = -ESRCH; 893 copied = -ESRCH;
845 if (!task) 894 if (!task)
846 goto out_no_task; 895 goto out_no_task;
847 896
848 if (check_mem_permission(task)) 897 mm = check_mem_permission(task);
849 goto out; 898 copied = PTR_ERR(mm);
899 if (IS_ERR(mm))
900 goto out_task;
901
902 copied = -EIO;
903 if (file->private_data != (void *)((long)current->self_exec_id))
904 goto out_mm;
850 905
851 copied = -ENOMEM; 906 copied = -ENOMEM;
852 page = (char *)__get_free_page(GFP_TEMPORARY); 907 page = (char *)__get_free_page(GFP_TEMPORARY);
853 if (!page) 908 if (!page)
854 goto out; 909 goto out_mm;
855 910
856 copied = 0; 911 copied = 0;
857 while (count > 0) { 912 while (count > 0) {
@@ -862,7 +917,7 @@ static ssize_t mem_write(struct file * file, const char __user *buf,
862 copied = -EFAULT; 917 copied = -EFAULT;
863 break; 918 break;
864 } 919 }
865 retval = access_process_vm(task, dst, page, this_len, 1); 920 retval = access_remote_vm(mm, dst, page, this_len, 1);
866 if (!retval) { 921 if (!retval) {
867 if (!copied) 922 if (!copied)
868 copied = -EIO; 923 copied = -EIO;
@@ -875,12 +930,13 @@ static ssize_t mem_write(struct file * file, const char __user *buf,
875 } 930 }
876 *ppos = dst; 931 *ppos = dst;
877 free_page((unsigned long) page); 932 free_page((unsigned long) page);
878out: 933out_mm:
934 mmput(mm);
935out_task:
879 put_task_struct(task); 936 put_task_struct(task);
880out_no_task: 937out_no_task:
881 return copied; 938 return copied;
882} 939}
883#endif
884 940
885loff_t mem_lseek(struct file *file, loff_t offset, int orig) 941loff_t mem_lseek(struct file *file, loff_t offset, int orig)
886{ 942{
@@ -917,20 +973,18 @@ static ssize_t environ_read(struct file *file, char __user *buf,
917 if (!task) 973 if (!task)
918 goto out_no_task; 974 goto out_no_task;
919 975
920 if (!ptrace_may_access(task, PTRACE_MODE_READ))
921 goto out;
922
923 ret = -ENOMEM; 976 ret = -ENOMEM;
924 page = (char *)__get_free_page(GFP_TEMPORARY); 977 page = (char *)__get_free_page(GFP_TEMPORARY);
925 if (!page) 978 if (!page)
926 goto out; 979 goto out;
927 980
928 ret = 0;
929 981
930 mm = get_task_mm(task); 982 mm = mm_for_maps(task);
931 if (!mm) 983 ret = PTR_ERR(mm);
984 if (!mm || IS_ERR(mm))
932 goto out_free; 985 goto out_free;
933 986
987 ret = 0;
934 while (count > 0) { 988 while (count > 0) {
935 int this_len, retval, max_len; 989 int this_len, retval, max_len;
936 990
@@ -2620,35 +2674,6 @@ static const struct pid_entry proc_base_stuff[] = {
2620 &proc_self_inode_operations, NULL, {}), 2674 &proc_self_inode_operations, NULL, {}),
2621}; 2675};
2622 2676
2623/*
2624 * Exceptional case: normally we are not allowed to unhash a busy
2625 * directory. In this case, however, we can do it - no aliasing problems
2626 * due to the way we treat inodes.
2627 */
2628static int proc_base_revalidate(struct dentry *dentry, struct nameidata *nd)
2629{
2630 struct inode *inode;
2631 struct task_struct *task;
2632
2633 if (nd->flags & LOOKUP_RCU)
2634 return -ECHILD;
2635
2636 inode = dentry->d_inode;
2637 task = get_proc_task(inode);
2638 if (task) {
2639 put_task_struct(task);
2640 return 1;
2641 }
2642 d_drop(dentry);
2643 return 0;
2644}
2645
2646static const struct dentry_operations proc_base_dentry_operations =
2647{
2648 .d_revalidate = proc_base_revalidate,
2649 .d_delete = pid_delete_dentry,
2650};
2651
2652static struct dentry *proc_base_instantiate(struct inode *dir, 2677static struct dentry *proc_base_instantiate(struct inode *dir,
2653 struct dentry *dentry, struct task_struct *task, const void *ptr) 2678 struct dentry *dentry, struct task_struct *task, const void *ptr)
2654{ 2679{
@@ -2685,7 +2710,6 @@ static struct dentry *proc_base_instantiate(struct inode *dir,
2685 if (p->fop) 2710 if (p->fop)
2686 inode->i_fop = p->fop; 2711 inode->i_fop = p->fop;
2687 ei->op = p->op; 2712 ei->op = p->op;
2688 d_set_d_op(dentry, &proc_base_dentry_operations);
2689 d_add(dentry, inode); 2713 d_add(dentry, inode);
2690 error = NULL; 2714 error = NULL;
2691out: 2715out:
@@ -2778,8 +2802,12 @@ static int proc_tgid_io_accounting(struct task_struct *task, char *buffer)
2778static int proc_pid_personality(struct seq_file *m, struct pid_namespace *ns, 2802static int proc_pid_personality(struct seq_file *m, struct pid_namespace *ns,
2779 struct pid *pid, struct task_struct *task) 2803 struct pid *pid, struct task_struct *task)
2780{ 2804{
2781 seq_printf(m, "%08x\n", task->personality); 2805 int err = lock_trace(task);
2782 return 0; 2806 if (!err) {
2807 seq_printf(m, "%08x\n", task->personality);
2808 unlock_trace(task);
2809 }
2810 return err;
2783} 2811}
2784 2812
2785/* 2813/*
@@ -2798,7 +2826,7 @@ static const struct pid_entry tgid_base_stuff[] = {
2798 REG("environ", S_IRUSR, proc_environ_operations), 2826 REG("environ", S_IRUSR, proc_environ_operations),
2799 INF("auxv", S_IRUSR, proc_pid_auxv), 2827 INF("auxv", S_IRUSR, proc_pid_auxv),
2800 ONE("status", S_IRUGO, proc_pid_status), 2828 ONE("status", S_IRUGO, proc_pid_status),
2801 ONE("personality", S_IRUSR, proc_pid_personality), 2829 ONE("personality", S_IRUGO, proc_pid_personality),
2802 INF("limits", S_IRUGO, proc_pid_limits), 2830 INF("limits", S_IRUGO, proc_pid_limits),
2803#ifdef CONFIG_SCHED_DEBUG 2831#ifdef CONFIG_SCHED_DEBUG
2804 REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), 2832 REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations),
@@ -2808,7 +2836,7 @@ static const struct pid_entry tgid_base_stuff[] = {
2808#endif 2836#endif
2809 REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations), 2837 REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations),
2810#ifdef CONFIG_HAVE_ARCH_TRACEHOOK 2838#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
2811 INF("syscall", S_IRUSR, proc_pid_syscall), 2839 INF("syscall", S_IRUGO, proc_pid_syscall),
2812#endif 2840#endif
2813 INF("cmdline", S_IRUGO, proc_pid_cmdline), 2841 INF("cmdline", S_IRUGO, proc_pid_cmdline),
2814 ONE("stat", S_IRUGO, proc_tgid_stat), 2842 ONE("stat", S_IRUGO, proc_tgid_stat),
@@ -2827,7 +2855,7 @@ static const struct pid_entry tgid_base_stuff[] = {
2827#ifdef CONFIG_PROC_PAGE_MONITOR 2855#ifdef CONFIG_PROC_PAGE_MONITOR
2828 REG("clear_refs", S_IWUSR, proc_clear_refs_operations), 2856 REG("clear_refs", S_IWUSR, proc_clear_refs_operations),
2829 REG("smaps", S_IRUGO, proc_smaps_operations), 2857 REG("smaps", S_IRUGO, proc_smaps_operations),
2830 REG("pagemap", S_IRUSR, proc_pagemap_operations), 2858 REG("pagemap", S_IRUGO, proc_pagemap_operations),
2831#endif 2859#endif
2832#ifdef CONFIG_SECURITY 2860#ifdef CONFIG_SECURITY
2833 DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations), 2861 DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations),
@@ -2836,7 +2864,7 @@ static const struct pid_entry tgid_base_stuff[] = {
2836 INF("wchan", S_IRUGO, proc_pid_wchan), 2864 INF("wchan", S_IRUGO, proc_pid_wchan),
2837#endif 2865#endif
2838#ifdef CONFIG_STACKTRACE 2866#ifdef CONFIG_STACKTRACE
2839 ONE("stack", S_IRUSR, proc_pid_stack), 2867 ONE("stack", S_IRUGO, proc_pid_stack),
2840#endif 2868#endif
2841#ifdef CONFIG_SCHEDSTATS 2869#ifdef CONFIG_SCHEDSTATS
2842 INF("schedstat", S_IRUGO, proc_pid_schedstat), 2870 INF("schedstat", S_IRUGO, proc_pid_schedstat),
@@ -3138,14 +3166,14 @@ static const struct pid_entry tid_base_stuff[] = {
3138 REG("environ", S_IRUSR, proc_environ_operations), 3166 REG("environ", S_IRUSR, proc_environ_operations),
3139 INF("auxv", S_IRUSR, proc_pid_auxv), 3167 INF("auxv", S_IRUSR, proc_pid_auxv),
3140 ONE("status", S_IRUGO, proc_pid_status), 3168 ONE("status", S_IRUGO, proc_pid_status),
3141 ONE("personality", S_IRUSR, proc_pid_personality), 3169 ONE("personality", S_IRUGO, proc_pid_personality),
3142 INF("limits", S_IRUGO, proc_pid_limits), 3170 INF("limits", S_IRUGO, proc_pid_limits),
3143#ifdef CONFIG_SCHED_DEBUG 3171#ifdef CONFIG_SCHED_DEBUG
3144 REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), 3172 REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations),
3145#endif 3173#endif
3146 REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations), 3174 REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations),
3147#ifdef CONFIG_HAVE_ARCH_TRACEHOOK 3175#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
3148 INF("syscall", S_IRUSR, proc_pid_syscall), 3176 INF("syscall", S_IRUGO, proc_pid_syscall),
3149#endif 3177#endif
3150 INF("cmdline", S_IRUGO, proc_pid_cmdline), 3178 INF("cmdline", S_IRUGO, proc_pid_cmdline),
3151 ONE("stat", S_IRUGO, proc_tid_stat), 3179 ONE("stat", S_IRUGO, proc_tid_stat),
@@ -3163,7 +3191,7 @@ static const struct pid_entry tid_base_stuff[] = {
3163#ifdef CONFIG_PROC_PAGE_MONITOR 3191#ifdef CONFIG_PROC_PAGE_MONITOR
3164 REG("clear_refs", S_IWUSR, proc_clear_refs_operations), 3192 REG("clear_refs", S_IWUSR, proc_clear_refs_operations),
3165 REG("smaps", S_IRUGO, proc_smaps_operations), 3193 REG("smaps", S_IRUGO, proc_smaps_operations),
3166 REG("pagemap", S_IRUSR, proc_pagemap_operations), 3194 REG("pagemap", S_IRUGO, proc_pagemap_operations),
3167#endif 3195#endif
3168#ifdef CONFIG_SECURITY 3196#ifdef CONFIG_SECURITY
3169 DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations), 3197 DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations),
@@ -3172,7 +3200,7 @@ static const struct pid_entry tid_base_stuff[] = {
3172 INF("wchan", S_IRUGO, proc_pid_wchan), 3200 INF("wchan", S_IRUGO, proc_pid_wchan),
3173#endif 3201#endif
3174#ifdef CONFIG_STACKTRACE 3202#ifdef CONFIG_STACKTRACE
3175 ONE("stack", S_IRUSR, proc_pid_stack), 3203 ONE("stack", S_IRUGO, proc_pid_stack),
3176#endif 3204#endif
3177#ifdef CONFIG_SCHEDSTATS 3205#ifdef CONFIG_SCHEDSTATS
3178 INF("schedstat", S_IRUGO, proc_pid_schedstat), 3206 INF("schedstat", S_IRUGO, proc_pid_schedstat),
@@ -3191,7 +3219,7 @@ static const struct pid_entry tid_base_stuff[] = {
3191 REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations), 3219 REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations),
3192#ifdef CONFIG_AUDITSYSCALL 3220#ifdef CONFIG_AUDITSYSCALL
3193 REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations), 3221 REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations),
3194 REG("sessionid", S_IRUSR, proc_sessionid_operations), 3222 REG("sessionid", S_IRUGO, proc_sessionid_operations),
3195#endif 3223#endif
3196#ifdef CONFIG_FAULT_INJECTION 3224#ifdef CONFIG_FAULT_INJECTION
3197 REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations), 3225 REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations),
diff --git a/fs/proc/consoles.c b/fs/proc/consoles.c
index eafc22ab1fdd..b701eaa482bf 100644
--- a/fs/proc/consoles.c
+++ b/fs/proc/consoles.c
@@ -67,7 +67,7 @@ static void *c_start(struct seq_file *m, loff_t *pos)
67 struct console *con; 67 struct console *con;
68 loff_t off = 0; 68 loff_t off = 0;
69 69
70 acquire_console_sem(); 70 console_lock();
71 for_each_console(con) 71 for_each_console(con)
72 if (off++ == *pos) 72 if (off++ == *pos)
73 break; 73 break;
@@ -84,7 +84,7 @@ static void *c_next(struct seq_file *m, void *v, loff_t *pos)
84 84
85static void c_stop(struct seq_file *m, void *v) 85static void c_stop(struct seq_file *m, void *v)
86{ 86{
87 release_console_sem(); 87 console_unlock();
88} 88}
89 89
90static const struct seq_operations consoles_op = { 90static const struct seq_operations consoles_op = {
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 01e07f2a188f..f1281339b6fa 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -28,7 +28,7 @@
28 28
29DEFINE_SPINLOCK(proc_subdir_lock); 29DEFINE_SPINLOCK(proc_subdir_lock);
30 30
31static int proc_match(int len, const char *name, struct proc_dir_entry *de) 31static int proc_match(unsigned int len, const char *name, struct proc_dir_entry *de)
32{ 32{
33 if (de->namelen != len) 33 if (de->namelen != len)
34 return 0; 34 return 0;
@@ -303,7 +303,7 @@ static int __xlate_proc_name(const char *name, struct proc_dir_entry **ret,
303{ 303{
304 const char *cp = name, *next; 304 const char *cp = name, *next;
305 struct proc_dir_entry *de; 305 struct proc_dir_entry *de;
306 int len; 306 unsigned int len;
307 307
308 de = *ret; 308 de = *ret;
309 if (!de) 309 if (!de)
@@ -602,7 +602,7 @@ static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent,
602{ 602{
603 struct proc_dir_entry *ent = NULL; 603 struct proc_dir_entry *ent = NULL;
604 const char *fn = name; 604 const char *fn = name;
605 int len; 605 unsigned int len;
606 606
607 /* make sure name is valid */ 607 /* make sure name is valid */
608 if (!name || !strlen(name)) goto out; 608 if (!name || !strlen(name)) goto out;
@@ -786,7 +786,7 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent)
786 struct proc_dir_entry **p; 786 struct proc_dir_entry **p;
787 struct proc_dir_entry *de = NULL; 787 struct proc_dir_entry *de = NULL;
788 const char *fn = name; 788 const char *fn = name;
789 int len; 789 unsigned int len;
790 790
791 spin_lock(&proc_subdir_lock); 791 spin_lock(&proc_subdir_lock);
792 if (__xlate_proc_name(name, &parent, &fn) != 0) { 792 if (__xlate_proc_name(name, &parent, &fn) != 0) {
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 176ce4cda68a..d15aa1b1cc8f 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -27,6 +27,7 @@
27static void proc_evict_inode(struct inode *inode) 27static void proc_evict_inode(struct inode *inode)
28{ 28{
29 struct proc_dir_entry *de; 29 struct proc_dir_entry *de;
30 struct ctl_table_header *head;
30 31
31 truncate_inode_pages(&inode->i_data, 0); 32 truncate_inode_pages(&inode->i_data, 0);
32 end_writeback(inode); 33 end_writeback(inode);
@@ -38,12 +39,13 @@ static void proc_evict_inode(struct inode *inode)
38 de = PROC_I(inode)->pde; 39 de = PROC_I(inode)->pde;
39 if (de) 40 if (de)
40 pde_put(de); 41 pde_put(de);
41 if (PROC_I(inode)->sysctl) 42 head = PROC_I(inode)->sysctl;
42 sysctl_head_put(PROC_I(inode)->sysctl); 43 if (head) {
44 rcu_assign_pointer(PROC_I(inode)->sysctl, NULL);
45 sysctl_head_put(head);
46 }
43} 47}
44 48
45struct vfsmount *proc_mnt;
46
47static struct kmem_cache * proc_inode_cachep; 49static struct kmem_cache * proc_inode_cachep;
48 50
49static struct inode *proc_alloc_inode(struct super_block *sb) 51static struct inode *proc_alloc_inode(struct super_block *sb)
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 9ad561ded409..c03e8d3a3a5b 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -107,7 +107,6 @@ static inline struct proc_dir_entry *pde_get(struct proc_dir_entry *pde)
107} 107}
108void pde_put(struct proc_dir_entry *pde); 108void pde_put(struct proc_dir_entry *pde);
109 109
110extern struct vfsmount *proc_mnt;
111int proc_fill_super(struct super_block *); 110int proc_fill_super(struct super_block *);
112struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *); 111struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *);
113 112
diff --git a/fs/proc/proc_devtree.c b/fs/proc/proc_devtree.c
index d9396a4fc7ff..927cbd115e53 100644
--- a/fs/proc/proc_devtree.c
+++ b/fs/proc/proc_devtree.c
@@ -233,7 +233,7 @@ void __init proc_device_tree_init(void)
233 return; 233 return;
234 root = of_find_node_by_path("/"); 234 root = of_find_node_by_path("/");
235 if (root == NULL) { 235 if (root == NULL) {
236 printk(KERN_ERR "/proc/device-tree: can't find root\n"); 236 pr_debug("/proc/device-tree: can't find root\n");
237 return; 237 return;
238 } 238 }
239 proc_device_tree_add_node(root, proc_device_tree); 239 proc_device_tree_add_node(root, proc_device_tree);
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 09a1f92a34ef..f50133c11c24 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -32,7 +32,6 @@ static struct inode *proc_sys_make_inode(struct super_block *sb,
32 ei->sysctl_entry = table; 32 ei->sysctl_entry = table;
33 33
34 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; 34 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
35 inode->i_flags |= S_PRIVATE; /* tell selinux to ignore this inode */
36 inode->i_mode = table->mode; 35 inode->i_mode = table->mode;
37 if (!table->child) { 36 if (!table->child) {
38 inode->i_mode |= S_IFREG; 37 inode->i_mode |= S_IFREG;
@@ -408,15 +407,18 @@ static int proc_sys_compare(const struct dentry *parent,
408 const struct dentry *dentry, const struct inode *inode, 407 const struct dentry *dentry, const struct inode *inode,
409 unsigned int len, const char *str, const struct qstr *name) 408 unsigned int len, const char *str, const struct qstr *name)
410{ 409{
410 struct ctl_table_header *head;
411 /* Although proc doesn't have negative dentries, rcu-walk means 411 /* Although proc doesn't have negative dentries, rcu-walk means
412 * that inode here can be NULL */ 412 * that inode here can be NULL */
413 /* AV: can it, indeed? */
413 if (!inode) 414 if (!inode)
414 return 0; 415 return 1;
415 if (name->len != len) 416 if (name->len != len)
416 return 1; 417 return 1;
417 if (memcmp(name->name, str, len)) 418 if (memcmp(name->name, str, len))
418 return 1; 419 return 1;
419 return !sysctl_is_seen(PROC_I(inode)->sysctl); 420 head = rcu_dereference(PROC_I(inode)->sysctl);
421 return !head || !sysctl_is_seen(head);
420} 422}
421 423
422static const struct dentry_operations proc_sys_dentry_operations = { 424static const struct dentry_operations proc_sys_dentry_operations = {
diff --git a/fs/proc/root.c b/fs/proc/root.c
index ef9fa8e24ad6..a9000e9cfee5 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -43,17 +43,6 @@ static struct dentry *proc_mount(struct file_system_type *fs_type,
43 struct pid_namespace *ns; 43 struct pid_namespace *ns;
44 struct proc_inode *ei; 44 struct proc_inode *ei;
45 45
46 if (proc_mnt) {
47 /* Seed the root directory with a pid so it doesn't need
48 * to be special in base.c. I would do this earlier but
49 * the only task alive when /proc is mounted the first time
50 * is the init_task and it doesn't have any pids.
51 */
52 ei = PROC_I(proc_mnt->mnt_sb->s_root->d_inode);
53 if (!ei->pid)
54 ei->pid = find_get_pid(1);
55 }
56
57 if (flags & MS_KERNMOUNT) 46 if (flags & MS_KERNMOUNT)
58 ns = (struct pid_namespace *)data; 47 ns = (struct pid_namespace *)data;
59 else 48 else
@@ -71,16 +60,16 @@ static struct dentry *proc_mount(struct file_system_type *fs_type,
71 return ERR_PTR(err); 60 return ERR_PTR(err);
72 } 61 }
73 62
74 ei = PROC_I(sb->s_root->d_inode);
75 if (!ei->pid) {
76 rcu_read_lock();
77 ei->pid = get_pid(find_pid_ns(1, ns));
78 rcu_read_unlock();
79 }
80
81 sb->s_flags |= MS_ACTIVE; 63 sb->s_flags |= MS_ACTIVE;
82 } 64 }
83 65
66 ei = PROC_I(sb->s_root->d_inode);
67 if (!ei->pid) {
68 rcu_read_lock();
69 ei->pid = get_pid(find_pid_ns(1, ns));
70 rcu_read_unlock();
71 }
72
84 return dget(sb->s_root); 73 return dget(sb->s_root);
85} 74}
86 75
@@ -101,19 +90,20 @@ static struct file_system_type proc_fs_type = {
101 90
102void __init proc_root_init(void) 91void __init proc_root_init(void)
103{ 92{
93 struct vfsmount *mnt;
104 int err; 94 int err;
105 95
106 proc_init_inodecache(); 96 proc_init_inodecache();
107 err = register_filesystem(&proc_fs_type); 97 err = register_filesystem(&proc_fs_type);
108 if (err) 98 if (err)
109 return; 99 return;
110 proc_mnt = kern_mount_data(&proc_fs_type, &init_pid_ns); 100 mnt = kern_mount_data(&proc_fs_type, &init_pid_ns);
111 if (IS_ERR(proc_mnt)) { 101 if (IS_ERR(mnt)) {
112 unregister_filesystem(&proc_fs_type); 102 unregister_filesystem(&proc_fs_type);
113 return; 103 return;
114 } 104 }
115 105
116 init_pid_ns.proc_mnt = proc_mnt; 106 init_pid_ns.proc_mnt = mnt;
117 proc_symlink("mounts", NULL, "self/mounts"); 107 proc_symlink("mounts", NULL, "self/mounts");
118 108
119 proc_net_init(); 109 proc_net_init();
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 60b914860f81..7c708a418acc 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1,5 +1,6 @@
1#include <linux/mm.h> 1#include <linux/mm.h>
2#include <linux/hugetlb.h> 2#include <linux/hugetlb.h>
3#include <linux/huge_mm.h>
3#include <linux/mount.h> 4#include <linux/mount.h>
4#include <linux/seq_file.h> 5#include <linux/seq_file.h>
5#include <linux/highmem.h> 6#include <linux/highmem.h>
@@ -7,6 +8,7 @@
7#include <linux/slab.h> 8#include <linux/slab.h>
8#include <linux/pagemap.h> 9#include <linux/pagemap.h>
9#include <linux/mempolicy.h> 10#include <linux/mempolicy.h>
11#include <linux/rmap.h>
10#include <linux/swap.h> 12#include <linux/swap.h>
11#include <linux/swapops.h> 13#include <linux/swapops.h>
12 14
@@ -119,14 +121,14 @@ static void *m_start(struct seq_file *m, loff_t *pos)
119 121
120 priv->task = get_pid_task(priv->pid, PIDTYPE_PID); 122 priv->task = get_pid_task(priv->pid, PIDTYPE_PID);
121 if (!priv->task) 123 if (!priv->task)
122 return NULL; 124 return ERR_PTR(-ESRCH);
123 125
124 mm = mm_for_maps(priv->task); 126 mm = mm_for_maps(priv->task);
125 if (!mm) 127 if (!mm || IS_ERR(mm))
126 return NULL; 128 return mm;
127 down_read(&mm->mmap_sem); 129 down_read(&mm->mmap_sem);
128 130
129 tail_vma = get_gate_vma(priv->task); 131 tail_vma = get_gate_vma(priv->task->mm);
130 priv->tail_vma = tail_vma; 132 priv->tail_vma = tail_vma;
131 133
132 /* Start with last addr hint */ 134 /* Start with last addr hint */
@@ -249,8 +251,8 @@ static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
249 const char *name = arch_vma_name(vma); 251 const char *name = arch_vma_name(vma);
250 if (!name) { 252 if (!name) {
251 if (mm) { 253 if (mm) {
252 if (vma->vm_start <= mm->start_brk && 254 if (vma->vm_start <= mm->brk &&
253 vma->vm_end >= mm->brk) { 255 vma->vm_end >= mm->start_brk) {
254 name = "[heap]"; 256 name = "[heap]";
255 } else if (vma->vm_start <= mm->start_stack && 257 } else if (vma->vm_start <= mm->start_stack &&
256 vma->vm_end >= mm->start_stack) { 258 vma->vm_end >= mm->start_stack) {
@@ -277,7 +279,8 @@ static int show_map(struct seq_file *m, void *v)
277 show_map_vma(m, vma); 279 show_map_vma(m, vma);
278 280
279 if (m->count < m->size) /* vma is copied successfully */ 281 if (m->count < m->size) /* vma is copied successfully */
280 m->version = (vma != get_gate_vma(task))? vma->vm_start: 0; 282 m->version = (vma != get_gate_vma(task->mm))
283 ? vma->vm_start : 0;
281 return 0; 284 return 0;
282} 285}
283 286
@@ -329,58 +332,86 @@ struct mem_size_stats {
329 unsigned long private_dirty; 332 unsigned long private_dirty;
330 unsigned long referenced; 333 unsigned long referenced;
331 unsigned long anonymous; 334 unsigned long anonymous;
335 unsigned long anonymous_thp;
332 unsigned long swap; 336 unsigned long swap;
333 u64 pss; 337 u64 pss;
334}; 338};
335 339
336static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, 340
337 struct mm_walk *walk) 341static void smaps_pte_entry(pte_t ptent, unsigned long addr,
342 unsigned long ptent_size, struct mm_walk *walk)
338{ 343{
339 struct mem_size_stats *mss = walk->private; 344 struct mem_size_stats *mss = walk->private;
340 struct vm_area_struct *vma = mss->vma; 345 struct vm_area_struct *vma = mss->vma;
341 pte_t *pte, ptent;
342 spinlock_t *ptl;
343 struct page *page; 346 struct page *page;
344 int mapcount; 347 int mapcount;
345 348
346 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 349 if (is_swap_pte(ptent)) {
347 for (; addr != end; pte++, addr += PAGE_SIZE) { 350 mss->swap += ptent_size;
348 ptent = *pte; 351 return;
349 352 }
350 if (is_swap_pte(ptent)) {
351 mss->swap += PAGE_SIZE;
352 continue;
353 }
354 353
355 if (!pte_present(ptent)) 354 if (!pte_present(ptent))
356 continue; 355 return;
356
357 page = vm_normal_page(vma, addr, ptent);
358 if (!page)
359 return;
360
361 if (PageAnon(page))
362 mss->anonymous += ptent_size;
363
364 mss->resident += ptent_size;
365 /* Accumulate the size in pages that have been accessed. */
366 if (pte_young(ptent) || PageReferenced(page))
367 mss->referenced += ptent_size;
368 mapcount = page_mapcount(page);
369 if (mapcount >= 2) {
370 if (pte_dirty(ptent) || PageDirty(page))
371 mss->shared_dirty += ptent_size;
372 else
373 mss->shared_clean += ptent_size;
374 mss->pss += (ptent_size << PSS_SHIFT) / mapcount;
375 } else {
376 if (pte_dirty(ptent) || PageDirty(page))
377 mss->private_dirty += ptent_size;
378 else
379 mss->private_clean += ptent_size;
380 mss->pss += (ptent_size << PSS_SHIFT);
381 }
382}
357 383
358 page = vm_normal_page(vma, addr, ptent); 384static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
359 if (!page) 385 struct mm_walk *walk)
360 continue; 386{
387 struct mem_size_stats *mss = walk->private;
388 struct vm_area_struct *vma = mss->vma;
389 pte_t *pte;
390 spinlock_t *ptl;
361 391
362 if (PageAnon(page)) 392 spin_lock(&walk->mm->page_table_lock);
363 mss->anonymous += PAGE_SIZE; 393 if (pmd_trans_huge(*pmd)) {
364 394 if (pmd_trans_splitting(*pmd)) {
365 mss->resident += PAGE_SIZE; 395 spin_unlock(&walk->mm->page_table_lock);
366 /* Accumulate the size in pages that have been accessed. */ 396 wait_split_huge_page(vma->anon_vma, pmd);
367 if (pte_young(ptent) || PageReferenced(page))
368 mss->referenced += PAGE_SIZE;
369 mapcount = page_mapcount(page);
370 if (mapcount >= 2) {
371 if (pte_dirty(ptent) || PageDirty(page))
372 mss->shared_dirty += PAGE_SIZE;
373 else
374 mss->shared_clean += PAGE_SIZE;
375 mss->pss += (PAGE_SIZE << PSS_SHIFT) / mapcount;
376 } else { 397 } else {
377 if (pte_dirty(ptent) || PageDirty(page)) 398 smaps_pte_entry(*(pte_t *)pmd, addr,
378 mss->private_dirty += PAGE_SIZE; 399 HPAGE_PMD_SIZE, walk);
379 else 400 spin_unlock(&walk->mm->page_table_lock);
380 mss->private_clean += PAGE_SIZE; 401 mss->anonymous_thp += HPAGE_PMD_SIZE;
381 mss->pss += (PAGE_SIZE << PSS_SHIFT); 402 return 0;
382 } 403 }
404 } else {
405 spin_unlock(&walk->mm->page_table_lock);
383 } 406 }
407 /*
408 * The mmap_sem held all the way back in m_start() is what
409 * keeps khugepaged out of here and from collapsing things
410 * in here.
411 */
412 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
413 for (; addr != end; pte++, addr += PAGE_SIZE)
414 smaps_pte_entry(*pte, addr, PAGE_SIZE, walk);
384 pte_unmap_unlock(pte - 1, ptl); 415 pte_unmap_unlock(pte - 1, ptl);
385 cond_resched(); 416 cond_resched();
386 return 0; 417 return 0;
@@ -416,6 +447,7 @@ static int show_smap(struct seq_file *m, void *v)
416 "Private_Dirty: %8lu kB\n" 447 "Private_Dirty: %8lu kB\n"
417 "Referenced: %8lu kB\n" 448 "Referenced: %8lu kB\n"
418 "Anonymous: %8lu kB\n" 449 "Anonymous: %8lu kB\n"
450 "AnonHugePages: %8lu kB\n"
419 "Swap: %8lu kB\n" 451 "Swap: %8lu kB\n"
420 "KernelPageSize: %8lu kB\n" 452 "KernelPageSize: %8lu kB\n"
421 "MMUPageSize: %8lu kB\n" 453 "MMUPageSize: %8lu kB\n"
@@ -429,6 +461,7 @@ static int show_smap(struct seq_file *m, void *v)
429 mss.private_dirty >> 10, 461 mss.private_dirty >> 10,
430 mss.referenced >> 10, 462 mss.referenced >> 10,
431 mss.anonymous >> 10, 463 mss.anonymous >> 10,
464 mss.anonymous_thp >> 10,
432 mss.swap >> 10, 465 mss.swap >> 10,
433 vma_kernel_pagesize(vma) >> 10, 466 vma_kernel_pagesize(vma) >> 10,
434 vma_mmu_pagesize(vma) >> 10, 467 vma_mmu_pagesize(vma) >> 10,
@@ -436,7 +469,8 @@ static int show_smap(struct seq_file *m, void *v)
436 (unsigned long)(mss.pss >> (10 + PSS_SHIFT)) : 0); 469 (unsigned long)(mss.pss >> (10 + PSS_SHIFT)) : 0);
437 470
438 if (m->count < m->size) /* vma is copied successfully */ 471 if (m->count < m->size) /* vma is copied successfully */
439 m->version = (vma != get_gate_vma(task)) ? vma->vm_start : 0; 472 m->version = (vma != get_gate_vma(task->mm))
473 ? vma->vm_start : 0;
440 return 0; 474 return 0;
441} 475}
442 476
@@ -467,6 +501,8 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
467 spinlock_t *ptl; 501 spinlock_t *ptl;
468 struct page *page; 502 struct page *page;
469 503
504 split_huge_page_pmd(walk->mm, pmd);
505
470 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 506 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
471 for (; addr != end; pte++, addr += PAGE_SIZE) { 507 for (; addr != end; pte++, addr += PAGE_SIZE) {
472 ptent = *pte; 508 ptent = *pte;
@@ -623,6 +659,8 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
623 pte_t *pte; 659 pte_t *pte;
624 int err = 0; 660 int err = 0;
625 661
662 split_huge_page_pmd(walk->mm, pmd);
663
626 /* find the first VMA at or above 'addr' */ 664 /* find the first VMA at or above 'addr' */
627 vma = find_vma(walk->mm, addr); 665 vma = find_vma(walk->mm, addr);
628 for (; addr != end; addr += PAGE_SIZE) { 666 for (; addr != end; addr += PAGE_SIZE) {
@@ -728,8 +766,9 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
728 if (!task) 766 if (!task)
729 goto out; 767 goto out;
730 768
731 ret = -EACCES; 769 mm = mm_for_maps(task);
732 if (!ptrace_may_access(task, PTRACE_MODE_READ)) 770 ret = PTR_ERR(mm);
771 if (!mm || IS_ERR(mm))
733 goto out_task; 772 goto out_task;
734 773
735 ret = -EINVAL; 774 ret = -EINVAL;
@@ -742,10 +781,6 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
742 if (!count) 781 if (!count)
743 goto out_task; 782 goto out_task;
744 783
745 mm = get_task_mm(task);
746 if (!mm)
747 goto out_task;
748
749 pm.len = PM_ENTRY_BYTES * (PAGEMAP_WALK_SIZE >> PAGE_SHIFT); 784 pm.len = PM_ENTRY_BYTES * (PAGEMAP_WALK_SIZE >> PAGE_SHIFT);
750 pm.buffer = kmalloc(pm.len, GFP_TEMPORARY); 785 pm.buffer = kmalloc(pm.len, GFP_TEMPORARY);
751 ret = -ENOMEM; 786 ret = -ENOMEM;
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index b535d3e5d5f1..980de547c070 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -199,13 +199,13 @@ static void *m_start(struct seq_file *m, loff_t *pos)
199 /* pin the task and mm whilst we play with them */ 199 /* pin the task and mm whilst we play with them */
200 priv->task = get_pid_task(priv->pid, PIDTYPE_PID); 200 priv->task = get_pid_task(priv->pid, PIDTYPE_PID);
201 if (!priv->task) 201 if (!priv->task)
202 return NULL; 202 return ERR_PTR(-ESRCH);
203 203
204 mm = mm_for_maps(priv->task); 204 mm = mm_for_maps(priv->task);
205 if (!mm) { 205 if (!mm || IS_ERR(mm)) {
206 put_task_struct(priv->task); 206 put_task_struct(priv->task);
207 priv->task = NULL; 207 priv->task = NULL;
208 return NULL; 208 return mm;
209 } 209 }
210 down_read(&mm->mmap_sem); 210 down_read(&mm->mmap_sem);
211 211
diff --git a/fs/pstore/Kconfig b/fs/pstore/Kconfig
new file mode 100644
index 000000000000..867d0ac026ce
--- /dev/null
+++ b/fs/pstore/Kconfig
@@ -0,0 +1,13 @@
1config PSTORE
2 bool "Persistant store support"
3 default n
4 help
5 This option enables generic access to platform level
6 persistent storage via "pstore" filesystem that can
7 be mounted as /dev/pstore. Only useful if you have
8 a platform level driver that registers with pstore to
9 provide the data, so you probably should just go say "Y"
10 (or "M") to a platform specific persistent store driver
11 (e.g. ACPI_APEI on X86) which will select this for you.
12 If you don't have a platform persistent store driver,
13 say N.
diff --git a/fs/pstore/Makefile b/fs/pstore/Makefile
new file mode 100644
index 000000000000..760f4bce7d1d
--- /dev/null
+++ b/fs/pstore/Makefile
@@ -0,0 +1,7 @@
1#
2# Makefile for the linux pstorefs routines.
3#
4
5obj-y += pstore.o
6
7pstore-objs += inode.o platform.o
diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c
new file mode 100644
index 000000000000..977ed2723845
--- /dev/null
+++ b/fs/pstore/inode.c
@@ -0,0 +1,311 @@
1/*
2 * Persistent Storage - ramfs parts.
3 *
4 * Copyright (C) 2010 Intel Corporation <tony.luck@intel.com>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18 */
19
20#include <linux/module.h>
21#include <linux/fs.h>
22#include <linux/fsnotify.h>
23#include <linux/pagemap.h>
24#include <linux/highmem.h>
25#include <linux/time.h>
26#include <linux/init.h>
27#include <linux/string.h>
28#include <linux/mount.h>
29#include <linux/ramfs.h>
30#include <linux/parser.h>
31#include <linux/sched.h>
32#include <linux/magic.h>
33#include <linux/pstore.h>
34#include <linux/slab.h>
35#include <linux/uaccess.h>
36
37#include "internal.h"
38
39#define PSTORE_NAMELEN 64
40
41struct pstore_private {
42 u64 id;
43 int (*erase)(u64);
44 ssize_t size;
45 char data[];
46};
47
48static int pstore_file_open(struct inode *inode, struct file *file)
49{
50 file->private_data = inode->i_private;
51 return 0;
52}
53
54static ssize_t pstore_file_read(struct file *file, char __user *userbuf,
55 size_t count, loff_t *ppos)
56{
57 struct pstore_private *ps = file->private_data;
58
59 return simple_read_from_buffer(userbuf, count, ppos, ps->data, ps->size);
60}
61
62static const struct file_operations pstore_file_operations = {
63 .open = pstore_file_open,
64 .read = pstore_file_read,
65 .llseek = default_llseek,
66};
67
68/*
69 * When a file is unlinked from our file system we call the
70 * platform driver to erase the record from persistent store.
71 */
72static int pstore_unlink(struct inode *dir, struct dentry *dentry)
73{
74 struct pstore_private *p = dentry->d_inode->i_private;
75
76 p->erase(p->id);
77
78 return simple_unlink(dir, dentry);
79}
80
81static void pstore_evict_inode(struct inode *inode)
82{
83 end_writeback(inode);
84 kfree(inode->i_private);
85}
86
87static const struct inode_operations pstore_dir_inode_operations = {
88 .lookup = simple_lookup,
89 .unlink = pstore_unlink,
90};
91
92static struct inode *pstore_get_inode(struct super_block *sb,
93 const struct inode *dir, int mode, dev_t dev)
94{
95 struct inode *inode = new_inode(sb);
96
97 if (inode) {
98 inode->i_ino = get_next_ino();
99 inode->i_uid = inode->i_gid = 0;
100 inode->i_mode = mode;
101 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
102 switch (mode & S_IFMT) {
103 case S_IFREG:
104 inode->i_fop = &pstore_file_operations;
105 break;
106 case S_IFDIR:
107 inode->i_op = &pstore_dir_inode_operations;
108 inode->i_fop = &simple_dir_operations;
109 inc_nlink(inode);
110 break;
111 }
112 }
113 return inode;
114}
115
116enum {
117 Opt_kmsg_bytes, Opt_err
118};
119
120static const match_table_t tokens = {
121 {Opt_kmsg_bytes, "kmsg_bytes=%u"},
122 {Opt_err, NULL}
123};
124
125static void parse_options(char *options)
126{
127 char *p;
128 substring_t args[MAX_OPT_ARGS];
129 int option;
130
131 if (!options)
132 return;
133
134 while ((p = strsep(&options, ",")) != NULL) {
135 int token;
136
137 if (!*p)
138 continue;
139
140 token = match_token(p, tokens, args);
141 switch (token) {
142 case Opt_kmsg_bytes:
143 if (!match_int(&args[0], &option))
144 pstore_set_kmsg_bytes(option);
145 break;
146 }
147 }
148}
149
150static int pstore_remount(struct super_block *sb, int *flags, char *data)
151{
152 parse_options(data);
153
154 return 0;
155}
156
157static const struct super_operations pstore_ops = {
158 .statfs = simple_statfs,
159 .drop_inode = generic_delete_inode,
160 .evict_inode = pstore_evict_inode,
161 .remount_fs = pstore_remount,
162 .show_options = generic_show_options,
163};
164
165static struct super_block *pstore_sb;
166
167int pstore_is_mounted(void)
168{
169 return pstore_sb != NULL;
170}
171
172/*
173 * Make a regular file in the root directory of our file system.
174 * Load it up with "size" bytes of data from "buf".
175 * Set the mtime & ctime to the date that this record was originally stored.
176 */
177int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id,
178 char *data, size_t size,
179 struct timespec time, int (*erase)(u64))
180{
181 struct dentry *root = pstore_sb->s_root;
182 struct dentry *dentry;
183 struct inode *inode;
184 int rc;
185 char name[PSTORE_NAMELEN];
186 struct pstore_private *private;
187
188 rc = -ENOMEM;
189 inode = pstore_get_inode(pstore_sb, root->d_inode, S_IFREG | 0444, 0);
190 if (!inode)
191 goto fail;
192 private = kmalloc(sizeof *private + size, GFP_KERNEL);
193 if (!private)
194 goto fail_alloc;
195 private->id = id;
196 private->erase = erase;
197
198 switch (type) {
199 case PSTORE_TYPE_DMESG:
200 sprintf(name, "dmesg-%s-%lld", psname, id);
201 break;
202 case PSTORE_TYPE_MCE:
203 sprintf(name, "mce-%s-%lld", psname, id);
204 break;
205 case PSTORE_TYPE_UNKNOWN:
206 sprintf(name, "unknown-%s-%lld", psname, id);
207 break;
208 default:
209 sprintf(name, "type%d-%s-%lld", type, psname, id);
210 break;
211 }
212
213 mutex_lock(&root->d_inode->i_mutex);
214
215 rc = -ENOSPC;
216 dentry = d_alloc_name(root, name);
217 if (IS_ERR(dentry))
218 goto fail_lockedalloc;
219
220 memcpy(private->data, data, size);
221 inode->i_size = private->size = size;
222
223 inode->i_private = private;
224
225 if (time.tv_sec)
226 inode->i_mtime = inode->i_ctime = time;
227
228 d_add(dentry, inode);
229
230 mutex_unlock(&root->d_inode->i_mutex);
231
232 return 0;
233
234fail_lockedalloc:
235 mutex_unlock(&root->d_inode->i_mutex);
236 kfree(private);
237fail_alloc:
238 iput(inode);
239
240fail:
241 return rc;
242}
243
244int pstore_fill_super(struct super_block *sb, void *data, int silent)
245{
246 struct inode *inode = NULL;
247 struct dentry *root;
248 int err;
249
250 save_mount_options(sb, data);
251
252 pstore_sb = sb;
253
254 sb->s_maxbytes = MAX_LFS_FILESIZE;
255 sb->s_blocksize = PAGE_CACHE_SIZE;
256 sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
257 sb->s_magic = PSTOREFS_MAGIC;
258 sb->s_op = &pstore_ops;
259 sb->s_time_gran = 1;
260
261 parse_options(data);
262
263 inode = pstore_get_inode(sb, NULL, S_IFDIR | 0755, 0);
264 if (!inode) {
265 err = -ENOMEM;
266 goto fail;
267 }
268 /* override ramfs "dir" options so we catch unlink(2) */
269 inode->i_op = &pstore_dir_inode_operations;
270
271 root = d_alloc_root(inode);
272 sb->s_root = root;
273 if (!root) {
274 err = -ENOMEM;
275 goto fail;
276 }
277
278 pstore_get_records();
279
280 return 0;
281fail:
282 iput(inode);
283 return err;
284}
285
286static struct dentry *pstore_mount(struct file_system_type *fs_type,
287 int flags, const char *dev_name, void *data)
288{
289 return mount_single(fs_type, flags, data, pstore_fill_super);
290}
291
292static void pstore_kill_sb(struct super_block *sb)
293{
294 kill_litter_super(sb);
295 pstore_sb = NULL;
296}
297
298static struct file_system_type pstore_fs_type = {
299 .name = "pstore",
300 .mount = pstore_mount,
301 .kill_sb = pstore_kill_sb,
302};
303
304static int __init init_pstore_fs(void)
305{
306 return register_filesystem(&pstore_fs_type);
307}
308module_init(init_pstore_fs)
309
310MODULE_AUTHOR("Tony Luck <tony.luck@intel.com>");
311MODULE_LICENSE("GPL");
diff --git a/fs/pstore/internal.h b/fs/pstore/internal.h
new file mode 100644
index 000000000000..8c9f23eb1645
--- /dev/null
+++ b/fs/pstore/internal.h
@@ -0,0 +1,6 @@
1extern void pstore_set_kmsg_bytes(int);
2extern void pstore_get_records(void);
3extern int pstore_mkfile(enum pstore_type_id, char *psname, u64 id,
4 char *data, size_t size,
5 struct timespec time, int (*erase)(u64));
6extern int pstore_is_mounted(void);
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
new file mode 100644
index 000000000000..f835a25625ff
--- /dev/null
+++ b/fs/pstore/platform.c
@@ -0,0 +1,201 @@
1/*
2 * Persistent Storage - platform driver interface parts.
3 *
4 * Copyright (C) 2010 Intel Corporation <tony.luck@intel.com>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18 */
19
20#include <linux/atomic.h>
21#include <linux/types.h>
22#include <linux/errno.h>
23#include <linux/init.h>
24#include <linux/kmsg_dump.h>
25#include <linux/module.h>
26#include <linux/pstore.h>
27#include <linux/string.h>
28#include <linux/slab.h>
29#include <linux/uaccess.h>
30
31#include "internal.h"
32
33/*
34 * pstore_lock just protects "psinfo" during
35 * calls to pstore_register()
36 */
37static DEFINE_SPINLOCK(pstore_lock);
38static struct pstore_info *psinfo;
39
40/* How much of the console log to snapshot */
41static unsigned long kmsg_bytes = 10240;
42
43void pstore_set_kmsg_bytes(int bytes)
44{
45 kmsg_bytes = bytes;
46}
47
48/* Tag each group of saved records with a sequence number */
49static int oopscount;
50
51static char *reason_str[] = {
52 "Oops", "Panic", "Kexec", "Restart", "Halt", "Poweroff", "Emergency"
53};
54
55/*
56 * callback from kmsg_dump. (s2,l2) has the most recently
57 * written bytes, older bytes are in (s1,l1). Save as much
58 * as we can from the end of the buffer.
59 */
60static void pstore_dump(struct kmsg_dumper *dumper,
61 enum kmsg_dump_reason reason,
62 const char *s1, unsigned long l1,
63 const char *s2, unsigned long l2)
64{
65 unsigned long s1_start, s2_start;
66 unsigned long l1_cpy, l2_cpy;
67 unsigned long size, total = 0;
68 char *dst, *why;
69 u64 id;
70 int hsize, part = 1;
71
72 if (reason < ARRAY_SIZE(reason_str))
73 why = reason_str[reason];
74 else
75 why = "Unknown";
76
77 mutex_lock(&psinfo->buf_mutex);
78 oopscount++;
79 while (total < kmsg_bytes) {
80 dst = psinfo->buf;
81 hsize = sprintf(dst, "%s#%d Part%d\n", why, oopscount, part++);
82 size = psinfo->bufsize - hsize;
83 dst += hsize;
84
85 l2_cpy = min(l2, size);
86 l1_cpy = min(l1, size - l2_cpy);
87
88 if (l1_cpy + l2_cpy == 0)
89 break;
90
91 s2_start = l2 - l2_cpy;
92 s1_start = l1 - l1_cpy;
93
94 memcpy(dst, s1 + s1_start, l1_cpy);
95 memcpy(dst + l1_cpy, s2 + s2_start, l2_cpy);
96
97 id = psinfo->write(PSTORE_TYPE_DMESG, hsize + l1_cpy + l2_cpy);
98 if (reason == KMSG_DUMP_OOPS && pstore_is_mounted())
99 pstore_mkfile(PSTORE_TYPE_DMESG, psinfo->name, id,
100 psinfo->buf, hsize + l1_cpy + l2_cpy,
101 CURRENT_TIME, psinfo->erase);
102 l1 -= l1_cpy;
103 l2 -= l2_cpy;
104 total += l1_cpy + l2_cpy;
105 }
106 mutex_unlock(&psinfo->buf_mutex);
107}
108
109static struct kmsg_dumper pstore_dumper = {
110 .dump = pstore_dump,
111};
112
113/*
114 * platform specific persistent storage driver registers with
115 * us here. If pstore is already mounted, call the platform
116 * read function right away to populate the file system. If not
117 * then the pstore mount code will call us later to fill out
118 * the file system.
119 *
120 * Register with kmsg_dump to save last part of console log on panic.
121 */
122int pstore_register(struct pstore_info *psi)
123{
124 struct module *owner = psi->owner;
125
126 spin_lock(&pstore_lock);
127 if (psinfo) {
128 spin_unlock(&pstore_lock);
129 return -EBUSY;
130 }
131 psinfo = psi;
132 spin_unlock(&pstore_lock);
133
134 if (owner && !try_module_get(owner)) {
135 psinfo = NULL;
136 return -EINVAL;
137 }
138
139 if (pstore_is_mounted())
140 pstore_get_records();
141
142 kmsg_dump_register(&pstore_dumper);
143
144 return 0;
145}
146EXPORT_SYMBOL_GPL(pstore_register);
147
148/*
149 * Read all the records from the persistent store. Create and
150 * file files in our filesystem.
151 */
152void pstore_get_records(void)
153{
154 struct pstore_info *psi = psinfo;
155 size_t size;
156 u64 id;
157 enum pstore_type_id type;
158 struct timespec time;
159 int failed = 0;
160
161 if (!psi)
162 return;
163
164 mutex_lock(&psinfo->buf_mutex);
165 while ((size = psi->read(&id, &type, &time)) > 0) {
166 if (pstore_mkfile(type, psi->name, id, psi->buf, size,
167 time, psi->erase))
168 failed++;
169 }
170 mutex_unlock(&psinfo->buf_mutex);
171
172 if (failed)
173 printk(KERN_WARNING "pstore: failed to load %d record(s) from '%s'\n",
174 failed, psi->name);
175}
176
177/*
178 * Call platform driver to write a record to the
179 * persistent store.
180 */
181int pstore_write(enum pstore_type_id type, char *buf, size_t size)
182{
183 u64 id;
184
185 if (!psinfo)
186 return -ENODEV;
187
188 if (size > psinfo->bufsize)
189 return -EFBIG;
190
191 mutex_lock(&psinfo->buf_mutex);
192 memcpy(psinfo->buf, buf, size);
193 id = psinfo->write(type, size);
194 if (pstore_is_mounted())
195 pstore_mkfile(PSTORE_TYPE_DMESG, psinfo->name, id, psinfo->buf,
196 size, CURRENT_TIME, psinfo->erase);
197 mutex_unlock(&psinfo->buf_mutex);
198
199 return 0;
200}
201EXPORT_SYMBOL_GPL(pstore_write);
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index e63b4171d583..2b0646613f5a 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -335,7 +335,6 @@ static sector_t qnx4_bmap(struct address_space *mapping, sector_t block)
335static const struct address_space_operations qnx4_aops = { 335static const struct address_space_operations qnx4_aops = {
336 .readpage = qnx4_readpage, 336 .readpage = qnx4_readpage,
337 .writepage = qnx4_writepage, 337 .writepage = qnx4_writepage,
338 .sync_page = block_sync_page,
339 .write_begin = qnx4_write_begin, 338 .write_begin = qnx4_write_begin,
340 .write_end = generic_write_end, 339 .write_end = generic_write_end,
341 .bmap = qnx4_bmap 340 .bmap = qnx4_bmap
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index a2a622e079f0..fcc8ae75d874 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -76,7 +76,7 @@
76#include <linux/buffer_head.h> 76#include <linux/buffer_head.h>
77#include <linux/capability.h> 77#include <linux/capability.h>
78#include <linux/quotaops.h> 78#include <linux/quotaops.h>
79#include <linux/writeback.h> /* for inode_lock, oddly enough.. */ 79#include "../internal.h" /* ugh */
80 80
81#include <asm/uaccess.h> 81#include <asm/uaccess.h>
82 82
@@ -900,33 +900,38 @@ static void add_dquot_ref(struct super_block *sb, int type)
900 int reserved = 0; 900 int reserved = 0;
901#endif 901#endif
902 902
903 spin_lock(&inode_lock); 903 spin_lock(&inode_sb_list_lock);
904 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { 904 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
905 if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) 905 spin_lock(&inode->i_lock);
906 if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) ||
907 !atomic_read(&inode->i_writecount) ||
908 !dqinit_needed(inode, type)) {
909 spin_unlock(&inode->i_lock);
906 continue; 910 continue;
911 }
907#ifdef CONFIG_QUOTA_DEBUG 912#ifdef CONFIG_QUOTA_DEBUG
908 if (unlikely(inode_get_rsv_space(inode) > 0)) 913 if (unlikely(inode_get_rsv_space(inode) > 0))
909 reserved = 1; 914 reserved = 1;
910#endif 915#endif
911 if (!atomic_read(&inode->i_writecount))
912 continue;
913 if (!dqinit_needed(inode, type))
914 continue;
915
916 __iget(inode); 916 __iget(inode);
917 spin_unlock(&inode_lock); 917 spin_unlock(&inode->i_lock);
918 spin_unlock(&inode_sb_list_lock);
918 919
919 iput(old_inode); 920 iput(old_inode);
920 __dquot_initialize(inode, type); 921 __dquot_initialize(inode, type);
921 /* We hold a reference to 'inode' so it couldn't have been 922
922 * removed from s_inodes list while we dropped the inode_lock. 923 /*
923 * We cannot iput the inode now as we can be holding the last 924 * We hold a reference to 'inode' so it couldn't have been
924 * reference and we cannot iput it under inode_lock. So we 925 * removed from s_inodes list while we dropped the
925 * keep the reference and iput it later. */ 926 * inode_sb_list_lock We cannot iput the inode now as we can be
927 * holding the last reference and we cannot iput it under
928 * inode_sb_list_lock. So we keep the reference and iput it
929 * later.
930 */
926 old_inode = inode; 931 old_inode = inode;
927 spin_lock(&inode_lock); 932 spin_lock(&inode_sb_list_lock);
928 } 933 }
929 spin_unlock(&inode_lock); 934 spin_unlock(&inode_sb_list_lock);
930 iput(old_inode); 935 iput(old_inode);
931 936
932#ifdef CONFIG_QUOTA_DEBUG 937#ifdef CONFIG_QUOTA_DEBUG
@@ -1007,7 +1012,7 @@ static void remove_dquot_ref(struct super_block *sb, int type,
1007 struct inode *inode; 1012 struct inode *inode;
1008 int reserved = 0; 1013 int reserved = 0;
1009 1014
1010 spin_lock(&inode_lock); 1015 spin_lock(&inode_sb_list_lock);
1011 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { 1016 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
1012 /* 1017 /*
1013 * We have to scan also I_NEW inodes because they can already 1018 * We have to scan also I_NEW inodes because they can already
@@ -1021,7 +1026,7 @@ static void remove_dquot_ref(struct super_block *sb, int type,
1021 remove_inode_dquot_ref(inode, type, tofree_head); 1026 remove_inode_dquot_ref(inode, type, tofree_head);
1022 } 1027 }
1023 } 1028 }
1024 spin_unlock(&inode_lock); 1029 spin_unlock(&inode_sb_list_lock);
1025#ifdef CONFIG_QUOTA_DEBUG 1030#ifdef CONFIG_QUOTA_DEBUG
1026 if (reserved) { 1031 if (reserved) {
1027 printk(KERN_WARNING "VFS (%s): Writes happened after quota" 1032 printk(KERN_WARNING "VFS (%s): Writes happened after quota"
diff --git a/fs/quota/quota_v2.c b/fs/quota/quota_v2.c
index 65444d29406b..f1ab3604db5a 100644
--- a/fs/quota/quota_v2.c
+++ b/fs/quota/quota_v2.c
@@ -112,7 +112,7 @@ static int v2_read_file_info(struct super_block *sb, int type)
112 if (!info->dqi_priv) { 112 if (!info->dqi_priv) {
113 printk(KERN_WARNING 113 printk(KERN_WARNING
114 "Not enough memory for quota information structure.\n"); 114 "Not enough memory for quota information structure.\n");
115 return -1; 115 return -ENOMEM;
116 } 116 }
117 qinfo = info->dqi_priv; 117 qinfo = info->dqi_priv;
118 if (version == 0) { 118 if (version == 0) {
diff --git a/fs/reiserfs/Makefile b/fs/reiserfs/Makefile
index 792b3cb2cd18..3c3b00165114 100644
--- a/fs/reiserfs/Makefile
+++ b/fs/reiserfs/Makefile
@@ -31,9 +31,7 @@ endif
31# and causing a panic. Since this behavior only affects ppc32, this ifeq 31# and causing a panic. Since this behavior only affects ppc32, this ifeq
32# will work around it. If any other architecture displays this behavior, 32# will work around it. If any other architecture displays this behavior,
33# add it here. 33# add it here.
34ifeq ($(CONFIG_PPC32),y) 34ccflags-$(CONFIG_PPC32) := $(call cc-ifversion, -lt, 0400, -O1)
35EXTRA_CFLAGS := $(call cc-ifversion, -lt, 0400, -O1)
36endif
37 35
38TAGS: 36TAGS:
39 etags *.c 37 etags *.c
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 0bae036831e2..4fd5bb33dbb5 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -1593,8 +1593,13 @@ int reiserfs_encode_fh(struct dentry *dentry, __u32 * data, int *lenp,
1593 struct inode *inode = dentry->d_inode; 1593 struct inode *inode = dentry->d_inode;
1594 int maxlen = *lenp; 1594 int maxlen = *lenp;
1595 1595
1596 if (maxlen < 3) 1596 if (need_parent && (maxlen < 5)) {
1597 *lenp = 5;
1597 return 255; 1598 return 255;
1599 } else if (maxlen < 3) {
1600 *lenp = 3;
1601 return 255;
1602 }
1598 1603
1599 data[0] = inode->i_ino; 1604 data[0] = inode->i_ino;
1600 data[1] = le32_to_cpu(INODE_PKEY(inode)->k_dir_id); 1605 data[1] = le32_to_cpu(INODE_PKEY(inode)->k_dir_id);
@@ -3212,7 +3217,6 @@ const struct address_space_operations reiserfs_address_space_operations = {
3212 .readpages = reiserfs_readpages, 3217 .readpages = reiserfs_readpages,
3213 .releasepage = reiserfs_releasepage, 3218 .releasepage = reiserfs_releasepage,
3214 .invalidatepage = reiserfs_invalidatepage, 3219 .invalidatepage = reiserfs_invalidatepage,
3215 .sync_page = block_sync_page,
3216 .write_begin = reiserfs_write_begin, 3220 .write_begin = reiserfs_write_begin,
3217 .write_end = reiserfs_write_end, 3221 .write_end = reiserfs_write_end,
3218 .bmap = reiserfs_aop_bmap, 3222 .bmap = reiserfs_aop_bmap,
diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c
index 79265fdc317a..4e153051bc75 100644
--- a/fs/reiserfs/ioctl.c
+++ b/fs/reiserfs/ioctl.c
@@ -59,7 +59,7 @@ long reiserfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
59 if (err) 59 if (err)
60 break; 60 break;
61 61
62 if (!is_owner_or_cap(inode)) { 62 if (!inode_owner_or_capable(inode)) {
63 err = -EPERM; 63 err = -EPERM;
64 goto setflags_out; 64 goto setflags_out;
65 } 65 }
@@ -103,7 +103,7 @@ setflags_out:
103 err = put_user(inode->i_generation, (int __user *)arg); 103 err = put_user(inode->i_generation, (int __user *)arg);
104 break; 104 break;
105 case REISERFS_IOC_SETVERSION: 105 case REISERFS_IOC_SETVERSION:
106 if (!is_owner_or_cap(inode)) { 106 if (!inode_owner_or_capable(inode)) {
107 err = -EPERM; 107 err = -EPERM;
108 break; 108 break;
109 } 109 }
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index 3eea859e6990..c77514bd5776 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -2876,7 +2876,7 @@ int journal_init(struct super_block *sb, const char *j_dev_name,
2876 reiserfs_mounted_fs_count++; 2876 reiserfs_mounted_fs_count++;
2877 if (reiserfs_mounted_fs_count <= 1) { 2877 if (reiserfs_mounted_fs_count <= 1) {
2878 reiserfs_write_unlock(sb); 2878 reiserfs_write_unlock(sb);
2879 commit_wq = create_workqueue("reiserfs"); 2879 commit_wq = alloc_workqueue("reiserfs", WQ_MEM_RECLAIM, 0);
2880 reiserfs_write_lock(sb); 2880 reiserfs_write_lock(sb);
2881 } 2881 }
2882 2882
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index ba5f51ec3458..118662690cdf 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -593,7 +593,7 @@ static int reiserfs_create(struct inode *dir, struct dentry *dentry, int mode,
593 new_inode_init(inode, dir, mode); 593 new_inode_init(inode, dir, mode);
594 594
595 jbegin_count += reiserfs_cache_default_acl(dir); 595 jbegin_count += reiserfs_cache_default_acl(dir);
596 retval = reiserfs_security_init(dir, inode, &security); 596 retval = reiserfs_security_init(dir, inode, &dentry->d_name, &security);
597 if (retval < 0) { 597 if (retval < 0) {
598 drop_new_inode(inode); 598 drop_new_inode(inode);
599 return retval; 599 return retval;
@@ -667,7 +667,7 @@ static int reiserfs_mknod(struct inode *dir, struct dentry *dentry, int mode,
667 new_inode_init(inode, dir, mode); 667 new_inode_init(inode, dir, mode);
668 668
669 jbegin_count += reiserfs_cache_default_acl(dir); 669 jbegin_count += reiserfs_cache_default_acl(dir);
670 retval = reiserfs_security_init(dir, inode, &security); 670 retval = reiserfs_security_init(dir, inode, &dentry->d_name, &security);
671 if (retval < 0) { 671 if (retval < 0) {
672 drop_new_inode(inode); 672 drop_new_inode(inode);
673 return retval; 673 return retval;
@@ -747,7 +747,7 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
747 new_inode_init(inode, dir, mode); 747 new_inode_init(inode, dir, mode);
748 748
749 jbegin_count += reiserfs_cache_default_acl(dir); 749 jbegin_count += reiserfs_cache_default_acl(dir);
750 retval = reiserfs_security_init(dir, inode, &security); 750 retval = reiserfs_security_init(dir, inode, &dentry->d_name, &security);
751 if (retval < 0) { 751 if (retval < 0) {
752 drop_new_inode(inode); 752 drop_new_inode(inode);
753 return retval; 753 return retval;
@@ -771,7 +771,7 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
771 EMPTY_DIR_SIZE_V1 : EMPTY_DIR_SIZE, 771 EMPTY_DIR_SIZE_V1 : EMPTY_DIR_SIZE,
772 dentry, inode, &security); 772 dentry, inode, &security);
773 if (retval) { 773 if (retval) {
774 dir->i_nlink--; 774 DEC_DIR_INODE_NLINK(dir)
775 goto out_failed; 775 goto out_failed;
776 } 776 }
777 777
@@ -1032,7 +1032,8 @@ static int reiserfs_symlink(struct inode *parent_dir,
1032 } 1032 }
1033 new_inode_init(inode, parent_dir, mode); 1033 new_inode_init(inode, parent_dir, mode);
1034 1034
1035 retval = reiserfs_security_init(parent_dir, inode, &security); 1035 retval = reiserfs_security_init(parent_dir, inode, &dentry->d_name,
1036 &security);
1036 if (retval < 0) { 1037 if (retval < 0) {
1037 drop_new_inode(inode); 1038 drop_new_inode(inode);
1038 return retval; 1039 return retval;
@@ -1122,10 +1123,6 @@ static int reiserfs_link(struct dentry *old_dentry, struct inode *dir,
1122 reiserfs_write_unlock(dir->i_sb); 1123 reiserfs_write_unlock(dir->i_sb);
1123 return -EMLINK; 1124 return -EMLINK;
1124 } 1125 }
1125 if (inode->i_nlink == 0) {
1126 reiserfs_write_unlock(dir->i_sb);
1127 return -ENOENT;
1128 }
1129 1126
1130 /* inc before scheduling so reiserfs_unlink knows we are here */ 1127 /* inc before scheduling so reiserfs_unlink knows we are here */
1131 inc_nlink(inode); 1128 inc_nlink(inode);
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index 3cfb2e933644..5c11ca82b782 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -978,8 +978,6 @@ int reiserfs_permission(struct inode *inode, int mask, unsigned int flags)
978 978
979static int xattr_hide_revalidate(struct dentry *dentry, struct nameidata *nd) 979static int xattr_hide_revalidate(struct dentry *dentry, struct nameidata *nd)
980{ 980{
981 if (nd->flags & LOOKUP_RCU)
982 return -ECHILD;
983 return -EPERM; 981 return -EPERM;
984} 982}
985 983
diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c
index 90d2fcb67a31..3dc38f1206fc 100644
--- a/fs/reiserfs/xattr_acl.c
+++ b/fs/reiserfs/xattr_acl.c
@@ -26,7 +26,7 @@ posix_acl_set(struct dentry *dentry, const char *name, const void *value,
26 size_t jcreate_blocks; 26 size_t jcreate_blocks;
27 if (!reiserfs_posixacl(inode->i_sb)) 27 if (!reiserfs_posixacl(inode->i_sb))
28 return -EOPNOTSUPP; 28 return -EOPNOTSUPP;
29 if (!is_owner_or_cap(inode)) 29 if (!inode_owner_or_capable(inode))
30 return -EPERM; 30 return -EPERM;
31 31
32 if (value) { 32 if (value) {
diff --git a/fs/reiserfs/xattr_security.c b/fs/reiserfs/xattr_security.c
index 237c6928d3c6..ef66c18a9332 100644
--- a/fs/reiserfs/xattr_security.c
+++ b/fs/reiserfs/xattr_security.c
@@ -54,6 +54,7 @@ static size_t security_list(struct dentry *dentry, char *list, size_t list_len,
54 * of blocks needed for the transaction. If successful, reiserfs_security 54 * of blocks needed for the transaction. If successful, reiserfs_security
55 * must be released using reiserfs_security_free when the caller is done. */ 55 * must be released using reiserfs_security_free when the caller is done. */
56int reiserfs_security_init(struct inode *dir, struct inode *inode, 56int reiserfs_security_init(struct inode *dir, struct inode *inode,
57 const struct qstr *qstr,
57 struct reiserfs_security_handle *sec) 58 struct reiserfs_security_handle *sec)
58{ 59{
59 int blocks = 0; 60 int blocks = 0;
@@ -65,7 +66,7 @@ int reiserfs_security_init(struct inode *dir, struct inode *inode,
65 if (IS_PRIVATE(dir)) 66 if (IS_PRIVATE(dir))
66 return 0; 67 return 0;
67 68
68 error = security_inode_init_security(inode, dir, &sec->name, 69 error = security_inode_init_security(inode, dir, qstr, &sec->name,
69 &sec->value, &sec->length); 70 &sec->value, &sec->length);
70 if (error) { 71 if (error) {
71 if (error == -EOPNOTSUPP) 72 if (error == -EOPNOTSUPP)
diff --git a/fs/select.c b/fs/select.c
index e56560d2b08a..d33418fdc858 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -517,9 +517,6 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
517 * Update: ERESTARTSYS breaks at least the xview clock binary, so 517 * Update: ERESTARTSYS breaks at least the xview clock binary, so
518 * I'm trying ERESTARTNOHAND which restart only when you want to. 518 * I'm trying ERESTARTNOHAND which restart only when you want to.
519 */ 519 */
520#define MAX_SELECT_SECONDS \
521 ((unsigned long) (MAX_SCHEDULE_TIMEOUT / HZ)-1)
522
523int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp, 520int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp,
524 fd_set __user *exp, struct timespec *end_time) 521 fd_set __user *exp, struct timespec *end_time)
525{ 522{
diff --git a/fs/squashfs/Kconfig b/fs/squashfs/Kconfig
index aa68a8a31518..efc309fa3035 100644
--- a/fs/squashfs/Kconfig
+++ b/fs/squashfs/Kconfig
@@ -5,12 +5,12 @@ config SQUASHFS
5 help 5 help
6 Saying Y here includes support for SquashFS 4.0 (a Compressed 6 Saying Y here includes support for SquashFS 4.0 (a Compressed
7 Read-Only File System). Squashfs is a highly compressed read-only 7 Read-Only File System). Squashfs is a highly compressed read-only
8 filesystem for Linux. It uses zlib/lzo compression to compress both 8 filesystem for Linux. It uses zlib, lzo or xz compression to
9 files, inodes and directories. Inodes in the system are very small 9 compress both files, inodes and directories. Inodes in the system
10 and all blocks are packed to minimise data overhead. Block sizes 10 are very small and all blocks are packed to minimise data overhead.
11 greater than 4K are supported up to a maximum of 1 Mbytes (default 11 Block sizes greater than 4K are supported up to a maximum of 1 Mbytes
12 block size 128K). SquashFS 4.0 supports 64 bit filesystems and files 12 (default block size 128K). SquashFS 4.0 supports 64 bit filesystems
13 (larger than 4GB), full uid/gid information, hard links and 13 and files (larger than 4GB), full uid/gid information, hard links and
14 timestamps. 14 timestamps.
15 15
16 Squashfs is intended for general read-only filesystem use, for 16 Squashfs is intended for general read-only filesystem use, for
diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c
index 2fb2882f0fa7..8ab48bc2fa7d 100644
--- a/fs/squashfs/block.c
+++ b/fs/squashfs/block.c
@@ -63,6 +63,14 @@ static struct buffer_head *get_block_length(struct super_block *sb,
63 *length = (unsigned char) bh->b_data[*offset] | 63 *length = (unsigned char) bh->b_data[*offset] |
64 (unsigned char) bh->b_data[*offset + 1] << 8; 64 (unsigned char) bh->b_data[*offset + 1] << 8;
65 *offset += 2; 65 *offset += 2;
66
67 if (*offset == msblk->devblksize) {
68 put_bh(bh);
69 bh = sb_bread(sb, ++(*cur_index));
70 if (bh == NULL)
71 return NULL;
72 *offset = 0;
73 }
66 } 74 }
67 75
68 return bh; 76 return bh;
diff --git a/fs/squashfs/decompressor.c b/fs/squashfs/decompressor.c
index a5940e54c4dd..e921bd213738 100644
--- a/fs/squashfs/decompressor.c
+++ b/fs/squashfs/decompressor.c
@@ -23,6 +23,7 @@
23 23
24#include <linux/types.h> 24#include <linux/types.h>
25#include <linux/mutex.h> 25#include <linux/mutex.h>
26#include <linux/slab.h>
26#include <linux/buffer_head.h> 27#include <linux/buffer_head.h>
27 28
28#include "squashfs_fs.h" 29#include "squashfs_fs.h"
@@ -74,3 +75,36 @@ const struct squashfs_decompressor *squashfs_lookup_decompressor(int id)
74 75
75 return decompressor[i]; 76 return decompressor[i];
76} 77}
78
79
80void *squashfs_decompressor_init(struct super_block *sb, unsigned short flags)
81{
82 struct squashfs_sb_info *msblk = sb->s_fs_info;
83 void *strm, *buffer = NULL;
84 int length = 0;
85
86 /*
87 * Read decompressor specific options from file system if present
88 */
89 if (SQUASHFS_COMP_OPTS(flags)) {
90 buffer = kmalloc(PAGE_CACHE_SIZE, GFP_KERNEL);
91 if (buffer == NULL)
92 return ERR_PTR(-ENOMEM);
93
94 length = squashfs_read_data(sb, &buffer,
95 sizeof(struct squashfs_super_block), 0, NULL,
96 PAGE_CACHE_SIZE, 1);
97
98 if (length < 0) {
99 strm = ERR_PTR(length);
100 goto finished;
101 }
102 }
103
104 strm = msblk->decompressor->init(msblk, buffer, length);
105
106finished:
107 kfree(buffer);
108
109 return strm;
110}
diff --git a/fs/squashfs/decompressor.h b/fs/squashfs/decompressor.h
index 3b305a70f7aa..099745ad5691 100644
--- a/fs/squashfs/decompressor.h
+++ b/fs/squashfs/decompressor.h
@@ -24,7 +24,7 @@
24 */ 24 */
25 25
26struct squashfs_decompressor { 26struct squashfs_decompressor {
27 void *(*init)(struct squashfs_sb_info *); 27 void *(*init)(struct squashfs_sb_info *, void *, int);
28 void (*free)(void *); 28 void (*free)(void *);
29 int (*decompress)(struct squashfs_sb_info *, void **, 29 int (*decompress)(struct squashfs_sb_info *, void **,
30 struct buffer_head **, int, int, int, int, int); 30 struct buffer_head **, int, int, int, int, int);
@@ -33,11 +33,6 @@ struct squashfs_decompressor {
33 int supported; 33 int supported;
34}; 34};
35 35
36static inline void *squashfs_decompressor_init(struct squashfs_sb_info *msblk)
37{
38 return msblk->decompressor->init(msblk);
39}
40
41static inline void squashfs_decompressor_free(struct squashfs_sb_info *msblk, 36static inline void squashfs_decompressor_free(struct squashfs_sb_info *msblk,
42 void *s) 37 void *s)
43{ 38{
diff --git a/fs/squashfs/dir.c b/fs/squashfs/dir.c
index 0dc340aa2be9..3f79cd1d0c19 100644
--- a/fs/squashfs/dir.c
+++ b/fs/squashfs/dir.c
@@ -172,6 +172,11 @@ static int squashfs_readdir(struct file *file, void *dirent, filldir_t filldir)
172 length += sizeof(dirh); 172 length += sizeof(dirh);
173 173
174 dir_count = le32_to_cpu(dirh.count) + 1; 174 dir_count = le32_to_cpu(dirh.count) + 1;
175
176 /* dir_count should never be larger than 256 */
177 if (dir_count > 256)
178 goto failed_read;
179
175 while (dir_count--) { 180 while (dir_count--) {
176 /* 181 /*
177 * Read directory entry. 182 * Read directory entry.
@@ -183,6 +188,10 @@ static int squashfs_readdir(struct file *file, void *dirent, filldir_t filldir)
183 188
184 size = le16_to_cpu(dire->size) + 1; 189 size = le16_to_cpu(dire->size) + 1;
185 190
191 /* size should never be larger than SQUASHFS_NAME_LEN */
192 if (size > SQUASHFS_NAME_LEN)
193 goto failed_read;
194
186 err = squashfs_read_metadata(inode->i_sb, dire->name, 195 err = squashfs_read_metadata(inode->i_sb, dire->name,
187 &block, &offset, size); 196 &block, &offset, size);
188 if (err < 0) 197 if (err < 0)
diff --git a/fs/squashfs/lzo_wrapper.c b/fs/squashfs/lzo_wrapper.c
index 7da759e34c52..00f4dfc5f088 100644
--- a/fs/squashfs/lzo_wrapper.c
+++ b/fs/squashfs/lzo_wrapper.c
@@ -37,7 +37,7 @@ struct squashfs_lzo {
37 void *output; 37 void *output;
38}; 38};
39 39
40static void *lzo_init(struct squashfs_sb_info *msblk) 40static void *lzo_init(struct squashfs_sb_info *msblk, void *buff, int len)
41{ 41{
42 int block_size = max_t(int, msblk->block_size, SQUASHFS_METADATA_SIZE); 42 int block_size = max_t(int, msblk->block_size, SQUASHFS_METADATA_SIZE);
43 43
@@ -58,7 +58,7 @@ failed2:
58failed: 58failed:
59 ERROR("Failed to allocate lzo workspace\n"); 59 ERROR("Failed to allocate lzo workspace\n");
60 kfree(stream); 60 kfree(stream);
61 return NULL; 61 return ERR_PTR(-ENOMEM);
62} 62}
63 63
64 64
diff --git a/fs/squashfs/namei.c b/fs/squashfs/namei.c
index 7a9464d08cf6..5d922a6701ab 100644
--- a/fs/squashfs/namei.c
+++ b/fs/squashfs/namei.c
@@ -176,6 +176,11 @@ static struct dentry *squashfs_lookup(struct inode *dir, struct dentry *dentry,
176 length += sizeof(dirh); 176 length += sizeof(dirh);
177 177
178 dir_count = le32_to_cpu(dirh.count) + 1; 178 dir_count = le32_to_cpu(dirh.count) + 1;
179
180 /* dir_count should never be larger than 256 */
181 if (dir_count > 256)
182 goto data_error;
183
179 while (dir_count--) { 184 while (dir_count--) {
180 /* 185 /*
181 * Read directory entry. 186 * Read directory entry.
@@ -187,6 +192,10 @@ static struct dentry *squashfs_lookup(struct inode *dir, struct dentry *dentry,
187 192
188 size = le16_to_cpu(dire->size) + 1; 193 size = le16_to_cpu(dire->size) + 1;
189 194
195 /* size should never be larger than SQUASHFS_NAME_LEN */
196 if (size > SQUASHFS_NAME_LEN)
197 goto data_error;
198
190 err = squashfs_read_metadata(dir->i_sb, dire->name, 199 err = squashfs_read_metadata(dir->i_sb, dire->name,
191 &block, &offset, size); 200 &block, &offset, size);
192 if (err < 0) 201 if (err < 0)
@@ -228,6 +237,9 @@ exit_lookup:
228 d_add(dentry, inode); 237 d_add(dentry, inode);
229 return ERR_PTR(0); 238 return ERR_PTR(0);
230 239
240data_error:
241 err = -EIO;
242
231read_failure: 243read_failure:
232 ERROR("Unable to read directory block [%llx:%x]\n", 244 ERROR("Unable to read directory block [%llx:%x]\n",
233 squashfs_i(dir)->start + msblk->directory_table, 245 squashfs_i(dir)->start + msblk->directory_table,
diff --git a/fs/squashfs/squashfs.h b/fs/squashfs/squashfs.h
index ba729d808876..1f2e608b8785 100644
--- a/fs/squashfs/squashfs.h
+++ b/fs/squashfs/squashfs.h
@@ -48,6 +48,7 @@ extern int squashfs_read_table(struct super_block *, void *, u64, int);
48 48
49/* decompressor.c */ 49/* decompressor.c */
50extern const struct squashfs_decompressor *squashfs_lookup_decompressor(int); 50extern const struct squashfs_decompressor *squashfs_lookup_decompressor(int);
51extern void *squashfs_decompressor_init(struct super_block *, unsigned short);
51 52
52/* export.c */ 53/* export.c */
53extern __le64 *squashfs_read_inode_lookup_table(struct super_block *, u64, 54extern __le64 *squashfs_read_inode_lookup_table(struct super_block *, u64,
diff --git a/fs/squashfs/squashfs_fs.h b/fs/squashfs/squashfs_fs.h
index 39533feffd6d..4582c568ef4d 100644
--- a/fs/squashfs/squashfs_fs.h
+++ b/fs/squashfs/squashfs_fs.h
@@ -57,6 +57,7 @@
57#define SQUASHFS_ALWAYS_FRAG 5 57#define SQUASHFS_ALWAYS_FRAG 5
58#define SQUASHFS_DUPLICATE 6 58#define SQUASHFS_DUPLICATE 6
59#define SQUASHFS_EXPORT 7 59#define SQUASHFS_EXPORT 7
60#define SQUASHFS_COMP_OPT 10
60 61
61#define SQUASHFS_BIT(flag, bit) ((flag >> bit) & 1) 62#define SQUASHFS_BIT(flag, bit) ((flag >> bit) & 1)
62 63
@@ -81,6 +82,9 @@
81#define SQUASHFS_EXPORTABLE(flags) SQUASHFS_BIT(flags, \ 82#define SQUASHFS_EXPORTABLE(flags) SQUASHFS_BIT(flags, \
82 SQUASHFS_EXPORT) 83 SQUASHFS_EXPORT)
83 84
85#define SQUASHFS_COMP_OPTS(flags) SQUASHFS_BIT(flags, \
86 SQUASHFS_COMP_OPT)
87
84/* Max number of types and file types */ 88/* Max number of types and file types */
85#define SQUASHFS_DIR_TYPE 1 89#define SQUASHFS_DIR_TYPE 1
86#define SQUASHFS_REG_TYPE 2 90#define SQUASHFS_REG_TYPE 2
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index 20700b9f2b4c..5c8184c061a4 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -199,10 +199,6 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent)
199 199
200 err = -ENOMEM; 200 err = -ENOMEM;
201 201
202 msblk->stream = squashfs_decompressor_init(msblk);
203 if (msblk->stream == NULL)
204 goto failed_mount;
205
206 msblk->block_cache = squashfs_cache_init("metadata", 202 msblk->block_cache = squashfs_cache_init("metadata",
207 SQUASHFS_CACHED_BLKS, SQUASHFS_METADATA_SIZE); 203 SQUASHFS_CACHED_BLKS, SQUASHFS_METADATA_SIZE);
208 if (msblk->block_cache == NULL) 204 if (msblk->block_cache == NULL)
@@ -215,6 +211,13 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent)
215 goto failed_mount; 211 goto failed_mount;
216 } 212 }
217 213
214 msblk->stream = squashfs_decompressor_init(sb, flags);
215 if (IS_ERR(msblk->stream)) {
216 err = PTR_ERR(msblk->stream);
217 msblk->stream = NULL;
218 goto failed_mount;
219 }
220
218 /* Allocate and read id index table */ 221 /* Allocate and read id index table */
219 msblk->id_table = squashfs_read_id_index_table(sb, 222 msblk->id_table = squashfs_read_id_index_table(sb,
220 le64_to_cpu(sblk->id_table_start), le16_to_cpu(sblk->no_ids)); 223 le64_to_cpu(sblk->id_table_start), le16_to_cpu(sblk->no_ids));
@@ -370,8 +373,8 @@ static void squashfs_put_super(struct super_block *sb)
370} 373}
371 374
372 375
373static struct dentry *squashfs_mount(struct file_system_type *fs_type, int flags, 376static struct dentry *squashfs_mount(struct file_system_type *fs_type,
374 const char *dev_name, void *data) 377 int flags, const char *dev_name, void *data)
375{ 378{
376 return mount_bdev(fs_type, flags, dev_name, data, squashfs_fill_super); 379 return mount_bdev(fs_type, flags, dev_name, data, squashfs_fill_super);
377} 380}
diff --git a/fs/squashfs/xz_wrapper.c b/fs/squashfs/xz_wrapper.c
index 856756ca5ee4..aa47a286d1f8 100644
--- a/fs/squashfs/xz_wrapper.c
+++ b/fs/squashfs/xz_wrapper.c
@@ -26,10 +26,10 @@
26#include <linux/buffer_head.h> 26#include <linux/buffer_head.h>
27#include <linux/slab.h> 27#include <linux/slab.h>
28#include <linux/xz.h> 28#include <linux/xz.h>
29#include <linux/bitops.h>
29 30
30#include "squashfs_fs.h" 31#include "squashfs_fs.h"
31#include "squashfs_fs_sb.h" 32#include "squashfs_fs_sb.h"
32#include "squashfs_fs_i.h"
33#include "squashfs.h" 33#include "squashfs.h"
34#include "decompressor.h" 34#include "decompressor.h"
35 35
@@ -38,24 +38,57 @@ struct squashfs_xz {
38 struct xz_buf buf; 38 struct xz_buf buf;
39}; 39};
40 40
41static void *squashfs_xz_init(struct squashfs_sb_info *msblk) 41struct comp_opts {
42 __le32 dictionary_size;
43 __le32 flags;
44};
45
46static void *squashfs_xz_init(struct squashfs_sb_info *msblk, void *buff,
47 int len)
42{ 48{
43 int block_size = max_t(int, msblk->block_size, SQUASHFS_METADATA_SIZE); 49 struct comp_opts *comp_opts = buff;
50 struct squashfs_xz *stream;
51 int dict_size = msblk->block_size;
52 int err, n;
53
54 if (comp_opts) {
55 /* check compressor options are the expected length */
56 if (len < sizeof(*comp_opts)) {
57 err = -EIO;
58 goto failed;
59 }
60
61 dict_size = le32_to_cpu(comp_opts->dictionary_size);
62
63 /* the dictionary size should be 2^n or 2^n+2^(n+1) */
64 n = ffs(dict_size) - 1;
65 if (dict_size != (1 << n) && dict_size != (1 << n) +
66 (1 << (n + 1))) {
67 err = -EIO;
68 goto failed;
69 }
70 }
71
72 dict_size = max_t(int, dict_size, SQUASHFS_METADATA_SIZE);
44 73
45 struct squashfs_xz *stream = kmalloc(sizeof(*stream), GFP_KERNEL); 74 stream = kmalloc(sizeof(*stream), GFP_KERNEL);
46 if (stream == NULL) 75 if (stream == NULL) {
76 err = -ENOMEM;
47 goto failed; 77 goto failed;
78 }
48 79
49 stream->state = xz_dec_init(XZ_PREALLOC, block_size); 80 stream->state = xz_dec_init(XZ_PREALLOC, dict_size);
50 if (stream->state == NULL) 81 if (stream->state == NULL) {
82 kfree(stream);
83 err = -ENOMEM;
51 goto failed; 84 goto failed;
85 }
52 86
53 return stream; 87 return stream;
54 88
55failed: 89failed:
56 ERROR("Failed to allocate xz workspace\n"); 90 ERROR("Failed to initialise xz decompressor\n");
57 kfree(stream); 91 return ERR_PTR(err);
58 return NULL;
59} 92}
60 93
61 94
@@ -95,12 +128,6 @@ static int squashfs_xz_uncompress(struct squashfs_sb_info *msblk, void **buffer,
95 if (!buffer_uptodate(bh[k])) 128 if (!buffer_uptodate(bh[k]))
96 goto release_mutex; 129 goto release_mutex;
97 130
98 if (avail == 0) {
99 offset = 0;
100 put_bh(bh[k++]);
101 continue;
102 }
103
104 stream->buf.in = bh[k]->b_data + offset; 131 stream->buf.in = bh[k]->b_data + offset;
105 stream->buf.in_size = avail; 132 stream->buf.in_size = avail;
106 stream->buf.in_pos = 0; 133 stream->buf.in_pos = 0;
diff --git a/fs/squashfs/zlib_wrapper.c b/fs/squashfs/zlib_wrapper.c
index 818a5e063faf..517688b32ffa 100644
--- a/fs/squashfs/zlib_wrapper.c
+++ b/fs/squashfs/zlib_wrapper.c
@@ -26,19 +26,19 @@
26#include <linux/buffer_head.h> 26#include <linux/buffer_head.h>
27#include <linux/slab.h> 27#include <linux/slab.h>
28#include <linux/zlib.h> 28#include <linux/zlib.h>
29#include <linux/vmalloc.h>
29 30
30#include "squashfs_fs.h" 31#include "squashfs_fs.h"
31#include "squashfs_fs_sb.h" 32#include "squashfs_fs_sb.h"
32#include "squashfs.h" 33#include "squashfs.h"
33#include "decompressor.h" 34#include "decompressor.h"
34 35
35static void *zlib_init(struct squashfs_sb_info *dummy) 36static void *zlib_init(struct squashfs_sb_info *dummy, void *buff, int len)
36{ 37{
37 z_stream *stream = kmalloc(sizeof(z_stream), GFP_KERNEL); 38 z_stream *stream = kmalloc(sizeof(z_stream), GFP_KERNEL);
38 if (stream == NULL) 39 if (stream == NULL)
39 goto failed; 40 goto failed;
40 stream->workspace = kmalloc(zlib_inflate_workspacesize(), 41 stream->workspace = vmalloc(zlib_inflate_workspacesize());
41 GFP_KERNEL);
42 if (stream->workspace == NULL) 42 if (stream->workspace == NULL)
43 goto failed; 43 goto failed;
44 44
@@ -47,7 +47,7 @@ static void *zlib_init(struct squashfs_sb_info *dummy)
47failed: 47failed:
48 ERROR("Failed to allocate zlib workspace\n"); 48 ERROR("Failed to allocate zlib workspace\n");
49 kfree(stream); 49 kfree(stream);
50 return NULL; 50 return ERR_PTR(-ENOMEM);
51} 51}
52 52
53 53
@@ -56,7 +56,7 @@ static void zlib_free(void *strm)
56 z_stream *stream = strm; 56 z_stream *stream = strm;
57 57
58 if (stream) 58 if (stream)
59 kfree(stream->workspace); 59 vfree(stream->workspace);
60 kfree(stream); 60 kfree(stream);
61} 61}
62 62
@@ -82,12 +82,6 @@ static int zlib_uncompress(struct squashfs_sb_info *msblk, void **buffer,
82 if (!buffer_uptodate(bh[k])) 82 if (!buffer_uptodate(bh[k]))
83 goto release_mutex; 83 goto release_mutex;
84 84
85 if (avail == 0) {
86 offset = 0;
87 put_bh(bh[k++]);
88 continue;
89 }
90
91 stream->next_in = bh[k]->b_data + offset; 85 stream->next_in = bh[k]->b_data + offset;
92 stream->avail_in = avail; 86 stream->avail_in = avail;
93 offset = 0; 87 offset = 0;
diff --git a/fs/stat.c b/fs/stat.c
index d5c61cf2b703..961039121cb8 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -75,13 +75,16 @@ int vfs_fstatat(int dfd, const char __user *filename, struct kstat *stat,
75 int error = -EINVAL; 75 int error = -EINVAL;
76 int lookup_flags = 0; 76 int lookup_flags = 0;
77 77
78 if ((flag & ~(AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT)) != 0) 78 if ((flag & ~(AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT |
79 AT_EMPTY_PATH)) != 0)
79 goto out; 80 goto out;
80 81
81 if (!(flag & AT_SYMLINK_NOFOLLOW)) 82 if (!(flag & AT_SYMLINK_NOFOLLOW))
82 lookup_flags |= LOOKUP_FOLLOW; 83 lookup_flags |= LOOKUP_FOLLOW;
83 if (flag & AT_NO_AUTOMOUNT) 84 if (flag & AT_NO_AUTOMOUNT)
84 lookup_flags |= LOOKUP_NO_AUTOMOUNT; 85 lookup_flags |= LOOKUP_NO_AUTOMOUNT;
86 if (flag & AT_EMPTY_PATH)
87 lookup_flags |= LOOKUP_EMPTY;
85 88
86 error = user_path_at(dfd, filename, lookup_flags, &path); 89 error = user_path_at(dfd, filename, lookup_flags, &path);
87 if (error) 90 if (error)
@@ -297,7 +300,7 @@ SYSCALL_DEFINE4(readlinkat, int, dfd, const char __user *, pathname,
297 if (bufsiz <= 0) 300 if (bufsiz <= 0)
298 return -EINVAL; 301 return -EINVAL;
299 302
300 error = user_path_at(dfd, pathname, 0, &path); 303 error = user_path_at(dfd, pathname, LOOKUP_EMPTY, &path);
301 if (!error) { 304 if (!error) {
302 struct inode *inode = path.dentry->d_inode; 305 struct inode *inode = path.dentry->d_inode;
303 306
diff --git a/fs/statfs.c b/fs/statfs.c
index 30ea8c8a996b..8244924dec55 100644
--- a/fs/statfs.c
+++ b/fs/statfs.c
@@ -73,149 +73,135 @@ int vfs_statfs(struct path *path, struct kstatfs *buf)
73} 73}
74EXPORT_SYMBOL(vfs_statfs); 74EXPORT_SYMBOL(vfs_statfs);
75 75
76static int do_statfs_native(struct path *path, struct statfs *buf) 76int user_statfs(const char __user *pathname, struct kstatfs *st)
77{ 77{
78 struct kstatfs st; 78 struct path path;
79 int retval; 79 int error = user_path(pathname, &path);
80 if (!error) {
81 error = vfs_statfs(&path, st);
82 path_put(&path);
83 }
84 return error;
85}
80 86
81 retval = vfs_statfs(path, &st); 87int fd_statfs(int fd, struct kstatfs *st)
82 if (retval) 88{
83 return retval; 89 struct file *file = fget(fd);
90 int error = -EBADF;
91 if (file) {
92 error = vfs_statfs(&file->f_path, st);
93 fput(file);
94 }
95 return error;
96}
84 97
85 if (sizeof(*buf) == sizeof(st)) 98static int do_statfs_native(struct kstatfs *st, struct statfs __user *p)
86 memcpy(buf, &st, sizeof(st)); 99{
100 struct statfs buf;
101
102 if (sizeof(buf) == sizeof(*st))
103 memcpy(&buf, st, sizeof(*st));
87 else { 104 else {
88 if (sizeof buf->f_blocks == 4) { 105 if (sizeof buf.f_blocks == 4) {
89 if ((st.f_blocks | st.f_bfree | st.f_bavail | 106 if ((st->f_blocks | st->f_bfree | st->f_bavail |
90 st.f_bsize | st.f_frsize) & 107 st->f_bsize | st->f_frsize) &
91 0xffffffff00000000ULL) 108 0xffffffff00000000ULL)
92 return -EOVERFLOW; 109 return -EOVERFLOW;
93 /* 110 /*
94 * f_files and f_ffree may be -1; it's okay to stuff 111 * f_files and f_ffree may be -1; it's okay to stuff
95 * that into 32 bits 112 * that into 32 bits
96 */ 113 */
97 if (st.f_files != -1 && 114 if (st->f_files != -1 &&
98 (st.f_files & 0xffffffff00000000ULL)) 115 (st->f_files & 0xffffffff00000000ULL))
99 return -EOVERFLOW; 116 return -EOVERFLOW;
100 if (st.f_ffree != -1 && 117 if (st->f_ffree != -1 &&
101 (st.f_ffree & 0xffffffff00000000ULL)) 118 (st->f_ffree & 0xffffffff00000000ULL))
102 return -EOVERFLOW; 119 return -EOVERFLOW;
103 } 120 }
104 121
105 buf->f_type = st.f_type; 122 buf.f_type = st->f_type;
106 buf->f_bsize = st.f_bsize; 123 buf.f_bsize = st->f_bsize;
107 buf->f_blocks = st.f_blocks; 124 buf.f_blocks = st->f_blocks;
108 buf->f_bfree = st.f_bfree; 125 buf.f_bfree = st->f_bfree;
109 buf->f_bavail = st.f_bavail; 126 buf.f_bavail = st->f_bavail;
110 buf->f_files = st.f_files; 127 buf.f_files = st->f_files;
111 buf->f_ffree = st.f_ffree; 128 buf.f_ffree = st->f_ffree;
112 buf->f_fsid = st.f_fsid; 129 buf.f_fsid = st->f_fsid;
113 buf->f_namelen = st.f_namelen; 130 buf.f_namelen = st->f_namelen;
114 buf->f_frsize = st.f_frsize; 131 buf.f_frsize = st->f_frsize;
115 buf->f_flags = st.f_flags; 132 buf.f_flags = st->f_flags;
116 memset(buf->f_spare, 0, sizeof(buf->f_spare)); 133 memset(buf.f_spare, 0, sizeof(buf.f_spare));
117 } 134 }
135 if (copy_to_user(p, &buf, sizeof(buf)))
136 return -EFAULT;
118 return 0; 137 return 0;
119} 138}
120 139
121static int do_statfs64(struct path *path, struct statfs64 *buf) 140static int do_statfs64(struct kstatfs *st, struct statfs64 __user *p)
122{ 141{
123 struct kstatfs st; 142 struct statfs64 buf;
124 int retval; 143 if (sizeof(buf) == sizeof(*st))
125 144 memcpy(&buf, st, sizeof(*st));
126 retval = vfs_statfs(path, &st);
127 if (retval)
128 return retval;
129
130 if (sizeof(*buf) == sizeof(st))
131 memcpy(buf, &st, sizeof(st));
132 else { 145 else {
133 buf->f_type = st.f_type; 146 buf.f_type = st->f_type;
134 buf->f_bsize = st.f_bsize; 147 buf.f_bsize = st->f_bsize;
135 buf->f_blocks = st.f_blocks; 148 buf.f_blocks = st->f_blocks;
136 buf->f_bfree = st.f_bfree; 149 buf.f_bfree = st->f_bfree;
137 buf->f_bavail = st.f_bavail; 150 buf.f_bavail = st->f_bavail;
138 buf->f_files = st.f_files; 151 buf.f_files = st->f_files;
139 buf->f_ffree = st.f_ffree; 152 buf.f_ffree = st->f_ffree;
140 buf->f_fsid = st.f_fsid; 153 buf.f_fsid = st->f_fsid;
141 buf->f_namelen = st.f_namelen; 154 buf.f_namelen = st->f_namelen;
142 buf->f_frsize = st.f_frsize; 155 buf.f_frsize = st->f_frsize;
143 buf->f_flags = st.f_flags; 156 buf.f_flags = st->f_flags;
144 memset(buf->f_spare, 0, sizeof(buf->f_spare)); 157 memset(buf.f_spare, 0, sizeof(buf.f_spare));
145 } 158 }
159 if (copy_to_user(p, &buf, sizeof(buf)))
160 return -EFAULT;
146 return 0; 161 return 0;
147} 162}
148 163
149SYSCALL_DEFINE2(statfs, const char __user *, pathname, struct statfs __user *, buf) 164SYSCALL_DEFINE2(statfs, const char __user *, pathname, struct statfs __user *, buf)
150{ 165{
151 struct path path; 166 struct kstatfs st;
152 int error; 167 int error = user_statfs(pathname, &st);
153 168 if (!error)
154 error = user_path(pathname, &path); 169 error = do_statfs_native(&st, buf);
155 if (!error) {
156 struct statfs tmp;
157 error = do_statfs_native(&path, &tmp);
158 if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
159 error = -EFAULT;
160 path_put(&path);
161 }
162 return error; 170 return error;
163} 171}
164 172
165SYSCALL_DEFINE3(statfs64, const char __user *, pathname, size_t, sz, struct statfs64 __user *, buf) 173SYSCALL_DEFINE3(statfs64, const char __user *, pathname, size_t, sz, struct statfs64 __user *, buf)
166{ 174{
167 struct path path; 175 struct kstatfs st;
168 long error; 176 int error;
169
170 if (sz != sizeof(*buf)) 177 if (sz != sizeof(*buf))
171 return -EINVAL; 178 return -EINVAL;
172 error = user_path(pathname, &path); 179 error = user_statfs(pathname, &st);
173 if (!error) { 180 if (!error)
174 struct statfs64 tmp; 181 error = do_statfs64(&st, buf);
175 error = do_statfs64(&path, &tmp);
176 if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
177 error = -EFAULT;
178 path_put(&path);
179 }
180 return error; 182 return error;
181} 183}
182 184
183SYSCALL_DEFINE2(fstatfs, unsigned int, fd, struct statfs __user *, buf) 185SYSCALL_DEFINE2(fstatfs, unsigned int, fd, struct statfs __user *, buf)
184{ 186{
185 struct file *file; 187 struct kstatfs st;
186 struct statfs tmp; 188 int error = fd_statfs(fd, &st);
187 int error; 189 if (!error)
188 190 error = do_statfs_native(&st, buf);
189 error = -EBADF;
190 file = fget(fd);
191 if (!file)
192 goto out;
193 error = do_statfs_native(&file->f_path, &tmp);
194 if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
195 error = -EFAULT;
196 fput(file);
197out:
198 return error; 191 return error;
199} 192}
200 193
201SYSCALL_DEFINE3(fstatfs64, unsigned int, fd, size_t, sz, struct statfs64 __user *, buf) 194SYSCALL_DEFINE3(fstatfs64, unsigned int, fd, size_t, sz, struct statfs64 __user *, buf)
202{ 195{
203 struct file *file; 196 struct kstatfs st;
204 struct statfs64 tmp;
205 int error; 197 int error;
206 198
207 if (sz != sizeof(*buf)) 199 if (sz != sizeof(*buf))
208 return -EINVAL; 200 return -EINVAL;
209 201
210 error = -EBADF; 202 error = fd_statfs(fd, &st);
211 file = fget(fd); 203 if (!error)
212 if (!file) 204 error = do_statfs64(&st, buf);
213 goto out;
214 error = do_statfs64(&file->f_path, &tmp);
215 if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
216 error = -EFAULT;
217 fput(file);
218out:
219 return error; 205 return error;
220} 206}
221 207
diff --git a/fs/super.c b/fs/super.c
index 74e149efed81..8a06881b1920 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -71,6 +71,7 @@ static struct super_block *alloc_super(struct file_system_type *type)
71#else 71#else
72 INIT_LIST_HEAD(&s->s_files); 72 INIT_LIST_HEAD(&s->s_files);
73#endif 73#endif
74 s->s_bdi = &default_backing_dev_info;
74 INIT_LIST_HEAD(&s->s_instances); 75 INIT_LIST_HEAD(&s->s_instances);
75 INIT_HLIST_BL_HEAD(&s->s_anon); 76 INIT_HLIST_BL_HEAD(&s->s_anon);
76 INIT_LIST_HEAD(&s->s_inodes); 77 INIT_LIST_HEAD(&s->s_inodes);
@@ -177,6 +178,11 @@ void deactivate_locked_super(struct super_block *s)
177 struct file_system_type *fs = s->s_type; 178 struct file_system_type *fs = s->s_type;
178 if (atomic_dec_and_test(&s->s_active)) { 179 if (atomic_dec_and_test(&s->s_active)) {
179 fs->kill_sb(s); 180 fs->kill_sb(s);
181 /*
182 * We need to call rcu_barrier so all the delayed rcu free
183 * inodes are flushed before we release the fs module.
184 */
185 rcu_barrier();
180 put_filesystem(fs); 186 put_filesystem(fs);
181 put_super(s); 187 put_super(s);
182 } else { 188 } else {
@@ -838,23 +844,6 @@ error:
838} 844}
839EXPORT_SYMBOL(mount_bdev); 845EXPORT_SYMBOL(mount_bdev);
840 846
841int get_sb_bdev(struct file_system_type *fs_type,
842 int flags, const char *dev_name, void *data,
843 int (*fill_super)(struct super_block *, void *, int),
844 struct vfsmount *mnt)
845{
846 struct dentry *root;
847
848 root = mount_bdev(fs_type, flags, dev_name, data, fill_super);
849 if (IS_ERR(root))
850 return PTR_ERR(root);
851 mnt->mnt_root = root;
852 mnt->mnt_sb = root->d_sb;
853 return 0;
854}
855
856EXPORT_SYMBOL(get_sb_bdev);
857
858void kill_block_super(struct super_block *sb) 847void kill_block_super(struct super_block *sb)
859{ 848{
860 struct block_device *bdev = sb->s_bdev; 849 struct block_device *bdev = sb->s_bdev;
@@ -892,22 +881,6 @@ struct dentry *mount_nodev(struct file_system_type *fs_type,
892} 881}
893EXPORT_SYMBOL(mount_nodev); 882EXPORT_SYMBOL(mount_nodev);
894 883
895int get_sb_nodev(struct file_system_type *fs_type,
896 int flags, void *data,
897 int (*fill_super)(struct super_block *, void *, int),
898 struct vfsmount *mnt)
899{
900 struct dentry *root;
901
902 root = mount_nodev(fs_type, flags, data, fill_super);
903 if (IS_ERR(root))
904 return PTR_ERR(root);
905 mnt->mnt_root = root;
906 mnt->mnt_sb = root->d_sb;
907 return 0;
908}
909EXPORT_SYMBOL(get_sb_nodev);
910
911static int compare_single(struct super_block *s, void *p) 884static int compare_single(struct super_block *s, void *p)
912{ 885{
913 return 1; 886 return 1;
@@ -938,69 +911,36 @@ struct dentry *mount_single(struct file_system_type *fs_type,
938} 911}
939EXPORT_SYMBOL(mount_single); 912EXPORT_SYMBOL(mount_single);
940 913
941int get_sb_single(struct file_system_type *fs_type, 914struct dentry *
942 int flags, void *data, 915mount_fs(struct file_system_type *type, int flags, const char *name, void *data)
943 int (*fill_super)(struct super_block *, void *, int),
944 struct vfsmount *mnt)
945{
946 struct dentry *root;
947 root = mount_single(fs_type, flags, data, fill_super);
948 if (IS_ERR(root))
949 return PTR_ERR(root);
950 mnt->mnt_root = root;
951 mnt->mnt_sb = root->d_sb;
952 return 0;
953}
954
955EXPORT_SYMBOL(get_sb_single);
956
957struct vfsmount *
958vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void *data)
959{ 916{
960 struct vfsmount *mnt;
961 struct dentry *root; 917 struct dentry *root;
918 struct super_block *sb;
962 char *secdata = NULL; 919 char *secdata = NULL;
963 int error; 920 int error = -ENOMEM;
964
965 if (!type)
966 return ERR_PTR(-ENODEV);
967
968 error = -ENOMEM;
969 mnt = alloc_vfsmnt(name);
970 if (!mnt)
971 goto out;
972
973 if (flags & MS_KERNMOUNT)
974 mnt->mnt_flags = MNT_INTERNAL;
975 921
976 if (data && !(type->fs_flags & FS_BINARY_MOUNTDATA)) { 922 if (data && !(type->fs_flags & FS_BINARY_MOUNTDATA)) {
977 secdata = alloc_secdata(); 923 secdata = alloc_secdata();
978 if (!secdata) 924 if (!secdata)
979 goto out_mnt; 925 goto out;
980 926
981 error = security_sb_copy_data(data, secdata); 927 error = security_sb_copy_data(data, secdata);
982 if (error) 928 if (error)
983 goto out_free_secdata; 929 goto out_free_secdata;
984 } 930 }
985 931
986 if (type->mount) { 932 root = type->mount(type, flags, name, data);
987 root = type->mount(type, flags, name, data); 933 if (IS_ERR(root)) {
988 if (IS_ERR(root)) { 934 error = PTR_ERR(root);
989 error = PTR_ERR(root); 935 goto out_free_secdata;
990 goto out_free_secdata;
991 }
992 mnt->mnt_root = root;
993 mnt->mnt_sb = root->d_sb;
994 } else {
995 error = type->get_sb(type, flags, name, data, mnt);
996 if (error < 0)
997 goto out_free_secdata;
998 } 936 }
999 BUG_ON(!mnt->mnt_sb); 937 sb = root->d_sb;
1000 WARN_ON(!mnt->mnt_sb->s_bdi); 938 BUG_ON(!sb);
1001 mnt->mnt_sb->s_flags |= MS_BORN; 939 WARN_ON(!sb->s_bdi);
940 WARN_ON(sb->s_bdi == &default_backing_dev_info);
941 sb->s_flags |= MS_BORN;
1002 942
1003 error = security_sb_kern_mount(mnt->mnt_sb, flags, secdata); 943 error = security_sb_kern_mount(sb, flags, secdata);
1004 if (error) 944 if (error)
1005 goto out_sb; 945 goto out_sb;
1006 946
@@ -1011,27 +951,21 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void
1011 * violate this rule. This warning should be either removed or 951 * violate this rule. This warning should be either removed or
1012 * converted to a BUG() in 2.6.34. 952 * converted to a BUG() in 2.6.34.
1013 */ 953 */
1014 WARN((mnt->mnt_sb->s_maxbytes < 0), "%s set sb->s_maxbytes to " 954 WARN((sb->s_maxbytes < 0), "%s set sb->s_maxbytes to "
1015 "negative value (%lld)\n", type->name, mnt->mnt_sb->s_maxbytes); 955 "negative value (%lld)\n", type->name, sb->s_maxbytes);
1016 956
1017 mnt->mnt_mountpoint = mnt->mnt_root; 957 up_write(&sb->s_umount);
1018 mnt->mnt_parent = mnt;
1019 up_write(&mnt->mnt_sb->s_umount);
1020 free_secdata(secdata); 958 free_secdata(secdata);
1021 return mnt; 959 return root;
1022out_sb: 960out_sb:
1023 dput(mnt->mnt_root); 961 dput(root);
1024 deactivate_locked_super(mnt->mnt_sb); 962 deactivate_locked_super(sb);
1025out_free_secdata: 963out_free_secdata:
1026 free_secdata(secdata); 964 free_secdata(secdata);
1027out_mnt:
1028 free_vfsmnt(mnt);
1029out: 965out:
1030 return ERR_PTR(error); 966 return ERR_PTR(error);
1031} 967}
1032 968
1033EXPORT_SYMBOL_GPL(vfs_kern_mount);
1034
1035/** 969/**
1036 * freeze_super - lock the filesystem and force it into a consistent state 970 * freeze_super - lock the filesystem and force it into a consistent state
1037 * @sb: the super to lock 971 * @sb: the super to lock
@@ -1121,49 +1055,3 @@ out:
1121 return 0; 1055 return 0;
1122} 1056}
1123EXPORT_SYMBOL(thaw_super); 1057EXPORT_SYMBOL(thaw_super);
1124
1125static struct vfsmount *fs_set_subtype(struct vfsmount *mnt, const char *fstype)
1126{
1127 int err;
1128 const char *subtype = strchr(fstype, '.');
1129 if (subtype) {
1130 subtype++;
1131 err = -EINVAL;
1132 if (!subtype[0])
1133 goto err;
1134 } else
1135 subtype = "";
1136
1137 mnt->mnt_sb->s_subtype = kstrdup(subtype, GFP_KERNEL);
1138 err = -ENOMEM;
1139 if (!mnt->mnt_sb->s_subtype)
1140 goto err;
1141 return mnt;
1142
1143 err:
1144 mntput(mnt);
1145 return ERR_PTR(err);
1146}
1147
1148struct vfsmount *
1149do_kern_mount(const char *fstype, int flags, const char *name, void *data)
1150{
1151 struct file_system_type *type = get_fs_type(fstype);
1152 struct vfsmount *mnt;
1153 if (!type)
1154 return ERR_PTR(-ENODEV);
1155 mnt = vfs_kern_mount(type, flags, name, data);
1156 if (!IS_ERR(mnt) && (type->fs_flags & FS_HAS_SUBTYPE) &&
1157 !mnt->mnt_sb->s_subtype)
1158 mnt = fs_set_subtype(mnt, fstype);
1159 put_filesystem(type);
1160 return mnt;
1161}
1162EXPORT_SYMBOL_GPL(do_kern_mount);
1163
1164struct vfsmount *kern_mount_data(struct file_system_type *type, void *data)
1165{
1166 return vfs_kern_mount(type, MS_KERNMOUNT, type->name, data);
1167}
1168
1169EXPORT_SYMBOL_GPL(kern_mount_data);
diff --git a/fs/sync.c b/fs/sync.c
index ba76b9623e7e..c38ec163da6c 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -7,6 +7,7 @@
7#include <linux/fs.h> 7#include <linux/fs.h>
8#include <linux/slab.h> 8#include <linux/slab.h>
9#include <linux/module.h> 9#include <linux/module.h>
10#include <linux/namei.h>
10#include <linux/sched.h> 11#include <linux/sched.h>
11#include <linux/writeback.h> 12#include <linux/writeback.h>
12#include <linux/syscalls.h> 13#include <linux/syscalls.h>
@@ -33,7 +34,7 @@ static int __sync_filesystem(struct super_block *sb, int wait)
33 * This should be safe, as we require bdi backing to actually 34 * This should be safe, as we require bdi backing to actually
34 * write out data in the first place 35 * write out data in the first place
35 */ 36 */
36 if (!sb->s_bdi || sb->s_bdi == &noop_backing_dev_info) 37 if (sb->s_bdi == &noop_backing_dev_info)
37 return 0; 38 return 0;
38 39
39 if (sb->s_qcop && sb->s_qcop->quota_sync) 40 if (sb->s_qcop && sb->s_qcop->quota_sync)
@@ -79,7 +80,7 @@ EXPORT_SYMBOL_GPL(sync_filesystem);
79 80
80static void sync_one_sb(struct super_block *sb, void *arg) 81static void sync_one_sb(struct super_block *sb, void *arg)
81{ 82{
82 if (!(sb->s_flags & MS_RDONLY) && sb->s_bdi) 83 if (!(sb->s_flags & MS_RDONLY))
83 __sync_filesystem(sb, *(int *)arg); 84 __sync_filesystem(sb, *(int *)arg);
84} 85}
85/* 86/*
@@ -128,6 +129,29 @@ void emergency_sync(void)
128 } 129 }
129} 130}
130 131
132/*
133 * sync a single super
134 */
135SYSCALL_DEFINE1(syncfs, int, fd)
136{
137 struct file *file;
138 struct super_block *sb;
139 int ret;
140 int fput_needed;
141
142 file = fget_light(fd, &fput_needed);
143 if (!file)
144 return -EBADF;
145 sb = file->f_dentry->d_sb;
146
147 down_read(&sb->s_umount);
148 ret = sync_filesystem(sb);
149 up_read(&sb->s_umount);
150
151 fput_light(file, fput_needed);
152 return ret;
153}
154
131/** 155/**
132 * vfs_fsync_range - helper to sync a range of data & metadata to disk 156 * vfs_fsync_range - helper to sync a range of data & metadata to disk
133 * @file: file to sync 157 * @file: file to sync
diff --git a/fs/sysv/itree.c b/fs/sysv/itree.c
index 9ca66276315e..fa8d43c92bb8 100644
--- a/fs/sysv/itree.c
+++ b/fs/sysv/itree.c
@@ -488,7 +488,6 @@ static sector_t sysv_bmap(struct address_space *mapping, sector_t block)
488const struct address_space_operations sysv_aops = { 488const struct address_space_operations sysv_aops = {
489 .readpage = sysv_readpage, 489 .readpage = sysv_readpage,
490 .writepage = sysv_writepage, 490 .writepage = sysv_writepage,
491 .sync_page = block_sync_page,
492 .write_begin = sysv_write_begin, 491 .write_begin = sysv_write_begin,
493 .write_end = generic_write_end, 492 .write_end = generic_write_end,
494 .bmap = sysv_bmap 493 .bmap = sysv_bmap
diff --git a/fs/sysv/namei.c b/fs/sysv/namei.c
index b427b1208c26..e474fbcf8bde 100644
--- a/fs/sysv/namei.c
+++ b/fs/sysv/namei.c
@@ -245,7 +245,6 @@ static int sysv_rename(struct inode * old_dir, struct dentry * old_dentry,
245 new_de = sysv_find_entry(new_dentry, &new_page); 245 new_de = sysv_find_entry(new_dentry, &new_page);
246 if (!new_de) 246 if (!new_de)
247 goto out_dir; 247 goto out_dir;
248 inode_inc_link_count(old_inode);
249 sysv_set_link(new_de, new_page, old_inode); 248 sysv_set_link(new_de, new_page, old_inode);
250 new_inode->i_ctime = CURRENT_TIME_SEC; 249 new_inode->i_ctime = CURRENT_TIME_SEC;
251 if (dir_de) 250 if (dir_de)
@@ -257,18 +256,15 @@ static int sysv_rename(struct inode * old_dir, struct dentry * old_dentry,
257 if (new_dir->i_nlink >= SYSV_SB(new_dir->i_sb)->s_link_max) 256 if (new_dir->i_nlink >= SYSV_SB(new_dir->i_sb)->s_link_max)
258 goto out_dir; 257 goto out_dir;
259 } 258 }
260 inode_inc_link_count(old_inode);
261 err = sysv_add_link(new_dentry, old_inode); 259 err = sysv_add_link(new_dentry, old_inode);
262 if (err) { 260 if (err)
263 inode_dec_link_count(old_inode);
264 goto out_dir; 261 goto out_dir;
265 }
266 if (dir_de) 262 if (dir_de)
267 inode_inc_link_count(new_dir); 263 inode_inc_link_count(new_dir);
268 } 264 }
269 265
270 sysv_delete_entry(old_de, old_page); 266 sysv_delete_entry(old_de, old_page);
271 inode_dec_link_count(old_inode); 267 mark_inode_dirty(old_inode);
272 268
273 if (dir_de) { 269 if (dir_de) {
274 sysv_set_link(dir_de, dir_page, new_dir); 270 sysv_set_link(dir_de, dir_page, new_dir);
diff --git a/fs/ubifs/Kconfig b/fs/ubifs/Kconfig
index 830e3f76f442..d7440904be17 100644
--- a/fs/ubifs/Kconfig
+++ b/fs/ubifs/Kconfig
@@ -44,29 +44,17 @@ config UBIFS_FS_ZLIB
44 44
45# Debugging-related stuff 45# Debugging-related stuff
46config UBIFS_FS_DEBUG 46config UBIFS_FS_DEBUG
47 bool "Enable debugging" 47 bool "Enable debugging support"
48 depends on UBIFS_FS 48 depends on UBIFS_FS
49 select DEBUG_FS 49 select DEBUG_FS
50 select KALLSYMS_ALL 50 select KALLSYMS_ALL
51 help 51 help
52 This option enables UBIFS debugging. 52 This option enables UBIFS debugging support. It makes sure various
53 53 assertions, self-checks, debugging messages and test modes are compiled
54config UBIFS_FS_DEBUG_MSG_LVL 54 in (this all is compiled out otherwise). Assertions are light-weight
55 int "Default message level (0 = no extra messages, 3 = lots)" 55 and this option also enables them. Self-checks, debugging messages and
56 depends on UBIFS_FS_DEBUG 56 test modes are switched off by default. Thus, it is safe and actually
57 default "0" 57 recommended to have debugging support enabled, and it should not slow
58 help 58 down UBIFS. You can then further enable / disable individual debugging
59 This controls the amount of debugging messages produced by UBIFS. 59 features using UBIFS module parameters and the corresponding sysfs
60 If reporting bugs, please try to have available a full dump of the 60 interfaces.
61 messages at level 1 while the misbehaviour was occurring. Level 2
62 may become necessary if level 1 messages were not enough to find the
63 bug. Generally Level 3 should be avoided.
64
65config UBIFS_FS_DEBUG_CHKS
66 bool "Enable extra checks"
67 depends on UBIFS_FS_DEBUG
68 help
69 If extra checks are enabled UBIFS will check the consistency of its
70 internal data structures during operation. However, UBIFS performance
71 is dramatically slower when this option is selected especially if the
72 file system is large.
diff --git a/fs/ubifs/commit.c b/fs/ubifs/commit.c
index 02429d81ca33..b148fbc80f8d 100644
--- a/fs/ubifs/commit.c
+++ b/fs/ubifs/commit.c
@@ -48,6 +48,56 @@
48#include <linux/slab.h> 48#include <linux/slab.h>
49#include "ubifs.h" 49#include "ubifs.h"
50 50
51/*
52 * nothing_to_commit - check if there is nothing to commit.
53 * @c: UBIFS file-system description object
54 *
55 * This is a helper function which checks if there is anything to commit. It is
56 * used as an optimization to avoid starting the commit if it is not really
57 * necessary. Indeed, the commit operation always assumes flash I/O (e.g.,
58 * writing the commit start node to the log), and it is better to avoid doing
59 * this unnecessarily. E.g., 'ubifs_sync_fs()' runs the commit, but if there is
60 * nothing to commit, it is more optimal to avoid any flash I/O.
61 *
62 * This function has to be called with @c->commit_sem locked for writing -
63 * this function does not take LPT/TNC locks because the @c->commit_sem
64 * guarantees that we have exclusive access to the TNC and LPT data structures.
65 *
66 * This function returns %1 if there is nothing to commit and %0 otherwise.
67 */
68static int nothing_to_commit(struct ubifs_info *c)
69{
70 /*
71 * During mounting or remounting from R/O mode to R/W mode we may
72 * commit for various recovery-related reasons.
73 */
74 if (c->mounting || c->remounting_rw)
75 return 0;
76
77 /*
78 * If the root TNC node is dirty, we definitely have something to
79 * commit.
80 */
81 if (c->zroot.znode && test_bit(DIRTY_ZNODE, &c->zroot.znode->flags))
82 return 0;
83
84 /*
85 * Even though the TNC is clean, the LPT tree may have dirty nodes. For
86 * example, this may happen if the budgeting subsystem invoked GC to
87 * make some free space, and the GC found an LEB with only dirty and
88 * free space. In this case GC would just change the lprops of this
89 * LEB (by turning all space into free space) and unmap it.
90 */
91 if (c->nroot && test_bit(DIRTY_CNODE, &c->nroot->flags))
92 return 0;
93
94 ubifs_assert(atomic_long_read(&c->dirty_zn_cnt) == 0);
95 ubifs_assert(c->dirty_pn_cnt == 0);
96 ubifs_assert(c->dirty_nn_cnt == 0);
97
98 return 1;
99}
100
51/** 101/**
52 * do_commit - commit the journal. 102 * do_commit - commit the journal.
53 * @c: UBIFS file-system description object 103 * @c: UBIFS file-system description object
@@ -70,6 +120,12 @@ static int do_commit(struct ubifs_info *c)
70 goto out_up; 120 goto out_up;
71 } 121 }
72 122
123 if (nothing_to_commit(c)) {
124 up_write(&c->commit_sem);
125 err = 0;
126 goto out_cancel;
127 }
128
73 /* Sync all write buffers (necessary for recovery) */ 129 /* Sync all write buffers (necessary for recovery) */
74 for (i = 0; i < c->jhead_cnt; i++) { 130 for (i = 0; i < c->jhead_cnt; i++) {
75 err = ubifs_wbuf_sync(&c->jheads[i].wbuf); 131 err = ubifs_wbuf_sync(&c->jheads[i].wbuf);
@@ -162,12 +218,12 @@ static int do_commit(struct ubifs_info *c)
162 if (err) 218 if (err)
163 goto out; 219 goto out;
164 220
221out_cancel:
165 spin_lock(&c->cs_lock); 222 spin_lock(&c->cs_lock);
166 c->cmt_state = COMMIT_RESTING; 223 c->cmt_state = COMMIT_RESTING;
167 wake_up(&c->cmt_wq); 224 wake_up(&c->cmt_wq);
168 dbg_cmt("commit end"); 225 dbg_cmt("commit end");
169 spin_unlock(&c->cs_lock); 226 spin_unlock(&c->cs_lock);
170
171 return 0; 227 return 0;
172 228
173out_up: 229out_up:
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index 0bee4dbffc31..f25a7339f800 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -43,8 +43,8 @@ DEFINE_SPINLOCK(dbg_lock);
43static char dbg_key_buf0[128]; 43static char dbg_key_buf0[128];
44static char dbg_key_buf1[128]; 44static char dbg_key_buf1[128];
45 45
46unsigned int ubifs_msg_flags = UBIFS_MSG_FLAGS_DEFAULT; 46unsigned int ubifs_msg_flags;
47unsigned int ubifs_chk_flags = UBIFS_CHK_FLAGS_DEFAULT; 47unsigned int ubifs_chk_flags;
48unsigned int ubifs_tst_flags; 48unsigned int ubifs_tst_flags;
49 49
50module_param_named(debug_msgs, ubifs_msg_flags, uint, S_IRUGO | S_IWUSR); 50module_param_named(debug_msgs, ubifs_msg_flags, uint, S_IRUGO | S_IWUSR);
@@ -810,16 +810,24 @@ void dbg_dump_leb(const struct ubifs_info *c, int lnum)
810{ 810{
811 struct ubifs_scan_leb *sleb; 811 struct ubifs_scan_leb *sleb;
812 struct ubifs_scan_node *snod; 812 struct ubifs_scan_node *snod;
813 void *buf;
813 814
814 if (dbg_failure_mode) 815 if (dbg_failure_mode)
815 return; 816 return;
816 817
817 printk(KERN_DEBUG "(pid %d) start dumping LEB %d\n", 818 printk(KERN_DEBUG "(pid %d) start dumping LEB %d\n",
818 current->pid, lnum); 819 current->pid, lnum);
819 sleb = ubifs_scan(c, lnum, 0, c->dbg->buf, 0); 820
821 buf = __vmalloc(c->leb_size, GFP_NOFS, PAGE_KERNEL);
822 if (!buf) {
823 ubifs_err("cannot allocate memory for dumping LEB %d", lnum);
824 return;
825 }
826
827 sleb = ubifs_scan(c, lnum, 0, buf, 0);
820 if (IS_ERR(sleb)) { 828 if (IS_ERR(sleb)) {
821 ubifs_err("scan error %d", (int)PTR_ERR(sleb)); 829 ubifs_err("scan error %d", (int)PTR_ERR(sleb));
822 return; 830 goto out;
823 } 831 }
824 832
825 printk(KERN_DEBUG "LEB %d has %d nodes ending at %d\n", lnum, 833 printk(KERN_DEBUG "LEB %d has %d nodes ending at %d\n", lnum,
@@ -835,6 +843,9 @@ void dbg_dump_leb(const struct ubifs_info *c, int lnum)
835 printk(KERN_DEBUG "(pid %d) finish dumping LEB %d\n", 843 printk(KERN_DEBUG "(pid %d) finish dumping LEB %d\n",
836 current->pid, lnum); 844 current->pid, lnum);
837 ubifs_scan_destroy(sleb); 845 ubifs_scan_destroy(sleb);
846
847out:
848 vfree(buf);
838 return; 849 return;
839} 850}
840 851
@@ -2690,16 +2701,8 @@ int ubifs_debugging_init(struct ubifs_info *c)
2690 if (!c->dbg) 2701 if (!c->dbg)
2691 return -ENOMEM; 2702 return -ENOMEM;
2692 2703
2693 c->dbg->buf = vmalloc(c->leb_size);
2694 if (!c->dbg->buf)
2695 goto out;
2696
2697 failure_mode_init(c); 2704 failure_mode_init(c);
2698 return 0; 2705 return 0;
2699
2700out:
2701 kfree(c->dbg);
2702 return -ENOMEM;
2703} 2706}
2704 2707
2705/** 2708/**
@@ -2709,7 +2712,6 @@ out:
2709void ubifs_debugging_exit(struct ubifs_info *c) 2712void ubifs_debugging_exit(struct ubifs_info *c)
2710{ 2713{
2711 failure_mode_exit(c); 2714 failure_mode_exit(c);
2712 vfree(c->dbg->buf);
2713 kfree(c->dbg); 2715 kfree(c->dbg);
2714} 2716}
2715 2717
@@ -2813,19 +2815,19 @@ int dbg_debugfs_init_fs(struct ubifs_info *c)
2813 } 2815 }
2814 2816
2815 fname = "dump_lprops"; 2817 fname = "dump_lprops";
2816 dent = debugfs_create_file(fname, S_IWUGO, d->dfs_dir, c, &dfs_fops); 2818 dent = debugfs_create_file(fname, S_IWUSR, d->dfs_dir, c, &dfs_fops);
2817 if (IS_ERR(dent)) 2819 if (IS_ERR(dent))
2818 goto out_remove; 2820 goto out_remove;
2819 d->dfs_dump_lprops = dent; 2821 d->dfs_dump_lprops = dent;
2820 2822
2821 fname = "dump_budg"; 2823 fname = "dump_budg";
2822 dent = debugfs_create_file(fname, S_IWUGO, d->dfs_dir, c, &dfs_fops); 2824 dent = debugfs_create_file(fname, S_IWUSR, d->dfs_dir, c, &dfs_fops);
2823 if (IS_ERR(dent)) 2825 if (IS_ERR(dent))
2824 goto out_remove; 2826 goto out_remove;
2825 d->dfs_dump_budg = dent; 2827 d->dfs_dump_budg = dent;
2826 2828
2827 fname = "dump_tnc"; 2829 fname = "dump_tnc";
2828 dent = debugfs_create_file(fname, S_IWUGO, d->dfs_dir, c, &dfs_fops); 2830 dent = debugfs_create_file(fname, S_IWUSR, d->dfs_dir, c, &dfs_fops);
2829 if (IS_ERR(dent)) 2831 if (IS_ERR(dent))
2830 goto out_remove; 2832 goto out_remove;
2831 d->dfs_dump_tnc = dent; 2833 d->dfs_dump_tnc = dent;
diff --git a/fs/ubifs/debug.h b/fs/ubifs/debug.h
index 69ebe4729151..919f0de29d8f 100644
--- a/fs/ubifs/debug.h
+++ b/fs/ubifs/debug.h
@@ -27,7 +27,6 @@
27 27
28/** 28/**
29 * ubifs_debug_info - per-FS debugging information. 29 * ubifs_debug_info - per-FS debugging information.
30 * @buf: a buffer of LEB size, used for various purposes
31 * @old_zroot: old index root - used by 'dbg_check_old_index()' 30 * @old_zroot: old index root - used by 'dbg_check_old_index()'
32 * @old_zroot_level: old index root level - used by 'dbg_check_old_index()' 31 * @old_zroot_level: old index root level - used by 'dbg_check_old_index()'
33 * @old_zroot_sqnum: old index root sqnum - used by 'dbg_check_old_index()' 32 * @old_zroot_sqnum: old index root sqnum - used by 'dbg_check_old_index()'
@@ -54,7 +53,6 @@
54 * dfs_dump_tnc: "dump TNC" debugfs knob 53 * dfs_dump_tnc: "dump TNC" debugfs knob
55 */ 54 */
56struct ubifs_debug_info { 55struct ubifs_debug_info {
57 void *buf;
58 struct ubifs_zbranch old_zroot; 56 struct ubifs_zbranch old_zroot;
59 int old_zroot_level; 57 int old_zroot_level;
60 unsigned long long old_zroot_sqnum; 58 unsigned long long old_zroot_sqnum;
@@ -173,7 +171,7 @@ const char *dbg_key_str1(const struct ubifs_info *c,
173#define dbg_rcvry(fmt, ...) dbg_do_msg(UBIFS_MSG_RCVRY, fmt, ##__VA_ARGS__) 171#define dbg_rcvry(fmt, ...) dbg_do_msg(UBIFS_MSG_RCVRY, fmt, ##__VA_ARGS__)
174 172
175/* 173/*
176 * Debugging message type flags (must match msg_type_names in debug.c). 174 * Debugging message type flags.
177 * 175 *
178 * UBIFS_MSG_GEN: general messages 176 * UBIFS_MSG_GEN: general messages
179 * UBIFS_MSG_JNL: journal messages 177 * UBIFS_MSG_JNL: journal messages
@@ -205,14 +203,8 @@ enum {
205 UBIFS_MSG_RCVRY = 0x1000, 203 UBIFS_MSG_RCVRY = 0x1000,
206}; 204};
207 205
208/* Debugging message type flags for each default debug message level */
209#define UBIFS_MSG_LVL_0 0
210#define UBIFS_MSG_LVL_1 0x1
211#define UBIFS_MSG_LVL_2 0x7f
212#define UBIFS_MSG_LVL_3 0xffff
213
214/* 206/*
215 * Debugging check flags (must match chk_names in debug.c). 207 * Debugging check flags.
216 * 208 *
217 * UBIFS_CHK_GEN: general checks 209 * UBIFS_CHK_GEN: general checks
218 * UBIFS_CHK_TNC: check TNC 210 * UBIFS_CHK_TNC: check TNC
@@ -233,7 +225,7 @@ enum {
233}; 225};
234 226
235/* 227/*
236 * Special testing flags (must match tst_names in debug.c). 228 * Special testing flags.
237 * 229 *
238 * UBIFS_TST_FORCE_IN_THE_GAPS: force the use of in-the-gaps method 230 * UBIFS_TST_FORCE_IN_THE_GAPS: force the use of in-the-gaps method
239 * UBIFS_TST_RCVRY: failure mode for recovery testing 231 * UBIFS_TST_RCVRY: failure mode for recovery testing
@@ -243,22 +235,6 @@ enum {
243 UBIFS_TST_RCVRY = 0x4, 235 UBIFS_TST_RCVRY = 0x4,
244}; 236};
245 237
246#if CONFIG_UBIFS_FS_DEBUG_MSG_LVL == 1
247#define UBIFS_MSG_FLAGS_DEFAULT UBIFS_MSG_LVL_1
248#elif CONFIG_UBIFS_FS_DEBUG_MSG_LVL == 2
249#define UBIFS_MSG_FLAGS_DEFAULT UBIFS_MSG_LVL_2
250#elif CONFIG_UBIFS_FS_DEBUG_MSG_LVL == 3
251#define UBIFS_MSG_FLAGS_DEFAULT UBIFS_MSG_LVL_3
252#else
253#define UBIFS_MSG_FLAGS_DEFAULT UBIFS_MSG_LVL_0
254#endif
255
256#ifdef CONFIG_UBIFS_FS_DEBUG_CHKS
257#define UBIFS_CHK_FLAGS_DEFAULT 0xffffffff
258#else
259#define UBIFS_CHK_FLAGS_DEFAULT 0
260#endif
261
262extern spinlock_t dbg_lock; 238extern spinlock_t dbg_lock;
263 239
264extern unsigned int ubifs_msg_flags; 240extern unsigned int ubifs_msg_flags;
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index 14f64b689d7f..7217d67a80a6 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -522,24 +522,6 @@ static int ubifs_link(struct dentry *old_dentry, struct inode *dir,
522 ubifs_assert(mutex_is_locked(&dir->i_mutex)); 522 ubifs_assert(mutex_is_locked(&dir->i_mutex));
523 ubifs_assert(mutex_is_locked(&inode->i_mutex)); 523 ubifs_assert(mutex_is_locked(&inode->i_mutex));
524 524
525 /*
526 * Return -ENOENT if we've raced with unlink and i_nlink is 0. Doing
527 * otherwise has the potential to corrupt the orphan inode list.
528 *
529 * Indeed, consider a scenario when 'vfs_link(dirA/fileA)' and
530 * 'vfs_unlink(dirA/fileA, dirB/fileB)' race. 'vfs_link()' does not
531 * lock 'dirA->i_mutex', so this is possible. Both of the functions
532 * lock 'fileA->i_mutex' though. Suppose 'vfs_unlink()' wins, and takes
533 * 'fileA->i_mutex' mutex first. Suppose 'fileA->i_nlink' is 1. In this
534 * case 'ubifs_unlink()' will drop the last reference, and put 'inodeA'
535 * to the list of orphans. After this, 'vfs_link()' will link
536 * 'dirB/fileB' to 'inodeA'. This is a problem because, for example,
537 * the subsequent 'vfs_unlink(dirB/fileB)' will add the same inode
538 * to the list of orphans.
539 */
540 if (inode->i_nlink == 0)
541 return -ENOENT;
542
543 err = dbg_check_synced_i_size(inode); 525 err = dbg_check_synced_i_size(inode);
544 if (err) 526 if (err)
545 return err; 527 return err;
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index d77db7e36484..28be1e6a65e8 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -448,10 +448,12 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
448 if (!(pos & ~PAGE_CACHE_MASK) && len == PAGE_CACHE_SIZE) { 448 if (!(pos & ~PAGE_CACHE_MASK) && len == PAGE_CACHE_SIZE) {
449 /* 449 /*
450 * We change whole page so no need to load it. But we 450 * We change whole page so no need to load it. But we
451 * have to set the @PG_checked flag to make the further 451 * do not know whether this page exists on the media or
452 * code know that the page is new. This might be not 452 * not, so we assume the latter because it requires
453 * true, but it is better to budget more than to read 453 * larger budget. The assumption is that it is better
454 * the page from the media. 454 * to budget a bit more than to read the page from the
455 * media. Thus, we are setting the @PG_checked flag
456 * here.
455 */ 457 */
456 SetPageChecked(page); 458 SetPageChecked(page);
457 skipped_read = 1; 459 skipped_read = 1;
@@ -559,6 +561,7 @@ static int ubifs_write_end(struct file *file, struct address_space *mapping,
559 dbg_gen("copied %d instead of %d, read page and repeat", 561 dbg_gen("copied %d instead of %d, read page and repeat",
560 copied, len); 562 copied, len);
561 cancel_budget(c, page, ui, appending); 563 cancel_budget(c, page, ui, appending);
564 ClearPageChecked(page);
562 565
563 /* 566 /*
564 * Return 0 to force VFS to repeat the whole operation, or the 567 * Return 0 to force VFS to repeat the whole operation, or the
diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c
index d82173182eeb..dfd168b7807e 100644
--- a/fs/ubifs/io.c
+++ b/fs/ubifs/io.c
@@ -31,6 +31,26 @@
31 * buffer is full or when it is not used for some time (by timer). This is 31 * buffer is full or when it is not used for some time (by timer). This is
32 * similar to the mechanism is used by JFFS2. 32 * similar to the mechanism is used by JFFS2.
33 * 33 *
34 * UBIFS distinguishes between minimum write size (@c->min_io_size) and maximum
35 * write size (@c->max_write_size). The latter is the maximum amount of bytes
36 * the underlying flash is able to program at a time, and writing in
37 * @c->max_write_size units should presumably be faster. Obviously,
38 * @c->min_io_size <= @c->max_write_size. Write-buffers are of
39 * @c->max_write_size bytes in size for maximum performance. However, when a
40 * write-buffer is flushed, only the portion of it (aligned to @c->min_io_size
41 * boundary) which contains data is written, not the whole write-buffer,
42 * because this is more space-efficient.
43 *
44 * This optimization adds few complications to the code. Indeed, on the one
45 * hand, we want to write in optimal @c->max_write_size bytes chunks, which
46 * also means aligning writes at the @c->max_write_size bytes offsets. On the
47 * other hand, we do not want to waste space when synchronizing the write
48 * buffer, so during synchronization we writes in smaller chunks. And this makes
49 * the next write offset to be not aligned to @c->max_write_size bytes. So the
50 * have to make sure that the write-buffer offset (@wbuf->offs) becomes aligned
51 * to @c->max_write_size bytes again. We do this by temporarily shrinking
52 * write-buffer size (@wbuf->size).
53 *
34 * Write-buffers are defined by 'struct ubifs_wbuf' objects and protected by 54 * Write-buffers are defined by 'struct ubifs_wbuf' objects and protected by
35 * mutexes defined inside these objects. Since sometimes upper-level code 55 * mutexes defined inside these objects. Since sometimes upper-level code
36 * has to lock the write-buffer (e.g. journal space reservation code), many 56 * has to lock the write-buffer (e.g. journal space reservation code), many
@@ -46,8 +66,8 @@
46 * UBIFS uses padding when it pads to the next min. I/O unit. In this case it 66 * UBIFS uses padding when it pads to the next min. I/O unit. In this case it
47 * uses padding nodes or padding bytes, if the padding node does not fit. 67 * uses padding nodes or padding bytes, if the padding node does not fit.
48 * 68 *
49 * All UBIFS nodes are protected by CRC checksums and UBIFS checks all nodes 69 * All UBIFS nodes are protected by CRC checksums and UBIFS checks CRC when
50 * every time they are read from the flash media. 70 * they are read from the flash media.
51 */ 71 */
52 72
53#include <linux/crc32.h> 73#include <linux/crc32.h>
@@ -88,8 +108,12 @@ void ubifs_ro_mode(struct ubifs_info *c, int err)
88 * This function may skip data nodes CRC checking if @c->no_chk_data_crc is 108 * This function may skip data nodes CRC checking if @c->no_chk_data_crc is
89 * true, which is controlled by corresponding UBIFS mount option. However, if 109 * true, which is controlled by corresponding UBIFS mount option. However, if
90 * @must_chk_crc is true, then @c->no_chk_data_crc is ignored and CRC is 110 * @must_chk_crc is true, then @c->no_chk_data_crc is ignored and CRC is
91 * checked. Similarly, if @c->always_chk_crc is true, @c->no_chk_data_crc is 111 * checked. Similarly, if @c->mounting or @c->remounting_rw is true (we are
92 * ignored and CRC is checked. 112 * mounting or re-mounting to R/W mode), @c->no_chk_data_crc is ignored and CRC
113 * is checked. This is because during mounting or re-mounting from R/O mode to
114 * R/W mode we may read journal nodes (when replying the journal or doing the
115 * recovery) and the journal nodes may potentially be corrupted, so checking is
116 * required.
93 * 117 *
94 * This function returns zero in case of success and %-EUCLEAN in case of bad 118 * This function returns zero in case of success and %-EUCLEAN in case of bad
95 * CRC or magic. 119 * CRC or magic.
@@ -131,8 +155,8 @@ int ubifs_check_node(const struct ubifs_info *c, const void *buf, int lnum,
131 node_len > c->ranges[type].max_len) 155 node_len > c->ranges[type].max_len)
132 goto out_len; 156 goto out_len;
133 157
134 if (!must_chk_crc && type == UBIFS_DATA_NODE && !c->always_chk_crc && 158 if (!must_chk_crc && type == UBIFS_DATA_NODE && !c->mounting &&
135 c->no_chk_data_crc) 159 !c->remounting_rw && c->no_chk_data_crc)
136 return 0; 160 return 0;
137 161
138 crc = crc32(UBIFS_CRC32_INIT, buf + 8, node_len - 8); 162 crc = crc32(UBIFS_CRC32_INIT, buf + 8, node_len - 8);
@@ -343,11 +367,17 @@ static void cancel_wbuf_timer_nolock(struct ubifs_wbuf *wbuf)
343 * 367 *
344 * This function synchronizes write-buffer @buf and returns zero in case of 368 * This function synchronizes write-buffer @buf and returns zero in case of
345 * success or a negative error code in case of failure. 369 * success or a negative error code in case of failure.
370 *
371 * Note, although write-buffers are of @c->max_write_size, this function does
372 * not necessarily writes all @c->max_write_size bytes to the flash. Instead,
373 * if the write-buffer is only partially filled with data, only the used part
374 * of the write-buffer (aligned on @c->min_io_size boundary) is synchronized.
375 * This way we waste less space.
346 */ 376 */
347int ubifs_wbuf_sync_nolock(struct ubifs_wbuf *wbuf) 377int ubifs_wbuf_sync_nolock(struct ubifs_wbuf *wbuf)
348{ 378{
349 struct ubifs_info *c = wbuf->c; 379 struct ubifs_info *c = wbuf->c;
350 int err, dirt; 380 int err, dirt, sync_len;
351 381
352 cancel_wbuf_timer_nolock(wbuf); 382 cancel_wbuf_timer_nolock(wbuf);
353 if (!wbuf->used || wbuf->lnum == -1) 383 if (!wbuf->used || wbuf->lnum == -1)
@@ -357,27 +387,53 @@ int ubifs_wbuf_sync_nolock(struct ubifs_wbuf *wbuf)
357 dbg_io("LEB %d:%d, %d bytes, jhead %s", 387 dbg_io("LEB %d:%d, %d bytes, jhead %s",
358 wbuf->lnum, wbuf->offs, wbuf->used, dbg_jhead(wbuf->jhead)); 388 wbuf->lnum, wbuf->offs, wbuf->used, dbg_jhead(wbuf->jhead));
359 ubifs_assert(!(wbuf->avail & 7)); 389 ubifs_assert(!(wbuf->avail & 7));
360 ubifs_assert(wbuf->offs + c->min_io_size <= c->leb_size); 390 ubifs_assert(wbuf->offs + wbuf->size <= c->leb_size);
391 ubifs_assert(wbuf->size >= c->min_io_size);
392 ubifs_assert(wbuf->size <= c->max_write_size);
393 ubifs_assert(wbuf->size % c->min_io_size == 0);
361 ubifs_assert(!c->ro_media && !c->ro_mount); 394 ubifs_assert(!c->ro_media && !c->ro_mount);
395 if (c->leb_size - wbuf->offs >= c->max_write_size)
396 ubifs_assert(!((wbuf->offs + wbuf->size) % c->max_write_size ));
362 397
363 if (c->ro_error) 398 if (c->ro_error)
364 return -EROFS; 399 return -EROFS;
365 400
366 ubifs_pad(c, wbuf->buf + wbuf->used, wbuf->avail); 401 /*
402 * Do not write whole write buffer but write only the minimum necessary
403 * amount of min. I/O units.
404 */
405 sync_len = ALIGN(wbuf->used, c->min_io_size);
406 dirt = sync_len - wbuf->used;
407 if (dirt)
408 ubifs_pad(c, wbuf->buf + wbuf->used, dirt);
367 err = ubi_leb_write(c->ubi, wbuf->lnum, wbuf->buf, wbuf->offs, 409 err = ubi_leb_write(c->ubi, wbuf->lnum, wbuf->buf, wbuf->offs,
368 c->min_io_size, wbuf->dtype); 410 sync_len, wbuf->dtype);
369 if (err) { 411 if (err) {
370 ubifs_err("cannot write %d bytes to LEB %d:%d", 412 ubifs_err("cannot write %d bytes to LEB %d:%d",
371 c->min_io_size, wbuf->lnum, wbuf->offs); 413 sync_len, wbuf->lnum, wbuf->offs);
372 dbg_dump_stack(); 414 dbg_dump_stack();
373 return err; 415 return err;
374 } 416 }
375 417
376 dirt = wbuf->avail;
377
378 spin_lock(&wbuf->lock); 418 spin_lock(&wbuf->lock);
379 wbuf->offs += c->min_io_size; 419 wbuf->offs += sync_len;
380 wbuf->avail = c->min_io_size; 420 /*
421 * Now @wbuf->offs is not necessarily aligned to @c->max_write_size.
422 * But our goal is to optimize writes and make sure we write in
423 * @c->max_write_size chunks and to @c->max_write_size-aligned offset.
424 * Thus, if @wbuf->offs is not aligned to @c->max_write_size now, make
425 * sure that @wbuf->offs + @wbuf->size is aligned to
426 * @c->max_write_size. This way we make sure that after next
427 * write-buffer flush we are again at the optimal offset (aligned to
428 * @c->max_write_size).
429 */
430 if (c->leb_size - wbuf->offs < c->max_write_size)
431 wbuf->size = c->leb_size - wbuf->offs;
432 else if (wbuf->offs & (c->max_write_size - 1))
433 wbuf->size = ALIGN(wbuf->offs, c->max_write_size) - wbuf->offs;
434 else
435 wbuf->size = c->max_write_size;
436 wbuf->avail = wbuf->size;
381 wbuf->used = 0; 437 wbuf->used = 0;
382 wbuf->next_ino = 0; 438 wbuf->next_ino = 0;
383 spin_unlock(&wbuf->lock); 439 spin_unlock(&wbuf->lock);
@@ -420,7 +476,13 @@ int ubifs_wbuf_seek_nolock(struct ubifs_wbuf *wbuf, int lnum, int offs,
420 spin_lock(&wbuf->lock); 476 spin_lock(&wbuf->lock);
421 wbuf->lnum = lnum; 477 wbuf->lnum = lnum;
422 wbuf->offs = offs; 478 wbuf->offs = offs;
423 wbuf->avail = c->min_io_size; 479 if (c->leb_size - wbuf->offs < c->max_write_size)
480 wbuf->size = c->leb_size - wbuf->offs;
481 else if (wbuf->offs & (c->max_write_size - 1))
482 wbuf->size = ALIGN(wbuf->offs, c->max_write_size) - wbuf->offs;
483 else
484 wbuf->size = c->max_write_size;
485 wbuf->avail = wbuf->size;
424 wbuf->used = 0; 486 wbuf->used = 0;
425 spin_unlock(&wbuf->lock); 487 spin_unlock(&wbuf->lock);
426 wbuf->dtype = dtype; 488 wbuf->dtype = dtype;
@@ -500,8 +562,9 @@ out_timers:
500 * 562 *
501 * This function writes data to flash via write-buffer @wbuf. This means that 563 * This function writes data to flash via write-buffer @wbuf. This means that
502 * the last piece of the node won't reach the flash media immediately if it 564 * the last piece of the node won't reach the flash media immediately if it
503 * does not take whole minimal I/O unit. Instead, the node will sit in RAM 565 * does not take whole max. write unit (@c->max_write_size). Instead, the node
504 * until the write-buffer is synchronized (e.g., by timer). 566 * will sit in RAM until the write-buffer is synchronized (e.g., by timer, or
567 * because more data are appended to the write-buffer).
505 * 568 *
506 * This function returns zero in case of success and a negative error code in 569 * This function returns zero in case of success and a negative error code in
507 * case of failure. If the node cannot be written because there is no more 570 * case of failure. If the node cannot be written because there is no more
@@ -518,9 +581,14 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
518 ubifs_assert(len > 0 && wbuf->lnum >= 0 && wbuf->lnum < c->leb_cnt); 581 ubifs_assert(len > 0 && wbuf->lnum >= 0 && wbuf->lnum < c->leb_cnt);
519 ubifs_assert(wbuf->offs >= 0 && wbuf->offs % c->min_io_size == 0); 582 ubifs_assert(wbuf->offs >= 0 && wbuf->offs % c->min_io_size == 0);
520 ubifs_assert(!(wbuf->offs & 7) && wbuf->offs <= c->leb_size); 583 ubifs_assert(!(wbuf->offs & 7) && wbuf->offs <= c->leb_size);
521 ubifs_assert(wbuf->avail > 0 && wbuf->avail <= c->min_io_size); 584 ubifs_assert(wbuf->avail > 0 && wbuf->avail <= wbuf->size);
585 ubifs_assert(wbuf->size >= c->min_io_size);
586 ubifs_assert(wbuf->size <= c->max_write_size);
587 ubifs_assert(wbuf->size % c->min_io_size == 0);
522 ubifs_assert(mutex_is_locked(&wbuf->io_mutex)); 588 ubifs_assert(mutex_is_locked(&wbuf->io_mutex));
523 ubifs_assert(!c->ro_media && !c->ro_mount); 589 ubifs_assert(!c->ro_media && !c->ro_mount);
590 if (c->leb_size - wbuf->offs >= c->max_write_size)
591 ubifs_assert(!((wbuf->offs + wbuf->size) % c->max_write_size ));
524 592
525 if (c->leb_size - wbuf->offs - wbuf->used < aligned_len) { 593 if (c->leb_size - wbuf->offs - wbuf->used < aligned_len) {
526 err = -ENOSPC; 594 err = -ENOSPC;
@@ -543,14 +611,18 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
543 dbg_io("flush jhead %s wbuf to LEB %d:%d", 611 dbg_io("flush jhead %s wbuf to LEB %d:%d",
544 dbg_jhead(wbuf->jhead), wbuf->lnum, wbuf->offs); 612 dbg_jhead(wbuf->jhead), wbuf->lnum, wbuf->offs);
545 err = ubi_leb_write(c->ubi, wbuf->lnum, wbuf->buf, 613 err = ubi_leb_write(c->ubi, wbuf->lnum, wbuf->buf,
546 wbuf->offs, c->min_io_size, 614 wbuf->offs, wbuf->size,
547 wbuf->dtype); 615 wbuf->dtype);
548 if (err) 616 if (err)
549 goto out; 617 goto out;
550 618
551 spin_lock(&wbuf->lock); 619 spin_lock(&wbuf->lock);
552 wbuf->offs += c->min_io_size; 620 wbuf->offs += wbuf->size;
553 wbuf->avail = c->min_io_size; 621 if (c->leb_size - wbuf->offs >= c->max_write_size)
622 wbuf->size = c->max_write_size;
623 else
624 wbuf->size = c->leb_size - wbuf->offs;
625 wbuf->avail = wbuf->size;
554 wbuf->used = 0; 626 wbuf->used = 0;
555 wbuf->next_ino = 0; 627 wbuf->next_ino = 0;
556 spin_unlock(&wbuf->lock); 628 spin_unlock(&wbuf->lock);
@@ -564,33 +636,57 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
564 goto exit; 636 goto exit;
565 } 637 }
566 638
567 /* 639 offs = wbuf->offs;
568 * The node is large enough and does not fit entirely within current 640 written = 0;
569 * minimal I/O unit. We have to fill and flush write-buffer and switch
570 * to the next min. I/O unit.
571 */
572 dbg_io("flush jhead %s wbuf to LEB %d:%d",
573 dbg_jhead(wbuf->jhead), wbuf->lnum, wbuf->offs);
574 memcpy(wbuf->buf + wbuf->used, buf, wbuf->avail);
575 err = ubi_leb_write(c->ubi, wbuf->lnum, wbuf->buf, wbuf->offs,
576 c->min_io_size, wbuf->dtype);
577 if (err)
578 goto out;
579 641
580 offs = wbuf->offs + c->min_io_size; 642 if (wbuf->used) {
581 len -= wbuf->avail; 643 /*
582 aligned_len -= wbuf->avail; 644 * The node is large enough and does not fit entirely within
583 written = wbuf->avail; 645 * current available space. We have to fill and flush
646 * write-buffer and switch to the next max. write unit.
647 */
648 dbg_io("flush jhead %s wbuf to LEB %d:%d",
649 dbg_jhead(wbuf->jhead), wbuf->lnum, wbuf->offs);
650 memcpy(wbuf->buf + wbuf->used, buf, wbuf->avail);
651 err = ubi_leb_write(c->ubi, wbuf->lnum, wbuf->buf, wbuf->offs,
652 wbuf->size, wbuf->dtype);
653 if (err)
654 goto out;
655
656 offs += wbuf->size;
657 len -= wbuf->avail;
658 aligned_len -= wbuf->avail;
659 written += wbuf->avail;
660 } else if (wbuf->offs & (c->max_write_size - 1)) {
661 /*
662 * The write-buffer offset is not aligned to
663 * @c->max_write_size and @wbuf->size is less than
664 * @c->max_write_size. Write @wbuf->size bytes to make sure the
665 * following writes are done in optimal @c->max_write_size
666 * chunks.
667 */
668 dbg_io("write %d bytes to LEB %d:%d",
669 wbuf->size, wbuf->lnum, wbuf->offs);
670 err = ubi_leb_write(c->ubi, wbuf->lnum, buf, wbuf->offs,
671 wbuf->size, wbuf->dtype);
672 if (err)
673 goto out;
674
675 offs += wbuf->size;
676 len -= wbuf->size;
677 aligned_len -= wbuf->size;
678 written += wbuf->size;
679 }
584 680
585 /* 681 /*
586 * The remaining data may take more whole min. I/O units, so write the 682 * The remaining data may take more whole max. write units, so write the
587 * remains multiple to min. I/O unit size directly to the flash media. 683 * remains multiple to max. write unit size directly to the flash media.
588 * We align node length to 8-byte boundary because we anyway flash wbuf 684 * We align node length to 8-byte boundary because we anyway flash wbuf
589 * if the remaining space is less than 8 bytes. 685 * if the remaining space is less than 8 bytes.
590 */ 686 */
591 n = aligned_len >> c->min_io_shift; 687 n = aligned_len >> c->max_write_shift;
592 if (n) { 688 if (n) {
593 n <<= c->min_io_shift; 689 n <<= c->max_write_shift;
594 dbg_io("write %d bytes to LEB %d:%d", n, wbuf->lnum, offs); 690 dbg_io("write %d bytes to LEB %d:%d", n, wbuf->lnum, offs);
595 err = ubi_leb_write(c->ubi, wbuf->lnum, buf + written, offs, n, 691 err = ubi_leb_write(c->ubi, wbuf->lnum, buf + written, offs, n,
596 wbuf->dtype); 692 wbuf->dtype);
@@ -606,14 +702,18 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
606 if (aligned_len) 702 if (aligned_len)
607 /* 703 /*
608 * And now we have what's left and what does not take whole 704 * And now we have what's left and what does not take whole
609 * min. I/O unit, so write it to the write-buffer and we are 705 * max. write unit, so write it to the write-buffer and we are
610 * done. 706 * done.
611 */ 707 */
612 memcpy(wbuf->buf, buf + written, len); 708 memcpy(wbuf->buf, buf + written, len);
613 709
614 wbuf->offs = offs; 710 wbuf->offs = offs;
711 if (c->leb_size - wbuf->offs >= c->max_write_size)
712 wbuf->size = c->max_write_size;
713 else
714 wbuf->size = c->leb_size - wbuf->offs;
715 wbuf->avail = wbuf->size - aligned_len;
615 wbuf->used = aligned_len; 716 wbuf->used = aligned_len;
616 wbuf->avail = c->min_io_size - aligned_len;
617 wbuf->next_ino = 0; 717 wbuf->next_ino = 0;
618 spin_unlock(&wbuf->lock); 718 spin_unlock(&wbuf->lock);
619 719
@@ -837,11 +937,11 @@ int ubifs_wbuf_init(struct ubifs_info *c, struct ubifs_wbuf *wbuf)
837{ 937{
838 size_t size; 938 size_t size;
839 939
840 wbuf->buf = kmalloc(c->min_io_size, GFP_KERNEL); 940 wbuf->buf = kmalloc(c->max_write_size, GFP_KERNEL);
841 if (!wbuf->buf) 941 if (!wbuf->buf)
842 return -ENOMEM; 942 return -ENOMEM;
843 943
844 size = (c->min_io_size / UBIFS_CH_SZ + 1) * sizeof(ino_t); 944 size = (c->max_write_size / UBIFS_CH_SZ + 1) * sizeof(ino_t);
845 wbuf->inodes = kmalloc(size, GFP_KERNEL); 945 wbuf->inodes = kmalloc(size, GFP_KERNEL);
846 if (!wbuf->inodes) { 946 if (!wbuf->inodes) {
847 kfree(wbuf->buf); 947 kfree(wbuf->buf);
@@ -851,7 +951,14 @@ int ubifs_wbuf_init(struct ubifs_info *c, struct ubifs_wbuf *wbuf)
851 951
852 wbuf->used = 0; 952 wbuf->used = 0;
853 wbuf->lnum = wbuf->offs = -1; 953 wbuf->lnum = wbuf->offs = -1;
854 wbuf->avail = c->min_io_size; 954 /*
955 * If the LEB starts at the max. write size aligned address, then
956 * write-buffer size has to be set to @c->max_write_size. Otherwise,
957 * set it to something smaller so that it ends at the closest max.
958 * write size boundary.
959 */
960 size = c->max_write_size - (c->leb_start % c->max_write_size);
961 wbuf->avail = wbuf->size = size;
855 wbuf->dtype = UBI_UNKNOWN; 962 wbuf->dtype = UBI_UNKNOWN;
856 wbuf->sync_callback = NULL; 963 wbuf->sync_callback = NULL;
857 mutex_init(&wbuf->io_mutex); 964 mutex_init(&wbuf->io_mutex);
diff --git a/fs/ubifs/ioctl.c b/fs/ubifs/ioctl.c
index 8aacd64957a2..548acf494afd 100644
--- a/fs/ubifs/ioctl.c
+++ b/fs/ubifs/ioctl.c
@@ -160,7 +160,7 @@ long ubifs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
160 if (IS_RDONLY(inode)) 160 if (IS_RDONLY(inode))
161 return -EROFS; 161 return -EROFS;
162 162
163 if (!is_owner_or_cap(inode)) 163 if (!inode_owner_or_capable(inode))
164 return -EACCES; 164 return -EACCES;
165 165
166 if (get_user(flags, (int __user *) arg)) 166 if (get_user(flags, (int __user *) arg))
diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c
index 914f1bd89e57..aed25e864227 100644
--- a/fs/ubifs/journal.c
+++ b/fs/ubifs/journal.c
@@ -690,7 +690,7 @@ int ubifs_jnl_write_data(struct ubifs_info *c, const struct inode *inode,
690{ 690{
691 struct ubifs_data_node *data; 691 struct ubifs_data_node *data;
692 int err, lnum, offs, compr_type, out_len; 692 int err, lnum, offs, compr_type, out_len;
693 int dlen = UBIFS_DATA_NODE_SZ + UBIFS_BLOCK_SIZE * WORST_COMPR_FACTOR; 693 int dlen = COMPRESSED_DATA_NODE_BUF_SZ, allocated = 1;
694 struct ubifs_inode *ui = ubifs_inode(inode); 694 struct ubifs_inode *ui = ubifs_inode(inode);
695 695
696 dbg_jnl("ino %lu, blk %u, len %d, key %s", 696 dbg_jnl("ino %lu, blk %u, len %d, key %s",
@@ -698,9 +698,19 @@ int ubifs_jnl_write_data(struct ubifs_info *c, const struct inode *inode,
698 DBGKEY(key)); 698 DBGKEY(key));
699 ubifs_assert(len <= UBIFS_BLOCK_SIZE); 699 ubifs_assert(len <= UBIFS_BLOCK_SIZE);
700 700
701 data = kmalloc(dlen, GFP_NOFS); 701 data = kmalloc(dlen, GFP_NOFS | __GFP_NOWARN);
702 if (!data) 702 if (!data) {
703 return -ENOMEM; 703 /*
704 * Fall-back to the write reserve buffer. Note, we might be
705 * currently on the memory reclaim path, when the kernel is
706 * trying to free some memory by writing out dirty pages. The
707 * write reserve buffer helps us to guarantee that we are
708 * always able to write the data.
709 */
710 allocated = 0;
711 mutex_lock(&c->write_reserve_mutex);
712 data = c->write_reserve_buf;
713 }
704 714
705 data->ch.node_type = UBIFS_DATA_NODE; 715 data->ch.node_type = UBIFS_DATA_NODE;
706 key_write(c, key, &data->key); 716 key_write(c, key, &data->key);
@@ -736,7 +746,10 @@ int ubifs_jnl_write_data(struct ubifs_info *c, const struct inode *inode,
736 goto out_ro; 746 goto out_ro;
737 747
738 finish_reservation(c); 748 finish_reservation(c);
739 kfree(data); 749 if (!allocated)
750 mutex_unlock(&c->write_reserve_mutex);
751 else
752 kfree(data);
740 return 0; 753 return 0;
741 754
742out_release: 755out_release:
@@ -745,7 +758,10 @@ out_ro:
745 ubifs_ro_mode(c, err); 758 ubifs_ro_mode(c, err);
746 finish_reservation(c); 759 finish_reservation(c);
747out_free: 760out_free:
748 kfree(data); 761 if (!allocated)
762 mutex_unlock(&c->write_reserve_mutex);
763 else
764 kfree(data);
749 return err; 765 return err;
750} 766}
751 767
diff --git a/fs/ubifs/lprops.c b/fs/ubifs/lprops.c
index 4d4ca388889b..0ee0847f2421 100644
--- a/fs/ubifs/lprops.c
+++ b/fs/ubifs/lprops.c
@@ -1035,7 +1035,8 @@ static int scan_check_cb(struct ubifs_info *c,
1035 struct ubifs_scan_leb *sleb; 1035 struct ubifs_scan_leb *sleb;
1036 struct ubifs_scan_node *snod; 1036 struct ubifs_scan_node *snod;
1037 struct ubifs_lp_stats *lst = &data->lst; 1037 struct ubifs_lp_stats *lst = &data->lst;
1038 int cat, lnum = lp->lnum, is_idx = 0, used = 0, free, dirty; 1038 int cat, lnum = lp->lnum, is_idx = 0, used = 0, free, dirty, ret;
1039 void *buf = NULL;
1039 1040
1040 cat = lp->flags & LPROPS_CAT_MASK; 1041 cat = lp->flags & LPROPS_CAT_MASK;
1041 if (cat != LPROPS_UNCAT) { 1042 if (cat != LPROPS_UNCAT) {
@@ -1093,7 +1094,13 @@ static int scan_check_cb(struct ubifs_info *c,
1093 } 1094 }
1094 } 1095 }
1095 1096
1096 sleb = ubifs_scan(c, lnum, 0, c->dbg->buf, 0); 1097 buf = __vmalloc(c->leb_size, GFP_NOFS, PAGE_KERNEL);
1098 if (!buf) {
1099 ubifs_err("cannot allocate memory to scan LEB %d", lnum);
1100 goto out;
1101 }
1102
1103 sleb = ubifs_scan(c, lnum, 0, buf, 0);
1097 if (IS_ERR(sleb)) { 1104 if (IS_ERR(sleb)) {
1098 /* 1105 /*
1099 * After an unclean unmount, empty and freeable LEBs 1106 * After an unclean unmount, empty and freeable LEBs
@@ -1105,7 +1112,8 @@ static int scan_check_cb(struct ubifs_info *c,
1105 lst->empty_lebs += 1; 1112 lst->empty_lebs += 1;
1106 lst->total_free += c->leb_size; 1113 lst->total_free += c->leb_size;
1107 lst->total_dark += ubifs_calc_dark(c, c->leb_size); 1114 lst->total_dark += ubifs_calc_dark(c, c->leb_size);
1108 return LPT_SCAN_CONTINUE; 1115 ret = LPT_SCAN_CONTINUE;
1116 goto exit;
1109 } 1117 }
1110 1118
1111 if (lp->free + lp->dirty == c->leb_size && 1119 if (lp->free + lp->dirty == c->leb_size &&
@@ -1115,10 +1123,12 @@ static int scan_check_cb(struct ubifs_info *c,
1115 lst->total_free += lp->free; 1123 lst->total_free += lp->free;
1116 lst->total_dirty += lp->dirty; 1124 lst->total_dirty += lp->dirty;
1117 lst->total_dark += ubifs_calc_dark(c, c->leb_size); 1125 lst->total_dark += ubifs_calc_dark(c, c->leb_size);
1118 return LPT_SCAN_CONTINUE; 1126 ret = LPT_SCAN_CONTINUE;
1127 goto exit;
1119 } 1128 }
1120 data->err = PTR_ERR(sleb); 1129 data->err = PTR_ERR(sleb);
1121 return LPT_SCAN_STOP; 1130 ret = LPT_SCAN_STOP;
1131 goto exit;
1122 } 1132 }
1123 1133
1124 is_idx = -1; 1134 is_idx = -1;
@@ -1236,7 +1246,10 @@ static int scan_check_cb(struct ubifs_info *c,
1236 } 1246 }
1237 1247
1238 ubifs_scan_destroy(sleb); 1248 ubifs_scan_destroy(sleb);
1239 return LPT_SCAN_CONTINUE; 1249 ret = LPT_SCAN_CONTINUE;
1250exit:
1251 vfree(buf);
1252 return ret;
1240 1253
1241out_print: 1254out_print:
1242 ubifs_err("bad accounting of LEB %d: free %d, dirty %d flags %#x, " 1255 ubifs_err("bad accounting of LEB %d: free %d, dirty %d flags %#x, "
@@ -1246,6 +1259,7 @@ out_print:
1246out_destroy: 1259out_destroy:
1247 ubifs_scan_destroy(sleb); 1260 ubifs_scan_destroy(sleb);
1248out: 1261out:
1262 vfree(buf);
1249 data->err = -EINVAL; 1263 data->err = -EINVAL;
1250 return LPT_SCAN_STOP; 1264 return LPT_SCAN_STOP;
1251} 1265}
diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c
index 5c90dec5db0b..0c9c69bd983a 100644
--- a/fs/ubifs/lpt_commit.c
+++ b/fs/ubifs/lpt_commit.c
@@ -1628,29 +1628,35 @@ static int dbg_check_ltab_lnum(struct ubifs_info *c, int lnum)
1628{ 1628{
1629 int err, len = c->leb_size, dirty = 0, node_type, node_num, node_len; 1629 int err, len = c->leb_size, dirty = 0, node_type, node_num, node_len;
1630 int ret; 1630 int ret;
1631 void *buf = c->dbg->buf; 1631 void *buf, *p;
1632 1632
1633 if (!(ubifs_chk_flags & UBIFS_CHK_LPROPS)) 1633 if (!(ubifs_chk_flags & UBIFS_CHK_LPROPS))
1634 return 0; 1634 return 0;
1635 1635
1636 buf = p = __vmalloc(c->leb_size, GFP_NOFS, PAGE_KERNEL);
1637 if (!buf) {
1638 ubifs_err("cannot allocate memory for ltab checking");
1639 return 0;
1640 }
1641
1636 dbg_lp("LEB %d", lnum); 1642 dbg_lp("LEB %d", lnum);
1637 err = ubi_read(c->ubi, lnum, buf, 0, c->leb_size); 1643 err = ubi_read(c->ubi, lnum, buf, 0, c->leb_size);
1638 if (err) { 1644 if (err) {
1639 dbg_msg("ubi_read failed, LEB %d, error %d", lnum, err); 1645 dbg_msg("ubi_read failed, LEB %d, error %d", lnum, err);
1640 return err; 1646 goto out;
1641 } 1647 }
1642 while (1) { 1648 while (1) {
1643 if (!is_a_node(c, buf, len)) { 1649 if (!is_a_node(c, p, len)) {
1644 int i, pad_len; 1650 int i, pad_len;
1645 1651
1646 pad_len = get_pad_len(c, buf, len); 1652 pad_len = get_pad_len(c, p, len);
1647 if (pad_len) { 1653 if (pad_len) {
1648 buf += pad_len; 1654 p += pad_len;
1649 len -= pad_len; 1655 len -= pad_len;
1650 dirty += pad_len; 1656 dirty += pad_len;
1651 continue; 1657 continue;
1652 } 1658 }
1653 if (!dbg_is_all_ff(buf, len)) { 1659 if (!dbg_is_all_ff(p, len)) {
1654 dbg_msg("invalid empty space in LEB %d at %d", 1660 dbg_msg("invalid empty space in LEB %d at %d",
1655 lnum, c->leb_size - len); 1661 lnum, c->leb_size - len);
1656 err = -EINVAL; 1662 err = -EINVAL;
@@ -1668,16 +1674,21 @@ static int dbg_check_ltab_lnum(struct ubifs_info *c, int lnum)
1668 lnum, dirty, c->ltab[i].dirty); 1674 lnum, dirty, c->ltab[i].dirty);
1669 err = -EINVAL; 1675 err = -EINVAL;
1670 } 1676 }
1671 return err; 1677 goto out;
1672 } 1678 }
1673 node_type = get_lpt_node_type(c, buf, &node_num); 1679 node_type = get_lpt_node_type(c, p, &node_num);
1674 node_len = get_lpt_node_len(c, node_type); 1680 node_len = get_lpt_node_len(c, node_type);
1675 ret = dbg_is_node_dirty(c, node_type, lnum, c->leb_size - len); 1681 ret = dbg_is_node_dirty(c, node_type, lnum, c->leb_size - len);
1676 if (ret == 1) 1682 if (ret == 1)
1677 dirty += node_len; 1683 dirty += node_len;
1678 buf += node_len; 1684 p += node_len;
1679 len -= node_len; 1685 len -= node_len;
1680 } 1686 }
1687
1688 err = 0;
1689out:
1690 vfree(buf);
1691 return err;
1681} 1692}
1682 1693
1683/** 1694/**
@@ -1870,25 +1881,31 @@ int dbg_chk_lpt_sz(struct ubifs_info *c, int action, int len)
1870static void dump_lpt_leb(const struct ubifs_info *c, int lnum) 1881static void dump_lpt_leb(const struct ubifs_info *c, int lnum)
1871{ 1882{
1872 int err, len = c->leb_size, node_type, node_num, node_len, offs; 1883 int err, len = c->leb_size, node_type, node_num, node_len, offs;
1873 void *buf = c->dbg->buf; 1884 void *buf, *p;
1874 1885
1875 printk(KERN_DEBUG "(pid %d) start dumping LEB %d\n", 1886 printk(KERN_DEBUG "(pid %d) start dumping LEB %d\n",
1876 current->pid, lnum); 1887 current->pid, lnum);
1888 buf = p = __vmalloc(c->leb_size, GFP_NOFS, PAGE_KERNEL);
1889 if (!buf) {
1890 ubifs_err("cannot allocate memory to dump LPT");
1891 return;
1892 }
1893
1877 err = ubi_read(c->ubi, lnum, buf, 0, c->leb_size); 1894 err = ubi_read(c->ubi, lnum, buf, 0, c->leb_size);
1878 if (err) { 1895 if (err) {
1879 ubifs_err("cannot read LEB %d, error %d", lnum, err); 1896 ubifs_err("cannot read LEB %d, error %d", lnum, err);
1880 return; 1897 goto out;
1881 } 1898 }
1882 while (1) { 1899 while (1) {
1883 offs = c->leb_size - len; 1900 offs = c->leb_size - len;
1884 if (!is_a_node(c, buf, len)) { 1901 if (!is_a_node(c, p, len)) {
1885 int pad_len; 1902 int pad_len;
1886 1903
1887 pad_len = get_pad_len(c, buf, len); 1904 pad_len = get_pad_len(c, p, len);
1888 if (pad_len) { 1905 if (pad_len) {
1889 printk(KERN_DEBUG "LEB %d:%d, pad %d bytes\n", 1906 printk(KERN_DEBUG "LEB %d:%d, pad %d bytes\n",
1890 lnum, offs, pad_len); 1907 lnum, offs, pad_len);
1891 buf += pad_len; 1908 p += pad_len;
1892 len -= pad_len; 1909 len -= pad_len;
1893 continue; 1910 continue;
1894 } 1911 }
@@ -1898,7 +1915,7 @@ static void dump_lpt_leb(const struct ubifs_info *c, int lnum)
1898 break; 1915 break;
1899 } 1916 }
1900 1917
1901 node_type = get_lpt_node_type(c, buf, &node_num); 1918 node_type = get_lpt_node_type(c, p, &node_num);
1902 switch (node_type) { 1919 switch (node_type) {
1903 case UBIFS_LPT_PNODE: 1920 case UBIFS_LPT_PNODE:
1904 { 1921 {
@@ -1923,7 +1940,7 @@ static void dump_lpt_leb(const struct ubifs_info *c, int lnum)
1923 else 1940 else
1924 printk(KERN_DEBUG "LEB %d:%d, nnode, ", 1941 printk(KERN_DEBUG "LEB %d:%d, nnode, ",
1925 lnum, offs); 1942 lnum, offs);
1926 err = ubifs_unpack_nnode(c, buf, &nnode); 1943 err = ubifs_unpack_nnode(c, p, &nnode);
1927 for (i = 0; i < UBIFS_LPT_FANOUT; i++) { 1944 for (i = 0; i < UBIFS_LPT_FANOUT; i++) {
1928 printk(KERN_CONT "%d:%d", nnode.nbranch[i].lnum, 1945 printk(KERN_CONT "%d:%d", nnode.nbranch[i].lnum,
1929 nnode.nbranch[i].offs); 1946 nnode.nbranch[i].offs);
@@ -1944,15 +1961,18 @@ static void dump_lpt_leb(const struct ubifs_info *c, int lnum)
1944 break; 1961 break;
1945 default: 1962 default:
1946 ubifs_err("LPT node type %d not recognized", node_type); 1963 ubifs_err("LPT node type %d not recognized", node_type);
1947 return; 1964 goto out;
1948 } 1965 }
1949 1966
1950 buf += node_len; 1967 p += node_len;
1951 len -= node_len; 1968 len -= node_len;
1952 } 1969 }
1953 1970
1954 printk(KERN_DEBUG "(pid %d) finish dumping LEB %d\n", 1971 printk(KERN_DEBUG "(pid %d) finish dumping LEB %d\n",
1955 current->pid, lnum); 1972 current->pid, lnum);
1973out:
1974 vfree(buf);
1975 return;
1956} 1976}
1957 1977
1958/** 1978/**
diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c
index 82009c74b6a3..09df318e368f 100644
--- a/fs/ubifs/orphan.c
+++ b/fs/ubifs/orphan.c
@@ -892,15 +892,22 @@ static int dbg_read_orphans(struct check_info *ci, struct ubifs_scan_leb *sleb)
892static int dbg_scan_orphans(struct ubifs_info *c, struct check_info *ci) 892static int dbg_scan_orphans(struct ubifs_info *c, struct check_info *ci)
893{ 893{
894 int lnum, err = 0; 894 int lnum, err = 0;
895 void *buf;
895 896
896 /* Check no-orphans flag and skip this if no orphans */ 897 /* Check no-orphans flag and skip this if no orphans */
897 if (c->no_orphs) 898 if (c->no_orphs)
898 return 0; 899 return 0;
899 900
901 buf = __vmalloc(c->leb_size, GFP_NOFS, PAGE_KERNEL);
902 if (!buf) {
903 ubifs_err("cannot allocate memory to check orphans");
904 return 0;
905 }
906
900 for (lnum = c->orph_first; lnum <= c->orph_last; lnum++) { 907 for (lnum = c->orph_first; lnum <= c->orph_last; lnum++) {
901 struct ubifs_scan_leb *sleb; 908 struct ubifs_scan_leb *sleb;
902 909
903 sleb = ubifs_scan(c, lnum, 0, c->dbg->buf, 0); 910 sleb = ubifs_scan(c, lnum, 0, buf, 0);
904 if (IS_ERR(sleb)) { 911 if (IS_ERR(sleb)) {
905 err = PTR_ERR(sleb); 912 err = PTR_ERR(sleb);
906 break; 913 break;
@@ -912,6 +919,7 @@ static int dbg_scan_orphans(struct ubifs_info *c, struct check_info *ci)
912 break; 919 break;
913 } 920 }
914 921
922 vfree(buf);
915 return err; 923 return err;
916} 924}
917 925
diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c
index 77e9b874b6c2..936f2cbfe6b6 100644
--- a/fs/ubifs/recovery.c
+++ b/fs/ubifs/recovery.c
@@ -28,6 +28,23 @@
28 * UBIFS always cleans away all remnants of an unclean un-mount, so that 28 * UBIFS always cleans away all remnants of an unclean un-mount, so that
29 * errors do not accumulate. However UBIFS defers recovery if it is mounted 29 * errors do not accumulate. However UBIFS defers recovery if it is mounted
30 * read-only, and the flash is not modified in that case. 30 * read-only, and the flash is not modified in that case.
31 *
32 * The general UBIFS approach to the recovery is that it recovers from
33 * corruptions which could be caused by power cuts, but it refuses to recover
34 * from corruption caused by other reasons. And UBIFS tries to distinguish
35 * between these 2 reasons of corruptions and silently recover in the former
36 * case and loudly complain in the latter case.
37 *
38 * UBIFS writes only to erased LEBs, so it writes only to the flash space
39 * containing only 0xFFs. UBIFS also always writes strictly from the beginning
40 * of the LEB to the end. And UBIFS assumes that the underlying flash media
41 * writes in @c->max_write_size bytes at a time.
42 *
43 * Hence, if UBIFS finds a corrupted node at offset X, it expects only the min.
44 * I/O unit corresponding to offset X to contain corrupted data, all the
45 * following min. I/O units have to contain empty space (all 0xFFs). If this is
46 * not true, the corruption cannot be the result of a power cut, and UBIFS
47 * refuses to mount.
31 */ 48 */
32 49
33#include <linux/crc32.h> 50#include <linux/crc32.h>
@@ -362,8 +379,9 @@ int ubifs_write_rcvrd_mst_node(struct ubifs_info *c)
362 * @offs: offset to check 379 * @offs: offset to check
363 * 380 *
364 * This function returns %1 if @offs was in the last write to the LEB whose data 381 * This function returns %1 if @offs was in the last write to the LEB whose data
365 * is in @buf, otherwise %0 is returned. The determination is made by checking 382 * is in @buf, otherwise %0 is returned. The determination is made by checking
366 * for subsequent empty space starting from the next @c->min_io_size boundary. 383 * for subsequent empty space starting from the next @c->max_write_size
384 * boundary.
367 */ 385 */
368static int is_last_write(const struct ubifs_info *c, void *buf, int offs) 386static int is_last_write(const struct ubifs_info *c, void *buf, int offs)
369{ 387{
@@ -371,10 +389,10 @@ static int is_last_write(const struct ubifs_info *c, void *buf, int offs)
371 uint8_t *p; 389 uint8_t *p;
372 390
373 /* 391 /*
374 * Round up to the next @c->min_io_size boundary i.e. @offs is in the 392 * Round up to the next @c->max_write_size boundary i.e. @offs is in
375 * last wbuf written. After that should be empty space. 393 * the last wbuf written. After that should be empty space.
376 */ 394 */
377 empty_offs = ALIGN(offs + 1, c->min_io_size); 395 empty_offs = ALIGN(offs + 1, c->max_write_size);
378 check_len = c->leb_size - empty_offs; 396 check_len = c->leb_size - empty_offs;
379 p = buf + empty_offs - offs; 397 p = buf + empty_offs - offs;
380 return is_empty(p, check_len); 398 return is_empty(p, check_len);
@@ -429,7 +447,7 @@ static int no_more_nodes(const struct ubifs_info *c, void *buf, int len,
429 int skip, dlen = le32_to_cpu(ch->len); 447 int skip, dlen = le32_to_cpu(ch->len);
430 448
431 /* Check for empty space after the corrupt node's common header */ 449 /* Check for empty space after the corrupt node's common header */
432 skip = ALIGN(offs + UBIFS_CH_SZ, c->min_io_size) - offs; 450 skip = ALIGN(offs + UBIFS_CH_SZ, c->max_write_size) - offs;
433 if (is_empty(buf + skip, len - skip)) 451 if (is_empty(buf + skip, len - skip))
434 return 1; 452 return 1;
435 /* 453 /*
@@ -441,7 +459,7 @@ static int no_more_nodes(const struct ubifs_info *c, void *buf, int len,
441 return 0; 459 return 0;
442 } 460 }
443 /* Now we know the corrupt node's length we can skip over it */ 461 /* Now we know the corrupt node's length we can skip over it */
444 skip = ALIGN(offs + dlen, c->min_io_size) - offs; 462 skip = ALIGN(offs + dlen, c->max_write_size) - offs;
445 /* After which there should be empty space */ 463 /* After which there should be empty space */
446 if (is_empty(buf + skip, len - skip)) 464 if (is_empty(buf + skip, len - skip))
447 return 1; 465 return 1;
@@ -671,10 +689,14 @@ struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum,
671 } else { 689 } else {
672 int corruption = first_non_ff(buf, len); 690 int corruption = first_non_ff(buf, len);
673 691
692 /*
693 * See header comment for this file for more
694 * explanations about the reasons we have this check.
695 */
674 ubifs_err("corrupt empty space LEB %d:%d, corruption " 696 ubifs_err("corrupt empty space LEB %d:%d, corruption "
675 "starts at %d", lnum, offs, corruption); 697 "starts at %d", lnum, offs, corruption);
676 /* Make sure we dump interesting non-0xFF data */ 698 /* Make sure we dump interesting non-0xFF data */
677 offs = corruption; 699 offs += corruption;
678 buf += corruption; 700 buf += corruption;
679 goto corrupted; 701 goto corrupted;
680 } 702 }
@@ -836,12 +858,8 @@ struct ubifs_scan_leb *ubifs_recover_log_leb(struct ubifs_info *c, int lnum,
836static int recover_head(const struct ubifs_info *c, int lnum, int offs, 858static int recover_head(const struct ubifs_info *c, int lnum, int offs,
837 void *sbuf) 859 void *sbuf)
838{ 860{
839 int len, err; 861 int len = c->max_write_size, err;
840 862
841 if (c->min_io_size > 1)
842 len = c->min_io_size;
843 else
844 len = 512;
845 if (offs + len > c->leb_size) 863 if (offs + len > c->leb_size)
846 len = c->leb_size - offs; 864 len = c->leb_size - offs;
847 865
diff --git a/fs/ubifs/scan.c b/fs/ubifs/scan.c
index 3e1ee57dbeaa..36216b46f772 100644
--- a/fs/ubifs/scan.c
+++ b/fs/ubifs/scan.c
@@ -328,7 +328,7 @@ struct ubifs_scan_leb *ubifs_scan(const struct ubifs_info *c, int lnum,
328 if (!quiet) 328 if (!quiet)
329 ubifs_err("empty space starts at non-aligned offset %d", 329 ubifs_err("empty space starts at non-aligned offset %d",
330 offs); 330 offs);
331 goto corrupted;; 331 goto corrupted;
332 } 332 }
333 333
334 ubifs_end_scan(c, sleb, lnum, offs); 334 ubifs_end_scan(c, sleb, lnum, offs);
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 6e11c2975dcf..6ddd9973e681 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -512,9 +512,12 @@ static int init_constants_early(struct ubifs_info *c)
512 512
513 c->leb_cnt = c->vi.size; 513 c->leb_cnt = c->vi.size;
514 c->leb_size = c->vi.usable_leb_size; 514 c->leb_size = c->vi.usable_leb_size;
515 c->leb_start = c->di.leb_start;
515 c->half_leb_size = c->leb_size / 2; 516 c->half_leb_size = c->leb_size / 2;
516 c->min_io_size = c->di.min_io_size; 517 c->min_io_size = c->di.min_io_size;
517 c->min_io_shift = fls(c->min_io_size) - 1; 518 c->min_io_shift = fls(c->min_io_size) - 1;
519 c->max_write_size = c->di.max_write_size;
520 c->max_write_shift = fls(c->max_write_size) - 1;
518 521
519 if (c->leb_size < UBIFS_MIN_LEB_SZ) { 522 if (c->leb_size < UBIFS_MIN_LEB_SZ) {
520 ubifs_err("too small LEBs (%d bytes), min. is %d bytes", 523 ubifs_err("too small LEBs (%d bytes), min. is %d bytes",
@@ -534,6 +537,18 @@ static int init_constants_early(struct ubifs_info *c)
534 } 537 }
535 538
536 /* 539 /*
540 * Maximum write size has to be greater or equivalent to min. I/O
541 * size, and be multiple of min. I/O size.
542 */
543 if (c->max_write_size < c->min_io_size ||
544 c->max_write_size % c->min_io_size ||
545 !is_power_of_2(c->max_write_size)) {
546 ubifs_err("bad write buffer size %d for %d min. I/O unit",
547 c->max_write_size, c->min_io_size);
548 return -EINVAL;
549 }
550
551 /*
537 * UBIFS aligns all node to 8-byte boundary, so to make function in 552 * UBIFS aligns all node to 8-byte boundary, so to make function in
538 * io.c simpler, assume minimum I/O unit size to be 8 bytes if it is 553 * io.c simpler, assume minimum I/O unit size to be 8 bytes if it is
539 * less than 8. 554 * less than 8.
@@ -541,6 +556,10 @@ static int init_constants_early(struct ubifs_info *c)
541 if (c->min_io_size < 8) { 556 if (c->min_io_size < 8) {
542 c->min_io_size = 8; 557 c->min_io_size = 8;
543 c->min_io_shift = 3; 558 c->min_io_shift = 3;
559 if (c->max_write_size < c->min_io_size) {
560 c->max_write_size = c->min_io_size;
561 c->max_write_shift = c->min_io_shift;
562 }
544 } 563 }
545 564
546 c->ref_node_alsz = ALIGN(UBIFS_REF_NODE_SZ, c->min_io_size); 565 c->ref_node_alsz = ALIGN(UBIFS_REF_NODE_SZ, c->min_io_size);
@@ -1202,11 +1221,14 @@ static int mount_ubifs(struct ubifs_info *c)
1202 if (c->bulk_read == 1) 1221 if (c->bulk_read == 1)
1203 bu_init(c); 1222 bu_init(c);
1204 1223
1205 /* 1224 if (!c->ro_mount) {
1206 * We have to check all CRCs, even for data nodes, when we mount the FS 1225 c->write_reserve_buf = kmalloc(COMPRESSED_DATA_NODE_BUF_SZ,
1207 * (specifically, when we are replaying). 1226 GFP_KERNEL);
1208 */ 1227 if (!c->write_reserve_buf)
1209 c->always_chk_crc = 1; 1228 goto out_free;
1229 }
1230
1231 c->mounting = 1;
1210 1232
1211 err = ubifs_read_superblock(c); 1233 err = ubifs_read_superblock(c);
1212 if (err) 1234 if (err)
@@ -1382,7 +1404,7 @@ static int mount_ubifs(struct ubifs_info *c)
1382 if (err) 1404 if (err)
1383 goto out_infos; 1405 goto out_infos;
1384 1406
1385 c->always_chk_crc = 0; 1407 c->mounting = 0;
1386 1408
1387 ubifs_msg("mounted UBI device %d, volume %d, name \"%s\"", 1409 ubifs_msg("mounted UBI device %d, volume %d, name \"%s\"",
1388 c->vi.ubi_num, c->vi.vol_id, c->vi.name); 1410 c->vi.ubi_num, c->vi.vol_id, c->vi.name);
@@ -1403,6 +1425,7 @@ static int mount_ubifs(struct ubifs_info *c)
1403 1425
1404 dbg_msg("compiled on: " __DATE__ " at " __TIME__); 1426 dbg_msg("compiled on: " __DATE__ " at " __TIME__);
1405 dbg_msg("min. I/O unit size: %d bytes", c->min_io_size); 1427 dbg_msg("min. I/O unit size: %d bytes", c->min_io_size);
1428 dbg_msg("max. write size: %d bytes", c->max_write_size);
1406 dbg_msg("LEB size: %d bytes (%d KiB)", 1429 dbg_msg("LEB size: %d bytes (%d KiB)",
1407 c->leb_size, c->leb_size >> 10); 1430 c->leb_size, c->leb_size >> 10);
1408 dbg_msg("data journal heads: %d", 1431 dbg_msg("data journal heads: %d",
@@ -1432,9 +1455,9 @@ static int mount_ubifs(struct ubifs_info *c)
1432 UBIFS_TRUN_NODE_SZ, UBIFS_SB_NODE_SZ, UBIFS_MST_NODE_SZ); 1455 UBIFS_TRUN_NODE_SZ, UBIFS_SB_NODE_SZ, UBIFS_MST_NODE_SZ);
1433 dbg_msg("node sizes: ref %zu, cmt. start %zu, orph %zu", 1456 dbg_msg("node sizes: ref %zu, cmt. start %zu, orph %zu",
1434 UBIFS_REF_NODE_SZ, UBIFS_CS_NODE_SZ, UBIFS_ORPH_NODE_SZ); 1457 UBIFS_REF_NODE_SZ, UBIFS_CS_NODE_SZ, UBIFS_ORPH_NODE_SZ);
1435 dbg_msg("max. node sizes: data %zu, inode %zu dentry %zu", 1458 dbg_msg("max. node sizes: data %zu, inode %zu dentry %zu, idx %d",
1436 UBIFS_MAX_DATA_NODE_SZ, UBIFS_MAX_INO_NODE_SZ, 1459 UBIFS_MAX_DATA_NODE_SZ, UBIFS_MAX_INO_NODE_SZ,
1437 UBIFS_MAX_DENT_NODE_SZ); 1460 UBIFS_MAX_DENT_NODE_SZ, ubifs_idx_node_sz(c, c->fanout));
1438 dbg_msg("dead watermark: %d", c->dead_wm); 1461 dbg_msg("dead watermark: %d", c->dead_wm);
1439 dbg_msg("dark watermark: %d", c->dark_wm); 1462 dbg_msg("dark watermark: %d", c->dark_wm);
1440 dbg_msg("LEB overhead: %d", c->leb_overhead); 1463 dbg_msg("LEB overhead: %d", c->leb_overhead);
@@ -1474,6 +1497,7 @@ out_wbufs:
1474out_cbuf: 1497out_cbuf:
1475 kfree(c->cbuf); 1498 kfree(c->cbuf);
1476out_free: 1499out_free:
1500 kfree(c->write_reserve_buf);
1477 kfree(c->bu.buf); 1501 kfree(c->bu.buf);
1478 vfree(c->ileb_buf); 1502 vfree(c->ileb_buf);
1479 vfree(c->sbuf); 1503 vfree(c->sbuf);
@@ -1512,6 +1536,7 @@ static void ubifs_umount(struct ubifs_info *c)
1512 kfree(c->cbuf); 1536 kfree(c->cbuf);
1513 kfree(c->rcvrd_mst_node); 1537 kfree(c->rcvrd_mst_node);
1514 kfree(c->mst_node); 1538 kfree(c->mst_node);
1539 kfree(c->write_reserve_buf);
1515 kfree(c->bu.buf); 1540 kfree(c->bu.buf);
1516 vfree(c->ileb_buf); 1541 vfree(c->ileb_buf);
1517 vfree(c->sbuf); 1542 vfree(c->sbuf);
@@ -1543,7 +1568,6 @@ static int ubifs_remount_rw(struct ubifs_info *c)
1543 mutex_lock(&c->umount_mutex); 1568 mutex_lock(&c->umount_mutex);
1544 dbg_save_space_info(c); 1569 dbg_save_space_info(c);
1545 c->remounting_rw = 1; 1570 c->remounting_rw = 1;
1546 c->always_chk_crc = 1;
1547 1571
1548 err = check_free_space(c); 1572 err = check_free_space(c);
1549 if (err) 1573 if (err)
@@ -1598,6 +1622,10 @@ static int ubifs_remount_rw(struct ubifs_info *c)
1598 goto out; 1622 goto out;
1599 } 1623 }
1600 1624
1625 c->write_reserve_buf = kmalloc(COMPRESSED_DATA_NODE_BUF_SZ, GFP_KERNEL);
1626 if (!c->write_reserve_buf)
1627 goto out;
1628
1601 err = ubifs_lpt_init(c, 0, 1); 1629 err = ubifs_lpt_init(c, 0, 1);
1602 if (err) 1630 if (err)
1603 goto out; 1631 goto out;
@@ -1650,7 +1678,6 @@ static int ubifs_remount_rw(struct ubifs_info *c)
1650 dbg_gen("re-mounted read-write"); 1678 dbg_gen("re-mounted read-write");
1651 c->ro_mount = 0; 1679 c->ro_mount = 0;
1652 c->remounting_rw = 0; 1680 c->remounting_rw = 0;
1653 c->always_chk_crc = 0;
1654 err = dbg_check_space_info(c); 1681 err = dbg_check_space_info(c);
1655 mutex_unlock(&c->umount_mutex); 1682 mutex_unlock(&c->umount_mutex);
1656 return err; 1683 return err;
@@ -1663,11 +1690,12 @@ out:
1663 c->bgt = NULL; 1690 c->bgt = NULL;
1664 } 1691 }
1665 free_wbufs(c); 1692 free_wbufs(c);
1693 kfree(c->write_reserve_buf);
1694 c->write_reserve_buf = NULL;
1666 vfree(c->ileb_buf); 1695 vfree(c->ileb_buf);
1667 c->ileb_buf = NULL; 1696 c->ileb_buf = NULL;
1668 ubifs_lpt_free(c, 1); 1697 ubifs_lpt_free(c, 1);
1669 c->remounting_rw = 0; 1698 c->remounting_rw = 0;
1670 c->always_chk_crc = 0;
1671 mutex_unlock(&c->umount_mutex); 1699 mutex_unlock(&c->umount_mutex);
1672 return err; 1700 return err;
1673} 1701}
@@ -1707,6 +1735,8 @@ static void ubifs_remount_ro(struct ubifs_info *c)
1707 free_wbufs(c); 1735 free_wbufs(c);
1708 vfree(c->orph_buf); 1736 vfree(c->orph_buf);
1709 c->orph_buf = NULL; 1737 c->orph_buf = NULL;
1738 kfree(c->write_reserve_buf);
1739 c->write_reserve_buf = NULL;
1710 vfree(c->ileb_buf); 1740 vfree(c->ileb_buf);
1711 c->ileb_buf = NULL; 1741 c->ileb_buf = NULL;
1712 ubifs_lpt_free(c, 1); 1742 ubifs_lpt_free(c, 1);
@@ -1937,6 +1967,7 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent)
1937 mutex_init(&c->mst_mutex); 1967 mutex_init(&c->mst_mutex);
1938 mutex_init(&c->umount_mutex); 1968 mutex_init(&c->umount_mutex);
1939 mutex_init(&c->bu_mutex); 1969 mutex_init(&c->bu_mutex);
1970 mutex_init(&c->write_reserve_mutex);
1940 init_waitqueue_head(&c->cmt_wq); 1971 init_waitqueue_head(&c->cmt_wq);
1941 c->buds = RB_ROOT; 1972 c->buds = RB_ROOT;
1942 c->old_idx = RB_ROOT; 1973 c->old_idx = RB_ROOT;
@@ -1954,6 +1985,7 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent)
1954 INIT_LIST_HEAD(&c->old_buds); 1985 INIT_LIST_HEAD(&c->old_buds);
1955 INIT_LIST_HEAD(&c->orph_list); 1986 INIT_LIST_HEAD(&c->orph_list);
1956 INIT_LIST_HEAD(&c->orph_new); 1987 INIT_LIST_HEAD(&c->orph_new);
1988 c->no_chk_data_crc = 1;
1957 1989
1958 c->vfs_sb = sb; 1990 c->vfs_sb = sb;
1959 c->highest_inum = UBIFS_FIRST_INO; 1991 c->highest_inum = UBIFS_FIRST_INO;
@@ -1979,7 +2011,6 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent)
1979 */ 2011 */
1980 c->bdi.name = "ubifs", 2012 c->bdi.name = "ubifs",
1981 c->bdi.capabilities = BDI_CAP_MAP_COPY; 2013 c->bdi.capabilities = BDI_CAP_MAP_COPY;
1982 c->bdi.unplug_io_fn = default_unplug_io_fn;
1983 err = bdi_init(&c->bdi); 2014 err = bdi_init(&c->bdi);
1984 if (err) 2015 if (err)
1985 goto out_close; 2016 goto out_close;
diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c
index ad9cf0133622..de485979ca39 100644
--- a/fs/ubifs/tnc.c
+++ b/fs/ubifs/tnc.c
@@ -447,8 +447,11 @@ static int tnc_read_node_nm(struct ubifs_info *c, struct ubifs_zbranch *zbr,
447 * 447 *
448 * Note, this function does not check CRC of data nodes if @c->no_chk_data_crc 448 * Note, this function does not check CRC of data nodes if @c->no_chk_data_crc
449 * is true (it is controlled by corresponding mount option). However, if 449 * is true (it is controlled by corresponding mount option). However, if
450 * @c->always_chk_crc is true, @c->no_chk_data_crc is ignored and CRC is always 450 * @c->mounting or @c->remounting_rw is true (we are mounting or re-mounting to
451 * checked. 451 * R/W mode), @c->no_chk_data_crc is ignored and CRC is checked. This is
452 * because during mounting or re-mounting from R/O mode to R/W mode we may read
453 * journal nodes (when replying the journal or doing the recovery) and the
454 * journal nodes may potentially be corrupted, so checking is required.
452 */ 455 */
453static int try_read_node(const struct ubifs_info *c, void *buf, int type, 456static int try_read_node(const struct ubifs_info *c, void *buf, int type,
454 int len, int lnum, int offs) 457 int len, int lnum, int offs)
@@ -476,7 +479,8 @@ static int try_read_node(const struct ubifs_info *c, void *buf, int type,
476 if (node_len != len) 479 if (node_len != len)
477 return 0; 480 return 0;
478 481
479 if (type == UBIFS_DATA_NODE && !c->always_chk_crc && c->no_chk_data_crc) 482 if (type == UBIFS_DATA_NODE && c->no_chk_data_crc && !c->mounting &&
483 !c->remounting_rw)
480 return 1; 484 return 1;
481 485
482 crc = crc32(UBIFS_CRC32_INIT, buf + 8, node_len - 8); 486 crc = crc32(UBIFS_CRC32_INIT, buf + 8, node_len - 8);
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 381d6b207a52..8c40ad3c6721 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -151,6 +151,12 @@
151 */ 151 */
152#define WORST_COMPR_FACTOR 2 152#define WORST_COMPR_FACTOR 2
153 153
154/*
155 * How much memory is needed for a buffer where we comress a data node.
156 */
157#define COMPRESSED_DATA_NODE_BUF_SZ \
158 (UBIFS_DATA_NODE_SZ + UBIFS_BLOCK_SIZE * WORST_COMPR_FACTOR)
159
154/* Maximum expected tree height for use by bottom_up_buf */ 160/* Maximum expected tree height for use by bottom_up_buf */
155#define BOTTOM_UP_HEIGHT 64 161#define BOTTOM_UP_HEIGHT 64
156 162
@@ -646,6 +652,7 @@ typedef int (*ubifs_lpt_scan_callback)(struct ubifs_info *c,
646 * @offs: write-buffer offset in this logical eraseblock 652 * @offs: write-buffer offset in this logical eraseblock
647 * @avail: number of bytes available in the write-buffer 653 * @avail: number of bytes available in the write-buffer
648 * @used: number of used bytes in the write-buffer 654 * @used: number of used bytes in the write-buffer
655 * @size: write-buffer size (in [@c->min_io_size, @c->max_write_size] range)
649 * @dtype: type of data stored in this LEB (%UBI_LONGTERM, %UBI_SHORTTERM, 656 * @dtype: type of data stored in this LEB (%UBI_LONGTERM, %UBI_SHORTTERM,
650 * %UBI_UNKNOWN) 657 * %UBI_UNKNOWN)
651 * @jhead: journal head the mutex belongs to (note, needed only to shut lockdep 658 * @jhead: journal head the mutex belongs to (note, needed only to shut lockdep
@@ -680,6 +687,7 @@ struct ubifs_wbuf {
680 int offs; 687 int offs;
681 int avail; 688 int avail;
682 int used; 689 int used;
690 int size;
683 int dtype; 691 int dtype;
684 int jhead; 692 int jhead;
685 int (*sync_callback)(struct ubifs_info *c, int lnum, int free, int pad); 693 int (*sync_callback)(struct ubifs_info *c, int lnum, int free, int pad);
@@ -1003,6 +1011,11 @@ struct ubifs_debug_info;
1003 * @bu_mutex: protects the pre-allocated bulk-read buffer and @c->bu 1011 * @bu_mutex: protects the pre-allocated bulk-read buffer and @c->bu
1004 * @bu: pre-allocated bulk-read information 1012 * @bu: pre-allocated bulk-read information
1005 * 1013 *
1014 * @write_reserve_mutex: protects @write_reserve_buf
1015 * @write_reserve_buf: on the write path we allocate memory, which might
1016 * sometimes be unavailable, in which case we use this
1017 * write reserve buffer
1018 *
1006 * @log_lebs: number of logical eraseblocks in the log 1019 * @log_lebs: number of logical eraseblocks in the log
1007 * @log_bytes: log size in bytes 1020 * @log_bytes: log size in bytes
1008 * @log_last: last LEB of the log 1021 * @log_last: last LEB of the log
@@ -1024,7 +1037,12 @@ struct ubifs_debug_info;
1024 * 1037 *
1025 * @min_io_size: minimal input/output unit size 1038 * @min_io_size: minimal input/output unit size
1026 * @min_io_shift: number of bits in @min_io_size minus one 1039 * @min_io_shift: number of bits in @min_io_size minus one
1040 * @max_write_size: maximum amount of bytes the underlying flash can write at a
1041 * time (MTD write buffer size)
1042 * @max_write_shift: number of bits in @max_write_size minus one
1027 * @leb_size: logical eraseblock size in bytes 1043 * @leb_size: logical eraseblock size in bytes
1044 * @leb_start: starting offset of logical eraseblocks within physical
1045 * eraseblocks
1028 * @half_leb_size: half LEB size 1046 * @half_leb_size: half LEB size
1029 * @idx_leb_size: how many bytes of an LEB are effectively available when it is 1047 * @idx_leb_size: how many bytes of an LEB are effectively available when it is
1030 * used to store indexing nodes (@leb_size - @max_idx_node_sz) 1048 * used to store indexing nodes (@leb_size - @max_idx_node_sz)
@@ -1166,22 +1184,21 @@ struct ubifs_debug_info;
1166 * @rp_uid: reserved pool user ID 1184 * @rp_uid: reserved pool user ID
1167 * @rp_gid: reserved pool group ID 1185 * @rp_gid: reserved pool group ID
1168 * 1186 *
1169 * @empty: if the UBI device is empty 1187 * @empty: %1 if the UBI device is empty
1188 * @need_recovery: %1 if the file-system needs recovery
1189 * @replaying: %1 during journal replay
1190 * @mounting: %1 while mounting
1191 * @remounting_rw: %1 while re-mounting from R/O mode to R/W mode
1170 * @replay_tree: temporary tree used during journal replay 1192 * @replay_tree: temporary tree used during journal replay
1171 * @replay_list: temporary list used during journal replay 1193 * @replay_list: temporary list used during journal replay
1172 * @replay_buds: list of buds to replay 1194 * @replay_buds: list of buds to replay
1173 * @cs_sqnum: sequence number of first node in the log (commit start node) 1195 * @cs_sqnum: sequence number of first node in the log (commit start node)
1174 * @replay_sqnum: sequence number of node currently being replayed 1196 * @replay_sqnum: sequence number of node currently being replayed
1175 * @need_recovery: file-system needs recovery
1176 * @replaying: set to %1 during journal replay
1177 * @unclean_leb_list: LEBs to recover when re-mounting R/O mounted FS to R/W 1197 * @unclean_leb_list: LEBs to recover when re-mounting R/O mounted FS to R/W
1178 * mode 1198 * mode
1179 * @rcvrd_mst_node: recovered master node to write when re-mounting R/O mounted 1199 * @rcvrd_mst_node: recovered master node to write when re-mounting R/O mounted
1180 * FS to R/W mode 1200 * FS to R/W mode
1181 * @size_tree: inode size information for recovery 1201 * @size_tree: inode size information for recovery
1182 * @remounting_rw: set while re-mounting from R/O mode to R/W mode
1183 * @always_chk_crc: always check CRCs (while mounting and remounting to R/W
1184 * mode)
1185 * @mount_opts: UBIFS-specific mount options 1202 * @mount_opts: UBIFS-specific mount options
1186 * 1203 *
1187 * @dbg: debugging-related information 1204 * @dbg: debugging-related information
@@ -1250,6 +1267,9 @@ struct ubifs_info {
1250 struct mutex bu_mutex; 1267 struct mutex bu_mutex;
1251 struct bu_info bu; 1268 struct bu_info bu;
1252 1269
1270 struct mutex write_reserve_mutex;
1271 void *write_reserve_buf;
1272
1253 int log_lebs; 1273 int log_lebs;
1254 long long log_bytes; 1274 long long log_bytes;
1255 int log_last; 1275 int log_last;
@@ -1271,7 +1291,10 @@ struct ubifs_info {
1271 1291
1272 int min_io_size; 1292 int min_io_size;
1273 int min_io_shift; 1293 int min_io_shift;
1294 int max_write_size;
1295 int max_write_shift;
1274 int leb_size; 1296 int leb_size;
1297 int leb_start;
1275 int half_leb_size; 1298 int half_leb_size;
1276 int idx_leb_size; 1299 int idx_leb_size;
1277 int leb_cnt; 1300 int leb_cnt;
@@ -1402,19 +1425,19 @@ struct ubifs_info {
1402 gid_t rp_gid; 1425 gid_t rp_gid;
1403 1426
1404 /* The below fields are used only during mounting and re-mounting */ 1427 /* The below fields are used only during mounting and re-mounting */
1405 int empty; 1428 unsigned int empty:1;
1429 unsigned int need_recovery:1;
1430 unsigned int replaying:1;
1431 unsigned int mounting:1;
1432 unsigned int remounting_rw:1;
1406 struct rb_root replay_tree; 1433 struct rb_root replay_tree;
1407 struct list_head replay_list; 1434 struct list_head replay_list;
1408 struct list_head replay_buds; 1435 struct list_head replay_buds;
1409 unsigned long long cs_sqnum; 1436 unsigned long long cs_sqnum;
1410 unsigned long long replay_sqnum; 1437 unsigned long long replay_sqnum;
1411 int need_recovery;
1412 int replaying;
1413 struct list_head unclean_leb_list; 1438 struct list_head unclean_leb_list;
1414 struct ubifs_mst_node *rcvrd_mst_node; 1439 struct ubifs_mst_node *rcvrd_mst_node;
1415 struct rb_root size_tree; 1440 struct rb_root size_tree;
1416 int remounting_rw;
1417 int always_chk_crc;
1418 struct ubifs_mount_opts mount_opts; 1441 struct ubifs_mount_opts mount_opts;
1419 1442
1420#ifdef CONFIG_UBIFS_FS_DEBUG 1443#ifdef CONFIG_UBIFS_FS_DEBUG
diff --git a/fs/udf/balloc.c b/fs/udf/balloc.c
index 306ee39ef2c3..95518a9f589e 100644
--- a/fs/udf/balloc.c
+++ b/fs/udf/balloc.c
@@ -27,11 +27,10 @@
27#include "udf_i.h" 27#include "udf_i.h"
28#include "udf_sb.h" 28#include "udf_sb.h"
29 29
30#define udf_clear_bit(nr, addr) ext2_clear_bit(nr, addr) 30#define udf_clear_bit __test_and_clear_bit_le
31#define udf_set_bit(nr, addr) ext2_set_bit(nr, addr) 31#define udf_set_bit __test_and_set_bit_le
32#define udf_test_bit(nr, addr) ext2_test_bit(nr, addr) 32#define udf_test_bit test_bit_le
33#define udf_find_next_one_bit(addr, size, offset) \ 33#define udf_find_next_one_bit find_next_bit_le
34 ext2_find_next_bit(addr, size, offset)
35 34
36static int read_block_bitmap(struct super_block *sb, 35static int read_block_bitmap(struct super_block *sb,
37 struct udf_bitmap *bitmap, unsigned int block, 36 struct udf_bitmap *bitmap, unsigned int block,
@@ -297,7 +296,7 @@ repeat:
297 break; 296 break;
298 } 297 }
299 } else { 298 } else {
300 bit = udf_find_next_one_bit((char *)bh->b_data, 299 bit = udf_find_next_one_bit(bh->b_data,
301 sb->s_blocksize << 3, 300 sb->s_blocksize << 3,
302 group_start << 3); 301 group_start << 3);
303 if (bit < sb->s_blocksize << 3) 302 if (bit < sb->s_blocksize << 3)
diff --git a/fs/udf/file.c b/fs/udf/file.c
index 89c78486cbbe..2a346bb1d9f5 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -98,7 +98,6 @@ static int udf_adinicb_write_end(struct file *file,
98const struct address_space_operations udf_adinicb_aops = { 98const struct address_space_operations udf_adinicb_aops = {
99 .readpage = udf_adinicb_readpage, 99 .readpage = udf_adinicb_readpage,
100 .writepage = udf_adinicb_writepage, 100 .writepage = udf_adinicb_writepage,
101 .sync_page = block_sync_page,
102 .write_begin = simple_write_begin, 101 .write_begin = simple_write_begin,
103 .write_end = udf_adinicb_write_end, 102 .write_end = udf_adinicb_write_end,
104}; 103};
@@ -123,8 +122,8 @@ static ssize_t udf_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
123 if (inode->i_sb->s_blocksize < 122 if (inode->i_sb->s_blocksize <
124 (udf_file_entry_alloc_offset(inode) + 123 (udf_file_entry_alloc_offset(inode) +
125 pos + count)) { 124 pos + count)) {
126 udf_expand_file_adinicb(inode, pos + count, &err); 125 err = udf_expand_file_adinicb(inode);
127 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { 126 if (err) {
128 udf_debug("udf_expand_adinicb: err=%d\n", err); 127 udf_debug("udf_expand_adinicb: err=%d\n", err);
129 up_write(&iinfo->i_data_sem); 128 up_write(&iinfo->i_data_sem);
130 return err; 129 return err;
@@ -237,7 +236,7 @@ static int udf_setattr(struct dentry *dentry, struct iattr *attr)
237 236
238 if ((attr->ia_valid & ATTR_SIZE) && 237 if ((attr->ia_valid & ATTR_SIZE) &&
239 attr->ia_size != i_size_read(inode)) { 238 attr->ia_size != i_size_read(inode)) {
240 error = vmtruncate(inode, attr->ia_size); 239 error = udf_setsize(inode, attr->ia_size);
241 if (error) 240 if (error)
242 return error; 241 return error;
243 } 242 }
@@ -249,5 +248,4 @@ static int udf_setattr(struct dentry *dentry, struct iattr *attr)
249 248
250const struct inode_operations udf_file_inode_operations = { 249const struct inode_operations udf_file_inode_operations = {
251 .setattr = udf_setattr, 250 .setattr = udf_setattr,
252 .truncate = udf_truncate,
253}; 251};
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index c6a2e782b97b..1d1358ed80c1 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -73,14 +73,12 @@ void udf_evict_inode(struct inode *inode)
73 struct udf_inode_info *iinfo = UDF_I(inode); 73 struct udf_inode_info *iinfo = UDF_I(inode);
74 int want_delete = 0; 74 int want_delete = 0;
75 75
76 truncate_inode_pages(&inode->i_data, 0);
77
78 if (!inode->i_nlink && !is_bad_inode(inode)) { 76 if (!inode->i_nlink && !is_bad_inode(inode)) {
79 want_delete = 1; 77 want_delete = 1;
80 inode->i_size = 0; 78 udf_setsize(inode, 0);
81 udf_truncate(inode);
82 udf_update_inode(inode, IS_SYNC(inode)); 79 udf_update_inode(inode, IS_SYNC(inode));
83 } 80 } else
81 truncate_inode_pages(&inode->i_data, 0);
84 invalidate_inode_buffers(inode); 82 invalidate_inode_buffers(inode);
85 end_writeback(inode); 83 end_writeback(inode);
86 if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB && 84 if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB &&
@@ -117,9 +115,18 @@ static int udf_write_begin(struct file *file, struct address_space *mapping,
117 115
118 ret = block_write_begin(mapping, pos, len, flags, pagep, udf_get_block); 116 ret = block_write_begin(mapping, pos, len, flags, pagep, udf_get_block);
119 if (unlikely(ret)) { 117 if (unlikely(ret)) {
120 loff_t isize = mapping->host->i_size; 118 struct inode *inode = mapping->host;
121 if (pos + len > isize) 119 struct udf_inode_info *iinfo = UDF_I(inode);
122 vmtruncate(mapping->host, isize); 120 loff_t isize = inode->i_size;
121
122 if (pos + len > isize) {
123 truncate_pagecache(inode, pos + len, isize);
124 if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) {
125 down_write(&iinfo->i_data_sem);
126 udf_truncate_extents(inode);
127 up_write(&iinfo->i_data_sem);
128 }
129 }
123 } 130 }
124 131
125 return ret; 132 return ret;
@@ -133,36 +140,36 @@ static sector_t udf_bmap(struct address_space *mapping, sector_t block)
133const struct address_space_operations udf_aops = { 140const struct address_space_operations udf_aops = {
134 .readpage = udf_readpage, 141 .readpage = udf_readpage,
135 .writepage = udf_writepage, 142 .writepage = udf_writepage,
136 .sync_page = block_sync_page,
137 .write_begin = udf_write_begin, 143 .write_begin = udf_write_begin,
138 .write_end = generic_write_end, 144 .write_end = generic_write_end,
139 .bmap = udf_bmap, 145 .bmap = udf_bmap,
140}; 146};
141 147
142void udf_expand_file_adinicb(struct inode *inode, int newsize, int *err) 148int udf_expand_file_adinicb(struct inode *inode)
143{ 149{
144 struct page *page; 150 struct page *page;
145 char *kaddr; 151 char *kaddr;
146 struct udf_inode_info *iinfo = UDF_I(inode); 152 struct udf_inode_info *iinfo = UDF_I(inode);
153 int err;
147 struct writeback_control udf_wbc = { 154 struct writeback_control udf_wbc = {
148 .sync_mode = WB_SYNC_NONE, 155 .sync_mode = WB_SYNC_NONE,
149 .nr_to_write = 1, 156 .nr_to_write = 1,
150 }; 157 };
151 158
152 /* from now on we have normal address_space methods */
153 inode->i_data.a_ops = &udf_aops;
154
155 if (!iinfo->i_lenAlloc) { 159 if (!iinfo->i_lenAlloc) {
156 if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_SHORT_AD)) 160 if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_SHORT_AD))
157 iinfo->i_alloc_type = ICBTAG_FLAG_AD_SHORT; 161 iinfo->i_alloc_type = ICBTAG_FLAG_AD_SHORT;
158 else 162 else
159 iinfo->i_alloc_type = ICBTAG_FLAG_AD_LONG; 163 iinfo->i_alloc_type = ICBTAG_FLAG_AD_LONG;
164 /* from now on we have normal address_space methods */
165 inode->i_data.a_ops = &udf_aops;
160 mark_inode_dirty(inode); 166 mark_inode_dirty(inode);
161 return; 167 return 0;
162 } 168 }
163 169
164 page = grab_cache_page(inode->i_mapping, 0); 170 page = find_or_create_page(inode->i_mapping, 0, GFP_NOFS);
165 BUG_ON(!PageLocked(page)); 171 if (!page)
172 return -ENOMEM;
166 173
167 if (!PageUptodate(page)) { 174 if (!PageUptodate(page)) {
168 kaddr = kmap(page); 175 kaddr = kmap(page);
@@ -181,11 +188,24 @@ void udf_expand_file_adinicb(struct inode *inode, int newsize, int *err)
181 iinfo->i_alloc_type = ICBTAG_FLAG_AD_SHORT; 188 iinfo->i_alloc_type = ICBTAG_FLAG_AD_SHORT;
182 else 189 else
183 iinfo->i_alloc_type = ICBTAG_FLAG_AD_LONG; 190 iinfo->i_alloc_type = ICBTAG_FLAG_AD_LONG;
184 191 /* from now on we have normal address_space methods */
185 inode->i_data.a_ops->writepage(page, &udf_wbc); 192 inode->i_data.a_ops = &udf_aops;
193 err = inode->i_data.a_ops->writepage(page, &udf_wbc);
194 if (err) {
195 /* Restore everything back so that we don't lose data... */
196 lock_page(page);
197 kaddr = kmap(page);
198 memcpy(iinfo->i_ext.i_data + iinfo->i_lenEAttr, kaddr,
199 inode->i_size);
200 kunmap(page);
201 unlock_page(page);
202 iinfo->i_alloc_type = ICBTAG_FLAG_AD_IN_ICB;
203 inode->i_data.a_ops = &udf_adinicb_aops;
204 }
186 page_cache_release(page); 205 page_cache_release(page);
187
188 mark_inode_dirty(inode); 206 mark_inode_dirty(inode);
207
208 return err;
189} 209}
190 210
191struct buffer_head *udf_expand_dir_adinicb(struct inode *inode, int *block, 211struct buffer_head *udf_expand_dir_adinicb(struct inode *inode, int *block,
@@ -348,8 +368,10 @@ static struct buffer_head *udf_getblk(struct inode *inode, long block,
348} 368}
349 369
350/* Extend the file by 'blocks' blocks, return the number of extents added */ 370/* Extend the file by 'blocks' blocks, return the number of extents added */
351int udf_extend_file(struct inode *inode, struct extent_position *last_pos, 371static int udf_do_extend_file(struct inode *inode,
352 struct kernel_long_ad *last_ext, sector_t blocks) 372 struct extent_position *last_pos,
373 struct kernel_long_ad *last_ext,
374 sector_t blocks)
353{ 375{
354 sector_t add; 376 sector_t add;
355 int count = 0, fake = !(last_ext->extLength & UDF_EXTENT_LENGTH_MASK); 377 int count = 0, fake = !(last_ext->extLength & UDF_EXTENT_LENGTH_MASK);
@@ -357,6 +379,7 @@ int udf_extend_file(struct inode *inode, struct extent_position *last_pos,
357 struct kernel_lb_addr prealloc_loc = {}; 379 struct kernel_lb_addr prealloc_loc = {};
358 int prealloc_len = 0; 380 int prealloc_len = 0;
359 struct udf_inode_info *iinfo; 381 struct udf_inode_info *iinfo;
382 int err;
360 383
361 /* The previous extent is fake and we should not extend by anything 384 /* The previous extent is fake and we should not extend by anything
362 * - there's nothing to do... */ 385 * - there's nothing to do... */
@@ -422,26 +445,29 @@ int udf_extend_file(struct inode *inode, struct extent_position *last_pos,
422 /* Create enough extents to cover the whole hole */ 445 /* Create enough extents to cover the whole hole */
423 while (blocks > add) { 446 while (blocks > add) {
424 blocks -= add; 447 blocks -= add;
425 if (udf_add_aext(inode, last_pos, &last_ext->extLocation, 448 err = udf_add_aext(inode, last_pos, &last_ext->extLocation,
426 last_ext->extLength, 1) == -1) 449 last_ext->extLength, 1);
427 return -1; 450 if (err)
451 return err;
428 count++; 452 count++;
429 } 453 }
430 if (blocks) { 454 if (blocks) {
431 last_ext->extLength = EXT_NOT_RECORDED_NOT_ALLOCATED | 455 last_ext->extLength = EXT_NOT_RECORDED_NOT_ALLOCATED |
432 (blocks << sb->s_blocksize_bits); 456 (blocks << sb->s_blocksize_bits);
433 if (udf_add_aext(inode, last_pos, &last_ext->extLocation, 457 err = udf_add_aext(inode, last_pos, &last_ext->extLocation,
434 last_ext->extLength, 1) == -1) 458 last_ext->extLength, 1);
435 return -1; 459 if (err)
460 return err;
436 count++; 461 count++;
437 } 462 }
438 463
439out: 464out:
440 /* Do we have some preallocated blocks saved? */ 465 /* Do we have some preallocated blocks saved? */
441 if (prealloc_len) { 466 if (prealloc_len) {
442 if (udf_add_aext(inode, last_pos, &prealloc_loc, 467 err = udf_add_aext(inode, last_pos, &prealloc_loc,
443 prealloc_len, 1) == -1) 468 prealloc_len, 1);
444 return -1; 469 if (err)
470 return err;
445 last_ext->extLocation = prealloc_loc; 471 last_ext->extLocation = prealloc_loc;
446 last_ext->extLength = prealloc_len; 472 last_ext->extLength = prealloc_len;
447 count++; 473 count++;
@@ -453,11 +479,68 @@ out:
453 else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) 479 else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
454 last_pos->offset -= sizeof(struct long_ad); 480 last_pos->offset -= sizeof(struct long_ad);
455 else 481 else
456 return -1; 482 return -EIO;
457 483
458 return count; 484 return count;
459} 485}
460 486
487static int udf_extend_file(struct inode *inode, loff_t newsize)
488{
489
490 struct extent_position epos;
491 struct kernel_lb_addr eloc;
492 uint32_t elen;
493 int8_t etype;
494 struct super_block *sb = inode->i_sb;
495 sector_t first_block = newsize >> sb->s_blocksize_bits, offset;
496 int adsize;
497 struct udf_inode_info *iinfo = UDF_I(inode);
498 struct kernel_long_ad extent;
499 int err;
500
501 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
502 adsize = sizeof(struct short_ad);
503 else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
504 adsize = sizeof(struct long_ad);
505 else
506 BUG();
507
508 etype = inode_bmap(inode, first_block, &epos, &eloc, &elen, &offset);
509
510 /* File has extent covering the new size (could happen when extending
511 * inside a block)? */
512 if (etype != -1)
513 return 0;
514 if (newsize & (sb->s_blocksize - 1))
515 offset++;
516 /* Extended file just to the boundary of the last file block? */
517 if (offset == 0)
518 return 0;
519
520 /* Truncate is extending the file by 'offset' blocks */
521 if ((!epos.bh && epos.offset == udf_file_entry_alloc_offset(inode)) ||
522 (epos.bh && epos.offset == sizeof(struct allocExtDesc))) {
523 /* File has no extents at all or has empty last
524 * indirect extent! Create a fake extent... */
525 extent.extLocation.logicalBlockNum = 0;
526 extent.extLocation.partitionReferenceNum = 0;
527 extent.extLength = EXT_NOT_RECORDED_NOT_ALLOCATED;
528 } else {
529 epos.offset -= adsize;
530 etype = udf_next_aext(inode, &epos, &extent.extLocation,
531 &extent.extLength, 0);
532 extent.extLength |= etype << 30;
533 }
534 err = udf_do_extend_file(inode, &epos, &extent, offset);
535 if (err < 0)
536 goto out;
537 err = 0;
538 iinfo->i_lenExtents = newsize;
539out:
540 brelse(epos.bh);
541 return err;
542}
543
461static struct buffer_head *inode_getblk(struct inode *inode, sector_t block, 544static struct buffer_head *inode_getblk(struct inode *inode, sector_t block,
462 int *err, sector_t *phys, int *new) 545 int *err, sector_t *phys, int *new)
463{ 546{
@@ -540,7 +623,7 @@ static struct buffer_head *inode_getblk(struct inode *inode, sector_t block,
540 elen = EXT_RECORDED_ALLOCATED | 623 elen = EXT_RECORDED_ALLOCATED |
541 ((elen + inode->i_sb->s_blocksize - 1) & 624 ((elen + inode->i_sb->s_blocksize - 1) &
542 ~(inode->i_sb->s_blocksize - 1)); 625 ~(inode->i_sb->s_blocksize - 1));
543 etype = udf_write_aext(inode, &cur_epos, &eloc, elen, 1); 626 udf_write_aext(inode, &cur_epos, &eloc, elen, 1);
544 } 627 }
545 brelse(prev_epos.bh); 628 brelse(prev_epos.bh);
546 brelse(cur_epos.bh); 629 brelse(cur_epos.bh);
@@ -564,19 +647,17 @@ static struct buffer_head *inode_getblk(struct inode *inode, sector_t block,
564 memset(&laarr[0].extLocation, 0x00, 647 memset(&laarr[0].extLocation, 0x00,
565 sizeof(struct kernel_lb_addr)); 648 sizeof(struct kernel_lb_addr));
566 laarr[0].extLength = EXT_NOT_RECORDED_NOT_ALLOCATED; 649 laarr[0].extLength = EXT_NOT_RECORDED_NOT_ALLOCATED;
567 /* Will udf_extend_file() create real extent from 650 /* Will udf_do_extend_file() create real extent from
568 a fake one? */ 651 a fake one? */
569 startnum = (offset > 0); 652 startnum = (offset > 0);
570 } 653 }
571 /* Create extents for the hole between EOF and offset */ 654 /* Create extents for the hole between EOF and offset */
572 ret = udf_extend_file(inode, &prev_epos, laarr, offset); 655 ret = udf_do_extend_file(inode, &prev_epos, laarr, offset);
573 if (ret == -1) { 656 if (ret < 0) {
574 brelse(prev_epos.bh); 657 brelse(prev_epos.bh);
575 brelse(cur_epos.bh); 658 brelse(cur_epos.bh);
576 brelse(next_epos.bh); 659 brelse(next_epos.bh);
577 /* We don't really know the error here so we just make 660 *err = ret;
578 * something up */
579 *err = -ENOSPC;
580 return NULL; 661 return NULL;
581 } 662 }
582 c = 0; 663 c = 0;
@@ -1005,52 +1086,66 @@ struct buffer_head *udf_bread(struct inode *inode, int block,
1005 return NULL; 1086 return NULL;
1006} 1087}
1007 1088
1008void udf_truncate(struct inode *inode) 1089int udf_setsize(struct inode *inode, loff_t newsize)
1009{ 1090{
1010 int offset;
1011 int err; 1091 int err;
1012 struct udf_inode_info *iinfo; 1092 struct udf_inode_info *iinfo;
1093 int bsize = 1 << inode->i_blkbits;
1013 1094
1014 if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || 1095 if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
1015 S_ISLNK(inode->i_mode))) 1096 S_ISLNK(inode->i_mode)))
1016 return; 1097 return -EINVAL;
1017 if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) 1098 if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
1018 return; 1099 return -EPERM;
1019 1100
1020 iinfo = UDF_I(inode); 1101 iinfo = UDF_I(inode);
1021 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { 1102 if (newsize > inode->i_size) {
1022 down_write(&iinfo->i_data_sem); 1103 down_write(&iinfo->i_data_sem);
1023 if (inode->i_sb->s_blocksize < 1104 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) {
1024 (udf_file_entry_alloc_offset(inode) + 1105 if (bsize <
1025 inode->i_size)) { 1106 (udf_file_entry_alloc_offset(inode) + newsize)) {
1026 udf_expand_file_adinicb(inode, inode->i_size, &err); 1107 err = udf_expand_file_adinicb(inode);
1027 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { 1108 if (err) {
1028 inode->i_size = iinfo->i_lenAlloc; 1109 up_write(&iinfo->i_data_sem);
1029 up_write(&iinfo->i_data_sem); 1110 return err;
1030 return; 1111 }
1031 } else 1112 } else
1032 udf_truncate_extents(inode); 1113 iinfo->i_lenAlloc = newsize;
1033 } else { 1114 }
1034 offset = inode->i_size & (inode->i_sb->s_blocksize - 1); 1115 err = udf_extend_file(inode, newsize);
1035 memset(iinfo->i_ext.i_data + iinfo->i_lenEAttr + offset, 1116 if (err) {
1036 0x00, inode->i_sb->s_blocksize - 1117 up_write(&iinfo->i_data_sem);
1037 offset - udf_file_entry_alloc_offset(inode)); 1118 return err;
1038 iinfo->i_lenAlloc = inode->i_size;
1039 } 1119 }
1120 truncate_setsize(inode, newsize);
1040 up_write(&iinfo->i_data_sem); 1121 up_write(&iinfo->i_data_sem);
1041 } else { 1122 } else {
1042 block_truncate_page(inode->i_mapping, inode->i_size, 1123 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) {
1043 udf_get_block); 1124 down_write(&iinfo->i_data_sem);
1125 memset(iinfo->i_ext.i_data + iinfo->i_lenEAttr + newsize,
1126 0x00, bsize - newsize -
1127 udf_file_entry_alloc_offset(inode));
1128 iinfo->i_lenAlloc = newsize;
1129 truncate_setsize(inode, newsize);
1130 up_write(&iinfo->i_data_sem);
1131 goto update_time;
1132 }
1133 err = block_truncate_page(inode->i_mapping, newsize,
1134 udf_get_block);
1135 if (err)
1136 return err;
1044 down_write(&iinfo->i_data_sem); 1137 down_write(&iinfo->i_data_sem);
1138 truncate_setsize(inode, newsize);
1045 udf_truncate_extents(inode); 1139 udf_truncate_extents(inode);
1046 up_write(&iinfo->i_data_sem); 1140 up_write(&iinfo->i_data_sem);
1047 } 1141 }
1048 1142update_time:
1049 inode->i_mtime = inode->i_ctime = current_fs_time(inode->i_sb); 1143 inode->i_mtime = inode->i_ctime = current_fs_time(inode->i_sb);
1050 if (IS_SYNC(inode)) 1144 if (IS_SYNC(inode))
1051 udf_sync_inode(inode); 1145 udf_sync_inode(inode);
1052 else 1146 else
1053 mark_inode_dirty(inode); 1147 mark_inode_dirty(inode);
1148 return 0;
1054} 1149}
1055 1150
1056static void __udf_read_inode(struct inode *inode) 1151static void __udf_read_inode(struct inode *inode)
@@ -1637,14 +1732,13 @@ struct inode *udf_iget(struct super_block *sb, struct kernel_lb_addr *ino)
1637 return NULL; 1732 return NULL;
1638} 1733}
1639 1734
1640int8_t udf_add_aext(struct inode *inode, struct extent_position *epos, 1735int udf_add_aext(struct inode *inode, struct extent_position *epos,
1641 struct kernel_lb_addr *eloc, uint32_t elen, int inc) 1736 struct kernel_lb_addr *eloc, uint32_t elen, int inc)
1642{ 1737{
1643 int adsize; 1738 int adsize;
1644 struct short_ad *sad = NULL; 1739 struct short_ad *sad = NULL;
1645 struct long_ad *lad = NULL; 1740 struct long_ad *lad = NULL;
1646 struct allocExtDesc *aed; 1741 struct allocExtDesc *aed;
1647 int8_t etype;
1648 uint8_t *ptr; 1742 uint8_t *ptr;
1649 struct udf_inode_info *iinfo = UDF_I(inode); 1743 struct udf_inode_info *iinfo = UDF_I(inode);
1650 1744
@@ -1660,7 +1754,7 @@ int8_t udf_add_aext(struct inode *inode, struct extent_position *epos,
1660 else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) 1754 else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
1661 adsize = sizeof(struct long_ad); 1755 adsize = sizeof(struct long_ad);
1662 else 1756 else
1663 return -1; 1757 return -EIO;
1664 1758
1665 if (epos->offset + (2 * adsize) > inode->i_sb->s_blocksize) { 1759 if (epos->offset + (2 * adsize) > inode->i_sb->s_blocksize) {
1666 unsigned char *sptr, *dptr; 1760 unsigned char *sptr, *dptr;
@@ -1672,12 +1766,12 @@ int8_t udf_add_aext(struct inode *inode, struct extent_position *epos,
1672 obloc.partitionReferenceNum, 1766 obloc.partitionReferenceNum,
1673 obloc.logicalBlockNum, &err); 1767 obloc.logicalBlockNum, &err);
1674 if (!epos->block.logicalBlockNum) 1768 if (!epos->block.logicalBlockNum)
1675 return -1; 1769 return -ENOSPC;
1676 nbh = udf_tgetblk(inode->i_sb, udf_get_lb_pblock(inode->i_sb, 1770 nbh = udf_tgetblk(inode->i_sb, udf_get_lb_pblock(inode->i_sb,
1677 &epos->block, 1771 &epos->block,
1678 0)); 1772 0));
1679 if (!nbh) 1773 if (!nbh)
1680 return -1; 1774 return -EIO;
1681 lock_buffer(nbh); 1775 lock_buffer(nbh);
1682 memset(nbh->b_data, 0x00, inode->i_sb->s_blocksize); 1776 memset(nbh->b_data, 0x00, inode->i_sb->s_blocksize);
1683 set_buffer_uptodate(nbh); 1777 set_buffer_uptodate(nbh);
@@ -1746,7 +1840,7 @@ int8_t udf_add_aext(struct inode *inode, struct extent_position *epos,
1746 epos->bh = nbh; 1840 epos->bh = nbh;
1747 } 1841 }
1748 1842
1749 etype = udf_write_aext(inode, epos, eloc, elen, inc); 1843 udf_write_aext(inode, epos, eloc, elen, inc);
1750 1844
1751 if (!epos->bh) { 1845 if (!epos->bh) {
1752 iinfo->i_lenAlloc += adsize; 1846 iinfo->i_lenAlloc += adsize;
@@ -1764,11 +1858,11 @@ int8_t udf_add_aext(struct inode *inode, struct extent_position *epos,
1764 mark_buffer_dirty_inode(epos->bh, inode); 1858 mark_buffer_dirty_inode(epos->bh, inode);
1765 } 1859 }
1766 1860
1767 return etype; 1861 return 0;
1768} 1862}
1769 1863
1770int8_t udf_write_aext(struct inode *inode, struct extent_position *epos, 1864void udf_write_aext(struct inode *inode, struct extent_position *epos,
1771 struct kernel_lb_addr *eloc, uint32_t elen, int inc) 1865 struct kernel_lb_addr *eloc, uint32_t elen, int inc)
1772{ 1866{
1773 int adsize; 1867 int adsize;
1774 uint8_t *ptr; 1868 uint8_t *ptr;
@@ -1798,7 +1892,7 @@ int8_t udf_write_aext(struct inode *inode, struct extent_position *epos,
1798 adsize = sizeof(struct long_ad); 1892 adsize = sizeof(struct long_ad);
1799 break; 1893 break;
1800 default: 1894 default:
1801 return -1; 1895 return;
1802 } 1896 }
1803 1897
1804 if (epos->bh) { 1898 if (epos->bh) {
@@ -1817,8 +1911,6 @@ int8_t udf_write_aext(struct inode *inode, struct extent_position *epos,
1817 1911
1818 if (inc) 1912 if (inc)
1819 epos->offset += adsize; 1913 epos->offset += adsize;
1820
1821 return (elen >> 30);
1822} 1914}
1823 1915
1824int8_t udf_next_aext(struct inode *inode, struct extent_position *epos, 1916int8_t udf_next_aext(struct inode *inode, struct extent_position *epos,
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index 2be0f9eb86d2..f1dce848ef96 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -32,6 +32,8 @@
32#include <linux/crc-itu-t.h> 32#include <linux/crc-itu-t.h>
33#include <linux/exportfs.h> 33#include <linux/exportfs.h>
34 34
35enum { UDF_MAX_LINKS = 0xffff };
36
35static inline int udf_match(int len1, const unsigned char *name1, int len2, 37static inline int udf_match(int len1, const unsigned char *name1, int len2,
36 const unsigned char *name2) 38 const unsigned char *name2)
37{ 39{
@@ -650,7 +652,7 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, int mode)
650 struct udf_inode_info *iinfo; 652 struct udf_inode_info *iinfo;
651 653
652 err = -EMLINK; 654 err = -EMLINK;
653 if (dir->i_nlink >= (256 << sizeof(dir->i_nlink)) - 1) 655 if (dir->i_nlink >= UDF_MAX_LINKS)
654 goto out; 656 goto out;
655 657
656 err = -EIO; 658 err = -EIO;
@@ -1034,9 +1036,8 @@ static int udf_link(struct dentry *old_dentry, struct inode *dir,
1034 struct fileIdentDesc cfi, *fi; 1036 struct fileIdentDesc cfi, *fi;
1035 int err; 1037 int err;
1036 1038
1037 if (inode->i_nlink >= (256 << sizeof(inode->i_nlink)) - 1) { 1039 if (inode->i_nlink >= UDF_MAX_LINKS)
1038 return -EMLINK; 1040 return -EMLINK;
1039 }
1040 1041
1041 fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err); 1042 fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err);
1042 if (!fi) { 1043 if (!fi) {
@@ -1131,9 +1132,7 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry,
1131 goto end_rename; 1132 goto end_rename;
1132 1133
1133 retval = -EMLINK; 1134 retval = -EMLINK;
1134 if (!new_inode && 1135 if (!new_inode && new_dir->i_nlink >= UDF_MAX_LINKS)
1135 new_dir->i_nlink >=
1136 (256 << sizeof(new_dir->i_nlink)) - 1)
1137 goto end_rename; 1136 goto end_rename;
1138 } 1137 }
1139 if (!nfi) { 1138 if (!nfi) {
@@ -1287,8 +1286,13 @@ static int udf_encode_fh(struct dentry *de, __u32 *fh, int *lenp,
1287 struct fid *fid = (struct fid *)fh; 1286 struct fid *fid = (struct fid *)fh;
1288 int type = FILEID_UDF_WITHOUT_PARENT; 1287 int type = FILEID_UDF_WITHOUT_PARENT;
1289 1288
1290 if (len < 3 || (connectable && len < 5)) 1289 if (connectable && (len < 5)) {
1290 *lenp = 5;
1291 return 255; 1291 return 255;
1292 } else if (len < 3) {
1293 *lenp = 3;
1294 return 255;
1295 }
1292 1296
1293 *lenp = 3; 1297 *lenp = 3;
1294 fid->udf.block = location.logicalBlockNum; 1298 fid->udf.block = location.logicalBlockNum;
diff --git a/fs/udf/truncate.c b/fs/udf/truncate.c
index 225527cdc885..8424308db4b4 100644
--- a/fs/udf/truncate.c
+++ b/fs/udf/truncate.c
@@ -197,6 +197,11 @@ static void udf_update_alloc_ext_desc(struct inode *inode,
197 mark_buffer_dirty_inode(epos->bh, inode); 197 mark_buffer_dirty_inode(epos->bh, inode);
198} 198}
199 199
200/*
201 * Truncate extents of inode to inode->i_size. This function can be used only
202 * for making file shorter. For making file longer, udf_extend_file() has to
203 * be used.
204 */
200void udf_truncate_extents(struct inode *inode) 205void udf_truncate_extents(struct inode *inode)
201{ 206{
202 struct extent_position epos; 207 struct extent_position epos;
@@ -219,96 +224,65 @@ void udf_truncate_extents(struct inode *inode)
219 etype = inode_bmap(inode, first_block, &epos, &eloc, &elen, &offset); 224 etype = inode_bmap(inode, first_block, &epos, &eloc, &elen, &offset);
220 byte_offset = (offset << sb->s_blocksize_bits) + 225 byte_offset = (offset << sb->s_blocksize_bits) +
221 (inode->i_size & (sb->s_blocksize - 1)); 226 (inode->i_size & (sb->s_blocksize - 1));
222 if (etype != -1) { 227 if (etype == -1) {
223 epos.offset -= adsize; 228 /* We should extend the file? */
224 extent_trunc(inode, &epos, &eloc, etype, elen, byte_offset); 229 WARN_ON(byte_offset);
225 epos.offset += adsize; 230 return;
226 if (byte_offset) 231 }
227 lenalloc = epos.offset; 232 epos.offset -= adsize;
228 else 233 extent_trunc(inode, &epos, &eloc, etype, elen, byte_offset);
229 lenalloc = epos.offset - adsize; 234 epos.offset += adsize;
230 235 if (byte_offset)
231 if (!epos.bh) 236 lenalloc = epos.offset;
232 lenalloc -= udf_file_entry_alloc_offset(inode); 237 else
233 else 238 lenalloc = epos.offset - adsize;
234 lenalloc -= sizeof(struct allocExtDesc);
235
236 while ((etype = udf_current_aext(inode, &epos, &eloc,
237 &elen, 0)) != -1) {
238 if (etype == (EXT_NEXT_EXTENT_ALLOCDECS >> 30)) {
239 udf_write_aext(inode, &epos, &neloc, nelen, 0);
240 if (indirect_ext_len) {
241 /* We managed to free all extents in the
242 * indirect extent - free it too */
243 BUG_ON(!epos.bh);
244 udf_free_blocks(sb, inode, &epos.block,
245 0, indirect_ext_len);
246 } else if (!epos.bh) {
247 iinfo->i_lenAlloc = lenalloc;
248 mark_inode_dirty(inode);
249 } else
250 udf_update_alloc_ext_desc(inode,
251 &epos, lenalloc);
252 brelse(epos.bh);
253 epos.offset = sizeof(struct allocExtDesc);
254 epos.block = eloc;
255 epos.bh = udf_tread(sb,
256 udf_get_lb_pblock(sb, &eloc, 0));
257 if (elen)
258 indirect_ext_len =
259 (elen + sb->s_blocksize - 1) >>
260 sb->s_blocksize_bits;
261 else
262 indirect_ext_len = 1;
263 } else {
264 extent_trunc(inode, &epos, &eloc, etype,
265 elen, 0);
266 epos.offset += adsize;
267 }
268 }
269 239
270 if (indirect_ext_len) { 240 if (!epos.bh)
271 BUG_ON(!epos.bh); 241 lenalloc -= udf_file_entry_alloc_offset(inode);
272 udf_free_blocks(sb, inode, &epos.block, 0, 242 else
273 indirect_ext_len); 243 lenalloc -= sizeof(struct allocExtDesc);
274 } else if (!epos.bh) {
275 iinfo->i_lenAlloc = lenalloc;
276 mark_inode_dirty(inode);
277 } else
278 udf_update_alloc_ext_desc(inode, &epos, lenalloc);
279 } else if (inode->i_size) {
280 if (byte_offset) {
281 struct kernel_long_ad extent;
282 244
283 /* 245 while ((etype = udf_current_aext(inode, &epos, &eloc,
284 * OK, there is not extent covering inode->i_size and 246 &elen, 0)) != -1) {
285 * no extent above inode->i_size => truncate is 247 if (etype == (EXT_NEXT_EXTENT_ALLOCDECS >> 30)) {
286 * extending the file by 'offset' blocks. 248 udf_write_aext(inode, &epos, &neloc, nelen, 0);
287 */ 249 if (indirect_ext_len) {
288 if ((!epos.bh && 250 /* We managed to free all extents in the
289 epos.offset == 251 * indirect extent - free it too */
290 udf_file_entry_alloc_offset(inode)) || 252 BUG_ON(!epos.bh);
291 (epos.bh && epos.offset == 253 udf_free_blocks(sb, inode, &epos.block,
292 sizeof(struct allocExtDesc))) { 254 0, indirect_ext_len);
293 /* File has no extents at all or has empty last 255 } else if (!epos.bh) {
294 * indirect extent! Create a fake extent... */ 256 iinfo->i_lenAlloc = lenalloc;
295 extent.extLocation.logicalBlockNum = 0; 257 mark_inode_dirty(inode);
296 extent.extLocation.partitionReferenceNum = 0; 258 } else
297 extent.extLength = 259 udf_update_alloc_ext_desc(inode,
298 EXT_NOT_RECORDED_NOT_ALLOCATED; 260 &epos, lenalloc);
299 } else { 261 brelse(epos.bh);
300 epos.offset -= adsize; 262 epos.offset = sizeof(struct allocExtDesc);
301 etype = udf_next_aext(inode, &epos, 263 epos.block = eloc;
302 &extent.extLocation, 264 epos.bh = udf_tread(sb,
303 &extent.extLength, 0); 265 udf_get_lb_pblock(sb, &eloc, 0));
304 extent.extLength |= etype << 30; 266 if (elen)
305 } 267 indirect_ext_len =
306 udf_extend_file(inode, &epos, &extent, 268 (elen + sb->s_blocksize - 1) >>
307 offset + 269 sb->s_blocksize_bits;
308 ((inode->i_size & 270 else
309 (sb->s_blocksize - 1)) != 0)); 271 indirect_ext_len = 1;
272 } else {
273 extent_trunc(inode, &epos, &eloc, etype, elen, 0);
274 epos.offset += adsize;
310 } 275 }
311 } 276 }
277
278 if (indirect_ext_len) {
279 BUG_ON(!epos.bh);
280 udf_free_blocks(sb, inode, &epos.block, 0, indirect_ext_len);
281 } else if (!epos.bh) {
282 iinfo->i_lenAlloc = lenalloc;
283 mark_inode_dirty(inode);
284 } else
285 udf_update_alloc_ext_desc(inode, &epos, lenalloc);
312 iinfo->i_lenExtents = inode->i_size; 286 iinfo->i_lenExtents = inode->i_size;
313 287
314 brelse(epos.bh); 288 brelse(epos.bh);
diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h
index eba48209f9f3..dbd52d4b5eed 100644
--- a/fs/udf/udfdecl.h
+++ b/fs/udf/udfdecl.h
@@ -136,22 +136,20 @@ extern int udf_write_fi(struct inode *inode, struct fileIdentDesc *,
136extern long udf_ioctl(struct file *, unsigned int, unsigned long); 136extern long udf_ioctl(struct file *, unsigned int, unsigned long);
137/* inode.c */ 137/* inode.c */
138extern struct inode *udf_iget(struct super_block *, struct kernel_lb_addr *); 138extern struct inode *udf_iget(struct super_block *, struct kernel_lb_addr *);
139extern void udf_expand_file_adinicb(struct inode *, int, int *); 139extern int udf_expand_file_adinicb(struct inode *);
140extern struct buffer_head *udf_expand_dir_adinicb(struct inode *, int *, int *); 140extern struct buffer_head *udf_expand_dir_adinicb(struct inode *, int *, int *);
141extern struct buffer_head *udf_bread(struct inode *, int, int, int *); 141extern struct buffer_head *udf_bread(struct inode *, int, int, int *);
142extern void udf_truncate(struct inode *); 142extern int udf_setsize(struct inode *, loff_t);
143extern void udf_read_inode(struct inode *); 143extern void udf_read_inode(struct inode *);
144extern void udf_evict_inode(struct inode *); 144extern void udf_evict_inode(struct inode *);
145extern int udf_write_inode(struct inode *, struct writeback_control *wbc); 145extern int udf_write_inode(struct inode *, struct writeback_control *wbc);
146extern long udf_block_map(struct inode *, sector_t); 146extern long udf_block_map(struct inode *, sector_t);
147extern int udf_extend_file(struct inode *, struct extent_position *,
148 struct kernel_long_ad *, sector_t);
149extern int8_t inode_bmap(struct inode *, sector_t, struct extent_position *, 147extern int8_t inode_bmap(struct inode *, sector_t, struct extent_position *,
150 struct kernel_lb_addr *, uint32_t *, sector_t *); 148 struct kernel_lb_addr *, uint32_t *, sector_t *);
151extern int8_t udf_add_aext(struct inode *, struct extent_position *, 149extern int udf_add_aext(struct inode *, struct extent_position *,
150 struct kernel_lb_addr *, uint32_t, int);
151extern void udf_write_aext(struct inode *, struct extent_position *,
152 struct kernel_lb_addr *, uint32_t, int); 152 struct kernel_lb_addr *, uint32_t, int);
153extern int8_t udf_write_aext(struct inode *, struct extent_position *,
154 struct kernel_lb_addr *, uint32_t, int);
155extern int8_t udf_delete_aext(struct inode *, struct extent_position, 153extern int8_t udf_delete_aext(struct inode *, struct extent_position,
156 struct kernel_lb_addr, uint32_t); 154 struct kernel_lb_addr, uint32_t);
157extern int8_t udf_next_aext(struct inode *, struct extent_position *, 155extern int8_t udf_next_aext(struct inode *, struct extent_position *,
diff --git a/fs/ufs/Kconfig b/fs/ufs/Kconfig
index 30c8f223253d..e4f10a40768a 100644
--- a/fs/ufs/Kconfig
+++ b/fs/ufs/Kconfig
@@ -1,7 +1,6 @@
1config UFS_FS 1config UFS_FS
2 tristate "UFS file system support (read only)" 2 tristate "UFS file system support (read only)"
3 depends on BLOCK 3 depends on BLOCK
4 depends on BKL # probably fixable
5 help 4 help
6 BSD and derivate versions of Unix (such as SunOS, FreeBSD, NetBSD, 5 BSD and derivate versions of Unix (such as SunOS, FreeBSD, NetBSD,
7 OpenBSD and NeXTstep) use a file system called UFS. Some System V 6 OpenBSD and NeXTstep) use a file system called UFS. Some System V
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index 2b251f2093af..27a4babe7df0 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -34,7 +34,6 @@
34#include <linux/stat.h> 34#include <linux/stat.h>
35#include <linux/string.h> 35#include <linux/string.h>
36#include <linux/mm.h> 36#include <linux/mm.h>
37#include <linux/smp_lock.h>
38#include <linux/buffer_head.h> 37#include <linux/buffer_head.h>
39#include <linux/writeback.h> 38#include <linux/writeback.h>
40 39
@@ -43,7 +42,7 @@
43#include "swab.h" 42#include "swab.h"
44#include "util.h" 43#include "util.h"
45 44
46static u64 ufs_frag_map(struct inode *inode, sector_t frag); 45static u64 ufs_frag_map(struct inode *inode, sector_t frag, bool needs_lock);
47 46
48static int ufs_block_to_path(struct inode *inode, sector_t i_block, sector_t offsets[4]) 47static int ufs_block_to_path(struct inode *inode, sector_t i_block, sector_t offsets[4])
49{ 48{
@@ -82,7 +81,7 @@ static int ufs_block_to_path(struct inode *inode, sector_t i_block, sector_t off
82 * the begining of the filesystem. 81 * the begining of the filesystem.
83 */ 82 */
84 83
85static u64 ufs_frag_map(struct inode *inode, sector_t frag) 84static u64 ufs_frag_map(struct inode *inode, sector_t frag, bool needs_lock)
86{ 85{
87 struct ufs_inode_info *ufsi = UFS_I(inode); 86 struct ufs_inode_info *ufsi = UFS_I(inode);
88 struct super_block *sb = inode->i_sb; 87 struct super_block *sb = inode->i_sb;
@@ -107,7 +106,8 @@ static u64 ufs_frag_map(struct inode *inode, sector_t frag)
107 106
108 p = offsets; 107 p = offsets;
109 108
110 lock_kernel(); 109 if (needs_lock)
110 lock_ufs(sb);
111 if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) 111 if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2)
112 goto ufs2; 112 goto ufs2;
113 113
@@ -152,7 +152,8 @@ ufs2:
152 ret = temp + (u64) (frag & uspi->s_fpbmask); 152 ret = temp + (u64) (frag & uspi->s_fpbmask);
153 153
154out: 154out:
155 unlock_kernel(); 155 if (needs_lock)
156 unlock_ufs(sb);
156 return ret; 157 return ret;
157} 158}
158 159
@@ -415,14 +416,16 @@ out:
415int ufs_getfrag_block(struct inode *inode, sector_t fragment, struct buffer_head *bh_result, int create) 416int ufs_getfrag_block(struct inode *inode, sector_t fragment, struct buffer_head *bh_result, int create)
416{ 417{
417 struct super_block * sb = inode->i_sb; 418 struct super_block * sb = inode->i_sb;
418 struct ufs_sb_private_info * uspi = UFS_SB(sb)->s_uspi; 419 struct ufs_sb_info * sbi = UFS_SB(sb);
420 struct ufs_sb_private_info * uspi = sbi->s_uspi;
419 struct buffer_head * bh; 421 struct buffer_head * bh;
420 int ret, err, new; 422 int ret, err, new;
421 unsigned long ptr,phys; 423 unsigned long ptr,phys;
422 u64 phys64 = 0; 424 u64 phys64 = 0;
425 bool needs_lock = (sbi->mutex_owner != current);
423 426
424 if (!create) { 427 if (!create) {
425 phys64 = ufs_frag_map(inode, fragment); 428 phys64 = ufs_frag_map(inode, fragment, needs_lock);
426 UFSD("phys64 = %llu\n", (unsigned long long)phys64); 429 UFSD("phys64 = %llu\n", (unsigned long long)phys64);
427 if (phys64) 430 if (phys64)
428 map_bh(bh_result, sb, phys64); 431 map_bh(bh_result, sb, phys64);
@@ -436,7 +439,8 @@ int ufs_getfrag_block(struct inode *inode, sector_t fragment, struct buffer_head
436 ret = 0; 439 ret = 0;
437 bh = NULL; 440 bh = NULL;
438 441
439 lock_kernel(); 442 if (needs_lock)
443 lock_ufs(sb);
440 444
441 UFSD("ENTER, ino %lu, fragment %llu\n", inode->i_ino, (unsigned long long)fragment); 445 UFSD("ENTER, ino %lu, fragment %llu\n", inode->i_ino, (unsigned long long)fragment);
442 if (fragment > 446 if (fragment >
@@ -498,7 +502,9 @@ out:
498 set_buffer_new(bh_result); 502 set_buffer_new(bh_result);
499 map_bh(bh_result, sb, phys); 503 map_bh(bh_result, sb, phys);
500abort: 504abort:
501 unlock_kernel(); 505 if (needs_lock)
506 unlock_ufs(sb);
507
502 return err; 508 return err;
503 509
504abort_too_big: 510abort_too_big:
@@ -506,48 +512,6 @@ abort_too_big:
506 goto abort; 512 goto abort;
507} 513}
508 514
509static struct buffer_head *ufs_getfrag(struct inode *inode,
510 unsigned int fragment,
511 int create, int *err)
512{
513 struct buffer_head dummy;
514 int error;
515
516 dummy.b_state = 0;
517 dummy.b_blocknr = -1000;
518 error = ufs_getfrag_block(inode, fragment, &dummy, create);
519 *err = error;
520 if (!error && buffer_mapped(&dummy)) {
521 struct buffer_head *bh;
522 bh = sb_getblk(inode->i_sb, dummy.b_blocknr);
523 if (buffer_new(&dummy)) {
524 memset(bh->b_data, 0, inode->i_sb->s_blocksize);
525 set_buffer_uptodate(bh);
526 mark_buffer_dirty(bh);
527 }
528 return bh;
529 }
530 return NULL;
531}
532
533struct buffer_head * ufs_bread (struct inode * inode, unsigned fragment,
534 int create, int * err)
535{
536 struct buffer_head * bh;
537
538 UFSD("ENTER, ino %lu, fragment %u\n", inode->i_ino, fragment);
539 bh = ufs_getfrag (inode, fragment, create, err);
540 if (!bh || buffer_uptodate(bh))
541 return bh;
542 ll_rw_block (READ, 1, &bh);
543 wait_on_buffer (bh);
544 if (buffer_uptodate(bh))
545 return bh;
546 brelse (bh);
547 *err = -EIO;
548 return NULL;
549}
550
551static int ufs_writepage(struct page *page, struct writeback_control *wbc) 515static int ufs_writepage(struct page *page, struct writeback_control *wbc)
552{ 516{
553 return block_write_full_page(page,ufs_getfrag_block,wbc); 517 return block_write_full_page(page,ufs_getfrag_block,wbc);
@@ -588,7 +552,6 @@ static sector_t ufs_bmap(struct address_space *mapping, sector_t block)
588const struct address_space_operations ufs_aops = { 552const struct address_space_operations ufs_aops = {
589 .readpage = ufs_readpage, 553 .readpage = ufs_readpage,
590 .writepage = ufs_writepage, 554 .writepage = ufs_writepage,
591 .sync_page = block_sync_page,
592 .write_begin = ufs_write_begin, 555 .write_begin = ufs_write_begin,
593 .write_end = generic_write_end, 556 .write_end = generic_write_end,
594 .bmap = ufs_bmap 557 .bmap = ufs_bmap
@@ -900,9 +863,9 @@ static int ufs_update_inode(struct inode * inode, int do_sync)
900int ufs_write_inode(struct inode *inode, struct writeback_control *wbc) 863int ufs_write_inode(struct inode *inode, struct writeback_control *wbc)
901{ 864{
902 int ret; 865 int ret;
903 lock_kernel(); 866 lock_ufs(inode->i_sb);
904 ret = ufs_update_inode(inode, wbc->sync_mode == WB_SYNC_ALL); 867 ret = ufs_update_inode(inode, wbc->sync_mode == WB_SYNC_ALL);
905 unlock_kernel(); 868 unlock_ufs(inode->i_sb);
906 return ret; 869 return ret;
907} 870}
908 871
@@ -922,22 +885,22 @@ void ufs_evict_inode(struct inode * inode)
922 if (want_delete) { 885 if (want_delete) {
923 loff_t old_i_size; 886 loff_t old_i_size;
924 /*UFS_I(inode)->i_dtime = CURRENT_TIME;*/ 887 /*UFS_I(inode)->i_dtime = CURRENT_TIME;*/
925 lock_kernel(); 888 lock_ufs(inode->i_sb);
926 mark_inode_dirty(inode); 889 mark_inode_dirty(inode);
927 ufs_update_inode(inode, IS_SYNC(inode)); 890 ufs_update_inode(inode, IS_SYNC(inode));
928 old_i_size = inode->i_size; 891 old_i_size = inode->i_size;
929 inode->i_size = 0; 892 inode->i_size = 0;
930 if (inode->i_blocks && ufs_truncate(inode, old_i_size)) 893 if (inode->i_blocks && ufs_truncate(inode, old_i_size))
931 ufs_warning(inode->i_sb, __func__, "ufs_truncate failed\n"); 894 ufs_warning(inode->i_sb, __func__, "ufs_truncate failed\n");
932 unlock_kernel(); 895 unlock_ufs(inode->i_sb);
933 } 896 }
934 897
935 invalidate_inode_buffers(inode); 898 invalidate_inode_buffers(inode);
936 end_writeback(inode); 899 end_writeback(inode);
937 900
938 if (want_delete) { 901 if (want_delete) {
939 lock_kernel(); 902 lock_ufs(inode->i_sb);
940 ufs_free_inode (inode); 903 ufs_free_inode (inode);
941 unlock_kernel(); 904 unlock_ufs(inode->i_sb);
942 } 905 }
943} 906}
diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c
index 12f39b9e4437..29309e25417f 100644
--- a/fs/ufs/namei.c
+++ b/fs/ufs/namei.c
@@ -29,7 +29,6 @@
29 29
30#include <linux/time.h> 30#include <linux/time.h>
31#include <linux/fs.h> 31#include <linux/fs.h>
32#include <linux/smp_lock.h>
33 32
34#include "ufs_fs.h" 33#include "ufs_fs.h"
35#include "ufs.h" 34#include "ufs.h"
@@ -55,16 +54,16 @@ static struct dentry *ufs_lookup(struct inode * dir, struct dentry *dentry, stru
55 if (dentry->d_name.len > UFS_MAXNAMLEN) 54 if (dentry->d_name.len > UFS_MAXNAMLEN)
56 return ERR_PTR(-ENAMETOOLONG); 55 return ERR_PTR(-ENAMETOOLONG);
57 56
58 lock_kernel(); 57 lock_ufs(dir->i_sb);
59 ino = ufs_inode_by_name(dir, &dentry->d_name); 58 ino = ufs_inode_by_name(dir, &dentry->d_name);
60 if (ino) { 59 if (ino) {
61 inode = ufs_iget(dir->i_sb, ino); 60 inode = ufs_iget(dir->i_sb, ino);
62 if (IS_ERR(inode)) { 61 if (IS_ERR(inode)) {
63 unlock_kernel(); 62 unlock_ufs(dir->i_sb);
64 return ERR_CAST(inode); 63 return ERR_CAST(inode);
65 } 64 }
66 } 65 }
67 unlock_kernel(); 66 unlock_ufs(dir->i_sb);
68 d_add(dentry, inode); 67 d_add(dentry, inode);
69 return NULL; 68 return NULL;
70} 69}
@@ -93,9 +92,9 @@ static int ufs_create (struct inode * dir, struct dentry * dentry, int mode,
93 inode->i_fop = &ufs_file_operations; 92 inode->i_fop = &ufs_file_operations;
94 inode->i_mapping->a_ops = &ufs_aops; 93 inode->i_mapping->a_ops = &ufs_aops;
95 mark_inode_dirty(inode); 94 mark_inode_dirty(inode);
96 lock_kernel(); 95 lock_ufs(dir->i_sb);
97 err = ufs_add_nondir(dentry, inode); 96 err = ufs_add_nondir(dentry, inode);
98 unlock_kernel(); 97 unlock_ufs(dir->i_sb);
99 } 98 }
100 UFSD("END: err=%d\n", err); 99 UFSD("END: err=%d\n", err);
101 return err; 100 return err;
@@ -115,9 +114,9 @@ static int ufs_mknod (struct inode * dir, struct dentry *dentry, int mode, dev_t
115 init_special_inode(inode, mode, rdev); 114 init_special_inode(inode, mode, rdev);
116 ufs_set_inode_dev(inode->i_sb, UFS_I(inode), rdev); 115 ufs_set_inode_dev(inode->i_sb, UFS_I(inode), rdev);
117 mark_inode_dirty(inode); 116 mark_inode_dirty(inode);
118 lock_kernel(); 117 lock_ufs(dir->i_sb);
119 err = ufs_add_nondir(dentry, inode); 118 err = ufs_add_nondir(dentry, inode);
120 unlock_kernel(); 119 unlock_ufs(dir->i_sb);
121 } 120 }
122 return err; 121 return err;
123} 122}
@@ -133,7 +132,7 @@ static int ufs_symlink (struct inode * dir, struct dentry * dentry,
133 if (l > sb->s_blocksize) 132 if (l > sb->s_blocksize)
134 goto out_notlocked; 133 goto out_notlocked;
135 134
136 lock_kernel(); 135 lock_ufs(dir->i_sb);
137 inode = ufs_new_inode(dir, S_IFLNK | S_IRWXUGO); 136 inode = ufs_new_inode(dir, S_IFLNK | S_IRWXUGO);
138 err = PTR_ERR(inode); 137 err = PTR_ERR(inode);
139 if (IS_ERR(inode)) 138 if (IS_ERR(inode))
@@ -156,7 +155,7 @@ static int ufs_symlink (struct inode * dir, struct dentry * dentry,
156 155
157 err = ufs_add_nondir(dentry, inode); 156 err = ufs_add_nondir(dentry, inode);
158out: 157out:
159 unlock_kernel(); 158 unlock_ufs(dir->i_sb);
160out_notlocked: 159out_notlocked:
161 return err; 160 return err;
162 161
@@ -172,9 +171,9 @@ static int ufs_link (struct dentry * old_dentry, struct inode * dir,
172 struct inode *inode = old_dentry->d_inode; 171 struct inode *inode = old_dentry->d_inode;
173 int error; 172 int error;
174 173
175 lock_kernel(); 174 lock_ufs(dir->i_sb);
176 if (inode->i_nlink >= UFS_LINK_MAX) { 175 if (inode->i_nlink >= UFS_LINK_MAX) {
177 unlock_kernel(); 176 unlock_ufs(dir->i_sb);
178 return -EMLINK; 177 return -EMLINK;
179 } 178 }
180 179
@@ -183,7 +182,7 @@ static int ufs_link (struct dentry * old_dentry, struct inode * dir,
183 ihold(inode); 182 ihold(inode);
184 183
185 error = ufs_add_nondir(dentry, inode); 184 error = ufs_add_nondir(dentry, inode);
186 unlock_kernel(); 185 unlock_ufs(dir->i_sb);
187 return error; 186 return error;
188} 187}
189 188
@@ -195,7 +194,7 @@ static int ufs_mkdir(struct inode * dir, struct dentry * dentry, int mode)
195 if (dir->i_nlink >= UFS_LINK_MAX) 194 if (dir->i_nlink >= UFS_LINK_MAX)
196 goto out; 195 goto out;
197 196
198 lock_kernel(); 197 lock_ufs(dir->i_sb);
199 inode_inc_link_count(dir); 198 inode_inc_link_count(dir);
200 199
201 inode = ufs_new_inode(dir, S_IFDIR|mode); 200 inode = ufs_new_inode(dir, S_IFDIR|mode);
@@ -216,7 +215,7 @@ static int ufs_mkdir(struct inode * dir, struct dentry * dentry, int mode)
216 err = ufs_add_link(dentry, inode); 215 err = ufs_add_link(dentry, inode);
217 if (err) 216 if (err)
218 goto out_fail; 217 goto out_fail;
219 unlock_kernel(); 218 unlock_ufs(dir->i_sb);
220 219
221 d_instantiate(dentry, inode); 220 d_instantiate(dentry, inode);
222out: 221out:
@@ -228,7 +227,7 @@ out_fail:
228 iput (inode); 227 iput (inode);
229out_dir: 228out_dir:
230 inode_dec_link_count(dir); 229 inode_dec_link_count(dir);
231 unlock_kernel(); 230 unlock_ufs(dir->i_sb);
232 goto out; 231 goto out;
233} 232}
234 233
@@ -259,7 +258,7 @@ static int ufs_rmdir (struct inode * dir, struct dentry *dentry)
259 struct inode * inode = dentry->d_inode; 258 struct inode * inode = dentry->d_inode;
260 int err= -ENOTEMPTY; 259 int err= -ENOTEMPTY;
261 260
262 lock_kernel(); 261 lock_ufs(dir->i_sb);
263 if (ufs_empty_dir (inode)) { 262 if (ufs_empty_dir (inode)) {
264 err = ufs_unlink(dir, dentry); 263 err = ufs_unlink(dir, dentry);
265 if (!err) { 264 if (!err) {
@@ -268,7 +267,7 @@ static int ufs_rmdir (struct inode * dir, struct dentry *dentry)
268 inode_dec_link_count(dir); 267 inode_dec_link_count(dir);
269 } 268 }
270 } 269 }
271 unlock_kernel(); 270 unlock_ufs(dir->i_sb);
272 return err; 271 return err;
273} 272}
274 273
@@ -306,7 +305,6 @@ static int ufs_rename(struct inode *old_dir, struct dentry *old_dentry,
306 new_de = ufs_find_entry(new_dir, &new_dentry->d_name, &new_page); 305 new_de = ufs_find_entry(new_dir, &new_dentry->d_name, &new_page);
307 if (!new_de) 306 if (!new_de)
308 goto out_dir; 307 goto out_dir;
309 inode_inc_link_count(old_inode);
310 ufs_set_link(new_dir, new_de, new_page, old_inode); 308 ufs_set_link(new_dir, new_de, new_page, old_inode);
311 new_inode->i_ctime = CURRENT_TIME_SEC; 309 new_inode->i_ctime = CURRENT_TIME_SEC;
312 if (dir_de) 310 if (dir_de)
@@ -318,12 +316,9 @@ static int ufs_rename(struct inode *old_dir, struct dentry *old_dentry,
318 if (new_dir->i_nlink >= UFS_LINK_MAX) 316 if (new_dir->i_nlink >= UFS_LINK_MAX)
319 goto out_dir; 317 goto out_dir;
320 } 318 }
321 inode_inc_link_count(old_inode);
322 err = ufs_add_link(new_dentry, old_inode); 319 err = ufs_add_link(new_dentry, old_inode);
323 if (err) { 320 if (err)
324 inode_dec_link_count(old_inode);
325 goto out_dir; 321 goto out_dir;
326 }
327 if (dir_de) 322 if (dir_de)
328 inode_inc_link_count(new_dir); 323 inode_inc_link_count(new_dir);
329 } 324 }
@@ -331,12 +326,11 @@ static int ufs_rename(struct inode *old_dir, struct dentry *old_dentry,
331 /* 326 /*
332 * Like most other Unix systems, set the ctime for inodes on a 327 * Like most other Unix systems, set the ctime for inodes on a
333 * rename. 328 * rename.
334 * inode_dec_link_count() will mark the inode dirty.
335 */ 329 */
336 old_inode->i_ctime = CURRENT_TIME_SEC; 330 old_inode->i_ctime = CURRENT_TIME_SEC;
337 331
338 ufs_delete_entry(old_dir, old_de, old_page); 332 ufs_delete_entry(old_dir, old_de, old_page);
339 inode_dec_link_count(old_inode); 333 mark_inode_dirty(old_inode);
340 334
341 if (dir_de) { 335 if (dir_de) {
342 ufs_set_link(old_inode, dir_de, dir_page, new_dir); 336 ufs_set_link(old_inode, dir_de, dir_page, new_dir);
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index 2c61ac5d4e48..7693d6293404 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -84,7 +84,6 @@
84#include <linux/blkdev.h> 84#include <linux/blkdev.h>
85#include <linux/init.h> 85#include <linux/init.h>
86#include <linux/parser.h> 86#include <linux/parser.h>
87#include <linux/smp_lock.h>
88#include <linux/buffer_head.h> 87#include <linux/buffer_head.h>
89#include <linux/vfs.h> 88#include <linux/vfs.h>
90#include <linux/log2.h> 89#include <linux/log2.h>
@@ -96,6 +95,26 @@
96#include "swab.h" 95#include "swab.h"
97#include "util.h" 96#include "util.h"
98 97
98void lock_ufs(struct super_block *sb)
99{
100#if defined(CONFIG_SMP) || defined (CONFIG_PREEMPT)
101 struct ufs_sb_info *sbi = UFS_SB(sb);
102
103 mutex_lock(&sbi->mutex);
104 sbi->mutex_owner = current;
105#endif
106}
107
108void unlock_ufs(struct super_block *sb)
109{
110#if defined(CONFIG_SMP) || defined (CONFIG_PREEMPT)
111 struct ufs_sb_info *sbi = UFS_SB(sb);
112
113 sbi->mutex_owner = NULL;
114 mutex_unlock(&sbi->mutex);
115#endif
116}
117
99static struct inode *ufs_nfs_get_inode(struct super_block *sb, u64 ino, u32 generation) 118static struct inode *ufs_nfs_get_inode(struct super_block *sb, u64 ino, u32 generation)
100{ 119{
101 struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; 120 struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi;
@@ -313,7 +332,6 @@ void ufs_panic (struct super_block * sb, const char * function,
313 struct ufs_super_block_first * usb1; 332 struct ufs_super_block_first * usb1;
314 va_list args; 333 va_list args;
315 334
316 lock_kernel();
317 uspi = UFS_SB(sb)->s_uspi; 335 uspi = UFS_SB(sb)->s_uspi;
318 usb1 = ubh_get_usb_first(uspi); 336 usb1 = ubh_get_usb_first(uspi);
319 337
@@ -521,7 +539,7 @@ static int ufs_read_cylinder_structures(struct super_block *sb)
521 */ 539 */
522 size = uspi->s_cssize; 540 size = uspi->s_cssize;
523 blks = (size + uspi->s_fsize - 1) >> uspi->s_fshift; 541 blks = (size + uspi->s_fsize - 1) >> uspi->s_fshift;
524 base = space = kmalloc(size, GFP_KERNEL); 542 base = space = kmalloc(size, GFP_NOFS);
525 if (!base) 543 if (!base)
526 goto failed; 544 goto failed;
527 sbi->s_csp = (struct ufs_csum *)space; 545 sbi->s_csp = (struct ufs_csum *)space;
@@ -546,7 +564,7 @@ static int ufs_read_cylinder_structures(struct super_block *sb)
546 * Read cylinder group (we read only first fragment from block 564 * Read cylinder group (we read only first fragment from block
547 * at this time) and prepare internal data structures for cg caching. 565 * at this time) and prepare internal data structures for cg caching.
548 */ 566 */
549 if (!(sbi->s_ucg = kmalloc (sizeof(struct buffer_head *) * uspi->s_ncg, GFP_KERNEL))) 567 if (!(sbi->s_ucg = kmalloc (sizeof(struct buffer_head *) * uspi->s_ncg, GFP_NOFS)))
550 goto failed; 568 goto failed;
551 for (i = 0; i < uspi->s_ncg; i++) 569 for (i = 0; i < uspi->s_ncg; i++)
552 sbi->s_ucg[i] = NULL; 570 sbi->s_ucg[i] = NULL;
@@ -564,7 +582,7 @@ static int ufs_read_cylinder_structures(struct super_block *sb)
564 ufs_print_cylinder_stuff(sb, (struct ufs_cylinder_group *) sbi->s_ucg[i]->b_data); 582 ufs_print_cylinder_stuff(sb, (struct ufs_cylinder_group *) sbi->s_ucg[i]->b_data);
565 } 583 }
566 for (i = 0; i < UFS_MAX_GROUP_LOADED; i++) { 584 for (i = 0; i < UFS_MAX_GROUP_LOADED; i++) {
567 if (!(sbi->s_ucpi[i] = kmalloc (sizeof(struct ufs_cg_private_info), GFP_KERNEL))) 585 if (!(sbi->s_ucpi[i] = kmalloc (sizeof(struct ufs_cg_private_info), GFP_NOFS)))
568 goto failed; 586 goto failed;
569 sbi->s_cgno[i] = UFS_CGNO_EMPTY; 587 sbi->s_cgno[i] = UFS_CGNO_EMPTY;
570 } 588 }
@@ -646,8 +664,6 @@ static void ufs_put_super_internal(struct super_block *sb)
646 664
647 UFSD("ENTER\n"); 665 UFSD("ENTER\n");
648 666
649 lock_kernel();
650
651 ufs_put_cstotal(sb); 667 ufs_put_cstotal(sb);
652 size = uspi->s_cssize; 668 size = uspi->s_cssize;
653 blks = (size + uspi->s_fsize - 1) >> uspi->s_fshift; 669 blks = (size + uspi->s_fsize - 1) >> uspi->s_fshift;
@@ -676,8 +692,6 @@ static void ufs_put_super_internal(struct super_block *sb)
676 kfree (sbi->s_ucg); 692 kfree (sbi->s_ucg);
677 kfree (base); 693 kfree (base);
678 694
679 unlock_kernel();
680
681 UFSD("EXIT\n"); 695 UFSD("EXIT\n");
682} 696}
683 697
@@ -696,8 +710,6 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
696 unsigned maxsymlen; 710 unsigned maxsymlen;
697 int ret = -EINVAL; 711 int ret = -EINVAL;
698 712
699 lock_kernel();
700
701 uspi = NULL; 713 uspi = NULL;
702 ubh = NULL; 714 ubh = NULL;
703 flags = 0; 715 flags = 0;
@@ -718,6 +730,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
718 goto failed; 730 goto failed;
719 } 731 }
720#endif 732#endif
733 mutex_init(&sbi->mutex);
721 /* 734 /*
722 * Set default mount options 735 * Set default mount options
723 * Parse mount options 736 * Parse mount options
@@ -1165,7 +1178,6 @@ magic_found:
1165 goto failed; 1178 goto failed;
1166 1179
1167 UFSD("EXIT\n"); 1180 UFSD("EXIT\n");
1168 unlock_kernel();
1169 return 0; 1181 return 0;
1170 1182
1171dalloc_failed: 1183dalloc_failed:
@@ -1177,12 +1189,10 @@ failed:
1177 kfree(sbi); 1189 kfree(sbi);
1178 sb->s_fs_info = NULL; 1190 sb->s_fs_info = NULL;
1179 UFSD("EXIT (FAILED)\n"); 1191 UFSD("EXIT (FAILED)\n");
1180 unlock_kernel();
1181 return ret; 1192 return ret;
1182 1193
1183failed_nomem: 1194failed_nomem:
1184 UFSD("EXIT (NOMEM)\n"); 1195 UFSD("EXIT (NOMEM)\n");
1185 unlock_kernel();
1186 return -ENOMEM; 1196 return -ENOMEM;
1187} 1197}
1188 1198
@@ -1193,8 +1203,8 @@ static int ufs_sync_fs(struct super_block *sb, int wait)
1193 struct ufs_super_block_third * usb3; 1203 struct ufs_super_block_third * usb3;
1194 unsigned flags; 1204 unsigned flags;
1195 1205
1206 lock_ufs(sb);
1196 lock_super(sb); 1207 lock_super(sb);
1197 lock_kernel();
1198 1208
1199 UFSD("ENTER\n"); 1209 UFSD("ENTER\n");
1200 1210
@@ -1213,8 +1223,8 @@ static int ufs_sync_fs(struct super_block *sb, int wait)
1213 sb->s_dirt = 0; 1223 sb->s_dirt = 0;
1214 1224
1215 UFSD("EXIT\n"); 1225 UFSD("EXIT\n");
1216 unlock_kernel();
1217 unlock_super(sb); 1226 unlock_super(sb);
1227 unlock_ufs(sb);
1218 1228
1219 return 0; 1229 return 0;
1220} 1230}
@@ -1256,7 +1266,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
1256 unsigned new_mount_opt, ufstype; 1266 unsigned new_mount_opt, ufstype;
1257 unsigned flags; 1267 unsigned flags;
1258 1268
1259 lock_kernel(); 1269 lock_ufs(sb);
1260 lock_super(sb); 1270 lock_super(sb);
1261 uspi = UFS_SB(sb)->s_uspi; 1271 uspi = UFS_SB(sb)->s_uspi;
1262 flags = UFS_SB(sb)->s_flags; 1272 flags = UFS_SB(sb)->s_flags;
@@ -1272,7 +1282,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
1272 ufs_set_opt (new_mount_opt, ONERROR_LOCK); 1282 ufs_set_opt (new_mount_opt, ONERROR_LOCK);
1273 if (!ufs_parse_options (data, &new_mount_opt)) { 1283 if (!ufs_parse_options (data, &new_mount_opt)) {
1274 unlock_super(sb); 1284 unlock_super(sb);
1275 unlock_kernel(); 1285 unlock_ufs(sb);
1276 return -EINVAL; 1286 return -EINVAL;
1277 } 1287 }
1278 if (!(new_mount_opt & UFS_MOUNT_UFSTYPE)) { 1288 if (!(new_mount_opt & UFS_MOUNT_UFSTYPE)) {
@@ -1280,14 +1290,14 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
1280 } else if ((new_mount_opt & UFS_MOUNT_UFSTYPE) != ufstype) { 1290 } else if ((new_mount_opt & UFS_MOUNT_UFSTYPE) != ufstype) {
1281 printk("ufstype can't be changed during remount\n"); 1291 printk("ufstype can't be changed during remount\n");
1282 unlock_super(sb); 1292 unlock_super(sb);
1283 unlock_kernel(); 1293 unlock_ufs(sb);
1284 return -EINVAL; 1294 return -EINVAL;
1285 } 1295 }
1286 1296
1287 if ((*mount_flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) { 1297 if ((*mount_flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) {
1288 UFS_SB(sb)->s_mount_opt = new_mount_opt; 1298 UFS_SB(sb)->s_mount_opt = new_mount_opt;
1289 unlock_super(sb); 1299 unlock_super(sb);
1290 unlock_kernel(); 1300 unlock_ufs(sb);
1291 return 0; 1301 return 0;
1292 } 1302 }
1293 1303
@@ -1313,7 +1323,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
1313 printk("ufs was compiled with read-only support, " 1323 printk("ufs was compiled with read-only support, "
1314 "can't be mounted as read-write\n"); 1324 "can't be mounted as read-write\n");
1315 unlock_super(sb); 1325 unlock_super(sb);
1316 unlock_kernel(); 1326 unlock_ufs(sb);
1317 return -EINVAL; 1327 return -EINVAL;
1318#else 1328#else
1319 if (ufstype != UFS_MOUNT_UFSTYPE_SUN && 1329 if (ufstype != UFS_MOUNT_UFSTYPE_SUN &&
@@ -1323,13 +1333,13 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
1323 ufstype != UFS_MOUNT_UFSTYPE_UFS2) { 1333 ufstype != UFS_MOUNT_UFSTYPE_UFS2) {
1324 printk("this ufstype is read-only supported\n"); 1334 printk("this ufstype is read-only supported\n");
1325 unlock_super(sb); 1335 unlock_super(sb);
1326 unlock_kernel(); 1336 unlock_ufs(sb);
1327 return -EINVAL; 1337 return -EINVAL;
1328 } 1338 }
1329 if (!ufs_read_cylinder_structures(sb)) { 1339 if (!ufs_read_cylinder_structures(sb)) {
1330 printk("failed during remounting\n"); 1340 printk("failed during remounting\n");
1331 unlock_super(sb); 1341 unlock_super(sb);
1332 unlock_kernel(); 1342 unlock_ufs(sb);
1333 return -EPERM; 1343 return -EPERM;
1334 } 1344 }
1335 sb->s_flags &= ~MS_RDONLY; 1345 sb->s_flags &= ~MS_RDONLY;
@@ -1337,7 +1347,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
1337 } 1347 }
1338 UFS_SB(sb)->s_mount_opt = new_mount_opt; 1348 UFS_SB(sb)->s_mount_opt = new_mount_opt;
1339 unlock_super(sb); 1349 unlock_super(sb);
1340 unlock_kernel(); 1350 unlock_ufs(sb);
1341 return 0; 1351 return 0;
1342} 1352}
1343 1353
@@ -1371,7 +1381,7 @@ static int ufs_statfs(struct dentry *dentry, struct kstatfs *buf)
1371 struct ufs_super_block_third *usb3; 1381 struct ufs_super_block_third *usb3;
1372 u64 id = huge_encode_dev(sb->s_bdev->bd_dev); 1382 u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
1373 1383
1374 lock_kernel(); 1384 lock_ufs(sb);
1375 1385
1376 usb1 = ubh_get_usb_first(uspi); 1386 usb1 = ubh_get_usb_first(uspi);
1377 usb2 = ubh_get_usb_second(uspi); 1387 usb2 = ubh_get_usb_second(uspi);
@@ -1395,7 +1405,7 @@ static int ufs_statfs(struct dentry *dentry, struct kstatfs *buf)
1395 buf->f_fsid.val[0] = (u32)id; 1405 buf->f_fsid.val[0] = (u32)id;
1396 buf->f_fsid.val[1] = (u32)(id >> 32); 1406 buf->f_fsid.val[1] = (u32)(id >> 32);
1397 1407
1398 unlock_kernel(); 1408 unlock_ufs(sb);
1399 1409
1400 return 0; 1410 return 0;
1401} 1411}
@@ -1405,7 +1415,7 @@ static struct kmem_cache * ufs_inode_cachep;
1405static struct inode *ufs_alloc_inode(struct super_block *sb) 1415static struct inode *ufs_alloc_inode(struct super_block *sb)
1406{ 1416{
1407 struct ufs_inode_info *ei; 1417 struct ufs_inode_info *ei;
1408 ei = (struct ufs_inode_info *)kmem_cache_alloc(ufs_inode_cachep, GFP_KERNEL); 1418 ei = (struct ufs_inode_info *)kmem_cache_alloc(ufs_inode_cachep, GFP_NOFS);
1409 if (!ei) 1419 if (!ei)
1410 return NULL; 1420 return NULL;
1411 ei->vfs_inode.i_version = 1; 1421 ei->vfs_inode.i_version = 1;
diff --git a/fs/ufs/truncate.c b/fs/ufs/truncate.c
index a58f9155fc9a..11014302c9ca 100644
--- a/fs/ufs/truncate.c
+++ b/fs/ufs/truncate.c
@@ -40,7 +40,6 @@
40#include <linux/time.h> 40#include <linux/time.h>
41#include <linux/stat.h> 41#include <linux/stat.h>
42#include <linux/string.h> 42#include <linux/string.h>
43#include <linux/smp_lock.h>
44#include <linux/buffer_head.h> 43#include <linux/buffer_head.h>
45#include <linux/blkdev.h> 44#include <linux/blkdev.h>
46#include <linux/sched.h> 45#include <linux/sched.h>
@@ -467,7 +466,6 @@ int ufs_truncate(struct inode *inode, loff_t old_i_size)
467 466
468 block_truncate_page(inode->i_mapping, inode->i_size, ufs_getfrag_block); 467 block_truncate_page(inode->i_mapping, inode->i_size, ufs_getfrag_block);
469 468
470 lock_kernel();
471 while (1) { 469 while (1) {
472 retry = ufs_trunc_direct(inode); 470 retry = ufs_trunc_direct(inode);
473 retry |= ufs_trunc_indirect(inode, UFS_IND_BLOCK, 471 retry |= ufs_trunc_indirect(inode, UFS_IND_BLOCK,
@@ -481,13 +479,12 @@ int ufs_truncate(struct inode *inode, loff_t old_i_size)
481 break; 479 break;
482 if (IS_SYNC(inode) && (inode->i_state & I_DIRTY)) 480 if (IS_SYNC(inode) && (inode->i_state & I_DIRTY))
483 ufs_sync_inode (inode); 481 ufs_sync_inode (inode);
484 blk_run_address_space(inode->i_mapping); 482 blk_flush_plug(current);
485 yield(); 483 yield();
486 } 484 }
487 485
488 inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC; 486 inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
489 ufsi->i_lastfrag = DIRECT_FRAGMENT; 487 ufsi->i_lastfrag = DIRECT_FRAGMENT;
490 unlock_kernel();
491 mark_inode_dirty(inode); 488 mark_inode_dirty(inode);
492out: 489out:
493 UFSD("EXIT: err %d\n", err); 490 UFSD("EXIT: err %d\n", err);
@@ -510,7 +507,9 @@ int ufs_setattr(struct dentry *dentry, struct iattr *attr)
510 /* XXX(truncate): truncate_setsize should be called last */ 507 /* XXX(truncate): truncate_setsize should be called last */
511 truncate_setsize(inode, attr->ia_size); 508 truncate_setsize(inode, attr->ia_size);
512 509
510 lock_ufs(inode->i_sb);
513 error = ufs_truncate(inode, old_i_size); 511 error = ufs_truncate(inode, old_i_size);
512 unlock_ufs(inode->i_sb);
514 if (error) 513 if (error)
515 return error; 514 return error;
516 } 515 }
diff --git a/fs/ufs/ufs.h b/fs/ufs/ufs.h
index c08782e1b48a..5be2755dd715 100644
--- a/fs/ufs/ufs.h
+++ b/fs/ufs/ufs.h
@@ -18,6 +18,8 @@ struct ufs_sb_info {
18 unsigned s_cgno[UFS_MAX_GROUP_LOADED]; 18 unsigned s_cgno[UFS_MAX_GROUP_LOADED];
19 unsigned short s_cg_loaded; 19 unsigned short s_cg_loaded;
20 unsigned s_mount_opt; 20 unsigned s_mount_opt;
21 struct mutex mutex;
22 struct task_struct *mutex_owner;
21}; 23};
22 24
23struct ufs_inode_info { 25struct ufs_inode_info {
@@ -109,7 +111,6 @@ extern struct inode *ufs_iget(struct super_block *, unsigned long);
109extern int ufs_write_inode (struct inode *, struct writeback_control *); 111extern int ufs_write_inode (struct inode *, struct writeback_control *);
110extern int ufs_sync_inode (struct inode *); 112extern int ufs_sync_inode (struct inode *);
111extern void ufs_evict_inode (struct inode *); 113extern void ufs_evict_inode (struct inode *);
112extern struct buffer_head * ufs_bread (struct inode *, unsigned, int, int *);
113extern int ufs_getfrag_block (struct inode *inode, sector_t fragment, struct buffer_head *bh_result, int create); 114extern int ufs_getfrag_block (struct inode *inode, sector_t fragment, struct buffer_head *bh_result, int create);
114 115
115/* namei.c */ 116/* namei.c */
@@ -154,4 +155,7 @@ static inline u32 ufs_dtogd(struct ufs_sb_private_info * uspi, u64 b)
154 return do_div(b, uspi->s_fpg); 155 return do_div(b, uspi->s_fpg);
155} 156}
156 157
158extern void lock_ufs(struct super_block *sb);
159extern void unlock_ufs(struct super_block *sb);
160
157#endif /* _UFS_UFS_H */ 161#endif /* _UFS_UFS_H */
diff --git a/fs/ufs/util.c b/fs/ufs/util.c
index d2c36d53fe66..95425b59ce0a 100644
--- a/fs/ufs/util.c
+++ b/fs/ufs/util.c
@@ -27,7 +27,7 @@ struct ufs_buffer_head * _ubh_bread_ (struct ufs_sb_private_info * uspi,
27 if (count > UFS_MAXFRAG) 27 if (count > UFS_MAXFRAG)
28 return NULL; 28 return NULL;
29 ubh = (struct ufs_buffer_head *) 29 ubh = (struct ufs_buffer_head *)
30 kmalloc (sizeof (struct ufs_buffer_head), GFP_KERNEL); 30 kmalloc (sizeof (struct ufs_buffer_head), GFP_NOFS);
31 if (!ubh) 31 if (!ubh)
32 return NULL; 32 return NULL;
33 ubh->fragment = fragment; 33 ubh->fragment = fragment;
diff --git a/fs/ufs/util.h b/fs/ufs/util.h
index 9f8775ce381c..954175928240 100644
--- a/fs/ufs/util.h
+++ b/fs/ufs/util.h
@@ -408,7 +408,7 @@ static inline unsigned _ubh_find_next_zero_bit_(
408 for (;;) { 408 for (;;) {
409 count = min_t(unsigned int, size + offset, uspi->s_bpf); 409 count = min_t(unsigned int, size + offset, uspi->s_bpf);
410 size -= count - offset; 410 size -= count - offset;
411 pos = ext2_find_next_zero_bit (ubh->bh[base]->b_data, count, offset); 411 pos = find_next_zero_bit_le(ubh->bh[base]->b_data, count, offset);
412 if (pos < count || !size) 412 if (pos < count || !size)
413 break; 413 break;
414 base++; 414 base++;
diff --git a/fs/utimes.c b/fs/utimes.c
index 179b58690657..ba653f3dc1bc 100644
--- a/fs/utimes.c
+++ b/fs/utimes.c
@@ -95,7 +95,7 @@ static int utimes_common(struct path *path, struct timespec *times)
95 if (IS_IMMUTABLE(inode)) 95 if (IS_IMMUTABLE(inode))
96 goto mnt_drop_write_and_out; 96 goto mnt_drop_write_and_out;
97 97
98 if (!is_owner_or_cap(inode)) { 98 if (!inode_owner_or_capable(inode)) {
99 error = inode_permission(inode, MAY_WRITE); 99 error = inode_permission(inode, MAY_WRITE);
100 if (error) 100 if (error)
101 goto mnt_drop_write_and_out; 101 goto mnt_drop_write_and_out;
diff --git a/fs/xattr.c b/fs/xattr.c
index 01bb8135e14a..a19acdb81cd1 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -59,7 +59,7 @@ xattr_permission(struct inode *inode, const char *name, int mask)
59 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode)) 59 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
60 return -EPERM; 60 return -EPERM;
61 if (S_ISDIR(inode->i_mode) && (inode->i_mode & S_ISVTX) && 61 if (S_ISDIR(inode->i_mode) && (inode->i_mode & S_ISVTX) &&
62 (mask & MAY_WRITE) && !is_owner_or_cap(inode)) 62 (mask & MAY_WRITE) && !inode_owner_or_capable(inode))
63 return -EPERM; 63 return -EPERM;
64 } 64 }
65 65
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index faca44997099..284a7c89697e 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -16,14 +16,11 @@
16# Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 16# Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17# 17#
18 18
19EXTRA_CFLAGS += -I$(src) -I$(src)/linux-2.6 19ccflags-y := -I$(src) -I$(src)/linux-2.6
20ccflags-$(CONFIG_XFS_DEBUG) += -g
20 21
21XFS_LINUX := linux-2.6 22XFS_LINUX := linux-2.6
22 23
23ifeq ($(CONFIG_XFS_DEBUG),y)
24 EXTRA_CFLAGS += -g
25endif
26
27obj-$(CONFIG_XFS_FS) += xfs.o 24obj-$(CONFIG_XFS_FS) += xfs.o
28 25
29xfs-y += linux-2.6/xfs_trace.o 26xfs-y += linux-2.6/xfs_trace.o
@@ -105,11 +102,10 @@ xfs-y += $(addprefix $(XFS_LINUX)/, \
105 xfs_globals.o \ 102 xfs_globals.o \
106 xfs_ioctl.o \ 103 xfs_ioctl.o \
107 xfs_iops.o \ 104 xfs_iops.o \
105 xfs_message.o \
108 xfs_super.o \ 106 xfs_super.o \
109 xfs_sync.o \ 107 xfs_sync.o \
110 xfs_xattr.o) 108 xfs_xattr.o)
111 109
112# Objects in support/ 110# Objects in support/
113xfs-y += $(addprefix support/, \ 111xfs-y += support/uuid.o
114 debug.o \
115 uuid.o)
diff --git a/fs/xfs/linux-2.6/kmem.c b/fs/xfs/linux-2.6/kmem.c
index 666c9db48eb6..a907de565db3 100644
--- a/fs/xfs/linux-2.6/kmem.c
+++ b/fs/xfs/linux-2.6/kmem.c
@@ -23,6 +23,7 @@
23#include <linux/backing-dev.h> 23#include <linux/backing-dev.h>
24#include "time.h" 24#include "time.h"
25#include "kmem.h" 25#include "kmem.h"
26#include "xfs_message.h"
26 27
27/* 28/*
28 * Greedy allocation. May fail and may return vmalloced memory. 29 * Greedy allocation. May fail and may return vmalloced memory.
@@ -56,8 +57,8 @@ kmem_alloc(size_t size, unsigned int __nocast flags)
56 if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP))) 57 if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP)))
57 return ptr; 58 return ptr;
58 if (!(++retries % 100)) 59 if (!(++retries % 100))
59 printk(KERN_ERR "XFS: possible memory allocation " 60 xfs_err(NULL,
60 "deadlock in %s (mode:0x%x)\n", 61 "possible memory allocation deadlock in %s (mode:0x%x)",
61 __func__, lflags); 62 __func__, lflags);
62 congestion_wait(BLK_RW_ASYNC, HZ/50); 63 congestion_wait(BLK_RW_ASYNC, HZ/50);
63 } while (1); 64 } while (1);
@@ -112,8 +113,8 @@ kmem_zone_alloc(kmem_zone_t *zone, unsigned int __nocast flags)
112 if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP))) 113 if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP)))
113 return ptr; 114 return ptr;
114 if (!(++retries % 100)) 115 if (!(++retries % 100))
115 printk(KERN_ERR "XFS: possible memory allocation " 116 xfs_err(NULL,
116 "deadlock in %s (mode:0x%x)\n", 117 "possible memory allocation deadlock in %s (mode:0x%x)",
117 __func__, lflags); 118 __func__, lflags);
118 congestion_wait(BLK_RW_ASYNC, HZ/50); 119 congestion_wait(BLK_RW_ASYNC, HZ/50);
119 } while (1); 120 } while (1);
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index ec7bbb5645b6..52dbd14260ba 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -413,8 +413,7 @@ xfs_submit_ioend_bio(
413 if (xfs_ioend_new_eof(ioend)) 413 if (xfs_ioend_new_eof(ioend))
414 xfs_mark_inode_dirty(XFS_I(ioend->io_inode)); 414 xfs_mark_inode_dirty(XFS_I(ioend->io_inode));
415 415
416 submit_bio(wbc->sync_mode == WB_SYNC_ALL ? 416 submit_bio(wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE, bio);
417 WRITE_SYNC_PLUG : WRITE, bio);
418} 417}
419 418
420STATIC struct bio * 419STATIC struct bio *
@@ -854,7 +853,7 @@ xfs_aops_discard_page(
854 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) 853 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
855 goto out_invalidate; 854 goto out_invalidate;
856 855
857 xfs_fs_cmn_err(CE_ALERT, ip->i_mount, 856 xfs_alert(ip->i_mount,
858 "page discard on page %p, inode 0x%llx, offset %llu.", 857 "page discard on page %p, inode 0x%llx, offset %llu.",
859 page, ip->i_ino, offset); 858 page, ip->i_ino, offset);
860 859
@@ -872,7 +871,7 @@ xfs_aops_discard_page(
872 if (error) { 871 if (error) {
873 /* something screwed, just bail */ 872 /* something screwed, just bail */
874 if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { 873 if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
875 xfs_fs_cmn_err(CE_ALERT, ip->i_mount, 874 xfs_alert(ip->i_mount,
876 "page discard unable to remove delalloc mapping."); 875 "page discard unable to remove delalloc mapping.");
877 } 876 }
878 break; 877 break;
@@ -1411,7 +1410,7 @@ xfs_vm_write_failed(
1411 if (error) { 1410 if (error) {
1412 /* something screwed, just bail */ 1411 /* something screwed, just bail */
1413 if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { 1412 if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
1414 xfs_fs_cmn_err(CE_ALERT, ip->i_mount, 1413 xfs_alert(ip->i_mount,
1415 "xfs_vm_write_failed: unable to clean up ino %lld", 1414 "xfs_vm_write_failed: unable to clean up ino %lld",
1416 ip->i_ino); 1415 ip->i_ino);
1417 } 1416 }
@@ -1495,7 +1494,6 @@ const struct address_space_operations xfs_address_space_operations = {
1495 .readpages = xfs_vm_readpages, 1494 .readpages = xfs_vm_readpages,
1496 .writepage = xfs_vm_writepage, 1495 .writepage = xfs_vm_writepage,
1497 .writepages = xfs_vm_writepages, 1496 .writepages = xfs_vm_writepages,
1498 .sync_page = block_sync_page,
1499 .releasepage = xfs_vm_releasepage, 1497 .releasepage = xfs_vm_releasepage,
1500 .invalidatepage = xfs_vm_invalidatepage, 1498 .invalidatepage = xfs_vm_invalidatepage,
1501 .write_begin = xfs_vm_write_begin, 1499 .write_begin = xfs_vm_write_begin,
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index ac1c7e8378dd..c05324d3282c 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -401,9 +401,8 @@ _xfs_buf_lookup_pages(
401 * handle buffer allocation failures we can't do much. 401 * handle buffer allocation failures we can't do much.
402 */ 402 */
403 if (!(++retries % 100)) 403 if (!(++retries % 100))
404 printk(KERN_ERR 404 xfs_err(NULL,
405 "XFS: possible memory allocation " 405 "possible memory allocation deadlock in %s (mode:0x%x)",
406 "deadlock in %s (mode:0x%x)\n",
407 __func__, gfp_mask); 406 __func__, gfp_mask);
408 407
409 XFS_STATS_INC(xb_page_retries); 408 XFS_STATS_INC(xb_page_retries);
@@ -615,8 +614,8 @@ xfs_buf_get(
615 if (!(bp->b_flags & XBF_MAPPED)) { 614 if (!(bp->b_flags & XBF_MAPPED)) {
616 error = _xfs_buf_map_pages(bp, flags); 615 error = _xfs_buf_map_pages(bp, flags);
617 if (unlikely(error)) { 616 if (unlikely(error)) {
618 printk(KERN_WARNING "%s: failed to map pages\n", 617 xfs_warn(target->bt_mount,
619 __func__); 618 "%s: failed to map pages\n", __func__);
620 goto no_buffer; 619 goto no_buffer;
621 } 620 }
622 } 621 }
@@ -850,8 +849,8 @@ xfs_buf_get_uncached(
850 849
851 error = _xfs_buf_map_pages(bp, XBF_MAPPED); 850 error = _xfs_buf_map_pages(bp, XBF_MAPPED);
852 if (unlikely(error)) { 851 if (unlikely(error)) {
853 printk(KERN_WARNING "%s: failed to map pages\n", 852 xfs_warn(target->bt_mount,
854 __func__); 853 "%s: failed to map pages\n", __func__);
855 goto fail_free_mem; 854 goto fail_free_mem;
856 } 855 }
857 856
@@ -991,7 +990,7 @@ xfs_buf_lock(
991 if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE)) 990 if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE))
992 xfs_log_force(bp->b_target->bt_mount, 0); 991 xfs_log_force(bp->b_target->bt_mount, 0);
993 if (atomic_read(&bp->b_io_remaining)) 992 if (atomic_read(&bp->b_io_remaining))
994 blk_run_address_space(bp->b_target->bt_mapping); 993 blk_flush_plug(current);
995 down(&bp->b_sema); 994 down(&bp->b_sema);
996 XB_SET_OWNER(bp); 995 XB_SET_OWNER(bp);
997 996
@@ -1035,9 +1034,7 @@ xfs_buf_wait_unpin(
1035 set_current_state(TASK_UNINTERRUPTIBLE); 1034 set_current_state(TASK_UNINTERRUPTIBLE);
1036 if (atomic_read(&bp->b_pin_count) == 0) 1035 if (atomic_read(&bp->b_pin_count) == 0)
1037 break; 1036 break;
1038 if (atomic_read(&bp->b_io_remaining)) 1037 io_schedule();
1039 blk_run_address_space(bp->b_target->bt_mapping);
1040 schedule();
1041 } 1038 }
1042 remove_wait_queue(&bp->b_waiters, &wait); 1039 remove_wait_queue(&bp->b_waiters, &wait);
1043 set_current_state(TASK_RUNNING); 1040 set_current_state(TASK_RUNNING);
@@ -1443,7 +1440,7 @@ xfs_buf_iowait(
1443 trace_xfs_buf_iowait(bp, _RET_IP_); 1440 trace_xfs_buf_iowait(bp, _RET_IP_);
1444 1441
1445 if (atomic_read(&bp->b_io_remaining)) 1442 if (atomic_read(&bp->b_io_remaining))
1446 blk_run_address_space(bp->b_target->bt_mapping); 1443 blk_flush_plug(current);
1447 wait_for_completion(&bp->b_iowait); 1444 wait_for_completion(&bp->b_iowait);
1448 1445
1449 trace_xfs_buf_iowait_done(bp, _RET_IP_); 1446 trace_xfs_buf_iowait_done(bp, _RET_IP_);
@@ -1617,8 +1614,8 @@ xfs_setsize_buftarg_flags(
1617 btp->bt_smask = sectorsize - 1; 1614 btp->bt_smask = sectorsize - 1;
1618 1615
1619 if (set_blocksize(btp->bt_bdev, sectorsize)) { 1616 if (set_blocksize(btp->bt_bdev, sectorsize)) {
1620 printk(KERN_WARNING 1617 xfs_warn(btp->bt_mount,
1621 "XFS: Cannot set_blocksize to %u on device %s\n", 1618 "Cannot set_blocksize to %u on device %s\n",
1622 sectorsize, XFS_BUFTARG_NAME(btp)); 1619 sectorsize, XFS_BUFTARG_NAME(btp));
1623 return EINVAL; 1620 return EINVAL;
1624 } 1621 }
@@ -1667,7 +1664,6 @@ xfs_mapping_buftarg(
1667 struct inode *inode; 1664 struct inode *inode;
1668 struct address_space *mapping; 1665 struct address_space *mapping;
1669 static const struct address_space_operations mapping_aops = { 1666 static const struct address_space_operations mapping_aops = {
1670 .sync_page = block_sync_page,
1671 .migratepage = fail_migrate_page, 1667 .migratepage = fail_migrate_page,
1672 }; 1668 };
1673 1669
@@ -1948,7 +1944,7 @@ xfsbufd(
1948 count++; 1944 count++;
1949 } 1945 }
1950 if (count) 1946 if (count)
1951 blk_run_address_space(target->bt_mapping); 1947 blk_flush_plug(current);
1952 1948
1953 } while (!kthread_should_stop()); 1949 } while (!kthread_should_stop());
1954 1950
@@ -1996,7 +1992,7 @@ xfs_flush_buftarg(
1996 1992
1997 if (wait) { 1993 if (wait) {
1998 /* Expedite and wait for IO to complete. */ 1994 /* Expedite and wait for IO to complete. */
1999 blk_run_address_space(target->bt_mapping); 1995 blk_flush_plug(current);
2000 while (!list_empty(&wait_list)) { 1996 while (!list_empty(&wait_list)) {
2001 bp = list_first_entry(&wait_list, struct xfs_buf, b_list); 1997 bp = list_first_entry(&wait_list, struct xfs_buf, b_list);
2002 1998
@@ -2022,11 +2018,12 @@ xfs_buf_init(void)
2022 if (!xfslogd_workqueue) 2018 if (!xfslogd_workqueue)
2023 goto out_free_buf_zone; 2019 goto out_free_buf_zone;
2024 2020
2025 xfsdatad_workqueue = create_workqueue("xfsdatad"); 2021 xfsdatad_workqueue = alloc_workqueue("xfsdatad", WQ_MEM_RECLAIM, 1);
2026 if (!xfsdatad_workqueue) 2022 if (!xfsdatad_workqueue)
2027 goto out_destroy_xfslogd_workqueue; 2023 goto out_destroy_xfslogd_workqueue;
2028 2024
2029 xfsconvertd_workqueue = create_workqueue("xfsconvertd"); 2025 xfsconvertd_workqueue = alloc_workqueue("xfsconvertd",
2026 WQ_MEM_RECLAIM, 1);
2030 if (!xfsconvertd_workqueue) 2027 if (!xfsconvertd_workqueue)
2031 goto out_destroy_xfsdatad_workqueue; 2028 goto out_destroy_xfsdatad_workqueue;
2032 2029
diff --git a/fs/xfs/linux-2.6/xfs_discard.c b/fs/xfs/linux-2.6/xfs_discard.c
index 05201ae719e5..d61611c88012 100644
--- a/fs/xfs/linux-2.6/xfs_discard.c
+++ b/fs/xfs/linux-2.6/xfs_discard.c
@@ -152,6 +152,8 @@ xfs_ioc_trim(
152 152
153 if (!capable(CAP_SYS_ADMIN)) 153 if (!capable(CAP_SYS_ADMIN))
154 return -XFS_ERROR(EPERM); 154 return -XFS_ERROR(EPERM);
155 if (!blk_queue_discard(q))
156 return -XFS_ERROR(EOPNOTSUPP);
155 if (copy_from_user(&range, urange, sizeof(range))) 157 if (copy_from_user(&range, urange, sizeof(range)))
156 return -XFS_ERROR(EFAULT); 158 return -XFS_ERROR(EFAULT);
157 159
diff --git a/fs/xfs/linux-2.6/xfs_export.c b/fs/xfs/linux-2.6/xfs_export.c
index fc0114da7fdd..f4f878fc0083 100644
--- a/fs/xfs/linux-2.6/xfs_export.c
+++ b/fs/xfs/linux-2.6/xfs_export.c
@@ -89,8 +89,10 @@ xfs_fs_encode_fh(
89 * seven combinations work. The real answer is "don't use v2". 89 * seven combinations work. The real answer is "don't use v2".
90 */ 90 */
91 len = xfs_fileid_length(fileid_type); 91 len = xfs_fileid_length(fileid_type);
92 if (*max_len < len) 92 if (*max_len < len) {
93 *max_len = len;
93 return 255; 94 return 255;
95 }
94 *max_len = len; 96 *max_len = len;
95 97
96 switch (fileid_type) { 98 switch (fileid_type) {
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index b06ede1d0bed..0ca0e3c024d7 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -695,14 +695,19 @@ xfs_ioc_fsgeometry_v1(
695 xfs_mount_t *mp, 695 xfs_mount_t *mp,
696 void __user *arg) 696 void __user *arg)
697{ 697{
698 xfs_fsop_geom_v1_t fsgeo; 698 xfs_fsop_geom_t fsgeo;
699 int error; 699 int error;
700 700
701 error = xfs_fs_geometry(mp, (xfs_fsop_geom_t *)&fsgeo, 3); 701 error = xfs_fs_geometry(mp, &fsgeo, 3);
702 if (error) 702 if (error)
703 return -error; 703 return -error;
704 704
705 if (copy_to_user(arg, &fsgeo, sizeof(fsgeo))) 705 /*
706 * Caller should have passed an argument of type
707 * xfs_fsop_geom_v1_t. This is a proper subset of the
708 * xfs_fsop_geom_t that xfs_fs_geometry() fills in.
709 */
710 if (copy_to_user(arg, &fsgeo, sizeof(xfs_fsop_geom_v1_t)))
706 return -XFS_ERROR(EFAULT); 711 return -XFS_ERROR(EFAULT);
707 return 0; 712 return 0;
708} 713}
@@ -985,10 +990,22 @@ xfs_ioctl_setattr(
985 990
986 /* 991 /*
987 * Extent size must be a multiple of the appropriate block 992 * Extent size must be a multiple of the appropriate block
988 * size, if set at all. 993 * size, if set at all. It must also be smaller than the
994 * maximum extent size supported by the filesystem.
995 *
996 * Also, for non-realtime files, limit the extent size hint to
997 * half the size of the AGs in the filesystem so alignment
998 * doesn't result in extents larger than an AG.
989 */ 999 */
990 if (fa->fsx_extsize != 0) { 1000 if (fa->fsx_extsize != 0) {
991 xfs_extlen_t size; 1001 xfs_extlen_t size;
1002 xfs_fsblock_t extsize_fsb;
1003
1004 extsize_fsb = XFS_B_TO_FSB(mp, fa->fsx_extsize);
1005 if (extsize_fsb > MAXEXTLEN) {
1006 code = XFS_ERROR(EINVAL);
1007 goto error_return;
1008 }
992 1009
993 if (XFS_IS_REALTIME_INODE(ip) || 1010 if (XFS_IS_REALTIME_INODE(ip) ||
994 ((mask & FSX_XFLAGS) && 1011 ((mask & FSX_XFLAGS) &&
@@ -997,6 +1014,10 @@ xfs_ioctl_setattr(
997 mp->m_sb.sb_blocklog; 1014 mp->m_sb.sb_blocklog;
998 } else { 1015 } else {
999 size = mp->m_sb.sb_blocksize; 1016 size = mp->m_sb.sb_blocksize;
1017 if (extsize_fsb > mp->m_sb.sb_agblocks / 2) {
1018 code = XFS_ERROR(EINVAL);
1019 goto error_return;
1020 }
1000 } 1021 }
1001 1022
1002 if (fa->fsx_extsize % size) { 1023 if (fa->fsx_extsize % size) {
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index bd5727852fd6..9ff7fc603d2f 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -102,7 +102,8 @@ xfs_mark_inode_dirty(
102STATIC int 102STATIC int
103xfs_init_security( 103xfs_init_security(
104 struct inode *inode, 104 struct inode *inode,
105 struct inode *dir) 105 struct inode *dir,
106 const struct qstr *qstr)
106{ 107{
107 struct xfs_inode *ip = XFS_I(inode); 108 struct xfs_inode *ip = XFS_I(inode);
108 size_t length; 109 size_t length;
@@ -110,7 +111,7 @@ xfs_init_security(
110 unsigned char *name; 111 unsigned char *name;
111 int error; 112 int error;
112 113
113 error = security_inode_init_security(inode, dir, (char **)&name, 114 error = security_inode_init_security(inode, dir, qstr, (char **)&name,
114 &value, &length); 115 &value, &length);
115 if (error) { 116 if (error) {
116 if (error == -EOPNOTSUPP) 117 if (error == -EOPNOTSUPP)
@@ -194,7 +195,7 @@ xfs_vn_mknod(
194 195
195 inode = VFS_I(ip); 196 inode = VFS_I(ip);
196 197
197 error = xfs_init_security(inode, dir); 198 error = xfs_init_security(inode, dir, &dentry->d_name);
198 if (unlikely(error)) 199 if (unlikely(error))
199 goto out_cleanup_inode; 200 goto out_cleanup_inode;
200 201
@@ -367,7 +368,7 @@ xfs_vn_symlink(
367 368
368 inode = VFS_I(cip); 369 inode = VFS_I(cip);
369 370
370 error = xfs_init_security(inode, dir); 371 error = xfs_init_security(inode, dir, &dentry->d_name);
371 if (unlikely(error)) 372 if (unlikely(error))
372 goto out_cleanup_inode; 373 goto out_cleanup_inode;
373 374
diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h
index 096494997747..244be9cbfe78 100644
--- a/fs/xfs/linux-2.6/xfs_linux.h
+++ b/fs/xfs/linux-2.6/xfs_linux.h
@@ -39,7 +39,6 @@
39#include <mrlock.h> 39#include <mrlock.h>
40#include <time.h> 40#include <time.h>
41 41
42#include <support/debug.h>
43#include <support/uuid.h> 42#include <support/uuid.h>
44 43
45#include <linux/semaphore.h> 44#include <linux/semaphore.h>
@@ -86,6 +85,7 @@
86#include <xfs_aops.h> 85#include <xfs_aops.h>
87#include <xfs_super.h> 86#include <xfs_super.h>
88#include <xfs_buf.h> 87#include <xfs_buf.h>
88#include <xfs_message.h>
89 89
90/* 90/*
91 * Feature macros (disable/enable) 91 * Feature macros (disable/enable)
@@ -280,4 +280,25 @@ static inline __uint64_t howmany_64(__uint64_t x, __uint32_t y)
280#define __arch_pack 280#define __arch_pack
281#endif 281#endif
282 282
283#define ASSERT_ALWAYS(expr) \
284 (unlikely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__))
285
286#ifndef DEBUG
287#define ASSERT(expr) ((void)0)
288
289#ifndef STATIC
290# define STATIC static noinline
291#endif
292
293#else /* DEBUG */
294
295#define ASSERT(expr) \
296 (unlikely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__))
297
298#ifndef STATIC
299# define STATIC noinline
300#endif
301
302#endif /* DEBUG */
303
283#endif /* __XFS_LINUX__ */ 304#endif /* __XFS_LINUX__ */
diff --git a/fs/xfs/linux-2.6/xfs_message.c b/fs/xfs/linux-2.6/xfs_message.c
new file mode 100644
index 000000000000..508e06fd7d1e
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_message.c
@@ -0,0 +1,133 @@
1/*
2 * Copyright (c) 2011 Red Hat, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 * GNU General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public License
14 * along with this program; if not, write the Free Software Foundation,
15 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
16 */
17
18#include "xfs.h"
19#include "xfs_fs.h"
20#include "xfs_types.h"
21#include "xfs_log.h"
22#include "xfs_inum.h"
23#include "xfs_trans.h"
24#include "xfs_sb.h"
25#include "xfs_ag.h"
26#include "xfs_mount.h"
27
28/*
29 * XFS logging functions
30 */
31static int
32__xfs_printk(
33 const char *level,
34 const struct xfs_mount *mp,
35 struct va_format *vaf)
36{
37 if (mp && mp->m_fsname)
38 return printk("%sXFS (%s): %pV\n", level, mp->m_fsname, vaf);
39 return printk("%sXFS: %pV\n", level, vaf);
40}
41
42int xfs_printk(
43 const char *level,
44 const struct xfs_mount *mp,
45 const char *fmt, ...)
46{
47 struct va_format vaf;
48 va_list args;
49 int r;
50
51 va_start(args, fmt);
52
53 vaf.fmt = fmt;
54 vaf.va = &args;
55
56 r = __xfs_printk(level, mp, &vaf);
57 va_end(args);
58
59 return r;
60}
61
62#define define_xfs_printk_level(func, kern_level) \
63int func(const struct xfs_mount *mp, const char *fmt, ...) \
64{ \
65 struct va_format vaf; \
66 va_list args; \
67 int r; \
68 \
69 va_start(args, fmt); \
70 \
71 vaf.fmt = fmt; \
72 vaf.va = &args; \
73 \
74 r = __xfs_printk(kern_level, mp, &vaf); \
75 va_end(args); \
76 \
77 return r; \
78} \
79
80define_xfs_printk_level(xfs_emerg, KERN_EMERG);
81define_xfs_printk_level(xfs_alert, KERN_ALERT);
82define_xfs_printk_level(xfs_crit, KERN_CRIT);
83define_xfs_printk_level(xfs_err, KERN_ERR);
84define_xfs_printk_level(xfs_warn, KERN_WARNING);
85define_xfs_printk_level(xfs_notice, KERN_NOTICE);
86define_xfs_printk_level(xfs_info, KERN_INFO);
87#ifdef DEBUG
88define_xfs_printk_level(xfs_debug, KERN_DEBUG);
89#endif
90
91int
92xfs_alert_tag(
93 const struct xfs_mount *mp,
94 int panic_tag,
95 const char *fmt, ...)
96{
97 struct va_format vaf;
98 va_list args;
99 int do_panic = 0;
100 int r;
101
102 if (xfs_panic_mask && (xfs_panic_mask & panic_tag)) {
103 xfs_printk(KERN_ALERT, mp,
104 "XFS: Transforming an alert into a BUG.");
105 do_panic = 1;
106 }
107
108 va_start(args, fmt);
109
110 vaf.fmt = fmt;
111 vaf.va = &args;
112
113 r = __xfs_printk(KERN_ALERT, mp, &vaf);
114 va_end(args);
115
116 BUG_ON(do_panic);
117
118 return r;
119}
120
121void
122assfail(char *expr, char *file, int line)
123{
124 xfs_emerg(NULL, "Assertion failed: %s, file: %s, line: %d",
125 expr, file, line);
126 BUG();
127}
128
129void
130xfs_hex_dump(void *p, int length)
131{
132 print_hex_dump(KERN_ALERT, "", DUMP_PREFIX_ADDRESS, 16, 1, p, length, 1);
133}
diff --git a/fs/xfs/linux-2.6/xfs_message.h b/fs/xfs/linux-2.6/xfs_message.h
new file mode 100644
index 000000000000..e77ffa16745b
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_message.h
@@ -0,0 +1,38 @@
1#ifndef __XFS_MESSAGE_H
2#define __XFS_MESSAGE_H 1
3
4struct xfs_mount;
5
6extern int xfs_printk(const char *level, const struct xfs_mount *mp,
7 const char *fmt, ...)
8 __attribute__ ((format (printf, 3, 4)));
9extern int xfs_emerg(const struct xfs_mount *mp, const char *fmt, ...)
10 __attribute__ ((format (printf, 2, 3)));
11extern int xfs_alert(const struct xfs_mount *mp, const char *fmt, ...)
12 __attribute__ ((format (printf, 2, 3)));
13extern int xfs_alert_tag(const struct xfs_mount *mp, int tag,
14 const char *fmt, ...)
15 __attribute__ ((format (printf, 3, 4)));
16extern int xfs_crit(const struct xfs_mount *mp, const char *fmt, ...)
17 __attribute__ ((format (printf, 2, 3)));
18extern int xfs_err(const struct xfs_mount *mp, const char *fmt, ...)
19 __attribute__ ((format (printf, 2, 3)));
20extern int xfs_warn(const struct xfs_mount *mp, const char *fmt, ...)
21 __attribute__ ((format (printf, 2, 3)));
22extern int xfs_notice(const struct xfs_mount *mp, const char *fmt, ...)
23 __attribute__ ((format (printf, 2, 3)));
24extern int xfs_info(const struct xfs_mount *mp, const char *fmt, ...)
25 __attribute__ ((format (printf, 2, 3)));
26
27#ifdef DEBUG
28extern int xfs_debug(const struct xfs_mount *mp, const char *fmt, ...)
29 __attribute__ ((format (printf, 2, 3)));
30#else
31#define xfs_debug(mp, fmt, ...) (0)
32#endif
33
34extern void assfail(char *expr, char *f, int l);
35
36extern void xfs_hex_dump(void *p, int length);
37
38#endif /* __XFS_MESSAGE_H */
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 9731898083ae..818c4cf2de86 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -173,6 +173,15 @@ xfs_parseargs(
173 __uint8_t iosizelog = 0; 173 __uint8_t iosizelog = 0;
174 174
175 /* 175 /*
176 * set up the mount name first so all the errors will refer to the
177 * correct device.
178 */
179 mp->m_fsname = kstrndup(sb->s_id, MAXNAMELEN, GFP_KERNEL);
180 if (!mp->m_fsname)
181 return ENOMEM;
182 mp->m_fsname_len = strlen(mp->m_fsname) + 1;
183
184 /*
176 * Copy binary VFS mount flags we are interested in. 185 * Copy binary VFS mount flags we are interested in.
177 */ 186 */
178 if (sb->s_flags & MS_RDONLY) 187 if (sb->s_flags & MS_RDONLY)
@@ -189,6 +198,7 @@ xfs_parseargs(
189 mp->m_flags |= XFS_MOUNT_BARRIER; 198 mp->m_flags |= XFS_MOUNT_BARRIER;
190 mp->m_flags |= XFS_MOUNT_COMPAT_IOSIZE; 199 mp->m_flags |= XFS_MOUNT_COMPAT_IOSIZE;
191 mp->m_flags |= XFS_MOUNT_SMALL_INUMS; 200 mp->m_flags |= XFS_MOUNT_SMALL_INUMS;
201 mp->m_flags |= XFS_MOUNT_DELAYLOG;
192 202
193 /* 203 /*
194 * These can be overridden by the mount option parsing. 204 * These can be overridden by the mount option parsing.
@@ -207,24 +217,21 @@ xfs_parseargs(
207 217
208 if (!strcmp(this_char, MNTOPT_LOGBUFS)) { 218 if (!strcmp(this_char, MNTOPT_LOGBUFS)) {
209 if (!value || !*value) { 219 if (!value || !*value) {
210 cmn_err(CE_WARN, 220 xfs_warn(mp, "%s option requires an argument",
211 "XFS: %s option requires an argument",
212 this_char); 221 this_char);
213 return EINVAL; 222 return EINVAL;
214 } 223 }
215 mp->m_logbufs = simple_strtoul(value, &eov, 10); 224 mp->m_logbufs = simple_strtoul(value, &eov, 10);
216 } else if (!strcmp(this_char, MNTOPT_LOGBSIZE)) { 225 } else if (!strcmp(this_char, MNTOPT_LOGBSIZE)) {
217 if (!value || !*value) { 226 if (!value || !*value) {
218 cmn_err(CE_WARN, 227 xfs_warn(mp, "%s option requires an argument",
219 "XFS: %s option requires an argument",
220 this_char); 228 this_char);
221 return EINVAL; 229 return EINVAL;
222 } 230 }
223 mp->m_logbsize = suffix_strtoul(value, &eov, 10); 231 mp->m_logbsize = suffix_strtoul(value, &eov, 10);
224 } else if (!strcmp(this_char, MNTOPT_LOGDEV)) { 232 } else if (!strcmp(this_char, MNTOPT_LOGDEV)) {
225 if (!value || !*value) { 233 if (!value || !*value) {
226 cmn_err(CE_WARN, 234 xfs_warn(mp, "%s option requires an argument",
227 "XFS: %s option requires an argument",
228 this_char); 235 this_char);
229 return EINVAL; 236 return EINVAL;
230 } 237 }
@@ -232,14 +239,12 @@ xfs_parseargs(
232 if (!mp->m_logname) 239 if (!mp->m_logname)
233 return ENOMEM; 240 return ENOMEM;
234 } else if (!strcmp(this_char, MNTOPT_MTPT)) { 241 } else if (!strcmp(this_char, MNTOPT_MTPT)) {
235 cmn_err(CE_WARN, 242 xfs_warn(mp, "%s option not allowed on this system",
236 "XFS: %s option not allowed on this system",
237 this_char); 243 this_char);
238 return EINVAL; 244 return EINVAL;
239 } else if (!strcmp(this_char, MNTOPT_RTDEV)) { 245 } else if (!strcmp(this_char, MNTOPT_RTDEV)) {
240 if (!value || !*value) { 246 if (!value || !*value) {
241 cmn_err(CE_WARN, 247 xfs_warn(mp, "%s option requires an argument",
242 "XFS: %s option requires an argument",
243 this_char); 248 this_char);
244 return EINVAL; 249 return EINVAL;
245 } 250 }
@@ -248,8 +253,7 @@ xfs_parseargs(
248 return ENOMEM; 253 return ENOMEM;
249 } else if (!strcmp(this_char, MNTOPT_BIOSIZE)) { 254 } else if (!strcmp(this_char, MNTOPT_BIOSIZE)) {
250 if (!value || !*value) { 255 if (!value || !*value) {
251 cmn_err(CE_WARN, 256 xfs_warn(mp, "%s option requires an argument",
252 "XFS: %s option requires an argument",
253 this_char); 257 this_char);
254 return EINVAL; 258 return EINVAL;
255 } 259 }
@@ -257,8 +261,7 @@ xfs_parseargs(
257 iosizelog = ffs(iosize) - 1; 261 iosizelog = ffs(iosize) - 1;
258 } else if (!strcmp(this_char, MNTOPT_ALLOCSIZE)) { 262 } else if (!strcmp(this_char, MNTOPT_ALLOCSIZE)) {
259 if (!value || !*value) { 263 if (!value || !*value) {
260 cmn_err(CE_WARN, 264 xfs_warn(mp, "%s option requires an argument",
261 "XFS: %s option requires an argument",
262 this_char); 265 this_char);
263 return EINVAL; 266 return EINVAL;
264 } 267 }
@@ -280,16 +283,14 @@ xfs_parseargs(
280 mp->m_flags |= XFS_MOUNT_SWALLOC; 283 mp->m_flags |= XFS_MOUNT_SWALLOC;
281 } else if (!strcmp(this_char, MNTOPT_SUNIT)) { 284 } else if (!strcmp(this_char, MNTOPT_SUNIT)) {
282 if (!value || !*value) { 285 if (!value || !*value) {
283 cmn_err(CE_WARN, 286 xfs_warn(mp, "%s option requires an argument",
284 "XFS: %s option requires an argument",
285 this_char); 287 this_char);
286 return EINVAL; 288 return EINVAL;
287 } 289 }
288 dsunit = simple_strtoul(value, &eov, 10); 290 dsunit = simple_strtoul(value, &eov, 10);
289 } else if (!strcmp(this_char, MNTOPT_SWIDTH)) { 291 } else if (!strcmp(this_char, MNTOPT_SWIDTH)) {
290 if (!value || !*value) { 292 if (!value || !*value) {
291 cmn_err(CE_WARN, 293 xfs_warn(mp, "%s option requires an argument",
292 "XFS: %s option requires an argument",
293 this_char); 294 this_char);
294 return EINVAL; 295 return EINVAL;
295 } 296 }
@@ -297,8 +298,7 @@ xfs_parseargs(
297 } else if (!strcmp(this_char, MNTOPT_64BITINODE)) { 298 } else if (!strcmp(this_char, MNTOPT_64BITINODE)) {
298 mp->m_flags &= ~XFS_MOUNT_SMALL_INUMS; 299 mp->m_flags &= ~XFS_MOUNT_SMALL_INUMS;
299#if !XFS_BIG_INUMS 300#if !XFS_BIG_INUMS
300 cmn_err(CE_WARN, 301 xfs_warn(mp, "%s option not allowed on this system",
301 "XFS: %s option not allowed on this system",
302 this_char); 302 this_char);
303 return EINVAL; 303 return EINVAL;
304#endif 304#endif
@@ -356,20 +356,19 @@ xfs_parseargs(
356 } else if (!strcmp(this_char, MNTOPT_NODELAYLOG)) { 356 } else if (!strcmp(this_char, MNTOPT_NODELAYLOG)) {
357 mp->m_flags &= ~XFS_MOUNT_DELAYLOG; 357 mp->m_flags &= ~XFS_MOUNT_DELAYLOG;
358 } else if (!strcmp(this_char, "ihashsize")) { 358 } else if (!strcmp(this_char, "ihashsize")) {
359 cmn_err(CE_WARN, 359 xfs_warn(mp,
360 "XFS: ihashsize no longer used, option is deprecated."); 360 "ihashsize no longer used, option is deprecated.");
361 } else if (!strcmp(this_char, "osyncisdsync")) { 361 } else if (!strcmp(this_char, "osyncisdsync")) {
362 cmn_err(CE_WARN, 362 xfs_warn(mp,
363 "XFS: osyncisdsync has no effect, option is deprecated."); 363 "osyncisdsync has no effect, option is deprecated.");
364 } else if (!strcmp(this_char, "osyncisosync")) { 364 } else if (!strcmp(this_char, "osyncisosync")) {
365 cmn_err(CE_WARN, 365 xfs_warn(mp,
366 "XFS: osyncisosync has no effect, option is deprecated."); 366 "osyncisosync has no effect, option is deprecated.");
367 } else if (!strcmp(this_char, "irixsgid")) { 367 } else if (!strcmp(this_char, "irixsgid")) {
368 cmn_err(CE_WARN, 368 xfs_warn(mp,
369 "XFS: irixsgid is now a sysctl(2) variable, option is deprecated."); 369 "irixsgid is now a sysctl(2) variable, option is deprecated.");
370 } else { 370 } else {
371 cmn_err(CE_WARN, 371 xfs_warn(mp, "unknown mount option [%s].", this_char);
372 "XFS: unknown mount option [%s].", this_char);
373 return EINVAL; 372 return EINVAL;
374 } 373 }
375 } 374 }
@@ -379,40 +378,37 @@ xfs_parseargs(
379 */ 378 */
380 if ((mp->m_flags & XFS_MOUNT_NORECOVERY) && 379 if ((mp->m_flags & XFS_MOUNT_NORECOVERY) &&
381 !(mp->m_flags & XFS_MOUNT_RDONLY)) { 380 !(mp->m_flags & XFS_MOUNT_RDONLY)) {
382 cmn_err(CE_WARN, "XFS: no-recovery mounts must be read-only."); 381 xfs_warn(mp, "no-recovery mounts must be read-only.");
383 return EINVAL; 382 return EINVAL;
384 } 383 }
385 384
386 if ((mp->m_flags & XFS_MOUNT_NOALIGN) && (dsunit || dswidth)) { 385 if ((mp->m_flags & XFS_MOUNT_NOALIGN) && (dsunit || dswidth)) {
387 cmn_err(CE_WARN, 386 xfs_warn(mp,
388 "XFS: sunit and swidth options incompatible with the noalign option"); 387 "sunit and swidth options incompatible with the noalign option");
389 return EINVAL; 388 return EINVAL;
390 } 389 }
391 390
392#ifndef CONFIG_XFS_QUOTA 391#ifndef CONFIG_XFS_QUOTA
393 if (XFS_IS_QUOTA_RUNNING(mp)) { 392 if (XFS_IS_QUOTA_RUNNING(mp)) {
394 cmn_err(CE_WARN, 393 xfs_warn(mp, "quota support not available in this kernel.");
395 "XFS: quota support not available in this kernel.");
396 return EINVAL; 394 return EINVAL;
397 } 395 }
398#endif 396#endif
399 397
400 if ((mp->m_qflags & (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE)) && 398 if ((mp->m_qflags & (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE)) &&
401 (mp->m_qflags & (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE))) { 399 (mp->m_qflags & (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE))) {
402 cmn_err(CE_WARN, 400 xfs_warn(mp, "cannot mount with both project and group quota");
403 "XFS: cannot mount with both project and group quota");
404 return EINVAL; 401 return EINVAL;
405 } 402 }
406 403
407 if ((dsunit && !dswidth) || (!dsunit && dswidth)) { 404 if ((dsunit && !dswidth) || (!dsunit && dswidth)) {
408 cmn_err(CE_WARN, 405 xfs_warn(mp, "sunit and swidth must be specified together");
409 "XFS: sunit and swidth must be specified together");
410 return EINVAL; 406 return EINVAL;
411 } 407 }
412 408
413 if (dsunit && (dswidth % dsunit != 0)) { 409 if (dsunit && (dswidth % dsunit != 0)) {
414 cmn_err(CE_WARN, 410 xfs_warn(mp,
415 "XFS: stripe width (%d) must be a multiple of the stripe unit (%d)", 411 "stripe width (%d) must be a multiple of the stripe unit (%d)",
416 dswidth, dsunit); 412 dswidth, dsunit);
417 return EINVAL; 413 return EINVAL;
418 } 414 }
@@ -438,8 +434,7 @@ done:
438 mp->m_logbufs != 0 && 434 mp->m_logbufs != 0 &&
439 (mp->m_logbufs < XLOG_MIN_ICLOGS || 435 (mp->m_logbufs < XLOG_MIN_ICLOGS ||
440 mp->m_logbufs > XLOG_MAX_ICLOGS)) { 436 mp->m_logbufs > XLOG_MAX_ICLOGS)) {
441 cmn_err(CE_WARN, 437 xfs_warn(mp, "invalid logbufs value: %d [not %d-%d]",
442 "XFS: invalid logbufs value: %d [not %d-%d]",
443 mp->m_logbufs, XLOG_MIN_ICLOGS, XLOG_MAX_ICLOGS); 438 mp->m_logbufs, XLOG_MIN_ICLOGS, XLOG_MAX_ICLOGS);
444 return XFS_ERROR(EINVAL); 439 return XFS_ERROR(EINVAL);
445 } 440 }
@@ -448,22 +443,16 @@ done:
448 (mp->m_logbsize < XLOG_MIN_RECORD_BSIZE || 443 (mp->m_logbsize < XLOG_MIN_RECORD_BSIZE ||
449 mp->m_logbsize > XLOG_MAX_RECORD_BSIZE || 444 mp->m_logbsize > XLOG_MAX_RECORD_BSIZE ||
450 !is_power_of_2(mp->m_logbsize))) { 445 !is_power_of_2(mp->m_logbsize))) {
451 cmn_err(CE_WARN, 446 xfs_warn(mp,
452 "XFS: invalid logbufsize: %d [not 16k,32k,64k,128k or 256k]", 447 "invalid logbufsize: %d [not 16k,32k,64k,128k or 256k]",
453 mp->m_logbsize); 448 mp->m_logbsize);
454 return XFS_ERROR(EINVAL); 449 return XFS_ERROR(EINVAL);
455 } 450 }
456 451
457 mp->m_fsname = kstrndup(sb->s_id, MAXNAMELEN, GFP_KERNEL);
458 if (!mp->m_fsname)
459 return ENOMEM;
460 mp->m_fsname_len = strlen(mp->m_fsname) + 1;
461
462 if (iosizelog) { 452 if (iosizelog) {
463 if (iosizelog > XFS_MAX_IO_LOG || 453 if (iosizelog > XFS_MAX_IO_LOG ||
464 iosizelog < XFS_MIN_IO_LOG) { 454 iosizelog < XFS_MIN_IO_LOG) {
465 cmn_err(CE_WARN, 455 xfs_warn(mp, "invalid log iosize: %d [not %d-%d]",
466 "XFS: invalid log iosize: %d [not %d-%d]",
467 iosizelog, XFS_MIN_IO_LOG, 456 iosizelog, XFS_MIN_IO_LOG,
468 XFS_MAX_IO_LOG); 457 XFS_MAX_IO_LOG);
469 return XFS_ERROR(EINVAL); 458 return XFS_ERROR(EINVAL);
@@ -610,7 +599,7 @@ xfs_blkdev_get(
610 mp); 599 mp);
611 if (IS_ERR(*bdevp)) { 600 if (IS_ERR(*bdevp)) {
612 error = PTR_ERR(*bdevp); 601 error = PTR_ERR(*bdevp);
613 printk("XFS: Invalid device [%s], error=%d\n", name, error); 602 xfs_warn(mp, "Invalid device [%s], error=%d\n", name, error);
614 } 603 }
615 604
616 return -error; 605 return -error;
@@ -664,23 +653,23 @@ xfs_mountfs_check_barriers(xfs_mount_t *mp)
664 int error; 653 int error;
665 654
666 if (mp->m_logdev_targp != mp->m_ddev_targp) { 655 if (mp->m_logdev_targp != mp->m_ddev_targp) {
667 xfs_fs_cmn_err(CE_NOTE, mp, 656 xfs_notice(mp,
668 "Disabling barriers, not supported with external log device"); 657 "Disabling barriers, not supported with external log device");
669 mp->m_flags &= ~XFS_MOUNT_BARRIER; 658 mp->m_flags &= ~XFS_MOUNT_BARRIER;
670 return; 659 return;
671 } 660 }
672 661
673 if (xfs_readonly_buftarg(mp->m_ddev_targp)) { 662 if (xfs_readonly_buftarg(mp->m_ddev_targp)) {
674 xfs_fs_cmn_err(CE_NOTE, mp, 663 xfs_notice(mp,
675 "Disabling barriers, underlying device is readonly"); 664 "Disabling barriers, underlying device is readonly");
676 mp->m_flags &= ~XFS_MOUNT_BARRIER; 665 mp->m_flags &= ~XFS_MOUNT_BARRIER;
677 return; 666 return;
678 } 667 }
679 668
680 error = xfs_barrier_test(mp); 669 error = xfs_barrier_test(mp);
681 if (error) { 670 if (error) {
682 xfs_fs_cmn_err(CE_NOTE, mp, 671 xfs_notice(mp,
683 "Disabling barriers, trial barrier write failed"); 672 "Disabling barriers, trial barrier write failed");
684 mp->m_flags &= ~XFS_MOUNT_BARRIER; 673 mp->m_flags &= ~XFS_MOUNT_BARRIER;
685 return; 674 return;
686 } 675 }
@@ -743,8 +732,8 @@ xfs_open_devices(
743 goto out_close_logdev; 732 goto out_close_logdev;
744 733
745 if (rtdev == ddev || rtdev == logdev) { 734 if (rtdev == ddev || rtdev == logdev) {
746 cmn_err(CE_WARN, 735 xfs_warn(mp,
747 "XFS: Cannot mount filesystem with identical rtdev and ddev/logdev."); 736 "Cannot mount filesystem with identical rtdev and ddev/logdev.");
748 error = EINVAL; 737 error = EINVAL;
749 goto out_close_rtdev; 738 goto out_close_rtdev;
750 } 739 }
@@ -1345,8 +1334,8 @@ xfs_fs_remount(
1345 * options that we can't actually change. 1334 * options that we can't actually change.
1346 */ 1335 */
1347#if 0 1336#if 0
1348 printk(KERN_INFO 1337 xfs_info(mp,
1349 "XFS: mount option \"%s\" not supported for remount\n", p); 1338 "mount option \"%s\" not supported for remount\n", p);
1350 return -EINVAL; 1339 return -EINVAL;
1351#else 1340#else
1352 break; 1341 break;
@@ -1367,8 +1356,7 @@ xfs_fs_remount(
1367 if (mp->m_update_flags) { 1356 if (mp->m_update_flags) {
1368 error = xfs_mount_log_sb(mp, mp->m_update_flags); 1357 error = xfs_mount_log_sb(mp, mp->m_update_flags);
1369 if (error) { 1358 if (error) {
1370 cmn_err(CE_WARN, 1359 xfs_warn(mp, "failed to write sb changes");
1371 "XFS: failed to write sb changes");
1372 return error; 1360 return error;
1373 } 1361 }
1374 mp->m_update_flags = 0; 1362 mp->m_update_flags = 0;
@@ -1452,15 +1440,15 @@ xfs_finish_flags(
1452 mp->m_logbsize = mp->m_sb.sb_logsunit; 1440 mp->m_logbsize = mp->m_sb.sb_logsunit;
1453 } else if (mp->m_logbsize > 0 && 1441 } else if (mp->m_logbsize > 0 &&
1454 mp->m_logbsize < mp->m_sb.sb_logsunit) { 1442 mp->m_logbsize < mp->m_sb.sb_logsunit) {
1455 cmn_err(CE_WARN, 1443 xfs_warn(mp,
1456 "XFS: logbuf size must be greater than or equal to log stripe size"); 1444 "logbuf size must be greater than or equal to log stripe size");
1457 return XFS_ERROR(EINVAL); 1445 return XFS_ERROR(EINVAL);
1458 } 1446 }
1459 } else { 1447 } else {
1460 /* Fail a mount if the logbuf is larger than 32K */ 1448 /* Fail a mount if the logbuf is larger than 32K */
1461 if (mp->m_logbsize > XLOG_BIG_RECORD_BSIZE) { 1449 if (mp->m_logbsize > XLOG_BIG_RECORD_BSIZE) {
1462 cmn_err(CE_WARN, 1450 xfs_warn(mp,
1463 "XFS: logbuf size for version 1 logs must be 16K or 32K"); 1451 "logbuf size for version 1 logs must be 16K or 32K");
1464 return XFS_ERROR(EINVAL); 1452 return XFS_ERROR(EINVAL);
1465 } 1453 }
1466 } 1454 }
@@ -1477,8 +1465,8 @@ xfs_finish_flags(
1477 * prohibit r/w mounts of read-only filesystems 1465 * prohibit r/w mounts of read-only filesystems
1478 */ 1466 */
1479 if ((mp->m_sb.sb_flags & XFS_SBF_READONLY) && !ronly) { 1467 if ((mp->m_sb.sb_flags & XFS_SBF_READONLY) && !ronly) {
1480 cmn_err(CE_WARN, 1468 xfs_warn(mp,
1481 "XFS: cannot mount a read-only filesystem as read-write"); 1469 "cannot mount a read-only filesystem as read-write");
1482 return XFS_ERROR(EROFS); 1470 return XFS_ERROR(EROFS);
1483 } 1471 }
1484 1472
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index e22f0057d21f..6c10f1d2e3d3 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -425,8 +425,7 @@ xfs_quiesce_attr(
425 /* Push the superblock and write an unmount record */ 425 /* Push the superblock and write an unmount record */
426 error = xfs_log_sbcount(mp, 1); 426 error = xfs_log_sbcount(mp, 1);
427 if (error) 427 if (error)
428 xfs_fs_cmn_err(CE_WARN, mp, 428 xfs_warn(mp, "xfs_attr_quiesce: failed to log sb changes. "
429 "xfs_attr_quiesce: failed to log sb changes. "
430 "Frozen image may not be consistent."); 429 "Frozen image may not be consistent.");
431 xfs_log_unmount_write(mp); 430 xfs_log_unmount_write(mp);
432 xfs_unmountfs_writesb(mp); 431 xfs_unmountfs_writesb(mp);
@@ -806,7 +805,7 @@ xfs_reclaim_inode(
806 * pass on the error. 805 * pass on the error.
807 */ 806 */
808 if (error && error != EAGAIN && !XFS_FORCED_SHUTDOWN(ip->i_mount)) { 807 if (error && error != EAGAIN && !XFS_FORCED_SHUTDOWN(ip->i_mount)) {
809 xfs_fs_cmn_err(CE_WARN, ip->i_mount, 808 xfs_warn(ip->i_mount,
810 "inode 0x%llx background reclaim flush failed with %d", 809 "inode 0x%llx background reclaim flush failed with %d",
811 (long long)ip->i_ino, error); 810 (long long)ip->i_ino, error);
812 } 811 }
diff --git a/fs/xfs/linux-2.6/xfs_sysctl.c b/fs/xfs/linux-2.6/xfs_sysctl.c
index ee3cee097e7e..ee2d2adaa438 100644
--- a/fs/xfs/linux-2.6/xfs_sysctl.c
+++ b/fs/xfs/linux-2.6/xfs_sysctl.c
@@ -37,7 +37,7 @@ xfs_stats_clear_proc_handler(
37 ret = proc_dointvec_minmax(ctl, write, buffer, lenp, ppos); 37 ret = proc_dointvec_minmax(ctl, write, buffer, lenp, ppos);
38 38
39 if (!ret && write && *valp) { 39 if (!ret && write && *valp) {
40 printk("XFS Clearing xfsstats\n"); 40 xfs_notice(NULL, "Clearing xfsstats");
41 for_each_possible_cpu(c) { 41 for_each_possible_cpu(c) {
42 preempt_disable(); 42 preempt_disable();
43 /* save vn_active, it's a universal truth! */ 43 /* save vn_active, it's a universal truth! */
diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c
index d22aa3103106..7e2416478503 100644
--- a/fs/xfs/quota/xfs_dquot.c
+++ b/fs/xfs/quota/xfs_dquot.c
@@ -544,9 +544,10 @@ xfs_qm_dqtobp(
544 /* 544 /*
545 * A simple sanity check in case we got a corrupted dquot... 545 * A simple sanity check in case we got a corrupted dquot...
546 */ 546 */
547 if (xfs_qm_dqcheck(ddq, id, dqp->dq_flags & XFS_DQ_ALLTYPES, 547 error = xfs_qm_dqcheck(mp, ddq, id, dqp->dq_flags & XFS_DQ_ALLTYPES,
548 flags & (XFS_QMOPT_DQREPAIR|XFS_QMOPT_DOWARN), 548 flags & (XFS_QMOPT_DQREPAIR|XFS_QMOPT_DOWARN),
549 "dqtobp")) { 549 "dqtobp");
550 if (error) {
550 if (!(flags & XFS_QMOPT_DQREPAIR)) { 551 if (!(flags & XFS_QMOPT_DQREPAIR)) {
551 xfs_trans_brelse(tp, bp); 552 xfs_trans_brelse(tp, bp);
552 return XFS_ERROR(EIO); 553 return XFS_ERROR(EIO);
@@ -827,7 +828,7 @@ xfs_qm_dqget(
827 if (xfs_do_dqerror) { 828 if (xfs_do_dqerror) {
828 if ((xfs_dqerror_target == mp->m_ddev_targp) && 829 if ((xfs_dqerror_target == mp->m_ddev_targp) &&
829 (xfs_dqreq_num++ % xfs_dqerror_mod) == 0) { 830 (xfs_dqreq_num++ % xfs_dqerror_mod) == 0) {
830 cmn_err(CE_DEBUG, "Returning error in dqget"); 831 xfs_debug(mp, "Returning error in dqget");
831 return (EIO); 832 return (EIO);
832 } 833 }
833 } 834 }
@@ -1207,8 +1208,9 @@ xfs_qm_dqflush(
1207 /* 1208 /*
1208 * A simple sanity check in case we got a corrupted dquot.. 1209 * A simple sanity check in case we got a corrupted dquot..
1209 */ 1210 */
1210 if (xfs_qm_dqcheck(&dqp->q_core, be32_to_cpu(ddqp->d_id), 0, 1211 error = xfs_qm_dqcheck(mp, &dqp->q_core, be32_to_cpu(ddqp->d_id), 0,
1211 XFS_QMOPT_DOWARN, "dqflush (incore copy)")) { 1212 XFS_QMOPT_DOWARN, "dqflush (incore copy)");
1213 if (error) {
1212 xfs_buf_relse(bp); 1214 xfs_buf_relse(bp);
1213 xfs_dqfunlock(dqp); 1215 xfs_dqfunlock(dqp);
1214 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); 1216 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
@@ -1391,8 +1393,8 @@ xfs_qm_dqpurge(
1391 */ 1393 */
1392 error = xfs_qm_dqflush(dqp, SYNC_WAIT); 1394 error = xfs_qm_dqflush(dqp, SYNC_WAIT);
1393 if (error) 1395 if (error)
1394 xfs_fs_cmn_err(CE_WARN, mp, 1396 xfs_warn(mp, "%s: dquot %p flush failed",
1395 "xfs_qm_dqpurge: dquot %p flush failed", dqp); 1397 __func__, dqp);
1396 xfs_dqflock(dqp); 1398 xfs_dqflock(dqp);
1397 } 1399 }
1398 ASSERT(atomic_read(&dqp->q_pincount) == 0); 1400 ASSERT(atomic_read(&dqp->q_pincount) == 0);
@@ -1425,36 +1427,38 @@ xfs_qm_dqpurge(
1425void 1427void
1426xfs_qm_dqprint(xfs_dquot_t *dqp) 1428xfs_qm_dqprint(xfs_dquot_t *dqp)
1427{ 1429{
1428 cmn_err(CE_DEBUG, "-----------KERNEL DQUOT----------------"); 1430 struct xfs_mount *mp = dqp->q_mount;
1429 cmn_err(CE_DEBUG, "---- dquotID = %d", 1431
1432 xfs_debug(mp, "-----------KERNEL DQUOT----------------");
1433 xfs_debug(mp, "---- dquotID = %d",
1430 (int)be32_to_cpu(dqp->q_core.d_id)); 1434 (int)be32_to_cpu(dqp->q_core.d_id));
1431 cmn_err(CE_DEBUG, "---- type = %s", DQFLAGTO_TYPESTR(dqp)); 1435 xfs_debug(mp, "---- type = %s", DQFLAGTO_TYPESTR(dqp));
1432 cmn_err(CE_DEBUG, "---- fs = 0x%p", dqp->q_mount); 1436 xfs_debug(mp, "---- fs = 0x%p", dqp->q_mount);
1433 cmn_err(CE_DEBUG, "---- blkno = 0x%x", (int) dqp->q_blkno); 1437 xfs_debug(mp, "---- blkno = 0x%x", (int) dqp->q_blkno);
1434 cmn_err(CE_DEBUG, "---- boffset = 0x%x", (int) dqp->q_bufoffset); 1438 xfs_debug(mp, "---- boffset = 0x%x", (int) dqp->q_bufoffset);
1435 cmn_err(CE_DEBUG, "---- blkhlimit = %Lu (0x%x)", 1439 xfs_debug(mp, "---- blkhlimit = %Lu (0x%x)",
1436 be64_to_cpu(dqp->q_core.d_blk_hardlimit), 1440 be64_to_cpu(dqp->q_core.d_blk_hardlimit),
1437 (int)be64_to_cpu(dqp->q_core.d_blk_hardlimit)); 1441 (int)be64_to_cpu(dqp->q_core.d_blk_hardlimit));
1438 cmn_err(CE_DEBUG, "---- blkslimit = %Lu (0x%x)", 1442 xfs_debug(mp, "---- blkslimit = %Lu (0x%x)",
1439 be64_to_cpu(dqp->q_core.d_blk_softlimit), 1443 be64_to_cpu(dqp->q_core.d_blk_softlimit),
1440 (int)be64_to_cpu(dqp->q_core.d_blk_softlimit)); 1444 (int)be64_to_cpu(dqp->q_core.d_blk_softlimit));
1441 cmn_err(CE_DEBUG, "---- inohlimit = %Lu (0x%x)", 1445 xfs_debug(mp, "---- inohlimit = %Lu (0x%x)",
1442 be64_to_cpu(dqp->q_core.d_ino_hardlimit), 1446 be64_to_cpu(dqp->q_core.d_ino_hardlimit),
1443 (int)be64_to_cpu(dqp->q_core.d_ino_hardlimit)); 1447 (int)be64_to_cpu(dqp->q_core.d_ino_hardlimit));
1444 cmn_err(CE_DEBUG, "---- inoslimit = %Lu (0x%x)", 1448 xfs_debug(mp, "---- inoslimit = %Lu (0x%x)",
1445 be64_to_cpu(dqp->q_core.d_ino_softlimit), 1449 be64_to_cpu(dqp->q_core.d_ino_softlimit),
1446 (int)be64_to_cpu(dqp->q_core.d_ino_softlimit)); 1450 (int)be64_to_cpu(dqp->q_core.d_ino_softlimit));
1447 cmn_err(CE_DEBUG, "---- bcount = %Lu (0x%x)", 1451 xfs_debug(mp, "---- bcount = %Lu (0x%x)",
1448 be64_to_cpu(dqp->q_core.d_bcount), 1452 be64_to_cpu(dqp->q_core.d_bcount),
1449 (int)be64_to_cpu(dqp->q_core.d_bcount)); 1453 (int)be64_to_cpu(dqp->q_core.d_bcount));
1450 cmn_err(CE_DEBUG, "---- icount = %Lu (0x%x)", 1454 xfs_debug(mp, "---- icount = %Lu (0x%x)",
1451 be64_to_cpu(dqp->q_core.d_icount), 1455 be64_to_cpu(dqp->q_core.d_icount),
1452 (int)be64_to_cpu(dqp->q_core.d_icount)); 1456 (int)be64_to_cpu(dqp->q_core.d_icount));
1453 cmn_err(CE_DEBUG, "---- btimer = %d", 1457 xfs_debug(mp, "---- btimer = %d",
1454 (int)be32_to_cpu(dqp->q_core.d_btimer)); 1458 (int)be32_to_cpu(dqp->q_core.d_btimer));
1455 cmn_err(CE_DEBUG, "---- itimer = %d", 1459 xfs_debug(mp, "---- itimer = %d",
1456 (int)be32_to_cpu(dqp->q_core.d_itimer)); 1460 (int)be32_to_cpu(dqp->q_core.d_itimer));
1457 cmn_err(CE_DEBUG, "---------------------------"); 1461 xfs_debug(mp, "---------------------------");
1458} 1462}
1459#endif 1463#endif
1460 1464
diff --git a/fs/xfs/quota/xfs_dquot_item.c b/fs/xfs/quota/xfs_dquot_item.c
index 2a1f3dc10a02..9e0e2fa3f2c8 100644
--- a/fs/xfs/quota/xfs_dquot_item.c
+++ b/fs/xfs/quota/xfs_dquot_item.c
@@ -136,9 +136,8 @@ xfs_qm_dquot_logitem_push(
136 */ 136 */
137 error = xfs_qm_dqflush(dqp, 0); 137 error = xfs_qm_dqflush(dqp, 0);
138 if (error) 138 if (error)
139 xfs_fs_cmn_err(CE_WARN, dqp->q_mount, 139 xfs_warn(dqp->q_mount, "%s: push error %d on dqp %p",
140 "xfs_qm_dquot_logitem_push: push error %d on dqp %p", 140 __func__, error, dqp);
141 error, dqp);
142 xfs_dqunlock(dqp); 141 xfs_dqunlock(dqp);
143} 142}
144 143
diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
index f8e854b4fde8..254ee062bd7d 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/quota/xfs_qm.c
@@ -80,7 +80,7 @@ xfs_qm_dquot_list_print(
80 int i = 0; 80 int i = 0;
81 81
82 list_for_each_entry(dqp, &mp->m_quotainfo->qi_dqlist_lock, qi_mplist) { 82 list_for_each_entry(dqp, &mp->m_quotainfo->qi_dqlist_lock, qi_mplist) {
83 cmn_err(CE_DEBUG, " %d. \"%d (%s)\" " 83 xfs_debug(mp, " %d. \"%d (%s)\" "
84 "bcnt = %lld, icnt = %lld, refs = %d", 84 "bcnt = %lld, icnt = %lld, refs = %d",
85 i++, be32_to_cpu(dqp->q_core.d_id), 85 i++, be32_to_cpu(dqp->q_core.d_id),
86 DQFLAGTO_TYPESTR(dqp), 86 DQFLAGTO_TYPESTR(dqp),
@@ -205,7 +205,7 @@ xfs_qm_destroy(
205 list_for_each_entry_safe(dqp, n, &xqm->qm_dqfrlist, q_freelist) { 205 list_for_each_entry_safe(dqp, n, &xqm->qm_dqfrlist, q_freelist) {
206 xfs_dqlock(dqp); 206 xfs_dqlock(dqp);
207#ifdef QUOTADEBUG 207#ifdef QUOTADEBUG
208 cmn_err(CE_DEBUG, "FREELIST destroy 0x%p", dqp); 208 xfs_debug(dqp->q_mount, "FREELIST destroy 0x%p", dqp);
209#endif 209#endif
210 list_del_init(&dqp->q_freelist); 210 list_del_init(&dqp->q_freelist);
211 xfs_Gqm->qm_dqfrlist_cnt--; 211 xfs_Gqm->qm_dqfrlist_cnt--;
@@ -341,9 +341,7 @@ xfs_qm_mount_quotas(
341 * quotas immediately. 341 * quotas immediately.
342 */ 342 */
343 if (mp->m_sb.sb_rextents) { 343 if (mp->m_sb.sb_rextents) {
344 cmn_err(CE_NOTE, 344 xfs_notice(mp, "Cannot turn on quotas for realtime filesystem");
345 "Cannot turn on quotas for realtime filesystem %s",
346 mp->m_fsname);
347 mp->m_qflags = 0; 345 mp->m_qflags = 0;
348 goto write_changes; 346 goto write_changes;
349 } 347 }
@@ -402,14 +400,13 @@ xfs_qm_mount_quotas(
402 * off, but the on disk superblock doesn't know that ! 400 * off, but the on disk superblock doesn't know that !
403 */ 401 */
404 ASSERT(!(XFS_IS_QUOTA_RUNNING(mp))); 402 ASSERT(!(XFS_IS_QUOTA_RUNNING(mp)));
405 xfs_fs_cmn_err(CE_ALERT, mp, 403 xfs_alert(mp, "%s: Superblock update failed!",
406 "XFS mount_quotas: Superblock update failed!"); 404 __func__);
407 } 405 }
408 } 406 }
409 407
410 if (error) { 408 if (error) {
411 xfs_fs_cmn_err(CE_WARN, mp, 409 xfs_warn(mp, "Failed to initialize disk quotas.");
412 "Failed to initialize disk quotas.");
413 return; 410 return;
414 } 411 }
415 412
@@ -1230,13 +1227,6 @@ xfs_qm_qino_alloc(
1230 } 1227 }
1231 1228
1232 /* 1229 /*
1233 * Keep an extra reference to this quota inode. This inode is
1234 * locked exclusively and joined to the transaction already.
1235 */
1236 ASSERT(xfs_isilocked(*ip, XFS_ILOCK_EXCL));
1237 IHOLD(*ip);
1238
1239 /*
1240 * Make the changes in the superblock, and log those too. 1230 * Make the changes in the superblock, and log those too.
1241 * sbfields arg may contain fields other than *QUOTINO; 1231 * sbfields arg may contain fields other than *QUOTINO;
1242 * VERSIONNUM for example. 1232 * VERSIONNUM for example.
@@ -1264,7 +1254,7 @@ xfs_qm_qino_alloc(
1264 xfs_mod_sb(tp, sbfields); 1254 xfs_mod_sb(tp, sbfields);
1265 1255
1266 if ((error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES))) { 1256 if ((error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES))) {
1267 xfs_fs_cmn_err(CE_ALERT, mp, "XFS qino_alloc failed!"); 1257 xfs_alert(mp, "%s failed (error %d)!", __func__, error);
1268 return error; 1258 return error;
1269 } 1259 }
1270 return 0; 1260 return 0;
@@ -1299,7 +1289,7 @@ xfs_qm_reset_dqcounts(
1299 * output any warnings because it's perfectly possible to 1289 * output any warnings because it's perfectly possible to
1300 * find uninitialised dquot blks. See comment in xfs_qm_dqcheck. 1290 * find uninitialised dquot blks. See comment in xfs_qm_dqcheck.
1301 */ 1291 */
1302 (void) xfs_qm_dqcheck(ddq, id+j, type, XFS_QMOPT_DQREPAIR, 1292 (void) xfs_qm_dqcheck(mp, ddq, id+j, type, XFS_QMOPT_DQREPAIR,
1303 "xfs_quotacheck"); 1293 "xfs_quotacheck");
1304 ddq->d_bcount = 0; 1294 ddq->d_bcount = 0;
1305 ddq->d_icount = 0; 1295 ddq->d_icount = 0;
@@ -1676,7 +1666,7 @@ xfs_qm_quotacheck(
1676 */ 1666 */
1677 ASSERT(list_empty(&mp->m_quotainfo->qi_dqlist)); 1667 ASSERT(list_empty(&mp->m_quotainfo->qi_dqlist));
1678 1668
1679 cmn_err(CE_NOTE, "XFS quotacheck %s: Please wait.", mp->m_fsname); 1669 xfs_notice(mp, "Quotacheck needed: Please wait.");
1680 1670
1681 /* 1671 /*
1682 * First we go thru all the dquots on disk, USR and GRP/PRJ, and reset 1672 * First we go thru all the dquots on disk, USR and GRP/PRJ, and reset
@@ -1754,9 +1744,9 @@ xfs_qm_quotacheck(
1754 1744
1755 error_return: 1745 error_return:
1756 if (error) { 1746 if (error) {
1757 cmn_err(CE_WARN, "XFS quotacheck %s: Unsuccessful (Error %d): " 1747 xfs_warn(mp,
1758 "Disabling quotas.", 1748 "Quotacheck: Unsuccessful (Error %d): Disabling quotas.",
1759 mp->m_fsname, error); 1749 error);
1760 /* 1750 /*
1761 * We must turn off quotas. 1751 * We must turn off quotas.
1762 */ 1752 */
@@ -1764,12 +1754,11 @@ xfs_qm_quotacheck(
1764 ASSERT(xfs_Gqm != NULL); 1754 ASSERT(xfs_Gqm != NULL);
1765 xfs_qm_destroy_quotainfo(mp); 1755 xfs_qm_destroy_quotainfo(mp);
1766 if (xfs_mount_reset_sbqflags(mp)) { 1756 if (xfs_mount_reset_sbqflags(mp)) {
1767 cmn_err(CE_WARN, "XFS quotacheck %s: " 1757 xfs_warn(mp,
1768 "Failed to reset quota flags.", mp->m_fsname); 1758 "Quotacheck: Failed to reset quota flags.");
1769 } 1759 }
1770 } else { 1760 } else
1771 cmn_err(CE_NOTE, "XFS quotacheck %s: Done.", mp->m_fsname); 1761 xfs_notice(mp, "Quotacheck: Done.");
1772 }
1773 return (error); 1762 return (error);
1774} 1763}
1775 1764
@@ -1863,12 +1852,14 @@ xfs_qm_dqreclaim_one(void)
1863 xfs_dquot_t *dqpout; 1852 xfs_dquot_t *dqpout;
1864 xfs_dquot_t *dqp; 1853 xfs_dquot_t *dqp;
1865 int restarts; 1854 int restarts;
1855 int startagain;
1866 1856
1867 restarts = 0; 1857 restarts = 0;
1868 dqpout = NULL; 1858 dqpout = NULL;
1869 1859
1870 /* lockorder: hashchainlock, freelistlock, mplistlock, dqlock, dqflock */ 1860 /* lockorder: hashchainlock, freelistlock, mplistlock, dqlock, dqflock */
1871startagain: 1861again:
1862 startagain = 0;
1872 mutex_lock(&xfs_Gqm->qm_dqfrlist_lock); 1863 mutex_lock(&xfs_Gqm->qm_dqfrlist_lock);
1873 1864
1874 list_for_each_entry(dqp, &xfs_Gqm->qm_dqfrlist, q_freelist) { 1865 list_for_each_entry(dqp, &xfs_Gqm->qm_dqfrlist, q_freelist) {
@@ -1885,13 +1876,10 @@ startagain:
1885 ASSERT(! (dqp->dq_flags & XFS_DQ_INACTIVE)); 1876 ASSERT(! (dqp->dq_flags & XFS_DQ_INACTIVE));
1886 1877
1887 trace_xfs_dqreclaim_want(dqp); 1878 trace_xfs_dqreclaim_want(dqp);
1888
1889 xfs_dqunlock(dqp);
1890 mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
1891 if (++restarts >= XFS_QM_RECLAIM_MAX_RESTARTS)
1892 return NULL;
1893 XQM_STATS_INC(xqmstats.xs_qm_dqwants); 1879 XQM_STATS_INC(xqmstats.xs_qm_dqwants);
1894 goto startagain; 1880 restarts++;
1881 startagain = 1;
1882 goto dqunlock;
1895 } 1883 }
1896 1884
1897 /* 1885 /*
@@ -1906,23 +1894,20 @@ startagain:
1906 ASSERT(list_empty(&dqp->q_mplist)); 1894 ASSERT(list_empty(&dqp->q_mplist));
1907 list_del_init(&dqp->q_freelist); 1895 list_del_init(&dqp->q_freelist);
1908 xfs_Gqm->qm_dqfrlist_cnt--; 1896 xfs_Gqm->qm_dqfrlist_cnt--;
1909 xfs_dqunlock(dqp);
1910 dqpout = dqp; 1897 dqpout = dqp;
1911 XQM_STATS_INC(xqmstats.xs_qm_dqinact_reclaims); 1898 XQM_STATS_INC(xqmstats.xs_qm_dqinact_reclaims);
1912 break; 1899 goto dqunlock;
1913 } 1900 }
1914 1901
1915 ASSERT(dqp->q_hash); 1902 ASSERT(dqp->q_hash);
1916 ASSERT(!list_empty(&dqp->q_mplist)); 1903 ASSERT(!list_empty(&dqp->q_mplist));
1917 1904
1918 /* 1905 /*
1919 * Try to grab the flush lock. If this dquot is in the process of 1906 * Try to grab the flush lock. If this dquot is in the process
1920 * getting flushed to disk, we don't want to reclaim it. 1907 * of getting flushed to disk, we don't want to reclaim it.
1921 */ 1908 */
1922 if (!xfs_dqflock_nowait(dqp)) { 1909 if (!xfs_dqflock_nowait(dqp))
1923 xfs_dqunlock(dqp); 1910 goto dqunlock;
1924 continue;
1925 }
1926 1911
1927 /* 1912 /*
1928 * We have the flush lock so we know that this is not in the 1913 * We have the flush lock so we know that this is not in the
@@ -1941,11 +1926,10 @@ startagain:
1941 */ 1926 */
1942 error = xfs_qm_dqflush(dqp, 0); 1927 error = xfs_qm_dqflush(dqp, 0);
1943 if (error) { 1928 if (error) {
1944 xfs_fs_cmn_err(CE_WARN, mp, 1929 xfs_warn(mp, "%s: dquot %p flush failed",
1945 "xfs_qm_dqreclaim: dquot %p flush failed", dqp); 1930 __func__, dqp);
1946 } 1931 }
1947 xfs_dqunlock(dqp); /* dqflush unlocks dqflock */ 1932 goto dqunlock;
1948 continue;
1949 } 1933 }
1950 1934
1951 /* 1935 /*
@@ -1967,13 +1951,8 @@ startagain:
1967 */ 1951 */
1968 if (!mutex_trylock(&mp->m_quotainfo->qi_dqlist_lock)) { 1952 if (!mutex_trylock(&mp->m_quotainfo->qi_dqlist_lock)) {
1969 restarts++; 1953 restarts++;
1970 mutex_unlock(&dqp->q_hash->qh_lock); 1954 startagain = 1;
1971 xfs_dqfunlock(dqp); 1955 goto qhunlock;
1972 xfs_dqunlock(dqp);
1973 mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
1974 if (restarts++ >= XFS_QM_RECLAIM_MAX_RESTARTS)
1975 return NULL;
1976 goto startagain;
1977 } 1956 }
1978 1957
1979 ASSERT(dqp->q_nrefs == 0); 1958 ASSERT(dqp->q_nrefs == 0);
@@ -1986,14 +1965,20 @@ startagain:
1986 xfs_Gqm->qm_dqfrlist_cnt--; 1965 xfs_Gqm->qm_dqfrlist_cnt--;
1987 dqpout = dqp; 1966 dqpout = dqp;
1988 mutex_unlock(&mp->m_quotainfo->qi_dqlist_lock); 1967 mutex_unlock(&mp->m_quotainfo->qi_dqlist_lock);
1968qhunlock:
1989 mutex_unlock(&dqp->q_hash->qh_lock); 1969 mutex_unlock(&dqp->q_hash->qh_lock);
1990dqfunlock: 1970dqfunlock:
1991 xfs_dqfunlock(dqp); 1971 xfs_dqfunlock(dqp);
1972dqunlock:
1992 xfs_dqunlock(dqp); 1973 xfs_dqunlock(dqp);
1993 if (dqpout) 1974 if (dqpout)
1994 break; 1975 break;
1995 if (restarts >= XFS_QM_RECLAIM_MAX_RESTARTS) 1976 if (restarts >= XFS_QM_RECLAIM_MAX_RESTARTS)
1996 return NULL; 1977 break;
1978 if (startagain) {
1979 mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
1980 goto again;
1981 }
1997 } 1982 }
1998 mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock); 1983 mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
1999 return dqpout; 1984 return dqpout;
@@ -2119,7 +2104,7 @@ xfs_qm_write_sb_changes(
2119 int error; 2104 int error;
2120 2105
2121#ifdef QUOTADEBUG 2106#ifdef QUOTADEBUG
2122 cmn_err(CE_NOTE, "Writing superblock quota changes :%s", mp->m_fsname); 2107 xfs_notice(mp, "Writing superblock quota changes");
2123#endif 2108#endif
2124 tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SBCHANGE); 2109 tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SBCHANGE);
2125 if ((error = xfs_trans_reserve(tp, 0, 2110 if ((error = xfs_trans_reserve(tp, 0,
diff --git a/fs/xfs/quota/xfs_qm_bhv.c b/fs/xfs/quota/xfs_qm_bhv.c
index 45b5cb1788ab..774d7ec6df8e 100644
--- a/fs/xfs/quota/xfs_qm_bhv.c
+++ b/fs/xfs/quota/xfs_qm_bhv.c
@@ -119,8 +119,7 @@ xfs_qm_newmount(
119 (gquotaondisk && !XFS_IS_GQUOTA_ON(mp)) || 119 (gquotaondisk && !XFS_IS_GQUOTA_ON(mp)) ||
120 (!gquotaondisk && XFS_IS_OQUOTA_ON(mp))) && 120 (!gquotaondisk && XFS_IS_OQUOTA_ON(mp))) &&
121 xfs_dev_is_read_only(mp, "changing quota state")) { 121 xfs_dev_is_read_only(mp, "changing quota state")) {
122 cmn_err(CE_WARN, 122 xfs_warn(mp, "please mount with%s%s%s%s.",
123 "XFS: please mount with%s%s%s%s.",
124 (!quotaondisk ? "out quota" : ""), 123 (!quotaondisk ? "out quota" : ""),
125 (uquotaondisk ? " usrquota" : ""), 124 (uquotaondisk ? " usrquota" : ""),
126 (pquotaondisk ? " prjquota" : ""), 125 (pquotaondisk ? " prjquota" : ""),
diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c
index bdebc183223e..c82f06778a27 100644
--- a/fs/xfs/quota/xfs_qm_syscalls.c
+++ b/fs/xfs/quota/xfs_qm_syscalls.c
@@ -41,12 +41,6 @@
41#include "xfs_qm.h" 41#include "xfs_qm.h"
42#include "xfs_trace.h" 42#include "xfs_trace.h"
43 43
44#ifdef DEBUG
45# define qdprintk(s, args...) cmn_err(CE_DEBUG, s, ## args)
46#else
47# define qdprintk(s, args...) do { } while (0)
48#endif
49
50STATIC int xfs_qm_log_quotaoff(xfs_mount_t *, xfs_qoff_logitem_t **, uint); 44STATIC int xfs_qm_log_quotaoff(xfs_mount_t *, xfs_qoff_logitem_t **, uint);
51STATIC int xfs_qm_log_quotaoff_end(xfs_mount_t *, xfs_qoff_logitem_t *, 45STATIC int xfs_qm_log_quotaoff_end(xfs_mount_t *, xfs_qoff_logitem_t *,
52 uint); 46 uint);
@@ -294,7 +288,8 @@ xfs_qm_scall_trunc_qfiles(
294 int error = 0, error2 = 0; 288 int error = 0, error2 = 0;
295 289
296 if (!xfs_sb_version_hasquota(&mp->m_sb) || flags == 0) { 290 if (!xfs_sb_version_hasquota(&mp->m_sb) || flags == 0) {
297 qdprintk("qtrunc flags=%x m_qflags=%x\n", flags, mp->m_qflags); 291 xfs_debug(mp, "%s: flags=%x m_qflags=%x\n",
292 __func__, flags, mp->m_qflags);
298 return XFS_ERROR(EINVAL); 293 return XFS_ERROR(EINVAL);
299 } 294 }
300 295
@@ -331,7 +326,8 @@ xfs_qm_scall_quotaon(
331 sbflags = 0; 326 sbflags = 0;
332 327
333 if (flags == 0) { 328 if (flags == 0) {
334 qdprintk("quotaon: zero flags, m_qflags=%x\n", mp->m_qflags); 329 xfs_debug(mp, "%s: zero flags, m_qflags=%x\n",
330 __func__, mp->m_qflags);
335 return XFS_ERROR(EINVAL); 331 return XFS_ERROR(EINVAL);
336 } 332 }
337 333
@@ -352,8 +348,9 @@ xfs_qm_scall_quotaon(
352 (flags & XFS_GQUOTA_ACCT) == 0 && 348 (flags & XFS_GQUOTA_ACCT) == 0 &&
353 (mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT) == 0 && 349 (mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT) == 0 &&
354 (flags & XFS_OQUOTA_ENFD))) { 350 (flags & XFS_OQUOTA_ENFD))) {
355 qdprintk("Can't enforce without acct, flags=%x sbflags=%x\n", 351 xfs_debug(mp,
356 flags, mp->m_sb.sb_qflags); 352 "%s: Can't enforce without acct, flags=%x sbflags=%x\n",
353 __func__, flags, mp->m_sb.sb_qflags);
357 return XFS_ERROR(EINVAL); 354 return XFS_ERROR(EINVAL);
358 } 355 }
359 /* 356 /*
@@ -541,7 +538,7 @@ xfs_qm_scall_setqlim(
541 q->qi_bsoftlimit = soft; 538 q->qi_bsoftlimit = soft;
542 } 539 }
543 } else { 540 } else {
544 qdprintk("blkhard %Ld < blksoft %Ld\n", hard, soft); 541 xfs_debug(mp, "blkhard %Ld < blksoft %Ld\n", hard, soft);
545 } 542 }
546 hard = (newlim->d_fieldmask & FS_DQ_RTBHARD) ? 543 hard = (newlim->d_fieldmask & FS_DQ_RTBHARD) ?
547 (xfs_qcnt_t) XFS_BB_TO_FSB(mp, newlim->d_rtb_hardlimit) : 544 (xfs_qcnt_t) XFS_BB_TO_FSB(mp, newlim->d_rtb_hardlimit) :
@@ -557,7 +554,7 @@ xfs_qm_scall_setqlim(
557 q->qi_rtbsoftlimit = soft; 554 q->qi_rtbsoftlimit = soft;
558 } 555 }
559 } else { 556 } else {
560 qdprintk("rtbhard %Ld < rtbsoft %Ld\n", hard, soft); 557 xfs_debug(mp, "rtbhard %Ld < rtbsoft %Ld\n", hard, soft);
561 } 558 }
562 559
563 hard = (newlim->d_fieldmask & FS_DQ_IHARD) ? 560 hard = (newlim->d_fieldmask & FS_DQ_IHARD) ?
@@ -574,7 +571,7 @@ xfs_qm_scall_setqlim(
574 q->qi_isoftlimit = soft; 571 q->qi_isoftlimit = soft;
575 } 572 }
576 } else { 573 } else {
577 qdprintk("ihard %Ld < isoft %Ld\n", hard, soft); 574 xfs_debug(mp, "ihard %Ld < isoft %Ld\n", hard, soft);
578 } 575 }
579 576
580 /* 577 /*
@@ -939,10 +936,11 @@ struct mutex qcheck_lock;
939#define DQTEST_LIST_PRINT(l, NXT, title) \ 936#define DQTEST_LIST_PRINT(l, NXT, title) \
940{ \ 937{ \
941 xfs_dqtest_t *dqp; int i = 0;\ 938 xfs_dqtest_t *dqp; int i = 0;\
942 cmn_err(CE_DEBUG, "%s (#%d)", title, (int) (l)->qh_nelems); \ 939 xfs_debug(NULL, "%s (#%d)", title, (int) (l)->qh_nelems); \
943 for (dqp = (xfs_dqtest_t *)(l)->qh_next; dqp != NULL; \ 940 for (dqp = (xfs_dqtest_t *)(l)->qh_next; dqp != NULL; \
944 dqp = (xfs_dqtest_t *)dqp->NXT) { \ 941 dqp = (xfs_dqtest_t *)dqp->NXT) { \
945 cmn_err(CE_DEBUG, " %d. \"%d (%s)\" bcnt = %d, icnt = %d", \ 942 xfs_debug(dqp->q_mount, \
943 " %d. \"%d (%s)\" bcnt = %d, icnt = %d", \
946 ++i, dqp->d_id, DQFLAGTO_TYPESTR(dqp), \ 944 ++i, dqp->d_id, DQFLAGTO_TYPESTR(dqp), \
947 dqp->d_bcount, dqp->d_icount); } \ 945 dqp->d_bcount, dqp->d_icount); } \
948} 946}
@@ -966,16 +964,17 @@ xfs_qm_hashinsert(xfs_dqhash_t *h, xfs_dqtest_t *dqp)
966} 964}
967STATIC void 965STATIC void
968xfs_qm_dqtest_print( 966xfs_qm_dqtest_print(
969 xfs_dqtest_t *d) 967 struct xfs_mount *mp,
968 struct dqtest *d)
970{ 969{
971 cmn_err(CE_DEBUG, "-----------DQTEST DQUOT----------------"); 970 xfs_debug(mp, "-----------DQTEST DQUOT----------------");
972 cmn_err(CE_DEBUG, "---- dquot ID = %d", d->d_id); 971 xfs_debug(mp, "---- dquot ID = %d", d->d_id);
973 cmn_err(CE_DEBUG, "---- fs = 0x%p", d->q_mount); 972 xfs_debug(mp, "---- fs = 0x%p", d->q_mount);
974 cmn_err(CE_DEBUG, "---- bcount = %Lu (0x%x)", 973 xfs_debug(mp, "---- bcount = %Lu (0x%x)",
975 d->d_bcount, (int)d->d_bcount); 974 d->d_bcount, (int)d->d_bcount);
976 cmn_err(CE_DEBUG, "---- icount = %Lu (0x%x)", 975 xfs_debug(mp, "---- icount = %Lu (0x%x)",
977 d->d_icount, (int)d->d_icount); 976 d->d_icount, (int)d->d_icount);
978 cmn_err(CE_DEBUG, "---------------------------"); 977 xfs_debug(mp, "---------------------------");
979} 978}
980 979
981STATIC void 980STATIC void
@@ -989,12 +988,14 @@ xfs_qm_dqtest_failed(
989{ 988{
990 qmtest_nfails++; 989 qmtest_nfails++;
991 if (error) 990 if (error)
992 cmn_err(CE_DEBUG, "quotacheck failed id=%d, err=%d\nreason: %s", 991 xfs_debug(dqp->q_mount,
993 d->d_id, error, reason); 992 "quotacheck failed id=%d, err=%d\nreason: %s",
993 d->d_id, error, reason);
994 else 994 else
995 cmn_err(CE_DEBUG, "quotacheck failed id=%d (%s) [%d != %d]", 995 xfs_debug(dqp->q_mount,
996 d->d_id, reason, (int)a, (int)b); 996 "quotacheck failed id=%d (%s) [%d != %d]",
997 xfs_qm_dqtest_print(d); 997 d->d_id, reason, (int)a, (int)b);
998 xfs_qm_dqtest_print(dqp->q_mount, d);
998 if (dqp) 999 if (dqp)
999 xfs_qm_dqprint(dqp); 1000 xfs_qm_dqprint(dqp);
1000} 1001}
@@ -1021,9 +1022,9 @@ xfs_dqtest_cmp2(
1021 be64_to_cpu(dqp->q_core.d_bcount) >= 1022 be64_to_cpu(dqp->q_core.d_bcount) >=
1022 be64_to_cpu(dqp->q_core.d_blk_softlimit)) { 1023 be64_to_cpu(dqp->q_core.d_blk_softlimit)) {
1023 if (!dqp->q_core.d_btimer && dqp->q_core.d_id) { 1024 if (!dqp->q_core.d_btimer && dqp->q_core.d_id) {
1024 cmn_err(CE_DEBUG, 1025 xfs_debug(dqp->q_mount,
1025 "%d [%s] [0x%p] BLK TIMER NOT STARTED", 1026 "%d [%s] BLK TIMER NOT STARTED",
1026 d->d_id, DQFLAGTO_TYPESTR(d), d->q_mount); 1027 d->d_id, DQFLAGTO_TYPESTR(d));
1027 err++; 1028 err++;
1028 } 1029 }
1029 } 1030 }
@@ -1031,16 +1032,16 @@ xfs_dqtest_cmp2(
1031 be64_to_cpu(dqp->q_core.d_icount) >= 1032 be64_to_cpu(dqp->q_core.d_icount) >=
1032 be64_to_cpu(dqp->q_core.d_ino_softlimit)) { 1033 be64_to_cpu(dqp->q_core.d_ino_softlimit)) {
1033 if (!dqp->q_core.d_itimer && dqp->q_core.d_id) { 1034 if (!dqp->q_core.d_itimer && dqp->q_core.d_id) {
1034 cmn_err(CE_DEBUG, 1035 xfs_debug(dqp->q_mount,
1035 "%d [%s] [0x%p] INO TIMER NOT STARTED", 1036 "%d [%s] INO TIMER NOT STARTED",
1036 d->d_id, DQFLAGTO_TYPESTR(d), d->q_mount); 1037 d->d_id, DQFLAGTO_TYPESTR(d));
1037 err++; 1038 err++;
1038 } 1039 }
1039 } 1040 }
1040#ifdef QUOTADEBUG 1041#ifdef QUOTADEBUG
1041 if (!err) { 1042 if (!err) {
1042 cmn_err(CE_DEBUG, "%d [%s] [0x%p] qchecked", 1043 xfs_debug(dqp->q_mount, "%d [%s] qchecked",
1043 d->d_id, DQFLAGTO_TYPESTR(d), d->q_mount); 1044 d->d_id, DQFLAGTO_TYPESTR(d));
1044 } 1045 }
1045#endif 1046#endif
1046 return (err); 1047 return (err);
@@ -1137,8 +1138,8 @@ xfs_qm_internalqcheck_adjust(
1137 1138
1138 if (ino == mp->m_sb.sb_uquotino || ino == mp->m_sb.sb_gquotino) { 1139 if (ino == mp->m_sb.sb_uquotino || ino == mp->m_sb.sb_gquotino) {
1139 *res = BULKSTAT_RV_NOTHING; 1140 *res = BULKSTAT_RV_NOTHING;
1140 qdprintk("internalqcheck: ino=%llu, uqino=%llu, gqino=%llu\n", 1141 xfs_debug(mp, "%s: ino=%llu, uqino=%llu, gqino=%llu\n",
1141 (unsigned long long) ino, 1142 __func__, (unsigned long long) ino,
1142 (unsigned long long) mp->m_sb.sb_uquotino, 1143 (unsigned long long) mp->m_sb.sb_uquotino,
1143 (unsigned long long) mp->m_sb.sb_gquotino); 1144 (unsigned long long) mp->m_sb.sb_gquotino);
1144 return XFS_ERROR(EINVAL); 1145 return XFS_ERROR(EINVAL);
@@ -1223,12 +1224,12 @@ xfs_qm_internalqcheck(
1223 xfs_qm_internalqcheck_adjust, 1224 xfs_qm_internalqcheck_adjust,
1224 0, NULL, &done); 1225 0, NULL, &done);
1225 if (error) { 1226 if (error) {
1226 cmn_err(CE_DEBUG, "Bulkstat returned error 0x%x", error); 1227 xfs_debug(mp, "Bulkstat returned error 0x%x", error);
1227 break; 1228 break;
1228 } 1229 }
1229 } while (!done); 1230 } while (!done);
1230 1231
1231 cmn_err(CE_DEBUG, "Checking results against system dquots"); 1232 xfs_debug(mp, "Checking results against system dquots");
1232 for (i = 0; i < qmtest_hashmask; i++) { 1233 for (i = 0; i < qmtest_hashmask; i++) {
1233 xfs_dqtest_t *d, *n; 1234 xfs_dqtest_t *d, *n;
1234 xfs_dqhash_t *h; 1235 xfs_dqhash_t *h;
@@ -1246,10 +1247,10 @@ xfs_qm_internalqcheck(
1246 } 1247 }
1247 1248
1248 if (qmtest_nfails) { 1249 if (qmtest_nfails) {
1249 cmn_err(CE_DEBUG, "******** quotacheck failed ********"); 1250 xfs_debug(mp, "******** quotacheck failed ********");
1250 cmn_err(CE_DEBUG, "failures = %d", qmtest_nfails); 1251 xfs_debug(mp, "failures = %d", qmtest_nfails);
1251 } else { 1252 } else {
1252 cmn_err(CE_DEBUG, "******** quotacheck successful! ********"); 1253 xfs_debug(mp, "******** quotacheck successful! ********");
1253 } 1254 }
1254 kmem_free(qmtest_udqtab); 1255 kmem_free(qmtest_udqtab);
1255 kmem_free(qmtest_gdqtab); 1256 kmem_free(qmtest_gdqtab);
diff --git a/fs/xfs/quota/xfs_trans_dquot.c b/fs/xfs/quota/xfs_trans_dquot.c
index 7de91d1b75c0..2a3648731331 100644
--- a/fs/xfs/quota/xfs_trans_dquot.c
+++ b/fs/xfs/quota/xfs_trans_dquot.c
@@ -643,8 +643,9 @@ xfs_trans_dqresv(
643 (XFS_IS_OQUOTA_ENFORCED(dqp->q_mount) && 643 (XFS_IS_OQUOTA_ENFORCED(dqp->q_mount) &&
644 (XFS_QM_ISPDQ(dqp) || XFS_QM_ISGDQ(dqp))))) { 644 (XFS_QM_ISPDQ(dqp) || XFS_QM_ISGDQ(dqp))))) {
645#ifdef QUOTADEBUG 645#ifdef QUOTADEBUG
646 cmn_err(CE_DEBUG, "BLK Res: nblks=%ld + resbcount=%Ld" 646 xfs_debug(mp,
647 " > hardlimit=%Ld?", nblks, *resbcountp, hardlimit); 647 "BLK Res: nblks=%ld + resbcount=%Ld > hardlimit=%Ld?",
648 nblks, *resbcountp, hardlimit);
648#endif 649#endif
649 if (nblks > 0) { 650 if (nblks > 0) {
650 /* 651 /*
diff --git a/fs/xfs/support/debug.c b/fs/xfs/support/debug.c
deleted file mode 100644
index 0df88897ef84..000000000000
--- a/fs/xfs/support/debug.c
+++ /dev/null
@@ -1,107 +0,0 @@
1/*
2 * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#include <xfs.h>
19#include "debug.h"
20
21/* xfs_mount.h drags a lot of crap in, sorry.. */
22#include "xfs_sb.h"
23#include "xfs_inum.h"
24#include "xfs_ag.h"
25#include "xfs_mount.h"
26#include "xfs_error.h"
27
28void
29cmn_err(
30 const char *lvl,
31 const char *fmt,
32 ...)
33{
34 struct va_format vaf;
35 va_list args;
36
37 va_start(args, fmt);
38 vaf.fmt = fmt;
39 vaf.va = &args;
40
41 printk("%s%pV", lvl, &vaf);
42 va_end(args);
43
44 BUG_ON(strncmp(lvl, KERN_EMERG, strlen(KERN_EMERG)) == 0);
45}
46
47void
48xfs_fs_cmn_err(
49 const char *lvl,
50 struct xfs_mount *mp,
51 const char *fmt,
52 ...)
53{
54 struct va_format vaf;
55 va_list args;
56
57 va_start(args, fmt);
58 vaf.fmt = fmt;
59 vaf.va = &args;
60
61 printk("%sFilesystem %s: %pV", lvl, mp->m_fsname, &vaf);
62 va_end(args);
63
64 BUG_ON(strncmp(lvl, KERN_EMERG, strlen(KERN_EMERG)) == 0);
65}
66
67/* All callers to xfs_cmn_err use CE_ALERT, so don't bother testing lvl */
68void
69xfs_cmn_err(
70 int panic_tag,
71 const char *lvl,
72 struct xfs_mount *mp,
73 const char *fmt,
74 ...)
75{
76 struct va_format vaf;
77 va_list args;
78 int do_panic = 0;
79
80 if (xfs_panic_mask && (xfs_panic_mask & panic_tag)) {
81 printk(KERN_ALERT "XFS: Transforming an alert into a BUG.");
82 do_panic = 1;
83 }
84
85 va_start(args, fmt);
86 vaf.fmt = fmt;
87 vaf.va = &args;
88
89 printk(KERN_ALERT "Filesystem %s: %pV", mp->m_fsname, &vaf);
90 va_end(args);
91
92 BUG_ON(do_panic);
93}
94
95void
96assfail(char *expr, char *file, int line)
97{
98 printk(KERN_CRIT "Assertion failed: %s, file: %s, line: %d\n", expr,
99 file, line);
100 BUG();
101}
102
103void
104xfs_hex_dump(void *p, int length)
105{
106 print_hex_dump(KERN_ALERT, "", DUMP_PREFIX_ADDRESS, 16, 1, p, length, 1);
107}
diff --git a/fs/xfs/support/debug.h b/fs/xfs/support/debug.h
deleted file mode 100644
index 05699f67d475..000000000000
--- a/fs/xfs/support/debug.h
+++ /dev/null
@@ -1,61 +0,0 @@
1/*
2 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_SUPPORT_DEBUG_H__
19#define __XFS_SUPPORT_DEBUG_H__
20
21#include <stdarg.h>
22
23struct xfs_mount;
24
25#define CE_DEBUG KERN_DEBUG
26#define CE_CONT KERN_INFO
27#define CE_NOTE KERN_NOTICE
28#define CE_WARN KERN_WARNING
29#define CE_ALERT KERN_ALERT
30#define CE_PANIC KERN_EMERG
31
32void cmn_err(const char *lvl, const char *fmt, ...)
33 __attribute__ ((format (printf, 2, 3)));
34void xfs_fs_cmn_err( const char *lvl, struct xfs_mount *mp,
35 const char *fmt, ...) __attribute__ ((format (printf, 3, 4)));
36void xfs_cmn_err( int panic_tag, const char *lvl, struct xfs_mount *mp,
37 const char *fmt, ...) __attribute__ ((format (printf, 4, 5)));
38
39extern void assfail(char *expr, char *f, int l);
40
41#define ASSERT_ALWAYS(expr) \
42 (unlikely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__))
43
44#ifndef DEBUG
45#define ASSERT(expr) ((void)0)
46
47#ifndef STATIC
48# define STATIC static noinline
49#endif
50
51#else /* DEBUG */
52
53#define ASSERT(expr) \
54 (unlikely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__))
55
56#ifndef STATIC
57# define STATIC noinline
58#endif
59
60#endif /* DEBUG */
61#endif /* __XFS_SUPPORT_DEBUG_H__ */
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index f3227984a9bf..4bc3c649aee4 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -147,10 +147,9 @@ xfs_alloc_get_rec(
147 */ 147 */
148STATIC void 148STATIC void
149xfs_alloc_compute_aligned( 149xfs_alloc_compute_aligned(
150 xfs_alloc_arg_t *args, /* allocation argument structure */
150 xfs_agblock_t foundbno, /* starting block in found extent */ 151 xfs_agblock_t foundbno, /* starting block in found extent */
151 xfs_extlen_t foundlen, /* length in found extent */ 152 xfs_extlen_t foundlen, /* length in found extent */
152 xfs_extlen_t alignment, /* alignment for allocation */
153 xfs_extlen_t minlen, /* minimum length for allocation */
154 xfs_agblock_t *resbno, /* result block number */ 153 xfs_agblock_t *resbno, /* result block number */
155 xfs_extlen_t *reslen) /* result length */ 154 xfs_extlen_t *reslen) /* result length */
156{ 155{
@@ -158,8 +157,8 @@ xfs_alloc_compute_aligned(
158 xfs_extlen_t diff; 157 xfs_extlen_t diff;
159 xfs_extlen_t len; 158 xfs_extlen_t len;
160 159
161 if (alignment > 1 && foundlen >= minlen) { 160 if (args->alignment > 1 && foundlen >= args->minlen) {
162 bno = roundup(foundbno, alignment); 161 bno = roundup(foundbno, args->alignment);
163 diff = bno - foundbno; 162 diff = bno - foundbno;
164 len = diff >= foundlen ? 0 : foundlen - diff; 163 len = diff >= foundlen ? 0 : foundlen - diff;
165 } else { 164 } else {
@@ -464,6 +463,27 @@ xfs_alloc_read_agfl(
464 return 0; 463 return 0;
465} 464}
466 465
466STATIC int
467xfs_alloc_update_counters(
468 struct xfs_trans *tp,
469 struct xfs_perag *pag,
470 struct xfs_buf *agbp,
471 long len)
472{
473 struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp);
474
475 pag->pagf_freeblks += len;
476 be32_add_cpu(&agf->agf_freeblks, len);
477
478 xfs_trans_agblocks_delta(tp, len);
479 if (unlikely(be32_to_cpu(agf->agf_freeblks) >
480 be32_to_cpu(agf->agf_length)))
481 return EFSCORRUPTED;
482
483 xfs_alloc_log_agf(tp, agbp, XFS_AGF_FREEBLKS);
484 return 0;
485}
486
467/* 487/*
468 * Allocation group level functions. 488 * Allocation group level functions.
469 */ 489 */
@@ -505,49 +525,44 @@ xfs_alloc_ag_vextent(
505 ASSERT(0); 525 ASSERT(0);
506 /* NOTREACHED */ 526 /* NOTREACHED */
507 } 527 }
508 if (error) 528
529 if (error || args->agbno == NULLAGBLOCK)
509 return error; 530 return error;
510 /*
511 * If the allocation worked, need to change the agf structure
512 * (and log it), and the superblock.
513 */
514 if (args->agbno != NULLAGBLOCK) {
515 xfs_agf_t *agf; /* allocation group freelist header */
516 long slen = (long)args->len;
517 531
518 ASSERT(args->len >= args->minlen && args->len <= args->maxlen); 532 ASSERT(args->len >= args->minlen);
519 ASSERT(!(args->wasfromfl) || !args->isfl); 533 ASSERT(args->len <= args->maxlen);
520 ASSERT(args->agbno % args->alignment == 0); 534 ASSERT(!args->wasfromfl || !args->isfl);
521 if (!(args->wasfromfl)) { 535 ASSERT(args->agbno % args->alignment == 0);
522 536
523 agf = XFS_BUF_TO_AGF(args->agbp); 537 if (!args->wasfromfl) {
524 be32_add_cpu(&agf->agf_freeblks, -(args->len)); 538 error = xfs_alloc_update_counters(args->tp, args->pag,
525 xfs_trans_agblocks_delta(args->tp, 539 args->agbp,
526 -((long)(args->len))); 540 -((long)(args->len)));
527 args->pag->pagf_freeblks -= args->len; 541 if (error)
528 ASSERT(be32_to_cpu(agf->agf_freeblks) <= 542 return error;
529 be32_to_cpu(agf->agf_length)); 543
530 xfs_alloc_log_agf(args->tp, args->agbp, 544 /*
531 XFS_AGF_FREEBLKS); 545 * Search the busylist for these blocks and mark the
532 /* 546 * transaction as synchronous if blocks are found. This
533 * Search the busylist for these blocks and mark the 547 * avoids the need to block due to a synchronous log
534 * transaction as synchronous if blocks are found. This 548 * force to ensure correct ordering as the synchronous
535 * avoids the need to block due to a synchronous log 549 * transaction will guarantee that for us.
536 * force to ensure correct ordering as the synchronous 550 */
537 * transaction will guarantee that for us. 551 if (xfs_alloc_busy_search(args->mp, args->agno,
538 */ 552 args->agbno, args->len))
539 if (xfs_alloc_busy_search(args->mp, args->agno, 553 xfs_trans_set_sync(args->tp);
540 args->agbno, args->len))
541 xfs_trans_set_sync(args->tp);
542 }
543 if (!args->isfl)
544 xfs_trans_mod_sb(args->tp,
545 args->wasdel ? XFS_TRANS_SB_RES_FDBLOCKS :
546 XFS_TRANS_SB_FDBLOCKS, -slen);
547 XFS_STATS_INC(xs_allocx);
548 XFS_STATS_ADD(xs_allocb, args->len);
549 } 554 }
550 return 0; 555
556 if (!args->isfl) {
557 xfs_trans_mod_sb(args->tp, args->wasdel ?
558 XFS_TRANS_SB_RES_FDBLOCKS :
559 XFS_TRANS_SB_FDBLOCKS,
560 -((long)(args->len)));
561 }
562
563 XFS_STATS_INC(xs_allocx);
564 XFS_STATS_ADD(xs_allocb, args->len);
565 return error;
551} 566}
552 567
553/* 568/*
@@ -693,8 +708,7 @@ xfs_alloc_find_best_extent(
693 if (error) 708 if (error)
694 goto error0; 709 goto error0;
695 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 710 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
696 xfs_alloc_compute_aligned(*sbno, *slen, args->alignment, 711 xfs_alloc_compute_aligned(args, *sbno, *slen, &bno, slena);
697 args->minlen, &bno, slena);
698 712
699 /* 713 /*
700 * The good extent is closer than this one. 714 * The good extent is closer than this one.
@@ -866,8 +880,8 @@ xfs_alloc_ag_vextent_near(
866 if ((error = xfs_alloc_get_rec(cnt_cur, &ltbno, &ltlen, &i))) 880 if ((error = xfs_alloc_get_rec(cnt_cur, &ltbno, &ltlen, &i)))
867 goto error0; 881 goto error0;
868 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 882 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
869 xfs_alloc_compute_aligned(ltbno, ltlen, args->alignment, 883 xfs_alloc_compute_aligned(args, ltbno, ltlen,
870 args->minlen, &ltbnoa, &ltlena); 884 &ltbnoa, &ltlena);
871 if (ltlena < args->minlen) 885 if (ltlena < args->minlen)
872 continue; 886 continue;
873 args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen); 887 args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen);
@@ -987,8 +1001,8 @@ xfs_alloc_ag_vextent_near(
987 if ((error = xfs_alloc_get_rec(bno_cur_lt, &ltbno, &ltlen, &i))) 1001 if ((error = xfs_alloc_get_rec(bno_cur_lt, &ltbno, &ltlen, &i)))
988 goto error0; 1002 goto error0;
989 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1003 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
990 xfs_alloc_compute_aligned(ltbno, ltlen, args->alignment, 1004 xfs_alloc_compute_aligned(args, ltbno, ltlen,
991 args->minlen, &ltbnoa, &ltlena); 1005 &ltbnoa, &ltlena);
992 if (ltlena >= args->minlen) 1006 if (ltlena >= args->minlen)
993 break; 1007 break;
994 if ((error = xfs_btree_decrement(bno_cur_lt, 0, &i))) 1008 if ((error = xfs_btree_decrement(bno_cur_lt, 0, &i)))
@@ -1003,8 +1017,8 @@ xfs_alloc_ag_vextent_near(
1003 if ((error = xfs_alloc_get_rec(bno_cur_gt, &gtbno, &gtlen, &i))) 1017 if ((error = xfs_alloc_get_rec(bno_cur_gt, &gtbno, &gtlen, &i)))
1004 goto error0; 1018 goto error0;
1005 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1019 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1006 xfs_alloc_compute_aligned(gtbno, gtlen, args->alignment, 1020 xfs_alloc_compute_aligned(args, gtbno, gtlen,
1007 args->minlen, &gtbnoa, &gtlena); 1021 &gtbnoa, &gtlena);
1008 if (gtlena >= args->minlen) 1022 if (gtlena >= args->minlen)
1009 break; 1023 break;
1010 if ((error = xfs_btree_increment(bno_cur_gt, 0, &i))) 1024 if ((error = xfs_btree_increment(bno_cur_gt, 0, &i)))
@@ -1183,8 +1197,7 @@ xfs_alloc_ag_vextent_size(
1183 * once aligned; if not, we search left for something better. 1197 * once aligned; if not, we search left for something better.
1184 * This can't happen in the second case above. 1198 * This can't happen in the second case above.
1185 */ 1199 */
1186 xfs_alloc_compute_aligned(fbno, flen, args->alignment, args->minlen, 1200 xfs_alloc_compute_aligned(args, fbno, flen, &rbno, &rlen);
1187 &rbno, &rlen);
1188 rlen = XFS_EXTLEN_MIN(args->maxlen, rlen); 1201 rlen = XFS_EXTLEN_MIN(args->maxlen, rlen);
1189 XFS_WANT_CORRUPTED_GOTO(rlen == 0 || 1202 XFS_WANT_CORRUPTED_GOTO(rlen == 0 ||
1190 (rlen <= flen && rbno + rlen <= fbno + flen), error0); 1203 (rlen <= flen && rbno + rlen <= fbno + flen), error0);
@@ -1209,8 +1222,8 @@ xfs_alloc_ag_vextent_size(
1209 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1222 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1210 if (flen < bestrlen) 1223 if (flen < bestrlen)
1211 break; 1224 break;
1212 xfs_alloc_compute_aligned(fbno, flen, args->alignment, 1225 xfs_alloc_compute_aligned(args, fbno, flen,
1213 args->minlen, &rbno, &rlen); 1226 &rbno, &rlen);
1214 rlen = XFS_EXTLEN_MIN(args->maxlen, rlen); 1227 rlen = XFS_EXTLEN_MIN(args->maxlen, rlen);
1215 XFS_WANT_CORRUPTED_GOTO(rlen == 0 || 1228 XFS_WANT_CORRUPTED_GOTO(rlen == 0 ||
1216 (rlen <= flen && rbno + rlen <= fbno + flen), 1229 (rlen <= flen && rbno + rlen <= fbno + flen),
@@ -1388,6 +1401,7 @@ xfs_free_ag_extent(
1388 xfs_mount_t *mp; /* mount point struct for filesystem */ 1401 xfs_mount_t *mp; /* mount point struct for filesystem */
1389 xfs_agblock_t nbno; /* new starting block of freespace */ 1402 xfs_agblock_t nbno; /* new starting block of freespace */
1390 xfs_extlen_t nlen; /* new length of freespace */ 1403 xfs_extlen_t nlen; /* new length of freespace */
1404 xfs_perag_t *pag; /* per allocation group data */
1391 1405
1392 mp = tp->t_mountp; 1406 mp = tp->t_mountp;
1393 /* 1407 /*
@@ -1586,30 +1600,20 @@ xfs_free_ag_extent(
1586 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1600 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1587 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); 1601 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
1588 cnt_cur = NULL; 1602 cnt_cur = NULL;
1603
1589 /* 1604 /*
1590 * Update the freespace totals in the ag and superblock. 1605 * Update the freespace totals in the ag and superblock.
1591 */ 1606 */
1592 { 1607 pag = xfs_perag_get(mp, agno);
1593 xfs_agf_t *agf; 1608 error = xfs_alloc_update_counters(tp, pag, agbp, len);
1594 xfs_perag_t *pag; /* per allocation group data */ 1609 xfs_perag_put(pag);
1595 1610 if (error)
1596 pag = xfs_perag_get(mp, agno); 1611 goto error0;
1597 pag->pagf_freeblks += len; 1612
1598 xfs_perag_put(pag); 1613 if (!isfl)
1599 1614 xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, (long)len);
1600 agf = XFS_BUF_TO_AGF(agbp); 1615 XFS_STATS_INC(xs_freex);
1601 be32_add_cpu(&agf->agf_freeblks, len); 1616 XFS_STATS_ADD(xs_freeb, len);
1602 xfs_trans_agblocks_delta(tp, len);
1603 XFS_WANT_CORRUPTED_GOTO(
1604 be32_to_cpu(agf->agf_freeblks) <=
1605 be32_to_cpu(agf->agf_length),
1606 error0);
1607 xfs_alloc_log_agf(tp, agbp, XFS_AGF_FREEBLKS);
1608 if (!isfl)
1609 xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, (long)len);
1610 XFS_STATS_INC(xs_freex);
1611 XFS_STATS_ADD(xs_freeb, len);
1612 }
1613 1617
1614 trace_xfs_free_extent(mp, agno, bno, len, isfl, haveleft, haveright); 1618 trace_xfs_free_extent(mp, agno, bno, len, isfl, haveleft, haveright);
1615 1619
diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h
index 0ab56b32c7eb..d0b3bc72005b 100644
--- a/fs/xfs/xfs_alloc.h
+++ b/fs/xfs/xfs_alloc.h
@@ -75,6 +75,22 @@ typedef unsigned int xfs_alloctype_t;
75#define XFS_ALLOC_SET_ASIDE(mp) (4 + ((mp)->m_sb.sb_agcount * 4)) 75#define XFS_ALLOC_SET_ASIDE(mp) (4 + ((mp)->m_sb.sb_agcount * 4))
76 76
77/* 77/*
78 * When deciding how much space to allocate out of an AG, we limit the
79 * allocation maximum size to the size the AG. However, we cannot use all the
80 * blocks in the AG - some are permanently used by metadata. These
81 * blocks are generally:
82 * - the AG superblock, AGF, AGI and AGFL
83 * - the AGF (bno and cnt) and AGI btree root blocks
84 * - 4 blocks on the AGFL according to XFS_ALLOC_SET_ASIDE() limits
85 *
86 * The AG headers are sector sized, so the amount of space they take up is
87 * dependent on filesystem geometry. The others are all single blocks.
88 */
89#define XFS_ALLOC_AG_MAX_USABLE(mp) \
90 ((mp)->m_sb.sb_agblocks - XFS_BB_TO_FSB(mp, XFS_FSS_TO_BB(mp, 4)) - 7)
91
92
93/*
78 * Argument structure for xfs_alloc routines. 94 * Argument structure for xfs_alloc routines.
79 * This is turned into a structure to avoid having 20 arguments passed 95 * This is turned into a structure to avoid having 20 arguments passed
80 * down several levels of the stack. 96 * down several levels of the stack.
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 4111cd3966c7..fa00788de2f5 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -1038,17 +1038,34 @@ xfs_bmap_add_extent_delay_real(
1038 * Filling in the middle part of a previous delayed allocation. 1038 * Filling in the middle part of a previous delayed allocation.
1039 * Contiguity is impossible here. 1039 * Contiguity is impossible here.
1040 * This case is avoided almost all the time. 1040 * This case is avoided almost all the time.
1041 *
1042 * We start with a delayed allocation:
1043 *
1044 * +ddddddddddddddddddddddddddddddddddddddddddddddddddddddd+
1045 * PREV @ idx
1046 *
1047 * and we are allocating:
1048 * +rrrrrrrrrrrrrrrrr+
1049 * new
1050 *
1051 * and we set it up for insertion as:
1052 * +ddddddddddddddddddd+rrrrrrrrrrrrrrrrr+ddddddddddddddddd+
1053 * new
1054 * PREV @ idx LEFT RIGHT
1055 * inserted at idx + 1
1041 */ 1056 */
1042 temp = new->br_startoff - PREV.br_startoff; 1057 temp = new->br_startoff - PREV.br_startoff;
1043 trace_xfs_bmap_pre_update(ip, idx, 0, _THIS_IP_);
1044 xfs_bmbt_set_blockcount(ep, temp);
1045 r[0] = *new;
1046 r[1].br_state = PREV.br_state;
1047 r[1].br_startblock = 0;
1048 r[1].br_startoff = new_endoff;
1049 temp2 = PREV.br_startoff + PREV.br_blockcount - new_endoff; 1058 temp2 = PREV.br_startoff + PREV.br_blockcount - new_endoff;
1050 r[1].br_blockcount = temp2; 1059 trace_xfs_bmap_pre_update(ip, idx, 0, _THIS_IP_);
1051 xfs_iext_insert(ip, idx + 1, 2, &r[0], state); 1060 xfs_bmbt_set_blockcount(ep, temp); /* truncate PREV */
1061 LEFT = *new;
1062 RIGHT.br_state = PREV.br_state;
1063 RIGHT.br_startblock = nullstartblock(
1064 (int)xfs_bmap_worst_indlen(ip, temp2));
1065 RIGHT.br_startoff = new_endoff;
1066 RIGHT.br_blockcount = temp2;
1067 /* insert LEFT (r[0]) and RIGHT (r[1]) at the same time */
1068 xfs_iext_insert(ip, idx + 1, 2, &LEFT, state);
1052 ip->i_df.if_lastex = idx + 1; 1069 ip->i_df.if_lastex = idx + 1;
1053 ip->i_d.di_nextents++; 1070 ip->i_d.di_nextents++;
1054 if (cur == NULL) 1071 if (cur == NULL)
@@ -2348,6 +2365,13 @@ xfs_bmap_rtalloc(
2348 */ 2365 */
2349 if (ralen * mp->m_sb.sb_rextsize >= MAXEXTLEN) 2366 if (ralen * mp->m_sb.sb_rextsize >= MAXEXTLEN)
2350 ralen = MAXEXTLEN / mp->m_sb.sb_rextsize; 2367 ralen = MAXEXTLEN / mp->m_sb.sb_rextsize;
2368
2369 /*
2370 * Lock out other modifications to the RT bitmap inode.
2371 */
2372 xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL);
2373 xfs_trans_ijoin_ref(ap->tp, mp->m_rbmip, XFS_ILOCK_EXCL);
2374
2351 /* 2375 /*
2352 * If it's an allocation to an empty file at offset 0, 2376 * If it's an allocation to an empty file at offset 0,
2353 * pick an extent that will space things out in the rt area. 2377 * pick an extent that will space things out in the rt area.
@@ -2430,7 +2454,7 @@ xfs_bmap_btalloc_nullfb(
2430 startag = ag = 0; 2454 startag = ag = 0;
2431 2455
2432 pag = xfs_perag_get(mp, ag); 2456 pag = xfs_perag_get(mp, ag);
2433 while (*blen < ap->alen) { 2457 while (*blen < args->maxlen) {
2434 if (!pag->pagf_init) { 2458 if (!pag->pagf_init) {
2435 error = xfs_alloc_pagf_init(mp, args->tp, ag, 2459 error = xfs_alloc_pagf_init(mp, args->tp, ag,
2436 XFS_ALLOC_FLAG_TRYLOCK); 2460 XFS_ALLOC_FLAG_TRYLOCK);
@@ -2452,7 +2476,7 @@ xfs_bmap_btalloc_nullfb(
2452 notinit = 1; 2476 notinit = 1;
2453 2477
2454 if (xfs_inode_is_filestream(ap->ip)) { 2478 if (xfs_inode_is_filestream(ap->ip)) {
2455 if (*blen >= ap->alen) 2479 if (*blen >= args->maxlen)
2456 break; 2480 break;
2457 2481
2458 if (ap->userdata) { 2482 if (ap->userdata) {
@@ -2498,14 +2522,14 @@ xfs_bmap_btalloc_nullfb(
2498 * If the best seen length is less than the request 2522 * If the best seen length is less than the request
2499 * length, use the best as the minimum. 2523 * length, use the best as the minimum.
2500 */ 2524 */
2501 else if (*blen < ap->alen) 2525 else if (*blen < args->maxlen)
2502 args->minlen = *blen; 2526 args->minlen = *blen;
2503 /* 2527 /*
2504 * Otherwise we've seen an extent as big as alen, 2528 * Otherwise we've seen an extent as big as maxlen,
2505 * use that as the minimum. 2529 * use that as the minimum.
2506 */ 2530 */
2507 else 2531 else
2508 args->minlen = ap->alen; 2532 args->minlen = args->maxlen;
2509 2533
2510 /* 2534 /*
2511 * set the failure fallback case to look in the selected 2535 * set the failure fallback case to look in the selected
@@ -2573,7 +2597,9 @@ xfs_bmap_btalloc(
2573 args.tp = ap->tp; 2597 args.tp = ap->tp;
2574 args.mp = mp; 2598 args.mp = mp;
2575 args.fsbno = ap->rval; 2599 args.fsbno = ap->rval;
2576 args.maxlen = MIN(ap->alen, mp->m_sb.sb_agblocks); 2600
2601 /* Trim the allocation back to the maximum an AG can fit. */
2602 args.maxlen = MIN(ap->alen, XFS_ALLOC_AG_MAX_USABLE(mp));
2577 args.firstblock = ap->firstblock; 2603 args.firstblock = ap->firstblock;
2578 blen = 0; 2604 blen = 0;
2579 if (nullfb) { 2605 if (nullfb) {
@@ -2621,7 +2647,7 @@ xfs_bmap_btalloc(
2621 /* 2647 /*
2622 * Adjust for alignment 2648 * Adjust for alignment
2623 */ 2649 */
2624 if (blen > args.alignment && blen <= ap->alen) 2650 if (blen > args.alignment && blen <= args.maxlen)
2625 args.minlen = blen - args.alignment; 2651 args.minlen = blen - args.alignment;
2626 args.minalignslop = 0; 2652 args.minalignslop = 0;
2627 } else { 2653 } else {
@@ -2640,7 +2666,7 @@ xfs_bmap_btalloc(
2640 * of minlen+alignment+slop doesn't go up 2666 * of minlen+alignment+slop doesn't go up
2641 * between the calls. 2667 * between the calls.
2642 */ 2668 */
2643 if (blen > mp->m_dalign && blen <= ap->alen) 2669 if (blen > mp->m_dalign && blen <= args.maxlen)
2644 nextminlen = blen - mp->m_dalign; 2670 nextminlen = blen - mp->m_dalign;
2645 else 2671 else
2646 nextminlen = args.minlen; 2672 nextminlen = args.minlen;
@@ -3500,7 +3526,7 @@ xfs_bmap_search_extents(
3500 3526
3501 if (unlikely(!(gotp->br_startblock) && (*lastxp != NULLEXTNUM) && 3527 if (unlikely(!(gotp->br_startblock) && (*lastxp != NULLEXTNUM) &&
3502 !(XFS_IS_REALTIME_INODE(ip) && fork == XFS_DATA_FORK))) { 3528 !(XFS_IS_REALTIME_INODE(ip) && fork == XFS_DATA_FORK))) {
3503 xfs_cmn_err(XFS_PTAG_FSBLOCK_ZERO, CE_ALERT, ip->i_mount, 3529 xfs_alert_tag(ip->i_mount, XFS_PTAG_FSBLOCK_ZERO,
3504 "Access to block zero in inode %llu " 3530 "Access to block zero in inode %llu "
3505 "start_block: %llx start_off: %llx " 3531 "start_block: %llx start_off: %llx "
3506 "blkcnt: %llx extent-state: %x lastx: %x\n", 3532 "blkcnt: %llx extent-state: %x lastx: %x\n",
@@ -4174,12 +4200,11 @@ xfs_bmap_read_extents(
4174 num_recs = xfs_btree_get_numrecs(block); 4200 num_recs = xfs_btree_get_numrecs(block);
4175 if (unlikely(i + num_recs > room)) { 4201 if (unlikely(i + num_recs > room)) {
4176 ASSERT(i + num_recs <= room); 4202 ASSERT(i + num_recs <= room);
4177 xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, 4203 xfs_warn(ip->i_mount,
4178 "corrupt dinode %Lu, (btree extents).", 4204 "corrupt dinode %Lu, (btree extents).",
4179 (unsigned long long) ip->i_ino); 4205 (unsigned long long) ip->i_ino);
4180 XFS_ERROR_REPORT("xfs_bmap_read_extents(1)", 4206 XFS_CORRUPTION_ERROR("xfs_bmap_read_extents(1)",
4181 XFS_ERRLEVEL_LOW, 4207 XFS_ERRLEVEL_LOW, ip->i_mount, block);
4182 ip->i_mount);
4183 goto error0; 4208 goto error0;
4184 } 4209 }
4185 XFS_WANT_CORRUPTED_GOTO( 4210 XFS_WANT_CORRUPTED_GOTO(
@@ -4485,6 +4510,16 @@ xfs_bmapi(
4485 /* Figure out the extent size, adjust alen */ 4510 /* Figure out the extent size, adjust alen */
4486 extsz = xfs_get_extsz_hint(ip); 4511 extsz = xfs_get_extsz_hint(ip);
4487 if (extsz) { 4512 if (extsz) {
4513 /*
4514 * make sure we don't exceed a single
4515 * extent length when we align the
4516 * extent by reducing length we are
4517 * going to allocate by the maximum
4518 * amount extent size aligment may
4519 * require.
4520 */
4521 alen = XFS_FILBLKS_MIN(len,
4522 MAXEXTLEN - (2 * extsz - 1));
4488 error = xfs_bmap_extsize_align(mp, 4523 error = xfs_bmap_extsize_align(mp,
4489 &got, &prev, extsz, 4524 &got, &prev, extsz,
4490 rt, eof, 4525 rt, eof,
@@ -5743,7 +5778,7 @@ xfs_check_block(
5743 else 5778 else
5744 thispa = XFS_BMBT_PTR_ADDR(mp, block, j, dmxr); 5779 thispa = XFS_BMBT_PTR_ADDR(mp, block, j, dmxr);
5745 if (*thispa == *pp) { 5780 if (*thispa == *pp) {
5746 cmn_err(CE_WARN, "%s: thispa(%d) == pp(%d) %Ld", 5781 xfs_warn(mp, "%s: thispa(%d) == pp(%d) %Ld",
5747 __func__, j, i, 5782 __func__, j, i,
5748 (unsigned long long)be64_to_cpu(*thispa)); 5783 (unsigned long long)be64_to_cpu(*thispa));
5749 panic("%s: ptrs are equal in node\n", 5784 panic("%s: ptrs are equal in node\n",
@@ -5908,11 +5943,11 @@ xfs_bmap_check_leaf_extents(
5908 return; 5943 return;
5909 5944
5910error0: 5945error0:
5911 cmn_err(CE_WARN, "%s: at error0", __func__); 5946 xfs_warn(mp, "%s: at error0", __func__);
5912 if (bp_release) 5947 if (bp_release)
5913 xfs_trans_brelse(NULL, bp); 5948 xfs_trans_brelse(NULL, bp);
5914error_norelse: 5949error_norelse:
5915 cmn_err(CE_WARN, "%s: BAD after btree leaves for %d extents", 5950 xfs_warn(mp, "%s: BAD after btree leaves for %d extents",
5916 __func__, i); 5951 __func__, i);
5917 panic("%s: CORRUPTED BTREE OR SOMETHING", __func__); 5952 panic("%s: CORRUPTED BTREE OR SOMETHING", __func__);
5918 return; 5953 return;
@@ -6115,7 +6150,7 @@ xfs_bmap_punch_delalloc_range(
6115 if (error) { 6150 if (error) {
6116 /* something screwed, just bail */ 6151 /* something screwed, just bail */
6117 if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { 6152 if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
6118 xfs_fs_cmn_err(CE_ALERT, ip->i_mount, 6153 xfs_alert(ip->i_mount,
6119 "Failed delalloc mapping lookup ino %lld fsb %lld.", 6154 "Failed delalloc mapping lookup ino %lld fsb %lld.",
6120 ip->i_ino, start_fsb); 6155 ip->i_ino, start_fsb);
6121 } 6156 }
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 98c6f73b6752..e5413d96f1af 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -130,10 +130,12 @@ xfs_buf_item_log_check(
130 orig = bip->bli_orig; 130 orig = bip->bli_orig;
131 buffer = XFS_BUF_PTR(bp); 131 buffer = XFS_BUF_PTR(bp);
132 for (x = 0; x < XFS_BUF_COUNT(bp); x++) { 132 for (x = 0; x < XFS_BUF_COUNT(bp); x++) {
133 if (orig[x] != buffer[x] && !btst(bip->bli_logged, x)) 133 if (orig[x] != buffer[x] && !btst(bip->bli_logged, x)) {
134 cmn_err(CE_PANIC, 134 xfs_emerg(bp->b_mount,
135 "xfs_buf_item_log_check bip %x buffer %x orig %x index %d", 135 "%s: bip %x buffer %x orig %x index %d",
136 bip, bp, orig, x); 136 __func__, bip, bp, orig, x);
137 ASSERT(0);
138 }
137 } 139 }
138} 140}
139#else 141#else
@@ -427,13 +429,15 @@ xfs_buf_item_unpin(
427 429
428 if (remove) { 430 if (remove) {
429 /* 431 /*
430 * We have to remove the log item from the transaction 432 * If we are in a transaction context, we have to
431 * as we are about to release our reference to the 433 * remove the log item from the transaction as we are
432 * buffer. If we don't, the unlock that occurs later 434 * about to release our reference to the buffer. If we
433 * in xfs_trans_uncommit() will ry to reference the 435 * don't, the unlock that occurs later in
436 * xfs_trans_uncommit() will try to reference the
434 * buffer which we no longer have a hold on. 437 * buffer which we no longer have a hold on.
435 */ 438 */
436 xfs_trans_del_item(lip); 439 if (lip->li_desc)
440 xfs_trans_del_item(lip);
437 441
438 /* 442 /*
439 * Since the transaction no longer refers to the buffer, 443 * Since the transaction no longer refers to the buffer,
@@ -981,10 +985,9 @@ xfs_buf_iodone_callbacks(
981 if (XFS_BUF_TARGET(bp) != lasttarg || 985 if (XFS_BUF_TARGET(bp) != lasttarg ||
982 time_after(jiffies, (lasttime + 5*HZ))) { 986 time_after(jiffies, (lasttime + 5*HZ))) {
983 lasttime = jiffies; 987 lasttime = jiffies;
984 cmn_err(CE_ALERT, "Device %s, XFS metadata write error" 988 xfs_alert(mp, "Device %s: metadata write error block 0x%llx",
985 " block 0x%llx in %s",
986 XFS_BUFTARG_NAME(XFS_BUF_TARGET(bp)), 989 XFS_BUFTARG_NAME(XFS_BUF_TARGET(bp)),
987 (__uint64_t)XFS_BUF_ADDR(bp), mp->m_fsname); 990 (__uint64_t)XFS_BUF_ADDR(bp));
988 } 991 }
989 lasttarg = XFS_BUF_TARGET(bp); 992 lasttarg = XFS_BUF_TARGET(bp);
990 993
diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c
index 1c00bedb3175..6102ac6d1dff 100644
--- a/fs/xfs/xfs_da_btree.c
+++ b/fs/xfs/xfs_da_btree.c
@@ -1995,13 +1995,12 @@ xfs_da_do_buf(
1995 error = mappedbno == -2 ? 0 : XFS_ERROR(EFSCORRUPTED); 1995 error = mappedbno == -2 ? 0 : XFS_ERROR(EFSCORRUPTED);
1996 if (unlikely(error == EFSCORRUPTED)) { 1996 if (unlikely(error == EFSCORRUPTED)) {
1997 if (xfs_error_level >= XFS_ERRLEVEL_LOW) { 1997 if (xfs_error_level >= XFS_ERRLEVEL_LOW) {
1998 cmn_err(CE_ALERT, "xfs_da_do_buf: bno %lld\n", 1998 xfs_alert(mp, "%s: bno %lld dir: inode %lld",
1999 (long long)bno); 1999 __func__, (long long)bno,
2000 cmn_err(CE_ALERT, "dir: inode %lld\n",
2001 (long long)dp->i_ino); 2000 (long long)dp->i_ino);
2002 for (i = 0; i < nmap; i++) { 2001 for (i = 0; i < nmap; i++) {
2003 cmn_err(CE_ALERT, 2002 xfs_alert(mp,
2004 "[%02d] br_startoff %lld br_startblock %lld br_blockcount %lld br_state %d\n", 2003"[%02d] br_startoff %lld br_startblock %lld br_blockcount %lld br_state %d",
2005 i, 2004 i,
2006 (long long)mapp[i].br_startoff, 2005 (long long)mapp[i].br_startoff,
2007 (long long)mapp[i].br_startblock, 2006 (long long)mapp[i].br_startblock,
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
index e60490bc00a6..be628677c288 100644
--- a/fs/xfs/xfs_dfrag.c
+++ b/fs/xfs/xfs_dfrag.c
@@ -270,9 +270,9 @@ xfs_swap_extents(
270 /* check inode formats now that data is flushed */ 270 /* check inode formats now that data is flushed */
271 error = xfs_swap_extents_check_format(ip, tip); 271 error = xfs_swap_extents_check_format(ip, tip);
272 if (error) { 272 if (error) {
273 xfs_fs_cmn_err(CE_NOTE, mp, 273 xfs_notice(mp,
274 "%s: inode 0x%llx format is incompatible for exchanging.", 274 "%s: inode 0x%llx format is incompatible for exchanging.",
275 __FILE__, ip->i_ino); 275 __func__, ip->i_ino);
276 goto out_unlock; 276 goto out_unlock;
277 } 277 }
278 278
diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c
index a1321bc7f192..dba7a71cedf3 100644
--- a/fs/xfs/xfs_dir2.c
+++ b/fs/xfs/xfs_dir2.c
@@ -159,7 +159,7 @@ xfs_dir_ino_validate(
159 XFS_AGINO_TO_INO(mp, agno, agino) == ino; 159 XFS_AGINO_TO_INO(mp, agno, agino) == ino;
160 if (unlikely(XFS_TEST_ERROR(!ino_ok, mp, XFS_ERRTAG_DIR_INO_VALIDATE, 160 if (unlikely(XFS_TEST_ERROR(!ino_ok, mp, XFS_ERRTAG_DIR_INO_VALIDATE,
161 XFS_RANDOM_DIR_INO_VALIDATE))) { 161 XFS_RANDOM_DIR_INO_VALIDATE))) {
162 xfs_fs_cmn_err(CE_WARN, mp, "Invalid inode number 0x%Lx", 162 xfs_warn(mp, "Invalid inode number 0x%Lx",
163 (unsigned long long) ino); 163 (unsigned long long) ino);
164 XFS_ERROR_REPORT("xfs_dir_ino_validate", XFS_ERRLEVEL_LOW, mp); 164 XFS_ERROR_REPORT("xfs_dir_ino_validate", XFS_ERRLEVEL_LOW, mp);
165 return XFS_ERROR(EFSCORRUPTED); 165 return XFS_ERROR(EFSCORRUPTED);
diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c
index f9a0864b696a..a0aab7d3294f 100644
--- a/fs/xfs/xfs_dir2_node.c
+++ b/fs/xfs/xfs_dir2_node.c
@@ -899,10 +899,9 @@ xfs_dir2_leafn_rebalance(
899 if(blk2->index < 0) { 899 if(blk2->index < 0) {
900 state->inleaf = 1; 900 state->inleaf = 1;
901 blk2->index = 0; 901 blk2->index = 0;
902 cmn_err(CE_ALERT, 902 xfs_alert(args->dp->i_mount,
903 "xfs_dir2_leafn_rebalance: picked the wrong leaf? reverting original leaf: " 903 "%s: picked the wrong leaf? reverting original leaf: blk1->index %d\n",
904 "blk1->index %d\n", 904 __func__, blk1->index);
905 blk1->index);
906 } 905 }
907} 906}
908 907
@@ -1641,26 +1640,22 @@ xfs_dir2_node_addname_int(
1641 } 1640 }
1642 1641
1643 if (unlikely(xfs_dir2_db_to_fdb(mp, dbno) != fbno)) { 1642 if (unlikely(xfs_dir2_db_to_fdb(mp, dbno) != fbno)) {
1644 cmn_err(CE_ALERT, 1643 xfs_alert(mp,
1645 "xfs_dir2_node_addname_int: dir ino " 1644 "%s: dir ino " "%llu needed freesp block %lld for\n"
1646 "%llu needed freesp block %lld for\n" 1645 " data block %lld, got %lld ifbno %llu lastfbno %d",
1647 " data block %lld, got %lld\n" 1646 __func__, (unsigned long long)dp->i_ino,
1648 " ifbno %llu lastfbno %d\n",
1649 (unsigned long long)dp->i_ino,
1650 (long long)xfs_dir2_db_to_fdb(mp, dbno), 1647 (long long)xfs_dir2_db_to_fdb(mp, dbno),
1651 (long long)dbno, (long long)fbno, 1648 (long long)dbno, (long long)fbno,
1652 (unsigned long long)ifbno, lastfbno); 1649 (unsigned long long)ifbno, lastfbno);
1653 if (fblk) { 1650 if (fblk) {
1654 cmn_err(CE_ALERT, 1651 xfs_alert(mp,
1655 " fblk 0x%p blkno %llu " 1652 " fblk 0x%p blkno %llu index %d magic 0x%x",
1656 "index %d magic 0x%x\n",
1657 fblk, 1653 fblk,
1658 (unsigned long long)fblk->blkno, 1654 (unsigned long long)fblk->blkno,
1659 fblk->index, 1655 fblk->index,
1660 fblk->magic); 1656 fblk->magic);
1661 } else { 1657 } else {
1662 cmn_err(CE_ALERT, 1658 xfs_alert(mp, " ... fblk is NULL");
1663 " ... fblk is NULL\n");
1664 } 1659 }
1665 XFS_ERROR_REPORT("xfs_dir2_node_addname_int", 1660 XFS_ERROR_REPORT("xfs_dir2_node_addname_int",
1666 XFS_ERRLEVEL_LOW, mp); 1661 XFS_ERRLEVEL_LOW, mp);
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index 4c7db74a05f7..39f06336b99d 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -48,7 +48,7 @@ xfs_error_trap(int e)
48 break; 48 break;
49 if (e != xfs_etrap[i]) 49 if (e != xfs_etrap[i])
50 continue; 50 continue;
51 cmn_err(CE_NOTE, "xfs_error_trap: error %d", e); 51 xfs_notice(NULL, "%s: error %d", __func__, e);
52 BUG(); 52 BUG();
53 break; 53 break;
54 } 54 }
@@ -74,7 +74,7 @@ xfs_error_test(int error_tag, int *fsidp, char *expression,
74 74
75 for (i = 0; i < XFS_NUM_INJECT_ERROR; i++) { 75 for (i = 0; i < XFS_NUM_INJECT_ERROR; i++) {
76 if (xfs_etest[i] == error_tag && xfs_etest_fsid[i] == fsid) { 76 if (xfs_etest[i] == error_tag && xfs_etest_fsid[i] == fsid) {
77 cmn_err(CE_WARN, 77 xfs_warn(NULL,
78 "Injecting error (%s) at file %s, line %d, on filesystem \"%s\"", 78 "Injecting error (%s) at file %s, line %d, on filesystem \"%s\"",
79 expression, file, line, xfs_etest_fsname[i]); 79 expression, file, line, xfs_etest_fsname[i]);
80 return 1; 80 return 1;
@@ -95,14 +95,14 @@ xfs_errortag_add(int error_tag, xfs_mount_t *mp)
95 95
96 for (i = 0; i < XFS_NUM_INJECT_ERROR; i++) { 96 for (i = 0; i < XFS_NUM_INJECT_ERROR; i++) {
97 if (xfs_etest_fsid[i] == fsid && xfs_etest[i] == error_tag) { 97 if (xfs_etest_fsid[i] == fsid && xfs_etest[i] == error_tag) {
98 cmn_err(CE_WARN, "XFS error tag #%d on", error_tag); 98 xfs_warn(mp, "error tag #%d on", error_tag);
99 return 0; 99 return 0;
100 } 100 }
101 } 101 }
102 102
103 for (i = 0; i < XFS_NUM_INJECT_ERROR; i++) { 103 for (i = 0; i < XFS_NUM_INJECT_ERROR; i++) {
104 if (xfs_etest[i] == 0) { 104 if (xfs_etest[i] == 0) {
105 cmn_err(CE_WARN, "Turned on XFS error tag #%d", 105 xfs_warn(mp, "Turned on XFS error tag #%d",
106 error_tag); 106 error_tag);
107 xfs_etest[i] = error_tag; 107 xfs_etest[i] = error_tag;
108 xfs_etest_fsid[i] = fsid; 108 xfs_etest_fsid[i] = fsid;
@@ -114,7 +114,7 @@ xfs_errortag_add(int error_tag, xfs_mount_t *mp)
114 } 114 }
115 } 115 }
116 116
117 cmn_err(CE_WARN, "error tag overflow, too many turned on"); 117 xfs_warn(mp, "error tag overflow, too many turned on");
118 118
119 return 1; 119 return 1;
120} 120}
@@ -133,7 +133,7 @@ xfs_errortag_clearall(xfs_mount_t *mp, int loud)
133 if ((fsid == 0LL || xfs_etest_fsid[i] == fsid) && 133 if ((fsid == 0LL || xfs_etest_fsid[i] == fsid) &&
134 xfs_etest[i] != 0) { 134 xfs_etest[i] != 0) {
135 cleared = 1; 135 cleared = 1;
136 cmn_err(CE_WARN, "Clearing XFS error tag #%d", 136 xfs_warn(mp, "Clearing XFS error tag #%d",
137 xfs_etest[i]); 137 xfs_etest[i]);
138 xfs_etest[i] = 0; 138 xfs_etest[i] = 0;
139 xfs_etest_fsid[i] = 0LL; 139 xfs_etest_fsid[i] = 0LL;
@@ -144,9 +144,7 @@ xfs_errortag_clearall(xfs_mount_t *mp, int loud)
144 } 144 }
145 145
146 if (loud || cleared) 146 if (loud || cleared)
147 cmn_err(CE_WARN, 147 xfs_warn(mp, "Cleared all XFS error tags for filesystem");
148 "Cleared all XFS error tags for filesystem \"%s\"",
149 mp->m_fsname);
150 148
151 return 0; 149 return 0;
152} 150}
@@ -162,9 +160,8 @@ xfs_error_report(
162 inst_t *ra) 160 inst_t *ra)
163{ 161{
164 if (level <= xfs_error_level) { 162 if (level <= xfs_error_level) {
165 xfs_cmn_err(XFS_PTAG_ERROR_REPORT, 163 xfs_alert_tag(mp, XFS_PTAG_ERROR_REPORT,
166 CE_ALERT, mp, 164 "Internal error %s at line %d of file %s. Caller 0x%p\n",
167 "XFS internal error %s at line %d of file %s. Caller 0x%p\n",
168 tag, linenum, filename, ra); 165 tag, linenum, filename, ra);
169 166
170 xfs_stack_trace(); 167 xfs_stack_trace();
@@ -184,4 +181,5 @@ xfs_corruption_error(
184 if (level <= xfs_error_level) 181 if (level <= xfs_error_level)
185 xfs_hex_dump(p, 16); 182 xfs_hex_dump(p, 16);
186 xfs_error_report(tag, level, mp, filename, linenum, ra); 183 xfs_error_report(tag, level, mp, filename, linenum, ra);
184 xfs_alert(mp, "Corruption detected. Unmount and run xfs_repair");
187} 185}
diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h
index 10dce5475f02..079a367f44ee 100644
--- a/fs/xfs/xfs_error.h
+++ b/fs/xfs/xfs_error.h
@@ -145,10 +145,8 @@ extern int xfs_errortag_clearall(struct xfs_mount *mp, int loud);
145#endif /* DEBUG */ 145#endif /* DEBUG */
146 146
147/* 147/*
148 * XFS panic tags -- allow a call to xfs_cmn_err() be turned into 148 * XFS panic tags -- allow a call to xfs_alert_tag() be turned into
149 * a panic by setting xfs_panic_mask in a 149 * a panic by setting xfs_panic_mask in a sysctl.
150 * sysctl. update xfs_max[XFS_PARAM] if
151 * more are added.
152 */ 150 */
153#define XFS_NO_PTAG 0 151#define XFS_NO_PTAG 0
154#define XFS_PTAG_IFLUSH 0x00000001 152#define XFS_PTAG_IFLUSH 0x00000001
@@ -160,17 +158,4 @@ extern int xfs_errortag_clearall(struct xfs_mount *mp, int loud);
160#define XFS_PTAG_SHUTDOWN_LOGERROR 0x00000040 158#define XFS_PTAG_SHUTDOWN_LOGERROR 0x00000040
161#define XFS_PTAG_FSBLOCK_ZERO 0x00000080 159#define XFS_PTAG_FSBLOCK_ZERO 0x00000080
162 160
163struct xfs_mount;
164
165extern void xfs_hex_dump(void *p, int length);
166
167#define xfs_fs_repair_cmn_err(level, mp, fmt, args...) \
168 xfs_fs_cmn_err(level, mp, fmt " Unmount and run xfs_repair.", ## args)
169
170#define xfs_fs_mount_cmn_err(f, fmt, args...) \
171 do { \
172 if (!(f & XFS_MFSI_QUIET)) \
173 cmn_err(CE_WARN, "XFS: " fmt, ## args); \
174 } while (0)
175
176#endif /* __XFS_ERROR_H__ */ 161#endif /* __XFS_ERROR_H__ */
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index 75f2ef60e579..d22e62623437 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -138,7 +138,8 @@ xfs_efi_item_unpin(
138 138
139 if (remove) { 139 if (remove) {
140 ASSERT(!(lip->li_flags & XFS_LI_IN_AIL)); 140 ASSERT(!(lip->li_flags & XFS_LI_IN_AIL));
141 xfs_trans_del_item(lip); 141 if (lip->li_desc)
142 xfs_trans_del_item(lip);
142 xfs_efi_item_free(efip); 143 xfs_efi_item_free(efip);
143 return; 144 return;
144 } 145 }
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index cec89dd5d7d2..9153d2c77caf 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -53,6 +53,9 @@ xfs_fs_geometry(
53 xfs_fsop_geom_t *geo, 53 xfs_fsop_geom_t *geo,
54 int new_version) 54 int new_version)
55{ 55{
56
57 memset(geo, 0, sizeof(*geo));
58
56 geo->blocksize = mp->m_sb.sb_blocksize; 59 geo->blocksize = mp->m_sb.sb_blocksize;
57 geo->rtextsize = mp->m_sb.sb_rextsize; 60 geo->rtextsize = mp->m_sb.sb_rextsize;
58 geo->agblocks = mp->m_sb.sb_agblocks; 61 geo->agblocks = mp->m_sb.sb_agblocks;
@@ -382,8 +385,8 @@ xfs_growfs_data_private(
382 XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)), 385 XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)),
383 XFS_FSS_TO_BB(mp, 1), 0, &bp); 386 XFS_FSS_TO_BB(mp, 1), 0, &bp);
384 if (error) { 387 if (error) {
385 xfs_fs_cmn_err(CE_WARN, mp, 388 xfs_warn(mp,
386 "error %d reading secondary superblock for ag %d", 389 "error %d reading secondary superblock for ag %d",
387 error, agno); 390 error, agno);
388 break; 391 break;
389 } 392 }
@@ -396,7 +399,7 @@ xfs_growfs_data_private(
396 if (!(error = xfs_bwrite(mp, bp))) { 399 if (!(error = xfs_bwrite(mp, bp))) {
397 continue; 400 continue;
398 } else { 401 } else {
399 xfs_fs_cmn_err(CE_WARN, mp, 402 xfs_warn(mp,
400 "write error %d updating secondary superblock for ag %d", 403 "write error %d updating secondary superblock for ag %d",
401 error, agno); 404 error, agno);
402 break; /* no point in continuing */ 405 break; /* no point in continuing */
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index 0626a32c3447..84ebeec16642 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -1055,28 +1055,23 @@ xfs_difree(
1055 */ 1055 */
1056 agno = XFS_INO_TO_AGNO(mp, inode); 1056 agno = XFS_INO_TO_AGNO(mp, inode);
1057 if (agno >= mp->m_sb.sb_agcount) { 1057 if (agno >= mp->m_sb.sb_agcount) {
1058 cmn_err(CE_WARN, 1058 xfs_warn(mp, "%s: agno >= mp->m_sb.sb_agcount (%d >= %d).",
1059 "xfs_difree: agno >= mp->m_sb.sb_agcount (%d >= %d) on %s. Returning EINVAL.", 1059 __func__, agno, mp->m_sb.sb_agcount);
1060 agno, mp->m_sb.sb_agcount, mp->m_fsname);
1061 ASSERT(0); 1060 ASSERT(0);
1062 return XFS_ERROR(EINVAL); 1061 return XFS_ERROR(EINVAL);
1063 } 1062 }
1064 agino = XFS_INO_TO_AGINO(mp, inode); 1063 agino = XFS_INO_TO_AGINO(mp, inode);
1065 if (inode != XFS_AGINO_TO_INO(mp, agno, agino)) { 1064 if (inode != XFS_AGINO_TO_INO(mp, agno, agino)) {
1066 cmn_err(CE_WARN, 1065 xfs_warn(mp, "%s: inode != XFS_AGINO_TO_INO() (%llu != %llu).",
1067 "xfs_difree: inode != XFS_AGINO_TO_INO() " 1066 __func__, (unsigned long long)inode,
1068 "(%llu != %llu) on %s. Returning EINVAL.", 1067 (unsigned long long)XFS_AGINO_TO_INO(mp, agno, agino));
1069 (unsigned long long)inode,
1070 (unsigned long long)XFS_AGINO_TO_INO(mp, agno, agino),
1071 mp->m_fsname);
1072 ASSERT(0); 1068 ASSERT(0);
1073 return XFS_ERROR(EINVAL); 1069 return XFS_ERROR(EINVAL);
1074 } 1070 }
1075 agbno = XFS_AGINO_TO_AGBNO(mp, agino); 1071 agbno = XFS_AGINO_TO_AGBNO(mp, agino);
1076 if (agbno >= mp->m_sb.sb_agblocks) { 1072 if (agbno >= mp->m_sb.sb_agblocks) {
1077 cmn_err(CE_WARN, 1073 xfs_warn(mp, "%s: agbno >= mp->m_sb.sb_agblocks (%d >= %d).",
1078 "xfs_difree: agbno >= mp->m_sb.sb_agblocks (%d >= %d) on %s. Returning EINVAL.", 1074 __func__, agbno, mp->m_sb.sb_agblocks);
1079 agbno, mp->m_sb.sb_agblocks, mp->m_fsname);
1080 ASSERT(0); 1075 ASSERT(0);
1081 return XFS_ERROR(EINVAL); 1076 return XFS_ERROR(EINVAL);
1082 } 1077 }
@@ -1085,9 +1080,8 @@ xfs_difree(
1085 */ 1080 */
1086 error = xfs_ialloc_read_agi(mp, tp, agno, &agbp); 1081 error = xfs_ialloc_read_agi(mp, tp, agno, &agbp);
1087 if (error) { 1082 if (error) {
1088 cmn_err(CE_WARN, 1083 xfs_warn(mp, "%s: xfs_ialloc_read_agi() returned error %d.",
1089 "xfs_difree: xfs_ialloc_read_agi() returned an error %d on %s. Returning error.", 1084 __func__, error);
1090 error, mp->m_fsname);
1091 return error; 1085 return error;
1092 } 1086 }
1093 agi = XFS_BUF_TO_AGI(agbp); 1087 agi = XFS_BUF_TO_AGI(agbp);
@@ -1106,17 +1100,15 @@ xfs_difree(
1106 * Look for the entry describing this inode. 1100 * Look for the entry describing this inode.
1107 */ 1101 */
1108 if ((error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i))) { 1102 if ((error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i))) {
1109 cmn_err(CE_WARN, 1103 xfs_warn(mp, "%s: xfs_inobt_lookup() returned error %d.",
1110 "xfs_difree: xfs_inobt_lookup returned() an error %d on %s. Returning error.", 1104 __func__, error);
1111 error, mp->m_fsname);
1112 goto error0; 1105 goto error0;
1113 } 1106 }
1114 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1107 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1115 error = xfs_inobt_get_rec(cur, &rec, &i); 1108 error = xfs_inobt_get_rec(cur, &rec, &i);
1116 if (error) { 1109 if (error) {
1117 cmn_err(CE_WARN, 1110 xfs_warn(mp, "%s: xfs_inobt_get_rec() returned error %d.",
1118 "xfs_difree: xfs_inobt_get_rec() returned an error %d on %s. Returning error.", 1111 __func__, error);
1119 error, mp->m_fsname);
1120 goto error0; 1112 goto error0;
1121 } 1113 }
1122 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1114 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
@@ -1157,8 +1149,8 @@ xfs_difree(
1157 xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -(ilen - 1)); 1149 xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -(ilen - 1));
1158 1150
1159 if ((error = xfs_btree_delete(cur, &i))) { 1151 if ((error = xfs_btree_delete(cur, &i))) {
1160 cmn_err(CE_WARN, "xfs_difree: xfs_btree_delete returned an error %d on %s.\n", 1152 xfs_warn(mp, "%s: xfs_btree_delete returned error %d.",
1161 error, mp->m_fsname); 1153 __func__, error);
1162 goto error0; 1154 goto error0;
1163 } 1155 }
1164 1156
@@ -1170,9 +1162,8 @@ xfs_difree(
1170 1162
1171 error = xfs_inobt_update(cur, &rec); 1163 error = xfs_inobt_update(cur, &rec);
1172 if (error) { 1164 if (error) {
1173 cmn_err(CE_WARN, 1165 xfs_warn(mp, "%s: xfs_inobt_update returned error %d.",
1174 "xfs_difree: xfs_inobt_update returned an error %d on %s.", 1166 __func__, error);
1175 error, mp->m_fsname);
1176 goto error0; 1167 goto error0;
1177 } 1168 }
1178 1169
@@ -1218,10 +1209,9 @@ xfs_imap_lookup(
1218 1209
1219 error = xfs_ialloc_read_agi(mp, tp, agno, &agbp); 1210 error = xfs_ialloc_read_agi(mp, tp, agno, &agbp);
1220 if (error) { 1211 if (error) {
1221 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: " 1212 xfs_alert(mp,
1222 "xfs_ialloc_read_agi() returned " 1213 "%s: xfs_ialloc_read_agi() returned error %d, agno %d",
1223 "error %d, agno %d", 1214 __func__, error, agno);
1224 error, agno);
1225 return error; 1215 return error;
1226 } 1216 }
1227 1217
@@ -1299,24 +1289,21 @@ xfs_imap(
1299 if (flags & XFS_IGET_UNTRUSTED) 1289 if (flags & XFS_IGET_UNTRUSTED)
1300 return XFS_ERROR(EINVAL); 1290 return XFS_ERROR(EINVAL);
1301 if (agno >= mp->m_sb.sb_agcount) { 1291 if (agno >= mp->m_sb.sb_agcount) {
1302 xfs_fs_cmn_err(CE_ALERT, mp, 1292 xfs_alert(mp,
1303 "xfs_imap: agno (%d) >= " 1293 "%s: agno (%d) >= mp->m_sb.sb_agcount (%d)",
1304 "mp->m_sb.sb_agcount (%d)", 1294 __func__, agno, mp->m_sb.sb_agcount);
1305 agno, mp->m_sb.sb_agcount);
1306 } 1295 }
1307 if (agbno >= mp->m_sb.sb_agblocks) { 1296 if (agbno >= mp->m_sb.sb_agblocks) {
1308 xfs_fs_cmn_err(CE_ALERT, mp, 1297 xfs_alert(mp,
1309 "xfs_imap: agbno (0x%llx) >= " 1298 "%s: agbno (0x%llx) >= mp->m_sb.sb_agblocks (0x%lx)",
1310 "mp->m_sb.sb_agblocks (0x%lx)", 1299 __func__, (unsigned long long)agbno,
1311 (unsigned long long) agbno, 1300 (unsigned long)mp->m_sb.sb_agblocks);
1312 (unsigned long) mp->m_sb.sb_agblocks);
1313 } 1301 }
1314 if (ino != XFS_AGINO_TO_INO(mp, agno, agino)) { 1302 if (ino != XFS_AGINO_TO_INO(mp, agno, agino)) {
1315 xfs_fs_cmn_err(CE_ALERT, mp, 1303 xfs_alert(mp,
1316 "xfs_imap: ino (0x%llx) != " 1304 "%s: ino (0x%llx) != XFS_AGINO_TO_INO() (0x%llx)",
1317 "XFS_AGINO_TO_INO(mp, agno, agino) " 1305 __func__, ino,
1318 "(0x%llx)", 1306 XFS_AGINO_TO_INO(mp, agno, agino));
1319 ino, XFS_AGINO_TO_INO(mp, agno, agino));
1320 } 1307 }
1321 xfs_stack_trace(); 1308 xfs_stack_trace();
1322#endif /* DEBUG */ 1309#endif /* DEBUG */
@@ -1388,10 +1375,9 @@ out_map:
1388 */ 1375 */
1389 if ((imap->im_blkno + imap->im_len) > 1376 if ((imap->im_blkno + imap->im_len) >
1390 XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)) { 1377 XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)) {
1391 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: " 1378 xfs_alert(mp,
1392 "(imap->im_blkno (0x%llx) + imap->im_len (0x%llx)) > " 1379 "%s: (im_blkno (0x%llx) + im_len (0x%llx)) > sb_dblocks (0x%llx)",
1393 " XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks) (0x%llx)", 1380 __func__, (unsigned long long) imap->im_blkno,
1394 (unsigned long long) imap->im_blkno,
1395 (unsigned long long) imap->im_len, 1381 (unsigned long long) imap->im_len,
1396 XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)); 1382 XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks));
1397 return XFS_ERROR(EINVAL); 1383 return XFS_ERROR(EINVAL);
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index be7cf625421f..da871f532236 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -110,8 +110,8 @@ xfs_inobp_check(
110 dip = (xfs_dinode_t *)xfs_buf_offset(bp, 110 dip = (xfs_dinode_t *)xfs_buf_offset(bp,
111 i * mp->m_sb.sb_inodesize); 111 i * mp->m_sb.sb_inodesize);
112 if (!dip->di_next_unlinked) { 112 if (!dip->di_next_unlinked) {
113 xfs_fs_cmn_err(CE_ALERT, mp, 113 xfs_alert(mp,
114 "Detected a bogus zero next_unlinked field in incore inode buffer 0x%p. About to pop an ASSERT.", 114 "Detected bogus zero next_unlinked field in incore inode buffer 0x%p.",
115 bp); 115 bp);
116 ASSERT(dip->di_next_unlinked); 116 ASSERT(dip->di_next_unlinked);
117 } 117 }
@@ -142,10 +142,9 @@ xfs_imap_to_bp(
142 (int)imap->im_len, buf_flags, &bp); 142 (int)imap->im_len, buf_flags, &bp);
143 if (error) { 143 if (error) {
144 if (error != EAGAIN) { 144 if (error != EAGAIN) {
145 cmn_err(CE_WARN, 145 xfs_warn(mp,
146 "xfs_imap_to_bp: xfs_trans_read_buf()returned " 146 "%s: xfs_trans_read_buf() returned error %d.",
147 "an error %d on %s. Returning error.", 147 __func__, error);
148 error, mp->m_fsname);
149 } else { 148 } else {
150 ASSERT(buf_flags & XBF_TRYLOCK); 149 ASSERT(buf_flags & XBF_TRYLOCK);
151 } 150 }
@@ -180,12 +179,11 @@ xfs_imap_to_bp(
180 XFS_CORRUPTION_ERROR("xfs_imap_to_bp", 179 XFS_CORRUPTION_ERROR("xfs_imap_to_bp",
181 XFS_ERRLEVEL_HIGH, mp, dip); 180 XFS_ERRLEVEL_HIGH, mp, dip);
182#ifdef DEBUG 181#ifdef DEBUG
183 cmn_err(CE_PANIC, 182 xfs_emerg(mp,
184 "Device %s - bad inode magic/vsn " 183 "bad inode magic/vsn daddr %lld #%d (magic=%x)",
185 "daddr %lld #%d (magic=%x)",
186 XFS_BUFTARG_NAME(mp->m_ddev_targp),
187 (unsigned long long)imap->im_blkno, i, 184 (unsigned long long)imap->im_blkno, i,
188 be16_to_cpu(dip->di_magic)); 185 be16_to_cpu(dip->di_magic));
186 ASSERT(0);
189#endif 187#endif
190 xfs_trans_brelse(tp, bp); 188 xfs_trans_brelse(tp, bp);
191 return XFS_ERROR(EFSCORRUPTED); 189 return XFS_ERROR(EFSCORRUPTED);
@@ -317,7 +315,7 @@ xfs_iformat(
317 if (unlikely(be32_to_cpu(dip->di_nextents) + 315 if (unlikely(be32_to_cpu(dip->di_nextents) +
318 be16_to_cpu(dip->di_anextents) > 316 be16_to_cpu(dip->di_anextents) >
319 be64_to_cpu(dip->di_nblocks))) { 317 be64_to_cpu(dip->di_nblocks))) {
320 xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, 318 xfs_warn(ip->i_mount,
321 "corrupt dinode %Lu, extent total = %d, nblocks = %Lu.", 319 "corrupt dinode %Lu, extent total = %d, nblocks = %Lu.",
322 (unsigned long long)ip->i_ino, 320 (unsigned long long)ip->i_ino,
323 (int)(be32_to_cpu(dip->di_nextents) + 321 (int)(be32_to_cpu(dip->di_nextents) +
@@ -330,8 +328,7 @@ xfs_iformat(
330 } 328 }
331 329
332 if (unlikely(dip->di_forkoff > ip->i_mount->m_sb.sb_inodesize)) { 330 if (unlikely(dip->di_forkoff > ip->i_mount->m_sb.sb_inodesize)) {
333 xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, 331 xfs_warn(ip->i_mount, "corrupt dinode %Lu, forkoff = 0x%x.",
334 "corrupt dinode %Lu, forkoff = 0x%x.",
335 (unsigned long long)ip->i_ino, 332 (unsigned long long)ip->i_ino,
336 dip->di_forkoff); 333 dip->di_forkoff);
337 XFS_CORRUPTION_ERROR("xfs_iformat(2)", XFS_ERRLEVEL_LOW, 334 XFS_CORRUPTION_ERROR("xfs_iformat(2)", XFS_ERRLEVEL_LOW,
@@ -341,7 +338,7 @@ xfs_iformat(
341 338
342 if (unlikely((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) && 339 if (unlikely((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) &&
343 !ip->i_mount->m_rtdev_targp)) { 340 !ip->i_mount->m_rtdev_targp)) {
344 xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, 341 xfs_warn(ip->i_mount,
345 "corrupt dinode %Lu, has realtime flag set.", 342 "corrupt dinode %Lu, has realtime flag set.",
346 ip->i_ino); 343 ip->i_ino);
347 XFS_CORRUPTION_ERROR("xfs_iformat(realtime)", 344 XFS_CORRUPTION_ERROR("xfs_iformat(realtime)",
@@ -373,9 +370,8 @@ xfs_iformat(
373 * no local regular files yet 370 * no local regular files yet
374 */ 371 */
375 if (unlikely((be16_to_cpu(dip->di_mode) & S_IFMT) == S_IFREG)) { 372 if (unlikely((be16_to_cpu(dip->di_mode) & S_IFMT) == S_IFREG)) {
376 xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, 373 xfs_warn(ip->i_mount,
377 "corrupt inode %Lu " 374 "corrupt inode %Lu (local format for regular file).",
378 "(local format for regular file).",
379 (unsigned long long) ip->i_ino); 375 (unsigned long long) ip->i_ino);
380 XFS_CORRUPTION_ERROR("xfs_iformat(4)", 376 XFS_CORRUPTION_ERROR("xfs_iformat(4)",
381 XFS_ERRLEVEL_LOW, 377 XFS_ERRLEVEL_LOW,
@@ -385,9 +381,8 @@ xfs_iformat(
385 381
386 di_size = be64_to_cpu(dip->di_size); 382 di_size = be64_to_cpu(dip->di_size);
387 if (unlikely(di_size > XFS_DFORK_DSIZE(dip, ip->i_mount))) { 383 if (unlikely(di_size > XFS_DFORK_DSIZE(dip, ip->i_mount))) {
388 xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, 384 xfs_warn(ip->i_mount,
389 "corrupt inode %Lu " 385 "corrupt inode %Lu (bad size %Ld for local inode).",
390 "(bad size %Ld for local inode).",
391 (unsigned long long) ip->i_ino, 386 (unsigned long long) ip->i_ino,
392 (long long) di_size); 387 (long long) di_size);
393 XFS_CORRUPTION_ERROR("xfs_iformat(5)", 388 XFS_CORRUPTION_ERROR("xfs_iformat(5)",
@@ -431,9 +426,8 @@ xfs_iformat(
431 size = be16_to_cpu(atp->hdr.totsize); 426 size = be16_to_cpu(atp->hdr.totsize);
432 427
433 if (unlikely(size < sizeof(struct xfs_attr_sf_hdr))) { 428 if (unlikely(size < sizeof(struct xfs_attr_sf_hdr))) {
434 xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, 429 xfs_warn(ip->i_mount,
435 "corrupt inode %Lu " 430 "corrupt inode %Lu (bad attr fork size %Ld).",
436 "(bad attr fork size %Ld).",
437 (unsigned long long) ip->i_ino, 431 (unsigned long long) ip->i_ino,
438 (long long) size); 432 (long long) size);
439 XFS_CORRUPTION_ERROR("xfs_iformat(8)", 433 XFS_CORRUPTION_ERROR("xfs_iformat(8)",
@@ -488,9 +482,8 @@ xfs_iformat_local(
488 * kmem_alloc() or memcpy() below. 482 * kmem_alloc() or memcpy() below.
489 */ 483 */
490 if (unlikely(size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) { 484 if (unlikely(size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) {
491 xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, 485 xfs_warn(ip->i_mount,
492 "corrupt inode %Lu " 486 "corrupt inode %Lu (bad size %d for local fork, size = %d).",
493 "(bad size %d for local fork, size = %d).",
494 (unsigned long long) ip->i_ino, size, 487 (unsigned long long) ip->i_ino, size,
495 XFS_DFORK_SIZE(dip, ip->i_mount, whichfork)); 488 XFS_DFORK_SIZE(dip, ip->i_mount, whichfork));
496 XFS_CORRUPTION_ERROR("xfs_iformat_local", XFS_ERRLEVEL_LOW, 489 XFS_CORRUPTION_ERROR("xfs_iformat_local", XFS_ERRLEVEL_LOW,
@@ -547,8 +540,7 @@ xfs_iformat_extents(
547 * kmem_alloc() or memcpy() below. 540 * kmem_alloc() or memcpy() below.
548 */ 541 */
549 if (unlikely(size < 0 || size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) { 542 if (unlikely(size < 0 || size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) {
550 xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, 543 xfs_warn(ip->i_mount, "corrupt inode %Lu ((a)extents = %d).",
551 "corrupt inode %Lu ((a)extents = %d).",
552 (unsigned long long) ip->i_ino, nex); 544 (unsigned long long) ip->i_ino, nex);
553 XFS_CORRUPTION_ERROR("xfs_iformat_extents(1)", XFS_ERRLEVEL_LOW, 545 XFS_CORRUPTION_ERROR("xfs_iformat_extents(1)", XFS_ERRLEVEL_LOW,
554 ip->i_mount, dip); 546 ip->i_mount, dip);
@@ -623,11 +615,10 @@ xfs_iformat_btree(
623 || XFS_BMDR_SPACE_CALC(nrecs) > 615 || XFS_BMDR_SPACE_CALC(nrecs) >
624 XFS_DFORK_SIZE(dip, ip->i_mount, whichfork) 616 XFS_DFORK_SIZE(dip, ip->i_mount, whichfork)
625 || XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks)) { 617 || XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks)) {
626 xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, 618 xfs_warn(ip->i_mount, "corrupt inode %Lu (btree).",
627 "corrupt inode %Lu (btree).",
628 (unsigned long long) ip->i_ino); 619 (unsigned long long) ip->i_ino);
629 XFS_ERROR_REPORT("xfs_iformat_btree", XFS_ERRLEVEL_LOW, 620 XFS_CORRUPTION_ERROR("xfs_iformat_btree", XFS_ERRLEVEL_LOW,
630 ip->i_mount); 621 ip->i_mount, dip);
631 return XFS_ERROR(EFSCORRUPTED); 622 return XFS_ERROR(EFSCORRUPTED);
632 } 623 }
633 624
@@ -813,11 +804,9 @@ xfs_iread(
813 */ 804 */
814 if (be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC) { 805 if (be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC) {
815#ifdef DEBUG 806#ifdef DEBUG
816 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_iread: " 807 xfs_alert(mp,
817 "dip->di_magic (0x%x) != " 808 "%s: dip->di_magic (0x%x) != XFS_DINODE_MAGIC (0x%x)",
818 "XFS_DINODE_MAGIC (0x%x)", 809 __func__, be16_to_cpu(dip->di_magic), XFS_DINODE_MAGIC);
819 be16_to_cpu(dip->di_magic),
820 XFS_DINODE_MAGIC);
821#endif /* DEBUG */ 810#endif /* DEBUG */
822 error = XFS_ERROR(EINVAL); 811 error = XFS_ERROR(EINVAL);
823 goto out_brelse; 812 goto out_brelse;
@@ -835,9 +824,8 @@ xfs_iread(
835 error = xfs_iformat(ip, dip); 824 error = xfs_iformat(ip, dip);
836 if (error) { 825 if (error) {
837#ifdef DEBUG 826#ifdef DEBUG
838 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_iread: " 827 xfs_alert(mp, "%s: xfs_iformat() returned error %d",
839 "xfs_iformat() returned error %d", 828 __func__, error);
840 error);
841#endif /* DEBUG */ 829#endif /* DEBUG */
842 goto out_brelse; 830 goto out_brelse;
843 } 831 }
@@ -1016,8 +1004,8 @@ xfs_ialloc(
1016 * This is because we're setting fields here we need 1004 * This is because we're setting fields here we need
1017 * to prevent others from looking at until we're done. 1005 * to prevent others from looking at until we're done.
1018 */ 1006 */
1019 error = xfs_trans_iget(tp->t_mountp, tp, ino, 1007 error = xfs_iget(tp->t_mountp, tp, ino, XFS_IGET_CREATE,
1020 XFS_IGET_CREATE, XFS_ILOCK_EXCL, &ip); 1008 XFS_ILOCK_EXCL, &ip);
1021 if (error) 1009 if (error)
1022 return error; 1010 return error;
1023 ASSERT(ip != NULL); 1011 ASSERT(ip != NULL);
@@ -1166,6 +1154,7 @@ xfs_ialloc(
1166 /* 1154 /*
1167 * Log the new values stuffed into the inode. 1155 * Log the new values stuffed into the inode.
1168 */ 1156 */
1157 xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL);
1169 xfs_trans_log_inode(tp, ip, flags); 1158 xfs_trans_log_inode(tp, ip, flags);
1170 1159
1171 /* now that we have an i_mode we can setup inode ops and unlock */ 1160 /* now that we have an i_mode we can setup inode ops and unlock */
@@ -1820,9 +1809,8 @@ xfs_iunlink_remove(
1820 */ 1809 */
1821 error = xfs_itobp(mp, tp, ip, &dip, &ibp, XBF_LOCK); 1810 error = xfs_itobp(mp, tp, ip, &dip, &ibp, XBF_LOCK);
1822 if (error) { 1811 if (error) {
1823 cmn_err(CE_WARN, 1812 xfs_warn(mp, "%s: xfs_itobp() returned error %d.",
1824 "xfs_iunlink_remove: xfs_itobp() returned an error %d on %s. Returning error.", 1813 __func__, error);
1825 error, mp->m_fsname);
1826 return error; 1814 return error;
1827 } 1815 }
1828 next_agino = be32_to_cpu(dip->di_next_unlinked); 1816 next_agino = be32_to_cpu(dip->di_next_unlinked);
@@ -1867,9 +1855,9 @@ xfs_iunlink_remove(
1867 error = xfs_inotobp(mp, tp, next_ino, &last_dip, 1855 error = xfs_inotobp(mp, tp, next_ino, &last_dip,
1868 &last_ibp, &last_offset, 0); 1856 &last_ibp, &last_offset, 0);
1869 if (error) { 1857 if (error) {
1870 cmn_err(CE_WARN, 1858 xfs_warn(mp,
1871 "xfs_iunlink_remove: xfs_inotobp() returned an error %d on %s. Returning error.", 1859 "%s: xfs_inotobp() returned error %d.",
1872 error, mp->m_fsname); 1860 __func__, error);
1873 return error; 1861 return error;
1874 } 1862 }
1875 next_agino = be32_to_cpu(last_dip->di_next_unlinked); 1863 next_agino = be32_to_cpu(last_dip->di_next_unlinked);
@@ -1882,9 +1870,8 @@ xfs_iunlink_remove(
1882 */ 1870 */
1883 error = xfs_itobp(mp, tp, ip, &dip, &ibp, XBF_LOCK); 1871 error = xfs_itobp(mp, tp, ip, &dip, &ibp, XBF_LOCK);
1884 if (error) { 1872 if (error) {
1885 cmn_err(CE_WARN, 1873 xfs_warn(mp, "%s: xfs_itobp(2) returned error %d.",
1886 "xfs_iunlink_remove: xfs_itobp() returned an error %d on %s. Returning error.", 1874 __func__, error);
1887 error, mp->m_fsname);
1888 return error; 1875 return error;
1889 } 1876 }
1890 next_agino = be32_to_cpu(dip->di_next_unlinked); 1877 next_agino = be32_to_cpu(dip->di_next_unlinked);
@@ -2939,16 +2926,16 @@ xfs_iflush_int(
2939 2926
2940 if (XFS_TEST_ERROR(be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC, 2927 if (XFS_TEST_ERROR(be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC,
2941 mp, XFS_ERRTAG_IFLUSH_1, XFS_RANDOM_IFLUSH_1)) { 2928 mp, XFS_ERRTAG_IFLUSH_1, XFS_RANDOM_IFLUSH_1)) {
2942 xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp, 2929 xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
2943 "xfs_iflush: Bad inode %Lu magic number 0x%x, ptr 0x%p", 2930 "%s: Bad inode %Lu magic number 0x%x, ptr 0x%p",
2944 ip->i_ino, be16_to_cpu(dip->di_magic), dip); 2931 __func__, ip->i_ino, be16_to_cpu(dip->di_magic), dip);
2945 goto corrupt_out; 2932 goto corrupt_out;
2946 } 2933 }
2947 if (XFS_TEST_ERROR(ip->i_d.di_magic != XFS_DINODE_MAGIC, 2934 if (XFS_TEST_ERROR(ip->i_d.di_magic != XFS_DINODE_MAGIC,
2948 mp, XFS_ERRTAG_IFLUSH_2, XFS_RANDOM_IFLUSH_2)) { 2935 mp, XFS_ERRTAG_IFLUSH_2, XFS_RANDOM_IFLUSH_2)) {
2949 xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp, 2936 xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
2950 "xfs_iflush: Bad inode %Lu, ptr 0x%p, magic number 0x%x", 2937 "%s: Bad inode %Lu, ptr 0x%p, magic number 0x%x",
2951 ip->i_ino, ip, ip->i_d.di_magic); 2938 __func__, ip->i_ino, ip, ip->i_d.di_magic);
2952 goto corrupt_out; 2939 goto corrupt_out;
2953 } 2940 }
2954 if ((ip->i_d.di_mode & S_IFMT) == S_IFREG) { 2941 if ((ip->i_d.di_mode & S_IFMT) == S_IFREG) {
@@ -2956,9 +2943,9 @@ xfs_iflush_int(
2956 (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) && 2943 (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) &&
2957 (ip->i_d.di_format != XFS_DINODE_FMT_BTREE), 2944 (ip->i_d.di_format != XFS_DINODE_FMT_BTREE),
2958 mp, XFS_ERRTAG_IFLUSH_3, XFS_RANDOM_IFLUSH_3)) { 2945 mp, XFS_ERRTAG_IFLUSH_3, XFS_RANDOM_IFLUSH_3)) {
2959 xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp, 2946 xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
2960 "xfs_iflush: Bad regular inode %Lu, ptr 0x%p", 2947 "%s: Bad regular inode %Lu, ptr 0x%p",
2961 ip->i_ino, ip); 2948 __func__, ip->i_ino, ip);
2962 goto corrupt_out; 2949 goto corrupt_out;
2963 } 2950 }
2964 } else if ((ip->i_d.di_mode & S_IFMT) == S_IFDIR) { 2951 } else if ((ip->i_d.di_mode & S_IFMT) == S_IFDIR) {
@@ -2967,28 +2954,28 @@ xfs_iflush_int(
2967 (ip->i_d.di_format != XFS_DINODE_FMT_BTREE) && 2954 (ip->i_d.di_format != XFS_DINODE_FMT_BTREE) &&
2968 (ip->i_d.di_format != XFS_DINODE_FMT_LOCAL), 2955 (ip->i_d.di_format != XFS_DINODE_FMT_LOCAL),
2969 mp, XFS_ERRTAG_IFLUSH_4, XFS_RANDOM_IFLUSH_4)) { 2956 mp, XFS_ERRTAG_IFLUSH_4, XFS_RANDOM_IFLUSH_4)) {
2970 xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp, 2957 xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
2971 "xfs_iflush: Bad directory inode %Lu, ptr 0x%p", 2958 "%s: Bad directory inode %Lu, ptr 0x%p",
2972 ip->i_ino, ip); 2959 __func__, ip->i_ino, ip);
2973 goto corrupt_out; 2960 goto corrupt_out;
2974 } 2961 }
2975 } 2962 }
2976 if (XFS_TEST_ERROR(ip->i_d.di_nextents + ip->i_d.di_anextents > 2963 if (XFS_TEST_ERROR(ip->i_d.di_nextents + ip->i_d.di_anextents >
2977 ip->i_d.di_nblocks, mp, XFS_ERRTAG_IFLUSH_5, 2964 ip->i_d.di_nblocks, mp, XFS_ERRTAG_IFLUSH_5,
2978 XFS_RANDOM_IFLUSH_5)) { 2965 XFS_RANDOM_IFLUSH_5)) {
2979 xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp, 2966 xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
2980 "xfs_iflush: detected corrupt incore inode %Lu, total extents = %d, nblocks = %Ld, ptr 0x%p", 2967 "%s: detected corrupt incore inode %Lu, "
2981 ip->i_ino, 2968 "total extents = %d, nblocks = %Ld, ptr 0x%p",
2969 __func__, ip->i_ino,
2982 ip->i_d.di_nextents + ip->i_d.di_anextents, 2970 ip->i_d.di_nextents + ip->i_d.di_anextents,
2983 ip->i_d.di_nblocks, 2971 ip->i_d.di_nblocks, ip);
2984 ip);
2985 goto corrupt_out; 2972 goto corrupt_out;
2986 } 2973 }
2987 if (XFS_TEST_ERROR(ip->i_d.di_forkoff > mp->m_sb.sb_inodesize, 2974 if (XFS_TEST_ERROR(ip->i_d.di_forkoff > mp->m_sb.sb_inodesize,
2988 mp, XFS_ERRTAG_IFLUSH_6, XFS_RANDOM_IFLUSH_6)) { 2975 mp, XFS_ERRTAG_IFLUSH_6, XFS_RANDOM_IFLUSH_6)) {
2989 xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp, 2976 xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
2990 "xfs_iflush: bad inode %Lu, forkoff 0x%x, ptr 0x%p", 2977 "%s: bad inode %Lu, forkoff 0x%x, ptr 0x%p",
2991 ip->i_ino, ip->i_d.di_forkoff, ip); 2978 __func__, ip->i_ino, ip->i_d.di_forkoff, ip);
2992 goto corrupt_out; 2979 goto corrupt_out;
2993 } 2980 }
2994 /* 2981 /*
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 5c95fa8ec11d..f753200cef8d 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -409,28 +409,35 @@ static inline void xfs_ifunlock(xfs_inode_t *ip)
409/* 409/*
410 * Flags for lockdep annotations. 410 * Flags for lockdep annotations.
411 * 411 *
412 * XFS_I[O]LOCK_PARENT - for operations that require locking two inodes 412 * XFS_LOCK_PARENT - for directory operations that require locking a
413 * (ie directory operations that require locking a directory inode and 413 * parent directory inode and a child entry inode. The parent gets locked
414 * an entry inode). The first inode gets locked with this flag so it 414 * with this flag so it gets a lockdep subclass of 1 and the child entry
415 * gets a lockdep subclass of 1 and the second lock will have a lockdep 415 * lock will have a lockdep subclass of 0.
416 * subclass of 0. 416 *
417 * XFS_LOCK_RTBITMAP/XFS_LOCK_RTSUM - the realtime device bitmap and summary
418 * inodes do not participate in the normal lock order, and thus have their
419 * own subclasses.
417 * 420 *
418 * XFS_LOCK_INUMORDER - for locking several inodes at the some time 421 * XFS_LOCK_INUMORDER - for locking several inodes at the some time
419 * with xfs_lock_inodes(). This flag is used as the starting subclass 422 * with xfs_lock_inodes(). This flag is used as the starting subclass
420 * and each subsequent lock acquired will increment the subclass by one. 423 * and each subsequent lock acquired will increment the subclass by one.
421 * So the first lock acquired will have a lockdep subclass of 2, the 424 * So the first lock acquired will have a lockdep subclass of 4, the
422 * second lock will have a lockdep subclass of 3, and so on. It is 425 * second lock will have a lockdep subclass of 5, and so on. It is
423 * the responsibility of the class builder to shift this to the correct 426 * the responsibility of the class builder to shift this to the correct
424 * portion of the lock_mode lockdep mask. 427 * portion of the lock_mode lockdep mask.
425 */ 428 */
426#define XFS_LOCK_PARENT 1 429#define XFS_LOCK_PARENT 1
427#define XFS_LOCK_INUMORDER 2 430#define XFS_LOCK_RTBITMAP 2
431#define XFS_LOCK_RTSUM 3
432#define XFS_LOCK_INUMORDER 4
428 433
429#define XFS_IOLOCK_SHIFT 16 434#define XFS_IOLOCK_SHIFT 16
430#define XFS_IOLOCK_PARENT (XFS_LOCK_PARENT << XFS_IOLOCK_SHIFT) 435#define XFS_IOLOCK_PARENT (XFS_LOCK_PARENT << XFS_IOLOCK_SHIFT)
431 436
432#define XFS_ILOCK_SHIFT 24 437#define XFS_ILOCK_SHIFT 24
433#define XFS_ILOCK_PARENT (XFS_LOCK_PARENT << XFS_ILOCK_SHIFT) 438#define XFS_ILOCK_PARENT (XFS_LOCK_PARENT << XFS_ILOCK_SHIFT)
439#define XFS_ILOCK_RTBITMAP (XFS_LOCK_RTBITMAP << XFS_ILOCK_SHIFT)
440#define XFS_ILOCK_RTSUM (XFS_LOCK_RTSUM << XFS_ILOCK_SHIFT)
434 441
435#define XFS_IOLOCK_DEP_MASK 0x00ff0000 442#define XFS_IOLOCK_DEP_MASK 0x00ff0000
436#define XFS_ILOCK_DEP_MASK 0xff000000 443#define XFS_ILOCK_DEP_MASK 0xff000000
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 55582bd66659..091d82b94c4d 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -101,11 +101,11 @@ xfs_iomap_eof_align_last_fsb(
101} 101}
102 102
103STATIC int 103STATIC int
104xfs_cmn_err_fsblock_zero( 104xfs_alert_fsblock_zero(
105 xfs_inode_t *ip, 105 xfs_inode_t *ip,
106 xfs_bmbt_irec_t *imap) 106 xfs_bmbt_irec_t *imap)
107{ 107{
108 xfs_cmn_err(XFS_PTAG_FSBLOCK_ZERO, CE_ALERT, ip->i_mount, 108 xfs_alert_tag(ip->i_mount, XFS_PTAG_FSBLOCK_ZERO,
109 "Access to block zero in inode %llu " 109 "Access to block zero in inode %llu "
110 "start_block: %llx start_off: %llx " 110 "start_block: %llx start_off: %llx "
111 "blkcnt: %llx extent-state: %x\n", 111 "blkcnt: %llx extent-state: %x\n",
@@ -246,7 +246,7 @@ xfs_iomap_write_direct(
246 } 246 }
247 247
248 if (!(imap->br_startblock || XFS_IS_REALTIME_INODE(ip))) { 248 if (!(imap->br_startblock || XFS_IS_REALTIME_INODE(ip))) {
249 error = xfs_cmn_err_fsblock_zero(ip, imap); 249 error = xfs_alert_fsblock_zero(ip, imap);
250 goto error_out; 250 goto error_out;
251 } 251 }
252 252
@@ -337,7 +337,12 @@ xfs_iomap_prealloc_size(
337 int shift = 0; 337 int shift = 0;
338 int64_t freesp; 338 int64_t freesp;
339 339
340 alloc_blocks = XFS_B_TO_FSB(mp, ip->i_size); 340 /*
341 * rounddown_pow_of_two() returns an undefined result
342 * if we pass in alloc_blocks = 0. Hence the "+ 1" to
343 * ensure we always pass in a non-zero value.
344 */
345 alloc_blocks = XFS_B_TO_FSB(mp, ip->i_size) + 1;
341 alloc_blocks = XFS_FILEOFF_MIN(MAXEXTLEN, 346 alloc_blocks = XFS_FILEOFF_MIN(MAXEXTLEN,
342 rounddown_pow_of_two(alloc_blocks)); 347 rounddown_pow_of_two(alloc_blocks));
343 348
@@ -459,7 +464,7 @@ retry:
459 } 464 }
460 465
461 if (!(imap[0].br_startblock || XFS_IS_REALTIME_INODE(ip))) 466 if (!(imap[0].br_startblock || XFS_IS_REALTIME_INODE(ip)))
462 return xfs_cmn_err_fsblock_zero(ip, &imap[0]); 467 return xfs_alert_fsblock_zero(ip, &imap[0]);
463 468
464 *ret_imap = imap[0]; 469 *ret_imap = imap[0];
465 return 0; 470 return 0;
@@ -609,7 +614,7 @@ xfs_iomap_write_allocate(
609 * covers at least part of the callers request 614 * covers at least part of the callers request
610 */ 615 */
611 if (!(imap->br_startblock || XFS_IS_REALTIME_INODE(ip))) 616 if (!(imap->br_startblock || XFS_IS_REALTIME_INODE(ip)))
612 return xfs_cmn_err_fsblock_zero(ip, imap); 617 return xfs_alert_fsblock_zero(ip, imap);
613 618
614 if ((offset_fsb >= imap->br_startoff) && 619 if ((offset_fsb >= imap->br_startoff) &&
615 (offset_fsb < (imap->br_startoff + 620 (offset_fsb < (imap->br_startoff +
@@ -719,7 +724,7 @@ xfs_iomap_write_unwritten(
719 return XFS_ERROR(error); 724 return XFS_ERROR(error);
720 725
721 if (!(imap.br_startblock || XFS_IS_REALTIME_INODE(ip))) 726 if (!(imap.br_startblock || XFS_IS_REALTIME_INODE(ip)))
722 return xfs_cmn_err_fsblock_zero(ip, &imap); 727 return xfs_alert_fsblock_zero(ip, &imap);
723 728
724 if ((numblks_fsb = imap.br_blockcount) == 0) { 729 if ((numblks_fsb = imap.br_blockcount) == 0) {
725 /* 730 /*
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index ae6fef1ff563..25efa9b8a602 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -374,11 +374,10 @@ xfs_log_mount(
374 int error; 374 int error;
375 375
376 if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) 376 if (!(mp->m_flags & XFS_MOUNT_NORECOVERY))
377 cmn_err(CE_NOTE, "XFS mounting filesystem %s", mp->m_fsname); 377 xfs_notice(mp, "Mounting Filesystem");
378 else { 378 else {
379 cmn_err(CE_NOTE, 379 xfs_notice(mp,
380 "Mounting filesystem \"%s\" in no-recovery mode. Filesystem will be inconsistent.", 380"Mounting filesystem in no-recovery mode. Filesystem will be inconsistent.");
381 mp->m_fsname);
382 ASSERT(mp->m_flags & XFS_MOUNT_RDONLY); 381 ASSERT(mp->m_flags & XFS_MOUNT_RDONLY);
383 } 382 }
384 383
@@ -393,7 +392,7 @@ xfs_log_mount(
393 */ 392 */
394 error = xfs_trans_ail_init(mp); 393 error = xfs_trans_ail_init(mp);
395 if (error) { 394 if (error) {
396 cmn_err(CE_WARN, "XFS: AIL initialisation failed: error %d", error); 395 xfs_warn(mp, "AIL initialisation failed: error %d", error);
397 goto out_free_log; 396 goto out_free_log;
398 } 397 }
399 mp->m_log->l_ailp = mp->m_ail; 398 mp->m_log->l_ailp = mp->m_ail;
@@ -413,7 +412,8 @@ xfs_log_mount(
413 if (readonly) 412 if (readonly)
414 mp->m_flags |= XFS_MOUNT_RDONLY; 413 mp->m_flags |= XFS_MOUNT_RDONLY;
415 if (error) { 414 if (error) {
416 cmn_err(CE_WARN, "XFS: log mount/recovery failed: error %d", error); 415 xfs_warn(mp, "log mount/recovery failed: error %d",
416 error);
417 goto out_destroy_ail; 417 goto out_destroy_ail;
418 } 418 }
419 } 419 }
@@ -542,10 +542,8 @@ xfs_log_unmount_write(xfs_mount_t *mp)
542 */ 542 */
543 } 543 }
544 544
545 if (error) { 545 if (error)
546 xfs_fs_cmn_err(CE_ALERT, mp, 546 xfs_alert(mp, "%s: unmount record failed", __func__);
547 "xfs_log_unmount: unmount record failed");
548 }
549 547
550 548
551 spin_lock(&log->l_icloglock); 549 spin_lock(&log->l_icloglock);
@@ -852,7 +850,7 @@ xlog_space_left(
852 * In this case we just want to return the size of the 850 * In this case we just want to return the size of the
853 * log as the amount of space left. 851 * log as the amount of space left.
854 */ 852 */
855 xfs_fs_cmn_err(CE_ALERT, log->l_mp, 853 xfs_alert(log->l_mp,
856 "xlog_space_left: head behind tail\n" 854 "xlog_space_left: head behind tail\n"
857 " tail_cycle = %d, tail_bytes = %d\n" 855 " tail_cycle = %d, tail_bytes = %d\n"
858 " GH cycle = %d, GH bytes = %d", 856 " GH cycle = %d, GH bytes = %d",
@@ -1001,7 +999,7 @@ xlog_alloc_log(xfs_mount_t *mp,
1001 999
1002 log = kmem_zalloc(sizeof(xlog_t), KM_MAYFAIL); 1000 log = kmem_zalloc(sizeof(xlog_t), KM_MAYFAIL);
1003 if (!log) { 1001 if (!log) {
1004 xlog_warn("XFS: Log allocation failed: No memory!"); 1002 xfs_warn(mp, "Log allocation failed: No memory!");
1005 goto out; 1003 goto out;
1006 } 1004 }
1007 1005
@@ -1029,24 +1027,24 @@ xlog_alloc_log(xfs_mount_t *mp,
1029 if (xfs_sb_version_hassector(&mp->m_sb)) { 1027 if (xfs_sb_version_hassector(&mp->m_sb)) {
1030 log2_size = mp->m_sb.sb_logsectlog; 1028 log2_size = mp->m_sb.sb_logsectlog;
1031 if (log2_size < BBSHIFT) { 1029 if (log2_size < BBSHIFT) {
1032 xlog_warn("XFS: Log sector size too small " 1030 xfs_warn(mp, "Log sector size too small (0x%x < 0x%x)",
1033 "(0x%x < 0x%x)", log2_size, BBSHIFT); 1031 log2_size, BBSHIFT);
1034 goto out_free_log; 1032 goto out_free_log;
1035 } 1033 }
1036 1034
1037 log2_size -= BBSHIFT; 1035 log2_size -= BBSHIFT;
1038 if (log2_size > mp->m_sectbb_log) { 1036 if (log2_size > mp->m_sectbb_log) {
1039 xlog_warn("XFS: Log sector size too large " 1037 xfs_warn(mp, "Log sector size too large (0x%x > 0x%x)",
1040 "(0x%x > 0x%x)", log2_size, mp->m_sectbb_log); 1038 log2_size, mp->m_sectbb_log);
1041 goto out_free_log; 1039 goto out_free_log;
1042 } 1040 }
1043 1041
1044 /* for larger sector sizes, must have v2 or external log */ 1042 /* for larger sector sizes, must have v2 or external log */
1045 if (log2_size && log->l_logBBstart > 0 && 1043 if (log2_size && log->l_logBBstart > 0 &&
1046 !xfs_sb_version_haslogv2(&mp->m_sb)) { 1044 !xfs_sb_version_haslogv2(&mp->m_sb)) {
1047 1045 xfs_warn(mp,
1048 xlog_warn("XFS: log sector size (0x%x) invalid " 1046 "log sector size (0x%x) invalid for configuration.",
1049 "for configuration.", log2_size); 1047 log2_size);
1050 goto out_free_log; 1048 goto out_free_log;
1051 } 1049 }
1052 } 1050 }
@@ -1563,38 +1561,36 @@ xlog_print_tic_res(
1563 "SWAPEXT" 1561 "SWAPEXT"
1564 }; 1562 };
1565 1563
1566 xfs_fs_cmn_err(CE_WARN, mp, 1564 xfs_warn(mp,
1567 "xfs_log_write: reservation summary:\n" 1565 "xfs_log_write: reservation summary:\n"
1568 " trans type = %s (%u)\n" 1566 " trans type = %s (%u)\n"
1569 " unit res = %d bytes\n" 1567 " unit res = %d bytes\n"
1570 " current res = %d bytes\n" 1568 " current res = %d bytes\n"
1571 " total reg = %u bytes (o/flow = %u bytes)\n" 1569 " total reg = %u bytes (o/flow = %u bytes)\n"
1572 " ophdrs = %u (ophdr space = %u bytes)\n" 1570 " ophdrs = %u (ophdr space = %u bytes)\n"
1573 " ophdr + reg = %u bytes\n" 1571 " ophdr + reg = %u bytes\n"
1574 " num regions = %u\n", 1572 " num regions = %u\n",
1575 ((ticket->t_trans_type <= 0 || 1573 ((ticket->t_trans_type <= 0 ||
1576 ticket->t_trans_type > XFS_TRANS_TYPE_MAX) ? 1574 ticket->t_trans_type > XFS_TRANS_TYPE_MAX) ?
1577 "bad-trans-type" : trans_type_str[ticket->t_trans_type-1]), 1575 "bad-trans-type" : trans_type_str[ticket->t_trans_type-1]),
1578 ticket->t_trans_type, 1576 ticket->t_trans_type,
1579 ticket->t_unit_res, 1577 ticket->t_unit_res,
1580 ticket->t_curr_res, 1578 ticket->t_curr_res,
1581 ticket->t_res_arr_sum, ticket->t_res_o_flow, 1579 ticket->t_res_arr_sum, ticket->t_res_o_flow,
1582 ticket->t_res_num_ophdrs, ophdr_spc, 1580 ticket->t_res_num_ophdrs, ophdr_spc,
1583 ticket->t_res_arr_sum + 1581 ticket->t_res_arr_sum +
1584 ticket->t_res_o_flow + ophdr_spc, 1582 ticket->t_res_o_flow + ophdr_spc,
1585 ticket->t_res_num); 1583 ticket->t_res_num);
1586 1584
1587 for (i = 0; i < ticket->t_res_num; i++) { 1585 for (i = 0; i < ticket->t_res_num; i++) {
1588 uint r_type = ticket->t_res_arr[i].r_type; 1586 uint r_type = ticket->t_res_arr[i].r_type;
1589 cmn_err(CE_WARN, 1587 xfs_warn(mp, "region[%u]: %s - %u bytes\n", i,
1590 "region[%u]: %s - %u bytes\n",
1591 i,
1592 ((r_type <= 0 || r_type > XLOG_REG_TYPE_MAX) ? 1588 ((r_type <= 0 || r_type > XLOG_REG_TYPE_MAX) ?
1593 "bad-rtype" : res_type_str[r_type-1]), 1589 "bad-rtype" : res_type_str[r_type-1]),
1594 ticket->t_res_arr[i].r_len); 1590 ticket->t_res_arr[i].r_len);
1595 } 1591 }
1596 1592
1597 xfs_cmn_err(XFS_PTAG_LOGRES, CE_ALERT, mp, 1593 xfs_alert_tag(mp, XFS_PTAG_LOGRES,
1598 "xfs_log_write: reservation ran out. Need to up reservation"); 1594 "xfs_log_write: reservation ran out. Need to up reservation");
1599 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); 1595 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
1600} 1596}
@@ -1682,7 +1678,7 @@ xlog_write_setup_ophdr(
1682 case XFS_LOG: 1678 case XFS_LOG:
1683 break; 1679 break;
1684 default: 1680 default:
1685 xfs_fs_cmn_err(CE_WARN, log->l_mp, 1681 xfs_warn(log->l_mp,
1686 "Bad XFS transaction clientid 0x%x in ticket 0x%p", 1682 "Bad XFS transaction clientid 0x%x in ticket 0x%p",
1687 ophdr->oh_clientid, ticket); 1683 ophdr->oh_clientid, ticket);
1688 return NULL; 1684 return NULL;
@@ -2264,7 +2260,7 @@ xlog_state_do_callback(
2264 if (repeats > 5000) { 2260 if (repeats > 5000) {
2265 flushcnt += repeats; 2261 flushcnt += repeats;
2266 repeats = 0; 2262 repeats = 0;
2267 xfs_fs_cmn_err(CE_WARN, log->l_mp, 2263 xfs_warn(log->l_mp,
2268 "%s: possible infinite loop (%d iterations)", 2264 "%s: possible infinite loop (%d iterations)",
2269 __func__, flushcnt); 2265 __func__, flushcnt);
2270 } 2266 }
@@ -3052,10 +3048,8 @@ xfs_log_force(
3052 int error; 3048 int error;
3053 3049
3054 error = _xfs_log_force(mp, flags, NULL); 3050 error = _xfs_log_force(mp, flags, NULL);
3055 if (error) { 3051 if (error)
3056 xfs_fs_cmn_err(CE_WARN, mp, "xfs_log_force: " 3052 xfs_warn(mp, "%s: error %d returned.", __func__, error);
3057 "error %d returned.", error);
3058 }
3059} 3053}
3060 3054
3061/* 3055/*
@@ -3204,10 +3198,8 @@ xfs_log_force_lsn(
3204 int error; 3198 int error;
3205 3199
3206 error = _xfs_log_force_lsn(mp, lsn, flags, NULL); 3200 error = _xfs_log_force_lsn(mp, lsn, flags, NULL);
3207 if (error) { 3201 if (error)
3208 xfs_fs_cmn_err(CE_WARN, mp, "xfs_log_force: " 3202 xfs_warn(mp, "%s: error %d returned.", __func__, error);
3209 "error %d returned.", error);
3210 }
3211} 3203}
3212 3204
3213/* 3205/*
@@ -3412,7 +3404,7 @@ xlog_verify_dest_ptr(
3412 } 3404 }
3413 3405
3414 if (!good_ptr) 3406 if (!good_ptr)
3415 xlog_panic("xlog_verify_dest_ptr: invalid ptr"); 3407 xfs_emerg(log->l_mp, "%s: invalid ptr", __func__);
3416} 3408}
3417 3409
3418STATIC void 3410STATIC void
@@ -3448,16 +3440,16 @@ xlog_verify_tail_lsn(xlog_t *log,
3448 blocks = 3440 blocks =
3449 log->l_logBBsize - (log->l_prev_block - BLOCK_LSN(tail_lsn)); 3441 log->l_logBBsize - (log->l_prev_block - BLOCK_LSN(tail_lsn));
3450 if (blocks < BTOBB(iclog->ic_offset)+BTOBB(log->l_iclog_hsize)) 3442 if (blocks < BTOBB(iclog->ic_offset)+BTOBB(log->l_iclog_hsize))
3451 xlog_panic("xlog_verify_tail_lsn: ran out of log space"); 3443 xfs_emerg(log->l_mp, "%s: ran out of log space", __func__);
3452 } else { 3444 } else {
3453 ASSERT(CYCLE_LSN(tail_lsn)+1 == log->l_prev_cycle); 3445 ASSERT(CYCLE_LSN(tail_lsn)+1 == log->l_prev_cycle);
3454 3446
3455 if (BLOCK_LSN(tail_lsn) == log->l_prev_block) 3447 if (BLOCK_LSN(tail_lsn) == log->l_prev_block)
3456 xlog_panic("xlog_verify_tail_lsn: tail wrapped"); 3448 xfs_emerg(log->l_mp, "%s: tail wrapped", __func__);
3457 3449
3458 blocks = BLOCK_LSN(tail_lsn) - log->l_prev_block; 3450 blocks = BLOCK_LSN(tail_lsn) - log->l_prev_block;
3459 if (blocks < BTOBB(iclog->ic_offset) + 1) 3451 if (blocks < BTOBB(iclog->ic_offset) + 1)
3460 xlog_panic("xlog_verify_tail_lsn: ran out of log space"); 3452 xfs_emerg(log->l_mp, "%s: ran out of log space", __func__);
3461 } 3453 }
3462} /* xlog_verify_tail_lsn */ 3454} /* xlog_verify_tail_lsn */
3463 3455
@@ -3497,22 +3489,23 @@ xlog_verify_iclog(xlog_t *log,
3497 icptr = log->l_iclog; 3489 icptr = log->l_iclog;
3498 for (i=0; i < log->l_iclog_bufs; i++) { 3490 for (i=0; i < log->l_iclog_bufs; i++) {
3499 if (icptr == NULL) 3491 if (icptr == NULL)
3500 xlog_panic("xlog_verify_iclog: invalid ptr"); 3492 xfs_emerg(log->l_mp, "%s: invalid ptr", __func__);
3501 icptr = icptr->ic_next; 3493 icptr = icptr->ic_next;
3502 } 3494 }
3503 if (icptr != log->l_iclog) 3495 if (icptr != log->l_iclog)
3504 xlog_panic("xlog_verify_iclog: corrupt iclog ring"); 3496 xfs_emerg(log->l_mp, "%s: corrupt iclog ring", __func__);
3505 spin_unlock(&log->l_icloglock); 3497 spin_unlock(&log->l_icloglock);
3506 3498
3507 /* check log magic numbers */ 3499 /* check log magic numbers */
3508 if (be32_to_cpu(iclog->ic_header.h_magicno) != XLOG_HEADER_MAGIC_NUM) 3500 if (be32_to_cpu(iclog->ic_header.h_magicno) != XLOG_HEADER_MAGIC_NUM)
3509 xlog_panic("xlog_verify_iclog: invalid magic num"); 3501 xfs_emerg(log->l_mp, "%s: invalid magic num", __func__);
3510 3502
3511 ptr = (xfs_caddr_t) &iclog->ic_header; 3503 ptr = (xfs_caddr_t) &iclog->ic_header;
3512 for (ptr += BBSIZE; ptr < ((xfs_caddr_t)&iclog->ic_header) + count; 3504 for (ptr += BBSIZE; ptr < ((xfs_caddr_t)&iclog->ic_header) + count;
3513 ptr += BBSIZE) { 3505 ptr += BBSIZE) {
3514 if (be32_to_cpu(*(__be32 *)ptr) == XLOG_HEADER_MAGIC_NUM) 3506 if (be32_to_cpu(*(__be32 *)ptr) == XLOG_HEADER_MAGIC_NUM)
3515 xlog_panic("xlog_verify_iclog: unexpected magic num"); 3507 xfs_emerg(log->l_mp, "%s: unexpected magic num",
3508 __func__);
3516 } 3509 }
3517 3510
3518 /* check fields */ 3511 /* check fields */
@@ -3542,9 +3535,10 @@ xlog_verify_iclog(xlog_t *log,
3542 } 3535 }
3543 } 3536 }
3544 if (clientid != XFS_TRANSACTION && clientid != XFS_LOG) 3537 if (clientid != XFS_TRANSACTION && clientid != XFS_LOG)
3545 cmn_err(CE_WARN, "xlog_verify_iclog: " 3538 xfs_warn(log->l_mp,
3546 "invalid clientid %d op 0x%p offset 0x%lx", 3539 "%s: invalid clientid %d op 0x%p offset 0x%lx",
3547 clientid, ophead, (unsigned long)field_offset); 3540 __func__, clientid, ophead,
3541 (unsigned long)field_offset);
3548 3542
3549 /* check length */ 3543 /* check length */
3550 field_offset = (__psint_t) 3544 field_offset = (__psint_t)
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index 916eb7db14d9..3bd3291ef8d2 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -191,7 +191,7 @@ void xfs_log_ticket_put(struct xlog_ticket *ticket);
191 191
192xlog_tid_t xfs_log_get_trans_ident(struct xfs_trans *tp); 192xlog_tid_t xfs_log_get_trans_ident(struct xfs_trans *tp);
193 193
194int xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp, 194void xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp,
195 struct xfs_log_vec *log_vector, 195 struct xfs_log_vec *log_vector,
196 xfs_lsn_t *commit_lsn, int flags); 196 xfs_lsn_t *commit_lsn, int flags);
197bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip); 197bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip);
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index 9dc8125d04e5..9ca59be08977 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -543,7 +543,7 @@ xlog_cil_push(
543 543
544 error = xlog_write(log, &lvhdr, tic, &ctx->start_lsn, NULL, 0); 544 error = xlog_write(log, &lvhdr, tic, &ctx->start_lsn, NULL, 0);
545 if (error) 545 if (error)
546 goto out_abort; 546 goto out_abort_free_ticket;
547 547
548 /* 548 /*
549 * now that we've written the checkpoint into the log, strictly 549 * now that we've written the checkpoint into the log, strictly
@@ -569,8 +569,9 @@ restart:
569 } 569 }
570 spin_unlock(&cil->xc_cil_lock); 570 spin_unlock(&cil->xc_cil_lock);
571 571
572 /* xfs_log_done always frees the ticket on error. */
572 commit_lsn = xfs_log_done(log->l_mp, tic, &commit_iclog, 0); 573 commit_lsn = xfs_log_done(log->l_mp, tic, &commit_iclog, 0);
573 if (error || commit_lsn == -1) 574 if (commit_lsn == -1)
574 goto out_abort; 575 goto out_abort;
575 576
576 /* attach all the transactions w/ busy extents to iclog */ 577 /* attach all the transactions w/ busy extents to iclog */
@@ -600,6 +601,8 @@ out_free_ticket:
600 kmem_free(new_ctx); 601 kmem_free(new_ctx);
601 return 0; 602 return 0;
602 603
604out_abort_free_ticket:
605 xfs_log_ticket_put(tic);
603out_abort: 606out_abort:
604 xlog_cil_committed(ctx, XFS_LI_ABORTED); 607 xlog_cil_committed(ctx, XFS_LI_ABORTED);
605 return XFS_ERROR(EIO); 608 return XFS_ERROR(EIO);
@@ -622,7 +625,7 @@ out_abort:
622 * background commit, returns without it held once background commits are 625 * background commit, returns without it held once background commits are
623 * allowed again. 626 * allowed again.
624 */ 627 */
625int 628void
626xfs_log_commit_cil( 629xfs_log_commit_cil(
627 struct xfs_mount *mp, 630 struct xfs_mount *mp,
628 struct xfs_trans *tp, 631 struct xfs_trans *tp,
@@ -637,11 +640,6 @@ xfs_log_commit_cil(
637 if (flags & XFS_TRANS_RELEASE_LOG_RES) 640 if (flags & XFS_TRANS_RELEASE_LOG_RES)
638 log_flags = XFS_LOG_REL_PERM_RESERV; 641 log_flags = XFS_LOG_REL_PERM_RESERV;
639 642
640 if (XLOG_FORCED_SHUTDOWN(log)) {
641 xlog_cil_free_logvec(log_vector);
642 return XFS_ERROR(EIO);
643 }
644
645 /* 643 /*
646 * do all the hard work of formatting items (including memory 644 * do all the hard work of formatting items (including memory
647 * allocation) outside the CIL context lock. This prevents stalling CIL 645 * allocation) outside the CIL context lock. This prevents stalling CIL
@@ -701,7 +699,6 @@ xfs_log_commit_cil(
701 */ 699 */
702 if (push) 700 if (push)
703 xlog_cil_push(log, 0); 701 xlog_cil_push(log, 0);
704 return 0;
705} 702}
706 703
707/* 704/*
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index d5f8be8f4bf6..15dbf1f9c2be 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -87,10 +87,6 @@ static inline uint xlog_get_client_id(__be32 i)
87 return be32_to_cpu(i) >> 24; 87 return be32_to_cpu(i) >> 24;
88} 88}
89 89
90#define xlog_panic(args...) cmn_err(CE_PANIC, ## args)
91#define xlog_exit(args...) cmn_err(CE_PANIC, ## args)
92#define xlog_warn(args...) cmn_err(CE_WARN, ## args)
93
94/* 90/*
95 * In core log state 91 * In core log state
96 */ 92 */
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index aa0ebb776903..0c4a5618e7af 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -92,7 +92,7 @@ xlog_get_bp(
92 int nbblks) 92 int nbblks)
93{ 93{
94 if (!xlog_buf_bbcount_valid(log, nbblks)) { 94 if (!xlog_buf_bbcount_valid(log, nbblks)) {
95 xlog_warn("XFS: Invalid block length (0x%x) given for buffer", 95 xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer",
96 nbblks); 96 nbblks);
97 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp); 97 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
98 return NULL; 98 return NULL;
@@ -160,7 +160,7 @@ xlog_bread_noalign(
160 int error; 160 int error;
161 161
162 if (!xlog_buf_bbcount_valid(log, nbblks)) { 162 if (!xlog_buf_bbcount_valid(log, nbblks)) {
163 xlog_warn("XFS: Invalid block length (0x%x) given for buffer", 163 xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer",
164 nbblks); 164 nbblks);
165 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp); 165 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
166 return EFSCORRUPTED; 166 return EFSCORRUPTED;
@@ -219,7 +219,7 @@ xlog_bwrite(
219 int error; 219 int error;
220 220
221 if (!xlog_buf_bbcount_valid(log, nbblks)) { 221 if (!xlog_buf_bbcount_valid(log, nbblks)) {
222 xlog_warn("XFS: Invalid block length (0x%x) given for buffer", 222 xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer",
223 nbblks); 223 nbblks);
224 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp); 224 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
225 return EFSCORRUPTED; 225 return EFSCORRUPTED;
@@ -254,9 +254,9 @@ xlog_header_check_dump(
254 xfs_mount_t *mp, 254 xfs_mount_t *mp,
255 xlog_rec_header_t *head) 255 xlog_rec_header_t *head)
256{ 256{
257 cmn_err(CE_DEBUG, "%s: SB : uuid = %pU, fmt = %d\n", 257 xfs_debug(mp, "%s: SB : uuid = %pU, fmt = %d\n",
258 __func__, &mp->m_sb.sb_uuid, XLOG_FMT); 258 __func__, &mp->m_sb.sb_uuid, XLOG_FMT);
259 cmn_err(CE_DEBUG, " log : uuid = %pU, fmt = %d\n", 259 xfs_debug(mp, " log : uuid = %pU, fmt = %d\n",
260 &head->h_fs_uuid, be32_to_cpu(head->h_fmt)); 260 &head->h_fs_uuid, be32_to_cpu(head->h_fmt));
261} 261}
262#else 262#else
@@ -279,15 +279,15 @@ xlog_header_check_recover(
279 * a dirty log created in IRIX. 279 * a dirty log created in IRIX.
280 */ 280 */
281 if (unlikely(be32_to_cpu(head->h_fmt) != XLOG_FMT)) { 281 if (unlikely(be32_to_cpu(head->h_fmt) != XLOG_FMT)) {
282 xlog_warn( 282 xfs_warn(mp,
283 "XFS: dirty log written in incompatible format - can't recover"); 283 "dirty log written in incompatible format - can't recover");
284 xlog_header_check_dump(mp, head); 284 xlog_header_check_dump(mp, head);
285 XFS_ERROR_REPORT("xlog_header_check_recover(1)", 285 XFS_ERROR_REPORT("xlog_header_check_recover(1)",
286 XFS_ERRLEVEL_HIGH, mp); 286 XFS_ERRLEVEL_HIGH, mp);
287 return XFS_ERROR(EFSCORRUPTED); 287 return XFS_ERROR(EFSCORRUPTED);
288 } else if (unlikely(!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid))) { 288 } else if (unlikely(!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid))) {
289 xlog_warn( 289 xfs_warn(mp,
290 "XFS: dirty log entry has mismatched uuid - can't recover"); 290 "dirty log entry has mismatched uuid - can't recover");
291 xlog_header_check_dump(mp, head); 291 xlog_header_check_dump(mp, head);
292 XFS_ERROR_REPORT("xlog_header_check_recover(2)", 292 XFS_ERROR_REPORT("xlog_header_check_recover(2)",
293 XFS_ERRLEVEL_HIGH, mp); 293 XFS_ERRLEVEL_HIGH, mp);
@@ -312,9 +312,9 @@ xlog_header_check_mount(
312 * h_fs_uuid is nil, we assume this log was last mounted 312 * h_fs_uuid is nil, we assume this log was last mounted
313 * by IRIX and continue. 313 * by IRIX and continue.
314 */ 314 */
315 xlog_warn("XFS: nil uuid in log - IRIX style log"); 315 xfs_warn(mp, "nil uuid in log - IRIX style log");
316 } else if (unlikely(!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid))) { 316 } else if (unlikely(!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid))) {
317 xlog_warn("XFS: log has mismatched uuid - can't recover"); 317 xfs_warn(mp, "log has mismatched uuid - can't recover");
318 xlog_header_check_dump(mp, head); 318 xlog_header_check_dump(mp, head);
319 XFS_ERROR_REPORT("xlog_header_check_mount", 319 XFS_ERROR_REPORT("xlog_header_check_mount",
320 XFS_ERRLEVEL_HIGH, mp); 320 XFS_ERRLEVEL_HIGH, mp);
@@ -490,8 +490,8 @@ xlog_find_verify_log_record(
490 for (i = (*last_blk) - 1; i >= 0; i--) { 490 for (i = (*last_blk) - 1; i >= 0; i--) {
491 if (i < start_blk) { 491 if (i < start_blk) {
492 /* valid log record not found */ 492 /* valid log record not found */
493 xlog_warn( 493 xfs_warn(log->l_mp,
494 "XFS: Log inconsistent (didn't find previous header)"); 494 "Log inconsistent (didn't find previous header)");
495 ASSERT(0); 495 ASSERT(0);
496 error = XFS_ERROR(EIO); 496 error = XFS_ERROR(EIO);
497 goto out; 497 goto out;
@@ -591,12 +591,12 @@ xlog_find_head(
591 * mkfs etc write a dummy unmount record to a fresh 591 * mkfs etc write a dummy unmount record to a fresh
592 * log so we can store the uuid in there 592 * log so we can store the uuid in there
593 */ 593 */
594 xlog_warn("XFS: totally zeroed log"); 594 xfs_warn(log->l_mp, "totally zeroed log");
595 } 595 }
596 596
597 return 0; 597 return 0;
598 } else if (error) { 598 } else if (error) {
599 xlog_warn("XFS: empty log check failed"); 599 xfs_warn(log->l_mp, "empty log check failed");
600 return error; 600 return error;
601 } 601 }
602 602
@@ -819,7 +819,7 @@ validate_head:
819 xlog_put_bp(bp); 819 xlog_put_bp(bp);
820 820
821 if (error) 821 if (error)
822 xlog_warn("XFS: failed to find log head"); 822 xfs_warn(log->l_mp, "failed to find log head");
823 return error; 823 return error;
824} 824}
825 825
@@ -912,7 +912,7 @@ xlog_find_tail(
912 } 912 }
913 } 913 }
914 if (!found) { 914 if (!found) {
915 xlog_warn("XFS: xlog_find_tail: couldn't find sync record"); 915 xfs_warn(log->l_mp, "%s: couldn't find sync record", __func__);
916 ASSERT(0); 916 ASSERT(0);
917 return XFS_ERROR(EIO); 917 return XFS_ERROR(EIO);
918 } 918 }
@@ -1028,7 +1028,7 @@ done:
1028 xlog_put_bp(bp); 1028 xlog_put_bp(bp);
1029 1029
1030 if (error) 1030 if (error)
1031 xlog_warn("XFS: failed to locate log tail"); 1031 xfs_warn(log->l_mp, "failed to locate log tail");
1032 return error; 1032 return error;
1033} 1033}
1034 1034
@@ -1092,7 +1092,8 @@ xlog_find_zeroed(
1092 * the first block must be 1. If it's not, maybe we're 1092 * the first block must be 1. If it's not, maybe we're
1093 * not looking at a log... Bail out. 1093 * not looking at a log... Bail out.
1094 */ 1094 */
1095 xlog_warn("XFS: Log inconsistent or not a log (last==0, first!=1)"); 1095 xfs_warn(log->l_mp,
1096 "Log inconsistent or not a log (last==0, first!=1)");
1096 return XFS_ERROR(EINVAL); 1097 return XFS_ERROR(EINVAL);
1097 } 1098 }
1098 1099
@@ -1506,8 +1507,8 @@ xlog_recover_add_to_trans(
1506 if (list_empty(&trans->r_itemq)) { 1507 if (list_empty(&trans->r_itemq)) {
1507 /* we need to catch log corruptions here */ 1508 /* we need to catch log corruptions here */
1508 if (*(uint *)dp != XFS_TRANS_HEADER_MAGIC) { 1509 if (*(uint *)dp != XFS_TRANS_HEADER_MAGIC) {
1509 xlog_warn("XFS: xlog_recover_add_to_trans: " 1510 xfs_warn(log->l_mp, "%s: bad header magic number",
1510 "bad header magic number"); 1511 __func__);
1511 ASSERT(0); 1512 ASSERT(0);
1512 return XFS_ERROR(EIO); 1513 return XFS_ERROR(EIO);
1513 } 1514 }
@@ -1534,8 +1535,8 @@ xlog_recover_add_to_trans(
1534 if (item->ri_total == 0) { /* first region to be added */ 1535 if (item->ri_total == 0) { /* first region to be added */
1535 if (in_f->ilf_size == 0 || 1536 if (in_f->ilf_size == 0 ||
1536 in_f->ilf_size > XLOG_MAX_REGIONS_IN_ITEM) { 1537 in_f->ilf_size > XLOG_MAX_REGIONS_IN_ITEM) {
1537 xlog_warn( 1538 xfs_warn(log->l_mp,
1538 "XFS: bad number of regions (%d) in inode log format", 1539 "bad number of regions (%d) in inode log format",
1539 in_f->ilf_size); 1540 in_f->ilf_size);
1540 ASSERT(0); 1541 ASSERT(0);
1541 return XFS_ERROR(EIO); 1542 return XFS_ERROR(EIO);
@@ -1592,8 +1593,9 @@ xlog_recover_reorder_trans(
1592 list_move_tail(&item->ri_list, &trans->r_itemq); 1593 list_move_tail(&item->ri_list, &trans->r_itemq);
1593 break; 1594 break;
1594 default: 1595 default:
1595 xlog_warn( 1596 xfs_warn(log->l_mp,
1596 "XFS: xlog_recover_reorder_trans: unrecognized type of log operation"); 1597 "%s: unrecognized type of log operation",
1598 __func__);
1597 ASSERT(0); 1599 ASSERT(0);
1598 return XFS_ERROR(EIO); 1600 return XFS_ERROR(EIO);
1599 } 1601 }
@@ -1803,8 +1805,9 @@ xlog_recover_do_inode_buffer(
1803 logged_nextp = item->ri_buf[item_index].i_addr + 1805 logged_nextp = item->ri_buf[item_index].i_addr +
1804 next_unlinked_offset - reg_buf_offset; 1806 next_unlinked_offset - reg_buf_offset;
1805 if (unlikely(*logged_nextp == 0)) { 1807 if (unlikely(*logged_nextp == 0)) {
1806 xfs_fs_cmn_err(CE_ALERT, mp, 1808 xfs_alert(mp,
1807 "bad inode buffer log record (ptr = 0x%p, bp = 0x%p). XFS trying to replay bad (0) inode di_next_unlinked field", 1809 "Bad inode buffer log record (ptr = 0x%p, bp = 0x%p). "
1810 "Trying to replay bad (0) inode di_next_unlinked field.",
1808 item, bp); 1811 item, bp);
1809 XFS_ERROR_REPORT("xlog_recover_do_inode_buf", 1812 XFS_ERROR_REPORT("xlog_recover_do_inode_buf",
1810 XFS_ERRLEVEL_LOW, mp); 1813 XFS_ERRLEVEL_LOW, mp);
@@ -1863,17 +1866,17 @@ xlog_recover_do_reg_buffer(
1863 if (buf_f->blf_flags & 1866 if (buf_f->blf_flags &
1864 (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) { 1867 (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) {
1865 if (item->ri_buf[i].i_addr == NULL) { 1868 if (item->ri_buf[i].i_addr == NULL) {
1866 cmn_err(CE_ALERT, 1869 xfs_alert(mp,
1867 "XFS: NULL dquot in %s.", __func__); 1870 "XFS: NULL dquot in %s.", __func__);
1868 goto next; 1871 goto next;
1869 } 1872 }
1870 if (item->ri_buf[i].i_len < sizeof(xfs_disk_dquot_t)) { 1873 if (item->ri_buf[i].i_len < sizeof(xfs_disk_dquot_t)) {
1871 cmn_err(CE_ALERT, 1874 xfs_alert(mp,
1872 "XFS: dquot too small (%d) in %s.", 1875 "XFS: dquot too small (%d) in %s.",
1873 item->ri_buf[i].i_len, __func__); 1876 item->ri_buf[i].i_len, __func__);
1874 goto next; 1877 goto next;
1875 } 1878 }
1876 error = xfs_qm_dqcheck(item->ri_buf[i].i_addr, 1879 error = xfs_qm_dqcheck(mp, item->ri_buf[i].i_addr,
1877 -1, 0, XFS_QMOPT_DOWARN, 1880 -1, 0, XFS_QMOPT_DOWARN,
1878 "dquot_buf_recover"); 1881 "dquot_buf_recover");
1879 if (error) 1882 if (error)
@@ -1898,6 +1901,7 @@ xlog_recover_do_reg_buffer(
1898 */ 1901 */
1899int 1902int
1900xfs_qm_dqcheck( 1903xfs_qm_dqcheck(
1904 struct xfs_mount *mp,
1901 xfs_disk_dquot_t *ddq, 1905 xfs_disk_dquot_t *ddq,
1902 xfs_dqid_t id, 1906 xfs_dqid_t id,
1903 uint type, /* used only when IO_dorepair is true */ 1907 uint type, /* used only when IO_dorepair is true */
@@ -1924,14 +1928,14 @@ xfs_qm_dqcheck(
1924 */ 1928 */
1925 if (be16_to_cpu(ddq->d_magic) != XFS_DQUOT_MAGIC) { 1929 if (be16_to_cpu(ddq->d_magic) != XFS_DQUOT_MAGIC) {
1926 if (flags & XFS_QMOPT_DOWARN) 1930 if (flags & XFS_QMOPT_DOWARN)
1927 cmn_err(CE_ALERT, 1931 xfs_alert(mp,
1928 "%s : XFS dquot ID 0x%x, magic 0x%x != 0x%x", 1932 "%s : XFS dquot ID 0x%x, magic 0x%x != 0x%x",
1929 str, id, be16_to_cpu(ddq->d_magic), XFS_DQUOT_MAGIC); 1933 str, id, be16_to_cpu(ddq->d_magic), XFS_DQUOT_MAGIC);
1930 errs++; 1934 errs++;
1931 } 1935 }
1932 if (ddq->d_version != XFS_DQUOT_VERSION) { 1936 if (ddq->d_version != XFS_DQUOT_VERSION) {
1933 if (flags & XFS_QMOPT_DOWARN) 1937 if (flags & XFS_QMOPT_DOWARN)
1934 cmn_err(CE_ALERT, 1938 xfs_alert(mp,
1935 "%s : XFS dquot ID 0x%x, version 0x%x != 0x%x", 1939 "%s : XFS dquot ID 0x%x, version 0x%x != 0x%x",
1936 str, id, ddq->d_version, XFS_DQUOT_VERSION); 1940 str, id, ddq->d_version, XFS_DQUOT_VERSION);
1937 errs++; 1941 errs++;
@@ -1941,7 +1945,7 @@ xfs_qm_dqcheck(
1941 ddq->d_flags != XFS_DQ_PROJ && 1945 ddq->d_flags != XFS_DQ_PROJ &&
1942 ddq->d_flags != XFS_DQ_GROUP) { 1946 ddq->d_flags != XFS_DQ_GROUP) {
1943 if (flags & XFS_QMOPT_DOWARN) 1947 if (flags & XFS_QMOPT_DOWARN)
1944 cmn_err(CE_ALERT, 1948 xfs_alert(mp,
1945 "%s : XFS dquot ID 0x%x, unknown flags 0x%x", 1949 "%s : XFS dquot ID 0x%x, unknown flags 0x%x",
1946 str, id, ddq->d_flags); 1950 str, id, ddq->d_flags);
1947 errs++; 1951 errs++;
@@ -1949,7 +1953,7 @@ xfs_qm_dqcheck(
1949 1953
1950 if (id != -1 && id != be32_to_cpu(ddq->d_id)) { 1954 if (id != -1 && id != be32_to_cpu(ddq->d_id)) {
1951 if (flags & XFS_QMOPT_DOWARN) 1955 if (flags & XFS_QMOPT_DOWARN)
1952 cmn_err(CE_ALERT, 1956 xfs_alert(mp,
1953 "%s : ondisk-dquot 0x%p, ID mismatch: " 1957 "%s : ondisk-dquot 0x%p, ID mismatch: "
1954 "0x%x expected, found id 0x%x", 1958 "0x%x expected, found id 0x%x",
1955 str, ddq, id, be32_to_cpu(ddq->d_id)); 1959 str, ddq, id, be32_to_cpu(ddq->d_id));
@@ -1962,9 +1966,8 @@ xfs_qm_dqcheck(
1962 be64_to_cpu(ddq->d_blk_softlimit)) { 1966 be64_to_cpu(ddq->d_blk_softlimit)) {
1963 if (!ddq->d_btimer) { 1967 if (!ddq->d_btimer) {
1964 if (flags & XFS_QMOPT_DOWARN) 1968 if (flags & XFS_QMOPT_DOWARN)
1965 cmn_err(CE_ALERT, 1969 xfs_alert(mp,
1966 "%s : Dquot ID 0x%x (0x%p) " 1970 "%s : Dquot ID 0x%x (0x%p) BLK TIMER NOT STARTED",
1967 "BLK TIMER NOT STARTED",
1968 str, (int)be32_to_cpu(ddq->d_id), ddq); 1971 str, (int)be32_to_cpu(ddq->d_id), ddq);
1969 errs++; 1972 errs++;
1970 } 1973 }
@@ -1974,9 +1977,8 @@ xfs_qm_dqcheck(
1974 be64_to_cpu(ddq->d_ino_softlimit)) { 1977 be64_to_cpu(ddq->d_ino_softlimit)) {
1975 if (!ddq->d_itimer) { 1978 if (!ddq->d_itimer) {
1976 if (flags & XFS_QMOPT_DOWARN) 1979 if (flags & XFS_QMOPT_DOWARN)
1977 cmn_err(CE_ALERT, 1980 xfs_alert(mp,
1978 "%s : Dquot ID 0x%x (0x%p) " 1981 "%s : Dquot ID 0x%x (0x%p) INODE TIMER NOT STARTED",
1979 "INODE TIMER NOT STARTED",
1980 str, (int)be32_to_cpu(ddq->d_id), ddq); 1982 str, (int)be32_to_cpu(ddq->d_id), ddq);
1981 errs++; 1983 errs++;
1982 } 1984 }
@@ -1986,9 +1988,8 @@ xfs_qm_dqcheck(
1986 be64_to_cpu(ddq->d_rtb_softlimit)) { 1988 be64_to_cpu(ddq->d_rtb_softlimit)) {
1987 if (!ddq->d_rtbtimer) { 1989 if (!ddq->d_rtbtimer) {
1988 if (flags & XFS_QMOPT_DOWARN) 1990 if (flags & XFS_QMOPT_DOWARN)
1989 cmn_err(CE_ALERT, 1991 xfs_alert(mp,
1990 "%s : Dquot ID 0x%x (0x%p) " 1992 "%s : Dquot ID 0x%x (0x%p) RTBLK TIMER NOT STARTED",
1991 "RTBLK TIMER NOT STARTED",
1992 str, (int)be32_to_cpu(ddq->d_id), ddq); 1993 str, (int)be32_to_cpu(ddq->d_id), ddq);
1993 errs++; 1994 errs++;
1994 } 1995 }
@@ -1999,7 +2000,7 @@ xfs_qm_dqcheck(
1999 return errs; 2000 return errs;
2000 2001
2001 if (flags & XFS_QMOPT_DOWARN) 2002 if (flags & XFS_QMOPT_DOWARN)
2002 cmn_err(CE_NOTE, "Re-initializing dquot ID 0x%x", id); 2003 xfs_notice(mp, "Re-initializing dquot ID 0x%x", id);
2003 2004
2004 /* 2005 /*
2005 * Typically, a repair is only requested by quotacheck. 2006 * Typically, a repair is only requested by quotacheck.
@@ -2218,9 +2219,9 @@ xlog_recover_inode_pass2(
2218 */ 2219 */
2219 if (unlikely(be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC)) { 2220 if (unlikely(be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC)) {
2220 xfs_buf_relse(bp); 2221 xfs_buf_relse(bp);
2221 xfs_fs_cmn_err(CE_ALERT, mp, 2222 xfs_alert(mp,
2222 "xfs_inode_recover: Bad inode magic number, dino ptr = 0x%p, dino bp = 0x%p, ino = %Ld", 2223 "%s: Bad inode magic number, dip = 0x%p, dino bp = 0x%p, ino = %Ld",
2223 dip, bp, in_f->ilf_ino); 2224 __func__, dip, bp, in_f->ilf_ino);
2224 XFS_ERROR_REPORT("xlog_recover_inode_pass2(1)", 2225 XFS_ERROR_REPORT("xlog_recover_inode_pass2(1)",
2225 XFS_ERRLEVEL_LOW, mp); 2226 XFS_ERRLEVEL_LOW, mp);
2226 error = EFSCORRUPTED; 2227 error = EFSCORRUPTED;
@@ -2229,9 +2230,9 @@ xlog_recover_inode_pass2(
2229 dicp = item->ri_buf[1].i_addr; 2230 dicp = item->ri_buf[1].i_addr;
2230 if (unlikely(dicp->di_magic != XFS_DINODE_MAGIC)) { 2231 if (unlikely(dicp->di_magic != XFS_DINODE_MAGIC)) {
2231 xfs_buf_relse(bp); 2232 xfs_buf_relse(bp);
2232 xfs_fs_cmn_err(CE_ALERT, mp, 2233 xfs_alert(mp,
2233 "xfs_inode_recover: Bad inode log record, rec ptr 0x%p, ino %Ld", 2234 "%s: Bad inode log record, rec ptr 0x%p, ino %Ld",
2234 item, in_f->ilf_ino); 2235 __func__, item, in_f->ilf_ino);
2235 XFS_ERROR_REPORT("xlog_recover_inode_pass2(2)", 2236 XFS_ERROR_REPORT("xlog_recover_inode_pass2(2)",
2236 XFS_ERRLEVEL_LOW, mp); 2237 XFS_ERRLEVEL_LOW, mp);
2237 error = EFSCORRUPTED; 2238 error = EFSCORRUPTED;
@@ -2263,9 +2264,10 @@ xlog_recover_inode_pass2(
2263 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(3)", 2264 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(3)",
2264 XFS_ERRLEVEL_LOW, mp, dicp); 2265 XFS_ERRLEVEL_LOW, mp, dicp);
2265 xfs_buf_relse(bp); 2266 xfs_buf_relse(bp);
2266 xfs_fs_cmn_err(CE_ALERT, mp, 2267 xfs_alert(mp,
2267 "xfs_inode_recover: Bad regular inode log record, rec ptr 0x%p, ino ptr = 0x%p, ino bp = 0x%p, ino %Ld", 2268 "%s: Bad regular inode log record, rec ptr 0x%p, "
2268 item, dip, bp, in_f->ilf_ino); 2269 "ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
2270 __func__, item, dip, bp, in_f->ilf_ino);
2269 error = EFSCORRUPTED; 2271 error = EFSCORRUPTED;
2270 goto error; 2272 goto error;
2271 } 2273 }
@@ -2276,9 +2278,10 @@ xlog_recover_inode_pass2(
2276 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(4)", 2278 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(4)",
2277 XFS_ERRLEVEL_LOW, mp, dicp); 2279 XFS_ERRLEVEL_LOW, mp, dicp);
2278 xfs_buf_relse(bp); 2280 xfs_buf_relse(bp);
2279 xfs_fs_cmn_err(CE_ALERT, mp, 2281 xfs_alert(mp,
2280 "xfs_inode_recover: Bad dir inode log record, rec ptr 0x%p, ino ptr = 0x%p, ino bp = 0x%p, ino %Ld", 2282 "%s: Bad dir inode log record, rec ptr 0x%p, "
2281 item, dip, bp, in_f->ilf_ino); 2283 "ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
2284 __func__, item, dip, bp, in_f->ilf_ino);
2282 error = EFSCORRUPTED; 2285 error = EFSCORRUPTED;
2283 goto error; 2286 goto error;
2284 } 2287 }
@@ -2287,9 +2290,10 @@ xlog_recover_inode_pass2(
2287 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(5)", 2290 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(5)",
2288 XFS_ERRLEVEL_LOW, mp, dicp); 2291 XFS_ERRLEVEL_LOW, mp, dicp);
2289 xfs_buf_relse(bp); 2292 xfs_buf_relse(bp);
2290 xfs_fs_cmn_err(CE_ALERT, mp, 2293 xfs_alert(mp,
2291 "xfs_inode_recover: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, dino bp 0x%p, ino %Ld, total extents = %d, nblocks = %Ld", 2294 "%s: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, "
2292 item, dip, bp, in_f->ilf_ino, 2295 "dino bp 0x%p, ino %Ld, total extents = %d, nblocks = %Ld",
2296 __func__, item, dip, bp, in_f->ilf_ino,
2293 dicp->di_nextents + dicp->di_anextents, 2297 dicp->di_nextents + dicp->di_anextents,
2294 dicp->di_nblocks); 2298 dicp->di_nblocks);
2295 error = EFSCORRUPTED; 2299 error = EFSCORRUPTED;
@@ -2299,8 +2303,9 @@ xlog_recover_inode_pass2(
2299 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(6)", 2303 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(6)",
2300 XFS_ERRLEVEL_LOW, mp, dicp); 2304 XFS_ERRLEVEL_LOW, mp, dicp);
2301 xfs_buf_relse(bp); 2305 xfs_buf_relse(bp);
2302 xfs_fs_cmn_err(CE_ALERT, mp, 2306 xfs_alert(mp,
2303 "xfs_inode_recover: Bad inode log rec ptr 0x%p, dino ptr 0x%p, dino bp 0x%p, ino %Ld, forkoff 0x%x", 2307 "%s: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, "
2308 "dino bp 0x%p, ino %Ld, forkoff 0x%x", __func__,
2304 item, dip, bp, in_f->ilf_ino, dicp->di_forkoff); 2309 item, dip, bp, in_f->ilf_ino, dicp->di_forkoff);
2305 error = EFSCORRUPTED; 2310 error = EFSCORRUPTED;
2306 goto error; 2311 goto error;
@@ -2309,9 +2314,9 @@ xlog_recover_inode_pass2(
2309 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(7)", 2314 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(7)",
2310 XFS_ERRLEVEL_LOW, mp, dicp); 2315 XFS_ERRLEVEL_LOW, mp, dicp);
2311 xfs_buf_relse(bp); 2316 xfs_buf_relse(bp);
2312 xfs_fs_cmn_err(CE_ALERT, mp, 2317 xfs_alert(mp,
2313 "xfs_inode_recover: Bad inode log record length %d, rec ptr 0x%p", 2318 "%s: Bad inode log record length %d, rec ptr 0x%p",
2314 item->ri_buf[1].i_len, item); 2319 __func__, item->ri_buf[1].i_len, item);
2315 error = EFSCORRUPTED; 2320 error = EFSCORRUPTED;
2316 goto error; 2321 goto error;
2317 } 2322 }
@@ -2398,7 +2403,7 @@ xlog_recover_inode_pass2(
2398 break; 2403 break;
2399 2404
2400 default: 2405 default:
2401 xlog_warn("XFS: xlog_recover_inode_pass2: Invalid flag"); 2406 xfs_warn(log->l_mp, "%s: Invalid flag", __func__);
2402 ASSERT(0); 2407 ASSERT(0);
2403 xfs_buf_relse(bp); 2408 xfs_buf_relse(bp);
2404 error = EIO; 2409 error = EIO;
@@ -2467,13 +2472,11 @@ xlog_recover_dquot_pass2(
2467 2472
2468 recddq = item->ri_buf[1].i_addr; 2473 recddq = item->ri_buf[1].i_addr;
2469 if (recddq == NULL) { 2474 if (recddq == NULL) {
2470 cmn_err(CE_ALERT, 2475 xfs_alert(log->l_mp, "NULL dquot in %s.", __func__);
2471 "XFS: NULL dquot in %s.", __func__);
2472 return XFS_ERROR(EIO); 2476 return XFS_ERROR(EIO);
2473 } 2477 }
2474 if (item->ri_buf[1].i_len < sizeof(xfs_disk_dquot_t)) { 2478 if (item->ri_buf[1].i_len < sizeof(xfs_disk_dquot_t)) {
2475 cmn_err(CE_ALERT, 2479 xfs_alert(log->l_mp, "dquot too small (%d) in %s.",
2476 "XFS: dquot too small (%d) in %s.",
2477 item->ri_buf[1].i_len, __func__); 2480 item->ri_buf[1].i_len, __func__);
2478 return XFS_ERROR(EIO); 2481 return XFS_ERROR(EIO);
2479 } 2482 }
@@ -2498,12 +2501,10 @@ xlog_recover_dquot_pass2(
2498 */ 2501 */
2499 dq_f = item->ri_buf[0].i_addr; 2502 dq_f = item->ri_buf[0].i_addr;
2500 ASSERT(dq_f); 2503 ASSERT(dq_f);
2501 if ((error = xfs_qm_dqcheck(recddq, 2504 error = xfs_qm_dqcheck(mp, recddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN,
2502 dq_f->qlf_id, 2505 "xlog_recover_dquot_pass2 (log copy)");
2503 0, XFS_QMOPT_DOWARN, 2506 if (error)
2504 "xlog_recover_dquot_pass2 (log copy)"))) {
2505 return XFS_ERROR(EIO); 2507 return XFS_ERROR(EIO);
2506 }
2507 ASSERT(dq_f->qlf_len == 1); 2508 ASSERT(dq_f->qlf_len == 1);
2508 2509
2509 error = xfs_read_buf(mp, mp->m_ddev_targp, 2510 error = xfs_read_buf(mp, mp->m_ddev_targp,
@@ -2523,8 +2524,9 @@ xlog_recover_dquot_pass2(
2523 * was among a chunk of dquots created earlier, and we did some 2524 * was among a chunk of dquots created earlier, and we did some
2524 * minimal initialization then. 2525 * minimal initialization then.
2525 */ 2526 */
2526 if (xfs_qm_dqcheck(ddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN, 2527 error = xfs_qm_dqcheck(mp, ddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN,
2527 "xlog_recover_dquot_pass2")) { 2528 "xlog_recover_dquot_pass2");
2529 if (error) {
2528 xfs_buf_relse(bp); 2530 xfs_buf_relse(bp);
2529 return XFS_ERROR(EIO); 2531 return XFS_ERROR(EIO);
2530 } 2532 }
@@ -2676,9 +2678,8 @@ xlog_recover_commit_pass1(
2676 /* nothing to do in pass 1 */ 2678 /* nothing to do in pass 1 */
2677 return 0; 2679 return 0;
2678 default: 2680 default:
2679 xlog_warn( 2681 xfs_warn(log->l_mp, "%s: invalid item type (%d)",
2680 "XFS: invalid item type (%d) xlog_recover_commit_pass1", 2682 __func__, ITEM_TYPE(item));
2681 ITEM_TYPE(item));
2682 ASSERT(0); 2683 ASSERT(0);
2683 return XFS_ERROR(EIO); 2684 return XFS_ERROR(EIO);
2684 } 2685 }
@@ -2707,9 +2708,8 @@ xlog_recover_commit_pass2(
2707 /* nothing to do in pass2 */ 2708 /* nothing to do in pass2 */
2708 return 0; 2709 return 0;
2709 default: 2710 default:
2710 xlog_warn( 2711 xfs_warn(log->l_mp, "%s: invalid item type (%d)",
2711 "XFS: invalid item type (%d) xlog_recover_commit_pass2", 2712 __func__, ITEM_TYPE(item));
2712 ITEM_TYPE(item));
2713 ASSERT(0); 2713 ASSERT(0);
2714 return XFS_ERROR(EIO); 2714 return XFS_ERROR(EIO);
2715 } 2715 }
@@ -2751,10 +2751,11 @@ xlog_recover_commit_trans(
2751 2751
2752STATIC int 2752STATIC int
2753xlog_recover_unmount_trans( 2753xlog_recover_unmount_trans(
2754 struct log *log,
2754 xlog_recover_t *trans) 2755 xlog_recover_t *trans)
2755{ 2756{
2756 /* Do nothing now */ 2757 /* Do nothing now */
2757 xlog_warn("XFS: xlog_recover_unmount_trans: Unmount LR"); 2758 xfs_warn(log->l_mp, "%s: Unmount LR", __func__);
2758 return 0; 2759 return 0;
2759} 2760}
2760 2761
@@ -2797,8 +2798,8 @@ xlog_recover_process_data(
2797 dp += sizeof(xlog_op_header_t); 2798 dp += sizeof(xlog_op_header_t);
2798 if (ohead->oh_clientid != XFS_TRANSACTION && 2799 if (ohead->oh_clientid != XFS_TRANSACTION &&
2799 ohead->oh_clientid != XFS_LOG) { 2800 ohead->oh_clientid != XFS_LOG) {
2800 xlog_warn( 2801 xfs_warn(log->l_mp, "%s: bad clientid 0x%x",
2801 "XFS: xlog_recover_process_data: bad clientid"); 2802 __func__, ohead->oh_clientid);
2802 ASSERT(0); 2803 ASSERT(0);
2803 return (XFS_ERROR(EIO)); 2804 return (XFS_ERROR(EIO));
2804 } 2805 }
@@ -2811,8 +2812,8 @@ xlog_recover_process_data(
2811 be64_to_cpu(rhead->h_lsn)); 2812 be64_to_cpu(rhead->h_lsn));
2812 } else { 2813 } else {
2813 if (dp + be32_to_cpu(ohead->oh_len) > lp) { 2814 if (dp + be32_to_cpu(ohead->oh_len) > lp) {
2814 xlog_warn( 2815 xfs_warn(log->l_mp, "%s: bad length 0x%x",
2815 "XFS: xlog_recover_process_data: bad length"); 2816 __func__, be32_to_cpu(ohead->oh_len));
2816 WARN_ON(1); 2817 WARN_ON(1);
2817 return (XFS_ERROR(EIO)); 2818 return (XFS_ERROR(EIO));
2818 } 2819 }
@@ -2825,7 +2826,7 @@ xlog_recover_process_data(
2825 trans, pass); 2826 trans, pass);
2826 break; 2827 break;
2827 case XLOG_UNMOUNT_TRANS: 2828 case XLOG_UNMOUNT_TRANS:
2828 error = xlog_recover_unmount_trans(trans); 2829 error = xlog_recover_unmount_trans(log, trans);
2829 break; 2830 break;
2830 case XLOG_WAS_CONT_TRANS: 2831 case XLOG_WAS_CONT_TRANS:
2831 error = xlog_recover_add_to_cont_trans(log, 2832 error = xlog_recover_add_to_cont_trans(log,
@@ -2833,8 +2834,8 @@ xlog_recover_process_data(
2833 be32_to_cpu(ohead->oh_len)); 2834 be32_to_cpu(ohead->oh_len));
2834 break; 2835 break;
2835 case XLOG_START_TRANS: 2836 case XLOG_START_TRANS:
2836 xlog_warn( 2837 xfs_warn(log->l_mp, "%s: bad transaction",
2837 "XFS: xlog_recover_process_data: bad transaction"); 2838 __func__);
2838 ASSERT(0); 2839 ASSERT(0);
2839 error = XFS_ERROR(EIO); 2840 error = XFS_ERROR(EIO);
2840 break; 2841 break;
@@ -2844,8 +2845,8 @@ xlog_recover_process_data(
2844 dp, be32_to_cpu(ohead->oh_len)); 2845 dp, be32_to_cpu(ohead->oh_len));
2845 break; 2846 break;
2846 default: 2847 default:
2847 xlog_warn( 2848 xfs_warn(log->l_mp, "%s: bad flag 0x%x",
2848 "XFS: xlog_recover_process_data: bad flag"); 2849 __func__, flags);
2849 ASSERT(0); 2850 ASSERT(0);
2850 error = XFS_ERROR(EIO); 2851 error = XFS_ERROR(EIO);
2851 break; 2852 break;
@@ -3030,8 +3031,7 @@ xlog_recover_clear_agi_bucket(
3030out_abort: 3031out_abort:
3031 xfs_trans_cancel(tp, XFS_TRANS_ABORT); 3032 xfs_trans_cancel(tp, XFS_TRANS_ABORT);
3032out_error: 3033out_error:
3033 xfs_fs_cmn_err(CE_WARN, mp, "xlog_recover_clear_agi_bucket: " 3034 xfs_warn(mp, "%s: failed to clear agi %d. Continuing.", __func__, agno);
3034 "failed to clear agi %d. Continuing.", agno);
3035 return; 3035 return;
3036} 3036}
3037 3037
@@ -3282,7 +3282,7 @@ xlog_valid_rec_header(
3282 if (unlikely( 3282 if (unlikely(
3283 (!rhead->h_version || 3283 (!rhead->h_version ||
3284 (be32_to_cpu(rhead->h_version) & (~XLOG_VERSION_OKBITS))))) { 3284 (be32_to_cpu(rhead->h_version) & (~XLOG_VERSION_OKBITS))))) {
3285 xlog_warn("XFS: %s: unrecognised log version (%d).", 3285 xfs_warn(log->l_mp, "%s: unrecognised log version (%d).",
3286 __func__, be32_to_cpu(rhead->h_version)); 3286 __func__, be32_to_cpu(rhead->h_version));
3287 return XFS_ERROR(EIO); 3287 return XFS_ERROR(EIO);
3288 } 3288 }
@@ -3740,10 +3740,9 @@ xlog_recover(
3740 return error; 3740 return error;
3741 } 3741 }
3742 3742
3743 cmn_err(CE_NOTE, 3743 xfs_notice(log->l_mp, "Starting recovery (logdev: %s)",
3744 "Starting XFS recovery on filesystem: %s (logdev: %s)", 3744 log->l_mp->m_logname ? log->l_mp->m_logname
3745 log->l_mp->m_fsname, log->l_mp->m_logname ? 3745 : "internal");
3746 log->l_mp->m_logname : "internal");
3747 3746
3748 error = xlog_do_recover(log, head_blk, tail_blk); 3747 error = xlog_do_recover(log, head_blk, tail_blk);
3749 log->l_flags |= XLOG_RECOVERY_NEEDED; 3748 log->l_flags |= XLOG_RECOVERY_NEEDED;
@@ -3776,9 +3775,7 @@ xlog_recover_finish(
3776 int error; 3775 int error;
3777 error = xlog_recover_process_efis(log); 3776 error = xlog_recover_process_efis(log);
3778 if (error) { 3777 if (error) {
3779 cmn_err(CE_ALERT, 3778 xfs_alert(log->l_mp, "Failed to recover EFIs");
3780 "Failed to recover EFIs on filesystem: %s",
3781 log->l_mp->m_fsname);
3782 return error; 3779 return error;
3783 } 3780 }
3784 /* 3781 /*
@@ -3793,15 +3790,12 @@ xlog_recover_finish(
3793 3790
3794 xlog_recover_check_summary(log); 3791 xlog_recover_check_summary(log);
3795 3792
3796 cmn_err(CE_NOTE, 3793 xfs_notice(log->l_mp, "Ending recovery (logdev: %s)",
3797 "Ending XFS recovery on filesystem: %s (logdev: %s)", 3794 log->l_mp->m_logname ? log->l_mp->m_logname
3798 log->l_mp->m_fsname, log->l_mp->m_logname ? 3795 : "internal");
3799 log->l_mp->m_logname : "internal");
3800 log->l_flags &= ~XLOG_RECOVERY_NEEDED; 3796 log->l_flags &= ~XLOG_RECOVERY_NEEDED;
3801 } else { 3797 } else {
3802 cmn_err(CE_DEBUG, 3798 xfs_info(log->l_mp, "Ending clean mount");
3803 "Ending clean XFS mount for filesystem: %s\n",
3804 log->l_mp->m_fsname);
3805 } 3799 }
3806 return 0; 3800 return 0;
3807} 3801}
@@ -3834,10 +3828,8 @@ xlog_recover_check_summary(
3834 for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) { 3828 for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
3835 error = xfs_read_agf(mp, NULL, agno, 0, &agfbp); 3829 error = xfs_read_agf(mp, NULL, agno, 0, &agfbp);
3836 if (error) { 3830 if (error) {
3837 xfs_fs_cmn_err(CE_ALERT, mp, 3831 xfs_alert(mp, "%s agf read failed agno %d error %d",
3838 "xlog_recover_check_summary(agf)" 3832 __func__, agno, error);
3839 "agf read failed agno %d error %d",
3840 agno, error);
3841 } else { 3833 } else {
3842 agfp = XFS_BUF_TO_AGF(agfbp); 3834 agfp = XFS_BUF_TO_AGF(agfbp);
3843 freeblks += be32_to_cpu(agfp->agf_freeblks) + 3835 freeblks += be32_to_cpu(agfp->agf_freeblks) +
@@ -3846,7 +3838,10 @@ xlog_recover_check_summary(
3846 } 3838 }
3847 3839
3848 error = xfs_read_agi(mp, NULL, agno, &agibp); 3840 error = xfs_read_agi(mp, NULL, agno, &agibp);
3849 if (!error) { 3841 if (error) {
3842 xfs_alert(mp, "%s agi read failed agno %d error %d",
3843 __func__, agno, error);
3844 } else {
3850 struct xfs_agi *agi = XFS_BUF_TO_AGI(agibp); 3845 struct xfs_agi *agi = XFS_BUF_TO_AGI(agibp);
3851 3846
3852 itotal += be32_to_cpu(agi->agi_count); 3847 itotal += be32_to_cpu(agi->agi_count);
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index d447aef84bc3..bb3f9a7b24ed 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -133,9 +133,7 @@ xfs_uuid_mount(
133 return 0; 133 return 0;
134 134
135 if (uuid_is_nil(uuid)) { 135 if (uuid_is_nil(uuid)) {
136 cmn_err(CE_WARN, 136 xfs_warn(mp, "Filesystem has nil UUID - can't mount");
137 "XFS: Filesystem %s has nil UUID - can't mount",
138 mp->m_fsname);
139 return XFS_ERROR(EINVAL); 137 return XFS_ERROR(EINVAL);
140 } 138 }
141 139
@@ -163,8 +161,7 @@ xfs_uuid_mount(
163 161
164 out_duplicate: 162 out_duplicate:
165 mutex_unlock(&xfs_uuid_table_mutex); 163 mutex_unlock(&xfs_uuid_table_mutex);
166 cmn_err(CE_WARN, "XFS: Filesystem %s has duplicate UUID - can't mount", 164 xfs_warn(mp, "Filesystem has duplicate UUID - can't mount");
167 mp->m_fsname);
168 return XFS_ERROR(EINVAL); 165 return XFS_ERROR(EINVAL);
169} 166}
170 167
@@ -311,6 +308,8 @@ xfs_mount_validate_sb(
311 xfs_sb_t *sbp, 308 xfs_sb_t *sbp,
312 int flags) 309 int flags)
313{ 310{
311 int loud = !(flags & XFS_MFSI_QUIET);
312
314 /* 313 /*
315 * If the log device and data device have the 314 * If the log device and data device have the
316 * same device number, the log is internal. 315 * same device number, the log is internal.
@@ -319,28 +318,32 @@ xfs_mount_validate_sb(
319 * a volume filesystem in a non-volume manner. 318 * a volume filesystem in a non-volume manner.
320 */ 319 */
321 if (sbp->sb_magicnum != XFS_SB_MAGIC) { 320 if (sbp->sb_magicnum != XFS_SB_MAGIC) {
322 xfs_fs_mount_cmn_err(flags, "bad magic number"); 321 if (loud)
322 xfs_warn(mp, "bad magic number");
323 return XFS_ERROR(EWRONGFS); 323 return XFS_ERROR(EWRONGFS);
324 } 324 }
325 325
326 if (!xfs_sb_good_version(sbp)) { 326 if (!xfs_sb_good_version(sbp)) {
327 xfs_fs_mount_cmn_err(flags, "bad version"); 327 if (loud)
328 xfs_warn(mp, "bad version");
328 return XFS_ERROR(EWRONGFS); 329 return XFS_ERROR(EWRONGFS);
329 } 330 }
330 331
331 if (unlikely( 332 if (unlikely(
332 sbp->sb_logstart == 0 && mp->m_logdev_targp == mp->m_ddev_targp)) { 333 sbp->sb_logstart == 0 && mp->m_logdev_targp == mp->m_ddev_targp)) {
333 xfs_fs_mount_cmn_err(flags, 334 if (loud)
334 "filesystem is marked as having an external log; " 335 xfs_warn(mp,
335 "specify logdev on the\nmount command line."); 336 "filesystem is marked as having an external log; "
337 "specify logdev on the mount command line.");
336 return XFS_ERROR(EINVAL); 338 return XFS_ERROR(EINVAL);
337 } 339 }
338 340
339 if (unlikely( 341 if (unlikely(
340 sbp->sb_logstart != 0 && mp->m_logdev_targp != mp->m_ddev_targp)) { 342 sbp->sb_logstart != 0 && mp->m_logdev_targp != mp->m_ddev_targp)) {
341 xfs_fs_mount_cmn_err(flags, 343 if (loud)
342 "filesystem is marked as having an internal log; " 344 xfs_warn(mp,
343 "do not specify logdev on\nthe mount command line."); 345 "filesystem is marked as having an internal log; "
346 "do not specify logdev on the mount command line.");
344 return XFS_ERROR(EINVAL); 347 return XFS_ERROR(EINVAL);
345 } 348 }
346 349
@@ -369,7 +372,8 @@ xfs_mount_validate_sb(
369 (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE) || 372 (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE) ||
370 (sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE) || 373 (sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE) ||
371 (sbp->sb_imax_pct > 100 /* zero sb_imax_pct is valid */))) { 374 (sbp->sb_imax_pct > 100 /* zero sb_imax_pct is valid */))) {
372 xfs_fs_mount_cmn_err(flags, "SB sanity check 1 failed"); 375 if (loud)
376 xfs_warn(mp, "SB sanity check 1 failed");
373 return XFS_ERROR(EFSCORRUPTED); 377 return XFS_ERROR(EFSCORRUPTED);
374 } 378 }
375 379
@@ -382,7 +386,8 @@ xfs_mount_validate_sb(
382 (xfs_drfsbno_t)sbp->sb_agcount * sbp->sb_agblocks || 386 (xfs_drfsbno_t)sbp->sb_agcount * sbp->sb_agblocks ||
383 sbp->sb_dblocks < (xfs_drfsbno_t)(sbp->sb_agcount - 1) * 387 sbp->sb_dblocks < (xfs_drfsbno_t)(sbp->sb_agcount - 1) *
384 sbp->sb_agblocks + XFS_MIN_AG_BLOCKS)) { 388 sbp->sb_agblocks + XFS_MIN_AG_BLOCKS)) {
385 xfs_fs_mount_cmn_err(flags, "SB sanity check 2 failed"); 389 if (loud)
390 xfs_warn(mp, "SB sanity check 2 failed");
386 return XFS_ERROR(EFSCORRUPTED); 391 return XFS_ERROR(EFSCORRUPTED);
387 } 392 }
388 393
@@ -390,12 +395,12 @@ xfs_mount_validate_sb(
390 * Until this is fixed only page-sized or smaller data blocks work. 395 * Until this is fixed only page-sized or smaller data blocks work.
391 */ 396 */
392 if (unlikely(sbp->sb_blocksize > PAGE_SIZE)) { 397 if (unlikely(sbp->sb_blocksize > PAGE_SIZE)) {
393 xfs_fs_mount_cmn_err(flags, 398 if (loud) {
394 "file system with blocksize %d bytes", 399 xfs_warn(mp,
395 sbp->sb_blocksize); 400 "File system with blocksize %d bytes. "
396 xfs_fs_mount_cmn_err(flags, 401 "Only pagesize (%ld) or less will currently work.",
397 "only pagesize (%ld) or less will currently work.", 402 sbp->sb_blocksize, PAGE_SIZE);
398 PAGE_SIZE); 403 }
399 return XFS_ERROR(ENOSYS); 404 return XFS_ERROR(ENOSYS);
400 } 405 }
401 406
@@ -409,21 +414,23 @@ xfs_mount_validate_sb(
409 case 2048: 414 case 2048:
410 break; 415 break;
411 default: 416 default:
412 xfs_fs_mount_cmn_err(flags, 417 if (loud)
413 "inode size of %d bytes not supported", 418 xfs_warn(mp, "inode size of %d bytes not supported",
414 sbp->sb_inodesize); 419 sbp->sb_inodesize);
415 return XFS_ERROR(ENOSYS); 420 return XFS_ERROR(ENOSYS);
416 } 421 }
417 422
418 if (xfs_sb_validate_fsb_count(sbp, sbp->sb_dblocks) || 423 if (xfs_sb_validate_fsb_count(sbp, sbp->sb_dblocks) ||
419 xfs_sb_validate_fsb_count(sbp, sbp->sb_rblocks)) { 424 xfs_sb_validate_fsb_count(sbp, sbp->sb_rblocks)) {
420 xfs_fs_mount_cmn_err(flags, 425 if (loud)
421 "file system too large to be mounted on this system."); 426 xfs_warn(mp,
427 "file system too large to be mounted on this system.");
422 return XFS_ERROR(EFBIG); 428 return XFS_ERROR(EFBIG);
423 } 429 }
424 430
425 if (unlikely(sbp->sb_inprogress)) { 431 if (unlikely(sbp->sb_inprogress)) {
426 xfs_fs_mount_cmn_err(flags, "file system busy"); 432 if (loud)
433 xfs_warn(mp, "file system busy");
427 return XFS_ERROR(EFSCORRUPTED); 434 return XFS_ERROR(EFSCORRUPTED);
428 } 435 }
429 436
@@ -431,8 +438,9 @@ xfs_mount_validate_sb(
431 * Version 1 directory format has never worked on Linux. 438 * Version 1 directory format has never worked on Linux.
432 */ 439 */
433 if (unlikely(!xfs_sb_version_hasdirv2(sbp))) { 440 if (unlikely(!xfs_sb_version_hasdirv2(sbp))) {
434 xfs_fs_mount_cmn_err(flags, 441 if (loud)
435 "file system using version 1 directory format"); 442 xfs_warn(mp,
443 "file system using version 1 directory format");
436 return XFS_ERROR(ENOSYS); 444 return XFS_ERROR(ENOSYS);
437 } 445 }
438 446
@@ -673,6 +681,7 @@ xfs_readsb(xfs_mount_t *mp, int flags)
673 unsigned int sector_size; 681 unsigned int sector_size;
674 xfs_buf_t *bp; 682 xfs_buf_t *bp;
675 int error; 683 int error;
684 int loud = !(flags & XFS_MFSI_QUIET);
676 685
677 ASSERT(mp->m_sb_bp == NULL); 686 ASSERT(mp->m_sb_bp == NULL);
678 ASSERT(mp->m_ddev_targp != NULL); 687 ASSERT(mp->m_ddev_targp != NULL);
@@ -688,7 +697,8 @@ reread:
688 bp = xfs_buf_read_uncached(mp, mp->m_ddev_targp, 697 bp = xfs_buf_read_uncached(mp, mp->m_ddev_targp,
689 XFS_SB_DADDR, sector_size, 0); 698 XFS_SB_DADDR, sector_size, 0);
690 if (!bp) { 699 if (!bp) {
691 xfs_fs_mount_cmn_err(flags, "SB buffer read failed"); 700 if (loud)
701 xfs_warn(mp, "SB buffer read failed");
692 return EIO; 702 return EIO;
693 } 703 }
694 704
@@ -699,7 +709,8 @@ reread:
699 xfs_sb_from_disk(&mp->m_sb, XFS_BUF_TO_SBP(bp)); 709 xfs_sb_from_disk(&mp->m_sb, XFS_BUF_TO_SBP(bp));
700 error = xfs_mount_validate_sb(mp, &(mp->m_sb), flags); 710 error = xfs_mount_validate_sb(mp, &(mp->m_sb), flags);
701 if (error) { 711 if (error) {
702 xfs_fs_mount_cmn_err(flags, "SB validate failed"); 712 if (loud)
713 xfs_warn(mp, "SB validate failed");
703 goto release_buf; 714 goto release_buf;
704 } 715 }
705 716
@@ -707,9 +718,9 @@ reread:
707 * We must be able to do sector-sized and sector-aligned IO. 718 * We must be able to do sector-sized and sector-aligned IO.
708 */ 719 */
709 if (sector_size > mp->m_sb.sb_sectsize) { 720 if (sector_size > mp->m_sb.sb_sectsize) {
710 xfs_fs_mount_cmn_err(flags, 721 if (loud)
711 "device supports only %u byte sectors (not %u)", 722 xfs_warn(mp, "device supports %u byte sectors (not %u)",
712 sector_size, mp->m_sb.sb_sectsize); 723 sector_size, mp->m_sb.sb_sectsize);
713 error = ENOSYS; 724 error = ENOSYS;
714 goto release_buf; 725 goto release_buf;
715 } 726 }
@@ -853,8 +864,7 @@ xfs_update_alignment(xfs_mount_t *mp)
853 if ((BBTOB(mp->m_dalign) & mp->m_blockmask) || 864 if ((BBTOB(mp->m_dalign) & mp->m_blockmask) ||
854 (BBTOB(mp->m_swidth) & mp->m_blockmask)) { 865 (BBTOB(mp->m_swidth) & mp->m_blockmask)) {
855 if (mp->m_flags & XFS_MOUNT_RETERR) { 866 if (mp->m_flags & XFS_MOUNT_RETERR) {
856 cmn_err(CE_WARN, 867 xfs_warn(mp, "alignment check 1 failed");
857 "XFS: alignment check 1 failed");
858 return XFS_ERROR(EINVAL); 868 return XFS_ERROR(EINVAL);
859 } 869 }
860 mp->m_dalign = mp->m_swidth = 0; 870 mp->m_dalign = mp->m_swidth = 0;
@@ -867,8 +877,9 @@ xfs_update_alignment(xfs_mount_t *mp)
867 if (mp->m_flags & XFS_MOUNT_RETERR) { 877 if (mp->m_flags & XFS_MOUNT_RETERR) {
868 return XFS_ERROR(EINVAL); 878 return XFS_ERROR(EINVAL);
869 } 879 }
870 xfs_fs_cmn_err(CE_WARN, mp, 880 xfs_warn(mp,
871"stripe alignment turned off: sunit(%d)/swidth(%d) incompatible with agsize(%d)", 881 "stripe alignment turned off: sunit(%d)/swidth(%d) "
882 "incompatible with agsize(%d)",
872 mp->m_dalign, mp->m_swidth, 883 mp->m_dalign, mp->m_swidth,
873 sbp->sb_agblocks); 884 sbp->sb_agblocks);
874 885
@@ -878,9 +889,9 @@ xfs_update_alignment(xfs_mount_t *mp)
878 mp->m_swidth = XFS_BB_TO_FSBT(mp, mp->m_swidth); 889 mp->m_swidth = XFS_BB_TO_FSBT(mp, mp->m_swidth);
879 } else { 890 } else {
880 if (mp->m_flags & XFS_MOUNT_RETERR) { 891 if (mp->m_flags & XFS_MOUNT_RETERR) {
881 xfs_fs_cmn_err(CE_WARN, mp, 892 xfs_warn(mp,
882"stripe alignment turned off: sunit(%d) less than bsize(%d)", 893 "stripe alignment turned off: sunit(%d) less than bsize(%d)",
883 mp->m_dalign, 894 mp->m_dalign,
884 mp->m_blockmask +1); 895 mp->m_blockmask +1);
885 return XFS_ERROR(EINVAL); 896 return XFS_ERROR(EINVAL);
886 } 897 }
@@ -1026,14 +1037,14 @@ xfs_check_sizes(xfs_mount_t *mp)
1026 1037
1027 d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks); 1038 d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks);
1028 if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_dblocks) { 1039 if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_dblocks) {
1029 cmn_err(CE_WARN, "XFS: filesystem size mismatch detected"); 1040 xfs_warn(mp, "filesystem size mismatch detected");
1030 return XFS_ERROR(EFBIG); 1041 return XFS_ERROR(EFBIG);
1031 } 1042 }
1032 bp = xfs_buf_read_uncached(mp, mp->m_ddev_targp, 1043 bp = xfs_buf_read_uncached(mp, mp->m_ddev_targp,
1033 d - XFS_FSS_TO_BB(mp, 1), 1044 d - XFS_FSS_TO_BB(mp, 1),
1034 BBTOB(XFS_FSS_TO_BB(mp, 1)), 0); 1045 BBTOB(XFS_FSS_TO_BB(mp, 1)), 0);
1035 if (!bp) { 1046 if (!bp) {
1036 cmn_err(CE_WARN, "XFS: last sector read failed"); 1047 xfs_warn(mp, "last sector read failed");
1037 return EIO; 1048 return EIO;
1038 } 1049 }
1039 xfs_buf_relse(bp); 1050 xfs_buf_relse(bp);
@@ -1041,14 +1052,14 @@ xfs_check_sizes(xfs_mount_t *mp)
1041 if (mp->m_logdev_targp != mp->m_ddev_targp) { 1052 if (mp->m_logdev_targp != mp->m_ddev_targp) {
1042 d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_logblocks); 1053 d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_logblocks);
1043 if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_logblocks) { 1054 if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_logblocks) {
1044 cmn_err(CE_WARN, "XFS: log size mismatch detected"); 1055 xfs_warn(mp, "log size mismatch detected");
1045 return XFS_ERROR(EFBIG); 1056 return XFS_ERROR(EFBIG);
1046 } 1057 }
1047 bp = xfs_buf_read_uncached(mp, mp->m_logdev_targp, 1058 bp = xfs_buf_read_uncached(mp, mp->m_logdev_targp,
1048 d - XFS_FSB_TO_BB(mp, 1), 1059 d - XFS_FSB_TO_BB(mp, 1),
1049 XFS_FSB_TO_B(mp, 1), 0); 1060 XFS_FSB_TO_B(mp, 1), 0);
1050 if (!bp) { 1061 if (!bp) {
1051 cmn_err(CE_WARN, "XFS: log device read failed"); 1062 xfs_warn(mp, "log device read failed");
1052 return EIO; 1063 return EIO;
1053 } 1064 }
1054 xfs_buf_relse(bp); 1065 xfs_buf_relse(bp);
@@ -1086,7 +1097,7 @@ xfs_mount_reset_sbqflags(
1086 return 0; 1097 return 0;
1087 1098
1088#ifdef QUOTADEBUG 1099#ifdef QUOTADEBUG
1089 xfs_fs_cmn_err(CE_NOTE, mp, "Writing superblock quota changes"); 1100 xfs_notice(mp, "Writing superblock quota changes");
1090#endif 1101#endif
1091 1102
1092 tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SBCHANGE); 1103 tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SBCHANGE);
@@ -1094,8 +1105,7 @@ xfs_mount_reset_sbqflags(
1094 XFS_DEFAULT_LOG_COUNT); 1105 XFS_DEFAULT_LOG_COUNT);
1095 if (error) { 1106 if (error) {
1096 xfs_trans_cancel(tp, 0); 1107 xfs_trans_cancel(tp, 0);
1097 xfs_fs_cmn_err(CE_ALERT, mp, 1108 xfs_alert(mp, "%s: Superblock update failed!", __func__);
1098 "xfs_mount_reset_sbqflags: Superblock update failed!");
1099 return error; 1109 return error;
1100 } 1110 }
1101 1111
@@ -1161,8 +1171,7 @@ xfs_mountfs(
1161 * transaction subsystem is online. 1171 * transaction subsystem is online.
1162 */ 1172 */
1163 if (xfs_sb_has_mismatched_features2(sbp)) { 1173 if (xfs_sb_has_mismatched_features2(sbp)) {
1164 cmn_err(CE_WARN, 1174 xfs_warn(mp, "correcting sb_features alignment problem");
1165 "XFS: correcting sb_features alignment problem");
1166 sbp->sb_features2 |= sbp->sb_bad_features2; 1175 sbp->sb_features2 |= sbp->sb_bad_features2;
1167 sbp->sb_bad_features2 = sbp->sb_features2; 1176 sbp->sb_bad_features2 = sbp->sb_features2;
1168 mp->m_update_flags |= XFS_SB_FEATURES2 | XFS_SB_BAD_FEATURES2; 1177 mp->m_update_flags |= XFS_SB_FEATURES2 | XFS_SB_BAD_FEATURES2;
@@ -1241,7 +1250,7 @@ xfs_mountfs(
1241 */ 1250 */
1242 error = xfs_rtmount_init(mp); 1251 error = xfs_rtmount_init(mp);
1243 if (error) { 1252 if (error) {
1244 cmn_err(CE_WARN, "XFS: RT mount failed"); 1253 xfs_warn(mp, "RT mount failed");
1245 goto out_remove_uuid; 1254 goto out_remove_uuid;
1246 } 1255 }
1247 1256
@@ -1272,12 +1281,12 @@ xfs_mountfs(
1272 INIT_RADIX_TREE(&mp->m_perag_tree, GFP_ATOMIC); 1281 INIT_RADIX_TREE(&mp->m_perag_tree, GFP_ATOMIC);
1273 error = xfs_initialize_perag(mp, sbp->sb_agcount, &mp->m_maxagi); 1282 error = xfs_initialize_perag(mp, sbp->sb_agcount, &mp->m_maxagi);
1274 if (error) { 1283 if (error) {
1275 cmn_err(CE_WARN, "XFS: Failed per-ag init: %d", error); 1284 xfs_warn(mp, "Failed per-ag init: %d", error);
1276 goto out_remove_uuid; 1285 goto out_remove_uuid;
1277 } 1286 }
1278 1287
1279 if (!sbp->sb_logblocks) { 1288 if (!sbp->sb_logblocks) {
1280 cmn_err(CE_WARN, "XFS: no log defined"); 1289 xfs_warn(mp, "no log defined");
1281 XFS_ERROR_REPORT("xfs_mountfs", XFS_ERRLEVEL_LOW, mp); 1290 XFS_ERROR_REPORT("xfs_mountfs", XFS_ERRLEVEL_LOW, mp);
1282 error = XFS_ERROR(EFSCORRUPTED); 1291 error = XFS_ERROR(EFSCORRUPTED);
1283 goto out_free_perag; 1292 goto out_free_perag;
@@ -1290,7 +1299,7 @@ xfs_mountfs(
1290 XFS_FSB_TO_DADDR(mp, sbp->sb_logstart), 1299 XFS_FSB_TO_DADDR(mp, sbp->sb_logstart),
1291 XFS_FSB_TO_BB(mp, sbp->sb_logblocks)); 1300 XFS_FSB_TO_BB(mp, sbp->sb_logblocks));
1292 if (error) { 1301 if (error) {
1293 cmn_err(CE_WARN, "XFS: log mount failed"); 1302 xfs_warn(mp, "log mount failed");
1294 goto out_free_perag; 1303 goto out_free_perag;
1295 } 1304 }
1296 1305
@@ -1327,16 +1336,14 @@ xfs_mountfs(
1327 */ 1336 */
1328 error = xfs_iget(mp, NULL, sbp->sb_rootino, 0, XFS_ILOCK_EXCL, &rip); 1337 error = xfs_iget(mp, NULL, sbp->sb_rootino, 0, XFS_ILOCK_EXCL, &rip);
1329 if (error) { 1338 if (error) {
1330 cmn_err(CE_WARN, "XFS: failed to read root inode"); 1339 xfs_warn(mp, "failed to read root inode");
1331 goto out_log_dealloc; 1340 goto out_log_dealloc;
1332 } 1341 }
1333 1342
1334 ASSERT(rip != NULL); 1343 ASSERT(rip != NULL);
1335 1344
1336 if (unlikely((rip->i_d.di_mode & S_IFMT) != S_IFDIR)) { 1345 if (unlikely((rip->i_d.di_mode & S_IFMT) != S_IFDIR)) {
1337 cmn_err(CE_WARN, "XFS: corrupted root inode"); 1346 xfs_warn(mp, "corrupted root inode %llu: not a directory",
1338 cmn_err(CE_WARN, "Device %s - root %llu is not a directory",
1339 XFS_BUFTARG_NAME(mp->m_ddev_targp),
1340 (unsigned long long)rip->i_ino); 1347 (unsigned long long)rip->i_ino);
1341 xfs_iunlock(rip, XFS_ILOCK_EXCL); 1348 xfs_iunlock(rip, XFS_ILOCK_EXCL);
1342 XFS_ERROR_REPORT("xfs_mountfs_int(2)", XFS_ERRLEVEL_LOW, 1349 XFS_ERROR_REPORT("xfs_mountfs_int(2)", XFS_ERRLEVEL_LOW,
@@ -1356,7 +1363,7 @@ xfs_mountfs(
1356 /* 1363 /*
1357 * Free up the root inode. 1364 * Free up the root inode.
1358 */ 1365 */
1359 cmn_err(CE_WARN, "XFS: failed to read RT inodes"); 1366 xfs_warn(mp, "failed to read RT inodes");
1360 goto out_rele_rip; 1367 goto out_rele_rip;
1361 } 1368 }
1362 1369
@@ -1368,7 +1375,7 @@ xfs_mountfs(
1368 if (mp->m_update_flags && !(mp->m_flags & XFS_MOUNT_RDONLY)) { 1375 if (mp->m_update_flags && !(mp->m_flags & XFS_MOUNT_RDONLY)) {
1369 error = xfs_mount_log_sb(mp, mp->m_update_flags); 1376 error = xfs_mount_log_sb(mp, mp->m_update_flags);
1370 if (error) { 1377 if (error) {
1371 cmn_err(CE_WARN, "XFS: failed to write sb changes"); 1378 xfs_warn(mp, "failed to write sb changes");
1372 goto out_rtunmount; 1379 goto out_rtunmount;
1373 } 1380 }
1374 } 1381 }
@@ -1389,10 +1396,7 @@ xfs_mountfs(
1389 * quotachecked license. 1396 * quotachecked license.
1390 */ 1397 */
1391 if (mp->m_sb.sb_qflags & XFS_ALL_QUOTA_ACCT) { 1398 if (mp->m_sb.sb_qflags & XFS_ALL_QUOTA_ACCT) {
1392 cmn_err(CE_NOTE, 1399 xfs_notice(mp, "resetting quota flags");
1393 "XFS: resetting qflags for filesystem %s",
1394 mp->m_fsname);
1395
1396 error = xfs_mount_reset_sbqflags(mp); 1400 error = xfs_mount_reset_sbqflags(mp);
1397 if (error) 1401 if (error)
1398 return error; 1402 return error;
@@ -1406,7 +1410,7 @@ xfs_mountfs(
1406 */ 1410 */
1407 error = xfs_log_mount_finish(mp); 1411 error = xfs_log_mount_finish(mp);
1408 if (error) { 1412 if (error) {
1409 cmn_err(CE_WARN, "XFS: log mount finish failed"); 1413 xfs_warn(mp, "log mount finish failed");
1410 goto out_rtunmount; 1414 goto out_rtunmount;
1411 } 1415 }
1412 1416
@@ -1435,8 +1439,8 @@ xfs_mountfs(
1435 resblks = xfs_default_resblks(mp); 1439 resblks = xfs_default_resblks(mp);
1436 error = xfs_reserve_blocks(mp, &resblks, NULL); 1440 error = xfs_reserve_blocks(mp, &resblks, NULL);
1437 if (error) 1441 if (error)
1438 cmn_err(CE_WARN, "XFS: Unable to allocate reserve " 1442 xfs_warn(mp,
1439 "blocks. Continuing without a reserve pool."); 1443 "Unable to allocate reserve blocks. Continuing without reserve pool.");
1440 } 1444 }
1441 1445
1442 return 0; 1446 return 0;
@@ -1525,12 +1529,12 @@ xfs_unmountfs(
1525 resblks = 0; 1529 resblks = 0;
1526 error = xfs_reserve_blocks(mp, &resblks, NULL); 1530 error = xfs_reserve_blocks(mp, &resblks, NULL);
1527 if (error) 1531 if (error)
1528 cmn_err(CE_WARN, "XFS: Unable to free reserved block pool. " 1532 xfs_warn(mp, "Unable to free reserved block pool. "
1529 "Freespace may not be correct on next mount."); 1533 "Freespace may not be correct on next mount.");
1530 1534
1531 error = xfs_log_sbcount(mp, 1); 1535 error = xfs_log_sbcount(mp, 1);
1532 if (error) 1536 if (error)
1533 cmn_err(CE_WARN, "XFS: Unable to update superblock counters. " 1537 xfs_warn(mp, "Unable to update superblock counters. "
1534 "Freespace may not be correct on next mount."); 1538 "Freespace may not be correct on next mount.");
1535 xfs_unmountfs_writesb(mp); 1539 xfs_unmountfs_writesb(mp);
1536 xfs_unmountfs_wait(mp); /* wait for async bufs */ 1540 xfs_unmountfs_wait(mp); /* wait for async bufs */
@@ -2013,10 +2017,8 @@ xfs_dev_is_read_only(
2013 if (xfs_readonly_buftarg(mp->m_ddev_targp) || 2017 if (xfs_readonly_buftarg(mp->m_ddev_targp) ||
2014 xfs_readonly_buftarg(mp->m_logdev_targp) || 2018 xfs_readonly_buftarg(mp->m_logdev_targp) ||
2015 (mp->m_rtdev_targp && xfs_readonly_buftarg(mp->m_rtdev_targp))) { 2019 (mp->m_rtdev_targp && xfs_readonly_buftarg(mp->m_rtdev_targp))) {
2016 cmn_err(CE_NOTE, 2020 xfs_notice(mp, "%s required on read-only device.", message);
2017 "XFS: %s required on read-only device.", message); 2021 xfs_notice(mp, "write access unavailable, cannot proceed.");
2018 cmn_err(CE_NOTE,
2019 "XFS: write access unavailable, cannot proceed.");
2020 return EROFS; 2022 return EROFS;
2021 } 2023 }
2022 return 0; 2024 return 0;
diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c
index edfa178bafb6..4aff56395732 100644
--- a/fs/xfs/xfs_mru_cache.c
+++ b/fs/xfs/xfs_mru_cache.c
@@ -309,7 +309,7 @@ xfs_mru_cache_init(void)
309 if (!xfs_mru_elem_zone) 309 if (!xfs_mru_elem_zone)
310 goto out; 310 goto out;
311 311
312 xfs_mru_reap_wq = create_singlethread_workqueue("xfs_mru_cache"); 312 xfs_mru_reap_wq = alloc_workqueue("xfs_mru_cache", WQ_MEM_RECLAIM, 1);
313 if (!xfs_mru_reap_wq) 313 if (!xfs_mru_reap_wq)
314 goto out_destroy_mru_elem_zone; 314 goto out_destroy_mru_elem_zone;
315 315
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
index 9bb6eda4cd21..a595f29567fe 100644
--- a/fs/xfs/xfs_quota.h
+++ b/fs/xfs/xfs_quota.h
@@ -382,7 +382,8 @@ static inline int xfs_qm_sync(struct xfs_mount *mp, int flags)
382 xfs_trans_reserve_quota_bydquots(tp, mp, ud, gd, nb, ni, \ 382 xfs_trans_reserve_quota_bydquots(tp, mp, ud, gd, nb, ni, \
383 f | XFS_QMOPT_RES_REGBLKS) 383 f | XFS_QMOPT_RES_REGBLKS)
384 384
385extern int xfs_qm_dqcheck(xfs_disk_dquot_t *, xfs_dqid_t, uint, uint, char *); 385extern int xfs_qm_dqcheck(struct xfs_mount *, xfs_disk_dquot_t *,
386 xfs_dqid_t, uint, uint, char *);
386extern int xfs_mount_reset_sbqflags(struct xfs_mount *); 387extern int xfs_mount_reset_sbqflags(struct xfs_mount *);
387 388
388#endif /* __KERNEL__ */ 389#endif /* __KERNEL__ */
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 12a191385310..8f76fdff4f46 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -76,7 +76,7 @@ xfs_growfs_rt_alloc(
76 xfs_mount_t *mp, /* file system mount point */ 76 xfs_mount_t *mp, /* file system mount point */
77 xfs_extlen_t oblocks, /* old count of blocks */ 77 xfs_extlen_t oblocks, /* old count of blocks */
78 xfs_extlen_t nblocks, /* new count of blocks */ 78 xfs_extlen_t nblocks, /* new count of blocks */
79 xfs_ino_t ino) /* inode number (bitmap/summary) */ 79 xfs_inode_t *ip) /* inode (bitmap/summary) */
80{ 80{
81 xfs_fileoff_t bno; /* block number in file */ 81 xfs_fileoff_t bno; /* block number in file */
82 xfs_buf_t *bp; /* temporary buffer for zeroing */ 82 xfs_buf_t *bp; /* temporary buffer for zeroing */
@@ -86,7 +86,6 @@ xfs_growfs_rt_alloc(
86 xfs_fsblock_t firstblock; /* first block allocated in xaction */ 86 xfs_fsblock_t firstblock; /* first block allocated in xaction */
87 xfs_bmap_free_t flist; /* list of freed blocks */ 87 xfs_bmap_free_t flist; /* list of freed blocks */
88 xfs_fsblock_t fsbno; /* filesystem block for bno */ 88 xfs_fsblock_t fsbno; /* filesystem block for bno */
89 xfs_inode_t *ip; /* pointer to incore inode */
90 xfs_bmbt_irec_t map; /* block map output */ 89 xfs_bmbt_irec_t map; /* block map output */
91 int nmap; /* number of block maps */ 90 int nmap; /* number of block maps */
92 int resblks; /* space reservation */ 91 int resblks; /* space reservation */
@@ -112,9 +111,9 @@ xfs_growfs_rt_alloc(
112 /* 111 /*
113 * Lock the inode. 112 * Lock the inode.
114 */ 113 */
115 if ((error = xfs_trans_iget(mp, tp, ino, 0, 114 xfs_ilock(ip, XFS_ILOCK_EXCL);
116 XFS_ILOCK_EXCL, &ip))) 115 xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL);
117 goto error_cancel; 116
118 xfs_bmap_init(&flist, &firstblock); 117 xfs_bmap_init(&flist, &firstblock);
119 /* 118 /*
120 * Allocate blocks to the bitmap file. 119 * Allocate blocks to the bitmap file.
@@ -155,9 +154,8 @@ xfs_growfs_rt_alloc(
155 /* 154 /*
156 * Lock the bitmap inode. 155 * Lock the bitmap inode.
157 */ 156 */
158 if ((error = xfs_trans_iget(mp, tp, ino, 0, 157 xfs_ilock(ip, XFS_ILOCK_EXCL);
159 XFS_ILOCK_EXCL, &ip))) 158 xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL);
160 goto error_cancel;
161 /* 159 /*
162 * Get a buffer for the block. 160 * Get a buffer for the block.
163 */ 161 */
@@ -1854,7 +1852,6 @@ xfs_growfs_rt(
1854 xfs_rtblock_t bmbno; /* bitmap block number */ 1852 xfs_rtblock_t bmbno; /* bitmap block number */
1855 xfs_buf_t *bp; /* temporary buffer */ 1853 xfs_buf_t *bp; /* temporary buffer */
1856 int error; /* error return value */ 1854 int error; /* error return value */
1857 xfs_inode_t *ip; /* bitmap inode, used as lock */
1858 xfs_mount_t *nmp; /* new (fake) mount structure */ 1855 xfs_mount_t *nmp; /* new (fake) mount structure */
1859 xfs_drfsbno_t nrblocks; /* new number of realtime blocks */ 1856 xfs_drfsbno_t nrblocks; /* new number of realtime blocks */
1860 xfs_extlen_t nrbmblocks; /* new number of rt bitmap blocks */ 1857 xfs_extlen_t nrbmblocks; /* new number of rt bitmap blocks */
@@ -1918,11 +1915,11 @@ xfs_growfs_rt(
1918 /* 1915 /*
1919 * Allocate space to the bitmap and summary files, as necessary. 1916 * Allocate space to the bitmap and summary files, as necessary.
1920 */ 1917 */
1921 if ((error = xfs_growfs_rt_alloc(mp, rbmblocks, nrbmblocks, 1918 error = xfs_growfs_rt_alloc(mp, rbmblocks, nrbmblocks, mp->m_rbmip);
1922 mp->m_sb.sb_rbmino))) 1919 if (error)
1923 return error; 1920 return error;
1924 if ((error = xfs_growfs_rt_alloc(mp, rsumblocks, nrsumblocks, 1921 error = xfs_growfs_rt_alloc(mp, rsumblocks, nrsumblocks, mp->m_rsumip);
1925 mp->m_sb.sb_rsumino))) 1922 if (error)
1926 return error; 1923 return error;
1927 /* 1924 /*
1928 * Allocate a new (fake) mount/sb. 1925 * Allocate a new (fake) mount/sb.
@@ -1972,10 +1969,8 @@ xfs_growfs_rt(
1972 /* 1969 /*
1973 * Lock out other callers by grabbing the bitmap inode lock. 1970 * Lock out other callers by grabbing the bitmap inode lock.
1974 */ 1971 */
1975 if ((error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rbmino, 0, 1972 xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL);
1976 XFS_ILOCK_EXCL, &ip))) 1973 xfs_trans_ijoin_ref(tp, mp->m_rbmip, XFS_ILOCK_EXCL);
1977 goto error_cancel;
1978 ASSERT(ip == mp->m_rbmip);
1979 /* 1974 /*
1980 * Update the bitmap inode's size. 1975 * Update the bitmap inode's size.
1981 */ 1976 */
@@ -1986,10 +1981,8 @@ xfs_growfs_rt(
1986 /* 1981 /*
1987 * Get the summary inode into the transaction. 1982 * Get the summary inode into the transaction.
1988 */ 1983 */
1989 if ((error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rsumino, 0, 1984 xfs_ilock(mp->m_rsumip, XFS_ILOCK_EXCL);
1990 XFS_ILOCK_EXCL, &ip))) 1985 xfs_trans_ijoin_ref(tp, mp->m_rsumip, XFS_ILOCK_EXCL);
1991 goto error_cancel;
1992 ASSERT(ip == mp->m_rsumip);
1993 /* 1986 /*
1994 * Update the summary inode's size. 1987 * Update the summary inode's size.
1995 */ 1988 */
@@ -2075,15 +2068,15 @@ xfs_rtallocate_extent(
2075 xfs_extlen_t prod, /* extent product factor */ 2068 xfs_extlen_t prod, /* extent product factor */
2076 xfs_rtblock_t *rtblock) /* out: start block allocated */ 2069 xfs_rtblock_t *rtblock) /* out: start block allocated */
2077{ 2070{
2071 xfs_mount_t *mp = tp->t_mountp;
2078 int error; /* error value */ 2072 int error; /* error value */
2079 xfs_inode_t *ip; /* inode for bitmap file */
2080 xfs_mount_t *mp; /* file system mount structure */
2081 xfs_rtblock_t r; /* result allocated block */ 2073 xfs_rtblock_t r; /* result allocated block */
2082 xfs_fsblock_t sb; /* summary file block number */ 2074 xfs_fsblock_t sb; /* summary file block number */
2083 xfs_buf_t *sumbp; /* summary file block buffer */ 2075 xfs_buf_t *sumbp; /* summary file block buffer */
2084 2076
2077 ASSERT(xfs_isilocked(mp->m_rbmip, XFS_ILOCK_EXCL));
2085 ASSERT(minlen > 0 && minlen <= maxlen); 2078 ASSERT(minlen > 0 && minlen <= maxlen);
2086 mp = tp->t_mountp; 2079
2087 /* 2080 /*
2088 * If prod is set then figure out what to do to minlen and maxlen. 2081 * If prod is set then figure out what to do to minlen and maxlen.
2089 */ 2082 */
@@ -2099,12 +2092,7 @@ xfs_rtallocate_extent(
2099 return 0; 2092 return 0;
2100 } 2093 }
2101 } 2094 }
2102 /* 2095
2103 * Lock out other callers by grabbing the bitmap inode lock.
2104 */
2105 if ((error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rbmino, 0,
2106 XFS_ILOCK_EXCL, &ip)))
2107 return error;
2108 sumbp = NULL; 2096 sumbp = NULL;
2109 /* 2097 /*
2110 * Allocate by size, or near another block, or exactly at some block. 2098 * Allocate by size, or near another block, or exactly at some block.
@@ -2123,11 +2111,12 @@ xfs_rtallocate_extent(
2123 len, &sumbp, &sb, prod, &r); 2111 len, &sumbp, &sb, prod, &r);
2124 break; 2112 break;
2125 default: 2113 default:
2114 error = EIO;
2126 ASSERT(0); 2115 ASSERT(0);
2127 } 2116 }
2128 if (error) { 2117 if (error)
2129 return error; 2118 return error;
2130 } 2119
2131 /* 2120 /*
2132 * If it worked, update the superblock. 2121 * If it worked, update the superblock.
2133 */ 2122 */
@@ -2155,7 +2144,6 @@ xfs_rtfree_extent(
2155 xfs_extlen_t len) /* length of extent freed */ 2144 xfs_extlen_t len) /* length of extent freed */
2156{ 2145{
2157 int error; /* error value */ 2146 int error; /* error value */
2158 xfs_inode_t *ip; /* bitmap file inode */
2159 xfs_mount_t *mp; /* file system mount structure */ 2147 xfs_mount_t *mp; /* file system mount structure */
2160 xfs_fsblock_t sb; /* summary file block number */ 2148 xfs_fsblock_t sb; /* summary file block number */
2161 xfs_buf_t *sumbp; /* summary file block buffer */ 2149 xfs_buf_t *sumbp; /* summary file block buffer */
@@ -2164,9 +2152,9 @@ xfs_rtfree_extent(
2164 /* 2152 /*
2165 * Synchronize by locking the bitmap inode. 2153 * Synchronize by locking the bitmap inode.
2166 */ 2154 */
2167 if ((error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rbmino, 0, 2155 xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL);
2168 XFS_ILOCK_EXCL, &ip))) 2156 xfs_trans_ijoin_ref(tp, mp->m_rbmip, XFS_ILOCK_EXCL);
2169 return error; 2157
2170#if defined(__KERNEL__) && defined(DEBUG) 2158#if defined(__KERNEL__) && defined(DEBUG)
2171 /* 2159 /*
2172 * Check to see that this whole range is currently allocated. 2160 * Check to see that this whole range is currently allocated.
@@ -2199,10 +2187,10 @@ xfs_rtfree_extent(
2199 */ 2187 */
2200 if (tp->t_frextents_delta + mp->m_sb.sb_frextents == 2188 if (tp->t_frextents_delta + mp->m_sb.sb_frextents ==
2201 mp->m_sb.sb_rextents) { 2189 mp->m_sb.sb_rextents) {
2202 if (!(ip->i_d.di_flags & XFS_DIFLAG_NEWRTBM)) 2190 if (!(mp->m_rbmip->i_d.di_flags & XFS_DIFLAG_NEWRTBM))
2203 ip->i_d.di_flags |= XFS_DIFLAG_NEWRTBM; 2191 mp->m_rbmip->i_d.di_flags |= XFS_DIFLAG_NEWRTBM;
2204 *(__uint64_t *)&ip->i_d.di_atime = 0; 2192 *(__uint64_t *)&mp->m_rbmip->i_d.di_atime = 0;
2205 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 2193 xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE);
2206 } 2194 }
2207 return 0; 2195 return 0;
2208} 2196}
@@ -2222,8 +2210,8 @@ xfs_rtmount_init(
2222 if (sbp->sb_rblocks == 0) 2210 if (sbp->sb_rblocks == 0)
2223 return 0; 2211 return 0;
2224 if (mp->m_rtdev_targp == NULL) { 2212 if (mp->m_rtdev_targp == NULL) {
2225 cmn_err(CE_WARN, 2213 xfs_warn(mp,
2226 "XFS: This filesystem has a realtime volume, use rtdev=device option"); 2214 "Filesystem has a realtime volume, use rtdev=device option");
2227 return XFS_ERROR(ENODEV); 2215 return XFS_ERROR(ENODEV);
2228 } 2216 }
2229 mp->m_rsumlevels = sbp->sb_rextslog + 1; 2217 mp->m_rsumlevels = sbp->sb_rextslog + 1;
@@ -2237,7 +2225,7 @@ xfs_rtmount_init(
2237 */ 2225 */
2238 d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_rblocks); 2226 d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_rblocks);
2239 if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_rblocks) { 2227 if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_rblocks) {
2240 cmn_err(CE_WARN, "XFS: realtime mount -- %llu != %llu", 2228 xfs_warn(mp, "realtime mount -- %llu != %llu",
2241 (unsigned long long) XFS_BB_TO_FSB(mp, d), 2229 (unsigned long long) XFS_BB_TO_FSB(mp, d),
2242 (unsigned long long) mp->m_sb.sb_rblocks); 2230 (unsigned long long) mp->m_sb.sb_rblocks);
2243 return XFS_ERROR(EFBIG); 2231 return XFS_ERROR(EFBIG);
@@ -2246,7 +2234,7 @@ xfs_rtmount_init(
2246 d - XFS_FSB_TO_BB(mp, 1), 2234 d - XFS_FSB_TO_BB(mp, 1),
2247 XFS_FSB_TO_B(mp, 1), 0); 2235 XFS_FSB_TO_B(mp, 1), 0);
2248 if (!bp) { 2236 if (!bp) {
2249 cmn_err(CE_WARN, "XFS: realtime device size check failed"); 2237 xfs_warn(mp, "realtime device size check failed");
2250 return EIO; 2238 return EIO;
2251 } 2239 }
2252 xfs_buf_relse(bp); 2240 xfs_buf_relse(bp);
@@ -2306,20 +2294,16 @@ xfs_rtpick_extent(
2306 xfs_rtblock_t *pick) /* result rt extent */ 2294 xfs_rtblock_t *pick) /* result rt extent */
2307{ 2295{
2308 xfs_rtblock_t b; /* result block */ 2296 xfs_rtblock_t b; /* result block */
2309 int error; /* error return value */
2310 xfs_inode_t *ip; /* bitmap incore inode */
2311 int log2; /* log of sequence number */ 2297 int log2; /* log of sequence number */
2312 __uint64_t resid; /* residual after log removed */ 2298 __uint64_t resid; /* residual after log removed */
2313 __uint64_t seq; /* sequence number of file creation */ 2299 __uint64_t seq; /* sequence number of file creation */
2314 __uint64_t *seqp; /* pointer to seqno in inode */ 2300 __uint64_t *seqp; /* pointer to seqno in inode */
2315 2301
2316 if ((error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rbmino, 0, 2302 ASSERT(xfs_isilocked(mp->m_rbmip, XFS_ILOCK_EXCL));
2317 XFS_ILOCK_EXCL, &ip))) 2303
2318 return error; 2304 seqp = (__uint64_t *)&mp->m_rbmip->i_d.di_atime;
2319 ASSERT(ip == mp->m_rbmip); 2305 if (!(mp->m_rbmip->i_d.di_flags & XFS_DIFLAG_NEWRTBM)) {
2320 seqp = (__uint64_t *)&ip->i_d.di_atime; 2306 mp->m_rbmip->i_d.di_flags |= XFS_DIFLAG_NEWRTBM;
2321 if (!(ip->i_d.di_flags & XFS_DIFLAG_NEWRTBM)) {
2322 ip->i_d.di_flags |= XFS_DIFLAG_NEWRTBM;
2323 *seqp = 0; 2307 *seqp = 0;
2324 } 2308 }
2325 seq = *seqp; 2309 seq = *seqp;
@@ -2335,7 +2319,7 @@ xfs_rtpick_extent(
2335 b = mp->m_sb.sb_rextents - len; 2319 b = mp->m_sb.sb_rextents - len;
2336 } 2320 }
2337 *seqp = seq + 1; 2321 *seqp = seq + 1;
2338 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 2322 xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE);
2339 *pick = b; 2323 *pick = b;
2340 return 0; 2324 return 0;
2341} 2325}
diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h
index ff614c29b441..09e1f4f35e97 100644
--- a/fs/xfs/xfs_rtalloc.h
+++ b/fs/xfs/xfs_rtalloc.h
@@ -154,7 +154,7 @@ xfs_rtmount_init(
154 if (mp->m_sb.sb_rblocks == 0) 154 if (mp->m_sb.sb_rblocks == 0)
155 return 0; 155 return 0;
156 156
157 cmn_err(CE_WARN, "XFS: Not built with CONFIG_XFS_RT"); 157 xfs_warn(mp, "Not built with CONFIG_XFS_RT");
158 return ENOSYS; 158 return ENOSYS;
159} 159}
160# define xfs_rtmount_inodes(m) (((mp)->m_sb.sb_rblocks == 0)? 0 : (ENOSYS)) 160# define xfs_rtmount_inodes(m) (((mp)->m_sb.sb_rblocks == 0)? 0 : (ENOSYS))
diff --git a/fs/xfs/xfs_rw.c b/fs/xfs/xfs_rw.c
index 56861d5daaef..d6d6fdfe9422 100644
--- a/fs/xfs/xfs_rw.c
+++ b/fs/xfs/xfs_rw.c
@@ -49,9 +49,9 @@ xfs_do_force_shutdown(
49 logerror = flags & SHUTDOWN_LOG_IO_ERROR; 49 logerror = flags & SHUTDOWN_LOG_IO_ERROR;
50 50
51 if (!(flags & SHUTDOWN_FORCE_UMOUNT)) { 51 if (!(flags & SHUTDOWN_FORCE_UMOUNT)) {
52 cmn_err(CE_NOTE, "xfs_force_shutdown(%s,0x%x) called from " 52 xfs_notice(mp,
53 "line %d of file %s. Return address = 0x%p", 53 "%s(0x%x) called from line %d of file %s. Return address = 0x%p",
54 mp->m_fsname, flags, lnnum, fname, __return_address); 54 __func__, flags, lnnum, fname, __return_address);
55 } 55 }
56 /* 56 /*
57 * No need to duplicate efforts. 57 * No need to duplicate efforts.
@@ -69,30 +69,25 @@ xfs_do_force_shutdown(
69 return; 69 return;
70 70
71 if (flags & SHUTDOWN_CORRUPT_INCORE) { 71 if (flags & SHUTDOWN_CORRUPT_INCORE) {
72 xfs_cmn_err(XFS_PTAG_SHUTDOWN_CORRUPT, CE_ALERT, mp, 72 xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_CORRUPT,
73 "Corruption of in-memory data detected. Shutting down filesystem: %s", 73 "Corruption of in-memory data detected. Shutting down filesystem");
74 mp->m_fsname); 74 if (XFS_ERRLEVEL_HIGH <= xfs_error_level)
75 if (XFS_ERRLEVEL_HIGH <= xfs_error_level) {
76 xfs_stack_trace(); 75 xfs_stack_trace();
77 }
78 } else if (!(flags & SHUTDOWN_FORCE_UMOUNT)) { 76 } else if (!(flags & SHUTDOWN_FORCE_UMOUNT)) {
79 if (logerror) { 77 if (logerror) {
80 xfs_cmn_err(XFS_PTAG_SHUTDOWN_LOGERROR, CE_ALERT, mp, 78 xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_LOGERROR,
81 "Log I/O Error Detected. Shutting down filesystem: %s", 79 "Log I/O Error Detected. Shutting down filesystem");
82 mp->m_fsname);
83 } else if (flags & SHUTDOWN_DEVICE_REQ) { 80 } else if (flags & SHUTDOWN_DEVICE_REQ) {
84 xfs_cmn_err(XFS_PTAG_SHUTDOWN_IOERROR, CE_ALERT, mp, 81 xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_IOERROR,
85 "All device paths lost. Shutting down filesystem: %s", 82 "All device paths lost. Shutting down filesystem");
86 mp->m_fsname);
87 } else if (!(flags & SHUTDOWN_REMOTE_REQ)) { 83 } else if (!(flags & SHUTDOWN_REMOTE_REQ)) {
88 xfs_cmn_err(XFS_PTAG_SHUTDOWN_IOERROR, CE_ALERT, mp, 84 xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_IOERROR,
89 "I/O Error Detected. Shutting down filesystem: %s", 85 "I/O Error Detected. Shutting down filesystem");
90 mp->m_fsname);
91 } 86 }
92 } 87 }
93 if (!(flags & SHUTDOWN_FORCE_UMOUNT)) { 88 if (!(flags & SHUTDOWN_FORCE_UMOUNT)) {
94 cmn_err(CE_ALERT, "Please umount the filesystem, " 89 xfs_alert(mp,
95 "and rectify the problem(s)"); 90 "Please umount the filesystem and rectify the problem(s)");
96 } 91 }
97} 92}
98 93
@@ -106,10 +101,9 @@ xfs_ioerror_alert(
106 xfs_buf_t *bp, 101 xfs_buf_t *bp,
107 xfs_daddr_t blkno) 102 xfs_daddr_t blkno)
108{ 103{
109 cmn_err(CE_ALERT, 104 xfs_alert(mp,
110 "I/O error in filesystem (\"%s\") meta-data dev %s block 0x%llx" 105 "I/O error occurred: meta-data dev %s block 0x%llx"
111 " (\"%s\") error %d buf count %zd", 106 " (\"%s\") error %d buf count %zd",
112 (!mp || !mp->m_fsname) ? "(fs name not set)" : mp->m_fsname,
113 XFS_BUFTARG_NAME(XFS_BUF_TARGET(bp)), 107 XFS_BUFTARG_NAME(XFS_BUF_TARGET(bp)),
114 (__uint64_t)blkno, func, 108 (__uint64_t)blkno, func,
115 XFS_BUF_GETERROR(bp), XFS_BUF_COUNT(bp)); 109 XFS_BUF_GETERROR(bp), XFS_BUF_COUNT(bp));
@@ -173,17 +167,9 @@ xfs_extlen_t
173xfs_get_extsz_hint( 167xfs_get_extsz_hint(
174 struct xfs_inode *ip) 168 struct xfs_inode *ip)
175{ 169{
176 xfs_extlen_t extsz; 170 if ((ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE) && ip->i_d.di_extsize)
177 171 return ip->i_d.di_extsize;
178 if (unlikely(XFS_IS_REALTIME_INODE(ip))) { 172 if (XFS_IS_REALTIME_INODE(ip))
179 extsz = (ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE) 173 return ip->i_mount->m_sb.sb_rextsize;
180 ? ip->i_d.di_extsize 174 return 0;
181 : ip->i_mount->m_sb.sb_rextsize;
182 ASSERT(extsz);
183 } else {
184 extsz = (ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE)
185 ? ip->i_d.di_extsize : 0;
186 }
187
188 return extsz;
189} 175}
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 33dbc4e0ad62..76922793f64f 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -1446,6 +1446,14 @@ xfs_log_item_batch_insert(
1446 * Bulk operation version of xfs_trans_committed that takes a log vector of 1446 * Bulk operation version of xfs_trans_committed that takes a log vector of
1447 * items to insert into the AIL. This uses bulk AIL insertion techniques to 1447 * items to insert into the AIL. This uses bulk AIL insertion techniques to
1448 * minimise lock traffic. 1448 * minimise lock traffic.
1449 *
1450 * If we are called with the aborted flag set, it is because a log write during
1451 * a CIL checkpoint commit has failed. In this case, all the items in the
1452 * checkpoint have already gone through IOP_COMMITED and IOP_UNLOCK, which
1453 * means that checkpoint commit abort handling is treated exactly the same
1454 * as an iclog write error even though we haven't started any IO yet. Hence in
1455 * this case all we need to do is IOP_COMMITTED processing, followed by an
1456 * IOP_UNPIN(aborted) call.
1449 */ 1457 */
1450void 1458void
1451xfs_trans_committed_bulk( 1459xfs_trans_committed_bulk(
@@ -1472,6 +1480,16 @@ xfs_trans_committed_bulk(
1472 if (XFS_LSN_CMP(item_lsn, (xfs_lsn_t)-1) == 0) 1480 if (XFS_LSN_CMP(item_lsn, (xfs_lsn_t)-1) == 0)
1473 continue; 1481 continue;
1474 1482
1483 /*
1484 * if we are aborting the operation, no point in inserting the
1485 * object into the AIL as we are in a shutdown situation.
1486 */
1487 if (aborted) {
1488 ASSERT(XFS_FORCED_SHUTDOWN(ailp->xa_mount));
1489 IOP_UNPIN(lip, 1);
1490 continue;
1491 }
1492
1475 if (item_lsn != commit_lsn) { 1493 if (item_lsn != commit_lsn) {
1476 1494
1477 /* 1495 /*
@@ -1503,20 +1521,24 @@ xfs_trans_committed_bulk(
1503} 1521}
1504 1522
1505/* 1523/*
1506 * Called from the trans_commit code when we notice that 1524 * Called from the trans_commit code when we notice that the filesystem is in
1507 * the filesystem is in the middle of a forced shutdown. 1525 * the middle of a forced shutdown.
1526 *
1527 * When we are called here, we have already pinned all the items in the
1528 * transaction. However, neither IOP_COMMITTING or IOP_UNLOCK has been called
1529 * so we can simply walk the items in the transaction, unpin them with an abort
1530 * flag and then free the items. Note that unpinning the items can result in
1531 * them being freed immediately, so we need to use a safe list traversal method
1532 * here.
1508 */ 1533 */
1509STATIC void 1534STATIC void
1510xfs_trans_uncommit( 1535xfs_trans_uncommit(
1511 struct xfs_trans *tp, 1536 struct xfs_trans *tp,
1512 uint flags) 1537 uint flags)
1513{ 1538{
1514 struct xfs_log_item_desc *lidp; 1539 struct xfs_log_item_desc *lidp, *n;
1515 1540
1516 list_for_each_entry(lidp, &tp->t_items, lid_trans) { 1541 list_for_each_entry_safe(lidp, n, &tp->t_items, lid_trans) {
1517 /*
1518 * Unpin all but those that aren't dirty.
1519 */
1520 if (lidp->lid_flags & XFS_LID_DIRTY) 1542 if (lidp->lid_flags & XFS_LID_DIRTY)
1521 IOP_UNPIN(lidp->lid_item, 1); 1543 IOP_UNPIN(lidp->lid_item, 1);
1522 } 1544 }
@@ -1733,7 +1755,6 @@ xfs_trans_commit_cil(
1733 int flags) 1755 int flags)
1734{ 1756{
1735 struct xfs_log_vec *log_vector; 1757 struct xfs_log_vec *log_vector;
1736 int error;
1737 1758
1738 /* 1759 /*
1739 * Get each log item to allocate a vector structure for 1760 * Get each log item to allocate a vector structure for
@@ -1744,9 +1765,7 @@ xfs_trans_commit_cil(
1744 if (!log_vector) 1765 if (!log_vector)
1745 return ENOMEM; 1766 return ENOMEM;
1746 1767
1747 error = xfs_log_commit_cil(mp, tp, log_vector, commit_lsn, flags); 1768 xfs_log_commit_cil(mp, tp, log_vector, commit_lsn, flags);
1748 if (error)
1749 return error;
1750 1769
1751 current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); 1770 current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
1752 xfs_trans_free(tp); 1771 xfs_trans_free(tp);
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index c2042b736b81..06a9759b6352 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -469,8 +469,6 @@ void xfs_trans_inode_buf(xfs_trans_t *, struct xfs_buf *);
469void xfs_trans_stale_inode_buf(xfs_trans_t *, struct xfs_buf *); 469void xfs_trans_stale_inode_buf(xfs_trans_t *, struct xfs_buf *);
470void xfs_trans_dquot_buf(xfs_trans_t *, struct xfs_buf *, uint); 470void xfs_trans_dquot_buf(xfs_trans_t *, struct xfs_buf *, uint);
471void xfs_trans_inode_alloc_buf(xfs_trans_t *, struct xfs_buf *); 471void xfs_trans_inode_alloc_buf(xfs_trans_t *, struct xfs_buf *);
472int xfs_trans_iget(struct xfs_mount *, xfs_trans_t *,
473 xfs_ino_t , uint, uint, struct xfs_inode **);
474void xfs_trans_ichgtime(struct xfs_trans *, struct xfs_inode *, int); 472void xfs_trans_ichgtime(struct xfs_trans *, struct xfs_inode *, int);
475void xfs_trans_ijoin_ref(struct xfs_trans *, struct xfs_inode *, uint); 473void xfs_trans_ijoin_ref(struct xfs_trans *, struct xfs_inode *, uint);
476void xfs_trans_ijoin(struct xfs_trans *, struct xfs_inode *); 474void xfs_trans_ijoin(struct xfs_trans *, struct xfs_inode *);
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index c5bbbc45db91..12aff9584e29 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -563,7 +563,7 @@ xfs_trans_ail_delete_bulk(
563 563
564 spin_unlock(&ailp->xa_lock); 564 spin_unlock(&ailp->xa_lock);
565 if (!XFS_FORCED_SHUTDOWN(mp)) { 565 if (!XFS_FORCED_SHUTDOWN(mp)) {
566 xfs_cmn_err(XFS_PTAG_AILDELETE, CE_ALERT, mp, 566 xfs_alert_tag(mp, XFS_PTAG_AILDELETE,
567 "%s: attempting to delete a log item that is not in the AIL", 567 "%s: attempting to delete a log item that is not in the AIL",
568 __func__); 568 __func__);
569 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); 569 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index c47918c302a5..3bea66132334 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -305,7 +305,7 @@ xfs_trans_read_buf(
305 if (xfs_error_target == target) { 305 if (xfs_error_target == target) {
306 if (((xfs_req_num++) % xfs_error_mod) == 0) { 306 if (((xfs_req_num++) % xfs_error_mod) == 0) {
307 xfs_buf_relse(bp); 307 xfs_buf_relse(bp);
308 cmn_err(CE_DEBUG, "Returning error!\n"); 308 xfs_debug(mp, "Returning error!");
309 return XFS_ERROR(EIO); 309 return XFS_ERROR(EIO);
310 } 310 }
311 } 311 }
@@ -403,7 +403,7 @@ xfs_trans_read_buf(
403 xfs_force_shutdown(tp->t_mountp, 403 xfs_force_shutdown(tp->t_mountp,
404 SHUTDOWN_META_IO_ERROR); 404 SHUTDOWN_META_IO_ERROR);
405 xfs_buf_relse(bp); 405 xfs_buf_relse(bp);
406 cmn_err(CE_DEBUG, "Returning trans error!\n"); 406 xfs_debug(mp, "Returning trans error!");
407 return XFS_ERROR(EIO); 407 return XFS_ERROR(EIO);
408 } 408 }
409 } 409 }
@@ -427,7 +427,7 @@ shutdown_abort:
427 */ 427 */
428#if defined(DEBUG) 428#if defined(DEBUG)
429 if (XFS_BUF_ISSTALE(bp) && XFS_BUF_ISDELAYWRITE(bp)) 429 if (XFS_BUF_ISSTALE(bp) && XFS_BUF_ISDELAYWRITE(bp))
430 cmn_err(CE_NOTE, "about to pop assert, bp == 0x%p", bp); 430 xfs_notice(mp, "about to pop assert, bp == 0x%p", bp);
431#endif 431#endif
432 ASSERT((XFS_BUF_BFLAGS(bp) & (XBF_STALE|XBF_DELWRI)) != 432 ASSERT((XFS_BUF_BFLAGS(bp) & (XBF_STALE|XBF_DELWRI)) !=
433 (XBF_STALE|XBF_DELWRI)); 433 (XBF_STALE|XBF_DELWRI));
diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c
index ccb34532768b..16084d8ea231 100644
--- a/fs/xfs/xfs_trans_inode.c
+++ b/fs/xfs/xfs_trans_inode.c
@@ -44,28 +44,6 @@ xfs_trans_inode_broot_debug(
44#endif 44#endif
45 45
46/* 46/*
47 * Get an inode and join it to the transaction.
48 */
49int
50xfs_trans_iget(
51 xfs_mount_t *mp,
52 xfs_trans_t *tp,
53 xfs_ino_t ino,
54 uint flags,
55 uint lock_flags,
56 xfs_inode_t **ipp)
57{
58 int error;
59
60 error = xfs_iget(mp, tp, ino, flags, lock_flags, ipp);
61 if (!error && tp) {
62 xfs_trans_ijoin(tp, *ipp);
63 (*ipp)->i_itemp->ili_lock_flags = lock_flags;
64 }
65 return error;
66}
67
68/*
69 * Add a locked inode to the transaction. 47 * Add a locked inode to the transaction.
70 * 48 *
71 * The inode must be locked, and it cannot be associated with any transaction. 49 * The inode must be locked, and it cannot be associated with any transaction.
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index d8e6f8cd6f0c..37d8146ee15b 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -1189,9 +1189,8 @@ xfs_inactive(
1189 * inode might be lost for a long time or forever. 1189 * inode might be lost for a long time or forever.
1190 */ 1190 */
1191 if (!XFS_FORCED_SHUTDOWN(mp)) { 1191 if (!XFS_FORCED_SHUTDOWN(mp)) {
1192 cmn_err(CE_NOTE, 1192 xfs_notice(mp, "%s: xfs_ifree returned error %d",
1193 "xfs_inactive: xfs_ifree() returned an error = %d on %s", 1193 __func__, error);
1194 error, mp->m_fsname);
1195 xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); 1194 xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
1196 } 1195 }
1197 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT); 1196 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
@@ -1208,12 +1207,12 @@ xfs_inactive(
1208 */ 1207 */
1209 error = xfs_bmap_finish(&tp, &free_list, &committed); 1208 error = xfs_bmap_finish(&tp, &free_list, &committed);
1210 if (error) 1209 if (error)
1211 xfs_fs_cmn_err(CE_NOTE, mp, "xfs_inactive: " 1210 xfs_notice(mp, "%s: xfs_bmap_finish returned error %d",
1212 "xfs_bmap_finish() returned error %d", error); 1211 __func__, error);
1213 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); 1212 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
1214 if (error) 1213 if (error)
1215 xfs_fs_cmn_err(CE_NOTE, mp, "xfs_inactive: " 1214 xfs_notice(mp, "%s: xfs_trans_commit returned error %d",
1216 "xfs_trans_commit() returned error %d", error); 1215 __func__, error);
1217 } 1216 }
1218 1217
1219 /* 1218 /*
@@ -1310,7 +1309,7 @@ xfs_create(
1310 error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid, 1309 error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid,
1311 XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp); 1310 XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp);
1312 if (error) 1311 if (error)
1313 goto std_return; 1312 return error;
1314 1313
1315 if (is_dir) { 1314 if (is_dir) {
1316 rdev = 0; 1315 rdev = 0;
@@ -1390,12 +1389,6 @@ xfs_create(
1390 } 1389 }
1391 1390
1392 /* 1391 /*
1393 * At this point, we've gotten a newly allocated inode.
1394 * It is locked (and joined to the transaction).
1395 */
1396 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
1397
1398 /*
1399 * Now we join the directory inode to the transaction. We do not do it 1392 * Now we join the directory inode to the transaction. We do not do it
1400 * earlier because xfs_dir_ialloc might commit the previous transaction 1393 * earlier because xfs_dir_ialloc might commit the previous transaction
1401 * (and release all the locks). An error from here on will result in 1394 * (and release all the locks). An error from here on will result in
@@ -1440,22 +1433,13 @@ xfs_create(
1440 */ 1433 */
1441 xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp); 1434 xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp);
1442 1435
1443 /*
1444 * xfs_trans_commit normally decrements the vnode ref count
1445 * when it unlocks the inode. Since we want to return the
1446 * vnode to the caller, we bump the vnode ref count now.
1447 */
1448 IHOLD(ip);
1449
1450 error = xfs_bmap_finish(&tp, &free_list, &committed); 1436 error = xfs_bmap_finish(&tp, &free_list, &committed);
1451 if (error) 1437 if (error)
1452 goto out_abort_rele; 1438 goto out_bmap_cancel;
1453 1439
1454 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); 1440 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
1455 if (error) { 1441 if (error)
1456 IRELE(ip); 1442 goto out_release_inode;
1457 goto out_dqrele;
1458 }
1459 1443
1460 xfs_qm_dqrele(udqp); 1444 xfs_qm_dqrele(udqp);
1461 xfs_qm_dqrele(gdqp); 1445 xfs_qm_dqrele(gdqp);
@@ -1469,27 +1453,21 @@ xfs_create(
1469 cancel_flags |= XFS_TRANS_ABORT; 1453 cancel_flags |= XFS_TRANS_ABORT;
1470 out_trans_cancel: 1454 out_trans_cancel:
1471 xfs_trans_cancel(tp, cancel_flags); 1455 xfs_trans_cancel(tp, cancel_flags);
1472 out_dqrele: 1456 out_release_inode:
1457 /*
1458 * Wait until after the current transaction is aborted to
1459 * release the inode. This prevents recursive transactions
1460 * and deadlocks from xfs_inactive.
1461 */
1462 if (ip)
1463 IRELE(ip);
1464
1473 xfs_qm_dqrele(udqp); 1465 xfs_qm_dqrele(udqp);
1474 xfs_qm_dqrele(gdqp); 1466 xfs_qm_dqrele(gdqp);
1475 1467
1476 if (unlock_dp_on_error) 1468 if (unlock_dp_on_error)
1477 xfs_iunlock(dp, XFS_ILOCK_EXCL); 1469 xfs_iunlock(dp, XFS_ILOCK_EXCL);
1478 std_return:
1479 return error; 1470 return error;
1480
1481 out_abort_rele:
1482 /*
1483 * Wait until after the current transaction is aborted to
1484 * release the inode. This prevents recursive transactions
1485 * and deadlocks from xfs_inactive.
1486 */
1487 xfs_bmap_cancel(&free_list);
1488 cancel_flags |= XFS_TRANS_ABORT;
1489 xfs_trans_cancel(tp, cancel_flags);
1490 IRELE(ip);
1491 unlock_dp_on_error = B_FALSE;
1492 goto out_dqrele;
1493} 1471}
1494 1472
1495#ifdef DEBUG 1473#ifdef DEBUG
@@ -2114,9 +2092,8 @@ xfs_symlink(
2114 XFS_BMAPI_WRITE | XFS_BMAPI_METADATA, 2092 XFS_BMAPI_WRITE | XFS_BMAPI_METADATA,
2115 &first_block, resblks, mval, &nmaps, 2093 &first_block, resblks, mval, &nmaps,
2116 &free_list); 2094 &free_list);
2117 if (error) { 2095 if (error)
2118 goto error1; 2096 goto error2;
2119 }
2120 2097
2121 if (resblks) 2098 if (resblks)
2122 resblks -= fs_blocks; 2099 resblks -= fs_blocks;
@@ -2148,7 +2125,7 @@ xfs_symlink(
2148 error = xfs_dir_createname(tp, dp, link_name, ip->i_ino, 2125 error = xfs_dir_createname(tp, dp, link_name, ip->i_ino,
2149 &first_block, &free_list, resblks); 2126 &first_block, &free_list, resblks);
2150 if (error) 2127 if (error)
2151 goto error1; 2128 goto error2;
2152 xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 2129 xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2153 xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); 2130 xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
2154 2131
@@ -2161,13 +2138,6 @@ xfs_symlink(
2161 xfs_trans_set_sync(tp); 2138 xfs_trans_set_sync(tp);
2162 } 2139 }
2163 2140
2164 /*
2165 * xfs_trans_commit normally decrements the vnode ref count
2166 * when it unlocks the inode. Since we want to return the
2167 * vnode to the caller, we bump the vnode ref count now.
2168 */
2169 IHOLD(ip);
2170
2171 error = xfs_bmap_finish(&tp, &free_list, &committed); 2141 error = xfs_bmap_finish(&tp, &free_list, &committed);
2172 if (error) { 2142 if (error) {
2173 goto error2; 2143 goto error2;